Diffstat (limited to 'sys/cddl/contrib/opensolaris')
-rw-r--r-- sys/cddl/contrib/opensolaris/OPENSOLARIS.LICENSE | 384
-rw-r--r-- sys/cddl/contrib/opensolaris/common/acl/acl_common.c | 1765
-rw-r--r-- sys/cddl/contrib/opensolaris/common/acl/acl_common.h | 69
-rw-r--r-- sys/cddl/contrib/opensolaris/common/atomic/i386/opensolaris_atomic.S | 133
-rw-r--r-- sys/cddl/contrib/opensolaris/common/avl/avl.c | 1063
-rw-r--r-- sys/cddl/contrib/opensolaris/common/lz4/lz4.c | 1043
-rw-r--r-- sys/cddl/contrib/opensolaris/common/lz4/lz4.h | 50
-rw-r--r-- sys/cddl/contrib/opensolaris/common/nvpair/opensolaris_fnvpair.c | 512
-rw-r--r-- sys/cddl/contrib/opensolaris/common/nvpair/opensolaris_nvpair.c | 3600
-rw-r--r-- sys/cddl/contrib/opensolaris/common/nvpair/opensolaris_nvpair_alloc_fixed.c | 118
-rw-r--r-- sys/cddl/contrib/opensolaris/common/unicode/u8_textprep.c | 2130
-rw-r--r-- sys/cddl/contrib/opensolaris/common/util/strtolctype.h | 79
-rw-r--r-- sys/cddl/contrib/opensolaris/common/zfs/zfeature_common.c | 310
-rw-r--r-- sys/cddl/contrib/opensolaris/common/zfs/zfeature_common.h | 111
-rw-r--r-- sys/cddl/contrib/opensolaris/common/zfs/zfs_comutil.c | 206
-rw-r--r-- sys/cddl/contrib/opensolaris/common/zfs/zfs_comutil.h | 52
-rw-r--r-- sys/cddl/contrib/opensolaris/common/zfs/zfs_deleg.c | 235
-rw-r--r-- sys/cddl/contrib/opensolaris/common/zfs/zfs_deleg.h | 90
-rw-r--r-- sys/cddl/contrib/opensolaris/common/zfs/zfs_fletcher.c | 279
-rw-r--r-- sys/cddl/contrib/opensolaris/common/zfs/zfs_fletcher.h | 58
-rw-r--r-- sys/cddl/contrib/opensolaris/common/zfs/zfs_ioctl_compat.c | 1380
-rw-r--r-- sys/cddl/contrib/opensolaris/common/zfs/zfs_ioctl_compat.h | 543
-rw-r--r-- sys/cddl/contrib/opensolaris/common/zfs/zfs_namecheck.c | 399
-rw-r--r-- sys/cddl/contrib/opensolaris/common/zfs/zfs_namecheck.h | 66
-rw-r--r-- sys/cddl/contrib/opensolaris/common/zfs/zfs_prop.c | 718
-rw-r--r-- sys/cddl/contrib/opensolaris/common/zfs/zfs_prop.h | 131
-rw-r--r-- sys/cddl/contrib/opensolaris/common/zfs/zpool_prop.c | 250
-rw-r--r-- sys/cddl/contrib/opensolaris/common/zfs/zprop_common.c | 430
-rw-r--r-- sys/cddl/contrib/opensolaris/uts/aarch64/dtrace/fasttrap_isa.c | 29
-rw-r--r-- sys/cddl/contrib/opensolaris/uts/aarch64/sys/fasttrap_isa.h | 46
-rw-r--r-- sys/cddl/contrib/opensolaris/uts/arm/dtrace/fasttrap_isa.c | 30
-rw-r--r-- sys/cddl/contrib/opensolaris/uts/arm/sys/fasttrap_isa.h | 94
-rw-r--r-- sys/cddl/contrib/opensolaris/uts/common/Makefile.files | 183
-rw-r--r-- sys/cddl/contrib/opensolaris/uts/common/ctf/ctf_mod.c | 177
-rw-r--r-- sys/cddl/contrib/opensolaris/uts/common/ctf/ctf_subr.c | 96
-rw-r--r-- sys/cddl/contrib/opensolaris/uts/common/dtrace/dtrace.c | 18424
-rw-r--r-- sys/cddl/contrib/opensolaris/uts/common/dtrace/dtrace_xoroshiro128_plus.c | 89
-rw-r--r-- sys/cddl/contrib/opensolaris/uts/common/dtrace/dtrace_xoroshiro128_plus.h | 41
-rw-r--r-- sys/cddl/contrib/opensolaris/uts/common/dtrace/fasttrap.c | 2663
-rw-r--r-- sys/cddl/contrib/opensolaris/uts/common/fs/vnode.c | 94
-rw-r--r-- sys/cddl/contrib/opensolaris/uts/common/fs/zfs/THIRDPARTYLICENSE.cityhash | 19
-rw-r--r-- sys/cddl/contrib/opensolaris/uts/common/fs/zfs/THIRDPARTYLICENSE.cityhash.descrip | 1
-rw-r--r-- sys/cddl/contrib/opensolaris/uts/common/fs/zfs/THIRDPARTYLICENSE.lz4 | 30
-rw-r--r-- sys/cddl/contrib/opensolaris/uts/common/fs/zfs/THIRDPARTYLICENSE.lz4.descrip | 1
-rw-r--r-- sys/cddl/contrib/opensolaris/uts/common/fs/zfs/abd.c | 960
-rw-r--r-- sys/cddl/contrib/opensolaris/uts/common/fs/zfs/aggsum.c | 234
-rw-r--r-- sys/cddl/contrib/opensolaris/uts/common/fs/zfs/arc.c | 8569
-rw-r--r-- sys/cddl/contrib/opensolaris/uts/common/fs/zfs/blkptr.c | 152
-rw-r--r-- sys/cddl/contrib/opensolaris/uts/common/fs/zfs/bplist.c | 77
-rw-r--r-- sys/cddl/contrib/opensolaris/uts/common/fs/zfs/bpobj.c | 606
-rw-r--r-- sys/cddl/contrib/opensolaris/uts/common/fs/zfs/bptree.c | 301
-rw-r--r-- sys/cddl/contrib/opensolaris/uts/common/fs/zfs/bqueue.c | 111
-rw-r--r-- sys/cddl/contrib/opensolaris/uts/common/fs/zfs/cityhash.c | 63
-rw-r--r-- sys/cddl/contrib/opensolaris/uts/common/fs/zfs/dbuf.c | 4248
-rw-r--r-- sys/cddl/contrib/opensolaris/uts/common/fs/zfs/dbuf_stats.c | 242
-rw-r--r-- sys/cddl/contrib/opensolaris/uts/common/fs/zfs/ddt.c | 1189
-rw-r--r-- sys/cddl/contrib/opensolaris/uts/common/fs/zfs/ddt_zap.c | 165
-rw-r--r-- sys/cddl/contrib/opensolaris/uts/common/fs/zfs/dmu.c | 2748
-rw-r--r-- sys/cddl/contrib/opensolaris/uts/common/fs/zfs/dmu_diff.c | 251
-rw-r--r-- sys/cddl/contrib/opensolaris/uts/common/fs/zfs/dmu_object.c | 444
-rw-r--r-- sys/cddl/contrib/opensolaris/uts/common/fs/zfs/dmu_objset.c | 2484
-rw-r--r-- sys/cddl/contrib/opensolaris/uts/common/fs/zfs/dmu_send.c | 3550
-rw-r--r-- sys/cddl/contrib/opensolaris/uts/common/fs/zfs/dmu_traverse.c | 712
-rw-r--r-- sys/cddl/contrib/opensolaris/uts/common/fs/zfs/dmu_tx.c | 1345
-rw-r--r-- sys/cddl/contrib/opensolaris/uts/common/fs/zfs/dmu_zfetch.c | 374
-rw-r--r-- sys/cddl/contrib/opensolaris/uts/common/fs/zfs/dnode.c | 2418
-rw-r--r-- sys/cddl/contrib/opensolaris/uts/common/fs/zfs/dnode_sync.c | 779
-rw-r--r-- sys/cddl/contrib/opensolaris/uts/common/fs/zfs/dsl_bookmark.c | 566
-rw-r--r-- sys/cddl/contrib/opensolaris/uts/common/fs/zfs/dsl_dataset.c | 4252
-rw-r--r-- sys/cddl/contrib/opensolaris/uts/common/fs/zfs/dsl_deadlist.c | 561
-rw-r--r-- sys/cddl/contrib/opensolaris/uts/common/fs/zfs/dsl_deleg.c | 760
-rw-r--r-- sys/cddl/contrib/opensolaris/uts/common/fs/zfs/dsl_destroy.c | 1097
-rw-r--r-- sys/cddl/contrib/opensolaris/uts/common/fs/zfs/dsl_dir.c | 2184
-rw-r--r-- sys/cddl/contrib/opensolaris/uts/common/fs/zfs/dsl_pool.c | 1372
-rw-r--r-- sys/cddl/contrib/opensolaris/uts/common/fs/zfs/dsl_prop.c | 1211
-rw-r--r-- sys/cddl/contrib/opensolaris/uts/common/fs/zfs/dsl_scan.c | 4001
-rw-r--r-- sys/cddl/contrib/opensolaris/uts/common/fs/zfs/dsl_synctask.c | 256
-rw-r--r-- sys/cddl/contrib/opensolaris/uts/common/fs/zfs/dsl_userhold.c | 667
-rw-r--r-- sys/cddl/contrib/opensolaris/uts/common/fs/zfs/edonr_zfs.c | 114
-rw-r--r-- sys/cddl/contrib/opensolaris/uts/common/fs/zfs/gzip.c | 69
-rw-r--r-- sys/cddl/contrib/opensolaris/uts/common/fs/zfs/lua/README.zfs | 80
-rw-r--r-- sys/cddl/contrib/opensolaris/uts/common/fs/zfs/lua/lapi.c | 1283
-rw-r--r-- sys/cddl/contrib/opensolaris/uts/common/fs/zfs/lua/lapi.h | 24
-rw-r--r-- sys/cddl/contrib/opensolaris/uts/common/fs/zfs/lua/lauxlib.c | 791
-rw-r--r-- sys/cddl/contrib/opensolaris/uts/common/fs/zfs/lua/lauxlib.h | 176
-rw-r--r-- sys/cddl/contrib/opensolaris/uts/common/fs/zfs/lua/lbaselib.c | 296
-rw-r--r-- sys/cddl/contrib/opensolaris/uts/common/fs/zfs/lua/lbitlib.c | 212
-rw-r--r-- sys/cddl/contrib/opensolaris/uts/common/fs/zfs/lua/lcode.c | 885
-rw-r--r-- sys/cddl/contrib/opensolaris/uts/common/fs/zfs/lua/lcode.h | 83
-rw-r--r-- sys/cddl/contrib/opensolaris/uts/common/fs/zfs/lua/lcompat.c | 102
-rw-r--r-- sys/cddl/contrib/opensolaris/uts/common/fs/zfs/lua/lcorolib.c | 154
-rw-r--r-- sys/cddl/contrib/opensolaris/uts/common/fs/zfs/lua/lctype.c | 52
-rw-r--r-- sys/cddl/contrib/opensolaris/uts/common/fs/zfs/lua/lctype.h | 93
-rw-r--r-- sys/cddl/contrib/opensolaris/uts/common/fs/zfs/lua/ldebug.c | 607
-rw-r--r-- sys/cddl/contrib/opensolaris/uts/common/fs/zfs/lua/ldebug.h | 34
-rw-r--r-- sys/cddl/contrib/opensolaris/uts/common/fs/zfs/lua/ldo.c | 691
-rw-r--r-- sys/cddl/contrib/opensolaris/uts/common/fs/zfs/lua/ldo.h | 46
-rw-r--r-- sys/cddl/contrib/opensolaris/uts/common/fs/zfs/lua/ldump.c | 173
-rw-r--r-- sys/cddl/contrib/opensolaris/uts/common/fs/zfs/lua/lfunc.c | 161
-rw-r--r-- sys/cddl/contrib/opensolaris/uts/common/fs/zfs/lua/lfunc.h | 33
-rw-r--r-- sys/cddl/contrib/opensolaris/uts/common/fs/zfs/lua/lgc.c | 1220
-rw-r--r-- sys/cddl/contrib/opensolaris/uts/common/fs/zfs/lua/lgc.h | 157
-rw-r--r-- sys/cddl/contrib/opensolaris/uts/common/fs/zfs/lua/llex.c | 529
-rw-r--r-- sys/cddl/contrib/opensolaris/uts/common/fs/zfs/lua/llex.h | 78
-rw-r--r-- sys/cddl/contrib/opensolaris/uts/common/fs/zfs/lua/llimits.h | 308
-rw-r--r-- sys/cddl/contrib/opensolaris/uts/common/fs/zfs/lua/lmem.c | 99
-rw-r--r-- sys/cddl/contrib/opensolaris/uts/common/fs/zfs/lua/lmem.h | 57
-rw-r--r-- sys/cddl/contrib/opensolaris/uts/common/fs/zfs/lua/lobject.c | 283
-rw-r--r-- sys/cddl/contrib/opensolaris/uts/common/fs/zfs/lua/lobject.h | 606
-rw-r--r-- sys/cddl/contrib/opensolaris/uts/common/fs/zfs/lua/lopcodes.c | 107
-rw-r--r-- sys/cddl/contrib/opensolaris/uts/common/fs/zfs/lua/lopcodes.h | 288
-rw-r--r-- sys/cddl/contrib/opensolaris/uts/common/fs/zfs/lua/lparser.c | 1637
-rw-r--r-- sys/cddl/contrib/opensolaris/uts/common/fs/zfs/lua/lparser.h | 119
-rw-r--r-- sys/cddl/contrib/opensolaris/uts/common/fs/zfs/lua/lstate.c | 321
-rw-r--r-- sys/cddl/contrib/opensolaris/uts/common/fs/zfs/lua/lstate.h | 228
-rw-r--r-- sys/cddl/contrib/opensolaris/uts/common/fs/zfs/lua/lstring.c | 185
-rw-r--r-- sys/cddl/contrib/opensolaris/uts/common/fs/zfs/lua/lstring.h | 46
-rw-r--r-- sys/cddl/contrib/opensolaris/uts/common/fs/zfs/lua/lstrlib.c | 1050
-rw-r--r-- sys/cddl/contrib/opensolaris/uts/common/fs/zfs/lua/ltable.c | 589
-rw-r--r-- sys/cddl/contrib/opensolaris/uts/common/fs/zfs/lua/ltable.h | 45
-rw-r--r-- sys/cddl/contrib/opensolaris/uts/common/fs/zfs/lua/ltablib.c | 284
-rw-r--r-- sys/cddl/contrib/opensolaris/uts/common/fs/zfs/lua/ltm.c | 77
-rw-r--r-- sys/cddl/contrib/opensolaris/uts/common/fs/zfs/lua/ltm.h | 57
-rw-r--r-- sys/cddl/contrib/opensolaris/uts/common/fs/zfs/lua/lua.h | 443
-rw-r--r-- sys/cddl/contrib/opensolaris/uts/common/fs/zfs/lua/luaconf.h | 555
-rw-r--r-- sys/cddl/contrib/opensolaris/uts/common/fs/zfs/lua/lualib.h | 55
-rw-r--r-- sys/cddl/contrib/opensolaris/uts/common/fs/zfs/lua/lundump.c | 258
-rw-r--r-- sys/cddl/contrib/opensolaris/uts/common/fs/zfs/lua/lundump.h | 28
-rw-r--r-- sys/cddl/contrib/opensolaris/uts/common/fs/zfs/lua/lvm.c | 930
-rw-r--r-- sys/cddl/contrib/opensolaris/uts/common/fs/zfs/lua/lvm.h | 44
-rw-r--r-- sys/cddl/contrib/opensolaris/uts/common/fs/zfs/lua/lzio.c | 76
-rw-r--r-- sys/cddl/contrib/opensolaris/uts/common/fs/zfs/lua/lzio.h | 65
-rw-r--r-- sys/cddl/contrib/opensolaris/uts/common/fs/zfs/lzjb.c | 129
-rw-r--r-- sys/cddl/contrib/opensolaris/uts/common/fs/zfs/metaslab.c | 4624
-rw-r--r-- sys/cddl/contrib/opensolaris/uts/common/fs/zfs/mmp.c | 750
-rw-r--r-- sys/cddl/contrib/opensolaris/uts/common/fs/zfs/multilist.c | 423
-rw-r--r-- sys/cddl/contrib/opensolaris/uts/common/fs/zfs/range_tree.c | 670
-rw-r--r-- sys/cddl/contrib/opensolaris/uts/common/fs/zfs/refcount.c | 321
-rw-r--r-- sys/cddl/contrib/opensolaris/uts/common/fs/zfs/rrwlock.c | 396
-rw-r--r-- sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sa.c | 2012
-rw-r--r-- sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sha256.c | 105
-rw-r--r-- sys/cddl/contrib/opensolaris/uts/common/fs/zfs/skein_zfs.c | 105
-rw-r--r-- sys/cddl/contrib/opensolaris/uts/common/fs/zfs/spa.c | 8972
-rw-r--r-- sys/cddl/contrib/opensolaris/uts/common/fs/zfs/spa_checkpoint.c | 623
-rw-r--r-- sys/cddl/contrib/opensolaris/uts/common/fs/zfs/spa_config.c | 594
-rw-r--r-- sys/cddl/contrib/opensolaris/uts/common/fs/zfs/spa_errlog.c | 406
-rw-r--r-- sys/cddl/contrib/opensolaris/uts/common/fs/zfs/spa_history.c | 628
-rw-r--r-- sys/cddl/contrib/opensolaris/uts/common/fs/zfs/spa_misc.c | 2523
-rw-r--r-- sys/cddl/contrib/opensolaris/uts/common/fs/zfs/space_map.c | 1073
-rw-r--r-- sys/cddl/contrib/opensolaris/uts/common/fs/zfs/space_reftree.c | 149
-rw-r--r-- sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/abd.h | 154
-rw-r--r-- sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/aggsum.h | 58
-rw-r--r-- sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/arc.h | 290
-rw-r--r-- sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/blkptr.h | 39
-rw-r--r-- sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/bplist.h | 57
-rw-r--r-- sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/bpobj.h | 95
-rw-r--r-- sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/bptree.h | 65
-rw-r--r-- sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/bqueue.h | 54
-rw-r--r-- sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/cityhash.h | 41
-rw-r--r-- sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/dbuf.h | 417
-rw-r--r-- sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/ddt.h | 248
-rw-r--r-- sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/dmu.h | 1028
-rw-r--r-- sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/dmu_impl.h | 315
-rw-r--r-- sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/dmu_objset.h | 221
-rw-r--r-- sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/dmu_send.h | 93
-rw-r--r-- sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/dmu_traverse.h | 69
-rw-r--r-- sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/dmu_tx.h | 152
-rw-r--r-- sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/dmu_zfetch.h | 76
-rw-r--r-- sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/dnode.h | 599
-rw-r--r-- sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/dsl_bookmark.h | 52
-rw-r--r-- sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/dsl_dataset.h | 457
-rw-r--r-- sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/dsl_deadlist.h | 89
-rw-r--r-- sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/dsl_deleg.h | 81
-rw-r--r-- sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/dsl_destroy.h | 68
-rw-r--r-- sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/dsl_dir.h | 209
-rw-r--r-- sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/dsl_pool.h | 191
-rw-r--r-- sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/dsl_prop.h | 115
-rw-r--r-- sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/dsl_scan.h | 188
-rw-r--r-- sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/dsl_synctask.h | 127
-rw-r--r-- sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/dsl_userhold.h | 57
-rw-r--r-- sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/metaslab.h | 127
-rw-r--r-- sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/metaslab_impl.h | 501
-rw-r--r-- sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/mmp.h | 74
-rw-r--r-- sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/multilist.h | 107
-rw-r--r-- sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/range_tree.h | 124
-rw-r--r-- sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/refcount.h | 125
-rw-r--r-- sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/rrwlock.h | 112
-rw-r--r-- sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/sa.h | 170
-rw-r--r-- sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/sa_impl.h | 291
-rw-r--r-- sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/spa.h | 969
-rw-r--r-- sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/spa_boot.h | 48
-rw-r--r-- sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/spa_checkpoint.h | 44
-rw-r--r-- sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/spa_impl.h | 435
-rw-r--r-- sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/space_map.h | 230
-rw-r--r-- sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/space_reftree.h | 57
-rw-r--r-- sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/trim_map.h | 51
-rw-r--r-- sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/txg.h | 136
-rw-r--r-- sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/txg_impl.h | 125
-rw-r--r-- sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/uberblock.h | 50
-rw-r--r-- sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/uberblock_impl.h | 145
-rw-r--r-- sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/unique.h | 57
-rw-r--r-- sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/vdev.h | 196
-rw-r--r-- sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/vdev_disk.h | 67
-rw-r--r-- sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/vdev_file.h | 49
-rw-r--r-- sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/vdev_impl.h | 571
-rw-r--r-- sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/vdev_indirect_births.h | 80
-rw-r--r-- sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/vdev_indirect_mapping.h | 141
-rw-r--r-- sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/vdev_initialize.h | 46
-rw-r--r-- sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/vdev_raidz.h | 50
-rw-r--r-- sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/vdev_removal.h | 96
-rw-r--r-- sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/zap.h | 514
-rw-r--r-- sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/zap_impl.h | 242
-rw-r--r-- sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/zap_leaf.h | 248
-rw-r--r-- sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/zcp.h | 185
-rw-r--r-- sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/zcp_global.h | 35
-rw-r--r-- sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/zcp_iter.h | 41
-rw-r--r-- sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/zcp_prop.h | 34
-rw-r--r-- sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/zfeature.h | 73
-rw-r--r-- sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/zfs_acl.h | 248
-rw-r--r-- sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/zfs_context.h | 146
-rw-r--r-- sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/zfs_ctldir.h | 65
-rw-r--r-- sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/zfs_debug.h | 99
-rw-r--r-- sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/zfs_dir.h | 74
-rw-r--r-- sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/zfs_fuid.h | 132
-rw-r--r-- sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/zfs_ioctl.h | 466
-rw-r--r-- sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/zfs_onexit.h | 66
-rw-r--r-- sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/zfs_rlock.h | 90
-rw-r--r-- sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/zfs_sa.h | 142
-rw-r--r-- sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/zfs_stat.h | 55
-rw-r--r-- sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/zfs_vfsops.h | 192
-rw-r--r-- sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/zfs_znode.h | 374
-rw-r--r-- sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/zil.h | 464
-rw-r--r-- sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/zil_impl.h | 229
-rw-r--r-- sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/zio.h | 675
-rw-r--r-- sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/zio_checksum.h | 119
-rw-r--r-- sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/zio_compress.h | 128
-rw-r--r-- sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/zio_impl.h | 256
-rw-r--r-- sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/zio_priority.h | 43
-rw-r--r-- sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/zrlock.h | 63
-rw-r--r-- sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/zthr.h | 39
-rw-r--r-- sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/zvol.h | 85
-rw-r--r-- sys/cddl/contrib/opensolaris/uts/common/fs/zfs/trim_map.c | 634
-rw-r--r-- sys/cddl/contrib/opensolaris/uts/common/fs/zfs/txg.c | 977
-rw-r--r-- sys/cddl/contrib/opensolaris/uts/common/fs/zfs/uberblock.c | 74
-rw-r--r-- sys/cddl/contrib/opensolaris/uts/common/fs/zfs/unique.c | 112
-rw-r--r-- sys/cddl/contrib/opensolaris/uts/common/fs/zfs/vdev.c | 4520
-rw-r--r-- sys/cddl/contrib/opensolaris/uts/common/fs/zfs/vdev_cache.c | 434
-rw-r--r-- sys/cddl/contrib/opensolaris/uts/common/fs/zfs/vdev_disk.c | 971
-rw-r--r-- sys/cddl/contrib/opensolaris/uts/common/fs/zfs/vdev_file.c | 307
-rw-r--r-- sys/cddl/contrib/opensolaris/uts/common/fs/zfs/vdev_geom.c | 1193
-rw-r--r-- sys/cddl/contrib/opensolaris/uts/common/fs/zfs/vdev_indirect.c | 1849
-rw-r--r-- sys/cddl/contrib/opensolaris/uts/common/fs/zfs/vdev_indirect_births.c | 212
-rw-r--r-- sys/cddl/contrib/opensolaris/uts/common/fs/zfs/vdev_indirect_mapping.c | 593
-rw-r--r-- sys/cddl/contrib/opensolaris/uts/common/fs/zfs/vdev_initialize.c | 782
-rw-r--r-- sys/cddl/contrib/opensolaris/uts/common/fs/zfs/vdev_label.c | 1701
-rw-r--r-- sys/cddl/contrib/opensolaris/uts/common/fs/zfs/vdev_mirror.c | 779
-rw-r--r-- sys/cddl/contrib/opensolaris/uts/common/fs/zfs/vdev_missing.c | 113
-rw-r--r-- sys/cddl/contrib/opensolaris/uts/common/fs/zfs/vdev_queue.c | 1047
-rw-r--r-- sys/cddl/contrib/opensolaris/uts/common/fs/zfs/vdev_raidz.c | 2707
-rw-r--r-- sys/cddl/contrib/opensolaris/uts/common/fs/zfs/vdev_removal.c | 2156
-rw-r--r-- sys/cddl/contrib/opensolaris/uts/common/fs/zfs/vdev_root.c | 157
-rw-r--r-- sys/cddl/contrib/opensolaris/uts/common/fs/zfs/zap.c | 1378
-rw-r--r-- sys/cddl/contrib/opensolaris/uts/common/fs/zfs/zap_leaf.c | 849
-rw-r--r-- sys/cddl/contrib/opensolaris/uts/common/fs/zfs/zap_micro.c | 1609
-rw-r--r-- sys/cddl/contrib/opensolaris/uts/common/fs/zfs/zcp.c | 1432
-rw-r--r-- sys/cddl/contrib/opensolaris/uts/common/fs/zfs/zcp_get.c | 865
-rw-r--r-- sys/cddl/contrib/opensolaris/uts/common/fs/zfs/zcp_global.c | 89
-rw-r--r-- sys/cddl/contrib/opensolaris/uts/common/fs/zfs/zcp_iter.c | 531
-rw-r--r-- sys/cddl/contrib/opensolaris/uts/common/fs/zfs/zcp_synctask.c | 360
-rw-r--r-- sys/cddl/contrib/opensolaris/uts/common/fs/zfs/zfeature.c | 505
-rw-r--r-- sys/cddl/contrib/opensolaris/uts/common/fs/zfs/zfs.conf | 28
-rw-r--r-- sys/cddl/contrib/opensolaris/uts/common/fs/zfs/zfs_acl.c | 2778
-rw-r--r-- sys/cddl/contrib/opensolaris/uts/common/fs/zfs/zfs_byteswap.c | 199
-rw-r--r-- sys/cddl/contrib/opensolaris/uts/common/fs/zfs/zfs_ctldir.c | 1364
-rw-r--r-- sys/cddl/contrib/opensolaris/uts/common/fs/zfs/zfs_debug.c | 112
-rw-r--r-- sys/cddl/contrib/opensolaris/uts/common/fs/zfs/zfs_dir.c | 968
-rw-r--r-- sys/cddl/contrib/opensolaris/uts/common/fs/zfs/zfs_fm.c | 871
-rw-r--r-- sys/cddl/contrib/opensolaris/uts/common/fs/zfs/zfs_fuid.c | 762
-rw-r--r-- sys/cddl/contrib/opensolaris/uts/common/fs/zfs/zfs_ioctl.c | 7692
-rw-r--r-- sys/cddl/contrib/opensolaris/uts/common/fs/zfs/zfs_log.c | 688
-rw-r--r-- sys/cddl/contrib/opensolaris/uts/common/fs/zfs/zfs_onexit.c | 254
-rw-r--r-- sys/cddl/contrib/opensolaris/uts/common/fs/zfs/zfs_replay.c | 1069
-rw-r--r-- sys/cddl/contrib/opensolaris/uts/common/fs/zfs/zfs_rlock.c | 641
-rw-r--r-- sys/cddl/contrib/opensolaris/uts/common/fs/zfs/zfs_sa.c | 326
-rw-r--r-- sys/cddl/contrib/opensolaris/uts/common/fs/zfs/zfs_vfsops.c | 2813
-rw-r--r-- sys/cddl/contrib/opensolaris/uts/common/fs/zfs/zfs_vnops.c | 6124
-rw-r--r-- sys/cddl/contrib/opensolaris/uts/common/fs/zfs/zfs_znode.c | 2388
-rw-r--r-- sys/cddl/contrib/opensolaris/uts/common/fs/zfs/zil.c | 3499
-rw-r--r-- sys/cddl/contrib/opensolaris/uts/common/fs/zfs/zio.c | 4386
-rw-r--r-- sys/cddl/contrib/opensolaris/uts/common/fs/zfs/zio_checksum.c | 475
-rw-r--r-- sys/cddl/contrib/opensolaris/uts/common/fs/zfs/zio_compress.c | 215
-rw-r--r-- sys/cddl/contrib/opensolaris/uts/common/fs/zfs/zio_inject.c | 755
-rw-r--r-- sys/cddl/contrib/opensolaris/uts/common/fs/zfs/zle.c | 86
-rw-r--r-- sys/cddl/contrib/opensolaris/uts/common/fs/zfs/zrlock.c | 187
-rw-r--r-- sys/cddl/contrib/opensolaris/uts/common/fs/zfs/zthr.c | 431
-rw-r--r-- sys/cddl/contrib/opensolaris/uts/common/fs/zfs/zvol.c | 3347
-rw-r--r-- sys/cddl/contrib/opensolaris/uts/common/os/callb.c | 438
-rw-r--r-- sys/cddl/contrib/opensolaris/uts/common/os/fm.c | 1399
-rw-r--r-- sys/cddl/contrib/opensolaris/uts/common/os/list.c | 243
-rw-r--r-- sys/cddl/contrib/opensolaris/uts/common/os/nvpair_alloc_system.c | 63
-rw-r--r-- sys/cddl/contrib/opensolaris/uts/common/sys/acl.h | 313
-rw-r--r-- sys/cddl/contrib/opensolaris/uts/common/sys/acl_impl.h | 61
-rw-r--r-- sys/cddl/contrib/opensolaris/uts/common/sys/asm_linkage.h | 82
-rw-r--r-- sys/cddl/contrib/opensolaris/uts/common/sys/avl.h | 333
-rw-r--r-- sys/cddl/contrib/opensolaris/uts/common/sys/avl_impl.h | 164
-rw-r--r-- sys/cddl/contrib/opensolaris/uts/common/sys/bitmap.h | 198
-rw-r--r-- sys/cddl/contrib/opensolaris/uts/common/sys/callb.h | 215
-rw-r--r-- sys/cddl/contrib/opensolaris/uts/common/sys/ccompile.h | 127
-rw-r--r-- sys/cddl/contrib/opensolaris/uts/common/sys/cmn_err.h | 128
-rw-r--r-- sys/cddl/contrib/opensolaris/uts/common/sys/compress.h | 46
-rw-r--r-- sys/cddl/contrib/opensolaris/uts/common/sys/cpupart.h | 158
-rw-r--r-- sys/cddl/contrib/opensolaris/uts/common/sys/cpuvar.h | 830
-rw-r--r-- sys/cddl/contrib/opensolaris/uts/common/sys/cred.h | 193
-rw-r--r-- sys/cddl/contrib/opensolaris/uts/common/sys/ctf.h | 360
-rw-r--r-- sys/cddl/contrib/opensolaris/uts/common/sys/ctf_api.h | 251
-rw-r--r-- sys/cddl/contrib/opensolaris/uts/common/sys/debug.h | 159
-rw-r--r-- sys/cddl/contrib/opensolaris/uts/common/sys/dtrace.h | 2510
-rw-r--r-- sys/cddl/contrib/opensolaris/uts/common/sys/dtrace_impl.h | 1351
-rw-r--r-- sys/cddl/contrib/opensolaris/uts/common/sys/errorq.h | 83
-rw-r--r-- sys/cddl/contrib/opensolaris/uts/common/sys/extdirent.h | 77
-rw-r--r-- sys/cddl/contrib/opensolaris/uts/common/sys/fasttrap.h | 98
-rw-r--r-- sys/cddl/contrib/opensolaris/uts/common/sys/fasttrap_impl.h | 235
-rw-r--r-- sys/cddl/contrib/opensolaris/uts/common/sys/feature_tests.h | 431
-rw-r--r-- sys/cddl/contrib/opensolaris/uts/common/sys/fm/fs/zfs.h | 97
-rw-r--r-- sys/cddl/contrib/opensolaris/uts/common/sys/fm/protocol.h | 369
-rw-r--r-- sys/cddl/contrib/opensolaris/uts/common/sys/fm/util.h | 102
-rw-r--r-- sys/cddl/contrib/opensolaris/uts/common/sys/fs/zfs.h | 1248
-rw-r--r-- sys/cddl/contrib/opensolaris/uts/common/sys/fs/zut.h | 93
-rw-r--r-- sys/cddl/contrib/opensolaris/uts/common/sys/idmap.h | 97
-rw-r--r-- sys/cddl/contrib/opensolaris/uts/common/sys/isa_defs.h | 697
-rw-r--r-- sys/cddl/contrib/opensolaris/uts/common/sys/list.h | 65
-rw-r--r-- sys/cddl/contrib/opensolaris/uts/common/sys/list_impl.h | 51
-rw-r--r-- sys/cddl/contrib/opensolaris/uts/common/sys/note.h | 54
-rw-r--r-- sys/cddl/contrib/opensolaris/uts/common/sys/nvpair.h | 351
-rw-r--r-- sys/cddl/contrib/opensolaris/uts/common/sys/nvpair_impl.h | 90
-rw-r--r-- sys/cddl/contrib/opensolaris/uts/common/sys/processor.h | 140
-rw-r--r-- sys/cddl/contrib/opensolaris/uts/common/sys/procset.h | 166
-rw-r--r-- sys/cddl/contrib/opensolaris/uts/common/sys/synch.h | 162
-rw-r--r-- sys/cddl/contrib/opensolaris/uts/common/sys/sysevent.h | 289
-rw-r--r-- sys/cddl/contrib/opensolaris/uts/common/sys/sysevent/dev.h | 256
-rw-r--r-- sys/cddl/contrib/opensolaris/uts/common/sys/sysevent/eventdefs.h | 230
-rw-r--r-- sys/cddl/contrib/opensolaris/uts/common/sys/sysmacros.h | 466
-rw-r--r-- sys/cddl/contrib/opensolaris/uts/common/sys/taskq.h | 108
-rw-r--r-- sys/cddl/contrib/opensolaris/uts/common/sys/u8_textprep.h | 115
-rw-r--r-- sys/cddl/contrib/opensolaris/uts/common/sys/u8_textprep_data.h | 35376
-rw-r--r-- sys/cddl/contrib/opensolaris/uts/common/sys/vnode.h | 427
-rw-r--r-- sys/cddl/contrib/opensolaris/uts/common/sys/zmod.h | 68
-rw-r--r-- sys/cddl/contrib/opensolaris/uts/common/zmod/zmod.c | 138
-rw-r--r-- sys/cddl/contrib/opensolaris/uts/intel/dtrace/fasttrap_isa.c | 1841
-rw-r--r-- sys/cddl/contrib/opensolaris/uts/intel/sys/fasttrap_isa.h | 114
-rw-r--r-- sys/cddl/contrib/opensolaris/uts/mips/dtrace/fasttrap_isa.c | 30
-rw-r--r-- sys/cddl/contrib/opensolaris/uts/mips/sys/fasttrap_isa.h | 48
-rw-r--r-- sys/cddl/contrib/opensolaris/uts/powerpc/dtrace/fasttrap_isa.c | 548
-rw-r--r-- sys/cddl/contrib/opensolaris/uts/powerpc/sys/fasttrap_isa.h | 76
-rw-r--r-- sys/cddl/contrib/opensolaris/uts/riscv/dtrace/fasttrap_isa.c | 29
-rw-r--r-- sys/cddl/contrib/opensolaris/uts/riscv/sys/fasttrap_isa.h | 46
356 files changed, 271184 insertions, 0 deletions
diff --git a/sys/cddl/contrib/opensolaris/OPENSOLARIS.LICENSE b/sys/cddl/contrib/opensolaris/OPENSOLARIS.LICENSE
new file mode 100644
index 000000000000..da23621dc843
--- /dev/null
+++ b/sys/cddl/contrib/opensolaris/OPENSOLARIS.LICENSE
@@ -0,0 +1,384 @@
+Unless otherwise noted, all files in this distribution are released
+under the Common Development and Distribution License (CDDL).
+Exceptions are noted within the associated source files.
+
+--------------------------------------------------------------------
+
+
+COMMON DEVELOPMENT AND DISTRIBUTION LICENSE Version 1.0
+
+1. Definitions.
+
+ 1.1. "Contributor" means each individual or entity that creates
+ or contributes to the creation of Modifications.
+
+ 1.2. "Contributor Version" means the combination of the Original
+ Software, prior Modifications used by a Contributor (if any),
+ and the Modifications made by that particular Contributor.
+
+ 1.3. "Covered Software" means (a) the Original Software, or (b)
+ Modifications, or (c) the combination of files containing
+ Original Software with files containing Modifications, in
+ each case including portions thereof.
+
+ 1.4. "Executable" means the Covered Software in any form other
+ than Source Code.
+
+ 1.5. "Initial Developer" means the individual or entity that first
+ makes Original Software available under this License.
+
+ 1.6. "Larger Work" means a work which combines Covered Software or
+ portions thereof with code not governed by the terms of this
+ License.
+
+ 1.7. "License" means this document.
+
+ 1.8. "Licensable" means having the right to grant, to the maximum
+ extent possible, whether at the time of the initial grant or
+ subsequently acquired, any and all of the rights conveyed
+ herein.
+
+ 1.9. "Modifications" means the Source Code and Executable form of
+ any of the following:
+
+ A. Any file that results from an addition to, deletion from or
+ modification of the contents of a file containing Original
+ Software or previous Modifications;
+
+ B. Any new file that contains any part of the Original
+ Software or previous Modifications; or
+
+ C. Any new file that is contributed or otherwise made
+ available under the terms of this License.
+
+ 1.10. "Original Software" means the Source Code and Executable
+ form of computer software code that is originally released
+ under this License.
+
+ 1.11. "Patent Claims" means any patent claim(s), now owned or
+ hereafter acquired, including without limitation, method,
+ process, and apparatus claims, in any patent Licensable by
+ grantor.
+
+ 1.12. "Source Code" means (a) the common form of computer software
+ code in which modifications are made and (b) associated
+ documentation included in or with such code.
+
+ 1.13. "You" (or "Your") means an individual or a legal entity
+ exercising rights under, and complying with all of the terms
+ of, this License. For legal entities, "You" includes any
+ entity which controls, is controlled by, or is under common
+ control with You. For purposes of this definition,
+ "control" means (a) the power, direct or indirect, to cause
+ the direction or management of such entity, whether by
+ contract or otherwise, or (b) ownership of more than fifty
+ percent (50%) of the outstanding shares or beneficial
+ ownership of such entity.
+
+2. License Grants.
+
+ 2.1. The Initial Developer Grant.
+
+ Conditioned upon Your compliance with Section 3.1 below and
+ subject to third party intellectual property claims, the Initial
+ Developer hereby grants You a world-wide, royalty-free,
+ non-exclusive license:
+
+ (a) under intellectual property rights (other than patent or
+ trademark) Licensable by Initial Developer, to use,
+ reproduce, modify, display, perform, sublicense and
+ distribute the Original Software (or portions thereof),
+ with or without Modifications, and/or as part of a Larger
+ Work; and
+
+ (b) under Patent Claims infringed by the making, using or
+ selling of Original Software, to make, have made, use,
+ practice, sell, and offer for sale, and/or otherwise
+ dispose of the Original Software (or portions thereof).
+
+ (c) The licenses granted in Sections 2.1(a) and (b) are
+ effective on the date Initial Developer first distributes
+ or otherwise makes the Original Software available to a
+ third party under the terms of this License.
+
+ (d) Notwithstanding Section 2.1(b) above, no patent license is
+ granted: (1) for code that You delete from the Original
+ Software, or (2) for infringements caused by: (i) the
+ modification of the Original Software, or (ii) the
+ combination of the Original Software with other software
+ or devices.
+
+ 2.2. Contributor Grant.
+
+ Conditioned upon Your compliance with Section 3.1 below and
+ subject to third party intellectual property claims, each
+ Contributor hereby grants You a world-wide, royalty-free,
+ non-exclusive license:
+
+ (a) under intellectual property rights (other than patent or
+ trademark) Licensable by Contributor to use, reproduce,
+ modify, display, perform, sublicense and distribute the
+ Modifications created by such Contributor (or portions
+ thereof), either on an unmodified basis, with other
+ Modifications, as Covered Software and/or as part of a
+ Larger Work; and
+
+ (b) under Patent Claims infringed by the making, using, or
+ selling of Modifications made by that Contributor either
+ alone and/or in combination with its Contributor Version
+ (or portions of such combination), to make, use, sell,
+ offer for sale, have made, and/or otherwise dispose of:
+ (1) Modifications made by that Contributor (or portions
+ thereof); and (2) the combination of Modifications made by
+ that Contributor with its Contributor Version (or portions
+ of such combination).
+
+ (c) The licenses granted in Sections 2.2(a) and 2.2(b) are
+ effective on the date Contributor first distributes or
+ otherwise makes the Modifications available to a third
+ party.
+
+ (d) Notwithstanding Section 2.2(b) above, no patent license is
+ granted: (1) for any code that Contributor has deleted
+ from the Contributor Version; (2) for infringements caused
+ by: (i) third party modifications of Contributor Version,
+ or (ii) the combination of Modifications made by that
+ Contributor with other software (except as part of the
+ Contributor Version) or other devices; or (3) under Patent
+ Claims infringed by Covered Software in the absence of
+ Modifications made by that Contributor.
+
+3. Distribution Obligations.
+
+ 3.1. Availability of Source Code.
+
+ Any Covered Software that You distribute or otherwise make
+ available in Executable form must also be made available in Source
+ Code form and that Source Code form must be distributed only under
+ the terms of this License. You must include a copy of this
+ License with every copy of the Source Code form of the Covered
+ Software You distribute or otherwise make available. You must
+ inform recipients of any such Covered Software in Executable form
+ as to how they can obtain such Covered Software in Source Code
+ form in a reasonable manner on or through a medium customarily
+ used for software exchange.
+
+ 3.2. Modifications.
+
+ The Modifications that You create or to which You contribute are
+ governed by the terms of this License. You represent that You
+ believe Your Modifications are Your original creation(s) and/or
+ You have sufficient rights to grant the rights conveyed by this
+ License.
+
+ 3.3. Required Notices.
+
+ You must include a notice in each of Your Modifications that
+ identifies You as the Contributor of the Modification. You may
+ not remove or alter any copyright, patent or trademark notices
+ contained within the Covered Software, or any notices of licensing
+ or any descriptive text giving attribution to any Contributor or
+ the Initial Developer.
+
+ 3.4. Application of Additional Terms.
+
+ You may not offer or impose any terms on any Covered Software in
+ Source Code form that alters or restricts the applicable version
+ of this License or the recipients' rights hereunder. You may
+ choose to offer, and to charge a fee for, warranty, support,
+ indemnity or liability obligations to one or more recipients of
+ Covered Software. However, you may do so only on Your own behalf,
+ and not on behalf of the Initial Developer or any Contributor.
+ You must make it absolutely clear that any such warranty, support,
+ indemnity or liability obligation is offered by You alone, and You
+ hereby agree to indemnify the Initial Developer and every
+ Contributor for any liability incurred by the Initial Developer or
+ such Contributor as a result of warranty, support, indemnity or
+ liability terms You offer.
+
+ 3.5. Distribution of Executable Versions.
+
+ You may distribute the Executable form of the Covered Software
+ under the terms of this License or under the terms of a license of
+ Your choice, which may contain terms different from this License,
+ provided that You are in compliance with the terms of this License
+ and that the license for the Executable form does not attempt to
+ limit or alter the recipient's rights in the Source Code form from
+ the rights set forth in this License. If You distribute the
+ Covered Software in Executable form under a different license, You
+ must make it absolutely clear that any terms which differ from
+ this License are offered by You alone, not by the Initial
+ Developer or Contributor. You hereby agree to indemnify the
+ Initial Developer and every Contributor for any liability incurred
+ by the Initial Developer or such Contributor as a result of any
+ such terms You offer.
+
+ 3.6. Larger Works.
+
+ You may create a Larger Work by combining Covered Software with
+ other code not governed by the terms of this License and
+ distribute the Larger Work as a single product. In such a case,
+ You must make sure the requirements of this License are fulfilled
+ for the Covered Software.
+
+4. Versions of the License.
+
+ 4.1. New Versions.
+
+ Sun Microsystems, Inc. is the initial license steward and may
+ publish revised and/or new versions of this License from time to
+ time. Each version will be given a distinguishing version number.
+ Except as provided in Section 4.3, no one other than the license
+ steward has the right to modify this License.
+
+ 4.2. Effect of New Versions.
+
+ You may always continue to use, distribute or otherwise make the
+ Covered Software available under the terms of the version of the
+ License under which You originally received the Covered Software.
+ If the Initial Developer includes a notice in the Original
+ Software prohibiting it from being distributed or otherwise made
+ available under any subsequent version of the License, You must
+ distribute and make the Covered Software available under the terms
+ of the version of the License under which You originally received
+ the Covered Software. Otherwise, You may also choose to use,
+ distribute or otherwise make the Covered Software available under
+ the terms of any subsequent version of the License published by
+ the license steward.
+
+ 4.3. Modified Versions.
+
+ When You are an Initial Developer and You want to create a new
+ license for Your Original Software, You may create and use a
+ modified version of this License if You: (a) rename the license
+ and remove any references to the name of the license steward
+ (except to note that the license differs from this License); and
+ (b) otherwise make it clear that the license contains terms which
+ differ from this License.
+
+5. DISCLAIMER OF WARRANTY.
+
+ COVERED SOFTWARE IS PROVIDED UNDER THIS LICENSE ON AN "AS IS"
+ BASIS, WITHOUT WARRANTY OF ANY KIND, EITHER EXPRESSED OR IMPLIED,
+ INCLUDING, WITHOUT LIMITATION, WARRANTIES THAT THE COVERED
+ SOFTWARE IS FREE OF DEFECTS, MERCHANTABLE, FIT FOR A PARTICULAR
+ PURPOSE OR NON-INFRINGING. THE ENTIRE RISK AS TO THE QUALITY AND
+ PERFORMANCE OF THE COVERED SOFTWARE IS WITH YOU. SHOULD ANY
+ COVERED SOFTWARE PROVE DEFECTIVE IN ANY RESPECT, YOU (NOT THE
+ INITIAL DEVELOPER OR ANY OTHER CONTRIBUTOR) ASSUME THE COST OF ANY
+ NECESSARY SERVICING, REPAIR OR CORRECTION. THIS DISCLAIMER OF
+ WARRANTY CONSTITUTES AN ESSENTIAL PART OF THIS LICENSE. NO USE OF
+ ANY COVERED SOFTWARE IS AUTHORIZED HEREUNDER EXCEPT UNDER THIS
+ DISCLAIMER.
+
+6. TERMINATION.
+
+ 6.1. This License and the rights granted hereunder will terminate
+ automatically if You fail to comply with terms herein and fail to
+ cure such breach within 30 days of becoming aware of the breach.
+ Provisions which, by their nature, must remain in effect beyond
+ the termination of this License shall survive.
+
+ 6.2. If You assert a patent infringement claim (excluding
+ declaratory judgment actions) against Initial Developer or a
+ Contributor (the Initial Developer or Contributor against whom You
+ assert such claim is referred to as "Participant") alleging that
+ the Participant Software (meaning the Contributor Version where
+ the Participant is a Contributor or the Original Software where
+ the Participant is the Initial Developer) directly or indirectly
+ infringes any patent, then any and all rights granted directly or
+ indirectly to You by such Participant, the Initial Developer (if
+ the Initial Developer is not the Participant) and all Contributors
+ under Sections 2.1 and/or 2.2 of this License shall, upon 60 days
+ notice from Participant terminate prospectively and automatically
+ at the expiration of such 60 day notice period, unless if within
+ such 60 day period You withdraw Your claim with respect to the
+ Participant Software against such Participant either unilaterally
+ or pursuant to a written agreement with Participant.
+
+ 6.3. In the event of termination under Sections 6.1 or 6.2 above,
+ all end user licenses that have been validly granted by You or any
+ distributor hereunder prior to termination (excluding licenses
+ granted to You by any distributor) shall survive termination.
+
+7. LIMITATION OF LIABILITY.
+
+ UNDER NO CIRCUMSTANCES AND UNDER NO LEGAL THEORY, WHETHER TORT
+ (INCLUDING NEGLIGENCE), CONTRACT, OR OTHERWISE, SHALL YOU, THE
+ INITIAL DEVELOPER, ANY OTHER CONTRIBUTOR, OR ANY DISTRIBUTOR OF
+ COVERED SOFTWARE, OR ANY SUPPLIER OF ANY OF SUCH PARTIES, BE
+ LIABLE TO ANY PERSON FOR ANY INDIRECT, SPECIAL, INCIDENTAL, OR
+ CONSEQUENTIAL DAMAGES OF ANY CHARACTER INCLUDING, WITHOUT
+ LIMITATION, DAMAGES FOR LOST PROFITS, LOSS OF GOODWILL, WORK
+ STOPPAGE, COMPUTER FAILURE OR MALFUNCTION, OR ANY AND ALL OTHER
+ COMMERCIAL DAMAGES OR LOSSES, EVEN IF SUCH PARTY SHALL HAVE BEEN
+ INFORMED OF THE POSSIBILITY OF SUCH DAMAGES. THIS LIMITATION OF
+ LIABILITY SHALL NOT APPLY TO LIABILITY FOR DEATH OR PERSONAL
+ INJURY RESULTING FROM SUCH PARTY'S NEGLIGENCE TO THE EXTENT
+ APPLICABLE LAW PROHIBITS SUCH LIMITATION. SOME JURISDICTIONS DO
+ NOT ALLOW THE EXCLUSION OR LIMITATION OF INCIDENTAL OR
+ CONSEQUENTIAL DAMAGES, SO THIS EXCLUSION AND LIMITATION MAY NOT
+ APPLY TO YOU.
+
+8. U.S. GOVERNMENT END USERS.
+
+ The Covered Software is a "commercial item," as that term is
+ defined in 48 C.F.R. 2.101 (Oct. 1995), consisting of "commercial
+ computer software" (as that term is defined at 48
+ C.F.R. 252.227-7014(a)(1)) and "commercial computer software
+ documentation" as such terms are used in 48 C.F.R. 12.212
+ (Sept. 1995). Consistent with 48 C.F.R. 12.212 and 48
+ C.F.R. 227.7202-1 through 227.7202-4 (June 1995), all
+ U.S. Government End Users acquire Covered Software with only those
+ rights set forth herein. This U.S. Government Rights clause is in
+ lieu of, and supersedes, any other FAR, DFAR, or other clause or
+ provision that addresses Government rights in computer software
+ under this License.
+
+9. MISCELLANEOUS.
+
+ This License represents the complete agreement concerning subject
+ matter hereof. If any provision of this License is held to be
+ unenforceable, such provision shall be reformed only to the extent
+ necessary to make it enforceable. This License shall be governed
+ by the law of the jurisdiction specified in a notice contained
+ within the Original Software (except to the extent applicable law,
+ if any, provides otherwise), excluding such jurisdiction's
+ conflict-of-law provisions. Any litigation relating to this
+ License shall be subject to the jurisdiction of the courts located
+ in the jurisdiction and venue specified in a notice contained
+ within the Original Software, with the losing party responsible
+ for costs, including, without limitation, court costs and
+ reasonable attorneys' fees and expenses. The application of the
+ United Nations Convention on Contracts for the International Sale
+ of Goods is expressly excluded. Any law or regulation which
+ provides that the language of a contract shall be construed
+ against the drafter shall not apply to this License. You agree
+ that You alone are responsible for compliance with the United
+ States export administration regulations (and the export control
+ laws and regulation of any other countries) when You use,
+ distribute or otherwise make available any Covered Software.
+
+10. RESPONSIBILITY FOR CLAIMS.
+
+ As between Initial Developer and the Contributors, each party is
+ responsible for claims and damages arising, directly or
+ indirectly, out of its utilization of rights under this License
+ and You agree to work with Initial Developer and Contributors to
+ distribute such responsibility on an equitable basis. Nothing
+ herein is intended or shall be deemed to constitute any admission
+ of liability.
+
+--------------------------------------------------------------------
+
+NOTICE PURSUANT TO SECTION 9 OF THE COMMON DEVELOPMENT AND
+DISTRIBUTION LICENSE (CDDL)
+
+For Covered Software in this distribution, this License shall
+be governed by the laws of the State of California (excluding
+conflict-of-law provisions).
+
+Any litigation relating to this License shall be subject to the
+jurisdiction of the Federal Courts of the Northern District of
+California and the state courts of the State of California, with
+venue lying in Santa Clara County, California.
diff --git a/sys/cddl/contrib/opensolaris/common/acl/acl_common.c b/sys/cddl/contrib/opensolaris/common/acl/acl_common.c
new file mode 100644
index 000000000000..a681905579c6
--- /dev/null
+++ b/sys/cddl/contrib/opensolaris/common/acl/acl_common.c
@@ -0,0 +1,1765 @@
+/*
+ * CDDL HEADER START
+ *
+ * The contents of this file are subject to the terms of the
+ * Common Development and Distribution License (the "License").
+ * You may not use this file except in compliance with the License.
+ *
+ * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
+ * or http://www.opensolaris.org/os/licensing.
+ * See the License for the specific language governing permissions
+ * and limitations under the License.
+ *
+ * When distributing Covered Code, include this CDDL HEADER in each
+ * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
+ * If applicable, add the following below this CDDL HEADER, with the
+ * fields enclosed by brackets "[]" replaced with your own identifying
+ * information: Portions Copyright [yyyy] [name of copyright owner]
+ *
+ * CDDL HEADER END
+ */
+/*
+ * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved.
+ * Copyright 2011 Nexenta Systems, Inc. All rights reserved.
+ */
+
+#include <sys/types.h>
+#include <sys/stat.h>
+#include <sys/avl.h>
+#include <sys/misc.h>
+#if defined(_KERNEL)
+#include <sys/kmem.h>
+#include <sys/systm.h>
+#include <sys/sysmacros.h>
+#include <acl/acl_common.h>
+#include <sys/debug.h>
+#else
+#include <errno.h>
+#include <stdlib.h>
+#include <stddef.h>
+#include <strings.h>
+#include <unistd.h>
+#include <assert.h>
+#include <grp.h>
+#include <pwd.h>
+#include <acl_common.h>
+#define ASSERT assert
+#endif
+
+#define ACE_POSIX_SUPPORTED_BITS (ACE_READ_DATA | \
+ ACE_WRITE_DATA | ACE_APPEND_DATA | ACE_EXECUTE | \
+ ACE_READ_ATTRIBUTES | ACE_READ_ACL | ACE_WRITE_ACL)
+
+
+#define ACL_SYNCHRONIZE_SET_DENY 0x0000001
+#define ACL_SYNCHRONIZE_SET_ALLOW 0x0000002
+#define ACL_SYNCHRONIZE_ERR_DENY 0x0000004
+#define ACL_SYNCHRONIZE_ERR_ALLOW 0x0000008
+
+#define ACL_WRITE_OWNER_SET_DENY 0x0000010
+#define ACL_WRITE_OWNER_SET_ALLOW 0x0000020
+#define ACL_WRITE_OWNER_ERR_DENY 0x0000040
+#define ACL_WRITE_OWNER_ERR_ALLOW 0x0000080
+
+#define ACL_DELETE_SET_DENY 0x0000100
+#define ACL_DELETE_SET_ALLOW 0x0000200
+#define ACL_DELETE_ERR_DENY 0x0000400
+#define ACL_DELETE_ERR_ALLOW 0x0000800
+
+#define ACL_WRITE_ATTRS_OWNER_SET_DENY 0x0001000
+#define ACL_WRITE_ATTRS_OWNER_SET_ALLOW 0x0002000
+#define ACL_WRITE_ATTRS_OWNER_ERR_DENY 0x0004000
+#define ACL_WRITE_ATTRS_OWNER_ERR_ALLOW 0x0008000
+
+#define ACL_WRITE_ATTRS_WRITER_SET_DENY 0x0010000
+#define ACL_WRITE_ATTRS_WRITER_SET_ALLOW 0x0020000
+#define ACL_WRITE_ATTRS_WRITER_ERR_DENY 0x0040000
+#define ACL_WRITE_ATTRS_WRITER_ERR_ALLOW 0x0080000
+
+#define ACL_WRITE_NAMED_WRITER_SET_DENY 0x0100000
+#define ACL_WRITE_NAMED_WRITER_SET_ALLOW 0x0200000
+#define ACL_WRITE_NAMED_WRITER_ERR_DENY 0x0400000
+#define ACL_WRITE_NAMED_WRITER_ERR_ALLOW 0x0800000
+
+#define ACL_READ_NAMED_READER_SET_DENY 0x1000000
+#define ACL_READ_NAMED_READER_SET_ALLOW 0x2000000
+#define ACL_READ_NAMED_READER_ERR_DENY 0x4000000
+#define ACL_READ_NAMED_READER_ERR_ALLOW 0x8000000
+
+
+#define ACE_VALID_MASK_BITS (\
+ ACE_READ_DATA | \
+ ACE_LIST_DIRECTORY | \
+ ACE_WRITE_DATA | \
+ ACE_ADD_FILE | \
+ ACE_APPEND_DATA | \
+ ACE_ADD_SUBDIRECTORY | \
+ ACE_READ_NAMED_ATTRS | \
+ ACE_WRITE_NAMED_ATTRS | \
+ ACE_EXECUTE | \
+ ACE_DELETE_CHILD | \
+ ACE_READ_ATTRIBUTES | \
+ ACE_WRITE_ATTRIBUTES | \
+ ACE_DELETE | \
+ ACE_READ_ACL | \
+ ACE_WRITE_ACL | \
+ ACE_WRITE_OWNER | \
+ ACE_SYNCHRONIZE)
+
+#define ACE_MASK_UNDEFINED 0x80000000
+
+#define ACE_VALID_FLAG_BITS (ACE_FILE_INHERIT_ACE | \
+ ACE_DIRECTORY_INHERIT_ACE | \
+ ACE_NO_PROPAGATE_INHERIT_ACE | ACE_INHERIT_ONLY_ACE | \
+ ACE_SUCCESSFUL_ACCESS_ACE_FLAG | ACE_FAILED_ACCESS_ACE_FLAG | \
+ ACE_IDENTIFIER_GROUP | ACE_OWNER | ACE_GROUP | ACE_EVERYONE)
+
+/*
+ * ACL conversion helpers
+ */
+
+typedef enum {
+ ace_unused,
+ ace_user_obj,
+ ace_user,
+ ace_group, /* includes GROUP and GROUP_OBJ */
+ ace_other_obj
+} ace_to_aent_state_t;
+
+typedef struct acevals {
+ uid_t key;
+ avl_node_t avl;
+ uint32_t mask;
+ uint32_t allowed;
+ uint32_t denied;
+ int aent_type;
+} acevals_t;
+
+typedef struct ace_list {
+ acevals_t user_obj;
+ avl_tree_t user;
+ int numusers;
+ acevals_t group_obj;
+ avl_tree_t group;
+ int numgroups;
+ acevals_t other_obj;
+ uint32_t acl_mask;
+ int hasmask;
+ int dfacl_flag;
+ ace_to_aent_state_t state;
+ int seen; /* bitmask of all aclent_t a_type values seen */
+} ace_list_t;
+
+/*
+ * Generic shellsort, from K&R (1st ed, p 58), somewhat modified.
+ * v = ptr to array/vector of objs
+ * n = # objs in the array
+ * s = size of each obj (must be a multiple of the word size)
+ * f = ptr to function to compare two objs;
+ *     returns -1 (less than), 0 (equal), or 1 (greater than)
+ */
+void
+ksort(caddr_t v, int n, int s, int (*f)())
+{
+ int g, i, j, ii;
+ unsigned int *p1, *p2;
+ unsigned int tmp;
+
+ /* No work to do */
+ if (v == NULL || n <= 1)
+ return;
+
+ /* Sanity check on arguments */
+ ASSERT(((uintptr_t)v & 0x3) == 0 && (s & 0x3) == 0);
+ ASSERT(s > 0);
+ for (g = n / 2; g > 0; g /= 2) {
+ for (i = g; i < n; i++) {
+ for (j = i - g; j >= 0 &&
+ (*f)(v + j * s, v + (j + g) * s) == 1;
+ j -= g) {
+ p1 = (void *)(v + j * s);
+ p2 = (void *)(v + (j + g) * s);
+ for (ii = 0; ii < s / 4; ii++) {
+ tmp = *p1;
+ *p1++ = *p2;
+ *p2++ = tmp;
+ }
+ }
+ }
+ }
+}
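+
+/*
+ * For example, ln_aent_to_ace() below sorts its aclent_t source
+ * array with
+ *
+ *	ksort((caddr_t)aclent, n, sizeof (aclent_t), cmp2acls);
+ *
+ * where sizeof (aclent_t), which contains an int, is a multiple
+ * of 4 and so satisfies the size ASSERT above.
+ */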
+
+/*
+ * Compare two acls, all fields. Returns:
+ * -1 (less than)
+ * 0 (equal)
+ * +1 (greater than)
+ */
+int
+cmp2acls(void *a, void *b)
+{
+ aclent_t *x = (aclent_t *)a;
+ aclent_t *y = (aclent_t *)b;
+
+ /* Compare types */
+ if (x->a_type < y->a_type)
+ return (-1);
+ if (x->a_type > y->a_type)
+ return (1);
+ /* Equal types; compare id's */
+ if (x->a_id < y->a_id)
+ return (-1);
+ if (x->a_id > y->a_id)
+ return (1);
+ /* Equal ids; compare perms */
+ if (x->a_perm < y->a_perm)
+ return (-1);
+ if (x->a_perm > y->a_perm)
+ return (1);
+ /* Totally equal */
+ return (0);
+}
+
+/*ARGSUSED*/
+static void *
+cacl_realloc(void *ptr, size_t size, size_t new_size)
+{
+#if defined(_KERNEL)
+ void *tmp;
+
+ tmp = kmem_alloc(new_size, KM_SLEEP);
+ (void) memcpy(tmp, ptr, (size < new_size) ? size : new_size);
+ kmem_free(ptr, size);
+ return (tmp);
+#else
+ return (realloc(ptr, new_size));
+#endif
+}
+
+static int
+cacl_malloc(void **ptr, size_t size)
+{
+#if defined(_KERNEL)
+ *ptr = kmem_zalloc(size, KM_SLEEP);
+ return (0);
+#else
+ *ptr = calloc(1, size);
+ if (*ptr == NULL)
+ return (errno);
+
+ return (0);
+#endif
+}
+
+/*ARGSUSED*/
+static void
+cacl_free(void *ptr, size_t size)
+{
+#if defined(_KERNEL)
+ kmem_free(ptr, size);
+#else
+ free(ptr);
+#endif
+}
+
+#if !defined(_KERNEL)
+acl_t *
+acl_alloc(enum acl_type type)
+{
+ acl_t *aclp;
+
+ if (cacl_malloc((void **)&aclp, sizeof (acl_t)) != 0)
+ return (NULL);
+
+ aclp->acl_aclp = NULL;
+ aclp->acl_cnt = 0;
+
+ switch (type) {
+ case ACE_T:
+ aclp->acl_type = ACE_T;
+ aclp->acl_entry_size = sizeof (ace_t);
+ break;
+ case ACLENT_T:
+ aclp->acl_type = ACLENT_T;
+ aclp->acl_entry_size = sizeof (aclent_t);
+ break;
+ default:
+ acl_free(aclp);
+ aclp = NULL;
+ }
+ return (aclp);
+}
+
+/*
+ * Free acl_t structure
+ */
+void
+acl_free(acl_t *aclp)
+{
+ int acl_size;
+
+ if (aclp == NULL)
+ return;
+
+ if (aclp->acl_aclp) {
+ acl_size = aclp->acl_cnt * aclp->acl_entry_size;
+ cacl_free(aclp->acl_aclp, acl_size);
+ }
+
+ cacl_free(aclp, sizeof (acl_t));
+}
+
+static uint32_t
+access_mask_set(int haswriteperm, int hasreadperm, int isowner, int isallow)
+{
+ uint32_t access_mask = 0;
+ int acl_produce;
+ int synchronize_set = 0, write_owner_set = 0;
+ int delete_set = 0, write_attrs_set = 0;
+ int read_named_set = 0, write_named_set = 0;
+
+ acl_produce = (ACL_SYNCHRONIZE_SET_ALLOW |
+ ACL_WRITE_ATTRS_OWNER_SET_ALLOW |
+ ACL_WRITE_ATTRS_WRITER_SET_DENY);
+
+ if (isallow) {
+ synchronize_set = ACL_SYNCHRONIZE_SET_ALLOW;
+ write_owner_set = ACL_WRITE_OWNER_SET_ALLOW;
+ delete_set = ACL_DELETE_SET_ALLOW;
+ if (hasreadperm)
+ read_named_set = ACL_READ_NAMED_READER_SET_ALLOW;
+ if (haswriteperm)
+ write_named_set = ACL_WRITE_NAMED_WRITER_SET_ALLOW;
+ if (isowner)
+ write_attrs_set = ACL_WRITE_ATTRS_OWNER_SET_ALLOW;
+ else if (haswriteperm)
+ write_attrs_set = ACL_WRITE_ATTRS_WRITER_SET_ALLOW;
+ } else {
+
+ synchronize_set = ACL_SYNCHRONIZE_SET_DENY;
+ write_owner_set = ACL_WRITE_OWNER_SET_DENY;
+ delete_set = ACL_DELETE_SET_DENY;
+ if (hasreadperm)
+ read_named_set = ACL_READ_NAMED_READER_SET_DENY;
+ if (haswriteperm)
+ write_named_set = ACL_WRITE_NAMED_WRITER_SET_DENY;
+ if (isowner)
+ write_attrs_set = ACL_WRITE_ATTRS_OWNER_SET_DENY;
+ else if (haswriteperm)
+ write_attrs_set = ACL_WRITE_ATTRS_WRITER_SET_DENY;
+ else
+ /*
+ * If the entity is not the owner and does not
+ * have write permissions ACE_WRITE_ATTRIBUTES will
+ * always go in the DENY ACE.
+ */
+ access_mask |= ACE_WRITE_ATTRIBUTES;
+ }
+
+ if (acl_produce & synchronize_set)
+ access_mask |= ACE_SYNCHRONIZE;
+ if (acl_produce & write_owner_set)
+ access_mask |= ACE_WRITE_OWNER;
+ if (acl_produce & delete_set)
+ access_mask |= ACE_DELETE;
+ if (acl_produce & write_attrs_set)
+ access_mask |= ACE_WRITE_ATTRIBUTES;
+ if (acl_produce & read_named_set)
+ access_mask |= ACE_READ_NAMED_ATTRS;
+ if (acl_produce & write_named_set)
+ access_mask |= ACE_WRITE_NAMED_ATTRS;
+
+ return (access_mask);
+}
+
+/*
+ * Given a mode_t, convert it into an access_mask as used
+ * by nfsace, assuming aclent_t -> nfsace semantics.
+ */
+static uint32_t
+mode_to_ace_access(mode_t mode, boolean_t isdir, int isowner, int isallow)
+{
+ uint32_t access = 0;
+ int haswriteperm = 0;
+ int hasreadperm = 0;
+
+ if (isallow) {
+ haswriteperm = (mode & S_IWOTH);
+ hasreadperm = (mode & S_IROTH);
+ } else {
+ haswriteperm = !(mode & S_IWOTH);
+ hasreadperm = !(mode & S_IROTH);
+ }
+
+ /*
+ * The following call takes care of correctly setting the following
+ * mask bits in the access_mask:
+ * ACE_SYNCHRONIZE, ACE_WRITE_OWNER, ACE_DELETE,
+ * ACE_WRITE_ATTRIBUTES, ACE_WRITE_NAMED_ATTRS, ACE_READ_NAMED_ATTRS
+ */
+ access = access_mask_set(haswriteperm, hasreadperm, isowner, isallow);
+
+ if (isallow) {
+ access |= ACE_READ_ACL | ACE_READ_ATTRIBUTES;
+ if (isowner)
+ access |= ACE_WRITE_ACL;
+ } else {
+ if (! isowner)
+ access |= ACE_WRITE_ACL;
+ }
+
+ /* read */
+ if (mode & S_IROTH) {
+ access |= ACE_READ_DATA;
+ }
+ /* write */
+ if (mode & S_IWOTH) {
+ access |= ACE_WRITE_DATA |
+ ACE_APPEND_DATA;
+ if (isdir)
+ access |= ACE_DELETE_CHILD;
+ }
+ /* exec */
+ if (mode & S_IXOTH) {
+ access |= ACE_EXECUTE;
+ }
+
+ return (access);
+}
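+
+/*
+ * For example, mode_to_ace_access(07, B_FALSE, 1, 1), an ALLOW
+ * entry for an owner's rwx bits on a plain file, returns
+ * ACE_READ_DATA | ACE_WRITE_DATA | ACE_APPEND_DATA | ACE_EXECUTE |
+ * ACE_READ_ACL | ACE_READ_ATTRIBUTES | ACE_WRITE_ACL |
+ * ACE_SYNCHRONIZE | ACE_WRITE_ATTRIBUTES, the last two supplied
+ * by access_mask_set() above.
+ */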
+
+/*
+ * Given an nfsace (presumably an ALLOW entry), make a
+ * corresponding DENY entry at the address given.
+ */
+static void
+ace_make_deny(ace_t *allow, ace_t *deny, int isdir, int isowner)
+{
+ (void) memcpy(deny, allow, sizeof (ace_t));
+
+ deny->a_who = allow->a_who;
+
+ deny->a_type = ACE_ACCESS_DENIED_ACE_TYPE;
+ deny->a_access_mask ^= ACE_POSIX_SUPPORTED_BITS;
+ if (isdir)
+ deny->a_access_mask ^= ACE_DELETE_CHILD;
+
+ deny->a_access_mask &= ~(ACE_SYNCHRONIZE | ACE_WRITE_OWNER |
+ ACE_DELETE | ACE_WRITE_ATTRIBUTES | ACE_READ_NAMED_ATTRS |
+ ACE_WRITE_NAMED_ATTRS);
+ deny->a_access_mask |= access_mask_set((allow->a_access_mask &
+ ACE_WRITE_DATA), (allow->a_access_mask & ACE_READ_DATA), isowner,
+ B_FALSE);
+}
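+
+/*
+ * For example, given the ALLOW entry built by
+ * mode_to_ace_access(04, B_FALSE, 0, 1) (r-- for a non-owner on a
+ * plain file), the DENY entry produced here carries
+ * ACE_WRITE_DATA | ACE_APPEND_DATA | ACE_EXECUTE | ACE_WRITE_ACL |
+ * ACE_WRITE_ATTRIBUTES.
+ */
+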
+/*
+ * Make an initial pass over an array of aclent_t's. Gather
+ * information such as an ACL_MASK (if any), number of users,
+ * number of groups, and whether the array needs to be sorted.
+ */
+static int
+ln_aent_preprocess(aclent_t *aclent, int n,
+ int *hasmask, mode_t *mask,
+ int *numuser, int *numgroup, int *needsort)
+{
+ int error = 0;
+ int i;
+ int curtype = 0;
+
+ *hasmask = 0;
+ *mask = 07;
+ *needsort = 0;
+ *numuser = 0;
+ *numgroup = 0;
+
+ for (i = 0; i < n; i++) {
+ if (aclent[i].a_type < curtype)
+ *needsort = 1;
+ else if (aclent[i].a_type > curtype)
+ curtype = aclent[i].a_type;
+ if (aclent[i].a_type & USER)
+ (*numuser)++;
+ if (aclent[i].a_type & (GROUP | GROUP_OBJ))
+ (*numgroup)++;
+ if (aclent[i].a_type & CLASS_OBJ) {
+ if (*hasmask) {
+ error = EINVAL;
+ goto out;
+ } else {
+ *hasmask = 1;
+ *mask = aclent[i].a_perm;
+ }
+ }
+ }
+
+ if ((! *hasmask) && (*numuser + *numgroup > 1)) {
+ error = EINVAL;
+ goto out;
+ }
+
+out:
+ return (error);
+}
+
+/*
+ * Convert an array of aclent_t into an array of nfsace entries,
+ * following POSIX draft -> nfsv4 conversion semantics as outlined in
+ * the IETF draft.
+ */
+static int
+ln_aent_to_ace(aclent_t *aclent, int n, ace_t **acepp, int *rescount, int isdir)
+{
+ int error = 0;
+ mode_t mask;
+ int numuser, numgroup, needsort;
+ int resultsize = 0;
+ int i, groupi = 0, skip;
+ ace_t *acep, *result = NULL;
+ int hasmask;
+
+ error = ln_aent_preprocess(aclent, n, &hasmask, &mask,
+ &numuser, &numgroup, &needsort);
+ if (error != 0)
+ goto out;
+
+ /* allow + deny for each aclent */
+ resultsize = n * 2;
+ if (hasmask) {
+ /*
+ * stick extra deny on the group_obj and on each
+ * user|group for the mask (the group_obj was added
+ * into the count for numgroup)
+ */
+ resultsize += numuser + numgroup;
+ /* ... and don't count the mask itself */
+ resultsize -= 2;
+ }
+
+ /* sort the source if necessary */
+ if (needsort)
+ ksort((caddr_t)aclent, n, sizeof (aclent_t), cmp2acls);
+
+	if (cacl_malloc((void **)&result, resultsize * sizeof (ace_t)) != 0) {
+		error = ENOMEM;
+		goto out;
+	}
+
+ acep = result;
+
+ for (i = 0; i < n; i++) {
+ /*
+ * don't process CLASS_OBJ (mask); mask was grabbed in
+ * ln_aent_preprocess()
+ */
+ if (aclent[i].a_type & CLASS_OBJ)
+ continue;
+
+ /* If we need an ACL_MASK emulator, prepend it now */
+ if ((hasmask) &&
+ (aclent[i].a_type & (USER | GROUP | GROUP_OBJ))) {
+ acep->a_type = ACE_ACCESS_DENIED_ACE_TYPE;
+ acep->a_flags = 0;
+ if (aclent[i].a_type & GROUP_OBJ) {
+ acep->a_who = (uid_t)-1;
+ acep->a_flags |=
+ (ACE_IDENTIFIER_GROUP|ACE_GROUP);
+ } else if (aclent[i].a_type & USER) {
+ acep->a_who = aclent[i].a_id;
+ } else {
+ acep->a_who = aclent[i].a_id;
+ acep->a_flags |= ACE_IDENTIFIER_GROUP;
+ }
+ if (aclent[i].a_type & ACL_DEFAULT) {
+ acep->a_flags |= ACE_INHERIT_ONLY_ACE |
+ ACE_FILE_INHERIT_ACE |
+ ACE_DIRECTORY_INHERIT_ACE;
+ }
+ /*
+ * Set the access mask for the prepended deny
+ * ace. To do this, we invert the mask (found
+			 * in ln_aent_preprocess()) then convert it to a
+			 * DENY ace access_mask.
+ */
+ acep->a_access_mask = mode_to_ace_access((mask ^ 07),
+ isdir, 0, 0);
+ acep += 1;
+ }
+
+ /* handle a_perm -> access_mask */
+ acep->a_access_mask = mode_to_ace_access(aclent[i].a_perm,
+ isdir, aclent[i].a_type & USER_OBJ, 1);
+
+ /* emulate a default aclent */
+ if (aclent[i].a_type & ACL_DEFAULT) {
+ acep->a_flags |= ACE_INHERIT_ONLY_ACE |
+ ACE_FILE_INHERIT_ACE |
+ ACE_DIRECTORY_INHERIT_ACE;
+ }
+
+ /*
+ * handle a_perm and a_id
+ *
+ * this must be done last, since it involves the
+ * corresponding deny aces, which are handled
+ * differently for each different a_type.
+ */
+ if (aclent[i].a_type & USER_OBJ) {
+ acep->a_who = (uid_t)-1;
+ acep->a_flags |= ACE_OWNER;
+ ace_make_deny(acep, acep + 1, isdir, B_TRUE);
+ acep += 2;
+ } else if (aclent[i].a_type & USER) {
+ acep->a_who = aclent[i].a_id;
+ ace_make_deny(acep, acep + 1, isdir, B_FALSE);
+ acep += 2;
+ } else if (aclent[i].a_type & (GROUP_OBJ | GROUP)) {
+ if (aclent[i].a_type & GROUP_OBJ) {
+ acep->a_who = (uid_t)-1;
+ acep->a_flags |= ACE_GROUP;
+ } else {
+ acep->a_who = aclent[i].a_id;
+ }
+ acep->a_flags |= ACE_IDENTIFIER_GROUP;
+ /*
+ * Set the corresponding deny for the group ace.
+ *
+ * The deny aces go after all of the groups, unlike
+ * everything else, where they immediately follow
+ * the allow ace.
+ *
+ * We calculate "skip", the number of slots to
+ * skip ahead for the deny ace, here.
+ *
+ * The pattern is:
+ * MD1 A1 MD2 A2 MD3 A3 D1 D2 D3
+ * thus, skip is
+ * (2 * numgroup) - 1 - groupi
+ * (2 * numgroup) to account for MD + A
+ * - 1 to account for the fact that we're on the
+ * access (A), not the mask (MD)
+ * - groupi to account for the fact that we have
+ * passed up groupi number of MD's.
+ */
+ skip = (2 * numgroup) - 1 - groupi;
+ ace_make_deny(acep, acep + skip, isdir, B_FALSE);
+ /*
+ * If we just did the last group, skip acep past
+ * all of the denies; else, just move ahead one.
+ */
+ if (++groupi >= numgroup)
+ acep += numgroup + 1;
+ else
+ acep += 1;
+ } else if (aclent[i].a_type & OTHER_OBJ) {
+ acep->a_who = (uid_t)-1;
+ acep->a_flags |= ACE_EVERYONE;
+ ace_make_deny(acep, acep + 1, isdir, B_FALSE);
+ acep += 2;
+ } else {
+ error = EINVAL;
+ goto out;
+ }
+ }
+
+ *acepp = result;
+ *rescount = resultsize;
+
+out:
+ if (error != 0) {
+ if ((result != NULL) && (resultsize > 0)) {
+ cacl_free(result, resultsize * sizeof (ace_t));
+ }
+ }
+
+ return (error);
+}
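+
+/*
+ * Worked example: a minimal ACL of user::rw-, group::r--, other::r--
+ * (no CLASS_OBJ mask) converts to six ACEs, in order:
+ *
+ *	owner@ ALLOW, owner@ DENY,
+ *	group@ ALLOW, group@ DENY,
+ *	everyone@ ALLOW, everyone@ DENY
+ *
+ * With a mask present, each user/group/group_obj ALLOW is instead
+ * preceded by a mask-emulating DENY, and the per-group DENYs are
+ * collected after the last group ALLOW, as described in the
+ * "MD1 A1 ..." pattern above.
+ */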
+
+static int
+convert_aent_to_ace(aclent_t *aclentp, int aclcnt, boolean_t isdir,
+ ace_t **retacep, int *retacecnt)
+{
+ ace_t *acep;
+ ace_t *dfacep;
+ int acecnt = 0;
+ int dfacecnt = 0;
+ int dfaclstart = 0;
+ int dfaclcnt = 0;
+ aclent_t *aclp;
+ int i;
+ int error;
+ int acesz, dfacesz;
+
+ ksort((caddr_t)aclentp, aclcnt, sizeof (aclent_t), cmp2acls);
+
+ for (i = 0, aclp = aclentp; i < aclcnt; aclp++, i++) {
+ if (aclp->a_type & ACL_DEFAULT)
+ break;
+ }
+
+ if (i < aclcnt) {
+ dfaclstart = i;
+ dfaclcnt = aclcnt - i;
+ }
+
+ if (dfaclcnt && !isdir) {
+ return (EINVAL);
+ }
+
+ error = ln_aent_to_ace(aclentp, i, &acep, &acecnt, isdir);
+ if (error)
+ return (error);
+
+ if (dfaclcnt) {
+ error = ln_aent_to_ace(&aclentp[dfaclstart], dfaclcnt,
+ &dfacep, &dfacecnt, isdir);
+ if (error) {
+ if (acep) {
+ cacl_free(acep, acecnt * sizeof (ace_t));
+ }
+ return (error);
+ }
+ }
+
+ if (dfacecnt != 0) {
+ acesz = sizeof (ace_t) * acecnt;
+ dfacesz = sizeof (ace_t) * dfacecnt;
+ acep = cacl_realloc(acep, acesz, acesz + dfacesz);
+ if (acep == NULL)
+ return (ENOMEM);
+ if (dfaclcnt) {
+ (void) memcpy(acep + acecnt, dfacep, dfacesz);
+ }
+ }
+ if (dfaclcnt)
+ cacl_free(dfacep, dfacecnt * sizeof (ace_t));
+
+ *retacecnt = acecnt + dfacecnt;
+ *retacep = acep;
+ return (0);
+}
+
+static int
+ace_mask_to_mode(uint32_t mask, o_mode_t *modep, boolean_t isdir)
+{
+ int error = 0;
+ o_mode_t mode = 0;
+ uint32_t bits, wantbits;
+
+ /* read */
+ if (mask & ACE_READ_DATA)
+ mode |= S_IROTH;
+
+ /* write */
+ wantbits = (ACE_WRITE_DATA | ACE_APPEND_DATA);
+ if (isdir)
+ wantbits |= ACE_DELETE_CHILD;
+ bits = mask & wantbits;
+ if (bits != 0) {
+ if (bits != wantbits) {
+ error = ENOTSUP;
+ goto out;
+ }
+ mode |= S_IWOTH;
+ }
+
+ /* exec */
+ if (mask & ACE_EXECUTE) {
+ mode |= S_IXOTH;
+ }
+
+ *modep = mode;
+
+out:
+ return (error);
+}
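+
+/*
+ * Example: the write bits are all-or-nothing here, so a mask carrying
+ * ACE_WRITE_DATA without ACE_APPEND_DATA (or, for a directory, without
+ * ACE_DELETE_CHILD) has no aclent_t equivalent and fails with ENOTSUP
+ * rather than silently dropping permissions.
+ */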
+
+static void
+acevals_init(acevals_t *vals, uid_t key)
+{
+ bzero(vals, sizeof (*vals));
+ vals->allowed = ACE_MASK_UNDEFINED;
+ vals->denied = ACE_MASK_UNDEFINED;
+ vals->mask = ACE_MASK_UNDEFINED;
+ vals->key = key;
+}
+
+static void
+ace_list_init(ace_list_t *al, int dfacl_flag)
+{
+ acevals_init(&al->user_obj, 0);
+ acevals_init(&al->group_obj, 0);
+ acevals_init(&al->other_obj, 0);
+ al->numusers = 0;
+ al->numgroups = 0;
+ al->acl_mask = 0;
+ al->hasmask = 0;
+ al->state = ace_unused;
+ al->seen = 0;
+ al->dfacl_flag = dfacl_flag;
+}
+
+/*
+ * Find or create an acevals holder for a given id and avl tree.
+ *
+ * Note that only one thread will ever touch these avl trees, so
+ * there is no need for locking.
+ */
+static acevals_t *
+acevals_find(ace_t *ace, avl_tree_t *avl, int *num)
+{
+ acevals_t key, *rc;
+ avl_index_t where;
+
+ key.key = ace->a_who;
+ rc = avl_find(avl, &key, &where);
+ if (rc != NULL)
+ return (rc);
+
+ /* this memory is freed by ln_ace_to_aent()->ace_list_free() */
+ if (cacl_malloc((void **)&rc, sizeof (acevals_t)) != 0)
+ return (NULL);
+
+ acevals_init(rc, ace->a_who);
+ avl_insert(avl, rc, where);
+ (*num)++;
+
+ return (rc);
+}
+
+static int
+access_mask_check(ace_t *acep, int mask_bit, int isowner)
+{
+ int set_deny, err_deny;
+ int set_allow, err_allow;
+ int acl_consume;
+ int haswriteperm, hasreadperm;
+
+ if (acep->a_type == ACE_ACCESS_DENIED_ACE_TYPE) {
+ haswriteperm = (acep->a_access_mask & ACE_WRITE_DATA) ? 0 : 1;
+ hasreadperm = (acep->a_access_mask & ACE_READ_DATA) ? 0 : 1;
+ } else {
+ haswriteperm = (acep->a_access_mask & ACE_WRITE_DATA) ? 1 : 0;
+ hasreadperm = (acep->a_access_mask & ACE_READ_DATA) ? 1 : 0;
+ }
+
+ acl_consume = (ACL_SYNCHRONIZE_ERR_DENY |
+ ACL_DELETE_ERR_DENY |
+ ACL_WRITE_OWNER_ERR_DENY |
+ ACL_WRITE_OWNER_ERR_ALLOW |
+ ACL_WRITE_ATTRS_OWNER_SET_ALLOW |
+ ACL_WRITE_ATTRS_OWNER_ERR_DENY |
+ ACL_WRITE_ATTRS_WRITER_SET_DENY |
+ ACL_WRITE_ATTRS_WRITER_ERR_ALLOW |
+ ACL_WRITE_NAMED_WRITER_ERR_DENY |
+ ACL_READ_NAMED_READER_ERR_DENY);
+
+ if (mask_bit == ACE_SYNCHRONIZE) {
+ set_deny = ACL_SYNCHRONIZE_SET_DENY;
+ err_deny = ACL_SYNCHRONIZE_ERR_DENY;
+ set_allow = ACL_SYNCHRONIZE_SET_ALLOW;
+ err_allow = ACL_SYNCHRONIZE_ERR_ALLOW;
+ } else if (mask_bit == ACE_WRITE_OWNER) {
+ set_deny = ACL_WRITE_OWNER_SET_DENY;
+ err_deny = ACL_WRITE_OWNER_ERR_DENY;
+ set_allow = ACL_WRITE_OWNER_SET_ALLOW;
+ err_allow = ACL_WRITE_OWNER_ERR_ALLOW;
+ } else if (mask_bit == ACE_DELETE) {
+ set_deny = ACL_DELETE_SET_DENY;
+ err_deny = ACL_DELETE_ERR_DENY;
+ set_allow = ACL_DELETE_SET_ALLOW;
+ err_allow = ACL_DELETE_ERR_ALLOW;
+ } else if (mask_bit == ACE_WRITE_ATTRIBUTES) {
+ if (isowner) {
+ set_deny = ACL_WRITE_ATTRS_OWNER_SET_DENY;
+ err_deny = ACL_WRITE_ATTRS_OWNER_ERR_DENY;
+ set_allow = ACL_WRITE_ATTRS_OWNER_SET_ALLOW;
+ err_allow = ACL_WRITE_ATTRS_OWNER_ERR_ALLOW;
+ } else if (haswriteperm) {
+ set_deny = ACL_WRITE_ATTRS_WRITER_SET_DENY;
+ err_deny = ACL_WRITE_ATTRS_WRITER_ERR_DENY;
+ set_allow = ACL_WRITE_ATTRS_WRITER_SET_ALLOW;
+ err_allow = ACL_WRITE_ATTRS_WRITER_ERR_ALLOW;
+ } else {
+ if ((acep->a_access_mask & mask_bit) &&
+ (acep->a_type & ACE_ACCESS_ALLOWED_ACE_TYPE)) {
+ return (ENOTSUP);
+ }
+ return (0);
+ }
+ } else if (mask_bit == ACE_READ_NAMED_ATTRS) {
+ if (!hasreadperm)
+ return (0);
+
+ set_deny = ACL_READ_NAMED_READER_SET_DENY;
+ err_deny = ACL_READ_NAMED_READER_ERR_DENY;
+ set_allow = ACL_READ_NAMED_READER_SET_ALLOW;
+ err_allow = ACL_READ_NAMED_READER_ERR_ALLOW;
+ } else if (mask_bit == ACE_WRITE_NAMED_ATTRS) {
+ if (!haswriteperm)
+ return (0);
+
+ set_deny = ACL_WRITE_NAMED_WRITER_SET_DENY;
+ err_deny = ACL_WRITE_NAMED_WRITER_ERR_DENY;
+ set_allow = ACL_WRITE_NAMED_WRITER_SET_ALLOW;
+ err_allow = ACL_WRITE_NAMED_WRITER_ERR_ALLOW;
+ } else {
+ return (EINVAL);
+ }
+
+ if (acep->a_type == ACE_ACCESS_DENIED_ACE_TYPE) {
+ if (acl_consume & set_deny) {
+ if (!(acep->a_access_mask & mask_bit)) {
+ return (ENOTSUP);
+ }
+ } else if (acl_consume & err_deny) {
+ if (acep->a_access_mask & mask_bit) {
+ return (ENOTSUP);
+ }
+ }
+ } else {
+ /* ACE_ACCESS_ALLOWED_ACE_TYPE */
+ if (acl_consume & set_allow) {
+ if (!(acep->a_access_mask & mask_bit)) {
+ return (ENOTSUP);
+ }
+ } else if (acl_consume & err_allow) {
+ if (acep->a_access_mask & mask_bit) {
+ return (ENOTSUP);
+ }
+ }
+ }
+ return (0);
+}
+
+static int
+ace_to_aent_legal(ace_t *acep)
+{
+ int error = 0;
+ int isowner;
+
+ /* only ALLOW or DENY */
+ if ((acep->a_type != ACE_ACCESS_ALLOWED_ACE_TYPE) &&
+ (acep->a_type != ACE_ACCESS_DENIED_ACE_TYPE)) {
+ error = ENOTSUP;
+ goto out;
+ }
+
+ /* check for invalid flags */
+ if (acep->a_flags & ~(ACE_VALID_FLAG_BITS)) {
+ error = EINVAL;
+ goto out;
+ }
+
+ /* some flags are illegal */
+ if (acep->a_flags & (ACE_SUCCESSFUL_ACCESS_ACE_FLAG |
+ ACE_FAILED_ACCESS_ACE_FLAG |
+ ACE_NO_PROPAGATE_INHERIT_ACE)) {
+ error = ENOTSUP;
+ goto out;
+ }
+
+ /* check for invalid masks */
+ if (acep->a_access_mask & ~(ACE_VALID_MASK_BITS)) {
+ error = EINVAL;
+ goto out;
+ }
+
+ if ((acep->a_flags & ACE_OWNER)) {
+ isowner = 1;
+ } else {
+ isowner = 0;
+ }
+
+ error = access_mask_check(acep, ACE_SYNCHRONIZE, isowner);
+ if (error)
+ goto out;
+
+ error = access_mask_check(acep, ACE_WRITE_OWNER, isowner);
+ if (error)
+ goto out;
+
+ error = access_mask_check(acep, ACE_DELETE, isowner);
+ if (error)
+ goto out;
+
+ error = access_mask_check(acep, ACE_WRITE_ATTRIBUTES, isowner);
+ if (error)
+ goto out;
+
+ error = access_mask_check(acep, ACE_READ_NAMED_ATTRS, isowner);
+ if (error)
+ goto out;
+
+ error = access_mask_check(acep, ACE_WRITE_NAMED_ATTRS, isowner);
+ if (error)
+ goto out;
+
+ /* more detailed checking of masks */
+ if (acep->a_type == ACE_ACCESS_ALLOWED_ACE_TYPE) {
+ if (! (acep->a_access_mask & ACE_READ_ATTRIBUTES)) {
+ error = ENOTSUP;
+ goto out;
+ }
+ if ((acep->a_access_mask & ACE_WRITE_DATA) &&
+ (! (acep->a_access_mask & ACE_APPEND_DATA))) {
+ error = ENOTSUP;
+ goto out;
+ }
+ if ((! (acep->a_access_mask & ACE_WRITE_DATA)) &&
+ (acep->a_access_mask & ACE_APPEND_DATA)) {
+ error = ENOTSUP;
+ goto out;
+ }
+ }
+
+ /* ACL enforcement */
+ if ((acep->a_access_mask & ACE_READ_ACL) &&
+ (acep->a_type != ACE_ACCESS_ALLOWED_ACE_TYPE)) {
+ error = ENOTSUP;
+ goto out;
+ }
+ if (acep->a_access_mask & ACE_WRITE_ACL) {
+ if ((acep->a_type == ACE_ACCESS_DENIED_ACE_TYPE) &&
+ (isowner)) {
+ error = ENOTSUP;
+ goto out;
+ }
+ if ((acep->a_type == ACE_ACCESS_ALLOWED_ACE_TYPE) &&
+ (! isowner)) {
+ error = ENOTSUP;
+ goto out;
+ }
+ }
+
+out:
+ return (error);
+}
+
+static int
+ace_allow_to_mode(uint32_t mask, o_mode_t *modep, boolean_t isdir)
+{
+ /* ACE_READ_ACL and ACE_READ_ATTRIBUTES must both be set */
+ if ((mask & (ACE_READ_ACL | ACE_READ_ATTRIBUTES)) !=
+ (ACE_READ_ACL | ACE_READ_ATTRIBUTES)) {
+ return (ENOTSUP);
+ }
+
+ return (ace_mask_to_mode(mask, modep, isdir));
+}
+
+static int
+acevals_to_aent(acevals_t *vals, aclent_t *dest, ace_list_t *list,
+ uid_t owner, gid_t group, boolean_t isdir)
+{
+ int error;
+ uint32_t flips = ACE_POSIX_SUPPORTED_BITS;
+
+ if (isdir)
+ flips |= ACE_DELETE_CHILD;
+ if (vals->allowed != (vals->denied ^ flips)) {
+ error = ENOTSUP;
+ goto out;
+ }
+ if ((list->hasmask) && (list->acl_mask != vals->mask) &&
+ (vals->aent_type & (USER | GROUP | GROUP_OBJ))) {
+ error = ENOTSUP;
+ goto out;
+ }
+ error = ace_allow_to_mode(vals->allowed, &dest->a_perm, isdir);
+ if (error != 0)
+ goto out;
+ dest->a_type = vals->aent_type;
+ if (dest->a_type & (USER | GROUP)) {
+ dest->a_id = vals->key;
+ } else if (dest->a_type & USER_OBJ) {
+ dest->a_id = owner;
+ } else if (dest->a_type & GROUP_OBJ) {
+ dest->a_id = group;
+ } else if (dest->a_type & OTHER_OBJ) {
+ dest->a_id = 0;
+ } else {
+ error = EINVAL;
+ goto out;
+ }
+
+out:
+ return (error);
+}
+
+
+static int
+ace_list_to_aent(ace_list_t *list, aclent_t **aclentp, int *aclcnt,
+ uid_t owner, gid_t group, boolean_t isdir)
+{
+ int error = 0;
+ aclent_t *aent, *result = NULL;
+ acevals_t *vals;
+ int resultcount;
+
+ if ((list->seen & (USER_OBJ | GROUP_OBJ | OTHER_OBJ)) !=
+ (USER_OBJ | GROUP_OBJ | OTHER_OBJ)) {
+ error = ENOTSUP;
+ goto out;
+ }
+ if ((! list->hasmask) && (list->numusers + list->numgroups > 0)) {
+ error = ENOTSUP;
+ goto out;
+ }
+
+ resultcount = 3 + list->numusers + list->numgroups;
+ /*
+ * This must be the same condition as below, when we add the CLASS_OBJ
+ * (aka ACL mask)
+ */
+ if ((list->hasmask) || (! list->dfacl_flag))
+ resultcount += 1;
+
+ if (cacl_malloc((void **)&result,
+ resultcount * sizeof (aclent_t)) != 0) {
+ error = ENOMEM;
+ goto out;
+ }
+ aent = result;
+
+ /* USER_OBJ */
+ if (!(list->user_obj.aent_type & USER_OBJ)) {
+ error = EINVAL;
+ goto out;
+ }
+
+ error = acevals_to_aent(&list->user_obj, aent, list, owner, group,
+ isdir);
+
+ if (error != 0)
+ goto out;
+ ++aent;
+ /* USER */
+ vals = NULL;
+ for (vals = avl_first(&list->user); vals != NULL;
+ vals = AVL_NEXT(&list->user, vals)) {
+ if (!(vals->aent_type & USER)) {
+ error = EINVAL;
+ goto out;
+ }
+ error = acevals_to_aent(vals, aent, list, owner, group,
+ isdir);
+ if (error != 0)
+ goto out;
+ ++aent;
+ }
+ /* GROUP_OBJ */
+ if (!(list->group_obj.aent_type & GROUP_OBJ)) {
+ error = EINVAL;
+ goto out;
+ }
+ error = acevals_to_aent(&list->group_obj, aent, list, owner, group,
+ isdir);
+ if (error != 0)
+ goto out;
+ ++aent;
+ /* GROUP */
+ vals = NULL;
+ for (vals = avl_first(&list->group); vals != NULL;
+ vals = AVL_NEXT(&list->group, vals)) {
+ if (!(vals->aent_type & GROUP)) {
+ error = EINVAL;
+ goto out;
+ }
+ error = acevals_to_aent(vals, aent, list, owner, group,
+ isdir);
+ if (error != 0)
+ goto out;
+ ++aent;
+ }
+ /*
+ * CLASS_OBJ (aka ACL_MASK)
+ *
+ * An ACL_MASK is not fabricated if the ACL is a default ACL.
+ * This is to follow UFS's behavior.
+ */
+ if ((list->hasmask) || (! list->dfacl_flag)) {
+ if (list->hasmask) {
+ uint32_t flips = ACE_POSIX_SUPPORTED_BITS;
+ if (isdir)
+ flips |= ACE_DELETE_CHILD;
+ error = ace_mask_to_mode(list->acl_mask ^ flips,
+ &aent->a_perm, isdir);
+ if (error != 0)
+ goto out;
+ } else {
+ /* fabricate the ACL_MASK from the group permissions */
+ error = ace_mask_to_mode(list->group_obj.allowed,
+ &aent->a_perm, isdir);
+ if (error != 0)
+ goto out;
+ }
+ aent->a_id = 0;
+ aent->a_type = CLASS_OBJ | list->dfacl_flag;
+ ++aent;
+ }
+ /* OTHER_OBJ */
+ if (!(list->other_obj.aent_type & OTHER_OBJ)) {
+ error = EINVAL;
+ goto out;
+ }
+ error = acevals_to_aent(&list->other_obj, aent, list, owner, group,
+ isdir);
+ if (error != 0)
+ goto out;
+ ++aent;
+
+ *aclentp = result;
+ *aclcnt = resultcount;
+
+out:
+ if (error != 0) {
+ if (result != NULL)
+ cacl_free(result, resultcount * sizeof (aclent_t));
+ }
+
+ return (error);
+}
+
+
+/*
+ * free all data associated with an ace_list
+ */
+static void
+ace_list_free(ace_list_t *al)
+{
+ acevals_t *node;
+ void *cookie;
+
+ if (al == NULL)
+ return;
+
+ cookie = NULL;
+ while ((node = avl_destroy_nodes(&al->user, &cookie)) != NULL)
+ cacl_free(node, sizeof (acevals_t));
+ cookie = NULL;
+ while ((node = avl_destroy_nodes(&al->group, &cookie)) != NULL)
+ cacl_free(node, sizeof (acevals_t));
+
+ avl_destroy(&al->user);
+ avl_destroy(&al->group);
+
+ /* free the container itself */
+ cacl_free(al, sizeof (ace_list_t));
+}
+
+static int
+acevals_compare(const void *va, const void *vb)
+{
+ const acevals_t *a = va, *b = vb;
+
+ if (a->key == b->key)
+ return (0);
+
+ if (a->key > b->key)
+ return (1);
+
+ else
+ return (-1);
+}
+
+/*
+ * Convert a list of ace_t entries to equivalent regular and default
+ * aclent_t lists. Return error (ENOTSUP) when conversion is not possible.
+ */
+static int
+ln_ace_to_aent(ace_t *ace, int n, uid_t owner, gid_t group,
+ aclent_t **aclentp, int *aclcnt, aclent_t **dfaclentp, int *dfaclcnt,
+ boolean_t isdir)
+{
+ int error = 0;
+ ace_t *acep;
+ uint32_t bits;
+ int i;
+ ace_list_t *normacl = NULL, *dfacl = NULL, *acl;
+ acevals_t *vals;
+
+ *aclentp = NULL;
+ *aclcnt = 0;
+ *dfaclentp = NULL;
+ *dfaclcnt = 0;
+
+	/*
+	 * We need at least user_obj, group_obj, and other_obj entries,
+	 * i.e. an allow and a deny ACE for each of the three, so a
+	 * minimum of six ACEs.
+	 */
+ if (n < 6) {
+ error = ENOTSUP;
+ goto out;
+ }
+ if (ace == NULL) {
+ error = EINVAL;
+ goto out;
+ }
+
+ error = cacl_malloc((void **)&normacl, sizeof (ace_list_t));
+ if (error != 0)
+ goto out;
+
+ avl_create(&normacl->user, acevals_compare, sizeof (acevals_t),
+ offsetof(acevals_t, avl));
+ avl_create(&normacl->group, acevals_compare, sizeof (acevals_t),
+ offsetof(acevals_t, avl));
+
+ ace_list_init(normacl, 0);
+
+ error = cacl_malloc((void **)&dfacl, sizeof (ace_list_t));
+ if (error != 0)
+ goto out;
+
+ avl_create(&dfacl->user, acevals_compare, sizeof (acevals_t),
+ offsetof(acevals_t, avl));
+ avl_create(&dfacl->group, acevals_compare, sizeof (acevals_t),
+ offsetof(acevals_t, avl));
+ ace_list_init(dfacl, ACL_DEFAULT);
+
+ /* process every ace_t... */
+ for (i = 0; i < n; i++) {
+ acep = &ace[i];
+
+ /* rule out certain cases quickly */
+ error = ace_to_aent_legal(acep);
+ if (error != 0)
+ goto out;
+
+ /*
+		 * Turn these bits off so that we do not have to worry
+		 * about them when checking for complements.
+ */
+ acep->a_access_mask &= ~(ACE_WRITE_OWNER | ACE_DELETE |
+ ACE_SYNCHRONIZE | ACE_WRITE_ATTRIBUTES |
+ ACE_READ_NAMED_ATTRS | ACE_WRITE_NAMED_ATTRS);
+
+ /* see if this should be a regular or default acl */
+ bits = acep->a_flags &
+ (ACE_INHERIT_ONLY_ACE |
+ ACE_FILE_INHERIT_ACE |
+ ACE_DIRECTORY_INHERIT_ACE);
+ if (bits != 0) {
+ /* all or nothing on these inherit bits */
+ if (bits != (ACE_INHERIT_ONLY_ACE |
+ ACE_FILE_INHERIT_ACE |
+ ACE_DIRECTORY_INHERIT_ACE)) {
+ error = ENOTSUP;
+ goto out;
+ }
+ acl = dfacl;
+ } else {
+ acl = normacl;
+ }
+
+ if ((acep->a_flags & ACE_OWNER)) {
+ if (acl->state > ace_user_obj) {
+ error = ENOTSUP;
+ goto out;
+ }
+ acl->state = ace_user_obj;
+ acl->seen |= USER_OBJ;
+ vals = &acl->user_obj;
+ vals->aent_type = USER_OBJ | acl->dfacl_flag;
+ } else if ((acep->a_flags & ACE_EVERYONE)) {
+ acl->state = ace_other_obj;
+ acl->seen |= OTHER_OBJ;
+ vals = &acl->other_obj;
+ vals->aent_type = OTHER_OBJ | acl->dfacl_flag;
+ } else if (acep->a_flags & ACE_IDENTIFIER_GROUP) {
+ if (acl->state > ace_group) {
+ error = ENOTSUP;
+ goto out;
+ }
+ if ((acep->a_flags & ACE_GROUP)) {
+ acl->seen |= GROUP_OBJ;
+ vals = &acl->group_obj;
+ vals->aent_type = GROUP_OBJ | acl->dfacl_flag;
+ } else {
+ acl->seen |= GROUP;
+ vals = acevals_find(acep, &acl->group,
+ &acl->numgroups);
+ if (vals == NULL) {
+ error = ENOMEM;
+ goto out;
+ }
+ vals->aent_type = GROUP | acl->dfacl_flag;
+ }
+ acl->state = ace_group;
+ } else {
+ if (acl->state > ace_user) {
+ error = ENOTSUP;
+ goto out;
+ }
+ acl->state = ace_user;
+ acl->seen |= USER;
+ vals = acevals_find(acep, &acl->user,
+ &acl->numusers);
+ if (vals == NULL) {
+ error = ENOMEM;
+ goto out;
+ }
+ vals->aent_type = USER | acl->dfacl_flag;
+ }
+
+ if (!(acl->state > ace_unused)) {
+ error = EINVAL;
+ goto out;
+ }
+
+ if (acep->a_type == ACE_ACCESS_ALLOWED_ACE_TYPE) {
+ /* no more than one allowed per aclent_t */
+ if (vals->allowed != ACE_MASK_UNDEFINED) {
+ error = ENOTSUP;
+ goto out;
+ }
+ vals->allowed = acep->a_access_mask;
+ } else {
+ /*
+ * it's a DENY; if there was a previous DENY, it
+ * must have been an ACL_MASK.
+ */
+ if (vals->denied != ACE_MASK_UNDEFINED) {
+ /* ACL_MASK is for USER and GROUP only */
+ if ((acl->state != ace_user) &&
+ (acl->state != ace_group)) {
+ error = ENOTSUP;
+ goto out;
+ }
+
+ if (! acl->hasmask) {
+ acl->hasmask = 1;
+ acl->acl_mask = vals->denied;
+ /* check for mismatched ACL_MASK emulations */
+ } else if (acl->acl_mask != vals->denied) {
+ error = ENOTSUP;
+ goto out;
+ }
+ vals->mask = vals->denied;
+ }
+ vals->denied = acep->a_access_mask;
+ }
+ }
+
+ /* done collating; produce the aclent_t lists */
+ if (normacl->state != ace_unused) {
+ error = ace_list_to_aent(normacl, aclentp, aclcnt,
+ owner, group, isdir);
+ if (error != 0) {
+ goto out;
+ }
+ }
+ if (dfacl->state != ace_unused) {
+ error = ace_list_to_aent(dfacl, dfaclentp, dfaclcnt,
+ owner, group, isdir);
+ if (error != 0) {
+ goto out;
+ }
+ }
+
+out:
+ if (normacl != NULL)
+ ace_list_free(normacl);
+ if (dfacl != NULL)
+ ace_list_free(dfacl);
+
+ return (error);
+}
+
+static int
+convert_ace_to_aent(ace_t *acebufp, int acecnt, boolean_t isdir,
+ uid_t owner, gid_t group, aclent_t **retaclentp, int *retaclcnt)
+{
+ int error = 0;
+ aclent_t *aclentp, *dfaclentp;
+ int aclcnt, dfaclcnt;
+ int aclsz, dfaclsz;
+
+ error = ln_ace_to_aent(acebufp, acecnt, owner, group,
+ &aclentp, &aclcnt, &dfaclentp, &dfaclcnt, isdir);
+
+ if (error)
+ return (error);
+
+
+ if (dfaclcnt != 0) {
+ /*
+ * Slap aclentp and dfaclentp into a single array.
+ */
+ aclsz = sizeof (aclent_t) * aclcnt;
+ dfaclsz = sizeof (aclent_t) * dfaclcnt;
+ aclentp = cacl_realloc(aclentp, aclsz, aclsz + dfaclsz);
+ if (aclentp != NULL) {
+ (void) memcpy(aclentp + aclcnt, dfaclentp, dfaclsz);
+ } else {
+ error = ENOMEM;
+ }
+ }
+
+ if (aclentp) {
+ *retaclentp = aclentp;
+ *retaclcnt = aclcnt + dfaclcnt;
+ }
+
+ if (dfaclentp)
+ cacl_free(dfaclentp, dfaclsz);
+
+ return (error);
+}
+
+
+int
+acl_translate(acl_t *aclp, int target_flavor, boolean_t isdir, uid_t owner,
+ gid_t group)
+{
+ int aclcnt;
+ void *acldata;
+ int error;
+
+ /*
+ * See if we need to translate
+ */
+ if ((target_flavor == _ACL_ACE_ENABLED && aclp->acl_type == ACE_T) ||
+ (target_flavor == _ACL_ACLENT_ENABLED &&
+ aclp->acl_type == ACLENT_T))
+ return (0);
+
+ if (target_flavor == -1) {
+ error = EINVAL;
+ goto out;
+ }
+
+ if (target_flavor == _ACL_ACE_ENABLED &&
+ aclp->acl_type == ACLENT_T) {
+ error = convert_aent_to_ace(aclp->acl_aclp,
+ aclp->acl_cnt, isdir, (ace_t **)&acldata, &aclcnt);
+ if (error)
+ goto out;
+
+ } else if (target_flavor == _ACL_ACLENT_ENABLED &&
+ aclp->acl_type == ACE_T) {
+ error = convert_ace_to_aent(aclp->acl_aclp, aclp->acl_cnt,
+ isdir, owner, group, (aclent_t **)&acldata, &aclcnt);
+ if (error)
+ goto out;
+ } else {
+ error = ENOTSUP;
+ goto out;
+ }
+
+ /*
+ * replace old acl with newly translated acl
+ */
+ cacl_free(aclp->acl_aclp, aclp->acl_cnt * aclp->acl_entry_size);
+ aclp->acl_aclp = acldata;
+ aclp->acl_cnt = aclcnt;
+ if (target_flavor == _ACL_ACE_ENABLED) {
+ aclp->acl_type = ACE_T;
+ aclp->acl_entry_size = sizeof (ace_t);
+ } else {
+ aclp->acl_type = ACLENT_T;
+ aclp->acl_entry_size = sizeof (aclent_t);
+ }
+ return (0);
+
+out:
+
+#if !defined(_KERNEL)
+ errno = error;
+ return (-1);
+#else
+ return (error);
+#endif
+}
+#endif /* !_KERNEL */
+
+#define SET_ACE(acl, index, who, mask, type, flags) { \
+ acl[0][index].a_who = (uint32_t)who; \
+ acl[0][index].a_type = type; \
+ acl[0][index].a_flags = flags; \
+ acl[0][index++].a_access_mask = mask; \
+}
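+
+/*
+ * Note: "acl" is an ace_t **, so acl[0][index] is (*acl)[index]; the
+ * final statement also post-increments the caller's index variable.
+ */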
+
+void
+acl_trivial_access_masks(mode_t mode, boolean_t isdir, trivial_acl_t *masks)
+{
+ uint32_t read_mask = ACE_READ_DATA;
+ uint32_t write_mask = ACE_WRITE_DATA|ACE_APPEND_DATA;
+ uint32_t execute_mask = ACE_EXECUTE;
+
+ (void) isdir; /* will need this later */
+
+ masks->deny1 = 0;
+ if (!(mode & S_IRUSR) && (mode & (S_IRGRP|S_IROTH)))
+ masks->deny1 |= read_mask;
+ if (!(mode & S_IWUSR) && (mode & (S_IWGRP|S_IWOTH)))
+ masks->deny1 |= write_mask;
+ if (!(mode & S_IXUSR) && (mode & (S_IXGRP|S_IXOTH)))
+ masks->deny1 |= execute_mask;
+
+ masks->deny2 = 0;
+ if (!(mode & S_IRGRP) && (mode & S_IROTH))
+ masks->deny2 |= read_mask;
+ if (!(mode & S_IWGRP) && (mode & S_IWOTH))
+ masks->deny2 |= write_mask;
+ if (!(mode & S_IXGRP) && (mode & S_IXOTH))
+ masks->deny2 |= execute_mask;
+
+ masks->allow0 = 0;
+ if ((mode & S_IRUSR) && (!(mode & S_IRGRP) && (mode & S_IROTH)))
+ masks->allow0 |= read_mask;
+ if ((mode & S_IWUSR) && (!(mode & S_IWGRP) && (mode & S_IWOTH)))
+ masks->allow0 |= write_mask;
+ if ((mode & S_IXUSR) && (!(mode & S_IXGRP) && (mode & S_IXOTH)))
+ masks->allow0 |= execute_mask;
+
+ masks->owner = ACE_WRITE_ATTRIBUTES|ACE_WRITE_OWNER|ACE_WRITE_ACL|
+ ACE_WRITE_NAMED_ATTRS|ACE_READ_ACL|ACE_READ_ATTRIBUTES|
+ ACE_READ_NAMED_ATTRS|ACE_SYNCHRONIZE;
+ if (mode & S_IRUSR)
+ masks->owner |= read_mask;
+ if (mode & S_IWUSR)
+ masks->owner |= write_mask;
+ if (mode & S_IXUSR)
+ masks->owner |= execute_mask;
+
+ masks->group = ACE_READ_ACL|ACE_READ_ATTRIBUTES|ACE_READ_NAMED_ATTRS|
+ ACE_SYNCHRONIZE;
+ if (mode & S_IRGRP)
+ masks->group |= read_mask;
+ if (mode & S_IWGRP)
+ masks->group |= write_mask;
+ if (mode & S_IXGRP)
+ masks->group |= execute_mask;
+
+ masks->everyone = ACE_READ_ACL|ACE_READ_ATTRIBUTES|ACE_READ_NAMED_ATTRS|
+ ACE_SYNCHRONIZE;
+ if (mode & S_IROTH)
+ masks->everyone |= read_mask;
+ if (mode & S_IWOTH)
+ masks->everyone |= write_mask;
+ if (mode & S_IXOTH)
+ masks->everyone |= execute_mask;
+}
+
+int
+acl_trivial_create(mode_t mode, boolean_t isdir, ace_t **acl, int *count)
+{
+ int index = 0;
+ int error;
+ trivial_acl_t masks;
+
+ *count = 3;
+ acl_trivial_access_masks(mode, isdir, &masks);
+
+ if (masks.allow0)
+ (*count)++;
+ if (masks.deny1)
+ (*count)++;
+ if (masks.deny2)
+ (*count)++;
+
+ if ((error = cacl_malloc((void **)acl, *count * sizeof (ace_t))) != 0)
+ return (error);
+
+ if (masks.allow0) {
+ SET_ACE(acl, index, -1, masks.allow0,
+ ACE_ACCESS_ALLOWED_ACE_TYPE, ACE_OWNER);
+ }
+ if (masks.deny1) {
+ SET_ACE(acl, index, -1, masks.deny1,
+ ACE_ACCESS_DENIED_ACE_TYPE, ACE_OWNER);
+ }
+ if (masks.deny2) {
+ SET_ACE(acl, index, -1, masks.deny2,
+ ACE_ACCESS_DENIED_ACE_TYPE, ACE_GROUP|ACE_IDENTIFIER_GROUP);
+ }
+
+ SET_ACE(acl, index, -1, masks.owner, ACE_ACCESS_ALLOWED_ACE_TYPE,
+ ACE_OWNER);
+ SET_ACE(acl, index, -1, masks.group, ACE_ACCESS_ALLOWED_ACE_TYPE,
+ ACE_IDENTIFIER_GROUP|ACE_GROUP);
+ SET_ACE(acl, index, -1, masks.everyone, ACE_ACCESS_ALLOWED_ACE_TYPE,
+ ACE_EVERYONE);
+
+ return (0);
+}
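+
+/*
+ * Worked example: for mode 0604 (rw- --- r--) the group lacks a read
+ * bit that everyone@ has, so acl_trivial_access_masks() sets
+ * masks.allow0 = ACE_READ_DATA (an owner@ allow placed ahead of the
+ * deny) and masks.deny2 = ACE_READ_DATA (a group@ deny), giving
+ * *count = 5 entries in this order:
+ *
+ *	owner@ ALLOW read, group@ DENY read,
+ *	owner@ ALLOW ..., group@ ALLOW ..., everyone@ ALLOW ...
+ */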
+
+/*
+ * ace_trivial:
+ * determine whether an ace_t acl is trivial
+ *
+ * Trivialness implies that the acl is composed of only
+ * owner, group, everyone entries. ACL can't
+ * have read_acl denied, and write_owner/write_acl/write_attributes
+ * can only be owner@ entry.
+ */
+int
+ace_trivial_common(void *acep, int aclcnt,
+ uint64_t (*walk)(void *, uint64_t, int aclcnt,
+ uint16_t *, uint16_t *, uint32_t *))
+{
+ uint16_t flags;
+ uint32_t mask;
+ uint16_t type;
+ uint64_t cookie = 0;
+
+	while ((cookie = walk(acep, cookie, aclcnt, &flags, &type, &mask)) != 0) {
+ switch (flags & ACE_TYPE_FLAGS) {
+ case ACE_OWNER:
+ case ACE_GROUP|ACE_IDENTIFIER_GROUP:
+ case ACE_EVERYONE:
+ break;
+ default:
+ return (1);
+
+ }
+
+ if (flags & (ACE_FILE_INHERIT_ACE|
+ ACE_DIRECTORY_INHERIT_ACE|ACE_NO_PROPAGATE_INHERIT_ACE|
+ ACE_INHERIT_ONLY_ACE))
+ return (1);
+
+ /*
+ * Special check for some special bits
+ *
+ * Don't allow anybody to deny reading basic
+		 * attributes or a file's ACL.
+ */
+ if ((mask & (ACE_READ_ACL|ACE_READ_ATTRIBUTES)) &&
+ (type == ACE_ACCESS_DENIED_ACE_TYPE))
+ return (1);
+
+ /*
+ * Delete permissions are never set by default
+ */
+ if (mask & (ACE_DELETE|ACE_DELETE_CHILD))
+ return (1);
+ /*
+		 * Only allow owner@ to have
+		 * write_acl/write_owner/write_attributes/write_xattr.
+ */
+ if (type == ACE_ACCESS_ALLOWED_ACE_TYPE &&
+ (!(flags & ACE_OWNER) && (mask &
+		    (ACE_WRITE_OWNER|ACE_WRITE_ACL|ACE_WRITE_ATTRIBUTES|
+ ACE_WRITE_NAMED_ATTRS))))
+ return (1);
+
+ }
+ return (0);
+}
+
+uint64_t
+ace_walk(void *datap, uint64_t cookie, int aclcnt, uint16_t *flags,
+ uint16_t *type, uint32_t *mask)
+{
+ ace_t *acep = datap;
+
+	if (cookie >= (uint64_t)aclcnt)
+ return (0);
+
+ *flags = acep[cookie].a_flags;
+ *type = acep[cookie].a_type;
+ *mask = acep[cookie++].a_access_mask;
+
+ return (cookie);
+}
+
+int
+ace_trivial(ace_t *acep, int aclcnt)
+{
+ return (ace_trivial_common(acep, aclcnt, ace_walk));
+}
diff --git a/sys/cddl/contrib/opensolaris/common/acl/acl_common.h b/sys/cddl/contrib/opensolaris/common/acl/acl_common.h
new file mode 100644
index 000000000000..acf1f5da89d6
--- /dev/null
+++ b/sys/cddl/contrib/opensolaris/common/acl/acl_common.h
@@ -0,0 +1,69 @@
+/*
+ * CDDL HEADER START
+ *
+ * The contents of this file are subject to the terms of the
+ * Common Development and Distribution License (the "License").
+ * You may not use this file except in compliance with the License.
+ *
+ * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
+ * or http://www.opensolaris.org/os/licensing.
+ * See the License for the specific language governing permissions
+ * and limitations under the License.
+ *
+ * When distributing Covered Code, include this CDDL HEADER in each
+ * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
+ * If applicable, add the following below this CDDL HEADER, with the
+ * fields enclosed by brackets "[]" replaced with your own identifying
+ * information: Portions Copyright [yyyy] [name of copyright owner]
+ *
+ * CDDL HEADER END
+ */
+/*
+ * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved.
+ * Copyright 2011 Nexenta Systems, Inc. All rights reserved.
+ */
+
+#ifndef _ACL_COMMON_H
+#define _ACL_COMMON_H
+
+#include <sys/types.h>
+#include <sys/acl.h>
+#include <sys/stat.h>
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+typedef struct trivial_acl {
+ uint32_t allow0; /* allow mask for bits only in owner */
+ uint32_t deny1; /* deny mask for bits not in owner */
+ uint32_t deny2; /* deny mask for bits not in group */
+ uint32_t owner; /* allow mask matching mode */
+ uint32_t group; /* allow mask matching mode */
+ uint32_t everyone; /* allow mask matching mode */
+} trivial_acl_t;
+
+extern int acltrivial(const char *);
+extern void adjust_ace_pair(ace_t *pair, mode_t mode);
+extern void adjust_ace_pair_common(void *, size_t, size_t, mode_t);
+extern int ace_trivial(ace_t *acep, int aclcnt);
+extern int ace_trivial_common(void *, int,
+ uint64_t (*walk)(void *, uint64_t, int aclcnt, uint16_t *, uint16_t *,
+ uint32_t *mask));
+#if !defined(_KERNEL)
+extern acl_t *acl_alloc(acl_type_t);
+extern void acl_free(acl_t *aclp);
+extern int acl_translate(acl_t *aclp, int target_flavor, boolean_t isdir,
+ uid_t owner, gid_t group);
+#endif /* !_KERNEL */
+void ksort(caddr_t v, int n, int s, int (*f)());
+int cmp2acls(void *a, void *b);
+int acl_trivial_create(mode_t mode, boolean_t isdir, ace_t **acl, int *count);
+void acl_trivial_access_masks(mode_t mode, boolean_t isdir,
+ trivial_acl_t *masks);
+
+#ifdef __cplusplus
+}
+#endif
+
+#endif /* _ACL_COMMON_H */
diff --git a/sys/cddl/contrib/opensolaris/common/atomic/i386/opensolaris_atomic.S b/sys/cddl/contrib/opensolaris/common/atomic/i386/opensolaris_atomic.S
new file mode 100644
index 000000000000..bc21e85878df
--- /dev/null
+++ b/sys/cddl/contrib/opensolaris/common/atomic/i386/opensolaris_atomic.S
@@ -0,0 +1,133 @@
+/*
+ * CDDL HEADER START
+ *
+ * The contents of this file are subject to the terms of the
+ * Common Development and Distribution License (the "License").
+ * You may not use this file except in compliance with the License.
+ *
+ * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
+ * or http://www.opensolaris.org/os/licensing.
+ * See the License for the specific language governing permissions
+ * and limitations under the License.
+ *
+ * When distributing Covered Code, include this CDDL HEADER in each
+ * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
+ * If applicable, add the following below this CDDL HEADER, with the
+ * fields enclosed by brackets "[]" replaced with your own identifying
+ * information: Portions Copyright [yyyy] [name of copyright owner]
+ *
+ * CDDL HEADER END
+ */
+
+/*
+ * Copyright 2010 Sun Microsystems, Inc. All rights reserved.
+ * Use is subject to license terms.
+ */
+
+ .file "atomic.s"
+
+#define _ASM
+#include <sys/asm_linkage.h>
+
+ /*
+ * NOTE: If atomic_dec_64 and atomic_dec_64_nv are ever
+ * separated, it is important to edit the libc i386 platform
+ * specific mapfile and remove the NODYNSORT attribute
+ * from atomic_dec_64_nv.
+ */
+ ENTRY(atomic_dec_64)
+ ALTENTRY(atomic_dec_64_nv)
+ pushl %edi
+ pushl %ebx
+ movl 12(%esp), %edi // %edi = target address
+ movl (%edi), %eax
+ movl 4(%edi), %edx // %edx:%eax = old value
+1:
+ xorl %ebx, %ebx
+ xorl %ecx, %ecx
+ not %ecx
+ not %ebx // %ecx:%ebx = -1
+ addl %eax, %ebx
+	adcl	%edx, %ecx	// add in the carry from the low-order add
+ lock
+ cmpxchg8b (%edi) // try to stick it in
+ jne 1b
+ movl %ebx, %eax
+ movl %ecx, %edx // return new value
+ popl %ebx
+ popl %edi
+ ret
+ SET_SIZE(atomic_dec_64_nv)
+ SET_SIZE(atomic_dec_64)
+
+ /*
+ * NOTE: If atomic_add_64 and atomic_add_64_nv are ever
+ * separated, it is important to edit the libc i386 platform
+ * specific mapfile and remove the NODYNSORT attribute
+ * from atomic_add_64_nv.
+ */
+ ENTRY(atomic_add_64)
+ ALTENTRY(atomic_add_64_nv)
+ pushl %edi
+ pushl %ebx
+ movl 12(%esp), %edi // %edi = target address
+ movl (%edi), %eax
+ movl 4(%edi), %edx // %edx:%eax = old value
+1:
+ movl 16(%esp), %ebx
+ movl 20(%esp), %ecx // %ecx:%ebx = delta
+ addl %eax, %ebx
+ adcl %edx, %ecx // %ecx:%ebx = new value
+ lock
+ cmpxchg8b (%edi) // try to stick it in
+ jne 1b
+ movl %ebx, %eax
+ movl %ecx, %edx // return new value
+ popl %ebx
+ popl %edi
+ ret
+ SET_SIZE(atomic_add_64_nv)
+ SET_SIZE(atomic_add_64)
+
+ ENTRY(atomic_cas_64)
+ pushl %ebx
+ pushl %esi
+ movl 12(%esp), %esi
+ movl 16(%esp), %eax
+ movl 20(%esp), %edx
+ movl 24(%esp), %ebx
+ movl 28(%esp), %ecx
+ lock
+ cmpxchg8b (%esi)
+ popl %esi
+ popl %ebx
+ ret
+ SET_SIZE(atomic_cas_64)
+
+ ENTRY(atomic_swap_64)
+ pushl %esi
+ pushl %ebx
+ movl 12(%esp), %esi
+ movl 16(%esp), %ebx
+ movl 20(%esp), %ecx
+ movl (%esi), %eax
+ movl 4(%esi), %edx // %edx:%eax = old value
+1:
+ lock
+ cmpxchg8b (%esi)
+ jne 1b
+ popl %ebx
+ popl %esi
+ ret
+ SET_SIZE(atomic_swap_64)
+
+ ENTRY(atomic_load_64)
+ pushl %esi
+ movl 8(%esp), %esi
+ movl %ebx, %eax // make old and new values equal, so that
+ movl %ecx, %edx // destination is never changed
+ lock
+ cmpxchg8b (%esi)
+ popl %esi
+ ret
+ SET_SIZE(atomic_load_64)
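+
+	/*
+	 * Note on atomic_load_64: cmpxchg8b always leaves the current
+	 * 64-bit value in %edx:%eax. On a mismatch it loads that value
+	 * from memory; on a match it stores back the very value already
+	 * there. Either way the destination is unchanged and %edx:%eax
+	 * holds an atomic snapshot, so the locked compare-exchange
+	 * doubles as an atomic 64-bit read.
+	 */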
diff --git a/sys/cddl/contrib/opensolaris/common/avl/avl.c b/sys/cddl/contrib/opensolaris/common/avl/avl.c
new file mode 100644
index 000000000000..2349aba2bf3e
--- /dev/null
+++ b/sys/cddl/contrib/opensolaris/common/avl/avl.c
@@ -0,0 +1,1063 @@
+/*
+ * CDDL HEADER START
+ *
+ * The contents of this file are subject to the terms of the
+ * Common Development and Distribution License (the "License").
+ * You may not use this file except in compliance with the License.
+ *
+ * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
+ * or http://www.opensolaris.org/os/licensing.
+ * See the License for the specific language governing permissions
+ * and limitations under the License.
+ *
+ * When distributing Covered Code, include this CDDL HEADER in each
+ * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
+ * If applicable, add the following below this CDDL HEADER, with the
+ * fields enclosed by brackets "[]" replaced with your own identifying
+ * information: Portions Copyright [yyyy] [name of copyright owner]
+ *
+ * CDDL HEADER END
+ */
+/*
+ * Copyright 2009 Sun Microsystems, Inc. All rights reserved.
+ * Use is subject to license terms.
+ */
+
+/*
+ * Copyright (c) 2014 by Delphix. All rights reserved.
+ * Copyright 2015 Nexenta Systems, Inc. All rights reserved.
+ */
+
+/*
+ * AVL - generic AVL tree implementation for kernel use
+ *
+ * A complete description of AVL trees can be found in many CS textbooks.
+ *
+ * Here is a very brief overview. An AVL tree is a binary search tree that is
+ * almost perfectly balanced. By "almost" perfectly balanced, we mean that at
+ * any given node, the left and right subtrees are allowed to differ in height
+ * by at most 1 level.
+ *
+ * This relaxation from a perfectly balanced binary tree allows doing
+ * insertion and deletion relatively efficiently. Searching the tree is
+ * still a fast operation, roughly O(log(N)).
+ *
+ * The key to insertion and deletion is a set of tree manipulations called
+ * rotations, which bring unbalanced subtrees back into the semi-balanced state.
+ *
+ * This implementation of AVL trees has the following peculiarities:
+ *
+ * - The AVL specific data structures are physically embedded as fields
+ * in the "using" data structures. To maintain generality the code
+ * must constantly translate between "avl_node_t *" and containing
+ * data structure "void *"s by adding/subtracting the avl_offset.
+ *
+ * - Since the AVL data is always embedded in other structures, there is
+ * no locking or memory allocation in the AVL routines. This must be
+ * provided for by the enclosing data structure's semantics. Typically,
+ * avl_insert()/_add()/_remove()/avl_insert_here() require some kind of
+ * exclusive write lock. Other operations require a read lock.
+ *
+ * - The implementation uses iteration instead of explicit recursion,
+ * since it is intended to run on limited size kernel stacks. Since
+ * there is no recursion stack present to move "up" in the tree,
+ * there is an explicit "parent" link in the avl_node_t.
+ *
+ * - The left/right children pointers of a node are in an array.
+ * In the code, variables (instead of constants) are used to represent
+ * left and right indices. The implementation is written as if it only
+ * dealt with left handed manipulations. By changing the value assigned
+ * to "left", the code also works for right handed trees. The
+ * following variables/terms are frequently used:
+ *
+ * int left; // 0 when dealing with left children,
+ * // 1 for dealing with right children
+ *
+ * int left_heavy; // -1 when left subtree is taller at some node,
+ * // +1 when right subtree is taller
+ *
+ * int right; // will be the opposite of left (0 or 1)
+ * int right_heavy;// will be the opposite of left_heavy (-1 or 1)
+ *
+ * int direction; // 0 for "<" (ie. left child); 1 for ">" (right)
+ *
+ * Though it is a little more confusing to read the code, the approach
+ * allows using half as much code (and hence cache footprint) for tree
+ * manipulations and eliminates many conditional branches.
+ *
+ * - The avl_index_t is an opaque "cookie" used to find nodes at or
+ * adjacent to where a new value would be inserted in the tree. The value
+ * is a modified "avl_node_t *". The bottom bit (normally 0 for a
+ * pointer) is set to indicate that the new node has a value greater
+ * than the value of the indicated "avl_node_t *".
+ *
+ * Note - in addition to userland (e.g. libavl and libutil) and the kernel
+ * (e.g. genunix), avl.c is compiled into ld.so and kmdb's genunix module,
+ * which each have their own compilation environments and subsequent
+ * requirements. Each of these environments must be considered when adding
+ * dependencies from avl.c.
+ */
+
+#include <sys/types.h>
+#include <sys/param.h>
+#include <sys/stdint.h>
+#include <sys/debug.h>
+#include <sys/avl.h>
+
+/*
+ * Small arrays to translate between balance (or diff) values and child indices.
+ *
+ * Code that deals with binary tree data structures will randomly use
+ * left and right children when examining a tree. C "if()" statements
+ * which evaluate randomly suffer from very poor hardware branch prediction.
+ * In this code we avoid some of the branch mispredictions by using the
+ * following translation arrays. They replace random branches with an
+ * additional memory reference. Since the translation arrays are both very
+ * small the data should remain efficiently in cache.
+ */
+static const int avl_child2balance[2] = {-1, 1};
+static const int avl_balance2child[] = {0, 0, 1};
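+
+/*
+ * avl_child2balance[] maps a child index (0 = left, 1 = right) to the
+ * balance contribution of that side; avl_balance2child[] goes the other
+ * way, indexed by (balance + 1), so -1 maps to the left child (0) and
+ * +1 to the right child (1); e.g. avl_find() descends with
+ * child = avl_balance2child[1 + diff].
+ */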
+
+
+/*
+ * Walk from one node to the previous valued node (i.e. an infix walk
+ * towards the left). At any given node we do one of two things:
+ *
+ * - If there is a left child, go to it, then to its rightmost descendant.
+ *
+ * - Otherwise, return through parent nodes until we've come from a right
+ *   child.
+ *
+ * Return Value:
+ * NULL - if at the end of the nodes
+ * otherwise next node
+ */
+void *
+avl_walk(avl_tree_t *tree, void *oldnode, int left)
+{
+ size_t off = tree->avl_offset;
+ avl_node_t *node = AVL_DATA2NODE(oldnode, off);
+ int right = 1 - left;
+ int was_child;
+
+
+ /*
+ * nowhere to walk to if tree is empty
+ */
+ if (node == NULL)
+ return (NULL);
+
+ /*
+ * Visit the previous valued node. There are two possibilities:
+ *
+ * If this node has a left child, go down one left, then all
+ * the way right.
+ */
+ if (node->avl_child[left] != NULL) {
+ for (node = node->avl_child[left];
+ node->avl_child[right] != NULL;
+ node = node->avl_child[right])
+ ;
+ /*
+		 * Otherwise, climb back up through parents until we
+		 * come from a "right" child.
+ */
+ } else {
+ for (;;) {
+ was_child = AVL_XCHILD(node);
+ node = AVL_XPARENT(node);
+ if (node == NULL)
+ return (NULL);
+ if (was_child == right)
+ break;
+ }
+ }
+
+ return (AVL_NODE2DATA(node, off));
+}
+
+/*
+ * Return the lowest valued node in a tree or NULL.
+ * (leftmost child from root of tree)
+ */
+void *
+avl_first(avl_tree_t *tree)
+{
+ avl_node_t *node;
+ avl_node_t *prev = NULL;
+ size_t off = tree->avl_offset;
+
+ for (node = tree->avl_root; node != NULL; node = node->avl_child[0])
+ prev = node;
+
+ if (prev != NULL)
+ return (AVL_NODE2DATA(prev, off));
+ return (NULL);
+}
+
+/*
+ * Return the highest valued node in a tree or NULL.
+ * (rightmost child from root of tree)
+ */
+void *
+avl_last(avl_tree_t *tree)
+{
+ avl_node_t *node;
+ avl_node_t *prev = NULL;
+ size_t off = tree->avl_offset;
+
+ for (node = tree->avl_root; node != NULL; node = node->avl_child[1])
+ prev = node;
+
+ if (prev != NULL)
+ return (AVL_NODE2DATA(prev, off));
+ return (NULL);
+}
+
+/*
+ * Access the node immediately before or after an insertion point.
+ *
+ * "avl_index_t" is a (avl_node_t *) with the bottom bit indicating a child
+ *
+ * Return value:
+ * NULL: no node in the given direction
+ * "void *" of the found tree node
+ */
+void *
+avl_nearest(avl_tree_t *tree, avl_index_t where, int direction)
+{
+ int child = AVL_INDEX2CHILD(where);
+ avl_node_t *node = AVL_INDEX2NODE(where);
+ void *data;
+ size_t off = tree->avl_offset;
+
+ if (node == NULL) {
+ ASSERT(tree->avl_root == NULL);
+ return (NULL);
+ }
+ data = AVL_NODE2DATA(node, off);
+ if (child != direction)
+ return (data);
+
+ return (avl_walk(tree, data, direction));
+}
+
+
+/*
+ * Search for the node which contains "value". The algorithm is a
+ * simple binary tree search.
+ *
+ * return value:
+ * NULL: the value is not in the AVL tree
+ * *where (if not NULL) is set to indicate the insertion point
+ * "void *" of the found tree node
+ */
+void *
+avl_find(avl_tree_t *tree, const void *value, avl_index_t *where)
+{
+ avl_node_t *node;
+ avl_node_t *prev = NULL;
+ int child = 0;
+ int diff;
+ size_t off = tree->avl_offset;
+
+ for (node = tree->avl_root; node != NULL;
+ node = node->avl_child[child]) {
+
+ prev = node;
+
+ diff = tree->avl_compar(value, AVL_NODE2DATA(node, off));
+ ASSERT(-1 <= diff && diff <= 1);
+ if (diff == 0) {
+#ifdef DEBUG
+ if (where != NULL)
+ *where = 0;
+#endif
+ return (AVL_NODE2DATA(node, off));
+ }
+ child = avl_balance2child[1 + diff];
+
+ }
+
+ if (where != NULL)
+ *where = AVL_MKINDEX(prev, child);
+
+ return (NULL);
+}
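+
+/*
+ * Typical find-or-insert usage of the "where" cookie (a sketch; the
+ * key object only needs the fields examined by avl_compar):
+ *
+ *	avl_index_t where;
+ *
+ *	if (avl_find(&tree, &key_obj, &where) == NULL)
+ *		avl_insert(&tree, new_obj, where);
+ */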
+
+
+/*
+ * Perform a rotation to restore balance at the subtree given by depth.
+ *
+ * This routine is used by both insertion and deletion. The return value
+ * indicates:
+ * 0 : subtree did not change height
+ * !0 : subtree was reduced in height
+ *
+ * The code is written as if handling left rotations, right rotations are
+ * symmetric and handled by swapping values of variables right/left[_heavy]
+ *
+ * On input balance is the "new" balance at "node". This value is either
+ * -2 or +2.
+ */
+static int
+avl_rotation(avl_tree_t *tree, avl_node_t *node, int balance)
+{
+ int left = !(balance < 0); /* when balance = -2, left will be 0 */
+ int right = 1 - left;
+ int left_heavy = balance >> 1;
+ int right_heavy = -left_heavy;
+ avl_node_t *parent = AVL_XPARENT(node);
+ avl_node_t *child = node->avl_child[left];
+ avl_node_t *cright;
+ avl_node_t *gchild;
+ avl_node_t *gright;
+ avl_node_t *gleft;
+ int which_child = AVL_XCHILD(node);
+ int child_bal = AVL_XBALANCE(child);
+
+ /* BEGIN CSTYLED */
+ /*
+ * case 1 : node is overly left heavy, the left child is balanced or
+ * also left heavy. This requires the following rotation.
+ *
+ * (node bal:-2)
+ * / \
+ * / \
+ * (child bal:0 or -1)
+ * / \
+ * / \
+ * cright
+ *
+ * becomes:
+ *
+ * (child bal:1 or 0)
+ * / \
+ * / \
+ * (node bal:-1 or 0)
+ * / \
+ * / \
+ * cright
+ *
+ * we detect this situation by noting that child's balance is not
+ * right_heavy.
+ */
+ /* END CSTYLED */
+ if (child_bal != right_heavy) {
+
+ /*
+ * compute new balance of nodes
+ *
+ * If child used to be left heavy (now balanced) we reduced
+ * the height of this sub-tree -- used in "return...;" below
+ */
+ child_bal += right_heavy; /* adjust towards right */
+
+ /*
+ * move "cright" to be node's left child
+ */
+ cright = child->avl_child[right];
+ node->avl_child[left] = cright;
+ if (cright != NULL) {
+ AVL_SETPARENT(cright, node);
+ AVL_SETCHILD(cright, left);
+ }
+
+ /*
+ * move node to be child's right child
+ */
+ child->avl_child[right] = node;
+ AVL_SETBALANCE(node, -child_bal);
+ AVL_SETCHILD(node, right);
+ AVL_SETPARENT(node, child);
+
+ /*
+ * update the pointer into this subtree
+ */
+ AVL_SETBALANCE(child, child_bal);
+ AVL_SETCHILD(child, which_child);
+ AVL_SETPARENT(child, parent);
+ if (parent != NULL)
+ parent->avl_child[which_child] = child;
+ else
+ tree->avl_root = child;
+
+ return (child_bal == 0);
+ }
+
+ /* BEGIN CSTYLED */
+ /*
+ * case 2 : When node is left heavy, but child is right heavy we use
+ * a different rotation.
+ *
+ * (node b:-2)
+ * / \
+ * / \
+ * / \
+ * (child b:+1)
+ * / \
+ * / \
+ * (gchild b: != 0)
+ * / \
+ * / \
+ * gleft gright
+ *
+ * becomes:
+ *
+ * (gchild b:0)
+ * / \
+ * / \
+ * / \
+ * (child b:?) (node b:?)
+ * / \ / \
+ * / \ / \
+ * gleft gright
+ *
+ * computing the new balances is more complicated. As an example:
+ * if gchild was right_heavy, then child is now left heavy
+ * else it is balanced
+ */
+ /* END CSTYLED */
+ gchild = child->avl_child[right];
+ gleft = gchild->avl_child[left];
+ gright = gchild->avl_child[right];
+
+ /*
+ * move gright to left child of node and
+ *
+ * move gleft to right child of node
+ */
+ node->avl_child[left] = gright;
+ if (gright != NULL) {
+ AVL_SETPARENT(gright, node);
+ AVL_SETCHILD(gright, left);
+ }
+
+ child->avl_child[right] = gleft;
+ if (gleft != NULL) {
+ AVL_SETPARENT(gleft, child);
+ AVL_SETCHILD(gleft, right);
+ }
+
+ /*
+ * move child to left child of gchild and
+ *
+ * move node to right child of gchild and
+ *
+ * fixup parent of all this to point to gchild
+ */
+ balance = AVL_XBALANCE(gchild);
+ gchild->avl_child[left] = child;
+ AVL_SETBALANCE(child, (balance == right_heavy ? left_heavy : 0));
+ AVL_SETPARENT(child, gchild);
+ AVL_SETCHILD(child, left);
+
+ gchild->avl_child[right] = node;
+ AVL_SETBALANCE(node, (balance == left_heavy ? right_heavy : 0));
+ AVL_SETPARENT(node, gchild);
+ AVL_SETCHILD(node, right);
+
+ AVL_SETBALANCE(gchild, 0);
+ AVL_SETPARENT(gchild, parent);
+ AVL_SETCHILD(gchild, which_child);
+ if (parent != NULL)
+ parent->avl_child[which_child] = gchild;
+ else
+ tree->avl_root = gchild;
+
+ return (1); /* the new tree is always shorter */
+}
+
+
+/*
+ * Insert a new node into an AVL tree at the specified (from avl_find()) place.
+ *
+ * Newly inserted nodes are always leaf nodes in the tree, since avl_find()
+ * searches out to the leaf positions. The avl_index_t indicates the node
+ * which will be the parent of the new node.
+ *
+ * After the node is inserted, a single rotation further up the tree may
+ * be necessary to maintain an acceptable AVL balance.
+ */
+void
+avl_insert(avl_tree_t *tree, void *new_data, avl_index_t where)
+{
+ avl_node_t *node;
+ avl_node_t *parent = AVL_INDEX2NODE(where);
+ int old_balance;
+ int new_balance;
+ int which_child = AVL_INDEX2CHILD(where);
+ size_t off = tree->avl_offset;
+
+ ASSERT(tree);
+#ifdef _LP64
+ ASSERT(((uintptr_t)new_data & 0x7) == 0);
+#endif
+
+ node = AVL_DATA2NODE(new_data, off);
+
+ /*
+ * First, add the node to the tree at the indicated position.
+ */
+ ++tree->avl_numnodes;
+
+ node->avl_child[0] = NULL;
+ node->avl_child[1] = NULL;
+
+ AVL_SETCHILD(node, which_child);
+ AVL_SETBALANCE(node, 0);
+ AVL_SETPARENT(node, parent);
+ if (parent != NULL) {
+ ASSERT(parent->avl_child[which_child] == NULL);
+ parent->avl_child[which_child] = node;
+ } else {
+ ASSERT(tree->avl_root == NULL);
+ tree->avl_root = node;
+ }
+ /*
+ * Now, back up the tree modifying the balance of all nodes above the
+ * insertion point. If we get to a highly unbalanced ancestor, we
+ * need to do a rotation. If we back out of the tree we are done.
+ * If we brought any subtree into perfect balance (0), we are also done.
+ */
+ for (;;) {
+ node = parent;
+ if (node == NULL)
+ return;
+
+ /*
+ * Compute the new balance
+ */
+ old_balance = AVL_XBALANCE(node);
+ new_balance = old_balance + avl_child2balance[which_child];
+
+ /*
+ * If we introduced equal balance, then we are done immediately
+ */
+ if (new_balance == 0) {
+ AVL_SETBALANCE(node, 0);
+ return;
+ }
+
+ /*
+ * If both old and new are not zero we went
+ * from -1 to -2 balance, do a rotation.
+ */
+ if (old_balance != 0)
+ break;
+
+ AVL_SETBALANCE(node, new_balance);
+ parent = AVL_XPARENT(node);
+ which_child = AVL_XCHILD(node);
+ }
+
+ /*
+ * perform a rotation to fix the tree and return
+ */
+ (void) avl_rotation(tree, node, new_balance);
+}
+
+/*
+ * Insert "new_data" in "tree" in the given "direction" either after or
+ * before (AVL_AFTER, AVL_BEFORE) the data "here".
+ *
+ * Insertions can only be done at empty leaf points in the tree, therefore
+ * if the given child of the node is already present we move to either
+ * the AVL_PREV or AVL_NEXT and reverse the insertion direction. The
+ * neighboring node in that direction is guaranteed to have a free
+ * child slot on the opposite side, so this always works.
+ *
+ * To help developers using this interface, we assert that the new node
+ * is correctly ordered at every step of the way in DEBUG kernels.
+ */
+void
+avl_insert_here(
+ avl_tree_t *tree,
+ void *new_data,
+ void *here,
+ int direction)
+{
+ avl_node_t *node;
+ int child = direction; /* rely on AVL_BEFORE == 0, AVL_AFTER == 1 */
+#ifdef DEBUG
+ int diff;
+#endif
+
+ ASSERT(tree != NULL);
+ ASSERT(new_data != NULL);
+ ASSERT(here != NULL);
+ ASSERT(direction == AVL_BEFORE || direction == AVL_AFTER);
+
+ /*
+ * If corresponding child of node is not NULL, go to the neighboring
+ * node and reverse the insertion direction.
+ */
+ node = AVL_DATA2NODE(here, tree->avl_offset);
+
+#ifdef DEBUG
+ diff = tree->avl_compar(new_data, here);
+ ASSERT(-1 <= diff && diff <= 1);
+ ASSERT(diff != 0);
+ ASSERT(diff > 0 ? child == 1 : child == 0);
+#endif
+
+ if (node->avl_child[child] != NULL) {
+ node = node->avl_child[child];
+ child = 1 - child;
+ while (node->avl_child[child] != NULL) {
+#ifdef DEBUG
+ diff = tree->avl_compar(new_data,
+ AVL_NODE2DATA(node, tree->avl_offset));
+ ASSERT(-1 <= diff && diff <= 1);
+ ASSERT(diff != 0);
+ ASSERT(diff > 0 ? child == 1 : child == 0);
+#endif
+ node = node->avl_child[child];
+ }
+#ifdef DEBUG
+ diff = tree->avl_compar(new_data,
+ AVL_NODE2DATA(node, tree->avl_offset));
+ ASSERT(-1 <= diff && diff <= 1);
+ ASSERT(diff != 0);
+ ASSERT(diff > 0 ? child == 1 : child == 0);
+#endif
+ }
+ ASSERT(node->avl_child[child] == NULL);
+
+ avl_insert(tree, new_data, AVL_MKINDEX(node, child));
+}
+
+/*
+ * Add a new node to an AVL tree.
+ */
+void
+avl_add(avl_tree_t *tree, void *new_node)
+{
+ avl_index_t where;
+
+ /*
+ * This is unfortunate. We want to call panic() here, even for
+ * non-DEBUG kernels. In userland, however, we can't depend on anything
+ * in libc or else the rtld build process gets confused.
+ * Thankfully, rtld provides us with its own assfail() so we can use
+ * that here. We use assfail() directly to get a nice error message
+ * in the core - much like what panic() does for crashdumps.
+ */
+ if (avl_find(tree, new_node, &where) != NULL)
+#ifdef _KERNEL
+ panic("avl_find() succeeded inside avl_add()");
+#else
+ (void) assfail("avl_find() succeeded inside avl_add()",
+ __FILE__, __LINE__);
+#endif
+ avl_insert(tree, new_node, where);
+}
+
+/*
+ * Delete a node from the AVL tree. Deletion is similar to insertion, but
+ * with 2 complications.
+ *
+ * First, we may be deleting an interior node. Consider the following subtree:
+ *
+ * d c c
+ * / \ / \ / \
+ * b e b e b e
+ * / \ / \ /
+ * a c a a
+ *
+ * When we are deleting node (d), we find and bring up an adjacent valued leaf
+ * node, say (c), to take the interior node's place. In the code this is
+ * handled by temporarily swapping (d) and (c) in the tree and then using
+ * common code to delete (d) from the leaf position.
+ *
+ * Secondly, an interior deletion from a deep tree may require more than one
+ * rotation to fix the balance. This is handled by moving up the tree through
+ * parents and applying rotations as needed. The return value from
+ * avl_rotation() is used to detect when a subtree did not change overall
+ * height due to a rotation.
+ */
+void
+avl_remove(avl_tree_t *tree, void *data)
+{
+ avl_node_t *delete;
+ avl_node_t *parent;
+ avl_node_t *node;
+ avl_node_t tmp;
+ int old_balance;
+ int new_balance;
+ int left;
+ int right;
+ int which_child;
+ size_t off = tree->avl_offset;
+
+ ASSERT(tree);
+
+ delete = AVL_DATA2NODE(data, off);
+
+ /*
+ * Deletion is easiest with a node that has at most 1 child.
+ * We swap a node with 2 children with a sequentially valued
+ * neighbor node. That node will have at most 1 child. Note this
+ * has no effect on the ordering of the remaining nodes.
+ *
+ * As an optimization, we choose the greater neighbor if the tree
+ * is right heavy, otherwise the left neighbor. This reduces the
+ * number of rotations needed.
+ */
+ if (delete->avl_child[0] != NULL && delete->avl_child[1] != NULL) {
+
+ /*
+ * choose node to swap from whichever side is taller
+ */
+ old_balance = AVL_XBALANCE(delete);
+ left = avl_balance2child[old_balance + 1];
+ right = 1 - left;
+
+ /*
+ * get to the previous value'd node
+ * (down 1 left, as far as possible right)
+ */
+ for (node = delete->avl_child[left];
+ node->avl_child[right] != NULL;
+ node = node->avl_child[right])
+ ;
+
+ /*
+ * create a temp placeholder for 'node'
+ * move 'node' to delete's spot in the tree
+ */
+ tmp = *node;
+
+ *node = *delete;
+ if (node->avl_child[left] == node)
+ node->avl_child[left] = &tmp;
+
+ parent = AVL_XPARENT(node);
+ if (parent != NULL)
+ parent->avl_child[AVL_XCHILD(node)] = node;
+ else
+ tree->avl_root = node;
+ AVL_SETPARENT(node->avl_child[left], node);
+ AVL_SETPARENT(node->avl_child[right], node);
+
+ /*
+ * Put tmp where node used to be (just temporary).
+ * It always has a parent and at most 1 child.
+ */
+ delete = &tmp;
+ parent = AVL_XPARENT(delete);
+ parent->avl_child[AVL_XCHILD(delete)] = delete;
+		which_child = (delete->avl_child[1] != NULL);
+ if (delete->avl_child[which_child] != NULL)
+ AVL_SETPARENT(delete->avl_child[which_child], delete);
+ }
+
+
+ /*
+ * Here we know "delete" has at most one child, so it can be easily
+ * removed from the tree.
+ */
+ ASSERT(tree->avl_numnodes > 0);
+ --tree->avl_numnodes;
+ parent = AVL_XPARENT(delete);
+ which_child = AVL_XCHILD(delete);
+ if (delete->avl_child[0] != NULL)
+ node = delete->avl_child[0];
+ else
+ node = delete->avl_child[1];
+
+ /*
+ * Connect parent directly to node (leaving out delete).
+ */
+ if (node != NULL) {
+ AVL_SETPARENT(node, parent);
+ AVL_SETCHILD(node, which_child);
+ }
+ if (parent == NULL) {
+ tree->avl_root = node;
+ return;
+ }
+ parent->avl_child[which_child] = node;
+
+
+ /*
+ * Since the subtree is now shorter, begin adjusting parent balances
+ * and performing any needed rotations.
+ */
+ do {
+
+ /*
+ * Move up the tree and adjust the balance
+ *
+ * Capture the parent and which_child values for the next
+ * iteration before any rotations occur.
+ */
+ node = parent;
+ old_balance = AVL_XBALANCE(node);
+ new_balance = old_balance - avl_child2balance[which_child];
+ parent = AVL_XPARENT(node);
+ which_child = AVL_XCHILD(node);
+
+ /*
+ * If a node was in perfect balance but isn't anymore then
+ * we can stop, since the height didn't change above this point
+ * due to a deletion.
+ */
+ if (old_balance == 0) {
+ AVL_SETBALANCE(node, new_balance);
+ break;
+ }
+
+ /*
+ * If the new balance is zero, we don't need to rotate;
+ * otherwise a rotation is needed to fix the balance.
+ * If the rotation doesn't change the height of the
+ * sub-tree, we have finished adjusting.
+ */
+ if (new_balance == 0)
+ AVL_SETBALANCE(node, new_balance);
+ else if (!avl_rotation(tree, node, new_balance))
+ break;
+ } while (parent != NULL);
+}
+
+#define AVL_REINSERT(tree, obj) \
+ avl_remove((tree), (obj)); \
+ avl_add((tree), (obj))
+
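+/*
+ * Note: AVL_REINSERT expands to two statements, so it is not safe as
+ * the body of an unbraced if or else; the callers below use it only
+ * inside braced blocks.
+ */
+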
+boolean_t
+avl_update_lt(avl_tree_t *t, void *obj)
+{
+ void *neighbor;
+
+ ASSERT(((neighbor = AVL_NEXT(t, obj)) == NULL) ||
+ (t->avl_compar(obj, neighbor) <= 0));
+
+ neighbor = AVL_PREV(t, obj);
+ if ((neighbor != NULL) && (t->avl_compar(obj, neighbor) < 0)) {
+ AVL_REINSERT(t, obj);
+ return (B_TRUE);
+ }
+
+ return (B_FALSE);
+}
+
+boolean_t
+avl_update_gt(avl_tree_t *t, void *obj)
+{
+ void *neighbor;
+
+ ASSERT(((neighbor = AVL_PREV(t, obj)) == NULL) ||
+ (t->avl_compar(obj, neighbor) >= 0));
+
+ neighbor = AVL_NEXT(t, obj);
+ if ((neighbor != NULL) && (t->avl_compar(obj, neighbor) > 0)) {
+ AVL_REINSERT(t, obj);
+ return (B_TRUE);
+ }
+
+ return (B_FALSE);
+}
+
+boolean_t
+avl_update(avl_tree_t *t, void *obj)
+{
+ void *neighbor;
+
+ neighbor = AVL_PREV(t, obj);
+ if ((neighbor != NULL) && (t->avl_compar(obj, neighbor) < 0)) {
+ AVL_REINSERT(t, obj);
+ return (B_TRUE);
+ }
+
+ neighbor = AVL_NEXT(t, obj);
+ if ((neighbor != NULL) && (t->avl_compar(obj, neighbor) > 0)) {
+ AVL_REINSERT(t, obj);
+ return (B_TRUE);
+ }
+
+ return (B_FALSE);
+}
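+
+/*
+ * Usage sketch for the update functions above (names are illustrative,
+ * not part of this interface): after changing an object's key in
+ * place, a single call restores ordering:
+ *
+ * obj->key = new_key;
+ * (void) avl_update(tree, obj);
+ *
+ * avl_update_lt() and avl_update_gt() are cheaper variants for when
+ * the caller knows the key could only have decreased or increased.
+ */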
+
+void
+avl_swap(avl_tree_t *tree1, avl_tree_t *tree2)
+{
+ avl_node_t *temp_node;
+ ulong_t temp_numnodes;
+
+ ASSERT3P(tree1->avl_compar, ==, tree2->avl_compar);
+ ASSERT3U(tree1->avl_offset, ==, tree2->avl_offset);
+ ASSERT3U(tree1->avl_size, ==, tree2->avl_size);
+
+ temp_node = tree1->avl_root;
+ temp_numnodes = tree1->avl_numnodes;
+ tree1->avl_root = tree2->avl_root;
+ tree1->avl_numnodes = tree2->avl_numnodes;
+ tree2->avl_root = temp_node;
+ tree2->avl_numnodes = temp_numnodes;
+}
+
+/*
+ * initialize a new AVL tree
+ */
+void
+avl_create(avl_tree_t *tree, int (*compar) (const void *, const void *),
+ size_t size, size_t offset)
+{
+ ASSERT(tree);
+ ASSERT(compar);
+ ASSERT(size > 0);
+ ASSERT(size >= offset + sizeof (avl_node_t));
+#ifdef _LP64
+ ASSERT((offset & 0x7) == 0);
+#endif
+
+ tree->avl_compar = compar;
+ tree->avl_root = NULL;
+ tree->avl_numnodes = 0;
+ tree->avl_size = size;
+ tree->avl_offset = offset;
+}
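+
+/*
+ * Usage sketch (hypothetical names; assumes an avl_node_t embedded in
+ * the caller's structure, as the ASSERTs above require):
+ *
+ * typedef struct my_data {
+ * uint64_t md_key;
+ * avl_node_t md_node;
+ * } my_data_t;
+ *
+ * static int
+ * my_compar(const void *a, const void *b)
+ * {
+ * const my_data_t *l = a, *r = b;
+ *
+ * if (l->md_key < r->md_key)
+ * return (-1);
+ * return (l->md_key > r->md_key);
+ * }
+ *
+ * avl_create(&tree, my_compar, sizeof (my_data_t),
+ * offsetof(my_data_t, md_node));
+ */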
+
+/*
+ * Delete a tree.
+ */
+/* ARGSUSED */
+void
+avl_destroy(avl_tree_t *tree)
+{
+ ASSERT(tree);
+ ASSERT(tree->avl_numnodes == 0);
+ ASSERT(tree->avl_root == NULL);
+}
+
+
+/*
+ * Return the number of nodes in an AVL tree.
+ */
+ulong_t
+avl_numnodes(avl_tree_t *tree)
+{
+ ASSERT(tree);
+ return (tree->avl_numnodes);
+}
+
+boolean_t
+avl_is_empty(avl_tree_t *tree)
+{
+ ASSERT(tree);
+ return (tree->avl_numnodes == 0);
+}
+
+#define CHILDBIT (1L)
+
+/*
+ * Post-order tree walk used to visit all tree nodes and destroy the tree
+ * in post order. This is used for destroying a tree without paying any cost
+ * for rebalancing it.
+ *
+ * example:
+ *
+ * void *cookie = NULL;
+ * my_data_t *node;
+ *
+ * while ((node = avl_destroy_nodes(tree, &cookie)) != NULL)
+ * free(node);
+ * avl_destroy(tree);
+ *
+ * The cookie is really a pointer to the current node's parent, combined
+ * (in its low bit) with an indication of which child was visited last.
+ *
+ * On input, a cookie value of CHILDBIT indicates the tree is done.
+ */
+void *
+avl_destroy_nodes(avl_tree_t *tree, void **cookie)
+{
+ avl_node_t *node;
+ avl_node_t *parent;
+ int child;
+ void *first;
+ size_t off = tree->avl_offset;
+
+ /*
+ * Initial calls go to the first node or its right descendant.
+ */
+ if (*cookie == NULL) {
+ first = avl_first(tree);
+
+ /*
+ * deal with an empty tree
+ */
+ if (first == NULL) {
+ *cookie = (void *)CHILDBIT;
+ return (NULL);
+ }
+
+ node = AVL_DATA2NODE(first, off);
+ parent = AVL_XPARENT(node);
+ goto check_right_side;
+ }
+
+ /*
+ * If there is no parent to return to we are done.
+ */
+ parent = (avl_node_t *)((uintptr_t)(*cookie) & ~CHILDBIT);
+ if (parent == NULL) {
+ if (tree->avl_root != NULL) {
+ ASSERT(tree->avl_numnodes == 1);
+ tree->avl_root = NULL;
+ tree->avl_numnodes = 0;
+ }
+ return (NULL);
+ }
+
+ /*
+ * Remove the child pointer we just visited from the parent and tree.
+ */
+ child = (uintptr_t)(*cookie) & CHILDBIT;
+ parent->avl_child[child] = NULL;
+ ASSERT(tree->avl_numnodes > 1);
+ --tree->avl_numnodes;
+
+ /*
+ * If we just did a right child or there isn't one, go up to parent.
+ */
+ if (child == 1 || parent->avl_child[1] == NULL) {
+ node = parent;
+ parent = AVL_XPARENT(parent);
+ goto done;
+ }
+
+ /*
+ * Do parent's right child, then leftmost descendant.
+ */
+ node = parent->avl_child[1];
+ while (node->avl_child[0] != NULL) {
+ parent = node;
+ node = node->avl_child[0];
+ }
+
+ /*
+ * If here, we moved to a left child. It may have one
+ * child on the right (when balance == +1).
+ */
+check_right_side:
+ if (node->avl_child[1] != NULL) {
+ ASSERT(AVL_XBALANCE(node) == 1);
+ parent = node;
+ node = node->avl_child[1];
+ ASSERT(node->avl_child[0] == NULL &&
+ node->avl_child[1] == NULL);
+ } else {
+ ASSERT(AVL_XBALANCE(node) <= 0);
+ }
+
+done:
+ if (parent == NULL) {
+ *cookie = (void *)CHILDBIT;
+ ASSERT(node == tree->avl_root);
+ } else {
+ *cookie = (void *)((uintptr_t)parent | AVL_XCHILD(node));
+ }
+
+ return (AVL_NODE2DATA(node, off));
+}
diff --git a/sys/cddl/contrib/opensolaris/common/lz4/lz4.c b/sys/cddl/contrib/opensolaris/common/lz4/lz4.c
new file mode 100644
index 000000000000..c21d51350c2c
--- /dev/null
+++ b/sys/cddl/contrib/opensolaris/common/lz4/lz4.c
@@ -0,0 +1,1043 @@
+/*
+ * LZ4 - Fast LZ compression algorithm
+ * Header File
+ * Copyright (C) 2011-2013, Yann Collet.
+ * BSD 2-Clause License (http://www.opensource.org/licenses/bsd-license.php)
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are
+ * met:
+ *
+ * * Redistributions of source code must retain the above copyright
+ * notice, this list of conditions and the following disclaimer.
+ * * Redistributions in binary form must reproduce the above
+ * copyright notice, this list of conditions and the following disclaimer
+ * in the documentation and/or other materials provided with the
+ * distribution.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+ * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+ * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+ * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+ * OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+ * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+ * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+ * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+ * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ *
+ * You can contact the author at :
+ * - LZ4 homepage : http://fastcompression.blogspot.com/p/lz4.html
+ * - LZ4 source repository : http://code.google.com/p/lz4/
+ */
+/*
+ * Copyright (c) 2016 by Delphix. All rights reserved.
+ */
+
+#if defined(_KERNEL) || defined(_FAKE_KERNEL)
+#include <sys/zfs_context.h>
+#elif defined(_STANDALONE)
+#include <sys/cdefs.h>
+#include <stand.h>
+#include <sys/types.h>
+#include <sys/endian.h>
+#include <assert.h>
+
+#define ASSERT assert
+#else
+#include <string.h>
+#include <stdlib.h>
+#include <sys/types.h>
+#include <netinet/in.h>
+#include <assert.h>
+
+#define ASSERT assert
+#endif
+#include <lz4.h>
+
+static int real_LZ4_compress(const char *source, char *dest, int isize,
+ int osize);
+static int LZ4_uncompress_unknownOutputSize(const char *source, char *dest,
+ int isize, int maxOutputSize);
+static int LZ4_compressCtx(void *ctx, const char *source, char *dest,
+ int isize, int osize);
+static int LZ4_compress64kCtx(void *ctx, const char *source, char *dest,
+ int isize, int osize);
+
+#if defined(_KERNEL) || defined(_FAKE_KERNEL)
+static kmem_cache_t *lz4_ctx_cache;
+#endif
+
+size_t
+lz4_compress(void *s_start, void *d_start, size_t s_len, size_t d_len,
+ int n __unused)
+{
+ uint32_t bufsiz;
+ char *dest = d_start;
+
+ ASSERT(d_len >= sizeof (bufsiz));
+
+ bufsiz = real_LZ4_compress(s_start, &dest[sizeof (bufsiz)], s_len,
+ d_len - sizeof (bufsiz));
+
+ /* Signal an error if the compression routine returned zero. */
+ if (bufsiz == 0)
+ return (s_len);
+
+ /*
+ * Encode the compressed buffer size at the start. We'll need this in
+ * decompression to counter the effects of padding which might be
+ * added to the compressed buffer and which, if unhandled, would
+ * confuse the hell out of our decompression function.
+ */
+#if defined(_KERNEL) || defined(_FAKE_KERNEL)
+ *(uint32_t *)(void *)dest = BE_32(bufsiz);
+#else
+ *(uint32_t *)(void *)dest = htonl(bufsiz);
+#endif
+
+ return (bufsiz + sizeof (bufsiz));
+}
+
+int
+lz4_decompress(void *s_start, void *d_start, size_t s_len, size_t d_len,
+ int n __unused)
+{
+ const char *src = s_start;
+#if defined(_KERNEL) || defined(_FAKE_KERNEL)
+ uint32_t bufsiz = BE_IN32(s_start);
+#else
+ uint32_t bufsiz = ntohl(*(uint32_t *)s_start);
+#endif
+
+ /* invalid compressed buffer size encoded at start */
+ if (bufsiz + sizeof (bufsiz) > s_len)
+ return (1);
+
+ /*
+ * Returns 0 on success (decompression function returned non-negative)
+ * and non-zero on failure (decompression function returned negative).
+ */
+ return (LZ4_uncompress_unknownOutputSize(&src[sizeof (bufsiz)],
+ d_start, bufsiz, d_len) < 0);
+}
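+
+/*
+ * Usage sketch (buffer names and helpers are hypothetical): a return
+ * value of s_len from lz4_compress() signals that the data could not
+ * be usefully compressed and should be stored uncompressed:
+ *
+ * size_t clen = lz4_compress(src, dst, slen, dlen, 0);
+ * if (clen == slen)
+ * store_raw(src, slen);
+ * else if (lz4_decompress(dst, out, clen, slen, 0) != 0)
+ * handle_corrupt_stream();
+ */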
+
+/*
+ * LZ4 API Description:
+ *
+ * Simple Functions:
+ * real_LZ4_compress() :
+ * isize : is the input size. Max supported value is ~1.9GB
+ * return : the number of bytes written in buffer dest
+ * or 0 if the compression fails (if LZ4_COMPRESSMIN is set).
+ * note : destination buffer must be already allocated.
+ * destination buffer must be sized to handle worst cases
+ * situations (input data not compressible).
+ *
+ * Advanced Functions
+ *
+ * LZ4_uncompress_unknownOutputSize() :
+ * isize : is the input size, therefore the compressed size
+ * maxOutputSize : is the size of the destination buffer (which must be
+ * already allocated)
+ * return : the number of bytes decoded in the destination buffer
+ * (necessarily <= maxOutputSize). If the source stream is
+ * malformed, the function will stop decoding and return a
+ * negative result, indicating the byte position of the faulty
+ * instruction. This function never writes beyond dest +
+ * maxOutputSize, and is therefore protected against malicious
+ * data packets.
+ * note : Destination buffer must be already allocated.
+ *
+ * LZ4_compressCtx() :
+ * This function explicitly handles the CTX memory structure.
+ *
+ * ILLUMOS CHANGES: the CTX memory structure must be explicitly allocated
+ * by the caller (either on the stack or using kmem_zalloc). Passing NULL
+ * isn't valid.
+ *
+ * LZ4_compress64kCtx() :
+ * Same as LZ4_compressCtx(), but specific to small inputs (<64KB).
+ * isize *Must* be <64KB, otherwise the output will be corrupted.
+ *
+ * ILLUMOS CHANGES: the CTX memory structure must be explicitly allocated
+ * by the caller (either on the stack or using kmem_zalloc). Passing NULL
+ * isn't valid.
+ */
+
+/*
+ * Tuning parameters
+ */
+
+/*
+ * COMPRESSIONLEVEL: Increasing this value improves compression ratio
+ * Lowering this value reduces memory usage. Reduced memory usage
+ * typically improves speed, due to cache effect (ex: L1 32KB for Intel,
+ * L1 64KB for AMD). Memory usage formula : N->2^(N+2) Bytes
+ * (examples : 12 -> 16KB ; 17 -> 512KB)
+ */
+#define COMPRESSIONLEVEL 12
+
+/*
+ * NOTCOMPRESSIBLE_CONFIRMATION: Decreasing this value will make the
+ * algorithm skip faster data segments considered "incompressible".
+ * This may decrease compression ratio dramatically, but will be
+ * faster on incompressible data. Increasing this value will make
+ * the algorithm search more before declaring a segment "incompressible".
+ * This could improve compression a bit, but will be slower on
+ * incompressible data. The default value (6) is recommended.
+ */
+#define NOTCOMPRESSIBLE_CONFIRMATION 6
+
+/*
+ * BIG_ENDIAN_NATIVE_BUT_INCOMPATIBLE: This will provide a boost to
+ * performance for big-endian CPUs, but the resulting compressed stream
+ * will be incompatible with little-endian CPUs. You can set this option
+ * to 1 in situations where data will stay within a closed environment.
+ * This option is useless on little-endian CPUs (such as x86).
+ */
+/* #define BIG_ENDIAN_NATIVE_BUT_INCOMPATIBLE 1 */
+
+/*
+ * CPU Feature Detection
+ */
+
+/* 32 or 64 bits ? */
+#if (defined(__x86_64__) || defined(__x86_64) || defined(__amd64__) || \
+ defined(__amd64) || defined(__ppc64__) || defined(_WIN64) || \
+ defined(__LP64__) || defined(_LP64))
+#define LZ4_ARCH64 1
+#else
+#define LZ4_ARCH64 0
+#endif
+
+/*
+ * Limits the amount of stack space that the algorithm may consume to hold
+ * the compression lookup table. The value `9' here means we'll never use
+ * more than 2k of stack (see above for a description of COMPRESSIONLEVEL).
+ * If more memory is needed, it is allocated from the heap.
+ */
+/* FreeBSD: Use heap for all platforms for now */
+#define STACKLIMIT 0
+
+/*
+ * Little Endian or Big Endian?
+ * Note: overwrite the below #define if you know your architecture endianness.
+ */
+#if BYTE_ORDER == BIG_ENDIAN
+#define LZ4_BIG_ENDIAN 1
+#else
+/*
+ * Little endian assumed. PDP endian and other very rare endian formats
+ * are unsupported.
+ */
+#endif
+
+/*
+ * Unaligned memory access is automatically enabled for "common" CPUs,
+ * such as x86. For other CPUs, the compiler will be more cautious and
+ * insert extra code to ensure aligned access is respected. If you know
+ * your target CPU supports unaligned memory access, you may want to
+ * force this option manually to improve performance.
+ */
+#if defined(__ARM_FEATURE_UNALIGNED)
+#define LZ4_FORCE_UNALIGNED_ACCESS 1
+#endif
+
+/*
+ * Compiler Options
+ */
+#if __STDC_VERSION__ >= 199901L /* C99 */
+/* "restrict" is a known keyword */
+#else
+/* Disable restrict */
+#define restrict
+#endif
+
+#define lz4_bswap16(x) ((unsigned short int) ((((x) >> 8) & 0xffu) | \
+ (((x) & 0xffu) << 8)))
+
+#define expect(expr, value) (__builtin_expect((expr), (value)))
+
+#if defined(likely)
+#undef likely
+#endif
+#if defined(unlikely)
+#undef unlikely
+#endif
+
+#ifndef likely
+#define likely(expr) expect((expr) != 0, 1)
+#endif
+
+#ifndef unlikely
+#define unlikely(expr) expect((expr) != 0, 0)
+#endif
+
+/* Basic types */
+#define BYTE uint8_t
+#define U16 uint16_t
+#define U32 uint32_t
+#define S32 int32_t
+#define U64 uint64_t
+
+#ifndef LZ4_FORCE_UNALIGNED_ACCESS
+#pragma pack(1)
+#endif
+
+typedef struct _U16_S {
+ U16 v;
+} U16_S;
+typedef struct _U32_S {
+ U32 v;
+} U32_S;
+typedef struct _U64_S {
+ U64 v;
+} U64_S;
+
+#ifndef LZ4_FORCE_UNALIGNED_ACCESS
+#pragma pack()
+#endif
+
+#define A64(x) (((U64_S *)(__DECONST(void *, x)))->v)
+#define A32(x) (((U32_S *)(__DECONST(void *, x)))->v)
+#define A16(x) (((U16_S *)(__DECONST(void *, x)))->v)
+
+/*
+ * Constants
+ */
+#define MINMATCH 4
+
+#define HASH_LOG COMPRESSIONLEVEL
+#define HASHTABLESIZE (1 << HASH_LOG)
+#define HASH_MASK (HASHTABLESIZE - 1)
+
+#define SKIPSTRENGTH (NOTCOMPRESSIBLE_CONFIRMATION > 2 ? \
+ NOTCOMPRESSIBLE_CONFIRMATION : 2)
+
+/*
+ * Defines whether the hash table is allocated on the stack (as a local
+ * variable) or on the heap (via kmem_alloc()). With STACKLIMIT set to 0
+ * above, HEAPMODE is always 1 here, so the table always comes from the
+ * heap.
+ */
+#define HEAPMODE (HASH_LOG > STACKLIMIT)
+#define COPYLENGTH 8
+#define LASTLITERALS 5
+#define MFLIMIT (COPYLENGTH + MINMATCH)
+#define MINLENGTH (MFLIMIT + 1)
+
+#define MAXD_LOG 16
+#define MAX_DISTANCE ((1 << MAXD_LOG) - 1)
+
+#define ML_BITS 4
+#define ML_MASK ((1U<<ML_BITS)-1)
+#define RUN_BITS (8-ML_BITS)
+#define RUN_MASK ((1U<<RUN_BITS)-1)
+
+
+/*
+ * Architecture-specific macros
+ */
+#if LZ4_ARCH64
+#define STEPSIZE 8
+#define UARCH U64
+#define AARCH A64
+#define LZ4_COPYSTEP(s, d) A64(d) = A64(s); d += 8; s += 8;
+#define LZ4_COPYPACKET(s, d) LZ4_COPYSTEP(s, d)
+#define LZ4_SECURECOPY(s, d, e) if (d < e) LZ4_WILDCOPY(s, d, e)
+#define HTYPE U32
+#define INITBASE(base) const BYTE* const base = ip
+#else /* !LZ4_ARCH64 */
+#define STEPSIZE 4
+#define UARCH U32
+#define AARCH A32
+#define LZ4_COPYSTEP(s, d) A32(d) = A32(s); d += 4; s += 4;
+#define LZ4_COPYPACKET(s, d) LZ4_COPYSTEP(s, d); LZ4_COPYSTEP(s, d);
+#define LZ4_SECURECOPY LZ4_WILDCOPY
+#define HTYPE const BYTE *
+#define INITBASE(base) const int base = 0
+#endif /* !LZ4_ARCH64 */
+
+#if (defined(LZ4_BIG_ENDIAN) && !defined(BIG_ENDIAN_NATIVE_BUT_INCOMPATIBLE))
+#define LZ4_READ_LITTLEENDIAN_16(d, s, p) \
+ { U16 v = A16(p); v = lz4_bswap16(v); d = (s) - v; }
+#define LZ4_WRITE_LITTLEENDIAN_16(p, i) \
+ { U16 v = (U16)(i); v = lz4_bswap16(v); A16(p) = v; p += 2; }
+#else
+#define LZ4_READ_LITTLEENDIAN_16(d, s, p) { d = (s) - A16(p); }
+#define LZ4_WRITE_LITTLEENDIAN_16(p, v) { A16(p) = v; p += 2; }
+#endif
+
+
+/* Local structures */
+struct refTables {
+ HTYPE hashTable[HASHTABLESIZE];
+};
+
+
+/* Macros */
+#define LZ4_HASH_FUNCTION(i) (((i) * 2654435761U) >> ((MINMATCH * 8) - \
+ HASH_LOG))
+#define LZ4_HASH_VALUE(p) LZ4_HASH_FUNCTION(A32(p))
+#define LZ4_WILDCOPY(s, d, e) do { LZ4_COPYPACKET(s, d) } while (d < e);
+#define LZ4_BLINDCOPY(s, d, l) { BYTE* e = (d) + l; LZ4_WILDCOPY(s, d, e); \
+ d = e; }
+
+
+/* Private functions */
+#if LZ4_ARCH64
+
+static inline int
+LZ4_NbCommonBytes(register U64 val)
+{
+#if defined(LZ4_BIG_ENDIAN)
+#if !defined(LZ4_FORCE_SW_BITCOUNT)
+ return (__builtin_clzll(val) >> 3);
+#else
+ int r;
+ if (!(val >> 32)) {
+ r = 4;
+ } else {
+ r = 0;
+ val >>= 32;
+ }
+ if (!(val >> 16)) {
+ r += 2;
+ val >>= 8;
+ } else {
+ val >>= 24;
+ }
+ r += (!val);
+ return (r);
+#endif
+#else
+#if !defined(LZ4_FORCE_SW_BITCOUNT)
+ return (__builtin_ctzll(val) >> 3);
+#else
+ static const int DeBruijnBytePos[64] =
+ { 0, 0, 0, 0, 0, 1, 1, 2, 0, 3, 1, 3, 1, 4, 2, 7, 0, 2, 3, 6, 1, 5,
+ 3, 5, 1, 3, 4, 4, 2, 5, 6, 7, 7, 0, 1, 2, 3, 3, 4, 6, 2, 6, 5,
+ 5, 3, 4, 5, 6, 7, 1, 2, 4, 6, 4,
+ 4, 5, 7, 2, 6, 5, 7, 6, 7, 7
+ };
+ return DeBruijnBytePos[((U64) ((val & -val) * 0x0218A392CDABBD3F)) >>
+ 58];
+#endif
+#endif
+}
+
+#else
+
+static inline int
+LZ4_NbCommonBytes(register U32 val)
+{
+#if defined(LZ4_BIG_ENDIAN)
+#if !defined(LZ4_FORCE_SW_BITCOUNT)
+ return (__builtin_clz(val) >> 3);
+#else
+ int r;
+ if (!(val >> 16)) {
+ r = 2;
+ val >>= 8;
+ } else {
+ r = 0;
+ val >>= 24;
+ }
+ r += (!val);
+ return (r);
+#endif
+#else
+#if !defined(LZ4_FORCE_SW_BITCOUNT)
+ return (__builtin_ctz(val) >> 3);
+#else
+ static const int DeBruijnBytePos[32] = {
+ 0, 0, 3, 0, 3, 1, 3, 0,
+ 3, 2, 2, 1, 3, 2, 0, 1,
+ 3, 3, 1, 2, 2, 2, 2, 0,
+ 3, 1, 2, 0, 1, 0, 1, 1
+ };
+ return DeBruijnBytePos[((U32) ((val & -(S32) val) * 0x077CB531U)) >>
+ 27];
+#endif
+#endif
+}
+
+#endif
+
+/* Compression functions */
+
+/*ARGSUSED*/
+static int
+LZ4_compressCtx(void *ctx, const char *source, char *dest, int isize,
+ int osize)
+{
+#if HEAPMODE
+ struct refTables *srt = (struct refTables *)ctx;
+ HTYPE *HashTable = (HTYPE *) (srt->hashTable);
+#else
+ HTYPE HashTable[HASHTABLESIZE] = { 0 };
+#endif
+
+ const BYTE *ip = (const BYTE *) source;
+ INITBASE(base);
+ const BYTE *anchor = ip;
+ const BYTE *const iend = ip + isize;
+ const BYTE *const oend = (BYTE *) dest + osize;
+ const BYTE *const mflimit = iend - MFLIMIT;
+#define matchlimit (iend - LASTLITERALS)
+
+ BYTE *op = (BYTE *) dest;
+
+ int len, length;
+ const int skipStrength = SKIPSTRENGTH;
+ U32 forwardH;
+
+
+ /* Init */
+ if (isize < MINLENGTH)
+ goto _last_literals;
+
+ /* First Byte */
+ HashTable[LZ4_HASH_VALUE(ip)] = ip - base;
+ ip++;
+ forwardH = LZ4_HASH_VALUE(ip);
+
+ /* Main Loop */
+ for (;;) {
+ int findMatchAttempts = (1U << skipStrength) + 3;
+ const BYTE *forwardIp = ip;
+ const BYTE *ref;
+ BYTE *token;
+
+ /* Find a match */
+ do {
+ U32 h = forwardH;
+ int step = findMatchAttempts++ >> skipStrength;
+ ip = forwardIp;
+ forwardIp = ip + step;
+
+ if unlikely(forwardIp > mflimit) {
+ goto _last_literals;
+ }
+
+ forwardH = LZ4_HASH_VALUE(forwardIp);
+ ref = base + HashTable[h];
+ HashTable[h] = ip - base;
+
+ } while ((ref < ip - MAX_DISTANCE) || (A32(ref) != A32(ip)));
+
+ /* Catch up */
+ while ((ip > anchor) && (ref > (const BYTE *) source) &&
+ unlikely(ip[-1] == ref[-1])) {
+ ip--;
+ ref--;
+ }
+
+ /* Encode Literal length */
+ length = ip - anchor;
+ token = op++;
+
+ /* Check output limit */
+ if unlikely(op + length + (2 + 1 + LASTLITERALS) +
+ (length >> 8) > oend)
+ return (0);
+
+ if (length >= (int)RUN_MASK) {
+ *token = (RUN_MASK << ML_BITS);
+ len = length - RUN_MASK;
+ for (; len > 254; len -= 255)
+ *op++ = 255;
+ *op++ = (BYTE)len;
+ } else
+ *token = (length << ML_BITS);
+
+ /* Copy Literals */
+ LZ4_BLINDCOPY(anchor, op, length);
+
+ _next_match:
+ /* Encode Offset */
+ LZ4_WRITE_LITTLEENDIAN_16(op, ip - ref);
+
+ /* Start Counting */
+ ip += MINMATCH;
+ ref += MINMATCH; /* MinMatch verified */
+ anchor = ip;
+ while likely(ip < matchlimit - (STEPSIZE - 1)) {
+ UARCH diff = AARCH(ref) ^ AARCH(ip);
+ if (!diff) {
+ ip += STEPSIZE;
+ ref += STEPSIZE;
+ continue;
+ }
+ ip += LZ4_NbCommonBytes(diff);
+ goto _endCount;
+ }
+#if LZ4_ARCH64
+ if ((ip < (matchlimit - 3)) && (A32(ref) == A32(ip))) {
+ ip += 4;
+ ref += 4;
+ }
+#endif
+ if ((ip < (matchlimit - 1)) && (A16(ref) == A16(ip))) {
+ ip += 2;
+ ref += 2;
+ }
+ if ((ip < matchlimit) && (*ref == *ip))
+ ip++;
+ _endCount:
+
+ /* Encode MatchLength */
+ len = (ip - anchor);
+ /* Check output limit */
+ if unlikely(op + (1 + LASTLITERALS) + (len >> 8) > oend)
+ return (0);
+ if (len >= (int)ML_MASK) {
+ *token += ML_MASK;
+ len -= ML_MASK;
+ for (; len > 509; len -= 510) {
+ *op++ = 255;
+ *op++ = 255;
+ }
+ if (len > 254) {
+ len -= 255;
+ *op++ = 255;
+ }
+ *op++ = (BYTE)len;
+ } else
+ *token += len;
+
+ /* Test end of chunk */
+ if (ip > mflimit) {
+ anchor = ip;
+ break;
+ }
+ /* Fill table */
+ HashTable[LZ4_HASH_VALUE(ip - 2)] = ip - 2 - base;
+
+ /* Test next position */
+ ref = base + HashTable[LZ4_HASH_VALUE(ip)];
+ HashTable[LZ4_HASH_VALUE(ip)] = ip - base;
+ if ((ref > ip - (MAX_DISTANCE + 1)) && (A32(ref) == A32(ip))) {
+ token = op++;
+ *token = 0;
+ goto _next_match;
+ }
+ /* Prepare next loop */
+ anchor = ip++;
+ forwardH = LZ4_HASH_VALUE(ip);
+ }
+
+ _last_literals:
+ /* Encode Last Literals */
+ {
+ int lastRun = iend - anchor;
+ if (op + lastRun + 1 + ((lastRun + 255 - RUN_MASK) / 255) >
+ oend)
+ return (0);
+ if (lastRun >= (int)RUN_MASK) {
+ *op++ = (RUN_MASK << ML_BITS);
+ lastRun -= RUN_MASK;
+ for (; lastRun > 254; lastRun -= 255) {
+ *op++ = 255;
+ }
+ *op++ = (BYTE)lastRun;
+ } else
+ *op++ = (lastRun << ML_BITS);
+ (void) memcpy(op, anchor, iend - anchor);
+ op += iend - anchor;
+ }
+
+ /* End */
+ return (int)(((char *)op) - dest);
+}
+
+
+
+/* Note : this function is valid only if isize < LZ4_64KLIMIT */
+#define LZ4_64KLIMIT ((1 << 16) + (MFLIMIT - 1))
+#define HASHLOG64K (HASH_LOG + 1)
+#define HASH64KTABLESIZE (1U << HASHLOG64K)
+#define LZ4_HASH64K_FUNCTION(i) (((i) * 2654435761U) >> ((MINMATCH*8) - \
+ HASHLOG64K))
+#define LZ4_HASH64K_VALUE(p) LZ4_HASH64K_FUNCTION(A32(p))
+
+/*ARGSUSED*/
+static int
+LZ4_compress64kCtx(void *ctx, const char *source, char *dest, int isize,
+ int osize)
+{
+#if HEAPMODE
+ struct refTables *srt = (struct refTables *)ctx;
+ U16 *HashTable = (U16 *) (srt->hashTable);
+#else
+ U16 HashTable[HASH64KTABLESIZE] = { 0 };
+#endif
+
+ const BYTE *ip = (const BYTE *) source;
+ const BYTE *anchor = ip;
+ const BYTE *const base = ip;
+ const BYTE *const iend = ip + isize;
+ const BYTE *const oend = (BYTE *) dest + osize;
+ const BYTE *const mflimit = iend - MFLIMIT;
+#define matchlimit (iend - LASTLITERALS)
+
+ BYTE *op = (BYTE *) dest;
+
+ int len, length;
+ const int skipStrength = SKIPSTRENGTH;
+ U32 forwardH;
+
+ /* Init */
+ if (isize < MINLENGTH)
+ goto _last_literals;
+
+ /* First Byte */
+ ip++;
+ forwardH = LZ4_HASH64K_VALUE(ip);
+
+ /* Main Loop */
+ for (;;) {
+ int findMatchAttempts = (1U << skipStrength) + 3;
+ const BYTE *forwardIp = ip;
+ const BYTE *ref;
+ BYTE *token;
+
+ /* Find a match */
+ do {
+ U32 h = forwardH;
+ int step = findMatchAttempts++ >> skipStrength;
+ ip = forwardIp;
+ forwardIp = ip + step;
+
+ if (forwardIp > mflimit) {
+ goto _last_literals;
+ }
+
+ forwardH = LZ4_HASH64K_VALUE(forwardIp);
+ ref = base + HashTable[h];
+ HashTable[h] = ip - base;
+
+ } while (A32(ref) != A32(ip));
+
+ /* Catch up */
+ while ((ip > anchor) && (ref > (const BYTE *) source) &&
+ (ip[-1] == ref[-1])) {
+ ip--;
+ ref--;
+ }
+
+ /* Encode Literal length */
+ length = ip - anchor;
+ token = op++;
+
+ /* Check output limit */
+ if unlikely(op + length + (2 + 1 + LASTLITERALS) +
+ (length >> 8) > oend)
+ return (0);
+
+ if (length >= (int)RUN_MASK) {
+ *token = (RUN_MASK << ML_BITS);
+ len = length - RUN_MASK;
+ for (; len > 254; len -= 255)
+ *op++ = 255;
+ *op++ = (BYTE)len;
+ } else
+ *token = (length << ML_BITS);
+
+ /* Copy Literals */
+ LZ4_BLINDCOPY(anchor, op, length);
+
+ _next_match:
+ /* Encode Offset */
+ LZ4_WRITE_LITTLEENDIAN_16(op, ip - ref);
+
+ /* Start Counting */
+ ip += MINMATCH;
+ ref += MINMATCH; /* MinMatch verified */
+ anchor = ip;
+ while (ip < matchlimit - (STEPSIZE - 1)) {
+ UARCH diff = AARCH(ref) ^ AARCH(ip);
+ if (!diff) {
+ ip += STEPSIZE;
+ ref += STEPSIZE;
+ continue;
+ }
+ ip += LZ4_NbCommonBytes(diff);
+ goto _endCount;
+ }
+#if LZ4_ARCH64
+ if ((ip < (matchlimit - 3)) && (A32(ref) == A32(ip))) {
+ ip += 4;
+ ref += 4;
+ }
+#endif
+ if ((ip < (matchlimit - 1)) && (A16(ref) == A16(ip))) {
+ ip += 2;
+ ref += 2;
+ }
+ if ((ip < matchlimit) && (*ref == *ip))
+ ip++;
+ _endCount:
+
+ /* Encode MatchLength */
+ len = (ip - anchor);
+ /* Check output limit */
+ if unlikely(op + (1 + LASTLITERALS) + (len >> 8) > oend)
+ return (0);
+ if (len >= (int)ML_MASK) {
+ *token += ML_MASK;
+ len -= ML_MASK;
+ for (; len > 509; len -= 510) {
+ *op++ = 255;
+ *op++ = 255;
+ }
+ if (len > 254) {
+ len -= 255;
+ *op++ = 255;
+ }
+ *op++ = (BYTE)len;
+ } else
+ *token += len;
+
+ /* Test end of chunk */
+ if (ip > mflimit) {
+ anchor = ip;
+ break;
+ }
+ /* Fill table */
+ HashTable[LZ4_HASH64K_VALUE(ip - 2)] = ip - 2 - base;
+
+ /* Test next position */
+ ref = base + HashTable[LZ4_HASH64K_VALUE(ip)];
+ HashTable[LZ4_HASH64K_VALUE(ip)] = ip - base;
+ if (A32(ref) == A32(ip)) {
+ token = op++;
+ *token = 0;
+ goto _next_match;
+ }
+ /* Prepare next loop */
+ anchor = ip++;
+ forwardH = LZ4_HASH64K_VALUE(ip);
+ }
+
+ _last_literals:
+ /* Encode Last Literals */
+ {
+ int lastRun = iend - anchor;
+ if (op + lastRun + 1 + ((lastRun + 255 - RUN_MASK) / 255) >
+ oend)
+ return (0);
+ if (lastRun >= (int)RUN_MASK) {
+ *op++ = (RUN_MASK << ML_BITS);
+ lastRun -= RUN_MASK;
+ for (; lastRun > 254; lastRun -= 255)
+ *op++ = 255;
+ *op++ = (BYTE)lastRun;
+ } else
+ *op++ = (lastRun << ML_BITS);
+ (void) memcpy(op, anchor, iend - anchor);
+ op += iend - anchor;
+ }
+
+ /* End */
+ return (int)(((char *)op) - dest);
+}
+
+static int
+real_LZ4_compress(const char *source, char *dest, int isize, int osize)
+{
+#if HEAPMODE
+#if defined(_KERNEL) || defined(_FAKE_KERNEL)
+ void *ctx = kmem_cache_alloc(lz4_ctx_cache, KM_NOSLEEP);
+#else
+ void *ctx = calloc(1, sizeof(struct refTables));
+#endif
+ int result;
+
+ /*
+ * out of kernel memory, gently fall through - this will disable
+ * compression in zio_compress_data
+ */
+ if (ctx == NULL)
+ return (0);
+
+ bzero(ctx, sizeof(struct refTables));
+ if (isize < LZ4_64KLIMIT)
+ result = LZ4_compress64kCtx(ctx, source, dest, isize, osize);
+ else
+ result = LZ4_compressCtx(ctx, source, dest, isize, osize);
+
+#if defined(_KERNEL) || defined(_FAKE_KERNEL)
+ kmem_cache_free(lz4_ctx_cache, ctx);
+#else
+ free(ctx);
+#endif
+ return (result);
+#else
+ if (isize < (int)LZ4_64KLIMIT)
+ return (LZ4_compress64kCtx(NULL, source, dest, isize, osize));
+ return (LZ4_compressCtx(NULL, source, dest, isize, osize));
+#endif
+}
+
+/* Decompression functions */
+
+/*
+ * Note: The decoding function LZ4_uncompress_unknownOutputSize() is safe
+ * against "buffer overflow" attacks. It will never write nor read
+ * outside of the provided output buffer.
+ * LZ4_uncompress_unknownOutputSize() also ensures that it will never
+ * read outside of the input buffer. A corrupted input will produce an
+ * error result, a negative int, indicating the position of the error
+ * within the input stream.
+ */
+
+static int
+LZ4_uncompress_unknownOutputSize(const char *source, char *dest, int isize,
+ int maxOutputSize)
+{
+ /* Local Variables */
+ const BYTE *restrict ip = (const BYTE *) source;
+ const BYTE *const iend = ip + isize;
+ const BYTE *ref;
+
+ BYTE *op = (BYTE *) dest;
+ BYTE *const oend = op + maxOutputSize;
+ BYTE *cpy;
+
+ size_t dec32table[] = {0, 3, 2, 3, 0, 0, 0, 0};
+#if LZ4_ARCH64
+ size_t dec64table[] = {0, 0, 0, (size_t)-1, 0, 1, 2, 3};
+#endif
+
+ /* Main Loop */
+ while (ip < iend) {
+ unsigned token;
+ size_t length;
+
+ /* get runlength */
+ token = *ip++;
+ if ((length = (token >> ML_BITS)) == RUN_MASK) {
+ int s = 255;
+ while ((ip < iend) && (s == 255)) {
+ s = *ip++;
+ length += s;
+ }
+ }
+ /* copy literals */
+ cpy = op + length;
+ /* CORNER-CASE: cpy might overflow. */
+ if (cpy < op)
+ goto _output_error; /* cpy was overflowed, bail! */
+ if ((cpy > oend - COPYLENGTH) ||
+ (ip + length > iend - COPYLENGTH)) {
+ if (cpy > oend)
+ /* Error: writes beyond output buffer */
+ goto _output_error;
+ if (ip + length != iend)
+ /*
+ * Error: the LZ4 format requires consuming
+ * all input at this stage
+ */
+ goto _output_error;
+ (void) memcpy(op, ip, length);
+ op += length;
+ /* Necessarily EOF, due to parsing restrictions */
+ break;
+ }
+ LZ4_WILDCOPY(ip, op, cpy);
+ ip -= (op - cpy);
+ op = cpy;
+
+ /* get offset */
+ LZ4_READ_LITTLEENDIAN_16(ref, cpy, ip);
+ ip += 2;
+ if (ref < (BYTE * const) dest)
+ /*
+ * Error: offset creates reference outside of
+ * destination buffer
+ */
+ goto _output_error;
+
+ /* get matchlength */
+ if ((length = (token & ML_MASK)) == ML_MASK) {
+ while (ip < iend) {
+ int s = *ip++;
+ length += s;
+ if (s == 255)
+ continue;
+ break;
+ }
+ }
+ /* copy repeated sequence */
+ if unlikely(op - ref < STEPSIZE) {
+#if LZ4_ARCH64
+ size_t dec64 = dec64table[op-ref];
+#else
+ const int dec64 = 0;
+#endif
+ op[0] = ref[0];
+ op[1] = ref[1];
+ op[2] = ref[2];
+ op[3] = ref[3];
+ op += 4;
+ ref += 4;
+ ref -= dec32table[op-ref];
+ A32(op) = A32(ref);
+ op += STEPSIZE - 4;
+ ref -= dec64;
+ } else {
+ LZ4_COPYSTEP(ref, op);
+ }
+ cpy = op + length - (STEPSIZE - 4);
+ if (cpy > oend - COPYLENGTH) {
+ if (cpy > oend)
+ /*
+ * Error: request to write outside of
+ * destination buffer
+ */
+ goto _output_error;
+ LZ4_SECURECOPY(ref, op, (oend - COPYLENGTH));
+ while (op < cpy)
+ *op++ = *ref++;
+ op = cpy;
+ if (op == oend)
+ /*
+ * Check EOF (should never happen, since
+ * last 5 bytes are supposed to be literals)
+ */
+ goto _output_error;
+ continue;
+ }
+ LZ4_SECURECOPY(ref, op, cpy);
+ op = cpy; /* correction */
+ }
+
+ /* end of decoding */
+ return (int)(((char *)op) - dest);
+
+ /* write overflow error detected */
+ _output_error:
+ return (int)(-(((const char *)ip) - source));
+}
+
+#if defined(_KERNEL) || defined(_FAKE_KERNEL)
+extern void
+lz4_init(void)
+{
+
+#if HEAPMODE
+ lz4_ctx_cache = kmem_cache_create("lz4_ctx", sizeof(struct refTables),
+ 0, NULL, NULL, NULL, NULL, NULL, 0);
+#endif
+}
+
+extern void
+lz4_fini(void)
+{
+
+#if HEAPMODE
+ kmem_cache_destroy(lz4_ctx_cache);
+#endif
+}
+#endif /* _KERNEL || _FAKE_KERNEL */
diff --git a/sys/cddl/contrib/opensolaris/common/lz4/lz4.h b/sys/cddl/contrib/opensolaris/common/lz4/lz4.h
new file mode 100644
index 000000000000..bf48ed53bc7f
--- /dev/null
+++ b/sys/cddl/contrib/opensolaris/common/lz4/lz4.h
@@ -0,0 +1,50 @@
+/*
+ * LZ4 - Fast LZ compression algorithm
+ * Header File
+ * BSD 2-Clause License (http://www.opensource.org/licenses/bsd-license.php)
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are
+ * met:
+ *
+ * * Redistributions of source code must retain the above copyright
+ * notice, this list of conditions and the following disclaimer.
+ * * Redistributions in binary form must reproduce the above
+ * copyright notice, this list of conditions and the following disclaimer
+ * in the documentation and/or other materials provided with the
+ * distribution.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+ * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+ * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+ * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+ * OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+ * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+ * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+ * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+ * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ *
+ * You can contact the author at :
+ * - LZ4 homepage : http://fastcompression.blogspot.com/p/lz4.html
+ * - LZ4 source repository : http://code.google.com/p/lz4/
+ */
+
+#ifndef _LZ4_H
+#define _LZ4_H
+
+#include <sys/types.h>
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+extern size_t lz4_compress(void *, void *, size_t, size_t, int);
+extern int lz4_decompress(void *, void *, size_t, size_t, int);
+
+#ifdef __cplusplus
+}
+#endif
+
+#endif /* _LZ4_H */
diff --git a/sys/cddl/contrib/opensolaris/common/nvpair/opensolaris_fnvpair.c b/sys/cddl/contrib/opensolaris/common/nvpair/opensolaris_fnvpair.c
new file mode 100644
index 000000000000..eb200a24e6d2
--- /dev/null
+++ b/sys/cddl/contrib/opensolaris/common/nvpair/opensolaris_fnvpair.c
@@ -0,0 +1,512 @@
+
+/*
+ * CDDL HEADER START
+ *
+ * The contents of this file are subject to the terms of the
+ * Common Development and Distribution License (the "License").
+ * You may not use this file except in compliance with the License.
+ *
+ * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
+ * or http://www.opensolaris.org/os/licensing.
+ * See the License for the specific language governing permissions
+ * and limitations under the License.
+ *
+ * When distributing Covered Code, include this CDDL HEADER in each
+ * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
+ * If applicable, add the following below this CDDL HEADER, with the
+ * fields enclosed by brackets "[]" replaced with your own identifying
+ * information: Portions Copyright [yyyy] [name of copyright owner]
+ *
+ * CDDL HEADER END
+ */
+
+/*
+ * Copyright (c) 2012 by Delphix. All rights reserved.
+ */
+
+#include <sys/nvpair.h>
+#ifndef _KERNEL
+#include <sys/zfs_context.h>
+#else
+#include <sys/debug.h>
+#include <sys/kmem.h>
+#include <sys/param.h>
+#include <sys/debug.h>
+#endif
+
+/*
+ * "Force" nvlist wrapper.
+ *
+ * These functions wrap the nvlist_* functions with assertions that assume
+ * the operation is successful. This allows the caller's code to be much
+ * more readable, especially for the fnvlist_lookup_* and fnvpair_value_*
+ * functions, which can return the requested value (rather than filling in
+ * a pointer).
+ *
+ * These functions use NV_UNIQUE_NAME, encoding NV_ENCODE_NATIVE, and allocate
+ * with KM_SLEEP.
+ *
+ * More wrappers should be added as needed -- for example
+ * nvlist_lookup_*_array and nvpair_value_*_array.
+ */
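+
+/*
+ * Usage sketch (names are illustrative): the wrappers allow
+ * straight-line code with no error handling:
+ *
+ * nvlist_t *nvl = fnvlist_alloc();
+ * fnvlist_add_uint64(nvl, "guid", guid);
+ * fnvlist_add_string(nvl, "name", name);
+ * uint64_t v = fnvlist_lookup_uint64(nvl, "guid");
+ * fnvlist_free(nvl);
+ */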
+
+nvlist_t *
+fnvlist_alloc(void)
+{
+ nvlist_t *nvl;
+ VERIFY0(nvlist_alloc(&nvl, NV_UNIQUE_NAME, KM_SLEEP));
+ return (nvl);
+}
+
+void
+fnvlist_free(nvlist_t *nvl)
+{
+ nvlist_free(nvl);
+}
+
+size_t
+fnvlist_size(nvlist_t *nvl)
+{
+ size_t size;
+ VERIFY0(nvlist_size(nvl, &size, NV_ENCODE_NATIVE));
+ return (size);
+}
+
+/*
+ * Returns allocated buffer of size *sizep. Caller must free the buffer with
+ * fnvlist_pack_free().
+ */
+char *
+fnvlist_pack(nvlist_t *nvl, size_t *sizep)
+{
+ char *packed = NULL;
+ VERIFY3U(nvlist_pack(nvl, &packed, sizep, NV_ENCODE_NATIVE,
+ KM_SLEEP), ==, 0);
+ return (packed);
+}
+
+/*ARGSUSED*/
+void
+fnvlist_pack_free(char *pack, size_t size)
+{
+#ifdef _KERNEL
+ kmem_free(pack, size);
+#else
+ free(pack);
+#endif
+}
+
+nvlist_t *
+fnvlist_unpack(char *buf, size_t buflen)
+{
+ nvlist_t *rv;
+ VERIFY0(nvlist_unpack(buf, buflen, &rv, KM_SLEEP));
+ return (rv);
+}
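+
+/*
+ * Usage sketch (names are illustrative): a pack/unpack round trip.
+ * The packed buffer must be released with fnvlist_pack_free(), passing
+ * the size that fnvlist_pack() reported:
+ *
+ * size_t len;
+ * char *buf = fnvlist_pack(nvl, &len);
+ * nvlist_t *copy = fnvlist_unpack(buf, len);
+ * fnvlist_pack_free(buf, len);
+ */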
+
+nvlist_t *
+fnvlist_dup(nvlist_t *nvl)
+{
+ nvlist_t *rv;
+ VERIFY0(nvlist_dup(nvl, &rv, KM_SLEEP));
+ return (rv);
+}
+
+void
+fnvlist_merge(nvlist_t *dst, nvlist_t *src)
+{
+ VERIFY0(nvlist_merge(dst, src, KM_SLEEP));
+}
+
+size_t
+fnvlist_num_pairs(nvlist_t *nvl)
+{
+ size_t count = 0;
+ nvpair_t *pair;
+
+ for (pair = nvlist_next_nvpair(nvl, NULL); pair != NULL;
+ pair = nvlist_next_nvpair(nvl, pair))
+ count++;
+ return (count);
+}
+
+void
+fnvlist_add_boolean(nvlist_t *nvl, const char *name)
+{
+ VERIFY0(nvlist_add_boolean(nvl, name));
+}
+
+void
+fnvlist_add_boolean_value(nvlist_t *nvl, const char *name, boolean_t val)
+{
+ VERIFY0(nvlist_add_boolean_value(nvl, name, val));
+}
+
+void
+fnvlist_add_byte(nvlist_t *nvl, const char *name, uchar_t val)
+{
+ VERIFY0(nvlist_add_byte(nvl, name, val));
+}
+
+void
+fnvlist_add_int8(nvlist_t *nvl, const char *name, int8_t val)
+{
+ VERIFY0(nvlist_add_int8(nvl, name, val));
+}
+
+void
+fnvlist_add_uint8(nvlist_t *nvl, const char *name, uint8_t val)
+{
+ VERIFY0(nvlist_add_uint8(nvl, name, val));
+}
+
+void
+fnvlist_add_int16(nvlist_t *nvl, const char *name, int16_t val)
+{
+ VERIFY0(nvlist_add_int16(nvl, name, val));
+}
+
+void
+fnvlist_add_uint16(nvlist_t *nvl, const char *name, uint16_t val)
+{
+ VERIFY0(nvlist_add_uint16(nvl, name, val));
+}
+
+void
+fnvlist_add_int32(nvlist_t *nvl, const char *name, int32_t val)
+{
+ VERIFY0(nvlist_add_int32(nvl, name, val));
+}
+
+void
+fnvlist_add_uint32(nvlist_t *nvl, const char *name, uint32_t val)
+{
+ VERIFY0(nvlist_add_uint32(nvl, name, val));
+}
+
+void
+fnvlist_add_int64(nvlist_t *nvl, const char *name, int64_t val)
+{
+ VERIFY0(nvlist_add_int64(nvl, name, val));
+}
+
+void
+fnvlist_add_uint64(nvlist_t *nvl, const char *name, uint64_t val)
+{
+ VERIFY0(nvlist_add_uint64(nvl, name, val));
+}
+
+void
+fnvlist_add_string(nvlist_t *nvl, const char *name, const char *val)
+{
+ VERIFY0(nvlist_add_string(nvl, name, val));
+}
+
+void
+fnvlist_add_nvlist(nvlist_t *nvl, const char *name, nvlist_t *val)
+{
+ VERIFY0(nvlist_add_nvlist(nvl, name, val));
+}
+
+void
+fnvlist_add_nvpair(nvlist_t *nvl, nvpair_t *pair)
+{
+ VERIFY0(nvlist_add_nvpair(nvl, pair));
+}
+
+void
+fnvlist_add_boolean_array(nvlist_t *nvl, const char *name,
+ boolean_t *val, uint_t n)
+{
+ VERIFY0(nvlist_add_boolean_array(nvl, name, val, n));
+}
+
+void
+fnvlist_add_byte_array(nvlist_t *nvl, const char *name, uchar_t *val, uint_t n)
+{
+ VERIFY0(nvlist_add_byte_array(nvl, name, val, n));
+}
+
+void
+fnvlist_add_int8_array(nvlist_t *nvl, const char *name, int8_t *val, uint_t n)
+{
+ VERIFY0(nvlist_add_int8_array(nvl, name, val, n));
+}
+
+void
+fnvlist_add_uint8_array(nvlist_t *nvl, const char *name, uint8_t *val, uint_t n)
+{
+ VERIFY0(nvlist_add_uint8_array(nvl, name, val, n));
+}
+
+void
+fnvlist_add_int16_array(nvlist_t *nvl, const char *name, int16_t *val, uint_t n)
+{
+ VERIFY0(nvlist_add_int16_array(nvl, name, val, n));
+}
+
+void
+fnvlist_add_uint16_array(nvlist_t *nvl, const char *name,
+ uint16_t *val, uint_t n)
+{
+ VERIFY0(nvlist_add_uint16_array(nvl, name, val, n));
+}
+
+void
+fnvlist_add_int32_array(nvlist_t *nvl, const char *name, int32_t *val, uint_t n)
+{
+ VERIFY0(nvlist_add_int32_array(nvl, name, val, n));
+}
+
+void
+fnvlist_add_uint32_array(nvlist_t *nvl, const char *name,
+ uint32_t *val, uint_t n)
+{
+ VERIFY0(nvlist_add_uint32_array(nvl, name, val, n));
+}
+
+void
+fnvlist_add_int64_array(nvlist_t *nvl, const char *name, int64_t *val, uint_t n)
+{
+ VERIFY0(nvlist_add_int64_array(nvl, name, val, n));
+}
+
+void
+fnvlist_add_uint64_array(nvlist_t *nvl, const char *name,
+ uint64_t *val, uint_t n)
+{
+ VERIFY0(nvlist_add_uint64_array(nvl, name, val, n));
+}
+
+void
+fnvlist_add_string_array(nvlist_t *nvl, const char *name,
+ char * const *val, uint_t n)
+{
+ VERIFY0(nvlist_add_string_array(nvl, name, val, n));
+}
+
+void
+fnvlist_add_nvlist_array(nvlist_t *nvl, const char *name,
+ nvlist_t **val, uint_t n)
+{
+ VERIFY0(nvlist_add_nvlist_array(nvl, name, val, n));
+}
+
+void
+fnvlist_remove(nvlist_t *nvl, const char *name)
+{
+ VERIFY0(nvlist_remove_all(nvl, name));
+}
+
+void
+fnvlist_remove_nvpair(nvlist_t *nvl, nvpair_t *pair)
+{
+ VERIFY0(nvlist_remove_nvpair(nvl, pair));
+}
+
+nvpair_t *
+fnvlist_lookup_nvpair(nvlist_t *nvl, const char *name)
+{
+ nvpair_t *rv;
+ VERIFY0(nvlist_lookup_nvpair(nvl, name, &rv));
+ return (rv);
+}
+
+/* returns B_TRUE if the entry exists */
+boolean_t
+fnvlist_lookup_boolean(nvlist_t *nvl, const char *name)
+{
+ return (nvlist_lookup_boolean(nvl, name) == 0);
+}
+
+boolean_t
+fnvlist_lookup_boolean_value(nvlist_t *nvl, const char *name)
+{
+ boolean_t rv;
+ VERIFY0(nvlist_lookup_boolean_value(nvl, name, &rv));
+ return (rv);
+}
+
+uchar_t
+fnvlist_lookup_byte(nvlist_t *nvl, const char *name)
+{
+ uchar_t rv;
+ VERIFY0(nvlist_lookup_byte(nvl, name, &rv));
+ return (rv);
+}
+
+int8_t
+fnvlist_lookup_int8(nvlist_t *nvl, const char *name)
+{
+ int8_t rv;
+ VERIFY0(nvlist_lookup_int8(nvl, name, &rv));
+ return (rv);
+}
+
+int16_t
+fnvlist_lookup_int16(nvlist_t *nvl, const char *name)
+{
+ int16_t rv;
+ VERIFY0(nvlist_lookup_int16(nvl, name, &rv));
+ return (rv);
+}
+
+int32_t
+fnvlist_lookup_int32(nvlist_t *nvl, const char *name)
+{
+ int32_t rv;
+ VERIFY0(nvlist_lookup_int32(nvl, name, &rv));
+ return (rv);
+}
+
+int64_t
+fnvlist_lookup_int64(nvlist_t *nvl, const char *name)
+{
+ int64_t rv;
+ VERIFY0(nvlist_lookup_int64(nvl, name, &rv));
+ return (rv);
+}
+
+uint8_t
+fnvlist_lookup_uint8_t(nvlist_t *nvl, const char *name)
+{
+ uint8_t rv;
+ VERIFY0(nvlist_lookup_uint8(nvl, name, &rv));
+ return (rv);
+}
+
+uint16_t
+fnvlist_lookup_uint16(nvlist_t *nvl, const char *name)
+{
+ uint16_t rv;
+ VERIFY0(nvlist_lookup_uint16(nvl, name, &rv));
+ return (rv);
+}
+
+uint32_t
+fnvlist_lookup_uint32(nvlist_t *nvl, const char *name)
+{
+ uint32_t rv;
+ VERIFY0(nvlist_lookup_uint32(nvl, name, &rv));
+ return (rv);
+}
+
+uint64_t
+fnvlist_lookup_uint64(nvlist_t *nvl, const char *name)
+{
+ uint64_t rv;
+ VERIFY0(nvlist_lookup_uint64(nvl, name, &rv));
+ return (rv);
+}
+
+char *
+fnvlist_lookup_string(nvlist_t *nvl, const char *name)
+{
+ char *rv;
+ VERIFY0(nvlist_lookup_string(nvl, name, &rv));
+ return (rv);
+}
+
+nvlist_t *
+fnvlist_lookup_nvlist(nvlist_t *nvl, const char *name)
+{
+ nvlist_t *rv;
+ VERIFY0(nvlist_lookup_nvlist(nvl, name, &rv));
+ return (rv);
+}
+
+boolean_t
+fnvpair_value_boolean_value(nvpair_t *nvp)
+{
+ boolean_t rv;
+ VERIFY0(nvpair_value_boolean_value(nvp, &rv));
+ return (rv);
+}
+
+uchar_t
+fnvpair_value_byte(nvpair_t *nvp)
+{
+ uchar_t rv;
+ VERIFY0(nvpair_value_byte(nvp, &rv));
+ return (rv);
+}
+
+int8_t
+fnvpair_value_int8(nvpair_t *nvp)
+{
+ int8_t rv;
+ VERIFY0(nvpair_value_int8(nvp, &rv));
+ return (rv);
+}
+
+int16_t
+fnvpair_value_int16(nvpair_t *nvp)
+{
+ int16_t rv;
+ VERIFY0(nvpair_value_int16(nvp, &rv));
+ return (rv);
+}
+
+int32_t
+fnvpair_value_int32(nvpair_t *nvp)
+{
+ int32_t rv;
+ VERIFY0(nvpair_value_int32(nvp, &rv));
+ return (rv);
+}
+
+int64_t
+fnvpair_value_int64(nvpair_t *nvp)
+{
+ int64_t rv;
+ VERIFY0(nvpair_value_int64(nvp, &rv));
+ return (rv);
+}
+
+uint8_t
+fnvpair_value_uint8_t(nvpair_t *nvp)
+{
+ uint8_t rv;
+ VERIFY0(nvpair_value_uint8(nvp, &rv));
+ return (rv);
+}
+
+uint16_t
+fnvpair_value_uint16(nvpair_t *nvp)
+{
+ uint16_t rv;
+ VERIFY0(nvpair_value_uint16(nvp, &rv));
+ return (rv);
+}
+
+uint32_t
+fnvpair_value_uint32(nvpair_t *nvp)
+{
+ uint32_t rv;
+ VERIFY0(nvpair_value_uint32(nvp, &rv));
+ return (rv);
+}
+
+uint64_t
+fnvpair_value_uint64(nvpair_t *nvp)
+{
+ uint64_t rv;
+ VERIFY0(nvpair_value_uint64(nvp, &rv));
+ return (rv);
+}
+
+char *
+fnvpair_value_string(nvpair_t *nvp)
+{
+ char *rv;
+ VERIFY0(nvpair_value_string(nvp, &rv));
+ return (rv);
+}
+
+nvlist_t *
+fnvpair_value_nvlist(nvpair_t *nvp)
+{
+ nvlist_t *rv;
+ VERIFY0(nvpair_value_nvlist(nvp, &rv));
+ return (rv);
+}
diff --git a/sys/cddl/contrib/opensolaris/common/nvpair/opensolaris_nvpair.c b/sys/cddl/contrib/opensolaris/common/nvpair/opensolaris_nvpair.c
new file mode 100644
index 000000000000..c322a5bd2179
--- /dev/null
+++ b/sys/cddl/contrib/opensolaris/common/nvpair/opensolaris_nvpair.c
@@ -0,0 +1,3600 @@
+/*
+ * CDDL HEADER START
+ *
+ * The contents of this file are subject to the terms of the
+ * Common Development and Distribution License (the "License").
+ * You may not use this file except in compliance with the License.
+ *
+ * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
+ * or http://www.opensolaris.org/os/licensing.
+ * See the License for the specific language governing permissions
+ * and limitations under the License.
+ *
+ * When distributing Covered Code, include this CDDL HEADER in each
+ * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
+ * If applicable, add the following below this CDDL HEADER, with the
+ * fields enclosed by brackets "[]" replaced with your own identifying
+ * information: Portions Copyright [yyyy] [name of copyright owner]
+ *
+ * CDDL HEADER END
+ */
+
+/*
+ * Copyright (c) 2000, 2010, Oracle and/or its affiliates. All rights reserved.
+ * Copyright (c) 2015, 2017 by Delphix. All rights reserved.
+ */
+
+#include <sys/debug.h>
+#include <sys/nvpair.h>
+#include <sys/nvpair_impl.h>
+#include <rpc/types.h>
+#include <rpc/xdr.h>
+
+#if defined(_KERNEL) && !defined(_BOOT)
+#include <sys/varargs.h>
+#include <sys/sunddi.h>
+#else
+#include <stdarg.h>
+#include <stdlib.h>
+#include <string.h>
+#include <strings.h>
+#endif
+
+#ifndef offsetof
+#define offsetof(s, m) ((size_t)(&(((s *)0)->m)))
+#endif
+#define skip_whitespace(p) while ((*(p) == ' ') || (*(p) == '\t')) p++
+
+#if defined(__FreeBSD__) && !defined(_KERNEL)
+/*
+ * libnvpair is the lowest common denominator for ZFS related libraries,
+ * defining aok here makes it usable by all ZFS related libraries
+ */
+int aok;
+#endif
+
+/*
+ * nvpair.c - Provides kernel & userland interfaces for manipulating
+ * name-value pairs.
+ *
+ * Overview Diagram
+ *
+ * +--------------+
+ * | nvlist_t |
+ * |--------------|
+ * | nvl_version |
+ * | nvl_nvflag |
+ * | nvl_priv -+-+
+ * | nvl_flag | |
+ * | nvl_pad | |
+ * +--------------+ |
+ * V
+ * +--------------+ last i_nvp in list
+ * | nvpriv_t | +--------------------->
+ * |--------------| |
+ * +--+- nvp_list | | +------------+
+ * | | nvp_last -+--+ + nv_alloc_t |
+ * | | nvp_curr | |------------|
+ * | | nvp_nva -+----> | nva_ops |
+ * | | nvp_stat | | nva_arg |
+ * | +--------------+ +------------+
+ * |
+ * +-------+
+ * V
+ * +---------------------+ +-------------------+
+ * | i_nvp_t | +-->| i_nvp_t | +-->
+ * |---------------------| | |-------------------| |
+ * | nvi_next -+--+ | nvi_next -+--+
+ * | nvi_prev (NULL) | <----+ nvi_prev |
+ * | . . . . . . . . . . | | . . . . . . . . . |
+ * | nvp (nvpair_t) | | nvp (nvpair_t) |
+ * | - nvp_size | | - nvp_size |
+ * | - nvp_name_sz | | - nvp_name_sz |
+ * | - nvp_value_elem | | - nvp_value_elem |
+ * | - nvp_type | | - nvp_type |
+ * | - data ... | | - data ... |
+ * +---------------------+ +-------------------+
+ *
+ *
+ *
+ * +---------------------+ +---------------------+
+ * | i_nvp_t | +--> +-->| i_nvp_t (last) |
+ * |---------------------| | | |---------------------|
+ * | nvi_next -+--+ ... --+ | nvi_next (NULL) |
+ * <-+- nvi_prev |<-- ... <----+ nvi_prev |
+ * | . . . . . . . . . | | . . . . . . . . . |
+ * | nvp (nvpair_t) | | nvp (nvpair_t) |
+ * | - nvp_size | | - nvp_size |
+ * | - nvp_name_sz | | - nvp_name_sz |
+ * | - nvp_value_elem | | - nvp_value_elem |
+ * | - DATA_TYPE_NVLIST | | - nvp_type |
+ * | - data (embedded) | | - data ... |
+ * | nvlist name | +---------------------+
+ * | +--------------+ |
+ * | | nvlist_t | |
+ * | |--------------| |
+ * | | nvl_version | |
+ * | | nvl_nvflag | |
+ * | | nvl_priv --+---+---->
+ * | | nvl_flag | |
+ * | | nvl_pad | |
+ * | +--------------+ |
+ * +---------------------+
+ *
+ *
+ * N.B. nvpair_t may be aligned on 4 byte boundary, so +4 will
+ * allow value to be aligned on 8 byte boundary
+ *
+ * name_len is the length of the name string including the null terminator
+ * so it must be >= 1
+ */
+#define NVP_SIZE_CALC(name_len, data_len) \
+ (NV_ALIGN((sizeof (nvpair_t)) + name_len) + NV_ALIGN(data_len))
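+
+/*
+ * Worked example (assuming 8-byte NV_ALIGN): for the name "guid"
+ * (name_len = 5, including the terminator) and a uint64_t value
+ * (data_len = 8), the pair occupies
+ * NV_ALIGN(sizeof (nvpair_t) + 5) + NV_ALIGN(8) bytes: the header and
+ * name are rounded up together, and the value is aligned separately.
+ */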
+
+static int i_get_value_size(data_type_t type, const void *data, uint_t nelem);
+static int nvlist_add_common(nvlist_t *nvl, const char *name, data_type_t type,
+ uint_t nelem, const void *data);
+
+#define NV_STAT_EMBEDDED 0x1
+#define EMBEDDED_NVL(nvp) ((nvlist_t *)(void *)NVP_VALUE(nvp))
+#define EMBEDDED_NVL_ARRAY(nvp) ((nvlist_t **)(void *)NVP_VALUE(nvp))
+
+#define NVP_VALOFF(nvp) (NV_ALIGN(sizeof (nvpair_t) + (nvp)->nvp_name_sz))
+#define NVPAIR2I_NVP(nvp) \
+ ((i_nvp_t *)((size_t)(nvp) - offsetof(i_nvp_t, nvi_nvp)))
+
+#ifdef _KERNEL
+int nvpair_max_recursion = 20;
+#else
+int nvpair_max_recursion = 100;
+#endif
+
+uint64_t nvlist_hashtable_init_size = (1 << 4);
+
+int
+nv_alloc_init(nv_alloc_t *nva, const nv_alloc_ops_t *nvo, /* args */ ...)
+{
+ va_list valist;
+ int err = 0;
+
+ nva->nva_ops = nvo;
+ nva->nva_arg = NULL;
+
+ va_start(valist, nvo);
+ if (nva->nva_ops->nv_ao_init != NULL)
+ err = nva->nva_ops->nv_ao_init(nva, valist);
+ va_end(valist);
+
+ return (err);
+}
+
+void
+nv_alloc_reset(nv_alloc_t *nva)
+{
+ if (nva->nva_ops->nv_ao_reset != NULL)
+ nva->nva_ops->nv_ao_reset(nva);
+}
+
+void
+nv_alloc_fini(nv_alloc_t *nva)
+{
+ if (nva->nva_ops->nv_ao_fini != NULL)
+ nva->nva_ops->nv_ao_fini(nva);
+}
+
+nv_alloc_t *
+nvlist_lookup_nv_alloc(nvlist_t *nvl)
+{
+ nvpriv_t *priv;
+
+ if (nvl == NULL ||
+ (priv = (nvpriv_t *)(uintptr_t)nvl->nvl_priv) == NULL)
+ return (NULL);
+
+ return (priv->nvp_nva);
+}
+
+static void *
+nv_mem_zalloc(nvpriv_t *nvp, size_t size)
+{
+ nv_alloc_t *nva = nvp->nvp_nva;
+ void *buf;
+
+ if ((buf = nva->nva_ops->nv_ao_alloc(nva, size)) != NULL)
+ bzero(buf, size);
+
+ return (buf);
+}
+
+static void
+nv_mem_free(nvpriv_t *nvp, void *buf, size_t size)
+{
+ nv_alloc_t *nva = nvp->nvp_nva;
+
+ nva->nva_ops->nv_ao_free(nva, buf, size);
+}
+
+static void
+nv_priv_init(nvpriv_t *priv, nv_alloc_t *nva, uint32_t stat)
+{
+ bzero(priv, sizeof (nvpriv_t));
+
+ priv->nvp_nva = nva;
+ priv->nvp_stat = stat;
+}
+
+static nvpriv_t *
+nv_priv_alloc(nv_alloc_t *nva)
+{
+ nvpriv_t *priv;
+
+ /*
+ * nv_mem_zalloc() cannot be called here because it needs the priv
+ * argument.
+ */
+ if ((priv = nva->nva_ops->nv_ao_alloc(nva, sizeof (nvpriv_t))) == NULL)
+ return (NULL);
+
+ nv_priv_init(priv, nva, 0);
+
+ return (priv);
+}
+
+/*
+ * Embedded lists need their own nvpriv_t's. We create a new
+ * nvpriv_t using the parameters and allocator from the parent
+ * list's nvpriv_t.
+ */
+static nvpriv_t *
+nv_priv_alloc_embedded(nvpriv_t *priv)
+{
+ nvpriv_t *emb_priv;
+
+ if ((emb_priv = nv_mem_zalloc(priv, sizeof (nvpriv_t))) == NULL)
+ return (NULL);
+
+ nv_priv_init(emb_priv, priv->nvp_nva, NV_STAT_EMBEDDED);
+
+ return (emb_priv);
+}
+
+static int
+nvt_tab_alloc(nvpriv_t *priv, uint64_t buckets)
+{
+ ASSERT3P(priv->nvp_hashtable, ==, NULL);
+ ASSERT0(priv->nvp_nbuckets);
+ ASSERT0(priv->nvp_nentries);
+
+ i_nvp_t **tab = nv_mem_zalloc(priv, buckets * sizeof (i_nvp_t *));
+ if (tab == NULL)
+ return (ENOMEM);
+
+ priv->nvp_hashtable = tab;
+ priv->nvp_nbuckets = buckets;
+ return (0);
+}
+
+static void
+nvt_tab_free(nvpriv_t *priv)
+{
+ i_nvp_t **tab = priv->nvp_hashtable;
+ if (tab == NULL) {
+ ASSERT0(priv->nvp_nbuckets);
+ ASSERT0(priv->nvp_nentries);
+ return;
+ }
+
+ nv_mem_free(priv, tab, priv->nvp_nbuckets * sizeof (i_nvp_t *));
+
+ priv->nvp_hashtable = NULL;
+ priv->nvp_nbuckets = 0;
+ priv->nvp_nentries = 0;
+}
+
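+/*
+ * Classic PJW/ELF-style string hash. Table sizes are kept at powers of
+ * two (see nvt_resize()), so callers derive a bucket index by masking:
+ * nvt_hash(name) & (nvp_nbuckets - 1).
+ */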
+static uint32_t
+nvt_hash(const char *p)
+{
+ uint32_t g, hval = 0;
+
+ while (*p) {
+ hval = (hval << 4) + *p++;
+ if ((g = (hval & 0xf0000000)) != 0)
+ hval ^= g >> 24;
+ hval &= ~g;
+ }
+ return (hval);
+}
+
+static boolean_t
+nvt_nvpair_match(nvpair_t *nvp1, nvpair_t *nvp2, uint32_t nvflag)
+{
+ boolean_t match = B_FALSE;
+ if (nvflag & NV_UNIQUE_NAME_TYPE) {
+ if (strcmp(NVP_NAME(nvp1), NVP_NAME(nvp2)) == 0 &&
+ NVP_TYPE(nvp1) == NVP_TYPE(nvp2))
+ match = B_TRUE;
+ } else {
+ ASSERT(nvflag == 0 || nvflag & NV_UNIQUE_NAME);
+ if (strcmp(NVP_NAME(nvp1), NVP_NAME(nvp2)) == 0)
+ match = B_TRUE;
+ }
+ return (match);
+}
+
+static nvpair_t *
+nvt_lookup_name_type(nvlist_t *nvl, const char *name, data_type_t type)
+{
+ nvpriv_t *priv = (nvpriv_t *)(uintptr_t)nvl->nvl_priv;
+ ASSERT(priv != NULL);
+
+ i_nvp_t **tab = priv->nvp_hashtable;
+
+ if (tab == NULL) {
+ ASSERT3P(priv->nvp_list, ==, NULL);
+ ASSERT0(priv->nvp_nbuckets);
+ ASSERT0(priv->nvp_nentries);
+ return (NULL);
+ } else {
+ ASSERT(priv->nvp_nbuckets != 0);
+ }
+
+ uint64_t hash = nvt_hash(name);
+ uint64_t index = hash & (priv->nvp_nbuckets - 1);
+
+ ASSERT3U(index, <, priv->nvp_nbuckets);
+ i_nvp_t *entry = tab[index];
+
+ for (i_nvp_t *e = entry; e != NULL; e = e->nvi_hashtable_next) {
+ if (strcmp(NVP_NAME(&e->nvi_nvp), name) == 0 &&
+ (type == DATA_TYPE_DONTCARE ||
+ NVP_TYPE(&e->nvi_nvp) == type))
+ return (&e->nvi_nvp);
+ }
+ return (NULL);
+}
+
+static nvpair_t *
+nvt_lookup_name(nvlist_t *nvl, const char *name)
+{
+ return (nvt_lookup_name_type(nvl, name, DATA_TYPE_DONTCARE));
+}
+
+static int
+nvt_resize(nvpriv_t *priv, uint32_t new_size)
+{
+ i_nvp_t **tab = priv->nvp_hashtable;
+
+ /*
+ * Migrate all the entries from the current table
+ * to a newly-allocated table with the new size by
+ * re-adjusting the pointers of their entries.
+ */
+ uint32_t size = priv->nvp_nbuckets;
+ uint32_t new_mask = new_size - 1;
+ ASSERT(((new_size) & ((new_size) - 1)) == 0);
+
+ i_nvp_t **new_tab = nv_mem_zalloc(priv, new_size * sizeof (i_nvp_t *));
+ if (new_tab == NULL)
+ return (ENOMEM);
+
+ uint32_t nentries = 0;
+ for (uint32_t i = 0; i < size; i++) {
+ i_nvp_t *next, *e = tab[i];
+
+ while (e != NULL) {
+ next = e->nvi_hashtable_next;
+
+ uint32_t hash = nvt_hash(NVP_NAME(&e->nvi_nvp));
+ uint32_t index = hash & new_mask;
+
+ e->nvi_hashtable_next = new_tab[index];
+ new_tab[index] = e;
+ nentries++;
+
+ e = next;
+ }
+ tab[i] = NULL;
+ }
+ ASSERT3U(nentries, ==, priv->nvp_nentries);
+
+ nvt_tab_free(priv);
+
+ priv->nvp_hashtable = new_tab;
+ priv->nvp_nbuckets = new_size;
+ priv->nvp_nentries = nentries;
+
+ return (0);
+}
+
+static boolean_t
+nvt_needs_togrow(nvpriv_t *priv)
+{
+ /*
+ * Grow only when we have more elements than buckets
+ * and the # of buckets doesn't overflow.
+ */
+ return (priv->nvp_nentries > priv->nvp_nbuckets &&
+ (UINT32_MAX >> 1) >= priv->nvp_nbuckets);
+}
+
+/*
+ * Allocate a new table that's twice the size of the old one,
+ * and migrate all the entries from the old one to the new
+ * one by re-adjusting their pointers.
+ */
+static int
+nvt_grow(nvpriv_t *priv)
+{
+ uint32_t current_size = priv->nvp_nbuckets;
+ /* ensure we won't overflow */
+ ASSERT3U(UINT32_MAX >> 1, >=, current_size);
+ return (nvt_resize(priv, current_size << 1));
+}
+
+static boolean_t
+nvt_needs_toshrink(nvpriv_t *priv)
+{
+ /*
+ * Shrink only when the # of elements is less than or
+	 * equal to 1/4 of the # of buckets. Never shrink below
+ * nvlist_hashtable_init_size.
+ */
+ ASSERT3U(priv->nvp_nbuckets, >=, nvlist_hashtable_init_size);
+ if (priv->nvp_nbuckets == nvlist_hashtable_init_size)
+ return (B_FALSE);
+ return (priv->nvp_nentries <= (priv->nvp_nbuckets >> 2));
+}
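+
+/*
+ * The grow and shrink thresholds deliberately leave a gap: a table grows
+ * once nentries exceeds nbuckets, but shrinks only once nentries drops to
+ * a quarter of nbuckets. For example (illustrative numbers), a 64-bucket
+ * table grows to 128 buckets at 65 entries, yet does not shrink back to
+ * 64 until the count falls to 32, so a workload oscillating around one
+ * threshold cannot trigger a resize on every insert/remove.
+ */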
+
+/*
+ * Allocate a new table that's half the size of the old one,
+ * and migrate all the entries from the old one to the new
+ * one by re-adjusting their pointers.
+ */
+static int
+nvt_shrink(nvpriv_t *priv)
+{
+ uint32_t current_size = priv->nvp_nbuckets;
+ /* ensure we won't overflow */
+ ASSERT3U(current_size, >=, nvlist_hashtable_init_size);
+ return (nvt_resize(priv, current_size >> 1));
+}
+
+static int
+nvt_remove_nvpair(nvlist_t *nvl, nvpair_t *nvp)
+{
+ nvpriv_t *priv = (nvpriv_t *)(uintptr_t)nvl->nvl_priv;
+
+ if (nvt_needs_toshrink(priv)) {
+ int err = nvt_shrink(priv);
+ if (err != 0)
+ return (err);
+ }
+ i_nvp_t **tab = priv->nvp_hashtable;
+
+ char *name = NVP_NAME(nvp);
+ uint64_t hash = nvt_hash(name);
+ uint64_t index = hash & (priv->nvp_nbuckets - 1);
+
+ ASSERT3U(index, <, priv->nvp_nbuckets);
+ i_nvp_t *bucket = tab[index];
+
+ for (i_nvp_t *prev = NULL, *e = bucket;
+ e != NULL; prev = e, e = e->nvi_hashtable_next) {
+ if (nvt_nvpair_match(&e->nvi_nvp, nvp, nvl->nvl_flag)) {
+ if (prev != NULL) {
+ prev->nvi_hashtable_next =
+ e->nvi_hashtable_next;
+ } else {
+ ASSERT3P(e, ==, bucket);
+ tab[index] = e->nvi_hashtable_next;
+ }
+ e->nvi_hashtable_next = NULL;
+ priv->nvp_nentries--;
+ break;
+ }
+ }
+
+ return (0);
+}
+
+static int
+nvt_add_nvpair(nvlist_t *nvl, nvpair_t *nvp)
+{
+ nvpriv_t *priv = (nvpriv_t *)(uintptr_t)nvl->nvl_priv;
+
+	/* Initialize the hash table if it doesn't already exist. */
+ if (priv->nvp_hashtable == NULL) {
+ int err = nvt_tab_alloc(priv, nvlist_hashtable_init_size);
+ if (err != 0)
+ return (err);
+ }
+
+ /*
+ * if we don't allow duplicate entries, make sure to
+ * unlink any existing entries from the table.
+ */
+ if (nvl->nvl_nvflag != 0) {
+ int err = nvt_remove_nvpair(nvl, nvp);
+ if (err != 0)
+ return (err);
+ }
+
+ if (nvt_needs_togrow(priv)) {
+ int err = nvt_grow(priv);
+ if (err != 0)
+ return (err);
+ }
+ i_nvp_t **tab = priv->nvp_hashtable;
+
+ char *name = NVP_NAME(nvp);
+ uint64_t hash = nvt_hash(name);
+ uint64_t index = hash & (priv->nvp_nbuckets - 1);
+
+ ASSERT3U(index, <, priv->nvp_nbuckets);
+ i_nvp_t *bucket = tab[index];
+
+ /* insert link at the beginning of the bucket */
+ i_nvp_t *new_entry = NVPAIR2I_NVP(nvp);
+ ASSERT3P(new_entry->nvi_hashtable_next, ==, NULL);
+ new_entry->nvi_hashtable_next = bucket;
+ tab[index] = new_entry;
+
+ priv->nvp_nentries++;
+ return (0);
+}
+
+static void
+nvlist_init(nvlist_t *nvl, uint32_t nvflag, nvpriv_t *priv)
+{
+ nvl->nvl_version = NV_VERSION;
+ nvl->nvl_nvflag = nvflag & (NV_UNIQUE_NAME|NV_UNIQUE_NAME_TYPE);
+ nvl->nvl_priv = (uint64_t)(uintptr_t)priv;
+ nvl->nvl_flag = 0;
+ nvl->nvl_pad = 0;
+}
+
+uint_t
+nvlist_nvflag(nvlist_t *nvl)
+{
+ return (nvl->nvl_nvflag);
+}
+
+/*
+ * nvlist_alloc - Allocate nvlist.
+ */
+/*ARGSUSED1*/
+int
+nvlist_alloc(nvlist_t **nvlp, uint_t nvflag, int kmflag)
+{
+#if defined(_KERNEL) && !defined(_BOOT)
+ return (nvlist_xalloc(nvlp, nvflag,
+ (kmflag == KM_SLEEP ? nv_alloc_sleep : nv_alloc_nosleep)));
+#else
+ return (nvlist_xalloc(nvlp, nvflag, nv_alloc_nosleep));
+#endif
+}
+
+int
+nvlist_xalloc(nvlist_t **nvlp, uint_t nvflag, nv_alloc_t *nva)
+{
+ nvpriv_t *priv;
+
+ if (nvlp == NULL || nva == NULL)
+ return (EINVAL);
+
+ if ((priv = nv_priv_alloc(nva)) == NULL)
+ return (ENOMEM);
+
+ if ((*nvlp = nv_mem_zalloc(priv,
+ NV_ALIGN(sizeof (nvlist_t)))) == NULL) {
+ nv_mem_free(priv, priv, sizeof (nvpriv_t));
+ return (ENOMEM);
+ }
+
+ nvlist_init(*nvlp, nvflag, priv);
+
+ return (0);
+}
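+
+/*
+ * A minimal userland usage sketch for the allocation entry points above
+ * (illustrative only; error handling condensed):
+ */
+#if 0
+	nvlist_t *nvl;
+
+	if (nvlist_alloc(&nvl, NV_UNIQUE_NAME, 0) == 0) {
+		/* ... populate and use nvl ... */
+		nvlist_free(nvl);
+	}
+#endif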
+
+/*
+ * nvp_buf_alloc - Allocate an i_nvp_t for storing a new nvpair.
+ */
+static nvpair_t *
+nvp_buf_alloc(nvlist_t *nvl, size_t len)
+{
+ nvpriv_t *priv = (nvpriv_t *)(uintptr_t)nvl->nvl_priv;
+ i_nvp_t *buf;
+ nvpair_t *nvp;
+ size_t nvsize;
+
+ /*
+ * Allocate the buffer
+ */
+ nvsize = len + offsetof(i_nvp_t, nvi_nvp);
+
+ if ((buf = nv_mem_zalloc(priv, nvsize)) == NULL)
+ return (NULL);
+
+ nvp = &buf->nvi_nvp;
+ nvp->nvp_size = len;
+
+ return (nvp);
+}
+
+/*
+ * nvp_buf_free - deallocate an i_nvp_t.
+ */
+static void
+nvp_buf_free(nvlist_t *nvl, nvpair_t *nvp)
+{
+ nvpriv_t *priv = (nvpriv_t *)(uintptr_t)nvl->nvl_priv;
+ size_t nvsize = nvp->nvp_size + offsetof(i_nvp_t, nvi_nvp);
+
+ nv_mem_free(priv, NVPAIR2I_NVP(nvp), nvsize);
+}
+
+/*
+ * nvp_buf_link - link a new nvpair into the nvlist.
+ */
+static void
+nvp_buf_link(nvlist_t *nvl, nvpair_t *nvp)
+{
+ nvpriv_t *priv = (nvpriv_t *)(uintptr_t)nvl->nvl_priv;
+ i_nvp_t *curr = NVPAIR2I_NVP(nvp);
+
+ /* Put element at end of nvlist */
+ if (priv->nvp_list == NULL) {
+ priv->nvp_list = priv->nvp_last = curr;
+ } else {
+ curr->nvi_prev = priv->nvp_last;
+ priv->nvp_last->nvi_next = curr;
+ priv->nvp_last = curr;
+ }
+}
+
+/*
+ * nvp_buf_unlink - unlink a removed nvpair from the nvlist.
+ */
+static void
+nvp_buf_unlink(nvlist_t *nvl, nvpair_t *nvp)
+{
+ nvpriv_t *priv = (nvpriv_t *)(uintptr_t)nvl->nvl_priv;
+ i_nvp_t *curr = NVPAIR2I_NVP(nvp);
+
+ /*
+ * protect nvlist_next_nvpair() against walking on freed memory.
+ */
+ if (priv->nvp_curr == curr)
+ priv->nvp_curr = curr->nvi_next;
+
+ if (curr == priv->nvp_list)
+ priv->nvp_list = curr->nvi_next;
+ else
+ curr->nvi_prev->nvi_next = curr->nvi_next;
+
+ if (curr == priv->nvp_last)
+ priv->nvp_last = curr->nvi_prev;
+ else
+ curr->nvi_next->nvi_prev = curr->nvi_prev;
+}
+
+/*
+ * Take an nvpair type and number of elements and make sure they are valid.
+ */
+static int
+i_validate_type_nelem(data_type_t type, uint_t nelem)
+{
+ switch (type) {
+ case DATA_TYPE_BOOLEAN:
+ if (nelem != 0)
+ return (EINVAL);
+ break;
+ case DATA_TYPE_BOOLEAN_VALUE:
+ case DATA_TYPE_BYTE:
+ case DATA_TYPE_INT8:
+ case DATA_TYPE_UINT8:
+ case DATA_TYPE_INT16:
+ case DATA_TYPE_UINT16:
+ case DATA_TYPE_INT32:
+ case DATA_TYPE_UINT32:
+ case DATA_TYPE_INT64:
+ case DATA_TYPE_UINT64:
+ case DATA_TYPE_STRING:
+ case DATA_TYPE_HRTIME:
+ case DATA_TYPE_NVLIST:
+#if !defined(_KERNEL)
+ case DATA_TYPE_DOUBLE:
+#endif
+ if (nelem != 1)
+ return (EINVAL);
+ break;
+ case DATA_TYPE_BOOLEAN_ARRAY:
+ case DATA_TYPE_BYTE_ARRAY:
+ case DATA_TYPE_INT8_ARRAY:
+ case DATA_TYPE_UINT8_ARRAY:
+ case DATA_TYPE_INT16_ARRAY:
+ case DATA_TYPE_UINT16_ARRAY:
+ case DATA_TYPE_INT32_ARRAY:
+ case DATA_TYPE_UINT32_ARRAY:
+ case DATA_TYPE_INT64_ARRAY:
+ case DATA_TYPE_UINT64_ARRAY:
+ case DATA_TYPE_STRING_ARRAY:
+ case DATA_TYPE_NVLIST_ARRAY:
+ /* we allow arrays with 0 elements */
+ break;
+ default:
+ return (EINVAL);
+ }
+ return (0);
+}
+
+/*
+ * Verify nvp_name_sz and check the name string length.
+ */
+static int
+i_validate_nvpair_name(nvpair_t *nvp)
+{
+ if ((nvp->nvp_name_sz <= 0) ||
+ (nvp->nvp_size < NVP_SIZE_CALC(nvp->nvp_name_sz, 0)))
+ return (EFAULT);
+
+	/* verify the name string, make sure it's NUL-terminated */
+ if (NVP_NAME(nvp)[nvp->nvp_name_sz - 1] != '\0')
+ return (EFAULT);
+
+ return (strlen(NVP_NAME(nvp)) == nvp->nvp_name_sz - 1 ? 0 : EFAULT);
+}
+
+static int
+i_validate_nvpair_value(data_type_t type, uint_t nelem, const void *data)
+{
+ switch (type) {
+ case DATA_TYPE_BOOLEAN_VALUE:
+ if (*(boolean_t *)data != B_TRUE &&
+ *(boolean_t *)data != B_FALSE)
+ return (EINVAL);
+ break;
+ case DATA_TYPE_BOOLEAN_ARRAY: {
+ int i;
+
+ for (i = 0; i < nelem; i++)
+ if (((boolean_t *)data)[i] != B_TRUE &&
+ ((boolean_t *)data)[i] != B_FALSE)
+ return (EINVAL);
+ break;
+ }
+ default:
+ break;
+ }
+
+ return (0);
+}
+
+/*
+ * This function takes a pointer to what should be an nvpair and its size
+ * and then verifies that all the nvpair fields make sense and can be
+ * trusted. This function is used when decoding packed nvpairs.
+ */
+static int
+i_validate_nvpair(nvpair_t *nvp)
+{
+ data_type_t type = NVP_TYPE(nvp);
+ int size1, size2;
+
+ /* verify nvp_name_sz, check the name string length */
+ if (i_validate_nvpair_name(nvp) != 0)
+ return (EFAULT);
+
+ if (i_validate_nvpair_value(type, NVP_NELEM(nvp), NVP_VALUE(nvp)) != 0)
+ return (EFAULT);
+
+ /*
+ * verify nvp_type, nvp_value_elem, and also possibly
+ * verify string values and get the value size.
+ */
+ size2 = i_get_value_size(type, NVP_VALUE(nvp), NVP_NELEM(nvp));
+ size1 = nvp->nvp_size - NVP_VALOFF(nvp);
+ if (size2 < 0 || size1 != NV_ALIGN(size2))
+ return (EFAULT);
+
+ return (0);
+}
+
+static int
+nvlist_copy_pairs(nvlist_t *snvl, nvlist_t *dnvl)
+{
+ nvpriv_t *priv;
+ i_nvp_t *curr;
+
+ if ((priv = (nvpriv_t *)(uintptr_t)snvl->nvl_priv) == NULL)
+ return (EINVAL);
+
+ for (curr = priv->nvp_list; curr != NULL; curr = curr->nvi_next) {
+ nvpair_t *nvp = &curr->nvi_nvp;
+ int err;
+
+ if ((err = nvlist_add_common(dnvl, NVP_NAME(nvp), NVP_TYPE(nvp),
+ NVP_NELEM(nvp), NVP_VALUE(nvp))) != 0)
+ return (err);
+ }
+
+ return (0);
+}
+
+/*
+ * Frees all memory allocated for an nvpair (like embedded lists) with
+ * the exception of the nvpair buffer itself.
+ */
+static void
+nvpair_free(nvpair_t *nvp)
+{
+ switch (NVP_TYPE(nvp)) {
+ case DATA_TYPE_NVLIST:
+ nvlist_free(EMBEDDED_NVL(nvp));
+ break;
+ case DATA_TYPE_NVLIST_ARRAY: {
+ nvlist_t **nvlp = EMBEDDED_NVL_ARRAY(nvp);
+ int i;
+
+ for (i = 0; i < NVP_NELEM(nvp); i++)
+ nvlist_free(nvlp[i]);
+ break;
+ }
+ default:
+ break;
+ }
+}
+
+/*
+ * nvlist_free - free an unpacked nvlist
+ */
+void
+nvlist_free(nvlist_t *nvl)
+{
+ nvpriv_t *priv;
+ i_nvp_t *curr;
+
+ if (nvl == NULL ||
+ (priv = (nvpriv_t *)(uintptr_t)nvl->nvl_priv) == NULL)
+ return;
+
+ /*
+	 * Unpacked nvlists are linked through their i_nvp_t headers
+ */
+ curr = priv->nvp_list;
+ while (curr != NULL) {
+ nvpair_t *nvp = &curr->nvi_nvp;
+ curr = curr->nvi_next;
+
+ nvpair_free(nvp);
+ nvp_buf_free(nvl, nvp);
+ }
+
+ if (!(priv->nvp_stat & NV_STAT_EMBEDDED))
+ nv_mem_free(priv, nvl, NV_ALIGN(sizeof (nvlist_t)));
+ else
+ nvl->nvl_priv = 0;
+
+ nvt_tab_free(priv);
+ nv_mem_free(priv, priv, sizeof (nvpriv_t));
+}
+
+static int
+nvlist_contains_nvp(nvlist_t *nvl, nvpair_t *nvp)
+{
+ nvpriv_t *priv = (nvpriv_t *)(uintptr_t)nvl->nvl_priv;
+ i_nvp_t *curr;
+
+ if (nvp == NULL)
+ return (0);
+
+ for (curr = priv->nvp_list; curr != NULL; curr = curr->nvi_next)
+ if (&curr->nvi_nvp == nvp)
+ return (1);
+
+ return (0);
+}
+
+/*
+ * Make a copy of nvlist
+ */
+/*ARGSUSED1*/
+int
+nvlist_dup(nvlist_t *nvl, nvlist_t **nvlp, int kmflag)
+{
+#if defined(_KERNEL) && !defined(_BOOT)
+ return (nvlist_xdup(nvl, nvlp,
+ (kmflag == KM_SLEEP ? nv_alloc_sleep : nv_alloc_nosleep)));
+#else
+ return (nvlist_xdup(nvl, nvlp, nv_alloc_nosleep));
+#endif
+}
+
+int
+nvlist_xdup(nvlist_t *nvl, nvlist_t **nvlp, nv_alloc_t *nva)
+{
+ int err;
+ nvlist_t *ret;
+
+ if (nvl == NULL || nvlp == NULL)
+ return (EINVAL);
+
+ if ((err = nvlist_xalloc(&ret, nvl->nvl_nvflag, nva)) != 0)
+ return (err);
+
+ if ((err = nvlist_copy_pairs(nvl, ret)) != 0)
+ nvlist_free(ret);
+ else
+ *nvlp = ret;
+
+ return (err);
+}
+
+/*
+ * Remove all with matching name
+ */
+int
+nvlist_remove_all(nvlist_t *nvl, const char *name)
+{
+ int error = ENOENT;
+
+ if (nvl == NULL || name == NULL || nvl->nvl_priv == 0)
+ return (EINVAL);
+
+ nvpair_t *nvp;
+ while ((nvp = nvt_lookup_name(nvl, name)) != NULL) {
+ VERIFY0(nvlist_remove_nvpair(nvl, nvp));
+ error = 0;
+ }
+
+ return (error);
+}
+
+/*
+ * Remove first one with matching name and type
+ */
+int
+nvlist_remove(nvlist_t *nvl, const char *name, data_type_t type)
+{
+ if (nvl == NULL || name == NULL || nvl->nvl_priv == 0)
+ return (EINVAL);
+
+ nvpair_t *nvp = nvt_lookup_name_type(nvl, name, type);
+ if (nvp == NULL)
+ return (ENOENT);
+
+ return (nvlist_remove_nvpair(nvl, nvp));
+}
+
+int
+nvlist_remove_nvpair(nvlist_t *nvl, nvpair_t *nvp)
+{
+ if (nvl == NULL || nvp == NULL)
+ return (EINVAL);
+
+ int err = nvt_remove_nvpair(nvl, nvp);
+ if (err != 0)
+ return (err);
+
+ nvp_buf_unlink(nvl, nvp);
+ nvpair_free(nvp);
+ nvp_buf_free(nvl, nvp);
+ return (0);
+}
+
+/*
+ * This function calculates the size of an nvpair value.
+ *
+ * The data argument controls the behavior for the data types
+ * DATA_TYPE_STRING and DATA_TYPE_STRING_ARRAY:
+ * if data == NULL, the size of the string(s) is excluded.
+ */
+static int
+i_get_value_size(data_type_t type, const void *data, uint_t nelem)
+{
+ uint64_t value_sz;
+
+ if (i_validate_type_nelem(type, nelem) != 0)
+ return (-1);
+
+ /* Calculate required size for holding value */
+ switch (type) {
+ case DATA_TYPE_BOOLEAN:
+ value_sz = 0;
+ break;
+ case DATA_TYPE_BOOLEAN_VALUE:
+ value_sz = sizeof (boolean_t);
+ break;
+ case DATA_TYPE_BYTE:
+ value_sz = sizeof (uchar_t);
+ break;
+ case DATA_TYPE_INT8:
+ value_sz = sizeof (int8_t);
+ break;
+ case DATA_TYPE_UINT8:
+ value_sz = sizeof (uint8_t);
+ break;
+ case DATA_TYPE_INT16:
+ value_sz = sizeof (int16_t);
+ break;
+ case DATA_TYPE_UINT16:
+ value_sz = sizeof (uint16_t);
+ break;
+ case DATA_TYPE_INT32:
+ value_sz = sizeof (int32_t);
+ break;
+ case DATA_TYPE_UINT32:
+ value_sz = sizeof (uint32_t);
+ break;
+ case DATA_TYPE_INT64:
+ value_sz = sizeof (int64_t);
+ break;
+ case DATA_TYPE_UINT64:
+ value_sz = sizeof (uint64_t);
+ break;
+#if !defined(_KERNEL)
+ case DATA_TYPE_DOUBLE:
+ value_sz = sizeof (double);
+ break;
+#endif
+ case DATA_TYPE_STRING:
+ if (data == NULL)
+ value_sz = 0;
+ else
+ value_sz = strlen(data) + 1;
+ break;
+ case DATA_TYPE_BOOLEAN_ARRAY:
+ value_sz = (uint64_t)nelem * sizeof (boolean_t);
+ break;
+ case DATA_TYPE_BYTE_ARRAY:
+ value_sz = (uint64_t)nelem * sizeof (uchar_t);
+ break;
+ case DATA_TYPE_INT8_ARRAY:
+ value_sz = (uint64_t)nelem * sizeof (int8_t);
+ break;
+ case DATA_TYPE_UINT8_ARRAY:
+ value_sz = (uint64_t)nelem * sizeof (uint8_t);
+ break;
+ case DATA_TYPE_INT16_ARRAY:
+ value_sz = (uint64_t)nelem * sizeof (int16_t);
+ break;
+ case DATA_TYPE_UINT16_ARRAY:
+ value_sz = (uint64_t)nelem * sizeof (uint16_t);
+ break;
+ case DATA_TYPE_INT32_ARRAY:
+ value_sz = (uint64_t)nelem * sizeof (int32_t);
+ break;
+ case DATA_TYPE_UINT32_ARRAY:
+ value_sz = (uint64_t)nelem * sizeof (uint32_t);
+ break;
+ case DATA_TYPE_INT64_ARRAY:
+ value_sz = (uint64_t)nelem * sizeof (int64_t);
+ break;
+ case DATA_TYPE_UINT64_ARRAY:
+ value_sz = (uint64_t)nelem * sizeof (uint64_t);
+ break;
+ case DATA_TYPE_STRING_ARRAY:
+ value_sz = (uint64_t)nelem * sizeof (uint64_t);
+
+ if (data != NULL) {
+ char *const *strs = data;
+ uint_t i;
+
+ /* no alignment requirement for strings */
+ for (i = 0; i < nelem; i++) {
+ if (strs[i] == NULL)
+ return (-1);
+ value_sz += strlen(strs[i]) + 1;
+ }
+ }
+ break;
+ case DATA_TYPE_HRTIME:
+ value_sz = sizeof (hrtime_t);
+ break;
+ case DATA_TYPE_NVLIST:
+ value_sz = NV_ALIGN(sizeof (nvlist_t));
+ break;
+ case DATA_TYPE_NVLIST_ARRAY:
+ value_sz = (uint64_t)nelem * sizeof (uint64_t) +
+ (uint64_t)nelem * NV_ALIGN(sizeof (nvlist_t));
+ break;
+ default:
+ return (-1);
+ }
+
+ return (value_sz > INT32_MAX ? -1 : (int)value_sz);
+}
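+
+/*
+ * Worked example (illustrative): for DATA_TYPE_STRING_ARRAY with nelem = 2
+ * and the strings "ab" and "c", the value size is 2 * sizeof (uint64_t) =
+ * 16 bytes of pointer slots plus the string bytes (3 + 2), i.e. 21. With
+ * data == NULL only the 16 pointer-slot bytes are counted.
+ */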
+
+static int
+nvlist_copy_embedded(nvlist_t *nvl, nvlist_t *onvl, nvlist_t *emb_nvl)
+{
+ nvpriv_t *priv;
+ int err;
+
+ if ((priv = nv_priv_alloc_embedded((nvpriv_t *)(uintptr_t)
+ nvl->nvl_priv)) == NULL)
+ return (ENOMEM);
+
+ nvlist_init(emb_nvl, onvl->nvl_nvflag, priv);
+
+ if ((err = nvlist_copy_pairs(onvl, emb_nvl)) != 0) {
+ nvlist_free(emb_nvl);
+ emb_nvl->nvl_priv = 0;
+ }
+
+ return (err);
+}
+
+/*
+ * nvlist_add_common - Add new <name,value> pair to nvlist
+ */
+static int
+nvlist_add_common(nvlist_t *nvl, const char *name,
+ data_type_t type, uint_t nelem, const void *data)
+{
+ nvpair_t *nvp;
+ uint_t i;
+
+ int nvp_sz, name_sz, value_sz;
+ int err = 0;
+
+ if (name == NULL || nvl == NULL || nvl->nvl_priv == 0)
+ return (EINVAL);
+
+ if (nelem != 0 && data == NULL)
+ return (EINVAL);
+
+ /*
+ * Verify type and nelem and get the value size.
+	 * For the data types DATA_TYPE_STRING and DATA_TYPE_STRING_ARRAY
+	 * the size of the string(s) is included.
+ */
+ if ((value_sz = i_get_value_size(type, data, nelem)) < 0)
+ return (EINVAL);
+
+ if (i_validate_nvpair_value(type, nelem, data) != 0)
+ return (EINVAL);
+
+ /*
+ * If we're adding an nvlist or nvlist array, ensure that we are not
+ * adding the input nvlist to itself, which would cause recursion,
+ * and ensure that no NULL nvlist pointers are present.
+ */
+ switch (type) {
+ case DATA_TYPE_NVLIST:
+ if (data == nvl || data == NULL)
+ return (EINVAL);
+ break;
+ case DATA_TYPE_NVLIST_ARRAY: {
+ nvlist_t **onvlp = (nvlist_t **)data;
+ for (i = 0; i < nelem; i++) {
+ if (onvlp[i] == nvl || onvlp[i] == NULL)
+ return (EINVAL);
+ }
+ break;
+ }
+ default:
+ break;
+ }
+
+ /* calculate sizes of the nvpair elements and the nvpair itself */
+ name_sz = strlen(name) + 1;
+ if (name_sz >= 1ULL << (sizeof (nvp->nvp_name_sz) * 8 - 1))
+ return (EINVAL);
+
+ nvp_sz = NVP_SIZE_CALC(name_sz, value_sz);
+
+ if ((nvp = nvp_buf_alloc(nvl, nvp_sz)) == NULL)
+ return (ENOMEM);
+
+ ASSERT(nvp->nvp_size == nvp_sz);
+ nvp->nvp_name_sz = name_sz;
+ nvp->nvp_value_elem = nelem;
+ nvp->nvp_type = type;
+ bcopy(name, NVP_NAME(nvp), name_sz);
+
+ switch (type) {
+ case DATA_TYPE_BOOLEAN:
+ break;
+ case DATA_TYPE_STRING_ARRAY: {
+ char *const *strs = data;
+ char *buf = NVP_VALUE(nvp);
+ char **cstrs = (void *)buf;
+
+ /* skip pre-allocated space for pointer array */
+ buf += nelem * sizeof (uint64_t);
+ for (i = 0; i < nelem; i++) {
+ int slen = strlen(strs[i]) + 1;
+ bcopy(strs[i], buf, slen);
+ cstrs[i] = buf;
+ buf += slen;
+ }
+ break;
+ }
+ case DATA_TYPE_NVLIST: {
+ nvlist_t *nnvl = EMBEDDED_NVL(nvp);
+ nvlist_t *onvl = (nvlist_t *)data;
+
+ if ((err = nvlist_copy_embedded(nvl, onvl, nnvl)) != 0) {
+ nvp_buf_free(nvl, nvp);
+ return (err);
+ }
+ break;
+ }
+ case DATA_TYPE_NVLIST_ARRAY: {
+ nvlist_t **onvlp = (nvlist_t **)data;
+ nvlist_t **nvlp = EMBEDDED_NVL_ARRAY(nvp);
+ nvlist_t *embedded = (nvlist_t *)
+ ((uintptr_t)nvlp + nelem * sizeof (uint64_t));
+
+ for (i = 0; i < nelem; i++) {
+ if ((err = nvlist_copy_embedded(nvl,
+ onvlp[i], embedded)) != 0) {
+ /*
+ * Free any successfully created lists
+ */
+ nvpair_free(nvp);
+ nvp_buf_free(nvl, nvp);
+ return (err);
+ }
+
+ nvlp[i] = embedded++;
+ }
+ break;
+ }
+ default:
+ bcopy(data, NVP_VALUE(nvp), value_sz);
+ }
+
+ /* if unique name, remove before add */
+ if (nvl->nvl_nvflag & NV_UNIQUE_NAME)
+ (void) nvlist_remove_all(nvl, name);
+ else if (nvl->nvl_nvflag & NV_UNIQUE_NAME_TYPE)
+ (void) nvlist_remove(nvl, name, type);
+
+ err = nvt_add_nvpair(nvl, nvp);
+ if (err != 0) {
+ nvpair_free(nvp);
+ nvp_buf_free(nvl, nvp);
+ return (err);
+ }
+ nvp_buf_link(nvl, nvp);
+
+ return (0);
+}
+
+int
+nvlist_add_boolean(nvlist_t *nvl, const char *name)
+{
+ return (nvlist_add_common(nvl, name, DATA_TYPE_BOOLEAN, 0, NULL));
+}
+
+int
+nvlist_add_boolean_value(nvlist_t *nvl, const char *name, boolean_t val)
+{
+ return (nvlist_add_common(nvl, name, DATA_TYPE_BOOLEAN_VALUE, 1, &val));
+}
+
+int
+nvlist_add_byte(nvlist_t *nvl, const char *name, uchar_t val)
+{
+ return (nvlist_add_common(nvl, name, DATA_TYPE_BYTE, 1, &val));
+}
+
+int
+nvlist_add_int8(nvlist_t *nvl, const char *name, int8_t val)
+{
+ return (nvlist_add_common(nvl, name, DATA_TYPE_INT8, 1, &val));
+}
+
+int
+nvlist_add_uint8(nvlist_t *nvl, const char *name, uint8_t val)
+{
+ return (nvlist_add_common(nvl, name, DATA_TYPE_UINT8, 1, &val));
+}
+
+int
+nvlist_add_int16(nvlist_t *nvl, const char *name, int16_t val)
+{
+ return (nvlist_add_common(nvl, name, DATA_TYPE_INT16, 1, &val));
+}
+
+int
+nvlist_add_uint16(nvlist_t *nvl, const char *name, uint16_t val)
+{
+ return (nvlist_add_common(nvl, name, DATA_TYPE_UINT16, 1, &val));
+}
+
+int
+nvlist_add_int32(nvlist_t *nvl, const char *name, int32_t val)
+{
+ return (nvlist_add_common(nvl, name, DATA_TYPE_INT32, 1, &val));
+}
+
+int
+nvlist_add_uint32(nvlist_t *nvl, const char *name, uint32_t val)
+{
+ return (nvlist_add_common(nvl, name, DATA_TYPE_UINT32, 1, &val));
+}
+
+int
+nvlist_add_int64(nvlist_t *nvl, const char *name, int64_t val)
+{
+ return (nvlist_add_common(nvl, name, DATA_TYPE_INT64, 1, &val));
+}
+
+int
+nvlist_add_uint64(nvlist_t *nvl, const char *name, uint64_t val)
+{
+ return (nvlist_add_common(nvl, name, DATA_TYPE_UINT64, 1, &val));
+}
+
+#if !defined(_KERNEL)
+int
+nvlist_add_double(nvlist_t *nvl, const char *name, double val)
+{
+ return (nvlist_add_common(nvl, name, DATA_TYPE_DOUBLE, 1, &val));
+}
+#endif
+
+int
+nvlist_add_string(nvlist_t *nvl, const char *name, const char *val)
+{
+ return (nvlist_add_common(nvl, name, DATA_TYPE_STRING, 1, (void *)val));
+}
+
+int
+nvlist_add_boolean_array(nvlist_t *nvl, const char *name,
+ boolean_t *a, uint_t n)
+{
+ return (nvlist_add_common(nvl, name, DATA_TYPE_BOOLEAN_ARRAY, n, a));
+}
+
+int
+nvlist_add_byte_array(nvlist_t *nvl, const char *name, uchar_t *a, uint_t n)
+{
+ return (nvlist_add_common(nvl, name, DATA_TYPE_BYTE_ARRAY, n, a));
+}
+
+int
+nvlist_add_int8_array(nvlist_t *nvl, const char *name, int8_t *a, uint_t n)
+{
+ return (nvlist_add_common(nvl, name, DATA_TYPE_INT8_ARRAY, n, a));
+}
+
+int
+nvlist_add_uint8_array(nvlist_t *nvl, const char *name, uint8_t *a, uint_t n)
+{
+ return (nvlist_add_common(nvl, name, DATA_TYPE_UINT8_ARRAY, n, a));
+}
+
+int
+nvlist_add_int16_array(nvlist_t *nvl, const char *name, int16_t *a, uint_t n)
+{
+ return (nvlist_add_common(nvl, name, DATA_TYPE_INT16_ARRAY, n, a));
+}
+
+int
+nvlist_add_uint16_array(nvlist_t *nvl, const char *name, uint16_t *a, uint_t n)
+{
+ return (nvlist_add_common(nvl, name, DATA_TYPE_UINT16_ARRAY, n, a));
+}
+
+int
+nvlist_add_int32_array(nvlist_t *nvl, const char *name, int32_t *a, uint_t n)
+{
+ return (nvlist_add_common(nvl, name, DATA_TYPE_INT32_ARRAY, n, a));
+}
+
+int
+nvlist_add_uint32_array(nvlist_t *nvl, const char *name, uint32_t *a, uint_t n)
+{
+ return (nvlist_add_common(nvl, name, DATA_TYPE_UINT32_ARRAY, n, a));
+}
+
+int
+nvlist_add_int64_array(nvlist_t *nvl, const char *name, int64_t *a, uint_t n)
+{
+ return (nvlist_add_common(nvl, name, DATA_TYPE_INT64_ARRAY, n, a));
+}
+
+int
+nvlist_add_uint64_array(nvlist_t *nvl, const char *name, uint64_t *a, uint_t n)
+{
+ return (nvlist_add_common(nvl, name, DATA_TYPE_UINT64_ARRAY, n, a));
+}
+
+int
+nvlist_add_string_array(nvlist_t *nvl, const char *name,
+ char *const *a, uint_t n)
+{
+ return (nvlist_add_common(nvl, name, DATA_TYPE_STRING_ARRAY, n, a));
+}
+
+int
+nvlist_add_hrtime(nvlist_t *nvl, const char *name, hrtime_t val)
+{
+ return (nvlist_add_common(nvl, name, DATA_TYPE_HRTIME, 1, &val));
+}
+
+int
+nvlist_add_nvlist(nvlist_t *nvl, const char *name, nvlist_t *val)
+{
+ return (nvlist_add_common(nvl, name, DATA_TYPE_NVLIST, 1, val));
+}
+
+int
+nvlist_add_nvlist_array(nvlist_t *nvl, const char *name, nvlist_t **a, uint_t n)
+{
+ return (nvlist_add_common(nvl, name, DATA_TYPE_NVLIST_ARRAY, n, a));
+}
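+
+/*
+ * A kernel-context sketch of the typed add wrappers above (illustrative
+ * only; the names "cfg", "guid" and "comment" are hypothetical):
+ */
+#if 0
+	nvlist_t *cfg;
+
+	VERIFY0(nvlist_alloc(&cfg, NV_UNIQUE_NAME, KM_SLEEP));
+	VERIFY0(nvlist_add_uint64(cfg, "guid", 0x1234ULL));
+	VERIFY0(nvlist_add_string(cfg, "comment", "example"));
+	nvlist_free(cfg);
+#endif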
+
+/* reading name-value pairs */
+nvpair_t *
+nvlist_next_nvpair(nvlist_t *nvl, nvpair_t *nvp)
+{
+ nvpriv_t *priv;
+ i_nvp_t *curr;
+
+ if (nvl == NULL ||
+ (priv = (nvpriv_t *)(uintptr_t)nvl->nvl_priv) == NULL)
+ return (NULL);
+
+ curr = NVPAIR2I_NVP(nvp);
+
+ /*
+ * Ensure that nvp is a valid nvpair on this nvlist.
+ * NB: nvp_curr is used only as a hint so that we don't always
+ * have to walk the list to determine if nvp is still on the list.
+ */
+ if (nvp == NULL)
+ curr = priv->nvp_list;
+ else if (priv->nvp_curr == curr || nvlist_contains_nvp(nvl, nvp))
+ curr = curr->nvi_next;
+ else
+ curr = NULL;
+
+ priv->nvp_curr = curr;
+
+ return (curr != NULL ? &curr->nvi_nvp : NULL);
+}
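+
+/*
+ * The canonical iteration pattern over an nvlist (a sketch; "nvl" is
+ * assumed to be a populated list):
+ */
+#if 0
+	nvpair_t *elem = NULL;
+
+	while ((elem = nvlist_next_nvpair(nvl, elem)) != NULL)
+		printf("%s (type %d)\n", nvpair_name(elem),
+		    (int)nvpair_type(elem));
+#endif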
+
+nvpair_t *
+nvlist_prev_nvpair(nvlist_t *nvl, nvpair_t *nvp)
+{
+ nvpriv_t *priv;
+ i_nvp_t *curr;
+
+ if (nvl == NULL ||
+ (priv = (nvpriv_t *)(uintptr_t)nvl->nvl_priv) == NULL)
+ return (NULL);
+
+ curr = NVPAIR2I_NVP(nvp);
+
+ if (nvp == NULL)
+ curr = priv->nvp_last;
+ else if (priv->nvp_curr == curr || nvlist_contains_nvp(nvl, nvp))
+ curr = curr->nvi_prev;
+ else
+ curr = NULL;
+
+ priv->nvp_curr = curr;
+
+ return (curr != NULL ? &curr->nvi_nvp : NULL);
+}
+
+boolean_t
+nvlist_empty(nvlist_t *nvl)
+{
+ nvpriv_t *priv;
+
+ if (nvl == NULL ||
+ (priv = (nvpriv_t *)(uintptr_t)nvl->nvl_priv) == NULL)
+ return (B_TRUE);
+
+ return (priv->nvp_list == NULL);
+}
+
+char *
+nvpair_name(nvpair_t *nvp)
+{
+ return (NVP_NAME(nvp));
+}
+
+data_type_t
+nvpair_type(nvpair_t *nvp)
+{
+ return (NVP_TYPE(nvp));
+}
+
+int
+nvpair_type_is_array(nvpair_t *nvp)
+{
+ data_type_t type = NVP_TYPE(nvp);
+
+ if ((type == DATA_TYPE_BYTE_ARRAY) ||
+ (type == DATA_TYPE_INT8_ARRAY) ||
+ (type == DATA_TYPE_UINT8_ARRAY) ||
+ (type == DATA_TYPE_INT16_ARRAY) ||
+ (type == DATA_TYPE_UINT16_ARRAY) ||
+ (type == DATA_TYPE_INT32_ARRAY) ||
+ (type == DATA_TYPE_UINT32_ARRAY) ||
+ (type == DATA_TYPE_INT64_ARRAY) ||
+ (type == DATA_TYPE_UINT64_ARRAY) ||
+ (type == DATA_TYPE_BOOLEAN_ARRAY) ||
+ (type == DATA_TYPE_STRING_ARRAY) ||
+ (type == DATA_TYPE_NVLIST_ARRAY))
+ return (1);
+	return (0);
+}
+
+static int
+nvpair_value_common(nvpair_t *nvp, data_type_t type, uint_t *nelem, void *data)
+{
+ if (nvp == NULL || nvpair_type(nvp) != type)
+ return (EINVAL);
+
+ /*
+ * For non-array types, we copy the data.
+ * For array types (including string), we set a pointer.
+ */
+ switch (type) {
+ case DATA_TYPE_BOOLEAN:
+ if (nelem != NULL)
+ *nelem = 0;
+ break;
+
+ case DATA_TYPE_BOOLEAN_VALUE:
+ case DATA_TYPE_BYTE:
+ case DATA_TYPE_INT8:
+ case DATA_TYPE_UINT8:
+ case DATA_TYPE_INT16:
+ case DATA_TYPE_UINT16:
+ case DATA_TYPE_INT32:
+ case DATA_TYPE_UINT32:
+ case DATA_TYPE_INT64:
+ case DATA_TYPE_UINT64:
+ case DATA_TYPE_HRTIME:
+#if !defined(_KERNEL)
+ case DATA_TYPE_DOUBLE:
+#endif
+ if (data == NULL)
+ return (EINVAL);
+ bcopy(NVP_VALUE(nvp), data,
+ (size_t)i_get_value_size(type, NULL, 1));
+ if (nelem != NULL)
+ *nelem = 1;
+ break;
+
+ case DATA_TYPE_NVLIST:
+ case DATA_TYPE_STRING:
+ if (data == NULL)
+ return (EINVAL);
+ *(void **)data = (void *)NVP_VALUE(nvp);
+ if (nelem != NULL)
+ *nelem = 1;
+ break;
+
+ case DATA_TYPE_BOOLEAN_ARRAY:
+ case DATA_TYPE_BYTE_ARRAY:
+ case DATA_TYPE_INT8_ARRAY:
+ case DATA_TYPE_UINT8_ARRAY:
+ case DATA_TYPE_INT16_ARRAY:
+ case DATA_TYPE_UINT16_ARRAY:
+ case DATA_TYPE_INT32_ARRAY:
+ case DATA_TYPE_UINT32_ARRAY:
+ case DATA_TYPE_INT64_ARRAY:
+ case DATA_TYPE_UINT64_ARRAY:
+ case DATA_TYPE_STRING_ARRAY:
+ case DATA_TYPE_NVLIST_ARRAY:
+ if (nelem == NULL || data == NULL)
+ return (EINVAL);
+ if ((*nelem = NVP_NELEM(nvp)) != 0)
+ *(void **)data = (void *)NVP_VALUE(nvp);
+ else
+ *(void **)data = NULL;
+ break;
+
+ default:
+ return (ENOTSUP);
+ }
+
+ return (0);
+}
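+
+/*
+ * Note on the contract above: scalar lookups copy the value out, while
+ * string, nvlist, and array lookups return a pointer that aliases the
+ * nvpair's own buffer. Such pointers remain valid only until the pair is
+ * removed or the owning nvlist is freed; callers must duplicate the data
+ * if it needs to outlive the list.
+ */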
+
+static int
+nvlist_lookup_common(nvlist_t *nvl, const char *name, data_type_t type,
+ uint_t *nelem, void *data)
+{
+ if (name == NULL || nvl == NULL || nvl->nvl_priv == 0)
+ return (EINVAL);
+
+ if (!(nvl->nvl_nvflag & (NV_UNIQUE_NAME | NV_UNIQUE_NAME_TYPE)))
+ return (ENOTSUP);
+
+ nvpair_t *nvp = nvt_lookup_name_type(nvl, name, type);
+ if (nvp == NULL)
+ return (ENOENT);
+
+ return (nvpair_value_common(nvp, type, nelem, data));
+}
+
+int
+nvlist_lookup_boolean(nvlist_t *nvl, const char *name)
+{
+ return (nvlist_lookup_common(nvl, name, DATA_TYPE_BOOLEAN, NULL, NULL));
+}
+
+int
+nvlist_lookup_boolean_value(nvlist_t *nvl, const char *name, boolean_t *val)
+{
+ return (nvlist_lookup_common(nvl, name,
+ DATA_TYPE_BOOLEAN_VALUE, NULL, val));
+}
+
+int
+nvlist_lookup_byte(nvlist_t *nvl, const char *name, uchar_t *val)
+{
+ return (nvlist_lookup_common(nvl, name, DATA_TYPE_BYTE, NULL, val));
+}
+
+int
+nvlist_lookup_int8(nvlist_t *nvl, const char *name, int8_t *val)
+{
+ return (nvlist_lookup_common(nvl, name, DATA_TYPE_INT8, NULL, val));
+}
+
+int
+nvlist_lookup_uint8(nvlist_t *nvl, const char *name, uint8_t *val)
+{
+ return (nvlist_lookup_common(nvl, name, DATA_TYPE_UINT8, NULL, val));
+}
+
+int
+nvlist_lookup_int16(nvlist_t *nvl, const char *name, int16_t *val)
+{
+ return (nvlist_lookup_common(nvl, name, DATA_TYPE_INT16, NULL, val));
+}
+
+int
+nvlist_lookup_uint16(nvlist_t *nvl, const char *name, uint16_t *val)
+{
+ return (nvlist_lookup_common(nvl, name, DATA_TYPE_UINT16, NULL, val));
+}
+
+int
+nvlist_lookup_int32(nvlist_t *nvl, const char *name, int32_t *val)
+{
+ return (nvlist_lookup_common(nvl, name, DATA_TYPE_INT32, NULL, val));
+}
+
+int
+nvlist_lookup_uint32(nvlist_t *nvl, const char *name, uint32_t *val)
+{
+ return (nvlist_lookup_common(nvl, name, DATA_TYPE_UINT32, NULL, val));
+}
+
+int
+nvlist_lookup_int64(nvlist_t *nvl, const char *name, int64_t *val)
+{
+ return (nvlist_lookup_common(nvl, name, DATA_TYPE_INT64, NULL, val));
+}
+
+int
+nvlist_lookup_uint64(nvlist_t *nvl, const char *name, uint64_t *val)
+{
+ return (nvlist_lookup_common(nvl, name, DATA_TYPE_UINT64, NULL, val));
+}
+
+#if !defined(_KERNEL)
+int
+nvlist_lookup_double(nvlist_t *nvl, const char *name, double *val)
+{
+ return (nvlist_lookup_common(nvl, name, DATA_TYPE_DOUBLE, NULL, val));
+}
+#endif
+
+int
+nvlist_lookup_string(nvlist_t *nvl, const char *name, char **val)
+{
+ return (nvlist_lookup_common(nvl, name, DATA_TYPE_STRING, NULL, val));
+}
+
+int
+nvlist_lookup_nvlist(nvlist_t *nvl, const char *name, nvlist_t **val)
+{
+ return (nvlist_lookup_common(nvl, name, DATA_TYPE_NVLIST, NULL, val));
+}
+
+int
+nvlist_lookup_boolean_array(nvlist_t *nvl, const char *name,
+ boolean_t **a, uint_t *n)
+{
+ return (nvlist_lookup_common(nvl, name,
+ DATA_TYPE_BOOLEAN_ARRAY, n, a));
+}
+
+int
+nvlist_lookup_byte_array(nvlist_t *nvl, const char *name,
+ uchar_t **a, uint_t *n)
+{
+ return (nvlist_lookup_common(nvl, name, DATA_TYPE_BYTE_ARRAY, n, a));
+}
+
+int
+nvlist_lookup_int8_array(nvlist_t *nvl, const char *name, int8_t **a, uint_t *n)
+{
+ return (nvlist_lookup_common(nvl, name, DATA_TYPE_INT8_ARRAY, n, a));
+}
+
+int
+nvlist_lookup_uint8_array(nvlist_t *nvl, const char *name,
+ uint8_t **a, uint_t *n)
+{
+ return (nvlist_lookup_common(nvl, name, DATA_TYPE_UINT8_ARRAY, n, a));
+}
+
+int
+nvlist_lookup_int16_array(nvlist_t *nvl, const char *name,
+ int16_t **a, uint_t *n)
+{
+ return (nvlist_lookup_common(nvl, name, DATA_TYPE_INT16_ARRAY, n, a));
+}
+
+int
+nvlist_lookup_uint16_array(nvlist_t *nvl, const char *name,
+ uint16_t **a, uint_t *n)
+{
+ return (nvlist_lookup_common(nvl, name, DATA_TYPE_UINT16_ARRAY, n, a));
+}
+
+int
+nvlist_lookup_int32_array(nvlist_t *nvl, const char *name,
+ int32_t **a, uint_t *n)
+{
+ return (nvlist_lookup_common(nvl, name, DATA_TYPE_INT32_ARRAY, n, a));
+}
+
+int
+nvlist_lookup_uint32_array(nvlist_t *nvl, const char *name,
+ uint32_t **a, uint_t *n)
+{
+ return (nvlist_lookup_common(nvl, name, DATA_TYPE_UINT32_ARRAY, n, a));
+}
+
+int
+nvlist_lookup_int64_array(nvlist_t *nvl, const char *name,
+ int64_t **a, uint_t *n)
+{
+ return (nvlist_lookup_common(nvl, name, DATA_TYPE_INT64_ARRAY, n, a));
+}
+
+int
+nvlist_lookup_uint64_array(nvlist_t *nvl, const char *name,
+ uint64_t **a, uint_t *n)
+{
+ return (nvlist_lookup_common(nvl, name, DATA_TYPE_UINT64_ARRAY, n, a));
+}
+
+int
+nvlist_lookup_string_array(nvlist_t *nvl, const char *name,
+ char ***a, uint_t *n)
+{
+ return (nvlist_lookup_common(nvl, name, DATA_TYPE_STRING_ARRAY, n, a));
+}
+
+int
+nvlist_lookup_nvlist_array(nvlist_t *nvl, const char *name,
+ nvlist_t ***a, uint_t *n)
+{
+ return (nvlist_lookup_common(nvl, name, DATA_TYPE_NVLIST_ARRAY, n, a));
+}
+
+int
+nvlist_lookup_hrtime(nvlist_t *nvl, const char *name, hrtime_t *val)
+{
+ return (nvlist_lookup_common(nvl, name, DATA_TYPE_HRTIME, NULL, val));
+}
+
+int
+nvlist_lookup_pairs(nvlist_t *nvl, int flag, ...)
+{
+ va_list ap;
+ char *name;
+ int noentok = (flag & NV_FLAG_NOENTOK ? 1 : 0);
+ int ret = 0;
+
+ va_start(ap, flag);
+ while (ret == 0 && (name = va_arg(ap, char *)) != NULL) {
+ data_type_t type;
+ void *val;
+ uint_t *nelem;
+
+ switch (type = va_arg(ap, data_type_t)) {
+ case DATA_TYPE_BOOLEAN:
+ ret = nvlist_lookup_common(nvl, name, type, NULL, NULL);
+ break;
+
+ case DATA_TYPE_BOOLEAN_VALUE:
+ case DATA_TYPE_BYTE:
+ case DATA_TYPE_INT8:
+ case DATA_TYPE_UINT8:
+ case DATA_TYPE_INT16:
+ case DATA_TYPE_UINT16:
+ case DATA_TYPE_INT32:
+ case DATA_TYPE_UINT32:
+ case DATA_TYPE_INT64:
+ case DATA_TYPE_UINT64:
+ case DATA_TYPE_HRTIME:
+ case DATA_TYPE_STRING:
+ case DATA_TYPE_NVLIST:
+#if !defined(_KERNEL)
+ case DATA_TYPE_DOUBLE:
+#endif
+ val = va_arg(ap, void *);
+ ret = nvlist_lookup_common(nvl, name, type, NULL, val);
+ break;
+
+ case DATA_TYPE_BYTE_ARRAY:
+ case DATA_TYPE_BOOLEAN_ARRAY:
+ case DATA_TYPE_INT8_ARRAY:
+ case DATA_TYPE_UINT8_ARRAY:
+ case DATA_TYPE_INT16_ARRAY:
+ case DATA_TYPE_UINT16_ARRAY:
+ case DATA_TYPE_INT32_ARRAY:
+ case DATA_TYPE_UINT32_ARRAY:
+ case DATA_TYPE_INT64_ARRAY:
+ case DATA_TYPE_UINT64_ARRAY:
+ case DATA_TYPE_STRING_ARRAY:
+ case DATA_TYPE_NVLIST_ARRAY:
+ val = va_arg(ap, void *);
+ nelem = va_arg(ap, uint_t *);
+ ret = nvlist_lookup_common(nvl, name, type, nelem, val);
+ break;
+
+ default:
+ ret = EINVAL;
+ }
+
+ if (ret == ENOENT && noentok)
+ ret = 0;
+ }
+ va_end(ap);
+
+ return (ret);
+}
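+
+/*
+ * A sketch of the varargs lookup above (illustrative names; the argument
+ * list is terminated by a NULL name):
+ */
+#if 0
+	uint64_t guid;
+	char *comment;
+
+	if (nvlist_lookup_pairs(nvl, NV_FLAG_NOENTOK,
+	    "guid", DATA_TYPE_UINT64, &guid,
+	    "comment", DATA_TYPE_STRING, &comment,
+	    NULL) == 0) {
+		/* ... */
+	}
+#endif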
+
+/*
+ * Find the 'name'ed nvpair in the nvlist 'nvl'. If 'name' is found, the
+ * function returns zero and a pointer to the matching nvpair is returned
+ * in '*ret' (given 'ret' is non-NULL). If 'sep' is specified then 'name'
+ * will penetrate multiple levels of embedded nvlists, with 'sep' as the
+ * separator. As an example, if sep is '.', name might look like: "a" or
+ * "a.b" or "a.c[3]" or "a.d[3].e[1]". This matches the C syntax for array
+ * embedding (for convenience, the code also supports "a.d[3]e[1]" syntax).
+ *
+ * If 'ip' is non-NULL and the last name component is an array, return the
+ * value of the "...[index]" array index in *ip. For an array reference that
+ * is not indexed, *ip will be returned as -1. If there is a syntax error in
+ * 'name', and 'ep' is non-NULL then *ep will be set to point to the location
+ * inside the 'name' string where the syntax error was detected.
+ */
+static int
+nvlist_lookup_nvpair_ei_sep(nvlist_t *nvl, const char *name, const char sep,
+ nvpair_t **ret, int *ip, char **ep)
+{
+ nvpair_t *nvp;
+ const char *np;
+ char *sepp;
+ char *idxp, *idxep;
+ nvlist_t **nva;
+ long idx;
+ int n;
+
+ if (ip)
+ *ip = -1; /* not indexed */
+ if (ep)
+ *ep = NULL;
+
+ if ((nvl == NULL) || (name == NULL))
+ return (EINVAL);
+
+ sepp = NULL;
+ idx = 0;
+ /* step through components of name */
+ for (np = name; np && *np; np = sepp) {
+ /* ensure unique names */
+ if (!(nvl->nvl_nvflag & NV_UNIQUE_NAME))
+ return (ENOTSUP);
+
+ /* skip white space */
+ skip_whitespace(np);
+ if (*np == 0)
+ break;
+
+ /* set 'sepp' to end of current component 'np' */
+ if (sep)
+ sepp = strchr(np, sep);
+ else
+ sepp = NULL;
+
+ /* find start of next "[ index ]..." */
+ idxp = strchr(np, '[');
+
+ /* if sepp comes first, set idxp to NULL */
+ if (sepp && idxp && (sepp < idxp))
+ idxp = NULL;
+
+ /*
+ * At this point 'idxp' is set if there is an index
+ * expected for the current component.
+ */
+ if (idxp) {
+ /* set 'n' to length of current 'np' name component */
+ n = idxp++ - np;
+
+ /* keep sepp up to date for *ep use as we advance */
+ skip_whitespace(idxp);
+ sepp = idxp;
+
+ /* determine the index value */
+#if defined(_KERNEL) && !defined(_BOOT)
+ if (ddi_strtol(idxp, &idxep, 0, &idx))
+ goto fail;
+#else
+ idx = strtol(idxp, &idxep, 0);
+#endif
+ if (idxep == idxp)
+ goto fail;
+
+ /* keep sepp up to date for *ep use as we advance */
+ sepp = idxep;
+
+			/* skip whitespace after index value, check for ']' */
+ skip_whitespace(sepp);
+ if (*sepp++ != ']')
+ goto fail;
+
+ /* for embedded arrays, support C syntax: "a[1].b" */
+ skip_whitespace(sepp);
+ if (sep && (*sepp == sep))
+ sepp++;
+ } else if (sepp) {
+ n = sepp++ - np;
+ } else {
+ n = strlen(np);
+ }
+
+ /* trim trailing whitespace by reducing length of 'np' */
+ if (n == 0)
+ goto fail;
+ for (n--; (np[n] == ' ') || (np[n] == '\t'); n--)
+ ;
+ n++;
+
+ /* skip whitespace, and set sepp to NULL if complete */
+ if (sepp) {
+ skip_whitespace(sepp);
+ if (*sepp == 0)
+ sepp = NULL;
+ }
+
+ /*
+ * At this point:
+ * o 'n' is the length of current 'np' component.
+ * o 'idxp' is set if there was an index, and value 'idx'.
+ * o 'sepp' is set to the beginning of the next component,
+ * and set to NULL if we have no more components.
+ *
+ * Search for nvpair with matching component name.
+ */
+ for (nvp = nvlist_next_nvpair(nvl, NULL); nvp != NULL;
+ nvp = nvlist_next_nvpair(nvl, nvp)) {
+
+ /* continue if no match on name */
+ if (strncmp(np, nvpair_name(nvp), n) ||
+ (strlen(nvpair_name(nvp)) != n))
+ continue;
+
+ /* if indexed, verify type is array oriented */
+ if (idxp && !nvpair_type_is_array(nvp))
+ goto fail;
+
+ /*
+ * Full match found, return nvp and idx if this
+ * was the last component.
+ */
+ if (sepp == NULL) {
+ if (ret)
+ *ret = nvp;
+ if (ip && idxp)
+ *ip = (int)idx; /* return index */
+ return (0); /* found */
+ }
+
+ /*
+ * More components: current match must be
+ * of DATA_TYPE_NVLIST or DATA_TYPE_NVLIST_ARRAY
+ * to support going deeper.
+ */
+ if (nvpair_type(nvp) == DATA_TYPE_NVLIST) {
+ nvl = EMBEDDED_NVL(nvp);
+ break;
+ } else if (nvpair_type(nvp) == DATA_TYPE_NVLIST_ARRAY) {
+ (void) nvpair_value_nvlist_array(nvp,
+ &nva, (uint_t *)&n);
+ if ((n < 0) || (idx >= n))
+ goto fail;
+ nvl = nva[idx];
+ break;
+ }
+
+ /* type does not support more levels */
+ goto fail;
+ }
+ if (nvp == NULL)
+ goto fail; /* 'name' not found */
+
+ /* search for match of next component in embedded 'nvl' list */
+ }
+
+fail: if (ep && sepp)
+ *ep = sepp;
+ return (EINVAL);
+}
+
+/*
+ * Return pointer to nvpair with specified 'name'.
+ */
+int
+nvlist_lookup_nvpair(nvlist_t *nvl, const char *name, nvpair_t **ret)
+{
+ return (nvlist_lookup_nvpair_ei_sep(nvl, name, 0, ret, NULL, NULL));
+}
+
+/*
+ * Determine if named nvpair exists in nvlist (use embedded separator of '.'
+ * and return array index). See nvlist_lookup_nvpair_ei_sep for more detailed
+ * description.
+ */
+int
+nvlist_lookup_nvpair_embedded_index(nvlist_t *nvl, const char *name,
+    nvpair_t **ret, int *ip, char **ep)
+{
+ return (nvlist_lookup_nvpair_ei_sep(nvl, name, '.', ret, ip, ep));
+}
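+
+/*
+ * Embedded-name lookup sketch (illustrative; assumes "nvl" was built with
+ * NV_UNIQUE_NAME and holds an nvlist array "d" inside an nvlist "a"):
+ */
+#if 0
+	nvpair_t *nvp;
+	int idx;
+
+	if (nvlist_lookup_nvpair_embedded_index(nvl, "a.d[3]",
+	    &nvp, &idx, NULL) == 0) {
+		/* nvp refers to "d", idx == 3 */
+	}
+#endif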
+
+boolean_t
+nvlist_exists(nvlist_t *nvl, const char *name)
+{
+ nvpriv_t *priv;
+ nvpair_t *nvp;
+ i_nvp_t *curr;
+
+ if (name == NULL || nvl == NULL ||
+ (priv = (nvpriv_t *)(uintptr_t)nvl->nvl_priv) == NULL)
+ return (B_FALSE);
+
+ for (curr = priv->nvp_list; curr != NULL; curr = curr->nvi_next) {
+ nvp = &curr->nvi_nvp;
+
+ if (strcmp(name, NVP_NAME(nvp)) == 0)
+ return (B_TRUE);
+ }
+
+ return (B_FALSE);
+}
+
+int
+nvpair_value_boolean_value(nvpair_t *nvp, boolean_t *val)
+{
+ return (nvpair_value_common(nvp, DATA_TYPE_BOOLEAN_VALUE, NULL, val));
+}
+
+int
+nvpair_value_byte(nvpair_t *nvp, uchar_t *val)
+{
+ return (nvpair_value_common(nvp, DATA_TYPE_BYTE, NULL, val));
+}
+
+int
+nvpair_value_int8(nvpair_t *nvp, int8_t *val)
+{
+ return (nvpair_value_common(nvp, DATA_TYPE_INT8, NULL, val));
+}
+
+int
+nvpair_value_uint8(nvpair_t *nvp, uint8_t *val)
+{
+ return (nvpair_value_common(nvp, DATA_TYPE_UINT8, NULL, val));
+}
+
+int
+nvpair_value_int16(nvpair_t *nvp, int16_t *val)
+{
+ return (nvpair_value_common(nvp, DATA_TYPE_INT16, NULL, val));
+}
+
+int
+nvpair_value_uint16(nvpair_t *nvp, uint16_t *val)
+{
+ return (nvpair_value_common(nvp, DATA_TYPE_UINT16, NULL, val));
+}
+
+int
+nvpair_value_int32(nvpair_t *nvp, int32_t *val)
+{
+ return (nvpair_value_common(nvp, DATA_TYPE_INT32, NULL, val));
+}
+
+int
+nvpair_value_uint32(nvpair_t *nvp, uint32_t *val)
+{
+ return (nvpair_value_common(nvp, DATA_TYPE_UINT32, NULL, val));
+}
+
+int
+nvpair_value_int64(nvpair_t *nvp, int64_t *val)
+{
+ return (nvpair_value_common(nvp, DATA_TYPE_INT64, NULL, val));
+}
+
+int
+nvpair_value_uint64(nvpair_t *nvp, uint64_t *val)
+{
+ return (nvpair_value_common(nvp, DATA_TYPE_UINT64, NULL, val));
+}
+
+#if !defined(_KERNEL)
+int
+nvpair_value_double(nvpair_t *nvp, double *val)
+{
+ return (nvpair_value_common(nvp, DATA_TYPE_DOUBLE, NULL, val));
+}
+#endif
+
+int
+nvpair_value_string(nvpair_t *nvp, char **val)
+{
+ return (nvpair_value_common(nvp, DATA_TYPE_STRING, NULL, val));
+}
+
+int
+nvpair_value_nvlist(nvpair_t *nvp, nvlist_t **val)
+{
+ return (nvpair_value_common(nvp, DATA_TYPE_NVLIST, NULL, val));
+}
+
+int
+nvpair_value_boolean_array(nvpair_t *nvp, boolean_t **val, uint_t *nelem)
+{
+ return (nvpair_value_common(nvp, DATA_TYPE_BOOLEAN_ARRAY, nelem, val));
+}
+
+int
+nvpair_value_byte_array(nvpair_t *nvp, uchar_t **val, uint_t *nelem)
+{
+ return (nvpair_value_common(nvp, DATA_TYPE_BYTE_ARRAY, nelem, val));
+}
+
+int
+nvpair_value_int8_array(nvpair_t *nvp, int8_t **val, uint_t *nelem)
+{
+ return (nvpair_value_common(nvp, DATA_TYPE_INT8_ARRAY, nelem, val));
+}
+
+int
+nvpair_value_uint8_array(nvpair_t *nvp, uint8_t **val, uint_t *nelem)
+{
+ return (nvpair_value_common(nvp, DATA_TYPE_UINT8_ARRAY, nelem, val));
+}
+
+int
+nvpair_value_int16_array(nvpair_t *nvp, int16_t **val, uint_t *nelem)
+{
+ return (nvpair_value_common(nvp, DATA_TYPE_INT16_ARRAY, nelem, val));
+}
+
+int
+nvpair_value_uint16_array(nvpair_t *nvp, uint16_t **val, uint_t *nelem)
+{
+ return (nvpair_value_common(nvp, DATA_TYPE_UINT16_ARRAY, nelem, val));
+}
+
+int
+nvpair_value_int32_array(nvpair_t *nvp, int32_t **val, uint_t *nelem)
+{
+ return (nvpair_value_common(nvp, DATA_TYPE_INT32_ARRAY, nelem, val));
+}
+
+int
+nvpair_value_uint32_array(nvpair_t *nvp, uint32_t **val, uint_t *nelem)
+{
+ return (nvpair_value_common(nvp, DATA_TYPE_UINT32_ARRAY, nelem, val));
+}
+
+int
+nvpair_value_int64_array(nvpair_t *nvp, int64_t **val, uint_t *nelem)
+{
+ return (nvpair_value_common(nvp, DATA_TYPE_INT64_ARRAY, nelem, val));
+}
+
+int
+nvpair_value_uint64_array(nvpair_t *nvp, uint64_t **val, uint_t *nelem)
+{
+ return (nvpair_value_common(nvp, DATA_TYPE_UINT64_ARRAY, nelem, val));
+}
+
+int
+nvpair_value_string_array(nvpair_t *nvp, char ***val, uint_t *nelem)
+{
+ return (nvpair_value_common(nvp, DATA_TYPE_STRING_ARRAY, nelem, val));
+}
+
+int
+nvpair_value_nvlist_array(nvpair_t *nvp, nvlist_t ***val, uint_t *nelem)
+{
+ return (nvpair_value_common(nvp, DATA_TYPE_NVLIST_ARRAY, nelem, val));
+}
+
+int
+nvpair_value_hrtime(nvpair_t *nvp, hrtime_t *val)
+{
+ return (nvpair_value_common(nvp, DATA_TYPE_HRTIME, NULL, val));
+}
+
+/*
+ * Add specified pair to the list.
+ */
+int
+nvlist_add_nvpair(nvlist_t *nvl, nvpair_t *nvp)
+{
+ if (nvl == NULL || nvp == NULL)
+ return (EINVAL);
+
+ return (nvlist_add_common(nvl, NVP_NAME(nvp), NVP_TYPE(nvp),
+ NVP_NELEM(nvp), NVP_VALUE(nvp)));
+}
+
+/*
+ * Merge the supplied nvlists and put the result in dst.
+ * The merged list will contain all names specified in both lists,
+ * the values are taken from nvl in the case of duplicates.
+ * Return 0 on success.
+ */
+/*ARGSUSED*/
+int
+nvlist_merge(nvlist_t *dst, nvlist_t *nvl, int flag)
+{
+ if (nvl == NULL || dst == NULL)
+ return (EINVAL);
+
+ if (dst != nvl)
+ return (nvlist_copy_pairs(nvl, dst));
+
+ return (0);
+}
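+
+/*
+ * Worked example (illustrative, assuming dst was allocated with
+ * NV_UNIQUE_NAME): merging nvl = { a=2, b=3 } into dst = { a=1 } leaves
+ * dst = { a=2, b=3 }, since each pair copied from nvl replaces any
+ * same-named pair already in dst via the unique-name handling in
+ * nvlist_add_common().
+ */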
+
+/*
+ * Encoding related routines
+ */
+#define NVS_OP_ENCODE 0
+#define NVS_OP_DECODE 1
+#define NVS_OP_GETSIZE 2
+
+typedef struct nvs_ops nvs_ops_t;
+
+typedef struct {
+ int nvs_op;
+ const nvs_ops_t *nvs_ops;
+ void *nvs_private;
+ nvpriv_t *nvs_priv;
+ int nvs_recursion;
+} nvstream_t;
+
+/*
+ * nvs operations are:
+ * - nvs_nvlist
+ * encoding / decoding of an nvlist header (nvlist_t)
+ * calculates the size used for header and end detection
+ *
+ * - nvs_nvpair
+ * responsible for the first part of encoding / decoding of an nvpair
+ * calculates the decoded size of an nvpair
+ *
+ * - nvs_nvp_op
+ * second part of encoding / decoding of an nvpair
+ *
+ * - nvs_nvp_size
+ * calculates the encoding size of an nvpair
+ *
+ * - nvs_nvl_fini
+ * encodes the end detection mark (zeros).
+ */
+struct nvs_ops {
+ int (*nvs_nvlist)(nvstream_t *, nvlist_t *, size_t *);
+ int (*nvs_nvpair)(nvstream_t *, nvpair_t *, size_t *);
+ int (*nvs_nvp_op)(nvstream_t *, nvpair_t *);
+ int (*nvs_nvp_size)(nvstream_t *, nvpair_t *, size_t *);
+ int (*nvs_nvl_fini)(nvstream_t *);
+};
+
+typedef struct {
+ char nvh_encoding; /* nvs encoding method */
+ char nvh_endian; /* nvs endian */
+ char nvh_reserved1; /* reserved for future use */
+ char nvh_reserved2; /* reserved for future use */
+} nvs_header_t;
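+
+/*
+ * Concretely (illustrative): a buffer packed with NV_ENCODE_XDR on a
+ * little-endian host begins with the four header bytes
+ * { 0x01, 0x01, 0x00, 0x00 } -- encoding, endianness, and two reserved
+ * zeros -- which nvlist_common() reads back on decode.
+ */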
+
+static int
+nvs_encode_pairs(nvstream_t *nvs, nvlist_t *nvl)
+{
+ nvpriv_t *priv = (nvpriv_t *)(uintptr_t)nvl->nvl_priv;
+ i_nvp_t *curr;
+
+ /*
+ * Walk nvpair in list and encode each nvpair
+ */
+ for (curr = priv->nvp_list; curr != NULL; curr = curr->nvi_next)
+ if (nvs->nvs_ops->nvs_nvpair(nvs, &curr->nvi_nvp, NULL) != 0)
+ return (EFAULT);
+
+ return (nvs->nvs_ops->nvs_nvl_fini(nvs));
+}
+
+static int
+nvs_decode_pairs(nvstream_t *nvs, nvlist_t *nvl)
+{
+ nvpair_t *nvp;
+ size_t nvsize;
+ int err;
+
+ /*
+ * Get decoded size of next pair in stream, alloc
+ * memory for nvpair_t, then decode the nvpair
+ */
+ while ((err = nvs->nvs_ops->nvs_nvpair(nvs, NULL, &nvsize)) == 0) {
+ if (nvsize == 0) /* end of list */
+ break;
+
+ /* make sure len makes sense */
+ if (nvsize < NVP_SIZE_CALC(1, 0))
+ return (EFAULT);
+
+ if ((nvp = nvp_buf_alloc(nvl, nvsize)) == NULL)
+ return (ENOMEM);
+
+ if ((err = nvs->nvs_ops->nvs_nvp_op(nvs, nvp)) != 0) {
+ nvp_buf_free(nvl, nvp);
+ return (err);
+ }
+
+ if (i_validate_nvpair(nvp) != 0) {
+ nvpair_free(nvp);
+ nvp_buf_free(nvl, nvp);
+ return (EFAULT);
+ }
+
+ err = nvt_add_nvpair(nvl, nvp);
+ if (err != 0) {
+ nvpair_free(nvp);
+ nvp_buf_free(nvl, nvp);
+ return (err);
+ }
+ nvp_buf_link(nvl, nvp);
+ }
+ return (err);
+}
+
+static int
+nvs_getsize_pairs(nvstream_t *nvs, nvlist_t *nvl, size_t *buflen)
+{
+ nvpriv_t *priv = (nvpriv_t *)(uintptr_t)nvl->nvl_priv;
+ i_nvp_t *curr;
+ uint64_t nvsize = *buflen;
+ size_t size;
+
+ /*
+ * Get encoded size of nvpairs in nvlist
+ */
+ for (curr = priv->nvp_list; curr != NULL; curr = curr->nvi_next) {
+ if (nvs->nvs_ops->nvs_nvp_size(nvs, &curr->nvi_nvp, &size) != 0)
+ return (EINVAL);
+
+ if ((nvsize += size) > INT32_MAX)
+ return (EINVAL);
+ }
+
+ *buflen = nvsize;
+ return (0);
+}
+
+static int
+nvs_operation(nvstream_t *nvs, nvlist_t *nvl, size_t *buflen)
+{
+ int err;
+
+ if (nvl->nvl_priv == 0)
+ return (EFAULT);
+
+ /*
+ * Perform the operation, starting with header, then each nvpair
+ */
+ if ((err = nvs->nvs_ops->nvs_nvlist(nvs, nvl, buflen)) != 0)
+ return (err);
+
+ switch (nvs->nvs_op) {
+ case NVS_OP_ENCODE:
+ err = nvs_encode_pairs(nvs, nvl);
+ break;
+
+ case NVS_OP_DECODE:
+ err = nvs_decode_pairs(nvs, nvl);
+ break;
+
+ case NVS_OP_GETSIZE:
+ err = nvs_getsize_pairs(nvs, nvl, buflen);
+ break;
+
+ default:
+ err = EINVAL;
+ }
+
+ return (err);
+}
+
+static int
+nvs_embedded(nvstream_t *nvs, nvlist_t *embedded)
+{
+ switch (nvs->nvs_op) {
+ case NVS_OP_ENCODE: {
+ int err;
+
+ if (nvs->nvs_recursion >= nvpair_max_recursion)
+ return (EINVAL);
+ nvs->nvs_recursion++;
+ err = nvs_operation(nvs, embedded, NULL);
+ nvs->nvs_recursion--;
+ return (err);
+ }
+ case NVS_OP_DECODE: {
+ nvpriv_t *priv;
+ int err;
+
+ if (embedded->nvl_version != NV_VERSION)
+ return (ENOTSUP);
+
+ if ((priv = nv_priv_alloc_embedded(nvs->nvs_priv)) == NULL)
+ return (ENOMEM);
+
+ nvlist_init(embedded, embedded->nvl_nvflag, priv);
+
+ if (nvs->nvs_recursion >= nvpair_max_recursion) {
+ nvlist_free(embedded);
+ return (EINVAL);
+ }
+ nvs->nvs_recursion++;
+ if ((err = nvs_operation(nvs, embedded, NULL)) != 0)
+ nvlist_free(embedded);
+ nvs->nvs_recursion--;
+ return (err);
+ }
+ default:
+ break;
+ }
+
+ return (EINVAL);
+}
+
+static int
+nvs_embedded_nvl_array(nvstream_t *nvs, nvpair_t *nvp, size_t *size)
+{
+ size_t nelem = NVP_NELEM(nvp);
+ nvlist_t **nvlp = EMBEDDED_NVL_ARRAY(nvp);
+ int i;
+
+ switch (nvs->nvs_op) {
+ case NVS_OP_ENCODE:
+ for (i = 0; i < nelem; i++)
+ if (nvs_embedded(nvs, nvlp[i]) != 0)
+ return (EFAULT);
+ break;
+
+ case NVS_OP_DECODE: {
+ size_t len = nelem * sizeof (uint64_t);
+ nvlist_t *embedded = (nvlist_t *)((uintptr_t)nvlp + len);
+
+ bzero(nvlp, len); /* don't trust packed data */
+ for (i = 0; i < nelem; i++) {
+ if (nvs_embedded(nvs, embedded) != 0) {
+ nvpair_free(nvp);
+ return (EFAULT);
+ }
+
+ nvlp[i] = embedded++;
+ }
+ break;
+ }
+ case NVS_OP_GETSIZE: {
+ uint64_t nvsize = 0;
+
+ for (i = 0; i < nelem; i++) {
+ size_t nvp_sz = 0;
+
+ if (nvs_operation(nvs, nvlp[i], &nvp_sz) != 0)
+ return (EINVAL);
+
+ if ((nvsize += nvp_sz) > INT32_MAX)
+ return (EINVAL);
+ }
+
+ *size = nvsize;
+ break;
+ }
+ default:
+ return (EINVAL);
+ }
+
+ return (0);
+}
+
+static int nvs_native(nvstream_t *, nvlist_t *, char *, size_t *);
+static int nvs_xdr(nvstream_t *, nvlist_t *, char *, size_t *);
+
+/*
+ * Common routine for nvlist operations:
+ * encode, decode, getsize (encoded size).
+ */
+static int
+nvlist_common(nvlist_t *nvl, char *buf, size_t *buflen, int encoding,
+ int nvs_op)
+{
+ int err = 0;
+ nvstream_t nvs;
+ int nvl_endian;
+#if BYTE_ORDER == _LITTLE_ENDIAN
+ int host_endian = 1;
+#else
+ int host_endian = 0;
+#endif /* _LITTLE_ENDIAN */
+ nvs_header_t *nvh = (void *)buf;
+
+ if (buflen == NULL || nvl == NULL ||
+ (nvs.nvs_priv = (nvpriv_t *)(uintptr_t)nvl->nvl_priv) == NULL)
+ return (EINVAL);
+
+ nvs.nvs_op = nvs_op;
+ nvs.nvs_recursion = 0;
+
+ /*
+ * For NVS_OP_ENCODE and NVS_OP_DECODE make sure an nvlist and
+	 * a buffer are allocated. The first 4 bytes in the buffer are
+ * used for encoding method and host endian.
+ */
+ switch (nvs_op) {
+ case NVS_OP_ENCODE:
+ if (buf == NULL || *buflen < sizeof (nvs_header_t))
+ return (EINVAL);
+
+ nvh->nvh_encoding = encoding;
+ nvh->nvh_endian = nvl_endian = host_endian;
+ nvh->nvh_reserved1 = 0;
+ nvh->nvh_reserved2 = 0;
+ break;
+
+ case NVS_OP_DECODE:
+ if (buf == NULL || *buflen < sizeof (nvs_header_t))
+ return (EINVAL);
+
+ /* get method of encoding from first byte */
+ encoding = nvh->nvh_encoding;
+ nvl_endian = nvh->nvh_endian;
+ break;
+
+ case NVS_OP_GETSIZE:
+ nvl_endian = host_endian;
+
+ /*
+ * add the size for encoding
+ */
+ *buflen = sizeof (nvs_header_t);
+ break;
+
+ default:
+ return (ENOTSUP);
+ }
+
+ /*
+ * Create an nvstream with proper encoding method
+ */
+ switch (encoding) {
+ case NV_ENCODE_NATIVE:
+ /*
+ * check endianness, in case we are unpacking
+ * from a file
+ */
+ if (nvl_endian != host_endian)
+ return (ENOTSUP);
+ err = nvs_native(&nvs, nvl, buf, buflen);
+ break;
+ case NV_ENCODE_XDR:
+ err = nvs_xdr(&nvs, nvl, buf, buflen);
+ break;
+ default:
+ err = ENOTSUP;
+ break;
+ }
+
+ return (err);
+}
+
+int
+nvlist_size(nvlist_t *nvl, size_t *size, int encoding)
+{
+ return (nvlist_common(nvl, NULL, size, encoding, NVS_OP_GETSIZE));
+}
+
+/*
+ * Pack nvlist into contiguous memory
+ */
+/*ARGSUSED1*/
+int
+nvlist_pack(nvlist_t *nvl, char **bufp, size_t *buflen, int encoding,
+ int kmflag)
+{
+#if defined(_KERNEL) && !defined(_BOOT)
+ return (nvlist_xpack(nvl, bufp, buflen, encoding,
+ (kmflag == KM_SLEEP ? nv_alloc_sleep : nv_alloc_nosleep)));
+#else
+ return (nvlist_xpack(nvl, bufp, buflen, encoding, nv_alloc_nosleep));
+#endif
+}
+
+int
+nvlist_xpack(nvlist_t *nvl, char **bufp, size_t *buflen, int encoding,
+ nv_alloc_t *nva)
+{
+ nvpriv_t nvpriv;
+ size_t alloc_size;
+ char *buf;
+ int err;
+
+ if (nva == NULL || nvl == NULL || bufp == NULL || buflen == NULL)
+ return (EINVAL);
+
+ if (*bufp != NULL)
+ return (nvlist_common(nvl, *bufp, buflen, encoding,
+ NVS_OP_ENCODE));
+
+ /*
+ * Here is a difficult situation:
+ * 1. The nvlist has fixed allocator properties.
+ * All other nvlist routines (like nvlist_add_*, ...) use
+ * these properties.
+ * 2. When using nvlist_pack() the user can specify their own
+ * allocator properties (e.g. by using KM_NOSLEEP).
+ *
+	 * We use the user-specified properties (2). A cleaner solution
+	 * would be to remove the kmflag from nvlist_pack(), but we will
+ * not change the interface.
+ */
+ nv_priv_init(&nvpriv, nva, 0);
+
+ if ((err = nvlist_size(nvl, &alloc_size, encoding)))
+ return (err);
+
+ if ((buf = nv_mem_zalloc(&nvpriv, alloc_size)) == NULL)
+ return (ENOMEM);
+
+ if ((err = nvlist_common(nvl, buf, &alloc_size, encoding,
+ NVS_OP_ENCODE)) != 0) {
+ nv_mem_free(&nvpriv, buf, alloc_size);
+ } else {
+ *buflen = alloc_size;
+ *bufp = buf;
+ }
+
+ return (err);
+}
+
+/*
+ * Unpack buf into an nvlist_t
+ */
+/*ARGSUSED1*/
+int
+nvlist_unpack(char *buf, size_t buflen, nvlist_t **nvlp, int kmflag)
+{
+#if defined(_KERNEL) && !defined(_BOOT)
+ return (nvlist_xunpack(buf, buflen, nvlp,
+ (kmflag == KM_SLEEP ? nv_alloc_sleep : nv_alloc_nosleep)));
+#else
+ return (nvlist_xunpack(buf, buflen, nvlp, nv_alloc_nosleep));
+#endif
+}
+
+int
+nvlist_xunpack(char *buf, size_t buflen, nvlist_t **nvlp, nv_alloc_t *nva)
+{
+ nvlist_t *nvl;
+ int err;
+
+ if (nvlp == NULL)
+ return (EINVAL);
+
+ if ((err = nvlist_xalloc(&nvl, 0, nva)) != 0)
+ return (err);
+
+ if ((err = nvlist_common(nvl, buf, &buflen, 0, NVS_OP_DECODE)) != 0)
+ nvlist_free(nvl);
+ else
+ *nvlp = nvl;
+
+ return (err);
+}
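+
+/*
+ * Userland pack/unpack round-trip sketch (illustrative only; with a NULL
+ * *bufp, nvlist_pack() allocates the buffer, which the caller frees):
+ */
+#if 0
+	char *buf = NULL;
+	size_t buflen = 0;
+	nvlist_t *copy;
+
+	if (nvlist_pack(nvl, &buf, &buflen, NV_ENCODE_XDR, 0) == 0 &&
+	    nvlist_unpack(buf, buflen, &copy, 0) == 0)
+		nvlist_free(copy);
+	free(buf);
+#endif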
+
+/*
+ * Native encoding functions
+ */
+typedef struct {
+ /*
+ * This structure is used when decoding a packed nvpair in
+ * the native format. n_base points to a buffer containing the
+ * packed nvpair. n_end is a pointer to the end of the buffer.
+ * (n_end actually points to the first byte past the end of the
+ * buffer.) n_curr is a pointer that lies between n_base and n_end.
+ * It points to the current data that we are decoding.
+ * The amount of data left in the buffer is equal to n_end - n_curr.
+ * n_flag is used to recognize a packed embedded list.
+ */
+ caddr_t n_base;
+ caddr_t n_end;
+ caddr_t n_curr;
+ uint_t n_flag;
+} nvs_native_t;
+
+static int
+nvs_native_create(nvstream_t *nvs, nvs_native_t *native, char *buf,
+ size_t buflen)
+{
+ switch (nvs->nvs_op) {
+ case NVS_OP_ENCODE:
+ case NVS_OP_DECODE:
+ nvs->nvs_private = native;
+ native->n_curr = native->n_base = buf;
+ native->n_end = buf + buflen;
+ native->n_flag = 0;
+ return (0);
+
+ case NVS_OP_GETSIZE:
+ nvs->nvs_private = native;
+ native->n_curr = native->n_base = native->n_end = NULL;
+ native->n_flag = 0;
+ return (0);
+ default:
+ return (EINVAL);
+ }
+}
+
+/*ARGSUSED*/
+static void
+nvs_native_destroy(nvstream_t *nvs)
+{
+}
+
+static int
+native_cp(nvstream_t *nvs, void *buf, size_t size)
+{
+ nvs_native_t *native = (nvs_native_t *)nvs->nvs_private;
+
+ if (native->n_curr + size > native->n_end)
+ return (EFAULT);
+
+ /*
+	 * The bcopy() below eliminates the alignment requirement
+ * on the buffer (stream) and is preferred over direct access.
+ */
+ switch (nvs->nvs_op) {
+ case NVS_OP_ENCODE:
+ bcopy(buf, native->n_curr, size);
+ break;
+ case NVS_OP_DECODE:
+ bcopy(native->n_curr, buf, size);
+ break;
+ default:
+ return (EINVAL);
+ }
+
+ native->n_curr += size;
+ return (0);
+}
+
+/*
+ * operate on nvlist_t header
+ */
+static int
+nvs_native_nvlist(nvstream_t *nvs, nvlist_t *nvl, size_t *size)
+{
+ nvs_native_t *native = nvs->nvs_private;
+
+ switch (nvs->nvs_op) {
+ case NVS_OP_ENCODE:
+ case NVS_OP_DECODE:
+ if (native->n_flag)
+ return (0); /* packed embedded list */
+
+ native->n_flag = 1;
+
+ /* copy version and nvflag of the nvlist_t */
+ if (native_cp(nvs, &nvl->nvl_version, sizeof (int32_t)) != 0 ||
+ native_cp(nvs, &nvl->nvl_nvflag, sizeof (int32_t)) != 0)
+ return (EFAULT);
+
+ return (0);
+
+ case NVS_OP_GETSIZE:
+ /*
+		 * For a packed embedded list, add 4 bytes for the
+		 * end-of-list marker; otherwise add 2 * sizeof (int32_t)
+		 * for nvl_version and nvl_nvflag plus 4 bytes for the
+		 * end of the entire list.
+ */
+ if (native->n_flag) {
+ *size += 4;
+ } else {
+ native->n_flag = 1;
+ *size += 2 * sizeof (int32_t) + 4;
+ }
+
+ return (0);
+
+ default:
+ return (EINVAL);
+ }
+}
+
+static int
+nvs_native_nvl_fini(nvstream_t *nvs)
+{
+ if (nvs->nvs_op == NVS_OP_ENCODE) {
+ nvs_native_t *native = (nvs_native_t *)nvs->nvs_private;
+ /*
+ * Add 4 zero bytes at end of nvlist. They are used
+ * for end detection by the decode routine.
+ */
+ if (native->n_curr + sizeof (int) > native->n_end)
+ return (EFAULT);
+
+ bzero(native->n_curr, sizeof (int));
+ native->n_curr += sizeof (int);
+ }
+
+ return (0);
+}
+
+static int
+nvpair_native_embedded(nvstream_t *nvs, nvpair_t *nvp)
+{
+ if (nvs->nvs_op == NVS_OP_ENCODE) {
+ nvs_native_t *native = (nvs_native_t *)nvs->nvs_private;
+ char *packed = (void *)
+ (native->n_curr - nvp->nvp_size + NVP_VALOFF(nvp));
+ /*
+ * Null out the pointer that is meaningless in the packed
+ * structure. The address may not be aligned, so we have
+ * to use bzero.
+ */
+ bzero(packed + offsetof(nvlist_t, nvl_priv),
+ sizeof(((nvlist_t *)NULL)->nvl_priv));
+ }
+
+ return (nvs_embedded(nvs, EMBEDDED_NVL(nvp)));
+}
+
+static int
+nvpair_native_embedded_array(nvstream_t *nvs, nvpair_t *nvp)
+{
+ if (nvs->nvs_op == NVS_OP_ENCODE) {
+ nvs_native_t *native = (nvs_native_t *)nvs->nvs_private;
+ char *value = native->n_curr - nvp->nvp_size + NVP_VALOFF(nvp);
+ size_t len = NVP_NELEM(nvp) * sizeof (uint64_t);
+ int i;
+ /*
+ * Null out pointers that are meaningless in the packed
+ * structure. The addresses may not be aligned, so we have
+ * to use bzero.
+ */
+ bzero(value, len);
+
+ value += len;
+ for (i = 0; i < NVP_NELEM(nvp); i++) {
+ /*
+ * Null out the pointer that is meaningless in the
+ * packed structure. The address may not be aligned,
+ * so we have to use bzero.
+ */
+ bzero(value + offsetof(nvlist_t, nvl_priv),
+ sizeof(((nvlist_t *)NULL)->nvl_priv));
+ value += sizeof(nvlist_t);
+ }
+ }
+
+ return (nvs_embedded_nvl_array(nvs, nvp, NULL));
+}
+
+static void
+nvpair_native_string_array(nvstream_t *nvs, nvpair_t *nvp)
+{
+ switch (nvs->nvs_op) {
+ case NVS_OP_ENCODE: {
+ nvs_native_t *native = (nvs_native_t *)nvs->nvs_private;
+ uint64_t *strp = (void *)
+ (native->n_curr - nvp->nvp_size + NVP_VALOFF(nvp));
+ /*
+ * Null out pointers that are meaningless in the packed
+ * structure. The addresses may not be aligned, so we have
+ * to use bzero.
+ */
+ bzero(strp, NVP_NELEM(nvp) * sizeof (uint64_t));
+ break;
+ }
+ case NVS_OP_DECODE: {
+ char **strp = (void *)NVP_VALUE(nvp);
+ char *buf = ((char *)strp + NVP_NELEM(nvp) * sizeof (uint64_t));
+ int i;
+
+ for (i = 0; i < NVP_NELEM(nvp); i++) {
+ strp[i] = buf;
+ buf += strlen(buf) + 1;
+ }
+ break;
+ }
+ }
+}
+
+static int
+nvs_native_nvp_op(nvstream_t *nvs, nvpair_t *nvp)
+{
+ data_type_t type;
+ int value_sz;
+ int ret = 0;
+
+ /*
+ * We do the initial bcopy of the data before we look at
+ * the nvpair type, because when we're decoding, we won't
+ * have the correct values for the pair until we do the bcopy.
+ */
+ switch (nvs->nvs_op) {
+ case NVS_OP_ENCODE:
+ case NVS_OP_DECODE:
+ if (native_cp(nvs, nvp, nvp->nvp_size) != 0)
+ return (EFAULT);
+ break;
+ default:
+ return (EINVAL);
+ }
+
+ /* verify nvp_name_sz, check the name string length */
+ if (i_validate_nvpair_name(nvp) != 0)
+ return (EFAULT);
+
+ type = NVP_TYPE(nvp);
+
+ /*
+ * Verify type and nelem and get the value size.
+ * For the data types DATA_TYPE_STRING and DATA_TYPE_STRING_ARRAY,
+ * the size of the string(s) is excluded.
+ */
+ if ((value_sz = i_get_value_size(type, NULL, NVP_NELEM(nvp))) < 0)
+ return (EFAULT);
+
+ if (NVP_SIZE_CALC(nvp->nvp_name_sz, value_sz) > nvp->nvp_size)
+ return (EFAULT);
+
+ switch (type) {
+ case DATA_TYPE_NVLIST:
+ ret = nvpair_native_embedded(nvs, nvp);
+ break;
+ case DATA_TYPE_NVLIST_ARRAY:
+ ret = nvpair_native_embedded_array(nvs, nvp);
+ break;
+ case DATA_TYPE_STRING_ARRAY:
+ nvpair_native_string_array(nvs, nvp);
+ break;
+ default:
+ break;
+ }
+
+ return (ret);
+}
+
+static int
+nvs_native_nvp_size(nvstream_t *nvs, nvpair_t *nvp, size_t *size)
+{
+ uint64_t nvp_sz = nvp->nvp_size;
+
+ switch (NVP_TYPE(nvp)) {
+ case DATA_TYPE_NVLIST: {
+ size_t nvsize = 0;
+
+ if (nvs_operation(nvs, EMBEDDED_NVL(nvp), &nvsize) != 0)
+ return (EINVAL);
+
+ nvp_sz += nvsize;
+ break;
+ }
+ case DATA_TYPE_NVLIST_ARRAY: {
+ size_t nvsize;
+
+ if (nvs_embedded_nvl_array(nvs, nvp, &nvsize) != 0)
+ return (EINVAL);
+
+ nvp_sz += nvsize;
+ break;
+ }
+ default:
+ break;
+ }
+
+ if (nvp_sz > INT32_MAX)
+ return (EINVAL);
+
+ *size = nvp_sz;
+
+ return (0);
+}
+
+static int
+nvs_native_nvpair(nvstream_t *nvs, nvpair_t *nvp, size_t *size)
+{
+ switch (nvs->nvs_op) {
+ case NVS_OP_ENCODE:
+ return (nvs_native_nvp_op(nvs, nvp));
+
+ case NVS_OP_DECODE: {
+ nvs_native_t *native = (nvs_native_t *)nvs->nvs_private;
+ int32_t decode_len;
+
+ /* try to read the size value from the stream */
+ if (native->n_curr + sizeof (int32_t) > native->n_end)
+ return (EFAULT);
+ bcopy(native->n_curr, &decode_len, sizeof (int32_t));
+
+ /* sanity check the size value */
+ if (decode_len < 0 ||
+ decode_len > native->n_end - native->n_curr)
+ return (EFAULT);
+
+ *size = decode_len;
+
+ /*
+ * If at the end of the stream then move the cursor
+ * forward, otherwise nvs_native_nvp_op() will read
+ * the entire nvpair at the same cursor position.
+ */
+ if (*size == 0)
+ native->n_curr += sizeof (int32_t);
+ break;
+ }
+
+ default:
+ return (EINVAL);
+ }
+
+ return (0);
+}
+
+static const nvs_ops_t nvs_native_ops = {
+ nvs_native_nvlist,
+ nvs_native_nvpair,
+ nvs_native_nvp_op,
+ nvs_native_nvp_size,
+ nvs_native_nvl_fini
+};
+
+static int
+nvs_native(nvstream_t *nvs, nvlist_t *nvl, char *buf, size_t *buflen)
+{
+ nvs_native_t native;
+ int err;
+
+ nvs->nvs_ops = &nvs_native_ops;
+
+ if ((err = nvs_native_create(nvs, &native, buf + sizeof (nvs_header_t),
+ *buflen - sizeof (nvs_header_t))) != 0)
+ return (err);
+
+ err = nvs_operation(nvs, nvl, buflen);
+
+ nvs_native_destroy(nvs);
+
+ return (err);
+}
+
+/*
+ * XDR encoding functions
+ *
+ * An xdr packed nvlist is encoded as:
+ *
+ * - encoding method and host endianness (4 bytes)
+ * - nvl_version (4 bytes)
+ * - nvl_nvflag (4 bytes)
+ *
+ * - encoded nvpairs, where the format of one xdr encoded nvpair is:
+ *	- encoded size of the nvpair (4 bytes)
+ *	- decoded size of the nvpair (4 bytes)
+ *	- name string (4 + NV_ALIGN4(strlen(string)) bytes;
+ *	  a string is coded as its size (4 bytes) followed by the data)
+ *	- data type (4 bytes)
+ *	- number of elements in the nvpair (4 bytes)
+ *	- data
+ *
+ * - two zeros marking the end of the entire list (8 bytes)
+ */
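+
+/*
+ * As a worked example (illustrative, not normative): a DATA_TYPE_UINT32
+ * nvpair named "x" with value 7 would be encoded as
+ *
+ *	encode_size	28			(4 bytes)
+ *	decode_size	the in-memory nvp_size	(4 bytes)
+ *	name		1, "x" + 3 pad bytes	(4 + 4 bytes)
+ *	type		DATA_TYPE_UINT32	(4 bytes)
+ *	nelem		1			(4 bytes)
+ *	data		7			(4 bytes)
+ *
+ * for a total of 28 bytes, matching the sum computed by
+ * nvs_xdr_nvp_size() below.
+ */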
+static int
+nvs_xdr_create(nvstream_t *nvs, XDR *xdr, char *buf, size_t buflen)
+{
+ /* xdr data must be 4 byte aligned */
+ if ((ulong_t)buf % 4 != 0)
+ return (EFAULT);
+
+ switch (nvs->nvs_op) {
+ case NVS_OP_ENCODE:
+ xdrmem_create(xdr, buf, (uint_t)buflen, XDR_ENCODE);
+ nvs->nvs_private = xdr;
+ return (0);
+ case NVS_OP_DECODE:
+ xdrmem_create(xdr, buf, (uint_t)buflen, XDR_DECODE);
+ nvs->nvs_private = xdr;
+ return (0);
+ case NVS_OP_GETSIZE:
+ nvs->nvs_private = NULL;
+ return (0);
+ default:
+ return (EINVAL);
+ }
+}
+
+static void
+nvs_xdr_destroy(nvstream_t *nvs)
+{
+ switch (nvs->nvs_op) {
+ case NVS_OP_ENCODE:
+ case NVS_OP_DECODE:
+ xdr_destroy((XDR *)nvs->nvs_private);
+ break;
+ default:
+ break;
+ }
+}
+
+static int
+nvs_xdr_nvlist(nvstream_t *nvs, nvlist_t *nvl, size_t *size)
+{
+ switch (nvs->nvs_op) {
+ case NVS_OP_ENCODE:
+ case NVS_OP_DECODE: {
+ XDR *xdr = nvs->nvs_private;
+
+ if (!xdr_int(xdr, &nvl->nvl_version) ||
+ !xdr_u_int(xdr, &nvl->nvl_nvflag))
+ return (EFAULT);
+ break;
+ }
+ case NVS_OP_GETSIZE: {
+ /*
+ * 2 * 4 for nvl_version + nvl_nvflag
+ * and 8 for end of the entire list
+ */
+ *size += 2 * 4 + 8;
+ break;
+ }
+ default:
+ return (EINVAL);
+ }
+ return (0);
+}
+
+static int
+nvs_xdr_nvl_fini(nvstream_t *nvs)
+{
+ if (nvs->nvs_op == NVS_OP_ENCODE) {
+ XDR *xdr = nvs->nvs_private;
+ int zero = 0;
+
+ if (!xdr_int(xdr, &zero) || !xdr_int(xdr, &zero))
+ return (EFAULT);
+ }
+
+ return (0);
+}
+
+/*
+ * The format of xdr encoded nvpair is:
+ * encode_size, decode_size, name string, data type, nelem, data
+ */
+static int
+nvs_xdr_nvp_op(nvstream_t *nvs, nvpair_t *nvp)
+{
+ data_type_t type;
+ char *buf;
+ char *buf_end = (char *)nvp + nvp->nvp_size;
+ int value_sz;
+ uint_t nelem, buflen;
+ bool_t ret = FALSE;
+ XDR *xdr = nvs->nvs_private;
+
+ ASSERT(xdr != NULL && nvp != NULL);
+
+ /* name string */
+ if ((buf = NVP_NAME(nvp)) >= buf_end)
+ return (EFAULT);
+ buflen = buf_end - buf;
+
+ if (!xdr_string(xdr, &buf, buflen - 1))
+ return (EFAULT);
+ nvp->nvp_name_sz = strlen(buf) + 1;
+
+ /* type and nelem */
+ if (!xdr_int(xdr, (int *)&nvp->nvp_type) ||
+ !xdr_int(xdr, &nvp->nvp_value_elem))
+ return (EFAULT);
+
+ type = NVP_TYPE(nvp);
+ nelem = nvp->nvp_value_elem;
+
+ /*
+ * Verify type and nelem and get the value size.
+ * For the data types DATA_TYPE_STRING and DATA_TYPE_STRING_ARRAY,
+ * the size of the string(s) is excluded.
+ */
+ if ((value_sz = i_get_value_size(type, NULL, nelem)) < 0)
+ return (EFAULT);
+
+ /* if there is no data to extract then return */
+ if (nelem == 0)
+ return (0);
+
+ /* value */
+ if ((buf = NVP_VALUE(nvp)) >= buf_end)
+ return (EFAULT);
+ buflen = buf_end - buf;
+
+ if (buflen < value_sz)
+ return (EFAULT);
+
+ switch (type) {
+ case DATA_TYPE_NVLIST:
+ if (nvs_embedded(nvs, (void *)buf) == 0)
+ return (0);
+ break;
+
+ case DATA_TYPE_NVLIST_ARRAY:
+ if (nvs_embedded_nvl_array(nvs, nvp, NULL) == 0)
+ return (0);
+ break;
+
+ case DATA_TYPE_BOOLEAN:
+ ret = TRUE;
+ break;
+
+ case DATA_TYPE_BYTE:
+ case DATA_TYPE_INT8:
+ case DATA_TYPE_UINT8:
+ ret = xdr_char(xdr, buf);
+ break;
+
+ case DATA_TYPE_INT16:
+ ret = xdr_short(xdr, (void *)buf);
+ break;
+
+ case DATA_TYPE_UINT16:
+ ret = xdr_u_short(xdr, (void *)buf);
+ break;
+
+ case DATA_TYPE_BOOLEAN_VALUE:
+ case DATA_TYPE_INT32:
+ ret = xdr_int(xdr, (void *)buf);
+ break;
+
+ case DATA_TYPE_UINT32:
+ ret = xdr_u_int(xdr, (void *)buf);
+ break;
+
+ case DATA_TYPE_INT64:
+ ret = xdr_longlong_t(xdr, (void *)buf);
+ break;
+
+ case DATA_TYPE_UINT64:
+ ret = xdr_u_longlong_t(xdr, (void *)buf);
+ break;
+
+ case DATA_TYPE_HRTIME:
+ /*
+ * NOTE: must expose the definition of hrtime_t here
+ */
+ ret = xdr_longlong_t(xdr, (void *)buf);
+ break;
+#if !defined(_KERNEL)
+ case DATA_TYPE_DOUBLE:
+ ret = xdr_double(xdr, (void *)buf);
+ break;
+#endif
+ case DATA_TYPE_STRING:
+ ret = xdr_string(xdr, &buf, buflen - 1);
+ break;
+
+ case DATA_TYPE_BYTE_ARRAY:
+ ret = xdr_opaque(xdr, buf, nelem);
+ break;
+
+ case DATA_TYPE_INT8_ARRAY:
+ case DATA_TYPE_UINT8_ARRAY:
+ ret = xdr_array(xdr, &buf, &nelem, buflen, sizeof (int8_t),
+ (xdrproc_t)xdr_char);
+ break;
+
+ case DATA_TYPE_INT16_ARRAY:
+ ret = xdr_array(xdr, &buf, &nelem, buflen / sizeof (int16_t),
+ sizeof (int16_t), (xdrproc_t)xdr_short);
+ break;
+
+ case DATA_TYPE_UINT16_ARRAY:
+ ret = xdr_array(xdr, &buf, &nelem, buflen / sizeof (uint16_t),
+ sizeof (uint16_t), (xdrproc_t)xdr_u_short);
+ break;
+
+ case DATA_TYPE_BOOLEAN_ARRAY:
+ case DATA_TYPE_INT32_ARRAY:
+ ret = xdr_array(xdr, &buf, &nelem, buflen / sizeof (int32_t),
+ sizeof (int32_t), (xdrproc_t)xdr_int);
+ break;
+
+ case DATA_TYPE_UINT32_ARRAY:
+ ret = xdr_array(xdr, &buf, &nelem, buflen / sizeof (uint32_t),
+ sizeof (uint32_t), (xdrproc_t)xdr_u_int);
+ break;
+
+ case DATA_TYPE_INT64_ARRAY:
+ ret = xdr_array(xdr, &buf, &nelem, buflen / sizeof (int64_t),
+ sizeof (int64_t), (xdrproc_t)xdr_longlong_t);
+ break;
+
+ case DATA_TYPE_UINT64_ARRAY:
+ ret = xdr_array(xdr, &buf, &nelem, buflen / sizeof (uint64_t),
+ sizeof (uint64_t), (xdrproc_t)xdr_u_longlong_t);
+ break;
+
+ case DATA_TYPE_STRING_ARRAY: {
+ size_t len = nelem * sizeof (uint64_t);
+ char **strp = (void *)buf;
+ int i;
+
+ if (nvs->nvs_op == NVS_OP_DECODE)
+ bzero(buf, len); /* don't trust packed data */
+
+ for (i = 0; i < nelem; i++) {
+ if (buflen <= len)
+ return (EFAULT);
+
+ buf += len;
+ buflen -= len;
+
+ if (xdr_string(xdr, &buf, buflen - 1) != TRUE)
+ return (EFAULT);
+
+ if (nvs->nvs_op == NVS_OP_DECODE)
+ strp[i] = buf;
+ len = strlen(buf) + 1;
+ }
+ ret = TRUE;
+ break;
+ }
+ default:
+ break;
+ }
+
+ return (ret == TRUE ? 0 : EFAULT);
+}
+
+static int
+nvs_xdr_nvp_size(nvstream_t *nvs, nvpair_t *nvp, size_t *size)
+{
+ data_type_t type = NVP_TYPE(nvp);
+ /*
+ * encode_size + decode_size + name string size + data type + nelem
+ * where name string size = 4 + NV_ALIGN4(strlen(NVP_NAME(nvp)))
+ */
+ uint64_t nvp_sz = 4 + 4 + 4 + NV_ALIGN4(strlen(NVP_NAME(nvp))) + 4 + 4;
+
+ switch (type) {
+ case DATA_TYPE_BOOLEAN:
+ break;
+
+ case DATA_TYPE_BOOLEAN_VALUE:
+ case DATA_TYPE_BYTE:
+ case DATA_TYPE_INT8:
+ case DATA_TYPE_UINT8:
+ case DATA_TYPE_INT16:
+ case DATA_TYPE_UINT16:
+ case DATA_TYPE_INT32:
+ case DATA_TYPE_UINT32:
+ nvp_sz += 4; /* 4 is the minimum xdr unit */
+ break;
+
+ case DATA_TYPE_INT64:
+ case DATA_TYPE_UINT64:
+ case DATA_TYPE_HRTIME:
+#if !defined(_KERNEL)
+ case DATA_TYPE_DOUBLE:
+#endif
+ nvp_sz += 8;
+ break;
+
+ case DATA_TYPE_STRING:
+ nvp_sz += 4 + NV_ALIGN4(strlen((char *)NVP_VALUE(nvp)));
+ break;
+
+ case DATA_TYPE_BYTE_ARRAY:
+ nvp_sz += NV_ALIGN4(NVP_NELEM(nvp));
+ break;
+
+ case DATA_TYPE_BOOLEAN_ARRAY:
+ case DATA_TYPE_INT8_ARRAY:
+ case DATA_TYPE_UINT8_ARRAY:
+ case DATA_TYPE_INT16_ARRAY:
+ case DATA_TYPE_UINT16_ARRAY:
+ case DATA_TYPE_INT32_ARRAY:
+ case DATA_TYPE_UINT32_ARRAY:
+ nvp_sz += 4 + 4 * (uint64_t)NVP_NELEM(nvp);
+ break;
+
+ case DATA_TYPE_INT64_ARRAY:
+ case DATA_TYPE_UINT64_ARRAY:
+ nvp_sz += 4 + 8 * (uint64_t)NVP_NELEM(nvp);
+ break;
+
+ case DATA_TYPE_STRING_ARRAY: {
+ int i;
+ char **strs = (void *)NVP_VALUE(nvp);
+
+ for (i = 0; i < NVP_NELEM(nvp); i++)
+ nvp_sz += 4 + NV_ALIGN4(strlen(strs[i]));
+
+ break;
+ }
+
+ case DATA_TYPE_NVLIST:
+ case DATA_TYPE_NVLIST_ARRAY: {
+ size_t nvsize = 0;
+ int old_nvs_op = nvs->nvs_op;
+ int err;
+
+ nvs->nvs_op = NVS_OP_GETSIZE;
+ if (type == DATA_TYPE_NVLIST)
+ err = nvs_operation(nvs, EMBEDDED_NVL(nvp), &nvsize);
+ else
+ err = nvs_embedded_nvl_array(nvs, nvp, &nvsize);
+ nvs->nvs_op = old_nvs_op;
+
+ if (err != 0)
+ return (EINVAL);
+
+ nvp_sz += nvsize;
+ break;
+ }
+
+ default:
+ return (EINVAL);
+ }
+
+ if (nvp_sz > INT32_MAX)
+ return (EINVAL);
+
+ *size = nvp_sz;
+
+ return (0);
+}
+
+
+/*
+ * The NVS_XDR_MAX_LEN macro takes a packed xdr buffer of size x and estimates
+ * the largest nvpair that could be encoded in the buffer.
+ *
+ * See the comments above nvs_xdr_nvp_op() for the format of xdr encoding.
+ * The size of an xdr packed nvpair without any data is 5 words.
+ *
+ * Using the size of the data directly as an estimate would be ok
+ * in all cases except one. If the data type is DATA_TYPE_STRING_ARRAY
+ * then the actual nvpair has space for an array of pointers to index
+ * the strings. These pointers are not encoded into the packed xdr buffer.
+ *
+ * If the data is of type DATA_TYPE_STRING_ARRAY and all the strings are
+ * of length 0, then each string is encoded in xdr format as a single word.
+ * Therefore, when expanded to an nvpair, there will be 2.25 words used for
+ * each string (an int64_t allocated for pointer usage, and a single char
+ * for the null termination).
+ *
+ * This is the calculation performed by the NVS_XDR_MAX_LEN macro.
+ */
+#define NVS_XDR_HDR_LEN ((size_t)(5 * 4))
+#define NVS_XDR_DATA_LEN(y) (((size_t)(y) <= NVS_XDR_HDR_LEN) ? \
+ 0 : ((size_t)(y) - NVS_XDR_HDR_LEN))
+#define NVS_XDR_MAX_LEN(x) (NVP_SIZE_CALC(1, 0) + \
+ (NVS_XDR_DATA_LEN(x) * 2) + \
+ NV_ALIGN4((NVS_XDR_DATA_LEN(x) / 4)))
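+
+/*
+ * For illustration: the 2.25 factor shows up because an xdr buffer of
+ * y data bytes can hold at most y / 4 zero-length strings (4 bytes
+ * each), and each such string expands in the nvpair to 8 bytes of
+ * pointer plus 1 byte of null terminator, i.e., 9 bytes, which is
+ * 2.25 times the 4 encoded bytes. The macro over-estimates this as
+ * 2 * y + NV_ALIGN4(y / 4) on top of the minimum nvpair size.
+ */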
+
+static int
+nvs_xdr_nvpair(nvstream_t *nvs, nvpair_t *nvp, size_t *size)
+{
+ XDR *xdr = nvs->nvs_private;
+ int32_t encode_len, decode_len;
+
+ switch (nvs->nvs_op) {
+ case NVS_OP_ENCODE: {
+ size_t nvsize;
+
+ if (nvs_xdr_nvp_size(nvs, nvp, &nvsize) != 0)
+ return (EFAULT);
+
+ decode_len = nvp->nvp_size;
+ encode_len = nvsize;
+ if (!xdr_int(xdr, &encode_len) || !xdr_int(xdr, &decode_len))
+ return (EFAULT);
+
+ return (nvs_xdr_nvp_op(nvs, nvp));
+ }
+ case NVS_OP_DECODE: {
+ struct xdr_bytesrec bytesrec;
+
+ /* get the encode and decode size */
+ if (!xdr_int(xdr, &encode_len) || !xdr_int(xdr, &decode_len))
+ return (EFAULT);
+ *size = decode_len;
+
+ /* are we at the end of the stream? */
+ if (*size == 0)
+ return (0);
+
+ /* sanity check the size parameter */
+ if (!xdr_control(xdr, XDR_GET_BYTES_AVAIL, &bytesrec))
+ return (EFAULT);
+
+ if (*size > NVS_XDR_MAX_LEN(bytesrec.xc_num_avail))
+ return (EFAULT);
+ break;
+ }
+
+ default:
+ return (EINVAL);
+ }
+ return (0);
+}
+
+static const struct nvs_ops nvs_xdr_ops = {
+ nvs_xdr_nvlist,
+ nvs_xdr_nvpair,
+ nvs_xdr_nvp_op,
+ nvs_xdr_nvp_size,
+ nvs_xdr_nvl_fini
+};
+
+static int
+nvs_xdr(nvstream_t *nvs, nvlist_t *nvl, char *buf, size_t *buflen)
+{
+ XDR xdr;
+ int err;
+
+ nvs->nvs_ops = &nvs_xdr_ops;
+
+ if ((err = nvs_xdr_create(nvs, &xdr, buf + sizeof (nvs_header_t),
+ *buflen - sizeof (nvs_header_t))) != 0)
+ return (err);
+
+ err = nvs_operation(nvs, nvl, buflen);
+
+ nvs_xdr_destroy(nvs);
+
+ return (err);
+}
diff --git a/sys/cddl/contrib/opensolaris/common/nvpair/opensolaris_nvpair_alloc_fixed.c b/sys/cddl/contrib/opensolaris/common/nvpair/opensolaris_nvpair_alloc_fixed.c
new file mode 100644
index 000000000000..620171e4ca4e
--- /dev/null
+++ b/sys/cddl/contrib/opensolaris/common/nvpair/opensolaris_nvpair_alloc_fixed.c
@@ -0,0 +1,118 @@
+/*
+ * CDDL HEADER START
+ *
+ * The contents of this file are subject to the terms of the
+ * Common Development and Distribution License (the "License").
+ * You may not use this file except in compliance with the License.
+ *
+ * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
+ * or http://www.opensolaris.org/os/licensing.
+ * See the License for the specific language governing permissions
+ * and limitations under the License.
+ *
+ * When distributing Covered Code, include this CDDL HEADER in each
+ * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
+ * If applicable, add the following below this CDDL HEADER, with the
+ * fields enclosed by brackets "[]" replaced with your own identifying
+ * information: Portions Copyright [yyyy] [name of copyright owner]
+ *
+ * CDDL HEADER END
+ */
+
+/*
+ * Copyright 2006 Sun Microsystems, Inc. All rights reserved.
+ * Use is subject to license terms.
+ */
+
+#pragma ident "%Z%%M% %I% %E% SMI"
+
+#include <sys/nvpair.h>
+#include <sys/sysmacros.h>
+#if defined(_KERNEL) && !defined(_BOOT)
+#include <sys/varargs.h>
+#else
+#include <stdarg.h>
+#include <strings.h>
+#endif
+
+/*
+ * This allocator is very simple.
+ * - it uses a pre-allocated buffer for memory allocations.
+ * - it does _not_ free memory in the pre-allocated buffer.
+ *
+ * The reason for the selected implementation is simplicity.
+ * This allocator is designed for use in interrupt context, where
+ * the caller may not wait for free memory.
+ */
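+
+/*
+ * A minimal usage sketch (assuming the standard nv_alloc_init() and
+ * nvlist_xalloc() interfaces from <sys/nvpair.h>; names are
+ * hypothetical):
+ *
+ *	static char buf[1024];		(hypothetical backing store)
+ *	nv_alloc_t nva;
+ *	nvlist_t *nvl;
+ *
+ *	if (nv_alloc_init(&nva, nv_fixed_ops, buf, sizeof (buf)) == 0 &&
+ *	    nvlist_xalloc(&nvl, NV_UNIQUE_NAME, &nva) == 0) {
+ *		... populate nvl; all allocations come out of buf ...
+ *	}
+ *
+ * Since nv_fixed_free() is a no-op, the buffer is only reclaimed as a
+ * whole, via nv_alloc_reset() or by discarding it.
+ */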
+
+/* pre-allocated buffer for memory allocations */
+typedef struct nvbuf {
+ uintptr_t nvb_buf; /* address of pre-allocated buffer */
+ uintptr_t nvb_lim; /* limit address in the buffer */
+ uintptr_t nvb_cur; /* current address in the buffer */
+} nvbuf_t;
+
+/*
+ * Initialize the pre-allocated buffer allocator. The caller needs to supply
+ *
+ * buf address of pre-allocated buffer
+ * bufsz size of pre-allocated buffer
+ *
+ * nv_fixed_init() calculates the remaining members of nvbuf_t.
+ */
+static int
+nv_fixed_init(nv_alloc_t *nva, va_list valist)
+{
+ uintptr_t base = va_arg(valist, uintptr_t);
+ uintptr_t lim = base + va_arg(valist, size_t);
+ nvbuf_t *nvb = (nvbuf_t *)P2ROUNDUP(base, sizeof (uintptr_t));
+
+ if (base == 0 || (uintptr_t)&nvb[1] > lim)
+ return (EINVAL);
+
+ nvb->nvb_buf = (uintptr_t)&nvb[0];
+ nvb->nvb_cur = (uintptr_t)&nvb[1];
+ nvb->nvb_lim = lim;
+ nva->nva_arg = nvb;
+
+ return (0);
+}
+
+static void *
+nv_fixed_alloc(nv_alloc_t *nva, size_t size)
+{
+ nvbuf_t *nvb = nva->nva_arg;
+ uintptr_t new = nvb->nvb_cur;
+
+ if (size == 0 || new + size > nvb->nvb_lim)
+ return (NULL);
+
+ nvb->nvb_cur = P2ROUNDUP(new + size, sizeof (uintptr_t));
+
+ return ((void *)new);
+}
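+
+/*
+ * For example, on a system with 8-byte pointers a 5-byte allocation
+ * returns the current cursor and advances nvb_cur by 8 (P2ROUNDUP to
+ * the next uintptr_t boundary), so every allocation stays
+ * pointer-aligned.
+ */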
+
+/*ARGSUSED*/
+static void
+nv_fixed_free(nv_alloc_t *nva, void *buf, size_t size)
+{
+ /* don't free memory in the pre-allocated buffer */
+}
+
+static void
+nv_fixed_reset(nv_alloc_t *nva)
+{
+ nvbuf_t *nvb = nva->nva_arg;
+
+ nvb->nvb_cur = (uintptr_t)&nvb[1];
+}
+
+const nv_alloc_ops_t nv_fixed_ops_def = {
+ nv_fixed_init, /* nv_ao_init() */
+ NULL, /* nv_ao_fini() */
+ nv_fixed_alloc, /* nv_ao_alloc() */
+ nv_fixed_free, /* nv_ao_free() */
+ nv_fixed_reset /* nv_ao_reset() */
+};
+
+const nv_alloc_ops_t *nv_fixed_ops = &nv_fixed_ops_def;
diff --git a/sys/cddl/contrib/opensolaris/common/unicode/u8_textprep.c b/sys/cddl/contrib/opensolaris/common/unicode/u8_textprep.c
new file mode 100644
index 000000000000..06c8a05506e7
--- /dev/null
+++ b/sys/cddl/contrib/opensolaris/common/unicode/u8_textprep.c
@@ -0,0 +1,2130 @@
+/*
+ * CDDL HEADER START
+ *
+ * The contents of this file are subject to the terms of the
+ * Common Development and Distribution License (the "License").
+ * You may not use this file except in compliance with the License.
+ *
+ * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
+ * or http://www.opensolaris.org/os/licensing.
+ * See the License for the specific language governing permissions
+ * and limitations under the License.
+ *
+ * When distributing Covered Code, include this CDDL HEADER in each
+ * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
+ * If applicable, add the following below this CDDL HEADER, with the
+ * fields enclosed by brackets "[]" replaced with your own identifying
+ * information: Portions Copyright [yyyy] [name of copyright owner]
+ *
+ * CDDL HEADER END
+ */
+/*
+ * Copyright 2008 Sun Microsystems, Inc. All rights reserved.
+ * Use is subject to license terms.
+ */
+
+
+
+/*
+ * UTF-8 text preparation functions (PSARC/2007/149, PSARC/2007/458).
+ *
+ * Man pages: u8_textprep_open(9F), u8_textprep_buf(9F), u8_textprep_close(9F),
+ * u8_textprep_str(9F), u8_strcmp(9F), and u8_validate(9F). See also
+ * the section 3C man pages.
+ * Interface stability: Committed.
+ */
+
+#include <sys/types.h>
+#ifdef _KERNEL
+#include <sys/param.h>
+#include <sys/sysmacros.h>
+#include <sys/systm.h>
+#include <sys/debug.h>
+#include <sys/kmem.h>
+#include <sys/sunddi.h>
+#else
+#include <strings.h>
+#endif /* _KERNEL */
+#include <sys/byteorder.h>
+#include <sys/errno.h>
+#include <sys/u8_textprep.h>
+#include <sys/u8_textprep_data.h>
+
+
+/* The maximum possible number of bytes in a UTF-8 character. */
+#define U8_MB_CUR_MAX (4)
+
+/*
+ * The maximum number of bytes needed for a UTF-8 character to cover
+ * U+0000 - U+FFFF, i.e., the coding space of now deprecated UCS-2.
+ */
+#define U8_MAX_BYTES_UCS2 (3)
+
+/* The maximum possible number of bytes in a Stream-Safe Text. */
+#define U8_STREAM_SAFE_TEXT_MAX (128)
+
+/*
+ * The maximum number of characters in a combining/conjoining sequence and
+ * the actual upper bound limit of a combining/conjoining sequence.
+ */
+#define U8_MAX_CHARS_A_SEQ (32)
+#define U8_UPPER_LIMIT_IN_A_SEQ (31)
+
+/* The combining class value for Starter. */
+#define U8_COMBINING_CLASS_STARTER (0)
+
+/*
+ * Some Hangul-related macros follow.
+ *
+ * The first and the last of Hangul syllables, Hangul Jamo Leading consonants,
+ * Vowels, and optional Trailing consonants in Unicode scalar values.
+ *
+ * Note that U8_HANGUL_JAMO_T_FIRST below is 0x11A7, not the actual U+11A8.
+ * This is because the trailing consonant is optional, so we pre-calculate
+ * by subtracting one.
+ *
+ * Each of the 19 modern leading consonants yields 588 possible syllables,
+ * since Hangul has 21 modern vowels and 27 modern trailing consonants plus
+ * one for the no-trailing-consonant case, i.e., 21 x 28 = 588.
+ *
+ * A number of other Hangul-related macros also follow. Bear in mind that
+ * U8_HANGUL_JAMO_1ST_BYTE can be used to check whether a byte could begin
+ * a Hangul Jamo, but a match does not guarantee that it is one; it only
+ * makes it likely.
+ */
+#define U8_HANGUL_SYL_FIRST (0xAC00U)
+#define U8_HANGUL_SYL_LAST (0xD7A3U)
+
+#define U8_HANGUL_JAMO_L_FIRST (0x1100U)
+#define U8_HANGUL_JAMO_L_LAST (0x1112U)
+#define U8_HANGUL_JAMO_V_FIRST (0x1161U)
+#define U8_HANGUL_JAMO_V_LAST (0x1175U)
+#define U8_HANGUL_JAMO_T_FIRST (0x11A7U)
+#define U8_HANGUL_JAMO_T_LAST (0x11C2U)
+
+#define U8_HANGUL_V_COUNT (21)
+#define U8_HANGUL_VT_COUNT (588)
+#define U8_HANGUL_T_COUNT (28)
+
+#define U8_HANGUL_JAMO_1ST_BYTE (0xE1U)
+
+#define U8_SAVE_HANGUL_AS_UTF8(s, i, j, k, b) \
+ (s)[(i)] = (uchar_t)(0xE0U | ((uint32_t)(b) & 0xF000U) >> 12); \
+ (s)[(j)] = (uchar_t)(0x80U | ((uint32_t)(b) & 0x0FC0U) >> 6); \
+ (s)[(k)] = (uchar_t)(0x80U | ((uint32_t)(b) & 0x003FU));
+
+#define U8_HANGUL_JAMO_L(u) \
+ ((u) >= U8_HANGUL_JAMO_L_FIRST && (u) <= U8_HANGUL_JAMO_L_LAST)
+
+#define U8_HANGUL_JAMO_V(u) \
+ ((u) >= U8_HANGUL_JAMO_V_FIRST && (u) <= U8_HANGUL_JAMO_V_LAST)
+
+#define U8_HANGUL_JAMO_T(u) \
+ ((u) > U8_HANGUL_JAMO_T_FIRST && (u) <= U8_HANGUL_JAMO_T_LAST)
+
+#define U8_HANGUL_JAMO(u) \
+ ((u) >= U8_HANGUL_JAMO_L_FIRST && (u) <= U8_HANGUL_JAMO_T_LAST)
+
+#define U8_HANGUL_SYLLABLE(u) \
+ ((u) >= U8_HANGUL_SYL_FIRST && (u) <= U8_HANGUL_SYL_LAST)
+
+#define U8_HANGUL_COMPOSABLE_L_V(s, u) \
+ ((s) == U8_STATE_HANGUL_L && U8_HANGUL_JAMO_V((u)))
+
+#define U8_HANGUL_COMPOSABLE_LV_T(s, u) \
+ ((s) == U8_STATE_HANGUL_LV && U8_HANGUL_JAMO_T((u)))
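+
+/*
+ * A worked example of the arithmetic above (illustrative): the syllable
+ * U+AC01 has offset u = 0xAC01 - 0xAC00 = 1, so its leading consonant
+ * is U+1100 + u / 588 = U+1100, its vowel is U+1161 + (u % 588) / 28 =
+ * U+1161, and its trailing consonant index is u % 28 = 1, i.e.,
+ * 0x11A7 + 1 = U+11A8 (this is where the pre-subtracted 0x11A7 base
+ * pays off).
+ */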
+
+/* The types of decomposition mappings. */
+#define U8_DECOMP_BOTH (0xF5U)
+#define U8_DECOMP_CANONICAL (0xF6U)
+
+/* The indicator for 16-bit table. */
+#define U8_16BIT_TABLE_INDICATOR (0x8000U)
+
+/* The following are some convenience macros. */
+#define U8_PUT_3BYTES_INTO_UTF32(u, b1, b2, b3) \
+ (u) = ((((uint32_t)(b1) & 0x0F) << 12) | \
+ (((uint32_t)(b2) & 0x3F) << 6) | \
+ ((uint32_t)(b3) & 0x3F));
+#define U8_SIMPLE_SWAP(a, b, t) \
+ (t) = (a); \
+ (a) = (b); \
+ (b) = (t);
+
+#define U8_ASCII_TOUPPER(c) \
+ (((c) >= 'a' && (c) <= 'z') ? (c) - 'a' + 'A' : (c))
+
+#define U8_ASCII_TOLOWER(c) \
+ (((c) >= 'A' && (c) <= 'Z') ? (c) - 'A' + 'a' : (c))
+
+#define U8_ISASCII(c) (((uchar_t)(c)) < 0x80U)
+/*
+ * The following macro assumes that the two characters that are to be
+ * swapped are adjacent to each other and 'a' comes before 'b'.
+ *
+ * If the assumptions are not met, then, the macro will fail.
+ */
+#define U8_SWAP_COMB_MARKS(a, b) \
+ for (k = 0; k < disp[(a)]; k++) \
+ u8t[k] = u8s[start[(a)] + k]; \
+ for (k = 0; k < disp[(b)]; k++) \
+ u8s[start[(a)] + k] = u8s[start[(b)] + k]; \
+ start[(b)] = start[(a)] + disp[(b)]; \
+ for (k = 0; k < disp[(a)]; k++) \
+ u8s[start[(b)] + k] = u8t[k]; \
+ U8_SIMPLE_SWAP(comb_class[(a)], comb_class[(b)], tc); \
+ U8_SIMPLE_SWAP(disp[(a)], disp[(b)], tc);
+
+/* The possible states during normalization. */
+typedef enum {
+ U8_STATE_START = 0,
+ U8_STATE_HANGUL_L = 1,
+ U8_STATE_HANGUL_LV = 2,
+ U8_STATE_HANGUL_LVT = 3,
+ U8_STATE_HANGUL_V = 4,
+ U8_STATE_HANGUL_T = 5,
+ U8_STATE_COMBINING_MARK = 6
+} u8_normalization_states_t;
+
+/*
+ * The three vectors below are used to check that the bytes of a given UTF-8
+ * character are valid and do not contain any malformed byte values.
+ *
+ * We used to have a quite relaxed UTF-8 binary representation, but then
+ * there were some security-related issues, so the Unicode Consortium defined
+ * and announced the UTF-8 Corrigendum in Unicode 3.1 and refined it once
+ * more in Unicode 3.2. The following three tables are based on that.
+ */
+
+#define U8_ILLEGAL_NEXT_BYTE_COMMON(c) ((c) < 0x80 || (c) > 0xBF)
+
+#define I_ U8_ILLEGAL_CHAR
+#define O_ U8_OUT_OF_RANGE_CHAR
+
+const int8_t u8_number_of_bytes[0x100] = {
+ 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
+ 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
+ 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
+ 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
+ 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
+ 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
+ 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
+ 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
+
+/* 80 81 82 83 84 85 86 87 88 89 8A 8B 8C 8D 8E 8F */
+ I_, I_, I_, I_, I_, I_, I_, I_, I_, I_, I_, I_, I_, I_, I_, I_,
+
+/* 90 91 92 93 94 95 96 97 98 99 9A 9B 9C 9D 9E 9F */
+ I_, I_, I_, I_, I_, I_, I_, I_, I_, I_, I_, I_, I_, I_, I_, I_,
+
+/* A0 A1 A2 A3 A4 A5 A6 A7 A8 A9 AA AB AC AD AE AF */
+ I_, I_, I_, I_, I_, I_, I_, I_, I_, I_, I_, I_, I_, I_, I_, I_,
+
+/* B0 B1 B2 B3 B4 B5 B6 B7 B8 B9 BA BB BC BD BE BF */
+ I_, I_, I_, I_, I_, I_, I_, I_, I_, I_, I_, I_, I_, I_, I_, I_,
+
+/* C0 C1 C2 C3 C4 C5 C6 C7 C8 C9 CA CB CC CD CE CF */
+ I_, I_, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
+
+/* D0 D1 D2 D3 D4 D5 D6 D7 D8 D9 DA DB DC DD DE DF */
+ 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
+
+/* E0 E1 E2 E3 E4 E5 E6 E7 E8 E9 EA EB EC ED EE EF */
+ 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3,
+
+/* F0 F1 F2 F3 F4 F5 F6 F7 F8 F9 FA FB FC FD FE FF */
+ 4, 4, 4, 4, 4, O_, O_, O_, O_, O_, O_, O_, O_, O_, O_, O_,
+};
+
+#undef I_
+#undef O_
+
+const uint8_t u8_valid_min_2nd_byte[0x100] = {
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+/* C0 C1 C2 C3 C4 C5 C6 C7 */
+ 0, 0, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80,
+/* C8 C9 CA CB CC CD CE CF */
+ 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80,
+/* D0 D1 D2 D3 D4 D5 D6 D7 */
+ 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80,
+/* D8 D9 DA DB DC DD DE DF */
+ 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80,
+/* E0 E1 E2 E3 E4 E5 E6 E7 */
+ 0xa0, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80,
+/* E8 E9 EA EB EC ED EE EF */
+ 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80,
+/* F0 F1 F2 F3 F4 F5 F6 F7 */
+ 0x90, 0x80, 0x80, 0x80, 0x80, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+};
+
+const uint8_t u8_valid_max_2nd_byte[0x100] = {
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+/* C0 C1 C2 C3 C4 C5 C6 C7 */
+ 0, 0, 0xbf, 0xbf, 0xbf, 0xbf, 0xbf, 0xbf,
+/* C8 C9 CA CB CC CD CE CF */
+ 0xbf, 0xbf, 0xbf, 0xbf, 0xbf, 0xbf, 0xbf, 0xbf,
+/* D0 D1 D2 D3 D4 D5 D6 D7 */
+ 0xbf, 0xbf, 0xbf, 0xbf, 0xbf, 0xbf, 0xbf, 0xbf,
+/* D8 D9 DA DB DC DD DE DF */
+ 0xbf, 0xbf, 0xbf, 0xbf, 0xbf, 0xbf, 0xbf, 0xbf,
+/* E0 E1 E2 E3 E4 E5 E6 E7 */
+ 0xbf, 0xbf, 0xbf, 0xbf, 0xbf, 0xbf, 0xbf, 0xbf,
+/* E8 E9 EA EB EC ED EE EF */
+ 0xbf, 0xbf, 0xbf, 0xbf, 0xbf, 0x9f, 0xbf, 0xbf,
+/* F0 F1 F2 F3 F4 F5 F6 F7 */
+ 0xbf, 0xbf, 0xbf, 0xbf, 0x8f, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+};
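+
+/*
+ * For example, the tables above reject the overlong sequence
+ * 0xE0 0x9F 0xBF (the minimum 2nd byte for 0xE0 is 0xA0), the
+ * surrogate encoding 0xED 0xA0 0x80 for U+D800 (the maximum 2nd byte
+ * for 0xED is 0x9F), and 0xF4 0x90 0x80 0x80, which would encode a
+ * value above U+10FFFF (the maximum 2nd byte for 0xF4 is 0x8F).
+ */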
+
+
+/*
+ * The u8_validate() function validates the given UTF-8 character string and
+ * calculates its byte length. It is quite similar to mblen(3C) except that
+ * it will also validate against a list of characters if required, and it is
+ * specific to UTF-8 and Unicode.
+ */
+int
+u8_validate(char *u8str, size_t n, char **list, int flag, int *errnum)
+{
+ uchar_t *ib;
+ uchar_t *ibtail;
+ uchar_t **p;
+ uchar_t *s1;
+ uchar_t *s2;
+ uchar_t f;
+ int sz;
+ size_t i;
+ int ret_val;
+ boolean_t second;
+ boolean_t no_need_to_validate_entire;
+ boolean_t check_additional;
+ boolean_t validate_ucs2_range_only;
+
+ if (! u8str)
+ return (0);
+
+ ib = (uchar_t *)u8str;
+ ibtail = ib + n;
+
+ ret_val = 0;
+
+ no_need_to_validate_entire = ! (flag & U8_VALIDATE_ENTIRE);
+ check_additional = flag & U8_VALIDATE_CHECK_ADDITIONAL;
+ validate_ucs2_range_only = flag & U8_VALIDATE_UCS2_RANGE;
+
+ while (ib < ibtail) {
+ /*
+ * The first byte of a UTF-8 character tells how many
+ * bytes will follow for the character. If the first byte
+ * is an illegal byte value or out of range value, we just
+ * return -1 with an appropriate error number.
+ */
+ sz = u8_number_of_bytes[*ib];
+ if (sz == U8_ILLEGAL_CHAR) {
+ *errnum = EILSEQ;
+ return (-1);
+ }
+
+ if (sz == U8_OUT_OF_RANGE_CHAR ||
+ (validate_ucs2_range_only && sz > U8_MAX_BYTES_UCS2)) {
+ *errnum = ERANGE;
+ return (-1);
+ }
+
+ /*
+ * If we don't have enough bytes to check on, that's also
+ * an error. As you can see, we give illegal byte sequence
+ * checking higher priority than EINVAL cases.
+ */
+ if ((ibtail - ib) < sz) {
+ *errnum = EINVAL;
+ return (-1);
+ }
+
+ if (sz == 1) {
+ ib++;
+ ret_val++;
+ } else {
+ /*
+ * Check the multi-byte UTF-8 character. For more
+ * details, see the comments for the data structures
+ * at the beginning of the file.
+ */
+ f = *ib++;
+ ret_val++;
+ second = B_TRUE;
+ for (i = 1; i < sz; i++) {
+ if (second) {
+ if (*ib < u8_valid_min_2nd_byte[f] ||
+ *ib > u8_valid_max_2nd_byte[f]) {
+ *errnum = EILSEQ;
+ return (-1);
+ }
+ second = B_FALSE;
+ } else if (U8_ILLEGAL_NEXT_BYTE_COMMON(*ib)) {
+ *errnum = EILSEQ;
+ return (-1);
+ }
+ ib++;
+ ret_val++;
+ }
+ }
+
+ if (check_additional) {
+ for (p = (uchar_t **)list, i = 0; p[i]; i++) {
+ s1 = ib - sz;
+ s2 = p[i];
+ while (s1 < ib) {
+ if (*s1 != *s2 || *s2 == '\0')
+ break;
+ s1++;
+ s2++;
+ }
+
+ if (s1 >= ib && *s2 == '\0') {
+ *errnum = EBADF;
+ return (-1);
+ }
+ }
+ }
+
+ if (no_need_to_validate_entire)
+ break;
+ }
+
+ return (ret_val);
+}
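+
+/*
+ * A minimal usage sketch (variable names hypothetical):
+ *
+ *	int errnum;
+ *	int len = u8_validate(str, strlen(str), (char **)NULL,
+ *	    U8_VALIDATE_ENTIRE, &errnum);
+ *
+ * returns the byte length of str if it is entirely valid UTF-8, or
+ * -1 with errnum set to EILSEQ, ERANGE, or EINVAL as described above.
+ */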
+
+/*
+ * The do_case_conv() function looks at the mapping tables and returns the
+ * mapped bytes if any. If no mapping is found, the input bytes are returned.
+ * The function always terminates the returned bytes with a null character,
+ * assuming that there is plenty of room to do so.
+ *
+ * The case conversions are simple case conversions mapping a character to
+ * another character as specified in the Unicode data. The byte size of
+ * the mapped character could be different from that of the input character.
+ *
+ * The return value is the byte length of the returned character excluding
+ * the terminating null byte.
+ */
+static size_t
+do_case_conv(int uv, uchar_t *u8s, uchar_t *s, int sz, boolean_t is_it_toupper)
+{
+ size_t i;
+ uint16_t b1 = 0;
+ uint16_t b2 = 0;
+ uint16_t b3 = 0;
+ uint16_t b3_tbl;
+ uint16_t b3_base;
+ uint16_t b4 = 0;
+ size_t start_id;
+ size_t end_id;
+
+ /*
+ * At this point, the only possible values for sz are 2, 3, and 4.
+ * u8s should point to a buffer larger than 5 bytes.
+ */
+ if (sz == 2) {
+ b3 = u8s[0] = s[0];
+ b4 = u8s[1] = s[1];
+ } else if (sz == 3) {
+ b2 = u8s[0] = s[0];
+ b3 = u8s[1] = s[1];
+ b4 = u8s[2] = s[2];
+ } else if (sz == 4) {
+ b1 = u8s[0] = s[0];
+ b2 = u8s[1] = s[1];
+ b3 = u8s[2] = s[2];
+ b4 = u8s[3] = s[3];
+ } else {
+ /* This is not possible but just in case as a fallback. */
+ if (is_it_toupper)
+ *u8s = U8_ASCII_TOUPPER(*s);
+ else
+ *u8s = U8_ASCII_TOLOWER(*s);
+ u8s[1] = '\0';
+
+ return (1);
+ }
+ u8s[sz] = '\0';
+
+ /*
+ * Let's find out if we have a corresponding character.
+ */
+ b1 = u8_common_b1_tbl[uv][b1];
+ if (b1 == U8_TBL_ELEMENT_NOT_DEF)
+ return ((size_t)sz);
+
+ b2 = u8_case_common_b2_tbl[uv][b1][b2];
+ if (b2 == U8_TBL_ELEMENT_NOT_DEF)
+ return ((size_t)sz);
+
+ if (is_it_toupper) {
+ b3_tbl = u8_toupper_b3_tbl[uv][b2][b3].tbl_id;
+ if (b3_tbl == U8_TBL_ELEMENT_NOT_DEF)
+ return ((size_t)sz);
+
+ start_id = u8_toupper_b4_tbl[uv][b3_tbl][b4];
+ end_id = u8_toupper_b4_tbl[uv][b3_tbl][b4 + 1];
+
+ /* Either there is no match or an error at the table. */
+ if (start_id >= end_id || (end_id - start_id) > U8_MB_CUR_MAX)
+ return ((size_t)sz);
+
+ b3_base = u8_toupper_b3_tbl[uv][b2][b3].base;
+
+ for (i = 0; start_id < end_id; start_id++)
+ u8s[i++] = u8_toupper_final_tbl[uv][b3_base + start_id];
+ } else {
+ b3_tbl = u8_tolower_b3_tbl[uv][b2][b3].tbl_id;
+ if (b3_tbl == U8_TBL_ELEMENT_NOT_DEF)
+ return ((size_t)sz);
+
+ start_id = u8_tolower_b4_tbl[uv][b3_tbl][b4];
+ end_id = u8_tolower_b4_tbl[uv][b3_tbl][b4 + 1];
+
+ if (start_id >= end_id || (end_id - start_id) > U8_MB_CUR_MAX)
+ return ((size_t)sz);
+
+ b3_base = u8_tolower_b3_tbl[uv][b2][b3].base;
+
+ for (i = 0; start_id < end_id; start_id++)
+ u8s[i++] = u8_tolower_final_tbl[uv][b3_base + start_id];
+ }
+
+ /*
+ * If i is still zero, that means there is no corresponding character.
+ */
+ if (i == 0)
+ return ((size_t)sz);
+
+ u8s[i] = '\0';
+
+ return (i);
+}
+
+/*
+ * The do_case_compare() function compares the two input strings, s1 and s2,
+ * one character at a time, doing case conversions if applicable, and returns
+ * the comparison result like strcmp() does.
+ *
+ * Since, empirically, most text data consists of 7-bit ASCII characters,
+ * we treat 7-bit ASCII characters as a special case to yield
+ * faster processing.
+ */
+static int
+do_case_compare(size_t uv, uchar_t *s1, uchar_t *s2, size_t n1,
+ size_t n2, boolean_t is_it_toupper, int *errnum)
+{
+ int f;
+ int sz1;
+ int sz2;
+ size_t j;
+ size_t i1;
+ size_t i2;
+ uchar_t u8s1[U8_MB_CUR_MAX + 1];
+ uchar_t u8s2[U8_MB_CUR_MAX + 1];
+
+ i1 = i2 = 0;
+ while (i1 < n1 && i2 < n2) {
+ /*
+ * Find the byte length of this UTF-8 character in
+ * string s1 and also find out whether this is an
+ * illegal start byte; if so, record a proper error
+ * number yet still treat the byte as a character.
+ */
+ sz1 = u8_number_of_bytes[*s1];
+ if (sz1 < 0) {
+ *errnum = EILSEQ;
+ sz1 = 1;
+ }
+
+ /*
+ * Mainly for 7-bit ASCII characters, we do a quick case
+ * conversion right here.
+ *
+ * If we don't have enough bytes for this character, issue
+ * an EINVAL error and use what is available.
+ *
+ * If we have enough bytes, find out if there is
+ * a corresponding uppercase character and, if so, copy over
+ * the bytes for a comparison later. If there is no
+ * corresponding uppercase character, use what we have
+ * for the comparison.
+ */
+ if (sz1 == 1) {
+ if (is_it_toupper)
+ u8s1[0] = U8_ASCII_TOUPPER(*s1);
+ else
+ u8s1[0] = U8_ASCII_TOLOWER(*s1);
+ s1++;
+ u8s1[1] = '\0';
+ } else if ((i1 + sz1) > n1) {
+ *errnum = EINVAL;
+ for (j = 0; (i1 + j) < n1; )
+ u8s1[j++] = *s1++;
+ u8s1[j] = '\0';
+ } else {
+ (void) do_case_conv(uv, u8s1, s1, sz1, is_it_toupper);
+ s1 += sz1;
+ }
+
+ /* Do the same for the string s2. */
+ sz2 = u8_number_of_bytes[*s2];
+ if (sz2 < 0) {
+ *errnum = EILSEQ;
+ sz2 = 1;
+ }
+
+ if (sz2 == 1) {
+ if (is_it_toupper)
+ u8s2[0] = U8_ASCII_TOUPPER(*s2);
+ else
+ u8s2[0] = U8_ASCII_TOLOWER(*s2);
+ s2++;
+ u8s2[1] = '\0';
+ } else if ((i2 + sz2) > n2) {
+ *errnum = EINVAL;
+ for (j = 0; (i2 + j) < n2; )
+ u8s2[j++] = *s2++;
+ u8s2[j] = '\0';
+ } else {
+ (void) do_case_conv(uv, u8s2, s2, sz2, is_it_toupper);
+ s2 += sz2;
+ }
+
+ /* Now compare the two characters. */
+ if (sz1 == 1 && sz2 == 1) {
+ if (*u8s1 > *u8s2)
+ return (1);
+ if (*u8s1 < *u8s2)
+ return (-1);
+ } else {
+ f = strcmp((const char *)u8s1, (const char *)u8s2);
+ if (f != 0)
+ return (f);
+ }
+
+ /*
+ * They were the same. Let's move on to the next
+ * characters then.
+ */
+ i1 += sz1;
+ i2 += sz2;
+ }
+
+ /*
+ * We compared until the end of either or both strings.
+ *
+ * If we reached or went past the ends of both, that means
+ * they are the same.
+ *
+ * If we reached only one of the two ends, the other string
+ * still has characters remaining, which determines
+ * the return value.
+ */
+ if (i1 >= n1) {
+ if (i2 >= n2)
+ return (0);
+ return (-1);
+ }
+ return (1);
+}
+
+/*
+ * The combining_class() function examines the given bytes and finds
+ * the corresponding Unicode combining class value. The return value 0 means
+ * it is a Starter. Any illegal UTF-8 character will also be treated as
+ * a Starter.
+ */
+static uchar_t
+combining_class(size_t uv, uchar_t *s, size_t sz)
+{
+ uint16_t b1 = 0;
+ uint16_t b2 = 0;
+ uint16_t b3 = 0;
+ uint16_t b4 = 0;
+
+ if (sz == 1 || sz > 4)
+ return (0);
+
+ if (sz == 2) {
+ b3 = s[0];
+ b4 = s[1];
+ } else if (sz == 3) {
+ b2 = s[0];
+ b3 = s[1];
+ b4 = s[2];
+ } else if (sz == 4) {
+ b1 = s[0];
+ b2 = s[1];
+ b3 = s[2];
+ b4 = s[3];
+ }
+
+ b1 = u8_common_b1_tbl[uv][b1];
+ if (b1 == U8_TBL_ELEMENT_NOT_DEF)
+ return (0);
+
+ b2 = u8_combining_class_b2_tbl[uv][b1][b2];
+ if (b2 == U8_TBL_ELEMENT_NOT_DEF)
+ return (0);
+
+ b3 = u8_combining_class_b3_tbl[uv][b2][b3];
+ if (b3 == U8_TBL_ELEMENT_NOT_DEF)
+ return (0);
+
+ return (u8_combining_class_b4_tbl[uv][b3][b4]);
+}
+
+/*
+ * The do_decomp() function finds a matching decomposition, if any, and
+ * returns it. If there is no match, the input bytes are copied and returned.
+ * The function also checks whether there is a Hangul syllable, decomposing
+ * it if necessary before returning.
+ *
+ * To save time, a single byte 7-bit ASCII character should be handled by
+ * the caller.
+ *
+ * The function returns the number of bytes returned, excluding the always
+ * appended terminating null byte. It also returns a state telling whether
+ * a Hangul character was decomposed, which is then used by the caller.
+ */
+static size_t
+do_decomp(size_t uv, uchar_t *u8s, uchar_t *s, int sz,
+ boolean_t canonical_decomposition, u8_normalization_states_t *state)
+{
+ uint16_t b1 = 0;
+ uint16_t b2 = 0;
+ uint16_t b3 = 0;
+ uint16_t b3_tbl;
+ uint16_t b3_base;
+ uint16_t b4 = 0;
+ size_t start_id;
+ size_t end_id;
+ size_t i;
+ uint32_t u1;
+
+ if (sz == 2) {
+ b3 = u8s[0] = s[0];
+ b4 = u8s[1] = s[1];
+ u8s[2] = '\0';
+ } else if (sz == 3) {
+ /* Convert it to a Unicode scalar value. */
+ U8_PUT_3BYTES_INTO_UTF32(u1, s[0], s[1], s[2]);
+
+ /*
+ * If this is a Hangul syllable, we decompose it into
+ * a leading consonant, a vowel, and an optional trailing
+ * consonant and then return.
+ */
+ if (U8_HANGUL_SYLLABLE(u1)) {
+ u1 -= U8_HANGUL_SYL_FIRST;
+
+ b1 = U8_HANGUL_JAMO_L_FIRST + u1 / U8_HANGUL_VT_COUNT;
+ b2 = U8_HANGUL_JAMO_V_FIRST + (u1 % U8_HANGUL_VT_COUNT)
+ / U8_HANGUL_T_COUNT;
+ b3 = u1 % U8_HANGUL_T_COUNT;
+
+ U8_SAVE_HANGUL_AS_UTF8(u8s, 0, 1, 2, b1);
+ U8_SAVE_HANGUL_AS_UTF8(u8s, 3, 4, 5, b2);
+ if (b3) {
+ b3 += U8_HANGUL_JAMO_T_FIRST;
+ U8_SAVE_HANGUL_AS_UTF8(u8s, 6, 7, 8, b3);
+
+ u8s[9] = '\0';
+ *state = U8_STATE_HANGUL_LVT;
+ return (9);
+ }
+
+ u8s[6] = '\0';
+ *state = U8_STATE_HANGUL_LV;
+ return (6);
+ }
+
+ b2 = u8s[0] = s[0];
+ b3 = u8s[1] = s[1];
+ b4 = u8s[2] = s[2];
+ u8s[3] = '\0';
+
+ /*
+ * If this is a Hangul Jamo, we know there is nothing
+ * further that we can decompose.
+ */
+ if (U8_HANGUL_JAMO_L(u1)) {
+ *state = U8_STATE_HANGUL_L;
+ return (3);
+ }
+
+ if (U8_HANGUL_JAMO_V(u1)) {
+ if (*state == U8_STATE_HANGUL_L)
+ *state = U8_STATE_HANGUL_LV;
+ else
+ *state = U8_STATE_HANGUL_V;
+ return (3);
+ }
+
+ if (U8_HANGUL_JAMO_T(u1)) {
+ if (*state == U8_STATE_HANGUL_LV)
+ *state = U8_STATE_HANGUL_LVT;
+ else
+ *state = U8_STATE_HANGUL_T;
+ return (3);
+ }
+ } else if (sz == 4) {
+ b1 = u8s[0] = s[0];
+ b2 = u8s[1] = s[1];
+ b3 = u8s[2] = s[2];
+ b4 = u8s[3] = s[3];
+ u8s[4] = '\0';
+ } else {
+ /*
+ * This is a fallback and should not happen if the function
+ * was called properly.
+ */
+ u8s[0] = s[0];
+ u8s[1] = '\0';
+ *state = U8_STATE_START;
+ return (1);
+ }
+
+ /*
+ * At this point, this routine does not know what it will get.
+ * The caller should sort it out if the state isn't a Hangul one.
+ */
+ *state = U8_STATE_START;
+
+ /* Try to find matching decomposition mapping byte sequence. */
+ b1 = u8_common_b1_tbl[uv][b1];
+ if (b1 == U8_TBL_ELEMENT_NOT_DEF)
+ return ((size_t)sz);
+
+ b2 = u8_decomp_b2_tbl[uv][b1][b2];
+ if (b2 == U8_TBL_ELEMENT_NOT_DEF)
+ return ((size_t)sz);
+
+ b3_tbl = u8_decomp_b3_tbl[uv][b2][b3].tbl_id;
+ if (b3_tbl == U8_TBL_ELEMENT_NOT_DEF)
+ return ((size_t)sz);
+
+ /*
+ * If b3_tbl is bigger than or equal to U8_16BIT_TABLE_INDICATOR
+ * which is 0x8000, this means we couldn't fit the mappings into
+ * the cardinality of an unsigned byte.
+ */
+ if (b3_tbl >= U8_16BIT_TABLE_INDICATOR) {
+ b3_tbl -= U8_16BIT_TABLE_INDICATOR;
+ start_id = u8_decomp_b4_16bit_tbl[uv][b3_tbl][b4];
+ end_id = u8_decomp_b4_16bit_tbl[uv][b3_tbl][b4 + 1];
+ } else {
+ start_id = u8_decomp_b4_tbl[uv][b3_tbl][b4];
+ end_id = u8_decomp_b4_tbl[uv][b3_tbl][b4 + 1];
+ }
+
+ /* This also means there wasn't any matching decomposition. */
+ if (start_id >= end_id)
+ return ((size_t)sz);
+
+ /*
+ * The final table for decomposition mappings has three types of
+ * byte sequences depending on whether a mapping is for compatibility
+ * decomposition, canonical decomposition, or both like the following:
+ *
+ * (1) Compatibility decomposition mappings:
+ *
+ * +---+---+-...-+---+
+ * | B0| B1| ... | Bm|
+ * +---+---+-...-+---+
+ *
+ * The first byte, B0, is always less than 0xF5 (U8_DECOMP_BOTH).
+ *
+ * (2) Canonical decomposition mappings:
+ *
+ * +---+---+---+-...-+---+
+ * | T | b0| b1| ... | bn|
+ * +---+---+---+-...-+---+
+ *
+ * where the first byte, T, is 0xF6 (U8_DECOMP_CANONICAL).
+ *
+ * (3) Both mappings:
+ *
+ * +---+---+---+---+-...-+---+---+---+-...-+---+
+ * | T | D | b0| b1| ... | bn| B0| B1| ... | Bm|
+ * +---+---+---+---+-...-+---+---+---+-...-+---+
+ *
+ * where T is 0xF5 (U8_DECOMP_BOTH) and D is a displacement
+ * byte, b0 to bn are canonical mapping bytes and B0 to Bm are
+ * compatibility mapping bytes.
+ *
+ * Note that compatibility decomposition means doing recursive
+ * decompositions using both compatibility decomposition mappings and
+ * canonical decomposition mappings. On the other hand, canonical
+ * decomposition means doing recursive decompositions using only
+ * canonical decomposition mappings. Since the table we have has gone
+ * through the recursions already, we do not need to do so during
+ * runtime, i.e., the table has been completely flattened out
+ * already.
+ */
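+
+	/*
+	 * For instance (purely illustrative): a "both" mapping whose
+	 * canonical decomposition is the two bytes b0 b1 and whose
+	 * compatibility decomposition is B0 B1 B2 would be stored as
+	 *
+	 *	F5 03 b0 b1 B0 B1 B2
+	 *
+	 * where the displacement D = 3 steps from the D byte past the
+	 * canonical bytes to B0, matching the adjustments below.
+	 */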
+
+ b3_base = u8_decomp_b3_tbl[uv][b2][b3].base;
+
+ /* Get the type, T, of the byte sequence. */
+ b1 = u8_decomp_final_tbl[uv][b3_base + start_id];
+
+ /*
+ * If necessary, adjust start_id, end_id, or both. Note that if
+ * this is compatibility decomposition mapping, there is no
+ * adjustment.
+ */
+ if (canonical_decomposition) {
+ /* Is the mapping only for compatibility decomposition? */
+ if (b1 < U8_DECOMP_BOTH)
+ return ((size_t)sz);
+
+ start_id++;
+
+ if (b1 == U8_DECOMP_BOTH) {
+ end_id = start_id +
+ u8_decomp_final_tbl[uv][b3_base + start_id];
+ start_id++;
+ }
+ } else {
+ /*
+ * Unless this is a compatibility decomposition mapping,
+ * we adjust the start_id.
+ */
+ if (b1 == U8_DECOMP_BOTH) {
+ start_id++;
+ start_id += u8_decomp_final_tbl[uv][b3_base + start_id];
+ } else if (b1 == U8_DECOMP_CANONICAL) {
+ start_id++;
+ }
+ }
+
+ for (i = 0; start_id < end_id; start_id++)
+ u8s[i++] = u8_decomp_final_tbl[uv][b3_base + start_id];
+ u8s[i] = '\0';
+
+ return (i);
+}
+
+/*
+ * The find_composition_start() function uses the given character bytes to
+ * find the matching composition mappings, if any, and returns the address
+ * of the composition mappings as explained in do_composition().
+ */
+static uchar_t *
+find_composition_start(size_t uv, uchar_t *s, size_t sz)
+{
+ uint16_t b1 = 0;
+ uint16_t b2 = 0;
+ uint16_t b3 = 0;
+ uint16_t b3_tbl;
+ uint16_t b3_base;
+ uint16_t b4 = 0;
+ size_t start_id;
+ size_t end_id;
+
+ if (sz == 1) {
+ b4 = s[0];
+ } else if (sz == 2) {
+ b3 = s[0];
+ b4 = s[1];
+ } else if (sz == 3) {
+ b2 = s[0];
+ b3 = s[1];
+ b4 = s[2];
+ } else if (sz == 4) {
+ b1 = s[0];
+ b2 = s[1];
+ b3 = s[2];
+ b4 = s[3];
+ } else {
+ /*
+ * This is a fallback and should not happen if the function
+ * was called properly.
+ */
+ return (NULL);
+ }
+
+ b1 = u8_composition_b1_tbl[uv][b1];
+ if (b1 == U8_TBL_ELEMENT_NOT_DEF)
+ return (NULL);
+
+ b2 = u8_composition_b2_tbl[uv][b1][b2];
+ if (b2 == U8_TBL_ELEMENT_NOT_DEF)
+ return (NULL);
+
+ b3_tbl = u8_composition_b3_tbl[uv][b2][b3].tbl_id;
+ if (b3_tbl == U8_TBL_ELEMENT_NOT_DEF)
+ return (NULL);
+
+ if (b3_tbl >= U8_16BIT_TABLE_INDICATOR) {
+ b3_tbl -= U8_16BIT_TABLE_INDICATOR;
+ start_id = u8_composition_b4_16bit_tbl[uv][b3_tbl][b4];
+ end_id = u8_composition_b4_16bit_tbl[uv][b3_tbl][b4 + 1];
+ } else {
+ start_id = u8_composition_b4_tbl[uv][b3_tbl][b4];
+ end_id = u8_composition_b4_tbl[uv][b3_tbl][b4 + 1];
+ }
+
+ if (start_id >= end_id)
+ return (NULL);
+
+ b3_base = u8_composition_b3_tbl[uv][b2][b3].base;
+
+ return ((uchar_t *)&(u8_composition_final_tbl[uv][b3_base + start_id]));
+}
+
+/*
+ * The blocked() function checks on the combining class values of previous
+ * characters in this sequence and returns whether it is blocked or not.
+ */
+static boolean_t
+blocked(uchar_t *comb_class, size_t last)
+{
+ uchar_t my_comb_class;
+ size_t i;
+
+ my_comb_class = comb_class[last];
+ for (i = 1; i < last; i++)
+ if (comb_class[i] >= my_comb_class ||
+ comb_class[i] == U8_COMBINING_CLASS_STARTER)
+ return (B_TRUE);
+
+ return (B_FALSE);
+}
+
+/*
+ * The do_composition() function reads the character string pointed to by
+ * 's', does any necessary canonical composition, and then copies the result
+ * back to 's'.
+ *
+ * The input argument 's' cannot contain more than 32 characters.
+ */
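+
+/*
+ * For example (illustrative): given the decomposed pair U+0065 U+0301
+ * ("e" followed by COMBINING ACUTE ACCENT, UTF-8 0x65 0xCC 0x81),
+ * canonical composition collapses the two characters into the single
+ * composite U+00E9 (UTF-8 0xC3 0xA9). Composition never increases the
+ * number of characters in the sequence.
+ */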
+static size_t
+do_composition(size_t uv, uchar_t *s, uchar_t *comb_class, uchar_t *start,
+ uchar_t *disp, size_t last, uchar_t **os, uchar_t *oslast)
+{
+ uchar_t t[U8_STREAM_SAFE_TEXT_MAX + 1];
+ uchar_t tc[U8_MB_CUR_MAX];
+ uint8_t saved_marks[U8_MAX_CHARS_A_SEQ];
+ size_t saved_marks_count;
+ uchar_t *p;
+ uchar_t *saved_p;
+ uchar_t *q;
+ size_t i;
+ size_t saved_i;
+ size_t j;
+ size_t k;
+ size_t l;
+ size_t C;
+ size_t saved_l;
+ size_t size;
+ uint32_t u1;
+ uint32_t u2;
+ boolean_t match_not_found = B_TRUE;
+
+ /*
+ * This should never happen unless the callers are doing some strange
+ * and unexpected things.
+ *
+ * The "last" is the index pointing to the last character not last + 1.
+ */
+ if (last >= U8_MAX_CHARS_A_SEQ)
+ last = U8_UPPER_LIMIT_IN_A_SEQ;
+
+ for (i = l = 0; i <= last; i++) {
+ /*
+ * For the last character or any non-Starters at the beginning,
+ * we don't have any chance to do composition, so we just copy
+ * them to the temporary buffer.
+ */
+ if (i >= last || comb_class[i] != U8_COMBINING_CLASS_STARTER) {
+SAVE_THE_CHAR:
+ p = s + start[i];
+ size = disp[i];
+ for (k = 0; k < size; k++)
+ t[l++] = *p++;
+ continue;
+ }
+
+ /*
+ * If this could be the start of Hangul Jamos, we try to
+ * conjoin them.
+ */
+ if (s[start[i]] == U8_HANGUL_JAMO_1ST_BYTE) {
+ U8_PUT_3BYTES_INTO_UTF32(u1, s[start[i]],
+ s[start[i] + 1], s[start[i] + 2]);
+ U8_PUT_3BYTES_INTO_UTF32(u2, s[start[i] + 3],
+ s[start[i] + 4], s[start[i] + 5]);
+
+ if (U8_HANGUL_JAMO_L(u1) && U8_HANGUL_JAMO_V(u2)) {
+ u1 -= U8_HANGUL_JAMO_L_FIRST;
+ u2 -= U8_HANGUL_JAMO_V_FIRST;
+ u1 = U8_HANGUL_SYL_FIRST +
+ (u1 * U8_HANGUL_V_COUNT + u2) *
+ U8_HANGUL_T_COUNT;
+
+ i += 2;
+ if (i <= last) {
+ U8_PUT_3BYTES_INTO_UTF32(u2,
+ s[start[i]], s[start[i] + 1],
+ s[start[i] + 2]);
+
+ if (U8_HANGUL_JAMO_T(u2)) {
+ u1 += u2 -
+ U8_HANGUL_JAMO_T_FIRST;
+ i++;
+ }
+ }
+
+ U8_SAVE_HANGUL_AS_UTF8(t + l, 0, 1, 2, u1);
+ i--;
+ l += 3;
+ continue;
+ }
+ }
+
+ /*
+ * Let's then find out if this Starter has a composition
+ * mapping.
+ */
+ p = find_composition_start(uv, s + start[i], disp[i]);
+ if (p == NULL)
+ goto SAVE_THE_CHAR;
+
+ /*
+ * We have a Starter with a composition mapping and the next
+ * character is a non-Starter. Let's try to find out if
+ * we can do composition.
+ */
+
+ saved_p = p;
+ saved_i = i;
+ saved_l = l;
+ saved_marks_count = 0;
+
+TRY_THE_NEXT_MARK:
+ q = s + start[++i];
+ size = disp[i];
+
+ /*
+ * The next for() loop compares the non-Starter pointed to by
+ * 'q' with the possible (joinable) characters pointed to by 'p'.
+ *
+ * The composition final table entry pointed to by 'p'
+ * looks like the following:
+ *
+ * +---+---+---+-...-+---+---+---+---+-...-+---+---+
+ * | C | b0| b1| ... | bn| F | B0| B1| ... | Bm| F |
+ * +---+---+---+-...-+---+---+---+---+-...-+---+---+
+ *
+ * where C is the count byte indicating the number of
+ * mapping pairs, where each pair looks like
+ * (b0-bn F, B0-Bm F). The b0-bn are the bytes of the second
+ * character of a canonical decomposition and the B0-Bm are
+ * the bytes of a matching composite character. The F is
+ * a filler byte after each character, used as the separator.
+ */
+
+ match_not_found = B_TRUE;
+
+ for (C = *p++; C > 0; C--) {
+ for (k = 0; k < size; p++, k++)
+ if (*p != q[k])
+ break;
+
+ /* Have we found it? */
+ if (k >= size && *p == U8_TBL_ELEMENT_FILLER) {
+ match_not_found = B_FALSE;
+
+ l = saved_l;
+
+ while (*++p != U8_TBL_ELEMENT_FILLER)
+ t[l++] = *p;
+
+ break;
+ }
+
+ /* We didn't find it; skip to the next pair. */
+ if (*p != U8_TBL_ELEMENT_FILLER)
+ while (*++p != U8_TBL_ELEMENT_FILLER)
+ ;
+ while (*++p != U8_TBL_ELEMENT_FILLER)
+ ;
+ p++;
+ }
+
+ /*
+ * If there was no match, we need to save the combining
+ * mark for later appending. After that, if the next one
+ * is a non-Starter and not blocked, we try once
+ * again to do composition with the next non-Starter.
+ *
+ * If there was no match and this was a Starter,
+ * this is a new start.
+ *
+ * If there was a match and a composition was done and we
+ * have more to check, we retrieve a new composition final
+ * table entry for the composite and then try to do the
+ * composition again.
+ */
+
+ if (match_not_found) {
+ if (comb_class[i] == U8_COMBINING_CLASS_STARTER) {
+ i--;
+ goto SAVE_THE_CHAR;
+ }
+
+ saved_marks[saved_marks_count++] = i;
+ }
+
+ if (saved_l == l) {
+ while (i < last) {
+ if (blocked(comb_class, i + 1))
+ saved_marks[saved_marks_count++] = ++i;
+ else
+ break;
+ }
+ if (i < last) {
+ p = saved_p;
+ goto TRY_THE_NEXT_MARK;
+ }
+ } else if (i < last) {
+ p = find_composition_start(uv, t + saved_l,
+ l - saved_l);
+ if (p != NULL) {
+ saved_p = p;
+ goto TRY_THE_NEXT_MARK;
+ }
+ }
+
+ /*
+ * There is no more composition possible.
+ *
+ * If there was no composition whatsoever, we copy
+ * over the original Starter and then append any remaining
+ * non-Starters to the target string sequentially after that.
+ */
+
+ if (saved_l == l) {
+ p = s + start[saved_i];
+ size = disp[saved_i];
+ for (j = 0; j < size; j++)
+ t[l++] = *p++;
+ }
+
+ for (k = 0; k < saved_marks_count; k++) {
+ p = s + start[saved_marks[k]];
+ size = disp[saved_marks[k]];
+ for (j = 0; j < size; j++)
+ t[l++] = *p++;
+ }
+ }
+
+ /*
+ * If the last character is a Starter and if we have a character
+ * (possibly another Starter) that can be turned into a composite,
+ * we do so, repeating until no more composition is
+ * possible.
+ */
+ if (comb_class[last] == U8_COMBINING_CLASS_STARTER) {
+ p = *os;
+ saved_l = l - disp[last];
+
+ while (p < oslast) {
+ size = u8_number_of_bytes[*p];
+ if (size <= 1 || (p + size) > oslast)
+ break;
+
+ saved_p = p;
+
+ for (i = 0; i < size; i++)
+ tc[i] = *p++;
+
+ q = find_composition_start(uv, t + saved_l,
+ l - saved_l);
+ if (q == NULL) {
+ p = saved_p;
+ break;
+ }
+
+ match_not_found = B_TRUE;
+
+ for (C = *q++; C > 0; C--) {
+ for (k = 0; k < size; q++, k++)
+ if (*q != tc[k])
+ break;
+
+ if (k >= size && *q == U8_TBL_ELEMENT_FILLER) {
+ match_not_found = B_FALSE;
+
+ l = saved_l;
+
+ while (*++q != U8_TBL_ELEMENT_FILLER) {
+ /*
+ * This is practically
+ * impossible but we don't
+ * want to take any chances.
+ */
+ if (l >=
+ U8_STREAM_SAFE_TEXT_MAX) {
+ p = saved_p;
+ goto SAFE_RETURN;
+ }
+ t[l++] = *q;
+ }
+
+ break;
+ }
+
+ if (*q != U8_TBL_ELEMENT_FILLER)
+ while (*++q != U8_TBL_ELEMENT_FILLER)
+ ;
+ while (*++q != U8_TBL_ELEMENT_FILLER)
+ ;
+ q++;
+ }
+
+ if (match_not_found) {
+ p = saved_p;
+ break;
+ }
+ }
+SAFE_RETURN:
+ *os = p;
+ }
+
+ /*
+ * Now we copy over the temporary string to the target string.
+ * Since composition always reduces the number of characters or
+ * keeps it the same, we don't need to worry about
+ * buffer overflow here.
+ */
+ for (i = 0; i < l; i++)
+ s[i] = t[i];
+ s[l] = '\0';
+
+ return (l);
+}
+
+/*
+ * The collect_a_seq() function checks the given string s, collects
+ * a sequence of characters at u8s, and returns the sequence. While it
+ * collects a sequence, it also applies case conversion, canonical or
+ * compatibility decomposition, and canonical composition, or some or all
+ * of them, in that order.
+ *
+ * The collected sequence cannot be bigger than 32 characters: if it
+ * would have more than 31 characters, the sequence is terminated
+ * with a U+034F COMBINING GRAPHEME JOINER (CGJ) character and turned into
+ * a Stream-Safe Text. The collected sequence is always terminated with
+ * a null byte, and the return value is the byte length of the sequence
+ * (possibly 0), not including the terminating null byte.
+ */
+static size_t
+collect_a_seq(size_t uv, uchar_t *u8s, uchar_t **source, uchar_t *slast,
+ boolean_t is_it_toupper,
+ boolean_t is_it_tolower,
+ boolean_t canonical_decomposition,
+ boolean_t compatibility_decomposition,
+ boolean_t canonical_composition,
+ int *errnum, u8_normalization_states_t *state)
+{
+ uchar_t *s;
+ int sz;
+ int saved_sz;
+ size_t i;
+ size_t j;
+ size_t k;
+ size_t l;
+ uchar_t comb_class[U8_MAX_CHARS_A_SEQ];
+ uchar_t disp[U8_MAX_CHARS_A_SEQ];
+ uchar_t start[U8_MAX_CHARS_A_SEQ];
+ uchar_t u8t[U8_MB_CUR_MAX];
+ uchar_t uts[U8_STREAM_SAFE_TEXT_MAX + 1];
+ uchar_t tc;
+ size_t last;
+ size_t saved_last;
+ uint32_t u1;
+
+ /*
+ * Save the source string pointer; we will return an updated
+ * pointer through *source if we do any processing.
+ */
+ s = *source;
+
+ /*
+ * The following is a fallback just in case callers do not check
+ * the string boundaries before calling.
+ */
+ if (s >= slast) {
+ u8s[0] = '\0';
+
+ return (0);
+ }
+
+ /*
+ * First, let's collect a character and do case conversion
+ * if necessary.
+ */
+
+ sz = u8_number_of_bytes[*s];
+
+ if (sz < 0) {
+ *errnum = EILSEQ;
+
+ u8s[0] = *s++;
+ u8s[1] = '\0';
+
+ *source = s;
+
+ return (1);
+ }
+
+ if (sz == 1) {
+ if (is_it_toupper)
+ u8s[0] = U8_ASCII_TOUPPER(*s);
+ else if (is_it_tolower)
+ u8s[0] = U8_ASCII_TOLOWER(*s);
+ else
+ u8s[0] = *s;
+ s++;
+ u8s[1] = '\0';
+ } else if ((s + sz) > slast) {
+ *errnum = EINVAL;
+
+ for (i = 0; s < slast; )
+ u8s[i++] = *s++;
+ u8s[i] = '\0';
+
+ *source = s;
+
+ return (i);
+ } else {
+ if (is_it_toupper || is_it_tolower) {
+ i = do_case_conv(uv, u8s, s, sz, is_it_toupper);
+ s += sz;
+ sz = i;
+ } else {
+ for (i = 0; i < sz; )
+ u8s[i++] = *s++;
+ u8s[i] = '\0';
+ }
+ }
+
+ /*
+ * Then do canonical/compatibility decomposition followed by an
+ * optional canonical composition. Note that canonical
+ * composition is done only when a decomposition has been done.
+ */
+ if (canonical_decomposition || compatibility_decomposition) {
+ if (sz == 1) {
+ *state = U8_STATE_START;
+
+ saved_sz = 1;
+
+ comb_class[0] = 0;
+ start[0] = 0;
+ disp[0] = 1;
+
+ last = 1;
+ } else {
+ saved_sz = do_decomp(uv, u8s, u8s, sz,
+ canonical_decomposition, state);
+
+ last = 0;
+
+ for (i = 0; i < saved_sz; ) {
+ sz = u8_number_of_bytes[u8s[i]];
+
+ comb_class[last] = combining_class(uv,
+ u8s + i, sz);
+ start[last] = i;
+ disp[last] = sz;
+
+ last++;
+ i += sz;
+ }
+
+ /*
+ * Decomposition yields various Hangul-related
+ * states but none for combining marks. We need
+ * to find that out here by checking the last
+ * character.
+ */
+ if (*state == U8_STATE_START) {
+ if (comb_class[last - 1])
+ *state = U8_STATE_COMBINING_MARK;
+ }
+ }
+
+ saved_last = last;
+
+ while (s < slast) {
+ sz = u8_number_of_bytes[*s];
+
+ /*
+ * If this is an illegal character, an incomplete
+ * character, or a 7-bit ASCII Starter character,
+ * then we have collected a sequence; break and let
+ * the next call deal with those cases.
+ *
+ * Note that this is okay only if you are using this
+ * function on a fixed-length string, not on a buffer
+ * processed with multiple calls of one chunk at a time.
+ */
+ if (sz <= 1) {
+ break;
+ } else if ((s + sz) > slast) {
+ break;
+ } else {
+ /*
+ * If the previous character was a Hangul Jamo
+ * and this character is a Hangul Jamo that
+ * can be conjoined, we collect the Jamo.
+ */
+ if (*s == U8_HANGUL_JAMO_1ST_BYTE) {
+ U8_PUT_3BYTES_INTO_UTF32(u1,
+ *s, *(s + 1), *(s + 2));
+
+ if (U8_HANGUL_COMPOSABLE_L_V(*state,
+ u1)) {
+ i = 0;
+ *state = U8_STATE_HANGUL_LV;
+ goto COLLECT_A_HANGUL;
+ }
+
+ if (U8_HANGUL_COMPOSABLE_LV_T(*state,
+ u1)) {
+ i = 0;
+ *state = U8_STATE_HANGUL_LVT;
+ goto COLLECT_A_HANGUL;
+ }
+ }
+
+ /*
+ * Regardless of what it was, if this is a
+ * Starter, we don't collect the character
+ * since that's a new start and we will deal
+ * with it next time.
+ */
+ i = combining_class(uv, s, sz);
+ if (i == U8_COMBINING_CLASS_STARTER)
+ break;
+
+ /*
+ * We know the current character is a combining
+ * mark. If the previous character wasn't
+ * a Starter (and not Hangul) or a combining mark,
+ * then we don't collect this combining mark.
+ */
+ if (*state != U8_STATE_START &&
+ *state != U8_STATE_COMBINING_MARK)
+ break;
+
+ *state = U8_STATE_COMBINING_MARK;
+COLLECT_A_HANGUL:
+ /*
+ * If we have collected a Starter and up to 30
+ * combining marks, i.e., 31 characters in total,
+ * then we terminate this degenerately long
+ * combining sequence with a U+034F COMBINING
+ * GRAPHEME JOINER (CGJ), which is 0xCD 0x8F in
+ * UTF-8, and turn it into Stream-Safe Text.
+ * This will be extremely rare but is possible.
+ *
+ * The following also guarantees that we do not
+ * write more than 32 characters plus a NULL
+ * into u8s[].
+ */
+ if (last >= U8_UPPER_LIMIT_IN_A_SEQ) {
+TURN_STREAM_SAFE:
+ *state = U8_STATE_START;
+ comb_class[last] = 0;
+ start[last] = saved_sz;
+ disp[last] = 2;
+ last++;
+
+ u8s[saved_sz++] = 0xCD;
+ u8s[saved_sz++] = 0x8F;
+
+ break;
+ }
+
+ /*
+ * Some combining marks also decompose into
+ * another combining mark or marks.
+ */
+ if (*state == U8_STATE_COMBINING_MARK) {
+ k = last;
+ l = sz;
+ i = do_decomp(uv, uts, s, sz,
+ canonical_decomposition, state);
+ for (j = 0; j < i; ) {
+ sz = u8_number_of_bytes[uts[j]];
+
+ comb_class[last] =
+ combining_class(uv,
+ uts + j, sz);
+ start[last] = saved_sz + j;
+ disp[last] = sz;
+
+ last++;
+ if (last >=
+ U8_UPPER_LIMIT_IN_A_SEQ) {
+ last = k;
+ goto TURN_STREAM_SAFE;
+ }
+ j += sz;
+ }
+
+ *state = U8_STATE_COMBINING_MARK;
+ sz = i;
+ s += l;
+
+ for (i = 0; i < sz; i++)
+ u8s[saved_sz++] = uts[i];
+ } else {
+ comb_class[last] = i;
+ start[last] = saved_sz;
+ disp[last] = sz;
+ last++;
+
+ for (i = 0; i < sz; i++)
+ u8s[saved_sz++] = *s++;
+ }
+
+ /*
+ * If this is U+0345 COMBINING GREEK
+ * YPOGEGRAMMENI (0xCD 0x85 in UTF-8), a.k.a.
+ * iota subscript, and it needs to be converted
+ * to an uppercase letter, convert it to U+0399
+ * GREEK CAPITAL LETTER IOTA (0xCE 0x99 in
+ * UTF-8), i.e., to the capital adscript form as
+ * specified in the Unicode standard.
+ *
+ * This is the only special case of (ambiguous)
+ * case conversion for combining marks, and the
+ * standard will probably never have anything
+ * similar in the future.
+ */
+ if (is_it_toupper && sz >= 2 &&
+ u8s[saved_sz - 2] == 0xCD &&
+ u8s[saved_sz - 1] == 0x85) {
+ u8s[saved_sz - 2] = 0xCE;
+ u8s[saved_sz - 1] = 0x99;
+ }
+ }
+ }
+
+ /*
+ * Let's try to ensure a canonical ordering for the collected
+ * combining marks. We do this only if we have collected
+ * at least one more non-Starter. (The decomposition mapping
+ * data tables have fully (and recursively) expanded and
+ * canonically ordered decompositions.)
+ *
+ * The U8_SWAP_COMB_MARKS() convenience macro makes some
+ * assumptions, and we meet those assumptions here.
+ */
+ last--;
+ if (last >= saved_last) {
+ for (i = 0; i < last; i++)
+ for (j = last; j > i; j--)
+ if (comb_class[j] &&
+ comb_class[j - 1] > comb_class[j]) {
+ U8_SWAP_COMB_MARKS(j - 1, j);
+ }
+ }
+
+ *source = s;
+
+ if (! canonical_composition) {
+ u8s[saved_sz] = '\0';
+ return (saved_sz);
+ }
+
+ /*
+ * Now do the canonical composition. Note that we do this
+ * only after a canonical or compatibility decomposition to
+ * finish up NFC or NFKC.
+ */
+ sz = do_composition(uv, u8s, comb_class, start, disp, last,
+ &s, slast);
+ }
+
+ *source = s;
+
+ return ((size_t)sz);
+}
+
+/*
+ * The do_norm_compare() function does string comparison based on Unicode
+ * simple case mappings and Unicode Normalization definitions.
+ *
+ * It does so by collecting one sequence of characters at a time from each
+ * string and comparing the collected sequences.
+ *
+ * The meaning of the return values is the same as for the usual strcmp().
+ */
+static int
+do_norm_compare(size_t uv, uchar_t *s1, uchar_t *s2, size_t n1, size_t n2,
+ int flag, int *errnum)
+{
+ int result;
+ size_t sz1;
+ size_t sz2;
+ uchar_t u8s1[U8_STREAM_SAFE_TEXT_MAX + 1];
+ uchar_t u8s2[U8_STREAM_SAFE_TEXT_MAX + 1];
+ uchar_t *s1last;
+ uchar_t *s2last;
+ boolean_t is_it_toupper;
+ boolean_t is_it_tolower;
+ boolean_t canonical_decomposition;
+ boolean_t compatibility_decomposition;
+ boolean_t canonical_composition;
+ u8_normalization_states_t state;
+
+ s1last = s1 + n1;
+ s2last = s2 + n2;
+
+ is_it_toupper = flag & U8_TEXTPREP_TOUPPER;
+ is_it_tolower = flag & U8_TEXTPREP_TOLOWER;
+ canonical_decomposition = flag & U8_CANON_DECOMP;
+ compatibility_decomposition = flag & U8_COMPAT_DECOMP;
+ canonical_composition = flag & U8_CANON_COMP;
+
+ while (s1 < s1last && s2 < s2last) {
+ /*
+ * If the current character is a 7-bit ASCII character and is
+ * the last character, or if the current character and the
+ * next character are both 7-bit ASCII characters, then we
+ * treat the current character as a sequence by itself.
+ *
+ * In all other cases, we need to call collect_a_seq().
+ */
+
+ if (U8_ISASCII(*s1) && ((s1 + 1) >= s1last ||
+ ((s1 + 1) < s1last && U8_ISASCII(*(s1 + 1))))) {
+ if (is_it_toupper)
+ u8s1[0] = U8_ASCII_TOUPPER(*s1);
+ else if (is_it_tolower)
+ u8s1[0] = U8_ASCII_TOLOWER(*s1);
+ else
+ u8s1[0] = *s1;
+ u8s1[1] = '\0';
+ sz1 = 1;
+ s1++;
+ } else {
+ state = U8_STATE_START;
+ sz1 = collect_a_seq(uv, u8s1, &s1, s1last,
+ is_it_toupper, is_it_tolower,
+ canonical_decomposition,
+ compatibility_decomposition,
+ canonical_composition, errnum, &state);
+ }
+
+ if (U8_ISASCII(*s2) && ((s2 + 1) >= s2last ||
+ ((s2 + 1) < s2last && U8_ISASCII(*(s2 + 1))))) {
+ if (is_it_toupper)
+ u8s2[0] = U8_ASCII_TOUPPER(*s2);
+ else if (is_it_tolower)
+ u8s2[0] = U8_ASCII_TOLOWER(*s2);
+ else
+ u8s2[0] = *s2;
+ u8s2[1] = '\0';
+ sz2 = 1;
+ s2++;
+ } else {
+ state = U8_STATE_START;
+ sz2 = collect_a_seq(uv, u8s2, &s2, s2last,
+ is_it_toupper, is_it_tolower,
+ canonical_decomposition,
+ compatibility_decomposition,
+ canonical_composition, errnum, &state);
+ }
+
+ /*
+ * Now compare the two characters. If they are the same,
+ * we move on to the next character sequences.
+ */
+ if (sz1 == 1 && sz2 == 1) {
+ if (*u8s1 > *u8s2)
+ return (1);
+ if (*u8s1 < *u8s2)
+ return (-1);
+ } else {
+ result = strcmp((const char *)u8s1, (const char *)u8s2);
+ if (result != 0)
+ return (result);
+ }
+ }
+
+ /*
+ * We compared up to the end of one or both strings.
+ *
+ * If we reached or went past the ends of both, the strings
+ * are the same.
+ *
+ * If we reached only one end, the other string still has
+ * something left, which determines the return value.
+ */
+ if (s1 >= s1last) {
+ if (s2 >= s2last)
+ return (0);
+ return (-1);
+ }
+ return (1);
+}
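+
+/*
+ * For illustration (an example chosen here, not part of the original
+ * comments): with U8_CANON_DECOMP | U8_CANON_COMP requested, the
+ * precomposed "\xc3\xa9" (U+00E9) and the decomposed "e\xcc\x81"
+ * (U+0065 U+0301) collect to identical sequences and therefore
+ * compare equal.
+ */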
+
+/*
+ * The u8_strcmp() function compares two UTF-8 strings much like strcmp().
+ * For the comparison, however, equivalency based on Unicode Normalization
+ * and on Unicode simple case conversion mappings can be requested and
+ * checked against.
+ */
+int
+u8_strcmp(const char *s1, const char *s2, size_t n, int flag, size_t uv,
+ int *errnum)
+{
+ int f;
+ size_t n1;
+ size_t n2;
+
+ *errnum = 0;
+
+ /*
+ * Check the requested Unicode version, case conversion, and
+ * normalization flag values.
+ */
+
+ if (uv > U8_UNICODE_LATEST) {
+ *errnum = ERANGE;
+ uv = U8_UNICODE_LATEST;
+ }
+
+ if (flag == 0) {
+ flag = U8_STRCMP_CS;
+ } else {
+ f = flag & (U8_STRCMP_CS | U8_STRCMP_CI_UPPER |
+ U8_STRCMP_CI_LOWER);
+ if (f == 0) {
+ flag |= U8_STRCMP_CS;
+ } else if (f != U8_STRCMP_CS && f != U8_STRCMP_CI_UPPER &&
+ f != U8_STRCMP_CI_LOWER) {
+ *errnum = EBADF;
+ flag = U8_STRCMP_CS;
+ }
+
+ f = flag & (U8_CANON_DECOMP | U8_COMPAT_DECOMP | U8_CANON_COMP);
+ if (f && f != U8_STRCMP_NFD && f != U8_STRCMP_NFC &&
+ f != U8_STRCMP_NFKD && f != U8_STRCMP_NFKC) {
+ *errnum = EBADF;
+ flag = U8_STRCMP_CS;
+ }
+ }
+
+ if (flag == U8_STRCMP_CS) {
+ return (n == 0 ? strcmp(s1, s2) : strncmp(s1, s2, n));
+ }
+
+ n1 = strlen(s1);
+ n2 = strlen(s2);
+ if (n != 0) {
+ if (n < n1)
+ n1 = n;
+ if (n < n2)
+ n2 = n;
+ }
+
+ /*
+ * Simple case conversion can be done much faster, so we handle
+ * it separately here.
+ */
+ if (flag == U8_STRCMP_CI_UPPER) {
+ return (do_case_compare(uv, (uchar_t *)s1, (uchar_t *)s2,
+ n1, n2, B_TRUE, errnum));
+ } else if (flag == U8_STRCMP_CI_LOWER) {
+ return (do_case_compare(uv, (uchar_t *)s1, (uchar_t *)s2,
+ n1, n2, B_FALSE, errnum));
+ }
+
+ return (do_norm_compare(uv, (uchar_t *)s1, (uchar_t *)s2, n1, n2,
+ flag, errnum));
+}
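+
+/*
+ * A usage sketch (illustrative only; the variable names are made up):
+ * a caller wanting a case-insensitive comparison of the NFC forms of
+ * two strings might do
+ *
+ *	int err;
+ *	int cmp = u8_strcmp(s1, s2, 0,
+ *	    U8_STRCMP_CI_UPPER | U8_STRCMP_NFC, U8_UNICODE_LATEST, &err);
+ *
+ * and then check err for EBADF, ERANGE, EILSEQ, or EINVAL before
+ * trusting cmp.
+ */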
+
+size_t
+u8_textprep_str(char *inarray, size_t *inlen, char *outarray, size_t *outlen,
+ int flag, size_t unicode_version, int *errnum)
+{
+ int f;
+ int sz;
+ uchar_t *ib;
+ uchar_t *ibtail;
+ uchar_t *ob;
+ uchar_t *obtail;
+ boolean_t do_not_ignore_null;
+ boolean_t do_not_ignore_invalid;
+ boolean_t is_it_toupper;
+ boolean_t is_it_tolower;
+ boolean_t canonical_decomposition;
+ boolean_t compatibility_decomposition;
+ boolean_t canonical_composition;
+ size_t ret_val;
+ size_t i;
+ size_t j;
+ uchar_t u8s[U8_STREAM_SAFE_TEXT_MAX + 1];
+ u8_normalization_states_t state;
+
+ if (unicode_version > U8_UNICODE_LATEST) {
+ *errnum = ERANGE;
+ return ((size_t)-1);
+ }
+
+ f = flag & (U8_TEXTPREP_TOUPPER | U8_TEXTPREP_TOLOWER);
+ if (f == (U8_TEXTPREP_TOUPPER | U8_TEXTPREP_TOLOWER)) {
+ *errnum = EBADF;
+ return ((size_t)-1);
+ }
+
+ f = flag & (U8_CANON_DECOMP | U8_COMPAT_DECOMP | U8_CANON_COMP);
+ if (f && f != U8_TEXTPREP_NFD && f != U8_TEXTPREP_NFC &&
+ f != U8_TEXTPREP_NFKD && f != U8_TEXTPREP_NFKC) {
+ *errnum = EBADF;
+ return ((size_t)-1);
+ }
+
+ if (inarray == NULL || *inlen == 0)
+ return (0);
+
+ if (outarray == NULL) {
+ *errnum = E2BIG;
+ return ((size_t)-1);
+ }
+
+ ib = (uchar_t *)inarray;
+ ob = (uchar_t *)outarray;
+ ibtail = ib + *inlen;
+ obtail = ob + *outlen;
+
+ do_not_ignore_null = !(flag & U8_TEXTPREP_IGNORE_NULL);
+ do_not_ignore_invalid = !(flag & U8_TEXTPREP_IGNORE_INVALID);
+ is_it_toupper = flag & U8_TEXTPREP_TOUPPER;
+ is_it_tolower = flag & U8_TEXTPREP_TOLOWER;
+
+ ret_val = 0;
+
+ /*
+ * If no normalization flag is set, we do the simple case
+ * conversion based text preparation separately below. Text
+ * preparation involving Normalization is done in the else
+ * block, again separately, since it takes much more time and
+ * resources than simple case conversion.
+ */
+ if (f == 0) {
+ while (ib < ibtail) {
+ if (*ib == '\0' && do_not_ignore_null)
+ break;
+
+ sz = u8_number_of_bytes[*ib];
+
+ if (sz < 0) {
+ if (do_not_ignore_invalid) {
+ *errnum = EILSEQ;
+ ret_val = (size_t)-1;
+ break;
+ }
+
+ sz = 1;
+ ret_val++;
+ }
+
+ if (sz == 1) {
+ if (ob >= obtail) {
+ *errnum = E2BIG;
+ ret_val = (size_t)-1;
+ break;
+ }
+
+ if (is_it_toupper)
+ *ob = U8_ASCII_TOUPPER(*ib);
+ else if (is_it_tolower)
+ *ob = U8_ASCII_TOLOWER(*ib);
+ else
+ *ob = *ib;
+ ib++;
+ ob++;
+ } else if ((ib + sz) > ibtail) {
+ if (do_not_ignore_invalid) {
+ *errnum = EINVAL;
+ ret_val = (size_t)-1;
+ break;
+ }
+
+ if ((obtail - ob) < (ibtail - ib)) {
+ *errnum = E2BIG;
+ ret_val = (size_t)-1;
+ break;
+ }
+
+ /*
+ * We treat the remaining incomplete character
+ * bytes as a character.
+ */
+ ret_val++;
+
+ while (ib < ibtail)
+ *ob++ = *ib++;
+ } else {
+ if (is_it_toupper || is_it_tolower) {
+ i = do_case_conv(unicode_version, u8s,
+ ib, sz, is_it_toupper);
+
+ if ((obtail - ob) < i) {
+ *errnum = E2BIG;
+ ret_val = (size_t)-1;
+ break;
+ }
+
+ ib += sz;
+
+ for (sz = 0; sz < i; sz++)
+ *ob++ = u8s[sz];
+ } else {
+ if ((obtail - ob) < sz) {
+ *errnum = E2BIG;
+ ret_val = (size_t)-1;
+ break;
+ }
+
+ for (i = 0; i < sz; i++)
+ *ob++ = *ib++;
+ }
+ }
+ }
+ } else {
+ canonical_decomposition = flag & U8_CANON_DECOMP;
+ compatibility_decomposition = flag & U8_COMPAT_DECOMP;
+ canonical_composition = flag & U8_CANON_COMP;
+
+ while (ib < ibtail) {
+ if (*ib == '\0' && do_not_ignore_null)
+ break;
+
+ /*
+ * If the current character is a 7-bit ASCII
+ * character and it is the last character, or
+ * if the current character and the next
+ * character are both 7-bit ASCII characters,
+ * then we copy this character over without
+ * going through collect_a_seq().
+ *
+ * In all other cases, we need to look further
+ * with the collect_a_seq() function.
+ */
+ if (U8_ISASCII(*ib) && ((ib + 1) >= ibtail ||
+ ((ib + 1) < ibtail && U8_ISASCII(*(ib + 1))))) {
+ if (ob >= obtail) {
+ *errnum = E2BIG;
+ ret_val = (size_t)-1;
+ break;
+ }
+
+ if (is_it_toupper)
+ *ob = U8_ASCII_TOUPPER(*ib);
+ else if (is_it_tolower)
+ *ob = U8_ASCII_TOLOWER(*ib);
+ else
+ *ob = *ib;
+ ib++;
+ ob++;
+ } else {
+ *errnum = 0;
+ state = U8_STATE_START;
+
+ j = collect_a_seq(unicode_version, u8s,
+ &ib, ibtail,
+ is_it_toupper,
+ is_it_tolower,
+ canonical_decomposition,
+ compatibility_decomposition,
+ canonical_composition,
+ errnum, &state);
+
+ if (*errnum && do_not_ignore_invalid) {
+ ret_val = (size_t)-1;
+ break;
+ }
+
+ if ((obtail - ob) < j) {
+ *errnum = E2BIG;
+ ret_val = (size_t)-1;
+ break;
+ }
+
+ for (i = 0; i < j; i++)
+ *ob++ = u8s[i];
+ }
+ }
+ }
+
+ *inlen = ibtail - ib;
+ *outlen = obtail - ob;
+
+ return (ret_val);
+}
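+
+/*
+ * A usage sketch (illustrative only; buffer names and sizes are made
+ * up): preparing a string into uppercase NFC form might look like
+ *
+ *	char out[256];
+ *	size_t inlen = strlen(in);
+ *	size_t outlen = sizeof (out);
+ *	int err;
+ *
+ *	(void) u8_textprep_str(in, &inlen, out, &outlen,
+ *	    U8_TEXTPREP_TOUPPER | U8_TEXTPREP_NFC,
+ *	    U8_UNICODE_LATEST, &err);
+ *
+ * On return, inlen holds the number of input bytes not consumed and
+ * outlen the number of output bytes still unused, so the prepared
+ * text occupies sizeof (out) - outlen bytes of out.
+ */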
diff --git a/sys/cddl/contrib/opensolaris/common/util/strtolctype.h b/sys/cddl/contrib/opensolaris/common/util/strtolctype.h
new file mode 100644
index 000000000000..91609cede4e1
--- /dev/null
+++ b/sys/cddl/contrib/opensolaris/common/util/strtolctype.h
@@ -0,0 +1,79 @@
+/*
+ * CDDL HEADER START
+ *
+ * The contents of this file are subject to the terms of the
+ * Common Development and Distribution License (the "License").
+ * You may not use this file except in compliance with the License.
+ *
+ * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
+ * or http://www.opensolaris.org/os/licensing.
+ * See the License for the specific language governing permissions
+ * and limitations under the License.
+ *
+ * When distributing Covered Code, include this CDDL HEADER in each
+ * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
+ * If applicable, add the following below this CDDL HEADER, with the
+ * fields enclosed by brackets "[]" replaced with your own identifying
+ * information: Portions Copyright [yyyy] [name of copyright owner]
+ *
+ * CDDL HEADER END
+ */
+
+/*
+ * Copyright 2009 Sun Microsystems, Inc. All rights reserved.
+ * Use is subject to license terms.
+ */
+
+/* Copyright (c) 1988 AT&T */
+/* All Rights Reserved */
+
+#ifndef _COMMON_UTIL_CTYPE_H
+#define _COMMON_UTIL_CTYPE_H
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+/*
+ * This header file contains a collection of macros that the strtou?ll?
+ * functions in common/util use to test characters. What we need is a kernel
+ * version of ctype.h.
+ *
+ * NOTE: These macros are used within several DTrace probe context functions.
+ * They must not be altered to make function calls or perform actions not
+ * safe in probe context.
+ */
+
+#if defined(illumos) && (defined(_KERNEL) || defined(_BOOT))
+
+#define isalnum(ch) (isalpha(ch) || isdigit(ch))
+#define isalpha(ch) (isupper(ch) || islower(ch))
+#define isdigit(ch) ((ch) >= '0' && (ch) <= '9')
+#define islower(ch) ((ch) >= 'a' && (ch) <= 'z')
+#define isspace(ch) (((ch) == ' ') || ((ch) == '\r') || ((ch) == '\n') || \
+ ((ch) == '\t') || ((ch) == '\f'))
+#define isupper(ch) ((ch) >= 'A' && (ch) <= 'Z')
+#define isxdigit(ch) (isdigit(ch) || ((ch) >= 'a' && (ch) <= 'f') || \
+ ((ch) >= 'A' && (ch) <= 'F'))
+
+#endif /* _KERNEL || _BOOT */
+
+#define DIGIT(x) \
+ (isdigit(x) ? (x) - '0' : islower(x) ? (x) + 10 - 'a' : (x) + 10 - 'A')
+
+#define MBASE ('z' - 'a' + 1 + 10)
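+
+/*
+ * For illustration (worked examples, not part of the original header):
+ * DIGIT('7') evaluates to 7, and DIGIT('f') and DIGIT('F') both
+ * evaluate to 15, while MBASE works out to 36, the largest base the
+ * strtou?ll? parsers can accept (digits 0-9 plus letters a-z).
+ */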
+
+/*
+ * The following macro is a version of isalnum() that limits alphabetic
+ * characters to the ranges a-z and A-Z; locale dependent characters will not
+ * return 1. The members of a-z and A-Z are assumed to be in ascending order
+ * and contiguous.
+ */
+#define lisalnum(x) \
+ (isdigit(x) || ((x) >= 'a' && (x) <= 'z') || ((x) >= 'A' && (x) <= 'Z'))
+
+#ifdef __cplusplus
+}
+#endif
+
+#endif /* _COMMON_UTIL_CTYPE_H */
diff --git a/sys/cddl/contrib/opensolaris/common/zfs/zfeature_common.c b/sys/cddl/contrib/opensolaris/common/zfs/zfeature_common.c
new file mode 100644
index 000000000000..ba79eeaaefea
--- /dev/null
+++ b/sys/cddl/contrib/opensolaris/common/zfs/zfeature_common.c
@@ -0,0 +1,310 @@
+/*
+ * CDDL HEADER START
+ *
+ * The contents of this file are subject to the terms of the
+ * Common Development and Distribution License (the "License").
+ * You may not use this file except in compliance with the License.
+ *
+ * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
+ * or http://www.opensolaris.org/os/licensing.
+ * See the License for the specific language governing permissions
+ * and limitations under the License.
+ *
+ * When distributing Covered Code, include this CDDL HEADER in each
+ * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
+ * If applicable, add the following below this CDDL HEADER, with the
+ * fields enclosed by brackets "[]" replaced with your own identifying
+ * information: Portions Copyright [yyyy] [name of copyright owner]
+ *
+ * CDDL HEADER END
+ */
+
+/*
+ * Copyright (c) 2011, 2018 by Delphix. All rights reserved.
+ * Copyright (c) 2013 by Saso Kiselkov. All rights reserved.
+ * Copyright (c) 2013, Joyent, Inc. All rights reserved.
+ * Copyright (c) 2014, Nexenta Systems, Inc. All rights reserved.
+ * Copyright (c) 2014 Integros [integros.com]
+ * Copyright (c) 2017, Intel Corporation.
+ */
+
+#ifdef _KERNEL
+#include <sys/systm.h>
+#else
+#include <errno.h>
+#include <string.h>
+#endif
+#include <sys/debug.h>
+#include <sys/fs/zfs.h>
+#include <sys/types.h>
+#include "zfeature_common.h"
+
+/*
+ * Set to disable all feature checks while opening pools, allowing pools with
+ * unsupported features to be opened. Set for testing only.
+ */
+boolean_t zfeature_checks_disable = B_FALSE;
+
+zfeature_info_t spa_feature_table[SPA_FEATURES];
+
+/*
+ * Valid characters for feature guids. This list is mainly for aesthetic
+ * purposes and could be expanded in the future. There are different allowed
+ * characters in the guid's reverse DNS portion (before the colon) and in its
+ * short name (after the colon).
+ */
+static int
+valid_char(char c, boolean_t after_colon)
+{
+ return ((c >= 'a' && c <= 'z') ||
+ (c >= '0' && c <= '9') ||
+ (after_colon && c == '_') ||
+ (!after_colon && (c == '.' || c == '-')));
+}
+
+/*
+ * Every feature guid must contain exactly one colon which separates a reverse
+ * dns organization name from the feature's "short" name (e.g.
+ * "com.company:feature_name").
+ */
+boolean_t
+zfeature_is_valid_guid(const char *name)
+{
+ int i;
+ boolean_t has_colon = B_FALSE;
+
+ i = 0;
+ while (name[i] != '\0') {
+ char c = name[i++];
+ if (c == ':') {
+ if (has_colon)
+ return (B_FALSE);
+ has_colon = B_TRUE;
+ continue;
+ }
+ if (!valid_char(c, has_colon))
+ return (B_FALSE);
+ }
+
+ return (has_colon);
+}
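+
+/*
+ * For illustration (examples chosen here, not part of the original
+ * comment): "com.company:feature_name" and "org.open-zfs:large_blocks"
+ * pass this check, while "com.company" (no colon), "a:b:c" (two
+ * colons), and "Com.Company:Feature" (uppercase letters) do not.
+ */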
+
+boolean_t
+zfeature_is_supported(const char *guid)
+{
+ if (zfeature_checks_disable)
+ return (B_TRUE);
+
+ for (spa_feature_t i = 0; i < SPA_FEATURES; i++) {
+ zfeature_info_t *feature = &spa_feature_table[i];
+ if (strcmp(guid, feature->fi_guid) == 0)
+ return (B_TRUE);
+ }
+ return (B_FALSE);
+}
+
+int
+zfeature_lookup_name(const char *name, spa_feature_t *res)
+{
+ for (spa_feature_t i = 0; i < SPA_FEATURES; i++) {
+ zfeature_info_t *feature = &spa_feature_table[i];
+ if (strcmp(name, feature->fi_uname) == 0) {
+ if (res != NULL)
+ *res = i;
+ return (0);
+ }
+ }
+
+ return (ENOENT);
+}
+
+boolean_t
+zfeature_depends_on(spa_feature_t fid, spa_feature_t check)
+{
+ zfeature_info_t *feature = &spa_feature_table[fid];
+
+ for (int i = 0; feature->fi_depends[i] != SPA_FEATURE_NONE; i++) {
+ if (feature->fi_depends[i] == check)
+ return (B_TRUE);
+ }
+ return (B_FALSE);
+}
+
+static void
+zfeature_register(spa_feature_t fid, const char *guid, const char *name,
+ const char *desc, zfeature_flags_t flags, const spa_feature_t *deps)
+{
+ zfeature_info_t *feature = &spa_feature_table[fid];
+ static spa_feature_t nodeps[] = { SPA_FEATURE_NONE };
+
+ ASSERT(name != NULL);
+ ASSERT(desc != NULL);
+ ASSERT((flags & ZFEATURE_FLAG_READONLY_COMPAT) == 0 ||
+ (flags & ZFEATURE_FLAG_MOS) == 0);
+ ASSERT3U(fid, <, SPA_FEATURES);
+ ASSERT(zfeature_is_valid_guid(guid));
+
+ if (deps == NULL)
+ deps = nodeps;
+
+ feature->fi_feature = fid;
+ feature->fi_guid = guid;
+ feature->fi_uname = name;
+ feature->fi_desc = desc;
+ feature->fi_flags = flags;
+ feature->fi_depends = deps;
+}
+
+void
+zpool_feature_init(void)
+{
+ zfeature_register(SPA_FEATURE_ASYNC_DESTROY,
+ "com.delphix:async_destroy", "async_destroy",
+ "Destroy filesystems asynchronously.",
+ ZFEATURE_FLAG_READONLY_COMPAT, NULL);
+
+ zfeature_register(SPA_FEATURE_EMPTY_BPOBJ,
+ "com.delphix:empty_bpobj", "empty_bpobj",
+ "Snapshots use less space.",
+ ZFEATURE_FLAG_READONLY_COMPAT, NULL);
+
+ zfeature_register(SPA_FEATURE_LZ4_COMPRESS,
+ "org.illumos:lz4_compress", "lz4_compress",
+ "LZ4 compression algorithm support.",
+ ZFEATURE_FLAG_ACTIVATE_ON_ENABLE, NULL);
+
+ zfeature_register(SPA_FEATURE_MULTI_VDEV_CRASH_DUMP,
+ "com.joyent:multi_vdev_crash_dump", "multi_vdev_crash_dump",
+ "Crash dumps to multiple vdev pools.",
+ 0, NULL);
+
+ zfeature_register(SPA_FEATURE_SPACEMAP_HISTOGRAM,
+ "com.delphix:spacemap_histogram", "spacemap_histogram",
+ "Spacemaps maintain space histograms.",
+ ZFEATURE_FLAG_READONLY_COMPAT, NULL);
+
+ zfeature_register(SPA_FEATURE_ENABLED_TXG,
+ "com.delphix:enabled_txg", "enabled_txg",
+ "Record txg at which a feature is enabled",
+ ZFEATURE_FLAG_READONLY_COMPAT, NULL);
+
+ static spa_feature_t hole_birth_deps[] = { SPA_FEATURE_ENABLED_TXG,
+ SPA_FEATURE_NONE };
+ zfeature_register(SPA_FEATURE_HOLE_BIRTH,
+ "com.delphix:hole_birth", "hole_birth",
+ "Retain hole birth txg for more precise zfs send",
+ ZFEATURE_FLAG_MOS | ZFEATURE_FLAG_ACTIVATE_ON_ENABLE,
+ hole_birth_deps);
+
+ zfeature_register(SPA_FEATURE_EXTENSIBLE_DATASET,
+ "com.delphix:extensible_dataset", "extensible_dataset",
+ "Enhanced dataset functionality, used by other features.",
+ 0, NULL);
+
+ static const spa_feature_t bookmarks_deps[] = {
+ SPA_FEATURE_EXTENSIBLE_DATASET,
+ SPA_FEATURE_NONE
+ };
+ zfeature_register(SPA_FEATURE_BOOKMARKS,
+ "com.delphix:bookmarks", "bookmarks",
+ "\"zfs bookmark\" command",
+ ZFEATURE_FLAG_READONLY_COMPAT, bookmarks_deps);
+
+ static const spa_feature_t filesystem_limits_deps[] = {
+ SPA_FEATURE_EXTENSIBLE_DATASET,
+ SPA_FEATURE_NONE
+ };
+ zfeature_register(SPA_FEATURE_FS_SS_LIMIT,
+ "com.joyent:filesystem_limits", "filesystem_limits",
+ "Filesystem and snapshot limits.",
+ ZFEATURE_FLAG_READONLY_COMPAT, filesystem_limits_deps);
+
+ zfeature_register(SPA_FEATURE_EMBEDDED_DATA,
+ "com.delphix:embedded_data", "embedded_data",
+ "Blocks which compress very well use even less space.",
+ ZFEATURE_FLAG_MOS | ZFEATURE_FLAG_ACTIVATE_ON_ENABLE,
+ NULL);
+
+ zfeature_register(SPA_FEATURE_POOL_CHECKPOINT,
+ "com.delphix:zpool_checkpoint", "zpool_checkpoint",
+ "Pool state can be checkpointed, allowing rewind later.",
+ ZFEATURE_FLAG_READONLY_COMPAT, NULL);
+
+ zfeature_register(SPA_FEATURE_SPACEMAP_V2,
+ "com.delphix:spacemap_v2", "spacemap_v2",
+ "Space maps representing large segments are more efficient.",
+ ZFEATURE_FLAG_READONLY_COMPAT | ZFEATURE_FLAG_ACTIVATE_ON_ENABLE,
+ NULL);
+
+ static const spa_feature_t large_blocks_deps[] = {
+ SPA_FEATURE_EXTENSIBLE_DATASET,
+ SPA_FEATURE_NONE
+ };
+ zfeature_register(SPA_FEATURE_LARGE_BLOCKS,
+ "org.open-zfs:large_blocks", "large_blocks",
+ "Support for blocks larger than 128KB.",
+ ZFEATURE_FLAG_PER_DATASET, large_blocks_deps);
+
+ {
+ static const spa_feature_t large_dnode_deps[] = {
+ SPA_FEATURE_EXTENSIBLE_DATASET,
+ SPA_FEATURE_NONE
+ };
+ zfeature_register(SPA_FEATURE_LARGE_DNODE,
+ "org.zfsonlinux:large_dnode", "large_dnode",
+ "Variable on-disk size of dnodes.",
+ ZFEATURE_FLAG_PER_DATASET, large_dnode_deps);
+ }
+
+ static const spa_feature_t sha512_deps[] = {
+ SPA_FEATURE_EXTENSIBLE_DATASET,
+ SPA_FEATURE_NONE
+ };
+ zfeature_register(SPA_FEATURE_SHA512,
+ "org.illumos:sha512", "sha512",
+ "SHA-512/256 hash algorithm.",
+ ZFEATURE_FLAG_PER_DATASET, sha512_deps);
+
+ static const spa_feature_t skein_deps[] = {
+ SPA_FEATURE_EXTENSIBLE_DATASET,
+ SPA_FEATURE_NONE
+ };
+ zfeature_register(SPA_FEATURE_SKEIN,
+ "org.illumos:skein", "skein",
+ "Skein hash algorithm.",
+ ZFEATURE_FLAG_PER_DATASET, skein_deps);
+
+#ifdef illumos
+ static const spa_feature_t edonr_deps[] = {
+ SPA_FEATURE_EXTENSIBLE_DATASET,
+ SPA_FEATURE_NONE
+ };
+ zfeature_register(SPA_FEATURE_EDONR,
+ "org.illumos:edonr", "edonr",
+ "Edon-R hash algorithm.",
+ ZFEATURE_FLAG_PER_DATASET, edonr_deps);
+#endif
+
+ zfeature_register(SPA_FEATURE_DEVICE_REMOVAL,
+ "com.delphix:device_removal", "device_removal",
+ "Top-level vdevs can be removed, reducing logical pool size.",
+ ZFEATURE_FLAG_MOS, NULL);
+
+ static const spa_feature_t obsolete_counts_deps[] = {
+ SPA_FEATURE_EXTENSIBLE_DATASET,
+ SPA_FEATURE_DEVICE_REMOVAL,
+ SPA_FEATURE_NONE
+ };
+ zfeature_register(SPA_FEATURE_OBSOLETE_COUNTS,
+ "com.delphix:obsolete_counts", "obsolete_counts",
+ "Reduce memory used by removed devices when their blocks are "
+ "freed or remapped.",
+ ZFEATURE_FLAG_READONLY_COMPAT, obsolete_counts_deps);
+
+ {
+ zfeature_register(SPA_FEATURE_ALLOCATION_CLASSES,
+ "org.zfsonlinux:allocation_classes", "allocation_classes",
+ "Support for separate allocation classes.",
+ ZFEATURE_FLAG_READONLY_COMPAT, NULL);
+ }
+}
diff --git a/sys/cddl/contrib/opensolaris/common/zfs/zfeature_common.h b/sys/cddl/contrib/opensolaris/common/zfs/zfeature_common.h
new file mode 100644
index 000000000000..d23a4e226e2d
--- /dev/null
+++ b/sys/cddl/contrib/opensolaris/common/zfs/zfeature_common.h
@@ -0,0 +1,111 @@
+/*
+ * CDDL HEADER START
+ *
+ * The contents of this file are subject to the terms of the
+ * Common Development and Distribution License (the "License").
+ * You may not use this file except in compliance with the License.
+ *
+ * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
+ * or http://www.opensolaris.org/os/licensing.
+ * See the License for the specific language governing permissions
+ * and limitations under the License.
+ *
+ * When distributing Covered Code, include this CDDL HEADER in each
+ * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
+ * If applicable, add the following below this CDDL HEADER, with the
+ * fields enclosed by brackets "[]" replaced with your own identifying
+ * information: Portions Copyright [yyyy] [name of copyright owner]
+ *
+ * CDDL HEADER END
+ */
+
+/*
+ * Copyright (c) 2011, 2017 by Delphix. All rights reserved.
+ * Copyright (c) 2013 by Saso Kiselkov. All rights reserved.
+ * Copyright (c) 2013, Joyent, Inc. All rights reserved.
+ * Copyright (c) 2014 Integros [integros.com]
+ * Copyright (c) 2017, Intel Corporation.
+ */
+
+#ifndef _ZFEATURE_COMMON_H
+#define _ZFEATURE_COMMON_H
+
+#include <sys/fs/zfs.h>
+#include <sys/types.h>
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+struct zfeature_info;
+
+typedef enum spa_feature {
+ SPA_FEATURE_NONE = -1,
+ SPA_FEATURE_ASYNC_DESTROY,
+ SPA_FEATURE_EMPTY_BPOBJ,
+ SPA_FEATURE_LZ4_COMPRESS,
+ SPA_FEATURE_MULTI_VDEV_CRASH_DUMP,
+ SPA_FEATURE_SPACEMAP_HISTOGRAM,
+ SPA_FEATURE_ENABLED_TXG,
+ SPA_FEATURE_HOLE_BIRTH,
+ SPA_FEATURE_EXTENSIBLE_DATASET,
+ SPA_FEATURE_EMBEDDED_DATA,
+ SPA_FEATURE_BOOKMARKS,
+ SPA_FEATURE_FS_SS_LIMIT,
+ SPA_FEATURE_LARGE_BLOCKS,
+ SPA_FEATURE_LARGE_DNODE,
+ SPA_FEATURE_SHA512,
+ SPA_FEATURE_SKEIN,
+#ifdef illumos
+ SPA_FEATURE_EDONR,
+#endif
+ SPA_FEATURE_DEVICE_REMOVAL,
+ SPA_FEATURE_OBSOLETE_COUNTS,
+ SPA_FEATURE_POOL_CHECKPOINT,
+ SPA_FEATURE_SPACEMAP_V2,
+ SPA_FEATURE_ALLOCATION_CLASSES,
+ SPA_FEATURES
+} spa_feature_t;
+
+#define SPA_FEATURE_DISABLED (-1ULL)
+
+typedef enum zfeature_flags {
+ /* Can open pool readonly even if this feature is not supported. */
+ ZFEATURE_FLAG_READONLY_COMPAT = (1 << 0),
+ /* Is this feature necessary to read the MOS? */
+ ZFEATURE_FLAG_MOS = (1 << 1),
+ /* Activate this feature at the same time it is enabled. */
+ ZFEATURE_FLAG_ACTIVATE_ON_ENABLE = (1 << 2),
+ /* Each dataset has a field set if it has ever used this feature. */
+ ZFEATURE_FLAG_PER_DATASET = (1 << 3)
+} zfeature_flags_t;
+
+typedef struct zfeature_info {
+ spa_feature_t fi_feature;
+ const char *fi_uname; /* User-facing feature name */
+ const char *fi_guid; /* On-disk feature identifier */
+ const char *fi_desc; /* Feature description */
+ zfeature_flags_t fi_flags;
+ /* array of dependencies, terminated by SPA_FEATURE_NONE */
+ const spa_feature_t *fi_depends;
+} zfeature_info_t;
+
+typedef int (zfeature_func_t)(zfeature_info_t *, void *);
+
+#define ZFS_FEATURE_DEBUG
+
+extern zfeature_info_t spa_feature_table[SPA_FEATURES];
+
+extern boolean_t zfeature_is_valid_guid(const char *);
+
+extern boolean_t zfeature_is_supported(const char *);
+extern int zfeature_lookup_name(const char *, spa_feature_t *);
+extern boolean_t zfeature_depends_on(spa_feature_t, spa_feature_t);
+
+extern void zpool_feature_init(void);
+
+#ifdef __cplusplus
+}
+#endif
+
+#endif /* _ZFEATURE_COMMON_H */
diff --git a/sys/cddl/contrib/opensolaris/common/zfs/zfs_comutil.c b/sys/cddl/contrib/opensolaris/common/zfs/zfs_comutil.c
new file mode 100644
index 000000000000..f18d82b507b2
--- /dev/null
+++ b/sys/cddl/contrib/opensolaris/common/zfs/zfs_comutil.c
@@ -0,0 +1,206 @@
+/*
+ * CDDL HEADER START
+ *
+ * The contents of this file are subject to the terms of the
+ * Common Development and Distribution License (the "License").
+ * You may not use this file except in compliance with the License.
+ *
+ * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
+ * or http://www.opensolaris.org/os/licensing.
+ * See the License for the specific language governing permissions
+ * and limitations under the License.
+ *
+ * When distributing Covered Code, include this CDDL HEADER in each
+ * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
+ * If applicable, add the following below this CDDL HEADER, with the
+ * fields enclosed by brackets "[]" replaced with your own identifying
+ * information: Portions Copyright [yyyy] [name of copyright owner]
+ *
+ * CDDL HEADER END
+ */
+/*
+ * Copyright (c) 2008, 2010, Oracle and/or its affiliates. All rights reserved.
+ * Copyright (c) 2012, 2017 by Delphix. All rights reserved.
+ */
+
+/*
+ * This file is intended for functions that ought to be common between user
+ * land (libzfs) and the kernel. When many common routines need to be shared,
+ * a separate file should be created.
+ */
+
+#if defined(_KERNEL)
+#include <sys/systm.h>
+#else
+#include <string.h>
+#endif
+
+#include <sys/types.h>
+#include <sys/fs/zfs.h>
+#include <sys/nvpair.h>
+#include "zfs_comutil.h"
+
+/*
+ * Are there allocatable vdevs?
+ */
+boolean_t
+zfs_allocatable_devs(nvlist_t *nv)
+{
+ uint64_t is_log;
+ uint_t c;
+ nvlist_t **child;
+ uint_t children;
+
+ if (nvlist_lookup_nvlist_array(nv, ZPOOL_CONFIG_CHILDREN,
+ &child, &children) != 0) {
+ return (B_FALSE);
+ }
+ for (c = 0; c < children; c++) {
+ is_log = 0;
+ (void) nvlist_lookup_uint64(child[c], ZPOOL_CONFIG_IS_LOG,
+ &is_log);
+ if (!is_log)
+ return (B_TRUE);
+ }
+ return (B_FALSE);
+}
+
+void
+zpool_get_load_policy(nvlist_t *nvl, zpool_load_policy_t *zlpp)
+{
+ nvlist_t *policy;
+ nvpair_t *elem;
+ char *nm;
+
+ /* Defaults */
+ zlpp->zlp_rewind = ZPOOL_NO_REWIND;
+ zlpp->zlp_maxmeta = 0;
+ zlpp->zlp_maxdata = UINT64_MAX;
+ zlpp->zlp_txg = UINT64_MAX;
+
+ if (nvl == NULL)
+ return;
+
+ elem = NULL;
+ while ((elem = nvlist_next_nvpair(nvl, elem)) != NULL) {
+ nm = nvpair_name(elem);
+ if (strcmp(nm, ZPOOL_LOAD_POLICY) == 0) {
+ if (nvpair_value_nvlist(elem, &policy) == 0)
+ zpool_get_load_policy(policy, zlpp);
+ return;
+ } else if (strcmp(nm, ZPOOL_LOAD_REWIND_POLICY) == 0) {
+ if (nvpair_value_uint32(elem, &zlpp->zlp_rewind) == 0)
+ if (zlpp->zlp_rewind & ~ZPOOL_REWIND_POLICIES)
+ zlpp->zlp_rewind = ZPOOL_NO_REWIND;
+ } else if (strcmp(nm, ZPOOL_LOAD_REQUEST_TXG) == 0) {
+ (void) nvpair_value_uint64(elem, &zlpp->zlp_txg);
+ } else if (strcmp(nm, ZPOOL_LOAD_META_THRESH) == 0) {
+ (void) nvpair_value_uint64(elem, &zlpp->zlp_maxmeta);
+ } else if (strcmp(nm, ZPOOL_LOAD_DATA_THRESH) == 0) {
+ (void) nvpair_value_uint64(elem, &zlpp->zlp_maxdata);
+ }
+ }
+ if (zlpp->zlp_rewind == 0)
+ zlpp->zlp_rewind = ZPOOL_NO_REWIND;
+}
+
+typedef struct zfs_version_spa_map {
+ int version_zpl;
+ int version_spa;
+} zfs_version_spa_map_t;
+
+/*
+ * Keep this table in monotonically increasing version number order.
+ */
+static zfs_version_spa_map_t zfs_version_table[] = {
+ {ZPL_VERSION_INITIAL, SPA_VERSION_INITIAL},
+ {ZPL_VERSION_DIRENT_TYPE, SPA_VERSION_INITIAL},
+ {ZPL_VERSION_FUID, SPA_VERSION_FUID},
+ {ZPL_VERSION_USERSPACE, SPA_VERSION_USERSPACE},
+ {ZPL_VERSION_SA, SPA_VERSION_SA},
+ {0, 0}
+};
+
+/*
+ * Return the max zpl version for a given spa version; -1 is returned
+ * if no mapping exists.
+ */
+int
+zfs_zpl_version_map(int spa_version)
+{
+ int i;
+ int version = -1;
+
+ for (i = 0; zfs_version_table[i].version_spa; i++) {
+ if (spa_version >= zfs_version_table[i].version_spa)
+ version = zfs_version_table[i].version_zpl;
+ }
+
+ return (version);
+}
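+
+/*
+ * For illustration (the behavior follows from the table above): a pool
+ * at SPA_VERSION_FUID or any later version up to but not including
+ * SPA_VERSION_USERSPACE maps to ZPL_VERSION_FUID, since the scan keeps
+ * the last table entry whose spa version does not exceed the argument.
+ */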
+
+/*
+ * Return the min spa version for a given zpl version; -1 is returned
+ * if no mapping exists.
+ */
+int
+zfs_spa_version_map(int zpl_version)
+{
+ int i;
+ int version = -1;
+
+ for (i = 0; zfs_version_table[i].version_zpl; i++) {
+ if (zfs_version_table[i].version_zpl >= zpl_version)
+ return (zfs_version_table[i].version_spa);
+ }
+
+ return (version);
+}
+
+/*
+ * This is the table of legacy internal event names; it should not be modified.
+ * The internal events are now stored in the history log as strings.
+ */
+const char *zfs_history_event_names[ZFS_NUM_LEGACY_HISTORY_EVENTS] = {
+ "invalid event",
+ "pool create",
+ "vdev add",
+ "pool remove",
+ "pool destroy",
+ "pool export",
+ "pool import",
+ "vdev attach",
+ "vdev replace",
+ "vdev detach",
+ "vdev online",
+ "vdev offline",
+ "vdev upgrade",
+ "pool clear",
+ "pool scrub",
+ "pool property set",
+ "create",
+ "clone",
+ "destroy",
+ "destroy_begin_sync",
+ "inherit",
+ "property set",
+ "quota set",
+ "permission update",
+ "permission remove",
+ "permission who remove",
+ "promote",
+ "receive",
+ "rename",
+ "reservation set",
+ "replay_inc_sync",
+ "replay_full_sync",
+ "rollback",
+ "snapshot",
+ "filesystem version upgrade",
+ "refquota set",
+ "refreservation set",
+ "pool scrub done",
+ "user hold",
+ "user release",
+ "pool split",
+};
diff --git a/sys/cddl/contrib/opensolaris/common/zfs/zfs_comutil.h b/sys/cddl/contrib/opensolaris/common/zfs/zfs_comutil.h
new file mode 100644
index 000000000000..1c828e41e29f
--- /dev/null
+++ b/sys/cddl/contrib/opensolaris/common/zfs/zfs_comutil.h
@@ -0,0 +1,52 @@
+/*
+ * CDDL HEADER START
+ *
+ * The contents of this file are subject to the terms of the
+ * Common Development and Distribution License (the "License").
+ * You may not use this file except in compliance with the License.
+ *
+ * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
+ * or http://www.opensolaris.org/os/licensing.
+ * See the License for the specific language governing permissions
+ * and limitations under the License.
+ *
+ * When distributing Covered Code, include this CDDL HEADER in each
+ * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
+ * If applicable, add the following below this CDDL HEADER, with the
+ * fields enclosed by brackets "[]" replaced with your own identifying
+ * information: Portions Copyright [yyyy] [name of copyright owner]
+ *
+ * CDDL HEADER END
+ */
+/*
+ * Copyright (c) 2008, 2010, Oracle and/or its affiliates. All rights reserved.
+ * Copyright (c) 2012, 2017 by Delphix. All rights reserved.
+ * Copyright 2019 Joyent, Inc.
+ */
+
+#ifndef _ZFS_COMUTIL_H
+#define _ZFS_COMUTIL_H
+
+#include <sys/fs/zfs.h>
+#include <sys/types.h>
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+/* Needed for ZoL errno usage in MMP kernel and user code */
+#define EREMOTEIO EREMOTE
+
+extern boolean_t zfs_allocatable_devs(nvlist_t *);
+extern void zpool_get_load_policy(nvlist_t *, zpool_load_policy_t *);
+
+extern int zfs_zpl_version_map(int spa_version);
+extern int zfs_spa_version_map(int zpl_version);
+#define ZFS_NUM_LEGACY_HISTORY_EVENTS 41
+extern const char *zfs_history_event_names[ZFS_NUM_LEGACY_HISTORY_EVENTS];
+
+#ifdef __cplusplus
+}
+#endif
+
+#endif /* _ZFS_COMUTIL_H */
diff --git a/sys/cddl/contrib/opensolaris/common/zfs/zfs_deleg.c b/sys/cddl/contrib/opensolaris/common/zfs/zfs_deleg.c
new file mode 100644
index 000000000000..a3383f4ccf2d
--- /dev/null
+++ b/sys/cddl/contrib/opensolaris/common/zfs/zfs_deleg.c
@@ -0,0 +1,235 @@
+/*
+ * CDDL HEADER START
+ *
+ * The contents of this file are subject to the terms of the
+ * Common Development and Distribution License (the "License").
+ * You may not use this file except in compliance with the License.
+ *
+ * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
+ * or http://www.opensolaris.org/os/licensing.
+ * See the License for the specific language governing permissions
+ * and limitations under the License.
+ *
+ * When distributing Covered Code, include this CDDL HEADER in each
+ * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
+ * If applicable, add the following below this CDDL HEADER, with the
+ * fields enclosed by brackets "[]" replaced with your own identifying
+ * information: Portions Copyright [yyyy] [name of copyright owner]
+ *
+ * CDDL HEADER END
+ */
+/*
+ * Copyright (c) 2007, 2010, Oracle and/or its affiliates. All rights reserved.
+ * Copyright 2010 Nexenta Systems, Inc. All rights reserved.
+ * Copyright (c) 2013, 2015 by Delphix. All rights reserved.
+ * Copyright 2016 Igor Kozhukhov <ikozhukhov@gmail.com>
+ */
+
+#include <sys/zfs_context.h>
+
+#if defined(_KERNEL)
+#include <sys/systm.h>
+#include <sys/sunddi.h>
+#include <sys/ctype.h>
+#else
+#include <stdio.h>
+#include <unistd.h>
+#include <strings.h>
+#include <libnvpair.h>
+#include <ctype.h>
+#endif
+#include <sys/dsl_deleg.h>
+#include "zfs_prop.h"
+#include "zfs_deleg.h"
+#include "zfs_namecheck.h"
+
+zfs_deleg_perm_tab_t zfs_deleg_perm_tab[] = {
+ {ZFS_DELEG_PERM_ALLOW},
+ {ZFS_DELEG_PERM_BOOKMARK},
+ {ZFS_DELEG_PERM_CLONE},
+ {ZFS_DELEG_PERM_CREATE},
+ {ZFS_DELEG_PERM_DESTROY},
+ {ZFS_DELEG_PERM_DIFF},
+ {ZFS_DELEG_PERM_MOUNT},
+ {ZFS_DELEG_PERM_PROMOTE},
+ {ZFS_DELEG_PERM_RECEIVE},
+ {ZFS_DELEG_PERM_REMAP},
+ {ZFS_DELEG_PERM_RENAME},
+ {ZFS_DELEG_PERM_ROLLBACK},
+ {ZFS_DELEG_PERM_SNAPSHOT},
+ {ZFS_DELEG_PERM_SHARE},
+ {ZFS_DELEG_PERM_SEND},
+ {ZFS_DELEG_PERM_USERPROP},
+ {ZFS_DELEG_PERM_USERQUOTA},
+ {ZFS_DELEG_PERM_GROUPQUOTA},
+ {ZFS_DELEG_PERM_USERUSED},
+ {ZFS_DELEG_PERM_GROUPUSED},
+ {ZFS_DELEG_PERM_HOLD},
+ {ZFS_DELEG_PERM_RELEASE},
+ {NULL}
+};
+
+static int
+zfs_valid_permission_name(const char *perm)
+{
+ if (zfs_deleg_canonicalize_perm(perm))
+ return (0);
+
+ return (permset_namecheck(perm, NULL, NULL));
+}
+
+const char *
+zfs_deleg_canonicalize_perm(const char *perm)
+{
+ int i;
+ zfs_prop_t prop;
+
+ for (i = 0; zfs_deleg_perm_tab[i].z_perm != NULL; i++) {
+ if (strcmp(perm, zfs_deleg_perm_tab[i].z_perm) == 0)
+ return (perm);
+ }
+
+ prop = zfs_name_to_prop(perm);
+ if (prop != ZPROP_INVAL && zfs_prop_delegatable(prop))
+ return (zfs_prop_to_name(prop));
+ return (NULL);
+
+}
+
+static int
+zfs_validate_who(char *who)
+{
+ char *p;
+
+ if (who[2] != ZFS_DELEG_FIELD_SEP_CHR)
+ return (-1);
+
+ switch (who[0]) {
+ case ZFS_DELEG_USER:
+ case ZFS_DELEG_GROUP:
+ case ZFS_DELEG_USER_SETS:
+ case ZFS_DELEG_GROUP_SETS:
+ if (who[1] != ZFS_DELEG_LOCAL && who[1] != ZFS_DELEG_DESCENDENT)
+ return (-1);
+ for (p = &who[3]; *p; p++)
+ if (!isdigit(*p))
+ return (-1);
+ break;
+
+ case ZFS_DELEG_NAMED_SET:
+ case ZFS_DELEG_NAMED_SET_SETS:
+ if (who[1] != ZFS_DELEG_NA)
+ return (-1);
+ return (permset_namecheck(&who[3], NULL, NULL));
+
+ case ZFS_DELEG_CREATE:
+ case ZFS_DELEG_CREATE_SETS:
+ if (who[1] != ZFS_DELEG_NA)
+ return (-1);
+ if (who[3] != '\0')
+ return (-1);
+ break;
+
+ case ZFS_DELEG_EVERYONE:
+ case ZFS_DELEG_EVERYONE_SETS:
+ if (who[1] != ZFS_DELEG_LOCAL && who[1] != ZFS_DELEG_DESCENDENT)
+ return (-1);
+ if (who[3] != '\0')
+ return (-1);
+ break;
+
+ default:
+ return (-1);
+ }
+
+ return (0);
+}
+
+int
+zfs_deleg_verify_nvlist(nvlist_t *nvp)
+{
+ nvpair_t *who, *perm_name;
+ nvlist_t *perms;
+ int error;
+
+ if (nvp == NULL)
+ return (-1);
+
+ who = nvlist_next_nvpair(nvp, NULL);
+ if (who == NULL)
+ return (-1);
+
+ do {
+ if (zfs_validate_who(nvpair_name(who)))
+ return (-1);
+
+ error = nvlist_lookup_nvlist(nvp, nvpair_name(who), &perms);
+
+ if (error && error != ENOENT)
+ return (-1);
+ if (error == ENOENT)
+ continue;
+
+ perm_name = nvlist_next_nvpair(perms, NULL);
+ if (perm_name == NULL) {
+ return (-1);
+ }
+ do {
+ error = zfs_valid_permission_name(
+ nvpair_name(perm_name));
+ if (error)
+ return (-1);
+ } while ((perm_name = nvlist_next_nvpair(perms, perm_name))
+ != NULL);
+ } while ((who = nvlist_next_nvpair(nvp, who)) != NULL);
+ return (0);
+}
+
+/*
+ * Construct the base attribute name. The base attribute names
+ * are the "key" used to locate the jump objects which contain the
+ * actual permissions. The base attribute names are encoded based
+ * on the type of entry and whether it is a local or descendent
+ * permission.
+ *
+ * Arguments:
+ * attr - attribute name return string; the attribute is assumed to be
+ * ZFS_MAX_DELEG_NAME long.
+ * type - type of entry to construct
+ * inheritchr - inheritance type (local, descendent, or NA for create
+ * and permission set definitions)
+ * data - either a permission set name or a 64-bit uid/gid.
+ */
+void
+zfs_deleg_whokey(char *attr, zfs_deleg_who_type_t type,
+ char inheritchr, void *data)
+{
+ int len = ZFS_MAX_DELEG_NAME;
+ uint64_t *id = data;
+
+ switch (type) {
+ case ZFS_DELEG_USER:
+ case ZFS_DELEG_GROUP:
+ case ZFS_DELEG_USER_SETS:
+ case ZFS_DELEG_GROUP_SETS:
+ (void) snprintf(attr, len, "%c%c%c%lld", type, inheritchr,
+ ZFS_DELEG_FIELD_SEP_CHR, (longlong_t)*id);
+ break;
+ case ZFS_DELEG_NAMED_SET_SETS:
+ case ZFS_DELEG_NAMED_SET:
+ (void) snprintf(attr, len, "%c-%c%s", type,
+ ZFS_DELEG_FIELD_SEP_CHR, (char *)data);
+ break;
+ case ZFS_DELEG_CREATE:
+ case ZFS_DELEG_CREATE_SETS:
+ (void) snprintf(attr, len, "%c-%c", type,
+ ZFS_DELEG_FIELD_SEP_CHR);
+ break;
+ case ZFS_DELEG_EVERYONE:
+ case ZFS_DELEG_EVERYONE_SETS:
+ (void) snprintf(attr, len, "%c%c%c", type, inheritchr,
+ ZFS_DELEG_FIELD_SEP_CHR);
+ break;
+ default:
+ ASSERT(!"bad zfs_deleg_who_type_t");
+ }
+}
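+
+/*
+ * For illustration (an example constructed from the format strings
+ * above, assuming ZFS_DELEG_USER is 'u' as defined in sys/fs/zfs.h):
+ * a local user entry for uid 1001 yields the whokey "ul$1001", i.e.,
+ * the type character, ZFS_DELEG_LOCAL, ZFS_DELEG_FIELD_SEP_CHR, and
+ * the decimal id.
+ */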
diff --git a/sys/cddl/contrib/opensolaris/common/zfs/zfs_deleg.h b/sys/cddl/contrib/opensolaris/common/zfs/zfs_deleg.h
new file mode 100644
index 000000000000..06d2df9bb80d
--- /dev/null
+++ b/sys/cddl/contrib/opensolaris/common/zfs/zfs_deleg.h
@@ -0,0 +1,90 @@
+/*
+ * CDDL HEADER START
+ *
+ * The contents of this file are subject to the terms of the
+ * Common Development and Distribution License (the "License").
+ * You may not use this file except in compliance with the License.
+ *
+ * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
+ * or http://www.opensolaris.org/os/licensing.
+ * See the License for the specific language governing permissions
+ * and limitations under the License.
+ *
+ * When distributing Covered Code, include this CDDL HEADER in each
+ * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
+ * If applicable, add the following below this CDDL HEADER, with the
+ * fields enclosed by brackets "[]" replaced with your own identifying
+ * information: Portions Copyright [yyyy] [name of copyright owner]
+ *
+ * CDDL HEADER END
+ */
+/*
+ * Copyright (c) 2007, 2010, Oracle and/or its affiliates. All rights reserved.
+ * Copyright 2010 Nexenta Systems, Inc. All rights reserved.
+ * Copyright (c) 2013, 2015 by Delphix. All rights reserved.
+ */
+
+#ifndef _ZFS_DELEG_H
+#define _ZFS_DELEG_H
+
+#include <sys/fs/zfs.h>
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+#define ZFS_DELEG_SET_NAME_CHR '@' /* set name lead char */
+#define ZFS_DELEG_FIELD_SEP_CHR '$' /* field separator */
+
+/*
+ * Max name length for a delegation attribute
+ */
+#define ZFS_MAX_DELEG_NAME 128
+
+#define ZFS_DELEG_LOCAL 'l'
+#define ZFS_DELEG_DESCENDENT 'd'
+#define ZFS_DELEG_NA '-'
+
+typedef enum {
+ ZFS_DELEG_NOTE_CREATE,
+ ZFS_DELEG_NOTE_DESTROY,
+ ZFS_DELEG_NOTE_SNAPSHOT,
+ ZFS_DELEG_NOTE_ROLLBACK,
+ ZFS_DELEG_NOTE_CLONE,
+ ZFS_DELEG_NOTE_PROMOTE,
+ ZFS_DELEG_NOTE_RENAME,
+ ZFS_DELEG_NOTE_SEND,
+ ZFS_DELEG_NOTE_RECEIVE,
+ ZFS_DELEG_NOTE_ALLOW,
+ ZFS_DELEG_NOTE_USERPROP,
+ ZFS_DELEG_NOTE_MOUNT,
+ ZFS_DELEG_NOTE_SHARE,
+ ZFS_DELEG_NOTE_USERQUOTA,
+ ZFS_DELEG_NOTE_GROUPQUOTA,
+ ZFS_DELEG_NOTE_USERUSED,
+ ZFS_DELEG_NOTE_GROUPUSED,
+ ZFS_DELEG_NOTE_HOLD,
+ ZFS_DELEG_NOTE_RELEASE,
+ ZFS_DELEG_NOTE_DIFF,
+ ZFS_DELEG_NOTE_BOOKMARK,
+ ZFS_DELEG_NOTE_REMAP,
+ ZFS_DELEG_NOTE_NONE
+} zfs_deleg_note_t;
+
+typedef struct zfs_deleg_perm_tab {
+ char *z_perm;
+ zfs_deleg_note_t z_note;
+} zfs_deleg_perm_tab_t;
+
+extern zfs_deleg_perm_tab_t zfs_deleg_perm_tab[];
+
+int zfs_deleg_verify_nvlist(nvlist_t *nvlist);
+void zfs_deleg_whokey(char *attr, zfs_deleg_who_type_t type,
+ char checkflag, void *data);
+const char *zfs_deleg_canonicalize_perm(const char *perm);
+
+#ifdef __cplusplus
+}
+#endif
+
+#endif /* _ZFS_DELEG_H */
diff --git a/sys/cddl/contrib/opensolaris/common/zfs/zfs_fletcher.c b/sys/cddl/contrib/opensolaris/common/zfs/zfs_fletcher.c
new file mode 100644
index 000000000000..c889169b426b
--- /dev/null
+++ b/sys/cddl/contrib/opensolaris/common/zfs/zfs_fletcher.c
@@ -0,0 +1,279 @@
+/*
+ * CDDL HEADER START
+ *
+ * The contents of this file are subject to the terms of the
+ * Common Development and Distribution License (the "License").
+ * You may not use this file except in compliance with the License.
+ *
+ * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
+ * or http://www.opensolaris.org/os/licensing.
+ * See the License for the specific language governing permissions
+ * and limitations under the License.
+ *
+ * When distributing Covered Code, include this CDDL HEADER in each
+ * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
+ * If applicable, add the following below this CDDL HEADER, with the
+ * fields enclosed by brackets "[]" replaced with your own identifying
+ * information: Portions Copyright [yyyy] [name of copyright owner]
+ *
+ * CDDL HEADER END
+ */
+/*
+ * Copyright 2009 Sun Microsystems, Inc. All rights reserved.
+ * Use is subject to license terms.
+ */
+/*
+ * Copyright 2013 Saso Kiselkov. All rights reserved.
+ * Copyright (c) 2016 by Delphix. All rights reserved.
+ */
+
+/*
+ * Fletcher Checksums
+ * ------------------
+ *
+ * ZFS's 2nd and 4th order Fletcher checksums are defined by the following
+ * recurrence relations:
+ *
+ * a = a + f
+ * i i-1 i-1
+ *
+ * b = b + a
+ * i i-1 i
+ *
+ * c = c + b (fletcher-4 only)
+ * i i-1 i
+ *
+ * d = d + c (fletcher-4 only)
+ * i i-1 i
+ *
+ * Where
+ * a_0 = b_0 = c_0 = d_0 = 0
+ * and
+ * f_0 .. f_(n-1) are the input data.
+ *
+ * Using standard techniques, these translate into the following series:
+ *
+ * __n_ __n_
+ * \ | \ |
+ * a = > f b = > i * f
+ * n /___| n - i n /___| n - i
+ * i = 1 i = 1
+ *
+ *
+ * __n_ __n_
+ * \ | i*(i+1) \ | i*(i+1)*(i+2)
+ * c = > ------- f d = > ------------- f
+ * n /___| 2 n - i n /___| 6 n - i
+ * i = 1 i = 1
+ *
+ * For fletcher-2, the f_is are 64-bit, and [ab]_i are 64-bit accumulators.
+ * Since the additions are done mod (2^64), errors in the high bits may not
+ * be noticed. For this reason, fletcher-2 is deprecated.
+ *
+ * For fletcher-4, the f_is are 32-bit, and [abcd]_i are 64-bit accumulators.
+ * A conservative bound on how big the buffer can get before we overflow
+ * can be computed using f_i = 0xffffffff for all i:
+ *
+ * % bc
+ * f=2^32-1;d=0; for (i = 1; d<2^64; i++) { d += f*i*(i+1)*(i+2)/6 }; (i-1)*4
+ * 2264
+ * quit
+ * %
+ *
+ * So blocks of up to 2k will not overflow. Our largest block size is
+ * 128k, which has 32k 4-byte words, so we can compute the largest possible
+ * accumulators, then divide by 2^64 to figure the max amount of overflow:
+ *
+ * % bc
+ * a=b=c=d=0; f=2^32-1; for (i=1; i<=32*1024; i++) { a+=f; b+=a; c+=b; d+=c }
+ * a/2^64;b/2^64;c/2^64;d/2^64
+ * 0
+ * 0
+ * 1365
+ * 11186858
+ * quit
+ * %
+ *
+ * So a and b cannot overflow. To make sure each bit of input has some
+ * effect on the contents of c and d, we can look at what the factors of
+ * the coefficients in the equations for c_n and d_n are. The number of 2s
+ * in the factors determines the lowest set bit in the multiplier. Running
+ * through the cases for n*(n+1)/2 reveals that the highest power of 2 is
+ * 2^14, and for n*(n+1)*(n+2)/6 it is 2^15. So while some data may overflow
+ * the 64-bit accumulators, every bit of every f_i affects every accumulator,
+ * even for 128k blocks.
+ *
+ * If we wanted to make a stronger version of fletcher4 (fletcher4c?),
+ * we could do our calculations mod (2^32 - 1) by adding in the carries
+ * periodically, and store the number of carries in the top 32-bits.
+ *
+ * --------------------
+ * Checksum Performance
+ * --------------------
+ *
+ * There are two interesting components to checksum performance: cached and
+ * uncached performance. With cached data, fletcher-2 is about four times
+ * faster than fletcher-4. With uncached data, the performance difference is
+ * negligible, since the cost of a cache fill dominates the processing time.
+ * Even though fletcher-4 is slower than fletcher-2, it is still a pretty
+ * efficient pass over the data.
+ *
+ * In normal operation, the data which is being checksummed is in a buffer
+ * which has been filled either by:
+ *
+ * 1. a compression step, which will be mostly cached, or
+ * 2. a bcopy() or copyin(), which will be uncached (because the
+ * copy is cache-bypassing).
+ *
+ * For both cached and uncached data, both fletcher checksums are much faster
+ * than sha-256, and slower than 'off', which doesn't touch the data at all.
+ */
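+
+/*
+ * A small worked example (added for illustration, not part of the
+ * original comment): for the two 32-bit input words f = { 1, 2 },
+ * fletcher-4 accumulates
+ *
+ *	after f_0 = 1:	a = 1, b = 1, c = 1, d = 1
+ *	after f_1 = 2:	a = 3, b = 4, c = 5, d = 6
+ *
+ * so the resulting checksum words are (3, 4, 5, 6).
+ */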
+
+#include <sys/types.h>
+#include <sys/sysmacros.h>
+#include <sys/byteorder.h>
+#include <sys/zio.h>
+#include <sys/spa.h>
+#include <zfs_fletcher.h>
+
+void
+fletcher_init(zio_cksum_t *zcp)
+{
+ ZIO_SET_CHECKSUM(zcp, 0, 0, 0, 0);
+}
+
+int
+fletcher_2_incremental_native(void *buf, size_t size, void *data)
+{
+ zio_cksum_t *zcp = data;
+
+ const uint64_t *ip = buf;
+ const uint64_t *ipend = ip + (size / sizeof (uint64_t));
+ uint64_t a0, b0, a1, b1;
+
+ a0 = zcp->zc_word[0];
+ a1 = zcp->zc_word[1];
+ b0 = zcp->zc_word[2];
+ b1 = zcp->zc_word[3];
+
+ for (; ip < ipend; ip += 2) {
+ a0 += ip[0];
+ a1 += ip[1];
+ b0 += a0;
+ b1 += a1;
+ }
+
+ ZIO_SET_CHECKSUM(zcp, a0, a1, b0, b1);
+ return (0);
+}
+
+/*ARGSUSED*/
+void
+fletcher_2_native(const void *buf, size_t size,
+ const void *ctx_template, zio_cksum_t *zcp)
+{
+ fletcher_init(zcp);
+ (void) fletcher_2_incremental_native((void *) buf, size, zcp);
+}
+
+int
+fletcher_2_incremental_byteswap(void *buf, size_t size, void *data)
+{
+ zio_cksum_t *zcp = data;
+
+ const uint64_t *ip = buf;
+ const uint64_t *ipend = ip + (size / sizeof (uint64_t));
+ uint64_t a0, b0, a1, b1;
+
+ a0 = zcp->zc_word[0];
+ a1 = zcp->zc_word[1];
+ b0 = zcp->zc_word[2];
+ b1 = zcp->zc_word[3];
+
+ for (; ip < ipend; ip += 2) {
+ a0 += BSWAP_64(ip[0]);
+ a1 += BSWAP_64(ip[1]);
+ b0 += a0;
+ b1 += a1;
+ }
+
+ ZIO_SET_CHECKSUM(zcp, a0, a1, b0, b1);
+ return (0);
+}
+
+/*ARGSUSED*/
+void
+fletcher_2_byteswap(const void *buf, size_t size,
+ const void *ctx_template, zio_cksum_t *zcp)
+{
+ fletcher_init(zcp);
+ (void) fletcher_2_incremental_byteswap((void *) buf, size, zcp);
+}
+
+int
+fletcher_4_incremental_native(void *buf, size_t size, void *data)
+{
+ zio_cksum_t *zcp = data;
+
+ const uint32_t *ip = buf;
+ const uint32_t *ipend = ip + (size / sizeof (uint32_t));
+ uint64_t a, b, c, d;
+
+ a = zcp->zc_word[0];
+ b = zcp->zc_word[1];
+ c = zcp->zc_word[2];
+ d = zcp->zc_word[3];
+
+ for (; ip < ipend; ip++) {
+ a += ip[0];
+ b += a;
+ c += b;
+ d += c;
+ }
+
+ ZIO_SET_CHECKSUM(zcp, a, b, c, d);
+ return (0);
+}
+
+/*ARGSUSED*/
+void
+fletcher_4_native(const void *buf, size_t size,
+ const void *ctx_template, zio_cksum_t *zcp)
+{
+ fletcher_init(zcp);
+ (void) fletcher_4_incremental_native((void *) buf, size, zcp);
+}
+
+int
+fletcher_4_incremental_byteswap(void *buf, size_t size, void *data)
+{
+ zio_cksum_t *zcp = data;
+
+ const uint32_t *ip = buf;
+ const uint32_t *ipend = ip + (size / sizeof (uint32_t));
+ uint64_t a, b, c, d;
+
+ a = zcp->zc_word[0];
+ b = zcp->zc_word[1];
+ c = zcp->zc_word[2];
+ d = zcp->zc_word[3];
+
+ for (; ip < ipend; ip++) {
+ a += BSWAP_32(ip[0]);
+ b += a;
+ c += b;
+ d += c;
+ }
+
+ ZIO_SET_CHECKSUM(zcp, a, b, c, d);
+ return (0);
+}
+
+/*ARGSUSED*/
+void
+fletcher_4_byteswap(const void *buf, size_t size,
+ const void *ctx_template, zio_cksum_t *zcp)
+{
+ fletcher_init(zcp);
+ (void) fletcher_4_incremental_byteswap((void *) buf, size, zcp);
+}
diff --git a/sys/cddl/contrib/opensolaris/common/zfs/zfs_fletcher.h b/sys/cddl/contrib/opensolaris/common/zfs/zfs_fletcher.h
new file mode 100644
index 000000000000..33c6c728cf61
--- /dev/null
+++ b/sys/cddl/contrib/opensolaris/common/zfs/zfs_fletcher.h
@@ -0,0 +1,58 @@
+/*
+ * CDDL HEADER START
+ *
+ * The contents of this file are subject to the terms of the
+ * Common Development and Distribution License (the "License").
+ * You may not use this file except in compliance with the License.
+ *
+ * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
+ * or http://www.opensolaris.org/os/licensing.
+ * See the License for the specific language governing permissions
+ * and limitations under the License.
+ *
+ * When distributing Covered Code, include this CDDL HEADER in each
+ * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
+ * If applicable, add the following below this CDDL HEADER, with the
+ * fields enclosed by brackets "[]" replaced with your own identifying
+ * information: Portions Copyright [yyyy] [name of copyright owner]
+ *
+ * CDDL HEADER END
+ */
+/*
+ * Copyright 2009 Sun Microsystems, Inc. All rights reserved.
+ * Use is subject to license terms.
+ */
+/*
+ * Copyright 2013 Saso Kiselkov. All rights reserved.
+ * Copyright (c) 2016 by Delphix. All rights reserved.
+ */
+
+#ifndef _ZFS_FLETCHER_H
+#define _ZFS_FLETCHER_H
+
+#include <sys/types.h>
+#include <sys/spa.h>
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+/*
+ * fletcher checksum functions
+ */
+
+void fletcher_init(zio_cksum_t *);
+void fletcher_2_native(const void *, size_t, const void *, zio_cksum_t *);
+void fletcher_2_byteswap(const void *, size_t, const void *, zio_cksum_t *);
+int fletcher_2_incremental_native(void *, size_t, void *);
+int fletcher_2_incremental_byteswap(void *, size_t, void *);
+void fletcher_4_native(const void *, size_t, const void *, zio_cksum_t *);
+void fletcher_4_byteswap(const void *, size_t, const void *, zio_cksum_t *);
+int fletcher_4_incremental_native(void *, size_t, void *);
+int fletcher_4_incremental_byteswap(void *, size_t, void *);
+
+#ifdef __cplusplus
+}
+#endif
+
+#endif /* _ZFS_FLETCHER_H */
diff --git a/sys/cddl/contrib/opensolaris/common/zfs/zfs_ioctl_compat.c b/sys/cddl/contrib/opensolaris/common/zfs/zfs_ioctl_compat.c
new file mode 100644
index 000000000000..e5ac73f96b98
--- /dev/null
+++ b/sys/cddl/contrib/opensolaris/common/zfs/zfs_ioctl_compat.c
@@ -0,0 +1,1380 @@
+/*
+ * CDDL HEADER START
+ *
+ * The contents of this file are subject to the terms of the
+ * Common Development and Distribution License (the "License").
+ * You may not use this file except in compliance with the License.
+ *
+ * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
+ * or http://www.opensolaris.org/os/licensing.
+ * See the License for the specific language governing permissions
+ * and limitations under the License.
+ *
+ * When distributing Covered Code, include this CDDL HEADER in each
+ * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
+ * If applicable, add the following below this CDDL HEADER, with the
+ * fields enclosed by brackets "[]" replaced with your own identifying
+ * information: Portions Copyright [yyyy] [name of copyright owner]
+ *
+ * CDDL HEADER END
+ */
+/*
+ * Copyright 2013 Xin Li <delphij@FreeBSD.org>. All rights reserved.
+ * Copyright 2013 Martin Matuska <mm@FreeBSD.org>. All rights reserved.
+ * Portions Copyright 2005, 2010, Oracle and/or its affiliates.
+ * All rights reserved.
+ * Use is subject to license terms.
+ */
+
+#include <sys/types.h>
+#include <sys/param.h>
+#include <sys/cred.h>
+#include <sys/dmu.h>
+#include <sys/zio.h>
+#include <sys/nvpair.h>
+#include <sys/dsl_deleg.h>
+#include <sys/zfs_ioctl.h>
+#include "zfs_namecheck.h"
+#include "zfs_ioctl_compat.h"
+
+static int zfs_version_ioctl = ZFS_IOCVER_CURRENT;
+SYSCTL_DECL(_vfs_zfs_version);
+SYSCTL_INT(_vfs_zfs_version, OID_AUTO, ioctl, CTLFLAG_RD, &zfs_version_ioctl,
+ 0, "ZFS_IOCTL_VERSION");
+
+/*
+ * FreeBSD zfs_cmd compatibility with older binaries:
+ * remap/extend the zfs_cmd_t structure as appropriate.
+ */
+void
+zfs_cmd_compat_get(zfs_cmd_t *zc, caddr_t addr, const int cflag)
+{
+ zfs_cmd_v15_t *zc_c;
+ zfs_cmd_v28_t *zc28_c;
+ zfs_cmd_deadman_t *zcdm_c;
+ zfs_cmd_zcmd_t *zcmd_c;
+ zfs_cmd_edbp_t *edbp_c;
+ zfs_cmd_resume_t *resume_c;
+ zfs_cmd_inlanes_t *inlanes_c;
+
+ switch (cflag) {
+ case ZFS_CMD_COMPAT_INLANES:
+ inlanes_c = (void *)addr;
+ /* zc */
+ strlcpy(zc->zc_name, inlanes_c->zc_name, MAXPATHLEN);
+ strlcpy(zc->zc_value, inlanes_c->zc_value, MAXPATHLEN * 2);
+ strlcpy(zc->zc_string, inlanes_c->zc_string, MAXPATHLEN);
+
+#define FIELD_COPY(field) zc->field = inlanes_c->field
+ FIELD_COPY(zc_nvlist_src);
+ FIELD_COPY(zc_nvlist_src_size);
+ FIELD_COPY(zc_nvlist_dst);
+ FIELD_COPY(zc_nvlist_dst_size);
+ FIELD_COPY(zc_nvlist_dst_filled);
+ FIELD_COPY(zc_pad2);
+ FIELD_COPY(zc_history);
+ FIELD_COPY(zc_guid);
+ FIELD_COPY(zc_nvlist_conf);
+ FIELD_COPY(zc_nvlist_conf_size);
+ FIELD_COPY(zc_cookie);
+ FIELD_COPY(zc_objset_type);
+ FIELD_COPY(zc_perm_action);
+ FIELD_COPY(zc_history_len);
+ FIELD_COPY(zc_history_offset);
+ FIELD_COPY(zc_obj);
+ FIELD_COPY(zc_iflags);
+ FIELD_COPY(zc_share);
+ FIELD_COPY(zc_jailid);
+ FIELD_COPY(zc_objset_stats);
+ FIELD_COPY(zc_begin_record);
+ FIELD_COPY(zc_inject_record);
+ FIELD_COPY(zc_defer_destroy);
+ FIELD_COPY(zc_flags);
+ FIELD_COPY(zc_action_handle);
+ FIELD_COPY(zc_cleanup_fd);
+ FIELD_COPY(zc_simple);
+ FIELD_COPY(zc_resumable);
+ FIELD_COPY(zc_sendobj);
+ FIELD_COPY(zc_fromobj);
+ FIELD_COPY(zc_createtxg);
+ FIELD_COPY(zc_stat);
+#undef FIELD_COPY
+ break;
+
+ case ZFS_CMD_COMPAT_RESUME:
+ resume_c = (void *)addr;
+ /* zc */
+ strlcpy(zc->zc_name, resume_c->zc_name, MAXPATHLEN);
+ strlcpy(zc->zc_value, resume_c->zc_value, MAXPATHLEN * 2);
+ strlcpy(zc->zc_string, resume_c->zc_string, MAXPATHLEN);
+
+#define FIELD_COPY(field) zc->field = resume_c->field
+ FIELD_COPY(zc_nvlist_src);
+ FIELD_COPY(zc_nvlist_src_size);
+ FIELD_COPY(zc_nvlist_dst);
+ FIELD_COPY(zc_nvlist_dst_size);
+ FIELD_COPY(zc_nvlist_dst_filled);
+ FIELD_COPY(zc_pad2);
+ FIELD_COPY(zc_history);
+ FIELD_COPY(zc_guid);
+ FIELD_COPY(zc_nvlist_conf);
+ FIELD_COPY(zc_nvlist_conf_size);
+ FIELD_COPY(zc_cookie);
+ FIELD_COPY(zc_objset_type);
+ FIELD_COPY(zc_perm_action);
+ FIELD_COPY(zc_history_len);
+ FIELD_COPY(zc_history_offset);
+ FIELD_COPY(zc_obj);
+ FIELD_COPY(zc_iflags);
+ FIELD_COPY(zc_share);
+ FIELD_COPY(zc_jailid);
+ FIELD_COPY(zc_objset_stats);
+ FIELD_COPY(zc_begin_record);
+ FIELD_COPY(zc_inject_record.zi_objset);
+ FIELD_COPY(zc_inject_record.zi_object);
+ FIELD_COPY(zc_inject_record.zi_start);
+ FIELD_COPY(zc_inject_record.zi_end);
+ FIELD_COPY(zc_inject_record.zi_guid);
+ FIELD_COPY(zc_inject_record.zi_level);
+ FIELD_COPY(zc_inject_record.zi_error);
+ FIELD_COPY(zc_inject_record.zi_type);
+ FIELD_COPY(zc_inject_record.zi_freq);
+ FIELD_COPY(zc_inject_record.zi_failfast);
+ strlcpy(zc->zc_inject_record.zi_func,
+ resume_c->zc_inject_record.zi_func, MAXNAMELEN);
+ FIELD_COPY(zc_inject_record.zi_iotype);
+ FIELD_COPY(zc_inject_record.zi_duration);
+ FIELD_COPY(zc_inject_record.zi_timer);
+ zc->zc_inject_record.zi_nlanes = 1;
+ FIELD_COPY(zc_inject_record.zi_cmd);
+ FIELD_COPY(zc_inject_record.zi_pad);
+ FIELD_COPY(zc_defer_destroy);
+ FIELD_COPY(zc_flags);
+ FIELD_COPY(zc_action_handle);
+ FIELD_COPY(zc_cleanup_fd);
+ FIELD_COPY(zc_simple);
+ FIELD_COPY(zc_resumable);
+ FIELD_COPY(zc_sendobj);
+ FIELD_COPY(zc_fromobj);
+ FIELD_COPY(zc_createtxg);
+ FIELD_COPY(zc_stat);
+#undef FIELD_COPY
+ break;
+
+ case ZFS_CMD_COMPAT_EDBP:
+ edbp_c = (void *)addr;
+ /* zc */
+ strlcpy(zc->zc_name, edbp_c->zc_name, MAXPATHLEN);
+ strlcpy(zc->zc_value, edbp_c->zc_value, MAXPATHLEN * 2);
+ strlcpy(zc->zc_string, edbp_c->zc_string, MAXPATHLEN);
+
+#define FIELD_COPY(field) zc->field = edbp_c->field
+ FIELD_COPY(zc_nvlist_src);
+ FIELD_COPY(zc_nvlist_src_size);
+ FIELD_COPY(zc_nvlist_dst);
+ FIELD_COPY(zc_nvlist_dst_size);
+ FIELD_COPY(zc_nvlist_dst_filled);
+ FIELD_COPY(zc_pad2);
+ FIELD_COPY(zc_history);
+ FIELD_COPY(zc_guid);
+ FIELD_COPY(zc_nvlist_conf);
+ FIELD_COPY(zc_nvlist_conf_size);
+ FIELD_COPY(zc_cookie);
+ FIELD_COPY(zc_objset_type);
+ FIELD_COPY(zc_perm_action);
+ FIELD_COPY(zc_history_len);
+ FIELD_COPY(zc_history_offset);
+ FIELD_COPY(zc_obj);
+ FIELD_COPY(zc_iflags);
+ FIELD_COPY(zc_share);
+ FIELD_COPY(zc_jailid);
+ FIELD_COPY(zc_objset_stats);
+ zc->zc_begin_record.drr_u.drr_begin = edbp_c->zc_begin_record;
+ FIELD_COPY(zc_inject_record.zi_objset);
+ FIELD_COPY(zc_inject_record.zi_object);
+ FIELD_COPY(zc_inject_record.zi_start);
+ FIELD_COPY(zc_inject_record.zi_end);
+ FIELD_COPY(zc_inject_record.zi_guid);
+ FIELD_COPY(zc_inject_record.zi_level);
+ FIELD_COPY(zc_inject_record.zi_error);
+ FIELD_COPY(zc_inject_record.zi_type);
+ FIELD_COPY(zc_inject_record.zi_freq);
+ FIELD_COPY(zc_inject_record.zi_failfast);
+ strlcpy(zc->zc_inject_record.zi_func,
+ edbp_c->zc_inject_record.zi_func, MAXNAMELEN);
+ FIELD_COPY(zc_inject_record.zi_iotype);
+ FIELD_COPY(zc_inject_record.zi_duration);
+ FIELD_COPY(zc_inject_record.zi_timer);
+ zc->zc_inject_record.zi_nlanes = 1;
+ FIELD_COPY(zc_inject_record.zi_cmd);
+ FIELD_COPY(zc_inject_record.zi_pad);
+ FIELD_COPY(zc_defer_destroy);
+ FIELD_COPY(zc_flags);
+ FIELD_COPY(zc_action_handle);
+ FIELD_COPY(zc_cleanup_fd);
+ FIELD_COPY(zc_simple);
+ zc->zc_resumable = B_FALSE;
+ FIELD_COPY(zc_sendobj);
+ FIELD_COPY(zc_fromobj);
+ FIELD_COPY(zc_createtxg);
+ FIELD_COPY(zc_stat);
+#undef FIELD_COPY
+ break;
+
+ case ZFS_CMD_COMPAT_ZCMD:
+ zcmd_c = (void *)addr;
+ /* zc */
+ strlcpy(zc->zc_name, zcmd_c->zc_name, MAXPATHLEN);
+ strlcpy(zc->zc_value, zcmd_c->zc_value, MAXPATHLEN * 2);
+ strlcpy(zc->zc_string, zcmd_c->zc_string, MAXPATHLEN);
+
+#define FIELD_COPY(field) zc->field = zcmd_c->field
+ FIELD_COPY(zc_nvlist_src);
+ FIELD_COPY(zc_nvlist_src_size);
+ FIELD_COPY(zc_nvlist_dst);
+ FIELD_COPY(zc_nvlist_dst_size);
+ FIELD_COPY(zc_nvlist_dst_filled);
+ FIELD_COPY(zc_pad2);
+ FIELD_COPY(zc_history);
+ FIELD_COPY(zc_guid);
+ FIELD_COPY(zc_nvlist_conf);
+ FIELD_COPY(zc_nvlist_conf_size);
+ FIELD_COPY(zc_cookie);
+ FIELD_COPY(zc_objset_type);
+ FIELD_COPY(zc_perm_action);
+ FIELD_COPY(zc_history_len);
+ FIELD_COPY(zc_history_offset);
+ FIELD_COPY(zc_obj);
+ FIELD_COPY(zc_iflags);
+ FIELD_COPY(zc_share);
+ FIELD_COPY(zc_jailid);
+ FIELD_COPY(zc_objset_stats);
+ zc->zc_begin_record.drr_u.drr_begin = zcmd_c->zc_begin_record;
+ FIELD_COPY(zc_inject_record.zi_objset);
+ FIELD_COPY(zc_inject_record.zi_object);
+ FIELD_COPY(zc_inject_record.zi_start);
+ FIELD_COPY(zc_inject_record.zi_end);
+ FIELD_COPY(zc_inject_record.zi_guid);
+ FIELD_COPY(zc_inject_record.zi_level);
+ FIELD_COPY(zc_inject_record.zi_error);
+ FIELD_COPY(zc_inject_record.zi_type);
+ FIELD_COPY(zc_inject_record.zi_freq);
+ FIELD_COPY(zc_inject_record.zi_failfast);
+ strlcpy(zc->zc_inject_record.zi_func,
+ zcmd_c->zc_inject_record.zi_func, MAXNAMELEN);
+ FIELD_COPY(zc_inject_record.zi_iotype);
+ FIELD_COPY(zc_inject_record.zi_duration);
+ FIELD_COPY(zc_inject_record.zi_timer);
+ zc->zc_inject_record.zi_nlanes = 1;
+ FIELD_COPY(zc_inject_record.zi_cmd);
+ FIELD_COPY(zc_inject_record.zi_pad);
+
+ /* boolean_t -> uint32_t */
+ zc->zc_defer_destroy = (uint32_t)(zcmd_c->zc_defer_destroy);
+ zc->zc_flags = 0;
+
+ FIELD_COPY(zc_action_handle);
+ FIELD_COPY(zc_cleanup_fd);
+ FIELD_COPY(zc_simple);
+ zc->zc_resumable = B_FALSE;
+ FIELD_COPY(zc_sendobj);
+ FIELD_COPY(zc_fromobj);
+ FIELD_COPY(zc_createtxg);
+ FIELD_COPY(zc_stat);
+#undef FIELD_COPY
+
+ break;
+
+ case ZFS_CMD_COMPAT_DEADMAN:
+ zcdm_c = (void *)addr;
+ /* zc */
+ strlcpy(zc->zc_name, zcdm_c->zc_name, MAXPATHLEN);
+ strlcpy(zc->zc_value, zcdm_c->zc_value, MAXPATHLEN * 2);
+ strlcpy(zc->zc_string, zcdm_c->zc_string, MAXPATHLEN);
+
+#define FIELD_COPY(field) zc->field = zcdm_c->field
+ zc->zc_guid = zcdm_c->zc_guid;
+ zc->zc_nvlist_conf = zcdm_c->zc_nvlist_conf;
+ zc->zc_nvlist_conf_size = zcdm_c->zc_nvlist_conf_size;
+ zc->zc_nvlist_src = zcdm_c->zc_nvlist_src;
+ zc->zc_nvlist_src_size = zcdm_c->zc_nvlist_src_size;
+ zc->zc_nvlist_dst = zcdm_c->zc_nvlist_dst;
+ zc->zc_nvlist_dst_size = zcdm_c->zc_nvlist_dst_size;
+ zc->zc_cookie = zcdm_c->zc_cookie;
+ zc->zc_objset_type = zcdm_c->zc_objset_type;
+ zc->zc_perm_action = zcdm_c->zc_perm_action;
+ zc->zc_history = zcdm_c->zc_history;
+ zc->zc_history_len = zcdm_c->zc_history_len;
+ zc->zc_history_offset = zcdm_c->zc_history_offset;
+ zc->zc_obj = zcdm_c->zc_obj;
+ zc->zc_iflags = zcdm_c->zc_iflags;
+ zc->zc_share = zcdm_c->zc_share;
+ zc->zc_jailid = zcdm_c->zc_jailid;
+ zc->zc_objset_stats = zcdm_c->zc_objset_stats;
+ zc->zc_begin_record.drr_u.drr_begin = zcdm_c->zc_begin_record;
+ zc->zc_defer_destroy = zcdm_c->zc_defer_destroy;
+ (void)zcdm_c->zc_temphold;
+ zc->zc_action_handle = zcdm_c->zc_action_handle;
+ zc->zc_cleanup_fd = zcdm_c->zc_cleanup_fd;
+ zc->zc_simple = zcdm_c->zc_simple;
+ zc->zc_resumable = B_FALSE;
+ zc->zc_sendobj = zcdm_c->zc_sendobj;
+ zc->zc_fromobj = zcdm_c->zc_fromobj;
+ zc->zc_createtxg = zcdm_c->zc_createtxg;
+ zc->zc_stat = zcdm_c->zc_stat;
+ FIELD_COPY(zc_inject_record.zi_objset);
+ FIELD_COPY(zc_inject_record.zi_object);
+ FIELD_COPY(zc_inject_record.zi_start);
+ FIELD_COPY(zc_inject_record.zi_end);
+ FIELD_COPY(zc_inject_record.zi_guid);
+ FIELD_COPY(zc_inject_record.zi_level);
+ FIELD_COPY(zc_inject_record.zi_error);
+ FIELD_COPY(zc_inject_record.zi_type);
+ FIELD_COPY(zc_inject_record.zi_freq);
+ FIELD_COPY(zc_inject_record.zi_failfast);
+ strlcpy(zc->zc_inject_record.zi_func,
+ zcdm_c->zc_inject_record.zi_func, MAXNAMELEN);
+ FIELD_COPY(zc_inject_record.zi_iotype);
+ FIELD_COPY(zc_inject_record.zi_duration);
+ FIELD_COPY(zc_inject_record.zi_timer);
+ zc->zc_inject_record.zi_nlanes = 1;
+ FIELD_COPY(zc_inject_record.zi_cmd);
+ FIELD_COPY(zc_inject_record.zi_pad);
+
+ /* we always assume zc_nvlist_dst_filled is true */
+ zc->zc_nvlist_dst_filled = B_TRUE;
+#undef FIELD_COPY
+ break;
+
+ case ZFS_CMD_COMPAT_V28:
+ zc28_c = (void *)addr;
+
+ /* zc */
+ strlcpy(zc->zc_name, zc28_c->zc_name, MAXPATHLEN);
+ strlcpy(zc->zc_value, zc28_c->zc_value, MAXPATHLEN * 2);
+ strlcpy(zc->zc_string, zc28_c->zc_string, MAXPATHLEN);
+ zc->zc_guid = zc28_c->zc_guid;
+ zc->zc_nvlist_conf = zc28_c->zc_nvlist_conf;
+ zc->zc_nvlist_conf_size = zc28_c->zc_nvlist_conf_size;
+ zc->zc_nvlist_src = zc28_c->zc_nvlist_src;
+ zc->zc_nvlist_src_size = zc28_c->zc_nvlist_src_size;
+ zc->zc_nvlist_dst = zc28_c->zc_nvlist_dst;
+ zc->zc_nvlist_dst_size = zc28_c->zc_nvlist_dst_size;
+ zc->zc_cookie = zc28_c->zc_cookie;
+ zc->zc_objset_type = zc28_c->zc_objset_type;
+ zc->zc_perm_action = zc28_c->zc_perm_action;
+ zc->zc_history = zc28_c->zc_history;
+ zc->zc_history_len = zc28_c->zc_history_len;
+ zc->zc_history_offset = zc28_c->zc_history_offset;
+ zc->zc_obj = zc28_c->zc_obj;
+ zc->zc_iflags = zc28_c->zc_iflags;
+ zc->zc_share = zc28_c->zc_share;
+ zc->zc_jailid = zc28_c->zc_jailid;
+ zc->zc_objset_stats = zc28_c->zc_objset_stats;
+ zc->zc_begin_record.drr_u.drr_begin = zc28_c->zc_begin_record;
+ zc->zc_defer_destroy = zc28_c->zc_defer_destroy;
+ (void)zc28_c->zc_temphold;
+ zc->zc_action_handle = zc28_c->zc_action_handle;
+ zc->zc_cleanup_fd = zc28_c->zc_cleanup_fd;
+ zc->zc_simple = zc28_c->zc_simple;
+ zc->zc_resumable = B_FALSE;
+ zc->zc_sendobj = zc28_c->zc_sendobj;
+ zc->zc_fromobj = zc28_c->zc_fromobj;
+ zc->zc_createtxg = zc28_c->zc_createtxg;
+ zc->zc_stat = zc28_c->zc_stat;
+
+ /* zc->zc_inject_record */
+ zc->zc_inject_record.zi_objset =
+ zc28_c->zc_inject_record.zi_objset;
+ zc->zc_inject_record.zi_object =
+ zc28_c->zc_inject_record.zi_object;
+ zc->zc_inject_record.zi_start =
+ zc28_c->zc_inject_record.zi_start;
+ zc->zc_inject_record.zi_end =
+ zc28_c->zc_inject_record.zi_end;
+ zc->zc_inject_record.zi_guid =
+ zc28_c->zc_inject_record.zi_guid;
+ zc->zc_inject_record.zi_level =
+ zc28_c->zc_inject_record.zi_level;
+ zc->zc_inject_record.zi_error =
+ zc28_c->zc_inject_record.zi_error;
+ zc->zc_inject_record.zi_type =
+ zc28_c->zc_inject_record.zi_type;
+ zc->zc_inject_record.zi_freq =
+ zc28_c->zc_inject_record.zi_freq;
+ zc->zc_inject_record.zi_failfast =
+ zc28_c->zc_inject_record.zi_failfast;
+ strlcpy(zc->zc_inject_record.zi_func,
+ zc28_c->zc_inject_record.zi_func, MAXNAMELEN);
+ zc->zc_inject_record.zi_iotype =
+ zc28_c->zc_inject_record.zi_iotype;
+ zc->zc_inject_record.zi_duration =
+ zc28_c->zc_inject_record.zi_duration;
+ zc->zc_inject_record.zi_timer =
+ zc28_c->zc_inject_record.zi_timer;
+ zc->zc_inject_record.zi_nlanes = 1;
+ zc->zc_inject_record.zi_cmd = ZINJECT_UNINITIALIZED;
+ zc->zc_inject_record.zi_pad = 0;
+ break;
+
+ case ZFS_CMD_COMPAT_V15:
+ zc_c = (void *)addr;
+
+ /* zc */
+ strlcpy(zc->zc_name, zc_c->zc_name, MAXPATHLEN);
+ strlcpy(zc->zc_value, zc_c->zc_value, MAXPATHLEN);
+ strlcpy(zc->zc_string, zc_c->zc_string, MAXPATHLEN);
+ zc->zc_guid = zc_c->zc_guid;
+ zc->zc_nvlist_conf = zc_c->zc_nvlist_conf;
+ zc->zc_nvlist_conf_size = zc_c->zc_nvlist_conf_size;
+ zc->zc_nvlist_src = zc_c->zc_nvlist_src;
+ zc->zc_nvlist_src_size = zc_c->zc_nvlist_src_size;
+ zc->zc_nvlist_dst = zc_c->zc_nvlist_dst;
+ zc->zc_nvlist_dst_size = zc_c->zc_nvlist_dst_size;
+ zc->zc_cookie = zc_c->zc_cookie;
+ zc->zc_objset_type = zc_c->zc_objset_type;
+ zc->zc_perm_action = zc_c->zc_perm_action;
+ zc->zc_history = zc_c->zc_history;
+ zc->zc_history_len = zc_c->zc_history_len;
+ zc->zc_history_offset = zc_c->zc_history_offset;
+ zc->zc_obj = zc_c->zc_obj;
+ zc->zc_share = zc_c->zc_share;
+ zc->zc_jailid = zc_c->zc_jailid;
+ zc->zc_objset_stats = zc_c->zc_objset_stats;
+ zc->zc_begin_record.drr_u.drr_begin = zc_c->zc_begin_record;
+
+ /* zc->zc_inject_record */
+ zc->zc_inject_record.zi_objset =
+ zc_c->zc_inject_record.zi_objset;
+ zc->zc_inject_record.zi_object =
+ zc_c->zc_inject_record.zi_object;
+ zc->zc_inject_record.zi_start =
+ zc_c->zc_inject_record.zi_start;
+ zc->zc_inject_record.zi_end =
+ zc_c->zc_inject_record.zi_end;
+ zc->zc_inject_record.zi_guid =
+ zc_c->zc_inject_record.zi_guid;
+ zc->zc_inject_record.zi_level =
+ zc_c->zc_inject_record.zi_level;
+ zc->zc_inject_record.zi_error =
+ zc_c->zc_inject_record.zi_error;
+ zc->zc_inject_record.zi_type =
+ zc_c->zc_inject_record.zi_type;
+ zc->zc_inject_record.zi_freq =
+ zc_c->zc_inject_record.zi_freq;
+ zc->zc_inject_record.zi_failfast =
+ zc_c->zc_inject_record.zi_failfast;
+ break;
+ }
+}
+
+void
+zfs_cmd_compat_put(zfs_cmd_t *zc, caddr_t addr, const int request,
+ const int cflag)
+{
+ zfs_cmd_v15_t *zc_c;
+ zfs_cmd_v28_t *zc28_c;
+ zfs_cmd_deadman_t *zcdm_c;
+ zfs_cmd_zcmd_t *zcmd_c;
+ zfs_cmd_edbp_t *edbp_c;
+ zfs_cmd_resume_t *resume_c;
+ zfs_cmd_inlanes_t *inlanes_c;
+
+ switch (cflag) {
+ case ZFS_CMD_COMPAT_INLANES:
+ inlanes_c = (void *)addr;
+ strlcpy(inlanes_c->zc_name, zc->zc_name, MAXPATHLEN);
+ strlcpy(inlanes_c->zc_value, zc->zc_value, MAXPATHLEN * 2);
+ strlcpy(inlanes_c->zc_string, zc->zc_string, MAXPATHLEN);
+
+#define FIELD_COPY(field) inlanes_c->field = zc->field
+ FIELD_COPY(zc_nvlist_src);
+ FIELD_COPY(zc_nvlist_src_size);
+ FIELD_COPY(zc_nvlist_dst);
+ FIELD_COPY(zc_nvlist_dst_size);
+ FIELD_COPY(zc_nvlist_dst_filled);
+ FIELD_COPY(zc_pad2);
+ FIELD_COPY(zc_history);
+ FIELD_COPY(zc_guid);
+ FIELD_COPY(zc_nvlist_conf);
+ FIELD_COPY(zc_nvlist_conf_size);
+ FIELD_COPY(zc_cookie);
+ FIELD_COPY(zc_objset_type);
+ FIELD_COPY(zc_perm_action);
+ FIELD_COPY(zc_history_len);
+ FIELD_COPY(zc_history_offset);
+ FIELD_COPY(zc_obj);
+ FIELD_COPY(zc_iflags);
+ FIELD_COPY(zc_share);
+ FIELD_COPY(zc_jailid);
+ FIELD_COPY(zc_objset_stats);
+ FIELD_COPY(zc_begin_record);
+ FIELD_COPY(zc_inject_record);
+ FIELD_COPY(zc_defer_destroy);
+ FIELD_COPY(zc_flags);
+ FIELD_COPY(zc_action_handle);
+ FIELD_COPY(zc_cleanup_fd);
+ FIELD_COPY(zc_simple);
+ FIELD_COPY(zc_sendobj);
+ FIELD_COPY(zc_fromobj);
+ FIELD_COPY(zc_createtxg);
+ FIELD_COPY(zc_stat);
+#undef FIELD_COPY
+ break;
+
+ case ZFS_CMD_COMPAT_RESUME:
+ resume_c = (void *)addr;
+ strlcpy(resume_c->zc_name, zc->zc_name, MAXPATHLEN);
+ strlcpy(resume_c->zc_value, zc->zc_value, MAXPATHLEN * 2);
+ strlcpy(resume_c->zc_string, zc->zc_string, MAXPATHLEN);
+
+#define FIELD_COPY(field) resume_c->field = zc->field
+ FIELD_COPY(zc_nvlist_src);
+ FIELD_COPY(zc_nvlist_src_size);
+ FIELD_COPY(zc_nvlist_dst);
+ FIELD_COPY(zc_nvlist_dst_size);
+ FIELD_COPY(zc_nvlist_dst_filled);
+ FIELD_COPY(zc_pad2);
+ FIELD_COPY(zc_history);
+ FIELD_COPY(zc_guid);
+ FIELD_COPY(zc_nvlist_conf);
+ FIELD_COPY(zc_nvlist_conf_size);
+ FIELD_COPY(zc_cookie);
+ FIELD_COPY(zc_objset_type);
+ FIELD_COPY(zc_perm_action);
+ FIELD_COPY(zc_history_len);
+ FIELD_COPY(zc_history_offset);
+ FIELD_COPY(zc_obj);
+ FIELD_COPY(zc_iflags);
+ FIELD_COPY(zc_share);
+ FIELD_COPY(zc_jailid);
+ FIELD_COPY(zc_objset_stats);
+ FIELD_COPY(zc_begin_record);
+ FIELD_COPY(zc_inject_record.zi_objset);
+ FIELD_COPY(zc_inject_record.zi_object);
+ FIELD_COPY(zc_inject_record.zi_start);
+ FIELD_COPY(zc_inject_record.zi_end);
+ FIELD_COPY(zc_inject_record.zi_guid);
+ FIELD_COPY(zc_inject_record.zi_level);
+ FIELD_COPY(zc_inject_record.zi_error);
+ FIELD_COPY(zc_inject_record.zi_type);
+ FIELD_COPY(zc_inject_record.zi_freq);
+ FIELD_COPY(zc_inject_record.zi_failfast);
+ strlcpy(resume_c->zc_inject_record.zi_func,
+ zc->zc_inject_record.zi_func, MAXNAMELEN);
+ FIELD_COPY(zc_inject_record.zi_iotype);
+ FIELD_COPY(zc_inject_record.zi_duration);
+ FIELD_COPY(zc_inject_record.zi_timer);
+ FIELD_COPY(zc_inject_record.zi_cmd);
+ FIELD_COPY(zc_inject_record.zi_pad);
+ FIELD_COPY(zc_defer_destroy);
+ FIELD_COPY(zc_flags);
+ FIELD_COPY(zc_action_handle);
+ FIELD_COPY(zc_cleanup_fd);
+ FIELD_COPY(zc_simple);
+ FIELD_COPY(zc_sendobj);
+ FIELD_COPY(zc_fromobj);
+ FIELD_COPY(zc_createtxg);
+ FIELD_COPY(zc_stat);
+#undef FIELD_COPY
+ break;
+
+ case ZFS_CMD_COMPAT_EDBP:
+ edbp_c = (void *)addr;
+ strlcpy(edbp_c->zc_name, zc->zc_name, MAXPATHLEN);
+ strlcpy(edbp_c->zc_value, zc->zc_value, MAXPATHLEN * 2);
+ strlcpy(edbp_c->zc_string, zc->zc_string, MAXPATHLEN);
+
+#define FIELD_COPY(field) edbp_c->field = zc->field
+ FIELD_COPY(zc_nvlist_src);
+ FIELD_COPY(zc_nvlist_src_size);
+ FIELD_COPY(zc_nvlist_dst);
+ FIELD_COPY(zc_nvlist_dst_size);
+ FIELD_COPY(zc_nvlist_dst_filled);
+ FIELD_COPY(zc_pad2);
+ FIELD_COPY(zc_history);
+ FIELD_COPY(zc_guid);
+ FIELD_COPY(zc_nvlist_conf);
+ FIELD_COPY(zc_nvlist_conf_size);
+ FIELD_COPY(zc_cookie);
+ FIELD_COPY(zc_objset_type);
+ FIELD_COPY(zc_perm_action);
+ FIELD_COPY(zc_history_len);
+ FIELD_COPY(zc_history_offset);
+ FIELD_COPY(zc_obj);
+ FIELD_COPY(zc_iflags);
+ FIELD_COPY(zc_share);
+ FIELD_COPY(zc_jailid);
+ FIELD_COPY(zc_objset_stats);
+ edbp_c->zc_begin_record = zc->zc_begin_record.drr_u.drr_begin;
+ FIELD_COPY(zc_inject_record.zi_objset);
+ FIELD_COPY(zc_inject_record.zi_object);
+ FIELD_COPY(zc_inject_record.zi_start);
+ FIELD_COPY(zc_inject_record.zi_end);
+ FIELD_COPY(zc_inject_record.zi_guid);
+ FIELD_COPY(zc_inject_record.zi_level);
+ FIELD_COPY(zc_inject_record.zi_error);
+ FIELD_COPY(zc_inject_record.zi_type);
+ FIELD_COPY(zc_inject_record.zi_freq);
+ FIELD_COPY(zc_inject_record.zi_failfast);
+ strlcpy(edbp_c->zc_inject_record.zi_func,
+ zc->zc_inject_record.zi_func, MAXNAMELEN);
+ FIELD_COPY(zc_inject_record.zi_iotype);
+ FIELD_COPY(zc_inject_record.zi_duration);
+ FIELD_COPY(zc_inject_record.zi_timer);
+ FIELD_COPY(zc_inject_record.zi_cmd);
+ FIELD_COPY(zc_inject_record.zi_pad);
+ FIELD_COPY(zc_defer_destroy);
+ FIELD_COPY(zc_flags);
+ FIELD_COPY(zc_action_handle);
+ FIELD_COPY(zc_cleanup_fd);
+ FIELD_COPY(zc_simple);
+ FIELD_COPY(zc_sendobj);
+ FIELD_COPY(zc_fromobj);
+ FIELD_COPY(zc_createtxg);
+ FIELD_COPY(zc_stat);
+#undef FIELD_COPY
+ break;
+
+ case ZFS_CMD_COMPAT_ZCMD:
+ zcmd_c = (void *)addr;
+ /* zc */
+ strlcpy(zcmd_c->zc_name, zc->zc_name, MAXPATHLEN);
+ strlcpy(zcmd_c->zc_value, zc->zc_value, MAXPATHLEN * 2);
+ strlcpy(zcmd_c->zc_string, zc->zc_string, MAXPATHLEN);
+
+#define FIELD_COPY(field) zcmd_c->field = zc->field
+ FIELD_COPY(zc_nvlist_src);
+ FIELD_COPY(zc_nvlist_src_size);
+ FIELD_COPY(zc_nvlist_dst);
+ FIELD_COPY(zc_nvlist_dst_size);
+ FIELD_COPY(zc_nvlist_dst_filled);
+ FIELD_COPY(zc_pad2);
+ FIELD_COPY(zc_history);
+ FIELD_COPY(zc_guid);
+ FIELD_COPY(zc_nvlist_conf);
+ FIELD_COPY(zc_nvlist_conf_size);
+ FIELD_COPY(zc_cookie);
+ FIELD_COPY(zc_objset_type);
+ FIELD_COPY(zc_perm_action);
+ FIELD_COPY(zc_history_len);
+ FIELD_COPY(zc_history_offset);
+ FIELD_COPY(zc_obj);
+ FIELD_COPY(zc_iflags);
+ FIELD_COPY(zc_share);
+ FIELD_COPY(zc_jailid);
+ FIELD_COPY(zc_objset_stats);
+ zcmd_c->zc_begin_record = zc->zc_begin_record.drr_u.drr_begin;
+ FIELD_COPY(zc_inject_record.zi_objset);
+ FIELD_COPY(zc_inject_record.zi_object);
+ FIELD_COPY(zc_inject_record.zi_start);
+ FIELD_COPY(zc_inject_record.zi_end);
+ FIELD_COPY(zc_inject_record.zi_guid);
+ FIELD_COPY(zc_inject_record.zi_level);
+ FIELD_COPY(zc_inject_record.zi_error);
+ FIELD_COPY(zc_inject_record.zi_type);
+ FIELD_COPY(zc_inject_record.zi_freq);
+ FIELD_COPY(zc_inject_record.zi_failfast);
+ strlcpy(zcmd_c->zc_inject_record.zi_func,
+ zc->zc_inject_record.zi_func, MAXNAMELEN);
+ FIELD_COPY(zc_inject_record.zi_iotype);
+ FIELD_COPY(zc_inject_record.zi_duration);
+ FIELD_COPY(zc_inject_record.zi_timer);
+ FIELD_COPY(zc_inject_record.zi_cmd);
+ FIELD_COPY(zc_inject_record.zi_pad);
+
+ /* boolean_t -> uint32_t */
+ zcmd_c->zc_defer_destroy = (uint32_t)(zc->zc_defer_destroy);
+ zcmd_c->zc_temphold = 0;
+
+ FIELD_COPY(zc_action_handle);
+ FIELD_COPY(zc_cleanup_fd);
+ FIELD_COPY(zc_simple);
+ FIELD_COPY(zc_sendobj);
+ FIELD_COPY(zc_fromobj);
+ FIELD_COPY(zc_createtxg);
+ FIELD_COPY(zc_stat);
+#undef FIELD_COPY
+
+ break;
+
+ case ZFS_CMD_COMPAT_DEADMAN:
+ zcdm_c = (void *)addr;
+
+ strlcpy(zcdm_c->zc_name, zc->zc_name, MAXPATHLEN);
+ strlcpy(zcdm_c->zc_value, zc->zc_value, MAXPATHLEN * 2);
+ strlcpy(zcdm_c->zc_string, zc->zc_string, MAXPATHLEN);
+
+#define FIELD_COPY(field) zcdm_c->field = zc->field
+ zcdm_c->zc_guid = zc->zc_guid;
+ zcdm_c->zc_nvlist_conf = zc->zc_nvlist_conf;
+ zcdm_c->zc_nvlist_conf_size = zc->zc_nvlist_conf_size;
+ zcdm_c->zc_nvlist_src = zc->zc_nvlist_src;
+ zcdm_c->zc_nvlist_src_size = zc->zc_nvlist_src_size;
+ zcdm_c->zc_nvlist_dst = zc->zc_nvlist_dst;
+ zcdm_c->zc_nvlist_dst_size = zc->zc_nvlist_dst_size;
+ zcdm_c->zc_cookie = zc->zc_cookie;
+ zcdm_c->zc_objset_type = zc->zc_objset_type;
+ zcdm_c->zc_perm_action = zc->zc_perm_action;
+ zcdm_c->zc_history = zc->zc_history;
+ zcdm_c->zc_history_len = zc->zc_history_len;
+ zcdm_c->zc_history_offset = zc->zc_history_offset;
+ zcdm_c->zc_obj = zc->zc_obj;
+ zcdm_c->zc_iflags = zc->zc_iflags;
+ zcdm_c->zc_share = zc->zc_share;
+ zcdm_c->zc_jailid = zc->zc_jailid;
+ zcdm_c->zc_objset_stats = zc->zc_objset_stats;
+ zcdm_c->zc_begin_record = zc->zc_begin_record.drr_u.drr_begin;
+ zcdm_c->zc_defer_destroy = zc->zc_defer_destroy;
+ zcdm_c->zc_temphold = 0;
+ zcdm_c->zc_action_handle = zc->zc_action_handle;
+ zcdm_c->zc_cleanup_fd = zc->zc_cleanup_fd;
+ zcdm_c->zc_simple = zc->zc_simple;
+ zcdm_c->zc_sendobj = zc->zc_sendobj;
+ zcdm_c->zc_fromobj = zc->zc_fromobj;
+ zcdm_c->zc_createtxg = zc->zc_createtxg;
+ zcdm_c->zc_stat = zc->zc_stat;
+ FIELD_COPY(zc_inject_record.zi_objset);
+ FIELD_COPY(zc_inject_record.zi_object);
+ FIELD_COPY(zc_inject_record.zi_start);
+ FIELD_COPY(zc_inject_record.zi_end);
+ FIELD_COPY(zc_inject_record.zi_guid);
+ FIELD_COPY(zc_inject_record.zi_level);
+ FIELD_COPY(zc_inject_record.zi_error);
+ FIELD_COPY(zc_inject_record.zi_type);
+ FIELD_COPY(zc_inject_record.zi_freq);
+ FIELD_COPY(zc_inject_record.zi_failfast);
+ strlcpy(zcdm_c->zc_inject_record.zi_func,
+ zc->zc_inject_record.zi_func, MAXNAMELEN);
+ FIELD_COPY(zc_inject_record.zi_iotype);
+ FIELD_COPY(zc_inject_record.zi_duration);
+ FIELD_COPY(zc_inject_record.zi_timer);
+ FIELD_COPY(zc_inject_record.zi_cmd);
+ FIELD_COPY(zc_inject_record.zi_pad);
+#undef FIELD_COPY
+#ifndef _KERNEL
+ if (request == ZFS_IOC_RECV)
+ strlcpy(zcdm_c->zc_top_ds,
+ zc->zc_value + strlen(zc->zc_value) + 1,
+ (MAXPATHLEN * 2) - strlen(zc->zc_value) - 1);
+#endif
+ break;
+
+ case ZFS_CMD_COMPAT_V28:
+ zc28_c = (void *)addr;
+
+ strlcpy(zc28_c->zc_name, zc->zc_name, MAXPATHLEN);
+ strlcpy(zc28_c->zc_value, zc->zc_value, MAXPATHLEN * 2);
+ strlcpy(zc28_c->zc_string, zc->zc_string, MAXPATHLEN);
+ zc28_c->zc_guid = zc->zc_guid;
+ zc28_c->zc_nvlist_conf = zc->zc_nvlist_conf;
+ zc28_c->zc_nvlist_conf_size = zc->zc_nvlist_conf_size;
+ zc28_c->zc_nvlist_src = zc->zc_nvlist_src;
+ zc28_c->zc_nvlist_src_size = zc->zc_nvlist_src_size;
+ zc28_c->zc_nvlist_dst = zc->zc_nvlist_dst;
+ zc28_c->zc_nvlist_dst_size = zc->zc_nvlist_dst_size;
+ zc28_c->zc_cookie = zc->zc_cookie;
+ zc28_c->zc_objset_type = zc->zc_objset_type;
+ zc28_c->zc_perm_action = zc->zc_perm_action;
+ zc28_c->zc_history = zc->zc_history;
+ zc28_c->zc_history_len = zc->zc_history_len;
+ zc28_c->zc_history_offset = zc->zc_history_offset;
+ zc28_c->zc_obj = zc->zc_obj;
+ zc28_c->zc_iflags = zc->zc_iflags;
+ zc28_c->zc_share = zc->zc_share;
+ zc28_c->zc_jailid = zc->zc_jailid;
+ zc28_c->zc_objset_stats = zc->zc_objset_stats;
+ zc28_c->zc_begin_record = zc->zc_begin_record.drr_u.drr_begin;
+ zc28_c->zc_defer_destroy = zc->zc_defer_destroy;
+ zc28_c->zc_temphold = 0;
+ zc28_c->zc_action_handle = zc->zc_action_handle;
+ zc28_c->zc_cleanup_fd = zc->zc_cleanup_fd;
+ zc28_c->zc_simple = zc->zc_simple;
+ zc28_c->zc_sendobj = zc->zc_sendobj;
+ zc28_c->zc_fromobj = zc->zc_fromobj;
+ zc28_c->zc_createtxg = zc->zc_createtxg;
+ zc28_c->zc_stat = zc->zc_stat;
+#ifndef _KERNEL
+ if (request == ZFS_IOC_RECV)
+ strlcpy(zc28_c->zc_top_ds,
+ zc->zc_value + strlen(zc->zc_value) + 1,
+ MAXPATHLEN * 2 - strlen(zc->zc_value) - 1);
+#endif
+ /* zc_inject_record */
+ zc28_c->zc_inject_record.zi_objset =
+ zc->zc_inject_record.zi_objset;
+ zc28_c->zc_inject_record.zi_object =
+ zc->zc_inject_record.zi_object;
+ zc28_c->zc_inject_record.zi_start =
+ zc->zc_inject_record.zi_start;
+ zc28_c->zc_inject_record.zi_end =
+ zc->zc_inject_record.zi_end;
+ zc28_c->zc_inject_record.zi_guid =
+ zc->zc_inject_record.zi_guid;
+ zc28_c->zc_inject_record.zi_level =
+ zc->zc_inject_record.zi_level;
+ zc28_c->zc_inject_record.zi_error =
+ zc->zc_inject_record.zi_error;
+ zc28_c->zc_inject_record.zi_type =
+ zc->zc_inject_record.zi_type;
+ zc28_c->zc_inject_record.zi_freq =
+ zc->zc_inject_record.zi_freq;
+ zc28_c->zc_inject_record.zi_failfast =
+ zc->zc_inject_record.zi_failfast;
+ strlcpy(zc28_c->zc_inject_record.zi_func,
+ zc->zc_inject_record.zi_func, MAXNAMELEN);
+ zc28_c->zc_inject_record.zi_iotype =
+ zc->zc_inject_record.zi_iotype;
+ zc28_c->zc_inject_record.zi_duration =
+ zc->zc_inject_record.zi_duration;
+ zc28_c->zc_inject_record.zi_timer =
+ zc->zc_inject_record.zi_timer;
+ break;
+
+ case ZFS_CMD_COMPAT_V15:
+ zc_c = (void *)addr;
+
+ /* zc */
+ strlcpy(zc_c->zc_name, zc->zc_name, MAXPATHLEN);
+ strlcpy(zc_c->zc_value, zc->zc_value, MAXPATHLEN);
+ strlcpy(zc_c->zc_string, zc->zc_string, MAXPATHLEN);
+ zc_c->zc_guid = zc->zc_guid;
+ zc_c->zc_nvlist_conf = zc->zc_nvlist_conf;
+ zc_c->zc_nvlist_conf_size = zc->zc_nvlist_conf_size;
+ zc_c->zc_nvlist_src = zc->zc_nvlist_src;
+ zc_c->zc_nvlist_src_size = zc->zc_nvlist_src_size;
+ zc_c->zc_nvlist_dst = zc->zc_nvlist_dst;
+ zc_c->zc_nvlist_dst_size = zc->zc_nvlist_dst_size;
+ zc_c->zc_cookie = zc->zc_cookie;
+ zc_c->zc_objset_type = zc->zc_objset_type;
+ zc_c->zc_perm_action = zc->zc_perm_action;
+ zc_c->zc_history = zc->zc_history;
+ zc_c->zc_history_len = zc->zc_history_len;
+ zc_c->zc_history_offset = zc->zc_history_offset;
+ zc_c->zc_obj = zc->zc_obj;
+ zc_c->zc_share = zc->zc_share;
+ zc_c->zc_jailid = zc->zc_jailid;
+ zc_c->zc_objset_stats = zc->zc_objset_stats;
+ zc_c->zc_begin_record = zc->zc_begin_record.drr_u.drr_begin;
+
+ /* zc_inject_record */
+ zc_c->zc_inject_record.zi_objset =
+ zc->zc_inject_record.zi_objset;
+ zc_c->zc_inject_record.zi_object =
+ zc->zc_inject_record.zi_object;
+ zc_c->zc_inject_record.zi_start =
+ zc->zc_inject_record.zi_start;
+ zc_c->zc_inject_record.zi_end =
+ zc->zc_inject_record.zi_end;
+ zc_c->zc_inject_record.zi_guid =
+ zc->zc_inject_record.zi_guid;
+ zc_c->zc_inject_record.zi_level =
+ zc->zc_inject_record.zi_level;
+ zc_c->zc_inject_record.zi_error =
+ zc->zc_inject_record.zi_error;
+ zc_c->zc_inject_record.zi_type =
+ zc->zc_inject_record.zi_type;
+ zc_c->zc_inject_record.zi_freq =
+ zc->zc_inject_record.zi_freq;
+ zc_c->zc_inject_record.zi_failfast =
+ zc->zc_inject_record.zi_failfast;
+
+ break;
+ }
+}
+
+static int
+zfs_ioctl_compat_get_nvlist(uint64_t nvl, size_t size, int iflag,
+ nvlist_t **nvp)
+{
+ char *packed;
+ int error;
+ nvlist_t *list = NULL;
+
+ /*
+ * Read in and unpack the user-supplied nvlist.
+ */
+ if (size == 0)
+ return (EINVAL);
+
+#ifdef _KERNEL
+ packed = kmem_alloc(size, KM_SLEEP);
+ if ((error = ddi_copyin((void *)(uintptr_t)nvl, packed, size,
+ iflag)) != 0) {
+ kmem_free(packed, size);
+ return (error);
+ }
+#else
+ packed = (void *)(uintptr_t)nvl;
+#endif
+
+ error = nvlist_unpack(packed, size, &list, 0);
+
+#ifdef _KERNEL
+ kmem_free(packed, size);
+#endif
+
+ if (error != 0)
+ return (error);
+
+ *nvp = list;
+ return (0);
+}
+
+static int
+zfs_ioctl_compat_put_nvlist(zfs_cmd_t *zc, nvlist_t *nvl)
+{
+ char *packed = NULL;
+ int error = 0;
+ size_t size;
+
+ VERIFY(nvlist_size(nvl, &size, NV_ENCODE_NATIVE) == 0);
+
+#ifdef _KERNEL
+ packed = kmem_alloc(size, KM_SLEEP);
+ VERIFY(nvlist_pack(nvl, &packed, &size, NV_ENCODE_NATIVE,
+ KM_SLEEP) == 0);
+
+ if (ddi_copyout(packed,
+ (void *)(uintptr_t)zc->zc_nvlist_dst, size, zc->zc_iflags) != 0)
+ error = EFAULT;
+ kmem_free(packed, size);
+#else
+ packed = (void *)(uintptr_t)zc->zc_nvlist_dst;
+ VERIFY(nvlist_pack(nvl, &packed, &size, NV_ENCODE_NATIVE,
+ 0) == 0);
+#endif
+
+ zc->zc_nvlist_dst_size = size;
+ return (error);
+}
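
Both helpers are thin wrappers around the nvlist pack/unpack machinery, and the same round trip can be exercised from userland with libnvpair alone. A minimal sketch (the "pool"/"tank" pair is illustrative; assumes linking with -lnvpair on FreeBSD):

#include <stdlib.h>
#include <libnvpair.h>

int
main(void)
{
	nvlist_t *nvl, *out;
	char *packed = NULL;
	size_t size = 0;

	/* Build and pack a small nvlist, as an ioctl caller would. */
	if (nvlist_alloc(&nvl, NV_UNIQUE_NAME, 0) != 0)
		return (1);
	nvlist_add_string(nvl, "pool", "tank");
	if (nvlist_pack(nvl, &packed, &size, NV_ENCODE_NATIVE, 0) != 0)
		return (1);

	/* Unpack it again, as zfs_ioctl_compat_get_nvlist() does. */
	if (nvlist_unpack(packed, size, &out, 0) != 0)
		return (1);
	dump_nvlist(out, 4);

	free(packed);
	nvlist_free(out);
	nvlist_free(nvl);
	return (0);
}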
+
+static void
+zfs_ioctl_compat_fix_stats_nvlist(nvlist_t *nvl)
+{
+ nvlist_t **child;
+ nvlist_t *nvroot = NULL;
+ vdev_stat_t *vs;
+ uint_t c, children, nelem;
+
+ if (nvlist_lookup_nvlist_array(nvl, ZPOOL_CONFIG_CHILDREN,
+ &child, &children) == 0) {
+ for (c = 0; c < children; c++) {
+ zfs_ioctl_compat_fix_stats_nvlist(child[c]);
+ }
+ }
+
+ if (nvlist_lookup_nvlist(nvl, ZPOOL_CONFIG_VDEV_TREE,
+ &nvroot) == 0)
+ zfs_ioctl_compat_fix_stats_nvlist(nvroot);
+#ifdef _KERNEL
+ if (nvlist_lookup_uint64_array(nvl, ZPOOL_CONFIG_VDEV_STATS,
+#else
+ if (nvlist_lookup_uint64_array(nvl, "stats",
+#endif
+ (uint64_t **)&vs, &nelem) == 0) {
+ nvlist_add_uint64_array(nvl,
+#ifdef _KERNEL
+ "stats",
+#else
+ ZPOOL_CONFIG_VDEV_STATS,
+#endif
+ (uint64_t *)vs, nelem);
+#ifdef _KERNEL
+ nvlist_remove(nvl, ZPOOL_CONFIG_VDEV_STATS,
+#else
+ nvlist_remove(nvl, "stats",
+#endif
+ DATA_TYPE_UINT64_ARRAY);
+ }
+}
+
+static int
+zfs_ioctl_compat_fix_stats(zfs_cmd_t *zc, const int nc)
+{
+ nvlist_t *nv, *nvp = NULL;
+ nvpair_t *elem;
+ int error;
+
+ if ((error = zfs_ioctl_compat_get_nvlist(zc->zc_nvlist_dst,
+ zc->zc_nvlist_dst_size, zc->zc_iflags, &nv)) != 0)
+ return (error);
+
+ if (nc == 5) { /* ZFS_IOC_POOL_STATS */
+ elem = NULL;
+ while ((elem = nvlist_next_nvpair(nv, elem)) != NULL) {
+ if (nvpair_value_nvlist(elem, &nvp) == 0)
+ zfs_ioctl_compat_fix_stats_nvlist(nvp);
+ }
+ elem = NULL;
+ } else
+ zfs_ioctl_compat_fix_stats_nvlist(nv);
+
+ error = zfs_ioctl_compat_put_nvlist(zc, nv);
+
+ nvlist_free(nv);
+
+ return (error);
+}
+
+static int
+zfs_ioctl_compat_pool_get_props(zfs_cmd_t *zc)
+{
+ nvlist_t *nv, *nva = NULL;
+ int error;
+
+ if ((error = zfs_ioctl_compat_get_nvlist(zc->zc_nvlist_dst,
+ zc->zc_nvlist_dst_size, zc->zc_iflags, &nv)) != 0)
+ return (error);
+
+#ifdef _KERNEL
+ if (nvlist_lookup_nvlist(nv, "allocated", &nva) == 0) {
+ nvlist_add_nvlist(nv, "used", nva);
+ nvlist_remove(nv, "allocated", DATA_TYPE_NVLIST);
+ }
+
+ if (nvlist_lookup_nvlist(nv, "free", &nva) == 0) {
+ nvlist_add_nvlist(nv, "available", nva);
+ nvlist_remove(nv, "free", DATA_TYPE_NVLIST);
+ }
+#else
+ if (nvlist_lookup_nvlist(nv, "used", &nva) == 0) {
+ nvlist_add_nvlist(nv, "allocated", nva);
+ nvlist_remove(nv, "used", DATA_TYPE_NVLIST);
+ }
+
+ if (nvlist_lookup_nvlist(nv, "available", &nva) == 0) {
+ nvlist_add_nvlist(nv, "free", nva);
+ nvlist_remove(nv, "available", DATA_TYPE_NVLIST);
+ }
+#endif
+
+ error = zfs_ioctl_compat_put_nvlist(zc, nv);
+
+ nvlist_free(nv);
+
+ return (error);
+}
+
+#ifndef _KERNEL
+int
+zcmd_ioctl_compat(int fd, int request, zfs_cmd_t *zc, const int cflag)
+{
+ int nc, ret;
+ void *zc_c;
+ unsigned long ncmd;
+ zfs_iocparm_t zp;
+
+ switch (cflag) {
+ case ZFS_CMD_COMPAT_NONE:
+ ncmd = _IOWR('Z', request, struct zfs_iocparm);
+ zp.zfs_cmd = (uint64_t)zc;
+ zp.zfs_cmd_size = sizeof(zfs_cmd_t);
+ zp.zfs_ioctl_version = ZFS_IOCVER_CURRENT;
+ return (ioctl(fd, ncmd, &zp));
+ case ZFS_CMD_COMPAT_INLANES:
+ ncmd = _IOWR('Z', request, struct zfs_iocparm);
+ zp.zfs_cmd = (uint64_t)zc;
+ zp.zfs_cmd_size = sizeof(zfs_cmd_inlanes_t);
+ zp.zfs_ioctl_version = ZFS_IOCVER_INLANES;
+ return (ioctl(fd, ncmd, &zp));
+ case ZFS_CMD_COMPAT_RESUME:
+ ncmd = _IOWR('Z', request, struct zfs_iocparm);
+ zp.zfs_cmd = (uint64_t)zc;
+ zp.zfs_cmd_size = sizeof(zfs_cmd_resume_t);
+ zp.zfs_ioctl_version = ZFS_IOCVER_RESUME;
+ return (ioctl(fd, ncmd, &zp));
+ case ZFS_CMD_COMPAT_EDBP:
+ ncmd = _IOWR('Z', request, struct zfs_iocparm);
+ zp.zfs_cmd = (uint64_t)zc;
+ zp.zfs_cmd_size = sizeof(zfs_cmd_edbp_t);
+ zp.zfs_ioctl_version = ZFS_IOCVER_EDBP;
+ return (ioctl(fd, ncmd, &zp));
+ case ZFS_CMD_COMPAT_ZCMD:
+ ncmd = _IOWR('Z', request, struct zfs_iocparm);
+ zp.zfs_cmd = (uint64_t)zc;
+ zp.zfs_cmd_size = sizeof(zfs_cmd_zcmd_t);
+ zp.zfs_ioctl_version = ZFS_IOCVER_ZCMD;
+ return (ioctl(fd, ncmd, &zp));
+ case ZFS_CMD_COMPAT_LZC:
+ ncmd = _IOWR('Z', request, struct zfs_cmd);
+ return (ioctl(fd, ncmd, zc));
+ case ZFS_CMD_COMPAT_DEADMAN:
+ zc_c = malloc(sizeof(zfs_cmd_deadman_t));
+ ncmd = _IOWR('Z', request, struct zfs_cmd_deadman);
+ break;
+ case ZFS_CMD_COMPAT_V28:
+ zc_c = malloc(sizeof(zfs_cmd_v28_t));
+ ncmd = _IOWR('Z', request, struct zfs_cmd_v28);
+ break;
+ case ZFS_CMD_COMPAT_V15:
+ nc = zfs_ioctl_v28_to_v15[request];
+ zc_c = malloc(sizeof(zfs_cmd_v15_t));
+ ncmd = _IOWR('Z', nc, struct zfs_cmd_v15);
+ break;
+ default:
+ return (EINVAL);
+ }
+
+ if (ZFS_IOCREQ(ncmd) == ZFS_IOC_COMPAT_FAIL)
+ return (ENOTSUP);
+
+ zfs_cmd_compat_put(zc, (caddr_t)zc_c, request, cflag);
+
+ ret = ioctl(fd, ncmd, zc_c);
+ if (cflag == ZFS_CMD_COMPAT_V15 &&
+ nc == ZFS_IOC_POOL_IMPORT)
+ ret = ioctl(fd, _IOWR('Z', ZFS_IOC_POOL_CONFIGS,
+ struct zfs_cmd_v15), zc_c);
+ zfs_cmd_compat_get(zc, (caddr_t)zc_c, cflag);
+ free(zc_c);
+
+ if (cflag == ZFS_CMD_COMPAT_V15) {
+ switch (nc) {
+ case ZFS_IOC_POOL_IMPORT:
+ case ZFS_IOC_POOL_CONFIGS:
+ case ZFS_IOC_POOL_STATS:
+ case ZFS_IOC_POOL_TRYIMPORT:
+ zfs_ioctl_compat_fix_stats(zc, nc);
+ break;
+ case 41: /* ZFS_IOC_POOL_GET_PROPS (v15) */
+ zfs_ioctl_compat_pool_get_props(zc);
+ break;
+ }
+ }
+
+ return (ret);
+}
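
A hypothetical userland caller of the shim against a v28 kernel could look like the following (pool name and error handling are illustrative; assumes the zcmd_ioctl_compat() prototype is exported by zfs_ioctl_compat.h and that /dev/zfs is present):

#include <fcntl.h>
#include <string.h>
#include <unistd.h>
#include <err.h>

#include "zfs_ioctl_compat.h"

static void
pool_stats_v28(const char *pool)
{
	zfs_cmd_t zc = { 0 };
	int fd;

	if ((fd = open("/dev/zfs", O_RDWR)) < 0)
		err(1, "/dev/zfs");
	strlcpy(zc.zc_name, pool, sizeof (zc.zc_name));
	/* The shim converts to the v28 layout, ioctls, and converts back. */
	if (zcmd_ioctl_compat(fd, ZFS_IOC_POOL_STATS, &zc,
	    ZFS_CMD_COMPAT_V28) != 0)
		warn("ZFS_IOC_POOL_STATS");
	(void) close(fd);
}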
+#else /* _KERNEL */
+int
+zfs_ioctl_compat_pre(zfs_cmd_t *zc, int *vec, const int cflag)
+{
+ int error = 0;
+
+ /* are we creating a clone? */
+ if (*vec == ZFS_IOC_CREATE && zc->zc_value[0] != '\0')
+ *vec = ZFS_IOC_CLONE;
+
+ if (cflag == ZFS_CMD_COMPAT_V15) {
+ switch (*vec) {
+
+ case 7: /* ZFS_IOC_POOL_SCRUB (v15) */
+ zc->zc_cookie = POOL_SCAN_SCRUB;
+ break;
+ }
+ }
+
+ return (error);
+}
+
+void
+zfs_ioctl_compat_post(zfs_cmd_t *zc, int vec, const int cflag)
+{
+ if (cflag == ZFS_CMD_COMPAT_V15) {
+ switch (vec) {
+ case ZFS_IOC_POOL_CONFIGS:
+ case ZFS_IOC_POOL_STATS:
+ case ZFS_IOC_POOL_TRYIMPORT:
+ zfs_ioctl_compat_fix_stats(zc, vec);
+ break;
+ case 41: /* ZFS_IOC_POOL_GET_PROPS (v15) */
+ zfs_ioctl_compat_pool_get_props(zc);
+ break;
+ }
+ }
+}
+
+nvlist_t *
+zfs_ioctl_compat_innvl(zfs_cmd_t *zc, nvlist_t *innvl, const int vec,
+ const int cflag)
+{
+ nvlist_t *nvl, *tmpnvl, *hnvl;
+ nvpair_t *elem;
+ char *poolname, *snapname;
+ int err;
+
+ if (cflag == ZFS_CMD_COMPAT_NONE || cflag == ZFS_CMD_COMPAT_LZC ||
+ cflag == ZFS_CMD_COMPAT_ZCMD || cflag == ZFS_CMD_COMPAT_EDBP ||
+ cflag == ZFS_CMD_COMPAT_RESUME || cflag == ZFS_CMD_COMPAT_INLANES)
+ goto out;
+
+ switch (vec) {
+ case ZFS_IOC_CREATE:
+ nvl = fnvlist_alloc();
+ fnvlist_add_int32(nvl, "type", zc->zc_objset_type);
+ if (innvl != NULL) {
+ fnvlist_add_nvlist(nvl, "props", innvl);
+ nvlist_free(innvl);
+ }
+ return (nvl);
+ break;
+ case ZFS_IOC_CLONE:
+ nvl = fnvlist_alloc();
+ fnvlist_add_string(nvl, "origin", zc->zc_value);
+ if (innvl != NULL) {
+ fnvlist_add_nvlist(nvl, "props", innvl);
+ nvlist_free(innvl);
+ }
+ return (nvl);
+ break;
+ case ZFS_IOC_SNAPSHOT:
+ if (innvl == NULL)
+ goto out;
+ nvl = fnvlist_alloc();
+ fnvlist_add_nvlist(nvl, "props", innvl);
+ tmpnvl = fnvlist_alloc();
+ snapname = kmem_asprintf("%s@%s", zc->zc_name, zc->zc_value);
+ fnvlist_add_boolean(tmpnvl, snapname);
+ kmem_free(snapname, strlen(snapname) + 1);
+ /* check if we are doing a recursive snapshot */
+ if (zc->zc_cookie)
+ dmu_get_recursive_snaps_nvl(zc->zc_name, zc->zc_value,
+ tmpnvl);
+ fnvlist_add_nvlist(nvl, "snaps", tmpnvl);
+ fnvlist_free(tmpnvl);
+ nvlist_free(innvl);
+ /* strip dataset part from zc->zc_name */
+ zc->zc_name[strcspn(zc->zc_name, "/@")] = '\0';
+ return (nvl);
+ break;
+ case ZFS_IOC_SPACE_SNAPS:
+ nvl = fnvlist_alloc();
+ fnvlist_add_string(nvl, "firstsnap", zc->zc_value);
+ if (innvl != NULL)
+ nvlist_free(innvl);
+ return (nvl);
+ break;
+ case ZFS_IOC_DESTROY_SNAPS:
+ if (innvl == NULL && cflag == ZFS_CMD_COMPAT_DEADMAN)
+ goto out;
+ nvl = fnvlist_alloc();
+ if (innvl != NULL) {
+ fnvlist_add_nvlist(nvl, "snaps", innvl);
+ } else {
+ /*
+ * We were probably called by an even older binary;
+ * allocate and populate the nvlist with recursive
+ * snapshots.
+ */
+ if (zfs_component_namecheck(zc->zc_value, NULL,
+ NULL) == 0) {
+ tmpnvl = fnvlist_alloc();
+ if (dmu_get_recursive_snaps_nvl(zc->zc_name,
+ zc->zc_value, tmpnvl) == 0)
+ fnvlist_add_nvlist(nvl, "snaps",
+ tmpnvl);
+ nvlist_free(tmpnvl);
+ }
+ }
+ if (innvl != NULL)
+ nvlist_free(innvl);
+ /* strip dataset part from zc->zc_name */
+ zc->zc_name[strcspn(zc->zc_name, "/@")] = '\0';
+ return (nvl);
+ break;
+ case ZFS_IOC_HOLD:
+ nvl = fnvlist_alloc();
+ tmpnvl = fnvlist_alloc();
+ if (zc->zc_cleanup_fd != -1)
+ fnvlist_add_int32(nvl, "cleanup_fd",
+ (int32_t)zc->zc_cleanup_fd);
+ if (zc->zc_cookie) {
+ hnvl = fnvlist_alloc();
+ if (dmu_get_recursive_snaps_nvl(zc->zc_name,
+ zc->zc_value, hnvl) == 0) {
+ elem = NULL;
+ while ((elem = nvlist_next_nvpair(hnvl,
+ elem)) != NULL) {
+ nvlist_add_string(tmpnvl,
+ nvpair_name(elem), zc->zc_string);
+ }
+ }
+ nvlist_free(hnvl);
+ } else {
+ snapname = kmem_asprintf("%s@%s", zc->zc_name,
+ zc->zc_value);
+ nvlist_add_string(tmpnvl, snapname, zc->zc_string);
+ kmem_free(snapname, strlen(snapname) + 1);
+ }
+ fnvlist_add_nvlist(nvl, "holds", tmpnvl);
+ nvlist_free(tmpnvl);
+ if (innvl != NULL)
+ nvlist_free(innvl);
+ /* strip dataset part from zc->zc_name */
+ zc->zc_name[strcspn(zc->zc_name, "/@")] = '\0';
+ return (nvl);
+ break;
+ case ZFS_IOC_RELEASE:
+ nvl = fnvlist_alloc();
+ tmpnvl = fnvlist_alloc();
+ if (zc->zc_cookie) {
+ hnvl = fnvlist_alloc();
+ if (dmu_get_recursive_snaps_nvl(zc->zc_name,
+ zc->zc_value, hnvl) == 0) {
+ elem = NULL;
+ while ((elem = nvlist_next_nvpair(hnvl,
+ elem)) != NULL) {
+ fnvlist_add_boolean(tmpnvl,
+ zc->zc_string);
+ fnvlist_add_nvlist(nvl,
+ nvpair_name(elem), tmpnvl);
+ }
+ }
+ nvlist_free(hnvl);
+ } else {
+ snapname = kmem_asprintf("%s@%s", zc->zc_name,
+ zc->zc_value);
+ fnvlist_add_boolean(tmpnvl, zc->zc_string);
+ fnvlist_add_nvlist(nvl, snapname, tmpnvl);
+ kmem_free(snapname, strlen(snapname) + 1);
+ }
+ nvlist_free(tmpnvl);
+ if (innvl != NULL)
+ nvlist_free(innvl);
+ /* strip dataset part from zc->zc_name */
+ zc->zc_name[strcspn(zc->zc_name, "/@")] = '\0';
+ return (nvl);
+ break;
+ }
+out:
+ return (innvl);
+}
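
For instance, the ZFS_IOC_HOLD branch above converts the old zc_name/zc_value/zc_string triple into the nvlist shape { "holds" : { "pool/fs@snap" : tag }, "cleanup_fd" : fd } that the new-style ioctl consumes. A userland sketch of the same shape with libnvpair (all names illustrative):

	nvlist_t *nvl, *holds;

	nvlist_alloc(&nvl, NV_UNIQUE_NAME, 0);
	nvlist_alloc(&holds, NV_UNIQUE_NAME, 0);
	nvlist_add_string(holds, "tank/fs@snap", "mytag");
	nvlist_add_nvlist(nvl, "holds", holds);	/* holds is copied into nvl */
	nvlist_free(holds);
	/* "cleanup_fd" is added only when the caller supplied one. */
	nvlist_add_int32(nvl, "cleanup_fd", 7);
	/* nvl would now be handed to the new-style ioctl. */
	nvlist_free(nvl);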
+
+nvlist_t *
+zfs_ioctl_compat_outnvl(zfs_cmd_t *zc, nvlist_t *outnvl, const int vec,
+ const int cflag)
+{
+ nvlist_t *tmpnvl;
+
+ if (cflag == ZFS_CMD_COMPAT_NONE || cflag == ZFS_CMD_COMPAT_LZC ||
+ cflag == ZFS_CMD_COMPAT_ZCMD || cflag == ZFS_CMD_COMPAT_EDBP ||
+ cflag == ZFS_CMD_COMPAT_RESUME || cflag == ZFS_CMD_COMPAT_INLANES)
+ return (outnvl);
+
+ switch (vec) {
+ case ZFS_IOC_SPACE_SNAPS:
+ (void) nvlist_lookup_uint64(outnvl, "used", &zc->zc_cookie);
+ (void) nvlist_lookup_uint64(outnvl, "compressed",
+ &zc->zc_objset_type);
+ (void) nvlist_lookup_uint64(outnvl, "uncompressed",
+ &zc->zc_perm_action);
+ nvlist_free(outnvl);
+ /* return empty outnvl */
+ tmpnvl = fnvlist_alloc();
+ return (tmpnvl);
+ break;
+ case ZFS_IOC_CREATE:
+ case ZFS_IOC_CLONE:
+ case ZFS_IOC_HOLD:
+ case ZFS_IOC_RELEASE:
+ nvlist_free(outnvl);
+ /* return empty outnvl */
+ tmpnvl = fnvlist_alloc();
+ return (tmpnvl);
+ break;
+ }
+
+ return (outnvl);
+}
+#endif /* _KERNEL */
diff --git a/sys/cddl/contrib/opensolaris/common/zfs/zfs_ioctl_compat.h b/sys/cddl/contrib/opensolaris/common/zfs/zfs_ioctl_compat.h
new file mode 100644
index 000000000000..61f1514e3ebd
--- /dev/null
+++ b/sys/cddl/contrib/opensolaris/common/zfs/zfs_ioctl_compat.h
@@ -0,0 +1,543 @@
+/*
+ * CDDL HEADER START
+ *
+ * The contents of this file are subject to the terms of the
+ * Common Development and Distribution License (the "License").
+ * You may not use this file except in compliance with the License.
+ *
+ * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
+ * or http://www.opensolaris.org/os/licensing.
+ * See the License for the specific language governing permissions
+ * and limitations under the License.
+ *
+ * When distributing Covered Code, include this CDDL HEADER in each
+ * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
+ * If applicable, add the following below this CDDL HEADER, with the
+ * fields enclosed by brackets "[]" replaced with your own identifying
+ * information: Portions Copyright [yyyy] [name of copyright owner]
+ *
+ * CDDL HEADER END
+ */
+/*
+ * Copyright 2014 Xin Li <delphij@FreeBSD.org>. All rights reserved.
+ * Copyright 2013 Martin Matuska <mm@FreeBSD.org>. All rights reserved.
+ * Use is subject to license terms.
+ */
+
+#ifndef _SYS_ZFS_IOCTL_COMPAT_H
+#define _SYS_ZFS_IOCTL_COMPAT_H
+
+#include <sys/cred.h>
+#include <sys/dmu.h>
+#include <sys/zio.h>
+#include <sys/dsl_deleg.h>
+#include <sys/zfs_ioctl.h>
+
+#ifdef _KERNEL
+#include <sys/nvpair.h>
+#endif /* _KERNEL */
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+/*
+ * Backwards ioctl compatibility
+ */
+
+/* ioctl versions for vfs.zfs.version.ioctl */
+#define ZFS_IOCVER_UNDEF -1
+#define ZFS_IOCVER_NONE 0
+#define ZFS_IOCVER_DEADMAN 1
+#define ZFS_IOCVER_LZC 2
+#define ZFS_IOCVER_ZCMD 3
+#define ZFS_IOCVER_EDBP 4
+#define ZFS_IOCVER_RESUME 5
+#define ZFS_IOCVER_INLANES 6
+#define ZFS_IOCVER_PAD 7
+#define ZFS_IOCVER_CURRENT ZFS_IOCVER_PAD
+
+/* compatibility conversion flag */
+#define ZFS_CMD_COMPAT_NONE 0
+#define ZFS_CMD_COMPAT_V15 1
+#define ZFS_CMD_COMPAT_V28 2
+#define ZFS_CMD_COMPAT_DEADMAN 3
+#define ZFS_CMD_COMPAT_LZC 4
+#define ZFS_CMD_COMPAT_ZCMD 5
+#define ZFS_CMD_COMPAT_EDBP 6
+#define ZFS_CMD_COMPAT_RESUME 7
+#define ZFS_CMD_COMPAT_INLANES 8
+
+#define ZFS_IOC_COMPAT_PASS 254
+#define ZFS_IOC_COMPAT_FAIL 255
+
+#define ZFS_IOCREQ(ioreq) ((ioreq) & 0xff)
+
+typedef struct zfs_iocparm {
+ uint32_t zfs_ioctl_version;
+ uint64_t zfs_cmd;
+ uint64_t zfs_cmd_size;
+} zfs_iocparm_t;
+
+typedef struct zinject_record_v15 {
+ uint64_t zi_objset;
+ uint64_t zi_object;
+ uint64_t zi_start;
+ uint64_t zi_end;
+ uint64_t zi_guid;
+ uint32_t zi_level;
+ uint32_t zi_error;
+ uint64_t zi_type;
+ uint32_t zi_freq;
+ uint32_t zi_failfast;
+} zinject_record_v15_t;
+
+typedef struct zfs_cmd_v15 {
+ char zc_name[MAXPATHLEN];
+ char zc_value[MAXPATHLEN];
+ char zc_string[MAXNAMELEN];
+ uint64_t zc_guid;
+ uint64_t zc_nvlist_conf; /* really (char *) */
+ uint64_t zc_nvlist_conf_size;
+ uint64_t zc_nvlist_src; /* really (char *) */
+ uint64_t zc_nvlist_src_size;
+ uint64_t zc_nvlist_dst; /* really (char *) */
+ uint64_t zc_nvlist_dst_size;
+ uint64_t zc_cookie;
+ uint64_t zc_objset_type;
+ uint64_t zc_perm_action;
+ uint64_t zc_history; /* really (char *) */
+ uint64_t zc_history_len;
+ uint64_t zc_history_offset;
+ uint64_t zc_obj;
+ zfs_share_t zc_share;
+ uint64_t zc_jailid;
+ dmu_objset_stats_t zc_objset_stats;
+ struct drr_begin zc_begin_record;
+ zinject_record_v15_t zc_inject_record;
+} zfs_cmd_v15_t;
+
+typedef struct zinject_record_v28 {
+ uint64_t zi_objset;
+ uint64_t zi_object;
+ uint64_t zi_start;
+ uint64_t zi_end;
+ uint64_t zi_guid;
+ uint32_t zi_level;
+ uint32_t zi_error;
+ uint64_t zi_type;
+ uint32_t zi_freq;
+ uint32_t zi_failfast;
+ char zi_func[MAXNAMELEN];
+ uint32_t zi_iotype;
+ int32_t zi_duration;
+ uint64_t zi_timer;
+} zinject_record_v28_t;
+
+typedef struct zfs_cmd_v28 {
+ char zc_name[MAXPATHLEN];
+ char zc_value[MAXPATHLEN * 2];
+ char zc_string[MAXNAMELEN];
+ char zc_top_ds[MAXPATHLEN];
+ uint64_t zc_guid;
+ uint64_t zc_nvlist_conf; /* really (char *) */
+ uint64_t zc_nvlist_conf_size;
+ uint64_t zc_nvlist_src; /* really (char *) */
+ uint64_t zc_nvlist_src_size;
+ uint64_t zc_nvlist_dst; /* really (char *) */
+ uint64_t zc_nvlist_dst_size;
+ uint64_t zc_cookie;
+ uint64_t zc_objset_type;
+ uint64_t zc_perm_action;
+ uint64_t zc_history; /* really (char *) */
+ uint64_t zc_history_len;
+ uint64_t zc_history_offset;
+ uint64_t zc_obj;
+ uint64_t zc_iflags; /* internal to zfs(7fs) */
+ zfs_share_t zc_share;
+ uint64_t zc_jailid;
+ dmu_objset_stats_t zc_objset_stats;
+ struct drr_begin zc_begin_record;
+ zinject_record_v28_t zc_inject_record;
+ boolean_t zc_defer_destroy;
+ boolean_t zc_temphold;
+ uint64_t zc_action_handle;
+ int zc_cleanup_fd;
+ uint8_t zc_simple;
+ uint8_t zc_pad[3]; /* alignment */
+ uint64_t zc_sendobj;
+ uint64_t zc_fromobj;
+ uint64_t zc_createtxg;
+ zfs_stat_t zc_stat;
+} zfs_cmd_v28_t;
+
+typedef struct zinject_record_deadman {
+ uint64_t zi_objset;
+ uint64_t zi_object;
+ uint64_t zi_start;
+ uint64_t zi_end;
+ uint64_t zi_guid;
+ uint32_t zi_level;
+ uint32_t zi_error;
+ uint64_t zi_type;
+ uint32_t zi_freq;
+ uint32_t zi_failfast;
+ char zi_func[MAXNAMELEN];
+ uint32_t zi_iotype;
+ int32_t zi_duration;
+ uint64_t zi_timer;
+ uint32_t zi_cmd;
+ uint32_t zi_pad;
+} zinject_record_deadman_t;
+
+typedef struct zfs_cmd_deadman {
+ char zc_name[MAXPATHLEN];
+ char zc_value[MAXPATHLEN * 2];
+ char zc_string[MAXNAMELEN];
+ char zc_top_ds[MAXPATHLEN];
+ uint64_t zc_guid;
+ uint64_t zc_nvlist_conf; /* really (char *) */
+ uint64_t zc_nvlist_conf_size;
+ uint64_t zc_nvlist_src; /* really (char *) */
+ uint64_t zc_nvlist_src_size;
+ uint64_t zc_nvlist_dst; /* really (char *) */
+ uint64_t zc_nvlist_dst_size;
+ uint64_t zc_cookie;
+ uint64_t zc_objset_type;
+ uint64_t zc_perm_action;
+ uint64_t zc_history; /* really (char *) */
+ uint64_t zc_history_len;
+ uint64_t zc_history_offset;
+ uint64_t zc_obj;
+ uint64_t zc_iflags; /* internal to zfs(7fs) */
+ zfs_share_t zc_share;
+ uint64_t zc_jailid;
+ dmu_objset_stats_t zc_objset_stats;
+ struct drr_begin zc_begin_record;
+ /* zc_inject_record doesn't change in libzfs_core */
+ zinject_record_deadman_t zc_inject_record;
+ boolean_t zc_defer_destroy;
+ boolean_t zc_temphold;
+ uint64_t zc_action_handle;
+ int zc_cleanup_fd;
+ uint8_t zc_simple;
+ uint8_t zc_pad[3]; /* alignment */
+ uint64_t zc_sendobj;
+ uint64_t zc_fromobj;
+ uint64_t zc_createtxg;
+ zfs_stat_t zc_stat;
+} zfs_cmd_deadman_t;
+
+typedef struct zfs_cmd_zcmd {
+ char zc_name[MAXPATHLEN]; /* name of pool or dataset */
+ uint64_t zc_nvlist_src; /* really (char *) */
+ uint64_t zc_nvlist_src_size;
+ uint64_t zc_nvlist_dst; /* really (char *) */
+ uint64_t zc_nvlist_dst_size;
+ boolean_t zc_nvlist_dst_filled; /* put an nvlist in dst? */
+ int zc_pad2;
+
+ /*
+ * The following members are for legacy ioctls which haven't been
+ * converted to the new method.
+ */
+ uint64_t zc_history; /* really (char *) */
+ char zc_value[MAXPATHLEN * 2];
+ char zc_string[MAXNAMELEN];
+ uint64_t zc_guid;
+ uint64_t zc_nvlist_conf; /* really (char *) */
+ uint64_t zc_nvlist_conf_size;
+ uint64_t zc_cookie;
+ uint64_t zc_objset_type;
+ uint64_t zc_perm_action;
+ uint64_t zc_history_len;
+ uint64_t zc_history_offset;
+ uint64_t zc_obj;
+ uint64_t zc_iflags; /* internal to zfs(7fs) */
+ zfs_share_t zc_share;
+ uint64_t zc_jailid;
+ dmu_objset_stats_t zc_objset_stats;
+ struct drr_begin zc_begin_record;
+ zinject_record_deadman_t zc_inject_record;
+ boolean_t zc_defer_destroy;
+ boolean_t zc_temphold;
+ uint64_t zc_action_handle;
+ int zc_cleanup_fd;
+ uint8_t zc_simple;
+ uint8_t zc_pad[3]; /* alignment */
+ uint64_t zc_sendobj;
+ uint64_t zc_fromobj;
+ uint64_t zc_createtxg;
+ zfs_stat_t zc_stat;
+} zfs_cmd_zcmd_t;
+
+typedef struct zfs_cmd_edbp {
+ char zc_name[MAXPATHLEN]; /* name of pool or dataset */
+ uint64_t zc_nvlist_src; /* really (char *) */
+ uint64_t zc_nvlist_src_size;
+ uint64_t zc_nvlist_dst; /* really (char *) */
+ uint64_t zc_nvlist_dst_size;
+ boolean_t zc_nvlist_dst_filled; /* put an nvlist in dst? */
+ int zc_pad2;
+
+ /*
+ * The following members are for legacy ioctls which haven't been
+ * converted to the new method.
+ */
+ uint64_t zc_history; /* really (char *) */
+ char zc_value[MAXPATHLEN * 2];
+ char zc_string[MAXNAMELEN];
+ uint64_t zc_guid;
+ uint64_t zc_nvlist_conf; /* really (char *) */
+ uint64_t zc_nvlist_conf_size;
+ uint64_t zc_cookie;
+ uint64_t zc_objset_type;
+ uint64_t zc_perm_action;
+ uint64_t zc_history_len;
+ uint64_t zc_history_offset;
+ uint64_t zc_obj;
+ uint64_t zc_iflags; /* internal to zfs(7fs) */
+ zfs_share_t zc_share;
+ uint64_t zc_jailid;
+ dmu_objset_stats_t zc_objset_stats;
+ struct drr_begin zc_begin_record;
+ zinject_record_deadman_t zc_inject_record;
+ uint32_t zc_defer_destroy;
+ uint32_t zc_flags;
+ uint64_t zc_action_handle;
+ int zc_cleanup_fd;
+ uint8_t zc_simple;
+ uint8_t zc_pad[3]; /* alignment */
+ uint64_t zc_sendobj;
+ uint64_t zc_fromobj;
+ uint64_t zc_createtxg;
+ zfs_stat_t zc_stat;
+} zfs_cmd_edbp_t;
+
+typedef struct zfs_cmd_resume {
+ char zc_name[MAXPATHLEN]; /* name of pool or dataset */
+ uint64_t zc_nvlist_src; /* really (char *) */
+ uint64_t zc_nvlist_src_size;
+ uint64_t zc_nvlist_dst; /* really (char *) */
+ uint64_t zc_nvlist_dst_size;
+ boolean_t zc_nvlist_dst_filled; /* put an nvlist in dst? */
+ int zc_pad2;
+
+ /*
+ * The following members are for legacy ioctls which haven't been
+ * converted to the new method.
+ */
+ uint64_t zc_history; /* really (char *) */
+ char zc_value[MAXPATHLEN * 2];
+ char zc_string[MAXNAMELEN];
+ uint64_t zc_guid;
+ uint64_t zc_nvlist_conf; /* really (char *) */
+ uint64_t zc_nvlist_conf_size;
+ uint64_t zc_cookie;
+ uint64_t zc_objset_type;
+ uint64_t zc_perm_action;
+ uint64_t zc_history_len;
+ uint64_t zc_history_offset;
+ uint64_t zc_obj;
+ uint64_t zc_iflags; /* internal to zfs(7fs) */
+ zfs_share_t zc_share;
+ uint64_t zc_jailid;
+ dmu_objset_stats_t zc_objset_stats;
+ dmu_replay_record_t zc_begin_record;
+ zinject_record_deadman_t zc_inject_record;
+ uint32_t zc_defer_destroy;
+ uint32_t zc_flags;
+ uint64_t zc_action_handle;
+ int zc_cleanup_fd;
+ uint8_t zc_simple;
+ boolean_t zc_resumable;
+ uint64_t zc_sendobj;
+ uint64_t zc_fromobj;
+ uint64_t zc_createtxg;
+ zfs_stat_t zc_stat;
+} zfs_cmd_resume_t;
+
+typedef struct zfs_cmd_inlanes {
+ char zc_name[MAXPATHLEN]; /* name of pool or dataset */
+ uint64_t zc_nvlist_src; /* really (char *) */
+ uint64_t zc_nvlist_src_size;
+ uint64_t zc_nvlist_dst; /* really (char *) */
+ uint64_t zc_nvlist_dst_size;
+ boolean_t zc_nvlist_dst_filled; /* put an nvlist in dst? */
+ int zc_pad2;
+
+ /*
+ * The following members are for legacy ioctls which haven't been
+ * converted to the new method.
+ */
+ uint64_t zc_history; /* really (char *) */
+ char zc_value[MAXPATHLEN * 2];
+ char zc_string[MAXNAMELEN];
+ uint64_t zc_guid;
+ uint64_t zc_nvlist_conf; /* really (char *) */
+ uint64_t zc_nvlist_conf_size;
+ uint64_t zc_cookie;
+ uint64_t zc_objset_type;
+ uint64_t zc_perm_action;
+ uint64_t zc_history_len;
+ uint64_t zc_history_offset;
+ uint64_t zc_obj;
+ uint64_t zc_iflags; /* internal to zfs(7fs) */
+ zfs_share_t zc_share;
+ uint64_t zc_jailid;
+ dmu_objset_stats_t zc_objset_stats;
+ dmu_replay_record_t zc_begin_record;
+ zinject_record_t zc_inject_record;
+ uint32_t zc_defer_destroy;
+ uint32_t zc_flags;
+ uint64_t zc_action_handle;
+ int zc_cleanup_fd;
+ uint8_t zc_simple;
+ boolean_t zc_resumable;
+ uint64_t zc_sendobj;
+ uint64_t zc_fromobj;
+ uint64_t zc_createtxg;
+ zfs_stat_t zc_stat;
+} zfs_cmd_inlanes_t;
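
Each of the layouts above has a distinct size, which is what lets the zfs_iocparm_t path cross-check zfs_cmd_size against the advertised zfs_ioctl_version. A sketch of such a check (hypothetical helper, not part of this change; EINVAL comes from errno.h):

static int
zfs_iocparm_check(const zfs_iocparm_t *zp)
{
	switch (zp->zfs_ioctl_version) {
	case ZFS_IOCVER_INLANES:
		return (zp->zfs_cmd_size == sizeof (zfs_cmd_inlanes_t) ?
		    0 : EINVAL);
	case ZFS_IOCVER_RESUME:
		return (zp->zfs_cmd_size == sizeof (zfs_cmd_resume_t) ?
		    0 : EINVAL);
	case ZFS_IOCVER_EDBP:
		return (zp->zfs_cmd_size == sizeof (zfs_cmd_edbp_t) ?
		    0 : EINVAL);
	case ZFS_IOCVER_ZCMD:
		return (zp->zfs_cmd_size == sizeof (zfs_cmd_zcmd_t) ?
		    0 : EINVAL);
	default:
		return (EINVAL);
	}
}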
+
+#ifdef _KERNEL
+static unsigned long zfs_ioctl_v15_to_v28[] = {
+ 0, /* 0 ZFS_IOC_POOL_CREATE */
+ 1, /* 1 ZFS_IOC_POOL_DESTROY */
+ 2, /* 2 ZFS_IOC_POOL_IMPORT */
+ 3, /* 3 ZFS_IOC_POOL_EXPORT */
+ 4, /* 4 ZFS_IOC_POOL_CONFIGS */
+ 5, /* 5 ZFS_IOC_POOL_STATS */
+ 6, /* 6 ZFS_IOC_POOL_TRYIMPORT */
+ 7, /* 7 ZFS_IOC_POOL_SCRUB */
+ 8, /* 8 ZFS_IOC_POOL_FREEZE */
+ 9, /* 9 ZFS_IOC_POOL_UPGRADE */
+ 10, /* 10 ZFS_IOC_POOL_GET_HISTORY */
+ 11, /* 11 ZFS_IOC_VDEV_ADD */
+ 12, /* 12 ZFS_IOC_VDEV_REMOVE */
+ 13, /* 13 ZFS_IOC_VDEV_SET_STATE */
+ 14, /* 14 ZFS_IOC_VDEV_ATTACH */
+ 15, /* 15 ZFS_IOC_VDEV_DETACH */
+ 16, /* 16 ZFS_IOC_VDEV_SETPATH */
+ 18, /* 17 ZFS_IOC_OBJSET_STATS */
+ 19, /* 18 ZFS_IOC_OBJSET_ZPLPROPS */
+ 20, /* 19 ZFS_IOC_DATASET_LIST_NEXT */
+ 21, /* 20 ZFS_IOC_SNAPSHOT_LIST_NEXT */
+ 22, /* 21 ZFS_IOC_SET_PROP */
+ ZFS_IOC_COMPAT_PASS, /* 22 ZFS_IOC_CREATE_MINOR */
+ ZFS_IOC_COMPAT_PASS, /* 23 ZFS_IOC_REMOVE_MINOR */
+ 23, /* 24 ZFS_IOC_CREATE */
+ 24, /* 25 ZFS_IOC_DESTROY */
+ 25, /* 26 ZFS_IOC_ROLLBACK */
+ 26, /* 27 ZFS_IOC_RENAME */
+ 27, /* 28 ZFS_IOC_RECV */
+ 28, /* 29 ZFS_IOC_SEND */
+ 29, /* 30 ZFS_IOC_INJECT_FAULT */
+ 30, /* 31 ZFS_IOC_CLEAR_FAULT */
+ 31, /* 32 ZFS_IOC_INJECT_LIST_NEXT */
+ 32, /* 33 ZFS_IOC_ERROR_LOG */
+ 33, /* 34 ZFS_IOC_CLEAR */
+ 34, /* 35 ZFS_IOC_PROMOTE */
+ 35, /* 36 ZFS_IOC_DESTROY_SNAPS */
+ 36, /* 37 ZFS_IOC_SNAPSHOT */
+ 37, /* 38 ZFS_IOC_DSOBJ_TO_DSNAME */
+ 38, /* 39 ZFS_IOC_OBJ_TO_PATH */
+ 39, /* 40 ZFS_IOC_POOL_SET_PROPS */
+ 40, /* 41 ZFS_IOC_POOL_GET_PROPS */
+ 41, /* 42 ZFS_IOC_SET_FSACL */
+ 42, /* 43 ZFS_IOC_GET_FSACL */
+ ZFS_IOC_COMPAT_PASS, /* 44 ZFS_IOC_ISCSI_PERM_CHECK */
+ 43, /* 45 ZFS_IOC_SHARE */
+	44,	/* 46 ZFS_IOC_INHERIT_PROP */
+ 58, /* 47 ZFS_IOC_JAIL */
+ 59, /* 48 ZFS_IOC_UNJAIL */
+ 45, /* 49 ZFS_IOC_SMB_ACL */
+ 46, /* 50 ZFS_IOC_USERSPACE_ONE */
+ 47, /* 51 ZFS_IOC_USERSPACE_MANY */
+ 48, /* 52 ZFS_IOC_USERSPACE_UPGRADE */
+ 17, /* 53 ZFS_IOC_SETFRU */
+};
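/*
 * Editorial sketch, not part of this header: the table above is consulted
 * by indexing with the old v15 ioctl number; the stored value is the v28
 * number, or ZFS_IOC_COMPAT_PASS for ioctls that need no v28 counterpart.
 * For example:
 *
 *	zfs_ioctl_v15_to_v28[17] == 18			(ZFS_IOC_OBJSET_STATS)
 *	zfs_ioctl_v15_to_v28[22] == ZFS_IOC_COMPAT_PASS	(retired minor ioctl)
 */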
+
+#else /* !_KERNEL */
+static unsigned long zfs_ioctl_v28_to_v15[] = {
+ 0, /* 0 ZFS_IOC_POOL_CREATE */
+ 1, /* 1 ZFS_IOC_POOL_DESTROY */
+ 2, /* 2 ZFS_IOC_POOL_IMPORT */
+ 3, /* 3 ZFS_IOC_POOL_EXPORT */
+ 4, /* 4 ZFS_IOC_POOL_CONFIGS */
+ 5, /* 5 ZFS_IOC_POOL_STATS */
+ 6, /* 6 ZFS_IOC_POOL_TRYIMPORT */
+ 7, /* 7 ZFS_IOC_POOL_SCAN */
+ 8, /* 8 ZFS_IOC_POOL_FREEZE */
+ 9, /* 9 ZFS_IOC_POOL_UPGRADE */
+ 10, /* 10 ZFS_IOC_POOL_GET_HISTORY */
+ 11, /* 11 ZFS_IOC_VDEV_ADD */
+ 12, /* 12 ZFS_IOC_VDEV_REMOVE */
+ 13, /* 13 ZFS_IOC_VDEV_SET_STATE */
+ 14, /* 14 ZFS_IOC_VDEV_ATTACH */
+ 15, /* 15 ZFS_IOC_VDEV_DETACH */
+ 16, /* 16 ZFS_IOC_VDEV_SETPATH */
+ 53, /* 17 ZFS_IOC_VDEV_SETFRU */
+ 17, /* 18 ZFS_IOC_OBJSET_STATS */
+ 18, /* 19 ZFS_IOC_OBJSET_ZPLPROPS */
+ 19, /* 20 ZFS_IOC_DATASET_LIST_NEXT */
+ 20, /* 21 ZFS_IOC_SNAPSHOT_LIST_NEXT */
+ 21, /* 22 ZFS_IOC_SET_PROP */
+ 24, /* 23 ZFS_IOC_CREATE */
+ 25, /* 24 ZFS_IOC_DESTROY */
+ 26, /* 25 ZFS_IOC_ROLLBACK */
+ 27, /* 26 ZFS_IOC_RENAME */
+ 28, /* 27 ZFS_IOC_RECV */
+ 29, /* 28 ZFS_IOC_SEND */
+	30,	/* 29 ZFS_IOC_INJECT_FAULT */
+ 31, /* 30 ZFS_IOC_CLEAR_FAULT */
+ 32, /* 31 ZFS_IOC_INJECT_LIST_NEXT */
+ 33, /* 32 ZFS_IOC_ERROR_LOG */
+ 34, /* 33 ZFS_IOC_CLEAR */
+ 35, /* 34 ZFS_IOC_PROMOTE */
+ 36, /* 35 ZFS_IOC_DESTROY_SNAPS */
+ 37, /* 36 ZFS_IOC_SNAPSHOT */
+ 38, /* 37 ZFS_IOC_DSOBJ_TO_DSNAME */
+ 39, /* 38 ZFS_IOC_OBJ_TO_PATH */
+ 40, /* 39 ZFS_IOC_POOL_SET_PROPS */
+ 41, /* 40 ZFS_IOC_POOL_GET_PROPS */
+ 42, /* 41 ZFS_IOC_SET_FSACL */
+ 43, /* 42 ZFS_IOC_GET_FSACL */
+ 45, /* 43 ZFS_IOC_SHARE */
+	46,	/* 44 ZFS_IOC_INHERIT_PROP */
+ 49, /* 45 ZFS_IOC_SMB_ACL */
+ 50, /* 46 ZFS_IOC_USERSPACE_ONE */
+ 51, /* 47 ZFS_IOC_USERSPACE_MANY */
+ 52, /* 48 ZFS_IOC_USERSPACE_UPGRADE */
+ ZFS_IOC_COMPAT_FAIL, /* 49 ZFS_IOC_HOLD */
+ ZFS_IOC_COMPAT_FAIL, /* 50 ZFS_IOC_RELEASE */
+ ZFS_IOC_COMPAT_FAIL, /* 51 ZFS_IOC_GET_HOLDS */
+ ZFS_IOC_COMPAT_FAIL, /* 52 ZFS_IOC_OBJSET_RECVD_PROPS */
+ ZFS_IOC_COMPAT_FAIL, /* 53 ZFS_IOC_VDEV_SPLIT */
+ ZFS_IOC_COMPAT_FAIL, /* 54 ZFS_IOC_NEXT_OBJ */
+ ZFS_IOC_COMPAT_FAIL, /* 55 ZFS_IOC_DIFF */
+ ZFS_IOC_COMPAT_FAIL, /* 56 ZFS_IOC_TMP_SNAPSHOT */
+ ZFS_IOC_COMPAT_FAIL, /* 57 ZFS_IOC_OBJ_TO_STATS */
+ 47, /* 58 ZFS_IOC_JAIL */
+ 48, /* 59 ZFS_IOC_UNJAIL */
+};
+#endif /* _KERNEL */
+
+#ifdef _KERNEL
+int zfs_ioctl_compat_pre(zfs_cmd_t *, int *, const int);
+void zfs_ioctl_compat_post(zfs_cmd_t *, const int, const int);
+nvlist_t *zfs_ioctl_compat_innvl(zfs_cmd_t *, nvlist_t *, const int,
+ const int);
+nvlist_t *zfs_ioctl_compat_outnvl(zfs_cmd_t *, nvlist_t *, const int,
+ const int);
+#else
+int zcmd_ioctl_compat(int, int, zfs_cmd_t *, const int);
+#endif /* _KERNEL */
+void zfs_cmd_compat_get(zfs_cmd_t *, caddr_t, const int);
+void zfs_cmd_compat_put(zfs_cmd_t *, caddr_t, const int, const int);
+
+#ifdef __cplusplus
+}
+#endif
+
+#endif /* _SYS_ZFS_IOCTL_COMPAT_H */
diff --git a/sys/cddl/contrib/opensolaris/common/zfs/zfs_namecheck.c b/sys/cddl/contrib/opensolaris/common/zfs/zfs_namecheck.c
new file mode 100644
index 000000000000..bad8f20e6917
--- /dev/null
+++ b/sys/cddl/contrib/opensolaris/common/zfs/zfs_namecheck.c
@@ -0,0 +1,399 @@
+/*
+ * CDDL HEADER START
+ *
+ * The contents of this file are subject to the terms of the
+ * Common Development and Distribution License (the "License").
+ * You may not use this file except in compliance with the License.
+ *
+ * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
+ * or http://www.opensolaris.org/os/licensing.
+ * See the License for the specific language governing permissions
+ * and limitations under the License.
+ *
+ * When distributing Covered Code, include this CDDL HEADER in each
+ * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
+ * If applicable, add the following below this CDDL HEADER, with the
+ * fields enclosed by brackets "[]" replaced with your own identifying
+ * information: Portions Copyright [yyyy] [name of copyright owner]
+ *
+ * CDDL HEADER END
+ */
+/*
+ * Copyright 2009 Sun Microsystems, Inc. All rights reserved.
+ * Use is subject to license terms.
+ */
+/*
+ * Copyright (c) 2013, 2016 by Delphix. All rights reserved.
+ */
+
+/*
+ * Common name validation routines for ZFS. These routines are shared by the
+ * userland code as well as the ioctl() layer to ensure that we don't
+ * inadvertently expose a hole through direct ioctl()s that never gets tested.
+ * In userland, however, we want significantly more information about _why_ the
+ * name is invalid. In the kernel, we only care whether it's valid or not.
+ * Each routine therefore takes a 'namecheck_err_t' which describes exactly why
+ * the name failed to validate.
+ */
+
+#if defined(_KERNEL)
+#include <sys/systm.h>
+#else
+#include <string.h>
+#endif
+
+#include <sys/dsl_dir.h>
+#include <sys/param.h>
+#include <sys/nvpair.h>
+#include "zfs_namecheck.h"
+#include "zfs_deleg.h"
+
+/*
+ * Deeply nested datasets can overflow the stack, so we place a limit
+ * on the amount of nesting a path can have. zfs_max_dataset_nesting
+ * can be tuned temporarily to fix existing datasets that exceed our
+ * predefined limit.
+ */
+int zfs_max_dataset_nesting = 50;
+
+static int
+valid_char(char c)
+{
+ return ((c >= 'a' && c <= 'z') ||
+ (c >= 'A' && c <= 'Z') ||
+ (c >= '0' && c <= '9') ||
+ c == '-' || c == '_' || c == '.' || c == ':' || c == ' ');
+}
+
+/*
+ * Looks at a path and returns its level of nesting (depth).
+ */
+int
+get_dataset_depth(const char *path)
+{
+ const char *loc = path;
+ int nesting = 0;
+
+ /*
+ * Keep track of nesting until you hit the end of the
+	 * path or find the snapshot/bookmark separator.
+ */
+ for (int i = 0; loc[i] != '\0' &&
+ loc[i] != '@' &&
+ loc[i] != '#'; i++) {
+ if (loc[i] == '/')
+ nesting++;
+ }
+
+ return (nesting);
+}
+
+/*
+ * Snapshot names must be made up of alphanumeric characters plus the following
+ * characters:
+ *
+ * [-_.: ]
+ *
+ * Returns 0 on success, -1 on error.
+ */
+int
+zfs_component_namecheck(const char *path, namecheck_err_t *why, char *what)
+{
+ const char *loc;
+
+ if (strlen(path) >= ZFS_MAX_DATASET_NAME_LEN) {
+ if (why)
+ *why = NAME_ERR_TOOLONG;
+ return (-1);
+ }
+
+ if (path[0] == '\0') {
+ if (why)
+ *why = NAME_ERR_EMPTY_COMPONENT;
+ return (-1);
+ }
+
+ for (loc = path; *loc; loc++) {
+ if (!valid_char(*loc)) {
+ if (why) {
+ *why = NAME_ERR_INVALCHAR;
+ *what = *loc;
+ }
+ return (-1);
+ }
+ }
+ return (0);
+}
+
+/*
+ * A permission set name must start with the '@' character, and the
+ * remainder is subject to the same character restrictions as snapshot
+ * names, except that the name cannot exceed 64 characters.
+ *
+ * Returns 0 on success, -1 on error.
+ */
+int
+permset_namecheck(const char *path, namecheck_err_t *why, char *what)
+{
+ if (strlen(path) >= ZFS_PERMSET_MAXLEN) {
+ if (why)
+ *why = NAME_ERR_TOOLONG;
+ return (-1);
+ }
+
+ if (path[0] != '@') {
+ if (why) {
+ *why = NAME_ERR_NO_AT;
+ *what = path[0];
+ }
+ return (-1);
+ }
+
+ return (zfs_component_namecheck(&path[1], why, what));
+}
+
+/*
+ * Dataset paths should not be deeper than zfs_max_dataset_nesting
+ * in terms of nesting.
+ *
+ * Returns 0 on success, -1 on error.
+ */
+int
+dataset_nestcheck(const char *path)
+{
+ return ((get_dataset_depth(path) < zfs_max_dataset_nesting) ? 0 : -1);
+}
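/*
 * Editorial sketch (not part of this change): a userland caller could
 * combine the two nesting helpers above as follows.  The dataset name is
 * hypothetical and <stdio.h> is assumed to be in scope.
 */
#if 0
static void
nesting_demo(void)
{
	const char *ds = "tank/a/b/c@snap";	/* hypothetical dataset */

	/* Three '/' occur before the '@', so the depth is 3. */
	int depth = get_dataset_depth(ds);

	/* Fails only once the depth reaches zfs_max_dataset_nesting (50). */
	if (dataset_nestcheck(ds) != 0)
		(void) printf("%s is %d levels deep\n", ds, depth);
}
#endif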
+
+/*
+ * Entity names must be of the following form:
+ *
+ * [component/]*[component][(@|#)component]?
+ *
+ * Where each component is made up of alphanumeric characters plus the following
+ * characters:
+ *
+ * [-_.:%]
+ *
+ * We allow '%' here as we use that character internally to create unique
+ * names for temporary clones (for online recv).
+ *
+ * Returns 0 on success, -1 on error.
+ */
+int
+entity_namecheck(const char *path, namecheck_err_t *why, char *what)
+{
+ const char *end;
+
+ /*
+ * Make sure the name is not too long.
+ */
+ if (strlen(path) >= ZFS_MAX_DATASET_NAME_LEN) {
+ if (why)
+ *why = NAME_ERR_TOOLONG;
+ return (-1);
+ }
+
+ /* Explicitly check for a leading slash. */
+ if (path[0] == '/') {
+ if (why)
+ *why = NAME_ERR_LEADING_SLASH;
+ return (-1);
+ }
+
+ if (path[0] == '\0') {
+ if (why)
+ *why = NAME_ERR_EMPTY_COMPONENT;
+ return (-1);
+ }
+
+ const char *start = path;
+ boolean_t found_delim = B_FALSE;
+ for (;;) {
+ /* Find the end of this component */
+ end = start;
+ while (*end != '/' && *end != '@' && *end != '#' &&
+ *end != '\0')
+ end++;
+
+ if (*end == '\0' && end[-1] == '/') {
+ /* trailing slashes are not allowed */
+ if (why)
+ *why = NAME_ERR_TRAILING_SLASH;
+ return (-1);
+ }
+
+ /* Validate the contents of this component */
+ for (const char *loc = start; loc != end; loc++) {
+ if (!valid_char(*loc) && *loc != '%') {
+ if (why) {
+ *why = NAME_ERR_INVALCHAR;
+ *what = *loc;
+ }
+ return (-1);
+ }
+ }
+
+ /* Snapshot or bookmark delimiter found */
+ if (*end == '@' || *end == '#') {
+ /* Multiple delimiters are not allowed */
+ if (found_delim != 0) {
+ if (why)
+ *why = NAME_ERR_MULTIPLE_DELIMITERS;
+ return (-1);
+ }
+
+ found_delim = B_TRUE;
+ }
+
+ /* Zero-length components are not allowed */
+ if (start == end) {
+ if (why)
+ *why = NAME_ERR_EMPTY_COMPONENT;
+ return (-1);
+ }
+
+ /* If we've reached the end of the string, we're OK */
+ if (*end == '\0')
+ return (0);
+
+ /*
+ * If there is a '/' in a snapshot or bookmark name
+ * then report an error
+ */
+ if (*end == '/' && found_delim != 0) {
+ if (why)
+ *why = NAME_ERR_TRAILING_SLASH;
+ return (-1);
+ }
+
+ /* Update to the next component */
+ start = end + 1;
+ }
+}
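/*
 * Editorial sketch (not part of this change): entity_namecheck() reports
 * _why_ a name was rejected through the optional 'why'/'what' arguments;
 * the names below are hypothetical and <stdio.h> is assumed.
 */
#if 0
static void
entity_namecheck_demo(void)
{
	namecheck_err_t why;
	char what;

	/* Valid: a component chain plus a single '@' delimiter. */
	if (entity_namecheck("tank/fs@snap", NULL, NULL) == 0)
		(void) printf("tank/fs@snap is well formed\n");

	/* Invalid: a second delimiter ('#' after '@') is rejected. */
	if (entity_namecheck("tank/fs@snap#bm", &why, &what) != 0 &&
	    why == NAME_ERR_MULTIPLE_DELIMITERS)
		(void) printf("multiple '@'/'#' delimiters rejected\n");
}
#endif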
+
+/*
+ * Dataset is any entity, except bookmark
+ */
+int
+dataset_namecheck(const char *path, namecheck_err_t *why, char *what)
+{
+ int ret = entity_namecheck(path, why, what);
+
+ if (ret == 0 && strchr(path, '#') != NULL) {
+ if (why != NULL) {
+ *why = NAME_ERR_INVALCHAR;
+ *what = '#';
+ }
+ return (-1);
+ }
+
+ return (ret);
+}
+
+/*
+ * mountpoint names must be of the following form:
+ *
+ * /[component][/]*[component][/]
+ *
+ * Returns 0 on success, -1 on error.
+ */
+int
+mountpoint_namecheck(const char *path, namecheck_err_t *why)
+{
+ const char *start, *end;
+
+ /*
+ * Make sure none of the mountpoint component names are too long.
+ * If a component name is too long then the mkdir of the mountpoint
+ * will fail but then the mountpoint property will be set to a value
+ * that can never be mounted. Better to fail before setting the prop.
+ * Extra slashes are OK, they will be tossed by the mountpoint mkdir.
+ */
+
+ if (path == NULL || *path != '/') {
+ if (why)
+ *why = NAME_ERR_LEADING_SLASH;
+ return (-1);
+ }
+
+ /* Skip leading slash */
+ start = &path[1];
+ do {
+ end = start;
+ while (*end != '/' && *end != '\0')
+ end++;
+
+ if (end - start >= ZFS_MAX_DATASET_NAME_LEN) {
+ if (why)
+ *why = NAME_ERR_TOOLONG;
+ return (-1);
+ }
+ start = end + 1;
+
+ } while (*end != '\0');
+
+ return (0);
+}
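/*
 * Editorial sketch (not part of this change), using hypothetical paths:
 *
 *	mountpoint_namecheck("/mnt/data", NULL)	-> 0   (valid)
 *	mountpoint_namecheck("mnt/data", &why)	-> -1, NAME_ERR_LEADING_SLASH
 */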
+
+/*
+ * For pool names, we have the same set of valid characters as described in
+ * dataset names, with the additional restriction that the pool name must begin
+ * with a letter. The pool names 'raidz' and 'mirror' are also reserved names
+ * that cannot be used.
+ *
+ * Returns 0 on success, -1 on error.
+ */
+int
+pool_namecheck(const char *pool, namecheck_err_t *why, char *what)
+{
+ const char *c;
+
+ /*
+ * Make sure the name is not too long.
+ * If we're creating a pool with version >= SPA_VERSION_DSL_SCRUB (v11)
+	 * we need to account for additional space needed by the origin
+	 * dataset, which will also be snapshotted:
+	 * "poolname"+"/"+"$ORIGIN"+"@"+"$ORIGIN".
+ * Play it safe and enforce this limit even if the pool version is < 11
+ * so it can be upgraded without issues.
+ */
+ if (strlen(pool) >= (ZFS_MAX_DATASET_NAME_LEN - 2 -
+ strlen(ORIGIN_DIR_NAME) * 2)) {
+ if (why)
+ *why = NAME_ERR_TOOLONG;
+ return (-1);
+ }
+
+ c = pool;
+ while (*c != '\0') {
+ if (!valid_char(*c)) {
+ if (why) {
+ *why = NAME_ERR_INVALCHAR;
+ *what = *c;
+ }
+ return (-1);
+ }
+ c++;
+ }
+
+ if (!(*pool >= 'a' && *pool <= 'z') &&
+ !(*pool >= 'A' && *pool <= 'Z')) {
+ if (why)
+ *why = NAME_ERR_NOLETTER;
+ return (-1);
+ }
+
+ if (strcmp(pool, "mirror") == 0 || strcmp(pool, "raidz") == 0) {
+ if (why)
+ *why = NAME_ERR_RESERVED;
+ return (-1);
+ }
+
+ if (pool[0] == 'c' && (pool[1] >= '0' && pool[1] <= '9')) {
+ if (why)
+ *why = NAME_ERR_DISKLIKE;
+ return (-1);
+ }
+
+ return (0);
+}
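/*
 * Editorial sketch (not part of this change): the pool-specific rules
 * above in action, with hypothetical pool names.
 */
#if 0
	namecheck_err_t why;
	char what;

	(void) pool_namecheck("tank", NULL, NULL);	/* 0: valid */
	(void) pool_namecheck("1pool", &why, &what);	/* -1, NAME_ERR_NOLETTER */
	(void) pool_namecheck("mirror", &why, &what);	/* -1, NAME_ERR_RESERVED */
	(void) pool_namecheck("c0t0d0", &why, &what);	/* -1, NAME_ERR_DISKLIKE */
#endif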
diff --git a/sys/cddl/contrib/opensolaris/common/zfs/zfs_namecheck.h b/sys/cddl/contrib/opensolaris/common/zfs/zfs_namecheck.h
new file mode 100644
index 000000000000..527db92b0cfa
--- /dev/null
+++ b/sys/cddl/contrib/opensolaris/common/zfs/zfs_namecheck.h
@@ -0,0 +1,66 @@
+/*
+ * CDDL HEADER START
+ *
+ * The contents of this file are subject to the terms of the
+ * Common Development and Distribution License (the "License").
+ * You may not use this file except in compliance with the License.
+ *
+ * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
+ * or http://www.opensolaris.org/os/licensing.
+ * See the License for the specific language governing permissions
+ * and limitations under the License.
+ *
+ * When distributing Covered Code, include this CDDL HEADER in each
+ * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
+ * If applicable, add the following below this CDDL HEADER, with the
+ * fields enclosed by brackets "[]" replaced with your own identifying
+ * information: Portions Copyright [yyyy] [name of copyright owner]
+ *
+ * CDDL HEADER END
+ */
+/*
+ * Copyright 2009 Sun Microsystems, Inc. All rights reserved.
+ * Use is subject to license terms.
+ */
+/*
+ * Copyright (c) 2013, 2016 by Delphix. All rights reserved.
+ */
+
+#ifndef _ZFS_NAMECHECK_H
+#define _ZFS_NAMECHECK_H
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+typedef enum {
+ NAME_ERR_LEADING_SLASH, /* name begins with leading slash */
+ NAME_ERR_EMPTY_COMPONENT, /* name contains an empty component */
+ NAME_ERR_TRAILING_SLASH, /* name ends with a slash */
+ NAME_ERR_INVALCHAR, /* invalid character found */
+ NAME_ERR_MULTIPLE_DELIMITERS, /* multiple '@'/'#' delimiters found */
+ NAME_ERR_NOLETTER, /* pool doesn't begin with a letter */
+ NAME_ERR_RESERVED, /* entire name is reserved */
+ NAME_ERR_DISKLIKE, /* reserved disk name (c[0-9].*) */
+ NAME_ERR_TOOLONG, /* name is too long */
+ NAME_ERR_NO_AT, /* permission set is missing '@' */
+} namecheck_err_t;
+
+#define ZFS_PERMSET_MAXLEN 64
+
+extern int zfs_max_dataset_nesting;
+
+int get_dataset_depth(const char *);
+int pool_namecheck(const char *, namecheck_err_t *, char *);
+int entity_namecheck(const char *, namecheck_err_t *, char *);
+int dataset_namecheck(const char *, namecheck_err_t *, char *);
+int dataset_nestcheck(const char *);
+int mountpoint_namecheck(const char *, namecheck_err_t *);
+int zfs_component_namecheck(const char *, namecheck_err_t *, char *);
+int permset_namecheck(const char *, namecheck_err_t *, char *);
+
+#ifdef __cplusplus
+}
+#endif
+
+#endif /* _ZFS_NAMECHECK_H */
diff --git a/sys/cddl/contrib/opensolaris/common/zfs/zfs_prop.c b/sys/cddl/contrib/opensolaris/common/zfs/zfs_prop.c
new file mode 100644
index 000000000000..ac8da491a9ec
--- /dev/null
+++ b/sys/cddl/contrib/opensolaris/common/zfs/zfs_prop.c
@@ -0,0 +1,718 @@
+/*
+ * CDDL HEADER START
+ *
+ * The contents of this file are subject to the terms of the
+ * Common Development and Distribution License (the "License").
+ * You may not use this file except in compliance with the License.
+ *
+ * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
+ * or http://www.opensolaris.org/os/licensing.
+ * See the License for the specific language governing permissions
+ * and limitations under the License.
+ *
+ * When distributing Covered Code, include this CDDL HEADER in each
+ * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
+ * If applicable, add the following below this CDDL HEADER, with the
+ * fields enclosed by brackets "[]" replaced with your own identifying
+ * information: Portions Copyright [yyyy] [name of copyright owner]
+ *
+ * CDDL HEADER END
+ */
+/*
+ * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved.
+ * Copyright (c) 2011, 2016 by Delphix. All rights reserved.
+ * Copyright (c) 2013 by Saso Kiselkov. All rights reserved.
+ * Copyright (c) 2013, Joyent, Inc. All rights reserved.
+ * Copyright (c) 2014 Integros [integros.com]
+ */
+
+/* Portions Copyright 2010 Robert Milkowski */
+
+#include <sys/zio.h>
+#include <sys/spa.h>
+#include <sys/u8_textprep.h>
+#include <sys/zfs_acl.h>
+#include <sys/zfs_ioctl.h>
+#include <sys/zfs_znode.h>
+
+#include "zfs_prop.h"
+#include "zfs_deleg.h"
+
+#if defined(_KERNEL)
+#include <sys/systm.h>
+#else
+#include <stdlib.h>
+#include <string.h>
+#include <ctype.h>
+#endif
+
+static zprop_desc_t zfs_prop_table[ZFS_NUM_PROPS];
+
+/* Note: this array is indexed by zfs_userquota_prop_t; keep the order the same. */
+const char *zfs_userquota_prop_prefixes[] = {
+ "userused@",
+ "userquota@",
+ "groupused@",
+ "groupquota@"
+};
+
+zprop_desc_t *
+zfs_prop_get_table(void)
+{
+ return (zfs_prop_table);
+}
+
+void
+zfs_prop_init(void)
+{
+ static zprop_index_t checksum_table[] = {
+ { "on", ZIO_CHECKSUM_ON },
+ { "off", ZIO_CHECKSUM_OFF },
+ { "fletcher2", ZIO_CHECKSUM_FLETCHER_2 },
+ { "fletcher4", ZIO_CHECKSUM_FLETCHER_4 },
+ { "sha256", ZIO_CHECKSUM_SHA256 },
+ { "noparity", ZIO_CHECKSUM_NOPARITY },
+ { "sha512", ZIO_CHECKSUM_SHA512 },
+ { "skein", ZIO_CHECKSUM_SKEIN },
+#ifdef illumos
+ { "edonr", ZIO_CHECKSUM_EDONR },
+#endif
+ { NULL }
+ };
+
+ static zprop_index_t dedup_table[] = {
+ { "on", ZIO_CHECKSUM_ON },
+ { "off", ZIO_CHECKSUM_OFF },
+ { "verify", ZIO_CHECKSUM_ON | ZIO_CHECKSUM_VERIFY },
+ { "sha256", ZIO_CHECKSUM_SHA256 },
+ { "sha256,verify",
+ ZIO_CHECKSUM_SHA256 | ZIO_CHECKSUM_VERIFY },
+ { "sha512", ZIO_CHECKSUM_SHA512 },
+ { "sha512,verify",
+ ZIO_CHECKSUM_SHA512 | ZIO_CHECKSUM_VERIFY },
+ { "skein", ZIO_CHECKSUM_SKEIN },
+ { "skein,verify",
+ ZIO_CHECKSUM_SKEIN | ZIO_CHECKSUM_VERIFY },
+#ifdef illumos
+ { "edonr,verify",
+ ZIO_CHECKSUM_EDONR | ZIO_CHECKSUM_VERIFY },
+#endif
+ { NULL }
+ };
+
+ static zprop_index_t compress_table[] = {
+ { "on", ZIO_COMPRESS_ON },
+ { "off", ZIO_COMPRESS_OFF },
+ { "lzjb", ZIO_COMPRESS_LZJB },
+ { "gzip", ZIO_COMPRESS_GZIP_6 }, /* gzip default */
+ { "gzip-1", ZIO_COMPRESS_GZIP_1 },
+ { "gzip-2", ZIO_COMPRESS_GZIP_2 },
+ { "gzip-3", ZIO_COMPRESS_GZIP_3 },
+ { "gzip-4", ZIO_COMPRESS_GZIP_4 },
+ { "gzip-5", ZIO_COMPRESS_GZIP_5 },
+ { "gzip-6", ZIO_COMPRESS_GZIP_6 },
+ { "gzip-7", ZIO_COMPRESS_GZIP_7 },
+ { "gzip-8", ZIO_COMPRESS_GZIP_8 },
+ { "gzip-9", ZIO_COMPRESS_GZIP_9 },
+ { "zle", ZIO_COMPRESS_ZLE },
+ { "lz4", ZIO_COMPRESS_LZ4 },
+ { NULL }
+ };
+
+ static zprop_index_t snapdir_table[] = {
+ { "hidden", ZFS_SNAPDIR_HIDDEN },
+ { "visible", ZFS_SNAPDIR_VISIBLE },
+ { NULL }
+ };
+
+ static zprop_index_t acl_mode_table[] = {
+ { "discard", ZFS_ACL_DISCARD },
+ { "groupmask", ZFS_ACL_GROUPMASK },
+ { "passthrough", ZFS_ACL_PASSTHROUGH },
+ { "restricted", ZFS_ACL_RESTRICTED },
+ { NULL }
+ };
+
+ static zprop_index_t acl_inherit_table[] = {
+ { "discard", ZFS_ACL_DISCARD },
+ { "noallow", ZFS_ACL_NOALLOW },
+ { "restricted", ZFS_ACL_RESTRICTED },
+ { "passthrough", ZFS_ACL_PASSTHROUGH },
+		{ "secure",	ZFS_ACL_RESTRICTED }, /* backward compatibility */
+ { "passthrough-x", ZFS_ACL_PASSTHROUGH_X },
+ { NULL }
+ };
+
+ static zprop_index_t case_table[] = {
+ { "sensitive", ZFS_CASE_SENSITIVE },
+ { "insensitive", ZFS_CASE_INSENSITIVE },
+ { "mixed", ZFS_CASE_MIXED },
+ { NULL }
+ };
+
+ static zprop_index_t copies_table[] = {
+ { "1", 1 },
+ { "2", 2 },
+ { "3", 3 },
+ { NULL }
+ };
+
+ /*
+ * Use the unique flags we have to send to u8_strcmp() and/or
+ * u8_textprep() to represent the various normalization property
+ * values.
+ */
+ static zprop_index_t normalize_table[] = {
+ { "none", 0 },
+ { "formD", U8_TEXTPREP_NFD },
+ { "formKC", U8_TEXTPREP_NFKC },
+ { "formC", U8_TEXTPREP_NFC },
+ { "formKD", U8_TEXTPREP_NFKD },
+ { NULL }
+ };
+
+ static zprop_index_t version_table[] = {
+ { "1", 1 },
+ { "2", 2 },
+ { "3", 3 },
+ { "4", 4 },
+ { "5", 5 },
+ { "current", ZPL_VERSION },
+ { NULL }
+ };
+
+ static zprop_index_t boolean_table[] = {
+ { "off", 0 },
+ { "on", 1 },
+ { NULL }
+ };
+
+ static zprop_index_t logbias_table[] = {
+ { "latency", ZFS_LOGBIAS_LATENCY },
+ { "throughput", ZFS_LOGBIAS_THROUGHPUT },
+ { NULL }
+ };
+
+ static zprop_index_t canmount_table[] = {
+ { "off", ZFS_CANMOUNT_OFF },
+ { "on", ZFS_CANMOUNT_ON },
+ { "noauto", ZFS_CANMOUNT_NOAUTO },
+ { NULL }
+ };
+
+ static zprop_index_t cache_table[] = {
+ { "none", ZFS_CACHE_NONE },
+ { "metadata", ZFS_CACHE_METADATA },
+ { "all", ZFS_CACHE_ALL },
+ { NULL }
+ };
+
+ static zprop_index_t sync_table[] = {
+ { "standard", ZFS_SYNC_STANDARD },
+ { "always", ZFS_SYNC_ALWAYS },
+ { "disabled", ZFS_SYNC_DISABLED },
+ { NULL }
+ };
+
+ static zprop_index_t volmode_table[] = {
+ { "default", ZFS_VOLMODE_DEFAULT },
+ { "geom", ZFS_VOLMODE_GEOM },
+ { "dev", ZFS_VOLMODE_DEV },
+ { "none", ZFS_VOLMODE_NONE },
+ { NULL }
+ };
+
+ static zprop_index_t dnsize_table[] = {
+ { "legacy", ZFS_DNSIZE_LEGACY },
+ { "auto", ZFS_DNSIZE_AUTO },
+ { "1k", ZFS_DNSIZE_1K },
+ { "2k", ZFS_DNSIZE_2K },
+ { "4k", ZFS_DNSIZE_4K },
+ { "8k", ZFS_DNSIZE_8K },
+ { "16k", ZFS_DNSIZE_16K },
+ { NULL }
+ };
+
+ static zprop_index_t redundant_metadata_table[] = {
+ { "all", ZFS_REDUNDANT_METADATA_ALL },
+ { "most", ZFS_REDUNDANT_METADATA_MOST },
+ { NULL }
+ };
+
+ /* inherit index properties */
+ zprop_register_index(ZFS_PROP_REDUNDANT_METADATA, "redundant_metadata",
+ ZFS_REDUNDANT_METADATA_ALL,
+ PROP_INHERIT, ZFS_TYPE_FILESYSTEM | ZFS_TYPE_VOLUME,
+ "all | most", "REDUND_MD",
+ redundant_metadata_table);
+ zprop_register_index(ZFS_PROP_SYNC, "sync", ZFS_SYNC_STANDARD,
+ PROP_INHERIT, ZFS_TYPE_FILESYSTEM | ZFS_TYPE_VOLUME,
+ "standard | always | disabled", "SYNC",
+ sync_table);
+ zprop_register_index(ZFS_PROP_CHECKSUM, "checksum",
+ ZIO_CHECKSUM_DEFAULT, PROP_INHERIT, ZFS_TYPE_FILESYSTEM |
+ ZFS_TYPE_VOLUME,
+ "on | off | fletcher2 | fletcher4 | sha256 | sha512 | "
+ "skein", "CHECKSUM", checksum_table);
+ zprop_register_index(ZFS_PROP_DEDUP, "dedup", ZIO_CHECKSUM_OFF,
+ PROP_INHERIT, ZFS_TYPE_FILESYSTEM | ZFS_TYPE_VOLUME,
+ "on | off | verify | sha256[,verify], sha512[,verify], "
+ "skein[,verify]", "DEDUP", dedup_table);
+ zprop_register_index(ZFS_PROP_COMPRESSION, "compression",
+ ZIO_COMPRESS_DEFAULT, PROP_INHERIT,
+ ZFS_TYPE_FILESYSTEM | ZFS_TYPE_VOLUME,
+ "on | off | lzjb | gzip | gzip-[1-9] | zle | lz4",
+ "COMPRESS", compress_table);
+ zprop_register_index(ZFS_PROP_SNAPDIR, "snapdir", ZFS_SNAPDIR_HIDDEN,
+ PROP_INHERIT, ZFS_TYPE_FILESYSTEM,
+ "hidden | visible", "SNAPDIR", snapdir_table);
+ zprop_register_index(ZFS_PROP_ACLMODE, "aclmode", ZFS_ACL_DISCARD,
+ PROP_INHERIT, ZFS_TYPE_FILESYSTEM,
+ "discard | groupmask | passthrough | restricted", "ACLMODE",
+ acl_mode_table);
+ zprop_register_index(ZFS_PROP_ACLINHERIT, "aclinherit",
+ ZFS_ACL_RESTRICTED, PROP_INHERIT, ZFS_TYPE_FILESYSTEM,
+ "discard | noallow | restricted | passthrough | passthrough-x",
+ "ACLINHERIT", acl_inherit_table);
+ zprop_register_index(ZFS_PROP_COPIES, "copies", 1, PROP_INHERIT,
+ ZFS_TYPE_FILESYSTEM | ZFS_TYPE_VOLUME,
+ "1 | 2 | 3", "COPIES", copies_table);
+ zprop_register_index(ZFS_PROP_PRIMARYCACHE, "primarycache",
+ ZFS_CACHE_ALL, PROP_INHERIT,
+ ZFS_TYPE_FILESYSTEM | ZFS_TYPE_SNAPSHOT | ZFS_TYPE_VOLUME,
+ "all | none | metadata", "PRIMARYCACHE", cache_table);
+ zprop_register_index(ZFS_PROP_SECONDARYCACHE, "secondarycache",
+ ZFS_CACHE_ALL, PROP_INHERIT,
+ ZFS_TYPE_FILESYSTEM | ZFS_TYPE_SNAPSHOT | ZFS_TYPE_VOLUME,
+ "all | none | metadata", "SECONDARYCACHE", cache_table);
+ zprop_register_index(ZFS_PROP_LOGBIAS, "logbias", ZFS_LOGBIAS_LATENCY,
+ PROP_INHERIT, ZFS_TYPE_FILESYSTEM | ZFS_TYPE_VOLUME,
+ "latency | throughput", "LOGBIAS", logbias_table);
+ zprop_register_index(ZFS_PROP_VOLMODE, "volmode",
+ ZFS_VOLMODE_DEFAULT, PROP_INHERIT,
+ ZFS_TYPE_FILESYSTEM | ZFS_TYPE_SNAPSHOT | ZFS_TYPE_VOLUME,
+ "default | geom | dev | none", "VOLMODE", volmode_table);
+
+ zprop_register_index(ZFS_PROP_DNODESIZE, "dnodesize",
+ ZFS_DNSIZE_LEGACY, PROP_INHERIT, ZFS_TYPE_FILESYSTEM,
+ "legacy | auto | 1k | 2k | 4k | 8k | 16k", "DNSIZE", dnsize_table);
+
+ /* inherit index (boolean) properties */
+ zprop_register_index(ZFS_PROP_ATIME, "atime", 1, PROP_INHERIT,
+ ZFS_TYPE_FILESYSTEM, "on | off", "ATIME", boolean_table);
+ zprop_register_index(ZFS_PROP_DEVICES, "devices", 1, PROP_INHERIT,
+ ZFS_TYPE_FILESYSTEM | ZFS_TYPE_SNAPSHOT, "on | off", "DEVICES",
+ boolean_table);
+ zprop_register_index(ZFS_PROP_EXEC, "exec", 1, PROP_INHERIT,
+ ZFS_TYPE_FILESYSTEM | ZFS_TYPE_SNAPSHOT, "on | off", "EXEC",
+ boolean_table);
+ zprop_register_index(ZFS_PROP_SETUID, "setuid", 1, PROP_INHERIT,
+ ZFS_TYPE_FILESYSTEM | ZFS_TYPE_SNAPSHOT, "on | off", "SETUID",
+ boolean_table);
+ zprop_register_index(ZFS_PROP_READONLY, "readonly", 0, PROP_INHERIT,
+ ZFS_TYPE_FILESYSTEM | ZFS_TYPE_VOLUME, "on | off", "RDONLY",
+ boolean_table);
+ zprop_register_index(ZFS_PROP_ZONED, "jailed", 0, PROP_INHERIT,
+ ZFS_TYPE_FILESYSTEM, "on | off", "JAILED", boolean_table);
+ zprop_register_index(ZFS_PROP_XATTR, "xattr", 1, PROP_INHERIT,
+ ZFS_TYPE_FILESYSTEM | ZFS_TYPE_SNAPSHOT, "on | off", "XATTR",
+ boolean_table);
+ zprop_register_index(ZFS_PROP_VSCAN, "vscan", 0, PROP_INHERIT,
+ ZFS_TYPE_FILESYSTEM, "on | off", "VSCAN",
+ boolean_table);
+ zprop_register_index(ZFS_PROP_NBMAND, "nbmand", 0, PROP_INHERIT,
+ ZFS_TYPE_FILESYSTEM | ZFS_TYPE_SNAPSHOT, "on | off", "NBMAND",
+ boolean_table);
+
+ /* default index properties */
+ zprop_register_index(ZFS_PROP_VERSION, "version", 0, PROP_DEFAULT,
+ ZFS_TYPE_FILESYSTEM | ZFS_TYPE_SNAPSHOT,
+ "1 | 2 | 3 | 4 | 5 | current", "VERSION", version_table);
+ zprop_register_index(ZFS_PROP_CANMOUNT, "canmount", ZFS_CANMOUNT_ON,
+ PROP_DEFAULT, ZFS_TYPE_FILESYSTEM, "on | off | noauto",
+ "CANMOUNT", canmount_table);
+
+ /* readonly index (boolean) properties */
+ zprop_register_index(ZFS_PROP_MOUNTED, "mounted", 0, PROP_READONLY,
+ ZFS_TYPE_FILESYSTEM, "yes | no", "MOUNTED", boolean_table);
+ zprop_register_index(ZFS_PROP_DEFER_DESTROY, "defer_destroy", 0,
+ PROP_READONLY, ZFS_TYPE_SNAPSHOT, "yes | no", "DEFER_DESTROY",
+ boolean_table);
+
+ /* set once index properties */
+ zprop_register_index(ZFS_PROP_NORMALIZE, "normalization", 0,
+ PROP_ONETIME, ZFS_TYPE_FILESYSTEM | ZFS_TYPE_SNAPSHOT,
+ "none | formC | formD | formKC | formKD", "NORMALIZATION",
+ normalize_table);
+ zprop_register_index(ZFS_PROP_CASE, "casesensitivity",
+ ZFS_CASE_SENSITIVE, PROP_ONETIME, ZFS_TYPE_FILESYSTEM |
+ ZFS_TYPE_SNAPSHOT,
+ "sensitive | insensitive | mixed", "CASE", case_table);
+
+ /* set once index (boolean) properties */
+ zprop_register_index(ZFS_PROP_UTF8ONLY, "utf8only", 0, PROP_ONETIME,
+ ZFS_TYPE_FILESYSTEM | ZFS_TYPE_SNAPSHOT,
+ "on | off", "UTF8ONLY", boolean_table);
+
+ /* string properties */
+ zprop_register_string(ZFS_PROP_ORIGIN, "origin", NULL, PROP_READONLY,
+ ZFS_TYPE_FILESYSTEM | ZFS_TYPE_VOLUME, "<snapshot>", "ORIGIN");
+ zprop_register_string(ZFS_PROP_CLONES, "clones", NULL, PROP_READONLY,
+ ZFS_TYPE_SNAPSHOT, "<dataset>[,...]", "CLONES");
+ zprop_register_string(ZFS_PROP_MOUNTPOINT, "mountpoint", "/",
+ PROP_INHERIT, ZFS_TYPE_FILESYSTEM, "<path> | legacy | none",
+ "MOUNTPOINT");
+ zprop_register_string(ZFS_PROP_SHARENFS, "sharenfs", "off",
+ PROP_INHERIT, ZFS_TYPE_FILESYSTEM, "on | off | share(1M) options",
+ "SHARENFS");
+ zprop_register_string(ZFS_PROP_TYPE, "type", NULL, PROP_READONLY,
+ ZFS_TYPE_DATASET | ZFS_TYPE_BOOKMARK,
+ "filesystem | volume | snapshot | bookmark", "TYPE");
+ zprop_register_string(ZFS_PROP_SHARESMB, "sharesmb", "off",
+ PROP_INHERIT, ZFS_TYPE_FILESYSTEM,
+ "on | off | sharemgr(1M) options", "SHARESMB");
+ zprop_register_string(ZFS_PROP_MLSLABEL, "mlslabel",
+ ZFS_MLSLABEL_DEFAULT, PROP_INHERIT, ZFS_TYPE_DATASET,
+ "<sensitivity label>", "MLSLABEL");
+ zprop_register_string(ZFS_PROP_RECEIVE_RESUME_TOKEN,
+ "receive_resume_token",
+ NULL, PROP_READONLY, ZFS_TYPE_FILESYSTEM | ZFS_TYPE_VOLUME,
+ "<string token>", "RESUMETOK");
+
+ /* readonly number properties */
+ zprop_register_number(ZFS_PROP_USED, "used", 0, PROP_READONLY,
+ ZFS_TYPE_DATASET, "<size>", "USED");
+ zprop_register_number(ZFS_PROP_AVAILABLE, "available", 0, PROP_READONLY,
+ ZFS_TYPE_FILESYSTEM | ZFS_TYPE_VOLUME, "<size>", "AVAIL");
+ zprop_register_number(ZFS_PROP_REFERENCED, "referenced", 0,
+ PROP_READONLY, ZFS_TYPE_DATASET, "<size>", "REFER");
+ zprop_register_number(ZFS_PROP_COMPRESSRATIO, "compressratio", 0,
+ PROP_READONLY, ZFS_TYPE_DATASET,
+ "<1.00x or higher if compressed>", "RATIO");
+ zprop_register_number(ZFS_PROP_REFRATIO, "refcompressratio", 0,
+ PROP_READONLY, ZFS_TYPE_DATASET,
+ "<1.00x or higher if compressed>", "REFRATIO");
+ zprop_register_number(ZFS_PROP_VOLBLOCKSIZE, "volblocksize",
+ ZVOL_DEFAULT_BLOCKSIZE, PROP_ONETIME,
+ ZFS_TYPE_VOLUME, "512 to 128k, power of 2", "VOLBLOCK");
+ zprop_register_number(ZFS_PROP_USEDSNAP, "usedbysnapshots", 0,
+ PROP_READONLY, ZFS_TYPE_FILESYSTEM | ZFS_TYPE_VOLUME, "<size>",
+ "USEDSNAP");
+ zprop_register_number(ZFS_PROP_USEDDS, "usedbydataset", 0,
+ PROP_READONLY, ZFS_TYPE_FILESYSTEM | ZFS_TYPE_VOLUME, "<size>",
+ "USEDDS");
+ zprop_register_number(ZFS_PROP_USEDCHILD, "usedbychildren", 0,
+ PROP_READONLY, ZFS_TYPE_FILESYSTEM | ZFS_TYPE_VOLUME, "<size>",
+ "USEDCHILD");
+ zprop_register_number(ZFS_PROP_USEDREFRESERV, "usedbyrefreservation", 0,
+ PROP_READONLY,
+ ZFS_TYPE_FILESYSTEM | ZFS_TYPE_VOLUME, "<size>", "USEDREFRESERV");
+ zprop_register_number(ZFS_PROP_USERREFS, "userrefs", 0, PROP_READONLY,
+ ZFS_TYPE_SNAPSHOT, "<count>", "USERREFS");
+ zprop_register_number(ZFS_PROP_WRITTEN, "written", 0, PROP_READONLY,
+ ZFS_TYPE_DATASET, "<size>", "WRITTEN");
+ zprop_register_number(ZFS_PROP_LOGICALUSED, "logicalused", 0,
+ PROP_READONLY, ZFS_TYPE_FILESYSTEM | ZFS_TYPE_VOLUME, "<size>",
+ "LUSED");
+ zprop_register_number(ZFS_PROP_LOGICALREFERENCED, "logicalreferenced",
+ 0, PROP_READONLY, ZFS_TYPE_DATASET, "<size>", "LREFER");
+
+ /* default number properties */
+ zprop_register_number(ZFS_PROP_QUOTA, "quota", 0, PROP_DEFAULT,
+ ZFS_TYPE_FILESYSTEM, "<size> | none", "QUOTA");
+ zprop_register_number(ZFS_PROP_RESERVATION, "reservation", 0,
+ PROP_DEFAULT, ZFS_TYPE_FILESYSTEM | ZFS_TYPE_VOLUME,
+ "<size> | none", "RESERV");
+ zprop_register_number(ZFS_PROP_VOLSIZE, "volsize", 0, PROP_DEFAULT,
+ ZFS_TYPE_VOLUME, "<size>", "VOLSIZE");
+ zprop_register_number(ZFS_PROP_REFQUOTA, "refquota", 0, PROP_DEFAULT,
+ ZFS_TYPE_FILESYSTEM, "<size> | none", "REFQUOTA");
+ zprop_register_number(ZFS_PROP_REFRESERVATION, "refreservation", 0,
+ PROP_DEFAULT, ZFS_TYPE_FILESYSTEM | ZFS_TYPE_VOLUME,
+ "<size> | none", "REFRESERV");
+ zprop_register_number(ZFS_PROP_FILESYSTEM_LIMIT, "filesystem_limit",
+ UINT64_MAX, PROP_DEFAULT, ZFS_TYPE_FILESYSTEM,
+ "<count> | none", "FSLIMIT");
+ zprop_register_number(ZFS_PROP_SNAPSHOT_LIMIT, "snapshot_limit",
+ UINT64_MAX, PROP_DEFAULT, ZFS_TYPE_FILESYSTEM | ZFS_TYPE_VOLUME,
+ "<count> | none", "SSLIMIT");
+ zprop_register_number(ZFS_PROP_FILESYSTEM_COUNT, "filesystem_count",
+ UINT64_MAX, PROP_DEFAULT, ZFS_TYPE_FILESYSTEM,
+ "<count>", "FSCOUNT");
+ zprop_register_number(ZFS_PROP_SNAPSHOT_COUNT, "snapshot_count",
+ UINT64_MAX, PROP_DEFAULT, ZFS_TYPE_FILESYSTEM | ZFS_TYPE_VOLUME,
+ "<count>", "SSCOUNT");
+ zprop_register_number(ZFS_PROP_GUID, "guid", 0, PROP_READONLY,
+ ZFS_TYPE_DATASET | ZFS_TYPE_BOOKMARK, "<uint64>", "GUID");
+ zprop_register_number(ZFS_PROP_CREATETXG, "createtxg", 0, PROP_READONLY,
+ ZFS_TYPE_DATASET | ZFS_TYPE_BOOKMARK, "<uint64>", "CREATETXG");
+
+ /* inherit number properties */
+ zprop_register_number(ZFS_PROP_RECORDSIZE, "recordsize",
+ SPA_OLD_MAXBLOCKSIZE, PROP_INHERIT,
+ ZFS_TYPE_FILESYSTEM, "512 to 1M, power of 2", "RECSIZE");
+ zprop_register_number(ZFS_PROP_SPECIAL_SMALL_BLOCKS,
+ "special_small_blocks", 0, PROP_INHERIT, ZFS_TYPE_FILESYSTEM,
+ "zero or 512 to 128K, power of 2", "SPECIAL_SMALL_BLOCKS");
+
+ /* hidden properties */
+ zprop_register_hidden(ZFS_PROP_REMAPTXG, "remaptxg", PROP_TYPE_NUMBER,
+ PROP_READONLY, ZFS_TYPE_DATASET, "REMAPTXG");
+ zprop_register_hidden(ZFS_PROP_NUMCLONES, "numclones", PROP_TYPE_NUMBER,
+ PROP_READONLY, ZFS_TYPE_SNAPSHOT, "NUMCLONES");
+ zprop_register_hidden(ZFS_PROP_NAME, "name", PROP_TYPE_STRING,
+ PROP_READONLY, ZFS_TYPE_DATASET | ZFS_TYPE_BOOKMARK, "NAME");
+ zprop_register_hidden(ZFS_PROP_ISCSIOPTIONS, "iscsioptions",
+ PROP_TYPE_STRING, PROP_INHERIT, ZFS_TYPE_VOLUME, "ISCSIOPTIONS");
+ zprop_register_hidden(ZFS_PROP_STMF_SHAREINFO, "stmf_sbd_lu",
+ PROP_TYPE_STRING, PROP_INHERIT, ZFS_TYPE_VOLUME,
+ "STMF_SBD_LU");
+ zprop_register_hidden(ZFS_PROP_USERACCOUNTING, "useraccounting",
+ PROP_TYPE_NUMBER, PROP_READONLY, ZFS_TYPE_DATASET,
+ "USERACCOUNTING");
+ zprop_register_hidden(ZFS_PROP_UNIQUE, "unique", PROP_TYPE_NUMBER,
+ PROP_READONLY, ZFS_TYPE_DATASET, "UNIQUE");
+ zprop_register_hidden(ZFS_PROP_OBJSETID, "objsetid", PROP_TYPE_NUMBER,
+ PROP_READONLY, ZFS_TYPE_DATASET, "OBJSETID");
+ zprop_register_hidden(ZFS_PROP_INCONSISTENT, "inconsistent",
+ PROP_TYPE_NUMBER, PROP_READONLY, ZFS_TYPE_DATASET, "INCONSISTENT");
+ zprop_register_hidden(ZFS_PROP_PREV_SNAP, "prevsnap", PROP_TYPE_STRING,
+ PROP_READONLY, ZFS_TYPE_FILESYSTEM | ZFS_TYPE_VOLUME, "PREVSNAP");
+
+ /* oddball properties */
+ zprop_register_impl(ZFS_PROP_CREATION, "creation", PROP_TYPE_NUMBER, 0,
+ NULL, PROP_READONLY, ZFS_TYPE_DATASET | ZFS_TYPE_BOOKMARK,
+ "<date>", "CREATION", B_FALSE, B_TRUE, NULL);
+}
+
+boolean_t
+zfs_prop_delegatable(zfs_prop_t prop)
+{
+ zprop_desc_t *pd = &zfs_prop_table[prop];
+
+ /* The mlslabel property is never delegatable. */
+ if (prop == ZFS_PROP_MLSLABEL)
+ return (B_FALSE);
+
+ return (pd->pd_attr != PROP_READONLY);
+}
+
+/*
+ * Given a zfs dataset property name, returns the corresponding property ID.
+ */
+zfs_prop_t
+zfs_name_to_prop(const char *propname)
+{
+ return (zprop_name_to_prop(propname, ZFS_TYPE_DATASET));
+}
+
+/*
+ * For user property names, we allow all lowercase alphanumeric characters, plus
+ * a few useful punctuation characters.
+ */
+static int
+valid_char(char c)
+{
+ return ((c >= 'a' && c <= 'z') ||
+ (c >= '0' && c <= '9') ||
+ c == '-' || c == '_' || c == '.' || c == ':');
+}
+
+/*
+ * Returns true if this is a valid user-defined property (one with a ':').
+ */
+boolean_t
+zfs_prop_user(const char *name)
+{
+ int i;
+ char c;
+ boolean_t foundsep = B_FALSE;
+
+ for (i = 0; i < strlen(name); i++) {
+ c = name[i];
+ if (!valid_char(c))
+ return (B_FALSE);
+ if (c == ':')
+ foundsep = B_TRUE;
+ }
+
+ if (!foundsep)
+ return (B_FALSE);
+
+ return (B_TRUE);
+}
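/*
 * Editorial sketch (not part of this change), with a hypothetical
 * "com.example" namespace:
 *
 *	zfs_prop_user("com.example:backup")	-> B_TRUE
 *	zfs_prop_user("compression")		-> B_FALSE	(no ':')
 *	zfs_prop_user("Com.Example:backup")	-> B_FALSE	(uppercase)
 */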
+
+/*
+ * Returns true if this is a valid userspace-type property (one with a '@').
+ * Note that after the @, any character is valid (e.g., another @, for SID
+ * user@domain).
+ */
+boolean_t
+zfs_prop_userquota(const char *name)
+{
+ zfs_userquota_prop_t prop;
+
+ for (prop = 0; prop < ZFS_NUM_USERQUOTA_PROPS; prop++) {
+ if (strncmp(name, zfs_userquota_prop_prefixes[prop],
+ strlen(zfs_userquota_prop_prefixes[prop])) == 0) {
+ return (B_TRUE);
+ }
+ }
+
+ return (B_FALSE);
+}
+
+/*
+ * Returns true if this is a valid written@ property.
+ * Note that after the @, any character is valid (e.g., another @, for
+ * written@pool/fs@origin).
+ */
+boolean_t
+zfs_prop_written(const char *name)
+{
+ static const char *prefix = "written@";
+ return (strncmp(name, prefix, strlen(prefix)) == 0);
+}
+
+/*
+ * Tables of index types, plus functions to convert between the user view
+ * (strings) and internal representation (uint64_t).
+ */
+int
+zfs_prop_string_to_index(zfs_prop_t prop, const char *string, uint64_t *index)
+{
+ return (zprop_string_to_index(prop, string, index, ZFS_TYPE_DATASET));
+}
+
+int
+zfs_prop_index_to_string(zfs_prop_t prop, uint64_t index, const char **string)
+{
+ return (zprop_index_to_string(prop, index, string, ZFS_TYPE_DATASET));
+}
+
+uint64_t
+zfs_prop_random_value(zfs_prop_t prop, uint64_t seed)
+{
+ return (zprop_random_value(prop, seed, ZFS_TYPE_DATASET));
+}
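/*
 * Editorial sketch (not part of this change): round-tripping an index
 * property through the helpers above, assuming zfs_prop_init() has
 * already populated the tables and <stdio.h>/<string.h> are in scope.
 */
#if 0
	uint64_t ival;
	const char *sval;

	/* compress_table maps "lz4" to ZIO_COMPRESS_LZ4 and back. */
	if (zfs_prop_string_to_index(ZFS_PROP_COMPRESSION, "lz4", &ival) == 0 &&
	    zfs_prop_index_to_string(ZFS_PROP_COMPRESSION, ival, &sval) == 0)
		(void) printf("round trip: %s\n", sval);	/* prints "lz4" */
#endif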
+
+/*
+ * Returns TRUE if the property applies to any of the given dataset types.
+ */
+boolean_t
+zfs_prop_valid_for_type(int prop, zfs_type_t types)
+{
+ return (zprop_valid_for_type(prop, types));
+}
+
+zprop_type_t
+zfs_prop_get_type(zfs_prop_t prop)
+{
+ return (zfs_prop_table[prop].pd_proptype);
+}
+
+/*
+ * Returns TRUE if the property is readonly. Set-once (ONETIME)
+ * properties count as readonly here, since they cannot change after
+ * object creation.
+ */
+boolean_t
+zfs_prop_readonly(zfs_prop_t prop)
+{
+ return (zfs_prop_table[prop].pd_attr == PROP_READONLY ||
+ zfs_prop_table[prop].pd_attr == PROP_ONETIME);
+}
+
+/*
+ * Returns TRUE if the property is visible (not hidden).
+ */
+boolean_t
+zfs_prop_visible(zfs_prop_t prop)
+{
+ return (zfs_prop_table[prop].pd_visible);
+}
+
+/*
+ * Returns TRUE if the property is only allowed to be set once.
+ */
+boolean_t
+zfs_prop_setonce(zfs_prop_t prop)
+{
+ return (zfs_prop_table[prop].pd_attr == PROP_ONETIME);
+}
+
+const char *
+zfs_prop_default_string(zfs_prop_t prop)
+{
+ return (zfs_prop_table[prop].pd_strdefault);
+}
+
+uint64_t
+zfs_prop_default_numeric(zfs_prop_t prop)
+{
+ return (zfs_prop_table[prop].pd_numdefault);
+}
+
+/*
+ * Given a dataset property ID, returns the corresponding name.
+ * Assumes the zfs dataset property ID is valid.
+ */
+const char *
+zfs_prop_to_name(zfs_prop_t prop)
+{
+ return (zfs_prop_table[prop].pd_name);
+}
+
+/*
+ * Returns TRUE if the property is inheritable. Set-once (ONETIME)
+ * properties count as inheritable here, since they may be inherited
+ * at creation time if not explicitly set.
+ */
+boolean_t
+zfs_prop_inheritable(zfs_prop_t prop)
+{
+ return (zfs_prop_table[prop].pd_attr == PROP_INHERIT ||
+ zfs_prop_table[prop].pd_attr == PROP_ONETIME);
+}
+
+#ifndef _KERNEL
+
+/*
+ * Returns a string describing the set of acceptable values for the given
+ * zfs property, or NULL if it cannot be set.
+ */
+const char *
+zfs_prop_values(zfs_prop_t prop)
+{
+ return (zfs_prop_table[prop].pd_values);
+}
+
+/*
+ * Returns TRUE if this property is a string type. Note that index types
+ * (compression, checksum) are treated as strings in userland, even though they
+ * are stored numerically on disk.
+ */
+int
+zfs_prop_is_string(zfs_prop_t prop)
+{
+ return (zfs_prop_table[prop].pd_proptype == PROP_TYPE_STRING ||
+ zfs_prop_table[prop].pd_proptype == PROP_TYPE_INDEX);
+}
+
+/*
+ * Returns the column header for the given property. Used only in
+ * 'zfs list -o', but centralized here with the other property information.
+ */
+const char *
+zfs_prop_column_name(zfs_prop_t prop)
+{
+ return (zfs_prop_table[prop].pd_colname);
+}
+
+/*
+ * Returns whether the given property should be displayed right-justified for
+ * 'zfs list'.
+ */
+boolean_t
+zfs_prop_align_right(zfs_prop_t prop)
+{
+ return (zfs_prop_table[prop].pd_rightalign);
+}
+
+#endif
diff --git a/sys/cddl/contrib/opensolaris/common/zfs/zfs_prop.h b/sys/cddl/contrib/opensolaris/common/zfs/zfs_prop.h
new file mode 100644
index 000000000000..e604abda131d
--- /dev/null
+++ b/sys/cddl/contrib/opensolaris/common/zfs/zfs_prop.h
@@ -0,0 +1,131 @@
+/*
+ * CDDL HEADER START
+ *
+ * The contents of this file are subject to the terms of the
+ * Common Development and Distribution License (the "License").
+ * You may not use this file except in compliance with the License.
+ *
+ * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
+ * or http://www.opensolaris.org/os/licensing.
+ * See the License for the specific language governing permissions
+ * and limitations under the License.
+ *
+ * When distributing Covered Code, include this CDDL HEADER in each
+ * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
+ * If applicable, add the following below this CDDL HEADER, with the
+ * fields enclosed by brackets "[]" replaced with your own identifying
+ * information: Portions Copyright [yyyy] [name of copyright owner]
+ *
+ * CDDL HEADER END
+ */
+/*
+ * Copyright 2010 Sun Microsystems, Inc. All rights reserved.
+ * Use is subject to license terms.
+ */
+
+#ifndef _ZFS_PROP_H
+#define _ZFS_PROP_H
+
+#include <sys/fs/zfs.h>
+#include <sys/types.h>
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+/*
+ * For index types (e.g. compression and checksum), we want the numeric value
+ * in the kernel, but the string value in userland.
+ */
+typedef enum {
+ PROP_TYPE_NUMBER, /* numeric value */
+ PROP_TYPE_STRING, /* string value */
+ PROP_TYPE_INDEX /* numeric value indexed by string */
+} zprop_type_t;
+
+typedef enum {
+ PROP_DEFAULT,
+ PROP_READONLY,
+ PROP_INHERIT,
+ /*
+ * ONETIME properties are a sort of conglomeration of READONLY
+ * and INHERIT. They can be set only during object creation,
+ * after that they are READONLY. If not explicitly set during
+ * creation, they can be inherited.
+ */
+ PROP_ONETIME
+} zprop_attr_t;
+
+typedef struct zfs_index {
+ const char *pi_name;
+ uint64_t pi_value;
+} zprop_index_t;
+
+typedef struct {
+ const char *pd_name; /* human-readable property name */
+ int pd_propnum; /* property number */
+ zprop_type_t pd_proptype; /* string, boolean, index, number */
+ const char *pd_strdefault; /* default for strings */
+ uint64_t pd_numdefault; /* for boolean / index / number */
+ zprop_attr_t pd_attr; /* default, readonly, inherit */
+ int pd_types; /* bitfield of valid dataset types */
+ /* fs | vol | snap; or pool */
+ const char *pd_values; /* string telling acceptable values */
+ const char *pd_colname; /* column header for "zfs list" */
+ boolean_t pd_rightalign; /* column alignment for "zfs list" */
+ boolean_t pd_visible; /* do we list this property with the */
+ /* "zfs get" help message */
+ const zprop_index_t *pd_table; /* for index properties, a table */
+ /* defining the possible values */
+ size_t pd_table_size; /* number of entries in pd_table[] */
+} zprop_desc_t;
+
+/*
+ * zfs dataset property functions
+ */
+void zfs_prop_init(void);
+zprop_type_t zfs_prop_get_type(zfs_prop_t);
+boolean_t zfs_prop_delegatable(zfs_prop_t prop);
+zprop_desc_t *zfs_prop_get_table(void);
+
+/*
+ * zpool property functions
+ */
+void zpool_prop_init(void);
+zprop_type_t zpool_prop_get_type(zpool_prop_t);
+zprop_desc_t *zpool_prop_get_table(void);
+
+/*
+ * Common routines to initialize property tables
+ */
+void zprop_register_impl(int, const char *, zprop_type_t, uint64_t,
+ const char *, zprop_attr_t, int, const char *, const char *,
+ boolean_t, boolean_t, const zprop_index_t *);
+void zprop_register_string(int, const char *, const char *,
+ zprop_attr_t attr, int, const char *, const char *);
+void zprop_register_number(int, const char *, uint64_t, zprop_attr_t, int,
+ const char *, const char *);
+void zprop_register_index(int, const char *, uint64_t, zprop_attr_t, int,
+ const char *, const char *, const zprop_index_t *);
+void zprop_register_hidden(int, const char *, zprop_type_t, zprop_attr_t,
+ int, const char *);
+
+/*
+ * Common routines for zfs and zpool property management
+ */
+int zprop_iter_common(zprop_func, void *, boolean_t, boolean_t, zfs_type_t);
+int zprop_name_to_prop(const char *, zfs_type_t);
+int zprop_string_to_index(int, const char *, uint64_t *, zfs_type_t);
+int zprop_index_to_string(int, uint64_t, const char **, zfs_type_t);
+uint64_t zprop_random_value(int, uint64_t, zfs_type_t);
+const char *zprop_values(int, zfs_type_t);
+size_t zprop_width(int, boolean_t *, zfs_type_t);
+boolean_t zprop_valid_for_type(int, zfs_type_t);
+boolean_t zfs_prop_written(const char *name);
+
+#ifdef __cplusplus
+}
+#endif
+
+#endif /* _ZFS_PROP_H */
diff --git a/sys/cddl/contrib/opensolaris/common/zfs/zpool_prop.c b/sys/cddl/contrib/opensolaris/common/zfs/zpool_prop.c
new file mode 100644
index 000000000000..d17c7fd98043
--- /dev/null
+++ b/sys/cddl/contrib/opensolaris/common/zfs/zpool_prop.c
@@ -0,0 +1,250 @@
+/*
+ * CDDL HEADER START
+ *
+ * The contents of this file are subject to the terms of the
+ * Common Development and Distribution License (the "License").
+ * You may not use this file except in compliance with the License.
+ *
+ * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
+ * or http://www.opensolaris.org/os/licensing.
+ * See the License for the specific language governing permissions
+ * and limitations under the License.
+ *
+ * When distributing Covered Code, include this CDDL HEADER in each
+ * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
+ * If applicable, add the following below this CDDL HEADER, with the
+ * fields enclosed by brackets "[]" replaced with your own identifying
+ * information: Portions Copyright [yyyy] [name of copyright owner]
+ *
+ * CDDL HEADER END
+ */
+/*
+ * Copyright (c) 2007, 2010, Oracle and/or its affiliates. All rights reserved.
+ * Copyright 2011 Nexenta Systems, Inc. All rights reserved.
+ * Copyright (c) 2012, 2017 by Delphix. All rights reserved.
+ * Copyright (c) 2014 Integros [integros.com]
+ */
+
+#include <sys/zio.h>
+#include <sys/spa.h>
+#include <sys/zfs_acl.h>
+#include <sys/zfs_ioctl.h>
+#include <sys/fs/zfs.h>
+
+#include "zfs_prop.h"
+
+#if defined(_KERNEL)
+#include <sys/systm.h>
+#else
+#include <stdlib.h>
+#include <string.h>
+#include <ctype.h>
+#endif
+
+static zprop_desc_t zpool_prop_table[ZPOOL_NUM_PROPS];
+
+zprop_desc_t *
+zpool_prop_get_table(void)
+{
+ return (zpool_prop_table);
+}
+
+void
+zpool_prop_init(void)
+{
+ static zprop_index_t boolean_table[] = {
+		{ "off",	0 },
+		{ "on",		1 },
+ { NULL }
+ };
+
+ static zprop_index_t failuremode_table[] = {
+ { "wait", ZIO_FAILURE_MODE_WAIT },
+ { "continue", ZIO_FAILURE_MODE_CONTINUE },
+ { "panic", ZIO_FAILURE_MODE_PANIC },
+ { NULL }
+ };
+
+ /* string properties */
+ zprop_register_string(ZPOOL_PROP_ALTROOT, "altroot", NULL, PROP_DEFAULT,
+ ZFS_TYPE_POOL, "<path>", "ALTROOT");
+ zprop_register_string(ZPOOL_PROP_BOOTFS, "bootfs", NULL, PROP_DEFAULT,
+ ZFS_TYPE_POOL, "<filesystem>", "BOOTFS");
+ zprop_register_string(ZPOOL_PROP_CACHEFILE, "cachefile", NULL,
+ PROP_DEFAULT, ZFS_TYPE_POOL, "<file> | none", "CACHEFILE");
+ zprop_register_string(ZPOOL_PROP_COMMENT, "comment", NULL,
+ PROP_DEFAULT, ZFS_TYPE_POOL, "<comment-string>", "COMMENT");
+
+ /* readonly number properties */
+ zprop_register_number(ZPOOL_PROP_SIZE, "size", 0, PROP_READONLY,
+ ZFS_TYPE_POOL, "<size>", "SIZE");
+ zprop_register_number(ZPOOL_PROP_FREE, "free", 0, PROP_READONLY,
+ ZFS_TYPE_POOL, "<size>", "FREE");
+ zprop_register_number(ZPOOL_PROP_FREEING, "freeing", 0, PROP_READONLY,
+ ZFS_TYPE_POOL, "<size>", "FREEING");
+ zprop_register_number(ZPOOL_PROP_CHECKPOINT, "checkpoint", 0,
+ PROP_READONLY, ZFS_TYPE_POOL, "<size>", "CKPOINT");
+ zprop_register_number(ZPOOL_PROP_LEAKED, "leaked", 0, PROP_READONLY,
+ ZFS_TYPE_POOL, "<size>", "LEAKED");
+ zprop_register_number(ZPOOL_PROP_ALLOCATED, "allocated", 0,
+ PROP_READONLY, ZFS_TYPE_POOL, "<size>", "ALLOC");
+ zprop_register_number(ZPOOL_PROP_EXPANDSZ, "expandsize", 0,
+ PROP_READONLY, ZFS_TYPE_POOL, "<size>", "EXPANDSZ");
+ zprop_register_number(ZPOOL_PROP_FRAGMENTATION, "fragmentation", 0,
+ PROP_READONLY, ZFS_TYPE_POOL, "<percent>", "FRAG");
+ zprop_register_number(ZPOOL_PROP_CAPACITY, "capacity", 0, PROP_READONLY,
+ ZFS_TYPE_POOL, "<size>", "CAP");
+ zprop_register_number(ZPOOL_PROP_GUID, "guid", 0, PROP_READONLY,
+ ZFS_TYPE_POOL, "<guid>", "GUID");
+ zprop_register_number(ZPOOL_PROP_HEALTH, "health", 0, PROP_READONLY,
+ ZFS_TYPE_POOL, "<state>", "HEALTH");
+ zprop_register_number(ZPOOL_PROP_DEDUPRATIO, "dedupratio", 0,
+ PROP_READONLY, ZFS_TYPE_POOL, "<1.00x or higher if deduped>",
+ "DEDUP");
+
+ /* system partition size */
+ zprop_register_number(ZPOOL_PROP_BOOTSIZE, "bootsize", 0, PROP_ONETIME,
+ ZFS_TYPE_POOL, "<size>", "BOOTSIZE");
+
+ /* default number properties */
+ zprop_register_number(ZPOOL_PROP_VERSION, "version", SPA_VERSION,
+ PROP_DEFAULT, ZFS_TYPE_POOL, "<version>", "VERSION");
+ zprop_register_number(ZPOOL_PROP_DEDUPDITTO, "dedupditto", 0,
+ PROP_DEFAULT, ZFS_TYPE_POOL, "<threshold (min 100)>", "DEDUPDITTO");
+
+ /* default index (boolean) properties */
+ zprop_register_index(ZPOOL_PROP_DELEGATION, "delegation", 1,
+ PROP_DEFAULT, ZFS_TYPE_POOL, "on | off", "DELEGATION",
+ boolean_table);
+ zprop_register_index(ZPOOL_PROP_AUTOREPLACE, "autoreplace", 0,
+ PROP_DEFAULT, ZFS_TYPE_POOL, "on | off", "REPLACE", boolean_table);
+ zprop_register_index(ZPOOL_PROP_LISTSNAPS, "listsnapshots", 0,
+ PROP_DEFAULT, ZFS_TYPE_POOL, "on | off", "LISTSNAPS",
+ boolean_table);
+ zprop_register_index(ZPOOL_PROP_AUTOEXPAND, "autoexpand", 0,
+ PROP_DEFAULT, ZFS_TYPE_POOL, "on | off", "EXPAND", boolean_table);
+ zprop_register_index(ZPOOL_PROP_READONLY, "readonly", 0,
+ PROP_DEFAULT, ZFS_TYPE_POOL, "on | off", "RDONLY", boolean_table);
+ zprop_register_index(ZPOOL_PROP_MULTIHOST, "multihost", 0,
+ PROP_DEFAULT, ZFS_TYPE_POOL, "on | off", "MULTIHOST",
+ boolean_table);
+
+ /* default index properties */
+ zprop_register_index(ZPOOL_PROP_FAILUREMODE, "failmode",
+ ZIO_FAILURE_MODE_WAIT, PROP_DEFAULT, ZFS_TYPE_POOL,
+ "wait | continue | panic", "FAILMODE", failuremode_table);
+
+ /* hidden properties */
+ zprop_register_hidden(ZPOOL_PROP_NAME, "name", PROP_TYPE_STRING,
+ PROP_READONLY, ZFS_TYPE_POOL, "NAME");
+ zprop_register_hidden(ZPOOL_PROP_MAXBLOCKSIZE, "maxblocksize",
+ PROP_TYPE_NUMBER, PROP_READONLY, ZFS_TYPE_POOL, "MAXBLOCKSIZE");
+ zprop_register_hidden(ZPOOL_PROP_TNAME, "tname", PROP_TYPE_STRING,
+ PROP_ONETIME, ZFS_TYPE_POOL, "TNAME");
+ zprop_register_hidden(ZPOOL_PROP_MAXDNODESIZE, "maxdnodesize",
+ PROP_TYPE_NUMBER, PROP_READONLY, ZFS_TYPE_POOL, "MAXDNODESIZE");
+}
+
+/*
+ * Given a pool property name, returns the corresponding property ID.
+ */
+zpool_prop_t
+zpool_name_to_prop(const char *propname)
+{
+ return (zprop_name_to_prop(propname, ZFS_TYPE_POOL));
+}
+
+/*
+ * Given a pool property ID, returns the corresponding name.
+ * Assumes the pool property ID is valid.
+ */
+const char *
+zpool_prop_to_name(zpool_prop_t prop)
+{
+ return (zpool_prop_table[prop].pd_name);
+}
+
+zprop_type_t
+zpool_prop_get_type(zpool_prop_t prop)
+{
+ return (zpool_prop_table[prop].pd_proptype);
+}
+
+boolean_t
+zpool_prop_readonly(zpool_prop_t prop)
+{
+ return (zpool_prop_table[prop].pd_attr == PROP_READONLY);
+}
+
+const char *
+zpool_prop_default_string(zpool_prop_t prop)
+{
+ return (zpool_prop_table[prop].pd_strdefault);
+}
+
+uint64_t
+zpool_prop_default_numeric(zpool_prop_t prop)
+{
+ return (zpool_prop_table[prop].pd_numdefault);
+}
+
+/*
+ * Returns true if this is a valid feature@ property.
+ */
+boolean_t
+zpool_prop_feature(const char *name)
+{
+ static const char *prefix = "feature@";
+ return (strncmp(name, prefix, strlen(prefix)) == 0);
+}
+
+/*
+ * Returns true if this is a valid unsupported@ property.
+ */
+boolean_t
+zpool_prop_unsupported(const char *name)
+{
+ static const char *prefix = "unsupported@";
+ return (strncmp(name, prefix, strlen(prefix)) == 0);
+}
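/*
 * Editorial sketch (not part of this change); "async_destroy" and the
 * "com.foo:prop" suffix are illustrative names only:
 *
 *	zpool_prop_feature("feature@async_destroy")		-> B_TRUE
 *	zpool_prop_unsupported("unsupported@com.foo:prop")	-> B_TRUE
 *	zpool_prop_feature("bootfs")				-> B_FALSE
 */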
+
+int
+zpool_prop_string_to_index(zpool_prop_t prop, const char *string,
+ uint64_t *index)
+{
+ return (zprop_string_to_index(prop, string, index, ZFS_TYPE_POOL));
+}
+
+int
+zpool_prop_index_to_string(zpool_prop_t prop, uint64_t index,
+ const char **string)
+{
+ return (zprop_index_to_string(prop, index, string, ZFS_TYPE_POOL));
+}
+
+uint64_t
+zpool_prop_random_value(zpool_prop_t prop, uint64_t seed)
+{
+ return (zprop_random_value(prop, seed, ZFS_TYPE_POOL));
+}
+
+#ifndef _KERNEL
+
+const char *
+zpool_prop_values(zpool_prop_t prop)
+{
+ return (zpool_prop_table[prop].pd_values);
+}
+
+const char *
+zpool_prop_column_name(zpool_prop_t prop)
+{
+ return (zpool_prop_table[prop].pd_colname);
+}
+
+boolean_t
+zpool_prop_align_right(zpool_prop_t prop)
+{
+ return (zpool_prop_table[prop].pd_rightalign);
+}
+#endif
diff --git a/sys/cddl/contrib/opensolaris/common/zfs/zprop_common.c b/sys/cddl/contrib/opensolaris/common/zfs/zprop_common.c
new file mode 100644
index 000000000000..ca2e72c5daa4
--- /dev/null
+++ b/sys/cddl/contrib/opensolaris/common/zfs/zprop_common.c
@@ -0,0 +1,430 @@
+/*
+ * CDDL HEADER START
+ *
+ * The contents of this file are subject to the terms of the
+ * Common Development and Distribution License (the "License").
+ * You may not use this file except in compliance with the License.
+ *
+ * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
+ * or http://www.opensolaris.org/os/licensing.
+ * See the License for the specific language governing permissions
+ * and limitations under the License.
+ *
+ * When distributing Covered Code, include this CDDL HEADER in each
+ * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
+ * If applicable, add the following below this CDDL HEADER, with the
+ * fields enclosed by brackets "[]" replaced with your own identifying
+ * information: Portions Copyright [yyyy] [name of copyright owner]
+ *
+ * CDDL HEADER END
+ */
+/*
+ * Copyright 2010 Sun Microsystems, Inc. All rights reserved.
+ * Use is subject to license terms.
+ */
+/*
+ * Copyright (c) 2012 by Delphix. All rights reserved.
+ */
+
+/*
+ * Common routines used by zfs and zpool property management.
+ */
+
+#include <sys/zio.h>
+#include <sys/spa.h>
+#include <sys/zfs_acl.h>
+#include <sys/zfs_ioctl.h>
+#include <sys/zfs_znode.h>
+#include <sys/fs/zfs.h>
+
+#include "zfs_prop.h"
+#include "zfs_deleg.h"
+
+#if defined(_KERNEL)
+#include <sys/systm.h>
+#include <sys/libkern.h>
+#else
+#include <stdlib.h>
+#include <string.h>
+#include <ctype.h>
+#endif
+
+static zprop_desc_t *
+zprop_get_proptable(zfs_type_t type)
+{
+ if (type == ZFS_TYPE_POOL)
+ return (zpool_prop_get_table());
+ else
+ return (zfs_prop_get_table());
+}
+
+static int
+zprop_get_numprops(zfs_type_t type)
+{
+ if (type == ZFS_TYPE_POOL)
+ return (ZPOOL_NUM_PROPS);
+ else
+ return (ZFS_NUM_PROPS);
+}
+
+void
+zprop_register_impl(int prop, const char *name, zprop_type_t type,
+ uint64_t numdefault, const char *strdefault, zprop_attr_t attr,
+ int objset_types, const char *values, const char *colname,
+ boolean_t rightalign, boolean_t visible, const zprop_index_t *idx_tbl)
+{
+ zprop_desc_t *prop_tbl = zprop_get_proptable(objset_types);
+ zprop_desc_t *pd;
+
+ pd = &prop_tbl[prop];
+
+ ASSERT(pd->pd_name == NULL || pd->pd_name == name);
+ ASSERT(name != NULL);
+ ASSERT(colname != NULL);
+
+ pd->pd_name = name;
+ pd->pd_propnum = prop;
+ pd->pd_proptype = type;
+ pd->pd_numdefault = numdefault;
+ pd->pd_strdefault = strdefault;
+ pd->pd_attr = attr;
+ pd->pd_types = objset_types;
+ pd->pd_values = values;
+ pd->pd_colname = colname;
+ pd->pd_rightalign = rightalign;
+ pd->pd_visible = visible;
+ pd->pd_table = idx_tbl;
+ pd->pd_table_size = 0;
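+	/* Count entries up to the NULL-named sentinel that ends idx_tbl. */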
+ while (idx_tbl && (idx_tbl++)->pi_name != NULL)
+ pd->pd_table_size++;
+}
+
+void
+zprop_register_string(int prop, const char *name, const char *def,
+ zprop_attr_t attr, int objset_types, const char *values,
+ const char *colname)
+{
+ zprop_register_impl(prop, name, PROP_TYPE_STRING, 0, def, attr,
+ objset_types, values, colname, B_FALSE, B_TRUE, NULL);
+}
+
+void
+zprop_register_number(int prop, const char *name, uint64_t def,
+ zprop_attr_t attr, int objset_types, const char *values,
+ const char *colname)
+{
+ zprop_register_impl(prop, name, PROP_TYPE_NUMBER, def, NULL, attr,
+ objset_types, values, colname, B_TRUE, B_TRUE, NULL);
+}
+
+void
+zprop_register_index(int prop, const char *name, uint64_t def,
+ zprop_attr_t attr, int objset_types, const char *values,
+ const char *colname, const zprop_index_t *idx_tbl)
+{
+ zprop_register_impl(prop, name, PROP_TYPE_INDEX, def, NULL, attr,
+ objset_types, values, colname, B_TRUE, B_TRUE, idx_tbl);
+}
+
+void
+zprop_register_hidden(int prop, const char *name, zprop_type_t type,
+ zprop_attr_t attr, int objset_types, const char *colname)
+{
+ zprop_register_impl(prop, name, type, 0, NULL, attr,
+ objset_types, NULL, colname,
+ type == PROP_TYPE_NUMBER, B_FALSE, NULL);
+}
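+
+/*
+ * An illustrative use of the wrappers above (a sketch only: the real
+ * registrations live in zfs_prop.c and zpool_prop.c, and the argument
+ * values shown here are examples of the calling convention):
+ *
+ *	zprop_register_string(ZFS_PROP_MOUNTPOINT, "mountpoint", "/",
+ *	    PROP_INHERIT, ZFS_TYPE_FILESYSTEM, "<path> | legacy | none",
+ *	    "MOUNTPOINT");
+ */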
+
+
+/*
+ * A comparison function we can use to order indexes into property tables:
+ * read-only properties sort before writable ones, and within each group
+ * properties sort alphabetically by name.
+ */
+static int
+zprop_compare(const void *arg1, const void *arg2)
+{
+ const zprop_desc_t *p1 = *((zprop_desc_t **)arg1);
+ const zprop_desc_t *p2 = *((zprop_desc_t **)arg2);
+ boolean_t p1ro, p2ro;
+
+ p1ro = (p1->pd_attr == PROP_READONLY);
+ p2ro = (p2->pd_attr == PROP_READONLY);
+
+ if (p1ro == p2ro)
+ return (strcmp(p1->pd_name, p2->pd_name));
+
+ return (p1ro ? -1 : 1);
+}
+
+/*
+ * Iterate over all properties in the given property table, calling back
+ * into the specified function for each property. We will continue to
+ * iterate until we either reach the end or the callback function returns
+ * something other than ZPROP_CONT.
+ */
+int
+zprop_iter_common(zprop_func func, void *cb, boolean_t show_all,
+ boolean_t ordered, zfs_type_t type)
+{
+ int i, j, num_props, size, prop;
+ zprop_desc_t *prop_tbl;
+ zprop_desc_t **order;
+
+ prop_tbl = zprop_get_proptable(type);
+ num_props = zprop_get_numprops(type);
+ size = num_props * sizeof (zprop_desc_t *);
+
+#if defined(_KERNEL)
+ order = kmem_alloc(size, KM_SLEEP);
+#else
+ if ((order = malloc(size)) == NULL)
+ return (ZPROP_CONT);
+#endif
+
+ for (j = 0; j < num_props; j++)
+ order[j] = &prop_tbl[j];
+
+ if (ordered) {
+ qsort((void *)order, num_props, sizeof (zprop_desc_t *),
+ zprop_compare);
+ }
+
+ prop = ZPROP_CONT;
+ for (i = 0; i < num_props; i++) {
+ if ((order[i]->pd_visible || show_all) &&
+ (func(order[i]->pd_propnum, cb) != ZPROP_CONT)) {
+ prop = order[i]->pd_propnum;
+ break;
+ }
+ }
+
+#if defined(_KERNEL)
+ kmem_free(order, size);
+#else
+ free(order);
+#endif
+ return (prop);
+}
+
+static boolean_t
+propname_match(const char *p, size_t len, zprop_desc_t *prop_entry)
+{
+ const char *propname = prop_entry->pd_name;
+#ifndef _KERNEL
+ const char *colname = prop_entry->pd_colname;
+ int c;
+#endif
+
+ if (len == strlen(propname) &&
+ strncmp(p, propname, len) == 0)
+ return (B_TRUE);
+
+#ifndef _KERNEL
+ if (colname == NULL || len != strlen(colname))
+ return (B_FALSE);
+
+ for (c = 0; c < len; c++)
+ if (p[c] != tolower(colname[c]))
+ break;
+
+ return (colname[c] == '\0');
+#else
+ return (B_FALSE);
+#endif
+}
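+
+/*
+ * For example (illustrative): in userland, propname_match() accepts both
+ * a property's full name (e.g. "compression") and the lowercased form of
+ * its column name (e.g. "compress" for a COMPRESS column header); in the
+ * kernel only the full name matches.
+ */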
+
+typedef struct name_to_prop_cb {
+ const char *propname;
+ zprop_desc_t *prop_tbl;
+} name_to_prop_cb_t;
+
+static int
+zprop_name_to_prop_cb(int prop, void *cb_data)
+{
+ name_to_prop_cb_t *data = cb_data;
+
+ if (propname_match(data->propname, strlen(data->propname),
+ &data->prop_tbl[prop]))
+ return (prop);
+
+ return (ZPROP_CONT);
+}
+
+int
+zprop_name_to_prop(const char *propname, zfs_type_t type)
+{
+ int prop;
+ name_to_prop_cb_t cb_data;
+
+ cb_data.propname = propname;
+ cb_data.prop_tbl = zprop_get_proptable(type);
+
+ prop = zprop_iter_common(zprop_name_to_prop_cb, &cb_data,
+ B_TRUE, B_FALSE, type);
+
+ return (prop == ZPROP_CONT ? ZPROP_INVAL : prop);
+}
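+
+/*
+ * Typical use (a sketch, assuming the standard ZFS property table):
+ *
+ *	int prop;
+ *
+ *	prop = zprop_name_to_prop("compression", ZFS_TYPE_FILESYSTEM);
+ *	if (prop == ZPROP_INVAL)
+ *		... "compression" is not a native property ...
+ */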
+
+int
+zprop_string_to_index(int prop, const char *string, uint64_t *index,
+ zfs_type_t type)
+{
+ zprop_desc_t *prop_tbl;
+ const zprop_index_t *idx_tbl;
+ int i;
+
+ if (prop == ZPROP_INVAL || prop == ZPROP_CONT)
+ return (-1);
+
+ ASSERT(prop < zprop_get_numprops(type));
+ prop_tbl = zprop_get_proptable(type);
+ if ((idx_tbl = prop_tbl[prop].pd_table) == NULL)
+ return (-1);
+
+ for (i = 0; idx_tbl[i].pi_name != NULL; i++) {
+ if (strcmp(string, idx_tbl[i].pi_name) == 0) {
+ *index = idx_tbl[i].pi_value;
+ return (0);
+ }
+ }
+
+ return (-1);
+}
+
+int
+zprop_index_to_string(int prop, uint64_t index, const char **string,
+ zfs_type_t type)
+{
+ zprop_desc_t *prop_tbl;
+ const zprop_index_t *idx_tbl;
+ int i;
+
+ if (prop == ZPROP_INVAL || prop == ZPROP_CONT)
+ return (-1);
+
+ ASSERT(prop < zprop_get_numprops(type));
+ prop_tbl = zprop_get_proptable(type);
+ if ((idx_tbl = prop_tbl[prop].pd_table) == NULL)
+ return (-1);
+
+ for (i = 0; idx_tbl[i].pi_name != NULL; i++) {
+ if (idx_tbl[i].pi_value == index) {
+ *string = idx_tbl[i].pi_name;
+ return (0);
+ }
+ }
+
+ return (-1);
+}
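+
+/*
+ * The two lookups above are inverses of one another. A sketch, assuming
+ * a property whose index table is { { "off", 0 }, { "on", 1 }, { NULL } }:
+ *
+ *	uint64_t ival;
+ *	const char *sval;
+ *
+ *	zprop_string_to_index(prop, "on", &ival, type);	(ival == 1)
+ *	zprop_index_to_string(prop, 1, &sval, type);	(sval == "on")
+ *
+ * Both return -1 if the property has no index table or nothing matches.
+ */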
+
+/*
+ * Return a random valid property value. Used by ztest.
+ */
+uint64_t
+zprop_random_value(int prop, uint64_t seed, zfs_type_t type)
+{
+ zprop_desc_t *prop_tbl;
+ const zprop_index_t *idx_tbl;
+
+ ASSERT((uint_t)prop < zprop_get_numprops(type));
+ prop_tbl = zprop_get_proptable(type);
+ idx_tbl = prop_tbl[prop].pd_table;
+
+ if (idx_tbl == NULL)
+ return (seed);
+
+ return (idx_tbl[seed % prop_tbl[prop].pd_table_size].pi_value);
+}
+
+const char *
+zprop_values(int prop, zfs_type_t type)
+{
+ zprop_desc_t *prop_tbl;
+
+ ASSERT(prop != ZPROP_INVAL && prop != ZPROP_CONT);
+ ASSERT(prop < zprop_get_numprops(type));
+
+ prop_tbl = zprop_get_proptable(type);
+
+ return (prop_tbl[prop].pd_values);
+}
+
+/*
+ * Returns TRUE if the property applies to any of the given dataset types.
+ */
+boolean_t
+zprop_valid_for_type(int prop, zfs_type_t type)
+{
+ zprop_desc_t *prop_tbl;
+
+ if (prop == ZPROP_INVAL || prop == ZPROP_CONT)
+ return (B_FALSE);
+
+ ASSERT(prop < zprop_get_numprops(type));
+ prop_tbl = zprop_get_proptable(type);
+ return ((prop_tbl[prop].pd_types & type) != 0);
+}
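+
+/*
+ * For instance (illustrative): a volume-only property such as
+ * ZFS_PROP_VOLSIZE carries only ZFS_TYPE_VOLUME in its pd_types, so
+ * zprop_valid_for_type(ZFS_PROP_VOLSIZE, ZFS_TYPE_FILESYSTEM) returns
+ * B_FALSE.
+ */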
+
+#ifndef _KERNEL
+
+/*
+ * Determines the minimum width for the column, and indicates whether it's
+ * fixed or not. Only string columns and the date-formatted 'creation'
+ * property are non-fixed.
+ */
+size_t
+zprop_width(int prop, boolean_t *fixed, zfs_type_t type)
+{
+ zprop_desc_t *prop_tbl, *pd;
+ const zprop_index_t *idx;
+ size_t ret;
+ int i;
+
+ ASSERT(prop != ZPROP_INVAL && prop != ZPROP_CONT);
+ ASSERT(prop < zprop_get_numprops(type));
+
+ prop_tbl = zprop_get_proptable(type);
+ pd = &prop_tbl[prop];
+
+ *fixed = B_TRUE;
+
+ /*
+ * Start with the width of the column name.
+ */
+ ret = strlen(pd->pd_colname);
+
+ /*
+ * For fixed-width values, make sure the width is large enough to hold
+ * any possible value.
+ */
+ switch (pd->pd_proptype) {
+ case PROP_TYPE_NUMBER:
+ /*
+ * The maximum length of a human-readable number is 5 characters
+ * ("20.4M", for example).
+ */
+ if (ret < 5)
+ ret = 5;
+ /*
+ * 'creation' is handled specially because it's a number
+ * internally, but displayed as a date string.
+ */
+ if (prop == ZFS_PROP_CREATION)
+ *fixed = B_FALSE;
+ break;
+ case PROP_TYPE_INDEX:
+ idx = prop_tbl[prop].pd_table;
+ for (i = 0; idx[i].pi_name != NULL; i++) {
+ if (strlen(idx[i].pi_name) > ret)
+ ret = strlen(idx[i].pi_name);
+ }
+ break;
+
+ case PROP_TYPE_STRING:
+ *fixed = B_FALSE;
+ break;
+ }
+
+ return (ret);
+}
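+
+/*
+ * Worked example (hypothetical property): an index property with column
+ * name "SYNC" and index names "standard", "always" and "disabled" gets a
+ * width of 8 (strlen("disabled")), since that exceeds the column-name
+ * width of 4, and *fixed remains B_TRUE.
+ */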
+
+#endif
diff --git a/sys/cddl/contrib/opensolaris/uts/aarch64/dtrace/fasttrap_isa.c b/sys/cddl/contrib/opensolaris/uts/aarch64/dtrace/fasttrap_isa.c
new file mode 100644
index 000000000000..f5377a895d6c
--- /dev/null
+++ b/sys/cddl/contrib/opensolaris/uts/aarch64/dtrace/fasttrap_isa.c
@@ -0,0 +1,29 @@
+/*
+ * CDDL HEADER START
+ *
+ * The contents of this file are subject to the terms of the
+ * Common Development and Distribution License (the "License").
+ * You may not use this file except in compliance with the License.
+ *
+ * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
+ * or http://www.opensolaris.org/os/licensing.
+ * See the License for the specific language governing permissions
+ * and limitations under the License.
+ *
+ * When distributing Covered Code, include this CDDL HEADER in each
+ * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
+ * If applicable, add the following below this CDDL HEADER, with the
+ * fields enclosed by brackets "[]" replaced with your own identifying
+ * information: Portions Copyright [yyyy] [name of copyright owner]
+ *
+ * CDDL HEADER END
+ */
+
+/*
+ * Copyright 2007 Sun Microsystems, Inc. All rights reserved.
+ * Use is subject to license terms.
+ */
+
+/*
+ * XXX: Placeholder for AArch64 fasttrap code
+ */
diff --git a/sys/cddl/contrib/opensolaris/uts/aarch64/sys/fasttrap_isa.h b/sys/cddl/contrib/opensolaris/uts/aarch64/sys/fasttrap_isa.h
new file mode 100644
index 000000000000..d85426edb417
--- /dev/null
+++ b/sys/cddl/contrib/opensolaris/uts/aarch64/sys/fasttrap_isa.h
@@ -0,0 +1,46 @@
+/*
+ * CDDL HEADER START
+ *
+ * The contents of this file are subject to the terms of the
+ * Common Development and Distribution License, Version 1.0 only
+ * (the "License"). You may not use this file except in compliance
+ * with the License.
+ *
+ * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
+ * or http://www.opensolaris.org/os/licensing.
+ * See the License for the specific language governing permissions
+ * and limitations under the License.
+ *
+ * When distributing Covered Code, include this CDDL HEADER in each
+ * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
+ * If applicable, add the following below this CDDL HEADER, with the
+ * fields enclosed by brackets "[]" replaced with your own identifying
+ * information: Portions Copyright [yyyy] [name of copyright owner]
+ *
+ * CDDL HEADER END
+ */
+/*
+ * Copyright 2005 Sun Microsystems, Inc. All rights reserved.
+ * Use is subject to license terms.
+ */
+
+#ifndef _FASTTRAP_ISA_H
+#define _FASTTRAP_ISA_H
+
+#pragma ident "%Z%%M% %I% %E% SMI"
+
+#include <sys/types.h>
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+typedef uint32_t fasttrap_instr_t;
+
+/* XXX: Place for AArch64 fasttrap headers */
+
+#ifdef __cplusplus
+}
+#endif
+
+#endif /* _FASTTRAP_ISA_H */
diff --git a/sys/cddl/contrib/opensolaris/uts/arm/dtrace/fasttrap_isa.c b/sys/cddl/contrib/opensolaris/uts/arm/dtrace/fasttrap_isa.c
new file mode 100644
index 000000000000..18e3837b35b6
--- /dev/null
+++ b/sys/cddl/contrib/opensolaris/uts/arm/dtrace/fasttrap_isa.c
@@ -0,0 +1,30 @@
+/*
+ * CDDL HEADER START
+ *
+ * The contents of this file are subject to the terms of the
+ * Common Development and Distribution License (the "License").
+ * You may not use this file except in compliance with the License.
+ *
+ * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
+ * or http://www.opensolaris.org/os/licensing.
+ * See the License for the specific language governing permissions
+ * and limitations under the License.
+ *
+ * When distributing Covered Code, include this CDDL HEADER in each
+ * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
+ * If applicable, add the following below this CDDL HEADER, with the
+ * fields enclosed by brackets "[]" replaced with your own identifying
+ * information: Portions Copyright [yyyy] [name of copyright owner]
+ *
+ * CDDL HEADER END
+ */
+
+/*
+ * Copyright 2007 Sun Microsystems, Inc. All rights reserved.
+ * Use is subject to license terms.
+ */
+
+
+/*
+ * XXX: Placeholder for ARM fasttrap code
+ */
diff --git a/sys/cddl/contrib/opensolaris/uts/arm/sys/fasttrap_isa.h b/sys/cddl/contrib/opensolaris/uts/arm/sys/fasttrap_isa.h
new file mode 100644
index 000000000000..10361cbed8de
--- /dev/null
+++ b/sys/cddl/contrib/opensolaris/uts/arm/sys/fasttrap_isa.h
@@ -0,0 +1,94 @@
+/*
+ * CDDL HEADER START
+ *
+ * The contents of this file are subject to the terms of the
+ * Common Development and Distribution License, Version 1.0 only
+ * (the "License"). You may not use this file except in compliance
+ * with the License.
+ *
+ * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
+ * or http://www.opensolaris.org/os/licensing.
+ * See the License for the specific language governing permissions
+ * and limitations under the License.
+ *
+ * When distributing Covered Code, include this CDDL HEADER in each
+ * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
+ * If applicable, add the following below this CDDL HEADER, with the
+ * fields enclosed by brackets "[]" replaced with your own identifying
+ * information: Portions Copyright [yyyy] [name of copyright owner]
+ *
+ * CDDL HEADER END
+ */
+/*
+ * Copyright 2005 Sun Microsystems, Inc. All rights reserved.
+ * Use is subject to license terms.
+ */
+
+#ifndef _FASTTRAP_ISA_H
+#define _FASTTRAP_ISA_H
+
+#pragma ident "%Z%%M% %I% %E% SMI"
+
+#include <sys/types.h>
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
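+/*
+ * XXX: The definitions below appear to be carried over from the SPARC
+ * version of this header (note the SPARC-specific constructs: "ta",
+ * sethi, jmpl, rdpc and DCTIs); they stand in as placeholders until
+ * ARM-specific fasttrap support is written.
+ */
+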
+/*
+ * This is our reserved trap instruction: ta 0x38
+ */
+#define FASTTRAP_INSTR 0x91d02038
+
+#define FASTTRAP_SUNWDTRACE_SIZE 128
+
+typedef uint32_t fasttrap_instr_t;
+
+typedef struct fasttrap_machtp {
+ fasttrap_instr_t ftmt_instr; /* original instruction */
+ uintptr_t ftmt_dest; /* destination of DCTI */
+ uint8_t ftmt_type; /* emulation type */
+ uint8_t ftmt_flags; /* emulation flags */
+ uint8_t ftmt_cc; /* which cc to look at */
+ uint8_t ftmt_code; /* branch condition */
+} fasttrap_machtp_t;
+
+#define ftt_instr ftt_mtp.ftmt_instr
+#define ftt_dest ftt_mtp.ftmt_dest
+#define ftt_type ftt_mtp.ftmt_type
+#define ftt_flags ftt_mtp.ftmt_flags
+#define ftt_cc ftt_mtp.ftmt_cc
+#define ftt_code ftt_mtp.ftmt_code
+
+#define FASTTRAP_T_COMMON 0x00 /* common case -- no emulation */
+#define FASTTRAP_T_CCR 0x01 /* integer condition code branch */
+#define FASTTRAP_T_FCC 0x02 /* floating-point branch */
+#define FASTTRAP_T_REG 0x03 /* register predicated branch */
+#define FASTTRAP_T_ALWAYS 0x04 /* branch always */
+#define FASTTRAP_T_CALL 0x05 /* call instruction */
+#define FASTTRAP_T_JMPL 0x06 /* jmpl instruction */
+#define FASTTRAP_T_RDPC 0x07 /* rdpc instruction */
+#define FASTTRAP_T_RETURN 0x08 /* return instruction */
+
+/*
+ * For performance rather than correctness.
+ */
+#define FASTTRAP_T_SAVE 0x10 /* save instruction (func entry only) */
+#define FASTTRAP_T_RESTORE 0x11 /* restore instruction */
+#define FASTTRAP_T_OR 0x12 /* mov instruction */
+#define FASTTRAP_T_SETHI 0x13 /* sethi instruction (includes nop) */
+
+#define FASTTRAP_F_ANNUL 0x01 /* branch is annulled */
+#define FASTTRAP_F_RETMAYBE 0x02 /* not definitely a return site */
+
+#define FASTTRAP_AFRAMES 3
+#define FASTTRAP_RETURN_AFRAMES 4
+#define FASTTRAP_ENTRY_AFRAMES 3
+#define FASTTRAP_OFFSET_AFRAMES 3
+
+
+#ifdef __cplusplus
+}
+#endif
+
+#endif /* _FASTTRAP_ISA_H */
diff --git a/sys/cddl/contrib/opensolaris/uts/common/Makefile.files b/sys/cddl/contrib/opensolaris/uts/common/Makefile.files
new file mode 100644
index 000000000000..74b094f1e007
--- /dev/null
+++ b/sys/cddl/contrib/opensolaris/uts/common/Makefile.files
@@ -0,0 +1,183 @@
+#
+# CDDL HEADER START
+#
+# The contents of this file are subject to the terms of the
+# Common Development and Distribution License (the "License").
+# You may not use this file except in compliance with the License.
+#
+# You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
+# or http://www.opensolaris.org/os/licensing.
+# See the License for the specific language governing permissions
+# and limitations under the License.
+#
+# When distributing Covered Code, include this CDDL HEADER in each
+# file and include the License file at usr/src/OPENSOLARIS.LICENSE.
+# If applicable, add the following below this CDDL HEADER, with the
+# fields enclosed by brackets "[]" replaced with your own identifying
+# information: Portions Copyright [yyyy] [name of copyright owner]
+#
+# CDDL HEADER END
+#
+
+#
+# Copyright (c) 1991, 2010, Oracle and/or its affiliates. All rights reserved.
+# Copyright (c) 2012 Nexenta Systems, Inc. All rights reserved.
+# Copyright (c) 2012 Joyent, Inc. All rights reserved.
+# Copyright (c) 2011, 2014 by Delphix. All rights reserved.
+# Copyright (c) 2013 by Saso Kiselkov. All rights reserved.
+#
+# This Makefile defines all file modules for the directory uts/common
+# and its children. These are the source files which may be considered
+# common to all SunOS systems.
+
+LUA_OBJS += \
+ ldo.o \
+ lvm.o \
+ lbitlib.o \
+ lopcodes.o \
+ lstring.o \
+ ltable.o \
+ ltm.o \
+ lcorolib.o \
+ lauxlib.o \
+ ldebug.o \
+ lstate.o \
+ lgc.o \
+ lmem.o \
+ lctype.o \
+ lfunc.o \
+ ldump.o \
+ lundump.o \
+ lstrlib.o \
+ ltablib.o \
+ lapi.o \
+ lobject.o \
+ lbaselib.o \
+ lcompat.o \
+ lzio.o \
+ lcode.o \
+ llex.o \
+ lparser.o
+
+ZFS_COMMON_OBJS += \
+ abd.o \
+ aggsum.o \
+ arc.o \
+ bplist.o \
+ blkptr.o \
+ bpobj.o \
+ bptree.o \
+ bqueue.o \
+ cityhash.o \
+ dbuf.o \
+ dbuf_stats.o \
+ ddt.o \
+ ddt_zap.o \
+ dmu.o \
+ dmu_diff.o \
+ dmu_send.o \
+ dmu_object.o \
+ dmu_objset.o \
+ dmu_traverse.o \
+ dmu_tx.o \
+ dnode.o \
+ dnode_sync.o \
+ dsl_bookmark.o \
+ dsl_dir.o \
+ dsl_dataset.o \
+ dsl_deadlist.o \
+ dsl_destroy.o \
+ dsl_pool.o \
+ dsl_synctask.o \
+ dsl_userhold.o \
+ dmu_zfetch.o \
+ dsl_deleg.o \
+ dsl_prop.o \
+ dsl_scan.o \
+ zfeature.o \
+ gzip.o \
+ lzjb.o \
+ metaslab.o \
+ mmp.o \
+ multilist.o \
+ range_tree.o \
+ refcount.o \
+ rrwlock.o \
+ sa.o \
+ sha256.o \
+ skein_zfs.o \
+ spa.o \
+ spa_checkpoint.o \
+ spa_config.o \
+ spa_errlog.o \
+ spa_history.o \
+ spa_misc.o \
+ space_map.o \
+ space_reftree.o \
+ txg.o \
+ uberblock.o \
+ unique.o \
+ vdev.o \
+ vdev_cache.o \
+ vdev_file.o \
+ vdev_indirect.o \
+ vdev_indirect_births.o \
+ vdev_indirect_mapping.o \
+ vdev_initialize.o \
+ vdev_label.o \
+ vdev_mirror.o \
+ vdev_missing.o \
+ vdev_queue.o \
+ vdev_raidz.o \
+ vdev_removal.o \
+ vdev_root.o \
+ zap.o \
+ zap_leaf.o \
+ zap_micro.o \
+ zcp.o \
+ zcp_get.o \
+ zcp_global.o \
+ zcp_iter.o \
+ zcp_synctask.o \
+ zfs_byteswap.o \
+ zfs_debug.o \
+ zfs_fm.o \
+ zfs_fuid.o \
+ zfs_sa.o \
+ zfs_znode.o \
+ zil.o \
+ zio.o \
+ zio_checksum.o \
+ zio_compress.o \
+ zio_inject.o \
+ zle.o \
+ zrlock.o \
+ zthr.o
+
+ZFS_SHARED_OBJS += \
+ zfeature_common.o \
+ zfs_comutil.o \
+ zfs_deleg.o \
+ zfs_fletcher.o \
+ zfs_namecheck.o \
+ zfs_prop.o \
+ zpool_prop.o \
+ zprop_common.o
+
+ZFS_OBJS += \
+ $(ZFS_COMMON_OBJS) \
+ $(ZFS_SHARED_OBJS) \
+ zfs_acl.o \
+ zfs_ctldir.o \
+ zfs_dir.o \
+ zfs_ioctl.o \
+ zfs_ioctl_compat.o \
+ zfs_log.o \
+ zfs_onexit.o \
+ zfs_replay.o \
+ zfs_rlock.o \
+ zfs_vfsops.o \
+ zfs_vnops.o \
+ zvol.o
diff --git a/sys/cddl/contrib/opensolaris/uts/common/ctf/ctf_mod.c b/sys/cddl/contrib/opensolaris/uts/common/ctf/ctf_mod.c
new file mode 100644
index 000000000000..b34cf400cd2c
--- /dev/null
+++ b/sys/cddl/contrib/opensolaris/uts/common/ctf/ctf_mod.c
@@ -0,0 +1,177 @@
+/*
+ * CDDL HEADER START
+ *
+ * The contents of this file are subject to the terms of the
+ * Common Development and Distribution License, Version 1.0 only
+ * (the "License"). You may not use this file except in compliance
+ * with the License.
+ *
+ * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
+ * or http://www.opensolaris.org/os/licensing.
+ * See the License for the specific language governing permissions
+ * and limitations under the License.
+ *
+ * When distributing Covered Code, include this CDDL HEADER in each
+ * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
+ * If applicable, add the following below this CDDL HEADER, with the
+ * fields enclosed by brackets "[]" replaced with your own identifying
+ * information: Portions Copyright [yyyy] [name of copyright owner]
+ *
+ * CDDL HEADER END
+ */
+/*
+ * Copyright 2003 Sun Microsystems, Inc. All rights reserved.
+ * Use is subject to license terms.
+ */
+
+#pragma ident "%Z%%M% %I% %E% SMI"
+
+#include <sys/sysmacros.h>
+#include <sys/modctl.h>
+#include <sys/debug.h>
+#include <sys/mman.h>
+#include <sys/kobj.h>
+#include <ctf_impl.h>
+
+int ctf_leave_compressed = 0;
+
+static struct modlmisc modlmisc = {
+ &mod_miscops, "Compact C Type Format routines"
+};
+
+static struct modlinkage modlinkage = {
+ MODREV_1, &modlmisc, NULL
+};
+
+int
+_init(void)
+{
+ return (mod_install(&modlinkage));
+}
+
+int
+_info(struct modinfo *mip)
+{
+ return (mod_info(&modlinkage, mip));
+}
+
+int
+_fini(void)
+{
+ return (mod_remove(&modlinkage));
+}
+
+/*ARGSUSED*/
+void *
+ctf_zopen(int *errp)
+{
+ return ((void *)1); /* zmod is always loaded because we depend on it */
+}
+
+/*ARGSUSED*/
+const void *
+ctf_sect_mmap(ctf_sect_t *sp, int fd)
+{
+ return (MAP_FAILED); /* we don't support this in the kernel */
+}
+
+/*ARGSUSED*/
+void
+ctf_sect_munmap(const ctf_sect_t *sp)
+{
+ /* we don't support this in the kernel */
+}
+
+/*ARGSUSED*/
+ctf_file_t *
+ctf_fdopen(int fd, int *errp)
+{
+ return (ctf_set_open_errno(errp, ENOTSUP));
+}
+
+/*ARGSUSED*/
+ctf_file_t *
+ctf_open(const char *filename, int *errp)
+{
+ return (ctf_set_open_errno(errp, ENOTSUP));
+}
+
+/*ARGSUSED*/
+int
+ctf_write(ctf_file_t *fp, int fd)
+{
+ return (ctf_set_errno(fp, ENOTSUP));
+}
+
+int
+ctf_version(int version)
+{
+ ASSERT(version > 0 && version <= CTF_VERSION);
+
+ if (version > 0)
+ _libctf_version = MIN(CTF_VERSION, version);
+
+ return (_libctf_version);
+}
+
+/*ARGSUSED*/
+ctf_file_t *
+ctf_modopen(struct module *mp, int *error)
+{
+ ctf_sect_t ctfsect, symsect, strsect;
+ ctf_file_t *fp = NULL;
+ int err;
+
+ if (error == NULL)
+ error = &err;
+
+ ctfsect.cts_name = ".SUNW_ctf";
+ ctfsect.cts_type = SHT_PROGBITS;
+ ctfsect.cts_flags = SHF_ALLOC;
+ ctfsect.cts_data = mp->ctfdata;
+ ctfsect.cts_size = mp->ctfsize;
+ ctfsect.cts_entsize = 1;
+ ctfsect.cts_offset = 0;
+
+ symsect.cts_name = ".symtab";
+ symsect.cts_type = SHT_SYMTAB;
+ symsect.cts_flags = 0;
+ symsect.cts_data = mp->symtbl;
+ symsect.cts_size = mp->symhdr->sh_size;
+#ifdef _LP64
+ symsect.cts_entsize = sizeof (Elf64_Sym);
+#else
+ symsect.cts_entsize = sizeof (Elf32_Sym);
+#endif
+ symsect.cts_offset = 0;
+
+ strsect.cts_name = ".strtab";
+ strsect.cts_type = SHT_STRTAB;
+ strsect.cts_flags = 0;
+ strsect.cts_data = mp->strings;
+ strsect.cts_size = mp->strhdr->sh_size;
+ strsect.cts_entsize = 1;
+ strsect.cts_offset = 0;
+
+ ASSERT(MUTEX_HELD(&mod_lock));
+
+ if ((fp = ctf_bufopen(&ctfsect, &symsect, &strsect, error)) == NULL)
+ return (NULL);
+
+ if (!ctf_leave_compressed && (caddr_t)fp->ctf_base != mp->ctfdata) {
+ /*
+ * We must have just uncompressed the CTF data. To avoid
+ * others having to pay the (substantial) cost of decompressing
+ * the data, we're going to substitute the uncompressed version
+ * for the compressed version. Note that this implies that the
+ * first CTF consumer will induce memory impact on the system
+ * (but in the name of performance of future CTF consumers).
+ */
+ kobj_set_ctf(mp, (caddr_t)fp->ctf_base, fp->ctf_size);
+ fp->ctf_data.cts_data = fp->ctf_base;
+ fp->ctf_data.cts_size = fp->ctf_size;
+ }
+
+ return (fp);
+}
diff --git a/sys/cddl/contrib/opensolaris/uts/common/ctf/ctf_subr.c b/sys/cddl/contrib/opensolaris/uts/common/ctf/ctf_subr.c
new file mode 100644
index 000000000000..cd0a828628d4
--- /dev/null
+++ b/sys/cddl/contrib/opensolaris/uts/common/ctf/ctf_subr.c
@@ -0,0 +1,96 @@
+/*
+ * CDDL HEADER START
+ *
+ * The contents of this file are subject to the terms of the
+ * Common Development and Distribution License, Version 1.0 only
+ * (the "License"). You may not use this file except in compliance
+ * with the License.
+ *
+ * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
+ * or http://www.opensolaris.org/os/licensing.
+ * See the License for the specific language governing permissions
+ * and limitations under the License.
+ *
+ * When distributing Covered Code, include this CDDL HEADER in each
+ * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
+ * If applicable, add the following below this CDDL HEADER, with the
+ * fields enclosed by brackets "[]" replaced with your own identifying
+ * information: Portions Copyright [yyyy] [name of copyright owner]
+ *
+ * CDDL HEADER END
+ */
+/*
+ * Copyright 2004 Sun Microsystems, Inc. All rights reserved.
+ * Use is subject to license terms.
+ */
+
+#pragma ident "%Z%%M% %I% %E% SMI"
+
+#include <ctf_impl.h>
+#include <sys/kobj.h>
+#include <sys/kobj_impl.h>
+
+/*
+ * This module is used both during the normal operation of the kernel (i.e.
+ * after kmem has been initialized) and during boot (before unix`_start has
+ * been called). kobj_alloc is able to tell the difference between the two
+ * cases, and as such must be used instead of kmem_alloc.
+ */
+
+void *
+ctf_data_alloc(size_t size)
+{
+ void *buf = kobj_alloc(size, KM_NOWAIT|KM_SCRATCH);
+
+ if (buf == NULL)
+ return (MAP_FAILED);
+
+ return (buf);
+}
+
+void
+ctf_data_free(void *buf, size_t size)
+{
+ kobj_free(buf, size);
+}
+
+/*ARGSUSED*/
+void
+ctf_data_protect(void *buf, size_t size)
+{
+ /* we don't support this operation in the kernel */
+}
+
+void *
+ctf_alloc(size_t size)
+{
+ return (kobj_alloc(size, KM_NOWAIT|KM_TMP));
+}
+
+/*ARGSUSED*/
+void
+ctf_free(void *buf, size_t size)
+{
+ kobj_free(buf, size);
+}
+
+/*ARGSUSED*/
+const char *
+ctf_strerror(int err)
+{
+ return (NULL); /* we don't support this operation in the kernel */
+}
+
+/*PRINTFLIKE1*/
+void
+ctf_dprintf(const char *format, ...)
+{
+ if (_libctf_debug) {
+ va_list alist;
+
+ va_start(alist, format);
+ (void) printf("ctf DEBUG: ");
+ (void) vprintf(format, alist);
+ va_end(alist);
+ }
+}
diff --git a/sys/cddl/contrib/opensolaris/uts/common/dtrace/dtrace.c b/sys/cddl/contrib/opensolaris/uts/common/dtrace/dtrace.c
new file mode 100644
index 000000000000..8399be770bb0
--- /dev/null
+++ b/sys/cddl/contrib/opensolaris/uts/common/dtrace/dtrace.c
@@ -0,0 +1,18424 @@
+/*
+ * CDDL HEADER START
+ *
+ * The contents of this file are subject to the terms of the
+ * Common Development and Distribution License (the "License").
+ * You may not use this file except in compliance with the License.
+ *
+ * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
+ * or http://www.opensolaris.org/os/licensing.
+ * See the License for the specific language governing permissions
+ * and limitations under the License.
+ *
+ * When distributing Covered Code, include this CDDL HEADER in each
+ * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
+ * If applicable, add the following below this CDDL HEADER, with the
+ * fields enclosed by brackets "[]" replaced with your own identifying
+ * information: Portions Copyright [yyyy] [name of copyright owner]
+ *
+ * CDDL HEADER END
+ *
+ * $FreeBSD$
+ */
+
+/*
+ * Copyright (c) 2003, 2010, Oracle and/or its affiliates. All rights reserved.
+ * Copyright (c) 2016, Joyent, Inc. All rights reserved.
+ * Copyright (c) 2012, 2014 by Delphix. All rights reserved.
+ */
+
+/*
+ * DTrace - Dynamic Tracing for Solaris
+ *
+ * This is the implementation of the Solaris Dynamic Tracing framework
+ * (DTrace). The user-visible interface to DTrace is described at length in
+ * the "Solaris Dynamic Tracing Guide". The interfaces between the libdtrace
+ * library, the in-kernel DTrace framework, and the DTrace providers are
+ * described in the block comments in the <sys/dtrace.h> header file. The
+ * internal architecture of DTrace is described in the block comments in the
+ * <sys/dtrace_impl.h> header file. The comments contained within the DTrace
+ * implementation very much assume mastery of all of these sources; if one has
+ * an unanswered question about the implementation, one should consult them
+ * first.
+ *
+ * The functions here are ordered roughly as follows:
+ *
+ * - Probe context functions
+ * - Probe hashing functions
+ * - Non-probe context utility functions
+ * - Matching functions
+ * - Provider-to-Framework API functions
+ * - Probe management functions
+ * - DIF object functions
+ * - Format functions
+ * - Predicate functions
+ * - ECB functions
+ * - Buffer functions
+ * - Enabling functions
+ * - DOF functions
+ * - Anonymous enabling functions
+ * - Consumer state functions
+ * - Helper functions
+ * - Hook functions
+ * - Driver cookbook functions
+ *
+ * Each group of functions begins with a block comment labelled the "DTrace
+ * [Group] Functions", allowing one to find each block by searching forward
+ * on capital-f functions.
+ */
+#include <sys/errno.h>
+#ifndef illumos
+#include <sys/time.h>
+#endif
+#include <sys/stat.h>
+#include <sys/modctl.h>
+#include <sys/conf.h>
+#include <sys/systm.h>
+#ifdef illumos
+#include <sys/ddi.h>
+#include <sys/sunddi.h>
+#endif
+#include <sys/cpuvar.h>
+#include <sys/kmem.h>
+#ifdef illumos
+#include <sys/strsubr.h>
+#endif
+#include <sys/sysmacros.h>
+#include <sys/dtrace_impl.h>
+#include <sys/atomic.h>
+#include <sys/cmn_err.h>
+#ifdef illumos
+#include <sys/mutex_impl.h>
+#include <sys/rwlock_impl.h>
+#endif
+#include <sys/ctf_api.h>
+#ifdef illumos
+#include <sys/panic.h>
+#include <sys/priv_impl.h>
+#endif
+#include <sys/policy.h>
+#ifdef illumos
+#include <sys/cred_impl.h>
+#include <sys/procfs_isa.h>
+#endif
+#include <sys/taskq.h>
+#ifdef illumos
+#include <sys/mkdev.h>
+#include <sys/kdi.h>
+#endif
+#include <sys/zone.h>
+#include <sys/socket.h>
+#include <netinet/in.h>
+#include "strtolctype.h"
+
+/* FreeBSD includes: */
+#ifndef illumos
+#include <sys/callout.h>
+#include <sys/ctype.h>
+#include <sys/eventhandler.h>
+#include <sys/limits.h>
+#include <sys/linker.h>
+#include <sys/kdb.h>
+#include <sys/kernel.h>
+#include <sys/malloc.h>
+#include <sys/lock.h>
+#include <sys/mutex.h>
+#include <sys/ptrace.h>
+#include <sys/random.h>
+#include <sys/rwlock.h>
+#include <sys/sx.h>
+#include <sys/sysctl.h>
+
+#include <sys/dtrace_bsd.h>
+
+#include "dtrace_cddl.h"
+#include "dtrace_debug.c"
+#endif
+
+#include "dtrace_xoroshiro128_plus.h"
+
+/*
+ * DTrace Tunable Variables
+ *
+ * The following variables may be tuned by adding a line to /etc/system that
+ * includes both the name of the DTrace module ("dtrace") and the name of the
+ * variable. For example:
+ *
+ * set dtrace:dtrace_destructive_disallow = 1
+ *
+ * In general, the only variables that one should be tuning this way are those
+ * that affect system-wide DTrace behavior, and for which the default behavior
+ * is undesirable. Most of these variables are tunable on a per-consumer
+ * basis using DTrace options, and need not be tuned on a system-wide basis.
+ * When tuning these variables, avoid pathological values; while some attempt
+ * is made to verify the integrity of these variables, they are not considered
+ * part of the supported interface to DTrace, and they are therefore not
+ * checked comprehensively. Further, these variables should not be tuned
+ * dynamically via "mdb -kw" or other means; they should only be tuned via
+ * /etc/system.
+ */
+int dtrace_destructive_disallow = 0;
+#ifndef illumos
+/* Positive logic version of dtrace_destructive_disallow for loader tunable */
+int dtrace_allow_destructive = 1;
+#endif
+dtrace_optval_t dtrace_nonroot_maxsize = (16 * 1024 * 1024);
+size_t dtrace_difo_maxsize = (256 * 1024);
+dtrace_optval_t dtrace_dof_maxsize = (8 * 1024 * 1024);
+size_t dtrace_statvar_maxsize = (16 * 1024);
+size_t dtrace_actions_max = (16 * 1024);
+size_t dtrace_retain_max = 1024;
+dtrace_optval_t dtrace_helper_actions_max = 128;
+dtrace_optval_t dtrace_helper_providers_max = 32;
+dtrace_optval_t dtrace_dstate_defsize = (1 * 1024 * 1024);
+size_t dtrace_strsize_default = 256;
+dtrace_optval_t dtrace_cleanrate_default = 9900990; /* 101 hz */
+dtrace_optval_t dtrace_cleanrate_min = 200000; /* 5000 hz */
+dtrace_optval_t dtrace_cleanrate_max = (uint64_t)60 * NANOSEC; /* 1/minute */
+dtrace_optval_t dtrace_aggrate_default = NANOSEC; /* 1 hz */
+dtrace_optval_t dtrace_statusrate_default = NANOSEC; /* 1 hz */
+dtrace_optval_t dtrace_statusrate_max = (hrtime_t)10 * NANOSEC; /* 6/minute */
+dtrace_optval_t dtrace_switchrate_default = NANOSEC; /* 1 hz */
+dtrace_optval_t dtrace_nspec_default = 1;
+dtrace_optval_t dtrace_specsize_default = 32 * 1024;
+dtrace_optval_t dtrace_stackframes_default = 20;
+dtrace_optval_t dtrace_ustackframes_default = 20;
+dtrace_optval_t dtrace_jstackframes_default = 50;
+dtrace_optval_t dtrace_jstackstrsize_default = 512;
+int dtrace_msgdsize_max = 128;
+hrtime_t dtrace_chill_max = MSEC2NSEC(500); /* 500 ms */
+hrtime_t dtrace_chill_interval = NANOSEC; /* 1000 ms */
+int dtrace_devdepth_max = 32;
+int dtrace_err_verbose;
+hrtime_t dtrace_deadman_interval = NANOSEC;
+hrtime_t dtrace_deadman_timeout = (hrtime_t)10 * NANOSEC;
+hrtime_t dtrace_deadman_user = (hrtime_t)30 * NANOSEC;
+hrtime_t dtrace_unregister_defunct_reap = (hrtime_t)60 * NANOSEC;
+#ifndef illumos
+int dtrace_memstr_max = 4096;
+#endif
+
+/*
+ * DTrace External Variables
+ *
+ * As dtrace(7D) is a kernel module, any DTrace variables are obviously
+ * available to DTrace consumers via the backtick (`) syntax. One of these,
+ * dtrace_zero, is made deliberately so: it is provided as a source of
+ * well-known, zero-filled memory. While this variable is not documented,
+ * it is used by some translators as an implementation detail.
+ */
+const char dtrace_zero[256] = { 0 }; /* zero-filled memory */
+
+/*
+ * DTrace Internal Variables
+ */
+#ifdef illumos
+static dev_info_t *dtrace_devi; /* device info */
+#endif
+#ifdef illumos
+static vmem_t *dtrace_arena; /* probe ID arena */
+static vmem_t *dtrace_minor; /* minor number arena */
+#else
+static taskq_t *dtrace_taskq; /* task queue */
+static struct unrhdr *dtrace_arena; /* Probe ID number. */
+#endif
+static dtrace_probe_t **dtrace_probes; /* array of all probes */
+static int dtrace_nprobes; /* number of probes */
+static dtrace_provider_t *dtrace_provider; /* provider list */
+static dtrace_meta_t *dtrace_meta_pid; /* user-land meta provider */
+static int dtrace_opens; /* number of opens */
+static int dtrace_helpers; /* number of helpers */
+static int dtrace_getf; /* number of unpriv getf()s */
+#ifdef illumos
+static void *dtrace_softstate; /* softstate pointer */
+#endif
+static dtrace_hash_t *dtrace_bymod; /* probes hashed by module */
+static dtrace_hash_t *dtrace_byfunc; /* probes hashed by function */
+static dtrace_hash_t *dtrace_byname; /* probes hashed by name */
+static dtrace_toxrange_t *dtrace_toxrange; /* toxic range array */
+static int dtrace_toxranges; /* number of toxic ranges */
+static int dtrace_toxranges_max; /* size of toxic range array */
+static dtrace_anon_t dtrace_anon; /* anonymous enabling */
+static kmem_cache_t *dtrace_state_cache; /* cache for dynamic state */
+static uint64_t dtrace_vtime_references; /* number of vtimestamp refs */
+static kthread_t *dtrace_panicked; /* panicking thread */
+static dtrace_ecb_t *dtrace_ecb_create_cache; /* cached created ECB */
+static dtrace_genid_t dtrace_probegen; /* current probe generation */
+static dtrace_helpers_t *dtrace_deferred_pid; /* deferred helper list */
+static dtrace_enabling_t *dtrace_retained; /* list of retained enablings */
+static dtrace_genid_t dtrace_retained_gen; /* current retained enab gen */
+static dtrace_dynvar_t dtrace_dynhash_sink; /* end of dynamic hash chains */
+static int dtrace_dynvar_failclean; /* dynvars failed to clean */
+#ifndef illumos
+static struct mtx dtrace_unr_mtx;
+MTX_SYSINIT(dtrace_unr_mtx, &dtrace_unr_mtx, "Unique resource identifier", MTX_DEF);
+static eventhandler_tag dtrace_kld_load_tag;
+static eventhandler_tag dtrace_kld_unload_try_tag;
+#endif
+
+/*
+ * DTrace Locking
+ * DTrace is protected by three (relatively coarse-grained) locks:
+ *
+ * (1) dtrace_lock is required to manipulate essentially any DTrace state,
+ * including enabling state, probes, ECBs, consumer state, helper state,
+ * etc. Importantly, dtrace_lock is _not_ required when in probe context;
+ * probe context is lock-free -- synchronization is handled via the
+ * dtrace_sync() cross call mechanism.
+ *
+ * (2) dtrace_provider_lock is required when manipulating provider state, or
+ * when provider state must be held constant.
+ *
+ * (3) dtrace_meta_lock is required when manipulating meta provider state, or
+ * when meta provider state must be held constant.
+ *
+ * The lock ordering between these three locks is dtrace_meta_lock before
+ * dtrace_provider_lock before dtrace_lock. (In particular, there are
+ * several places where dtrace_provider_lock is held by the framework as it
+ * calls into the providers -- which then call back into the framework,
+ * grabbing dtrace_lock.)
+ *
+ * There are two other locks in the mix: mod_lock and cpu_lock. With respect
+ * to dtrace_provider_lock and dtrace_lock, cpu_lock continues its historical
+ * role as a coarse-grained lock; it is acquired before both of these locks.
+ * With respect to dtrace_meta_lock, its behavior is stranger: cpu_lock must
+ * be acquired _between_ dtrace_meta_lock and any other DTrace locks.
+ * mod_lock is similar with respect to dtrace_provider_lock in that it must be
+ * acquired _between_ dtrace_provider_lock and dtrace_lock.
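+ *
+ * Thus a (hypothetical) consumer needing all three DTrace locks would
+ * acquire them in the order:
+ *
+ *	mutex_enter(&dtrace_meta_lock);
+ *	mutex_enter(&dtrace_provider_lock);
+ *	mutex_enter(&dtrace_lock);
+ *
+ * and release them in the opposite order.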
+ */
+static kmutex_t dtrace_lock; /* probe state lock */
+static kmutex_t dtrace_provider_lock; /* provider state lock */
+static kmutex_t dtrace_meta_lock; /* meta-provider state lock */
+
+#ifndef illumos
+/* XXX FreeBSD hacks. */
+#define cr_suid cr_svuid
+#define cr_sgid cr_svgid
+#define ipaddr_t in_addr_t
+#define mod_modname pathname
+#define vuprintf vprintf
+#define ttoproc(_a) ((_a)->td_proc)
+#define crgetzoneid(_a) 0
+#define SNOCD 0
+#define CPU_ON_INTR(_a) 0
+
+#define PRIV_EFFECTIVE (1 << 0)
+#define PRIV_DTRACE_KERNEL (1 << 1)
+#define PRIV_DTRACE_PROC (1 << 2)
+#define PRIV_DTRACE_USER (1 << 3)
+#define PRIV_PROC_OWNER (1 << 4)
+#define PRIV_PROC_ZONE (1 << 5)
+#define PRIV_ALL ~0
+
+SYSCTL_DECL(_debug_dtrace);
+SYSCTL_DECL(_kern_dtrace);
+#endif
+
+#ifdef illumos
+#define curcpu CPU->cpu_id
+#endif
+
+
+/*
+ * DTrace Provider Variables
+ *
+ * These are the variables relating to DTrace as a provider (that is, the
+ * provider of the BEGIN, END, and ERROR probes).
+ */
+static dtrace_pattr_t dtrace_provider_attr = {
+{ DTRACE_STABILITY_STABLE, DTRACE_STABILITY_STABLE, DTRACE_CLASS_COMMON },
+{ DTRACE_STABILITY_PRIVATE, DTRACE_STABILITY_PRIVATE, DTRACE_CLASS_UNKNOWN },
+{ DTRACE_STABILITY_PRIVATE, DTRACE_STABILITY_PRIVATE, DTRACE_CLASS_UNKNOWN },
+{ DTRACE_STABILITY_STABLE, DTRACE_STABILITY_STABLE, DTRACE_CLASS_COMMON },
+{ DTRACE_STABILITY_STABLE, DTRACE_STABILITY_STABLE, DTRACE_CLASS_COMMON },
+};
+
+static void
+dtrace_nullop(void)
+{}
+
+static dtrace_pops_t dtrace_provider_ops = {
+ .dtps_provide = (void (*)(void *, dtrace_probedesc_t *))dtrace_nullop,
+ .dtps_provide_module = (void (*)(void *, modctl_t *))dtrace_nullop,
+ .dtps_enable = (void (*)(void *, dtrace_id_t, void *))dtrace_nullop,
+ .dtps_disable = (void (*)(void *, dtrace_id_t, void *))dtrace_nullop,
+ .dtps_suspend = (void (*)(void *, dtrace_id_t, void *))dtrace_nullop,
+ .dtps_resume = (void (*)(void *, dtrace_id_t, void *))dtrace_nullop,
+ .dtps_getargdesc = NULL,
+ .dtps_getargval = NULL,
+ .dtps_usermode = NULL,
+ .dtps_destroy = (void (*)(void *, dtrace_id_t, void *))dtrace_nullop,
+};
+
+static dtrace_id_t dtrace_probeid_begin; /* special BEGIN probe */
+static dtrace_id_t dtrace_probeid_end; /* special END probe */
+dtrace_id_t dtrace_probeid_error; /* special ERROR probe */
+
+/*
+ * DTrace Helper Tracing Variables
+ *
+ * These variables should be set dynamically to enable helper tracing. The
+ * only variables that should be set are dtrace_helptrace_enable (which should
+ * be set to a non-zero value to allocate helper tracing buffers on the next
+ * open of /dev/dtrace) and dtrace_helptrace_disable (which should be set to a
+ * non-zero value to deallocate helper tracing buffers on the next close of
+ * /dev/dtrace). When (and only when) helper tracing is disabled, the
+ * buffer size may also be set via dtrace_helptrace_bufsize.
+ */
+int dtrace_helptrace_enable = 0;
+int dtrace_helptrace_disable = 0;
+int dtrace_helptrace_bufsize = 16 * 1024 * 1024;
+uint32_t dtrace_helptrace_nlocals;
+static dtrace_helptrace_t *dtrace_helptrace_buffer;
+static uint32_t dtrace_helptrace_next = 0;
+static int dtrace_helptrace_wrapped = 0;
+
+/*
+ * DTrace Error Hashing
+ *
+ * On DEBUG kernels, DTrace will track the errors that it has seen in a hash
+ * table. This is very useful for checking coverage of tests that are
+ * expected to induce DIF or DOF processing errors, and may be useful for
+ * debugging problems in the DIF code generator or in DOF generation. The
+ * error hash may be examined with the ::dtrace_errhash MDB dcmd.
+ */
+#ifdef DEBUG
+static dtrace_errhash_t dtrace_errhash[DTRACE_ERRHASHSZ];
+static const char *dtrace_errlast;
+static kthread_t *dtrace_errthread;
+static kmutex_t dtrace_errlock;
+#endif
+
+/*
+ * DTrace Macros and Constants
+ *
+ * These are various macros that are useful in various spots in the
+ * implementation, along with a few random constants that have no meaning
+ * outside of the implementation. There is no real structure to this cpp
+ * mishmash -- but is there ever?
+ */
+#define DTRACE_HASHSTR(hash, probe) \
+ dtrace_hash_str(*((char **)((uintptr_t)(probe) + (hash)->dth_stroffs)))
+
+#define DTRACE_HASHNEXT(hash, probe) \
+ (dtrace_probe_t **)((uintptr_t)(probe) + (hash)->dth_nextoffs)
+
+#define DTRACE_HASHPREV(hash, probe) \
+ (dtrace_probe_t **)((uintptr_t)(probe) + (hash)->dth_prevoffs)
+
+#define DTRACE_HASHEQ(hash, lhs, rhs) \
+ (strcmp(*((char **)((uintptr_t)(lhs) + (hash)->dth_stroffs)), \
+ *((char **)((uintptr_t)(rhs) + (hash)->dth_stroffs))) == 0)
+
+#define DTRACE_AGGHASHSIZE_SLEW 17
+
+#define DTRACE_V4MAPPED_OFFSET (sizeof (uint32_t) * 3)
+
+/*
+ * The key for a thread-local variable consists of the lower 61 bits of the
+ * t_did, plus the 3 bits of the highest active interrupt above LOCK_LEVEL.
+ * We add DIF_VARIABLE_MAX to t_did to assure that the thread key is never
+ * equal to a variable identifier. This is necessary (but not sufficient) to
+ * assure that global associative arrays never collide with thread-local
+ * variables. To guarantee that they cannot collide, we must also define the
+ * order for keying dynamic variables. That order is:
+ *
+ * [ key0 ] ... [ keyn ] [ variable-key ] [ tls-key ]
+ *
+ * Because the variable-key and the tls-key are in orthogonal spaces, there is
+ * no way for a global variable key signature to match a thread-local key
+ * signature.
+ */
+#ifdef illumos
+#define DTRACE_TLS_THRKEY(where) { \
+ uint_t intr = 0; \
+ uint_t actv = CPU->cpu_intr_actv >> (LOCK_LEVEL + 1); \
+ for (; actv; actv >>= 1) \
+ intr++; \
+ ASSERT(intr < (1 << 3)); \
+ (where) = ((curthread->t_did + DIF_VARIABLE_MAX) & \
+ (((uint64_t)1 << 61) - 1)) | ((uint64_t)intr << 61); \
+}
+#else
+#define DTRACE_TLS_THRKEY(where) { \
+ solaris_cpu_t *_c = &solaris_cpu[curcpu]; \
+ uint_t intr = 0; \
+ uint_t actv = _c->cpu_intr_actv; \
+ for (; actv; actv >>= 1) \
+ intr++; \
+ ASSERT(intr < (1 << 3)); \
+ (where) = ((curthread->td_tid + DIF_VARIABLE_MAX) & \
+ (((uint64_t)1 << 61) - 1)) | ((uint64_t)intr << 61); \
+}
+#endif
+
+#define DT_BSWAP_8(x) ((x) & 0xff)
+#define DT_BSWAP_16(x) ((DT_BSWAP_8(x) << 8) | DT_BSWAP_8((x) >> 8))
+#define DT_BSWAP_32(x) ((DT_BSWAP_16(x) << 16) | DT_BSWAP_16((x) >> 16))
+#define DT_BSWAP_64(x) ((DT_BSWAP_32(x) << 32) | DT_BSWAP_32((x) >> 32))
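+
+/*
+ * For example, DT_BSWAP_16(0x1234) evaluates to 0x3412 and
+ * DT_BSWAP_32(0x11223344) to 0x44332211.
+ */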
+
+#define DT_MASK_LO 0x00000000FFFFFFFFULL
+
+#define DTRACE_STORE(type, tomax, offset, what) \
+ *((type *)((uintptr_t)(tomax) + (uintptr_t)offset)) = (type)(what);
+
+#ifndef __x86
+#define DTRACE_ALIGNCHECK(addr, size, flags) \
+ if (addr & (size - 1)) { \
+ *flags |= CPU_DTRACE_BADALIGN; \
+ cpu_core[curcpu].cpuc_dtrace_illval = addr; \
+ return (0); \
+ }
+#else
+#define DTRACE_ALIGNCHECK(addr, size, flags)
+#endif
+
+/*
+ * Test whether a range of memory starting at testaddr of size testsz falls
+ * within the range of memory described by addr, sz. We take care to avoid
+ * problems with overflow and underflow of the unsigned quantities, and
+ * disallow all negative sizes. Ranges of size 0 are allowed.
+ */
+#define DTRACE_INRANGE(testaddr, testsz, baseaddr, basesz) \
+ ((testaddr) - (uintptr_t)(baseaddr) < (basesz) && \
+ (testaddr) + (testsz) - (uintptr_t)(baseaddr) <= (basesz) && \
+ (testaddr) + (testsz) >= (testaddr))
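+
+/*
+ * For instance (illustrative): with testaddr inside the base range and
+ * testsz == (size_t)-1, testaddr + testsz wraps to testaddr - 1, so the
+ * first two comparisons can both succeed; the final clause rejects the
+ * wrapped (effectively negative) size.
+ */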
+
+#define DTRACE_RANGE_REMAIN(remp, addr, baseaddr, basesz) \
+do { \
+ if ((remp) != NULL) { \
+ *(remp) = (uintptr_t)(baseaddr) + (basesz) - (addr); \
+ } \
+_NOTE(CONSTCOND) } while (0)
+
+
+/*
+ * Test whether alloc_sz bytes will fit in the scratch region. We isolate
+ * alloc_sz on the righthand side of the comparison in order to avoid overflow
+ * or underflow in the comparison with it. This is simpler than the INRANGE
+ * check above, because we know that the dtms_scratch_ptr is valid in the
+ * range. Allocations of size zero are allowed.
+ */
+#define DTRACE_INSCRATCH(mstate, alloc_sz) \
+ ((mstate)->dtms_scratch_base + (mstate)->dtms_scratch_size - \
+ (mstate)->dtms_scratch_ptr >= (alloc_sz))
+
+#define DTRACE_LOADFUNC(bits) \
+/*CSTYLED*/ \
+uint##bits##_t \
+dtrace_load##bits(uintptr_t addr) \
+{ \
+ size_t size = bits / NBBY; \
+ /*CSTYLED*/ \
+ uint##bits##_t rval; \
+ int i; \
+ volatile uint16_t *flags = (volatile uint16_t *) \
+ &cpu_core[curcpu].cpuc_dtrace_flags; \
+ \
+ DTRACE_ALIGNCHECK(addr, size, flags); \
+ \
+ for (i = 0; i < dtrace_toxranges; i++) { \
+ if (addr >= dtrace_toxrange[i].dtt_limit) \
+ continue; \
+ \
+ if (addr + size <= dtrace_toxrange[i].dtt_base) \
+ continue; \
+ \
+ /* \
+ * This address falls within a toxic region; return 0. \
+ */ \
+ *flags |= CPU_DTRACE_BADADDR; \
+ cpu_core[curcpu].cpuc_dtrace_illval = addr; \
+ return (0); \
+ } \
+ \
+ *flags |= CPU_DTRACE_NOFAULT; \
+ /*CSTYLED*/ \
+ rval = *((volatile uint##bits##_t *)addr); \
+ *flags &= ~CPU_DTRACE_NOFAULT; \
+ \
+ return (!(*flags & CPU_DTRACE_FAULT) ? rval : 0); \
+}
+
+#ifdef _LP64
+#define dtrace_loadptr dtrace_load64
+#else
+#define dtrace_loadptr dtrace_load32
+#endif
+
+#define DTRACE_DYNHASH_FREE 0
+#define DTRACE_DYNHASH_SINK 1
+#define DTRACE_DYNHASH_VALID 2
+
+#define DTRACE_MATCH_NEXT 0
+#define DTRACE_MATCH_DONE 1
+#define DTRACE_ANCHORED(probe) ((probe)->dtpr_func[0] != '\0')
+#define DTRACE_STATE_ALIGN 64
+
+#define DTRACE_FLAGS2FLT(flags) \
+ (((flags) & CPU_DTRACE_BADADDR) ? DTRACEFLT_BADADDR : \
+ ((flags) & CPU_DTRACE_ILLOP) ? DTRACEFLT_ILLOP : \
+ ((flags) & CPU_DTRACE_DIVZERO) ? DTRACEFLT_DIVZERO : \
+ ((flags) & CPU_DTRACE_KPRIV) ? DTRACEFLT_KPRIV : \
+ ((flags) & CPU_DTRACE_UPRIV) ? DTRACEFLT_UPRIV : \
+ ((flags) & CPU_DTRACE_TUPOFLOW) ? DTRACEFLT_TUPOFLOW : \
+ ((flags) & CPU_DTRACE_BADALIGN) ? DTRACEFLT_BADALIGN : \
+ ((flags) & CPU_DTRACE_NOSCRATCH) ? DTRACEFLT_NOSCRATCH : \
+ ((flags) & CPU_DTRACE_BADSTACK) ? DTRACEFLT_BADSTACK : \
+ DTRACEFLT_UNKNOWN)
+
+#define DTRACEACT_ISSTRING(act) \
+ ((act)->dta_kind == DTRACEACT_DIFEXPR && \
+ (act)->dta_difo->dtdo_rtype.dtdt_kind == DIF_TYPE_STRING)
+
+/* Function prototype definitions: */
+static size_t dtrace_strlen(const char *, size_t);
+static dtrace_probe_t *dtrace_probe_lookup_id(dtrace_id_t id);
+static void dtrace_enabling_provide(dtrace_provider_t *);
+static int dtrace_enabling_match(dtrace_enabling_t *, int *);
+static void dtrace_enabling_matchall(void);
+static void dtrace_enabling_reap(void);
+static dtrace_state_t *dtrace_anon_grab(void);
+static uint64_t dtrace_helper(int, dtrace_mstate_t *,
+ dtrace_state_t *, uint64_t, uint64_t);
+static dtrace_helpers_t *dtrace_helpers_create(proc_t *);
+static void dtrace_buffer_drop(dtrace_buffer_t *);
+static int dtrace_buffer_consumed(dtrace_buffer_t *, hrtime_t when);
+static intptr_t dtrace_buffer_reserve(dtrace_buffer_t *, size_t, size_t,
+ dtrace_state_t *, dtrace_mstate_t *);
+static int dtrace_state_option(dtrace_state_t *, dtrace_optid_t,
+ dtrace_optval_t);
+static int dtrace_ecb_create_enable(dtrace_probe_t *, void *);
+static void dtrace_helper_provider_destroy(dtrace_helper_provider_t *);
+uint16_t dtrace_load16(uintptr_t);
+uint32_t dtrace_load32(uintptr_t);
+uint64_t dtrace_load64(uintptr_t);
+uint8_t dtrace_load8(uintptr_t);
+void dtrace_dynvar_clean(dtrace_dstate_t *);
+dtrace_dynvar_t *dtrace_dynvar(dtrace_dstate_t *, uint_t, dtrace_key_t *,
+ size_t, dtrace_dynvar_op_t, dtrace_mstate_t *, dtrace_vstate_t *);
+uintptr_t dtrace_dif_varstr(uintptr_t, dtrace_state_t *, dtrace_mstate_t *);
+static int dtrace_priv_proc(dtrace_state_t *);
+static void dtrace_getf_barrier(void);
+static int dtrace_canload_remains(uint64_t, size_t, size_t *,
+ dtrace_mstate_t *, dtrace_vstate_t *);
+static int dtrace_canstore_remains(uint64_t, size_t, size_t *,
+ dtrace_mstate_t *, dtrace_vstate_t *);
+
+/*
+ * DTrace Probe Context Functions
+ *
+ * These functions are called from probe context. Because probe context is
+ * any context in which C may be called, arbitrarily locks may be held,
+ * interrupts may be disabled, we may be in arbitrary dispatched state, etc.
+ * As a result, functions called from probe context may only call other DTrace
+ * support functions -- they may not interact at all with the system at large.
+ * (Note that the ASSERT macro is made probe-context safe by redefining it in
+ * terms of dtrace_assfail(), a probe-context safe function.) If arbitrary
+ * loads are to be performed from probe context, they _must_ be in terms of
+ * the safe dtrace_load*() variants.
+ *
+ * Some functions in this block are not actually called from probe context;
+ * for these functions, there will be a comment above the function reading
+ * "Note: not called from probe context."
+ */
+void
+dtrace_panic(const char *format, ...)
+{
+ va_list alist;
+
+ va_start(alist, format);
+#ifdef __FreeBSD__
+ vpanic(format, alist);
+#else
+ dtrace_vpanic(format, alist);
+#endif
+ va_end(alist);
+}
+
+int
+dtrace_assfail(const char *a, const char *f, int l)
+{
+ dtrace_panic("assertion failed: %s, file: %s, line: %d", a, f, l);
+
+ /*
+ * We just need something here that even the most clever compiler
+ * cannot optimize away.
+ */
+ return (a[(uintptr_t)f]);
+}
+
+/*
+ * Atomically increment a specified error counter from probe context.
+ */
+static void
+dtrace_error(uint32_t *counter)
+{
+ /*
+ * Most counters stored to in probe context are per-CPU counters.
+ * However, there are some error conditions that are sufficiently
+ * arcane that they don't merit per-CPU storage. If these counters
+ * are incremented concurrently on different CPUs, scalability will be
+ * adversely affected -- but we don't expect them to be white-hot in a
+ * correctly constructed enabling...
+ */
+ uint32_t oval, nval;
+
+ do {
+ oval = *counter;
+
+ if ((nval = oval + 1) == 0) {
+ /*
+ * If the counter would wrap, set it to 1 -- assuring
+ * that the counter is never zero when we have seen
+ * errors. (The counter must be 32-bits because we
+ * aren't guaranteed a 64-bit compare&swap operation.)
+ * To save this code both the infamy of being fingered
+ * by a priggish news story and the indignity of being
+ * the target of a neo-puritan witch trial, we're
+ * carefully avoiding any colorful description of the
+ * likelihood of this condition -- but suffice it to
+ * say that it is only slightly more likely than the
+ * overflow of predicate cache IDs, as discussed in
+ * dtrace_predicate_create().
+ */
+ nval = 1;
+ }
+ } while (dtrace_cas32(counter, oval, nval) != oval);
+}
+
+/*
+ * Use the DTRACE_LOADFUNC macro to define functions for each of loading a
+ * uint8_t, a uint16_t, a uint32_t and a uint64_t.
+ */
+/* BEGIN CSTYLED */
+DTRACE_LOADFUNC(8)
+DTRACE_LOADFUNC(16)
+DTRACE_LOADFUNC(32)
+DTRACE_LOADFUNC(64)
+/* END CSTYLED */
+
+static int
+dtrace_inscratch(uintptr_t dest, size_t size, dtrace_mstate_t *mstate)
+{
+ if (dest < mstate->dtms_scratch_base)
+ return (0);
+
+ if (dest + size < dest)
+ return (0);
+
+ if (dest + size > mstate->dtms_scratch_ptr)
+ return (0);
+
+ return (1);
+}
+
+static int
+dtrace_canstore_statvar(uint64_t addr, size_t sz, size_t *remain,
+ dtrace_statvar_t **svars, int nsvars)
+{
+ int i;
+ size_t maxglobalsize, maxlocalsize;
+
+ if (nsvars == 0)
+ return (0);
+
+ maxglobalsize = dtrace_statvar_maxsize + sizeof (uint64_t);
+ maxlocalsize = maxglobalsize * NCPU;
+
+ for (i = 0; i < nsvars; i++) {
+ dtrace_statvar_t *svar = svars[i];
+ uint8_t scope;
+ size_t size;
+
+ if (svar == NULL || (size = svar->dtsv_size) == 0)
+ continue;
+
+ scope = svar->dtsv_var.dtdv_scope;
+
+ /*
+ * We verify that our size is valid in the spirit of providing
+ * defense in depth: we want to prevent attackers from using
+ * DTrace to escalate an orthogonal kernel heap corruption bug
+ * into the ability to store to arbitrary locations in memory.
+ */
+ VERIFY((scope == DIFV_SCOPE_GLOBAL && size <= maxglobalsize) ||
+ (scope == DIFV_SCOPE_LOCAL && size <= maxlocalsize));
+
+ if (DTRACE_INRANGE(addr, sz, svar->dtsv_data,
+ svar->dtsv_size)) {
+ DTRACE_RANGE_REMAIN(remain, addr, svar->dtsv_data,
+ svar->dtsv_size);
+ return (1);
+ }
+ }
+
+ return (0);
+}
+
+/*
+ * Check to see if the address is within a memory region to which a store may
+ * be issued. This includes the DTrace scratch areas, and any DTrace variable
+ * region. The caller of dtrace_canstore() is responsible for performing any
+ * alignment checks that are needed before stores are actually executed.
+ */
+static int
+dtrace_canstore(uint64_t addr, size_t sz, dtrace_mstate_t *mstate,
+ dtrace_vstate_t *vstate)
+{
+ return (dtrace_canstore_remains(addr, sz, NULL, mstate, vstate));
+}
+
+/*
+ * Implementation of dtrace_canstore which communicates the upper bound of the
+ * allowed memory region.
+ */
+static int
+dtrace_canstore_remains(uint64_t addr, size_t sz, size_t *remain,
+ dtrace_mstate_t *mstate, dtrace_vstate_t *vstate)
+{
+ /*
+ * First, check to see if the address is in scratch space...
+ */
+ if (DTRACE_INRANGE(addr, sz, mstate->dtms_scratch_base,
+ mstate->dtms_scratch_size)) {
+ DTRACE_RANGE_REMAIN(remain, addr, mstate->dtms_scratch_base,
+ mstate->dtms_scratch_size);
+ return (1);
+ }
+
+ /*
+ * Now check to see if it's a dynamic variable. This check will pick
+ * up both thread-local variables and any global dynamically-allocated
+ * variables.
+ */
+ if (DTRACE_INRANGE(addr, sz, vstate->dtvs_dynvars.dtds_base,
+ vstate->dtvs_dynvars.dtds_size)) {
+ dtrace_dstate_t *dstate = &vstate->dtvs_dynvars;
+ uintptr_t base = (uintptr_t)dstate->dtds_base +
+ (dstate->dtds_hashsize * sizeof (dtrace_dynhash_t));
+ uintptr_t chunkoffs;
+ dtrace_dynvar_t *dvar;
+
+ /*
+ * Before we assume that we can store here, we need to make
+ * sure that it isn't in our metadata -- storing to our
+ * dynamic variable metadata would corrupt our state. For
+ * the range to not include any dynamic variable metadata,
+ * it must:
+ *
+ * (1) Start above the hash table that is at the base of
+ * the dynamic variable space
+ *
+ * (2) Have a starting chunk offset that is beyond the
+ * dtrace_dynvar_t that is at the base of every chunk
+ *
+ * (3) Not span a chunk boundary
+ *
+ * (4) Not be in the tuple space of a dynamic variable
+ *
+ */
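+		/*
+		 * Illustratively (a sketch, not to scale), each chunk is
+		 * laid out as:
+		 *
+		 *	+--------------------------+  <- chunk base
+		 *	| dtrace_dynvar_t header   |  stores denied, per (2)
+		 *	| tuple keys ...           |  stores denied, per (4)
+		 *	+--------------------------+
+		 *	| variable data            |  stores may land here
+		 *	+--------------------------+  <- chunk boundary, per (3)
+		 */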
+ if (addr < base)
+ return (0);
+
+ chunkoffs = (addr - base) % dstate->dtds_chunksize;
+
+ if (chunkoffs < sizeof (dtrace_dynvar_t))
+ return (0);
+
+ if (chunkoffs + sz > dstate->dtds_chunksize)
+ return (0);
+
+ dvar = (dtrace_dynvar_t *)((uintptr_t)addr - chunkoffs);
+
+ if (dvar->dtdv_hashval == DTRACE_DYNHASH_FREE)
+ return (0);
+
+ if (chunkoffs < sizeof (dtrace_dynvar_t) +
+ ((dvar->dtdv_tuple.dtt_nkeys - 1) * sizeof (dtrace_key_t)))
+ return (0);
+
+ DTRACE_RANGE_REMAIN(remain, addr, dvar, dstate->dtds_chunksize);
+ return (1);
+ }
+
+ /*
+ * Finally, check the static local and global variables. These checks
+ * take the longest, so we perform them last.
+ */
+ if (dtrace_canstore_statvar(addr, sz, remain,
+ vstate->dtvs_locals, vstate->dtvs_nlocals))
+ return (1);
+
+ if (dtrace_canstore_statvar(addr, sz, remain,
+ vstate->dtvs_globals, vstate->dtvs_nglobals))
+ return (1);
+
+ return (0);
+}
+
+
+/*
+ * Convenience routine to check to see if the address is within a memory
+ * region in which a load may be issued given the user's privilege level;
+ * if not, it sets the appropriate error flags and loads 'addr' into the
+ * illegal value slot.
+ *
+ * DTrace subroutines (DIF_SUBR_*) should use this helper to implement
+ * appropriate memory access protection.
+ */
+static int
+dtrace_canload(uint64_t addr, size_t sz, dtrace_mstate_t *mstate,
+ dtrace_vstate_t *vstate)
+{
+ return (dtrace_canload_remains(addr, sz, NULL, mstate, vstate));
+}
+
+/*
+ * Implementation of dtrace_canload which communicates the upper bound of the
+ * allowed memory region.
+ */
+static int
+dtrace_canload_remains(uint64_t addr, size_t sz, size_t *remain,
+ dtrace_mstate_t *mstate, dtrace_vstate_t *vstate)
+{
+ volatile uintptr_t *illval = &cpu_core[curcpu].cpuc_dtrace_illval;
+ file_t *fp;
+
+ /*
+ * If we hold the privilege to read from kernel memory, then
+ * everything is readable.
+ */
+ if ((mstate->dtms_access & DTRACE_ACCESS_KERNEL) != 0) {
+ DTRACE_RANGE_REMAIN(remain, addr, addr, sz);
+ return (1);
+ }
+
+ /*
+ * You can obviously read that which you can store.
+ */
+ if (dtrace_canstore_remains(addr, sz, remain, mstate, vstate))
+ return (1);
+
+ /*
+ * We're allowed to read from our own string table.
+ */
+ if (DTRACE_INRANGE(addr, sz, mstate->dtms_difo->dtdo_strtab,
+ mstate->dtms_difo->dtdo_strlen)) {
+ DTRACE_RANGE_REMAIN(remain, addr,
+ mstate->dtms_difo->dtdo_strtab,
+ mstate->dtms_difo->dtdo_strlen);
+ return (1);
+ }
+
+ if (vstate->dtvs_state != NULL &&
+ dtrace_priv_proc(vstate->dtvs_state)) {
+ proc_t *p;
+
+ /*
+ * When we have privileges to the current process, there are
+ * several context-related kernel structures that are safe to
+ * read, even absent the privilege to read from kernel memory.
+ * These reads are safe because these structures contain only
+ * state that (1) we're permitted to read, (2) is harmless or
+ * (3) contains pointers to additional kernel state that we're
+ * not permitted to read (and as such, do not present an
+ * opportunity for privilege escalation). Finally (and
+ * critically), because of the nature of their relation with
+ * the current thread context, the memory associated with these
+ * structures cannot change over the duration of probe context,
+ * and it is therefore impossible for this memory to be
+ * deallocated and reallocated as something else while it's
+ * being operated upon.
+ */
+ if (DTRACE_INRANGE(addr, sz, curthread, sizeof (kthread_t))) {
+ DTRACE_RANGE_REMAIN(remain, addr, curthread,
+ sizeof (kthread_t));
+ return (1);
+ }
+
+ if ((p = curthread->t_procp) != NULL && DTRACE_INRANGE(addr,
+ sz, curthread->t_procp, sizeof (proc_t))) {
+ DTRACE_RANGE_REMAIN(remain, addr, curthread->t_procp,
+ sizeof (proc_t));
+ return (1);
+ }
+
+ if (curthread->t_cred != NULL && DTRACE_INRANGE(addr, sz,
+ curthread->t_cred, sizeof (cred_t))) {
+ DTRACE_RANGE_REMAIN(remain, addr, curthread->t_cred,
+ sizeof (cred_t));
+ return (1);
+ }
+
+#ifdef illumos
+ if (p != NULL && p->p_pidp != NULL && DTRACE_INRANGE(addr, sz,
+ &(p->p_pidp->pid_id), sizeof (pid_t))) {
+ DTRACE_RANGE_REMAIN(remain, addr, &(p->p_pidp->pid_id),
+ sizeof (pid_t));
+ return (1);
+ }
+
+ if (curthread->t_cpu != NULL && DTRACE_INRANGE(addr, sz,
+ curthread->t_cpu, offsetof(cpu_t, cpu_pause_thread))) {
+ DTRACE_RANGE_REMAIN(remain, addr, curthread->t_cpu,
+ offsetof(cpu_t, cpu_pause_thread));
+ return (1);
+ }
+#endif
+ }
+
+ if ((fp = mstate->dtms_getf) != NULL) {
+ uintptr_t psz = sizeof (void *);
+ vnode_t *vp;
+ vnodeops_t *op;
+
+ /*
+ * When getf() returns a file_t, the enabling is implicitly
+ * granted the (transient) right to read the returned file_t
+ * as well as the v_path and v_op->vnop_name of the underlying
+ * vnode. These accesses are allowed after a successful
+ * getf() because the members that they refer to cannot change
+ * once set -- and the barrier logic in the kernel's closef()
+		 * path assures that the file_t and its referenced vnode_t
+		 * cannot themselves be stale (that is, it is impossible for
+ * either dtms_getf itself or its f_vnode member to reference
+ * freed memory).
+ */
+ if (DTRACE_INRANGE(addr, sz, fp, sizeof (file_t))) {
+ DTRACE_RANGE_REMAIN(remain, addr, fp, sizeof (file_t));
+ return (1);
+ }
+
+ if ((vp = fp->f_vnode) != NULL) {
+ size_t slen;
+#ifdef illumos
+ if (DTRACE_INRANGE(addr, sz, &vp->v_path, psz)) {
+ DTRACE_RANGE_REMAIN(remain, addr, &vp->v_path,
+ psz);
+ return (1);
+ }
+ slen = strlen(vp->v_path) + 1;
+ if (DTRACE_INRANGE(addr, sz, vp->v_path, slen)) {
+ DTRACE_RANGE_REMAIN(remain, addr, vp->v_path,
+ slen);
+ return (1);
+ }
+#endif
+
+ if (DTRACE_INRANGE(addr, sz, &vp->v_op, psz)) {
+ DTRACE_RANGE_REMAIN(remain, addr, &vp->v_op,
+ psz);
+ return (1);
+ }
+
+#ifdef illumos
+ if ((op = vp->v_op) != NULL &&
+ DTRACE_INRANGE(addr, sz, &op->vnop_name, psz)) {
+ DTRACE_RANGE_REMAIN(remain, addr,
+ &op->vnop_name, psz);
+ return (1);
+ }
+
+ if (op != NULL && op->vnop_name != NULL &&
+ DTRACE_INRANGE(addr, sz, op->vnop_name,
+ (slen = strlen(op->vnop_name) + 1))) {
+ DTRACE_RANGE_REMAIN(remain, addr,
+ op->vnop_name, slen);
+ return (1);
+ }
+#endif
+ }
+ }
+
+ DTRACE_CPUFLAG_SET(CPU_DTRACE_KPRIV);
+ *illval = addr;
+ return (0);
+}
+
+/*
+ * Convenience routine to check to see if a given string is within a memory
+ * region in which a load may be issued given the user's privilege level;
+ * this exists so that we don't need to issue unnecessary dtrace_strlen()
+ * calls in the event that the user has all privileges.
+ */
+static int
+dtrace_strcanload(uint64_t addr, size_t sz, size_t *remain,
+ dtrace_mstate_t *mstate, dtrace_vstate_t *vstate)
+{
+ size_t rsize;
+
+ /*
+ * If we hold the privilege to read from kernel memory, then
+ * everything is readable.
+ */
+ if ((mstate->dtms_access & DTRACE_ACCESS_KERNEL) != 0) {
+ DTRACE_RANGE_REMAIN(remain, addr, addr, sz);
+ return (1);
+ }
+
+ /*
+	 * Even if the caller is uninterested in querying the remaining valid
+	 * range, it must still be computed here, as it bounds the
+	 * dtrace_strlen() performed below.
+ */
+ if (remain == NULL) {
+ remain = &rsize;
+ }
+ if (dtrace_canload_remains(addr, 0, remain, mstate, vstate)) {
+ size_t strsz;
+ /*
+ * Perform the strlen after determining the length of the
+ * memory region which is accessible. This prevents timing
+ * information from being used to find NULs in memory which is
+ * not accessible to the caller.
+ */
+ strsz = 1 + dtrace_strlen((char *)(uintptr_t)addr,
+ MIN(sz, *remain));
+ if (strsz <= *remain) {
+ return (1);
+ }
+ }
+
+ return (0);
+}
+
+/*
+ * Convenience routine to check to see if a given variable is within a memory
+ * region in which a load may be issued given the user's privilege level.
+ */
+static int
+dtrace_vcanload(void *src, dtrace_diftype_t *type, size_t *remain,
+ dtrace_mstate_t *mstate, dtrace_vstate_t *vstate)
+{
+ size_t sz;
+ ASSERT(type->dtdt_flags & DIF_TF_BYREF);
+
+ /*
+ * Calculate the max size before performing any checks since even
+ * DTRACE_ACCESS_KERNEL-credentialed callers expect that this function
+ * return the max length via 'remain'.
+ */
+ if (type->dtdt_kind == DIF_TYPE_STRING) {
+ dtrace_state_t *state = vstate->dtvs_state;
+
+ if (state != NULL) {
+ sz = state->dts_options[DTRACEOPT_STRSIZE];
+ } else {
+ /*
+ * In helper context, we have a NULL state; fall back
+ * to using the system-wide default for the string size
+ * in this case.
+ */
+ sz = dtrace_strsize_default;
+ }
+ } else {
+ sz = type->dtdt_size;
+ }
+
+ /*
+ * If we hold the privilege to read from kernel memory, then
+ * everything is readable.
+ */
+ if ((mstate->dtms_access & DTRACE_ACCESS_KERNEL) != 0) {
+ DTRACE_RANGE_REMAIN(remain, (uintptr_t)src, src, sz);
+ return (1);
+ }
+
+ if (type->dtdt_kind == DIF_TYPE_STRING) {
+ return (dtrace_strcanload((uintptr_t)src, sz, remain, mstate,
+ vstate));
+ }
+ return (dtrace_canload_remains((uintptr_t)src, sz, remain, mstate,
+ vstate));
+}
+
+/*
+ * Convert a string to a signed integer using safe loads.
+ *
+ * NOTE: This function uses various macros from strtolctype.h to manipulate
+ * digit values, etc -- these have all been checked to ensure they make
+ * no additional function calls.
+ */
+static int64_t
+dtrace_strtoll(char *input, int base, size_t limit)
+{
+ uintptr_t pos = (uintptr_t)input;
+ int64_t val = 0;
+ int x;
+ boolean_t neg = B_FALSE;
+ char c, cc, ccc;
+ uintptr_t end = pos + limit;
+
+ /*
+ * Consume any whitespace preceding digits.
+ */
+ while ((c = dtrace_load8(pos)) == ' ' || c == '\t')
+ pos++;
+
+ /*
+ * Handle an explicit sign if one is present.
+ */
+ if (c == '-' || c == '+') {
+ if (c == '-')
+ neg = B_TRUE;
+ c = dtrace_load8(++pos);
+ }
+
+ /*
+ * Check for an explicit hexadecimal prefix ("0x" or "0X") and skip it
+ * if present.
+ */
+ if (base == 16 && c == '0' && ((cc = dtrace_load8(pos + 1)) == 'x' ||
+ cc == 'X') && isxdigit(ccc = dtrace_load8(pos + 2))) {
+ pos += 2;
+ c = ccc;
+ }
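+	/* For example, with base 16, the input "0x1f" parses to 31. */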
+
+ /*
+ * Read in contiguous digits until the first non-digit character.
+ */
+ for (; pos < end && c != '\0' && lisalnum(c) && (x = DIGIT(c)) < base;
+ c = dtrace_load8(++pos))
+ val = val * base + x;
+
+ return (neg ? -val : val);
+}
+
+/*
+ * Compare two strings using safe loads.
+ */
+static int
+dtrace_strncmp(char *s1, char *s2, size_t limit)
+{
+ uint8_t c1, c2;
+ volatile uint16_t *flags;
+
+ if (s1 == s2 || limit == 0)
+ return (0);
+
+ flags = (volatile uint16_t *)&cpu_core[curcpu].cpuc_dtrace_flags;
+
+ do {
+ if (s1 == NULL) {
+ c1 = '\0';
+ } else {
+ c1 = dtrace_load8((uintptr_t)s1++);
+ }
+
+ if (s2 == NULL) {
+ c2 = '\0';
+ } else {
+ c2 = dtrace_load8((uintptr_t)s2++);
+ }
+
+ if (c1 != c2)
+ return (c1 - c2);
+ } while (--limit && c1 != '\0' && !(*flags & CPU_DTRACE_FAULT));
+
+ return (0);
+}
+
+/*
+ * Compute strlen(s) for a string using safe memory accesses. The additional
+ * len parameter is used to specify a maximum length to ensure completion.
+ */
+static size_t
+dtrace_strlen(const char *s, size_t lim)
+{
+ uint_t len;
+
+ for (len = 0; len != lim; len++) {
+ if (dtrace_load8((uintptr_t)s++) == '\0')
+ break;
+ }
+
+ return (len);
+}
+
+/*
+ * Check if an address falls within a toxic region.
+ */
+static int
+dtrace_istoxic(uintptr_t kaddr, size_t size)
+{
+ uintptr_t taddr, tsize;
+ int i;
+
+ for (i = 0; i < dtrace_toxranges; i++) {
+ taddr = dtrace_toxrange[i].dtt_base;
+ tsize = dtrace_toxrange[i].dtt_limit - taddr;
+
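+		/*
+		 * Taken together, these two unsigned comparisons detect any
+		 * overlap between [kaddr, kaddr + size) and the toxic range:
+		 * the first fires when kaddr falls inside the range, the
+		 * second when the range begins inside the region being
+		 * checked.
+		 */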
+ if (kaddr - taddr < tsize) {
+ DTRACE_CPUFLAG_SET(CPU_DTRACE_BADADDR);
+ cpu_core[curcpu].cpuc_dtrace_illval = kaddr;
+ return (1);
+ }
+
+ if (taddr - kaddr < size) {
+ DTRACE_CPUFLAG_SET(CPU_DTRACE_BADADDR);
+ cpu_core[curcpu].cpuc_dtrace_illval = taddr;
+ return (1);
+ }
+ }
+
+ return (0);
+}
+
+/*
+ * Copy src to dst using safe memory accesses. The src is assumed to be unsafe
+ * memory specified by the DIF program. The dst is assumed to be safe memory
+ * that we can store to directly because it is managed by DTrace. As with
+ * standard bcopy, overlapping copies are handled properly.
+ */
+static void
+dtrace_bcopy(const void *src, void *dst, size_t len)
+{
+ if (len != 0) {
+ uint8_t *s1 = dst;
+ const uint8_t *s2 = src;
+
+ if (s1 <= s2) {
+ do {
+ *s1++ = dtrace_load8((uintptr_t)s2++);
+ } while (--len != 0);
+ } else {
+ s2 += len;
+ s1 += len;
+
+ do {
+ *--s1 = dtrace_load8((uintptr_t)--s2);
+ } while (--len != 0);
+ }
+ }
+}
+
+/*
+ * Copy src to dst using safe memory accesses, up to either the specified
+ * length, or the point that a nul byte is encountered. The src is assumed to
+ * be unsafe memory specified by the DIF program. The dst is assumed to be
+ * safe memory that we can store to directly because it is managed by DTrace.
+ * Unlike dtrace_bcopy(), overlapping regions are not handled.
+ */
+static void
+dtrace_strcpy(const void *src, void *dst, size_t len)
+{
+ if (len != 0) {
+ uint8_t *s1 = dst, c;
+ const uint8_t *s2 = src;
+
+ do {
+ *s1++ = c = dtrace_load8((uintptr_t)s2++);
+ } while (--len != 0 && c != '\0');
+ }
+}
+
+/*
+ * Copy src to dst, deriving the size and type from the specified (BYREF)
+ * variable type. The src is assumed to be unsafe memory specified by the DIF
+ * program. The dst is assumed to be DTrace variable memory that is of the
+ * specified type; we assume that we can store to directly.
+ */
+static void
+dtrace_vcopy(void *src, void *dst, dtrace_diftype_t *type, size_t limit)
+{
+ ASSERT(type->dtdt_flags & DIF_TF_BYREF);
+
+ if (type->dtdt_kind == DIF_TYPE_STRING) {
+ dtrace_strcpy(src, dst, MIN(type->dtdt_size, limit));
+ } else {
+ dtrace_bcopy(src, dst, MIN(type->dtdt_size, limit));
+ }
+}
+
+/*
+ * Compare s1 to s2 using safe memory accesses. The s1 data is assumed to be
+ * unsafe memory specified by the DIF program. The s2 data is assumed to be
+ * safe memory that we can access directly because it is managed by DTrace.
+ */
+static int
+dtrace_bcmp(const void *s1, const void *s2, size_t len)
+{
+ volatile uint16_t *flags;
+
+ flags = (volatile uint16_t *)&cpu_core[curcpu].cpuc_dtrace_flags;
+
+ if (s1 == s2)
+ return (0);
+
+ if (s1 == NULL || s2 == NULL)
+ return (1);
+
+ if (s1 != s2 && len != 0) {
+ const uint8_t *ps1 = s1;
+ const uint8_t *ps2 = s2;
+
+ do {
+ if (dtrace_load8((uintptr_t)ps1++) != *ps2++)
+ return (1);
+ } while (--len != 0 && !(*flags & CPU_DTRACE_FAULT));
+ }
+ return (0);
+}
+
+/*
+ * Zero the specified region using a simple byte-by-byte loop. Note that this
+ * is for safe DTrace-managed memory only.
+ */
+static void
+dtrace_bzero(void *dst, size_t len)
+{
+ uchar_t *cp;
+
+ for (cp = dst; len != 0; len--)
+ *cp++ = 0;
+}
+
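+/*
+ * Add the 128-bit values in addend1 and addend2 and store the result in sum.
+ * A carry out of the low 64 bits is detected by checking whether the low
+ * half of the sum wrapped around below either addend.
+ */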
+static void
+dtrace_add_128(uint64_t *addend1, uint64_t *addend2, uint64_t *sum)
+{
+ uint64_t result[2];
+
+ result[0] = addend1[0] + addend2[0];
+ result[1] = addend1[1] + addend2[1] +
+ (result[0] < addend1[0] || result[0] < addend2[0] ? 1 : 0);
+
+ sum[0] = result[0];
+ sum[1] = result[1];
+}
+
+/*
+ * Shift the 128-bit value in a by b. If b is positive, shift left.
+ * If b is negative, shift right.
+ */
+static void
+dtrace_shift_128(uint64_t *a, int b)
+{
+ uint64_t mask;
+
+ if (b == 0)
+ return;
+
+ if (b < 0) {
+ b = -b;
+ if (b >= 64) {
+ a[0] = a[1] >> (b - 64);
+ a[1] = 0;
+ } else {
+ a[0] >>= b;
+ mask = 1LL << (64 - b);
+ mask -= 1;
+ a[0] |= ((a[1] & mask) << (64 - b));
+ a[1] >>= b;
+ }
+ } else {
+ if (b >= 64) {
+ a[1] = a[0] << (b - 64);
+ a[0] = 0;
+ } else {
+ a[1] <<= b;
+ mask = a[0] >> (64 - b);
+ a[1] |= mask;
+ a[0] <<= b;
+ }
+ }
+}
+
+/*
+ * The basic idea is to break the 2 64-bit values into 4 32-bit values,
+ * use native multiplication on those, and then re-combine into the
+ * resulting 128-bit value.
+ *
+ * (hi1 << 32 + lo1) * (hi2 << 32 + lo2) =
+ * hi1 * hi2 << 64 +
+ * hi1 * lo2 << 32 +
+ * hi2 * lo1 << 32 +
+ * lo1 * lo2
+ */
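+/*
+ * For example (hypothetical operands), squaring 2^32 + 1 gives hi1 = hi2 =
+ * lo1 = lo2 = 1: the partial products leave product[1] = 1, and the two
+ * shifted cross terms raise product[0] to 0x200000001 -- together the
+ * 128-bit value 2^64 + 2^33 + 1.
+ */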
+static void
+dtrace_multiply_128(uint64_t factor1, uint64_t factor2, uint64_t *product)
+{
+ uint64_t hi1, hi2, lo1, lo2;
+ uint64_t tmp[2];
+
+ hi1 = factor1 >> 32;
+ hi2 = factor2 >> 32;
+
+ lo1 = factor1 & DT_MASK_LO;
+ lo2 = factor2 & DT_MASK_LO;
+
+ product[0] = lo1 * lo2;
+ product[1] = hi1 * hi2;
+
+ tmp[0] = hi1 * lo2;
+ tmp[1] = 0;
+ dtrace_shift_128(tmp, 32);
+ dtrace_add_128(product, tmp, product);
+
+ tmp[0] = hi2 * lo1;
+ tmp[1] = 0;
+ dtrace_shift_128(tmp, 32);
+ dtrace_add_128(product, tmp, product);
+}
+
+/*
+ * This privilege check should be used by actions and subroutines to
+ * verify that the user credentials of the process that enabled the
+ * invoking ECB match the target credentials
+ */
+static int
+dtrace_priv_proc_common_user(dtrace_state_t *state)
+{
+ cred_t *cr, *s_cr = state->dts_cred.dcr_cred;
+
+ /*
+ * We should always have a non-NULL state cred here, since if cred
+ * is null (anonymous tracing), we fast-path bypass this routine.
+ */
+ ASSERT(s_cr != NULL);
+
+ if ((cr = CRED()) != NULL &&
+ s_cr->cr_uid == cr->cr_uid &&
+ s_cr->cr_uid == cr->cr_ruid &&
+ s_cr->cr_uid == cr->cr_suid &&
+ s_cr->cr_gid == cr->cr_gid &&
+ s_cr->cr_gid == cr->cr_rgid &&
+ s_cr->cr_gid == cr->cr_sgid)
+ return (1);
+
+ return (0);
+}
+
+/*
+ * This privilege check should be used by actions and subroutines to
+ * verify that the zone of the process that enabled the invoking ECB
+ * matches the target credentials.
+ */
+static int
+dtrace_priv_proc_common_zone(dtrace_state_t *state)
+{
+#ifdef illumos
+ cred_t *cr, *s_cr = state->dts_cred.dcr_cred;
+
+ /*
+ * We should always have a non-NULL state cred here, since if cred
+ * is null (anonymous tracing), we fast-path bypass this routine.
+ */
+ ASSERT(s_cr != NULL);
+
+ if ((cr = CRED()) != NULL && s_cr->cr_zone == cr->cr_zone)
+ return (1);
+
+ return (0);
+#else
+ return (1);
+#endif
+}
+
+/*
+ * This privilege check should be used by actions and subroutines to
+ * verify that the process has not performed a setuid or otherwise changed
+ * its credentials.
+ */
+static int
+dtrace_priv_proc_common_nocd(void)
+{
+ proc_t *proc;
+
+ if ((proc = ttoproc(curthread)) != NULL &&
+ !(proc->p_flag & SNOCD))
+ return (1);
+
+ return (0);
+}
+
+static int
+dtrace_priv_proc_destructive(dtrace_state_t *state)
+{
+ int action = state->dts_cred.dcr_action;
+
+ if (((action & DTRACE_CRA_PROC_DESTRUCTIVE_ALLZONE) == 0) &&
+ dtrace_priv_proc_common_zone(state) == 0)
+ goto bad;
+
+ if (((action & DTRACE_CRA_PROC_DESTRUCTIVE_ALLUSER) == 0) &&
+ dtrace_priv_proc_common_user(state) == 0)
+ goto bad;
+
+ if (((action & DTRACE_CRA_PROC_DESTRUCTIVE_CREDCHG) == 0) &&
+ dtrace_priv_proc_common_nocd() == 0)
+ goto bad;
+
+ return (1);
+
+bad:
+ cpu_core[curcpu].cpuc_dtrace_flags |= CPU_DTRACE_UPRIV;
+
+ return (0);
+}
+
+static int
+dtrace_priv_proc_control(dtrace_state_t *state)
+{
+ if (state->dts_cred.dcr_action & DTRACE_CRA_PROC_CONTROL)
+ return (1);
+
+ if (dtrace_priv_proc_common_zone(state) &&
+ dtrace_priv_proc_common_user(state) &&
+ dtrace_priv_proc_common_nocd())
+ return (1);
+
+ cpu_core[curcpu].cpuc_dtrace_flags |= CPU_DTRACE_UPRIV;
+
+ return (0);
+}
+
+static int
+dtrace_priv_proc(dtrace_state_t *state)
+{
+ if (state->dts_cred.dcr_action & DTRACE_CRA_PROC)
+ return (1);
+
+ cpu_core[curcpu].cpuc_dtrace_flags |= CPU_DTRACE_UPRIV;
+
+ return (0);
+}
+
+static int
+dtrace_priv_kernel(dtrace_state_t *state)
+{
+ if (state->dts_cred.dcr_action & DTRACE_CRA_KERNEL)
+ return (1);
+
+ cpu_core[curcpu].cpuc_dtrace_flags |= CPU_DTRACE_KPRIV;
+
+ return (0);
+}
+
+static int
+dtrace_priv_kernel_destructive(dtrace_state_t *state)
+{
+ if (state->dts_cred.dcr_action & DTRACE_CRA_KERNEL_DESTRUCTIVE)
+ return (1);
+
+ cpu_core[curcpu].cpuc_dtrace_flags |= CPU_DTRACE_KPRIV;
+
+ return (0);
+}
+
+/*
+ * Determine if the dte_cond of the specified ECB allows for processing of
+ * the current probe to continue. Note that this routine may allow continued
+ * processing, but with access(es) stripped from the mstate's dtms_access
+ * field.
+ */
+static int
+dtrace_priv_probe(dtrace_state_t *state, dtrace_mstate_t *mstate,
+ dtrace_ecb_t *ecb)
+{
+ dtrace_probe_t *probe = ecb->dte_probe;
+ dtrace_provider_t *prov = probe->dtpr_provider;
+ dtrace_pops_t *pops = &prov->dtpv_pops;
+ int mode = DTRACE_MODE_NOPRIV_DROP;
+
+ ASSERT(ecb->dte_cond);
+
+#ifdef illumos
+ if (pops->dtps_mode != NULL) {
+ mode = pops->dtps_mode(prov->dtpv_arg,
+ probe->dtpr_id, probe->dtpr_arg);
+
+ ASSERT((mode & DTRACE_MODE_USER) ||
+ (mode & DTRACE_MODE_KERNEL));
+ ASSERT((mode & DTRACE_MODE_NOPRIV_RESTRICT) ||
+ (mode & DTRACE_MODE_NOPRIV_DROP));
+ }
+
+ /*
+ * If the dte_cond bits indicate that this consumer is only allowed to
+ * see user-mode firings of this probe, call the provider's dtps_mode()
+ * entry point to check that the probe was fired while in a user
+ * context. If that's not the case, use the policy specified by the
+ * provider to determine if we drop the probe or merely restrict
+ * operation.
+ */
+ if (ecb->dte_cond & DTRACE_COND_USERMODE) {
+ ASSERT(mode != DTRACE_MODE_NOPRIV_DROP);
+
+ if (!(mode & DTRACE_MODE_USER)) {
+ if (mode & DTRACE_MODE_NOPRIV_DROP)
+ return (0);
+
+ mstate->dtms_access &= ~DTRACE_ACCESS_ARGS;
+ }
+ }
+#endif
+
+ /*
+ * This is more subtle than it looks. We have to be absolutely certain
+ * that CRED() isn't going to change out from under us so it's only
+ * legit to examine that structure if we're in constrained situations.
+	 * Currently, the only time we'll perform this check is when a
+	 * non-super-user has enabled the profile or syscall providers --
+	 * providers that allow visibility of all processes. For the profile
+	 * case, the check above will ensure that we're examining a user
+	 * context.
+ */
+ if (ecb->dte_cond & DTRACE_COND_OWNER) {
+ cred_t *cr;
+ cred_t *s_cr = state->dts_cred.dcr_cred;
+ proc_t *proc;
+
+ ASSERT(s_cr != NULL);
+
+ if ((cr = CRED()) == NULL ||
+ s_cr->cr_uid != cr->cr_uid ||
+ s_cr->cr_uid != cr->cr_ruid ||
+ s_cr->cr_uid != cr->cr_suid ||
+ s_cr->cr_gid != cr->cr_gid ||
+ s_cr->cr_gid != cr->cr_rgid ||
+ s_cr->cr_gid != cr->cr_sgid ||
+ (proc = ttoproc(curthread)) == NULL ||
+ (proc->p_flag & SNOCD)) {
+ if (mode & DTRACE_MODE_NOPRIV_DROP)
+ return (0);
+
+#ifdef illumos
+ mstate->dtms_access &= ~DTRACE_ACCESS_PROC;
+#endif
+ }
+ }
+
+#ifdef illumos
+ /*
+ * If our dte_cond is set to DTRACE_COND_ZONEOWNER and we are not
+ * in our zone, check to see if our mode policy is to restrict rather
+ * than to drop; if to restrict, strip away both DTRACE_ACCESS_PROC
+ * and DTRACE_ACCESS_ARGS
+ */
+ if (ecb->dte_cond & DTRACE_COND_ZONEOWNER) {
+ cred_t *cr;
+ cred_t *s_cr = state->dts_cred.dcr_cred;
+
+ ASSERT(s_cr != NULL);
+
+ if ((cr = CRED()) == NULL ||
+ s_cr->cr_zone->zone_id != cr->cr_zone->zone_id) {
+ if (mode & DTRACE_MODE_NOPRIV_DROP)
+ return (0);
+
+ mstate->dtms_access &=
+ ~(DTRACE_ACCESS_PROC | DTRACE_ACCESS_ARGS);
+ }
+ }
+#endif
+
+ return (1);
+}
+
+/*
+ * Note: not called from probe context. This function is called
+ * asynchronously (and at a regular interval) from outside of probe context to
+ * clean the dirty dynamic variable lists on all CPUs. Dynamic variable
+ * cleaning is explained in detail in <sys/dtrace_impl.h>.
+ */
+void
+dtrace_dynvar_clean(dtrace_dstate_t *dstate)
+{
+ dtrace_dynvar_t *dirty;
+ dtrace_dstate_percpu_t *dcpu;
+ dtrace_dynvar_t **rinsep;
+ int i, j, work = 0;
+
+ for (i = 0; i < NCPU; i++) {
+ dcpu = &dstate->dtds_percpu[i];
+ rinsep = &dcpu->dtdsc_rinsing;
+
+ /*
+ * If the dirty list is NULL, there is no dirty work to do.
+ */
+ if (dcpu->dtdsc_dirty == NULL)
+ continue;
+
+ if (dcpu->dtdsc_rinsing != NULL) {
+ /*
+ * If the rinsing list is non-NULL, then it is because
+ * this CPU was selected to accept another CPU's
+ * dirty list -- and since that time, dirty buffers
+ * have accumulated. This is a highly unlikely
+ * condition, but we choose to ignore the dirty
+			 * buffers -- they'll be picked up in a future cleanse.
+ */
+ continue;
+ }
+
+ if (dcpu->dtdsc_clean != NULL) {
+ /*
+ * If the clean list is non-NULL, then we're in a
+ * situation where a CPU has done deallocations (we
+ * have a non-NULL dirty list) but no allocations (we
+ * also have a non-NULL clean list). We can't simply
+ * move the dirty list into the clean list on this
+ * CPU, yet we also don't want to allow this condition
+ * to persist, lest a short clean list prevent a
+ * massive dirty list from being cleaned (which in
+ * turn could lead to otherwise avoidable dynamic
+ * drops). To deal with this, we look for some CPU
+ * with a NULL clean list, NULL dirty list, and NULL
+ * rinsing list -- and then we borrow this CPU to
+ * rinse our dirty list.
+ */
+ for (j = 0; j < NCPU; j++) {
+ dtrace_dstate_percpu_t *rinser;
+
+ rinser = &dstate->dtds_percpu[j];
+
+ if (rinser->dtdsc_rinsing != NULL)
+ continue;
+
+ if (rinser->dtdsc_dirty != NULL)
+ continue;
+
+ if (rinser->dtdsc_clean != NULL)
+ continue;
+
+ rinsep = &rinser->dtdsc_rinsing;
+ break;
+ }
+
+ if (j == NCPU) {
+ /*
+ * We were unable to find another CPU that
+ * could accept this dirty list -- we are
+ * therefore unable to clean it now.
+ */
+ dtrace_dynvar_failclean++;
+ continue;
+ }
+ }
+
+ work = 1;
+
+ /*
+ * Atomically move the dirty list aside.
+ */
+ do {
+ dirty = dcpu->dtdsc_dirty;
+
+ /*
+ * Before we zap the dirty list, set the rinsing list.
+ * (This allows for a potential assertion in
+ * dtrace_dynvar(): if a free dynamic variable appears
+ * on a hash chain, either the dirty list or the
+ * rinsing list for some CPU must be non-NULL.)
+ */
+ *rinsep = dirty;
+ dtrace_membar_producer();
+ } while (dtrace_casptr(&dcpu->dtdsc_dirty,
+ dirty, NULL) != dirty);
+ }
+
+ if (!work) {
+ /*
+ * We have no work to do; we can simply return.
+ */
+ return;
+ }
+
+ dtrace_sync();
+
+ for (i = 0; i < NCPU; i++) {
+ dcpu = &dstate->dtds_percpu[i];
+
+ if (dcpu->dtdsc_rinsing == NULL)
+ continue;
+
+ /*
+ * We are now guaranteed that no hash chain contains a pointer
+ * into this dirty list; we can make it clean.
+ */
+ ASSERT(dcpu->dtdsc_clean == NULL);
+ dcpu->dtdsc_clean = dcpu->dtdsc_rinsing;
+ dcpu->dtdsc_rinsing = NULL;
+ }
+
+ /*
+ * Before we actually set the state to be DTRACE_DSTATE_CLEAN, make
+ * sure that all CPUs have seen all of the dtdsc_clean pointers.
+ * This prevents a race whereby a CPU incorrectly decides that
+ * the state should be something other than DTRACE_DSTATE_CLEAN
+ * after dtrace_dynvar_clean() has completed.
+ */
+ dtrace_sync();
+
+ dstate->dtds_state = DTRACE_DSTATE_CLEAN;
+}
+
+/*
+ * Depending on the value of the op parameter, this function looks up,
+ * allocates, or deallocates an arbitrarily-keyed dynamic variable. If an
+ * allocation is requested, this function will return a pointer to a
+ * dtrace_dynvar_t corresponding to the allocated variable -- or NULL if no
+ * variable can be allocated. If NULL is returned, the appropriate counter
+ * will be incremented.
+ */
+dtrace_dynvar_t *
+dtrace_dynvar(dtrace_dstate_t *dstate, uint_t nkeys,
+ dtrace_key_t *key, size_t dsize, dtrace_dynvar_op_t op,
+ dtrace_mstate_t *mstate, dtrace_vstate_t *vstate)
+{
+ uint64_t hashval = DTRACE_DYNHASH_VALID;
+ dtrace_dynhash_t *hash = dstate->dtds_hash;
+ dtrace_dynvar_t *free, *new_free, *next, *dvar, *start, *prev = NULL;
+ processorid_t me = curcpu, cpu = me;
+ dtrace_dstate_percpu_t *dcpu = &dstate->dtds_percpu[me];
+ size_t bucket, ksize;
+ size_t chunksize = dstate->dtds_chunksize;
+ uintptr_t kdata, lock, nstate;
+ uint_t i;
+
+ ASSERT(nkeys != 0);
+
+ /*
+ * Hash the key. As with aggregations, we use Jenkins' "One-at-a-time"
+ * algorithm. For the by-value portions, we perform the algorithm in
+ * 16-bit chunks (as opposed to 8-bit chunks). This speeds things up a
+ * bit, and seems to have only a minute effect on distribution. For
+ * the by-reference data, we perform "One-at-a-time" iterating (safely)
+ * over each referenced byte. It's painful to do this, but it's much
+ * better than pathological hash distribution. The efficacy of the
+ * hashing algorithm (and a comparison with other algorithms) may be
+ * found by running the ::dtrace_dynstat MDB dcmd.
+ */
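+	/*
+	 * (For a by-value key, the four 16-bit chunks mixed in below are the
+	 * value's bits 63-48, 47-32, 31-16 and 15-0, in that order.)
+	 */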
+ for (i = 0; i < nkeys; i++) {
+ if (key[i].dttk_size == 0) {
+ uint64_t val = key[i].dttk_value;
+
+ hashval += (val >> 48) & 0xffff;
+ hashval += (hashval << 10);
+ hashval ^= (hashval >> 6);
+
+ hashval += (val >> 32) & 0xffff;
+ hashval += (hashval << 10);
+ hashval ^= (hashval >> 6);
+
+ hashval += (val >> 16) & 0xffff;
+ hashval += (hashval << 10);
+ hashval ^= (hashval >> 6);
+
+ hashval += val & 0xffff;
+ hashval += (hashval << 10);
+ hashval ^= (hashval >> 6);
+ } else {
+ /*
+ * This is incredibly painful, but it beats the hell
+ * out of the alternative.
+ */
+ uint64_t j, size = key[i].dttk_size;
+ uintptr_t base = (uintptr_t)key[i].dttk_value;
+
+ if (!dtrace_canload(base, size, mstate, vstate))
+ break;
+
+ for (j = 0; j < size; j++) {
+ hashval += dtrace_load8(base + j);
+ hashval += (hashval << 10);
+ hashval ^= (hashval >> 6);
+ }
+ }
+ }
+
+ if (DTRACE_CPUFLAG_ISSET(CPU_DTRACE_FAULT))
+ return (NULL);
+
+ hashval += (hashval << 3);
+ hashval ^= (hashval >> 11);
+ hashval += (hashval << 15);
+
+ /*
+ * There is a remote chance (ideally, 1 in 2^31) that our hashval
+ * comes out to be one of our two sentinel hash values. If this
+ * actually happens, we set the hashval to be a value known to be a
+ * non-sentinel value.
+ */
+ if (hashval == DTRACE_DYNHASH_FREE || hashval == DTRACE_DYNHASH_SINK)
+ hashval = DTRACE_DYNHASH_VALID;
+
+ /*
+ * Yes, it's painful to do a divide here. If the cycle count becomes
+ * important here, tricks can be pulled to reduce it. (However, it's
+ * critical that hash collisions be kept to an absolute minimum;
+ * they're much more painful than a divide.) It's better to have a
+ * solution that generates few collisions and still keeps things
+ * relatively simple.
+ */
+ bucket = hashval % dstate->dtds_hashsize;
+
+ if (op == DTRACE_DYNVAR_DEALLOC) {
+ volatile uintptr_t *lockp = &hash[bucket].dtdh_lock;
+
+ for (;;) {
+ while ((lock = *lockp) & 1)
+ continue;
+
+ if (dtrace_casptr((volatile void *)lockp,
+			    (volatile void *)lock,
+			    (volatile void *)(lock + 1)) == (void *)lock)
+ break;
+ }
+
+ dtrace_membar_producer();
+ }
+
+top:
+ prev = NULL;
+ lock = hash[bucket].dtdh_lock;
+
+ dtrace_membar_consumer();
+
+ start = hash[bucket].dtdh_chain;
+ ASSERT(start != NULL && (start->dtdv_hashval == DTRACE_DYNHASH_SINK ||
+ start->dtdv_hashval != DTRACE_DYNHASH_FREE ||
+ op != DTRACE_DYNVAR_DEALLOC));
+
+ for (dvar = start; dvar != NULL; dvar = dvar->dtdv_next) {
+ dtrace_tuple_t *dtuple = &dvar->dtdv_tuple;
+ dtrace_key_t *dkey = &dtuple->dtt_key[0];
+
+ if (dvar->dtdv_hashval != hashval) {
+ if (dvar->dtdv_hashval == DTRACE_DYNHASH_SINK) {
+ /*
+ * We've reached the sink, and therefore the
+ * end of the hash chain; we can kick out of
+ * the loop knowing that we have seen a valid
+ * snapshot of state.
+ */
+ ASSERT(dvar->dtdv_next == NULL);
+ ASSERT(dvar == &dtrace_dynhash_sink);
+ break;
+ }
+
+ if (dvar->dtdv_hashval == DTRACE_DYNHASH_FREE) {
+ /*
+ * We've gone off the rails: somewhere along
+ * the line, one of the members of this hash
+ * chain was deleted. Note that we could also
+ * detect this by simply letting this loop run
+ * to completion, as we would eventually hit
+ * the end of the dirty list. However, we
+ * want to avoid running the length of the
+ * dirty list unnecessarily (it might be quite
+ * long), so we catch this as early as
+ * possible by detecting the hash marker. In
+ * this case, we simply set dvar to NULL and
+ * break; the conditional after the loop will
+ * send us back to top.
+ */
+ dvar = NULL;
+ break;
+ }
+
+ goto next;
+ }
+
+ if (dtuple->dtt_nkeys != nkeys)
+ goto next;
+
+ for (i = 0; i < nkeys; i++, dkey++) {
+ if (dkey->dttk_size != key[i].dttk_size)
+ goto next; /* size or type mismatch */
+
+ if (dkey->dttk_size != 0) {
+ if (dtrace_bcmp(
+ (void *)(uintptr_t)key[i].dttk_value,
+ (void *)(uintptr_t)dkey->dttk_value,
+ dkey->dttk_size))
+ goto next;
+ } else {
+ if (dkey->dttk_value != key[i].dttk_value)
+ goto next;
+ }
+ }
+
+ if (op != DTRACE_DYNVAR_DEALLOC)
+ return (dvar);
+
+ ASSERT(dvar->dtdv_next == NULL ||
+ dvar->dtdv_next->dtdv_hashval != DTRACE_DYNHASH_FREE);
+
+ if (prev != NULL) {
+ ASSERT(hash[bucket].dtdh_chain != dvar);
+ ASSERT(start != dvar);
+ ASSERT(prev->dtdv_next == dvar);
+ prev->dtdv_next = dvar->dtdv_next;
+ } else {
+ if (dtrace_casptr(&hash[bucket].dtdh_chain,
+ start, dvar->dtdv_next) != start) {
+ /*
+ * We have failed to atomically swing the
+ * hash table head pointer, presumably because
+ * of a conflicting allocation on another CPU.
+ * We need to reread the hash chain and try
+ * again.
+ */
+ goto top;
+ }
+ }
+
+ dtrace_membar_producer();
+
+ /*
+ * Now set the hash value to indicate that it's free.
+ */
+ ASSERT(hash[bucket].dtdh_chain != dvar);
+ dvar->dtdv_hashval = DTRACE_DYNHASH_FREE;
+
+ dtrace_membar_producer();
+
+ /*
+ * Set the next pointer to point at the dirty list, and
+ * atomically swing the dirty pointer to the newly freed dvar.
+ */
+ do {
+ next = dcpu->dtdsc_dirty;
+ dvar->dtdv_next = next;
+ } while (dtrace_casptr(&dcpu->dtdsc_dirty, next, dvar) != next);
+
+ /*
+ * Finally, unlock this hash bucket.
+ */
+ ASSERT(hash[bucket].dtdh_lock == lock);
+ ASSERT(lock & 1);
+ hash[bucket].dtdh_lock++;
+
+ return (NULL);
+next:
+ prev = dvar;
+ continue;
+ }
+
+ if (dvar == NULL) {
+ /*
+ * If dvar is NULL, it is because we went off the rails:
+ * one of the elements that we traversed in the hash chain
+ * was deleted while we were traversing it. In this case,
+ * we assert that we aren't doing a dealloc (deallocs lock
+ * the hash bucket to prevent themselves from racing with
+ * one another), and retry the hash chain traversal.
+ */
+ ASSERT(op != DTRACE_DYNVAR_DEALLOC);
+ goto top;
+ }
+
+ if (op != DTRACE_DYNVAR_ALLOC) {
+ /*
+ * If we are not to allocate a new variable, we want to
+ * return NULL now. Before we return, check that the value
+ * of the lock word hasn't changed. If it has, we may have
+ * seen an inconsistent snapshot.
+ */
+ if (op == DTRACE_DYNVAR_NOALLOC) {
+ if (hash[bucket].dtdh_lock != lock)
+ goto top;
+ } else {
+ ASSERT(op == DTRACE_DYNVAR_DEALLOC);
+ ASSERT(hash[bucket].dtdh_lock == lock);
+ ASSERT(lock & 1);
+ hash[bucket].dtdh_lock++;
+ }
+
+ return (NULL);
+ }
+
+ /*
+ * We need to allocate a new dynamic variable. The size we need is the
+ * size of dtrace_dynvar plus the size of nkeys dtrace_key_t's plus the
+ * size of any auxiliary key data (rounded up to 8-byte alignment) plus
+ * the size of any referred-to data (dsize). We then round the final
+ * size up to the chunksize for allocation.
+ */
+ for (ksize = 0, i = 0; i < nkeys; i++)
+ ksize += P2ROUNDUP(key[i].dttk_size, sizeof (uint64_t));
+
+ /*
+ * This should be pretty much impossible, but could happen if, say,
+ * strange DIF specified the tuple. Ideally, this should be an
+ * assertion and not an error condition -- but that requires that the
+ * chunksize calculation in dtrace_difo_chunksize() be absolutely
+ * bullet-proof. (That is, it must not be able to be fooled by
+ * malicious DIF.) Given the lack of backwards branches in DIF,
+ * solving this would presumably not amount to solving the Halting
+ * Problem -- but it still seems awfully hard.
+ */
+ if (sizeof (dtrace_dynvar_t) + sizeof (dtrace_key_t) * (nkeys - 1) +
+ ksize + dsize > chunksize) {
+ dcpu->dtdsc_drops++;
+ return (NULL);
+ }
+
+ nstate = DTRACE_DSTATE_EMPTY;
+
+ do {
+retry:
+ free = dcpu->dtdsc_free;
+
+ if (free == NULL) {
+ dtrace_dynvar_t *clean = dcpu->dtdsc_clean;
+ void *rval;
+
+ if (clean == NULL) {
+ /*
+ * We're out of dynamic variable space on
+ * this CPU. Unless we have tried all CPUs,
+ * we'll try to allocate from a different
+ * CPU.
+ */
+ switch (dstate->dtds_state) {
+ case DTRACE_DSTATE_CLEAN: {
+ void *sp = &dstate->dtds_state;
+
+ if (++cpu >= NCPU)
+ cpu = 0;
+
+ if (dcpu->dtdsc_dirty != NULL &&
+ nstate == DTRACE_DSTATE_EMPTY)
+ nstate = DTRACE_DSTATE_DIRTY;
+
+ if (dcpu->dtdsc_rinsing != NULL)
+ nstate = DTRACE_DSTATE_RINSING;
+
+ dcpu = &dstate->dtds_percpu[cpu];
+
+ if (cpu != me)
+ goto retry;
+
+ (void) dtrace_cas32(sp,
+ DTRACE_DSTATE_CLEAN, nstate);
+
+ /*
+ * To increment the correct bean
+ * counter, take another lap.
+ */
+ goto retry;
+ }
+
+ case DTRACE_DSTATE_DIRTY:
+ dcpu->dtdsc_dirty_drops++;
+ break;
+
+ case DTRACE_DSTATE_RINSING:
+ dcpu->dtdsc_rinsing_drops++;
+ break;
+
+ case DTRACE_DSTATE_EMPTY:
+ dcpu->dtdsc_drops++;
+ break;
+ }
+
+ DTRACE_CPUFLAG_SET(CPU_DTRACE_DROP);
+ return (NULL);
+ }
+
+ /*
+ * The clean list appears to be non-empty. We want to
+ * move the clean list to the free list; we start by
+ * moving the clean pointer aside.
+ */
+ if (dtrace_casptr(&dcpu->dtdsc_clean,
+ clean, NULL) != clean) {
+ /*
+ * We are in one of two situations:
+ *
+ * (a) The clean list was switched to the
+ * free list by another CPU.
+ *
+ * (b) The clean list was added to by the
+ * cleansing cyclic.
+ *
+ * In either of these situations, we can
+ * just reattempt the free list allocation.
+ */
+ goto retry;
+ }
+
+ ASSERT(clean->dtdv_hashval == DTRACE_DYNHASH_FREE);
+
+ /*
+ * Now we'll move the clean list to our free list.
+ * It's impossible for this to fail: the only way
+ * the free list can be updated is through this
+ * code path, and only one CPU can own the clean list.
+ * Thus, it would only be possible for this to fail if
+ * this code were racing with dtrace_dynvar_clean().
+ * (That is, if dtrace_dynvar_clean() updated the clean
+ * list, and we ended up racing to update the free
+ * list.) This race is prevented by the dtrace_sync()
+ * in dtrace_dynvar_clean() -- which flushes the
+ * owners of the clean lists out before resetting
+ * the clean lists.
+ */
+ dcpu = &dstate->dtds_percpu[me];
+ rval = dtrace_casptr(&dcpu->dtdsc_free, NULL, clean);
+ ASSERT(rval == NULL);
+ goto retry;
+ }
+
+ dvar = free;
+ new_free = dvar->dtdv_next;
+ } while (dtrace_casptr(&dcpu->dtdsc_free, free, new_free) != free);
+
+ /*
+ * We have now allocated a new chunk. We copy the tuple keys into the
+ * tuple array and copy any referenced key data into the data space
+ * following the tuple array. As we do this, we relocate dttk_value
+ * in the final tuple to point to the key data address in the chunk.
+ */
+ kdata = (uintptr_t)&dvar->dtdv_tuple.dtt_key[nkeys];
+ dvar->dtdv_data = (void *)(kdata + ksize);
+ dvar->dtdv_tuple.dtt_nkeys = nkeys;
+
+ for (i = 0; i < nkeys; i++) {
+ dtrace_key_t *dkey = &dvar->dtdv_tuple.dtt_key[i];
+ size_t kesize = key[i].dttk_size;
+
+ if (kesize != 0) {
+ dtrace_bcopy(
+ (const void *)(uintptr_t)key[i].dttk_value,
+ (void *)kdata, kesize);
+ dkey->dttk_value = kdata;
+ kdata += P2ROUNDUP(kesize, sizeof (uint64_t));
+ } else {
+ dkey->dttk_value = key[i].dttk_value;
+ }
+
+ dkey->dttk_size = kesize;
+ }
+
+ ASSERT(dvar->dtdv_hashval == DTRACE_DYNHASH_FREE);
+ dvar->dtdv_hashval = hashval;
+ dvar->dtdv_next = start;
+
+ if (dtrace_casptr(&hash[bucket].dtdh_chain, start, dvar) == start)
+ return (dvar);
+
+ /*
+ * The cas has failed. Either another CPU is adding an element to
+ * this hash chain, or another CPU is deleting an element from this
+ * hash chain. The simplest way to deal with both of these cases
+ * (though not necessarily the most efficient) is to free our
+ * allocated block and re-attempt it all. Note that the free is
+ * to the dirty list and _not_ to the free list. This is to prevent
+ * races with allocators, above.
+ */
+ dvar->dtdv_hashval = DTRACE_DYNHASH_FREE;
+
+ dtrace_membar_producer();
+
+ do {
+ free = dcpu->dtdsc_dirty;
+ dvar->dtdv_next = free;
+ } while (dtrace_casptr(&dcpu->dtdsc_dirty, free, dvar) != free);
+
+ goto top;
+}
+
+/*ARGSUSED*/
+static void
+dtrace_aggregate_min(uint64_t *oval, uint64_t nval, uint64_t arg)
+{
+ if ((int64_t)nval < (int64_t)*oval)
+ *oval = nval;
+}
+
+/*ARGSUSED*/
+static void
+dtrace_aggregate_max(uint64_t *oval, uint64_t nval, uint64_t arg)
+{
+ if ((int64_t)nval > (int64_t)*oval)
+ *oval = nval;
+}
+
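+/*
+ * Increment the quantize() bucket covering nval. Buckets are power-of-two
+ * ranges on either side of a zero bucket; for example, a value of 9 lands
+ * in the bucket spanning [8, 16).
+ */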
+static void
+dtrace_aggregate_quantize(uint64_t *quanta, uint64_t nval, uint64_t incr)
+{
+ int i, zero = DTRACE_QUANTIZE_ZEROBUCKET;
+ int64_t val = (int64_t)nval;
+
+ if (val < 0) {
+ for (i = 0; i < zero; i++) {
+ if (val <= DTRACE_QUANTIZE_BUCKETVAL(i)) {
+ quanta[i] += incr;
+ return;
+ }
+ }
+ } else {
+ for (i = zero + 1; i < DTRACE_QUANTIZE_NBUCKETS; i++) {
+ if (val < DTRACE_QUANTIZE_BUCKETVAL(i)) {
+ quanta[i - 1] += incr;
+ return;
+ }
+ }
+
+ quanta[DTRACE_QUANTIZE_NBUCKETS - 1] += incr;
+ return;
+ }
+
+ ASSERT(0);
+}
+
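+/*
+ * For example (hypothetical parameters), with base 0, step 10 and 10
+ * levels, a value of 37 computes level 3 and lands in lquanta[4];
+ * lquanta[0] and lquanta[11] hold the underflow and overflow counts.
+ */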
+static void
+dtrace_aggregate_lquantize(uint64_t *lquanta, uint64_t nval, uint64_t incr)
+{
+ uint64_t arg = *lquanta++;
+ int32_t base = DTRACE_LQUANTIZE_BASE(arg);
+ uint16_t step = DTRACE_LQUANTIZE_STEP(arg);
+ uint16_t levels = DTRACE_LQUANTIZE_LEVELS(arg);
+ int32_t val = (int32_t)nval, level;
+
+ ASSERT(step != 0);
+ ASSERT(levels != 0);
+
+ if (val < base) {
+ /*
+ * This is an underflow.
+ */
+ lquanta[0] += incr;
+ return;
+ }
+
+ level = (val - base) / step;
+
+ if (level < levels) {
+ lquanta[level + 1] += incr;
+ return;
+ }
+
+ /*
+ * This is an overflow.
+ */
+ lquanta[levels + 1] += incr;
+}
+
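+/*
+ * For example (hypothetical parameters), with factor 10, low 0, high 2 and
+ * nsteps 10, values below 1 land in bucket 0, a value of 5 lands in bucket
+ * 5 (base 1 plus offset (5 - 1) / 1), and values of 1000 and above land in
+ * the top bucket.
+ */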
+static int
+dtrace_aggregate_llquantize_bucket(uint16_t factor, uint16_t low,
+ uint16_t high, uint16_t nsteps, int64_t value)
+{
+ int64_t this = 1, last, next;
+ int base = 1, order;
+
+ ASSERT(factor <= nsteps);
+ ASSERT(nsteps % factor == 0);
+
+ for (order = 0; order < low; order++)
+ this *= factor;
+
+ /*
+ * If our value is less than our factor taken to the power of the
+ * low order of magnitude, it goes into the zeroth bucket.
+ */
+ if (value < (last = this))
+ return (0);
+
+ for (this *= factor; order <= high; order++) {
+ int nbuckets = this > nsteps ? nsteps : this;
+
+ if ((next = this * factor) < this) {
+ /*
+ * We should not generally get log/linear quantizations
+ * with a high magnitude that allows 64-bits to
+ * overflow, but we nonetheless protect against this
+ * by explicitly checking for overflow, and clamping
+ * our value accordingly.
+ */
+ value = this - 1;
+ }
+
+ if (value < this) {
+ /*
+ * If our value lies within this order of magnitude,
+ * determine its position by taking the offset within
+ * the order of magnitude, dividing by the bucket
+ * width, and adding to our (accumulated) base.
+ */
+ return (base + (value - last) / (this / nbuckets));
+ }
+
+ base += nbuckets - (nbuckets / factor);
+ last = this;
+ this = next;
+ }
+
+ /*
+ * Our value is greater than or equal to our factor taken to the
+ * power of one plus the high magnitude -- return the top bucket.
+ */
+ return (base);
+}
+
+static void
+dtrace_aggregate_llquantize(uint64_t *llquanta, uint64_t nval, uint64_t incr)
+{
+ uint64_t arg = *llquanta++;
+ uint16_t factor = DTRACE_LLQUANTIZE_FACTOR(arg);
+ uint16_t low = DTRACE_LLQUANTIZE_LOW(arg);
+ uint16_t high = DTRACE_LLQUANTIZE_HIGH(arg);
+ uint16_t nsteps = DTRACE_LLQUANTIZE_NSTEP(arg);
+
+ llquanta[dtrace_aggregate_llquantize_bucket(factor,
+ low, high, nsteps, nval)] += incr;
+}
+
+/*ARGSUSED*/
+static void
+dtrace_aggregate_avg(uint64_t *data, uint64_t nval, uint64_t arg)
+{
+ data[0]++;
+ data[1] += nval;
+}
+
+/*ARGSUSED*/
+static void
+dtrace_aggregate_stddev(uint64_t *data, uint64_t nval, uint64_t arg)
+{
+ int64_t snval = (int64_t)nval;
+ uint64_t tmp[2];
+
+ data[0]++;
+ data[1] += nval;
+
+ /*
+ * What we want to say here is:
+ *
+ * data[2] += nval * nval;
+ *
+ * But given that nval is 64-bit, we could easily overflow, so
+ * we do this as 128-bit arithmetic.
+ */
+ if (snval < 0)
+ snval = -snval;
+
+ dtrace_multiply_128((uint64_t)snval, (uint64_t)snval, tmp);
+ dtrace_add_128(data + 2, tmp, data + 2);
+}
+
+/*ARGSUSED*/
+static void
+dtrace_aggregate_count(uint64_t *oval, uint64_t nval, uint64_t arg)
+{
+ *oval = *oval + 1;
+}
+
+/*ARGSUSED*/
+static void
+dtrace_aggregate_sum(uint64_t *oval, uint64_t nval, uint64_t arg)
+{
+ *oval += nval;
+}
+
+/*
+ * Aggregate given the tuple in the principal data buffer, and the aggregating
+ * action denoted by the specified dtrace_aggregation_t. The aggregation
+ * buffer is specified as the buf parameter. This routine does not return
+ * failure; if there is no space in the aggregation buffer, the data will be
+ * dropped, and a corresponding counter incremented.
+ */
+static void
+dtrace_aggregate(dtrace_aggregation_t *agg, dtrace_buffer_t *dbuf,
+ intptr_t offset, dtrace_buffer_t *buf, uint64_t expr, uint64_t arg)
+{
+ dtrace_recdesc_t *rec = &agg->dtag_action.dta_rec;
+ uint32_t i, ndx, size, fsize;
+ uint32_t align = sizeof (uint64_t) - 1;
+ dtrace_aggbuffer_t *agb;
+ dtrace_aggkey_t *key;
+ uint32_t hashval = 0, limit, isstr;
+ caddr_t tomax, data, kdata;
+ dtrace_actkind_t action;
+ dtrace_action_t *act;
+ uintptr_t offs;
+
+ if (buf == NULL)
+ return;
+
+ if (!agg->dtag_hasarg) {
+ /*
+ * Currently, only quantize() and lquantize() take additional
+ * arguments, and they have the same semantics: an increment
+ * value that defaults to 1 when not present. If additional
+ * aggregating actions take arguments, the setting of the
+ * default argument value will presumably have to become more
+ * sophisticated...
+ */
+ arg = 1;
+ }
+
+ action = agg->dtag_action.dta_kind - DTRACEACT_AGGREGATION;
+ size = rec->dtrd_offset - agg->dtag_base;
+ fsize = size + rec->dtrd_size;
+
+ ASSERT(dbuf->dtb_tomax != NULL);
+ data = dbuf->dtb_tomax + offset + agg->dtag_base;
+
+ if ((tomax = buf->dtb_tomax) == NULL) {
+ dtrace_buffer_drop(buf);
+ return;
+ }
+
+ /*
+ * The metastructure is always at the bottom of the buffer.
+ */
+ agb = (dtrace_aggbuffer_t *)(tomax + buf->dtb_size -
+ sizeof (dtrace_aggbuffer_t));
+
+ if (buf->dtb_offset == 0) {
+ /*
+ * We just kludge up approximately 1/8th of the size to be
+ * buckets. If this guess ends up being routinely
+ * off-the-mark, we may need to dynamically readjust this
+ * based on past performance.
+ */
+ uintptr_t hashsize = (buf->dtb_size >> 3) / sizeof (uintptr_t);
+
+ if ((uintptr_t)agb - hashsize * sizeof (dtrace_aggkey_t *) <
+ (uintptr_t)tomax || hashsize == 0) {
+ /*
+ * We've been given a ludicrously small buffer;
+ * increment our drop count and leave.
+ */
+ dtrace_buffer_drop(buf);
+ return;
+ }
+
+ /*
+	 * And now, a pathetic attempt to try to get an odd (or
+ * perchance, a prime) hash size for better hash distribution.
+ */
+ if (hashsize > (DTRACE_AGGHASHSIZE_SLEW << 3))
+ hashsize -= DTRACE_AGGHASHSIZE_SLEW;
+
+ agb->dtagb_hashsize = hashsize;
+ agb->dtagb_hash = (dtrace_aggkey_t **)((uintptr_t)agb -
+ agb->dtagb_hashsize * sizeof (dtrace_aggkey_t *));
+ agb->dtagb_free = (uintptr_t)agb->dtagb_hash;
+
+ for (i = 0; i < agb->dtagb_hashsize; i++)
+ agb->dtagb_hash[i] = NULL;
+ }
+
+ ASSERT(agg->dtag_first != NULL);
+ ASSERT(agg->dtag_first->dta_intuple);
+
+ /*
+ * Calculate the hash value based on the key. Note that we _don't_
+ * include the aggid in the hashing (but we will store it as part of
+ * the key). The hashing algorithm is Bob Jenkins' "One-at-a-time"
+ * algorithm: a simple, quick algorithm that has no known funnels, and
+ * gets good distribution in practice. The efficacy of the hashing
+ * algorithm (and a comparison with other algorithms) may be found by
+ * running the ::dtrace_aggstat MDB dcmd.
+ */
+ for (act = agg->dtag_first; act->dta_intuple; act = act->dta_next) {
+ i = act->dta_rec.dtrd_offset - agg->dtag_base;
+ limit = i + act->dta_rec.dtrd_size;
+ ASSERT(limit <= size);
+ isstr = DTRACEACT_ISSTRING(act);
+
+ for (; i < limit; i++) {
+ hashval += data[i];
+ hashval += (hashval << 10);
+ hashval ^= (hashval >> 6);
+
+ if (isstr && data[i] == '\0')
+ break;
+ }
+ }
+
+ hashval += (hashval << 3);
+ hashval ^= (hashval >> 11);
+ hashval += (hashval << 15);
+
+ /*
+ * Yes, the divide here is expensive -- but it's generally the least
+ * of the performance issues given the amount of data that we iterate
+ * over to compute hash values, compare data, etc.
+ */
+ ndx = hashval % agb->dtagb_hashsize;
+
+ for (key = agb->dtagb_hash[ndx]; key != NULL; key = key->dtak_next) {
+ ASSERT((caddr_t)key >= tomax);
+ ASSERT((caddr_t)key < tomax + buf->dtb_size);
+
+ if (hashval != key->dtak_hashval || key->dtak_size != size)
+ continue;
+
+ kdata = key->dtak_data;
+ ASSERT(kdata >= tomax && kdata < tomax + buf->dtb_size);
+
+ for (act = agg->dtag_first; act->dta_intuple;
+ act = act->dta_next) {
+ i = act->dta_rec.dtrd_offset - agg->dtag_base;
+ limit = i + act->dta_rec.dtrd_size;
+ ASSERT(limit <= size);
+ isstr = DTRACEACT_ISSTRING(act);
+
+ for (; i < limit; i++) {
+ if (kdata[i] != data[i])
+ goto next;
+
+ if (isstr && data[i] == '\0')
+ break;
+ }
+ }
+
+ if (action != key->dtak_action) {
+ /*
+ * We are aggregating on the same value in the same
+ * aggregation with two different aggregating actions.
+ * (This should have been picked up in the compiler,
+ * so we may be dealing with errant or devious DIF.)
+ * This is an error condition; we indicate as much,
+ * and return.
+ */
+ DTRACE_CPUFLAG_SET(CPU_DTRACE_ILLOP);
+ return;
+ }
+
+ /*
+ * This is a hit: we need to apply the aggregator to
+ * the value at this key.
+ */
+ agg->dtag_aggregate((uint64_t *)(kdata + size), expr, arg);
+ return;
+next:
+ continue;
+ }
+
+ /*
+ * We didn't find it. We need to allocate some zero-filled space,
+ * link it into the hash table appropriately, and apply the aggregator
+ * to the (zero-filled) value.
+ */
+ offs = buf->dtb_offset;
+ while (offs & (align - 1))
+ offs += sizeof (uint32_t);
+
+ /*
+ * If we don't have enough room to both allocate a new key _and_
+ * its associated data, increment the drop count and return.
+ */
+ if ((uintptr_t)tomax + offs + fsize >
+ agb->dtagb_free - sizeof (dtrace_aggkey_t)) {
+ dtrace_buffer_drop(buf);
+ return;
+ }
+
+ /*CONSTCOND*/
+ ASSERT(!(sizeof (dtrace_aggkey_t) & (sizeof (uintptr_t) - 1)));
+ key = (dtrace_aggkey_t *)(agb->dtagb_free - sizeof (dtrace_aggkey_t));
+ agb->dtagb_free -= sizeof (dtrace_aggkey_t);
+
+ key->dtak_data = kdata = tomax + offs;
+ buf->dtb_offset = offs + fsize;
+
+ /*
+ * Now copy the data across.
+ */
+ *((dtrace_aggid_t *)kdata) = agg->dtag_id;
+
+ for (i = sizeof (dtrace_aggid_t); i < size; i++)
+ kdata[i] = data[i];
+
+ /*
+ * Because strings are not zeroed out by default, we need to iterate
+ * looking for actions that store strings, and we need to explicitly
+ * pad these strings out with zeroes.
+ */
+ for (act = agg->dtag_first; act->dta_intuple; act = act->dta_next) {
+ int nul;
+
+ if (!DTRACEACT_ISSTRING(act))
+ continue;
+
+ i = act->dta_rec.dtrd_offset - agg->dtag_base;
+ limit = i + act->dta_rec.dtrd_size;
+ ASSERT(limit <= size);
+
+ for (nul = 0; i < limit; i++) {
+ if (nul) {
+ kdata[i] = '\0';
+ continue;
+ }
+
+ if (data[i] != '\0')
+ continue;
+
+ nul = 1;
+ }
+ }
+
+ for (i = size; i < fsize; i++)
+ kdata[i] = 0;
+
+ key->dtak_hashval = hashval;
+ key->dtak_size = size;
+ key->dtak_action = action;
+ key->dtak_next = agb->dtagb_hash[ndx];
+ agb->dtagb_hash[ndx] = key;
+
+ /*
+ * Finally, apply the aggregator.
+ */
+ *((uint64_t *)(key->dtak_data + size)) = agg->dtag_initial;
+ agg->dtag_aggregate((uint64_t *)(key->dtak_data + size), expr, arg);
+}
+
+/*
+ * Given consumer state, this routine finds a speculation in the INACTIVE
+ * state and transitions it into the ACTIVE state. If there is no speculation
+ * in the INACTIVE state, 0 is returned. In this case, no error counter is
+ * incremented -- it is up to the caller to take appropriate action.
+ */
+static int
+dtrace_speculation(dtrace_state_t *state)
+{
+ int i = 0;
+ dtrace_speculation_state_t curstate;
+ uint32_t *stat = &state->dts_speculations_unavail, count;
+
+ while (i < state->dts_nspeculations) {
+ dtrace_speculation_t *spec = &state->dts_speculations[i];
+
+ curstate = spec->dtsp_state;
+
+ if (curstate != DTRACESPEC_INACTIVE) {
+ if (curstate == DTRACESPEC_COMMITTINGMANY ||
+ curstate == DTRACESPEC_COMMITTING ||
+ curstate == DTRACESPEC_DISCARDING)
+ stat = &state->dts_speculations_busy;
+ i++;
+ continue;
+ }
+
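+		/*
+		 * Speculation IDs handed back to DIF are 1-based; 0 is
+		 * reserved to mean "no speculation".
+		 */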
+ if (dtrace_cas32((uint32_t *)&spec->dtsp_state,
+ curstate, DTRACESPEC_ACTIVE) == curstate)
+ return (i + 1);
+ }
+
+ /*
+ * We couldn't find a speculation. If we found as much as a single
+ * busy speculation buffer, we'll attribute this failure as "busy"
+ * instead of "unavail".
+ */
+ do {
+ count = *stat;
+ } while (dtrace_cas32(stat, count, count + 1) != count);
+
+ return (0);
+}
+
+/*
+ * This routine commits an active speculation. If the specified speculation
+ * is not in a valid state to perform a commit(), this routine will silently do
+ * nothing. The state of the specified speculation is transitioned according
+ * to the state transition diagram outlined in <sys/dtrace_impl.h>.
+ */
+static void
+dtrace_speculation_commit(dtrace_state_t *state, processorid_t cpu,
+ dtrace_specid_t which)
+{
+ dtrace_speculation_t *spec;
+ dtrace_buffer_t *src, *dest;
+ uintptr_t daddr, saddr, dlimit, slimit;
+ dtrace_speculation_state_t curstate, new = 0;
+ intptr_t offs;
+ uint64_t timestamp;
+
+ if (which == 0)
+ return;
+
+ if (which > state->dts_nspeculations) {
+ cpu_core[cpu].cpuc_dtrace_flags |= CPU_DTRACE_ILLOP;
+ return;
+ }
+
+ spec = &state->dts_speculations[which - 1];
+ src = &spec->dtsp_buffer[cpu];
+ dest = &state->dts_buffer[cpu];
+
+ do {
+ curstate = spec->dtsp_state;
+
+ if (curstate == DTRACESPEC_COMMITTINGMANY)
+ break;
+
+ switch (curstate) {
+ case DTRACESPEC_INACTIVE:
+ case DTRACESPEC_DISCARDING:
+ return;
+
+ case DTRACESPEC_COMMITTING:
+ /*
+ * This is only possible if we are (a) commit()'ing
+ * without having done a prior speculate() on this CPU
+ * and (b) racing with another commit() on a different
+ * CPU. There's nothing to do -- we just assert that
+ * our offset is 0.
+ */
+ ASSERT(src->dtb_offset == 0);
+ return;
+
+ case DTRACESPEC_ACTIVE:
+ new = DTRACESPEC_COMMITTING;
+ break;
+
+ case DTRACESPEC_ACTIVEONE:
+ /*
+ * This speculation is active on one CPU. If our
+ * buffer offset is non-zero, we know that the one CPU
+ * must be us. Otherwise, we are committing on a
+ * different CPU from the speculate(), and we must
+ * rely on being asynchronously cleaned.
+ */
+ if (src->dtb_offset != 0) {
+ new = DTRACESPEC_COMMITTING;
+ break;
+ }
+ /*FALLTHROUGH*/
+
+ case DTRACESPEC_ACTIVEMANY:
+ new = DTRACESPEC_COMMITTINGMANY;
+ break;
+
+ default:
+ ASSERT(0);
+ }
+ } while (dtrace_cas32((uint32_t *)&spec->dtsp_state,
+ curstate, new) != curstate);
+
+ /*
+ * We have set the state to indicate that we are committing this
+ * speculation. Now reserve the necessary space in the destination
+ * buffer.
+ */
+ if ((offs = dtrace_buffer_reserve(dest, src->dtb_offset,
+ sizeof (uint64_t), state, NULL)) < 0) {
+ dtrace_buffer_drop(dest);
+ goto out;
+ }
+
+ /*
+ * We have sufficient space to copy the speculative buffer into the
+ * primary buffer. First, modify the speculative buffer, filling
+	 * in the timestamp of all entries with the current time. The data
+ * must have the commit() time rather than the time it was traced,
+ * so that all entries in the primary buffer are in timestamp order.
+ */
+ timestamp = dtrace_gethrtime();
+ saddr = (uintptr_t)src->dtb_tomax;
+ slimit = saddr + src->dtb_offset;
+ while (saddr < slimit) {
+ size_t size;
+ dtrace_rechdr_t *dtrh = (dtrace_rechdr_t *)saddr;
+
+ if (dtrh->dtrh_epid == DTRACE_EPIDNONE) {
+ saddr += sizeof (dtrace_epid_t);
+ continue;
+ }
+ ASSERT3U(dtrh->dtrh_epid, <=, state->dts_necbs);
+ size = state->dts_ecbs[dtrh->dtrh_epid - 1]->dte_size;
+
+ ASSERT3U(saddr + size, <=, slimit);
+ ASSERT3U(size, >=, sizeof (dtrace_rechdr_t));
+ ASSERT3U(DTRACE_RECORD_LOAD_TIMESTAMP(dtrh), ==, UINT64_MAX);
+
+ DTRACE_RECORD_STORE_TIMESTAMP(dtrh, timestamp);
+
+ saddr += size;
+ }
+
+ /*
+ * Copy the buffer across. (Note that this is a
+	 * highly suboptimal bcopy(); in the unlikely event that this becomes
+ * a serious performance issue, a high-performance DTrace-specific
+ * bcopy() should obviously be invented.)
+ */
+ daddr = (uintptr_t)dest->dtb_tomax + offs;
+ dlimit = daddr + src->dtb_offset;
+ saddr = (uintptr_t)src->dtb_tomax;
+
+ /*
+ * First, the aligned portion.
+ */
+ while (dlimit - daddr >= sizeof (uint64_t)) {
+ *((uint64_t *)daddr) = *((uint64_t *)saddr);
+
+ daddr += sizeof (uint64_t);
+ saddr += sizeof (uint64_t);
+ }
+
+ /*
+ * Now any left-over bit...
+ */
+ while (dlimit - daddr)
+ *((uint8_t *)daddr++) = *((uint8_t *)saddr++);
+
+ /*
+ * Finally, commit the reserved space in the destination buffer.
+ */
+ dest->dtb_offset = offs + src->dtb_offset;
+
+out:
+ /*
+ * If we're lucky enough to be the only active CPU on this speculation
+ * buffer, we can just set the state back to DTRACESPEC_INACTIVE.
+ */
+ if (curstate == DTRACESPEC_ACTIVE ||
+ (curstate == DTRACESPEC_ACTIVEONE && new == DTRACESPEC_COMMITTING)) {
+ uint32_t rval = dtrace_cas32((uint32_t *)&spec->dtsp_state,
+ DTRACESPEC_COMMITTING, DTRACESPEC_INACTIVE);
+
+ ASSERT(rval == DTRACESPEC_COMMITTING);
+ }
+
+ src->dtb_offset = 0;
+ src->dtb_xamot_drops += src->dtb_drops;
+ src->dtb_drops = 0;
+}
+
+/*
+ * This routine discards an active speculation. If the specified speculation
+ * is not in a valid state to perform a discard(), this routine will silently
+ * do nothing. The state of the specified speculation is transitioned
+ * according to the state transition diagram outlined in <sys/dtrace_impl.h>.
+ */
+static void
+dtrace_speculation_discard(dtrace_state_t *state, processorid_t cpu,
+ dtrace_specid_t which)
+{
+ dtrace_speculation_t *spec;
+ dtrace_speculation_state_t curstate, new = 0;
+ dtrace_buffer_t *buf;
+
+ if (which == 0)
+ return;
+
+ if (which > state->dts_nspeculations) {
+ cpu_core[cpu].cpuc_dtrace_flags |= CPU_DTRACE_ILLOP;
+ return;
+ }
+
+ spec = &state->dts_speculations[which - 1];
+ buf = &spec->dtsp_buffer[cpu];
+
+ do {
+ curstate = spec->dtsp_state;
+
+ switch (curstate) {
+ case DTRACESPEC_INACTIVE:
+ case DTRACESPEC_COMMITTINGMANY:
+ case DTRACESPEC_COMMITTING:
+ case DTRACESPEC_DISCARDING:
+ return;
+
+ case DTRACESPEC_ACTIVE:
+ case DTRACESPEC_ACTIVEMANY:
+ new = DTRACESPEC_DISCARDING;
+ break;
+
+ case DTRACESPEC_ACTIVEONE:
+ if (buf->dtb_offset != 0) {
+ new = DTRACESPEC_INACTIVE;
+ } else {
+ new = DTRACESPEC_DISCARDING;
+ }
+ break;
+
+ default:
+ ASSERT(0);
+ }
+ } while (dtrace_cas32((uint32_t *)&spec->dtsp_state,
+ curstate, new) != curstate);
+
+ buf->dtb_offset = 0;
+ buf->dtb_drops = 0;
+}
+
+/*
+ * Note: not called from probe context. This function is called
+ * asynchronously from cross call context to clean any speculations that are
+ * in the COMMITTINGMANY or DISCARDING states. These speculations may not be
+ * transitioned back to the INACTIVE state until all CPUs have cleaned the
+ * speculation.
+ */
+static void
+dtrace_speculation_clean_here(dtrace_state_t *state)
+{
+ dtrace_icookie_t cookie;
+ processorid_t cpu = curcpu;
+ dtrace_buffer_t *dest = &state->dts_buffer[cpu];
+ dtrace_specid_t i;
+
+ cookie = dtrace_interrupt_disable();
+
+ if (dest->dtb_tomax == NULL) {
+ dtrace_interrupt_enable(cookie);
+ return;
+ }
+
+ for (i = 0; i < state->dts_nspeculations; i++) {
+ dtrace_speculation_t *spec = &state->dts_speculations[i];
+ dtrace_buffer_t *src = &spec->dtsp_buffer[cpu];
+
+ if (src->dtb_tomax == NULL)
+ continue;
+
+ if (spec->dtsp_state == DTRACESPEC_DISCARDING) {
+ src->dtb_offset = 0;
+ continue;
+ }
+
+ if (spec->dtsp_state != DTRACESPEC_COMMITTINGMANY)
+ continue;
+
+ if (src->dtb_offset == 0)
+ continue;
+
+ dtrace_speculation_commit(state, cpu, i + 1);
+ }
+
+ dtrace_interrupt_enable(cookie);
+}
+
+/*
+ * Note: not called from probe context. This function is called
+ * asynchronously (and at a regular interval) to clean any speculations that
+ * are in the COMMITTINGMANY or DISCARDING states. If it discovers that there
+ * is work to be done, it cross calls all CPUs to perform that work;
+ * COMMITTINGMANY and DISCARDING speculations may not be transitioned back
+ * to the INACTIVE state until they have been cleaned by all CPUs.
+ */
+static void
+dtrace_speculation_clean(dtrace_state_t *state)
+{
+ int work = 0, rv;
+ dtrace_specid_t i;
+
+ for (i = 0; i < state->dts_nspeculations; i++) {
+ dtrace_speculation_t *spec = &state->dts_speculations[i];
+
+ ASSERT(!spec->dtsp_cleaning);
+
+ if (spec->dtsp_state != DTRACESPEC_DISCARDING &&
+ spec->dtsp_state != DTRACESPEC_COMMITTINGMANY)
+ continue;
+
+ work++;
+ spec->dtsp_cleaning = 1;
+ }
+
+ if (!work)
+ return;
+
+ dtrace_xcall(DTRACE_CPUALL,
+ (dtrace_xcall_t)dtrace_speculation_clean_here, state);
+
+ /*
+ * We now know that all CPUs have committed or discarded their
+ * speculation buffers, as appropriate. We can now set the state
+ * to inactive.
+ */
+ for (i = 0; i < state->dts_nspeculations; i++) {
+ dtrace_speculation_t *spec = &state->dts_speculations[i];
+ dtrace_speculation_state_t curstate, new;
+
+ if (!spec->dtsp_cleaning)
+ continue;
+
+ curstate = spec->dtsp_state;
+ ASSERT(curstate == DTRACESPEC_DISCARDING ||
+ curstate == DTRACESPEC_COMMITTINGMANY);
+
+ new = DTRACESPEC_INACTIVE;
+
+ rv = dtrace_cas32((uint32_t *)&spec->dtsp_state, curstate, new);
+ ASSERT(rv == curstate);
+ spec->dtsp_cleaning = 0;
+ }
+}
+
+/*
+ * Called as part of a speculate() to get the speculative buffer associated
+ * with a given speculation. Returns NULL if the specified speculation is not
+ * in an ACTIVE state. If the speculation is in the ACTIVEONE state -- and
+ * the active CPU is not the specified CPU -- the speculation will be
+ * atomically transitioned into the ACTIVEMANY state.
+ */
+static dtrace_buffer_t *
+dtrace_speculation_buffer(dtrace_state_t *state, processorid_t cpuid,
+ dtrace_specid_t which)
+{
+ dtrace_speculation_t *spec;
+ dtrace_speculation_state_t curstate, new = 0;
+ dtrace_buffer_t *buf;
+
+ if (which == 0)
+ return (NULL);
+
+ if (which > state->dts_nspeculations) {
+ cpu_core[cpuid].cpuc_dtrace_flags |= CPU_DTRACE_ILLOP;
+ return (NULL);
+ }
+
+ spec = &state->dts_speculations[which - 1];
+ buf = &spec->dtsp_buffer[cpuid];
+
+ do {
+ curstate = spec->dtsp_state;
+
+ switch (curstate) {
+ case DTRACESPEC_INACTIVE:
+ case DTRACESPEC_COMMITTINGMANY:
+ case DTRACESPEC_DISCARDING:
+ return (NULL);
+
+ case DTRACESPEC_COMMITTING:
+ ASSERT(buf->dtb_offset == 0);
+ return (NULL);
+
+ case DTRACESPEC_ACTIVEONE:
+ /*
+ * This speculation is currently active on one CPU.
+ * Check the offset in the buffer; if it's non-zero,
+ * that CPU must be us (and we leave the state alone).
+ * If it's zero, assume that we're starting on a new
+ * CPU -- and change the state to indicate that the
+ * speculation is active on more than one CPU.
+ */
+ if (buf->dtb_offset != 0)
+ return (buf);
+
+ new = DTRACESPEC_ACTIVEMANY;
+ break;
+
+ case DTRACESPEC_ACTIVEMANY:
+ return (buf);
+
+ case DTRACESPEC_ACTIVE:
+ new = DTRACESPEC_ACTIVEONE;
+ break;
+
+ default:
+ ASSERT(0);
+ }
+ } while (dtrace_cas32((uint32_t *)&spec->dtsp_state,
+ curstate, new) != curstate);
+
+ ASSERT(new == DTRACESPEC_ACTIVEONE || new == DTRACESPEC_ACTIVEMANY);
+ return (buf);
+}
+
+/*
+ * Return a string. In the event that the user lacks the privilege to access
+ * arbitrary kernel memory, we copy the string out to scratch memory so that we
+ * don't fail access checking.
+ *
+ * dtrace_dif_variable() uses this routine as a helper for various
+ * builtin values such as 'execname' and 'probefunc.'
+ */
+uintptr_t
+dtrace_dif_varstr(uintptr_t addr, dtrace_state_t *state,
+ dtrace_mstate_t *mstate)
+{
+ uint64_t size = state->dts_options[DTRACEOPT_STRSIZE];
+ uintptr_t ret;
+ size_t strsz;
+
+ /*
+ * The easy case: this probe is allowed to read all of memory, so
+ * we can just return this as a vanilla pointer.
+ */
+ if ((mstate->dtms_access & DTRACE_ACCESS_KERNEL) != 0)
+ return (addr);
+
+ /*
+ * This is the tougher case: we copy the string in question from
+ * kernel memory into scratch memory and return it that way: this
+ * ensures that we won't trip up when access checking tests the
+ * BYREF return value.
+ */
+ strsz = dtrace_strlen((char *)addr, size) + 1;
+
+ if (mstate->dtms_scratch_ptr + strsz >
+ mstate->dtms_scratch_base + mstate->dtms_scratch_size) {
+ DTRACE_CPUFLAG_SET(CPU_DTRACE_NOSCRATCH);
+ return (0);
+ }
+
+ dtrace_strcpy((const void *)addr, (void *)mstate->dtms_scratch_ptr,
+ strsz);
+ ret = mstate->dtms_scratch_ptr;
+ mstate->dtms_scratch_ptr += strsz;
+ return (ret);
+}
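+
+/*
+ * Illustration (editor's sketch, not part of the original change): the
+ * scratch handling above -- bounds-check against the end of the region,
+ * copy, then advance the scratch pointer -- is a plain bump allocator.
+ * In user-space terms (hypothetical names, assuming <string.h>):
+ */
+static char *
+sketch_scratch_strdup(char *base, size_t size, size_t *offp, const char *s)
+{
+ size_t len = strlen(s) + 1;
+ char *ret;
+
+ if (*offp + len > size)
+ return (NULL); /* out of scratch space */
+
+ ret = base + *offp;
+ memcpy(ret, s, len); /* copy including the terminating NUL */
+ *offp += len; /* bump past the allocation */
+ return (ret);
+}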
+
+/*
+ * Return a string from a memory address which is known to hold one or
+ * more concatenated, individually zero-terminated sub-strings.
+ * In the event that the user lacks the privilege to access
+ * arbitrary kernel memory, we copy the string out to scratch memory so that we
+ * don't fail access checking.
+ *
+ * dtrace_dif_variable() uses this routine as a helper for various
+ * builtin values such as 'execargs'.
+ */
+static uintptr_t
+dtrace_dif_varstrz(uintptr_t addr, size_t strsz, dtrace_state_t *state,
+ dtrace_mstate_t *mstate)
+{
+ char *p;
+ size_t i;
+ uintptr_t ret;
+
+ if (mstate->dtms_scratch_ptr + strsz >
+ mstate->dtms_scratch_base + mstate->dtms_scratch_size) {
+ DTRACE_CPUFLAG_SET(CPU_DTRACE_NOSCRATCH);
+ return (0);
+ }
+
+ dtrace_bcopy((const void *)addr, (void *)mstate->dtms_scratch_ptr,
+ strsz);
+
+ /* Replace sub-string termination characters with a space. */
+ for (p = (char *) mstate->dtms_scratch_ptr, i = 0; i < strsz - 1;
+ p++, i++)
+ if (*p == '\0')
+ *p = ' ';
+
+ ret = mstate->dtms_scratch_ptr;
+ mstate->dtms_scratch_ptr += strsz;
+ return (ret);
+}
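+
+/*
+ * Illustration (editor's note, not part of the original change): for
+ * 'execargs' the loop above turns a packed argument vector such as
+ *
+ * "ls" NUL "-l" NUL "/tmp" NUL
+ *
+ * into the single displayable string "ls -l /tmp" -- every interior NUL
+ * becomes a space and only the final terminator survives.
+ */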
+
+/*
+ * This function implements the DIF emulator's variable lookups. The emulator
+ * passes a reserved variable identifier and optional built-in array index.
+ */
+static uint64_t
+dtrace_dif_variable(dtrace_mstate_t *mstate, dtrace_state_t *state, uint64_t v,
+ uint64_t ndx)
+{
+ /*
+ * If we're accessing one of the uncached arguments, we'll turn this
+ * into a reference in the args array.
+ */
+ if (v >= DIF_VAR_ARG0 && v <= DIF_VAR_ARG9) {
+ ndx = v - DIF_VAR_ARG0;
+ v = DIF_VAR_ARGS;
+ }
+
+ switch (v) {
+ case DIF_VAR_ARGS:
+ ASSERT(mstate->dtms_present & DTRACE_MSTATE_ARGS);
+ if (ndx >= sizeof (mstate->dtms_arg) /
+ sizeof (mstate->dtms_arg[0])) {
+ int aframes = mstate->dtms_probe->dtpr_aframes + 2;
+ dtrace_provider_t *pv;
+ uint64_t val;
+
+ pv = mstate->dtms_probe->dtpr_provider;
+ if (pv->dtpv_pops.dtps_getargval != NULL)
+ val = pv->dtpv_pops.dtps_getargval(pv->dtpv_arg,
+ mstate->dtms_probe->dtpr_id,
+ mstate->dtms_probe->dtpr_arg, ndx, aframes);
+ else
+ val = dtrace_getarg(ndx, aframes);
+
+ /*
+ * This is regrettably required to keep the compiler
+ * from tail-optimizing the call to dtrace_getarg().
+ * The condition always evaluates to true, but the
+ * compiler has no way of figuring that out a priori.
+ * (None of this would be necessary if the compiler
+ * could be relied upon to _always_ tail-optimize
+ * the call to dtrace_getarg() -- but it can't.)
+ */
+ if (mstate->dtms_probe != NULL)
+ return (val);
+
+ ASSERT(0);
+ }
+
+ return (mstate->dtms_arg[ndx]);
+
+#ifdef illumos
+ case DIF_VAR_UREGS: {
+ klwp_t *lwp;
+
+ if (!dtrace_priv_proc(state))
+ return (0);
+
+ if ((lwp = curthread->t_lwp) == NULL) {
+ DTRACE_CPUFLAG_SET(CPU_DTRACE_BADADDR);
+ cpu_core[curcpu].cpuc_dtrace_illval = 0;
+ return (0);
+ }
+
+ return (dtrace_getreg(lwp->lwp_regs, ndx));
+ }
+#else
+ case DIF_VAR_UREGS: {
+ struct trapframe *tframe;
+
+ if (!dtrace_priv_proc(state))
+ return (0);
+
+ if ((tframe = curthread->td_frame) == NULL) {
+ DTRACE_CPUFLAG_SET(CPU_DTRACE_BADADDR);
+ cpu_core[curcpu].cpuc_dtrace_illval = 0;
+ return (0);
+ }
+
+ return (dtrace_getreg(tframe, ndx));
+ }
+#endif
+
+ case DIF_VAR_CURTHREAD:
+ if (!dtrace_priv_proc(state))
+ return (0);
+ return ((uint64_t)(uintptr_t)curthread);
+
+ case DIF_VAR_TIMESTAMP:
+ if (!(mstate->dtms_present & DTRACE_MSTATE_TIMESTAMP)) {
+ mstate->dtms_timestamp = dtrace_gethrtime();
+ mstate->dtms_present |= DTRACE_MSTATE_TIMESTAMP;
+ }
+ return (mstate->dtms_timestamp);
+
+ case DIF_VAR_VTIMESTAMP:
+ ASSERT(dtrace_vtime_references != 0);
+ return (curthread->t_dtrace_vtime);
+
+ case DIF_VAR_WALLTIMESTAMP:
+ if (!(mstate->dtms_present & DTRACE_MSTATE_WALLTIMESTAMP)) {
+ mstate->dtms_walltimestamp = dtrace_gethrestime();
+ mstate->dtms_present |= DTRACE_MSTATE_WALLTIMESTAMP;
+ }
+ return (mstate->dtms_walltimestamp);
+
+#ifdef illumos
+ case DIF_VAR_IPL:
+ if (!dtrace_priv_kernel(state))
+ return (0);
+ if (!(mstate->dtms_present & DTRACE_MSTATE_IPL)) {
+ mstate->dtms_ipl = dtrace_getipl();
+ mstate->dtms_present |= DTRACE_MSTATE_IPL;
+ }
+ return (mstate->dtms_ipl);
+#endif
+
+ case DIF_VAR_EPID:
+ ASSERT(mstate->dtms_present & DTRACE_MSTATE_EPID);
+ return (mstate->dtms_epid);
+
+ case DIF_VAR_ID:
+ ASSERT(mstate->dtms_present & DTRACE_MSTATE_PROBE);
+ return (mstate->dtms_probe->dtpr_id);
+
+ case DIF_VAR_STACKDEPTH:
+ if (!dtrace_priv_kernel(state))
+ return (0);
+ if (!(mstate->dtms_present & DTRACE_MSTATE_STACKDEPTH)) {
+ int aframes = mstate->dtms_probe->dtpr_aframes + 2;
+
+ mstate->dtms_stackdepth = dtrace_getstackdepth(aframes);
+ mstate->dtms_present |= DTRACE_MSTATE_STACKDEPTH;
+ }
+ return (mstate->dtms_stackdepth);
+
+ case DIF_VAR_USTACKDEPTH:
+ if (!dtrace_priv_proc(state))
+ return (0);
+ if (!(mstate->dtms_present & DTRACE_MSTATE_USTACKDEPTH)) {
+ /*
+ * See comment in DIF_VAR_PID.
+ */
+ if (DTRACE_ANCHORED(mstate->dtms_probe) &&
+ CPU_ON_INTR(CPU)) {
+ mstate->dtms_ustackdepth = 0;
+ } else {
+ DTRACE_CPUFLAG_SET(CPU_DTRACE_NOFAULT);
+ mstate->dtms_ustackdepth =
+ dtrace_getustackdepth();
+ DTRACE_CPUFLAG_CLEAR(CPU_DTRACE_NOFAULT);
+ }
+ mstate->dtms_present |= DTRACE_MSTATE_USTACKDEPTH;
+ }
+ return (mstate->dtms_ustackdepth);
+
+ case DIF_VAR_CALLER:
+ if (!dtrace_priv_kernel(state))
+ return (0);
+ if (!(mstate->dtms_present & DTRACE_MSTATE_CALLER)) {
+ int aframes = mstate->dtms_probe->dtpr_aframes + 2;
+
+ if (!DTRACE_ANCHORED(mstate->dtms_probe)) {
+ /*
+ * If this is an unanchored probe, we are
+ * required to go through the slow path:
+ * dtrace_caller() only guarantees correct
+ * results for anchored probes.
+ */
+ pc_t caller[2] = {0, 0};
+
+ dtrace_getpcstack(caller, 2, aframes,
+ (uint32_t *)(uintptr_t)mstate->dtms_arg[0]);
+ mstate->dtms_caller = caller[1];
+ } else if ((mstate->dtms_caller =
+ dtrace_caller(aframes)) == -1) {
+ /*
+ * We have failed to do this the quick way;
+ * we must resort to the slower approach of
+ * calling dtrace_getpcstack().
+ */
+ pc_t caller = 0;
+
+ dtrace_getpcstack(&caller, 1, aframes, NULL);
+ mstate->dtms_caller = caller;
+ }
+
+ mstate->dtms_present |= DTRACE_MSTATE_CALLER;
+ }
+ return (mstate->dtms_caller);
+
+ case DIF_VAR_UCALLER:
+ if (!dtrace_priv_proc(state))
+ return (0);
+
+ if (!(mstate->dtms_present & DTRACE_MSTATE_UCALLER)) {
+ uint64_t ustack[3];
+
+ /*
+ * dtrace_getupcstack() fills in the first uint64_t
+ * with the current PID. The second uint64_t will
+ * be the program counter at user-level. The third
+ * uint64_t will contain the caller, which is what
+ * we're after.
+ */
+ ustack[2] = 0;
+ DTRACE_CPUFLAG_SET(CPU_DTRACE_NOFAULT);
+ dtrace_getupcstack(ustack, 3);
+ DTRACE_CPUFLAG_CLEAR(CPU_DTRACE_NOFAULT);
+ mstate->dtms_ucaller = ustack[2];
+ mstate->dtms_present |= DTRACE_MSTATE_UCALLER;
+ }
+
+ return (mstate->dtms_ucaller);
+
+ case DIF_VAR_PROBEPROV:
+ ASSERT(mstate->dtms_present & DTRACE_MSTATE_PROBE);
+ return (dtrace_dif_varstr(
+ (uintptr_t)mstate->dtms_probe->dtpr_provider->dtpv_name,
+ state, mstate));
+
+ case DIF_VAR_PROBEMOD:
+ ASSERT(mstate->dtms_present & DTRACE_MSTATE_PROBE);
+ return (dtrace_dif_varstr(
+ (uintptr_t)mstate->dtms_probe->dtpr_mod,
+ state, mstate));
+
+ case DIF_VAR_PROBEFUNC:
+ ASSERT(mstate->dtms_present & DTRACE_MSTATE_PROBE);
+ return (dtrace_dif_varstr(
+ (uintptr_t)mstate->dtms_probe->dtpr_func,
+ state, mstate));
+
+ case DIF_VAR_PROBENAME:
+ ASSERT(mstate->dtms_present & DTRACE_MSTATE_PROBE);
+ return (dtrace_dif_varstr(
+ (uintptr_t)mstate->dtms_probe->dtpr_name,
+ state, mstate));
+
+ case DIF_VAR_PID:
+ if (!dtrace_priv_proc(state))
+ return (0);
+
+#ifdef illumos
+ /*
+ * Note that we are assuming that an unanchored probe is
+ * always due to a high-level interrupt. (And we're assuming
+ * that there is only a single high level interrupt.)
+ */
+ if (DTRACE_ANCHORED(mstate->dtms_probe) && CPU_ON_INTR(CPU))
+ return (pid0.pid_id);
+
+ /*
+ * It is always safe to dereference one's own t_procp pointer:
+ * it always points to a valid, allocated proc structure.
+ * Further, it is always safe to dereference the p_pidp member
+ * of one's own proc structure. (These are truisms because
+ * threads and processes don't clean up their own state --
+ * they leave that task to whoever reaps them.)
+ */
+ return ((uint64_t)curthread->t_procp->p_pidp->pid_id);
+#else
+ return ((uint64_t)curproc->p_pid);
+#endif
+
+ case DIF_VAR_PPID:
+ if (!dtrace_priv_proc(state))
+ return (0);
+
+#ifdef illumos
+ /*
+ * See comment in DIF_VAR_PID.
+ */
+ if (DTRACE_ANCHORED(mstate->dtms_probe) && CPU_ON_INTR(CPU))
+ return (pid0.pid_id);
+
+ /*
+ * It is always safe to dereference one's own t_procp pointer:
+ * it always points to a valid, allocated proc structure.
+ * (This is true because threads don't clean up their own
+ * state -- they leave that task to whoever reaps them.)
+ */
+ return ((uint64_t)curthread->t_procp->p_ppid);
+#else
+ if (curproc->p_pid == proc0.p_pid)
+ return (curproc->p_pid);
+ else
+ return (curproc->p_pptr->p_pid);
+#endif
+
+ case DIF_VAR_TID:
+#ifdef illumos
+ /*
+ * See comment in DIF_VAR_PID.
+ */
+ if (DTRACE_ANCHORED(mstate->dtms_probe) && CPU_ON_INTR(CPU))
+ return (0);
+#endif
+
+ return ((uint64_t)curthread->t_tid);
+
+ case DIF_VAR_EXECARGS: {
+ struct pargs *p_args = curthread->td_proc->p_args;
+
+ if (p_args == NULL)
+ return (0);
+
+ return (dtrace_dif_varstrz(
+ (uintptr_t)p_args->ar_args, p_args->ar_length, state, mstate));
+ }
+
+ case DIF_VAR_EXECNAME:
+#ifdef illumos
+ if (!dtrace_priv_proc(state))
+ return (0);
+
+ /*
+ * See comment in DIF_VAR_PID.
+ */
+ if (DTRACE_ANCHORED(mstate->dtms_probe) && CPU_ON_INTR(CPU))
+ return ((uint64_t)(uintptr_t)p0.p_user.u_comm);
+
+ /*
+ * It is always safe to dereference one's own t_procp pointer:
+ * it always points to a valid, allocated proc structure.
+ * (This is true because threads don't clean up their own
+ * state -- they leave that task to whoever reaps them.)
+ */
+ return (dtrace_dif_varstr(
+ (uintptr_t)curthread->t_procp->p_user.u_comm,
+ state, mstate));
+#else
+ return (dtrace_dif_varstr(
+ (uintptr_t)curthread->td_proc->p_comm, state, mstate));
+#endif
+
+ case DIF_VAR_ZONENAME:
+#ifdef illumos
+ if (!dtrace_priv_proc(state))
+ return (0);
+
+ /*
+ * See comment in DIF_VAR_PID.
+ */
+ if (DTRACE_ANCHORED(mstate->dtms_probe) && CPU_ON_INTR(CPU))
+ return ((uint64_t)(uintptr_t)p0.p_zone->zone_name);
+
+ /*
+ * It is always safe to dereference one's own t_procp pointer:
+ * it always points to a valid, allocated proc structure.
+ * (This is true because threads don't clean up their own
+ * state -- they leave that task to whoever reaps them.)
+ */
+ return (dtrace_dif_varstr(
+ (uintptr_t)curthread->t_procp->p_zone->zone_name,
+ state, mstate));
+#elif defined(__FreeBSD__)
+ /*
+ * On FreeBSD, we provide compatibility for zonename by falling
+ * through to jailname.
+ */
+ case DIF_VAR_JAILNAME:
+ if (!dtrace_priv_kernel(state))
+ return (0);
+
+ return (dtrace_dif_varstr(
+ (uintptr_t)curthread->td_ucred->cr_prison->pr_name,
+ state, mstate));
+
+ case DIF_VAR_JID:
+ if (!dtrace_priv_kernel(state))
+ return (0);
+
+ return ((uint64_t)curthread->td_ucred->cr_prison->pr_id);
+#else
+ return (0);
+#endif
+
+ case DIF_VAR_UID:
+ if (!dtrace_priv_proc(state))
+ return (0);
+
+#ifdef illumos
+ /*
+ * See comment in DIF_VAR_PID.
+ */
+ if (DTRACE_ANCHORED(mstate->dtms_probe) && CPU_ON_INTR(CPU))
+ return ((uint64_t)p0.p_cred->cr_uid);
+
+ /*
+ * It is always safe to dereference one's own t_procp pointer:
+ * it always points to a valid, allocated proc structure.
+ * (This is true because threads don't clean up their own
+ * state -- they leave that task to whoever reaps them.)
+ *
+ * Additionally, it is safe to dereference one's own process
+ * credential, since this is never NULL after process birth.
+ */
+ return ((uint64_t)curthread->t_procp->p_cred->cr_uid);
+#else
+ return ((uint64_t)curthread->td_ucred->cr_uid);
+#endif
+
+ case DIF_VAR_GID:
+ if (!dtrace_priv_proc(state))
+ return (0);
+
+#ifdef illumos
+ /*
+ * See comment in DIF_VAR_PID.
+ */
+ if (DTRACE_ANCHORED(mstate->dtms_probe) && CPU_ON_INTR(CPU))
+ return ((uint64_t)p0.p_cred->cr_gid);
+
+ /*
+ * It is always safe to dereference one's own t_procp pointer:
+ * it always points to a valid, allocated proc structure.
+ * (This is true because threads don't clean up their own
+ * state -- they leave that task to whoever reaps them.)
+ *
+ * Additionally, it is safe to dereference one's own process
+ * credential, since this is never NULL after process birth.
+ */
+ return ((uint64_t)curthread->t_procp->p_cred->cr_gid);
+#else
+ return ((uint64_t)curthread->td_ucred->cr_gid);
+#endif
+
+ case DIF_VAR_ERRNO: {
+#ifdef illumos
+ klwp_t *lwp;
+ if (!dtrace_priv_proc(state))
+ return (0);
+
+ /*
+ * See comment in DIF_VAR_PID.
+ */
+ if (DTRACE_ANCHORED(mstate->dtms_probe) && CPU_ON_INTR(CPU))
+ return (0);
+
+ /*
+ * It is always safe to dereference one's own t_lwp pointer in
+ * the event that this pointer is non-NULL. (This is true
+ * because threads and lwps don't clean up their own state --
+ * they leave that task to whoever reaps them.)
+ */
+ if ((lwp = curthread->t_lwp) == NULL)
+ return (0);
+
+ return ((uint64_t)lwp->lwp_errno);
+#else
+ return (curthread->td_errno);
+#endif
+ }
+#ifndef illumos
+ case DIF_VAR_CPU: {
+ return (curcpu);
+ }
+#endif
+ default:
+ DTRACE_CPUFLAG_SET(CPU_DTRACE_ILLOP);
+ return (0);
+ }
+}
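+
+/*
+ * Illustration (editor's sketch, not part of the original change):
+ * several of the variables above (timestamp, stackdepth, caller, ...)
+ * are memoized per probe firing: a bit in dtms_present records whether
+ * the value has been computed, so repeated references within one clause
+ * reuse the cached copy. The shape, with hypothetical names:
+ */
+static uint64_t
+sketch_cached_lookup(uint64_t *presentp, uint64_t bit, uint64_t *cachep,
+ uint64_t (*compute)(void))
+{
+ if (!(*presentp & bit)) {
+ *cachep = compute(); /* computed at most once per firing */
+ *presentp |= bit;
+ }
+ return (*cachep);
+}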
+
+typedef enum dtrace_json_state {
+ DTRACE_JSON_REST = 1,
+ DTRACE_JSON_OBJECT,
+ DTRACE_JSON_STRING,
+ DTRACE_JSON_STRING_ESCAPE,
+ DTRACE_JSON_STRING_ESCAPE_UNICODE,
+ DTRACE_JSON_COLON,
+ DTRACE_JSON_COMMA,
+ DTRACE_JSON_VALUE,
+ DTRACE_JSON_IDENTIFIER,
+ DTRACE_JSON_NUMBER,
+ DTRACE_JSON_NUMBER_FRAC,
+ DTRACE_JSON_NUMBER_EXP,
+ DTRACE_JSON_COLLECT_OBJECT
+} dtrace_json_state_t;
+
+/*
+ * This function possesses just enough knowledge about JSON to extract a single
+ * value from a JSON string and store it in the scratch buffer. It is able
+ * to extract nested object values, and members of arrays by index.
+ *
+ * elemlist is a list of JSON keys, stored as packed NUL-terminated strings, to
+ * be looked up as we descend into the object tree, e.g.
+ *
+ * foo[0].bar.baz[32] --> "foo" NUL "0" NUL "bar" NUL "baz" NUL "32" NUL
+ * with nelems = 5.
+ *
+ * The run time of this function must be bounded above by strsize to limit the
+ * amount of work done in probe context. As such, it is implemented as a
+ * simple state machine, reading one character at a time using safe loads
+ * until we find the requested element, hit a parsing error or run off the
+ * end of the object or string.
+ *
+ * As there is no way for a subroutine to return an error without interrupting
+ * clause execution, we simply return NULL in the event of a missing key or any
+ * other error condition. Each NULL return in this function is commented with
+ * the error condition it represents -- parsing or otherwise.
+ *
+ * The set of states for the state machine closely matches the JSON
+ * specification (http://json.org/). Briefly:
+ *
+ * DTRACE_JSON_REST:
+ * Skip whitespace until we find either a top-level Object, moving
+ * to DTRACE_JSON_OBJECT; or an Array, moving to DTRACE_JSON_VALUE.
+ *
+ * DTRACE_JSON_OBJECT:
+ * Locate the next key String in an Object. Sets a flag to denote
+ * the next String as a key string and moves to DTRACE_JSON_STRING.
+ *
+ * DTRACE_JSON_COLON:
+ * Skip whitespace until we find the colon that separates key Strings
+ * from their values. Once found, move to DTRACE_JSON_VALUE.
+ *
+ * DTRACE_JSON_VALUE:
+ * Detects the type of the next value (String, Number, Identifier, Object
+ * or Array) and routes to the states that process that type. Here we also
+ * deal with the element selector list if we are requested to traverse down
+ * into the object tree.
+ *
+ * DTRACE_JSON_COMMA:
+ * Skip whitespace until we find the comma that separates key-value pairs
+ * in Objects (returning to DTRACE_JSON_OBJECT) or values in Arrays
+ * (similarly DTRACE_JSON_VALUE). All following literal value processing
+ * states return to this state at the end of their value, unless otherwise
+ * noted.
+ *
+ * DTRACE_JSON_NUMBER, DTRACE_JSON_NUMBER_FRAC, DTRACE_JSON_NUMBER_EXP:
+ * Processes a Number literal from the JSON, including any exponent
+ * component that may be present. Numbers are returned as strings, which
+ * may be passed to strtoll() if an integer is required.
+ *
+ * DTRACE_JSON_IDENTIFIER:
+ * Processes a "true", "false" or "null" literal in the JSON.
+ *
+ * DTRACE_JSON_STRING, DTRACE_JSON_STRING_ESCAPE,
+ * DTRACE_JSON_STRING_ESCAPE_UNICODE:
+ * Processes a String literal from the JSON, whether the String denotes
+ * a key, a value or part of a larger Object. Handles all escape sequences
+ * present in the specification, including four-digit unicode characters,
+ * but merely includes the escape sequence without converting it to the
+ * actual escaped character. If the String is flagged as a key, we
+ * move to DTRACE_JSON_COLON rather than DTRACE_JSON_COMMA.
+ *
+ * DTRACE_JSON_COLLECT_OBJECT:
+ * This state collects an entire Object (or Array), correctly handling
+ * embedded strings. If the full element selector list matches this nested
+ * object, we return the Object in full as a string. If not, we use this
+ * state to skip to the next value at this level and continue processing.
+ *
+ * NOTE: This function uses various macros from strtolctype.h to manipulate
+ * digit values, etc. -- these have all been checked to ensure they make
+ * no additional function calls.
+ */
+static char *
+dtrace_json(uint64_t size, uintptr_t json, char *elemlist, int nelems,
+ char *dest)
+{
+ dtrace_json_state_t state = DTRACE_JSON_REST;
+ int64_t array_elem = INT64_MIN;
+ int64_t array_pos = 0;
+ uint8_t escape_unicount = 0;
+ boolean_t string_is_key = B_FALSE;
+ boolean_t collect_object = B_FALSE;
+ boolean_t found_key = B_FALSE;
+ boolean_t in_array = B_FALSE;
+ uint32_t braces = 0, brackets = 0;
+ char *elem = elemlist;
+ char *dd = dest;
+ uintptr_t cur;
+
+ for (cur = json; cur < json + size; cur++) {
+ char cc = dtrace_load8(cur);
+ if (cc == '\0')
+ return (NULL);
+
+ switch (state) {
+ case DTRACE_JSON_REST:
+ if (isspace(cc))
+ break;
+
+ if (cc == '{') {
+ state = DTRACE_JSON_OBJECT;
+ break;
+ }
+
+ if (cc == '[') {
+ in_array = B_TRUE;
+ array_pos = 0;
+ array_elem = dtrace_strtoll(elem, 10, size);
+ found_key = array_elem == 0 ? B_TRUE : B_FALSE;
+ state = DTRACE_JSON_VALUE;
+ break;
+ }
+
+ /*
+ * ERROR: expected to find a top-level object or array.
+ */
+ return (NULL);
+ case DTRACE_JSON_OBJECT:
+ if (isspace(cc))
+ break;
+
+ if (cc == '"') {
+ state = DTRACE_JSON_STRING;
+ string_is_key = B_TRUE;
+ break;
+ }
+
+ /*
+ * ERROR: either the object did not start with a key
+ * string, or we've run off the end of the object
+ * without finding the requested key.
+ */
+ return (NULL);
+ case DTRACE_JSON_STRING:
+ if (cc == '\\') {
+ *dd++ = '\\';
+ state = DTRACE_JSON_STRING_ESCAPE;
+ break;
+ }
+
+ if (cc == '"') {
+ if (collect_object) {
+ /*
+ * We don't reset the dest here, as
+ * the string is part of a larger
+ * object being collected.
+ */
+ *dd++ = cc;
+ collect_object = B_FALSE;
+ state = DTRACE_JSON_COLLECT_OBJECT;
+ break;
+ }
+ *dd = '\0';
+ dd = dest; /* reset string buffer */
+ if (string_is_key) {
+ if (dtrace_strncmp(dest, elem,
+ size) == 0)
+ found_key = B_TRUE;
+ } else if (found_key) {
+ if (nelems > 1) {
+ /*
+ * We expected an object, not
+ * this string.
+ */
+ return (NULL);
+ }
+ return (dest);
+ }
+ state = string_is_key ? DTRACE_JSON_COLON :
+ DTRACE_JSON_COMMA;
+ string_is_key = B_FALSE;
+ break;
+ }
+
+ *dd++ = cc;
+ break;
+ case DTRACE_JSON_STRING_ESCAPE:
+ *dd++ = cc;
+ if (cc == 'u') {
+ escape_unicount = 0;
+ state = DTRACE_JSON_STRING_ESCAPE_UNICODE;
+ } else {
+ state = DTRACE_JSON_STRING;
+ }
+ break;
+ case DTRACE_JSON_STRING_ESCAPE_UNICODE:
+ if (!isxdigit(cc)) {
+ /*
+ * ERROR: invalid unicode escape, expected
+ * four valid hexadecimal digits.
+ */
+ return (NULL);
+ }
+
+ *dd++ = cc;
+ if (++escape_unicount == 4)
+ state = DTRACE_JSON_STRING;
+ break;
+ case DTRACE_JSON_COLON:
+ if (isspace(cc))
+ break;
+
+ if (cc == ':') {
+ state = DTRACE_JSON_VALUE;
+ break;
+ }
+
+ /*
+ * ERROR: expected a colon.
+ */
+ return (NULL);
+ case DTRACE_JSON_COMMA:
+ if (isspace(cc))
+ break;
+
+ if (cc == ',') {
+ if (in_array) {
+ state = DTRACE_JSON_VALUE;
+ if (++array_pos == array_elem)
+ found_key = B_TRUE;
+ } else {
+ state = DTRACE_JSON_OBJECT;
+ }
+ break;
+ }
+
+ /*
+ * ERROR: either we hit an unexpected character, or
+ * we reached the end of the object or array without
+ * finding the requested key.
+ */
+ return (NULL);
+ case DTRACE_JSON_IDENTIFIER:
+ if (islower(cc)) {
+ *dd++ = cc;
+ break;
+ }
+
+ *dd = '\0';
+ dd = dest; /* reset string buffer */
+
+ if (dtrace_strncmp(dest, "true", 5) == 0 ||
+ dtrace_strncmp(dest, "false", 6) == 0 ||
+ dtrace_strncmp(dest, "null", 5) == 0) {
+ if (found_key) {
+ if (nelems > 1) {
+ /*
+ * ERROR: We expected an object,
+ * not this identifier.
+ */
+ return (NULL);
+ }
+ return (dest);
+ } else {
+ cur--;
+ state = DTRACE_JSON_COMMA;
+ break;
+ }
+ }
+
+ /*
+ * ERROR: we did not recognise the identifier as one
+ * of those in the JSON specification.
+ */
+ return (NULL);
+ case DTRACE_JSON_NUMBER:
+ if (cc == '.') {
+ *dd++ = cc;
+ state = DTRACE_JSON_NUMBER_FRAC;
+ break;
+ }
+
+ if (cc == 'x' || cc == 'X') {
+ /*
+ * ERROR: specification explicitly excludes
+ * hexadecimal or octal numbers.
+ */
+ return (NULL);
+ }
+
+ /* FALLTHRU */
+ case DTRACE_JSON_NUMBER_FRAC:
+ if (cc == 'e' || cc == 'E') {
+ *dd++ = cc;
+ state = DTRACE_JSON_NUMBER_EXP;
+ break;
+ }
+
+ if (cc == '+' || cc == '-') {
+ /*
+ * ERROR: expect sign as part of exponent only.
+ */
+ return (NULL);
+ }
+ /* FALLTHRU */
+ case DTRACE_JSON_NUMBER_EXP:
+ if (isdigit(cc) || cc == '+' || cc == '-') {
+ *dd++ = cc;
+ break;
+ }
+
+ *dd = '\0';
+ dd = dest; /* reset string buffer */
+ if (found_key) {
+ if (nelems > 1) {
+ /*
+ * ERROR: We expected an object, not
+ * this number.
+ */
+ return (NULL);
+ }
+ return (dest);
+ }
+
+ cur--;
+ state = DTRACE_JSON_COMMA;
+ break;
+ case DTRACE_JSON_VALUE:
+ if (isspace(cc))
+ break;
+
+ if (cc == '{' || cc == '[') {
+ if (nelems > 1 && found_key) {
+ in_array = cc == '[' ? B_TRUE : B_FALSE;
+ /*
+ * If our element selector directs us
+ * to descend into this nested object,
+ * then move to the next selector
+ * element in the list and restart the
+ * state machine.
+ */
+ while (*elem != '\0')
+ elem++;
+ elem++; /* skip the inter-element NUL */
+ nelems--;
+ dd = dest;
+ if (in_array) {
+ state = DTRACE_JSON_VALUE;
+ array_pos = 0;
+ array_elem = dtrace_strtoll(
+ elem, 10, size);
+ found_key = array_elem == 0 ?
+ B_TRUE : B_FALSE;
+ } else {
+ found_key = B_FALSE;
+ state = DTRACE_JSON_OBJECT;
+ }
+ break;
+ }
+
+ /*
+ * Otherwise, we wish to either skip this
+ * nested object or return it in full.
+ */
+ if (cc == '[')
+ brackets = 1;
+ else
+ braces = 1;
+ *dd++ = cc;
+ state = DTRACE_JSON_COLLECT_OBJECT;
+ break;
+ }
+
+ if (cc == '"') {
+ state = DTRACE_JSON_STRING;
+ break;
+ }
+
+ if (islower(cc)) {
+ /*
+ * Here we deal with true, false and null.
+ */
+ *dd++ = cc;
+ state = DTRACE_JSON_IDENTIFIER;
+ break;
+ }
+
+ if (cc == '-' || isdigit(cc)) {
+ *dd++ = cc;
+ state = DTRACE_JSON_NUMBER;
+ break;
+ }
+
+ /*
+ * ERROR: unexpected character at start of value.
+ */
+ return (NULL);
+ case DTRACE_JSON_COLLECT_OBJECT:
+ if (cc == '\0')
+ /*
+ * ERROR: unexpected end of input.
+ */
+ return (NULL);
+
+ *dd++ = cc;
+ if (cc == '"') {
+ collect_object = B_TRUE;
+ state = DTRACE_JSON_STRING;
+ break;
+ }
+
+ if (cc == ']') {
+ if (brackets-- == 0) {
+ /*
+ * ERROR: unbalanced brackets.
+ */
+ return (NULL);
+ }
+ } else if (cc == '}') {
+ if (braces-- == 0) {
+ /*
+ * ERROR: unbalanced braces.
+ */
+ return (NULL);
+ }
+ } else if (cc == '{') {
+ braces++;
+ } else if (cc == '[') {
+ brackets++;
+ }
+
+ if (brackets == 0 && braces == 0) {
+ if (found_key) {
+ *dd = '\0';
+ return (dest);
+ }
+ dd = dest; /* reset string buffer */
+ state = DTRACE_JSON_COMMA;
+ }
+ break;
+ }
+ }
+ return (NULL);
+}
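+
+/*
+ * Illustration (editor's sketch, not part of the original change): the
+ * packed element list consumed above is produced by rewriting the '.'
+ * and '[' separators of a selector such as "foo[0].bar.baz[32]" as NULs
+ * and dropping the ']' characters -- exactly what the DIF_SUBR_JSON case
+ * below does in probe context. A user-space rendition (the caller must
+ * supply an output buffer of at least strlen(sel) + 1 bytes):
+ */
+static int
+sketch_pack_selector(const char *sel, char *out)
+{
+ const char *cur;
+ int nelems = 1;
+
+ for (cur = sel; *cur != '\0'; cur++) {
+ char cc = *cur;
+
+ if (cur == sel && cc == '[')
+ continue; /* leading array index: drop the '[' */
+
+ if (cc == ']')
+ continue; /* closing brackets carry no data */
+
+ if (cc == '.' || cc == '[') {
+ nelems++; /* each separator starts a new element */
+ cc = '\0';
+ }
+
+ *out++ = cc;
+ }
+ *out = '\0'; /* terminate the final element */
+
+ return (nelems);
+}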
+
+/*
+ * Emulate the execution of DIF subroutines invoked by the call opcode.
+ * Notice that we don't bother validating the proper number of arguments or
+ * their types in the tuple stack. This isn't needed: all argument
+ * interpretation is covered by our load safety, so the worst that can
+ * happen is that a bogus program obtains bogus results.
+ */
+static void
+dtrace_dif_subr(uint_t subr, uint_t rd, uint64_t *regs,
+ dtrace_key_t *tupregs, int nargs,
+ dtrace_mstate_t *mstate, dtrace_state_t *state)
+{
+ volatile uint16_t *flags = &cpu_core[curcpu].cpuc_dtrace_flags;
+ volatile uintptr_t *illval = &cpu_core[curcpu].cpuc_dtrace_illval;
+ dtrace_vstate_t *vstate = &state->dts_vstate;
+
+#ifdef illumos
+ union {
+ mutex_impl_t mi;
+ uint64_t mx;
+ } m;
+
+ union {
+ krwlock_t ri;
+ uintptr_t rw;
+ } r;
+#else
+ struct thread *lowner;
+ union {
+ struct lock_object *li;
+ uintptr_t lx;
+ } l;
+#endif
+
+ switch (subr) {
+ case DIF_SUBR_RAND:
+ regs[rd] = dtrace_xoroshiro128_plus_next(
+ state->dts_rstate[curcpu]);
+ break;
+
+#ifdef illumos
+ case DIF_SUBR_MUTEX_OWNED:
+ if (!dtrace_canload(tupregs[0].dttk_value, sizeof (kmutex_t),
+ mstate, vstate)) {
+ regs[rd] = 0;
+ break;
+ }
+
+ m.mx = dtrace_load64(tupregs[0].dttk_value);
+ if (MUTEX_TYPE_ADAPTIVE(&m.mi))
+ regs[rd] = MUTEX_OWNER(&m.mi) != MUTEX_NO_OWNER;
+ else
+ regs[rd] = LOCK_HELD(&m.mi.m_spin.m_spinlock);
+ break;
+
+ case DIF_SUBR_MUTEX_OWNER:
+ if (!dtrace_canload(tupregs[0].dttk_value, sizeof (kmutex_t),
+ mstate, vstate)) {
+ regs[rd] = 0;
+ break;
+ }
+
+ m.mx = dtrace_load64(tupregs[0].dttk_value);
+ if (MUTEX_TYPE_ADAPTIVE(&m.mi) &&
+ MUTEX_OWNER(&m.mi) != MUTEX_NO_OWNER)
+ regs[rd] = (uintptr_t)MUTEX_OWNER(&m.mi);
+ else
+ regs[rd] = 0;
+ break;
+
+ case DIF_SUBR_MUTEX_TYPE_ADAPTIVE:
+ if (!dtrace_canload(tupregs[0].dttk_value, sizeof (kmutex_t),
+ mstate, vstate)) {
+ regs[rd] = 0;
+ break;
+ }
+
+ m.mx = dtrace_load64(tupregs[0].dttk_value);
+ regs[rd] = MUTEX_TYPE_ADAPTIVE(&m.mi);
+ break;
+
+ case DIF_SUBR_MUTEX_TYPE_SPIN:
+ if (!dtrace_canload(tupregs[0].dttk_value, sizeof (kmutex_t),
+ mstate, vstate)) {
+ regs[rd] = 0;
+ break;
+ }
+
+ m.mx = dtrace_load64(tupregs[0].dttk_value);
+ regs[rd] = MUTEX_TYPE_SPIN(&m.mi);
+ break;
+
+ case DIF_SUBR_RW_READ_HELD: {
+ uintptr_t tmp;
+
+ if (!dtrace_canload(tupregs[0].dttk_value, sizeof (uintptr_t),
+ mstate, vstate)) {
+ regs[rd] = 0;
+ break;
+ }
+
+ r.rw = dtrace_loadptr(tupregs[0].dttk_value);
+ regs[rd] = _RW_READ_HELD(&r.ri, tmp);
+ break;
+ }
+
+ case DIF_SUBR_RW_WRITE_HELD:
+ if (!dtrace_canload(tupregs[0].dttk_value, sizeof (krwlock_t),
+ mstate, vstate)) {
+ regs[rd] = 0;
+ break;
+ }
+
+ r.rw = dtrace_loadptr(tupregs[0].dttk_value);
+ regs[rd] = _RW_WRITE_HELD(&r.ri);
+ break;
+
+ case DIF_SUBR_RW_ISWRITER:
+ if (!dtrace_canload(tupregs[0].dttk_value, sizeof (krwlock_t),
+ mstate, vstate)) {
+ regs[rd] = 0;
+ break;
+ }
+
+ r.rw = dtrace_loadptr(tupregs[0].dttk_value);
+ regs[rd] = _RW_ISWRITER(&r.ri);
+ break;
+
+#else /* !illumos */
+ case DIF_SUBR_MUTEX_OWNED:
+ if (!dtrace_canload(tupregs[0].dttk_value,
+ sizeof (struct lock_object), mstate, vstate)) {
+ regs[rd] = 0;
+ break;
+ }
+ l.lx = dtrace_loadptr((uintptr_t)&tupregs[0].dttk_value);
+ DTRACE_CPUFLAG_SET(CPU_DTRACE_NOFAULT);
+ regs[rd] = LOCK_CLASS(l.li)->lc_owner(l.li, &lowner);
+ DTRACE_CPUFLAG_CLEAR(CPU_DTRACE_NOFAULT);
+ break;
+
+ case DIF_SUBR_MUTEX_OWNER:
+ if (!dtrace_canload(tupregs[0].dttk_value,
+ sizeof (struct lock_object), mstate, vstate)) {
+ regs[rd] = 0;
+ break;
+ }
+ l.lx = dtrace_loadptr((uintptr_t)&tupregs[0].dttk_value);
+ DTRACE_CPUFLAG_SET(CPU_DTRACE_NOFAULT);
+ LOCK_CLASS(l.li)->lc_owner(l.li, &lowner);
+ DTRACE_CPUFLAG_CLEAR(CPU_DTRACE_NOFAULT);
+ regs[rd] = (uintptr_t)lowner;
+ break;
+
+ case DIF_SUBR_MUTEX_TYPE_ADAPTIVE:
+ if (!dtrace_canload(tupregs[0].dttk_value, sizeof (struct mtx),
+ mstate, vstate)) {
+ regs[rd] = 0;
+ break;
+ }
+ l.lx = dtrace_loadptr((uintptr_t)&tupregs[0].dttk_value);
+ DTRACE_CPUFLAG_SET(CPU_DTRACE_NOFAULT);
+ regs[rd] = (LOCK_CLASS(l.li)->lc_flags & LC_SLEEPLOCK) != 0;
+ DTRACE_CPUFLAG_CLEAR(CPU_DTRACE_NOFAULT);
+ break;
+
+ case DIF_SUBR_MUTEX_TYPE_SPIN:
+ if (!dtrace_canload(tupregs[0].dttk_value, sizeof (struct mtx),
+ mstate, vstate)) {
+ regs[rd] = 0;
+ break;
+ }
+ l.lx = dtrace_loadptr((uintptr_t)&tupregs[0].dttk_value);
+ DTRACE_CPUFLAG_SET(CPU_DTRACE_NOFAULT);
+ regs[rd] = (LOCK_CLASS(l.li)->lc_flags & LC_SPINLOCK) != 0;
+ DTRACE_CPUFLAG_CLEAR(CPU_DTRACE_NOFAULT);
+ break;
+
+ case DIF_SUBR_RW_READ_HELD:
+ case DIF_SUBR_SX_SHARED_HELD:
+ if (!dtrace_canload(tupregs[0].dttk_value, sizeof (uintptr_t),
+ mstate, vstate)) {
+ regs[rd] = 0;
+ break;
+ }
+ l.lx = dtrace_loadptr((uintptr_t)&tupregs[0].dttk_value);
+ DTRACE_CPUFLAG_SET(CPU_DTRACE_NOFAULT);
+ regs[rd] = LOCK_CLASS(l.li)->lc_owner(l.li, &lowner) &&
+ lowner == NULL;
+ DTRACE_CPUFLAG_CLEAR(CPU_DTRACE_NOFAULT);
+ break;
+
+ case DIF_SUBR_RW_WRITE_HELD:
+ case DIF_SUBR_SX_EXCLUSIVE_HELD:
+ if (!dtrace_canload(tupregs[0].dttk_value, sizeof (uintptr_t),
+ mstate, vstate)) {
+ regs[rd] = 0;
+ break;
+ }
+ l.lx = dtrace_loadptr(tupregs[0].dttk_value);
+ DTRACE_CPUFLAG_SET(CPU_DTRACE_NOFAULT);
+ regs[rd] = LOCK_CLASS(l.li)->lc_owner(l.li, &lowner) &&
+ lowner != NULL;
+ DTRACE_CPUFLAG_CLEAR(CPU_DTRACE_NOFAULT);
+ break;
+
+ case DIF_SUBR_RW_ISWRITER:
+ case DIF_SUBR_SX_ISEXCLUSIVE:
+ if (!dtrace_canload(tupregs[0].dttk_value, sizeof (uintptr_t),
+ mstate, vstate)) {
+ regs[rd] = 0;
+ break;
+ }
+ l.lx = dtrace_loadptr(tupregs[0].dttk_value);
+ DTRACE_CPUFLAG_SET(CPU_DTRACE_NOFAULT);
+ LOCK_CLASS(l.li)->lc_owner(l.li, &lowner);
+ DTRACE_CPUFLAG_CLEAR(CPU_DTRACE_NOFAULT);
+ regs[rd] = (lowner == curthread);
+ break;
+#endif /* illumos */
+
+ case DIF_SUBR_BCOPY: {
+ /*
+ * We need to be sure that the destination is in the scratch
+ * region -- no other region is allowed.
+ */
+ uintptr_t src = tupregs[0].dttk_value;
+ uintptr_t dest = tupregs[1].dttk_value;
+ size_t size = tupregs[2].dttk_value;
+
+ if (!dtrace_inscratch(dest, size, mstate)) {
+ *flags |= CPU_DTRACE_BADADDR;
+ *illval = regs[rd];
+ break;
+ }
+
+ if (!dtrace_canload(src, size, mstate, vstate)) {
+ regs[rd] = 0;
+ break;
+ }
+
+ dtrace_bcopy((void *)src, (void *)dest, size);
+ break;
+ }
+
+ case DIF_SUBR_ALLOCA:
+ case DIF_SUBR_COPYIN: {
+ uintptr_t dest = P2ROUNDUP(mstate->dtms_scratch_ptr, 8);
+ uint64_t size =
+ tupregs[subr == DIF_SUBR_ALLOCA ? 0 : 1].dttk_value;
+ size_t scratch_size = (dest - mstate->dtms_scratch_ptr) + size;
+
+ /*
+ * This action doesn't require any credential checks since
+ * probes will not activate in user contexts to which the
+ * enabling user does not have permissions.
+ */
+
+ /*
+ * Rounding up the user allocation size could have overflowed
+ * a large, bogus allocation (like -1ULL) to 0.
+ */
+ if (scratch_size < size ||
+ !DTRACE_INSCRATCH(mstate, scratch_size)) {
+ DTRACE_CPUFLAG_SET(CPU_DTRACE_NOSCRATCH);
+ regs[rd] = 0;
+ break;
+ }
+
+ if (subr == DIF_SUBR_COPYIN) {
+ DTRACE_CPUFLAG_SET(CPU_DTRACE_NOFAULT);
+ dtrace_copyin(tupregs[0].dttk_value, dest, size, flags);
+ DTRACE_CPUFLAG_CLEAR(CPU_DTRACE_NOFAULT);
+ }
+
+ mstate->dtms_scratch_ptr += scratch_size;
+ regs[rd] = dest;
+ break;
+ }
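+
+ /*
+ * Illustration (editor's note, not part of the original change): the
+ * scratch_size < size test above catches, e.g., alloca(-1): the sum
+ * (dest - dtms_scratch_ptr) + size wraps around 2^64 and lands below
+ * size, so the request fails with CPU_DTRACE_NOSCRATCH instead of
+ * handing out bogus scratch space.
+ */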
+
+ case DIF_SUBR_COPYINTO: {
+ uint64_t size = tupregs[1].dttk_value;
+ uintptr_t dest = tupregs[2].dttk_value;
+
+ /*
+ * This action doesn't require any credential checks since
+ * probes will not activate in user contexts to which the
+ * enabling user does not have permissions.
+ */
+ if (!dtrace_inscratch(dest, size, mstate)) {
+ *flags |= CPU_DTRACE_BADADDR;
+ *illval = regs[rd];
+ break;
+ }
+
+ DTRACE_CPUFLAG_SET(CPU_DTRACE_NOFAULT);
+ dtrace_copyin(tupregs[0].dttk_value, dest, size, flags);
+ DTRACE_CPUFLAG_CLEAR(CPU_DTRACE_NOFAULT);
+ break;
+ }
+
+ case DIF_SUBR_COPYINSTR: {
+ uintptr_t dest = mstate->dtms_scratch_ptr;
+ uint64_t size = state->dts_options[DTRACEOPT_STRSIZE];
+
+ if (nargs > 1 && tupregs[1].dttk_value < size)
+ size = tupregs[1].dttk_value + 1;
+
+ /*
+ * This action doesn't require any credential checks since
+ * probes will not activate in user contexts to which the
+ * enabling user does not have permissions.
+ */
+ if (!DTRACE_INSCRATCH(mstate, size)) {
+ DTRACE_CPUFLAG_SET(CPU_DTRACE_NOSCRATCH);
+ regs[rd] = 0;
+ break;
+ }
+
+ DTRACE_CPUFLAG_SET(CPU_DTRACE_NOFAULT);
+ dtrace_copyinstr(tupregs[0].dttk_value, dest, size, flags);
+ DTRACE_CPUFLAG_CLEAR(CPU_DTRACE_NOFAULT);
+
+ ((char *)dest)[size - 1] = '\0';
+ mstate->dtms_scratch_ptr += size;
+ regs[rd] = dest;
+ break;
+ }
+
+#ifdef illumos
+ case DIF_SUBR_MSGSIZE:
+ case DIF_SUBR_MSGDSIZE: {
+ uintptr_t baddr = tupregs[0].dttk_value, daddr;
+ uintptr_t wptr, rptr;
+ size_t count = 0;
+ int cont = 0;
+
+ while (baddr != 0 && !(*flags & CPU_DTRACE_FAULT)) {
+
+ if (!dtrace_canload(baddr, sizeof (mblk_t), mstate,
+ vstate)) {
+ regs[rd] = 0;
+ break;
+ }
+
+ wptr = dtrace_loadptr(baddr +
+ offsetof(mblk_t, b_wptr));
+
+ rptr = dtrace_loadptr(baddr +
+ offsetof(mblk_t, b_rptr));
+
+ if (wptr < rptr) {
+ *flags |= CPU_DTRACE_BADADDR;
+ *illval = tupregs[0].dttk_value;
+ break;
+ }
+
+ daddr = dtrace_loadptr(baddr +
+ offsetof(mblk_t, b_datap));
+
+ baddr = dtrace_loadptr(baddr +
+ offsetof(mblk_t, b_cont));
+
+ /*
+ * We want to protect against denial-of-service here,
+ * so we're only going to search the list for
+ * dtrace_msgdsize_max mblks.
+ */
+ if (cont++ > dtrace_msgdsize_max) {
+ *flags |= CPU_DTRACE_ILLOP;
+ break;
+ }
+
+ if (subr == DIF_SUBR_MSGDSIZE) {
+ if (dtrace_load8(daddr +
+ offsetof(dblk_t, db_type)) != M_DATA)
+ continue;
+ }
+
+ count += wptr - rptr;
+ }
+
+ if (!(*flags & CPU_DTRACE_FAULT))
+ regs[rd] = count;
+
+ break;
+ }
+#endif
+
+ case DIF_SUBR_PROGENYOF: {
+ pid_t pid = tupregs[0].dttk_value;
+ proc_t *p;
+ int rval = 0;
+
+ DTRACE_CPUFLAG_SET(CPU_DTRACE_NOFAULT);
+
+ for (p = curthread->t_procp; p != NULL; p = p->p_parent) {
+#ifdef illumos
+ if (p->p_pidp->pid_id == pid) {
+#else
+ if (p->p_pid == pid) {
+#endif
+ rval = 1;
+ break;
+ }
+ }
+
+ DTRACE_CPUFLAG_CLEAR(CPU_DTRACE_NOFAULT);
+
+ regs[rd] = rval;
+ break;
+ }
+
+ case DIF_SUBR_SPECULATION:
+ regs[rd] = dtrace_speculation(state);
+ break;
+
+ case DIF_SUBR_COPYOUT: {
+ uintptr_t kaddr = tupregs[0].dttk_value;
+ uintptr_t uaddr = tupregs[1].dttk_value;
+ uint64_t size = tupregs[2].dttk_value;
+
+ if (!dtrace_destructive_disallow &&
+ dtrace_priv_proc_control(state) &&
+ !dtrace_istoxic(kaddr, size) &&
+ dtrace_canload(kaddr, size, mstate, vstate)) {
+ DTRACE_CPUFLAG_SET(CPU_DTRACE_NOFAULT);
+ dtrace_copyout(kaddr, uaddr, size, flags);
+ DTRACE_CPUFLAG_CLEAR(CPU_DTRACE_NOFAULT);
+ }
+ break;
+ }
+
+ case DIF_SUBR_COPYOUTSTR: {
+ uintptr_t kaddr = tupregs[0].dttk_value;
+ uintptr_t uaddr = tupregs[1].dttk_value;
+ uint64_t size = tupregs[2].dttk_value;
+ size_t lim;
+
+ if (!dtrace_destructive_disallow &&
+ dtrace_priv_proc_control(state) &&
+ !dtrace_istoxic(kaddr, size) &&
+ dtrace_strcanload(kaddr, size, &lim, mstate, vstate)) {
+ DTRACE_CPUFLAG_SET(CPU_DTRACE_NOFAULT);
+ dtrace_copyoutstr(kaddr, uaddr, lim, flags);
+ DTRACE_CPUFLAG_CLEAR(CPU_DTRACE_NOFAULT);
+ }
+ break;
+ }
+
+ case DIF_SUBR_STRLEN: {
+ size_t size = state->dts_options[DTRACEOPT_STRSIZE];
+ uintptr_t addr = (uintptr_t)tupregs[0].dttk_value;
+ size_t lim;
+
+ if (!dtrace_strcanload(addr, size, &lim, mstate, vstate)) {
+ regs[rd] = 0;
+ break;
+ }
+
+ regs[rd] = dtrace_strlen((char *)addr, lim);
+ break;
+ }
+
+ case DIF_SUBR_STRCHR:
+ case DIF_SUBR_STRRCHR: {
+ /*
+ * We're going to iterate over the string looking for the
+ * specified character. We will iterate until we have reached
+ * the string length or we have found the character. If this
+ * is DIF_SUBR_STRRCHR, we will look for the last occurrence
+ * of the specified character instead of the first.
+ */
+ uintptr_t addr = tupregs[0].dttk_value;
+ uintptr_t addr_limit;
+ uint64_t size = state->dts_options[DTRACEOPT_STRSIZE];
+ size_t lim;
+ char c, target = (char)tupregs[1].dttk_value;
+
+ if (!dtrace_strcanload(addr, size, &lim, mstate, vstate)) {
+ regs[rd] = 0;
+ break;
+ }
+ addr_limit = addr + lim;
+
+ for (regs[rd] = 0; addr < addr_limit; addr++) {
+ if ((c = dtrace_load8(addr)) == target) {
+ regs[rd] = addr;
+
+ if (subr == DIF_SUBR_STRCHR)
+ break;
+ }
+
+ if (c == '\0')
+ break;
+ }
+ break;
+ }
+
+ case DIF_SUBR_STRSTR:
+ case DIF_SUBR_INDEX:
+ case DIF_SUBR_RINDEX: {
+ /*
+ * We're going to iterate over the string looking for the
+ * specified string. We will iterate until we have reached
+ * the string length or we have found the string. (Yes, this
+ * is done in the most naive way possible -- but considering
+ * that the string we're searching for is likely to be
+ * relatively short, the complexity of Rabin-Karp or similar
+ * hardly seems merited.)
+ */
+ char *addr = (char *)(uintptr_t)tupregs[0].dttk_value;
+ char *substr = (char *)(uintptr_t)tupregs[1].dttk_value;
+ uint64_t size = state->dts_options[DTRACEOPT_STRSIZE];
+ size_t len = dtrace_strlen(addr, size);
+ size_t sublen = dtrace_strlen(substr, size);
+ char *limit = addr + len, *orig = addr;
+ int notfound = subr == DIF_SUBR_STRSTR ? 0 : -1;
+ int inc = 1;
+
+ regs[rd] = notfound;
+
+ if (!dtrace_canload((uintptr_t)addr, len + 1, mstate, vstate)) {
+ regs[rd] = 0;
+ break;
+ }
+
+ if (!dtrace_canload((uintptr_t)substr, sublen + 1, mstate,
+ vstate)) {
+ regs[rd] = 0;
+ break;
+ }
+
+ /*
+ * strstr() and index()/rindex() have similar semantics if
+ * both strings are the empty string: strstr() returns a
+ * pointer to the (empty) string, and index() and rindex()
+ * both return index 0 (regardless of any position argument).
+ */
+ if (sublen == 0 && len == 0) {
+ if (subr == DIF_SUBR_STRSTR)
+ regs[rd] = (uintptr_t)addr;
+ else
+ regs[rd] = 0;
+ break;
+ }
+
+ if (subr != DIF_SUBR_STRSTR) {
+ if (subr == DIF_SUBR_RINDEX) {
+ limit = orig - 1;
+ addr += len;
+ inc = -1;
+ }
+
+ /*
+ * Both index() and rindex() take an optional position
+ * argument that denotes the starting position.
+ */
+ if (nargs == 3) {
+ int64_t pos = (int64_t)tupregs[2].dttk_value;
+
+ /*
+ * If the position argument to index() is
+ * negative, Perl implicitly clamps it at
+ * zero. This semantic is a little surprising
+ * given the special meaning of negative
+ * positions to similar Perl functions like
+ * substr(), but it appears to reflect a
+ * notion that index() can start from a
+ * negative index and increment its way up to
+ * the string. Given this notion, Perl's
+ * rindex() is at least self-consistent in
+ * that it implicitly clamps positions greater
+ * than the string length to be the string
+ * length. Where Perl completely loses
+ * coherence, however, is when the specified
+ * substring is the empty string (""). In
+ * this case, even if the position is
+ * negative, rindex() returns 0 -- and even if
+ * the position is greater than the length,
+ * index() returns the string length. These
+ * semantics violate the notion that index()
+ * should never return a value less than the
+ * specified position and that rindex() should
+ * never return a value greater than the
+ * specified position. (One assumes that
+ * these semantics are artifacts of Perl's
+ * implementation and not the results of
+ * deliberate design -- it beggars belief that
+ * even Larry Wall could desire such oddness.)
+ * While in the abstract one would wish for
+ * consistent position semantics across
+ * substr(), index() and rindex() -- or at the
+ * very least self-consistent position
+ * semantics for index() and rindex() -- we
+ * instead opt to keep with the extant Perl
+ * semantics, in all their broken glory. (Do
+ * we have more desire to maintain Perl's
+ * semantics than Perl does? Probably.)
+ */
+ if (subr == DIF_SUBR_RINDEX) {
+ if (pos < 0) {
+ if (sublen == 0)
+ regs[rd] = 0;
+ break;
+ }
+
+ if (pos > len)
+ pos = len;
+ } else {
+ if (pos < 0)
+ pos = 0;
+
+ if (pos >= len) {
+ if (sublen == 0)
+ regs[rd] = len;
+ break;
+ }
+ }
+
+ addr = orig + pos;
+ }
+ }
+
+ for (regs[rd] = notfound; addr != limit; addr += inc) {
+ if (dtrace_strncmp(addr, substr, sublen) == 0) {
+ if (subr != DIF_SUBR_STRSTR) {
+ /*
+ * As D index() and rindex() are
+ * modeled on Perl (and not on awk),
+ * we return a zero-based (and not a
+ * one-based) index. (For you Perl
+ * weenies: no, we're not going to add
+ * $[ -- and shouldn't you be at a con
+ * or something?)
+ */
+ regs[rd] = (uintptr_t)(addr - orig);
+ break;
+ }
+
+ ASSERT(subr == DIF_SUBR_STRSTR);
+ regs[rd] = (uintptr_t)addr;
+ break;
+ }
+ }
+
+ break;
+ }
+
+ case DIF_SUBR_STRTOK: {
+ uintptr_t addr = tupregs[0].dttk_value;
+ uintptr_t tokaddr = tupregs[1].dttk_value;
+ uint64_t size = state->dts_options[DTRACEOPT_STRSIZE];
+ uintptr_t limit, toklimit;
+ size_t clim;
+ uint8_t c = 0, tokmap[32]; /* 256 / 8 */
+ char *dest = (char *)mstate->dtms_scratch_ptr;
+ int i;
+
+ /*
+ * Check both the token buffer and (later) the input buffer,
+ * since both could be non-scratch addresses.
+ */
+ if (!dtrace_strcanload(tokaddr, size, &clim, mstate, vstate)) {
+ regs[rd] = 0;
+ break;
+ }
+ toklimit = tokaddr + clim;
+
+ if (!DTRACE_INSCRATCH(mstate, size)) {
+ DTRACE_CPUFLAG_SET(CPU_DTRACE_NOSCRATCH);
+ regs[rd] = 0;
+ break;
+ }
+
+ if (addr == 0) {
+ /*
+ * If the address specified is NULL, we use our saved
+ * strtok pointer from the mstate. Note that this
+ * means that the saved strtok pointer is _only_
+ * valid within multiple enablings of the same probe --
+ * it behaves like an implicit clause-local variable.
+ */
+ addr = mstate->dtms_strtok;
+ limit = mstate->dtms_strtok_limit;
+ } else {
+ /*
+ * If the user-specified address is non-NULL we must
+ * access check it. This is the only time we have
+ * a chance to do so, since this address may reside
+ * in the string table of this clause -- future calls
+ * (when we fetch addr from mstate->dtms_strtok)
+ * would fail this access check.
+ */
+ if (!dtrace_strcanload(addr, size, &clim, mstate,
+ vstate)) {
+ regs[rd] = 0;
+ break;
+ }
+ limit = addr + clim;
+ }
+
+ /*
+ * First, zero the token map, and then process the token
+ * string -- setting a bit in the map for every character
+ * found in the token string.
+ */
+ for (i = 0; i < sizeof (tokmap); i++)
+ tokmap[i] = 0;
+
+ for (; tokaddr < toklimit; tokaddr++) {
+ if ((c = dtrace_load8(tokaddr)) == '\0')
+ break;
+
+ ASSERT((c >> 3) < sizeof (tokmap));
+ tokmap[c >> 3] |= (1 << (c & 0x7));
+ }
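+
+ /*
+ * Illustration (editor's note, not part of the original change):
+ * the token map is a 256-bit set with one bit per byte value; e.g.
+ * c == ':' (0x3a) sets bit 2 of tokmap[7]. The loops below test the
+ * same bit to classify a byte as a delimiter in constant time.
+ */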
+
+ for (; addr < limit; addr++) {
+ /*
+ * We're looking for a character that is _not_
+ * contained in the token string.
+ */
+ if ((c = dtrace_load8(addr)) == '\0')
+ break;
+
+ if (!(tokmap[c >> 3] & (1 << (c & 0x7))))
+ break;
+ }
+
+ if (c == '\0') {
+ /*
+ * We reached the end of the string without finding
+ * any character that was not in the token string.
+ * We return NULL in this case, and we set the saved
+ * address to NULL as well.
+ */
+ regs[rd] = 0;
+ mstate->dtms_strtok = 0;
+ mstate->dtms_strtok_limit = 0;
+ break;
+ }
+
+ /*
+ * From here on, we're copying into the destination string.
+ */
+ for (i = 0; addr < limit && i < size - 1; addr++) {
+ if ((c = dtrace_load8(addr)) == '\0')
+ break;
+
+ if (tokmap[c >> 3] & (1 << (c & 0x7)))
+ break;
+
+ ASSERT(i < size);
+ dest[i++] = c;
+ }
+
+ ASSERT(i < size);
+ dest[i] = '\0';
+ regs[rd] = (uintptr_t)dest;
+ mstate->dtms_scratch_ptr += size;
+ mstate->dtms_strtok = addr;
+ mstate->dtms_strtok_limit = limit;
+ break;
+ }
+
+ case DIF_SUBR_SUBSTR: {
+ uintptr_t s = tupregs[0].dttk_value;
+ uint64_t size = state->dts_options[DTRACEOPT_STRSIZE];
+ char *d = (char *)mstate->dtms_scratch_ptr;
+ int64_t index = (int64_t)tupregs[1].dttk_value;
+ int64_t remaining = (int64_t)tupregs[2].dttk_value;
+ size_t len = dtrace_strlen((char *)s, size);
+ int64_t i;
+
+ if (!dtrace_canload(s, len + 1, mstate, vstate)) {
+ regs[rd] = 0;
+ break;
+ }
+
+ if (!DTRACE_INSCRATCH(mstate, size)) {
+ DTRACE_CPUFLAG_SET(CPU_DTRACE_NOSCRATCH);
+ regs[rd] = 0;
+ break;
+ }
+
+ if (nargs <= 2)
+ remaining = (int64_t)size;
+
+ if (index < 0) {
+ index += len;
+
+ if (index < 0 && index + remaining > 0) {
+ remaining += index;
+ index = 0;
+ }
+ }
+
+ if (index >= len || index < 0) {
+ remaining = 0;
+ } else if (remaining < 0) {
+ remaining += len - index;
+ } else if (index + remaining > size) {
+ remaining = size - index;
+ }
+
+ for (i = 0; i < remaining; i++) {
+ if ((d[i] = dtrace_load8(s + index + i)) == '\0')
+ break;
+ }
+
+ d[i] = '\0';
+
+ mstate->dtms_scratch_ptr += size;
+ regs[rd] = (uintptr_t)d;
+ break;
+ }
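+
+ /*
+ * Illustration (editor's note, not part of the original change): with
+ * the clamping above, substr("hello", -3, 2) first rewrites index to
+ * -3 + 5 = 2 and then copies two bytes from that offset, yielding
+ * "ll".
+ */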
+
+ case DIF_SUBR_JSON: {
+ uint64_t size = state->dts_options[DTRACEOPT_STRSIZE];
+ uintptr_t json = tupregs[0].dttk_value;
+ size_t jsonlen = dtrace_strlen((char *)json, size);
+ uintptr_t elem = tupregs[1].dttk_value;
+ size_t elemlen = dtrace_strlen((char *)elem, size);
+
+ char *dest = (char *)mstate->dtms_scratch_ptr;
+ char *elemlist = (char *)mstate->dtms_scratch_ptr + jsonlen + 1;
+ char *ee = elemlist;
+ int nelems = 1;
+ uintptr_t cur;
+
+ if (!dtrace_canload(json, jsonlen + 1, mstate, vstate) ||
+ !dtrace_canload(elem, elemlen + 1, mstate, vstate)) {
+ regs[rd] = 0;
+ break;
+ }
+
+ if (!DTRACE_INSCRATCH(mstate, jsonlen + 1 + elemlen + 1)) {
+ DTRACE_CPUFLAG_SET(CPU_DTRACE_NOSCRATCH);
+ regs[rd] = 0;
+ break;
+ }
+
+ /*
+ * Read the element selector and split it up into a packed list
+ * of strings.
+ */
+ for (cur = elem; cur < elem + elemlen; cur++) {
+ char cc = dtrace_load8(cur);
+
+ if (cur == elem && cc == '[') {
+ /*
+ * If the first element selector key is
+ * actually an array index then ignore the
+ * bracket.
+ */
+ continue;
+ }
+
+ if (cc == ']')
+ continue;
+
+ if (cc == '.' || cc == '[') {
+ nelems++;
+ cc = '\0';
+ }
+
+ *ee++ = cc;
+ }
+ *ee++ = '\0';
+
+ if ((regs[rd] = (uintptr_t)dtrace_json(size, json, elemlist,
+ nelems, dest)) != 0)
+ mstate->dtms_scratch_ptr += jsonlen + 1;
+ break;
+ }
+
+ case DIF_SUBR_TOUPPER:
+ case DIF_SUBR_TOLOWER: {
+ uintptr_t s = tupregs[0].dttk_value;
+ uint64_t size = state->dts_options[DTRACEOPT_STRSIZE];
+ char *dest = (char *)mstate->dtms_scratch_ptr, c;
+ size_t len = dtrace_strlen((char *)s, size);
+ char lower, upper, convert;
+ int64_t i;
+
+ if (subr == DIF_SUBR_TOUPPER) {
+ lower = 'a';
+ upper = 'z';
+ convert = 'A';
+ } else {
+ lower = 'A';
+ upper = 'Z';
+ convert = 'a';
+ }
+
+ if (!dtrace_canload(s, len + 1, mstate, vstate)) {
+ regs[rd] = 0;
+ break;
+ }
+
+ if (!DTRACE_INSCRATCH(mstate, size)) {
+ DTRACE_CPUFLAG_SET(CPU_DTRACE_NOSCRATCH);
+ regs[rd] = 0;
+ break;
+ }
+
+ for (i = 0; i < size - 1; i++) {
+ if ((c = dtrace_load8(s + i)) == '\0')
+ break;
+
+ if (c >= lower && c <= upper)
+ c = convert + (c - lower);
+
+ dest[i] = c;
+ }
+
+ ASSERT(i < size);
+ dest[i] = '\0';
+ regs[rd] = (uintptr_t)dest;
+ mstate->dtms_scratch_ptr += size;
+ break;
+ }
+
+#ifdef illumos
+ case DIF_SUBR_GETMAJOR:
+#ifdef _LP64
+ regs[rd] = (tupregs[0].dttk_value >> NBITSMINOR64) & MAXMAJ64;
+#else
+ regs[rd] = (tupregs[0].dttk_value >> NBITSMINOR) & MAXMAJ;
+#endif
+ break;
+
+ case DIF_SUBR_GETMINOR:
+#ifdef _LP64
+ regs[rd] = tupregs[0].dttk_value & MAXMIN64;
+#else
+ regs[rd] = tupregs[0].dttk_value & MAXMIN;
+#endif
+ break;
+
+ case DIF_SUBR_DDI_PATHNAME: {
+ /*
+ * This one is a galactic mess. We are going to roughly
+ * emulate ddi_pathname(), but it's made more complicated
+ * by the fact that we (a) want to include the minor name and
+ * (b) must proceed iteratively instead of recursively.
+ */
+ uintptr_t dest = mstate->dtms_scratch_ptr;
+ uint64_t size = state->dts_options[DTRACEOPT_STRSIZE];
+ char *start = (char *)dest, *end = start + size - 1;
+ uintptr_t daddr = tupregs[0].dttk_value;
+ int64_t minor = (int64_t)tupregs[1].dttk_value;
+ char *s;
+ int i, len, depth = 0;
+
+ /*
+ * Due to all the pointer jumping we do and context we must
+ * rely upon, we just mandate that the user must have kernel
+ * read privileges to use this routine.
+ */
+ if ((mstate->dtms_access & DTRACE_ACCESS_KERNEL) == 0) {
+ *flags |= CPU_DTRACE_KPRIV;
+ *illval = daddr;
+ regs[rd] = 0;
+ }
+
+ if (!DTRACE_INSCRATCH(mstate, size)) {
+ DTRACE_CPUFLAG_SET(CPU_DTRACE_NOSCRATCH);
+ regs[rd] = 0;
+ break;
+ }
+
+ *end = '\0';
+
+ /*
+ * We want to have a name for the minor. In order to do this,
+ * we need to walk the minor list from the devinfo. We want
+ * to be sure that we don't infinitely walk a circular list,
+ * so we check for circularity by sending a scout pointer
+ * ahead two elements for every element that we iterate over;
+ * if the list is circular, these will ultimately point to the
+ * same element. You may recognize this little trick as the
+ * answer to a stupid interview question -- one that always
+ * seems to be asked by those who had to have it laboriously
+ * explained to them, and who can't even concisely describe
+ * the conditions under which one would be forced to resort to
+ * this technique. Needless to say, those conditions are
+ * found here -- and probably only here. Is this the only use
+ * of this infamous trick in shipping, production code? If it
+ * isn't, it probably should be...
+ */
+ if (minor != -1) {
+ uintptr_t maddr = dtrace_loadptr(daddr +
+ offsetof(struct dev_info, devi_minor));
+
+ uintptr_t next = offsetof(struct ddi_minor_data, next);
+ uintptr_t name = offsetof(struct ddi_minor_data,
+ d_minor) + offsetof(struct ddi_minor, name);
+ uintptr_t dev = offsetof(struct ddi_minor_data,
+ d_minor) + offsetof(struct ddi_minor, dev);
+ uintptr_t scout;
+
+ if (maddr != NULL)
+ scout = dtrace_loadptr(maddr + next);
+
+ while (maddr != NULL && !(*flags & CPU_DTRACE_FAULT)) {
+ uint64_t m;
+#ifdef _LP64
+ m = dtrace_load64(maddr + dev) & MAXMIN64;
+#else
+ m = dtrace_load32(maddr + dev) & MAXMIN;
+#endif
+ if (m != minor) {
+ maddr = dtrace_loadptr(maddr + next);
+
+ if (scout == NULL)
+ continue;
+
+ scout = dtrace_loadptr(scout + next);
+
+ if (scout == NULL)
+ continue;
+
+ scout = dtrace_loadptr(scout + next);
+
+ if (scout == NULL)
+ continue;
+
+ if (scout == maddr) {
+ *flags |= CPU_DTRACE_ILLOP;
+ break;
+ }
+
+ continue;
+ }
+
+ /*
+ * We have the minor data. Now we need to
+ * copy the minor's name into the end of the
+ * pathname.
+ */
+ s = (char *)dtrace_loadptr(maddr + name);
+ len = dtrace_strlen(s, size);
+
+ if (*flags & CPU_DTRACE_FAULT)
+ break;
+
+ if (len != 0) {
+ if ((end -= (len + 1)) < start)
+ break;
+
+ *end = ':';
+ }
+
+ for (i = 1; i <= len; i++)
+ end[i] = dtrace_load8((uintptr_t)s++);
+ break;
+ }
+ }
+
+ while (daddr != NULL && !(*flags & CPU_DTRACE_FAULT)) {
+ ddi_node_state_t devi_state;
+
+ devi_state = dtrace_load32(daddr +
+ offsetof(struct dev_info, devi_node_state));
+
+ if (*flags & CPU_DTRACE_FAULT)
+ break;
+
+ if (devi_state >= DS_INITIALIZED) {
+ s = (char *)dtrace_loadptr(daddr +
+ offsetof(struct dev_info, devi_addr));
+ len = dtrace_strlen(s, size);
+
+ if (*flags & CPU_DTRACE_FAULT)
+ break;
+
+ if (len != 0) {
+ if ((end -= (len + 1)) < start)
+ break;
+
+ *end = '@';
+ }
+
+ for (i = 1; i <= len; i++)
+ end[i] = dtrace_load8((uintptr_t)s++);
+ }
+
+ /*
+ * Now for the node name...
+ */
+ s = (char *)dtrace_loadptr(daddr +
+ offsetof(struct dev_info, devi_node_name));
+
+ daddr = dtrace_loadptr(daddr +
+ offsetof(struct dev_info, devi_parent));
+
+ /*
+ * If our parent is NULL (that is, if we're the root
+ * node), we're going to use the special path
+ * "devices".
+ */
+ if (daddr == 0)
+ s = "devices";
+
+ len = dtrace_strlen(s, size);
+ if (*flags & CPU_DTRACE_FAULT)
+ break;
+
+ if ((end -= (len + 1)) < start)
+ break;
+
+ for (i = 1; i <= len; i++)
+ end[i] = dtrace_load8((uintptr_t)s++);
+ *end = '/';
+
+ if (depth++ > dtrace_devdepth_max) {
+ *flags |= CPU_DTRACE_ILLOP;
+ break;
+ }
+ }
+
+ if (end < start)
+ DTRACE_CPUFLAG_SET(CPU_DTRACE_NOSCRATCH);
+
+ if (daddr == 0) {
+ regs[rd] = (uintptr_t)end;
+ mstate->dtms_scratch_ptr += size;
+ }
+
+ break;
+ }
+#endif
+
+ case DIF_SUBR_STRJOIN: {
+ char *d = (char *)mstate->dtms_scratch_ptr;
+ uint64_t size = state->dts_options[DTRACEOPT_STRSIZE];
+ uintptr_t s1 = tupregs[0].dttk_value;
+ uintptr_t s2 = tupregs[1].dttk_value;
+ int i = 0, j = 0;
+ size_t lim1, lim2;
+ char c;
+
+ if (!dtrace_strcanload(s1, size, &lim1, mstate, vstate) ||
+ !dtrace_strcanload(s2, size, &lim2, mstate, vstate)) {
+ regs[rd] = 0;
+ break;
+ }
+
+ if (!DTRACE_INSCRATCH(mstate, size)) {
+ DTRACE_CPUFLAG_SET(CPU_DTRACE_NOSCRATCH);
+ regs[rd] = 0;
+ break;
+ }
+
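+		/*
+		 * Copy the first string up to (but not including) its
+		 * terminating NUL, then append the second string, NUL and
+		 * all.  Both loops clamp at the scratch string size, and
+		 * loads beyond lim1/lim2 are replaced with '\0' so that we
+		 * never read past what dtrace_strcanload() vetted.
+		 */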
+ for (;;) {
+ if (i >= size) {
+ DTRACE_CPUFLAG_SET(CPU_DTRACE_NOSCRATCH);
+ regs[rd] = 0;
+ break;
+ }
+ c = (i >= lim1) ? '\0' : dtrace_load8(s1++);
+ if ((d[i++] = c) == '\0') {
+ i--;
+ break;
+ }
+ }
+
+ for (;;) {
+ if (i >= size) {
+ DTRACE_CPUFLAG_SET(CPU_DTRACE_NOSCRATCH);
+ regs[rd] = 0;
+ break;
+ }
+
+ c = (j++ >= lim2) ? '\0' : dtrace_load8(s2++);
+ if ((d[i++] = c) == '\0')
+ break;
+ }
+
+ if (i < size) {
+ mstate->dtms_scratch_ptr += i;
+ regs[rd] = (uintptr_t)d;
+ }
+
+ break;
+ }
+
+ case DIF_SUBR_STRTOLL: {
+ uintptr_t s = tupregs[0].dttk_value;
+ uint64_t size = state->dts_options[DTRACEOPT_STRSIZE];
+ size_t lim;
+ int base = 10;
+
+ if (nargs > 1) {
+ if ((base = tupregs[1].dttk_value) <= 1 ||
+ base > ('z' - 'a' + 1) + ('9' - '0' + 1)) {
+ *flags |= CPU_DTRACE_ILLOP;
+ break;
+ }
+ }
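+
+		/*
+		 * The check above admits bases 2 through 36: ten decimal
+		 * digits plus the twenty-six letters 'a' through 'z'.
+		 */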
+
+ if (!dtrace_strcanload(s, size, &lim, mstate, vstate)) {
+ regs[rd] = INT64_MIN;
+ break;
+ }
+
+ regs[rd] = dtrace_strtoll((char *)s, base, lim);
+ break;
+ }
+
+ case DIF_SUBR_LLTOSTR: {
+ int64_t i = (int64_t)tupregs[0].dttk_value;
+ uint64_t val, digit;
+ uint64_t size = 65; /* enough room for 2^64 in binary */
+ char *end = (char *)mstate->dtms_scratch_ptr + size - 1;
+ int base = 10;
+
+ if (nargs > 1) {
+ if ((base = tupregs[1].dttk_value) <= 1 ||
+ base > ('z' - 'a' + 1) + ('9' - '0' + 1)) {
+ *flags |= CPU_DTRACE_ILLOP;
+ break;
+ }
+ }
+
+ val = (base == 10 && i < 0) ? i * -1 : i;
+
+ if (!DTRACE_INSCRATCH(mstate, size)) {
+ DTRACE_CPUFLAG_SET(CPU_DTRACE_NOSCRATCH);
+ regs[rd] = 0;
+ break;
+ }
+
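+		/*
+		 * Digits are generated least-significant first, walking
+		 * "end" backwards through scratch; the prefix logic below
+		 * then prepends "0x" for base 16 (or a leading "0" for
+		 * base 8) and a sign for negative base-10 values.  For
+		 * example, lltostr(255, 16) yields "0xff" and lltostr(-42)
+		 * yields "-42".
+		 */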
+ for (*end-- = '\0'; val; val /= base) {
+ if ((digit = val % base) <= '9' - '0') {
+ *end-- = '0' + digit;
+ } else {
+ *end-- = 'a' + (digit - ('9' - '0') - 1);
+ }
+ }
+
+ if (i == 0 && base == 16)
+ *end-- = '0';
+
+ if (base == 16)
+ *end-- = 'x';
+
+ if (i == 0 || base == 8 || base == 16)
+ *end-- = '0';
+
+ if (i < 0 && base == 10)
+ *end-- = '-';
+
+ regs[rd] = (uintptr_t)end + 1;
+ mstate->dtms_scratch_ptr += size;
+ break;
+ }
+
+ case DIF_SUBR_HTONS:
+ case DIF_SUBR_NTOHS:
+#if BYTE_ORDER == BIG_ENDIAN
+ regs[rd] = (uint16_t)tupregs[0].dttk_value;
+#else
+ regs[rd] = DT_BSWAP_16((uint16_t)tupregs[0].dttk_value);
+#endif
+ break;
+
+
+ case DIF_SUBR_HTONL:
+ case DIF_SUBR_NTOHL:
+#if BYTE_ORDER == BIG_ENDIAN
+ regs[rd] = (uint32_t)tupregs[0].dttk_value;
+#else
+ regs[rd] = DT_BSWAP_32((uint32_t)tupregs[0].dttk_value);
+#endif
+ break;
+
+
+ case DIF_SUBR_HTONLL:
+ case DIF_SUBR_NTOHLL:
+#if BYTE_ORDER == BIG_ENDIAN
+ regs[rd] = (uint64_t)tupregs[0].dttk_value;
+#else
+ regs[rd] = DT_BSWAP_64((uint64_t)tupregs[0].dttk_value);
+#endif
+ break;
+
+
+ case DIF_SUBR_DIRNAME:
+ case DIF_SUBR_BASENAME: {
+ char *dest = (char *)mstate->dtms_scratch_ptr;
+ uint64_t size = state->dts_options[DTRACEOPT_STRSIZE];
+ uintptr_t src = tupregs[0].dttk_value;
+ int i, j, len = dtrace_strlen((char *)src, size);
+ int lastbase = -1, firstbase = -1, lastdir = -1;
+ int start, end;
+
+ if (!dtrace_canload(src, len + 1, mstate, vstate)) {
+ regs[rd] = 0;
+ break;
+ }
+
+ if (!DTRACE_INSCRATCH(mstate, size)) {
+ DTRACE_CPUFLAG_SET(CPU_DTRACE_NOSCRATCH);
+ regs[rd] = 0;
+ break;
+ }
+
+ /*
+		 * The basename and dirname of a zero-length string are both
+		 * defined to be ".".

+ */
+ if (len == 0) {
+ len = 1;
+ src = (uintptr_t)".";
+ }
+
+ /*
+ * Start from the back of the string, moving back toward the
+ * front until we see a character that isn't a slash. That
+ * character is the last character in the basename.
+ */
+ for (i = len - 1; i >= 0; i--) {
+ if (dtrace_load8(src + i) != '/')
+ break;
+ }
+
+ if (i >= 0)
+ lastbase = i;
+
+ /*
+ * Starting from the last character in the basename, move
+ * towards the front until we find a slash. The character
+ * that we processed immediately before that is the first
+ * character in the basename.
+ */
+ for (; i >= 0; i--) {
+ if (dtrace_load8(src + i) == '/')
+ break;
+ }
+
+ if (i >= 0)
+ firstbase = i + 1;
+
+ /*
+ * Now keep going until we find a non-slash character. That
+ * character is the last character in the dirname.
+ */
+ for (; i >= 0; i--) {
+ if (dtrace_load8(src + i) != '/')
+ break;
+ }
+
+ if (i >= 0)
+ lastdir = i;
+
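+		/*
+		 * As a worked example, for src = "/usr/lib/" (len 9) the
+		 * scans above leave lastbase = 7 ('b'), firstbase = 5 ('l')
+		 * and lastdir = 3 ('r'), so basename() yields "lib" and
+		 * dirname() yields "/usr".
+		 */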
+ ASSERT(!(lastbase == -1 && firstbase != -1));
+ ASSERT(!(firstbase == -1 && lastdir != -1));
+
+ if (lastbase == -1) {
+ /*
+ * We didn't find a non-slash character. We know that
+ * the length is non-zero, so the whole string must be
+ * slashes. In either the dirname or the basename
+ * case, we return '/'.
+ */
+ ASSERT(firstbase == -1);
+ firstbase = lastbase = lastdir = 0;
+ }
+
+ if (firstbase == -1) {
+ /*
+ * The entire string consists only of a basename
+ * component. If we're looking for dirname, we need
+ * to change our string to be just "."; if we're
+ * looking for a basename, we'll just set the first
+ * character of the basename to be 0.
+ */
+ if (subr == DIF_SUBR_DIRNAME) {
+ ASSERT(lastdir == -1);
+ src = (uintptr_t)".";
+ lastdir = 0;
+ } else {
+ firstbase = 0;
+ }
+ }
+
+ if (subr == DIF_SUBR_DIRNAME) {
+ if (lastdir == -1) {
+ /*
+ * We know that we have a slash in the name --
+ * or lastdir would be set to 0, above. And
+ * because lastdir is -1, we know that this
+ * slash must be the first character. (That
+ * is, the full string must be of the form
+ * "/basename".) In this case, the last
+ * character of the directory name is 0.
+ */
+ lastdir = 0;
+ }
+
+ start = 0;
+ end = lastdir;
+ } else {
+ ASSERT(subr == DIF_SUBR_BASENAME);
+ ASSERT(firstbase != -1 && lastbase != -1);
+ start = firstbase;
+ end = lastbase;
+ }
+
+ for (i = start, j = 0; i <= end && j < size - 1; i++, j++)
+ dest[j] = dtrace_load8(src + i);
+
+ dest[j] = '\0';
+ regs[rd] = (uintptr_t)dest;
+ mstate->dtms_scratch_ptr += size;
+ break;
+ }
+
+ case DIF_SUBR_GETF: {
+ uintptr_t fd = tupregs[0].dttk_value;
+ struct filedesc *fdp;
+ file_t *fp;
+
+ if (!dtrace_priv_proc(state)) {
+ regs[rd] = 0;
+ break;
+ }
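+		/*
+		 * Look the descriptor up under the fdtable's shared lock;
+		 * the file pointer is also stashed in dtms_getf so that
+		 * subsequent access checks in this probe firing can treat
+		 * the returned file_t as loadable.
+		 */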
+ fdp = curproc->p_fd;
+ FILEDESC_SLOCK(fdp);
+ fp = fget_locked(fdp, fd);
+ mstate->dtms_getf = fp;
+ regs[rd] = (uintptr_t)fp;
+ FILEDESC_SUNLOCK(fdp);
+ break;
+ }
+
+ case DIF_SUBR_CLEANPATH: {
+ char *dest = (char *)mstate->dtms_scratch_ptr, c;
+ uint64_t size = state->dts_options[DTRACEOPT_STRSIZE];
+ uintptr_t src = tupregs[0].dttk_value;
+ size_t lim;
+ int i = 0, j = 0;
+#ifdef illumos
+ zone_t *z;
+#endif
+
+ if (!dtrace_strcanload(src, size, &lim, mstate, vstate)) {
+ regs[rd] = 0;
+ break;
+ }
+
+ if (!DTRACE_INSCRATCH(mstate, size)) {
+ DTRACE_CPUFLAG_SET(CPU_DTRACE_NOSCRATCH);
+ regs[rd] = 0;
+ break;
+ }
+
+ /*
+ * Move forward, loading each character.
+ */
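+		/*
+		 * The collapse is purely lexical -- no symlinks are
+		 * resolved -- so, for example, "/usr//lib/./../bin/ls"
+		 * cleans to "/usr/bin/ls".
+		 */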
+ do {
+ c = (i >= lim) ? '\0' : dtrace_load8(src + i++);
+next:
+ if (j + 5 >= size) /* 5 = strlen("/..c\0") */
+ break;
+
+ if (c != '/') {
+ dest[j++] = c;
+ continue;
+ }
+
+ c = (i >= lim) ? '\0' : dtrace_load8(src + i++);
+
+ if (c == '/') {
+ /*
+ * We have two slashes -- we can just advance
+ * to the next character.
+ */
+ goto next;
+ }
+
+ if (c != '.') {
+ /*
+ * This is not "." and it's not ".." -- we can
+ * just store the "/" and this character and
+ * drive on.
+ */
+ dest[j++] = '/';
+ dest[j++] = c;
+ continue;
+ }
+
+ c = (i >= lim) ? '\0' : dtrace_load8(src + i++);
+
+ if (c == '/') {
+ /*
+ * This is a "/./" component. We're not going
+ * to store anything in the destination buffer;
+ * we're just going to go to the next component.
+ */
+ goto next;
+ }
+
+ if (c != '.') {
+ /*
+ * This is not ".." -- we can just store the
+ * "/." and this character and continue
+ * processing.
+ */
+ dest[j++] = '/';
+ dest[j++] = '.';
+ dest[j++] = c;
+ continue;
+ }
+
+ c = (i >= lim) ? '\0' : dtrace_load8(src + i++);
+
+ if (c != '/' && c != '\0') {
+ /*
+ * This is not ".." -- it's "..[mumble]".
+ * We'll store the "/.." and this character
+ * and continue processing.
+ */
+ dest[j++] = '/';
+ dest[j++] = '.';
+ dest[j++] = '.';
+ dest[j++] = c;
+ continue;
+ }
+
+ /*
+ * This is "/../" or "/..\0". We need to back up
+ * our destination pointer until we find a "/".
+ */
+ i--;
+ while (j != 0 && dest[--j] != '/')
+ continue;
+
+ if (c == '\0')
+ dest[++j] = '/';
+ } while (c != '\0');
+
+ dest[j] = '\0';
+
+#ifdef illumos
+ if (mstate->dtms_getf != NULL &&
+ !(mstate->dtms_access & DTRACE_ACCESS_KERNEL) &&
+ (z = state->dts_cred.dcr_cred->cr_zone) != kcred->cr_zone) {
+ /*
+ * If we've done a getf() as a part of this ECB and we
+ * don't have kernel access (and we're not in the global
+ * zone), check if the path we cleaned up begins with
+ * the zone's root path, and trim it off if so. Note
+ * that this is an output cleanliness issue, not a
+ * security issue: knowing one's zone root path does
+ * not enable privilege escalation.
+ */
+ if (strstr(dest, z->zone_rootpath) == dest)
+ dest += strlen(z->zone_rootpath) - 1;
+ }
+#endif
+
+ regs[rd] = (uintptr_t)dest;
+ mstate->dtms_scratch_ptr += size;
+ break;
+ }
+
+ case DIF_SUBR_INET_NTOA:
+ case DIF_SUBR_INET_NTOA6:
+ case DIF_SUBR_INET_NTOP: {
+ size_t size;
+ int af, argi, i;
+ char *base, *end;
+
+ if (subr == DIF_SUBR_INET_NTOP) {
+ af = (int)tupregs[0].dttk_value;
+ argi = 1;
+ } else {
+ af = subr == DIF_SUBR_INET_NTOA ? AF_INET: AF_INET6;
+ argi = 0;
+ }
+
+ if (af == AF_INET) {
+ ipaddr_t ip4;
+ uint8_t *ptr8, val;
+
+ if (!dtrace_canload(tupregs[argi].dttk_value,
+ sizeof (ipaddr_t), mstate, vstate)) {
+ regs[rd] = 0;
+ break;
+ }
+
+ /*
+ * Safely load the IPv4 address.
+ */
+ ip4 = dtrace_load32(tupregs[argi].dttk_value);
+
+ /*
+			 * Check that an IPv4 string will fit in scratch.
+ */
+ size = INET_ADDRSTRLEN;
+ if (!DTRACE_INSCRATCH(mstate, size)) {
+ DTRACE_CPUFLAG_SET(CPU_DTRACE_NOSCRATCH);
+ regs[rd] = 0;
+ break;
+ }
+ base = (char *)mstate->dtms_scratch_ptr;
+ end = (char *)mstate->dtms_scratch_ptr + size - 1;
+
+ /*
+ * Stringify as a dotted decimal quad.
+ */
+ *end-- = '\0';
+ ptr8 = (uint8_t *)&ip4;
+ for (i = 3; i >= 0; i--) {
+ val = ptr8[i];
+
+ if (val == 0) {
+ *end-- = '0';
+ } else {
+ for (; val; val /= 10) {
+ *end-- = '0' + (val % 10);
+ }
+ }
+
+ if (i > 0)
+ *end-- = '.';
+ }
+ ASSERT(end + 1 >= base);
+
+ } else if (af == AF_INET6) {
+ struct in6_addr ip6;
+ int firstzero, tryzero, numzero, v6end;
+ uint16_t val;
+ const char digits[] = "0123456789abcdef";
+
+ /*
+			 * Stringify using RFC 1884 convention 2: 16-bit
+			 * hexadecimal values with a zero-run compression.
+			 * Lower-case hexadecimal digits are used,
+			 * e.g., fe80::214:4fff:fe0b:76c8.
+			 * The IPv4 embedded form is returned for inet_ntop;
+			 * just the IPv4 string is returned for inet_ntoa6.
+ */
+
+ if (!dtrace_canload(tupregs[argi].dttk_value,
+ sizeof (struct in6_addr), mstate, vstate)) {
+ regs[rd] = 0;
+ break;
+ }
+
+ /*
+ * Safely load the IPv6 address.
+ */
+ dtrace_bcopy(
+ (void *)(uintptr_t)tupregs[argi].dttk_value,
+ (void *)(uintptr_t)&ip6, sizeof (struct in6_addr));
+
+ /*
+			 * Check that an IPv6 string will fit in scratch.
+ */
+ size = INET6_ADDRSTRLEN;
+ if (!DTRACE_INSCRATCH(mstate, size)) {
+ DTRACE_CPUFLAG_SET(CPU_DTRACE_NOSCRATCH);
+ regs[rd] = 0;
+ break;
+ }
+ base = (char *)mstate->dtms_scratch_ptr;
+ end = (char *)mstate->dtms_scratch_ptr + size - 1;
+ *end-- = '\0';
+
+ /*
+ * Find the longest run of 16 bit zero values
+ * for the single allowed zero compression - "::".
+ */
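+			/*
+			 * firstzero will hold the byte offset of the longest
+			 * such run and numzero its length in bytes; runs are
+			 * only considered when they start on a 16-bit group
+			 * boundary, and a later run must be strictly longer
+			 * to displace an earlier one.
+			 */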
+ firstzero = -1;
+ tryzero = -1;
+ numzero = 1;
+ for (i = 0; i < sizeof (struct in6_addr); i++) {
+#ifdef illumos
+ if (ip6._S6_un._S6_u8[i] == 0 &&
+#else
+ if (ip6.__u6_addr.__u6_addr8[i] == 0 &&
+#endif
+ tryzero == -1 && i % 2 == 0) {
+ tryzero = i;
+ continue;
+ }
+
+ if (tryzero != -1 &&
+#ifdef illumos
+ (ip6._S6_un._S6_u8[i] != 0 ||
+#else
+ (ip6.__u6_addr.__u6_addr8[i] != 0 ||
+#endif
+ i == sizeof (struct in6_addr) - 1)) {
+
+ if (i - tryzero <= numzero) {
+ tryzero = -1;
+ continue;
+ }
+
+ firstzero = tryzero;
+ numzero = i - i % 2 - tryzero;
+ tryzero = -1;
+
+#ifdef illumos
+ if (ip6._S6_un._S6_u8[i] == 0 &&
+#else
+ if (ip6.__u6_addr.__u6_addr8[i] == 0 &&
+#endif
+ i == sizeof (struct in6_addr) - 1)
+ numzero += 2;
+ }
+ }
+ ASSERT(firstzero + numzero <= sizeof (struct in6_addr));
+
+ /*
+ * Check for an IPv4 embedded address.
+ */
+ v6end = sizeof (struct in6_addr) - 2;
+ if (IN6_IS_ADDR_V4MAPPED(&ip6) ||
+ IN6_IS_ADDR_V4COMPAT(&ip6)) {
+ for (i = sizeof (struct in6_addr) - 1;
+ i >= DTRACE_V4MAPPED_OFFSET; i--) {
+ ASSERT(end >= base);
+
+#ifdef illumos
+ val = ip6._S6_un._S6_u8[i];
+#else
+ val = ip6.__u6_addr.__u6_addr8[i];
+#endif
+
+ if (val == 0) {
+ *end-- = '0';
+ } else {
+ for (; val; val /= 10) {
+ *end-- = '0' + val % 10;
+ }
+ }
+
+ if (i > DTRACE_V4MAPPED_OFFSET)
+ *end-- = '.';
+ }
+
+ if (subr == DIF_SUBR_INET_NTOA6)
+ goto inetout;
+
+ /*
+ * Set v6end to skip the IPv4 address that
+ * we have already stringified.
+ */
+ v6end = 10;
+ }
+
+ /*
+ * Build the IPv6 string by working through the
+ * address in reverse.
+ */
+ for (i = v6end; i >= 0; i -= 2) {
+ ASSERT(end >= base);
+
+ if (i == firstzero + numzero - 2) {
+ *end-- = ':';
+ *end-- = ':';
+ i -= numzero - 2;
+ continue;
+ }
+
+ if (i < 14 && i != firstzero - 2)
+ *end-- = ':';
+
+#ifdef illumos
+ val = (ip6._S6_un._S6_u8[i] << 8) +
+ ip6._S6_un._S6_u8[i + 1];
+#else
+ val = (ip6.__u6_addr.__u6_addr8[i] << 8) +
+ ip6.__u6_addr.__u6_addr8[i + 1];
+#endif
+
+ if (val == 0) {
+ *end-- = '0';
+ } else {
+ for (; val; val /= 16) {
+ *end-- = digits[val % 16];
+ }
+ }
+ }
+ ASSERT(end + 1 >= base);
+
+ } else {
+ /*
+			 * The user didn't use AF_INET or AF_INET6.
+ */
+ DTRACE_CPUFLAG_SET(CPU_DTRACE_ILLOP);
+ regs[rd] = 0;
+ break;
+ }
+
+inetout: regs[rd] = (uintptr_t)end + 1;
+ mstate->dtms_scratch_ptr += size;
+ break;
+ }
+
+ case DIF_SUBR_MEMREF: {
+ uintptr_t size = 2 * sizeof(uintptr_t);
+		uintptr_t *memref = (uintptr_t *)P2ROUNDUP(
+		    mstate->dtms_scratch_ptr, sizeof(uintptr_t));
+		size_t scratch_size = ((uintptr_t)memref -
+		    mstate->dtms_scratch_ptr) + size;
+
+		if (!DTRACE_INSCRATCH(mstate, scratch_size)) {
+			DTRACE_CPUFLAG_SET(CPU_DTRACE_NOSCRATCH);
+			regs[rd] = 0;
+			break;
+		}
+
+		/* address and length */
+		memref[0] = tupregs[0].dttk_value;
+		memref[1] = tupregs[1].dttk_value;
+
+ regs[rd] = (uintptr_t) memref;
+ mstate->dtms_scratch_ptr += scratch_size;
+ break;
+ }
+
+#ifndef illumos
+ case DIF_SUBR_MEMSTR: {
+ char *str = (char *)mstate->dtms_scratch_ptr;
+ uintptr_t mem = tupregs[0].dttk_value;
+ char c = tupregs[1].dttk_value;
+ size_t size = tupregs[2].dttk_value;
+ uint8_t n;
+ int i;
+
+ regs[rd] = 0;
+
+ if (size == 0)
+ break;
+
+ if (!dtrace_canload(mem, size - 1, mstate, vstate))
+ break;
+
+ if (!DTRACE_INSCRATCH(mstate, size)) {
+ DTRACE_CPUFLAG_SET(CPU_DTRACE_NOSCRATCH);
+ break;
+ }
+
+ if (dtrace_memstr_max != 0 && size > dtrace_memstr_max) {
+ *flags |= CPU_DTRACE_ILLOP;
+ break;
+ }
+
+ for (i = 0; i < size - 1; i++) {
+ n = dtrace_load8(mem++);
+ str[i] = (n == 0) ? c : n;
+ }
+ str[size - 1] = 0;
+
+ regs[rd] = (uintptr_t)str;
+ mstate->dtms_scratch_ptr += size;
+ break;
+ }
+#endif
+ }
+}
+
+/*
+ * Emulate the execution of DTrace IR instructions specified by the given
+ * DIF object. This function is deliberately void of assertions as all of
+ * the necessary checks are handled by a call to dtrace_difo_validate().
+ */
+static uint64_t
+dtrace_dif_emulate(dtrace_difo_t *difo, dtrace_mstate_t *mstate,
+ dtrace_vstate_t *vstate, dtrace_state_t *state)
+{
+ const dif_instr_t *text = difo->dtdo_buf;
+ const uint_t textlen = difo->dtdo_len;
+ const char *strtab = difo->dtdo_strtab;
+ const uint64_t *inttab = difo->dtdo_inttab;
+
+ uint64_t rval = 0;
+ dtrace_statvar_t *svar;
+ dtrace_dstate_t *dstate = &vstate->dtvs_dynvars;
+ dtrace_difv_t *v;
+ volatile uint16_t *flags = &cpu_core[curcpu].cpuc_dtrace_flags;
+ volatile uintptr_t *illval = &cpu_core[curcpu].cpuc_dtrace_illval;
+
+ dtrace_key_t tupregs[DIF_DTR_NREGS + 2]; /* +2 for thread and id */
+ uint64_t regs[DIF_DIR_NREGS];
+ uint64_t *tmp;
+
+ uint8_t cc_n = 0, cc_z = 0, cc_v = 0, cc_c = 0;
+ int64_t cc_r;
+ uint_t pc = 0, id, opc = 0;
+ uint8_t ttop = 0;
+ dif_instr_t instr;
+ uint_t r1, r2, rd;
+
+ /*
+ * We stash the current DIF object into the machine state: we need it
+ * for subsequent access checking.
+ */
+ mstate->dtms_difo = difo;
+
+ regs[DIF_REG_R0] = 0; /* %r0 is fixed at zero */
+
+ while (pc < textlen && !(*flags & CPU_DTRACE_FAULT)) {
+ opc = pc;
+
+ instr = text[pc++];
+ r1 = DIF_INSTR_R1(instr);
+ r2 = DIF_INSTR_R2(instr);
+ rd = DIF_INSTR_RD(instr);
+
+ switch (DIF_INSTR_OP(instr)) {
+ case DIF_OP_OR:
+ regs[rd] = regs[r1] | regs[r2];
+ break;
+ case DIF_OP_XOR:
+ regs[rd] = regs[r1] ^ regs[r2];
+ break;
+ case DIF_OP_AND:
+ regs[rd] = regs[r1] & regs[r2];
+ break;
+ case DIF_OP_SLL:
+ regs[rd] = regs[r1] << regs[r2];
+ break;
+ case DIF_OP_SRL:
+ regs[rd] = regs[r1] >> regs[r2];
+ break;
+ case DIF_OP_SUB:
+ regs[rd] = regs[r1] - regs[r2];
+ break;
+ case DIF_OP_ADD:
+ regs[rd] = regs[r1] + regs[r2];
+ break;
+ case DIF_OP_MUL:
+ regs[rd] = regs[r1] * regs[r2];
+ break;
+ case DIF_OP_SDIV:
+ if (regs[r2] == 0) {
+ regs[rd] = 0;
+ *flags |= CPU_DTRACE_DIVZERO;
+ } else {
+ regs[rd] = (int64_t)regs[r1] /
+ (int64_t)regs[r2];
+ }
+ break;
+
+ case DIF_OP_UDIV:
+ if (regs[r2] == 0) {
+ regs[rd] = 0;
+ *flags |= CPU_DTRACE_DIVZERO;
+ } else {
+ regs[rd] = regs[r1] / regs[r2];
+ }
+ break;
+
+ case DIF_OP_SREM:
+ if (regs[r2] == 0) {
+ regs[rd] = 0;
+ *flags |= CPU_DTRACE_DIVZERO;
+ } else {
+ regs[rd] = (int64_t)regs[r1] %
+ (int64_t)regs[r2];
+ }
+ break;
+
+ case DIF_OP_UREM:
+ if (regs[r2] == 0) {
+ regs[rd] = 0;
+ *flags |= CPU_DTRACE_DIVZERO;
+ } else {
+ regs[rd] = regs[r1] % regs[r2];
+ }
+ break;
+
+ case DIF_OP_NOT:
+ regs[rd] = ~regs[r1];
+ break;
+ case DIF_OP_MOV:
+ regs[rd] = regs[r1];
+ break;
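+		/*
+		 * The comparison operators below emulate a conventional
+		 * NZVC condition-code register: the signed branches (BG,
+		 * BGE, BL, BLE) key off of N xor V, while the unsigned
+		 * variants (BGU, BGEU, BLU, BLEU) key off of the carry
+		 * bit C computed by DIF_OP_CMP.
+		 */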
+ case DIF_OP_CMP:
+ cc_r = regs[r1] - regs[r2];
+ cc_n = cc_r < 0;
+ cc_z = cc_r == 0;
+ cc_v = 0;
+ cc_c = regs[r1] < regs[r2];
+ break;
+ case DIF_OP_TST:
+ cc_n = cc_v = cc_c = 0;
+ cc_z = regs[r1] == 0;
+ break;
+ case DIF_OP_BA:
+ pc = DIF_INSTR_LABEL(instr);
+ break;
+ case DIF_OP_BE:
+ if (cc_z)
+ pc = DIF_INSTR_LABEL(instr);
+ break;
+ case DIF_OP_BNE:
+ if (cc_z == 0)
+ pc = DIF_INSTR_LABEL(instr);
+ break;
+ case DIF_OP_BG:
+ if ((cc_z | (cc_n ^ cc_v)) == 0)
+ pc = DIF_INSTR_LABEL(instr);
+ break;
+ case DIF_OP_BGU:
+ if ((cc_c | cc_z) == 0)
+ pc = DIF_INSTR_LABEL(instr);
+ break;
+ case DIF_OP_BGE:
+ if ((cc_n ^ cc_v) == 0)
+ pc = DIF_INSTR_LABEL(instr);
+ break;
+ case DIF_OP_BGEU:
+ if (cc_c == 0)
+ pc = DIF_INSTR_LABEL(instr);
+ break;
+ case DIF_OP_BL:
+ if (cc_n ^ cc_v)
+ pc = DIF_INSTR_LABEL(instr);
+ break;
+ case DIF_OP_BLU:
+ if (cc_c)
+ pc = DIF_INSTR_LABEL(instr);
+ break;
+ case DIF_OP_BLE:
+ if (cc_z | (cc_n ^ cc_v))
+ pc = DIF_INSTR_LABEL(instr);
+ break;
+ case DIF_OP_BLEU:
+ if (cc_c | cc_z)
+ pc = DIF_INSTR_LABEL(instr);
+ break;
+ case DIF_OP_RLDSB:
+ if (!dtrace_canload(regs[r1], 1, mstate, vstate))
+ break;
+ /*FALLTHROUGH*/
+ case DIF_OP_LDSB:
+ regs[rd] = (int8_t)dtrace_load8(regs[r1]);
+ break;
+ case DIF_OP_RLDSH:
+ if (!dtrace_canload(regs[r1], 2, mstate, vstate))
+ break;
+ /*FALLTHROUGH*/
+ case DIF_OP_LDSH:
+ regs[rd] = (int16_t)dtrace_load16(regs[r1]);
+ break;
+ case DIF_OP_RLDSW:
+ if (!dtrace_canload(regs[r1], 4, mstate, vstate))
+ break;
+ /*FALLTHROUGH*/
+ case DIF_OP_LDSW:
+ regs[rd] = (int32_t)dtrace_load32(regs[r1]);
+ break;
+ case DIF_OP_RLDUB:
+ if (!dtrace_canload(regs[r1], 1, mstate, vstate))
+ break;
+ /*FALLTHROUGH*/
+ case DIF_OP_LDUB:
+ regs[rd] = dtrace_load8(regs[r1]);
+ break;
+ case DIF_OP_RLDUH:
+ if (!dtrace_canload(regs[r1], 2, mstate, vstate))
+ break;
+ /*FALLTHROUGH*/
+ case DIF_OP_LDUH:
+ regs[rd] = dtrace_load16(regs[r1]);
+ break;
+ case DIF_OP_RLDUW:
+ if (!dtrace_canload(regs[r1], 4, mstate, vstate))
+ break;
+ /*FALLTHROUGH*/
+ case DIF_OP_LDUW:
+ regs[rd] = dtrace_load32(regs[r1]);
+ break;
+ case DIF_OP_RLDX:
+ if (!dtrace_canload(regs[r1], 8, mstate, vstate))
+ break;
+ /*FALLTHROUGH*/
+ case DIF_OP_LDX:
+ regs[rd] = dtrace_load64(regs[r1]);
+ break;
+ case DIF_OP_ULDSB:
+ DTRACE_CPUFLAG_SET(CPU_DTRACE_NOFAULT);
+ regs[rd] = (int8_t)
+ dtrace_fuword8((void *)(uintptr_t)regs[r1]);
+ DTRACE_CPUFLAG_CLEAR(CPU_DTRACE_NOFAULT);
+ break;
+ case DIF_OP_ULDSH:
+ DTRACE_CPUFLAG_SET(CPU_DTRACE_NOFAULT);
+ regs[rd] = (int16_t)
+ dtrace_fuword16((void *)(uintptr_t)regs[r1]);
+ DTRACE_CPUFLAG_CLEAR(CPU_DTRACE_NOFAULT);
+ break;
+ case DIF_OP_ULDSW:
+ DTRACE_CPUFLAG_SET(CPU_DTRACE_NOFAULT);
+ regs[rd] = (int32_t)
+ dtrace_fuword32((void *)(uintptr_t)regs[r1]);
+ DTRACE_CPUFLAG_CLEAR(CPU_DTRACE_NOFAULT);
+ break;
+ case DIF_OP_ULDUB:
+ DTRACE_CPUFLAG_SET(CPU_DTRACE_NOFAULT);
+ regs[rd] =
+ dtrace_fuword8((void *)(uintptr_t)regs[r1]);
+ DTRACE_CPUFLAG_CLEAR(CPU_DTRACE_NOFAULT);
+ break;
+ case DIF_OP_ULDUH:
+ DTRACE_CPUFLAG_SET(CPU_DTRACE_NOFAULT);
+ regs[rd] =
+ dtrace_fuword16((void *)(uintptr_t)regs[r1]);
+ DTRACE_CPUFLAG_CLEAR(CPU_DTRACE_NOFAULT);
+ break;
+ case DIF_OP_ULDUW:
+ DTRACE_CPUFLAG_SET(CPU_DTRACE_NOFAULT);
+ regs[rd] =
+ dtrace_fuword32((void *)(uintptr_t)regs[r1]);
+ DTRACE_CPUFLAG_CLEAR(CPU_DTRACE_NOFAULT);
+ break;
+ case DIF_OP_ULDX:
+ DTRACE_CPUFLAG_SET(CPU_DTRACE_NOFAULT);
+ regs[rd] =
+ dtrace_fuword64((void *)(uintptr_t)regs[r1]);
+ DTRACE_CPUFLAG_CLEAR(CPU_DTRACE_NOFAULT);
+ break;
+ case DIF_OP_RET:
+ rval = regs[rd];
+ pc = textlen;
+ break;
+ case DIF_OP_NOP:
+ break;
+ case DIF_OP_SETX:
+ regs[rd] = inttab[DIF_INSTR_INTEGER(instr)];
+ break;
+ case DIF_OP_SETS:
+ regs[rd] = (uint64_t)(uintptr_t)
+ (strtab + DIF_INSTR_STRING(instr));
+ break;
+ case DIF_OP_SCMP: {
+ size_t sz = state->dts_options[DTRACEOPT_STRSIZE];
+ uintptr_t s1 = regs[r1];
+ uintptr_t s2 = regs[r2];
+ size_t lim1, lim2;
+
+ if (s1 != 0 &&
+ !dtrace_strcanload(s1, sz, &lim1, mstate, vstate))
+ break;
+ if (s2 != 0 &&
+ !dtrace_strcanload(s2, sz, &lim2, mstate, vstate))
+ break;
+
+ cc_r = dtrace_strncmp((char *)s1, (char *)s2,
+ MIN(lim1, lim2));
+
+ cc_n = cc_r < 0;
+ cc_z = cc_r == 0;
+ cc_v = cc_c = 0;
+ break;
+ }
+ case DIF_OP_LDGA:
+ regs[rd] = dtrace_dif_variable(mstate, state,
+ r1, regs[r2]);
+ break;
+ case DIF_OP_LDGS:
+ id = DIF_INSTR_VAR(instr);
+
+ if (id >= DIF_VAR_OTHER_UBASE) {
+ uintptr_t a;
+
+ id -= DIF_VAR_OTHER_UBASE;
+ svar = vstate->dtvs_globals[id];
+ ASSERT(svar != NULL);
+ v = &svar->dtsv_var;
+
+ if (!(v->dtdv_type.dtdt_flags & DIF_TF_BYREF)) {
+ regs[rd] = svar->dtsv_data;
+ break;
+ }
+
+ a = (uintptr_t)svar->dtsv_data;
+
+ if (*(uint8_t *)a == UINT8_MAX) {
+ /*
+ * If the 0th byte is set to UINT8_MAX
+ * then this is to be treated as a
+ * reference to a NULL variable.
+ */
+ regs[rd] = 0;
+ } else {
+ regs[rd] = a + sizeof (uint64_t);
+ }
+
+ break;
+ }
+
+ regs[rd] = dtrace_dif_variable(mstate, state, id, 0);
+ break;
+
+ case DIF_OP_STGS:
+ id = DIF_INSTR_VAR(instr);
+
+ ASSERT(id >= DIF_VAR_OTHER_UBASE);
+ id -= DIF_VAR_OTHER_UBASE;
+
+ VERIFY(id < vstate->dtvs_nglobals);
+ svar = vstate->dtvs_globals[id];
+ ASSERT(svar != NULL);
+ v = &svar->dtsv_var;
+
+ if (v->dtdv_type.dtdt_flags & DIF_TF_BYREF) {
+ uintptr_t a = (uintptr_t)svar->dtsv_data;
+ size_t lim;
+
+ ASSERT(a != 0);
+ ASSERT(svar->dtsv_size != 0);
+
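+				/*
+				 * Storing NULL sets the 0th byte to UINT8_MAX
+				 * so that a subsequent load treats this as a
+				 * reference to a NULL variable; otherwise we
+				 * clear that tag byte and copy the value in
+				 * just past it.
+				 */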
+ if (regs[rd] == 0) {
+ *(uint8_t *)a = UINT8_MAX;
+ break;
+ } else {
+ *(uint8_t *)a = 0;
+ a += sizeof (uint64_t);
+ }
+ if (!dtrace_vcanload(
+ (void *)(uintptr_t)regs[rd], &v->dtdv_type,
+ &lim, mstate, vstate))
+ break;
+
+ dtrace_vcopy((void *)(uintptr_t)regs[rd],
+ (void *)a, &v->dtdv_type, lim);
+ break;
+ }
+
+ svar->dtsv_data = regs[rd];
+ break;
+
+ case DIF_OP_LDTA:
+ /*
+ * There are no DTrace built-in thread-local arrays at
+ * present. This opcode is saved for future work.
+ */
+ *flags |= CPU_DTRACE_ILLOP;
+ regs[rd] = 0;
+ break;
+
+ case DIF_OP_LDLS:
+ id = DIF_INSTR_VAR(instr);
+
+ if (id < DIF_VAR_OTHER_UBASE) {
+ /*
+ * For now, this has no meaning.
+ */
+ regs[rd] = 0;
+ break;
+ }
+
+ id -= DIF_VAR_OTHER_UBASE;
+
+ ASSERT(id < vstate->dtvs_nlocals);
+ ASSERT(vstate->dtvs_locals != NULL);
+
+ svar = vstate->dtvs_locals[id];
+ ASSERT(svar != NULL);
+ v = &svar->dtsv_var;
+
+ if (v->dtdv_type.dtdt_flags & DIF_TF_BYREF) {
+ uintptr_t a = (uintptr_t)svar->dtsv_data;
+ size_t sz = v->dtdv_type.dtdt_size;
+ size_t lim;
+
+ sz += sizeof (uint64_t);
+ ASSERT(svar->dtsv_size == NCPU * sz);
+ a += curcpu * sz;
+
+ if (*(uint8_t *)a == UINT8_MAX) {
+ /*
+ * If the 0th byte is set to UINT8_MAX
+ * then this is to be treated as a
+ * reference to a NULL variable.
+ */
+ regs[rd] = 0;
+ } else {
+ regs[rd] = a + sizeof (uint64_t);
+ }
+
+ break;
+ }
+
+ ASSERT(svar->dtsv_size == NCPU * sizeof (uint64_t));
+ tmp = (uint64_t *)(uintptr_t)svar->dtsv_data;
+ regs[rd] = tmp[curcpu];
+ break;
+
+ case DIF_OP_STLS:
+ id = DIF_INSTR_VAR(instr);
+
+ ASSERT(id >= DIF_VAR_OTHER_UBASE);
+ id -= DIF_VAR_OTHER_UBASE;
+ VERIFY(id < vstate->dtvs_nlocals);
+
+ ASSERT(vstate->dtvs_locals != NULL);
+ svar = vstate->dtvs_locals[id];
+ ASSERT(svar != NULL);
+ v = &svar->dtsv_var;
+
+ if (v->dtdv_type.dtdt_flags & DIF_TF_BYREF) {
+ uintptr_t a = (uintptr_t)svar->dtsv_data;
+ size_t sz = v->dtdv_type.dtdt_size;
+ size_t lim;
+
+ sz += sizeof (uint64_t);
+ ASSERT(svar->dtsv_size == NCPU * sz);
+ a += curcpu * sz;
+
+ if (regs[rd] == 0) {
+ *(uint8_t *)a = UINT8_MAX;
+ break;
+ } else {
+ *(uint8_t *)a = 0;
+ a += sizeof (uint64_t);
+ }
+
+ if (!dtrace_vcanload(
+ (void *)(uintptr_t)regs[rd], &v->dtdv_type,
+ &lim, mstate, vstate))
+ break;
+
+ dtrace_vcopy((void *)(uintptr_t)regs[rd],
+ (void *)a, &v->dtdv_type, lim);
+ break;
+ }
+
+ ASSERT(svar->dtsv_size == NCPU * sizeof (uint64_t));
+ tmp = (uint64_t *)(uintptr_t)svar->dtsv_data;
+ tmp[curcpu] = regs[rd];
+ break;
+
+ case DIF_OP_LDTS: {
+ dtrace_dynvar_t *dvar;
+ dtrace_key_t *key;
+
+ id = DIF_INSTR_VAR(instr);
+ ASSERT(id >= DIF_VAR_OTHER_UBASE);
+ id -= DIF_VAR_OTHER_UBASE;
+ v = &vstate->dtvs_tlocals[id];
+
+ key = &tupregs[DIF_DTR_NREGS];
+ key[0].dttk_value = (uint64_t)id;
+ key[0].dttk_size = 0;
+ DTRACE_TLS_THRKEY(key[1].dttk_value);
+ key[1].dttk_size = 0;
+
+ dvar = dtrace_dynvar(dstate, 2, key,
+ sizeof (uint64_t), DTRACE_DYNVAR_NOALLOC,
+ mstate, vstate);
+
+ if (dvar == NULL) {
+ regs[rd] = 0;
+ break;
+ }
+
+ if (v->dtdv_type.dtdt_flags & DIF_TF_BYREF) {
+ regs[rd] = (uint64_t)(uintptr_t)dvar->dtdv_data;
+ } else {
+ regs[rd] = *((uint64_t *)dvar->dtdv_data);
+ }
+
+ break;
+ }
+
+ case DIF_OP_STTS: {
+ dtrace_dynvar_t *dvar;
+ dtrace_key_t *key;
+
+ id = DIF_INSTR_VAR(instr);
+ ASSERT(id >= DIF_VAR_OTHER_UBASE);
+ id -= DIF_VAR_OTHER_UBASE;
+ VERIFY(id < vstate->dtvs_ntlocals);
+
+ key = &tupregs[DIF_DTR_NREGS];
+ key[0].dttk_value = (uint64_t)id;
+ key[0].dttk_size = 0;
+ DTRACE_TLS_THRKEY(key[1].dttk_value);
+ key[1].dttk_size = 0;
+ v = &vstate->dtvs_tlocals[id];
+
+ dvar = dtrace_dynvar(dstate, 2, key,
+ v->dtdv_type.dtdt_size > sizeof (uint64_t) ?
+ v->dtdv_type.dtdt_size : sizeof (uint64_t),
+ regs[rd] ? DTRACE_DYNVAR_ALLOC :
+ DTRACE_DYNVAR_DEALLOC, mstate, vstate);
+
+ /*
+ * Given that we're storing to thread-local data,
+ * we need to flush our predicate cache.
+ */
+ curthread->t_predcache = 0;
+
+ if (dvar == NULL)
+ break;
+
+ if (v->dtdv_type.dtdt_flags & DIF_TF_BYREF) {
+ size_t lim;
+
+ if (!dtrace_vcanload(
+ (void *)(uintptr_t)regs[rd],
+ &v->dtdv_type, &lim, mstate, vstate))
+ break;
+
+ dtrace_vcopy((void *)(uintptr_t)regs[rd],
+ dvar->dtdv_data, &v->dtdv_type, lim);
+ } else {
+ *((uint64_t *)dvar->dtdv_data) = regs[rd];
+ }
+
+ break;
+ }
+
+ case DIF_OP_SRA:
+ regs[rd] = (int64_t)regs[r1] >> regs[r2];
+ break;
+
+ case DIF_OP_CALL:
+ dtrace_dif_subr(DIF_INSTR_SUBR(instr), rd,
+ regs, tupregs, ttop, mstate, state);
+ break;
+
+ case DIF_OP_PUSHTR:
+ if (ttop == DIF_DTR_NREGS) {
+ *flags |= CPU_DTRACE_TUPOFLOW;
+ break;
+ }
+
+ if (r1 == DIF_TYPE_STRING) {
+ /*
+ * If this is a string type and the size is 0,
+ * we'll use the system-wide default string
+ * size. Note that we are _not_ looking at
+ * the value of the DTRACEOPT_STRSIZE option;
+ * had this been set, we would expect to have
+ * a non-zero size value in the "pushtr".
+ */
+ tupregs[ttop].dttk_size =
+ dtrace_strlen((char *)(uintptr_t)regs[rd],
+ regs[r2] ? regs[r2] :
+ dtrace_strsize_default) + 1;
+ } else {
+ if (regs[r2] > LONG_MAX) {
+ *flags |= CPU_DTRACE_ILLOP;
+ break;
+ }
+
+ tupregs[ttop].dttk_size = regs[r2];
+ }
+
+ tupregs[ttop++].dttk_value = regs[rd];
+ break;
+
+ case DIF_OP_PUSHTV:
+ if (ttop == DIF_DTR_NREGS) {
+ *flags |= CPU_DTRACE_TUPOFLOW;
+ break;
+ }
+
+ tupregs[ttop].dttk_value = regs[rd];
+ tupregs[ttop++].dttk_size = 0;
+ break;
+
+ case DIF_OP_POPTS:
+ if (ttop != 0)
+ ttop--;
+ break;
+
+ case DIF_OP_FLUSHTS:
+ ttop = 0;
+ break;
+
+ case DIF_OP_LDGAA:
+ case DIF_OP_LDTAA: {
+ dtrace_dynvar_t *dvar;
+ dtrace_key_t *key = tupregs;
+ uint_t nkeys = ttop;
+
+ id = DIF_INSTR_VAR(instr);
+ ASSERT(id >= DIF_VAR_OTHER_UBASE);
+ id -= DIF_VAR_OTHER_UBASE;
+
+ key[nkeys].dttk_value = (uint64_t)id;
+ key[nkeys++].dttk_size = 0;
+
+ if (DIF_INSTR_OP(instr) == DIF_OP_LDTAA) {
+ DTRACE_TLS_THRKEY(key[nkeys].dttk_value);
+ key[nkeys++].dttk_size = 0;
+ VERIFY(id < vstate->dtvs_ntlocals);
+ v = &vstate->dtvs_tlocals[id];
+ } else {
+ VERIFY(id < vstate->dtvs_nglobals);
+ v = &vstate->dtvs_globals[id]->dtsv_var;
+ }
+
+ dvar = dtrace_dynvar(dstate, nkeys, key,
+ v->dtdv_type.dtdt_size > sizeof (uint64_t) ?
+ v->dtdv_type.dtdt_size : sizeof (uint64_t),
+ DTRACE_DYNVAR_NOALLOC, mstate, vstate);
+
+ if (dvar == NULL) {
+ regs[rd] = 0;
+ break;
+ }
+
+ if (v->dtdv_type.dtdt_flags & DIF_TF_BYREF) {
+ regs[rd] = (uint64_t)(uintptr_t)dvar->dtdv_data;
+ } else {
+ regs[rd] = *((uint64_t *)dvar->dtdv_data);
+ }
+
+ break;
+ }
+
+ case DIF_OP_STGAA:
+ case DIF_OP_STTAA: {
+ dtrace_dynvar_t *dvar;
+ dtrace_key_t *key = tupregs;
+ uint_t nkeys = ttop;
+
+ id = DIF_INSTR_VAR(instr);
+ ASSERT(id >= DIF_VAR_OTHER_UBASE);
+ id -= DIF_VAR_OTHER_UBASE;
+
+ key[nkeys].dttk_value = (uint64_t)id;
+ key[nkeys++].dttk_size = 0;
+
+ if (DIF_INSTR_OP(instr) == DIF_OP_STTAA) {
+ DTRACE_TLS_THRKEY(key[nkeys].dttk_value);
+ key[nkeys++].dttk_size = 0;
+ VERIFY(id < vstate->dtvs_ntlocals);
+ v = &vstate->dtvs_tlocals[id];
+ } else {
+ VERIFY(id < vstate->dtvs_nglobals);
+ v = &vstate->dtvs_globals[id]->dtsv_var;
+ }
+
+ dvar = dtrace_dynvar(dstate, nkeys, key,
+ v->dtdv_type.dtdt_size > sizeof (uint64_t) ?
+ v->dtdv_type.dtdt_size : sizeof (uint64_t),
+ regs[rd] ? DTRACE_DYNVAR_ALLOC :
+ DTRACE_DYNVAR_DEALLOC, mstate, vstate);
+
+ if (dvar == NULL)
+ break;
+
+ if (v->dtdv_type.dtdt_flags & DIF_TF_BYREF) {
+ size_t lim;
+
+ if (!dtrace_vcanload(
+ (void *)(uintptr_t)regs[rd], &v->dtdv_type,
+ &lim, mstate, vstate))
+ break;
+
+ dtrace_vcopy((void *)(uintptr_t)regs[rd],
+ dvar->dtdv_data, &v->dtdv_type, lim);
+ } else {
+ *((uint64_t *)dvar->dtdv_data) = regs[rd];
+ }
+
+ break;
+ }
+
+ case DIF_OP_ALLOCS: {
+ uintptr_t ptr = P2ROUNDUP(mstate->dtms_scratch_ptr, 8);
+ size_t size = ptr - mstate->dtms_scratch_ptr + regs[r1];
+
+ /*
+			 * Rounding up the user allocation size could have
+			 * overflowed a large, bogus allocation (like -1ULL)
+			 * to 0.
+ */
+ if (size < regs[r1] ||
+ !DTRACE_INSCRATCH(mstate, size)) {
+ DTRACE_CPUFLAG_SET(CPU_DTRACE_NOSCRATCH);
+ regs[rd] = 0;
+ break;
+ }
+
+ dtrace_bzero((void *) mstate->dtms_scratch_ptr, size);
+ mstate->dtms_scratch_ptr += size;
+ regs[rd] = ptr;
+ break;
+ }
+
+ case DIF_OP_COPYS:
+ if (!dtrace_canstore(regs[rd], regs[r2],
+ mstate, vstate)) {
+ *flags |= CPU_DTRACE_BADADDR;
+ *illval = regs[rd];
+ break;
+ }
+
+ if (!dtrace_canload(regs[r1], regs[r2], mstate, vstate))
+ break;
+
+ dtrace_bcopy((void *)(uintptr_t)regs[r1],
+ (void *)(uintptr_t)regs[rd], (size_t)regs[r2]);
+ break;
+
+ case DIF_OP_STB:
+ if (!dtrace_canstore(regs[rd], 1, mstate, vstate)) {
+ *flags |= CPU_DTRACE_BADADDR;
+ *illval = regs[rd];
+ break;
+ }
+ *((uint8_t *)(uintptr_t)regs[rd]) = (uint8_t)regs[r1];
+ break;
+
+ case DIF_OP_STH:
+ if (!dtrace_canstore(regs[rd], 2, mstate, vstate)) {
+ *flags |= CPU_DTRACE_BADADDR;
+ *illval = regs[rd];
+ break;
+ }
+ if (regs[rd] & 1) {
+ *flags |= CPU_DTRACE_BADALIGN;
+ *illval = regs[rd];
+ break;
+ }
+ *((uint16_t *)(uintptr_t)regs[rd]) = (uint16_t)regs[r1];
+ break;
+
+ case DIF_OP_STW:
+ if (!dtrace_canstore(regs[rd], 4, mstate, vstate)) {
+ *flags |= CPU_DTRACE_BADADDR;
+ *illval = regs[rd];
+ break;
+ }
+ if (regs[rd] & 3) {
+ *flags |= CPU_DTRACE_BADALIGN;
+ *illval = regs[rd];
+ break;
+ }
+ *((uint32_t *)(uintptr_t)regs[rd]) = (uint32_t)regs[r1];
+ break;
+
+ case DIF_OP_STX:
+ if (!dtrace_canstore(regs[rd], 8, mstate, vstate)) {
+ *flags |= CPU_DTRACE_BADADDR;
+ *illval = regs[rd];
+ break;
+ }
+ if (regs[rd] & 7) {
+ *flags |= CPU_DTRACE_BADALIGN;
+ *illval = regs[rd];
+ break;
+ }
+ *((uint64_t *)(uintptr_t)regs[rd]) = regs[r1];
+ break;
+ }
+ }
+
+ if (!(*flags & CPU_DTRACE_FAULT))
+ return (rval);
+
+ mstate->dtms_fltoffs = opc * sizeof (dif_instr_t);
+ mstate->dtms_present |= DTRACE_MSTATE_FLTOFFS;
+
+ return (0);
+}
+
+static void
+dtrace_action_breakpoint(dtrace_ecb_t *ecb)
+{
+ dtrace_probe_t *probe = ecb->dte_probe;
+ dtrace_provider_t *prov = probe->dtpr_provider;
+ char c[DTRACE_FULLNAMELEN + 80], *str;
+ char *msg = "dtrace: breakpoint action at probe ";
+ char *ecbmsg = " (ecb ";
+ uintptr_t mask = (0xf << (sizeof (uintptr_t) * NBBY / 4));
+ uintptr_t val = (uintptr_t)ecb;
+ int shift = (sizeof (uintptr_t) * NBBY) - 4, i = 0;
+
+ if (dtrace_destructive_disallow)
+ return;
+
+ /*
+ * It's impossible to be taking action on the NULL probe.
+ */
+ ASSERT(probe != NULL);
+
+ /*
+ * This is a poor man's (destitute man's?) sprintf(): we want to
+ * print the provider name, module name, function name and name of
+ * the probe, along with the hex address of the ECB with the breakpoint
+ * action -- all of which we must place in the character buffer by
+ * hand.
+ */
+ while (*msg != '\0')
+ c[i++] = *msg++;
+
+ for (str = prov->dtpv_name; *str != '\0'; str++)
+ c[i++] = *str;
+ c[i++] = ':';
+
+ for (str = probe->dtpr_mod; *str != '\0'; str++)
+ c[i++] = *str;
+ c[i++] = ':';
+
+ for (str = probe->dtpr_func; *str != '\0'; str++)
+ c[i++] = *str;
+ c[i++] = ':';
+
+ for (str = probe->dtpr_name; *str != '\0'; str++)
+ c[i++] = *str;
+
+ while (*ecbmsg != '\0')
+ c[i++] = *ecbmsg++;
+
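+	/*
+	 * Emit the ECB address in hex, most-significant nibble first,
+	 * suppressing leading zeroes.
+	 */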
+ while (shift >= 0) {
+ mask = (uintptr_t)0xf << shift;
+
+ if (val >= ((uintptr_t)1 << shift))
+ c[i++] = "0123456789abcdef"[(val & mask) >> shift];
+ shift -= 4;
+ }
+
+ c[i++] = ')';
+ c[i] = '\0';
+
+#ifdef illumos
+ debug_enter(c);
+#else
+ kdb_enter(KDB_WHY_DTRACE, "breakpoint action");
+#endif
+}
+
+static void
+dtrace_action_panic(dtrace_ecb_t *ecb)
+{
+ dtrace_probe_t *probe = ecb->dte_probe;
+
+ /*
+ * It's impossible to be taking action on the NULL probe.
+ */
+ ASSERT(probe != NULL);
+
+ if (dtrace_destructive_disallow)
+ return;
+
+ if (dtrace_panicked != NULL)
+ return;
+
+ if (dtrace_casptr(&dtrace_panicked, NULL, curthread) != NULL)
+ return;
+
+ /*
+ * We won the right to panic. (We want to be sure that only one
+ * thread calls panic() from dtrace_probe(), and that panic() is
+ * called exactly once.)
+ */
+ dtrace_panic("dtrace: panic action at probe %s:%s:%s:%s (ecb %p)",
+ probe->dtpr_provider->dtpv_name, probe->dtpr_mod,
+ probe->dtpr_func, probe->dtpr_name, (void *)ecb);
+}
+
+static void
+dtrace_action_raise(uint64_t sig)
+{
+ if (dtrace_destructive_disallow)
+ return;
+
+ if (sig >= NSIG) {
+ DTRACE_CPUFLAG_SET(CPU_DTRACE_ILLOP);
+ return;
+ }
+
+#ifdef illumos
+ /*
+ * raise() has a queue depth of 1 -- we ignore all subsequent
+ * invocations of the raise() action.
+ */
+ if (curthread->t_dtrace_sig == 0)
+ curthread->t_dtrace_sig = (uint8_t)sig;
+
+ curthread->t_sig_check = 1;
+ aston(curthread);
+#else
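+	/*
+	 * On FreeBSD the signal is posted to the process immediately,
+	 * rather than being queued for delivery at the next AST as is
+	 * done on illumos above.
+	 */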
+ struct proc *p = curproc;
+ PROC_LOCK(p);
+ kern_psignal(p, sig);
+ PROC_UNLOCK(p);
+#endif
+}
+
+static void
+dtrace_action_stop(void)
+{
+ if (dtrace_destructive_disallow)
+ return;
+
+#ifdef illumos
+ if (!curthread->t_dtrace_stop) {
+ curthread->t_dtrace_stop = 1;
+ curthread->t_sig_check = 1;
+ aston(curthread);
+ }
+#else
+ struct proc *p = curproc;
+ PROC_LOCK(p);
+ kern_psignal(p, SIGSTOP);
+ PROC_UNLOCK(p);
+#endif
+}
+
+static void
+dtrace_action_chill(dtrace_mstate_t *mstate, hrtime_t val)
+{
+ hrtime_t now;
+ volatile uint16_t *flags;
+#ifdef illumos
+ cpu_t *cpu = CPU;
+#else
+ cpu_t *cpu = &solaris_cpu[curcpu];
+#endif
+
+ if (dtrace_destructive_disallow)
+ return;
+
+ flags = (volatile uint16_t *)&cpu_core[curcpu].cpuc_dtrace_flags;
+
+ now = dtrace_gethrtime();
+
+ if (now - cpu->cpu_dtrace_chillmark > dtrace_chill_interval) {
+ /*
+ * We need to advance the mark to the current time.
+ */
+ cpu->cpu_dtrace_chillmark = now;
+ cpu->cpu_dtrace_chilled = 0;
+ }
+
+ /*
+ * Now check to see if the requested chill time would take us over
+ * the maximum amount of time allowed in the chill interval. (Or
+ * worse, if the calculation itself induces overflow.)
+ */
+ if (cpu->cpu_dtrace_chilled + val > dtrace_chill_max ||
+ cpu->cpu_dtrace_chilled + val < cpu->cpu_dtrace_chilled) {
+ *flags |= CPU_DTRACE_ILLOP;
+ return;
+ }
+
+ while (dtrace_gethrtime() - now < val)
+ continue;
+
+ /*
+ * Normally, we assure that the value of the variable "timestamp" does
+ * not change within an ECB. The presence of chill() represents an
+ * exception to this rule, however.
+ */
+ mstate->dtms_present &= ~DTRACE_MSTATE_TIMESTAMP;
+ cpu->cpu_dtrace_chilled += val;
+}
+
+static void
+dtrace_action_ustack(dtrace_mstate_t *mstate, dtrace_state_t *state,
+ uint64_t *buf, uint64_t arg)
+{
+ int nframes = DTRACE_USTACK_NFRAMES(arg);
+ int strsize = DTRACE_USTACK_STRSIZE(arg);
+ uint64_t *pcs = &buf[1], *fps;
+ char *str = (char *)&pcs[nframes];
+ int size, offs = 0, i, j;
+ size_t rem;
+ uintptr_t old = mstate->dtms_scratch_ptr, saved;
+ uint16_t *flags = &cpu_core[curcpu].cpuc_dtrace_flags;
+ char *sym;
+
+ /*
+	 * We should be taking a faster path if string space has not
+	 * been allocated.
+ */
+ ASSERT(strsize != 0);
+
+ /*
+ * We will first allocate some temporary space for the frame pointers.
+ */
+ fps = (uint64_t *)P2ROUNDUP(mstate->dtms_scratch_ptr, 8);
+ size = (uintptr_t)fps - mstate->dtms_scratch_ptr +
+ (nframes * sizeof (uint64_t));
+
+ if (!DTRACE_INSCRATCH(mstate, size)) {
+ /*
+ * Not enough room for our frame pointers -- need to indicate
+ * that we ran out of scratch space.
+ */
+ DTRACE_CPUFLAG_SET(CPU_DTRACE_NOSCRATCH);
+ return;
+ }
+
+ mstate->dtms_scratch_ptr += size;
+ saved = mstate->dtms_scratch_ptr;
+
+ /*
+ * Now get a stack with both program counters and frame pointers.
+ */
+ DTRACE_CPUFLAG_SET(CPU_DTRACE_NOFAULT);
+ dtrace_getufpstack(buf, fps, nframes + 1);
+ DTRACE_CPUFLAG_CLEAR(CPU_DTRACE_NOFAULT);
+
+ /*
+ * If that faulted, we're cooked.
+ */
+ if (*flags & CPU_DTRACE_FAULT)
+ goto out;
+
+ /*
+ * Now we want to walk up the stack, calling the USTACK helper. For
+ * each iteration, we restore the scratch pointer.
+ */
+ for (i = 0; i < nframes; i++) {
+ mstate->dtms_scratch_ptr = saved;
+
+ if (offs >= strsize)
+ break;
+
+ sym = (char *)(uintptr_t)dtrace_helper(
+ DTRACE_HELPER_ACTION_USTACK,
+ mstate, state, pcs[i], fps[i]);
+
+ /*
+ * If we faulted while running the helper, we're going to
+ * clear the fault and null out the corresponding string.
+ */
+ if (*flags & CPU_DTRACE_FAULT) {
+ *flags &= ~CPU_DTRACE_FAULT;
+ str[offs++] = '\0';
+ continue;
+ }
+
+ if (sym == NULL) {
+ str[offs++] = '\0';
+ continue;
+ }
+
+ if (!dtrace_strcanload((uintptr_t)sym, strsize, &rem, mstate,
+ &(state->dts_vstate))) {
+ str[offs++] = '\0';
+ continue;
+ }
+
+ DTRACE_CPUFLAG_SET(CPU_DTRACE_NOFAULT);
+
+ /*
+ * Now copy in the string that the helper returned to us.
+ */
+ for (j = 0; offs + j < strsize && j < rem; j++) {
+ if ((str[offs + j] = sym[j]) == '\0')
+ break;
+ }
+
+ DTRACE_CPUFLAG_CLEAR(CPU_DTRACE_NOFAULT);
+
+ offs += j + 1;
+ }
+
+ if (offs >= strsize) {
+ /*
+ * If we didn't have room for all of the strings, we don't
+ * abort processing -- this needn't be a fatal error -- but we
+ * still want to increment a counter (dts_stkstroverflows) to
+ * allow this condition to be warned about. (If this is from
+ * a jstack() action, it is easily tuned via jstackstrsize.)
+ */
+ dtrace_error(&state->dts_stkstroverflows);
+ }
+
+ while (offs < strsize)
+ str[offs++] = '\0';
+
+out:
+ mstate->dtms_scratch_ptr = old;
+}
+
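+/*
+ * Copy a by-reference value into the destination buffer at *valoffsp.
+ * DIF_TF_BYREF values are loaded from kernel addresses via dtrace_load8();
+ * DIF_TF_BYUREF values are loaded from user addresses via dtrace_fuword8()
+ * under CPU_DTRACE_NOFAULT.  On return, *valp and *valoffsp have been
+ * advanced past the bytes consumed and stored, respectively.
+ */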
+static void
+dtrace_store_by_ref(dtrace_difo_t *dp, caddr_t tomax, size_t size,
+ size_t *valoffsp, uint64_t *valp, uint64_t end, int intuple, int dtkind)
+{
+ volatile uint16_t *flags;
+ uint64_t val = *valp;
+ size_t valoffs = *valoffsp;
+
+ flags = (volatile uint16_t *)&cpu_core[curcpu].cpuc_dtrace_flags;
+ ASSERT(dtkind == DIF_TF_BYREF || dtkind == DIF_TF_BYUREF);
+
+ /*
+	 * If this is a string, we're only going to load until we find the
+	 * zero byte -- after which we'll store zero bytes.
+ */
+ if (dp->dtdo_rtype.dtdt_kind == DIF_TYPE_STRING) {
+ char c = '\0' + 1;
+ size_t s;
+
+ for (s = 0; s < size; s++) {
+ if (c != '\0' && dtkind == DIF_TF_BYREF) {
+ c = dtrace_load8(val++);
+ } else if (c != '\0' && dtkind == DIF_TF_BYUREF) {
+ DTRACE_CPUFLAG_SET(CPU_DTRACE_NOFAULT);
+ c = dtrace_fuword8((void *)(uintptr_t)val++);
+ DTRACE_CPUFLAG_CLEAR(CPU_DTRACE_NOFAULT);
+ if (*flags & CPU_DTRACE_FAULT)
+ break;
+ }
+
+ DTRACE_STORE(uint8_t, tomax, valoffs++, c);
+
+ if (c == '\0' && intuple)
+ break;
+ }
+ } else {
+ uint8_t c;
+ while (valoffs < end) {
+ if (dtkind == DIF_TF_BYREF) {
+ c = dtrace_load8(val++);
+ } else if (dtkind == DIF_TF_BYUREF) {
+ DTRACE_CPUFLAG_SET(CPU_DTRACE_NOFAULT);
+ c = dtrace_fuword8((void *)(uintptr_t)val++);
+ DTRACE_CPUFLAG_CLEAR(CPU_DTRACE_NOFAULT);
+ if (*flags & CPU_DTRACE_FAULT)
+ break;
+ }
+
+ DTRACE_STORE(uint8_t, tomax,
+ valoffs++, c);
+ }
+ }
+
+ *valp = val;
+ *valoffsp = valoffs;
+}
+
+/*
+ * Disables interrupts and sets the per-thread inprobe flag. When DEBUG is
+ * defined, we also assert that we are not recursing unless the probe ID is an
+ * error probe.
+ */
+static dtrace_icookie_t
+dtrace_probe_enter(dtrace_id_t id)
+{
+ dtrace_icookie_t cookie;
+
+ cookie = dtrace_interrupt_disable();
+
+ /*
+ * Unless this is an ERROR probe, we are not allowed to recurse in
+	 * dtrace_probe(). Recursing into a DTrace probe usually means that a
+	 * function is instrumented that should not have been instrumented, or
+	 * that the ordering guarantee of the records will be violated,
+ * resulting in unexpected output. If there is an exception to this
+ * assertion, a new case should be added.
+ */
+ ASSERT(curthread->t_dtrace_inprobe == 0 ||
+ id == dtrace_probeid_error);
+ curthread->t_dtrace_inprobe = 1;
+
+ return (cookie);
+}
+
+/*
+ * Clears the per-thread inprobe flag and enables interrupts.
+ */
+static void
+dtrace_probe_exit(dtrace_icookie_t cookie)
+{
+
+ curthread->t_dtrace_inprobe = 0;
+ dtrace_interrupt_enable(cookie);
+}
+
+/*
+ * If you're looking for the epicenter of DTrace, you just found it. This
+ * is the function called by the provider to fire a probe -- from which all
+ * subsequent probe-context DTrace activity emanates.
+ */
+void
+dtrace_probe(dtrace_id_t id, uintptr_t arg0, uintptr_t arg1,
+ uintptr_t arg2, uintptr_t arg3, uintptr_t arg4)
+{
+ processorid_t cpuid;
+ dtrace_icookie_t cookie;
+ dtrace_probe_t *probe;
+ dtrace_mstate_t mstate;
+ dtrace_ecb_t *ecb;
+ dtrace_action_t *act;
+ intptr_t offs;
+ size_t size;
+ int vtime, onintr;
+ volatile uint16_t *flags;
+ hrtime_t now;
+
+ if (panicstr != NULL)
+ return;
+
+#ifdef illumos
+ /*
+ * Kick out immediately if this CPU is still being born (in which case
+ * curthread will be set to -1) or the current thread can't allow
+ * probes in its current context.
+ */
+ if (((uintptr_t)curthread & 1) || (curthread->t_flag & T_DONTDTRACE))
+ return;
+#endif
+
+ cookie = dtrace_probe_enter(id);
+ probe = dtrace_probes[id - 1];
+ cpuid = curcpu;
+ onintr = CPU_ON_INTR(CPU);
+
+ if (!onintr && probe->dtpr_predcache != DTRACE_CACHEIDNONE &&
+ probe->dtpr_predcache == curthread->t_predcache) {
+ /*
+ * We have hit in the predicate cache; we know that
+ * this predicate would evaluate to be false.
+ */
+ dtrace_probe_exit(cookie);
+ return;
+ }
+
+#ifdef illumos
+ if (panic_quiesce) {
+#else
+ if (panicstr != NULL) {
+#endif
+ /*
+ * We don't trace anything if we're panicking.
+ */
+ dtrace_probe_exit(cookie);
+ return;
+ }
+
+ now = mstate.dtms_timestamp = dtrace_gethrtime();
+ mstate.dtms_present = DTRACE_MSTATE_TIMESTAMP;
+ vtime = dtrace_vtime_references != 0;
+
+ if (vtime && curthread->t_dtrace_start)
+ curthread->t_dtrace_vtime += now - curthread->t_dtrace_start;
+
+ mstate.dtms_difo = NULL;
+ mstate.dtms_probe = probe;
+ mstate.dtms_strtok = 0;
+ mstate.dtms_arg[0] = arg0;
+ mstate.dtms_arg[1] = arg1;
+ mstate.dtms_arg[2] = arg2;
+ mstate.dtms_arg[3] = arg3;
+ mstate.dtms_arg[4] = arg4;
+
+ flags = (volatile uint16_t *)&cpu_core[cpuid].cpuc_dtrace_flags;
+
+ for (ecb = probe->dtpr_ecb; ecb != NULL; ecb = ecb->dte_next) {
+ dtrace_predicate_t *pred = ecb->dte_predicate;
+ dtrace_state_t *state = ecb->dte_state;
+ dtrace_buffer_t *buf = &state->dts_buffer[cpuid];
+ dtrace_buffer_t *aggbuf = &state->dts_aggbuffer[cpuid];
+ dtrace_vstate_t *vstate = &state->dts_vstate;
+ dtrace_provider_t *prov = probe->dtpr_provider;
+ uint64_t tracememsize = 0;
+ int committed = 0;
+ caddr_t tomax;
+
+ /*
+ * A little subtlety with the following (seemingly innocuous)
+ * declaration of the automatic 'val': by looking at the
+ * code, you might think that it could be declared in the
+ * action processing loop, below. (That is, it's only used in
+ * the action processing loop.) However, it must be declared
+ * out of that scope because in the case of DIF expression
+ * arguments to aggregating actions, one iteration of the
+ * action loop will use the last iteration's value.
+ */
+ uint64_t val = 0;
+
+ mstate.dtms_present = DTRACE_MSTATE_ARGS | DTRACE_MSTATE_PROBE;
+ mstate.dtms_getf = NULL;
+
+ *flags &= ~CPU_DTRACE_ERROR;
+
+ if (prov == dtrace_provider) {
+ /*
+ * If dtrace itself is the provider of this probe,
+ * we're only going to continue processing the ECB if
+ * arg0 (the dtrace_state_t) is equal to the ECB's
+ * creating state. (This prevents disjoint consumers
+ * from seeing one another's metaprobes.)
+ */
+ if (arg0 != (uint64_t)(uintptr_t)state)
+ continue;
+ }
+
+ if (state->dts_activity != DTRACE_ACTIVITY_ACTIVE) {
+ /*
+ * We're not currently active. If our provider isn't
+ * the dtrace pseudo provider, we're not interested.
+ */
+ if (prov != dtrace_provider)
+ continue;
+
+ /*
+ * Now we must further check if we are in the BEGIN
+ * probe. If we are, we will only continue processing
+ * if we're still in WARMUP -- if one BEGIN enabling
+ * has invoked the exit() action, we don't want to
+ * evaluate subsequent BEGIN enablings.
+ */
+ if (probe->dtpr_id == dtrace_probeid_begin &&
+ state->dts_activity != DTRACE_ACTIVITY_WARMUP) {
+ ASSERT(state->dts_activity ==
+ DTRACE_ACTIVITY_DRAINING);
+ continue;
+ }
+ }
+
+ if (ecb->dte_cond) {
+ /*
+ * If the dte_cond bits indicate that this
+ * consumer is only allowed to see user-mode firings
+ * of this probe, call the provider's dtps_usermode()
+ * entry point to check that the probe was fired
+ * while in a user context. Skip this ECB if that's
+ * not the case.
+ */
+ if ((ecb->dte_cond & DTRACE_COND_USERMODE) &&
+ prov->dtpv_pops.dtps_usermode(prov->dtpv_arg,
+ probe->dtpr_id, probe->dtpr_arg) == 0)
+ continue;
+
+#ifdef illumos
+ /*
+ * This is more subtle than it looks. We have to be
+ * absolutely certain that CRED() isn't going to
+ * change out from under us so it's only legit to
+ * examine that structure if we're in constrained
+			 * situations. Currently, the only time we'll perform
+			 * this check is if a non-super-user has enabled the
+ * profile or syscall providers -- providers that
+ * allow visibility of all processes. For the
+ * profile case, the check above will ensure that
+ * we're examining a user context.
+ */
+ if (ecb->dte_cond & DTRACE_COND_OWNER) {
+ cred_t *cr;
+ cred_t *s_cr =
+ ecb->dte_state->dts_cred.dcr_cred;
+ proc_t *proc;
+
+ ASSERT(s_cr != NULL);
+
+ if ((cr = CRED()) == NULL ||
+ s_cr->cr_uid != cr->cr_uid ||
+ s_cr->cr_uid != cr->cr_ruid ||
+ s_cr->cr_uid != cr->cr_suid ||
+ s_cr->cr_gid != cr->cr_gid ||
+ s_cr->cr_gid != cr->cr_rgid ||
+ s_cr->cr_gid != cr->cr_sgid ||
+ (proc = ttoproc(curthread)) == NULL ||
+ (proc->p_flag & SNOCD))
+ continue;
+ }
+
+ if (ecb->dte_cond & DTRACE_COND_ZONEOWNER) {
+ cred_t *cr;
+ cred_t *s_cr =
+ ecb->dte_state->dts_cred.dcr_cred;
+
+ ASSERT(s_cr != NULL);
+
+ if ((cr = CRED()) == NULL ||
+ s_cr->cr_zone->zone_id !=
+ cr->cr_zone->zone_id)
+ continue;
+ }
+#endif
+ }
+
+ if (now - state->dts_alive > dtrace_deadman_timeout) {
+ /*
+			 * We seem to be dead.  Unless we (a) have kernel
+			 * destructive permissions, (b) have explicitly enabled
+			 * destructive actions, and (c) destructive actions have
+ * not been disabled, we're going to transition into
+ * the KILLED state, from which no further processing
+ * on this state will be performed.
+ */
+ if (!dtrace_priv_kernel_destructive(state) ||
+ !state->dts_cred.dcr_destructive ||
+ dtrace_destructive_disallow) {
+ void *activity = &state->dts_activity;
+ dtrace_activity_t curstate;
+
+ do {
+ curstate = state->dts_activity;
+ } while (dtrace_cas32(activity, curstate,
+ DTRACE_ACTIVITY_KILLED) != curstate);
+
+ continue;
+ }
+ }
+
+ if ((offs = dtrace_buffer_reserve(buf, ecb->dte_needed,
+ ecb->dte_alignment, state, &mstate)) < 0)
+ continue;
+
+ tomax = buf->dtb_tomax;
+ ASSERT(tomax != NULL);
+
+ if (ecb->dte_size != 0) {
+ dtrace_rechdr_t dtrh;
+ if (!(mstate.dtms_present & DTRACE_MSTATE_TIMESTAMP)) {
+ mstate.dtms_timestamp = dtrace_gethrtime();
+ mstate.dtms_present |= DTRACE_MSTATE_TIMESTAMP;
+ }
+ ASSERT3U(ecb->dte_size, >=, sizeof (dtrace_rechdr_t));
+ dtrh.dtrh_epid = ecb->dte_epid;
+ DTRACE_RECORD_STORE_TIMESTAMP(&dtrh,
+ mstate.dtms_timestamp);
+ *((dtrace_rechdr_t *)(tomax + offs)) = dtrh;
+ }
+
+ mstate.dtms_epid = ecb->dte_epid;
+ mstate.dtms_present |= DTRACE_MSTATE_EPID;
+
+ if (state->dts_cred.dcr_visible & DTRACE_CRV_KERNEL)
+ mstate.dtms_access = DTRACE_ACCESS_KERNEL;
+ else
+ mstate.dtms_access = 0;
+
+ if (pred != NULL) {
+ dtrace_difo_t *dp = pred->dtp_difo;
+ uint64_t rval;
+
+ rval = dtrace_dif_emulate(dp, &mstate, vstate, state);
+
+ if (!(*flags & CPU_DTRACE_ERROR) && !rval) {
+ dtrace_cacheid_t cid = probe->dtpr_predcache;
+
+ if (cid != DTRACE_CACHEIDNONE && !onintr) {
+ /*
+ * Update the predicate cache...
+ */
+ ASSERT(cid == pred->dtp_cacheid);
+ curthread->t_predcache = cid;
+ }
+
+ continue;
+ }
+ }
+
+ for (act = ecb->dte_action; !(*flags & CPU_DTRACE_ERROR) &&
+ act != NULL; act = act->dta_next) {
+ size_t valoffs;
+ dtrace_difo_t *dp;
+ dtrace_recdesc_t *rec = &act->dta_rec;
+
+ size = rec->dtrd_size;
+ valoffs = offs + rec->dtrd_offset;
+
+ if (DTRACEACT_ISAGG(act->dta_kind)) {
+ uint64_t v = 0xbad;
+ dtrace_aggregation_t *agg;
+
+ agg = (dtrace_aggregation_t *)act;
+
+ if ((dp = act->dta_difo) != NULL)
+ v = dtrace_dif_emulate(dp,
+ &mstate, vstate, state);
+
+ if (*flags & CPU_DTRACE_ERROR)
+ continue;
+
+ /*
+ * Note that we always pass the expression
+ * value from the previous iteration of the
+ * action loop. This value will only be used
+ * if there is an expression argument to the
+ * aggregating action, denoted by the
+ * dtag_hasarg field.
+ */
+ dtrace_aggregate(agg, buf,
+ offs, aggbuf, v, val);
+ continue;
+ }
+
+ switch (act->dta_kind) {
+ case DTRACEACT_STOP:
+ if (dtrace_priv_proc_destructive(state))
+ dtrace_action_stop();
+ continue;
+
+ case DTRACEACT_BREAKPOINT:
+ if (dtrace_priv_kernel_destructive(state))
+ dtrace_action_breakpoint(ecb);
+ continue;
+
+ case DTRACEACT_PANIC:
+ if (dtrace_priv_kernel_destructive(state))
+ dtrace_action_panic(ecb);
+ continue;
+
+ case DTRACEACT_STACK:
+ if (!dtrace_priv_kernel(state))
+ continue;
+
+ dtrace_getpcstack((pc_t *)(tomax + valoffs),
+ size / sizeof (pc_t), probe->dtpr_aframes,
+ DTRACE_ANCHORED(probe) ? NULL :
+ (uint32_t *)arg0);
+ continue;
+
+ case DTRACEACT_JSTACK:
+ case DTRACEACT_USTACK:
+ if (!dtrace_priv_proc(state))
+ continue;
+
+ /*
+ * See comment in DIF_VAR_PID.
+ */
+ if (DTRACE_ANCHORED(mstate.dtms_probe) &&
+ CPU_ON_INTR(CPU)) {
+ int depth = DTRACE_USTACK_NFRAMES(
+ rec->dtrd_arg) + 1;
+
+ dtrace_bzero((void *)(tomax + valoffs),
+ DTRACE_USTACK_STRSIZE(rec->dtrd_arg)
+ + depth * sizeof (uint64_t));
+
+ continue;
+ }
+
+ if (DTRACE_USTACK_STRSIZE(rec->dtrd_arg) != 0 &&
+ curproc->p_dtrace_helpers != NULL) {
+ /*
+ * This is the slow path -- we have
+ * allocated string space, and we're
+ * getting the stack of a process that
+ * has helpers. Call into a separate
+ * routine to perform this processing.
+ */
+ dtrace_action_ustack(&mstate, state,
+ (uint64_t *)(tomax + valoffs),
+ rec->dtrd_arg);
+ continue;
+ }
+
+ DTRACE_CPUFLAG_SET(CPU_DTRACE_NOFAULT);
+ dtrace_getupcstack((uint64_t *)
+ (tomax + valoffs),
+ DTRACE_USTACK_NFRAMES(rec->dtrd_arg) + 1);
+ DTRACE_CPUFLAG_CLEAR(CPU_DTRACE_NOFAULT);
+ continue;
+
+ default:
+ break;
+ }
+
+ dp = act->dta_difo;
+ ASSERT(dp != NULL);
+
+ val = dtrace_dif_emulate(dp, &mstate, vstate, state);
+
+ if (*flags & CPU_DTRACE_ERROR)
+ continue;
+
+ switch (act->dta_kind) {
+ case DTRACEACT_SPECULATE: {
+ dtrace_rechdr_t *dtrh;
+
+ ASSERT(buf == &state->dts_buffer[cpuid]);
+ buf = dtrace_speculation_buffer(state,
+ cpuid, val);
+
+ if (buf == NULL) {
+ *flags |= CPU_DTRACE_DROP;
+ continue;
+ }
+
+ offs = dtrace_buffer_reserve(buf,
+ ecb->dte_needed, ecb->dte_alignment,
+ state, NULL);
+
+ if (offs < 0) {
+ *flags |= CPU_DTRACE_DROP;
+ continue;
+ }
+
+ tomax = buf->dtb_tomax;
+ ASSERT(tomax != NULL);
+
+ if (ecb->dte_size == 0)
+ continue;
+
+ ASSERT3U(ecb->dte_size, >=,
+ sizeof (dtrace_rechdr_t));
+ dtrh = ((void *)(tomax + offs));
+ dtrh->dtrh_epid = ecb->dte_epid;
+ /*
+ * When the speculation is committed, all of
+ * the records in the speculative buffer will
+ * have their timestamps set to the commit
+ * time. Until then, it is set to a sentinel
+				 * value, for debuggability.
+ */
+ DTRACE_RECORD_STORE_TIMESTAMP(dtrh, UINT64_MAX);
+ continue;
+ }
+
+ case DTRACEACT_PRINTM: {
+ /* The DIF returns a 'memref'. */
+ uintptr_t *memref = (uintptr_t *)(uintptr_t) val;
+
+ /* Get the size from the memref. */
+ size = memref[1];
+
+ /*
+ * Check if the size exceeds the allocated
+ * buffer size.
+ */
+ if (size + sizeof(uintptr_t) > dp->dtdo_rtype.dtdt_size) {
+ /* Flag a drop! */
+ *flags |= CPU_DTRACE_DROP;
+ continue;
+ }
+
+ /* Store the size in the buffer first. */
+ DTRACE_STORE(uintptr_t, tomax,
+ valoffs, size);
+
+ /*
+ * Offset the buffer address to the start
+ * of the data.
+ */
+ valoffs += sizeof(uintptr_t);
+
+ /*
+ * Reset to the memory address rather than
+ * the memref array, then let the BYREF
+ * code below do the work to store the
+ * memory data in the buffer.
+ */
+ val = memref[0];
+ break;
+ }
+
+ case DTRACEACT_CHILL:
+ if (dtrace_priv_kernel_destructive(state))
+ dtrace_action_chill(&mstate, val);
+ continue;
+
+ case DTRACEACT_RAISE:
+ if (dtrace_priv_proc_destructive(state))
+ dtrace_action_raise(val);
+ continue;
+
+ case DTRACEACT_COMMIT:
+ ASSERT(!committed);
+
+ /*
+ * We need to commit our buffer state.
+ */
+ if (ecb->dte_size)
+ buf->dtb_offset = offs + ecb->dte_size;
+ buf = &state->dts_buffer[cpuid];
+ dtrace_speculation_commit(state, cpuid, val);
+ committed = 1;
+ continue;
+
+ case DTRACEACT_DISCARD:
+ dtrace_speculation_discard(state, cpuid, val);
+ continue;
+
+ case DTRACEACT_DIFEXPR:
+ case DTRACEACT_LIBACT:
+ case DTRACEACT_PRINTF:
+ case DTRACEACT_PRINTA:
+ case DTRACEACT_SYSTEM:
+ case DTRACEACT_FREOPEN:
+ case DTRACEACT_TRACEMEM:
+ break;
+
+ case DTRACEACT_TRACEMEM_DYNSIZE:
+ tracememsize = val;
+ break;
+
+ case DTRACEACT_SYM:
+ case DTRACEACT_MOD:
+ if (!dtrace_priv_kernel(state))
+ continue;
+ break;
+
+ case DTRACEACT_USYM:
+ case DTRACEACT_UMOD:
+ case DTRACEACT_UADDR: {
+#ifdef illumos
+ struct pid *pid = curthread->t_procp->p_pidp;
+#endif
+
+ if (!dtrace_priv_proc(state))
+ continue;
+
+ DTRACE_STORE(uint64_t, tomax,
+#ifdef illumos
+ valoffs, (uint64_t)pid->pid_id);
+#else
+ valoffs, (uint64_t) curproc->p_pid);
+#endif
+ DTRACE_STORE(uint64_t, tomax,
+ valoffs + sizeof (uint64_t), val);
+
+ continue;
+ }
+
+ case DTRACEACT_EXIT: {
+ /*
+ * For the exit action, we are going to attempt
+ * to atomically set our activity to be
+ * draining. If this fails (either because
+			 * another CPU has beaten us to the exit action,
+ * or because our current activity is something
+ * other than ACTIVE or WARMUP), we will
+			 * continue. This ensures that the exit action
+ * can be successfully recorded at most once
+ * when we're in the ACTIVE state. If we're
+ * encountering the exit() action while in
+ * COOLDOWN, however, we want to honor the new
+ * status code. (We know that we're the only
+ * thread in COOLDOWN, so there is no race.)
+ */
+ void *activity = &state->dts_activity;
+ dtrace_activity_t curstate = state->dts_activity;
+
+ if (curstate == DTRACE_ACTIVITY_COOLDOWN)
+ break;
+
+ if (curstate != DTRACE_ACTIVITY_WARMUP)
+ curstate = DTRACE_ACTIVITY_ACTIVE;
+
+ if (dtrace_cas32(activity, curstate,
+ DTRACE_ACTIVITY_DRAINING) != curstate) {
+ *flags |= CPU_DTRACE_DROP;
+ continue;
+ }
+
+ break;
+ }
+
+ default:
+ ASSERT(0);
+ }
+
+ if (dp->dtdo_rtype.dtdt_flags & DIF_TF_BYREF ||
+ dp->dtdo_rtype.dtdt_flags & DIF_TF_BYUREF) {
+ uintptr_t end = valoffs + size;
+
+ if (tracememsize != 0 &&
+ valoffs + tracememsize < end) {
+ end = valoffs + tracememsize;
+ tracememsize = 0;
+ }
+
+ if (dp->dtdo_rtype.dtdt_flags & DIF_TF_BYREF &&
+ !dtrace_vcanload((void *)(uintptr_t)val,
+ &dp->dtdo_rtype, NULL, &mstate, vstate))
+ continue;
+
+ dtrace_store_by_ref(dp, tomax, size, &valoffs,
+ &val, end, act->dta_intuple,
+ dp->dtdo_rtype.dtdt_flags & DIF_TF_BYREF ?
+ DIF_TF_BYREF: DIF_TF_BYUREF);
+ continue;
+ }
+
+ switch (size) {
+ case 0:
+ break;
+
+ case sizeof (uint8_t):
+ DTRACE_STORE(uint8_t, tomax, valoffs, val);
+ break;
+ case sizeof (uint16_t):
+ DTRACE_STORE(uint16_t, tomax, valoffs, val);
+ break;
+ case sizeof (uint32_t):
+ DTRACE_STORE(uint32_t, tomax, valoffs, val);
+ break;
+ case sizeof (uint64_t):
+ DTRACE_STORE(uint64_t, tomax, valoffs, val);
+ break;
+ default:
+ /*
+ * Any other size should have been returned by
+ * reference, not by value.
+ */
+ ASSERT(0);
+ break;
+ }
+ }
+
+ if (*flags & CPU_DTRACE_DROP)
+ continue;
+
+ if (*flags & CPU_DTRACE_FAULT) {
+ int ndx;
+ dtrace_action_t *err;
+
+ buf->dtb_errors++;
+
+ if (probe->dtpr_id == dtrace_probeid_error) {
+ /*
+ * There's nothing we can do -- we had an
+ * error on the error probe. We bump an
+ * error counter to at least indicate that
+ * this condition happened.
+ */
+ dtrace_error(&state->dts_dblerrors);
+ continue;
+ }
+
+ if (vtime) {
+ /*
+ * Before recursing on dtrace_probe(), we
+ * need to explicitly clear out our start
+ * time to prevent it from being accumulated
+ * into t_dtrace_vtime.
+ */
+ curthread->t_dtrace_start = 0;
+ }
+
+ /*
+ * Iterate over the actions to figure out which action
+ * we were processing when we experienced the error.
+ * Note that act points _past_ the faulting action; if
+ * act is ecb->dte_action, the fault was in the
+ * predicate, if it's ecb->dte_action->dta_next it's
+ * in action #1, and so on.
+ */
+ for (err = ecb->dte_action, ndx = 0;
+ err != act; err = err->dta_next, ndx++)
+ continue;
+
+ dtrace_probe_error(state, ecb->dte_epid, ndx,
+ (mstate.dtms_present & DTRACE_MSTATE_FLTOFFS) ?
+ mstate.dtms_fltoffs : -1, DTRACE_FLAGS2FLT(*flags),
+ cpu_core[cpuid].cpuc_dtrace_illval);
+
+ continue;
+ }
+
+ if (!committed)
+ buf->dtb_offset = offs + ecb->dte_size;
+ }
+
+ if (vtime)
+ curthread->t_dtrace_start = dtrace_gethrtime();
+
+ dtrace_probe_exit(cookie);
+}
+
+/*
+ * DTrace Probe Hashing Functions
+ *
+ * The functions in this section (and indeed, the functions in remaining
+ * sections) are not _called_ from probe context. (Any exceptions to this are
+ * marked with a "Note:".) Rather, they are called from elsewhere in the
+ * DTrace framework to look up probes in, add probes to, and remove probes from
+ * the DTrace probe hashes. (Each probe is hashed by each element of the
+ * probe tuple -- allowing for fast lookups, regardless of what was
+ * specified.)
+ */
+static uint_t
+dtrace_hash_str(const char *p)
+{
+ unsigned int g;
+ uint_t hval = 0;
+
+ while (*p) {
+ hval = (hval << 4) + *p++;
+ if ((g = (hval & 0xf0000000)) != 0)
+ hval ^= g >> 24;
+ hval &= ~g;
+ }
+ return (hval);
+}
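+
+/*
+ * Illustrative note (not part of the original code): this is the classic
+ * PJW/ELF string hash.  A minimal stand-alone rendering -- hypothetical,
+ * for illustration only -- would be:
+ *
+ *	unsigned int
+ *	hash_str(const char *p)
+ *	{
+ *		unsigned int g, hval = 0;
+ *
+ *		while (*p) {
+ *			hval = (hval << 4) + *p++;
+ *			if ((g = (hval & 0xf0000000)) != 0)
+ *				hval ^= g >> 24;
+ *			hval &= ~g;
+ *		}
+ *		return (hval);
+ *	}
+ *
+ * A bucket index is then derived as (hval & dth_mask), which is why
+ * dth_size below is always kept a power of two.
+ */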
+
+static dtrace_hash_t *
+dtrace_hash_create(uintptr_t stroffs, uintptr_t nextoffs, uintptr_t prevoffs)
+{
+ dtrace_hash_t *hash = kmem_zalloc(sizeof (dtrace_hash_t), KM_SLEEP);
+
+ hash->dth_stroffs = stroffs;
+ hash->dth_nextoffs = nextoffs;
+ hash->dth_prevoffs = prevoffs;
+
+ hash->dth_size = 1;
+ hash->dth_mask = hash->dth_size - 1;
+
+ hash->dth_tab = kmem_zalloc(hash->dth_size *
+ sizeof (dtrace_hashbucket_t *), KM_SLEEP);
+
+ return (hash);
+}
+
+static void
+dtrace_hash_destroy(dtrace_hash_t *hash)
+{
+#ifdef DEBUG
+ int i;
+
+ for (i = 0; i < hash->dth_size; i++)
+ ASSERT(hash->dth_tab[i] == NULL);
+#endif
+
+ kmem_free(hash->dth_tab,
+ hash->dth_size * sizeof (dtrace_hashbucket_t *));
+ kmem_free(hash, sizeof (dtrace_hash_t));
+}
+
+static void
+dtrace_hash_resize(dtrace_hash_t *hash)
+{
+ int size = hash->dth_size, i, ndx;
+ int new_size = hash->dth_size << 1;
+ int new_mask = new_size - 1;
+ dtrace_hashbucket_t **new_tab, *bucket, *next;
+
+ ASSERT((new_size & new_mask) == 0);
+
+ new_tab = kmem_zalloc(new_size * sizeof (void *), KM_SLEEP);
+
+ for (i = 0; i < size; i++) {
+ for (bucket = hash->dth_tab[i]; bucket != NULL; bucket = next) {
+ dtrace_probe_t *probe = bucket->dthb_chain;
+
+ ASSERT(probe != NULL);
+ ndx = DTRACE_HASHSTR(hash, probe) & new_mask;
+
+ next = bucket->dthb_next;
+ bucket->dthb_next = new_tab[ndx];
+ new_tab[ndx] = bucket;
+ }
+ }
+
+ kmem_free(hash->dth_tab, hash->dth_size * sizeof (void *));
+ hash->dth_tab = new_tab;
+ hash->dth_size = new_size;
+ hash->dth_mask = new_mask;
+}
+
+static void
+dtrace_hash_add(dtrace_hash_t *hash, dtrace_probe_t *new)
+{
+ int hashval = DTRACE_HASHSTR(hash, new);
+ int ndx = hashval & hash->dth_mask;
+ dtrace_hashbucket_t *bucket = hash->dth_tab[ndx];
+ dtrace_probe_t **nextp, **prevp;
+
+ for (; bucket != NULL; bucket = bucket->dthb_next) {
+ if (DTRACE_HASHEQ(hash, bucket->dthb_chain, new))
+ goto add;
+ }
+
+ if ((hash->dth_nbuckets >> 1) > hash->dth_size) {
+ dtrace_hash_resize(hash);
+ dtrace_hash_add(hash, new);
+ return;
+ }
+
+ bucket = kmem_zalloc(sizeof (dtrace_hashbucket_t), KM_SLEEP);
+ bucket->dthb_next = hash->dth_tab[ndx];
+ hash->dth_tab[ndx] = bucket;
+ hash->dth_nbuckets++;
+
+add:
+ nextp = DTRACE_HASHNEXT(hash, new);
+ ASSERT(*nextp == NULL && *(DTRACE_HASHPREV(hash, new)) == NULL);
+ *nextp = bucket->dthb_chain;
+
+ if (bucket->dthb_chain != NULL) {
+ prevp = DTRACE_HASHPREV(hash, bucket->dthb_chain);
+ ASSERT(*prevp == NULL);
+ *prevp = new;
+ }
+
+ bucket->dthb_chain = new;
+ bucket->dthb_len++;
+}
+
+static dtrace_probe_t *
+dtrace_hash_lookup(dtrace_hash_t *hash, dtrace_probe_t *template)
+{
+ int hashval = DTRACE_HASHSTR(hash, template);
+ int ndx = hashval & hash->dth_mask;
+ dtrace_hashbucket_t *bucket = hash->dth_tab[ndx];
+
+ for (; bucket != NULL; bucket = bucket->dthb_next) {
+ if (DTRACE_HASHEQ(hash, bucket->dthb_chain, template))
+ return (bucket->dthb_chain);
+ }
+
+ return (NULL);
+}
+
+static int
+dtrace_hash_collisions(dtrace_hash_t *hash, dtrace_probe_t *template)
+{
+ int hashval = DTRACE_HASHSTR(hash, template);
+ int ndx = hashval & hash->dth_mask;
+ dtrace_hashbucket_t *bucket = hash->dth_tab[ndx];
+
+ for (; bucket != NULL; bucket = bucket->dthb_next) {
+ if (DTRACE_HASHEQ(hash, bucket->dthb_chain, template))
+ return (bucket->dthb_len);
+ }
+
+ return (0);
+}
+
+static void
+dtrace_hash_remove(dtrace_hash_t *hash, dtrace_probe_t *probe)
+{
+ int ndx = DTRACE_HASHSTR(hash, probe) & hash->dth_mask;
+ dtrace_hashbucket_t *bucket = hash->dth_tab[ndx];
+
+ dtrace_probe_t **prevp = DTRACE_HASHPREV(hash, probe);
+ dtrace_probe_t **nextp = DTRACE_HASHNEXT(hash, probe);
+
+ /*
+ * Find the bucket that we're removing this probe from.
+ */
+ for (; bucket != NULL; bucket = bucket->dthb_next) {
+ if (DTRACE_HASHEQ(hash, bucket->dthb_chain, probe))
+ break;
+ }
+
+ ASSERT(bucket != NULL);
+
+ if (*prevp == NULL) {
+ if (*nextp == NULL) {
+ /*
+ * The removed probe was the only probe on this
+ * bucket; we need to remove the bucket.
+ */
+ dtrace_hashbucket_t *b = hash->dth_tab[ndx];
+
+ ASSERT(bucket->dthb_chain == probe);
+ ASSERT(b != NULL);
+
+ if (b == bucket) {
+ hash->dth_tab[ndx] = bucket->dthb_next;
+ } else {
+ while (b->dthb_next != bucket)
+ b = b->dthb_next;
+ b->dthb_next = bucket->dthb_next;
+ }
+
+ ASSERT(hash->dth_nbuckets > 0);
+ hash->dth_nbuckets--;
+ kmem_free(bucket, sizeof (dtrace_hashbucket_t));
+ return;
+ }
+
+ bucket->dthb_chain = *nextp;
+ } else {
+ *(DTRACE_HASHNEXT(hash, *prevp)) = *nextp;
+ }
+
+ if (*nextp != NULL)
+ *(DTRACE_HASHPREV(hash, *nextp)) = *prevp;
+}
+
+/*
+ * DTrace Utility Functions
+ *
+ * These are random utility functions that are _not_ called from probe context.
+ */
+static int
+dtrace_badattr(const dtrace_attribute_t *a)
+{
+ return (a->dtat_name > DTRACE_STABILITY_MAX ||
+ a->dtat_data > DTRACE_STABILITY_MAX ||
+ a->dtat_class > DTRACE_CLASS_MAX);
+}
+
+/*
+ * Return a duplicate of the specified string.  If the string is NULL,
+ * this function returns a zero-length string.
+ */
+static char *
+dtrace_strdup(const char *str)
+{
+ char *new = kmem_zalloc((str != NULL ? strlen(str) : 0) + 1, KM_SLEEP);
+
+ if (str != NULL)
+ (void) strcpy(new, str);
+
+ return (new);
+}
+
+#define DTRACE_ISALPHA(c) \
+ (((c) >= 'a' && (c) <= 'z') || ((c) >= 'A' && (c) <= 'Z'))
+
+static int
+dtrace_badname(const char *s)
+{
+ char c;
+
+ if (s == NULL || (c = *s++) == '\0')
+ return (0);
+
+ if (!DTRACE_ISALPHA(c) && c != '-' && c != '_' && c != '.')
+ return (1);
+
+ while ((c = *s++) != '\0') {
+ if (!DTRACE_ISALPHA(c) && (c < '0' || c > '9') &&
+ c != '-' && c != '_' && c != '.' && c != '`')
+ return (1);
+ }
+
+ return (0);
+}
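+
+/*
+ * Worked examples (illustrative): "fbt", "_helper" and "io.delay" pass the
+ * check above, while "9fs" (leading digit), "a b" (embedded space) and
+ * "foo$" ('$' is never legal) are all rejected.  The backquote is accepted
+ * only in non-leading positions, which permits scoped names of the
+ * kernel`func form.
+ */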
+
+static void
+dtrace_cred2priv(cred_t *cr, uint32_t *privp, uid_t *uidp, zoneid_t *zoneidp)
+{
+ uint32_t priv;
+
+#ifdef illumos
+ if (cr == NULL || PRIV_POLICY_ONLY(cr, PRIV_ALL, B_FALSE)) {
+ /*
+ * For DTRACE_PRIV_ALL, the uid and zoneid don't matter.
+ */
+ priv = DTRACE_PRIV_ALL;
+ } else {
+ *uidp = crgetuid(cr);
+ *zoneidp = crgetzoneid(cr);
+
+ priv = 0;
+ if (PRIV_POLICY_ONLY(cr, PRIV_DTRACE_KERNEL, B_FALSE))
+ priv |= DTRACE_PRIV_KERNEL | DTRACE_PRIV_USER;
+ else if (PRIV_POLICY_ONLY(cr, PRIV_DTRACE_USER, B_FALSE))
+ priv |= DTRACE_PRIV_USER;
+ if (PRIV_POLICY_ONLY(cr, PRIV_DTRACE_PROC, B_FALSE))
+ priv |= DTRACE_PRIV_PROC;
+ if (PRIV_POLICY_ONLY(cr, PRIV_PROC_OWNER, B_FALSE))
+ priv |= DTRACE_PRIV_OWNER;
+ if (PRIV_POLICY_ONLY(cr, PRIV_PROC_ZONE, B_FALSE))
+ priv |= DTRACE_PRIV_ZONEOWNER;
+ }
+#else
+ priv = DTRACE_PRIV_ALL;
+#endif
+
+ *privp = priv;
+}
+
+#ifdef DTRACE_ERRDEBUG
+static void
+dtrace_errdebug(const char *str)
+{
+ int hval = dtrace_hash_str(str) % DTRACE_ERRHASHSZ;
+ int occupied = 0;
+
+ mutex_enter(&dtrace_errlock);
+ dtrace_errlast = str;
+ dtrace_errthread = curthread;
+
+ while (occupied++ < DTRACE_ERRHASHSZ) {
+ if (dtrace_errhash[hval].dter_msg == str) {
+ dtrace_errhash[hval].dter_count++;
+ goto out;
+ }
+
+ if (dtrace_errhash[hval].dter_msg != NULL) {
+ hval = (hval + 1) % DTRACE_ERRHASHSZ;
+ continue;
+ }
+
+ dtrace_errhash[hval].dter_msg = str;
+ dtrace_errhash[hval].dter_count = 1;
+ goto out;
+ }
+
+ panic("dtrace: undersized error hash");
+out:
+ mutex_exit(&dtrace_errlock);
+}
+#endif
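+
+/*
+ * Illustrative note: the error hash above is open-addressed with linear
+ * probing, and it is keyed on the _address_ of the message rather than its
+ * contents (dter_msg == str is a pointer comparison).  That is safe because
+ * every caller passes a string literal, so repeated reports of the same
+ * literal land in the same slot and merely increment dter_count.
+ */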
+
+/*
+ * DTrace Matching Functions
+ *
+ * These functions are used to match groups of probes, given some elements of
+ * a probe tuple, or some globbed expressions for elements of a probe tuple.
+ */
+static int
+dtrace_match_priv(const dtrace_probe_t *prp, uint32_t priv, uid_t uid,
+ zoneid_t zoneid)
+{
+ if (priv != DTRACE_PRIV_ALL) {
+ uint32_t ppriv = prp->dtpr_provider->dtpv_priv.dtpp_flags;
+ uint32_t match = priv & ppriv;
+
+ /*
+ * No PRIV_DTRACE_* privileges...
+ */
+ if ((priv & (DTRACE_PRIV_PROC | DTRACE_PRIV_USER |
+ DTRACE_PRIV_KERNEL)) == 0)
+ return (0);
+
+ /*
+ * No matching bits, but there were bits to match...
+ */
+ if (match == 0 && ppriv != 0)
+ return (0);
+
+ /*
+ * Need to have permissions to the process, but don't...
+ */
+ if (((ppriv & ~match) & DTRACE_PRIV_OWNER) != 0 &&
+ uid != prp->dtpr_provider->dtpv_priv.dtpp_uid) {
+ return (0);
+ }
+
+ /*
+ * Need to be in the same zone unless we possess the
+ * privilege to examine all zones.
+ */
+ if (((ppriv & ~match) & DTRACE_PRIV_ZONEOWNER) != 0 &&
+ zoneid != prp->dtpr_provider->dtpv_priv.dtpp_zoneid) {
+ return (0);
+ }
+ }
+
+ return (1);
+}
+
+/*
+ * dtrace_match_probe compares a dtrace_probe_t to a pre-compiled key, which
+ * consists of input pattern strings and an ops-vector to evaluate them.
+ * This function returns >0 for match, 0 for no match, and <0 for error.
+ */
+static int
+dtrace_match_probe(const dtrace_probe_t *prp, const dtrace_probekey_t *pkp,
+ uint32_t priv, uid_t uid, zoneid_t zoneid)
+{
+ dtrace_provider_t *pvp = prp->dtpr_provider;
+ int rv;
+
+ if (pvp->dtpv_defunct)
+ return (0);
+
+ if ((rv = pkp->dtpk_pmatch(pvp->dtpv_name, pkp->dtpk_prov, 0)) <= 0)
+ return (rv);
+
+ if ((rv = pkp->dtpk_mmatch(prp->dtpr_mod, pkp->dtpk_mod, 0)) <= 0)
+ return (rv);
+
+ if ((rv = pkp->dtpk_fmatch(prp->dtpr_func, pkp->dtpk_func, 0)) <= 0)
+ return (rv);
+
+ if ((rv = pkp->dtpk_nmatch(prp->dtpr_name, pkp->dtpk_name, 0)) <= 0)
+ return (rv);
+
+ if (dtrace_match_priv(prp, priv, uid, zoneid) == 0)
+ return (0);
+
+ return (rv);
+}
+
+/*
+ * dtrace_match_glob() is a safe kernel implementation of the gmatch(3GEN)
+ * interface for matching a glob pattern 'p' to an input string 's'. Unlike
+ * libc's version, the kernel version only applies to 8-bit ASCII strings.
+ * In addition, all of the recursion cases except for '*' matching have been
+ * unwound. For '*', we still implement recursive evaluation, but a depth
+ * counter is maintained and matching is aborted if we recurse too deep.
+ * The function returns 0 if no match, >0 if match, and <0 if recursion error.
+ */
+static int
+dtrace_match_glob(const char *s, const char *p, int depth)
+{
+ const char *olds;
+ char s1, c;
+ int gs;
+
+ if (depth > DTRACE_PROBEKEY_MAXDEPTH)
+ return (-1);
+
+ if (s == NULL)
+ s = ""; /* treat NULL as empty string */
+
+top:
+ olds = s;
+ s1 = *s++;
+
+ if (p == NULL)
+ return (0);
+
+ if ((c = *p++) == '\0')
+ return (s1 == '\0');
+
+ switch (c) {
+ case '[': {
+ int ok = 0, notflag = 0;
+ char lc = '\0';
+
+ if (s1 == '\0')
+ return (0);
+
+ if (*p == '!') {
+ notflag = 1;
+ p++;
+ }
+
+ if ((c = *p++) == '\0')
+ return (0);
+
+ do {
+ if (c == '-' && lc != '\0' && *p != ']') {
+ if ((c = *p++) == '\0')
+ return (0);
+ if (c == '\\' && (c = *p++) == '\0')
+ return (0);
+
+ if (notflag) {
+ if (s1 < lc || s1 > c)
+ ok++;
+ else
+ return (0);
+ } else if (lc <= s1 && s1 <= c)
+ ok++;
+
+ } else if (c == '\\' && (c = *p++) == '\0')
+ return (0);
+
+ lc = c; /* save left-hand 'c' for next iteration */
+
+ if (notflag) {
+ if (s1 != c)
+ ok++;
+ else
+ return (0);
+ } else if (s1 == c)
+ ok++;
+
+ if ((c = *p++) == '\0')
+ return (0);
+
+ } while (c != ']');
+
+ if (ok)
+ goto top;
+
+ return (0);
+ }
+
+ case '\\':
+ if ((c = *p++) == '\0')
+ return (0);
+ /*FALLTHRU*/
+
+ default:
+ if (c != s1)
+ return (0);
+ /*FALLTHRU*/
+
+ case '?':
+ if (s1 != '\0')
+ goto top;
+ return (0);
+
+ case '*':
+ while (*p == '*')
+ p++; /* consecutive *'s are identical to a single one */
+
+ if (*p == '\0')
+ return (1);
+
+ for (s = olds; *s != '\0'; s++) {
+ if ((gs = dtrace_match_glob(s, p, depth + 1)) != 0)
+ return (gs);
+ }
+
+ return (0);
+ }
+}
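+
+/*
+ * Worked examples (illustrative): dtrace_match_glob("zfs_read", "zfs_*", 0)
+ * and dtrace_match_glob("read", "rea?", 0) both return 1;
+ * dtrace_match_glob("read", "[a-q]*", 0) returns 0, since 'r' lies outside
+ * the character class; and a run of stars such as "z**" behaves exactly
+ * like "z*".
+ */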
+
+/*ARGSUSED*/
+static int
+dtrace_match_string(const char *s, const char *p, int depth)
+{
+ return (s != NULL && strcmp(s, p) == 0);
+}
+
+/*ARGSUSED*/
+static int
+dtrace_match_nul(const char *s, const char *p, int depth)
+{
+ return (1); /* always match the empty pattern */
+}
+
+/*ARGSUSED*/
+static int
+dtrace_match_nonzero(const char *s, const char *p, int depth)
+{
+ return (s != NULL && s[0] != '\0');
+}
+
+static int
+dtrace_match(const dtrace_probekey_t *pkp, uint32_t priv, uid_t uid,
+ zoneid_t zoneid, int (*matched)(dtrace_probe_t *, void *), void *arg)
+{
+ dtrace_probe_t template, *probe;
+ dtrace_hash_t *hash = NULL;
+ int len, best = INT_MAX, nmatched = 0;
+ dtrace_id_t i;
+
+ ASSERT(MUTEX_HELD(&dtrace_lock));
+
+ /*
+ * If the probe ID is specified in the key, just lookup by ID and
+ * invoke the match callback once if a matching probe is found.
+ */
+ if (pkp->dtpk_id != DTRACE_IDNONE) {
+ if ((probe = dtrace_probe_lookup_id(pkp->dtpk_id)) != NULL &&
+ dtrace_match_probe(probe, pkp, priv, uid, zoneid) > 0) {
+ (void) (*matched)(probe, arg);
+ nmatched++;
+ }
+ return (nmatched);
+ }
+
+ template.dtpr_mod = (char *)pkp->dtpk_mod;
+ template.dtpr_func = (char *)pkp->dtpk_func;
+ template.dtpr_name = (char *)pkp->dtpk_name;
+
+ /*
+ * We want to find the most distinct of the module name, function
+ * name, and name. So for each one that is not a glob pattern or
+ * empty string, we perform a lookup in the corresponding hash and
+ * use the hash table with the fewest collisions to do our search.
+ */
+ if (pkp->dtpk_mmatch == &dtrace_match_string &&
+ (len = dtrace_hash_collisions(dtrace_bymod, &template)) < best) {
+ best = len;
+ hash = dtrace_bymod;
+ }
+
+ if (pkp->dtpk_fmatch == &dtrace_match_string &&
+ (len = dtrace_hash_collisions(dtrace_byfunc, &template)) < best) {
+ best = len;
+ hash = dtrace_byfunc;
+ }
+
+ if (pkp->dtpk_nmatch == &dtrace_match_string &&
+ (len = dtrace_hash_collisions(dtrace_byname, &template)) < best) {
+ best = len;
+ hash = dtrace_byname;
+ }
+
+ /*
+ * If we did not select a hash table, iterate over every probe and
+ * invoke our callback for each one that matches our input probe key.
+ */
+ if (hash == NULL) {
+ for (i = 0; i < dtrace_nprobes; i++) {
+ if ((probe = dtrace_probes[i]) == NULL ||
+ dtrace_match_probe(probe, pkp, priv, uid,
+ zoneid) <= 0)
+ continue;
+
+ nmatched++;
+
+ if ((*matched)(probe, arg) != DTRACE_MATCH_NEXT)
+ break;
+ }
+
+ return (nmatched);
+ }
+
+ /*
+ * If we selected a hash table, iterate over each probe of the same key
+ * name and invoke the callback for every probe that matches the other
+ * attributes of our input probe key.
+ */
+ for (probe = dtrace_hash_lookup(hash, &template); probe != NULL;
+ probe = *(DTRACE_HASHNEXT(hash, probe))) {
+
+ if (dtrace_match_probe(probe, pkp, priv, uid, zoneid) <= 0)
+ continue;
+
+ nmatched++;
+
+ if ((*matched)(probe, arg) != DTRACE_MATCH_NEXT)
+ break;
+ }
+
+ return (nmatched);
+}
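+
+/*
+ * Illustrative example: for fbt:kernel:vn_open:entry, all three of
+ * dtpk_mmatch, dtpk_fmatch and dtpk_nmatch are dtrace_match_string(), so
+ * the walk uses whichever of dtrace_bymod, dtrace_byfunc and dtrace_byname
+ * has the shortest chain for the template -- almost certainly
+ * dtrace_byfunc, since far fewer probes share the function "vn_open" than
+ * share the module "kernel" or the name "entry".  For fbt:::entry only
+ * dtrace_byname qualifies, and for fbt::: no hash qualifies at all, so
+ * every probe is scanned.
+ */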
+
+/*
+ * Return the function pointer dtrace_match_probe() should use to compare the
+ * specified pattern with a string. For NULL or empty patterns, we select
+ * dtrace_match_nul(). For glob pattern strings, we use dtrace_match_glob().
+ * For non-empty non-glob strings, we use dtrace_match_string().
+ */
+static dtrace_probekey_f *
+dtrace_probekey_func(const char *p)
+{
+ char c;
+
+ if (p == NULL || *p == '\0')
+ return (&dtrace_match_nul);
+
+ while ((c = *p++) != '\0') {
+ if (c == '[' || c == '?' || c == '*' || c == '\\')
+ return (&dtrace_match_glob);
+ }
+
+ return (&dtrace_match_string);
+}
+
+/*
+ * Build a probe comparison key for use with dtrace_match_probe() from the
+ * given probe description. By convention, a null key only matches anchored
+ * probes: if each field is the empty string, reset dtpk_fmatch to
+ * dtrace_match_nonzero().
+ */
+static void
+dtrace_probekey(dtrace_probedesc_t *pdp, dtrace_probekey_t *pkp)
+{
+ pkp->dtpk_prov = pdp->dtpd_provider;
+ pkp->dtpk_pmatch = dtrace_probekey_func(pdp->dtpd_provider);
+
+ pkp->dtpk_mod = pdp->dtpd_mod;
+ pkp->dtpk_mmatch = dtrace_probekey_func(pdp->dtpd_mod);
+
+ pkp->dtpk_func = pdp->dtpd_func;
+ pkp->dtpk_fmatch = dtrace_probekey_func(pdp->dtpd_func);
+
+ pkp->dtpk_name = pdp->dtpd_name;
+ pkp->dtpk_nmatch = dtrace_probekey_func(pdp->dtpd_name);
+
+ pkp->dtpk_id = pdp->dtpd_id;
+
+ if (pkp->dtpk_id == DTRACE_IDNONE &&
+ pkp->dtpk_pmatch == &dtrace_match_nul &&
+ pkp->dtpk_mmatch == &dtrace_match_nul &&
+ pkp->dtpk_fmatch == &dtrace_match_nul &&
+ pkp->dtpk_nmatch == &dtrace_match_nul)
+ pkp->dtpk_fmatch = &dtrace_match_nonzero;
+}
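+
+/*
+ * Illustrative example: the description syscall::read*: produces
+ * dtpk_pmatch = dtrace_match_string() ("syscall" is a plain string),
+ * dtpk_fmatch = dtrace_match_glob() ("read*" contains a metacharacter),
+ * and dtrace_match_nul() for the empty module and name fields.  The fully
+ * empty description ::: is rewritten by the final clause above so that
+ * dtpk_fmatch is dtrace_match_nonzero(), matching only probes with a
+ * non-empty function name -- i.e., anchored probes.
+ */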
+
+/*
+ * DTrace Provider-to-Framework API Functions
+ *
+ * These functions implement much of the Provider-to-Framework API, as
+ * described in <sys/dtrace.h>. The parts of the API not in this section are
+ * the functions in the API for probe management (found below), and
+ * dtrace_probe() itself (found above).
+ */
+
+/*
+ * Register the calling provider with the DTrace framework. This should
+ * generally be called by DTrace providers in their attach(9E) entry point.
+ */
+int
+dtrace_register(const char *name, const dtrace_pattr_t *pap, uint32_t priv,
+ cred_t *cr, const dtrace_pops_t *pops, void *arg, dtrace_provider_id_t *idp)
+{
+ dtrace_provider_t *provider;
+
+ if (name == NULL || pap == NULL || pops == NULL || idp == NULL) {
+ cmn_err(CE_WARN, "failed to register provider '%s': invalid "
+ "arguments", name ? name : "<NULL>");
+ return (EINVAL);
+ }
+
+ if (name[0] == '\0' || dtrace_badname(name)) {
+ cmn_err(CE_WARN, "failed to register provider '%s': invalid "
+ "provider name", name);
+ return (EINVAL);
+ }
+
+ if ((pops->dtps_provide == NULL && pops->dtps_provide_module == NULL) ||
+ pops->dtps_enable == NULL || pops->dtps_disable == NULL ||
+ pops->dtps_destroy == NULL ||
+ ((pops->dtps_resume == NULL) != (pops->dtps_suspend == NULL))) {
+ cmn_err(CE_WARN, "failed to register provider '%s': invalid "
+ "provider ops", name);
+ return (EINVAL);
+ }
+
+ if (dtrace_badattr(&pap->dtpa_provider) ||
+ dtrace_badattr(&pap->dtpa_mod) ||
+ dtrace_badattr(&pap->dtpa_func) ||
+ dtrace_badattr(&pap->dtpa_name) ||
+ dtrace_badattr(&pap->dtpa_args)) {
+ cmn_err(CE_WARN, "failed to register provider '%s': invalid "
+ "provider attributes", name);
+ return (EINVAL);
+ }
+
+ if (priv & ~DTRACE_PRIV_ALL) {
+ cmn_err(CE_WARN, "failed to register provider '%s': invalid "
+ "privilege attributes", name);
+ return (EINVAL);
+ }
+
+ if ((priv & DTRACE_PRIV_KERNEL) &&
+ (priv & (DTRACE_PRIV_USER | DTRACE_PRIV_OWNER)) &&
+ pops->dtps_usermode == NULL) {
+ cmn_err(CE_WARN, "failed to register provider '%s': need "
+ "dtps_usermode() op for given privilege attributes", name);
+ return (EINVAL);
+ }
+
+ provider = kmem_zalloc(sizeof (dtrace_provider_t), KM_SLEEP);
+ provider->dtpv_name = kmem_alloc(strlen(name) + 1, KM_SLEEP);
+ (void) strcpy(provider->dtpv_name, name);
+
+ provider->dtpv_attr = *pap;
+ provider->dtpv_priv.dtpp_flags = priv;
+ if (cr != NULL) {
+ provider->dtpv_priv.dtpp_uid = crgetuid(cr);
+ provider->dtpv_priv.dtpp_zoneid = crgetzoneid(cr);
+ }
+ provider->dtpv_pops = *pops;
+
+ if (pops->dtps_provide == NULL) {
+ ASSERT(pops->dtps_provide_module != NULL);
+ provider->dtpv_pops.dtps_provide =
+ (void (*)(void *, dtrace_probedesc_t *))dtrace_nullop;
+ }
+
+ if (pops->dtps_provide_module == NULL) {
+ ASSERT(pops->dtps_provide != NULL);
+ provider->dtpv_pops.dtps_provide_module =
+ (void (*)(void *, modctl_t *))dtrace_nullop;
+ }
+
+ if (pops->dtps_suspend == NULL) {
+ ASSERT(pops->dtps_resume == NULL);
+ provider->dtpv_pops.dtps_suspend =
+ (void (*)(void *, dtrace_id_t, void *))dtrace_nullop;
+ provider->dtpv_pops.dtps_resume =
+ (void (*)(void *, dtrace_id_t, void *))dtrace_nullop;
+ }
+
+ provider->dtpv_arg = arg;
+ *idp = (dtrace_provider_id_t)provider;
+
+ if (pops == &dtrace_provider_ops) {
+ ASSERT(MUTEX_HELD(&dtrace_provider_lock));
+ ASSERT(MUTEX_HELD(&dtrace_lock));
+ ASSERT(dtrace_anon.dta_enabling == NULL);
+
+ /*
+ * We make sure that the DTrace provider is at the head of
+ * the provider chain.
+ */
+ provider->dtpv_next = dtrace_provider;
+ dtrace_provider = provider;
+ return (0);
+ }
+
+ mutex_enter(&dtrace_provider_lock);
+ mutex_enter(&dtrace_lock);
+
+ /*
+ * If there is at least one provider registered, we'll add this
+ * provider after the first provider.
+ */
+ if (dtrace_provider != NULL) {
+ provider->dtpv_next = dtrace_provider->dtpv_next;
+ dtrace_provider->dtpv_next = provider;
+ } else {
+ dtrace_provider = provider;
+ }
+
+ if (dtrace_retained != NULL) {
+ dtrace_enabling_provide(provider);
+
+ /*
+ * Now we need to call dtrace_enabling_matchall() -- which
+ * will acquire cpu_lock and dtrace_lock. We therefore need
+ * to drop all of our locks before calling into it...
+ */
+ mutex_exit(&dtrace_lock);
+ mutex_exit(&dtrace_provider_lock);
+ dtrace_enabling_matchall();
+
+ return (0);
+ }
+
+ mutex_exit(&dtrace_lock);
+ mutex_exit(&dtrace_provider_lock);
+
+ return (0);
+}
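+
+/*
+ * Minimal registration sketch (illustrative only; foo_attr, foo_pops and
+ * foo_id are hypothetical names, not part of this file).  From a
+ * provider's attach routine:
+ *
+ *	static dtrace_provider_id_t foo_id;
+ *
+ *	if (dtrace_register("foo", &foo_attr, DTRACE_PRIV_KERNEL, NULL,
+ *	    &foo_pops, NULL, &foo_id) != 0)
+ *		return (ENXIO);
+ *
+ * The checks above require that foo_pops supply dtps_enable, dtps_disable,
+ * dtps_destroy and at least one of dtps_provide and dtps_provide_module,
+ * and that dtps_suspend and dtps_resume be supplied together or not at
+ * all.
+ */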
+
+/*
+ * Unregister the specified provider from the DTrace framework. This should
+ * generally be called by DTrace providers in their detach(9E) entry point.
+ */
+int
+dtrace_unregister(dtrace_provider_id_t id)
+{
+ dtrace_provider_t *old = (dtrace_provider_t *)id;
+ dtrace_provider_t *prev = NULL;
+ int i, self = 0, noreap = 0;
+ dtrace_probe_t *probe, *first = NULL;
+
+ if (old->dtpv_pops.dtps_enable ==
+ (void (*)(void *, dtrace_id_t, void *))dtrace_nullop) {
+ /*
+ * If DTrace itself is the provider, we're called with locks
+ * already held.
+ */
+ ASSERT(old == dtrace_provider);
+#ifdef illumos
+ ASSERT(dtrace_devi != NULL);
+#endif
+ ASSERT(MUTEX_HELD(&dtrace_provider_lock));
+ ASSERT(MUTEX_HELD(&dtrace_lock));
+ self = 1;
+
+ if (dtrace_provider->dtpv_next != NULL) {
+ /*
+ * There's another provider here; return failure.
+ */
+ return (EBUSY);
+ }
+ } else {
+ mutex_enter(&dtrace_provider_lock);
+#ifdef illumos
+ mutex_enter(&mod_lock);
+#endif
+ mutex_enter(&dtrace_lock);
+ }
+
+ /*
+ * If anyone has /dev/dtrace open, or if there are anonymous enabled
+ * probes, we refuse to let providers slither away, unless this
+ * provider has already been explicitly invalidated.
+ */
+ if (!old->dtpv_defunct &&
+ (dtrace_opens || (dtrace_anon.dta_state != NULL &&
+ dtrace_anon.dta_state->dts_necbs > 0))) {
+ if (!self) {
+ mutex_exit(&dtrace_lock);
+#ifdef illumos
+ mutex_exit(&mod_lock);
+#endif
+ mutex_exit(&dtrace_provider_lock);
+ }
+ return (EBUSY);
+ }
+
+ /*
+ * Attempt to destroy the probes associated with this provider.
+ */
+ for (i = 0; i < dtrace_nprobes; i++) {
+ if ((probe = dtrace_probes[i]) == NULL)
+ continue;
+
+ if (probe->dtpr_provider != old)
+ continue;
+
+ if (probe->dtpr_ecb == NULL)
+ continue;
+
+ /*
+ * If we are trying to unregister a defunct provider, and the
+ * provider was made defunct within the interval dictated by
+ * dtrace_unregister_defunct_reap, we'll (asynchronously)
+ * attempt to reap our enablings. To denote that the provider
+ * should reattempt to unregister itself at some point in the
+		 * future, we will return a distinguishable error code (EAGAIN
+ * instead of EBUSY) in this case.
+ */
+ if (dtrace_gethrtime() - old->dtpv_defunct >
+ dtrace_unregister_defunct_reap)
+ noreap = 1;
+
+ if (!self) {
+ mutex_exit(&dtrace_lock);
+#ifdef illumos
+ mutex_exit(&mod_lock);
+#endif
+ mutex_exit(&dtrace_provider_lock);
+ }
+
+ if (noreap)
+ return (EBUSY);
+
+ (void) taskq_dispatch(dtrace_taskq,
+ (task_func_t *)dtrace_enabling_reap, NULL, TQ_SLEEP);
+
+ return (EAGAIN);
+ }
+
+ /*
+ * All of the probes for this provider are disabled; we can safely
+ * remove all of them from their hash chains and from the probe array.
+ */
+ for (i = 0; i < dtrace_nprobes; i++) {
+ if ((probe = dtrace_probes[i]) == NULL)
+ continue;
+
+ if (probe->dtpr_provider != old)
+ continue;
+
+ dtrace_probes[i] = NULL;
+
+ dtrace_hash_remove(dtrace_bymod, probe);
+ dtrace_hash_remove(dtrace_byfunc, probe);
+ dtrace_hash_remove(dtrace_byname, probe);
+
+ if (first == NULL) {
+ first = probe;
+ probe->dtpr_nextmod = NULL;
+ } else {
+ probe->dtpr_nextmod = first;
+ first = probe;
+ }
+ }
+
+ /*
+ * The provider's probes have been removed from the hash chains and
+ * from the probe array. Now issue a dtrace_sync() to be sure that
+ * everyone has cleared out from any probe array processing.
+ */
+ dtrace_sync();
+
+ for (probe = first; probe != NULL; probe = first) {
+ first = probe->dtpr_nextmod;
+
+ old->dtpv_pops.dtps_destroy(old->dtpv_arg, probe->dtpr_id,
+ probe->dtpr_arg);
+ kmem_free(probe->dtpr_mod, strlen(probe->dtpr_mod) + 1);
+ kmem_free(probe->dtpr_func, strlen(probe->dtpr_func) + 1);
+ kmem_free(probe->dtpr_name, strlen(probe->dtpr_name) + 1);
+#ifdef illumos
+ vmem_free(dtrace_arena, (void *)(uintptr_t)(probe->dtpr_id), 1);
+#else
+ free_unr(dtrace_arena, probe->dtpr_id);
+#endif
+ kmem_free(probe, sizeof (dtrace_probe_t));
+ }
+
+ if ((prev = dtrace_provider) == old) {
+#ifdef illumos
+ ASSERT(self || dtrace_devi == NULL);
+ ASSERT(old->dtpv_next == NULL || dtrace_devi == NULL);
+#endif
+ dtrace_provider = old->dtpv_next;
+ } else {
+ while (prev != NULL && prev->dtpv_next != old)
+ prev = prev->dtpv_next;
+
+ if (prev == NULL) {
+ panic("attempt to unregister non-existent "
+ "dtrace provider %p\n", (void *)id);
+ }
+
+ prev->dtpv_next = old->dtpv_next;
+ }
+
+ if (!self) {
+ mutex_exit(&dtrace_lock);
+#ifdef illumos
+ mutex_exit(&mod_lock);
+#endif
+ mutex_exit(&dtrace_provider_lock);
+ }
+
+ kmem_free(old->dtpv_name, strlen(old->dtpv_name) + 1);
+ kmem_free(old, sizeof (dtrace_provider_t));
+
+ return (0);
+}
+
+/*
+ * Invalidate the specified provider. All subsequent probe lookups for the
+ * specified provider will fail, but its probes will not be removed.
+ */
+void
+dtrace_invalidate(dtrace_provider_id_t id)
+{
+ dtrace_provider_t *pvp = (dtrace_provider_t *)id;
+
+ ASSERT(pvp->dtpv_pops.dtps_enable !=
+ (void (*)(void *, dtrace_id_t, void *))dtrace_nullop);
+
+ mutex_enter(&dtrace_provider_lock);
+ mutex_enter(&dtrace_lock);
+
+ pvp->dtpv_defunct = dtrace_gethrtime();
+
+ mutex_exit(&dtrace_lock);
+ mutex_exit(&dtrace_provider_lock);
+}
+
+/*
+ * Indicate whether or not DTrace has attached.
+ */
+int
+dtrace_attached(void)
+{
+ /*
+ * dtrace_provider will be non-NULL iff the DTrace driver has
+ * attached. (It's non-NULL because DTrace is always itself a
+ * provider.)
+ */
+ return (dtrace_provider != NULL);
+}
+
+/*
+ * Remove all the unenabled probes for the given provider. This function is
+ * not unlike dtrace_unregister(), except that it doesn't remove the provider
+ * -- just as many of its associated probes as it can.
+ */
+int
+dtrace_condense(dtrace_provider_id_t id)
+{
+ dtrace_provider_t *prov = (dtrace_provider_t *)id;
+ int i;
+ dtrace_probe_t *probe;
+
+ /*
+ * Make sure this isn't the dtrace provider itself.
+ */
+ ASSERT(prov->dtpv_pops.dtps_enable !=
+ (void (*)(void *, dtrace_id_t, void *))dtrace_nullop);
+
+ mutex_enter(&dtrace_provider_lock);
+ mutex_enter(&dtrace_lock);
+
+ /*
+ * Attempt to destroy the probes associated with this provider.
+ */
+ for (i = 0; i < dtrace_nprobes; i++) {
+ if ((probe = dtrace_probes[i]) == NULL)
+ continue;
+
+ if (probe->dtpr_provider != prov)
+ continue;
+
+ if (probe->dtpr_ecb != NULL)
+ continue;
+
+ dtrace_probes[i] = NULL;
+
+ dtrace_hash_remove(dtrace_bymod, probe);
+ dtrace_hash_remove(dtrace_byfunc, probe);
+ dtrace_hash_remove(dtrace_byname, probe);
+
+ prov->dtpv_pops.dtps_destroy(prov->dtpv_arg, i + 1,
+ probe->dtpr_arg);
+ kmem_free(probe->dtpr_mod, strlen(probe->dtpr_mod) + 1);
+ kmem_free(probe->dtpr_func, strlen(probe->dtpr_func) + 1);
+ kmem_free(probe->dtpr_name, strlen(probe->dtpr_name) + 1);
+ kmem_free(probe, sizeof (dtrace_probe_t));
+#ifdef illumos
+ vmem_free(dtrace_arena, (void *)((uintptr_t)i + 1), 1);
+#else
+ free_unr(dtrace_arena, i + 1);
+#endif
+ }
+
+ mutex_exit(&dtrace_lock);
+ mutex_exit(&dtrace_provider_lock);
+
+ return (0);
+}
+
+/*
+ * DTrace Probe Management Functions
+ *
+ * The functions in this section perform the DTrace probe management,
+ * including functions to create probes, look-up probes, and call into the
+ * providers to request that probes be provided. Some of these functions are
+ * in the Provider-to-Framework API; these functions can be identified by the
+ * fact that they are not declared "static".
+ */
+
+/*
+ * Create a probe with the specified module name, function name, and name.
+ */
+dtrace_id_t
+dtrace_probe_create(dtrace_provider_id_t prov, const char *mod,
+ const char *func, const char *name, int aframes, void *arg)
+{
+ dtrace_probe_t *probe, **probes;
+ dtrace_provider_t *provider = (dtrace_provider_t *)prov;
+ dtrace_id_t id;
+
+ if (provider == dtrace_provider) {
+ ASSERT(MUTEX_HELD(&dtrace_lock));
+ } else {
+ mutex_enter(&dtrace_lock);
+ }
+
+#ifdef illumos
+ id = (dtrace_id_t)(uintptr_t)vmem_alloc(dtrace_arena, 1,
+ VM_BESTFIT | VM_SLEEP);
+#else
+ id = alloc_unr(dtrace_arena);
+#endif
+ probe = kmem_zalloc(sizeof (dtrace_probe_t), KM_SLEEP);
+
+ probe->dtpr_id = id;
+ probe->dtpr_gen = dtrace_probegen++;
+ probe->dtpr_mod = dtrace_strdup(mod);
+ probe->dtpr_func = dtrace_strdup(func);
+ probe->dtpr_name = dtrace_strdup(name);
+ probe->dtpr_arg = arg;
+ probe->dtpr_aframes = aframes;
+ probe->dtpr_provider = provider;
+
+ dtrace_hash_add(dtrace_bymod, probe);
+ dtrace_hash_add(dtrace_byfunc, probe);
+ dtrace_hash_add(dtrace_byname, probe);
+
+ if (id - 1 >= dtrace_nprobes) {
+ size_t osize = dtrace_nprobes * sizeof (dtrace_probe_t *);
+ size_t nsize = osize << 1;
+
+ if (nsize == 0) {
+ ASSERT(osize == 0);
+ ASSERT(dtrace_probes == NULL);
+ nsize = sizeof (dtrace_probe_t *);
+ }
+
+ probes = kmem_zalloc(nsize, KM_SLEEP);
+
+ if (dtrace_probes == NULL) {
+ ASSERT(osize == 0);
+ dtrace_probes = probes;
+ dtrace_nprobes = 1;
+ } else {
+ dtrace_probe_t **oprobes = dtrace_probes;
+
+ bcopy(oprobes, probes, osize);
+ dtrace_membar_producer();
+ dtrace_probes = probes;
+
+ dtrace_sync();
+
+ /*
+ * All CPUs are now seeing the new probes array; we can
+ * safely free the old array.
+ */
+ kmem_free(oprobes, osize);
+ dtrace_nprobes <<= 1;
+ }
+
+ ASSERT(id - 1 < dtrace_nprobes);
+ }
+
+ ASSERT(dtrace_probes[id - 1] == NULL);
+ dtrace_probes[id - 1] = probe;
+
+ if (provider != dtrace_provider)
+ mutex_exit(&dtrace_lock);
+
+ return (id);
+}
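+
+/*
+ * Illustrative note: the probes array grows by doubling, so creating the
+ * fifth probe while dtrace_nprobes == 4 copies the table into an
+ * eight-slot array.  dtrace_membar_producer() publishes the new pointer
+ * before dtrace_sync() guarantees that no CPU still references the old
+ * one, which is what makes the subsequent kmem_free() safe against
+ * lockless readers in probe context.
+ */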
+
+static dtrace_probe_t *
+dtrace_probe_lookup_id(dtrace_id_t id)
+{
+ ASSERT(MUTEX_HELD(&dtrace_lock));
+
+ if (id == 0 || id > dtrace_nprobes)
+ return (NULL);
+
+ return (dtrace_probes[id - 1]);
+}
+
+static int
+dtrace_probe_lookup_match(dtrace_probe_t *probe, void *arg)
+{
+ *((dtrace_id_t *)arg) = probe->dtpr_id;
+
+ return (DTRACE_MATCH_DONE);
+}
+
+/*
+ * Look up a probe based on provider and one or more of module name, function
+ * name and probe name.
+ */
+dtrace_id_t
+dtrace_probe_lookup(dtrace_provider_id_t prid, char *mod,
+ char *func, char *name)
+{
+ dtrace_probekey_t pkey;
+ dtrace_id_t id;
+ int match;
+
+ pkey.dtpk_prov = ((dtrace_provider_t *)prid)->dtpv_name;
+ pkey.dtpk_pmatch = &dtrace_match_string;
+ pkey.dtpk_mod = mod;
+ pkey.dtpk_mmatch = mod ? &dtrace_match_string : &dtrace_match_nul;
+ pkey.dtpk_func = func;
+ pkey.dtpk_fmatch = func ? &dtrace_match_string : &dtrace_match_nul;
+ pkey.dtpk_name = name;
+ pkey.dtpk_nmatch = name ? &dtrace_match_string : &dtrace_match_nul;
+ pkey.dtpk_id = DTRACE_IDNONE;
+
+ mutex_enter(&dtrace_lock);
+ match = dtrace_match(&pkey, DTRACE_PRIV_ALL, 0, 0,
+ dtrace_probe_lookup_match, &id);
+ mutex_exit(&dtrace_lock);
+
+ ASSERT(match == 1 || match == 0);
+ return (match ? id : 0);
+}
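+
+/*
+ * Illustrative usage (hypothetical provider code, not part of this file):
+ * a dtps_provide() implementation typically checks for an existing probe
+ * before creating one, keeping repeated provide requests idempotent:
+ *
+ *	if (dtrace_probe_lookup(foo_id, "mod", "func", "entry") != 0)
+ *		return;
+ *	(void) dtrace_probe_create(foo_id, "mod", "func", "entry", 0, NULL);
+ */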
+
+/*
+ * Returns the probe argument associated with the specified probe.
+ */
+void *
+dtrace_probe_arg(dtrace_provider_id_t id, dtrace_id_t pid)
+{
+ dtrace_probe_t *probe;
+ void *rval = NULL;
+
+ mutex_enter(&dtrace_lock);
+
+ if ((probe = dtrace_probe_lookup_id(pid)) != NULL &&
+ probe->dtpr_provider == (dtrace_provider_t *)id)
+ rval = probe->dtpr_arg;
+
+ mutex_exit(&dtrace_lock);
+
+ return (rval);
+}
+
+/*
+ * Copy a probe into a probe description.
+ */
+static void
+dtrace_probe_description(const dtrace_probe_t *prp, dtrace_probedesc_t *pdp)
+{
+ bzero(pdp, sizeof (dtrace_probedesc_t));
+ pdp->dtpd_id = prp->dtpr_id;
+
+ (void) strncpy(pdp->dtpd_provider,
+ prp->dtpr_provider->dtpv_name, DTRACE_PROVNAMELEN - 1);
+
+ (void) strncpy(pdp->dtpd_mod, prp->dtpr_mod, DTRACE_MODNAMELEN - 1);
+ (void) strncpy(pdp->dtpd_func, prp->dtpr_func, DTRACE_FUNCNAMELEN - 1);
+ (void) strncpy(pdp->dtpd_name, prp->dtpr_name, DTRACE_NAMELEN - 1);
+}
+
+/*
+ * Called to indicate that a probe -- or probes -- should be provided by a
+ * specified provider. If the specified description is NULL, the provider will
+ * be told to provide all of its probes. (This is done whenever a new
+ * consumer comes along, or whenever a retained enabling is to be matched.) If
+ * the specified description is non-NULL, the provider is given the
+ * opportunity to dynamically provide the specified probe, allowing providers
+ * to support the creation of probes on-the-fly. (So-called _autocreated_
+ * probes.) If the provider is NULL, the operations will be applied to all
+ * providers; if the provider is non-NULL the operations will only be applied
+ * to the specified provider. The dtrace_provider_lock must be held, and the
+ * dtrace_lock must _not_ be held -- the provider's dtps_provide() operation
+ * will need to grab the dtrace_lock when it reenters the framework through
+ * dtrace_probe_lookup(), dtrace_probe_create(), etc.
+ */
+static void
+dtrace_probe_provide(dtrace_probedesc_t *desc, dtrace_provider_t *prv)
+{
+#ifdef illumos
+ modctl_t *ctl;
+#endif
+ int all = 0;
+
+ ASSERT(MUTEX_HELD(&dtrace_provider_lock));
+
+ if (prv == NULL) {
+ all = 1;
+ prv = dtrace_provider;
+ }
+
+ do {
+ /*
+ * First, call the blanket provide operation.
+ */
+ prv->dtpv_pops.dtps_provide(prv->dtpv_arg, desc);
+
+#ifdef illumos
+ /*
+ * Now call the per-module provide operation. We will grab
+ * mod_lock to prevent the list from being modified. Note
+ * that this also prevents the mod_busy bits from changing.
+ * (mod_busy can only be changed with mod_lock held.)
+ */
+ mutex_enter(&mod_lock);
+
+ ctl = &modules;
+ do {
+ if (ctl->mod_busy || ctl->mod_mp == NULL)
+ continue;
+
+ prv->dtpv_pops.dtps_provide_module(prv->dtpv_arg, ctl);
+
+ } while ((ctl = ctl->mod_next) != &modules);
+
+ mutex_exit(&mod_lock);
+#endif
+ } while (all && (prv = prv->dtpv_next) != NULL);
+}
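+
+/*
+ * Illustrative example: dtrace_probe_provide(NULL, NULL), as issued when a
+ * new consumer comes along, asks every registered provider for all of its
+ * probes; a non-NULL description gives a provider such as pid the chance
+ * to autocreate only the probes that match.
+ */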
+
+#ifdef illumos
+/*
+ * Iterate over each probe, and call the Framework-to-Provider API function
+ * denoted by offs.
+ */
+static void
+dtrace_probe_foreach(uintptr_t offs)
+{
+ dtrace_provider_t *prov;
+ void (*func)(void *, dtrace_id_t, void *);
+ dtrace_probe_t *probe;
+ dtrace_icookie_t cookie;
+ int i;
+
+ /*
+ * We disable interrupts to walk through the probe array. This is
+ * safe -- the dtrace_sync() in dtrace_unregister() assures that we
+ * won't see stale data.
+ */
+ cookie = dtrace_interrupt_disable();
+
+ for (i = 0; i < dtrace_nprobes; i++) {
+ if ((probe = dtrace_probes[i]) == NULL)
+ continue;
+
+ if (probe->dtpr_ecb == NULL) {
+ /*
+ * This probe isn't enabled -- don't call the function.
+ */
+ continue;
+ }
+
+ prov = probe->dtpr_provider;
+ func = *((void(**)(void *, dtrace_id_t, void *))
+ ((uintptr_t)&prov->dtpv_pops + offs));
+
+ func(prov->dtpv_arg, i + 1, probe->dtpr_arg);
+ }
+
+ dtrace_interrupt_enable(cookie);
+}
+#endif
+
+static int
+dtrace_probe_enable(dtrace_probedesc_t *desc, dtrace_enabling_t *enab)
+{
+ dtrace_probekey_t pkey;
+ uint32_t priv;
+ uid_t uid;
+ zoneid_t zoneid;
+
+ ASSERT(MUTEX_HELD(&dtrace_lock));
+ dtrace_ecb_create_cache = NULL;
+
+ if (desc == NULL) {
+ /*
+ * If we're passed a NULL description, we're being asked to
+ * create an ECB with a NULL probe.
+ */
+ (void) dtrace_ecb_create_enable(NULL, enab);
+ return (0);
+ }
+
+ dtrace_probekey(desc, &pkey);
+ dtrace_cred2priv(enab->dten_vstate->dtvs_state->dts_cred.dcr_cred,
+ &priv, &uid, &zoneid);
+
+ return (dtrace_match(&pkey, priv, uid, zoneid, dtrace_ecb_create_enable,
+ enab));
+}
+
+/*
+ * DTrace Helper Provider Functions
+ */
+static void
+dtrace_dofattr2attr(dtrace_attribute_t *attr, const dof_attr_t dofattr)
+{
+ attr->dtat_name = DOF_ATTR_NAME(dofattr);
+ attr->dtat_data = DOF_ATTR_DATA(dofattr);
+ attr->dtat_class = DOF_ATTR_CLASS(dofattr);
+}
+
+static void
+dtrace_dofprov2hprov(dtrace_helper_provdesc_t *hprov,
+ const dof_provider_t *dofprov, char *strtab)
+{
+ hprov->dthpv_provname = strtab + dofprov->dofpv_name;
+ dtrace_dofattr2attr(&hprov->dthpv_pattr.dtpa_provider,
+ dofprov->dofpv_provattr);
+ dtrace_dofattr2attr(&hprov->dthpv_pattr.dtpa_mod,
+ dofprov->dofpv_modattr);
+ dtrace_dofattr2attr(&hprov->dthpv_pattr.dtpa_func,
+ dofprov->dofpv_funcattr);
+ dtrace_dofattr2attr(&hprov->dthpv_pattr.dtpa_name,
+ dofprov->dofpv_nameattr);
+ dtrace_dofattr2attr(&hprov->dthpv_pattr.dtpa_args,
+ dofprov->dofpv_argsattr);
+}
+
+static void
+dtrace_helper_provide_one(dof_helper_t *dhp, dof_sec_t *sec, pid_t pid)
+{
+ uintptr_t daddr = (uintptr_t)dhp->dofhp_dof;
+ dof_hdr_t *dof = (dof_hdr_t *)daddr;
+ dof_sec_t *str_sec, *prb_sec, *arg_sec, *off_sec, *enoff_sec;
+ dof_provider_t *provider;
+ dof_probe_t *probe;
+ uint32_t *off, *enoff;
+ uint8_t *arg;
+ char *strtab;
+ uint_t i, nprobes;
+ dtrace_helper_provdesc_t dhpv;
+ dtrace_helper_probedesc_t dhpb;
+ dtrace_meta_t *meta = dtrace_meta_pid;
+ dtrace_mops_t *mops = &meta->dtm_mops;
+ void *parg;
+
+ provider = (dof_provider_t *)(uintptr_t)(daddr + sec->dofs_offset);
+ str_sec = (dof_sec_t *)(uintptr_t)(daddr + dof->dofh_secoff +
+ provider->dofpv_strtab * dof->dofh_secsize);
+ prb_sec = (dof_sec_t *)(uintptr_t)(daddr + dof->dofh_secoff +
+ provider->dofpv_probes * dof->dofh_secsize);
+ arg_sec = (dof_sec_t *)(uintptr_t)(daddr + dof->dofh_secoff +
+ provider->dofpv_prargs * dof->dofh_secsize);
+ off_sec = (dof_sec_t *)(uintptr_t)(daddr + dof->dofh_secoff +
+ provider->dofpv_proffs * dof->dofh_secsize);
+
+ strtab = (char *)(uintptr_t)(daddr + str_sec->dofs_offset);
+ off = (uint32_t *)(uintptr_t)(daddr + off_sec->dofs_offset);
+ arg = (uint8_t *)(uintptr_t)(daddr + arg_sec->dofs_offset);
+ enoff = NULL;
+
+ /*
+ * See dtrace_helper_provider_validate().
+ */
+ if (dof->dofh_ident[DOF_ID_VERSION] != DOF_VERSION_1 &&
+ provider->dofpv_prenoffs != DOF_SECT_NONE) {
+ enoff_sec = (dof_sec_t *)(uintptr_t)(daddr + dof->dofh_secoff +
+ provider->dofpv_prenoffs * dof->dofh_secsize);
+ enoff = (uint32_t *)(uintptr_t)(daddr + enoff_sec->dofs_offset);
+ }
+
+ nprobes = prb_sec->dofs_size / prb_sec->dofs_entsize;
+
+ /*
+ * Create the provider.
+ */
+ dtrace_dofprov2hprov(&dhpv, provider, strtab);
+
+ if ((parg = mops->dtms_provide_pid(meta->dtm_arg, &dhpv, pid)) == NULL)
+ return;
+
+ meta->dtm_count++;
+
+ /*
+ * Create the probes.
+ */
+ for (i = 0; i < nprobes; i++) {
+ probe = (dof_probe_t *)(uintptr_t)(daddr +
+ prb_sec->dofs_offset + i * prb_sec->dofs_entsize);
+
+ /* See the check in dtrace_helper_provider_validate(). */
+ if (strlen(strtab + probe->dofpr_func) >= DTRACE_FUNCNAMELEN)
+ continue;
+
+ dhpb.dthpb_mod = dhp->dofhp_mod;
+ dhpb.dthpb_func = strtab + probe->dofpr_func;
+ dhpb.dthpb_name = strtab + probe->dofpr_name;
+ dhpb.dthpb_base = probe->dofpr_addr;
+ dhpb.dthpb_offs = off + probe->dofpr_offidx;
+ dhpb.dthpb_noffs = probe->dofpr_noffs;
+ if (enoff != NULL) {
+ dhpb.dthpb_enoffs = enoff + probe->dofpr_enoffidx;
+ dhpb.dthpb_nenoffs = probe->dofpr_nenoffs;
+ } else {
+ dhpb.dthpb_enoffs = NULL;
+ dhpb.dthpb_nenoffs = 0;
+ }
+ dhpb.dthpb_args = arg + probe->dofpr_argidx;
+ dhpb.dthpb_nargc = probe->dofpr_nargc;
+ dhpb.dthpb_xargc = probe->dofpr_xargc;
+ dhpb.dthpb_ntypes = strtab + probe->dofpr_nargv;
+ dhpb.dthpb_xtypes = strtab + probe->dofpr_xargv;
+
+ mops->dtms_create_probe(meta->dtm_arg, parg, &dhpb);
+ }
+}
+
+static void
+dtrace_helper_provide(dof_helper_t *dhp, pid_t pid)
+{
+ uintptr_t daddr = (uintptr_t)dhp->dofhp_dof;
+ dof_hdr_t *dof = (dof_hdr_t *)daddr;
+ int i;
+
+ ASSERT(MUTEX_HELD(&dtrace_meta_lock));
+
+ for (i = 0; i < dof->dofh_secnum; i++) {
+ dof_sec_t *sec = (dof_sec_t *)(uintptr_t)(daddr +
+ dof->dofh_secoff + i * dof->dofh_secsize);
+
+ if (sec->dofs_type != DOF_SECT_PROVIDER)
+ continue;
+
+ dtrace_helper_provide_one(dhp, sec, pid);
+ }
+
+ /*
+ * We may have just created probes, so we must now rematch against
+ * any retained enablings. Note that this call will acquire both
+ * cpu_lock and dtrace_lock; the fact that we are holding
+ * dtrace_meta_lock now is what defines the ordering with respect to
+ * these three locks.
+ */
+ dtrace_enabling_matchall();
+}
+
+static void
+dtrace_helper_provider_remove_one(dof_helper_t *dhp, dof_sec_t *sec, pid_t pid)
+{
+ uintptr_t daddr = (uintptr_t)dhp->dofhp_dof;
+ dof_hdr_t *dof = (dof_hdr_t *)daddr;
+ dof_sec_t *str_sec;
+ dof_provider_t *provider;
+ char *strtab;
+ dtrace_helper_provdesc_t dhpv;
+ dtrace_meta_t *meta = dtrace_meta_pid;
+ dtrace_mops_t *mops = &meta->dtm_mops;
+
+ provider = (dof_provider_t *)(uintptr_t)(daddr + sec->dofs_offset);
+ str_sec = (dof_sec_t *)(uintptr_t)(daddr + dof->dofh_secoff +
+ provider->dofpv_strtab * dof->dofh_secsize);
+
+ strtab = (char *)(uintptr_t)(daddr + str_sec->dofs_offset);
+
+ /*
+	 * Build the provider description to hand to the removal op.
+ */
+ dtrace_dofprov2hprov(&dhpv, provider, strtab);
+
+ mops->dtms_remove_pid(meta->dtm_arg, &dhpv, pid);
+
+ meta->dtm_count--;
+}
+
+static void
+dtrace_helper_provider_remove(dof_helper_t *dhp, pid_t pid)
+{
+ uintptr_t daddr = (uintptr_t)dhp->dofhp_dof;
+ dof_hdr_t *dof = (dof_hdr_t *)daddr;
+ int i;
+
+ ASSERT(MUTEX_HELD(&dtrace_meta_lock));
+
+ for (i = 0; i < dof->dofh_secnum; i++) {
+ dof_sec_t *sec = (dof_sec_t *)(uintptr_t)(daddr +
+ dof->dofh_secoff + i * dof->dofh_secsize);
+
+ if (sec->dofs_type != DOF_SECT_PROVIDER)
+ continue;
+
+ dtrace_helper_provider_remove_one(dhp, sec, pid);
+ }
+}
+
+/*
+ * DTrace Meta Provider-to-Framework API Functions
+ *
+ * These functions implement the Meta Provider-to-Framework API, as described
+ * in <sys/dtrace.h>.
+ */
+int
+dtrace_meta_register(const char *name, const dtrace_mops_t *mops, void *arg,
+ dtrace_meta_provider_id_t *idp)
+{
+ dtrace_meta_t *meta;
+ dtrace_helpers_t *help, *next;
+ int i;
+
+ *idp = DTRACE_METAPROVNONE;
+
+ /*
+ * We strictly don't need the name, but we hold onto it for
+ * debuggability. All hail error queues!
+ */
+ if (name == NULL) {
+ cmn_err(CE_WARN, "failed to register meta-provider: "
+ "invalid name");
+ return (EINVAL);
+ }
+
+ if (mops == NULL ||
+ mops->dtms_create_probe == NULL ||
+ mops->dtms_provide_pid == NULL ||
+ mops->dtms_remove_pid == NULL) {
+		cmn_err(CE_WARN, "failed to register meta-provider %s: "
+ "invalid ops", name);
+ return (EINVAL);
+ }
+
+ meta = kmem_zalloc(sizeof (dtrace_meta_t), KM_SLEEP);
+ meta->dtm_mops = *mops;
+ meta->dtm_name = kmem_alloc(strlen(name) + 1, KM_SLEEP);
+ (void) strcpy(meta->dtm_name, name);
+ meta->dtm_arg = arg;
+
+ mutex_enter(&dtrace_meta_lock);
+ mutex_enter(&dtrace_lock);
+
+ if (dtrace_meta_pid != NULL) {
+ mutex_exit(&dtrace_lock);
+ mutex_exit(&dtrace_meta_lock);
+		cmn_err(CE_WARN, "failed to register meta-provider %s: "
+ "user-land meta-provider exists", name);
+ kmem_free(meta->dtm_name, strlen(meta->dtm_name) + 1);
+ kmem_free(meta, sizeof (dtrace_meta_t));
+ return (EINVAL);
+ }
+
+ dtrace_meta_pid = meta;
+ *idp = (dtrace_meta_provider_id_t)meta;
+
+ /*
+ * If there are providers and probes ready to go, pass them
+ * off to the new meta provider now.
+ */
+
+ help = dtrace_deferred_pid;
+ dtrace_deferred_pid = NULL;
+
+ mutex_exit(&dtrace_lock);
+
+ while (help != NULL) {
+ for (i = 0; i < help->dthps_nprovs; i++) {
+ dtrace_helper_provide(&help->dthps_provs[i]->dthp_prov,
+ help->dthps_pid);
+ }
+
+ next = help->dthps_next;
+ help->dthps_next = NULL;
+ help->dthps_prev = NULL;
+ help->dthps_deferred = 0;
+ help = next;
+ }
+
+ mutex_exit(&dtrace_meta_lock);
+
+ return (0);
+}
+
+int
+dtrace_meta_unregister(dtrace_meta_provider_id_t id)
+{
+ dtrace_meta_t **pp, *old = (dtrace_meta_t *)id;
+
+ mutex_enter(&dtrace_meta_lock);
+ mutex_enter(&dtrace_lock);
+
+ if (old == dtrace_meta_pid) {
+ pp = &dtrace_meta_pid;
+ } else {
+ panic("attempt to unregister non-existent "
+ "dtrace meta-provider %p\n", (void *)old);
+ }
+
+ if (old->dtm_count != 0) {
+ mutex_exit(&dtrace_lock);
+ mutex_exit(&dtrace_meta_lock);
+ return (EBUSY);
+ }
+
+ *pp = NULL;
+
+ mutex_exit(&dtrace_lock);
+ mutex_exit(&dtrace_meta_lock);
+
+ kmem_free(old->dtm_name, strlen(old->dtm_name) + 1);
+ kmem_free(old, sizeof (dtrace_meta_t));
+
+ return (0);
+}
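+
+/*
+ * Minimal sketch (illustrative; foo_mops and foo_meta_id are hypothetical
+ * names).  A user-land meta provider -- fasttrap is the canonical one --
+ * registers once:
+ *
+ *	static dtrace_meta_provider_id_t foo_meta_id;
+ *
+ *	(void) dtrace_meta_register("foo", &foo_mops, NULL, &foo_meta_id);
+ *
+ * where foo_mops must supply dtms_create_probe, dtms_provide_pid and
+ * dtms_remove_pid; a second registration fails with EINVAL, since only one
+ * user-land meta provider may exist at a time.
+ */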
+
+/*
+ * DTrace DIF Object Functions
+ */
+static int
+dtrace_difo_err(uint_t pc, const char *format, ...)
+{
+ if (dtrace_err_verbose) {
+ va_list alist;
+
+ (void) uprintf("dtrace DIF object error: [%u]: ", pc);
+ va_start(alist, format);
+ (void) vuprintf(format, alist);
+ va_end(alist);
+ }
+
+#ifdef DTRACE_ERRDEBUG
+ dtrace_errdebug(format);
+#endif
+ return (1);
+}
+
+/*
+ * Validate a DTrace DIF object by checking the IR instructions. The following
+ * rules are currently enforced by dtrace_difo_validate():
+ *
+ * 1. Each instruction must have a valid opcode
+ * 2. Each register, string, variable, or subroutine reference must be valid
+ * 3. No instruction can modify register %r0 (must be zero)
+ * 4. All instruction reserved bits must be set to zero
+ * 5. The last instruction must be a "ret" instruction
+ * 6. All branch targets must reference a valid instruction _after_ the branch
+ */
+static int
+dtrace_difo_validate(dtrace_difo_t *dp, dtrace_vstate_t *vstate, uint_t nregs,
+ cred_t *cr)
+{
+ int err = 0, i;
+ int (*efunc)(uint_t pc, const char *, ...) = dtrace_difo_err;
+ int kcheckload;
+ uint_t pc;
+ int maxglobal = -1, maxlocal = -1, maxtlocal = -1;
+
+ kcheckload = cr == NULL ||
+ (vstate->dtvs_state->dts_cred.dcr_visible & DTRACE_CRV_KERNEL) == 0;
+
+ dp->dtdo_destructive = 0;
+
+ for (pc = 0; pc < dp->dtdo_len && err == 0; pc++) {
+ dif_instr_t instr = dp->dtdo_buf[pc];
+
+ uint_t r1 = DIF_INSTR_R1(instr);
+ uint_t r2 = DIF_INSTR_R2(instr);
+ uint_t rd = DIF_INSTR_RD(instr);
+ uint_t rs = DIF_INSTR_RS(instr);
+ uint_t label = DIF_INSTR_LABEL(instr);
+ uint_t v = DIF_INSTR_VAR(instr);
+ uint_t subr = DIF_INSTR_SUBR(instr);
+ uint_t type = DIF_INSTR_TYPE(instr);
+ uint_t op = DIF_INSTR_OP(instr);
+
+ switch (op) {
+ case DIF_OP_OR:
+ case DIF_OP_XOR:
+ case DIF_OP_AND:
+ case DIF_OP_SLL:
+ case DIF_OP_SRL:
+ case DIF_OP_SRA:
+ case DIF_OP_SUB:
+ case DIF_OP_ADD:
+ case DIF_OP_MUL:
+ case DIF_OP_SDIV:
+ case DIF_OP_UDIV:
+ case DIF_OP_SREM:
+ case DIF_OP_UREM:
+ case DIF_OP_COPYS:
+ if (r1 >= nregs)
+ err += efunc(pc, "invalid register %u\n", r1);
+ if (r2 >= nregs)
+ err += efunc(pc, "invalid register %u\n", r2);
+ if (rd >= nregs)
+ err += efunc(pc, "invalid register %u\n", rd);
+ if (rd == 0)
+ err += efunc(pc, "cannot write to %r0\n");
+ break;
+ case DIF_OP_NOT:
+ case DIF_OP_MOV:
+ case DIF_OP_ALLOCS:
+ if (r1 >= nregs)
+ err += efunc(pc, "invalid register %u\n", r1);
+ if (r2 != 0)
+ err += efunc(pc, "non-zero reserved bits\n");
+ if (rd >= nregs)
+ err += efunc(pc, "invalid register %u\n", rd);
+ if (rd == 0)
+ err += efunc(pc, "cannot write to %r0\n");
+ break;
+ case DIF_OP_LDSB:
+ case DIF_OP_LDSH:
+ case DIF_OP_LDSW:
+ case DIF_OP_LDUB:
+ case DIF_OP_LDUH:
+ case DIF_OP_LDUW:
+ case DIF_OP_LDX:
+ if (r1 >= nregs)
+ err += efunc(pc, "invalid register %u\n", r1);
+ if (r2 != 0)
+ err += efunc(pc, "non-zero reserved bits\n");
+ if (rd >= nregs)
+ err += efunc(pc, "invalid register %u\n", rd);
+ if (rd == 0)
+ err += efunc(pc, "cannot write to %r0\n");
+ if (kcheckload)
+ dp->dtdo_buf[pc] = DIF_INSTR_LOAD(op +
+ DIF_OP_RLDSB - DIF_OP_LDSB, r1, rd);
+ break;
+ case DIF_OP_RLDSB:
+ case DIF_OP_RLDSH:
+ case DIF_OP_RLDSW:
+ case DIF_OP_RLDUB:
+ case DIF_OP_RLDUH:
+ case DIF_OP_RLDUW:
+ case DIF_OP_RLDX:
+ if (r1 >= nregs)
+ err += efunc(pc, "invalid register %u\n", r1);
+ if (r2 != 0)
+ err += efunc(pc, "non-zero reserved bits\n");
+ if (rd >= nregs)
+ err += efunc(pc, "invalid register %u\n", rd);
+ if (rd == 0)
+ err += efunc(pc, "cannot write to %r0\n");
+ break;
+ case DIF_OP_ULDSB:
+ case DIF_OP_ULDSH:
+ case DIF_OP_ULDSW:
+ case DIF_OP_ULDUB:
+ case DIF_OP_ULDUH:
+ case DIF_OP_ULDUW:
+ case DIF_OP_ULDX:
+ if (r1 >= nregs)
+ err += efunc(pc, "invalid register %u\n", r1);
+ if (r2 != 0)
+ err += efunc(pc, "non-zero reserved bits\n");
+ if (rd >= nregs)
+ err += efunc(pc, "invalid register %u\n", rd);
+ if (rd == 0)
+ err += efunc(pc, "cannot write to %r0\n");
+ break;
+ case DIF_OP_STB:
+ case DIF_OP_STH:
+ case DIF_OP_STW:
+ case DIF_OP_STX:
+ if (r1 >= nregs)
+ err += efunc(pc, "invalid register %u\n", r1);
+ if (r2 != 0)
+ err += efunc(pc, "non-zero reserved bits\n");
+ if (rd >= nregs)
+ err += efunc(pc, "invalid register %u\n", rd);
+ if (rd == 0)
+ err += efunc(pc, "cannot write to 0 address\n");
+ break;
+ case DIF_OP_CMP:
+ case DIF_OP_SCMP:
+ if (r1 >= nregs)
+ err += efunc(pc, "invalid register %u\n", r1);
+ if (r2 >= nregs)
+ err += efunc(pc, "invalid register %u\n", r2);
+ if (rd != 0)
+ err += efunc(pc, "non-zero reserved bits\n");
+ break;
+ case DIF_OP_TST:
+ if (r1 >= nregs)
+ err += efunc(pc, "invalid register %u\n", r1);
+ if (r2 != 0 || rd != 0)
+ err += efunc(pc, "non-zero reserved bits\n");
+ break;
+ case DIF_OP_BA:
+ case DIF_OP_BE:
+ case DIF_OP_BNE:
+ case DIF_OP_BG:
+ case DIF_OP_BGU:
+ case DIF_OP_BGE:
+ case DIF_OP_BGEU:
+ case DIF_OP_BL:
+ case DIF_OP_BLU:
+ case DIF_OP_BLE:
+ case DIF_OP_BLEU:
+ if (label >= dp->dtdo_len) {
+ err += efunc(pc, "invalid branch target %u\n",
+ label);
+ }
+ if (label <= pc) {
+ err += efunc(pc, "backward branch to %u\n",
+ label);
+ }
+ break;
+ case DIF_OP_RET:
+ if (r1 != 0 || r2 != 0)
+ err += efunc(pc, "non-zero reserved bits\n");
+ if (rd >= nregs)
+ err += efunc(pc, "invalid register %u\n", rd);
+ break;
+ case DIF_OP_NOP:
+ case DIF_OP_POPTS:
+ case DIF_OP_FLUSHTS:
+ if (r1 != 0 || r2 != 0 || rd != 0)
+ err += efunc(pc, "non-zero reserved bits\n");
+ break;
+ case DIF_OP_SETX:
+ if (DIF_INSTR_INTEGER(instr) >= dp->dtdo_intlen) {
+ err += efunc(pc, "invalid integer ref %u\n",
+ DIF_INSTR_INTEGER(instr));
+ }
+ if (rd >= nregs)
+ err += efunc(pc, "invalid register %u\n", rd);
+ if (rd == 0)
+ err += efunc(pc, "cannot write to %r0\n");
+ break;
+ case DIF_OP_SETS:
+ if (DIF_INSTR_STRING(instr) >= dp->dtdo_strlen) {
+ err += efunc(pc, "invalid string ref %u\n",
+ DIF_INSTR_STRING(instr));
+ }
+ if (rd >= nregs)
+ err += efunc(pc, "invalid register %u\n", rd);
+ if (rd == 0)
+ err += efunc(pc, "cannot write to %r0\n");
+ break;
+ case DIF_OP_LDGA:
+ case DIF_OP_LDTA:
+ if (r1 > DIF_VAR_ARRAY_MAX)
+ err += efunc(pc, "invalid array %u\n", r1);
+ if (r2 >= nregs)
+ err += efunc(pc, "invalid register %u\n", r2);
+ if (rd >= nregs)
+ err += efunc(pc, "invalid register %u\n", rd);
+ if (rd == 0)
+ err += efunc(pc, "cannot write to %r0\n");
+ break;
+ case DIF_OP_LDGS:
+ case DIF_OP_LDTS:
+ case DIF_OP_LDLS:
+ case DIF_OP_LDGAA:
+ case DIF_OP_LDTAA:
+ if (v < DIF_VAR_OTHER_MIN || v > DIF_VAR_OTHER_MAX)
+ err += efunc(pc, "invalid variable %u\n", v);
+ if (rd >= nregs)
+ err += efunc(pc, "invalid register %u\n", rd);
+ if (rd == 0)
+ err += efunc(pc, "cannot write to %r0\n");
+ break;
+ case DIF_OP_STGS:
+ case DIF_OP_STTS:
+ case DIF_OP_STLS:
+ case DIF_OP_STGAA:
+ case DIF_OP_STTAA:
+ if (v < DIF_VAR_OTHER_UBASE || v > DIF_VAR_OTHER_MAX)
+ err += efunc(pc, "invalid variable %u\n", v);
+ if (rs >= nregs)
+			err += efunc(pc, "invalid register %u\n", rs);
+ break;
+ case DIF_OP_CALL:
+ if (subr > DIF_SUBR_MAX)
+ err += efunc(pc, "invalid subr %u\n", subr);
+ if (rd >= nregs)
+ err += efunc(pc, "invalid register %u\n", rd);
+ if (rd == 0)
+ err += efunc(pc, "cannot write to %r0\n");
+
+ if (subr == DIF_SUBR_COPYOUT ||
+ subr == DIF_SUBR_COPYOUTSTR) {
+ dp->dtdo_destructive = 1;
+ }
+
+ if (subr == DIF_SUBR_GETF) {
+ /*
+ * If we have a getf() we need to record that
+ * in our state. Note that our state can be
+ * NULL if this is a helper -- but in that
+ * case, the call to getf() is itself illegal,
+ * and will be caught (slightly later) when
+ * the helper is validated.
+ */
+ if (vstate->dtvs_state != NULL)
+ vstate->dtvs_state->dts_getf++;
+ }
+
+ break;
+ case DIF_OP_PUSHTR:
+ if (type != DIF_TYPE_STRING && type != DIF_TYPE_CTF)
+ err += efunc(pc, "invalid ref type %u\n", type);
+ if (r2 >= nregs)
+ err += efunc(pc, "invalid register %u\n", r2);
+ if (rs >= nregs)
+ err += efunc(pc, "invalid register %u\n", rs);
+ break;
+ case DIF_OP_PUSHTV:
+ if (type != DIF_TYPE_CTF)
+ err += efunc(pc, "invalid val type %u\n", type);
+ if (r2 >= nregs)
+ err += efunc(pc, "invalid register %u\n", r2);
+ if (rs >= nregs)
+ err += efunc(pc, "invalid register %u\n", rs);
+ break;
+ default:
+ err += efunc(pc, "invalid opcode %u\n",
+ DIF_INSTR_OP(instr));
+ }
+ }
+
+ if (dp->dtdo_len != 0 &&
+ DIF_INSTR_OP(dp->dtdo_buf[dp->dtdo_len - 1]) != DIF_OP_RET) {
+ err += efunc(dp->dtdo_len - 1,
+ "expected 'ret' as last DIF instruction\n");
+ }
+
+ if (!(dp->dtdo_rtype.dtdt_flags & (DIF_TF_BYREF | DIF_TF_BYUREF))) {
+ /*
+ * If we're not returning by reference, the size must be either
+ * 0 or the size of one of the base types.
+ */
+ switch (dp->dtdo_rtype.dtdt_size) {
+ case 0:
+ case sizeof (uint8_t):
+ case sizeof (uint16_t):
+ case sizeof (uint32_t):
+ case sizeof (uint64_t):
+ break;
+
+ default:
+ err += efunc(dp->dtdo_len - 1, "bad return size\n");
+ }
+ }
+
+ for (i = 0; i < dp->dtdo_varlen && err == 0; i++) {
+ dtrace_difv_t *v = &dp->dtdo_vartab[i], *existing = NULL;
+ dtrace_diftype_t *vt, *et;
+ uint_t id, ndx;
+
+ if (v->dtdv_scope != DIFV_SCOPE_GLOBAL &&
+ v->dtdv_scope != DIFV_SCOPE_THREAD &&
+ v->dtdv_scope != DIFV_SCOPE_LOCAL) {
+ err += efunc(i, "unrecognized variable scope %d\n",
+ v->dtdv_scope);
+ break;
+ }
+
+ if (v->dtdv_kind != DIFV_KIND_ARRAY &&
+ v->dtdv_kind != DIFV_KIND_SCALAR) {
+ err += efunc(i, "unrecognized variable type %d\n",
+ v->dtdv_kind);
+ break;
+ }
+
+ if ((id = v->dtdv_id) > DIF_VARIABLE_MAX) {
+ err += efunc(i, "%d exceeds variable id limit\n", id);
+ break;
+ }
+
+ if (id < DIF_VAR_OTHER_UBASE)
+ continue;
+
+ /*
+ * For user-defined variables, we need to check that this
+ * definition is identical to any previous definition that we
+ * encountered.
+ */
+ ndx = id - DIF_VAR_OTHER_UBASE;
+
+ switch (v->dtdv_scope) {
+ case DIFV_SCOPE_GLOBAL:
+ if (maxglobal == -1 || ndx > maxglobal)
+ maxglobal = ndx;
+
+ if (ndx < vstate->dtvs_nglobals) {
+ dtrace_statvar_t *svar;
+
+ if ((svar = vstate->dtvs_globals[ndx]) != NULL)
+ existing = &svar->dtsv_var;
+ }
+
+ break;
+
+ case DIFV_SCOPE_THREAD:
+ if (maxtlocal == -1 || ndx > maxtlocal)
+ maxtlocal = ndx;
+
+ if (ndx < vstate->dtvs_ntlocals)
+ existing = &vstate->dtvs_tlocals[ndx];
+ break;
+
+ case DIFV_SCOPE_LOCAL:
+ if (maxlocal == -1 || ndx > maxlocal)
+ maxlocal = ndx;
+
+ if (ndx < vstate->dtvs_nlocals) {
+ dtrace_statvar_t *svar;
+
+ if ((svar = vstate->dtvs_locals[ndx]) != NULL)
+ existing = &svar->dtsv_var;
+ }
+
+ break;
+ }
+
+ vt = &v->dtdv_type;
+
+ if (vt->dtdt_flags & DIF_TF_BYREF) {
+ if (vt->dtdt_size == 0) {
+ err += efunc(i, "zero-sized variable\n");
+ break;
+ }
+
+ if ((v->dtdv_scope == DIFV_SCOPE_GLOBAL ||
+ v->dtdv_scope == DIFV_SCOPE_LOCAL) &&
+ vt->dtdt_size > dtrace_statvar_maxsize) {
+ err += efunc(i, "oversized by-ref static\n");
+ break;
+ }
+ }
+
+ if (existing == NULL || existing->dtdv_id == 0)
+ continue;
+
+ ASSERT(existing->dtdv_id == v->dtdv_id);
+ ASSERT(existing->dtdv_scope == v->dtdv_scope);
+
+ if (existing->dtdv_kind != v->dtdv_kind)
+ err += efunc(i, "%d changed variable kind\n", id);
+
+ et = &existing->dtdv_type;
+
+ if (vt->dtdt_flags != et->dtdt_flags) {
+ err += efunc(i, "%d changed variable type flags\n", id);
+ break;
+ }
+
+ if (vt->dtdt_size != 0 && vt->dtdt_size != et->dtdt_size) {
+ err += efunc(i, "%d changed variable type size\n", id);
+ break;
+ }
+ }
+
+ for (pc = 0; pc < dp->dtdo_len && err == 0; pc++) {
+ dif_instr_t instr = dp->dtdo_buf[pc];
+
+ uint_t v = DIF_INSTR_VAR(instr);
+ uint_t op = DIF_INSTR_OP(instr);
+
+ switch (op) {
+ case DIF_OP_LDGS:
+ case DIF_OP_LDGAA:
+ case DIF_OP_STGS:
+ case DIF_OP_STGAA:
+ if (v > DIF_VAR_OTHER_UBASE + maxglobal)
+ err += efunc(pc, "invalid variable %u\n", v);
+ break;
+ case DIF_OP_LDTS:
+ case DIF_OP_LDTAA:
+ case DIF_OP_STTS:
+ case DIF_OP_STTAA:
+ if (v > DIF_VAR_OTHER_UBASE + maxtlocal)
+ err += efunc(pc, "invalid variable %u\n", v);
+ break;
+ case DIF_OP_LDLS:
+ case DIF_OP_STLS:
+ if (v > DIF_VAR_OTHER_UBASE + maxlocal)
+ err += efunc(pc, "invalid variable %u\n", v);
+ break;
+ default:
+ break;
+ }
+ }
+
+ return (err);
+}
+
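+/*
+ * Illustrative note (added; not from the original sources): the smallest
+ * DIF program that passes the validation above loads a value and returns
+ * it, e.g.
+ *
+ *	ldgs	DIF_VAR_PID, %r1
+ *	ret	%r1
+ *
+ * The checks reject, among other things, backward branches, writes to the
+ * hard-wired zero register %r0, and any program whose final instruction is
+ * not "ret".
+ */
+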
+/*
+ * Validate a DTrace DIF object for use as a helper. Helpers are much more
+ * constrained than normal DIFOs. Specifically, they may not:
+ *
+ * 1. Make calls to subroutines other than copyin(), copyinstr() or
+ *    miscellaneous string routines.
+ * 2. Access DTrace variables other than the args[] array and the curthread,
+ *    pid, ppid, tid, execargs, execname, zonename, uid and gid variables.
+ * 3. Have thread-local variables.
+ * 4. Have dynamic variables.
+ */
+static int
+dtrace_difo_validate_helper(dtrace_difo_t *dp)
+{
+ int (*efunc)(uint_t pc, const char *, ...) = dtrace_difo_err;
+ int err = 0;
+ uint_t pc;
+
+ for (pc = 0; pc < dp->dtdo_len; pc++) {
+ dif_instr_t instr = dp->dtdo_buf[pc];
+
+ uint_t v = DIF_INSTR_VAR(instr);
+ uint_t subr = DIF_INSTR_SUBR(instr);
+ uint_t op = DIF_INSTR_OP(instr);
+
+ switch (op) {
+ case DIF_OP_OR:
+ case DIF_OP_XOR:
+ case DIF_OP_AND:
+ case DIF_OP_SLL:
+ case DIF_OP_SRL:
+ case DIF_OP_SRA:
+ case DIF_OP_SUB:
+ case DIF_OP_ADD:
+ case DIF_OP_MUL:
+ case DIF_OP_SDIV:
+ case DIF_OP_UDIV:
+ case DIF_OP_SREM:
+ case DIF_OP_UREM:
+ case DIF_OP_COPYS:
+ case DIF_OP_NOT:
+ case DIF_OP_MOV:
+ case DIF_OP_RLDSB:
+ case DIF_OP_RLDSH:
+ case DIF_OP_RLDSW:
+ case DIF_OP_RLDUB:
+ case DIF_OP_RLDUH:
+ case DIF_OP_RLDUW:
+ case DIF_OP_RLDX:
+ case DIF_OP_ULDSB:
+ case DIF_OP_ULDSH:
+ case DIF_OP_ULDSW:
+ case DIF_OP_ULDUB:
+ case DIF_OP_ULDUH:
+ case DIF_OP_ULDUW:
+ case DIF_OP_ULDX:
+ case DIF_OP_STB:
+ case DIF_OP_STH:
+ case DIF_OP_STW:
+ case DIF_OP_STX:
+ case DIF_OP_ALLOCS:
+ case DIF_OP_CMP:
+ case DIF_OP_SCMP:
+ case DIF_OP_TST:
+ case DIF_OP_BA:
+ case DIF_OP_BE:
+ case DIF_OP_BNE:
+ case DIF_OP_BG:
+ case DIF_OP_BGU:
+ case DIF_OP_BGE:
+ case DIF_OP_BGEU:
+ case DIF_OP_BL:
+ case DIF_OP_BLU:
+ case DIF_OP_BLE:
+ case DIF_OP_BLEU:
+ case DIF_OP_RET:
+ case DIF_OP_NOP:
+ case DIF_OP_POPTS:
+ case DIF_OP_FLUSHTS:
+ case DIF_OP_SETX:
+ case DIF_OP_SETS:
+ case DIF_OP_LDGA:
+ case DIF_OP_LDLS:
+ case DIF_OP_STGS:
+ case DIF_OP_STLS:
+ case DIF_OP_PUSHTR:
+ case DIF_OP_PUSHTV:
+ break;
+
+ case DIF_OP_LDGS:
+ if (v >= DIF_VAR_OTHER_UBASE)
+ break;
+
+ if (v >= DIF_VAR_ARG0 && v <= DIF_VAR_ARG9)
+ break;
+
+ if (v == DIF_VAR_CURTHREAD || v == DIF_VAR_PID ||
+ v == DIF_VAR_PPID || v == DIF_VAR_TID ||
+ v == DIF_VAR_EXECARGS ||
+ v == DIF_VAR_EXECNAME || v == DIF_VAR_ZONENAME ||
+ v == DIF_VAR_UID || v == DIF_VAR_GID)
+ break;
+
+ err += efunc(pc, "illegal variable %u\n", v);
+ break;
+
+ case DIF_OP_LDTA:
+ case DIF_OP_LDTS:
+ case DIF_OP_LDGAA:
+ case DIF_OP_LDTAA:
+ err += efunc(pc, "illegal dynamic variable load\n");
+ break;
+
+ case DIF_OP_STTS:
+ case DIF_OP_STGAA:
+ case DIF_OP_STTAA:
+ err += efunc(pc, "illegal dynamic variable store\n");
+ break;
+
+ case DIF_OP_CALL:
+ if (subr == DIF_SUBR_ALLOCA ||
+ subr == DIF_SUBR_BCOPY ||
+ subr == DIF_SUBR_COPYIN ||
+ subr == DIF_SUBR_COPYINTO ||
+ subr == DIF_SUBR_COPYINSTR ||
+ subr == DIF_SUBR_INDEX ||
+ subr == DIF_SUBR_INET_NTOA ||
+ subr == DIF_SUBR_INET_NTOA6 ||
+ subr == DIF_SUBR_INET_NTOP ||
+ subr == DIF_SUBR_JSON ||
+ subr == DIF_SUBR_LLTOSTR ||
+ subr == DIF_SUBR_STRTOLL ||
+ subr == DIF_SUBR_RINDEX ||
+ subr == DIF_SUBR_STRCHR ||
+ subr == DIF_SUBR_STRJOIN ||
+ subr == DIF_SUBR_STRRCHR ||
+ subr == DIF_SUBR_STRSTR ||
+ subr == DIF_SUBR_HTONS ||
+ subr == DIF_SUBR_HTONL ||
+ subr == DIF_SUBR_HTONLL ||
+ subr == DIF_SUBR_NTOHS ||
+ subr == DIF_SUBR_NTOHL ||
+ subr == DIF_SUBR_NTOHLL ||
+ subr == DIF_SUBR_MEMREF)
+ break;
+#ifdef __FreeBSD__
+ if (subr == DIF_SUBR_MEMSTR)
+ break;
+#endif
+
+ err += efunc(pc, "invalid subr %u\n", subr);
+ break;
+
+ default:
+ err += efunc(pc, "invalid opcode %u\n",
+ DIF_INSTR_OP(instr));
+ }
+ }
+
+ return (err);
+}
+
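+/*
+ * Illustrative examples (added commentary): a helper expression such as
+ * copyinstr(arg0) validates cleanly, since DIF_SUBR_COPYINSTR is in the
+ * subroutine whitelist above, while one that touches a thread-local such as
+ * self->depth compiles to DIF_OP_LDTS/DIF_OP_STTS and is rejected as an
+ * illegal dynamic variable load or store.
+ */
+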
+/*
+ * Returns 1 if the expression in the DIF object can be cached on a per-thread
+ * basis; 0 if not.
+ */
+static int
+dtrace_difo_cacheable(dtrace_difo_t *dp)
+{
+ int i;
+
+ if (dp == NULL)
+ return (0);
+
+ for (i = 0; i < dp->dtdo_varlen; i++) {
+ dtrace_difv_t *v = &dp->dtdo_vartab[i];
+
+ if (v->dtdv_scope != DIFV_SCOPE_GLOBAL)
+ continue;
+
+ switch (v->dtdv_id) {
+ case DIF_VAR_CURTHREAD:
+ case DIF_VAR_PID:
+ case DIF_VAR_TID:
+ case DIF_VAR_EXECARGS:
+ case DIF_VAR_EXECNAME:
+ case DIF_VAR_ZONENAME:
+ break;
+
+ default:
+ return (0);
+ }
+ }
+
+ /*
+ * This DIF object may be cacheable. Now we need to look for any
+ * array loading instructions, any memory loading instructions, or
+ * any stores to thread-local variables.
+ */
+ for (i = 0; i < dp->dtdo_len; i++) {
+ uint_t op = DIF_INSTR_OP(dp->dtdo_buf[i]);
+
+ if ((op >= DIF_OP_LDSB && op <= DIF_OP_LDX) ||
+ (op >= DIF_OP_ULDSB && op <= DIF_OP_ULDX) ||
+ (op >= DIF_OP_RLDSB && op <= DIF_OP_RLDX) ||
+ op == DIF_OP_LDGA || op == DIF_OP_STTS)
+ return (0);
+ }
+
+ return (1);
+}
+
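+/*
+ * Example (added, illustrative): a predicate like /pid == 1234/ compiles to
+ * a setx/ldgs/cmp sequence that references only DIF_VAR_PID and performs no
+ * memory loads, so it is cacheable; a predicate that chases pointers (e.g.
+ * through curpsinfo) emits load instructions and is not.
+ */
+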
+static void
+dtrace_difo_hold(dtrace_difo_t *dp)
+{
+ int i;
+
+ ASSERT(MUTEX_HELD(&dtrace_lock));
+
+ dp->dtdo_refcnt++;
+ ASSERT(dp->dtdo_refcnt != 0);
+
+ /*
+ * We need to check this DIF object for references to the variable
+ * DIF_VAR_VTIMESTAMP.
+ */
+ for (i = 0; i < dp->dtdo_varlen; i++) {
+ dtrace_difv_t *v = &dp->dtdo_vartab[i];
+
+ if (v->dtdv_id != DIF_VAR_VTIMESTAMP)
+ continue;
+
+ if (dtrace_vtime_references++ == 0)
+ dtrace_vtime_enable();
+ }
+}
+
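+/*
+ * Added note: dtrace_vtime_references counts held DIFOs that mention
+ * DIF_VAR_VTIMESTAMP; the first such hold enables virtual-time accounting
+ * via dtrace_vtime_enable(), and dtrace_difo_release() below disables it
+ * again when the last reference is dropped.
+ */
+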
+/*
+ * This routine calculates the dynamic variable chunksize for a given DIF
+ * object. The calculation is not fool-proof, and can probably be tricked by
+ * malicious DIF -- but it works for all compiler-generated DIF. Because this
+ * calculation is likely imperfect, dtrace_dynvar() is able to gracefully fail
+ * if a dynamic variable size exceeds the chunksize.
+ */
+static void
+dtrace_difo_chunksize(dtrace_difo_t *dp, dtrace_vstate_t *vstate)
+{
+ uint64_t sval = 0;
+ dtrace_key_t tupregs[DIF_DTR_NREGS + 2]; /* +2 for thread and id */
+ const dif_instr_t *text = dp->dtdo_buf;
+ uint_t pc, srd = 0;
+ uint_t ttop = 0;
+ size_t size, ksize;
+ uint_t id, i;
+
+ for (pc = 0; pc < dp->dtdo_len; pc++) {
+ dif_instr_t instr = text[pc];
+ uint_t op = DIF_INSTR_OP(instr);
+ uint_t rd = DIF_INSTR_RD(instr);
+ uint_t r1 = DIF_INSTR_R1(instr);
+ uint_t nkeys = 0;
+ uchar_t scope = 0;
+
+ dtrace_key_t *key = tupregs;
+
+ switch (op) {
+ case DIF_OP_SETX:
+ sval = dp->dtdo_inttab[DIF_INSTR_INTEGER(instr)];
+ srd = rd;
+ continue;
+
+ case DIF_OP_STTS:
+ key = &tupregs[DIF_DTR_NREGS];
+ key[0].dttk_size = 0;
+ key[1].dttk_size = 0;
+ nkeys = 2;
+ scope = DIFV_SCOPE_THREAD;
+ break;
+
+ case DIF_OP_STGAA:
+ case DIF_OP_STTAA:
+ nkeys = ttop;
+
+ if (DIF_INSTR_OP(instr) == DIF_OP_STTAA)
+ key[nkeys++].dttk_size = 0;
+
+ key[nkeys++].dttk_size = 0;
+
+ if (op == DIF_OP_STTAA) {
+ scope = DIFV_SCOPE_THREAD;
+ } else {
+ scope = DIFV_SCOPE_GLOBAL;
+ }
+
+ break;
+
+ case DIF_OP_PUSHTR:
+ if (ttop == DIF_DTR_NREGS)
+ return;
+
+ if ((srd == 0 || sval == 0) && r1 == DIF_TYPE_STRING) {
+ /*
+ * If the register for the size of the "pushtr"
+ * is %r0 (or the value is 0) and the type is
+ * a string, we'll use the system-wide default
+ * string size.
+ */
+ tupregs[ttop++].dttk_size =
+ dtrace_strsize_default;
+ } else {
+ if (srd == 0)
+ return;
+
+ if (sval > LONG_MAX)
+ return;
+
+ tupregs[ttop++].dttk_size = sval;
+ }
+
+ break;
+
+ case DIF_OP_PUSHTV:
+ if (ttop == DIF_DTR_NREGS)
+ return;
+
+ tupregs[ttop++].dttk_size = 0;
+ break;
+
+ case DIF_OP_FLUSHTS:
+ ttop = 0;
+ break;
+
+ case DIF_OP_POPTS:
+ if (ttop != 0)
+ ttop--;
+ break;
+ }
+
+ sval = 0;
+ srd = 0;
+
+ if (nkeys == 0)
+ continue;
+
+ /*
+ * We have a dynamic variable allocation; calculate its size.
+ */
+ for (ksize = 0, i = 0; i < nkeys; i++)
+ ksize += P2ROUNDUP(key[i].dttk_size, sizeof (uint64_t));
+
+ size = sizeof (dtrace_dynvar_t);
+ size += sizeof (dtrace_key_t) * (nkeys - 1);
+ size += ksize;
+
+ /*
+ * Now we need to determine the size of the stored data.
+ */
+ id = DIF_INSTR_VAR(instr);
+
+ for (i = 0; i < dp->dtdo_varlen; i++) {
+ dtrace_difv_t *v = &dp->dtdo_vartab[i];
+
+ if (v->dtdv_id == id && v->dtdv_scope == scope) {
+ size += v->dtdv_type.dtdt_size;
+ break;
+ }
+ }
+
+ if (i == dp->dtdo_varlen)
+ return;
+
+ /*
+ * We have the size. If this is larger than the chunk size
+ * for our dynamic variable state, reset the chunk size.
+ */
+ size = P2ROUNDUP(size, sizeof (uint64_t));
+
+ /*
+ * Before setting the chunk size, check that we're not going
+ * to set it to a negative value...
+ */
+ if (size > LONG_MAX)
+ return;
+
+ /*
+ * ...and make certain that we didn't badly overflow.
+ */
+ if (size < ksize || size < sizeof (dtrace_dynvar_t))
+ return;
+
+ if (size > vstate->dtvs_dynvars.dtds_chunksize)
+ vstate->dtvs_dynvars.dtds_chunksize = size;
+ }
+}
+
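+/*
+ * Worked example (added; assumes LP64 sizes): for a store like
+ * self->seen[execname] = 1, the DIF_OP_STTAA case above yields nkeys = 3
+ * (the implicit thread and id keys plus one tuple key), the string key
+ * contributes P2ROUNDUP(dtrace_strsize_default, 8) bytes to ksize, and the
+ * candidate chunk size is
+ *
+ *	sizeof (dtrace_dynvar_t) + 2 * sizeof (dtrace_key_t) + ksize +
+ *	    the stored variable's dtdt_size,
+ *
+ * rounded up to an 8-byte boundary before being compared against
+ * dtds_chunksize.
+ */
+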
+static void
+dtrace_difo_init(dtrace_difo_t *dp, dtrace_vstate_t *vstate)
+{
+ int i, oldsvars, osz, nsz, otlocals, ntlocals;
+ uint_t id;
+
+ ASSERT(MUTEX_HELD(&dtrace_lock));
+ ASSERT(dp->dtdo_buf != NULL && dp->dtdo_len != 0);
+
+ for (i = 0; i < dp->dtdo_varlen; i++) {
+ dtrace_difv_t *v = &dp->dtdo_vartab[i];
+ dtrace_statvar_t *svar, ***svarp = NULL;
+ size_t dsize = 0;
+ uint8_t scope = v->dtdv_scope;
+ int *np = NULL;
+
+ if ((id = v->dtdv_id) < DIF_VAR_OTHER_UBASE)
+ continue;
+
+ id -= DIF_VAR_OTHER_UBASE;
+
+ switch (scope) {
+ case DIFV_SCOPE_THREAD:
+ while (id >= (otlocals = vstate->dtvs_ntlocals)) {
+ dtrace_difv_t *tlocals;
+
+ if ((ntlocals = (otlocals << 1)) == 0)
+ ntlocals = 1;
+
+ osz = otlocals * sizeof (dtrace_difv_t);
+ nsz = ntlocals * sizeof (dtrace_difv_t);
+
+ tlocals = kmem_zalloc(nsz, KM_SLEEP);
+
+ if (osz != 0) {
+ bcopy(vstate->dtvs_tlocals,
+ tlocals, osz);
+ kmem_free(vstate->dtvs_tlocals, osz);
+ }
+
+ vstate->dtvs_tlocals = tlocals;
+ vstate->dtvs_ntlocals = ntlocals;
+ }
+
+ vstate->dtvs_tlocals[id] = *v;
+ continue;
+
+ case DIFV_SCOPE_LOCAL:
+ np = &vstate->dtvs_nlocals;
+ svarp = &vstate->dtvs_locals;
+
+ if (v->dtdv_type.dtdt_flags & DIF_TF_BYREF)
+ dsize = NCPU * (v->dtdv_type.dtdt_size +
+ sizeof (uint64_t));
+ else
+ dsize = NCPU * sizeof (uint64_t);
+
+ break;
+
+ case DIFV_SCOPE_GLOBAL:
+ np = &vstate->dtvs_nglobals;
+ svarp = &vstate->dtvs_globals;
+
+ if (v->dtdv_type.dtdt_flags & DIF_TF_BYREF)
+ dsize = v->dtdv_type.dtdt_size +
+ sizeof (uint64_t);
+
+ break;
+
+ default:
+ ASSERT(0);
+ }
+
+ while (id >= (oldsvars = *np)) {
+ dtrace_statvar_t **statics;
+ int newsvars, oldsize, newsize;
+
+ if ((newsvars = (oldsvars << 1)) == 0)
+ newsvars = 1;
+
+ oldsize = oldsvars * sizeof (dtrace_statvar_t *);
+ newsize = newsvars * sizeof (dtrace_statvar_t *);
+
+ statics = kmem_zalloc(newsize, KM_SLEEP);
+
+ if (oldsize != 0) {
+ bcopy(*svarp, statics, oldsize);
+ kmem_free(*svarp, oldsize);
+ }
+
+ *svarp = statics;
+ *np = newsvars;
+ }
+
+ if ((svar = (*svarp)[id]) == NULL) {
+ svar = kmem_zalloc(sizeof (dtrace_statvar_t), KM_SLEEP);
+ svar->dtsv_var = *v;
+
+ if ((svar->dtsv_size = dsize) != 0) {
+ svar->dtsv_data = (uint64_t)(uintptr_t)
+ kmem_zalloc(dsize, KM_SLEEP);
+ }
+
+ (*svarp)[id] = svar;
+ }
+
+ svar->dtsv_refcnt++;
+ }
+
+ dtrace_difo_chunksize(dp, vstate);
+ dtrace_difo_hold(dp);
+}
+
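+/*
+ * Added note: dtvs_tlocals and the static-variable arrays above grow by
+ * doubling, so the first user-defined global at index 5, say, walks the
+ * array size 0 -> 1 -> 2 -> 4 -> 8 before the slot is populated.
+ */
+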
+static dtrace_difo_t *
+dtrace_difo_duplicate(dtrace_difo_t *dp, dtrace_vstate_t *vstate)
+{
+ dtrace_difo_t *new;
+ size_t sz;
+
+ ASSERT(dp->dtdo_buf != NULL);
+ ASSERT(dp->dtdo_refcnt != 0);
+
+ new = kmem_zalloc(sizeof (dtrace_difo_t), KM_SLEEP);
+
+ ASSERT(dp->dtdo_buf != NULL);
+ sz = dp->dtdo_len * sizeof (dif_instr_t);
+ new->dtdo_buf = kmem_alloc(sz, KM_SLEEP);
+ bcopy(dp->dtdo_buf, new->dtdo_buf, sz);
+ new->dtdo_len = dp->dtdo_len;
+
+ if (dp->dtdo_strtab != NULL) {
+ ASSERT(dp->dtdo_strlen != 0);
+ new->dtdo_strtab = kmem_alloc(dp->dtdo_strlen, KM_SLEEP);
+ bcopy(dp->dtdo_strtab, new->dtdo_strtab, dp->dtdo_strlen);
+ new->dtdo_strlen = dp->dtdo_strlen;
+ }
+
+ if (dp->dtdo_inttab != NULL) {
+ ASSERT(dp->dtdo_intlen != 0);
+ sz = dp->dtdo_intlen * sizeof (uint64_t);
+ new->dtdo_inttab = kmem_alloc(sz, KM_SLEEP);
+ bcopy(dp->dtdo_inttab, new->dtdo_inttab, sz);
+ new->dtdo_intlen = dp->dtdo_intlen;
+ }
+
+ if (dp->dtdo_vartab != NULL) {
+ ASSERT(dp->dtdo_varlen != 0);
+ sz = dp->dtdo_varlen * sizeof (dtrace_difv_t);
+ new->dtdo_vartab = kmem_alloc(sz, KM_SLEEP);
+ bcopy(dp->dtdo_vartab, new->dtdo_vartab, sz);
+ new->dtdo_varlen = dp->dtdo_varlen;
+ }
+
+ dtrace_difo_init(new, vstate);
+ return (new);
+}
+
+static void
+dtrace_difo_destroy(dtrace_difo_t *dp, dtrace_vstate_t *vstate)
+{
+ int i;
+
+ ASSERT(dp->dtdo_refcnt == 0);
+
+ for (i = 0; i < dp->dtdo_varlen; i++) {
+ dtrace_difv_t *v = &dp->dtdo_vartab[i];
+ dtrace_statvar_t *svar, **svarp = NULL;
+ uint_t id;
+ uint8_t scope = v->dtdv_scope;
+ int *np = NULL;
+
+ switch (scope) {
+ case DIFV_SCOPE_THREAD:
+ continue;
+
+ case DIFV_SCOPE_LOCAL:
+ np = &vstate->dtvs_nlocals;
+ svarp = vstate->dtvs_locals;
+ break;
+
+ case DIFV_SCOPE_GLOBAL:
+ np = &vstate->dtvs_nglobals;
+ svarp = vstate->dtvs_globals;
+ break;
+
+ default:
+ ASSERT(0);
+ }
+
+ if ((id = v->dtdv_id) < DIF_VAR_OTHER_UBASE)
+ continue;
+
+ id -= DIF_VAR_OTHER_UBASE;
+ ASSERT(id < *np);
+
+ svar = svarp[id];
+ ASSERT(svar != NULL);
+ ASSERT(svar->dtsv_refcnt > 0);
+
+ if (--svar->dtsv_refcnt > 0)
+ continue;
+
+ if (svar->dtsv_size != 0) {
+ ASSERT(svar->dtsv_data != 0);
+ kmem_free((void *)(uintptr_t)svar->dtsv_data,
+ svar->dtsv_size);
+ }
+
+ kmem_free(svar, sizeof (dtrace_statvar_t));
+ svarp[id] = NULL;
+ }
+
+ if (dp->dtdo_buf != NULL)
+ kmem_free(dp->dtdo_buf, dp->dtdo_len * sizeof (dif_instr_t));
+ if (dp->dtdo_inttab != NULL)
+ kmem_free(dp->dtdo_inttab, dp->dtdo_intlen * sizeof (uint64_t));
+ if (dp->dtdo_strtab != NULL)
+ kmem_free(dp->dtdo_strtab, dp->dtdo_strlen);
+ if (dp->dtdo_vartab != NULL)
+ kmem_free(dp->dtdo_vartab, dp->dtdo_varlen * sizeof (dtrace_difv_t));
+
+ kmem_free(dp, sizeof (dtrace_difo_t));
+}
+
+static void
+dtrace_difo_release(dtrace_difo_t *dp, dtrace_vstate_t *vstate)
+{
+ int i;
+
+ ASSERT(MUTEX_HELD(&dtrace_lock));
+ ASSERT(dp->dtdo_refcnt != 0);
+
+ for (i = 0; i < dp->dtdo_varlen; i++) {
+ dtrace_difv_t *v = &dp->dtdo_vartab[i];
+
+ if (v->dtdv_id != DIF_VAR_VTIMESTAMP)
+ continue;
+
+ ASSERT(dtrace_vtime_references > 0);
+ if (--dtrace_vtime_references == 0)
+ dtrace_vtime_disable();
+ }
+
+ if (--dp->dtdo_refcnt == 0)
+ dtrace_difo_destroy(dp, vstate);
+}
+
+/*
+ * DTrace Format Functions
+ */
+static uint16_t
+dtrace_format_add(dtrace_state_t *state, char *str)
+{
+ char *fmt, **new;
+ uint16_t ndx, len = strlen(str) + 1;
+
+ fmt = kmem_zalloc(len, KM_SLEEP);
+ bcopy(str, fmt, len);
+
+ for (ndx = 0; ndx < state->dts_nformats; ndx++) {
+ if (state->dts_formats[ndx] == NULL) {
+ state->dts_formats[ndx] = fmt;
+ return (ndx + 1);
+ }
+ }
+
+ if (state->dts_nformats == USHRT_MAX) {
+ /*
+ * This is only likely if a denial-of-service attack is being
+ * attempted. As such, it's okay to fail silently here.
+ */
+ kmem_free(fmt, len);
+ return (0);
+ }
+
+ /*
+ * For simplicity, we always resize the formats array to be exactly the
+ * number of formats.
+ */
+ ndx = state->dts_nformats++;
+ new = kmem_alloc((ndx + 1) * sizeof (char *), KM_SLEEP);
+
+ if (state->dts_formats != NULL) {
+ ASSERT(ndx != 0);
+ bcopy(state->dts_formats, new, ndx * sizeof (char *));
+ kmem_free(state->dts_formats, ndx * sizeof (char *));
+ }
+
+ state->dts_formats = new;
+ state->dts_formats[ndx] = fmt;
+
+ return (ndx + 1);
+}
+
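+/*
+ * Usage note (added): format ids are 1-based -- zero means "no format" or
+ * allocation failure -- so callers store the return value directly in
+ * dtrd_format, and dtrace_format_remove() subtracts one before indexing
+ * dts_formats.
+ */
+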
+static void
+dtrace_format_remove(dtrace_state_t *state, uint16_t format)
+{
+ char *fmt;
+
+ ASSERT(state->dts_formats != NULL);
+ ASSERT(format <= state->dts_nformats);
+ ASSERT(state->dts_formats[format - 1] != NULL);
+
+ fmt = state->dts_formats[format - 1];
+ kmem_free(fmt, strlen(fmt) + 1);
+ state->dts_formats[format - 1] = NULL;
+}
+
+static void
+dtrace_format_destroy(dtrace_state_t *state)
+{
+ int i;
+
+ if (state->dts_nformats == 0) {
+ ASSERT(state->dts_formats == NULL);
+ return;
+ }
+
+ ASSERT(state->dts_formats != NULL);
+
+ for (i = 0; i < state->dts_nformats; i++) {
+ char *fmt = state->dts_formats[i];
+
+ if (fmt == NULL)
+ continue;
+
+ kmem_free(fmt, strlen(fmt) + 1);
+ }
+
+ kmem_free(state->dts_formats, state->dts_nformats * sizeof (char *));
+ state->dts_nformats = 0;
+ state->dts_formats = NULL;
+}
+
+/*
+ * DTrace Predicate Functions
+ */
+static dtrace_predicate_t *
+dtrace_predicate_create(dtrace_difo_t *dp)
+{
+ dtrace_predicate_t *pred;
+
+ ASSERT(MUTEX_HELD(&dtrace_lock));
+ ASSERT(dp->dtdo_refcnt != 0);
+
+ pred = kmem_zalloc(sizeof (dtrace_predicate_t), KM_SLEEP);
+ pred->dtp_difo = dp;
+ pred->dtp_refcnt = 1;
+
+ if (!dtrace_difo_cacheable(dp))
+ return (pred);
+
+ if (dtrace_predcache_id == DTRACE_CACHEIDNONE) {
+ /*
+ * This is only theoretically possible -- we have had 2^32
+ * cacheable predicates on this machine. We cannot allow any
+ * more predicates to become cacheable: as unlikely as it is,
+ * there may be a thread caching a (now stale) predicate cache
+ * ID. (N.B.: the temptation is being successfully resisted to
+ * have this cmn_err() "Holy shit -- we executed this code!")
+ */
+ return (pred);
+ }
+
+ pred->dtp_cacheid = dtrace_predcache_id++;
+
+ return (pred);
+}
+
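+/*
+ * Added note: a cacheable predicate receives a non-zero dtp_cacheid; when a
+ * probe has exactly one enabled ECB, that id is stashed in dtpr_predcache
+ * (see dtrace_ecb_enable() and dtrace_ecb_disable() below) so that probe
+ * context can skip re-evaluating the predicate for a thread whose cached
+ * result is still valid.
+ */
+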
+static void
+dtrace_predicate_hold(dtrace_predicate_t *pred)
+{
+ ASSERT(MUTEX_HELD(&dtrace_lock));
+ ASSERT(pred->dtp_difo != NULL && pred->dtp_difo->dtdo_refcnt != 0);
+ ASSERT(pred->dtp_refcnt > 0);
+
+ pred->dtp_refcnt++;
+}
+
+static void
+dtrace_predicate_release(dtrace_predicate_t *pred, dtrace_vstate_t *vstate)
+{
+ dtrace_difo_t *dp = pred->dtp_difo;
+
+ ASSERT(MUTEX_HELD(&dtrace_lock));
+ ASSERT(dp != NULL && dp->dtdo_refcnt != 0);
+ ASSERT(pred->dtp_refcnt > 0);
+
+ if (--pred->dtp_refcnt == 0) {
+ dtrace_difo_release(pred->dtp_difo, vstate);
+ kmem_free(pred, sizeof (dtrace_predicate_t));
+ }
+}
+
+/*
+ * DTrace Action Description Functions
+ */
+static dtrace_actdesc_t *
+dtrace_actdesc_create(dtrace_actkind_t kind, uint32_t ntuple,
+ uint64_t uarg, uint64_t arg)
+{
+ dtrace_actdesc_t *act;
+
+#ifdef illumos
+ ASSERT(!DTRACEACT_ISPRINTFLIKE(kind) || (arg != NULL &&
+ arg >= KERNELBASE) || (arg == NULL && kind == DTRACEACT_PRINTA));
+#endif
+
+ act = kmem_zalloc(sizeof (dtrace_actdesc_t), KM_SLEEP);
+ act->dtad_kind = kind;
+ act->dtad_ntuple = ntuple;
+ act->dtad_uarg = uarg;
+ act->dtad_arg = arg;
+ act->dtad_refcnt = 1;
+
+ return (act);
+}
+
+static void
+dtrace_actdesc_hold(dtrace_actdesc_t *act)
+{
+ ASSERT(act->dtad_refcnt >= 1);
+ act->dtad_refcnt++;
+}
+
+static void
+dtrace_actdesc_release(dtrace_actdesc_t *act, dtrace_vstate_t *vstate)
+{
+ dtrace_actkind_t kind = act->dtad_kind;
+ dtrace_difo_t *dp;
+
+ ASSERT(act->dtad_refcnt >= 1);
+
+ if (--act->dtad_refcnt != 0)
+ return;
+
+ if ((dp = act->dtad_difo) != NULL)
+ dtrace_difo_release(dp, vstate);
+
+ if (DTRACEACT_ISPRINTFLIKE(kind)) {
+ char *str = (char *)(uintptr_t)act->dtad_arg;
+
+#ifdef illumos
+ ASSERT((str != NULL && (uintptr_t)str >= KERNELBASE) ||
+ (str == NULL && act->dtad_kind == DTRACEACT_PRINTA));
+#endif
+
+ if (str != NULL)
+ kmem_free(str, strlen(str) + 1);
+ }
+
+ kmem_free(act, sizeof (dtrace_actdesc_t));
+}
+
+/*
+ * DTrace ECB Functions
+ */
+static dtrace_ecb_t *
+dtrace_ecb_add(dtrace_state_t *state, dtrace_probe_t *probe)
+{
+ dtrace_ecb_t *ecb;
+ dtrace_epid_t epid;
+
+ ASSERT(MUTEX_HELD(&dtrace_lock));
+
+ ecb = kmem_zalloc(sizeof (dtrace_ecb_t), KM_SLEEP);
+ ecb->dte_predicate = NULL;
+ ecb->dte_probe = probe;
+
+ /*
+ * The default size is the size of the default action: recording
+ * the header.
+ */
+ ecb->dte_size = ecb->dte_needed = sizeof (dtrace_rechdr_t);
+ ecb->dte_alignment = sizeof (dtrace_epid_t);
+
+ epid = state->dts_epid++;
+
+ if (epid - 1 >= state->dts_necbs) {
+ dtrace_ecb_t **oecbs = state->dts_ecbs, **ecbs;
+ int necbs = state->dts_necbs << 1;
+
+ ASSERT(epid == state->dts_necbs + 1);
+
+ if (necbs == 0) {
+ ASSERT(oecbs == NULL);
+ necbs = 1;
+ }
+
+ ecbs = kmem_zalloc(necbs * sizeof (*ecbs), KM_SLEEP);
+
+ if (oecbs != NULL)
+ bcopy(oecbs, ecbs, state->dts_necbs * sizeof (*ecbs));
+
+ dtrace_membar_producer();
+ state->dts_ecbs = ecbs;
+
+ if (oecbs != NULL) {
+ /*
+ * If this state is active, we must dtrace_sync()
+ * before we can free the old dts_ecbs array: we're
+ * coming in hot, and there may be active ring
+ * buffer processing (which indexes into the dts_ecbs
+ * array) on another CPU.
+ */
+ if (state->dts_activity != DTRACE_ACTIVITY_INACTIVE)
+ dtrace_sync();
+
+ kmem_free(oecbs, state->dts_necbs * sizeof (*ecbs));
+ }
+
+ dtrace_membar_producer();
+ state->dts_necbs = necbs;
+ }
+
+ ecb->dte_state = state;
+
+ ASSERT(state->dts_ecbs[epid - 1] == NULL);
+ dtrace_membar_producer();
+ state->dts_ecbs[(ecb->dte_epid = epid) - 1] = ecb;
+
+ return (ecb);
+}
+
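+/*
+ * Added note: EPIDs, like aggregation and format ids, are 1-based; epid n
+ * lives in dts_ecbs[n - 1].  The array grows by doubling, and the
+ * dtrace_sync() above keeps probe-context consumers of the old array safe
+ * before it is freed on an active state.
+ */
+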
+static void
+dtrace_ecb_enable(dtrace_ecb_t *ecb)
+{
+ dtrace_probe_t *probe = ecb->dte_probe;
+
+ ASSERT(MUTEX_HELD(&cpu_lock));
+ ASSERT(MUTEX_HELD(&dtrace_lock));
+ ASSERT(ecb->dte_next == NULL);
+
+ if (probe == NULL) {
+ /*
+ * This is the NULL probe -- there's nothing to do.
+ */
+ return;
+ }
+
+ if (probe->dtpr_ecb == NULL) {
+ dtrace_provider_t *prov = probe->dtpr_provider;
+
+ /*
+ * We're the first ECB on this probe.
+ */
+ probe->dtpr_ecb = probe->dtpr_ecb_last = ecb;
+
+ if (ecb->dte_predicate != NULL)
+ probe->dtpr_predcache = ecb->dte_predicate->dtp_cacheid;
+
+ prov->dtpv_pops.dtps_enable(prov->dtpv_arg,
+ probe->dtpr_id, probe->dtpr_arg);
+ } else {
+ /*
+ * This probe is already active. Swing the last pointer to
+ * point to the new ECB, and issue a dtrace_sync() to assure
+ * that all CPUs have seen the change.
+ */
+ ASSERT(probe->dtpr_ecb_last != NULL);
+ probe->dtpr_ecb_last->dte_next = ecb;
+ probe->dtpr_ecb_last = ecb;
+ probe->dtpr_predcache = 0;
+
+ dtrace_sync();
+ }
+}
+
+static int
+dtrace_ecb_resize(dtrace_ecb_t *ecb)
+{
+ dtrace_action_t *act;
+ uint32_t curneeded = UINT32_MAX;
+ uint32_t aggbase = UINT32_MAX;
+
+ /*
+ * If we record anything, we always record the dtrace_rechdr_t. (And
+ * we always record it first.)
+ */
+ ecb->dte_size = sizeof (dtrace_rechdr_t);
+ ecb->dte_alignment = sizeof (dtrace_epid_t);
+
+ for (act = ecb->dte_action; act != NULL; act = act->dta_next) {
+ dtrace_recdesc_t *rec = &act->dta_rec;
+ ASSERT(rec->dtrd_size > 0 || rec->dtrd_alignment == 1);
+
+ ecb->dte_alignment = MAX(ecb->dte_alignment,
+ rec->dtrd_alignment);
+
+ if (DTRACEACT_ISAGG(act->dta_kind)) {
+ dtrace_aggregation_t *agg = (dtrace_aggregation_t *)act;
+
+ ASSERT(rec->dtrd_size != 0);
+ ASSERT(agg->dtag_first != NULL);
+ ASSERT(act->dta_prev->dta_intuple);
+ ASSERT(aggbase != UINT32_MAX);
+ ASSERT(curneeded != UINT32_MAX);
+
+ agg->dtag_base = aggbase;
+
+ curneeded = P2ROUNDUP(curneeded, rec->dtrd_alignment);
+ rec->dtrd_offset = curneeded;
+ if (curneeded + rec->dtrd_size < curneeded)
+ return (EINVAL);
+ curneeded += rec->dtrd_size;
+ ecb->dte_needed = MAX(ecb->dte_needed, curneeded);
+
+ aggbase = UINT32_MAX;
+ curneeded = UINT32_MAX;
+ } else if (act->dta_intuple) {
+ if (curneeded == UINT32_MAX) {
+ /*
+ * This is the first record in a tuple. Align
+ * curneeded to be at offset 4 in an 8-byte
+ * aligned block.
+ */
+ ASSERT(act->dta_prev == NULL ||
+ !act->dta_prev->dta_intuple);
+ ASSERT3U(aggbase, ==, UINT32_MAX);
+ curneeded = P2PHASEUP(ecb->dte_size,
+ sizeof (uint64_t), sizeof (dtrace_aggid_t));
+
+ aggbase = curneeded - sizeof (dtrace_aggid_t);
+ ASSERT(IS_P2ALIGNED(aggbase,
+ sizeof (uint64_t)));
+ }
+ curneeded = P2ROUNDUP(curneeded, rec->dtrd_alignment);
+ rec->dtrd_offset = curneeded;
+ if (curneeded + rec->dtrd_size < curneeded)
+ return (EINVAL);
+ curneeded += rec->dtrd_size;
+ } else {
+ /* tuples must be followed by an aggregation */
+ ASSERT(act->dta_prev == NULL ||
+ !act->dta_prev->dta_intuple);
+
+ ecb->dte_size = P2ROUNDUP(ecb->dte_size,
+ rec->dtrd_alignment);
+ rec->dtrd_offset = ecb->dte_size;
+ if (ecb->dte_size + rec->dtrd_size < ecb->dte_size)
+ return (EINVAL);
+ ecb->dte_size += rec->dtrd_size;
+ ecb->dte_needed = MAX(ecb->dte_needed, ecb->dte_size);
+ }
+ }
+
+ if ((act = ecb->dte_action) != NULL &&
+ !(act->dta_kind == DTRACEACT_SPECULATE && act->dta_next == NULL) &&
+ ecb->dte_size == sizeof (dtrace_rechdr_t)) {
+ /*
+ * If the size is still sizeof (dtrace_rechdr_t), then all
+ * actions store no data; set the size to 0.
+ */
+ ecb->dte_size = 0;
+ }
+
+ ecb->dte_size = P2ROUNDUP(ecb->dte_size, sizeof (dtrace_epid_t));
+ ecb->dte_needed = P2ROUNDUP(ecb->dte_needed, (sizeof (dtrace_epid_t)));
+ ecb->dte_state->dts_needed = MAX(ecb->dte_state->dts_needed,
+ ecb->dte_needed);
+ return (0);
+}
+
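+/*
+ * Worked example (added; assumes the 12-byte dtrace_rechdr_t): for an ECB
+ * whose sole action records one 8-byte, 8-byte-aligned value, the loop
+ * above computes dte_alignment = 8 and
+ *
+ *	dtrd_offset = P2ROUNDUP(12, 8) = 16
+ *	dte_size    = 16 + 8 = 24
+ *
+ * i.e. the record header, four bytes of padding, then the value itself.
+ */
+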
+static dtrace_action_t *
+dtrace_ecb_aggregation_create(dtrace_ecb_t *ecb, dtrace_actdesc_t *desc)
+{
+ dtrace_aggregation_t *agg;
+ size_t size = sizeof (uint64_t);
+ int ntuple = desc->dtad_ntuple;
+ dtrace_action_t *act;
+ dtrace_recdesc_t *frec;
+ dtrace_aggid_t aggid;
+ dtrace_state_t *state = ecb->dte_state;
+
+ agg = kmem_zalloc(sizeof (dtrace_aggregation_t), KM_SLEEP);
+ agg->dtag_ecb = ecb;
+
+ ASSERT(DTRACEACT_ISAGG(desc->dtad_kind));
+
+ switch (desc->dtad_kind) {
+ case DTRACEAGG_MIN:
+ agg->dtag_initial = INT64_MAX;
+ agg->dtag_aggregate = dtrace_aggregate_min;
+ break;
+
+ case DTRACEAGG_MAX:
+ agg->dtag_initial = INT64_MIN;
+ agg->dtag_aggregate = dtrace_aggregate_max;
+ break;
+
+ case DTRACEAGG_COUNT:
+ agg->dtag_aggregate = dtrace_aggregate_count;
+ break;
+
+ case DTRACEAGG_QUANTIZE:
+ agg->dtag_aggregate = dtrace_aggregate_quantize;
+ size = (((sizeof (uint64_t) * NBBY) - 1) * 2 + 1) *
+ sizeof (uint64_t);
+ break;
+
+ case DTRACEAGG_LQUANTIZE: {
+ uint16_t step = DTRACE_LQUANTIZE_STEP(desc->dtad_arg);
+ uint16_t levels = DTRACE_LQUANTIZE_LEVELS(desc->dtad_arg);
+
+ agg->dtag_initial = desc->dtad_arg;
+ agg->dtag_aggregate = dtrace_aggregate_lquantize;
+
+ if (step == 0 || levels == 0)
+ goto err;
+
+ size = levels * sizeof (uint64_t) + 3 * sizeof (uint64_t);
+ break;
+ }
+
+ case DTRACEAGG_LLQUANTIZE: {
+ uint16_t factor = DTRACE_LLQUANTIZE_FACTOR(desc->dtad_arg);
+ uint16_t low = DTRACE_LLQUANTIZE_LOW(desc->dtad_arg);
+ uint16_t high = DTRACE_LLQUANTIZE_HIGH(desc->dtad_arg);
+ uint16_t nsteps = DTRACE_LLQUANTIZE_NSTEP(desc->dtad_arg);
+ int64_t v;
+
+ agg->dtag_initial = desc->dtad_arg;
+ agg->dtag_aggregate = dtrace_aggregate_llquantize;
+
+ if (factor < 2 || low >= high || nsteps < factor)
+ goto err;
+
+ /*
+ * Now check that the number of steps evenly divides a power
+ * of the factor. (This assures both integer bucket size and
+ * linearity within each magnitude.)
+ */
+ for (v = factor; v < nsteps; v *= factor)
+ continue;
+
+ if ((v % nsteps) || (nsteps % factor))
+ goto err;
+
+ size = (dtrace_aggregate_llquantize_bucket(factor,
+ low, high, nsteps, INT64_MAX) + 2) * sizeof (uint64_t);
+ break;
+ }
+
+ case DTRACEAGG_AVG:
+ agg->dtag_aggregate = dtrace_aggregate_avg;
+ size = sizeof (uint64_t) * 2;
+ break;
+
+ case DTRACEAGG_STDDEV:
+ agg->dtag_aggregate = dtrace_aggregate_stddev;
+ size = sizeof (uint64_t) * 4;
+ break;
+
+ case DTRACEAGG_SUM:
+ agg->dtag_aggregate = dtrace_aggregate_sum;
+ break;
+
+ default:
+ goto err;
+ }
+
+ agg->dtag_action.dta_rec.dtrd_size = size;
+
+ if (ntuple == 0)
+ goto err;
+
+ /*
+ * We must make sure that we have enough actions for the n-tuple.
+ */
+ for (act = ecb->dte_action_last; act != NULL; act = act->dta_prev) {
+ if (DTRACEACT_ISAGG(act->dta_kind))
+ break;
+
+ if (--ntuple == 0) {
+ /*
+ * This is the action with which our n-tuple begins.
+ */
+ agg->dtag_first = act;
+ goto success;
+ }
+ }
+
+ /*
+ * This n-tuple is short by ntuple elements. Return failure.
+ */
+ ASSERT(ntuple != 0);
+err:
+ kmem_free(agg, sizeof (dtrace_aggregation_t));
+ return (NULL);
+
+success:
+ /*
+ * If the last action in the tuple has a size of zero, it's actually
+ * an expression argument for the aggregating action.
+ */
+ ASSERT(ecb->dte_action_last != NULL);
+ act = ecb->dte_action_last;
+
+ if (act->dta_kind == DTRACEACT_DIFEXPR) {
+ ASSERT(act->dta_difo != NULL);
+
+ if (act->dta_difo->dtdo_rtype.dtdt_size == 0)
+ agg->dtag_hasarg = 1;
+ }
+
+ /*
+ * We need to allocate an id for this aggregation.
+ */
+#ifdef illumos
+ aggid = (dtrace_aggid_t)(uintptr_t)vmem_alloc(state->dts_aggid_arena, 1,
+ VM_BESTFIT | VM_SLEEP);
+#else
+ aggid = alloc_unr(state->dts_aggid_arena);
+#endif
+
+ if (aggid - 1 >= state->dts_naggregations) {
+ dtrace_aggregation_t **oaggs = state->dts_aggregations;
+ dtrace_aggregation_t **aggs;
+ int naggs = state->dts_naggregations << 1;
+ int onaggs = state->dts_naggregations;
+
+ ASSERT(aggid == state->dts_naggregations + 1);
+
+ if (naggs == 0) {
+ ASSERT(oaggs == NULL);
+ naggs = 1;
+ }
+
+ aggs = kmem_zalloc(naggs * sizeof (*aggs), KM_SLEEP);
+
+ if (oaggs != NULL) {
+ bcopy(oaggs, aggs, onaggs * sizeof (*aggs));
+ kmem_free(oaggs, onaggs * sizeof (*aggs));
+ }
+
+ state->dts_aggregations = aggs;
+ state->dts_naggregations = naggs;
+ }
+
+ ASSERT(state->dts_aggregations[aggid - 1] == NULL);
+ state->dts_aggregations[(agg->dtag_id = aggid) - 1] = agg;
+
+ frec = &agg->dtag_first->dta_rec;
+ if (frec->dtrd_alignment < sizeof (dtrace_aggid_t))
+ frec->dtrd_alignment = sizeof (dtrace_aggid_t);
+
+ for (act = agg->dtag_first; act != NULL; act = act->dta_next) {
+ ASSERT(!act->dta_intuple);
+ act->dta_intuple = 1;
+ }
+
+ return (&agg->dtag_action);
+}
+
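+/*
+ * Example of the llquantize() constraint above (added): with factor = 10
+ * and nsteps = 20, the loop stops at v = 100, and 100 % 20 == 0 with
+ * 20 % 10 == 0, so the parameters are accepted; nsteps = 15 is rejected
+ * because 100 % 15 != 0 -- fifteen steps cannot evenly divide a power of
+ * ten.
+ */
+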
+static void
+dtrace_ecb_aggregation_destroy(dtrace_ecb_t *ecb, dtrace_action_t *act)
+{
+ dtrace_aggregation_t *agg = (dtrace_aggregation_t *)act;
+ dtrace_state_t *state = ecb->dte_state;
+ dtrace_aggid_t aggid = agg->dtag_id;
+
+ ASSERT(DTRACEACT_ISAGG(act->dta_kind));
+#ifdef illumos
+ vmem_free(state->dts_aggid_arena, (void *)(uintptr_t)aggid, 1);
+#else
+ free_unr(state->dts_aggid_arena, aggid);
+#endif
+
+ ASSERT(state->dts_aggregations[aggid - 1] == agg);
+ state->dts_aggregations[aggid - 1] = NULL;
+
+ kmem_free(agg, sizeof (dtrace_aggregation_t));
+}
+
+static int
+dtrace_ecb_action_add(dtrace_ecb_t *ecb, dtrace_actdesc_t *desc)
+{
+ dtrace_action_t *action, *last;
+ dtrace_difo_t *dp = desc->dtad_difo;
+ uint32_t size = 0, align = sizeof (uint8_t), mask;
+ uint16_t format = 0;
+ dtrace_recdesc_t *rec;
+ dtrace_state_t *state = ecb->dte_state;
+ dtrace_optval_t *opt = state->dts_options, nframes = 0, strsize;
+ uint64_t arg = desc->dtad_arg;
+
+ ASSERT(MUTEX_HELD(&dtrace_lock));
+ ASSERT(ecb->dte_action == NULL || ecb->dte_action->dta_refcnt == 1);
+
+ if (DTRACEACT_ISAGG(desc->dtad_kind)) {
+ /*
+ * If this is an aggregating action, there must be neither
+ * a speculate nor a commit on the action chain.
+ */
+ dtrace_action_t *act;
+
+ for (act = ecb->dte_action; act != NULL; act = act->dta_next) {
+ if (act->dta_kind == DTRACEACT_COMMIT)
+ return (EINVAL);
+
+ if (act->dta_kind == DTRACEACT_SPECULATE)
+ return (EINVAL);
+ }
+
+ action = dtrace_ecb_aggregation_create(ecb, desc);
+
+ if (action == NULL)
+ return (EINVAL);
+ } else {
+ if (DTRACEACT_ISDESTRUCTIVE(desc->dtad_kind) ||
+ (desc->dtad_kind == DTRACEACT_DIFEXPR &&
+ dp != NULL && dp->dtdo_destructive)) {
+ state->dts_destructive = 1;
+ }
+
+ switch (desc->dtad_kind) {
+ case DTRACEACT_PRINTF:
+ case DTRACEACT_PRINTA:
+ case DTRACEACT_SYSTEM:
+ case DTRACEACT_FREOPEN:
+ case DTRACEACT_DIFEXPR:
+ /*
+ * We know that our arg is a string -- turn it into a
+ * format.
+ */
+ if (arg == 0) {
+ ASSERT(desc->dtad_kind == DTRACEACT_PRINTA ||
+ desc->dtad_kind == DTRACEACT_DIFEXPR);
+ format = 0;
+ } else {
+ ASSERT(arg != 0);
+#ifdef illumos
+ ASSERT(arg > KERNELBASE);
+#endif
+ format = dtrace_format_add(state,
+ (char *)(uintptr_t)arg);
+ }
+
+ /*FALLTHROUGH*/
+ case DTRACEACT_LIBACT:
+ case DTRACEACT_TRACEMEM:
+ case DTRACEACT_TRACEMEM_DYNSIZE:
+ if (dp == NULL)
+ return (EINVAL);
+
+ if ((size = dp->dtdo_rtype.dtdt_size) != 0)
+ break;
+
+ if (dp->dtdo_rtype.dtdt_kind == DIF_TYPE_STRING) {
+ if (!(dp->dtdo_rtype.dtdt_flags & DIF_TF_BYREF))
+ return (EINVAL);
+
+ size = opt[DTRACEOPT_STRSIZE];
+ }
+
+ break;
+
+ case DTRACEACT_STACK:
+ if ((nframes = arg) == 0) {
+ nframes = opt[DTRACEOPT_STACKFRAMES];
+ ASSERT(nframes > 0);
+ arg = nframes;
+ }
+
+ size = nframes * sizeof (pc_t);
+ break;
+
+ case DTRACEACT_JSTACK:
+ if ((strsize = DTRACE_USTACK_STRSIZE(arg)) == 0)
+ strsize = opt[DTRACEOPT_JSTACKSTRSIZE];
+
+ if ((nframes = DTRACE_USTACK_NFRAMES(arg)) == 0)
+ nframes = opt[DTRACEOPT_JSTACKFRAMES];
+
+ arg = DTRACE_USTACK_ARG(nframes, strsize);
+
+ /*FALLTHROUGH*/
+ case DTRACEACT_USTACK:
+ if (desc->dtad_kind != DTRACEACT_JSTACK &&
+ (nframes = DTRACE_USTACK_NFRAMES(arg)) == 0) {
+ strsize = DTRACE_USTACK_STRSIZE(arg);
+ nframes = opt[DTRACEOPT_USTACKFRAMES];
+ ASSERT(nframes > 0);
+ arg = DTRACE_USTACK_ARG(nframes, strsize);
+ }
+
+ /*
+ * Save a slot for the pid.
+ */
+ size = (nframes + 1) * sizeof (uint64_t);
+ size += DTRACE_USTACK_STRSIZE(arg);
+ size = P2ROUNDUP(size, (uint32_t)(sizeof (uintptr_t)));
+
+ break;
+
+ case DTRACEACT_SYM:
+ case DTRACEACT_MOD:
+ if (dp == NULL || ((size = dp->dtdo_rtype.dtdt_size) !=
+ sizeof (uint64_t)) ||
+ (dp->dtdo_rtype.dtdt_flags & DIF_TF_BYREF))
+ return (EINVAL);
+ break;
+
+ case DTRACEACT_USYM:
+ case DTRACEACT_UMOD:
+ case DTRACEACT_UADDR:
+ if (dp == NULL ||
+ (dp->dtdo_rtype.dtdt_size != sizeof (uint64_t)) ||
+ (dp->dtdo_rtype.dtdt_flags & DIF_TF_BYREF))
+ return (EINVAL);
+
+ /*
+ * We have a slot for the pid, plus a slot for the
+ * argument. To keep things simple (aligned with
+ * bitness-neutral sizing), we store each as a 64-bit
+ * quantity.
+ */
+ size = 2 * sizeof (uint64_t);
+ break;
+
+ case DTRACEACT_STOP:
+ case DTRACEACT_BREAKPOINT:
+ case DTRACEACT_PANIC:
+ break;
+
+ case DTRACEACT_CHILL:
+ case DTRACEACT_DISCARD:
+ case DTRACEACT_RAISE:
+ if (dp == NULL)
+ return (EINVAL);
+ break;
+
+ case DTRACEACT_EXIT:
+ if (dp == NULL ||
+ (size = dp->dtdo_rtype.dtdt_size) != sizeof (int) ||
+ (dp->dtdo_rtype.dtdt_flags & DIF_TF_BYREF))
+ return (EINVAL);
+ break;
+
+ case DTRACEACT_SPECULATE:
+ if (ecb->dte_size > sizeof (dtrace_rechdr_t))
+ return (EINVAL);
+
+ if (dp == NULL)
+ return (EINVAL);
+
+ state->dts_speculates = 1;
+ break;
+
+ case DTRACEACT_PRINTM:
+ size = dp->dtdo_rtype.dtdt_size;
+ break;
+
+ case DTRACEACT_COMMIT: {
+ dtrace_action_t *act = ecb->dte_action;
+
+ for (; act != NULL; act = act->dta_next) {
+ if (act->dta_kind == DTRACEACT_COMMIT)
+ return (EINVAL);
+ }
+
+ if (dp == NULL)
+ return (EINVAL);
+ break;
+ }
+
+ default:
+ return (EINVAL);
+ }
+
+ if (size != 0 || desc->dtad_kind == DTRACEACT_SPECULATE) {
+ /*
+ * If this is a data-storing action or a speculate,
+ * we must be sure that there isn't a commit on the
+ * action chain.
+ */
+ dtrace_action_t *act = ecb->dte_action;
+
+ for (; act != NULL; act = act->dta_next) {
+ if (act->dta_kind == DTRACEACT_COMMIT)
+ return (EINVAL);
+ }
+ }
+
+ action = kmem_zalloc(sizeof (dtrace_action_t), KM_SLEEP);
+ action->dta_rec.dtrd_size = size;
+ }
+
+ action->dta_refcnt = 1;
+ rec = &action->dta_rec;
+ size = rec->dtrd_size;
+
+ for (mask = sizeof (uint64_t) - 1; size != 0 && mask > 0; mask >>= 1) {
+ if (!(size & mask)) {
+ align = mask + 1;
+ break;
+ }
+ }
+
+ action->dta_kind = desc->dtad_kind;
+
+ if ((action->dta_difo = dp) != NULL)
+ dtrace_difo_hold(dp);
+
+ rec->dtrd_action = action->dta_kind;
+ rec->dtrd_arg = arg;
+ rec->dtrd_uarg = desc->dtad_uarg;
+ rec->dtrd_alignment = (uint16_t)align;
+ rec->dtrd_format = format;
+
+ if ((last = ecb->dte_action_last) != NULL) {
+ ASSERT(ecb->dte_action != NULL);
+ action->dta_prev = last;
+ last->dta_next = action;
+ } else {
+ ASSERT(ecb->dte_action == NULL);
+ ecb->dte_action = action;
+ }
+
+ ecb->dte_action_last = action;
+
+ return (0);
+}
+
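+/*
+ * Note on the alignment loop above (added): it selects the largest power of
+ * two up to sizeof (uint64_t) that divides the record size; size = 12 gives
+ * align = 4 (12 & 7 != 0 but 12 & 3 == 0), while size = 0 leaves the
+ * default byte alignment.
+ */
+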
+static void
+dtrace_ecb_action_remove(dtrace_ecb_t *ecb)
+{
+ dtrace_action_t *act = ecb->dte_action, *next;
+ dtrace_vstate_t *vstate = &ecb->dte_state->dts_vstate;
+ dtrace_difo_t *dp;
+ uint16_t format;
+
+ if (act != NULL && act->dta_refcnt > 1) {
+ ASSERT(act->dta_next == NULL || act->dta_next->dta_refcnt == 1);
+ act->dta_refcnt--;
+ } else {
+ for (; act != NULL; act = next) {
+ next = act->dta_next;
+ ASSERT(next != NULL || act == ecb->dte_action_last);
+ ASSERT(act->dta_refcnt == 1);
+
+ if ((format = act->dta_rec.dtrd_format) != 0)
+ dtrace_format_remove(ecb->dte_state, format);
+
+ if ((dp = act->dta_difo) != NULL)
+ dtrace_difo_release(dp, vstate);
+
+ if (DTRACEACT_ISAGG(act->dta_kind)) {
+ dtrace_ecb_aggregation_destroy(ecb, act);
+ } else {
+ kmem_free(act, sizeof (dtrace_action_t));
+ }
+ }
+ }
+
+ ecb->dte_action = NULL;
+ ecb->dte_action_last = NULL;
+ ecb->dte_size = 0;
+}
+
+static void
+dtrace_ecb_disable(dtrace_ecb_t *ecb)
+{
+ /*
+ * We disable the ECB by removing it from its probe.
+ */
+ dtrace_ecb_t *pecb, *prev = NULL;
+ dtrace_probe_t *probe = ecb->dte_probe;
+
+ ASSERT(MUTEX_HELD(&dtrace_lock));
+
+ if (probe == NULL) {
+ /*
+ * This is the NULL probe; there is nothing to disable.
+ */
+ return;
+ }
+
+ for (pecb = probe->dtpr_ecb; pecb != NULL; pecb = pecb->dte_next) {
+ if (pecb == ecb)
+ break;
+ prev = pecb;
+ }
+
+ ASSERT(pecb != NULL);
+
+ if (prev == NULL) {
+ probe->dtpr_ecb = ecb->dte_next;
+ } else {
+ prev->dte_next = ecb->dte_next;
+ }
+
+ if (ecb == probe->dtpr_ecb_last) {
+ ASSERT(ecb->dte_next == NULL);
+ probe->dtpr_ecb_last = prev;
+ }
+
+ /*
+ * The ECB has been disconnected from the probe; now sync to assure
+ * that all CPUs have seen the change before returning.
+ */
+ dtrace_sync();
+
+ if (probe->dtpr_ecb == NULL) {
+ /*
+ * That was the last ECB on the probe; clear the predicate
+ * cache ID for the probe, disable it and sync one more time
+ * to assure that we'll never hit it again.
+ */
+ dtrace_provider_t *prov = probe->dtpr_provider;
+
+ ASSERT(ecb->dte_next == NULL);
+ ASSERT(probe->dtpr_ecb_last == NULL);
+ probe->dtpr_predcache = DTRACE_CACHEIDNONE;
+ prov->dtpv_pops.dtps_disable(prov->dtpv_arg,
+ probe->dtpr_id, probe->dtpr_arg);
+ dtrace_sync();
+ } else {
+ /*
+ * There is at least one ECB remaining on the probe. If there
+ * is _exactly_ one, set the probe's predicate cache ID to be
+ * the predicate cache ID of the remaining ECB.
+ */
+ ASSERT(probe->dtpr_ecb_last != NULL);
+ ASSERT(probe->dtpr_predcache == DTRACE_CACHEIDNONE);
+
+ if (probe->dtpr_ecb == probe->dtpr_ecb_last) {
+ dtrace_predicate_t *p = probe->dtpr_ecb->dte_predicate;
+
+ ASSERT(probe->dtpr_ecb->dte_next == NULL);
+
+ if (p != NULL)
+ probe->dtpr_predcache = p->dtp_cacheid;
+ }
+
+ ecb->dte_next = NULL;
+ }
+}
+
+static void
+dtrace_ecb_destroy(dtrace_ecb_t *ecb)
+{
+ dtrace_state_t *state = ecb->dte_state;
+ dtrace_vstate_t *vstate = &state->dts_vstate;
+ dtrace_predicate_t *pred;
+ dtrace_epid_t epid = ecb->dte_epid;
+
+ ASSERT(MUTEX_HELD(&dtrace_lock));
+ ASSERT(ecb->dte_next == NULL);
+ ASSERT(ecb->dte_probe == NULL || ecb->dte_probe->dtpr_ecb != ecb);
+
+ if ((pred = ecb->dte_predicate) != NULL)
+ dtrace_predicate_release(pred, vstate);
+
+ dtrace_ecb_action_remove(ecb);
+
+ ASSERT(state->dts_ecbs[epid - 1] == ecb);
+ state->dts_ecbs[epid - 1] = NULL;
+
+ kmem_free(ecb, sizeof (dtrace_ecb_t));
+}
+
+static dtrace_ecb_t *
+dtrace_ecb_create(dtrace_state_t *state, dtrace_probe_t *probe,
+ dtrace_enabling_t *enab)
+{
+ dtrace_ecb_t *ecb;
+ dtrace_predicate_t *pred;
+ dtrace_actdesc_t *act;
+ dtrace_provider_t *prov;
+ dtrace_ecbdesc_t *desc = enab->dten_current;
+
+ ASSERT(MUTEX_HELD(&dtrace_lock));
+ ASSERT(state != NULL);
+
+ ecb = dtrace_ecb_add(state, probe);
+ ecb->dte_uarg = desc->dted_uarg;
+
+ if ((pred = desc->dted_pred.dtpdd_predicate) != NULL) {
+ dtrace_predicate_hold(pred);
+ ecb->dte_predicate = pred;
+ }
+
+ if (probe != NULL) {
+ /*
+ * If the provider shows more leg than the consumer is old
+ * enough to see, we need to enable the appropriate implicit
+ * predicate bits to prevent the ecb from activating at
+ * revealing times.
+ *
+ * Providers specifying DTRACE_PRIV_USER at register time
+ * are stating that they need the /proc-style privilege
+ * model to be enforced, and this is what DTRACE_COND_OWNER
+ * and DTRACE_COND_ZONEOWNER will then do at probe time.
+ */
+ prov = probe->dtpr_provider;
+ if (!(state->dts_cred.dcr_visible & DTRACE_CRV_ALLPROC) &&
+ (prov->dtpv_priv.dtpp_flags & DTRACE_PRIV_USER))
+ ecb->dte_cond |= DTRACE_COND_OWNER;
+
+ if (!(state->dts_cred.dcr_visible & DTRACE_CRV_ALLZONE) &&
+ (prov->dtpv_priv.dtpp_flags & DTRACE_PRIV_USER))
+ ecb->dte_cond |= DTRACE_COND_ZONEOWNER;
+
+ /*
+ * If the provider shows us kernel innards and the user
+ * is lacking sufficient privilege, enable the
+ * DTRACE_COND_USERMODE implicit predicate.
+ */
+ if (!(state->dts_cred.dcr_visible & DTRACE_CRV_KERNEL) &&
+ (prov->dtpv_priv.dtpp_flags & DTRACE_PRIV_KERNEL))
+ ecb->dte_cond |= DTRACE_COND_USERMODE;
+ }
+
+ if (dtrace_ecb_create_cache != NULL) {
+ /*
+ * If we have a cached ecb, we'll use its action list instead
+ * of creating our own (saving both time and space).
+ */
+ dtrace_ecb_t *cached = dtrace_ecb_create_cache;
+ dtrace_action_t *act = cached->dte_action;
+
+ if (act != NULL) {
+ ASSERT(act->dta_refcnt > 0);
+ act->dta_refcnt++;
+ ecb->dte_action = act;
+ ecb->dte_action_last = cached->dte_action_last;
+ ecb->dte_needed = cached->dte_needed;
+ ecb->dte_size = cached->dte_size;
+ ecb->dte_alignment = cached->dte_alignment;
+ }
+
+ return (ecb);
+ }
+
+ for (act = desc->dted_action; act != NULL; act = act->dtad_next) {
+ if ((enab->dten_error = dtrace_ecb_action_add(ecb, act)) != 0) {
+ dtrace_ecb_destroy(ecb);
+ return (NULL);
+ }
+ }
+
+ if ((enab->dten_error = dtrace_ecb_resize(ecb)) != 0) {
+ dtrace_ecb_destroy(ecb);
+ return (NULL);
+ }
+
+ return (dtrace_ecb_create_cache = ecb);
+}
+
+static int
+dtrace_ecb_create_enable(dtrace_probe_t *probe, void *arg)
+{
+ dtrace_ecb_t *ecb;
+ dtrace_enabling_t *enab = arg;
+ dtrace_state_t *state = enab->dten_vstate->dtvs_state;
+
+ ASSERT(state != NULL);
+
+ if (probe != NULL && probe->dtpr_gen < enab->dten_probegen) {
+ /*
+ * This probe was created in a generation for which this
+ * enabling has previously created ECBs; we don't want to
+ * enable it again, so just kick out.
+ */
+ return (DTRACE_MATCH_NEXT);
+ }
+
+ if ((ecb = dtrace_ecb_create(state, probe, enab)) == NULL)
+ return (DTRACE_MATCH_DONE);
+
+ dtrace_ecb_enable(ecb);
+ return (DTRACE_MATCH_NEXT);
+}
+
+static dtrace_ecb_t *
+dtrace_epid2ecb(dtrace_state_t *state, dtrace_epid_t id)
+{
+ dtrace_ecb_t *ecb;
+
+ ASSERT(MUTEX_HELD(&dtrace_lock));
+
+ if (id == 0 || id > state->dts_necbs)
+ return (NULL);
+
+ ASSERT(state->dts_necbs > 0 && state->dts_ecbs != NULL);
+ ASSERT((ecb = state->dts_ecbs[id - 1]) == NULL || ecb->dte_epid == id);
+
+ return (state->dts_ecbs[id - 1]);
+}
+
+static dtrace_aggregation_t *
+dtrace_aggid2agg(dtrace_state_t *state, dtrace_aggid_t id)
+{
+ dtrace_aggregation_t *agg;
+
+ ASSERT(MUTEX_HELD(&dtrace_lock));
+
+ if (id == 0 || id > state->dts_naggregations)
+ return (NULL);
+
+ ASSERT(state->dts_naggregations > 0 && state->dts_aggregations != NULL);
+ ASSERT((agg = state->dts_aggregations[id - 1]) == NULL ||
+ agg->dtag_id == id);
+
+ return (state->dts_aggregations[id - 1]);
+}
+
+/*
+ * DTrace Buffer Functions
+ *
+ * The following functions manipulate DTrace buffers. Most of these functions
+ * are called in the context of establishing or processing consumer state;
+ * exceptions are explicitly noted.
+ */
+
+/*
+ * Note: called from cross call context. This function switches the two
+ * buffers on a given CPU. The atomicity of this operation is assured by
+ * disabling interrupts while the actual switch takes place; the disabling of
+ * interrupts serializes the execution with any execution of dtrace_probe() on
+ * the same CPU.
+ */
+static void
+dtrace_buffer_switch(dtrace_buffer_t *buf)
+{
+ caddr_t tomax = buf->dtb_tomax;
+ caddr_t xamot = buf->dtb_xamot;
+ dtrace_icookie_t cookie;
+ hrtime_t now;
+
+ ASSERT(!(buf->dtb_flags & DTRACEBUF_NOSWITCH));
+ ASSERT(!(buf->dtb_flags & DTRACEBUF_RING));
+
+ cookie = dtrace_interrupt_disable();
+ now = dtrace_gethrtime();
+ buf->dtb_tomax = xamot;
+ buf->dtb_xamot = tomax;
+ buf->dtb_xamot_drops = buf->dtb_drops;
+ buf->dtb_xamot_offset = buf->dtb_offset;
+ buf->dtb_xamot_errors = buf->dtb_errors;
+ buf->dtb_xamot_flags = buf->dtb_flags;
+ buf->dtb_offset = 0;
+ buf->dtb_drops = 0;
+ buf->dtb_errors = 0;
+ buf->dtb_flags &= ~(DTRACEBUF_ERROR | DTRACEBUF_DROPPED);
+ buf->dtb_interval = now - buf->dtb_switched;
+ buf->dtb_switched = now;
+ dtrace_interrupt_enable(cookie);
+}
+
+/*
+ * Note: called from cross call context. This function activates a buffer
+ * on a CPU. As with dtrace_buffer_switch(), the atomicity of the operation
+ * is guaranteed by the disabling of interrupts.
+ */
+static void
+dtrace_buffer_activate(dtrace_state_t *state)
+{
+ dtrace_buffer_t *buf;
+ dtrace_icookie_t cookie = dtrace_interrupt_disable();
+
+ buf = &state->dts_buffer[curcpu];
+
+ if (buf->dtb_tomax != NULL) {
+ /*
+ * We might like to assert that the buffer is marked inactive,
+ * but this isn't necessarily true: the buffer for the CPU
+ * that processes the BEGIN probe has its buffer activated
+ * manually. In this case, we take the (harmless) action of
+ * re-clearing the INACTIVE bit.
+ */
+ buf->dtb_flags &= ~DTRACEBUF_INACTIVE;
+ }
+
+ dtrace_interrupt_enable(cookie);
+}
+
+#ifdef __FreeBSD__
+/*
+ * Activate the specified per-CPU buffer. This is used instead of
+ * dtrace_buffer_activate() when APs have not yet started, i.e. when
+ * activating anonymous state.
+ */
+static void
+dtrace_buffer_activate_cpu(dtrace_state_t *state, int cpu)
+{
+
+ if (state->dts_buffer[cpu].dtb_tomax != NULL)
+ state->dts_buffer[cpu].dtb_flags &= ~DTRACEBUF_INACTIVE;
+}
+#endif
+
+static int
+dtrace_buffer_alloc(dtrace_buffer_t *bufs, size_t size, int flags,
+ processorid_t cpu, int *factor)
+{
+#ifdef illumos
+ cpu_t *cp;
+#endif
+ dtrace_buffer_t *buf;
+ int allocated = 0, desired = 0;
+
+#ifdef illumos
+ ASSERT(MUTEX_HELD(&cpu_lock));
+ ASSERT(MUTEX_HELD(&dtrace_lock));
+
+ *factor = 1;
+
+ if (size > dtrace_nonroot_maxsize &&
+ !PRIV_POLICY_CHOICE(CRED(), PRIV_ALL, B_FALSE))
+ return (EFBIG);
+
+ cp = cpu_list;
+
+ do {
+ if (cpu != DTRACE_CPUALL && cpu != cp->cpu_id)
+ continue;
+
+ buf = &bufs[cp->cpu_id];
+
+ /*
+ * If there is already a buffer allocated for this CPU, it
+ * is only possible that this is a DR event. In this case,
+ * the buffer size must match our specified size.
+ */
+ if (buf->dtb_tomax != NULL) {
+ ASSERT(buf->dtb_size == size);
+ continue;
+ }
+
+ ASSERT(buf->dtb_xamot == NULL);
+
+ if ((buf->dtb_tomax = kmem_zalloc(size,
+ KM_NOSLEEP | KM_NORMALPRI)) == NULL)
+ goto err;
+
+ buf->dtb_size = size;
+ buf->dtb_flags = flags;
+ buf->dtb_offset = 0;
+ buf->dtb_drops = 0;
+
+ if (flags & DTRACEBUF_NOSWITCH)
+ continue;
+
+ if ((buf->dtb_xamot = kmem_zalloc(size,
+ KM_NOSLEEP | KM_NORMALPRI)) == NULL)
+ goto err;
+ } while ((cp = cp->cpu_next) != cpu_list);
+
+ return (0);
+
+err:
+ cp = cpu_list;
+
+ do {
+ if (cpu != DTRACE_CPUALL && cpu != cp->cpu_id)
+ continue;
+
+ buf = &bufs[cp->cpu_id];
+ desired += 2;
+
+ if (buf->dtb_xamot != NULL) {
+ ASSERT(buf->dtb_tomax != NULL);
+ ASSERT(buf->dtb_size == size);
+ kmem_free(buf->dtb_xamot, size);
+ allocated++;
+ }
+
+ if (buf->dtb_tomax != NULL) {
+ ASSERT(buf->dtb_size == size);
+ kmem_free(buf->dtb_tomax, size);
+ allocated++;
+ }
+
+ buf->dtb_tomax = NULL;
+ buf->dtb_xamot = NULL;
+ buf->dtb_size = 0;
+ } while ((cp = cp->cpu_next) != cpu_list);
+#else
+ int i;
+
+ *factor = 1;
+#if defined(__aarch64__) || defined(__amd64__) || defined(__arm__) || \
+ defined(__mips__) || defined(__powerpc__) || defined(__riscv)
+ /*
+ * FreeBSD isn't good at limiting the amount of memory we
+ * ask to malloc, so let's place a limit here before trying
+ * to do something that might well end in tears at bedtime.
+ */
+ if (size > physmem * PAGE_SIZE / (128 * (mp_maxid + 1)))
+ return (ENOMEM);
+#endif
+
+ ASSERT(MUTEX_HELD(&dtrace_lock));
+ CPU_FOREACH(i) {
+ if (cpu != DTRACE_CPUALL && cpu != i)
+ continue;
+
+ buf = &bufs[i];
+
+ /*
+ * If there is already a buffer allocated for this CPU, it
+ * is only possible that this is a DR event. In this case,
+ * the buffer size must match our specified size.
+ */
+ if (buf->dtb_tomax != NULL) {
+ ASSERT(buf->dtb_size == size);
+ continue;
+ }
+
+ ASSERT(buf->dtb_xamot == NULL);
+
+ if ((buf->dtb_tomax = kmem_zalloc(size,
+ KM_NOSLEEP | KM_NORMALPRI)) == NULL)
+ goto err;
+
+ buf->dtb_size = size;
+ buf->dtb_flags = flags;
+ buf->dtb_offset = 0;
+ buf->dtb_drops = 0;
+
+ if (flags & DTRACEBUF_NOSWITCH)
+ continue;
+
+ if ((buf->dtb_xamot = kmem_zalloc(size,
+ KM_NOSLEEP | KM_NORMALPRI)) == NULL)
+ goto err;
+ }
+
+ return (0);
+
+err:
+ /*
+ * Error allocating memory, so free the buffers that were
+ * allocated before the failed allocation.
+ */
+ CPU_FOREACH(i) {
+ if (cpu != DTRACE_CPUALL && cpu != i)
+ continue;
+
+ buf = &bufs[i];
+ desired += 2;
+
+ if (buf->dtb_xamot != NULL) {
+ ASSERT(buf->dtb_tomax != NULL);
+ ASSERT(buf->dtb_size == size);
+ kmem_free(buf->dtb_xamot, size);
+ allocated++;
+ }
+
+ if (buf->dtb_tomax != NULL) {
+ ASSERT(buf->dtb_size == size);
+ kmem_free(buf->dtb_tomax, size);
+ allocated++;
+ }
+
+ buf->dtb_tomax = NULL;
+ buf->dtb_xamot = NULL;
+ buf->dtb_size = 0;
+
+ }
+#endif
+ *factor = desired / (allocated > 0 ? allocated : 1);
+
+ return (ENOMEM);
+}
+
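+/*
+ * Added note on *factor: on ENOMEM it reports the ratio of buffer halves
+ * desired to halves actually obtained -- e.g. 2 of 8 halves allocated gives
+ * *factor = 4 -- presumably so the caller can shrink its requested size by
+ * that factor before retrying.
+ */
+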
+/*
+ * Note: called from probe context. This function just increments the drop
+ * count on a buffer. It has been made a function to allow for the
+ * possibility of understanding the source of mysterious drop counts. (A
+ * problem for which one may be particularly disappointed that DTrace cannot
+ * be used to understand DTrace.)
+ */
+static void
+dtrace_buffer_drop(dtrace_buffer_t *buf)
+{
+ buf->dtb_drops++;
+}
+
+/*
+ * Note: called from probe context. This function is called to reserve space
+ * in a buffer. If mstate is non-NULL, sets the scratch base and size in the
+ * mstate. Returns the new offset in the buffer, or a negative value if an
+ * error has occurred.
+ */
+static intptr_t
+dtrace_buffer_reserve(dtrace_buffer_t *buf, size_t needed, size_t align,
+ dtrace_state_t *state, dtrace_mstate_t *mstate)
+{
+ intptr_t offs = buf->dtb_offset, soffs;
+ intptr_t woffs;
+ caddr_t tomax;
+ size_t total;
+
+ if (buf->dtb_flags & DTRACEBUF_INACTIVE)
+ return (-1);
+
+ if ((tomax = buf->dtb_tomax) == NULL) {
+ dtrace_buffer_drop(buf);
+ return (-1);
+ }
+
+ if (!(buf->dtb_flags & (DTRACEBUF_RING | DTRACEBUF_FILL))) {
+ while (offs & (align - 1)) {
+ /*
+ * Assert that our alignment is off by a number which
+ * is itself sizeof (uint32_t) aligned.
+ */
+ ASSERT(!((align - (offs & (align - 1))) &
+ (sizeof (uint32_t) - 1)));
+ DTRACE_STORE(uint32_t, tomax, offs, DTRACE_EPIDNONE);
+ offs += sizeof (uint32_t);
+ }
+
+ if ((soffs = offs + needed) > buf->dtb_size) {
+ dtrace_buffer_drop(buf);
+ return (-1);
+ }
+
+ if (mstate == NULL)
+ return (offs);
+
+ mstate->dtms_scratch_base = (uintptr_t)tomax + soffs;
+ mstate->dtms_scratch_size = buf->dtb_size - soffs;
+ mstate->dtms_scratch_ptr = mstate->dtms_scratch_base;
+
+ return (offs);
+ }
+
+ if (buf->dtb_flags & DTRACEBUF_FILL) {
+ if (state->dts_activity != DTRACE_ACTIVITY_COOLDOWN &&
+ (buf->dtb_flags & DTRACEBUF_FULL))
+ return (-1);
+ goto out;
+ }
+
+ total = needed + (offs & (align - 1));
+
+ /*
+ * For a ring buffer, life is quite a bit more complicated. Before
+ * we can store any padding, we need to adjust our wrapping offset.
+ * (If we've never before wrapped or we're not about to, no adjustment
+ * is required.)
+ */
+ if ((buf->dtb_flags & DTRACEBUF_WRAPPED) ||
+ offs + total > buf->dtb_size) {
+ woffs = buf->dtb_xamot_offset;
+
+ if (offs + total > buf->dtb_size) {
+ /*
+ * We can't fit in the end of the buffer. First, a
+ * sanity check that we can fit in the buffer at all.
+ */
+ if (total > buf->dtb_size) {
+ dtrace_buffer_drop(buf);
+ return (-1);
+ }
+
+ /*
+ * We're going to be storing at the top of the buffer,
+ * so now we need to deal with the wrapped offset. We
+ * only reset our wrapped offset to 0 if it is
+ * currently greater than the current offset. If it
+ * is less than the current offset, it is because a
+ * previous allocation induced a wrap -- but the
+ * allocation didn't subsequently take the space due
+ * to an error or false predicate evaluation. In this
+ * case, we'll just leave the wrapped offset alone: if
+ * the wrapped offset hasn't been advanced far enough
+ * for this allocation, it will be adjusted in the
+ * lower loop.
+ */
+ if (buf->dtb_flags & DTRACEBUF_WRAPPED) {
+ if (woffs >= offs)
+ woffs = 0;
+ } else {
+ woffs = 0;
+ }
+
+ /*
+ * Now we know that we're going to be storing to the
+ * top of the buffer and that there is room for us
+ * there. We need to clear the buffer from the current
+ * offset to the end (there may be old gunk there).
+ */
+ while (offs < buf->dtb_size)
+ tomax[offs++] = 0;
+
+ /*
+ * We need to set our offset to zero. And because we
+ * are wrapping, we need to set the bit indicating as
+ * much. We can also adjust our needed space back
+ * down to the space required by the ECB -- we know
+ * that the top of the buffer is aligned.
+ */
+ offs = 0;
+ total = needed;
+ buf->dtb_flags |= DTRACEBUF_WRAPPED;
+ } else {
+ /*
+ * There is room for us in the buffer, so we simply
+ * need to check the wrapped offset.
+ */
+ if (woffs < offs) {
+ /*
+ * The wrapped offset is less than the offset.
+ * This can happen if we allocated buffer space
+ * that induced a wrap, but then we didn't
+ * subsequently take the space due to an error
+ * or false predicate evaluation. This is
+ * okay; we know that _this_ allocation isn't
+ * going to induce a wrap. We still can't
+ * reset the wrapped offset to be zero,
+ * however: the space may have been trashed in
+ * the previous failed probe attempt. But at
+ * least the wrapped offset doesn't need to
+ * be adjusted at all...
+ */
+ goto out;
+ }
+ }
+
+ while (offs + total > woffs) {
+ dtrace_epid_t epid = *(uint32_t *)(tomax + woffs);
+ size_t size;
+
+ if (epid == DTRACE_EPIDNONE) {
+ size = sizeof (uint32_t);
+ } else {
+ ASSERT3U(epid, <=, state->dts_necbs);
+ ASSERT(state->dts_ecbs[epid - 1] != NULL);
+
+ size = state->dts_ecbs[epid - 1]->dte_size;
+ }
+
+ ASSERT(woffs + size <= buf->dtb_size);
+ ASSERT(size != 0);
+
+ if (woffs + size == buf->dtb_size) {
+ /*
+ * We've reached the end of the buffer; we want
+ * to set the wrapped offset to 0 and break
+ * out. However, if the offs is 0, then we're
+ * in a strange edge-condition: the amount of
+ * space that we want to reserve plus the size
+ * of the record that we're overwriting is
+ * greater than the size of the buffer. This
+ * is problematic because if we reserve the
+ * space but subsequently don't consume it (due
+ * to a failed predicate or error) the wrapped
+ * offset will be 0 -- yet the EPID at offset 0
+ * will not be committed. This situation is
+ * relatively easy to deal with: if we're in
+ * this case, the buffer is indistinguishable
+ * from one that hasn't wrapped; we need only
+ * finish the job by clearing the wrapped bit,
+ * explicitly setting the offset to be 0, and
+ * zero'ing out the old data in the buffer.
+ */
+ if (offs == 0) {
+ buf->dtb_flags &= ~DTRACEBUF_WRAPPED;
+ buf->dtb_offset = 0;
+ woffs = total;
+
+ while (woffs < buf->dtb_size)
+ tomax[woffs++] = 0;
+ }
+
+ woffs = 0;
+ break;
+ }
+
+ woffs += size;
+ }
+
+ /*
+ * We have a wrapped offset. It may be that the wrapped offset
+ * has become zero -- that's okay.
+ */
+ buf->dtb_xamot_offset = woffs;
+ }
+
+out:
+ /*
+ * Now we can plow the buffer with any necessary padding.
+ */
+ while (offs & (align - 1)) {
+ /*
+ * Assert that our alignment is off by a number which
+ * is itself sizeof (uint32_t) aligned.
+ */
+ ASSERT(!((align - (offs & (align - 1))) &
+ (sizeof (uint32_t) - 1)));
+ DTRACE_STORE(uint32_t, tomax, offs, DTRACE_EPIDNONE);
+ offs += sizeof (uint32_t);
+ }
+
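+	/*
+	 * For fill buffers, the final dts_reserve bytes are held back for
+	 * records from the END probe.
+	 */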
+ if (buf->dtb_flags & DTRACEBUF_FILL) {
+ if (offs + needed > buf->dtb_size - state->dts_reserve) {
+ buf->dtb_flags |= DTRACEBUF_FULL;
+ return (-1);
+ }
+ }
+
+ if (mstate == NULL)
+ return (offs);
+
+ /*
+ * For ring buffers and fill buffers, the scratch space is always
+ * the inactive buffer.
+ */
+ mstate->dtms_scratch_base = (uintptr_t)buf->dtb_xamot;
+ mstate->dtms_scratch_size = buf->dtb_size;
+ mstate->dtms_scratch_ptr = mstate->dtms_scratch_base;
+
+ return (offs);
+}
+
+static void
+dtrace_buffer_polish(dtrace_buffer_t *buf)
+{
+ ASSERT(buf->dtb_flags & DTRACEBUF_RING);
+ ASSERT(MUTEX_HELD(&dtrace_lock));
+
+ if (!(buf->dtb_flags & DTRACEBUF_WRAPPED))
+ return;
+
+ /*
+ * We need to polish the ring buffer. There are three cases:
+ *
+ * - The first (and presumably most common) is that there is no gap
+ * between the buffer offset and the wrapped offset. In this case,
+ * there is nothing in the buffer that isn't valid data; we can
+ * mark the buffer as polished and return.
+ *
+ * - The second (less common than the first but still more common
+ * than the third) is that there is a gap between the buffer offset
+ * and the wrapped offset, and the wrapped offset is larger than the
+ * buffer offset. This can happen because of an alignment issue, or
+ * can happen because of a call to dtrace_buffer_reserve() that
+ * didn't subsequently consume the buffer space. In this case,
+ * we need to zero the data from the buffer offset to the wrapped
+ * offset.
+ *
+ * - The third (and least common) is that there is a gap between the
+ * buffer offset and the wrapped offset, but the wrapped offset is
+ * _less_ than the buffer offset. This can only happen because a
+ * call to dtrace_buffer_reserve() induced a wrap, but the space
+ * was not subsequently consumed. In this case, we need to zero the
+ * space from the offset to the end of the buffer _and_ from the
+ * top of the buffer to the wrapped offset.
+ */
+ if (buf->dtb_offset < buf->dtb_xamot_offset) {
+ bzero(buf->dtb_tomax + buf->dtb_offset,
+ buf->dtb_xamot_offset - buf->dtb_offset);
+ }
+
+ if (buf->dtb_offset > buf->dtb_xamot_offset) {
+ bzero(buf->dtb_tomax + buf->dtb_offset,
+ buf->dtb_size - buf->dtb_offset);
+ bzero(buf->dtb_tomax, buf->dtb_xamot_offset);
+ }
+}
+
+/*
+ * This routine determines if data generated at the specified time has likely
+ * been entirely consumed at user-level. It is called to determine
+ * if an ECB on a defunct probe (but for an active enabling) can be safely
+ * disabled and destroyed.
+ */
+static int
+dtrace_buffer_consumed(dtrace_buffer_t *bufs, hrtime_t when)
+{
+ int i;
+
+ for (i = 0; i < NCPU; i++) {
+ dtrace_buffer_t *buf = &bufs[i];
+
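+		/*
+		 * A ring buffer is consumed in place, so its contents can
+		 * never be known to have been consumed.  A switching buffer
+		 * is known to be consumed only if it has been switched twice
+		 * since the specified time:  dtb_switched minus dtb_interval
+		 * is the time of the penultimate switch.
+		 */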
+ if (buf->dtb_size == 0)
+ continue;
+
+ if (buf->dtb_flags & DTRACEBUF_RING)
+ return (0);
+
+ if (!buf->dtb_switched && buf->dtb_offset != 0)
+ return (0);
+
+ if (buf->dtb_switched - buf->dtb_interval < when)
+ return (0);
+ }
+
+ return (1);
+}
+
+static void
+dtrace_buffer_free(dtrace_buffer_t *bufs)
+{
+ int i;
+
+ for (i = 0; i < NCPU; i++) {
+ dtrace_buffer_t *buf = &bufs[i];
+
+ if (buf->dtb_tomax == NULL) {
+ ASSERT(buf->dtb_xamot == NULL);
+ ASSERT(buf->dtb_size == 0);
+ continue;
+ }
+
+ if (buf->dtb_xamot != NULL) {
+ ASSERT(!(buf->dtb_flags & DTRACEBUF_NOSWITCH));
+ kmem_free(buf->dtb_xamot, buf->dtb_size);
+ }
+
+ kmem_free(buf->dtb_tomax, buf->dtb_size);
+ buf->dtb_size = 0;
+ buf->dtb_tomax = NULL;
+ buf->dtb_xamot = NULL;
+ }
+}
+
+/*
+ * DTrace Enabling Functions
+ */
+static dtrace_enabling_t *
+dtrace_enabling_create(dtrace_vstate_t *vstate)
+{
+ dtrace_enabling_t *enab;
+
+ enab = kmem_zalloc(sizeof (dtrace_enabling_t), KM_SLEEP);
+ enab->dten_vstate = vstate;
+
+ return (enab);
+}
+
+static void
+dtrace_enabling_add(dtrace_enabling_t *enab, dtrace_ecbdesc_t *ecb)
+{
+ dtrace_ecbdesc_t **ndesc;
+ size_t osize, nsize;
+
+ /*
+ * We can't add to enablings after we've enabled them, or after we've
+ * retained them.
+ */
+ ASSERT(enab->dten_probegen == 0);
+ ASSERT(enab->dten_next == NULL && enab->dten_prev == NULL);
+
+ if (enab->dten_ndesc < enab->dten_maxdesc) {
+ enab->dten_desc[enab->dten_ndesc++] = ecb;
+ return;
+ }
+
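+	/*
+	 * The descriptor array is full:  grow it by doubling (starting
+	 * from a single entry), copy the old descriptors over, and free
+	 * the old array.
+	 */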
+	osize = enab->dten_maxdesc * sizeof (dtrace_ecbdesc_t *);
+
+ if (enab->dten_maxdesc == 0) {
+ enab->dten_maxdesc = 1;
+ } else {
+ enab->dten_maxdesc <<= 1;
+ }
+
+ ASSERT(enab->dten_ndesc < enab->dten_maxdesc);
+
+	nsize = enab->dten_maxdesc * sizeof (dtrace_ecbdesc_t *);
+ ndesc = kmem_zalloc(nsize, KM_SLEEP);
+ bcopy(enab->dten_desc, ndesc, osize);
+ if (enab->dten_desc != NULL)
+ kmem_free(enab->dten_desc, osize);
+
+ enab->dten_desc = ndesc;
+ enab->dten_desc[enab->dten_ndesc++] = ecb;
+}
+
+static void
+dtrace_enabling_addlike(dtrace_enabling_t *enab, dtrace_ecbdesc_t *ecb,
+ dtrace_probedesc_t *pd)
+{
+ dtrace_ecbdesc_t *new;
+ dtrace_predicate_t *pred;
+ dtrace_actdesc_t *act;
+
+ /*
+ * We're going to create a new ECB description that matches the
+ * specified ECB in every way, but has the specified probe description.
+ */
+ new = kmem_zalloc(sizeof (dtrace_ecbdesc_t), KM_SLEEP);
+
+ if ((pred = ecb->dted_pred.dtpdd_predicate) != NULL)
+ dtrace_predicate_hold(pred);
+
+ for (act = ecb->dted_action; act != NULL; act = act->dtad_next)
+ dtrace_actdesc_hold(act);
+
+ new->dted_action = ecb->dted_action;
+ new->dted_pred = ecb->dted_pred;
+ new->dted_probe = *pd;
+ new->dted_uarg = ecb->dted_uarg;
+
+ dtrace_enabling_add(enab, new);
+}
+
+static void
+dtrace_enabling_dump(dtrace_enabling_t *enab)
+{
+ int i;
+
+ for (i = 0; i < enab->dten_ndesc; i++) {
+ dtrace_probedesc_t *desc = &enab->dten_desc[i]->dted_probe;
+
+#ifdef __FreeBSD__
+ printf("dtrace: enabling probe %d (%s:%s:%s:%s)\n", i,
+ desc->dtpd_provider, desc->dtpd_mod,
+ desc->dtpd_func, desc->dtpd_name);
+#else
+ cmn_err(CE_NOTE, "enabling probe %d (%s:%s:%s:%s)", i,
+ desc->dtpd_provider, desc->dtpd_mod,
+ desc->dtpd_func, desc->dtpd_name);
+#endif
+ }
+}
+
+static void
+dtrace_enabling_destroy(dtrace_enabling_t *enab)
+{
+ int i;
+ dtrace_ecbdesc_t *ep;
+ dtrace_vstate_t *vstate = enab->dten_vstate;
+
+ ASSERT(MUTEX_HELD(&dtrace_lock));
+
+ for (i = 0; i < enab->dten_ndesc; i++) {
+ dtrace_actdesc_t *act, *next;
+ dtrace_predicate_t *pred;
+
+ ep = enab->dten_desc[i];
+
+ if ((pred = ep->dted_pred.dtpdd_predicate) != NULL)
+ dtrace_predicate_release(pred, vstate);
+
+ for (act = ep->dted_action; act != NULL; act = next) {
+ next = act->dtad_next;
+ dtrace_actdesc_release(act, vstate);
+ }
+
+ kmem_free(ep, sizeof (dtrace_ecbdesc_t));
+ }
+
+ if (enab->dten_desc != NULL)
+		kmem_free(enab->dten_desc,
+		    enab->dten_maxdesc * sizeof (dtrace_ecbdesc_t *));
+
+ /*
+ * If this was a retained enabling, decrement the dts_nretained count
+ * and take it off of the dtrace_retained list.
+ */
+ if (enab->dten_prev != NULL || enab->dten_next != NULL ||
+ dtrace_retained == enab) {
+ ASSERT(enab->dten_vstate->dtvs_state != NULL);
+ ASSERT(enab->dten_vstate->dtvs_state->dts_nretained > 0);
+ enab->dten_vstate->dtvs_state->dts_nretained--;
+ dtrace_retained_gen++;
+ }
+
+ if (enab->dten_prev == NULL) {
+ if (dtrace_retained == enab) {
+ dtrace_retained = enab->dten_next;
+
+ if (dtrace_retained != NULL)
+ dtrace_retained->dten_prev = NULL;
+ }
+ } else {
+ ASSERT(enab != dtrace_retained);
+ ASSERT(dtrace_retained != NULL);
+ enab->dten_prev->dten_next = enab->dten_next;
+ }
+
+ if (enab->dten_next != NULL) {
+ ASSERT(dtrace_retained != NULL);
+ enab->dten_next->dten_prev = enab->dten_prev;
+ }
+
+ kmem_free(enab, sizeof (dtrace_enabling_t));
+}
+
+static int
+dtrace_enabling_retain(dtrace_enabling_t *enab)
+{
+ dtrace_state_t *state;
+
+ ASSERT(MUTEX_HELD(&dtrace_lock));
+ ASSERT(enab->dten_next == NULL && enab->dten_prev == NULL);
+ ASSERT(enab->dten_vstate != NULL);
+
+ state = enab->dten_vstate->dtvs_state;
+ ASSERT(state != NULL);
+
+ /*
+ * We only allow each state to retain dtrace_retain_max enablings.
+ */
+ if (state->dts_nretained >= dtrace_retain_max)
+ return (ENOSPC);
+
+ state->dts_nretained++;
+ dtrace_retained_gen++;
+
+ if (dtrace_retained == NULL) {
+ dtrace_retained = enab;
+ return (0);
+ }
+
+ enab->dten_next = dtrace_retained;
+ dtrace_retained->dten_prev = enab;
+ dtrace_retained = enab;
+
+ return (0);
+}
+
+static int
+dtrace_enabling_replicate(dtrace_state_t *state, dtrace_probedesc_t *match,
+ dtrace_probedesc_t *create)
+{
+ dtrace_enabling_t *new, *enab;
+ int found = 0, err = ENOENT;
+
+ ASSERT(MUTEX_HELD(&dtrace_lock));
+ ASSERT(strlen(match->dtpd_provider) < DTRACE_PROVNAMELEN);
+ ASSERT(strlen(match->dtpd_mod) < DTRACE_MODNAMELEN);
+ ASSERT(strlen(match->dtpd_func) < DTRACE_FUNCNAMELEN);
+ ASSERT(strlen(match->dtpd_name) < DTRACE_NAMELEN);
+
+ new = dtrace_enabling_create(&state->dts_vstate);
+
+ /*
+ * Iterate over all retained enablings, looking for enablings that
+ * match the specified state.
+ */
+ for (enab = dtrace_retained; enab != NULL; enab = enab->dten_next) {
+ int i;
+
+ /*
+ * dtvs_state can only be NULL for helper enablings -- and
+ * helper enablings can't be retained.
+ */
+ ASSERT(enab->dten_vstate->dtvs_state != NULL);
+
+ if (enab->dten_vstate->dtvs_state != state)
+ continue;
+
+ /*
+ * Now iterate over each probe description; we're looking for
+ * an exact match to the specified probe description.
+ */
+ for (i = 0; i < enab->dten_ndesc; i++) {
+ dtrace_ecbdesc_t *ep = enab->dten_desc[i];
+ dtrace_probedesc_t *pd = &ep->dted_probe;
+
+ if (strcmp(pd->dtpd_provider, match->dtpd_provider))
+ continue;
+
+ if (strcmp(pd->dtpd_mod, match->dtpd_mod))
+ continue;
+
+ if (strcmp(pd->dtpd_func, match->dtpd_func))
+ continue;
+
+ if (strcmp(pd->dtpd_name, match->dtpd_name))
+ continue;
+
+ /*
+ * We have a winning probe! Add it to our growing
+ * enabling.
+ */
+ found = 1;
+ dtrace_enabling_addlike(new, ep, create);
+ }
+ }
+
+ if (!found || (err = dtrace_enabling_retain(new)) != 0) {
+ dtrace_enabling_destroy(new);
+ return (err);
+ }
+
+ return (0);
+}
+
+static void
+dtrace_enabling_retract(dtrace_state_t *state)
+{
+ dtrace_enabling_t *enab, *next;
+
+ ASSERT(MUTEX_HELD(&dtrace_lock));
+
+ /*
+	 * Iterate over all retained enablings, destroying those retained
+	 * for the specified state.
+ */
+ for (enab = dtrace_retained; enab != NULL; enab = next) {
+ next = enab->dten_next;
+
+ /*
+ * dtvs_state can only be NULL for helper enablings -- and
+ * helper enablings can't be retained.
+ */
+ ASSERT(enab->dten_vstate->dtvs_state != NULL);
+
+ if (enab->dten_vstate->dtvs_state == state) {
+ ASSERT(state->dts_nretained > 0);
+ dtrace_enabling_destroy(enab);
+ }
+ }
+
+ ASSERT(state->dts_nretained == 0);
+}
+
+static int
+dtrace_enabling_match(dtrace_enabling_t *enab, int *nmatched)
+{
+ int i = 0;
+ int matched = 0;
+
+ ASSERT(MUTEX_HELD(&cpu_lock));
+ ASSERT(MUTEX_HELD(&dtrace_lock));
+
+ for (i = 0; i < enab->dten_ndesc; i++) {
+ dtrace_ecbdesc_t *ep = enab->dten_desc[i];
+
+ enab->dten_current = ep;
+ enab->dten_error = 0;
+
+ matched += dtrace_probe_enable(&ep->dted_probe, enab);
+
+ if (enab->dten_error != 0) {
+ /*
+ * If we get an error half-way through enabling the
+ * probes, we kick out -- perhaps with some number of
+ * them enabled. Leaving enabled probes enabled may
+ * be slightly confusing for user-level, but we expect
+ * that no one will attempt to actually drive on in
+ * the face of such errors. If this is an anonymous
+ * enabling (indicated with a NULL nmatched pointer),
+			 * we cmn_err() a message. We aren't expecting to
+			 * get such an error -- insofar as it can exist at
+			 * all, it would be the result of corrupted DOF in
+			 * the driver properties.
+ */
+ if (nmatched == NULL) {
+ cmn_err(CE_WARN, "dtrace_enabling_match() "
+ "error on %p: %d", (void *)ep,
+ enab->dten_error);
+ }
+
+ return (enab->dten_error);
+ }
+ }
+
+ enab->dten_probegen = dtrace_probegen;
+ if (nmatched != NULL)
+ *nmatched = matched;
+
+ return (0);
+}
+
+static void
+dtrace_enabling_matchall(void)
+{
+ dtrace_enabling_t *enab;
+
+ mutex_enter(&cpu_lock);
+ mutex_enter(&dtrace_lock);
+
+ /*
+ * Iterate over all retained enablings to see if any probes match
+ * against them. We only perform this operation on enablings for which
+ * we have sufficient permissions by virtue of being in the global zone
+ * or in the same zone as the DTrace client. Because we can be called
+ * after dtrace_detach() has been called, we cannot assert that there
+ * are retained enablings. We can safely load from dtrace_retained,
+ * however: the taskq_destroy() at the end of dtrace_detach() will
+ * block pending our completion.
+ */
+ for (enab = dtrace_retained; enab != NULL; enab = enab->dten_next) {
+#ifdef illumos
+ cred_t *cr = enab->dten_vstate->dtvs_state->dts_cred.dcr_cred;
+
+ if (INGLOBALZONE(curproc) ||
+		    (cr != NULL && getzoneid() == crgetzoneid(cr)))
+#endif
+ (void) dtrace_enabling_match(enab, NULL);
+ }
+
+ mutex_exit(&dtrace_lock);
+ mutex_exit(&cpu_lock);
+}
+
+/*
+ * If an enabling is to be enabled without having matched probes (that is, if
+ * dtrace_state_go() is to be called on the underlying dtrace_state_t), the
+ * enabling must be _primed_ by creating an ECB for every ECB description.
+ * This must be done to assure that we know the number of speculations, the
+ * number of aggregations, the minimum buffer size needed, etc. before we
+ * transition out of DTRACE_ACTIVITY_INACTIVE. To do this without actually
+ * enabling any probes, we create ECBs for every ECB description, but with a
+ * NULL probe -- which is exactly what this function does.
+ */
+static void
+dtrace_enabling_prime(dtrace_state_t *state)
+{
+ dtrace_enabling_t *enab;
+ int i;
+
+ for (enab = dtrace_retained; enab != NULL; enab = enab->dten_next) {
+ ASSERT(enab->dten_vstate->dtvs_state != NULL);
+
+ if (enab->dten_vstate->dtvs_state != state)
+ continue;
+
+ /*
+ * We don't want to prime an enabling more than once, lest
+ * we allow a malicious user to induce resource exhaustion.
+ * (The ECBs that result from priming an enabling aren't
+ * leaked -- but they also aren't deallocated until the
+ * consumer state is destroyed.)
+ */
+ if (enab->dten_primed)
+ continue;
+
+ for (i = 0; i < enab->dten_ndesc; i++) {
+ enab->dten_current = enab->dten_desc[i];
+ (void) dtrace_probe_enable(NULL, enab);
+ }
+
+ enab->dten_primed = 1;
+ }
+}
+
+/*
+ * Called to indicate that probes should be provided due to retained
+ * enablings. This is implemented in terms of dtrace_probe_provide(), but it
+ * must take an initial lap through the enablings, calling the dtps_provide()
+ * entry point explicitly to allow for autocreated probes.
+ */
+static void
+dtrace_enabling_provide(dtrace_provider_t *prv)
+{
+ int i, all = 0;
+ dtrace_probedesc_t desc;
+ dtrace_genid_t gen;
+
+ ASSERT(MUTEX_HELD(&dtrace_lock));
+ ASSERT(MUTEX_HELD(&dtrace_provider_lock));
+
+ if (prv == NULL) {
+ all = 1;
+ prv = dtrace_provider;
+ }
+
+ do {
+ dtrace_enabling_t *enab;
+ void *parg = prv->dtpv_arg;
+
+retry:
+ gen = dtrace_retained_gen;
+ for (enab = dtrace_retained; enab != NULL;
+ enab = enab->dten_next) {
+ for (i = 0; i < enab->dten_ndesc; i++) {
+ desc = enab->dten_desc[i]->dted_probe;
+ mutex_exit(&dtrace_lock);
+ prv->dtpv_pops.dtps_provide(parg, &desc);
+ mutex_enter(&dtrace_lock);
+ /*
+ * Process the retained enablings again if
+ * they have changed while we weren't holding
+ * dtrace_lock.
+ */
+ if (gen != dtrace_retained_gen)
+ goto retry;
+ }
+ }
+ } while (all && (prv = prv->dtpv_next) != NULL);
+
+ mutex_exit(&dtrace_lock);
+ dtrace_probe_provide(NULL, all ? NULL : prv);
+ mutex_enter(&dtrace_lock);
+}
+
+/*
+ * Called to reap ECBs that are attached to probes from defunct providers.
+ */
+static void
+dtrace_enabling_reap(void)
+{
+ dtrace_provider_t *prov;
+ dtrace_probe_t *probe;
+ dtrace_ecb_t *ecb;
+ hrtime_t when;
+ int i;
+
+ mutex_enter(&cpu_lock);
+ mutex_enter(&dtrace_lock);
+
+ for (i = 0; i < dtrace_nprobes; i++) {
+ if ((probe = dtrace_probes[i]) == NULL)
+ continue;
+
+ if (probe->dtpr_ecb == NULL)
+ continue;
+
+ prov = probe->dtpr_provider;
+
+ if ((when = prov->dtpv_defunct) == 0)
+ continue;
+
+ /*
+ * We have ECBs on a defunct provider: we want to reap these
+ * ECBs to allow the provider to unregister. The destruction
+ * of these ECBs must be done carefully: if we destroy the ECB
+ * and the consumer later wishes to consume an EPID that
+ * corresponds to the destroyed ECB (and if the EPID metadata
+ * has not been previously consumed), the consumer will abort
+ * processing on the unknown EPID. To reduce (but not, sadly,
+ * eliminate) the possibility of this, we will only destroy an
+ * ECB for a defunct provider if, for the state that
+ * corresponds to the ECB:
+ *
+ * (a) There is no speculative tracing (which can effectively
+ * cache an EPID for an arbitrary amount of time).
+ *
+ * (b) The principal buffers have been switched twice since the
+ * provider became defunct.
+ *
+ * (c) The aggregation buffers are of zero size or have been
+ * switched twice since the provider became defunct.
+ *
+ * We use dts_speculates to determine (a) and call a function
+ * (dtrace_buffer_consumed()) to determine (b) and (c). Note
+ * that as soon as we've been unable to destroy one of the ECBs
+ * associated with the probe, we quit trying -- reaping is only
+		 * fruitful inasmuch as we can destroy all ECBs associated
+ * with the defunct provider's probes.
+ */
+ while ((ecb = probe->dtpr_ecb) != NULL) {
+ dtrace_state_t *state = ecb->dte_state;
+ dtrace_buffer_t *buf = state->dts_buffer;
+ dtrace_buffer_t *aggbuf = state->dts_aggbuffer;
+
+ if (state->dts_speculates)
+ break;
+
+ if (!dtrace_buffer_consumed(buf, when))
+ break;
+
+ if (!dtrace_buffer_consumed(aggbuf, when))
+ break;
+
+ dtrace_ecb_disable(ecb);
+ ASSERT(probe->dtpr_ecb != ecb);
+ dtrace_ecb_destroy(ecb);
+ }
+ }
+
+ mutex_exit(&dtrace_lock);
+ mutex_exit(&cpu_lock);
+}
+
+/*
+ * DTrace DOF Functions
+ */
+/*ARGSUSED*/
+static void
+dtrace_dof_error(dof_hdr_t *dof, const char *str)
+{
+ if (dtrace_err_verbose)
+ cmn_err(CE_WARN, "failed to process DOF: %s", str);
+
+#ifdef DTRACE_ERRDEBUG
+ dtrace_errdebug(str);
+#endif
+}
+
+/*
+ * Create DOF out of a currently enabled state. Right now, we only create
+ * DOF containing the run-time options -- but this could be expanded to create
+ * complete DOF representing the enabled state.
+ */
+static dof_hdr_t *
+dtrace_dof_create(dtrace_state_t *state)
+{
+ dof_hdr_t *dof;
+ dof_sec_t *sec;
+ dof_optdesc_t *opt;
+ int i, len = sizeof (dof_hdr_t) +
+ roundup(sizeof (dof_sec_t), sizeof (uint64_t)) +
+ sizeof (dof_optdesc_t) * DTRACEOPT_MAX;
+
+ ASSERT(MUTEX_HELD(&dtrace_lock));
+
+ dof = kmem_zalloc(len, KM_SLEEP);
+ dof->dofh_ident[DOF_ID_MAG0] = DOF_MAG_MAG0;
+ dof->dofh_ident[DOF_ID_MAG1] = DOF_MAG_MAG1;
+ dof->dofh_ident[DOF_ID_MAG2] = DOF_MAG_MAG2;
+ dof->dofh_ident[DOF_ID_MAG3] = DOF_MAG_MAG3;
+
+ dof->dofh_ident[DOF_ID_MODEL] = DOF_MODEL_NATIVE;
+ dof->dofh_ident[DOF_ID_ENCODING] = DOF_ENCODE_NATIVE;
+ dof->dofh_ident[DOF_ID_VERSION] = DOF_VERSION;
+ dof->dofh_ident[DOF_ID_DIFVERS] = DIF_VERSION;
+ dof->dofh_ident[DOF_ID_DIFIREG] = DIF_DIR_NREGS;
+ dof->dofh_ident[DOF_ID_DIFTREG] = DIF_DTR_NREGS;
+
+ dof->dofh_flags = 0;
+ dof->dofh_hdrsize = sizeof (dof_hdr_t);
+ dof->dofh_secsize = sizeof (dof_sec_t);
+ dof->dofh_secnum = 1; /* only DOF_SECT_OPTDESC */
+ dof->dofh_secoff = sizeof (dof_hdr_t);
+ dof->dofh_loadsz = len;
+ dof->dofh_filesz = len;
+ dof->dofh_pad = 0;
+
+ /*
+ * Fill in the option section header...
+ */
+ sec = (dof_sec_t *)((uintptr_t)dof + sizeof (dof_hdr_t));
+ sec->dofs_type = DOF_SECT_OPTDESC;
+ sec->dofs_align = sizeof (uint64_t);
+ sec->dofs_flags = DOF_SECF_LOAD;
+ sec->dofs_entsize = sizeof (dof_optdesc_t);
+
+ opt = (dof_optdesc_t *)((uintptr_t)sec +
+ roundup(sizeof (dof_sec_t), sizeof (uint64_t)));
+
+ sec->dofs_offset = (uintptr_t)opt - (uintptr_t)dof;
+ sec->dofs_size = sizeof (dof_optdesc_t) * DTRACEOPT_MAX;
+
+ for (i = 0; i < DTRACEOPT_MAX; i++) {
+ opt[i].dofo_option = i;
+ opt[i].dofo_strtab = DOF_SECIDX_NONE;
+ opt[i].dofo_value = state->dts_options[i];
+ }
+
+ return (dof);
+}
+
+static dof_hdr_t *
+dtrace_dof_copyin(uintptr_t uarg, int *errp)
+{
+ dof_hdr_t hdr, *dof;
+
+ ASSERT(!MUTEX_HELD(&dtrace_lock));
+
+ /*
+ * First, we're going to copyin() the sizeof (dof_hdr_t).
+ */
+ if (copyin((void *)uarg, &hdr, sizeof (hdr)) != 0) {
+ dtrace_dof_error(NULL, "failed to copyin DOF header");
+ *errp = EFAULT;
+ return (NULL);
+ }
+
+ /*
+ * Now we'll allocate the entire DOF and copy it in -- provided
+ * that the length isn't outrageous.
+ */
+ if (hdr.dofh_loadsz >= dtrace_dof_maxsize) {
+ dtrace_dof_error(&hdr, "load size exceeds maximum");
+ *errp = E2BIG;
+ return (NULL);
+ }
+
+ if (hdr.dofh_loadsz < sizeof (hdr)) {
+ dtrace_dof_error(&hdr, "invalid load size");
+ *errp = EINVAL;
+ return (NULL);
+ }
+
+ dof = kmem_alloc(hdr.dofh_loadsz, KM_SLEEP);
+
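+	/*
+	 * Re-check the load size after the full copyin():  the header may
+	 * have been modified out from under us between the two copies.
+	 */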
+ if (copyin((void *)uarg, dof, hdr.dofh_loadsz) != 0 ||
+ dof->dofh_loadsz != hdr.dofh_loadsz) {
+ kmem_free(dof, hdr.dofh_loadsz);
+ *errp = EFAULT;
+ return (NULL);
+ }
+
+ return (dof);
+}
+
+#ifdef __FreeBSD__
+static dof_hdr_t *
+dtrace_dof_copyin_proc(struct proc *p, uintptr_t uarg, int *errp)
+{
+ dof_hdr_t hdr, *dof;
+ struct thread *td;
+ size_t loadsz;
+
+ ASSERT(!MUTEX_HELD(&dtrace_lock));
+
+ td = curthread;
+
+ /*
+ * First, we're going to copyin() the sizeof (dof_hdr_t).
+ */
+ if (proc_readmem(td, p, uarg, &hdr, sizeof(hdr)) != sizeof(hdr)) {
+ dtrace_dof_error(NULL, "failed to copyin DOF header");
+ *errp = EFAULT;
+ return (NULL);
+ }
+
+ /*
+ * Now we'll allocate the entire DOF and copy it in -- provided
+ * that the length isn't outrageous.
+ */
+ if (hdr.dofh_loadsz >= dtrace_dof_maxsize) {
+ dtrace_dof_error(&hdr, "load size exceeds maximum");
+ *errp = E2BIG;
+ return (NULL);
+ }
+ loadsz = (size_t)hdr.dofh_loadsz;
+
+ if (loadsz < sizeof (hdr)) {
+ dtrace_dof_error(&hdr, "invalid load size");
+ *errp = EINVAL;
+ return (NULL);
+ }
+
+ dof = kmem_alloc(loadsz, KM_SLEEP);
+
+ if (proc_readmem(td, p, uarg, dof, loadsz) != loadsz ||
+ dof->dofh_loadsz != loadsz) {
+ kmem_free(dof, hdr.dofh_loadsz);
+ *errp = EFAULT;
+ return (NULL);
+ }
+
+ return (dof);
+}
+
+static __inline uchar_t
+dtrace_dof_char(char c)
+{
+
+ switch (c) {
+ case '0':
+ case '1':
+ case '2':
+ case '3':
+ case '4':
+ case '5':
+ case '6':
+ case '7':
+ case '8':
+ case '9':
+ return (c - '0');
+ case 'A':
+ case 'B':
+ case 'C':
+ case 'D':
+ case 'E':
+ case 'F':
+ return (c - 'A' + 10);
+ case 'a':
+ case 'b':
+ case 'c':
+ case 'd':
+ case 'e':
+ case 'f':
+ return (c - 'a' + 10);
+ }
+	/* Not a hex digit; the caller treats UCHAR_MAX as invalid. */
+	return (UCHAR_MAX);
+}
+#endif /* __FreeBSD__ */
+
+static dof_hdr_t *
+dtrace_dof_property(const char *name)
+{
+#ifdef __FreeBSD__
+ uint8_t *dofbuf;
+ u_char *data, *eol;
+ caddr_t doffile;
+ size_t bytes, len, i;
+ dof_hdr_t *dof;
+ u_char c1, c2;
+
+ dof = NULL;
+
+ doffile = preload_search_by_type("dtrace_dof");
+ if (doffile == NULL)
+ return (NULL);
+
+ data = preload_fetch_addr(doffile);
+ len = preload_fetch_size(doffile);
+ for (;;) {
+ /* Look for the end of the line. All lines end in a newline. */
+ eol = memchr(data, '\n', len);
+ if (eol == NULL)
+ return (NULL);
+
+ if (strncmp(name, data, strlen(name)) == 0)
+ break;
+
+ eol++; /* skip past the newline */
+ len -= eol - data;
+ data = eol;
+ }
+
+ /* We've found the data corresponding to the specified key. */
+
+ data += strlen(name) + 1; /* skip past the '=' */
+ len = eol - data;
+ if (len % 2 != 0) {
+ dtrace_dof_error(NULL, "invalid DOF encoding length");
+ goto doferr;
+ }
+ bytes = len / 2;
+ if (bytes < sizeof(dof_hdr_t)) {
+ dtrace_dof_error(NULL, "truncated header");
+ goto doferr;
+ }
+
+ /*
+ * Each byte is represented by the two ASCII characters in its hex
+ * representation.
+ */
+	dofbuf = malloc(bytes, M_SOLARIS, M_WAITOK);
+	dof = (dof_hdr_t *)dofbuf;	/* set now so doferr can free it */
+	for (i = 0; i < bytes; i++) {
+		c1 = dtrace_dof_char(data[i * 2]);
+		c2 = dtrace_dof_char(data[i * 2 + 1]);
+		if (c1 == UCHAR_MAX || c2 == UCHAR_MAX) {
+			dtrace_dof_error(NULL, "invalid hex char in DOF");
+			goto doferr;
+		}
+		dofbuf[i] = c1 * 16 + c2;
+	}
+
+ if (bytes < dof->dofh_loadsz) {
+ dtrace_dof_error(NULL, "truncated DOF");
+ goto doferr;
+ }
+
+ if (dof->dofh_loadsz >= dtrace_dof_maxsize) {
+ dtrace_dof_error(NULL, "oversized DOF");
+ goto doferr;
+ }
+
+ return (dof);
+
+doferr:
+ free(dof, M_SOLARIS);
+ return (NULL);
+#else /* __FreeBSD__ */
+ uchar_t *buf;
+ uint64_t loadsz;
+ unsigned int len, i;
+ dof_hdr_t *dof;
+
+ /*
+	 * Unfortunately, arrays of values in .conf files are always (and
+	 * only) interpreted to be integer arrays. We must read our DOF
+ * as an integer array, and then squeeze it into a byte array.
+ */
+ if (ddi_prop_lookup_int_array(DDI_DEV_T_ANY, dtrace_devi, 0,
+ (char *)name, (int **)&buf, &len) != DDI_PROP_SUCCESS)
+ return (NULL);
+
+ for (i = 0; i < len; i++)
+ buf[i] = (uchar_t)(((int *)buf)[i]);
+
+ if (len < sizeof (dof_hdr_t)) {
+ ddi_prop_free(buf);
+ dtrace_dof_error(NULL, "truncated header");
+ return (NULL);
+ }
+
+ if (len < (loadsz = ((dof_hdr_t *)buf)->dofh_loadsz)) {
+ ddi_prop_free(buf);
+ dtrace_dof_error(NULL, "truncated DOF");
+ return (NULL);
+ }
+
+ if (loadsz >= dtrace_dof_maxsize) {
+ ddi_prop_free(buf);
+ dtrace_dof_error(NULL, "oversized DOF");
+ return (NULL);
+ }
+
+ dof = kmem_alloc(loadsz, KM_SLEEP);
+ bcopy(buf, dof, loadsz);
+ ddi_prop_free(buf);
+
+ return (dof);
+#endif /* !__FreeBSD__ */
+}
+
+static void
+dtrace_dof_destroy(dof_hdr_t *dof)
+{
+ kmem_free(dof, dof->dofh_loadsz);
+}
+
+/*
+ * Return the dof_sec_t pointer corresponding to a given section index. If the
+ * index is not valid, dtrace_dof_error() is called and NULL is returned. If
+ * a type other than DOF_SECT_NONE is specified, the header is checked against
+ * this type and NULL is returned if the types do not match.
+ */
+static dof_sec_t *
+dtrace_dof_sect(dof_hdr_t *dof, uint32_t type, dof_secidx_t i)
+{
+ dof_sec_t *sec = (dof_sec_t *)(uintptr_t)
+ ((uintptr_t)dof + dof->dofh_secoff + i * dof->dofh_secsize);
+
+ if (i >= dof->dofh_secnum) {
+ dtrace_dof_error(dof, "referenced section index is invalid");
+ return (NULL);
+ }
+
+ if (!(sec->dofs_flags & DOF_SECF_LOAD)) {
+ dtrace_dof_error(dof, "referenced section is not loadable");
+ return (NULL);
+ }
+
+ if (type != DOF_SECT_NONE && type != sec->dofs_type) {
+ dtrace_dof_error(dof, "referenced section is the wrong type");
+ return (NULL);
+ }
+
+ return (sec);
+}
+
+static dtrace_probedesc_t *
+dtrace_dof_probedesc(dof_hdr_t *dof, dof_sec_t *sec, dtrace_probedesc_t *desc)
+{
+ dof_probedesc_t *probe;
+ dof_sec_t *strtab;
+ uintptr_t daddr = (uintptr_t)dof;
+ uintptr_t str;
+ size_t size;
+
+ if (sec->dofs_type != DOF_SECT_PROBEDESC) {
+ dtrace_dof_error(dof, "invalid probe section");
+ return (NULL);
+ }
+
+ if (sec->dofs_align != sizeof (dof_secidx_t)) {
+ dtrace_dof_error(dof, "bad alignment in probe description");
+ return (NULL);
+ }
+
+ if (sec->dofs_offset + sizeof (dof_probedesc_t) > dof->dofh_loadsz) {
+ dtrace_dof_error(dof, "truncated probe description");
+ return (NULL);
+ }
+
+ probe = (dof_probedesc_t *)(uintptr_t)(daddr + sec->dofs_offset);
+ strtab = dtrace_dof_sect(dof, DOF_SECT_STRTAB, probe->dofp_strtab);
+
+ if (strtab == NULL)
+ return (NULL);
+
+ str = daddr + strtab->dofs_offset;
+ size = strtab->dofs_size;
+
+ if (probe->dofp_provider >= strtab->dofs_size) {
+ dtrace_dof_error(dof, "corrupt probe provider");
+ return (NULL);
+ }
+
+ (void) strncpy(desc->dtpd_provider,
+ (char *)(str + probe->dofp_provider),
+ MIN(DTRACE_PROVNAMELEN - 1, size - probe->dofp_provider));
+
+ if (probe->dofp_mod >= strtab->dofs_size) {
+ dtrace_dof_error(dof, "corrupt probe module");
+ return (NULL);
+ }
+
+ (void) strncpy(desc->dtpd_mod, (char *)(str + probe->dofp_mod),
+ MIN(DTRACE_MODNAMELEN - 1, size - probe->dofp_mod));
+
+ if (probe->dofp_func >= strtab->dofs_size) {
+ dtrace_dof_error(dof, "corrupt probe function");
+ return (NULL);
+ }
+
+ (void) strncpy(desc->dtpd_func, (char *)(str + probe->dofp_func),
+ MIN(DTRACE_FUNCNAMELEN - 1, size - probe->dofp_func));
+
+ if (probe->dofp_name >= strtab->dofs_size) {
+ dtrace_dof_error(dof, "corrupt probe name");
+ return (NULL);
+ }
+
+ (void) strncpy(desc->dtpd_name, (char *)(str + probe->dofp_name),
+ MIN(DTRACE_NAMELEN - 1, size - probe->dofp_name));
+
+ return (desc);
+}
+
+static dtrace_difo_t *
+dtrace_dof_difo(dof_hdr_t *dof, dof_sec_t *sec, dtrace_vstate_t *vstate,
+ cred_t *cr)
+{
+ dtrace_difo_t *dp;
+ size_t ttl = 0;
+ dof_difohdr_t *dofd;
+ uintptr_t daddr = (uintptr_t)dof;
+ size_t max = dtrace_difo_maxsize;
+ int i, l, n;
+
+ static const struct {
+ int section;
+ int bufoffs;
+ int lenoffs;
+ int entsize;
+ int align;
+ const char *msg;
+ } difo[] = {
+ { DOF_SECT_DIF, offsetof(dtrace_difo_t, dtdo_buf),
+ offsetof(dtrace_difo_t, dtdo_len), sizeof (dif_instr_t),
+ sizeof (dif_instr_t), "multiple DIF sections" },
+
+ { DOF_SECT_INTTAB, offsetof(dtrace_difo_t, dtdo_inttab),
+ offsetof(dtrace_difo_t, dtdo_intlen), sizeof (uint64_t),
+ sizeof (uint64_t), "multiple integer tables" },
+
+ { DOF_SECT_STRTAB, offsetof(dtrace_difo_t, dtdo_strtab),
+ offsetof(dtrace_difo_t, dtdo_strlen), 0,
+ sizeof (char), "multiple string tables" },
+
+ { DOF_SECT_VARTAB, offsetof(dtrace_difo_t, dtdo_vartab),
+ offsetof(dtrace_difo_t, dtdo_varlen), sizeof (dtrace_difv_t),
+ sizeof (uint_t), "multiple variable tables" },
+
+ { DOF_SECT_NONE, 0, 0, 0, 0, NULL }
+ };
+
+ if (sec->dofs_type != DOF_SECT_DIFOHDR) {
+ dtrace_dof_error(dof, "invalid DIFO header section");
+ return (NULL);
+ }
+
+ if (sec->dofs_align != sizeof (dof_secidx_t)) {
+ dtrace_dof_error(dof, "bad alignment in DIFO header");
+ return (NULL);
+ }
+
+ if (sec->dofs_size < sizeof (dof_difohdr_t) ||
+ sec->dofs_size % sizeof (dof_secidx_t)) {
+ dtrace_dof_error(dof, "bad size in DIFO header");
+ return (NULL);
+ }
+
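+	/*
+	 * dof_difohdr_t declares a one-element dofd_links[] array, so the
+	 * number of links is one more than the size arithmetic alone
+	 * implies.
+	 */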
+ dofd = (dof_difohdr_t *)(uintptr_t)(daddr + sec->dofs_offset);
+ n = (sec->dofs_size - sizeof (*dofd)) / sizeof (dof_secidx_t) + 1;
+
+ dp = kmem_zalloc(sizeof (dtrace_difo_t), KM_SLEEP);
+ dp->dtdo_rtype = dofd->dofd_rtype;
+
+ for (l = 0; l < n; l++) {
+ dof_sec_t *subsec;
+ void **bufp;
+ uint32_t *lenp;
+
+ if ((subsec = dtrace_dof_sect(dof, DOF_SECT_NONE,
+ dofd->dofd_links[l])) == NULL)
+ goto err; /* invalid section link */
+
+ if (ttl + subsec->dofs_size > max) {
+ dtrace_dof_error(dof, "exceeds maximum size");
+ goto err;
+ }
+
+ ttl += subsec->dofs_size;
+
+ for (i = 0; difo[i].section != DOF_SECT_NONE; i++) {
+ if (subsec->dofs_type != difo[i].section)
+ continue;
+
+ if (!(subsec->dofs_flags & DOF_SECF_LOAD)) {
+ dtrace_dof_error(dof, "section not loaded");
+ goto err;
+ }
+
+ if (subsec->dofs_align != difo[i].align) {
+ dtrace_dof_error(dof, "bad alignment");
+ goto err;
+ }
+
+ bufp = (void **)((uintptr_t)dp + difo[i].bufoffs);
+ lenp = (uint32_t *)((uintptr_t)dp + difo[i].lenoffs);
+
+ if (*bufp != NULL) {
+ dtrace_dof_error(dof, difo[i].msg);
+ goto err;
+ }
+
+ if (difo[i].entsize != subsec->dofs_entsize) {
+ dtrace_dof_error(dof, "entry size mismatch");
+ goto err;
+ }
+
+ if (subsec->dofs_entsize != 0 &&
+ (subsec->dofs_size % subsec->dofs_entsize) != 0) {
+ dtrace_dof_error(dof, "corrupt entry size");
+ goto err;
+ }
+
+ *lenp = subsec->dofs_size;
+ *bufp = kmem_alloc(subsec->dofs_size, KM_SLEEP);
+ bcopy((char *)(uintptr_t)(daddr + subsec->dofs_offset),
+ *bufp, subsec->dofs_size);
+
+ if (subsec->dofs_entsize != 0)
+ *lenp /= subsec->dofs_entsize;
+
+ break;
+ }
+
+ /*
+ * If we encounter a loadable DIFO sub-section that is not
+ * known to us, assume this is a broken program and fail.
+ */
+ if (difo[i].section == DOF_SECT_NONE &&
+ (subsec->dofs_flags & DOF_SECF_LOAD)) {
+ dtrace_dof_error(dof, "unrecognized DIFO subsection");
+ goto err;
+ }
+ }
+
+ if (dp->dtdo_buf == NULL) {
+ /*
+ * We can't have a DIF object without DIF text.
+ */
+ dtrace_dof_error(dof, "missing DIF text");
+ goto err;
+ }
+
+ /*
+ * Before we validate the DIF object, run through the variable table
+	 * looking for strings -- if any of them have a zero size, we'll set
+	 * their size to be the system-wide default string size. Note that
+ * this should _not_ happen if the "strsize" option has been set --
+ * in this case, the compiler should have set the size to reflect the
+ * setting of the option.
+ */
+ for (i = 0; i < dp->dtdo_varlen; i++) {
+ dtrace_difv_t *v = &dp->dtdo_vartab[i];
+ dtrace_diftype_t *t = &v->dtdv_type;
+
+ if (v->dtdv_id < DIF_VAR_OTHER_UBASE)
+ continue;
+
+ if (t->dtdt_kind == DIF_TYPE_STRING && t->dtdt_size == 0)
+ t->dtdt_size = dtrace_strsize_default;
+ }
+
+ if (dtrace_difo_validate(dp, vstate, DIF_DIR_NREGS, cr) != 0)
+ goto err;
+
+ dtrace_difo_init(dp, vstate);
+ return (dp);
+
+err:
+ kmem_free(dp->dtdo_buf, dp->dtdo_len * sizeof (dif_instr_t));
+ kmem_free(dp->dtdo_inttab, dp->dtdo_intlen * sizeof (uint64_t));
+ kmem_free(dp->dtdo_strtab, dp->dtdo_strlen);
+ kmem_free(dp->dtdo_vartab, dp->dtdo_varlen * sizeof (dtrace_difv_t));
+
+ kmem_free(dp, sizeof (dtrace_difo_t));
+ return (NULL);
+}
+
+static dtrace_predicate_t *
+dtrace_dof_predicate(dof_hdr_t *dof, dof_sec_t *sec, dtrace_vstate_t *vstate,
+ cred_t *cr)
+{
+ dtrace_difo_t *dp;
+
+ if ((dp = dtrace_dof_difo(dof, sec, vstate, cr)) == NULL)
+ return (NULL);
+
+ return (dtrace_predicate_create(dp));
+}
+
+static dtrace_actdesc_t *
+dtrace_dof_actdesc(dof_hdr_t *dof, dof_sec_t *sec, dtrace_vstate_t *vstate,
+ cred_t *cr)
+{
+ dtrace_actdesc_t *act, *first = NULL, *last = NULL, *next;
+ dof_actdesc_t *desc;
+ dof_sec_t *difosec;
+ size_t offs;
+ uintptr_t daddr = (uintptr_t)dof;
+ uint64_t arg;
+ dtrace_actkind_t kind;
+
+ if (sec->dofs_type != DOF_SECT_ACTDESC) {
+ dtrace_dof_error(dof, "invalid action section");
+ return (NULL);
+ }
+
+ if (sec->dofs_offset + sizeof (dof_actdesc_t) > dof->dofh_loadsz) {
+ dtrace_dof_error(dof, "truncated action description");
+ return (NULL);
+ }
+
+ if (sec->dofs_align != sizeof (uint64_t)) {
+ dtrace_dof_error(dof, "bad alignment in action description");
+ return (NULL);
+ }
+
+ if (sec->dofs_size < sec->dofs_entsize) {
+ dtrace_dof_error(dof, "section entry size exceeds total size");
+ return (NULL);
+ }
+
+ if (sec->dofs_entsize != sizeof (dof_actdesc_t)) {
+ dtrace_dof_error(dof, "bad entry size in action description");
+ return (NULL);
+ }
+
+ if (sec->dofs_size / sec->dofs_entsize > dtrace_actions_max) {
+ dtrace_dof_error(dof, "actions exceed dtrace_actions_max");
+ return (NULL);
+ }
+
+ for (offs = 0; offs < sec->dofs_size; offs += sec->dofs_entsize) {
+ desc = (dof_actdesc_t *)(daddr +
+ (uintptr_t)sec->dofs_offset + offs);
+ kind = (dtrace_actkind_t)desc->dofa_kind;
+
+ if ((DTRACEACT_ISPRINTFLIKE(kind) &&
+ (kind != DTRACEACT_PRINTA ||
+ desc->dofa_strtab != DOF_SECIDX_NONE)) ||
+ (kind == DTRACEACT_DIFEXPR &&
+ desc->dofa_strtab != DOF_SECIDX_NONE)) {
+ dof_sec_t *strtab;
+ char *str, *fmt;
+ uint64_t i;
+
+ /*
+ * The argument to these actions is an index into the
+ * DOF string table. For printf()-like actions, this
+ * is the format string. For print(), this is the
+ * CTF type of the expression result.
+ */
+ if ((strtab = dtrace_dof_sect(dof,
+ DOF_SECT_STRTAB, desc->dofa_strtab)) == NULL)
+ goto err;
+
+ str = (char *)((uintptr_t)dof +
+ (uintptr_t)strtab->dofs_offset);
+
+ for (i = desc->dofa_arg; i < strtab->dofs_size; i++) {
+ if (str[i] == '\0')
+ break;
+ }
+
+ if (i >= strtab->dofs_size) {
+ dtrace_dof_error(dof, "bogus format string");
+ goto err;
+ }
+
+ if (i == desc->dofa_arg) {
+ dtrace_dof_error(dof, "empty format string");
+ goto err;
+ }
+
+ i -= desc->dofa_arg;
+ fmt = kmem_alloc(i + 1, KM_SLEEP);
+ bcopy(&str[desc->dofa_arg], fmt, i + 1);
+ arg = (uint64_t)(uintptr_t)fmt;
+ } else {
+ if (kind == DTRACEACT_PRINTA) {
+ ASSERT(desc->dofa_strtab == DOF_SECIDX_NONE);
+ arg = 0;
+ } else {
+ arg = desc->dofa_arg;
+ }
+ }
+
+ act = dtrace_actdesc_create(kind, desc->dofa_ntuple,
+ desc->dofa_uarg, arg);
+
+ if (last != NULL) {
+ last->dtad_next = act;
+ } else {
+ first = act;
+ }
+
+ last = act;
+
+ if (desc->dofa_difo == DOF_SECIDX_NONE)
+ continue;
+
+ if ((difosec = dtrace_dof_sect(dof,
+ DOF_SECT_DIFOHDR, desc->dofa_difo)) == NULL)
+ goto err;
+
+ act->dtad_difo = dtrace_dof_difo(dof, difosec, vstate, cr);
+
+ if (act->dtad_difo == NULL)
+ goto err;
+ }
+
+ ASSERT(first != NULL);
+ return (first);
+
+err:
+ for (act = first; act != NULL; act = next) {
+ next = act->dtad_next;
+ dtrace_actdesc_release(act, vstate);
+ }
+
+ return (NULL);
+}
+
+static dtrace_ecbdesc_t *
+dtrace_dof_ecbdesc(dof_hdr_t *dof, dof_sec_t *sec, dtrace_vstate_t *vstate,
+ cred_t *cr)
+{
+ dtrace_ecbdesc_t *ep;
+ dof_ecbdesc_t *ecb;
+ dtrace_probedesc_t *desc;
+ dtrace_predicate_t *pred = NULL;
+
+ if (sec->dofs_size < sizeof (dof_ecbdesc_t)) {
+ dtrace_dof_error(dof, "truncated ECB description");
+ return (NULL);
+ }
+
+ if (sec->dofs_align != sizeof (uint64_t)) {
+ dtrace_dof_error(dof, "bad alignment in ECB description");
+ return (NULL);
+ }
+
+ ecb = (dof_ecbdesc_t *)((uintptr_t)dof + (uintptr_t)sec->dofs_offset);
+ sec = dtrace_dof_sect(dof, DOF_SECT_PROBEDESC, ecb->dofe_probes);
+
+ if (sec == NULL)
+ return (NULL);
+
+ ep = kmem_zalloc(sizeof (dtrace_ecbdesc_t), KM_SLEEP);
+ ep->dted_uarg = ecb->dofe_uarg;
+ desc = &ep->dted_probe;
+
+ if (dtrace_dof_probedesc(dof, sec, desc) == NULL)
+ goto err;
+
+ if (ecb->dofe_pred != DOF_SECIDX_NONE) {
+ if ((sec = dtrace_dof_sect(dof,
+ DOF_SECT_DIFOHDR, ecb->dofe_pred)) == NULL)
+ goto err;
+
+ if ((pred = dtrace_dof_predicate(dof, sec, vstate, cr)) == NULL)
+ goto err;
+
+ ep->dted_pred.dtpdd_predicate = pred;
+ }
+
+ if (ecb->dofe_actions != DOF_SECIDX_NONE) {
+ if ((sec = dtrace_dof_sect(dof,
+ DOF_SECT_ACTDESC, ecb->dofe_actions)) == NULL)
+ goto err;
+
+ ep->dted_action = dtrace_dof_actdesc(dof, sec, vstate, cr);
+
+ if (ep->dted_action == NULL)
+ goto err;
+ }
+
+ return (ep);
+
+err:
+ if (pred != NULL)
+ dtrace_predicate_release(pred, vstate);
+ kmem_free(ep, sizeof (dtrace_ecbdesc_t));
+ return (NULL);
+}
+
+/*
+ * Apply the relocations from the specified 'sec' (a DOF_SECT_URELHDR) to the
+ * specified DOF. SETX relocations are computed using 'ubase', the base load
+ * address of the object containing the DOF, and DOFREL relocations are relative
+ * to the relocation offset within the DOF.
+ */
+static int
+dtrace_dof_relocate(dof_hdr_t *dof, dof_sec_t *sec, uint64_t ubase,
+ uint64_t udaddr)
+{
+ uintptr_t daddr = (uintptr_t)dof;
+ uintptr_t ts_end;
+ dof_relohdr_t *dofr =
+ (dof_relohdr_t *)(uintptr_t)(daddr + sec->dofs_offset);
+ dof_sec_t *ss, *rs, *ts;
+ dof_relodesc_t *r;
+ uint_t i, n;
+
+ if (sec->dofs_size < sizeof (dof_relohdr_t) ||
+ sec->dofs_align != sizeof (dof_secidx_t)) {
+ dtrace_dof_error(dof, "invalid relocation header");
+ return (-1);
+ }
+
+ ss = dtrace_dof_sect(dof, DOF_SECT_STRTAB, dofr->dofr_strtab);
+ rs = dtrace_dof_sect(dof, DOF_SECT_RELTAB, dofr->dofr_relsec);
+ ts = dtrace_dof_sect(dof, DOF_SECT_NONE, dofr->dofr_tgtsec);
+
+	if (ss == NULL || rs == NULL || ts == NULL)
+		return (-1); /* dtrace_dof_error() has been called already */
+
+	ts_end = (uintptr_t)ts + sizeof (dof_sec_t);
+
+ if (rs->dofs_entsize < sizeof (dof_relodesc_t) ||
+ rs->dofs_align != sizeof (uint64_t)) {
+ dtrace_dof_error(dof, "invalid relocation section");
+ return (-1);
+ }
+
+ r = (dof_relodesc_t *)(uintptr_t)(daddr + rs->dofs_offset);
+ n = rs->dofs_size / rs->dofs_entsize;
+
+ for (i = 0; i < n; i++) {
+ uintptr_t taddr = daddr + ts->dofs_offset + r->dofr_offset;
+
+ switch (r->dofr_type) {
+ case DOF_RELO_NONE:
+ break;
+ case DOF_RELO_SETX:
+ case DOF_RELO_DOFREL:
+ if (r->dofr_offset >= ts->dofs_size || r->dofr_offset +
+ sizeof (uint64_t) > ts->dofs_size) {
+ dtrace_dof_error(dof, "bad relocation offset");
+ return (-1);
+ }
+
+ if (taddr >= (uintptr_t)ts && taddr < ts_end) {
+ dtrace_dof_error(dof, "bad relocation offset");
+ return (-1);
+ }
+
+ if (!IS_P2ALIGNED(taddr, sizeof (uint64_t))) {
+ dtrace_dof_error(dof, "misaligned setx relo");
+ return (-1);
+ }
+
+ if (r->dofr_type == DOF_RELO_SETX)
+ *(uint64_t *)taddr += ubase;
+ else
+ *(uint64_t *)taddr +=
+ udaddr + ts->dofs_offset + r->dofr_offset;
+ break;
+ default:
+ dtrace_dof_error(dof, "invalid relocation type");
+ return (-1);
+ }
+
+ r = (dof_relodesc_t *)((uintptr_t)r + rs->dofs_entsize);
+ }
+
+ return (0);
+}
+
+/*
+ * The dof_hdr_t passed to dtrace_dof_slurp() should be a partially validated
+ * header: it should be at the front of a memory region that is at least
+ * sizeof (dof_hdr_t) in size -- and then at least dof_hdr.dofh_loadsz in
+ * size. It need not be validated in any other way.
+ */
+static int
+dtrace_dof_slurp(dof_hdr_t *dof, dtrace_vstate_t *vstate, cred_t *cr,
+ dtrace_enabling_t **enabp, uint64_t ubase, uint64_t udaddr, int noprobes)
+{
+ uint64_t len = dof->dofh_loadsz, seclen;
+ uintptr_t daddr = (uintptr_t)dof;
+ dtrace_ecbdesc_t *ep;
+ dtrace_enabling_t *enab;
+ uint_t i;
+
+ ASSERT(MUTEX_HELD(&dtrace_lock));
+ ASSERT(dof->dofh_loadsz >= sizeof (dof_hdr_t));
+
+ /*
+ * Check the DOF header identification bytes. In addition to checking
+ * valid settings, we also verify that unused bits/bytes are zeroed so
+ * we can use them later without fear of regressing existing binaries.
+ */
+ if (bcmp(&dof->dofh_ident[DOF_ID_MAG0],
+ DOF_MAG_STRING, DOF_MAG_STRLEN) != 0) {
+ dtrace_dof_error(dof, "DOF magic string mismatch");
+ return (-1);
+ }
+
+ if (dof->dofh_ident[DOF_ID_MODEL] != DOF_MODEL_ILP32 &&
+ dof->dofh_ident[DOF_ID_MODEL] != DOF_MODEL_LP64) {
+ dtrace_dof_error(dof, "DOF has invalid data model");
+ return (-1);
+ }
+
+ if (dof->dofh_ident[DOF_ID_ENCODING] != DOF_ENCODE_NATIVE) {
+ dtrace_dof_error(dof, "DOF encoding mismatch");
+ return (-1);
+ }
+
+ if (dof->dofh_ident[DOF_ID_VERSION] != DOF_VERSION_1 &&
+ dof->dofh_ident[DOF_ID_VERSION] != DOF_VERSION_2) {
+ dtrace_dof_error(dof, "DOF version mismatch");
+ return (-1);
+ }
+
+ if (dof->dofh_ident[DOF_ID_DIFVERS] != DIF_VERSION_2) {
+ dtrace_dof_error(dof, "DOF uses unsupported instruction set");
+ return (-1);
+ }
+
+ if (dof->dofh_ident[DOF_ID_DIFIREG] > DIF_DIR_NREGS) {
+ dtrace_dof_error(dof, "DOF uses too many integer registers");
+ return (-1);
+ }
+
+ if (dof->dofh_ident[DOF_ID_DIFTREG] > DIF_DTR_NREGS) {
+ dtrace_dof_error(dof, "DOF uses too many tuple registers");
+ return (-1);
+ }
+
+ for (i = DOF_ID_PAD; i < DOF_ID_SIZE; i++) {
+ if (dof->dofh_ident[i] != 0) {
+ dtrace_dof_error(dof, "DOF has invalid ident byte set");
+ return (-1);
+ }
+ }
+
+ if (dof->dofh_flags & ~DOF_FL_VALID) {
+ dtrace_dof_error(dof, "DOF has invalid flag bits set");
+ return (-1);
+ }
+
+ if (dof->dofh_secsize == 0) {
+ dtrace_dof_error(dof, "zero section header size");
+ return (-1);
+ }
+
+ /*
+ * Check that the section headers don't exceed the amount of DOF
+ * data. Note that we cast the section size and number of sections
+ * to uint64_t's to prevent possible overflow in the multiplication.
+ */
+ seclen = (uint64_t)dof->dofh_secnum * (uint64_t)dof->dofh_secsize;
+
+ if (dof->dofh_secoff > len || seclen > len ||
+ dof->dofh_secoff + seclen > len) {
+ dtrace_dof_error(dof, "truncated section headers");
+ return (-1);
+ }
+
+ if (!IS_P2ALIGNED(dof->dofh_secoff, sizeof (uint64_t))) {
+ dtrace_dof_error(dof, "misaligned section headers");
+ return (-1);
+ }
+
+ if (!IS_P2ALIGNED(dof->dofh_secsize, sizeof (uint64_t))) {
+ dtrace_dof_error(dof, "misaligned section size");
+ return (-1);
+ }
+
+ /*
+ * Take an initial pass through the section headers to be sure that
+ * the headers don't have stray offsets. If the 'noprobes' flag is
+ * set, do not permit sections relating to providers, probes, or args.
+ */
+ for (i = 0; i < dof->dofh_secnum; i++) {
+ dof_sec_t *sec = (dof_sec_t *)(daddr +
+ (uintptr_t)dof->dofh_secoff + i * dof->dofh_secsize);
+
+ if (noprobes) {
+ switch (sec->dofs_type) {
+ case DOF_SECT_PROVIDER:
+ case DOF_SECT_PROBES:
+ case DOF_SECT_PRARGS:
+ case DOF_SECT_PROFFS:
+ dtrace_dof_error(dof, "illegal sections "
+ "for enabling");
+ return (-1);
+ }
+ }
+
+ if (DOF_SEC_ISLOADABLE(sec->dofs_type) &&
+ !(sec->dofs_flags & DOF_SECF_LOAD)) {
+ dtrace_dof_error(dof, "loadable section with load "
+ "flag unset");
+ return (-1);
+ }
+
+ if (!(sec->dofs_flags & DOF_SECF_LOAD))
+ continue; /* just ignore non-loadable sections */
+
+ if (!ISP2(sec->dofs_align)) {
+ dtrace_dof_error(dof, "bad section alignment");
+ return (-1);
+ }
+
+ if (sec->dofs_offset & (sec->dofs_align - 1)) {
+ dtrace_dof_error(dof, "misaligned section");
+ return (-1);
+ }
+
+ if (sec->dofs_offset > len || sec->dofs_size > len ||
+ sec->dofs_offset + sec->dofs_size > len) {
+ dtrace_dof_error(dof, "corrupt section header");
+ return (-1);
+ }
+
+ if (sec->dofs_type == DOF_SECT_STRTAB && *((char *)daddr +
+ sec->dofs_offset + sec->dofs_size - 1) != '\0') {
+ dtrace_dof_error(dof, "non-terminating string table");
+ return (-1);
+ }
+ }
+
+ /*
+ * Take a second pass through the sections and locate and perform any
+ * relocations that are present. We do this after the first pass to
+ * be sure that all sections have had their headers validated.
+ */
+ for (i = 0; i < dof->dofh_secnum; i++) {
+ dof_sec_t *sec = (dof_sec_t *)(daddr +
+ (uintptr_t)dof->dofh_secoff + i * dof->dofh_secsize);
+
+ if (!(sec->dofs_flags & DOF_SECF_LOAD))
+ continue; /* skip sections that are not loadable */
+
+ switch (sec->dofs_type) {
+ case DOF_SECT_URELHDR:
+ if (dtrace_dof_relocate(dof, sec, ubase, udaddr) != 0)
+ return (-1);
+ break;
+ }
+ }
+
+ if ((enab = *enabp) == NULL)
+ enab = *enabp = dtrace_enabling_create(vstate);
+
+ for (i = 0; i < dof->dofh_secnum; i++) {
+ dof_sec_t *sec = (dof_sec_t *)(daddr +
+ (uintptr_t)dof->dofh_secoff + i * dof->dofh_secsize);
+
+ if (sec->dofs_type != DOF_SECT_ECBDESC)
+ continue;
+
+ if ((ep = dtrace_dof_ecbdesc(dof, sec, vstate, cr)) == NULL) {
+ dtrace_enabling_destroy(enab);
+ *enabp = NULL;
+ return (-1);
+ }
+
+ dtrace_enabling_add(enab, ep);
+ }
+
+ return (0);
+}
+
+/*
+ * Process DOF for any options. This routine assumes that the DOF has been
+ * at least processed by dtrace_dof_slurp().
+ */
+static int
+dtrace_dof_options(dof_hdr_t *dof, dtrace_state_t *state)
+{
+ int i, rval;
+ uint32_t entsize;
+ size_t offs;
+ dof_optdesc_t *desc;
+
+ for (i = 0; i < dof->dofh_secnum; i++) {
+ dof_sec_t *sec = (dof_sec_t *)((uintptr_t)dof +
+ (uintptr_t)dof->dofh_secoff + i * dof->dofh_secsize);
+
+ if (sec->dofs_type != DOF_SECT_OPTDESC)
+ continue;
+
+ if (sec->dofs_align != sizeof (uint64_t)) {
+ dtrace_dof_error(dof, "bad alignment in "
+ "option description");
+ return (EINVAL);
+ }
+
+ if ((entsize = sec->dofs_entsize) == 0) {
+ dtrace_dof_error(dof, "zeroed option entry size");
+ return (EINVAL);
+ }
+
+ if (entsize < sizeof (dof_optdesc_t)) {
+ dtrace_dof_error(dof, "bad option entry size");
+ return (EINVAL);
+ }
+
+ for (offs = 0; offs < sec->dofs_size; offs += entsize) {
+ desc = (dof_optdesc_t *)((uintptr_t)dof +
+ (uintptr_t)sec->dofs_offset + offs);
+
+ if (desc->dofo_strtab != DOF_SECIDX_NONE) {
+ dtrace_dof_error(dof, "non-zero option string");
+ return (EINVAL);
+ }
+
+ if (desc->dofo_value == DTRACEOPT_UNSET) {
+ dtrace_dof_error(dof, "unset option");
+ return (EINVAL);
+ }
+
+ if ((rval = dtrace_state_option(state,
+ desc->dofo_option, desc->dofo_value)) != 0) {
+ dtrace_dof_error(dof, "rejected option");
+ return (rval);
+ }
+ }
+ }
+
+ return (0);
+}
+
+/*
+ * DTrace Consumer State Functions
+ */
+static int
+dtrace_dstate_init(dtrace_dstate_t *dstate, size_t size)
+{
+ size_t hashsize, maxper, min, chunksize = dstate->dtds_chunksize;
+ void *base;
+ uintptr_t limit;
+ dtrace_dynvar_t *dvar, *next, *start;
+ int i;
+
+ ASSERT(MUTEX_HELD(&dtrace_lock));
+ ASSERT(dstate->dtds_base == NULL && dstate->dtds_percpu == NULL);
+
+ bzero(dstate, sizeof (dtrace_dstate_t));
+
+ if ((dstate->dtds_chunksize = chunksize) == 0)
+ dstate->dtds_chunksize = DTRACE_DYNVAR_CHUNKSIZE;
+
+ VERIFY(dstate->dtds_chunksize < LONG_MAX);
+
+ if (size < (min = dstate->dtds_chunksize + sizeof (dtrace_dynhash_t)))
+ size = min;
+
+ if ((base = kmem_zalloc(size, KM_NOSLEEP | KM_NORMALPRI)) == NULL)
+ return (ENOMEM);
+
+ dstate->dtds_size = size;
+ dstate->dtds_base = base;
+ dstate->dtds_percpu = kmem_cache_alloc(dtrace_state_cache, KM_SLEEP);
+ bzero(dstate->dtds_percpu, NCPU * sizeof (dtrace_dstate_percpu_t));
+
+ hashsize = size / (dstate->dtds_chunksize + sizeof (dtrace_dynhash_t));
+
+ if (hashsize != 1 && (hashsize & 1))
+ hashsize--;
+
+ dstate->dtds_hashsize = hashsize;
+ dstate->dtds_hash = dstate->dtds_base;
+
+ /*
+ * Set all of our hash buckets to point to the single sink, and (if
+ * it hasn't already been set), set the sink's hash value to be the
+ * sink sentinel value. The sink is needed for dynamic variable
+ * lookups to know that they have iterated over an entire, valid hash
+ * chain.
+ */
+ for (i = 0; i < hashsize; i++)
+ dstate->dtds_hash[i].dtdh_chain = &dtrace_dynhash_sink;
+
+ if (dtrace_dynhash_sink.dtdv_hashval != DTRACE_DYNHASH_SINK)
+ dtrace_dynhash_sink.dtdv_hashval = DTRACE_DYNHASH_SINK;
+
+ /*
+ * Determine number of active CPUs. Divide free list evenly among
+ * active CPUs.
+ */
+ start = (dtrace_dynvar_t *)
+ ((uintptr_t)base + hashsize * sizeof (dtrace_dynhash_t));
+ limit = (uintptr_t)base + size;
+
+ VERIFY((uintptr_t)start < limit);
+ VERIFY((uintptr_t)start >= (uintptr_t)base);
+
+ maxper = (limit - (uintptr_t)start) / NCPU;
+ maxper = (maxper / dstate->dtds_chunksize) * dstate->dtds_chunksize;
+
+#ifndef illumos
+ CPU_FOREACH(i) {
+#else
+ for (i = 0; i < NCPU; i++) {
+#endif
+ dstate->dtds_percpu[i].dtdsc_free = dvar = start;
+
+ /*
+ * If we don't even have enough chunks to make it once through
+ * NCPUs, we're just going to allocate everything to the first
+ * CPU. And if we're on the last CPU, we're going to allocate
+ * whatever is left over. In either case, we set the limit to
+ * be the limit of the dynamic variable space.
+ */
+ if (maxper == 0 || i == NCPU - 1) {
+ limit = (uintptr_t)base + size;
+ start = NULL;
+ } else {
+ limit = (uintptr_t)start + maxper;
+ start = (dtrace_dynvar_t *)limit;
+ }
+
+ VERIFY(limit <= (uintptr_t)base + size);
+
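+		/*
+		 * Chain the chunks in this CPU's region into a free list,
+		 * stopping when another chunk of dtds_chunksize would not
+		 * fit below the region's limit.
+		 */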
+ for (;;) {
+ next = (dtrace_dynvar_t *)((uintptr_t)dvar +
+ dstate->dtds_chunksize);
+
+ if ((uintptr_t)next + dstate->dtds_chunksize >= limit)
+ break;
+
+ VERIFY((uintptr_t)dvar >= (uintptr_t)base &&
+ (uintptr_t)dvar <= (uintptr_t)base + size);
+ dvar->dtdv_next = next;
+ dvar = next;
+ }
+
+ if (maxper == 0)
+ break;
+ }
+
+ return (0);
+}
+
+static void
+dtrace_dstate_fini(dtrace_dstate_t *dstate)
+{
+ ASSERT(MUTEX_HELD(&cpu_lock));
+
+ if (dstate->dtds_base == NULL)
+ return;
+
+ kmem_free(dstate->dtds_base, dstate->dtds_size);
+ kmem_cache_free(dtrace_state_cache, dstate->dtds_percpu);
+}
+
+static void
+dtrace_vstate_fini(dtrace_vstate_t *vstate)
+{
+ /*
+ * Logical XOR, where are you?
+ */
+ ASSERT((vstate->dtvs_nglobals == 0) ^ (vstate->dtvs_globals != NULL));
+
+ if (vstate->dtvs_nglobals > 0) {
+ kmem_free(vstate->dtvs_globals, vstate->dtvs_nglobals *
+ sizeof (dtrace_statvar_t *));
+ }
+
+ if (vstate->dtvs_ntlocals > 0) {
+ kmem_free(vstate->dtvs_tlocals, vstate->dtvs_ntlocals *
+ sizeof (dtrace_difv_t));
+ }
+
+ ASSERT((vstate->dtvs_nlocals == 0) ^ (vstate->dtvs_locals != NULL));
+
+ if (vstate->dtvs_nlocals > 0) {
+ kmem_free(vstate->dtvs_locals, vstate->dtvs_nlocals *
+ sizeof (dtrace_statvar_t *));
+ }
+}
+
+#ifdef illumos
+static void
+dtrace_state_clean(dtrace_state_t *state)
+{
+ if (state->dts_activity == DTRACE_ACTIVITY_INACTIVE)
+ return;
+
+ dtrace_dynvar_clean(&state->dts_vstate.dtvs_dynvars);
+ dtrace_speculation_clean(state);
+}
+
+static void
+dtrace_state_deadman(dtrace_state_t *state)
+{
+ hrtime_t now;
+
+ dtrace_sync();
+
+ now = dtrace_gethrtime();
+
+ if (state != dtrace_anon.dta_state &&
+ now - state->dts_laststatus >= dtrace_deadman_user)
+ return;
+
+ /*
+ * We must be sure that dts_alive never appears to be less than the
+ * value upon entry to dtrace_state_deadman(), and because we lack a
+ * dtrace_cas64(), we cannot store to it atomically. We thus instead
+ * store INT64_MAX to it, followed by a memory barrier, followed by
+ * the new value. This assures that dts_alive never appears to be
+ * less than its true value, regardless of the order in which the
+ * stores to the underlying storage are issued.
+ */
+ state->dts_alive = INT64_MAX;
+ dtrace_membar_producer();
+ state->dts_alive = now;
+}
+#else /* !illumos */
+static void
+dtrace_state_clean(void *arg)
+{
+ dtrace_state_t *state = arg;
+ dtrace_optval_t *opt = state->dts_options;
+
+ if (state->dts_activity == DTRACE_ACTIVITY_INACTIVE)
+ return;
+
+ dtrace_dynvar_clean(&state->dts_vstate.dtvs_dynvars);
+ dtrace_speculation_clean(state);
+
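+	/*
+	 * Unlike illumos, FreeBSD drives cleaning with a callout rather
+	 * than a cyclic, so reschedule ourselves at the cleaning rate
+	 * (a nanosecond interval, converted here to callout ticks).
+	 */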
+ callout_reset(&state->dts_cleaner, hz * opt[DTRACEOPT_CLEANRATE] / NANOSEC,
+ dtrace_state_clean, state);
+}
+
+static void
+dtrace_state_deadman(void *arg)
+{
+ dtrace_state_t *state = arg;
+ hrtime_t now;
+
+ dtrace_sync();
+
+ dtrace_debug_output();
+
+ now = dtrace_gethrtime();
+
+ if (state != dtrace_anon.dta_state &&
+ now - state->dts_laststatus >= dtrace_deadman_user)
+ return;
+
+ /*
+ * We must be sure that dts_alive never appears to be less than the
+ * value upon entry to dtrace_state_deadman(), and because we lack a
+ * dtrace_cas64(), we cannot store to it atomically. We thus instead
+ * store INT64_MAX to it, followed by a memory barrier, followed by
+ * the new value. This assures that dts_alive never appears to be
+ * less than its true value, regardless of the order in which the
+ * stores to the underlying storage are issued.
+ */
+ state->dts_alive = INT64_MAX;
+ dtrace_membar_producer();
+ state->dts_alive = now;
+
+ callout_reset(&state->dts_deadman, hz * dtrace_deadman_interval / NANOSEC,
+ dtrace_state_deadman, state);
+}
+#endif /* illumos */
+
+static dtrace_state_t *
+#ifdef illumos
+dtrace_state_create(dev_t *devp, cred_t *cr)
+#else
+dtrace_state_create(struct cdev *dev, struct ucred *cred __unused)
+#endif
+{
+#ifdef illumos
+ minor_t minor;
+ major_t major;
+#else
+ cred_t *cr = NULL;
+ int m = 0;
+#endif
+ char c[30];
+ dtrace_state_t *state;
+ dtrace_optval_t *opt;
+ int bufsize = NCPU * sizeof (dtrace_buffer_t), i;
+ int cpu_it;
+
+ ASSERT(MUTEX_HELD(&dtrace_lock));
+ ASSERT(MUTEX_HELD(&cpu_lock));
+
+#ifdef illumos
+ minor = (minor_t)(uintptr_t)vmem_alloc(dtrace_minor, 1,
+ VM_BESTFIT | VM_SLEEP);
+
+ if (ddi_soft_state_zalloc(dtrace_softstate, minor) != DDI_SUCCESS) {
+ vmem_free(dtrace_minor, (void *)(uintptr_t)minor, 1);
+ return (NULL);
+ }
+
+ state = ddi_get_soft_state(dtrace_softstate, minor);
+#else
+ if (dev != NULL) {
+ cr = dev->si_cred;
+ m = dev2unit(dev);
+ }
+
+ /* Allocate memory for the state. */
+ state = kmem_zalloc(sizeof(dtrace_state_t), KM_SLEEP);
+#endif
+
+ state->dts_epid = DTRACE_EPIDNONE + 1;
+
+#ifdef illumos
+	(void) snprintf(c, sizeof (c), "dtrace_aggid_%d", minor);
+#else
+	(void) snprintf(c, sizeof (c), "dtrace_aggid_%d", m);
+#endif
+#ifdef illumos
+ state->dts_aggid_arena = vmem_create(c, (void *)1, UINT32_MAX, 1,
+ NULL, NULL, NULL, 0, VM_SLEEP | VMC_IDENTIFIER);
+
+ if (devp != NULL) {
+ major = getemajor(*devp);
+ } else {
+ major = ddi_driver_major(dtrace_devi);
+ }
+
+ state->dts_dev = makedevice(major, minor);
+
+ if (devp != NULL)
+ *devp = state->dts_dev;
+#else
+ state->dts_aggid_arena = new_unrhdr(1, INT_MAX, &dtrace_unr_mtx);
+ state->dts_dev = dev;
+#endif
+
+ /*
+ * We allocate NCPU buffers. On the one hand, this can be quite
+ * a bit of memory per instance (nearly 36K on a Starcat). On the
+ * other hand, it saves an additional memory reference in the probe
+ * path.
+ */
+ state->dts_buffer = kmem_zalloc(bufsize, KM_SLEEP);
+ state->dts_aggbuffer = kmem_zalloc(bufsize, KM_SLEEP);
+
+ /*
+ * Allocate and initialise the per-process per-CPU random state.
+	 * Because SI_SUB_RANDOM < SI_SUB_DTRACE_ANON, the entropy device is
+	 * assumed to be seeded by this point (if seeding from a Fortuna seed
+	 * file).
+ */
+ arc4random_buf(&state->dts_rstate[0], 2 * sizeof(uint64_t));
+ for (cpu_it = 1; cpu_it < NCPU; cpu_it++) {
+ /*
+ * Each CPU is assigned a 2^64 period, non-overlapping
+ * subsequence.
+ */
+ dtrace_xoroshiro128_plus_jump(state->dts_rstate[cpu_it-1],
+ state->dts_rstate[cpu_it]);
+ }
+
+#ifdef illumos
+ state->dts_cleaner = CYCLIC_NONE;
+ state->dts_deadman = CYCLIC_NONE;
+#else
+ callout_init(&state->dts_cleaner, 1);
+ callout_init(&state->dts_deadman, 1);
+#endif
+ state->dts_vstate.dtvs_state = state;
+
+ for (i = 0; i < DTRACEOPT_MAX; i++)
+ state->dts_options[i] = DTRACEOPT_UNSET;
+
+ /*
+ * Set the default options.
+ */
+ opt = state->dts_options;
+ opt[DTRACEOPT_BUFPOLICY] = DTRACEOPT_BUFPOLICY_SWITCH;
+ opt[DTRACEOPT_BUFRESIZE] = DTRACEOPT_BUFRESIZE_AUTO;
+ opt[DTRACEOPT_NSPEC] = dtrace_nspec_default;
+ opt[DTRACEOPT_SPECSIZE] = dtrace_specsize_default;
+ opt[DTRACEOPT_CPU] = (dtrace_optval_t)DTRACE_CPUALL;
+ opt[DTRACEOPT_STRSIZE] = dtrace_strsize_default;
+ opt[DTRACEOPT_STACKFRAMES] = dtrace_stackframes_default;
+ opt[DTRACEOPT_USTACKFRAMES] = dtrace_ustackframes_default;
+ opt[DTRACEOPT_CLEANRATE] = dtrace_cleanrate_default;
+ opt[DTRACEOPT_AGGRATE] = dtrace_aggrate_default;
+ opt[DTRACEOPT_SWITCHRATE] = dtrace_switchrate_default;
+ opt[DTRACEOPT_STATUSRATE] = dtrace_statusrate_default;
+ opt[DTRACEOPT_JSTACKFRAMES] = dtrace_jstackframes_default;
+ opt[DTRACEOPT_JSTACKSTRSIZE] = dtrace_jstackstrsize_default;
+
+ state->dts_activity = DTRACE_ACTIVITY_INACTIVE;
+
+ /*
+ * Depending on the user credentials, we set flag bits which alter probe
+ * visibility or the amount of destructiveness allowed. In the case of
+ * actual anonymous tracing, or the possession of all privileges, all of
+ * the normal checks are bypassed.
+ */
+ if (cr == NULL || PRIV_POLICY_ONLY(cr, PRIV_ALL, B_FALSE)) {
+ state->dts_cred.dcr_visible = DTRACE_CRV_ALL;
+ state->dts_cred.dcr_action = DTRACE_CRA_ALL;
+ } else {
+ /*
+ * Set up the credentials for this instantiation. We take a
+ * hold on the credential to prevent it from disappearing on
+ * us; this in turn prevents the zone_t referenced by this
+ * credential from disappearing. This means that we can
+ * examine the credential and the zone from probe context.
+ */
+ crhold(cr);
+ state->dts_cred.dcr_cred = cr;
+
+ /*
+ * CRA_PROC means "we have *some* privilege for dtrace" and
+ * unlocks the use of variables like pid, zonename, etc.
+ */
+ if (PRIV_POLICY_ONLY(cr, PRIV_DTRACE_USER, B_FALSE) ||
+ PRIV_POLICY_ONLY(cr, PRIV_DTRACE_PROC, B_FALSE)) {
+ state->dts_cred.dcr_action |= DTRACE_CRA_PROC;
+ }
+
+ /*
+ * dtrace_user allows use of syscall and profile providers.
+ * If the user also has proc_owner and/or proc_zone, we
+ * extend the scope to include additional visibility and
+ * destructive power.
+ */
+ if (PRIV_POLICY_ONLY(cr, PRIV_DTRACE_USER, B_FALSE)) {
+ if (PRIV_POLICY_ONLY(cr, PRIV_PROC_OWNER, B_FALSE)) {
+ state->dts_cred.dcr_visible |=
+ DTRACE_CRV_ALLPROC;
+
+ state->dts_cred.dcr_action |=
+ DTRACE_CRA_PROC_DESTRUCTIVE_ALLUSER;
+ }
+
+ if (PRIV_POLICY_ONLY(cr, PRIV_PROC_ZONE, B_FALSE)) {
+ state->dts_cred.dcr_visible |=
+ DTRACE_CRV_ALLZONE;
+
+ state->dts_cred.dcr_action |=
+ DTRACE_CRA_PROC_DESTRUCTIVE_ALLZONE;
+ }
+
+ /*
+ * If we have all privs in whatever zone this is,
+ * we can do destructive things to processes which
+ * have altered credentials.
+ */
+#ifdef illumos
+ if (priv_isequalset(priv_getset(cr, PRIV_EFFECTIVE),
+ cr->cr_zone->zone_privset)) {
+ state->dts_cred.dcr_action |=
+ DTRACE_CRA_PROC_DESTRUCTIVE_CREDCHG;
+ }
+#endif
+ }
+
+ /*
+ * Holding the dtrace_kernel privilege also implies that
+ * the user has the dtrace_user privilege from a visibility
+ * perspective. But without further privileges, some
+ * destructive actions are not available.
+ */
+ if (PRIV_POLICY_ONLY(cr, PRIV_DTRACE_KERNEL, B_FALSE)) {
+ /*
+ * Make all probes in all zones visible. However,
+ * this doesn't mean that all actions become available
+ * to all zones.
+ */
+ state->dts_cred.dcr_visible |= DTRACE_CRV_KERNEL |
+ DTRACE_CRV_ALLPROC | DTRACE_CRV_ALLZONE;
+
+ state->dts_cred.dcr_action |= DTRACE_CRA_KERNEL |
+ DTRACE_CRA_PROC;
+ /*
+ * Holding proc_owner means that destructive actions
+ * for *this* zone are allowed.
+ */
+ if (PRIV_POLICY_ONLY(cr, PRIV_PROC_OWNER, B_FALSE))
+ state->dts_cred.dcr_action |=
+ DTRACE_CRA_PROC_DESTRUCTIVE_ALLUSER;
+
+ /*
+ * Holding proc_zone means that destructive actions
+			 * for this user/group ID in all zones are allowed.
+ */
+ if (PRIV_POLICY_ONLY(cr, PRIV_PROC_ZONE, B_FALSE))
+ state->dts_cred.dcr_action |=
+ DTRACE_CRA_PROC_DESTRUCTIVE_ALLZONE;
+
+#ifdef illumos
+ /*
+ * If we have all privs in whatever zone this is,
+ * we can do destructive things to processes which
+ * have altered credentials.
+ */
+ if (priv_isequalset(priv_getset(cr, PRIV_EFFECTIVE),
+ cr->cr_zone->zone_privset)) {
+ state->dts_cred.dcr_action |=
+ DTRACE_CRA_PROC_DESTRUCTIVE_CREDCHG;
+ }
+#endif
+ }
+
+ /*
+ * Holding the dtrace_proc privilege gives control over fasttrap
+ * and pid providers. We need to grant wider destructive
+ * privileges in the event that the user has proc_owner and/or
+ * proc_zone.
+ */
+ if (PRIV_POLICY_ONLY(cr, PRIV_DTRACE_PROC, B_FALSE)) {
+ if (PRIV_POLICY_ONLY(cr, PRIV_PROC_OWNER, B_FALSE))
+ state->dts_cred.dcr_action |=
+ DTRACE_CRA_PROC_DESTRUCTIVE_ALLUSER;
+
+ if (PRIV_POLICY_ONLY(cr, PRIV_PROC_ZONE, B_FALSE))
+ state->dts_cred.dcr_action |=
+ DTRACE_CRA_PROC_DESTRUCTIVE_ALLZONE;
+ }
+ }
+
+ return (state);
+}
+
+static int
+dtrace_state_buffer(dtrace_state_t *state, dtrace_buffer_t *buf, int which)
+{
+ dtrace_optval_t *opt = state->dts_options, size;
+	processorid_t cpu = 0;
+ int flags = 0, rval, factor, divisor = 1;
+
+ ASSERT(MUTEX_HELD(&dtrace_lock));
+ ASSERT(MUTEX_HELD(&cpu_lock));
+ ASSERT(which < DTRACEOPT_MAX);
+ ASSERT(state->dts_activity == DTRACE_ACTIVITY_INACTIVE ||
+ (state == dtrace_anon.dta_state &&
+ state->dts_activity == DTRACE_ACTIVITY_ACTIVE));
+
+ if (opt[which] == DTRACEOPT_UNSET || opt[which] == 0)
+ return (0);
+
+ if (opt[DTRACEOPT_CPU] != DTRACEOPT_UNSET)
+ cpu = opt[DTRACEOPT_CPU];
+
+ if (which == DTRACEOPT_SPECSIZE)
+ flags |= DTRACEBUF_NOSWITCH;
+
+ if (which == DTRACEOPT_BUFSIZE) {
+ if (opt[DTRACEOPT_BUFPOLICY] == DTRACEOPT_BUFPOLICY_RING)
+ flags |= DTRACEBUF_RING;
+
+ if (opt[DTRACEOPT_BUFPOLICY] == DTRACEOPT_BUFPOLICY_FILL)
+ flags |= DTRACEBUF_FILL;
+
+ if (state != dtrace_anon.dta_state ||
+ state->dts_activity != DTRACE_ACTIVITY_ACTIVE)
+ flags |= DTRACEBUF_INACTIVE;
+ }
+
+ for (size = opt[which]; size >= sizeof (uint64_t); size /= divisor) {
+ /*
+ * The size must be 8-byte aligned. If the size is not 8-byte
+ * aligned, drop it down by the difference.
+ */
+ if (size & (sizeof (uint64_t) - 1))
+ size -= size & (sizeof (uint64_t) - 1);
+
+ if (size < state->dts_reserve) {
+ /*
+			 * Buffers must always be large enough to accommodate
+ * their prereserved space. We return E2BIG instead
+ * of ENOMEM in this case to allow for user-level
+ * software to differentiate the cases.
+ */
+ return (E2BIG);
+ }
+
+ rval = dtrace_buffer_alloc(buf, size, flags, cpu, &factor);
+
+ if (rval != ENOMEM) {
+ opt[which] = size;
+ return (rval);
+ }
+
+ if (opt[DTRACEOPT_BUFRESIZE] == DTRACEOPT_BUFRESIZE_MANUAL)
+ return (rval);
+
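+		/*
+		 * The allocation failed; dtrace_buffer_alloc() supplies the
+		 * factor by which we overshot, so retry with the size divided
+		 * by the smallest power of two that is at least that factor.
+		 */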
+ for (divisor = 2; divisor < factor; divisor <<= 1)
+ continue;
+ }
+
+ return (ENOMEM);
+}
+
+static int
+dtrace_state_buffers(dtrace_state_t *state)
+{
+ dtrace_speculation_t *spec = state->dts_speculations;
+ int rval, i;
+
+ if ((rval = dtrace_state_buffer(state, state->dts_buffer,
+ DTRACEOPT_BUFSIZE)) != 0)
+ return (rval);
+
+ if ((rval = dtrace_state_buffer(state, state->dts_aggbuffer,
+ DTRACEOPT_AGGSIZE)) != 0)
+ return (rval);
+
+ for (i = 0; i < state->dts_nspeculations; i++) {
+ if ((rval = dtrace_state_buffer(state,
+ spec[i].dtsp_buffer, DTRACEOPT_SPECSIZE)) != 0)
+ return (rval);
+ }
+
+ return (0);
+}
+
+static void
+dtrace_state_prereserve(dtrace_state_t *state)
+{
+ dtrace_ecb_t *ecb;
+ dtrace_probe_t *probe;
+
+ state->dts_reserve = 0;
+
+ if (state->dts_options[DTRACEOPT_BUFPOLICY] != DTRACEOPT_BUFPOLICY_FILL)
+ return;
+
+ /*
+ * If our buffer policy is a "fill" buffer policy, we need to set the
+ * prereserved space to be the space required by the END probes.
+ */
+ probe = dtrace_probes[dtrace_probeid_end - 1];
+ ASSERT(probe != NULL);
+
+ for (ecb = probe->dtpr_ecb; ecb != NULL; ecb = ecb->dte_next) {
+ if (ecb->dte_state != state)
+ continue;
+
+ state->dts_reserve += ecb->dte_needed + ecb->dte_alignment;
+ }
+}
+
+static int
+dtrace_state_go(dtrace_state_t *state, processorid_t *cpu)
+{
+ dtrace_optval_t *opt = state->dts_options, sz, nspec;
+ dtrace_speculation_t *spec;
+ dtrace_buffer_t *buf;
+#ifdef illumos
+ cyc_handler_t hdlr;
+ cyc_time_t when;
+#endif
+ int rval = 0, i, bufsize = NCPU * sizeof (dtrace_buffer_t);
+ dtrace_icookie_t cookie;
+
+ mutex_enter(&cpu_lock);
+ mutex_enter(&dtrace_lock);
+
+ if (state->dts_activity != DTRACE_ACTIVITY_INACTIVE) {
+ rval = EBUSY;
+ goto out;
+ }
+
+ /*
+ * Before we can perform any checks, we must prime all of the
+ * retained enablings that correspond to this state.
+ */
+ dtrace_enabling_prime(state);
+
+ if (state->dts_destructive && !state->dts_cred.dcr_destructive) {
+ rval = EACCES;
+ goto out;
+ }
+
+ dtrace_state_prereserve(state);
+
+ /*
+	 * Now we try to allocate our speculations.
+ * We do not automatically resize the number of speculations; if
+ * this fails, we will fail the operation.
+ */
+ nspec = opt[DTRACEOPT_NSPEC];
+ ASSERT(nspec != DTRACEOPT_UNSET);
+
+ if (nspec > INT_MAX) {
+ rval = ENOMEM;
+ goto out;
+ }
+
+ spec = kmem_zalloc(nspec * sizeof (dtrace_speculation_t),
+ KM_NOSLEEP | KM_NORMALPRI);
+
+ if (spec == NULL) {
+ rval = ENOMEM;
+ goto out;
+ }
+
+ state->dts_speculations = spec;
+ state->dts_nspeculations = (int)nspec;
+
+ for (i = 0; i < nspec; i++) {
+ if ((buf = kmem_zalloc(bufsize,
+ KM_NOSLEEP | KM_NORMALPRI)) == NULL) {
+ rval = ENOMEM;
+ goto err;
+ }
+
+ spec[i].dtsp_buffer = buf;
+ }
+
+ if (opt[DTRACEOPT_GRABANON] != DTRACEOPT_UNSET) {
+ if (dtrace_anon.dta_state == NULL) {
+ rval = ENOENT;
+ goto out;
+ }
+
+ if (state->dts_necbs != 0) {
+ rval = EALREADY;
+ goto out;
+ }
+
+ state->dts_anon = dtrace_anon_grab();
+ ASSERT(state->dts_anon != NULL);
+ state = state->dts_anon;
+
+ /*
+ * We want "grabanon" to be set in the grabbed state, so we'll
+ * copy that option value from the grabbing state into the
+ * grabbed state.
+ */
+ state->dts_options[DTRACEOPT_GRABANON] =
+ opt[DTRACEOPT_GRABANON];
+
+ *cpu = dtrace_anon.dta_beganon;
+
+ /*
+ * If the anonymous state is active (as it almost certainly
+ * is if the anonymous enabling ultimately matched anything),
+ * we don't allow any further option processing -- but we
+ * don't return failure.
+ */
+ if (state->dts_activity != DTRACE_ACTIVITY_INACTIVE)
+ goto out;
+ }
+
+ if (opt[DTRACEOPT_AGGSIZE] != DTRACEOPT_UNSET &&
+ opt[DTRACEOPT_AGGSIZE] != 0) {
+ if (state->dts_aggregations == NULL) {
+ /*
+ * We're not going to create an aggregation buffer
+ * because we don't have any ECBs that contain
+ * aggregations -- set this option to 0.
+ */
+ opt[DTRACEOPT_AGGSIZE] = 0;
+ } else {
+ /*
+ * If we have an aggregation buffer, we must also have
+ * a buffer to use as scratch.
+ */
+ if (opt[DTRACEOPT_BUFSIZE] == DTRACEOPT_UNSET ||
+ opt[DTRACEOPT_BUFSIZE] < state->dts_needed) {
+ opt[DTRACEOPT_BUFSIZE] = state->dts_needed;
+ }
+ }
+ }
+
+ if (opt[DTRACEOPT_SPECSIZE] != DTRACEOPT_UNSET &&
+ opt[DTRACEOPT_SPECSIZE] != 0) {
+ if (!state->dts_speculates) {
+ /*
+ * We're not going to create speculation buffers
+ * because we don't have any ECBs that actually
+ * speculate -- set the speculation size to 0.
+ */
+ opt[DTRACEOPT_SPECSIZE] = 0;
+ }
+ }
+
+ /*
+ * The bare minimum size for any buffer that we're actually going to
+ * do anything to is sizeof (uint64_t).
+ */
+ sz = sizeof (uint64_t);
+
+ if ((state->dts_needed != 0 && opt[DTRACEOPT_BUFSIZE] < sz) ||
+ (state->dts_speculates && opt[DTRACEOPT_SPECSIZE] < sz) ||
+ (state->dts_aggregations != NULL && opt[DTRACEOPT_AGGSIZE] < sz)) {
+ /*
+ * A buffer size has been explicitly set to 0 (or to a size
+ * that will be adjusted to 0) and we need the space -- we
+ * need to return failure. We return ENOSPC to differentiate
+ * it from failing to allocate a buffer due to failure to meet
+ * the reserve (for which we return E2BIG).
+ */
+ rval = ENOSPC;
+ goto out;
+ }
+
+ if ((rval = dtrace_state_buffers(state)) != 0)
+ goto err;
+
+ if ((sz = opt[DTRACEOPT_DYNVARSIZE]) == DTRACEOPT_UNSET)
+ sz = dtrace_dstate_defsize;
+
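+	/*
+	 * Try to allocate the dynamic variable space, halving the requested
+	 * size on each failure unless the resizing policy is manual.
+	 */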
+ do {
+ rval = dtrace_dstate_init(&state->dts_vstate.dtvs_dynvars, sz);
+
+ if (rval == 0)
+ break;
+
+ if (opt[DTRACEOPT_BUFRESIZE] == DTRACEOPT_BUFRESIZE_MANUAL)
+ goto err;
+ } while (sz >>= 1);
+
+ opt[DTRACEOPT_DYNVARSIZE] = sz;
+
+ if (rval != 0)
+ goto err;
+
+ if (opt[DTRACEOPT_STATUSRATE] > dtrace_statusrate_max)
+ opt[DTRACEOPT_STATUSRATE] = dtrace_statusrate_max;
+
+ if (opt[DTRACEOPT_CLEANRATE] == 0)
+ opt[DTRACEOPT_CLEANRATE] = dtrace_cleanrate_max;
+
+ if (opt[DTRACEOPT_CLEANRATE] < dtrace_cleanrate_min)
+ opt[DTRACEOPT_CLEANRATE] = dtrace_cleanrate_min;
+
+ if (opt[DTRACEOPT_CLEANRATE] > dtrace_cleanrate_max)
+ opt[DTRACEOPT_CLEANRATE] = dtrace_cleanrate_max;
+
+ state->dts_alive = state->dts_laststatus = dtrace_gethrtime();
+#ifdef illumos
+ hdlr.cyh_func = (cyc_func_t)dtrace_state_clean;
+ hdlr.cyh_arg = state;
+ hdlr.cyh_level = CY_LOW_LEVEL;
+
+ when.cyt_when = 0;
+ when.cyt_interval = opt[DTRACEOPT_CLEANRATE];
+
+ state->dts_cleaner = cyclic_add(&hdlr, &when);
+
+ hdlr.cyh_func = (cyc_func_t)dtrace_state_deadman;
+ hdlr.cyh_arg = state;
+ hdlr.cyh_level = CY_LOW_LEVEL;
+
+ when.cyt_when = 0;
+ when.cyt_interval = dtrace_deadman_interval;
+
+ state->dts_deadman = cyclic_add(&hdlr, &when);
+#else
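+	/* The rates are expressed in nanoseconds; convert them to ticks. */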
+ callout_reset(&state->dts_cleaner, hz * opt[DTRACEOPT_CLEANRATE] / NANOSEC,
+ dtrace_state_clean, state);
+ callout_reset(&state->dts_deadman, hz * dtrace_deadman_interval / NANOSEC,
+ dtrace_state_deadman, state);
+#endif
+
+ state->dts_activity = DTRACE_ACTIVITY_WARMUP;
+
+#ifdef illumos
+ if (state->dts_getf != 0 &&
+ !(state->dts_cred.dcr_visible & DTRACE_CRV_KERNEL)) {
+ /*
+ * We don't have kernel privs but we have at least one call
+ * to getf(); we need to bump our zone's count, and (if
+ * this is the first enabling to have an unprivileged call
+ * to getf()) we need to hook into closef().
+ */
+ state->dts_cred.dcr_cred->cr_zone->zone_dtrace_getf++;
+
+ if (dtrace_getf++ == 0) {
+ ASSERT(dtrace_closef == NULL);
+ dtrace_closef = dtrace_getf_barrier;
+ }
+ }
+#endif
+
+ /*
+ * Now it's time to actually fire the BEGIN probe. We need to disable
+ * interrupts here both to record the CPU on which we fired the BEGIN
+ * probe (the data from this CPU will be processed first at user
+ * level) and to manually activate the buffer for this CPU.
+ */
+ cookie = dtrace_interrupt_disable();
+ *cpu = curcpu;
+ ASSERT(state->dts_buffer[*cpu].dtb_flags & DTRACEBUF_INACTIVE);
+ state->dts_buffer[*cpu].dtb_flags &= ~DTRACEBUF_INACTIVE;
+
+ dtrace_probe(dtrace_probeid_begin,
+ (uint64_t)(uintptr_t)state, 0, 0, 0, 0);
+ dtrace_interrupt_enable(cookie);
+ /*
+ * We may have had an exit action from a BEGIN probe; only change our
+ * state to ACTIVE if we're still in WARMUP.
+ */
+ ASSERT(state->dts_activity == DTRACE_ACTIVITY_WARMUP ||
+ state->dts_activity == DTRACE_ACTIVITY_DRAINING);
+
+ if (state->dts_activity == DTRACE_ACTIVITY_WARMUP)
+ state->dts_activity = DTRACE_ACTIVITY_ACTIVE;
+
+#ifdef __FreeBSD__
+ /*
+ * We enable anonymous tracing before APs are started, so we must
+ * activate buffers using the current CPU.
+ */
+ if (state == dtrace_anon.dta_state)
+ for (int i = 0; i < NCPU; i++)
+ dtrace_buffer_activate_cpu(state, i);
+ else
+ dtrace_xcall(DTRACE_CPUALL,
+ (dtrace_xcall_t)dtrace_buffer_activate, state);
+#else
+ /*
+	 * Regardless of whether we're now in ACTIVE or DRAINING, we
+ * want each CPU to transition its principal buffer out of the
+ * INACTIVE state. Doing this assures that no CPU will suddenly begin
+ * processing an ECB halfway down a probe's ECB chain; all CPUs will
+ * atomically transition from processing none of a state's ECBs to
+ * processing all of them.
+ */
+ dtrace_xcall(DTRACE_CPUALL,
+ (dtrace_xcall_t)dtrace_buffer_activate, state);
+#endif
+ goto out;
+
+err:
+ dtrace_buffer_free(state->dts_buffer);
+ dtrace_buffer_free(state->dts_aggbuffer);
+
+ if ((nspec = state->dts_nspeculations) == 0) {
+ ASSERT(state->dts_speculations == NULL);
+ goto out;
+ }
+
+ spec = state->dts_speculations;
+ ASSERT(spec != NULL);
+
+ for (i = 0; i < state->dts_nspeculations; i++) {
+ if ((buf = spec[i].dtsp_buffer) == NULL)
+ break;
+
+ dtrace_buffer_free(buf);
+ kmem_free(buf, bufsize);
+ }
+
+ kmem_free(spec, nspec * sizeof (dtrace_speculation_t));
+ state->dts_nspeculations = 0;
+ state->dts_speculations = NULL;
+
+out:
+ mutex_exit(&dtrace_lock);
+ mutex_exit(&cpu_lock);
+
+ return (rval);
+}
+
+static int
+dtrace_state_stop(dtrace_state_t *state, processorid_t *cpu)
+{
+ dtrace_icookie_t cookie;
+
+ ASSERT(MUTEX_HELD(&dtrace_lock));
+
+ if (state->dts_activity != DTRACE_ACTIVITY_ACTIVE &&
+ state->dts_activity != DTRACE_ACTIVITY_DRAINING)
+ return (EINVAL);
+
+ /*
+ * We'll set the activity to DTRACE_ACTIVITY_DRAINING, and issue a sync
+ * to be sure that every CPU has seen it. See below for the details
+ * on why this is done.
+ */
+ state->dts_activity = DTRACE_ACTIVITY_DRAINING;
+ dtrace_sync();
+
+ /*
+ * By this point, it is impossible for any CPU to be still processing
+ * with DTRACE_ACTIVITY_ACTIVE. We can thus set our activity to
+ * DTRACE_ACTIVITY_COOLDOWN and know that we're not racing with any
+ * other CPU in dtrace_buffer_reserve(). This allows dtrace_probe()
+ * and callees to know that the activity is DTRACE_ACTIVITY_COOLDOWN
+ * iff we're in the END probe.
+ */
+ state->dts_activity = DTRACE_ACTIVITY_COOLDOWN;
+ dtrace_sync();
+ ASSERT(state->dts_activity == DTRACE_ACTIVITY_COOLDOWN);
+
+ /*
+ * Finally, we can release the reserve and call the END probe. We
+ * disable interrupts across calling the END probe to allow us to
+ * return the CPU on which we actually called the END probe. This
+ * allows user-land to be sure that this CPU's principal buffer is
+ * processed last.
+ */
+ state->dts_reserve = 0;
+
+ cookie = dtrace_interrupt_disable();
+ *cpu = curcpu;
+ dtrace_probe(dtrace_probeid_end,
+ (uint64_t)(uintptr_t)state, 0, 0, 0, 0);
+ dtrace_interrupt_enable(cookie);
+
+ state->dts_activity = DTRACE_ACTIVITY_STOPPED;
+ dtrace_sync();
+
+#ifdef illumos
+ if (state->dts_getf != 0 &&
+ !(state->dts_cred.dcr_visible & DTRACE_CRV_KERNEL)) {
+ /*
+ * We don't have kernel privs but we have at least one call
+ * to getf(); we need to lower our zone's count, and (if
+ * this is the last enabling to have an unprivileged call
+ * to getf()) we need to clear the closef() hook.
+ */
+ ASSERT(state->dts_cred.dcr_cred->cr_zone->zone_dtrace_getf > 0);
+ ASSERT(dtrace_closef == dtrace_getf_barrier);
+ ASSERT(dtrace_getf > 0);
+
+ state->dts_cred.dcr_cred->cr_zone->zone_dtrace_getf--;
+
+ if (--dtrace_getf == 0)
+ dtrace_closef = NULL;
+ }
+#endif
+
+ return (0);
+}
+
+static int
+dtrace_state_option(dtrace_state_t *state, dtrace_optid_t option,
+ dtrace_optval_t val)
+{
+ ASSERT(MUTEX_HELD(&dtrace_lock));
+
+ if (state->dts_activity != DTRACE_ACTIVITY_INACTIVE)
+ return (EBUSY);
+
+ if (option >= DTRACEOPT_MAX)
+ return (EINVAL);
+
+ if (option != DTRACEOPT_CPU && val < 0)
+ return (EINVAL);
+
+ switch (option) {
+ case DTRACEOPT_DESTRUCTIVE:
+ if (dtrace_destructive_disallow)
+ return (EACCES);
+
+ state->dts_cred.dcr_destructive = 1;
+ break;
+
+ case DTRACEOPT_BUFSIZE:
+ case DTRACEOPT_DYNVARSIZE:
+ case DTRACEOPT_AGGSIZE:
+ case DTRACEOPT_SPECSIZE:
+ case DTRACEOPT_STRSIZE:
+ if (val < 0)
+ return (EINVAL);
+
+ if (val >= LONG_MAX) {
+ /*
+ * If this is an otherwise negative value, set it to
+ * the highest multiple of 128m less than LONG_MAX.
+ * Technically, we're adjusting the size without
+ * regard to the buffer resizing policy, but in fact,
+ * this has no effect -- if we set the buffer size to
+ * ~LONG_MAX and the buffer policy is ultimately set to
+ * be "manual", the buffer allocation is guaranteed to
+ * fail, if only because the allocation requires two
+			 * buffers. (We set the size to the highest
+ * multiple of 128m because it ensures that the size
+ * will remain a multiple of a megabyte when
+ * repeatedly halved -- all the way down to 15m.)
+ */
+ val = LONG_MAX - (1 << 27) + 1;
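+			/*
+			 * With a 32-bit long, for example, this yields
+			 * 2^31 - 2^27 bytes (1920m), which halves cleanly
+			 * through 960m, 480m, and so on, down to the 15m
+			 * noted above.
+			 */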
+ }
+ }
+
+ state->dts_options[option] = val;
+
+ return (0);
+}
+
+static void
+dtrace_state_destroy(dtrace_state_t *state)
+{
+ dtrace_ecb_t *ecb;
+ dtrace_vstate_t *vstate = &state->dts_vstate;
+#ifdef illumos
+ minor_t minor = getminor(state->dts_dev);
+#endif
+ int i, bufsize = NCPU * sizeof (dtrace_buffer_t);
+ dtrace_speculation_t *spec = state->dts_speculations;
+ int nspec = state->dts_nspeculations;
+ uint32_t match;
+
+ ASSERT(MUTEX_HELD(&dtrace_lock));
+ ASSERT(MUTEX_HELD(&cpu_lock));
+
+ /*
+ * First, retract any retained enablings for this state.
+ */
+ dtrace_enabling_retract(state);
+ ASSERT(state->dts_nretained == 0);
+
+ if (state->dts_activity == DTRACE_ACTIVITY_ACTIVE ||
+ state->dts_activity == DTRACE_ACTIVITY_DRAINING) {
+ /*
+ * We have managed to come into dtrace_state_destroy() on a
+ * hot enabling -- almost certainly because of a disorderly
+ * shutdown of a consumer. (That is, a consumer that is
+ * exiting without having called dtrace_stop().) In this case,
+ * we're going to set our activity to be KILLED, and then
+ * issue a sync to be sure that everyone is out of probe
+ * context before we start blowing away ECBs.
+ */
+ state->dts_activity = DTRACE_ACTIVITY_KILLED;
+ dtrace_sync();
+ }
+
+ /*
+ * Release the credential hold we took in dtrace_state_create().
+ */
+ if (state->dts_cred.dcr_cred != NULL)
+ crfree(state->dts_cred.dcr_cred);
+
+ /*
+ * Now we can safely disable and destroy any enabled probes. Because
+ * any DTRACE_PRIV_KERNEL probes may actually be slowing our progress
+ * (especially if they're all enabled), we take two passes through the
+ * ECBs: in the first, we disable just DTRACE_PRIV_KERNEL probes, and
+ * in the second we disable whatever is left over.
+ */
+ for (match = DTRACE_PRIV_KERNEL; ; match = 0) {
+ for (i = 0; i < state->dts_necbs; i++) {
+ if ((ecb = state->dts_ecbs[i]) == NULL)
+ continue;
+
+ if (match && ecb->dte_probe != NULL) {
+ dtrace_probe_t *probe = ecb->dte_probe;
+ dtrace_provider_t *prov = probe->dtpr_provider;
+
+ if (!(prov->dtpv_priv.dtpp_flags & match))
+ continue;
+ }
+
+ dtrace_ecb_disable(ecb);
+ dtrace_ecb_destroy(ecb);
+ }
+
+ if (!match)
+ break;
+ }
+
+ /*
+ * Before we free the buffers, perform one more sync to assure that
+ * every CPU is out of probe context.
+ */
+ dtrace_sync();
+
+ dtrace_buffer_free(state->dts_buffer);
+ dtrace_buffer_free(state->dts_aggbuffer);
+
+ for (i = 0; i < nspec; i++)
+ dtrace_buffer_free(spec[i].dtsp_buffer);
+
+#ifdef illumos
+ if (state->dts_cleaner != CYCLIC_NONE)
+ cyclic_remove(state->dts_cleaner);
+
+ if (state->dts_deadman != CYCLIC_NONE)
+ cyclic_remove(state->dts_deadman);
+#else
+ callout_stop(&state->dts_cleaner);
+ callout_drain(&state->dts_cleaner);
+ callout_stop(&state->dts_deadman);
+ callout_drain(&state->dts_deadman);
+#endif
+
+ dtrace_dstate_fini(&vstate->dtvs_dynvars);
+ dtrace_vstate_fini(vstate);
+ if (state->dts_ecbs != NULL)
+		kmem_free(state->dts_ecbs,
+		    state->dts_necbs * sizeof (dtrace_ecb_t *));
+
+ if (state->dts_aggregations != NULL) {
+#ifdef DEBUG
+ for (i = 0; i < state->dts_naggregations; i++)
+ ASSERT(state->dts_aggregations[i] == NULL);
+#endif
+ ASSERT(state->dts_naggregations > 0);
+ kmem_free(state->dts_aggregations,
+ state->dts_naggregations * sizeof (dtrace_aggregation_t *));
+ }
+
+ kmem_free(state->dts_buffer, bufsize);
+ kmem_free(state->dts_aggbuffer, bufsize);
+
+ for (i = 0; i < nspec; i++)
+ kmem_free(spec[i].dtsp_buffer, bufsize);
+
+ if (spec != NULL)
+ kmem_free(spec, nspec * sizeof (dtrace_speculation_t));
+
+ dtrace_format_destroy(state);
+
+ if (state->dts_aggid_arena != NULL) {
+#ifdef illumos
+ vmem_destroy(state->dts_aggid_arena);
+#else
+ delete_unrhdr(state->dts_aggid_arena);
+#endif
+ state->dts_aggid_arena = NULL;
+ }
+#ifdef illumos
+ ddi_soft_state_free(dtrace_softstate, minor);
+ vmem_free(dtrace_minor, (void *)(uintptr_t)minor, 1);
+#endif
+}
+
+/*
+ * DTrace Anonymous Enabling Functions
+ */
+static dtrace_state_t *
+dtrace_anon_grab(void)
+{
+ dtrace_state_t *state;
+
+ ASSERT(MUTEX_HELD(&dtrace_lock));
+
+ if ((state = dtrace_anon.dta_state) == NULL) {
+ ASSERT(dtrace_anon.dta_enabling == NULL);
+ return (NULL);
+ }
+
+ ASSERT(dtrace_anon.dta_enabling != NULL);
+ ASSERT(dtrace_retained != NULL);
+
+ dtrace_enabling_destroy(dtrace_anon.dta_enabling);
+ dtrace_anon.dta_enabling = NULL;
+ dtrace_anon.dta_state = NULL;
+
+ return (state);
+}
+
+static void
+dtrace_anon_property(void)
+{
+ int i, rv;
+ dtrace_state_t *state;
+ dof_hdr_t *dof;
+ char c[32]; /* enough for "dof-data-" + digits */
+
+ ASSERT(MUTEX_HELD(&dtrace_lock));
+ ASSERT(MUTEX_HELD(&cpu_lock));
+
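+	/*
+	 * Walk the dof-data-N properties (dof-data-0, dof-data-1, ...) until
+	 * a lookup fails, slurping each DOF into the anonymous enabling as
+	 * we go.
+	 */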
+ for (i = 0; ; i++) {
+ (void) snprintf(c, sizeof (c), "dof-data-%d", i);
+
+ dtrace_err_verbose = 1;
+
+ if ((dof = dtrace_dof_property(c)) == NULL) {
+ dtrace_err_verbose = 0;
+ break;
+ }
+
+#ifdef illumos
+ /*
+ * We want to create anonymous state, so we need to transition
+ * the kernel debugger to indicate that DTrace is active. If
+ * this fails (e.g. because the debugger has modified text in
+ * some way), we won't continue with the processing.
+ */
+ if (kdi_dtrace_set(KDI_DTSET_DTRACE_ACTIVATE) != 0) {
+ cmn_err(CE_NOTE, "kernel debugger active; anonymous "
+ "enabling ignored.");
+ dtrace_dof_destroy(dof);
+ break;
+ }
+#endif
+
+ /*
+ * If we haven't allocated an anonymous state, we'll do so now.
+ */
+ if ((state = dtrace_anon.dta_state) == NULL) {
+ state = dtrace_state_create(NULL, NULL);
+ dtrace_anon.dta_state = state;
+
+ if (state == NULL) {
+ /*
+ * This basically shouldn't happen: the only
+ * failure mode from dtrace_state_create() is a
+ * failure of ddi_soft_state_zalloc() that
+ * itself should never happen. Still, the
+ * interface allows for a failure mode, and
+ * we want to fail as gracefully as possible:
+ * we'll emit an error message and cease
+ * processing anonymous state in this case.
+ */
+ cmn_err(CE_WARN, "failed to create "
+ "anonymous state");
+ dtrace_dof_destroy(dof);
+ break;
+ }
+ }
+
+ rv = dtrace_dof_slurp(dof, &state->dts_vstate, CRED(),
+ &dtrace_anon.dta_enabling, 0, 0, B_TRUE);
+
+ if (rv == 0)
+ rv = dtrace_dof_options(dof, state);
+
+ dtrace_err_verbose = 0;
+ dtrace_dof_destroy(dof);
+
+ if (rv != 0) {
+ /*
+ * This is malformed DOF; chuck any anonymous state
+ * that we created.
+ */
+ ASSERT(dtrace_anon.dta_enabling == NULL);
+ dtrace_state_destroy(state);
+ dtrace_anon.dta_state = NULL;
+ break;
+ }
+
+ ASSERT(dtrace_anon.dta_enabling != NULL);
+ }
+
+ if (dtrace_anon.dta_enabling != NULL) {
+ int rval;
+
+ /*
+ * dtrace_enabling_retain() can only fail because we are
+ * trying to retain more enablings than are allowed -- but
+ * we only have one anonymous enabling, and we are guaranteed
+ * to be allowed at least one retained enabling; we assert
+ * that dtrace_enabling_retain() returns success.
+ */
+ rval = dtrace_enabling_retain(dtrace_anon.dta_enabling);
+ ASSERT(rval == 0);
+
+ dtrace_enabling_dump(dtrace_anon.dta_enabling);
+ }
+}
+
+/*
+ * DTrace Helper Functions
+ */
+static void
+dtrace_helper_trace(dtrace_helper_action_t *helper,
+ dtrace_mstate_t *mstate, dtrace_vstate_t *vstate, int where)
+{
+ uint32_t size, next, nnext, i;
+ dtrace_helptrace_t *ent, *buffer;
+ uint16_t flags = cpu_core[curcpu].cpuc_dtrace_flags;
+
+ if ((buffer = dtrace_helptrace_buffer) == NULL)
+ return;
+
+ ASSERT(vstate->dtvs_nlocals <= dtrace_helptrace_nlocals);
+
+ /*
+ * What would a tracing framework be without its own tracing
+ * framework? (Well, a hell of a lot simpler, for starters...)
+ */
+ size = sizeof (dtrace_helptrace_t) + dtrace_helptrace_nlocals *
+ sizeof (uint64_t) - sizeof (uint64_t);
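+	/*
+	 * (The trailing subtraction accounts for the one dtht_locals[] slot
+	 * already included in sizeof (dtrace_helptrace_t).)
+	 */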
+
+ /*
+ * Iterate until we can allocate a slot in the trace buffer.
+ */
+ do {
+ next = dtrace_helptrace_next;
+
+ if (next + size < dtrace_helptrace_bufsize) {
+ nnext = next + size;
+ } else {
+ nnext = size;
+ }
+ } while (dtrace_cas32(&dtrace_helptrace_next, next, nnext) != next);
+
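+	/*
+	 * nnext equals size when we wrapped -- or when the slot was carved
+	 * from offset 0 to begin with; in either case, the entry starts at
+	 * the top of the buffer.
+	 */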
+ /*
+ * We have our slot; fill it in.
+ */
+ if (nnext == size) {
+ dtrace_helptrace_wrapped++;
+ next = 0;
+ }
+
+ ent = (dtrace_helptrace_t *)((uintptr_t)buffer + next);
+ ent->dtht_helper = helper;
+ ent->dtht_where = where;
+ ent->dtht_nlocals = vstate->dtvs_nlocals;
+
+ ent->dtht_fltoffs = (mstate->dtms_present & DTRACE_MSTATE_FLTOFFS) ?
+ mstate->dtms_fltoffs : -1;
+ ent->dtht_fault = DTRACE_FLAGS2FLT(flags);
+ ent->dtht_illval = cpu_core[curcpu].cpuc_dtrace_illval;
+
+ for (i = 0; i < vstate->dtvs_nlocals; i++) {
+ dtrace_statvar_t *svar;
+
+ if ((svar = vstate->dtvs_locals[i]) == NULL)
+ continue;
+
+ ASSERT(svar->dtsv_size >= NCPU * sizeof (uint64_t));
+ ent->dtht_locals[i] =
+ ((uint64_t *)(uintptr_t)svar->dtsv_data)[curcpu];
+ }
+}
+
+static uint64_t
+dtrace_helper(int which, dtrace_mstate_t *mstate,
+ dtrace_state_t *state, uint64_t arg0, uint64_t arg1)
+{
+ uint16_t *flags = &cpu_core[curcpu].cpuc_dtrace_flags;
+ uint64_t sarg0 = mstate->dtms_arg[0];
+ uint64_t sarg1 = mstate->dtms_arg[1];
+ uint64_t rval = 0;
+ dtrace_helpers_t *helpers = curproc->p_dtrace_helpers;
+ dtrace_helper_action_t *helper;
+ dtrace_vstate_t *vstate;
+ dtrace_difo_t *pred;
+ int i, trace = dtrace_helptrace_buffer != NULL;
+
+ ASSERT(which >= 0 && which < DTRACE_NHELPER_ACTIONS);
+
+ if (helpers == NULL)
+ return (0);
+
+ if ((helper = helpers->dthps_actions[which]) == NULL)
+ return (0);
+
+ vstate = &helpers->dthps_vstate;
+ mstate->dtms_arg[0] = arg0;
+ mstate->dtms_arg[1] = arg1;
+
+ /*
+ * Now iterate over each helper. If its predicate evaluates to 'true',
+ * we'll call the corresponding actions. Note that the below calls
+ * to dtrace_dif_emulate() may set faults in machine state. This is
+ * okay: our caller (the outer dtrace_dif_emulate()) will simply plow
+ * the stored DIF offset with its own (which is the desired behavior).
+ * Also, note the calls to dtrace_dif_emulate() may allocate scratch
+ * from machine state; this is okay, too.
+ */
+ for (; helper != NULL; helper = helper->dtha_next) {
+ if ((pred = helper->dtha_predicate) != NULL) {
+ if (trace)
+ dtrace_helper_trace(helper, mstate, vstate, 0);
+
+ if (!dtrace_dif_emulate(pred, mstate, vstate, state))
+ goto next;
+
+ if (*flags & CPU_DTRACE_FAULT)
+ goto err;
+ }
+
+ for (i = 0; i < helper->dtha_nactions; i++) {
+ if (trace)
+ dtrace_helper_trace(helper,
+ mstate, vstate, i + 1);
+
+ rval = dtrace_dif_emulate(helper->dtha_actions[i],
+ mstate, vstate, state);
+
+ if (*flags & CPU_DTRACE_FAULT)
+ goto err;
+ }
+
+next:
+ if (trace)
+ dtrace_helper_trace(helper, mstate, vstate,
+ DTRACE_HELPTRACE_NEXT);
+ }
+
+ if (trace)
+ dtrace_helper_trace(helper, mstate, vstate,
+ DTRACE_HELPTRACE_DONE);
+
+ /*
+ * Restore the arg0 that we saved upon entry.
+ */
+ mstate->dtms_arg[0] = sarg0;
+ mstate->dtms_arg[1] = sarg1;
+
+ return (rval);
+
+err:
+ if (trace)
+ dtrace_helper_trace(helper, mstate, vstate,
+ DTRACE_HELPTRACE_ERR);
+
+ /*
+ * Restore the arg0 that we saved upon entry.
+ */
+ mstate->dtms_arg[0] = sarg0;
+ mstate->dtms_arg[1] = sarg1;
+
+ return (0);
+}
+
+static void
+dtrace_helper_action_destroy(dtrace_helper_action_t *helper,
+ dtrace_vstate_t *vstate)
+{
+ int i;
+
+ if (helper->dtha_predicate != NULL)
+ dtrace_difo_release(helper->dtha_predicate, vstate);
+
+ for (i = 0; i < helper->dtha_nactions; i++) {
+ ASSERT(helper->dtha_actions[i] != NULL);
+ dtrace_difo_release(helper->dtha_actions[i], vstate);
+ }
+
+ kmem_free(helper->dtha_actions,
+ helper->dtha_nactions * sizeof (dtrace_difo_t *));
+ kmem_free(helper, sizeof (dtrace_helper_action_t));
+}
+
+static int
+dtrace_helper_destroygen(dtrace_helpers_t *help, int gen)
+{
+ proc_t *p = curproc;
+ dtrace_vstate_t *vstate;
+ int i;
+
+ if (help == NULL)
+ help = p->p_dtrace_helpers;
+
+ ASSERT(MUTEX_HELD(&dtrace_lock));
+
+ if (help == NULL || gen > help->dthps_generation)
+ return (EINVAL);
+
+ vstate = &help->dthps_vstate;
+
+ for (i = 0; i < DTRACE_NHELPER_ACTIONS; i++) {
+ dtrace_helper_action_t *last = NULL, *h, *next;
+
+ for (h = help->dthps_actions[i]; h != NULL; h = next) {
+ next = h->dtha_next;
+
+ if (h->dtha_generation == gen) {
+ if (last != NULL) {
+ last->dtha_next = next;
+ } else {
+ help->dthps_actions[i] = next;
+ }
+
+ dtrace_helper_action_destroy(h, vstate);
+ } else {
+ last = h;
+ }
+ }
+ }
+
+ /*
+	 * Iterate until we've cleared out all helper providers with the
+ * given generation number.
+ */
+ for (;;) {
+ dtrace_helper_provider_t *prov;
+
+ /*
+ * Look for a helper provider with the right generation. We
+ * have to start back at the beginning of the list each time
+ * because we drop dtrace_lock. It's unlikely that we'll make
+ * more than two passes.
+ */
+ for (i = 0; i < help->dthps_nprovs; i++) {
+ prov = help->dthps_provs[i];
+
+ if (prov->dthp_generation == gen)
+ break;
+ }
+
+ /*
+ * If there were no matches, we're done.
+ */
+ if (i == help->dthps_nprovs)
+ break;
+
+ /*
+ * Move the last helper provider into this slot.
+ */
+ help->dthps_nprovs--;
+ help->dthps_provs[i] = help->dthps_provs[help->dthps_nprovs];
+ help->dthps_provs[help->dthps_nprovs] = NULL;
+
+ mutex_exit(&dtrace_lock);
+
+ /*
+ * If we have a meta provider, remove this helper provider.
+ */
+ mutex_enter(&dtrace_meta_lock);
+ if (dtrace_meta_pid != NULL) {
+ ASSERT(dtrace_deferred_pid == NULL);
+ dtrace_helper_provider_remove(&prov->dthp_prov,
+ p->p_pid);
+ }
+ mutex_exit(&dtrace_meta_lock);
+
+ dtrace_helper_provider_destroy(prov);
+
+ mutex_enter(&dtrace_lock);
+ }
+
+ return (0);
+}
+
+static int
+dtrace_helper_validate(dtrace_helper_action_t *helper)
+{
+ int err = 0, i;
+ dtrace_difo_t *dp;
+
+ if ((dp = helper->dtha_predicate) != NULL)
+ err += dtrace_difo_validate_helper(dp);
+
+ for (i = 0; i < helper->dtha_nactions; i++)
+ err += dtrace_difo_validate_helper(helper->dtha_actions[i]);
+
+ return (err == 0);
+}
+
+static int
+dtrace_helper_action_add(int which, dtrace_ecbdesc_t *ep,
+ dtrace_helpers_t *help)
+{
+ dtrace_helper_action_t *helper, *last;
+ dtrace_actdesc_t *act;
+ dtrace_vstate_t *vstate;
+ dtrace_predicate_t *pred;
+ int count = 0, nactions = 0, i;
+
+ if (which < 0 || which >= DTRACE_NHELPER_ACTIONS)
+ return (EINVAL);
+
+ last = help->dthps_actions[which];
+ vstate = &help->dthps_vstate;
+
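+	/*
+	 * Walk to the tail of this action list, counting entries as we go;
+	 * 'last' is left pointing at the final element (or is NULL for an
+	 * empty list) so that the new helper can be appended below.
+	 */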
+ for (count = 0; last != NULL; last = last->dtha_next) {
+ count++;
+ if (last->dtha_next == NULL)
+ break;
+ }
+
+ /*
+ * If we already have dtrace_helper_actions_max helper actions for this
+ * helper action type, we'll refuse to add a new one.
+ */
+ if (count >= dtrace_helper_actions_max)
+ return (ENOSPC);
+
+ helper = kmem_zalloc(sizeof (dtrace_helper_action_t), KM_SLEEP);
+ helper->dtha_generation = help->dthps_generation;
+
+ if ((pred = ep->dted_pred.dtpdd_predicate) != NULL) {
+ ASSERT(pred->dtp_difo != NULL);
+ dtrace_difo_hold(pred->dtp_difo);
+ helper->dtha_predicate = pred->dtp_difo;
+ }
+
+ for (act = ep->dted_action; act != NULL; act = act->dtad_next) {
+ if (act->dtad_kind != DTRACEACT_DIFEXPR)
+ goto err;
+
+ if (act->dtad_difo == NULL)
+ goto err;
+
+ nactions++;
+ }
+
+ helper->dtha_actions = kmem_zalloc(sizeof (dtrace_difo_t *) *
+ (helper->dtha_nactions = nactions), KM_SLEEP);
+
+ for (act = ep->dted_action, i = 0; act != NULL; act = act->dtad_next) {
+ dtrace_difo_hold(act->dtad_difo);
+ helper->dtha_actions[i++] = act->dtad_difo;
+ }
+
+ if (!dtrace_helper_validate(helper))
+ goto err;
+
+ if (last == NULL) {
+ help->dthps_actions[which] = helper;
+ } else {
+ last->dtha_next = helper;
+ }
+
+ if (vstate->dtvs_nlocals > dtrace_helptrace_nlocals) {
+ dtrace_helptrace_nlocals = vstate->dtvs_nlocals;
+ dtrace_helptrace_next = 0;
+ }
+
+ return (0);
+err:
+ dtrace_helper_action_destroy(helper, vstate);
+ return (EINVAL);
+}
+
+static void
+dtrace_helper_provider_register(proc_t *p, dtrace_helpers_t *help,
+ dof_helper_t *dofhp)
+{
+ ASSERT(MUTEX_NOT_HELD(&dtrace_lock));
+
+ mutex_enter(&dtrace_meta_lock);
+ mutex_enter(&dtrace_lock);
+
+ if (!dtrace_attached() || dtrace_meta_pid == NULL) {
+ /*
+ * If the dtrace module is loaded but not attached, or if
+		 * there isn't a meta provider registered to deal with
+ * these provider descriptions, we need to postpone creating
+ * the actual providers until later.
+ */
+
+ if (help->dthps_next == NULL && help->dthps_prev == NULL &&
+ dtrace_deferred_pid != help) {
+ help->dthps_deferred = 1;
+ help->dthps_pid = p->p_pid;
+ help->dthps_next = dtrace_deferred_pid;
+ help->dthps_prev = NULL;
+ if (dtrace_deferred_pid != NULL)
+ dtrace_deferred_pid->dthps_prev = help;
+ dtrace_deferred_pid = help;
+ }
+
+ mutex_exit(&dtrace_lock);
+
+ } else if (dofhp != NULL) {
+ /*
+ * If the dtrace module is loaded and we have a particular
+ * helper provider description, pass that off to the
+ * meta provider.
+ */
+
+ mutex_exit(&dtrace_lock);
+
+ dtrace_helper_provide(dofhp, p->p_pid);
+
+ } else {
+ /*
+ * Otherwise, just pass all the helper provider descriptions
+ * off to the meta provider.
+ */
+
+ int i;
+ mutex_exit(&dtrace_lock);
+
+ for (i = 0; i < help->dthps_nprovs; i++) {
+ dtrace_helper_provide(&help->dthps_provs[i]->dthp_prov,
+ p->p_pid);
+ }
+ }
+
+ mutex_exit(&dtrace_meta_lock);
+}
+
+static int
+dtrace_helper_provider_add(dof_helper_t *dofhp, dtrace_helpers_t *help, int gen)
+{
+ dtrace_helper_provider_t *hprov, **tmp_provs;
+ uint_t tmp_maxprovs, i;
+
+ ASSERT(MUTEX_HELD(&dtrace_lock));
+ ASSERT(help != NULL);
+
+ /*
+ * If we already have dtrace_helper_providers_max helper providers,
+	 * we refuse to add a new one.
+ */
+ if (help->dthps_nprovs >= dtrace_helper_providers_max)
+ return (ENOSPC);
+
+ /*
+ * Check to make sure this isn't a duplicate.
+ */
+ for (i = 0; i < help->dthps_nprovs; i++) {
+ if (dofhp->dofhp_addr ==
+ help->dthps_provs[i]->dthp_prov.dofhp_addr)
+ return (EALREADY);
+ }
+
+ hprov = kmem_zalloc(sizeof (dtrace_helper_provider_t), KM_SLEEP);
+ hprov->dthp_prov = *dofhp;
+ hprov->dthp_ref = 1;
+ hprov->dthp_generation = gen;
+
+ /*
+ * Allocate a bigger table for helper providers if it's already full.
+ */
+ if (help->dthps_maxprovs == help->dthps_nprovs) {
+ tmp_maxprovs = help->dthps_maxprovs;
+ tmp_provs = help->dthps_provs;
+
+ if (help->dthps_maxprovs == 0)
+ help->dthps_maxprovs = 2;
+ else
+ help->dthps_maxprovs *= 2;
+ if (help->dthps_maxprovs > dtrace_helper_providers_max)
+ help->dthps_maxprovs = dtrace_helper_providers_max;
+
+ ASSERT(tmp_maxprovs < help->dthps_maxprovs);
+
+ help->dthps_provs = kmem_zalloc(help->dthps_maxprovs *
+ sizeof (dtrace_helper_provider_t *), KM_SLEEP);
+
+ if (tmp_provs != NULL) {
+ bcopy(tmp_provs, help->dthps_provs, tmp_maxprovs *
+ sizeof (dtrace_helper_provider_t *));
+ kmem_free(tmp_provs, tmp_maxprovs *
+ sizeof (dtrace_helper_provider_t *));
+ }
+ }
+
+ help->dthps_provs[help->dthps_nprovs] = hprov;
+ help->dthps_nprovs++;
+
+ return (0);
+}
+
+static void
+dtrace_helper_provider_destroy(dtrace_helper_provider_t *hprov)
+{
+ mutex_enter(&dtrace_lock);
+
+ if (--hprov->dthp_ref == 0) {
+ dof_hdr_t *dof;
+ mutex_exit(&dtrace_lock);
+ dof = (dof_hdr_t *)(uintptr_t)hprov->dthp_prov.dofhp_dof;
+ dtrace_dof_destroy(dof);
+ kmem_free(hprov, sizeof (dtrace_helper_provider_t));
+ } else {
+ mutex_exit(&dtrace_lock);
+ }
+}
+
+static int
+dtrace_helper_provider_validate(dof_hdr_t *dof, dof_sec_t *sec)
+{
+ uintptr_t daddr = (uintptr_t)dof;
+ dof_sec_t *str_sec, *prb_sec, *arg_sec, *off_sec, *enoff_sec;
+ dof_provider_t *provider;
+ dof_probe_t *probe;
+ uint8_t *arg;
+ char *strtab, *typestr;
+ dof_stridx_t typeidx;
+ size_t typesz;
+ uint_t nprobes, j, k;
+
+ ASSERT(sec->dofs_type == DOF_SECT_PROVIDER);
+
+ if (sec->dofs_offset & (sizeof (uint_t) - 1)) {
+ dtrace_dof_error(dof, "misaligned section offset");
+ return (-1);
+ }
+
+ /*
+ * The section needs to be large enough to contain the DOF provider
+ * structure appropriate for the given version.
+ */
+ if (sec->dofs_size <
+ ((dof->dofh_ident[DOF_ID_VERSION] == DOF_VERSION_1) ?
+ offsetof(dof_provider_t, dofpv_prenoffs) :
+ sizeof (dof_provider_t))) {
+ dtrace_dof_error(dof, "provider section too small");
+ return (-1);
+ }
+
+ provider = (dof_provider_t *)(uintptr_t)(daddr + sec->dofs_offset);
+ str_sec = dtrace_dof_sect(dof, DOF_SECT_STRTAB, provider->dofpv_strtab);
+ prb_sec = dtrace_dof_sect(dof, DOF_SECT_PROBES, provider->dofpv_probes);
+ arg_sec = dtrace_dof_sect(dof, DOF_SECT_PRARGS, provider->dofpv_prargs);
+ off_sec = dtrace_dof_sect(dof, DOF_SECT_PROFFS, provider->dofpv_proffs);
+
+ if (str_sec == NULL || prb_sec == NULL ||
+ arg_sec == NULL || off_sec == NULL)
+ return (-1);
+
+ enoff_sec = NULL;
+
+ if (dof->dofh_ident[DOF_ID_VERSION] != DOF_VERSION_1 &&
+ provider->dofpv_prenoffs != DOF_SECT_NONE &&
+ (enoff_sec = dtrace_dof_sect(dof, DOF_SECT_PRENOFFS,
+ provider->dofpv_prenoffs)) == NULL)
+ return (-1);
+
+ strtab = (char *)(uintptr_t)(daddr + str_sec->dofs_offset);
+
+ if (provider->dofpv_name >= str_sec->dofs_size ||
+ strlen(strtab + provider->dofpv_name) >= DTRACE_PROVNAMELEN) {
+ dtrace_dof_error(dof, "invalid provider name");
+ return (-1);
+ }
+
+ if (prb_sec->dofs_entsize == 0 ||
+ prb_sec->dofs_entsize > prb_sec->dofs_size) {
+ dtrace_dof_error(dof, "invalid entry size");
+ return (-1);
+ }
+
+ if (prb_sec->dofs_entsize & (sizeof (uintptr_t) - 1)) {
+ dtrace_dof_error(dof, "misaligned entry size");
+ return (-1);
+ }
+
+ if (off_sec->dofs_entsize != sizeof (uint32_t)) {
+ dtrace_dof_error(dof, "invalid entry size");
+ return (-1);
+ }
+
+ if (off_sec->dofs_offset & (sizeof (uint32_t) - 1)) {
+ dtrace_dof_error(dof, "misaligned section offset");
+ return (-1);
+ }
+
+ if (arg_sec->dofs_entsize != sizeof (uint8_t)) {
+ dtrace_dof_error(dof, "invalid entry size");
+ return (-1);
+ }
+
+ arg = (uint8_t *)(uintptr_t)(daddr + arg_sec->dofs_offset);
+
+ nprobes = prb_sec->dofs_size / prb_sec->dofs_entsize;
+
+ /*
+ * Take a pass through the probes to check for errors.
+ */
+ for (j = 0; j < nprobes; j++) {
+ probe = (dof_probe_t *)(uintptr_t)(daddr +
+ prb_sec->dofs_offset + j * prb_sec->dofs_entsize);
+
+ if (probe->dofpr_func >= str_sec->dofs_size) {
+ dtrace_dof_error(dof, "invalid function name");
+ return (-1);
+ }
+
+ if (strlen(strtab + probe->dofpr_func) >= DTRACE_FUNCNAMELEN) {
+ dtrace_dof_error(dof, "function name too long");
+ /*
+ * Keep going if the function name is too long.
+ * Unlike provider and probe names, we cannot reasonably
+ * impose restrictions on function names, since they're
+ * a property of the code being instrumented. We will
+ * skip this probe in dtrace_helper_provide_one().
+ */
+ }
+
+ if (probe->dofpr_name >= str_sec->dofs_size ||
+ strlen(strtab + probe->dofpr_name) >= DTRACE_NAMELEN) {
+ dtrace_dof_error(dof, "invalid probe name");
+ return (-1);
+ }
+
+ /*
+ * The offset count must not wrap the index, and the offsets
+ * must also not overflow the section's data.
+ */
+ if (probe->dofpr_offidx + probe->dofpr_noffs <
+ probe->dofpr_offidx ||
+ (probe->dofpr_offidx + probe->dofpr_noffs) *
+ off_sec->dofs_entsize > off_sec->dofs_size) {
+ dtrace_dof_error(dof, "invalid probe offset");
+ return (-1);
+ }
+
+ if (dof->dofh_ident[DOF_ID_VERSION] != DOF_VERSION_1) {
+ /*
+ * If there's no is-enabled offset section, make sure
+ * there aren't any is-enabled offsets. Otherwise
+ * perform the same checks as for probe offsets
+ * (immediately above).
+ */
+ if (enoff_sec == NULL) {
+ if (probe->dofpr_enoffidx != 0 ||
+ probe->dofpr_nenoffs != 0) {
+ dtrace_dof_error(dof, "is-enabled "
+ "offsets with null section");
+ return (-1);
+ }
+ } else if (probe->dofpr_enoffidx +
+ probe->dofpr_nenoffs < probe->dofpr_enoffidx ||
+ (probe->dofpr_enoffidx + probe->dofpr_nenoffs) *
+ enoff_sec->dofs_entsize > enoff_sec->dofs_size) {
+ dtrace_dof_error(dof, "invalid is-enabled "
+ "offset");
+ return (-1);
+ }
+
+ if (probe->dofpr_noffs + probe->dofpr_nenoffs == 0) {
+ dtrace_dof_error(dof, "zero probe and "
+ "is-enabled offsets");
+ return (-1);
+ }
+ } else if (probe->dofpr_noffs == 0) {
+ dtrace_dof_error(dof, "zero probe offsets");
+ return (-1);
+ }
+
+ if (probe->dofpr_argidx + probe->dofpr_xargc <
+ probe->dofpr_argidx ||
+ (probe->dofpr_argidx + probe->dofpr_xargc) *
+ arg_sec->dofs_entsize > arg_sec->dofs_size) {
+ dtrace_dof_error(dof, "invalid args");
+ return (-1);
+ }
+
+ typeidx = probe->dofpr_nargv;
+ typestr = strtab + probe->dofpr_nargv;
+ for (k = 0; k < probe->dofpr_nargc; k++) {
+ if (typeidx >= str_sec->dofs_size) {
+ dtrace_dof_error(dof, "bad "
+ "native argument type");
+ return (-1);
+ }
+
+ typesz = strlen(typestr) + 1;
+ if (typesz > DTRACE_ARGTYPELEN) {
+ dtrace_dof_error(dof, "native "
+ "argument type too long");
+ return (-1);
+ }
+ typeidx += typesz;
+ typestr += typesz;
+ }
+
+ typeidx = probe->dofpr_xargv;
+ typestr = strtab + probe->dofpr_xargv;
+ for (k = 0; k < probe->dofpr_xargc; k++) {
+ if (arg[probe->dofpr_argidx + k] > probe->dofpr_nargc) {
+ dtrace_dof_error(dof, "bad "
+ "native argument index");
+ return (-1);
+ }
+
+ if (typeidx >= str_sec->dofs_size) {
+ dtrace_dof_error(dof, "bad "
+ "translated argument type");
+ return (-1);
+ }
+
+ typesz = strlen(typestr) + 1;
+ if (typesz > DTRACE_ARGTYPELEN) {
+ dtrace_dof_error(dof, "translated argument "
+ "type too long");
+ return (-1);
+ }
+
+ typeidx += typesz;
+ typestr += typesz;
+ }
+ }
+
+ return (0);
+}
+
+static int
+dtrace_helper_slurp(dof_hdr_t *dof, dof_helper_t *dhp, struct proc *p)
+{
+ dtrace_helpers_t *help;
+ dtrace_vstate_t *vstate;
+ dtrace_enabling_t *enab = NULL;
+ int i, gen, rv, nhelpers = 0, nprovs = 0, destroy = 1;
+ uintptr_t daddr = (uintptr_t)dof;
+
+ ASSERT(MUTEX_HELD(&dtrace_lock));
+
+ if ((help = p->p_dtrace_helpers) == NULL)
+ help = dtrace_helpers_create(p);
+
+ vstate = &help->dthps_vstate;
+
+ if ((rv = dtrace_dof_slurp(dof, vstate, NULL, &enab, dhp->dofhp_addr,
+ dhp->dofhp_dof, B_FALSE)) != 0) {
+ dtrace_dof_destroy(dof);
+ return (rv);
+ }
+
+ /*
+ * Look for helper providers and validate their descriptions.
+ */
+ for (i = 0; i < dof->dofh_secnum; i++) {
+ dof_sec_t *sec = (dof_sec_t *)(uintptr_t)(daddr +
+ dof->dofh_secoff + i * dof->dofh_secsize);
+
+ if (sec->dofs_type != DOF_SECT_PROVIDER)
+ continue;
+
+ if (dtrace_helper_provider_validate(dof, sec) != 0) {
+ dtrace_enabling_destroy(enab);
+ dtrace_dof_destroy(dof);
+ return (-1);
+ }
+
+ nprovs++;
+ }
+
+ /*
+ * Now we need to walk through the ECB descriptions in the enabling.
+ */
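+	/*
+	 * Only descriptions of the dtrace:helper:ustack: pseudo-probe can
+	 * become helper actions; anything else is skipped here and reported
+	 * as unmatched below.
+	 */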
+ for (i = 0; i < enab->dten_ndesc; i++) {
+ dtrace_ecbdesc_t *ep = enab->dten_desc[i];
+ dtrace_probedesc_t *desc = &ep->dted_probe;
+
+ if (strcmp(desc->dtpd_provider, "dtrace") != 0)
+ continue;
+
+ if (strcmp(desc->dtpd_mod, "helper") != 0)
+ continue;
+
+ if (strcmp(desc->dtpd_func, "ustack") != 0)
+ continue;
+
+ if ((rv = dtrace_helper_action_add(DTRACE_HELPER_ACTION_USTACK,
+ ep, help)) != 0) {
+ /*
+ * Adding this helper action failed -- we are now going
+ * to rip out the entire generation and return failure.
+ */
+ (void) dtrace_helper_destroygen(help,
+ help->dthps_generation);
+ dtrace_enabling_destroy(enab);
+ dtrace_dof_destroy(dof);
+ return (-1);
+ }
+
+ nhelpers++;
+ }
+
+ if (nhelpers < enab->dten_ndesc)
+ dtrace_dof_error(dof, "unmatched helpers");
+
+ gen = help->dthps_generation++;
+ dtrace_enabling_destroy(enab);
+
+ if (nprovs > 0) {
+ /*
+ * Now that this is in-kernel, we change the sense of the
+ * members: dofhp_dof denotes the in-kernel copy of the DOF
+ * and dofhp_addr denotes the address at user-level.
+ */
+ dhp->dofhp_addr = dhp->dofhp_dof;
+ dhp->dofhp_dof = (uint64_t)(uintptr_t)dof;
+
+ if (dtrace_helper_provider_add(dhp, help, gen) == 0) {
+ mutex_exit(&dtrace_lock);
+ dtrace_helper_provider_register(p, help, dhp);
+ mutex_enter(&dtrace_lock);
+
+ destroy = 0;
+ }
+ }
+
+ if (destroy)
+ dtrace_dof_destroy(dof);
+
+ return (gen);
+}
+
+static dtrace_helpers_t *
+dtrace_helpers_create(proc_t *p)
+{
+ dtrace_helpers_t *help;
+
+ ASSERT(MUTEX_HELD(&dtrace_lock));
+ ASSERT(p->p_dtrace_helpers == NULL);
+
+ help = kmem_zalloc(sizeof (dtrace_helpers_t), KM_SLEEP);
+ help->dthps_actions = kmem_zalloc(sizeof (dtrace_helper_action_t *) *
+ DTRACE_NHELPER_ACTIONS, KM_SLEEP);
+
+ p->p_dtrace_helpers = help;
+ dtrace_helpers++;
+
+ return (help);
+}
+
+#ifdef illumos
+static
+#endif
+void
+dtrace_helpers_destroy(proc_t *p)
+{
+ dtrace_helpers_t *help;
+ dtrace_vstate_t *vstate;
+ int i;
+
+ mutex_enter(&dtrace_lock);
+
+ ASSERT(p->p_dtrace_helpers != NULL);
+ ASSERT(dtrace_helpers > 0);
+
+ help = p->p_dtrace_helpers;
+ vstate = &help->dthps_vstate;
+
+ /*
+ * We're now going to lose the help from this process.
+ */
+ p->p_dtrace_helpers = NULL;
+ dtrace_sync();
+
+ /*
+	 * Destroy the helper actions.
+ */
+ for (i = 0; i < DTRACE_NHELPER_ACTIONS; i++) {
+ dtrace_helper_action_t *h, *next;
+
+ for (h = help->dthps_actions[i]; h != NULL; h = next) {
+ next = h->dtha_next;
+ dtrace_helper_action_destroy(h, vstate);
+ }
+ }
+
+ mutex_exit(&dtrace_lock);
+
+ /*
+ * Destroy the helper providers.
+ */
+ if (help->dthps_maxprovs > 0) {
+ mutex_enter(&dtrace_meta_lock);
+ if (dtrace_meta_pid != NULL) {
+ ASSERT(dtrace_deferred_pid == NULL);
+
+ for (i = 0; i < help->dthps_nprovs; i++) {
+ dtrace_helper_provider_remove(
+ &help->dthps_provs[i]->dthp_prov, p->p_pid);
+ }
+ } else {
+ mutex_enter(&dtrace_lock);
+ ASSERT(help->dthps_deferred == 0 ||
+ help->dthps_next != NULL ||
+ help->dthps_prev != NULL ||
+ help == dtrace_deferred_pid);
+
+ /*
+ * Remove the helper from the deferred list.
+ */
+ if (help->dthps_next != NULL)
+ help->dthps_next->dthps_prev = help->dthps_prev;
+ if (help->dthps_prev != NULL)
+ help->dthps_prev->dthps_next = help->dthps_next;
+ if (dtrace_deferred_pid == help) {
+ dtrace_deferred_pid = help->dthps_next;
+ ASSERT(help->dthps_prev == NULL);
+ }
+
+ mutex_exit(&dtrace_lock);
+ }
+
+ mutex_exit(&dtrace_meta_lock);
+
+ for (i = 0; i < help->dthps_nprovs; i++) {
+ dtrace_helper_provider_destroy(help->dthps_provs[i]);
+ }
+
+ kmem_free(help->dthps_provs, help->dthps_maxprovs *
+ sizeof (dtrace_helper_provider_t *));
+ }
+
+ mutex_enter(&dtrace_lock);
+
+ dtrace_vstate_fini(&help->dthps_vstate);
+ kmem_free(help->dthps_actions,
+ sizeof (dtrace_helper_action_t *) * DTRACE_NHELPER_ACTIONS);
+ kmem_free(help, sizeof (dtrace_helpers_t));
+
+ --dtrace_helpers;
+ mutex_exit(&dtrace_lock);
+}
+
+#ifdef illumos
+static
+#endif
+void
+dtrace_helpers_duplicate(proc_t *from, proc_t *to)
+{
+ dtrace_helpers_t *help, *newhelp;
+ dtrace_helper_action_t *helper, *new, *last;
+ dtrace_difo_t *dp;
+ dtrace_vstate_t *vstate;
+ int i, j, sz, hasprovs = 0;
+
+ mutex_enter(&dtrace_lock);
+ ASSERT(from->p_dtrace_helpers != NULL);
+ ASSERT(dtrace_helpers > 0);
+
+ help = from->p_dtrace_helpers;
+ newhelp = dtrace_helpers_create(to);
+ ASSERT(to->p_dtrace_helpers != NULL);
+
+ newhelp->dthps_generation = help->dthps_generation;
+ vstate = &newhelp->dthps_vstate;
+
+ /*
+ * Duplicate the helper actions.
+ */
+ for (i = 0; i < DTRACE_NHELPER_ACTIONS; i++) {
+ if ((helper = help->dthps_actions[i]) == NULL)
+ continue;
+
+ for (last = NULL; helper != NULL; helper = helper->dtha_next) {
+ new = kmem_zalloc(sizeof (dtrace_helper_action_t),
+ KM_SLEEP);
+ new->dtha_generation = helper->dtha_generation;
+
+ if ((dp = helper->dtha_predicate) != NULL) {
+ dp = dtrace_difo_duplicate(dp, vstate);
+ new->dtha_predicate = dp;
+ }
+
+ new->dtha_nactions = helper->dtha_nactions;
+ sz = sizeof (dtrace_difo_t *) * new->dtha_nactions;
+ new->dtha_actions = kmem_alloc(sz, KM_SLEEP);
+
+ for (j = 0; j < new->dtha_nactions; j++) {
+ dtrace_difo_t *dp = helper->dtha_actions[j];
+
+ ASSERT(dp != NULL);
+ dp = dtrace_difo_duplicate(dp, vstate);
+ new->dtha_actions[j] = dp;
+ }
+
+ if (last != NULL) {
+ last->dtha_next = new;
+ } else {
+ newhelp->dthps_actions[i] = new;
+ }
+
+ last = new;
+ }
+ }
+
+ /*
+ * Duplicate the helper providers and register them with the
+ * DTrace framework.
+ */
+ if (help->dthps_nprovs > 0) {
+ newhelp->dthps_nprovs = help->dthps_nprovs;
+ newhelp->dthps_maxprovs = help->dthps_nprovs;
+ newhelp->dthps_provs = kmem_alloc(newhelp->dthps_nprovs *
+ sizeof (dtrace_helper_provider_t *), KM_SLEEP);
+ for (i = 0; i < newhelp->dthps_nprovs; i++) {
+ newhelp->dthps_provs[i] = help->dthps_provs[i];
+ newhelp->dthps_provs[i]->dthp_ref++;
+ }
+
+ hasprovs = 1;
+ }
+
+ mutex_exit(&dtrace_lock);
+
+ if (hasprovs)
+ dtrace_helper_provider_register(to, newhelp, NULL);
+}
+
+/*
+ * DTrace Hook Functions
+ */
+static void
+dtrace_module_loaded(modctl_t *ctl)
+{
+ dtrace_provider_t *prv;
+
+ mutex_enter(&dtrace_provider_lock);
+#ifdef illumos
+ mutex_enter(&mod_lock);
+#endif
+
+#ifdef illumos
+ ASSERT(ctl->mod_busy);
+#endif
+
+ /*
+	 * We're going to call each provider's per-module provide operation
+ * specifying only this module.
+ */
+ for (prv = dtrace_provider; prv != NULL; prv = prv->dtpv_next)
+ prv->dtpv_pops.dtps_provide_module(prv->dtpv_arg, ctl);
+
+#ifdef illumos
+ mutex_exit(&mod_lock);
+#endif
+ mutex_exit(&dtrace_provider_lock);
+
+ /*
+ * If we have any retained enablings, we need to match against them.
+ * Enabling probes requires that cpu_lock be held, and we cannot hold
+ * cpu_lock here -- it is legal for cpu_lock to be held when loading a
+ * module. (In particular, this happens when loading scheduling
+ * classes.) So if we have any retained enablings, we need to dispatch
+ * our task queue to do the match for us.
+ */
+ mutex_enter(&dtrace_lock);
+
+ if (dtrace_retained == NULL) {
+ mutex_exit(&dtrace_lock);
+ return;
+ }
+
+ (void) taskq_dispatch(dtrace_taskq,
+ (task_func_t *)dtrace_enabling_matchall, NULL, TQ_SLEEP);
+
+ mutex_exit(&dtrace_lock);
+
+ /*
+ * And now, for a little heuristic sleaze: in general, we want to
+ * match modules as soon as they load. However, we cannot guarantee
+ * this, because it would lead us to the lock ordering violation
+ * outlined above. The common case, of course, is that cpu_lock is
+ * _not_ held -- so we delay here for a clock tick, hoping that that's
+ * long enough for the task queue to do its work. If it's not, it's
+ * not a serious problem -- it just means that the module that we
+ * just loaded may not be immediately instrumentable.
+ */
+ delay(1);
+}
+
+static void
+#ifdef illumos
+dtrace_module_unloaded(modctl_t *ctl)
+#else
+dtrace_module_unloaded(modctl_t *ctl, int *error)
+#endif
+{
+ dtrace_probe_t template, *probe, *first, *next;
+ dtrace_provider_t *prov;
+#ifndef illumos
+ char modname[DTRACE_MODNAMELEN];
+ size_t len;
+#endif
+
+#ifdef illumos
+ template.dtpr_mod = ctl->mod_modname;
+#else
+ /* Handle the fact that ctl->filename may end in ".ko". */
+ strlcpy(modname, ctl->filename, sizeof(modname));
+ len = strlen(ctl->filename);
+ if (len > 3 && strcmp(modname + len - 3, ".ko") == 0)
+ modname[len - 3] = '\0';
+ template.dtpr_mod = modname;
+#endif
+
+ mutex_enter(&dtrace_provider_lock);
+#ifdef illumos
+ mutex_enter(&mod_lock);
+#endif
+ mutex_enter(&dtrace_lock);
+
+#ifndef illumos
+ if (ctl->nenabled > 0) {
+ /* Don't allow unloads if a probe is enabled. */
+ mutex_exit(&dtrace_provider_lock);
+ mutex_exit(&dtrace_lock);
+ *error = -1;
+ printf(
+ "kldunload: attempt to unload module that has DTrace probes enabled\n");
+ return;
+ }
+#endif
+
+ if (dtrace_bymod == NULL) {
+ /*
+ * The DTrace module is loaded (obviously) but not attached;
+ * we don't have any work to do.
+ */
+ mutex_exit(&dtrace_provider_lock);
+#ifdef illumos
+ mutex_exit(&mod_lock);
+#endif
+ mutex_exit(&dtrace_lock);
+ return;
+ }
+
+ for (probe = first = dtrace_hash_lookup(dtrace_bymod, &template);
+ probe != NULL; probe = probe->dtpr_nextmod) {
+ if (probe->dtpr_ecb != NULL) {
+ mutex_exit(&dtrace_provider_lock);
+#ifdef illumos
+ mutex_exit(&mod_lock);
+#endif
+ mutex_exit(&dtrace_lock);
+
+ /*
+ * This shouldn't _actually_ be possible -- we're
+ * unloading a module that has an enabled probe in it.
+ * (It's normally up to the provider to make sure that
+ * this can't happen.) However, because dtps_enable()
+ * doesn't have a failure mode, there can be an
+ * enable/unload race. Upshot: we don't want to
+ * assert, but we're not going to disable the
+ * probe, either.
+ */
+ if (dtrace_err_verbose) {
+#ifdef illumos
+ cmn_err(CE_WARN, "unloaded module '%s' had "
+ "enabled probes", ctl->mod_modname);
+#else
+ cmn_err(CE_WARN, "unloaded module '%s' had "
+ "enabled probes", modname);
+#endif
+ }
+
+ return;
+ }
+ }
+
+ probe = first;
+
+ for (first = NULL; probe != NULL; probe = next) {
+ ASSERT(dtrace_probes[probe->dtpr_id - 1] == probe);
+
+ dtrace_probes[probe->dtpr_id - 1] = NULL;
+
+ next = probe->dtpr_nextmod;
+ dtrace_hash_remove(dtrace_bymod, probe);
+ dtrace_hash_remove(dtrace_byfunc, probe);
+ dtrace_hash_remove(dtrace_byname, probe);
+
+ if (first == NULL) {
+ first = probe;
+ probe->dtpr_nextmod = NULL;
+ } else {
+ probe->dtpr_nextmod = first;
+ first = probe;
+ }
+ }
+
+ /*
+ * We've removed all of the module's probes from the hash chains and
+ * from the probe array. Now issue a dtrace_sync() to be sure that
+ * everyone has cleared out from any probe array processing.
+ */
+ dtrace_sync();
+
+ for (probe = first; probe != NULL; probe = first) {
+ first = probe->dtpr_nextmod;
+ prov = probe->dtpr_provider;
+ prov->dtpv_pops.dtps_destroy(prov->dtpv_arg, probe->dtpr_id,
+ probe->dtpr_arg);
+ kmem_free(probe->dtpr_mod, strlen(probe->dtpr_mod) + 1);
+ kmem_free(probe->dtpr_func, strlen(probe->dtpr_func) + 1);
+ kmem_free(probe->dtpr_name, strlen(probe->dtpr_name) + 1);
+#ifdef illumos
+ vmem_free(dtrace_arena, (void *)(uintptr_t)probe->dtpr_id, 1);
+#else
+ free_unr(dtrace_arena, probe->dtpr_id);
+#endif
+ kmem_free(probe, sizeof (dtrace_probe_t));
+ }
+
+ mutex_exit(&dtrace_lock);
+#ifdef illumos
+ mutex_exit(&mod_lock);
+#endif
+ mutex_exit(&dtrace_provider_lock);
+}
+
+#ifndef illumos
+static void
+dtrace_kld_load(void *arg __unused, linker_file_t lf)
+{
+
+ dtrace_module_loaded(lf);
+}
+
+static void
+dtrace_kld_unload_try(void *arg __unused, linker_file_t lf, int *error)
+{
+
+ if (*error != 0)
+ /* We already have an error, so don't do anything. */
+ return;
+ dtrace_module_unloaded(lf, error);
+}
+#endif
+
+#ifdef illumos
+static void
+dtrace_suspend(void)
+{
+ dtrace_probe_foreach(offsetof(dtrace_pops_t, dtps_suspend));
+}
+
+static void
+dtrace_resume(void)
+{
+ dtrace_probe_foreach(offsetof(dtrace_pops_t, dtps_resume));
+}
+#endif
+
+static int
+dtrace_cpu_setup(cpu_setup_t what, processorid_t cpu)
+{
+ ASSERT(MUTEX_HELD(&cpu_lock));
+ mutex_enter(&dtrace_lock);
+
+ switch (what) {
+ case CPU_CONFIG: {
+ dtrace_state_t *state;
+ dtrace_optval_t *opt, rs, c;
+
+ /*
+ * For now, we only allocate a new buffer for anonymous state.
+ */
+ if ((state = dtrace_anon.dta_state) == NULL)
+ break;
+
+ if (state->dts_activity != DTRACE_ACTIVITY_ACTIVE)
+ break;
+
+ opt = state->dts_options;
+ c = opt[DTRACEOPT_CPU];
+
+ if (c != DTRACE_CPUALL && c != DTRACEOPT_UNSET && c != cpu)
+ break;
+
+ /*
+ * Regardless of what the actual policy is, we're going to
+ * temporarily set our resize policy to be manual. We're
+ * also going to temporarily set our CPU option to denote
+ * the newly configured CPU.
+ */
+ rs = opt[DTRACEOPT_BUFRESIZE];
+ opt[DTRACEOPT_BUFRESIZE] = DTRACEOPT_BUFRESIZE_MANUAL;
+ opt[DTRACEOPT_CPU] = (dtrace_optval_t)cpu;
+
+ (void) dtrace_state_buffers(state);
+
+ opt[DTRACEOPT_BUFRESIZE] = rs;
+ opt[DTRACEOPT_CPU] = c;
+
+ break;
+ }
+
+ case CPU_UNCONFIG:
+ /*
+ * We don't free the buffer in the CPU_UNCONFIG case. (The
+ * buffer will be freed when the consumer exits.)
+ */
+ break;
+
+ default:
+ break;
+ }
+
+ mutex_exit(&dtrace_lock);
+ return (0);
+}
+
+#ifdef illumos
+static void
+dtrace_cpu_setup_initial(processorid_t cpu)
+{
+ (void) dtrace_cpu_setup(CPU_CONFIG, cpu);
+}
+#endif
+
+static void
+dtrace_toxrange_add(uintptr_t base, uintptr_t limit)
+{
+ if (dtrace_toxranges >= dtrace_toxranges_max) {
+ int osize, nsize;
+ dtrace_toxrange_t *range;
+
+ osize = dtrace_toxranges_max * sizeof (dtrace_toxrange_t);
+
+ if (osize == 0) {
+ ASSERT(dtrace_toxrange == NULL);
+ ASSERT(dtrace_toxranges_max == 0);
+ dtrace_toxranges_max = 1;
+ } else {
+ dtrace_toxranges_max <<= 1;
+ }
+
+ nsize = dtrace_toxranges_max * sizeof (dtrace_toxrange_t);
+ range = kmem_zalloc(nsize, KM_SLEEP);
+
+ if (dtrace_toxrange != NULL) {
+ ASSERT(osize != 0);
+ bcopy(dtrace_toxrange, range, osize);
+ kmem_free(dtrace_toxrange, osize);
+ }
+
+ dtrace_toxrange = range;
+ }
+
+ ASSERT(dtrace_toxrange[dtrace_toxranges].dtt_base == 0);
+ ASSERT(dtrace_toxrange[dtrace_toxranges].dtt_limit == 0);
+
+ dtrace_toxrange[dtrace_toxranges].dtt_base = base;
+ dtrace_toxrange[dtrace_toxranges].dtt_limit = limit;
+ dtrace_toxranges++;
+}
+
+static void
+dtrace_getf_barrier()
+{
+#ifdef illumos
+ /*
+ * When we have unprivileged (that is, non-DTRACE_CRV_KERNEL) enablings
+ * that contain calls to getf(), this routine will be called on every
+ * closef() before either the underlying vnode is released or the
+ * file_t itself is freed. By the time we are here, it is essential
+ * that the file_t can no longer be accessed from a call to getf()
+ * in probe context -- that assures that a dtrace_sync() can be used
+ * to clear out any enablings referring to the old structures.
+ */
+ if (curthread->t_procp->p_zone->zone_dtrace_getf != 0 ||
+ kcred->cr_zone->zone_dtrace_getf != 0)
+ dtrace_sync();
+#endif
+}
+
+/*
+ * DTrace Driver Cookbook Functions
+ */
+#ifdef illumos
+/*ARGSUSED*/
+static int
+dtrace_attach(dev_info_t *devi, ddi_attach_cmd_t cmd)
+{
+ dtrace_provider_id_t id;
+ dtrace_state_t *state = NULL;
+ dtrace_enabling_t *enab;
+
+ mutex_enter(&cpu_lock);
+ mutex_enter(&dtrace_provider_lock);
+ mutex_enter(&dtrace_lock);
+
+ if (ddi_soft_state_init(&dtrace_softstate,
+ sizeof (dtrace_state_t), 0) != 0) {
+ cmn_err(CE_NOTE, "/dev/dtrace failed to initialize soft state");
+ mutex_exit(&cpu_lock);
+ mutex_exit(&dtrace_provider_lock);
+ mutex_exit(&dtrace_lock);
+ return (DDI_FAILURE);
+ }
+
+ if (ddi_create_minor_node(devi, DTRACEMNR_DTRACE, S_IFCHR,
+ DTRACEMNRN_DTRACE, DDI_PSEUDO, NULL) == DDI_FAILURE ||
+ ddi_create_minor_node(devi, DTRACEMNR_HELPER, S_IFCHR,
+ DTRACEMNRN_HELPER, DDI_PSEUDO, NULL) == DDI_FAILURE) {
+ cmn_err(CE_NOTE, "/dev/dtrace couldn't create minor nodes");
+ ddi_remove_minor_node(devi, NULL);
+ ddi_soft_state_fini(&dtrace_softstate);
+ mutex_exit(&cpu_lock);
+ mutex_exit(&dtrace_provider_lock);
+ mutex_exit(&dtrace_lock);
+ return (DDI_FAILURE);
+ }
+
+ ddi_report_dev(devi);
+ dtrace_devi = devi;
+
+ dtrace_modload = dtrace_module_loaded;
+ dtrace_modunload = dtrace_module_unloaded;
+ dtrace_cpu_init = dtrace_cpu_setup_initial;
+ dtrace_helpers_cleanup = dtrace_helpers_destroy;
+ dtrace_helpers_fork = dtrace_helpers_duplicate;
+ dtrace_cpustart_init = dtrace_suspend;
+ dtrace_cpustart_fini = dtrace_resume;
+ dtrace_debugger_init = dtrace_suspend;
+ dtrace_debugger_fini = dtrace_resume;
+
+ register_cpu_setup_func((cpu_setup_func_t *)dtrace_cpu_setup, NULL);
+
+ ASSERT(MUTEX_HELD(&cpu_lock));
+
+ dtrace_arena = vmem_create("dtrace", (void *)1, UINT32_MAX, 1,
+ NULL, NULL, NULL, 0, VM_SLEEP | VMC_IDENTIFIER);
+ dtrace_minor = vmem_create("dtrace_minor", (void *)DTRACEMNRN_CLONE,
+ UINT32_MAX - DTRACEMNRN_CLONE, 1, NULL, NULL, NULL, 0,
+ VM_SLEEP | VMC_IDENTIFIER);
+ dtrace_taskq = taskq_create("dtrace_taskq", 1, maxclsyspri,
+ 1, INT_MAX, 0);
+
+ dtrace_state_cache = kmem_cache_create("dtrace_state_cache",
+ sizeof (dtrace_dstate_percpu_t) * NCPU, DTRACE_STATE_ALIGN,
+ NULL, NULL, NULL, NULL, NULL, 0);
+
+ ASSERT(MUTEX_HELD(&cpu_lock));
+ dtrace_bymod = dtrace_hash_create(offsetof(dtrace_probe_t, dtpr_mod),
+ offsetof(dtrace_probe_t, dtpr_nextmod),
+ offsetof(dtrace_probe_t, dtpr_prevmod));
+
+ dtrace_byfunc = dtrace_hash_create(offsetof(dtrace_probe_t, dtpr_func),
+ offsetof(dtrace_probe_t, dtpr_nextfunc),
+ offsetof(dtrace_probe_t, dtpr_prevfunc));
+
+ dtrace_byname = dtrace_hash_create(offsetof(dtrace_probe_t, dtpr_name),
+ offsetof(dtrace_probe_t, dtpr_nextname),
+ offsetof(dtrace_probe_t, dtpr_prevname));
+
+ if (dtrace_retain_max < 1) {
+ cmn_err(CE_WARN, "illegal value (%lu) for dtrace_retain_max; "
+ "setting to 1", dtrace_retain_max);
+ dtrace_retain_max = 1;
+ }
+
+ /*
+ * Now discover our toxic ranges.
+ */
+ dtrace_toxic_ranges(dtrace_toxrange_add);
+
+ /*
+ * Before we register ourselves as a provider to our own framework,
+ * we would like to assert that dtrace_provider is NULL -- but that's
+ * not true if we were loaded as a dependency of a DTrace provider.
+ * Once we've registered, we can assert that dtrace_provider is our
+ * pseudo provider.
+ */
+ (void) dtrace_register("dtrace", &dtrace_provider_attr,
+ DTRACE_PRIV_NONE, 0, &dtrace_provider_ops, NULL, &id);
+
+ ASSERT(dtrace_provider != NULL);
+ ASSERT((dtrace_provider_id_t)dtrace_provider == id);
+
+ dtrace_probeid_begin = dtrace_probe_create((dtrace_provider_id_t)
+ dtrace_provider, NULL, NULL, "BEGIN", 0, NULL);
+ dtrace_probeid_end = dtrace_probe_create((dtrace_provider_id_t)
+ dtrace_provider, NULL, NULL, "END", 0, NULL);
+ dtrace_probeid_error = dtrace_probe_create((dtrace_provider_id_t)
+ dtrace_provider, NULL, NULL, "ERROR", 1, NULL);
+
+ dtrace_anon_property();
+ mutex_exit(&cpu_lock);
+
+ /*
+ * If there are already providers, we must ask them to provide their
+ * probes, and then match any anonymous enabling against them. Note
+ * that there should be no other retained enablings at this time:
+ * the only retained enablings at this time should be the anonymous
+ * enabling.
+ */
+ if (dtrace_anon.dta_enabling != NULL) {
+ ASSERT(dtrace_retained == dtrace_anon.dta_enabling);
+
+ dtrace_enabling_provide(NULL);
+ state = dtrace_anon.dta_state;
+
+ /*
+ * We couldn't hold cpu_lock across the above call to
+ * dtrace_enabling_provide(), but we must hold it to actually
+ * enable the probes. We have to drop all of our locks, pick
+ * up cpu_lock, and regain our locks before matching the
+ * retained anonymous enabling.
+ */
+ mutex_exit(&dtrace_lock);
+ mutex_exit(&dtrace_provider_lock);
+
+ mutex_enter(&cpu_lock);
+ mutex_enter(&dtrace_provider_lock);
+ mutex_enter(&dtrace_lock);
+
+ if ((enab = dtrace_anon.dta_enabling) != NULL)
+ (void) dtrace_enabling_match(enab, NULL);
+
+ mutex_exit(&cpu_lock);
+ }
+
+ mutex_exit(&dtrace_lock);
+ mutex_exit(&dtrace_provider_lock);
+
+ if (state != NULL) {
+ /*
+ * If we created any anonymous state, set it going now.
+ */
+ (void) dtrace_state_go(state, &dtrace_anon.dta_beganon);
+ }
+
+ return (DDI_SUCCESS);
+}
+#endif /* illumos */
+
+#ifndef illumos
+static void dtrace_dtr(void *);
+#endif
+
+/*ARGSUSED*/
+static int
+#ifdef illumos
+dtrace_open(dev_t *devp, int flag, int otyp, cred_t *cred_p)
+#else
+dtrace_open(struct cdev *dev, int oflags, int devtype, struct thread *td)
+#endif
+{
+ dtrace_state_t *state;
+ uint32_t priv;
+ uid_t uid;
+ zoneid_t zoneid;
+
+#ifdef illumos
+ if (getminor(*devp) == DTRACEMNRN_HELPER)
+ return (0);
+
+ /*
+ * If this wasn't an open with the "helper" minor, then it must be
+ * the "dtrace" minor.
+ */
+	if (getminor(*devp) != DTRACEMNRN_DTRACE)
+ return (ENXIO);
+#else
+ cred_t *cred_p = NULL;
+ cred_p = dev->si_cred;
+
+ /*
+ * If no DTRACE_PRIV_* bits are set in the credential, then the
+ * caller lacks sufficient permission to do anything with DTrace.
+ */
+ dtrace_cred2priv(cred_p, &priv, &uid, &zoneid);
+ if (priv == DTRACE_PRIV_NONE) {
+#endif
+
+ return (EACCES);
+ }
+
+ /*
+ * Ask all providers to provide all their probes.
+ */
+ mutex_enter(&dtrace_provider_lock);
+ dtrace_probe_provide(NULL, NULL);
+ mutex_exit(&dtrace_provider_lock);
+
+ mutex_enter(&cpu_lock);
+ mutex_enter(&dtrace_lock);
+ dtrace_opens++;
+ dtrace_membar_producer();
+
+#ifdef illumos
+ /*
+ * If the kernel debugger is active (that is, if the kernel debugger
+ * modified text in some way), we won't allow the open.
+ */
+ if (kdi_dtrace_set(KDI_DTSET_DTRACE_ACTIVATE) != 0) {
+ dtrace_opens--;
+ mutex_exit(&cpu_lock);
+ mutex_exit(&dtrace_lock);
+ return (EBUSY);
+ }
+
+ if (dtrace_helptrace_enable && dtrace_helptrace_buffer == NULL) {
+ /*
+ * If DTrace helper tracing is enabled, we need to allocate the
+ * trace buffer and initialize the values.
+ */
+ dtrace_helptrace_buffer =
+ kmem_zalloc(dtrace_helptrace_bufsize, KM_SLEEP);
+ dtrace_helptrace_next = 0;
+ dtrace_helptrace_wrapped = 0;
+ dtrace_helptrace_enable = 0;
+ }
+
+ state = dtrace_state_create(devp, cred_p);
+#else
+ state = dtrace_state_create(dev, NULL);
+ devfs_set_cdevpriv(state, dtrace_dtr);
+#endif
+
+ mutex_exit(&cpu_lock);
+
+ if (state == NULL) {
+#ifdef illumos
+ if (--dtrace_opens == 0 && dtrace_anon.dta_enabling == NULL)
+ (void) kdi_dtrace_set(KDI_DTSET_DTRACE_DEACTIVATE);
+#else
+ --dtrace_opens;
+#endif
+ mutex_exit(&dtrace_lock);
+ return (EAGAIN);
+ }
+
+ mutex_exit(&dtrace_lock);
+
+ return (0);
+}
+
+/*ARGSUSED*/
+#ifdef illumos
+static int
+dtrace_close(dev_t dev, int flag, int otyp, cred_t *cred_p)
+#else
+static void
+dtrace_dtr(void *data)
+#endif
+{
+#ifdef illumos
+ minor_t minor = getminor(dev);
+ dtrace_state_t *state;
+#endif
+ dtrace_helptrace_t *buf = NULL;
+
+#ifdef illumos
+ if (minor == DTRACEMNRN_HELPER)
+ return (0);
+
+ state = ddi_get_soft_state(dtrace_softstate, minor);
+#else
+ dtrace_state_t *state = data;
+#endif
+
+ mutex_enter(&cpu_lock);
+ mutex_enter(&dtrace_lock);
+
+#ifdef illumos
+ if (state->dts_anon)
+#else
+ if (state != NULL && state->dts_anon)
+#endif
+ {
+ /*
+ * There is anonymous state. Destroy that first.
+ */
+ ASSERT(dtrace_anon.dta_state == NULL);
+ dtrace_state_destroy(state->dts_anon);
+ }
+
+ if (dtrace_helptrace_disable) {
+ /*
+ * If we have been told to disable helper tracing, set the
+ * buffer to NULL before calling into dtrace_state_destroy();
+ * we take advantage of its dtrace_sync() to know that no
+ * CPU is in probe context with enabled helper tracing
+ * after it returns.
+ */
+ buf = dtrace_helptrace_buffer;
+ dtrace_helptrace_buffer = NULL;
+ }
+
+#ifdef illumos
+ dtrace_state_destroy(state);
+#else
+ if (state != NULL) {
+ dtrace_state_destroy(state);
+ kmem_free(state, 0);
+ }
+#endif
+ ASSERT(dtrace_opens > 0);
+
+#ifdef illumos
+ /*
+ * Only relinquish control of the kernel debugger interface when there
+ * are no consumers and no anonymous enablings.
+ */
+ if (--dtrace_opens == 0 && dtrace_anon.dta_enabling == NULL)
+ (void) kdi_dtrace_set(KDI_DTSET_DTRACE_DEACTIVATE);
+#else
+ --dtrace_opens;
+#endif
+
+ if (buf != NULL) {
+ kmem_free(buf, dtrace_helptrace_bufsize);
+ dtrace_helptrace_disable = 0;
+ }
+
+ mutex_exit(&dtrace_lock);
+ mutex_exit(&cpu_lock);
+
+#ifdef illumos
+ return (0);
+#endif
+}
+
+#ifdef illumos
+/*ARGSUSED*/
+static int
+dtrace_ioctl_helper(int cmd, intptr_t arg, int *rv)
+{
+ int rval;
+ dof_helper_t help, *dhp = NULL;
+
+ switch (cmd) {
+ case DTRACEHIOC_ADDDOF:
+ if (copyin((void *)arg, &help, sizeof (help)) != 0) {
+ dtrace_dof_error(NULL, "failed to copyin DOF helper");
+ return (EFAULT);
+ }
+
+ dhp = &help;
+ arg = (intptr_t)help.dofhp_dof;
+ /*FALLTHROUGH*/
+
+ case DTRACEHIOC_ADD: {
+ dof_hdr_t *dof = dtrace_dof_copyin(arg, &rval);
+
+ if (dof == NULL)
+ return (rval);
+
+ mutex_enter(&dtrace_lock);
+
+ /*
+ * dtrace_helper_slurp() takes responsibility for the dof --
+ * it may free it now or it may save it and free it later.
+ */
+ if ((rval = dtrace_helper_slurp(dof, dhp)) != -1) {
+ *rv = rval;
+ rval = 0;
+ } else {
+ rval = EINVAL;
+ }
+
+ mutex_exit(&dtrace_lock);
+ return (rval);
+ }
+
+ case DTRACEHIOC_REMOVE: {
+ mutex_enter(&dtrace_lock);
+ rval = dtrace_helper_destroygen(NULL, arg);
+ mutex_exit(&dtrace_lock);
+
+ return (rval);
+ }
+
+ default:
+ break;
+ }
+
+ return (ENOTTY);
+}
+
+/*ARGSUSED*/
+static int
+dtrace_ioctl(dev_t dev, int cmd, intptr_t arg, int md, cred_t *cr, int *rv)
+{
+ minor_t minor = getminor(dev);
+ dtrace_state_t *state;
+ int rval;
+
+ if (minor == DTRACEMNRN_HELPER)
+ return (dtrace_ioctl_helper(cmd, arg, rv));
+
+ state = ddi_get_soft_state(dtrace_softstate, minor);
+
+ if (state->dts_anon) {
+ ASSERT(dtrace_anon.dta_state == NULL);
+ state = state->dts_anon;
+ }
+
+ switch (cmd) {
+ case DTRACEIOC_PROVIDER: {
+ dtrace_providerdesc_t pvd;
+ dtrace_provider_t *pvp;
+
+ if (copyin((void *)arg, &pvd, sizeof (pvd)) != 0)
+ return (EFAULT);
+
+ pvd.dtvd_name[DTRACE_PROVNAMELEN - 1] = '\0';
+ mutex_enter(&dtrace_provider_lock);
+
+ for (pvp = dtrace_provider; pvp != NULL; pvp = pvp->dtpv_next) {
+ if (strcmp(pvp->dtpv_name, pvd.dtvd_name) == 0)
+ break;
+ }
+
+ mutex_exit(&dtrace_provider_lock);
+
+ if (pvp == NULL)
+ return (ESRCH);
+
+ bcopy(&pvp->dtpv_priv, &pvd.dtvd_priv, sizeof (dtrace_ppriv_t));
+ bcopy(&pvp->dtpv_attr, &pvd.dtvd_attr, sizeof (dtrace_pattr_t));
+
+ if (copyout(&pvd, (void *)arg, sizeof (pvd)) != 0)
+ return (EFAULT);
+
+ return (0);
+ }
+
+ case DTRACEIOC_EPROBE: {
+ dtrace_eprobedesc_t epdesc;
+ dtrace_ecb_t *ecb;
+ dtrace_action_t *act;
+ void *buf;
+ size_t size;
+ uintptr_t dest;
+ int nrecs;
+
+ if (copyin((void *)arg, &epdesc, sizeof (epdesc)) != 0)
+ return (EFAULT);
+
+ mutex_enter(&dtrace_lock);
+
+ if ((ecb = dtrace_epid2ecb(state, epdesc.dtepd_epid)) == NULL) {
+ mutex_exit(&dtrace_lock);
+ return (EINVAL);
+ }
+
+ if (ecb->dte_probe == NULL) {
+ mutex_exit(&dtrace_lock);
+ return (EINVAL);
+ }
+
+ epdesc.dtepd_probeid = ecb->dte_probe->dtpr_id;
+ epdesc.dtepd_uarg = ecb->dte_uarg;
+ epdesc.dtepd_size = ecb->dte_size;
+
+ nrecs = epdesc.dtepd_nrecs;
+ epdesc.dtepd_nrecs = 0;
+ for (act = ecb->dte_action; act != NULL; act = act->dta_next) {
+ if (DTRACEACT_ISAGG(act->dta_kind) || act->dta_intuple)
+ continue;
+
+ epdesc.dtepd_nrecs++;
+ }
+
+ /*
+ * Now that we have the size, we need to allocate a temporary
+ * buffer in which to store the complete description. We need
+ * the temporary buffer to be able to drop dtrace_lock()
+ * across the copyout(), below.
+ */
+ size = sizeof (dtrace_eprobedesc_t) +
+ (epdesc.dtepd_nrecs * sizeof (dtrace_recdesc_t));
+
+ buf = kmem_alloc(size, KM_SLEEP);
+ dest = (uintptr_t)buf;
+
+ bcopy(&epdesc, (void *)dest, sizeof (epdesc));
+ dest += offsetof(dtrace_eprobedesc_t, dtepd_rec[0]);
+
+ for (act = ecb->dte_action; act != NULL; act = act->dta_next) {
+ if (DTRACEACT_ISAGG(act->dta_kind) || act->dta_intuple)
+ continue;
+
+ if (nrecs-- == 0)
+ break;
+
+ bcopy(&act->dta_rec, (void *)dest,
+ sizeof (dtrace_recdesc_t));
+ dest += sizeof (dtrace_recdesc_t);
+ }
+
+ mutex_exit(&dtrace_lock);
+
+ if (copyout(buf, (void *)arg, dest - (uintptr_t)buf) != 0) {
+ kmem_free(buf, size);
+ return (EFAULT);
+ }
+
+ kmem_free(buf, size);
+ return (0);
+ }
+
+ case DTRACEIOC_AGGDESC: {
+ dtrace_aggdesc_t aggdesc;
+ dtrace_action_t *act;
+ dtrace_aggregation_t *agg;
+ int nrecs;
+ uint32_t offs;
+ dtrace_recdesc_t *lrec;
+ void *buf;
+ size_t size;
+ uintptr_t dest;
+
+ if (copyin((void *)arg, &aggdesc, sizeof (aggdesc)) != 0)
+ return (EFAULT);
+
+ mutex_enter(&dtrace_lock);
+
+ if ((agg = dtrace_aggid2agg(state, aggdesc.dtagd_id)) == NULL) {
+ mutex_exit(&dtrace_lock);
+ return (EINVAL);
+ }
+
+ aggdesc.dtagd_epid = agg->dtag_ecb->dte_epid;
+
+ nrecs = aggdesc.dtagd_nrecs;
+ aggdesc.dtagd_nrecs = 0;
+
+ offs = agg->dtag_base;
+ lrec = &agg->dtag_action.dta_rec;
+ aggdesc.dtagd_size = lrec->dtrd_offset + lrec->dtrd_size - offs;
+
+ for (act = agg->dtag_first; ; act = act->dta_next) {
+ ASSERT(act->dta_intuple ||
+ DTRACEACT_ISAGG(act->dta_kind));
+
+ /*
+ * If this action has a record size of zero, it
+ * denotes an argument to the aggregating action.
+ * Because the presence of this record doesn't (or
+ * shouldn't) affect the way the data is interpreted,
+			 * we don't copy it out, sparing user level the
+			 * confusion of dealing with a zero-length record.
+ */
+ if (act->dta_rec.dtrd_size == 0) {
+ ASSERT(agg->dtag_hasarg);
+ continue;
+ }
+
+ aggdesc.dtagd_nrecs++;
+
+ if (act == &agg->dtag_action)
+ break;
+ }
+
+ /*
+ * Now that we have the size, we need to allocate a temporary
+ * buffer in which to store the complete description. We need
+ * the temporary buffer to be able to drop dtrace_lock()
+ * across the copyout(), below.
+ */
+ size = sizeof (dtrace_aggdesc_t) +
+ (aggdesc.dtagd_nrecs * sizeof (dtrace_recdesc_t));
+
+ buf = kmem_alloc(size, KM_SLEEP);
+ dest = (uintptr_t)buf;
+
+ bcopy(&aggdesc, (void *)dest, sizeof (aggdesc));
+ dest += offsetof(dtrace_aggdesc_t, dtagd_rec[0]);
+
+ for (act = agg->dtag_first; ; act = act->dta_next) {
+ dtrace_recdesc_t rec = act->dta_rec;
+
+ /*
+ * See the comment in the above loop for why we pass
+ * over zero-length records.
+ */
+ if (rec.dtrd_size == 0) {
+ ASSERT(agg->dtag_hasarg);
+ continue;
+ }
+
+ if (nrecs-- == 0)
+ break;
+
+ rec.dtrd_offset -= offs;
+ bcopy(&rec, (void *)dest, sizeof (rec));
+ dest += sizeof (dtrace_recdesc_t);
+
+ if (act == &agg->dtag_action)
+ break;
+ }
+
+ mutex_exit(&dtrace_lock);
+
+ if (copyout(buf, (void *)arg, dest - (uintptr_t)buf) != 0) {
+ kmem_free(buf, size);
+ return (EFAULT);
+ }
+
+ kmem_free(buf, size);
+ return (0);
+ }
+
+ case DTRACEIOC_ENABLE: {
+ dof_hdr_t *dof;
+ dtrace_enabling_t *enab = NULL;
+ dtrace_vstate_t *vstate;
+ int err = 0;
+
+ *rv = 0;
+
+ /*
+ * If a NULL argument has been passed, we take this as our
+ * cue to reevaluate our enablings.
+ */
+ if (arg == NULL) {
+ dtrace_enabling_matchall();
+
+ return (0);
+ }
+
+ if ((dof = dtrace_dof_copyin(arg, &rval)) == NULL)
+ return (rval);
+
+ mutex_enter(&cpu_lock);
+ mutex_enter(&dtrace_lock);
+ vstate = &state->dts_vstate;
+
+ if (state->dts_activity != DTRACE_ACTIVITY_INACTIVE) {
+ mutex_exit(&dtrace_lock);
+ mutex_exit(&cpu_lock);
+ dtrace_dof_destroy(dof);
+ return (EBUSY);
+ }
+
+ if (dtrace_dof_slurp(dof, vstate, cr, &enab, 0, B_TRUE) != 0) {
+ mutex_exit(&dtrace_lock);
+ mutex_exit(&cpu_lock);
+ dtrace_dof_destroy(dof);
+ return (EINVAL);
+ }
+
+ if ((rval = dtrace_dof_options(dof, state)) != 0) {
+ dtrace_enabling_destroy(enab);
+ mutex_exit(&dtrace_lock);
+ mutex_exit(&cpu_lock);
+ dtrace_dof_destroy(dof);
+ return (rval);
+ }
+
+ if ((err = dtrace_enabling_match(enab, rv)) == 0) {
+ err = dtrace_enabling_retain(enab);
+ } else {
+ dtrace_enabling_destroy(enab);
+ }
+
+ mutex_exit(&cpu_lock);
+ mutex_exit(&dtrace_lock);
+ dtrace_dof_destroy(dof);
+
+ return (err);
+ }
+
+ case DTRACEIOC_REPLICATE: {
+ dtrace_repldesc_t desc;
+ dtrace_probedesc_t *match = &desc.dtrpd_match;
+ dtrace_probedesc_t *create = &desc.dtrpd_create;
+ int err;
+
+ if (copyin((void *)arg, &desc, sizeof (desc)) != 0)
+ return (EFAULT);
+
+ match->dtpd_provider[DTRACE_PROVNAMELEN - 1] = '\0';
+ match->dtpd_mod[DTRACE_MODNAMELEN - 1] = '\0';
+ match->dtpd_func[DTRACE_FUNCNAMELEN - 1] = '\0';
+ match->dtpd_name[DTRACE_NAMELEN - 1] = '\0';
+
+ create->dtpd_provider[DTRACE_PROVNAMELEN - 1] = '\0';
+ create->dtpd_mod[DTRACE_MODNAMELEN - 1] = '\0';
+ create->dtpd_func[DTRACE_FUNCNAMELEN - 1] = '\0';
+ create->dtpd_name[DTRACE_NAMELEN - 1] = '\0';
+
+ mutex_enter(&dtrace_lock);
+ err = dtrace_enabling_replicate(state, match, create);
+ mutex_exit(&dtrace_lock);
+
+ return (err);
+ }
+
+ case DTRACEIOC_PROBEMATCH:
+ case DTRACEIOC_PROBES: {
+ dtrace_probe_t *probe = NULL;
+ dtrace_probedesc_t desc;
+ dtrace_probekey_t pkey;
+ dtrace_id_t i;
+ int m = 0;
+ uint32_t priv;
+ uid_t uid;
+ zoneid_t zoneid;
+
+ if (copyin((void *)arg, &desc, sizeof (desc)) != 0)
+ return (EFAULT);
+
+ desc.dtpd_provider[DTRACE_PROVNAMELEN - 1] = '\0';
+ desc.dtpd_mod[DTRACE_MODNAMELEN - 1] = '\0';
+ desc.dtpd_func[DTRACE_FUNCNAMELEN - 1] = '\0';
+ desc.dtpd_name[DTRACE_NAMELEN - 1] = '\0';
+
+ /*
+ * Before we attempt to match this probe, we want to give
+ * all providers the opportunity to provide it.
+ */
+ if (desc.dtpd_id == DTRACE_IDNONE) {
+ mutex_enter(&dtrace_provider_lock);
+ dtrace_probe_provide(&desc, NULL);
+ mutex_exit(&dtrace_provider_lock);
+ desc.dtpd_id++;
+ }
+
+ if (cmd == DTRACEIOC_PROBEMATCH) {
+ dtrace_probekey(&desc, &pkey);
+ pkey.dtpk_id = DTRACE_IDNONE;
+ }
+
+ dtrace_cred2priv(cr, &priv, &uid, &zoneid);
+
+ mutex_enter(&dtrace_lock);
+
+ if (cmd == DTRACEIOC_PROBEMATCH) {
+ for (i = desc.dtpd_id; i <= dtrace_nprobes; i++) {
+ if ((probe = dtrace_probes[i - 1]) != NULL &&
+ (m = dtrace_match_probe(probe, &pkey,
+ priv, uid, zoneid)) != 0)
+ break;
+ }
+
+ if (m < 0) {
+ mutex_exit(&dtrace_lock);
+ return (EINVAL);
+ }
+
+ } else {
+ for (i = desc.dtpd_id; i <= dtrace_nprobes; i++) {
+ if ((probe = dtrace_probes[i - 1]) != NULL &&
+ dtrace_match_priv(probe, priv, uid, zoneid))
+ break;
+ }
+ }
+
+ if (probe == NULL) {
+ mutex_exit(&dtrace_lock);
+ return (ESRCH);
+ }
+
+ dtrace_probe_description(probe, &desc);
+ mutex_exit(&dtrace_lock);
+
+ if (copyout(&desc, (void *)arg, sizeof (desc)) != 0)
+ return (EFAULT);
+
+ return (0);
+ }
+
+ case DTRACEIOC_PROBEARG: {
+ dtrace_argdesc_t desc;
+ dtrace_probe_t *probe;
+ dtrace_provider_t *prov;
+
+ if (copyin((void *)arg, &desc, sizeof (desc)) != 0)
+ return (EFAULT);
+
+ if (desc.dtargd_id == DTRACE_IDNONE)
+ return (EINVAL);
+
+ if (desc.dtargd_ndx == DTRACE_ARGNONE)
+ return (EINVAL);
+
+ mutex_enter(&dtrace_provider_lock);
+ mutex_enter(&mod_lock);
+ mutex_enter(&dtrace_lock);
+
+ if (desc.dtargd_id > dtrace_nprobes) {
+ mutex_exit(&dtrace_lock);
+ mutex_exit(&mod_lock);
+ mutex_exit(&dtrace_provider_lock);
+ return (EINVAL);
+ }
+
+ if ((probe = dtrace_probes[desc.dtargd_id - 1]) == NULL) {
+ mutex_exit(&dtrace_lock);
+ mutex_exit(&mod_lock);
+ mutex_exit(&dtrace_provider_lock);
+ return (EINVAL);
+ }
+
+ mutex_exit(&dtrace_lock);
+
+ prov = probe->dtpr_provider;
+
+ if (prov->dtpv_pops.dtps_getargdesc == NULL) {
+ /*
+ * There isn't any typed information for this probe.
+ * Set the argument number to DTRACE_ARGNONE.
+ */
+ desc.dtargd_ndx = DTRACE_ARGNONE;
+ } else {
+ desc.dtargd_native[0] = '\0';
+ desc.dtargd_xlate[0] = '\0';
+ desc.dtargd_mapping = desc.dtargd_ndx;
+
+ prov->dtpv_pops.dtps_getargdesc(prov->dtpv_arg,
+ probe->dtpr_id, probe->dtpr_arg, &desc);
+ }
+
+ mutex_exit(&mod_lock);
+ mutex_exit(&dtrace_provider_lock);
+
+ if (copyout(&desc, (void *)arg, sizeof (desc)) != 0)
+ return (EFAULT);
+
+ return (0);
+ }
+
+ case DTRACEIOC_GO: {
+ processorid_t cpuid;
+ rval = dtrace_state_go(state, &cpuid);
+
+ if (rval != 0)
+ return (rval);
+
+ if (copyout(&cpuid, (void *)arg, sizeof (cpuid)) != 0)
+ return (EFAULT);
+
+ return (0);
+ }
+
+ case DTRACEIOC_STOP: {
+ processorid_t cpuid;
+
+ mutex_enter(&dtrace_lock);
+ rval = dtrace_state_stop(state, &cpuid);
+ mutex_exit(&dtrace_lock);
+
+ if (rval != 0)
+ return (rval);
+
+ if (copyout(&cpuid, (void *)arg, sizeof (cpuid)) != 0)
+ return (EFAULT);
+
+ return (0);
+ }
+
+ case DTRACEIOC_DOFGET: {
+ dof_hdr_t hdr, *dof;
+ uint64_t len;
+
+ if (copyin((void *)arg, &hdr, sizeof (hdr)) != 0)
+ return (EFAULT);
+
+ mutex_enter(&dtrace_lock);
+ dof = dtrace_dof_create(state);
+ mutex_exit(&dtrace_lock);
+
+ len = MIN(hdr.dofh_loadsz, dof->dofh_loadsz);
+ rval = copyout(dof, (void *)arg, len);
+ dtrace_dof_destroy(dof);
+
+ return (rval == 0 ? 0 : EFAULT);
+ }
+
+ case DTRACEIOC_AGGSNAP:
+ case DTRACEIOC_BUFSNAP: {
+ dtrace_bufdesc_t desc;
+ caddr_t cached;
+ dtrace_buffer_t *buf;
+
+ if (copyin((void *)arg, &desc, sizeof (desc)) != 0)
+ return (EFAULT);
+
+ if (desc.dtbd_cpu < 0 || desc.dtbd_cpu >= NCPU)
+ return (EINVAL);
+
+ mutex_enter(&dtrace_lock);
+
+ if (cmd == DTRACEIOC_BUFSNAP) {
+ buf = &state->dts_buffer[desc.dtbd_cpu];
+ } else {
+ buf = &state->dts_aggbuffer[desc.dtbd_cpu];
+ }
+
+ if (buf->dtb_flags & (DTRACEBUF_RING | DTRACEBUF_FILL)) {
+ size_t sz = buf->dtb_offset;
+
+ if (state->dts_activity != DTRACE_ACTIVITY_STOPPED) {
+ mutex_exit(&dtrace_lock);
+ return (EBUSY);
+ }
+
+ /*
+ * If this buffer has already been consumed, we're
+ * going to indicate that there's nothing left here
+ * to consume.
+ */
+ if (buf->dtb_flags & DTRACEBUF_CONSUMED) {
+ mutex_exit(&dtrace_lock);
+
+ desc.dtbd_size = 0;
+ desc.dtbd_drops = 0;
+ desc.dtbd_errors = 0;
+ desc.dtbd_oldest = 0;
+ sz = sizeof (desc);
+
+ if (copyout(&desc, (void *)arg, sz) != 0)
+ return (EFAULT);
+
+ return (0);
+ }
+
+ /*
+ * If this is a ring buffer that has wrapped, we want
+ * to copy the whole thing out.
+ */
+ if (buf->dtb_flags & DTRACEBUF_WRAPPED) {
+ dtrace_buffer_polish(buf);
+ sz = buf->dtb_size;
+ }
+
+ if (copyout(buf->dtb_tomax, desc.dtbd_data, sz) != 0) {
+ mutex_exit(&dtrace_lock);
+ return (EFAULT);
+ }
+
+ desc.dtbd_size = sz;
+ desc.dtbd_drops = buf->dtb_drops;
+ desc.dtbd_errors = buf->dtb_errors;
+ desc.dtbd_oldest = buf->dtb_xamot_offset;
+ desc.dtbd_timestamp = dtrace_gethrtime();
+
+ mutex_exit(&dtrace_lock);
+
+ if (copyout(&desc, (void *)arg, sizeof (desc)) != 0)
+ return (EFAULT);
+
+ buf->dtb_flags |= DTRACEBUF_CONSUMED;
+
+ return (0);
+ }
+
+ if (buf->dtb_tomax == NULL) {
+ ASSERT(buf->dtb_xamot == NULL);
+ mutex_exit(&dtrace_lock);
+ return (ENOENT);
+ }
+
+ cached = buf->dtb_tomax;
+ ASSERT(!(buf->dtb_flags & DTRACEBUF_NOSWITCH));
+
+ dtrace_xcall(desc.dtbd_cpu,
+ (dtrace_xcall_t)dtrace_buffer_switch, buf);
+
+ state->dts_errors += buf->dtb_xamot_errors;
+
+ /*
+ * If the buffers did not actually switch, then the cross call
+ * did not take place -- presumably because the given CPU is
+ * not in the ready set. If this is the case, we'll return
+ * ENOENT.
+ */
+ if (buf->dtb_tomax == cached) {
+ ASSERT(buf->dtb_xamot != cached);
+ mutex_exit(&dtrace_lock);
+ return (ENOENT);
+ }
+
+ ASSERT(cached == buf->dtb_xamot);
+
+ /*
+ * We have our snapshot; now copy it out.
+ */
+ if (copyout(buf->dtb_xamot, desc.dtbd_data,
+ buf->dtb_xamot_offset) != 0) {
+ mutex_exit(&dtrace_lock);
+ return (EFAULT);
+ }
+
+ desc.dtbd_size = buf->dtb_xamot_offset;
+ desc.dtbd_drops = buf->dtb_xamot_drops;
+ desc.dtbd_errors = buf->dtb_xamot_errors;
+ desc.dtbd_oldest = 0;
+ desc.dtbd_timestamp = buf->dtb_switched;
+
+ mutex_exit(&dtrace_lock);
+
+ /*
+ * Finally, copy out the buffer description.
+ */
+ if (copyout(&desc, (void *)arg, sizeof (desc)) != 0)
+ return (EFAULT);
+
+ return (0);
+ }
+
+ case DTRACEIOC_CONF: {
+ dtrace_conf_t conf;
+
+ bzero(&conf, sizeof (conf));
+ conf.dtc_difversion = DIF_VERSION;
+ conf.dtc_difintregs = DIF_DIR_NREGS;
+ conf.dtc_diftupregs = DIF_DTR_NREGS;
+ conf.dtc_ctfmodel = CTF_MODEL_NATIVE;
+
+ if (copyout(&conf, (void *)arg, sizeof (conf)) != 0)
+ return (EFAULT);
+
+ return (0);
+ }
+
+ case DTRACEIOC_STATUS: {
+ dtrace_status_t stat;
+ dtrace_dstate_t *dstate;
+ int i, j;
+ uint64_t nerrs;
+
+ /*
+ * See the comment in dtrace_state_deadman() for the reason
+ * for setting dts_laststatus to INT64_MAX before setting
+ * it to the correct value.
+ */
+ state->dts_laststatus = INT64_MAX;
+ dtrace_membar_producer();
+ state->dts_laststatus = dtrace_gethrtime();
+
+ bzero(&stat, sizeof (stat));
+
+ mutex_enter(&dtrace_lock);
+
+ if (state->dts_activity == DTRACE_ACTIVITY_INACTIVE) {
+ mutex_exit(&dtrace_lock);
+ return (ENOENT);
+ }
+
+ if (state->dts_activity == DTRACE_ACTIVITY_DRAINING)
+ stat.dtst_exiting = 1;
+
+ nerrs = state->dts_errors;
+ dstate = &state->dts_vstate.dtvs_dynvars;
+
+ for (i = 0; i < NCPU; i++) {
+ dtrace_dstate_percpu_t *dcpu = &dstate->dtds_percpu[i];
+
+ stat.dtst_dyndrops += dcpu->dtdsc_drops;
+ stat.dtst_dyndrops_dirty += dcpu->dtdsc_dirty_drops;
+ stat.dtst_dyndrops_rinsing += dcpu->dtdsc_rinsing_drops;
+
+ if (state->dts_buffer[i].dtb_flags & DTRACEBUF_FULL)
+ stat.dtst_filled++;
+
+ nerrs += state->dts_buffer[i].dtb_errors;
+
+ for (j = 0; j < state->dts_nspeculations; j++) {
+ dtrace_speculation_t *spec;
+ dtrace_buffer_t *buf;
+
+ spec = &state->dts_speculations[j];
+ buf = &spec->dtsp_buffer[i];
+ stat.dtst_specdrops += buf->dtb_xamot_drops;
+ }
+ }
+
+ stat.dtst_specdrops_busy = state->dts_speculations_busy;
+ stat.dtst_specdrops_unavail = state->dts_speculations_unavail;
+ stat.dtst_stkstroverflows = state->dts_stkstroverflows;
+ stat.dtst_dblerrors = state->dts_dblerrors;
+ stat.dtst_killed =
+ (state->dts_activity == DTRACE_ACTIVITY_KILLED);
+ stat.dtst_errors = nerrs;
+
+ mutex_exit(&dtrace_lock);
+
+ if (copyout(&stat, (void *)arg, sizeof (stat)) != 0)
+ return (EFAULT);
+
+ return (0);
+ }
+
+ case DTRACEIOC_FORMAT: {
+ dtrace_fmtdesc_t fmt;
+ char *str;
+ int len;
+
+ if (copyin((void *)arg, &fmt, sizeof (fmt)) != 0)
+ return (EFAULT);
+
+ mutex_enter(&dtrace_lock);
+
+ if (fmt.dtfd_format == 0 ||
+ fmt.dtfd_format > state->dts_nformats) {
+ mutex_exit(&dtrace_lock);
+ return (EINVAL);
+ }
+
+ /*
+ * Format strings are allocated contiguously and they are
+ * never freed; if a format index is less than the number
+ * of formats, we can assert that the format map is non-NULL
+ * and that the format for the specified index is non-NULL.
+ */
+ ASSERT(state->dts_formats != NULL);
+ str = state->dts_formats[fmt.dtfd_format - 1];
+ ASSERT(str != NULL);
+
+ len = strlen(str) + 1;
+
+ if (len > fmt.dtfd_length) {
+ fmt.dtfd_length = len;
+
+ if (copyout(&fmt, (void *)arg, sizeof (fmt)) != 0) {
+ mutex_exit(&dtrace_lock);
+ return (EINVAL);
+ }
+ } else {
+ if (copyout(str, fmt.dtfd_string, len) != 0) {
+ mutex_exit(&dtrace_lock);
+ return (EINVAL);
+ }
+ }
+
+ mutex_exit(&dtrace_lock);
+ return (0);
+ }
+
+ default:
+ break;
+ }
+
+ return (ENOTTY);
+}
+
+/*ARGSUSED*/
+static int
+dtrace_detach(dev_info_t *dip, ddi_detach_cmd_t cmd)
+{
+ dtrace_state_t *state;
+
+ switch (cmd) {
+ case DDI_DETACH:
+ break;
+
+ case DDI_SUSPEND:
+ return (DDI_SUCCESS);
+
+ default:
+ return (DDI_FAILURE);
+ }
+
+ mutex_enter(&cpu_lock);
+ mutex_enter(&dtrace_provider_lock);
+ mutex_enter(&dtrace_lock);
+
+ ASSERT(dtrace_opens == 0);
+
+ if (dtrace_helpers > 0) {
+ mutex_exit(&dtrace_provider_lock);
+ mutex_exit(&dtrace_lock);
+ mutex_exit(&cpu_lock);
+ return (DDI_FAILURE);
+ }
+
+ if (dtrace_unregister((dtrace_provider_id_t)dtrace_provider) != 0) {
+ mutex_exit(&dtrace_provider_lock);
+ mutex_exit(&dtrace_lock);
+ mutex_exit(&cpu_lock);
+ return (DDI_FAILURE);
+ }
+
+ dtrace_provider = NULL;
+
+ if ((state = dtrace_anon_grab()) != NULL) {
+ /*
+ * If there were ECBs on this state, the provider should
+ * have not been allowed to detach; assert that there is
+ * none.
+ */
+ ASSERT(state->dts_necbs == 0);
+ dtrace_state_destroy(state);
+
+ /*
+ * If we're being detached with anonymous state, we need to
+ * indicate to the kernel debugger that DTrace is now inactive.
+ */
+ (void) kdi_dtrace_set(KDI_DTSET_DTRACE_DEACTIVATE);
+ }
+
+ bzero(&dtrace_anon, sizeof (dtrace_anon_t));
+ unregister_cpu_setup_func((cpu_setup_func_t *)dtrace_cpu_setup, NULL);
+ dtrace_cpu_init = NULL;
+ dtrace_helpers_cleanup = NULL;
+ dtrace_helpers_fork = NULL;
+ dtrace_cpustart_init = NULL;
+ dtrace_cpustart_fini = NULL;
+ dtrace_debugger_init = NULL;
+ dtrace_debugger_fini = NULL;
+ dtrace_modload = NULL;
+ dtrace_modunload = NULL;
+
+ ASSERT(dtrace_getf == 0);
+ ASSERT(dtrace_closef == NULL);
+
+ mutex_exit(&cpu_lock);
+
+ kmem_free(dtrace_probes, dtrace_nprobes * sizeof (dtrace_probe_t *));
+ dtrace_probes = NULL;
+ dtrace_nprobes = 0;
+
+ dtrace_hash_destroy(dtrace_bymod);
+ dtrace_hash_destroy(dtrace_byfunc);
+ dtrace_hash_destroy(dtrace_byname);
+ dtrace_bymod = NULL;
+ dtrace_byfunc = NULL;
+ dtrace_byname = NULL;
+
+ kmem_cache_destroy(dtrace_state_cache);
+ vmem_destroy(dtrace_minor);
+ vmem_destroy(dtrace_arena);
+
+ if (dtrace_toxrange != NULL) {
+ kmem_free(dtrace_toxrange,
+ dtrace_toxranges_max * sizeof (dtrace_toxrange_t));
+ dtrace_toxrange = NULL;
+ dtrace_toxranges = 0;
+ dtrace_toxranges_max = 0;
+ }
+
+ ddi_remove_minor_node(dtrace_devi, NULL);
+ dtrace_devi = NULL;
+
+ ddi_soft_state_fini(&dtrace_softstate);
+
+ ASSERT(dtrace_vtime_references == 0);
+ ASSERT(dtrace_opens == 0);
+ ASSERT(dtrace_retained == NULL);
+
+ mutex_exit(&dtrace_lock);
+ mutex_exit(&dtrace_provider_lock);
+
+ /*
+ * We don't destroy the task queue until after we have dropped our
+ * locks (taskq_destroy() may block on running tasks). To prevent
+ * attempting to do work after we have effectively detached but before
+ * the task queue has been destroyed, all tasks dispatched via the
+ * task queue must check that DTrace is still attached before
+ * performing any operation.
+ */
+ taskq_destroy(dtrace_taskq);
+ dtrace_taskq = NULL;
+
+ return (DDI_SUCCESS);
+}
+#endif
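+
+/*
+ * Editor's sketch (not part of the original source): the guard that a task
+ * dispatched via dtrace_taskq is expected to apply, per the comment in
+ * dtrace_detach() above.  The use of dtrace_devi as the "still attached"
+ * indicator follows the illumos path and is illustrative only.
+ */
+#if 0
+static void
+example_taskq_task(void *arg __unused)
+{
+	mutex_enter(&dtrace_lock);
+	if (dtrace_devi == NULL) {
+		/* DTrace has detached; there is no work left to do. */
+		mutex_exit(&dtrace_lock);
+		return;
+	}
+	/* ... the task's real work goes here ... */
+	mutex_exit(&dtrace_lock);
+}
+#endif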
+
+#ifdef illumos
+/*ARGSUSED*/
+static int
+dtrace_info(dev_info_t *dip, ddi_info_cmd_t infocmd, void *arg, void **result)
+{
+ int error;
+
+ switch (infocmd) {
+ case DDI_INFO_DEVT2DEVINFO:
+ *result = (void *)dtrace_devi;
+ error = DDI_SUCCESS;
+ break;
+ case DDI_INFO_DEVT2INSTANCE:
+ *result = (void *)0;
+ error = DDI_SUCCESS;
+ break;
+ default:
+ error = DDI_FAILURE;
+ }
+ return (error);
+}
+#endif
+
+#ifdef illumos
+static struct cb_ops dtrace_cb_ops = {
+ dtrace_open, /* open */
+ dtrace_close, /* close */
+ nulldev, /* strategy */
+ nulldev, /* print */
+ nodev, /* dump */
+ nodev, /* read */
+ nodev, /* write */
+ dtrace_ioctl, /* ioctl */
+ nodev, /* devmap */
+ nodev, /* mmap */
+ nodev, /* segmap */
+ nochpoll, /* poll */
+ ddi_prop_op, /* cb_prop_op */
+ 0, /* streamtab */
+ D_NEW | D_MP /* Driver compatibility flag */
+};
+
+static struct dev_ops dtrace_ops = {
+ DEVO_REV, /* devo_rev */
+ 0, /* refcnt */
+ dtrace_info, /* get_dev_info */
+ nulldev, /* identify */
+ nulldev, /* probe */
+ dtrace_attach, /* attach */
+ dtrace_detach, /* detach */
+ nodev, /* reset */
+ &dtrace_cb_ops, /* driver operations */
+ NULL, /* bus operations */
+ nodev /* dev power */
+};
+
+static struct modldrv modldrv = {
+ &mod_driverops, /* module type (this is a pseudo driver) */
+ "Dynamic Tracing", /* name of module */
+ &dtrace_ops, /* driver ops */
+};
+
+static struct modlinkage modlinkage = {
+ MODREV_1,
+ (void *)&modldrv,
+ NULL
+};
+
+int
+_init(void)
+{
+ return (mod_install(&modlinkage));
+}
+
+int
+_info(struct modinfo *modinfop)
+{
+ return (mod_info(&modlinkage, modinfop));
+}
+
+int
+_fini(void)
+{
+ return (mod_remove(&modlinkage));
+}
+#else
+
+static d_ioctl_t dtrace_ioctl;
+static d_ioctl_t dtrace_ioctl_helper;
+static void dtrace_load(void *);
+static int dtrace_unload(void);
+static struct cdev *dtrace_dev;
+static struct cdev *helper_dev;
+
+void dtrace_invop_init(void);
+void dtrace_invop_uninit(void);
+
+static struct cdevsw dtrace_cdevsw = {
+ .d_version = D_VERSION,
+ .d_ioctl = dtrace_ioctl,
+ .d_open = dtrace_open,
+ .d_name = "dtrace",
+};
+
+static struct cdevsw helper_cdevsw = {
+ .d_version = D_VERSION,
+ .d_ioctl = dtrace_ioctl_helper,
+ .d_name = "helper",
+};
+
+#include <dtrace_anon.c>
+#include <dtrace_ioctl.c>
+#include <dtrace_load.c>
+#include <dtrace_modevent.c>
+#include <dtrace_sysctl.c>
+#include <dtrace_unload.c>
+#include <dtrace_vtime.c>
+#include <dtrace_hacks.c>
+#include <dtrace_isa.c>
+
+SYSINIT(dtrace_load, SI_SUB_DTRACE, SI_ORDER_FIRST, dtrace_load, NULL);
+SYSUNINIT(dtrace_unload, SI_SUB_DTRACE, SI_ORDER_FIRST, dtrace_unload, NULL);
+SYSINIT(dtrace_anon_init, SI_SUB_DTRACE_ANON, SI_ORDER_FIRST, dtrace_anon_init, NULL);
+
+DEV_MODULE(dtrace, dtrace_modevent, NULL);
+MODULE_VERSION(dtrace, 1);
+MODULE_DEPEND(dtrace, opensolaris, 1, 1, 1);
+#endif
diff --git a/sys/cddl/contrib/opensolaris/uts/common/dtrace/dtrace_xoroshiro128_plus.c b/sys/cddl/contrib/opensolaris/uts/common/dtrace/dtrace_xoroshiro128_plus.c
new file mode 100644
index 000000000000..fbc656a90e5c
--- /dev/null
+++ b/sys/cddl/contrib/opensolaris/uts/common/dtrace/dtrace_xoroshiro128_plus.c
@@ -0,0 +1,89 @@
+/*-
+ * Copyright (c) 2016 (Graeme Jenkinson)
+ * All rights reserved.
+ *
+ * This software was developed by BAE Systems, the University of Cambridge
+ * Computer Laboratory, and Memorial University under DARPA/AFRL contract
+ * FA8650-15-C-7558 ("CADETS"), as part of the DARPA Transparent Computing
+ * (TC) research program.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ * 1. Redistributions of source code must retain the above copyright
+ * notice, this list of conditions and the following disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in the
+ * documentation and/or other materials provided with the distribution.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND
+ * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
+ * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
+ * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
+ * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
+ * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
+ * SUCH DAMAGE.
+ *
+ */
+
+#include <sys/types.h>
+
+#include "dtrace_xoroshiro128_plus.h"
+
+static __inline uint64_t
+rotl(const uint64_t x, int k)
+{
+ return (x << k) | (x >> (64 - k));
+}
+
+/*
+ * This is the jump function for the generator. It is equivalent to 2^64 calls
+ * to next(); it can be used to generate 2^64 non-overlapping subsequences for
+ * parallel computations.
+ */
+void
+dtrace_xoroshiro128_plus_jump(uint64_t * const state,
+ uint64_t * const jump_state)
+{
+ static const uint64_t JUMP[] = { 0xbeac0467eba5facb,
+ 0xd86b048b86aa9922 };
+
+ uint64_t s0 = 0;
+ uint64_t s1 = 0;
+ int i = 0;
+ int b = 0;
+ for (i = 0; i < sizeof JUMP / sizeof *JUMP; i++) {
+ for (b = 0; b < 64; b++) {
+ if (JUMP[i] & 1ULL << b) {
+ s0 ^= state[0];
+ s1 ^= state[1];
+ }
+ dtrace_xoroshiro128_plus_next(state);
+ }
+ }
+ jump_state[0] = s0;
+ jump_state[1] = s1;
+}
+
+/*
+ * xoroshiro128+ - XOR/rotate/shift/rotate
+ * xorshift.di.unimi.it
+ */
+uint64_t
+dtrace_xoroshiro128_plus_next(uint64_t * const state)
+{
+ const uint64_t s0 = state[0];
+ uint64_t s1 = state[1];
+ uint64_t result;
+ result = s0 + s1;
+
+ s1 ^= s0;
+ state[0] = rotl(s0, 55) ^ s1 ^ (s1 << 14);
+ state[1] = rotl(s1, 36);
+
+	return (result);
+}
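+
+/*
+ * Editor's sketch (not part of the original file): one way a caller can use
+ * the jump function to carve the generator's period into non-overlapping
+ * per-worker streams.  The function name and seed values are hypothetical.
+ */
+#if 0
+static void
+example_split_streams(uint64_t streams[4][2])
+{
+	/* Any nonzero seed will do; the state must not be all zeros. */
+	uint64_t state[2] = { 0x9e3779b97f4a7c15ULL, 0xbf58476d1ce4e5b9ULL };
+	int i;
+
+	for (i = 0; i < 4; i++) {
+		streams[i][0] = state[0];
+		streams[i][1] = state[1];
+		/*
+		 * Passing the same array for both arguments replaces the
+		 * state with one advanced by 2^64 steps, so stream i + 1
+		 * cannot overlap the first 2^64 draws of stream i.
+		 */
+		dtrace_xoroshiro128_plus_jump(state, state);
+	}
+	/* Worker i then draws via dtrace_xoroshiro128_plus_next(streams[i]). */
+}
+#endif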
diff --git a/sys/cddl/contrib/opensolaris/uts/common/dtrace/dtrace_xoroshiro128_plus.h b/sys/cddl/contrib/opensolaris/uts/common/dtrace/dtrace_xoroshiro128_plus.h
new file mode 100644
index 000000000000..73b17226428f
--- /dev/null
+++ b/sys/cddl/contrib/opensolaris/uts/common/dtrace/dtrace_xoroshiro128_plus.h
@@ -0,0 +1,41 @@
+/*-
+ * Copyright (c) 2016 (Graeme Jenkinson)
+ * All rights reserved.
+ *
+ * This software was developed by BAE Systems, the University of Cambridge
+ * Computer Laboratory, and Memorial University under DARPA/AFRL contract
+ * FA8650-15-C-7558 ("CADETS"), as part of the DARPA Transparent Computing
+ * (TC) research program.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ * 1. Redistributions of source code must retain the above copyright
+ * notice, this list of conditions and the following disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in the
+ * documentation and/or other materials provided with the distribution.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND
+ * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
+ * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
+ * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
+ * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
+ * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
+ * SUCH DAMAGE.
+ *
+ */
+
+#ifndef _DTRACE_XOROSHIRO128_PLUS_H
+#define _DTRACE_XOROSHIRO128_PLUS_H
+
+#include <sys/types.h>
+
+void dtrace_xoroshiro128_plus_jump(uint64_t * const, uint64_t * const);
+uint64_t dtrace_xoroshiro128_plus_next(uint64_t * const);
+
+#endif
diff --git a/sys/cddl/contrib/opensolaris/uts/common/dtrace/fasttrap.c b/sys/cddl/contrib/opensolaris/uts/common/dtrace/fasttrap.c
new file mode 100644
index 000000000000..d5be43f0c3d1
--- /dev/null
+++ b/sys/cddl/contrib/opensolaris/uts/common/dtrace/fasttrap.c
@@ -0,0 +1,2663 @@
+/*
+ * CDDL HEADER START
+ *
+ * The contents of this file are subject to the terms of the
+ * Common Development and Distribution License (the "License").
+ * You may not use this file except in compliance with the License.
+ *
+ * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
+ * or http://www.opensolaris.org/os/licensing.
+ * See the License for the specific language governing permissions
+ * and limitations under the License.
+ *
+ * When distributing Covered Code, include this CDDL HEADER in each
+ * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
+ * If applicable, add the following below this CDDL HEADER, with the
+ * fields enclosed by brackets "[]" replaced with your own identifying
+ * information: Portions Copyright [yyyy] [name of copyright owner]
+ *
+ * CDDL HEADER END
+ *
+ * Portions Copyright 2010 The FreeBSD Foundation
+ *
+ * $FreeBSD$
+ */
+
+/*
+ * Copyright 2008 Sun Microsystems, Inc. All rights reserved.
+ * Use is subject to license terms.
+ */
+
+/*
+ * Copyright (c) 2015, Joyent, Inc. All rights reserved.
+ */
+
+#include <sys/atomic.h>
+#include <sys/errno.h>
+#include <sys/stat.h>
+#include <sys/modctl.h>
+#include <sys/conf.h>
+#include <sys/systm.h>
+#ifdef illumos
+#include <sys/ddi.h>
+#endif
+#include <sys/sunddi.h>
+#include <sys/cpuvar.h>
+#include <sys/kmem.h>
+#ifdef illumos
+#include <sys/strsubr.h>
+#endif
+#include <sys/fasttrap.h>
+#include <sys/fasttrap_impl.h>
+#include <sys/fasttrap_isa.h>
+#include <sys/dtrace.h>
+#include <sys/dtrace_impl.h>
+#include <sys/sysmacros.h>
+#include <sys/proc.h>
+#include <sys/policy.h>
+#ifdef illumos
+#include <util/qsort.h>
+#endif
+#include <sys/mutex.h>
+#include <sys/kernel.h>
+#ifndef illumos
+#include <sys/dtrace_bsd.h>
+#include <sys/eventhandler.h>
+#include <sys/rmlock.h>
+#include <sys/sysent.h>
+#include <sys/sysctl.h>
+#include <sys/u8_textprep.h>
+#include <sys/user.h>
+
+#include <vm/vm.h>
+#include <vm/pmap.h>
+#include <vm/vm_map.h>
+#include <vm/vm_param.h>
+
+#include <cddl/dev/dtrace/dtrace_cddl.h>
+#endif
+
+/*
+ * User-Land Trap-Based Tracing
+ * ----------------------------
+ *
+ * The fasttrap provider allows DTrace consumers to instrument any user-level
+ * instruction to gather data; this includes probes with semantic
+ * signifigance like entry and return as well as simple offsets into the
+ * function. While the specific techniques used are very ISA specific, the
+ * methodology is generalizable to any architecture.
+ *
+ *
+ * The General Methodology
+ * -----------------------
+ *
+ * With the primary goal of tracing every user-land instruction and the
+ * limitation that we can't trust user space and so don't want to rely on much
+ * information there, we begin by replacing the instructions we want to trace
+ * with trap instructions. Each instruction we overwrite is saved into a hash
+ * table keyed by process ID and pc address. When we enter the kernel due to
+ * this trap instruction, we need the effects of the replaced instruction to
+ * appear to have occurred before we proceed with the user thread's
+ * execution.
+ *
+ * Each user level thread is represented by a ulwp_t structure which is
+ * always easily accessible through a register. The most basic way to produce
+ * the effects of the instruction we replaced is to copy that instruction out
+ * to a bit of scratch space reserved in the user thread's ulwp_t structure
+ * (a sort of kernel-private thread local storage), set the PC to that
+ * scratch space and single step. When we reenter the kernel after single
+ * stepping the instruction we must then adjust the PC to point to what would
+ * normally be the next instruction. Of course, special care must be taken
+ * for branches and jumps, but these represent such a small fraction of any
+ * instruction set that writing the code to emulate these in the kernel is
+ * not too difficult.
+ *
+ * Return probes may require several tracepoints to trace every return site,
+ * and, conversely, each tracepoint may activate several probes (the entry
+ * and offset 0 probes, for example). To solve this multiplexing problem,
+ * tracepoints contain lists of probes to activate and probes contain lists
+ * of tracepoints to enable. If a probe is activated, it adds its ID to
+ * existing tracepoints or creates new ones as necessary.
+ *
+ * Most probes are activated _before_ the instruction is executed, but return
+ * probes are activated _after_ the effects of the last instruction of the
+ * function are visible. Return probes must be fired _after_ we have
+ * single-stepped the instruction whereas all other probes are fired
+ * beforehand.
+ *
+ *
+ * Lock Ordering
+ * -------------
+ *
+ * The lock ordering below -- both internally and with respect to the DTrace
+ * framework -- is a little tricky and bears some explanation. Each provider
+ * has a lock (ftp_mtx) that protects its members including reference counts
+ * for enabled probes (ftp_rcount), consumers actively creating probes
+ * (ftp_ccount) and USDT consumers (ftp_mcount); all three prevent a provider
+ * from being freed. A provider is looked up by taking the bucket lock for the
+ * provider hash table, and is returned with its lock held. The provider lock
+ * may be taken in functions invoked by the DTrace framework, but may not be
+ * held while calling functions in the DTrace framework.
+ *
+ * To ensure consistency over multiple calls to the DTrace framework, the
+ * creation lock (ftp_cmtx) should be held. Naturally, the creation lock may
+ * not be taken when holding the provider lock as that would create a cyclic
+ * lock ordering. In situations where one would naturally take the provider
+ * lock and then the creation lock, we instead up a reference count to prevent
+ * the provider from disappearing, drop the provider lock, and acquire the
+ * creation lock.
+ *
+ * Briefly:
+ * bucket lock before provider lock
+ * DTrace before provider lock
+ * creation lock before DTrace
+ * never hold the provider lock and creation lock simultaneously
+ */
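+
+/*
+ * Editor's sketch (not part of the original source): the reference-count
+ * dance described above, in outline.  fasttrap_provider_lookup() returns
+ * with the provider lock (ftp_mtx) held; the elided section stands for
+ * probe-creation calls into the DTrace framework.
+ */
+#if 0
+	fasttrap_provider_t *fp;
+
+	/* Bucket lock before provider lock; returns with ftp_mtx held. */
+	fp = fasttrap_provider_lookup(pid, name, pattr);
+
+	/*
+	 * We may not call into DTrace with ftp_mtx held, and may not hold
+	 * ftp_mtx and ftp_cmtx simultaneously, so pin the provider with a
+	 * creation reference and drop the provider lock first.
+	 */
+	fp->ftp_ccount++;
+	mutex_exit(&fp->ftp_mtx);
+
+	mutex_enter(&fp->ftp_cmtx);	/* creation lock before DTrace */
+	/* ... dtrace_probe_create() calls ... */
+	mutex_exit(&fp->ftp_cmtx);
+
+	mutex_enter(&fp->ftp_mtx);
+	fp->ftp_ccount--;
+	mutex_exit(&fp->ftp_mtx);
+#endif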
+
+static d_open_t fasttrap_open;
+static d_ioctl_t fasttrap_ioctl;
+
+static struct cdevsw fasttrap_cdevsw = {
+ .d_version = D_VERSION,
+ .d_open = fasttrap_open,
+ .d_ioctl = fasttrap_ioctl,
+ .d_name = "fasttrap",
+};
+static struct cdev *fasttrap_cdev;
+static dtrace_meta_provider_id_t fasttrap_meta_id;
+
+static struct proc *fasttrap_cleanup_proc;
+static struct mtx fasttrap_cleanup_mtx;
+static uint_t fasttrap_cleanup_work, fasttrap_cleanup_drain, fasttrap_cleanup_cv;
+
+/*
+ * Generation count on modifications to the global tracepoint lookup table.
+ */
+static volatile uint64_t fasttrap_mod_gen;
+
+/*
+ * When the fasttrap provider is loaded, fasttrap_max is set to either
+ * FASTTRAP_MAX_DEFAULT, or the value for fasttrap-max-probes in the
+ * fasttrap.conf file (Illumos), or the value provided in loader.conf (FreeBSD).
+ * Each time a probe is created, fasttrap_total is incremented by the number
+ * of tracepoints that may be associated with that probe; fasttrap_total is capped
+ * at fasttrap_max.
+ */
+#define FASTTRAP_MAX_DEFAULT 250000
+static uint32_t fasttrap_max = FASTTRAP_MAX_DEFAULT;
+static uint32_t fasttrap_total;
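+
+/*
+ * Editor's sketch (not part of the original source): the shape of the cap
+ * check in the probe-creation path, where ntps is the number of tracepoints
+ * the new probe would add.
+ */
+#if 0
+	atomic_add_32(&fasttrap_total, ntps);
+	if (fasttrap_total > fasttrap_max) {
+		/* Over the cap: back the increment out and fail. */
+		atomic_add_32(&fasttrap_total, -ntps);
+		return (ENOMEM);
+	}
+#endif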
+
+#define FASTTRAP_TPOINTS_DEFAULT_SIZE 0x4000
+#define FASTTRAP_PROVIDERS_DEFAULT_SIZE 0x100
+#define FASTTRAP_PROCS_DEFAULT_SIZE 0x100
+
+#define FASTTRAP_PID_NAME "pid"
+
+fasttrap_hash_t fasttrap_tpoints;
+static fasttrap_hash_t fasttrap_provs;
+static fasttrap_hash_t fasttrap_procs;
+
+static uint64_t fasttrap_pid_count; /* pid ref count */
+static kmutex_t fasttrap_count_mtx; /* lock on ref count */
+
+#define FASTTRAP_ENABLE_FAIL 1
+#define FASTTRAP_ENABLE_PARTIAL 2
+
+static int fasttrap_tracepoint_enable(proc_t *, fasttrap_probe_t *, uint_t);
+static void fasttrap_tracepoint_disable(proc_t *, fasttrap_probe_t *, uint_t);
+
+static fasttrap_provider_t *fasttrap_provider_lookup(pid_t, const char *,
+ const dtrace_pattr_t *);
+static void fasttrap_provider_retire(pid_t, const char *, int);
+static void fasttrap_provider_free(fasttrap_provider_t *);
+
+static fasttrap_proc_t *fasttrap_proc_lookup(pid_t);
+static void fasttrap_proc_release(fasttrap_proc_t *);
+
+#ifndef illumos
+static void fasttrap_thread_dtor(void *, struct thread *);
+#endif
+
+#define FASTTRAP_PROVS_INDEX(pid, name) \
+ ((fasttrap_hash_str(name) + (pid)) & fasttrap_provs.fth_mask)
+
+#define FASTTRAP_PROCS_INDEX(pid) ((pid) & fasttrap_procs.fth_mask)
+
+#ifndef illumos
+struct rmlock fasttrap_tp_lock;
+static eventhandler_tag fasttrap_thread_dtor_tag;
+#endif
+
+static unsigned long tpoints_hash_size = FASTTRAP_TPOINTS_DEFAULT_SIZE;
+
+#ifdef __FreeBSD__
+SYSCTL_DECL(_kern_dtrace);
+SYSCTL_NODE(_kern_dtrace, OID_AUTO, fasttrap, CTLFLAG_RD | CTLFLAG_MPSAFE, 0,
+ "DTrace fasttrap parameters");
+SYSCTL_UINT(_kern_dtrace_fasttrap, OID_AUTO, max_probes, CTLFLAG_RWTUN, &fasttrap_max,
+ FASTTRAP_MAX_DEFAULT, "Maximum number of fasttrap probes");
+SYSCTL_ULONG(_kern_dtrace_fasttrap, OID_AUTO, tpoints_hash_size, CTLFLAG_RDTUN, &tpoints_hash_size,
+ FASTTRAP_TPOINTS_DEFAULT_SIZE, "Size of the tracepoint hash table");
+#endif
+
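+/*
+ * Return the 1-based index of the highest bit set in i, or 0 if i is zero.
+ */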
+static int
+fasttrap_highbit(ulong_t i)
+{
+ int h = 1;
+
+ if (i == 0)
+ return (0);
+#ifdef _LP64
+ if (i & 0xffffffff00000000ul) {
+ h += 32; i >>= 32;
+ }
+#endif
+ if (i & 0xffff0000) {
+ h += 16; i >>= 16;
+ }
+ if (i & 0xff00) {
+ h += 8; i >>= 8;
+ }
+ if (i & 0xf0) {
+ h += 4; i >>= 4;
+ }
+ if (i & 0xc) {
+ h += 2; i >>= 2;
+ }
+ if (i & 0x2) {
+ h += 1;
+ }
+ return (h);
+}
+
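+/*
+ * Classic PJW/ELF-style string hash; used with FASTTRAP_PROVS_INDEX() to
+ * index the provider hash table.
+ */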
+static uint_t
+fasttrap_hash_str(const char *p)
+{
+ unsigned int g;
+ uint_t hval = 0;
+
+ while (*p) {
+ hval = (hval << 4) + *p++;
+ if ((g = (hval & 0xf0000000)) != 0)
+ hval ^= g >> 24;
+ hval &= ~g;
+ }
+ return (hval);
+}
+
+void
+fasttrap_sigtrap(proc_t *p, kthread_t *t, uintptr_t pc)
+{
+ ksiginfo_t ksi;
+
+ ksiginfo_init(&ksi);
+ ksi.ksi_signo = SIGTRAP;
+ ksi.ksi_code = TRAP_DTRACE;
+ ksi.ksi_addr = (caddr_t)pc;
+ PROC_LOCK(p);
+ (void)tdsendsignal(p, t, SIGTRAP, &ksi);
+ PROC_UNLOCK(p);
+}
+
+#ifndef illumos
+/*
+ * Obtain a chunk of scratch space in the address space of the target process.
+ */
+fasttrap_scrspace_t *
+fasttrap_scraddr(struct thread *td, fasttrap_proc_t *fprc)
+{
+ fasttrap_scrblock_t *scrblk;
+ fasttrap_scrspace_t *scrspc;
+ struct proc *p;
+ vm_offset_t addr;
+ int error, i;
+
+ scrspc = NULL;
+ if (td->t_dtrace_sscr != NULL) {
+ /* If the thread already has scratch space, we're done. */
+ scrspc = (fasttrap_scrspace_t *)td->t_dtrace_sscr;
+ return (scrspc);
+ }
+
+ p = td->td_proc;
+
+ mutex_enter(&fprc->ftpc_mtx);
+ if (LIST_EMPTY(&fprc->ftpc_fscr)) {
+ /*
+ * No scratch space is available, so we'll map a new scratch
+ * space block into the traced process' address space.
+ */
+ addr = 0;
+ error = vm_map_find(&p->p_vmspace->vm_map, NULL, 0, &addr,
+ FASTTRAP_SCRBLOCK_SIZE, 0, VMFS_ANY_SPACE, VM_PROT_ALL,
+ VM_PROT_ALL, 0);
+ if (error != KERN_SUCCESS)
+ goto done;
+
+ scrblk = malloc(sizeof(*scrblk), M_SOLARIS, M_WAITOK);
+ scrblk->ftsb_addr = addr;
+ LIST_INSERT_HEAD(&fprc->ftpc_scrblks, scrblk, ftsb_next);
+
+ /*
+ * Carve the block up into chunks and put them on the free list.
+ */
+ for (i = 0;
+ i < FASTTRAP_SCRBLOCK_SIZE / FASTTRAP_SCRSPACE_SIZE; i++) {
+ scrspc = malloc(sizeof(*scrspc), M_SOLARIS, M_WAITOK);
+ scrspc->ftss_addr = addr +
+ i * FASTTRAP_SCRSPACE_SIZE;
+ LIST_INSERT_HEAD(&fprc->ftpc_fscr, scrspc,
+ ftss_next);
+ }
+ }
+
+ /*
+ * Take the first scratch chunk off the free list, put it on the
+ * allocated list, and return its address.
+ */
+ scrspc = LIST_FIRST(&fprc->ftpc_fscr);
+ LIST_REMOVE(scrspc, ftss_next);
+ LIST_INSERT_HEAD(&fprc->ftpc_ascr, scrspc, ftss_next);
+
+ /*
+ * This scratch space is reserved for use by td until the thread exits.
+ */
+ td->t_dtrace_sscr = scrspc;
+
+done:
+ mutex_exit(&fprc->ftpc_mtx);
+
+ return (scrspc);
+}
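+
+/*
+ * Illustrative arithmetic with hypothetical constants (the real values are
+ * defined in the fasttrap headers): if FASTTRAP_SCRBLOCK_SIZE were 64 KB and
+ * FASTTRAP_SCRSPACE_SIZE 64 bytes, the single vm_map_find() call above would
+ * be carved into 1024 scratch chunks for the free list.
+ */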
+
+/*
+ * Return any allocated per-thread scratch space chunks back to the process'
+ * free list.
+ */
+static void
+fasttrap_thread_dtor(void *arg __unused, struct thread *td)
+{
+ fasttrap_bucket_t *bucket;
+ fasttrap_proc_t *fprc;
+ fasttrap_scrspace_t *scrspc;
+ pid_t pid;
+
+ if (td->t_dtrace_sscr == NULL)
+ return;
+
+ pid = td->td_proc->p_pid;
+ bucket = &fasttrap_procs.fth_table[FASTTRAP_PROCS_INDEX(pid)];
+ fprc = NULL;
+
+ /* Look up the fasttrap process handle for this process. */
+ mutex_enter(&bucket->ftb_mtx);
+ for (fprc = bucket->ftb_data; fprc != NULL; fprc = fprc->ftpc_next) {
+ if (fprc->ftpc_pid == pid) {
+ mutex_enter(&fprc->ftpc_mtx);
+ mutex_exit(&bucket->ftb_mtx);
+ break;
+ }
+ }
+ if (fprc == NULL) {
+ mutex_exit(&bucket->ftb_mtx);
+ return;
+ }
+
+ scrspc = (fasttrap_scrspace_t *)td->t_dtrace_sscr;
+ LIST_REMOVE(scrspc, ftss_next);
+ LIST_INSERT_HEAD(&fprc->ftpc_fscr, scrspc, ftss_next);
+
+ mutex_exit(&fprc->ftpc_mtx);
+}
+#endif
+
+/*
+ * This function ensures that no threads are actively using the memory
+ * associated with probes that were formerly live.
+ */
+static void
+fasttrap_mod_barrier(uint64_t gen)
+{
+ int i;
+
+ if (gen < fasttrap_mod_gen)
+ return;
+
+ fasttrap_mod_gen++;
+
+#ifdef illumos
+ CPU_FOREACH(i) {
+ mutex_enter(&fasttrap_cpuc_pid_lock[i]);
+ mutex_exit(&fasttrap_cpuc_pid_lock[i]);
+ }
+#else
+ rm_wlock(&fasttrap_tp_lock);
+ rm_wunlock(&fasttrap_tp_lock);
+#endif
+}
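+
+/*
+ * Design note (not in the original source): on FreeBSD the write-lock pulse
+ * above is the entire barrier. rm_wlock() cannot return while any tracing
+ * thread still holds the read lock, so by the time we drop it again no CPU
+ * can be dereferencing probe memory from an earlier generation.
+ */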
+
+/*
+ * This function performs asynchronous cleanup of fasttrap providers. The
+ * Solaris implementation of this mechanism uses a timeout that's activated in
+ * fasttrap_pid_cleanup(), but this doesn't work in FreeBSD: the cleanup code
+ * may sleep while holding the DTrace mutexes, yet it is unsafe to sleep in a
+ * callout handler. Thus we use a dedicated process to perform the cleanup
+ * when requested.
+ */
+/*ARGSUSED*/
+static void
+fasttrap_pid_cleanup_cb(void *data)
+{
+ fasttrap_provider_t **fpp, *fp;
+ fasttrap_bucket_t *bucket;
+ dtrace_provider_id_t provid;
+ int i, later = 0, rval;
+
+ mtx_lock(&fasttrap_cleanup_mtx);
+ while (!fasttrap_cleanup_drain || later > 0) {
+ fasttrap_cleanup_work = 0;
+ mtx_unlock(&fasttrap_cleanup_mtx);
+
+ later = 0;
+
+ /*
+ * Iterate over all the providers trying to remove the marked
+ * ones. If a provider is marked but not retired, we just
+ * have to take a crack at removing it -- it's no big deal if
+ * we can't.
+ */
+ for (i = 0; i < fasttrap_provs.fth_nent; i++) {
+ bucket = &fasttrap_provs.fth_table[i];
+ mutex_enter(&bucket->ftb_mtx);
+ fpp = (fasttrap_provider_t **)&bucket->ftb_data;
+
+ while ((fp = *fpp) != NULL) {
+ if (!fp->ftp_marked) {
+ fpp = &fp->ftp_next;
+ continue;
+ }
+
+ mutex_enter(&fp->ftp_mtx);
+
+ /*
+ * If this provider has consumers actively
+ * creating probes (ftp_ccount) or is a USDT
+ * provider (ftp_mcount), we can't unregister
+ * or even condense.
+ */
+ if (fp->ftp_ccount != 0 ||
+ fp->ftp_mcount != 0) {
+ mutex_exit(&fp->ftp_mtx);
+ fp->ftp_marked = 0;
+ continue;
+ }
+
+ if (!fp->ftp_retired || fp->ftp_rcount != 0)
+ fp->ftp_marked = 0;
+
+ mutex_exit(&fp->ftp_mtx);
+
+ /*
+ * If we successfully unregister this
+ * provider we can remove it from the hash
+ * chain and free the memory. If our attempt
+ * to unregister fails and this is a retired
+ * provider, increment our flag to try again
+ * pretty soon. If we've consumed more than
+ * half of our total permitted number of
+ * probes, call dtrace_condense() to try to
+ * clean out the unenabled probes.
+ */
+ provid = fp->ftp_provid;
+ if ((rval = dtrace_unregister(provid)) != 0) {
+ if (fasttrap_total > fasttrap_max / 2)
+ (void) dtrace_condense(provid);
+
+ if (rval == EAGAIN)
+ fp->ftp_marked = 1;
+
+ later += fp->ftp_marked;
+ fpp = &fp->ftp_next;
+ } else {
+ *fpp = fp->ftp_next;
+ fasttrap_provider_free(fp);
+ }
+ }
+ mutex_exit(&bucket->ftb_mtx);
+ }
+ mtx_lock(&fasttrap_cleanup_mtx);
+
+ /*
+ * If we were unable to retire a provider, try again after a
+ * second. This situation can occur in certain circumstances
+ * where providers cannot be unregistered even though they have
+ * no probes enabled because of an execution of dtrace -l or
+ * something similar.
+ */
+ if (later > 0 || fasttrap_cleanup_work ||
+ fasttrap_cleanup_drain) {
+ mtx_unlock(&fasttrap_cleanup_mtx);
+ pause("ftclean", hz);
+ mtx_lock(&fasttrap_cleanup_mtx);
+ } else
+ mtx_sleep(&fasttrap_cleanup_cv, &fasttrap_cleanup_mtx,
+ 0, "ftcl", 0);
+ }
+
+ /*
+ * Wake up the thread in fasttrap_unload() now that we're done.
+ */
+ wakeup(&fasttrap_cleanup_drain);
+ mtx_unlock(&fasttrap_cleanup_mtx);
+
+ kthread_exit();
+}
+
+/*
+ * Activates the asynchronous cleanup mechanism.
+ */
+static void
+fasttrap_pid_cleanup(void)
+{
+
+ mtx_lock(&fasttrap_cleanup_mtx);
+ if (!fasttrap_cleanup_work) {
+ fasttrap_cleanup_work = 1;
+ wakeup(&fasttrap_cleanup_cv);
+ }
+ mtx_unlock(&fasttrap_cleanup_mtx);
+}
+
+/*
+ * This is called from cfork() via dtrace_fasttrap_fork(). The child
+ * process's address space is (roughly) a copy of the parent process's so
+ * we have to remove all the instrumentation we had previously enabled in the
+ * parent.
+ */
+static void
+fasttrap_fork(proc_t *p, proc_t *cp)
+{
+#ifndef illumos
+ fasttrap_scrblock_t *scrblk;
+ fasttrap_proc_t *fprc = NULL;
+#endif
+ pid_t ppid = p->p_pid;
+ int i;
+
+ ASSERT(curproc == p);
+#ifdef illumos
+ ASSERT(p->p_proc_flag & P_PR_LOCK);
+#else
+ PROC_LOCK_ASSERT(p, MA_OWNED);
+#endif
+#ifdef illumos
+ ASSERT(p->p_dtrace_count > 0);
+#else
+ /*
+ * This check is purposely here instead of in kern_fork.c because,
+ * for legal reasons, we cannot include the dtrace_cddl.h header
+ * inside kern_fork.c and insert an if-clause there.
+ */
+ if (p->p_dtrace_count == 0 && p->p_dtrace_helpers == NULL)
+ return;
+#endif
+
+ ASSERT(cp->p_dtrace_count == 0);
+
+ /*
+ * This would be simpler and faster if we maintained per-process
+ * hash tables of enabled tracepoints. It could, however, potentially
+ * slow down execution of a tracepoint since we'd need to go
+ * through two levels of indirection. In the future, we should
+ * consider either maintaining per-process ancillary lists of
+ * enabled tracepoints or hanging a pointer to a per-process hash
+ * table of enabled tracepoints off the proc structure.
+ */
+
+ /*
+ * We don't have to worry about the child process disappearing
+ * because we're in fork().
+ */
+#ifdef illumos
+ mtx_lock_spin(&cp->p_slock);
+ sprlock_proc(cp);
+ mtx_unlock_spin(&cp->p_slock);
+#else
+ /*
+ * fasttrap_tracepoint_remove() expects the child process to be
+ * unlocked and the VM then expects curproc to be unlocked.
+ */
+ _PHOLD(cp);
+ PROC_UNLOCK(cp);
+ PROC_UNLOCK(p);
+ if (p->p_dtrace_count == 0)
+ goto dup_helpers;
+#endif
+
+ /*
+ * Iterate over every tracepoint looking for ones that belong to the
+ * parent process, and remove each from the child process.
+ */
+ for (i = 0; i < fasttrap_tpoints.fth_nent; i++) {
+ fasttrap_tracepoint_t *tp;
+ fasttrap_bucket_t *bucket = &fasttrap_tpoints.fth_table[i];
+
+ mutex_enter(&bucket->ftb_mtx);
+ for (tp = bucket->ftb_data; tp != NULL; tp = tp->ftt_next) {
+ if (tp->ftt_pid == ppid &&
+ tp->ftt_proc->ftpc_acount != 0) {
+ int ret = fasttrap_tracepoint_remove(cp, tp);
+ ASSERT(ret == 0);
+
+ /*
+ * The count of active providers can only be
+ * decremented (i.e. to zero) during exec,
+ * exit, and removal of a meta provider so it
+ * should be impossible to drop the count
+ * mid-fork.
+ */
+ ASSERT(tp->ftt_proc->ftpc_acount != 0);
+#ifndef illumos
+ fprc = tp->ftt_proc;
+#endif
+ }
+ }
+ mutex_exit(&bucket->ftb_mtx);
+
+#ifndef illumos
+ /*
+ * Unmap any scratch space inherited from the parent's address
+ * space.
+ */
+ if (fprc != NULL) {
+ mutex_enter(&fprc->ftpc_mtx);
+ LIST_FOREACH(scrblk, &fprc->ftpc_scrblks, ftsb_next) {
+ vm_map_remove(&cp->p_vmspace->vm_map,
+ scrblk->ftsb_addr,
+ scrblk->ftsb_addr + FASTTRAP_SCRBLOCK_SIZE);
+ }
+ mutex_exit(&fprc->ftpc_mtx);
+ }
+#endif
+ }
+
+#ifdef illumos
+ mutex_enter(&cp->p_lock);
+ sprunlock(cp);
+#else
+dup_helpers:
+ if (p->p_dtrace_helpers != NULL)
+ dtrace_helpers_duplicate(p, cp);
+ PROC_LOCK(p);
+ PROC_LOCK(cp);
+ _PRELE(cp);
+#endif
+}
+
+/*
+ * This is called from proc_exit() or from exec_common() if p_dtrace_probes
+ * is set on the proc structure to indicate that there is a pid provider
+ * associated with this process.
+ */
+static void
+fasttrap_exec_exit(proc_t *p)
+{
+#ifndef illumos
+ struct thread *td;
+#endif
+
+#ifdef illumos
+ ASSERT(p == curproc);
+#else
+ PROC_LOCK_ASSERT(p, MA_OWNED);
+ _PHOLD(p);
+ /*
+ * Since struct threads may be recycled, we cannot rely on the
+ * t_dtrace_sscr field being zeroed by kdtrace_thread_ctor. Thus we
+ * must zero it ourselves when a process exits.
+ */
+ FOREACH_THREAD_IN_PROC(p, td)
+ td->t_dtrace_sscr = NULL;
+ PROC_UNLOCK(p);
+#endif
+
+ /*
+ * We clean up the pid provider for this process here; user-land
+ * static probes are handled by the meta-provider remove entry point.
+ */
+ fasttrap_provider_retire(p->p_pid, FASTTRAP_PID_NAME, 0);
+#ifndef illumos
+ if (p->p_dtrace_helpers)
+ dtrace_helpers_destroy(p);
+ PROC_LOCK(p);
+ _PRELE(p);
+#endif
+}
+
+/*ARGSUSED*/
+static void
+fasttrap_pid_provide(void *arg, dtrace_probedesc_t *desc)
+{
+ /*
+ * There are no "default" pid probes.
+ */
+}
+
+static int
+fasttrap_tracepoint_enable(proc_t *p, fasttrap_probe_t *probe, uint_t index)
+{
+ fasttrap_tracepoint_t *tp, *new_tp = NULL;
+ fasttrap_bucket_t *bucket;
+ fasttrap_id_t *id;
+ pid_t pid;
+ uintptr_t pc;
+
+ ASSERT(index < probe->ftp_ntps);
+
+ pid = probe->ftp_pid;
+ pc = probe->ftp_tps[index].fit_tp->ftt_pc;
+ id = &probe->ftp_tps[index].fit_id;
+
+ ASSERT(probe->ftp_tps[index].fit_tp->ftt_pid == pid);
+
+#ifdef illumos
+ ASSERT(!(p->p_flag & SVFORK));
+#endif
+
+ /*
+ * Before we make any modifications, make sure we've imposed a barrier
+ * on the generation in which this probe was last modified.
+ */
+ fasttrap_mod_barrier(probe->ftp_gen);
+
+ bucket = &fasttrap_tpoints.fth_table[FASTTRAP_TPOINTS_INDEX(pid, pc)];
+
+ /*
+ * If the tracepoint has already been enabled, just add our id to the
+ * list of interested probes. This may be our second time through
+ * this path in which case we'll have constructed the tracepoint we'd
+ * like to install. If we can't find a match, and have an allocated
+ * tracepoint ready to go, enable that one now.
+ *
+ * A tracepoint whose process is defunct is also considered defunct.
+ */
+again:
+ mutex_enter(&bucket->ftb_mtx);
+ for (tp = bucket->ftb_data; tp != NULL; tp = tp->ftt_next) {
+ /*
+ * Note that it's safe to access the active count on the
+ * associated proc structure because we know that at least one
+ * provider (this one) will still be around throughout this
+ * operation.
+ */
+ if (tp->ftt_pid != pid || tp->ftt_pc != pc ||
+ tp->ftt_proc->ftpc_acount == 0)
+ continue;
+
+ /*
+ * Now that we've found a matching tracepoint, it would be
+ * a decent idea to confirm that the tracepoint is still
+ * enabled and the trap instruction hasn't been overwritten.
+ * Since this is a little hairy, we'll punt for now.
+ */
+
+ /*
+ * This can't be the first interested probe. We don't have
+ * to worry about another thread being in the midst of
+ * deleting this tracepoint (which would be the only valid
+ * reason for a tracepoint to have no interested probes)
+ * since we're holding P_PR_LOCK for this process.
+ */
+ ASSERT(tp->ftt_ids != NULL || tp->ftt_retids != NULL);
+
+ switch (id->fti_ptype) {
+ case DTFTP_ENTRY:
+ case DTFTP_OFFSETS:
+ case DTFTP_IS_ENABLED:
+ id->fti_next = tp->ftt_ids;
+ membar_producer();
+ tp->ftt_ids = id;
+ membar_producer();
+ break;
+
+ case DTFTP_RETURN:
+ case DTFTP_POST_OFFSETS:
+ id->fti_next = tp->ftt_retids;
+ membar_producer();
+ tp->ftt_retids = id;
+ membar_producer();
+ break;
+
+ default:
+ ASSERT(0);
+ }
+
+ mutex_exit(&bucket->ftb_mtx);
+
+ if (new_tp != NULL) {
+ new_tp->ftt_ids = NULL;
+ new_tp->ftt_retids = NULL;
+ }
+
+ return (0);
+ }
+
+ /*
+ * If we have a good tracepoint ready to go, install it now while
+ * we have the lock held and no one can screw with us.
+ */
+ if (new_tp != NULL) {
+ int rc = 0;
+
+ new_tp->ftt_next = bucket->ftb_data;
+ membar_producer();
+ bucket->ftb_data = new_tp;
+ membar_producer();
+ mutex_exit(&bucket->ftb_mtx);
+
+ /*
+ * Activate the tracepoint in the ISA-specific manner.
+ * If this fails, we need to report the failure, but
+ * indicate that this tracepoint must still be disabled
+ * by calling fasttrap_tracepoint_disable().
+ */
+ if (fasttrap_tracepoint_install(p, new_tp) != 0)
+ rc = FASTTRAP_ENABLE_PARTIAL;
+
+ /*
+ * Increment the count of the number of tracepoints active in
+ * the victim process.
+ */
+#ifdef illumos
+ ASSERT(p->p_proc_flag & P_PR_LOCK);
+#endif
+ p->p_dtrace_count++;
+
+ return (rc);
+ }
+
+ mutex_exit(&bucket->ftb_mtx);
+
+ /*
+ * Initialize the tracepoint that's been preallocated with the probe.
+ */
+ new_tp = probe->ftp_tps[index].fit_tp;
+
+ ASSERT(new_tp->ftt_pid == pid);
+ ASSERT(new_tp->ftt_pc == pc);
+ ASSERT(new_tp->ftt_proc == probe->ftp_prov->ftp_proc);
+ ASSERT(new_tp->ftt_ids == NULL);
+ ASSERT(new_tp->ftt_retids == NULL);
+
+ switch (id->fti_ptype) {
+ case DTFTP_ENTRY:
+ case DTFTP_OFFSETS:
+ case DTFTP_IS_ENABLED:
+ id->fti_next = NULL;
+ new_tp->ftt_ids = id;
+ break;
+
+ case DTFTP_RETURN:
+ case DTFTP_POST_OFFSETS:
+ id->fti_next = NULL;
+ new_tp->ftt_retids = id;
+ break;
+
+ default:
+ ASSERT(0);
+ }
+
+#ifdef __FreeBSD__
+ if (SV_PROC_FLAG(p, SV_LP64))
+ p->p_model = DATAMODEL_LP64;
+ else
+ p->p_model = DATAMODEL_ILP32;
+#endif
+
+ /*
+ * If the ISA-dependent initialization goes to plan, go back to the
+ * beginning and try to install this freshly made tracepoint.
+ */
+ if (fasttrap_tracepoint_init(p, new_tp, pc, id->fti_ptype) == 0)
+ goto again;
+
+ new_tp->ftt_ids = NULL;
+ new_tp->ftt_retids = NULL;
+
+ return (FASTTRAP_ENABLE_FAIL);
+}
+
+static void
+fasttrap_tracepoint_disable(proc_t *p, fasttrap_probe_t *probe, uint_t index)
+{
+ fasttrap_bucket_t *bucket;
+ fasttrap_provider_t *provider = probe->ftp_prov;
+ fasttrap_tracepoint_t **pp, *tp;
+ fasttrap_id_t *id, **idp = NULL;
+ pid_t pid;
+ uintptr_t pc;
+
+ ASSERT(index < probe->ftp_ntps);
+
+ pid = probe->ftp_pid;
+ pc = probe->ftp_tps[index].fit_tp->ftt_pc;
+ id = &probe->ftp_tps[index].fit_id;
+
+ ASSERT(probe->ftp_tps[index].fit_tp->ftt_pid == pid);
+
+ /*
+ * Find the tracepoint and make sure that our id is one of the
+ * ones registered with it.
+ */
+ bucket = &fasttrap_tpoints.fth_table[FASTTRAP_TPOINTS_INDEX(pid, pc)];
+ mutex_enter(&bucket->ftb_mtx);
+ for (tp = bucket->ftb_data; tp != NULL; tp = tp->ftt_next) {
+ if (tp->ftt_pid == pid && tp->ftt_pc == pc &&
+ tp->ftt_proc == provider->ftp_proc)
+ break;
+ }
+
+ /*
+ * If we somehow lost this tracepoint, we're in a world of hurt.
+ */
+ ASSERT(tp != NULL);
+
+ switch (id->fti_ptype) {
+ case DTFTP_ENTRY:
+ case DTFTP_OFFSETS:
+ case DTFTP_IS_ENABLED:
+ ASSERT(tp->ftt_ids != NULL);
+ idp = &tp->ftt_ids;
+ break;
+
+ case DTFTP_RETURN:
+ case DTFTP_POST_OFFSETS:
+ ASSERT(tp->ftt_retids != NULL);
+ idp = &tp->ftt_retids;
+ break;
+
+ default:
+ ASSERT(0);
+ }
+
+ while ((*idp)->fti_probe != probe) {
+ idp = &(*idp)->fti_next;
+ ASSERT(*idp != NULL);
+ }
+
+ id = *idp;
+ *idp = id->fti_next;
+ membar_producer();
+
+ ASSERT(id->fti_probe == probe);
+
+ /*
+ * If there are other registered enablings of this tracepoint, we're
+ * all done, but if this was the last probe associated with this
+ * tracepoint, we need to remove and free it.
+ */
+ if (tp->ftt_ids != NULL || tp->ftt_retids != NULL) {
+
+ /*
+ * If the current probe's tracepoint is in use, swap it
+ * for an unused tracepoint.
+ */
+ if (tp == probe->ftp_tps[index].fit_tp) {
+ fasttrap_probe_t *tmp_probe;
+ fasttrap_tracepoint_t **tmp_tp;
+ uint_t tmp_index;
+
+ if (tp->ftt_ids != NULL) {
+ tmp_probe = tp->ftt_ids->fti_probe;
+ /* LINTED - alignment */
+ tmp_index = FASTTRAP_ID_INDEX(tp->ftt_ids);
+ tmp_tp = &tmp_probe->ftp_tps[tmp_index].fit_tp;
+ } else {
+ tmp_probe = tp->ftt_retids->fti_probe;
+ /* LINTED - alignment */
+ tmp_index = FASTTRAP_ID_INDEX(tp->ftt_retids);
+ tmp_tp = &tmp_probe->ftp_tps[tmp_index].fit_tp;
+ }
+
+ ASSERT(*tmp_tp != NULL);
+ ASSERT(*tmp_tp != probe->ftp_tps[index].fit_tp);
+ ASSERT((*tmp_tp)->ftt_ids == NULL);
+ ASSERT((*tmp_tp)->ftt_retids == NULL);
+
+ probe->ftp_tps[index].fit_tp = *tmp_tp;
+ *tmp_tp = tp;
+ }
+
+ mutex_exit(&bucket->ftb_mtx);
+
+ /*
+ * Tag the modified probe with the generation in which it was
+ * changed.
+ */
+ probe->ftp_gen = fasttrap_mod_gen;
+ return;
+ }
+
+ mutex_exit(&bucket->ftb_mtx);
+
+ /*
+ * We can't safely remove the tracepoint from the set of active
+ * tracepoints until we've actually removed the fasttrap instruction
+ * from the process's text. We can, however, operate on this
+ * tracepoint secure in the knowledge that no other thread is going to
+ * be looking at it since we hold P_PR_LOCK on the process if it's
+ * live or we hold the provider lock on the process if it's dead and
+ * gone.
+ */
+
+ /*
+ * We only need to remove the actual instruction if we're looking
+ * at an existing process
+ */
+ if (p != NULL) {
+ /*
+ * If we fail to restore the instruction we need to kill
+ * this process since it's in a completely unrecoverable
+ * state.
+ */
+ if (fasttrap_tracepoint_remove(p, tp) != 0)
+ fasttrap_sigtrap(p, NULL, pc);
+
+ /*
+ * Decrement the count of the number of tracepoints active
+ * in the victim process.
+ */
+#ifdef illumos
+ ASSERT(p->p_proc_flag & P_PR_LOCK);
+#endif
+ p->p_dtrace_count--;
+
+ atomic_add_rel_64(&p->p_fasttrap_tp_gen, 1);
+ }
+
+ /*
+ * Remove the probe from the hash table of active tracepoints.
+ */
+ mutex_enter(&bucket->ftb_mtx);
+ pp = (fasttrap_tracepoint_t **)&bucket->ftb_data;
+ ASSERT(*pp != NULL);
+ while (*pp != tp) {
+ pp = &(*pp)->ftt_next;
+ ASSERT(*pp != NULL);
+ }
+
+ *pp = tp->ftt_next;
+ membar_producer();
+
+ mutex_exit(&bucket->ftb_mtx);
+
+ /*
+ * Tag the modified probe with the generation in which it was changed.
+ */
+ probe->ftp_gen = fasttrap_mod_gen;
+}
+
+static void
+fasttrap_enable_callbacks(void)
+{
+ /*
+ * We don't have to play the rw lock game here because we're
+ * providing something rather than taking something away --
+ * we can be sure that no threads have tried to follow this
+ * function pointer yet.
+ */
+ mutex_enter(&fasttrap_count_mtx);
+ if (fasttrap_pid_count == 0) {
+ ASSERT(dtrace_pid_probe_ptr == NULL);
+ ASSERT(dtrace_return_probe_ptr == NULL);
+ dtrace_pid_probe_ptr = &fasttrap_pid_probe;
+ dtrace_return_probe_ptr = &fasttrap_return_probe;
+ }
+ ASSERT(dtrace_pid_probe_ptr == &fasttrap_pid_probe);
+ ASSERT(dtrace_return_probe_ptr == &fasttrap_return_probe);
+ fasttrap_pid_count++;
+ mutex_exit(&fasttrap_count_mtx);
+}
+
+static void
+fasttrap_disable_callbacks(void)
+{
+ mutex_enter(&fasttrap_count_mtx);
+ ASSERT(fasttrap_pid_count > 0);
+ fasttrap_pid_count--;
+ if (fasttrap_pid_count == 0) {
+ /*
+ * Synchronize with the breakpoint handler, which is careful to
+ * enable interrupts only after loading the hook pointer.
+ */
+ dtrace_sync();
+ dtrace_pid_probe_ptr = NULL;
+ dtrace_return_probe_ptr = NULL;
+ }
+ mutex_exit(&fasttrap_count_mtx);
+}
+
+/*ARGSUSED*/
+static void
+fasttrap_pid_enable(void *arg, dtrace_id_t id, void *parg)
+{
+ fasttrap_probe_t *probe = parg;
+ proc_t *p = NULL;
+ int i, rc;
+
+ ASSERT(probe != NULL);
+ ASSERT(!probe->ftp_enabled);
+ ASSERT(id == probe->ftp_id);
+#ifdef illumos
+ ASSERT(MUTEX_HELD(&cpu_lock));
+#endif
+
+ /*
+ * Increment the count of enabled probes on this probe's provider;
+ * the provider can't go away while the probe still exists. We
+ * must increment this even if we aren't able to properly enable
+ * this probe.
+ */
+ mutex_enter(&probe->ftp_prov->ftp_mtx);
+ probe->ftp_prov->ftp_rcount++;
+ mutex_exit(&probe->ftp_prov->ftp_mtx);
+
+ /*
+ * If this probe's provider is retired (meaning it was valid in a
+ * previously exec'ed incarnation of this address space), bail out. The
+ * provider can't go away while we're in this code path.
+ */
+ if (probe->ftp_prov->ftp_retired)
+ return;
+
+ /*
+ * If we can't find the process, it may be that we're in the context of
+ * a fork in which the traced process is being born and we're copying
+ * USDT probes. Otherwise, the process is gone so bail.
+ */
+#ifdef illumos
+ if ((p = sprlock(probe->ftp_pid)) == NULL) {
+ if ((curproc->p_flag & SFORKING) == 0)
+ return;
+
+ mutex_enter(&pidlock);
+ p = prfind(probe->ftp_pid);
+
+ if (p == NULL) {
+ /*
+ * So it's not that the target process is being born,
+ * it's that it isn't there at all (and we simply
+ * happen to be forking). Anyway, we know that the
+ * target is definitely gone, so bail out.
+ */
+ mutex_exit(&pidlock);
+ return;
+ }
+
+ /*
+ * Confirm that curproc is indeed forking the process in which
+ * we're trying to enable probes.
+ */
+ ASSERT(p->p_parent == curproc);
+ ASSERT(p->p_stat == SIDL);
+
+ mutex_enter(&p->p_lock);
+ mutex_exit(&pidlock);
+
+ sprlock_proc(p);
+ }
+
+ ASSERT(!(p->p_flag & SVFORK));
+ mutex_exit(&p->p_lock);
+#else
+ if (pget(probe->ftp_pid, PGET_HOLD | PGET_NOTWEXIT, &p) != 0)
+ return;
+#endif
+
+ /*
+ * We have to enable the trap entry point before any user threads have
+ * the chance to execute the trap instruction we're about to place
+ * in their process's text.
+ */
+ fasttrap_enable_callbacks();
+
+ /*
+ * Enable all the tracepoints and add this probe's id to each
+ * tracepoint's list of active probes.
+ */
+ for (i = 0; i < probe->ftp_ntps; i++) {
+ if ((rc = fasttrap_tracepoint_enable(p, probe, i)) != 0) {
+ /*
+ * If enabling the tracepoint failed completely,
+ * we don't have to disable it; if the failure
+ * was only partial we must disable it.
+ */
+ if (rc == FASTTRAP_ENABLE_FAIL)
+ i--;
+ else
+ ASSERT(rc == FASTTRAP_ENABLE_PARTIAL);
+
+ /*
+ * Back up and pull out all the tracepoints we've
+ * created so far for this probe.
+ */
+ while (i >= 0) {
+ fasttrap_tracepoint_disable(p, probe, i);
+ i--;
+ }
+
+#ifdef illumos
+ mutex_enter(&p->p_lock);
+ sprunlock(p);
+#else
+ PRELE(p);
+#endif
+
+ /*
+ * Since we're not actually enabling this probe,
+ * drop our reference on the trap table entry.
+ */
+ fasttrap_disable_callbacks();
+ return;
+ }
+ }
+#ifdef illumos
+ mutex_enter(&p->p_lock);
+ sprunlock(p);
+#else
+ PRELE(p);
+#endif
+
+ probe->ftp_enabled = 1;
+}
+
+/*ARGSUSED*/
+static void
+fasttrap_pid_disable(void *arg, dtrace_id_t id, void *parg)
+{
+ fasttrap_probe_t *probe = parg;
+ fasttrap_provider_t *provider = probe->ftp_prov;
+ proc_t *p;
+ int i, whack = 0;
+
+ ASSERT(id == probe->ftp_id);
+
+ mutex_enter(&provider->ftp_mtx);
+
+ /*
+ * We won't be able to acquire a /proc-esque lock on the process
+ * iff the process is dead and gone. In this case, we rely on the
+ * provider lock as a point of mutual exclusion to prevent other
+ * DTrace consumers from disabling this probe.
+ */
+ if (pget(probe->ftp_pid, PGET_HOLD | PGET_NOTWEXIT, &p) != 0)
+ p = NULL;
+
+ /*
+ * Disable all the associated tracepoints (for fully enabled probes).
+ */
+ if (probe->ftp_enabled) {
+ for (i = 0; i < probe->ftp_ntps; i++) {
+ fasttrap_tracepoint_disable(p, probe, i);
+ }
+ }
+
+ ASSERT(provider->ftp_rcount > 0);
+ provider->ftp_rcount--;
+
+ if (p != NULL) {
+ /*
+ * Even though we may not be able to remove it entirely, we
+ * mark this retired provider to get a chance to remove some
+ * of the associated probes.
+ */
+ if (provider->ftp_retired && !provider->ftp_marked)
+ whack = provider->ftp_marked = 1;
+ mutex_exit(&provider->ftp_mtx);
+ } else {
+ /*
+ * If the process is dead, we're just waiting for the
+ * last probe to be disabled to be able to free it.
+ */
+ if (provider->ftp_rcount == 0 && !provider->ftp_marked)
+ whack = provider->ftp_marked = 1;
+ mutex_exit(&provider->ftp_mtx);
+ }
+
+ if (whack)
+ fasttrap_pid_cleanup();
+
+#ifdef __FreeBSD__
+ if (p != NULL)
+ PRELE(p);
+#endif
+ if (!probe->ftp_enabled)
+ return;
+
+ probe->ftp_enabled = 0;
+
+#ifdef illumos
+ ASSERT(MUTEX_HELD(&cpu_lock));
+#endif
+ fasttrap_disable_callbacks();
+}
+
+/*ARGSUSED*/
+static void
+fasttrap_pid_getargdesc(void *arg, dtrace_id_t id, void *parg,
+ dtrace_argdesc_t *desc)
+{
+ fasttrap_probe_t *probe = parg;
+ char *str;
+ int i, ndx;
+
+ desc->dtargd_native[0] = '\0';
+ desc->dtargd_xlate[0] = '\0';
+
+ if (probe->ftp_prov->ftp_retired != 0 ||
+ desc->dtargd_ndx >= probe->ftp_nargs) {
+ desc->dtargd_ndx = DTRACE_ARGNONE;
+ return;
+ }
+
+ ndx = (probe->ftp_argmap != NULL) ?
+ probe->ftp_argmap[desc->dtargd_ndx] : desc->dtargd_ndx;
+
+ str = probe->ftp_ntypes;
+ for (i = 0; i < ndx; i++) {
+ str += strlen(str) + 1;
+ }
+
+ ASSERT(strlen(str) < sizeof (desc->dtargd_native));
+ (void) strcpy(desc->dtargd_native, str);
+
+ if (probe->ftp_xtypes == NULL)
+ return;
+
+ str = probe->ftp_xtypes;
+ for (i = 0; i < desc->dtargd_ndx; i++) {
+ str += strlen(str) + 1;
+ }
+
+ ASSERT(strlen(str) < sizeof (desc->dtargd_xlate));
+ (void) strcpy(desc->dtargd_xlate, str);
+}
+
+/*ARGSUSED*/
+static void
+fasttrap_pid_destroy(void *arg, dtrace_id_t id, void *parg)
+{
+ fasttrap_probe_t *probe = parg;
+ int i;
+ size_t size;
+
+ ASSERT(probe != NULL);
+ ASSERT(!probe->ftp_enabled);
+ ASSERT(fasttrap_total >= probe->ftp_ntps);
+
+ atomic_add_32(&fasttrap_total, -probe->ftp_ntps);
+ size = offsetof(fasttrap_probe_t, ftp_tps[probe->ftp_ntps]);
+
+ if (probe->ftp_gen + 1 >= fasttrap_mod_gen)
+ fasttrap_mod_barrier(probe->ftp_gen);
+
+ for (i = 0; i < probe->ftp_ntps; i++) {
+ kmem_free(probe->ftp_tps[i].fit_tp,
+ sizeof (fasttrap_tracepoint_t));
+ }
+
+ kmem_free(probe, size);
+}
+
+static const dtrace_pattr_t pid_attr = {
+{ DTRACE_STABILITY_EVOLVING, DTRACE_STABILITY_EVOLVING, DTRACE_CLASS_ISA },
+{ DTRACE_STABILITY_PRIVATE, DTRACE_STABILITY_PRIVATE, DTRACE_CLASS_UNKNOWN },
+{ DTRACE_STABILITY_PRIVATE, DTRACE_STABILITY_PRIVATE, DTRACE_CLASS_UNKNOWN },
+{ DTRACE_STABILITY_EVOLVING, DTRACE_STABILITY_EVOLVING, DTRACE_CLASS_ISA },
+{ DTRACE_STABILITY_PRIVATE, DTRACE_STABILITY_PRIVATE, DTRACE_CLASS_UNKNOWN },
+};
+
+static dtrace_pops_t pid_pops = {
+ .dtps_provide = fasttrap_pid_provide,
+ .dtps_provide_module = NULL,
+ .dtps_enable = fasttrap_pid_enable,
+ .dtps_disable = fasttrap_pid_disable,
+ .dtps_suspend = NULL,
+ .dtps_resume = NULL,
+ .dtps_getargdesc = fasttrap_pid_getargdesc,
+ .dtps_getargval = fasttrap_pid_getarg,
+ .dtps_usermode = NULL,
+ .dtps_destroy = fasttrap_pid_destroy
+};
+
+static dtrace_pops_t usdt_pops = {
+ .dtps_provide = fasttrap_pid_provide,
+ .dtps_provide_module = NULL,
+ .dtps_enable = fasttrap_pid_enable,
+ .dtps_disable = fasttrap_pid_disable,
+ .dtps_suspend = NULL,
+ .dtps_resume = NULL,
+ .dtps_getargdesc = fasttrap_pid_getargdesc,
+ .dtps_getargval = fasttrap_usdt_getarg,
+ .dtps_usermode = NULL,
+ .dtps_destroy = fasttrap_pid_destroy
+};
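+
+/*
+ * Note (not in the original source): the two ops vectors above differ only
+ * in dtps_getargval -- pid probes fetch arguments via fasttrap_pid_getarg()
+ * while USDT probes go through fasttrap_usdt_getarg().
+ */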
+
+static fasttrap_proc_t *
+fasttrap_proc_lookup(pid_t pid)
+{
+ fasttrap_bucket_t *bucket;
+ fasttrap_proc_t *fprc, *new_fprc;
+
+ bucket = &fasttrap_procs.fth_table[FASTTRAP_PROCS_INDEX(pid)];
+ mutex_enter(&bucket->ftb_mtx);
+
+ for (fprc = bucket->ftb_data; fprc != NULL; fprc = fprc->ftpc_next) {
+ if (fprc->ftpc_pid == pid && fprc->ftpc_acount != 0) {
+ mutex_enter(&fprc->ftpc_mtx);
+ mutex_exit(&bucket->ftb_mtx);
+ fprc->ftpc_rcount++;
+ atomic_inc_64(&fprc->ftpc_acount);
+ ASSERT(fprc->ftpc_acount <= fprc->ftpc_rcount);
+ mutex_exit(&fprc->ftpc_mtx);
+
+ return (fprc);
+ }
+ }
+
+ /*
+ * Drop the bucket lock so we don't try to perform a sleeping
+ * allocation under it.
+ */
+ mutex_exit(&bucket->ftb_mtx);
+
+ new_fprc = kmem_zalloc(sizeof (fasttrap_proc_t), KM_SLEEP);
+ new_fprc->ftpc_pid = pid;
+ new_fprc->ftpc_rcount = 1;
+ new_fprc->ftpc_acount = 1;
+#ifndef illumos
+ mutex_init(&new_fprc->ftpc_mtx, "fasttrap proc mtx", MUTEX_DEFAULT,
+ NULL);
+#endif
+
+ mutex_enter(&bucket->ftb_mtx);
+
+ /*
+ * Take another lap through the list to make sure a proc hasn't
+ * been created for this pid while we weren't under the bucket lock.
+ */
+ for (fprc = bucket->ftb_data; fprc != NULL; fprc = fprc->ftpc_next) {
+ if (fprc->ftpc_pid == pid && fprc->ftpc_acount != 0) {
+ mutex_enter(&fprc->ftpc_mtx);
+ mutex_exit(&bucket->ftb_mtx);
+ fprc->ftpc_rcount++;
+ atomic_inc_64(&fprc->ftpc_acount);
+ ASSERT(fprc->ftpc_acount <= fprc->ftpc_rcount);
+ mutex_exit(&fprc->ftpc_mtx);
+
+ kmem_free(new_fprc, sizeof (fasttrap_proc_t));
+
+ return (fprc);
+ }
+ }
+
+ new_fprc->ftpc_next = bucket->ftb_data;
+ bucket->ftb_data = new_fprc;
+
+ mutex_exit(&bucket->ftb_mtx);
+
+ return (new_fprc);
+}
+
+static void
+fasttrap_proc_release(fasttrap_proc_t *proc)
+{
+ fasttrap_bucket_t *bucket;
+ fasttrap_proc_t *fprc, **fprcp;
+ pid_t pid = proc->ftpc_pid;
+#ifndef illumos
+ fasttrap_scrblock_t *scrblk, *scrblktmp;
+ fasttrap_scrspace_t *scrspc, *scrspctmp;
+ struct proc *p;
+ struct thread *td;
+#endif
+
+ mutex_enter(&proc->ftpc_mtx);
+
+ ASSERT(proc->ftpc_rcount != 0);
+ ASSERT(proc->ftpc_acount <= proc->ftpc_rcount);
+
+ if (--proc->ftpc_rcount != 0) {
+ mutex_exit(&proc->ftpc_mtx);
+ return;
+ }
+
+#ifndef illumos
+ /*
+ * Free all structures used to manage per-thread scratch space.
+ */
+ LIST_FOREACH_SAFE(scrblk, &proc->ftpc_scrblks, ftsb_next,
+ scrblktmp) {
+ LIST_REMOVE(scrblk, ftsb_next);
+ free(scrblk, M_SOLARIS);
+ }
+ LIST_FOREACH_SAFE(scrspc, &proc->ftpc_fscr, ftss_next, scrspctmp) {
+ LIST_REMOVE(scrspc, ftss_next);
+ free(scrspc, M_SOLARIS);
+ }
+ LIST_FOREACH_SAFE(scrspc, &proc->ftpc_ascr, ftss_next, scrspctmp) {
+ LIST_REMOVE(scrspc, ftss_next);
+ free(scrspc, M_SOLARIS);
+ }
+
+ if ((p = pfind(pid)) != NULL) {
+ FOREACH_THREAD_IN_PROC(p, td)
+ td->t_dtrace_sscr = NULL;
+ PROC_UNLOCK(p);
+ }
+#endif
+
+ mutex_exit(&proc->ftpc_mtx);
+
+ /*
+ * There should definitely be no live providers associated with this
+ * process at this point.
+ */
+ ASSERT(proc->ftpc_acount == 0);
+
+ bucket = &fasttrap_procs.fth_table[FASTTRAP_PROCS_INDEX(pid)];
+ mutex_enter(&bucket->ftb_mtx);
+
+ fprcp = (fasttrap_proc_t **)&bucket->ftb_data;
+ while ((fprc = *fprcp) != NULL) {
+ if (fprc == proc)
+ break;
+
+ fprcp = &fprc->ftpc_next;
+ }
+
+ /*
+ * Something strange has happened if we can't find the proc.
+ */
+ ASSERT(fprc != NULL);
+
+ *fprcp = fprc->ftpc_next;
+
+ mutex_exit(&bucket->ftb_mtx);
+
+ kmem_free(fprc, sizeof (fasttrap_proc_t));
+}
+
+/*
+ * Lookup a fasttrap-managed provider based on its name and associated pid.
+ * If the pattr argument is non-NULL, this function instantiates the provider
+ * if it doesn't exist otherwise it returns NULL. The provider is returned
+ * with its lock held.
+ */
+static fasttrap_provider_t *
+fasttrap_provider_lookup(pid_t pid, const char *name,
+ const dtrace_pattr_t *pattr)
+{
+ fasttrap_provider_t *fp, *new_fp = NULL;
+ fasttrap_bucket_t *bucket;
+ char provname[DTRACE_PROVNAMELEN];
+ proc_t *p;
+ cred_t *cred;
+
+ ASSERT(strlen(name) < sizeof (fp->ftp_name));
+ ASSERT(pattr != NULL);
+
+ bucket = &fasttrap_provs.fth_table[FASTTRAP_PROVS_INDEX(pid, name)];
+ mutex_enter(&bucket->ftb_mtx);
+
+ /*
+ * Take a lap through the list and return the match if we find it.
+ */
+ for (fp = bucket->ftb_data; fp != NULL; fp = fp->ftp_next) {
+ if (fp->ftp_pid == pid && strcmp(fp->ftp_name, name) == 0 &&
+ !fp->ftp_retired) {
+ mutex_enter(&fp->ftp_mtx);
+ mutex_exit(&bucket->ftb_mtx);
+ return (fp);
+ }
+ }
+
+ /*
+ * Drop the bucket lock so we don't try to perform a sleeping
+ * allocation under it.
+ */
+ mutex_exit(&bucket->ftb_mtx);
+
+ /*
+ * Make sure the process exists, isn't a child created as the result
+ * of a vfork(2), and isn't a zombie (but may be in fork).
+ */
+ if ((p = pfind(pid)) == NULL)
+ return (NULL);
+
+ /*
+ * Increment p_dtrace_probes so that the process knows to inform us
+ * when it exits or execs. fasttrap_provider_free() decrements this
+ * when we're done with this provider.
+ */
+ p->p_dtrace_probes++;
+
+ /*
+ * Grab the credentials for this process so we have
+ * something to pass to dtrace_register().
+ */
+ PROC_LOCK_ASSERT(p, MA_OWNED);
+ crhold(p->p_ucred);
+ cred = p->p_ucred;
+ PROC_UNLOCK(p);
+
+ new_fp = kmem_zalloc(sizeof (fasttrap_provider_t), KM_SLEEP);
+ new_fp->ftp_pid = pid;
+ new_fp->ftp_proc = fasttrap_proc_lookup(pid);
+#ifndef illumos
+ mutex_init(&new_fp->ftp_mtx, "provider mtx", MUTEX_DEFAULT, NULL);
+ mutex_init(&new_fp->ftp_cmtx, "lock on creating", MUTEX_DEFAULT, NULL);
+#endif
+
+ ASSERT(new_fp->ftp_proc != NULL);
+
+ mutex_enter(&bucket->ftb_mtx);
+
+ /*
+ * Take another lap through the list to make sure a provider hasn't
+ * been created for this pid while we weren't under the bucket lock.
+ */
+ for (fp = bucket->ftb_data; fp != NULL; fp = fp->ftp_next) {
+ if (fp->ftp_pid == pid && strcmp(fp->ftp_name, name) == 0 &&
+ !fp->ftp_retired) {
+ mutex_enter(&fp->ftp_mtx);
+ mutex_exit(&bucket->ftb_mtx);
+ fasttrap_provider_free(new_fp);
+ crfree(cred);
+ return (fp);
+ }
+ }
+
+ (void) strcpy(new_fp->ftp_name, name);
+
+ /*
+ * Fail and return NULL if either the provider name is too long
+ * or we fail to register this new provider with the DTrace
+ * framework. Note that this is the only place we ever construct
+ * the full provider name -- we keep it in pieces in the provider
+ * structure.
+ */
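+ /*
+ * For example (illustrative, not in the original source): assuming
+ * FASTTRAP_PID_NAME is the string "pid", the pid provider for
+ * process 1234 registers with DTrace as "pid1234".
+ */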
+ if (snprintf(provname, sizeof (provname), "%s%u", name, (uint_t)pid) >=
+ sizeof (provname) ||
+ dtrace_register(provname, pattr,
+ DTRACE_PRIV_PROC | DTRACE_PRIV_OWNER | DTRACE_PRIV_ZONEOWNER, cred,
+ pattr == &pid_attr ? &pid_pops : &usdt_pops, new_fp,
+ &new_fp->ftp_provid) != 0) {
+ mutex_exit(&bucket->ftb_mtx);
+ fasttrap_provider_free(new_fp);
+ crfree(cred);
+ return (NULL);
+ }
+
+ new_fp->ftp_next = bucket->ftb_data;
+ bucket->ftb_data = new_fp;
+
+ mutex_enter(&new_fp->ftp_mtx);
+ mutex_exit(&bucket->ftb_mtx);
+
+ crfree(cred);
+ return (new_fp);
+}
+
+static void
+fasttrap_provider_free(fasttrap_provider_t *provider)
+{
+ pid_t pid = provider->ftp_pid;
+ proc_t *p;
+
+ /*
+ * There need to be no associated enabled probes, no consumers
+ * creating probes, and no meta providers referencing this provider.
+ */
+ ASSERT(provider->ftp_rcount == 0);
+ ASSERT(provider->ftp_ccount == 0);
+ ASSERT(provider->ftp_mcount == 0);
+
+ /*
+ * If this provider hasn't been retired, we need to explicitly drop the
+ * count of active providers on the associated process structure.
+ */
+ if (!provider->ftp_retired) {
+ atomic_dec_64(&provider->ftp_proc->ftpc_acount);
+ ASSERT(provider->ftp_proc->ftpc_acount <
+ provider->ftp_proc->ftpc_rcount);
+ }
+
+ fasttrap_proc_release(provider->ftp_proc);
+
+#ifndef illumos
+ mutex_destroy(&provider->ftp_mtx);
+ mutex_destroy(&provider->ftp_cmtx);
+#endif
+ kmem_free(provider, sizeof (fasttrap_provider_t));
+
+ /*
+ * Decrement p_dtrace_probes on the process whose provider we're
+ * freeing. We don't have to worry about clobbering someone else's
+ * modifications to it because we have locked the bucket that
+ * corresponds to this process's hash chain in the provider hash
+ * table. Don't sweat it if we can't find the process.
+ */
+ if ((p = pfind(pid)) == NULL) {
+ return;
+ }
+
+ p->p_dtrace_probes--;
+#ifndef illumos
+ PROC_UNLOCK(p);
+#endif
+}
+
+static void
+fasttrap_provider_retire(pid_t pid, const char *name, int mprov)
+{
+ fasttrap_provider_t *fp;
+ fasttrap_bucket_t *bucket;
+ dtrace_provider_id_t provid;
+
+ ASSERT(strlen(name) < sizeof (fp->ftp_name));
+
+ bucket = &fasttrap_provs.fth_table[FASTTRAP_PROVS_INDEX(pid, name)];
+ mutex_enter(&bucket->ftb_mtx);
+
+ for (fp = bucket->ftb_data; fp != NULL; fp = fp->ftp_next) {
+ if (fp->ftp_pid == pid && strcmp(fp->ftp_name, name) == 0 &&
+ !fp->ftp_retired)
+ break;
+ }
+
+ if (fp == NULL) {
+ mutex_exit(&bucket->ftb_mtx);
+ return;
+ }
+
+ mutex_enter(&fp->ftp_mtx);
+ ASSERT(!mprov || fp->ftp_mcount > 0);
+ if (mprov && --fp->ftp_mcount != 0) {
+ mutex_exit(&fp->ftp_mtx);
+ mutex_exit(&bucket->ftb_mtx);
+ return;
+ }
+
+ /*
+ * Mark the provider to be removed in our post-processing step, mark it
+ * retired, and drop the active count on its proc. Marking it indicates
+ * that we should try to remove it; setting the retired flag indicates
+ * that we're done with this provider; dropping the active count on the
+ * proc releases our hold, and when this reaches zero (as it will during
+ * exit or exec) the proc and associated providers become defunct.
+ *
+ * We obviously need to take the bucket lock before the provider lock
+ * to perform the lookup, but we need to drop the provider lock
+ * before calling into the DTrace framework since we acquire the
+ * provider lock in callbacks invoked from the DTrace framework. The
+ * bucket lock therefore protects the integrity of the provider hash
+ * table.
+ */
+ atomic_dec_64(&fp->ftp_proc->ftpc_acount);
+ ASSERT(fp->ftp_proc->ftpc_acount < fp->ftp_proc->ftpc_rcount);
+
+ fp->ftp_retired = 1;
+ fp->ftp_marked = 1;
+ provid = fp->ftp_provid;
+ mutex_exit(&fp->ftp_mtx);
+
+ /*
+ * We don't have to worry about invalidating the same provider twice
+ * since fasttrap_provider_lookup() will ignore providers that have
+ * been marked as retired.
+ */
+ dtrace_invalidate(provid);
+
+ mutex_exit(&bucket->ftb_mtx);
+
+ fasttrap_pid_cleanup();
+}
+
+static int
+fasttrap_uint32_cmp(const void *ap, const void *bp)
+{
+ const uint32_t a = *(const uint32_t *)ap;
+ const uint32_t b = *(const uint32_t *)bp;
+
+ /*
+ * Compare explicitly instead of subtracting: the difference of two
+ * unsigned values may not fit in the int return value, which could
+ * hand qsort() a wrong sign.
+ */
+ return ((a > b) - (a < b));
+}
+
+static int
+fasttrap_uint64_cmp(const void *ap, const void *bp)
+{
+ const uint64_t a = *(const uint64_t *)ap;
+ const uint64_t b = *(const uint64_t *)bp;
+
+ return ((a > b) - (a < b));
+}
+
+static int
+fasttrap_add_probe(fasttrap_probe_spec_t *pdata)
+{
+ fasttrap_provider_t *provider;
+ fasttrap_probe_t *pp;
+ fasttrap_tracepoint_t *tp;
+ char *name;
+ int i, aframes = 0, whack;
+
+ /*
+ * There needs to be at least one desired trace point.
+ */
+ if (pdata->ftps_noffs == 0)
+ return (EINVAL);
+
+ switch (pdata->ftps_type) {
+ case DTFTP_ENTRY:
+ name = "entry";
+ aframes = FASTTRAP_ENTRY_AFRAMES;
+ break;
+ case DTFTP_RETURN:
+ name = "return";
+ aframes = FASTTRAP_RETURN_AFRAMES;
+ break;
+ case DTFTP_OFFSETS:
+ name = NULL;
+ break;
+ default:
+ return (EINVAL);
+ }
+
+ if ((provider = fasttrap_provider_lookup(pdata->ftps_pid,
+ FASTTRAP_PID_NAME, &pid_attr)) == NULL)
+ return (ESRCH);
+
+ /*
+ * Increment this reference count to indicate that a consumer is
+ * actively adding a new probe associated with this provider. This
+ * prevents the provider from being deleted -- we'll need to check
+ * for pending deletions when we drop this reference count.
+ */
+ provider->ftp_ccount++;
+ mutex_exit(&provider->ftp_mtx);
+
+ /*
+ * Grab the creation lock to ensure consistency between calls to
+ * dtrace_probe_lookup() and dtrace_probe_create() in the face of
+ * other threads creating probes. We must drop the provider lock
+ * before taking this lock to avoid a three-way deadlock with the
+ * DTrace framework.
+ */
+ mutex_enter(&provider->ftp_cmtx);
+
+ if (name == NULL) {
+ for (i = 0; i < pdata->ftps_noffs; i++) {
+ char name_str[17];
+
+ (void) sprintf(name_str, "%llx",
+ (unsigned long long)pdata->ftps_offs[i]);
+
+ if (dtrace_probe_lookup(provider->ftp_provid,
+ pdata->ftps_mod, pdata->ftps_func, name_str) != 0)
+ continue;
+
+ atomic_inc_32(&fasttrap_total);
+
+ if (fasttrap_total > fasttrap_max) {
+ atomic_dec_32(&fasttrap_total);
+ goto no_mem;
+ }
+
+ pp = kmem_zalloc(sizeof (fasttrap_probe_t), KM_SLEEP);
+
+ pp->ftp_prov = provider;
+ pp->ftp_faddr = pdata->ftps_pc;
+ pp->ftp_fsize = pdata->ftps_size;
+ pp->ftp_pid = pdata->ftps_pid;
+ pp->ftp_ntps = 1;
+
+ tp = kmem_zalloc(sizeof (fasttrap_tracepoint_t),
+ KM_SLEEP);
+
+ tp->ftt_proc = provider->ftp_proc;
+ tp->ftt_pc = pdata->ftps_offs[i] + pdata->ftps_pc;
+ tp->ftt_pid = pdata->ftps_pid;
+
+ pp->ftp_tps[0].fit_tp = tp;
+ pp->ftp_tps[0].fit_id.fti_probe = pp;
+ pp->ftp_tps[0].fit_id.fti_ptype = pdata->ftps_type;
+
+ pp->ftp_id = dtrace_probe_create(provider->ftp_provid,
+ pdata->ftps_mod, pdata->ftps_func, name_str,
+ FASTTRAP_OFFSET_AFRAMES, pp);
+ }
+
+ } else if (dtrace_probe_lookup(provider->ftp_provid, pdata->ftps_mod,
+ pdata->ftps_func, name) == 0) {
+ atomic_add_32(&fasttrap_total, pdata->ftps_noffs);
+
+ if (fasttrap_total > fasttrap_max) {
+ atomic_add_32(&fasttrap_total, -pdata->ftps_noffs);
+ goto no_mem;
+ }
+
+ /*
+ * Make sure all tracepoint program counter values are unique.
+ * We later assume that each probe has exactly one tracepoint
+ * for a given pc.
+ */
+ qsort(pdata->ftps_offs, pdata->ftps_noffs,
+ sizeof (uint64_t), fasttrap_uint64_cmp);
+ for (i = 1; i < pdata->ftps_noffs; i++) {
+ if (pdata->ftps_offs[i] > pdata->ftps_offs[i - 1])
+ continue;
+
+ atomic_add_32(&fasttrap_total, -pdata->ftps_noffs);
+ goto no_mem;
+ }
+
+ ASSERT(pdata->ftps_noffs > 0);
+ pp = kmem_zalloc(offsetof(fasttrap_probe_t,
+ ftp_tps[pdata->ftps_noffs]), KM_SLEEP);
+
+ pp->ftp_prov = provider;
+ pp->ftp_faddr = pdata->ftps_pc;
+ pp->ftp_fsize = pdata->ftps_size;
+ pp->ftp_pid = pdata->ftps_pid;
+ pp->ftp_ntps = pdata->ftps_noffs;
+
+ for (i = 0; i < pdata->ftps_noffs; i++) {
+ tp = kmem_zalloc(sizeof (fasttrap_tracepoint_t),
+ KM_SLEEP);
+
+ tp->ftt_proc = provider->ftp_proc;
+ tp->ftt_pc = pdata->ftps_offs[i] + pdata->ftps_pc;
+ tp->ftt_pid = pdata->ftps_pid;
+
+ pp->ftp_tps[i].fit_tp = tp;
+ pp->ftp_tps[i].fit_id.fti_probe = pp;
+ pp->ftp_tps[i].fit_id.fti_ptype = pdata->ftps_type;
+ }
+
+ pp->ftp_id = dtrace_probe_create(provider->ftp_provid,
+ pdata->ftps_mod, pdata->ftps_func, name, aframes, pp);
+ }
+
+ mutex_exit(&provider->ftp_cmtx);
+
+ /*
+ * We know that the provider is still valid since we incremented the
+ * creation reference count. If someone tried to clean up this provider
+ * while we were using it (e.g. because the process called exec(2) or
+ * exit(2)), take note of that and try to clean it up now.
+ */
+ mutex_enter(&provider->ftp_mtx);
+ provider->ftp_ccount--;
+ whack = provider->ftp_retired;
+ mutex_exit(&provider->ftp_mtx);
+
+ if (whack)
+ fasttrap_pid_cleanup();
+
+ return (0);
+
+no_mem:
+ /*
+ * If we've exhausted the allowable resources, we'll try to remove
+ * this provider to free some up. This is to cover the case where
+ * the user has accidentally created many more probes than was
+ * intended (e.g. pid123:::).
+ */
+ mutex_exit(&provider->ftp_cmtx);
+ mutex_enter(&provider->ftp_mtx);
+ provider->ftp_ccount--;
+ provider->ftp_marked = 1;
+ mutex_exit(&provider->ftp_mtx);
+
+ fasttrap_pid_cleanup();
+
+ return (ENOMEM);
+}
+
+/*ARGSUSED*/
+static void *
+fasttrap_meta_provide(void *arg, dtrace_helper_provdesc_t *dhpv, pid_t pid)
+{
+ fasttrap_provider_t *provider;
+
+ /*
+ * A 32-bit unsigned integer (like a pid for example) can be
+ * expressed in 10 or fewer decimal digits. Make sure that we'll
+ * have enough space for the provider name.
+ */
+ if (strlen(dhpv->dthpv_provname) + 10 >=
+ sizeof (provider->ftp_name)) {
+ printf("failed to instantiate provider %s: "
+ "name too long to accomodate pid", dhpv->dthpv_provname);
+ return (NULL);
+ }
+
+ /*
+ * Don't let folks spoof the true pid provider.
+ */
+ if (strcmp(dhpv->dthpv_provname, FASTTRAP_PID_NAME) == 0) {
+ printf("failed to instantiate provider %s: "
+ "%s is an invalid name", dhpv->dthpv_provname,
+ FASTTRAP_PID_NAME);
+ return (NULL);
+ }
+
+ /*
+ * The highest stability class that fasttrap supports is ISA; cap
+ * the stability of the new provider accordingly.
+ */
+ if (dhpv->dthpv_pattr.dtpa_provider.dtat_class > DTRACE_CLASS_ISA)
+ dhpv->dthpv_pattr.dtpa_provider.dtat_class = DTRACE_CLASS_ISA;
+ if (dhpv->dthpv_pattr.dtpa_mod.dtat_class > DTRACE_CLASS_ISA)
+ dhpv->dthpv_pattr.dtpa_mod.dtat_class = DTRACE_CLASS_ISA;
+ if (dhpv->dthpv_pattr.dtpa_func.dtat_class > DTRACE_CLASS_ISA)
+ dhpv->dthpv_pattr.dtpa_func.dtat_class = DTRACE_CLASS_ISA;
+ if (dhpv->dthpv_pattr.dtpa_name.dtat_class > DTRACE_CLASS_ISA)
+ dhpv->dthpv_pattr.dtpa_name.dtat_class = DTRACE_CLASS_ISA;
+ if (dhpv->dthpv_pattr.dtpa_args.dtat_class > DTRACE_CLASS_ISA)
+ dhpv->dthpv_pattr.dtpa_args.dtat_class = DTRACE_CLASS_ISA;
+
+ if ((provider = fasttrap_provider_lookup(pid, dhpv->dthpv_provname,
+ &dhpv->dthpv_pattr)) == NULL) {
+ printf("failed to instantiate provider %s for "
+ "process %u", dhpv->dthpv_provname, (uint_t)pid);
+ return (NULL);
+ }
+
+ /*
+ * Up the meta provider count so this provider isn't removed until
+ * the meta provider has been told to remove it.
+ */
+ provider->ftp_mcount++;
+
+ mutex_exit(&provider->ftp_mtx);
+
+ return (provider);
+}
+
+/*
+ * We know a few things about our context here: we know that the probe being
+ * created doesn't already exist (DTrace won't load DOF at the same address
+ * twice, even if explicitly told to do so) and we know that we are
+ * single-threaded with respect to the meta provider machinery. Knowing that
+ * this is a new probe and that there is no way for us to race with another
+ * operation on this provider allows us an important optimization: we need not
+ * lookup a probe before adding it. Saving this lookup is important because
+ * this code is in the fork path for processes with USDT probes, and lookups
+ * here are potentially very expensive because of long hash conflicts on
+ * module, function and name (DTrace doesn't hash on provider name).
+ */
+/*ARGSUSED*/
+static void
+fasttrap_meta_create_probe(void *arg, void *parg,
+ dtrace_helper_probedesc_t *dhpb)
+{
+ fasttrap_provider_t *provider = parg;
+ fasttrap_probe_t *pp;
+ fasttrap_tracepoint_t *tp;
+ int i, j;
+ uint32_t ntps;
+
+ /*
+ * Since the meta provider count is non-zero we don't have to worry
+ * about this provider disappearing.
+ */
+ ASSERT(provider->ftp_mcount > 0);
+
+ /*
+ * The offsets must be unique.
+ */
+ qsort(dhpb->dthpb_offs, dhpb->dthpb_noffs, sizeof (uint32_t),
+ fasttrap_uint32_cmp);
+ for (i = 1; i < dhpb->dthpb_noffs; i++) {
+ if (dhpb->dthpb_base + dhpb->dthpb_offs[i] <=
+ dhpb->dthpb_base + dhpb->dthpb_offs[i - 1])
+ return;
+ }
+
+ qsort(dhpb->dthpb_enoffs, dhpb->dthpb_nenoffs, sizeof (uint32_t),
+ fasttrap_uint32_cmp);
+ for (i = 1; i < dhpb->dthpb_nenoffs; i++) {
+ if (dhpb->dthpb_base + dhpb->dthpb_enoffs[i] <=
+ dhpb->dthpb_base + dhpb->dthpb_enoffs[i - 1])
+ return;
+ }
+
+ ntps = dhpb->dthpb_noffs + dhpb->dthpb_nenoffs;
+ ASSERT(ntps > 0);
+
+ atomic_add_32(&fasttrap_total, ntps);
+
+ if (fasttrap_total > fasttrap_max) {
+ atomic_add_32(&fasttrap_total, -ntps);
+ return;
+ }
+
+ pp = kmem_zalloc(offsetof(fasttrap_probe_t, ftp_tps[ntps]), KM_SLEEP);
+
+ pp->ftp_prov = provider;
+ pp->ftp_pid = provider->ftp_pid;
+ pp->ftp_ntps = ntps;
+ pp->ftp_nargs = dhpb->dthpb_xargc;
+ pp->ftp_xtypes = dhpb->dthpb_xtypes;
+ pp->ftp_ntypes = dhpb->dthpb_ntypes;
+
+ /*
+ * First create a tracepoint for each actual point of interest.
+ */
+ for (i = 0; i < dhpb->dthpb_noffs; i++) {
+ tp = kmem_zalloc(sizeof (fasttrap_tracepoint_t), KM_SLEEP);
+
+ tp->ftt_proc = provider->ftp_proc;
+ tp->ftt_pc = dhpb->dthpb_base + dhpb->dthpb_offs[i];
+ tp->ftt_pid = provider->ftp_pid;
+
+ pp->ftp_tps[i].fit_tp = tp;
+ pp->ftp_tps[i].fit_id.fti_probe = pp;
+#ifdef __sparc
+ pp->ftp_tps[i].fit_id.fti_ptype = DTFTP_POST_OFFSETS;
+#else
+ pp->ftp_tps[i].fit_id.fti_ptype = DTFTP_OFFSETS;
+#endif
+ }
+
+ /*
+ * Then create a tracepoint for each is-enabled point.
+ */
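+ /*
+ * Note (not in the original source): `i' deliberately carries over
+ * from the loop above, so the is-enabled tracepoints occupy the
+ * slots [dthpb_noffs, ntps) of ftp_tps[] while `j' indexes
+ * dthpb_enoffs[] from zero.
+ */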
+ for (j = 0; i < ntps; i++, j++) {
+ tp = kmem_zalloc(sizeof (fasttrap_tracepoint_t), KM_SLEEP);
+
+ tp->ftt_proc = provider->ftp_proc;
+ tp->ftt_pc = dhpb->dthpb_base + dhpb->dthpb_enoffs[j];
+ tp->ftt_pid = provider->ftp_pid;
+
+ pp->ftp_tps[i].fit_tp = tp;
+ pp->ftp_tps[i].fit_id.fti_probe = pp;
+ pp->ftp_tps[i].fit_id.fti_ptype = DTFTP_IS_ENABLED;
+ }
+
+ /*
+ * If the arguments are shuffled around we set the argument remapping
+ * table. Later, when the probe fires, we only remap the arguments
+ * if the table is non-NULL.
+ */
+ for (i = 0; i < dhpb->dthpb_xargc; i++) {
+ if (dhpb->dthpb_args[i] != i) {
+ pp->ftp_argmap = dhpb->dthpb_args;
+ break;
+ }
+ }
+
+ /*
+ * The probe is fully constructed -- register it with DTrace.
+ */
+ pp->ftp_id = dtrace_probe_create(provider->ftp_provid, dhpb->dthpb_mod,
+ dhpb->dthpb_func, dhpb->dthpb_name, FASTTRAP_OFFSET_AFRAMES, pp);
+}
+
+/*ARGSUSED*/
+static void
+fasttrap_meta_remove(void *arg, dtrace_helper_provdesc_t *dhpv, pid_t pid)
+{
+ /*
+ * Clean up the USDT provider. There may be active consumers of the
+ * provider busy adding probes, but no damage will actually befall the
+ * provider until that count has dropped to zero. This just puts
+ * the provider on death row.
+ */
+ fasttrap_provider_retire(pid, dhpv->dthpv_provname, 1);
+}
+
+static dtrace_mops_t fasttrap_mops = {
+ .dtms_create_probe = fasttrap_meta_create_probe,
+ .dtms_provide_pid = fasttrap_meta_provide,
+ .dtms_remove_pid = fasttrap_meta_remove
+};
+
+/*ARGSUSED*/
+static int
+fasttrap_open(struct cdev *dev __unused, int oflags __unused,
+ int devtype __unused, struct thread *td __unused)
+{
+ return (0);
+}
+
+/*ARGSUSED*/
+static int
+fasttrap_ioctl(struct cdev *dev, u_long cmd, caddr_t arg, int fflag,
+ struct thread *td)
+{
+ if (!dtrace_attached())
+ return (EAGAIN);
+
+ if (cmd == FASTTRAPIOC_MAKEPROBE) {
+ fasttrap_probe_spec_t *uprobe = *(fasttrap_probe_spec_t **)arg;
+ fasttrap_probe_spec_t *probe;
+ uint64_t noffs;
+ size_t size;
+ int ret, err;
+
+ if (copyin(&uprobe->ftps_noffs, &noffs,
+ sizeof (uprobe->ftps_noffs)))
+ return (EFAULT);
+
+ /*
+ * Probes must have at least one tracepoint.
+ */
+ if (noffs == 0)
+ return (EINVAL);
+
+ size = sizeof (fasttrap_probe_spec_t) +
+ sizeof (probe->ftps_offs[0]) * (noffs - 1);
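+
+ /*
+ * Illustrative sizing (not in the original source): the
+ * (noffs - 1) term works because fasttrap_probe_spec_t
+ * evidently declares ftps_offs[] with a single element, so
+ * a request with noffs == 4 copies in
+ * sizeof (fasttrap_probe_spec_t) + 3 * sizeof (uint64_t)
+ * bytes.
+ */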
+
+ if (size > 1024 * 1024)
+ return (ENOMEM);
+
+ probe = kmem_alloc(size, KM_SLEEP);
+
+ if (copyin(uprobe, probe, size) != 0 ||
+ probe->ftps_noffs != noffs) {
+ kmem_free(probe, size);
+ return (EFAULT);
+ }
+
+ /*
+ * Verify that the function and module strings contain no
+ * funny characters.
+ */
+ if (u8_validate(probe->ftps_func, strlen(probe->ftps_func),
+ NULL, U8_VALIDATE_ENTIRE, &err) < 0) {
+ ret = EINVAL;
+ goto err;
+ }
+
+ if (u8_validate(probe->ftps_mod, strlen(probe->ftps_mod),
+ NULL, U8_VALIDATE_ENTIRE, &err) < 0) {
+ ret = EINVAL;
+ goto err;
+ }
+
+#ifdef notyet
+ if (!PRIV_POLICY_CHOICE(cr, PRIV_ALL, B_FALSE)) {
+ proc_t *p;
+ pid_t pid = probe->ftps_pid;
+
+ mutex_enter(&pidlock);
+ /*
+ * Report an error if the process doesn't exist
+ * or is actively being birthed.
+ */
+ if ((p = pfind(pid)) == NULL || p->p_stat == SIDL) {
+ mutex_exit(&pidlock);
+ return (ESRCH);
+ }
+ mutex_enter(&p->p_lock);
+ mutex_exit(&pidlock);
+
+ if ((ret = priv_proc_cred_perm(cr, p, NULL,
+ VREAD | VWRITE)) != 0) {
+ mutex_exit(&p->p_lock);
+ return (ret);
+ }
+ mutex_exit(&p->p_lock);
+ }
+#endif /* notyet */
+
+ ret = fasttrap_add_probe(probe);
+err:
+ kmem_free(probe, size);
+
+ return (ret);
+
+ } else if (cmd == FASTTRAPIOC_GETINSTR) {
+ fasttrap_instr_query_t instr;
+ fasttrap_tracepoint_t *tp;
+ uint_t index;
+#ifdef notyet
+ int ret;
+#endif
+
+ if (copyin((void *)arg, &instr, sizeof (instr)) != 0)
+ return (EFAULT);
+
+#ifdef notyet
+ if (!PRIV_POLICY_CHOICE(cr, PRIV_ALL, B_FALSE)) {
+ proc_t *p;
+ pid_t pid = instr.ftiq_pid;
+
+ mutex_enter(&pidlock);
+ /*
+ * Report an error if the process doesn't exist
+ * or is actively being birthed.
+ */
+ if ((p = pfind(pid)) == NULL || p->p_stat == SIDL) {
+ mutex_exit(&pidlock);
+ return (ESRCH);
+ }
+ mutex_enter(&p->p_lock);
+ mutex_exit(&pidlock);
+
+ if ((ret = priv_proc_cred_perm(cr, p, NULL,
+ VREAD)) != 0) {
+ mutex_exit(&p->p_lock);
+ return (ret);
+ }
+
+ mutex_exit(&p->p_lock);
+ }
+#endif /* notyet */
+
+ index = FASTTRAP_TPOINTS_INDEX(instr.ftiq_pid, instr.ftiq_pc);
+
+ mutex_enter(&fasttrap_tpoints.fth_table[index].ftb_mtx);
+ tp = fasttrap_tpoints.fth_table[index].ftb_data;
+ while (tp != NULL) {
+ if (instr.ftiq_pid == tp->ftt_pid &&
+ instr.ftiq_pc == tp->ftt_pc &&
+ tp->ftt_proc->ftpc_acount != 0)
+ break;
+
+ tp = tp->ftt_next;
+ }
+
+ if (tp == NULL) {
+ mutex_exit(&fasttrap_tpoints.fth_table[index].ftb_mtx);
+ return (ENOENT);
+ }
+
+ bcopy(&tp->ftt_instr, &instr.ftiq_instr,
+ sizeof (instr.ftiq_instr));
+ mutex_exit(&fasttrap_tpoints.fth_table[index].ftb_mtx);
+
+ if (copyout(&instr, (void *)arg, sizeof (instr)) != 0)
+ return (EFAULT);
+
+ return (0);
+ }
+
+ return (EINVAL);
+}
+
+static int
+fasttrap_load(void)
+{
+ ulong_t nent;
+ int i, ret;
+
+ /* Create the /dev/dtrace/fasttrap entry. */
+ fasttrap_cdev = make_dev(&fasttrap_cdevsw, 0, UID_ROOT, GID_WHEEL, 0600,
+ "dtrace/fasttrap");
+
+ mtx_init(&fasttrap_cleanup_mtx, "fasttrap clean", "dtrace", MTX_DEF);
+ mutex_init(&fasttrap_count_mtx, "fasttrap count mtx", MUTEX_DEFAULT,
+ NULL);
+
+#ifdef illumos
+ fasttrap_max = ddi_getprop(DDI_DEV_T_ANY, devi, DDI_PROP_DONTPASS,
+ "fasttrap-max-probes", FASTTRAP_MAX_DEFAULT);
+#endif
+ fasttrap_total = 0;
+
+ /*
+ * Conjure up the tracepoints hashtable...
+ */
+#ifdef illumos
+ nent = ddi_getprop(DDI_DEV_T_ANY, devi, DDI_PROP_DONTPASS,
+ "fasttrap-hash-size", FASTTRAP_TPOINTS_DEFAULT_SIZE);
+#else
+ nent = tpoints_hash_size;
+#endif
+
+ if (nent == 0 || nent > 0x1000000)
+ nent = FASTTRAP_TPOINTS_DEFAULT_SIZE;
+
+ tpoints_hash_size = nent;
+
+ if (ISP2(nent))
+ fasttrap_tpoints.fth_nent = nent;
+ else
+ fasttrap_tpoints.fth_nent = 1 << fasttrap_highbit(nent);
+ ASSERT(fasttrap_tpoints.fth_nent > 0);
+ fasttrap_tpoints.fth_mask = fasttrap_tpoints.fth_nent - 1;
+ fasttrap_tpoints.fth_table = kmem_zalloc(fasttrap_tpoints.fth_nent *
+ sizeof (fasttrap_bucket_t), KM_SLEEP);
+#ifndef illumos
+ for (i = 0; i < fasttrap_tpoints.fth_nent; i++)
+ mutex_init(&fasttrap_tpoints.fth_table[i].ftb_mtx,
+ "tracepoints bucket mtx", MUTEX_DEFAULT, NULL);
+#endif
+
+ /*
+ * ... and the providers hash table...
+ */
+ nent = FASTTRAP_PROVIDERS_DEFAULT_SIZE;
+ if (ISP2(nent))
+ fasttrap_provs.fth_nent = nent;
+ else
+ fasttrap_provs.fth_nent = 1 << fasttrap_highbit(nent);
+ ASSERT(fasttrap_provs.fth_nent > 0);
+ fasttrap_provs.fth_mask = fasttrap_provs.fth_nent - 1;
+ fasttrap_provs.fth_table = kmem_zalloc(fasttrap_provs.fth_nent *
+ sizeof (fasttrap_bucket_t), KM_SLEEP);
+#ifndef illumos
+ for (i = 0; i < fasttrap_provs.fth_nent; i++)
+ mutex_init(&fasttrap_provs.fth_table[i].ftb_mtx,
+ "providers bucket mtx", MUTEX_DEFAULT, NULL);
+#endif
+
+ ret = kproc_create(fasttrap_pid_cleanup_cb, NULL,
+ &fasttrap_cleanup_proc, 0, 0, "ftcleanup");
+ if (ret != 0) {
+ destroy_dev(fasttrap_cdev);
+#ifndef illumos
+ for (i = 0; i < fasttrap_provs.fth_nent; i++)
+ mutex_destroy(&fasttrap_provs.fth_table[i].ftb_mtx);
+ for (i = 0; i < fasttrap_tpoints.fth_nent; i++)
+ mutex_destroy(&fasttrap_tpoints.fth_table[i].ftb_mtx);
+#endif
+ kmem_free(fasttrap_provs.fth_table, fasttrap_provs.fth_nent *
+ sizeof (fasttrap_bucket_t));
+ kmem_free(fasttrap_tpoints.fth_table, fasttrap_tpoints.fth_nent *
+ sizeof (fasttrap_bucket_t));
+ mtx_destroy(&fasttrap_cleanup_mtx);
+ mutex_destroy(&fasttrap_count_mtx);
+ return (ret);
+ }
+
+ /*
+ * ... and the procs hash table.
+ */
+ nent = FASTTRAP_PROCS_DEFAULT_SIZE;
+ if (ISP2(nent))
+ fasttrap_procs.fth_nent = nent;
+ else
+ fasttrap_procs.fth_nent = 1 << fasttrap_highbit(nent);
+ ASSERT(fasttrap_procs.fth_nent > 0);
+ fasttrap_procs.fth_mask = fasttrap_procs.fth_nent - 1;
+ fasttrap_procs.fth_table = kmem_zalloc(fasttrap_procs.fth_nent *
+ sizeof (fasttrap_bucket_t), KM_SLEEP);
+#ifndef illumos
+ for (i = 0; i < fasttrap_procs.fth_nent; i++)
+ mutex_init(&fasttrap_procs.fth_table[i].ftb_mtx,
+ "processes bucket mtx", MUTEX_DEFAULT, NULL);
+
+ rm_init(&fasttrap_tp_lock, "fasttrap tracepoint");
+
+ /*
+ * This event handler must run before kdtrace_thread_dtor() since it
+ * accesses the thread's struct kdtrace_thread.
+ */
+ fasttrap_thread_dtor_tag = EVENTHANDLER_REGISTER(thread_dtor,
+ fasttrap_thread_dtor, NULL, EVENTHANDLER_PRI_FIRST);
+#endif
+
+ /*
+ * Install our hooks into fork(2), exec(2), and exit(2).
+ */
+ dtrace_fasttrap_fork = &fasttrap_fork;
+ dtrace_fasttrap_exit = &fasttrap_exec_exit;
+ dtrace_fasttrap_exec = &fasttrap_exec_exit;
+
+ (void) dtrace_meta_register("fasttrap", &fasttrap_mops, NULL,
+ &fasttrap_meta_id);
+
+ return (0);
+}
+
+static int
+fasttrap_unload(void)
+{
+ int i, fail = 0;
+
+ /*
+ * Unregister the meta-provider to make sure no new fasttrap-
+ * managed providers come along while we're trying to close up
+ * shop. If we fail to detach, we'll need to re-register as a
+ * meta-provider. We can fail to unregister as a meta-provider
+ * if providers we manage still exist.
+ */
+ if (fasttrap_meta_id != DTRACE_METAPROVNONE &&
+ dtrace_meta_unregister(fasttrap_meta_id) != 0)
+ return (-1);
+
+ /*
+ * Iterate over all of our providers. If a process that corresponds
+ * to a provider's pid still exists, fail to detach.
+ */
+ for (i = 0; i < fasttrap_provs.fth_nent; i++) {
+ fasttrap_provider_t **fpp, *fp;
+ fasttrap_bucket_t *bucket = &fasttrap_provs.fth_table[i];
+
+ mutex_enter(&bucket->ftb_mtx);
+ fpp = (fasttrap_provider_t **)&bucket->ftb_data;
+ while ((fp = *fpp) != NULL) {
+ /*
+ * Acquire and release the lock as a simple way of
+ * waiting for any other consumer to finish with
+ * this provider. A thread must first acquire the
+ * bucket lock so there's no chance of another thread
+ * blocking on the provider's lock.
+ */
+ mutex_enter(&fp->ftp_mtx);
+ mutex_exit(&fp->ftp_mtx);
+
+ if (dtrace_unregister(fp->ftp_provid) != 0) {
+ fail = 1;
+ fpp = &fp->ftp_next;
+ } else {
+ *fpp = fp->ftp_next;
+ fasttrap_provider_free(fp);
+ }
+ }
+
+ mutex_exit(&bucket->ftb_mtx);
+ }
+
+ if (fail) {
+ (void) dtrace_meta_register("fasttrap", &fasttrap_mops, NULL,
+ &fasttrap_meta_id);
+
+ return (-1);
+ }
+
+ /*
+ * Stop new processes from entering these hooks now, before the
+ * fasttrap_cleanup thread runs. That way all processes will hopefully
+ * be out of these hooks before we free fasttrap_provs.fth_table.
+ */
+ ASSERT(dtrace_fasttrap_fork == &fasttrap_fork);
+ dtrace_fasttrap_fork = NULL;
+
+ ASSERT(dtrace_fasttrap_exec == &fasttrap_exec_exit);
+ dtrace_fasttrap_exec = NULL;
+
+ ASSERT(dtrace_fasttrap_exit == &fasttrap_exec_exit);
+ dtrace_fasttrap_exit = NULL;
+
+ mtx_lock(&fasttrap_cleanup_mtx);
+ fasttrap_cleanup_drain = 1;
+ /* Wait for the cleanup thread to finish up and signal us. */
+ wakeup(&fasttrap_cleanup_cv);
+ mtx_sleep(&fasttrap_cleanup_drain, &fasttrap_cleanup_mtx, 0, "ftcld",
+ 0);
+ fasttrap_cleanup_proc = NULL;
+ mtx_destroy(&fasttrap_cleanup_mtx);
+
+#ifdef DEBUG
+ mutex_enter(&fasttrap_count_mtx);
+ ASSERT(fasttrap_pid_count == 0);
+ mutex_exit(&fasttrap_count_mtx);
+#endif
+
+#ifndef illumos
+ EVENTHANDLER_DEREGISTER(thread_dtor, fasttrap_thread_dtor_tag);
+
+ for (i = 0; i < fasttrap_tpoints.fth_nent; i++)
+ mutex_destroy(&fasttrap_tpoints.fth_table[i].ftb_mtx);
+ for (i = 0; i < fasttrap_provs.fth_nent; i++)
+ mutex_destroy(&fasttrap_provs.fth_table[i].ftb_mtx);
+ for (i = 0; i < fasttrap_procs.fth_nent; i++)
+ mutex_destroy(&fasttrap_procs.fth_table[i].ftb_mtx);
+#endif
+ kmem_free(fasttrap_tpoints.fth_table,
+ fasttrap_tpoints.fth_nent * sizeof (fasttrap_bucket_t));
+ fasttrap_tpoints.fth_nent = 0;
+
+ kmem_free(fasttrap_provs.fth_table,
+ fasttrap_provs.fth_nent * sizeof (fasttrap_bucket_t));
+ fasttrap_provs.fth_nent = 0;
+
+ kmem_free(fasttrap_procs.fth_table,
+ fasttrap_procs.fth_nent * sizeof (fasttrap_bucket_t));
+ fasttrap_procs.fth_nent = 0;
+
+#ifndef illumos
+ destroy_dev(fasttrap_cdev);
+ mutex_destroy(&fasttrap_count_mtx);
+ rm_destroy(&fasttrap_tp_lock);
+#endif
+
+ return (0);
+}
+
+/* ARGSUSED */
+static int
+fasttrap_modevent(module_t mod __unused, int type, void *data __unused)
+{
+ int error = 0;
+
+ switch (type) {
+ case MOD_LOAD:
+ break;
+
+ case MOD_UNLOAD:
+ break;
+
+ case MOD_SHUTDOWN:
+ break;
+
+ default:
+ error = EOPNOTSUPP;
+ break;
+ }
+ return (error);
+}
+
+SYSINIT(fasttrap_load, SI_SUB_DTRACE_PROVIDER, SI_ORDER_ANY, fasttrap_load,
+ NULL);
+SYSUNINIT(fasttrap_unload, SI_SUB_DTRACE_PROVIDER, SI_ORDER_ANY,
+ fasttrap_unload, NULL);
+
+DEV_MODULE(fasttrap, fasttrap_modevent, NULL);
+MODULE_VERSION(fasttrap, 1);
+MODULE_DEPEND(fasttrap, dtrace, 1, 1, 1);
+MODULE_DEPEND(fasttrap, opensolaris, 1, 1, 1);
diff --git a/sys/cddl/contrib/opensolaris/uts/common/fs/vnode.c b/sys/cddl/contrib/opensolaris/uts/common/fs/vnode.c
new file mode 100644
index 000000000000..6d82470d220a
--- /dev/null
+++ b/sys/cddl/contrib/opensolaris/uts/common/fs/vnode.c
@@ -0,0 +1,94 @@
+/*
+ * CDDL HEADER START
+ *
+ * The contents of this file are subject to the terms of the
+ * Common Development and Distribution License (the "License").
+ * You may not use this file except in compliance with the License.
+ *
+ * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
+ * or http://www.opensolaris.org/os/licensing.
+ * See the License for the specific language governing permissions
+ * and limitations under the License.
+ *
+ * When distributing Covered Code, include this CDDL HEADER in each
+ * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
+ * If applicable, add the following below this CDDL HEADER, with the
+ * fields enclosed by brackets "[]" replaced with your own identifying
+ * information: Portions Copyright [yyyy] [name of copyright owner]
+ *
+ * CDDL HEADER END
+ */
+
+/*
+ * Copyright (c) 1988, 2010, Oracle and/or its affiliates. All rights reserved.
+ */
+
+/* Copyright (c) 1983, 1984, 1985, 1986, 1987, 1988, 1989 AT&T */
+/* All Rights Reserved */
+
+/*
+ * University Copyright- Copyright (c) 1982, 1986, 1988
+ * The Regents of the University of California
+ * All Rights Reserved
+ *
+ * University Acknowledgment- Portions of this document are derived from
+ * software developed by the University of California, Berkeley, and its
+ * contributors.
+ */
+
+#include <sys/types.h>
+#include <sys/param.h>
+#include <sys/proc.h>
+#include <sys/taskq.h>
+#include <sys/vnode.h>
+
+/* Extensible attribute (xva) routines. */
+
+/*
+ * Zero out the structure, set the size of the requested/returned bitmaps,
+ * set AT_XVATTR in the embedded vattr_t's va_mask, and set up the pointer
+ * to the returned attributes array.
+ */
+void
+xva_init(xvattr_t *xvap)
+{
+ bzero(xvap, sizeof (xvattr_t));
+ xvap->xva_mapsize = XVA_MAPSIZE;
+ xvap->xva_magic = XVA_MAGIC;
+ xvap->xva_vattr.va_mask = AT_XVATTR;
+ xvap->xva_rtnattrmapp = &(xvap->xva_rtnattrmap)[0];
+}
+
+/*
+ * If AT_XVATTR is set, returns a pointer to the embedded xoptattr_t
+ * structure. Otherwise, returns NULL.
+ */
+xoptattr_t *
+xva_getxoptattr(xvattr_t *xvap)
+{
+ xoptattr_t *xoap = NULL;
+ if (xvap->xva_vattr.va_mask & AT_XVATTR)
+ xoap = &xvap->xva_xoptattrs;
+ return (xoap);
+}
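+
+/*
+ * Illustrative sketch (not part of the original source) of how a caller
+ * typically uses these routines; XVA_SET_REQ()/XAT_READONLY and the
+ * three-argument FreeBSD form of VOP_GETATTR are assumptions here:
+ *
+ *     xvattr_t xva;
+ *     xoptattr_t *xoap;
+ *
+ *     xva_init(&xva);
+ *     XVA_SET_REQ(&xva, XAT_READONLY);
+ *     (void) VOP_GETATTR(vp, &xva.xva_vattr, cr);
+ *     if ((xoap = xva_getxoptattr(&xva)) != NULL)
+ *         ... inspect xoap->xoa_readonly ...
+ */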
+
+/*
+ * Like vn_rele() except if we are going to call VOP_INACTIVE() then do it
+ * asynchronously using a taskq. This can avoid deadlocks caused by re-entering
+ * the file system as a result of releasing the vnode. Note, file systems
+ * already have to handle the race where the vnode's count is incremented
+ * before the inactive routine is called and does its locking.
+ *
+ * Warning: Excessive use of this routine can lead to performance problems.
+ * This is because taskqs throttle back allocation if too many are created.
+ */
+void
+vn_rele_async(vnode_t *vp, taskq_t *taskq)
+{
+ VERIFY(vp->v_count > 0);
+ if (refcount_release_if_not_last(&vp->v_usecount)) {
+ return;
+ }
+ VERIFY(taskq_dispatch(taskq,
+ (task_func_t *)vrele, vp, TQ_SLEEP) != 0);
+}
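+
+/*
+ * Illustrative sketch (not part of the original source): a subsystem that
+ * must drop many vnode references from a context where re-entering the
+ * file system is unsafe might use a private taskq; the taskq parameters
+ * and minclsyspri are illustrative assumptions:
+ *
+ *     taskq_t *tq = taskq_create("vn_rele", 1, minclsyspri, 1, 1, 0);
+ *     ...
+ *     vn_rele_async(vp, tq);    releases vp via the taskq if last reference
+ */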
diff --git a/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/THIRDPARTYLICENSE.cityhash b/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/THIRDPARTYLICENSE.cityhash
new file mode 100644
index 000000000000..e558b2a50358
--- /dev/null
+++ b/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/THIRDPARTYLICENSE.cityhash
@@ -0,0 +1,19 @@
+Copyright (c) 2011 Google, Inc.
+
+Permission is hereby granted, free of charge, to any person obtaining a copy
+of this software and associated documentation files (the "Software"), to deal
+in the Software without restriction, including without limitation the rights
+to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+copies of the Software, and to permit persons to whom the Software is
+furnished to do so, subject to the following conditions:
+
+The above copyright notice and this permission notice shall be included in
+all copies or substantial portions of the Software.
+
+THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
+THE SOFTWARE.
diff --git a/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/THIRDPARTYLICENSE.cityhash.descrip b/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/THIRDPARTYLICENSE.cityhash.descrip
new file mode 100644
index 000000000000..f98cb76dfc91
--- /dev/null
+++ b/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/THIRDPARTYLICENSE.cityhash.descrip
@@ -0,0 +1 @@
+CITYHASH CHECKSUM FUNCTIONALITY IN ZFS
diff --git a/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/THIRDPARTYLICENSE.lz4 b/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/THIRDPARTYLICENSE.lz4
new file mode 100644
index 000000000000..722cc75f01e9
--- /dev/null
+++ b/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/THIRDPARTYLICENSE.lz4
@@ -0,0 +1,30 @@
+LZ4 - Fast LZ compression algorithm
+Copyright (C) 2011-2013, Yann Collet.
+BSD 2-Clause License (http://www.opensource.org/licenses/bsd-license.php)
+
+Redistribution and use in source and binary forms, with or without
+modification, are permitted provided that the following conditions are
+met:
+
+ * Redistributions of source code must retain the above copyright
+ notice, this list of conditions and the following disclaimer.
+
+ * Redistributions in binary form must reproduce the above copyright
+ notice, this list of conditions and the following disclaimer in the
+ documentation and/or other materials provided with the distribution.
+
+THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS
+IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED
+TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A
+PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER
+OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
+EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
+PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
+PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
+LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
+NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+
+You can contact the author at :
+- LZ4 homepage : http://fastcompression.blogspot.com/p/lz4.html
+- LZ4 source repository : http://code.google.com/p/lz4/
diff --git a/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/THIRDPARTYLICENSE.lz4.descrip b/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/THIRDPARTYLICENSE.lz4.descrip
new file mode 100644
index 000000000000..211f679b5749
--- /dev/null
+++ b/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/THIRDPARTYLICENSE.lz4.descrip
@@ -0,0 +1 @@
+LZ4 COMPRESSION FUNCTIONALITY IN ZFS
diff --git a/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/abd.c b/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/abd.c
new file mode 100644
index 000000000000..1843c8161038
--- /dev/null
+++ b/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/abd.c
@@ -0,0 +1,960 @@
+/*
+ * This file and its contents are supplied under the terms of the
+ * Common Development and Distribution License ("CDDL"), version 1.0.
+ * You may only use this file in accordance with the terms of version
+ * 1.0 of the CDDL.
+ *
+ * A full copy of the text of the CDDL should have accompanied this
+ * source. A copy of the CDDL is also available via the Internet at
+ * http://www.illumos.org/license/CDDL.
+ */
+
+/*
+ * Copyright (c) 2014 by Chunwei Chen. All rights reserved.
+ * Copyright (c) 2016 by Delphix. All rights reserved.
+ */
+
+/*
+ * ARC buffer data (ABD).
+ *
+ * ABDs are an abstract data structure for the ARC which can use two
+ * different ways of storing the underlying data:
+ *
+ * (a) Linear buffer. In this case, all the data in the ABD is stored in one
+ * contiguous buffer in memory (from a zio_[data_]buf_* kmem cache).
+ *
+ * +-------------------+
+ * | ABD (linear) |
+ * | abd_flags = ... |
+ * | abd_size = ... | +--------------------------------+
+ * | abd_buf ------------->| raw buffer of size abd_size |
+ * +-------------------+ +--------------------------------+
+ * no abd_chunks
+ *
+ * (b) Scattered buffer. In this case, the data in the ABD is split into
+ * equal-sized chunks (from the abd_chunk_cache kmem_cache), with pointers
+ * to the chunks recorded in an array at the end of the ABD structure.
+ *
+ * +-------------------+
+ * | ABD (scattered) |
+ * | abd_flags = ... |
+ * | abd_size = ... |
+ * | abd_offset = 0 | +-----------+
+ * | abd_chunks[0] ----------------------------->| chunk 0 |
+ * | abd_chunks[1] ---------------------+ +-----------+
+ * | ... | | +-----------+
+ * | abd_chunks[N-1] ---------+ +------->| chunk 1 |
+ * +-------------------+ | +-----------+
+ * | ...
+ * | +-----------+
+ * +----------------->| chunk N-1 |
+ * +-----------+
+ *
+ * Using a large proportion of scattered ABDs decreases ARC fragmentation:
+ * when we are at the limit of allocatable space, equal-size chunks allow us
+ * to quickly reclaim enough space for a new large allocation (assuming it is
+ * also scattered).
+ *
+ * In addition to directly allocating a linear or scattered ABD, it is also
+ * possible to create an ABD by requesting the "sub-ABD" starting at an offset
+ * within an existing ABD. In linear buffers this is simple (set abd_buf of
+ * the new ABD to the starting point within the original raw buffer), but
+ * scattered ABDs are a little more complex. The new ABD makes a copy of the
+ * relevant abd_chunks pointers (but not the underlying data). However, to
+ * provide arbitrary rather than only chunk-aligned starting offsets, it also
+ * tracks an abd_offset field which represents the starting point of the data
+ * within the first chunk in abd_chunks. For both linear and scattered ABDs,
+ * creating an offset ABD marks the original ABD as the offset's parent, and the
+ * original ABD's abd_children refcount is incremented. This data allows us to
+ * ensure the root ABD isn't deleted before its children.
+ *
+ * Most consumers should never need to know what type of ABD they're using --
+ * the ABD public API ensures that it's possible to transparently switch from
+ * using a linear ABD to a scattered one when doing so would be beneficial.
+ *
+ * If you need to use the data within an ABD directly and you know it's
+ * linear (because you allocated it), you can use abd_to_buf() to access the
+ * underlying raw buffer. Otherwise, use one of the abd_borrow_buf*
+ * functions, which will allocate a raw buffer if necessary. Use the
+ * abd_return_buf*
+ * functions to return any raw buffers that are no longer necessary when you're
+ * done using them.
+ *
+ * There are a variety of ABD APIs that implement basic buffer operations:
+ * compare, copy, read, write, and fill with zeroes. If you need a custom
+ * function which progressively accesses the whole ABD, use the abd_iterate_*
+ * functions.
+ */
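+
+/*
+ * Illustrative sketch (not part of the original source) of the consumer
+ * API described above:
+ *
+ *     abd_t *abd = abd_alloc(size, B_FALSE);      scattered unless disabled
+ *     void *buf = abd_borrow_buf_copy(abd, size); raw view of the data
+ *     ... read or modify buf ...
+ *     abd_return_buf_copy(abd, buf, size);        copy any changes back
+ *     abd_free(abd);
+ */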
+
+#include <sys/abd.h>
+#include <sys/param.h>
+#include <sys/zio.h>
+#include <sys/zfs_context.h>
+#include <sys/zfs_znode.h>
+
+typedef struct abd_stats {
+ kstat_named_t abdstat_struct_size;
+ kstat_named_t abdstat_scatter_cnt;
+ kstat_named_t abdstat_scatter_data_size;
+ kstat_named_t abdstat_scatter_chunk_waste;
+ kstat_named_t abdstat_linear_cnt;
+ kstat_named_t abdstat_linear_data_size;
+} abd_stats_t;
+
+static abd_stats_t abd_stats = {
+ /* Amount of memory occupied by all of the abd_t struct allocations */
+ { "struct_size", KSTAT_DATA_UINT64 },
+ /*
+ * The number of scatter ABDs which are currently allocated, excluding
+ * ABDs which don't own their data (for instance the ones which were
+ * allocated through abd_get_offset()).
+ */
+ { "scatter_cnt", KSTAT_DATA_UINT64 },
+ /* Amount of data stored in all scatter ABDs tracked by scatter_cnt */
+ { "scatter_data_size", KSTAT_DATA_UINT64 },
+ /*
+ * The amount of space wasted at the end of the last chunk across all
+ * scatter ABDs tracked by scatter_cnt.
+ */
+ { "scatter_chunk_waste", KSTAT_DATA_UINT64 },
+ /*
+ * The number of linear ABDs which are currently allocated, excluding
+ * ABDs which don't own their data (for instance the ones which were
+ * allocated through abd_get_offset() and abd_get_from_buf()). If an
+ * ABD takes ownership of its buf then it will become tracked.
+ */
+ { "linear_cnt", KSTAT_DATA_UINT64 },
+ /* Amount of data stored in all linear ABDs tracked by linear_cnt */
+ { "linear_data_size", KSTAT_DATA_UINT64 },
+};
+
+#define ABDSTAT(stat) (abd_stats.stat.value.ui64)
+#define ABDSTAT_INCR(stat, val) \
+ atomic_add_64(&abd_stats.stat.value.ui64, (val))
+#define ABDSTAT_BUMP(stat) ABDSTAT_INCR(stat, 1)
+#define ABDSTAT_BUMPDOWN(stat) ABDSTAT_INCR(stat, -1)
+
+/*
+ * It is possible to make all future ABDs be linear by setting this to B_FALSE.
+ * Otherwise, ABDs are allocated scattered by default unless the caller uses
+ * abd_alloc_linear().
+ */
+boolean_t zfs_abd_scatter_enabled = B_TRUE;
+
+/*
+ * The size of the chunks ABD allocates. Because the sizes allocated from the
+ * kmem_cache can't change, this tunable can only be modified at boot. Changing
+ * it at runtime would cause ABD iteration to work incorrectly for ABDs which
+ * were allocated with the old size, so a safeguard has been put in place which
+ * will cause the machine to panic if you change it and try to access the data
+ * within a scattered ABD.
+ */
+size_t zfs_abd_chunk_size = 4096;
+
+#if defined(__FreeBSD__) && defined(_KERNEL)
+SYSCTL_DECL(_vfs_zfs);
+
+SYSCTL_INT(_vfs_zfs, OID_AUTO, abd_scatter_enabled, CTLFLAG_RWTUN,
+ &zfs_abd_scatter_enabled, 0, "Enable scattered ARC data buffers");
+SYSCTL_ULONG(_vfs_zfs, OID_AUTO, abd_chunk_size, CTLFLAG_RDTUN,
+ &zfs_abd_chunk_size, 0, "The size of the chunks ABD allocates");
+#endif
+
+#ifdef _KERNEL
+extern vmem_t *zio_alloc_arena;
+#endif
+
+kmem_cache_t *abd_chunk_cache;
+static kstat_t *abd_ksp;
+
+extern inline boolean_t abd_is_linear(abd_t *abd);
+extern inline void abd_copy(abd_t *dabd, abd_t *sabd, size_t size);
+extern inline void abd_copy_from_buf(abd_t *abd, const void *buf, size_t size);
+extern inline void abd_copy_to_buf(void* buf, abd_t *abd, size_t size);
+extern inline int abd_cmp_buf(abd_t *abd, const void *buf, size_t size);
+extern inline void abd_zero(abd_t *abd, size_t size);
+
+static void *
+abd_alloc_chunk(void)
+{
+ void *c = kmem_cache_alloc(abd_chunk_cache, KM_PUSHPAGE);
+ ASSERT3P(c, !=, NULL);
+ return (c);
+}
+
+static void
+abd_free_chunk(void *c)
+{
+ kmem_cache_free(abd_chunk_cache, c);
+}
+
+void
+abd_init(void)
+{
+#ifdef illumos
+ vmem_t *data_alloc_arena = NULL;
+
+#ifdef _KERNEL
+ data_alloc_arena = zio_alloc_arena;
+#endif
+
+ /*
+ * Since ABD chunks do not appear in crash dumps, we pass KMC_NOTOUCH
+ * so that no allocator metadata is stored with the buffers.
+ */
+ abd_chunk_cache = kmem_cache_create("abd_chunk", zfs_abd_chunk_size, 0,
+ NULL, NULL, NULL, NULL, data_alloc_arena, KMC_NOTOUCH);
+#else
+ abd_chunk_cache = kmem_cache_create("abd_chunk", zfs_abd_chunk_size, 0,
+ NULL, NULL, NULL, NULL, 0, KMC_NOTOUCH | KMC_NODEBUG);
+#endif
+ abd_ksp = kstat_create("zfs", 0, "abdstats", "misc", KSTAT_TYPE_NAMED,
+ sizeof (abd_stats) / sizeof (kstat_named_t), KSTAT_FLAG_VIRTUAL);
+ if (abd_ksp != NULL) {
+ abd_ksp->ks_data = &abd_stats;
+ kstat_install(abd_ksp);
+ }
+}
+
+void
+abd_fini(void)
+{
+ if (abd_ksp != NULL) {
+ kstat_delete(abd_ksp);
+ abd_ksp = NULL;
+ }
+
+ kmem_cache_destroy(abd_chunk_cache);
+ abd_chunk_cache = NULL;
+}
+
+static inline size_t
+abd_chunkcnt_for_bytes(size_t size)
+{
+ return (P2ROUNDUP(size, zfs_abd_chunk_size) / zfs_abd_chunk_size);
+}
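+
+/*
+ * For example, with the default zfs_abd_chunk_size of 4096, a 5000-byte
+ * ABD needs P2ROUNDUP(5000, 4096) / 4096 == 2 chunks.
+ */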
+
+static inline size_t
+abd_scatter_chunkcnt(abd_t *abd)
+{
+ ASSERT(!abd_is_linear(abd));
+ return (abd_chunkcnt_for_bytes(
+ abd->abd_u.abd_scatter.abd_offset + abd->abd_size));
+}
+
+static inline void
+abd_verify(abd_t *abd)
+{
+ ASSERT3U(abd->abd_size, >, 0);
+ ASSERT3U(abd->abd_size, <=, SPA_MAXBLOCKSIZE);
+ ASSERT3U(abd->abd_flags, ==, abd->abd_flags & (ABD_FLAG_LINEAR |
+ ABD_FLAG_OWNER | ABD_FLAG_META));
+ IMPLY(abd->abd_parent != NULL, !(abd->abd_flags & ABD_FLAG_OWNER));
+ IMPLY(abd->abd_flags & ABD_FLAG_META, abd->abd_flags & ABD_FLAG_OWNER);
+ if (abd_is_linear(abd)) {
+ ASSERT3P(abd->abd_u.abd_linear.abd_buf, !=, NULL);
+ } else {
+ ASSERT3U(abd->abd_u.abd_scatter.abd_offset, <,
+ zfs_abd_chunk_size);
+ size_t n = abd_scatter_chunkcnt(abd);
+ for (int i = 0; i < n; i++) {
+ ASSERT3P(
+ abd->abd_u.abd_scatter.abd_chunks[i], !=, NULL);
+ }
+ }
+}
+
+static inline abd_t *
+abd_alloc_struct(size_t chunkcnt)
+{
+ size_t size = offsetof(abd_t, abd_u.abd_scatter.abd_chunks[chunkcnt]);
+ abd_t *abd = kmem_alloc(size, KM_PUSHPAGE);
+ ASSERT3P(abd, !=, NULL);
+ ABDSTAT_INCR(abdstat_struct_size, size);
+
+ return (abd);
+}
+
+static inline void
+abd_free_struct(abd_t *abd)
+{
+ size_t chunkcnt = abd_is_linear(abd) ? 0 : abd_scatter_chunkcnt(abd);
+ int size = offsetof(abd_t, abd_u.abd_scatter.abd_chunks[chunkcnt]);
+ kmem_free(abd, size);
+ ABDSTAT_INCR(abdstat_struct_size, -size);
+}
+
+/*
+ * Allocate an ABD, along with its own underlying data buffers. Use this if you
+ * don't care whether the ABD is linear or not.
+ */
+abd_t *
+abd_alloc(size_t size, boolean_t is_metadata)
+{
+ if (!zfs_abd_scatter_enabled || size <= zfs_abd_chunk_size)
+ return (abd_alloc_linear(size, is_metadata));
+
+ VERIFY3U(size, <=, SPA_MAXBLOCKSIZE);
+
+ size_t n = abd_chunkcnt_for_bytes(size);
+ abd_t *abd = abd_alloc_struct(n);
+
+ abd->abd_flags = ABD_FLAG_OWNER;
+ if (is_metadata) {
+ abd->abd_flags |= ABD_FLAG_META;
+ }
+ abd->abd_size = size;
+ abd->abd_parent = NULL;
+ zfs_refcount_create(&abd->abd_children);
+
+ abd->abd_u.abd_scatter.abd_offset = 0;
+ abd->abd_u.abd_scatter.abd_chunk_size = zfs_abd_chunk_size;
+
+ for (int i = 0; i < n; i++) {
+ void *c = abd_alloc_chunk();
+ ASSERT3P(c, !=, NULL);
+ abd->abd_u.abd_scatter.abd_chunks[i] = c;
+ }
+
+ ABDSTAT_BUMP(abdstat_scatter_cnt);
+ ABDSTAT_INCR(abdstat_scatter_data_size, size);
+ ABDSTAT_INCR(abdstat_scatter_chunk_waste,
+ n * zfs_abd_chunk_size - size);
+
+ return (abd);
+}
+
+static void
+abd_free_scatter(abd_t *abd)
+{
+ size_t n = abd_scatter_chunkcnt(abd);
+ for (int i = 0; i < n; i++) {
+ abd_free_chunk(abd->abd_u.abd_scatter.abd_chunks[i]);
+ }
+
+ zfs_refcount_destroy(&abd->abd_children);
+ ABDSTAT_BUMPDOWN(abdstat_scatter_cnt);
+ ABDSTAT_INCR(abdstat_scatter_data_size, -(int)abd->abd_size);
+ ABDSTAT_INCR(abdstat_scatter_chunk_waste,
+ abd->abd_size - n * zfs_abd_chunk_size);
+
+ abd_free_struct(abd);
+}
+
+/*
+ * Allocate an ABD that must be linear, along with its own underlying data
+ * buffer. Only use this when it would be very annoying to write your ABD
+ * consumer with a scattered ABD.
+ */
+abd_t *
+abd_alloc_linear(size_t size, boolean_t is_metadata)
+{
+ abd_t *abd = abd_alloc_struct(0);
+
+ VERIFY3U(size, <=, SPA_MAXBLOCKSIZE);
+
+ abd->abd_flags = ABD_FLAG_LINEAR | ABD_FLAG_OWNER;
+ if (is_metadata) {
+ abd->abd_flags |= ABD_FLAG_META;
+ }
+ abd->abd_size = size;
+ abd->abd_parent = NULL;
+ zfs_refcount_create(&abd->abd_children);
+
+ if (is_metadata) {
+ abd->abd_u.abd_linear.abd_buf = zio_buf_alloc(size);
+ } else {
+ abd->abd_u.abd_linear.abd_buf = zio_data_buf_alloc(size);
+ }
+
+ ABDSTAT_BUMP(abdstat_linear_cnt);
+ ABDSTAT_INCR(abdstat_linear_data_size, size);
+
+ return (abd);
+}
+
+static void
+abd_free_linear(abd_t *abd)
+{
+ if (abd->abd_flags & ABD_FLAG_META) {
+ zio_buf_free(abd->abd_u.abd_linear.abd_buf, abd->abd_size);
+ } else {
+ zio_data_buf_free(abd->abd_u.abd_linear.abd_buf, abd->abd_size);
+ }
+
+ zfs_refcount_destroy(&abd->abd_children);
+ ABDSTAT_BUMPDOWN(abdstat_linear_cnt);
+ ABDSTAT_INCR(abdstat_linear_data_size, -(int)abd->abd_size);
+
+ abd_free_struct(abd);
+}
+
+/*
+ * Free an ABD. Only use this on ABDs allocated with abd_alloc() or
+ * abd_alloc_linear().
+ */
+void
+abd_free(abd_t *abd)
+{
+ abd_verify(abd);
+ ASSERT3P(abd->abd_parent, ==, NULL);
+ ASSERT(abd->abd_flags & ABD_FLAG_OWNER);
+ if (abd_is_linear(abd))
+ abd_free_linear(abd);
+ else
+ abd_free_scatter(abd);
+}
+
+/*
+ * Allocate an ABD of the same format (same metadata flag, same scatterize
+ * setting) as another ABD.
+ */
+abd_t *
+abd_alloc_sametype(abd_t *sabd, size_t size)
+{
+ boolean_t is_metadata = (sabd->abd_flags & ABD_FLAG_META) != 0;
+ if (abd_is_linear(sabd)) {
+ return (abd_alloc_linear(size, is_metadata));
+ } else {
+ return (abd_alloc(size, is_metadata));
+ }
+}
+
+/*
+ * If we're going to use this ABD for doing I/O using the block layer, the
+ * consumer of the ABD data doesn't care if it's scattered or not, and we don't
+ * plan to store this ABD in memory for a long period of time, we should
+ * allocate the ABD type that requires the least data copying to do the I/O.
+ *
+ * Currently this means linear ABDs; however, if ldi_strategy() can ever issue
+ * I/Os using a scatter/gather list, we should switch to that and replace this
+ * call with vanilla abd_alloc().
+ */
+abd_t *
+abd_alloc_for_io(size_t size, boolean_t is_metadata)
+{
+ return (abd_alloc_linear(size, is_metadata));
+}
+
+/*
+ * Allocate a new ABD to point to offset off of sabd. It shares the underlying
+ * buffer data with sabd. Use abd_put() to free. sabd must not be freed while
+ * any derived ABDs exist.
+ */
+abd_t *
+abd_get_offset(abd_t *sabd, size_t off)
+{
+ abd_t *abd;
+
+ abd_verify(sabd);
+ ASSERT3U(off, <=, sabd->abd_size);
+
+ if (abd_is_linear(sabd)) {
+ abd = abd_alloc_struct(0);
+
+ /*
+ * Even if this buf is filesystem metadata, we only track that
+ * if we own the underlying data buffer, which is not true in
+ * this case. Therefore, we don't ever use ABD_FLAG_META here.
+ */
+ abd->abd_flags = ABD_FLAG_LINEAR;
+
+ abd->abd_u.abd_linear.abd_buf =
+ (char *)sabd->abd_u.abd_linear.abd_buf + off;
+ } else {
+ size_t new_offset = sabd->abd_u.abd_scatter.abd_offset + off;
+ size_t chunkcnt = abd_scatter_chunkcnt(sabd) -
+ (new_offset / zfs_abd_chunk_size);
+
+ abd = abd_alloc_struct(chunkcnt);
+
+ /*
+ * Even if this buf is filesystem metadata, we only track that
+ * if we own the underlying data buffer, which is not true in
+ * this case. Therefore, we don't ever use ABD_FLAG_META here.
+ */
+ abd->abd_flags = 0;
+
+ abd->abd_u.abd_scatter.abd_offset =
+ new_offset % zfs_abd_chunk_size;
+ abd->abd_u.abd_scatter.abd_chunk_size = zfs_abd_chunk_size;
+
+ /* Copy the scatterlist starting at the correct offset */
+ (void) memcpy(&abd->abd_u.abd_scatter.abd_chunks,
+ &sabd->abd_u.abd_scatter.abd_chunks[new_offset /
+ zfs_abd_chunk_size],
+ chunkcnt * sizeof (void *));
+ }
+
+ abd->abd_size = sabd->abd_size - off;
+ abd->abd_parent = sabd;
+ zfs_refcount_create(&abd->abd_children);
+ (void) zfs_refcount_add_many(&sabd->abd_children, abd->abd_size, abd);
+
+ return (abd);
+}
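+
+/*
+ * Illustrative sketch (not part of the original source): carving out a
+ * sub-ABD that aliases sabd's data starting at byte 512:
+ *
+ *     abd_t *sub = abd_get_offset(sabd, 512);
+ *     ... use sub; it shares data with sabd, so sabd must stay alive ...
+ *     abd_put(sub);
+ */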
+
+/*
+ * Allocate a linear ABD structure for buf. You must free this with abd_put()
+ * since the resulting ABD doesn't own its own buffer.
+ */
+abd_t *
+abd_get_from_buf(void *buf, size_t size)
+{
+ abd_t *abd = abd_alloc_struct(0);
+
+ VERIFY3U(size, <=, SPA_MAXBLOCKSIZE);
+
+ /*
+ * Even if this buf is filesystem metadata, we only track that if we
+ * own the underlying data buffer, which is not true in this case.
+ * Therefore, we don't ever use ABD_FLAG_META here.
+ */
+ abd->abd_flags = ABD_FLAG_LINEAR;
+ abd->abd_size = size;
+ abd->abd_parent = NULL;
+ zfs_refcount_create(&abd->abd_children);
+
+ abd->abd_u.abd_linear.abd_buf = buf;
+
+ return (abd);
+}
+
+/*
+ * Free an ABD allocated from abd_get_offset() or abd_get_from_buf(). Will not
+ * free the underlying scatterlist or buffer.
+ */
+void
+abd_put(abd_t *abd)
+{
+ abd_verify(abd);
+ ASSERT(!(abd->abd_flags & ABD_FLAG_OWNER));
+
+ if (abd->abd_parent != NULL) {
+ (void) zfs_refcount_remove_many(&abd->abd_parent->abd_children,
+ abd->abd_size, abd);
+ }
+
+ zfs_refcount_destroy(&abd->abd_children);
+ abd_free_struct(abd);
+}
+
+/*
+ * Get the raw buffer associated with a linear ABD.
+ */
+void *
+abd_to_buf(abd_t *abd)
+{
+ ASSERT(abd_is_linear(abd));
+ abd_verify(abd);
+ return (abd->abd_u.abd_linear.abd_buf);
+}
+
+/*
+ * Borrow a raw buffer from an ABD without copying the contents of the ABD
+ * into the buffer. If the ABD is scattered, this will allocate a raw buffer
+ * whose contents are undefined. To copy over the existing data in the ABD, use
+ * abd_borrow_buf_copy() instead.
+ */
+void *
+abd_borrow_buf(abd_t *abd, size_t n)
+{
+ void *buf;
+ abd_verify(abd);
+ ASSERT3U(abd->abd_size, >=, n);
+ if (abd_is_linear(abd)) {
+ buf = abd_to_buf(abd);
+ } else {
+ buf = zio_buf_alloc(n);
+ }
+ (void) zfs_refcount_add_many(&abd->abd_children, n, buf);
+
+ return (buf);
+}
+
+void *
+abd_borrow_buf_copy(abd_t *abd, size_t n)
+{
+ void *buf = abd_borrow_buf(abd, n);
+ if (!abd_is_linear(abd)) {
+ abd_copy_to_buf(buf, abd, n);
+ }
+ return (buf);
+}
+
+/*
+ * Return a borrowed raw buffer to an ABD. If the ABD is scattered, this will
+ * not change the contents of the ABD and will ASSERT that you didn't modify
+ * the buffer since it was borrowed. If you want any changes you made to buf to
+ * be copied back to abd, use abd_return_buf_copy() instead.
+ */
+void
+abd_return_buf(abd_t *abd, void *buf, size_t n)
+{
+ abd_verify(abd);
+ ASSERT3U(abd->abd_size, >=, n);
+ if (abd_is_linear(abd)) {
+ ASSERT3P(buf, ==, abd_to_buf(abd));
+ } else {
+ ASSERT0(abd_cmp_buf(abd, buf, n));
+ zio_buf_free(buf, n);
+ }
+ (void) zfs_refcount_remove_many(&abd->abd_children, n, buf);
+}
+
+void
+abd_return_buf_copy(abd_t *abd, void *buf, size_t n)
+{
+ if (!abd_is_linear(abd)) {
+ abd_copy_from_buf(abd, buf, n);
+ }
+ abd_return_buf(abd, buf, n);
+}
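+
+/*
+ * Illustrative sketch (not part of the original source): filling an ABD
+ * through a borrowed raw buffer when the old contents don't matter:
+ *
+ *     void *buf = abd_borrow_buf(abd, n);    contents undefined if scattered
+ *     ... generate n bytes into buf ...
+ *     abd_return_buf_copy(abd, buf, n);      copies the data back if needed
+ */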
+
+/*
+ * Give this ABD ownership of the buffer that it's storing. Can only be used on
+ * linear ABDs which were allocated via abd_get_from_buf(), or ones allocated
+ * with abd_alloc_linear() which subsequently released ownership of their buf
+ * with abd_release_ownership_of_buf().
+ */
+void
+abd_take_ownership_of_buf(abd_t *abd, boolean_t is_metadata)
+{
+ ASSERT(abd_is_linear(abd));
+ ASSERT(!(abd->abd_flags & ABD_FLAG_OWNER));
+ abd_verify(abd);
+
+ abd->abd_flags |= ABD_FLAG_OWNER;
+ if (is_metadata) {
+ abd->abd_flags |= ABD_FLAG_META;
+ }
+
+ ABDSTAT_BUMP(abdstat_linear_cnt);
+ ABDSTAT_INCR(abdstat_linear_data_size, abd->abd_size);
+}
+
+void
+abd_release_ownership_of_buf(abd_t *abd)
+{
+ ASSERT(abd_is_linear(abd));
+ ASSERT(abd->abd_flags & ABD_FLAG_OWNER);
+ abd_verify(abd);
+
+ abd->abd_flags &= ~ABD_FLAG_OWNER;
+ /* Disable this flag since we no longer own the data buffer */
+ abd->abd_flags &= ~ABD_FLAG_META;
+
+ ABDSTAT_BUMPDOWN(abdstat_linear_cnt);
+ ABDSTAT_INCR(abdstat_linear_data_size, -(int)abd->abd_size);
+}
+
+struct abd_iter {
+ abd_t *iter_abd; /* ABD being iterated through */
+ size_t iter_pos; /* position (relative to abd_offset) */
+ void *iter_mapaddr; /* addr corresponding to iter_pos */
+ size_t iter_mapsize; /* length of data valid at mapaddr */
+};
+
+static inline size_t
+abd_iter_scatter_chunk_offset(struct abd_iter *aiter)
+{
+ ASSERT(!abd_is_linear(aiter->iter_abd));
+ return ((aiter->iter_abd->abd_u.abd_scatter.abd_offset +
+ aiter->iter_pos) % zfs_abd_chunk_size);
+}
+
+static inline size_t
+abd_iter_scatter_chunk_index(struct abd_iter *aiter)
+{
+ ASSERT(!abd_is_linear(aiter->iter_abd));
+ return ((aiter->iter_abd->abd_u.abd_scatter.abd_offset +
+ aiter->iter_pos) / zfs_abd_chunk_size);
+}
+
+/*
+ * Initialize the abd_iter.
+ */
+static void
+abd_iter_init(struct abd_iter *aiter, abd_t *abd)
+{
+ abd_verify(abd);
+ aiter->iter_abd = abd;
+ aiter->iter_pos = 0;
+ aiter->iter_mapaddr = NULL;
+ aiter->iter_mapsize = 0;
+}
+
+/*
+ * Advance the iterator by a certain amount. Cannot be called when a chunk is
+ * in use. This can be safely called when the aiter has already been
+ * exhausted, in which case this does nothing.
+ */
+static void
+abd_iter_advance(struct abd_iter *aiter, size_t amount)
+{
+ ASSERT3P(aiter->iter_mapaddr, ==, NULL);
+ ASSERT0(aiter->iter_mapsize);
+
+ /* There's nothing left to advance to, so do nothing */
+ if (aiter->iter_pos == aiter->iter_abd->abd_size)
+ return;
+
+ aiter->iter_pos += amount;
+}
+
+/*
+ * Map the current chunk into aiter. This can be safely called when the aiter
+ * has already been exhausted, in which case this does nothing.
+ */
+static void
+abd_iter_map(struct abd_iter *aiter)
+{
+ void *paddr;
+ size_t offset = 0;
+
+ ASSERT3P(aiter->iter_mapaddr, ==, NULL);
+ ASSERT0(aiter->iter_mapsize);
+
+ /* Panic if someone has changed zfs_abd_chunk_size */
+ IMPLY(!abd_is_linear(aiter->iter_abd), zfs_abd_chunk_size ==
+ aiter->iter_abd->abd_u.abd_scatter.abd_chunk_size);
+
+ /* There's nothing left to iterate over, so do nothing */
+ if (aiter->iter_pos == aiter->iter_abd->abd_size)
+ return;
+
+ if (abd_is_linear(aiter->iter_abd)) {
+ offset = aiter->iter_pos;
+ aiter->iter_mapsize = aiter->iter_abd->abd_size - offset;
+ paddr = aiter->iter_abd->abd_u.abd_linear.abd_buf;
+ } else {
+ size_t index = abd_iter_scatter_chunk_index(aiter);
+ offset = abd_iter_scatter_chunk_offset(aiter);
+ aiter->iter_mapsize = zfs_abd_chunk_size - offset;
+ paddr = aiter->iter_abd->abd_u.abd_scatter.abd_chunks[index];
+ }
+ aiter->iter_mapaddr = (char *)paddr + offset;
+}
+
+/*
+ * Unmap the current chunk from aiter. This can be safely called when the
+ * aiter has already been exhausted, in which case this does nothing.
+ */
+static void
+abd_iter_unmap(struct abd_iter *aiter)
+{
+ /* There's nothing left to unmap, so do nothing */
+ if (aiter->iter_pos == aiter->iter_abd->abd_size)
+ return;
+
+ ASSERT3P(aiter->iter_mapaddr, !=, NULL);
+ ASSERT3U(aiter->iter_mapsize, >, 0);
+
+ aiter->iter_mapaddr = NULL;
+ aiter->iter_mapsize = 0;
+}
+
+int
+abd_iterate_func(abd_t *abd, size_t off, size_t size,
+ abd_iter_func_t *func, void *private)
+{
+ int ret = 0;
+ struct abd_iter aiter;
+
+ abd_verify(abd);
+ ASSERT3U(off + size, <=, abd->abd_size);
+
+ abd_iter_init(&aiter, abd);
+ abd_iter_advance(&aiter, off);
+
+ while (size > 0) {
+ abd_iter_map(&aiter);
+
+ size_t len = MIN(aiter.iter_mapsize, size);
+ ASSERT3U(len, >, 0);
+
+ ret = func(aiter.iter_mapaddr, len, private);
+
+ abd_iter_unmap(&aiter);
+
+ if (ret != 0)
+ break;
+
+ size -= len;
+ abd_iter_advance(&aiter, len);
+ }
+
+ return (ret);
+}
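+
+/*
+ * Illustrative sketch (not part of the original source): a custom callback
+ * that counts zero bytes across an entire ABD, chunk by chunk:
+ *
+ *     static int
+ *     count_zeroes_cb(void *buf, size_t size, void *private)
+ *     {
+ *         uint64_t *cnt = private;
+ *
+ *         for (size_t i = 0; i < size; i++)
+ *             if (((const char *)buf)[i] == 0)
+ *                 (*cnt)++;
+ *         return (0);    a nonzero return would stop the iteration
+ *     }
+ *
+ *     uint64_t zeroes = 0;
+ *     (void) abd_iterate_func(abd, 0, abd->abd_size, count_zeroes_cb,
+ *         &zeroes);
+ */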
+
+struct buf_arg {
+ void *arg_buf;
+};
+
+static int
+abd_copy_to_buf_off_cb(void *buf, size_t size, void *private)
+{
+ struct buf_arg *ba_ptr = private;
+
+ (void) memcpy(ba_ptr->arg_buf, buf, size);
+ ba_ptr->arg_buf = (char *)ba_ptr->arg_buf + size;
+
+ return (0);
+}
+
+/*
+ * Copy abd to buf. (off is the offset in abd.)
+ */
+void
+abd_copy_to_buf_off(void *buf, abd_t *abd, size_t off, size_t size)
+{
+ struct buf_arg ba_ptr = { buf };
+
+ (void) abd_iterate_func(abd, off, size, abd_copy_to_buf_off_cb,
+ &ba_ptr);
+}
+
+static int
+abd_cmp_buf_off_cb(void *buf, size_t size, void *private)
+{
+ int ret;
+ struct buf_arg *ba_ptr = private;
+
+ ret = memcmp(buf, ba_ptr->arg_buf, size);
+ ba_ptr->arg_buf = (char *)ba_ptr->arg_buf + size;
+
+ return (ret);
+}
+
+/*
+ * Compare the contents of abd to buf. (off is the offset in abd.)
+ */
+int
+abd_cmp_buf_off(abd_t *abd, const void *buf, size_t off, size_t size)
+{
+ struct buf_arg ba_ptr = { (void *) buf };
+
+ return (abd_iterate_func(abd, off, size, abd_cmp_buf_off_cb, &ba_ptr));
+}
+
+static int
+abd_copy_from_buf_off_cb(void *buf, size_t size, void *private)
+{
+ struct buf_arg *ba_ptr = private;
+
+ (void) memcpy(buf, ba_ptr->arg_buf, size);
+ ba_ptr->arg_buf = (char *)ba_ptr->arg_buf + size;
+
+ return (0);
+}
+
+/*
+ * Copy from buf to abd. (off is the offset in abd.)
+ */
+void
+abd_copy_from_buf_off(abd_t *abd, const void *buf, size_t off, size_t size)
+{
+ struct buf_arg ba_ptr = { (void *) buf };
+
+ (void) abd_iterate_func(abd, off, size, abd_copy_from_buf_off_cb,
+ &ba_ptr);
+}
+
+/*ARGSUSED*/
+static int
+abd_zero_off_cb(void *buf, size_t size, void *private)
+{
+ (void) memset(buf, 0, size);
+ return (0);
+}
+
+/*
+ * Zero out the abd from a particular offset to the end.
+ */
+void
+abd_zero_off(abd_t *abd, size_t off, size_t size)
+{
+ (void) abd_iterate_func(abd, off, size, abd_zero_off_cb, NULL);
+}
+
+/*
+ * Iterate over two ABDs and call func incrementally on the two ABDs' data in
+ * equal-sized chunks (passed to func as raw buffers). func could be called many
+ * times during this iteration.
+ */
+int
+abd_iterate_func2(abd_t *dabd, abd_t *sabd, size_t doff, size_t soff,
+ size_t size, abd_iter_func2_t *func, void *private)
+{
+ int ret = 0;
+ struct abd_iter daiter, saiter;
+
+ abd_verify(dabd);
+ abd_verify(sabd);
+
+ ASSERT3U(doff + size, <=, dabd->abd_size);
+ ASSERT3U(soff + size, <=, sabd->abd_size);
+
+ abd_iter_init(&daiter, dabd);
+ abd_iter_init(&saiter, sabd);
+ abd_iter_advance(&daiter, doff);
+ abd_iter_advance(&saiter, soff);
+
+ while (size > 0) {
+ abd_iter_map(&daiter);
+ abd_iter_map(&saiter);
+
+ size_t dlen = MIN(daiter.iter_mapsize, size);
+ size_t slen = MIN(saiter.iter_mapsize, size);
+ size_t len = MIN(dlen, slen);
+ ASSERT(dlen > 0 || slen > 0);
+
+ ret = func(daiter.iter_mapaddr, saiter.iter_mapaddr, len,
+ private);
+
+ abd_iter_unmap(&saiter);
+ abd_iter_unmap(&daiter);
+
+ if (ret != 0)
+ break;
+
+ size -= len;
+ abd_iter_advance(&daiter, len);
+ abd_iter_advance(&saiter, len);
+ }
+
+ return (ret);
+}
+
+/*ARGSUSED*/
+static int
+abd_copy_off_cb(void *dbuf, void *sbuf, size_t size, void *private)
+{
+ (void) memcpy(dbuf, sbuf, size);
+ return (0);
+}
+
+/*
+ * Copy from sabd to dabd starting from soff and doff.
+ */
+void
+abd_copy_off(abd_t *dabd, abd_t *sabd, size_t doff, size_t soff, size_t size)
+{
+ (void) abd_iterate_func2(dabd, sabd, doff, soff, size,
+ abd_copy_off_cb, NULL);
+}
+
+/*ARGSUSED*/
+static int
+abd_cmp_cb(void *bufa, void *bufb, size_t size, void *private)
+{
+ return (memcmp(bufa, bufb, size));
+}
+
+/*
+ * Compares the first size bytes of two ABDs.
+ */
+int
+abd_cmp(abd_t *dabd, abd_t *sabd, size_t size)
+{
+ return (abd_iterate_func2(dabd, sabd, 0, 0, size, abd_cmp_cb, NULL));
+}
diff --git a/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/aggsum.c b/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/aggsum.c
new file mode 100644
index 000000000000..713ff2b0116c
--- /dev/null
+++ b/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/aggsum.c
@@ -0,0 +1,234 @@
+/*
+ * CDDL HEADER START
+ *
+ * This file and its contents are supplied under the terms of the
+ * Common Development and Distribution License ("CDDL"), version 1.0.
+ * You may only use this file in accordance with the terms of version
+ * 1.0 of the CDDL.
+ *
+ * A full copy of the text of the CDDL should have accompanied this
+ * source. A copy of the CDDL is also available via the Internet at
+ * http://www.illumos.org/license/CDDL.
+ *
+ * CDDL HEADER END
+ */
+/*
+ * Copyright (c) 2017, 2018 by Delphix. All rights reserved.
+ */
+
+#include <sys/zfs_context.h>
+#include <sys/aggsum.h>
+
+/*
+ * Aggregate-sum counters are a form of fanned-out counter, used when atomic
+ * instructions on a single field cause enough CPU cache line contention to
+ * slow system performance. Due to their increased overhead and the expense
+ * involved with precisely reading from them, they should only be used in cases
+ * where the write rate (increment/decrement) is much higher than the read rate
+ * (get value).
+ *
+ * Aggregate sum counters are composed of two basic parts: the core and the
+ * buckets. The core counter contains a lock for the entire counter, as well
+ * as the current upper and lower bounds on the value of the counter. The
+ * aggsum_bucket structure contains a per-bucket lock to protect the contents of
+ * the bucket, the current amount that this bucket has changed from the global
+ * counter (called the delta), and the amount of increment and decrement we have
+ * "borrowed" from the core counter.
+ *
+ * The basic operation of an aggsum is simple. Threads that wish to modify the
+ * counter will modify one bucket's counter (determined by their current CPU, to
+ * help minimize lock and cache contention). If the bucket already has
+ * sufficient capacity borrowed from the core structure to handle their request,
+ * they simply modify the delta and return. If the bucket does not, we clear
+ * the bucket's current state (to prevent the borrowed amounts from getting too
+ * large), and borrow more from the core counter. Borrowing is done by adding to
+ * the upper bound (or subtracting from the lower bound) of the core counter,
+ * and setting the borrow value for the bucket to the amount added (or
+ * subtracted). Clearing the bucket is the opposite; we add the current delta
+ * to both the lower and upper bounds of the core counter, subtract the
+ * borrowed amount from the upper bound, and add it to the lower bound.
+ * Note that only borrowing and clearing require access to the
+ * core counter; since all other operations access CPU-local resources,
+ * performance can be much higher than a traditional counter.
+ *
+ * Threads that wish to read from the counter have a slightly more challenging
+ * task. It is fast to determine the upper and lower bounds of the aggsum; this
+ * does not require grabbing any locks. This suffices for cases where an
+ * approximation of the aggsum's value is acceptable. However, if one needs to
+ * know whether some specific value is above or below the current value in the
+ * aggsum, they invoke aggsum_compare(). This function operates by repeatedly
+ * comparing the target value to the upper and lower bounds of the aggsum, and
+ * then clearing a bucket. This proceeds until the target is outside of the
+ * upper and lower bounds and we return a response, or the last bucket has been
+ * cleared and we know that the target is equal to the aggsum's value. Finally,
+ * the most expensive operation is determining the precise value of the aggsum.
+ * To do this, we clear every bucket and then return the upper bound (which must
+ * be equal to the lower bound). What makes aggsum_compare() and aggsum_value()
+ * expensive is clearing buckets. This involves grabbing the global lock
+ * (serializing against themselves and borrow operations), grabbing a bucket's
+ * lock (preventing threads on those CPUs from modifying their delta), and
+ * zeroing out the borrowed value (forcing that thread to borrow on its next
+ * request, which will also be expensive). This is what makes aggsums well
+ * suited for write-many read-rarely operations.
+ */
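+
+/*
+ * Illustrative sketch (not part of the original source) of the life cycle
+ * described above:
+ *
+ *     aggsum_t as;
+ *
+ *     aggsum_init(&as, 0);
+ *     aggsum_add(&as, 123);          cheap: usually bucket-local
+ *     aggsum_add(&as, -23);
+ *     if (aggsum_compare(&as, 100) == 0)
+ *         ... the value is exactly 100 ...
+ *     (void) aggsum_value(&as);      expensive: flushes every bucket
+ *     aggsum_fini(&as);
+ */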
+
+/*
+ * We will borrow aggsum_borrow_multiplier times the current request, so we will
+ * have to get the as_lock approximately every aggsum_borrow_multiplier calls to
+ * aggsum_add().
+ */
+static uint_t aggsum_borrow_multiplier = 10;
+
+void
+aggsum_init(aggsum_t *as, uint64_t value)
+{
+ bzero(as, sizeof (*as));
+ as->as_lower_bound = as->as_upper_bound = value;
+ mutex_init(&as->as_lock, NULL, MUTEX_DEFAULT, NULL);
+ as->as_numbuckets = boot_ncpus;
+ as->as_buckets = kmem_zalloc(boot_ncpus * sizeof (aggsum_bucket_t),
+ KM_SLEEP);
+ for (int i = 0; i < as->as_numbuckets; i++) {
+ mutex_init(&as->as_buckets[i].asc_lock,
+ NULL, MUTEX_DEFAULT, NULL);
+ }
+}
+
+void
+aggsum_fini(aggsum_t *as)
+{
+ for (int i = 0; i < as->as_numbuckets; i++)
+ mutex_destroy(&as->as_buckets[i].asc_lock);
+ kmem_free(as->as_buckets, as->as_numbuckets * sizeof (aggsum_bucket_t));
+ mutex_destroy(&as->as_lock);
+}
+
+int64_t
+aggsum_lower_bound(aggsum_t *as)
+{
+ return (as->as_lower_bound);
+}
+
+int64_t
+aggsum_upper_bound(aggsum_t *as)
+{
+ return (as->as_upper_bound);
+}
+
+static void
+aggsum_flush_bucket(aggsum_t *as, struct aggsum_bucket *asb)
+{
+ ASSERT(MUTEX_HELD(&as->as_lock));
+ ASSERT(MUTEX_HELD(&asb->asc_lock));
+
+ /*
+ * We use atomic instructions for this because we read the upper and
+ * lower bounds without the lock, so we need stores to be atomic.
+ */
+ atomic_add_64((volatile uint64_t *)&as->as_lower_bound,
+ asb->asc_delta + asb->asc_borrowed);
+ atomic_add_64((volatile uint64_t *)&as->as_upper_bound,
+ asb->asc_delta - asb->asc_borrowed);
+ asb->asc_delta = 0;
+ asb->asc_borrowed = 0;
+}
+
+uint64_t
+aggsum_value(aggsum_t *as)
+{
+ int64_t rv;
+
+ mutex_enter(&as->as_lock);
+ if (as->as_lower_bound == as->as_upper_bound) {
+ rv = as->as_lower_bound;
+ for (int i = 0; i < as->as_numbuckets; i++) {
+ ASSERT0(as->as_buckets[i].asc_delta);
+ ASSERT0(as->as_buckets[i].asc_borrowed);
+ }
+ mutex_exit(&as->as_lock);
+ return (rv);
+ }
+ for (int i = 0; i < as->as_numbuckets; i++) {
+ struct aggsum_bucket *asb = &as->as_buckets[i];
+ mutex_enter(&asb->asc_lock);
+ aggsum_flush_bucket(as, asb);
+ mutex_exit(&asb->asc_lock);
+ }
+ VERIFY3U(as->as_lower_bound, ==, as->as_upper_bound);
+ rv = as->as_lower_bound;
+ mutex_exit(&as->as_lock);
+
+ return (rv);
+}
+
+void
+aggsum_add(aggsum_t *as, int64_t delta)
+{
+ struct aggsum_bucket *asb =
+ &as->as_buckets[CPU_SEQID % as->as_numbuckets];
+ int64_t borrow;
+
+ /* Try fast path if we already borrowed enough before. */
+ mutex_enter(&asb->asc_lock);
+ if (asb->asc_delta + delta <= (int64_t)asb->asc_borrowed &&
+ asb->asc_delta + delta >= -(int64_t)asb->asc_borrowed) {
+ asb->asc_delta += delta;
+ mutex_exit(&asb->asc_lock);
+ return;
+ }
+ mutex_exit(&asb->asc_lock);
+
+ /*
+ * We haven't borrowed enough. Take the global lock and borrow
+ * considering what is requested now and what we borrowed before.
+ */
+ borrow = (delta < 0 ? -delta : delta) * aggsum_borrow_multiplier;
+ mutex_enter(&as->as_lock);
+ mutex_enter(&asb->asc_lock);
+ delta += asb->asc_delta;
+ asb->asc_delta = 0;
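+ /*
+ * Raise the borrowed amount to the new target. If the target is
+ * smaller than what this bucket already borrowed, shrink only a
+ * quarter of the way toward it to damp oscillation.
+ */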
+ if (borrow >= asb->asc_borrowed)
+ borrow -= asb->asc_borrowed;
+ else
+ borrow = (borrow - (int64_t)asb->asc_borrowed) / 4;
+ asb->asc_borrowed += borrow;
+ atomic_add_64((volatile uint64_t *)&as->as_lower_bound,
+ delta - borrow);
+ atomic_add_64((volatile uint64_t *)&as->as_upper_bound,
+ delta + borrow);
+ mutex_exit(&asb->asc_lock);
+ mutex_exit(&as->as_lock);
+}
+
+/*
+ * Compare the aggsum value to target efficiently. Returns -1 if the value
+ * represented by the aggsum is less than target, 1 if it's greater, and 0 if
+ * they are equal.
+ */
+int
+aggsum_compare(aggsum_t *as, uint64_t target)
+{
+ if (as->as_upper_bound < target)
+ return (-1);
+ if (as->as_lower_bound > target)
+ return (1);
+ mutex_enter(&as->as_lock);
+ for (int i = 0; i < as->as_numbuckets; i++) {
+ struct aggsum_bucket *asb = &as->as_buckets[i];
+ mutex_enter(&asb->asc_lock);
+ aggsum_flush_bucket(as, asb);
+ mutex_exit(&asb->asc_lock);
+ if (as->as_upper_bound < target) {
+ mutex_exit(&as->as_lock);
+ return (-1);
+ }
+ if (as->as_lower_bound > target) {
+ mutex_exit(&as->as_lock);
+ return (1);
+ }
+ }
+ VERIFY3U(as->as_lower_bound, ==, as->as_upper_bound);
+ ASSERT3U(as->as_lower_bound, ==, target);
+ mutex_exit(&as->as_lock);
+ return (0);
+}
diff --git a/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/arc.c b/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/arc.c
new file mode 100644
index 000000000000..592fb02cfac1
--- /dev/null
+++ b/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/arc.c
@@ -0,0 +1,8569 @@
+/*
+ * CDDL HEADER START
+ *
+ * The contents of this file are subject to the terms of the
+ * Common Development and Distribution License (the "License").
+ * You may not use this file except in compliance with the License.
+ *
+ * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
+ * or http://www.opensolaris.org/os/licensing.
+ * See the License for the specific language governing permissions
+ * and limitations under the License.
+ *
+ * When distributing Covered Code, include this CDDL HEADER in each
+ * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
+ * If applicable, add the following below this CDDL HEADER, with the
+ * fields enclosed by brackets "[]" replaced with your own identifying
+ * information: Portions Copyright [yyyy] [name of copyright owner]
+ *
+ * CDDL HEADER END
+ */
+/*
+ * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved.
+ * Copyright (c) 2018, Joyent, Inc.
+ * Copyright (c) 2011, 2018 by Delphix. All rights reserved.
+ * Copyright (c) 2014 by Saso Kiselkov. All rights reserved.
+ * Copyright 2017 Nexenta Systems, Inc. All rights reserved.
+ */
+
+/*
+ * DVA-based Adjustable Replacement Cache
+ *
+ * While much of the theory of operation used here is
+ * based on the self-tuning, low overhead replacement cache
+ * presented by Megiddo and Modha at FAST 2003, there are some
+ * significant differences:
+ *
+ * 1. The Megiddo and Modha model assumes any page is evictable.
+ * Pages in its cache cannot be "locked" into memory. This makes
+ * the eviction algorithm simple: evict the last page in the list.
+ * This also makes the performance characteristics easy to reason
+ * about. Our cache is not so simple. At any given moment, some
+ * subset of the blocks in the cache are un-evictable because we
+ * have handed out a reference to them. Blocks are only evictable
+ * when there are no external references active. This makes
+ * eviction far more problematic: we choose to evict the evictable
+ * blocks that are the "lowest" in the list.
+ *
+ * There are times when it is not possible to evict the requested
+ * space. In these circumstances we are unable to adjust the cache
+ * size. To prevent the cache from growing unbounded at these times, we
+ * implement a "cache throttle" that slows the flow of new data
+ * into the cache until we can make space available.
+ *
+ * 2. The Megiddo and Modha model assumes a fixed cache size.
+ * Pages are evicted when the cache is full and there is a cache
+ * miss. Our model has a variable sized cache. It grows with
+ * high use, but also tries to react to memory pressure from the
+ * operating system: decreasing its size when system memory is
+ * tight.
+ *
+ * 3. The Megiddo and Modha model assumes a fixed page size. All
+ * elements of the cache are therefore exactly the same size. So
+ * when adjusting the cache size following a cache miss, it's simply
+ * a matter of choosing a single page to evict. In our model, we
+ * have variable sized cache blocks (ranging from 512 bytes to
+ * 128K bytes). We therefore choose a set of blocks to evict to make
+ * space for a cache miss that approximates as closely as possible
+ * the space used by the new block.
+ *
+ * See also: "ARC: A Self-Tuning, Low Overhead Replacement Cache"
+ * by N. Megiddo & D. Modha, FAST 2003
+ */
+
+/*
+ * The locking model:
+ *
+ * A new reference to a cache buffer can be obtained in two
+ * ways: 1) via a hash table lookup using the DVA as a key,
+ * or 2) via one of the ARC lists. The arc_read() interface
+ * uses method 1, while the internal ARC algorithms for
+ * adjusting the cache use method 2. We therefore provide two
+ * types of locks: 1) the hash table lock array, and 2) the
+ * ARC list locks.
+ *
+ * Buffers do not have their own mutexes, rather they rely on the
+ * hash table mutexes for the bulk of their protection (i.e. most
+ * fields in the arc_buf_hdr_t are protected by these mutexes).
+ *
+ * buf_hash_find() returns the appropriate mutex (held) when it
+ * locates the requested buffer in the hash table. It returns
+ * NULL for the mutex if the buffer was not in the table.
+ *
+ * buf_hash_remove() expects the appropriate hash mutex to be
+ * already held before it is invoked.
+ *
+ * Each ARC state also has a mutex which is used to protect the
+ * buffer list associated with the state. When attempting to
+ * obtain a hash table lock while holding an ARC list lock, you
+ * must use mutex_tryenter() to avoid deadlock. Also note that
+ * the active state mutex must be held before the ghost state mutex.
+ *
+ * It is also possible to register a callback which is run when the
+ * arc_meta_limit is reached and no buffers can be safely evicted. In
+ * this case the arc user should drop a reference on some arc buffers so
+ * they can be reclaimed and the arc_meta_limit honored. For example,
+ * when using the ZPL each dentry holds a reference on a znode. These
+ * dentries must be pruned before the arc buffer holding the znode can
+ * be safely evicted.
+ *
+ * Note that the majority of the performance stats are manipulated
+ * with atomic operations.
+ *
+ * The L2ARC uses the l2ad_mtx on each vdev for the following:
+ *
+ * - L2ARC buflist creation
+ * - L2ARC buflist eviction
+ * - L2ARC write completion, which walks L2ARC buflists
+ * - ARC header destruction, as it removes from L2ARC buflists
+ * - ARC header release, as it removes from L2ARC buflists
+ */
+
+/*
+ * ARC operation:
+ *
+ * Every block that is in the ARC is tracked by an arc_buf_hdr_t structure.
+ * This structure can point either to a block that is still in the cache or to
+ * one that is only accessible in an L2 ARC device, or it can provide
+ * information about a block that was recently evicted. If a block is
+ * only accessible in the L2ARC, then the arc_buf_hdr_t only has enough
+ * information to retrieve it from the L2ARC device. This information is
+ * stored in the l2arc_buf_hdr_t sub-structure of the arc_buf_hdr_t. The data
+ * of a block in this state cannot be accessed directly.
+ *
+ * Blocks that are actively being referenced or have not been evicted
+ * are cached in the L1ARC. The L1ARC (l1arc_buf_hdr_t) is a structure within
+ * the arc_buf_hdr_t that will point to the data block in memory. A block can
+ * only be read by a consumer if it has an l1arc_buf_hdr_t. The L1ARC
+ * caches data in two ways -- in a list of ARC buffers (arc_buf_t) and
+ * also in the arc_buf_hdr_t's private physical data block pointer (b_pabd).
+ *
+ * The L1ARC's data pointer may or may not be uncompressed. The ARC has the
+ * ability to store the physical data (b_pabd) associated with the DVA of the
+ * arc_buf_hdr_t. Since the b_pabd is a copy of the on-disk physical block,
+ * it will match its on-disk compression characteristics. This behavior can be
+ * disabled by setting 'zfs_compressed_arc_enabled' to B_FALSE. When the
+ * compressed ARC functionality is disabled, the b_pabd will point to an
+ * uncompressed version of the on-disk data.
+ *
+ * Data in the L1ARC is not accessed by consumers of the ARC directly. Each
+ * arc_buf_hdr_t can have multiple ARC buffers (arc_buf_t) which reference it.
+ * Each ARC buffer (arc_buf_t) is being actively accessed by a specific ARC
+ * consumer. The ARC will provide references to this data and will keep it
+ * cached until it is no longer in use. The ARC caches only the L1ARC's physical
+ * data block and will evict any arc_buf_t that is no longer referenced. The
+ * amount of memory consumed by the arc_buf_ts' data buffers can be seen via the
+ * "overhead_size" kstat.
+ *
+ * Depending on the consumer, an arc_buf_t can be requested in uncompressed or
+ * compressed form. The typical case is that consumers will want uncompressed
+ * data, and when that happens a new data buffer is allocated where the data is
+ * decompressed for them to use. Currently the only consumer who wants
+ * compressed arc_buf_t's is "zfs send", when it streams data exactly as it
+ * exists on disk. When this happens, the arc_buf_t's data buffer is shared
+ * with the arc_buf_hdr_t.
+ *
+ * Here is a diagram showing an arc_buf_hdr_t referenced by two arc_buf_t's. The
+ * first one is owned by a compressed send consumer (and therefore references
+ * the same compressed data buffer as the arc_buf_hdr_t) and the second could be
+ * used by any other consumer (and has its own uncompressed copy of the data
+ * buffer).
+ *
+ * arc_buf_hdr_t
+ * +-----------+
+ * | fields |
+ * | common to |
+ * | L1- and |
+ * | L2ARC |
+ * +-----------+
+ * | l2arc_buf_hdr_t
+ * | |
+ * +-----------+
+ * | l1arc_buf_hdr_t
+ * | | arc_buf_t
+ * | b_buf +------------>+-----------+ arc_buf_t
+ * | b_pabd +-+ |b_next +---->+-----------+
+ * +-----------+ | |-----------| |b_next +-->NULL
+ * | |b_comp = T | +-----------+
+ * | |b_data +-+ |b_comp = F |
+ * | +-----------+ | |b_data +-+
+ * +->+------+ | +-----------+ |
+ * compressed | | | |
+ * data | |<--------------+ | uncompressed
+ * +------+ compressed, | data
+ * shared +-->+------+
+ * data | |
+ * | |
+ * +------+
+ *
+ * When a consumer reads a block, the ARC must first look to see if the
+ * arc_buf_hdr_t is cached. If the hdr is cached then the ARC allocates a new
+ * arc_buf_t and either copies uncompressed data into a new data buffer from an
+ * existing uncompressed arc_buf_t, decompresses the hdr's b_pabd buffer into a
+ * new data buffer, or shares the hdr's b_pabd buffer, depending on whether the
+ * hdr is compressed and the desired compression characteristics of the
+ * arc_buf_t consumer. If the arc_buf_t ends up sharing data with the
+ * arc_buf_hdr_t and both of them are uncompressed then the arc_buf_t must be
+ * the last buffer in the hdr's b_buf list, however a shared compressed buf can
+ * be anywhere in the hdr's list.
+ *
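+ * Roughly, in pseudocode (an illustrative sketch, not the actual code):
+ *
+ *	if (an uncompressed arc_buf_t already hangs off b_buf)
+ *		copy its b_data into the new buffer
+ *	else if (consumer wants uncompressed data && b_pabd is compressed)
+ *		decompress b_pabd into a newly allocated b_data
+ *	else
+ *		share b_pabd with the new arc_buf_t directly
+ *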
+ * The diagram below shows an example of an uncompressed ARC hdr that is
+ * sharing its data with an arc_buf_t (note that the shared uncompressed buf is
+ * the last element in the buf list):
+ *
+ * arc_buf_hdr_t
+ * +-----------+
+ * | |
+ * | |
+ * | |
+ * +-----------+
+ * l2arc_buf_hdr_t| |
+ * | |
+ * +-----------+
+ * l1arc_buf_hdr_t| |
+ * | | arc_buf_t (shared)
+ * | b_buf +------------>+---------+ arc_buf_t
+ * | | |b_next +---->+---------+
+ * | b_pabd +-+ |---------| |b_next +-->NULL
+ * +-----------+ | | | +---------+
+ * | |b_data +-+ | |
+ * | +---------+ | |b_data +-+
+ * +->+------+ | +---------+ |
+ * | | | |
+ * uncompressed | | | |
+ * data +------+ | |
+ * ^ +->+------+ |
+ * | uncompressed | | |
+ * | data | | |
+ * | +------+ |
+ * +---------------------------------+
+ *
+ * Writing to the ARC requires that the ARC first discard the hdr's b_pabd
+ * since the physical block is about to be rewritten. The new data contents
+ * will be contained in the arc_buf_t. As the I/O pipeline performs the write,
+ * it may compress the data before writing it to disk. The ARC will be called
+ * with the transformed data and will bcopy the transformed on-disk block into
+ * a newly allocated b_pabd. Writes are always done into buffers which have
+ * either been loaned (and hence are new and don't have other readers) or
+ * buffers which have been released (and hence have their own hdr, if there
+ * were originally other readers of the buf's original hdr). This ensures that
+ * the ARC only needs to update a single buf and its hdr after a write occurs.
+ *
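+ * An illustrative ordering of those steps (a sketch, not the literal
+ * call sequence):
+ *
+ *	arc_release(buf, tag);	    buf now has its own anonymous hdr
+ *	... modify buf->b_data ...
+ *	arc_write(pio, spa, txg, bp, buf, ...);
+ *	    the pipeline may compress; the write-done path then bcopy()s
+ *	    the on-disk bytes into a newly allocated b_pabd
+ *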
+ * When the L2ARC is in use, it will also take advantage of the b_pabd. The
+ * L2ARC will always write the contents of b_pabd to the L2ARC. This means
+ * that when compressed ARC is enabled that the L2ARC blocks are identical
+ * to the on-disk block in the main data pool. This provides a significant
+ * advantage since the ARC can leverage the bp's checksum when reading from the
+ * L2ARC to determine if the contents are valid. However, if the compressed
+ * ARC is disabled, then the L2ARC's block must be transformed to look
+ * like the physical block in the main data pool before comparing the
+ * checksum and determining its validity.
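+ *
+ * Sketch of that validation difference (illustrative only):
+ *
+ *	if (zfs_compressed_arc_enabled)
+ *		the L2ARC bytes equal the on-disk block, so verify the
+ *		bp's checksum against them directly
+ *	else
+ *		first transform (e.g. recompress) the L2ARC bytes to match
+ *		the on-disk physical block, then verify the checksum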
+ */
+
+#include <sys/spa.h>
+#include <sys/zio.h>
+#include <sys/spa_impl.h>
+#include <sys/zio_compress.h>
+#include <sys/zio_checksum.h>
+#include <sys/zfs_context.h>
+#include <sys/arc.h>
+#include <sys/refcount.h>
+#include <sys/vdev.h>
+#include <sys/vdev_impl.h>
+#include <sys/dsl_pool.h>
+#include <sys/zio_checksum.h>
+#include <sys/multilist.h>
+#include <sys/abd.h>
+#ifdef _KERNEL
+#include <sys/dnlc.h>
+#include <sys/racct.h>
+#endif
+#include <sys/callb.h>
+#include <sys/kstat.h>
+#include <sys/trim_map.h>
+#include <sys/zthr.h>
+#include <zfs_fletcher.h>
+#include <sys/sdt.h>
+#include <sys/aggsum.h>
+#include <sys/cityhash.h>
+
+#include <machine/vmparam.h>
+
+#ifdef illumos
+#ifndef _KERNEL
+/* set with ZFS_DEBUG=watch, to enable watchpoints on frozen buffers */
+boolean_t arc_watch = B_FALSE;
+int arc_procfd;
+#endif
+#endif /* illumos */
+
+/*
+ * This thread's job is to keep enough free memory in the system, by
+ * calling arc_kmem_reap_now() plus arc_shrink(), which improves
+ * arc_available_memory().
+ */
+static zthr_t *arc_reap_zthr;
+
+/*
+ * This thread's job is to keep arc_size under arc_c, by calling
+ * arc_adjust(), which improves arc_is_overflowing().
+ */
+static zthr_t *arc_adjust_zthr;
+
+static kmutex_t arc_adjust_lock;
+static kcondvar_t arc_adjust_waiters_cv;
+static boolean_t arc_adjust_needed = B_FALSE;
+
+static kmutex_t arc_dnlc_evicts_lock;
+static kcondvar_t arc_dnlc_evicts_cv;
+static boolean_t arc_dnlc_evicts_thread_exit;
+
+uint_t arc_reduce_dnlc_percent = 3;
+
+/*
+ * The number of headers to evict in arc_evict_state_impl() before
+ * dropping the sublist lock and evicting from another sublist. A lower
+ * value means we're more likely to evict the "correct" header (i.e. the
+ * oldest header in the arc state), but comes with higher overhead
+ * (i.e. more invocations of arc_evict_state_impl()).
+ */
+int zfs_arc_evict_batch_limit = 10;
+
+/* number of seconds before growing cache again */
+int arc_grow_retry = 60;
+
+/*
+ * Minimum time between calls to arc_kmem_reap_now(). Note that this will
+ * be converted to ticks, so with the default hz=100, a setting of 15 ms
+ * will actually wait 2 ticks, or 20ms.
+ */
+int arc_kmem_cache_reap_retry_ms = 1000;
+
+/* shift of arc_c for calculating overflow limit in arc_get_data_impl */
+int zfs_arc_overflow_shift = 8;
+
+/* shift of arc_c for calculating both min and max arc_p */
+int arc_p_min_shift = 4;
+
+/* log2(fraction of arc to reclaim) */
+int arc_shrink_shift = 7;
+
+/*
+ * log2(fraction of ARC which must be free to allow growing).
+ * I.e. If there is less than arc_c >> arc_no_grow_shift free memory,
+ * when reading a new block into the ARC, we will evict an equal-sized block
+ * from the ARC.
+ *
+ * This must be less than arc_shrink_shift, so that when we shrink the ARC,
+ * we will still not allow it to grow.
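+ *
+ * For example, with arc_c = 4 GiB: growth is allowed only while at least
+ * 4 GiB >> 5 = 128 MiB of memory is free, whereas one shrink step with
+ * arc_shrink_shift = 7 reclaims only 4 GiB >> 7 = 32 MiB.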
+ */
+int arc_no_grow_shift = 5;
+
+/*
+ * minimum lifespan of a prefetch block in milliseconds
+ * (initialized in arc_init())
+ */
+static int zfs_arc_min_prefetch_ms = 1;
+static int zfs_arc_min_prescient_prefetch_ms = 6;
+
+/*
+ * If this percent of memory is free, don't throttle.
+ */
+int arc_lotsfree_percent = 10;
+
+static boolean_t arc_initialized;
+extern boolean_t zfs_prefetch_disable;
+
+/*
+ * The arc has filled available memory and has now warmed up.
+ */
+static boolean_t arc_warm;
+
+/*
+ * log2 fraction of the zio arena to keep free.
+ */
+int arc_zio_arena_free_shift = 2;
+
+/*
+ * These tunables are for performance analysis.
+ */
+uint64_t zfs_arc_max;
+uint64_t zfs_arc_min;
+uint64_t zfs_arc_meta_limit = 0;
+uint64_t zfs_arc_meta_min = 0;
+uint64_t zfs_arc_dnode_limit = 0;
+uint64_t zfs_arc_dnode_reduce_percent = 10;
+int zfs_arc_grow_retry = 0;
+int zfs_arc_shrink_shift = 0;
+int zfs_arc_no_grow_shift = 0;
+int zfs_arc_p_min_shift = 0;
+uint64_t zfs_arc_average_blocksize = 8 * 1024; /* 8KB */
+u_int zfs_arc_free_target = 0;
+
+/* Absolute min for arc min / max is 16MB. */
+static uint64_t arc_abs_min = 16 << 20;
+
+/*
+ * ARC dirty data constraints for arc_tempreserve_space() throttle
+ */
+uint_t zfs_arc_dirty_limit_percent = 50; /* total dirty data limit */
+uint_t zfs_arc_anon_limit_percent = 25; /* anon block dirty limit */
+uint_t zfs_arc_pool_dirty_percent = 20; /* each pool's anon allowance */
+
+boolean_t zfs_compressed_arc_enabled = B_TRUE;
+
+static int sysctl_vfs_zfs_arc_free_target(SYSCTL_HANDLER_ARGS);
+static int sysctl_vfs_zfs_arc_meta_limit(SYSCTL_HANDLER_ARGS);
+static int sysctl_vfs_zfs_arc_max(SYSCTL_HANDLER_ARGS);
+static int sysctl_vfs_zfs_arc_min(SYSCTL_HANDLER_ARGS);
+static int sysctl_vfs_zfs_arc_no_grow_shift(SYSCTL_HANDLER_ARGS);
+
+#if defined(__FreeBSD__) && defined(_KERNEL)
+static void
+arc_free_target_init(void *unused __unused)
+{
+
+ zfs_arc_free_target = vm_cnt.v_free_target;
+}
+SYSINIT(arc_free_target_init, SI_SUB_KTHREAD_PAGE, SI_ORDER_ANY,
+ arc_free_target_init, NULL);
+
+TUNABLE_QUAD("vfs.zfs.arc_meta_limit", &zfs_arc_meta_limit);
+TUNABLE_QUAD("vfs.zfs.arc_meta_min", &zfs_arc_meta_min);
+TUNABLE_INT("vfs.zfs.arc_shrink_shift", &zfs_arc_shrink_shift);
+TUNABLE_INT("vfs.zfs.arc_grow_retry", &zfs_arc_grow_retry);
+TUNABLE_INT("vfs.zfs.arc_no_grow_shift", &zfs_arc_no_grow_shift);
+SYSCTL_DECL(_vfs_zfs);
+SYSCTL_PROC(_vfs_zfs, OID_AUTO, arc_max,
+ CTLTYPE_U64 | CTLFLAG_MPSAFE | CTLFLAG_RWTUN,
+ 0, sizeof(uint64_t), sysctl_vfs_zfs_arc_max, "QU", "Maximum ARC size");
+SYSCTL_PROC(_vfs_zfs, OID_AUTO, arc_min,
+ CTLTYPE_U64 | CTLFLAG_MPSAFE | CTLFLAG_RWTUN,
+ 0, sizeof(uint64_t), sysctl_vfs_zfs_arc_min, "QU", "Minimum ARC size");
+SYSCTL_PROC(_vfs_zfs, OID_AUTO, arc_no_grow_shift,
+ CTLTYPE_U32 | CTLFLAG_MPSAFE | CTLFLAG_RWTUN,
+ 0, sizeof(uint32_t), sysctl_vfs_zfs_arc_no_grow_shift, "U",
+ "log2(fraction of ARC which must be free to allow growing)");
+SYSCTL_UQUAD(_vfs_zfs, OID_AUTO, arc_average_blocksize, CTLFLAG_RDTUN,
+ &zfs_arc_average_blocksize, 0,
+ "ARC average blocksize");
+SYSCTL_INT(_vfs_zfs, OID_AUTO, arc_shrink_shift, CTLFLAG_RW,
+ &arc_shrink_shift, 0,
+ "log2(fraction of arc to reclaim)");
+SYSCTL_INT(_vfs_zfs, OID_AUTO, arc_grow_retry, CTLFLAG_RW,
+ &arc_grow_retry, 0,
+ "Wait in seconds before considering growing ARC");
+SYSCTL_INT(_vfs_zfs, OID_AUTO, compressed_arc_enabled, CTLFLAG_RDTUN,
+ &zfs_compressed_arc_enabled, 0,
+ "Enable compressed ARC");
+SYSCTL_INT(_vfs_zfs, OID_AUTO, arc_kmem_cache_reap_retry_ms, CTLFLAG_RWTUN,
+ &arc_kmem_cache_reap_retry_ms, 0,
+ "Interval between ARC kmem_cache reapings");
+
+/*
+ * We don't have a tunable for arc_free_target due to the dependency on
+ * pagedaemon initialisation.
+ */
+SYSCTL_PROC(_vfs_zfs, OID_AUTO, arc_free_target,
+ CTLTYPE_UINT | CTLFLAG_MPSAFE | CTLFLAG_RW, 0, sizeof(u_int),
+ sysctl_vfs_zfs_arc_free_target, "IU",
+ "Desired number of free pages below which ARC triggers reclaim");
+
+static int
+sysctl_vfs_zfs_arc_free_target(SYSCTL_HANDLER_ARGS)
+{
+ u_int val;
+ int err;
+
+ val = zfs_arc_free_target;
+ err = sysctl_handle_int(oidp, &val, 0, req);
+ if (err != 0 || req->newptr == NULL)
+ return (err);
+
+ if (val < minfree)
+ return (EINVAL);
+ if (val > vm_cnt.v_page_count)
+ return (EINVAL);
+
+ zfs_arc_free_target = val;
+
+ return (0);
+}
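+
+/*
+ * For example (hypothetical value), an administrator can raise the
+ * reclaim trigger at runtime with "sysctl vfs.zfs.arc_free_target=65536";
+ * the handler above rejects values below the pagedaemon's minfree or
+ * above the physical page count.
+ */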
+
+/*
+ * Must be declared here, before the definition of the corresponding kstat
+ * macro below; that macro reuses the same name and would otherwise
+ * confuse the compiler.
+ */
+SYSCTL_PROC(_vfs_zfs, OID_AUTO, arc_meta_limit,
+ CTLTYPE_U64 | CTLFLAG_MPSAFE | CTLFLAG_RW, 0, sizeof(uint64_t),
+ sysctl_vfs_zfs_arc_meta_limit, "QU",
+ "ARC metadata limit");
+#endif
+
+/*
+ * Note that buffers can be in one of 6 states:
+ * ARC_anon - anonymous (discussed below)
+ * ARC_mru - recently used, currently cached
+ * ARC_mru_ghost - recently used, no longer in cache
+ * ARC_mfu - frequently used, currently cached
+ * ARC_mfu_ghost - frequently used, no longer in cache
+ * ARC_l2c_only - exists in L2ARC but not other states
+ * When there are no active references to a buffer, it is
+ * linked onto a list in one of these arc states. These are
+ * the only buffers that can be evicted or deleted. Within each
+ * state there are multiple lists, one for meta-data and one for
+ * non-meta-data. Meta-data (indirect blocks, blocks of dnodes,
+ * etc.) is tracked separately so that it can be managed more
+ * explicitly: favored over data, limited explicitly.
+ *
+ * Anonymous buffers are buffers that are not associated with
+ * a DVA. These are buffers that hold dirty block copies
+ * before they are written to stable storage. By definition,
+ * they are "ref'd" and are considered part of arc_mru
+ * that cannot be freed. Generally, they will acquire a DVA
+ * as they are written and migrate onto the arc_mru list.
+ *
+ * The ARC_l2c_only state is for buffers that are in the second
+ * level ARC but no longer in any of the ARC_m* lists. The second
+ * level ARC itself may also contain buffers that are in any of
+ * the ARC_m* states - meaning that a buffer can exist in two
+ * places. The reason for the ARC_l2c_only state is to keep the
+ * buffer header in the hash table, so that reads that hit the
+ * second level ARC benefit from these fast lookups.
+ */
+
+typedef struct arc_state {
+ /*
+ * list of evictable buffers
+ */
+ multilist_t *arcs_list[ARC_BUFC_NUMTYPES];
+ /*
+ * total amount of evictable data in this state
+ */
+ zfs_refcount_t arcs_esize[ARC_BUFC_NUMTYPES];
+ /*
+ * total amount of data in this state; this includes: evictable,
+ * non-evictable, ARC_BUFC_DATA, and ARC_BUFC_METADATA.
+ */
+ zfs_refcount_t arcs_size;
+ /*
+ * supports the "dbufs" kstat
+ */
+ arc_state_type_t arcs_state;
+} arc_state_t;
+
+int zfs_arc_meta_prune = 10000;
+/*
+ * Percentage of ARC meta buffers that can be consumed by dnodes.
+ */
+unsigned long zfs_arc_dnode_limit_percent = 10;
+int zfs_arc_meta_strategy = ARC_STRATEGY_META_ONLY;
+int zfs_arc_meta_adjust_restarts = 4096;
+
+SYSCTL_INT(_vfs_zfs, OID_AUTO, arc_meta_strategy, CTLFLAG_RWTUN,
+ &zfs_arc_meta_strategy, 0,
+ "ARC metadata reclamation strategy "
+ "(0 = metadata only, 1 = balance data and metadata)");
+
+/* The 6 states: */
+static arc_state_t ARC_anon;
+static arc_state_t ARC_mru;
+static arc_state_t ARC_mru_ghost;
+static arc_state_t ARC_mfu;
+static arc_state_t ARC_mfu_ghost;
+static arc_state_t ARC_l2c_only;
+
+typedef struct arc_stats {
+ kstat_named_t arcstat_hits;
+ kstat_named_t arcstat_misses;
+ kstat_named_t arcstat_demand_data_hits;
+ kstat_named_t arcstat_demand_data_misses;
+ kstat_named_t arcstat_demand_metadata_hits;
+ kstat_named_t arcstat_demand_metadata_misses;
+ kstat_named_t arcstat_prefetch_data_hits;
+ kstat_named_t arcstat_prefetch_data_misses;
+ kstat_named_t arcstat_prefetch_metadata_hits;
+ kstat_named_t arcstat_prefetch_metadata_misses;
+ kstat_named_t arcstat_mru_hits;
+ kstat_named_t arcstat_mru_ghost_hits;
+ kstat_named_t arcstat_mfu_hits;
+ kstat_named_t arcstat_mfu_ghost_hits;
+ kstat_named_t arcstat_allocated;
+ kstat_named_t arcstat_deleted;
+ /*
+ * Number of buffers that could not be evicted because the hash lock
+ * was held by another thread. The lock may not necessarily be held
+ * by something using the same buffer, since hash locks are shared
+ * by multiple buffers.
+ */
+ kstat_named_t arcstat_mutex_miss;
+ /*
+ * Number of buffers skipped when updating the access state due to the
+ * header having already been released after acquiring the hash lock.
+ */
+ kstat_named_t arcstat_access_skip;
+ /*
+ * Number of buffers skipped because they have I/O in progress, are
+ * indirect prefetch buffers that have not lived long enough, or are
+ * not from the spa we're trying to evict from.
+ */
+ kstat_named_t arcstat_evict_skip;
+ /*
+ * Number of times arc_evict_state() was unable to evict enough
+ * buffers to reach its target amount.
+ */
+ kstat_named_t arcstat_evict_not_enough;
+ kstat_named_t arcstat_evict_l2_cached;
+ kstat_named_t arcstat_evict_l2_eligible;
+ kstat_named_t arcstat_evict_l2_ineligible;
+ kstat_named_t arcstat_evict_l2_skip;
+ kstat_named_t arcstat_hash_elements;
+ kstat_named_t arcstat_hash_elements_max;
+ kstat_named_t arcstat_hash_collisions;
+ kstat_named_t arcstat_hash_chains;
+ kstat_named_t arcstat_hash_chain_max;
+ kstat_named_t arcstat_p;
+ kstat_named_t arcstat_c;
+ kstat_named_t arcstat_c_min;
+ kstat_named_t arcstat_c_max;
+ /* Not updated directly; only synced in arc_kstat_update. */
+ kstat_named_t arcstat_size;
+ /*
+ * Number of compressed bytes stored in the arc_buf_hdr_t's b_pabd.
+ * Note that the compressed bytes may match the uncompressed bytes
+ * if the block is either not compressed or compressed arc is disabled.
+ */
+ kstat_named_t arcstat_compressed_size;
+ /*
+ * Uncompressed size of the data stored in b_pabd. If compressed
+ * arc is disabled then this value will be identical to the stat
+ * above.
+ */
+ kstat_named_t arcstat_uncompressed_size;
+ /*
+ * Number of bytes stored in all the arc_buf_t's. This is classified
+ * as "overhead" since this data is typically short-lived and will
+ * be evicted from the arc when it becomes unreferenced unless the
+ * zfs_keep_uncompressed_metadata or zfs_keep_uncompressed_level
+ * values have been set (see comment in dbuf.c for more information).
+ */
+ kstat_named_t arcstat_overhead_size;
+ /*
+ * Number of bytes consumed by internal ARC structures necessary
+ * for tracking purposes; these structures are not actually
+ * backed by ARC buffers. This includes arc_buf_hdr_t structures
+ * (allocated via arc_buf_hdr_t_full and arc_buf_hdr_t_l2only
+ * caches), and arc_buf_t structures (allocated via arc_buf_t
+ * cache).
+ * Not updated directly; only synced in arc_kstat_update.
+ */
+ kstat_named_t arcstat_hdr_size;
+ /*
+ * Number of bytes consumed by ARC buffers of type equal to
+ * ARC_BUFC_DATA. This is generally consumed by buffers backing
+ * on disk user data (e.g. plain file contents).
+ * Not updated directly; only synced in arc_kstat_update.
+ */
+ kstat_named_t arcstat_data_size;
+ /*
+ * Number of bytes consumed by ARC buffers of type equal to
+ * ARC_BUFC_METADATA. This is generally consumed by buffers
+ * backing on disk data that is used for internal ZFS
+ * structures (e.g. ZAP, dnode, indirect blocks, etc).
+ * Not updated directly; only synced in arc_kstat_update.
+ */
+ kstat_named_t arcstat_metadata_size;
+ /*
+ * Number of bytes consumed by dmu_buf_impl_t objects.
+ */
+ kstat_named_t arcstat_dbuf_size;
+ /*
+ * Number of bytes consumed by dnode_t objects.
+ */
+ kstat_named_t arcstat_dnode_size;
+ /*
+ * Number of bytes consumed by bonus buffers.
+ */
+ kstat_named_t arcstat_bonus_size;
+#if defined(__FreeBSD__) && defined(COMPAT_FREEBSD11)
+ /*
+ * Sum of the previous three counters, provided for compatibility.
+ */
+ kstat_named_t arcstat_other_size;
+#endif
+ /*
+ * Total number of bytes consumed by ARC buffers residing in the
+ * arc_anon state. This includes *all* buffers in the arc_anon
+ * state; e.g. data, metadata, evictable, and unevictable buffers
+ * are all included in this value.
+ * Not updated directly; only synced in arc_kstat_update.
+ */
+ kstat_named_t arcstat_anon_size;
+ /*
+ * Number of bytes consumed by ARC buffers that meet the
+ * following criteria: backing buffers of type ARC_BUFC_DATA,
+ * residing in the arc_anon state, and are eligible for eviction
+ * (e.g. have no outstanding holds on the buffer).
+ * Not updated directly; only synced in arc_kstat_update.
+ */
+ kstat_named_t arcstat_anon_evictable_data;
+ /*
+ * Number of bytes consumed by ARC buffers that meet the
+ * following criteria: backing buffers of type ARC_BUFC_METADATA,
+ * residing in the arc_anon state, and are eligible for eviction
+ * (e.g. have no outstanding holds on the buffer).
+ * Not updated directly; only synced in arc_kstat_update.
+ */
+ kstat_named_t arcstat_anon_evictable_metadata;
+ /*
+ * Total number of bytes consumed by ARC buffers residing in the
+ * arc_mru state. This includes *all* buffers in the arc_mru
+ * state; e.g. data, metadata, evictable, and unevictable buffers
+ * are all included in this value.
+ * Not updated directly; only synced in arc_kstat_update.
+ */
+ kstat_named_t arcstat_mru_size;
+ /*
+ * Number of bytes consumed by ARC buffers that meet the
+ * following criteria: backing buffers of type ARC_BUFC_DATA,
+ * residing in the arc_mru state, and are eligible for eviction
+ * (e.g. have no outstanding holds on the buffer).
+ * Not updated directly; only synced in arc_kstat_update.
+ */
+ kstat_named_t arcstat_mru_evictable_data;
+ /*
+ * Number of bytes consumed by ARC buffers that meet the
+ * following criteria: backing buffers of type ARC_BUFC_METADATA,
+ * residing in the arc_mru state, and are eligible for eviction
+ * (e.g. have no outstanding holds on the buffer).
+ * Not updated directly; only synced in arc_kstat_update.
+ */
+ kstat_named_t arcstat_mru_evictable_metadata;
+ /*
+ * Total number of bytes that *would have been* consumed by ARC
+ * buffers in the arc_mru_ghost state. The key thing to note
+ * here, is the fact that this size doesn't actually indicate
+ * RAM consumption. The ghost lists only consist of headers and
+ * don't actually have ARC buffers linked off of these headers.
+ * Thus, *if* the headers had associated ARC buffers, these
+ * buffers *would have* consumed this number of bytes.
+ * Not updated directly; only synced in arc_kstat_update.
+ */
+ kstat_named_t arcstat_mru_ghost_size;
+ /*
+ * Number of bytes that *would have been* consumed by ARC
+ * buffers that are eligible for eviction, of type
+ * ARC_BUFC_DATA, and linked off the arc_mru_ghost state.
+ * Not updated directly; only synced in arc_kstat_update.
+ */
+ kstat_named_t arcstat_mru_ghost_evictable_data;
+ /*
+ * Number of bytes that *would have been* consumed by ARC
+ * buffers that are eligible for eviction, of type
+ * ARC_BUFC_METADATA, and linked off the arc_mru_ghost state.
+ * Not updated directly; only synced in arc_kstat_update.
+ */
+ kstat_named_t arcstat_mru_ghost_evictable_metadata;
+ /*
+ * Total number of bytes consumed by ARC buffers residing in the
+ * arc_mfu state. This includes *all* buffers in the arc_mfu
+ * state; e.g. data, metadata, evictable, and unevictable buffers
+ * are all included in this value.
+ * Not updated directly; only synced in arc_kstat_update.
+ */
+ kstat_named_t arcstat_mfu_size;
+ /*
+ * Number of bytes consumed by ARC buffers that are eligible for
+ * eviction, of type ARC_BUFC_DATA, and reside in the arc_mfu
+ * state.
+ * Not updated directly; only synced in arc_kstat_update.
+ */
+ kstat_named_t arcstat_mfu_evictable_data;
+ /*
+ * Number of bytes consumed by ARC buffers that are eligible for
+ * eviction, of type ARC_BUFC_METADATA, and reside in the
+ * arc_mfu state.
+ * Not updated directly; only synced in arc_kstat_update.
+ */
+ kstat_named_t arcstat_mfu_evictable_metadata;
+ /*
+ * Total number of bytes that *would have been* consumed by ARC
+ * buffers in the arc_mfu_ghost state. See the comment above
+ * arcstat_mru_ghost_size for more details.
+ * Not updated directly; only synced in arc_kstat_update.
+ */
+ kstat_named_t arcstat_mfu_ghost_size;
+ /*
+ * Number of bytes that *would have been* consumed by ARC
+ * buffers that are eligible for eviction, of type
+ * ARC_BUFC_DATA, and linked off the arc_mfu_ghost state.
+ * Not updated directly; only synced in arc_kstat_update.
+ */
+ kstat_named_t arcstat_mfu_ghost_evictable_data;
+ /*
+ * Number of bytes that *would have been* consumed by ARC
+ * buffers that are eligible for eviction, of type
+ * ARC_BUFC_METADATA, and linked off the arc_mfu_ghost state.
+ * Not updated directly; only synced in arc_kstat_update.
+ */
+ kstat_named_t arcstat_mfu_ghost_evictable_metadata;
+ kstat_named_t arcstat_l2_hits;
+ kstat_named_t arcstat_l2_misses;
+ kstat_named_t arcstat_l2_feeds;
+ kstat_named_t arcstat_l2_rw_clash;
+ kstat_named_t arcstat_l2_read_bytes;
+ kstat_named_t arcstat_l2_write_bytes;
+ kstat_named_t arcstat_l2_writes_sent;
+ kstat_named_t arcstat_l2_writes_done;
+ kstat_named_t arcstat_l2_writes_error;
+ kstat_named_t arcstat_l2_writes_lock_retry;
+ kstat_named_t arcstat_l2_evict_lock_retry;
+ kstat_named_t arcstat_l2_evict_reading;
+ kstat_named_t arcstat_l2_evict_l1cached;
+ kstat_named_t arcstat_l2_free_on_write;
+ kstat_named_t arcstat_l2_abort_lowmem;
+ kstat_named_t arcstat_l2_cksum_bad;
+ kstat_named_t arcstat_l2_io_error;
+ kstat_named_t arcstat_l2_lsize;
+ kstat_named_t arcstat_l2_psize;
+ /* Not updated directly; only synced in arc_kstat_update. */
+ kstat_named_t arcstat_l2_hdr_size;
+ kstat_named_t arcstat_l2_write_trylock_fail;
+ kstat_named_t arcstat_l2_write_passed_headroom;
+ kstat_named_t arcstat_l2_write_spa_mismatch;
+ kstat_named_t arcstat_l2_write_in_l2;
+ kstat_named_t arcstat_l2_write_hdr_io_in_progress;
+ kstat_named_t arcstat_l2_write_not_cacheable;
+ kstat_named_t arcstat_l2_write_full;
+ kstat_named_t arcstat_l2_write_buffer_iter;
+ kstat_named_t arcstat_l2_write_pios;
+ kstat_named_t arcstat_l2_write_buffer_bytes_scanned;
+ kstat_named_t arcstat_l2_write_buffer_list_iter;
+ kstat_named_t arcstat_l2_write_buffer_list_null_iter;
+ kstat_named_t arcstat_memory_throttle_count;
+ kstat_named_t arcstat_memory_direct_count;
+ kstat_named_t arcstat_memory_indirect_count;
+ kstat_named_t arcstat_memory_all_bytes;
+ kstat_named_t arcstat_memory_free_bytes;
+ kstat_named_t arcstat_memory_available_bytes;
+ kstat_named_t arcstat_no_grow;
+ kstat_named_t arcstat_tempreserve;
+ kstat_named_t arcstat_loaned_bytes;
+ kstat_named_t arcstat_prune;
+ /* Not updated directly; only synced in arc_kstat_update. */
+ kstat_named_t arcstat_meta_used;
+ kstat_named_t arcstat_meta_limit;
+ kstat_named_t arcstat_dnode_limit;
+ kstat_named_t arcstat_meta_max;
+ kstat_named_t arcstat_meta_min;
+ kstat_named_t arcstat_async_upgrade_sync;
+ kstat_named_t arcstat_demand_hit_predictive_prefetch;
+ kstat_named_t arcstat_demand_hit_prescient_prefetch;
+} arc_stats_t;
+
+static arc_stats_t arc_stats = {
+ { "hits", KSTAT_DATA_UINT64 },
+ { "misses", KSTAT_DATA_UINT64 },
+ { "demand_data_hits", KSTAT_DATA_UINT64 },
+ { "demand_data_misses", KSTAT_DATA_UINT64 },
+ { "demand_metadata_hits", KSTAT_DATA_UINT64 },
+ { "demand_metadata_misses", KSTAT_DATA_UINT64 },
+ { "prefetch_data_hits", KSTAT_DATA_UINT64 },
+ { "prefetch_data_misses", KSTAT_DATA_UINT64 },
+ { "prefetch_metadata_hits", KSTAT_DATA_UINT64 },
+ { "prefetch_metadata_misses", KSTAT_DATA_UINT64 },
+ { "mru_hits", KSTAT_DATA_UINT64 },
+ { "mru_ghost_hits", KSTAT_DATA_UINT64 },
+ { "mfu_hits", KSTAT_DATA_UINT64 },
+ { "mfu_ghost_hits", KSTAT_DATA_UINT64 },
+ { "allocated", KSTAT_DATA_UINT64 },
+ { "deleted", KSTAT_DATA_UINT64 },
+ { "mutex_miss", KSTAT_DATA_UINT64 },
+ { "access_skip", KSTAT_DATA_UINT64 },
+ { "evict_skip", KSTAT_DATA_UINT64 },
+ { "evict_not_enough", KSTAT_DATA_UINT64 },
+ { "evict_l2_cached", KSTAT_DATA_UINT64 },
+ { "evict_l2_eligible", KSTAT_DATA_UINT64 },
+ { "evict_l2_ineligible", KSTAT_DATA_UINT64 },
+ { "evict_l2_skip", KSTAT_DATA_UINT64 },
+ { "hash_elements", KSTAT_DATA_UINT64 },
+ { "hash_elements_max", KSTAT_DATA_UINT64 },
+ { "hash_collisions", KSTAT_DATA_UINT64 },
+ { "hash_chains", KSTAT_DATA_UINT64 },
+ { "hash_chain_max", KSTAT_DATA_UINT64 },
+ { "p", KSTAT_DATA_UINT64 },
+ { "c", KSTAT_DATA_UINT64 },
+ { "c_min", KSTAT_DATA_UINT64 },
+ { "c_max", KSTAT_DATA_UINT64 },
+ { "size", KSTAT_DATA_UINT64 },
+ { "compressed_size", KSTAT_DATA_UINT64 },
+ { "uncompressed_size", KSTAT_DATA_UINT64 },
+ { "overhead_size", KSTAT_DATA_UINT64 },
+ { "hdr_size", KSTAT_DATA_UINT64 },
+ { "data_size", KSTAT_DATA_UINT64 },
+ { "metadata_size", KSTAT_DATA_UINT64 },
+ { "dbuf_size", KSTAT_DATA_UINT64 },
+ { "dnode_size", KSTAT_DATA_UINT64 },
+ { "bonus_size", KSTAT_DATA_UINT64 },
+#if defined(__FreeBSD__) && defined(COMPAT_FREEBSD11)
+ { "other_size", KSTAT_DATA_UINT64 },
+#endif
+ { "anon_size", KSTAT_DATA_UINT64 },
+ { "anon_evictable_data", KSTAT_DATA_UINT64 },
+ { "anon_evictable_metadata", KSTAT_DATA_UINT64 },
+ { "mru_size", KSTAT_DATA_UINT64 },
+ { "mru_evictable_data", KSTAT_DATA_UINT64 },
+ { "mru_evictable_metadata", KSTAT_DATA_UINT64 },
+ { "mru_ghost_size", KSTAT_DATA_UINT64 },
+ { "mru_ghost_evictable_data", KSTAT_DATA_UINT64 },
+ { "mru_ghost_evictable_metadata", KSTAT_DATA_UINT64 },
+ { "mfu_size", KSTAT_DATA_UINT64 },
+ { "mfu_evictable_data", KSTAT_DATA_UINT64 },
+ { "mfu_evictable_metadata", KSTAT_DATA_UINT64 },
+ { "mfu_ghost_size", KSTAT_DATA_UINT64 },
+ { "mfu_ghost_evictable_data", KSTAT_DATA_UINT64 },
+ { "mfu_ghost_evictable_metadata", KSTAT_DATA_UINT64 },
+ { "l2_hits", KSTAT_DATA_UINT64 },
+ { "l2_misses", KSTAT_DATA_UINT64 },
+ { "l2_feeds", KSTAT_DATA_UINT64 },
+ { "l2_rw_clash", KSTAT_DATA_UINT64 },
+ { "l2_read_bytes", KSTAT_DATA_UINT64 },
+ { "l2_write_bytes", KSTAT_DATA_UINT64 },
+ { "l2_writes_sent", KSTAT_DATA_UINT64 },
+ { "l2_writes_done", KSTAT_DATA_UINT64 },
+ { "l2_writes_error", KSTAT_DATA_UINT64 },
+ { "l2_writes_lock_retry", KSTAT_DATA_UINT64 },
+ { "l2_evict_lock_retry", KSTAT_DATA_UINT64 },
+ { "l2_evict_reading", KSTAT_DATA_UINT64 },
+ { "l2_evict_l1cached", KSTAT_DATA_UINT64 },
+ { "l2_free_on_write", KSTAT_DATA_UINT64 },
+ { "l2_abort_lowmem", KSTAT_DATA_UINT64 },
+ { "l2_cksum_bad", KSTAT_DATA_UINT64 },
+ { "l2_io_error", KSTAT_DATA_UINT64 },
+ { "l2_size", KSTAT_DATA_UINT64 },
+ { "l2_asize", KSTAT_DATA_UINT64 },
+ { "l2_hdr_size", KSTAT_DATA_UINT64 },
+ { "l2_write_trylock_fail", KSTAT_DATA_UINT64 },
+ { "l2_write_passed_headroom", KSTAT_DATA_UINT64 },
+ { "l2_write_spa_mismatch", KSTAT_DATA_UINT64 },
+ { "l2_write_in_l2", KSTAT_DATA_UINT64 },
+ { "l2_write_io_in_progress", KSTAT_DATA_UINT64 },
+ { "l2_write_not_cacheable", KSTAT_DATA_UINT64 },
+ { "l2_write_full", KSTAT_DATA_UINT64 },
+ { "l2_write_buffer_iter", KSTAT_DATA_UINT64 },
+ { "l2_write_pios", KSTAT_DATA_UINT64 },
+ { "l2_write_buffer_bytes_scanned", KSTAT_DATA_UINT64 },
+ { "l2_write_buffer_list_iter", KSTAT_DATA_UINT64 },
+ { "l2_write_buffer_list_null_iter", KSTAT_DATA_UINT64 },
+ { "memory_throttle_count", KSTAT_DATA_UINT64 },
+ { "memory_direct_count", KSTAT_DATA_UINT64 },
+ { "memory_indirect_count", KSTAT_DATA_UINT64 },
+ { "memory_all_bytes", KSTAT_DATA_UINT64 },
+ { "memory_free_bytes", KSTAT_DATA_UINT64 },
+ { "memory_available_bytes", KSTAT_DATA_UINT64 },
+ { "arc_no_grow", KSTAT_DATA_UINT64 },
+ { "arc_tempreserve", KSTAT_DATA_UINT64 },
+ { "arc_loaned_bytes", KSTAT_DATA_UINT64 },
+ { "arc_prune", KSTAT_DATA_UINT64 },
+ { "arc_meta_used", KSTAT_DATA_UINT64 },
+ { "arc_meta_limit", KSTAT_DATA_UINT64 },
+ { "arc_dnode_limit", KSTAT_DATA_UINT64 },
+ { "arc_meta_max", KSTAT_DATA_UINT64 },
+ { "arc_meta_min", KSTAT_DATA_UINT64 },
+ { "async_upgrade_sync", KSTAT_DATA_UINT64 },
+ { "demand_hit_predictive_prefetch", KSTAT_DATA_UINT64 },
+ { "demand_hit_prescient_prefetch", KSTAT_DATA_UINT64 },
+};
+
+#define ARCSTAT(stat) (arc_stats.stat.value.ui64)
+
+#define ARCSTAT_INCR(stat, val) \
+ atomic_add_64(&arc_stats.stat.value.ui64, (val))
+
+#define ARCSTAT_BUMP(stat) ARCSTAT_INCR(stat, 1)
+#define ARCSTAT_BUMPDOWN(stat) ARCSTAT_INCR(stat, -1)
+
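+/*
+ * Lock-free maximum: re-read the current value and retry the
+ * compare-and-swap until either the stored maximum is >= val or our
+ * CAS succeeds, e.g. ARCSTAT_MAX(arcstat_hash_chain_max, i) in
+ * buf_hash_insert() below.
+ */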
+#define ARCSTAT_MAX(stat, val) { \
+ uint64_t m; \
+ while ((val) > (m = arc_stats.stat.value.ui64) && \
+ (m != atomic_cas_64(&arc_stats.stat.value.ui64, m, (val)))) \
+ continue; \
+}
+
+#define ARCSTAT_MAXSTAT(stat) \
+ ARCSTAT_MAX(stat##_max, arc_stats.stat.value.ui64)
+
+/*
+ * We define a macro to allow ARC hits/misses to be easily broken down by
+ * two separate conditions, giving a total of four different subtypes for
+ * each of hits and misses (so eight statistics total).
+ */
+#define ARCSTAT_CONDSTAT(cond1, stat1, notstat1, cond2, stat2, notstat2, stat) \
+ if (cond1) { \
+ if (cond2) { \
+ ARCSTAT_BUMP(arcstat_##stat1##_##stat2##_##stat); \
+ } else { \
+ ARCSTAT_BUMP(arcstat_##stat1##_##notstat2##_##stat); \
+ } \
+ } else { \
+ if (cond2) { \
+ ARCSTAT_BUMP(arcstat_##notstat1##_##stat2##_##stat); \
+ } else { \
+ ARCSTAT_BUMP(arcstat_##notstat1##_##notstat2##_##stat);\
+ } \
+ }
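+
+/*
+ * For instance, a call of the form
+ *
+ *	ARCSTAT_CONDSTAT(!HDR_PREFETCH(hdr), demand, prefetch,
+ *	    !HDR_ISTYPE_METADATA(hdr), data, metadata, hits);
+ *
+ * bumps exactly one of arcstat_{demand,prefetch}_{data,metadata}_hits.
+ */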
+
+kstat_t *arc_ksp;
+static arc_state_t *arc_anon;
+static arc_state_t *arc_mru;
+static arc_state_t *arc_mru_ghost;
+static arc_state_t *arc_mfu;
+static arc_state_t *arc_mfu_ghost;
+static arc_state_t *arc_l2c_only;
+
+/*
+ * There are several ARC variables that are critical to export as kstats --
+ * but we don't want to have to grovel around in the kstat whenever we wish to
+ * manipulate them. For these variables, we therefore define them to be in
+ * terms of the statistic variable. This assures that we are not introducing
+ * the possibility of inconsistency by having shadow copies of the variables,
+ * while still allowing the code to be readable.
+ */
+#define arc_p ARCSTAT(arcstat_p) /* target size of MRU */
+#define arc_c ARCSTAT(arcstat_c) /* target size of cache */
+#define arc_c_min ARCSTAT(arcstat_c_min) /* min target cache size */
+#define arc_c_max ARCSTAT(arcstat_c_max) /* max target cache size */
+#define arc_meta_limit ARCSTAT(arcstat_meta_limit) /* max size for metadata */
+#define arc_dnode_limit ARCSTAT(arcstat_dnode_limit) /* max size for dnodes */
+#define arc_meta_min ARCSTAT(arcstat_meta_min) /* min size for metadata */
+#define arc_meta_max ARCSTAT(arcstat_meta_max) /* max size of metadata */
+#define arc_dbuf_size ARCSTAT(arcstat_dbuf_size) /* dbuf metadata */
+#define arc_dnode_size ARCSTAT(arcstat_dnode_size) /* dnode metadata */
+#define arc_bonus_size ARCSTAT(arcstat_bonus_size) /* bonus buffer metadata */
+
+/* compressed size of entire arc */
+#define arc_compressed_size ARCSTAT(arcstat_compressed_size)
+/* uncompressed size of entire arc */
+#define arc_uncompressed_size ARCSTAT(arcstat_uncompressed_size)
+/* number of bytes in the arc from arc_buf_t's */
+#define arc_overhead_size ARCSTAT(arcstat_overhead_size)
+
+/*
+ * There are also some ARC variables that we want to export, but that are
+ * updated so often that having the canonical representation be the statistic
+ * variable causes a performance bottleneck. We want to use aggsum_t's for these
+ * instead, but still be able to export the kstat in the same way as before.
+ * The solution is to always use the aggsum version, except in the kstat update
+ * callback.
+ */
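+/*
+ * A minimal sketch of that pattern, assuming the aggsum API from
+ * aggsum.c: hot paths call aggsum_add(&arc_size, delta), which is
+ * cheap, while the kstat update callback calls aggsum_value(&arc_size)
+ * to compute the exact total for arcstat_size.
+ */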
+aggsum_t arc_size;
+aggsum_t arc_meta_used;
+aggsum_t astat_data_size;
+aggsum_t astat_metadata_size;
+aggsum_t astat_hdr_size;
+aggsum_t astat_bonus_size;
+aggsum_t astat_dnode_size;
+aggsum_t astat_dbuf_size;
+aggsum_t astat_l2_hdr_size;
+
+static list_t arc_prune_list;
+static kmutex_t arc_prune_mtx;
+static taskq_t *arc_prune_taskq;
+
+static int arc_no_grow; /* Don't try to grow cache size */
+static hrtime_t arc_growtime;
+static uint64_t arc_tempreserve;
+static uint64_t arc_loaned_bytes;
+
+typedef struct arc_callback arc_callback_t;
+
+struct arc_callback {
+ void *acb_private;
+ arc_read_done_func_t *acb_done;
+ arc_buf_t *acb_buf;
+ boolean_t acb_compressed;
+ zio_t *acb_zio_dummy;
+ zio_t *acb_zio_head;
+ arc_callback_t *acb_next;
+};
+
+typedef struct arc_write_callback arc_write_callback_t;
+
+struct arc_write_callback {
+ void *awcb_private;
+ arc_write_done_func_t *awcb_ready;
+ arc_write_done_func_t *awcb_children_ready;
+ arc_write_done_func_t *awcb_physdone;
+ arc_write_done_func_t *awcb_done;
+ arc_buf_t *awcb_buf;
+};
+
+/*
+ * ARC buffers are separated into multiple structs as a memory saving measure:
+ * - Common fields struct, always defined, and embedded within it:
+ * - L2-only fields, always allocated but undefined when not in L2ARC
+ * - L1-only fields, only allocated when in L1ARC
+ *
+ * Buffer in L1 Buffer only in L2
+ * +------------------------+ +------------------------+
+ * | arc_buf_hdr_t | | arc_buf_hdr_t |
+ * | | | |
+ * | | | |
+ * | | | |
+ * +------------------------+ +------------------------+
+ * | l2arc_buf_hdr_t | | l2arc_buf_hdr_t |
+ * | (undefined if L1-only) | | |
+ * +------------------------+ +------------------------+
+ * | l1arc_buf_hdr_t |
+ * | |
+ * | |
+ * | |
+ * | |
+ * +------------------------+
+ *
+ * Because it's possible for the L2ARC to become extremely large, we can wind
+ * up eating a lot of memory in L2ARC buffer headers, so the size of a header
+ * is minimized by only allocating the fields necessary for an L1-cached buffer
+ * when a header is actually in the L1 cache. The sub-headers (l1arc_buf_hdr and
+ * l2arc_buf_hdr) are embedded rather than allocated separately to save a couple
+ * words in pointers. arc_hdr_realloc() is used to switch a header between
+ * these two allocation states.
+ */
+typedef struct l1arc_buf_hdr {
+ kmutex_t b_freeze_lock;
+ zio_cksum_t *b_freeze_cksum;
+#ifdef ZFS_DEBUG
+ /*
+ * Used for debugging with kmem_flags - by allocating and freeing
+ * b_thawed when the buffer is thawed, we get a record of the stack
+ * trace that thawed it.
+ */
+ void *b_thawed;
+#endif
+
+ arc_buf_t *b_buf;
+ uint32_t b_bufcnt;
+ /* for waiting on writes to complete */
+ kcondvar_t b_cv;
+ uint8_t b_byteswap;
+
+ /* protected by arc state mutex */
+ arc_state_t *b_state;
+ multilist_node_t b_arc_node;
+
+ /* updated atomically */
+ clock_t b_arc_access;
+ uint32_t b_mru_hits;
+ uint32_t b_mru_ghost_hits;
+ uint32_t b_mfu_hits;
+ uint32_t b_mfu_ghost_hits;
+ uint32_t b_l2_hits;
+
+ /* self protecting */
+ zfs_refcount_t b_refcnt;
+
+ arc_callback_t *b_acb;
+ abd_t *b_pabd;
+} l1arc_buf_hdr_t;
+
+typedef struct l2arc_dev l2arc_dev_t;
+
+typedef struct l2arc_buf_hdr {
+ /* protected by arc_buf_hdr mutex */
+ l2arc_dev_t *b_dev; /* L2ARC device */
+ uint64_t b_daddr; /* disk address, offset byte */
+ uint32_t b_hits;
+
+ list_node_t b_l2node;
+} l2arc_buf_hdr_t;
+
+struct arc_buf_hdr {
+ /* protected by hash lock */
+ dva_t b_dva;
+ uint64_t b_birth;
+
+ arc_buf_contents_t b_type;
+ arc_buf_hdr_t *b_hash_next;
+ arc_flags_t b_flags;
+
+ /*
+ * This field stores the size of the data buffer after
+ * compression, and is set in the arc's zio completion handlers.
+ * It is in units of SPA_MINBLOCKSIZE (e.g. 1 == 512 bytes).
+ *
+ * While the block pointers can store up to 32MB in their psize
+ * field, we can only store up to 32MB minus 512B. This is due
+ * to the bp using a bias of 1, whereas we use a bias of 0 (i.e.
+ * a field of zeros represents 512B in the bp). We can't use a
+ * bias of 1 since we need to reserve a psize of zero here to
+ * represent holes and embedded blocks.
+ *
+ * This isn't a problem in practice, since the maximum size of a
+ * buffer is limited to 16MB (both here and in the upstream illumos
+ * code base), so we never need to store 32MB in this field.
+ */
+ uint16_t b_psize;
+
+ /*
+ * This field stores the size of the data buffer before
+ * compression, and cannot change once set. It is in units
+ * of SPA_MINBLOCKSIZE (e.g. 2 == 1024 bytes)
+ */
+ uint16_t b_lsize; /* immutable */
+ uint64_t b_spa; /* immutable */
+
+ /* L2ARC fields. Undefined when not in L2ARC. */
+ l2arc_buf_hdr_t b_l2hdr;
+ /* L1ARC fields. Undefined when in l2arc_only state */
+ l1arc_buf_hdr_t b_l1hdr;
+};
+
+#if defined(__FreeBSD__) && defined(_KERNEL)
+static int
+sysctl_vfs_zfs_arc_meta_limit(SYSCTL_HANDLER_ARGS)
+{
+ uint64_t val;
+ int err;
+
+ val = arc_meta_limit;
+ err = sysctl_handle_64(oidp, &val, 0, req);
+ if (err != 0 || req->newptr == NULL)
+ return (err);
+
+ if (val <= 0 || val > arc_c_max)
+ return (EINVAL);
+
+ arc_meta_limit = val;
+
+ mutex_enter(&arc_adjust_lock);
+ arc_adjust_needed = B_TRUE;
+ mutex_exit(&arc_adjust_lock);
+ zthr_wakeup(arc_adjust_zthr);
+
+ return (0);
+}
+
+static int
+sysctl_vfs_zfs_arc_no_grow_shift(SYSCTL_HANDLER_ARGS)
+{
+ uint32_t val;
+ int err;
+
+ val = arc_no_grow_shift;
+ err = sysctl_handle_32(oidp, &val, 0, req);
+ if (err != 0 || req->newptr == NULL)
+ return (err);
+
+ if (val >= arc_shrink_shift)
+ return (EINVAL);
+
+ arc_no_grow_shift = val;
+ return (0);
+}
+
+static int
+sysctl_vfs_zfs_arc_max(SYSCTL_HANDLER_ARGS)
+{
+ uint64_t val;
+ int err;
+
+ val = zfs_arc_max;
+ err = sysctl_handle_64(oidp, &val, 0, req);
+ if (err != 0 || req->newptr == NULL)
+ return (err);
+
+ if (zfs_arc_max == 0) {
+ /* Loader tunable so blindly set */
+ zfs_arc_max = val;
+ return (0);
+ }
+
+ if (val < arc_abs_min || val > kmem_size())
+ return (EINVAL);
+ if (val < arc_c_min)
+ return (EINVAL);
+ if (zfs_arc_meta_limit > 0 && val < zfs_arc_meta_limit)
+ return (EINVAL);
+
+ arc_c_max = val;
+
+ arc_c = arc_c_max;
+ arc_p = (arc_c >> 1);
+
+ if (zfs_arc_meta_limit == 0) {
+ /* limit meta-data to 1/4 of the arc capacity */
+ arc_meta_limit = arc_c_max / 4;
+ }
+
+ /* if kmem_flags are set, lets try to use less memory */
+ if (kmem_debugging())
+ arc_c = arc_c / 2;
+
+ zfs_arc_max = arc_c;
+
+ mutex_enter(&arc_adjust_lock);
+ arc_adjust_needed = B_TRUE;
+ mutex_exit(&arc_adjust_lock);
+ zthr_wakeup(arc_adjust_zthr);
+
+ return (0);
+}
+
+static int
+sysctl_vfs_zfs_arc_min(SYSCTL_HANDLER_ARGS)
+{
+ uint64_t val;
+ int err;
+
+ val = zfs_arc_min;
+ err = sysctl_handle_64(oidp, &val, 0, req);
+ if (err != 0 || req->newptr == NULL)
+ return (err);
+
+ if (zfs_arc_min == 0) {
+ /* Loader tunable so blindly set */
+ zfs_arc_min = val;
+ return (0);
+ }
+
+ if (val < arc_abs_min || val > arc_c_max)
+ return (EINVAL);
+
+ arc_c_min = val;
+
+ if (zfs_arc_meta_min == 0)
+ arc_meta_min = arc_c_min / 2;
+
+ if (arc_c < arc_c_min)
+ arc_c = arc_c_min;
+
+ zfs_arc_min = arc_c_min;
+
+ return (0);
+}
+#endif
+
+#define GHOST_STATE(state) \
+ ((state) == arc_mru_ghost || (state) == arc_mfu_ghost || \
+ (state) == arc_l2c_only)
+
+#define HDR_IN_HASH_TABLE(hdr) ((hdr)->b_flags & ARC_FLAG_IN_HASH_TABLE)
+#define HDR_IO_IN_PROGRESS(hdr) ((hdr)->b_flags & ARC_FLAG_IO_IN_PROGRESS)
+#define HDR_IO_ERROR(hdr) ((hdr)->b_flags & ARC_FLAG_IO_ERROR)
+#define HDR_PREFETCH(hdr) ((hdr)->b_flags & ARC_FLAG_PREFETCH)
+#define HDR_PRESCIENT_PREFETCH(hdr) \
+ ((hdr)->b_flags & ARC_FLAG_PRESCIENT_PREFETCH)
+#define HDR_COMPRESSION_ENABLED(hdr) \
+ ((hdr)->b_flags & ARC_FLAG_COMPRESSED_ARC)
+
+#define HDR_L2CACHE(hdr) ((hdr)->b_flags & ARC_FLAG_L2CACHE)
+#define HDR_L2_READING(hdr) \
+ (((hdr)->b_flags & ARC_FLAG_IO_IN_PROGRESS) && \
+ ((hdr)->b_flags & ARC_FLAG_HAS_L2HDR))
+#define HDR_L2_WRITING(hdr) ((hdr)->b_flags & ARC_FLAG_L2_WRITING)
+#define HDR_L2_EVICTED(hdr) ((hdr)->b_flags & ARC_FLAG_L2_EVICTED)
+#define HDR_L2_WRITE_HEAD(hdr) ((hdr)->b_flags & ARC_FLAG_L2_WRITE_HEAD)
+#define HDR_SHARED_DATA(hdr) ((hdr)->b_flags & ARC_FLAG_SHARED_DATA)
+
+#define HDR_ISTYPE_METADATA(hdr) \
+ ((hdr)->b_flags & ARC_FLAG_BUFC_METADATA)
+#define HDR_ISTYPE_DATA(hdr) (!HDR_ISTYPE_METADATA(hdr))
+
+#define HDR_HAS_L1HDR(hdr) ((hdr)->b_flags & ARC_FLAG_HAS_L1HDR)
+#define HDR_HAS_L2HDR(hdr) ((hdr)->b_flags & ARC_FLAG_HAS_L2HDR)
+
+/* For storing compression mode in b_flags */
+#define HDR_COMPRESS_OFFSET (highbit64(ARC_FLAG_COMPRESS_0) - 1)
+
+#define HDR_GET_COMPRESS(hdr) ((enum zio_compress)BF32_GET((hdr)->b_flags, \
+ HDR_COMPRESS_OFFSET, SPA_COMPRESSBITS))
+#define HDR_SET_COMPRESS(hdr, cmp) BF32_SET((hdr)->b_flags, \
+ HDR_COMPRESS_OFFSET, SPA_COMPRESSBITS, (cmp))
+
+#define ARC_BUF_LAST(buf) ((buf)->b_next == NULL)
+#define ARC_BUF_SHARED(buf) ((buf)->b_flags & ARC_BUF_FLAG_SHARED)
+#define ARC_BUF_COMPRESSED(buf) ((buf)->b_flags & ARC_BUF_FLAG_COMPRESSED)
+
+/*
+ * Other sizes
+ */
+
+#define HDR_FULL_SIZE ((int64_t)sizeof (arc_buf_hdr_t))
+#define HDR_L2ONLY_SIZE ((int64_t)offsetof(arc_buf_hdr_t, b_l1hdr))
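+
+/*
+ * HDR_L2ONLY_SIZE relies on b_l1hdr being the final member of
+ * arc_buf_hdr_t above: offsetof() to it yields the size of a header
+ * with the L1-only fields truncated away.
+ */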
+
+/*
+ * Hash table routines
+ */
+
+#define HT_LOCK_PAD CACHE_LINE_SIZE
+
+struct ht_lock {
+ kmutex_t ht_lock;
+#ifdef _KERNEL
+ unsigned char pad[(HT_LOCK_PAD - sizeof (kmutex_t))];
+#endif
+};
+
+#define BUF_LOCKS 256
+typedef struct buf_hash_table {
+ uint64_t ht_mask;
+ arc_buf_hdr_t **ht_table;
+ struct ht_lock ht_locks[BUF_LOCKS] __aligned(CACHE_LINE_SIZE);
+} buf_hash_table_t;
+
+static buf_hash_table_t buf_hash_table;
+
+#define BUF_HASH_INDEX(spa, dva, birth) \
+ (buf_hash(spa, dva, birth) & buf_hash_table.ht_mask)
+#define BUF_HASH_LOCK_NTRY(idx) (buf_hash_table.ht_locks[idx & (BUF_LOCKS-1)])
+#define BUF_HASH_LOCK(idx) (&(BUF_HASH_LOCK_NTRY(idx).ht_lock))
+#define HDR_LOCK(hdr) \
+ (BUF_HASH_LOCK(BUF_HASH_INDEX(hdr->b_spa, &hdr->b_dva, hdr->b_birth)))
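+
+/*
+ * Typical use of HDR_LOCK() (an illustrative pattern, not a quote from
+ * the code below):
+ *
+ *	kmutex_t *hash_lock = HDR_LOCK(hdr);
+ *	mutex_enter(hash_lock);
+ *	... manipulate fields protected by the hash lock ...
+ *	mutex_exit(hash_lock);
+ */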
+
+uint64_t zfs_crc64_table[256];
+
+/*
+ * Level 2 ARC
+ */
+
+#define L2ARC_WRITE_SIZE (8 * 1024 * 1024) /* initial write max */
+#define L2ARC_HEADROOM 2 /* num of writes */
+/*
+ * If we discover during ARC scan any buffers to be compressed, we boost
+ * our headroom for the next scanning cycle by this percentage multiple.
+ */
+#define L2ARC_HEADROOM_BOOST 200
+#define L2ARC_FEED_SECS 1 /* caching interval secs */
+#define L2ARC_FEED_MIN_MS 200 /* min caching interval ms */
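+
+/*
+ * Illustrative arithmetic (assuming the usual headroom formula): a feed
+ * cycle targeting l2arc_write_max (8 MiB) scans up to 2 * 8 MiB = 16 MiB
+ * of eligible buffers, and a 200% boost doubles that to 32 MiB on the
+ * next cycle after compressible buffers are seen.
+ */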
+
+#define l2arc_writes_sent ARCSTAT(arcstat_l2_writes_sent)
+#define l2arc_writes_done ARCSTAT(arcstat_l2_writes_done)
+
+/* L2ARC Performance Tunables */
+uint64_t l2arc_write_max = L2ARC_WRITE_SIZE; /* default max write size */
+uint64_t l2arc_write_boost = L2ARC_WRITE_SIZE; /* extra write during warmup */
+uint64_t l2arc_headroom = L2ARC_HEADROOM; /* number of dev writes */
+uint64_t l2arc_headroom_boost = L2ARC_HEADROOM_BOOST;
+uint64_t l2arc_feed_secs = L2ARC_FEED_SECS; /* interval seconds */
+uint64_t l2arc_feed_min_ms = L2ARC_FEED_MIN_MS; /* min interval milliseconds */
+boolean_t l2arc_noprefetch = B_TRUE; /* don't cache prefetch bufs */
+boolean_t l2arc_feed_again = B_TRUE; /* turbo warmup */
+boolean_t l2arc_norw = B_TRUE; /* no reads during writes */
+
+SYSCTL_UQUAD(_vfs_zfs, OID_AUTO, l2arc_write_max, CTLFLAG_RWTUN,
+ &l2arc_write_max, 0, "max write size");
+SYSCTL_UQUAD(_vfs_zfs, OID_AUTO, l2arc_write_boost, CTLFLAG_RWTUN,
+ &l2arc_write_boost, 0, "extra write during warmup");
+SYSCTL_UQUAD(_vfs_zfs, OID_AUTO, l2arc_headroom, CTLFLAG_RWTUN,
+ &l2arc_headroom, 0, "number of dev writes");
+SYSCTL_UQUAD(_vfs_zfs, OID_AUTO, l2arc_feed_secs, CTLFLAG_RWTUN,
+ &l2arc_feed_secs, 0, "interval seconds");
+SYSCTL_UQUAD(_vfs_zfs, OID_AUTO, l2arc_feed_min_ms, CTLFLAG_RWTUN,
+ &l2arc_feed_min_ms, 0, "min interval milliseconds");
+
+SYSCTL_INT(_vfs_zfs, OID_AUTO, l2arc_noprefetch, CTLFLAG_RWTUN,
+ &l2arc_noprefetch, 0, "don't cache prefetch bufs");
+SYSCTL_INT(_vfs_zfs, OID_AUTO, l2arc_feed_again, CTLFLAG_RWTUN,
+ &l2arc_feed_again, 0, "turbo warmup");
+SYSCTL_INT(_vfs_zfs, OID_AUTO, l2arc_norw, CTLFLAG_RWTUN,
+ &l2arc_norw, 0, "no reads during writes");
+
+SYSCTL_UQUAD(_vfs_zfs, OID_AUTO, anon_size, CTLFLAG_RD,
+ &ARC_anon.arcs_size.rc_count, 0, "size of anonymous state");
+SYSCTL_UQUAD(_vfs_zfs, OID_AUTO, anon_metadata_esize, CTLFLAG_RD,
+ &ARC_anon.arcs_esize[ARC_BUFC_METADATA].rc_count, 0,
+ "size of anonymous state");
+SYSCTL_UQUAD(_vfs_zfs, OID_AUTO, anon_data_esize, CTLFLAG_RD,
+ &ARC_anon.arcs_esize[ARC_BUFC_DATA].rc_count, 0,
+ "size of anonymous state");
+
+SYSCTL_UQUAD(_vfs_zfs, OID_AUTO, mru_size, CTLFLAG_RD,
+ &ARC_mru.arcs_size.rc_count, 0, "size of mru state");
+SYSCTL_UQUAD(_vfs_zfs, OID_AUTO, mru_metadata_esize, CTLFLAG_RD,
+ &ARC_mru.arcs_esize[ARC_BUFC_METADATA].rc_count, 0,
+ "size of metadata in mru state");
+SYSCTL_UQUAD(_vfs_zfs, OID_AUTO, mru_data_esize, CTLFLAG_RD,
+ &ARC_mru.arcs_esize[ARC_BUFC_DATA].rc_count, 0,
+ "size of data in mru state");
+
+SYSCTL_UQUAD(_vfs_zfs, OID_AUTO, mru_ghost_size, CTLFLAG_RD,
+ &ARC_mru_ghost.arcs_size.rc_count, 0, "size of mru ghost state");
+SYSCTL_UQUAD(_vfs_zfs, OID_AUTO, mru_ghost_metadata_esize, CTLFLAG_RD,
+ &ARC_mru_ghost.arcs_esize[ARC_BUFC_METADATA].rc_count, 0,
+ "size of metadata in mru ghost state");
+SYSCTL_UQUAD(_vfs_zfs, OID_AUTO, mru_ghost_data_esize, CTLFLAG_RD,
+ &ARC_mru_ghost.arcs_esize[ARC_BUFC_DATA].rc_count, 0,
+ "size of data in mru ghost state");
+
+SYSCTL_UQUAD(_vfs_zfs, OID_AUTO, mfu_size, CTLFLAG_RD,
+ &ARC_mfu.arcs_size.rc_count, 0, "size of mfu state");
+SYSCTL_UQUAD(_vfs_zfs, OID_AUTO, mfu_metadata_esize, CTLFLAG_RD,
+ &ARC_mfu.arcs_esize[ARC_BUFC_METADATA].rc_count, 0,
+ "size of metadata in mfu state");
+SYSCTL_UQUAD(_vfs_zfs, OID_AUTO, mfu_data_esize, CTLFLAG_RD,
+ &ARC_mfu.arcs_esize[ARC_BUFC_DATA].rc_count, 0,
+ "size of data in mfu state");
+
+SYSCTL_UQUAD(_vfs_zfs, OID_AUTO, mfu_ghost_size, CTLFLAG_RD,
+ &ARC_mfu_ghost.arcs_size.rc_count, 0, "size of mfu ghost state");
+SYSCTL_UQUAD(_vfs_zfs, OID_AUTO, mfu_ghost_metadata_esize, CTLFLAG_RD,
+ &ARC_mfu_ghost.arcs_esize[ARC_BUFC_METADATA].rc_count, 0,
+ "size of metadata in mfu ghost state");
+SYSCTL_UQUAD(_vfs_zfs, OID_AUTO, mfu_ghost_data_esize, CTLFLAG_RD,
+ &ARC_mfu_ghost.arcs_esize[ARC_BUFC_DATA].rc_count, 0,
+ "size of data in mfu ghost state");
+
+SYSCTL_UQUAD(_vfs_zfs, OID_AUTO, l2c_only_size, CTLFLAG_RD,
+ &ARC_l2c_only.arcs_size.rc_count, 0, "size of mru state");
+
+SYSCTL_UINT(_vfs_zfs, OID_AUTO, arc_min_prefetch_ms, CTLFLAG_RW,
+ &zfs_arc_min_prefetch_ms, 0, "Min life of prefetch block in ms");
+SYSCTL_UINT(_vfs_zfs, OID_AUTO, arc_min_prescient_prefetch_ms, CTLFLAG_RW,
+ &zfs_arc_min_prescient_prefetch_ms, 0, "Min life of prescient prefetched block in ms");
+
+/*
+ * L2ARC Internals
+ */
+struct l2arc_dev {
+ vdev_t *l2ad_vdev; /* vdev */
+ spa_t *l2ad_spa; /* spa */
+ uint64_t l2ad_hand; /* next write location */
+ uint64_t l2ad_start; /* first addr on device */
+ uint64_t l2ad_end; /* last addr on device */
+ boolean_t l2ad_first; /* first sweep through */
+ boolean_t l2ad_writing; /* currently writing */
+ kmutex_t l2ad_mtx; /* lock for buffer list */
+ list_t l2ad_buflist; /* buffer list */
+ list_node_t l2ad_node; /* device list node */
+ zfs_refcount_t l2ad_alloc; /* allocated bytes */
+};
+
+static list_t L2ARC_dev_list; /* device list */
+static list_t *l2arc_dev_list; /* device list pointer */
+static kmutex_t l2arc_dev_mtx; /* device list mutex */
+static l2arc_dev_t *l2arc_dev_last; /* last device used */
+static list_t L2ARC_free_on_write; /* free after write buf list */
+static list_t *l2arc_free_on_write; /* free after write list ptr */
+static kmutex_t l2arc_free_on_write_mtx; /* mutex for list */
+static uint64_t l2arc_ndev; /* number of devices */
+
+typedef struct l2arc_read_callback {
+ arc_buf_hdr_t *l2rcb_hdr; /* read header */
+ blkptr_t l2rcb_bp; /* original blkptr */
+ zbookmark_phys_t l2rcb_zb; /* original bookmark */
+ int l2rcb_flags; /* original flags */
+ abd_t *l2rcb_abd; /* temporary buffer */
+} l2arc_read_callback_t;
+
+typedef struct l2arc_write_callback {
+ l2arc_dev_t *l2wcb_dev; /* device info */
+ arc_buf_hdr_t *l2wcb_head; /* head of write buflist */
+} l2arc_write_callback_t;
+
+typedef struct l2arc_data_free {
+ /* protected by l2arc_free_on_write_mtx */
+ abd_t *l2df_abd;
+ size_t l2df_size;
+ arc_buf_contents_t l2df_type;
+ list_node_t l2df_list_node;
+} l2arc_data_free_t;
+
+static kmutex_t l2arc_feed_thr_lock;
+static kcondvar_t l2arc_feed_thr_cv;
+static uint8_t l2arc_thread_exit;
+
+static abd_t *arc_get_data_abd(arc_buf_hdr_t *, uint64_t, void *, boolean_t);
+static void *arc_get_data_buf(arc_buf_hdr_t *, uint64_t, void *);
+static void arc_get_data_impl(arc_buf_hdr_t *, uint64_t, void *, boolean_t);
+static void arc_free_data_abd(arc_buf_hdr_t *, abd_t *, uint64_t, void *);
+static void arc_free_data_buf(arc_buf_hdr_t *, void *, uint64_t, void *);
+static void arc_free_data_impl(arc_buf_hdr_t *hdr, uint64_t size, void *tag);
+static void arc_hdr_free_pabd(arc_buf_hdr_t *);
+static void arc_hdr_alloc_pabd(arc_buf_hdr_t *, boolean_t);
+static void arc_access(arc_buf_hdr_t *, kmutex_t *);
+static boolean_t arc_is_overflowing(void);
+static void arc_buf_watch(arc_buf_t *);
+static void arc_prune_async(int64_t);
+
+static arc_buf_contents_t arc_buf_type(arc_buf_hdr_t *);
+static uint32_t arc_bufc_to_flags(arc_buf_contents_t);
+static inline void arc_hdr_set_flags(arc_buf_hdr_t *hdr, arc_flags_t flags);
+static inline void arc_hdr_clear_flags(arc_buf_hdr_t *hdr, arc_flags_t flags);
+
+static boolean_t l2arc_write_eligible(uint64_t, arc_buf_hdr_t *);
+static void l2arc_read_done(zio_t *);
+
+static void
+l2arc_trim(const arc_buf_hdr_t *hdr)
+{
+ l2arc_dev_t *dev = hdr->b_l2hdr.b_dev;
+
+ ASSERT(HDR_HAS_L2HDR(hdr));
+ ASSERT(MUTEX_HELD(&dev->l2ad_mtx));
+
+ if (HDR_GET_PSIZE(hdr) != 0) {
+ trim_map_free(dev->l2ad_vdev, hdr->b_l2hdr.b_daddr,
+ HDR_GET_PSIZE(hdr), 0);
+ }
+}
+
+/*
+ * We use Cityhash for this. It's fast, and has good hash properties without
+ * requiring any large static buffers.
+ */
+static uint64_t
+buf_hash(uint64_t spa, const dva_t *dva, uint64_t birth)
+{
+ return (cityhash4(spa, dva->dva_word[0], dva->dva_word[1], birth));
+}
+
+#define HDR_EMPTY(hdr) \
+ ((hdr)->b_dva.dva_word[0] == 0 && \
+ (hdr)->b_dva.dva_word[1] == 0)
+
+#define HDR_EQUAL(spa, dva, birth, hdr) \
+ ((hdr)->b_dva.dva_word[0] == (dva)->dva_word[0]) && \
+ ((hdr)->b_dva.dva_word[1] == (dva)->dva_word[1]) && \
+ ((hdr)->b_birth == birth) && ((hdr)->b_spa == spa)
+
+static void
+buf_discard_identity(arc_buf_hdr_t *hdr)
+{
+ hdr->b_dva.dva_word[0] = 0;
+ hdr->b_dva.dva_word[1] = 0;
+ hdr->b_birth = 0;
+}
+
+static arc_buf_hdr_t *
+buf_hash_find(uint64_t spa, const blkptr_t *bp, kmutex_t **lockp)
+{
+ const dva_t *dva = BP_IDENTITY(bp);
+ uint64_t birth = BP_PHYSICAL_BIRTH(bp);
+ uint64_t idx = BUF_HASH_INDEX(spa, dva, birth);
+ kmutex_t *hash_lock = BUF_HASH_LOCK(idx);
+ arc_buf_hdr_t *hdr;
+
+ mutex_enter(hash_lock);
+ for (hdr = buf_hash_table.ht_table[idx]; hdr != NULL;
+ hdr = hdr->b_hash_next) {
+ if (HDR_EQUAL(spa, dva, birth, hdr)) {
+ *lockp = hash_lock;
+ return (hdr);
+ }
+ }
+ mutex_exit(hash_lock);
+ *lockp = NULL;
+ return (NULL);
+}
+
+/*
+ * Insert an entry into the hash table. If there is already an element
+ * equal to hdr in the hash table, then the already existing element
+ * will be returned and the new element will not be inserted.
+ * Otherwise returns NULL.
+ * If lockp == NULL, the caller is assumed to already hold the hash lock.
+ */
+static arc_buf_hdr_t *
+buf_hash_insert(arc_buf_hdr_t *hdr, kmutex_t **lockp)
+{
+ uint64_t idx = BUF_HASH_INDEX(hdr->b_spa, &hdr->b_dva, hdr->b_birth);
+ kmutex_t *hash_lock = BUF_HASH_LOCK(idx);
+ arc_buf_hdr_t *fhdr;
+ uint32_t i;
+
+ ASSERT(!DVA_IS_EMPTY(&hdr->b_dva));
+ ASSERT(hdr->b_birth != 0);
+ ASSERT(!HDR_IN_HASH_TABLE(hdr));
+
+ if (lockp != NULL) {
+ *lockp = hash_lock;
+ mutex_enter(hash_lock);
+ } else {
+ ASSERT(MUTEX_HELD(hash_lock));
+ }
+
+ for (fhdr = buf_hash_table.ht_table[idx], i = 0; fhdr != NULL;
+ fhdr = fhdr->b_hash_next, i++) {
+ if (HDR_EQUAL(hdr->b_spa, &hdr->b_dva, hdr->b_birth, fhdr))
+ return (fhdr);
+ }
+
+ hdr->b_hash_next = buf_hash_table.ht_table[idx];
+ buf_hash_table.ht_table[idx] = hdr;
+ arc_hdr_set_flags(hdr, ARC_FLAG_IN_HASH_TABLE);
+
+ /* collect some hash table performance data */
+ if (i > 0) {
+ ARCSTAT_BUMP(arcstat_hash_collisions);
+ if (i == 1)
+ ARCSTAT_BUMP(arcstat_hash_chains);
+
+ ARCSTAT_MAX(arcstat_hash_chain_max, i);
+ }
+
+ ARCSTAT_BUMP(arcstat_hash_elements);
+ ARCSTAT_MAXSTAT(arcstat_hash_elements);
+
+ return (NULL);
+}
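+
+/*
+ * Typical caller pattern (illustrative):
+ *
+ *	arc_buf_hdr_t *exists = buf_hash_insert(hdr, &hash_lock);
+ *	if (exists != NULL)
+ *		... an equal header won the race and is already present ...
+ *	...
+ *	mutex_exit(hash_lock);
+ */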
+
+static void
+buf_hash_remove(arc_buf_hdr_t *hdr)
+{
+ arc_buf_hdr_t *fhdr, **hdrp;
+ uint64_t idx = BUF_HASH_INDEX(hdr->b_spa, &hdr->b_dva, hdr->b_birth);
+
+ ASSERT(MUTEX_HELD(BUF_HASH_LOCK(idx)));
+ ASSERT(HDR_IN_HASH_TABLE(hdr));
+
+ hdrp = &buf_hash_table.ht_table[idx];
+ while ((fhdr = *hdrp) != hdr) {
+ ASSERT3P(fhdr, !=, NULL);
+ hdrp = &fhdr->b_hash_next;
+ }
+ *hdrp = hdr->b_hash_next;
+ hdr->b_hash_next = NULL;
+ arc_hdr_clear_flags(hdr, ARC_FLAG_IN_HASH_TABLE);
+
+ /* collect some hash table performance data */
+ ARCSTAT_BUMPDOWN(arcstat_hash_elements);
+
+ if (buf_hash_table.ht_table[idx] &&
+ buf_hash_table.ht_table[idx]->b_hash_next == NULL)
+ ARCSTAT_BUMPDOWN(arcstat_hash_chains);
+}
+
+/*
+ * Global data structures and functions for the buf kmem cache.
+ */
+static kmem_cache_t *hdr_full_cache;
+static kmem_cache_t *hdr_l2only_cache;
+static kmem_cache_t *buf_cache;
+
+static void
+buf_fini(void)
+{
+ int i;
+
+ kmem_free(buf_hash_table.ht_table,
+ (buf_hash_table.ht_mask + 1) * sizeof (void *));
+ for (i = 0; i < BUF_LOCKS; i++)
+ mutex_destroy(&buf_hash_table.ht_locks[i].ht_lock);
+ kmem_cache_destroy(hdr_full_cache);
+ kmem_cache_destroy(hdr_l2only_cache);
+ kmem_cache_destroy(buf_cache);
+}
+
+/*
+ * Constructor callback - called when the cache is empty
+ * and a new buf is requested.
+ */
+/* ARGSUSED */
+static int
+hdr_full_cons(void *vbuf, void *unused, int kmflag)
+{
+ arc_buf_hdr_t *hdr = vbuf;
+
+ bzero(hdr, HDR_FULL_SIZE);
+ cv_init(&hdr->b_l1hdr.b_cv, NULL, CV_DEFAULT, NULL);
+ zfs_refcount_create(&hdr->b_l1hdr.b_refcnt);
+ mutex_init(&hdr->b_l1hdr.b_freeze_lock, NULL, MUTEX_DEFAULT, NULL);
+ multilist_link_init(&hdr->b_l1hdr.b_arc_node);
+ arc_space_consume(HDR_FULL_SIZE, ARC_SPACE_HDRS);
+
+ return (0);
+}
+
+/* ARGSUSED */
+static int
+hdr_l2only_cons(void *vbuf, void *unused, int kmflag)
+{
+ arc_buf_hdr_t *hdr = vbuf;
+
+ bzero(hdr, HDR_L2ONLY_SIZE);
+ arc_space_consume(HDR_L2ONLY_SIZE, ARC_SPACE_L2HDRS);
+
+ return (0);
+}
+
+/* ARGSUSED */
+static int
+buf_cons(void *vbuf, void *unused, int kmflag)
+{
+ arc_buf_t *buf = vbuf;
+
+ bzero(buf, sizeof (arc_buf_t));
+ mutex_init(&buf->b_evict_lock, NULL, MUTEX_DEFAULT, NULL);
+ arc_space_consume(sizeof (arc_buf_t), ARC_SPACE_HDRS);
+
+ return (0);
+}
+
+/*
+ * Destructor callback - called when a cached buf is
+ * no longer required.
+ */
+/* ARGSUSED */
+static void
+hdr_full_dest(void *vbuf, void *unused)
+{
+ arc_buf_hdr_t *hdr = vbuf;
+
+ ASSERT(HDR_EMPTY(hdr));
+ cv_destroy(&hdr->b_l1hdr.b_cv);
+ zfs_refcount_destroy(&hdr->b_l1hdr.b_refcnt);
+ mutex_destroy(&hdr->b_l1hdr.b_freeze_lock);
+ ASSERT(!multilist_link_active(&hdr->b_l1hdr.b_arc_node));
+ arc_space_return(HDR_FULL_SIZE, ARC_SPACE_HDRS);
+}
+
+/* ARGSUSED */
+static void
+hdr_l2only_dest(void *vbuf, void *unused)
+{
+ arc_buf_hdr_t *hdr = vbuf;
+
+ ASSERT(HDR_EMPTY(hdr));
+ arc_space_return(HDR_L2ONLY_SIZE, ARC_SPACE_L2HDRS);
+}
+
+/* ARGSUSED */
+static void
+buf_dest(void *vbuf, void *unused)
+{
+ arc_buf_t *buf = vbuf;
+
+ mutex_destroy(&buf->b_evict_lock);
+ arc_space_return(sizeof (arc_buf_t), ARC_SPACE_HDRS);
+}
+
+/*
+ * Reclaim callback -- invoked when memory is low.
+ */
+/* ARGSUSED */
+static void
+hdr_recl(void *unused)
+{
+ dprintf("hdr_recl called\n");
+ /*
+ * umem calls the reclaim func when we destroy the buf cache,
+ * which is after we do arc_fini().
+ */
+ if (arc_initialized)
+ zthr_wakeup(arc_reap_zthr);
+}
+
+static void
+buf_init(void)
+{
+ uint64_t *ct;
+ uint64_t hsize = 1ULL << 12;
+ int i, j;
+
+ /*
+ * The hash table is big enough to fill all of physical memory
+ * with an average block size of zfs_arc_average_blocksize (default 8K).
+ * By default, the table will take up
+ * totalmem * sizeof(void*) / 8K (1MB per GB with 8-byte pointers).
+ */
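+	/*
+	 * Worked example (hypothetical 16 GB machine, default 8K
+	 * average block size): hsize doubles from 4096 until
+	 * hsize * 8K covers 16 GB, i.e. hsize = 2M entries, so the
+	 * table occupies 2M * 8 bytes = 16 MB.
+	 */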
+ while (hsize * zfs_arc_average_blocksize < (uint64_t)physmem * PAGESIZE)
+ hsize <<= 1;
+retry:
+ buf_hash_table.ht_mask = hsize - 1;
+ buf_hash_table.ht_table =
+ kmem_zalloc(hsize * sizeof (void*), KM_NOSLEEP);
+ if (buf_hash_table.ht_table == NULL) {
+ ASSERT(hsize > (1ULL << 8));
+ hsize >>= 1;
+ goto retry;
+ }
+
+ hdr_full_cache = kmem_cache_create("arc_buf_hdr_t_full", HDR_FULL_SIZE,
+ 0, hdr_full_cons, hdr_full_dest, hdr_recl, NULL, NULL, 0);
+ hdr_l2only_cache = kmem_cache_create("arc_buf_hdr_t_l2only",
+ HDR_L2ONLY_SIZE, 0, hdr_l2only_cons, hdr_l2only_dest, hdr_recl,
+ NULL, NULL, 0);
+ buf_cache = kmem_cache_create("arc_buf_t", sizeof (arc_buf_t),
+ 0, buf_cons, buf_dest, NULL, NULL, NULL, 0);
+
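+	/*
+	 * Generate the table for the reflected ZFS_CRC64_POLY CRC:
+	 * for each byte value, shift right eight times, XORing in the
+	 * polynomial whenever the bit shifted out is set.
+	 */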
+ for (i = 0; i < 256; i++)
+ for (ct = zfs_crc64_table + i, *ct = i, j = 8; j > 0; j--)
+ *ct = (*ct >> 1) ^ (-(*ct & 1) & ZFS_CRC64_POLY);
+
+ for (i = 0; i < BUF_LOCKS; i++) {
+ mutex_init(&buf_hash_table.ht_locks[i].ht_lock,
+ NULL, MUTEX_DEFAULT, NULL);
+ }
+}
+
+/*
+ * This is the size that the buf occupies in memory. If the buf is compressed,
+ * it will correspond to the compressed size. You should use this method of
+ * getting the buf size unless you explicitly need the logical size.
+ */
+int32_t
+arc_buf_size(arc_buf_t *buf)
+{
+ return (ARC_BUF_COMPRESSED(buf) ?
+ HDR_GET_PSIZE(buf->b_hdr) : HDR_GET_LSIZE(buf->b_hdr));
+}
+
+int32_t
+arc_buf_lsize(arc_buf_t *buf)
+{
+ return (HDR_GET_LSIZE(buf->b_hdr));
+}
+
+enum zio_compress
+arc_get_compression(arc_buf_t *buf)
+{
+ return (ARC_BUF_COMPRESSED(buf) ?
+ HDR_GET_COMPRESS(buf->b_hdr) : ZIO_COMPRESS_OFF);
+}
+
+#define ARC_MINTIME (hz>>4) /* 62 ms */
+
+static inline boolean_t
+arc_buf_is_shared(arc_buf_t *buf)
+{
+ boolean_t shared = (buf->b_data != NULL &&
+ buf->b_hdr->b_l1hdr.b_pabd != NULL &&
+ abd_is_linear(buf->b_hdr->b_l1hdr.b_pabd) &&
+ buf->b_data == abd_to_buf(buf->b_hdr->b_l1hdr.b_pabd));
+ IMPLY(shared, HDR_SHARED_DATA(buf->b_hdr));
+ IMPLY(shared, ARC_BUF_SHARED(buf));
+ IMPLY(shared, ARC_BUF_COMPRESSED(buf) || ARC_BUF_LAST(buf));
+
+ /*
+ * It would be nice to assert arc_can_share() too, but the "hdr isn't
+ * already being shared" requirement prevents us from doing that.
+ */
+
+ return (shared);
+}
+
+/*
+ * Free the checksum associated with this header. If there is no checksum, this
+ * is a no-op.
+ */
+static inline void
+arc_cksum_free(arc_buf_hdr_t *hdr)
+{
+ ASSERT(HDR_HAS_L1HDR(hdr));
+ mutex_enter(&hdr->b_l1hdr.b_freeze_lock);
+ if (hdr->b_l1hdr.b_freeze_cksum != NULL) {
+ kmem_free(hdr->b_l1hdr.b_freeze_cksum, sizeof (zio_cksum_t));
+ hdr->b_l1hdr.b_freeze_cksum = NULL;
+ }
+ mutex_exit(&hdr->b_l1hdr.b_freeze_lock);
+}
+
+/*
+ * Return true iff at least one of the bufs on hdr is not compressed.
+ */
+static boolean_t
+arc_hdr_has_uncompressed_buf(arc_buf_hdr_t *hdr)
+{
+ for (arc_buf_t *b = hdr->b_l1hdr.b_buf; b != NULL; b = b->b_next) {
+ if (!ARC_BUF_COMPRESSED(b)) {
+ return (B_TRUE);
+ }
+ }
+ return (B_FALSE);
+}
+
+/*
+ * If we've turned on the ZFS_DEBUG_MODIFY flag, verify that the buf's data
+ * matches the checksum that is stored in the hdr. If there is no checksum,
+ * or if the buf is compressed, this is a no-op.
+ */
+static void
+arc_cksum_verify(arc_buf_t *buf)
+{
+ arc_buf_hdr_t *hdr = buf->b_hdr;
+ zio_cksum_t zc;
+
+ if (!(zfs_flags & ZFS_DEBUG_MODIFY))
+ return;
+
+ if (ARC_BUF_COMPRESSED(buf)) {
+ ASSERT(hdr->b_l1hdr.b_freeze_cksum == NULL ||
+ arc_hdr_has_uncompressed_buf(hdr));
+ return;
+ }
+
+ ASSERT(HDR_HAS_L1HDR(hdr));
+
+ mutex_enter(&hdr->b_l1hdr.b_freeze_lock);
+ if (hdr->b_l1hdr.b_freeze_cksum == NULL || HDR_IO_ERROR(hdr)) {
+ mutex_exit(&hdr->b_l1hdr.b_freeze_lock);
+ return;
+ }
+
+ fletcher_2_native(buf->b_data, arc_buf_size(buf), NULL, &zc);
+ if (!ZIO_CHECKSUM_EQUAL(*hdr->b_l1hdr.b_freeze_cksum, zc))
+ panic("buffer modified while frozen!");
+ mutex_exit(&hdr->b_l1hdr.b_freeze_lock);
+}
+
+static boolean_t
+arc_cksum_is_equal(arc_buf_hdr_t *hdr, zio_t *zio)
+{
+ enum zio_compress compress = BP_GET_COMPRESS(zio->io_bp);
+ boolean_t valid_cksum;
+
+ ASSERT(!BP_IS_EMBEDDED(zio->io_bp));
+ VERIFY3U(BP_GET_PSIZE(zio->io_bp), ==, HDR_GET_PSIZE(hdr));
+
+ /*
+ * We rely on the blkptr's checksum to determine if the block
+ * is valid or not. When compressed arc is enabled, the l2arc
+ * writes the block to the l2arc just as it appears in the pool.
+ * This allows us to use the blkptr's checksum to validate the
+ * data that we just read off of the l2arc without having to store
+ * a separate checksum in the arc_buf_hdr_t. However, if compressed
+ * arc is disabled, then the data written to the l2arc is always
+ * uncompressed and won't match the block as it exists in the main
+ * pool. When this is the case, we must first compress it if it is
+ * compressed on the main pool before we can validate the checksum.
+ */
+ if (!HDR_COMPRESSION_ENABLED(hdr) && compress != ZIO_COMPRESS_OFF) {
+ ASSERT3U(HDR_GET_COMPRESS(hdr), ==, ZIO_COMPRESS_OFF);
+ uint64_t lsize = HDR_GET_LSIZE(hdr);
+ uint64_t csize;
+
+ abd_t *cdata = abd_alloc_linear(HDR_GET_PSIZE(hdr), B_TRUE);
+ csize = zio_compress_data(compress, zio->io_abd,
+ abd_to_buf(cdata), lsize);
+
+ ASSERT3U(csize, <=, HDR_GET_PSIZE(hdr));
+ if (csize < HDR_GET_PSIZE(hdr)) {
+ /*
+ * Compressed blocks are always a multiple of the
+ * smallest ashift in the pool. Ideally, we would
+ * like to round up the csize to the next
+ * spa_min_ashift but that value may have changed
+ * since the block was last written. Instead,
+ * we rely on the fact that the hdr's psize
+ * was set to the psize of the block when it was
+ * last written. We set the csize to that value
+ * and zero out any part that should not contain
+ * data.
+ */
+ abd_zero_off(cdata, csize, HDR_GET_PSIZE(hdr) - csize);
+ csize = HDR_GET_PSIZE(hdr);
+ }
+ zio_push_transform(zio, cdata, csize, HDR_GET_PSIZE(hdr), NULL);
+ }
+
+ /*
+ * Block pointers always store the checksum for the logical data.
+ * If the block pointer has the gang bit set, then the checksum
+ * it represents is for the reconstituted data and not for an
+ * individual gang member. The zio pipeline, however, must be able to
+ * determine the checksum of each of the gang constituents so it
+ * treats the checksum comparison differently than what we need
+ * for l2arc blocks. This prevents us from using the
+ * zio_checksum_error() interface directly. Instead we must call the
+ * zio_checksum_error_impl() so that we can ensure the checksum is
+ * generated using the correct checksum algorithm and accounts for the
+ * logical I/O size and not just a gang fragment.
+ */
+ valid_cksum = (zio_checksum_error_impl(zio->io_spa, zio->io_bp,
+ BP_GET_CHECKSUM(zio->io_bp), zio->io_abd, zio->io_size,
+ zio->io_offset, NULL) == 0);
+ zio_pop_transforms(zio);
+ return (valid_cksum);
+}
+
+/*
+ * Given a buf full of data, if ZFS_DEBUG_MODIFY is enabled this computes a
+ * checksum and attaches it to the buf's hdr so that we can ensure that the buf
+ * isn't modified later on. If buf is compressed or there is already a checksum
+ * on the hdr, this is a no-op (we only checksum uncompressed bufs).
+ */
+static void
+arc_cksum_compute(arc_buf_t *buf)
+{
+ arc_buf_hdr_t *hdr = buf->b_hdr;
+
+ if (!(zfs_flags & ZFS_DEBUG_MODIFY))
+ return;
+
+ ASSERT(HDR_HAS_L1HDR(hdr));
+
+ mutex_enter(&buf->b_hdr->b_l1hdr.b_freeze_lock);
+ if (hdr->b_l1hdr.b_freeze_cksum != NULL) {
+ ASSERT(arc_hdr_has_uncompressed_buf(hdr));
+ mutex_exit(&hdr->b_l1hdr.b_freeze_lock);
+ return;
+ } else if (ARC_BUF_COMPRESSED(buf)) {
+ mutex_exit(&hdr->b_l1hdr.b_freeze_lock);
+ return;
+ }
+
+ ASSERT(!ARC_BUF_COMPRESSED(buf));
+ hdr->b_l1hdr.b_freeze_cksum = kmem_alloc(sizeof (zio_cksum_t),
+ KM_SLEEP);
+ fletcher_2_native(buf->b_data, arc_buf_size(buf), NULL,
+ hdr->b_l1hdr.b_freeze_cksum);
+ mutex_exit(&hdr->b_l1hdr.b_freeze_lock);
+#ifdef illumos
+ arc_buf_watch(buf);
+#endif
+}
+
+#ifdef illumos
+#ifndef _KERNEL
+typedef struct procctl {
+ long cmd;
+ prwatch_t prwatch;
+} procctl_t;
+#endif
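+
+/*
+ * Userland-only debug aid: when arc_watch is enabled, these helpers
+ * write PCWATCH requests to arc_procfd so that /proc arms
+ * (arc_buf_watch) or disarms (arc_buf_unwatch) a write watchpoint over
+ * the buf's data, making any modification of a frozen buf fault
+ * immediately.
+ */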
+
+/* ARGSUSED */
+static void
+arc_buf_unwatch(arc_buf_t *buf)
+{
+#ifndef _KERNEL
+ if (arc_watch) {
+ int result;
+ procctl_t ctl;
+ ctl.cmd = PCWATCH;
+ ctl.prwatch.pr_vaddr = (uintptr_t)buf->b_data;
+ ctl.prwatch.pr_size = 0;
+ ctl.prwatch.pr_wflags = 0;
+ result = write(arc_procfd, &ctl, sizeof (ctl));
+ ASSERT3U(result, ==, sizeof (ctl));
+ }
+#endif
+}
+
+/* ARGSUSED */
+static void
+arc_buf_watch(arc_buf_t *buf)
+{
+#ifndef _KERNEL
+ if (arc_watch) {
+ int result;
+ procctl_t ctl;
+ ctl.cmd = PCWATCH;
+ ctl.prwatch.pr_vaddr = (uintptr_t)buf->b_data;
+ ctl.prwatch.pr_size = arc_buf_size(buf);
+ ctl.prwatch.pr_wflags = WA_WRITE;
+ result = write(arc_procfd, &ctl, sizeof (ctl));
+ ASSERT3U(result, ==, sizeof (ctl));
+ }
+#endif
+}
+#endif /* illumos */
+
+static arc_buf_contents_t
+arc_buf_type(arc_buf_hdr_t *hdr)
+{
+ arc_buf_contents_t type;
+ if (HDR_ISTYPE_METADATA(hdr)) {
+ type = ARC_BUFC_METADATA;
+ } else {
+ type = ARC_BUFC_DATA;
+ }
+ VERIFY3U(hdr->b_type, ==, type);
+ return (type);
+}
+
+boolean_t
+arc_is_metadata(arc_buf_t *buf)
+{
+ return (HDR_ISTYPE_METADATA(buf->b_hdr) != 0);
+}
+
+static uint32_t
+arc_bufc_to_flags(arc_buf_contents_t type)
+{
+ switch (type) {
+ case ARC_BUFC_DATA:
+ /* metadata field is 0 if buffer contains normal data */
+ return (0);
+ case ARC_BUFC_METADATA:
+ return (ARC_FLAG_BUFC_METADATA);
+ default:
+ break;
+ }
+ panic("undefined ARC buffer type!");
+ return ((uint32_t)-1);
+}
+
+void
+arc_buf_thaw(arc_buf_t *buf)
+{
+ arc_buf_hdr_t *hdr = buf->b_hdr;
+
+ ASSERT3P(hdr->b_l1hdr.b_state, ==, arc_anon);
+ ASSERT(!HDR_IO_IN_PROGRESS(hdr));
+
+ arc_cksum_verify(buf);
+
+ /*
+ * Compressed buffers do not manipulate the b_freeze_cksum or
+ * allocate b_thawed.
+ */
+ if (ARC_BUF_COMPRESSED(buf)) {
+ ASSERT(hdr->b_l1hdr.b_freeze_cksum == NULL ||
+ arc_hdr_has_uncompressed_buf(hdr));
+ return;
+ }
+
+ ASSERT(HDR_HAS_L1HDR(hdr));
+ arc_cksum_free(hdr);
+
+ mutex_enter(&hdr->b_l1hdr.b_freeze_lock);
+#ifdef ZFS_DEBUG
+ if (zfs_flags & ZFS_DEBUG_MODIFY) {
+ if (hdr->b_l1hdr.b_thawed != NULL)
+ kmem_free(hdr->b_l1hdr.b_thawed, 1);
+ hdr->b_l1hdr.b_thawed = kmem_alloc(1, KM_SLEEP);
+ }
+#endif
+
+ mutex_exit(&hdr->b_l1hdr.b_freeze_lock);
+
+#ifdef illumos
+ arc_buf_unwatch(buf);
+#endif
+}
+
+void
+arc_buf_freeze(arc_buf_t *buf)
+{
+ arc_buf_hdr_t *hdr = buf->b_hdr;
+ kmutex_t *hash_lock;
+
+ if (!(zfs_flags & ZFS_DEBUG_MODIFY))
+ return;
+
+ if (ARC_BUF_COMPRESSED(buf)) {
+ ASSERT(hdr->b_l1hdr.b_freeze_cksum == NULL ||
+ arc_hdr_has_uncompressed_buf(hdr));
+ return;
+ }
+
+ hash_lock = HDR_LOCK(hdr);
+ mutex_enter(hash_lock);
+
+ ASSERT(HDR_HAS_L1HDR(hdr));
+ ASSERT(hdr->b_l1hdr.b_freeze_cksum != NULL ||
+ hdr->b_l1hdr.b_state == arc_anon);
+ arc_cksum_compute(buf);
+ mutex_exit(hash_lock);
+}
+
+/*
+ * The arc_buf_hdr_t's b_flags should never be modified directly. Instead,
+ * the following functions should be used to ensure that the flags are
+ * updated in a thread-safe way. When manipulating the flags either
+ * the hash_lock must be held or the hdr must be undiscoverable. This
+ * ensures that we're not racing with any other threads when updating
+ * the flags.
+ */
+static inline void
+arc_hdr_set_flags(arc_buf_hdr_t *hdr, arc_flags_t flags)
+{
+ ASSERT(MUTEX_HELD(HDR_LOCK(hdr)) || HDR_EMPTY(hdr));
+ hdr->b_flags |= flags;
+}
+
+static inline void
+arc_hdr_clear_flags(arc_buf_hdr_t *hdr, arc_flags_t flags)
+{
+ ASSERT(MUTEX_HELD(HDR_LOCK(hdr)) || HDR_EMPTY(hdr));
+ hdr->b_flags &= ~flags;
+}
+
+/*
+ * Setting the compression bits in the arc_buf_hdr_t's b_flags is
+ * done in a special way since we have to clear and set bits
+ * at the same time. Consumers that wish to set the compression bits
+ * must use this function to ensure that the flags are updated in a
+ * thread-safe manner.
+ */
+static void
+arc_hdr_set_compress(arc_buf_hdr_t *hdr, enum zio_compress cmp)
+{
+ ASSERT(MUTEX_HELD(HDR_LOCK(hdr)) || HDR_EMPTY(hdr));
+
+ /*
+ * Holes and embedded blocks will always have a psize = 0 so
+ * we ignore the compression of the blkptr and set the
+ * arc_buf_hdr_t's compression to ZIO_COMPRESS_OFF.
+ * Holes and embedded blocks remain anonymous so we don't
+ * want to uncompress them. Mark them as uncompressed.
+ */
+ if (!zfs_compressed_arc_enabled || HDR_GET_PSIZE(hdr) == 0) {
+ arc_hdr_clear_flags(hdr, ARC_FLAG_COMPRESSED_ARC);
+ HDR_SET_COMPRESS(hdr, ZIO_COMPRESS_OFF);
+ ASSERT(!HDR_COMPRESSION_ENABLED(hdr));
+ ASSERT3U(HDR_GET_COMPRESS(hdr), ==, ZIO_COMPRESS_OFF);
+ } else {
+ arc_hdr_set_flags(hdr, ARC_FLAG_COMPRESSED_ARC);
+ HDR_SET_COMPRESS(hdr, cmp);
+ ASSERT3U(HDR_GET_COMPRESS(hdr), ==, cmp);
+ ASSERT(HDR_COMPRESSION_ENABLED(hdr));
+ }
+}
+
+/*
+ * Looks for another buf on the same hdr which has the data decompressed, copies
+ * from it, and returns true. If no such buf exists, returns false.
+ */
+static boolean_t
+arc_buf_try_copy_decompressed_data(arc_buf_t *buf)
+{
+ arc_buf_hdr_t *hdr = buf->b_hdr;
+ boolean_t copied = B_FALSE;
+
+ ASSERT(HDR_HAS_L1HDR(hdr));
+ ASSERT3P(buf->b_data, !=, NULL);
+ ASSERT(!ARC_BUF_COMPRESSED(buf));
+
+ for (arc_buf_t *from = hdr->b_l1hdr.b_buf; from != NULL;
+ from = from->b_next) {
+ /* can't use our own data buffer */
+ if (from == buf) {
+ continue;
+ }
+
+ if (!ARC_BUF_COMPRESSED(from)) {
+ bcopy(from->b_data, buf->b_data, arc_buf_size(buf));
+ copied = B_TRUE;
+ break;
+ }
+ }
+
+ /*
+ * There were no decompressed bufs, so there should not be a
+ * checksum on the hdr either.
+ */
+ EQUIV(!copied, hdr->b_l1hdr.b_freeze_cksum == NULL);
+
+ return (copied);
+}
+
+/*
+ * Given a buf that has a data buffer attached to it, this function will
+ * efficiently fill the buf with data of the specified compression setting from
+ * the hdr and update the hdr's b_freeze_cksum if necessary. If the buf and hdr
+ * are already sharing a data buf, no copy is performed.
+ *
+ * If the buf is marked as compressed but uncompressed data was requested, this
+ * will allocate a new data buffer for the buf, remove that flag, and fill the
+ * buf with uncompressed data. You can't request a compressed buf on a hdr with
+ * uncompressed data, and (since we haven't added support for it yet) if you
+ * want compressed data your buf must already be marked as compressed and have
+ * the correct-sized data buffer.
+ */
+static int
+arc_buf_fill(arc_buf_t *buf, boolean_t compressed)
+{
+ arc_buf_hdr_t *hdr = buf->b_hdr;
+ boolean_t hdr_compressed = (HDR_GET_COMPRESS(hdr) != ZIO_COMPRESS_OFF);
+ dmu_object_byteswap_t bswap = hdr->b_l1hdr.b_byteswap;
+
+ ASSERT3P(buf->b_data, !=, NULL);
+ IMPLY(compressed, hdr_compressed);
+ IMPLY(compressed, ARC_BUF_COMPRESSED(buf));
+
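+	/*
+	 * Two cases follow: if the requested compression matches the
+	 * hdr's, copy (or share) the hdr's data as-is; otherwise the
+	 * hdr is compressed and an uncompressed buf was requested, so
+	 * unshare or reallocate b_data as needed and then decompress.
+	 */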
+ if (hdr_compressed == compressed) {
+ if (!arc_buf_is_shared(buf)) {
+ abd_copy_to_buf(buf->b_data, hdr->b_l1hdr.b_pabd,
+ arc_buf_size(buf));
+ }
+ } else {
+ ASSERT(hdr_compressed);
+ ASSERT(!compressed);
+ ASSERT3U(HDR_GET_LSIZE(hdr), !=, HDR_GET_PSIZE(hdr));
+
+ /*
+ * If the buf is sharing its data with the hdr, unlink it and
+ * allocate a new data buffer for the buf.
+ */
+ if (arc_buf_is_shared(buf)) {
+ ASSERT(ARC_BUF_COMPRESSED(buf));
+
+			/* We need to give the buf its own b_data */
+ buf->b_flags &= ~ARC_BUF_FLAG_SHARED;
+ buf->b_data =
+ arc_get_data_buf(hdr, HDR_GET_LSIZE(hdr), buf);
+ arc_hdr_clear_flags(hdr, ARC_FLAG_SHARED_DATA);
+
+ /* Previously overhead was 0; just add new overhead */
+ ARCSTAT_INCR(arcstat_overhead_size, HDR_GET_LSIZE(hdr));
+ } else if (ARC_BUF_COMPRESSED(buf)) {
+ /* We need to reallocate the buf's b_data */
+ arc_free_data_buf(hdr, buf->b_data, HDR_GET_PSIZE(hdr),
+ buf);
+ buf->b_data =
+ arc_get_data_buf(hdr, HDR_GET_LSIZE(hdr), buf);
+
+ /* We increased the size of b_data; update overhead */
+ ARCSTAT_INCR(arcstat_overhead_size,
+ HDR_GET_LSIZE(hdr) - HDR_GET_PSIZE(hdr));
+ }
+
+ /*
+ * Regardless of the buf's previous compression settings, it
+ * should not be compressed at the end of this function.
+ */
+ buf->b_flags &= ~ARC_BUF_FLAG_COMPRESSED;
+
+ /*
+ * Try copying the data from another buf which already has a
+ * decompressed version. If that's not possible, it's time to
+ * bite the bullet and decompress the data from the hdr.
+ */
+ if (arc_buf_try_copy_decompressed_data(buf)) {
+ /* Skip byteswapping and checksumming (already done) */
+ ASSERT3P(hdr->b_l1hdr.b_freeze_cksum, !=, NULL);
+ return (0);
+ } else {
+ int error = zio_decompress_data(HDR_GET_COMPRESS(hdr),
+ hdr->b_l1hdr.b_pabd, buf->b_data,
+ HDR_GET_PSIZE(hdr), HDR_GET_LSIZE(hdr));
+
+ /*
+ * Absent hardware errors or software bugs, this should
+ * be impossible, but log it anyway so we can debug it.
+ */
+ if (error != 0) {
+ zfs_dbgmsg(
+ "hdr %p, compress %d, psize %d, lsize %d",
+ hdr, HDR_GET_COMPRESS(hdr),
+ HDR_GET_PSIZE(hdr), HDR_GET_LSIZE(hdr));
+ return (SET_ERROR(EIO));
+ }
+ }
+ }
+
+ /* Byteswap the buf's data if necessary */
+ if (bswap != DMU_BSWAP_NUMFUNCS) {
+ ASSERT(!HDR_SHARED_DATA(hdr));
+ ASSERT3U(bswap, <, DMU_BSWAP_NUMFUNCS);
+ dmu_ot_byteswap[bswap].ob_func(buf->b_data, HDR_GET_LSIZE(hdr));
+ }
+
+ /* Compute the hdr's checksum if necessary */
+ arc_cksum_compute(buf);
+
+ return (0);
+}
+
+int
+arc_decompress(arc_buf_t *buf)
+{
+ return (arc_buf_fill(buf, B_FALSE));
+}
+
+/*
+ * Return the size of the block, b_pabd, that is stored in the arc_buf_hdr_t.
+ */
+static uint64_t
+arc_hdr_size(arc_buf_hdr_t *hdr)
+{
+ uint64_t size;
+
+ if (HDR_GET_COMPRESS(hdr) != ZIO_COMPRESS_OFF &&
+ HDR_GET_PSIZE(hdr) > 0) {
+ size = HDR_GET_PSIZE(hdr);
+ } else {
+ ASSERT3U(HDR_GET_LSIZE(hdr), !=, 0);
+ size = HDR_GET_LSIZE(hdr);
+ }
+ return (size);
+}
+
+/*
+ * Increment the amount of evictable space in the arc_state_t's refcount.
+ * We account for the space used by the hdr and the arc buf individually
+ * so that we can add and remove them from the refcount individually.
+ */
+static void
+arc_evictable_space_increment(arc_buf_hdr_t *hdr, arc_state_t *state)
+{
+ arc_buf_contents_t type = arc_buf_type(hdr);
+
+ ASSERT(HDR_HAS_L1HDR(hdr));
+
+ if (GHOST_STATE(state)) {
+ ASSERT0(hdr->b_l1hdr.b_bufcnt);
+ ASSERT3P(hdr->b_l1hdr.b_buf, ==, NULL);
+ ASSERT3P(hdr->b_l1hdr.b_pabd, ==, NULL);
+ (void) zfs_refcount_add_many(&state->arcs_esize[type],
+ HDR_GET_LSIZE(hdr), hdr);
+ return;
+ }
+
+ ASSERT(!GHOST_STATE(state));
+ if (hdr->b_l1hdr.b_pabd != NULL) {
+ (void) zfs_refcount_add_many(&state->arcs_esize[type],
+ arc_hdr_size(hdr), hdr);
+ }
+ for (arc_buf_t *buf = hdr->b_l1hdr.b_buf; buf != NULL;
+ buf = buf->b_next) {
+ if (arc_buf_is_shared(buf))
+ continue;
+ (void) zfs_refcount_add_many(&state->arcs_esize[type],
+ arc_buf_size(buf), buf);
+ }
+}
+
+/*
+ * Decrement the amount of evictable space in the arc_state_t's refcount.
+ * We account for the space used by the hdr and the arc buf individually
+ * so that we can add and remove them from the refcount individually.
+ */
+static void
+arc_evictable_space_decrement(arc_buf_hdr_t *hdr, arc_state_t *state)
+{
+ arc_buf_contents_t type = arc_buf_type(hdr);
+
+ ASSERT(HDR_HAS_L1HDR(hdr));
+
+ if (GHOST_STATE(state)) {
+ ASSERT0(hdr->b_l1hdr.b_bufcnt);
+ ASSERT3P(hdr->b_l1hdr.b_buf, ==, NULL);
+ ASSERT3P(hdr->b_l1hdr.b_pabd, ==, NULL);
+ (void) zfs_refcount_remove_many(&state->arcs_esize[type],
+ HDR_GET_LSIZE(hdr), hdr);
+ return;
+ }
+
+ ASSERT(!GHOST_STATE(state));
+ if (hdr->b_l1hdr.b_pabd != NULL) {
+ (void) zfs_refcount_remove_many(&state->arcs_esize[type],
+ arc_hdr_size(hdr), hdr);
+ }
+ for (arc_buf_t *buf = hdr->b_l1hdr.b_buf; buf != NULL;
+ buf = buf->b_next) {
+ if (arc_buf_is_shared(buf))
+ continue;
+ (void) zfs_refcount_remove_many(&state->arcs_esize[type],
+ arc_buf_size(buf), buf);
+ }
+}
+
+/*
+ * Add a reference to this hdr indicating that someone is actively
+ * referencing that memory. When the refcount transitions from 0 to 1,
+ * we remove it from the respective arc_state_t list to indicate that
+ * it is not evictable.
+ */
+static void
+add_reference(arc_buf_hdr_t *hdr, void *tag)
+{
+ ASSERT(HDR_HAS_L1HDR(hdr));
+ if (!MUTEX_HELD(HDR_LOCK(hdr))) {
+ ASSERT(hdr->b_l1hdr.b_state == arc_anon);
+ ASSERT(zfs_refcount_is_zero(&hdr->b_l1hdr.b_refcnt));
+ ASSERT3P(hdr->b_l1hdr.b_buf, ==, NULL);
+ }
+
+ arc_state_t *state = hdr->b_l1hdr.b_state;
+
+ if ((zfs_refcount_add(&hdr->b_l1hdr.b_refcnt, tag) == 1) &&
+ (state != arc_anon)) {
+ /* We don't use the L2-only state list. */
+ if (state != arc_l2c_only) {
+ multilist_remove(state->arcs_list[arc_buf_type(hdr)],
+ hdr);
+ arc_evictable_space_decrement(hdr, state);
+ }
+ /* remove the prefetch flag if we get a reference */
+ arc_hdr_clear_flags(hdr, ARC_FLAG_PREFETCH);
+ }
+}
+
+/*
+ * Remove a reference from this hdr. When the reference transitions from
+ * 1 to 0 and we're not anonymous, then we add this hdr to the arc_state_t's
+ * list making it eligible for eviction.
+ */
+static int
+remove_reference(arc_buf_hdr_t *hdr, kmutex_t *hash_lock, void *tag)
+{
+ int cnt;
+ arc_state_t *state = hdr->b_l1hdr.b_state;
+
+ ASSERT(HDR_HAS_L1HDR(hdr));
+ ASSERT(state == arc_anon || MUTEX_HELD(hash_lock));
+ ASSERT(!GHOST_STATE(state));
+
+ /*
+ * arc_l2c_only counts as a ghost state so we don't need to explicitly
+ * check to prevent usage of the arc_l2c_only list.
+ */
+ if (((cnt = zfs_refcount_remove(&hdr->b_l1hdr.b_refcnt, tag)) == 0) &&
+ (state != arc_anon)) {
+ multilist_insert(state->arcs_list[arc_buf_type(hdr)], hdr);
+ ASSERT3U(hdr->b_l1hdr.b_bufcnt, >, 0);
+ arc_evictable_space_increment(hdr, state);
+ }
+ return (cnt);
+}
+
+/*
+ * Returns detailed information about a specific arc buffer. When the
+ * state_index argument is set the function will calculate the arc header
+ * list position for its arc state. Since this requires a linear traversal
+ * callers are strongly encourage not to do this. However, it can be helpful
+ * for targeted analysis so the functionality is provided.
+ */
+void
+arc_buf_info(arc_buf_t *ab, arc_buf_info_t *abi, int state_index)
+{
+ arc_buf_hdr_t *hdr = ab->b_hdr;
+ l1arc_buf_hdr_t *l1hdr = NULL;
+ l2arc_buf_hdr_t *l2hdr = NULL;
+ arc_state_t *state = NULL;
+
+ memset(abi, 0, sizeof (arc_buf_info_t));
+
+ if (hdr == NULL)
+ return;
+
+ abi->abi_flags = hdr->b_flags;
+
+ if (HDR_HAS_L1HDR(hdr)) {
+ l1hdr = &hdr->b_l1hdr;
+ state = l1hdr->b_state;
+ }
+ if (HDR_HAS_L2HDR(hdr))
+ l2hdr = &hdr->b_l2hdr;
+
+ if (l1hdr) {
+ abi->abi_bufcnt = l1hdr->b_bufcnt;
+ abi->abi_access = l1hdr->b_arc_access;
+ abi->abi_mru_hits = l1hdr->b_mru_hits;
+ abi->abi_mru_ghost_hits = l1hdr->b_mru_ghost_hits;
+ abi->abi_mfu_hits = l1hdr->b_mfu_hits;
+ abi->abi_mfu_ghost_hits = l1hdr->b_mfu_ghost_hits;
+ abi->abi_holds = zfs_refcount_count(&l1hdr->b_refcnt);
+ }
+
+ if (l2hdr) {
+ abi->abi_l2arc_dattr = l2hdr->b_daddr;
+ abi->abi_l2arc_hits = l2hdr->b_hits;
+ }
+
+ abi->abi_state_type = state ? state->arcs_state : ARC_STATE_ANON;
+ abi->abi_state_contents = arc_buf_type(hdr);
+ abi->abi_size = arc_hdr_size(hdr);
+}
+
+/*
+ * Move the supplied buffer to the indicated state. The hash lock
+ * for the buffer must be held by the caller.
+ */
+static void
+arc_change_state(arc_state_t *new_state, arc_buf_hdr_t *hdr,
+ kmutex_t *hash_lock)
+{
+ arc_state_t *old_state;
+ int64_t refcnt;
+ uint32_t bufcnt;
+ boolean_t update_old, update_new;
+ arc_buf_contents_t buftype = arc_buf_type(hdr);
+
+ /*
+ * We almost always have an L1 hdr here, since we call arc_hdr_realloc()
+ * in arc_read() when bringing a buffer out of the L2ARC. However, the
+ * L1 hdr doesn't always exist when we change state to arc_anon before
+ * destroying a header, in which case reallocating to add the L1 hdr is
+ * pointless.
+ */
+ if (HDR_HAS_L1HDR(hdr)) {
+ old_state = hdr->b_l1hdr.b_state;
+ refcnt = zfs_refcount_count(&hdr->b_l1hdr.b_refcnt);
+ bufcnt = hdr->b_l1hdr.b_bufcnt;
+ update_old = (bufcnt > 0 || hdr->b_l1hdr.b_pabd != NULL);
+ } else {
+ old_state = arc_l2c_only;
+ refcnt = 0;
+ bufcnt = 0;
+ update_old = B_FALSE;
+ }
+ update_new = update_old;
+
+ ASSERT(MUTEX_HELD(hash_lock));
+ ASSERT3P(new_state, !=, old_state);
+ ASSERT(!GHOST_STATE(new_state) || bufcnt == 0);
+ ASSERT(old_state != arc_anon || bufcnt <= 1);
+
+ /*
+ * If this buffer is evictable, transfer it from the
+ * old state list to the new state list.
+ */
+ if (refcnt == 0) {
+ if (old_state != arc_anon && old_state != arc_l2c_only) {
+ ASSERT(HDR_HAS_L1HDR(hdr));
+ multilist_remove(old_state->arcs_list[buftype], hdr);
+
+ if (GHOST_STATE(old_state)) {
+ ASSERT0(bufcnt);
+ ASSERT3P(hdr->b_l1hdr.b_buf, ==, NULL);
+ update_old = B_TRUE;
+ }
+ arc_evictable_space_decrement(hdr, old_state);
+ }
+ if (new_state != arc_anon && new_state != arc_l2c_only) {
+
+ /*
+ * An L1 header always exists here, since if we're
+ * moving to some L1-cached state (i.e. not l2c_only or
+ * anonymous), we realloc the header to add an L1hdr
+ * beforehand.
+ */
+ ASSERT(HDR_HAS_L1HDR(hdr));
+ multilist_insert(new_state->arcs_list[buftype], hdr);
+
+ if (GHOST_STATE(new_state)) {
+ ASSERT0(bufcnt);
+ ASSERT3P(hdr->b_l1hdr.b_buf, ==, NULL);
+ update_new = B_TRUE;
+ }
+ arc_evictable_space_increment(hdr, new_state);
+ }
+ }
+
+ ASSERT(!HDR_EMPTY(hdr));
+ if (new_state == arc_anon && HDR_IN_HASH_TABLE(hdr))
+ buf_hash_remove(hdr);
+
+ /* adjust state sizes (ignore arc_l2c_only) */
+
+ if (update_new && new_state != arc_l2c_only) {
+ ASSERT(HDR_HAS_L1HDR(hdr));
+ if (GHOST_STATE(new_state)) {
+ ASSERT0(bufcnt);
+
+ /*
+ * When moving a header to a ghost state, we first
+ * remove all arc buffers. Thus, we'll have a
+ * bufcnt of zero, and no arc buffer to use for
+ * the reference. As a result, we use the arc
+ * header pointer for the reference.
+ */
+ (void) zfs_refcount_add_many(&new_state->arcs_size,
+ HDR_GET_LSIZE(hdr), hdr);
+ ASSERT3P(hdr->b_l1hdr.b_pabd, ==, NULL);
+ } else {
+ uint32_t buffers = 0;
+
+ /*
+ * Each individual buffer holds a unique reference,
+ * thus we must remove each of these references one
+ * at a time.
+ */
+ for (arc_buf_t *buf = hdr->b_l1hdr.b_buf; buf != NULL;
+ buf = buf->b_next) {
+ ASSERT3U(bufcnt, !=, 0);
+ buffers++;
+
+ /*
+ * When the arc_buf_t is sharing the data
+ * block with the hdr, the owner of the
+ * reference belongs to the hdr. Only
+ * add to the refcount if the arc_buf_t is
+ * not shared.
+ */
+ if (arc_buf_is_shared(buf))
+ continue;
+
+ (void) zfs_refcount_add_many(
+ &new_state->arcs_size,
+ arc_buf_size(buf), buf);
+ }
+ ASSERT3U(bufcnt, ==, buffers);
+
+ if (hdr->b_l1hdr.b_pabd != NULL) {
+ (void) zfs_refcount_add_many(
+ &new_state->arcs_size,
+ arc_hdr_size(hdr), hdr);
+ } else {
+ ASSERT(GHOST_STATE(old_state));
+ }
+ }
+ }
+
+ if (update_old && old_state != arc_l2c_only) {
+ ASSERT(HDR_HAS_L1HDR(hdr));
+ if (GHOST_STATE(old_state)) {
+ ASSERT0(bufcnt);
+ ASSERT3P(hdr->b_l1hdr.b_pabd, ==, NULL);
+
+ /*
+ * When moving a header off of a ghost state,
+ * the header will not contain any arc buffers.
+ * We use the arc header pointer for the reference
+ * which is exactly what we did when we put the
+ * header on the ghost state.
+ */
+
+ (void) zfs_refcount_remove_many(&old_state->arcs_size,
+ HDR_GET_LSIZE(hdr), hdr);
+ } else {
+ uint32_t buffers = 0;
+
+ /*
+ * Each individual buffer holds a unique reference,
+ * thus we must remove each of these references one
+ * at a time.
+ */
+ for (arc_buf_t *buf = hdr->b_l1hdr.b_buf; buf != NULL;
+ buf = buf->b_next) {
+ ASSERT3U(bufcnt, !=, 0);
+ buffers++;
+
+ /*
+ * When the arc_buf_t is sharing the data
+ * block with the hdr, the owner of the
+ * reference belongs to the hdr. Only
+ * add to the refcount if the arc_buf_t is
+ * not shared.
+ */
+ if (arc_buf_is_shared(buf))
+ continue;
+
+ (void) zfs_refcount_remove_many(
+ &old_state->arcs_size, arc_buf_size(buf),
+ buf);
+ }
+ ASSERT3U(bufcnt, ==, buffers);
+ ASSERT3P(hdr->b_l1hdr.b_pabd, !=, NULL);
+ (void) zfs_refcount_remove_many(
+ &old_state->arcs_size, arc_hdr_size(hdr), hdr);
+ }
+ }
+
+ if (HDR_HAS_L1HDR(hdr))
+ hdr->b_l1hdr.b_state = new_state;
+
+ /*
+	 * L2-only headers should never be on the arc_l2c_only state lists,
+	 * since they don't have L1 headers allocated.
+ */
+ ASSERT(multilist_is_empty(arc_l2c_only->arcs_list[ARC_BUFC_DATA]) &&
+ multilist_is_empty(arc_l2c_only->arcs_list[ARC_BUFC_METADATA]));
+}
+
+void
+arc_space_consume(uint64_t space, arc_space_type_t type)
+{
+ ASSERT(type >= 0 && type < ARC_SPACE_NUMTYPES);
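+
+	/*
+	 * These counters are aggsums: updates touch only a (roughly
+	 * per-CPU) bucket, while the precise global total is computed
+	 * only on the comparatively rare reads, e.g. for arcstats.
+	 */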
+
+ switch (type) {
+ case ARC_SPACE_DATA:
+ aggsum_add(&astat_data_size, space);
+ break;
+ case ARC_SPACE_META:
+ aggsum_add(&astat_metadata_size, space);
+ break;
+ case ARC_SPACE_BONUS:
+ aggsum_add(&astat_bonus_size, space);
+ break;
+ case ARC_SPACE_DNODE:
+ aggsum_add(&astat_dnode_size, space);
+ break;
+ case ARC_SPACE_DBUF:
+ aggsum_add(&astat_dbuf_size, space);
+ break;
+ case ARC_SPACE_HDRS:
+ aggsum_add(&astat_hdr_size, space);
+ break;
+ case ARC_SPACE_L2HDRS:
+ aggsum_add(&astat_l2_hdr_size, space);
+ break;
+ }
+
+ if (type != ARC_SPACE_DATA)
+ aggsum_add(&arc_meta_used, space);
+
+ aggsum_add(&arc_size, space);
+}
+
+void
+arc_space_return(uint64_t space, arc_space_type_t type)
+{
+ ASSERT(type >= 0 && type < ARC_SPACE_NUMTYPES);
+
+ switch (type) {
+ case ARC_SPACE_DATA:
+ aggsum_add(&astat_data_size, -space);
+ break;
+ case ARC_SPACE_META:
+ aggsum_add(&astat_metadata_size, -space);
+ break;
+ case ARC_SPACE_BONUS:
+ aggsum_add(&astat_bonus_size, -space);
+ break;
+ case ARC_SPACE_DNODE:
+ aggsum_add(&astat_dnode_size, -space);
+ break;
+ case ARC_SPACE_DBUF:
+ aggsum_add(&astat_dbuf_size, -space);
+ break;
+ case ARC_SPACE_HDRS:
+ aggsum_add(&astat_hdr_size, -space);
+ break;
+ case ARC_SPACE_L2HDRS:
+ aggsum_add(&astat_l2_hdr_size, -space);
+ break;
+ }
+
+ if (type != ARC_SPACE_DATA) {
+ ASSERT(aggsum_compare(&arc_meta_used, space) >= 0);
+ /*
+ * We use the upper bound here rather than the precise value
+ * because the arc_meta_max value doesn't need to be
+ * precise. It's only consumed by humans via arcstats.
+ */
+ if (arc_meta_max < aggsum_upper_bound(&arc_meta_used))
+ arc_meta_max = aggsum_upper_bound(&arc_meta_used);
+ aggsum_add(&arc_meta_used, -space);
+ }
+
+ ASSERT(aggsum_compare(&arc_size, space) >= 0);
+ aggsum_add(&arc_size, -space);
+}
+
+/*
+ * Given a hdr and a buf, returns whether that buf can share its b_data buffer
+ * with the hdr's b_pabd.
+ */
+static boolean_t
+arc_can_share(arc_buf_hdr_t *hdr, arc_buf_t *buf)
+{
+ /*
+ * The criteria for sharing a hdr's data are:
+ * 1. the hdr's compression matches the buf's compression
+ * 2. the hdr doesn't need to be byteswapped
+ * 3. the hdr isn't already being shared
+ * 4. the buf is either compressed or it is the last buf in the hdr list
+ *
+ * Criterion #4 maintains the invariant that shared uncompressed
+ * bufs must be the final buf in the hdr's b_buf list. Reading this, you
+ * might ask, "if a compressed buf is allocated first, won't that be the
+ * last thing in the list?", but in that case it's impossible to create
+ * a shared uncompressed buf anyway (because the hdr must be compressed
+ * to have the compressed buf). You might also think that #3 is
+	 * sufficient to make this guarantee; however, it's possible
+	 * (specifically in the rare L2ARC write race mentioned in
+	 * arc_buf_alloc_impl()) that there will be an existing uncompressed
+	 * buf that is sharable, but wasn't at the time of its allocation.
+	 * Rather than
+ * allow a new shared uncompressed buf to be created and then shuffle
+ * the list around to make it the last element, this simply disallows
+ * sharing if the new buf isn't the first to be added.
+ */
+ ASSERT3P(buf->b_hdr, ==, hdr);
+ boolean_t hdr_compressed = HDR_GET_COMPRESS(hdr) != ZIO_COMPRESS_OFF;
+ boolean_t buf_compressed = ARC_BUF_COMPRESSED(buf) != 0;
+ return (buf_compressed == hdr_compressed &&
+ hdr->b_l1hdr.b_byteswap == DMU_BSWAP_NUMFUNCS &&
+ !HDR_SHARED_DATA(hdr) &&
+ (ARC_BUF_LAST(buf) || ARC_BUF_COMPRESSED(buf)));
+}
+
+/*
+ * Allocate a buf for this hdr. If you care about the data that's in the hdr,
+ * or if you want a compressed buffer, pass those flags in. Returns 0 if the
+ * copy was made successfully, or an error code otherwise.
+ */
+static int
+arc_buf_alloc_impl(arc_buf_hdr_t *hdr, void *tag, boolean_t compressed,
+ boolean_t fill, arc_buf_t **ret)
+{
+ arc_buf_t *buf;
+
+ ASSERT(HDR_HAS_L1HDR(hdr));
+ ASSERT3U(HDR_GET_LSIZE(hdr), >, 0);
+ VERIFY(hdr->b_type == ARC_BUFC_DATA ||
+ hdr->b_type == ARC_BUFC_METADATA);
+ ASSERT3P(ret, !=, NULL);
+ ASSERT3P(*ret, ==, NULL);
+
+ buf = *ret = kmem_cache_alloc(buf_cache, KM_PUSHPAGE);
+ buf->b_hdr = hdr;
+ buf->b_data = NULL;
+ buf->b_next = hdr->b_l1hdr.b_buf;
+ buf->b_flags = 0;
+
+ add_reference(hdr, tag);
+
+ /*
+ * We're about to change the hdr's b_flags. We must either
+ * hold the hash_lock or be undiscoverable.
+ */
+ ASSERT(MUTEX_HELD(HDR_LOCK(hdr)) || HDR_EMPTY(hdr));
+
+ /*
+ * Only honor requests for compressed bufs if the hdr is actually
+ * compressed.
+ */
+ if (compressed && HDR_GET_COMPRESS(hdr) != ZIO_COMPRESS_OFF)
+ buf->b_flags |= ARC_BUF_FLAG_COMPRESSED;
+
+ /*
+ * If the hdr's data can be shared then we share the data buffer and
+ * set the appropriate bit in the hdr's b_flags to indicate the hdr is
+	 * sharing its b_pabd with the arc_buf_t. Otherwise, we allocate a new
+ * buffer to store the buf's data.
+ *
+ * There are two additional restrictions here because we're sharing
+ * hdr -> buf instead of the usual buf -> hdr. First, the hdr can't be
+ * actively involved in an L2ARC write, because if this buf is used by
+ * an arc_write() then the hdr's data buffer will be released when the
+ * write completes, even though the L2ARC write might still be using it.
+ * Second, the hdr's ABD must be linear so that the buf's user doesn't
+ * need to be ABD-aware.
+ */
+ boolean_t can_share = arc_can_share(hdr, buf) && !HDR_L2_WRITING(hdr) &&
+ abd_is_linear(hdr->b_l1hdr.b_pabd);
+
+ /* Set up b_data and sharing */
+ if (can_share) {
+ buf->b_data = abd_to_buf(hdr->b_l1hdr.b_pabd);
+ buf->b_flags |= ARC_BUF_FLAG_SHARED;
+ arc_hdr_set_flags(hdr, ARC_FLAG_SHARED_DATA);
+ } else {
+ buf->b_data =
+ arc_get_data_buf(hdr, arc_buf_size(buf), buf);
+ ARCSTAT_INCR(arcstat_overhead_size, arc_buf_size(buf));
+ }
+ VERIFY3P(buf->b_data, !=, NULL);
+
+ hdr->b_l1hdr.b_buf = buf;
+ hdr->b_l1hdr.b_bufcnt += 1;
+
+ /*
+ * If the user wants the data from the hdr, we need to either copy or
+ * decompress the data.
+ */
+ if (fill) {
+ return (arc_buf_fill(buf, ARC_BUF_COMPRESSED(buf) != 0));
+ }
+
+ return (0);
+}
+
+static char *arc_onloan_tag = "onloan";
+
+static inline void
+arc_loaned_bytes_update(int64_t delta)
+{
+ atomic_add_64(&arc_loaned_bytes, delta);
+
+ /* assert that it did not wrap around */
+ ASSERT3S(atomic_add_64_nv(&arc_loaned_bytes, 0), >=, 0);
+}
+
+/*
+ * Loan out an anonymous arc buffer. Loaned buffers are not counted as in
+ * flight data by arc_tempreserve_space() until they are "returned". Loaned
+ * buffers must be returned to the arc before they can be used by the DMU or
+ * freed.
+ */
+arc_buf_t *
+arc_loan_buf(spa_t *spa, boolean_t is_metadata, int size)
+{
+ arc_buf_t *buf = arc_alloc_buf(spa, arc_onloan_tag,
+ is_metadata ? ARC_BUFC_METADATA : ARC_BUFC_DATA, size);
+
+ arc_loaned_bytes_update(arc_buf_size(buf));
+
+ return (buf);
+}
+
+arc_buf_t *
+arc_loan_compressed_buf(spa_t *spa, uint64_t psize, uint64_t lsize,
+ enum zio_compress compression_type)
+{
+ arc_buf_t *buf = arc_alloc_compressed_buf(spa, arc_onloan_tag,
+ psize, lsize, compression_type);
+
+ arc_loaned_bytes_update(arc_buf_size(buf));
+
+ return (buf);
+}
+
+/*
+ * Return a loaned arc buffer to the arc.
+ */
+void
+arc_return_buf(arc_buf_t *buf, void *tag)
+{
+ arc_buf_hdr_t *hdr = buf->b_hdr;
+
+ ASSERT3P(buf->b_data, !=, NULL);
+ ASSERT(HDR_HAS_L1HDR(hdr));
+ (void) zfs_refcount_add(&hdr->b_l1hdr.b_refcnt, tag);
+ (void) zfs_refcount_remove(&hdr->b_l1hdr.b_refcnt, arc_onloan_tag);
+
+ arc_loaned_bytes_update(-arc_buf_size(buf));
+}
+
+/* Detach an arc_buf from a dbuf (tag) */
+void
+arc_loan_inuse_buf(arc_buf_t *buf, void *tag)
+{
+ arc_buf_hdr_t *hdr = buf->b_hdr;
+
+ ASSERT3P(buf->b_data, !=, NULL);
+ ASSERT(HDR_HAS_L1HDR(hdr));
+ (void) zfs_refcount_add(&hdr->b_l1hdr.b_refcnt, arc_onloan_tag);
+ (void) zfs_refcount_remove(&hdr->b_l1hdr.b_refcnt, tag);
+
+ arc_loaned_bytes_update(arc_buf_size(buf));
+}
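+
+/*
+ * Illustrative loan lifecycle (hypothetical caller, not taken from
+ * this file), assuming the buffer is filled and then handed back:
+ *
+ *	arc_buf_t *buf = arc_loan_buf(spa, B_FALSE, size);
+ *	... fill buf->b_data ...
+ *	arc_return_buf(buf, FTAG);
+ */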
+
+static void
+l2arc_free_abd_on_write(abd_t *abd, size_t size, arc_buf_contents_t type)
+{
+ l2arc_data_free_t *df = kmem_alloc(sizeof (*df), KM_SLEEP);
+
+ df->l2df_abd = abd;
+ df->l2df_size = size;
+ df->l2df_type = type;
+ mutex_enter(&l2arc_free_on_write_mtx);
+ list_insert_head(l2arc_free_on_write, df);
+ mutex_exit(&l2arc_free_on_write_mtx);
+}
+
+static void
+arc_hdr_free_on_write(arc_buf_hdr_t *hdr)
+{
+ arc_state_t *state = hdr->b_l1hdr.b_state;
+ arc_buf_contents_t type = arc_buf_type(hdr);
+ uint64_t size = arc_hdr_size(hdr);
+
+ /* protected by hash lock, if in the hash table */
+ if (multilist_link_active(&hdr->b_l1hdr.b_arc_node)) {
+ ASSERT(zfs_refcount_is_zero(&hdr->b_l1hdr.b_refcnt));
+ ASSERT(state != arc_anon && state != arc_l2c_only);
+
+ (void) zfs_refcount_remove_many(&state->arcs_esize[type],
+ size, hdr);
+ }
+ (void) zfs_refcount_remove_many(&state->arcs_size, size, hdr);
+ if (type == ARC_BUFC_METADATA) {
+ arc_space_return(size, ARC_SPACE_META);
+ } else {
+ ASSERT(type == ARC_BUFC_DATA);
+ arc_space_return(size, ARC_SPACE_DATA);
+ }
+
+ l2arc_free_abd_on_write(hdr->b_l1hdr.b_pabd, size, type);
+}
+
+/*
+ * Share the arc_buf_t's data with the hdr. Whenever we are sharing the
+ * data buffer, we transfer the refcount ownership to the hdr and update
+ * the appropriate kstats.
+ */
+static void
+arc_share_buf(arc_buf_hdr_t *hdr, arc_buf_t *buf)
+{
+ arc_state_t *state = hdr->b_l1hdr.b_state;
+
+ ASSERT(arc_can_share(hdr, buf));
+ ASSERT3P(hdr->b_l1hdr.b_pabd, ==, NULL);
+ ASSERT(MUTEX_HELD(HDR_LOCK(hdr)) || HDR_EMPTY(hdr));
+
+ /*
+ * Start sharing the data buffer. We transfer the
+ * refcount ownership to the hdr since it always owns
+ * the refcount whenever an arc_buf_t is shared.
+ */
+ zfs_refcount_transfer_ownership(&state->arcs_size, buf, hdr);
+ hdr->b_l1hdr.b_pabd = abd_get_from_buf(buf->b_data, arc_buf_size(buf));
+ abd_take_ownership_of_buf(hdr->b_l1hdr.b_pabd,
+ HDR_ISTYPE_METADATA(hdr));
+ arc_hdr_set_flags(hdr, ARC_FLAG_SHARED_DATA);
+ buf->b_flags |= ARC_BUF_FLAG_SHARED;
+
+ /*
+ * Since we've transferred ownership to the hdr we need
+ * to increment its compressed and uncompressed kstats and
+ * decrement the overhead size.
+ */
+ ARCSTAT_INCR(arcstat_compressed_size, arc_hdr_size(hdr));
+ ARCSTAT_INCR(arcstat_uncompressed_size, HDR_GET_LSIZE(hdr));
+ ARCSTAT_INCR(arcstat_overhead_size, -arc_buf_size(buf));
+}
+
+static void
+arc_unshare_buf(arc_buf_hdr_t *hdr, arc_buf_t *buf)
+{
+ arc_state_t *state = hdr->b_l1hdr.b_state;
+
+ ASSERT(arc_buf_is_shared(buf));
+ ASSERT3P(hdr->b_l1hdr.b_pabd, !=, NULL);
+ ASSERT(MUTEX_HELD(HDR_LOCK(hdr)) || HDR_EMPTY(hdr));
+
+ /*
+ * We are no longer sharing this buffer so we need
+ * to transfer its ownership to the rightful owner.
+ */
+ zfs_refcount_transfer_ownership(&state->arcs_size, hdr, buf);
+ arc_hdr_clear_flags(hdr, ARC_FLAG_SHARED_DATA);
+ abd_release_ownership_of_buf(hdr->b_l1hdr.b_pabd);
+ abd_put(hdr->b_l1hdr.b_pabd);
+ hdr->b_l1hdr.b_pabd = NULL;
+ buf->b_flags &= ~ARC_BUF_FLAG_SHARED;
+
+ /*
+ * Since the buffer is no longer shared between
+ * the arc buf and the hdr, count it as overhead.
+ */
+ ARCSTAT_INCR(arcstat_compressed_size, -arc_hdr_size(hdr));
+ ARCSTAT_INCR(arcstat_uncompressed_size, -HDR_GET_LSIZE(hdr));
+ ARCSTAT_INCR(arcstat_overhead_size, arc_buf_size(buf));
+}
+
+/*
+ * Remove an arc_buf_t from the hdr's buf list and return the last
+ * arc_buf_t on the list. If no buffers remain on the list then return
+ * NULL.
+ */
+static arc_buf_t *
+arc_buf_remove(arc_buf_hdr_t *hdr, arc_buf_t *buf)
+{
+ ASSERT(HDR_HAS_L1HDR(hdr));
+ ASSERT(MUTEX_HELD(HDR_LOCK(hdr)) || HDR_EMPTY(hdr));
+
+ arc_buf_t **bufp = &hdr->b_l1hdr.b_buf;
+ arc_buf_t *lastbuf = NULL;
+
+ /*
+ * Remove the buf from the hdr list and locate the last
+ * remaining buffer on the list.
+ */
+ while (*bufp != NULL) {
+ if (*bufp == buf)
+ *bufp = buf->b_next;
+
+ /*
+ * If we've removed a buffer in the middle of
+ * the list then update the lastbuf and update
+ * bufp.
+ */
+ if (*bufp != NULL) {
+ lastbuf = *bufp;
+ bufp = &(*bufp)->b_next;
+ }
+ }
+ buf->b_next = NULL;
+ ASSERT3P(lastbuf, !=, buf);
+ IMPLY(hdr->b_l1hdr.b_bufcnt > 0, lastbuf != NULL);
+ IMPLY(hdr->b_l1hdr.b_bufcnt > 0, hdr->b_l1hdr.b_buf != NULL);
+ IMPLY(lastbuf != NULL, ARC_BUF_LAST(lastbuf));
+
+ return (lastbuf);
+}
+
+/*
+ * Free up buf->b_data and pull the arc_buf_t off of the arc_buf_hdr_t's
+ * list and free it.
+ */
+static void
+arc_buf_destroy_impl(arc_buf_t *buf)
+{
+ arc_buf_hdr_t *hdr = buf->b_hdr;
+
+ /*
+ * Free up the data associated with the buf but only if we're not
+ * sharing this with the hdr. If we are sharing it with the hdr, the
+ * hdr is responsible for doing the free.
+ */
+ if (buf->b_data != NULL) {
+ /*
+ * We're about to change the hdr's b_flags. We must either
+ * hold the hash_lock or be undiscoverable.
+ */
+ ASSERT(MUTEX_HELD(HDR_LOCK(hdr)) || HDR_EMPTY(hdr));
+
+ arc_cksum_verify(buf);
+#ifdef illumos
+ arc_buf_unwatch(buf);
+#endif
+
+ if (arc_buf_is_shared(buf)) {
+ arc_hdr_clear_flags(hdr, ARC_FLAG_SHARED_DATA);
+ } else {
+ uint64_t size = arc_buf_size(buf);
+ arc_free_data_buf(hdr, buf->b_data, size, buf);
+ ARCSTAT_INCR(arcstat_overhead_size, -size);
+ }
+ buf->b_data = NULL;
+
+ ASSERT(hdr->b_l1hdr.b_bufcnt > 0);
+ hdr->b_l1hdr.b_bufcnt -= 1;
+ }
+
+ arc_buf_t *lastbuf = arc_buf_remove(hdr, buf);
+
+ if (ARC_BUF_SHARED(buf) && !ARC_BUF_COMPRESSED(buf)) {
+ /*
+ * If the current arc_buf_t is sharing its data buffer with the
+ * hdr, then reassign the hdr's b_pabd to share it with the new
+ * buffer at the end of the list. The shared buffer is always
+ * the last one on the hdr's buffer list.
+ *
+ * There is an equivalent case for compressed bufs, but since
+ * they aren't guaranteed to be the last buf in the list and
+		 * that is an exceedingly rare case, we just allow that space
+		 * to be
+ * wasted temporarily.
+ */
+ if (lastbuf != NULL) {
+ /* Only one buf can be shared at once */
+ VERIFY(!arc_buf_is_shared(lastbuf));
+ /* hdr is uncompressed so can't have compressed buf */
+ VERIFY(!ARC_BUF_COMPRESSED(lastbuf));
+
+ ASSERT3P(hdr->b_l1hdr.b_pabd, !=, NULL);
+ arc_hdr_free_pabd(hdr);
+
+ /*
+ * We must setup a new shared block between the
+ * last buffer and the hdr. The data would have
+ * been allocated by the arc buf so we need to transfer
+ * ownership to the hdr since it's now being shared.
+ */
+ arc_share_buf(hdr, lastbuf);
+ }
+ } else if (HDR_SHARED_DATA(hdr)) {
+ /*
+ * Uncompressed shared buffers are always at the end
+ * of the list. Compressed buffers don't have the
+ * same requirements. This makes it hard to
+ * simply assert that the lastbuf is shared so
+ * we rely on the hdr's compression flags to determine
+ * if we have a compressed, shared buffer.
+ */
+ ASSERT3P(lastbuf, !=, NULL);
+ ASSERT(arc_buf_is_shared(lastbuf) ||
+ HDR_GET_COMPRESS(hdr) != ZIO_COMPRESS_OFF);
+ }
+
+ /*
+ * Free the checksum if we're removing the last uncompressed buf from
+ * this hdr.
+ */
+ if (!arc_hdr_has_uncompressed_buf(hdr)) {
+ arc_cksum_free(hdr);
+ }
+
+ /* clean up the buf */
+ buf->b_hdr = NULL;
+ kmem_cache_free(buf_cache, buf);
+}
+
+static void
+arc_hdr_alloc_pabd(arc_buf_hdr_t *hdr, boolean_t do_adapt)
+{
+ ASSERT3U(HDR_GET_LSIZE(hdr), >, 0);
+ ASSERT(HDR_HAS_L1HDR(hdr));
+ ASSERT(!HDR_SHARED_DATA(hdr));
+
+ ASSERT3P(hdr->b_l1hdr.b_pabd, ==, NULL);
+	hdr->b_l1hdr.b_pabd = arc_get_data_abd(hdr, arc_hdr_size(hdr), hdr,
+	    do_adapt);
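+	/* DMU_BSWAP_NUMFUNCS is the sentinel meaning "no byteswap needed". */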
+ hdr->b_l1hdr.b_byteswap = DMU_BSWAP_NUMFUNCS;
+ ASSERT3P(hdr->b_l1hdr.b_pabd, !=, NULL);
+
+ ARCSTAT_INCR(arcstat_compressed_size, arc_hdr_size(hdr));
+ ARCSTAT_INCR(arcstat_uncompressed_size, HDR_GET_LSIZE(hdr));
+}
+
+static void
+arc_hdr_free_pabd(arc_buf_hdr_t *hdr)
+{
+ ASSERT(HDR_HAS_L1HDR(hdr));
+ ASSERT3P(hdr->b_l1hdr.b_pabd, !=, NULL);
+
+ /*
+ * If the hdr is currently being written to the l2arc then
+ * we defer freeing the data by adding it to the l2arc_free_on_write
+ * list. The l2arc will free the data once it's finished
+ * writing it to the l2arc device.
+ */
+ if (HDR_L2_WRITING(hdr)) {
+ arc_hdr_free_on_write(hdr);
+ ARCSTAT_BUMP(arcstat_l2_free_on_write);
+ } else {
+ arc_free_data_abd(hdr, hdr->b_l1hdr.b_pabd,
+ arc_hdr_size(hdr), hdr);
+ }
+ hdr->b_l1hdr.b_pabd = NULL;
+ hdr->b_l1hdr.b_byteswap = DMU_BSWAP_NUMFUNCS;
+
+ ARCSTAT_INCR(arcstat_compressed_size, -arc_hdr_size(hdr));
+ ARCSTAT_INCR(arcstat_uncompressed_size, -HDR_GET_LSIZE(hdr));
+}
+
+static arc_buf_hdr_t *
+arc_hdr_alloc(uint64_t spa, int32_t psize, int32_t lsize,
+ enum zio_compress compression_type, arc_buf_contents_t type)
+{
+ arc_buf_hdr_t *hdr;
+
+ VERIFY(type == ARC_BUFC_DATA || type == ARC_BUFC_METADATA);
+
+ hdr = kmem_cache_alloc(hdr_full_cache, KM_PUSHPAGE);
+ ASSERT(HDR_EMPTY(hdr));
+ ASSERT3P(hdr->b_l1hdr.b_freeze_cksum, ==, NULL);
+ ASSERT3P(hdr->b_l1hdr.b_thawed, ==, NULL);
+ HDR_SET_PSIZE(hdr, psize);
+ HDR_SET_LSIZE(hdr, lsize);
+ hdr->b_spa = spa;
+ hdr->b_type = type;
+ hdr->b_flags = 0;
+ arc_hdr_set_flags(hdr, arc_bufc_to_flags(type) | ARC_FLAG_HAS_L1HDR);
+ arc_hdr_set_compress(hdr, compression_type);
+
+ hdr->b_l1hdr.b_state = arc_anon;
+ hdr->b_l1hdr.b_arc_access = 0;
+ hdr->b_l1hdr.b_bufcnt = 0;
+ hdr->b_l1hdr.b_buf = NULL;
+
+ /*
+ * Allocate the hdr's buffer. This will contain either
+ * the compressed or uncompressed data depending on the block
+ * it references and compressed arc enablement.
+ */
+ arc_hdr_alloc_pabd(hdr, B_TRUE);
+ ASSERT(zfs_refcount_is_zero(&hdr->b_l1hdr.b_refcnt));
+
+ return (hdr);
+}
+
+/*
+ * Transition between the two allocation states for the arc_buf_hdr struct.
+ * The arc_buf_hdr struct can be allocated with (hdr_full_cache) or without
+ * (hdr_l2only_cache) the fields necessary for the L1 cache - the smaller
+ * version is used when a cache buffer is only in the L2ARC in order to reduce
+ * memory usage.
+ */
+static arc_buf_hdr_t *
+arc_hdr_realloc(arc_buf_hdr_t *hdr, kmem_cache_t *old, kmem_cache_t *new)
+{
+ ASSERT(HDR_HAS_L2HDR(hdr));
+
+ arc_buf_hdr_t *nhdr;
+ l2arc_dev_t *dev = hdr->b_l2hdr.b_dev;
+
+ ASSERT((old == hdr_full_cache && new == hdr_l2only_cache) ||
+ (old == hdr_l2only_cache && new == hdr_full_cache));
+
+ nhdr = kmem_cache_alloc(new, KM_PUSHPAGE);
+
+ ASSERT(MUTEX_HELD(HDR_LOCK(hdr)));
+ buf_hash_remove(hdr);
+
+ bcopy(hdr, nhdr, HDR_L2ONLY_SIZE);
+
+ if (new == hdr_full_cache) {
+ arc_hdr_set_flags(nhdr, ARC_FLAG_HAS_L1HDR);
+ /*
+ * arc_access and arc_change_state need to be aware that a
+ * header has just come out of L2ARC, so we set its state to
+ * l2c_only even though it's about to change.
+ */
+ nhdr->b_l1hdr.b_state = arc_l2c_only;
+
+		/* Verify that previous threads set b_pabd to NULL before freeing */
+ ASSERT3P(nhdr->b_l1hdr.b_pabd, ==, NULL);
+ } else {
+ ASSERT3P(hdr->b_l1hdr.b_buf, ==, NULL);
+ ASSERT0(hdr->b_l1hdr.b_bufcnt);
+ ASSERT3P(hdr->b_l1hdr.b_freeze_cksum, ==, NULL);
+
+ /*
+		 * If we've reached here, we must have been called from
+		 * arc_evict_hdr(); as such, we should have already been
+ * removed from any ghost list we were previously on
+ * (which protects us from racing with arc_evict_state),
+ * thus no locking is needed during this check.
+ */
+ ASSERT(!multilist_link_active(&hdr->b_l1hdr.b_arc_node));
+
+ /*
+ * A buffer must not be moved into the arc_l2c_only
+ * state if it's not finished being written out to the
+ * l2arc device. Otherwise, the b_l1hdr.b_pabd field
+		 * might be accessed even though it was removed.
+ */
+ VERIFY(!HDR_L2_WRITING(hdr));
+ VERIFY3P(hdr->b_l1hdr.b_pabd, ==, NULL);
+
+#ifdef ZFS_DEBUG
+ if (hdr->b_l1hdr.b_thawed != NULL) {
+ kmem_free(hdr->b_l1hdr.b_thawed, 1);
+ hdr->b_l1hdr.b_thawed = NULL;
+ }
+#endif
+
+ arc_hdr_clear_flags(nhdr, ARC_FLAG_HAS_L1HDR);
+ }
+ /*
+ * The header has been reallocated so we need to re-insert it into any
+ * lists it was on.
+ */
+ (void) buf_hash_insert(nhdr, NULL);
+
+ ASSERT(list_link_active(&hdr->b_l2hdr.b_l2node));
+
+ mutex_enter(&dev->l2ad_mtx);
+
+ /*
+ * We must place the realloc'ed header back into the list at
+ * the same spot. Otherwise, if it's placed earlier in the list,
+ * l2arc_write_buffers() could find it during the function's
+ * write phase, and try to write it out to the l2arc.
+ */
+ list_insert_after(&dev->l2ad_buflist, hdr, nhdr);
+ list_remove(&dev->l2ad_buflist, hdr);
+
+ mutex_exit(&dev->l2ad_mtx);
+
+ /*
+ * Since we're using the pointer address as the tag when
+ * incrementing and decrementing the l2ad_alloc refcount, we
+ * must remove the old pointer (that we're about to destroy) and
+ * add the new pointer to the refcount. Otherwise we'd remove
+ * the wrong pointer address when calling arc_hdr_destroy() later.
+ */
+
+ (void) zfs_refcount_remove_many(&dev->l2ad_alloc, arc_hdr_size(hdr),
+ hdr);
+ (void) zfs_refcount_add_many(&dev->l2ad_alloc, arc_hdr_size(nhdr),
+ nhdr);
+
+ buf_discard_identity(hdr);
+ kmem_cache_free(old, hdr);
+
+ return (nhdr);
+}
+
+/*
+ * Allocate a new arc_buf_hdr_t and arc_buf_t and return the buf to the caller.
+ * The buf is returned thawed since we expect the consumer to modify it.
+ */
+arc_buf_t *
+arc_alloc_buf(spa_t *spa, void *tag, arc_buf_contents_t type, int32_t size)
+{
+ arc_buf_hdr_t *hdr = arc_hdr_alloc(spa_load_guid(spa), size, size,
+ ZIO_COMPRESS_OFF, type);
+ ASSERT(!MUTEX_HELD(HDR_LOCK(hdr)));
+
+ arc_buf_t *buf = NULL;
+ VERIFY0(arc_buf_alloc_impl(hdr, tag, B_FALSE, B_FALSE, &buf));
+ arc_buf_thaw(buf);
+
+ return (buf);
+}
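+
+/*
+ * Illustrative pairing (hypothetical caller): a buf allocated above is
+ * anonymous and thawed, and is released with arc_buf_destroy():
+ *
+ *	arc_buf_t *buf = arc_alloc_buf(spa, FTAG, ARC_BUFC_DATA, size);
+ *	... use buf->b_data ...
+ *	arc_buf_destroy(buf, FTAG);
+ */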
+
+/*
+ * Allocate a compressed buf in the same manner as arc_alloc_buf. Don't
+ * use this for bufs containing metadata.
+ */
+arc_buf_t *
+arc_alloc_compressed_buf(spa_t *spa, void *tag, uint64_t psize, uint64_t lsize,
+ enum zio_compress compression_type)
+{
+ ASSERT3U(lsize, >, 0);
+ ASSERT3U(lsize, >=, psize);
+ ASSERT(compression_type > ZIO_COMPRESS_OFF);
+ ASSERT(compression_type < ZIO_COMPRESS_FUNCTIONS);
+
+ arc_buf_hdr_t *hdr = arc_hdr_alloc(spa_load_guid(spa), psize, lsize,
+ compression_type, ARC_BUFC_DATA);
+ ASSERT(!MUTEX_HELD(HDR_LOCK(hdr)));
+
+ arc_buf_t *buf = NULL;
+ VERIFY0(arc_buf_alloc_impl(hdr, tag, B_TRUE, B_FALSE, &buf));
+ arc_buf_thaw(buf);
+ ASSERT3P(hdr->b_l1hdr.b_freeze_cksum, ==, NULL);
+
+ if (!arc_buf_is_shared(buf)) {
+ /*
+ * To ensure that the hdr has the correct data in it if we call
+ * arc_decompress() on this buf before it's been written to
+ * disk, it's easiest if we just set up sharing between the
+ * buf and the hdr.
+ */
+ ASSERT(!abd_is_linear(hdr->b_l1hdr.b_pabd));
+ arc_hdr_free_pabd(hdr);
+ arc_share_buf(hdr, buf);
+ }
+
+ return (buf);
+}
+
+static void
+arc_hdr_l2hdr_destroy(arc_buf_hdr_t *hdr)
+{
+ l2arc_buf_hdr_t *l2hdr = &hdr->b_l2hdr;
+ l2arc_dev_t *dev = l2hdr->b_dev;
+ uint64_t psize = arc_hdr_size(hdr);
+
+ ASSERT(MUTEX_HELD(&dev->l2ad_mtx));
+ ASSERT(HDR_HAS_L2HDR(hdr));
+
+ list_remove(&dev->l2ad_buflist, hdr);
+
+ ARCSTAT_INCR(arcstat_l2_psize, -psize);
+ ARCSTAT_INCR(arcstat_l2_lsize, -HDR_GET_LSIZE(hdr));
+
+ vdev_space_update(dev->l2ad_vdev, -psize, 0, 0);
+
+ (void) zfs_refcount_remove_many(&dev->l2ad_alloc, psize, hdr);
+ arc_hdr_clear_flags(hdr, ARC_FLAG_HAS_L2HDR);
+}
+
+static void
+arc_hdr_destroy(arc_buf_hdr_t *hdr)
+{
+ if (HDR_HAS_L1HDR(hdr)) {
+ ASSERT(hdr->b_l1hdr.b_buf == NULL ||
+ hdr->b_l1hdr.b_bufcnt > 0);
+ ASSERT(zfs_refcount_is_zero(&hdr->b_l1hdr.b_refcnt));
+ ASSERT3P(hdr->b_l1hdr.b_state, ==, arc_anon);
+ }
+ ASSERT(!HDR_IO_IN_PROGRESS(hdr));
+ ASSERT(!HDR_IN_HASH_TABLE(hdr));
+
+ if (!HDR_EMPTY(hdr))
+ buf_discard_identity(hdr);
+
+ if (HDR_HAS_L2HDR(hdr)) {
+ l2arc_dev_t *dev = hdr->b_l2hdr.b_dev;
+ boolean_t buflist_held = MUTEX_HELD(&dev->l2ad_mtx);
+
+ if (!buflist_held)
+ mutex_enter(&dev->l2ad_mtx);
+
+ /*
+ * Even though we checked this conditional above, we
+ * need to check this again now that we have the
+ * l2ad_mtx. This is because we could be racing with
+ * another thread calling l2arc_evict() which might have
+ * destroyed this header's L2 portion as we were waiting
+ * to acquire the l2ad_mtx. If that happens, we don't
+ * want to re-destroy the header's L2 portion.
+ */
+ if (HDR_HAS_L2HDR(hdr)) {
+ l2arc_trim(hdr);
+ arc_hdr_l2hdr_destroy(hdr);
+ }
+
+ if (!buflist_held)
+ mutex_exit(&dev->l2ad_mtx);
+ }
+
+ if (HDR_HAS_L1HDR(hdr)) {
+ arc_cksum_free(hdr);
+
+ while (hdr->b_l1hdr.b_buf != NULL)
+ arc_buf_destroy_impl(hdr->b_l1hdr.b_buf);
+
+#ifdef ZFS_DEBUG
+ if (hdr->b_l1hdr.b_thawed != NULL) {
+ kmem_free(hdr->b_l1hdr.b_thawed, 1);
+ hdr->b_l1hdr.b_thawed = NULL;
+ }
+#endif
+
+ if (hdr->b_l1hdr.b_pabd != NULL) {
+ arc_hdr_free_pabd(hdr);
+ }
+ }
+
+ ASSERT3P(hdr->b_hash_next, ==, NULL);
+ if (HDR_HAS_L1HDR(hdr)) {
+ ASSERT(!multilist_link_active(&hdr->b_l1hdr.b_arc_node));
+ ASSERT3P(hdr->b_l1hdr.b_acb, ==, NULL);
+ kmem_cache_free(hdr_full_cache, hdr);
+ } else {
+ kmem_cache_free(hdr_l2only_cache, hdr);
+ }
+}
+
+void
+arc_buf_destroy(arc_buf_t *buf, void* tag)
+{
+ arc_buf_hdr_t *hdr = buf->b_hdr;
+ kmutex_t *hash_lock = HDR_LOCK(hdr);
+
+ if (hdr->b_l1hdr.b_state == arc_anon) {
+ ASSERT3U(hdr->b_l1hdr.b_bufcnt, ==, 1);
+ ASSERT(!HDR_IO_IN_PROGRESS(hdr));
+ VERIFY0(remove_reference(hdr, NULL, tag));
+ arc_hdr_destroy(hdr);
+ return;
+ }
+
+ mutex_enter(hash_lock);
+ ASSERT3P(hdr, ==, buf->b_hdr);
+ ASSERT(hdr->b_l1hdr.b_bufcnt > 0);
+ ASSERT3P(hash_lock, ==, HDR_LOCK(hdr));
+ ASSERT3P(hdr->b_l1hdr.b_state, !=, arc_anon);
+ ASSERT3P(buf->b_data, !=, NULL);
+
+ (void) remove_reference(hdr, hash_lock, tag);
+ arc_buf_destroy_impl(buf);
+ mutex_exit(hash_lock);
+}
+
+/*
+ * Evict the arc_buf_hdr that is provided as a parameter. The resultant
+ * state of the header is dependent on its state prior to entering this
+ * function. The following transitions are possible:
+ *
+ * - arc_mru -> arc_mru_ghost
+ * - arc_mfu -> arc_mfu_ghost
+ * - arc_mru_ghost -> arc_l2c_only
+ * - arc_mru_ghost -> deleted
+ * - arc_mfu_ghost -> arc_l2c_only
+ * - arc_mfu_ghost -> deleted
+ */
+static int64_t
+arc_evict_hdr(arc_buf_hdr_t *hdr, kmutex_t *hash_lock)
+{
+ arc_state_t *evicted_state, *state;
+ int64_t bytes_evicted = 0;
+ int min_lifetime = HDR_PRESCIENT_PREFETCH(hdr) ?
+ zfs_arc_min_prescient_prefetch_ms : zfs_arc_min_prefetch_ms;
+
+ ASSERT(MUTEX_HELD(hash_lock));
+ ASSERT(HDR_HAS_L1HDR(hdr));
+
+ state = hdr->b_l1hdr.b_state;
+ if (GHOST_STATE(state)) {
+ ASSERT(!HDR_IO_IN_PROGRESS(hdr));
+ ASSERT3P(hdr->b_l1hdr.b_buf, ==, NULL);
+
+ /*
+ * l2arc_write_buffers() relies on a header's L1 portion
+		 * (i.e. its b_pabd field) during its write phase.
+		 * Thus, we cannot push a header onto the arc_l2c_only
+		 * state (removing its L1 piece) until the header is
+ * done being written to the l2arc.
+ */
+ if (HDR_HAS_L2HDR(hdr) && HDR_L2_WRITING(hdr)) {
+ ARCSTAT_BUMP(arcstat_evict_l2_skip);
+ return (bytes_evicted);
+ }
+
+ ARCSTAT_BUMP(arcstat_deleted);
+ bytes_evicted += HDR_GET_LSIZE(hdr);
+
+ DTRACE_PROBE1(arc__delete, arc_buf_hdr_t *, hdr);
+
+ ASSERT3P(hdr->b_l1hdr.b_pabd, ==, NULL);
+ if (HDR_HAS_L2HDR(hdr)) {
+ /*
+ * This buffer is cached on the 2nd Level ARC;
+ * don't destroy the header.
+ */
+ arc_change_state(arc_l2c_only, hdr, hash_lock);
+ /*
+ * dropping from L1+L2 cached to L2-only,
+ * realloc to remove the L1 header.
+ */
+ hdr = arc_hdr_realloc(hdr, hdr_full_cache,
+ hdr_l2only_cache);
+ } else {
+ arc_change_state(arc_anon, hdr, hash_lock);
+ arc_hdr_destroy(hdr);
+ }
+ return (bytes_evicted);
+ }
+
+ ASSERT(state == arc_mru || state == arc_mfu);
+ evicted_state = (state == arc_mru) ? arc_mru_ghost : arc_mfu_ghost;
+
+ /* prefetch buffers have a minimum lifespan */
+ if (HDR_IO_IN_PROGRESS(hdr) ||
+ ((hdr->b_flags & (ARC_FLAG_PREFETCH | ARC_FLAG_INDIRECT)) &&
+ ddi_get_lbolt() - hdr->b_l1hdr.b_arc_access < min_lifetime * hz)) {
+ ARCSTAT_BUMP(arcstat_evict_skip);
+ return (bytes_evicted);
+ }
+
+ ASSERT0(zfs_refcount_count(&hdr->b_l1hdr.b_refcnt));
+ while (hdr->b_l1hdr.b_buf) {
+ arc_buf_t *buf = hdr->b_l1hdr.b_buf;
+ if (!mutex_tryenter(&buf->b_evict_lock)) {
+ ARCSTAT_BUMP(arcstat_mutex_miss);
+ break;
+ }
+ if (buf->b_data != NULL)
+ bytes_evicted += HDR_GET_LSIZE(hdr);
+ mutex_exit(&buf->b_evict_lock);
+ arc_buf_destroy_impl(buf);
+ }
+
+ if (HDR_HAS_L2HDR(hdr)) {
+ ARCSTAT_INCR(arcstat_evict_l2_cached, HDR_GET_LSIZE(hdr));
+ } else {
+ if (l2arc_write_eligible(hdr->b_spa, hdr)) {
+ ARCSTAT_INCR(arcstat_evict_l2_eligible,
+ HDR_GET_LSIZE(hdr));
+ } else {
+ ARCSTAT_INCR(arcstat_evict_l2_ineligible,
+ HDR_GET_LSIZE(hdr));
+ }
+ }
+
+ if (hdr->b_l1hdr.b_bufcnt == 0) {
+ arc_cksum_free(hdr);
+
+ bytes_evicted += arc_hdr_size(hdr);
+
+ /*
+ * If this hdr is being evicted and has a compressed
+ * buffer then we discard it here before we change states.
+ * This ensures that the accounting is updated correctly
+ * in arc_free_data_impl().
+ */
+ arc_hdr_free_pabd(hdr);
+
+ arc_change_state(evicted_state, hdr, hash_lock);
+ ASSERT(HDR_IN_HASH_TABLE(hdr));
+ arc_hdr_set_flags(hdr, ARC_FLAG_IN_HASH_TABLE);
+ DTRACE_PROBE1(arc__evict, arc_buf_hdr_t *, hdr);
+ }
+
+ return (bytes_evicted);
+}
+
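+/*
+ * Evict buffers from the sublist at the given index of the multilist,
+ * until "bytes" have been evicted or the batch limit is reached. The
+ * caller-supplied marker is used to remember our position in the
+ * sublist across iterations. Returns the number of bytes evicted.
+ */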
+static uint64_t
+arc_evict_state_impl(multilist_t *ml, int idx, arc_buf_hdr_t *marker,
+ uint64_t spa, int64_t bytes)
+{
+ multilist_sublist_t *mls;
+ uint64_t bytes_evicted = 0;
+ arc_buf_hdr_t *hdr;
+ kmutex_t *hash_lock;
+ int evict_count = 0;
+
+ ASSERT3P(marker, !=, NULL);
+ IMPLY(bytes < 0, bytes == ARC_EVICT_ALL);
+
+ mls = multilist_sublist_lock(ml, idx);
+
+ for (hdr = multilist_sublist_prev(mls, marker); hdr != NULL;
+ hdr = multilist_sublist_prev(mls, marker)) {
+ if ((bytes != ARC_EVICT_ALL && bytes_evicted >= bytes) ||
+ (evict_count >= zfs_arc_evict_batch_limit))
+ break;
+
+ /*
+ * To keep our iteration location, move the marker
+ * forward. Since we're not holding hdr's hash lock, we
+ * must be very careful and not remove 'hdr' from the
+ * sublist. Otherwise, other consumers might mistake the
+ * 'hdr' as not being on a sublist when they call the
+ * multilist_link_active() function (they all rely on
+ * the hash lock protecting concurrent insertions and
+ * removals). multilist_sublist_move_forward() was
+ * specifically implemented to ensure this is the case
+ * (only 'marker' will be removed and re-inserted).
+ */
+ multilist_sublist_move_forward(mls, marker);
+
+ /*
+ * The only case where the b_spa field should ever be
+ * zero, is the marker headers inserted by
+ * arc_evict_state(). It's possible for multiple threads
+ * to be calling arc_evict_state() concurrently (e.g.
+ * dsl_pool_close() and zio_inject_fault()), so we must
+ * skip any markers we see from these other threads.
+ */
+ if (hdr->b_spa == 0)
+ continue;
+
+ /* we're only interested in evicting buffers of a certain spa */
+ if (spa != 0 && hdr->b_spa != spa) {
+ ARCSTAT_BUMP(arcstat_evict_skip);
+ continue;
+ }
+
+ hash_lock = HDR_LOCK(hdr);
+
+ /*
+ * We aren't calling this function from any code path
+ * that would already be holding a hash lock, so we're
+ * asserting on this assumption to be defensive in case
+ * this ever changes. Without this check, it would be
+ * possible to incorrectly increment arcstat_mutex_miss
+ * below (e.g. if the code changed such that we called
+ * this function with a hash lock held).
+ */
+ ASSERT(!MUTEX_HELD(hash_lock));
+
+ if (mutex_tryenter(hash_lock)) {
+ uint64_t evicted = arc_evict_hdr(hdr, hash_lock);
+ mutex_exit(hash_lock);
+
+ bytes_evicted += evicted;
+
+ /*
+ * If evicted is zero, arc_evict_hdr() must have
+ * decided to skip this header, don't increment
+ * evict_count in this case.
+ */
+ if (evicted != 0)
+ evict_count++;
+
+ /*
+ * If arc_size isn't overflowing, signal any
+ * threads that might happen to be waiting.
+ *
+ * For each header evicted, we wake up a single
+ * thread. If we used cv_broadcast, we could
+ * wake up "too many" threads causing arc_size
+ * to significantly overflow arc_c; since
+ * arc_get_data_impl() doesn't check for overflow
+ * when it's woken up (it doesn't because it's
+ * possible for the ARC to be overflowing while
+ * full of un-evictable buffers, and the
+ * function should proceed in this case).
+ *
+ * If threads are left sleeping, due to not
+ * using cv_broadcast here, they will be woken
+ * up via cv_broadcast in arc_adjust_cb() just
+ * before arc_adjust_zthr sleeps.
+ */
+ mutex_enter(&arc_adjust_lock);
+ if (!arc_is_overflowing())
+ cv_signal(&arc_adjust_waiters_cv);
+ mutex_exit(&arc_adjust_lock);
+ } else {
+ ARCSTAT_BUMP(arcstat_mutex_miss);
+ }
+ }
+
+ multilist_sublist_unlock(mls);
+
+ return (bytes_evicted);
+}
+
+/*
+ * Evict buffers from the given arc state, until we've removed the
+ * specified number of bytes. Move the removed buffers to the
+ * appropriate evict state.
+ *
+ * This function makes a "best effort". It skips over any buffers
+ * it can't get a hash_lock on, and so, may not catch all candidates.
+ * It may also return without evicting as much space as requested.
+ *
+ * If bytes is specified using the special value ARC_EVICT_ALL, this
+ * will evict all available (i.e. unlocked and evictable) buffers from
+ * the given arc state; which is used by arc_flush().
+ */
+static uint64_t
+arc_evict_state(arc_state_t *state, uint64_t spa, int64_t bytes,
+ arc_buf_contents_t type)
+{
+ uint64_t total_evicted = 0;
+ multilist_t *ml = state->arcs_list[type];
+ int num_sublists;
+ arc_buf_hdr_t **markers;
+
+ IMPLY(bytes < 0, bytes == ARC_EVICT_ALL);
+
+ num_sublists = multilist_get_num_sublists(ml);
+
+ /*
+ * If we've tried to evict from each sublist, made some
+ * progress, but still have not hit the target number of bytes
+ * to evict, we want to keep trying. The markers allow us to
+ * pick up where we left off for each individual sublist, rather
+ * than starting from the tail each time.
+ */
+ markers = kmem_zalloc(sizeof (*markers) * num_sublists, KM_SLEEP);
+ for (int i = 0; i < num_sublists; i++) {
+ markers[i] = kmem_cache_alloc(hdr_full_cache, KM_SLEEP);
+
+ /*
+ * A b_spa of 0 is used to indicate that this header is
+ * a marker. This fact is used in arc_adjust_type() and
+ * arc_evict_state_impl().
+ */
+ markers[i]->b_spa = 0;
+
+ multilist_sublist_t *mls = multilist_sublist_lock(ml, i);
+ multilist_sublist_insert_tail(mls, markers[i]);
+ multilist_sublist_unlock(mls);
+ }
+
+ /*
+ * While we haven't hit our target number of bytes to evict, or
+ * we're evicting all available buffers.
+ */
+ while (total_evicted < bytes || bytes == ARC_EVICT_ALL) {
+ int sublist_idx = multilist_get_random_index(ml);
+ uint64_t scan_evicted = 0;
+
+ /*
+ * Try to reduce pinned dnodes with a floor of arc_dnode_limit.
+ * Request that 10% of the LRUs be scanned by the superblock
+ * shrinker.
+ */
+ if (type == ARC_BUFC_DATA && aggsum_compare(&astat_dnode_size,
+ arc_dnode_limit) > 0) {
+ arc_prune_async((aggsum_upper_bound(&astat_dnode_size) -
+ arc_dnode_limit) / sizeof (dnode_t) /
+ zfs_arc_dnode_reduce_percent);
+ }
+
+ /*
+ * Start eviction using a randomly selected sublist,
+ * this is to try and evenly balance eviction across all
+ * sublists. Always starting at the same sublist
+ * (e.g. index 0) would cause evictions to favor certain
+ * sublists over others.
+ */
+ for (int i = 0; i < num_sublists; i++) {
+ uint64_t bytes_remaining;
+ uint64_t bytes_evicted;
+
+ if (bytes == ARC_EVICT_ALL)
+ bytes_remaining = ARC_EVICT_ALL;
+ else if (total_evicted < bytes)
+ bytes_remaining = bytes - total_evicted;
+ else
+ break;
+
+ bytes_evicted = arc_evict_state_impl(ml, sublist_idx,
+ markers[sublist_idx], spa, bytes_remaining);
+
+ scan_evicted += bytes_evicted;
+ total_evicted += bytes_evicted;
+
+ /* we've reached the end, wrap to the beginning */
+ if (++sublist_idx >= num_sublists)
+ sublist_idx = 0;
+ }
+
+ /*
+ * If we didn't evict anything during this scan, we have
+ * no reason to believe we'll evict more during another
+ * scan, so break the loop.
+ */
+ if (scan_evicted == 0) {
+ /* This isn't possible, let's make that obvious */
+ ASSERT3S(bytes, !=, 0);
+
+ /*
+ * When bytes is ARC_EVICT_ALL, the only way to
+ * break the loop is when scan_evicted is zero.
+ * In that case, we actually have evicted enough,
+ * so we don't want to increment the kstat.
+ */
+ if (bytes != ARC_EVICT_ALL) {
+ ASSERT3S(total_evicted, <, bytes);
+ ARCSTAT_BUMP(arcstat_evict_not_enough);
+ }
+
+ break;
+ }
+ }
+
+ for (int i = 0; i < num_sublists; i++) {
+ multilist_sublist_t *mls = multilist_sublist_lock(ml, i);
+ multilist_sublist_remove(mls, markers[i]);
+ multilist_sublist_unlock(mls);
+
+ kmem_cache_free(hdr_full_cache, markers[i]);
+ }
+ kmem_free(markers, sizeof (*markers) * num_sublists);
+
+ return (total_evicted);
+}
+
+/*
+ * Flush all "evictable" data of the given type from the arc state
+ * specified. This will not evict any "active" buffers (i.e. referenced).
+ *
+ * When 'retry' is set to B_FALSE, the function will make a single pass
+ * over the state and evict any buffers that it can. Since it doesn't
+ * continually retry the eviction, it might end up leaving some buffers
+ * in the ARC due to lock misses.
+ *
+ * When 'retry' is set to B_TRUE, the function will continually retry the
+ * eviction until *all* evictable buffers have been removed from the
+ * state. As a result, if concurrent insertions into the state are
+ * allowed (e.g. if the ARC isn't shutting down), this function might
+ * wind up in an infinite loop, continually trying to evict buffers.
+ */
+static uint64_t
+arc_flush_state(arc_state_t *state, uint64_t spa, arc_buf_contents_t type,
+ boolean_t retry)
+{
+ uint64_t evicted = 0;
+
+ while (zfs_refcount_count(&state->arcs_esize[type]) != 0) {
+ evicted += arc_evict_state(state, spa, ARC_EVICT_ALL, type);
+
+ if (!retry)
+ break;
+ }
+
+ return (evicted);
+}
+
+/*
+ * Helper function for arc_prune_async(); it is responsible for safely
+ * handling the execution of a registered arc_prune_func_t.
+ */
+static void
+arc_prune_task(void *ptr)
+{
+ arc_prune_t *ap = (arc_prune_t *)ptr;
+ arc_prune_func_t *func = ap->p_pfunc;
+
+ if (func != NULL)
+ func(ap->p_adjust, ap->p_private);
+
+ zfs_refcount_remove(&ap->p_refcnt, func);
+}
+
+/*
+ * Notify registered consumers they must drop holds on a portion of the ARC
+ * buffers they reference. This provides a mechanism to ensure the ARC can
+ * honor the arc_meta_limit and reclaim otherwise pinned ARC buffers. This
+ * is analogous to dnlc_reduce_cache() but more generic.
+ *
+ * This operation is performed asynchronously so it may be safely called
+ * in the context of the arc_reclaim_thread(). A reference is taken here
+ * for each registered arc_prune_t and the arc_prune_task() is responsible
+ * for releasing it once the registered arc_prune_func_t has completed.
+ */
+static void
+arc_prune_async(int64_t adjust)
+{
+ arc_prune_t *ap;
+
+ mutex_enter(&arc_prune_mtx);
+ for (ap = list_head(&arc_prune_list); ap != NULL;
+ ap = list_next(&arc_prune_list, ap)) {
+
+ if (zfs_refcount_count(&ap->p_refcnt) >= 2)
+ continue;
+
+ zfs_refcount_add(&ap->p_refcnt, ap->p_pfunc);
+ ap->p_adjust = adjust;
+ if (taskq_dispatch(arc_prune_taskq, arc_prune_task,
+ ap, TQ_SLEEP) == TASKQID_INVALID) {
+ zfs_refcount_remove(&ap->p_refcnt, ap->p_pfunc);
+ continue;
+ }
+ ARCSTAT_BUMP(arcstat_prune);
+ }
+ mutex_exit(&arc_prune_mtx);
+}
+
+/*
+ * Evict the specified number of bytes from the state specified,
+ * restricting eviction to the spa and type given. This function
+ * prevents us from trying to evict more from a state's list than
+ * is "evictable", and to skip evicting altogether when passed a
+ * negative value for "bytes". In contrast, arc_evict_state() will
+ * evict everything it can, when passed a negative value for "bytes".
+ */
+static uint64_t
+arc_adjust_impl(arc_state_t *state, uint64_t spa, int64_t bytes,
+ arc_buf_contents_t type)
+{
+ int64_t delta;
+
+ if (bytes > 0 && zfs_refcount_count(&state->arcs_esize[type]) > 0) {
+ delta = MIN(zfs_refcount_count(&state->arcs_esize[type]),
+ bytes);
+ return (arc_evict_state(state, spa, delta, type));
+ }
+
+ return (0);
+}
+
+/*
+ * The goal of this function is to evict enough meta data buffers from the
+ * ARC in order to enforce the arc_meta_limit. Achieving this is slightly
+ * more complicated than it appears because it is common for data buffers
+ * to have holds on meta data buffers. In addition, dnode meta data buffers
+ * will be held by the dnodes in the block preventing them from being freed.
+ * This means we can't simply traverse the ARC and expect to always find
+ * enough unheld meta data buffer to release.
+ *
+ * Therefore, this function has been updated to make alternating passes
+ * over the ARC releasing data buffers and then newly unheld meta data
+ * buffers. This ensures forward progress is maintained and meta_used
+ * will decrease. Normally this is sufficient, but if required the ARC
+ * will call the registered prune callbacks causing dentry and inodes to
+ * be dropped from the VFS cache. This will make dnode meta data buffers
+ * available for reclaim.
+ */
+static uint64_t
+arc_adjust_meta_balanced(uint64_t meta_used)
+{
+ int64_t delta, prune = 0, adjustmnt;
+ uint64_t total_evicted = 0;
+ arc_buf_contents_t type = ARC_BUFC_DATA;
+ int restarts = MAX(zfs_arc_meta_adjust_restarts, 0);
+
+restart:
+ /*
+ * This slightly differs from the way we evict from the mru in
+ * arc_adjust because we don't have a "target" value (i.e. no
+ * "meta" arc_p). As a result, I think we can completely
+ * cannibalize the metadata in the MRU before we evict the
+ * metadata from the MFU. I think we probably need to implement a
+ * "metadata arc_p" value to do this properly.
+ */
+ adjustmnt = meta_used - arc_meta_limit;
+
+ if (adjustmnt > 0 &&
+ zfs_refcount_count(&arc_mru->arcs_esize[type]) > 0) {
+ delta = MIN(zfs_refcount_count(&arc_mru->arcs_esize[type]),
+ adjustmnt);
+ total_evicted += arc_adjust_impl(arc_mru, 0, delta, type);
+ adjustmnt -= delta;
+ }
+
+ /*
+ * We can't afford to recalculate adjustmnt here. If we do,
+ * new metadata buffers can sneak into the MRU or ANON lists,
+ * thus penalizing the MFU metadata. Although the fudge factor is
+ * small, it has been empirically shown to be significant for
+ * certain workloads (e.g. creating many empty directories). As
+ * such, we use the original calculation for adjustmnt, and
+ * simply decrement the amount of data evicted from the MRU.
+ */
+
+ if (adjustmnt > 0 &&
+ zfs_refcount_count(&arc_mfu->arcs_esize[type]) > 0) {
+ delta = MIN(zfs_refcount_count(&arc_mfu->arcs_esize[type]),
+ adjustmnt);
+ total_evicted += arc_adjust_impl(arc_mfu, 0, delta, type);
+ }
+
+ adjustmnt = meta_used - arc_meta_limit;
+
+ if (adjustmnt > 0 &&
+ zfs_refcount_count(&arc_mru_ghost->arcs_esize[type]) > 0) {
+ delta = MIN(adjustmnt,
+ zfs_refcount_count(&arc_mru_ghost->arcs_esize[type]));
+ total_evicted += arc_adjust_impl(arc_mru_ghost, 0, delta, type);
+ adjustmnt -= delta;
+ }
+
+ if (adjustmnt > 0 &&
+ zfs_refcount_count(&arc_mfu_ghost->arcs_esize[type]) > 0) {
+ delta = MIN(adjustmnt,
+ zfs_refcount_count(&arc_mfu_ghost->arcs_esize[type]));
+ total_evicted += arc_adjust_impl(arc_mfu_ghost, 0, delta, type);
+ }
+
+ /*
+ * If after attempting to make the requested adjustment to the ARC
+ * the meta limit is still being exceeded then request that the
+ * higher layers drop some cached objects which have holds on ARC
+ * meta buffers. Requests to the upper layers will be made with
+ * increasingly large scan sizes until the ARC is below the limit.
+ */
+ if (meta_used > arc_meta_limit) {
+ if (type == ARC_BUFC_DATA) {
+ type = ARC_BUFC_METADATA;
+ } else {
+ type = ARC_BUFC_DATA;
+
+ if (zfs_arc_meta_prune) {
+ prune += zfs_arc_meta_prune;
+ arc_prune_async(prune);
+ }
+ }
+
+ if (restarts > 0) {
+ restarts--;
+ goto restart;
+ }
+ }
+ return (total_evicted);
+}
+
+/*
+ * Evict metadata buffers from the cache, such that arc_meta_used is
+ * capped by the arc_meta_limit tunable.
+ */
+static uint64_t
+arc_adjust_meta_only(uint64_t meta_used)
+{
+ uint64_t total_evicted = 0;
+ int64_t target;
+
+ /*
+ * If we're over the meta limit, we want to evict enough
+ * metadata to get back under the meta limit. We don't want to
+ * evict so much that we drop the MRU below arc_p, though. If
+ * we're over the meta limit more than we're over arc_p, we
+ * evict some from the MRU here, and some from the MFU below.
+ */
+ target = MIN((int64_t)(meta_used - arc_meta_limit),
+ (int64_t)(zfs_refcount_count(&arc_anon->arcs_size) +
+ zfs_refcount_count(&arc_mru->arcs_size) - arc_p));
+
+ total_evicted += arc_adjust_impl(arc_mru, 0, target, ARC_BUFC_METADATA);
+
+ /*
+ * Similar to the above, we want to evict enough bytes to get us
+ * below the meta limit, but not so much as to drop us below the
+ * space allotted to the MFU (which is defined as arc_c - arc_p).
+ */
+ target = MIN((int64_t)(meta_used - arc_meta_limit),
+ (int64_t)(zfs_refcount_count(&arc_mfu->arcs_size) -
+ (arc_c - arc_p)));
+
+ total_evicted += arc_adjust_impl(arc_mfu, 0, target, ARC_BUFC_METADATA);
+
+ return (total_evicted);
+}
+
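+/*
+ * Evict enough metadata to get arc_meta_used back under arc_meta_limit,
+ * using the strategy selected by the zfs_arc_meta_strategy tunable.
+ */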
+static uint64_t
+arc_adjust_meta(uint64_t meta_used)
+{
+ if (zfs_arc_meta_strategy == ARC_STRATEGY_META_ONLY)
+ return (arc_adjust_meta_only(meta_used));
+ else
+ return (arc_adjust_meta_balanced(meta_used));
+}
+
+/*
+ * Return the type of the oldest buffer in the given arc state
+ *
+ * This function will select a random sublist of type ARC_BUFC_DATA and
+ * a random sublist of type ARC_BUFC_METADATA. The tail of each sublist
+ * is compared, and the type which contains the "older" buffer will be
+ * returned.
+ */
+static arc_buf_contents_t
+arc_adjust_type(arc_state_t *state)
+{
+ multilist_t *data_ml = state->arcs_list[ARC_BUFC_DATA];
+ multilist_t *meta_ml = state->arcs_list[ARC_BUFC_METADATA];
+ int data_idx = multilist_get_random_index(data_ml);
+ int meta_idx = multilist_get_random_index(meta_ml);
+ multilist_sublist_t *data_mls;
+ multilist_sublist_t *meta_mls;
+ arc_buf_contents_t type;
+ arc_buf_hdr_t *data_hdr;
+ arc_buf_hdr_t *meta_hdr;
+
+ /*
+ * We keep the sublist lock until we're finished, to prevent
+ * the headers from being destroyed via arc_evict_state().
+ */
+ data_mls = multilist_sublist_lock(data_ml, data_idx);
+ meta_mls = multilist_sublist_lock(meta_ml, meta_idx);
+
+ /*
+ * These two loops are to ensure we skip any markers that
+ * might be at the tail of the lists due to arc_evict_state().
+ */
+
+ for (data_hdr = multilist_sublist_tail(data_mls); data_hdr != NULL;
+ data_hdr = multilist_sublist_prev(data_mls, data_hdr)) {
+ if (data_hdr->b_spa != 0)
+ break;
+ }
+
+ for (meta_hdr = multilist_sublist_tail(meta_mls); meta_hdr != NULL;
+ meta_hdr = multilist_sublist_prev(meta_mls, meta_hdr)) {
+ if (meta_hdr->b_spa != 0)
+ break;
+ }
+
+ if (data_hdr == NULL && meta_hdr == NULL) {
+ type = ARC_BUFC_DATA;
+ } else if (data_hdr == NULL) {
+ ASSERT3P(meta_hdr, !=, NULL);
+ type = ARC_BUFC_METADATA;
+ } else if (meta_hdr == NULL) {
+ ASSERT3P(data_hdr, !=, NULL);
+ type = ARC_BUFC_DATA;
+ } else {
+ ASSERT3P(data_hdr, !=, NULL);
+ ASSERT3P(meta_hdr, !=, NULL);
+
+ /* The headers can't be on the sublist without an L1 header */
+ ASSERT(HDR_HAS_L1HDR(data_hdr));
+ ASSERT(HDR_HAS_L1HDR(meta_hdr));
+
+ if (data_hdr->b_l1hdr.b_arc_access <
+ meta_hdr->b_l1hdr.b_arc_access) {
+ type = ARC_BUFC_DATA;
+ } else {
+ type = ARC_BUFC_METADATA;
+ }
+ }
+
+ multilist_sublist_unlock(meta_mls);
+ multilist_sublist_unlock(data_mls);
+
+ return (type);
+}
+
+/*
+ * Evict buffers from the cache, such that arc_size is capped by arc_c.
+ */
+static uint64_t
+arc_adjust(void)
+{
+ uint64_t total_evicted = 0;
+ uint64_t bytes;
+ int64_t target;
+ uint64_t asize = aggsum_value(&arc_size);
+ uint64_t ameta = aggsum_value(&arc_meta_used);
+
+ /*
+ * If we're over arc_meta_limit, we want to correct that before
+ * potentially evicting data buffers below.
+ */
+ total_evicted += arc_adjust_meta(ameta);
+
+ /*
+ * Adjust MRU size
+ *
+ * If we're over the target cache size, we want to evict enough
+ * from the list to get back to our target size. We don't want
+ * to evict too much from the MRU, such that it drops below
+ * arc_p. So, if we're over our target cache size more than
+ * the MRU is over arc_p, we'll evict enough to get back to
+ * arc_p here, and then evict more from the MFU below.
+ */
+ target = MIN((int64_t)(asize - arc_c),
+ (int64_t)(zfs_refcount_count(&arc_anon->arcs_size) +
+ zfs_refcount_count(&arc_mru->arcs_size) + ameta - arc_p));
+
+ /*
+ * If we're below arc_meta_min, always prefer to evict data.
+ * Otherwise, try to satisfy the requested number of bytes to
+ * evict from the type which contains older buffers; in an
+ * effort to keep newer buffers in the cache regardless of their
+ * type. If we cannot satisfy the number of bytes from this
+ * type, spill over into the next type.
+ */
+ if (arc_adjust_type(arc_mru) == ARC_BUFC_METADATA &&
+ ameta > arc_meta_min) {
+ bytes = arc_adjust_impl(arc_mru, 0, target, ARC_BUFC_METADATA);
+ total_evicted += bytes;
+
+ /*
+ * If we couldn't evict our target number of bytes from
+ * metadata, we try to get the rest from data.
+ */
+ target -= bytes;
+
+ total_evicted +=
+ arc_adjust_impl(arc_mru, 0, target, ARC_BUFC_DATA);
+ } else {
+ bytes = arc_adjust_impl(arc_mru, 0, target, ARC_BUFC_DATA);
+ total_evicted += bytes;
+
+ /*
+ * If we couldn't evict our target number of bytes from
+ * data, we try to get the rest from metadata.
+ */
+ target -= bytes;
+
+ total_evicted +=
+ arc_adjust_impl(arc_mru, 0, target, ARC_BUFC_METADATA);
+ }
+
+ /*
+ * Re-sum ARC stats after the first round of evictions.
+ */
+ asize = aggsum_value(&arc_size);
+ ameta = aggsum_value(&arc_meta_used);
+
+ /*
+ * Adjust MFU size
+ *
+ * Now that we've tried to evict enough from the MRU to get its
+ * size back to arc_p, if we're still above the target cache
+ * size, we evict the rest from the MFU.
+ */
+ target = asize - arc_c;
+
+ if (arc_adjust_type(arc_mfu) == ARC_BUFC_METADATA &&
+ ameta > arc_meta_min) {
+ bytes = arc_adjust_impl(arc_mfu, 0, target, ARC_BUFC_METADATA);
+ total_evicted += bytes;
+
+ /*
+ * If we couldn't evict our target number of bytes from
+ * metadata, we try to get the rest from data.
+ */
+ target -= bytes;
+
+ total_evicted +=
+ arc_adjust_impl(arc_mfu, 0, target, ARC_BUFC_DATA);
+ } else {
+ bytes = arc_adjust_impl(arc_mfu, 0, target, ARC_BUFC_DATA);
+ total_evicted += bytes;
+
+ /*
+ * If we couldn't evict our target number of bytes from
+ * data, we try to get the rest from metadata.
+ */
+ target -= bytes;
+
+ total_evicted +=
+ arc_adjust_impl(arc_mfu, 0, target, ARC_BUFC_METADATA);
+ }
+
+ /*
+ * Adjust ghost lists
+ *
+ * In addition to the above, the ARC also defines target values
+ * for the ghost lists. The sum of the mru list and mru ghost
+ * list should never exceed the target size of the cache, and
+ * the sum of the mru list, mfu list, mru ghost list, and mfu
+ * ghost list should never exceed twice the target size of the
+ * cache. The following logic enforces these limits on the ghost
+ * caches, and evicts from them as needed.
+ */
+ target = zfs_refcount_count(&arc_mru->arcs_size) +
+ zfs_refcount_count(&arc_mru_ghost->arcs_size) - arc_c;
+
+ bytes = arc_adjust_impl(arc_mru_ghost, 0, target, ARC_BUFC_DATA);
+ total_evicted += bytes;
+
+ target -= bytes;
+
+ total_evicted +=
+ arc_adjust_impl(arc_mru_ghost, 0, target, ARC_BUFC_METADATA);
+
+ /*
+ * We assume the sum of the mru list and mfu list is less than
+ * or equal to arc_c (we enforced this above), which means we
+ * can use the simpler of the two equations below:
+ *
+ * mru + mfu + mru ghost + mfu ghost <= 2 * arc_c
+ * mru ghost + mfu ghost <= arc_c
+ */
+ target = zfs_refcount_count(&arc_mru_ghost->arcs_size) +
+ zfs_refcount_count(&arc_mfu_ghost->arcs_size) - arc_c;
+
+ bytes = arc_adjust_impl(arc_mfu_ghost, 0, target, ARC_BUFC_DATA);
+ total_evicted += bytes;
+
+ target -= bytes;
+
+ total_evicted +=
+ arc_adjust_impl(arc_mfu_ghost, 0, target, ARC_BUFC_METADATA);
+
+ return (total_evicted);
+}
+
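+/*
+ * Evict all evictable buffers belonging to the given spa (or to any
+ * spa, if none is specified) from every ARC state.
+ */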
+void
+arc_flush(spa_t *spa, boolean_t retry)
+{
+ uint64_t guid = 0;
+
+ /*
+ * If retry is B_TRUE, a spa must not be specified since we have
+ * no good way to determine if all of a spa's buffers have been
+ * evicted from an arc state.
+ */
+ ASSERT(!retry || spa == 0);
+
+ if (spa != NULL)
+ guid = spa_load_guid(spa);
+
+ (void) arc_flush_state(arc_mru, guid, ARC_BUFC_DATA, retry);
+ (void) arc_flush_state(arc_mru, guid, ARC_BUFC_METADATA, retry);
+
+ (void) arc_flush_state(arc_mfu, guid, ARC_BUFC_DATA, retry);
+ (void) arc_flush_state(arc_mfu, guid, ARC_BUFC_METADATA, retry);
+
+ (void) arc_flush_state(arc_mru_ghost, guid, ARC_BUFC_DATA, retry);
+ (void) arc_flush_state(arc_mru_ghost, guid, ARC_BUFC_METADATA, retry);
+
+ (void) arc_flush_state(arc_mfu_ghost, guid, ARC_BUFC_DATA, retry);
+ (void) arc_flush_state(arc_mfu_ghost, guid, ARC_BUFC_METADATA, retry);
+}
+
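+/*
+ * Reduce the target cache size (arc_c) by "to_free" bytes, without
+ * dropping below arc_c_min, and scale arc_p down accordingly. If
+ * arc_size still exceeds the new target, wake the adjust thread so
+ * it can start evicting.
+ */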
+static void
+arc_reduce_target_size(int64_t to_free)
+{
+ uint64_t asize = aggsum_value(&arc_size);
+ if (arc_c > arc_c_min) {
+ DTRACE_PROBE4(arc__shrink, uint64_t, arc_c, uint64_t,
+ arc_c_min, uint64_t, arc_p, uint64_t, to_free);
+ if (arc_c > arc_c_min + to_free)
+ atomic_add_64(&arc_c, -to_free);
+ else
+ arc_c = arc_c_min;
+
+ atomic_add_64(&arc_p, -(arc_p >> arc_shrink_shift));
+ if (asize < arc_c)
+ arc_c = MAX(asize, arc_c_min);
+ if (arc_p > arc_c)
+ arc_p = (arc_c >> 1);
+
+ DTRACE_PROBE2(arc__shrunk, uint64_t, arc_c, uint64_t,
+ arc_p);
+
+ ASSERT(arc_c >= arc_c_min);
+ ASSERT((int64_t)arc_p >= 0);
+ }
+
+ if (asize > arc_c) {
+ DTRACE_PROBE2(arc__shrink_adjust, uint64_t, asize,
+ uint64_t, arc_c);
+ /* See comment in arc_adjust_cb_check() on why lock+flag */
+ mutex_enter(&arc_adjust_lock);
+ arc_adjust_needed = B_TRUE;
+ mutex_exit(&arc_adjust_lock);
+ zthr_wakeup(arc_adjust_zthr);
+ }
+}
+
+typedef enum free_memory_reason_t {
+ FMR_UNKNOWN,
+ FMR_NEEDFREE,
+ FMR_LOTSFREE,
+ FMR_SWAPFS_MINFREE,
+ FMR_PAGES_PP_MAXIMUM,
+ FMR_HEAP_ARENA,
+ FMR_ZIO_ARENA,
+} free_memory_reason_t;
+
+int64_t last_free_memory;
+free_memory_reason_t last_free_reason;
+
+/*
+ * Additional reserve of pages for pp_reserve.
+ */
+int64_t arc_pages_pp_reserve = 64;
+
+/*
+ * Additional reserve of pages for swapfs.
+ */
+int64_t arc_swapfs_reserve = 64;
+
+/*
+ * Return the amount of memory that can be consumed before reclaim will be
+ * needed. A positive value indicates there is sufficient free memory;
+ * a negative value indicates the amount of memory that needs to be
+ * freed up.
+ */
+static int64_t
+arc_available_memory(void)
+{
+ int64_t lowest = INT64_MAX;
+ int64_t n;
+ free_memory_reason_t r = FMR_UNKNOWN;
+
+#ifdef _KERNEL
+#ifdef __FreeBSD__
+ /*
+ * Cooperate with pagedaemon when it's time for it to scan
+ * and reclaim some pages.
+ */
+ n = PAGESIZE * ((int64_t)freemem - zfs_arc_free_target);
+ if (n < lowest) {
+ lowest = n;
+ r = FMR_LOTSFREE;
+ }
+
+#else
+ if (needfree > 0) {
+ n = PAGESIZE * (-needfree);
+ if (n < lowest) {
+ lowest = n;
+ r = FMR_NEEDFREE;
+ }
+ }
+
+ /*
+ * check that we're out of range of the pageout scanner. It starts to
+ * schedule paging if freemem is less than lotsfree and needfree.
+ * lotsfree is the high-water mark for pageout, and needfree is the
+ * number of needed free pages. We add extra pages here to make sure
+ * the scanner doesn't start up while we're freeing memory.
+ */
+ n = PAGESIZE * (freemem - lotsfree - needfree - desfree);
+ if (n < lowest) {
+ lowest = n;
+ r = FMR_LOTSFREE;
+ }
+
+ /*
+ * check to make sure that swapfs has enough space so that anon
+ * reservations can still succeed. anon_resvmem() checks that the
+ * availrmem is greater than swapfs_minfree, and the number of reserved
+ * swap pages. We also add a bit of extra here just to prevent
+ * circumstances from getting really dire.
+ */
+ n = PAGESIZE * (availrmem - swapfs_minfree - swapfs_reserve -
+ desfree - arc_swapfs_reserve);
+ if (n < lowest) {
+ lowest = n;
+ r = FMR_SWAPFS_MINFREE;
+ }
+
+ /*
+ * Check that we have enough availrmem that memory locking (e.g., via
+ * mlock(3C) or memcntl(2)) can still succeed. (pages_pp_maximum
+ * stores the number of pages that cannot be locked; when availrmem
+ * drops below pages_pp_maximum, page locking mechanisms such as
+ * page_pp_lock() will fail.)
+ */
+ n = PAGESIZE * (availrmem - pages_pp_maximum -
+ arc_pages_pp_reserve);
+ if (n < lowest) {
+ lowest = n;
+ r = FMR_PAGES_PP_MAXIMUM;
+ }
+
+#endif /* __FreeBSD__ */
+#if defined(__i386) || !defined(UMA_MD_SMALL_ALLOC)
+ /*
+ * If we're on an i386 platform, it's possible that we'll exhaust the
+ * kernel heap space before we ever run out of available physical
+ * memory. Most checks of the size of the heap_area compare against
+ * tune.t_minarmem, which is the minimum available real memory that we
+ * can have in the system. However, this is generally fixed at 25 pages
+ * which is so low that it's useless. In this comparison, we seek to
+ * calculate the total heap-size, and reclaim if more than 3/4ths of the
+ * heap is allocated. (Or, in the calculation, if less than 1/4th is
+ * free)
+ */
+ n = uma_avail() - (long)(uma_limit() / 4);
+ if (n < lowest) {
+ lowest = n;
+ r = FMR_HEAP_ARENA;
+ }
+#endif
+
+ /*
+ * If zio data pages are being allocated out of a separate heap segment,
+ * then enforce that the size of available vmem for this arena remains
+ * above about 1/4th (1/(2^arc_zio_arena_free_shift)) free.
+ *
+ * Note that reducing the arc_zio_arena_free_shift keeps more virtual
+ * memory (in the zio_arena) free, which can avoid memory
+ * fragmentation issues.
+ */
+ if (zio_arena != NULL) {
+ n = (int64_t)vmem_size(zio_arena, VMEM_FREE) -
+ (vmem_size(zio_arena, VMEM_ALLOC) >>
+ arc_zio_arena_free_shift);
+ if (n < lowest) {
+ lowest = n;
+ r = FMR_ZIO_ARENA;
+ }
+ }
+
+#else /* _KERNEL */
+ /* Every 100 calls, free a small amount */
+ if (spa_get_random(100) == 0)
+ lowest = -1024;
+#endif /* _KERNEL */
+
+ last_free_memory = lowest;
+ last_free_reason = r;
+ DTRACE_PROBE2(arc__available_memory, int64_t, lowest, int, r);
+ return (lowest);
+}
+
+/*
+ * Determine if the system is under memory pressure and is asking
+ * to reclaim memory. A return value of B_TRUE indicates that the system
+ * is under memory pressure and that the arc should adjust accordingly.
+ */
+static boolean_t
+arc_reclaim_needed(void)
+{
+ return (arc_available_memory() < 0);
+}
+
+extern kmem_cache_t *zio_buf_cache[];
+extern kmem_cache_t *zio_data_buf_cache[];
+extern kmem_cache_t *range_seg_cache;
+extern kmem_cache_t *abd_chunk_cache;
+
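+/*
+ * Ask each of the kmem caches backing ARC allocations to return any
+ * unused slabs to the system. The reaping is asynchronous; see
+ * arc_reap_cb() for how its progress is paced.
+ */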
+static __noinline void
+arc_kmem_reap_soon(void)
+{
+ size_t i;
+ kmem_cache_t *prev_cache = NULL;
+ kmem_cache_t *prev_data_cache = NULL;
+
+ DTRACE_PROBE(arc__kmem_reap_start);
+#ifdef _KERNEL
+ if (aggsum_compare(&arc_meta_used, arc_meta_limit) >= 0) {
+ /*
+ * We are exceeding our meta-data cache limit.
+ * Purge some DNLC entries to release holds on meta-data.
+ */
+ dnlc_reduce_cache((void *)(uintptr_t)arc_reduce_dnlc_percent);
+ }
+#if defined(__i386)
+ /*
+ * Reclaim unused memory from all kmem caches.
+ */
+ kmem_reap();
+#endif
+#endif
+
+ for (i = 0; i < SPA_MAXBLOCKSIZE >> SPA_MINBLOCKSHIFT; i++) {
+ if (zio_buf_cache[i] != prev_cache) {
+ prev_cache = zio_buf_cache[i];
+ kmem_cache_reap_soon(zio_buf_cache[i]);
+ }
+ if (zio_data_buf_cache[i] != prev_data_cache) {
+ prev_data_cache = zio_data_buf_cache[i];
+ kmem_cache_reap_soon(zio_data_buf_cache[i]);
+ }
+ }
+ kmem_cache_reap_soon(abd_chunk_cache);
+ kmem_cache_reap_soon(buf_cache);
+ kmem_cache_reap_soon(hdr_full_cache);
+ kmem_cache_reap_soon(hdr_l2only_cache);
+ kmem_cache_reap_soon(range_seg_cache);
+
+#ifdef illumos
+ if (zio_arena != NULL) {
+ /*
+ * Ask the vmem arena to reclaim unused memory from its
+ * quantum caches.
+ */
+ vmem_qcache_reap(zio_arena);
+ }
+#endif
+ DTRACE_PROBE(arc__kmem_reap_end);
+}
+
+/* ARGSUSED */
+static boolean_t
+arc_adjust_cb_check(void *arg, zthr_t *zthr)
+{
+ /*
+ * This is necessary in order for the mdb ::arc dcmd to
+ * show up-to-date information. Since the ::arc command
+ * does not call the kstat's update function, without
+ * this call, the command may show stale stats for the
+ * anon, mru, mru_ghost, mfu, and mfu_ghost lists. Even
+ * with this change, the data might be up to 1 second
+ * out of date (the arc_adjust_zthr has a maximum sleep
+ * time of 1 second); but that should suffice. The
+ * arc_state_t structures can be queried directly if more
+ * accurate information is needed.
+ */
+ if (arc_ksp != NULL)
+ arc_ksp->ks_update(arc_ksp, KSTAT_READ);
+
+ /*
+ * We have to rely on arc_get_data_impl() to tell us when to adjust,
+ * rather than checking if we are overflowing here, so that we are
+ * sure to not leave arc_get_data_impl() waiting on
+ * arc_adjust_waiters_cv. If we have become "not overflowing" since
+ * arc_get_data_impl() checked, we need to wake it up. We could
+ * broadcast the CV here, but arc_get_data_impl() may have not yet
+ * gone to sleep. We would need to use a mutex to ensure that this
+ * function doesn't broadcast until arc_get_data_impl() has gone to
+ * sleep (e.g. the arc_adjust_lock). However, the lock ordering of
+ * such a lock would necessarily be incorrect with respect to the
+ * zthr_lock, which is held before this function is called, and is
+ * held by arc_get_data_impl() when it calls zthr_wakeup().
+ */
+ return (arc_adjust_needed);
+}
+
+/*
+ * Keep arc_size under arc_c by running arc_adjust which evicts data
+ * from the ARC.
+ */
+/* ARGSUSED */
+static void
+arc_adjust_cb(void *arg, zthr_t *zthr)
+{
+ uint64_t evicted = 0;
+
+ /* Evict from cache */
+ evicted = arc_adjust();
+
+ /*
+ * If evicted is zero, we couldn't evict anything
+ * via arc_adjust(). This could be due to hash lock
+ * collisions, but more likely due to the majority of
+ * arc buffers being unevictable. Therefore, even if
+ * arc_size is above arc_c, another pass is unlikely to
+ * be helpful and could potentially cause us to enter an
+ * infinite loop. Additionally, zthr_iscancelled() is
+ * checked here so that if the arc is shutting down, the
+ * broadcast will wake any remaining arc adjust waiters.
+ */
+ mutex_enter(&arc_adjust_lock);
+ arc_adjust_needed = !zthr_iscancelled(arc_adjust_zthr) &&
+ evicted > 0 && aggsum_compare(&arc_size, arc_c) > 0;
+ if (!arc_adjust_needed) {
+ /*
+ * We're either no longer overflowing, or we
+ * can't evict anything more, so we should wake
+ * up any waiters.
+ */
+ cv_broadcast(&arc_adjust_waiters_cv);
+ }
+ mutex_exit(&arc_adjust_lock);
+}
+
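+/*
+ * Decide whether the reap zthr should run: return B_TRUE when free
+ * memory is low and no kmem reap is already in flight. This check
+ * also manages arc_no_grow and the arc_growtime backoff.
+ */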
+/* ARGSUSED */
+static boolean_t
+arc_reap_cb_check(void *arg, zthr_t *zthr)
+{
+ int64_t free_memory = arc_available_memory();
+
+ /*
+ * If a kmem reap is already active, don't schedule more. We must
+ * check for this because kmem_cache_reap_soon() won't actually
+ * block on the cache being reaped (this is to prevent callers from
+ * becoming implicitly blocked by a system-wide kmem reap -- which,
+ * on a system with many, many full magazines, can take minutes).
+ */
+ if (!kmem_cache_reap_active() &&
+ free_memory < 0) {
+ arc_no_grow = B_TRUE;
+ arc_warm = B_TRUE;
+ /*
+ * Wait at least zfs_grow_retry (default 60) seconds
+ * before considering growing.
+ */
+ arc_growtime = gethrtime() + SEC2NSEC(arc_grow_retry);
+ return (B_TRUE);
+ } else if (free_memory < arc_c >> arc_no_grow_shift) {
+ arc_no_grow = B_TRUE;
+ } else if (gethrtime() >= arc_growtime) {
+ arc_no_grow = B_FALSE;
+ }
+
+ return (B_FALSE);
+}
+
+/*
+ * Keep enough free memory in the system by reaping the ARC's kmem
+ * caches. To cause more slabs to be reapable, we may reduce the
+ * target size of the cache (arc_c), causing the arc_adjust_cb()
+ * to free more buffers.
+ */
+/* ARGSUSED */
+static void
+arc_reap_cb(void *arg, zthr_t *zthr)
+{
+ int64_t free_memory;
+
+ /*
+ * Kick off asynchronous kmem_reap()'s of all our caches.
+ */
+ arc_kmem_reap_soon();
+
+ /*
+ * Wait at least arc_kmem_cache_reap_retry_ms between
+ * arc_kmem_reap_soon() calls. Without this check it is possible to
+ * end up in a situation where we spend lots of time reaping
+ * caches, while we're near arc_c_min. Waiting here also gives the
+ * subsequent free memory check a chance of finding that the
+ * asynchronous reap has already freed enough memory, and we don't
+ * need to call arc_reduce_target_size().
+ */
+ delay((hz * arc_kmem_cache_reap_retry_ms + 999) / 1000);
+
+ /*
+ * Reduce the target size as needed to maintain the amount of free
+ * memory in the system at a fraction of the arc_size (1/128th by
+ * default). If oversubscribed (free_memory < 0) then reduce the
+ * target arc_size by the deficit amount plus the fractional
+ * amount. If free memory is positive but less than the fractional
+ * amount, reduce by what is needed to hit the fractional amount.
+ */
+ free_memory = arc_available_memory();
+
+ int64_t to_free =
+ (arc_c >> arc_shrink_shift) - free_memory;
+ if (to_free > 0) {
+#ifdef _KERNEL
+#ifdef illumos
+ to_free = MAX(to_free, ptob(needfree));
+#endif
+#endif
+ arc_reduce_target_size(to_free);
+ }
+}
+
+static u_int arc_dnlc_evicts_arg;
+extern struct vfsops zfs_vfsops;
+
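+/*
+ * FreeBSD substitute for the illumos DNLC reduction path: this thread
+ * waits for dnlc_reduce_cache() to post a percentage and then asks
+ * vnlru_free() to release that fraction of desiredvnodes, dropping
+ * the holds those vnodes have on ARC buffers.
+ */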
+static void
+arc_dnlc_evicts_thread(void *dummy __unused)
+{
+ callb_cpr_t cpr;
+ u_int percent;
+
+ CALLB_CPR_INIT(&cpr, &arc_dnlc_evicts_lock, callb_generic_cpr, FTAG);
+
+ mutex_enter(&arc_dnlc_evicts_lock);
+ while (!arc_dnlc_evicts_thread_exit) {
+ CALLB_CPR_SAFE_BEGIN(&cpr);
+ (void) cv_wait(&arc_dnlc_evicts_cv, &arc_dnlc_evicts_lock);
+ CALLB_CPR_SAFE_END(&cpr, &arc_dnlc_evicts_lock);
+ if (arc_dnlc_evicts_arg != 0) {
+ percent = arc_dnlc_evicts_arg;
+ mutex_exit(&arc_dnlc_evicts_lock);
+#ifdef _KERNEL
+ vnlru_free(desiredvnodes * percent / 100, &zfs_vfsops);
+#endif
+ mutex_enter(&arc_dnlc_evicts_lock);
+ /*
+ * Clear our token only after vnlru_free()
+ * pass is done, to avoid false queueing of
+ * the requests.
+ */
+ arc_dnlc_evicts_arg = 0;
+ }
+ }
+ arc_dnlc_evicts_thread_exit = FALSE;
+ cv_broadcast(&arc_dnlc_evicts_cv);
+ CALLB_CPR_EXIT(&cpr);
+ thread_exit();
+}
+
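+/*
+ * Request that the vnode-eviction thread above free "arg" percent of
+ * the vnode cache. Requests that arrive while one is already pending
+ * are dropped, to avoid queueing redundant passes.
+ */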
+void
+dnlc_reduce_cache(void *arg)
+{
+ u_int percent;
+
+ percent = (u_int)(uintptr_t)arg;
+ mutex_enter(&arc_dnlc_evicts_lock);
+ if (arc_dnlc_evicts_arg == 0) {
+ arc_dnlc_evicts_arg = percent;
+ cv_broadcast(&arc_dnlc_evicts_cv);
+ }
+ mutex_exit(&arc_dnlc_evicts_lock);
+}
+
+/*
+ * Adapt arc info given the number of bytes we are trying to add and
+ * the state that we are coming from. This function is only called
+ * when we are adding new content to the cache.
+ */
+static void
+arc_adapt(int bytes, arc_state_t *state)
+{
+ int mult;
+ uint64_t arc_p_min = (arc_c >> arc_p_min_shift);
+ int64_t mrug_size = zfs_refcount_count(&arc_mru_ghost->arcs_size);
+ int64_t mfug_size = zfs_refcount_count(&arc_mfu_ghost->arcs_size);
+
+ if (state == arc_l2c_only)
+ return;
+
+ ASSERT(bytes > 0);
+ /*
+ * Adapt the target size of the MRU list:
+ * - if we just hit in the MRU ghost list, then increase
+ * the target size of the MRU list.
+ * - if we just hit in the MFU ghost list, then increase
+ * the target size of the MFU list by decreasing the
+ * target size of the MRU list.
+ */
+ if (state == arc_mru_ghost) {
+ mult = (mrug_size >= mfug_size) ? 1 : (mfug_size / mrug_size);
+ mult = MIN(mult, 10); /* avoid wild arc_p adjustment */
+
+ arc_p = MIN(arc_c - arc_p_min, arc_p + bytes * mult);
+ } else if (state == arc_mfu_ghost) {
+ uint64_t delta;
+
+ mult = (mfug_size >= mrug_size) ? 1 : (mrug_size / mfug_size);
+ mult = MIN(mult, 10);
+
+ delta = MIN(bytes * mult, arc_p);
+ arc_p = MAX(arc_p_min, arc_p - delta);
+ }
+ ASSERT((int64_t)arc_p >= 0);
+
+ /*
+ * Wake reap thread if we do not have any available memory
+ */
+ if (arc_reclaim_needed()) {
+ zthr_wakeup(arc_reap_zthr);
+ return;
+ }
+
+ if (arc_no_grow)
+ return;
+
+ if (arc_c >= arc_c_max)
+ return;
+
+ /*
+ * If we're within (2 * maxblocksize) bytes of the target
+ * cache size, increment the target cache size
+ */
+ if (aggsum_compare(&arc_size, arc_c - (2ULL << SPA_MAXBLOCKSHIFT)) >
+ 0) {
+ DTRACE_PROBE1(arc__inc_adapt, int, bytes);
+ atomic_add_64(&arc_c, (int64_t)bytes);
+ if (arc_c > arc_c_max)
+ arc_c = arc_c_max;
+ else if (state == arc_anon)
+ atomic_add_64(&arc_p, (int64_t)bytes);
+ if (arc_p > arc_c)
+ arc_p = arc_c;
+ }
+ ASSERT((int64_t)arc_p >= 0);
+}
+
+/*
+ * Check if arc_size has grown past our upper threshold, determined by
+ * zfs_arc_overflow_shift.
+ */
+static boolean_t
+arc_is_overflowing(void)
+{
+ /* Always allow at least one block of overflow */
+ int64_t overflow = MAX(SPA_MAXBLOCKSIZE,
+ arc_c >> zfs_arc_overflow_shift);
+
+ /*
+ * We just compare the lower bound here for performance reasons. Our
+ * primary goals are to make sure that the arc never grows without
+ * bound, and that it can reach its maximum size. This check
+ * accomplishes both goals. The maximum amount we could run over by is
+ * 2 * aggsum_borrow_multiplier * NUM_CPUS * the average size of a block
+ * in the ARC. In practice, that's in the tens of MB, which is low
+ * enough to be safe.
+ */
+ return (aggsum_lower_bound(&arc_size) >= (int64_t)arc_c + overflow);
+}
+
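+/*
+ * Allocate an ABD to hold this header's data, first charging the
+ * allocation to the ARC accounting (and possibly blocking on the
+ * eviction thread) via arc_get_data_impl().
+ */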
+static abd_t *
+arc_get_data_abd(arc_buf_hdr_t *hdr, uint64_t size, void *tag, boolean_t do_adapt)
+{
+ arc_buf_contents_t type = arc_buf_type(hdr);
+
+ arc_get_data_impl(hdr, size, tag, do_adapt);
+ if (type == ARC_BUFC_METADATA) {
+ return (abd_alloc(size, B_TRUE));
+ } else {
+ ASSERT(type == ARC_BUFC_DATA);
+ return (abd_alloc(size, B_FALSE));
+ }
+}
+
+static void *
+arc_get_data_buf(arc_buf_hdr_t *hdr, uint64_t size, void *tag)
+{
+ arc_buf_contents_t type = arc_buf_type(hdr);
+
+ arc_get_data_impl(hdr, size, tag, B_TRUE);
+ if (type == ARC_BUFC_METADATA) {
+ return (zio_buf_alloc(size));
+ } else {
+ ASSERT(type == ARC_BUFC_DATA);
+ return (zio_data_buf_alloc(size));
+ }
+}
+
+/*
+ * Allocate a block and return it to the caller. If we are hitting the
+ * hard limit for the cache size, we must sleep, waiting for the eviction
+ * thread to catch up. If we're past the target size but below the hard
+ * limit, we'll only signal the reclaim thread and continue on.
+ */
+static void
+arc_get_data_impl(arc_buf_hdr_t *hdr, uint64_t size, void *tag, boolean_t do_adapt)
+{
+ arc_state_t *state = hdr->b_l1hdr.b_state;
+ arc_buf_contents_t type = arc_buf_type(hdr);
+
+ if (do_adapt)
+ arc_adapt(size, state);
+
+ /*
+ * If arc_size is currently overflowing, and has grown past our
+ * upper limit, we must be adding data faster than the evict
+ * thread can evict. Thus, to ensure we don't compound the
+ * problem by adding more data and forcing arc_size to grow even
+ * further past its target size, we halt and wait for the
+ * eviction thread to catch up.
+ *
+ * It's also possible that the reclaim thread is unable to evict
+ * enough buffers to get arc_size below the overflow limit (e.g.
+ * due to buffers being un-evictable, or hash lock collisions).
+ * In this case, we want to proceed regardless if we're
+ * overflowing; thus we don't use a while loop here.
+ */
+ if (arc_is_overflowing()) {
+ mutex_enter(&arc_adjust_lock);
+
+ /*
+ * Now that we've acquired the lock, we may no longer be
+ * over the overflow limit, let's check.
+ *
+ * We're ignoring the case of spurious wake ups. If that
+ * were to happen, it'd let this thread consume an ARC
+ * buffer before it should have (i.e. before we're under
+ * the overflow limit and were signalled by the reclaim
+ * thread). As long as that is a rare occurrence, it
+ * shouldn't cause any harm.
+ */
+ if (arc_is_overflowing()) {
+ arc_adjust_needed = B_TRUE;
+ zthr_wakeup(arc_adjust_zthr);
+ (void) cv_wait(&arc_adjust_waiters_cv,
+ &arc_adjust_lock);
+ }
+ mutex_exit(&arc_adjust_lock);
+ }
+
+ VERIFY3U(hdr->b_type, ==, type);
+ if (type == ARC_BUFC_METADATA) {
+ arc_space_consume(size, ARC_SPACE_META);
+ } else {
+ arc_space_consume(size, ARC_SPACE_DATA);
+ }
+
+ /*
+ * Update the state size. Note that ghost states have a
+ * "ghost size" and so don't need to be updated.
+ */
+ if (!GHOST_STATE(state)) {
+
+ (void) zfs_refcount_add_many(&state->arcs_size, size, tag);
+
+ /*
+ * If this is reached via arc_read, the link is
+ * protected by the hash lock. If reached via
+ * arc_buf_alloc, the header should not be accessed by
+ * any other thread. And, if reached via arc_read_done,
+ * the hash lock will protect it if it's found in the
+ * hash table; otherwise no other thread should be
+ * trying to [add|remove]_reference it.
+ */
+ if (multilist_link_active(&hdr->b_l1hdr.b_arc_node)) {
+ ASSERT(zfs_refcount_is_zero(&hdr->b_l1hdr.b_refcnt));
+ (void) zfs_refcount_add_many(&state->arcs_esize[type],
+ size, tag);
+ }
+
+ /*
+ * If we are growing the cache, and we are adding anonymous
+ * data, and we have outgrown arc_p, update arc_p
+ */
+ if (aggsum_upper_bound(&arc_size) < arc_c &&
+ hdr->b_l1hdr.b_state == arc_anon &&
+ (zfs_refcount_count(&arc_anon->arcs_size) +
+ zfs_refcount_count(&arc_mru->arcs_size) > arc_p))
+ arc_p = MIN(arc_c, arc_p + size);
+ }
+ ARCSTAT_BUMP(arcstat_allocated);
+}
+
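+/*
+ * Free an ABD previously allocated by arc_get_data_abd(), crediting
+ * the freed space back to the ARC accounting.
+ */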
+static void
+arc_free_data_abd(arc_buf_hdr_t *hdr, abd_t *abd, uint64_t size, void *tag)
+{
+ arc_free_data_impl(hdr, size, tag);
+ abd_free(abd);
+}
+
+static void
+arc_free_data_buf(arc_buf_hdr_t *hdr, void *buf, uint64_t size, void *tag)
+{
+ arc_buf_contents_t type = arc_buf_type(hdr);
+
+ arc_free_data_impl(hdr, size, tag);
+ if (type == ARC_BUFC_METADATA) {
+ zio_buf_free(buf, size);
+ } else {
+ ASSERT(type == ARC_BUFC_DATA);
+ zio_data_buf_free(buf, size);
+ }
+}
+
+/*
+ * Free the arc data buffer.
+ */
+static void
+arc_free_data_impl(arc_buf_hdr_t *hdr, uint64_t size, void *tag)
+{
+ arc_state_t *state = hdr->b_l1hdr.b_state;
+ arc_buf_contents_t type = arc_buf_type(hdr);
+
+ /* protected by hash lock, if in the hash table */
+ if (multilist_link_active(&hdr->b_l1hdr.b_arc_node)) {
+ ASSERT(zfs_refcount_is_zero(&hdr->b_l1hdr.b_refcnt));
+ ASSERT(state != arc_anon && state != arc_l2c_only);
+
+ (void) zfs_refcount_remove_many(&state->arcs_esize[type],
+ size, tag);
+ }
+ (void) zfs_refcount_remove_many(&state->arcs_size, size, tag);
+
+ VERIFY3U(hdr->b_type, ==, type);
+ if (type == ARC_BUFC_METADATA) {
+ arc_space_return(size, ARC_SPACE_META);
+ } else {
+ ASSERT(type == ARC_BUFC_DATA);
+ arc_space_return(size, ARC_SPACE_DATA);
+ }
+}
+
+/*
+ * This routine is called whenever a buffer is accessed.
+ * NOTE: the hash lock is dropped in this function.
+ */
+static void
+arc_access(arc_buf_hdr_t *hdr, kmutex_t *hash_lock)
+{
+ clock_t now;
+
+ ASSERT(MUTEX_HELD(hash_lock));
+ ASSERT(HDR_HAS_L1HDR(hdr));
+
+ if (hdr->b_l1hdr.b_state == arc_anon) {
+ /*
+ * This buffer is not in the cache, and does not
+ * appear in our "ghost" list. Add the new buffer
+ * to the MRU state.
+ */
+
+ ASSERT0(hdr->b_l1hdr.b_arc_access);
+ hdr->b_l1hdr.b_arc_access = ddi_get_lbolt();
+ DTRACE_PROBE1(new_state__mru, arc_buf_hdr_t *, hdr);
+ arc_change_state(arc_mru, hdr, hash_lock);
+
+ } else if (hdr->b_l1hdr.b_state == arc_mru) {
+ now = ddi_get_lbolt();
+
+ /*
+ * If this buffer is here because of a prefetch, then either:
+ * - clear the flag if this is a "referencing" read
+ * (any subsequent access will bump this into the MFU state).
+ * or
+ * - move the buffer to the head of the list if this is
+ * another prefetch (to make it less likely to be evicted).
+ */
+ if (HDR_PREFETCH(hdr) || HDR_PRESCIENT_PREFETCH(hdr)) {
+ if (zfs_refcount_count(&hdr->b_l1hdr.b_refcnt) == 0) {
+ /* link protected by hash lock */
+ ASSERT(multilist_link_active(
+ &hdr->b_l1hdr.b_arc_node));
+ } else {
+ arc_hdr_clear_flags(hdr,
+ ARC_FLAG_PREFETCH |
+ ARC_FLAG_PRESCIENT_PREFETCH);
+ ARCSTAT_BUMP(arcstat_mru_hits);
+ }
+ hdr->b_l1hdr.b_arc_access = now;
+ return;
+ }
+
+ /*
+ * This buffer has been "accessed" only once so far,
+ * but it is still in the cache. Move it to the MFU
+ * state.
+ */
+ if (now > hdr->b_l1hdr.b_arc_access + ARC_MINTIME) {
+ /*
+ * More than 125ms have passed since we
+ * instantiated this buffer. Move it to the
+ * most frequently used state.
+ */
+ hdr->b_l1hdr.b_arc_access = now;
+ DTRACE_PROBE1(new_state__mfu, arc_buf_hdr_t *, hdr);
+ arc_change_state(arc_mfu, hdr, hash_lock);
+ }
+ atomic_inc_32(&hdr->b_l1hdr.b_mru_hits);
+ ARCSTAT_BUMP(arcstat_mru_hits);
+ } else if (hdr->b_l1hdr.b_state == arc_mru_ghost) {
+ arc_state_t *new_state;
+ /*
+ * This buffer has been "accessed" recently, but
+ * was evicted from the cache. Move it to the
+ * MFU state.
+ */
+
+ if (HDR_PREFETCH(hdr) || HDR_PRESCIENT_PREFETCH(hdr)) {
+ new_state = arc_mru;
+ if (zfs_refcount_count(&hdr->b_l1hdr.b_refcnt) > 0) {
+ arc_hdr_clear_flags(hdr,
+ ARC_FLAG_PREFETCH |
+ ARC_FLAG_PRESCIENT_PREFETCH);
+ }
+ DTRACE_PROBE1(new_state__mru, arc_buf_hdr_t *, hdr);
+ } else {
+ new_state = arc_mfu;
+ DTRACE_PROBE1(new_state__mfu, arc_buf_hdr_t *, hdr);
+ }
+
+ hdr->b_l1hdr.b_arc_access = ddi_get_lbolt();
+ arc_change_state(new_state, hdr, hash_lock);
+
+ atomic_inc_32(&hdr->b_l1hdr.b_mru_ghost_hits);
+ ARCSTAT_BUMP(arcstat_mru_ghost_hits);
+ } else if (hdr->b_l1hdr.b_state == arc_mfu) {
+ /*
+ * This buffer has been accessed more than once and is
+ * still in the cache. Keep it in the MFU state.
+ *
+ * NOTE: an add_reference() that occurred when we did
+ * the arc_read() will have kicked this off the list.
+ * If it was a prefetch, we will explicitly move it to
+ * the head of the list now.
+ */
+
+ atomic_inc_32(&hdr->b_l1hdr.b_mfu_hits);
+ ARCSTAT_BUMP(arcstat_mfu_hits);
+ hdr->b_l1hdr.b_arc_access = ddi_get_lbolt();
+ } else if (hdr->b_l1hdr.b_state == arc_mfu_ghost) {
+ arc_state_t *new_state = arc_mfu;
+ /*
+ * This buffer has been accessed more than once but has
+ * been evicted from the cache. Move it back to the
+ * MFU state.
+ */
+
+ if (HDR_PREFETCH(hdr) || HDR_PRESCIENT_PREFETCH(hdr)) {
+ /*
+ * This is a prefetch access...
+ * move this block back to the MRU state.
+ */
+ new_state = arc_mru;
+ }
+
+ hdr->b_l1hdr.b_arc_access = ddi_get_lbolt();
+ DTRACE_PROBE1(new_state__mfu, arc_buf_hdr_t *, hdr);
+ arc_change_state(new_state, hdr, hash_lock);
+
+ atomic_inc_32(&hdr->b_l1hdr.b_mfu_ghost_hits);
+ ARCSTAT_BUMP(arcstat_mfu_ghost_hits);
+ } else if (hdr->b_l1hdr.b_state == arc_l2c_only) {
+ /*
+ * This buffer is on the 2nd Level ARC.
+ */
+
+ hdr->b_l1hdr.b_arc_access = ddi_get_lbolt();
+ DTRACE_PROBE1(new_state__mfu, arc_buf_hdr_t *, hdr);
+ arc_change_state(arc_mfu, hdr, hash_lock);
+ } else {
+ ASSERT(!"invalid arc state");
+ }
+}
+
+/*
+ * This routine is called by dbuf_hold() to update the arc_access() state
+ * which otherwise would be skipped for entries in the dbuf cache.
+ */
+void
+arc_buf_access(arc_buf_t *buf)
+{
+ mutex_enter(&buf->b_evict_lock);
+ arc_buf_hdr_t *hdr = buf->b_hdr;
+
+ /*
+ * Avoid taking the hash_lock when possible as an optimization.
+ * The header must be checked again under the hash_lock in order
+ * to handle the case where it is concurrently being released.
+ */
+ if (hdr->b_l1hdr.b_state == arc_anon || HDR_EMPTY(hdr)) {
+ mutex_exit(&buf->b_evict_lock);
+ ARCSTAT_BUMP(arcstat_access_skip);
+ return;
+ }
+
+ kmutex_t *hash_lock = HDR_LOCK(hdr);
+ mutex_enter(hash_lock);
+
+ if (hdr->b_l1hdr.b_state == arc_anon || HDR_EMPTY(hdr)) {
+ mutex_exit(hash_lock);
+ mutex_exit(&buf->b_evict_lock);
+ ARCSTAT_BUMP(arcstat_access_skip);
+ return;
+ }
+
+ mutex_exit(&buf->b_evict_lock);
+
+ ASSERT(hdr->b_l1hdr.b_state == arc_mru ||
+ hdr->b_l1hdr.b_state == arc_mfu);
+
+ DTRACE_PROBE1(arc__hit, arc_buf_hdr_t *, hdr);
+ arc_access(hdr, hash_lock);
+ mutex_exit(hash_lock);
+
+ ARCSTAT_BUMP(arcstat_hits);
+ ARCSTAT_CONDSTAT(!HDR_PREFETCH(hdr),
+ demand, prefetch, !HDR_ISTYPE_METADATA(hdr), data, metadata, hits);
+}
+
+/* a generic arc_read_done_func_t which you can use */
+/* ARGSUSED */
+void
+arc_bcopy_func(zio_t *zio, const zbookmark_phys_t *zb, const blkptr_t *bp,
+ arc_buf_t *buf, void *arg)
+{
+ if (buf == NULL)
+ return;
+
+ bcopy(buf->b_data, arg, arc_buf_size(buf));
+ arc_buf_destroy(buf, arg);
+}
+
+/* a generic arc_read_done_func_t */
+/* ARGSUSED */
+void
+arc_getbuf_func(zio_t *zio, const zbookmark_phys_t *zb, const blkptr_t *bp,
+ arc_buf_t *buf, void *arg)
+{
+ arc_buf_t **bufp = arg;
+ if (buf == NULL) {
+ ASSERT(zio == NULL || zio->io_error != 0);
+ *bufp = NULL;
+ } else {
+ ASSERT(zio == NULL || zio->io_error == 0);
+ *bufp = buf;
+ ASSERT(buf->b_data != NULL);
+ }
+}
+
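+/*
+ * Assert that the sizes and compression setting cached in the header
+ * agree with the block pointer it was read from.
+ */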
+static void
+arc_hdr_verify(arc_buf_hdr_t *hdr, blkptr_t *bp)
+{
+ if (BP_IS_HOLE(bp) || BP_IS_EMBEDDED(bp)) {
+ ASSERT3U(HDR_GET_PSIZE(hdr), ==, 0);
+ ASSERT3U(HDR_GET_COMPRESS(hdr), ==, ZIO_COMPRESS_OFF);
+ } else {
+ if (HDR_COMPRESSION_ENABLED(hdr)) {
+ ASSERT3U(HDR_GET_COMPRESS(hdr), ==,
+ BP_GET_COMPRESS(bp));
+ }
+ ASSERT3U(HDR_GET_LSIZE(hdr), ==, BP_GET_LSIZE(bp));
+ ASSERT3U(HDR_GET_PSIZE(hdr), ==, BP_GET_PSIZE(bp));
+ }
+}
+
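+/*
+ * zio completion callback for ARC reads. Re-find the header in the
+ * hash table, byteswap the data if needed, build a buf for each
+ * waiting callback, and then invoke the callbacks' "done" functions.
+ */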
+static void
+arc_read_done(zio_t *zio)
+{
+ arc_buf_hdr_t *hdr = zio->io_private;
+ kmutex_t *hash_lock = NULL;
+ arc_callback_t *callback_list;
+ arc_callback_t *acb;
+ boolean_t freeable = B_FALSE;
+ boolean_t no_zio_error = (zio->io_error == 0);
+
+ /*
+ * The hdr was inserted into hash-table and removed from lists
+ * prior to starting I/O. We should find this header, since
+ * it's in the hash table, and it should be legit since it's
+ * not possible to evict it during the I/O. The only possible
+ * reason for it not to be found is if we were freed during the
+ * read.
+ */
+ if (HDR_IN_HASH_TABLE(hdr)) {
+ ASSERT3U(hdr->b_birth, ==, BP_PHYSICAL_BIRTH(zio->io_bp));
+ ASSERT3U(hdr->b_dva.dva_word[0], ==,
+ BP_IDENTITY(zio->io_bp)->dva_word[0]);
+ ASSERT3U(hdr->b_dva.dva_word[1], ==,
+ BP_IDENTITY(zio->io_bp)->dva_word[1]);
+
+ arc_buf_hdr_t *found = buf_hash_find(hdr->b_spa, zio->io_bp,
+ &hash_lock);
+
+ ASSERT((found == hdr &&
+ DVA_EQUAL(&hdr->b_dva, BP_IDENTITY(zio->io_bp))) ||
+ (found == hdr && HDR_L2_READING(hdr)));
+ ASSERT3P(hash_lock, !=, NULL);
+ }
+
+ if (no_zio_error) {
+ /* byteswap if necessary */
+ if (BP_SHOULD_BYTESWAP(zio->io_bp)) {
+ if (BP_GET_LEVEL(zio->io_bp) > 0) {
+ hdr->b_l1hdr.b_byteswap = DMU_BSWAP_UINT64;
+ } else {
+ hdr->b_l1hdr.b_byteswap =
+ DMU_OT_BYTESWAP(BP_GET_TYPE(zio->io_bp));
+ }
+ } else {
+ hdr->b_l1hdr.b_byteswap = DMU_BSWAP_NUMFUNCS;
+ }
+ }
+
+ arc_hdr_clear_flags(hdr, ARC_FLAG_L2_EVICTED);
+ if (l2arc_noprefetch && HDR_PREFETCH(hdr))
+ arc_hdr_clear_flags(hdr, ARC_FLAG_L2CACHE);
+
+ callback_list = hdr->b_l1hdr.b_acb;
+ ASSERT3P(callback_list, !=, NULL);
+
+ if (hash_lock && no_zio_error && hdr->b_l1hdr.b_state == arc_anon) {
+ /*
+ * Only call arc_access on anonymous buffers. This is because
+ * if we've issued an I/O for an evicted buffer, we've already
+ * called arc_access (to prevent any simultaneous readers from
+ * getting confused).
+ */
+ arc_access(hdr, hash_lock);
+ }
+
+ /*
+ * If a read request has a callback (i.e. acb_done is not NULL), then we
+ * make a buf containing the data according to the parameters which were
+ * passed in. The implementation of arc_buf_alloc_impl() ensures that we
+ * aren't needlessly decompressing the data multiple times.
+ */
+ int callback_cnt = 0;
+ for (acb = callback_list; acb != NULL; acb = acb->acb_next) {
+ if (!acb->acb_done)
+ continue;
+
+ callback_cnt++;
+
+ if (no_zio_error) {
+ int error = arc_buf_alloc_impl(hdr, acb->acb_private,
+ acb->acb_compressed, zio->io_error == 0,
+ &acb->acb_buf);
+ if (error != 0) {
+ /*
+ * Decompression failed. Set io_error
+ * so that when we call acb_done (below),
+ * we will indicate that the read failed.
+ * Note that in the unusual case where one
+ * callback is compressed and another
+ * uncompressed, we will mark all of them
+ * as failed, even though the uncompressed
+ * one can't actually fail. In this case,
+ * the hdr will not be anonymous, because
+ * if there are multiple callbacks, it's
+ * because multiple threads found the same
+ * arc buf in the hash table.
+ */
+ zio->io_error = error;
+ }
+ }
+ }
+ /*
+ * If there are multiple callbacks, we must have the hash lock,
+ * because the only way for multiple threads to find this hdr is
+ * in the hash table. This ensures that if there are multiple
+ * callbacks, the hdr is not anonymous. If it were anonymous,
+ * we couldn't use arc_buf_destroy() in the error case below.
+ */
+ ASSERT(callback_cnt < 2 || hash_lock != NULL);
+
+ hdr->b_l1hdr.b_acb = NULL;
+ arc_hdr_clear_flags(hdr, ARC_FLAG_IO_IN_PROGRESS);
+ if (callback_cnt == 0) {
+ ASSERT(HDR_PREFETCH(hdr));
+ ASSERT0(hdr->b_l1hdr.b_bufcnt);
+ ASSERT3P(hdr->b_l1hdr.b_pabd, !=, NULL);
+ }
+
+ ASSERT(zfs_refcount_is_zero(&hdr->b_l1hdr.b_refcnt) ||
+ callback_list != NULL);
+
+ if (no_zio_error) {
+ arc_hdr_verify(hdr, zio->io_bp);
+ } else {
+ arc_hdr_set_flags(hdr, ARC_FLAG_IO_ERROR);
+ if (hdr->b_l1hdr.b_state != arc_anon)
+ arc_change_state(arc_anon, hdr, hash_lock);
+ if (HDR_IN_HASH_TABLE(hdr))
+ buf_hash_remove(hdr);
+ freeable = zfs_refcount_is_zero(&hdr->b_l1hdr.b_refcnt);
+ }
+
+ /*
+ * Broadcast before we drop the hash_lock to avoid the possibility
+ * that the hdr (and hence the cv) might be freed before we get to
+ * the cv_broadcast().
+ */
+ cv_broadcast(&hdr->b_l1hdr.b_cv);
+
+ if (hash_lock != NULL) {
+ mutex_exit(hash_lock);
+ } else {
+ /*
+ * This block was freed while we waited for the read to
+ * complete. It has been removed from the hash table and
+ * moved to the anonymous state (so that it won't show up
+ * in the cache).
+ */
+ ASSERT3P(hdr->b_l1hdr.b_state, ==, arc_anon);
+ freeable = zfs_refcount_is_zero(&hdr->b_l1hdr.b_refcnt);
+ }
+
+ /* execute each callback and free its structure */
+ while ((acb = callback_list) != NULL) {
+ if (acb->acb_done != NULL) {
+ if (zio->io_error != 0 && acb->acb_buf != NULL) {
+ /*
+ * If arc_buf_alloc_impl() fails during
+ * decompression, the buf will still be
+ * allocated, and needs to be freed here.
+ */
+ arc_buf_destroy(acb->acb_buf, acb->acb_private);
+ acb->acb_buf = NULL;
+ }
+ acb->acb_done(zio, &zio->io_bookmark, zio->io_bp,
+ acb->acb_buf, acb->acb_private);
+ }
+
+ if (acb->acb_zio_dummy != NULL) {
+ acb->acb_zio_dummy->io_error = zio->io_error;
+ zio_nowait(acb->acb_zio_dummy);
+ }
+
+ callback_list = acb->acb_next;
+ kmem_free(acb, sizeof (arc_callback_t));
+ }
+
+ if (freeable)
+ arc_hdr_destroy(hdr);
+}
+
+/*
+ * "Read" the block at the specified DVA (in bp) via the
+ * cache. If the block is found in the cache, invoke the provided
+ * callback immediately and return. Note that the `zio' parameter
+ * in the callback will be NULL in this case, since no IO was
+ required. If the block is not in the cache, pass the read request
+ * on to the spa with a substitute callback function, so that the
+ * requested block will be added to the cache.
+ *
+ * If a read request arrives for a block that has a read in-progress,
+ * either wait for the in-progress read to complete (and return the
+ * results); or, if this is a read with a "done" func, add a record
+ * to the read to invoke the "done" func when the read completes,
+ * and return; or just return.
+ *
+ * arc_read_done() will invoke all the requested "done" functions
+ * for readers of this block.
+ */
+int
+arc_read(zio_t *pio, spa_t *spa, const blkptr_t *bp, arc_read_done_func_t *done,
+ void *private, zio_priority_t priority, int zio_flags,
+ arc_flags_t *arc_flags, const zbookmark_phys_t *zb)
+{
+ arc_buf_hdr_t *hdr = NULL;
+ kmutex_t *hash_lock = NULL;
+ zio_t *rzio;
+ uint64_t guid = spa_load_guid(spa);
+ boolean_t compressed_read = (zio_flags & ZIO_FLAG_RAW) != 0;
+ int rc = 0;
+
+ ASSERT(!BP_IS_EMBEDDED(bp) ||
+ BPE_GET_ETYPE(bp) == BP_EMBEDDED_TYPE_DATA);
+
+top:
+ if (!BP_IS_EMBEDDED(bp)) {
+ /*
+ * Embedded BP's have no DVA and require no I/O to "read";
+ * they are backed by an anonymous arc buf created below, so
+ * only real blocks are looked up in the hash table.
+ */
+ hdr = buf_hash_find(guid, bp, &hash_lock);
+ }
+
+ if (hdr != NULL && HDR_HAS_L1HDR(hdr) && hdr->b_l1hdr.b_pabd != NULL) {
+ arc_buf_t *buf = NULL;
+ *arc_flags |= ARC_FLAG_CACHED;
+
+ if (HDR_IO_IN_PROGRESS(hdr)) {
+ zio_t *head_zio = hdr->b_l1hdr.b_acb->acb_zio_head;
+
+ ASSERT3P(head_zio, !=, NULL);
+ if ((hdr->b_flags & ARC_FLAG_PRIO_ASYNC_READ) &&
+ priority == ZIO_PRIORITY_SYNC_READ) {
+ /*
+ * This is a sync read that needs to wait for
+ * an in-flight async read. Request that the
+ * zio have its priority upgraded.
+ */
+ zio_change_priority(head_zio, priority);
+ DTRACE_PROBE1(arc__async__upgrade__sync,
+ arc_buf_hdr_t *, hdr);
+ ARCSTAT_BUMP(arcstat_async_upgrade_sync);
+ }
+ if (hdr->b_flags & ARC_FLAG_PREDICTIVE_PREFETCH) {
+ arc_hdr_clear_flags(hdr,
+ ARC_FLAG_PREDICTIVE_PREFETCH);
+ }
+
+ if (*arc_flags & ARC_FLAG_WAIT) {
+ cv_wait(&hdr->b_l1hdr.b_cv, hash_lock);
+ mutex_exit(hash_lock);
+ goto top;
+ }
+ ASSERT(*arc_flags & ARC_FLAG_NOWAIT);
+
+ if (done) {
+ arc_callback_t *acb = NULL;
+
+ acb = kmem_zalloc(sizeof (arc_callback_t),
+ KM_SLEEP);
+ acb->acb_done = done;
+ acb->acb_private = private;
+ acb->acb_compressed = compressed_read;
+ if (pio != NULL)
+ acb->acb_zio_dummy = zio_null(pio,
+ spa, NULL, NULL, NULL, zio_flags);
+
+ ASSERT3P(acb->acb_done, !=, NULL);
+ acb->acb_zio_head = head_zio;
+ acb->acb_next = hdr->b_l1hdr.b_acb;
+ hdr->b_l1hdr.b_acb = acb;
+ mutex_exit(hash_lock);
+ return (0);
+ }
+ mutex_exit(hash_lock);
+ return (0);
+ }
+
+ ASSERT(hdr->b_l1hdr.b_state == arc_mru ||
+ hdr->b_l1hdr.b_state == arc_mfu);
+
+ if (done) {
+ if (hdr->b_flags & ARC_FLAG_PREDICTIVE_PREFETCH) {
+ /*
+ * This is a demand read which does not have to
+ * wait for i/o because we did a predictive
+ * prefetch i/o for it, which has completed.
+ */
+ DTRACE_PROBE1(
+ arc__demand__hit__predictive__prefetch,
+ arc_buf_hdr_t *, hdr);
+ ARCSTAT_BUMP(
+ arcstat_demand_hit_predictive_prefetch);
+ arc_hdr_clear_flags(hdr,
+ ARC_FLAG_PREDICTIVE_PREFETCH);
+ }
+
+ if (hdr->b_flags & ARC_FLAG_PRESCIENT_PREFETCH) {
+ ARCSTAT_BUMP(
+ arcstat_demand_hit_prescient_prefetch);
+ arc_hdr_clear_flags(hdr,
+ ARC_FLAG_PRESCIENT_PREFETCH);
+ }
+
+ ASSERT(!BP_IS_EMBEDDED(bp) || !BP_IS_HOLE(bp));
+ /* Get a buf with the desired data in it. */
+ rc = arc_buf_alloc_impl(hdr, private,
+ compressed_read, B_TRUE, &buf);
+ if (rc != 0) {
+ arc_buf_destroy(buf, private);
+ buf = NULL;
+ }
+ ASSERT((zio_flags & ZIO_FLAG_SPECULATIVE) ||
+ rc == 0 || rc != ENOENT);
+ } else if (*arc_flags & ARC_FLAG_PREFETCH &&
+ zfs_refcount_count(&hdr->b_l1hdr.b_refcnt) == 0) {
+ arc_hdr_set_flags(hdr, ARC_FLAG_PREFETCH);
+ }
+ DTRACE_PROBE1(arc__hit, arc_buf_hdr_t *, hdr);
+ arc_access(hdr, hash_lock);
+ if (*arc_flags & ARC_FLAG_PRESCIENT_PREFETCH)
+ arc_hdr_set_flags(hdr, ARC_FLAG_PRESCIENT_PREFETCH);
+ if (*arc_flags & ARC_FLAG_L2CACHE)
+ arc_hdr_set_flags(hdr, ARC_FLAG_L2CACHE);
+ mutex_exit(hash_lock);
+ ARCSTAT_BUMP(arcstat_hits);
+ ARCSTAT_CONDSTAT(!HDR_PREFETCH(hdr),
+ demand, prefetch, !HDR_ISTYPE_METADATA(hdr),
+ data, metadata, hits);
+
+ if (done)
+ done(NULL, zb, bp, buf, private);
+ } else {
+ uint64_t lsize = BP_GET_LSIZE(bp);
+ uint64_t psize = BP_GET_PSIZE(bp);
+ arc_callback_t *acb;
+ vdev_t *vd = NULL;
+ uint64_t addr = 0;
+ boolean_t devw = B_FALSE;
+ uint64_t size;
+
+ if (hdr == NULL) {
+ /* this block is not in the cache */
+ arc_buf_hdr_t *exists = NULL;
+ arc_buf_contents_t type = BP_GET_BUFC_TYPE(bp);
+ hdr = arc_hdr_alloc(spa_load_guid(spa), psize, lsize,
+ BP_GET_COMPRESS(bp), type);
+
+ if (!BP_IS_EMBEDDED(bp)) {
+ hdr->b_dva = *BP_IDENTITY(bp);
+ hdr->b_birth = BP_PHYSICAL_BIRTH(bp);
+ exists = buf_hash_insert(hdr, &hash_lock);
+ }
+ if (exists != NULL) {
+ /* somebody beat us to the hash insert */
+ mutex_exit(hash_lock);
+ buf_discard_identity(hdr);
+ arc_hdr_destroy(hdr);
+ goto top; /* restart the IO request */
+ }
+ } else {
+ /*
+ * This block is in the ghost cache. If it was L2-only
+ * (and thus didn't have an L1 hdr), we realloc the
+ * header to add an L1 hdr.
+ */
+ if (!HDR_HAS_L1HDR(hdr)) {
+ hdr = arc_hdr_realloc(hdr, hdr_l2only_cache,
+ hdr_full_cache);
+ }
+ ASSERT3P(hdr->b_l1hdr.b_pabd, ==, NULL);
+ ASSERT(GHOST_STATE(hdr->b_l1hdr.b_state));
+ ASSERT(!HDR_IO_IN_PROGRESS(hdr));
+ ASSERT(zfs_refcount_is_zero(&hdr->b_l1hdr.b_refcnt));
+ ASSERT3P(hdr->b_l1hdr.b_buf, ==, NULL);
+ ASSERT3P(hdr->b_l1hdr.b_freeze_cksum, ==, NULL);
+
+ /*
+ * This is a delicate dance that we play here.
+ * This hdr is in the ghost list so we access it
+ * to move it out of the ghost list before we
+ * initiate the read. If it's a prefetch then
+ * it won't have a callback so we'll remove the
+ * reference that arc_buf_alloc_impl() created. We
+ * do this after we've called arc_access() to
+ * avoid hitting an assert in remove_reference().
+ */
+ arc_adapt(arc_hdr_size(hdr), hdr->b_l1hdr.b_state);
+ arc_access(hdr, hash_lock);
+ arc_hdr_alloc_pabd(hdr, B_FALSE);
+ }
+ ASSERT3P(hdr->b_l1hdr.b_pabd, !=, NULL);
+ size = arc_hdr_size(hdr);
+
+ /*
+ * If compression is enabled on the hdr, then we will do
+ * RAW I/O and store the compressed data in the hdr's
+ * data block. Otherwise, the hdr's data block will contain
+ * the uncompressed data.
+ */
+ if (HDR_GET_COMPRESS(hdr) != ZIO_COMPRESS_OFF) {
+ zio_flags |= ZIO_FLAG_RAW;
+ }
+
+ if (*arc_flags & ARC_FLAG_PREFETCH)
+ arc_hdr_set_flags(hdr, ARC_FLAG_PREFETCH);
+ if (*arc_flags & ARC_FLAG_PRESCIENT_PREFETCH)
+ arc_hdr_set_flags(hdr, ARC_FLAG_PRESCIENT_PREFETCH);
+
+ if (*arc_flags & ARC_FLAG_L2CACHE)
+ arc_hdr_set_flags(hdr, ARC_FLAG_L2CACHE);
+ if (BP_GET_LEVEL(bp) > 0)
+ arc_hdr_set_flags(hdr, ARC_FLAG_INDIRECT);
+ if (*arc_flags & ARC_FLAG_PREDICTIVE_PREFETCH)
+ arc_hdr_set_flags(hdr, ARC_FLAG_PREDICTIVE_PREFETCH);
+ ASSERT(!GHOST_STATE(hdr->b_l1hdr.b_state));
+
+ acb = kmem_zalloc(sizeof (arc_callback_t), KM_SLEEP);
+ acb->acb_done = done;
+ acb->acb_private = private;
+ acb->acb_compressed = compressed_read;
+
+ ASSERT3P(hdr->b_l1hdr.b_acb, ==, NULL);
+ hdr->b_l1hdr.b_acb = acb;
+ arc_hdr_set_flags(hdr, ARC_FLAG_IO_IN_PROGRESS);
+
+ if (HDR_HAS_L2HDR(hdr) &&
+ (vd = hdr->b_l2hdr.b_dev->l2ad_vdev) != NULL) {
+ devw = hdr->b_l2hdr.b_dev->l2ad_writing;
+ addr = hdr->b_l2hdr.b_daddr;
+ /*
+ * Lock out L2ARC device removal.
+ */
+ if (vdev_is_dead(vd) ||
+ !spa_config_tryenter(spa, SCL_L2ARC, vd, RW_READER))
+ vd = NULL;
+ }
+
+ /*
+ * We count both async reads and scrub IOs as asynchronous so
+ * that both can be upgraded in the event of a cache hit while
+ * the read IO is still in-flight.
+ */
+ if (priority == ZIO_PRIORITY_ASYNC_READ ||
+ priority == ZIO_PRIORITY_SCRUB)
+ arc_hdr_set_flags(hdr, ARC_FLAG_PRIO_ASYNC_READ);
+ else
+ arc_hdr_clear_flags(hdr, ARC_FLAG_PRIO_ASYNC_READ);
+
+ /*
+ * At this point, we have a level 1 cache miss. Try again in
+ * L2ARC if possible.
+ */
+ ASSERT3U(HDR_GET_LSIZE(hdr), ==, lsize);
+
+ DTRACE_PROBE4(arc__miss, arc_buf_hdr_t *, hdr, blkptr_t *, bp,
+ uint64_t, lsize, zbookmark_phys_t *, zb);
+ ARCSTAT_BUMP(arcstat_misses);
+ ARCSTAT_CONDSTAT(!HDR_PREFETCH(hdr),
+ demand, prefetch, !HDR_ISTYPE_METADATA(hdr),
+ data, metadata, misses);
+#ifdef _KERNEL
+#ifdef RACCT
+ if (racct_enable) {
+ PROC_LOCK(curproc);
+ racct_add_force(curproc, RACCT_READBPS, size);
+ racct_add_force(curproc, RACCT_READIOPS, 1);
+ PROC_UNLOCK(curproc);
+ }
+#endif /* RACCT */
+ curthread->td_ru.ru_inblock++;
+#endif
+
+ if (vd != NULL && l2arc_ndev != 0 && !(l2arc_norw && devw)) {
+ /*
+ * Read from the L2ARC if the following are true:
+ * 1. The L2ARC vdev was previously cached.
+ * 2. This buffer still has L2ARC metadata.
+ * 3. This buffer isn't currently writing to the L2ARC.
+ * 4. The L2ARC entry wasn't evicted, which may
+ * also have invalidated the vdev.
+ * 5. This isn't a prefetch while l2arc_noprefetch is set.
+ */
+ if (HDR_HAS_L2HDR(hdr) &&
+ !HDR_L2_WRITING(hdr) && !HDR_L2_EVICTED(hdr) &&
+ !(l2arc_noprefetch && HDR_PREFETCH(hdr))) {
+ l2arc_read_callback_t *cb;
+ abd_t *abd;
+ uint64_t asize;
+
+ DTRACE_PROBE1(l2arc__hit, arc_buf_hdr_t *, hdr);
+ ARCSTAT_BUMP(arcstat_l2_hits);
+ atomic_inc_32(&hdr->b_l2hdr.b_hits);
+
+ cb = kmem_zalloc(sizeof (l2arc_read_callback_t),
+ KM_SLEEP);
+ cb->l2rcb_hdr = hdr;
+ cb->l2rcb_bp = *bp;
+ cb->l2rcb_zb = *zb;
+ cb->l2rcb_flags = zio_flags;
+
+ asize = vdev_psize_to_asize(vd, size);
+ if (asize != size) {
+ abd = abd_alloc_for_io(asize,
+ HDR_ISTYPE_METADATA(hdr));
+ cb->l2rcb_abd = abd;
+ } else {
+ abd = hdr->b_l1hdr.b_pabd;
+ }
+
+ ASSERT(addr >= VDEV_LABEL_START_SIZE &&
+ addr + asize <= vd->vdev_psize -
+ VDEV_LABEL_END_SIZE);
+
+ /*
+ * l2arc read. The SCL_L2ARC lock will be
+ * released by l2arc_read_done().
+ * Issue a null zio if the underlying buffer
+ * was squashed to zero size by compression.
+ */
+ ASSERT3U(HDR_GET_COMPRESS(hdr), !=,
+ ZIO_COMPRESS_EMPTY);
+ rzio = zio_read_phys(pio, vd, addr,
+ asize, abd,
+ ZIO_CHECKSUM_OFF,
+ l2arc_read_done, cb, priority,
+ zio_flags | ZIO_FLAG_DONT_CACHE |
+ ZIO_FLAG_CANFAIL |
+ ZIO_FLAG_DONT_PROPAGATE |
+ ZIO_FLAG_DONT_RETRY, B_FALSE);
+ acb->acb_zio_head = rzio;
+
+ if (hash_lock != NULL)
+ mutex_exit(hash_lock);
+
+ DTRACE_PROBE2(l2arc__read, vdev_t *, vd,
+ zio_t *, rzio);
+ ARCSTAT_INCR(arcstat_l2_read_bytes, size);
+
+ if (*arc_flags & ARC_FLAG_NOWAIT) {
+ zio_nowait(rzio);
+ return (0);
+ }
+
+ ASSERT(*arc_flags & ARC_FLAG_WAIT);
+ if (zio_wait(rzio) == 0)
+ return (0);
+
+ /* l2arc read error; goto zio_read() */
+ if (hash_lock != NULL)
+ mutex_enter(hash_lock);
+ } else {
+ DTRACE_PROBE1(l2arc__miss,
+ arc_buf_hdr_t *, hdr);
+ ARCSTAT_BUMP(arcstat_l2_misses);
+ if (HDR_L2_WRITING(hdr))
+ ARCSTAT_BUMP(arcstat_l2_rw_clash);
+ spa_config_exit(spa, SCL_L2ARC, vd);
+ }
+ } else {
+ if (vd != NULL)
+ spa_config_exit(spa, SCL_L2ARC, vd);
+ if (l2arc_ndev != 0) {
+ DTRACE_PROBE1(l2arc__miss,
+ arc_buf_hdr_t *, hdr);
+ ARCSTAT_BUMP(arcstat_l2_misses);
+ }
+ }
+
+ rzio = zio_read(pio, spa, bp, hdr->b_l1hdr.b_pabd, size,
+ arc_read_done, hdr, priority, zio_flags, zb);
+ acb->acb_zio_head = rzio;
+
+ if (hash_lock != NULL)
+ mutex_exit(hash_lock);
+
+ if (*arc_flags & ARC_FLAG_WAIT)
+ return (zio_wait(rzio));
+
+ ASSERT(*arc_flags & ARC_FLAG_NOWAIT);
+ zio_nowait(rzio);
+ }
+ return (0);
+}
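+
+ /*
+ * Illustrative sketch of a blocking arc_read() caller; the function
+ * names here are assumptions for illustration, not part of this
+ * change. With ARC_FLAG_WAIT the call does not return until the done
+ * callback has run (inline on a cache hit, via arc_read_done() on a
+ * miss).
+ */
+ #if 0
+ static void
+ example_read_done(zio_t *zio, const zbookmark_phys_t *zb,
+     const blkptr_t *bp, arc_buf_t *buf, void *private)
+ {
+ /* On failure, buf is NULL and the error is in zio->io_error. */
+ }
+
+ static int
+ example_read(spa_t *spa, const blkptr_t *bp, const zbookmark_phys_t *zb)
+ {
+ arc_flags_t aflags = ARC_FLAG_WAIT;
+
+ return (arc_read(NULL, spa, bp, example_read_done, NULL,
+     ZIO_PRIORITY_SYNC_READ, ZIO_FLAG_CANFAIL, &aflags, zb));
+ }
+ #endif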
+
+arc_prune_t *
+arc_add_prune_callback(arc_prune_func_t *func, void *private)
+{
+ arc_prune_t *p;
+
+ p = kmem_alloc(sizeof (*p), KM_SLEEP);
+ p->p_pfunc = func;
+ p->p_private = private;
+ list_link_init(&p->p_node);
+ zfs_refcount_create(&p->p_refcnt);
+
+ mutex_enter(&arc_prune_mtx);
+ zfs_refcount_add(&p->p_refcnt, &arc_prune_list);
+ list_insert_head(&arc_prune_list, p);
+ mutex_exit(&arc_prune_mtx);
+
+ return (p);
+}
+
+void
+arc_remove_prune_callback(arc_prune_t *p)
+{
+ boolean_t wait = B_FALSE;
+ mutex_enter(&arc_prune_mtx);
+ list_remove(&arc_prune_list, p);
+ if (zfs_refcount_remove(&p->p_refcnt, &arc_prune_list) > 0)
+ wait = B_TRUE;
+ mutex_exit(&arc_prune_mtx);
+
+ /* wait for arc_prune_task to finish */
+ if (wait)
+ taskq_wait(arc_prune_taskq);
+ ASSERT0(zfs_refcount_count(&p->p_refcnt));
+ zfs_refcount_destroy(&p->p_refcnt);
+ kmem_free(p, sizeof (*p));
+}
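+
+ /*
+ * Illustrative sketch of the prune-callback API above, assuming the
+ * ZoL-style arc_prune_func_t signature; the consumer names are
+ * hypothetical. The registered function is invoked from
+ * arc_prune_taskq when the ARC needs consumers to drop objects that
+ * pin metadata buffers.
+ */
+ #if 0
+ static void
+ example_prune(int64_t nr_to_scan, void *arg)
+ {
+ /* Drop up to nr_to_scan cached objects held by this consumer. */
+ }
+
+ static void
+ example_consumer_lifecycle(void)
+ {
+ arc_prune_t *ap = arc_add_prune_callback(example_prune, NULL);
+ /* ... normal operation ... */
+ arc_remove_prune_callback(ap);
+ }
+ #endif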
+
+/*
+ * Notify the arc that a block was freed, and thus will never be used again.
+ */
+void
+arc_freed(spa_t *spa, const blkptr_t *bp)
+{
+ arc_buf_hdr_t *hdr;
+ kmutex_t *hash_lock;
+ uint64_t guid = spa_load_guid(spa);
+
+ ASSERT(!BP_IS_EMBEDDED(bp));
+
+ hdr = buf_hash_find(guid, bp, &hash_lock);
+ if (hdr == NULL)
+ return;
+
+ /*
+ * We might be trying to free a block that is still doing I/O
+ * (i.e. prefetch) or has a reference (i.e. a dedup-ed,
+ * dmu_sync-ed block). If this block is being prefetched, then it
+ * would still have the ARC_FLAG_IO_IN_PROGRESS flag set on the hdr
+ * until the I/O completes. A block may also have a reference if it is
+ * part of a dedup-ed, dmu_synced write. The dmu_sync() function would
+ * have written the new block to its final resting place on disk but
+ * without the dedup flag set. This would have left the hdr in the MRU
+ * state and discoverable. When the txg finally syncs it detects that
+ * the block was overridden in open context and issues an override I/O.
+ * Since this is a dedup block, the override I/O will determine if the
+ * block is already in the DDT. If so, then it will replace the io_bp
+ * with the bp from the DDT and allow the I/O to finish. When the I/O
+ * reaches the done callback, dbuf_write_override_done, it will
+ * check to see if the io_bp and io_bp_override are identical.
+ * If they are not, then it indicates that the bp was replaced with
+ * the bp in the DDT and the override bp is freed. This allows
+ * us to arrive here with a reference on a block that is being
+ * freed. So if we have an I/O in progress, or a reference to
+ * this hdr, then we don't destroy the hdr.
+ */
+ if (!HDR_HAS_L1HDR(hdr) || (!HDR_IO_IN_PROGRESS(hdr) &&
+ zfs_refcount_is_zero(&hdr->b_l1hdr.b_refcnt))) {
+ arc_change_state(arc_anon, hdr, hash_lock);
+ arc_hdr_destroy(hdr);
+ mutex_exit(hash_lock);
+ } else {
+ mutex_exit(hash_lock);
+ }
+}
+
+/*
+ * Release this buffer from the cache, making it an anonymous buffer. This
+ * must be done after a read and prior to modifying the buffer contents.
+ * If the buffer has more than one reference, we must make
+ * a new hdr for the buffer.
+ */
+void
+arc_release(arc_buf_t *buf, void *tag)
+{
+ arc_buf_hdr_t *hdr = buf->b_hdr;
+
+ /*
+ * It would be nice to assert that if it's DMU metadata (level >
+ * 0 || it's the dnode file), then it must be syncing context.
+ * But we don't know that information at this level.
+ */
+
+ mutex_enter(&buf->b_evict_lock);
+
+ ASSERT(HDR_HAS_L1HDR(hdr));
+
+ /*
+ * We don't grab the hash lock prior to this check, because if
+ * the buffer's header is in the arc_anon state, it won't be
+ * linked into the hash table.
+ */
+ if (hdr->b_l1hdr.b_state == arc_anon) {
+ mutex_exit(&buf->b_evict_lock);
+ ASSERT(!HDR_IO_IN_PROGRESS(hdr));
+ ASSERT(!HDR_IN_HASH_TABLE(hdr));
+ ASSERT(!HDR_HAS_L2HDR(hdr));
+ ASSERT(HDR_EMPTY(hdr));
+ ASSERT3U(hdr->b_l1hdr.b_bufcnt, ==, 1);
+ ASSERT3S(zfs_refcount_count(&hdr->b_l1hdr.b_refcnt), ==, 1);
+ ASSERT(!list_link_active(&hdr->b_l1hdr.b_arc_node));
+
+ hdr->b_l1hdr.b_arc_access = 0;
+
+ /*
+ * If the buf is being overridden then it may already
+ * have a hdr that is not empty.
+ */
+ buf_discard_identity(hdr);
+ arc_buf_thaw(buf);
+
+ return;
+ }
+
+ kmutex_t *hash_lock = HDR_LOCK(hdr);
+ mutex_enter(hash_lock);
+
+ /*
+ * This assignment is only valid as long as the hash_lock is
+ * held; we must be careful not to reference state or the
+ * b_state field after dropping the lock.
+ */
+ arc_state_t *state = hdr->b_l1hdr.b_state;
+ ASSERT3P(hash_lock, ==, HDR_LOCK(hdr));
+ ASSERT3P(state, !=, arc_anon);
+
+ /* this buffer is not on any list */
+ ASSERT3S(zfs_refcount_count(&hdr->b_l1hdr.b_refcnt), >, 0);
+
+ if (HDR_HAS_L2HDR(hdr)) {
+ mutex_enter(&hdr->b_l2hdr.b_dev->l2ad_mtx);
+
+ /*
+ * We have to recheck this conditional again now that
+ * we're holding the l2ad_mtx to prevent a race with
+ * another thread which might be concurrently calling
+ * l2arc_evict(). In that case, l2arc_evict() might have
+ * destroyed the header's L2 portion as we were waiting
+ * to acquire the l2ad_mtx.
+ */
+ if (HDR_HAS_L2HDR(hdr)) {
+ l2arc_trim(hdr);
+ arc_hdr_l2hdr_destroy(hdr);
+ }
+
+ mutex_exit(&hdr->b_l2hdr.b_dev->l2ad_mtx);
+ }
+
+ /*
+ * Do we have more than one buf?
+ */
+ if (hdr->b_l1hdr.b_bufcnt > 1) {
+ arc_buf_hdr_t *nhdr;
+ uint64_t spa = hdr->b_spa;
+ uint64_t psize = HDR_GET_PSIZE(hdr);
+ uint64_t lsize = HDR_GET_LSIZE(hdr);
+ enum zio_compress compress = HDR_GET_COMPRESS(hdr);
+ arc_buf_contents_t type = arc_buf_type(hdr);
+ VERIFY3U(hdr->b_type, ==, type);
+
+ ASSERT(hdr->b_l1hdr.b_buf != buf || buf->b_next != NULL);
+ (void) remove_reference(hdr, hash_lock, tag);
+
+ if (arc_buf_is_shared(buf) && !ARC_BUF_COMPRESSED(buf)) {
+ ASSERT3P(hdr->b_l1hdr.b_buf, !=, buf);
+ ASSERT(ARC_BUF_LAST(buf));
+ }
+
+ /*
+ * Pull the data off of this hdr and attach it to
+ * a new anonymous hdr. Also find the last buffer
+ * in the hdr's buffer list.
+ */
+ arc_buf_t *lastbuf = arc_buf_remove(hdr, buf);
+ ASSERT3P(lastbuf, !=, NULL);
+
+ /*
+ * If the current arc_buf_t and the hdr are sharing their data
+ * buffer, then we must stop sharing that block.
+ */
+ if (arc_buf_is_shared(buf)) {
+ VERIFY(!arc_buf_is_shared(lastbuf));
+
+ /*
+ * First, sever the block sharing relationship between
+ * buf and the arc_buf_hdr_t.
+ */
+ arc_unshare_buf(hdr, buf);
+
+ /*
+ * Now we need to recreate the hdr's b_pabd. Since we
+ * have lastbuf handy, we try to share with it, but if
+ * we can't then we allocate a new b_pabd and copy the
+ * data from buf into it.
+ */
+ if (arc_can_share(hdr, lastbuf)) {
+ arc_share_buf(hdr, lastbuf);
+ } else {
+ arc_hdr_alloc_pabd(hdr, B_TRUE);
+ abd_copy_from_buf(hdr->b_l1hdr.b_pabd,
+ buf->b_data, psize);
+ }
+ VERIFY3P(lastbuf->b_data, !=, NULL);
+ } else if (HDR_SHARED_DATA(hdr)) {
+ /*
+ * Uncompressed shared buffers are always at the end
+ * of the list. Compressed buffers don't have the
+ * same requirements. This makes it hard to
+ * simply assert that the lastbuf is shared so
+ * we rely on the hdr's compression flags to determine
+ * if we have a compressed, shared buffer.
+ */
+ ASSERT(arc_buf_is_shared(lastbuf) ||
+ HDR_GET_COMPRESS(hdr) != ZIO_COMPRESS_OFF);
+ ASSERT(!ARC_BUF_SHARED(buf));
+ }
+ ASSERT3P(hdr->b_l1hdr.b_pabd, !=, NULL);
+ ASSERT3P(state, !=, arc_l2c_only);
+
+ (void) zfs_refcount_remove_many(&state->arcs_size,
+ arc_buf_size(buf), buf);
+
+ if (zfs_refcount_is_zero(&hdr->b_l1hdr.b_refcnt)) {
+ ASSERT3P(state, !=, arc_l2c_only);
+ (void) zfs_refcount_remove_many(
+ &state->arcs_esize[type],
+ arc_buf_size(buf), buf);
+ }
+
+ hdr->b_l1hdr.b_bufcnt -= 1;
+ arc_cksum_verify(buf);
+#ifdef illumos
+ arc_buf_unwatch(buf);
+#endif
+
+ mutex_exit(hash_lock);
+
+ /*
+ * Allocate a new hdr. The new hdr will contain a b_pabd
+ * buffer which will be freed in arc_write().
+ */
+ nhdr = arc_hdr_alloc(spa, psize, lsize, compress, type);
+ ASSERT3P(nhdr->b_l1hdr.b_buf, ==, NULL);
+ ASSERT0(nhdr->b_l1hdr.b_bufcnt);
+ ASSERT0(zfs_refcount_count(&nhdr->b_l1hdr.b_refcnt));
+ VERIFY3U(nhdr->b_type, ==, type);
+ ASSERT(!HDR_SHARED_DATA(nhdr));
+
+ nhdr->b_l1hdr.b_buf = buf;
+ nhdr->b_l1hdr.b_bufcnt = 1;
+ (void) zfs_refcount_add(&nhdr->b_l1hdr.b_refcnt, tag);
+ buf->b_hdr = nhdr;
+
+ mutex_exit(&buf->b_evict_lock);
+ (void) zfs_refcount_add_many(&arc_anon->arcs_size,
+ arc_buf_size(buf), buf);
+ } else {
+ mutex_exit(&buf->b_evict_lock);
+ ASSERT(zfs_refcount_count(&hdr->b_l1hdr.b_refcnt) == 1);
+ /* protected by hash lock, or hdr is on arc_anon */
+ ASSERT(!multilist_link_active(&hdr->b_l1hdr.b_arc_node));
+ ASSERT(!HDR_IO_IN_PROGRESS(hdr));
+ arc_change_state(arc_anon, hdr, hash_lock);
+ hdr->b_l1hdr.b_arc_access = 0;
+ mutex_exit(hash_lock);
+
+ buf_discard_identity(hdr);
+ arc_buf_thaw(buf);
+ }
+}
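+
+ /*
+ * Illustrative sketch (the caller names are assumptions): a DMU-style
+ * caller releases its buffer before dirtying it, so the cached
+ * identity is never modified in place:
+ *
+ *	if (!arc_released(db->db_buf))
+ *		arc_release(db->db_buf, db);
+ */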
+
+int
+arc_released(arc_buf_t *buf)
+{
+ int released;
+
+ mutex_enter(&buf->b_evict_lock);
+ released = (buf->b_data != NULL &&
+ buf->b_hdr->b_l1hdr.b_state == arc_anon);
+ mutex_exit(&buf->b_evict_lock);
+ return (released);
+}
+
+#ifdef ZFS_DEBUG
+int
+arc_referenced(arc_buf_t *buf)
+{
+ int referenced;
+
+ mutex_enter(&buf->b_evict_lock);
+ referenced = (zfs_refcount_count(&buf->b_hdr->b_l1hdr.b_refcnt));
+ mutex_exit(&buf->b_evict_lock);
+ return (referenced);
+}
+#endif
+
+static void
+arc_write_ready(zio_t *zio)
+{
+ arc_write_callback_t *callback = zio->io_private;
+ arc_buf_t *buf = callback->awcb_buf;
+ arc_buf_hdr_t *hdr = buf->b_hdr;
+ uint64_t psize = BP_IS_HOLE(zio->io_bp) ? 0 : BP_GET_PSIZE(zio->io_bp);
+
+ ASSERT(HDR_HAS_L1HDR(hdr));
+ ASSERT(!zfs_refcount_is_zero(&buf->b_hdr->b_l1hdr.b_refcnt));
+ ASSERT(hdr->b_l1hdr.b_bufcnt > 0);
+
+ /*
+ * If we're reexecuting this zio because the pool suspended, then
+ * clean up any state that was previously set the first time the
+ * callback was invoked.
+ */
+ if (zio->io_flags & ZIO_FLAG_REEXECUTED) {
+ arc_cksum_free(hdr);
+#ifdef illumos
+ arc_buf_unwatch(buf);
+#endif
+ if (hdr->b_l1hdr.b_pabd != NULL) {
+ if (arc_buf_is_shared(buf)) {
+ arc_unshare_buf(hdr, buf);
+ } else {
+ arc_hdr_free_pabd(hdr);
+ }
+ }
+ }
+ ASSERT3P(hdr->b_l1hdr.b_pabd, ==, NULL);
+ ASSERT(!HDR_SHARED_DATA(hdr));
+ ASSERT(!arc_buf_is_shared(buf));
+
+ callback->awcb_ready(zio, buf, callback->awcb_private);
+
+ if (HDR_IO_IN_PROGRESS(hdr))
+ ASSERT(zio->io_flags & ZIO_FLAG_REEXECUTED);
+
+ arc_cksum_compute(buf);
+ arc_hdr_set_flags(hdr, ARC_FLAG_IO_IN_PROGRESS);
+
+ enum zio_compress compress;
+ if (BP_IS_HOLE(zio->io_bp) || BP_IS_EMBEDDED(zio->io_bp)) {
+ compress = ZIO_COMPRESS_OFF;
+ } else {
+ ASSERT3U(HDR_GET_LSIZE(hdr), ==, BP_GET_LSIZE(zio->io_bp));
+ compress = BP_GET_COMPRESS(zio->io_bp);
+ }
+ HDR_SET_PSIZE(hdr, psize);
+ arc_hdr_set_compress(hdr, compress);
+
+ /*
+ * Fill the hdr with data. If the hdr is compressed, the data we want
+ * is available from the zio, otherwise we can take it from the buf.
+ *
+ * We might be able to share the buf's data with the hdr here. However,
+ * doing so would cause the ARC to be full of linear ABDs if we write a
+ * lot of shareable data. As a compromise, we check whether scattered
+ * ABDs are allowed, and assume that if they are then the user wants
+ * the ARC to be primarily filled with them regardless of the data being
+ * written. Therefore, if they're allowed then we allocate one and copy
+ * the data into it; otherwise, we share the data directly if we can.
+ */
+ if (zfs_abd_scatter_enabled || !arc_can_share(hdr, buf)) {
+ arc_hdr_alloc_pabd(hdr, B_TRUE);
+
+ /*
+ * Ideally, we would always copy the io_abd into b_pabd, but the
+ * user may have disabled compressed ARC, thus we must check the
+ * hdr's compression setting rather than the io_bp's.
+ */
+ if (HDR_GET_COMPRESS(hdr) != ZIO_COMPRESS_OFF) {
+ ASSERT3U(BP_GET_COMPRESS(zio->io_bp), !=,
+ ZIO_COMPRESS_OFF);
+ ASSERT3U(psize, >, 0);
+
+ abd_copy(hdr->b_l1hdr.b_pabd, zio->io_abd, psize);
+ } else {
+ ASSERT3U(zio->io_orig_size, ==, arc_hdr_size(hdr));
+
+ abd_copy_from_buf(hdr->b_l1hdr.b_pabd, buf->b_data,
+ arc_buf_size(buf));
+ }
+ } else {
+ ASSERT3P(buf->b_data, ==, abd_to_buf(zio->io_orig_abd));
+ ASSERT3U(zio->io_orig_size, ==, arc_buf_size(buf));
+ ASSERT3U(hdr->b_l1hdr.b_bufcnt, ==, 1);
+
+ arc_share_buf(hdr, buf);
+ }
+
+ arc_hdr_verify(hdr, zio->io_bp);
+}
+
+static void
+arc_write_children_ready(zio_t *zio)
+{
+ arc_write_callback_t *callback = zio->io_private;
+ arc_buf_t *buf = callback->awcb_buf;
+
+ callback->awcb_children_ready(zio, buf, callback->awcb_private);
+}
+
+/*
+ * The SPA calls this callback for each physical write that happens on behalf
+ * of a logical write. See the comment in dbuf_write_physdone() for details.
+ */
+static void
+arc_write_physdone(zio_t *zio)
+{
+ arc_write_callback_t *cb = zio->io_private;
+ if (cb->awcb_physdone != NULL)
+ cb->awcb_physdone(zio, cb->awcb_buf, cb->awcb_private);
+}
+
+static void
+arc_write_done(zio_t *zio)
+{
+ arc_write_callback_t *callback = zio->io_private;
+ arc_buf_t *buf = callback->awcb_buf;
+ arc_buf_hdr_t *hdr = buf->b_hdr;
+
+ ASSERT3P(hdr->b_l1hdr.b_acb, ==, NULL);
+
+ if (zio->io_error == 0) {
+ arc_hdr_verify(hdr, zio->io_bp);
+
+ if (BP_IS_HOLE(zio->io_bp) || BP_IS_EMBEDDED(zio->io_bp)) {
+ buf_discard_identity(hdr);
+ } else {
+ hdr->b_dva = *BP_IDENTITY(zio->io_bp);
+ hdr->b_birth = BP_PHYSICAL_BIRTH(zio->io_bp);
+ }
+ } else {
+ ASSERT(HDR_EMPTY(hdr));
+ }
+
+ /*
+ * If the block to be written was all-zero or compressed enough to be
+ * embedded in the BP, no write was performed so there will be no
+ * dva/birth/checksum. The buffer must therefore remain anonymous
+ * (and uncached).
+ */
+ if (!HDR_EMPTY(hdr)) {
+ arc_buf_hdr_t *exists;
+ kmutex_t *hash_lock;
+
+ ASSERT3U(zio->io_error, ==, 0);
+
+ arc_cksum_verify(buf);
+
+ exists = buf_hash_insert(hdr, &hash_lock);
+ if (exists != NULL) {
+ /*
+ * This can only happen if we overwrite for
+ * sync-to-convergence, because we remove
+ * buffers from the hash table when we arc_free().
+ */
+ if (zio->io_flags & ZIO_FLAG_IO_REWRITE) {
+ if (!BP_EQUAL(&zio->io_bp_orig, zio->io_bp))
+ panic("bad overwrite, hdr=%p exists=%p",
+ (void *)hdr, (void *)exists);
+ ASSERT(zfs_refcount_is_zero(
+ &exists->b_l1hdr.b_refcnt));
+ arc_change_state(arc_anon, exists, hash_lock);
+ mutex_exit(hash_lock);
+ arc_hdr_destroy(exists);
+ exists = buf_hash_insert(hdr, &hash_lock);
+ ASSERT3P(exists, ==, NULL);
+ } else if (zio->io_flags & ZIO_FLAG_NOPWRITE) {
+ /* nopwrite */
+ ASSERT(zio->io_prop.zp_nopwrite);
+ if (!BP_EQUAL(&zio->io_bp_orig, zio->io_bp))
+ panic("bad nopwrite, hdr=%p exists=%p",
+ (void *)hdr, (void *)exists);
+ } else {
+ /* Dedup */
+ ASSERT(hdr->b_l1hdr.b_bufcnt == 1);
+ ASSERT(hdr->b_l1hdr.b_state == arc_anon);
+ ASSERT(BP_GET_DEDUP(zio->io_bp));
+ ASSERT(BP_GET_LEVEL(zio->io_bp) == 0);
+ }
+ }
+ arc_hdr_clear_flags(hdr, ARC_FLAG_IO_IN_PROGRESS);
+ /* if it's not anon, we are doing a scrub */
+ if (exists == NULL && hdr->b_l1hdr.b_state == arc_anon)
+ arc_access(hdr, hash_lock);
+ mutex_exit(hash_lock);
+ } else {
+ arc_hdr_clear_flags(hdr, ARC_FLAG_IO_IN_PROGRESS);
+ }
+
+ ASSERT(!zfs_refcount_is_zero(&hdr->b_l1hdr.b_refcnt));
+ callback->awcb_done(zio, buf, callback->awcb_private);
+
+ abd_put(zio->io_abd);
+ kmem_free(callback, sizeof (arc_write_callback_t));
+}
+
+zio_t *
+arc_write(zio_t *pio, spa_t *spa, uint64_t txg, blkptr_t *bp, arc_buf_t *buf,
+ boolean_t l2arc, const zio_prop_t *zp, arc_write_done_func_t *ready,
+ arc_write_done_func_t *children_ready, arc_write_done_func_t *physdone,
+ arc_write_done_func_t *done, void *private, zio_priority_t priority,
+ int zio_flags, const zbookmark_phys_t *zb)
+{
+ arc_buf_hdr_t *hdr = buf->b_hdr;
+ arc_write_callback_t *callback;
+ zio_t *zio;
+ zio_prop_t localprop = *zp;
+
+ ASSERT3P(ready, !=, NULL);
+ ASSERT3P(done, !=, NULL);
+ ASSERT(!HDR_IO_ERROR(hdr));
+ ASSERT(!HDR_IO_IN_PROGRESS(hdr));
+ ASSERT3P(hdr->b_l1hdr.b_acb, ==, NULL);
+ ASSERT3U(hdr->b_l1hdr.b_bufcnt, >, 0);
+ if (l2arc)
+ arc_hdr_set_flags(hdr, ARC_FLAG_L2CACHE);
+ if (ARC_BUF_COMPRESSED(buf)) {
+ /*
+ * We're writing a pre-compressed buffer. Make the
+ * compression algorithm requested by the zio_prop_t match
+ * the pre-compressed buffer's compression algorithm.
+ */
+ localprop.zp_compress = HDR_GET_COMPRESS(hdr);
+
+ ASSERT3U(HDR_GET_LSIZE(hdr), !=, arc_buf_size(buf));
+ zio_flags |= ZIO_FLAG_RAW;
+ }
+ callback = kmem_zalloc(sizeof (arc_write_callback_t), KM_SLEEP);
+ callback->awcb_ready = ready;
+ callback->awcb_children_ready = children_ready;
+ callback->awcb_physdone = physdone;
+ callback->awcb_done = done;
+ callback->awcb_private = private;
+ callback->awcb_buf = buf;
+
+ /*
+ * The hdr's b_pabd is now stale; free it now. A new data block
+ * will be allocated when the zio pipeline calls arc_write_ready().
+ */
+ if (hdr->b_l1hdr.b_pabd != NULL) {
+ /*
+ * If the buf is currently sharing the data block with
+ * the hdr then we need to break that relationship here.
+ * The hdr will remain with a NULL data pointer and the
+ * buf will take sole ownership of the block.
+ */
+ if (arc_buf_is_shared(buf)) {
+ arc_unshare_buf(hdr, buf);
+ } else {
+ arc_hdr_free_pabd(hdr);
+ }
+ VERIFY3P(buf->b_data, !=, NULL);
+ arc_hdr_set_compress(hdr, ZIO_COMPRESS_OFF);
+ }
+ ASSERT(!arc_buf_is_shared(buf));
+ ASSERT3P(hdr->b_l1hdr.b_pabd, ==, NULL);
+
+ zio = zio_write(pio, spa, txg, bp,
+ abd_get_from_buf(buf->b_data, HDR_GET_LSIZE(hdr)),
+ HDR_GET_LSIZE(hdr), arc_buf_size(buf), &localprop, arc_write_ready,
+ (children_ready != NULL) ? arc_write_children_ready : NULL,
+ arc_write_physdone, arc_write_done, callback,
+ priority, zio_flags, zb);
+
+ return (zio);
+}
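+
+ /*
+ * Illustrative fragment of an arc_write() call; the callback names,
+ * flag choices, and surrounding variables are assumptions. Only the
+ * ready and done callbacks are required; children_ready and physdone
+ * may be passed as NULL:
+ */
+ #if 0
+ zio = arc_write(pio, spa, txg, bp, buf, B_TRUE /* l2arc */, &zp,
+     example_write_ready, NULL, NULL, example_write_done, db,
+     ZIO_PRIORITY_ASYNC_WRITE, ZIO_FLAG_MUSTSUCCEED, &zb);
+ zio_nowait(zio);
+ #endif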
+
+static int
+arc_memory_throttle(spa_t *spa, uint64_t reserve, uint64_t txg)
+{
+#ifdef _KERNEL
+ uint64_t available_memory = ptob(freemem);
+
+#if defined(__i386) || !defined(UMA_MD_SMALL_ALLOC)
+ available_memory = MIN(available_memory, uma_avail());
+#endif
+
+ if (freemem > (uint64_t)physmem * arc_lotsfree_percent / 100)
+ return (0);
+
+ if (txg > spa->spa_lowmem_last_txg) {
+ spa->spa_lowmem_last_txg = txg;
+ spa->spa_lowmem_page_load = 0;
+ }
+ /*
+ * If we are in pageout, we know that memory is already tight
+ * and the ARC is already going to be evicting, so we just want to
+ * continue to let page writes occur as quickly as possible.
+ */
+ if (curproc == pageproc) {
+ if (spa->spa_lowmem_page_load >
+ MAX(ptob(minfree), available_memory) / 4)
+ return (SET_ERROR(ERESTART));
+ /* Note: reserve is inflated, so we deflate */
+ atomic_add_64(&spa->spa_lowmem_page_load, reserve / 8);
+ return (0);
+ } else if (spa->spa_lowmem_page_load > 0 && arc_reclaim_needed()) {
+ /* memory is low, delay before restarting */
+ ARCSTAT_INCR(arcstat_memory_throttle_count, 1);
+ return (SET_ERROR(EAGAIN));
+ }
+ spa->spa_lowmem_page_load = 0;
+#endif /* _KERNEL */
+ return (0);
+}
+
+void
+arc_tempreserve_clear(uint64_t reserve)
+{
+ atomic_add_64(&arc_tempreserve, -reserve);
+ ASSERT((int64_t)arc_tempreserve >= 0);
+}
+
+int
+arc_tempreserve_space(spa_t *spa, uint64_t reserve, uint64_t txg)
+{
+ int error;
+ uint64_t anon_size;
+
+ if (reserve > arc_c/4 && !arc_no_grow) {
+ arc_c = MIN(arc_c_max, reserve * 4);
+ DTRACE_PROBE1(arc__set_reserve, uint64_t, arc_c);
+ }
+ if (reserve > arc_c)
+ return (SET_ERROR(ENOMEM));
+
+ /*
+ * Don't count loaned bufs as in-flight dirty data to prevent long
+ * network delays from blocking transactions that are ready to be
+ * assigned to a txg.
+ */
+
+ /* assert that it has not wrapped around */
+ ASSERT3S(atomic_add_64_nv(&arc_loaned_bytes, 0), >=, 0);
+
+ anon_size = MAX((int64_t)(zfs_refcount_count(&arc_anon->arcs_size) -
+ arc_loaned_bytes), 0);
+
+ /*
+ * Writes will, almost always, require additional memory allocations
+ * in order to compress/encrypt/etc the data. We therefore need to
+ * make sure that there is sufficient available memory for this.
+ */
+ error = arc_memory_throttle(spa, reserve, txg);
+ if (error != 0)
+ return (error);
+
+ /*
+ * Throttle writes when the amount of dirty data in the cache
+ * gets too large. We try to keep the cache less than half full
+ * of dirty blocks so that our sync times don't grow too large.
+ *
+ * In the case of one pool being built on another pool, we want
+ * to make sure we don't end up throttling the lower (backing)
+ * pool when the upper pool is the majority contributor to dirty
+ * data. To ensure we make forward progress during throttling, we
+ * also check the current pool's net dirty data and only throttle
+ * if it exceeds zfs_arc_pool_dirty_percent of the anonymous dirty
+ * data in the cache.
+ *
+ * Note: if two requests come in concurrently, we might let them
+ * both succeed, when one of them should fail. Not a huge deal.
+ */
+ uint64_t total_dirty = reserve + arc_tempreserve + anon_size;
+ uint64_t spa_dirty_anon = spa_dirty_data(spa);
+
+ if (total_dirty > arc_c * zfs_arc_dirty_limit_percent / 100 &&
+ anon_size > arc_c * zfs_arc_anon_limit_percent / 100 &&
+ spa_dirty_anon > anon_size * zfs_arc_pool_dirty_percent / 100) {
+ uint64_t meta_esize =
+ zfs_refcount_count(
+ &arc_anon->arcs_esize[ARC_BUFC_METADATA]);
+ uint64_t data_esize =
+ zfs_refcount_count(&arc_anon->arcs_esize[ARC_BUFC_DATA]);
+ dprintf("failing, arc_tempreserve=%lluK anon_meta=%lluK "
+ "anon_data=%lluK tempreserve=%lluK arc_c=%lluK\n",
+ arc_tempreserve >> 10, meta_esize >> 10,
+ data_esize >> 10, reserve >> 10, arc_c >> 10);
+ return (SET_ERROR(ERESTART));
+ }
+ atomic_add_64(&arc_tempreserve, reserve);
+ return (0);
+}
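+
+ /*
+ * Worked example of the throttle above, assuming tunable defaults of
+ * 50/25/20 percent: with arc_c = 1 GB, a reservation returns ERESTART
+ * only when total dirty data exceeds 512 MB, anonymous data exceeds
+ * 256 MB, and this pool accounts for more than 20% of that anonymous
+ * data.
+ */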
+
+static void
+arc_kstat_update_state(arc_state_t *state, kstat_named_t *size,
+ kstat_named_t *evict_data, kstat_named_t *evict_metadata)
+{
+ size->value.ui64 = zfs_refcount_count(&state->arcs_size);
+ evict_data->value.ui64 =
+ zfs_refcount_count(&state->arcs_esize[ARC_BUFC_DATA]);
+ evict_metadata->value.ui64 =
+ zfs_refcount_count(&state->arcs_esize[ARC_BUFC_METADATA]);
+}
+
+static int
+arc_kstat_update(kstat_t *ksp, int rw)
+{
+ arc_stats_t *as = ksp->ks_data;
+
+ if (rw == KSTAT_WRITE) {
+ return (EACCES);
+ } else {
+ arc_kstat_update_state(arc_anon,
+ &as->arcstat_anon_size,
+ &as->arcstat_anon_evictable_data,
+ &as->arcstat_anon_evictable_metadata);
+ arc_kstat_update_state(arc_mru,
+ &as->arcstat_mru_size,
+ &as->arcstat_mru_evictable_data,
+ &as->arcstat_mru_evictable_metadata);
+ arc_kstat_update_state(arc_mru_ghost,
+ &as->arcstat_mru_ghost_size,
+ &as->arcstat_mru_ghost_evictable_data,
+ &as->arcstat_mru_ghost_evictable_metadata);
+ arc_kstat_update_state(arc_mfu,
+ &as->arcstat_mfu_size,
+ &as->arcstat_mfu_evictable_data,
+ &as->arcstat_mfu_evictable_metadata);
+ arc_kstat_update_state(arc_mfu_ghost,
+ &as->arcstat_mfu_ghost_size,
+ &as->arcstat_mfu_ghost_evictable_data,
+ &as->arcstat_mfu_ghost_evictable_metadata);
+
+ ARCSTAT(arcstat_size) = aggsum_value(&arc_size);
+ ARCSTAT(arcstat_meta_used) = aggsum_value(&arc_meta_used);
+ ARCSTAT(arcstat_data_size) = aggsum_value(&astat_data_size);
+ ARCSTAT(arcstat_metadata_size) =
+ aggsum_value(&astat_metadata_size);
+ ARCSTAT(arcstat_hdr_size) = aggsum_value(&astat_hdr_size);
+ ARCSTAT(arcstat_bonus_size) = aggsum_value(&astat_bonus_size);
+ ARCSTAT(arcstat_dnode_size) = aggsum_value(&astat_dnode_size);
+ ARCSTAT(arcstat_dbuf_size) = aggsum_value(&astat_dbuf_size);
+#if defined(__FreeBSD__) && defined(COMPAT_FREEBSD11)
+ ARCSTAT(arcstat_other_size) = aggsum_value(&astat_bonus_size) +
+ aggsum_value(&astat_dnode_size) +
+ aggsum_value(&astat_dbuf_size);
+#endif
+ ARCSTAT(arcstat_l2_hdr_size) = aggsum_value(&astat_l2_hdr_size);
+ }
+
+ return (0);
+}
+
+/*
+ * This function *must* return indices evenly distributed between all
+ * sublists of the multilist. This is needed due to how the ARC eviction
+ * code is laid out; arc_evict_state() assumes ARC buffers are evenly
+ * distributed between all sublists and uses this assumption when
+ * deciding which sublist to evict from and how much to evict from it.
+ */
+unsigned int
+arc_state_multilist_index_func(multilist_t *ml, void *obj)
+{
+ arc_buf_hdr_t *hdr = obj;
+
+ /*
+ * We rely on b_dva to generate evenly distributed index
+ * numbers using buf_hash below. So, as an added precaution,
+ * let's make sure we never add empty buffers to the arc lists.
+ */
+ ASSERT(!HDR_EMPTY(hdr));
+
+ /*
+ * The assumption here is that the hash value for a given
+ * arc_buf_hdr_t will remain constant throughout its lifetime
+ * (i.e. its b_spa, b_dva, and b_birth fields don't change).
+ * Thus, we don't need to store the header's sublist index
+ * on insertion, as this index can be recalculated on removal.
+ *
+ * Also, the low order bits of the hash value are thought to be
+ * distributed evenly. Otherwise, in the case that the multilist
+ * has a power-of-two number of sublists, each sublist's usage
+ * would not be evenly distributed.
+ */
+ return (buf_hash(hdr->b_spa, &hdr->b_dva, hdr->b_birth) %
+ multilist_get_num_sublists(ml));
+}
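+
+ /*
+ * For example (illustrative numbers): with 8 sublists, a header whose
+ * buf_hash() value is 43 lands on sublist 43 % 8 = 3, so evenly
+ * distributed hash values yield evenly distributed sublist indices.
+ */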
+
+#ifdef _KERNEL
+static eventhandler_tag arc_event_lowmem = NULL;
+
+static void
+arc_lowmem(void *arg __unused, int howto __unused)
+{
+ int64_t free_memory, to_free;
+
+ arc_no_grow = B_TRUE;
+ arc_warm = B_TRUE;
+ arc_growtime = gethrtime() + SEC2NSEC(arc_grow_retry);
+ free_memory = arc_available_memory();
+ to_free = (arc_c >> arc_shrink_shift) - MIN(free_memory, 0);
+ DTRACE_PROBE2(arc__needfree, int64_t, free_memory, int64_t, to_free);
+ arc_reduce_target_size(to_free);
+
+ mutex_enter(&arc_adjust_lock);
+ arc_adjust_needed = B_TRUE;
+ zthr_wakeup(arc_adjust_zthr);
+
+ /*
+ * It is unsafe to block here in arbitrary threads, because we can come
+ * here from ARC itself and may hold ARC locks and thus risk a deadlock
+ * with the ARC reclaim thread.
+ */
+ if (curproc == pageproc)
+ (void) cv_wait(&arc_adjust_waiters_cv, &arc_adjust_lock);
+ mutex_exit(&arc_adjust_lock);
+}
+#endif
+
+static void
+arc_state_init(void)
+{
+ arc_anon = &ARC_anon;
+ arc_mru = &ARC_mru;
+ arc_mru_ghost = &ARC_mru_ghost;
+ arc_mfu = &ARC_mfu;
+ arc_mfu_ghost = &ARC_mfu_ghost;
+ arc_l2c_only = &ARC_l2c_only;
+
+ arc_mru->arcs_list[ARC_BUFC_METADATA] =
+ multilist_create(sizeof (arc_buf_hdr_t),
+ offsetof(arc_buf_hdr_t, b_l1hdr.b_arc_node),
+ arc_state_multilist_index_func);
+ arc_mru->arcs_list[ARC_BUFC_DATA] =
+ multilist_create(sizeof (arc_buf_hdr_t),
+ offsetof(arc_buf_hdr_t, b_l1hdr.b_arc_node),
+ arc_state_multilist_index_func);
+ arc_mru_ghost->arcs_list[ARC_BUFC_METADATA] =
+ multilist_create(sizeof (arc_buf_hdr_t),
+ offsetof(arc_buf_hdr_t, b_l1hdr.b_arc_node),
+ arc_state_multilist_index_func);
+ arc_mru_ghost->arcs_list[ARC_BUFC_DATA] =
+ multilist_create(sizeof (arc_buf_hdr_t),
+ offsetof(arc_buf_hdr_t, b_l1hdr.b_arc_node),
+ arc_state_multilist_index_func);
+ arc_mfu->arcs_list[ARC_BUFC_METADATA] =
+ multilist_create(sizeof (arc_buf_hdr_t),
+ offsetof(arc_buf_hdr_t, b_l1hdr.b_arc_node),
+ arc_state_multilist_index_func);
+ arc_mfu->arcs_list[ARC_BUFC_DATA] =
+ multilist_create(sizeof (arc_buf_hdr_t),
+ offsetof(arc_buf_hdr_t, b_l1hdr.b_arc_node),
+ arc_state_multilist_index_func);
+ arc_mfu_ghost->arcs_list[ARC_BUFC_METADATA] =
+ multilist_create(sizeof (arc_buf_hdr_t),
+ offsetof(arc_buf_hdr_t, b_l1hdr.b_arc_node),
+ arc_state_multilist_index_func);
+ arc_mfu_ghost->arcs_list[ARC_BUFC_DATA] =
+ multilist_create(sizeof (arc_buf_hdr_t),
+ offsetof(arc_buf_hdr_t, b_l1hdr.b_arc_node),
+ arc_state_multilist_index_func);
+ arc_l2c_only->arcs_list[ARC_BUFC_METADATA] =
+ multilist_create(sizeof (arc_buf_hdr_t),
+ offsetof(arc_buf_hdr_t, b_l1hdr.b_arc_node),
+ arc_state_multilist_index_func);
+ arc_l2c_only->arcs_list[ARC_BUFC_DATA] =
+ multilist_create(sizeof (arc_buf_hdr_t),
+ offsetof(arc_buf_hdr_t, b_l1hdr.b_arc_node),
+ arc_state_multilist_index_func);
+
+ zfs_refcount_create(&arc_anon->arcs_esize[ARC_BUFC_METADATA]);
+ zfs_refcount_create(&arc_anon->arcs_esize[ARC_BUFC_DATA]);
+ zfs_refcount_create(&arc_mru->arcs_esize[ARC_BUFC_METADATA]);
+ zfs_refcount_create(&arc_mru->arcs_esize[ARC_BUFC_DATA]);
+ zfs_refcount_create(&arc_mru_ghost->arcs_esize[ARC_BUFC_METADATA]);
+ zfs_refcount_create(&arc_mru_ghost->arcs_esize[ARC_BUFC_DATA]);
+ zfs_refcount_create(&arc_mfu->arcs_esize[ARC_BUFC_METADATA]);
+ zfs_refcount_create(&arc_mfu->arcs_esize[ARC_BUFC_DATA]);
+ zfs_refcount_create(&arc_mfu_ghost->arcs_esize[ARC_BUFC_METADATA]);
+ zfs_refcount_create(&arc_mfu_ghost->arcs_esize[ARC_BUFC_DATA]);
+ zfs_refcount_create(&arc_l2c_only->arcs_esize[ARC_BUFC_METADATA]);
+ zfs_refcount_create(&arc_l2c_only->arcs_esize[ARC_BUFC_DATA]);
+
+ zfs_refcount_create(&arc_anon->arcs_size);
+ zfs_refcount_create(&arc_mru->arcs_size);
+ zfs_refcount_create(&arc_mru_ghost->arcs_size);
+ zfs_refcount_create(&arc_mfu->arcs_size);
+ zfs_refcount_create(&arc_mfu_ghost->arcs_size);
+ zfs_refcount_create(&arc_l2c_only->arcs_size);
+
+ aggsum_init(&arc_meta_used, 0);
+ aggsum_init(&arc_size, 0);
+ aggsum_init(&astat_data_size, 0);
+ aggsum_init(&astat_metadata_size, 0);
+ aggsum_init(&astat_hdr_size, 0);
+ aggsum_init(&astat_bonus_size, 0);
+ aggsum_init(&astat_dnode_size, 0);
+ aggsum_init(&astat_dbuf_size, 0);
+ aggsum_init(&astat_l2_hdr_size, 0);
+}
+
+static void
+arc_state_fini(void)
+{
+ zfs_refcount_destroy(&arc_anon->arcs_esize[ARC_BUFC_METADATA]);
+ zfs_refcount_destroy(&arc_anon->arcs_esize[ARC_BUFC_DATA]);
+ zfs_refcount_destroy(&arc_mru->arcs_esize[ARC_BUFC_METADATA]);
+ zfs_refcount_destroy(&arc_mru->arcs_esize[ARC_BUFC_DATA]);
+ zfs_refcount_destroy(&arc_mru_ghost->arcs_esize[ARC_BUFC_METADATA]);
+ zfs_refcount_destroy(&arc_mru_ghost->arcs_esize[ARC_BUFC_DATA]);
+ zfs_refcount_destroy(&arc_mfu->arcs_esize[ARC_BUFC_METADATA]);
+ zfs_refcount_destroy(&arc_mfu->arcs_esize[ARC_BUFC_DATA]);
+ zfs_refcount_destroy(&arc_mfu_ghost->arcs_esize[ARC_BUFC_METADATA]);
+ zfs_refcount_destroy(&arc_mfu_ghost->arcs_esize[ARC_BUFC_DATA]);
+ zfs_refcount_destroy(&arc_l2c_only->arcs_esize[ARC_BUFC_METADATA]);
+ zfs_refcount_destroy(&arc_l2c_only->arcs_esize[ARC_BUFC_DATA]);
+
+ zfs_refcount_destroy(&arc_anon->arcs_size);
+ zfs_refcount_destroy(&arc_mru->arcs_size);
+ zfs_refcount_destroy(&arc_mru_ghost->arcs_size);
+ zfs_refcount_destroy(&arc_mfu->arcs_size);
+ zfs_refcount_destroy(&arc_mfu_ghost->arcs_size);
+ zfs_refcount_destroy(&arc_l2c_only->arcs_size);
+
+ multilist_destroy(arc_mru->arcs_list[ARC_BUFC_METADATA]);
+ multilist_destroy(arc_mru_ghost->arcs_list[ARC_BUFC_METADATA]);
+ multilist_destroy(arc_mfu->arcs_list[ARC_BUFC_METADATA]);
+ multilist_destroy(arc_mfu_ghost->arcs_list[ARC_BUFC_METADATA]);
+ multilist_destroy(arc_mru->arcs_list[ARC_BUFC_DATA]);
+ multilist_destroy(arc_mru_ghost->arcs_list[ARC_BUFC_DATA]);
+ multilist_destroy(arc_mfu->arcs_list[ARC_BUFC_DATA]);
+ multilist_destroy(arc_mfu_ghost->arcs_list[ARC_BUFC_DATA]);
+
+ aggsum_fini(&arc_meta_used);
+ aggsum_fini(&arc_size);
+ aggsum_fini(&astat_data_size);
+ aggsum_fini(&astat_metadata_size);
+ aggsum_fini(&astat_hdr_size);
+ aggsum_fini(&astat_bonus_size);
+ aggsum_fini(&astat_dnode_size);
+ aggsum_fini(&astat_dbuf_size);
+ aggsum_fini(&astat_l2_hdr_size);
+}
+
+uint64_t
+arc_max_bytes(void)
+{
+ return (arc_c_max);
+}
+
+void
+arc_init(void)
+{
+ int i, prefetch_tunable_set = 0;
+
+ /*
+ * allmem is "all memory that we could possibly use".
+ */
+#ifdef illumos
+#ifdef _KERNEL
+ uint64_t allmem = ptob(physmem - swapfs_minfree);
+#else
+ uint64_t allmem = (physmem * PAGESIZE) / 2;
+#endif
+#else
+ uint64_t allmem = kmem_size();
+#endif
+ mutex_init(&arc_adjust_lock, NULL, MUTEX_DEFAULT, NULL);
+ cv_init(&arc_adjust_waiters_cv, NULL, CV_DEFAULT, NULL);
+
+ mutex_init(&arc_dnlc_evicts_lock, NULL, MUTEX_DEFAULT, NULL);
+ cv_init(&arc_dnlc_evicts_cv, NULL, CV_DEFAULT, NULL);
+
+ /* set min cache to 1/32 of all memory, or arc_abs_min, whichever is more */
+ arc_c_min = MAX(allmem / 32, arc_abs_min);
+ /* set max to 5/8 of all memory, or all but 1GB, whichever is more */
+ if (allmem >= 1 << 30)
+ arc_c_max = allmem - (1 << 30);
+ else
+ arc_c_max = arc_c_min;
+ arc_c_max = MAX(allmem * 5 / 8, arc_c_max);
+
+ /*
+ * In userland, there's only the memory pressure that we artificially
+ * create (see arc_available_memory()). Don't let arc_c get too
+ * small, because it can cause transactions to be larger than
+ * arc_c, causing arc_tempreserve_space() to fail.
+ */
+#ifndef _KERNEL
+ arc_c_min = arc_c_max / 2;
+#endif
+
+#ifdef _KERNEL
+ /*
+ * Allow the tunables to override our calculations if they are
+ * reasonable.
+ */
+ if (zfs_arc_max > arc_abs_min && zfs_arc_max < allmem) {
+ arc_c_max = zfs_arc_max;
+ arc_c_min = MIN(arc_c_min, arc_c_max);
+ }
+ if (zfs_arc_min > arc_abs_min && zfs_arc_min <= arc_c_max)
+ arc_c_min = zfs_arc_min;
+#endif
+
+ arc_c = arc_c_max;
+ arc_p = (arc_c >> 1);
+
+ /* limit meta-data to 1/4 of the arc capacity */
+ arc_meta_limit = arc_c_max / 4;
+
+#ifdef _KERNEL
+ /*
+ * Metadata is stored in the kernel's heap. Don't let us
+ * use more than half the heap for the ARC.
+ */
+#ifdef __FreeBSD__
+ arc_meta_limit = MIN(arc_meta_limit, uma_limit() / 2);
+ arc_dnode_limit = arc_meta_limit / 10;
+#else
+ arc_meta_limit = MIN(arc_meta_limit,
+ vmem_size(heap_arena, VMEM_ALLOC | VMEM_FREE) / 2);
+#endif
+#endif
+
+ /* Allow the tunable to override if it is reasonable */
+ if (zfs_arc_meta_limit > 0 && zfs_arc_meta_limit <= arc_c_max)
+ arc_meta_limit = zfs_arc_meta_limit;
+
+ if (arc_c_min < arc_meta_limit / 2 && zfs_arc_min == 0)
+ arc_c_min = arc_meta_limit / 2;
+
+ if (zfs_arc_meta_min > 0) {
+ arc_meta_min = zfs_arc_meta_min;
+ } else {
+ arc_meta_min = arc_c_min / 2;
+ }
+
+ /* Valid range: <arc_meta_min> - <arc_c_max> */
+ if ((zfs_arc_dnode_limit) && (zfs_arc_dnode_limit != arc_dnode_limit) &&
+ (zfs_arc_dnode_limit >= zfs_arc_meta_min) &&
+ (zfs_arc_dnode_limit <= arc_c_max))
+ arc_dnode_limit = zfs_arc_dnode_limit;
+
+ if (zfs_arc_grow_retry > 0)
+ arc_grow_retry = zfs_arc_grow_retry;
+
+ if (zfs_arc_shrink_shift > 0)
+ arc_shrink_shift = zfs_arc_shrink_shift;
+
+ if (zfs_arc_no_grow_shift > 0)
+ arc_no_grow_shift = zfs_arc_no_grow_shift;
+ /*
+ * Ensure that arc_no_grow_shift is less than arc_shrink_shift.
+ */
+ if (arc_no_grow_shift >= arc_shrink_shift)
+ arc_no_grow_shift = arc_shrink_shift - 1;
+
+ if (zfs_arc_p_min_shift > 0)
+ arc_p_min_shift = zfs_arc_p_min_shift;
+
+ /* if kmem_flags are set, let's try to use less memory */
+ if (kmem_debugging())
+ arc_c = arc_c / 2;
+ if (arc_c < arc_c_min)
+ arc_c = arc_c_min;
+
+ zfs_arc_min = arc_c_min;
+ zfs_arc_max = arc_c_max;
+
+ arc_state_init();
+
+ /*
+ * The arc must be "uninitialized", so that hdr_recl() (which is
+ * registered by buf_init()) will not access arc_reap_zthr before
+ * it is created.
+ */
+ ASSERT(!arc_initialized);
+ buf_init();
+
+ list_create(&arc_prune_list, sizeof (arc_prune_t),
+ offsetof(arc_prune_t, p_node));
+ mutex_init(&arc_prune_mtx, NULL, MUTEX_DEFAULT, NULL);
+
+ arc_prune_taskq = taskq_create("arc_prune", max_ncpus, minclsyspri,
+ max_ncpus, INT_MAX, TASKQ_PREPOPULATE | TASKQ_DYNAMIC);
+
+ arc_dnlc_evicts_thread_exit = FALSE;
+
+ arc_ksp = kstat_create("zfs", 0, "arcstats", "misc", KSTAT_TYPE_NAMED,
+ sizeof (arc_stats) / sizeof (kstat_named_t), KSTAT_FLAG_VIRTUAL);
+
+ if (arc_ksp != NULL) {
+ arc_ksp->ks_data = &arc_stats;
+ arc_ksp->ks_update = arc_kstat_update;
+ kstat_install(arc_ksp);
+ }
+
+ arc_adjust_zthr = zthr_create_timer(arc_adjust_cb_check,
+ arc_adjust_cb, NULL, SEC2NSEC(1));
+ arc_reap_zthr = zthr_create_timer(arc_reap_cb_check,
+ arc_reap_cb, NULL, SEC2NSEC(1));
+
+#ifdef _KERNEL
+ arc_event_lowmem = EVENTHANDLER_REGISTER(vm_lowmem, arc_lowmem, NULL,
+ EVENTHANDLER_PRI_FIRST);
+#endif
+
+ (void) thread_create(NULL, 0, arc_dnlc_evicts_thread, NULL, 0, &p0,
+ TS_RUN, minclsyspri);
+
+ arc_initialized = B_TRUE;
+ arc_warm = B_FALSE;
+
+ /*
+ * Calculate maximum amount of dirty data per pool.
+ *
+ * If it has been set by /etc/system, take that.
+ * Otherwise, use a percentage of physical memory defined by
+ * zfs_dirty_data_max_percent (default 10%) with a cap at
+ * zfs_dirty_data_max_max (default 4GB).
+ */
+ if (zfs_dirty_data_max == 0) {
+ zfs_dirty_data_max = ptob(physmem) *
+ zfs_dirty_data_max_percent / 100;
+ zfs_dirty_data_max = MIN(zfs_dirty_data_max,
+ zfs_dirty_data_max_max);
+ }
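+
+ /*
+ * For example, with 16 GB of physical memory and the default
+ * zfs_dirty_data_max_percent of 10, this computes 1.6 GB, well under
+ * the default 4 GB zfs_dirty_data_max_max cap.
+ */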
+
+#ifdef _KERNEL
+ if (TUNABLE_INT_FETCH("vfs.zfs.prefetch_disable", &zfs_prefetch_disable))
+ prefetch_tunable_set = 1;
+
+#ifdef __i386__
+ if (prefetch_tunable_set == 0) {
+ printf("ZFS NOTICE: Prefetch is disabled by default on i386 "
+ "-- to enable,\n");
+ printf(" add \"vfs.zfs.prefetch_disable=0\" "
+ "to /boot/loader.conf.\n");
+ zfs_prefetch_disable = 1;
+ }
+#else
+ if ((((uint64_t)physmem * PAGESIZE) < (1ULL << 32)) &&
+ prefetch_tunable_set == 0) {
+ printf("ZFS NOTICE: Prefetch is disabled by default if less "
+ "than 4GB of RAM is present;\n"
+ " to enable, add \"vfs.zfs.prefetch_disable=0\" "
+ "to /boot/loader.conf.\n");
+ zfs_prefetch_disable = 1;
+ }
+#endif
+ /* Warn about ZFS memory and address space requirements. */
+ if (((uint64_t)physmem * PAGESIZE) < (256 + 128 + 64) * (1 << 20)) {
+ printf("ZFS WARNING: Recommended minimum RAM size is 512MB; "
+ "expect unstable behavior.\n");
+ }
+ if (allmem < 512 * (1 << 20)) {
+ printf("ZFS WARNING: Recommended minimum kmem_size is 512MB; "
+ "expect unstable behavior.\n");
+ printf(" Consider tuning vm.kmem_size and "
+ "vm.kmem_size_max\n");
+ printf(" in /boot/loader.conf.\n");
+ }
+#endif
+}
+
+void
+arc_fini(void)
+{
+ arc_prune_t *p;
+
+#ifdef _KERNEL
+ if (arc_event_lowmem != NULL)
+ EVENTHANDLER_DEREGISTER(vm_lowmem, arc_event_lowmem);
+#endif
+
+ /* Use B_TRUE to ensure *all* buffers are evicted */
+ arc_flush(NULL, B_TRUE);
+
+ mutex_enter(&arc_dnlc_evicts_lock);
+ arc_dnlc_evicts_thread_exit = TRUE;
+ /*
+ * The dnlc evicts thread will set arc_dnlc_evicts_thread_exit
+ * to FALSE when it is finished exiting; we're waiting for that.
+ */
+ while (arc_dnlc_evicts_thread_exit) {
+ cv_signal(&arc_dnlc_evicts_cv);
+ cv_wait(&arc_dnlc_evicts_cv, &arc_dnlc_evicts_lock);
+ }
+ mutex_exit(&arc_dnlc_evicts_lock);
+
+ arc_initialized = B_FALSE;
+
+ if (arc_ksp != NULL) {
+ kstat_delete(arc_ksp);
+ arc_ksp = NULL;
+ }
+
+ taskq_wait(arc_prune_taskq);
+ taskq_destroy(arc_prune_taskq);
+
+ mutex_enter(&arc_prune_mtx);
+ while ((p = list_head(&arc_prune_list)) != NULL) {
+ list_remove(&arc_prune_list, p);
+ zfs_refcount_remove(&p->p_refcnt, &arc_prune_list);
+ zfs_refcount_destroy(&p->p_refcnt);
+ kmem_free(p, sizeof (*p));
+ }
+ mutex_exit(&arc_prune_mtx);
+
+ list_destroy(&arc_prune_list);
+ mutex_destroy(&arc_prune_mtx);
+
+ (void) zthr_cancel(arc_adjust_zthr);
+ zthr_destroy(arc_adjust_zthr);
+
+ mutex_destroy(&arc_dnlc_evicts_lock);
+ cv_destroy(&arc_dnlc_evicts_cv);
+
+ (void) zthr_cancel(arc_reap_zthr);
+ zthr_destroy(arc_reap_zthr);
+
+ mutex_destroy(&arc_adjust_lock);
+ cv_destroy(&arc_adjust_waiters_cv);
+
+ /*
+ * buf_fini() must precede arc_state_fini() because buf_fini() may
+ * trigger the release of kmem magazines, which can call back into
+ * arc_space_return(), which accesses aggsums freed in arc_state_fini().
+ */
+ buf_fini();
+ arc_state_fini();
+
+ ASSERT0(arc_loaned_bytes);
+}
+
+/*
+ * Level 2 ARC
+ *
+ * The level 2 ARC (L2ARC) is a cache layer in-between main memory and disk.
+ * It uses dedicated storage devices to hold cached data, which are populated
+ * using large infrequent writes. The main role of this cache is to boost
+ * the performance of random read workloads. The intended L2ARC devices
+ * include short-stroked disks, solid state disks, and other media with
+ * substantially faster read latency than disk.
+ *
+ * +-----------------------+
+ * | ARC |
+ * +-----------------------+
+ * | ^ ^
+ * | | |
+ * l2arc_feed_thread() arc_read()
+ * | | |
+ * | l2arc read |
+ * V | |
+ * +---------------+ |
+ * | L2ARC | |
+ * +---------------+ |
+ * | ^ |
+ * l2arc_write() | |
+ * | | |
+ * V | |
+ * +-------+ +-------+
+ * | vdev | | vdev |
+ * | cache | | cache |
+ * +-------+ +-------+
+ * +=========+ .-----.
+ * : L2ARC : |-_____-|
+ * : devices : | Disks |
+ * +=========+ `-_____-'
+ *
+ * Read requests are satisfied from the following sources, in order:
+ *
+ * 1) ARC
+ * 2) vdev cache of L2ARC devices
+ * 3) L2ARC devices
+ * 4) vdev cache of disks
+ * 5) disks
+ *
+ * Some L2ARC device types exhibit extremely slow write performance.
+ * To accommodate this, there are some significant differences between
+ * the L2ARC and traditional cache design:
+ *
+ * 1. There is no eviction path from the ARC to the L2ARC. Evictions from
+ * the ARC behave as usual, freeing buffers and placing headers on ghost
+ * lists. The ARC does not send buffers to the L2ARC during eviction as
+ * this would add inflated write latencies for all ARC memory pressure.
+ *
+ * 2. The L2ARC attempts to cache data from the ARC before it is evicted.
+ * It does this by periodically scanning buffers from the eviction-end of
+ * the MFU and MRU ARC lists, copying them to the L2ARC devices if they are
+ * not already there. It scans until a headroom of buffers is satisfied,
+ * which itself is a buffer for ARC eviction. If a compressible buffer is
+ * found during scanning and selected for writing to an L2ARC device, we
+ * temporarily boost scanning headroom during the next scan cycle to make
+ * sure we adapt to compression effects (which might significantly reduce
+ * the data volume we write to L2ARC). The thread that does this is
+ * l2arc_feed_thread(), illustrated below; example sizes are included to
+ * provide a better sense of ratio than this diagram:
+ *
+ * head --> tail
+ * +---------------------+----------+
+ * ARC_mfu |:::::#:::::::::::::::|o#o###o###|-->. # already on L2ARC
+ * +---------------------+----------+ | o L2ARC eligible
+ * ARC_mru |:#:::::::::::::::::::|#o#ooo####|-->| : ARC buffer
+ * +---------------------+----------+ |
+ * 15.9 Gbytes ^ 32 Mbytes |
+ * headroom |
+ * l2arc_feed_thread()
+ * |
+ * l2arc write hand <--[oooo]--'
+ * | 8 Mbyte
+ * | write max
+ * V
+ * +==============================+
+ * L2ARC dev |####|#|###|###| |####| ... |
+ * +==============================+
+ * 32 Gbytes
+ *
+ * 3. If an ARC buffer is copied to the L2ARC but then hit instead of
+ * evicted, then the L2ARC has cached a buffer much sooner than it probably
+ * needed to, potentially wasting L2ARC device bandwidth and storage. It is
+ * safe to say that this is an uncommon case, since buffers at the end of
+ * the ARC lists have moved there due to inactivity.
+ *
+ * 4. If the ARC evicts faster than the L2ARC can maintain a headroom,
+ * then the L2ARC simply misses copying some buffers. This serves as a
+ * pressure valve to prevent heavy read workloads from both stalling the ARC
+ * with waits and clogging the L2ARC with writes. This also helps prevent
+ * the potential for the L2ARC to churn if it attempts to cache content too
+ * quickly, such as during backups of the entire pool.
+ *
+ * 5. After system boot and before the ARC has filled main memory, there are
+ * no evictions from the ARC and so the tails of the ARC_mfu and ARC_mru
+ * lists can remain mostly static. Instead of searching from the tail
+ * of these lists as pictured, the l2arc_feed_thread() will search from
+ * the list heads for eligible buffers, greatly increasing its chance
+ * of finding them.
+ *
+ * The L2ARC device write speed is also boosted during this time so that
+ * the L2ARC warms up faster. Since there have been no ARC evictions yet,
+ * there are no L2ARC reads, and no fear of degrading read performance
+ * through increased writes.
+ *
+ * 6. Writes to the L2ARC devices are grouped and sent in-sequence, so that
+ * the vdev queue can aggregate them into larger and fewer writes. Each
+ * device is written to in a rotor fashion, sweeping writes through
+ * available space then repeating.
+ *
+ * 7. The L2ARC does not store dirty content. It never needs to flush
+ * write buffers back to disk based storage.
+ *
+ * 8. If an ARC buffer is written (and dirtied) which also exists in the
+ * L2ARC, the now stale L2ARC buffer is immediately dropped.
+ *
+ * The performance of the L2ARC can be tweaked by a number of tunables, which
+ * may be necessary for different workloads:
+ *
+ * l2arc_write_max max write bytes per interval
+ * l2arc_write_boost extra write bytes during device warmup
+ * l2arc_noprefetch skip caching prefetched buffers
+ * l2arc_headroom number of max device writes to precache
+ * l2arc_headroom_boost when we find compressed buffers during ARC
+ * scanning, we multiply headroom by this
+ * percentage factor for the next scan cycle,
+ * since more compressed buffers are likely to
+ * be present
+ * l2arc_feed_secs seconds between L2ARC writing
+ *
+ * Tunables may be removed or added as future performance improvements are
+ * integrated, and also may become zpool properties.
+ *
+ * There are three key functions that control how the L2ARC warms up:
+ *
+ * l2arc_write_eligible() check if a buffer is eligible to cache
+ * l2arc_write_size() calculate how much to write
+ * l2arc_write_interval() calculate sleep delay between writes
+ *
+ * These three functions determine what to write, how much, and how quickly
+ * to send writes.
+ */
+
+static boolean_t
+l2arc_write_eligible(uint64_t spa_guid, arc_buf_hdr_t *hdr)
+{
+ /*
+ * A buffer is *not* eligible for the L2ARC if it:
+ * 1. belongs to a different spa.
+ * 2. is already cached on the L2ARC.
+ * 3. has an I/O in progress (it may be an incomplete read).
+ * 4. is flagged not eligible (zfs property).
+ */
+ if (hdr->b_spa != spa_guid) {
+ ARCSTAT_BUMP(arcstat_l2_write_spa_mismatch);
+ return (B_FALSE);
+ }
+ if (HDR_HAS_L2HDR(hdr)) {
+ ARCSTAT_BUMP(arcstat_l2_write_in_l2);
+ return (B_FALSE);
+ }
+ if (HDR_IO_IN_PROGRESS(hdr)) {
+ ARCSTAT_BUMP(arcstat_l2_write_hdr_io_in_progress);
+ return (B_FALSE);
+ }
+ if (!HDR_L2CACHE(hdr)) {
+ ARCSTAT_BUMP(arcstat_l2_write_not_cacheable);
+ return (B_FALSE);
+ }
+
+ return (B_TRUE);
+}
+
+static uint64_t
+l2arc_write_size(void)
+{
+ uint64_t size;
+
+ /*
+ * Make sure our globals have meaningful values in case the user
+ * altered them.
+ */
+ size = l2arc_write_max;
+ if (size == 0) {
+ cmn_err(CE_NOTE, "Bad value for l2arc_write_max, value must "
+ "be greater than zero, resetting it to the default (%d)",
+ L2ARC_WRITE_SIZE);
+ size = l2arc_write_max = L2ARC_WRITE_SIZE;
+ }
+
+ if (arc_warm == B_FALSE)
+ size += l2arc_write_boost;
+
+	return (size);
+}
+
+static clock_t
+l2arc_write_interval(clock_t began, uint64_t wanted, uint64_t wrote)
+{
+ clock_t interval, next, now;
+
+ /*
+ * If the ARC lists are busy, increase our write rate; if the
+ * lists are stale, idle back. This is achieved by checking
+ * how much we previously wrote - if it was more than half of
+ * what we wanted, schedule the next write much sooner.
+ */
+ if (l2arc_feed_again && wrote > (wanted / 2))
+ interval = (hz * l2arc_feed_min_ms) / 1000;
+ else
+ interval = hz * l2arc_feed_secs;
+
+ now = ddi_get_lbolt();
+ next = MAX(now, MIN(now + interval, began + interval));
+
+ return (next);
+}
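+
+/*
+ * Illustrative sketch (not part of the original source): the interval
+ * selection above, reproduced as a standalone userland program. The
+ * HZ, FEED_SECS and FEED_MIN_MS values are assumed stand-ins for the
+ * kernel's hz, l2arc_feed_secs and l2arc_feed_min_ms.
+ */
+#include <stdio.h>
+#include <stdint.h>
+
+#define	HZ		1000	/* assumed clock ticks per second */
+#define	FEED_SECS	1	/* stand-in for l2arc_feed_secs */
+#define	FEED_MIN_MS	200	/* stand-in for l2arc_feed_min_ms */
+
+static long
+feed_interval(uint64_t wanted, uint64_t wrote)
+{
+	/* Busy lists (wrote more than half of wanted): feed again soon. */
+	if (wrote > wanted / 2)
+		return ((HZ * FEED_MIN_MS) / 1000);
+	/* Stale lists: idle back to the full feed period. */
+	return ((long)HZ * FEED_SECS);
+}
+
+int
+main(void)
+{
+	printf("busy: %ld ticks\n", feed_interval(8 << 20, 6 << 20));
+	printf("idle: %ld ticks\n", feed_interval(8 << 20, 1 << 20));
+	return (0);
+}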
+
+/*
+ * Cycle through L2ARC devices. This is how L2ARC load balances.
+ * If a device is returned, this also returns holding the spa config lock.
+ */
+static l2arc_dev_t *
+l2arc_dev_get_next(void)
+{
+ l2arc_dev_t *first, *next = NULL;
+
+ /*
+ * Lock out the removal of spas (spa_namespace_lock), then removal
+ * of cache devices (l2arc_dev_mtx). Once a device has been selected,
+ * both locks will be dropped and a spa config lock held instead.
+ */
+ mutex_enter(&spa_namespace_lock);
+ mutex_enter(&l2arc_dev_mtx);
+
+ /* if there are no vdevs, there is nothing to do */
+ if (l2arc_ndev == 0)
+ goto out;
+
+ first = NULL;
+ next = l2arc_dev_last;
+ do {
+ /* loop around the list looking for a non-faulted vdev */
+ if (next == NULL) {
+ next = list_head(l2arc_dev_list);
+ } else {
+ next = list_next(l2arc_dev_list, next);
+ if (next == NULL)
+ next = list_head(l2arc_dev_list);
+ }
+
+ /* if we have come back to the start, bail out */
+ if (first == NULL)
+ first = next;
+ else if (next == first)
+ break;
+
+ } while (vdev_is_dead(next->l2ad_vdev));
+
+ /* if we were unable to find any usable vdevs, return NULL */
+ if (vdev_is_dead(next->l2ad_vdev))
+ next = NULL;
+
+ l2arc_dev_last = next;
+
+out:
+ mutex_exit(&l2arc_dev_mtx);
+
+ /*
+ * Grab the config lock to prevent the 'next' device from being
+ * removed while we are writing to it.
+ */
+ if (next != NULL)
+ spa_config_enter(next->l2ad_spa, SCL_L2ARC, next, RW_READER);
+ mutex_exit(&spa_namespace_lock);
+
+ return (next);
+}
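+
+/*
+ * Illustrative sketch (not part of the original source): the rotor scan
+ * above reduced to an array walk. Starting one past the device used
+ * last, it wraps around at most once, skipping dead entries, and gives
+ * up if it returns to its starting point. All names are hypothetical.
+ */
+#include <stddef.h>
+
+struct dev {
+	int dead;	/* nonzero if the device is faulted */
+};
+
+static struct dev *
+rotor_next(struct dev *devs, size_t ndev, size_t *last)
+{
+	size_t start, i;
+
+	if (ndev == 0)
+		return (NULL);
+
+	start = i = (*last + 1) % ndev;
+	do {
+		if (!devs[i].dead) {
+			*last = i;
+			return (&devs[i]);
+		}
+		i = (i + 1) % ndev;
+	} while (i != start);
+
+	return (NULL);	/* every device is faulted */
+}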
+
+/*
+ * Free buffers that were tagged for destruction.
+ */
+static void
+l2arc_do_free_on_write(void)
+{
+ list_t *buflist;
+ l2arc_data_free_t *df, *df_prev;
+
+ mutex_enter(&l2arc_free_on_write_mtx);
+ buflist = l2arc_free_on_write;
+
+ for (df = list_tail(buflist); df; df = df_prev) {
+ df_prev = list_prev(buflist, df);
+ ASSERT3P(df->l2df_abd, !=, NULL);
+ abd_free(df->l2df_abd);
+ list_remove(buflist, df);
+ kmem_free(df, sizeof (l2arc_data_free_t));
+ }
+
+ mutex_exit(&l2arc_free_on_write_mtx);
+}
+
+/*
+ * A write to a cache device has completed. Update all headers to allow
+ * reads from these buffers to begin.
+ */
+static void
+l2arc_write_done(zio_t *zio)
+{
+ l2arc_write_callback_t *cb;
+ l2arc_dev_t *dev;
+ list_t *buflist;
+ arc_buf_hdr_t *head, *hdr, *hdr_prev;
+ kmutex_t *hash_lock;
+ int64_t bytes_dropped = 0;
+
+ cb = zio->io_private;
+ ASSERT3P(cb, !=, NULL);
+ dev = cb->l2wcb_dev;
+ ASSERT3P(dev, !=, NULL);
+ head = cb->l2wcb_head;
+ ASSERT3P(head, !=, NULL);
+ buflist = &dev->l2ad_buflist;
+ ASSERT3P(buflist, !=, NULL);
+ DTRACE_PROBE2(l2arc__iodone, zio_t *, zio,
+ l2arc_write_callback_t *, cb);
+
+ if (zio->io_error != 0)
+ ARCSTAT_BUMP(arcstat_l2_writes_error);
+
+ /*
+ * All writes completed, or an error was hit.
+ */
+top:
+ mutex_enter(&dev->l2ad_mtx);
+ for (hdr = list_prev(buflist, head); hdr; hdr = hdr_prev) {
+ hdr_prev = list_prev(buflist, hdr);
+
+ hash_lock = HDR_LOCK(hdr);
+
+ /*
+ * We cannot use mutex_enter or else we can deadlock
+		 * with l2arc_write_buffers (due to swapping the order
+		 * in which the hash lock and l2ad_mtx are taken).
+ */
+ if (!mutex_tryenter(hash_lock)) {
+ /*
+ * Missed the hash lock. We must retry so we
+ * don't leave the ARC_FLAG_L2_WRITING bit set.
+ */
+ ARCSTAT_BUMP(arcstat_l2_writes_lock_retry);
+
+ /*
+ * We don't want to rescan the headers we've
+ * already marked as having been written out, so
+ * we reinsert the head node so we can pick up
+ * where we left off.
+ */
+ list_remove(buflist, head);
+ list_insert_after(buflist, hdr, head);
+
+ mutex_exit(&dev->l2ad_mtx);
+
+ /*
+ * We wait for the hash lock to become available
+			 * to try to prevent busy waiting, and to increase
+			 * the chance that we'll be able to acquire the lock
+ * the next time around.
+ */
+ mutex_enter(hash_lock);
+ mutex_exit(hash_lock);
+ goto top;
+ }
+
+ /*
+ * We could not have been moved into the arc_l2c_only
+ * state while in-flight due to our ARC_FLAG_L2_WRITING
+ * bit being set. Let's just ensure that's being enforced.
+ */
+ ASSERT(HDR_HAS_L1HDR(hdr));
+
+ if (zio->io_error != 0) {
+ /*
+ * Error - drop L2ARC entry.
+ */
+ list_remove(buflist, hdr);
+ l2arc_trim(hdr);
+ arc_hdr_clear_flags(hdr, ARC_FLAG_HAS_L2HDR);
+
+ ARCSTAT_INCR(arcstat_l2_psize, -arc_hdr_size(hdr));
+ ARCSTAT_INCR(arcstat_l2_lsize, -HDR_GET_LSIZE(hdr));
+
+ bytes_dropped += arc_hdr_size(hdr);
+ (void) zfs_refcount_remove_many(&dev->l2ad_alloc,
+ arc_hdr_size(hdr), hdr);
+ }
+
+ /*
+ * Allow ARC to begin reads and ghost list evictions to
+ * this L2ARC entry.
+ */
+ arc_hdr_clear_flags(hdr, ARC_FLAG_L2_WRITING);
+
+ mutex_exit(hash_lock);
+ }
+
+ atomic_inc_64(&l2arc_writes_done);
+ list_remove(buflist, head);
+ ASSERT(!HDR_HAS_L1HDR(head));
+ kmem_cache_free(hdr_l2only_cache, head);
+ mutex_exit(&dev->l2ad_mtx);
+
+ vdev_space_update(dev->l2ad_vdev, -bytes_dropped, 0, 0);
+
+ l2arc_do_free_on_write();
+
+ kmem_free(cb, sizeof (l2arc_write_callback_t));
+}
+
+/*
+ * A read to a cache device completed. Validate buffer contents before
+ * handing over to the regular ARC routines.
+ */
+static void
+l2arc_read_done(zio_t *zio)
+{
+ l2arc_read_callback_t *cb;
+ arc_buf_hdr_t *hdr;
+ kmutex_t *hash_lock;
+ boolean_t valid_cksum;
+
+ ASSERT3P(zio->io_vd, !=, NULL);
+ ASSERT(zio->io_flags & ZIO_FLAG_DONT_PROPAGATE);
+
+ spa_config_exit(zio->io_spa, SCL_L2ARC, zio->io_vd);
+
+ cb = zio->io_private;
+ ASSERT3P(cb, !=, NULL);
+ hdr = cb->l2rcb_hdr;
+ ASSERT3P(hdr, !=, NULL);
+
+ hash_lock = HDR_LOCK(hdr);
+ mutex_enter(hash_lock);
+ ASSERT3P(hash_lock, ==, HDR_LOCK(hdr));
+
+ /*
+ * If the data was read into a temporary buffer,
+ * move it and free the buffer.
+ */
+ if (cb->l2rcb_abd != NULL) {
+ ASSERT3U(arc_hdr_size(hdr), <, zio->io_size);
+ if (zio->io_error == 0) {
+ abd_copy(hdr->b_l1hdr.b_pabd, cb->l2rcb_abd,
+ arc_hdr_size(hdr));
+ }
+
+ /*
+ * The following must be done regardless of whether
+ * there was an error:
+ * - free the temporary buffer
+ * - point zio to the real ARC buffer
+ * - set zio size accordingly
+ * These are required because zio is either re-used for
+ * an I/O of the block in the case of the error
+ * or the zio is passed to arc_read_done() and it
+ * needs real data.
+ */
+ abd_free(cb->l2rcb_abd);
+ zio->io_size = zio->io_orig_size = arc_hdr_size(hdr);
+ zio->io_abd = zio->io_orig_abd = hdr->b_l1hdr.b_pabd;
+ }
+
+ ASSERT3P(zio->io_abd, !=, NULL);
+
+ /*
+ * Check this survived the L2ARC journey.
+ */
+ ASSERT3P(zio->io_abd, ==, hdr->b_l1hdr.b_pabd);
+ zio->io_bp_copy = cb->l2rcb_bp; /* XXX fix in L2ARC 2.0 */
+ zio->io_bp = &zio->io_bp_copy; /* XXX fix in L2ARC 2.0 */
+
+ valid_cksum = arc_cksum_is_equal(hdr, zio);
+ if (valid_cksum && zio->io_error == 0 && !HDR_L2_EVICTED(hdr)) {
+ mutex_exit(hash_lock);
+ zio->io_private = hdr;
+ arc_read_done(zio);
+ } else {
+ /*
+ * Buffer didn't survive caching. Increment stats and
+ * reissue to the original storage device.
+ */
+ if (zio->io_error != 0) {
+ ARCSTAT_BUMP(arcstat_l2_io_error);
+ } else {
+ zio->io_error = SET_ERROR(EIO);
+ }
+ if (!valid_cksum)
+ ARCSTAT_BUMP(arcstat_l2_cksum_bad);
+
+ /*
+ * If there's no waiter, issue an async i/o to the primary
+ * storage now. If there *is* a waiter, the caller must
+ * issue the i/o in a context where it's OK to block.
+ */
+ if (zio->io_waiter == NULL) {
+ zio_t *pio = zio_unique_parent(zio);
+
+ ASSERT(!pio || pio->io_child_type == ZIO_CHILD_LOGICAL);
+
+ zio = zio_read(pio, zio->io_spa, zio->io_bp,
+ hdr->b_l1hdr.b_pabd, zio->io_size, arc_read_done,
+ hdr, zio->io_priority, cb->l2rcb_flags,
+ &cb->l2rcb_zb);
+ for (struct arc_callback *acb = hdr->b_l1hdr.b_acb;
+ acb != NULL; acb = acb->acb_next)
+ acb->acb_zio_head = zio;
+ mutex_exit(hash_lock);
+ zio_nowait(zio);
+ } else
+ mutex_exit(hash_lock);
+ }
+
+ kmem_free(cb, sizeof (l2arc_read_callback_t));
+}
+
+/*
+ * This is the list priority from which the L2ARC will search for pages to
+ * cache. This is used within loops (0..3) to cycle through lists in the
+ * desired order. This order can have a significant effect on cache
+ * performance.
+ *
+ * Currently the metadata lists are hit first, MFU then MRU, followed by
+ * the data lists. This function returns a locked list, and also returns
+ * the lock pointer.
+ */
+static multilist_sublist_t *
+l2arc_sublist_lock(int list_num)
+{
+ multilist_t *ml = NULL;
+ unsigned int idx;
+
+ ASSERT(list_num >= 0 && list_num <= 3);
+
+ switch (list_num) {
+ case 0:
+ ml = arc_mfu->arcs_list[ARC_BUFC_METADATA];
+ break;
+ case 1:
+ ml = arc_mru->arcs_list[ARC_BUFC_METADATA];
+ break;
+ case 2:
+ ml = arc_mfu->arcs_list[ARC_BUFC_DATA];
+ break;
+ case 3:
+ ml = arc_mru->arcs_list[ARC_BUFC_DATA];
+ break;
+ }
+
+ /*
+ * Return a randomly-selected sublist. This is acceptable
+ * because the caller feeds only a little bit of data for each
+ * call (8MB). Subsequent calls will result in different
+ * sublists being selected.
+ */
+ idx = multilist_get_random_index(ml);
+ return (multilist_sublist_lock(ml, idx));
+}
+
+/*
+ * Evict buffers from the device write hand to the distance specified in
+ * bytes. This distance may span populated buffers, or it may span nothing.
+ * This is clearing a region on the L2ARC device ready for writing.
+ * If the 'all' boolean is set, every buffer is evicted.
+ */
+static void
+l2arc_evict(l2arc_dev_t *dev, uint64_t distance, boolean_t all)
+{
+ list_t *buflist;
+ arc_buf_hdr_t *hdr, *hdr_prev;
+ kmutex_t *hash_lock;
+ uint64_t taddr;
+
+ buflist = &dev->l2ad_buflist;
+
+ if (!all && dev->l2ad_first) {
+ /*
+ * This is the first sweep through the device. There is
+ * nothing to evict.
+ */
+ return;
+ }
+
+ if (dev->l2ad_hand >= (dev->l2ad_end - (2 * distance))) {
+ /*
+ * When nearing the end of the device, evict to the end
+ * before the device write hand jumps to the start.
+ */
+ taddr = dev->l2ad_end;
+ } else {
+ taddr = dev->l2ad_hand + distance;
+ }
+ DTRACE_PROBE4(l2arc__evict, l2arc_dev_t *, dev, list_t *, buflist,
+ uint64_t, taddr, boolean_t, all);
+
+top:
+ mutex_enter(&dev->l2ad_mtx);
+ for (hdr = list_tail(buflist); hdr; hdr = hdr_prev) {
+ hdr_prev = list_prev(buflist, hdr);
+
+ hash_lock = HDR_LOCK(hdr);
+
+ /*
+ * We cannot use mutex_enter or else we can deadlock
+		 * with l2arc_write_buffers (due to swapping the order
+		 * in which the hash lock and l2ad_mtx are taken).
+ */
+ if (!mutex_tryenter(hash_lock)) {
+ /*
+ * Missed the hash lock. Retry.
+ */
+ ARCSTAT_BUMP(arcstat_l2_evict_lock_retry);
+ mutex_exit(&dev->l2ad_mtx);
+ mutex_enter(hash_lock);
+ mutex_exit(hash_lock);
+ goto top;
+ }
+
+ /*
+		 * A header can't be on this list if it doesn't have an L2 header.
+ */
+ ASSERT(HDR_HAS_L2HDR(hdr));
+
+ /* Ensure this header has finished being written. */
+ ASSERT(!HDR_L2_WRITING(hdr));
+ ASSERT(!HDR_L2_WRITE_HEAD(hdr));
+
+ if (!all && (hdr->b_l2hdr.b_daddr >= taddr ||
+ hdr->b_l2hdr.b_daddr < dev->l2ad_hand)) {
+ /*
+ * We've evicted to the target address,
+ * or the end of the device.
+ */
+ mutex_exit(hash_lock);
+ break;
+ }
+
+ if (!HDR_HAS_L1HDR(hdr)) {
+ ASSERT(!HDR_L2_READING(hdr));
+ /*
+ * This doesn't exist in the ARC. Destroy.
+ * arc_hdr_destroy() will call list_remove()
+ * and decrement arcstat_l2_lsize.
+ */
+ arc_change_state(arc_anon, hdr, hash_lock);
+ arc_hdr_destroy(hdr);
+ } else {
+ ASSERT(hdr->b_l1hdr.b_state != arc_l2c_only);
+ ARCSTAT_BUMP(arcstat_l2_evict_l1cached);
+ /*
+ * Invalidate issued or about to be issued
+ * reads, since we may be about to write
+ * over this location.
+ */
+ if (HDR_L2_READING(hdr)) {
+ ARCSTAT_BUMP(arcstat_l2_evict_reading);
+ arc_hdr_set_flags(hdr, ARC_FLAG_L2_EVICTED);
+ }
+
+ arc_hdr_l2hdr_destroy(hdr);
+ }
+ mutex_exit(hash_lock);
+ }
+ mutex_exit(&dev->l2ad_mtx);
+}
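+
+/*
+ * Illustrative sketch (not part of the original source): the target
+ * address computation used above. When the write hand is within two
+ * eviction distances of the end of the device, the region is cleared
+ * all the way to the end, since the hand is about to wrap around.
+ */
+#include <stdint.h>
+
+static uint64_t
+evict_target(uint64_t hand, uint64_t end, uint64_t distance)
+{
+	if (hand >= end - 2 * distance)
+		return (end);		/* evict to the end before wrapping */
+	return (hand + distance);	/* evict a fixed window ahead */
+}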
+
+/*
+ * Find and write ARC buffers to the L2ARC device.
+ *
+ * An ARC_FLAG_L2_WRITING flag is set so that the L2ARC buffers are not valid
+ * for reading until they have completed writing.
+ *
+ * Returns the number of bytes actually written (which may be smaller than
+ * the delta by which the device hand has changed due to alignment).
+ */
+static uint64_t
+l2arc_write_buffers(spa_t *spa, l2arc_dev_t *dev, uint64_t target_sz)
+{
+ arc_buf_hdr_t *hdr, *hdr_prev, *head;
+ uint64_t write_asize, write_psize, write_lsize, headroom;
+ boolean_t full;
+ l2arc_write_callback_t *cb;
+ zio_t *pio, *wzio;
+ uint64_t guid = spa_load_guid(spa);
+ int try;
+
+ ASSERT3P(dev->l2ad_vdev, !=, NULL);
+
+ pio = NULL;
+ write_lsize = write_asize = write_psize = 0;
+ full = B_FALSE;
+ head = kmem_cache_alloc(hdr_l2only_cache, KM_PUSHPAGE);
+ arc_hdr_set_flags(head, ARC_FLAG_L2_WRITE_HEAD | ARC_FLAG_HAS_L2HDR);
+
+ ARCSTAT_BUMP(arcstat_l2_write_buffer_iter);
+ /*
+ * Copy buffers for L2ARC writing.
+ */
+ for (try = 0; try <= 3; try++) {
+ multilist_sublist_t *mls = l2arc_sublist_lock(try);
+ uint64_t passed_sz = 0;
+
+ ARCSTAT_BUMP(arcstat_l2_write_buffer_list_iter);
+
+ /*
+ * L2ARC fast warmup.
+ *
+ * Until the ARC is warm and starts to evict, read from the
+ * head of the ARC lists rather than the tail.
+ */
+ if (arc_warm == B_FALSE)
+ hdr = multilist_sublist_head(mls);
+ else
+ hdr = multilist_sublist_tail(mls);
+ if (hdr == NULL)
+ ARCSTAT_BUMP(arcstat_l2_write_buffer_list_null_iter);
+
+ headroom = target_sz * l2arc_headroom;
+ if (zfs_compressed_arc_enabled)
+ headroom = (headroom * l2arc_headroom_boost) / 100;
+
+ for (; hdr; hdr = hdr_prev) {
+ kmutex_t *hash_lock;
+
+ if (arc_warm == B_FALSE)
+ hdr_prev = multilist_sublist_next(mls, hdr);
+ else
+ hdr_prev = multilist_sublist_prev(mls, hdr);
+ ARCSTAT_INCR(arcstat_l2_write_buffer_bytes_scanned,
+ HDR_GET_LSIZE(hdr));
+
+ hash_lock = HDR_LOCK(hdr);
+ if (!mutex_tryenter(hash_lock)) {
+ ARCSTAT_BUMP(arcstat_l2_write_trylock_fail);
+ /*
+ * Skip this buffer rather than waiting.
+ */
+ continue;
+ }
+
+ passed_sz += HDR_GET_LSIZE(hdr);
+ if (passed_sz > headroom) {
+ /*
+ * Searched too far.
+ */
+ mutex_exit(hash_lock);
+ ARCSTAT_BUMP(arcstat_l2_write_passed_headroom);
+ break;
+ }
+
+ if (!l2arc_write_eligible(guid, hdr)) {
+ mutex_exit(hash_lock);
+ continue;
+ }
+
+ /*
+ * We rely on the L1 portion of the header below, so
+ * it's invalid for this header to have been evicted out
+			 * of the ghost cache prior to being written out. The
+ * ARC_FLAG_L2_WRITING bit ensures this won't happen.
+ */
+ ASSERT(HDR_HAS_L1HDR(hdr));
+
+ ASSERT3U(HDR_GET_PSIZE(hdr), >, 0);
+ ASSERT3P(hdr->b_l1hdr.b_pabd, !=, NULL);
+ ASSERT3U(arc_hdr_size(hdr), >, 0);
+ uint64_t psize = arc_hdr_size(hdr);
+ uint64_t asize = vdev_psize_to_asize(dev->l2ad_vdev,
+ psize);
+
+ if ((write_asize + asize) > target_sz) {
+ full = B_TRUE;
+ mutex_exit(hash_lock);
+ ARCSTAT_BUMP(arcstat_l2_write_full);
+ break;
+ }
+
+ if (pio == NULL) {
+ /*
+ * Insert a dummy header on the buflist so
+ * l2arc_write_done() can find where the
+ * write buffers begin without searching.
+ */
+ mutex_enter(&dev->l2ad_mtx);
+ list_insert_head(&dev->l2ad_buflist, head);
+ mutex_exit(&dev->l2ad_mtx);
+
+ cb = kmem_alloc(
+ sizeof (l2arc_write_callback_t), KM_SLEEP);
+ cb->l2wcb_dev = dev;
+ cb->l2wcb_head = head;
+ pio = zio_root(spa, l2arc_write_done, cb,
+ ZIO_FLAG_CANFAIL);
+ ARCSTAT_BUMP(arcstat_l2_write_pios);
+ }
+
+ hdr->b_l2hdr.b_dev = dev;
+ hdr->b_l2hdr.b_daddr = dev->l2ad_hand;
+ arc_hdr_set_flags(hdr,
+ ARC_FLAG_L2_WRITING | ARC_FLAG_HAS_L2HDR);
+
+ mutex_enter(&dev->l2ad_mtx);
+ list_insert_head(&dev->l2ad_buflist, hdr);
+ mutex_exit(&dev->l2ad_mtx);
+
+ (void) zfs_refcount_add_many(&dev->l2ad_alloc, psize,
+ hdr);
+
+ /*
+ * Normally the L2ARC can use the hdr's data, but if
+ * we're sharing data between the hdr and one of its
+ * bufs, L2ARC needs its own copy of the data so that
+ * the ZIO below can't race with the buf consumer.
+ * Another case where we need to create a copy of the
+ * data is when the buffer size is not device-aligned
+ * and we need to pad the block to make it such.
+ * That also keeps the clock hand suitably aligned.
+ *
+ * To ensure that the copy will be available for the
+ * lifetime of the ZIO and be cleaned up afterwards, we
+ * add it to the l2arc_free_on_write queue.
+ */
+ abd_t *to_write;
+ if (!HDR_SHARED_DATA(hdr) && psize == asize) {
+ to_write = hdr->b_l1hdr.b_pabd;
+ } else {
+ to_write = abd_alloc_for_io(asize,
+ HDR_ISTYPE_METADATA(hdr));
+ abd_copy(to_write, hdr->b_l1hdr.b_pabd, psize);
+ if (asize != psize) {
+ abd_zero_off(to_write, psize,
+ asize - psize);
+ }
+ l2arc_free_abd_on_write(to_write, asize,
+ arc_buf_type(hdr));
+ }
+ wzio = zio_write_phys(pio, dev->l2ad_vdev,
+ hdr->b_l2hdr.b_daddr, asize, to_write,
+ ZIO_CHECKSUM_OFF, NULL, hdr,
+ ZIO_PRIORITY_ASYNC_WRITE,
+ ZIO_FLAG_CANFAIL, B_FALSE);
+
+ write_lsize += HDR_GET_LSIZE(hdr);
+ DTRACE_PROBE2(l2arc__write, vdev_t *, dev->l2ad_vdev,
+ zio_t *, wzio);
+
+ write_psize += psize;
+ write_asize += asize;
+ dev->l2ad_hand += asize;
+
+ mutex_exit(hash_lock);
+
+ (void) zio_nowait(wzio);
+ }
+
+ multilist_sublist_unlock(mls);
+
+ if (full == B_TRUE)
+ break;
+ }
+
+ /* No buffers selected for writing? */
+ if (pio == NULL) {
+ ASSERT0(write_lsize);
+ ASSERT(!HDR_HAS_L1HDR(head));
+ kmem_cache_free(hdr_l2only_cache, head);
+ return (0);
+ }
+
+ ASSERT3U(write_psize, <=, target_sz);
+ ARCSTAT_BUMP(arcstat_l2_writes_sent);
+ ARCSTAT_INCR(arcstat_l2_write_bytes, write_psize);
+ ARCSTAT_INCR(arcstat_l2_lsize, write_lsize);
+ ARCSTAT_INCR(arcstat_l2_psize, write_psize);
+ vdev_space_update(dev->l2ad_vdev, write_psize, 0, 0);
+
+ /*
+ * Bump device hand to the device start if it is approaching the end.
+ * l2arc_evict() will already have evicted ahead for this case.
+ */
+ if (dev->l2ad_hand >= (dev->l2ad_end - target_sz)) {
+ dev->l2ad_hand = dev->l2ad_start;
+ dev->l2ad_first = B_FALSE;
+ }
+
+ dev->l2ad_writing = B_TRUE;
+ (void) zio_wait(pio);
+ dev->l2ad_writing = B_FALSE;
+
+ return (write_asize);
+}
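+
+/*
+ * Illustrative sketch (not part of the original source): the zero
+ * padding performed above when a buffer's physical size is not aligned
+ * to the device's allocation size. The function and parameter names
+ * are assumptions; asize is psize rounded up to 1 << ashift.
+ */
+#include <stdint.h>
+#include <stdlib.h>
+#include <string.h>
+
+static void *
+pad_to_asize(const void *data, uint64_t psize, int ashift, uint64_t *asizep)
+{
+	uint64_t align = 1ULL << ashift;
+	uint64_t asize = (psize + align - 1) & ~(align - 1);
+	char *copy = malloc(asize);
+
+	if (copy == NULL)
+		return (NULL);
+	memcpy(copy, data, psize);
+	memset(copy + psize, 0, asize - psize);	/* zero-fill the tail */
+	*asizep = asize;
+	return (copy);
+}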
+
+/*
+ * This thread feeds the L2ARC at regular intervals. This is the beating
+ * heart of the L2ARC.
+ */
+/* ARGSUSED */
+static void
+l2arc_feed_thread(void *unused __unused)
+{
+ callb_cpr_t cpr;
+ l2arc_dev_t *dev;
+ spa_t *spa;
+ uint64_t size, wrote;
+ clock_t begin, next = ddi_get_lbolt();
+
+ CALLB_CPR_INIT(&cpr, &l2arc_feed_thr_lock, callb_generic_cpr, FTAG);
+
+ mutex_enter(&l2arc_feed_thr_lock);
+
+ while (l2arc_thread_exit == 0) {
+ CALLB_CPR_SAFE_BEGIN(&cpr);
+ (void) cv_timedwait(&l2arc_feed_thr_cv, &l2arc_feed_thr_lock,
+ next - ddi_get_lbolt());
+ CALLB_CPR_SAFE_END(&cpr, &l2arc_feed_thr_lock);
+ next = ddi_get_lbolt() + hz;
+
+ /*
+ * Quick check for L2ARC devices.
+ */
+ mutex_enter(&l2arc_dev_mtx);
+ if (l2arc_ndev == 0) {
+ mutex_exit(&l2arc_dev_mtx);
+ continue;
+ }
+ mutex_exit(&l2arc_dev_mtx);
+ begin = ddi_get_lbolt();
+
+ /*
+ * This selects the next l2arc device to write to, and in
+ * doing so the next spa to feed from: dev->l2ad_spa. This
+ * will return NULL if there are now no l2arc devices or if
+ * they are all faulted.
+ *
+ * If a device is returned, its spa's config lock is also
+ * held to prevent device removal. l2arc_dev_get_next()
+ * will grab and release l2arc_dev_mtx.
+ */
+ if ((dev = l2arc_dev_get_next()) == NULL)
+ continue;
+
+ spa = dev->l2ad_spa;
+ ASSERT3P(spa, !=, NULL);
+
+ /*
+ * If the pool is read-only then force the feed thread to
+ * sleep a little longer.
+ */
+ if (!spa_writeable(spa)) {
+ next = ddi_get_lbolt() + 5 * l2arc_feed_secs * hz;
+ spa_config_exit(spa, SCL_L2ARC, dev);
+ continue;
+ }
+
+ /*
+ * Avoid contributing to memory pressure.
+ */
+ if (arc_reclaim_needed()) {
+ ARCSTAT_BUMP(arcstat_l2_abort_lowmem);
+ spa_config_exit(spa, SCL_L2ARC, dev);
+ continue;
+ }
+
+ ARCSTAT_BUMP(arcstat_l2_feeds);
+
+ size = l2arc_write_size();
+
+ /*
+ * Evict L2ARC buffers that will be overwritten.
+ */
+ l2arc_evict(dev, size, B_FALSE);
+
+ /*
+ * Write ARC buffers.
+ */
+ wrote = l2arc_write_buffers(spa, dev, size);
+
+ /*
+ * Calculate interval between writes.
+ */
+ next = l2arc_write_interval(begin, size, wrote);
+ spa_config_exit(spa, SCL_L2ARC, dev);
+ }
+
+ l2arc_thread_exit = 0;
+ cv_broadcast(&l2arc_feed_thr_cv);
+ CALLB_CPR_EXIT(&cpr); /* drops l2arc_feed_thr_lock */
+ thread_exit();
+}
+
+boolean_t
+l2arc_vdev_present(vdev_t *vd)
+{
+ l2arc_dev_t *dev;
+
+ mutex_enter(&l2arc_dev_mtx);
+ for (dev = list_head(l2arc_dev_list); dev != NULL;
+ dev = list_next(l2arc_dev_list, dev)) {
+ if (dev->l2ad_vdev == vd)
+ break;
+ }
+ mutex_exit(&l2arc_dev_mtx);
+
+ return (dev != NULL);
+}
+
+/*
+ * Add a vdev for use by the L2ARC. By this point the spa has already
+ * validated the vdev and opened it.
+ */
+void
+l2arc_add_vdev(spa_t *spa, vdev_t *vd)
+{
+ l2arc_dev_t *adddev;
+
+ ASSERT(!l2arc_vdev_present(vd));
+
+ vdev_ashift_optimize(vd);
+
+ /*
+ * Create a new l2arc device entry.
+ */
+ adddev = kmem_zalloc(sizeof (l2arc_dev_t), KM_SLEEP);
+ adddev->l2ad_spa = spa;
+ adddev->l2ad_vdev = vd;
+ adddev->l2ad_start = VDEV_LABEL_START_SIZE;
+ adddev->l2ad_end = VDEV_LABEL_START_SIZE + vdev_get_min_asize(vd);
+ adddev->l2ad_hand = adddev->l2ad_start;
+ adddev->l2ad_first = B_TRUE;
+ adddev->l2ad_writing = B_FALSE;
+
+ mutex_init(&adddev->l2ad_mtx, NULL, MUTEX_DEFAULT, NULL);
+ /*
+ * This is a list of all ARC buffers that are still valid on the
+ * device.
+ */
+ list_create(&adddev->l2ad_buflist, sizeof (arc_buf_hdr_t),
+ offsetof(arc_buf_hdr_t, b_l2hdr.b_l2node));
+
+ vdev_space_update(vd, 0, 0, adddev->l2ad_end - adddev->l2ad_hand);
+ zfs_refcount_create(&adddev->l2ad_alloc);
+
+ /*
+ * Add device to global list
+ */
+ mutex_enter(&l2arc_dev_mtx);
+ list_insert_head(l2arc_dev_list, adddev);
+ atomic_inc_64(&l2arc_ndev);
+ mutex_exit(&l2arc_dev_mtx);
+}
+
+/*
+ * Remove a vdev from the L2ARC.
+ */
+void
+l2arc_remove_vdev(vdev_t *vd)
+{
+ l2arc_dev_t *dev, *nextdev, *remdev = NULL;
+
+ /*
+ * Find the device by vdev
+ */
+ mutex_enter(&l2arc_dev_mtx);
+ for (dev = list_head(l2arc_dev_list); dev; dev = nextdev) {
+ nextdev = list_next(l2arc_dev_list, dev);
+ if (vd == dev->l2ad_vdev) {
+ remdev = dev;
+ break;
+ }
+ }
+ ASSERT3P(remdev, !=, NULL);
+
+ /*
+ * Remove device from global list
+ */
+ list_remove(l2arc_dev_list, remdev);
+ l2arc_dev_last = NULL; /* may have been invalidated */
+ atomic_dec_64(&l2arc_ndev);
+ mutex_exit(&l2arc_dev_mtx);
+
+ /*
+ * Clear all buflists and ARC references. L2ARC device flush.
+ */
+ l2arc_evict(remdev, 0, B_TRUE);
+ list_destroy(&remdev->l2ad_buflist);
+ mutex_destroy(&remdev->l2ad_mtx);
+ zfs_refcount_destroy(&remdev->l2ad_alloc);
+ kmem_free(remdev, sizeof (l2arc_dev_t));
+}
+
+void
+l2arc_init(void)
+{
+ l2arc_thread_exit = 0;
+ l2arc_ndev = 0;
+ l2arc_writes_sent = 0;
+ l2arc_writes_done = 0;
+
+ mutex_init(&l2arc_feed_thr_lock, NULL, MUTEX_DEFAULT, NULL);
+ cv_init(&l2arc_feed_thr_cv, NULL, CV_DEFAULT, NULL);
+ mutex_init(&l2arc_dev_mtx, NULL, MUTEX_DEFAULT, NULL);
+ mutex_init(&l2arc_free_on_write_mtx, NULL, MUTEX_DEFAULT, NULL);
+
+ l2arc_dev_list = &L2ARC_dev_list;
+ l2arc_free_on_write = &L2ARC_free_on_write;
+ list_create(l2arc_dev_list, sizeof (l2arc_dev_t),
+ offsetof(l2arc_dev_t, l2ad_node));
+ list_create(l2arc_free_on_write, sizeof (l2arc_data_free_t),
+ offsetof(l2arc_data_free_t, l2df_list_node));
+}
+
+void
+l2arc_fini(void)
+{
+ /*
+	 * This is called from dmu_fini(), which is called from spa_fini().
+ * Because of this, we can assume that all l2arc devices have
+ * already been removed when the pools themselves were removed.
+ */
+
+ l2arc_do_free_on_write();
+
+ mutex_destroy(&l2arc_feed_thr_lock);
+ cv_destroy(&l2arc_feed_thr_cv);
+ mutex_destroy(&l2arc_dev_mtx);
+ mutex_destroy(&l2arc_free_on_write_mtx);
+
+ list_destroy(l2arc_dev_list);
+ list_destroy(l2arc_free_on_write);
+}
+
+void
+l2arc_start(void)
+{
+ if (!(spa_mode_global & FWRITE))
+ return;
+
+ (void) thread_create(NULL, 0, l2arc_feed_thread, NULL, 0, &p0,
+ TS_RUN, minclsyspri);
+}
+
+void
+l2arc_stop(void)
+{
+ if (!(spa_mode_global & FWRITE))
+ return;
+
+ mutex_enter(&l2arc_feed_thr_lock);
+ cv_signal(&l2arc_feed_thr_cv); /* kick thread out of startup */
+ l2arc_thread_exit = 1;
+ while (l2arc_thread_exit != 0)
+ cv_wait(&l2arc_feed_thr_cv, &l2arc_feed_thr_lock);
+ mutex_exit(&l2arc_feed_thr_lock);
+}
diff --git a/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/blkptr.c b/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/blkptr.c
new file mode 100644
index 000000000000..d7a7fdb0e1b1
--- /dev/null
+++ b/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/blkptr.c
@@ -0,0 +1,152 @@
+/*
+ * CDDL HEADER START
+ *
+ * This file and its contents are supplied under the terms of the
+ * Common Development and Distribution License ("CDDL"), version 1.0.
+ * You may only use this file in accordance with the terms of version
+ * 1.0 of the CDDL.
+ *
+ * A full copy of the text of the CDDL should have accompanied this
+ * source. A copy of the CDDL is also available via the Internet at
+ * http://www.illumos.org/license/CDDL.
+ *
+ * CDDL HEADER END
+ */
+
+/*
+ * Copyright (c) 2013, 2016 by Delphix. All rights reserved.
+ */
+
+#include <sys/zfs_context.h>
+#include <sys/zio.h>
+#include <sys/zio_compress.h>
+
+/*
+ * Embedded-data Block Pointers
+ *
+ * Normally, block pointers point (via their DVAs) to a block which holds data.
+ * If the data that we need to store is very small, this is an inefficient
+ * use of space, because a block must be at minimum 1 sector (typically 512
+ * bytes or 4KB). Additionally, reading these small blocks tends to generate
+ * more random reads.
+ *
+ * Embedded-data Block Pointers allow small pieces of data (the "payload",
+ * up to 112 bytes) to be stored in the block pointer itself, instead of
+ * being pointed to. The "Pointer" part of this name is a bit of a
+ * misnomer, as nothing is pointed to.
+ *
+ * BP_EMBEDDED_TYPE_DATA block pointers allow highly-compressible data to
+ * be embedded in the block pointer. The logic for this is handled in
+ * the SPA, by the zio pipeline. Therefore most code outside the zio
+ * pipeline doesn't need special-cases to handle these block pointers.
+ *
+ * See spa.h for details on the exact layout of embedded block pointers.
+ */
+
+void
+encode_embedded_bp_compressed(blkptr_t *bp, void *data,
+ enum zio_compress comp, int uncompressed_size, int compressed_size)
+{
+ uint64_t *bp64 = (uint64_t *)bp;
+ uint64_t w = 0;
+ uint8_t *data8 = data;
+
+ ASSERT3U(compressed_size, <=, BPE_PAYLOAD_SIZE);
+ ASSERT(uncompressed_size == compressed_size ||
+ comp != ZIO_COMPRESS_OFF);
+ ASSERT3U(comp, >=, ZIO_COMPRESS_OFF);
+ ASSERT3U(comp, <, ZIO_COMPRESS_FUNCTIONS);
+
+ bzero(bp, sizeof (*bp));
+ BP_SET_EMBEDDED(bp, B_TRUE);
+ BP_SET_COMPRESS(bp, comp);
+ BP_SET_BYTEORDER(bp, ZFS_HOST_BYTEORDER);
+ BPE_SET_LSIZE(bp, uncompressed_size);
+ BPE_SET_PSIZE(bp, compressed_size);
+
+ /*
+ * Encode the byte array into the words of the block pointer.
+ * First byte goes into low bits of first word (little endian).
+ */
+ for (int i = 0; i < compressed_size; i++) {
+ BF64_SET(w, (i % sizeof (w)) * NBBY, NBBY, data8[i]);
+ if (i % sizeof (w) == sizeof (w) - 1) {
+ /* we've reached the end of a word */
+ ASSERT3P(bp64, <, bp + 1);
+ *bp64 = w;
+ bp64++;
+ if (!BPE_IS_PAYLOADWORD(bp, bp64))
+ bp64++;
+ w = 0;
+ }
+ }
+ /* write last partial word */
+ if (bp64 < (uint64_t *)(bp + 1))
+ *bp64 = w;
+}
+
+/*
+ * buf must be at least BPE_GET_PSIZE(bp) bytes long (which will never be
+ * more than BPE_PAYLOAD_SIZE bytes).
+ */
+void
+decode_embedded_bp_compressed(const blkptr_t *bp, void *buf)
+{
+ int psize;
+ uint8_t *buf8 = buf;
+ uint64_t w = 0;
+ const uint64_t *bp64 = (const uint64_t *)bp;
+
+ ASSERT(BP_IS_EMBEDDED(bp));
+
+ psize = BPE_GET_PSIZE(bp);
+
+ /*
+ * Decode the words of the block pointer into the byte array.
+ * Low bits of first word are the first byte (little endian).
+ */
+ for (int i = 0; i < psize; i++) {
+ if (i % sizeof (w) == 0) {
+ /* beginning of a word */
+ ASSERT3P(bp64, <, bp + 1);
+ w = *bp64;
+ bp64++;
+ if (!BPE_IS_PAYLOADWORD(bp, bp64))
+ bp64++;
+ }
+ buf8[i] = BF64_GET(w, (i % sizeof (w)) * NBBY, NBBY);
+ }
+}
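+
+/*
+ * Illustrative sketch (not part of the original source): a userland
+ * round trip of the byte<->word packing used by the two functions
+ * above, without the BF64_* macros or the payload-word skipping.
+ * Byte i lands in bits 8*(i%8)..8*(i%8)+7 of word i/8 (little endian).
+ */
+#include <assert.h>
+#include <stdint.h>
+#include <string.h>
+
+int
+main(void)
+{
+	uint8_t in[19], out[19];
+	uint64_t words[3] = { 0 };
+	int i;
+
+	for (i = 0; i < 19; i++)
+		in[i] = (uint8_t)(i * 7 + 1);
+
+	/* Encode: low bits of the first word hold the first byte. */
+	for (i = 0; i < 19; i++)
+		words[i / 8] |= (uint64_t)in[i] << (8 * (i % 8));
+
+	/* Decode: extract the same 8-bit field back out. */
+	for (i = 0; i < 19; i++)
+		out[i] = (uint8_t)(words[i / 8] >> (8 * (i % 8)));
+
+	assert(memcmp(in, out, sizeof (in)) == 0);
+	return (0);
+}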
+
+/*
+ * Fill in the buffer with the (decompressed) payload of the embedded
+ * blkptr_t. Takes into account compression and byteorder (the payload is
+ * treated as a stream of bytes).
+ * Return 0 on success, or ENOSPC if it won't fit in the buffer.
+ */
+int
+decode_embedded_bp(const blkptr_t *bp, void *buf, int buflen)
+{
+ int lsize, psize;
+
+ ASSERT(BP_IS_EMBEDDED(bp));
+
+ lsize = BPE_GET_LSIZE(bp);
+ psize = BPE_GET_PSIZE(bp);
+
+ if (lsize > buflen)
+ return (ENOSPC);
+ ASSERT3U(lsize, ==, buflen);
+
+ if (BP_GET_COMPRESS(bp) != ZIO_COMPRESS_OFF) {
+ uint8_t dstbuf[BPE_PAYLOAD_SIZE];
+ decode_embedded_bp_compressed(bp, dstbuf);
+ VERIFY0(zio_decompress_data_buf(BP_GET_COMPRESS(bp),
+ dstbuf, buf, psize, buflen));
+ } else {
+ ASSERT3U(lsize, ==, psize);
+ decode_embedded_bp_compressed(bp, buf);
+ }
+
+ return (0);
+}
diff --git a/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/bplist.c b/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/bplist.c
new file mode 100644
index 000000000000..ee12db3a266d
--- /dev/null
+++ b/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/bplist.c
@@ -0,0 +1,77 @@
+/*
+ * CDDL HEADER START
+ *
+ * The contents of this file are subject to the terms of the
+ * Common Development and Distribution License (the "License").
+ * You may not use this file except in compliance with the License.
+ *
+ * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
+ * or http://www.opensolaris.org/os/licensing.
+ * See the License for the specific language governing permissions
+ * and limitations under the License.
+ *
+ * When distributing Covered Code, include this CDDL HEADER in each
+ * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
+ * If applicable, add the following below this CDDL HEADER, with the
+ * fields enclosed by brackets "[]" replaced with your own identifying
+ * information: Portions Copyright [yyyy] [name of copyright owner]
+ *
+ * CDDL HEADER END
+ */
+/*
+ * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved.
+ * Copyright (c) 2012 by Delphix. All rights reserved.
+ */
+
+#include <sys/bplist.h>
+#include <sys/zfs_context.h>
+
+
+void
+bplist_create(bplist_t *bpl)
+{
+ mutex_init(&bpl->bpl_lock, NULL, MUTEX_DEFAULT, NULL);
+ list_create(&bpl->bpl_list, sizeof (bplist_entry_t),
+ offsetof(bplist_entry_t, bpe_node));
+}
+
+void
+bplist_destroy(bplist_t *bpl)
+{
+ list_destroy(&bpl->bpl_list);
+ mutex_destroy(&bpl->bpl_lock);
+}
+
+void
+bplist_append(bplist_t *bpl, const blkptr_t *bp)
+{
+ bplist_entry_t *bpe = kmem_alloc(sizeof (*bpe), KM_SLEEP);
+
+ mutex_enter(&bpl->bpl_lock);
+ bpe->bpe_blk = *bp;
+ list_insert_tail(&bpl->bpl_list, bpe);
+ mutex_exit(&bpl->bpl_lock);
+}
+
+/*
+ * To aid debugging, we keep the most recently removed entry. This way if
+ * we are in the callback, we can easily locate the entry.
+ */
+static bplist_entry_t *bplist_iterate_last_removed;
+
+void
+bplist_iterate(bplist_t *bpl, bplist_itor_t *func, void *arg, dmu_tx_t *tx)
+{
+ bplist_entry_t *bpe;
+
+ mutex_enter(&bpl->bpl_lock);
+	while ((bpe = list_head(&bpl->bpl_list)) != NULL) {
+ bplist_iterate_last_removed = bpe;
+ list_remove(&bpl->bpl_list, bpe);
+ mutex_exit(&bpl->bpl_lock);
+ func(arg, &bpe->bpe_blk, tx);
+ kmem_free(bpe, sizeof (*bpe));
+ mutex_enter(&bpl->bpl_lock);
+ }
+ mutex_exit(&bpl->bpl_lock);
+}
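+
+/*
+ * Illustrative sketch (not part of the original source): the locking
+ * pattern used by bplist_iterate() above, using POSIX threads instead
+ * of the kernel mutex API. The lock is dropped around each callback,
+ * since the callback may block, then re-taken before the next head
+ * element is examined.
+ */
+#include <pthread.h>
+#include <stddef.h>
+
+struct node {
+	struct node *next;
+};
+
+struct queue {
+	pthread_mutex_t lock;
+	struct node *head;
+};
+
+static void
+drain(struct queue *q, void (*func)(struct node *))
+{
+	struct node *n;
+
+	pthread_mutex_lock(&q->lock);
+	while ((n = q->head) != NULL) {
+		q->head = n->next;		/* unlink under the lock */
+		pthread_mutex_unlock(&q->lock);
+		func(n);			/* may block or re-enter */
+		pthread_mutex_lock(&q->lock);
+	}
+	pthread_mutex_unlock(&q->lock);
+}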
diff --git a/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/bpobj.c b/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/bpobj.c
new file mode 100644
index 000000000000..bbdd765214fc
--- /dev/null
+++ b/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/bpobj.c
@@ -0,0 +1,606 @@
+/*
+ * CDDL HEADER START
+ *
+ * The contents of this file are subject to the terms of the
+ * Common Development and Distribution License (the "License").
+ * You may not use this file except in compliance with the License.
+ *
+ * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
+ * or http://www.opensolaris.org/os/licensing.
+ * See the License for the specific language governing permissions
+ * and limitations under the License.
+ *
+ * When distributing Covered Code, include this CDDL HEADER in each
+ * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
+ * If applicable, add the following below this CDDL HEADER, with the
+ * fields enclosed by brackets "[]" replaced with your own identifying
+ * information: Portions Copyright [yyyy] [name of copyright owner]
+ *
+ * CDDL HEADER END
+ */
+/*
+ * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved.
+ * Copyright (c) 2011, 2016 by Delphix. All rights reserved.
+ * Copyright (c) 2014 Integros [integros.com]
+ * Copyright (c) 2017 Datto Inc.
+ */
+
+#include <sys/bpobj.h>
+#include <sys/zfs_context.h>
+#include <sys/refcount.h>
+#include <sys/dsl_pool.h>
+#include <sys/zfeature.h>
+#include <sys/zap.h>
+
+/*
+ * Return an empty bpobj, preferably the empty dummy one (dp_empty_bpobj).
+ */
+uint64_t
+bpobj_alloc_empty(objset_t *os, int blocksize, dmu_tx_t *tx)
+{
+ spa_t *spa = dmu_objset_spa(os);
+ dsl_pool_t *dp = dmu_objset_pool(os);
+
+ if (spa_feature_is_enabled(spa, SPA_FEATURE_EMPTY_BPOBJ)) {
+ if (!spa_feature_is_active(spa, SPA_FEATURE_EMPTY_BPOBJ)) {
+ ASSERT0(dp->dp_empty_bpobj);
+ dp->dp_empty_bpobj =
+ bpobj_alloc(os, SPA_OLD_MAXBLOCKSIZE, tx);
+ VERIFY(zap_add(os,
+ DMU_POOL_DIRECTORY_OBJECT,
+ DMU_POOL_EMPTY_BPOBJ, sizeof (uint64_t), 1,
+ &dp->dp_empty_bpobj, tx) == 0);
+ }
+ spa_feature_incr(spa, SPA_FEATURE_EMPTY_BPOBJ, tx);
+ ASSERT(dp->dp_empty_bpobj != 0);
+ return (dp->dp_empty_bpobj);
+ } else {
+ return (bpobj_alloc(os, blocksize, tx));
+ }
+}
+
+void
+bpobj_decr_empty(objset_t *os, dmu_tx_t *tx)
+{
+ dsl_pool_t *dp = dmu_objset_pool(os);
+
+ spa_feature_decr(dmu_objset_spa(os), SPA_FEATURE_EMPTY_BPOBJ, tx);
+ if (!spa_feature_is_active(dmu_objset_spa(os),
+ SPA_FEATURE_EMPTY_BPOBJ)) {
+ VERIFY3U(0, ==, zap_remove(dp->dp_meta_objset,
+ DMU_POOL_DIRECTORY_OBJECT,
+ DMU_POOL_EMPTY_BPOBJ, tx));
+ VERIFY3U(0, ==, dmu_object_free(os, dp->dp_empty_bpobj, tx));
+ dp->dp_empty_bpobj = 0;
+ }
+}
+
+uint64_t
+bpobj_alloc(objset_t *os, int blocksize, dmu_tx_t *tx)
+{
+ int size;
+
+ if (spa_version(dmu_objset_spa(os)) < SPA_VERSION_BPOBJ_ACCOUNT)
+ size = BPOBJ_SIZE_V0;
+ else if (spa_version(dmu_objset_spa(os)) < SPA_VERSION_DEADLISTS)
+ size = BPOBJ_SIZE_V1;
+ else
+ size = sizeof (bpobj_phys_t);
+
+ return (dmu_object_alloc(os, DMU_OT_BPOBJ, blocksize,
+ DMU_OT_BPOBJ_HDR, size, tx));
+}
+
+void
+bpobj_free(objset_t *os, uint64_t obj, dmu_tx_t *tx)
+{
+ int64_t i;
+ bpobj_t bpo;
+ dmu_object_info_t doi;
+ int epb;
+ dmu_buf_t *dbuf = NULL;
+
+ ASSERT(obj != dmu_objset_pool(os)->dp_empty_bpobj);
+ VERIFY3U(0, ==, bpobj_open(&bpo, os, obj));
+
+ mutex_enter(&bpo.bpo_lock);
+
+ if (!bpo.bpo_havesubobj || bpo.bpo_phys->bpo_subobjs == 0)
+ goto out;
+
+ VERIFY3U(0, ==, dmu_object_info(os, bpo.bpo_phys->bpo_subobjs, &doi));
+ epb = doi.doi_data_block_size / sizeof (uint64_t);
+
+ for (i = bpo.bpo_phys->bpo_num_subobjs - 1; i >= 0; i--) {
+ uint64_t *objarray;
+ uint64_t offset, blkoff;
+
+ offset = i * sizeof (uint64_t);
+ blkoff = P2PHASE(i, epb);
+
+ if (dbuf == NULL || dbuf->db_offset > offset) {
+ if (dbuf)
+ dmu_buf_rele(dbuf, FTAG);
+ VERIFY3U(0, ==, dmu_buf_hold(os,
+ bpo.bpo_phys->bpo_subobjs, offset, FTAG, &dbuf, 0));
+ }
+
+ ASSERT3U(offset, >=, dbuf->db_offset);
+ ASSERT3U(offset, <, dbuf->db_offset + dbuf->db_size);
+
+ objarray = dbuf->db_data;
+ bpobj_free(os, objarray[blkoff], tx);
+ }
+ if (dbuf) {
+ dmu_buf_rele(dbuf, FTAG);
+ dbuf = NULL;
+ }
+ VERIFY3U(0, ==, dmu_object_free(os, bpo.bpo_phys->bpo_subobjs, tx));
+
+out:
+ mutex_exit(&bpo.bpo_lock);
+ bpobj_close(&bpo);
+
+ VERIFY3U(0, ==, dmu_object_free(os, obj, tx));
+}
+
+int
+bpobj_open(bpobj_t *bpo, objset_t *os, uint64_t object)
+{
+ dmu_object_info_t doi;
+ int err;
+
+ err = dmu_object_info(os, object, &doi);
+ if (err)
+ return (err);
+
+ bzero(bpo, sizeof (*bpo));
+ mutex_init(&bpo->bpo_lock, NULL, MUTEX_DEFAULT, NULL);
+
+ ASSERT(bpo->bpo_dbuf == NULL);
+ ASSERT(bpo->bpo_phys == NULL);
+ ASSERT(object != 0);
+ ASSERT3U(doi.doi_type, ==, DMU_OT_BPOBJ);
+ ASSERT3U(doi.doi_bonus_type, ==, DMU_OT_BPOBJ_HDR);
+
+ err = dmu_bonus_hold(os, object, bpo, &bpo->bpo_dbuf);
+ if (err)
+ return (err);
+
+ bpo->bpo_os = os;
+ bpo->bpo_object = object;
+ bpo->bpo_epb = doi.doi_data_block_size >> SPA_BLKPTRSHIFT;
+ bpo->bpo_havecomp = (doi.doi_bonus_size > BPOBJ_SIZE_V0);
+ bpo->bpo_havesubobj = (doi.doi_bonus_size > BPOBJ_SIZE_V1);
+ bpo->bpo_phys = bpo->bpo_dbuf->db_data;
+ return (0);
+}
+
+boolean_t
+bpobj_is_open(const bpobj_t *bpo)
+{
+ return (bpo->bpo_object != 0);
+}
+
+void
+bpobj_close(bpobj_t *bpo)
+{
+ /* Lame workaround for closing a bpobj that was never opened. */
+ if (bpo->bpo_object == 0)
+ return;
+
+ dmu_buf_rele(bpo->bpo_dbuf, bpo);
+ if (bpo->bpo_cached_dbuf != NULL)
+ dmu_buf_rele(bpo->bpo_cached_dbuf, bpo);
+ bpo->bpo_dbuf = NULL;
+ bpo->bpo_phys = NULL;
+ bpo->bpo_cached_dbuf = NULL;
+ bpo->bpo_object = 0;
+
+ mutex_destroy(&bpo->bpo_lock);
+}
+
+boolean_t
+bpobj_is_empty(bpobj_t *bpo)
+{
+ return (bpo->bpo_phys->bpo_num_blkptrs == 0 &&
+ (!bpo->bpo_havesubobj || bpo->bpo_phys->bpo_num_subobjs == 0));
+}
+
+static int
+bpobj_iterate_impl(bpobj_t *bpo, bpobj_itor_t func, void *arg, dmu_tx_t *tx,
+ boolean_t free)
+{
+ dmu_object_info_t doi;
+ int epb;
+ int64_t i;
+ int err = 0;
+ dmu_buf_t *dbuf = NULL;
+
+ ASSERT(bpobj_is_open(bpo));
+ mutex_enter(&bpo->bpo_lock);
+
+ if (free)
+ dmu_buf_will_dirty(bpo->bpo_dbuf, tx);
+
+ for (i = bpo->bpo_phys->bpo_num_blkptrs - 1; i >= 0; i--) {
+ blkptr_t *bparray;
+ blkptr_t *bp;
+ uint64_t offset, blkoff;
+
+ offset = i * sizeof (blkptr_t);
+ blkoff = P2PHASE(i, bpo->bpo_epb);
+
+ if (dbuf == NULL || dbuf->db_offset > offset) {
+ if (dbuf)
+ dmu_buf_rele(dbuf, FTAG);
+ err = dmu_buf_hold(bpo->bpo_os, bpo->bpo_object, offset,
+ FTAG, &dbuf, 0);
+ if (err)
+ break;
+ }
+
+ ASSERT3U(offset, >=, dbuf->db_offset);
+ ASSERT3U(offset, <, dbuf->db_offset + dbuf->db_size);
+
+ bparray = dbuf->db_data;
+ bp = &bparray[blkoff];
+ err = func(arg, bp, tx);
+ if (err)
+ break;
+ if (free) {
+ bpo->bpo_phys->bpo_bytes -=
+ bp_get_dsize_sync(dmu_objset_spa(bpo->bpo_os), bp);
+ ASSERT3S(bpo->bpo_phys->bpo_bytes, >=, 0);
+ if (bpo->bpo_havecomp) {
+ bpo->bpo_phys->bpo_comp -= BP_GET_PSIZE(bp);
+ bpo->bpo_phys->bpo_uncomp -= BP_GET_UCSIZE(bp);
+ }
+ bpo->bpo_phys->bpo_num_blkptrs--;
+ ASSERT3S(bpo->bpo_phys->bpo_num_blkptrs, >=, 0);
+ }
+ }
+ if (dbuf) {
+ dmu_buf_rele(dbuf, FTAG);
+ dbuf = NULL;
+ }
+ if (free) {
+ VERIFY3U(0, ==, dmu_free_range(bpo->bpo_os, bpo->bpo_object,
+ (i + 1) * sizeof (blkptr_t), -1ULL, tx));
+ }
+ if (err || !bpo->bpo_havesubobj || bpo->bpo_phys->bpo_subobjs == 0)
+ goto out;
+
+ ASSERT(bpo->bpo_havecomp);
+ err = dmu_object_info(bpo->bpo_os, bpo->bpo_phys->bpo_subobjs, &doi);
+ if (err) {
+ mutex_exit(&bpo->bpo_lock);
+ return (err);
+ }
+ ASSERT3U(doi.doi_type, ==, DMU_OT_BPOBJ_SUBOBJ);
+ epb = doi.doi_data_block_size / sizeof (uint64_t);
+
+ for (i = bpo->bpo_phys->bpo_num_subobjs - 1; i >= 0; i--) {
+ uint64_t *objarray;
+ uint64_t offset, blkoff;
+ bpobj_t sublist;
+ uint64_t used_before, comp_before, uncomp_before;
+ uint64_t used_after, comp_after, uncomp_after;
+
+ offset = i * sizeof (uint64_t);
+ blkoff = P2PHASE(i, epb);
+
+ if (dbuf == NULL || dbuf->db_offset > offset) {
+ if (dbuf)
+ dmu_buf_rele(dbuf, FTAG);
+ err = dmu_buf_hold(bpo->bpo_os,
+ bpo->bpo_phys->bpo_subobjs, offset, FTAG, &dbuf, 0);
+ if (err)
+ break;
+ }
+
+ ASSERT3U(offset, >=, dbuf->db_offset);
+ ASSERT3U(offset, <, dbuf->db_offset + dbuf->db_size);
+
+ objarray = dbuf->db_data;
+ err = bpobj_open(&sublist, bpo->bpo_os, objarray[blkoff]);
+ if (err)
+ break;
+ if (free) {
+ err = bpobj_space(&sublist,
+ &used_before, &comp_before, &uncomp_before);
+ if (err != 0) {
+ bpobj_close(&sublist);
+ break;
+ }
+ }
+ err = bpobj_iterate_impl(&sublist, func, arg, tx, free);
+ if (free) {
+ VERIFY3U(0, ==, bpobj_space(&sublist,
+ &used_after, &comp_after, &uncomp_after));
+ bpo->bpo_phys->bpo_bytes -= used_before - used_after;
+ ASSERT3S(bpo->bpo_phys->bpo_bytes, >=, 0);
+ bpo->bpo_phys->bpo_comp -= comp_before - comp_after;
+ bpo->bpo_phys->bpo_uncomp -=
+ uncomp_before - uncomp_after;
+ }
+
+ bpobj_close(&sublist);
+ if (err)
+ break;
+ if (free) {
+ err = dmu_object_free(bpo->bpo_os,
+ objarray[blkoff], tx);
+ if (err)
+ break;
+ bpo->bpo_phys->bpo_num_subobjs--;
+ ASSERT3S(bpo->bpo_phys->bpo_num_subobjs, >=, 0);
+ }
+ }
+ if (dbuf) {
+ dmu_buf_rele(dbuf, FTAG);
+ dbuf = NULL;
+ }
+ if (free) {
+ VERIFY3U(0, ==, dmu_free_range(bpo->bpo_os,
+ bpo->bpo_phys->bpo_subobjs,
+ (i + 1) * sizeof (uint64_t), -1ULL, tx));
+ }
+
+out:
+ /* If there are no entries, there should be no bytes. */
+ if (bpobj_is_empty(bpo)) {
+ ASSERT0(bpo->bpo_phys->bpo_bytes);
+ ASSERT0(bpo->bpo_phys->bpo_comp);
+ ASSERT0(bpo->bpo_phys->bpo_uncomp);
+ }
+
+ mutex_exit(&bpo->bpo_lock);
+ return (err);
+}
+
+/*
+ * Iterate and remove the entries. If func returns nonzero, iteration
+ * will stop and that entry will not be removed.
+ */
+int
+bpobj_iterate(bpobj_t *bpo, bpobj_itor_t func, void *arg, dmu_tx_t *tx)
+{
+ return (bpobj_iterate_impl(bpo, func, arg, tx, B_TRUE));
+}
+
+/*
+ * Iterate the entries. If func returns nonzero, iteration will stop.
+ */
+int
+bpobj_iterate_nofree(bpobj_t *bpo, bpobj_itor_t func, void *arg, dmu_tx_t *tx)
+{
+ return (bpobj_iterate_impl(bpo, func, arg, tx, B_FALSE));
+}
+
+void
+bpobj_enqueue_subobj(bpobj_t *bpo, uint64_t subobj, dmu_tx_t *tx)
+{
+ bpobj_t subbpo;
+ uint64_t used, comp, uncomp, subsubobjs;
+
+ ASSERT(bpobj_is_open(bpo));
+ ASSERT(subobj != 0);
+ ASSERT(bpo->bpo_havesubobj);
+ ASSERT(bpo->bpo_havecomp);
+ ASSERT(bpo->bpo_object != dmu_objset_pool(bpo->bpo_os)->dp_empty_bpobj);
+
+ if (subobj == dmu_objset_pool(bpo->bpo_os)->dp_empty_bpobj) {
+ bpobj_decr_empty(bpo->bpo_os, tx);
+ return;
+ }
+
+ VERIFY3U(0, ==, bpobj_open(&subbpo, bpo->bpo_os, subobj));
+ VERIFY3U(0, ==, bpobj_space(&subbpo, &used, &comp, &uncomp));
+
+ if (bpobj_is_empty(&subbpo)) {
+ /* No point in having an empty subobj. */
+ bpobj_close(&subbpo);
+ bpobj_free(bpo->bpo_os, subobj, tx);
+ return;
+ }
+
+ mutex_enter(&bpo->bpo_lock);
+ dmu_buf_will_dirty(bpo->bpo_dbuf, tx);
+ if (bpo->bpo_phys->bpo_subobjs == 0) {
+ bpo->bpo_phys->bpo_subobjs = dmu_object_alloc(bpo->bpo_os,
+ DMU_OT_BPOBJ_SUBOBJ, SPA_OLD_MAXBLOCKSIZE,
+ DMU_OT_NONE, 0, tx);
+ }
+
+ dmu_object_info_t doi;
+ ASSERT0(dmu_object_info(bpo->bpo_os, bpo->bpo_phys->bpo_subobjs, &doi));
+ ASSERT3U(doi.doi_type, ==, DMU_OT_BPOBJ_SUBOBJ);
+
+ dmu_write(bpo->bpo_os, bpo->bpo_phys->bpo_subobjs,
+ bpo->bpo_phys->bpo_num_subobjs * sizeof (subobj),
+ sizeof (subobj), &subobj, tx);
+ bpo->bpo_phys->bpo_num_subobjs++;
+
+ /*
+ * If subobj has only one block of subobjs, then move subobj's
+ * subobjs to bpo's subobj list directly. This reduces
+ * recursion in bpobj_iterate due to nested subobjs.
+ */
+ subsubobjs = subbpo.bpo_phys->bpo_subobjs;
+ if (subsubobjs != 0) {
+ dmu_object_info_t doi;
+
+ VERIFY3U(0, ==, dmu_object_info(bpo->bpo_os, subsubobjs, &doi));
+ if (doi.doi_max_offset == doi.doi_data_block_size) {
+ dmu_buf_t *subdb;
+ uint64_t numsubsub = subbpo.bpo_phys->bpo_num_subobjs;
+
+ VERIFY3U(0, ==, dmu_buf_hold(bpo->bpo_os, subsubobjs,
+ 0, FTAG, &subdb, 0));
+ /*
+ * Make sure that we are not asking dmu_write()
+ * to write more data than we have in our buffer.
+ */
+ VERIFY3U(subdb->db_size, >=,
+ numsubsub * sizeof (subobj));
+ dmu_write(bpo->bpo_os, bpo->bpo_phys->bpo_subobjs,
+ bpo->bpo_phys->bpo_num_subobjs * sizeof (subobj),
+ numsubsub * sizeof (subobj), subdb->db_data, tx);
+ dmu_buf_rele(subdb, FTAG);
+ bpo->bpo_phys->bpo_num_subobjs += numsubsub;
+
+ dmu_buf_will_dirty(subbpo.bpo_dbuf, tx);
+ subbpo.bpo_phys->bpo_subobjs = 0;
+ VERIFY3U(0, ==, dmu_object_free(bpo->bpo_os,
+ subsubobjs, tx));
+ }
+ }
+ bpo->bpo_phys->bpo_bytes += used;
+ bpo->bpo_phys->bpo_comp += comp;
+ bpo->bpo_phys->bpo_uncomp += uncomp;
+ mutex_exit(&bpo->bpo_lock);
+
+ bpobj_close(&subbpo);
+}
+
+void
+bpobj_enqueue(bpobj_t *bpo, const blkptr_t *bp, dmu_tx_t *tx)
+{
+ blkptr_t stored_bp = *bp;
+ uint64_t offset;
+ int blkoff;
+ blkptr_t *bparray;
+
+ ASSERT(bpobj_is_open(bpo));
+ ASSERT(!BP_IS_HOLE(bp));
+ ASSERT(bpo->bpo_object != dmu_objset_pool(bpo->bpo_os)->dp_empty_bpobj);
+
+ if (BP_IS_EMBEDDED(bp)) {
+ /*
+ * The bpobj will compress better without the payload.
+ *
+ * Note that we store EMBEDDED bp's because they have an
+ * uncompressed size, which must be accounted for. An
+ * alternative would be to add their size to bpo_uncomp
+ * without storing the bp, but that would create additional
+ * complications: bpo_uncomp would be inconsistent with the
+ * set of BP's stored, and bpobj_iterate() wouldn't visit
+ * all the space accounted for in the bpobj.
+ */
+ bzero(&stored_bp, sizeof (stored_bp));
+ stored_bp.blk_prop = bp->blk_prop;
+ stored_bp.blk_birth = bp->blk_birth;
+ } else if (!BP_GET_DEDUP(bp)) {
+ /* The bpobj will compress better without the checksum */
+ bzero(&stored_bp.blk_cksum, sizeof (stored_bp.blk_cksum));
+ }
+
+ /* We never need the fill count. */
+ stored_bp.blk_fill = 0;
+
+ mutex_enter(&bpo->bpo_lock);
+
+ offset = bpo->bpo_phys->bpo_num_blkptrs * sizeof (stored_bp);
+ blkoff = P2PHASE(bpo->bpo_phys->bpo_num_blkptrs, bpo->bpo_epb);
+
+ if (bpo->bpo_cached_dbuf == NULL ||
+ offset < bpo->bpo_cached_dbuf->db_offset ||
+ offset >= bpo->bpo_cached_dbuf->db_offset +
+ bpo->bpo_cached_dbuf->db_size) {
+ if (bpo->bpo_cached_dbuf)
+ dmu_buf_rele(bpo->bpo_cached_dbuf, bpo);
+ VERIFY3U(0, ==, dmu_buf_hold(bpo->bpo_os, bpo->bpo_object,
+ offset, bpo, &bpo->bpo_cached_dbuf, 0));
+ }
+
+ dmu_buf_will_dirty(bpo->bpo_cached_dbuf, tx);
+ bparray = bpo->bpo_cached_dbuf->db_data;
+ bparray[blkoff] = stored_bp;
+
+ dmu_buf_will_dirty(bpo->bpo_dbuf, tx);
+ bpo->bpo_phys->bpo_num_blkptrs++;
+ bpo->bpo_phys->bpo_bytes +=
+ bp_get_dsize_sync(dmu_objset_spa(bpo->bpo_os), bp);
+ if (bpo->bpo_havecomp) {
+ bpo->bpo_phys->bpo_comp += BP_GET_PSIZE(bp);
+ bpo->bpo_phys->bpo_uncomp += BP_GET_UCSIZE(bp);
+ }
+ mutex_exit(&bpo->bpo_lock);
+}
+
+struct space_range_arg {
+ spa_t *spa;
+ uint64_t mintxg;
+ uint64_t maxtxg;
+ uint64_t used;
+ uint64_t comp;
+ uint64_t uncomp;
+};
+
+/* ARGSUSED */
+static int
+space_range_cb(void *arg, const blkptr_t *bp, dmu_tx_t *tx)
+{
+ struct space_range_arg *sra = arg;
+
+ if (bp->blk_birth > sra->mintxg && bp->blk_birth <= sra->maxtxg) {
+ if (dsl_pool_sync_context(spa_get_dsl(sra->spa)))
+ sra->used += bp_get_dsize_sync(sra->spa, bp);
+ else
+ sra->used += bp_get_dsize(sra->spa, bp);
+ sra->comp += BP_GET_PSIZE(bp);
+ sra->uncomp += BP_GET_UCSIZE(bp);
+ }
+ return (0);
+}
+
+int
+bpobj_space(bpobj_t *bpo, uint64_t *usedp, uint64_t *compp, uint64_t *uncompp)
+{
+ ASSERT(bpobj_is_open(bpo));
+ mutex_enter(&bpo->bpo_lock);
+
+ *usedp = bpo->bpo_phys->bpo_bytes;
+ if (bpo->bpo_havecomp) {
+ *compp = bpo->bpo_phys->bpo_comp;
+ *uncompp = bpo->bpo_phys->bpo_uncomp;
+ mutex_exit(&bpo->bpo_lock);
+ return (0);
+ } else {
+ mutex_exit(&bpo->bpo_lock);
+ return (bpobj_space_range(bpo, 0, UINT64_MAX,
+ usedp, compp, uncompp));
+ }
+}
+
+/*
+ * Return the amount of space in the bpobj which is:
+ * mintxg < blk_birth <= maxtxg
+ */
+int
+bpobj_space_range(bpobj_t *bpo, uint64_t mintxg, uint64_t maxtxg,
+ uint64_t *usedp, uint64_t *compp, uint64_t *uncompp)
+{
+ struct space_range_arg sra = { 0 };
+ int err;
+
+ ASSERT(bpobj_is_open(bpo));
+
+ /*
+ * As an optimization, if they want the whole txg range, just
+ * get bpo_bytes rather than iterating over the bps.
+ */
+ if (mintxg < TXG_INITIAL && maxtxg == UINT64_MAX && bpo->bpo_havecomp)
+ return (bpobj_space(bpo, usedp, compp, uncompp));
+
+ sra.spa = dmu_objset_spa(bpo->bpo_os);
+ sra.mintxg = mintxg;
+ sra.maxtxg = maxtxg;
+
+ err = bpobj_iterate_nofree(bpo, space_range_cb, &sra, NULL);
+ *usedp = sra.used;
+ *compp = sra.comp;
+ *uncompp = sra.uncomp;
+ return (err);
+}
diff --git a/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/bptree.c b/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/bptree.c
new file mode 100644
index 000000000000..c74d07236c1b
--- /dev/null
+++ b/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/bptree.c
@@ -0,0 +1,301 @@
+/*
+ * CDDL HEADER START
+ *
+ * The contents of this file are subject to the terms of the
+ * Common Development and Distribution License (the "License").
+ * You may not use this file except in compliance with the License.
+ *
+ * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
+ * or http://www.opensolaris.org/os/licensing.
+ * See the License for the specific language governing permissions
+ * and limitations under the License.
+ *
+ * When distributing Covered Code, include this CDDL HEADER in each
+ * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
+ * If applicable, add the following below this CDDL HEADER, with the
+ * fields enclosed by brackets "[]" replaced with your own identifying
+ * information: Portions Copyright [yyyy] [name of copyright owner]
+ *
+ * CDDL HEADER END
+ */
+
+/*
+ * Copyright (c) 2011, 2015 by Delphix. All rights reserved.
+ * Copyright (c) 2014 Integros [integros.com]
+ */
+
+#include <sys/arc.h>
+#include <sys/bptree.h>
+#include <sys/dmu.h>
+#include <sys/dmu_objset.h>
+#include <sys/dmu_tx.h>
+#include <sys/dmu_traverse.h>
+#include <sys/dsl_dataset.h>
+#include <sys/dsl_dir.h>
+#include <sys/dsl_pool.h>
+#include <sys/dnode.h>
+#include <sys/refcount.h>
+#include <sys/spa.h>
+
+/*
+ * A bptree is a queue of root block pointers from destroyed datasets. When a
+ * dataset is destroyed its root block pointer is put on the end of the pool's
+ * bptree queue so the dataset's blocks can be freed asynchronously by
+ * dsl_scan_sync. This allows the delete operation to finish without traversing
+ * all the dataset's blocks.
+ *
+ * Note that while bt_begin and bt_end are only ever incremented in this code,
+ * they are effectively reset to 0 every time the entire bptree is freed because
+ * the bptree's object is destroyed and re-created.
+ */
+
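+/*
+ * Illustrative sketch (not part of the original source): bt_begin and
+ * bt_end behave like a persistent FIFO cursor. Entries are appended
+ * at bt_end and consumed from bt_begin; neither index ever decreases,
+ * and the queue is empty exactly when the two indexes are equal.
+ */
+#include <assert.h>
+#include <stdint.h>
+
+struct cursor_queue {
+	uint64_t begin;		/* next entry to consume */
+	uint64_t end;		/* one past the last entry appended */
+};
+
+int
+main(void)
+{
+	struct cursor_queue q = { 0, 0 };
+
+	q.end++;			/* append */
+	q.end++;			/* append */
+	assert(q.end - q.begin == 2);	/* two entries pending */
+	q.begin++;			/* consume */
+	q.begin++;			/* consume */
+	assert(q.begin == q.end);	/* empty again */
+	return (0);
+}
+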
+typedef struct bptree_args {
+ bptree_phys_t *ba_phys; /* data in bonus buffer, dirtied if freeing */
+ boolean_t ba_free; /* true if freeing during traversal */
+
+ bptree_itor_t *ba_func; /* function to call for each blockpointer */
+ void *ba_arg; /* caller supplied argument to ba_func */
+ dmu_tx_t *ba_tx; /* caller supplied tx, NULL if not freeing */
+} bptree_args_t;
+
+uint64_t
+bptree_alloc(objset_t *os, dmu_tx_t *tx)
+{
+ uint64_t obj;
+ dmu_buf_t *db;
+ bptree_phys_t *bt;
+
+ obj = dmu_object_alloc(os, DMU_OTN_UINT64_METADATA,
+ SPA_OLD_MAXBLOCKSIZE, DMU_OTN_UINT64_METADATA,
+ sizeof (bptree_phys_t), tx);
+
+ /*
+ * Bonus buffer contents are already initialized to 0, but for
+ * readability we make it explicit.
+ */
+ VERIFY3U(0, ==, dmu_bonus_hold(os, obj, FTAG, &db));
+ dmu_buf_will_dirty(db, tx);
+ bt = db->db_data;
+ bt->bt_begin = 0;
+ bt->bt_end = 0;
+ bt->bt_bytes = 0;
+ bt->bt_comp = 0;
+ bt->bt_uncomp = 0;
+ dmu_buf_rele(db, FTAG);
+
+ return (obj);
+}
+
+int
+bptree_free(objset_t *os, uint64_t obj, dmu_tx_t *tx)
+{
+ dmu_buf_t *db;
+ bptree_phys_t *bt;
+
+ VERIFY3U(0, ==, dmu_bonus_hold(os, obj, FTAG, &db));
+ bt = db->db_data;
+ ASSERT3U(bt->bt_begin, ==, bt->bt_end);
+ ASSERT0(bt->bt_bytes);
+ ASSERT0(bt->bt_comp);
+ ASSERT0(bt->bt_uncomp);
+ dmu_buf_rele(db, FTAG);
+
+ return (dmu_object_free(os, obj, tx));
+}
+
+boolean_t
+bptree_is_empty(objset_t *os, uint64_t obj)
+{
+ dmu_buf_t *db;
+ bptree_phys_t *bt;
+ boolean_t rv;
+
+ VERIFY0(dmu_bonus_hold(os, obj, FTAG, &db));
+ bt = db->db_data;
+ rv = (bt->bt_begin == bt->bt_end);
+ dmu_buf_rele(db, FTAG);
+ return (rv);
+}
+
+void
+bptree_add(objset_t *os, uint64_t obj, blkptr_t *bp, uint64_t birth_txg,
+ uint64_t bytes, uint64_t comp, uint64_t uncomp, dmu_tx_t *tx)
+{
+ dmu_buf_t *db;
+ bptree_phys_t *bt;
+ bptree_entry_phys_t bte = { 0 };
+
+ /*
+ * bptree objects are in the pool mos, therefore they can only be
+ * modified in syncing context. Furthermore, this is only modified
+ * by the sync thread, so no locking is necessary.
+ */
+ ASSERT(dmu_tx_is_syncing(tx));
+
+ VERIFY3U(0, ==, dmu_bonus_hold(os, obj, FTAG, &db));
+ bt = db->db_data;
+
+ bte.be_birth_txg = birth_txg;
+ bte.be_bp = *bp;
+ dmu_write(os, obj, bt->bt_end * sizeof (bte), sizeof (bte), &bte, tx);
+
+ dmu_buf_will_dirty(db, tx);
+ bt->bt_end++;
+ bt->bt_bytes += bytes;
+ bt->bt_comp += comp;
+ bt->bt_uncomp += uncomp;
+ dmu_buf_rele(db, FTAG);
+}
+
+/* ARGSUSED */
+static int
+bptree_visit_cb(spa_t *spa, zilog_t *zilog, const blkptr_t *bp,
+ const zbookmark_phys_t *zb, const dnode_phys_t *dnp, void *arg)
+{
+ int err;
+ struct bptree_args *ba = arg;
+
+ if (bp == NULL || BP_IS_HOLE(bp))
+ return (0);
+
+ err = ba->ba_func(ba->ba_arg, bp, ba->ba_tx);
+ if (err == 0 && ba->ba_free) {
+ ba->ba_phys->bt_bytes -= bp_get_dsize_sync(spa, bp);
+ ba->ba_phys->bt_comp -= BP_GET_PSIZE(bp);
+ ba->ba_phys->bt_uncomp -= BP_GET_UCSIZE(bp);
+ }
+ return (err);
+}
+
+/*
+ * If "free" is set:
+ * - It is assumed that "func" will be freeing the block pointers.
+ * - If "func" returns nonzero, the bookmark will be remembered and
+ * iteration will be restarted from this point on next invocation.
+ * - If an i/o error is encountered (e.g. "func" returns EIO or ECKSUM),
+ * bptree_iterate will remember the bookmark, continue traversing
+ * any additional entries, and return 0.
+ *
+ * If "free" is not set, traversal will stop and return an error if
+ * an i/o error is encountered.
+ *
+ * In either case, if zfs_free_leak_on_eio is set, i/o errors will be
+ * ignored and traversal will continue (i.e. TRAVERSE_HARD will be passed to
+ * traverse_dataset_destroyed()).
+ */
+int
+bptree_iterate(objset_t *os, uint64_t obj, boolean_t free, bptree_itor_t func,
+ void *arg, dmu_tx_t *tx)
+{
+ boolean_t ioerr = B_FALSE;
+ int err;
+ uint64_t i;
+ dmu_buf_t *db;
+ struct bptree_args ba;
+
+ ASSERT(!free || dmu_tx_is_syncing(tx));
+
+ err = dmu_bonus_hold(os, obj, FTAG, &db);
+ if (err != 0)
+ return (err);
+
+ if (free)
+ dmu_buf_will_dirty(db, tx);
+
+ ba.ba_phys = db->db_data;
+ ba.ba_free = free;
+ ba.ba_func = func;
+ ba.ba_arg = arg;
+ ba.ba_tx = tx;
+
+ err = 0;
+ for (i = ba.ba_phys->bt_begin; i < ba.ba_phys->bt_end; i++) {
+ bptree_entry_phys_t bte;
+ int flags = TRAVERSE_PREFETCH_METADATA | TRAVERSE_POST;
+
+ err = dmu_read(os, obj, i * sizeof (bte), sizeof (bte),
+ &bte, DMU_READ_NO_PREFETCH);
+ if (err != 0)
+ break;
+
+ if (zfs_free_leak_on_eio)
+ flags |= TRAVERSE_HARD;
+ zfs_dbgmsg("bptree index %lld: traversing from min_txg=%lld "
+ "bookmark %lld/%lld/%lld/%lld",
+ (longlong_t)i,
+ (longlong_t)bte.be_birth_txg,
+ (longlong_t)bte.be_zb.zb_objset,
+ (longlong_t)bte.be_zb.zb_object,
+ (longlong_t)bte.be_zb.zb_level,
+ (longlong_t)bte.be_zb.zb_blkid);
+ err = traverse_dataset_destroyed(os->os_spa, &bte.be_bp,
+ bte.be_birth_txg, &bte.be_zb, flags,
+ bptree_visit_cb, &ba);
+ if (free) {
+ /*
+ * The callback has freed the visited block pointers.
+ * Record our traversal progress on disk, either by
+ * updating this record's bookmark, or by logically
+ * removing this record by advancing bt_begin.
+ */
+ if (err != 0) {
+ /* save bookmark for future resume */
+ ASSERT3U(bte.be_zb.zb_objset, ==,
+ ZB_DESTROYED_OBJSET);
+ ASSERT0(bte.be_zb.zb_level);
+ dmu_write(os, obj, i * sizeof (bte),
+ sizeof (bte), &bte, tx);
+ if (err == EIO || err == ECKSUM ||
+ err == ENXIO) {
+ /*
+ * Skip the rest of this tree and
+ * continue on to the next entry.
+ */
+ err = 0;
+ ioerr = B_TRUE;
+ } else {
+ break;
+ }
+ } else if (ioerr) {
+ /*
+ * This entry is finished, but there were
+ * i/o errors on previous entries, so we
+ * can't adjust bt_begin. Set this entry's
+ * be_birth_txg such that it will be
+ * treated as a no-op in future traversals.
+ */
+ bte.be_birth_txg = UINT64_MAX;
+ dmu_write(os, obj, i * sizeof (bte),
+ sizeof (bte), &bte, tx);
+ }
+
+ if (!ioerr) {
+ ba.ba_phys->bt_begin++;
+ (void) dmu_free_range(os, obj,
+ i * sizeof (bte), sizeof (bte), tx);
+ }
+ } else if (err != 0) {
+ break;
+ }
+ }
+
+ ASSERT(!free || err != 0 || ioerr ||
+ ba.ba_phys->bt_begin == ba.ba_phys->bt_end);
+
+ /* if all blocks are free there should be no used space */
+ if (ba.ba_phys->bt_begin == ba.ba_phys->bt_end) {
+ if (zfs_free_leak_on_eio) {
+ ba.ba_phys->bt_bytes = 0;
+ ba.ba_phys->bt_comp = 0;
+ ba.ba_phys->bt_uncomp = 0;
+ }
+
+ ASSERT0(ba.ba_phys->bt_bytes);
+ ASSERT0(ba.ba_phys->bt_comp);
+ ASSERT0(ba.ba_phys->bt_uncomp);
+ }
+
+ dmu_buf_rele(db, FTAG);
+
+ return (err);
+}
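
For orientation, a minimal sketch of how a caller might drive this interface. The callback and wrapper below are hypothetical; the in-tree consumer is the async-destroy path in dsl_scan.c, which iterates with free == B_TRUE and a callback that actually frees each block. The bptree_itor_t signature is inferred from the bptree_visit_cb call above.

/* Hypothetical callback: tally visited blocks instead of freeing them. */
static int
count_block_cb(void *arg, const blkptr_t *bp, dmu_tx_t *tx)
{
	uint64_t *countp = arg;

	(*countp)++;
	return (0);
}

/* Hypothetical walk in syncing context. */
static void
example_bptree_walk(objset_t *os, uint64_t obj, dmu_tx_t *tx)
{
	uint64_t count = 0;

	/* free == B_FALSE: an i/o error stops the walk and is returned */
	VERIFY0(bptree_iterate(os, obj, B_FALSE, count_block_cb,
	    &count, tx));
	if (bptree_is_empty(os, obj))
		VERIFY0(bptree_free(os, obj, tx));
}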
diff --git a/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/bqueue.c b/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/bqueue.c
new file mode 100644
index 000000000000..1ddc697b5424
--- /dev/null
+++ b/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/bqueue.c
@@ -0,0 +1,111 @@
+/*
+ * CDDL HEADER START
+ *
+ * This file and its contents are supplied under the terms of the
+ * Common Development and Distribution License ("CDDL"), version 1.0.
+ * You may only use this file in accordance with the terms of version
+ * 1.0 of the CDDL.
+ *
+ * A full copy of the text of the CDDL should have accompanied this
+ * source. A copy of the CDDL is also available via the Internet at
+ * http://www.illumos.org/license/CDDL.
+ *
+ * CDDL HEADER END
+ */
+/*
+ * Copyright (c) 2014 by Delphix. All rights reserved.
+ */
+
+#include <sys/bqueue.h>
+#include <sys/zfs_context.h>
+
+static inline bqueue_node_t *
+obj2node(bqueue_t *q, void *data)
+{
+ return ((bqueue_node_t *)((char *)data + q->bq_node_offset));
+}
+
+/*
+ * Initialize a blocking queue. The maximum capacity of the queue is set to
+ * size. Types that want to be stored in a bqueue must contain a
+ * bqueue_node_t, and node_offset should give its offset from the start of
+ * the struct. Return 0 on success, or -1 on failure.
+ */
+int
+bqueue_init(bqueue_t *q, uint64_t size, size_t node_offset)
+{
+ list_create(&q->bq_list, node_offset + sizeof (bqueue_node_t),
+ node_offset + offsetof(bqueue_node_t, bqn_node));
+ cv_init(&q->bq_add_cv, NULL, CV_DEFAULT, NULL);
+ cv_init(&q->bq_pop_cv, NULL, CV_DEFAULT, NULL);
+ mutex_init(&q->bq_lock, NULL, MUTEX_DEFAULT, NULL);
+ q->bq_node_offset = node_offset;
+ q->bq_size = 0;
+ q->bq_maxsize = size;
+ return (0);
+}
+
+/*
+ * Destroy a blocking queue. This function asserts that there are no
+ * elements in the queue, and no one is blocked on the condition
+ * variables.
+ */
+void
+bqueue_destroy(bqueue_t *q)
+{
+ ASSERT0(q->bq_size);
+ cv_destroy(&q->bq_add_cv);
+ cv_destroy(&q->bq_pop_cv);
+ mutex_destroy(&q->bq_lock);
+ list_destroy(&q->bq_list);
+}
+
+/*
+ * Add data to q, consuming item_size units of capacity. If there is
+ * insufficient capacity to consume item_size units, block until capacity
+ * exists. Asserts item_size is > 0.
+ */
+void
+bqueue_enqueue(bqueue_t *q, void *data, uint64_t item_size)
+{
+ ASSERT3U(item_size, >, 0);
+ ASSERT3U(item_size, <, q->bq_maxsize);
+ mutex_enter(&q->bq_lock);
+ obj2node(q, data)->bqn_size = item_size;
+ while (q->bq_size + item_size > q->bq_maxsize) {
+ cv_wait(&q->bq_add_cv, &q->bq_lock);
+ }
+ q->bq_size += item_size;
+ list_insert_tail(&q->bq_list, data);
+ cv_signal(&q->bq_pop_cv);
+ mutex_exit(&q->bq_lock);
+}
+
+/*
+ * Take the first element off of q. If there are no elements on the queue, wait
+ * until one is put there. Return the removed element.
+ */
+void *
+bqueue_dequeue(bqueue_t *q)
+{
+ void *ret;
+ uint64_t item_size;
+ mutex_enter(&q->bq_lock);
+ while (q->bq_size == 0) {
+ cv_wait(&q->bq_pop_cv, &q->bq_lock);
+ }
+ ret = list_remove_head(&q->bq_list);
+ item_size = obj2node(q, ret)->bqn_size;
+ q->bq_size -= item_size;
+ mutex_exit(&q->bq_lock);
+ cv_signal(&q->bq_add_cv);
+ return (ret);
+}
+
+/*
+ * Returns true if the space used is 0.
+ */
+boolean_t
+bqueue_empty(bqueue_t *q)
+{
+ return (q->bq_size == 0);
+}
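
The embedding contract is easiest to see in a sketch. The type and single-threaded walk-through below are hypothetical (in practice the producer and consumer run in separate threads, e.g. in the zfs send/receive pipelines):

/* A queueable type embeds its own bqueue_node_t linkage. */
typedef struct work_item {
	uint64_t	wi_payload;
	bqueue_node_t	wi_node;
} work_item_t;

static void
example_pipeline(work_item_t *item)
{
	bqueue_t q;

	/* capacity is a weight budget, not an element count */
	VERIFY0(bqueue_init(&q, 16 * 1024 * 1024,
	    offsetof(work_item_t, wi_node)));

	/* producer side: blocks while the weight budget is exhausted */
	bqueue_enqueue(&q, item, sizeof (work_item_t));

	/* consumer side: blocks until an item is available */
	work_item_t *got = bqueue_dequeue(&q);
	ASSERT3P(got, ==, item);

	ASSERT(bqueue_empty(&q));
	bqueue_destroy(&q);
}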
diff --git a/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/cityhash.c b/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/cityhash.c
new file mode 100644
index 000000000000..2b62edad0342
--- /dev/null
+++ b/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/cityhash.c
@@ -0,0 +1,63 @@
+// Copyright (c) 2011 Google, Inc.
+//
+// Permission is hereby granted, free of charge, to any person obtaining a copy
+// of this software and associated documentation files (the "Software"), to deal
+// in the Software without restriction, including without limitation the rights
+// to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+// copies of the Software, and to permit persons to whom the Software is
+// furnished to do so, subject to the following conditions:
+//
+// The above copyright notice and this permission notice shall be included in
+// all copies or substantial portions of the Software.
+//
+// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+// IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+// FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+// AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+// LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+// OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
+// THE SOFTWARE.
+
+/*
+ * Copyright (c) 2017 by Delphix. All rights reserved.
+ */
+
+#include <sys/cityhash.h>
+
+#define HASH_K1 0xb492b66fbe98f273ULL
+#define HASH_K2 0x9ae16a3b2f90404fULL
+
+/*
+ * Bitwise right rotate. Normally this will compile to a single
+ * instruction.
+ */
+static inline uint64_t
+rotate(uint64_t val, int shift)
+{
+ // Avoid shifting by 64: doing so yields an undefined result.
+ return (shift == 0 ? val : (val >> shift) | (val << (64 - shift)));
+}
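+
+/*
+ * E.g. (illustrative): rotate(0x1ULL, 1) == 0x8000000000000000ULL,
+ * while rotate(x, 0) must return x directly, because evaluating
+ * "val << (64 - 0)" would shift by the full word width.
+ */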
+
+static inline uint64_t
+cityhash_helper(uint64_t u, uint64_t v, uint64_t mul)
+{
+ uint64_t a = (u ^ v) * mul;
+ a ^= (a >> 47);
+ uint64_t b = (v ^ a) * mul;
+ b ^= (b >> 47);
+ b *= mul;
+ return (b);
+}
+
+uint64_t
+cityhash4(uint64_t w1, uint64_t w2, uint64_t w3, uint64_t w4)
+{
+ uint64_t mul = HASH_K2 + 64;
+ uint64_t a = w1 * HASH_K1;
+ uint64_t b = w2;
+ uint64_t c = w4 * mul;
+ uint64_t d = w3 * HASH_K2;
+ return (cityhash_helper(rotate(a + b, 43) + rotate(c, 30) + d,
+ a + rotate(b + HASH_K2, 18) + c, mul));
+}
diff --git a/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/dbuf.c b/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/dbuf.c
new file mode 100644
index 000000000000..1974ff2197c2
--- /dev/null
+++ b/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/dbuf.c
@@ -0,0 +1,4248 @@
+/*
+ * CDDL HEADER START
+ *
+ * The contents of this file are subject to the terms of the
+ * Common Development and Distribution License (the "License").
+ * You may not use this file except in compliance with the License.
+ *
+ * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
+ * or http://www.opensolaris.org/os/licensing.
+ * See the License for the specific language governing permissions
+ * and limitations under the License.
+ *
+ * When distributing Covered Code, include this CDDL HEADER in each
+ * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
+ * If applicable, add the following below this CDDL HEADER, with the
+ * fields enclosed by brackets "[]" replaced with your own identifying
+ * information: Portions Copyright [yyyy] [name of copyright owner]
+ *
+ * CDDL HEADER END
+ */
+/*
+ * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved.
+ * Copyright 2011 Nexenta Systems, Inc. All rights reserved.
+ * Copyright (c) 2012, 2018 by Delphix. All rights reserved.
+ * Copyright (c) 2013 by Saso Kiselkov. All rights reserved.
+ * Copyright (c) 2013, Joyent, Inc. All rights reserved.
+ * Copyright (c) 2014 Spectra Logic Corporation, All rights reserved.
+ * Copyright (c) 2014 Integros [integros.com]
+ */
+
+#include <sys/zfs_context.h>
+#include <sys/dmu.h>
+#include <sys/dmu_send.h>
+#include <sys/dmu_impl.h>
+#include <sys/dbuf.h>
+#include <sys/dmu_objset.h>
+#include <sys/dsl_dataset.h>
+#include <sys/dsl_dir.h>
+#include <sys/dmu_tx.h>
+#include <sys/spa.h>
+#include <sys/zio.h>
+#include <sys/dmu_zfetch.h>
+#include <sys/sa.h>
+#include <sys/sa_impl.h>
+#include <sys/zfeature.h>
+#include <sys/blkptr.h>
+#include <sys/range_tree.h>
+#include <sys/callb.h>
+#include <sys/abd.h>
+#include <sys/vdev.h>
+#include <sys/cityhash.h>
+#include <sys/spa_impl.h>
+
+kstat_t *dbuf_ksp;
+
+typedef struct dbuf_stats {
+ /*
+ * Various statistics about the size of the dbuf cache.
+ */
+ kstat_named_t cache_count;
+ kstat_named_t cache_size_bytes;
+ kstat_named_t cache_size_bytes_max;
+ /*
+ * Statistics regarding the bounds on the dbuf cache size.
+ */
+ kstat_named_t cache_target_bytes;
+ kstat_named_t cache_lowater_bytes;
+ kstat_named_t cache_hiwater_bytes;
+ /*
+ * Total number of dbuf cache evictions that have occurred.
+ */
+ kstat_named_t cache_total_evicts;
+ /*
+ * The distribution of dbuf levels in the dbuf cache and
+ * the total size of all dbufs at each level.
+ */
+ kstat_named_t cache_levels[DN_MAX_LEVELS];
+ kstat_named_t cache_levels_bytes[DN_MAX_LEVELS];
+ /*
+ * Statistics about the dbuf hash table.
+ */
+ kstat_named_t hash_hits;
+ kstat_named_t hash_misses;
+ kstat_named_t hash_collisions;
+ kstat_named_t hash_elements;
+ kstat_named_t hash_elements_max;
+ /*
+ * Number of sublists containing more than one dbuf in the dbuf
+ * hash table. Keep track of the longest hash chain.
+ */
+ kstat_named_t hash_chains;
+ kstat_named_t hash_chain_max;
+ /*
+ * Number of times a dbuf_create() discovers that a dbuf was
+ * already created and in the dbuf hash table.
+ */
+ kstat_named_t hash_insert_race;
+ /*
+ * Statistics about the size of the metadata dbuf cache.
+ */
+ kstat_named_t metadata_cache_count;
+ kstat_named_t metadata_cache_size_bytes;
+ kstat_named_t metadata_cache_size_bytes_max;
+ /*
+ * For diagnostic purposes, this is incremented whenever we can't add
+ * something to the metadata cache because it's full, and instead put
+ * the data in the regular dbuf cache.
+ */
+ kstat_named_t metadata_cache_overflow;
+} dbuf_stats_t;
+
+dbuf_stats_t dbuf_stats = {
+ { "cache_count", KSTAT_DATA_UINT64 },
+ { "cache_size_bytes", KSTAT_DATA_UINT64 },
+ { "cache_size_bytes_max", KSTAT_DATA_UINT64 },
+ { "cache_target_bytes", KSTAT_DATA_UINT64 },
+ { "cache_lowater_bytes", KSTAT_DATA_UINT64 },
+ { "cache_hiwater_bytes", KSTAT_DATA_UINT64 },
+ { "cache_total_evicts", KSTAT_DATA_UINT64 },
+ { { "cache_levels_N", KSTAT_DATA_UINT64 } },
+ { { "cache_levels_bytes_N", KSTAT_DATA_UINT64 } },
+ { "hash_hits", KSTAT_DATA_UINT64 },
+ { "hash_misses", KSTAT_DATA_UINT64 },
+ { "hash_collisions", KSTAT_DATA_UINT64 },
+ { "hash_elements", KSTAT_DATA_UINT64 },
+ { "hash_elements_max", KSTAT_DATA_UINT64 },
+ { "hash_chains", KSTAT_DATA_UINT64 },
+ { "hash_chain_max", KSTAT_DATA_UINT64 },
+ { "hash_insert_race", KSTAT_DATA_UINT64 },
+ { "metadata_cache_count", KSTAT_DATA_UINT64 },
+ { "metadata_cache_size_bytes", KSTAT_DATA_UINT64 },
+ { "metadata_cache_size_bytes_max", KSTAT_DATA_UINT64 },
+ { "metadata_cache_overflow", KSTAT_DATA_UINT64 }
+};
+
+#define DBUF_STAT_INCR(stat, val) \
+ atomic_add_64(&dbuf_stats.stat.value.ui64, (val));
+#define DBUF_STAT_DECR(stat, val) \
+ DBUF_STAT_INCR(stat, -(val));
+#define DBUF_STAT_BUMP(stat) \
+ DBUF_STAT_INCR(stat, 1);
+#define DBUF_STAT_BUMPDOWN(stat) \
+ DBUF_STAT_INCR(stat, -1);
+#define DBUF_STAT_MAX(stat, v) { \
+ uint64_t _m; \
+ while ((v) > (_m = dbuf_stats.stat.value.ui64) && \
+ (_m != atomic_cas_64(&dbuf_stats.stat.value.ui64, _m, (v))))\
+ continue; \
+}
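+
+/*
+ * Note (illustrative): DBUF_STAT_MAX is a lock-free "store maximum":
+ * it re-reads the current statistic and retries the compare-and-swap
+ * until either the stored value is already >= v or the CAS lands.
+ * For example, DBUF_STAT_MAX(hash_chain_max, i) behaves like an atomic
+ * hash_chain_max = MAX(hash_chain_max, i) and never moves the
+ * statistic backwards under concurrent updates.
+ */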
+
+struct dbuf_hold_impl_data {
+ /* Function arguments */
+ dnode_t *dh_dn;
+ uint8_t dh_level;
+ uint64_t dh_blkid;
+ boolean_t dh_fail_sparse;
+ boolean_t dh_fail_uncached;
+ void *dh_tag;
+ dmu_buf_impl_t **dh_dbp;
+ /* Local variables */
+ dmu_buf_impl_t *dh_db;
+ dmu_buf_impl_t *dh_parent;
+ blkptr_t *dh_bp;
+ int dh_err;
+ dbuf_dirty_record_t *dh_dr;
+ int dh_depth;
+};
+
+static void __dbuf_hold_impl_init(struct dbuf_hold_impl_data *dh,
+ dnode_t *dn, uint8_t level, uint64_t blkid, boolean_t fail_sparse,
+ boolean_t fail_uncached,
+ void *tag, dmu_buf_impl_t **dbp, int depth);
+static int __dbuf_hold_impl(struct dbuf_hold_impl_data *dh);
+
+static boolean_t dbuf_undirty(dmu_buf_impl_t *db, dmu_tx_t *tx);
+static void dbuf_write(dbuf_dirty_record_t *dr, arc_buf_t *data, dmu_tx_t *tx);
+
+/*
+ * Global data structures and functions for the dbuf cache.
+ */
+static kmem_cache_t *dbuf_kmem_cache;
+static taskq_t *dbu_evict_taskq;
+
+static kthread_t *dbuf_cache_evict_thread;
+static kmutex_t dbuf_evict_lock;
+static kcondvar_t dbuf_evict_cv;
+static boolean_t dbuf_evict_thread_exit;
+
+/*
+ * There are two dbuf caches; each dbuf can only be in one of them at a time.
+ *
+ * 1. Cache of metadata dbufs, to help make read-heavy administrative commands
+ * from /sbin/zfs run faster. The "metadata cache" specifically stores dbufs
+ * that represent the metadata that describes filesystems/snapshots/
+ * bookmarks/properties/etc. We only evict from this cache when we export a
+ * pool, to short-circuit as much I/O as possible for all administrative
+ * commands that need the metadata. There is no eviction policy for this
+ * cache, because we try to only include types in it which would occupy a
+ * very small amount of space per object but create a large impact on the
+ * performance of these commands. Instead, after it reaches a maximum size
+ * (which should only happen on very small memory systems with a very large
+ * number of filesystem objects), we stop taking new dbufs into the
+ * metadata cache, instead putting them in the normal dbuf cache.
+ *
+ * 2. LRU cache of dbufs. The dbuf cache maintains a list of dbufs that
+ * are not currently held but have been recently released. These dbufs
+ * are not eligible for arc eviction until they are aged out of the cache.
+ * Dbufs that are aged out of the cache will be immediately destroyed and
+ * become eligible for arc eviction.
+ *
+ * Dbufs are added to these caches once the last hold is released. If a dbuf is
+ * later accessed and still exists in the dbuf cache, then it will be removed
+ * from the cache and later re-added to the head of the cache.
+ *
+ * If a given dbuf meets the requirements for the metadata cache, it will go
+ * there, otherwise it will be considered for the generic LRU dbuf cache. The
+ * caches and the refcounts tracking their sizes are stored in an array indexed
+ * by those caches' matching enum values (from dbuf_cached_state_t).
+ */
+typedef struct dbuf_cache {
+ multilist_t *cache;
+ zfs_refcount_t size;
+} dbuf_cache_t;
+dbuf_cache_t dbuf_caches[DB_CACHE_MAX];
+
+/* Size limits for the caches */
+uint64_t dbuf_cache_max_bytes = 0;
+uint64_t dbuf_metadata_cache_max_bytes = 0;
+/* Set the default sizes of the caches to log2 fraction of arc size */
+int dbuf_cache_shift = 5;
+int dbuf_metadata_cache_shift = 6;
+
+/*
+ * For diagnostic purposes, this is incremented whenever we can't add
+ * something to the metadata cache because it's full, and instead put
+ * the data in the regular dbuf cache.
+ */
+uint64_t dbuf_metadata_cache_overflow;
+
+/*
+ * The LRU dbuf cache uses a three-stage eviction policy:
+ * - A low water marker designates when the dbuf eviction thread
+ * should stop evicting from the dbuf cache.
+ * - When we reach the maximum size (aka mid water mark), we
+ * signal the eviction thread to run.
+ * - The high water mark indicates when the eviction thread
+ * is unable to keep up with the incoming load and eviction must
+ * happen in the context of the calling thread.
+ *
+ * The dbuf cache:
+ * (max size)
+ * low water mid water hi water
+ * +----------------------------------------+----------+----------+
+ * | | | |
+ * | | | |
+ * | | | |
+ * | | | |
+ * +----------------------------------------+----------+----------+
+ * stop signal evict
+ * evicting eviction directly
+ * thread
+ *
+ * The high and low water marks indicate the operating range for the eviction
+ * thread. The low water mark is, by default, 90% of the total size of the
+ * cache and the high water mark is at 110% (both of these percentages can be
+ * changed by setting dbuf_cache_lowater_pct and dbuf_cache_hiwater_pct,
+ * respectively). The eviction thread will try to ensure that the cache remains
+ * within this range by waking up every second and checking if the cache is
+ * above the low water mark. The thread can also be woken up by callers adding
+ * elements into the cache if the cache is larger than the mid water (i.e max
+ * cache size). Once the eviction thread is woken up and eviction is required,
+ * it will continue evicting buffers until it's able to reduce the cache size
+ * to the low water mark. If the cache size continues to grow and hits the high
+ * water mark, then callers adding elements to the cache will begin to evict
+ * directly from the cache until the cache is no longer above the high water
+ * mark.
+ */
+
+/*
+ * The percentage above and below the maximum cache size.
+ */
+uint_t dbuf_cache_hiwater_pct = 10;
+uint_t dbuf_cache_lowater_pct = 10;
+
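+/*
+ * Worked example (illustrative, with the defaults above): if
+ * dbuf_cache_target_bytes() evaluates to 1 GiB, the high water mark is
+ * 1 GiB + (1 GiB * 10) / 100 = ~1.1 GiB and the low water mark is
+ * 1 GiB - (1 GiB * 10) / 100 = ~0.9 GiB: the evict thread is signalled
+ * once the cache exceeds 1 GiB, callers evict inline above ~1.1 GiB,
+ * and eviction stops below ~0.9 GiB.
+ */
+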
+SYSCTL_DECL(_vfs_zfs);
+SYSCTL_QUAD(_vfs_zfs, OID_AUTO, dbuf_cache_max_bytes, CTLFLAG_RWTUN,
+ &dbuf_cache_max_bytes, 0, "dbuf cache size in bytes");
+SYSCTL_QUAD(_vfs_zfs, OID_AUTO, dbuf_metadata_cache_max_bytes, CTLFLAG_RWTUN,
+ &dbuf_metadata_cache_max_bytes, 0, "dbuf metadata cache size in bytes");
+SYSCTL_INT(_vfs_zfs, OID_AUTO, dbuf_cache_shift, CTLFLAG_RDTUN,
+ &dbuf_cache_shift, 0, "dbuf cache size as log2 fraction of ARC");
+SYSCTL_INT(_vfs_zfs, OID_AUTO, dbuf_metadata_cache_shift, CTLFLAG_RDTUN,
+ &dbuf_metadata_cache_shift, 0,
+ "dbuf metadata cache size as log2 fraction of ARC");
+SYSCTL_QUAD(_vfs_zfs, OID_AUTO, dbuf_metadata_cache_overflow, CTLFLAG_RD,
+ &dbuf_metadata_cache_overflow, 0, "dbuf metadata cache overflow");
+SYSCTL_UINT(_vfs_zfs, OID_AUTO, dbuf_cache_hiwater_pct, CTLFLAG_RWTUN,
+    &dbuf_cache_hiwater_pct, 0, "max percent above the dbuf cache size");
+SYSCTL_UINT(_vfs_zfs, OID_AUTO, dbuf_cache_lowater_pct, CTLFLAG_RWTUN,
+    &dbuf_cache_lowater_pct, 0, "max percent below the dbuf cache size");
+
+/* ARGSUSED */
+static int
+dbuf_cons(void *vdb, void *unused, int kmflag)
+{
+ dmu_buf_impl_t *db = vdb;
+ bzero(db, sizeof (dmu_buf_impl_t));
+
+ mutex_init(&db->db_mtx, NULL, MUTEX_DEFAULT, NULL);
+ cv_init(&db->db_changed, NULL, CV_DEFAULT, NULL);
+ multilist_link_init(&db->db_cache_link);
+ zfs_refcount_create(&db->db_holds);
+
+ return (0);
+}
+
+/* ARGSUSED */
+static void
+dbuf_dest(void *vdb, void *unused)
+{
+ dmu_buf_impl_t *db = vdb;
+ mutex_destroy(&db->db_mtx);
+ cv_destroy(&db->db_changed);
+ ASSERT(!multilist_link_active(&db->db_cache_link));
+ zfs_refcount_destroy(&db->db_holds);
+}
+
+/*
+ * dbuf hash table routines
+ */
+static dbuf_hash_table_t dbuf_hash_table;
+
+static uint64_t dbuf_hash_count;
+
+/*
+ * We use Cityhash for this. It's fast, and has good hash properties without
+ * requiring any large static buffers.
+ */
+static uint64_t
+dbuf_hash(void *os, uint64_t obj, uint8_t lvl, uint64_t blkid)
+{
+ return (cityhash4((uintptr_t)os, obj, (uint64_t)lvl, blkid));
+}
+
+#define DBUF_EQUAL(dbuf, os, obj, level, blkid) \
+ ((dbuf)->db.db_object == (obj) && \
+ (dbuf)->db_objset == (os) && \
+ (dbuf)->db_level == (level) && \
+ (dbuf)->db_blkid == (blkid))
+
+dmu_buf_impl_t *
+dbuf_find(objset_t *os, uint64_t obj, uint8_t level, uint64_t blkid)
+{
+ dbuf_hash_table_t *h = &dbuf_hash_table;
+ uint64_t hv = dbuf_hash(os, obj, level, blkid);
+ uint64_t idx = hv & h->hash_table_mask;
+ dmu_buf_impl_t *db;
+
+ mutex_enter(DBUF_HASH_MUTEX(h, idx));
+ for (db = h->hash_table[idx]; db != NULL; db = db->db_hash_next) {
+ if (DBUF_EQUAL(db, os, obj, level, blkid)) {
+ mutex_enter(&db->db_mtx);
+ if (db->db_state != DB_EVICTING) {
+ mutex_exit(DBUF_HASH_MUTEX(h, idx));
+ return (db);
+ }
+ mutex_exit(&db->db_mtx);
+ }
+ }
+ mutex_exit(DBUF_HASH_MUTEX(h, idx));
+ return (NULL);
+}
+
+static dmu_buf_impl_t *
+dbuf_find_bonus(objset_t *os, uint64_t object)
+{
+ dnode_t *dn;
+ dmu_buf_impl_t *db = NULL;
+
+ if (dnode_hold(os, object, FTAG, &dn) == 0) {
+ rw_enter(&dn->dn_struct_rwlock, RW_READER);
+ if (dn->dn_bonus != NULL) {
+ db = dn->dn_bonus;
+ mutex_enter(&db->db_mtx);
+ }
+ rw_exit(&dn->dn_struct_rwlock);
+ dnode_rele(dn, FTAG);
+ }
+ return (db);
+}
+
+/*
+ * Insert an entry into the hash table. If there is already an element
+ * equal to db in the hash table, then the already existing element
+ * will be returned and the new element will not be inserted.
+ * Otherwise returns NULL.
+ */
+static dmu_buf_impl_t *
+dbuf_hash_insert(dmu_buf_impl_t *db)
+{
+ dbuf_hash_table_t *h = &dbuf_hash_table;
+ objset_t *os = db->db_objset;
+ uint64_t obj = db->db.db_object;
+ int level = db->db_level;
+ uint64_t blkid, hv, idx;
+ dmu_buf_impl_t *dbf;
+ uint32_t i;
+
+ blkid = db->db_blkid;
+ hv = dbuf_hash(os, obj, level, blkid);
+ idx = hv & h->hash_table_mask;
+
+ mutex_enter(DBUF_HASH_MUTEX(h, idx));
+ for (dbf = h->hash_table[idx], i = 0; dbf != NULL;
+ dbf = dbf->db_hash_next, i++) {
+ if (DBUF_EQUAL(dbf, os, obj, level, blkid)) {
+ mutex_enter(&dbf->db_mtx);
+ if (dbf->db_state != DB_EVICTING) {
+ mutex_exit(DBUF_HASH_MUTEX(h, idx));
+ return (dbf);
+ }
+ mutex_exit(&dbf->db_mtx);
+ }
+ }
+
+ if (i > 0) {
+ DBUF_STAT_BUMP(hash_collisions);
+ if (i == 1)
+ DBUF_STAT_BUMP(hash_chains);
+
+ DBUF_STAT_MAX(hash_chain_max, i);
+ }
+
+ mutex_enter(&db->db_mtx);
+ db->db_hash_next = h->hash_table[idx];
+ h->hash_table[idx] = db;
+ mutex_exit(DBUF_HASH_MUTEX(h, idx));
+ atomic_inc_64(&dbuf_hash_count);
+ DBUF_STAT_MAX(hash_elements_max, dbuf_hash_count);
+
+ return (NULL);
+}
+
+/*
+ * Remove an entry from the hash table. It must be in the EVICTING state.
+ */
+static void
+dbuf_hash_remove(dmu_buf_impl_t *db)
+{
+ dbuf_hash_table_t *h = &dbuf_hash_table;
+ uint64_t hv, idx;
+ dmu_buf_impl_t *dbf, **dbp;
+
+ hv = dbuf_hash(db->db_objset, db->db.db_object,
+ db->db_level, db->db_blkid);
+ idx = hv & h->hash_table_mask;
+
+ /*
+ * We mustn't hold db_mtx to maintain lock ordering:
+ * DBUF_HASH_MUTEX > db_mtx.
+ */
+ ASSERT(zfs_refcount_is_zero(&db->db_holds));
+ ASSERT(db->db_state == DB_EVICTING);
+ ASSERT(!MUTEX_HELD(&db->db_mtx));
+
+ mutex_enter(DBUF_HASH_MUTEX(h, idx));
+ dbp = &h->hash_table[idx];
+ while ((dbf = *dbp) != db) {
+ dbp = &dbf->db_hash_next;
+ ASSERT(dbf != NULL);
+ }
+ *dbp = db->db_hash_next;
+ db->db_hash_next = NULL;
+ if (h->hash_table[idx] &&
+ h->hash_table[idx]->db_hash_next == NULL)
+ DBUF_STAT_BUMPDOWN(hash_chains);
+ mutex_exit(DBUF_HASH_MUTEX(h, idx));
+ atomic_dec_64(&dbuf_hash_count);
+}
+
+typedef enum {
+ DBVU_EVICTING,
+ DBVU_NOT_EVICTING
+} dbvu_verify_type_t;
+
+static void
+dbuf_verify_user(dmu_buf_impl_t *db, dbvu_verify_type_t verify_type)
+{
+#ifdef ZFS_DEBUG
+ int64_t holds;
+
+ if (db->db_user == NULL)
+ return;
+
+ /* Only data blocks support the attachment of user data. */
+ ASSERT(db->db_level == 0);
+
+ /* Clients must resolve a dbuf before attaching user data. */
+ ASSERT(db->db.db_data != NULL);
+ ASSERT3U(db->db_state, ==, DB_CACHED);
+
+ holds = zfs_refcount_count(&db->db_holds);
+ if (verify_type == DBVU_EVICTING) {
+ /*
+ * Immediate eviction occurs when holds == dirtycnt.
+ * For normal eviction buffers, holds is zero on
+ * eviction, except when dbuf_fix_old_data() calls
+ * dbuf_clear_data(). However, the hold count can grow
+ * during eviction even though db_mtx is held (see
+ * dmu_bonus_hold() for an example), so we can only
+ * test the generic invariant that holds >= dirtycnt.
+ */
+ ASSERT3U(holds, >=, db->db_dirtycnt);
+ } else {
+ if (db->db_user_immediate_evict == TRUE)
+ ASSERT3U(holds, >=, db->db_dirtycnt);
+ else
+ ASSERT3U(holds, >, 0);
+ }
+#endif
+}
+
+static void
+dbuf_evict_user(dmu_buf_impl_t *db)
+{
+ dmu_buf_user_t *dbu = db->db_user;
+
+ ASSERT(MUTEX_HELD(&db->db_mtx));
+
+ if (dbu == NULL)
+ return;
+
+ dbuf_verify_user(db, DBVU_EVICTING);
+ db->db_user = NULL;
+
+#ifdef ZFS_DEBUG
+ if (dbu->dbu_clear_on_evict_dbufp != NULL)
+ *dbu->dbu_clear_on_evict_dbufp = NULL;
+#endif
+
+ /*
+ * There are two eviction callbacks - one that we call synchronously
+ * and one that we invoke via a taskq. The async one is useful for
+ * avoiding lock order reversals and limiting stack depth.
+ *
+ * Note that if we have a sync callback but no async callback,
+ * it's likely that the sync callback will free the structure
+ * containing the dbu. In that case we need to take care to not
+ * dereference dbu after calling the sync evict func.
+ */
+ boolean_t has_async = (dbu->dbu_evict_func_async != NULL);
+
+ if (dbu->dbu_evict_func_sync != NULL)
+ dbu->dbu_evict_func_sync(dbu);
+
+ if (has_async) {
+ taskq_dispatch_ent(dbu_evict_taskq, dbu->dbu_evict_func_async,
+ dbu, 0, &dbu->dbu_tqent);
+ }
+}
+
+boolean_t
+dbuf_is_metadata(dmu_buf_impl_t *db)
+{
+ if (db->db_level > 0) {
+ return (B_TRUE);
+ } else {
+ boolean_t is_metadata;
+
+ DB_DNODE_ENTER(db);
+ is_metadata = DMU_OT_IS_METADATA(DB_DNODE(db)->dn_type);
+ DB_DNODE_EXIT(db);
+
+ return (is_metadata);
+ }
+}
+
+/*
+ * This returns whether this dbuf should be stored in the metadata cache, which
+ * is based on whether it's from one of the dnode types that store data related
+ * to traversing dataset hierarchies.
+ */
+static boolean_t
+dbuf_include_in_metadata_cache(dmu_buf_impl_t *db)
+{
+ DB_DNODE_ENTER(db);
+ dmu_object_type_t type = DB_DNODE(db)->dn_type;
+ DB_DNODE_EXIT(db);
+
+ /* Check if this dbuf is one of the types we care about */
+ if (DMU_OT_IS_METADATA_CACHED(type)) {
+ /* If we hit this, then we set something up wrong in dmu_ot */
+ ASSERT(DMU_OT_IS_METADATA(type));
+
+ /*
+ * Sanity check for small-memory systems: don't allocate too
+ * much memory for this purpose.
+ */
+ if (zfs_refcount_count(
+ &dbuf_caches[DB_DBUF_METADATA_CACHE].size) >
+ dbuf_metadata_cache_max_bytes) {
+ dbuf_metadata_cache_overflow++;
+ DTRACE_PROBE1(dbuf__metadata__cache__overflow,
+ dmu_buf_impl_t *, db);
+ return (B_FALSE);
+ }
+
+ return (B_TRUE);
+ }
+
+ return (B_FALSE);
+}
+
+/*
+ * This function *must* return indices evenly distributed between all
+ * sublists of the multilist. This is needed due to how the dbuf eviction
+ * code is laid out; dbuf_evict_thread() assumes dbufs are evenly
+ * distributed between all sublists and uses this assumption when
+ * deciding which sublist to evict from and how much to evict from it.
+ */
+unsigned int
+dbuf_cache_multilist_index_func(multilist_t *ml, void *obj)
+{
+ dmu_buf_impl_t *db = obj;
+
+ /*
+	 * The assumption here is that the hash value for a given
+	 * dmu_buf_impl_t will remain constant throughout its lifetime
+	 * (i.e. its objset, object, level and blkid fields don't change).
+ * Thus, we don't need to store the dbuf's sublist index
+ * on insertion, as this index can be recalculated on removal.
+ *
+ * Also, the low order bits of the hash value are thought to be
+ * distributed evenly. Otherwise, in the case that the multilist
+	 * has a power-of-two number of sublists, each sublist's usage
+ * would not be evenly distributed.
+ */
+ return (dbuf_hash(db->db_objset, db->db.db_object,
+ db->db_level, db->db_blkid) %
+ multilist_get_num_sublists(ml));
+}
+
+static inline unsigned long
+dbuf_cache_target_bytes(void)
+{
+	return (MIN(dbuf_cache_max_bytes,
+	    arc_max_bytes() >> dbuf_cache_shift));
+}
+
+static inline uint64_t
+dbuf_cache_hiwater_bytes(void)
+{
+ uint64_t dbuf_cache_target = dbuf_cache_target_bytes();
+ return (dbuf_cache_target +
+ (dbuf_cache_target * dbuf_cache_hiwater_pct) / 100);
+}
+
+static inline uint64_t
+dbuf_cache_lowater_bytes(void)
+{
+ uint64_t dbuf_cache_target = dbuf_cache_target_bytes();
+ return (dbuf_cache_target -
+ (dbuf_cache_target * dbuf_cache_lowater_pct) / 100);
+}
+
+static inline boolean_t
+dbuf_cache_above_lowater(void)
+{
+ return (zfs_refcount_count(&dbuf_caches[DB_DBUF_CACHE].size) >
+ dbuf_cache_lowater_bytes());
+}
+
+/*
+ * Evict the oldest eligible dbuf from the dbuf cache.
+ */
+static void
+dbuf_evict_one(void)
+{
+ int idx = multilist_get_random_index(dbuf_caches[DB_DBUF_CACHE].cache);
+ multilist_sublist_t *mls = multilist_sublist_lock(
+ dbuf_caches[DB_DBUF_CACHE].cache, idx);
+
+ ASSERT(!MUTEX_HELD(&dbuf_evict_lock));
+
+ dmu_buf_impl_t *db = multilist_sublist_tail(mls);
+ while (db != NULL && mutex_tryenter(&db->db_mtx) == 0) {
+ db = multilist_sublist_prev(mls, db);
+ }
+
+ DTRACE_PROBE2(dbuf__evict__one, dmu_buf_impl_t *, db,
+ multilist_sublist_t *, mls);
+
+ if (db != NULL) {
+ multilist_sublist_remove(mls, db);
+ multilist_sublist_unlock(mls);
+ (void) zfs_refcount_remove_many(
+ &dbuf_caches[DB_DBUF_CACHE].size,
+ db->db.db_size, db);
+ DBUF_STAT_BUMPDOWN(cache_levels[db->db_level]);
+ DBUF_STAT_BUMPDOWN(cache_count);
+ DBUF_STAT_DECR(cache_levels_bytes[db->db_level],
+ db->db.db_size);
+ ASSERT3U(db->db_caching_status, ==, DB_DBUF_CACHE);
+ db->db_caching_status = DB_NO_CACHE;
+ dbuf_destroy(db);
+ DBUF_STAT_BUMP(cache_total_evicts);
+ } else {
+ multilist_sublist_unlock(mls);
+ }
+}
+
+/*
+ * The dbuf evict thread is responsible for aging out dbufs from the
+ * cache. Once the cache has reached its maximum size, dbufs are removed
+ * and destroyed. The eviction thread will continue running until the size
+ * of the dbuf cache is at or below the maximum size. Once the dbuf is aged
+ * out of the cache it is destroyed and becomes eligible for arc eviction.
+ */
+/* ARGSUSED */
+static void
+dbuf_evict_thread(void *unused __unused)
+{
+ callb_cpr_t cpr;
+
+ CALLB_CPR_INIT(&cpr, &dbuf_evict_lock, callb_generic_cpr, FTAG);
+
+ mutex_enter(&dbuf_evict_lock);
+ while (!dbuf_evict_thread_exit) {
+ while (!dbuf_cache_above_lowater() && !dbuf_evict_thread_exit) {
+ CALLB_CPR_SAFE_BEGIN(&cpr);
+ (void) cv_timedwait_hires(&dbuf_evict_cv,
+ &dbuf_evict_lock, SEC2NSEC(1), MSEC2NSEC(1), 0);
+ CALLB_CPR_SAFE_END(&cpr, &dbuf_evict_lock);
+#ifdef __FreeBSD__
+ if (dbuf_ksp != NULL)
+ dbuf_ksp->ks_update(dbuf_ksp, KSTAT_READ);
+#endif
+ }
+ mutex_exit(&dbuf_evict_lock);
+
+ /*
+ * Keep evicting as long as we're above the low water mark
+ * for the cache. We do this without holding the locks to
+ * minimize lock contention.
+ */
+ while (dbuf_cache_above_lowater() && !dbuf_evict_thread_exit) {
+ dbuf_evict_one();
+ }
+
+ mutex_enter(&dbuf_evict_lock);
+ }
+
+ dbuf_evict_thread_exit = B_FALSE;
+ cv_broadcast(&dbuf_evict_cv);
+ CALLB_CPR_EXIT(&cpr); /* drops dbuf_evict_lock */
+ thread_exit();
+}
+
+/*
+ * Wake up the dbuf eviction thread if the dbuf cache is at its max size.
+ * If the dbuf cache is at its high water mark, then evict a dbuf from the
+ * dbuf cache using the caller's context.
+ */
+static void
+dbuf_evict_notify(uint64_t size)
+{
+ /*
+ * We check if we should evict without holding the dbuf_evict_lock,
+ * because it's OK to occasionally make the wrong decision here,
+ * and grabbing the lock results in massive lock contention.
+ */
+ if (size > dbuf_cache_max_bytes) {
+ if (size > dbuf_cache_hiwater_bytes())
+ dbuf_evict_one();
+ cv_signal(&dbuf_evict_cv);
+ }
+}
+
+static int
+dbuf_kstat_update(kstat_t *ksp, int rw)
+{
+ dbuf_stats_t *ds = ksp->ks_data;
+
+ if (rw == KSTAT_WRITE) {
+ return (SET_ERROR(EACCES));
+ } else {
+ ds->metadata_cache_size_bytes.value.ui64 =
+ zfs_refcount_count(&dbuf_caches[DB_DBUF_METADATA_CACHE].size);
+ ds->cache_size_bytes.value.ui64 =
+ zfs_refcount_count(&dbuf_caches[DB_DBUF_CACHE].size);
+ ds->cache_target_bytes.value.ui64 = dbuf_cache_target_bytes();
+ ds->cache_hiwater_bytes.value.ui64 = dbuf_cache_hiwater_bytes();
+ ds->cache_lowater_bytes.value.ui64 = dbuf_cache_lowater_bytes();
+ ds->hash_elements.value.ui64 = dbuf_hash_count;
+ }
+
+ return (0);
+}
+
+void
+dbuf_init(void)
+{
+ uint64_t hsize = 1ULL << 16;
+ dbuf_hash_table_t *h = &dbuf_hash_table;
+ int i;
+
+ /*
+ * The hash table is big enough to fill all of physical memory
+ * with an average 4K block size. The table will take up
+ * totalmem*sizeof(void*)/4K (i.e. 2MB/GB with 8-byte pointers).
+ */
+ while (hsize * 4096 < (uint64_t)physmem * PAGESIZE)
+ hsize <<= 1;
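+	/*
+	 * E.g. (illustrative): with 16 GiB of physical memory the loop
+	 * settles on 4M buckets; at 8 bytes per head pointer that is a
+	 * 32 MiB table, matching the 2MB/GB estimate above.
+	 */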
+
+retry:
+ h->hash_table_mask = hsize - 1;
+ h->hash_table = kmem_zalloc(hsize * sizeof (void *), KM_NOSLEEP);
+ if (h->hash_table == NULL) {
+ /* XXX - we should really return an error instead of assert */
+ ASSERT(hsize > (1ULL << 10));
+ hsize >>= 1;
+ goto retry;
+ }
+
+ dbuf_kmem_cache = kmem_cache_create("dmu_buf_impl_t",
+ sizeof (dmu_buf_impl_t),
+ 0, dbuf_cons, dbuf_dest, NULL, NULL, NULL, 0);
+
+ for (i = 0; i < DBUF_MUTEXES; i++)
+ mutex_init(&h->hash_mutexes[i], NULL, MUTEX_DEFAULT, NULL);
+
+ dbuf_stats_init(h);
+ /*
+ * Setup the parameters for the dbuf caches. We set the sizes of the
+	 * dbuf cache and the metadata cache to 1/32nd and 1/64th (default)
+ * of the size of the ARC, respectively. If the values are set in
+ * /etc/system and they're not greater than the size of the ARC, then
+	 * we honor those values.
+ */
+ if (dbuf_cache_max_bytes == 0 ||
+ dbuf_cache_max_bytes >= arc_max_bytes()) {
+ dbuf_cache_max_bytes = arc_max_bytes() >> dbuf_cache_shift;
+ }
+ if (dbuf_metadata_cache_max_bytes == 0 ||
+ dbuf_metadata_cache_max_bytes >= arc_max_bytes()) {
+ dbuf_metadata_cache_max_bytes =
+ arc_max_bytes() >> dbuf_metadata_cache_shift;
+ }
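+	/*
+	 * E.g. (illustrative): with a 4 GiB ARC the defaults come to
+	 * 4 GiB >> 5 = 128 MiB for the dbuf cache and 4 GiB >> 6 =
+	 * 64 MiB for the metadata cache.
+	 */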
+
+ /*
+ * All entries are queued via taskq_dispatch_ent(), so min/maxalloc
+ * configuration is not required.
+ */
+ dbu_evict_taskq = taskq_create("dbu_evict", 1, minclsyspri, 0, 0, 0);
+
+ for (dbuf_cached_state_t dcs = 0; dcs < DB_CACHE_MAX; dcs++) {
+ dbuf_caches[dcs].cache =
+ multilist_create(sizeof (dmu_buf_impl_t),
+ offsetof(dmu_buf_impl_t, db_cache_link),
+ dbuf_cache_multilist_index_func);
+ zfs_refcount_create(&dbuf_caches[dcs].size);
+ }
+
+ dbuf_evict_thread_exit = B_FALSE;
+ mutex_init(&dbuf_evict_lock, NULL, MUTEX_DEFAULT, NULL);
+ cv_init(&dbuf_evict_cv, NULL, CV_DEFAULT, NULL);
+ dbuf_cache_evict_thread = thread_create(NULL, 0, dbuf_evict_thread,
+ NULL, 0, &p0, TS_RUN, minclsyspri);
+
+ dbuf_ksp = kstat_create("zfs", 0, "dbufstats", "misc",
+ KSTAT_TYPE_NAMED, sizeof (dbuf_stats) / sizeof (kstat_named_t),
+ KSTAT_FLAG_VIRTUAL);
+ if (dbuf_ksp != NULL) {
+ for (i = 0; i < DN_MAX_LEVELS; i++) {
+ snprintf(dbuf_stats.cache_levels[i].name,
+ KSTAT_STRLEN, "cache_level_%d", i);
+ dbuf_stats.cache_levels[i].data_type =
+ KSTAT_DATA_UINT64;
+ snprintf(dbuf_stats.cache_levels_bytes[i].name,
+ KSTAT_STRLEN, "cache_level_%d_bytes", i);
+ dbuf_stats.cache_levels_bytes[i].data_type =
+ KSTAT_DATA_UINT64;
+ }
+ dbuf_ksp->ks_data = &dbuf_stats;
+ dbuf_ksp->ks_update = dbuf_kstat_update;
+ kstat_install(dbuf_ksp);
+ }
+}
+
+void
+dbuf_fini(void)
+{
+ dbuf_hash_table_t *h = &dbuf_hash_table;
+ int i;
+
+ dbuf_stats_destroy();
+
+ for (i = 0; i < DBUF_MUTEXES; i++)
+ mutex_destroy(&h->hash_mutexes[i]);
+ kmem_free(h->hash_table, (h->hash_table_mask + 1) * sizeof (void *));
+ kmem_cache_destroy(dbuf_kmem_cache);
+ taskq_destroy(dbu_evict_taskq);
+
+ mutex_enter(&dbuf_evict_lock);
+ dbuf_evict_thread_exit = B_TRUE;
+ while (dbuf_evict_thread_exit) {
+ cv_signal(&dbuf_evict_cv);
+ cv_wait(&dbuf_evict_cv, &dbuf_evict_lock);
+ }
+ mutex_exit(&dbuf_evict_lock);
+
+ mutex_destroy(&dbuf_evict_lock);
+ cv_destroy(&dbuf_evict_cv);
+
+ for (dbuf_cached_state_t dcs = 0; dcs < DB_CACHE_MAX; dcs++) {
+ zfs_refcount_destroy(&dbuf_caches[dcs].size);
+ multilist_destroy(dbuf_caches[dcs].cache);
+ }
+
+ if (dbuf_ksp != NULL) {
+ kstat_delete(dbuf_ksp);
+ dbuf_ksp = NULL;
+ }
+}
+
+/*
+ * Other stuff.
+ */
+
+#ifdef ZFS_DEBUG
+static void
+dbuf_verify(dmu_buf_impl_t *db)
+{
+ dnode_t *dn;
+ dbuf_dirty_record_t *dr;
+
+ ASSERT(MUTEX_HELD(&db->db_mtx));
+
+ if (!(zfs_flags & ZFS_DEBUG_DBUF_VERIFY))
+ return;
+
+ ASSERT(db->db_objset != NULL);
+ DB_DNODE_ENTER(db);
+ dn = DB_DNODE(db);
+ if (dn == NULL) {
+ ASSERT(db->db_parent == NULL);
+ ASSERT(db->db_blkptr == NULL);
+ } else {
+ ASSERT3U(db->db.db_object, ==, dn->dn_object);
+ ASSERT3P(db->db_objset, ==, dn->dn_objset);
+ ASSERT3U(db->db_level, <, dn->dn_nlevels);
+ ASSERT(db->db_blkid == DMU_BONUS_BLKID ||
+ db->db_blkid == DMU_SPILL_BLKID ||
+ !avl_is_empty(&dn->dn_dbufs));
+ }
+ if (db->db_blkid == DMU_BONUS_BLKID) {
+ ASSERT(dn != NULL);
+ ASSERT3U(db->db.db_size, >=, dn->dn_bonuslen);
+ ASSERT3U(db->db.db_offset, ==, DMU_BONUS_BLKID);
+ } else if (db->db_blkid == DMU_SPILL_BLKID) {
+ ASSERT(dn != NULL);
+ ASSERT0(db->db.db_offset);
+ } else {
+ ASSERT3U(db->db.db_offset, ==, db->db_blkid * db->db.db_size);
+ }
+
+ for (dr = db->db_data_pending; dr != NULL; dr = dr->dr_next)
+ ASSERT(dr->dr_dbuf == db);
+
+ for (dr = db->db_last_dirty; dr != NULL; dr = dr->dr_next)
+ ASSERT(dr->dr_dbuf == db);
+
+ /*
+ * We can't assert that db_size matches dn_datablksz because it
+ * can be momentarily different when another thread is doing
+ * dnode_set_blksz().
+ */
+ if (db->db_level == 0 && db->db.db_object == DMU_META_DNODE_OBJECT) {
+ dr = db->db_data_pending;
+ /*
+ * It should only be modified in syncing context, so
+ * make sure we only have one copy of the data.
+ */
+ ASSERT(dr == NULL || dr->dt.dl.dr_data == db->db_buf);
+ }
+
+ /* verify db->db_blkptr */
+ if (db->db_blkptr) {
+ if (db->db_parent == dn->dn_dbuf) {
+ /* db is pointed to by the dnode */
+ /* ASSERT3U(db->db_blkid, <, dn->dn_nblkptr); */
+ if (DMU_OBJECT_IS_SPECIAL(db->db.db_object))
+ ASSERT(db->db_parent == NULL);
+ else
+ ASSERT(db->db_parent != NULL);
+ if (db->db_blkid != DMU_SPILL_BLKID)
+ ASSERT3P(db->db_blkptr, ==,
+ &dn->dn_phys->dn_blkptr[db->db_blkid]);
+ } else {
+ /* db is pointed to by an indirect block */
+ int epb = db->db_parent->db.db_size >> SPA_BLKPTRSHIFT;
+ ASSERT3U(db->db_parent->db_level, ==, db->db_level+1);
+ ASSERT3U(db->db_parent->db.db_object, ==,
+ db->db.db_object);
+ /*
+ * dnode_grow_indblksz() can make this fail if we don't
+ * have the struct_rwlock. XXX indblksz no longer
+ * grows. safe to do this now?
+ */
+ if (RW_WRITE_HELD(&dn->dn_struct_rwlock)) {
+ ASSERT3P(db->db_blkptr, ==,
+ ((blkptr_t *)db->db_parent->db.db_data +
+ db->db_blkid % epb));
+ }
+ }
+ }
+ if ((db->db_blkptr == NULL || BP_IS_HOLE(db->db_blkptr)) &&
+ (db->db_buf == NULL || db->db_buf->b_data) &&
+ db->db.db_data && db->db_blkid != DMU_BONUS_BLKID &&
+ db->db_state != DB_FILL && !dn->dn_free_txg) {
+ /*
+ * If the blkptr isn't set but they have nonzero data,
+ * it had better be dirty, otherwise we'll lose that
+ * data when we evict this buffer.
+ *
+ * There is an exception to this rule for indirect blocks; in
+ * this case, if the indirect block is a hole, we fill in a few
+ * fields on each of the child blocks (importantly, birth time)
+ * to prevent hole birth times from being lost when you
+ * partially fill in a hole.
+ */
+ if (db->db_dirtycnt == 0) {
+ if (db->db_level == 0) {
+ uint64_t *buf = db->db.db_data;
+ int i;
+
+ for (i = 0; i < db->db.db_size >> 3; i++) {
+ ASSERT(buf[i] == 0);
+ }
+ } else {
+ blkptr_t *bps = db->db.db_data;
+ ASSERT3U(1 << DB_DNODE(db)->dn_indblkshift, ==,
+ db->db.db_size);
+ /*
+ * We want to verify that all the blkptrs in the
+ * indirect block are holes, but we may have
+ * automatically set up a few fields for them.
+ * We iterate through each blkptr and verify
+ * they only have those fields set.
+ */
+ for (int i = 0;
+ i < db->db.db_size / sizeof (blkptr_t);
+ i++) {
+ blkptr_t *bp = &bps[i];
+ ASSERT(ZIO_CHECKSUM_IS_ZERO(
+ &bp->blk_cksum));
+ ASSERT(
+ DVA_IS_EMPTY(&bp->blk_dva[0]) &&
+ DVA_IS_EMPTY(&bp->blk_dva[1]) &&
+ DVA_IS_EMPTY(&bp->blk_dva[2]));
+ ASSERT0(bp->blk_fill);
+ ASSERT0(bp->blk_pad[0]);
+ ASSERT0(bp->blk_pad[1]);
+ ASSERT(!BP_IS_EMBEDDED(bp));
+ ASSERT(BP_IS_HOLE(bp));
+ ASSERT0(bp->blk_phys_birth);
+ }
+ }
+ }
+ }
+ DB_DNODE_EXIT(db);
+}
+#endif
+
+static void
+dbuf_clear_data(dmu_buf_impl_t *db)
+{
+ ASSERT(MUTEX_HELD(&db->db_mtx));
+ dbuf_evict_user(db);
+ ASSERT3P(db->db_buf, ==, NULL);
+ db->db.db_data = NULL;
+ if (db->db_state != DB_NOFILL)
+ db->db_state = DB_UNCACHED;
+}
+
+static void
+dbuf_set_data(dmu_buf_impl_t *db, arc_buf_t *buf)
+{
+ ASSERT(MUTEX_HELD(&db->db_mtx));
+ ASSERT(buf != NULL);
+
+ db->db_buf = buf;
+ ASSERT(buf->b_data != NULL);
+ db->db.db_data = buf->b_data;
+}
+
+/*
+ * Loan out an arc_buf for read. Return the loaned arc_buf.
+ */
+arc_buf_t *
+dbuf_loan_arcbuf(dmu_buf_impl_t *db)
+{
+ arc_buf_t *abuf;
+
+ ASSERT(db->db_blkid != DMU_BONUS_BLKID);
+ mutex_enter(&db->db_mtx);
+ if (arc_released(db->db_buf) || zfs_refcount_count(&db->db_holds) > 1) {
+ int blksz = db->db.db_size;
+ spa_t *spa = db->db_objset->os_spa;
+
+ mutex_exit(&db->db_mtx);
+ abuf = arc_loan_buf(spa, B_FALSE, blksz);
+ bcopy(db->db.db_data, abuf->b_data, blksz);
+ } else {
+ abuf = db->db_buf;
+ arc_loan_inuse_buf(abuf, db);
+ db->db_buf = NULL;
+ dbuf_clear_data(db);
+ mutex_exit(&db->db_mtx);
+ }
+ return (abuf);
+}
+
+/*
+ * Calculate which level n block references the data at the level 0 offset
+ * provided.
+ */
+uint64_t
+dbuf_whichblock(dnode_t *dn, int64_t level, uint64_t offset)
+{
+ if (dn->dn_datablkshift != 0 && dn->dn_indblkshift != 0) {
+ /*
+ * The level n blkid is equal to the level 0 blkid divided by
+ * the number of level 0s in a level n block.
+ *
+ * The level 0 blkid is offset >> datablkshift =
+ * offset / 2^datablkshift.
+ *
+ * The number of level 0s in a level n is the number of block
+ * pointers in an indirect block, raised to the power of level.
+ * This is 2^(indblkshift - SPA_BLKPTRSHIFT)^level =
+ * 2^(level*(indblkshift - SPA_BLKPTRSHIFT)).
+ *
+ * Thus, the level n blkid is: offset /
+ * ((2^datablkshift)*(2^(level*(indblkshift - SPA_BLKPTRSHIFT)))
+ * = offset / 2^(datablkshift + level *
+ * (indblkshift - SPA_BLKPTRSHIFT))
+ * = offset >> (datablkshift + level *
+ * (indblkshift - SPA_BLKPTRSHIFT))
+ */
+ return (offset >> (dn->dn_datablkshift + level *
+ (dn->dn_indblkshift - SPA_BLKPTRSHIFT)));
+ } else {
+ ASSERT3U(offset, <, dn->dn_datablksz);
+ return (0);
+ }
+}
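+
+/*
+ * Worked example (illustrative): assume 128K data blocks
+ * (datablkshift = 17) and 128K indirect blocks (indblkshift = 17), so
+ * each indirect block holds 2^(17 - SPA_BLKPTRSHIFT) = 2^10 = 1024
+ * block pointers. For offset = 5000 * 128K:
+ *	level 0: blkid = offset >> 17		 = 5000
+ *	level 1: blkid = offset >> (17 + 1 * 10) = 4
+ *	level 2: blkid = offset >> (17 + 2 * 10) = 0
+ */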
+
+static void
+dbuf_read_done(zio_t *zio, const zbookmark_phys_t *zb, const blkptr_t *bp,
+ arc_buf_t *buf, void *vdb)
+{
+ dmu_buf_impl_t *db = vdb;
+
+ mutex_enter(&db->db_mtx);
+ ASSERT3U(db->db_state, ==, DB_READ);
+ /*
+	 * All reads are synchronous, so we must have a hold on the dbuf.
+ */
+ ASSERT(zfs_refcount_count(&db->db_holds) > 0);
+ ASSERT(db->db_buf == NULL);
+ ASSERT(db->db.db_data == NULL);
+ if (buf == NULL) {
+ /* i/o error */
+ ASSERT(zio == NULL || zio->io_error != 0);
+ ASSERT(db->db_blkid != DMU_BONUS_BLKID);
+ ASSERT3P(db->db_buf, ==, NULL);
+ db->db_state = DB_UNCACHED;
+ } else if (db->db_level == 0 && db->db_freed_in_flight) {
+ /* freed in flight */
+ ASSERT(zio == NULL || zio->io_error == 0);
+ if (buf == NULL) {
+ buf = arc_alloc_buf(db->db_objset->os_spa,
+ db, DBUF_GET_BUFC_TYPE(db), db->db.db_size);
+ }
+ arc_release(buf, db);
+ bzero(buf->b_data, db->db.db_size);
+ arc_buf_freeze(buf);
+ db->db_freed_in_flight = FALSE;
+ dbuf_set_data(db, buf);
+ db->db_state = DB_CACHED;
+ } else {
+ /* success */
+ ASSERT(zio == NULL || zio->io_error == 0);
+ dbuf_set_data(db, buf);
+ db->db_state = DB_CACHED;
+ }
+ cv_broadcast(&db->db_changed);
+ dbuf_rele_and_unlock(db, NULL, B_FALSE);
+}
+
+static void
+dbuf_read_impl(dmu_buf_impl_t *db, zio_t *zio, uint32_t flags)
+{
+ dnode_t *dn;
+ zbookmark_phys_t zb;
+ arc_flags_t aflags = ARC_FLAG_NOWAIT;
+
+ DB_DNODE_ENTER(db);
+ dn = DB_DNODE(db);
+ ASSERT(!zfs_refcount_is_zero(&db->db_holds));
+ /* We need the struct_rwlock to prevent db_blkptr from changing. */
+ ASSERT(RW_LOCK_HELD(&dn->dn_struct_rwlock));
+ ASSERT(MUTEX_HELD(&db->db_mtx));
+ ASSERT(db->db_state == DB_UNCACHED);
+ ASSERT(db->db_buf == NULL);
+
+ if (db->db_blkid == DMU_BONUS_BLKID) {
+ /*
+ * The bonus length stored in the dnode may be less than
+ * the maximum available space in the bonus buffer.
+ */
+ int bonuslen = MIN(dn->dn_bonuslen, dn->dn_phys->dn_bonuslen);
+ int max_bonuslen = DN_SLOTS_TO_BONUSLEN(dn->dn_num_slots);
+
+ ASSERT3U(bonuslen, <=, db->db.db_size);
+ db->db.db_data = zio_buf_alloc(max_bonuslen);
+ arc_space_consume(max_bonuslen, ARC_SPACE_BONUS);
+ if (bonuslen < max_bonuslen)
+ bzero(db->db.db_data, max_bonuslen);
+ if (bonuslen)
+ bcopy(DN_BONUS(dn->dn_phys), db->db.db_data, bonuslen);
+ DB_DNODE_EXIT(db);
+ db->db_state = DB_CACHED;
+ mutex_exit(&db->db_mtx);
+ return;
+ }
+
+ /*
+ * Recheck BP_IS_HOLE() after dnode_block_freed() in case dnode_sync()
+ * processes the delete record and clears the bp while we are waiting
+ * for the dn_mtx (resulting in a "no" from block_freed).
+ */
+ if (db->db_blkptr == NULL || BP_IS_HOLE(db->db_blkptr) ||
+ (db->db_level == 0 && (dnode_block_freed(dn, db->db_blkid) ||
+ BP_IS_HOLE(db->db_blkptr)))) {
+ arc_buf_contents_t type = DBUF_GET_BUFC_TYPE(db);
+
+ dbuf_set_data(db, arc_alloc_buf(db->db_objset->os_spa, db, type,
+ db->db.db_size));
+ bzero(db->db.db_data, db->db.db_size);
+
+ if (db->db_blkptr != NULL && db->db_level > 0 &&
+ BP_IS_HOLE(db->db_blkptr) &&
+ db->db_blkptr->blk_birth != 0) {
+ blkptr_t *bps = db->db.db_data;
+ for (int i = 0; i < ((1 <<
+ DB_DNODE(db)->dn_indblkshift) / sizeof (blkptr_t));
+ i++) {
+ blkptr_t *bp = &bps[i];
+ ASSERT3U(BP_GET_LSIZE(db->db_blkptr), ==,
+ 1 << dn->dn_indblkshift);
+ BP_SET_LSIZE(bp,
+ BP_GET_LEVEL(db->db_blkptr) == 1 ?
+ dn->dn_datablksz :
+ BP_GET_LSIZE(db->db_blkptr));
+ BP_SET_TYPE(bp, BP_GET_TYPE(db->db_blkptr));
+ BP_SET_LEVEL(bp,
+ BP_GET_LEVEL(db->db_blkptr) - 1);
+ BP_SET_BIRTH(bp, db->db_blkptr->blk_birth, 0);
+ }
+ }
+ DB_DNODE_EXIT(db);
+ db->db_state = DB_CACHED;
+ mutex_exit(&db->db_mtx);
+ return;
+ }
+
+ DB_DNODE_EXIT(db);
+
+ db->db_state = DB_READ;
+ mutex_exit(&db->db_mtx);
+
+ if (DBUF_IS_L2CACHEABLE(db))
+ aflags |= ARC_FLAG_L2CACHE;
+
+ SET_BOOKMARK(&zb, db->db_objset->os_dsl_dataset ?
+ db->db_objset->os_dsl_dataset->ds_object : DMU_META_OBJSET,
+ db->db.db_object, db->db_level, db->db_blkid);
+
+ dbuf_add_ref(db, NULL);
+
+ (void) arc_read(zio, db->db_objset->os_spa, db->db_blkptr,
+ dbuf_read_done, db, ZIO_PRIORITY_SYNC_READ,
+ (flags & DB_RF_CANFAIL) ? ZIO_FLAG_CANFAIL : ZIO_FLAG_MUSTSUCCEED,
+ &aflags, &zb);
+}
+
+/*
+ * This is our just-in-time copy function. It makes a copy of buffers that
+ * have been modified in a previous transaction group before we access them in
+ * the current active group.
+ *
+ * This function is used in three places: when we are dirtying a buffer for the
+ * first time in a txg, when we are freeing a range in a dnode that includes
+ * this buffer, and when we are accessing a buffer which was received compressed
+ * and later referenced in a WRITE_BYREF record.
+ *
+ * Note that when we are called from dbuf_free_range() we do not put a hold on
+ * the buffer, we just traverse the active dbuf list for the dnode.
+ */
+static void
+dbuf_fix_old_data(dmu_buf_impl_t *db, uint64_t txg)
+{
+ dbuf_dirty_record_t *dr = db->db_last_dirty;
+
+ ASSERT(MUTEX_HELD(&db->db_mtx));
+ ASSERT(db->db.db_data != NULL);
+ ASSERT(db->db_level == 0);
+ ASSERT(db->db.db_object != DMU_META_DNODE_OBJECT);
+
+ if (dr == NULL ||
+ (dr->dt.dl.dr_data !=
+ ((db->db_blkid == DMU_BONUS_BLKID) ? db->db.db_data : db->db_buf)))
+ return;
+
+ /*
+ * If the last dirty record for this dbuf has not yet synced
+	 * and it is referencing the dbuf data, either:
+	 *	reset the reference to point to a new copy,
+	 * or (if there are no active holders)
+ * just null out the current db_data pointer.
+ */
+ ASSERT(dr->dr_txg >= txg - 2);
+ if (db->db_blkid == DMU_BONUS_BLKID) {
+ /* Note that the data bufs here are zio_bufs */
+ dnode_t *dn = DB_DNODE(db);
+ int bonuslen = DN_SLOTS_TO_BONUSLEN(dn->dn_num_slots);
+ dr->dt.dl.dr_data = zio_buf_alloc(bonuslen);
+ arc_space_consume(bonuslen, ARC_SPACE_BONUS);
+ bcopy(db->db.db_data, dr->dt.dl.dr_data, bonuslen);
+ } else if (zfs_refcount_count(&db->db_holds) > db->db_dirtycnt) {
+ int size = arc_buf_size(db->db_buf);
+ arc_buf_contents_t type = DBUF_GET_BUFC_TYPE(db);
+ spa_t *spa = db->db_objset->os_spa;
+ enum zio_compress compress_type =
+ arc_get_compression(db->db_buf);
+
+ if (compress_type == ZIO_COMPRESS_OFF) {
+ dr->dt.dl.dr_data = arc_alloc_buf(spa, db, type, size);
+ } else {
+ ASSERT3U(type, ==, ARC_BUFC_DATA);
+ dr->dt.dl.dr_data = arc_alloc_compressed_buf(spa, db,
+ size, arc_buf_lsize(db->db_buf), compress_type);
+ }
+ bcopy(db->db.db_data, dr->dt.dl.dr_data->b_data, size);
+ } else {
+ db->db_buf = NULL;
+ dbuf_clear_data(db);
+ }
+}
+
+int
+dbuf_read(dmu_buf_impl_t *db, zio_t *zio, uint32_t flags)
+{
+ int err = 0;
+ boolean_t prefetch;
+ dnode_t *dn;
+
+ /*
+	 * We don't have to hold the mutex to check db_state because the
+	 * dbuf can't be freed while we have a hold on the buffer.
+ */
+ ASSERT(!zfs_refcount_is_zero(&db->db_holds));
+
+ if (db->db_state == DB_NOFILL)
+ return (SET_ERROR(EIO));
+
+ DB_DNODE_ENTER(db);
+ dn = DB_DNODE(db);
+ if ((flags & DB_RF_HAVESTRUCT) == 0)
+ rw_enter(&dn->dn_struct_rwlock, RW_READER);
+
+ prefetch = db->db_level == 0 && db->db_blkid != DMU_BONUS_BLKID &&
+ (flags & DB_RF_NOPREFETCH) == 0 && dn != NULL &&
+ DBUF_IS_CACHEABLE(db);
+
+ mutex_enter(&db->db_mtx);
+ if (db->db_state == DB_CACHED) {
+ /*
+ * If the arc buf is compressed, we need to decompress it to
+ * read the data. This could happen during the "zfs receive" of
+ * a stream which is compressed and deduplicated.
+ */
+ if (db->db_buf != NULL &&
+ arc_get_compression(db->db_buf) != ZIO_COMPRESS_OFF) {
+ dbuf_fix_old_data(db,
+ spa_syncing_txg(dmu_objset_spa(db->db_objset)));
+ err = arc_decompress(db->db_buf);
+ dbuf_set_data(db, db->db_buf);
+ }
+ mutex_exit(&db->db_mtx);
+ if (prefetch)
+ dmu_zfetch(&dn->dn_zfetch, db->db_blkid, 1, B_TRUE);
+ if ((flags & DB_RF_HAVESTRUCT) == 0)
+ rw_exit(&dn->dn_struct_rwlock);
+ DB_DNODE_EXIT(db);
+ DBUF_STAT_BUMP(hash_hits);
+ } else if (db->db_state == DB_UNCACHED) {
+ spa_t *spa = dn->dn_objset->os_spa;
+ boolean_t need_wait = B_FALSE;
+
+ if (zio == NULL &&
+ db->db_blkptr != NULL && !BP_IS_HOLE(db->db_blkptr)) {
+ zio = zio_root(spa, NULL, NULL, ZIO_FLAG_CANFAIL);
+ need_wait = B_TRUE;
+ }
+ dbuf_read_impl(db, zio, flags);
+
+ /* dbuf_read_impl has dropped db_mtx for us */
+
+ if (prefetch)
+ dmu_zfetch(&dn->dn_zfetch, db->db_blkid, 1, B_TRUE);
+
+ if ((flags & DB_RF_HAVESTRUCT) == 0)
+ rw_exit(&dn->dn_struct_rwlock);
+ DB_DNODE_EXIT(db);
+ DBUF_STAT_BUMP(hash_misses);
+
+ if (need_wait)
+ err = zio_wait(zio);
+ } else {
+ /*
+ * Another reader came in while the dbuf was in flight
+ * between UNCACHED and CACHED. Either a writer will finish
+ * writing the buffer (sending the dbuf to CACHED) or the
+ * first reader's request will reach the read_done callback
+ * and send the dbuf to CACHED. Otherwise, a failure
+ * occurred and the dbuf went to UNCACHED.
+ */
+ mutex_exit(&db->db_mtx);
+ if (prefetch)
+ dmu_zfetch(&dn->dn_zfetch, db->db_blkid, 1, B_TRUE);
+ if ((flags & DB_RF_HAVESTRUCT) == 0)
+ rw_exit(&dn->dn_struct_rwlock);
+ DB_DNODE_EXIT(db);
+ DBUF_STAT_BUMP(hash_misses);
+
+ /* Skip the wait per the caller's request. */
+ mutex_enter(&db->db_mtx);
+ if ((flags & DB_RF_NEVERWAIT) == 0) {
+ while (db->db_state == DB_READ ||
+ db->db_state == DB_FILL) {
+ ASSERT(db->db_state == DB_READ ||
+ (flags & DB_RF_HAVESTRUCT) == 0);
+ DTRACE_PROBE2(blocked__read, dmu_buf_impl_t *,
+ db, zio_t *, zio);
+ cv_wait(&db->db_changed, &db->db_mtx);
+ }
+ if (db->db_state == DB_UNCACHED)
+ err = SET_ERROR(EIO);
+ }
+ mutex_exit(&db->db_mtx);
+ }
+
+ return (err);
+}
+
+static void
+dbuf_noread(dmu_buf_impl_t *db)
+{
+ ASSERT(!zfs_refcount_is_zero(&db->db_holds));
+ ASSERT(db->db_blkid != DMU_BONUS_BLKID);
+ mutex_enter(&db->db_mtx);
+ while (db->db_state == DB_READ || db->db_state == DB_FILL)
+ cv_wait(&db->db_changed, &db->db_mtx);
+ if (db->db_state == DB_UNCACHED) {
+ arc_buf_contents_t type = DBUF_GET_BUFC_TYPE(db);
+ spa_t *spa = db->db_objset->os_spa;
+
+ ASSERT(db->db_buf == NULL);
+ ASSERT(db->db.db_data == NULL);
+ dbuf_set_data(db, arc_alloc_buf(spa, db, type, db->db.db_size));
+ db->db_state = DB_FILL;
+ } else if (db->db_state == DB_NOFILL) {
+ dbuf_clear_data(db);
+ } else {
+ ASSERT3U(db->db_state, ==, DB_CACHED);
+ }
+ mutex_exit(&db->db_mtx);
+}
+
+void
+dbuf_unoverride(dbuf_dirty_record_t *dr)
+{
+ dmu_buf_impl_t *db = dr->dr_dbuf;
+ blkptr_t *bp = &dr->dt.dl.dr_overridden_by;
+ uint64_t txg = dr->dr_txg;
+
+ ASSERT(MUTEX_HELD(&db->db_mtx));
+ /*
+ * This assert is valid because dmu_sync() expects to be called by
+ * a zilog's get_data while holding a range lock. This call only
+ * comes from dbuf_dirty() callers who must also hold a range lock.
+ */
+ ASSERT(dr->dt.dl.dr_override_state != DR_IN_DMU_SYNC);
+ ASSERT(db->db_level == 0);
+
+ if (db->db_blkid == DMU_BONUS_BLKID ||
+ dr->dt.dl.dr_override_state == DR_NOT_OVERRIDDEN)
+ return;
+
+ ASSERT(db->db_data_pending != dr);
+
+ /* free this block */
+ if (!BP_IS_HOLE(bp) && !dr->dt.dl.dr_nopwrite)
+ zio_free(db->db_objset->os_spa, txg, bp);
+
+ dr->dt.dl.dr_override_state = DR_NOT_OVERRIDDEN;
+ dr->dt.dl.dr_nopwrite = B_FALSE;
+
+ /*
+ * Release the already-written buffer, so we leave it in
+ * a consistent dirty state. Note that all callers are
+ * modifying the buffer, so they will immediately do
+ * another (redundant) arc_release(). Therefore, leave
+ * the buf thawed to save the effort of freezing &
+ * immediately re-thawing it.
+ */
+ arc_release(dr->dt.dl.dr_data, db);
+}
+
+/*
+ * Evict (if it's unreferenced) or clear (if it's referenced) any level-0
+ * data blocks in the free range, so that any future readers will find
+ * empty blocks.
+ */
+void
+dbuf_free_range(dnode_t *dn, uint64_t start_blkid, uint64_t end_blkid,
+ dmu_tx_t *tx)
+{
+ dmu_buf_impl_t db_search;
+ dmu_buf_impl_t *db, *db_next;
+ uint64_t txg = tx->tx_txg;
+ avl_index_t where;
+
+ if (end_blkid > dn->dn_maxblkid &&
+ !(start_blkid == DMU_SPILL_BLKID || end_blkid == DMU_SPILL_BLKID))
+ end_blkid = dn->dn_maxblkid;
+ dprintf_dnode(dn, "start=%llu end=%llu\n", start_blkid, end_blkid);
+
+ db_search.db_level = 0;
+ db_search.db_blkid = start_blkid;
+ db_search.db_state = DB_SEARCH;
+
+ mutex_enter(&dn->dn_dbufs_mtx);
+ db = avl_find(&dn->dn_dbufs, &db_search, &where);
+ ASSERT3P(db, ==, NULL);
+
+ db = avl_nearest(&dn->dn_dbufs, where, AVL_AFTER);
+
+ for (; db != NULL; db = db_next) {
+ db_next = AVL_NEXT(&dn->dn_dbufs, db);
+ ASSERT(db->db_blkid != DMU_BONUS_BLKID);
+
+ if (db->db_level != 0 || db->db_blkid > end_blkid) {
+ break;
+ }
+ ASSERT3U(db->db_blkid, >=, start_blkid);
+
+ /* found a level 0 buffer in the range */
+ mutex_enter(&db->db_mtx);
+ if (dbuf_undirty(db, tx)) {
+ /* mutex has been dropped and dbuf destroyed */
+ continue;
+ }
+
+ if (db->db_state == DB_UNCACHED ||
+ db->db_state == DB_NOFILL ||
+ db->db_state == DB_EVICTING) {
+ ASSERT(db->db.db_data == NULL);
+ mutex_exit(&db->db_mtx);
+ continue;
+ }
+ if (db->db_state == DB_READ || db->db_state == DB_FILL) {
+ /* will be handled in dbuf_read_done or dbuf_rele */
+ db->db_freed_in_flight = TRUE;
+ mutex_exit(&db->db_mtx);
+ continue;
+ }
+ if (zfs_refcount_count(&db->db_holds) == 0) {
+ ASSERT(db->db_buf);
+ dbuf_destroy(db);
+ continue;
+ }
+ /* The dbuf is referenced */
+
+ if (db->db_last_dirty != NULL) {
+ dbuf_dirty_record_t *dr = db->db_last_dirty;
+
+ if (dr->dr_txg == txg) {
+ /*
+ * This buffer is "in-use", re-adjust the file
+ * size to reflect that this buffer may
+ * contain new data when we sync.
+ */
+ if (db->db_blkid != DMU_SPILL_BLKID &&
+ db->db_blkid > dn->dn_maxblkid)
+ dn->dn_maxblkid = db->db_blkid;
+ dbuf_unoverride(dr);
+ } else {
+ /*
+ * This dbuf is not dirty in the open context.
+			 * Either uncache it (if it's not referenced in
+ * the open context) or reset its contents to
+ * empty.
+ */
+ dbuf_fix_old_data(db, txg);
+ }
+ }
+		/* clear the contents if it's cached */
+ if (db->db_state == DB_CACHED) {
+ ASSERT(db->db.db_data != NULL);
+ arc_release(db->db_buf, db);
+ bzero(db->db.db_data, db->db.db_size);
+ arc_buf_freeze(db->db_buf);
+ }
+
+ mutex_exit(&db->db_mtx);
+ }
+ mutex_exit(&dn->dn_dbufs_mtx);
+}
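+
+/*
+ * Illustrative sketch (not from the original code): dbuf_free_range() is
+ * invoked on behalf of the dnode-level free path once the free has been
+ * committed to an assigned transaction, roughly:
+ *
+ *	dbuf_free_range(dn, first_blkid, last_blkid, tx);
+ *
+ * Cached but unreferenced level-0 dbufs in [first_blkid, last_blkid] are
+ * destroyed outright; referenced ones are zeroed so that later readers
+ * observe holes.
+ */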
+
+void
+dbuf_new_size(dmu_buf_impl_t *db, int size, dmu_tx_t *tx)
+{
+ arc_buf_t *buf, *obuf;
+ int osize = db->db.db_size;
+ arc_buf_contents_t type = DBUF_GET_BUFC_TYPE(db);
+ dnode_t *dn;
+
+ ASSERT(db->db_blkid != DMU_BONUS_BLKID);
+
+ DB_DNODE_ENTER(db);
+ dn = DB_DNODE(db);
+
+ /* XXX does *this* func really need the lock? */
+ ASSERT(RW_WRITE_HELD(&dn->dn_struct_rwlock));
+
+ /*
+ * This call to dmu_buf_will_dirty() with the dn_struct_rwlock held
+ * is OK, because there can be no other references to the db
+ * when we are changing its size, so no concurrent DB_FILL can
+ * be happening.
+ */
+ /*
+ * XXX we should be doing a dbuf_read, checking the return
+ * value and returning that up to our callers
+ */
+ dmu_buf_will_dirty(&db->db, tx);
+
+ /* create the data buffer for the new block */
+ buf = arc_alloc_buf(dn->dn_objset->os_spa, db, type, size);
+
+ /* copy old block data to the new block */
+ obuf = db->db_buf;
+ bcopy(obuf->b_data, buf->b_data, MIN(osize, size));
+ /* zero the remainder */
+ if (size > osize)
+ bzero((uint8_t *)buf->b_data + osize, size - osize);
+
+ mutex_enter(&db->db_mtx);
+ dbuf_set_data(db, buf);
+ arc_buf_destroy(obuf, db);
+ db->db.db_size = size;
+
+ if (db->db_level == 0) {
+ ASSERT3U(db->db_last_dirty->dr_txg, ==, tx->tx_txg);
+ db->db_last_dirty->dt.dl.dr_data = buf;
+ }
+ mutex_exit(&db->db_mtx);
+
+ dmu_objset_willuse_space(dn->dn_objset, size - osize, tx);
+ DB_DNODE_EXIT(db);
+}
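+
+/*
+ * Worked example for the copy/zero logic in dbuf_new_size() above:
+ * growing a 512-byte block to 1024 bytes (osize = 512, size = 1024)
+ * copies MIN(512, 1024) = 512 bytes from the old buffer and zeroes
+ * bytes 512..1023 of the new one; shrinking copies only the first
+ * 'size' bytes and zeroes nothing.
+ */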
+
+void
+dbuf_release_bp(dmu_buf_impl_t *db)
+{
+ objset_t *os = db->db_objset;
+
+ ASSERT(dsl_pool_sync_context(dmu_objset_pool(os)));
+ ASSERT(arc_released(os->os_phys_buf) ||
+ list_link_active(&os->os_dsl_dataset->ds_synced_link));
+ ASSERT(db->db_parent == NULL || arc_released(db->db_parent->db_buf));
+
+ (void) arc_release(db->db_buf, db);
+}
+
+/*
+ * We already have a dirty record for this TXG, and we are being
+ * dirtied again.
+ */
+static void
+dbuf_redirty(dbuf_dirty_record_t *dr)
+{
+ dmu_buf_impl_t *db = dr->dr_dbuf;
+
+ ASSERT(MUTEX_HELD(&db->db_mtx));
+
+ if (db->db_level == 0 && db->db_blkid != DMU_BONUS_BLKID) {
+ /*
+ * If this buffer has already been written out,
+ * we now need to reset its state.
+ */
+ dbuf_unoverride(dr);
+ if (db->db.db_object != DMU_META_DNODE_OBJECT &&
+ db->db_state != DB_NOFILL) {
+ /* Already released on initial dirty, so just thaw. */
+ ASSERT(arc_released(db->db_buf));
+ arc_buf_thaw(db->db_buf);
+ }
+ }
+}
+
+dbuf_dirty_record_t *
+dbuf_dirty(dmu_buf_impl_t *db, dmu_tx_t *tx)
+{
+ dnode_t *dn;
+ objset_t *os;
+ dbuf_dirty_record_t **drp, *dr;
+ int drop_struct_lock = FALSE;
+ int txgoff = tx->tx_txg & TXG_MASK;
+
+ ASSERT(tx->tx_txg != 0);
+ ASSERT(!zfs_refcount_is_zero(&db->db_holds));
+ DMU_TX_DIRTY_BUF(tx, db);
+
+ DB_DNODE_ENTER(db);
+ dn = DB_DNODE(db);
+ /*
+ * Shouldn't dirty a regular buffer in syncing context. Private
+ * objects may be dirtied in syncing context, but only if they
+ * were already pre-dirtied in open context.
+ */
+#ifdef DEBUG
+ if (dn->dn_objset->os_dsl_dataset != NULL) {
+ rrw_enter(&dn->dn_objset->os_dsl_dataset->ds_bp_rwlock,
+ RW_READER, FTAG);
+ }
+ ASSERT(!dmu_tx_is_syncing(tx) ||
+ BP_IS_HOLE(dn->dn_objset->os_rootbp) ||
+ DMU_OBJECT_IS_SPECIAL(dn->dn_object) ||
+ dn->dn_objset->os_dsl_dataset == NULL);
+ if (dn->dn_objset->os_dsl_dataset != NULL)
+ rrw_exit(&dn->dn_objset->os_dsl_dataset->ds_bp_rwlock, FTAG);
+#endif
+ /*
+ * We make this assert for private objects as well, but after we
+ * check if we're already dirty. They are allowed to re-dirty
+ * in syncing context.
+ */
+ ASSERT(dn->dn_object == DMU_META_DNODE_OBJECT ||
+ dn->dn_dirtyctx == DN_UNDIRTIED || dn->dn_dirtyctx ==
+ (dmu_tx_is_syncing(tx) ? DN_DIRTY_SYNC : DN_DIRTY_OPEN));
+
+ mutex_enter(&db->db_mtx);
+ /*
+ * XXX make this true for indirects too? The problem is that
+ * transactions created with dmu_tx_create_assigned() from
+ * syncing context don't bother holding ahead.
+ */
+ ASSERT(db->db_level != 0 ||
+ db->db_state == DB_CACHED || db->db_state == DB_FILL ||
+ db->db_state == DB_NOFILL);
+
+ mutex_enter(&dn->dn_mtx);
+ /*
+ * Don't set dirtyctx to SYNC if we're just modifying this as we
+ * initialize the objset.
+ */
+ if (dn->dn_dirtyctx == DN_UNDIRTIED) {
+ if (dn->dn_objset->os_dsl_dataset != NULL) {
+ rrw_enter(&dn->dn_objset->os_dsl_dataset->ds_bp_rwlock,
+ RW_READER, FTAG);
+ }
+ if (!BP_IS_HOLE(dn->dn_objset->os_rootbp)) {
+ dn->dn_dirtyctx = (dmu_tx_is_syncing(tx) ?
+ DN_DIRTY_SYNC : DN_DIRTY_OPEN);
+ ASSERT(dn->dn_dirtyctx_firstset == NULL);
+ dn->dn_dirtyctx_firstset = kmem_alloc(1, KM_SLEEP);
+ }
+ if (dn->dn_objset->os_dsl_dataset != NULL) {
+ rrw_exit(&dn->dn_objset->os_dsl_dataset->ds_bp_rwlock,
+ FTAG);
+ }
+ }
+
+ if (tx->tx_txg > dn->dn_dirty_txg)
+ dn->dn_dirty_txg = tx->tx_txg;
+ mutex_exit(&dn->dn_mtx);
+
+ if (db->db_blkid == DMU_SPILL_BLKID)
+ dn->dn_have_spill = B_TRUE;
+
+ /*
+ * If this buffer is already dirty, we're done.
+ */
+ drp = &db->db_last_dirty;
+ ASSERT(*drp == NULL || (*drp)->dr_txg <= tx->tx_txg ||
+ db->db.db_object == DMU_META_DNODE_OBJECT);
+ while ((dr = *drp) != NULL && dr->dr_txg > tx->tx_txg)
+ drp = &dr->dr_next;
+ if (dr && dr->dr_txg == tx->tx_txg) {
+ DB_DNODE_EXIT(db);
+
+ dbuf_redirty(dr);
+ mutex_exit(&db->db_mtx);
+ return (dr);
+ }
+
+ /*
+ * Only valid if not already dirty.
+ */
+ ASSERT(dn->dn_object == 0 ||
+ dn->dn_dirtyctx == DN_UNDIRTIED || dn->dn_dirtyctx ==
+ (dmu_tx_is_syncing(tx) ? DN_DIRTY_SYNC : DN_DIRTY_OPEN));
+
+ ASSERT3U(dn->dn_nlevels, >, db->db_level);
+
+ /*
+ * We should only be dirtying in syncing context if it's the
+ * mos or we're initializing the os or it's a special object.
+ * However, we are allowed to dirty in syncing context provided
+ * we already dirtied it in open context. Hence we must make
+ * this assertion only if we're not already dirty.
+ */
+ os = dn->dn_objset;
+ VERIFY3U(tx->tx_txg, <=, spa_final_dirty_txg(os->os_spa));
+#ifdef DEBUG
+ if (dn->dn_objset->os_dsl_dataset != NULL)
+ rrw_enter(&os->os_dsl_dataset->ds_bp_rwlock, RW_READER, FTAG);
+ ASSERT(!dmu_tx_is_syncing(tx) || DMU_OBJECT_IS_SPECIAL(dn->dn_object) ||
+ os->os_dsl_dataset == NULL || BP_IS_HOLE(os->os_rootbp));
+ if (dn->dn_objset->os_dsl_dataset != NULL)
+ rrw_exit(&os->os_dsl_dataset->ds_bp_rwlock, FTAG);
+#endif
+ ASSERT(db->db.db_size != 0);
+
+ dprintf_dbuf(db, "size=%llx\n", (u_longlong_t)db->db.db_size);
+
+ if (db->db_blkid != DMU_BONUS_BLKID) {
+ dmu_objset_willuse_space(os, db->db.db_size, tx);
+ }
+
+ /*
+ * If this buffer is dirty in an old transaction group we need
+ * to make a copy of it so that the changes we make in this
+ * transaction group won't leak out when we sync the older txg.
+ */
+ dr = kmem_zalloc(sizeof (dbuf_dirty_record_t), KM_SLEEP);
+ list_link_init(&dr->dr_dirty_node);
+ if (db->db_level == 0) {
+ void *data_old = db->db_buf;
+
+ if (db->db_state != DB_NOFILL) {
+ if (db->db_blkid == DMU_BONUS_BLKID) {
+ dbuf_fix_old_data(db, tx->tx_txg);
+ data_old = db->db.db_data;
+ } else if (db->db.db_object != DMU_META_DNODE_OBJECT) {
+ /*
+ * Release the data buffer from the cache so
+ * that we can modify it without impacting
+ * possible other users of this cached data
+ * block. Note that indirect blocks and
+ * private objects are not released until the
+ * syncing state (since they are only modified
+ * then).
+ */
+ arc_release(db->db_buf, db);
+ dbuf_fix_old_data(db, tx->tx_txg);
+ data_old = db->db_buf;
+ }
+ ASSERT(data_old != NULL);
+ }
+ dr->dt.dl.dr_data = data_old;
+ } else {
+ mutex_init(&dr->dt.di.dr_mtx, NULL, MUTEX_DEFAULT, NULL);
+ list_create(&dr->dt.di.dr_children,
+ sizeof (dbuf_dirty_record_t),
+ offsetof(dbuf_dirty_record_t, dr_dirty_node));
+ }
+ if (db->db_blkid != DMU_BONUS_BLKID && os->os_dsl_dataset != NULL)
+ dr->dr_accounted = db->db.db_size;
+ dr->dr_dbuf = db;
+ dr->dr_txg = tx->tx_txg;
+ dr->dr_next = *drp;
+ *drp = dr;
+
+ /*
+ * We could have been freed_in_flight between the dbuf_noread
+ * and dbuf_dirty. We win, as though the dbuf_noread() had
+ * happened after the free.
+ */
+ if (db->db_level == 0 && db->db_blkid != DMU_BONUS_BLKID &&
+ db->db_blkid != DMU_SPILL_BLKID) {
+ mutex_enter(&dn->dn_mtx);
+ if (dn->dn_free_ranges[txgoff] != NULL) {
+ range_tree_clear(dn->dn_free_ranges[txgoff],
+ db->db_blkid, 1);
+ }
+ mutex_exit(&dn->dn_mtx);
+ db->db_freed_in_flight = FALSE;
+ }
+
+ /*
+ * This buffer is now part of this txg
+ */
+ dbuf_add_ref(db, (void *)(uintptr_t)tx->tx_txg);
+ db->db_dirtycnt += 1;
+ ASSERT3U(db->db_dirtycnt, <=, 3);
+
+ mutex_exit(&db->db_mtx);
+
+ if (db->db_blkid == DMU_BONUS_BLKID ||
+ db->db_blkid == DMU_SPILL_BLKID) {
+ mutex_enter(&dn->dn_mtx);
+ ASSERT(!list_link_active(&dr->dr_dirty_node));
+ list_insert_tail(&dn->dn_dirty_records[txgoff], dr);
+ mutex_exit(&dn->dn_mtx);
+ dnode_setdirty(dn, tx);
+ DB_DNODE_EXIT(db);
+ return (dr);
+ }
+
+ /*
+ * The dn_struct_rwlock prevents db_blkptr from changing
+ * due to a write from syncing context completing
+ * while we are running, so we want to acquire it before
+ * looking at db_blkptr.
+ */
+ if (!RW_WRITE_HELD(&dn->dn_struct_rwlock)) {
+ rw_enter(&dn->dn_struct_rwlock, RW_READER);
+ drop_struct_lock = TRUE;
+ }
+
+ /*
+ * We need to hold the dn_struct_rwlock to make this assertion,
+ * because it protects dn_phys / dn_next_nlevels from changing.
+ */
+ ASSERT((dn->dn_phys->dn_nlevels == 0 && db->db_level == 0) ||
+ dn->dn_phys->dn_nlevels > db->db_level ||
+ dn->dn_next_nlevels[txgoff] > db->db_level ||
+ dn->dn_next_nlevels[(tx->tx_txg-1) & TXG_MASK] > db->db_level ||
+ dn->dn_next_nlevels[(tx->tx_txg-2) & TXG_MASK] > db->db_level);
+
+ /*
+ * If we are overwriting a dedup BP, then unless it is snapshotted,
+ * when we get to syncing context we will need to decrement its
+ * refcount in the DDT. Prefetch the relevant DDT block so that
+ * syncing context won't have to wait for the i/o.
+ */
+ ddt_prefetch(os->os_spa, db->db_blkptr);
+
+ if (db->db_level == 0) {
+ dnode_new_blkid(dn, db->db_blkid, tx, drop_struct_lock);
+ ASSERT(dn->dn_maxblkid >= db->db_blkid);
+ }
+
+ if (db->db_level+1 < dn->dn_nlevels) {
+ dmu_buf_impl_t *parent = db->db_parent;
+ dbuf_dirty_record_t *di;
+ int parent_held = FALSE;
+
+ if (db->db_parent == NULL || db->db_parent == dn->dn_dbuf) {
+ int epbs = dn->dn_indblkshift - SPA_BLKPTRSHIFT;
+
+ parent = dbuf_hold_level(dn, db->db_level+1,
+ db->db_blkid >> epbs, FTAG);
+ ASSERT(parent != NULL);
+ parent_held = TRUE;
+ }
+ if (drop_struct_lock)
+ rw_exit(&dn->dn_struct_rwlock);
+ ASSERT3U(db->db_level+1, ==, parent->db_level);
+ di = dbuf_dirty(parent, tx);
+ if (parent_held)
+ dbuf_rele(parent, FTAG);
+
+ mutex_enter(&db->db_mtx);
+ /*
+ * Since we've dropped the mutex, it's possible that
+ * dbuf_undirty() might have changed this out from under us.
+ */
+ if (db->db_last_dirty == dr ||
+ dn->dn_object == DMU_META_DNODE_OBJECT) {
+ mutex_enter(&di->dt.di.dr_mtx);
+ ASSERT3U(di->dr_txg, ==, tx->tx_txg);
+ ASSERT(!list_link_active(&dr->dr_dirty_node));
+ list_insert_tail(&di->dt.di.dr_children, dr);
+ mutex_exit(&di->dt.di.dr_mtx);
+ dr->dr_parent = di;
+ }
+ mutex_exit(&db->db_mtx);
+ } else {
+ ASSERT(db->db_level+1 == dn->dn_nlevels);
+ ASSERT(db->db_blkid < dn->dn_nblkptr);
+ ASSERT(db->db_parent == NULL || db->db_parent == dn->dn_dbuf);
+ mutex_enter(&dn->dn_mtx);
+ ASSERT(!list_link_active(&dr->dr_dirty_node));
+ list_insert_tail(&dn->dn_dirty_records[txgoff], dr);
+ mutex_exit(&dn->dn_mtx);
+ if (drop_struct_lock)
+ rw_exit(&dn->dn_struct_rwlock);
+ }
+
+ dnode_setdirty(dn, tx);
+ DB_DNODE_EXIT(db);
+ return (dr);
+}
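+
+/*
+ * The records built by dbuf_dirty() form a singly linked list headed by
+ * db_last_dirty and sorted by descending txg, e.g.:
+ *
+ *	db->db_last_dirty -> dr(txg 12) -> dr(txg 11) -> dr(txg 10) -> NULL
+ *
+ * At most three txgs (open, quiescing, syncing) can be dirty at once,
+ * which is why the function asserts db_dirtycnt <= 3 and why its walk
+ * stops at the first record with dr_txg <= tx->tx_txg.
+ */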
+
+/*
+ * Undirty a buffer in the transaction group referenced by the given
+ * transaction. Return whether this evicted the dbuf.
+ */
+static boolean_t
+dbuf_undirty(dmu_buf_impl_t *db, dmu_tx_t *tx)
+{
+ dnode_t *dn;
+ uint64_t txg = tx->tx_txg;
+ dbuf_dirty_record_t *dr, **drp;
+
+ ASSERT(txg != 0);
+
+ /*
+ * Due to our use of dn_nlevels below, this can only be called
+ * in open context, unless we are operating on the MOS.
+ * From syncing context, dn_nlevels may be different from the
+ * dn_nlevels used when dbuf was dirtied.
+ */
+ ASSERT(db->db_objset ==
+ dmu_objset_pool(db->db_objset)->dp_meta_objset ||
+ txg != spa_syncing_txg(dmu_objset_spa(db->db_objset)));
+ ASSERT(db->db_blkid != DMU_BONUS_BLKID);
+ ASSERT0(db->db_level);
+ ASSERT(MUTEX_HELD(&db->db_mtx));
+
+ /*
+ * If this buffer is not dirty, we're done.
+ */
+ for (drp = &db->db_last_dirty; (dr = *drp) != NULL; drp = &dr->dr_next)
+ if (dr->dr_txg <= txg)
+ break;
+ if (dr == NULL || dr->dr_txg < txg)
+ return (B_FALSE);
+ ASSERT(dr->dr_txg == txg);
+ ASSERT(dr->dr_dbuf == db);
+
+ DB_DNODE_ENTER(db);
+ dn = DB_DNODE(db);
+
+ dprintf_dbuf(db, "size=%llx\n", (u_longlong_t)db->db.db_size);
+
+ ASSERT(db->db.db_size != 0);
+
+ dsl_pool_undirty_space(dmu_objset_pool(dn->dn_objset),
+ dr->dr_accounted, txg);
+
+ *drp = dr->dr_next;
+
+ /*
+ * Note that there are three places in dbuf_dirty()
+ * where this dirty record may be put on a list.
+ * Make sure to do a list_remove corresponding to
+ * every one of those list_insert calls.
+ */
+ if (dr->dr_parent) {
+ mutex_enter(&dr->dr_parent->dt.di.dr_mtx);
+ list_remove(&dr->dr_parent->dt.di.dr_children, dr);
+ mutex_exit(&dr->dr_parent->dt.di.dr_mtx);
+ } else if (db->db_blkid == DMU_SPILL_BLKID ||
+ db->db_level + 1 == dn->dn_nlevels) {
+ ASSERT(db->db_blkptr == NULL || db->db_parent == dn->dn_dbuf);
+ mutex_enter(&dn->dn_mtx);
+ list_remove(&dn->dn_dirty_records[txg & TXG_MASK], dr);
+ mutex_exit(&dn->dn_mtx);
+ }
+ DB_DNODE_EXIT(db);
+
+ if (db->db_state != DB_NOFILL) {
+ dbuf_unoverride(dr);
+
+ ASSERT(db->db_buf != NULL);
+ ASSERT(dr->dt.dl.dr_data != NULL);
+ if (dr->dt.dl.dr_data != db->db_buf)
+ arc_buf_destroy(dr->dt.dl.dr_data, db);
+ }
+
+ kmem_free(dr, sizeof (dbuf_dirty_record_t));
+
+ ASSERT(db->db_dirtycnt > 0);
+ db->db_dirtycnt -= 1;
+
+ if (zfs_refcount_remove(&db->db_holds, (void *)(uintptr_t)txg) == 0) {
+ ASSERT(db->db_state == DB_NOFILL || arc_released(db->db_buf));
+ dbuf_destroy(db);
+ return (B_TRUE);
+ }
+
+ return (B_FALSE);
+}
+
+void
+dmu_buf_will_dirty(dmu_buf_t *db_fake, dmu_tx_t *tx)
+{
+ dmu_buf_impl_t *db = (dmu_buf_impl_t *)db_fake;
+ int rf = DB_RF_MUST_SUCCEED | DB_RF_NOPREFETCH;
+
+ ASSERT(tx->tx_txg != 0);
+ ASSERT(!zfs_refcount_is_zero(&db->db_holds));
+
+ /*
+	 * Quick check for dirtiness. For already dirty blocks, this
+ * reduces runtime of this function by >90%, and overall performance
+ * by 50% for some workloads (e.g. file deletion with indirect blocks
+ * cached).
+ */
+ mutex_enter(&db->db_mtx);
+ dbuf_dirty_record_t *dr;
+ for (dr = db->db_last_dirty;
+ dr != NULL && dr->dr_txg >= tx->tx_txg; dr = dr->dr_next) {
+ /*
+ * It's possible that it is already dirty but not cached,
+ * because there are some calls to dbuf_dirty() that don't
+ * go through dmu_buf_will_dirty().
+ */
+ if (dr->dr_txg == tx->tx_txg && db->db_state == DB_CACHED) {
+ /* This dbuf is already dirty and cached. */
+ dbuf_redirty(dr);
+ mutex_exit(&db->db_mtx);
+ return;
+ }
+ }
+ mutex_exit(&db->db_mtx);
+
+ DB_DNODE_ENTER(db);
+ if (RW_WRITE_HELD(&DB_DNODE(db)->dn_struct_rwlock))
+ rf |= DB_RF_HAVESTRUCT;
+ DB_DNODE_EXIT(db);
+ (void) dbuf_read(db, NULL, rf);
+ (void) dbuf_dirty(db, tx);
+}
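+
+/*
+ * A minimal caller sketch for dmu_buf_will_dirty() (the objset, object
+ * and assigned tx are assumed to exist already):
+ *
+ *	dmu_buf_t *db;
+ *	VERIFY0(dmu_buf_hold(os, object, offset, FTAG, &db, 0));
+ *	dmu_buf_will_dirty(db, tx);
+ *	... modify db->db_data in place ...
+ *	dmu_buf_rele(db, FTAG);
+ */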
+
+void
+dmu_buf_will_not_fill(dmu_buf_t *db_fake, dmu_tx_t *tx)
+{
+ dmu_buf_impl_t *db = (dmu_buf_impl_t *)db_fake;
+
+ db->db_state = DB_NOFILL;
+
+ dmu_buf_will_fill(db_fake, tx);
+}
+
+void
+dmu_buf_will_fill(dmu_buf_t *db_fake, dmu_tx_t *tx)
+{
+ dmu_buf_impl_t *db = (dmu_buf_impl_t *)db_fake;
+
+ ASSERT(db->db_blkid != DMU_BONUS_BLKID);
+ ASSERT(tx->tx_txg != 0);
+ ASSERT(db->db_level == 0);
+ ASSERT(!zfs_refcount_is_zero(&db->db_holds));
+
+ ASSERT(db->db.db_object != DMU_META_DNODE_OBJECT ||
+ dmu_tx_private_ok(tx));
+
+ dbuf_noread(db);
+ (void) dbuf_dirty(db, tx);
+}
+
+#pragma weak dmu_buf_fill_done = dbuf_fill_done
+/* ARGSUSED */
+void
+dbuf_fill_done(dmu_buf_impl_t *db, dmu_tx_t *tx)
+{
+ mutex_enter(&db->db_mtx);
+ DBUF_VERIFY(db);
+
+ if (db->db_state == DB_FILL) {
+ if (db->db_level == 0 && db->db_freed_in_flight) {
+ ASSERT(db->db_blkid != DMU_BONUS_BLKID);
+ /* we were freed while filling */
+ /* XXX dbuf_undirty? */
+ bzero(db->db.db_data, db->db.db_size);
+ db->db_freed_in_flight = FALSE;
+ }
+ db->db_state = DB_CACHED;
+ cv_broadcast(&db->db_changed);
+ }
+ mutex_exit(&db->db_mtx);
+}
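+
+/*
+ * The DB_FILL protocol end to end (sketch): callers that will overwrite
+ * an entire block bracket the write so that no read is ever issued:
+ *
+ *	dmu_buf_will_fill(&db->db, tx);		state -> DB_FILL, no read
+ *	bcopy(src, db->db.db_data, db->db.db_size);
+ *	dmu_buf_fill_done(&db->db, tx);		state -> DB_CACHED
+ *
+ * As shown above, dbuf_fill_done() also re-zeroes the buffer if the
+ * block was freed while the fill was in flight (db_freed_in_flight).
+ */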
+
+void
+dmu_buf_write_embedded(dmu_buf_t *dbuf, void *data,
+ bp_embedded_type_t etype, enum zio_compress comp,
+ int uncompressed_size, int compressed_size, int byteorder,
+ dmu_tx_t *tx)
+{
+ dmu_buf_impl_t *db = (dmu_buf_impl_t *)dbuf;
+ struct dirty_leaf *dl;
+ dmu_object_type_t type;
+
+ if (etype == BP_EMBEDDED_TYPE_DATA) {
+ ASSERT(spa_feature_is_active(dmu_objset_spa(db->db_objset),
+ SPA_FEATURE_EMBEDDED_DATA));
+ }
+
+ DB_DNODE_ENTER(db);
+ type = DB_DNODE(db)->dn_type;
+ DB_DNODE_EXIT(db);
+
+ ASSERT0(db->db_level);
+ ASSERT(db->db_blkid != DMU_BONUS_BLKID);
+
+ dmu_buf_will_not_fill(dbuf, tx);
+
+ ASSERT3U(db->db_last_dirty->dr_txg, ==, tx->tx_txg);
+ dl = &db->db_last_dirty->dt.dl;
+ encode_embedded_bp_compressed(&dl->dr_overridden_by,
+ data, comp, uncompressed_size, compressed_size);
+ BPE_SET_ETYPE(&dl->dr_overridden_by, etype);
+ BP_SET_TYPE(&dl->dr_overridden_by, type);
+ BP_SET_LEVEL(&dl->dr_overridden_by, 0);
+ BP_SET_BYTEORDER(&dl->dr_overridden_by, byteorder);
+
+ dl->dr_override_state = DR_OVERRIDDEN;
+ dl->dr_overridden_by.blk_birth = db->db_last_dirty->dr_txg;
+}
+
+/*
+ * Directly assign a provided arc buf to a given dbuf if it's not referenced
+ * by anybody except our caller. Otherwise copy arcbuf's contents to dbuf.
+ */
+void
+dbuf_assign_arcbuf(dmu_buf_impl_t *db, arc_buf_t *buf, dmu_tx_t *tx)
+{
+ ASSERT(!zfs_refcount_is_zero(&db->db_holds));
+ ASSERT(db->db_blkid != DMU_BONUS_BLKID);
+ ASSERT(db->db_level == 0);
+ ASSERT3U(dbuf_is_metadata(db), ==, arc_is_metadata(buf));
+ ASSERT(buf != NULL);
+ ASSERT(arc_buf_lsize(buf) == db->db.db_size);
+ ASSERT(tx->tx_txg != 0);
+
+ arc_return_buf(buf, db);
+ ASSERT(arc_released(buf));
+
+ mutex_enter(&db->db_mtx);
+
+ while (db->db_state == DB_READ || db->db_state == DB_FILL)
+ cv_wait(&db->db_changed, &db->db_mtx);
+
+ ASSERT(db->db_state == DB_CACHED || db->db_state == DB_UNCACHED);
+
+ if (db->db_state == DB_CACHED &&
+ zfs_refcount_count(&db->db_holds) - 1 > db->db_dirtycnt) {
+ mutex_exit(&db->db_mtx);
+ (void) dbuf_dirty(db, tx);
+ bcopy(buf->b_data, db->db.db_data, db->db.db_size);
+ arc_buf_destroy(buf, db);
+ xuio_stat_wbuf_copied();
+ return;
+ }
+
+ xuio_stat_wbuf_nocopy();
+ if (db->db_state == DB_CACHED) {
+ dbuf_dirty_record_t *dr = db->db_last_dirty;
+
+ ASSERT(db->db_buf != NULL);
+ if (dr != NULL && dr->dr_txg == tx->tx_txg) {
+ ASSERT(dr->dt.dl.dr_data == db->db_buf);
+ if (!arc_released(db->db_buf)) {
+ ASSERT(dr->dt.dl.dr_override_state ==
+ DR_OVERRIDDEN);
+ arc_release(db->db_buf, db);
+ }
+ dr->dt.dl.dr_data = buf;
+ arc_buf_destroy(db->db_buf, db);
+ } else if (dr == NULL || dr->dt.dl.dr_data != db->db_buf) {
+ arc_release(db->db_buf, db);
+ arc_buf_destroy(db->db_buf, db);
+ }
+ db->db_buf = NULL;
+ }
+ ASSERT(db->db_buf == NULL);
+ dbuf_set_data(db, buf);
+ db->db_state = DB_FILL;
+ mutex_exit(&db->db_mtx);
+ (void) dbuf_dirty(db, tx);
+ dmu_buf_fill_done(&db->db, tx);
+}
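+
+/*
+ * Sketch of the zero-copy write path this enables, using the dmu-level
+ * wrappers (names assumed from the corresponding interfaces in dmu.c):
+ *
+ *	arc_buf_t *abuf = dmu_request_arcbuf(db_fake, size);
+ *	... fill abuf->b_data directly, e.g. from a uio ...
+ *	dmu_assign_arcbuf(db_fake, offset, abuf, tx);
+ *
+ * If the dbuf is uncontended the loaned arc buf is adopted wholesale;
+ * otherwise its contents are bcopy()ed and the buffer destroyed, per the
+ * two branches above.
+ */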
+
+void
+dbuf_destroy(dmu_buf_impl_t *db)
+{
+ dnode_t *dn;
+ dmu_buf_impl_t *parent = db->db_parent;
+ dmu_buf_impl_t *dndb;
+
+ ASSERT(MUTEX_HELD(&db->db_mtx));
+ ASSERT(zfs_refcount_is_zero(&db->db_holds));
+
+ if (db->db_buf != NULL) {
+ arc_buf_destroy(db->db_buf, db);
+ db->db_buf = NULL;
+ }
+
+ if (db->db_blkid == DMU_BONUS_BLKID) {
+ int slots = DB_DNODE(db)->dn_num_slots;
+ int bonuslen = DN_SLOTS_TO_BONUSLEN(slots);
+ if (db->db.db_data != NULL) {
+ zio_buf_free(db->db.db_data, bonuslen);
+ arc_space_return(bonuslen, ARC_SPACE_BONUS);
+ db->db_state = DB_UNCACHED;
+ }
+ }
+
+ dbuf_clear_data(db);
+
+ if (multilist_link_active(&db->db_cache_link)) {
+ ASSERT(db->db_caching_status == DB_DBUF_CACHE ||
+ db->db_caching_status == DB_DBUF_METADATA_CACHE);
+
+ multilist_remove(dbuf_caches[db->db_caching_status].cache, db);
+ (void) zfs_refcount_remove_many(
+ &dbuf_caches[db->db_caching_status].size,
+ db->db.db_size, db);
+
+ if (db->db_caching_status == DB_DBUF_METADATA_CACHE) {
+ DBUF_STAT_BUMPDOWN(metadata_cache_count);
+ } else {
+ DBUF_STAT_BUMPDOWN(cache_levels[db->db_level]);
+ DBUF_STAT_BUMPDOWN(cache_count);
+ DBUF_STAT_DECR(cache_levels_bytes[db->db_level],
+ db->db.db_size);
+ }
+ db->db_caching_status = DB_NO_CACHE;
+ }
+
+ ASSERT(db->db_state == DB_UNCACHED || db->db_state == DB_NOFILL);
+ ASSERT(db->db_data_pending == NULL);
+
+ db->db_state = DB_EVICTING;
+ db->db_blkptr = NULL;
+
+ /*
+ * Now that db_state is DB_EVICTING, nobody else can find this via
+ * the hash table. We can now drop db_mtx, which allows us to
+ * acquire the dn_dbufs_mtx.
+ */
+ mutex_exit(&db->db_mtx);
+
+ DB_DNODE_ENTER(db);
+ dn = DB_DNODE(db);
+ dndb = dn->dn_dbuf;
+ if (db->db_blkid != DMU_BONUS_BLKID) {
+ boolean_t needlock = !MUTEX_HELD(&dn->dn_dbufs_mtx);
+ if (needlock)
+ mutex_enter(&dn->dn_dbufs_mtx);
+ avl_remove(&dn->dn_dbufs, db);
+ membar_producer();
+ DB_DNODE_EXIT(db);
+ if (needlock)
+ mutex_exit(&dn->dn_dbufs_mtx);
+ /*
+ * Decrementing the dbuf count means that the hold corresponding
+ * to the removed dbuf is no longer discounted in dnode_move(),
+ * so the dnode cannot be moved until after we release the hold.
+ * The membar_producer() ensures visibility of the decremented
+ * value in dnode_move(), since DB_DNODE_EXIT doesn't actually
+ * release any lock.
+ */
+ mutex_enter(&dn->dn_mtx);
+ dnode_rele_and_unlock(dn, db, B_TRUE);
+ db->db_dnode_handle = NULL;
+
+ dbuf_hash_remove(db);
+ } else {
+ DB_DNODE_EXIT(db);
+ }
+
+ ASSERT(zfs_refcount_is_zero(&db->db_holds));
+
+ db->db_parent = NULL;
+
+ ASSERT(db->db_buf == NULL);
+ ASSERT(db->db.db_data == NULL);
+ ASSERT(db->db_hash_next == NULL);
+ ASSERT(db->db_blkptr == NULL);
+ ASSERT(db->db_data_pending == NULL);
+ ASSERT3U(db->db_caching_status, ==, DB_NO_CACHE);
+ ASSERT(!multilist_link_active(&db->db_cache_link));
+
+ kmem_cache_free(dbuf_kmem_cache, db);
+ arc_space_return(sizeof (dmu_buf_impl_t), ARC_SPACE_DBUF);
+
+ /*
+ * If this dbuf is referenced from an indirect dbuf,
+ * decrement the ref count on the indirect dbuf.
+ */
+ if (parent && parent != dndb) {
+ mutex_enter(&parent->db_mtx);
+ dbuf_rele_and_unlock(parent, db, B_TRUE);
+ }
+}
+
+/*
+ * Note: While bpp will always be updated if the function returns success,
+ * parentp will not be updated if the dnode does not have dn_dbuf filled in;
+ * this happens when the dnode is the meta-dnode, or a userused or groupused
+ * object.
+ */
+__attribute__((always_inline))
+static inline int
+dbuf_findbp(dnode_t *dn, int level, uint64_t blkid, int fail_sparse,
+ dmu_buf_impl_t **parentp, blkptr_t **bpp, struct dbuf_hold_impl_data *dh)
+{
+ *parentp = NULL;
+ *bpp = NULL;
+
+ ASSERT(blkid != DMU_BONUS_BLKID);
+
+ if (blkid == DMU_SPILL_BLKID) {
+ mutex_enter(&dn->dn_mtx);
+ if (dn->dn_have_spill &&
+ (dn->dn_phys->dn_flags & DNODE_FLAG_SPILL_BLKPTR))
+ *bpp = DN_SPILL_BLKPTR(dn->dn_phys);
+ else
+ *bpp = NULL;
+ dbuf_add_ref(dn->dn_dbuf, NULL);
+ *parentp = dn->dn_dbuf;
+ mutex_exit(&dn->dn_mtx);
+ return (0);
+ }
+
+ int nlevels =
+ (dn->dn_phys->dn_nlevels == 0) ? 1 : dn->dn_phys->dn_nlevels;
+ int epbs = dn->dn_indblkshift - SPA_BLKPTRSHIFT;
+
+ ASSERT3U(level * epbs, <, 64);
+ ASSERT(RW_LOCK_HELD(&dn->dn_struct_rwlock));
+ /*
+ * This assertion shouldn't trip as long as the max indirect block size
+ * is less than 1M. The reason for this is that up to that point,
+ * the number of levels required to address an entire object with blocks
+ * of size SPA_MINBLOCKSIZE satisfies nlevels * epbs + 1 <= 64. In
+	 * other words, if N * epbs + 1 > 64, then if (N-1) * epbs + 1 <= 55
+ * (i.e. we can address the entire object), objects will all use at most
+ * N-1 levels and the assertion won't overflow. However, once epbs is
+ * 13, 4 * 13 + 1 = 53, but 5 * 13 + 1 = 66. Then, 4 levels will not be
+ * enough to address an entire object, so objects will have 5 levels,
+ * but then this assertion will overflow.
+ *
+ * All this is to say that if we ever increase DN_MAX_INDBLKSHIFT, we
+ * need to redo this logic to handle overflows.
+ */
+ ASSERT(level >= nlevels ||
+ ((nlevels - level - 1) * epbs) +
+ highbit64(dn->dn_phys->dn_nblkptr) <= 64);
+ if (level >= nlevels ||
+ blkid >= ((uint64_t)dn->dn_phys->dn_nblkptr <<
+ ((nlevels - level - 1) * epbs)) ||
+ (fail_sparse &&
+ blkid > (dn->dn_phys->dn_maxblkid >> (level * epbs)))) {
+ /* the buffer has no parent yet */
+ return (SET_ERROR(ENOENT));
+ } else if (level < nlevels-1) {
+ /* this block is referenced from an indirect block */
+ int err;
+ if (dh == NULL) {
+ err = dbuf_hold_impl(dn, level+1,
+ blkid >> epbs, fail_sparse, FALSE, NULL, parentp);
+ } else {
+ __dbuf_hold_impl_init(dh + 1, dn, dh->dh_level + 1,
+ blkid >> epbs, fail_sparse, FALSE, NULL,
+ parentp, dh->dh_depth + 1);
+ err = __dbuf_hold_impl(dh + 1);
+ }
+ if (err)
+ return (err);
+ err = dbuf_read(*parentp, NULL,
+ (DB_RF_HAVESTRUCT | DB_RF_NOPREFETCH | DB_RF_CANFAIL));
+ if (err) {
+ dbuf_rele(*parentp, NULL);
+ *parentp = NULL;
+ return (err);
+ }
+ *bpp = ((blkptr_t *)(*parentp)->db.db_data) +
+ (blkid & ((1ULL << epbs) - 1));
+ if (blkid > (dn->dn_phys->dn_maxblkid >> (level * epbs)))
+ ASSERT(BP_IS_HOLE(*bpp));
+ return (0);
+ } else {
+ /* the block is referenced from the dnode */
+ ASSERT3U(level, ==, nlevels-1);
+ ASSERT(dn->dn_phys->dn_nblkptr == 0 ||
+ blkid < dn->dn_phys->dn_nblkptr);
+ if (dn->dn_dbuf) {
+ dbuf_add_ref(dn->dn_dbuf, NULL);
+ *parentp = dn->dn_dbuf;
+ }
+ *bpp = &dn->dn_phys->dn_blkptr[blkid];
+ return (0);
+ }
+}
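+
+/*
+ * Worked example of the epbs arithmetic above: with 128K indirect blocks
+ * (dn_indblkshift = 17) and 128-byte block pointers (SPA_BLKPTRSHIFT = 7),
+ * epbs = 17 - 7 = 10, i.e. 1024 blkptrs per indirect block. The parent of
+ * level-0 block 1500000 is then level-1 block 1500000 >> 10 = 1464, and
+ * its bp sits at index 1500000 & 1023 = 864 within that parent.
+ */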
+
+static dmu_buf_impl_t *
+dbuf_create(dnode_t *dn, uint8_t level, uint64_t blkid,
+ dmu_buf_impl_t *parent, blkptr_t *blkptr)
+{
+ objset_t *os = dn->dn_objset;
+ dmu_buf_impl_t *db, *odb;
+
+ ASSERT(RW_LOCK_HELD(&dn->dn_struct_rwlock));
+ ASSERT(dn->dn_type != DMU_OT_NONE);
+
+ db = kmem_cache_alloc(dbuf_kmem_cache, KM_SLEEP);
+
+ db->db_objset = os;
+ db->db.db_object = dn->dn_object;
+ db->db_level = level;
+ db->db_blkid = blkid;
+ db->db_last_dirty = NULL;
+ db->db_dirtycnt = 0;
+ db->db_dnode_handle = dn->dn_handle;
+ db->db_parent = parent;
+ db->db_blkptr = blkptr;
+
+ db->db_user = NULL;
+ db->db_user_immediate_evict = FALSE;
+ db->db_freed_in_flight = FALSE;
+ db->db_pending_evict = FALSE;
+
+ if (blkid == DMU_BONUS_BLKID) {
+ ASSERT3P(parent, ==, dn->dn_dbuf);
+ db->db.db_size = DN_SLOTS_TO_BONUSLEN(dn->dn_num_slots) -
+ (dn->dn_nblkptr-1) * sizeof (blkptr_t);
+ ASSERT3U(db->db.db_size, >=, dn->dn_bonuslen);
+ db->db.db_offset = DMU_BONUS_BLKID;
+ db->db_state = DB_UNCACHED;
+ db->db_caching_status = DB_NO_CACHE;
+ /* the bonus dbuf is not placed in the hash table */
+ arc_space_consume(sizeof (dmu_buf_impl_t), ARC_SPACE_DBUF);
+ return (db);
+ } else if (blkid == DMU_SPILL_BLKID) {
+ db->db.db_size = (blkptr != NULL) ?
+ BP_GET_LSIZE(blkptr) : SPA_MINBLOCKSIZE;
+ db->db.db_offset = 0;
+ } else {
+ int blocksize =
+ db->db_level ? 1 << dn->dn_indblkshift : dn->dn_datablksz;
+ db->db.db_size = blocksize;
+ db->db.db_offset = db->db_blkid * blocksize;
+ }
+
+ /*
+	 * Hold the dn_dbufs_mtx while we add the new dbuf
+	 * to the hash table *and* the dn_dbufs list.
+	 * This prevents a possible deadlock with someone
+	 * trying to look up this dbuf before it's added to the
+	 * dn_dbufs list.
+ */
+ mutex_enter(&dn->dn_dbufs_mtx);
+ db->db_state = DB_EVICTING;
+ if ((odb = dbuf_hash_insert(db)) != NULL) {
+ /* someone else inserted it first */
+ kmem_cache_free(dbuf_kmem_cache, db);
+ mutex_exit(&dn->dn_dbufs_mtx);
+ DBUF_STAT_BUMP(hash_insert_race);
+ return (odb);
+ }
+ avl_add(&dn->dn_dbufs, db);
+
+ db->db_state = DB_UNCACHED;
+ db->db_caching_status = DB_NO_CACHE;
+ mutex_exit(&dn->dn_dbufs_mtx);
+ arc_space_consume(sizeof (dmu_buf_impl_t), ARC_SPACE_DBUF);
+
+ if (parent && parent != dn->dn_dbuf)
+ dbuf_add_ref(parent, db);
+
+ ASSERT(dn->dn_object == DMU_META_DNODE_OBJECT ||
+ zfs_refcount_count(&dn->dn_holds) > 0);
+ (void) zfs_refcount_add(&dn->dn_holds, db);
+
+ dprintf_dbuf(db, "db=%p\n", db);
+
+ return (db);
+}
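+
+/*
+ * Example of the size/offset computation in dbuf_create(): for a level-0
+ * dbuf with dn_datablksz = 131072, blkid 3 yields db_size = 131072 and
+ * db_offset = 3 * 131072 = 393216; a level-1 dbuf instead uses the
+ * indirect block size, 1 << dn_indblkshift, regardless of dn_datablksz.
+ */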
+
+typedef struct dbuf_prefetch_arg {
+ spa_t *dpa_spa; /* The spa to issue the prefetch in. */
+ zbookmark_phys_t dpa_zb; /* The target block to prefetch. */
+ int dpa_epbs; /* Entries (blkptr_t's) Per Block Shift. */
+ int dpa_curlevel; /* The current level that we're reading */
+ dnode_t *dpa_dnode; /* The dnode associated with the prefetch */
+ zio_priority_t dpa_prio; /* The priority I/Os should be issued at. */
+ zio_t *dpa_zio; /* The parent zio_t for all prefetches. */
+ arc_flags_t dpa_aflags; /* Flags to pass to the final prefetch. */
+} dbuf_prefetch_arg_t;
+
+/*
+ * Actually issue the prefetch read for the block given.
+ */
+static void
+dbuf_issue_final_prefetch(dbuf_prefetch_arg_t *dpa, blkptr_t *bp)
+{
+ if (BP_IS_HOLE(bp) || BP_IS_EMBEDDED(bp))
+ return;
+
+ arc_flags_t aflags =
+ dpa->dpa_aflags | ARC_FLAG_NOWAIT | ARC_FLAG_PREFETCH;
+
+ ASSERT3U(dpa->dpa_curlevel, ==, BP_GET_LEVEL(bp));
+ ASSERT3U(dpa->dpa_curlevel, ==, dpa->dpa_zb.zb_level);
+ ASSERT(dpa->dpa_zio != NULL);
+ (void) arc_read(dpa->dpa_zio, dpa->dpa_spa, bp, NULL, NULL,
+ dpa->dpa_prio, ZIO_FLAG_CANFAIL | ZIO_FLAG_SPECULATIVE,
+ &aflags, &dpa->dpa_zb);
+}
+
+/*
+ * Called when an indirect block above our prefetch target is read in. This
+ * will either read in the next indirect block down the tree or issue the actual
+ * prefetch if the next block down is our target.
+ */
+static void
+dbuf_prefetch_indirect_done(zio_t *zio, const zbookmark_phys_t *zb,
+ const blkptr_t *iobp, arc_buf_t *abuf, void *private)
+{
+ dbuf_prefetch_arg_t *dpa = private;
+
+ ASSERT3S(dpa->dpa_zb.zb_level, <, dpa->dpa_curlevel);
+ ASSERT3S(dpa->dpa_curlevel, >, 0);
+
+ if (abuf == NULL) {
+ ASSERT(zio == NULL || zio->io_error != 0);
+ kmem_free(dpa, sizeof (*dpa));
+ return;
+ }
+ ASSERT(zio == NULL || zio->io_error == 0);
+
+ /*
+ * The dpa_dnode is only valid if we are called with a NULL
+ * zio. This indicates that the arc_read() returned without
+ * first calling zio_read() to issue a physical read. Once
+ * a physical read is made the dpa_dnode must be invalidated
+ * as the locks guarding it may have been dropped. If the
+ * dpa_dnode is still valid, then we want to add it to the dbuf
+ * cache. To do so, we must hold the dbuf associated with the block
+ * we just prefetched, read its contents so that we associate it
+ * with an arc_buf_t, and then release it.
+ */
+ if (zio != NULL) {
+ ASSERT3S(BP_GET_LEVEL(zio->io_bp), ==, dpa->dpa_curlevel);
+ if (zio->io_flags & ZIO_FLAG_RAW) {
+ ASSERT3U(BP_GET_PSIZE(zio->io_bp), ==, zio->io_size);
+ } else {
+ ASSERT3U(BP_GET_LSIZE(zio->io_bp), ==, zio->io_size);
+ }
+ ASSERT3P(zio->io_spa, ==, dpa->dpa_spa);
+
+ dpa->dpa_dnode = NULL;
+ } else if (dpa->dpa_dnode != NULL) {
+ uint64_t curblkid = dpa->dpa_zb.zb_blkid >>
+ (dpa->dpa_epbs * (dpa->dpa_curlevel -
+ dpa->dpa_zb.zb_level));
+ dmu_buf_impl_t *db = dbuf_hold_level(dpa->dpa_dnode,
+ dpa->dpa_curlevel, curblkid, FTAG);
+ (void) dbuf_read(db, NULL,
+ DB_RF_MUST_SUCCEED | DB_RF_NOPREFETCH | DB_RF_HAVESTRUCT);
+ dbuf_rele(db, FTAG);
+ }
+
+ dpa->dpa_curlevel--;
+
+ uint64_t nextblkid = dpa->dpa_zb.zb_blkid >>
+ (dpa->dpa_epbs * (dpa->dpa_curlevel - dpa->dpa_zb.zb_level));
+ blkptr_t *bp = ((blkptr_t *)abuf->b_data) +
+ P2PHASE(nextblkid, 1ULL << dpa->dpa_epbs);
+ if (BP_IS_HOLE(bp)) {
+ kmem_free(dpa, sizeof (*dpa));
+ } else if (dpa->dpa_curlevel == dpa->dpa_zb.zb_level) {
+ ASSERT3U(nextblkid, ==, dpa->dpa_zb.zb_blkid);
+ dbuf_issue_final_prefetch(dpa, bp);
+ kmem_free(dpa, sizeof (*dpa));
+ } else {
+ arc_flags_t iter_aflags = ARC_FLAG_NOWAIT;
+ zbookmark_phys_t zb;
+
+ /* flag if L2ARC eligible, l2arc_noprefetch then decides */
+ if (dpa->dpa_aflags & ARC_FLAG_L2CACHE)
+ iter_aflags |= ARC_FLAG_L2CACHE;
+
+ ASSERT3U(dpa->dpa_curlevel, ==, BP_GET_LEVEL(bp));
+
+ SET_BOOKMARK(&zb, dpa->dpa_zb.zb_objset,
+ dpa->dpa_zb.zb_object, dpa->dpa_curlevel, nextblkid);
+
+ (void) arc_read(dpa->dpa_zio, dpa->dpa_spa,
+ bp, dbuf_prefetch_indirect_done, dpa, dpa->dpa_prio,
+ ZIO_FLAG_CANFAIL | ZIO_FLAG_SPECULATIVE,
+ &iter_aflags, &zb);
+ }
+
+ arc_buf_destroy(abuf, private);
+}
+
+/*
+ * Issue prefetch reads for the given block on the given level. If the indirect
+ * blocks above that block are not in memory, we will read them in
+ * asynchronously. As a result, this call never blocks waiting for a read to
+ * complete.
+ */
+void
+dbuf_prefetch(dnode_t *dn, int64_t level, uint64_t blkid, zio_priority_t prio,
+ arc_flags_t aflags)
+{
+ blkptr_t bp;
+ int epbs, nlevels, curlevel;
+ uint64_t curblkid;
+
+ ASSERT(blkid != DMU_BONUS_BLKID);
+ ASSERT(RW_LOCK_HELD(&dn->dn_struct_rwlock));
+
+ if (blkid > dn->dn_maxblkid)
+ return;
+
+ if (dnode_block_freed(dn, blkid))
+ return;
+
+ /*
+ * This dnode hasn't been written to disk yet, so there's nothing to
+ * prefetch.
+ */
+ nlevels = dn->dn_phys->dn_nlevels;
+ if (level >= nlevels || dn->dn_phys->dn_nblkptr == 0)
+ return;
+
+ epbs = dn->dn_phys->dn_indblkshift - SPA_BLKPTRSHIFT;
+ if (dn->dn_phys->dn_maxblkid < blkid << (epbs * level))
+ return;
+
+ dmu_buf_impl_t *db = dbuf_find(dn->dn_objset, dn->dn_object,
+ level, blkid);
+ if (db != NULL) {
+ mutex_exit(&db->db_mtx);
+ /*
+ * This dbuf already exists. It is either CACHED, or
+ * (we assume) about to be read or filled.
+ */
+ return;
+ }
+
+ /*
+ * Find the closest ancestor (indirect block) of the target block
+ * that is present in the cache. In this indirect block, we will
+ * find the bp that is at curlevel, curblkid.
+ */
+ curlevel = level;
+ curblkid = blkid;
+ while (curlevel < nlevels - 1) {
+ int parent_level = curlevel + 1;
+ uint64_t parent_blkid = curblkid >> epbs;
+ dmu_buf_impl_t *db;
+
+ if (dbuf_hold_impl(dn, parent_level, parent_blkid,
+ FALSE, TRUE, FTAG, &db) == 0) {
+ blkptr_t *bpp = db->db_buf->b_data;
+ bp = bpp[P2PHASE(curblkid, 1 << epbs)];
+ dbuf_rele(db, FTAG);
+ break;
+ }
+
+ curlevel = parent_level;
+ curblkid = parent_blkid;
+ }
+
+ if (curlevel == nlevels - 1) {
+ /* No cached indirect blocks found. */
+ ASSERT3U(curblkid, <, dn->dn_phys->dn_nblkptr);
+ bp = dn->dn_phys->dn_blkptr[curblkid];
+ }
+ if (BP_IS_HOLE(&bp))
+ return;
+
+ ASSERT3U(curlevel, ==, BP_GET_LEVEL(&bp));
+
+ zio_t *pio = zio_root(dmu_objset_spa(dn->dn_objset), NULL, NULL,
+ ZIO_FLAG_CANFAIL);
+
+ dbuf_prefetch_arg_t *dpa = kmem_zalloc(sizeof (*dpa), KM_SLEEP);
+ dsl_dataset_t *ds = dn->dn_objset->os_dsl_dataset;
+ SET_BOOKMARK(&dpa->dpa_zb, ds != NULL ? ds->ds_object : DMU_META_OBJSET,
+ dn->dn_object, level, blkid);
+ dpa->dpa_curlevel = curlevel;
+ dpa->dpa_prio = prio;
+ dpa->dpa_aflags = aflags;
+ dpa->dpa_spa = dn->dn_objset->os_spa;
+ dpa->dpa_dnode = dn;
+ dpa->dpa_epbs = epbs;
+ dpa->dpa_zio = pio;
+
+ /* flag if L2ARC eligible, l2arc_noprefetch then decides */
+ if (DNODE_LEVEL_IS_L2CACHEABLE(dn, level))
+ dpa->dpa_aflags |= ARC_FLAG_L2CACHE;
+
+ /*
+ * If we have the indirect just above us, no need to do the asynchronous
+ * prefetch chain; we'll just run the last step ourselves. If we're at
+ * a higher level, though, we want to issue the prefetches for all the
+ * indirect blocks asynchronously, so we can go on with whatever we were
+ * doing.
+ */
+ if (curlevel == level) {
+ ASSERT3U(curblkid, ==, blkid);
+ dbuf_issue_final_prefetch(dpa, &bp);
+ kmem_free(dpa, sizeof (*dpa));
+ } else {
+ arc_flags_t iter_aflags = ARC_FLAG_NOWAIT;
+ zbookmark_phys_t zb;
+
+ /* flag if L2ARC eligible, l2arc_noprefetch then decides */
+ if (DNODE_LEVEL_IS_L2CACHEABLE(dn, level))
+ iter_aflags |= ARC_FLAG_L2CACHE;
+
+ SET_BOOKMARK(&zb, ds != NULL ? ds->ds_object : DMU_META_OBJSET,
+ dn->dn_object, curlevel, curblkid);
+ (void) arc_read(dpa->dpa_zio, dpa->dpa_spa,
+ &bp, dbuf_prefetch_indirect_done, dpa, prio,
+ ZIO_FLAG_CANFAIL | ZIO_FLAG_SPECULATIVE,
+ &iter_aflags, &zb);
+ }
+ /*
+ * We use pio here instead of dpa_zio since it's possible that
+ * dpa may have already been freed.
+ */
+ zio_nowait(pio);
+}
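+
+/*
+ * A minimal prefetch sketch (the caller must hold dn_struct_rwlock, per
+ * the ASSERT above; the flag and priority choices follow the usual
+ * zfetch usage and are illustrative):
+ *
+ *	rw_enter(&dn->dn_struct_rwlock, RW_READER);
+ *	dbuf_prefetch(dn, 0, blkid, ZIO_PRIORITY_ASYNC_READ,
+ *	    ARC_FLAG_PREDICTIVE_PREFETCH);
+ *	rw_exit(&dn->dn_struct_rwlock);
+ *
+ * The call returns immediately; missing indirects are read in through the
+ * dbuf_prefetch_indirect_done() chain.
+ */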
+
+#define DBUF_HOLD_IMPL_MAX_DEPTH 20
+
+/*
+ * Helper function for __dbuf_hold_impl() to copy a buffer. Handles the
+ * cases of compressed and uncompressed buffers by allocating the new
+ * buffer with arc_alloc_compressed_buf() or arc_alloc_buf(), respectively.
+ *
+ * NOTE: Declared noinline to avoid stack bloat in __dbuf_hold_impl().
+ */
+noinline static void
+dbuf_hold_copy(struct dbuf_hold_impl_data *dh)
+{
+ dnode_t *dn = dh->dh_dn;
+ dmu_buf_impl_t *db = dh->dh_db;
+ dbuf_dirty_record_t *dr = dh->dh_dr;
+ arc_buf_t *data = dr->dt.dl.dr_data;
+
+ enum zio_compress compress_type = arc_get_compression(data);
+
+ if (compress_type != ZIO_COMPRESS_OFF) {
+ dbuf_set_data(db, arc_alloc_compressed_buf(
+ dn->dn_objset->os_spa, db, arc_buf_size(data),
+ arc_buf_lsize(data), compress_type));
+ } else {
+ dbuf_set_data(db, arc_alloc_buf(dn->dn_objset->os_spa, db,
+ DBUF_GET_BUFC_TYPE(db), db->db.db_size));
+ }
+
+ bcopy(data->b_data, db->db.db_data, arc_buf_size(data));
+}
+
+/*
+ * Returns with db_holds incremented, and db_mtx not held.
+ * Note: dn_struct_rwlock must be held.
+ */
+static int
+__dbuf_hold_impl(struct dbuf_hold_impl_data *dh)
+{
+ ASSERT3S(dh->dh_depth, <, DBUF_HOLD_IMPL_MAX_DEPTH);
+ dh->dh_parent = NULL;
+
+ ASSERT(dh->dh_blkid != DMU_BONUS_BLKID);
+ ASSERT(RW_LOCK_HELD(&dh->dh_dn->dn_struct_rwlock));
+ ASSERT3U(dh->dh_dn->dn_nlevels, >, dh->dh_level);
+
+ *(dh->dh_dbp) = NULL;
+
+ /* dbuf_find() returns with db_mtx held */
+ dh->dh_db = dbuf_find(dh->dh_dn->dn_objset, dh->dh_dn->dn_object,
+ dh->dh_level, dh->dh_blkid);
+
+ if (dh->dh_db == NULL) {
+ dh->dh_bp = NULL;
+
+ if (dh->dh_fail_uncached)
+ return (SET_ERROR(ENOENT));
+
+ ASSERT3P(dh->dh_parent, ==, NULL);
+ dh->dh_err = dbuf_findbp(dh->dh_dn, dh->dh_level, dh->dh_blkid,
+ dh->dh_fail_sparse, &dh->dh_parent, &dh->dh_bp, dh);
+ if (dh->dh_fail_sparse) {
+ if (dh->dh_err == 0 &&
+ dh->dh_bp && BP_IS_HOLE(dh->dh_bp))
+ dh->dh_err = SET_ERROR(ENOENT);
+ if (dh->dh_err) {
+ if (dh->dh_parent)
+ dbuf_rele(dh->dh_parent, NULL);
+ return (dh->dh_err);
+ }
+ }
+ if (dh->dh_err && dh->dh_err != ENOENT)
+ return (dh->dh_err);
+ dh->dh_db = dbuf_create(dh->dh_dn, dh->dh_level, dh->dh_blkid,
+ dh->dh_parent, dh->dh_bp);
+ }
+
+ if (dh->dh_fail_uncached && dh->dh_db->db_state != DB_CACHED) {
+ mutex_exit(&dh->dh_db->db_mtx);
+ return (SET_ERROR(ENOENT));
+ }
+
+ if (dh->dh_db->db_buf != NULL) {
+ arc_buf_access(dh->dh_db->db_buf);
+ ASSERT3P(dh->dh_db->db.db_data, ==, dh->dh_db->db_buf->b_data);
+ }
+
+ ASSERT(dh->dh_db->db_buf == NULL || arc_referenced(dh->dh_db->db_buf));
+
+ /*
+	 * If this buffer is currently syncing out, and we are
+ * still referencing it from db_data, we need to make a copy
+ * of it in case we decide we want to dirty it again in this txg.
+ */
+ if (dh->dh_db->db_level == 0 &&
+ dh->dh_db->db_blkid != DMU_BONUS_BLKID &&
+ dh->dh_dn->dn_object != DMU_META_DNODE_OBJECT &&
+ dh->dh_db->db_state == DB_CACHED && dh->dh_db->db_data_pending) {
+ dh->dh_dr = dh->dh_db->db_data_pending;
+ if (dh->dh_dr->dt.dl.dr_data == dh->dh_db->db_buf)
+ dbuf_hold_copy(dh);
+ }
+
+ if (multilist_link_active(&dh->dh_db->db_cache_link)) {
+ ASSERT(zfs_refcount_is_zero(&dh->dh_db->db_holds));
+ ASSERT(dh->dh_db->db_caching_status == DB_DBUF_CACHE ||
+ dh->dh_db->db_caching_status == DB_DBUF_METADATA_CACHE);
+
+ multilist_remove(
+ dbuf_caches[dh->dh_db->db_caching_status].cache,
+ dh->dh_db);
+ (void) zfs_refcount_remove_many(
+ &dbuf_caches[dh->dh_db->db_caching_status].size,
+ dh->dh_db->db.db_size, dh->dh_db);
+
+ if (dh->dh_db->db_caching_status == DB_DBUF_METADATA_CACHE) {
+ DBUF_STAT_BUMPDOWN(metadata_cache_count);
+ } else {
+ DBUF_STAT_BUMPDOWN(cache_levels[dh->dh_db->db_level]);
+ DBUF_STAT_BUMPDOWN(cache_count);
+ DBUF_STAT_DECR(cache_levels_bytes[dh->dh_db->db_level],
+ dh->dh_db->db.db_size);
+ }
+ dh->dh_db->db_caching_status = DB_NO_CACHE;
+ }
+ (void) zfs_refcount_add(&dh->dh_db->db_holds, dh->dh_tag);
+ DBUF_VERIFY(dh->dh_db);
+ mutex_exit(&dh->dh_db->db_mtx);
+
+ /* NOTE: we can't rele the parent until after we drop the db_mtx */
+ if (dh->dh_parent)
+ dbuf_rele(dh->dh_parent, NULL);
+
+ ASSERT3P(DB_DNODE(dh->dh_db), ==, dh->dh_dn);
+ ASSERT3U(dh->dh_db->db_blkid, ==, dh->dh_blkid);
+ ASSERT3U(dh->dh_db->db_level, ==, dh->dh_level);
+ *(dh->dh_dbp) = dh->dh_db;
+
+ return (0);
+}
+
+/*
+ * The following code preserves the recursive function dbuf_hold_impl()
+ * but moves the local variables AND function arguments to the heap to
+ * minimize the stack frame size. Enough space for 20 levels of recursion
+ * is allocated up front in a single heap allocation.
+ */
+int
+dbuf_hold_impl(dnode_t *dn, uint8_t level, uint64_t blkid,
+ boolean_t fail_sparse, boolean_t fail_uncached,
+ void *tag, dmu_buf_impl_t **dbp)
+{
+ struct dbuf_hold_impl_data *dh;
+ int error;
+
+ dh = kmem_alloc(sizeof (struct dbuf_hold_impl_data) *
+ DBUF_HOLD_IMPL_MAX_DEPTH, KM_SLEEP);
+ __dbuf_hold_impl_init(dh, dn, level, blkid, fail_sparse,
+ fail_uncached, tag, dbp, 0);
+
+ error = __dbuf_hold_impl(dh);
+
+ kmem_free(dh, sizeof (struct dbuf_hold_impl_data) *
+ DBUF_HOLD_IMPL_MAX_DEPTH);
+
+ return (error);
+}
+
+static void
+__dbuf_hold_impl_init(struct dbuf_hold_impl_data *dh,
+ dnode_t *dn, uint8_t level, uint64_t blkid,
+ boolean_t fail_sparse, boolean_t fail_uncached,
+ void *tag, dmu_buf_impl_t **dbp, int depth)
+{
+ dh->dh_dn = dn;
+ dh->dh_level = level;
+ dh->dh_blkid = blkid;
+
+ dh->dh_fail_sparse = fail_sparse;
+ dh->dh_fail_uncached = fail_uncached;
+
+ dh->dh_tag = tag;
+ dh->dh_dbp = dbp;
+
+ dh->dh_db = NULL;
+ dh->dh_parent = NULL;
+ dh->dh_bp = NULL;
+ dh->dh_err = 0;
+ dh->dh_dr = NULL;
+
+ dh->dh_depth = depth;
+}
+
+dmu_buf_impl_t *
+dbuf_hold(dnode_t *dn, uint64_t blkid, void *tag)
+{
+ return (dbuf_hold_level(dn, 0, blkid, tag));
+}
+
+dmu_buf_impl_t *
+dbuf_hold_level(dnode_t *dn, int level, uint64_t blkid, void *tag)
+{
+ dmu_buf_impl_t *db;
+ int err = dbuf_hold_impl(dn, level, blkid, FALSE, FALSE, tag, &db);
+ return (err ? NULL : db);
+}
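+
+/*
+ * Hold/release discipline for the functions above (sketch): every
+ * successful dbuf_hold() must be paired with a dbuf_rele() under the
+ * same tag:
+ *
+ *	dmu_buf_impl_t *db = dbuf_hold(dn, blkid, FTAG);
+ *	if (db != NULL) {
+ *		... use db->db.db_data ...
+ *		dbuf_rele(db, FTAG);
+ *	}
+ *
+ * dbuf_hold_level() returns NULL on error, so the NULL check stands in
+ * for error handling here.
+ */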
+
+void
+dbuf_create_bonus(dnode_t *dn)
+{
+ ASSERT(RW_WRITE_HELD(&dn->dn_struct_rwlock));
+
+ ASSERT(dn->dn_bonus == NULL);
+ dn->dn_bonus = dbuf_create(dn, 0, DMU_BONUS_BLKID, dn->dn_dbuf, NULL);
+}
+
+int
+dbuf_spill_set_blksz(dmu_buf_t *db_fake, uint64_t blksz, dmu_tx_t *tx)
+{
+ dmu_buf_impl_t *db = (dmu_buf_impl_t *)db_fake;
+ dnode_t *dn;
+
+ if (db->db_blkid != DMU_SPILL_BLKID)
+ return (SET_ERROR(ENOTSUP));
+ if (blksz == 0)
+ blksz = SPA_MINBLOCKSIZE;
+ ASSERT3U(blksz, <=, spa_maxblocksize(dmu_objset_spa(db->db_objset)));
+ blksz = P2ROUNDUP(blksz, SPA_MINBLOCKSIZE);
+
+ DB_DNODE_ENTER(db);
+ dn = DB_DNODE(db);
+ rw_enter(&dn->dn_struct_rwlock, RW_WRITER);
+ dbuf_new_size(db, blksz, tx);
+ rw_exit(&dn->dn_struct_rwlock);
+ DB_DNODE_EXIT(db);
+
+ return (0);
+}
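+
+/*
+ * Example of the rounding above: a requested blksz of 600 bytes becomes
+ * P2ROUNDUP(600, SPA_MINBLOCKSIZE) = 1024 with the usual 512-byte
+ * SPA_MINBLOCKSIZE, and a blksz of 0 is treated as SPA_MINBLOCKSIZE.
+ */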
+
+void
+dbuf_rm_spill(dnode_t *dn, dmu_tx_t *tx)
+{
+ dbuf_free_range(dn, DMU_SPILL_BLKID, DMU_SPILL_BLKID, tx);
+}
+
+#pragma weak dmu_buf_add_ref = dbuf_add_ref
+void
+dbuf_add_ref(dmu_buf_impl_t *db, void *tag)
+{
+ int64_t holds = zfs_refcount_add(&db->db_holds, tag);
+ ASSERT3S(holds, >, 1);
+}
+
+#pragma weak dmu_buf_try_add_ref = dbuf_try_add_ref
+boolean_t
+dbuf_try_add_ref(dmu_buf_t *db_fake, objset_t *os, uint64_t obj, uint64_t blkid,
+ void *tag)
+{
+ dmu_buf_impl_t *db = (dmu_buf_impl_t *)db_fake;
+ dmu_buf_impl_t *found_db;
+ boolean_t result = B_FALSE;
+
+ if (db->db_blkid == DMU_BONUS_BLKID)
+ found_db = dbuf_find_bonus(os, obj);
+ else
+ found_db = dbuf_find(os, obj, 0, blkid);
+
+ if (found_db != NULL) {
+ if (db == found_db && dbuf_refcount(db) > db->db_dirtycnt) {
+ (void) zfs_refcount_add(&db->db_holds, tag);
+ result = B_TRUE;
+ }
+ mutex_exit(&db->db_mtx);
+ }
+ return (result);
+}
+
+/*
+ * If you call dbuf_rele() you had better not be referencing the dnode handle
+ * unless you have some other direct or indirect hold on the dnode. (An indirect
+ * hold is a hold on one of the dnode's dbufs, including the bonus buffer.)
+ * Without that, the dbuf_rele() could lead to a dnode_rele() followed by the
+ * dnode's parent dbuf evicting its dnode handles.
+ */
+void
+dbuf_rele(dmu_buf_impl_t *db, void *tag)
+{
+ mutex_enter(&db->db_mtx);
+ dbuf_rele_and_unlock(db, tag, B_FALSE);
+}
+
+void
+dmu_buf_rele(dmu_buf_t *db, void *tag)
+{
+ dbuf_rele((dmu_buf_impl_t *)db, tag);
+}
+
+/*
+ * dbuf_rele() for an already-locked dbuf. This is necessary to allow
+ * db_dirtycnt and db_holds to be updated atomically. The 'evicting'
+ * argument should be set if we are already in the dbuf-evicting code
+ * path, in which case we don't want to recursively evict. This allows us to
+ * avoid deeply nested stacks that would have a call flow similar to this:
+ *
+ * dbuf_rele()-->dbuf_rele_and_unlock()-->dbuf_evict_notify()
+ * ^ |
+ * | |
+ * +-----dbuf_destroy()<--dbuf_evict_one()<--------+
+ *
+ */
+void
+dbuf_rele_and_unlock(dmu_buf_impl_t *db, void *tag, boolean_t evicting)
+{
+ int64_t holds;
+ uint64_t size;
+
+ ASSERT(MUTEX_HELD(&db->db_mtx));
+ DBUF_VERIFY(db);
+
+ /*
+ * Remove the reference to the dbuf before removing its hold on the
+ * dnode so we can guarantee in dnode_move() that a referenced bonus
+ * buffer has a corresponding dnode hold.
+ */
+ holds = zfs_refcount_remove(&db->db_holds, tag);
+ ASSERT(holds >= 0);
+
+ /*
+ * We can't freeze indirects if there is a possibility that they
+ * may be modified in the current syncing context.
+ */
+ if (db->db_buf != NULL &&
+ holds == (db->db_level == 0 ? db->db_dirtycnt : 0)) {
+ arc_buf_freeze(db->db_buf);
+ }
+
+ if (holds == db->db_dirtycnt &&
+ db->db_level == 0 && db->db_user_immediate_evict)
+ dbuf_evict_user(db);
+
+ if (holds == 0) {
+ if (db->db_blkid == DMU_BONUS_BLKID) {
+ dnode_t *dn;
+ boolean_t evict_dbuf = db->db_pending_evict;
+
+ /*
+ * If the dnode moves here, we cannot cross this
+ * barrier until the move completes.
+ */
+ DB_DNODE_ENTER(db);
+
+ dn = DB_DNODE(db);
+ atomic_dec_32(&dn->dn_dbufs_count);
+
+ /*
+ * Decrementing the dbuf count means that the bonus
+ * buffer's dnode hold is no longer discounted in
+ * dnode_move(). The dnode cannot move until after
+ * the dnode_rele() below.
+ */
+ DB_DNODE_EXIT(db);
+
+ /*
+ * Do not reference db after its lock is dropped.
+ * Another thread may evict it.
+ */
+ mutex_exit(&db->db_mtx);
+
+ if (evict_dbuf)
+ dnode_evict_bonus(dn);
+
+ dnode_rele(dn, db);
+ } else if (db->db_buf == NULL) {
+ /*
+ * This is a special case: we never associated this
+ * dbuf with any data allocated from the ARC.
+ */
+ ASSERT(db->db_state == DB_UNCACHED ||
+ db->db_state == DB_NOFILL);
+ dbuf_destroy(db);
+ } else if (arc_released(db->db_buf)) {
+ /*
+ * This dbuf has anonymous data associated with it.
+ */
+ dbuf_destroy(db);
+ } else {
+ boolean_t do_arc_evict = B_FALSE;
+ blkptr_t bp;
+ spa_t *spa = dmu_objset_spa(db->db_objset);
+
+ if (!DBUF_IS_CACHEABLE(db) &&
+ db->db_blkptr != NULL &&
+ !BP_IS_HOLE(db->db_blkptr) &&
+ !BP_IS_EMBEDDED(db->db_blkptr)) {
+ do_arc_evict = B_TRUE;
+ bp = *db->db_blkptr;
+ }
+
+ if (!DBUF_IS_CACHEABLE(db) ||
+ db->db_pending_evict) {
+ dbuf_destroy(db);
+ } else if (!multilist_link_active(&db->db_cache_link)) {
+ ASSERT3U(db->db_caching_status, ==,
+ DB_NO_CACHE);
+
+ dbuf_cached_state_t dcs =
+ dbuf_include_in_metadata_cache(db) ?
+ DB_DBUF_METADATA_CACHE : DB_DBUF_CACHE;
+ db->db_caching_status = dcs;
+
+ multilist_insert(dbuf_caches[dcs].cache, db);
+ size = zfs_refcount_add_many(
+ &dbuf_caches[dcs].size, db->db.db_size, db);
+
+ if (dcs == DB_DBUF_METADATA_CACHE) {
+ DBUF_STAT_BUMP(metadata_cache_count);
+ DBUF_STAT_MAX(
+ metadata_cache_size_bytes_max,
+ size);
+ } else {
+ DBUF_STAT_BUMP(
+ cache_levels[db->db_level]);
+ DBUF_STAT_BUMP(cache_count);
+ DBUF_STAT_INCR(
+ cache_levels_bytes[db->db_level],
+ db->db.db_size);
+ DBUF_STAT_MAX(cache_size_bytes_max,
+ size);
+ }
+ mutex_exit(&db->db_mtx);
+
+ if (dcs == DB_DBUF_CACHE && !evicting)
+ dbuf_evict_notify(size);
+ }
+
+ if (do_arc_evict)
+ arc_freed(spa, &bp);
+ }
+ } else {
+ mutex_exit(&db->db_mtx);
+ }
+}
+
+#pragma weak dmu_buf_refcount = dbuf_refcount
+uint64_t
+dbuf_refcount(dmu_buf_impl_t *db)
+{
+ return (zfs_refcount_count(&db->db_holds));
+}
+
+void *
+dmu_buf_replace_user(dmu_buf_t *db_fake, dmu_buf_user_t *old_user,
+ dmu_buf_user_t *new_user)
+{
+ dmu_buf_impl_t *db = (dmu_buf_impl_t *)db_fake;
+
+ mutex_enter(&db->db_mtx);
+ dbuf_verify_user(db, DBVU_NOT_EVICTING);
+ if (db->db_user == old_user)
+ db->db_user = new_user;
+ else
+ old_user = db->db_user;
+ dbuf_verify_user(db, DBVU_NOT_EVICTING);
+ mutex_exit(&db->db_mtx);
+
+ return (old_user);
+}
+
+void *
+dmu_buf_set_user(dmu_buf_t *db_fake, dmu_buf_user_t *user)
+{
+ return (dmu_buf_replace_user(db_fake, NULL, user));
+}
+
+void *
+dmu_buf_set_user_ie(dmu_buf_t *db_fake, dmu_buf_user_t *user)
+{
+ dmu_buf_impl_t *db = (dmu_buf_impl_t *)db_fake;
+
+ db->db_user_immediate_evict = TRUE;
+ return (dmu_buf_set_user(db_fake, user));
+}
+
+void *
+dmu_buf_remove_user(dmu_buf_t *db_fake, dmu_buf_user_t *user)
+{
+ return (dmu_buf_replace_user(db_fake, user, NULL));
+}
+
+void *
+dmu_buf_get_user(dmu_buf_t *db_fake)
+{
+ dmu_buf_impl_t *db = (dmu_buf_impl_t *)db_fake;
+
+ dbuf_verify_user(db, DBVU_NOT_EVICTING);
+ return (db->db_user);
+}
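+
+/*
+ * Sketch of the db_user mechanism exercised by the functions above. A
+ * consumer embeds a dmu_buf_user_t in its own state, initializes it
+ * beforehand with dmu_buf_init_user(), and attaches it (ms/ms_dbu are
+ * hypothetical names):
+ *
+ *	winner = dmu_buf_set_user(db, &ms->ms_dbu);
+ *	if (winner != NULL)
+ *		... someone else attached first; use their state ...
+ *
+ * dmu_buf_set_user() returns NULL on success or the already-attached user
+ * otherwise, since it is simply dmu_buf_replace_user(db, NULL, user).
+ */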
+
+void
+dmu_buf_user_evict_wait(void)
+{
+ taskq_wait(dbu_evict_taskq);
+}
+
+blkptr_t *
+dmu_buf_get_blkptr(dmu_buf_t *db)
+{
+ dmu_buf_impl_t *dbi = (dmu_buf_impl_t *)db;
+ return (dbi->db_blkptr);
+}
+
+objset_t *
+dmu_buf_get_objset(dmu_buf_t *db)
+{
+ dmu_buf_impl_t *dbi = (dmu_buf_impl_t *)db;
+ return (dbi->db_objset);
+}
+
+dnode_t *
+dmu_buf_dnode_enter(dmu_buf_t *db)
+{
+ dmu_buf_impl_t *dbi = (dmu_buf_impl_t *)db;
+ DB_DNODE_ENTER(dbi);
+ return (DB_DNODE(dbi));
+}
+
+void
+dmu_buf_dnode_exit(dmu_buf_t *db)
+{
+ dmu_buf_impl_t *dbi = (dmu_buf_impl_t *)db;
+ DB_DNODE_EXIT(dbi);
+}
+
+static void
+dbuf_check_blkptr(dnode_t *dn, dmu_buf_impl_t *db)
+{
+	/* ASSERT(dmu_tx_is_syncing(tx)) */
+ ASSERT(MUTEX_HELD(&db->db_mtx));
+
+ if (db->db_blkptr != NULL)
+ return;
+
+ if (db->db_blkid == DMU_SPILL_BLKID) {
+ db->db_blkptr = DN_SPILL_BLKPTR(dn->dn_phys);
+ BP_ZERO(db->db_blkptr);
+ return;
+ }
+ if (db->db_level == dn->dn_phys->dn_nlevels-1) {
+ /*
+		 * This buffer was allocated at a time when there were
+		 * no available blkptrs from the dnode, or it was
+		 * inappropriate to hook it in (i.e., nlevels mismatch).
+ */
+ ASSERT(db->db_blkid < dn->dn_phys->dn_nblkptr);
+ ASSERT(db->db_parent == NULL);
+ db->db_parent = dn->dn_dbuf;
+ db->db_blkptr = &dn->dn_phys->dn_blkptr[db->db_blkid];
+ DBUF_VERIFY(db);
+ } else {
+ dmu_buf_impl_t *parent = db->db_parent;
+ int epbs = dn->dn_phys->dn_indblkshift - SPA_BLKPTRSHIFT;
+
+ ASSERT(dn->dn_phys->dn_nlevels > 1);
+ if (parent == NULL) {
+ mutex_exit(&db->db_mtx);
+ rw_enter(&dn->dn_struct_rwlock, RW_READER);
+ parent = dbuf_hold_level(dn, db->db_level + 1,
+ db->db_blkid >> epbs, db);
+ rw_exit(&dn->dn_struct_rwlock);
+ mutex_enter(&db->db_mtx);
+ db->db_parent = parent;
+ }
+ db->db_blkptr = (blkptr_t *)parent->db.db_data +
+ (db->db_blkid & ((1ULL << epbs) - 1));
+ DBUF_VERIFY(db);
+ }
+}
+
+/*
+ * dbuf_sync_indirect() is called recursively from dbuf_sync_list() so it
+ * is critical that we not allow the compiler to inline this function into
+ * dbuf_sync_list(), which would drastically bloat the stack usage.
+ */
+noinline static void
+dbuf_sync_indirect(dbuf_dirty_record_t *dr, dmu_tx_t *tx)
+{
+ dmu_buf_impl_t *db = dr->dr_dbuf;
+ dnode_t *dn;
+ zio_t *zio;
+
+ ASSERT(dmu_tx_is_syncing(tx));
+
+ dprintf_dbuf_bp(db, db->db_blkptr, "blkptr=%p", db->db_blkptr);
+
+ mutex_enter(&db->db_mtx);
+
+ ASSERT(db->db_level > 0);
+ DBUF_VERIFY(db);
+
+ /* Read the block if it hasn't been read yet. */
+ if (db->db_buf == NULL) {
+ mutex_exit(&db->db_mtx);
+ (void) dbuf_read(db, NULL, DB_RF_MUST_SUCCEED);
+ mutex_enter(&db->db_mtx);
+ }
+ ASSERT3U(db->db_state, ==, DB_CACHED);
+ ASSERT(db->db_buf != NULL);
+
+ DB_DNODE_ENTER(db);
+ dn = DB_DNODE(db);
+ /* Indirect block size must match what the dnode thinks it is. */
+ ASSERT3U(db->db.db_size, ==, 1<<dn->dn_phys->dn_indblkshift);
+ dbuf_check_blkptr(dn, db);
+ DB_DNODE_EXIT(db);
+
+ /* Provide the pending dirty record to child dbufs */
+ db->db_data_pending = dr;
+
+ mutex_exit(&db->db_mtx);
+
+ dbuf_write(dr, db->db_buf, tx);
+
+ zio = dr->dr_zio;
+ mutex_enter(&dr->dt.di.dr_mtx);
+ dbuf_sync_list(&dr->dt.di.dr_children, db->db_level - 1, tx);
+ ASSERT(list_head(&dr->dt.di.dr_children) == NULL);
+ mutex_exit(&dr->dt.di.dr_mtx);
+ zio_nowait(zio);
+}
+
+/*
+ * dbuf_sync_leaf() is called recursively from dbuf_sync_list() so it is
+ * critical that we not allow the compiler to inline this function into
+ * dbuf_sync_list(), which would drastically bloat the stack usage.
+ */
+noinline static void
+dbuf_sync_leaf(dbuf_dirty_record_t *dr, dmu_tx_t *tx)
+{
+ arc_buf_t **datap = &dr->dt.dl.dr_data;
+ dmu_buf_impl_t *db = dr->dr_dbuf;
+ dnode_t *dn;
+ objset_t *os;
+ uint64_t txg = tx->tx_txg;
+
+ ASSERT(dmu_tx_is_syncing(tx));
+
+ dprintf_dbuf_bp(db, db->db_blkptr, "blkptr=%p", db->db_blkptr);
+
+ mutex_enter(&db->db_mtx);
+ /*
+ * To be synced, we must be dirtied. But we
+ * might have been freed after the dirty.
+ */
+ if (db->db_state == DB_UNCACHED) {
+ /* This buffer has been freed since it was dirtied */
+ ASSERT(db->db.db_data == NULL);
+ } else if (db->db_state == DB_FILL) {
+ /* This buffer was freed and is now being re-filled */
+ ASSERT(db->db.db_data != dr->dt.dl.dr_data);
+ } else {
+ ASSERT(db->db_state == DB_CACHED || db->db_state == DB_NOFILL);
+ }
+ DBUF_VERIFY(db);
+
+ DB_DNODE_ENTER(db);
+ dn = DB_DNODE(db);
+
+ if (db->db_blkid == DMU_SPILL_BLKID) {
+ mutex_enter(&dn->dn_mtx);
+ if (!(dn->dn_phys->dn_flags & DNODE_FLAG_SPILL_BLKPTR)) {
+ /*
+ * In the previous transaction group, the bonus buffer
+ * was entirely used to store the attributes for the
+ * dnode which overrode the dn_spill field. However,
+ * when adding more attributes to the file a spill
+ * block was required to hold the extra attributes.
+ *
+ * Make sure to clear the garbage left in the dn_spill
+ * field from the previous attributes in the bonus
+ * buffer. Otherwise, after writing out the spill
+			 * block to the newly allocated dva, it will free
+ * the old block pointed to by the invalid dn_spill.
+ */
+ db->db_blkptr = NULL;
+ }
+ dn->dn_phys->dn_flags |= DNODE_FLAG_SPILL_BLKPTR;
+ mutex_exit(&dn->dn_mtx);
+ }
+
+ /*
+ * If this is a bonus buffer, simply copy the bonus data into the
+ * dnode. It will be written out when the dnode is synced (and it
+ * will be synced, since it must have been dirty for dbuf_sync to
+ * be called).
+ */
+ if (db->db_blkid == DMU_BONUS_BLKID) {
+ dbuf_dirty_record_t **drp;
+
+ ASSERT(*datap != NULL);
+ ASSERT0(db->db_level);
+ ASSERT3U(DN_MAX_BONUS_LEN(dn->dn_phys), <=,
+ DN_SLOTS_TO_BONUSLEN(dn->dn_phys->dn_extra_slots + 1));
+ bcopy(*datap, DN_BONUS(dn->dn_phys),
+ DN_MAX_BONUS_LEN(dn->dn_phys));
+ DB_DNODE_EXIT(db);
+
+ if (*datap != db->db.db_data) {
+ int slots = DB_DNODE(db)->dn_num_slots;
+ int bonuslen = DN_SLOTS_TO_BONUSLEN(slots);
+ zio_buf_free(*datap, bonuslen);
+ arc_space_return(bonuslen, ARC_SPACE_BONUS);
+ }
+ db->db_data_pending = NULL;
+ drp = &db->db_last_dirty;
+ while (*drp != dr)
+ drp = &(*drp)->dr_next;
+ ASSERT(dr->dr_next == NULL);
+ ASSERT(dr->dr_dbuf == db);
+ *drp = dr->dr_next;
+ if (dr->dr_dbuf->db_level != 0) {
+ mutex_destroy(&dr->dt.di.dr_mtx);
+ list_destroy(&dr->dt.di.dr_children);
+ }
+ kmem_free(dr, sizeof (dbuf_dirty_record_t));
+ ASSERT(db->db_dirtycnt > 0);
+ db->db_dirtycnt -= 1;
+ dbuf_rele_and_unlock(db, (void *)(uintptr_t)txg, B_FALSE);
+ return;
+ }
+
+ os = dn->dn_objset;
+
+ /*
+ * This function may have dropped the db_mtx lock allowing a dmu_sync
+ * operation to sneak in. As a result, we need to ensure that we
+ * don't check the dr_override_state until we have returned from
+ * dbuf_check_blkptr.
+ */
+ dbuf_check_blkptr(dn, db);
+
+ /*
+ * If this buffer is in the middle of an immediate write,
+ * wait for the synchronous IO to complete.
+ */
+ while (dr->dt.dl.dr_override_state == DR_IN_DMU_SYNC) {
+ ASSERT(dn->dn_object != DMU_META_DNODE_OBJECT);
+ cv_wait(&db->db_changed, &db->db_mtx);
+ ASSERT(dr->dt.dl.dr_override_state != DR_NOT_OVERRIDDEN);
+ }
+
+ if (db->db_state != DB_NOFILL &&
+ dn->dn_object != DMU_META_DNODE_OBJECT &&
+ zfs_refcount_count(&db->db_holds) > 1 &&
+ dr->dt.dl.dr_override_state != DR_OVERRIDDEN &&
+ *datap == db->db_buf) {
+ /*
+ * If this buffer is currently "in use" (i.e., there
+ * are active holds and db_data still references it),
+ * then make a copy before we start the write so that
+ * any modifications from the open txg will not leak
+ * into this write.
+ *
+ * NOTE: this copy does not need to be made for
+ * objects only modified in the syncing context (e.g.
+ * the meta-dnode's DMU_OT_DNODE blocks).
+ */
+ int psize = arc_buf_size(*datap);
+ arc_buf_contents_t type = DBUF_GET_BUFC_TYPE(db);
+ enum zio_compress compress_type = arc_get_compression(*datap);
+
+ if (compress_type == ZIO_COMPRESS_OFF) {
+ *datap = arc_alloc_buf(os->os_spa, db, type, psize);
+ } else {
+ ASSERT3U(type, ==, ARC_BUFC_DATA);
+ int lsize = arc_buf_lsize(*datap);
+ *datap = arc_alloc_compressed_buf(os->os_spa, db,
+ psize, lsize, compress_type);
+ }
+ bcopy(db->db.db_data, (*datap)->b_data, psize);
+ }
+ db->db_data_pending = dr;
+
+ mutex_exit(&db->db_mtx);
+
+ dbuf_write(dr, *datap, tx);
+
+ ASSERT(!list_link_active(&dr->dr_dirty_node));
+ if (dn->dn_object == DMU_META_DNODE_OBJECT) {
+ list_insert_tail(&dn->dn_dirty_records[txg&TXG_MASK], dr);
+ DB_DNODE_EXIT(db);
+ } else {
+ /*
+ * Although zio_nowait() does not "wait for an IO", it does
+ * initiate the IO. If this is an empty write it seems plausible
+ * that the IO could actually be completed before the nowait
+ * returns. We need to DB_DNODE_EXIT() first in case
+ * zio_nowait() invalidates the dbuf.
+ */
+ DB_DNODE_EXIT(db);
+ zio_nowait(dr->dr_zio);
+ }
+}
+
+void
+dbuf_sync_list(list_t *list, int level, dmu_tx_t *tx)
+{
+ dbuf_dirty_record_t *dr;
+
+ while ((dr = list_head(list)) != NULL) {
+ if (dr->dr_zio != NULL) {
+ /*
+ * If we find an already initialized zio then we
+ * are processing the meta-dnode, and we have finished.
+ * The dbufs for all dnodes are put back on the list
+ * during processing, so that we can zio_wait()
+ * these IOs after initiating all child IOs.
+ */
+ ASSERT3U(dr->dr_dbuf->db.db_object, ==,
+ DMU_META_DNODE_OBJECT);
+ break;
+ }
+ if (dr->dr_dbuf->db_blkid != DMU_BONUS_BLKID &&
+ dr->dr_dbuf->db_blkid != DMU_SPILL_BLKID) {
+ VERIFY3U(dr->dr_dbuf->db_level, ==, level);
+ }
+ list_remove(list, dr);
+ if (dr->dr_dbuf->db_level > 0)
+ dbuf_sync_indirect(dr, tx);
+ else
+ dbuf_sync_leaf(dr, tx);
+ }
+}
+
+/* ARGSUSED */
+static void
+dbuf_write_ready(zio_t *zio, arc_buf_t *buf, void *vdb)
+{
+ dmu_buf_impl_t *db = vdb;
+ dnode_t *dn;
+ blkptr_t *bp = zio->io_bp;
+ blkptr_t *bp_orig = &zio->io_bp_orig;
+ spa_t *spa = zio->io_spa;
+ int64_t delta;
+ uint64_t fill = 0;
+ int i;
+
+ ASSERT3P(db->db_blkptr, !=, NULL);
+ ASSERT3P(&db->db_data_pending->dr_bp_copy, ==, bp);
+
+ DB_DNODE_ENTER(db);
+ dn = DB_DNODE(db);
+ delta = bp_get_dsize_sync(spa, bp) - bp_get_dsize_sync(spa, bp_orig);
+ dnode_diduse_space(dn, delta - zio->io_prev_space_delta);
+ zio->io_prev_space_delta = delta;
+
+ if (bp->blk_birth != 0) {
+ ASSERT((db->db_blkid != DMU_SPILL_BLKID &&
+ BP_GET_TYPE(bp) == dn->dn_type) ||
+ (db->db_blkid == DMU_SPILL_BLKID &&
+ BP_GET_TYPE(bp) == dn->dn_bonustype) ||
+ BP_IS_EMBEDDED(bp));
+ ASSERT(BP_GET_LEVEL(bp) == db->db_level);
+ }
+
+ mutex_enter(&db->db_mtx);
+
+#ifdef ZFS_DEBUG
+ if (db->db_blkid == DMU_SPILL_BLKID) {
+ ASSERT(dn->dn_phys->dn_flags & DNODE_FLAG_SPILL_BLKPTR);
+ ASSERT(!(BP_IS_HOLE(bp)) &&
+ db->db_blkptr == DN_SPILL_BLKPTR(dn->dn_phys));
+ }
+#endif
+
+ if (db->db_level == 0) {
+ mutex_enter(&dn->dn_mtx);
+ if (db->db_blkid > dn->dn_phys->dn_maxblkid &&
+ db->db_blkid != DMU_SPILL_BLKID)
+ dn->dn_phys->dn_maxblkid = db->db_blkid;
+ mutex_exit(&dn->dn_mtx);
+
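+ /*
+ * For a dnode block the fill count is the number of allocated
+ * dnodes it contains. A large dnode occupies dn_extra_slots
+ * additional DNODE_MIN_SIZE slots, which are skipped here so
+ * that only dnode "heads" are counted.
+ */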
+ if (dn->dn_type == DMU_OT_DNODE) {
+ i = 0;
+ while (i < db->db.db_size) {
+ dnode_phys_t *dnp =
+ (void *)(((char *)db->db.db_data) + i);
+
+ i += DNODE_MIN_SIZE;
+ if (dnp->dn_type != DMU_OT_NONE) {
+ fill++;
+ i += dnp->dn_extra_slots *
+ DNODE_MIN_SIZE;
+ }
+ }
+ } else {
+ if (BP_IS_HOLE(bp)) {
+ fill = 0;
+ } else {
+ fill = 1;
+ }
+ }
+ } else {
+ blkptr_t *ibp = db->db.db_data;
+ ASSERT3U(db->db.db_size, ==, 1<<dn->dn_phys->dn_indblkshift);
+ for (i = db->db.db_size >> SPA_BLKPTRSHIFT; i > 0; i--, ibp++) {
+ if (BP_IS_HOLE(ibp))
+ continue;
+ fill += BP_GET_FILL(ibp);
+ }
+ }
+ DB_DNODE_EXIT(db);
+
+ if (!BP_IS_EMBEDDED(bp))
+ bp->blk_fill = fill;
+
+ mutex_exit(&db->db_mtx);
+
+ rw_enter(&dn->dn_struct_rwlock, RW_WRITER);
+ *db->db_blkptr = *bp;
+ rw_exit(&dn->dn_struct_rwlock);
+}
+
+/* ARGSUSED */
+/*
+ * This function gets called just prior to running through the compression
+ * stage of the zio pipeline. If we're an indirect block composed of only
+ * holes, then we want this indirect block to be compressed away to a hole.
+ * In order to do that we must zero out any information about the holes
+ * that this indirect block points to before we try to compress it.
+ */
+static void
+dbuf_write_children_ready(zio_t *zio, arc_buf_t *buf, void *vdb)
+{
+ dmu_buf_impl_t *db = vdb;
+ dnode_t *dn;
+ blkptr_t *bp;
+ unsigned int epbs, i;
+
+ ASSERT3U(db->db_level, >, 0);
+ DB_DNODE_ENTER(db);
+ dn = DB_DNODE(db);
+ epbs = dn->dn_phys->dn_indblkshift - SPA_BLKPTRSHIFT;
+ ASSERT3U(epbs, <, 31);
+
+ /* Determine if all our children are holes */
+ for (i = 0, bp = db->db.db_data; i < 1 << epbs; i++, bp++) {
+ if (!BP_IS_HOLE(bp))
+ break;
+ }
+
+ /*
+ * If all the children are holes, then zero them all out so that
+ * we may get compressed away.
+ */
+ if (i == 1 << epbs) {
+ /*
+ * We only found holes. Grab the rwlock to prevent
+ * anybody from reading the blocks we're about to
+ * zero out.
+ */
+ rw_enter(&dn->dn_struct_rwlock, RW_WRITER);
+ bzero(db->db.db_data, db->db.db_size);
+ rw_exit(&dn->dn_struct_rwlock);
+ }
+ DB_DNODE_EXIT(db);
+}
+
+/*
+ * The SPA will call this callback several times for each zio - once
+ * for every physical child i/o (zio->io_phys_children times). This
+ * allows the DMU to monitor the progress of each logical i/o. For example,
+ * there may be 2 copies of an indirect block, or many fragments of a RAID-Z
+ * block. There may be a long delay before all copies/fragments are completed,
+ * so this callback allows us to retire dirty space gradually, as the physical
+ * i/os complete.
+ */
+/* ARGSUSED */
+static void
+dbuf_write_physdone(zio_t *zio, arc_buf_t *buf, void *arg)
+{
+ dmu_buf_impl_t *db = arg;
+ objset_t *os = db->db_objset;
+ dsl_pool_t *dp = dmu_objset_pool(os);
+ dbuf_dirty_record_t *dr;
+ int delta = 0;
+
+ dr = db->db_data_pending;
+ ASSERT3U(dr->dr_txg, ==, zio->io_txg);
+
+ /*
+ * The callback will be called io_phys_children times. Retire one
+ * portion of our dirty space each time we are called. Any rounding
+ * error will be cleaned up by dsl_pool_sync()'s call to
+ * dsl_pool_undirty_space().
+ */
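+ /*
+ * For example (hypothetical numbers): with dr_accounted == 1000
+ * and io_phys_children == 3, each callback retires 333 bytes and
+ * dsl_pool_sync() later undirties the 1 byte of rounding error.
+ */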
+ delta = dr->dr_accounted / zio->io_phys_children;
+ dsl_pool_undirty_space(dp, delta, zio->io_txg);
+}
+
+/* ARGSUSED */
+static void
+dbuf_write_done(zio_t *zio, arc_buf_t *buf, void *vdb)
+{
+ dmu_buf_impl_t *db = vdb;
+ blkptr_t *bp_orig = &zio->io_bp_orig;
+ blkptr_t *bp = db->db_blkptr;
+ objset_t *os = db->db_objset;
+ dmu_tx_t *tx = os->os_synctx;
+ dbuf_dirty_record_t **drp, *dr;
+
+ ASSERT0(zio->io_error);
+ ASSERT(db->db_blkptr == bp);
+
+ /*
+ * For nopwrites and rewrites we ensure that the bp matches our
+ * original and bypass all the accounting.
+ */
+ if (zio->io_flags & (ZIO_FLAG_IO_REWRITE | ZIO_FLAG_NOPWRITE)) {
+ ASSERT(BP_EQUAL(bp, bp_orig));
+ } else {
+ dsl_dataset_t *ds = os->os_dsl_dataset;
+ (void) dsl_dataset_block_kill(ds, bp_orig, tx, B_TRUE);
+ dsl_dataset_block_born(ds, bp, tx);
+ }
+
+ mutex_enter(&db->db_mtx);
+
+ DBUF_VERIFY(db);
+
+ drp = &db->db_last_dirty;
+ while ((dr = *drp) != db->db_data_pending)
+ drp = &dr->dr_next;
+ ASSERT(!list_link_active(&dr->dr_dirty_node));
+ ASSERT(dr->dr_dbuf == db);
+ ASSERT(dr->dr_next == NULL);
+ *drp = dr->dr_next;
+
+#ifdef ZFS_DEBUG
+ if (db->db_blkid == DMU_SPILL_BLKID) {
+ dnode_t *dn;
+
+ DB_DNODE_ENTER(db);
+ dn = DB_DNODE(db);
+ ASSERT(dn->dn_phys->dn_flags & DNODE_FLAG_SPILL_BLKPTR);
+ ASSERT(!(BP_IS_HOLE(db->db_blkptr)) &&
+ db->db_blkptr == DN_SPILL_BLKPTR(dn->dn_phys));
+ DB_DNODE_EXIT(db);
+ }
+#endif
+
+ if (db->db_level == 0) {
+ ASSERT(db->db_blkid != DMU_BONUS_BLKID);
+ ASSERT(dr->dt.dl.dr_override_state == DR_NOT_OVERRIDDEN);
+ if (db->db_state != DB_NOFILL) {
+ if (dr->dt.dl.dr_data != db->db_buf)
+ arc_buf_destroy(dr->dt.dl.dr_data, db);
+ }
+ } else {
+ dnode_t *dn;
+
+ DB_DNODE_ENTER(db);
+ dn = DB_DNODE(db);
+ ASSERT(list_head(&dr->dt.di.dr_children) == NULL);
+ ASSERT3U(db->db.db_size, ==, 1 << dn->dn_phys->dn_indblkshift);
+ if (!BP_IS_HOLE(db->db_blkptr)) {
+ int epbs =
+ dn->dn_phys->dn_indblkshift - SPA_BLKPTRSHIFT;
+ ASSERT3U(db->db_blkid, <=,
+ dn->dn_phys->dn_maxblkid >> (db->db_level * epbs));
+ ASSERT3U(BP_GET_LSIZE(db->db_blkptr), ==,
+ db->db.db_size);
+ }
+ DB_DNODE_EXIT(db);
+ mutex_destroy(&dr->dt.di.dr_mtx);
+ list_destroy(&dr->dt.di.dr_children);
+ }
+ kmem_free(dr, sizeof (dbuf_dirty_record_t));
+
+ cv_broadcast(&db->db_changed);
+ ASSERT(db->db_dirtycnt > 0);
+ db->db_dirtycnt -= 1;
+ db->db_data_pending = NULL;
+ dbuf_rele_and_unlock(db, (void *)(uintptr_t)tx->tx_txg, B_FALSE);
+}
+
+static void
+dbuf_write_nofill_ready(zio_t *zio)
+{
+ dbuf_write_ready(zio, NULL, zio->io_private);
+}
+
+static void
+dbuf_write_nofill_done(zio_t *zio)
+{
+ dbuf_write_done(zio, NULL, zio->io_private);
+}
+
+static void
+dbuf_write_override_ready(zio_t *zio)
+{
+ dbuf_dirty_record_t *dr = zio->io_private;
+ dmu_buf_impl_t *db = dr->dr_dbuf;
+
+ dbuf_write_ready(zio, NULL, db);
+}
+
+static void
+dbuf_write_override_done(zio_t *zio)
+{
+ dbuf_dirty_record_t *dr = zio->io_private;
+ dmu_buf_impl_t *db = dr->dr_dbuf;
+ blkptr_t *obp = &dr->dt.dl.dr_overridden_by;
+
+ mutex_enter(&db->db_mtx);
+ if (!BP_EQUAL(zio->io_bp, obp)) {
+ if (!BP_IS_HOLE(obp))
+ dsl_free(spa_get_dsl(zio->io_spa), zio->io_txg, obp);
+ arc_release(dr->dt.dl.dr_data, db);
+ }
+ mutex_exit(&db->db_mtx);
+ dbuf_write_done(zio, NULL, db);
+
+ if (zio->io_abd != NULL)
+ abd_put(zio->io_abd);
+}
+
+typedef struct dbuf_remap_impl_callback_arg {
+ objset_t *drica_os;
+ uint64_t drica_blk_birth;
+ dmu_tx_t *drica_tx;
+} dbuf_remap_impl_callback_arg_t;
+
+static void
+dbuf_remap_impl_callback(uint64_t vdev, uint64_t offset, uint64_t size,
+ void *arg)
+{
+ dbuf_remap_impl_callback_arg_t *drica = arg;
+ objset_t *os = drica->drica_os;
+ spa_t *spa = dmu_objset_spa(os);
+ dmu_tx_t *tx = drica->drica_tx;
+
+ ASSERT(dsl_pool_sync_context(spa_get_dsl(spa)));
+
+ if (os == spa_meta_objset(spa)) {
+ spa_vdev_indirect_mark_obsolete(spa, vdev, offset, size, tx);
+ } else {
+ dsl_dataset_block_remapped(dmu_objset_ds(os), vdev, offset,
+ size, drica->drica_blk_birth, tx);
+ }
+}
+
+static void
+dbuf_remap_impl(dnode_t *dn, blkptr_t *bp, dmu_tx_t *tx)
+{
+ blkptr_t bp_copy = *bp;
+ spa_t *spa = dmu_objset_spa(dn->dn_objset);
+ dbuf_remap_impl_callback_arg_t drica;
+
+ ASSERT(dsl_pool_sync_context(spa_get_dsl(spa)));
+
+ drica.drica_os = dn->dn_objset;
+ drica.drica_blk_birth = bp->blk_birth;
+ drica.drica_tx = tx;
+ if (spa_remap_blkptr(spa, &bp_copy, dbuf_remap_impl_callback,
+ &drica)) {
+ /*
+ * The struct_rwlock prevents dbuf_read_impl() from
+ * dereferencing the BP while we are changing it. To
+ * avoid lock contention, only grab it when we are actually
+ * changing the BP.
+ */
+ rw_enter(&dn->dn_struct_rwlock, RW_WRITER);
+ *bp = bp_copy;
+ rw_exit(&dn->dn_struct_rwlock);
+ }
+}
+
+/*
+ * Returns true if a dbuf_remap would modify the dbuf. We do this by attempting
+ * to remap a copy of every bp in the dbuf.
+ */
+boolean_t
+dbuf_can_remap(const dmu_buf_impl_t *db)
+{
+ spa_t *spa = dmu_objset_spa(db->db_objset);
+ blkptr_t *bp = db->db.db_data;
+ boolean_t ret = B_FALSE;
+
+ ASSERT3U(db->db_level, >, 0);
+ ASSERT3S(db->db_state, ==, DB_CACHED);
+
+ ASSERT(spa_feature_is_active(spa, SPA_FEATURE_DEVICE_REMOVAL));
+
+ spa_config_enter(spa, SCL_VDEV, FTAG, RW_READER);
+ for (int i = 0; i < db->db.db_size >> SPA_BLKPTRSHIFT; i++) {
+ blkptr_t bp_copy = bp[i];
+ if (spa_remap_blkptr(spa, &bp_copy, NULL, NULL)) {
+ ret = B_TRUE;
+ break;
+ }
+ }
+ spa_config_exit(spa, SCL_VDEV, FTAG);
+
+ return (ret);
+}
+
+boolean_t
+dnode_needs_remap(const dnode_t *dn)
+{
+ spa_t *spa = dmu_objset_spa(dn->dn_objset);
+ boolean_t ret = B_FALSE;
+
+ if (dn->dn_phys->dn_nlevels == 0) {
+ return (B_FALSE);
+ }
+
+ ASSERT(spa_feature_is_active(spa, SPA_FEATURE_DEVICE_REMOVAL));
+
+ spa_config_enter(spa, SCL_VDEV, FTAG, RW_READER);
+ for (int j = 0; j < dn->dn_phys->dn_nblkptr; j++) {
+ blkptr_t bp_copy = dn->dn_phys->dn_blkptr[j];
+ if (spa_remap_blkptr(spa, &bp_copy, NULL, NULL)) {
+ ret = B_TRUE;
+ break;
+ }
+ }
+ spa_config_exit(spa, SCL_VDEV, FTAG);
+
+ return (ret);
+}
+
+/*
+ * Remap any existing BP's to concrete vdevs, if possible.
+ */
+static void
+dbuf_remap(dnode_t *dn, dmu_buf_impl_t *db, dmu_tx_t *tx)
+{
+ spa_t *spa = dmu_objset_spa(db->db_objset);
+ ASSERT(dsl_pool_sync_context(spa_get_dsl(spa)));
+
+ if (!spa_feature_is_active(spa, SPA_FEATURE_DEVICE_REMOVAL))
+ return;
+
+ if (db->db_level > 0) {
+ blkptr_t *bp = db->db.db_data;
+ for (int i = 0; i < db->db.db_size >> SPA_BLKPTRSHIFT; i++) {
+ dbuf_remap_impl(dn, &bp[i], tx);
+ }
+ } else if (db->db.db_object == DMU_META_DNODE_OBJECT) {
+ dnode_phys_t *dnp = db->db.db_data;
+ ASSERT3U(db->db_dnode_handle->dnh_dnode->dn_type, ==,
+ DMU_OT_DNODE);
+ for (int i = 0; i < db->db.db_size >> DNODE_SHIFT;
+ i += dnp[i].dn_extra_slots + 1) {
+ for (int j = 0; j < dnp[i].dn_nblkptr; j++) {
+ dbuf_remap_impl(dn, &dnp[i].dn_blkptr[j], tx);
+ }
+ }
+ }
+}
+
+/* Issue I/O to commit a dirty buffer to disk. */
+static void
+dbuf_write(dbuf_dirty_record_t *dr, arc_buf_t *data, dmu_tx_t *tx)
+{
+ dmu_buf_impl_t *db = dr->dr_dbuf;
+ dnode_t *dn;
+ objset_t *os;
+ dmu_buf_impl_t *parent = db->db_parent;
+ uint64_t txg = tx->tx_txg;
+ zbookmark_phys_t zb;
+ zio_prop_t zp;
+ zio_t *zio;
+ int wp_flag = 0;
+
+ ASSERT(dmu_tx_is_syncing(tx));
+
+ DB_DNODE_ENTER(db);
+ dn = DB_DNODE(db);
+ os = dn->dn_objset;
+
+ if (db->db_state != DB_NOFILL) {
+ if (db->db_level > 0 || dn->dn_type == DMU_OT_DNODE) {
+ /*
+ * Private object buffers are released here rather
+ * than in dbuf_dirty() since they are only modified
+ * in the syncing context and we don't want the
+ * overhead of making multiple copies of the data.
+ */
+ if (BP_IS_HOLE(db->db_blkptr)) {
+ arc_buf_thaw(data);
+ } else {
+ dbuf_release_bp(db);
+ }
+ dbuf_remap(dn, db, tx);
+ }
+ }
+
+ if (parent != dn->dn_dbuf) {
+ /* Our parent is an indirect block. */
+ /* We have a dirty parent that has been scheduled for write. */
+ ASSERT(parent && parent->db_data_pending);
+ /* Our parent's buffer is one level closer to the dnode. */
+ ASSERT(db->db_level == parent->db_level-1);
+ /*
+ * We're about to modify our parent's db_data by modifying
+ * our block pointer, so the parent must be released.
+ */
+ ASSERT(arc_released(parent->db_buf));
+ zio = parent->db_data_pending->dr_zio;
+ } else {
+ /* Our parent is the dnode itself. */
+ ASSERT((db->db_level == dn->dn_phys->dn_nlevels-1 &&
+ db->db_blkid != DMU_SPILL_BLKID) ||
+ (db->db_blkid == DMU_SPILL_BLKID && db->db_level == 0));
+ if (db->db_blkid != DMU_SPILL_BLKID)
+ ASSERT3P(db->db_blkptr, ==,
+ &dn->dn_phys->dn_blkptr[db->db_blkid]);
+ zio = dn->dn_zio;
+ }
+
+ ASSERT(db->db_level == 0 || data == db->db_buf);
+ ASSERT3U(db->db_blkptr->blk_birth, <=, txg);
+ ASSERT(zio);
+
+ SET_BOOKMARK(&zb, os->os_dsl_dataset ?
+ os->os_dsl_dataset->ds_object : DMU_META_OBJSET,
+ db->db.db_object, db->db_level, db->db_blkid);
+
+ if (db->db_blkid == DMU_SPILL_BLKID)
+ wp_flag = WP_SPILL;
+ wp_flag |= (db->db_state == DB_NOFILL) ? WP_NOFILL : 0;
+
+ dmu_write_policy(os, dn, db->db_level, wp_flag, &zp);
+ DB_DNODE_EXIT(db);
+
+ /*
+ * We copy the blkptr now (rather than when we instantiate the dirty
+ * record), because its value can change between open context and
+ * syncing context. We do not need to hold dn_struct_rwlock to read
+ * db_blkptr because we are in syncing context.
+ */
+ dr->dr_bp_copy = *db->db_blkptr;
+
+ if (db->db_level == 0 &&
+ dr->dt.dl.dr_override_state == DR_OVERRIDDEN) {
+ /*
+ * The BP for this block has been provided by open context
+ * (by dmu_sync() or dmu_buf_write_embedded()).
+ */
+ abd_t *contents = (data != NULL) ?
+ abd_get_from_buf(data->b_data, arc_buf_size(data)) : NULL;
+
+ dr->dr_zio = zio_write(zio, os->os_spa, txg, &dr->dr_bp_copy,
+ contents, db->db.db_size, db->db.db_size, &zp,
+ dbuf_write_override_ready, NULL, NULL,
+ dbuf_write_override_done,
+ dr, ZIO_PRIORITY_ASYNC_WRITE, ZIO_FLAG_MUSTSUCCEED, &zb);
+ mutex_enter(&db->db_mtx);
+ dr->dt.dl.dr_override_state = DR_NOT_OVERRIDDEN;
+ zio_write_override(dr->dr_zio, &dr->dt.dl.dr_overridden_by,
+ dr->dt.dl.dr_copies, dr->dt.dl.dr_nopwrite);
+ mutex_exit(&db->db_mtx);
+ } else if (db->db_state == DB_NOFILL) {
+ ASSERT(zp.zp_checksum == ZIO_CHECKSUM_OFF ||
+ zp.zp_checksum == ZIO_CHECKSUM_NOPARITY);
+ dr->dr_zio = zio_write(zio, os->os_spa, txg,
+ &dr->dr_bp_copy, NULL, db->db.db_size, db->db.db_size, &zp,
+ dbuf_write_nofill_ready, NULL, NULL,
+ dbuf_write_nofill_done, db,
+ ZIO_PRIORITY_ASYNC_WRITE,
+ ZIO_FLAG_MUSTSUCCEED | ZIO_FLAG_NODATA, &zb);
+ } else {
+ ASSERT(arc_released(data));
+
+ /*
+ * For indirect blocks, we want to setup the children
+ * ready callback so that we can properly handle an indirect
+ * block that only contains holes.
+ */
+ arc_write_done_func_t *children_ready_cb = NULL;
+ if (db->db_level != 0)
+ children_ready_cb = dbuf_write_children_ready;
+
+ dr->dr_zio = arc_write(zio, os->os_spa, txg,
+ &dr->dr_bp_copy, data, DBUF_IS_L2CACHEABLE(db),
+ &zp, dbuf_write_ready, children_ready_cb,
+ dbuf_write_physdone, dbuf_write_done, db,
+ ZIO_PRIORITY_ASYNC_WRITE, ZIO_FLAG_MUSTSUCCEED, &zb);
+ }
+}
diff --git a/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/dbuf_stats.c b/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/dbuf_stats.c
new file mode 100644
index 000000000000..0a86830f71ad
--- /dev/null
+++ b/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/dbuf_stats.c
@@ -0,0 +1,242 @@
+/*
+ * CDDL HEADER START
+ *
+ * The contents of this file are subject to the terms of the
+ * Common Development and Distribution License (the "License").
+ * You may not use this file except in compliance with the License.
+ *
+ * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
+ * or http://www.opensolaris.org/os/licensing.
+ * See the License for the specific language governing permissions
+ * and limitations under the License.
+ *
+ * When distributing Covered Code, include this CDDL HEADER in each
+ * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
+ * If applicable, add the following below this CDDL HEADER, with the
+ * fields enclosed by brackets "[]" replaced with your own identifying
+ * information: Portions Copyright [yyyy] [name of copyright owner]
+ *
+ * CDDL HEADER END
+ */
+
+#include <sys/zfs_context.h>
+#include <sys/dbuf.h>
+#include <sys/dmu_objset.h>
+
+/*
+ * When nonzero, calculate the index of each arc header within its arc
+ * state; disabled by default.
+ */
+int zfs_dbuf_state_index = 0;
+
+/*
+ * ==========================================================================
+ * Dbuf Hash Read Routines
+ * ==========================================================================
+ */
+typedef struct dbuf_stats_t {
+ kmutex_t lock;
+ kstat_t *kstat;
+ dbuf_hash_table_t *hash;
+ int idx;
+} dbuf_stats_t;
+
+static dbuf_stats_t dbuf_stats_hash_table;
+
+static int
+dbuf_stats_hash_table_headers(char *buf, size_t size)
+{
+ size_t len;
+
+ len = snprintf(buf, size,
+ "%-88s | %-124s | %s\n"
+ "%-16s %-8s %-8s %-8s %-8s %-8s %-8s %-5s %-5s %5s | "
+ "%-5s %-5s %-6s %-8s %-6s %-8s %-12s "
+ "%-6s %-6s %-6s %-6s %-6s %-8s %-8s %-8s %-5s | "
+ "%-6s %-6s %-8s %-8s %-6s %-6s %-5s %-8s %-8s\n",
+ "dbuf", "arcbuf", "dnode", "pool", "objset", "object", "level",
+ "blkid", "offset", "dbsize", "meta", "state", "dbholds", "list",
+ "atype", "index", "flags", "count", "asize", "access", "mru", "gmru",
+ "mfu", "gmfu", "l2", "l2_dattr", "l2_asize", "l2_comp", "aholds",
+ "dtype", "btype", "data_bs", "meta_bs", "bsize",
+ "lvls", "dholds", "blocks", "dsize");
+ /* snprintf() returns the untruncated length; clamp before terminating. */
+ if (len >= size)
+ len = size - 1;
+ buf[len] = '\0';
+
+ return (0);
+}
+
+int
+__dbuf_stats_hash_table_data(char *buf, size_t size, dmu_buf_impl_t *db)
+{
+ arc_buf_info_t abi = { 0 };
+ dmu_object_info_t doi = { 0 };
+ dnode_t *dn = DB_DNODE(db);
+ size_t len;
+
+ if (db->db_buf)
+ arc_buf_info(db->db_buf, &abi, zfs_dbuf_state_index);
+
+ if (dn)
+ __dmu_object_info_from_dnode(dn, &doi);
+
+ len = snprintf(buf, size,
+ "%-16s %-8llu %-8lld %-8lld %-8lld %-8llu %-8llu %-5d %-5d %-5lu | "
+ "%-5d %-5d %-6lld 0x%-6x %-6lu %-8llu %-12llu "
+ "%-6lu %-6lu %-6lu %-6lu %-6lu %-8llu %-8llu %-8d %-5lu | "
+ "%-6d %-6d %-8lu %-8lu %-6llu %-6lu %-5lu %-8llu %-8llu\n",
+ /* dmu_buf_impl_t */
+ spa_name(dn->dn_objset->os_spa),
+ (u_longlong_t)dmu_objset_id(db->db_objset),
+ (longlong_t)db->db.db_object,
+ (longlong_t)db->db_level,
+ (longlong_t)db->db_blkid,
+ (u_longlong_t)db->db.db_offset,
+ (u_longlong_t)db->db.db_size,
+ !!dbuf_is_metadata(db),
+ db->db_state,
+ (ulong_t)zfs_refcount_count(&db->db_holds),
+ /* arc_buf_info_t */
+ abi.abi_state_type,
+ abi.abi_state_contents,
+ (longlong_t)abi.abi_state_index,
+ abi.abi_flags,
+ (ulong_t)abi.abi_bufcnt,
+ (u_longlong_t)abi.abi_size,
+ (u_longlong_t)abi.abi_access,
+ (ulong_t)abi.abi_mru_hits,
+ (ulong_t)abi.abi_mru_ghost_hits,
+ (ulong_t)abi.abi_mfu_hits,
+ (ulong_t)abi.abi_mfu_ghost_hits,
+ (ulong_t)abi.abi_l2arc_hits,
+ (u_longlong_t)abi.abi_l2arc_dattr,
+ (u_longlong_t)abi.abi_l2arc_asize,
+ abi.abi_l2arc_compress,
+ (ulong_t)abi.abi_holds,
+ /* dmu_object_info_t */
+ doi.doi_type,
+ doi.doi_bonus_type,
+ (ulong_t)doi.doi_data_block_size,
+ (ulong_t)doi.doi_metadata_block_size,
+ (u_longlong_t)doi.doi_bonus_size,
+ (ulong_t)doi.doi_indirection,
+ (ulong_t)zfs_refcount_count(&dn->dn_holds),
+ (u_longlong_t)doi.doi_fill_count,
+ (u_longlong_t)doi.doi_max_offset);
+ if (len >= size)
+ len = size - 1;
+ buf[len] = '\0';
+
+ return (len);
+}
+
+static int
+dbuf_stats_hash_table_data(char *buf, size_t size, void *data)
+{
+ dbuf_stats_t *dsh = (dbuf_stats_t *)data;
+ dbuf_hash_table_t *h = dsh->hash;
+ dmu_buf_impl_t *db;
+ int length, error = 0;
+
+ ASSERT3S(dsh->idx, >=, 0);
+ ASSERT3S(dsh->idx, <=, h->hash_table_mask);
+ memset(buf, 0, size);
+
+ mutex_enter(DBUF_HASH_MUTEX(h, dsh->idx));
+ for (db = h->hash_table[dsh->idx]; db != NULL; db = db->db_hash_next) {
+ /*
+ * Returning ENOMEM will cause the data and header functions
+ * to be called again with larger scratch buffers.
+ */
+ if (size < 512) {
+ error = ENOMEM;
+ break;
+ }
+
+ mutex_enter(&db->db_mtx);
+ mutex_exit(DBUF_HASH_MUTEX(h, dsh->idx));
+
+ length = __dbuf_stats_hash_table_data(buf, size, db);
+ buf += length;
+ size -= length;
+
+ mutex_exit(&db->db_mtx);
+ mutex_enter(DBUF_HASH_MUTEX(h, dsh->idx));
+ }
+ mutex_exit(DBUF_HASH_MUTEX(h, dsh->idx));
+
+ return (error);
+}
+
+static void *
+dbuf_stats_hash_table_addr(kstat_t *ksp, off_t n)
+{
+ dbuf_stats_t *dsh = ksp->ks_private;
+
+ ASSERT(MUTEX_HELD(&dsh->lock));
+
+ if (n <= dsh->hash->hash_table_mask) {
+ dsh->idx = n;
+ return (dsh);
+ }
+
+ return (NULL);
+}
+
+#ifndef __FreeBSD__
+/*
+ * XXX The FreeBSD SPL is missing support for KSTAT_TYPE_RAW;
+ * we can enable this as soon as that's implemented. See the
+ * lindebugfs module for similar callback semantics.
+ */
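+/*
+ * Raw-kstat callback contract, as wired up below: the "addr" callback
+ * selects the hash bucket for row n, the "data" callback formats every
+ * dbuf in that bucket into the scratch buffer, and the "headers"
+ * callback emits the column header line once at the top of the output.
+ */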
+static void
+dbuf_stats_hash_table_init(dbuf_hash_table_t *hash)
+{
+ dbuf_stats_t *dsh = &dbuf_stats_hash_table;
+ kstat_t *ksp;
+
+ mutex_init(&dsh->lock, NULL, MUTEX_DEFAULT, NULL);
+ dsh->hash = hash;
+
+ ksp = kstat_create("zfs", 0, "dbufs", "misc",
+ KSTAT_TYPE_RAW, 0, KSTAT_FLAG_VIRTUAL);
+ dsh->kstat = ksp;
+
+ if (ksp) {
+ ksp->ks_lock = &dsh->lock;
+ ksp->ks_ndata = UINT32_MAX;
+ ksp->ks_private = dsh;
+ kstat_set_raw_ops(ksp, dbuf_stats_hash_table_headers,
+ dbuf_stats_hash_table_data, dbuf_stats_hash_table_addr);
+ kstat_install(ksp);
+ }
+}
+
+static void
+dbuf_stats_hash_table_destroy(void)
+{
+ dbuf_stats_t *dsh = &dbuf_stats_hash_table;
+ kstat_t *ksp;
+
+ ksp = dsh->kstat;
+ if (ksp)
+ kstat_delete(ksp);
+
+ mutex_destroy(&dsh->lock);
+}
+#else
+static void
+dbuf_stats_hash_table_init(dbuf_hash_table_t *hash)
+{
+}
+
+static void
+dbuf_stats_hash_table_destroy(void)
+{
+}
+#endif
+
+void
+dbuf_stats_init(dbuf_hash_table_t *hash)
+{
+ dbuf_stats_hash_table_init(hash);
+}
+
+void
+dbuf_stats_destroy(void)
+{
+ dbuf_stats_hash_table_destroy();
+}
diff --git a/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/ddt.c b/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/ddt.c
new file mode 100644
index 000000000000..964aa6c054f5
--- /dev/null
+++ b/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/ddt.c
@@ -0,0 +1,1189 @@
+/*
+ * CDDL HEADER START
+ *
+ * The contents of this file are subject to the terms of the
+ * Common Development and Distribution License (the "License").
+ * You may not use this file except in compliance with the License.
+ *
+ * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
+ * or http://www.opensolaris.org/os/licensing.
+ * See the License for the specific language governing permissions
+ * and limitations under the License.
+ *
+ * When distributing Covered Code, include this CDDL HEADER in each
+ * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
+ * If applicable, add the following below this CDDL HEADER, with the
+ * fields enclosed by brackets "[]" replaced with your own identifying
+ * information: Portions Copyright [yyyy] [name of copyright owner]
+ *
+ * CDDL HEADER END
+ */
+
+/*
+ * Copyright (c) 2009, 2010, Oracle and/or its affiliates. All rights reserved.
+ * Copyright (c) 2012, 2016 by Delphix. All rights reserved.
+ */
+
+#include <sys/zfs_context.h>
+#include <sys/spa.h>
+#include <sys/spa_impl.h>
+#include <sys/zio.h>
+#include <sys/ddt.h>
+#include <sys/zap.h>
+#include <sys/dmu_tx.h>
+#include <sys/arc.h>
+#include <sys/dsl_pool.h>
+#include <sys/zio_checksum.h>
+#include <sys/zio_compress.h>
+#include <sys/dsl_scan.h>
+#include <sys/abd.h>
+
+/*
+ * Enable/disable prefetching of dedup-ed blocks which are going to be freed.
+ */
+int zfs_dedup_prefetch = 1;
+
+SYSCTL_DECL(_vfs_zfs);
+SYSCTL_NODE(_vfs_zfs, OID_AUTO, dedup, CTLFLAG_RW | CTLFLAG_MPSAFE, 0,
+ "ZFS DEDUP");
+SYSCTL_INT(_vfs_zfs_dedup, OID_AUTO, prefetch, CTLFLAG_RWTUN,
+ &zfs_dedup_prefetch, 0,
+ "Enable/disable prefetching of dedup-ed blocks which are going to be freed");
+
+static const ddt_ops_t *ddt_ops[DDT_TYPES] = {
+ &ddt_zap_ops,
+};
+
+static const char *ddt_class_name[DDT_CLASSES] = {
+ "ditto",
+ "duplicate",
+ "unique",
+};
+
+static void
+ddt_object_create(ddt_t *ddt, enum ddt_type type, enum ddt_class class,
+ dmu_tx_t *tx)
+{
+ spa_t *spa = ddt->ddt_spa;
+ objset_t *os = ddt->ddt_os;
+ uint64_t *objectp = &ddt->ddt_object[type][class];
+ boolean_t prehash = zio_checksum_table[ddt->ddt_checksum].ci_flags &
+ ZCHECKSUM_FLAG_DEDUP;
+ char name[DDT_NAMELEN];
+
+ ddt_object_name(ddt, type, class, name);
+
+ ASSERT(*objectp == 0);
+ VERIFY(ddt_ops[type]->ddt_op_create(os, objectp, tx, prehash) == 0);
+ ASSERT(*objectp != 0);
+
+ VERIFY(zap_add(os, DMU_POOL_DIRECTORY_OBJECT, name,
+ sizeof (uint64_t), 1, objectp, tx) == 0);
+
+ VERIFY(zap_add(os, spa->spa_ddt_stat_object, name,
+ sizeof (uint64_t), sizeof (ddt_histogram_t) / sizeof (uint64_t),
+ &ddt->ddt_histogram[type][class], tx) == 0);
+}
+
+static void
+ddt_object_destroy(ddt_t *ddt, enum ddt_type type, enum ddt_class class,
+ dmu_tx_t *tx)
+{
+ spa_t *spa = ddt->ddt_spa;
+ objset_t *os = ddt->ddt_os;
+ uint64_t *objectp = &ddt->ddt_object[type][class];
+ uint64_t count;
+ char name[DDT_NAMELEN];
+
+ ddt_object_name(ddt, type, class, name);
+
+ ASSERT(*objectp != 0);
+ VERIFY(ddt_object_count(ddt, type, class, &count) == 0 && count == 0);
+ ASSERT(ddt_histogram_empty(&ddt->ddt_histogram[type][class]));
+ VERIFY(zap_remove(os, DMU_POOL_DIRECTORY_OBJECT, name, tx) == 0);
+ VERIFY(zap_remove(os, spa->spa_ddt_stat_object, name, tx) == 0);
+ VERIFY(ddt_ops[type]->ddt_op_destroy(os, *objectp, tx) == 0);
+ bzero(&ddt->ddt_object_stats[type][class], sizeof (ddt_object_t));
+
+ *objectp = 0;
+}
+
+static int
+ddt_object_load(ddt_t *ddt, enum ddt_type type, enum ddt_class class)
+{
+ ddt_object_t *ddo = &ddt->ddt_object_stats[type][class];
+ dmu_object_info_t doi;
+ uint64_t count;
+ char name[DDT_NAMELEN];
+ int error;
+
+ ddt_object_name(ddt, type, class, name);
+
+ error = zap_lookup(ddt->ddt_os, DMU_POOL_DIRECTORY_OBJECT, name,
+ sizeof (uint64_t), 1, &ddt->ddt_object[type][class]);
+
+ if (error != 0)
+ return (error);
+
+ VERIFY0(zap_lookup(ddt->ddt_os, ddt->ddt_spa->spa_ddt_stat_object, name,
+ sizeof (uint64_t), sizeof (ddt_histogram_t) / sizeof (uint64_t),
+ &ddt->ddt_histogram[type][class]));
+
+ /*
+ * Seed the cached statistics.
+ */
+ VERIFY(ddt_object_info(ddt, type, class, &doi) == 0);
+
+ error = ddt_object_count(ddt, type, class, &count);
+ if (error)
+ return (error);
+
+ ddo->ddo_count = count;
+ ddo->ddo_dspace = doi.doi_physical_blocks_512 << 9;
+ ddo->ddo_mspace = doi.doi_fill_count * doi.doi_data_block_size;
+
+ return (0);
+}
+
+static void
+ddt_object_sync(ddt_t *ddt, enum ddt_type type, enum ddt_class class,
+ dmu_tx_t *tx)
+{
+ ddt_object_t *ddo = &ddt->ddt_object_stats[type][class];
+ dmu_object_info_t doi;
+ uint64_t count;
+ char name[DDT_NAMELEN];
+
+ ddt_object_name(ddt, type, class, name);
+
+ VERIFY(zap_update(ddt->ddt_os, ddt->ddt_spa->spa_ddt_stat_object, name,
+ sizeof (uint64_t), sizeof (ddt_histogram_t) / sizeof (uint64_t),
+ &ddt->ddt_histogram[type][class], tx) == 0);
+
+ /*
+ * Cache DDT statistics; this is the only time they'll change.
+ */
+ VERIFY(ddt_object_info(ddt, type, class, &doi) == 0);
+ VERIFY(ddt_object_count(ddt, type, class, &count) == 0);
+
+ ddo->ddo_count = count;
+ ddo->ddo_dspace = doi.doi_physical_blocks_512 << 9;
+ ddo->ddo_mspace = doi.doi_fill_count * doi.doi_data_block_size;
+}
+
+static int
+ddt_object_lookup(ddt_t *ddt, enum ddt_type type, enum ddt_class class,
+ ddt_entry_t *dde)
+{
+ if (!ddt_object_exists(ddt, type, class))
+ return (SET_ERROR(ENOENT));
+
+ return (ddt_ops[type]->ddt_op_lookup(ddt->ddt_os,
+ ddt->ddt_object[type][class], dde));
+}
+
+static void
+ddt_object_prefetch(ddt_t *ddt, enum ddt_type type, enum ddt_class class,
+ ddt_entry_t *dde)
+{
+ if (!ddt_object_exists(ddt, type, class))
+ return;
+
+ ddt_ops[type]->ddt_op_prefetch(ddt->ddt_os,
+ ddt->ddt_object[type][class], dde);
+}
+
+int
+ddt_object_update(ddt_t *ddt, enum ddt_type type, enum ddt_class class,
+ ddt_entry_t *dde, dmu_tx_t *tx)
+{
+ ASSERT(ddt_object_exists(ddt, type, class));
+
+ return (ddt_ops[type]->ddt_op_update(ddt->ddt_os,
+ ddt->ddt_object[type][class], dde, tx));
+}
+
+static int
+ddt_object_remove(ddt_t *ddt, enum ddt_type type, enum ddt_class class,
+ ddt_entry_t *dde, dmu_tx_t *tx)
+{
+ ASSERT(ddt_object_exists(ddt, type, class));
+
+ return (ddt_ops[type]->ddt_op_remove(ddt->ddt_os,
+ ddt->ddt_object[type][class], dde, tx));
+}
+
+int
+ddt_object_walk(ddt_t *ddt, enum ddt_type type, enum ddt_class class,
+ uint64_t *walk, ddt_entry_t *dde)
+{
+ ASSERT(ddt_object_exists(ddt, type, class));
+
+ return (ddt_ops[type]->ddt_op_walk(ddt->ddt_os,
+ ddt->ddt_object[type][class], dde, walk));
+}
+
+int
+ddt_object_count(ddt_t *ddt, enum ddt_type type, enum ddt_class class,
+ uint64_t *count)
+{
+ ASSERT(ddt_object_exists(ddt, type, class));
+
+ return (ddt_ops[type]->ddt_op_count(ddt->ddt_os,
+ ddt->ddt_object[type][class], count));
+}
+
+int
+ddt_object_info(ddt_t *ddt, enum ddt_type type, enum ddt_class class,
+ dmu_object_info_t *doi)
+{
+ if (!ddt_object_exists(ddt, type, class))
+ return (SET_ERROR(ENOENT));
+
+ return (dmu_object_info(ddt->ddt_os, ddt->ddt_object[type][class],
+ doi));
+}
+
+boolean_t
+ddt_object_exists(ddt_t *ddt, enum ddt_type type, enum ddt_class class)
+{
+ return (!!ddt->ddt_object[type][class]);
+}
+
+void
+ddt_object_name(ddt_t *ddt, enum ddt_type type, enum ddt_class class,
+ char *name)
+{
+ (void) snprintf(name, DDT_NAMELEN, DMU_POOL_DDT,
+ zio_checksum_table[ddt->ddt_checksum].ci_name,
+ ddt_ops[type]->ddt_op_name, ddt_class_name[class]);
+}
+
+void
+ddt_bp_fill(const ddt_phys_t *ddp, blkptr_t *bp, uint64_t txg)
+{
+ ASSERT(txg != 0);
+
+ for (int d = 0; d < SPA_DVAS_PER_BP; d++)
+ bp->blk_dva[d] = ddp->ddp_dva[d];
+ BP_SET_BIRTH(bp, txg, ddp->ddp_phys_birth);
+}
+
+void
+ddt_bp_create(enum zio_checksum checksum,
+ const ddt_key_t *ddk, const ddt_phys_t *ddp, blkptr_t *bp)
+{
+ BP_ZERO(bp);
+
+ if (ddp != NULL)
+ ddt_bp_fill(ddp, bp, ddp->ddp_phys_birth);
+
+ bp->blk_cksum = ddk->ddk_cksum;
+ bp->blk_fill = 1;
+
+ BP_SET_LSIZE(bp, DDK_GET_LSIZE(ddk));
+ BP_SET_PSIZE(bp, DDK_GET_PSIZE(ddk));
+ BP_SET_COMPRESS(bp, DDK_GET_COMPRESS(ddk));
+ BP_SET_CHECKSUM(bp, checksum);
+ BP_SET_TYPE(bp, DMU_OT_DEDUP);
+ BP_SET_LEVEL(bp, 0);
+ BP_SET_DEDUP(bp, 0);
+ BP_SET_BYTEORDER(bp, ZFS_HOST_BYTEORDER);
+}
+
+void
+ddt_key_fill(ddt_key_t *ddk, const blkptr_t *bp)
+{
+ ddk->ddk_cksum = bp->blk_cksum;
+ ddk->ddk_prop = 0;
+
+ DDK_SET_LSIZE(ddk, BP_GET_LSIZE(bp));
+ DDK_SET_PSIZE(ddk, BP_GET_PSIZE(bp));
+ DDK_SET_COMPRESS(ddk, BP_GET_COMPRESS(bp));
+}
+
+void
+ddt_phys_fill(ddt_phys_t *ddp, const blkptr_t *bp)
+{
+ ASSERT(ddp->ddp_phys_birth == 0);
+
+ for (int d = 0; d < SPA_DVAS_PER_BP; d++)
+ ddp->ddp_dva[d] = bp->blk_dva[d];
+ ddp->ddp_phys_birth = BP_PHYSICAL_BIRTH(bp);
+}
+
+void
+ddt_phys_clear(ddt_phys_t *ddp)
+{
+ bzero(ddp, sizeof (*ddp));
+}
+
+void
+ddt_phys_addref(ddt_phys_t *ddp)
+{
+ ddp->ddp_refcnt++;
+}
+
+void
+ddt_phys_decref(ddt_phys_t *ddp)
+{
+ if (ddp) {
+ ASSERT((int64_t)ddp->ddp_refcnt > 0);
+ ddp->ddp_refcnt--;
+ }
+}
+
+void
+ddt_phys_free(ddt_t *ddt, ddt_key_t *ddk, ddt_phys_t *ddp, uint64_t txg)
+{
+ blkptr_t blk;
+
+ ddt_bp_create(ddt->ddt_checksum, ddk, ddp, &blk);
+ ddt_phys_clear(ddp);
+ zio_free(ddt->ddt_spa, txg, &blk);
+}
+
+ddt_phys_t *
+ddt_phys_select(const ddt_entry_t *dde, const blkptr_t *bp)
+{
+ ddt_phys_t *ddp = (ddt_phys_t *)dde->dde_phys;
+
+ for (int p = 0; p < DDT_PHYS_TYPES; p++, ddp++) {
+ if (DVA_EQUAL(BP_IDENTITY(bp), &ddp->ddp_dva[0]) &&
+ BP_PHYSICAL_BIRTH(bp) == ddp->ddp_phys_birth)
+ return (ddp);
+ }
+ return (NULL);
+}
+
+uint64_t
+ddt_phys_total_refcnt(const ddt_entry_t *dde)
+{
+ uint64_t refcnt = 0;
+
+ for (int p = DDT_PHYS_SINGLE; p <= DDT_PHYS_TRIPLE; p++)
+ refcnt += dde->dde_phys[p].ddp_refcnt;
+
+ return (refcnt);
+}
+
+static void
+ddt_stat_generate(ddt_t *ddt, ddt_entry_t *dde, ddt_stat_t *dds)
+{
+ spa_t *spa = ddt->ddt_spa;
+ ddt_phys_t *ddp = dde->dde_phys;
+ ddt_key_t *ddk = &dde->dde_key;
+ uint64_t lsize = DDK_GET_LSIZE(ddk);
+ uint64_t psize = DDK_GET_PSIZE(ddk);
+
+ bzero(dds, sizeof (*dds));
+
+ for (int p = 0; p < DDT_PHYS_TYPES; p++, ddp++) {
+ uint64_t dsize = 0;
+ uint64_t refcnt = ddp->ddp_refcnt;
+
+ if (ddp->ddp_phys_birth == 0)
+ continue;
+
+ for (int d = 0; d < SPA_DVAS_PER_BP; d++)
+ dsize += dva_get_dsize_sync(spa, &ddp->ddp_dva[d]);
+
+ dds->dds_blocks += 1;
+ dds->dds_lsize += lsize;
+ dds->dds_psize += psize;
+ dds->dds_dsize += dsize;
+
+ dds->dds_ref_blocks += refcnt;
+ dds->dds_ref_lsize += lsize * refcnt;
+ dds->dds_ref_psize += psize * refcnt;
+ dds->dds_ref_dsize += dsize * refcnt;
+ }
+}
+
+void
+ddt_stat_add(ddt_stat_t *dst, const ddt_stat_t *src, uint64_t neg)
+{
+ const uint64_t *s = (const uint64_t *)src;
+ uint64_t *d = (uint64_t *)dst;
+ uint64_t *d_end = (uint64_t *)(dst + 1);
+
+ ASSERT(neg == 0 || neg == -1ULL); /* add or subtract */
+
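+ /*
+ * For each 64-bit field, (x ^ neg) - neg yields x when neg == 0 and
+ * -x (two's complement negation) when neg == -1ULL, so the same
+ * loop either adds or subtracts the entire stat structure.
+ */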
+ while (d < d_end)
+ *d++ += (*s++ ^ neg) - neg;
+}
+
+static void
+ddt_stat_update(ddt_t *ddt, ddt_entry_t *dde, uint64_t neg)
+{
+ ddt_stat_t dds;
+ ddt_histogram_t *ddh;
+ int bucket;
+
+ ddt_stat_generate(ddt, dde, &dds);
+
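+ /*
+ * Bucket the entry by the log2 of its reference count: bucket n
+ * holds entries with 2^n through 2^(n+1) - 1 references, so e.g.
+ * an entry with 5 references lands in bucket 2.
+ */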
+ bucket = highbit64(dds.dds_ref_blocks) - 1;
+ ASSERT(bucket >= 0);
+
+ ddh = &ddt->ddt_histogram[dde->dde_type][dde->dde_class];
+
+ ddt_stat_add(&ddh->ddh_stat[bucket], &dds, neg);
+}
+
+void
+ddt_histogram_add(ddt_histogram_t *dst, const ddt_histogram_t *src)
+{
+ for (int h = 0; h < 64; h++)
+ ddt_stat_add(&dst->ddh_stat[h], &src->ddh_stat[h], 0);
+}
+
+void
+ddt_histogram_stat(ddt_stat_t *dds, const ddt_histogram_t *ddh)
+{
+ bzero(dds, sizeof (*dds));
+
+ for (int h = 0; h < 64; h++)
+ ddt_stat_add(dds, &ddh->ddh_stat[h], 0);
+}
+
+boolean_t
+ddt_histogram_empty(const ddt_histogram_t *ddh)
+{
+ const uint64_t *s = (const uint64_t *)ddh;
+ const uint64_t *s_end = (const uint64_t *)(ddh + 1);
+
+ while (s < s_end)
+ if (*s++ != 0)
+ return (B_FALSE);
+
+ return (B_TRUE);
+}
+
+void
+ddt_get_dedup_object_stats(spa_t *spa, ddt_object_t *ddo_total)
+{
+ /* Sum the statistics we cached in ddt_object_sync(). */
+ for (enum zio_checksum c = 0; c < ZIO_CHECKSUM_FUNCTIONS; c++) {
+ ddt_t *ddt = spa->spa_ddt[c];
+ for (enum ddt_type type = 0; type < DDT_TYPES; type++) {
+ for (enum ddt_class class = 0; class < DDT_CLASSES;
+ class++) {
+ ddt_object_t *ddo =
+ &ddt->ddt_object_stats[type][class];
+ ddo_total->ddo_count += ddo->ddo_count;
+ ddo_total->ddo_dspace += ddo->ddo_dspace;
+ ddo_total->ddo_mspace += ddo->ddo_mspace;
+ }
+ }
+ }
+
+ /* ... and compute the averages. */
+ if (ddo_total->ddo_count != 0) {
+ ddo_total->ddo_dspace /= ddo_total->ddo_count;
+ ddo_total->ddo_mspace /= ddo_total->ddo_count;
+ }
+}
+
+void
+ddt_get_dedup_histogram(spa_t *spa, ddt_histogram_t *ddh)
+{
+ for (enum zio_checksum c = 0; c < ZIO_CHECKSUM_FUNCTIONS; c++) {
+ ddt_t *ddt = spa->spa_ddt[c];
+ for (enum ddt_type type = 0; type < DDT_TYPES; type++) {
+ for (enum ddt_class class = 0; class < DDT_CLASSES;
+ class++) {
+ ddt_histogram_add(ddh,
+ &ddt->ddt_histogram_cache[type][class]);
+ }
+ }
+ }
+}
+
+void
+ddt_get_dedup_stats(spa_t *spa, ddt_stat_t *dds_total)
+{
+ ddt_histogram_t *ddh_total;
+
+ ddh_total = kmem_zalloc(sizeof (ddt_histogram_t), KM_SLEEP);
+ ddt_get_dedup_histogram(spa, ddh_total);
+ ddt_histogram_stat(dds_total, ddh_total);
+ kmem_free(ddh_total, sizeof (ddt_histogram_t));
+}
+
+uint64_t
+ddt_get_dedup_dspace(spa_t *spa)
+{
+ ddt_stat_t dds_total = { 0 };
+
+ ddt_get_dedup_stats(spa, &dds_total);
+ return (dds_total.dds_ref_dsize - dds_total.dds_dsize);
+}
+
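+/*
+ * Report the pool-wide dedup ratio scaled by 100: 100 means no dedup
+ * benefit, while e.g. 250 means referenced data is 2.5x the size of the
+ * data actually allocated on disk.
+ */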
+uint64_t
+ddt_get_pool_dedup_ratio(spa_t *spa)
+{
+ ddt_stat_t dds_total = { 0 };
+
+ ddt_get_dedup_stats(spa, &dds_total);
+ if (dds_total.dds_dsize == 0)
+ return (100);
+
+ return (dds_total.dds_ref_dsize * 100 / dds_total.dds_dsize);
+}
+
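+/*
+ * Compute how many additional "ditto" copies of this block we should make.
+ * With a spa_dedup_ditto threshold of, say, 100 (an illustrative value,
+ * not one defined in this file), an entry wants a second copy once it
+ * reaches 100 references and a third at 100 * 100 references.
+ */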
+int
+ddt_ditto_copies_needed(ddt_t *ddt, ddt_entry_t *dde, ddt_phys_t *ddp_willref)
+{
+ spa_t *spa = ddt->ddt_spa;
+ uint64_t total_refcnt = 0;
+ uint64_t ditto = spa->spa_dedup_ditto;
+ int total_copies = 0;
+ int desired_copies = 0;
+
+ for (int p = DDT_PHYS_SINGLE; p <= DDT_PHYS_TRIPLE; p++) {
+ ddt_phys_t *ddp = &dde->dde_phys[p];
+ zio_t *zio = dde->dde_lead_zio[p];
+ uint64_t refcnt = ddp->ddp_refcnt; /* committed refs */
+ if (zio != NULL)
+ refcnt += zio->io_parent_count; /* pending refs */
+ if (ddp == ddp_willref)
+ refcnt++; /* caller's ref */
+ if (refcnt != 0) {
+ total_refcnt += refcnt;
+ total_copies += p;
+ }
+ }
+
+ if (ditto == 0 || ditto > UINT32_MAX)
+ ditto = UINT32_MAX;
+
+ if (total_refcnt >= 1)
+ desired_copies++;
+ if (total_refcnt >= ditto)
+ desired_copies++;
+ if (total_refcnt >= ditto * ditto)
+ desired_copies++;
+
+ return (MAX(desired_copies, total_copies) - total_copies);
+}
+
+int
+ddt_ditto_copies_present(ddt_entry_t *dde)
+{
+ ddt_phys_t *ddp = &dde->dde_phys[DDT_PHYS_DITTO];
+ dva_t *dva = ddp->ddp_dva;
+ int copies = 0 - DVA_GET_GANG(dva);
+
+ for (int d = 0; d < SPA_DVAS_PER_BP; d++, dva++)
+ if (DVA_IS_VALID(dva))
+ copies++;
+
+ ASSERT(copies >= 0 && copies < SPA_DVAS_PER_BP);
+
+ return (copies);
+}
+
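+/*
+ * Compress a dde_phys array into dst. The first output byte records the
+ * compression function used, with DDT_COMPRESS_BYTEORDER_MASK or'd in to
+ * note the host byte order, so ddt_decompress() can select the matching
+ * decompressor and byteswap if the byte orders differ. ZLE is tried
+ * first, presumably because dde_phys is mostly zeros, and we fall back
+ * to no compression when nothing is saved.
+ */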
+size_t
+ddt_compress(void *src, uchar_t *dst, size_t s_len, size_t d_len)
+{
+ uchar_t *version = dst++;
+ int cpfunc = ZIO_COMPRESS_ZLE;
+ zio_compress_info_t *ci = &zio_compress_table[cpfunc];
+ size_t c_len;
+
+ ASSERT(d_len >= s_len + 1); /* no compression plus version byte */
+
+ c_len = ci->ci_compress(src, dst, s_len, d_len - 1, ci->ci_level);
+
+ if (c_len == s_len) {
+ cpfunc = ZIO_COMPRESS_OFF;
+ bcopy(src, dst, s_len);
+ }
+
+ *version = cpfunc;
+ /* CONSTCOND */
+ if (ZFS_HOST_BYTEORDER)
+ *version |= DDT_COMPRESS_BYTEORDER_MASK;
+
+ return (c_len + 1);
+}
+
+void
+ddt_decompress(uchar_t *src, void *dst, size_t s_len, size_t d_len)
+{
+ uchar_t version = *src++;
+ int cpfunc = version & DDT_COMPRESS_FUNCTION_MASK;
+ zio_compress_info_t *ci = &zio_compress_table[cpfunc];
+
+ if (ci->ci_decompress != NULL)
+ (void) ci->ci_decompress(src, dst, s_len, d_len, ci->ci_level);
+ else
+ bcopy(src, dst, d_len);
+
+ if (((version & DDT_COMPRESS_BYTEORDER_MASK) != 0) !=
+ (ZFS_HOST_BYTEORDER != 0))
+ byteswap_uint64_array(dst, d_len);
+}
+
+ddt_t *
+ddt_select_by_checksum(spa_t *spa, enum zio_checksum c)
+{
+ return (spa->spa_ddt[c]);
+}
+
+ddt_t *
+ddt_select(spa_t *spa, const blkptr_t *bp)
+{
+ return (spa->spa_ddt[BP_GET_CHECKSUM(bp)]);
+}
+
+void
+ddt_enter(ddt_t *ddt)
+{
+ mutex_enter(&ddt->ddt_lock);
+}
+
+void
+ddt_exit(ddt_t *ddt)
+{
+ mutex_exit(&ddt->ddt_lock);
+}
+
+static ddt_entry_t *
+ddt_alloc(const ddt_key_t *ddk)
+{
+ ddt_entry_t *dde;
+
+ dde = kmem_zalloc(sizeof (ddt_entry_t), KM_SLEEP);
+ cv_init(&dde->dde_cv, NULL, CV_DEFAULT, NULL);
+
+ dde->dde_key = *ddk;
+
+ return (dde);
+}
+
+static void
+ddt_free(ddt_entry_t *dde)
+{
+ ASSERT(!dde->dde_loading);
+
+ for (int p = 0; p < DDT_PHYS_TYPES; p++)
+ ASSERT(dde->dde_lead_zio[p] == NULL);
+
+ if (dde->dde_repair_abd != NULL)
+ abd_free(dde->dde_repair_abd);
+
+ cv_destroy(&dde->dde_cv);
+ kmem_free(dde, sizeof (*dde));
+}
+
+void
+ddt_remove(ddt_t *ddt, ddt_entry_t *dde)
+{
+ ASSERT(MUTEX_HELD(&ddt->ddt_lock));
+
+ avl_remove(&ddt->ddt_tree, dde);
+ ddt_free(dde);
+}
+
+ddt_entry_t *
+ddt_lookup(ddt_t *ddt, const blkptr_t *bp, boolean_t add)
+{
+ ddt_entry_t *dde, dde_search;
+ enum ddt_type type;
+ enum ddt_class class;
+ avl_index_t where;
+ int error;
+
+ ASSERT(MUTEX_HELD(&ddt->ddt_lock));
+
+ ddt_key_fill(&dde_search.dde_key, bp);
+
+ dde = avl_find(&ddt->ddt_tree, &dde_search, &where);
+ if (dde == NULL) {
+ if (!add)
+ return (NULL);
+ dde = ddt_alloc(&dde_search.dde_key);
+ avl_insert(&ddt->ddt_tree, dde, where);
+ }
+
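+ /*
+ * Another thread may already be loading this entry from the
+ * on-disk tables; wait for it to finish rather than issuing a
+ * duplicate lookup.
+ */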
+ while (dde->dde_loading)
+ cv_wait(&dde->dde_cv, &ddt->ddt_lock);
+
+ if (dde->dde_loaded)
+ return (dde);
+
+ dde->dde_loading = B_TRUE;
+
+ ddt_exit(ddt);
+
+ error = ENOENT;
+
+ for (type = 0; type < DDT_TYPES; type++) {
+ for (class = 0; class < DDT_CLASSES; class++) {
+ error = ddt_object_lookup(ddt, type, class, dde);
+ if (error != ENOENT) {
+ ASSERT0(error);
+ break;
+ }
+ }
+ if (error != ENOENT)
+ break;
+ }
+
+ ddt_enter(ddt);
+
+ ASSERT(dde->dde_loaded == B_FALSE);
+ ASSERT(dde->dde_loading == B_TRUE);
+
+ dde->dde_type = type; /* will be DDT_TYPES if no entry found */
+ dde->dde_class = class; /* will be DDT_CLASSES if no entry found */
+ dde->dde_loaded = B_TRUE;
+ dde->dde_loading = B_FALSE;
+
+ if (error == 0)
+ ddt_stat_update(ddt, dde, -1ULL);
+
+ cv_broadcast(&dde->dde_cv);
+
+ return (dde);
+}
+
+void
+ddt_prefetch(spa_t *spa, const blkptr_t *bp)
+{
+ ddt_t *ddt;
+ ddt_entry_t dde;
+
+ if (!zfs_dedup_prefetch || bp == NULL || !BP_GET_DEDUP(bp))
+ return;
+
+ /*
+ * We only remove the DDT once all tables are empty and only
+ * prefetch dedup blocks when there are entries in the DDT.
+ * Thus no locking is required as the DDT can't disappear on us.
+ */
+ ddt = ddt_select(spa, bp);
+ ddt_key_fill(&dde.dde_key, bp);
+
+ for (enum ddt_type type = 0; type < DDT_TYPES; type++) {
+ for (enum ddt_class class = 0; class < DDT_CLASSES; class++) {
+ ddt_object_prefetch(ddt, type, class, &dde);
+ }
+ }
+}
+
+/*
+ * Opaque struct used for ddt_key comparison
+ */
+#define DDT_KEY_CMP_LEN (sizeof (ddt_key_t) / sizeof (uint16_t))
+
+typedef struct ddt_key_cmp {
+ uint16_t u16[DDT_KEY_CMP_LEN];
+} ddt_key_cmp_t;
+
+int
+ddt_entry_compare(const void *x1, const void *x2)
+{
+ const ddt_entry_t *dde1 = x1;
+ const ddt_entry_t *dde2 = x2;
+ const ddt_key_cmp_t *k1 = (const ddt_key_cmp_t *)&dde1->dde_key;
+ const ddt_key_cmp_t *k2 = (const ddt_key_cmp_t *)&dde2->dde_key;
+ int32_t cmp = 0;
+
+ for (int i = 0; i < DDT_KEY_CMP_LEN; i++) {
+ cmp = (int32_t)k1->u16[i] - (int32_t)k2->u16[i];
+ if (likely(cmp))
+ break;
+ }
+
+ return (AVL_ISIGN(cmp));
+}
+
+static ddt_t *
+ddt_table_alloc(spa_t *spa, enum zio_checksum c)
+{
+ ddt_t *ddt;
+
+ ddt = kmem_zalloc(sizeof (*ddt), KM_SLEEP);
+
+ mutex_init(&ddt->ddt_lock, NULL, MUTEX_DEFAULT, NULL);
+ avl_create(&ddt->ddt_tree, ddt_entry_compare,
+ sizeof (ddt_entry_t), offsetof(ddt_entry_t, dde_node));
+ avl_create(&ddt->ddt_repair_tree, ddt_entry_compare,
+ sizeof (ddt_entry_t), offsetof(ddt_entry_t, dde_node));
+ ddt->ddt_checksum = c;
+ ddt->ddt_spa = spa;
+ ddt->ddt_os = spa->spa_meta_objset;
+
+ return (ddt);
+}
+
+static void
+ddt_table_free(ddt_t *ddt)
+{
+ ASSERT(avl_numnodes(&ddt->ddt_tree) == 0);
+ ASSERT(avl_numnodes(&ddt->ddt_repair_tree) == 0);
+ avl_destroy(&ddt->ddt_tree);
+ avl_destroy(&ddt->ddt_repair_tree);
+ mutex_destroy(&ddt->ddt_lock);
+ kmem_free(ddt, sizeof (*ddt));
+}
+
+void
+ddt_create(spa_t *spa)
+{
+ spa->spa_dedup_checksum = ZIO_DEDUPCHECKSUM;
+
+ for (enum zio_checksum c = 0; c < ZIO_CHECKSUM_FUNCTIONS; c++)
+ spa->spa_ddt[c] = ddt_table_alloc(spa, c);
+}
+
+int
+ddt_load(spa_t *spa)
+{
+ int error;
+
+ ddt_create(spa);
+
+ error = zap_lookup(spa->spa_meta_objset, DMU_POOL_DIRECTORY_OBJECT,
+ DMU_POOL_DDT_STATS, sizeof (uint64_t), 1,
+ &spa->spa_ddt_stat_object);
+
+ if (error)
+ return (error == ENOENT ? 0 : error);
+
+ for (enum zio_checksum c = 0; c < ZIO_CHECKSUM_FUNCTIONS; c++) {
+ ddt_t *ddt = spa->spa_ddt[c];
+ for (enum ddt_type type = 0; type < DDT_TYPES; type++) {
+ for (enum ddt_class class = 0; class < DDT_CLASSES;
+ class++) {
+ error = ddt_object_load(ddt, type, class);
+ if (error != 0 && error != ENOENT)
+ return (error);
+ }
+ }
+
+ /*
+ * Seed the cached histograms.
+ */
+ bcopy(ddt->ddt_histogram, &ddt->ddt_histogram_cache,
+ sizeof (ddt->ddt_histogram));
+ }
+
+ return (0);
+}
+
+void
+ddt_unload(spa_t *spa)
+{
+ for (enum zio_checksum c = 0; c < ZIO_CHECKSUM_FUNCTIONS; c++) {
+ if (spa->spa_ddt[c]) {
+ ddt_table_free(spa->spa_ddt[c]);
+ spa->spa_ddt[c] = NULL;
+ }
+ }
+}
+
+boolean_t
+ddt_class_contains(spa_t *spa, enum ddt_class max_class, const blkptr_t *bp)
+{
+ ddt_t *ddt;
+ ddt_entry_t dde;
+
+ if (!BP_GET_DEDUP(bp))
+ return (B_FALSE);
+
+ if (max_class == DDT_CLASS_UNIQUE)
+ return (B_TRUE);
+
+ ddt = spa->spa_ddt[BP_GET_CHECKSUM(bp)];
+
+ ddt_key_fill(&dde.dde_key, bp);
+
+ for (enum ddt_type type = 0; type < DDT_TYPES; type++)
+ for (enum ddt_class class = 0; class <= max_class; class++)
+ if (ddt_object_lookup(ddt, type, class, &dde) == 0)
+ return (B_TRUE);
+
+ return (B_FALSE);
+}
+
+ddt_entry_t *
+ddt_repair_start(ddt_t *ddt, const blkptr_t *bp)
+{
+ ddt_key_t ddk;
+ ddt_entry_t *dde;
+
+ ddt_key_fill(&ddk, bp);
+
+ dde = ddt_alloc(&ddk);
+
+ for (enum ddt_type type = 0; type < DDT_TYPES; type++) {
+ for (enum ddt_class class = 0; class < DDT_CLASSES; class++) {
+ /*
+ * We can only do repair if there are multiple copies
+ * of the block. For anything in the UNIQUE class,
+ * there's definitely only one copy, so don't even try.
+ */
+ if (class != DDT_CLASS_UNIQUE &&
+ ddt_object_lookup(ddt, type, class, dde) == 0)
+ return (dde);
+ }
+ }
+
+ bzero(dde->dde_phys, sizeof (dde->dde_phys));
+
+ return (dde);
+}
+
+void
+ddt_repair_done(ddt_t *ddt, ddt_entry_t *dde)
+{
+ avl_index_t where;
+
+ ddt_enter(ddt);
+
+ if (dde->dde_repair_abd != NULL && spa_writeable(ddt->ddt_spa) &&
+ avl_find(&ddt->ddt_repair_tree, dde, &where) == NULL)
+ avl_insert(&ddt->ddt_repair_tree, dde, where);
+ else
+ ddt_free(dde);
+
+ ddt_exit(ddt);
+}
+
+static void
+ddt_repair_entry_done(zio_t *zio)
+{
+ ddt_entry_t *rdde = zio->io_private;
+
+ ddt_free(rdde);
+}
+
+static void
+ddt_repair_entry(ddt_t *ddt, ddt_entry_t *dde, ddt_entry_t *rdde, zio_t *rio)
+{
+ ddt_phys_t *ddp = dde->dde_phys;
+ ddt_phys_t *rddp = rdde->dde_phys;
+ ddt_key_t *ddk = &dde->dde_key;
+ ddt_key_t *rddk = &rdde->dde_key;
+ zio_t *zio;
+ blkptr_t blk;
+
+ zio = zio_null(rio, rio->io_spa, NULL,
+ ddt_repair_entry_done, rdde, rio->io_flags);
+
+ for (int p = 0; p < DDT_PHYS_TYPES; p++, ddp++, rddp++) {
+ if (ddp->ddp_phys_birth == 0 ||
+ ddp->ddp_phys_birth != rddp->ddp_phys_birth ||
+ bcmp(ddp->ddp_dva, rddp->ddp_dva, sizeof (ddp->ddp_dva)))
+ continue;
+ ddt_bp_create(ddt->ddt_checksum, ddk, ddp, &blk);
+ zio_nowait(zio_rewrite(zio, zio->io_spa, 0, &blk,
+ rdde->dde_repair_abd, DDK_GET_PSIZE(rddk), NULL, NULL,
+ ZIO_PRIORITY_SYNC_WRITE, ZIO_DDT_CHILD_FLAGS(zio), NULL));
+ }
+
+ zio_nowait(zio);
+}
+
+static void
+ddt_repair_table(ddt_t *ddt, zio_t *rio)
+{
+ spa_t *spa = ddt->ddt_spa;
+ ddt_entry_t *dde, *rdde_next, *rdde;
+ avl_tree_t *t = &ddt->ddt_repair_tree;
+ blkptr_t blk;
+
+ if (spa_sync_pass(spa) > 1)
+ return;
+
+ ddt_enter(ddt);
+ for (rdde = avl_first(t); rdde != NULL; rdde = rdde_next) {
+ rdde_next = AVL_NEXT(t, rdde);
+ avl_remove(&ddt->ddt_repair_tree, rdde);
+ ddt_exit(ddt);
+ ddt_bp_create(ddt->ddt_checksum, &rdde->dde_key, NULL, &blk);
+ dde = ddt_repair_start(ddt, &blk);
+ ddt_repair_entry(ddt, dde, rdde, rio);
+ ddt_repair_done(ddt, dde);
+ ddt_enter(ddt);
+ }
+ ddt_exit(ddt);
+}
+
+static void
+ddt_sync_entry(ddt_t *ddt, ddt_entry_t *dde, dmu_tx_t *tx, uint64_t txg)
+{
+ dsl_pool_t *dp = ddt->ddt_spa->spa_dsl_pool;
+ ddt_phys_t *ddp = dde->dde_phys;
+ ddt_key_t *ddk = &dde->dde_key;
+ enum ddt_type otype = dde->dde_type;
+ enum ddt_type ntype = DDT_TYPE_CURRENT;
+ enum ddt_class oclass = dde->dde_class;
+ enum ddt_class nclass;
+ uint64_t total_refcnt = 0;
+
+ ASSERT(dde->dde_loaded);
+ ASSERT(!dde->dde_loading);
+
+ for (int p = 0; p < DDT_PHYS_TYPES; p++, ddp++) {
+ ASSERT(dde->dde_lead_zio[p] == NULL);
+ ASSERT((int64_t)ddp->ddp_refcnt >= 0);
+ if (ddp->ddp_phys_birth == 0) {
+ ASSERT(ddp->ddp_refcnt == 0);
+ continue;
+ }
+ if (p == DDT_PHYS_DITTO) {
+ if (ddt_ditto_copies_needed(ddt, dde, NULL) == 0)
+ ddt_phys_free(ddt, ddk, ddp, txg);
+ continue;
+ }
+ if (ddp->ddp_refcnt == 0)
+ ddt_phys_free(ddt, ddk, ddp, txg);
+ total_refcnt += ddp->ddp_refcnt;
+ }
+
+ if (dde->dde_phys[DDT_PHYS_DITTO].ddp_phys_birth != 0)
+ nclass = DDT_CLASS_DITTO;
+ else if (total_refcnt > 1)
+ nclass = DDT_CLASS_DUPLICATE;
+ else
+ nclass = DDT_CLASS_UNIQUE;
+
+ if (otype != DDT_TYPES &&
+ (otype != ntype || oclass != nclass || total_refcnt == 0)) {
+ VERIFY(ddt_object_remove(ddt, otype, oclass, dde, tx) == 0);
+ ASSERT(ddt_object_lookup(ddt, otype, oclass, dde) == ENOENT);
+ }
+
+ if (total_refcnt != 0) {
+ dde->dde_type = ntype;
+ dde->dde_class = nclass;
+ ddt_stat_update(ddt, dde, 0);
+ if (!ddt_object_exists(ddt, ntype, nclass))
+ ddt_object_create(ddt, ntype, nclass, tx);
+ VERIFY(ddt_object_update(ddt, ntype, nclass, dde, tx) == 0);
+
+ /*
+ * If the class changes, the order that we scan this bp
+ * changes. If it decreases, we could miss it, so
+ * scan it right now. (This covers both class changing
+ * while we are doing ddt_walk(), and when we are
+ * traversing.)
+ */
+ if (nclass < oclass) {
+ dsl_scan_ddt_entry(dp->dp_scan,
+ ddt->ddt_checksum, dde, tx);
+ }
+ }
+}
+
+static void
+ddt_sync_table(ddt_t *ddt, dmu_tx_t *tx, uint64_t txg)
+{
+ spa_t *spa = ddt->ddt_spa;
+ ddt_entry_t *dde;
+ void *cookie = NULL;
+
+ if (avl_numnodes(&ddt->ddt_tree) == 0)
+ return;
+
+ ASSERT(spa->spa_uberblock.ub_version >= SPA_VERSION_DEDUP);
+
+ if (spa->spa_ddt_stat_object == 0) {
+ spa->spa_ddt_stat_object = zap_create_link(ddt->ddt_os,
+ DMU_OT_DDT_STATS, DMU_POOL_DIRECTORY_OBJECT,
+ DMU_POOL_DDT_STATS, tx);
+ }
+
+ while ((dde = avl_destroy_nodes(&ddt->ddt_tree, &cookie)) != NULL) {
+ ddt_sync_entry(ddt, dde, tx, txg);
+ ddt_free(dde);
+ }
+
+ for (enum ddt_type type = 0; type < DDT_TYPES; type++) {
+ uint64_t add, count = 0;
+ for (enum ddt_class class = 0; class < DDT_CLASSES; class++) {
+ if (ddt_object_exists(ddt, type, class)) {
+ ddt_object_sync(ddt, type, class, tx);
+ VERIFY(ddt_object_count(ddt, type, class,
+ &add) == 0);
+ count += add;
+ }
+ }
+ for (enum ddt_class class = 0; class < DDT_CLASSES; class++) {
+ if (count == 0 && ddt_object_exists(ddt, type, class))
+ ddt_object_destroy(ddt, type, class, tx);
+ }
+ }
+
+ bcopy(ddt->ddt_histogram, &ddt->ddt_histogram_cache,
+ sizeof (ddt->ddt_histogram));
+}
+
+void
+ddt_sync(spa_t *spa, uint64_t txg)
+{
+ dsl_scan_t *scn = spa->spa_dsl_pool->dp_scan;
+ dmu_tx_t *tx;
+ zio_t *rio;
+
+ ASSERT(spa_syncing_txg(spa) == txg);
+
+ tx = dmu_tx_create_assigned(spa->spa_dsl_pool, txg);
+
+ rio = zio_root(spa, NULL, NULL,
+ ZIO_FLAG_CANFAIL | ZIO_FLAG_SPECULATIVE | ZIO_FLAG_SELF_HEAL);
+
+ /*
+ * This function may cause an immediate scan of ddt blocks (see
+ * the comment above dsl_scan_ddt() for details). We set the
+ * scan's root zio here so that we can wait for any scan IOs in
+ * addition to the regular ddt IOs.
+ */
+ ASSERT3P(scn->scn_zio_root, ==, NULL);
+ scn->scn_zio_root = rio;
+
+ for (enum zio_checksum c = 0; c < ZIO_CHECKSUM_FUNCTIONS; c++) {
+ ddt_t *ddt = spa->spa_ddt[c];
+ if (ddt == NULL)
+ continue;
+ ddt_sync_table(ddt, tx, txg);
+ ddt_repair_table(ddt, rio);
+ }
+
+ (void) zio_wait(rio);
+ scn->scn_zio_root = NULL;
+
+ dmu_tx_commit(tx);
+}
+
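+/*
+ * Walk every DDT entry across all checksum functions, object types and
+ * classes. Callers loop until ENOENT; an illustrative sketch (process()
+ * is a placeholder, not a function defined here):
+ *
+ *     ddt_bookmark_t ddb = { 0 };
+ *     ddt_entry_t dde;
+ *     while (ddt_walk(spa, &ddb, &dde) == 0)
+ *             process(&dde);
+ */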
+int
+ddt_walk(spa_t *spa, ddt_bookmark_t *ddb, ddt_entry_t *dde)
+{
+ do {
+ do {
+ do {
+ ddt_t *ddt = spa->spa_ddt[ddb->ddb_checksum];
+ int error = ENOENT;
+ if (ddt_object_exists(ddt, ddb->ddb_type,
+ ddb->ddb_class)) {
+ error = ddt_object_walk(ddt,
+ ddb->ddb_type, ddb->ddb_class,
+ &ddb->ddb_cursor, dde);
+ }
+ dde->dde_type = ddb->ddb_type;
+ dde->dde_class = ddb->ddb_class;
+ if (error == 0)
+ return (0);
+ if (error != ENOENT)
+ return (error);
+ ddb->ddb_cursor = 0;
+ } while (++ddb->ddb_checksum < ZIO_CHECKSUM_FUNCTIONS);
+ ddb->ddb_checksum = 0;
+ } while (++ddb->ddb_type < DDT_TYPES);
+ ddb->ddb_type = 0;
+ } while (++ddb->ddb_class < DDT_CLASSES);
+
+ return (SET_ERROR(ENOENT));
+}
diff --git a/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/ddt_zap.c b/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/ddt_zap.c
new file mode 100644
index 000000000000..b2202fb91531
--- /dev/null
+++ b/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/ddt_zap.c
@@ -0,0 +1,165 @@
+/*
+ * CDDL HEADER START
+ *
+ * The contents of this file are subject to the terms of the
+ * Common Development and Distribution License (the "License").
+ * You may not use this file except in compliance with the License.
+ *
+ * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
+ * or http://www.opensolaris.org/os/licensing.
+ * See the License for the specific language governing permissions
+ * and limitations under the License.
+ *
+ * When distributing Covered Code, include this CDDL HEADER in each
+ * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
+ * If applicable, add the following below this CDDL HEADER, with the
+ * fields enclosed by brackets "[]" replaced with your own identifying
+ * information: Portions Copyright [yyyy] [name of copyright owner]
+ *
+ * CDDL HEADER END
+ */
+
+/*
+ * Copyright (c) 2009, 2010, Oracle and/or its affiliates. All rights reserved.
+ * Copyright (c) 2018 by Delphix. All rights reserved.
+ */
+
+#include <sys/zfs_context.h>
+#include <sys/spa.h>
+#include <sys/zio.h>
+#include <sys/ddt.h>
+#include <sys/zap.h>
+#include <sys/dmu_tx.h>
+
+int ddt_zap_leaf_blockshift = 12;
+int ddt_zap_indirect_blockshift = 12;
+
+static int
+ddt_zap_create(objset_t *os, uint64_t *objectp, dmu_tx_t *tx, boolean_t prehash)
+{
+ zap_flags_t flags = ZAP_FLAG_HASH64 | ZAP_FLAG_UINT64_KEY;
+
+ if (prehash)
+ flags |= ZAP_FLAG_PRE_HASHED_KEY;
+
+ *objectp = zap_create_flags(os, 0, flags, DMU_OT_DDT_ZAP,
+ ddt_zap_leaf_blockshift, ddt_zap_indirect_blockshift,
+ DMU_OT_NONE, 0, tx);
+
+ return (*objectp == 0 ? ENOTSUP : 0);
+}
+
+static int
+ddt_zap_destroy(objset_t *os, uint64_t object, dmu_tx_t *tx)
+{
+ return (zap_destroy(os, object, tx));
+}
+
+static int
+ddt_zap_lookup(objset_t *os, uint64_t object, ddt_entry_t *dde)
+{
+ uchar_t cbuf[sizeof (dde->dde_phys) + 1];
+ uint64_t one, csize;
+ int error;
+
+ error = zap_length_uint64(os, object, (uint64_t *)&dde->dde_key,
+ DDT_KEY_WORDS, &one, &csize);
+ if (error)
+ return (error);
+
+ ASSERT(one == 1);
+ ASSERT(csize <= sizeof (cbuf));
+
+ error = zap_lookup_uint64(os, object, (uint64_t *)&dde->dde_key,
+ DDT_KEY_WORDS, 1, csize, cbuf);
+ if (error)
+ return (error);
+
+ ddt_decompress(cbuf, dde->dde_phys, csize, sizeof (dde->dde_phys));
+
+ return (0);
+}
+
+static void
+ddt_zap_prefetch(objset_t *os, uint64_t object, ddt_entry_t *dde)
+{
+ (void) zap_prefetch_uint64(os, object, (uint64_t *)&dde->dde_key,
+ DDT_KEY_WORDS);
+}
+
+static int
+ddt_zap_update(objset_t *os, uint64_t object, ddt_entry_t *dde, dmu_tx_t *tx)
+{
+ uchar_t cbuf[sizeof (dde->dde_phys) + 1];
+ uint64_t csize;
+
+ csize = ddt_compress(dde->dde_phys, cbuf,
+ sizeof (dde->dde_phys), sizeof (cbuf));
+
+ return (zap_update_uint64(os, object, (uint64_t *)&dde->dde_key,
+ DDT_KEY_WORDS, 1, csize, cbuf, tx));
+}
+
+static int
+ddt_zap_remove(objset_t *os, uint64_t object, ddt_entry_t *dde, dmu_tx_t *tx)
+{
+ return (zap_remove_uint64(os, object, (uint64_t *)&dde->dde_key,
+ DDT_KEY_WORDS, tx));
+}
+
+static int
+ddt_zap_walk(objset_t *os, uint64_t object, ddt_entry_t *dde, uint64_t *walk)
+{
+ zap_cursor_t zc;
+ zap_attribute_t za;
+ int error;
+
+ if (*walk == 0) {
+ /*
+ * We don't want to prefetch the entire ZAP object, because
+ * it can be enormous. Also the primary use of DDT iteration
+ * is for scrubbing, in which case we will be issuing many
+ * scrub i/os for each ZAP block that we read in, so
+ * reading the ZAP is unlikely to be the bottleneck.
+ */
+ zap_cursor_init_noprefetch(&zc, os, object);
+ } else {
+ zap_cursor_init_serialized(&zc, os, object, *walk);
+ }
+ if ((error = zap_cursor_retrieve(&zc, &za)) == 0) {
+ uchar_t cbuf[sizeof (dde->dde_phys) + 1];
+ uint64_t csize = za.za_num_integers;
+ ASSERT(za.za_integer_length == 1);
+ error = zap_lookup_uint64(os, object, (uint64_t *)za.za_name,
+ DDT_KEY_WORDS, 1, csize, cbuf);
+ ASSERT(error == 0);
+ if (error == 0) {
+ ddt_decompress(cbuf, dde->dde_phys, csize,
+ sizeof (dde->dde_phys));
+ dde->dde_key = *(ddt_key_t *)za.za_name;
+ }
+ zap_cursor_advance(&zc);
+ *walk = zap_cursor_serialize(&zc);
+ }
+ zap_cursor_fini(&zc);
+ return (error);
+}
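+
+/*
+ * A hedged sketch of the cursor contract above (illustrative only): the
+ * caller owns a 64-bit cookie that starts at zero and is otherwise opaque,
+ * so a walk can be suspended and resumed across calls:
+ *
+ *	uint64_t walk = 0;
+ *	ddt_entry_t dde;
+ *	int err;
+ *	while ((err = ddt_zap_walk(os, object, &dde, &walk)) == 0)
+ *		... consume dde ...
+ *	(err == ENOENT indicates the walk completed normally.)
+ *
+ * In practice this function is reached through the ddt_zap_ops table
+ * below rather than called directly.
+ */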
+
+static int
+ddt_zap_count(objset_t *os, uint64_t object, uint64_t *count)
+{
+	return (zap_count(os, object, count));
+}
+
+const ddt_ops_t ddt_zap_ops = {
+ "zap",
+ ddt_zap_create,
+ ddt_zap_destroy,
+ ddt_zap_lookup,
+ ddt_zap_prefetch,
+ ddt_zap_update,
+ ddt_zap_remove,
+ ddt_zap_walk,
+ ddt_zap_count,
+};
diff --git a/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/dmu.c b/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/dmu.c
new file mode 100644
index 000000000000..59e551e75d43
--- /dev/null
+++ b/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/dmu.c
@@ -0,0 +1,2748 @@
+/*
+ * CDDL HEADER START
+ *
+ * The contents of this file are subject to the terms of the
+ * Common Development and Distribution License (the "License").
+ * You may not use this file except in compliance with the License.
+ *
+ * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
+ * or http://www.opensolaris.org/os/licensing.
+ * See the License for the specific language governing permissions
+ * and limitations under the License.
+ *
+ * When distributing Covered Code, include this CDDL HEADER in each
+ * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
+ * If applicable, add the following below this CDDL HEADER, with the
+ * fields enclosed by brackets "[]" replaced with your own identifying
+ * information: Portions Copyright [yyyy] [name of copyright owner]
+ *
+ * CDDL HEADER END
+ */
+/*
+ * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved.
+ * Copyright (c) 2011, 2017 by Delphix. All rights reserved.
+ * Copyright (c) 2019 Datto Inc.
+ */
+/* Copyright (c) 2013 by Saso Kiselkov. All rights reserved. */
+/* Copyright (c) 2013, Joyent, Inc. All rights reserved. */
+/* Copyright 2016 Nexenta Systems, Inc. All rights reserved. */
+
+#include <sys/dmu.h>
+#include <sys/dmu_impl.h>
+#include <sys/dmu_tx.h>
+#include <sys/dbuf.h>
+#include <sys/dnode.h>
+#include <sys/zfs_context.h>
+#include <sys/dmu_objset.h>
+#include <sys/dmu_traverse.h>
+#include <sys/dsl_dataset.h>
+#include <sys/dsl_dir.h>
+#include <sys/dsl_pool.h>
+#include <sys/dsl_synctask.h>
+#include <sys/dsl_prop.h>
+#include <sys/dmu_zfetch.h>
+#include <sys/zfs_ioctl.h>
+#include <sys/zap.h>
+#include <sys/zio_checksum.h>
+#include <sys/zio_compress.h>
+#include <sys/sa.h>
+#include <sys/zfeature.h>
+#include <sys/abd.h>
+#ifdef _KERNEL
+#include <sys/racct.h>
+#include <sys/vm.h>
+#include <sys/zfs_znode.h>
+#endif
+
+/*
+ * Enable/disable nopwrite feature.
+ */
+int zfs_nopwrite_enabled = 1;
+SYSCTL_DECL(_vfs_zfs);
+SYSCTL_INT(_vfs_zfs, OID_AUTO, nopwrite_enabled, CTLFLAG_RDTUN,
+ &zfs_nopwrite_enabled, 0, "Enable nopwrite feature");
+
+/*
+ * Tunable to control percentage of dirtied L1 blocks from frees allowed into
+ * one TXG. After this threshold is crossed, additional dirty blocks from frees
+ * will wait until the next TXG.
+ * A value of zero will disable this throttle.
+ */
+uint32_t zfs_per_txg_dirty_frees_percent = 5;
+SYSCTL_INT(_vfs_zfs, OID_AUTO, per_txg_dirty_frees_percent, CTLFLAG_RWTUN,
+ &zfs_per_txg_dirty_frees_percent, 0,
+ "Percentage of dirtied indirect blocks from frees allowed in one txg");
+
+/*
+ * This can be used for testing, to ensure that certain actions happen
+ * while in the middle of a remap (which might otherwise complete too
+ * quickly).
+ */
+int zfs_object_remap_one_indirect_delay_ticks = 0;
+
+/*
+ * Limit the amount we can prefetch with one call to this amount. This
+ * helps to limit the amount of memory that can be used by prefetching.
+ * Larger objects should be prefetched a bit at a time.
+ */
+uint64_t dmu_prefetch_max = 8 * SPA_MAXBLOCKSIZE;
+
+const dmu_object_type_info_t dmu_ot[DMU_OT_NUMTYPES] = {
+ { DMU_BSWAP_UINT8, TRUE, FALSE, "unallocated" },
+ { DMU_BSWAP_ZAP, TRUE, TRUE, "object directory" },
+ { DMU_BSWAP_UINT64, TRUE, TRUE, "object array" },
+ { DMU_BSWAP_UINT8, TRUE, FALSE, "packed nvlist" },
+ { DMU_BSWAP_UINT64, TRUE, FALSE, "packed nvlist size" },
+ { DMU_BSWAP_UINT64, TRUE, FALSE, "bpobj" },
+ { DMU_BSWAP_UINT64, TRUE, FALSE, "bpobj header" },
+ { DMU_BSWAP_UINT64, TRUE, FALSE, "SPA space map header" },
+ { DMU_BSWAP_UINT64, TRUE, FALSE, "SPA space map" },
+ { DMU_BSWAP_UINT64, TRUE, FALSE, "ZIL intent log" },
+ { DMU_BSWAP_DNODE, TRUE, FALSE, "DMU dnode" },
+ { DMU_BSWAP_OBJSET, TRUE, TRUE, "DMU objset" },
+ { DMU_BSWAP_UINT64, TRUE, TRUE, "DSL directory" },
+ { DMU_BSWAP_ZAP, TRUE, TRUE, "DSL directory child map" },
+ { DMU_BSWAP_ZAP, TRUE, TRUE, "DSL dataset snap map" },
+ { DMU_BSWAP_ZAP, TRUE, TRUE, "DSL props" },
+ { DMU_BSWAP_UINT64, TRUE, TRUE, "DSL dataset" },
+ { DMU_BSWAP_ZNODE, TRUE, FALSE, "ZFS znode" },
+ { DMU_BSWAP_OLDACL, TRUE, FALSE, "ZFS V0 ACL" },
+ { DMU_BSWAP_UINT8, FALSE, FALSE, "ZFS plain file" },
+ { DMU_BSWAP_ZAP, TRUE, FALSE, "ZFS directory" },
+ { DMU_BSWAP_ZAP, TRUE, FALSE, "ZFS master node" },
+ { DMU_BSWAP_ZAP, TRUE, FALSE, "ZFS delete queue" },
+ { DMU_BSWAP_UINT8, FALSE, FALSE, "zvol object" },
+ { DMU_BSWAP_ZAP, TRUE, FALSE, "zvol prop" },
+ { DMU_BSWAP_UINT8, FALSE, FALSE, "other uint8[]" },
+ { DMU_BSWAP_UINT64, FALSE, FALSE, "other uint64[]" },
+ { DMU_BSWAP_ZAP, TRUE, FALSE, "other ZAP" },
+ { DMU_BSWAP_ZAP, TRUE, FALSE, "persistent error log" },
+ { DMU_BSWAP_UINT8, TRUE, FALSE, "SPA history" },
+ { DMU_BSWAP_UINT64, TRUE, FALSE, "SPA history offsets" },
+ { DMU_BSWAP_ZAP, TRUE, TRUE, "Pool properties" },
+ { DMU_BSWAP_ZAP, TRUE, TRUE, "DSL permissions" },
+ { DMU_BSWAP_ACL, TRUE, FALSE, "ZFS ACL" },
+ { DMU_BSWAP_UINT8, TRUE, FALSE, "ZFS SYSACL" },
+ { DMU_BSWAP_UINT8, TRUE, FALSE, "FUID table" },
+ { DMU_BSWAP_UINT64, TRUE, FALSE, "FUID table size" },
+ { DMU_BSWAP_ZAP, TRUE, TRUE, "DSL dataset next clones" },
+ { DMU_BSWAP_ZAP, TRUE, FALSE, "scan work queue" },
+ { DMU_BSWAP_ZAP, TRUE, FALSE, "ZFS user/group used" },
+ { DMU_BSWAP_ZAP, TRUE, FALSE, "ZFS user/group quota" },
+ { DMU_BSWAP_ZAP, TRUE, TRUE, "snapshot refcount tags" },
+ { DMU_BSWAP_ZAP, TRUE, FALSE, "DDT ZAP algorithm" },
+ { DMU_BSWAP_ZAP, TRUE, FALSE, "DDT statistics" },
+ { DMU_BSWAP_UINT8, TRUE, FALSE, "System attributes" },
+ { DMU_BSWAP_ZAP, TRUE, FALSE, "SA master node" },
+ { DMU_BSWAP_ZAP, TRUE, FALSE, "SA attr registration" },
+ { DMU_BSWAP_ZAP, TRUE, FALSE, "SA attr layouts" },
+ { DMU_BSWAP_ZAP, TRUE, FALSE, "scan translations" },
+ { DMU_BSWAP_UINT8, FALSE, FALSE, "deduplicated block" },
+ { DMU_BSWAP_ZAP, TRUE, TRUE, "DSL deadlist map" },
+ { DMU_BSWAP_UINT64, TRUE, TRUE, "DSL deadlist map hdr" },
+ { DMU_BSWAP_ZAP, TRUE, TRUE, "DSL dir clones" },
+ { DMU_BSWAP_UINT64, TRUE, FALSE, "bpobj subobj" }
+};
+
+const dmu_object_byteswap_info_t dmu_ot_byteswap[DMU_BSWAP_NUMFUNCS] = {
+ { byteswap_uint8_array, "uint8" },
+ { byteswap_uint16_array, "uint16" },
+ { byteswap_uint32_array, "uint32" },
+ { byteswap_uint64_array, "uint64" },
+ { zap_byteswap, "zap" },
+ { dnode_buf_byteswap, "dnode" },
+ { dmu_objset_byteswap, "objset" },
+ { zfs_znode_byteswap, "znode" },
+ { zfs_oldacl_byteswap, "oldacl" },
+ { zfs_acl_byteswap, "acl" }
+};
+
+int
+dmu_buf_hold_noread_by_dnode(dnode_t *dn, uint64_t offset,
+ void *tag, dmu_buf_t **dbp)
+{
+ uint64_t blkid;
+ dmu_buf_impl_t *db;
+
+ blkid = dbuf_whichblock(dn, 0, offset);
+ rw_enter(&dn->dn_struct_rwlock, RW_READER);
+ db = dbuf_hold(dn, blkid, tag);
+ rw_exit(&dn->dn_struct_rwlock);
+
+ if (db == NULL) {
+ *dbp = NULL;
+ return (SET_ERROR(EIO));
+ }
+
+ *dbp = &db->db;
+ return (0);
+}
+
+int
+dmu_buf_hold_noread(objset_t *os, uint64_t object, uint64_t offset,
+ void *tag, dmu_buf_t **dbp)
+{
+ dnode_t *dn;
+ uint64_t blkid;
+ dmu_buf_impl_t *db;
+ int err;
+
+ err = dnode_hold(os, object, FTAG, &dn);
+ if (err)
+ return (err);
+ blkid = dbuf_whichblock(dn, 0, offset);
+ rw_enter(&dn->dn_struct_rwlock, RW_READER);
+ db = dbuf_hold(dn, blkid, tag);
+ rw_exit(&dn->dn_struct_rwlock);
+ dnode_rele(dn, FTAG);
+
+ if (db == NULL) {
+ *dbp = NULL;
+ return (SET_ERROR(EIO));
+ }
+
+ *dbp = &db->db;
+ return (err);
+}
+
+int
+dmu_buf_hold_by_dnode(dnode_t *dn, uint64_t offset,
+ void *tag, dmu_buf_t **dbp, int flags)
+{
+ int err;
+ int db_flags = DB_RF_CANFAIL;
+
+ if (flags & DMU_READ_NO_PREFETCH)
+ db_flags |= DB_RF_NOPREFETCH;
+
+ err = dmu_buf_hold_noread_by_dnode(dn, offset, tag, dbp);
+ if (err == 0) {
+ dmu_buf_impl_t *db = (dmu_buf_impl_t *)(*dbp);
+ err = dbuf_read(db, NULL, db_flags);
+ if (err != 0) {
+ dbuf_rele(db, tag);
+ *dbp = NULL;
+ }
+ }
+
+ return (err);
+}
+
+int
+dmu_buf_hold(objset_t *os, uint64_t object, uint64_t offset,
+ void *tag, dmu_buf_t **dbp, int flags)
+{
+ int err;
+ int db_flags = DB_RF_CANFAIL;
+
+ if (flags & DMU_READ_NO_PREFETCH)
+ db_flags |= DB_RF_NOPREFETCH;
+
+ err = dmu_buf_hold_noread(os, object, offset, tag, dbp);
+ if (err == 0) {
+ dmu_buf_impl_t *db = (dmu_buf_impl_t *)(*dbp);
+ err = dbuf_read(db, NULL, db_flags);
+ if (err != 0) {
+ dbuf_rele(db, tag);
+ *dbp = NULL;
+ }
+ }
+
+ return (err);
+}
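+
+/*
+ * Usage sketch (illustrative; 'object' and 'offset' are hypothetical):
+ * hold a buffer, read it in, inspect the data, and release the hold:
+ *
+ *	dmu_buf_t *db;
+ *	int err = dmu_buf_hold(os, object, offset, FTAG, &db,
+ *	    DMU_READ_PREFETCH);
+ *	if (err == 0) {
+ *		... examine db->db_data / db->db_size ...
+ *		dmu_buf_rele(db, FTAG);
+ *	}
+ */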
+
+int
+dmu_bonus_max(void)
+{
+ return (DN_OLD_MAX_BONUSLEN);
+}
+
+int
+dmu_set_bonus(dmu_buf_t *db_fake, int newsize, dmu_tx_t *tx)
+{
+ dmu_buf_impl_t *db = (dmu_buf_impl_t *)db_fake;
+ dnode_t *dn;
+ int error;
+
+ DB_DNODE_ENTER(db);
+ dn = DB_DNODE(db);
+
+ if (dn->dn_bonus != db) {
+ error = SET_ERROR(EINVAL);
+ } else if (newsize < 0 || newsize > db_fake->db_size) {
+ error = SET_ERROR(EINVAL);
+ } else {
+ dnode_setbonuslen(dn, newsize, tx);
+ error = 0;
+ }
+
+ DB_DNODE_EXIT(db);
+ return (error);
+}
+
+int
+dmu_set_bonustype(dmu_buf_t *db_fake, dmu_object_type_t type, dmu_tx_t *tx)
+{
+ dmu_buf_impl_t *db = (dmu_buf_impl_t *)db_fake;
+ dnode_t *dn;
+ int error;
+
+ DB_DNODE_ENTER(db);
+ dn = DB_DNODE(db);
+
+ if (!DMU_OT_IS_VALID(type)) {
+ error = SET_ERROR(EINVAL);
+ } else if (dn->dn_bonus != db) {
+ error = SET_ERROR(EINVAL);
+ } else {
+ dnode_setbonus_type(dn, type, tx);
+ error = 0;
+ }
+
+ DB_DNODE_EXIT(db);
+ return (error);
+}
+
+dmu_object_type_t
+dmu_get_bonustype(dmu_buf_t *db_fake)
+{
+ dmu_buf_impl_t *db = (dmu_buf_impl_t *)db_fake;
+ dnode_t *dn;
+ dmu_object_type_t type;
+
+ DB_DNODE_ENTER(db);
+ dn = DB_DNODE(db);
+ type = dn->dn_bonustype;
+ DB_DNODE_EXIT(db);
+
+ return (type);
+}
+
+int
+dmu_rm_spill(objset_t *os, uint64_t object, dmu_tx_t *tx)
+{
+ dnode_t *dn;
+ int error;
+
+	error = dnode_hold(os, object, FTAG, &dn);
+	if (error != 0)
+		return (error);
+	dbuf_rm_spill(dn, tx);
+ rw_enter(&dn->dn_struct_rwlock, RW_WRITER);
+ dnode_rm_spill(dn, tx);
+ rw_exit(&dn->dn_struct_rwlock);
+ dnode_rele(dn, FTAG);
+ return (error);
+}
+
+/*
+ * returns ENOENT, EIO, or 0.
+ */
+int
+dmu_bonus_hold(objset_t *os, uint64_t object, void *tag, dmu_buf_t **dbp)
+{
+ dnode_t *dn;
+ dmu_buf_impl_t *db;
+ int error;
+
+ error = dnode_hold(os, object, FTAG, &dn);
+ if (error)
+ return (error);
+
+ rw_enter(&dn->dn_struct_rwlock, RW_READER);
+ if (dn->dn_bonus == NULL) {
+ rw_exit(&dn->dn_struct_rwlock);
+ rw_enter(&dn->dn_struct_rwlock, RW_WRITER);
+ if (dn->dn_bonus == NULL)
+ dbuf_create_bonus(dn);
+ }
+ db = dn->dn_bonus;
+
+ /* as long as the bonus buf is held, the dnode will be held */
+ if (zfs_refcount_add(&db->db_holds, tag) == 1) {
+ VERIFY(dnode_add_ref(dn, db));
+ atomic_inc_32(&dn->dn_dbufs_count);
+ }
+
+ /*
+ * Wait to drop dn_struct_rwlock until after adding the bonus dbuf's
+ * hold and incrementing the dbuf count to ensure that dnode_move() sees
+ * a dnode hold for every dbuf.
+ */
+ rw_exit(&dn->dn_struct_rwlock);
+
+ dnode_rele(dn, FTAG);
+
+ VERIFY(0 == dbuf_read(db, NULL, DB_RF_MUST_SUCCEED | DB_RF_NOPREFETCH));
+
+ *dbp = &db->db;
+ return (0);
+}
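+
+/*
+ * Usage sketch (illustrative only): the bonus buffer is typically where
+ * an object's metadata (e.g. a znode) lives, and holding it pins the
+ * dnode as described above:
+ *
+ *	dmu_buf_t *db;
+ *	if (dmu_bonus_hold(os, object, FTAG, &db) == 0) {
+ *		... interpret db->db_data as the bonus payload ...
+ *		dmu_buf_rele(db, FTAG);
+ *	}
+ */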
+
+/*
+ * returns ENOENT, EIO, or 0.
+ *
+ * This interface will allocate a blank spill dbuf when a spill blk
+ * doesn't already exist on the dnode.
+ *
+ * if you only want to find an already existing spill db, then
+ * dmu_spill_hold_existing() should be used.
+ */
+int
+dmu_spill_hold_by_dnode(dnode_t *dn, uint32_t flags, void *tag, dmu_buf_t **dbp)
+{
+ dmu_buf_impl_t *db = NULL;
+ int err;
+
+ if ((flags & DB_RF_HAVESTRUCT) == 0)
+ rw_enter(&dn->dn_struct_rwlock, RW_READER);
+
+ db = dbuf_hold(dn, DMU_SPILL_BLKID, tag);
+
+ if ((flags & DB_RF_HAVESTRUCT) == 0)
+ rw_exit(&dn->dn_struct_rwlock);
+
+ ASSERT(db != NULL);
+ err = dbuf_read(db, NULL, flags);
+ if (err == 0)
+ *dbp = &db->db;
+ else
+ dbuf_rele(db, tag);
+ return (err);
+}
+
+int
+dmu_spill_hold_existing(dmu_buf_t *bonus, void *tag, dmu_buf_t **dbp)
+{
+ dmu_buf_impl_t *db = (dmu_buf_impl_t *)bonus;
+ dnode_t *dn;
+ int err;
+
+ DB_DNODE_ENTER(db);
+ dn = DB_DNODE(db);
+
+ if (spa_version(dn->dn_objset->os_spa) < SPA_VERSION_SA) {
+ err = SET_ERROR(EINVAL);
+ } else {
+ rw_enter(&dn->dn_struct_rwlock, RW_READER);
+
+ if (!dn->dn_have_spill) {
+ err = SET_ERROR(ENOENT);
+ } else {
+ err = dmu_spill_hold_by_dnode(dn,
+ DB_RF_HAVESTRUCT | DB_RF_CANFAIL, tag, dbp);
+ }
+
+ rw_exit(&dn->dn_struct_rwlock);
+ }
+
+ DB_DNODE_EXIT(db);
+ return (err);
+}
+
+int
+dmu_spill_hold_by_bonus(dmu_buf_t *bonus, void *tag, dmu_buf_t **dbp)
+{
+ dmu_buf_impl_t *db = (dmu_buf_impl_t *)bonus;
+ dnode_t *dn;
+ int err;
+
+ DB_DNODE_ENTER(db);
+ dn = DB_DNODE(db);
+ err = dmu_spill_hold_by_dnode(dn, DB_RF_CANFAIL, tag, dbp);
+ DB_DNODE_EXIT(db);
+
+ return (err);
+}
+
+/*
+ * Note: longer-term, we should modify all of the dmu_buf_*() interfaces
+ * to take a held dnode rather than <os, object> -- the lookup is wasteful,
+ * and can induce severe lock contention when writing to several files
+ * whose dnodes are in the same block.
+ */
+int
+dmu_buf_hold_array_by_dnode(dnode_t *dn, uint64_t offset, uint64_t length,
+ boolean_t read, void *tag, int *numbufsp, dmu_buf_t ***dbpp, uint32_t flags)
+{
+ dmu_buf_t **dbp;
+ uint64_t blkid, nblks, i;
+ uint32_t dbuf_flags;
+ int err;
+ zio_t *zio;
+
+ ASSERT(length <= DMU_MAX_ACCESS);
+
+ /*
+ * Note: We directly notify the prefetch code of this read, so that
+ * we can tell it about the multi-block read. dbuf_read() only knows
+ * about the one block it is accessing.
+ */
+ dbuf_flags = DB_RF_CANFAIL | DB_RF_NEVERWAIT | DB_RF_HAVESTRUCT |
+ DB_RF_NOPREFETCH;
+
+ rw_enter(&dn->dn_struct_rwlock, RW_READER);
+ if (dn->dn_datablkshift) {
+ int blkshift = dn->dn_datablkshift;
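+		/*
+		 * Worked example (hypothetical values): with 128K data
+		 * blocks (blkshift == 17), offset == 100K and
+		 * length == 200K, P2ROUNDUP(300K, 128K) == 384K and
+		 * P2ALIGN(100K, 128K) == 0, so nblks == 384K >> 17 == 3.
+		 */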
+ nblks = (P2ROUNDUP(offset + length, 1ULL << blkshift) -
+ P2ALIGN(offset, 1ULL << blkshift)) >> blkshift;
+ } else {
+ if (offset + length > dn->dn_datablksz) {
+ zfs_panic_recover("zfs: accessing past end of object "
+ "%llx/%llx (size=%u access=%llu+%llu)",
+ (longlong_t)dn->dn_objset->
+ os_dsl_dataset->ds_object,
+ (longlong_t)dn->dn_object, dn->dn_datablksz,
+ (longlong_t)offset, (longlong_t)length);
+ rw_exit(&dn->dn_struct_rwlock);
+ return (SET_ERROR(EIO));
+ }
+ nblks = 1;
+ }
+ dbp = kmem_zalloc(sizeof (dmu_buf_t *) * nblks, KM_SLEEP);
+
+#if defined(_KERNEL) && defined(RACCT)
+ if (racct_enable && !read) {
+ PROC_LOCK(curproc);
+ racct_add_force(curproc, RACCT_WRITEBPS, length);
+ racct_add_force(curproc, RACCT_WRITEIOPS, nblks);
+ PROC_UNLOCK(curproc);
+ }
+#endif
+
+ zio = zio_root(dn->dn_objset->os_spa, NULL, NULL, ZIO_FLAG_CANFAIL);
+ blkid = dbuf_whichblock(dn, 0, offset);
+ for (i = 0; i < nblks; i++) {
+ dmu_buf_impl_t *db = dbuf_hold(dn, blkid + i, tag);
+ if (db == NULL) {
+ rw_exit(&dn->dn_struct_rwlock);
+ dmu_buf_rele_array(dbp, nblks, tag);
+ zio_nowait(zio);
+ return (SET_ERROR(EIO));
+ }
+
+ /* initiate async i/o */
+ if (read)
+ (void) dbuf_read(db, zio, dbuf_flags);
+#ifdef _KERNEL
+ else
+ curthread->td_ru.ru_oublock++;
+#endif
+ dbp[i] = &db->db;
+ }
+
+ if ((flags & DMU_READ_NO_PREFETCH) == 0 &&
+ DNODE_META_IS_CACHEABLE(dn) && length <= zfetch_array_rd_sz) {
+ dmu_zfetch(&dn->dn_zfetch, blkid, nblks,
+ read && DNODE_IS_CACHEABLE(dn));
+ }
+ rw_exit(&dn->dn_struct_rwlock);
+
+ /* wait for async i/o */
+ err = zio_wait(zio);
+ if (err) {
+ dmu_buf_rele_array(dbp, nblks, tag);
+ return (err);
+ }
+
+ /* wait for other io to complete */
+ if (read) {
+ for (i = 0; i < nblks; i++) {
+ dmu_buf_impl_t *db = (dmu_buf_impl_t *)dbp[i];
+ mutex_enter(&db->db_mtx);
+ while (db->db_state == DB_READ ||
+ db->db_state == DB_FILL)
+ cv_wait(&db->db_changed, &db->db_mtx);
+ if (db->db_state == DB_UNCACHED)
+ err = SET_ERROR(EIO);
+ mutex_exit(&db->db_mtx);
+ if (err) {
+ dmu_buf_rele_array(dbp, nblks, tag);
+ return (err);
+ }
+ }
+ }
+
+ *numbufsp = nblks;
+ *dbpp = dbp;
+ return (0);
+}
+
+static int
+dmu_buf_hold_array(objset_t *os, uint64_t object, uint64_t offset,
+ uint64_t length, int read, void *tag, int *numbufsp, dmu_buf_t ***dbpp)
+{
+ dnode_t *dn;
+ int err;
+
+ err = dnode_hold(os, object, FTAG, &dn);
+ if (err)
+ return (err);
+
+ err = dmu_buf_hold_array_by_dnode(dn, offset, length, read, tag,
+ numbufsp, dbpp, DMU_READ_PREFETCH);
+
+ dnode_rele(dn, FTAG);
+
+ return (err);
+}
+
+int
+dmu_buf_hold_array_by_bonus(dmu_buf_t *db_fake, uint64_t offset,
+ uint64_t length, boolean_t read, void *tag, int *numbufsp,
+ dmu_buf_t ***dbpp)
+{
+ dmu_buf_impl_t *db = (dmu_buf_impl_t *)db_fake;
+ dnode_t *dn;
+ int err;
+
+ DB_DNODE_ENTER(db);
+ dn = DB_DNODE(db);
+ err = dmu_buf_hold_array_by_dnode(dn, offset, length, read, tag,
+ numbufsp, dbpp, DMU_READ_PREFETCH);
+ DB_DNODE_EXIT(db);
+
+ return (err);
+}
+
+void
+dmu_buf_rele_array(dmu_buf_t **dbp_fake, int numbufs, void *tag)
+{
+ int i;
+ dmu_buf_impl_t **dbp = (dmu_buf_impl_t **)dbp_fake;
+
+ if (numbufs == 0)
+ return;
+
+ for (i = 0; i < numbufs; i++) {
+ if (dbp[i])
+ dbuf_rele(dbp[i], tag);
+ }
+
+ kmem_free(dbp, sizeof (dmu_buf_t *) * numbufs);
+}
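+
+/*
+ * Pairing sketch (illustrative only): every successful
+ * dmu_buf_hold_array_by_dnode() must be matched by exactly one
+ * dmu_buf_rele_array() on the whole array:
+ *
+ *	dmu_buf_t **dbp;
+ *	int numbufs;
+ *	if (dmu_buf_hold_array_by_dnode(dn, offset, length, TRUE, FTAG,
+ *	    &numbufs, &dbp, DMU_READ_PREFETCH) == 0) {
+ *		for (int i = 0; i < numbufs; i++)
+ *			... read dbp[i]->db_data ...
+ *		dmu_buf_rele_array(dbp, numbufs, FTAG);
+ *	}
+ */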
+
+/*
+ * Issue prefetch i/os for the given blocks. If level is greater than 0, the
+ * indirect blocks prefetched will be those that point to the blocks containing
+ * the data starting at offset, and continuing to offset + len.
+ *
+ * Note that if the indirect blocks above the blocks being prefetched are not in
+ * cache, they will be asynchronously read in.
+ */
+void
+dmu_prefetch(objset_t *os, uint64_t object, int64_t level, uint64_t offset,
+ uint64_t len, zio_priority_t pri)
+{
+ dnode_t *dn;
+ uint64_t blkid;
+ int nblks, err;
+
+ if (len == 0) { /* they're interested in the bonus buffer */
+ dn = DMU_META_DNODE(os);
+
+ if (object == 0 || object >= DN_MAX_OBJECT)
+ return;
+
+ rw_enter(&dn->dn_struct_rwlock, RW_READER);
+ blkid = dbuf_whichblock(dn, level,
+ object * sizeof (dnode_phys_t));
+ dbuf_prefetch(dn, level, blkid, pri, 0);
+ rw_exit(&dn->dn_struct_rwlock);
+ return;
+ }
+
+ /*
+ * See comment before the definition of dmu_prefetch_max.
+ */
+ len = MIN(len, dmu_prefetch_max);
+
+ /*
+ * XXX - Note, if the dnode for the requested object is not
+ * already cached, we will do a *synchronous* read in the
+ * dnode_hold() call. The same is true for any indirects.
+ */
+ err = dnode_hold(os, object, FTAG, &dn);
+ if (err != 0)
+ return;
+
+ rw_enter(&dn->dn_struct_rwlock, RW_READER);
+ /*
+ * offset + len - 1 is the last byte we want to prefetch for, and offset
+	 * is the first.  Then dbuf_whichblock(dn, level, off + len - 1) is the
+ * last block we want to prefetch, and dbuf_whichblock(dn, level,
+ * offset) is the first. Then the number we need to prefetch is the
+ * last - first + 1.
+ */
+ if (level > 0 || dn->dn_datablkshift != 0) {
+ nblks = dbuf_whichblock(dn, level, offset + len - 1) -
+ dbuf_whichblock(dn, level, offset) + 1;
+ } else {
+ nblks = (offset < dn->dn_datablksz);
+ }
+
+ if (nblks != 0) {
+ blkid = dbuf_whichblock(dn, level, offset);
+ for (int i = 0; i < nblks; i++)
+ dbuf_prefetch(dn, level, blkid + i, pri, 0);
+ }
+
+ rw_exit(&dn->dn_struct_rwlock);
+
+ dnode_rele(dn, FTAG);
+}
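+
+/*
+ * Usage sketch (illustrative, with hypothetical sizes): warm the cache
+ * ahead of a sequential read of the first 1MB of an object's data
+ * (level 0), at asynchronous-read priority:
+ *
+ *	dmu_prefetch(os, object, 0, 0, 1 << 20, ZIO_PRIORITY_ASYNC_READ);
+ *
+ * Requests larger than dmu_prefetch_max are clamped, so very large
+ * objects should be prefetched a piece at a time.
+ */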
+
+/*
+ * Get the next "chunk" of file data to free. We traverse the file from
+ * the end so that the file gets shorter over time (if we crash in the
+ * middle, this will leave us in a better state). We find allocated file
+ * data by simply searching the allocated level 1 indirects.
+ *
+ * On input, *start should be the first offset that does not need to be
+ * freed (i.e. "offset + length").  On return, *start will be the first
+ * offset that should be freed and l1blks is set to the number of level 1
+ * indirect blocks found within the chunk.
+ */
+static int
+get_next_chunk(dnode_t *dn, uint64_t *start, uint64_t minimum, uint64_t *l1blks)
+{
+ uint64_t blks;
+ uint64_t maxblks = DMU_MAX_ACCESS >> (dn->dn_indblkshift + 1);
+ /* bytes of data covered by a level-1 indirect block */
+ uint64_t iblkrange =
+ dn->dn_datablksz * EPB(dn->dn_indblkshift, SPA_BLKPTRSHIFT);
+
+ ASSERT3U(minimum, <=, *start);
+
+ /*
+ * Check if we can free the entire range assuming that all of the
+ * L1 blocks in this range have data. If we can, we use this
+ * worst case value as an estimate so we can avoid having to look
+ * at the object's actual data.
+ */
+ uint64_t total_l1blks =
+ (roundup(*start, iblkrange) - (minimum / iblkrange * iblkrange)) /
+ iblkrange;
+ if (total_l1blks <= maxblks) {
+ *l1blks = total_l1blks;
+ *start = minimum;
+ return (0);
+ }
+ ASSERT(ISP2(iblkrange));
+
+ for (blks = 0; *start > minimum && blks < maxblks; blks++) {
+ int err;
+
+ /*
+ * dnode_next_offset(BACKWARDS) will find an allocated L1
+ * indirect block at or before the input offset. We must
+ * decrement *start so that it is at the end of the region
+ * to search.
+ */
+ (*start)--;
+
+ err = dnode_next_offset(dn,
+ DNODE_FIND_BACKWARDS, start, 2, 1, 0);
+
+ /* if there are no indirect blocks before start, we are done */
+ if (err == ESRCH) {
+ *start = minimum;
+ break;
+ } else if (err != 0) {
+ *l1blks = blks;
+ return (err);
+ }
+
+ /* set start to the beginning of this L1 indirect */
+ *start = P2ALIGN(*start, iblkrange);
+ }
+ if (*start < minimum)
+ *start = minimum;
+ *l1blks = blks;
+
+ return (0);
+}
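+
+/*
+ * Worked example for the comment above (hypothetical but typical sizes):
+ * with 128K data blocks and 128K indirect blocks (dn_indblkshift == 17),
+ * a 128-byte block pointer (SPA_BLKPTRSHIFT == 7) gives
+ * EPB(17, 7) == 1024 pointers per L1 block, so iblkrange ==
+ * 128K * 1024 == 128M: each level-1 indirect found moves *start back
+ * by up to 128M of file data.
+ */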
+
+/*
+ * If this objset is of type DMU_OST_ZFS, return true if the vfs's unmounted
+ * flag is set; otherwise return false.
+ * Used below in dmu_free_long_range_impl() to enable abort when unmounting.
+ */
+/*ARGSUSED*/
+static boolean_t
+dmu_objset_zfs_unmounting(objset_t *os)
+{
+#ifdef _KERNEL
+ if (dmu_objset_type(os) == DMU_OST_ZFS)
+ return (zfs_get_vfs_flag_unmounted(os));
+#endif
+ return (B_FALSE);
+}
+
+static int
+dmu_free_long_range_impl(objset_t *os, dnode_t *dn, uint64_t offset,
+ uint64_t length)
+{
+ uint64_t object_size = (dn->dn_maxblkid + 1) * dn->dn_datablksz;
+ int err;
+ uint64_t dirty_frees_threshold;
+ dsl_pool_t *dp = dmu_objset_pool(os);
+
+ if (offset >= object_size)
+ return (0);
+
+ if (zfs_per_txg_dirty_frees_percent <= 100)
+ dirty_frees_threshold =
+ zfs_per_txg_dirty_frees_percent * zfs_dirty_data_max / 100;
+ else
+ dirty_frees_threshold = zfs_dirty_data_max / 20;
+
+ if (length == DMU_OBJECT_END || offset + length > object_size)
+ length = object_size - offset;
+
+ while (length != 0) {
+ uint64_t chunk_end, chunk_begin, chunk_len;
+ uint64_t l1blks;
+ dmu_tx_t *tx;
+
+ if (dmu_objset_zfs_unmounting(dn->dn_objset))
+ return (SET_ERROR(EINTR));
+
+ chunk_end = chunk_begin = offset + length;
+
+ /* move chunk_begin backwards to the beginning of this chunk */
+ err = get_next_chunk(dn, &chunk_begin, offset, &l1blks);
+ if (err)
+ return (err);
+ ASSERT3U(chunk_begin, >=, offset);
+ ASSERT3U(chunk_begin, <=, chunk_end);
+
+ chunk_len = chunk_end - chunk_begin;
+
+ tx = dmu_tx_create(os);
+ dmu_tx_hold_free(tx, dn->dn_object, chunk_begin, chunk_len);
+
+ /*
+ * Mark this transaction as typically resulting in a net
+ * reduction in space used.
+ */
+ dmu_tx_mark_netfree(tx);
+ err = dmu_tx_assign(tx, TXG_WAIT);
+ if (err) {
+ dmu_tx_abort(tx);
+ return (err);
+ }
+
+ uint64_t txg = dmu_tx_get_txg(tx);
+
+ mutex_enter(&dp->dp_lock);
+ uint64_t long_free_dirty =
+ dp->dp_long_free_dirty_pertxg[txg & TXG_MASK];
+ mutex_exit(&dp->dp_lock);
+
+ /*
+ * To avoid filling up a TXG with just frees, wait for
+ * the next TXG to open before freeing more chunks if
+ * we have reached the threshold of frees.
+ */
+ if (dirty_frees_threshold != 0 &&
+ long_free_dirty >= dirty_frees_threshold) {
+ dmu_tx_commit(tx);
+ txg_wait_open(dp, 0);
+ continue;
+ }
+
+ /*
+ * In order to prevent unnecessary write throttling, for each
+ * TXG, we track the cumulative size of L1 blocks being dirtied
+ * in dnode_free_range() below. We compare this number to a
+ * tunable threshold, past which we prevent new L1 dirty freeing
+ * blocks from being added into the open TXG; the check above
+ * implements this. The threshold
+ * prevents write throttle activation due to dirty freeing L1
+ * blocks taking up a large percentage of zfs_dirty_data_max.
+ */
+ mutex_enter(&dp->dp_lock);
+ dp->dp_long_free_dirty_pertxg[txg & TXG_MASK] +=
+ l1blks << dn->dn_indblkshift;
+ mutex_exit(&dp->dp_lock);
+ DTRACE_PROBE3(free__long__range,
+ uint64_t, long_free_dirty, uint64_t, chunk_len,
+ uint64_t, txg);
+ dnode_free_range(dn, chunk_begin, chunk_len, tx);
+ dmu_tx_commit(tx);
+
+ length -= chunk_len;
+ }
+ return (0);
+}
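+
+/*
+ * Worked example of the throttle above (hypothetical values): with the
+ * default zfs_per_txg_dirty_frees_percent of 5 and a zfs_dirty_data_max
+ * of 4G, dirty_frees_threshold == 5 * 4G / 100 == ~205M; once the L1
+ * blocks dirtied by frees in the open TXG exceed that, the loop commits
+ * and waits for the next TXG. Setting the tunable above 100 falls back
+ * to zfs_dirty_data_max / 20 (the same 5%), and setting it to zero
+ * disables the throttle entirely.
+ */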
+
+int
+dmu_free_long_range(objset_t *os, uint64_t object,
+ uint64_t offset, uint64_t length)
+{
+ dnode_t *dn;
+ int err;
+
+ err = dnode_hold(os, object, FTAG, &dn);
+ if (err != 0)
+ return (err);
+ err = dmu_free_long_range_impl(os, dn, offset, length);
+
+ /*
+ * It is important to zero out the maxblkid when freeing the entire
+ * file, so that (a) subsequent calls to dmu_free_long_range_impl()
+ * will take the fast path, and (b) dnode_reallocate() can verify
+ * that the entire file has been freed.
+ */
+ if (err == 0 && offset == 0 && length == DMU_OBJECT_END)
+ dn->dn_maxblkid = 0;
+
+ dnode_rele(dn, FTAG);
+ return (err);
+}
+
+int
+dmu_free_long_object(objset_t *os, uint64_t object)
+{
+ dmu_tx_t *tx;
+ int err;
+
+ err = dmu_free_long_range(os, object, 0, DMU_OBJECT_END);
+ if (err != 0)
+ return (err);
+
+ tx = dmu_tx_create(os);
+ dmu_tx_hold_bonus(tx, object);
+ dmu_tx_hold_free(tx, object, 0, DMU_OBJECT_END);
+ dmu_tx_mark_netfree(tx);
+ err = dmu_tx_assign(tx, TXG_WAIT);
+ if (err == 0) {
+ err = dmu_object_free(os, object, tx);
+ dmu_tx_commit(tx);
+ } else {
+ dmu_tx_abort(tx);
+ }
+
+ return (err);
+}
+
+int
+dmu_free_range(objset_t *os, uint64_t object, uint64_t offset,
+ uint64_t size, dmu_tx_t *tx)
+{
+ dnode_t *dn;
+ int err = dnode_hold(os, object, FTAG, &dn);
+ if (err)
+ return (err);
+ ASSERT(offset < UINT64_MAX);
+ ASSERT(size == -1ULL || size <= UINT64_MAX - offset);
+ dnode_free_range(dn, offset, size, tx);
+ dnode_rele(dn, FTAG);
+ return (0);
+}
+
+static int
+dmu_read_impl(dnode_t *dn, uint64_t offset, uint64_t size,
+ void *buf, uint32_t flags)
+{
+ dmu_buf_t **dbp;
+ int numbufs, err = 0;
+
+ /*
+ * Deal with odd block sizes, where there can't be data past the first
+ * block. If we ever do the tail block optimization, we will need to
+ * handle that here as well.
+ */
+ if (dn->dn_maxblkid == 0) {
+ int newsz = offset > dn->dn_datablksz ? 0 :
+ MIN(size, dn->dn_datablksz - offset);
+ bzero((char *)buf + newsz, size - newsz);
+ size = newsz;
+ }
+
+ while (size > 0) {
+ uint64_t mylen = MIN(size, DMU_MAX_ACCESS / 2);
+ int i;
+
+ /*
+ * NB: we could do this block-at-a-time, but it's nice
+ * to be reading in parallel.
+ */
+ err = dmu_buf_hold_array_by_dnode(dn, offset, mylen,
+ TRUE, FTAG, &numbufs, &dbp, flags);
+ if (err)
+ break;
+
+ for (i = 0; i < numbufs; i++) {
+ int tocpy;
+ int bufoff;
+ dmu_buf_t *db = dbp[i];
+
+ ASSERT(size > 0);
+
+ bufoff = offset - db->db_offset;
+ tocpy = (int)MIN(db->db_size - bufoff, size);
+
+ bcopy((char *)db->db_data + bufoff, buf, tocpy);
+
+ offset += tocpy;
+ size -= tocpy;
+ buf = (char *)buf + tocpy;
+ }
+ dmu_buf_rele_array(dbp, numbufs, FTAG);
+ }
+ return (err);
+}
+
+int
+dmu_read(objset_t *os, uint64_t object, uint64_t offset, uint64_t size,
+ void *buf, uint32_t flags)
+{
+ dnode_t *dn;
+ int err;
+
+ err = dnode_hold(os, object, FTAG, &dn);
+ if (err != 0)
+ return (err);
+
+ err = dmu_read_impl(dn, offset, size, buf, flags);
+ dnode_rele(dn, FTAG);
+ return (err);
+}
+
+int
+dmu_read_by_dnode(dnode_t *dn, uint64_t offset, uint64_t size, void *buf,
+ uint32_t flags)
+{
+ return (dmu_read_impl(dn, offset, size, buf, flags));
+}
+
+static void
+dmu_write_impl(dmu_buf_t **dbp, int numbufs, uint64_t offset, uint64_t size,
+ const void *buf, dmu_tx_t *tx)
+{
+ int i;
+
+ for (i = 0; i < numbufs; i++) {
+ int tocpy;
+ int bufoff;
+ dmu_buf_t *db = dbp[i];
+
+ ASSERT(size > 0);
+
+ bufoff = offset - db->db_offset;
+ tocpy = (int)MIN(db->db_size - bufoff, size);
+
+ ASSERT(i == 0 || i == numbufs-1 || tocpy == db->db_size);
+
+ if (tocpy == db->db_size)
+ dmu_buf_will_fill(db, tx);
+ else
+ dmu_buf_will_dirty(db, tx);
+
+ bcopy(buf, (char *)db->db_data + bufoff, tocpy);
+
+ if (tocpy == db->db_size)
+ dmu_buf_fill_done(db, tx);
+
+ offset += tocpy;
+ size -= tocpy;
+ buf = (char *)buf + tocpy;
+ }
+}
+
+void
+dmu_write(objset_t *os, uint64_t object, uint64_t offset, uint64_t size,
+ const void *buf, dmu_tx_t *tx)
+{
+ dmu_buf_t **dbp;
+ int numbufs;
+
+ if (size == 0)
+ return;
+
+ VERIFY0(dmu_buf_hold_array(os, object, offset, size,
+ FALSE, FTAG, &numbufs, &dbp));
+ dmu_write_impl(dbp, numbufs, offset, size, buf, tx);
+ dmu_buf_rele_array(dbp, numbufs, FTAG);
+}
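+
+/*
+ * Round-trip sketch (illustrative only; 'object' and 'buf' are
+ * hypothetical): a write requires an assigned transaction, while a
+ * read does not:
+ *
+ *	dmu_tx_t *tx = dmu_tx_create(os);
+ *	dmu_tx_hold_write(tx, object, offset, size);
+ *	if (dmu_tx_assign(tx, TXG_WAIT) == 0) {
+ *		dmu_write(os, object, offset, size, buf, tx);
+ *		dmu_tx_commit(tx);
+ *	} else {
+ *		dmu_tx_abort(tx);
+ *	}
+ *	...
+ *	(void) dmu_read(os, object, offset, size, buf, DMU_READ_PREFETCH);
+ */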
+
+void
+dmu_write_by_dnode(dnode_t *dn, uint64_t offset, uint64_t size,
+ const void *buf, dmu_tx_t *tx)
+{
+ dmu_buf_t **dbp;
+ int numbufs;
+
+ if (size == 0)
+ return;
+
+ VERIFY0(dmu_buf_hold_array_by_dnode(dn, offset, size,
+ FALSE, FTAG, &numbufs, &dbp, DMU_READ_PREFETCH));
+ dmu_write_impl(dbp, numbufs, offset, size, buf, tx);
+ dmu_buf_rele_array(dbp, numbufs, FTAG);
+}
+
+static int
+dmu_object_remap_one_indirect(objset_t *os, dnode_t *dn,
+ uint64_t last_removal_txg, uint64_t offset)
+{
+ uint64_t l1blkid = dbuf_whichblock(dn, 1, offset);
+ int err = 0;
+
+ rw_enter(&dn->dn_struct_rwlock, RW_READER);
+ dmu_buf_impl_t *dbuf = dbuf_hold_level(dn, 1, l1blkid, FTAG);
+ ASSERT3P(dbuf, !=, NULL);
+
+ /*
+ * If the block hasn't been written yet, this default will ensure
+ * we don't try to remap it.
+ */
+ uint64_t birth = UINT64_MAX;
+ ASSERT3U(last_removal_txg, !=, UINT64_MAX);
+ if (dbuf->db_blkptr != NULL)
+ birth = dbuf->db_blkptr->blk_birth;
+ rw_exit(&dn->dn_struct_rwlock);
+
+ /*
+ * If this L1 was already written after the last removal, then we've
+ * already tried to remap it.
+ */
+ if (birth <= last_removal_txg &&
+ dbuf_read(dbuf, NULL, DB_RF_MUST_SUCCEED) == 0 &&
+ dbuf_can_remap(dbuf)) {
+ dmu_tx_t *tx = dmu_tx_create(os);
+ dmu_tx_hold_remap_l1indirect(tx, dn->dn_object);
+ err = dmu_tx_assign(tx, TXG_WAIT);
+ if (err == 0) {
+ (void) dbuf_dirty(dbuf, tx);
+ dmu_tx_commit(tx);
+ } else {
+ dmu_tx_abort(tx);
+ }
+ }
+
+ dbuf_rele(dbuf, FTAG);
+
+ delay(zfs_object_remap_one_indirect_delay_ticks);
+
+ return (err);
+}
+
+/*
+ * Remap all blockpointers in the object, if possible, so that they reference
+ * only concrete vdevs.
+ *
+ * To do this, iterate over the L0 blockpointers and remap any that reference
+ * an indirect vdev. Note that we only examine L0 blockpointers; since we
+ * cannot guarantee that we can remap all blockpointers anyway (due to split
+ * blocks), we do not want to make the code unnecessarily complicated to
+ * catch the unlikely case that there is an L1 block on an indirect vdev that
+ * contains no indirect blockpointers.
+ */
+int
+dmu_object_remap_indirects(objset_t *os, uint64_t object,
+ uint64_t last_removal_txg)
+{
+ uint64_t offset, l1span;
+ int err;
+ dnode_t *dn;
+
+ err = dnode_hold(os, object, FTAG, &dn);
+ if (err != 0) {
+ return (err);
+ }
+
+ if (dn->dn_nlevels <= 1) {
+ if (issig(JUSTLOOKING) && issig(FORREAL)) {
+ err = SET_ERROR(EINTR);
+ }
+
+ /*
+ * If the dnode has no indirect blocks, we cannot dirty them.
+ * We still want to remap the blkptr(s) in the dnode if
+ * appropriate, so mark it as dirty.
+ */
+ if (err == 0 && dnode_needs_remap(dn)) {
+ dmu_tx_t *tx = dmu_tx_create(os);
+ dmu_tx_hold_bonus(tx, dn->dn_object);
+ if ((err = dmu_tx_assign(tx, TXG_WAIT)) == 0) {
+ dnode_setdirty(dn, tx);
+ dmu_tx_commit(tx);
+ } else {
+ dmu_tx_abort(tx);
+ }
+ }
+
+ dnode_rele(dn, FTAG);
+ return (err);
+ }
+
+ offset = 0;
+ l1span = 1ULL << (dn->dn_indblkshift - SPA_BLKPTRSHIFT +
+ dn->dn_datablkshift);
+ /*
+ * Find the next L1 indirect that is not a hole.
+ */
+ while (dnode_next_offset(dn, 0, &offset, 2, 1, 0) == 0) {
+ if (issig(JUSTLOOKING) && issig(FORREAL)) {
+ err = SET_ERROR(EINTR);
+ break;
+ }
+ if ((err = dmu_object_remap_one_indirect(os, dn,
+ last_removal_txg, offset)) != 0) {
+ break;
+ }
+ offset += l1span;
+ }
+
+ dnode_rele(dn, FTAG);
+ return (err);
+}
+
+void
+dmu_prealloc(objset_t *os, uint64_t object, uint64_t offset, uint64_t size,
+ dmu_tx_t *tx)
+{
+ dmu_buf_t **dbp;
+ int numbufs, i;
+
+ if (size == 0)
+ return;
+
+ VERIFY(0 == dmu_buf_hold_array(os, object, offset, size,
+ FALSE, FTAG, &numbufs, &dbp));
+
+ for (i = 0; i < numbufs; i++) {
+ dmu_buf_t *db = dbp[i];
+
+ dmu_buf_will_not_fill(db, tx);
+ }
+ dmu_buf_rele_array(dbp, numbufs, FTAG);
+}
+
+void
+dmu_write_embedded(objset_t *os, uint64_t object, uint64_t offset,
+ void *data, uint8_t etype, uint8_t comp, int uncompressed_size,
+ int compressed_size, int byteorder, dmu_tx_t *tx)
+{
+ dmu_buf_t *db;
+
+ ASSERT3U(etype, <, NUM_BP_EMBEDDED_TYPES);
+ ASSERT3U(comp, <, ZIO_COMPRESS_FUNCTIONS);
+ VERIFY0(dmu_buf_hold_noread(os, object, offset,
+ FTAG, &db));
+
+ dmu_buf_write_embedded(db,
+ data, (bp_embedded_type_t)etype, (enum zio_compress)comp,
+ uncompressed_size, compressed_size, byteorder, tx);
+
+ dmu_buf_rele(db, FTAG);
+}
+
+/*
+ * DMU support for xuio
+ */
+kstat_t *xuio_ksp = NULL;
+
+int
+dmu_xuio_init(xuio_t *xuio, int nblk)
+{
+ dmu_xuio_t *priv;
+ uio_t *uio = &xuio->xu_uio;
+
+ uio->uio_iovcnt = nblk;
+ uio->uio_iov = kmem_zalloc(nblk * sizeof (iovec_t), KM_SLEEP);
+
+ priv = kmem_zalloc(sizeof (dmu_xuio_t), KM_SLEEP);
+ priv->cnt = nblk;
+ priv->bufs = kmem_zalloc(nblk * sizeof (arc_buf_t *), KM_SLEEP);
+ priv->iovp = uio->uio_iov;
+ XUIO_XUZC_PRIV(xuio) = priv;
+
+ if (XUIO_XUZC_RW(xuio) == UIO_READ)
+ XUIOSTAT_INCR(xuiostat_onloan_rbuf, nblk);
+ else
+ XUIOSTAT_INCR(xuiostat_onloan_wbuf, nblk);
+
+ return (0);
+}
+
+void
+dmu_xuio_fini(xuio_t *xuio)
+{
+ dmu_xuio_t *priv = XUIO_XUZC_PRIV(xuio);
+ int nblk = priv->cnt;
+
+ kmem_free(priv->iovp, nblk * sizeof (iovec_t));
+ kmem_free(priv->bufs, nblk * sizeof (arc_buf_t *));
+ kmem_free(priv, sizeof (dmu_xuio_t));
+
+ if (XUIO_XUZC_RW(xuio) == UIO_READ)
+ XUIOSTAT_INCR(xuiostat_onloan_rbuf, -nblk);
+ else
+ XUIOSTAT_INCR(xuiostat_onloan_wbuf, -nblk);
+}
+
+/*
+ * Initialize iov[priv->next] and priv->bufs[priv->next] with { off, n, abuf }
+ * and increase priv->next by 1.
+ */
+int
+dmu_xuio_add(xuio_t *xuio, arc_buf_t *abuf, offset_t off, size_t n)
+{
+ struct iovec *iov;
+ uio_t *uio = &xuio->xu_uio;
+ dmu_xuio_t *priv = XUIO_XUZC_PRIV(xuio);
+ int i = priv->next++;
+
+ ASSERT(i < priv->cnt);
+ ASSERT(off + n <= arc_buf_lsize(abuf));
+ iov = uio->uio_iov + i;
+ iov->iov_base = (char *)abuf->b_data + off;
+ iov->iov_len = n;
+ priv->bufs[i] = abuf;
+ return (0);
+}
+
+int
+dmu_xuio_cnt(xuio_t *xuio)
+{
+ dmu_xuio_t *priv = XUIO_XUZC_PRIV(xuio);
+ return (priv->cnt);
+}
+
+arc_buf_t *
+dmu_xuio_arcbuf(xuio_t *xuio, int i)
+{
+ dmu_xuio_t *priv = XUIO_XUZC_PRIV(xuio);
+
+ ASSERT(i < priv->cnt);
+ return (priv->bufs[i]);
+}
+
+void
+dmu_xuio_clear(xuio_t *xuio, int i)
+{
+ dmu_xuio_t *priv = XUIO_XUZC_PRIV(xuio);
+
+ ASSERT(i < priv->cnt);
+ priv->bufs[i] = NULL;
+}
+
+static void
+xuio_stat_init(void)
+{
+ xuio_ksp = kstat_create("zfs", 0, "xuio_stats", "misc",
+ KSTAT_TYPE_NAMED, sizeof (xuio_stats) / sizeof (kstat_named_t),
+ KSTAT_FLAG_VIRTUAL);
+ if (xuio_ksp != NULL) {
+ xuio_ksp->ks_data = &xuio_stats;
+ kstat_install(xuio_ksp);
+ }
+}
+
+static void
+xuio_stat_fini(void)
+{
+ if (xuio_ksp != NULL) {
+ kstat_delete(xuio_ksp);
+ xuio_ksp = NULL;
+ }
+}
+
+void
+xuio_stat_wbuf_copied(void)
+{
+ XUIOSTAT_BUMP(xuiostat_wbuf_copied);
+}
+
+void
+xuio_stat_wbuf_nocopy(void)
+{
+ XUIOSTAT_BUMP(xuiostat_wbuf_nocopy);
+}
+
+#ifdef _KERNEL
+int
+dmu_read_uio_dnode(dnode_t *dn, uio_t *uio, uint64_t size)
+{
+ dmu_buf_t **dbp;
+ int numbufs, i, err;
+ xuio_t *xuio = NULL;
+
+ /*
+ * NB: we could do this block-at-a-time, but it's nice
+ * to be reading in parallel.
+ */
+ err = dmu_buf_hold_array_by_dnode(dn, uio->uio_loffset, size,
+ TRUE, FTAG, &numbufs, &dbp, 0);
+ if (err)
+ return (err);
+
+#ifdef UIO_XUIO
+ if (uio->uio_extflg == UIO_XUIO)
+ xuio = (xuio_t *)uio;
+#endif
+
+ for (i = 0; i < numbufs; i++) {
+ int tocpy;
+ int bufoff;
+ dmu_buf_t *db = dbp[i];
+
+ ASSERT(size > 0);
+
+ bufoff = uio->uio_loffset - db->db_offset;
+ tocpy = (int)MIN(db->db_size - bufoff, size);
+
+ if (xuio) {
+ dmu_buf_impl_t *dbi = (dmu_buf_impl_t *)db;
+ arc_buf_t *dbuf_abuf = dbi->db_buf;
+ arc_buf_t *abuf = dbuf_loan_arcbuf(dbi);
+ err = dmu_xuio_add(xuio, abuf, bufoff, tocpy);
+ if (!err) {
+ uio->uio_resid -= tocpy;
+ uio->uio_loffset += tocpy;
+ }
+
+ if (abuf == dbuf_abuf)
+ XUIOSTAT_BUMP(xuiostat_rbuf_nocopy);
+ else
+ XUIOSTAT_BUMP(xuiostat_rbuf_copied);
+ } else {
+#ifdef illumos
+ err = uiomove((char *)db->db_data + bufoff, tocpy,
+ UIO_READ, uio);
+#else
+ err = vn_io_fault_uiomove((char *)db->db_data + bufoff,
+ tocpy, uio);
+#endif
+ }
+ if (err)
+ break;
+
+ size -= tocpy;
+ }
+ dmu_buf_rele_array(dbp, numbufs, FTAG);
+
+ return (err);
+}
+
+/*
+ * Read 'size' bytes into the uio buffer.
+ * From object zdb->db_object.
+ * Starting at offset uio->uio_loffset.
+ *
+ * If the caller already has a dbuf in the target object
+ * (e.g. its bonus buffer), this routine is faster than dmu_read_uio(),
+ * because we don't have to find the dnode_t for the object.
+ */
+int
+dmu_read_uio_dbuf(dmu_buf_t *zdb, uio_t *uio, uint64_t size)
+{
+ dmu_buf_impl_t *db = (dmu_buf_impl_t *)zdb;
+ dnode_t *dn;
+ int err;
+
+ if (size == 0)
+ return (0);
+
+ DB_DNODE_ENTER(db);
+ dn = DB_DNODE(db);
+ err = dmu_read_uio_dnode(dn, uio, size);
+ DB_DNODE_EXIT(db);
+
+ return (err);
+}
+
+/*
+ * Read 'size' bytes into the uio buffer.
+ * From the specified object.
+ * Starting at offset uio->uio_loffset.
+ */
+int
+dmu_read_uio(objset_t *os, uint64_t object, uio_t *uio, uint64_t size)
+{
+ dnode_t *dn;
+ int err;
+
+ if (size == 0)
+ return (0);
+
+ err = dnode_hold(os, object, FTAG, &dn);
+ if (err)
+ return (err);
+
+ err = dmu_read_uio_dnode(dn, uio, size);
+
+ dnode_rele(dn, FTAG);
+
+ return (err);
+}
+
+int
+dmu_write_uio_dnode(dnode_t *dn, uio_t *uio, uint64_t size, dmu_tx_t *tx)
+{
+ dmu_buf_t **dbp;
+ int numbufs;
+ int err = 0;
+ int i;
+
+ err = dmu_buf_hold_array_by_dnode(dn, uio->uio_loffset, size,
+ FALSE, FTAG, &numbufs, &dbp, DMU_READ_PREFETCH);
+ if (err)
+ return (err);
+
+ for (i = 0; i < numbufs; i++) {
+ int tocpy;
+ int bufoff;
+ dmu_buf_t *db = dbp[i];
+
+ ASSERT(size > 0);
+
+ bufoff = uio->uio_loffset - db->db_offset;
+ tocpy = (int)MIN(db->db_size - bufoff, size);
+
+ ASSERT(i == 0 || i == numbufs-1 || tocpy == db->db_size);
+
+ if (tocpy == db->db_size)
+ dmu_buf_will_fill(db, tx);
+ else
+ dmu_buf_will_dirty(db, tx);
+
+#ifdef illumos
+ /*
+ * XXX uiomove could block forever (eg. nfs-backed
+ * pages). There needs to be a uiolockdown() function
+ * to lock the pages in memory, so that uiomove won't
+ * block.
+ */
+ err = uiomove((char *)db->db_data + bufoff, tocpy,
+ UIO_WRITE, uio);
+#else
+ err = vn_io_fault_uiomove((char *)db->db_data + bufoff, tocpy,
+ uio);
+#endif
+
+ if (tocpy == db->db_size)
+ dmu_buf_fill_done(db, tx);
+
+ if (err)
+ break;
+
+ size -= tocpy;
+ }
+
+ dmu_buf_rele_array(dbp, numbufs, FTAG);
+ return (err);
+}
+
+/*
+ * Write 'size' bytes from the uio buffer.
+ * To object zdb->db_object.
+ * Starting at offset uio->uio_loffset.
+ *
+ * If the caller already has a dbuf in the target object
+ * (e.g. its bonus buffer), this routine is faster than dmu_write_uio(),
+ * because we don't have to find the dnode_t for the object.
+ */
+int
+dmu_write_uio_dbuf(dmu_buf_t *zdb, uio_t *uio, uint64_t size,
+ dmu_tx_t *tx)
+{
+ dmu_buf_impl_t *db = (dmu_buf_impl_t *)zdb;
+ dnode_t *dn;
+ int err;
+
+ if (size == 0)
+ return (0);
+
+ DB_DNODE_ENTER(db);
+ dn = DB_DNODE(db);
+ err = dmu_write_uio_dnode(dn, uio, size, tx);
+ DB_DNODE_EXIT(db);
+
+ return (err);
+}
+
+/*
+ * Write 'size' bytes from the uio buffer.
+ * To the specified object.
+ * Starting at offset uio->uio_loffset.
+ */
+int
+dmu_write_uio(objset_t *os, uint64_t object, uio_t *uio, uint64_t size,
+ dmu_tx_t *tx)
+{
+ dnode_t *dn;
+ int err;
+
+ if (size == 0)
+ return (0);
+
+ err = dnode_hold(os, object, FTAG, &dn);
+ if (err)
+ return (err);
+
+ err = dmu_write_uio_dnode(dn, uio, size, tx);
+
+ dnode_rele(dn, FTAG);
+
+ return (err);
+}
+
+#ifdef illumos
+int
+dmu_write_pages(objset_t *os, uint64_t object, uint64_t offset, uint64_t size,
+ page_t *pp, dmu_tx_t *tx)
+{
+ dmu_buf_t **dbp;
+ int numbufs, i;
+ int err;
+
+ if (size == 0)
+ return (0);
+
+ err = dmu_buf_hold_array(os, object, offset, size,
+ FALSE, FTAG, &numbufs, &dbp);
+ if (err)
+ return (err);
+
+ for (i = 0; i < numbufs; i++) {
+ int tocpy, copied, thiscpy;
+ int bufoff;
+ dmu_buf_t *db = dbp[i];
+ caddr_t va;
+
+ ASSERT(size > 0);
+ ASSERT3U(db->db_size, >=, PAGESIZE);
+
+ bufoff = offset - db->db_offset;
+ tocpy = (int)MIN(db->db_size - bufoff, size);
+
+ ASSERT(i == 0 || i == numbufs-1 || tocpy == db->db_size);
+
+ if (tocpy == db->db_size)
+ dmu_buf_will_fill(db, tx);
+ else
+ dmu_buf_will_dirty(db, tx);
+
+ for (copied = 0; copied < tocpy; copied += PAGESIZE) {
+ ASSERT3U(pp->p_offset, ==, db->db_offset + bufoff);
+ thiscpy = MIN(PAGESIZE, tocpy - copied);
+ va = zfs_map_page(pp, S_READ);
+ bcopy(va, (char *)db->db_data + bufoff, thiscpy);
+ zfs_unmap_page(pp, va);
+ pp = pp->p_next;
+ bufoff += PAGESIZE;
+ }
+
+ if (tocpy == db->db_size)
+ dmu_buf_fill_done(db, tx);
+
+ offset += tocpy;
+ size -= tocpy;
+ }
+ dmu_buf_rele_array(dbp, numbufs, FTAG);
+ return (err);
+}
+
+#else /* !illumos */
+
+int
+dmu_write_pages(objset_t *os, uint64_t object, uint64_t offset, uint64_t size,
+ vm_page_t *ma, dmu_tx_t *tx)
+{
+ dmu_buf_t **dbp;
+ struct sf_buf *sf;
+ int numbufs, i;
+ int err;
+
+ if (size == 0)
+ return (0);
+
+ err = dmu_buf_hold_array(os, object, offset, size,
+ FALSE, FTAG, &numbufs, &dbp);
+ if (err)
+ return (err);
+
+ for (i = 0; i < numbufs; i++) {
+ int tocpy, copied, thiscpy;
+ int bufoff;
+ dmu_buf_t *db = dbp[i];
+ caddr_t va;
+
+ ASSERT(size > 0);
+ ASSERT3U(db->db_size, >=, PAGESIZE);
+
+ bufoff = offset - db->db_offset;
+ tocpy = (int)MIN(db->db_size - bufoff, size);
+
+ ASSERT(i == 0 || i == numbufs-1 || tocpy == db->db_size);
+
+ if (tocpy == db->db_size)
+ dmu_buf_will_fill(db, tx);
+ else
+ dmu_buf_will_dirty(db, tx);
+
+ for (copied = 0; copied < tocpy; copied += PAGESIZE) {
+ ASSERT3U(ptoa((*ma)->pindex), ==, db->db_offset + bufoff);
+ thiscpy = MIN(PAGESIZE, tocpy - copied);
+ va = zfs_map_page(*ma, &sf);
+ bcopy(va, (char *)db->db_data + bufoff, thiscpy);
+ zfs_unmap_page(sf);
+ ma += 1;
+ bufoff += PAGESIZE;
+ }
+
+ if (tocpy == db->db_size)
+ dmu_buf_fill_done(db, tx);
+
+ offset += tocpy;
+ size -= tocpy;
+ }
+ dmu_buf_rele_array(dbp, numbufs, FTAG);
+ return (err);
+}
+
+int
+dmu_read_pages(objset_t *os, uint64_t object, vm_page_t *ma, int count,
+ int *rbehind, int *rahead, int last_size)
+{
+ struct sf_buf *sf;
+ vm_object_t vmobj;
+ vm_page_t m;
+ dmu_buf_t **dbp;
+ dmu_buf_t *db;
+ caddr_t va;
+ int numbufs, i;
+ int bufoff, pgoff, tocpy;
+ int mi, di;
+ int err;
+
+ ASSERT3U(ma[0]->pindex + count - 1, ==, ma[count - 1]->pindex);
+ ASSERT(last_size <= PAGE_SIZE);
+
+ err = dmu_buf_hold_array(os, object, IDX_TO_OFF(ma[0]->pindex),
+ IDX_TO_OFF(count - 1) + last_size, TRUE, FTAG, &numbufs, &dbp);
+ if (err != 0)
+ return (err);
+
+#ifdef DEBUG
+ IMPLY(last_size < PAGE_SIZE, *rahead == 0);
+ if (dbp[0]->db_offset != 0 || numbufs > 1) {
+ for (i = 0; i < numbufs; i++) {
+ ASSERT(ISP2(dbp[i]->db_size));
+ ASSERT((dbp[i]->db_offset % dbp[i]->db_size) == 0);
+ ASSERT3U(dbp[i]->db_size, ==, dbp[0]->db_size);
+ }
+ }
+#endif
+
+ vmobj = ma[0]->object;
+
+ db = dbp[0];
+ for (i = 0; i < *rbehind; i++) {
+ m = vm_page_grab_unlocked(vmobj, ma[0]->pindex - 1 - i,
+ VM_ALLOC_NORMAL | VM_ALLOC_NOWAIT |
+ VM_ALLOC_SBUSY | VM_ALLOC_IGN_SBUSY);
+ if (m == NULL)
+ break;
+ if (!vm_page_none_valid(m)) {
+ ASSERT3U(m->valid, ==, VM_PAGE_BITS_ALL);
+ vm_page_sunbusy(m);
+ break;
+ }
+ ASSERT(m->dirty == 0);
+ ASSERT(!pmap_page_is_write_mapped(m));
+
+ ASSERT(db->db_size > PAGE_SIZE);
+ bufoff = IDX_TO_OFF(m->pindex) % db->db_size;
+ va = zfs_map_page(m, &sf);
+ bcopy((char *)db->db_data + bufoff, va, PAGESIZE);
+ zfs_unmap_page(sf);
+ vm_page_valid(m);
+ if ((m->busy_lock & VPB_BIT_WAITERS) != 0)
+ vm_page_activate(m);
+ else
+ vm_page_deactivate(m);
+ vm_page_sunbusy(m);
+ }
+ *rbehind = i;
+
+ bufoff = IDX_TO_OFF(ma[0]->pindex) % db->db_size;
+ pgoff = 0;
+ for (mi = 0, di = 0; mi < count && di < numbufs; ) {
+ if (pgoff == 0) {
+ m = ma[mi];
+ if (m != bogus_page) {
+ vm_page_assert_xbusied(m);
+ ASSERT(vm_page_none_valid(m));
+ ASSERT(m->dirty == 0);
+ ASSERT(!pmap_page_is_mapped(m));
+ va = zfs_map_page(m, &sf);
+ }
+ }
+ if (bufoff == 0)
+ db = dbp[di];
+
+ if (m != bogus_page) {
+ ASSERT3U(IDX_TO_OFF(m->pindex) + pgoff, ==,
+ db->db_offset + bufoff);
+ }
+
+ /*
+ * We do not need to clamp the copy size by the file
+ * size as the last block is zero-filled beyond the
+ * end of file anyway.
+ */
+ tocpy = MIN(db->db_size - bufoff, PAGESIZE - pgoff);
+ if (m != bogus_page)
+ bcopy((char *)db->db_data + bufoff, va + pgoff, tocpy);
+
+ pgoff += tocpy;
+ ASSERT(pgoff <= PAGESIZE);
+ if (pgoff == PAGESIZE) {
+ if (m != bogus_page) {
+ zfs_unmap_page(sf);
+ vm_page_valid(m);
+ }
+ ASSERT(mi < count);
+ mi++;
+ pgoff = 0;
+ }
+
+ bufoff += tocpy;
+ ASSERT(bufoff <= db->db_size);
+ if (bufoff == db->db_size) {
+ ASSERT(di < numbufs);
+ di++;
+ bufoff = 0;
+ }
+ }
+
+#ifdef DEBUG
+ /*
+ * Three possibilities:
+ * - last requested page ends at a buffer boundary and, thus,
+ * all pages and buffers have been iterated;
+ * - all requested pages are filled, but the last buffer
+ * has not been exhausted;
+ * the read-ahead is possible only in this case;
+ * - all buffers have been read, but the last page has not been
+ * fully filled;
+ * this is only possible if the file has only a single buffer
+ * with a size that is not a multiple of the page size.
+ */
+ if (mi == count) {
+ ASSERT(di >= numbufs - 1);
+ IMPLY(*rahead != 0, di == numbufs - 1);
+ IMPLY(*rahead != 0, bufoff != 0);
+ ASSERT(pgoff == 0);
+ }
+ if (di == numbufs) {
+ ASSERT(mi >= count - 1);
+ ASSERT(*rahead == 0);
+ IMPLY(pgoff == 0, mi == count);
+ if (pgoff != 0) {
+ ASSERT(mi == count - 1);
+ ASSERT((dbp[0]->db_size & PAGE_MASK) != 0);
+ }
+ }
+#endif
+ if (pgoff != 0) {
+ ASSERT(m != bogus_page);
+ bzero(va + pgoff, PAGESIZE - pgoff);
+ zfs_unmap_page(sf);
+ vm_page_valid(m);
+ }
+
+ for (i = 0; i < *rahead; i++) {
+ m = vm_page_grab_unlocked(vmobj, ma[count - 1]->pindex + 1 + i,
+ VM_ALLOC_NORMAL | VM_ALLOC_NOWAIT |
+ VM_ALLOC_SBUSY | VM_ALLOC_IGN_SBUSY);
+ if (m == NULL)
+ break;
+ if (!vm_page_none_valid(m)) {
+ ASSERT3U(m->valid, ==, VM_PAGE_BITS_ALL);
+ vm_page_sunbusy(m);
+ break;
+ }
+ ASSERT(m->dirty == 0);
+ ASSERT(!pmap_page_is_write_mapped(m));
+
+ ASSERT(db->db_size > PAGE_SIZE);
+ bufoff = IDX_TO_OFF(m->pindex) % db->db_size;
+ tocpy = MIN(db->db_size - bufoff, PAGESIZE);
+ va = zfs_map_page(m, &sf);
+ bcopy((char *)db->db_data + bufoff, va, tocpy);
+ if (tocpy < PAGESIZE) {
+ ASSERT(i == *rahead - 1);
+ ASSERT((db->db_size & PAGE_MASK) != 0);
+ bzero(va + tocpy, PAGESIZE - tocpy);
+ }
+ zfs_unmap_page(sf);
+ vm_page_valid(m);
+ if ((m->busy_lock & VPB_BIT_WAITERS) != 0)
+ vm_page_activate(m);
+ else
+ vm_page_deactivate(m);
+ vm_page_sunbusy(m);
+ }
+ *rahead = i;
+
+ dmu_buf_rele_array(dbp, numbufs, FTAG);
+ return (0);
+}
+#endif /* illumos */
+#endif /* _KERNEL */
+
+/*
+ * Allocate a loaned anonymous arc buffer.
+ */
+arc_buf_t *
+dmu_request_arcbuf(dmu_buf_t *handle, int size)
+{
+ dmu_buf_impl_t *db = (dmu_buf_impl_t *)handle;
+
+ return (arc_loan_buf(db->db_objset->os_spa, B_FALSE, size));
+}
+
+/*
+ * Free a loaned arc buffer.
+ */
+void
+dmu_return_arcbuf(arc_buf_t *buf)
+{
+ arc_return_buf(buf, FTAG);
+ arc_buf_destroy(buf, FTAG);
+}
+
+/*
+ * When possible directly assign passed loaned arc buffer to a dbuf.
+ * If this is not possible copy the contents of passed arc buf via
+ * dmu_write().
+ */
+void
+dmu_assign_arcbuf_dnode(dnode_t *dn, uint64_t offset, arc_buf_t *buf,
+ dmu_tx_t *tx)
+{
+ dmu_buf_impl_t *db;
+ uint32_t blksz = (uint32_t)arc_buf_lsize(buf);
+ uint64_t blkid;
+
+ rw_enter(&dn->dn_struct_rwlock, RW_READER);
+ blkid = dbuf_whichblock(dn, 0, offset);
+ VERIFY((db = dbuf_hold(dn, blkid, FTAG)) != NULL);
+ rw_exit(&dn->dn_struct_rwlock);
+
+ /*
+ * We can only assign if the offset is aligned, the arc buf is the
+ * same size as the dbuf, and the dbuf is not metadata.
+ */
+ if (offset == db->db.db_offset && blksz == db->db.db_size) {
+#ifdef _KERNEL
+ curthread->td_ru.ru_oublock++;
+#ifdef RACCT
+ if (racct_enable) {
+ PROC_LOCK(curproc);
+ racct_add_force(curproc, RACCT_WRITEBPS, blksz);
+ racct_add_force(curproc, RACCT_WRITEIOPS, 1);
+ PROC_UNLOCK(curproc);
+ }
+#endif /* RACCT */
+#endif /* _KERNEL */
+ dbuf_assign_arcbuf(db, buf, tx);
+ dbuf_rele(db, FTAG);
+ } else {
+ objset_t *os;
+ uint64_t object;
+
+ /* compressed bufs must always be assignable to their dbuf */
+ ASSERT3U(arc_get_compression(buf), ==, ZIO_COMPRESS_OFF);
+ ASSERT(!(buf->b_flags & ARC_BUF_FLAG_COMPRESSED));
+
+ os = dn->dn_objset;
+ object = dn->dn_object;
+
+ dbuf_rele(db, FTAG);
+ dmu_write(os, object, offset, blksz, buf->b_data, tx);
+ dmu_return_arcbuf(buf);
+ XUIOSTAT_BUMP(xuiostat_wbuf_copied);
+ }
+}
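+
+/*
+ * Lifecycle sketch for loaned arc buffers (illustrative only; 'db_handle',
+ * 'blksz', and 'tx' are hypothetical): borrow a buffer sized to the block,
+ * fill it, and hand it to the dbuf; assignment consumes the buffer:
+ *
+ *	arc_buf_t *abuf = dmu_request_arcbuf(db_handle, blksz);
+ *	... fill abuf->b_data with blksz bytes ...
+ *	dmu_assign_arcbuf(db_handle, offset, abuf, tx);  (consumes abuf)
+ *
+ * On an error path before assignment, the loan must be repaid with
+ * dmu_return_arcbuf(abuf).
+ */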
+
+void
+dmu_assign_arcbuf(dmu_buf_t *handle, uint64_t offset, arc_buf_t *buf,
+ dmu_tx_t *tx)
+{
+ dmu_buf_impl_t *dbuf = (dmu_buf_impl_t *)handle;
+
+ DB_DNODE_ENTER(dbuf);
+ dmu_assign_arcbuf_dnode(DB_DNODE(dbuf), offset, buf, tx);
+ DB_DNODE_EXIT(dbuf);
+}
+
+typedef struct {
+ dbuf_dirty_record_t *dsa_dr;
+ dmu_sync_cb_t *dsa_done;
+ zgd_t *dsa_zgd;
+ dmu_tx_t *dsa_tx;
+} dmu_sync_arg_t;
+
+/* ARGSUSED */
+static void
+dmu_sync_ready(zio_t *zio, arc_buf_t *buf, void *varg)
+{
+ dmu_sync_arg_t *dsa = varg;
+ dmu_buf_t *db = dsa->dsa_zgd->zgd_db;
+ blkptr_t *bp = zio->io_bp;
+
+ if (zio->io_error == 0) {
+ if (BP_IS_HOLE(bp)) {
+ /*
+ * A block of zeros may compress to a hole, but the
+ * block size still needs to be known for replay.
+ */
+ BP_SET_LSIZE(bp, db->db_size);
+ } else if (!BP_IS_EMBEDDED(bp)) {
+ ASSERT(BP_GET_LEVEL(bp) == 0);
+ bp->blk_fill = 1;
+ }
+ }
+}
+
+static void
+dmu_sync_late_arrival_ready(zio_t *zio)
+{
+ dmu_sync_ready(zio, NULL, zio->io_private);
+}
+
+/* ARGSUSED */
+static void
+dmu_sync_done(zio_t *zio, arc_buf_t *buf, void *varg)
+{
+ dmu_sync_arg_t *dsa = varg;
+ dbuf_dirty_record_t *dr = dsa->dsa_dr;
+ dmu_buf_impl_t *db = dr->dr_dbuf;
+ zgd_t *zgd = dsa->dsa_zgd;
+
+ /*
+ * Record the vdev(s) backing this blkptr so they can be flushed after
+ * the writes for the lwb have completed.
+ */
+ if (zio->io_error == 0) {
+ zil_lwb_add_block(zgd->zgd_lwb, zgd->zgd_bp);
+ }
+
+ mutex_enter(&db->db_mtx);
+ ASSERT(dr->dt.dl.dr_override_state == DR_IN_DMU_SYNC);
+ if (zio->io_error == 0) {
+ dr->dt.dl.dr_nopwrite = !!(zio->io_flags & ZIO_FLAG_NOPWRITE);
+ if (dr->dt.dl.dr_nopwrite) {
+ blkptr_t *bp = zio->io_bp;
+ blkptr_t *bp_orig = &zio->io_bp_orig;
+ uint8_t chksum = BP_GET_CHECKSUM(bp_orig);
+
+ ASSERT(BP_EQUAL(bp, bp_orig));
+ VERIFY(BP_EQUAL(bp, db->db_blkptr));
+ ASSERT(zio->io_prop.zp_compress != ZIO_COMPRESS_OFF);
+ ASSERT(zio_checksum_table[chksum].ci_flags &
+ ZCHECKSUM_FLAG_NOPWRITE);
+ }
+ dr->dt.dl.dr_overridden_by = *zio->io_bp;
+ dr->dt.dl.dr_override_state = DR_OVERRIDDEN;
+ dr->dt.dl.dr_copies = zio->io_prop.zp_copies;
+
+ /*
+ * Old style holes are filled with all zeros, whereas
+ * new-style holes maintain their lsize, type, level,
+ * and birth time (see zio_write_compress). While we
+ * need to reset the BP_SET_LSIZE() call that happened
+ * in dmu_sync_ready for old style holes, we do *not*
+ * want to wipe out the information contained in new
+ * style holes. Thus, only zero out the block pointer if
+ * it's an old style hole.
+ */
+ if (BP_IS_HOLE(&dr->dt.dl.dr_overridden_by) &&
+ dr->dt.dl.dr_overridden_by.blk_birth == 0)
+ BP_ZERO(&dr->dt.dl.dr_overridden_by);
+ } else {
+ dr->dt.dl.dr_override_state = DR_NOT_OVERRIDDEN;
+ }
+ cv_broadcast(&db->db_changed);
+ mutex_exit(&db->db_mtx);
+
+ dsa->dsa_done(dsa->dsa_zgd, zio->io_error);
+
+ kmem_free(dsa, sizeof (*dsa));
+}
+
+static void
+dmu_sync_late_arrival_done(zio_t *zio)
+{
+ blkptr_t *bp = zio->io_bp;
+ dmu_sync_arg_t *dsa = zio->io_private;
+ blkptr_t *bp_orig = &zio->io_bp_orig;
+ zgd_t *zgd = dsa->dsa_zgd;
+
+ if (zio->io_error == 0) {
+ /*
+ * Record the vdev(s) backing this blkptr so they can be
+ * flushed after the writes for the lwb have completed.
+ */
+ zil_lwb_add_block(zgd->zgd_lwb, zgd->zgd_bp);
+
+ if (!BP_IS_HOLE(bp)) {
+ ASSERT(!(zio->io_flags & ZIO_FLAG_NOPWRITE));
+ ASSERT(BP_IS_HOLE(bp_orig) || !BP_EQUAL(bp, bp_orig));
+ ASSERT(zio->io_bp->blk_birth == zio->io_txg);
+ ASSERT(zio->io_txg > spa_syncing_txg(zio->io_spa));
+ zio_free(zio->io_spa, zio->io_txg, zio->io_bp);
+ }
+ }
+
+ dmu_tx_commit(dsa->dsa_tx);
+
+ dsa->dsa_done(dsa->dsa_zgd, zio->io_error);
+
+ abd_put(zio->io_abd);
+ kmem_free(dsa, sizeof (*dsa));
+}
+
+static int
+dmu_sync_late_arrival(zio_t *pio, objset_t *os, dmu_sync_cb_t *done, zgd_t *zgd,
+ zio_prop_t *zp, zbookmark_phys_t *zb)
+{
+ dmu_sync_arg_t *dsa;
+ dmu_tx_t *tx;
+
+ tx = dmu_tx_create(os);
+ dmu_tx_hold_space(tx, zgd->zgd_db->db_size);
+ if (dmu_tx_assign(tx, TXG_WAIT) != 0) {
+ dmu_tx_abort(tx);
+		/* Make zl_get_data do txg_wait_synced() */
+ return (SET_ERROR(EIO));
+ }
+
+ /*
+ * In order to prevent the zgd's lwb from being free'd prior to
+ * dmu_sync_late_arrival_done() being called, we have to ensure
+ * the lwb's "max txg" takes this tx's txg into account.
+ */
+ zil_lwb_add_txg(zgd->zgd_lwb, dmu_tx_get_txg(tx));
+
+ dsa = kmem_alloc(sizeof (dmu_sync_arg_t), KM_SLEEP);
+ dsa->dsa_dr = NULL;
+ dsa->dsa_done = done;
+ dsa->dsa_zgd = zgd;
+ dsa->dsa_tx = tx;
+
+ /*
+ * Since we are currently syncing this txg, it's nontrivial to
+ * determine what BP to nopwrite against, so we disable nopwrite.
+ *
+ * When syncing, the db_blkptr is initially the BP of the previous
+ * txg. We can not nopwrite against it because it will be changed
+ * (this is similar to the non-late-arrival case where the dbuf is
+ * dirty in a future txg).
+ *
+	 * Then dbuf_write_ready() sets db_blkptr to the location we will write.
+ * We can not nopwrite against it because although the BP will not
+ * (typically) be changed, the data has not yet been persisted to this
+ * location.
+ *
+ * Finally, when dbuf_write_done() is called, it is theoretically
+ * possible to always nopwrite, because the data that was written in
+ * this txg is the same data that we are trying to write. However we
+ * would need to check that this dbuf is not dirty in any future
+ * txg's (as we do in the normal dmu_sync() path). For simplicity, we
+ * don't nopwrite in this case.
+ */
+ zp->zp_nopwrite = B_FALSE;
+
+ zio_nowait(zio_write(pio, os->os_spa, dmu_tx_get_txg(tx), zgd->zgd_bp,
+ abd_get_from_buf(zgd->zgd_db->db_data, zgd->zgd_db->db_size),
+ zgd->zgd_db->db_size, zgd->zgd_db->db_size, zp,
+ dmu_sync_late_arrival_ready, NULL, NULL, dmu_sync_late_arrival_done,
+ dsa, ZIO_PRIORITY_SYNC_WRITE, ZIO_FLAG_CANFAIL, zb));
+
+ return (0);
+}
+
+/*
+ * Intent log support: sync the block associated with db to disk.
+ * N.B. and XXX: the caller is responsible for making sure that the
+ * data isn't changing while dmu_sync() is writing it.
+ *
+ * Return values:
+ *
+ * EEXIST: this txg has already been synced, so there's nothing to do.
+ * The caller should not log the write.
+ *
+ * ENOENT: the block was dbuf_free_range()'d, so there's nothing to do.
+ * The caller should not log the write.
+ *
+ * EALREADY: this block is already in the process of being synced.
+ * The caller should track its progress (somehow).
+ *
+ * EIO: could not do the I/O.
+ * The caller should do a txg_wait_synced().
+ *
+ * 0: the I/O has been initiated.
+ * The caller should log this blkptr in the done callback.
+ * It is possible that the I/O will fail, in which case
+ * the error will be reported to the done callback and
+ * propagated to pio from zio_done().
+ */
+int
+dmu_sync(zio_t *pio, uint64_t txg, dmu_sync_cb_t *done, zgd_t *zgd)
+{
+ dmu_buf_impl_t *db = (dmu_buf_impl_t *)zgd->zgd_db;
+ objset_t *os = db->db_objset;
+ dsl_dataset_t *ds = os->os_dsl_dataset;
+ dbuf_dirty_record_t *dr;
+ dmu_sync_arg_t *dsa;
+ zbookmark_phys_t zb;
+ zio_prop_t zp;
+ dnode_t *dn;
+
+ ASSERT(pio != NULL);
+ ASSERT(txg != 0);
+
+ SET_BOOKMARK(&zb, ds->ds_object,
+ db->db.db_object, db->db_level, db->db_blkid);
+
+ DB_DNODE_ENTER(db);
+ dn = DB_DNODE(db);
+ dmu_write_policy(os, dn, db->db_level, WP_DMU_SYNC, &zp);
+ DB_DNODE_EXIT(db);
+
+ /*
+ * If we're frozen (running ziltest), we always need to generate a bp.
+ */
+ if (txg > spa_freeze_txg(os->os_spa))
+ return (dmu_sync_late_arrival(pio, os, done, zgd, &zp, &zb));
+
+ /*
+ * Grabbing db_mtx now provides a barrier between dbuf_sync_leaf()
+ * and us. If we determine that this txg is not yet syncing,
+ * but it begins to sync a moment later, that's OK because the
+ * sync thread will block in dbuf_sync_leaf() until we drop db_mtx.
+ */
+ mutex_enter(&db->db_mtx);
+
+ if (txg <= spa_last_synced_txg(os->os_spa)) {
+ /*
+ * This txg has already synced. There's nothing to do.
+ */
+ mutex_exit(&db->db_mtx);
+ return (SET_ERROR(EEXIST));
+ }
+
+ if (txg <= spa_syncing_txg(os->os_spa)) {
+ /*
+ * This txg is currently syncing, so we can't mess with
+ * the dirty record anymore; just write a new log block.
+ */
+ mutex_exit(&db->db_mtx);
+ return (dmu_sync_late_arrival(pio, os, done, zgd, &zp, &zb));
+ }
+
+ dr = db->db_last_dirty;
+ while (dr && dr->dr_txg != txg)
+ dr = dr->dr_next;
+
+ if (dr == NULL) {
+ /*
+ * There's no dr for this dbuf, so it must have been freed.
+ * There's no need to log writes to freed blocks, so we're done.
+ */
+ mutex_exit(&db->db_mtx);
+ return (SET_ERROR(ENOENT));
+ }
+
+ ASSERT(dr->dr_next == NULL || dr->dr_next->dr_txg < txg);
+
+ if (db->db_blkptr != NULL) {
+ /*
+ * We need to fill in zgd_bp with the current blkptr so that
+ * the nopwrite code can check if we're writing the same
+ * data that's already on disk. We can only nopwrite if we
+ * are sure that after making the copy, db_blkptr will not
+ * change until our i/o completes. We ensure this by
+ * holding the db_mtx, and only allowing nopwrite if the
+ * block is not already dirty (see below). This is verified
+ * by dmu_sync_done(), which VERIFYs that the db_blkptr has
+ * not changed.
+ */
+ *zgd->zgd_bp = *db->db_blkptr;
+ }
+
+ /*
+ * Assume the on-disk data is X, the current syncing data (in
+ * txg - 1) is Y, and the current in-memory data is Z (currently
+ * in dmu_sync).
+ *
+ * We usually want to perform a nopwrite if X and Z are the
+ * same. However, if Y is different (i.e. the BP is going to
+ * change before this write takes effect), then a nopwrite will
+ * be incorrect - we would override with X, which could have
+ * been freed when Y was written.
+ *
+ * (Note that this is not a concern when we are nop-writing from
+ * syncing context, because X and Y must be identical, because
+ * all previous txgs have been synced.)
+ *
+ * Therefore, we disable nopwrite if the current BP could change
+ * before this TXG. There are two ways it could change: by
+ * being dirty (dr_next is non-NULL), or by being freed
+ * (dnode_block_freed()). This behavior is verified by
+ * zio_done(), which VERIFYs that the override BP is identical
+ * to the on-disk BP.
+ */
+ DB_DNODE_ENTER(db);
+ dn = DB_DNODE(db);
+ if (dr->dr_next != NULL || dnode_block_freed(dn, db->db_blkid))
+ zp.zp_nopwrite = B_FALSE;
+ DB_DNODE_EXIT(db);
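+
+ /*
+ * Concrete instance of the comment above: if block B holds X on
+ * disk, the syncing txg is rewriting B with Y, and we are asked to
+ * sync Z == X, a nopwrite would log X's BP; but that BP may be
+ * freed as soon as Y is written, so the dr_next check disables it.
+ */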
+
+ ASSERT(dr->dr_txg == txg);
+ if (dr->dt.dl.dr_override_state == DR_IN_DMU_SYNC ||
+ dr->dt.dl.dr_override_state == DR_OVERRIDDEN) {
+ /*
+ * We have already issued a sync write for this buffer,
+ * or this buffer has already been synced. It could not
+ * have been dirtied since, or we would have cleared the state.
+ */
+ mutex_exit(&db->db_mtx);
+ return (SET_ERROR(EALREADY));
+ }
+
+ ASSERT(dr->dt.dl.dr_override_state == DR_NOT_OVERRIDDEN);
+ dr->dt.dl.dr_override_state = DR_IN_DMU_SYNC;
+ mutex_exit(&db->db_mtx);
+
+ dsa = kmem_alloc(sizeof (dmu_sync_arg_t), KM_SLEEP);
+ dsa->dsa_dr = dr;
+ dsa->dsa_done = done;
+ dsa->dsa_zgd = zgd;
+ dsa->dsa_tx = NULL;
+
+ zio_nowait(arc_write(pio, os->os_spa, txg,
+ zgd->zgd_bp, dr->dt.dl.dr_data, DBUF_IS_L2CACHEABLE(db),
+ &zp, dmu_sync_ready, NULL, NULL, dmu_sync_done, dsa,
+ ZIO_PRIORITY_SYNC_WRITE, ZIO_FLAG_CANFAIL, &zb));
+
+ return (0);
+}
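+
+ /*
+ * Hedged sketch of a caller dispatching on the contract documented
+ * above (illustrative only; the in-tree consumers are the ZIL
+ * get_data callbacks, e.g. zfs_get_data(), and 'lr_txg', 'done_cb'
+ * and 'os' below are placeholder names):
+ */
+ #if 0
+ error = dmu_sync(zio, lr_txg, done_cb, zgd);
+ switch (error) {
+ case EEXIST: /* txg already synced */
+ case ENOENT: /* block was freed */
+ case EALREADY: /* a sync write is already in flight */
+ error = 0; /* in all three cases, don't log this write */
+ break;
+ case EIO:
+ txg_wait_synced(dmu_objset_pool(os), 0);
+ error = 0;
+ break;
+ default: /* 0: the blkptr is logged from the done callback */
+ break;
+ }
+ #endif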
+
+int
+dmu_object_set_blocksize(objset_t *os, uint64_t object, uint64_t size, int ibs,
+ dmu_tx_t *tx)
+{
+ dnode_t *dn;
+ int err;
+
+ err = dnode_hold(os, object, FTAG, &dn);
+ if (err)
+ return (err);
+ err = dnode_set_blksz(dn, size, ibs, tx);
+ dnode_rele(dn, FTAG);
+ return (err);
+}
+
+void
+dmu_object_set_checksum(objset_t *os, uint64_t object, uint8_t checksum,
+ dmu_tx_t *tx)
+{
+ dnode_t *dn;
+
+ /*
+ * Send streams include each object's checksum function. This
+ * check ensures that the receiving system can understand the
+ * checksum function transmitted.
+ */
+ ASSERT3U(checksum, <, ZIO_CHECKSUM_LEGACY_FUNCTIONS);
+
+ VERIFY0(dnode_hold(os, object, FTAG, &dn));
+ ASSERT3U(checksum, <, ZIO_CHECKSUM_FUNCTIONS);
+ dn->dn_checksum = checksum;
+ dnode_setdirty(dn, tx);
+ dnode_rele(dn, FTAG);
+}
+
+void
+dmu_object_set_compress(objset_t *os, uint64_t object, uint8_t compress,
+ dmu_tx_t *tx)
+{
+ dnode_t *dn;
+
+ /*
+ * Send streams include each object's compression function. This
+ * check ensures that the receiving system can understand the
+ * compression function transmitted.
+ */
+ ASSERT3U(compress, <, ZIO_COMPRESS_LEGACY_FUNCTIONS);
+
+ VERIFY0(dnode_hold(os, object, FTAG, &dn));
+ dn->dn_compress = compress;
+ dnode_setdirty(dn, tx);
+ dnode_rele(dn, FTAG);
+}
+
+int zfs_mdcomp_disable = 0;
+SYSCTL_INT(_vfs_zfs, OID_AUTO, mdcomp_disable, CTLFLAG_RWTUN,
+ &zfs_mdcomp_disable, 0, "Disable metadata compression");
+
+/*
+ * When the "redundant_metadata" property is set to "most", only indirect
+ * blocks of this level and higher will have an additional ditto block.
+ */
+int zfs_redundant_metadata_most_ditto_level = 2;
+
+void
+dmu_write_policy(objset_t *os, dnode_t *dn, int level, int wp, zio_prop_t *zp)
+{
+ dmu_object_type_t type = dn ? dn->dn_type : DMU_OT_OBJSET;
+ boolean_t ismd = (level > 0 || DMU_OT_IS_METADATA(type) ||
+ (wp & WP_SPILL));
+ enum zio_checksum checksum = os->os_checksum;
+ enum zio_compress compress = os->os_compress;
+ enum zio_checksum dedup_checksum = os->os_dedup_checksum;
+ boolean_t dedup = B_FALSE;
+ boolean_t nopwrite = B_FALSE;
+ boolean_t dedup_verify = os->os_dedup_verify;
+ int copies = os->os_copies;
+
+ /*
+ * We maintain different write policies for each of the following
+ * types of data:
+ * 1. metadata
+ * 2. preallocated blocks (i.e. level-0 blocks of a dump device)
+ * 3. all other level 0 blocks
+ */
+ if (ismd) {
+ if (zfs_mdcomp_disable) {
+ compress = ZIO_COMPRESS_EMPTY;
+ } else {
+ /*
+ * XXX -- we should design a compression algorithm
+ * that specializes in arrays of bps.
+ */
+ compress = zio_compress_select(os->os_spa,
+ ZIO_COMPRESS_ON, ZIO_COMPRESS_ON);
+ }
+
+ /*
+ * Metadata always gets checksummed. If the data
+ * checksum is multi-bit correctable, and it's not a
+ * ZBT-style checksum, then it's suitable for metadata
+ * as well. Otherwise, the metadata checksum defaults
+ * to fletcher4.
+ */
+ if (!(zio_checksum_table[checksum].ci_flags &
+ ZCHECKSUM_FLAG_METADATA) ||
+ (zio_checksum_table[checksum].ci_flags &
+ ZCHECKSUM_FLAG_EMBEDDED))
+ checksum = ZIO_CHECKSUM_FLETCHER_4;
+
+ if (os->os_redundant_metadata == ZFS_REDUNDANT_METADATA_ALL ||
+ (os->os_redundant_metadata ==
+ ZFS_REDUNDANT_METADATA_MOST &&
+ (level >= zfs_redundant_metadata_most_ditto_level ||
+ DMU_OT_IS_METADATA(type) || (wp & WP_SPILL))))
+ copies++;
+ } else if (wp & WP_NOFILL) {
+ ASSERT(level == 0);
+
+ /*
+ * If we're writing preallocated blocks, we aren't actually
+ * writing them so don't set any policy properties. These
+ * blocks are currently only used by an external subsystem
+ * outside of zfs (i.e. dump) and not written by the zio
+ * pipeline.
+ */
+ compress = ZIO_COMPRESS_OFF;
+ checksum = ZIO_CHECKSUM_NOPARITY;
+ } else {
+ compress = zio_compress_select(os->os_spa, dn->dn_compress,
+ compress);
+
+ checksum = (dedup_checksum == ZIO_CHECKSUM_OFF) ?
+ zio_checksum_select(dn->dn_checksum, checksum) :
+ dedup_checksum;
+
+ /*
+ * Determine dedup setting. If we are in dmu_sync(),
+ * we won't actually dedup now because that's all
+ * done in syncing context; but we do want to use the
+ * dedup checksum. If the checksum is not strong
+ * enough to ensure unique signatures, force
+ * dedup_verify.
+ */
+ if (dedup_checksum != ZIO_CHECKSUM_OFF) {
+ dedup = (wp & WP_DMU_SYNC) ? B_FALSE : B_TRUE;
+ if (!(zio_checksum_table[checksum].ci_flags &
+ ZCHECKSUM_FLAG_DEDUP))
+ dedup_verify = B_TRUE;
+ }
+
+ /*
+ * Enable nopwrite if we have a secure enough checksum
+ * algorithm (see comment in zio_nop_write) and
+ * compression is enabled. We don't enable nopwrite if
+ * dedup is enabled as the two features are mutually
+ * exclusive.
+ */
+ nopwrite = (!dedup && (zio_checksum_table[checksum].ci_flags &
+ ZCHECKSUM_FLAG_NOPWRITE) &&
+ compress != ZIO_COMPRESS_OFF && zfs_nopwrite_enabled);
+ }
+
+ zp->zp_checksum = checksum;
+ zp->zp_compress = compress;
+ ASSERT3U(zp->zp_compress, !=, ZIO_COMPRESS_INHERIT);
+
+ zp->zp_type = (wp & WP_SPILL) ? dn->dn_bonustype : type;
+ zp->zp_level = level;
+ zp->zp_copies = MIN(copies, spa_max_replication(os->os_spa));
+ zp->zp_dedup = dedup;
+ zp->zp_dedup_verify = dedup && dedup_verify;
+ zp->zp_nopwrite = nopwrite;
+ zp->zp_zpl_smallblk = DMU_OT_IS_FILE(zp->zp_type) ?
+ os->os_zpl_special_smallblock : 0;
+}
+
+int
+dmu_offset_next(objset_t *os, uint64_t object, boolean_t hole, uint64_t *off)
+{
+ dnode_t *dn;
+ int err;
+
+ /*
+ * Sync any current changes before
+ * we go trundling through the block pointers.
+ */
+ err = dmu_object_wait_synced(os, object);
+ if (err) {
+ return (err);
+ }
+
+ err = dnode_hold(os, object, FTAG, &dn);
+ if (err) {
+ return (err);
+ }
+
+ err = dnode_next_offset(dn, (hole ? DNODE_FIND_HOLE : 0), off, 1, 1, 0);
+ dnode_rele(dn, FTAG);
+
+ return (err);
+}
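+
+ /*
+ * Usage sketch: this is the primitive behind SEEK_HOLE/SEEK_DATA
+ * style lookups (names below are illustrative):
+ */
+ #if 0
+ uint64_t off = (uint64_t)file_offset;
+ error = dmu_offset_next(os, object, B_TRUE /* find hole */, &off);
+ if (error == 0)
+ result = off; /* first hole at or after file_offset */
+ else if (error == ESRCH)
+ error = SET_ERROR(ENXIO); /* nothing found before EOF */
+ #endif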
+
+/*
+ * Given the ZFS object, if it contains any dirty dnodes
+ * this function flushes all dirty blocks to disk. This
+ * ensures the DMU object info is updated. A more efficient
+ * future version might just find the TXG with the maximum
+ * ID and wait for that to be synced.
+ */
+int
+dmu_object_wait_synced(objset_t *os, uint64_t object)
+{
+ dnode_t *dn;
+ int error, i;
+
+ error = dnode_hold(os, object, FTAG, &dn);
+ if (error) {
+ return (error);
+ }
+
+ for (i = 0; i < TXG_SIZE; i++) {
+ if (list_link_active(&dn->dn_dirty_link[i])) {
+ break;
+ }
+ }
+ dnode_rele(dn, FTAG);
+ if (i != TXG_SIZE) {
+ txg_wait_synced(dmu_objset_pool(os), 0);
+ }
+
+ return (0);
+}
+
+void
+__dmu_object_info_from_dnode(dnode_t *dn, dmu_object_info_t *doi)
+{
+ dnode_phys_t *dnp = dn->dn_phys;
+
+ doi->doi_data_block_size = dn->dn_datablksz;
+ doi->doi_metadata_block_size = dn->dn_indblkshift ?
+ 1ULL << dn->dn_indblkshift : 0;
+ doi->doi_type = dn->dn_type;
+ doi->doi_bonus_type = dn->dn_bonustype;
+ doi->doi_bonus_size = dn->dn_bonuslen;
+ doi->doi_dnodesize = dn->dn_num_slots << DNODE_SHIFT;
+ doi->doi_indirection = dn->dn_nlevels;
+ doi->doi_checksum = dn->dn_checksum;
+ doi->doi_compress = dn->dn_compress;
+ doi->doi_nblkptr = dn->dn_nblkptr;
+ doi->doi_physical_blocks_512 = (DN_USED_BYTES(dnp) + 256) >> 9;
+ doi->doi_max_offset = (dn->dn_maxblkid + 1) * dn->dn_datablksz;
+ doi->doi_fill_count = 0;
+ for (int i = 0; i < dnp->dn_nblkptr; i++)
+ doi->doi_fill_count += BP_GET_FILL(&dnp->dn_blkptr[i]);
+}
+
+void
+dmu_object_info_from_dnode(dnode_t *dn, dmu_object_info_t *doi)
+{
+ rw_enter(&dn->dn_struct_rwlock, RW_READER);
+ mutex_enter(&dn->dn_mtx);
+
+ __dmu_object_info_from_dnode(dn, doi);
+
+ mutex_exit(&dn->dn_mtx);
+ rw_exit(&dn->dn_struct_rwlock);
+}
+
+/*
+ * Get information on a DMU object.
+ * If doi is NULL, just indicates whether the object exists.
+ */
+int
+dmu_object_info(objset_t *os, uint64_t object, dmu_object_info_t *doi)
+{
+ dnode_t *dn;
+ int err = dnode_hold(os, object, FTAG, &dn);
+
+ if (err)
+ return (err);
+
+ if (doi != NULL)
+ dmu_object_info_from_dnode(dn, doi);
+
+ dnode_rele(dn, FTAG);
+ return (0);
+}
+
+/*
+ * As above, but faster; can be used when you have a held dbuf in hand.
+ */
+void
+dmu_object_info_from_db(dmu_buf_t *db_fake, dmu_object_info_t *doi)
+{
+ dmu_buf_impl_t *db = (dmu_buf_impl_t *)db_fake;
+
+ DB_DNODE_ENTER(db);
+ dmu_object_info_from_dnode(DB_DNODE(db), doi);
+ DB_DNODE_EXIT(db);
+}
+
+/*
+ * Faster still when you only care about the size.
+ * This is specifically optimized for zfs_getattr().
+ */
+void
+dmu_object_size_from_db(dmu_buf_t *db_fake, uint32_t *blksize,
+ u_longlong_t *nblk512)
+{
+ dmu_buf_impl_t *db = (dmu_buf_impl_t *)db_fake;
+ dnode_t *dn;
+
+ DB_DNODE_ENTER(db);
+ dn = DB_DNODE(db);
+
+ *blksize = dn->dn_datablksz;
+ /* add in number of slots used for the dnode itself */
+ *nblk512 = ((DN_USED_BYTES(dn->dn_phys) + SPA_MINBLOCKSIZE/2) >>
+ SPA_MINBLOCKSHIFT) + dn->dn_num_slots;
+ DB_DNODE_EXIT(db);
+}
+
+void
+dmu_object_dnsize_from_db(dmu_buf_t *db_fake, int *dnsize)
+{
+ dmu_buf_impl_t *db = (dmu_buf_impl_t *)db_fake;
+ dnode_t *dn;
+
+ DB_DNODE_ENTER(db);
+ dn = DB_DNODE(db);
+ *dnsize = dn->dn_num_slots << DNODE_SHIFT;
+ DB_DNODE_EXIT(db);
+}
+
+void
+byteswap_uint64_array(void *vbuf, size_t size)
+{
+ uint64_t *buf = vbuf;
+ size_t count = size >> 3;
+ int i;
+
+ ASSERT((size & 7) == 0);
+
+ for (i = 0; i < count; i++)
+ buf[i] = BSWAP_64(buf[i]);
+}
+
+void
+byteswap_uint32_array(void *vbuf, size_t size)
+{
+ uint32_t *buf = vbuf;
+ size_t count = size >> 2;
+ int i;
+
+ ASSERT((size & 3) == 0);
+
+ for (i = 0; i < count; i++)
+ buf[i] = BSWAP_32(buf[i]);
+}
+
+void
+byteswap_uint16_array(void *vbuf, size_t size)
+{
+ uint16_t *buf = vbuf;
+ size_t count = size >> 1;
+ int i;
+
+ ASSERT((size & 1) == 0);
+
+ for (i = 0; i < count; i++)
+ buf[i] = BSWAP_16(buf[i]);
+}
+
+/* ARGSUSED */
+void
+byteswap_uint8_array(void *vbuf, size_t size)
+{
+}
+
+void
+dmu_init(void)
+{
+ abd_init();
+ zfs_dbgmsg_init();
+ sa_cache_init();
+ xuio_stat_init();
+ dmu_objset_init();
+ dnode_init();
+ zfetch_init();
+ zio_compress_init();
+ l2arc_init();
+ arc_init();
+ dbuf_init();
+}
+
+void
+dmu_fini(void)
+{
+ arc_fini(); /* arc depends on l2arc, so arc must go first */
+ l2arc_fini();
+ zfetch_fini();
+ zio_compress_fini();
+ dbuf_fini();
+ dnode_fini();
+ dmu_objset_fini();
+ xuio_stat_fini();
+ sa_cache_fini();
+ zfs_dbgmsg_fini();
+ abd_fini();
+}
diff --git a/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/dmu_diff.c b/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/dmu_diff.c
new file mode 100644
index 000000000000..e7bfdaa90e97
--- /dev/null
+++ b/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/dmu_diff.c
@@ -0,0 +1,251 @@
+/*
+ * CDDL HEADER START
+ *
+ * The contents of this file are subject to the terms of the
+ * Common Development and Distribution License (the "License").
+ * You may not use this file except in compliance with the License.
+ *
+ * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
+ * or http://www.opensolaris.org/os/licensing.
+ * See the License for the specific language governing permissions
+ * and limitations under the License.
+ *
+ * When distributing Covered Code, include this CDDL HEADER in each
+ * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
+ * If applicable, add the following below this CDDL HEADER, with the
+ * fields enclosed by brackets "[]" replaced with your own identifying
+ * information: Portions Copyright [yyyy] [name of copyright owner]
+ *
+ * CDDL HEADER END
+ */
+/*
+ * Copyright (c) 2010, Oracle and/or its affiliates. All rights reserved.
+ * Copyright (c) 2012, 2015 by Delphix. All rights reserved.
+ */
+
+#include <sys/dmu.h>
+#include <sys/dmu_impl.h>
+#include <sys/dmu_tx.h>
+#include <sys/dbuf.h>
+#include <sys/dnode.h>
+#include <sys/zfs_context.h>
+#include <sys/dmu_objset.h>
+#include <sys/dmu_traverse.h>
+#include <sys/dsl_dataset.h>
+#include <sys/dsl_dir.h>
+#include <sys/dsl_pool.h>
+#include <sys/dsl_synctask.h>
+#include <sys/zfs_ioctl.h>
+#include <sys/zap.h>
+#include <sys/zio_checksum.h>
+#include <sys/zfs_znode.h>
+
+struct diffarg {
+ struct file *da_fp; /* file to which we are reporting */
+ offset_t *da_offp;
+ int da_err; /* error that stopped diff search */
+ dmu_diff_record_t da_ddr;
+ kthread_t *da_td;
+};
+
+static int
+write_bytes(struct diffarg *da)
+{
+ struct uio auio;
+ struct iovec aiov;
+
+ aiov.iov_base = (caddr_t)&da->da_ddr;
+ aiov.iov_len = sizeof (da->da_ddr);
+ auio.uio_iov = &aiov;
+ auio.uio_iovcnt = 1;
+ auio.uio_resid = aiov.iov_len;
+ auio.uio_segflg = UIO_SYSSPACE;
+ auio.uio_rw = UIO_WRITE;
+ auio.uio_offset = (off_t)-1;
+ auio.uio_td = da->da_td;
+#ifdef _KERNEL
+ if (da->da_fp->f_type == DTYPE_VNODE)
+ bwillwrite();
+ return (fo_write(da->da_fp, &auio, da->da_td->td_ucred, 0, da->da_td));
+#else
+ fprintf(stderr, "%s: returning EOPNOTSUPP\n", __func__);
+ return (EOPNOTSUPP);
+#endif
+}
+
+static int
+write_record(struct diffarg *da)
+{
+
+ if (da->da_ddr.ddr_type == DDR_NONE) {
+ da->da_err = 0;
+ return (0);
+ }
+
+ da->da_err = write_bytes(da);
+ *da->da_offp += sizeof (da->da_ddr);
+ return (da->da_err);
+}
+
+static int
+report_free_dnode_range(struct diffarg *da, uint64_t first, uint64_t last)
+{
+ ASSERT(first <= last);
+ if (da->da_ddr.ddr_type != DDR_FREE ||
+ first != da->da_ddr.ddr_last + 1) {
+ if (write_record(da) != 0)
+ return (da->da_err);
+ da->da_ddr.ddr_type = DDR_FREE;
+ da->da_ddr.ddr_first = first;
+ da->da_ddr.ddr_last = last;
+ return (0);
+ }
+ da->da_ddr.ddr_last = last;
+ return (0);
+}
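+
+ /*
+ * Example of the coalescing above: calls for free objects 5, 6 and 7
+ * emit no records; the pending DDR_FREE record grows to cover [5, 7]
+ * and is only written once a non-adjacent or DDR_INUSE range arrives
+ * (or by the final write_record() in dmu_diff()).
+ */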
+
+static int
+report_dnode(struct diffarg *da, uint64_t object, dnode_phys_t *dnp)
+{
+ ASSERT(dnp != NULL);
+ if (dnp->dn_type == DMU_OT_NONE)
+ return (report_free_dnode_range(da, object, object));
+
+ if (da->da_ddr.ddr_type != DDR_INUSE ||
+ object != da->da_ddr.ddr_last + 1) {
+ if (write_record(da) != 0)
+ return (da->da_err);
+ da->da_ddr.ddr_type = DDR_INUSE;
+ da->da_ddr.ddr_first = da->da_ddr.ddr_last = object;
+ return (0);
+ }
+ da->da_ddr.ddr_last = object;
+ return (0);
+}
+
+#define DBP_SPAN(dnp, level) \
+ (((uint64_t)dnp->dn_datablkszsec) << (SPA_MINBLOCKSHIFT + \
+ (level) * (dnp->dn_indblkshift - SPA_BLKPTRSHIFT)))
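+
+ /*
+ * Worked example, assuming the common metadnode geometry of 16K data
+ * blocks and 128K indirect blocks (dn_datablkszsec = 32,
+ * dn_indblkshift = 17, SPA_BLKPTRSHIFT = 7): DBP_SPAN(dnp, 0) =
+ * 32 << 9 = 16K, and DBP_SPAN(dnp, 1) = 32 << (9 + 10) = 16M.
+ */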
+
+/* ARGSUSED */
+static int
+diff_cb(spa_t *spa, zilog_t *zilog, const blkptr_t *bp,
+ const zbookmark_phys_t *zb, const dnode_phys_t *dnp, void *arg)
+{
+ struct diffarg *da = arg;
+ int err = 0;
+
+ if (issig(JUSTLOOKING) && issig(FORREAL))
+ return (SET_ERROR(EINTR));
+
+ if (bp == NULL || zb->zb_object != DMU_META_DNODE_OBJECT)
+ return (0);
+
+ if (BP_IS_HOLE(bp)) {
+ uint64_t span = DBP_SPAN(dnp, zb->zb_level);
+ uint64_t dnobj = (zb->zb_blkid * span) >> DNODE_SHIFT;
+
+ err = report_free_dnode_range(da, dnobj,
+ dnobj + (span >> DNODE_SHIFT) - 1);
+ if (err)
+ return (err);
+ } else if (zb->zb_level == 0) {
+ dnode_phys_t *blk;
+ arc_buf_t *abuf;
+ arc_flags_t aflags = ARC_FLAG_WAIT;
+ int blksz = BP_GET_LSIZE(bp);
+ int i;
+
+ if (arc_read(NULL, spa, bp, arc_getbuf_func, &abuf,
+ ZIO_PRIORITY_ASYNC_READ, ZIO_FLAG_CANFAIL,
+ &aflags, zb) != 0)
+ return (SET_ERROR(EIO));
+
+ blk = abuf->b_data;
+ for (i = 0; i < blksz >> DNODE_SHIFT; i++) {
+ uint64_t dnobj = (zb->zb_blkid <<
+ (DNODE_BLOCK_SHIFT - DNODE_SHIFT)) + i;
+ err = report_dnode(da, dnobj, blk+i);
+ if (err)
+ break;
+ }
+ arc_buf_destroy(abuf, &abuf);
+ if (err)
+ return (err);
+ /* Don't care about the data blocks */
+ return (TRAVERSE_VISIT_NO_CHILDREN);
+ }
+ return (0);
+}
+
+int
+dmu_diff(const char *tosnap_name, const char *fromsnap_name,
+#ifdef illumos
+ struct vnode *vp, offset_t *offp)
+#else
+ struct file *fp, offset_t *offp)
+#endif
+{
+ struct diffarg da;
+ dsl_dataset_t *fromsnap;
+ dsl_dataset_t *tosnap;
+ dsl_pool_t *dp;
+ int error;
+ uint64_t fromtxg;
+
+ if (strchr(tosnap_name, '@') == NULL ||
+ strchr(fromsnap_name, '@') == NULL)
+ return (SET_ERROR(EINVAL));
+
+ error = dsl_pool_hold(tosnap_name, FTAG, &dp);
+ if (error != 0)
+ return (error);
+
+ error = dsl_dataset_hold(dp, tosnap_name, FTAG, &tosnap);
+ if (error != 0) {
+ dsl_pool_rele(dp, FTAG);
+ return (error);
+ }
+
+ error = dsl_dataset_hold(dp, fromsnap_name, FTAG, &fromsnap);
+ if (error != 0) {
+ dsl_dataset_rele(tosnap, FTAG);
+ dsl_pool_rele(dp, FTAG);
+ return (error);
+ }
+
+ if (!dsl_dataset_is_before(tosnap, fromsnap, 0)) {
+ dsl_dataset_rele(fromsnap, FTAG);
+ dsl_dataset_rele(tosnap, FTAG);
+ dsl_pool_rele(dp, FTAG);
+ return (SET_ERROR(EXDEV));
+ }
+
+ fromtxg = dsl_dataset_phys(fromsnap)->ds_creation_txg;
+ dsl_dataset_rele(fromsnap, FTAG);
+
+ dsl_dataset_long_hold(tosnap, FTAG);
+ dsl_pool_rele(dp, FTAG);
+
+ da.da_fp = fp;
+ da.da_offp = offp;
+ da.da_ddr.ddr_type = DDR_NONE;
+ da.da_ddr.ddr_first = da.da_ddr.ddr_last = 0;
+ da.da_err = 0;
+ da.da_td = curthread;
+
+ error = traverse_dataset(tosnap, fromtxg,
+ TRAVERSE_PRE | TRAVERSE_PREFETCH_METADATA, diff_cb, &da);
+
+ if (error != 0) {
+ da.da_err = error;
+ } else {
+ /* write_record() sets the da.da_err that we return, as a side effect */
+ (void) write_record(&da);
+ }
+
+ dsl_dataset_long_rele(tosnap, FTAG);
+ dsl_dataset_rele(tosnap, FTAG);
+
+ return (da.da_err);
+}
diff --git a/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/dmu_object.c b/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/dmu_object.c
new file mode 100644
index 000000000000..b40ccf4a7839
--- /dev/null
+++ b/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/dmu_object.c
@@ -0,0 +1,444 @@
+/*
+ * CDDL HEADER START
+ *
+ * The contents of this file are subject to the terms of the
+ * Common Development and Distribution License (the "License").
+ * You may not use this file except in compliance with the License.
+ *
+ * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
+ * or http://www.opensolaris.org/os/licensing.
+ * See the License for the specific language governing permissions
+ * and limitations under the License.
+ *
+ * When distributing Covered Code, include this CDDL HEADER in each
+ * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
+ * If applicable, add the following below this CDDL HEADER, with the
+ * fields enclosed by brackets "[]" replaced with your own identifying
+ * information: Portions Copyright [yyyy] [name of copyright owner]
+ *
+ * CDDL HEADER END
+ */
+/*
+ * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved.
+ * Copyright (c) 2013, 2017 by Delphix. All rights reserved.
+ * Copyright 2014 HybridCluster. All rights reserved.
+ */
+
+#include <sys/dmu.h>
+#include <sys/dmu_objset.h>
+#include <sys/dmu_tx.h>
+#include <sys/dnode.h>
+#include <sys/zap.h>
+#include <sys/zfeature.h>
+#include <sys/dsl_dataset.h>
+
+/*
+ * Each of the concurrent object allocators will grab
+ * 2^dmu_object_alloc_chunk_shift dnode slots at a time. The default is to
+ * grab 128 slots, which is 4 blocks' worth. This was experimentally
+ * determined to be the lowest value that eliminates the measurable effect
+ * of lock contention from this code path.
+ */
+int dmu_object_alloc_chunk_shift = 7;
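+
+ /*
+ * Arithmetic behind the default: 2^7 = 128 slots; with 512-byte
+ * minimum dnodes that is 64K, i.e. four 16K dnode blocks of 32
+ * slots each.
+ */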
+
+static uint64_t
+dmu_object_alloc_impl(objset_t *os, dmu_object_type_t ot, int blocksize,
+ int indirect_blockshift, dmu_object_type_t bonustype, int bonuslen,
+ int dnodesize, dmu_tx_t *tx)
+{
+ uint64_t object;
+ uint64_t L1_dnode_count = DNODES_PER_BLOCK <<
+ (DMU_META_DNODE(os)->dn_indblkshift - SPA_BLKPTRSHIFT);
+ dnode_t *dn = NULL;
+ int dn_slots = dnodesize >> DNODE_SHIFT;
+ boolean_t restarted = B_FALSE;
+ uint64_t *cpuobj = &os->os_obj_next_percpu[CPU_SEQID %
+ os->os_obj_next_percpu_len];
+ int dnodes_per_chunk = 1 << dmu_object_alloc_chunk_shift;
+ int error;
+
+ if (dn_slots == 0) {
+ dn_slots = DNODE_MIN_SLOTS;
+ } else {
+ ASSERT3S(dn_slots, >=, DNODE_MIN_SLOTS);
+ ASSERT3S(dn_slots, <=, DNODE_MAX_SLOTS);
+ }
+
+ /*
+ * The "chunk" of dnodes that is assigned to a CPU-specific
+ * allocator needs to be at least one block's worth, to avoid
+ * lock contention on the dbuf. It can be at most one L1 block's
+ * worth, so that the "rescan after polishing off an L1's worth"
+ * logic below will be sure to kick in.
+ */
+ if (dnodes_per_chunk < DNODES_PER_BLOCK)
+ dnodes_per_chunk = DNODES_PER_BLOCK;
+ if (dnodes_per_chunk > L1_dnode_count)
+ dnodes_per_chunk = L1_dnode_count;
+
+#ifdef __FreeBSD__
+ object = atomic_load_64(cpuobj);
+#else
+ object = *cpuobj;
+#endif
+
+ for (;;) {
+ /*
+ * If we finished a chunk of dnodes, get a new one from
+ * the global allocator.
+ */
+ if ((P2PHASE(object, dnodes_per_chunk) == 0) ||
+ (P2PHASE(object + dn_slots - 1, dnodes_per_chunk) <
+ dn_slots)) {
+ DNODE_STAT_BUMP(dnode_alloc_next_chunk);
+ mutex_enter(&os->os_obj_lock);
+ ASSERT0(P2PHASE(os->os_obj_next_chunk,
+ dnodes_per_chunk));
+ object = os->os_obj_next_chunk;
+
+ /*
+ * Each time we polish off an L1 bp's worth of dnodes
+ * (2^12 objects), move to another L1 bp that's
+ * still reasonably sparse (at most 1/4 full). Look
+ * from the beginning at most once per txg. If we
+ * still can't allocate from that L1 block, search
+ * for an empty L0 block, which will quickly skip
+ * to the end of the metadnode if no nearby L0
+ * blocks are empty. This fallback avoids a
+ * pathology where full dnode blocks containing
+ * large dnodes appear sparse because they have a
+ * low blk_fill, leading to many failed allocation
+ * attempts. In the long term a better mechanism to
+ * search for sparse metadnode regions, such as
+ * spacemaps, could be implemented.
+ *
+ * os_scan_dnodes is set during txg sync if enough
+ * objects have been freed since the previous
+ * rescan to justify backfilling again.
+ *
+ * Note that dmu_traverse depends on the behavior
+ * that we use multiple blocks of the dnode object
+ * before going back to reuse objects. Any change
+ * to this algorithm should preserve that property
+ * or find another solution to the issues described
+ * in traverse_visitbp.
+ */
+ if (P2PHASE(object, L1_dnode_count) == 0) {
+ uint64_t offset;
+ uint64_t blkfill;
+ int minlvl;
+ if (os->os_rescan_dnodes) {
+ offset = 0;
+ os->os_rescan_dnodes = B_FALSE;
+ } else {
+ offset = object << DNODE_SHIFT;
+ }
+ blkfill = restarted ? 1 : DNODES_PER_BLOCK >> 2;
+ minlvl = restarted ? 1 : 2;
+ restarted = B_TRUE;
+ error = dnode_next_offset(DMU_META_DNODE(os),
+ DNODE_FIND_HOLE, &offset, minlvl,
+ blkfill, 0);
+ if (error == 0) {
+ object = offset >> DNODE_SHIFT;
+ }
+ }
+ /*
+ * Note: if "restarted", we may find an L0 that
+ * is not suitably aligned.
+ */
+ os->os_obj_next_chunk =
+ P2ALIGN(object, dnodes_per_chunk) +
+ dnodes_per_chunk;
+ (void) atomic_swap_64(cpuobj, object);
+ mutex_exit(&os->os_obj_lock);
+ }
+
+ /*
+ * The value of (*cpuobj) before adding dn_slots is the object
+ * ID assigned to us. The value afterwards is the object ID
+ * assigned to whoever wants to do an allocation next.
+ */
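+ /*
+ * E.g. if *cpuobj is 192 and dn_slots is 2, this thread is
+ * assigned object 192 (occupying two slots) and the next
+ * allocation attempt on this CPU will start at 194.
+ */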
+ object = atomic_add_64_nv(cpuobj, dn_slots) - dn_slots;
+
+ /*
+ * XXX We should check for an i/o error here and return
+ * up to our caller. Actually we should pre-read it in
+ * dmu_tx_assign(), but there is currently no mechanism
+ * to do so.
+ */
+ error = dnode_hold_impl(os, object, DNODE_MUST_BE_FREE,
+ dn_slots, FTAG, &dn);
+ if (error == 0) {
+ rw_enter(&dn->dn_struct_rwlock, RW_WRITER);
+ /*
+ * Another thread could have allocated it; check
+ * again now that we have the struct lock.
+ */
+ if (dn->dn_type == DMU_OT_NONE) {
+ dnode_allocate(dn, ot, blocksize, 0,
+ bonustype, bonuslen, dn_slots, tx);
+ rw_exit(&dn->dn_struct_rwlock);
+ dmu_tx_add_new_object(tx, dn);
+ dnode_rele(dn, FTAG);
+ return (object);
+ }
+ rw_exit(&dn->dn_struct_rwlock);
+ dnode_rele(dn, FTAG);
+ DNODE_STAT_BUMP(dnode_alloc_race);
+ }
+
+ /*
+ * Skip to next known valid starting point on error. This
+ * is the start of the next block of dnodes.
+ */
+ if (dmu_object_next(os, &object, B_TRUE, 0) != 0) {
+ object = P2ROUNDUP(object + 1, DNODES_PER_BLOCK);
+ DNODE_STAT_BUMP(dnode_alloc_next_block);
+ }
+ (void) atomic_swap_64(cpuobj, object);
+ }
+}
+
+uint64_t
+dmu_object_alloc(objset_t *os, dmu_object_type_t ot, int blocksize,
+ dmu_object_type_t bonustype, int bonuslen, dmu_tx_t *tx)
+{
+ return (dmu_object_alloc_impl(os, ot, blocksize, 0, bonustype,
+ bonuslen, 0, tx));
+}
+
+uint64_t
+dmu_object_alloc_ibs(objset_t *os, dmu_object_type_t ot, int blocksize,
+ int indirect_blockshift, dmu_object_type_t bonustype, int bonuslen,
+ dmu_tx_t *tx)
+{
+ return (dmu_object_alloc_impl(os, ot, blocksize, indirect_blockshift,
+ bonustype, bonuslen, 0, tx));
+}
+
+uint64_t
+dmu_object_alloc_dnsize(objset_t *os, dmu_object_type_t ot, int blocksize,
+ dmu_object_type_t bonustype, int bonuslen, int dnodesize, dmu_tx_t *tx)
+{
+ return (dmu_object_alloc_impl(os, ot, blocksize, 0, bonustype,
+ bonuslen, dnodesize, tx));
+}
+
+int
+dmu_object_claim(objset_t *os, uint64_t object, dmu_object_type_t ot,
+ int blocksize, dmu_object_type_t bonustype, int bonuslen, dmu_tx_t *tx)
+{
+ return (dmu_object_claim_dnsize(os, object, ot, blocksize, bonustype,
+ bonuslen, 0, tx));
+}
+
+int
+dmu_object_claim_dnsize(objset_t *os, uint64_t object, dmu_object_type_t ot,
+ int blocksize, dmu_object_type_t bonustype, int bonuslen,
+ int dnodesize, dmu_tx_t *tx)
+{
+ dnode_t *dn;
+ int dn_slots = dnodesize >> DNODE_SHIFT;
+ int err;
+
+ if (dn_slots == 0)
+ dn_slots = DNODE_MIN_SLOTS;
+ ASSERT3S(dn_slots, >=, DNODE_MIN_SLOTS);
+ ASSERT3S(dn_slots, <=, DNODE_MAX_SLOTS);
+
+ if (object == DMU_META_DNODE_OBJECT && !dmu_tx_private_ok(tx))
+ return (SET_ERROR(EBADF));
+
+ err = dnode_hold_impl(os, object, DNODE_MUST_BE_FREE, dn_slots,
+ FTAG, &dn);
+ if (err)
+ return (err);
+ dnode_allocate(dn, ot, blocksize, 0, bonustype, bonuslen, dn_slots, tx);
+ dmu_tx_add_new_object(tx, dn);
+
+ dnode_rele(dn, FTAG);
+
+ return (0);
+}
+
+int
+dmu_object_reclaim(objset_t *os, uint64_t object, dmu_object_type_t ot,
+ int blocksize, dmu_object_type_t bonustype, int bonuslen, dmu_tx_t *tx)
+{
+ return (dmu_object_reclaim_dnsize(os, object, ot, blocksize, bonustype,
+ bonuslen, DNODE_MIN_SIZE, tx));
+}
+
+int
+dmu_object_reclaim_dnsize(objset_t *os, uint64_t object, dmu_object_type_t ot,
+ int blocksize, dmu_object_type_t bonustype, int bonuslen, int dnodesize,
+ dmu_tx_t *tx)
+{
+ dnode_t *dn;
+ int dn_slots = dnodesize >> DNODE_SHIFT;
+ int err;
+
+ if (dn_slots == 0)
+ dn_slots = DNODE_MIN_SLOTS;
+
+ if (object == DMU_META_DNODE_OBJECT)
+ return (SET_ERROR(EBADF));
+
+ err = dnode_hold_impl(os, object, DNODE_MUST_BE_ALLOCATED, 0,
+ FTAG, &dn);
+ if (err)
+ return (err);
+
+ dnode_reallocate(dn, ot, blocksize, bonustype, bonuslen, dn_slots, tx);
+
+ dnode_rele(dn, FTAG);
+ return (err);
+}
+
+
+int
+dmu_object_free(objset_t *os, uint64_t object, dmu_tx_t *tx)
+{
+ dnode_t *dn;
+ int err;
+
+ ASSERT(object != DMU_META_DNODE_OBJECT || dmu_tx_private_ok(tx));
+
+ err = dnode_hold_impl(os, object, DNODE_MUST_BE_ALLOCATED, 0,
+ FTAG, &dn);
+ if (err)
+ return (err);
+
+ ASSERT(dn->dn_type != DMU_OT_NONE);
+ /*
+ * If we don't create this free range, we'll leak indirect blocks when
+ * we get to freeing the dnode in syncing context.
+ */
+ dnode_free_range(dn, 0, DMU_OBJECT_END, tx);
+ dnode_free(dn, tx);
+ dnode_rele(dn, FTAG);
+
+ return (0);
+}
+
+/*
+ * Return (in *objectp) the next object which is allocated (or a hole)
+ * after *object, taking into account only objects that may have been modified
+ * after the specified txg.
+ */
+int
+dmu_object_next(objset_t *os, uint64_t *objectp, boolean_t hole, uint64_t txg)
+{
+ uint64_t offset;
+ uint64_t start_obj;
+ struct dsl_dataset *ds = os->os_dsl_dataset;
+ int error;
+
+ if (*objectp == 0) {
+ start_obj = 1;
+ } else if (ds && ds->ds_feature_inuse[SPA_FEATURE_LARGE_DNODE]) {
+ uint64_t i = *objectp + 1;
+ uint64_t last_obj = *objectp | (DNODES_PER_BLOCK - 1);
+ dmu_object_info_t doi;
+
+ /*
+ * Scan through the remaining meta dnode block. The contents
+ * of each slot in the block are known so it can be quickly
+ * checked. If the block is exhausted without a match then
+ * hand off to dnode_next_offset() for further scanning.
+ */
+ while (i <= last_obj) {
+ error = dmu_object_info(os, i, &doi);
+ if (error == ENOENT) {
+ if (hole) {
+ *objectp = i;
+ return (0);
+ } else {
+ i++;
+ }
+ } else if (error == EEXIST) {
+ i++;
+ } else if (error == 0) {
+ if (hole) {
+ i += doi.doi_dnodesize >> DNODE_SHIFT;
+ } else {
+ *objectp = i;
+ return (0);
+ }
+ } else {
+ return (error);
+ }
+ }
+
+ start_obj = i;
+ } else {
+ start_obj = *objectp + 1;
+ }
+
+ offset = start_obj << DNODE_SHIFT;
+
+ error = dnode_next_offset(DMU_META_DNODE(os),
+ (hole ? DNODE_FIND_HOLE : 0), &offset, 0, DNODES_PER_BLOCK, txg);
+
+ *objectp = offset >> DNODE_SHIFT;
+
+ return (error);
+}
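+
+ /*
+ * Typical iteration idiom built on this primitive (sketch; 'os' is
+ * a held objset):
+ */
+ #if 0
+ uint64_t obj = 0;
+ while (dmu_object_next(os, &obj, B_FALSE, 0) == 0) {
+ /* visit allocated object 'obj'; the next call advances past it */
+ }
+ /* the loop ends with ESRCH once no allocated objects remain */
+ #endif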
+
+/*
+ * Turn this object from old_type into DMU_OTN_ZAP_METADATA, and bump the
+ * refcount on SPA_FEATURE_EXTENSIBLE_DATASET.
+ *
+ * Only for use from syncing context, on MOS objects.
+ */
+void
+dmu_object_zapify(objset_t *mos, uint64_t object, dmu_object_type_t old_type,
+ dmu_tx_t *tx)
+{
+ dnode_t *dn;
+
+ ASSERT(dmu_tx_is_syncing(tx));
+
+ VERIFY0(dnode_hold(mos, object, FTAG, &dn));
+ if (dn->dn_type == DMU_OTN_ZAP_METADATA) {
+ dnode_rele(dn, FTAG);
+ return;
+ }
+ ASSERT3U(dn->dn_type, ==, old_type);
+ ASSERT0(dn->dn_maxblkid);
+
+ /*
+ * We must initialize the ZAP data before changing the type,
+ * so that concurrent calls to *_is_zapified() can determine if
+ * the object has been completely zapified by checking the type.
+ */
+ mzap_create_impl(mos, object, 0, 0, tx);
+
+ dn->dn_next_type[tx->tx_txg & TXG_MASK] = dn->dn_type =
+ DMU_OTN_ZAP_METADATA;
+ dnode_setdirty(dn, tx);
+ dnode_rele(dn, FTAG);
+
+ spa_feature_incr(dmu_objset_spa(mos),
+ SPA_FEATURE_EXTENSIBLE_DATASET, tx);
+}
+
+void
+dmu_object_free_zapified(objset_t *mos, uint64_t object, dmu_tx_t *tx)
+{
+ dnode_t *dn;
+ dmu_object_type_t t;
+
+ ASSERT(dmu_tx_is_syncing(tx));
+
+ VERIFY0(dnode_hold(mos, object, FTAG, &dn));
+ t = dn->dn_type;
+ dnode_rele(dn, FTAG);
+
+ if (t == DMU_OTN_ZAP_METADATA) {
+ spa_feature_decr(dmu_objset_spa(mos),
+ SPA_FEATURE_EXTENSIBLE_DATASET, tx);
+ }
+ VERIFY0(dmu_object_free(mos, object, tx));
+}
diff --git a/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/dmu_objset.c b/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/dmu_objset.c
new file mode 100644
index 000000000000..1b691d412293
--- /dev/null
+++ b/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/dmu_objset.c
@@ -0,0 +1,2484 @@
+/*
+ * CDDL HEADER START
+ *
+ * The contents of this file are subject to the terms of the
+ * Common Development and Distribution License (the "License").
+ * You may not use this file except in compliance with the License.
+ *
+ * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
+ * or http://www.opensolaris.org/os/licensing.
+ * See the License for the specific language governing permissions
+ * and limitations under the License.
+ *
+ * When distributing Covered Code, include this CDDL HEADER in each
+ * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
+ * If applicable, add the following below this CDDL HEADER, with the
+ * fields enclosed by brackets "[]" replaced with your own identifying
+ * information: Portions Copyright [yyyy] [name of copyright owner]
+ *
+ * CDDL HEADER END
+ */
+
+/*
+ * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved.
+ * Copyright (c) 2012, 2017 by Delphix. All rights reserved.
+ * Copyright (c) 2013 by Saso Kiselkov. All rights reserved.
+ * Copyright (c) 2013, Joyent, Inc. All rights reserved.
+ * Copyright (c) 2014 Spectra Logic Corporation, All rights reserved.
+ * Copyright (c) 2015, STRATO AG, Inc. All rights reserved.
+ * Copyright (c) 2014 Integros [integros.com]
+ * Copyright 2017 Nexenta Systems, Inc.
+ * Copyright (c) 2018, loli10K <ezomori.nozomu@gmail.com>. All rights reserved.
+ */
+
+/* Portions Copyright 2010 Robert Milkowski */
+
+#include <sys/cred.h>
+#include <sys/zfs_context.h>
+#include <sys/dmu_objset.h>
+#include <sys/dsl_dir.h>
+#include <sys/dsl_dataset.h>
+#include <sys/dsl_prop.h>
+#include <sys/dsl_pool.h>
+#include <sys/dsl_synctask.h>
+#include <sys/dsl_deleg.h>
+#include <sys/dnode.h>
+#include <sys/dbuf.h>
+#include <sys/zvol.h>
+#include <sys/dmu_tx.h>
+#include <sys/zap.h>
+#include <sys/zil.h>
+#include <sys/dmu_impl.h>
+#include <sys/zfs_ioctl.h>
+#include <sys/sa.h>
+#include <sys/zfs_onexit.h>
+#include <sys/dsl_destroy.h>
+#include <sys/vdev.h>
+#include <sys/zfeature.h>
+#include "zfs_namecheck.h"
+
+/*
+ * Needed to close a window in dnode_move() that allows the objset to be freed
+ * before it can be safely accessed.
+ */
+krwlock_t os_lock;
+
+/*
+ * Tunable to override the maximum number of threads for the parallelization
+ * of dmu_objset_find_dp, needed to speed up the import of pools with many
+ * datasets.
+ * Default is 4 times the number of leaf vdevs.
+ */
+int dmu_find_threads = 0;
+
+/*
+ * Backfill lower metadnode objects after this many have been freed.
+ * Backfilling negatively impacts object creation rates, so only do it
+ * if there are enough holes to fill.
+ */
+int dmu_rescan_dnode_threshold = 131072;
+
+static void dmu_objset_find_dp_cb(void *arg);
+
+void
+dmu_objset_init(void)
+{
+ rw_init(&os_lock, NULL, RW_DEFAULT, NULL);
+}
+
+void
+dmu_objset_fini(void)
+{
+ rw_destroy(&os_lock);
+}
+
+spa_t *
+dmu_objset_spa(objset_t *os)
+{
+ return (os->os_spa);
+}
+
+zilog_t *
+dmu_objset_zil(objset_t *os)
+{
+ return (os->os_zil);
+}
+
+dsl_pool_t *
+dmu_objset_pool(objset_t *os)
+{
+ dsl_dataset_t *ds;
+
+ if ((ds = os->os_dsl_dataset) != NULL && ds->ds_dir)
+ return (ds->ds_dir->dd_pool);
+ else
+ return (spa_get_dsl(os->os_spa));
+}
+
+dsl_dataset_t *
+dmu_objset_ds(objset_t *os)
+{
+ return (os->os_dsl_dataset);
+}
+
+dmu_objset_type_t
+dmu_objset_type(objset_t *os)
+{
+ return (os->os_phys->os_type);
+}
+
+void
+dmu_objset_name(objset_t *os, char *buf)
+{
+ dsl_dataset_name(os->os_dsl_dataset, buf);
+}
+
+uint64_t
+dmu_objset_id(objset_t *os)
+{
+ dsl_dataset_t *ds = os->os_dsl_dataset;
+
+ return (ds ? ds->ds_object : 0);
+}
+
+uint64_t
+dmu_objset_dnodesize(objset_t *os)
+{
+ return (os->os_dnodesize);
+}
+
+zfs_sync_type_t
+dmu_objset_syncprop(objset_t *os)
+{
+ return (os->os_sync);
+}
+
+zfs_logbias_op_t
+dmu_objset_logbias(objset_t *os)
+{
+ return (os->os_logbias);
+}
+
+static void
+checksum_changed_cb(void *arg, uint64_t newval)
+{
+ objset_t *os = arg;
+
+ /*
+ * Inheritance should have been done by now.
+ */
+ ASSERT(newval != ZIO_CHECKSUM_INHERIT);
+
+ os->os_checksum = zio_checksum_select(newval, ZIO_CHECKSUM_ON_VALUE);
+}
+
+static void
+compression_changed_cb(void *arg, uint64_t newval)
+{
+ objset_t *os = arg;
+
+ /*
+ * Inheritance and range checking should have been done by now.
+ */
+ ASSERT(newval != ZIO_COMPRESS_INHERIT);
+
+ os->os_compress = zio_compress_select(os->os_spa, newval,
+ ZIO_COMPRESS_ON);
+}
+
+static void
+copies_changed_cb(void *arg, uint64_t newval)
+{
+ objset_t *os = arg;
+
+ /*
+ * Inheritance and range checking should have been done by now.
+ */
+ ASSERT(newval > 0);
+ ASSERT(newval <= spa_max_replication(os->os_spa));
+
+ os->os_copies = newval;
+}
+
+static void
+dedup_changed_cb(void *arg, uint64_t newval)
+{
+ objset_t *os = arg;
+ spa_t *spa = os->os_spa;
+ enum zio_checksum checksum;
+
+ /*
+ * Inheritance should have been done by now.
+ */
+ ASSERT(newval != ZIO_CHECKSUM_INHERIT);
+
+ checksum = zio_checksum_dedup_select(spa, newval, ZIO_CHECKSUM_OFF);
+
+ os->os_dedup_checksum = checksum & ZIO_CHECKSUM_MASK;
+ os->os_dedup_verify = !!(checksum & ZIO_CHECKSUM_VERIFY);
+}
+
+static void
+primary_cache_changed_cb(void *arg, uint64_t newval)
+{
+ objset_t *os = arg;
+
+ /*
+ * Inheritance and range checking should have been done by now.
+ */
+ ASSERT(newval == ZFS_CACHE_ALL || newval == ZFS_CACHE_NONE ||
+ newval == ZFS_CACHE_METADATA);
+
+ os->os_primary_cache = newval;
+}
+
+static void
+secondary_cache_changed_cb(void *arg, uint64_t newval)
+{
+ objset_t *os = arg;
+
+ /*
+ * Inheritance and range checking should have been done by now.
+ */
+ ASSERT(newval == ZFS_CACHE_ALL || newval == ZFS_CACHE_NONE ||
+ newval == ZFS_CACHE_METADATA);
+
+ os->os_secondary_cache = newval;
+}
+
+static void
+sync_changed_cb(void *arg, uint64_t newval)
+{
+ objset_t *os = arg;
+
+ /*
+ * Inheritance and range checking should have been done by now.
+ */
+ ASSERT(newval == ZFS_SYNC_STANDARD || newval == ZFS_SYNC_ALWAYS ||
+ newval == ZFS_SYNC_DISABLED);
+
+ os->os_sync = newval;
+ if (os->os_zil)
+ zil_set_sync(os->os_zil, newval);
+}
+
+static void
+redundant_metadata_changed_cb(void *arg, uint64_t newval)
+{
+ objset_t *os = arg;
+
+ /*
+ * Inheritance and range checking should have been done by now.
+ */
+ ASSERT(newval == ZFS_REDUNDANT_METADATA_ALL ||
+ newval == ZFS_REDUNDANT_METADATA_MOST);
+
+ os->os_redundant_metadata = newval;
+}
+
+static void
+dnodesize_changed_cb(void *arg, uint64_t newval)
+{
+ objset_t *os = arg;
+
+ switch (newval) {
+ case ZFS_DNSIZE_LEGACY:
+ os->os_dnodesize = DNODE_MIN_SIZE;
+ break;
+ case ZFS_DNSIZE_AUTO:
+ /*
+ * Choose a dnode size that will work well for most
+ * workloads if the user specified "auto". Future code
+ * improvements could dynamically select a dnode size
+ * based on observed workload patterns.
+ */
+ os->os_dnodesize = DNODE_MIN_SIZE * 2;
+ break;
+ case ZFS_DNSIZE_1K:
+ case ZFS_DNSIZE_2K:
+ case ZFS_DNSIZE_4K:
+ case ZFS_DNSIZE_8K:
+ case ZFS_DNSIZE_16K:
+ os->os_dnodesize = newval;
+ break;
+ }
+}
+
+static void
+smallblk_changed_cb(void *arg, uint64_t newval)
+{
+ objset_t *os = arg;
+
+ /*
+ * Inheritance and range checking should have been done by now.
+ */
+ ASSERT(newval <= SPA_OLD_MAXBLOCKSIZE);
+ ASSERT(ISP2(newval));
+
+ os->os_zpl_special_smallblock = newval;
+}
+
+static void
+logbias_changed_cb(void *arg, uint64_t newval)
+{
+ objset_t *os = arg;
+
+ ASSERT(newval == ZFS_LOGBIAS_LATENCY ||
+ newval == ZFS_LOGBIAS_THROUGHPUT);
+ os->os_logbias = newval;
+ if (os->os_zil)
+ zil_set_logbias(os->os_zil, newval);
+}
+
+static void
+recordsize_changed_cb(void *arg, uint64_t newval)
+{
+ objset_t *os = arg;
+
+ os->os_recordsize = newval;
+}
+
+void
+dmu_objset_byteswap(void *buf, size_t size)
+{
+ objset_phys_t *osp = buf;
+
+ ASSERT(size == OBJSET_OLD_PHYS_SIZE || size == sizeof (objset_phys_t));
+ dnode_byteswap(&osp->os_meta_dnode);
+ byteswap_uint64_array(&osp->os_zil_header, sizeof (zil_header_t));
+ osp->os_type = BSWAP_64(osp->os_type);
+ osp->os_flags = BSWAP_64(osp->os_flags);
+ if (size == sizeof (objset_phys_t)) {
+ dnode_byteswap(&osp->os_userused_dnode);
+ dnode_byteswap(&osp->os_groupused_dnode);
+ }
+}
+
+/*
+ * The hash is a CRC-based hash of the objset_t pointer and the object number.
+ */
+static uint64_t
+dnode_hash(const objset_t *os, uint64_t obj)
+{
+ uintptr_t osv = (uintptr_t)os;
+ uint64_t crc = -1ULL;
+
+ ASSERT(zfs_crc64_table[128] == ZFS_CRC64_POLY);
+ /*
+ * The low 6 bits of the pointer don't have much entropy, because
+ * the objset_t is larger than 2^6 bytes long.
+ * the objset_t is larger than 2^6 bytes.
+ crc = (crc >> 8) ^ zfs_crc64_table[(crc ^ (osv >> 6)) & 0xFF];
+ crc = (crc >> 8) ^ zfs_crc64_table[(crc ^ (obj >> 0)) & 0xFF];
+ crc = (crc >> 8) ^ zfs_crc64_table[(crc ^ (obj >> 8)) & 0xFF];
+ crc = (crc >> 8) ^ zfs_crc64_table[(crc ^ (obj >> 16)) & 0xFF];
+
+ crc ^= (osv>>14) ^ (obj>>24);
+
+ return (crc);
+}
+
+unsigned int
+dnode_multilist_index_func(multilist_t *ml, void *obj)
+{
+ dnode_t *dn = obj;
+ return (dnode_hash(dn->dn_objset, dn->dn_object) %
+ multilist_get_num_sublists(ml));
+}
+
+/*
+ * Instantiates the objset_t in-memory structure corresponding to the
+ * objset_phys_t that's pointed to by the specified blkptr_t.
+ */
+int
+dmu_objset_open_impl(spa_t *spa, dsl_dataset_t *ds, blkptr_t *bp,
+ objset_t **osp)
+{
+ objset_t *os;
+ int i, err;
+
+ ASSERT(ds == NULL || MUTEX_HELD(&ds->ds_opening_lock));
+
+#if 0
+ /*
+ * The $ORIGIN dataset (if it exists) doesn't have an associated
+ * objset, so there's no reason to open it. The $ORIGIN dataset
+ * will not exist on pools older than SPA_VERSION_ORIGIN.
+ */
+ if (ds != NULL && spa_get_dsl(spa) != NULL &&
+ spa_get_dsl(spa)->dp_origin_snap != NULL) {
+ ASSERT3P(ds->ds_dir, !=,
+ spa_get_dsl(spa)->dp_origin_snap->ds_dir);
+ }
+#endif
+
+ os = kmem_zalloc(sizeof (objset_t), KM_SLEEP);
+ os->os_dsl_dataset = ds;
+ os->os_spa = spa;
+ os->os_rootbp = bp;
+ if (!BP_IS_HOLE(os->os_rootbp)) {
+ arc_flags_t aflags = ARC_FLAG_WAIT;
+ zbookmark_phys_t zb;
+ SET_BOOKMARK(&zb, ds ? ds->ds_object : DMU_META_OBJSET,
+ ZB_ROOT_OBJECT, ZB_ROOT_LEVEL, ZB_ROOT_BLKID);
+
+ if (DMU_OS_IS_L2CACHEABLE(os))
+ aflags |= ARC_FLAG_L2CACHE;
+
+ dprintf_bp(os->os_rootbp, "reading %s", "");
+ err = arc_read(NULL, spa, os->os_rootbp,
+ arc_getbuf_func, &os->os_phys_buf,
+ ZIO_PRIORITY_SYNC_READ, ZIO_FLAG_CANFAIL, &aflags, &zb);
+ if (err != 0) {
+ kmem_free(os, sizeof (objset_t));
+ /* convert checksum errors into IO errors */
+ if (err == ECKSUM)
+ err = SET_ERROR(EIO);
+ return (err);
+ }
+
+ /* Increase the blocksize if we are permitted. */
+ if (spa_version(spa) >= SPA_VERSION_USERSPACE &&
+ arc_buf_size(os->os_phys_buf) < sizeof (objset_phys_t)) {
+ arc_buf_t *buf = arc_alloc_buf(spa, &os->os_phys_buf,
+ ARC_BUFC_METADATA, sizeof (objset_phys_t));
+ bzero(buf->b_data, sizeof (objset_phys_t));
+ bcopy(os->os_phys_buf->b_data, buf->b_data,
+ arc_buf_size(os->os_phys_buf));
+ arc_buf_destroy(os->os_phys_buf, &os->os_phys_buf);
+ os->os_phys_buf = buf;
+ }
+
+ os->os_phys = os->os_phys_buf->b_data;
+ os->os_flags = os->os_phys->os_flags;
+ } else {
+ int size = spa_version(spa) >= SPA_VERSION_USERSPACE ?
+ sizeof (objset_phys_t) : OBJSET_OLD_PHYS_SIZE;
+ os->os_phys_buf = arc_alloc_buf(spa, &os->os_phys_buf,
+ ARC_BUFC_METADATA, size);
+ os->os_phys = os->os_phys_buf->b_data;
+ bzero(os->os_phys, size);
+ }
+
+ /*
+ * Note: the changed_cb will be called once before the register
+ * func returns, thus changing the checksum/compression from the
+ * default (fletcher2/off). Snapshots don't need to know about
+ * checksum/compression/copies.
+ */
+ if (ds != NULL) {
+ boolean_t needlock = B_FALSE;
+
+ /*
+ * Note: it's valid to open the objset if the dataset is
+ * long-held, in which case the pool_config lock will not
+ * be held.
+ */
+ if (!dsl_pool_config_held(dmu_objset_pool(os))) {
+ needlock = B_TRUE;
+ dsl_pool_config_enter(dmu_objset_pool(os), FTAG);
+ }
+ err = dsl_prop_register(ds,
+ zfs_prop_to_name(ZFS_PROP_PRIMARYCACHE),
+ primary_cache_changed_cb, os);
+ if (err == 0) {
+ err = dsl_prop_register(ds,
+ zfs_prop_to_name(ZFS_PROP_SECONDARYCACHE),
+ secondary_cache_changed_cb, os);
+ }
+ if (!ds->ds_is_snapshot) {
+ if (err == 0) {
+ err = dsl_prop_register(ds,
+ zfs_prop_to_name(ZFS_PROP_CHECKSUM),
+ checksum_changed_cb, os);
+ }
+ if (err == 0) {
+ err = dsl_prop_register(ds,
+ zfs_prop_to_name(ZFS_PROP_COMPRESSION),
+ compression_changed_cb, os);
+ }
+ if (err == 0) {
+ err = dsl_prop_register(ds,
+ zfs_prop_to_name(ZFS_PROP_COPIES),
+ copies_changed_cb, os);
+ }
+ if (err == 0) {
+ err = dsl_prop_register(ds,
+ zfs_prop_to_name(ZFS_PROP_DEDUP),
+ dedup_changed_cb, os);
+ }
+ if (err == 0) {
+ err = dsl_prop_register(ds,
+ zfs_prop_to_name(ZFS_PROP_LOGBIAS),
+ logbias_changed_cb, os);
+ }
+ if (err == 0) {
+ err = dsl_prop_register(ds,
+ zfs_prop_to_name(ZFS_PROP_SYNC),
+ sync_changed_cb, os);
+ }
+ if (err == 0) {
+ err = dsl_prop_register(ds,
+ zfs_prop_to_name(
+ ZFS_PROP_REDUNDANT_METADATA),
+ redundant_metadata_changed_cb, os);
+ }
+ if (err == 0) {
+ err = dsl_prop_register(ds,
+ zfs_prop_to_name(ZFS_PROP_RECORDSIZE),
+ recordsize_changed_cb, os);
+ }
+ if (err == 0) {
+ err = dsl_prop_register(ds,
+ zfs_prop_to_name(ZFS_PROP_DNODESIZE),
+ dnodesize_changed_cb, os);
+ }
+ if (err == 0) {
+ err = dsl_prop_register(ds,
+ zfs_prop_to_name(
+ ZFS_PROP_SPECIAL_SMALL_BLOCKS),
+ smallblk_changed_cb, os);
+ }
+ }
+ if (needlock)
+ dsl_pool_config_exit(dmu_objset_pool(os), FTAG);
+ if (err != 0) {
+ arc_buf_destroy(os->os_phys_buf, &os->os_phys_buf);
+ kmem_free(os, sizeof (objset_t));
+ return (err);
+ }
+ } else {
+ /* It's the meta-objset. */
+ os->os_checksum = ZIO_CHECKSUM_FLETCHER_4;
+ os->os_compress = ZIO_COMPRESS_ON;
+ os->os_copies = spa_max_replication(spa);
+ os->os_dedup_checksum = ZIO_CHECKSUM_OFF;
+ os->os_dedup_verify = B_FALSE;
+ os->os_logbias = ZFS_LOGBIAS_LATENCY;
+ os->os_sync = ZFS_SYNC_STANDARD;
+ os->os_primary_cache = ZFS_CACHE_ALL;
+ os->os_secondary_cache = ZFS_CACHE_ALL;
+ os->os_dnodesize = DNODE_MIN_SIZE;
+ }
+ /*
+ * These properties will be filled in by the logic in zfs_get_zplprop()
+ * when they are queried for the first time.
+ */
+ os->os_version = OBJSET_PROP_UNINITIALIZED;
+ os->os_normalization = OBJSET_PROP_UNINITIALIZED;
+ os->os_utf8only = OBJSET_PROP_UNINITIALIZED;
+ os->os_casesensitivity = OBJSET_PROP_UNINITIALIZED;
+
+ if (ds == NULL || !ds->ds_is_snapshot)
+ os->os_zil_header = os->os_phys->os_zil_header;
+ os->os_zil = zil_alloc(os, &os->os_zil_header);
+
+ for (i = 0; i < TXG_SIZE; i++) {
+ os->os_dirty_dnodes[i] = multilist_create(sizeof (dnode_t),
+ offsetof(dnode_t, dn_dirty_link[i]),
+ dnode_multilist_index_func);
+ }
+ list_create(&os->os_dnodes, sizeof (dnode_t),
+ offsetof(dnode_t, dn_link));
+ list_create(&os->os_downgraded_dbufs, sizeof (dmu_buf_impl_t),
+ offsetof(dmu_buf_impl_t, db_link));
+
+ mutex_init(&os->os_lock, NULL, MUTEX_DEFAULT, NULL);
+ mutex_init(&os->os_userused_lock, NULL, MUTEX_DEFAULT, NULL);
+ mutex_init(&os->os_obj_lock, NULL, MUTEX_DEFAULT, NULL);
+ mutex_init(&os->os_user_ptr_lock, NULL, MUTEX_DEFAULT, NULL);
+ os->os_obj_next_percpu_len = boot_ncpus;
+ os->os_obj_next_percpu = kmem_zalloc(os->os_obj_next_percpu_len *
+ sizeof (os->os_obj_next_percpu[0]), KM_SLEEP);
+
+ dnode_special_open(os, &os->os_phys->os_meta_dnode,
+ DMU_META_DNODE_OBJECT, &os->os_meta_dnode);
+ if (arc_buf_size(os->os_phys_buf) >= sizeof (objset_phys_t)) {
+ dnode_special_open(os, &os->os_phys->os_userused_dnode,
+ DMU_USERUSED_OBJECT, &os->os_userused_dnode);
+ dnode_special_open(os, &os->os_phys->os_groupused_dnode,
+ DMU_GROUPUSED_OBJECT, &os->os_groupused_dnode);
+ }
+
+ *osp = os;
+ return (0);
+}
+
+int
+dmu_objset_from_ds(dsl_dataset_t *ds, objset_t **osp)
+{
+ int err = 0;
+
+ /*
+ * We shouldn't be doing anything with dsl_dataset_t's unless the
+ * pool_config lock is held, or the dataset is long-held.
+ */
+ ASSERT(dsl_pool_config_held(ds->ds_dir->dd_pool) ||
+ dsl_dataset_long_held(ds));
+
+ mutex_enter(&ds->ds_opening_lock);
+ if (ds->ds_objset == NULL) {
+ objset_t *os;
+ rrw_enter(&ds->ds_bp_rwlock, RW_READER, FTAG);
+ err = dmu_objset_open_impl(dsl_dataset_get_spa(ds),
+ ds, dsl_dataset_get_blkptr(ds), &os);
+ rrw_exit(&ds->ds_bp_rwlock, FTAG);
+
+ if (err == 0) {
+ mutex_enter(&ds->ds_lock);
+ ASSERT(ds->ds_objset == NULL);
+ ds->ds_objset = os;
+ mutex_exit(&ds->ds_lock);
+ }
+ }
+ *osp = ds->ds_objset;
+ mutex_exit(&ds->ds_opening_lock);
+ return (err);
+}
+
+/*
+ * Holds the pool while the objset is held. Therefore only one objset
+ * can be held at a time.
+ */
+int
+dmu_objset_hold(const char *name, void *tag, objset_t **osp)
+{
+ dsl_pool_t *dp;
+ dsl_dataset_t *ds;
+ int err;
+
+ err = dsl_pool_hold(name, tag, &dp);
+ if (err != 0)
+ return (err);
+ err = dsl_dataset_hold(dp, name, tag, &ds);
+ if (err != 0) {
+ dsl_pool_rele(dp, tag);
+ return (err);
+ }
+
+ err = dmu_objset_from_ds(ds, osp);
+ if (err != 0) {
+ dsl_dataset_rele(ds, tag);
+ dsl_pool_rele(dp, tag);
+ }
+
+ return (err);
+}
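+
+ /*
+ * Minimal hold/release sketch (hypothetical dataset name; read-only,
+ * short-term access, and the pool stays held exactly as long as the
+ * objset is held):
+ */
+ #if 0
+ objset_t *os;
+ int err = dmu_objset_hold("tank/fs", FTAG, &os);
+ if (err == 0) {
+ /* ... non-owning use of 'os' ... */
+ dmu_objset_rele(os, FTAG);
+ }
+ #endif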
+
+static int
+dmu_objset_own_impl(dsl_dataset_t *ds, dmu_objset_type_t type,
+ boolean_t readonly, void *tag, objset_t **osp)
+{
+ int err;
+
+ err = dmu_objset_from_ds(ds, osp);
+ if (err != 0) {
+ dsl_dataset_disown(ds, tag);
+ } else if (type != DMU_OST_ANY && type != (*osp)->os_phys->os_type) {
+ dsl_dataset_disown(ds, tag);
+ return (SET_ERROR(EINVAL));
+ } else if (!readonly && dsl_dataset_is_snapshot(ds)) {
+ dsl_dataset_disown(ds, tag);
+ return (SET_ERROR(EROFS));
+ }
+ return (err);
+}
+
+/*
+ * dsl_pool must not be held when this is called.
+ * Upon successful return, there will be a longhold on the dataset,
+ * and the dsl_pool will not be held.
+ */
+int
+dmu_objset_own(const char *name, dmu_objset_type_t type,
+ boolean_t readonly, void *tag, objset_t **osp)
+{
+ dsl_pool_t *dp;
+ dsl_dataset_t *ds;
+ int err;
+
+ err = dsl_pool_hold(name, FTAG, &dp);
+ if (err != 0)
+ return (err);
+ err = dsl_dataset_own(dp, name, tag, &ds);
+ if (err != 0) {
+ dsl_pool_rele(dp, FTAG);
+ return (err);
+ }
+ err = dmu_objset_own_impl(ds, type, readonly, tag, osp);
+ dsl_pool_rele(dp, FTAG);
+
+ return (err);
+}
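+
+ /*
+ * Ownership sketch (hypothetical names): owning suits long-lived,
+ * possibly-writing consumers such as a mounted filesystem or zvol,
+ * and must be paired with dmu_objset_disown():
+ */
+ #if 0
+ objset_t *os;
+ int err = dmu_objset_own("tank/vol", DMU_OST_ZVOL, B_FALSE, FTAG, &os);
+ if (err == 0) {
+ /* ... long-term use; the dataset is long-held, the pool is not ... */
+ dmu_objset_disown(os, FTAG);
+ }
+ #endif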
+
+int
+dmu_objset_own_obj(dsl_pool_t *dp, uint64_t obj, dmu_objset_type_t type,
+ boolean_t readonly, void *tag, objset_t **osp)
+{
+ dsl_dataset_t *ds;
+ int err;
+
+ err = dsl_dataset_own_obj(dp, obj, tag, &ds);
+ if (err != 0)
+ return (err);
+
+ return (dmu_objset_own_impl(ds, type, readonly, tag, osp));
+}
+
+void
+dmu_objset_rele(objset_t *os, void *tag)
+{
+ dsl_pool_t *dp = dmu_objset_pool(os);
+ dsl_dataset_rele(os->os_dsl_dataset, tag);
+ dsl_pool_rele(dp, tag);
+}
+
+/*
+ * When we are called, os MUST refer to an objset associated with a dataset
+ * that is owned by 'tag'; that is, is held and long held by 'tag' and ds_owner
+ * == tag. We will then release and reacquire ownership of the dataset while
+ * holding the pool config_rwlock to prevent intervening namespace or
+ * ownership changes from occurring.
+ *
+ * This exists solely to accommodate zfs_ioc_userspace_upgrade()'s desire to
+ * release the hold on its dataset and acquire a new one on the dataset of the
+ * same name so that it can be partially torn down and reconstructed.
+ */
+void
+dmu_objset_refresh_ownership(dsl_dataset_t *ds, dsl_dataset_t **newds,
+ void *tag)
+{
+ dsl_pool_t *dp;
+ char name[ZFS_MAX_DATASET_NAME_LEN];
+
+ VERIFY3P(ds, !=, NULL);
+ VERIFY3P(ds->ds_owner, ==, tag);
+ VERIFY(dsl_dataset_long_held(ds));
+
+ dsl_dataset_name(ds, name);
+ dp = ds->ds_dir->dd_pool;
+ dsl_pool_config_enter(dp, FTAG);
+ dsl_dataset_disown(ds, tag);
+ VERIFY0(dsl_dataset_own(dp, name, tag, newds));
+ dsl_pool_config_exit(dp, FTAG);
+}
+
+void
+dmu_objset_disown(objset_t *os, void *tag)
+{
+ dsl_dataset_disown(os->os_dsl_dataset, tag);
+}
+
+void
+dmu_objset_evict_dbufs(objset_t *os)
+{
+ dnode_t dn_marker;
+ dnode_t *dn;
+
+ mutex_enter(&os->os_lock);
+ dn = list_head(&os->os_dnodes);
+ while (dn != NULL) {
+ /*
+ * Skip dnodes without holds. We have to do this dance
+ * because dnode_add_ref() only works if there is already a
+ * hold. If the dnode has no holds, then it has no dbufs.
+ */
+ if (dnode_add_ref(dn, FTAG)) {
+ list_insert_after(&os->os_dnodes, dn, &dn_marker);
+ mutex_exit(&os->os_lock);
+
+ dnode_evict_dbufs(dn);
+ dnode_rele(dn, FTAG);
+
+ mutex_enter(&os->os_lock);
+ dn = list_next(&os->os_dnodes, &dn_marker);
+ list_remove(&os->os_dnodes, &dn_marker);
+ } else {
+ dn = list_next(&os->os_dnodes, dn);
+ }
+ }
+ mutex_exit(&os->os_lock);
+
+ if (DMU_USERUSED_DNODE(os) != NULL) {
+ dnode_evict_dbufs(DMU_GROUPUSED_DNODE(os));
+ dnode_evict_dbufs(DMU_USERUSED_DNODE(os));
+ }
+ dnode_evict_dbufs(DMU_META_DNODE(os));
+}
+
+/*
+ * Objset eviction processing is split into two pieces.
+ * The first marks the objset as evicting, evicts any dbufs that
+ * have a refcount of zero, and then queues up the objset for the
+ * second phase of eviction. Once os->os_dnodes has been cleared by
+ * dnode_buf_pageout()->dnode_destroy(), the second phase is executed.
+ * The second phase closes the special dnodes, dequeues the objset from
+ * the list of those undergoing eviction, and finally frees the objset.
+ *
+ * NOTE: Due to asynchronous eviction processing (invocation of
+ * dnode_buf_pageout()), it is possible for the meta dnode for the
+ * objset to have no holds even though os->os_dnodes is not empty.
+ */
+void
+dmu_objset_evict(objset_t *os)
+{
+ dsl_dataset_t *ds = os->os_dsl_dataset;
+
+ for (int t = 0; t < TXG_SIZE; t++)
+ ASSERT(!dmu_objset_is_dirty(os, t));
+
+ if (ds)
+ dsl_prop_unregister_all(ds, os);
+
+ if (os->os_sa)
+ sa_tear_down(os);
+
+ dmu_objset_evict_dbufs(os);
+
+ mutex_enter(&os->os_lock);
+ spa_evicting_os_register(os->os_spa, os);
+ if (list_is_empty(&os->os_dnodes)) {
+ mutex_exit(&os->os_lock);
+ dmu_objset_evict_done(os);
+ } else {
+ mutex_exit(&os->os_lock);
+ }
+}
+
+void
+dmu_objset_evict_done(objset_t *os)
+{
+ ASSERT3P(list_head(&os->os_dnodes), ==, NULL);
+
+ dnode_special_close(&os->os_meta_dnode);
+ if (DMU_USERUSED_DNODE(os)) {
+ dnode_special_close(&os->os_userused_dnode);
+ dnode_special_close(&os->os_groupused_dnode);
+ }
+ zil_free(os->os_zil);
+
+ arc_buf_destroy(os->os_phys_buf, &os->os_phys_buf);
+
+ /*
+ * This is a barrier to prevent the objset from going away in
+ * dnode_move() until we can safely ensure that the objset is still in
+ * use. We consider the objset valid before the barrier and invalid
+ * after the barrier.
+ */
+ rw_enter(&os_lock, RW_READER);
+ rw_exit(&os_lock);
+
+ kmem_free(os->os_obj_next_percpu,
+ os->os_obj_next_percpu_len * sizeof (os->os_obj_next_percpu[0]));
+
+ mutex_destroy(&os->os_lock);
+ mutex_destroy(&os->os_userused_lock);
+ mutex_destroy(&os->os_obj_lock);
+ mutex_destroy(&os->os_user_ptr_lock);
+ for (int i = 0; i < TXG_SIZE; i++) {
+ multilist_destroy(os->os_dirty_dnodes[i]);
+ }
+ spa_evicting_os_deregister(os->os_spa, os);
+ kmem_free(os, sizeof (objset_t));
+}
+
+timestruc_t
+dmu_objset_snap_cmtime(objset_t *os)
+{
+ return (dsl_dir_snap_cmtime(os->os_dsl_dataset->ds_dir));
+}
+
+/* called from dsl for meta-objset */
+objset_t *
+dmu_objset_create_impl(spa_t *spa, dsl_dataset_t *ds, blkptr_t *bp,
+ dmu_objset_type_t type, dmu_tx_t *tx)
+{
+ objset_t *os;
+ dnode_t *mdn;
+
+ ASSERT(dmu_tx_is_syncing(tx));
+
+ if (ds != NULL)
+ VERIFY0(dmu_objset_from_ds(ds, &os));
+ else
+ VERIFY0(dmu_objset_open_impl(spa, NULL, bp, &os));
+
+ mdn = DMU_META_DNODE(os);
+
+ dnode_allocate(mdn, DMU_OT_DNODE, DNODE_BLOCK_SIZE, DN_MAX_INDBLKSHIFT,
+ DMU_OT_NONE, 0, DNODE_MIN_SLOTS, tx);
+
+ /*
+ * We don't want to have to increase the meta-dnode's nlevels
+ * later, because then we could do it in quiescing context while
+ * we are also accessing it in open context.
+ *
+ * This precaution is not necessary for the MOS (ds == NULL),
+ * because the MOS is only updated in syncing context.
+ * This is most fortunate: the MOS is the only objset that
+ * needs to be synced multiple times as spa_sync() iterates
+ * to convergence, so minimizing its dn_nlevels matters.
+ */
+ if (ds != NULL) {
+ int levels = 1;
+
+ /*
+ * Determine the number of levels necessary for the meta-dnode
+ * to contain DN_MAX_OBJECT dnodes. Note that in order to
+ * ensure that we do not overflow 64 bits, there has to be
+ * an nlevels that gives us a number of blocks > DN_MAX_OBJECT
+ * but < 2^64. Therefore,
+ * (mdn->dn_indblkshift - SPA_BLKPTRSHIFT) (10) must be
+ * less than (64 - log2(DN_MAX_OBJECT)) (16).
+ */
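+ /*
+ * Worked example (illustrative; assumes 16K dnode blocks with
+ * 512-byte dnodes, 128K indirect blocks with 128-byte block
+ * pointers, and dn_nblkptr == 3): a leaf block holds
+ * 2^(14-9) == 32 dnodes and each indirect level multiplies
+ * capacity by 2^(17-7) == 1024, so levels == 6 is the first
+ * value for which 3 << (5 + (levels - 1) * 10) exceeds
+ * DN_MAX_OBJECT (2^48).
+ */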
+ while ((uint64_t)mdn->dn_nblkptr <<
+ (mdn->dn_datablkshift - DNODE_SHIFT +
+ (levels - 1) * (mdn->dn_indblkshift - SPA_BLKPTRSHIFT)) <
+ DN_MAX_OBJECT)
+ levels++;
+
+ mdn->dn_next_nlevels[tx->tx_txg & TXG_MASK] =
+ mdn->dn_nlevels = levels;
+ }
+
+ ASSERT(type != DMU_OST_NONE);
+ ASSERT(type != DMU_OST_ANY);
+ ASSERT(type < DMU_OST_NUMTYPES);
+ os->os_phys->os_type = type;
+ if (dmu_objset_userused_enabled(os)) {
+ os->os_phys->os_flags |= OBJSET_FLAG_USERACCOUNTING_COMPLETE;
+ os->os_flags = os->os_phys->os_flags;
+ }
+
+ dsl_dataset_dirty(ds, tx);
+
+ return (os);
+}
+
+typedef struct dmu_objset_create_arg {
+ const char *doca_name;
+ cred_t *doca_cred;
+ void (*doca_userfunc)(objset_t *os, void *arg,
+ cred_t *cr, dmu_tx_t *tx);
+ void *doca_userarg;
+ dmu_objset_type_t doca_type;
+ uint64_t doca_flags;
+} dmu_objset_create_arg_t;
+
+/*ARGSUSED*/
+static int
+dmu_objset_create_check(void *arg, dmu_tx_t *tx)
+{
+ dmu_objset_create_arg_t *doca = arg;
+ dsl_pool_t *dp = dmu_tx_pool(tx);
+ dsl_dir_t *pdd;
+ dsl_dataset_t *parentds;
+ objset_t *parentos;
+ const char *tail;
+ int error;
+
+ if (strchr(doca->doca_name, '@') != NULL)
+ return (SET_ERROR(EINVAL));
+
+ if (strlen(doca->doca_name) >= ZFS_MAX_DATASET_NAME_LEN)
+ return (SET_ERROR(ENAMETOOLONG));
+
+ if (dataset_nestcheck(doca->doca_name) != 0)
+ return (SET_ERROR(ENAMETOOLONG));
+
+ error = dsl_dir_hold(dp, doca->doca_name, FTAG, &pdd, &tail);
+ if (error != 0)
+ return (error);
+ if (tail == NULL) {
+ dsl_dir_rele(pdd, FTAG);
+ return (SET_ERROR(EEXIST));
+ }
+ error = dsl_fs_ss_limit_check(pdd, 1, ZFS_PROP_FILESYSTEM_LIMIT, NULL,
+ doca->doca_cred);
+ if (error != 0) {
+ dsl_dir_rele(pdd, FTAG);
+ return (error);
+ }
+
+ /* can't create below anything but filesystems (e.g., no ZVOLs) */
+ error = dsl_dataset_hold_obj(pdd->dd_pool,
+ dsl_dir_phys(pdd)->dd_head_dataset_obj, FTAG, &parentds);
+ if (error != 0) {
+ dsl_dir_rele(pdd, FTAG);
+ return (error);
+ }
+ error = dmu_objset_from_ds(parentds, &parentos);
+ if (error != 0) {
+ dsl_dataset_rele(parentds, FTAG);
+ dsl_dir_rele(pdd, FTAG);
+ return (error);
+ }
+ if (dmu_objset_type(parentos) != DMU_OST_ZFS) {
+ dsl_dataset_rele(parentds, FTAG);
+ dsl_dir_rele(pdd, FTAG);
+ return (SET_ERROR(ZFS_ERR_WRONG_PARENT));
+ }
+ dsl_dataset_rele(parentds, FTAG);
+ dsl_dir_rele(pdd, FTAG);
+
+ return (error);
+}
+
+static void
+dmu_objset_create_sync(void *arg, dmu_tx_t *tx)
+{
+ dmu_objset_create_arg_t *doca = arg;
+ dsl_pool_t *dp = dmu_tx_pool(tx);
+ dsl_dir_t *pdd;
+ const char *tail;
+ dsl_dataset_t *ds;
+ uint64_t obj;
+ blkptr_t *bp;
+ objset_t *os;
+
+ VERIFY0(dsl_dir_hold(dp, doca->doca_name, FTAG, &pdd, &tail));
+
+ obj = dsl_dataset_create_sync(pdd, tail, NULL, doca->doca_flags,
+ doca->doca_cred, tx);
+
+ VERIFY0(dsl_dataset_hold_obj(pdd->dd_pool, obj, FTAG, &ds));
+ rrw_enter(&ds->ds_bp_rwlock, RW_READER, FTAG);
+ bp = dsl_dataset_get_blkptr(ds);
+ os = dmu_objset_create_impl(pdd->dd_pool->dp_spa,
+ ds, bp, doca->doca_type, tx);
+ rrw_exit(&ds->ds_bp_rwlock, FTAG);
+
+ if (doca->doca_userfunc != NULL) {
+ doca->doca_userfunc(os, doca->doca_userarg,
+ doca->doca_cred, tx);
+ }
+
+#if defined(__FreeBSD__) && defined(_KERNEL)
+ zvol_create_minors(dp->dp_spa, doca->doca_name);
+#endif
+ spa_history_log_internal_ds(ds, "create", tx, "");
+ dsl_dataset_rele(ds, FTAG);
+ dsl_dir_rele(pdd, FTAG);
+}
+
+int
+dmu_objset_create(const char *name, dmu_objset_type_t type, uint64_t flags,
+ void (*func)(objset_t *os, void *arg, cred_t *cr, dmu_tx_t *tx), void *arg)
+{
+ dmu_objset_create_arg_t doca;
+
+ doca.doca_name = name;
+ doca.doca_cred = CRED();
+ doca.doca_flags = flags;
+ doca.doca_userfunc = func;
+ doca.doca_userarg = arg;
+ doca.doca_type = type;
+
+ return (dsl_sync_task(name,
+ dmu_objset_create_check, dmu_objset_create_sync, &doca,
+ 5, ZFS_SPACE_CHECK_NORMAL));
+}
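+
+/*
+ * Illustrative call (hypothetical names; the callback, if supplied, runs
+ * in syncing context from dmu_objset_create_sync() above):
+ *
+ *     err = dmu_objset_create("tank/newfs", DMU_OST_ZFS, 0,
+ *         setup_cb, setup_arg);
+ */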
+
+typedef struct dmu_objset_clone_arg {
+ const char *doca_clone;
+ const char *doca_origin;
+ cred_t *doca_cred;
+} dmu_objset_clone_arg_t;
+
+/*ARGSUSED*/
+static int
+dmu_objset_clone_check(void *arg, dmu_tx_t *tx)
+{
+ dmu_objset_clone_arg_t *doca = arg;
+ dsl_dir_t *pdd;
+ const char *tail;
+ int error;
+ dsl_dataset_t *origin;
+ dsl_pool_t *dp = dmu_tx_pool(tx);
+
+ if (strchr(doca->doca_clone, '@') != NULL)
+ return (SET_ERROR(EINVAL));
+
+ if (strlen(doca->doca_clone) >= ZFS_MAX_DATASET_NAME_LEN)
+ return (SET_ERROR(ENAMETOOLONG));
+
+ error = dsl_dir_hold(dp, doca->doca_clone, FTAG, &pdd, &tail);
+ if (error != 0)
+ return (error);
+ if (tail == NULL) {
+ dsl_dir_rele(pdd, FTAG);
+ return (SET_ERROR(EEXIST));
+ }
+
+ error = dsl_fs_ss_limit_check(pdd, 1, ZFS_PROP_FILESYSTEM_LIMIT, NULL,
+ doca->doca_cred);
+ if (error != 0) {
+ dsl_dir_rele(pdd, FTAG);
+ return (SET_ERROR(EDQUOT));
+ }
+ dsl_dir_rele(pdd, FTAG);
+
+ error = dsl_dataset_hold(dp, doca->doca_origin, FTAG, &origin);
+ if (error != 0)
+ return (error);
+
+ /* You can only clone snapshots, not the head datasets. */
+ if (!origin->ds_is_snapshot) {
+ dsl_dataset_rele(origin, FTAG);
+ return (SET_ERROR(EINVAL));
+ }
+ dsl_dataset_rele(origin, FTAG);
+
+ return (0);
+}
+
+static void
+dmu_objset_clone_sync(void *arg, dmu_tx_t *tx)
+{
+ dmu_objset_clone_arg_t *doca = arg;
+ dsl_pool_t *dp = dmu_tx_pool(tx);
+ dsl_dir_t *pdd;
+ const char *tail;
+ dsl_dataset_t *origin, *ds;
+ uint64_t obj;
+ char namebuf[ZFS_MAX_DATASET_NAME_LEN];
+
+ VERIFY0(dsl_dir_hold(dp, doca->doca_clone, FTAG, &pdd, &tail));
+ VERIFY0(dsl_dataset_hold(dp, doca->doca_origin, FTAG, &origin));
+
+ obj = dsl_dataset_create_sync(pdd, tail, origin, 0,
+ doca->doca_cred, tx);
+
+ VERIFY0(dsl_dataset_hold_obj(pdd->dd_pool, obj, FTAG, &ds));
+ dsl_dataset_name(origin, namebuf);
+#if defined(__FreeBSD__) && defined(_KERNEL)
+ zvol_create_minors(dp->dp_spa, doca->doca_clone);
+#endif
+ spa_history_log_internal_ds(ds, "clone", tx,
+ "origin=%s (%llu)", namebuf, origin->ds_object);
+ dsl_dataset_rele(ds, FTAG);
+ dsl_dataset_rele(origin, FTAG);
+ dsl_dir_rele(pdd, FTAG);
+}
+
+int
+dmu_objset_clone(const char *clone, const char *origin)
+{
+ dmu_objset_clone_arg_t doca;
+
+ doca.doca_clone = clone;
+ doca.doca_origin = origin;
+ doca.doca_cred = CRED();
+
+ return (dsl_sync_task(clone,
+ dmu_objset_clone_check, dmu_objset_clone_sync, &doca,
+ 5, ZFS_SPACE_CHECK_NORMAL));
+}
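+
+/*
+ * Illustrative call (hypothetical names): clone a snapshot into a new
+ * head dataset; the origin must be a snapshot, per the check above:
+ *
+ *     err = dmu_objset_clone("tank/copy", "tank/fs@monday");
+ */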
+
+static int
+dmu_objset_remap_indirects_impl(objset_t *os, uint64_t last_removed_txg)
+{
+ int error = 0;
+ uint64_t object = 0;
+ while ((error = dmu_object_next(os, &object, B_FALSE, 0)) == 0) {
+ error = dmu_object_remap_indirects(os, object,
+ last_removed_txg);
+ /*
+ * If the ZPL removed the object before we managed to dnode_hold
+ * it, we would get an ENOENT. If the ZPL declares its intent
+ * to remove the object (dnode_free) before we manage to
+ * dnode_hold it, we would get an EEXIST. In either case, we
+ * want to continue remapping the other objects in the objset;
+ * in all other cases, we want to break early.
+ */
+ if (error != 0 && error != ENOENT && error != EEXIST) {
+ break;
+ }
+ }
+ if (error == ESRCH) {
+ error = 0;
+ }
+ return (error);
+}
+
+int
+dmu_objset_remap_indirects(const char *fsname)
+{
+ int error = 0;
+ objset_t *os = NULL;
+ uint64_t last_removed_txg;
+ uint64_t remap_start_txg;
+ dsl_dir_t *dd;
+
+ error = dmu_objset_hold(fsname, FTAG, &os);
+ if (error != 0) {
+ return (error);
+ }
+ dd = dmu_objset_ds(os)->ds_dir;
+
+ if (!spa_feature_is_enabled(dmu_objset_spa(os),
+ SPA_FEATURE_OBSOLETE_COUNTS)) {
+ dmu_objset_rele(os, FTAG);
+ return (SET_ERROR(ENOTSUP));
+ }
+
+ if (dsl_dataset_is_snapshot(dmu_objset_ds(os))) {
+ dmu_objset_rele(os, FTAG);
+ return (SET_ERROR(EINVAL));
+ }
+
+ /*
+ * If there has not been a removal, we're done.
+ */
+ last_removed_txg = spa_get_last_removal_txg(dmu_objset_spa(os));
+ if (last_removed_txg == -1ULL) {
+ dmu_objset_rele(os, FTAG);
+ return (0);
+ }
+
+ /*
+ * If we have remapped since the last removal, we're done.
+ */
+ if (dsl_dir_is_zapified(dd)) {
+ uint64_t last_remap_txg;
+ if (zap_lookup(spa_meta_objset(dmu_objset_spa(os)),
+ dd->dd_object, DD_FIELD_LAST_REMAP_TXG,
+ sizeof (last_remap_txg), 1, &last_remap_txg) == 0 &&
+ last_remap_txg > last_removed_txg) {
+ dmu_objset_rele(os, FTAG);
+ return (0);
+ }
+ }
+
+ dsl_dataset_long_hold(dmu_objset_ds(os), FTAG);
+ dsl_pool_rele(dmu_objset_pool(os), FTAG);
+
+ remap_start_txg = spa_last_synced_txg(dmu_objset_spa(os));
+ error = dmu_objset_remap_indirects_impl(os, last_removed_txg);
+ if (error == 0) {
+ /*
+ * We update the last_remap_txg to be the start txg so that
+ * we can guarantee that every block older than last_remap_txg
+ * that can be remapped has been remapped.
+ */
+ error = dsl_dir_update_last_remap_txg(dd, remap_start_txg);
+ }
+
+ dsl_dataset_long_rele(dmu_objset_ds(os), FTAG);
+ dsl_dataset_rele(dmu_objset_ds(os), FTAG);
+
+ return (error);
+}
+
+int
+dmu_objset_snapshot_one(const char *fsname, const char *snapname)
+{
+ int err;
+ char *longsnap = kmem_asprintf("%s@%s", fsname, snapname);
+ nvlist_t *snaps = fnvlist_alloc();
+
+ fnvlist_add_boolean(snaps, longsnap);
+ strfree(longsnap);
+ err = dsl_dataset_snapshot(snaps, NULL, NULL);
+ fnvlist_free(snaps);
+ return (err);
+}
+
+static void
+dmu_objset_sync_dnodes(multilist_sublist_t *list, dmu_tx_t *tx)
+{
+ dnode_t *dn;
+
+ while ((dn = multilist_sublist_head(list)) != NULL) {
+ ASSERT(dn->dn_object != DMU_META_DNODE_OBJECT);
+ ASSERT(dn->dn_dbuf->db_data_pending);
+ /*
+ * Initialize dn_zio outside dnode_sync() because the
+ * meta-dnode needs to set it outside dnode_sync().
+ */
+ dn->dn_zio = dn->dn_dbuf->db_data_pending->dr_zio;
+ ASSERT(dn->dn_zio);
+
+ ASSERT3U(dn->dn_nlevels, <=, DN_MAX_LEVELS);
+ multilist_sublist_remove(list, dn);
+
+ /*
+ * If we are not doing useraccounting (os_synced_dnodes == NULL)
+ * we are done with this dnode for this txg. Unset dn_dirty_txg
+ * if later txgs aren't dirtying it so that future holders do
+ * not get a stale value. Otherwise, we will do this in
+ * userquota_updates_task() when processing has completely
+ * finished for this txg.
+ */
+ multilist_t *newlist = dn->dn_objset->os_synced_dnodes;
+ if (newlist != NULL) {
+ (void) dnode_add_ref(dn, newlist);
+ multilist_insert(newlist, dn);
+ } else {
+ mutex_enter(&dn->dn_mtx);
+ if (dn->dn_dirty_txg == tx->tx_txg)
+ dn->dn_dirty_txg = 0;
+ mutex_exit(&dn->dn_mtx);
+ }
+
+ dnode_sync(dn, tx);
+ }
+}
+
+/* ARGSUSED */
+static void
+dmu_objset_write_ready(zio_t *zio, arc_buf_t *abuf, void *arg)
+{
+ blkptr_t *bp = zio->io_bp;
+ objset_t *os = arg;
+ dnode_phys_t *dnp = &os->os_phys->os_meta_dnode;
+
+ ASSERT(!BP_IS_EMBEDDED(bp));
+ ASSERT3U(BP_GET_TYPE(bp), ==, DMU_OT_OBJSET);
+ ASSERT0(BP_GET_LEVEL(bp));
+
+ /*
+ * Update rootbp fill count: it should be the number of objects
+ * allocated in the object set (not counting the "special"
+ * objects that are stored in the objset_phys_t -- the meta
+ * dnode and user/group accounting objects).
+ */
+ bp->blk_fill = 0;
+ for (int i = 0; i < dnp->dn_nblkptr; i++)
+ bp->blk_fill += BP_GET_FILL(&dnp->dn_blkptr[i]);
+ if (os->os_dsl_dataset != NULL)
+ rrw_enter(&os->os_dsl_dataset->ds_bp_rwlock, RW_WRITER, FTAG);
+ *os->os_rootbp = *bp;
+ if (os->os_dsl_dataset != NULL)
+ rrw_exit(&os->os_dsl_dataset->ds_bp_rwlock, FTAG);
+}
+
+/* ARGSUSED */
+static void
+dmu_objset_write_done(zio_t *zio, arc_buf_t *abuf, void *arg)
+{
+ blkptr_t *bp = zio->io_bp;
+ blkptr_t *bp_orig = &zio->io_bp_orig;
+ objset_t *os = arg;
+
+ if (zio->io_flags & ZIO_FLAG_IO_REWRITE) {
+ ASSERT(BP_EQUAL(bp, bp_orig));
+ } else {
+ dsl_dataset_t *ds = os->os_dsl_dataset;
+ dmu_tx_t *tx = os->os_synctx;
+
+ (void) dsl_dataset_block_kill(ds, bp_orig, tx, B_TRUE);
+ dsl_dataset_block_born(ds, bp, tx);
+ }
+ kmem_free(bp, sizeof (*bp));
+}
+
+typedef struct sync_dnodes_arg {
+ multilist_t *sda_list;
+ int sda_sublist_idx;
+ multilist_t *sda_newlist;
+ dmu_tx_t *sda_tx;
+} sync_dnodes_arg_t;
+
+static void
+sync_dnodes_task(void *arg)
+{
+ sync_dnodes_arg_t *sda = arg;
+
+ multilist_sublist_t *ms =
+ multilist_sublist_lock(sda->sda_list, sda->sda_sublist_idx);
+
+ dmu_objset_sync_dnodes(ms, sda->sda_tx);
+
+ multilist_sublist_unlock(ms);
+
+ kmem_free(sda, sizeof (*sda));
+}
+
+
+/* called from dsl */
+void
+dmu_objset_sync(objset_t *os, zio_t *pio, dmu_tx_t *tx)
+{
+ int txgoff;
+ zbookmark_phys_t zb;
+ zio_prop_t zp;
+ zio_t *zio;
+ list_t *list;
+ dbuf_dirty_record_t *dr;
+ int num_sublists;
+ multilist_t *ml;
+ blkptr_t *blkptr_copy = kmem_alloc(sizeof (*os->os_rootbp), KM_SLEEP);
+ *blkptr_copy = *os->os_rootbp;
+
+ dprintf_ds(os->os_dsl_dataset, "txg=%llu\n", tx->tx_txg);
+
+ ASSERT(dmu_tx_is_syncing(tx));
+ /* XXX the write_done callback should really give us the tx... */
+ os->os_synctx = tx;
+
+ if (os->os_dsl_dataset == NULL) {
+ /*
+ * This is the MOS. If we have upgraded,
+ * spa_max_replication() could change, so reset
+ * os_copies here.
+ */
+ os->os_copies = spa_max_replication(os->os_spa);
+ }
+
+ /*
+ * Create the root block IO
+ */
+ SET_BOOKMARK(&zb, os->os_dsl_dataset ?
+ os->os_dsl_dataset->ds_object : DMU_META_OBJSET,
+ ZB_ROOT_OBJECT, ZB_ROOT_LEVEL, ZB_ROOT_BLKID);
+ arc_release(os->os_phys_buf, &os->os_phys_buf);
+
+ dmu_write_policy(os, NULL, 0, 0, &zp);
+
+ zio = arc_write(pio, os->os_spa, tx->tx_txg,
+ blkptr_copy, os->os_phys_buf, DMU_OS_IS_L2CACHEABLE(os),
+ &zp, dmu_objset_write_ready, NULL, NULL, dmu_objset_write_done,
+ os, ZIO_PRIORITY_ASYNC_WRITE, ZIO_FLAG_MUSTSUCCEED, &zb);
+
+ /*
+ * Sync special dnodes - the parent IO for the sync is the root block
+ */
+ DMU_META_DNODE(os)->dn_zio = zio;
+ dnode_sync(DMU_META_DNODE(os), tx);
+
+ os->os_phys->os_flags = os->os_flags;
+
+ if (DMU_USERUSED_DNODE(os) &&
+ DMU_USERUSED_DNODE(os)->dn_type != DMU_OT_NONE) {
+ DMU_USERUSED_DNODE(os)->dn_zio = zio;
+ dnode_sync(DMU_USERUSED_DNODE(os), tx);
+ DMU_GROUPUSED_DNODE(os)->dn_zio = zio;
+ dnode_sync(DMU_GROUPUSED_DNODE(os), tx);
+ }
+
+ txgoff = tx->tx_txg & TXG_MASK;
+
+ if (dmu_objset_userused_enabled(os)) {
+ /*
+ * We must create the list here because it uses the
+ * dn_dirty_link[] of this txg. But it may already
+ * exist because we call dsl_dataset_sync() twice per txg.
+ */
+ if (os->os_synced_dnodes == NULL) {
+ os->os_synced_dnodes =
+ multilist_create(sizeof (dnode_t),
+ offsetof(dnode_t, dn_dirty_link[txgoff]),
+ dnode_multilist_index_func);
+ } else {
+ ASSERT3U(os->os_synced_dnodes->ml_offset, ==,
+ offsetof(dnode_t, dn_dirty_link[txgoff]));
+ }
+ }
+
+ ml = os->os_dirty_dnodes[txgoff];
+ num_sublists = multilist_get_num_sublists(ml);
+ for (int i = 0; i < num_sublists; i++) {
+ if (multilist_sublist_is_empty_idx(ml, i))
+ continue;
+ sync_dnodes_arg_t *sda = kmem_alloc(sizeof (*sda), KM_SLEEP);
+ sda->sda_list = ml;
+ sda->sda_sublist_idx = i;
+ sda->sda_tx = tx;
+ (void) taskq_dispatch(dmu_objset_pool(os)->dp_sync_taskq,
+ sync_dnodes_task, sda, 0);
+ /* callback frees sda */
+ }
+ taskq_wait(dmu_objset_pool(os)->dp_sync_taskq);
+
+ list = &DMU_META_DNODE(os)->dn_dirty_records[txgoff];
+ while ((dr = list_head(list)) != NULL) {
+ ASSERT0(dr->dr_dbuf->db_level);
+ list_remove(list, dr);
+ if (dr->dr_zio)
+ zio_nowait(dr->dr_zio);
+ }
+
+ /* Enable dnode backfill if enough objects have been freed. */
+ if (os->os_freed_dnodes >= dmu_rescan_dnode_threshold) {
+ os->os_rescan_dnodes = B_TRUE;
+ os->os_freed_dnodes = 0;
+ }
+
+ /*
+ * Free intent log blocks up to this tx.
+ */
+ zil_sync(os->os_zil, tx);
+ os->os_phys->os_zil_header = os->os_zil_header;
+ zio_nowait(zio);
+}
+
+boolean_t
+dmu_objset_is_dirty(objset_t *os, uint64_t txg)
+{
+ return (!multilist_is_empty(os->os_dirty_dnodes[txg & TXG_MASK]));
+}
+
+static objset_used_cb_t *used_cbs[DMU_OST_NUMTYPES];
+
+void
+dmu_objset_register_type(dmu_objset_type_t ost, objset_used_cb_t *cb)
+{
+ used_cbs[ost] = cb;
+}
+
+boolean_t
+dmu_objset_userused_enabled(objset_t *os)
+{
+ return (spa_version(os->os_spa) >= SPA_VERSION_USERSPACE &&
+ used_cbs[os->os_phys->os_type] != NULL &&
+ DMU_USERUSED_DNODE(os) != NULL);
+}
+
+typedef struct userquota_node {
+ uint64_t uqn_id;
+ int64_t uqn_delta;
+ avl_node_t uqn_node;
+} userquota_node_t;
+
+typedef struct userquota_cache {
+ avl_tree_t uqc_user_deltas;
+ avl_tree_t uqc_group_deltas;
+} userquota_cache_t;
+
+static int
+userquota_compare(const void *l, const void *r)
+{
+ const userquota_node_t *luqn = l;
+ const userquota_node_t *ruqn = r;
+
+ if (luqn->uqn_id < ruqn->uqn_id)
+ return (-1);
+ if (luqn->uqn_id > ruqn->uqn_id)
+ return (1);
+ return (0);
+}
+
+static void
+do_userquota_cacheflush(objset_t *os, userquota_cache_t *cache, dmu_tx_t *tx)
+{
+ void *cookie;
+ userquota_node_t *uqn;
+
+ ASSERT(dmu_tx_is_syncing(tx));
+
+ cookie = NULL;
+ while ((uqn = avl_destroy_nodes(&cache->uqc_user_deltas,
+ &cookie)) != NULL) {
+ /*
+ * os_userused_lock protects against concurrent calls to
+ * zap_increment_int(). It's needed because zap_increment_int()
+ * is not thread-safe (i.e. not atomic).
+ */
+ mutex_enter(&os->os_userused_lock);
+ VERIFY0(zap_increment_int(os, DMU_USERUSED_OBJECT,
+ uqn->uqn_id, uqn->uqn_delta, tx));
+ mutex_exit(&os->os_userused_lock);
+ kmem_free(uqn, sizeof (*uqn));
+ }
+ avl_destroy(&cache->uqc_user_deltas);
+
+ cookie = NULL;
+ while ((uqn = avl_destroy_nodes(&cache->uqc_group_deltas,
+ &cookie)) != NULL) {
+ mutex_enter(&os->os_userused_lock);
+ VERIFY0(zap_increment_int(os, DMU_GROUPUSED_OBJECT,
+ uqn->uqn_id, uqn->uqn_delta, tx));
+ mutex_exit(&os->os_userused_lock);
+ kmem_free(uqn, sizeof (*uqn));
+ }
+ avl_destroy(&cache->uqc_group_deltas);
+}
+
+static void
+userquota_update_cache(avl_tree_t *avl, uint64_t id, int64_t delta)
+{
+ userquota_node_t search = { .uqn_id = id };
+ avl_index_t idx;
+
+ userquota_node_t *uqn = avl_find(avl, &search, &idx);
+ if (uqn == NULL) {
+ uqn = kmem_zalloc(sizeof (*uqn), KM_SLEEP);
+ uqn->uqn_id = id;
+ avl_insert(avl, uqn, idx);
+ }
+ uqn->uqn_delta += delta;
+}
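+
+/*
+ * For example (hypothetical ids and deltas), the three calls
+ *
+ *     userquota_update_cache(avl, 1001, 512);
+ *     userquota_update_cache(avl, 1001, -128);
+ *     userquota_update_cache(avl, 42, 4096);
+ *
+ * leave two nodes in the tree, {1001, +384} and {42, +4096}; the net
+ * deltas are applied to the ZAP later by do_userquota_cacheflush().
+ */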
+
+static void
+do_userquota_update(userquota_cache_t *cache, uint64_t used, uint64_t flags,
+ uint64_t user, uint64_t group, boolean_t subtract)
+{
+ if ((flags & DNODE_FLAG_USERUSED_ACCOUNTED)) {
+ int64_t delta = DNODE_MIN_SIZE + used;
+ if (subtract)
+ delta = -delta;
+
+ userquota_update_cache(&cache->uqc_user_deltas, user, delta);
+ userquota_update_cache(&cache->uqc_group_deltas, group, delta);
+ }
+}
+
+typedef struct userquota_updates_arg {
+ objset_t *uua_os;
+ int uua_sublist_idx;
+ dmu_tx_t *uua_tx;
+} userquota_updates_arg_t;
+
+static void
+userquota_updates_task(void *arg)
+{
+ userquota_updates_arg_t *uua = arg;
+ objset_t *os = uua->uua_os;
+ dmu_tx_t *tx = uua->uua_tx;
+ dnode_t *dn;
+ userquota_cache_t cache = { 0 };
+
+ multilist_sublist_t *list =
+ multilist_sublist_lock(os->os_synced_dnodes, uua->uua_sublist_idx);
+
+ ASSERT(multilist_sublist_head(list) == NULL ||
+ dmu_objset_userused_enabled(os));
+ avl_create(&cache.uqc_user_deltas, userquota_compare,
+ sizeof (userquota_node_t), offsetof(userquota_node_t, uqn_node));
+ avl_create(&cache.uqc_group_deltas, userquota_compare,
+ sizeof (userquota_node_t), offsetof(userquota_node_t, uqn_node));
+
+ while ((dn = multilist_sublist_head(list)) != NULL) {
+ int flags;
+ ASSERT(!DMU_OBJECT_IS_SPECIAL(dn->dn_object));
+ ASSERT(dn->dn_phys->dn_type == DMU_OT_NONE ||
+ dn->dn_phys->dn_flags &
+ DNODE_FLAG_USERUSED_ACCOUNTED);
+
+ flags = dn->dn_id_flags;
+ ASSERT(flags);
+ if (flags & DN_ID_OLD_EXIST) {
+ do_userquota_update(&cache,
+ dn->dn_oldused, dn->dn_oldflags,
+ dn->dn_olduid, dn->dn_oldgid, B_TRUE);
+ }
+ if (flags & DN_ID_NEW_EXIST) {
+ do_userquota_update(&cache,
+ DN_USED_BYTES(dn->dn_phys),
+ dn->dn_phys->dn_flags, dn->dn_newuid,
+ dn->dn_newgid, B_FALSE);
+ }
+
+ mutex_enter(&dn->dn_mtx);
+ dn->dn_oldused = 0;
+ dn->dn_oldflags = 0;
+ if (dn->dn_id_flags & DN_ID_NEW_EXIST) {
+ dn->dn_olduid = dn->dn_newuid;
+ dn->dn_oldgid = dn->dn_newgid;
+ dn->dn_id_flags |= DN_ID_OLD_EXIST;
+ if (dn->dn_bonuslen == 0)
+ dn->dn_id_flags |= DN_ID_CHKED_SPILL;
+ else
+ dn->dn_id_flags |= DN_ID_CHKED_BONUS;
+ }
+ dn->dn_id_flags &= ~(DN_ID_NEW_EXIST);
+ if (dn->dn_dirty_txg == spa_syncing_txg(os->os_spa))
+ dn->dn_dirty_txg = 0;
+ mutex_exit(&dn->dn_mtx);
+
+ multilist_sublist_remove(list, dn);
+ dnode_rele(dn, os->os_synced_dnodes);
+ }
+ do_userquota_cacheflush(os, &cache, tx);
+ multilist_sublist_unlock(list);
+ kmem_free(uua, sizeof (*uua));
+}
+
+void
+dmu_objset_do_userquota_updates(objset_t *os, dmu_tx_t *tx)
+{
+ int num_sublists;
+
+ if (!dmu_objset_userused_enabled(os))
+ return;
+
+ /* Allocate the user/groupused objects if necessary. */
+ if (DMU_USERUSED_DNODE(os)->dn_type == DMU_OT_NONE) {
+ VERIFY0(zap_create_claim(os,
+ DMU_USERUSED_OBJECT,
+ DMU_OT_USERGROUP_USED, DMU_OT_NONE, 0, tx));
+ VERIFY0(zap_create_claim(os,
+ DMU_GROUPUSED_OBJECT,
+ DMU_OT_USERGROUP_USED, DMU_OT_NONE, 0, tx));
+ }
+
+ num_sublists = multilist_get_num_sublists(os->os_synced_dnodes);
+ for (int i = 0; i < num_sublists; i++) {
+ if (multilist_sublist_is_empty_idx(os->os_synced_dnodes, i))
+ continue;
+ userquota_updates_arg_t *uua =
+ kmem_alloc(sizeof (*uua), KM_SLEEP);
+ uua->uua_os = os;
+ uua->uua_sublist_idx = i;
+ uua->uua_tx = tx;
+ /* note: caller does taskq_wait() */
+ (void) taskq_dispatch(dmu_objset_pool(os)->dp_sync_taskq,
+ userquota_updates_task, uua, 0);
+ /* callback frees uua */
+ }
+}
+
+/*
+ * Returns a pointer to data to find uid/gid from
+ *
+ * If a dirty record for the transaction group that is syncing can't
+ * be found, then NULL is returned. In the NULL case it is assumed
+ * the uid/gid aren't changing.
+ */
+static void *
+dmu_objset_userquota_find_data(dmu_buf_impl_t *db, dmu_tx_t *tx)
+{
+ dbuf_dirty_record_t *dr, **drp;
+ void *data;
+
+ if (db->db_dirtycnt == 0)
+ return (db->db.db_data); /* Nothing is changing */
+
+ for (drp = &db->db_last_dirty; (dr = *drp) != NULL; drp = &dr->dr_next)
+ if (dr->dr_txg == tx->tx_txg)
+ break;
+
+ if (dr == NULL) {
+ data = NULL;
+ } else {
+ dnode_t *dn;
+
+ DB_DNODE_ENTER(dr->dr_dbuf);
+ dn = DB_DNODE(dr->dr_dbuf);
+
+ if (dn->dn_bonuslen == 0 &&
+ dr->dr_dbuf->db_blkid == DMU_SPILL_BLKID)
+ data = dr->dt.dl.dr_data->b_data;
+ else
+ data = dr->dt.dl.dr_data;
+
+ DB_DNODE_EXIT(dr->dr_dbuf);
+ }
+
+ return (data);
+}
+
+void
+dmu_objset_userquota_get_ids(dnode_t *dn, boolean_t before, dmu_tx_t *tx)
+{
+ objset_t *os = dn->dn_objset;
+ void *data = NULL;
+ dmu_buf_impl_t *db = NULL;
+ uint64_t *user = NULL;
+ uint64_t *group = NULL;
+ int flags = dn->dn_id_flags;
+ int error;
+ boolean_t have_spill = B_FALSE;
+
+ if (!dmu_objset_userused_enabled(dn->dn_objset))
+ return;
+
+ if (before && (flags & (DN_ID_CHKED_BONUS|DN_ID_OLD_EXIST|
+ DN_ID_CHKED_SPILL)))
+ return;
+
+ if (before && dn->dn_bonuslen != 0)
+ data = DN_BONUS(dn->dn_phys);
+ else if (!before && dn->dn_bonuslen != 0) {
+ if (dn->dn_bonus) {
+ db = dn->dn_bonus;
+ mutex_enter(&db->db_mtx);
+ data = dmu_objset_userquota_find_data(db, tx);
+ } else {
+ data = DN_BONUS(dn->dn_phys);
+ }
+ } else if (dn->dn_bonuslen == 0 && dn->dn_bonustype == DMU_OT_SA) {
+ int rf = 0;
+
+ if (RW_WRITE_HELD(&dn->dn_struct_rwlock))
+ rf |= DB_RF_HAVESTRUCT;
+ error = dmu_spill_hold_by_dnode(dn,
+ rf | DB_RF_MUST_SUCCEED,
+ FTAG, (dmu_buf_t **)&db);
+ ASSERT(error == 0);
+ mutex_enter(&db->db_mtx);
+ data = (before) ? db->db.db_data :
+ dmu_objset_userquota_find_data(db, tx);
+ have_spill = B_TRUE;
+ } else {
+ mutex_enter(&dn->dn_mtx);
+ dn->dn_id_flags |= DN_ID_CHKED_BONUS;
+ mutex_exit(&dn->dn_mtx);
+ return;
+ }
+
+ if (before) {
+ ASSERT(data);
+ user = &dn->dn_olduid;
+ group = &dn->dn_oldgid;
+ } else if (data) {
+ user = &dn->dn_newuid;
+ group = &dn->dn_newgid;
+ }
+
+ /*
+ * Must always call the callback in case the object
+ * type has changed and that type isn't an object type to track.
+ */
+ error = used_cbs[os->os_phys->os_type](dn->dn_bonustype, data,
+ user, group);
+
+ /*
+ * Preserve existing uid/gid when the callback can't determine
+ * what the new uid/gid are and the callback returned EEXIST.
+ * The EEXIST error tells us to just use the existing uid/gid.
+ * If we don't know what the old values are then just assign
+ * them to 0, since that is a new file being created.
+ */
+ if (!before && data == NULL && error == EEXIST) {
+ if (flags & DN_ID_OLD_EXIST) {
+ dn->dn_newuid = dn->dn_olduid;
+ dn->dn_newgid = dn->dn_oldgid;
+ } else {
+ dn->dn_newuid = 0;
+ dn->dn_newgid = 0;
+ }
+ error = 0;
+ }
+
+ if (db)
+ mutex_exit(&db->db_mtx);
+
+ mutex_enter(&dn->dn_mtx);
+ if (error == 0 && before)
+ dn->dn_id_flags |= DN_ID_OLD_EXIST;
+ if (error == 0 && !before)
+ dn->dn_id_flags |= DN_ID_NEW_EXIST;
+
+ if (have_spill) {
+ dn->dn_id_flags |= DN_ID_CHKED_SPILL;
+ } else {
+ dn->dn_id_flags |= DN_ID_CHKED_BONUS;
+ }
+ mutex_exit(&dn->dn_mtx);
+ if (have_spill)
+ dmu_buf_rele((dmu_buf_t *)db, FTAG);
+}
+
+boolean_t
+dmu_objset_userspace_present(objset_t *os)
+{
+ return (os->os_phys->os_flags &
+ OBJSET_FLAG_USERACCOUNTING_COMPLETE);
+}
+
+int
+dmu_objset_userspace_upgrade(objset_t *os)
+{
+ uint64_t obj;
+ int err = 0;
+
+ if (dmu_objset_userspace_present(os))
+ return (0);
+ if (!dmu_objset_userused_enabled(os))
+ return (SET_ERROR(ENOTSUP));
+ if (dmu_objset_is_snapshot(os))
+ return (SET_ERROR(EINVAL));
+
+ /*
+ * We simply need to mark every object dirty, so that it will be
+ * synced out and thus accounted. If this is called
+ * concurrently, or if we already did some work before crashing,
+ * that's fine, since we track each object's accounted state
+ * independently.
+ */
+
+ for (obj = 0; err == 0; err = dmu_object_next(os, &obj, FALSE, 0)) {
+ dmu_tx_t *tx;
+ dmu_buf_t *db;
+ int objerr;
+
+ if (issig(JUSTLOOKING) && issig(FORREAL))
+ return (SET_ERROR(EINTR));
+
+ objerr = dmu_bonus_hold(os, obj, FTAG, &db);
+ if (objerr != 0)
+ continue;
+ tx = dmu_tx_create(os);
+ dmu_tx_hold_bonus(tx, obj);
+ objerr = dmu_tx_assign(tx, TXG_WAIT);
+ if (objerr != 0) {
+ dmu_tx_abort(tx);
+ continue;
+ }
+ dmu_buf_will_dirty(db, tx);
+ dmu_buf_rele(db, FTAG);
+ dmu_tx_commit(tx);
+ }
+
+ os->os_flags |= OBJSET_FLAG_USERACCOUNTING_COMPLETE;
+ txg_wait_synced(dmu_objset_pool(os), 0);
+ return (0);
+}
+
+void
+dmu_objset_space(objset_t *os, uint64_t *refdbytesp, uint64_t *availbytesp,
+ uint64_t *usedobjsp, uint64_t *availobjsp)
+{
+ dsl_dataset_space(os->os_dsl_dataset, refdbytesp, availbytesp,
+ usedobjsp, availobjsp);
+}
+
+uint64_t
+dmu_objset_fsid_guid(objset_t *os)
+{
+ return (dsl_dataset_fsid_guid(os->os_dsl_dataset));
+}
+
+void
+dmu_objset_fast_stat(objset_t *os, dmu_objset_stats_t *stat)
+{
+ stat->dds_type = os->os_phys->os_type;
+ if (os->os_dsl_dataset)
+ dsl_dataset_fast_stat(os->os_dsl_dataset, stat);
+}
+
+void
+dmu_objset_stats(objset_t *os, nvlist_t *nv)
+{
+ ASSERT(os->os_dsl_dataset ||
+ os->os_phys->os_type == DMU_OST_META);
+
+ if (os->os_dsl_dataset != NULL)
+ dsl_dataset_stats(os->os_dsl_dataset, nv);
+
+ dsl_prop_nvlist_add_uint64(nv, ZFS_PROP_TYPE,
+ os->os_phys->os_type);
+ dsl_prop_nvlist_add_uint64(nv, ZFS_PROP_USERACCOUNTING,
+ dmu_objset_userspace_present(os));
+}
+
+int
+dmu_objset_is_snapshot(objset_t *os)
+{
+ if (os->os_dsl_dataset != NULL)
+ return (os->os_dsl_dataset->ds_is_snapshot);
+ else
+ return (B_FALSE);
+}
+
+int
+dmu_snapshot_realname(objset_t *os, char *name, char *real, int maxlen,
+ boolean_t *conflict)
+{
+ dsl_dataset_t *ds = os->os_dsl_dataset;
+ uint64_t ignored;
+
+ if (dsl_dataset_phys(ds)->ds_snapnames_zapobj == 0)
+ return (SET_ERROR(ENOENT));
+
+ return (zap_lookup_norm(ds->ds_dir->dd_pool->dp_meta_objset,
+ dsl_dataset_phys(ds)->ds_snapnames_zapobj, name, 8, 1, &ignored,
+ MT_NORMALIZE, real, maxlen, conflict));
+}
+
+int
+dmu_snapshot_list_next(objset_t *os, int namelen, char *name,
+ uint64_t *idp, uint64_t *offp, boolean_t *case_conflict)
+{
+ dsl_dataset_t *ds = os->os_dsl_dataset;
+ zap_cursor_t cursor;
+ zap_attribute_t attr;
+
+ ASSERT(dsl_pool_config_held(dmu_objset_pool(os)));
+
+ if (dsl_dataset_phys(ds)->ds_snapnames_zapobj == 0)
+ return (SET_ERROR(ENOENT));
+
+ zap_cursor_init_serialized(&cursor,
+ ds->ds_dir->dd_pool->dp_meta_objset,
+ dsl_dataset_phys(ds)->ds_snapnames_zapobj, *offp);
+
+ if (zap_cursor_retrieve(&cursor, &attr) != 0) {
+ zap_cursor_fini(&cursor);
+ return (SET_ERROR(ENOENT));
+ }
+
+ if (strlen(attr.za_name) + 1 > namelen) {
+ zap_cursor_fini(&cursor);
+ return (SET_ERROR(ENAMETOOLONG));
+ }
+
+ (void) strcpy(name, attr.za_name);
+ if (idp)
+ *idp = attr.za_first_integer;
+ if (case_conflict)
+ *case_conflict = attr.za_normalization_conflict;
+ zap_cursor_advance(&cursor);
+ *offp = zap_cursor_serialize(&cursor);
+ zap_cursor_fini(&cursor);
+
+ return (0);
+}
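+
+/*
+ * Illustrative iteration (hypothetical buffer; the pool config lock
+ * must be held, per the ASSERT above): start with a zero cookie and
+ * feed the updated offset back in until ENOENT marks the end of the
+ * snapshot list:
+ *
+ *     uint64_t cookie = 0;
+ *     char buf[ZFS_MAX_DATASET_NAME_LEN];
+ *
+ *     while (dmu_snapshot_list_next(os, sizeof (buf), buf, NULL,
+ *         &cookie, NULL) == 0)
+ *             ... use buf ...
+ */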
+
+int
+dmu_dir_list_next(objset_t *os, int namelen, char *name,
+ uint64_t *idp, uint64_t *offp)
+{
+ dsl_dir_t *dd = os->os_dsl_dataset->ds_dir;
+ zap_cursor_t cursor;
+ zap_attribute_t attr;
+
+ /* there is no next dir on a snapshot! */
+ if (os->os_dsl_dataset->ds_object !=
+ dsl_dir_phys(dd)->dd_head_dataset_obj)
+ return (SET_ERROR(ENOENT));
+
+ zap_cursor_init_serialized(&cursor,
+ dd->dd_pool->dp_meta_objset,
+ dsl_dir_phys(dd)->dd_child_dir_zapobj, *offp);
+
+ if (zap_cursor_retrieve(&cursor, &attr) != 0) {
+ zap_cursor_fini(&cursor);
+ return (SET_ERROR(ENOENT));
+ }
+
+ if (strlen(attr.za_name) + 1 > namelen) {
+ zap_cursor_fini(&cursor);
+ return (SET_ERROR(ENAMETOOLONG));
+ }
+
+ (void) strcpy(name, attr.za_name);
+ if (idp)
+ *idp = attr.za_first_integer;
+ zap_cursor_advance(&cursor);
+ *offp = zap_cursor_serialize(&cursor);
+ zap_cursor_fini(&cursor);
+
+ return (0);
+}
+
+typedef struct dmu_objset_find_ctx {
+ taskq_t *dc_tq;
+ dsl_pool_t *dc_dp;
+ uint64_t dc_ddobj;
+ char *dc_ddname; /* last component of ddobj's name */
+ int (*dc_func)(dsl_pool_t *, dsl_dataset_t *, void *);
+ void *dc_arg;
+ int dc_flags;
+ kmutex_t *dc_error_lock;
+ int *dc_error;
+} dmu_objset_find_ctx_t;
+
+static void
+dmu_objset_find_dp_impl(dmu_objset_find_ctx_t *dcp)
+{
+ dsl_pool_t *dp = dcp->dc_dp;
+ dsl_dir_t *dd;
+ dsl_dataset_t *ds;
+ zap_cursor_t zc;
+ zap_attribute_t *attr;
+ uint64_t thisobj;
+ int err = 0;
+
+ /* don't process if there already was an error */
+ if (*dcp->dc_error != 0)
+ goto out;
+
+ /*
+ * Note: passing the name (dc_ddname) here is optional, but it
+ * improves performance because we don't need to call
+ * zap_value_search() to determine the name.
+ */
+ err = dsl_dir_hold_obj(dp, dcp->dc_ddobj, dcp->dc_ddname, FTAG, &dd);
+ if (err != 0)
+ goto out;
+
+ /* Don't visit hidden ($MOS & $ORIGIN) objsets. */
+ if (dd->dd_myname[0] == '$') {
+ dsl_dir_rele(dd, FTAG);
+ goto out;
+ }
+
+ thisobj = dsl_dir_phys(dd)->dd_head_dataset_obj;
+ attr = kmem_alloc(sizeof (zap_attribute_t), KM_SLEEP);
+
+ /*
+ * Iterate over all children.
+ */
+ if (dcp->dc_flags & DS_FIND_CHILDREN) {
+ for (zap_cursor_init(&zc, dp->dp_meta_objset,
+ dsl_dir_phys(dd)->dd_child_dir_zapobj);
+ zap_cursor_retrieve(&zc, attr) == 0;
+ (void) zap_cursor_advance(&zc)) {
+ ASSERT3U(attr->za_integer_length, ==,
+ sizeof (uint64_t));
+ ASSERT3U(attr->za_num_integers, ==, 1);
+
+ dmu_objset_find_ctx_t *child_dcp =
+ kmem_alloc(sizeof (*child_dcp), KM_SLEEP);
+ *child_dcp = *dcp;
+ child_dcp->dc_ddobj = attr->za_first_integer;
+ child_dcp->dc_ddname = spa_strdup(attr->za_name);
+ if (dcp->dc_tq != NULL)
+ (void) taskq_dispatch(dcp->dc_tq,
+ dmu_objset_find_dp_cb, child_dcp, TQ_SLEEP);
+ else
+ dmu_objset_find_dp_impl(child_dcp);
+ }
+ zap_cursor_fini(&zc);
+ }
+
+ /*
+ * Iterate over all snapshots.
+ */
+ if (dcp->dc_flags & DS_FIND_SNAPSHOTS) {
+ dsl_dataset_t *ds;
+ err = dsl_dataset_hold_obj(dp, thisobj, FTAG, &ds);
+
+ if (err == 0) {
+ uint64_t snapobj;
+
+ snapobj = dsl_dataset_phys(ds)->ds_snapnames_zapobj;
+ dsl_dataset_rele(ds, FTAG);
+
+ for (zap_cursor_init(&zc, dp->dp_meta_objset, snapobj);
+ zap_cursor_retrieve(&zc, attr) == 0;
+ (void) zap_cursor_advance(&zc)) {
+ ASSERT3U(attr->za_integer_length, ==,
+ sizeof (uint64_t));
+ ASSERT3U(attr->za_num_integers, ==, 1);
+
+ err = dsl_dataset_hold_obj(dp,
+ attr->za_first_integer, FTAG, &ds);
+ if (err != 0)
+ break;
+ err = dcp->dc_func(dp, ds, dcp->dc_arg);
+ dsl_dataset_rele(ds, FTAG);
+ if (err != 0)
+ break;
+ }
+ zap_cursor_fini(&zc);
+ }
+ }
+
+ kmem_free(attr, sizeof (zap_attribute_t));
+
+ if (err != 0) {
+ dsl_dir_rele(dd, FTAG);
+ goto out;
+ }
+
+ /*
+ * Apply to self.
+ */
+ err = dsl_dataset_hold_obj(dp, thisobj, FTAG, &ds);
+
+ /*
+ * Note: we hold the dir while calling dsl_dataset_hold_obj() so
+ * that the dir will remain cached, and we won't have to re-instantiate
+ * it (which could be expensive due to finding its name via
+ * zap_value_search()).
+ */
+ dsl_dir_rele(dd, FTAG);
+ if (err != 0)
+ goto out;
+ err = dcp->dc_func(dp, ds, dcp->dc_arg);
+ dsl_dataset_rele(ds, FTAG);
+
+out:
+ if (err != 0) {
+ mutex_enter(dcp->dc_error_lock);
+ /* only keep first error */
+ if (*dcp->dc_error == 0)
+ *dcp->dc_error = err;
+ mutex_exit(dcp->dc_error_lock);
+ }
+
+ if (dcp->dc_ddname != NULL)
+ spa_strfree(dcp->dc_ddname);
+ kmem_free(dcp, sizeof (*dcp));
+}
+
+static void
+dmu_objset_find_dp_cb(void *arg)
+{
+ dmu_objset_find_ctx_t *dcp = arg;
+ dsl_pool_t *dp = dcp->dc_dp;
+
+ /*
+ * We need to get a pool_config_lock here, as there are several
+ * assert(pool_config_held) down the stack. Getting a lock via
+ * dsl_pool_config_enter is risky, as it might be stalled by a
+ * pending writer. This would deadlock, as the write lock can
+ * only be granted when our parent thread gives up the lock.
+ * The _prio interface gives us priority over a pending writer.
+ */
+ dsl_pool_config_enter_prio(dp, FTAG);
+
+ dmu_objset_find_dp_impl(dcp);
+
+ dsl_pool_config_exit(dp, FTAG);
+}
+
+/*
+ * Find objsets under and including ddobj, call func(ds) on each.
+ * The order for the enumeration is completely undefined.
+ * func is called with dsl_pool_config held.
+ */
+int
+dmu_objset_find_dp(dsl_pool_t *dp, uint64_t ddobj,
+ int func(dsl_pool_t *, dsl_dataset_t *, void *), void *arg, int flags)
+{
+ int error = 0;
+ taskq_t *tq = NULL;
+ int ntasks;
+ dmu_objset_find_ctx_t *dcp;
+ kmutex_t err_lock;
+
+ mutex_init(&err_lock, NULL, MUTEX_DEFAULT, NULL);
+ dcp = kmem_alloc(sizeof (*dcp), KM_SLEEP);
+ dcp->dc_tq = NULL;
+ dcp->dc_dp = dp;
+ dcp->dc_ddobj = ddobj;
+ dcp->dc_ddname = NULL;
+ dcp->dc_func = func;
+ dcp->dc_arg = arg;
+ dcp->dc_flags = flags;
+ dcp->dc_error_lock = &err_lock;
+ dcp->dc_error = &error;
+
+ if ((flags & DS_FIND_SERIALIZE) || dsl_pool_config_held_writer(dp)) {
+ /*
+ * In case a write lock is held we can't make use of
+ * parallelism, as down the stack of the worker threads
+ * the lock is asserted via dsl_pool_config_held.
+ * In case of a read lock this is solved by getting a read
+ * lock in each worker thread, which isn't possible in case
+ * of a writer lock. So we fall back to the synchronous path
+ * here.
+ * In the future it might be possible to get some magic into
+ * dsl_pool_config_held in a way that it returns true for
+ * the worker threads so that a single lock held from this
+ * thread suffices. For now, stay single threaded.
+ */
+ dmu_objset_find_dp_impl(dcp);
+ mutex_destroy(&err_lock);
+
+ return (error);
+ }
+
+ ntasks = dmu_find_threads;
+ if (ntasks == 0)
+ ntasks = vdev_count_leaves(dp->dp_spa) * 4;
+ tq = taskq_create("dmu_objset_find", ntasks, minclsyspri, ntasks,
+ INT_MAX, 0);
+ if (tq == NULL) {
+ kmem_free(dcp, sizeof (*dcp));
+ mutex_destroy(&err_lock);
+
+ return (SET_ERROR(ENOMEM));
+ }
+ dcp->dc_tq = tq;
+
+ /* dcp will be freed by task */
+ (void) taskq_dispatch(tq, dmu_objset_find_dp_cb, dcp, TQ_SLEEP);
+
+ /*
+ * PORTING: this code relies on the property of taskq_wait to wait
+ * until no more tasks are queued and no more tasks are active. As
+ * we always queue new tasks from within other tasks, taskq_wait
+ * reliably waits for the full recursion to finish, even though we
+ * enqueue new tasks after taskq_wait has been called.
+ * On platforms other than illumos, taskq_wait may not have this
+ * property.
+ */
+ taskq_wait(tq);
+ taskq_destroy(tq);
+ mutex_destroy(&err_lock);
+
+ return (error);
+}
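+
+/*
+ * Illustrative use (hypothetical callback): visit every dataset and
+ * snapshot below ddobj. The callback runs with the pool config held
+ * and may execute concurrently on several taskq threads:
+ *
+ *     static int
+ *     visit_cb(dsl_pool_t *dp, dsl_dataset_t *ds, void *arg)
+ *     {
+ *             ... inspect ds ...
+ *             return (0);
+ *     }
+ *
+ *     err = dmu_objset_find_dp(dp, ddobj, visit_cb, arg,
+ *         DS_FIND_CHILDREN | DS_FIND_SNAPSHOTS);
+ */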
+
+/*
+ * Find all objsets under name, and for each, call 'func(child_name, arg)'.
+ * The dp_config_rwlock must not be held when this is called, and it
+ * will not be held when the callback is called.
+ * Therefore this function should only be used when the pool is not changing
+ * (e.g. in syncing context), or the callback can deal with the possible races.
+ */
+static int
+dmu_objset_find_impl(spa_t *spa, const char *name,
+ int func(const char *, void *), void *arg, int flags)
+{
+ dsl_dir_t *dd;
+ dsl_pool_t *dp = spa_get_dsl(spa);
+ dsl_dataset_t *ds;
+ zap_cursor_t zc;
+ zap_attribute_t *attr;
+ char *child;
+ uint64_t thisobj;
+ int err;
+
+ dsl_pool_config_enter(dp, FTAG);
+
+ err = dsl_dir_hold(dp, name, FTAG, &dd, NULL);
+ if (err != 0) {
+ dsl_pool_config_exit(dp, FTAG);
+ return (err);
+ }
+
+ /* Don't visit hidden ($MOS & $ORIGIN) objsets. */
+ if (dd->dd_myname[0] == '$') {
+ dsl_dir_rele(dd, FTAG);
+ dsl_pool_config_exit(dp, FTAG);
+ return (0);
+ }
+
+ thisobj = dsl_dir_phys(dd)->dd_head_dataset_obj;
+ attr = kmem_alloc(sizeof (zap_attribute_t), KM_SLEEP);
+
+ /*
+ * Iterate over all children.
+ */
+ if (flags & DS_FIND_CHILDREN) {
+ for (zap_cursor_init(&zc, dp->dp_meta_objset,
+ dsl_dir_phys(dd)->dd_child_dir_zapobj);
+ zap_cursor_retrieve(&zc, attr) == 0;
+ (void) zap_cursor_advance(&zc)) {
+ ASSERT3U(attr->za_integer_length, ==,
+ sizeof (uint64_t));
+ ASSERT3U(attr->za_num_integers, ==, 1);
+
+ child = kmem_asprintf("%s/%s", name, attr->za_name);
+ dsl_pool_config_exit(dp, FTAG);
+ err = dmu_objset_find_impl(spa, child,
+ func, arg, flags);
+ dsl_pool_config_enter(dp, FTAG);
+ strfree(child);
+ if (err != 0)
+ break;
+ }
+ zap_cursor_fini(&zc);
+
+ if (err != 0) {
+ dsl_dir_rele(dd, FTAG);
+ dsl_pool_config_exit(dp, FTAG);
+ kmem_free(attr, sizeof (zap_attribute_t));
+ return (err);
+ }
+ }
+
+ /*
+ * Iterate over all snapshots.
+ */
+ if (flags & DS_FIND_SNAPSHOTS) {
+ err = dsl_dataset_hold_obj(dp, thisobj, FTAG, &ds);
+
+ if (err == 0) {
+ uint64_t snapobj;
+
+ snapobj = dsl_dataset_phys(ds)->ds_snapnames_zapobj;
+ dsl_dataset_rele(ds, FTAG);
+
+ for (zap_cursor_init(&zc, dp->dp_meta_objset, snapobj);
+ zap_cursor_retrieve(&zc, attr) == 0;
+ (void) zap_cursor_advance(&zc)) {
+ ASSERT3U(attr->za_integer_length, ==,
+ sizeof (uint64_t));
+ ASSERT3U(attr->za_num_integers, ==, 1);
+
+ child = kmem_asprintf("%s@%s",
+ name, attr->za_name);
+ dsl_pool_config_exit(dp, FTAG);
+ err = func(child, arg);
+ dsl_pool_config_enter(dp, FTAG);
+ strfree(child);
+ if (err != 0)
+ break;
+ }
+ zap_cursor_fini(&zc);
+ }
+ }
+
+ dsl_dir_rele(dd, FTAG);
+ kmem_free(attr, sizeof (zap_attribute_t));
+ dsl_pool_config_exit(dp, FTAG);
+
+ if (err != 0)
+ return (err);
+
+ /* Apply to self. */
+ return (func(name, arg));
+}
+
+/*
+ * See comment above dmu_objset_find_impl().
+ */
+int
+dmu_objset_find(char *name, int func(const char *, void *), void *arg,
+ int flags)
+{
+ spa_t *spa;
+ int error;
+
+ error = spa_open(name, &spa, FTAG);
+ if (error != 0)
+ return (error);
+ error = dmu_objset_find_impl(spa, name, func, arg, flags);
+ spa_close(spa, FTAG);
+ return (error);
+}
+
+void
+dmu_objset_set_user(objset_t *os, void *user_ptr)
+{
+ ASSERT(MUTEX_HELD(&os->os_user_ptr_lock));
+ os->os_user_ptr = user_ptr;
+}
+
+void *
+dmu_objset_get_user(objset_t *os)
+{
+ ASSERT(MUTEX_HELD(&os->os_user_ptr_lock));
+ return (os->os_user_ptr);
+}
+
+/*
+ * Determine name of filesystem, given name of snapshot.
+ * buf must be at least ZFS_MAX_DATASET_NAME_LEN bytes.
+ */
+int
+dmu_fsname(const char *snapname, char *buf)
+{
+ char *atp = strchr(snapname, '@');
+ if (atp == NULL)
+ return (SET_ERROR(EINVAL));
+ if (atp - snapname >= ZFS_MAX_DATASET_NAME_LEN)
+ return (SET_ERROR(ENAMETOOLONG));
+ (void) strlcpy(buf, snapname, atp - snapname + 1);
+ return (0);
+}
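+
+/*
+ * For example (hypothetical name), dmu_fsname("tank/fs@monday", buf)
+ * stores "tank/fs" in buf and returns 0; a name containing no '@'
+ * fails with EINVAL.
+ */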
+
+/*
+ * Call when we think we're going to write/free space in open context to track
+ * the amount of dirty data in the open txg, which is also the amount
+ * of memory that cannot be evicted until this txg syncs.
+ */
+void
+dmu_objset_willuse_space(objset_t *os, int64_t space, dmu_tx_t *tx)
+{
+ dsl_dataset_t *ds = os->os_dsl_dataset;
+ int64_t aspace = spa_get_worst_case_asize(os->os_spa, space);
+
+ if (ds != NULL) {
+ dsl_dir_willuse_space(ds->ds_dir, aspace, tx);
+ dsl_pool_dirty_space(dmu_tx_pool(tx), space, tx);
+ }
+}
diff --git a/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/dmu_send.c b/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/dmu_send.c
new file mode 100644
index 000000000000..f4dcc4bcb976
--- /dev/null
+++ b/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/dmu_send.c
@@ -0,0 +1,3550 @@
+/*
+ * CDDL HEADER START
+ *
+ * The contents of this file are subject to the terms of the
+ * Common Development and Distribution License (the "License").
+ * You may not use this file except in compliance with the License.
+ *
+ * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
+ * or http://www.opensolaris.org/os/licensing.
+ * See the License for the specific language governing permissions
+ * and limitations under the License.
+ *
+ * When distributing Covered Code, include this CDDL HEADER in each
+ * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
+ * If applicable, add the following below this CDDL HEADER, with the
+ * fields enclosed by brackets "[]" replaced with your own identifying
+ * information: Portions Copyright [yyyy] [name of copyright owner]
+ *
+ * CDDL HEADER END
+ */
+/*
+ * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved.
+ * Copyright 2011 Nexenta Systems, Inc. All rights reserved.
+ * Copyright (c) 2011, 2015 by Delphix. All rights reserved.
+ * Copyright (c) 2014, Joyent, Inc. All rights reserved.
+ * Copyright (c) 2012, Martin Matuska <mm@FreeBSD.org>. All rights reserved.
+ * Copyright 2014 HybridCluster. All rights reserved.
+ * Copyright 2016 RackTop Systems.
+ * Copyright (c) 2014 Integros [integros.com]
+ * Copyright (c) 2018, loli10K <ezomori.nozomu@gmail.com>. All rights reserved.
+ */
+
+#include <sys/dmu.h>
+#include <sys/dmu_impl.h>
+#include <sys/dmu_tx.h>
+#include <sys/dbuf.h>
+#include <sys/dnode.h>
+#include <sys/zfs_context.h>
+#include <sys/dmu_objset.h>
+#include <sys/dmu_traverse.h>
+#include <sys/dsl_dataset.h>
+#include <sys/dsl_dir.h>
+#include <sys/dsl_prop.h>
+#include <sys/dsl_pool.h>
+#include <sys/dsl_synctask.h>
+#include <sys/zfs_ioctl.h>
+#include <sys/zap.h>
+#include <sys/zio_checksum.h>
+#include <sys/zfs_znode.h>
+#include <zfs_fletcher.h>
+#include <sys/avl.h>
+#include <sys/ddt.h>
+#include <sys/zfs_onexit.h>
+#include <sys/dmu_send.h>
+#include <sys/dsl_destroy.h>
+#include <sys/blkptr.h>
+#include <sys/dsl_bookmark.h>
+#include <sys/zfeature.h>
+#include <sys/bqueue.h>
+#ifdef __FreeBSD__
+#include <sys/zvol.h>
+#endif
+
+#ifdef __FreeBSD__
+#undef dump_write
+#define dump_write dmu_dump_write
+#endif
+
+/* Set this tunable to TRUE to replace corrupt data with 0x2f5baddb10c */
+int zfs_send_corrupt_data = B_FALSE;
+int zfs_send_queue_length = 16 * 1024 * 1024;
+int zfs_recv_queue_length = 16 * 1024 * 1024;
+/* Set this tunable to FALSE to disable setting of DRR_FLAG_FREERECORDS */
+int zfs_send_set_freerecords_bit = B_TRUE;
+
+#ifdef _KERNEL
+TUNABLE_INT("vfs.zfs.send_set_freerecords_bit", &zfs_send_set_freerecords_bit);
+#endif
+
+static char *dmu_recv_tag = "dmu_recv_tag";
+const char *recv_clone_name = "%recv";
+
+/*
+ * Use this to override the recordsize calculation for fast zfs send estimates.
+ */
+uint64_t zfs_override_estimate_recordsize = 0;
+
+#define BP_SPAN(datablkszsec, indblkshift, level) \
+ (((uint64_t)datablkszsec) << (SPA_MINBLOCKSHIFT + \
+ (level) * (indblkshift - SPA_BLKPTRSHIFT)))
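+
+/*
+ * Worked example (illustrative): with 128K data blocks (datablkszsec ==
+ * 256, i.e. 256 512-byte sectors) and indblkshift == 17, a level-0
+ * entry spans 256 << 9 == 128K and a level-1 entry spans
+ * 256 << (9 + 10) == 128M, since each 128K indirect block holds
+ * 2^(17-7) == 1024 128-byte block pointers.
+ */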
+
+static void byteswap_record(dmu_replay_record_t *drr);
+
+struct send_thread_arg {
+ bqueue_t q;
+ dsl_dataset_t *ds; /* Dataset to traverse */
+ uint64_t fromtxg; /* Traverse from this txg */
+ int flags; /* flags to pass to traverse_dataset */
+ int error_code;
+ boolean_t cancel;
+ zbookmark_phys_t resume;
+};
+
+struct send_block_record {
+ boolean_t eos_marker; /* Marks the end of the stream */
+ blkptr_t bp;
+ zbookmark_phys_t zb;
+ uint8_t indblkshift;
+ uint16_t datablkszsec;
+ bqueue_node_t ln;
+};
+
+static int
+dump_bytes(dmu_sendarg_t *dsp, void *buf, int len)
+{
+ dsl_dataset_t *ds = dmu_objset_ds(dsp->dsa_os);
+ struct uio auio;
+ struct iovec aiov;
+
+ /*
+ * The code does not rely on this (len being a multiple of 8). We keep
+ * this assertion because of the corresponding assertion in
+ * receive_read(). Keeping this assertion ensures that we do not
+ * inadvertently break backwards compatibility (causing the assertion
+ * in receive_read() to trigger on old software).
+ *
+ * Removing the assertions could be rolled into a new feature that uses
+ * data that isn't 8-byte aligned; if the assertions were removed, a
+ * feature flag would have to be added.
+ */
+
+ ASSERT0(len % 8);
+
+ aiov.iov_base = buf;
+ aiov.iov_len = len;
+ auio.uio_iov = &aiov;
+ auio.uio_iovcnt = 1;
+ auio.uio_resid = len;
+ auio.uio_segflg = UIO_SYSSPACE;
+ auio.uio_rw = UIO_WRITE;
+ auio.uio_offset = (off_t)-1;
+ auio.uio_td = dsp->dsa_td;
+#ifdef _KERNEL
+ if (dsp->dsa_fp->f_type == DTYPE_VNODE)
+ bwillwrite();
+ dsp->dsa_err = fo_write(dsp->dsa_fp, &auio, dsp->dsa_td->td_ucred, 0,
+ dsp->dsa_td);
+#else
+ fprintf(stderr, "%s: returning EOPNOTSUPP\n", __func__);
+ dsp->dsa_err = EOPNOTSUPP;
+#endif
+ mutex_enter(&ds->ds_sendstream_lock);
+ *dsp->dsa_off += len;
+ mutex_exit(&ds->ds_sendstream_lock);
+
+ return (dsp->dsa_err);
+}
+
+/*
+ * For all record types except BEGIN, fill in the checksum (overlaid in
+ * drr_u.drr_checksum.drr_checksum). The checksum verifies everything
+ * up to the start of the checksum itself.
+ */
+static int
+dump_record(dmu_sendarg_t *dsp, void *payload, int payload_len)
+{
+ ASSERT3U(offsetof(dmu_replay_record_t, drr_u.drr_checksum.drr_checksum),
+ ==, sizeof (dmu_replay_record_t) - sizeof (zio_cksum_t));
+ (void) fletcher_4_incremental_native(dsp->dsa_drr,
+ offsetof(dmu_replay_record_t, drr_u.drr_checksum.drr_checksum),
+ &dsp->dsa_zc);
+ if (dsp->dsa_drr->drr_type == DRR_BEGIN) {
+ dsp->dsa_sent_begin = B_TRUE;
+ } else {
+ ASSERT(ZIO_CHECKSUM_IS_ZERO(&dsp->dsa_drr->drr_u.
+ drr_checksum.drr_checksum));
+ dsp->dsa_drr->drr_u.drr_checksum.drr_checksum = dsp->dsa_zc;
+ }
+ if (dsp->dsa_drr->drr_type == DRR_END) {
+ dsp->dsa_sent_end = B_TRUE;
+ }
+ (void) fletcher_4_incremental_native(&dsp->dsa_drr->
+ drr_u.drr_checksum.drr_checksum,
+ sizeof (zio_cksum_t), &dsp->dsa_zc);
+ if (dump_bytes(dsp, dsp->dsa_drr, sizeof (dmu_replay_record_t)) != 0)
+ return (SET_ERROR(EINTR));
+ if (payload_len != 0) {
+ (void) fletcher_4_incremental_native(payload, payload_len,
+ &dsp->dsa_zc);
+ if (dump_bytes(dsp, payload, payload_len) != 0)
+ return (SET_ERROR(EINTR));
+ }
+ return (0);
+}
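+
+/*
+ * Stream layout note (descriptive): for a non-BEGIN record, the stored
+ * drr_checksum covers every byte of the stream up to, but not
+ * including, the checksum field itself; the running fletcher-4 state
+ * in dsa_zc then continues across the checksum field and any payload,
+ * so a receiver can verify the stream incrementally.
+ */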
+
+/*
+ * Fill in the drr_free struct, or perform aggregation if the previous record is
+ * also a free record, and the two are adjacent.
+ *
+ * Note that we send free records even for a full send, because we want to be
+ * able to receive a full send as a clone, which requires a list of all the free
+ * and freeobject records that were generated on the source.
+ */
+static int
+dump_free(dmu_sendarg_t *dsp, uint64_t object, uint64_t offset,
+ uint64_t length)
+{
+ struct drr_free *drrf = &(dsp->dsa_drr->drr_u.drr_free);
+
+ /*
+ * When we receive a free record, dbuf_free_range() assumes
+ * that the receiving system doesn't have any dbufs in the range
+ * being freed. This is always true because there is a one-record
+ * constraint: we only send one WRITE record for any given
+ * object,offset. We know that the one-record constraint is
+ * true because we always send data in increasing order by
+ * object,offset.
+ *
+ * If the increasing-order constraint ever changes, we should find
+ * another way to assert that the one-record constraint is still
+ * satisfied.
+ */
+ ASSERT(object > dsp->dsa_last_data_object ||
+ (object == dsp->dsa_last_data_object &&
+ offset > dsp->dsa_last_data_offset));
+
+ if (length != -1ULL && offset + length < offset)
+ length = -1ULL;
+
+ /*
+ * If there is a pending op, but it's not PENDING_FREE, push it out,
+ * since free block aggregation can only be done for blocks of the
+ * same type (i.e., DRR_FREE records can only be aggregated with
+ * other DRR_FREE records; DRR_FREEOBJECTS records can only be
+ * aggregated with other DRR_FREEOBJECTS records).
+ */
+ if (dsp->dsa_pending_op != PENDING_NONE &&
+ dsp->dsa_pending_op != PENDING_FREE) {
+ if (dump_record(dsp, NULL, 0) != 0)
+ return (SET_ERROR(EINTR));
+ dsp->dsa_pending_op = PENDING_NONE;
+ }
+
+ if (dsp->dsa_pending_op == PENDING_FREE) {
+ /*
+ * There should never be a PENDING_FREE if length is -1
+ * (because dump_dnode is the only place where this
+ * function is called with a -1, and only after flushing
+ * any pending record).
+ */
+ ASSERT(length != -1ULL);
+ /*
+ * Check to see whether this free block can be aggregated
+ * with the pending one.
+ */
+ if (drrf->drr_object == object && drrf->drr_offset +
+ drrf->drr_length == offset) {
+ drrf->drr_length += length;
+ return (0);
+ } else {
+ /* not a continuation. Push out pending record */
+ if (dump_record(dsp, NULL, 0) != 0)
+ return (SET_ERROR(EINTR));
+ dsp->dsa_pending_op = PENDING_NONE;
+ }
+ }
+ /* create a FREE record and make it pending */
+ bzero(dsp->dsa_drr, sizeof (dmu_replay_record_t));
+ dsp->dsa_drr->drr_type = DRR_FREE;
+ drrf->drr_object = object;
+ drrf->drr_offset = offset;
+ drrf->drr_length = length;
+ drrf->drr_toguid = dsp->dsa_toguid;
+ if (length == -1ULL) {
+ if (dump_record(dsp, NULL, 0) != 0)
+ return (SET_ERROR(EINTR));
+ } else {
+ dsp->dsa_pending_op = PENDING_FREE;
+ }
+
+ return (0);
+}
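+
+/*
+ * For example (hypothetical values), a free of (object 5, offset 0,
+ * length 8192) followed by one of (object 5, offset 8192, length 8192)
+ * is merged into a single pending DRR_FREE record covering offset 0,
+ * length 16384; the record is only pushed to the stream once a
+ * non-adjacent or differently-typed operation arrives.
+ */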
+
+static int
+dump_write(dmu_sendarg_t *dsp, dmu_object_type_t type,
+ uint64_t object, uint64_t offset, int lsize, int psize, const blkptr_t *bp,
+ void *data)
+{
+ uint64_t payload_size;
+ struct drr_write *drrw = &(dsp->dsa_drr->drr_u.drr_write);
+
+ /*
+ * We send data in increasing object, offset order.
+ * See comment in dump_free() for details.
+ */
+ ASSERT(object > dsp->dsa_last_data_object ||
+ (object == dsp->dsa_last_data_object &&
+ offset > dsp->dsa_last_data_offset));
+ dsp->dsa_last_data_object = object;
+ dsp->dsa_last_data_offset = offset + lsize - 1;
+
+ /*
+ * If there is any kind of pending aggregation (currently either
+ * a grouping of free objects or free blocks), push it out to
+ * the stream, since aggregation can't be done across operations
+ * of different types.
+ */
+ if (dsp->dsa_pending_op != PENDING_NONE) {
+ if (dump_record(dsp, NULL, 0) != 0)
+ return (SET_ERROR(EINTR));
+ dsp->dsa_pending_op = PENDING_NONE;
+ }
+ /* write a WRITE record */
+ bzero(dsp->dsa_drr, sizeof (dmu_replay_record_t));
+ dsp->dsa_drr->drr_type = DRR_WRITE;
+ drrw->drr_object = object;
+ drrw->drr_type = type;
+ drrw->drr_offset = offset;
+ drrw->drr_toguid = dsp->dsa_toguid;
+ drrw->drr_logical_size = lsize;
+
+ /* only set the compression fields if the buf is compressed */
+ if (lsize != psize) {
+ ASSERT(dsp->dsa_featureflags & DMU_BACKUP_FEATURE_COMPRESSED);
+ ASSERT(!BP_IS_EMBEDDED(bp));
+ ASSERT(!BP_SHOULD_BYTESWAP(bp));
+ ASSERT(!DMU_OT_IS_METADATA(BP_GET_TYPE(bp)));
+ ASSERT3U(BP_GET_COMPRESS(bp), !=, ZIO_COMPRESS_OFF);
+ ASSERT3S(psize, >, 0);
+ ASSERT3S(lsize, >=, psize);
+
+ drrw->drr_compressiontype = BP_GET_COMPRESS(bp);
+ drrw->drr_compressed_size = psize;
+ payload_size = drrw->drr_compressed_size;
+ } else {
+ payload_size = drrw->drr_logical_size;
+ }
+
+ if (bp == NULL || BP_IS_EMBEDDED(bp)) {
+ /*
+ * There's no pre-computed checksum for partial-block
+ * writes or embedded BPs, so (like
+ * fletcher4-checksummed blocks) userland will have to
+ * compute a dedup-capable checksum itself.
+ */
+ drrw->drr_checksumtype = ZIO_CHECKSUM_OFF;
+ } else {
+ drrw->drr_checksumtype = BP_GET_CHECKSUM(bp);
+ if (zio_checksum_table[drrw->drr_checksumtype].ci_flags &
+ ZCHECKSUM_FLAG_DEDUP)
+ drrw->drr_checksumflags |= DRR_CHECKSUM_DEDUP;
+ DDK_SET_LSIZE(&drrw->drr_key, BP_GET_LSIZE(bp));
+ DDK_SET_PSIZE(&drrw->drr_key, BP_GET_PSIZE(bp));
+ DDK_SET_COMPRESS(&drrw->drr_key, BP_GET_COMPRESS(bp));
+ drrw->drr_key.ddk_cksum = bp->blk_cksum;
+ }
+
+ if (dump_record(dsp, data, payload_size) != 0)
+ return (SET_ERROR(EINTR));
+ return (0);
+}
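+
+/*
+ * For example (hypothetical sizes), a 128K logical block that
+ * compressed to 20K on disk is sent with drr_logical_size == 131072
+ * and drr_compressed_size == 20480, and only the 20K compressed
+ * payload follows the record; this requires the stream to carry
+ * DMU_BACKUP_FEATURE_COMPRESSED, per the ASSERTs above.
+ */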
+
+static int
+dump_write_embedded(dmu_sendarg_t *dsp, uint64_t object, uint64_t offset,
+ int blksz, const blkptr_t *bp)
+{
+ char buf[BPE_PAYLOAD_SIZE];
+ struct drr_write_embedded *drrw =
+ &(dsp->dsa_drr->drr_u.drr_write_embedded);
+
+ if (dsp->dsa_pending_op != PENDING_NONE) {
+ if (dump_record(dsp, NULL, 0) != 0)
+ return (EINTR);
+ dsp->dsa_pending_op = PENDING_NONE;
+ }
+
+ ASSERT(BP_IS_EMBEDDED(bp));
+
+ bzero(dsp->dsa_drr, sizeof (dmu_replay_record_t));
+ dsp->dsa_drr->drr_type = DRR_WRITE_EMBEDDED;
+ drrw->drr_object = object;
+ drrw->drr_offset = offset;
+ drrw->drr_length = blksz;
+ drrw->drr_toguid = dsp->dsa_toguid;
+ drrw->drr_compression = BP_GET_COMPRESS(bp);
+ drrw->drr_etype = BPE_GET_ETYPE(bp);
+ drrw->drr_lsize = BPE_GET_LSIZE(bp);
+ drrw->drr_psize = BPE_GET_PSIZE(bp);
+
+ decode_embedded_bp_compressed(bp, buf);
+
+ if (dump_record(dsp, buf, P2ROUNDUP(drrw->drr_psize, 8)) != 0)
+		return (SET_ERROR(EINTR));
+ return (0);
+}
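+
+/*
+ * The payload of a WRITE_EMBEDDED record is padded to an 8-byte boundary
+ * by the P2ROUNDUP() above; e.g. an (assumed) 93-byte compressed embedded
+ * payload goes out as 96 bytes on the wire, while drr_psize still records
+ * the true 93-byte length.
+ */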
+
+static int
+dump_spill(dmu_sendarg_t *dsp, uint64_t object, int blksz, void *data)
+{
+ struct drr_spill *drrs = &(dsp->dsa_drr->drr_u.drr_spill);
+
+ if (dsp->dsa_pending_op != PENDING_NONE) {
+ if (dump_record(dsp, NULL, 0) != 0)
+ return (SET_ERROR(EINTR));
+ dsp->dsa_pending_op = PENDING_NONE;
+ }
+
+ /* write a SPILL record */
+ bzero(dsp->dsa_drr, sizeof (dmu_replay_record_t));
+ dsp->dsa_drr->drr_type = DRR_SPILL;
+ drrs->drr_object = object;
+ drrs->drr_length = blksz;
+ drrs->drr_toguid = dsp->dsa_toguid;
+
+ if (dump_record(dsp, data, blksz) != 0)
+ return (SET_ERROR(EINTR));
+ return (0);
+}
+
+static int
+dump_freeobjects(dmu_sendarg_t *dsp, uint64_t firstobj, uint64_t numobjs)
+{
+ struct drr_freeobjects *drrfo = &(dsp->dsa_drr->drr_u.drr_freeobjects);
+
+	/*
+	 * If there is a pending op, but it's not PENDING_FREEOBJECTS,
+	 * push it out, since free block aggregation can only be done for
+	 * blocks of the same type (i.e., DRR_FREE records can only be
+	 * aggregated with other DRR_FREE records, and DRR_FREEOBJECTS
+	 * records can only be aggregated with other DRR_FREEOBJECTS
+	 * records).
+	 */
+ if (dsp->dsa_pending_op != PENDING_NONE &&
+ dsp->dsa_pending_op != PENDING_FREEOBJECTS) {
+ if (dump_record(dsp, NULL, 0) != 0)
+ return (SET_ERROR(EINTR));
+ dsp->dsa_pending_op = PENDING_NONE;
+ }
+ if (dsp->dsa_pending_op == PENDING_FREEOBJECTS) {
+		/*
+		 * See whether this free object array can be aggregated
+		 * with the pending one.
+		 */
+ if (drrfo->drr_firstobj + drrfo->drr_numobjs == firstobj) {
+ drrfo->drr_numobjs += numobjs;
+ return (0);
+ } else {
+ /* can't be aggregated. Push out pending record */
+ if (dump_record(dsp, NULL, 0) != 0)
+ return (SET_ERROR(EINTR));
+ dsp->dsa_pending_op = PENDING_NONE;
+ }
+ }
+
+ /* write a FREEOBJECTS record */
+ bzero(dsp->dsa_drr, sizeof (dmu_replay_record_t));
+ dsp->dsa_drr->drr_type = DRR_FREEOBJECTS;
+ drrfo->drr_firstobj = firstobj;
+ drrfo->drr_numobjs = numobjs;
+ drrfo->drr_toguid = dsp->dsa_toguid;
+
+ dsp->dsa_pending_op = PENDING_FREEOBJECTS;
+
+ return (0);
+}
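+
+/*
+ * Aggregation example for the test above (object numbers are
+ * hypothetical): with a pending FREEOBJECTS record of
+ * { drr_firstobj = 100, drr_numobjs = 50 }, a call for firstobj = 150,
+ * numobjs = 10 satisfies 100 + 50 == 150 and simply widens the pending
+ * record to drr_numobjs = 60; nothing is written to the stream until a
+ * non-mergeable operation forces the record out.
+ */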
+
+static int
+dump_dnode(dmu_sendarg_t *dsp, uint64_t object, dnode_phys_t *dnp)
+{
+ struct drr_object *drro = &(dsp->dsa_drr->drr_u.drr_object);
+
+ if (object < dsp->dsa_resume_object) {
+ /*
+ * Note: when resuming, we will visit all the dnodes in
+ * the block of dnodes that we are resuming from. In
+ * this case it's unnecessary to send the dnodes prior to
+ * the one we are resuming from. We should be at most one
+ * block's worth of dnodes behind the resume point.
+ */
+ ASSERT3U(dsp->dsa_resume_object - object, <,
+ 1 << (DNODE_BLOCK_SHIFT - DNODE_SHIFT));
+ return (0);
+ }
+
+ if (dnp == NULL || dnp->dn_type == DMU_OT_NONE)
+ return (dump_freeobjects(dsp, object, 1));
+
+ if (dsp->dsa_pending_op != PENDING_NONE) {
+ if (dump_record(dsp, NULL, 0) != 0)
+ return (SET_ERROR(EINTR));
+ dsp->dsa_pending_op = PENDING_NONE;
+ }
+
+ /* write an OBJECT record */
+ bzero(dsp->dsa_drr, sizeof (dmu_replay_record_t));
+ dsp->dsa_drr->drr_type = DRR_OBJECT;
+ drro->drr_object = object;
+ drro->drr_type = dnp->dn_type;
+ drro->drr_bonustype = dnp->dn_bonustype;
+ drro->drr_blksz = dnp->dn_datablkszsec << SPA_MINBLOCKSHIFT;
+ drro->drr_bonuslen = dnp->dn_bonuslen;
+ drro->drr_dn_slots = dnp->dn_extra_slots + 1;
+ drro->drr_checksumtype = dnp->dn_checksum;
+ drro->drr_compress = dnp->dn_compress;
+ drro->drr_toguid = dsp->dsa_toguid;
+
+ if (!(dsp->dsa_featureflags & DMU_BACKUP_FEATURE_LARGE_BLOCKS) &&
+ drro->drr_blksz > SPA_OLD_MAXBLOCKSIZE)
+ drro->drr_blksz = SPA_OLD_MAXBLOCKSIZE;
+
+ if (dump_record(dsp, DN_BONUS(dnp),
+ P2ROUNDUP(dnp->dn_bonuslen, 8)) != 0) {
+ return (SET_ERROR(EINTR));
+ }
+
+ /* Free anything past the end of the file. */
+ if (dump_free(dsp, object, (dnp->dn_maxblkid + 1) *
+ (dnp->dn_datablkszsec << SPA_MINBLOCKSHIFT), -1ULL) != 0)
+ return (SET_ERROR(EINTR));
+ if (dsp->dsa_err != 0)
+ return (SET_ERROR(EINTR));
+ return (0);
+}
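+
+/*
+ * A note on the resume assertion above: with the standard 16K dnode
+ * blocks (DNODE_BLOCK_SHIFT == 14) and 512-byte dnodes (DNODE_SHIFT == 9),
+ * 1 << (14 - 9) == 32, i.e. a resumed send may revisit at most the 32
+ * dnodes that share the resume point's block of dnodes.
+ */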
+
+static boolean_t
+backup_do_embed(dmu_sendarg_t *dsp, const blkptr_t *bp)
+{
+ if (!BP_IS_EMBEDDED(bp))
+ return (B_FALSE);
+
+ /*
+ * Compression function must be legacy, or explicitly enabled.
+ */
+ if ((BP_GET_COMPRESS(bp) >= ZIO_COMPRESS_LEGACY_FUNCTIONS &&
+ !(dsp->dsa_featureflags & DMU_BACKUP_FEATURE_LZ4)))
+ return (B_FALSE);
+
+ /*
+ * Embed type must be explicitly enabled.
+ */
+ switch (BPE_GET_ETYPE(bp)) {
+ case BP_EMBEDDED_TYPE_DATA:
+ if (dsp->dsa_featureflags & DMU_BACKUP_FEATURE_EMBED_DATA)
+ return (B_TRUE);
+ break;
+ default:
+ return (B_FALSE);
+ }
+ return (B_FALSE);
+}
+
+/*
+ * This is the callback function for traverse_dataset; it runs in the
+ * traversal (worker) thread spawned by dmu_send_impl and enqueues one
+ * record per block visited.
+ */
+/*ARGSUSED*/
+static int
+send_cb(spa_t *spa, zilog_t *zilog, const blkptr_t *bp,
+ const zbookmark_phys_t *zb, const struct dnode_phys *dnp, void *arg)
+{
+ struct send_thread_arg *sta = arg;
+ struct send_block_record *record;
+ uint64_t record_size;
+ int err = 0;
+
+ ASSERT(zb->zb_object == DMU_META_DNODE_OBJECT ||
+ zb->zb_object >= sta->resume.zb_object);
+
+ if (sta->cancel)
+ return (SET_ERROR(EINTR));
+
+ if (bp == NULL) {
+ ASSERT3U(zb->zb_level, ==, ZB_DNODE_LEVEL);
+ return (0);
+ } else if (zb->zb_level < 0) {
+ return (0);
+ }
+
+ record = kmem_zalloc(sizeof (struct send_block_record), KM_SLEEP);
+ record->eos_marker = B_FALSE;
+ record->bp = *bp;
+ record->zb = *zb;
+ record->indblkshift = dnp->dn_indblkshift;
+ record->datablkszsec = dnp->dn_datablkszsec;
+ record_size = dnp->dn_datablkszsec << SPA_MINBLOCKSHIFT;
+ bqueue_enqueue(&sta->q, record, record_size);
+
+ return (err);
+}
+
+/*
+ * This function kicks off traverse_dataset(). It also handles setting the
+ * error code of the thread in case something goes wrong, and pushes the End
+ * of Stream record when the traverse_dataset() call has finished. If there
+ * is no dataset to traverse, the thread immediately pushes the End of
+ * Stream marker.
+ */
+static void
+send_traverse_thread(void *arg)
+{
+ struct send_thread_arg *st_arg = arg;
+ int err;
+ struct send_block_record *data;
+
+ if (st_arg->ds != NULL) {
+ err = traverse_dataset_resume(st_arg->ds,
+ st_arg->fromtxg, &st_arg->resume,
+ st_arg->flags, send_cb, st_arg);
+
+ if (err != EINTR)
+ st_arg->error_code = err;
+ }
+ data = kmem_zalloc(sizeof (*data), KM_SLEEP);
+ data->eos_marker = B_TRUE;
+ bqueue_enqueue(&st_arg->q, data, 1);
+ thread_exit();
+}
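+
+/*
+ * The producer/consumer hand-off, sketched (names refer to the code above
+ * and below):
+ *
+ *   traverse thread: send_cb() -> bqueue_enqueue(&q, record)
+ *                    ... finally enqueues a record with eos_marker set
+ *   main thread:     loops do_dump() / get_next_record() until it
+ *                    dequeues the eos_marker record
+ *
+ * Each enqueue is credited with the data block size, so a slow consumer
+ * backpressures the traversal instead of letting records pile up past
+ * zfs_send_queue_length.
+ */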
+
+/*
+ * This function figures out what kind of record needs to be dumped, reads
+ * the data (which should already have been prefetched), and calls the
+ * appropriate helper function.
+ */
+static int
+do_dump(dmu_sendarg_t *dsa, struct send_block_record *data)
+{
+ dsl_dataset_t *ds = dmu_objset_ds(dsa->dsa_os);
+ const blkptr_t *bp = &data->bp;
+ const zbookmark_phys_t *zb = &data->zb;
+ uint8_t indblkshift = data->indblkshift;
+ uint16_t dblkszsec = data->datablkszsec;
+ spa_t *spa = ds->ds_dir->dd_pool->dp_spa;
+ dmu_object_type_t type = bp ? BP_GET_TYPE(bp) : DMU_OT_NONE;
+ int err = 0;
+
+ ASSERT3U(zb->zb_level, >=, 0);
+
+ ASSERT(zb->zb_object == DMU_META_DNODE_OBJECT ||
+ zb->zb_object >= dsa->dsa_resume_object);
+
+ if (zb->zb_object != DMU_META_DNODE_OBJECT &&
+ DMU_OBJECT_IS_SPECIAL(zb->zb_object)) {
+ return (0);
+ } else if (BP_IS_HOLE(bp) &&
+ zb->zb_object == DMU_META_DNODE_OBJECT) {
+ uint64_t span = BP_SPAN(dblkszsec, indblkshift, zb->zb_level);
+ uint64_t dnobj = (zb->zb_blkid * span) >> DNODE_SHIFT;
+ err = dump_freeobjects(dsa, dnobj, span >> DNODE_SHIFT);
+ } else if (BP_IS_HOLE(bp)) {
+ uint64_t span = BP_SPAN(dblkszsec, indblkshift, zb->zb_level);
+ uint64_t offset = zb->zb_blkid * span;
+ err = dump_free(dsa, zb->zb_object, offset, span);
+ } else if (zb->zb_level > 0 || type == DMU_OT_OBJSET) {
+ return (0);
+ } else if (type == DMU_OT_DNODE) {
+ int epb = BP_GET_LSIZE(bp) >> DNODE_SHIFT;
+ arc_flags_t aflags = ARC_FLAG_WAIT;
+ arc_buf_t *abuf;
+
+ ASSERT0(zb->zb_level);
+
+ if (arc_read(NULL, spa, bp, arc_getbuf_func, &abuf,
+ ZIO_PRIORITY_ASYNC_READ, ZIO_FLAG_CANFAIL,
+ &aflags, zb) != 0)
+ return (SET_ERROR(EIO));
+
+ dnode_phys_t *blk = abuf->b_data;
+ uint64_t dnobj = zb->zb_blkid * epb;
+ for (int i = 0; i < epb; i += blk[i].dn_extra_slots + 1) {
+ err = dump_dnode(dsa, dnobj + i, blk + i);
+ if (err != 0)
+ break;
+ }
+ arc_buf_destroy(abuf, &abuf);
+ } else if (type == DMU_OT_SA) {
+ arc_flags_t aflags = ARC_FLAG_WAIT;
+ arc_buf_t *abuf;
+ int blksz = BP_GET_LSIZE(bp);
+
+ if (arc_read(NULL, spa, bp, arc_getbuf_func, &abuf,
+ ZIO_PRIORITY_ASYNC_READ, ZIO_FLAG_CANFAIL,
+ &aflags, zb) != 0)
+ return (SET_ERROR(EIO));
+
+ err = dump_spill(dsa, zb->zb_object, blksz, abuf->b_data);
+ arc_buf_destroy(abuf, &abuf);
+ } else if (backup_do_embed(dsa, bp)) {
+ /* it's an embedded level-0 block of a regular object */
+ int blksz = dblkszsec << SPA_MINBLOCKSHIFT;
+ ASSERT0(zb->zb_level);
+ err = dump_write_embedded(dsa, zb->zb_object,
+ zb->zb_blkid * blksz, blksz, bp);
+ } else {
+ /* it's a level-0 block of a regular object */
+ arc_flags_t aflags = ARC_FLAG_WAIT;
+ arc_buf_t *abuf;
+ int blksz = dblkszsec << SPA_MINBLOCKSHIFT;
+ uint64_t offset;
+
+ /*
+ * If we have large blocks stored on disk but the send flags
+ * don't allow us to send large blocks, we split the data from
+ * the arc buf into chunks.
+ */
+ boolean_t split_large_blocks = blksz > SPA_OLD_MAXBLOCKSIZE &&
+ !(dsa->dsa_featureflags & DMU_BACKUP_FEATURE_LARGE_BLOCKS);
+ /*
+ * We should only request compressed data from the ARC if all
+ * the following are true:
+ * - stream compression was requested
+ * - we aren't splitting large blocks into smaller chunks
+ * - the data won't need to be byteswapped before sending
+ * - this isn't an embedded block
+ * - this isn't metadata (if receiving on a different endian
+ * system it can be byteswapped more easily)
+ */
+ boolean_t request_compressed =
+ (dsa->dsa_featureflags & DMU_BACKUP_FEATURE_COMPRESSED) &&
+ !split_large_blocks && !BP_SHOULD_BYTESWAP(bp) &&
+ !BP_IS_EMBEDDED(bp) && !DMU_OT_IS_METADATA(BP_GET_TYPE(bp));
+
+ ASSERT0(zb->zb_level);
+ ASSERT(zb->zb_object > dsa->dsa_resume_object ||
+ (zb->zb_object == dsa->dsa_resume_object &&
+ zb->zb_blkid * blksz >= dsa->dsa_resume_offset));
+
+ ASSERT3U(blksz, ==, BP_GET_LSIZE(bp));
+
+ enum zio_flag zioflags = ZIO_FLAG_CANFAIL;
+ if (request_compressed)
+ zioflags |= ZIO_FLAG_RAW;
+ if (arc_read(NULL, spa, bp, arc_getbuf_func, &abuf,
+ ZIO_PRIORITY_ASYNC_READ, zioflags, &aflags, zb) != 0) {
+ if (zfs_send_corrupt_data) {
+ /* Send a block filled with 0x"zfs badd bloc" */
+ abuf = arc_alloc_buf(spa, &abuf, ARC_BUFC_DATA,
+ blksz);
+ uint64_t *ptr;
+ for (ptr = abuf->b_data;
+ (char *)ptr < (char *)abuf->b_data + blksz;
+ ptr++)
+ *ptr = 0x2f5baddb10cULL;
+ } else {
+ return (SET_ERROR(EIO));
+ }
+ }
+
+ offset = zb->zb_blkid * blksz;
+
+ if (split_large_blocks) {
+ ASSERT3U(arc_get_compression(abuf), ==,
+ ZIO_COMPRESS_OFF);
+ char *buf = abuf->b_data;
+ while (blksz > 0 && err == 0) {
+ int n = MIN(blksz, SPA_OLD_MAXBLOCKSIZE);
+ err = dump_write(dsa, type, zb->zb_object,
+ offset, n, n, NULL, buf);
+ offset += n;
+ buf += n;
+ blksz -= n;
+ }
+ } else {
+ err = dump_write(dsa, type, zb->zb_object, offset,
+ blksz, arc_buf_size(abuf), bp, abuf->b_data);
+ }
+ arc_buf_destroy(abuf, &abuf);
+ }
+
+ ASSERT(err == 0 || err == EINTR);
+ return (err);
+}
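+
+/*
+ * Worked example for the meta-dnode hole case above (assuming the
+ * standard 16K dnode blocks): dblkszsec = 32, so a level-0 hole spans
+ * BP_SPAN(32, indblkshift, 0) = 32 << SPA_MINBLOCKSHIFT = 16384 bytes,
+ * and the resulting FREEOBJECTS record frees span >> DNODE_SHIFT = 32
+ * objects starting at object zb_blkid * 32.
+ */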
+
+/*
+ * Pop the new data off the queue, and free the old data.
+ */
+static struct send_block_record *
+get_next_record(bqueue_t *bq, struct send_block_record *data)
+{
+ struct send_block_record *tmp = bqueue_dequeue(bq);
+ kmem_free(data, sizeof (*data));
+ return (tmp);
+}
+
+/*
+ * Actually do the bulk of the work in a zfs send.
+ *
+ * Note: Releases dp using the specified tag.
+ */
+static int
+dmu_send_impl(void *tag, dsl_pool_t *dp, dsl_dataset_t *to_ds,
+ zfs_bookmark_phys_t *ancestor_zb, boolean_t is_clone,
+ boolean_t embedok, boolean_t large_block_ok, boolean_t compressok,
+ int outfd, uint64_t resumeobj, uint64_t resumeoff,
+#ifdef illumos
+ vnode_t *vp, offset_t *off)
+#else
+ struct file *fp, offset_t *off)
+#endif
+{
+ objset_t *os;
+ dmu_replay_record_t *drr;
+ dmu_sendarg_t *dsp;
+ int err;
+ uint64_t fromtxg = 0;
+ uint64_t featureflags = 0;
+ struct send_thread_arg to_arg = { 0 };
+
+ err = dmu_objset_from_ds(to_ds, &os);
+ if (err != 0) {
+ dsl_pool_rele(dp, tag);
+ return (err);
+ }
+
+ drr = kmem_zalloc(sizeof (dmu_replay_record_t), KM_SLEEP);
+ drr->drr_type = DRR_BEGIN;
+ drr->drr_u.drr_begin.drr_magic = DMU_BACKUP_MAGIC;
+ DMU_SET_STREAM_HDRTYPE(drr->drr_u.drr_begin.drr_versioninfo,
+ DMU_SUBSTREAM);
+
+#ifdef _KERNEL
+ if (dmu_objset_type(os) == DMU_OST_ZFS) {
+ uint64_t version;
+ if (zfs_get_zplprop(os, ZFS_PROP_VERSION, &version) != 0) {
+ kmem_free(drr, sizeof (dmu_replay_record_t));
+ dsl_pool_rele(dp, tag);
+ return (SET_ERROR(EINVAL));
+ }
+ if (version >= ZPL_VERSION_SA) {
+ featureflags |= DMU_BACKUP_FEATURE_SA_SPILL;
+ }
+ }
+#endif
+
+ if (large_block_ok && to_ds->ds_feature_inuse[SPA_FEATURE_LARGE_BLOCKS])
+ featureflags |= DMU_BACKUP_FEATURE_LARGE_BLOCKS;
+ if (to_ds->ds_feature_inuse[SPA_FEATURE_LARGE_DNODE])
+ featureflags |= DMU_BACKUP_FEATURE_LARGE_DNODE;
+ if (embedok &&
+ spa_feature_is_active(dp->dp_spa, SPA_FEATURE_EMBEDDED_DATA)) {
+ featureflags |= DMU_BACKUP_FEATURE_EMBED_DATA;
+ if (spa_feature_is_active(dp->dp_spa, SPA_FEATURE_LZ4_COMPRESS))
+ featureflags |= DMU_BACKUP_FEATURE_LZ4;
+ }
+ if (compressok) {
+ featureflags |= DMU_BACKUP_FEATURE_COMPRESSED;
+ }
+ if ((featureflags &
+ (DMU_BACKUP_FEATURE_EMBED_DATA | DMU_BACKUP_FEATURE_COMPRESSED)) !=
+ 0 && spa_feature_is_active(dp->dp_spa, SPA_FEATURE_LZ4_COMPRESS)) {
+ featureflags |= DMU_BACKUP_FEATURE_LZ4;
+ }
+
+ if (resumeobj != 0 || resumeoff != 0) {
+ featureflags |= DMU_BACKUP_FEATURE_RESUMING;
+ }
+
+ DMU_SET_FEATUREFLAGS(drr->drr_u.drr_begin.drr_versioninfo,
+ featureflags);
+
+ drr->drr_u.drr_begin.drr_creation_time =
+ dsl_dataset_phys(to_ds)->ds_creation_time;
+ drr->drr_u.drr_begin.drr_type = dmu_objset_type(os);
+ if (is_clone)
+ drr->drr_u.drr_begin.drr_flags |= DRR_FLAG_CLONE;
+ drr->drr_u.drr_begin.drr_toguid = dsl_dataset_phys(to_ds)->ds_guid;
+ if (dsl_dataset_phys(to_ds)->ds_flags & DS_FLAG_CI_DATASET)
+ drr->drr_u.drr_begin.drr_flags |= DRR_FLAG_CI_DATA;
+ if (zfs_send_set_freerecords_bit)
+ drr->drr_u.drr_begin.drr_flags |= DRR_FLAG_FREERECORDS;
+
+ if (ancestor_zb != NULL) {
+ drr->drr_u.drr_begin.drr_fromguid =
+ ancestor_zb->zbm_guid;
+ fromtxg = ancestor_zb->zbm_creation_txg;
+ }
+ dsl_dataset_name(to_ds, drr->drr_u.drr_begin.drr_toname);
+ if (!to_ds->ds_is_snapshot) {
+ (void) strlcat(drr->drr_u.drr_begin.drr_toname, "@--head--",
+ sizeof (drr->drr_u.drr_begin.drr_toname));
+ }
+
+ dsp = kmem_zalloc(sizeof (dmu_sendarg_t), KM_SLEEP);
+
+ dsp->dsa_drr = drr;
+ dsp->dsa_outfd = outfd;
+ dsp->dsa_proc = curproc;
+ dsp->dsa_td = curthread;
+ dsp->dsa_fp = fp;
+ dsp->dsa_os = os;
+ dsp->dsa_off = off;
+ dsp->dsa_toguid = dsl_dataset_phys(to_ds)->ds_guid;
+ dsp->dsa_pending_op = PENDING_NONE;
+ dsp->dsa_featureflags = featureflags;
+ dsp->dsa_resume_object = resumeobj;
+ dsp->dsa_resume_offset = resumeoff;
+
+ mutex_enter(&to_ds->ds_sendstream_lock);
+ list_insert_head(&to_ds->ds_sendstreams, dsp);
+ mutex_exit(&to_ds->ds_sendstream_lock);
+
+ dsl_dataset_long_hold(to_ds, FTAG);
+ dsl_pool_rele(dp, tag);
+
+ void *payload = NULL;
+ size_t payload_len = 0;
+ if (resumeobj != 0 || resumeoff != 0) {
+ dmu_object_info_t to_doi;
+ err = dmu_object_info(os, resumeobj, &to_doi);
+ if (err != 0)
+ goto out;
+ SET_BOOKMARK(&to_arg.resume, to_ds->ds_object, resumeobj, 0,
+ resumeoff / to_doi.doi_data_block_size);
+
+ nvlist_t *nvl = fnvlist_alloc();
+ fnvlist_add_uint64(nvl, "resume_object", resumeobj);
+ fnvlist_add_uint64(nvl, "resume_offset", resumeoff);
+ payload = fnvlist_pack(nvl, &payload_len);
+ drr->drr_payloadlen = payload_len;
+ fnvlist_free(nvl);
+ }
+
+ err = dump_record(dsp, payload, payload_len);
+ fnvlist_pack_free(payload, payload_len);
+ if (err != 0) {
+ err = dsp->dsa_err;
+ goto out;
+ }
+
+ err = bqueue_init(&to_arg.q, zfs_send_queue_length,
+ offsetof(struct send_block_record, ln));
+ to_arg.error_code = 0;
+ to_arg.cancel = B_FALSE;
+ to_arg.ds = to_ds;
+ to_arg.fromtxg = fromtxg;
+ to_arg.flags = TRAVERSE_PRE | TRAVERSE_PREFETCH;
+ (void) thread_create(NULL, 0, send_traverse_thread, &to_arg, 0, &p0,
+ TS_RUN, minclsyspri);
+
+ struct send_block_record *to_data;
+ to_data = bqueue_dequeue(&to_arg.q);
+
+ while (!to_data->eos_marker && err == 0) {
+ err = do_dump(dsp, to_data);
+ to_data = get_next_record(&to_arg.q, to_data);
+ if (issig(JUSTLOOKING) && issig(FORREAL))
+ err = EINTR;
+ }
+
+ if (err != 0) {
+ to_arg.cancel = B_TRUE;
+ while (!to_data->eos_marker) {
+ to_data = get_next_record(&to_arg.q, to_data);
+ }
+ }
+ kmem_free(to_data, sizeof (*to_data));
+
+ bqueue_destroy(&to_arg.q);
+
+ if (err == 0 && to_arg.error_code != 0)
+ err = to_arg.error_code;
+
+ if (err != 0)
+ goto out;
+
+ if (dsp->dsa_pending_op != PENDING_NONE)
+ if (dump_record(dsp, NULL, 0) != 0)
+ err = SET_ERROR(EINTR);
+
+ if (err != 0) {
+ if (err == EINTR && dsp->dsa_err != 0)
+ err = dsp->dsa_err;
+ goto out;
+ }
+
+ bzero(drr, sizeof (dmu_replay_record_t));
+ drr->drr_type = DRR_END;
+ drr->drr_u.drr_end.drr_checksum = dsp->dsa_zc;
+ drr->drr_u.drr_end.drr_toguid = dsp->dsa_toguid;
+
+ if (dump_record(dsp, NULL, 0) != 0)
+ err = dsp->dsa_err;
+
+out:
+ mutex_enter(&to_ds->ds_sendstream_lock);
+ list_remove(&to_ds->ds_sendstreams, dsp);
+ mutex_exit(&to_ds->ds_sendstream_lock);
+
+ VERIFY(err != 0 || (dsp->dsa_sent_begin && dsp->dsa_sent_end));
+
+ kmem_free(drr, sizeof (dmu_replay_record_t));
+ kmem_free(dsp, sizeof (dmu_sendarg_t));
+
+ dsl_dataset_long_rele(to_ds, FTAG);
+
+ return (err);
+}
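+
+/*
+ * The stream produced above is, in order: a DRR_BEGIN record (carrying an
+ * nvlist payload with resume_object/resume_offset when resuming), the
+ * records emitted by do_dump() in increasing (object, offset) order, any
+ * final pending aggregate record, and a DRR_END record carrying the
+ * accumulated checksum (dsa_zc).
+ */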
+
+int
+dmu_send_obj(const char *pool, uint64_t tosnap, uint64_t fromsnap,
+ boolean_t embedok, boolean_t large_block_ok, boolean_t compressok,
+#ifdef illumos
+ int outfd, vnode_t *vp, offset_t *off)
+#else
+ int outfd, struct file *fp, offset_t *off)
+#endif
+{
+ dsl_pool_t *dp;
+ dsl_dataset_t *ds;
+ dsl_dataset_t *fromds = NULL;
+ int err;
+
+ err = dsl_pool_hold(pool, FTAG, &dp);
+ if (err != 0)
+ return (err);
+
+ err = dsl_dataset_hold_obj(dp, tosnap, FTAG, &ds);
+ if (err != 0) {
+ dsl_pool_rele(dp, FTAG);
+ return (err);
+ }
+
+ if (fromsnap != 0) {
+ zfs_bookmark_phys_t zb;
+ boolean_t is_clone;
+
+ err = dsl_dataset_hold_obj(dp, fromsnap, FTAG, &fromds);
+ if (err != 0) {
+ dsl_dataset_rele(ds, FTAG);
+ dsl_pool_rele(dp, FTAG);
+ return (err);
+ }
+		if (!dsl_dataset_is_before(ds, fromds, 0)) {
+			/*
+			 * fromsnap must be an earlier snapshot from the
+			 * same fs as tosnap, or the origin's fs; bail out
+			 * here so the error is not clobbered by the
+			 * dmu_send_impl() call below.
+			 */
+			dsl_dataset_rele(fromds, FTAG);
+			dsl_dataset_rele(ds, FTAG);
+			dsl_pool_rele(dp, FTAG);
+			return (SET_ERROR(EXDEV));
+		}
+		zb.zbm_creation_time =
+		    dsl_dataset_phys(fromds)->ds_creation_time;
+		zb.zbm_creation_txg = dsl_dataset_phys(fromds)->ds_creation_txg;
+		zb.zbm_guid = dsl_dataset_phys(fromds)->ds_guid;
+		is_clone = (fromds->ds_dir != ds->ds_dir);
+		dsl_dataset_rele(fromds, FTAG);
+		err = dmu_send_impl(FTAG, dp, ds, &zb, is_clone,
+		    embedok, large_block_ok, compressok, outfd, 0, 0, fp, off);
+ } else {
+ err = dmu_send_impl(FTAG, dp, ds, NULL, B_FALSE,
+ embedok, large_block_ok, compressok, outfd, 0, 0, fp, off);
+ }
+ dsl_dataset_rele(ds, FTAG);
+ return (err);
+}
+
+int
+dmu_send(const char *tosnap, const char *fromsnap, boolean_t embedok,
+ boolean_t large_block_ok, boolean_t compressok, int outfd,
+ uint64_t resumeobj, uint64_t resumeoff,
+#ifdef illumos
+ vnode_t *vp, offset_t *off)
+#else
+ struct file *fp, offset_t *off)
+#endif
+{
+ dsl_pool_t *dp;
+ dsl_dataset_t *ds;
+ int err;
+ boolean_t owned = B_FALSE;
+
+ if (fromsnap != NULL && strpbrk(fromsnap, "@#") == NULL)
+ return (SET_ERROR(EINVAL));
+
+ err = dsl_pool_hold(tosnap, FTAG, &dp);
+ if (err != 0)
+ return (err);
+
+ if (strchr(tosnap, '@') == NULL && spa_writeable(dp->dp_spa)) {
+ /*
+ * We are sending a filesystem or volume. Ensure
+ * that it doesn't change by owning the dataset.
+ */
+ err = dsl_dataset_own(dp, tosnap, FTAG, &ds);
+ owned = B_TRUE;
+ } else {
+ err = dsl_dataset_hold(dp, tosnap, FTAG, &ds);
+ }
+ if (err != 0) {
+ dsl_pool_rele(dp, FTAG);
+ return (err);
+ }
+
+ if (fromsnap != NULL) {
+ zfs_bookmark_phys_t zb;
+ boolean_t is_clone = B_FALSE;
+ int fsnamelen = strchr(tosnap, '@') - tosnap;
+
+ /*
+ * If the fromsnap is in a different filesystem, then
+ * mark the send stream as a clone.
+ */
+ if (strncmp(tosnap, fromsnap, fsnamelen) != 0 ||
+ (fromsnap[fsnamelen] != '@' &&
+ fromsnap[fsnamelen] != '#')) {
+ is_clone = B_TRUE;
+ }
+
+ if (strchr(fromsnap, '@')) {
+ dsl_dataset_t *fromds;
+ err = dsl_dataset_hold(dp, fromsnap, FTAG, &fromds);
+ if (err == 0) {
+ if (!dsl_dataset_is_before(ds, fromds, 0))
+ err = SET_ERROR(EXDEV);
+ zb.zbm_creation_time =
+ dsl_dataset_phys(fromds)->ds_creation_time;
+ zb.zbm_creation_txg =
+ dsl_dataset_phys(fromds)->ds_creation_txg;
+ zb.zbm_guid = dsl_dataset_phys(fromds)->ds_guid;
+ is_clone = (ds->ds_dir != fromds->ds_dir);
+ dsl_dataset_rele(fromds, FTAG);
+ }
+ } else {
+ err = dsl_bookmark_lookup(dp, fromsnap, ds, &zb);
+ }
+ if (err != 0) {
+ dsl_dataset_rele(ds, FTAG);
+ dsl_pool_rele(dp, FTAG);
+ return (err);
+ }
+ err = dmu_send_impl(FTAG, dp, ds, &zb, is_clone,
+ embedok, large_block_ok, compressok,
+ outfd, resumeobj, resumeoff, fp, off);
+ } else {
+ err = dmu_send_impl(FTAG, dp, ds, NULL, B_FALSE,
+ embedok, large_block_ok, compressok,
+ outfd, resumeobj, resumeoff, fp, off);
+ }
+ if (owned)
+ dsl_dataset_disown(ds, FTAG);
+ else
+ dsl_dataset_rele(ds, FTAG);
+ return (err);
+}
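+
+/*
+ * Clone-detection example for the code above (dataset names are
+ * hypothetical): with tosnap "pool/fs@b" and fromsnap "pool/fs@a",
+ * fsnamelen is 7 and fromsnap[7] == '@', so the stream is a plain
+ * incremental; with fromsnap "pool/other@a" the prefixes differ and the
+ * stream is marked DRR_FLAG_CLONE. A '#' at that position instead selects
+ * the bookmark lookup path.
+ */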
+
+static int
+dmu_adjust_send_estimate_for_indirects(dsl_dataset_t *ds, uint64_t uncompressed,
+ uint64_t compressed, boolean_t stream_compressed, uint64_t *sizep)
+{
+ int err = 0;
+ uint64_t size;
+	/*
+	 * Assume that space (both on-disk and in-stream) is dominated by
+	 * data. We will adjust for indirect blocks and the copies property,
+	 * but ignore per-object space used (e.g., dnodes and DRR_OBJECT
+	 * records).
+	 */
+ uint64_t recordsize;
+ uint64_t record_count;
+ objset_t *os;
+ VERIFY0(dmu_objset_from_ds(ds, &os));
+
+ /* Assume all (uncompressed) blocks are recordsize. */
+ if (zfs_override_estimate_recordsize != 0) {
+ recordsize = zfs_override_estimate_recordsize;
+ } else if (os->os_phys->os_type == DMU_OST_ZVOL) {
+ err = dsl_prop_get_int_ds(ds,
+ zfs_prop_to_name(ZFS_PROP_VOLBLOCKSIZE), &recordsize);
+ } else {
+ err = dsl_prop_get_int_ds(ds,
+ zfs_prop_to_name(ZFS_PROP_RECORDSIZE), &recordsize);
+ }
+ if (err != 0)
+ return (err);
+ record_count = uncompressed / recordsize;
+
+ /*
+ * If we're estimating a send size for a compressed stream, use the
+ * compressed data size to estimate the stream size. Otherwise, use the
+ * uncompressed data size.
+ */
+ size = stream_compressed ? compressed : uncompressed;
+
+ /*
+ * Subtract out approximate space used by indirect blocks.
+ * Assume most space is used by data blocks (non-indirect, non-dnode).
+ * Assume no ditto blocks or internal fragmentation.
+ *
+ * Therefore, space used by indirect blocks is sizeof(blkptr_t) per
+ * block.
+ */
+ size -= record_count * sizeof (blkptr_t);
+
+ /* Add in the space for the record associated with each block. */
+ size += record_count * sizeof (dmu_replay_record_t);
+
+ *sizep = size;
+
+ return (0);
+}
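+
+/*
+ * Rough numbers for the adjustment above (assuming a 128K recordsize and
+ * 1 GiB of uncompressed data): record_count = 8192, so the estimate drops
+ * by 8192 * sizeof (blkptr_t) = 1 MiB for indirects and grows by
+ * 8192 * sizeof (dmu_replay_record_t) (about 312 bytes, so roughly
+ * 2.4 MiB) for the per-block record headers.
+ */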
+
+int
+dmu_send_estimate(dsl_dataset_t *ds, dsl_dataset_t *fromds,
+ boolean_t stream_compressed, uint64_t *sizep)
+{
+ dsl_pool_t *dp = ds->ds_dir->dd_pool;
+ int err;
+ uint64_t uncomp, comp;
+
+ ASSERT(dsl_pool_config_held(dp));
+
+ /* tosnap must be a snapshot */
+ if (!ds->ds_is_snapshot)
+ return (SET_ERROR(EINVAL));
+
+ /* fromsnap, if provided, must be a snapshot */
+ if (fromds != NULL && !fromds->ds_is_snapshot)
+ return (SET_ERROR(EINVAL));
+
+ /*
+ * fromsnap must be an earlier snapshot from the same fs as tosnap,
+ * or the origin's fs.
+ */
+ if (fromds != NULL && !dsl_dataset_is_before(ds, fromds, 0))
+ return (SET_ERROR(EXDEV));
+
+ /* Get compressed and uncompressed size estimates of changed data. */
+ if (fromds == NULL) {
+ uncomp = dsl_dataset_phys(ds)->ds_uncompressed_bytes;
+ comp = dsl_dataset_phys(ds)->ds_compressed_bytes;
+ } else {
+ uint64_t used;
+ err = dsl_dataset_space_written(fromds, ds,
+ &used, &comp, &uncomp);
+ if (err != 0)
+ return (err);
+ }
+
+ err = dmu_adjust_send_estimate_for_indirects(ds, uncomp, comp,
+ stream_compressed, sizep);
+ /*
+ * Add the size of the BEGIN and END records to the estimate.
+ */
+ *sizep += 2 * sizeof (dmu_replay_record_t);
+ return (err);
+}
+
+struct calculate_send_arg {
+ uint64_t uncompressed;
+ uint64_t compressed;
+};
+
+/*
+ * Simple callback used to traverse the blocks of a snapshot and sum their
+ * uncompressed and compressed sizes.
+ */
+/* ARGSUSED */
+static int
+dmu_calculate_send_traversal(spa_t *spa, zilog_t *zilog, const blkptr_t *bp,
+ const zbookmark_phys_t *zb, const dnode_phys_t *dnp, void *arg)
+{
+ struct calculate_send_arg *space = arg;
+ if (bp != NULL && !BP_IS_HOLE(bp)) {
+ space->uncompressed += BP_GET_UCSIZE(bp);
+ space->compressed += BP_GET_PSIZE(bp);
+ }
+ return (0);
+}
+
+/*
+ * Given a destination snapshot and a TXG, calculate the approximate size of
+ * a send stream sent from that TXG. from_txg may be zero, indicating that
+ * the whole snapshot will be sent.
+ */
+int
+dmu_send_estimate_from_txg(dsl_dataset_t *ds, uint64_t from_txg,
+ boolean_t stream_compressed, uint64_t *sizep)
+{
+ dsl_pool_t *dp = ds->ds_dir->dd_pool;
+ int err;
+ struct calculate_send_arg size = { 0 };
+
+ ASSERT(dsl_pool_config_held(dp));
+
+ /* tosnap must be a snapshot */
+ if (!ds->ds_is_snapshot)
+ return (SET_ERROR(EINVAL));
+
+ /* verify that from_txg is before the provided snapshot was taken */
+ if (from_txg >= dsl_dataset_phys(ds)->ds_creation_txg) {
+ return (SET_ERROR(EXDEV));
+ }
+
+	/*
+	 * Traverse the blocks of the snapshot with birth times after
+	 * from_txg, summing their uncompressed and compressed sizes.
+	 */
+ err = traverse_dataset(ds, from_txg, TRAVERSE_POST,
+ dmu_calculate_send_traversal, &size);
+ if (err)
+ return (err);
+
+ err = dmu_adjust_send_estimate_for_indirects(ds, size.uncompressed,
+ size.compressed, stream_compressed, sizep);
+ return (err);
+}
+
+typedef struct dmu_recv_begin_arg {
+ const char *drba_origin;
+ dmu_recv_cookie_t *drba_cookie;
+ cred_t *drba_cred;
+ uint64_t drba_snapobj;
+} dmu_recv_begin_arg_t;
+
+static int
+recv_begin_check_existing_impl(dmu_recv_begin_arg_t *drba, dsl_dataset_t *ds,
+ uint64_t fromguid)
+{
+ uint64_t val;
+ uint64_t children;
+ int error;
+ dsl_pool_t *dp = ds->ds_dir->dd_pool;
+
+ /* Temporary clone name must not exist. */
+ error = zap_lookup(dp->dp_meta_objset,
+ dsl_dir_phys(ds->ds_dir)->dd_child_dir_zapobj, recv_clone_name,
+ 8, 1, &val);
+ if (error != ENOENT)
+ return (error == 0 ? SET_ERROR(EBUSY) : error);
+
+ /* Resume state must not be set. */
+ if (dsl_dataset_has_resume_receive_state(ds))
+ return (SET_ERROR(EBUSY));
+
+ /* New snapshot name must not exist. */
+ error = zap_lookup(dp->dp_meta_objset,
+ dsl_dataset_phys(ds)->ds_snapnames_zapobj,
+ drba->drba_cookie->drc_tosnap, 8, 1, &val);
+ if (error != ENOENT)
+ return (error == 0 ? SET_ERROR(EEXIST) : error);
+
+ /* must not have children if receiving a ZVOL */
+ error = zap_count(dp->dp_meta_objset,
+ dsl_dir_phys(ds->ds_dir)->dd_child_dir_zapobj, &children);
+ if (error != 0)
+ return (error);
+ if (drba->drba_cookie->drc_drrb->drr_type != DMU_OST_ZFS &&
+ children > 0)
+ return (SET_ERROR(ZFS_ERR_WRONG_PARENT));
+
+ /*
+	 * Check snapshot limit before receiving. We'll recheck at the end,
+	 * but might as well abort before receiving if we're already over
+	 * the limit.
+ *
+ * Note that we do not check the file system limit with
+ * dsl_dir_fscount_check because the temporary %clones don't count
+ * against that limit.
+ */
+ error = dsl_fs_ss_limit_check(ds->ds_dir, 1, ZFS_PROP_SNAPSHOT_LIMIT,
+ NULL, drba->drba_cred);
+ if (error != 0)
+ return (error);
+
+ if (fromguid != 0) {
+ dsl_dataset_t *snap;
+ uint64_t obj = dsl_dataset_phys(ds)->ds_prev_snap_obj;
+
+ /* Find snapshot in this dir that matches fromguid. */
+ while (obj != 0) {
+ error = dsl_dataset_hold_obj(dp, obj, FTAG,
+ &snap);
+ if (error != 0)
+ return (SET_ERROR(ENODEV));
+ if (snap->ds_dir != ds->ds_dir) {
+ dsl_dataset_rele(snap, FTAG);
+ return (SET_ERROR(ENODEV));
+ }
+ if (dsl_dataset_phys(snap)->ds_guid == fromguid)
+ break;
+ obj = dsl_dataset_phys(snap)->ds_prev_snap_obj;
+ dsl_dataset_rele(snap, FTAG);
+ }
+ if (obj == 0)
+ return (SET_ERROR(ENODEV));
+
+ if (drba->drba_cookie->drc_force) {
+ drba->drba_snapobj = obj;
+ } else {
+ /*
+ * If we are not forcing, there must be no
+ * changes since fromsnap.
+ */
+ if (dsl_dataset_modified_since_snap(ds, snap)) {
+ dsl_dataset_rele(snap, FTAG);
+ return (SET_ERROR(ETXTBSY));
+ }
+ drba->drba_snapobj = ds->ds_prev->ds_object;
+ }
+
+ dsl_dataset_rele(snap, FTAG);
+ } else {
+ /* if full, then must be forced */
+ if (!drba->drba_cookie->drc_force)
+ return (SET_ERROR(EEXIST));
+ /* start from $ORIGIN@$ORIGIN, if supported */
+ drba->drba_snapobj = dp->dp_origin_snap != NULL ?
+ dp->dp_origin_snap->ds_object : 0;
+ }
+
+	return (0);
+}
+
+static int
+dmu_recv_begin_check(void *arg, dmu_tx_t *tx)
+{
+ dmu_recv_begin_arg_t *drba = arg;
+ dsl_pool_t *dp = dmu_tx_pool(tx);
+ struct drr_begin *drrb = drba->drba_cookie->drc_drrb;
+ uint64_t fromguid = drrb->drr_fromguid;
+ int flags = drrb->drr_flags;
+ int error;
+ uint64_t featureflags = DMU_GET_FEATUREFLAGS(drrb->drr_versioninfo);
+ dsl_dataset_t *ds;
+ const char *tofs = drba->drba_cookie->drc_tofs;
+
+ /* already checked */
+ ASSERT3U(drrb->drr_magic, ==, DMU_BACKUP_MAGIC);
+ ASSERT(!(featureflags & DMU_BACKUP_FEATURE_RESUMING));
+
+ if (DMU_GET_STREAM_HDRTYPE(drrb->drr_versioninfo) ==
+ DMU_COMPOUNDSTREAM ||
+ drrb->drr_type >= DMU_OST_NUMTYPES ||
+ ((flags & DRR_FLAG_CLONE) && drba->drba_origin == NULL))
+ return (SET_ERROR(EINVAL));
+
+ /* Verify pool version supports SA if SA_SPILL feature set */
+ if ((featureflags & DMU_BACKUP_FEATURE_SA_SPILL) &&
+ spa_version(dp->dp_spa) < SPA_VERSION_SA)
+ return (SET_ERROR(ENOTSUP));
+
+ if (drba->drba_cookie->drc_resumable &&
+ !spa_feature_is_enabled(dp->dp_spa, SPA_FEATURE_EXTENSIBLE_DATASET))
+ return (SET_ERROR(ENOTSUP));
+
+ /*
+ * The receiving code doesn't know how to translate a WRITE_EMBEDDED
+ * record to a plain WRITE record, so the pool must have the
+ * EMBEDDED_DATA feature enabled if the stream has WRITE_EMBEDDED
+ * records. Same with WRITE_EMBEDDED records that use LZ4 compression.
+ */
+ if ((featureflags & DMU_BACKUP_FEATURE_EMBED_DATA) &&
+ !spa_feature_is_enabled(dp->dp_spa, SPA_FEATURE_EMBEDDED_DATA))
+ return (SET_ERROR(ENOTSUP));
+ if ((featureflags & DMU_BACKUP_FEATURE_LZ4) &&
+ !spa_feature_is_enabled(dp->dp_spa, SPA_FEATURE_LZ4_COMPRESS))
+ return (SET_ERROR(ENOTSUP));
+
+ /*
+ * The receiving code doesn't know how to translate large blocks
+ * to smaller ones, so the pool must have the LARGE_BLOCKS
+ * feature enabled if the stream has LARGE_BLOCKS. Same with
+ * large dnodes.
+ */
+ if ((featureflags & DMU_BACKUP_FEATURE_LARGE_BLOCKS) &&
+ !spa_feature_is_enabled(dp->dp_spa, SPA_FEATURE_LARGE_BLOCKS))
+ return (SET_ERROR(ENOTSUP));
+ if ((featureflags & DMU_BACKUP_FEATURE_LARGE_DNODE) &&
+ !spa_feature_is_enabled(dp->dp_spa, SPA_FEATURE_LARGE_DNODE))
+ return (SET_ERROR(ENOTSUP));
+
+ error = dsl_dataset_hold(dp, tofs, FTAG, &ds);
+ if (error == 0) {
+ /* target fs already exists; recv into temp clone */
+
+ /* Can't recv a clone into an existing fs */
+ if (flags & DRR_FLAG_CLONE || drba->drba_origin) {
+ dsl_dataset_rele(ds, FTAG);
+ return (SET_ERROR(EINVAL));
+ }
+
+ error = recv_begin_check_existing_impl(drba, ds, fromguid);
+ dsl_dataset_rele(ds, FTAG);
+ } else if (error == ENOENT) {
+ /* target fs does not exist; must be a full backup or clone */
+ char buf[ZFS_MAX_DATASET_NAME_LEN];
+ objset_t *os;
+
+ /*
+ * If it's a non-clone incremental, we are missing the
+ * target fs, so fail the recv.
+ */
+ if (fromguid != 0 && !(flags & DRR_FLAG_CLONE ||
+ drba->drba_origin))
+ return (SET_ERROR(ENOENT));
+
+ /*
+ * If we're receiving a full send as a clone, and it doesn't
+ * contain all the necessary free records and freeobject
+ * records, reject it.
+ */
+ if (fromguid == 0 && drba->drba_origin &&
+ !(flags & DRR_FLAG_FREERECORDS))
+ return (SET_ERROR(EINVAL));
+
+ /* Open the parent of tofs */
+ ASSERT3U(strlen(tofs), <, sizeof (buf));
+ (void) strlcpy(buf, tofs, strrchr(tofs, '/') - tofs + 1);
+ error = dsl_dataset_hold(dp, buf, FTAG, &ds);
+ if (error != 0)
+ return (error);
+
+ /*
+ * Check filesystem and snapshot limits before receiving. We'll
+ * recheck snapshot limits again at the end (we create the
+ * filesystems and increment those counts during begin_sync).
+ */
+ error = dsl_fs_ss_limit_check(ds->ds_dir, 1,
+ ZFS_PROP_FILESYSTEM_LIMIT, NULL, drba->drba_cred);
+ if (error != 0) {
+ dsl_dataset_rele(ds, FTAG);
+ return (error);
+ }
+
+ error = dsl_fs_ss_limit_check(ds->ds_dir, 1,
+ ZFS_PROP_SNAPSHOT_LIMIT, NULL, drba->drba_cred);
+ if (error != 0) {
+ dsl_dataset_rele(ds, FTAG);
+ return (error);
+ }
+
+ /* can't recv below anything but filesystems (eg. no ZVOLs) */
+ error = dmu_objset_from_ds(ds, &os);
+ if (error != 0) {
+ dsl_dataset_rele(ds, FTAG);
+ return (error);
+ }
+ if (dmu_objset_type(os) != DMU_OST_ZFS) {
+ dsl_dataset_rele(ds, FTAG);
+ return (SET_ERROR(ZFS_ERR_WRONG_PARENT));
+ }
+
+ if (drba->drba_origin != NULL) {
+ dsl_dataset_t *origin;
+ error = dsl_dataset_hold(dp, drba->drba_origin,
+ FTAG, &origin);
+ if (error != 0) {
+ dsl_dataset_rele(ds, FTAG);
+ return (error);
+ }
+ if (!origin->ds_is_snapshot) {
+ dsl_dataset_rele(origin, FTAG);
+ dsl_dataset_rele(ds, FTAG);
+ return (SET_ERROR(EINVAL));
+ }
+ if (dsl_dataset_phys(origin)->ds_guid != fromguid &&
+ fromguid != 0) {
+ dsl_dataset_rele(origin, FTAG);
+ dsl_dataset_rele(ds, FTAG);
+ return (SET_ERROR(ENODEV));
+ }
+ dsl_dataset_rele(origin, FTAG);
+ }
+
+ dsl_dataset_rele(ds, FTAG);
+ error = 0;
+ }
+ return (error);
+}
+
+static void
+dmu_recv_begin_sync(void *arg, dmu_tx_t *tx)
+{
+ dmu_recv_begin_arg_t *drba = arg;
+ dsl_pool_t *dp = dmu_tx_pool(tx);
+ objset_t *mos = dp->dp_meta_objset;
+ struct drr_begin *drrb = drba->drba_cookie->drc_drrb;
+ const char *tofs = drba->drba_cookie->drc_tofs;
+ dsl_dataset_t *ds, *newds;
+ uint64_t dsobj;
+ int error;
+ uint64_t crflags = 0;
+
+ if (drrb->drr_flags & DRR_FLAG_CI_DATA)
+ crflags |= DS_FLAG_CI_DATASET;
+
+ error = dsl_dataset_hold(dp, tofs, FTAG, &ds);
+ if (error == 0) {
+ /* create temporary clone */
+ dsl_dataset_t *snap = NULL;
+ if (drba->drba_snapobj != 0) {
+ VERIFY0(dsl_dataset_hold_obj(dp,
+ drba->drba_snapobj, FTAG, &snap));
+ }
+ dsobj = dsl_dataset_create_sync(ds->ds_dir, recv_clone_name,
+ snap, crflags, drba->drba_cred, tx);
+ if (drba->drba_snapobj != 0)
+ dsl_dataset_rele(snap, FTAG);
+ dsl_dataset_rele(ds, FTAG);
+ } else {
+ dsl_dir_t *dd;
+ const char *tail;
+ dsl_dataset_t *origin = NULL;
+
+ VERIFY0(dsl_dir_hold(dp, tofs, FTAG, &dd, &tail));
+
+ if (drba->drba_origin != NULL) {
+ VERIFY0(dsl_dataset_hold(dp, drba->drba_origin,
+ FTAG, &origin));
+ }
+
+ /* Create new dataset. */
+ dsobj = dsl_dataset_create_sync(dd,
+ strrchr(tofs, '/') + 1,
+ origin, crflags, drba->drba_cred, tx);
+ if (origin != NULL)
+ dsl_dataset_rele(origin, FTAG);
+ dsl_dir_rele(dd, FTAG);
+ drba->drba_cookie->drc_newfs = B_TRUE;
+ }
+ VERIFY0(dsl_dataset_own_obj(dp, dsobj, dmu_recv_tag, &newds));
+
+ if (drba->drba_cookie->drc_resumable) {
+ dsl_dataset_zapify(newds, tx);
+ if (drrb->drr_fromguid != 0) {
+ VERIFY0(zap_add(mos, dsobj, DS_FIELD_RESUME_FROMGUID,
+ 8, 1, &drrb->drr_fromguid, tx));
+ }
+ VERIFY0(zap_add(mos, dsobj, DS_FIELD_RESUME_TOGUID,
+ 8, 1, &drrb->drr_toguid, tx));
+ VERIFY0(zap_add(mos, dsobj, DS_FIELD_RESUME_TONAME,
+ 1, strlen(drrb->drr_toname) + 1, drrb->drr_toname, tx));
+ uint64_t one = 1;
+ uint64_t zero = 0;
+ VERIFY0(zap_add(mos, dsobj, DS_FIELD_RESUME_OBJECT,
+ 8, 1, &one, tx));
+ VERIFY0(zap_add(mos, dsobj, DS_FIELD_RESUME_OFFSET,
+ 8, 1, &zero, tx));
+ VERIFY0(zap_add(mos, dsobj, DS_FIELD_RESUME_BYTES,
+ 8, 1, &zero, tx));
+ if (DMU_GET_FEATUREFLAGS(drrb->drr_versioninfo) &
+ DMU_BACKUP_FEATURE_LARGE_BLOCKS) {
+ VERIFY0(zap_add(mos, dsobj, DS_FIELD_RESUME_LARGEBLOCK,
+ 8, 1, &one, tx));
+ }
+ if (DMU_GET_FEATUREFLAGS(drrb->drr_versioninfo) &
+ DMU_BACKUP_FEATURE_EMBED_DATA) {
+ VERIFY0(zap_add(mos, dsobj, DS_FIELD_RESUME_EMBEDOK,
+ 8, 1, &one, tx));
+ }
+ if (DMU_GET_FEATUREFLAGS(drrb->drr_versioninfo) &
+ DMU_BACKUP_FEATURE_COMPRESSED) {
+ VERIFY0(zap_add(mos, dsobj, DS_FIELD_RESUME_COMPRESSOK,
+ 8, 1, &one, tx));
+ }
+ }
+
+ dmu_buf_will_dirty(newds->ds_dbuf, tx);
+ dsl_dataset_phys(newds)->ds_flags |= DS_FLAG_INCONSISTENT;
+
+ /*
+ * If we actually created a non-clone, we need to create the
+ * objset in our new dataset.
+ */
+ rrw_enter(&newds->ds_bp_rwlock, RW_READER, FTAG);
+ if (BP_IS_HOLE(dsl_dataset_get_blkptr(newds))) {
+ (void) dmu_objset_create_impl(dp->dp_spa,
+ newds, dsl_dataset_get_blkptr(newds), drrb->drr_type, tx);
+ }
+ rrw_exit(&newds->ds_bp_rwlock, FTAG);
+
+ drba->drba_cookie->drc_ds = newds;
+
+ spa_history_log_internal_ds(newds, "receive", tx, "");
+}
+
+static int
+dmu_recv_resume_begin_check(void *arg, dmu_tx_t *tx)
+{
+ dmu_recv_begin_arg_t *drba = arg;
+ dsl_pool_t *dp = dmu_tx_pool(tx);
+ struct drr_begin *drrb = drba->drba_cookie->drc_drrb;
+ int error;
+ uint64_t featureflags = DMU_GET_FEATUREFLAGS(drrb->drr_versioninfo);
+ dsl_dataset_t *ds;
+ const char *tofs = drba->drba_cookie->drc_tofs;
+
+ /* 6 extra bytes for /%recv */
+ char recvname[ZFS_MAX_DATASET_NAME_LEN + 6];
+
+ /* already checked */
+ ASSERT3U(drrb->drr_magic, ==, DMU_BACKUP_MAGIC);
+ ASSERT(featureflags & DMU_BACKUP_FEATURE_RESUMING);
+
+ if (DMU_GET_STREAM_HDRTYPE(drrb->drr_versioninfo) ==
+ DMU_COMPOUNDSTREAM ||
+ drrb->drr_type >= DMU_OST_NUMTYPES)
+ return (SET_ERROR(EINVAL));
+
+ /* Verify pool version supports SA if SA_SPILL feature set */
+ if ((featureflags & DMU_BACKUP_FEATURE_SA_SPILL) &&
+ spa_version(dp->dp_spa) < SPA_VERSION_SA)
+ return (SET_ERROR(ENOTSUP));
+
+ /*
+ * The receiving code doesn't know how to translate a WRITE_EMBEDDED
+ * record to a plain WRITE record, so the pool must have the
+ * EMBEDDED_DATA feature enabled if the stream has WRITE_EMBEDDED
+ * records. Same with WRITE_EMBEDDED records that use LZ4 compression.
+ */
+ if ((featureflags & DMU_BACKUP_FEATURE_EMBED_DATA) &&
+ !spa_feature_is_enabled(dp->dp_spa, SPA_FEATURE_EMBEDDED_DATA))
+ return (SET_ERROR(ENOTSUP));
+ if ((featureflags & DMU_BACKUP_FEATURE_LZ4) &&
+ !spa_feature_is_enabled(dp->dp_spa, SPA_FEATURE_LZ4_COMPRESS))
+ return (SET_ERROR(ENOTSUP));
+
+ /*
+ * The receiving code doesn't know how to translate large blocks
+ * to smaller ones, so the pool must have the LARGE_BLOCKS
+ * feature enabled if the stream has LARGE_BLOCKS. Same with
+ * large dnodes.
+ */
+ if ((featureflags & DMU_BACKUP_FEATURE_LARGE_BLOCKS) &&
+ !spa_feature_is_enabled(dp->dp_spa, SPA_FEATURE_LARGE_BLOCKS))
+ return (SET_ERROR(ENOTSUP));
+ if ((featureflags & DMU_BACKUP_FEATURE_LARGE_DNODE) &&
+ !spa_feature_is_enabled(dp->dp_spa, SPA_FEATURE_LARGE_DNODE))
+ return (SET_ERROR(ENOTSUP));
+
+ (void) snprintf(recvname, sizeof (recvname), "%s/%s",
+ tofs, recv_clone_name);
+
+ if (dsl_dataset_hold(dp, recvname, FTAG, &ds) != 0) {
+ /* %recv does not exist; continue in tofs */
+ error = dsl_dataset_hold(dp, tofs, FTAG, &ds);
+ if (error != 0)
+ return (error);
+ }
+
+ /* check that ds is marked inconsistent */
+ if (!DS_IS_INCONSISTENT(ds)) {
+ dsl_dataset_rele(ds, FTAG);
+ return (SET_ERROR(EINVAL));
+ }
+
+ /* check that there is resuming data, and that the toguid matches */
+ if (!dsl_dataset_is_zapified(ds)) {
+ dsl_dataset_rele(ds, FTAG);
+ return (SET_ERROR(EINVAL));
+ }
+ uint64_t val;
+ error = zap_lookup(dp->dp_meta_objset, ds->ds_object,
+ DS_FIELD_RESUME_TOGUID, sizeof (val), 1, &val);
+ if (error != 0 || drrb->drr_toguid != val) {
+ dsl_dataset_rele(ds, FTAG);
+ return (SET_ERROR(EINVAL));
+ }
+
+ /*
+ * Check if the receive is still running. If so, it will be owned.
+ * Note that nothing else can own the dataset (e.g. after the receive
+ * fails) because it will be marked inconsistent.
+ */
+ if (dsl_dataset_has_owner(ds)) {
+ dsl_dataset_rele(ds, FTAG);
+ return (SET_ERROR(EBUSY));
+ }
+
+ /* There should not be any snapshots of this fs yet. */
+ if (ds->ds_prev != NULL && ds->ds_prev->ds_dir == ds->ds_dir) {
+ dsl_dataset_rele(ds, FTAG);
+ return (SET_ERROR(EINVAL));
+ }
+
+ /*
+ * Note: resume point will be checked when we process the first WRITE
+ * record.
+ */
+
+	/* check that the incremental source (fromguid) matches */
+ val = 0;
+ (void) zap_lookup(dp->dp_meta_objset, ds->ds_object,
+ DS_FIELD_RESUME_FROMGUID, sizeof (val), 1, &val);
+ if (drrb->drr_fromguid != val) {
+ dsl_dataset_rele(ds, FTAG);
+ return (SET_ERROR(EINVAL));
+ }
+
+ dsl_dataset_rele(ds, FTAG);
+ return (0);
+}
+
+static void
+dmu_recv_resume_begin_sync(void *arg, dmu_tx_t *tx)
+{
+ dmu_recv_begin_arg_t *drba = arg;
+ dsl_pool_t *dp = dmu_tx_pool(tx);
+ const char *tofs = drba->drba_cookie->drc_tofs;
+ dsl_dataset_t *ds;
+ uint64_t dsobj;
+ /* 6 extra bytes for /%recv */
+ char recvname[ZFS_MAX_DATASET_NAME_LEN + 6];
+
+ (void) snprintf(recvname, sizeof (recvname), "%s/%s",
+ tofs, recv_clone_name);
+
+ if (dsl_dataset_hold(dp, recvname, FTAG, &ds) != 0) {
+ /* %recv does not exist; continue in tofs */
+ VERIFY0(dsl_dataset_hold(dp, tofs, FTAG, &ds));
+ drba->drba_cookie->drc_newfs = B_TRUE;
+ }
+
+ /* clear the inconsistent flag so that we can own it */
+ ASSERT(DS_IS_INCONSISTENT(ds));
+ dmu_buf_will_dirty(ds->ds_dbuf, tx);
+ dsl_dataset_phys(ds)->ds_flags &= ~DS_FLAG_INCONSISTENT;
+ dsobj = ds->ds_object;
+ dsl_dataset_rele(ds, FTAG);
+
+ VERIFY0(dsl_dataset_own_obj(dp, dsobj, dmu_recv_tag, &ds));
+
+ dmu_buf_will_dirty(ds->ds_dbuf, tx);
+ dsl_dataset_phys(ds)->ds_flags |= DS_FLAG_INCONSISTENT;
+
+ rrw_enter(&ds->ds_bp_rwlock, RW_READER, FTAG);
+ ASSERT(!BP_IS_HOLE(dsl_dataset_get_blkptr(ds)));
+ rrw_exit(&ds->ds_bp_rwlock, FTAG);
+
+ drba->drba_cookie->drc_ds = ds;
+
+ spa_history_log_internal_ds(ds, "resume receive", tx, "");
+}
+
+/*
+ * NB: callers *MUST* call dmu_recv_stream() if dmu_recv_begin()
+ * succeeds; otherwise we will leak the holds on the datasets.
+ */
+int
+dmu_recv_begin(char *tofs, char *tosnap, dmu_replay_record_t *drr_begin,
+ boolean_t force, boolean_t resumable, char *origin, dmu_recv_cookie_t *drc)
+{
+ dmu_recv_begin_arg_t drba = { 0 };
+
+ bzero(drc, sizeof (dmu_recv_cookie_t));
+ drc->drc_drr_begin = drr_begin;
+ drc->drc_drrb = &drr_begin->drr_u.drr_begin;
+ drc->drc_tosnap = tosnap;
+ drc->drc_tofs = tofs;
+ drc->drc_force = force;
+ drc->drc_resumable = resumable;
+ drc->drc_cred = CRED();
+ drc->drc_clone = (origin != NULL);
+
+ if (drc->drc_drrb->drr_magic == BSWAP_64(DMU_BACKUP_MAGIC)) {
+ drc->drc_byteswap = B_TRUE;
+ (void) fletcher_4_incremental_byteswap(drr_begin,
+ sizeof (dmu_replay_record_t), &drc->drc_cksum);
+ byteswap_record(drr_begin);
+ } else if (drc->drc_drrb->drr_magic == DMU_BACKUP_MAGIC) {
+ (void) fletcher_4_incremental_native(drr_begin,
+ sizeof (dmu_replay_record_t), &drc->drc_cksum);
+ } else {
+ return (SET_ERROR(EINVAL));
+ }
+
+ drba.drba_origin = origin;
+ drba.drba_cookie = drc;
+ drba.drba_cred = CRED();
+
+ if (DMU_GET_FEATUREFLAGS(drc->drc_drrb->drr_versioninfo) &
+ DMU_BACKUP_FEATURE_RESUMING) {
+ return (dsl_sync_task(tofs,
+ dmu_recv_resume_begin_check, dmu_recv_resume_begin_sync,
+ &drba, 5, ZFS_SPACE_CHECK_NORMAL));
+ } else {
+ return (dsl_sync_task(tofs,
+ dmu_recv_begin_check, dmu_recv_begin_sync,
+ &drba, 5, ZFS_SPACE_CHECK_NORMAL));
+ }
+}
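+
+/*
+ * Byteswap detection above hinges on drr_magic: a stream written on an
+ * opposite-endian machine presents BSWAP_64(DMU_BACKUP_MAGIC) instead of
+ * DMU_BACKUP_MAGIC, in which case every subsequent record header is
+ * byteswapped (see byteswap_record()) and the fletcher4 checksum is
+ * accumulated with the byteswapping variant.
+ */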
+
+struct receive_record_arg {
+ dmu_replay_record_t header;
+ void *payload; /* Pointer to a buffer containing the payload */
+ /*
+ * If the record is a write, pointer to the arc_buf_t containing the
+ * payload.
+ */
+ arc_buf_t *write_buf;
+ int payload_size;
+ uint64_t bytes_read; /* bytes read from stream when record created */
+ boolean_t eos_marker; /* Marks the end of the stream */
+ bqueue_node_t node;
+};
+
+struct receive_writer_arg {
+ objset_t *os;
+ boolean_t byteswap;
+ bqueue_t q;
+
+	/*
+	 * These three fields are used to signal to the main thread that
+	 * we're done.
+	 */
+ kmutex_t mutex;
+ kcondvar_t cv;
+ boolean_t done;
+
+ int err;
+ /* A map from guid to dataset to help handle dedup'd streams. */
+ avl_tree_t *guid_to_ds_map;
+ boolean_t resumable;
+ uint64_t last_object;
+ uint64_t last_offset;
+ uint64_t max_object; /* highest object ID referenced in stream */
+ uint64_t bytes_read; /* bytes read when current record created */
+};
+
+struct objlist {
+ list_t list; /* List of struct receive_objnode. */
+ /*
+ * Last object looked up. Used to assert that objects are being looked
+ * up in ascending order.
+ */
+ uint64_t last_lookup;
+};
+
+struct receive_objnode {
+ list_node_t node;
+ uint64_t object;
+};
+
+struct receive_arg {
+ objset_t *os;
+ kthread_t *td;
+ struct file *fp;
+ uint64_t voff; /* The current offset in the stream */
+ uint64_t bytes_read;
+ /*
+ * A record that has had its payload read in, but hasn't yet been handed
+ * off to the worker thread.
+ */
+ struct receive_record_arg *rrd;
+ /* A record that has had its header read in, but not its payload. */
+ struct receive_record_arg *next_rrd;
+ zio_cksum_t cksum;
+ zio_cksum_t prev_cksum;
+ int err;
+ boolean_t byteswap;
+ /* Sorted list of objects not to issue prefetches for. */
+ struct objlist ignore_objlist;
+};
+
+typedef struct guid_map_entry {
+ uint64_t guid;
+ dsl_dataset_t *gme_ds;
+ avl_node_t avlnode;
+} guid_map_entry_t;
+
+static int
+guid_compare(const void *arg1, const void *arg2)
+{
+ const guid_map_entry_t *gmep1 = (const guid_map_entry_t *)arg1;
+ const guid_map_entry_t *gmep2 = (const guid_map_entry_t *)arg2;
+
+ return (AVL_CMP(gmep1->guid, gmep2->guid));
+}
+
+static void
+free_guid_map_onexit(void *arg)
+{
+ avl_tree_t *ca = arg;
+ void *cookie = NULL;
+ guid_map_entry_t *gmep;
+
+ while ((gmep = avl_destroy_nodes(ca, &cookie)) != NULL) {
+ dsl_dataset_long_rele(gmep->gme_ds, gmep);
+ dsl_dataset_rele(gmep->gme_ds, gmep);
+ kmem_free(gmep, sizeof (guid_map_entry_t));
+ }
+ avl_destroy(ca);
+ kmem_free(ca, sizeof (avl_tree_t));
+}
+
+static int
+restore_bytes(struct receive_arg *ra, void *buf, int len, off_t off,
+    ssize_t *resid)
+{
+ struct uio auio;
+ struct iovec aiov;
+ int error;
+
+ aiov.iov_base = buf;
+ aiov.iov_len = len;
+ auio.uio_iov = &aiov;
+ auio.uio_iovcnt = 1;
+ auio.uio_resid = len;
+ auio.uio_segflg = UIO_SYSSPACE;
+ auio.uio_rw = UIO_READ;
+ auio.uio_offset = off;
+ auio.uio_td = ra->td;
+#ifdef _KERNEL
+ error = fo_read(ra->fp, &auio, ra->td->td_ucred, FOF_OFFSET, ra->td);
+#else
+ fprintf(stderr, "%s: returning EOPNOTSUPP\n", __func__);
+ error = EOPNOTSUPP;
+#endif
+ *resid = auio.uio_resid;
+ return (error);
+}
+
+static int
+receive_read(struct receive_arg *ra, int len, void *buf)
+{
+ int done = 0;
+
+ /*
+ * The code doesn't rely on this (lengths being multiples of 8). See
+ * comment in dump_bytes.
+ */
+ ASSERT0(len % 8);
+
+ while (done < len) {
+ ssize_t resid;
+
+ ra->err = restore_bytes(ra, buf + done,
+ len - done, ra->voff, &resid);
+
+ if (resid == len - done) {
+ /*
+ * Note: ECKSUM indicates that the receive
+ * was interrupted and can potentially be resumed.
+ */
+ ra->err = SET_ERROR(ECKSUM);
+ }
+ ra->voff += len - done - resid;
+ done = len - resid;
+ if (ra->err != 0)
+ return (ra->err);
+ }
+
+ ra->bytes_read += len;
+
+ ASSERT3U(done, ==, len);
+ return (0);
+}
+
+noinline static void
+byteswap_record(dmu_replay_record_t *drr)
+{
+#define DO64(X) (drr->drr_u.X = BSWAP_64(drr->drr_u.X))
+#define DO32(X) (drr->drr_u.X = BSWAP_32(drr->drr_u.X))
+ drr->drr_type = BSWAP_32(drr->drr_type);
+ drr->drr_payloadlen = BSWAP_32(drr->drr_payloadlen);
+
+ switch (drr->drr_type) {
+ case DRR_BEGIN:
+ DO64(drr_begin.drr_magic);
+ DO64(drr_begin.drr_versioninfo);
+ DO64(drr_begin.drr_creation_time);
+ DO32(drr_begin.drr_type);
+ DO32(drr_begin.drr_flags);
+ DO64(drr_begin.drr_toguid);
+ DO64(drr_begin.drr_fromguid);
+ break;
+ case DRR_OBJECT:
+ DO64(drr_object.drr_object);
+ DO32(drr_object.drr_type);
+ DO32(drr_object.drr_bonustype);
+ DO32(drr_object.drr_blksz);
+ DO32(drr_object.drr_bonuslen);
+ DO64(drr_object.drr_toguid);
+ break;
+ case DRR_FREEOBJECTS:
+ DO64(drr_freeobjects.drr_firstobj);
+ DO64(drr_freeobjects.drr_numobjs);
+ DO64(drr_freeobjects.drr_toguid);
+ break;
+ case DRR_WRITE:
+ DO64(drr_write.drr_object);
+ DO32(drr_write.drr_type);
+ DO64(drr_write.drr_offset);
+ DO64(drr_write.drr_logical_size);
+ DO64(drr_write.drr_toguid);
+ ZIO_CHECKSUM_BSWAP(&drr->drr_u.drr_write.drr_key.ddk_cksum);
+ DO64(drr_write.drr_key.ddk_prop);
+ DO64(drr_write.drr_compressed_size);
+ break;
+ case DRR_WRITE_BYREF:
+ DO64(drr_write_byref.drr_object);
+ DO64(drr_write_byref.drr_offset);
+ DO64(drr_write_byref.drr_length);
+ DO64(drr_write_byref.drr_toguid);
+ DO64(drr_write_byref.drr_refguid);
+ DO64(drr_write_byref.drr_refobject);
+ DO64(drr_write_byref.drr_refoffset);
+ ZIO_CHECKSUM_BSWAP(&drr->drr_u.drr_write_byref.
+ drr_key.ddk_cksum);
+ DO64(drr_write_byref.drr_key.ddk_prop);
+ break;
+ case DRR_WRITE_EMBEDDED:
+ DO64(drr_write_embedded.drr_object);
+ DO64(drr_write_embedded.drr_offset);
+ DO64(drr_write_embedded.drr_length);
+ DO64(drr_write_embedded.drr_toguid);
+ DO32(drr_write_embedded.drr_lsize);
+ DO32(drr_write_embedded.drr_psize);
+ break;
+ case DRR_FREE:
+ DO64(drr_free.drr_object);
+ DO64(drr_free.drr_offset);
+ DO64(drr_free.drr_length);
+ DO64(drr_free.drr_toguid);
+ break;
+ case DRR_SPILL:
+ DO64(drr_spill.drr_object);
+ DO64(drr_spill.drr_length);
+ DO64(drr_spill.drr_toguid);
+ break;
+ case DRR_END:
+ DO64(drr_end.drr_toguid);
+ ZIO_CHECKSUM_BSWAP(&drr->drr_u.drr_end.drr_checksum);
+ break;
+ }
+
+ if (drr->drr_type != DRR_BEGIN) {
+ ZIO_CHECKSUM_BSWAP(&drr->drr_u.drr_checksum.drr_checksum);
+ }
+
+#undef DO64
+#undef DO32
+}
+
+static inline uint8_t
+deduce_nblkptr(dmu_object_type_t bonus_type, uint64_t bonus_size)
+{
+ if (bonus_type == DMU_OT_SA) {
+ return (1);
+ } else {
+ return (1 +
+ ((DN_OLD_MAX_BONUSLEN -
+ MIN(DN_OLD_MAX_BONUSLEN, bonus_size)) >> SPA_BLKPTRSHIFT));
+ }
+}
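+
+/*
+ * Example values for deduce_nblkptr() (legacy 512-byte dnode layout,
+ * where DN_OLD_MAX_BONUSLEN is 320 and SPA_BLKPTRSHIFT is 7): an SA bonus
+ * always yields 1 blkptr; a 64-byte bonus yields
+ * 1 + ((320 - 64) >> 7) == 3 blkptrs; a full 320-byte bonus yields 1.
+ */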
+
+static void
+save_resume_state(struct receive_writer_arg *rwa,
+ uint64_t object, uint64_t offset, dmu_tx_t *tx)
+{
+ int txgoff = dmu_tx_get_txg(tx) & TXG_MASK;
+
+ if (!rwa->resumable)
+ return;
+
+ /*
+ * We use ds_resume_bytes[] != 0 to indicate that we need to
+ * update this on disk, so it must not be 0.
+ */
+ ASSERT(rwa->bytes_read != 0);
+
+ /*
+ * We only resume from write records, which have a valid
+ * (non-meta-dnode) object number.
+ */
+ ASSERT(object != 0);
+
+	/*
+	 * For resuming to work correctly, we must receive records in order,
+	 * sorted by (object, offset). This is checked by the callers, but
+	 * assert it here for good measure.
+	 */
+ ASSERT3U(object, >=, rwa->os->os_dsl_dataset->ds_resume_object[txgoff]);
+ ASSERT(object != rwa->os->os_dsl_dataset->ds_resume_object[txgoff] ||
+ offset >= rwa->os->os_dsl_dataset->ds_resume_offset[txgoff]);
+ ASSERT3U(rwa->bytes_read, >=,
+ rwa->os->os_dsl_dataset->ds_resume_bytes[txgoff]);
+
+ rwa->os->os_dsl_dataset->ds_resume_object[txgoff] = object;
+ rwa->os->os_dsl_dataset->ds_resume_offset[txgoff] = offset;
+ rwa->os->os_dsl_dataset->ds_resume_bytes[txgoff] = rwa->bytes_read;
+}
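+
+/*
+ * txgoff above is the slot in the per-txg resume-state arrays: TXG_MASK
+ * is TXG_SIZE - 1 == 3, so e.g. a record syncing in txg 1000 saves its
+ * resume state in slot 1000 & 3 == 0.
+ */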
+
+noinline static int
+receive_object(struct receive_writer_arg *rwa, struct drr_object *drro,
+ void *data)
+{
+ dmu_object_info_t doi;
+ dmu_tx_t *tx;
+ uint64_t object;
+ int err;
+ uint8_t dn_slots = drro->drr_dn_slots != 0 ?
+ drro->drr_dn_slots : DNODE_MIN_SLOTS;
+
+ if (drro->drr_type == DMU_OT_NONE ||
+ !DMU_OT_IS_VALID(drro->drr_type) ||
+ !DMU_OT_IS_VALID(drro->drr_bonustype) ||
+ drro->drr_checksumtype >= ZIO_CHECKSUM_FUNCTIONS ||
+ drro->drr_compress >= ZIO_COMPRESS_FUNCTIONS ||
+ P2PHASE(drro->drr_blksz, SPA_MINBLOCKSIZE) ||
+ drro->drr_blksz < SPA_MINBLOCKSIZE ||
+ drro->drr_blksz > spa_maxblocksize(dmu_objset_spa(rwa->os)) ||
+ drro->drr_bonuslen >
+ DN_BONUS_SIZE(spa_maxdnodesize(dmu_objset_spa(rwa->os))) ||
+ dn_slots >
+ (spa_maxdnodesize(dmu_objset_spa(rwa->os)) >> DNODE_SHIFT)) {
+ return (SET_ERROR(EINVAL));
+ }
+
+ err = dmu_object_info(rwa->os, drro->drr_object, &doi);
+
+ if (err != 0 && err != ENOENT && err != EEXIST)
+ return (SET_ERROR(EINVAL));
+
+ if (drro->drr_object > rwa->max_object)
+ rwa->max_object = drro->drr_object;
+
+	/*
+	 * If we are losing blkptrs or changing the block size, this must
+	 * be a new file instance. We must clear out the previous file
+	 * contents before we can change this type of metadata in the dnode.
+	 */
+ if (err == 0) {
+ int nblkptr;
+
+ object = drro->drr_object;
+
+ nblkptr = deduce_nblkptr(drro->drr_bonustype,
+ drro->drr_bonuslen);
+
+ if (drro->drr_blksz != doi.doi_data_block_size ||
+ nblkptr < doi.doi_nblkptr ||
+ dn_slots != doi.doi_dnodesize >> DNODE_SHIFT) {
+ err = dmu_free_long_range(rwa->os, drro->drr_object,
+ 0, DMU_OBJECT_END);
+ if (err != 0)
+ return (SET_ERROR(EINVAL));
+ }
+ } else if (err == EEXIST) {
+ /*
+ * The object requested is currently an interior slot of a
+ * multi-slot dnode. This will be resolved when the next txg
+ * is synced out, since the send stream will have told us
+ * to free this slot when we freed the associated dnode
+ * earlier in the stream.
+ */
+ txg_wait_synced(dmu_objset_pool(rwa->os), 0);
+ object = drro->drr_object;
+ } else {
+ /* object is free and we are about to allocate a new one */
+ object = DMU_NEW_OBJECT;
+ }
+
+ /*
+ * If this is a multi-slot dnode there is a chance that this
+ * object will expand into a slot that is already used by
+ * another object from the previous snapshot. We must free
+ * these objects before we attempt to allocate the new dnode.
+ */
+ if (dn_slots > 1) {
+ boolean_t need_sync = B_FALSE;
+
+ for (uint64_t slot = drro->drr_object + 1;
+ slot < drro->drr_object + dn_slots;
+ slot++) {
+ dmu_object_info_t slot_doi;
+
+ err = dmu_object_info(rwa->os, slot, &slot_doi);
+ if (err == ENOENT || err == EEXIST)
+ continue;
+ else if (err != 0)
+ return (err);
+
+ err = dmu_free_long_object(rwa->os, slot);
+
+ if (err != 0)
+ return (err);
+
+ need_sync = B_TRUE;
+ }
+
+ if (need_sync)
+ txg_wait_synced(dmu_objset_pool(rwa->os), 0);
+ }
+
+ tx = dmu_tx_create(rwa->os);
+ dmu_tx_hold_bonus(tx, object);
+ err = dmu_tx_assign(tx, TXG_WAIT);
+ if (err != 0) {
+ dmu_tx_abort(tx);
+ return (err);
+ }
+
+ if (object == DMU_NEW_OBJECT) {
+ /* currently free, want to be allocated */
+ err = dmu_object_claim_dnsize(rwa->os, drro->drr_object,
+ drro->drr_type, drro->drr_blksz,
+ drro->drr_bonustype, drro->drr_bonuslen,
+ dn_slots << DNODE_SHIFT, tx);
+ } else if (drro->drr_type != doi.doi_type ||
+ drro->drr_blksz != doi.doi_data_block_size ||
+ drro->drr_bonustype != doi.doi_bonus_type ||
+ drro->drr_bonuslen != doi.doi_bonus_size ||
+	    dn_slots != (doi.doi_dnodesize >> DNODE_SHIFT)) {
+ /* currently allocated, but with different properties */
+ err = dmu_object_reclaim_dnsize(rwa->os, drro->drr_object,
+ drro->drr_type, drro->drr_blksz,
+ drro->drr_bonustype, drro->drr_bonuslen,
+		    dn_slots << DNODE_SHIFT, tx);
+ }
+ if (err != 0) {
+ dmu_tx_commit(tx);
+ return (SET_ERROR(EINVAL));
+ }
+
+ dmu_object_set_checksum(rwa->os, drro->drr_object,
+ drro->drr_checksumtype, tx);
+ dmu_object_set_compress(rwa->os, drro->drr_object,
+ drro->drr_compress, tx);
+
+ if (data != NULL) {
+ dmu_buf_t *db;
+
+ VERIFY0(dmu_bonus_hold(rwa->os, drro->drr_object, FTAG, &db));
+ dmu_buf_will_dirty(db, tx);
+
+ ASSERT3U(db->db_size, >=, drro->drr_bonuslen);
+ bcopy(data, db->db_data, drro->drr_bonuslen);
+ if (rwa->byteswap) {
+ dmu_object_byteswap_t byteswap =
+ DMU_OT_BYTESWAP(drro->drr_bonustype);
+ dmu_ot_byteswap[byteswap].ob_func(db->db_data,
+ drro->drr_bonuslen);
+ }
+ dmu_buf_rele(db, FTAG);
+ }
+ dmu_tx_commit(tx);
+
+ return (0);
+}
+
+/* ARGSUSED */
+noinline static int
+receive_freeobjects(struct receive_writer_arg *rwa,
+ struct drr_freeobjects *drrfo)
+{
+ uint64_t obj;
+ int next_err = 0;
+
+ if (drrfo->drr_firstobj + drrfo->drr_numobjs < drrfo->drr_firstobj)
+ return (SET_ERROR(EINVAL));
+
+ for (obj = drrfo->drr_firstobj == 0 ? 1 : drrfo->drr_firstobj;
+ obj < drrfo->drr_firstobj + drrfo->drr_numobjs && next_err == 0;
+ next_err = dmu_object_next(rwa->os, &obj, FALSE, 0)) {
+ dmu_object_info_t doi;
+ int err;
+
+ err = dmu_object_info(rwa->os, obj, NULL);
+ if (err == ENOENT)
+ continue;
+ else if (err != 0)
+ return (err);
+
+ err = dmu_free_long_object(rwa->os, obj);
+ if (err != 0)
+ return (err);
+
+ if (obj > rwa->max_object)
+ rwa->max_object = obj;
+ }
+ if (next_err != ESRCH)
+ return (next_err);
+ return (0);
+}
+
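+/*
+ * Handle a DRR_WRITE record: enforce that records arrive in increasing
+ * (object, offset) order so the stream is resumable, byteswap the loaned
+ * ARC buffer if needed, and hand it to the DMU via dmu_assign_arcbuf(),
+ * which consumes the buffer on success.
+ */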
+noinline static int
+receive_write(struct receive_writer_arg *rwa, struct drr_write *drrw,
+ arc_buf_t *abuf)
+{
+ dmu_tx_t *tx;
+ int err;
+
+ if (drrw->drr_offset + drrw->drr_logical_size < drrw->drr_offset ||
+ !DMU_OT_IS_VALID(drrw->drr_type))
+ return (SET_ERROR(EINVAL));
+
+ /*
+ * For resuming to work, records must be in increasing order
+ * by (object, offset).
+ */
+ if (drrw->drr_object < rwa->last_object ||
+ (drrw->drr_object == rwa->last_object &&
+ drrw->drr_offset < rwa->last_offset)) {
+ return (SET_ERROR(EINVAL));
+ }
+ rwa->last_object = drrw->drr_object;
+ rwa->last_offset = drrw->drr_offset;
+
+ if (rwa->last_object > rwa->max_object)
+ rwa->max_object = rwa->last_object;
+
+ if (dmu_object_info(rwa->os, drrw->drr_object, NULL) != 0)
+ return (SET_ERROR(EINVAL));
+
+ tx = dmu_tx_create(rwa->os);
+ dmu_tx_hold_write(tx, drrw->drr_object,
+ drrw->drr_offset, drrw->drr_logical_size);
+ err = dmu_tx_assign(tx, TXG_WAIT);
+ if (err != 0) {
+ dmu_tx_abort(tx);
+ return (err);
+ }
+ if (rwa->byteswap) {
+ dmu_object_byteswap_t byteswap =
+ DMU_OT_BYTESWAP(drrw->drr_type);
+ dmu_ot_byteswap[byteswap].ob_func(abuf->b_data,
+ DRR_WRITE_PAYLOAD_SIZE(drrw));
+ }
+
+ /* use the bonus buf to look up the dnode in dmu_assign_arcbuf */
+ dmu_buf_t *bonus;
+ if (dmu_bonus_hold(rwa->os, drrw->drr_object, FTAG, &bonus) != 0) {
+ /* the tx is already assigned and so must be committed, not aborted */
+ dmu_tx_commit(tx);
+ return (SET_ERROR(EINVAL));
+ }
+ dmu_assign_arcbuf(bonus, drrw->drr_offset, abuf, tx);
+
+ /*
+ * Note: If the receive fails, we want the resume stream to start
+ * with the same record that we last successfully received (as opposed
+ * to the next record), so that we can verify that we are
+ * resuming from the correct location.
+ */
+ save_resume_state(rwa, drrw->drr_object, drrw->drr_offset, tx);
+ dmu_tx_commit(tx);
+ dmu_buf_rele(bonus, FTAG);
+
+ return (0);
+}
+
+/*
+ * Handle a DRR_WRITE_BYREF record. This record is used in dedup'ed
+ * streams to refer to a copy of the data that is already on the
+ * system because it came in earlier in the stream. This function
+ * finds the earlier copy of the data, and uses that copy instead of
+ * data from the stream to fulfill this write.
+ */
+static int
+receive_write_byref(struct receive_writer_arg *rwa,
+ struct drr_write_byref *drrwbr)
+{
+ dmu_tx_t *tx;
+ int err;
+ guid_map_entry_t gmesrch;
+ guid_map_entry_t *gmep;
+ avl_index_t where;
+ objset_t *ref_os = NULL;
+ dmu_buf_t *dbp;
+
+ if (drrwbr->drr_offset + drrwbr->drr_length < drrwbr->drr_offset)
+ return (SET_ERROR(EINVAL));
+
+ /*
+ * If the GUID of the referenced dataset is different from the
+ * GUID of the target dataset, find the referenced dataset.
+ */
+ if (drrwbr->drr_toguid != drrwbr->drr_refguid) {
+ gmesrch.guid = drrwbr->drr_refguid;
+ if ((gmep = avl_find(rwa->guid_to_ds_map, &gmesrch,
+ &where)) == NULL) {
+ return (SET_ERROR(EINVAL));
+ }
+ if (dmu_objset_from_ds(gmep->gme_ds, &ref_os))
+ return (SET_ERROR(EINVAL));
+ } else {
+ ref_os = rwa->os;
+ }
+
+ if (drrwbr->drr_object > rwa->max_object)
+ rwa->max_object = drrwbr->drr_object;
+
+ err = dmu_buf_hold(ref_os, drrwbr->drr_refobject,
+ drrwbr->drr_refoffset, FTAG, &dbp, DMU_READ_PREFETCH);
+ if (err != 0)
+ return (err);
+
+ tx = dmu_tx_create(rwa->os);
+
+ dmu_tx_hold_write(tx, drrwbr->drr_object,
+ drrwbr->drr_offset, drrwbr->drr_length);
+ err = dmu_tx_assign(tx, TXG_WAIT);
+ if (err != 0) {
+ dmu_tx_abort(tx);
+ return (err);
+ }
+ dmu_write(rwa->os, drrwbr->drr_object,
+ drrwbr->drr_offset, drrwbr->drr_length, dbp->db_data, tx);
+ dmu_buf_rele(dbp, FTAG);
+
+ /* See comment in receive_write. */
+ save_resume_state(rwa, drrwbr->drr_object, drrwbr->drr_offset, tx);
+ dmu_tx_commit(tx);
+ return (0);
+}
+
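+/*
+ * Handle a DRR_WRITE_EMBEDDED record: the (possibly compressed) payload
+ * is small enough to be stored directly in a block pointer, so it is
+ * written with dmu_write_embedded() rather than into a regular data block.
+ */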
+static int
+receive_write_embedded(struct receive_writer_arg *rwa,
+ struct drr_write_embedded *drrwe, void *data)
+{
+ dmu_tx_t *tx;
+ int err;
+
+ if (drrwe->drr_offset + drrwe->drr_length < drrwe->drr_offset)
+ return (EINVAL);
+
+ if (drrwe->drr_psize > BPE_PAYLOAD_SIZE)
+ return (EINVAL);
+
+ if (drrwe->drr_etype >= NUM_BP_EMBEDDED_TYPES)
+ return (EINVAL);
+ if (drrwe->drr_compression >= ZIO_COMPRESS_FUNCTIONS)
+ return (EINVAL);
+
+ if (drrwe->drr_object > rwa->max_object)
+ rwa->max_object = drrwe->drr_object;
+
+ tx = dmu_tx_create(rwa->os);
+
+ dmu_tx_hold_write(tx, drrwe->drr_object,
+ drrwe->drr_offset, drrwe->drr_length);
+ err = dmu_tx_assign(tx, TXG_WAIT);
+ if (err != 0) {
+ dmu_tx_abort(tx);
+ return (err);
+ }
+
+ dmu_write_embedded(rwa->os, drrwe->drr_object,
+ drrwe->drr_offset, data, drrwe->drr_etype,
+ drrwe->drr_compression, drrwe->drr_lsize, drrwe->drr_psize,
+ rwa->byteswap ^ ZFS_HOST_BYTEORDER, tx);
+
+ /* See comment in receive_write. */
+ save_resume_state(rwa, drrwe->drr_object, drrwe->drr_offset, tx);
+ dmu_tx_commit(tx);
+ return (0);
+}
+
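+/*
+ * Handle a DRR_SPILL record: copy the payload into the object's spill
+ * block (used for overflow system attributes), growing the spill block
+ * first if the incoming payload is larger than its current size.
+ */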
+static int
+receive_spill(struct receive_writer_arg *rwa, struct drr_spill *drrs,
+ void *data)
+{
+ dmu_tx_t *tx;
+ dmu_buf_t *db, *db_spill;
+ int err;
+
+ if (drrs->drr_length < SPA_MINBLOCKSIZE ||
+ drrs->drr_length > spa_maxblocksize(dmu_objset_spa(rwa->os)))
+ return (SET_ERROR(EINVAL));
+
+ if (dmu_object_info(rwa->os, drrs->drr_object, NULL) != 0)
+ return (SET_ERROR(EINVAL));
+
+ if (drrs->drr_object > rwa->max_object)
+ rwa->max_object = drrs->drr_object;
+
+ VERIFY0(dmu_bonus_hold(rwa->os, drrs->drr_object, FTAG, &db));
+ if ((err = dmu_spill_hold_by_bonus(db, FTAG, &db_spill)) != 0) {
+ dmu_buf_rele(db, FTAG);
+ return (err);
+ }
+
+ tx = dmu_tx_create(rwa->os);
+
+ dmu_tx_hold_spill(tx, db->db_object);
+
+ err = dmu_tx_assign(tx, TXG_WAIT);
+ if (err != 0) {
+ dmu_buf_rele(db, FTAG);
+ dmu_buf_rele(db_spill, FTAG);
+ dmu_tx_abort(tx);
+ return (err);
+ }
+ dmu_buf_will_dirty(db_spill, tx);
+
+ if (db_spill->db_size < drrs->drr_length)
+ VERIFY(0 == dbuf_spill_set_blksz(db_spill,
+ drrs->drr_length, tx));
+ bcopy(data, db_spill->db_data, drrs->drr_length);
+
+ dmu_buf_rele(db, FTAG);
+ dmu_buf_rele(db_spill, FTAG);
+
+ dmu_tx_commit(tx);
+ return (0);
+}
+
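+/*
+ * Handle a DRR_FREE record: free the byte range [drr_offset,
+ * drr_offset + drr_length) of the object; a drr_length of -1ULL means
+ * free everything from drr_offset to the end of the object.
+ */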
+/* ARGSUSED */
+noinline static int
+receive_free(struct receive_writer_arg *rwa, struct drr_free *drrf)
+{
+ int err;
+
+ if (drrf->drr_length != -1ULL &&
+ drrf->drr_offset + drrf->drr_length < drrf->drr_offset)
+ return (SET_ERROR(EINVAL));
+
+ if (dmu_object_info(rwa->os, drrf->drr_object, NULL) != 0)
+ return (SET_ERROR(EINVAL));
+
+ if (drrf->drr_object > rwa->max_object)
+ rwa->max_object = drrf->drr_object;
+
+ err = dmu_free_long_range(rwa->os, drrf->drr_object,
+ drrf->drr_offset, drrf->drr_length);
+
+ return (err);
+}
+
+/* used to destroy the drc_ds on error */
+static void
+dmu_recv_cleanup_ds(dmu_recv_cookie_t *drc)
+{
+ if (drc->drc_resumable) {
+ /* wait for our resume state to be written to disk */
+ txg_wait_synced(drc->drc_ds->ds_dir->dd_pool, 0);
+ dsl_dataset_disown(drc->drc_ds, dmu_recv_tag);
+ } else {
+ char name[ZFS_MAX_DATASET_NAME_LEN];
+ dsl_dataset_name(drc->drc_ds, name);
+ dsl_dataset_disown(drc->drc_ds, dmu_recv_tag);
+ (void) dsl_destroy_head(name);
+ }
+}
+
+static void
+receive_cksum(struct receive_arg *ra, int len, void *buf)
+{
+ if (ra->byteswap) {
+ (void) fletcher_4_incremental_byteswap(buf, len, &ra->cksum);
+ } else {
+ (void) fletcher_4_incremental_native(buf, len, &ra->cksum);
+ }
+}
+
+/*
+ * Read the payload into a buffer of size len, and update the current record's
+ * payload field.
+ * Allocate ra->next_rrd and read the next record's header into
+ * ra->next_rrd->header.
+ * Verify checksum of payload and next record.
+ */
+static int
+receive_read_payload_and_next_header(struct receive_arg *ra, int len, void *buf)
+{
+ int err;
+
+ if (len != 0) {
+ ASSERT3U(len, <=, SPA_MAXBLOCKSIZE);
+ err = receive_read(ra, len, buf);
+ if (err != 0)
+ return (err);
+ receive_cksum(ra, len, buf);
+
+ /* note: rrd is NULL when reading the begin record's payload */
+ if (ra->rrd != NULL) {
+ ra->rrd->payload = buf;
+ ra->rrd->payload_size = len;
+ ra->rrd->bytes_read = ra->bytes_read;
+ }
+ }
+
+ ra->prev_cksum = ra->cksum;
+
+ ra->next_rrd = kmem_zalloc(sizeof (*ra->next_rrd), KM_SLEEP);
+ err = receive_read(ra, sizeof (ra->next_rrd->header),
+ &ra->next_rrd->header);
+ ra->next_rrd->bytes_read = ra->bytes_read;
+ if (err != 0) {
+ kmem_free(ra->next_rrd, sizeof (*ra->next_rrd));
+ ra->next_rrd = NULL;
+ return (err);
+ }
+ if (ra->next_rrd->header.drr_type == DRR_BEGIN) {
+ kmem_free(ra->next_rrd, sizeof (*ra->next_rrd));
+ ra->next_rrd = NULL;
+ return (SET_ERROR(EINVAL));
+ }
+
+ /*
+ * Note: checksum is of everything up to but not including the
+ * checksum itself.
+ */
+ ASSERT3U(offsetof(dmu_replay_record_t, drr_u.drr_checksum.drr_checksum),
+ ==, sizeof (dmu_replay_record_t) - sizeof (zio_cksum_t));
+ receive_cksum(ra,
+ offsetof(dmu_replay_record_t, drr_u.drr_checksum.drr_checksum),
+ &ra->next_rrd->header);
+
+ zio_cksum_t cksum_orig =
+ ra->next_rrd->header.drr_u.drr_checksum.drr_checksum;
+ zio_cksum_t *cksump =
+ &ra->next_rrd->header.drr_u.drr_checksum.drr_checksum;
+
+ if (ra->byteswap)
+ byteswap_record(&ra->next_rrd->header);
+
+ if ((!ZIO_CHECKSUM_IS_ZERO(cksump)) &&
+ !ZIO_CHECKSUM_EQUAL(ra->cksum, *cksump)) {
+ kmem_free(ra->next_rrd, sizeof (*ra->next_rrd));
+ ra->next_rrd = NULL;
+ return (SET_ERROR(ECKSUM));
+ }
+
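+ /*
+ * Fold the checksum field itself (saved above in its original,
+ * possibly un-byteswapped form) into the running checksum: the
+ * stream checksum covers every byte of every record, including
+ * each record's own checksum field.
+ */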
+ receive_cksum(ra, sizeof (cksum_orig), &cksum_orig);
+
+ return (0);
+}
+
+static void
+objlist_create(struct objlist *list)
+{
+ list_create(&list->list, sizeof (struct receive_objnode),
+ offsetof(struct receive_objnode, node));
+ list->last_lookup = 0;
+}
+
+static void
+objlist_destroy(struct objlist *list)
+{
+ for (struct receive_objnode *n = list_remove_head(&list->list);
+ n != NULL; n = list_remove_head(&list->list)) {
+ kmem_free(n, sizeof (*n));
+ }
+ list_destroy(&list->list);
+}
+
+/*
+ * This function looks through the objlist to see if the specified object number
+ * is contained in the objlist. In the process, it will remove all object
+ * numbers in the list that are smaller than the specified object number. Thus,
+ * any lookup of an object number smaller than a previously looked up object
+ * number will always return false; therefore, all lookups should be done in
+ * ascending order.
+ */
+static boolean_t
+objlist_exists(struct objlist *list, uint64_t object)
+{
+ struct receive_objnode *node = list_head(&list->list);
+ ASSERT3U(object, >=, list->last_lookup);
+ list->last_lookup = object;
+ while (node != NULL && node->object < object) {
+ VERIFY3P(node, ==, list_remove_head(&list->list));
+ kmem_free(node, sizeof (*node));
+ node = list_head(&list->list);
+ }
+ return (node != NULL && node->object == object);
+}
+
+/*
+ * The objlist is a list of object numbers stored in ascending order. However,
+ * the insertion of new object numbers does not seek out the correct location to
+ * store a new object number; instead, it appends it to the list for simplicity.
+ * Thus, any users must take care to only insert new object numbers in ascending
+ * order.
+ */
+static void
+objlist_insert(struct objlist *list, uint64_t object)
+{
+ struct receive_objnode *node = kmem_zalloc(sizeof (*node), KM_SLEEP);
+ node->object = object;
+#ifdef ZFS_DEBUG
+ struct receive_objnode *last_object = list_tail(&list->list);
+ uint64_t last_objnum = (last_object != NULL ? last_object->object : 0);
+ ASSERT3U(node->object, >, last_objnum);
+#endif
+ list_insert_tail(&list->list, node);
+}
+
+/*
+ * Issue the prefetch reads for any necessary indirect blocks.
+ *
+ * We use the object ignore list to tell us whether or not to issue prefetches
+ * for a given object. We do this for both correctness (in case the blocksize
+ * of an object has changed) and performance (if the object doesn't exist, don't
+ * needlessly try to issue prefetches). We also trim the list as we go through
+ * the stream to prevent it from growing to an unbounded size.
+ *
+ * The object numbers within will always be in sorted order, and any write
+ * records we see will also be in sorted order, but they're not sorted with
+ * respect to each other (i.e. we can get several object records before
+ * receiving each object's write records). As a result, once we've reached a
+ * given object number, we can safely remove any reference to lower object
+ * numbers in the ignore list. In practice, we receive up to 32 object records
+ * before receiving write records, so the list can have up to 32 nodes in it.
+ */
+/* ARGSUSED */
+static void
+receive_read_prefetch(struct receive_arg *ra,
+ uint64_t object, uint64_t offset, uint64_t length)
+{
+ if (!objlist_exists(&ra->ignore_objlist, object)) {
+ dmu_prefetch(ra->os, object, 1, offset, length,
+ ZIO_PRIORITY_SYNC_READ);
+ }
+}
+
+/*
+ * Read records off the stream, issuing any necessary prefetches.
+ */
+static int
+receive_read_record(struct receive_arg *ra)
+{
+ int err;
+
+ switch (ra->rrd->header.drr_type) {
+ case DRR_OBJECT:
+ {
+ struct drr_object *drro = &ra->rrd->header.drr_u.drr_object;
+ uint32_t size = P2ROUNDUP(drro->drr_bonuslen, 8);
+ void *buf = kmem_zalloc(size, KM_SLEEP);
+ dmu_object_info_t doi;
+ err = receive_read_payload_and_next_header(ra, size, buf);
+ if (err != 0) {
+ kmem_free(buf, size);
+ return (err);
+ }
+ err = dmu_object_info(ra->os, drro->drr_object, &doi);
+ /*
+ * See receive_read_prefetch for an explanation of why we're
+ * storing this object in the ignore_objlist.
+ */
+ if (err == ENOENT ||
+ (err == 0 && doi.doi_data_block_size != drro->drr_blksz)) {
+ objlist_insert(&ra->ignore_objlist, drro->drr_object);
+ err = 0;
+ }
+ return (err);
+ }
+ case DRR_FREEOBJECTS:
+ {
+ err = receive_read_payload_and_next_header(ra, 0, NULL);
+ return (err);
+ }
+ case DRR_WRITE:
+ {
+ struct drr_write *drrw = &ra->rrd->header.drr_u.drr_write;
+ arc_buf_t *abuf;
+ boolean_t is_meta = DMU_OT_IS_METADATA(drrw->drr_type);
+ if (DRR_WRITE_COMPRESSED(drrw)) {
+ ASSERT3U(drrw->drr_compressed_size, >, 0);
+ ASSERT3U(drrw->drr_logical_size, >=,
+ drrw->drr_compressed_size);
+ ASSERT(!is_meta);
+ abuf = arc_loan_compressed_buf(
+ dmu_objset_spa(ra->os),
+ drrw->drr_compressed_size, drrw->drr_logical_size,
+ drrw->drr_compressiontype);
+ } else {
+ abuf = arc_loan_buf(dmu_objset_spa(ra->os),
+ is_meta, drrw->drr_logical_size);
+ }
+
+ err = receive_read_payload_and_next_header(ra,
+ DRR_WRITE_PAYLOAD_SIZE(drrw), abuf->b_data);
+ if (err != 0) {
+ dmu_return_arcbuf(abuf);
+ return (err);
+ }
+ ra->rrd->write_buf = abuf;
+ receive_read_prefetch(ra, drrw->drr_object, drrw->drr_offset,
+ drrw->drr_logical_size);
+ return (err);
+ }
+ case DRR_WRITE_BYREF:
+ {
+ struct drr_write_byref *drrwb =
+ &ra->rrd->header.drr_u.drr_write_byref;
+ err = receive_read_payload_and_next_header(ra, 0, NULL);
+ receive_read_prefetch(ra, drrwb->drr_object, drrwb->drr_offset,
+ drrwb->drr_length);
+ return (err);
+ }
+ case DRR_WRITE_EMBEDDED:
+ {
+ struct drr_write_embedded *drrwe =
+ &ra->rrd->header.drr_u.drr_write_embedded;
+ uint32_t size = P2ROUNDUP(drrwe->drr_psize, 8);
+ void *buf = kmem_zalloc(size, KM_SLEEP);
+
+ err = receive_read_payload_and_next_header(ra, size, buf);
+ if (err != 0) {
+ kmem_free(buf, size);
+ return (err);
+ }
+
+ receive_read_prefetch(ra, drrwe->drr_object, drrwe->drr_offset,
+ drrwe->drr_length);
+ return (err);
+ }
+ case DRR_FREE:
+ {
+ /*
+ * It might be beneficial to prefetch indirect blocks here, but
+ * we don't really have the data to decide for sure.
+ */
+ err = receive_read_payload_and_next_header(ra, 0, NULL);
+ return (err);
+ }
+ case DRR_END:
+ {
+ struct drr_end *drre = &ra->rrd->header.drr_u.drr_end;
+ if (!ZIO_CHECKSUM_EQUAL(ra->prev_cksum, drre->drr_checksum))
+ return (SET_ERROR(ECKSUM));
+ return (0);
+ }
+ case DRR_SPILL:
+ {
+ struct drr_spill *drrs = &ra->rrd->header.drr_u.drr_spill;
+ void *buf = kmem_zalloc(drrs->drr_length, KM_SLEEP);
+ err = receive_read_payload_and_next_header(ra, drrs->drr_length,
+ buf);
+ if (err != 0)
+ kmem_free(buf, drrs->drr_length);
+ return (err);
+ }
+ default:
+ return (SET_ERROR(EINVAL));
+ }
+}
+
+/*
+ * Commit the records to the pool.
+ */
+static int
+receive_process_record(struct receive_writer_arg *rwa,
+ struct receive_record_arg *rrd)
+{
+ int err;
+
+ /* Processing in order, therefore bytes_read should be increasing. */
+ ASSERT3U(rrd->bytes_read, >=, rwa->bytes_read);
+ rwa->bytes_read = rrd->bytes_read;
+
+ switch (rrd->header.drr_type) {
+ case DRR_OBJECT:
+ {
+ struct drr_object *drro = &rrd->header.drr_u.drr_object;
+ err = receive_object(rwa, drro, rrd->payload);
+ kmem_free(rrd->payload, rrd->payload_size);
+ rrd->payload = NULL;
+ return (err);
+ }
+ case DRR_FREEOBJECTS:
+ {
+ struct drr_freeobjects *drrfo =
+ &rrd->header.drr_u.drr_freeobjects;
+ return (receive_freeobjects(rwa, drrfo));
+ }
+ case DRR_WRITE:
+ {
+ struct drr_write *drrw = &rrd->header.drr_u.drr_write;
+ err = receive_write(rwa, drrw, rrd->write_buf);
+ /* if receive_write() is successful, it consumes the arc_buf */
+ if (err != 0)
+ dmu_return_arcbuf(rrd->write_buf);
+ rrd->write_buf = NULL;
+ rrd->payload = NULL;
+ return (err);
+ }
+ case DRR_WRITE_BYREF:
+ {
+ struct drr_write_byref *drrwbr =
+ &rrd->header.drr_u.drr_write_byref;
+ return (receive_write_byref(rwa, drrwbr));
+ }
+ case DRR_WRITE_EMBEDDED:
+ {
+ struct drr_write_embedded *drrwe =
+ &rrd->header.drr_u.drr_write_embedded;
+ err = receive_write_embedded(rwa, drrwe, rrd->payload);
+ kmem_free(rrd->payload, rrd->payload_size);
+ rrd->payload = NULL;
+ return (err);
+ }
+ case DRR_FREE:
+ {
+ struct drr_free *drrf = &rrd->header.drr_u.drr_free;
+ return (receive_free(rwa, drrf));
+ }
+ case DRR_SPILL:
+ {
+ struct drr_spill *drrs = &rrd->header.drr_u.drr_spill;
+ err = receive_spill(rwa, drrs, rrd->payload);
+ kmem_free(rrd->payload, rrd->payload_size);
+ rrd->payload = NULL;
+ return (err);
+ }
+ default:
+ return (SET_ERROR(EINVAL));
+ }
+}
+
+/*
+ * dmu_recv_stream's worker thread; pull records off the queue, and then call
+ * receive_process_record(). When we're done, signal the main thread and exit.
+ */
+static void
+receive_writer_thread(void *arg)
+{
+ struct receive_writer_arg *rwa = arg;
+ struct receive_record_arg *rrd;
+ for (rrd = bqueue_dequeue(&rwa->q); !rrd->eos_marker;
+ rrd = bqueue_dequeue(&rwa->q)) {
+ /*
+ * If there's an error, the main thread will stop putting things
+ * on the queue, but we need to clear everything in it before we
+ * can exit.
+ */
+ if (rwa->err == 0) {
+ rwa->err = receive_process_record(rwa, rrd);
+ } else if (rrd->write_buf != NULL) {
+ dmu_return_arcbuf(rrd->write_buf);
+ rrd->write_buf = NULL;
+ rrd->payload = NULL;
+ } else if (rrd->payload != NULL) {
+ kmem_free(rrd->payload, rrd->payload_size);
+ rrd->payload = NULL;
+ }
+ kmem_free(rrd, sizeof (*rrd));
+ }
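+ /* free the end-of-stream marker record that terminated the loop */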
+ kmem_free(rrd, sizeof (*rrd));
+ mutex_enter(&rwa->mutex);
+ rwa->done = B_TRUE;
+ cv_signal(&rwa->cv);
+ mutex_exit(&rwa->mutex);
+ thread_exit();
+}
+
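+/*
+ * Verify that the resume (object, offset) pair in the BEGIN record's
+ * payload matches the resume state recorded in the dataset's ZAP
+ * attributes, i.e. that this stream picks up exactly where the
+ * interrupted receive left off.
+ */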
+static int
+resume_check(struct receive_arg *ra, nvlist_t *begin_nvl)
+{
+ uint64_t val;
+ objset_t *mos = dmu_objset_pool(ra->os)->dp_meta_objset;
+ uint64_t dsobj = dmu_objset_id(ra->os);
+ uint64_t resume_obj, resume_off;
+
+ if (nvlist_lookup_uint64(begin_nvl,
+ "resume_object", &resume_obj) != 0 ||
+ nvlist_lookup_uint64(begin_nvl,
+ "resume_offset", &resume_off) != 0) {
+ return (SET_ERROR(EINVAL));
+ }
+ VERIFY0(zap_lookup(mos, dsobj,
+ DS_FIELD_RESUME_OBJECT, sizeof (val), 1, &val));
+ if (resume_obj != val)
+ return (SET_ERROR(EINVAL));
+ VERIFY0(zap_lookup(mos, dsobj,
+ DS_FIELD_RESUME_OFFSET, sizeof (val), 1, &val));
+ if (resume_off != val)
+ return (SET_ERROR(EINVAL));
+
+ return (0);
+}
+
+/*
+ * Read in the stream's records, one by one, and apply them to the pool. There
+ * are two threads involved; the thread that calls this function will spin up a
+ * worker thread, read the records off the stream one by one, and issue
+ * prefetches for any necessary indirect blocks. It will then push the records
+ * onto an internal blocking queue. The worker thread will pull the records off
+ * the queue, and actually write the data into the DMU. This way, the worker
+ * thread doesn't have to wait for reads to complete, since everything it needs
+ * (the indirect blocks) will be prefetched.
+ *
+ * NB: callers *must* call dmu_recv_end() if this succeeds.
+ */
+int
+dmu_recv_stream(dmu_recv_cookie_t *drc, struct file *fp, offset_t *voffp,
+ int cleanup_fd, uint64_t *action_handlep)
+{
+ int err = 0;
+ struct receive_arg ra = { 0 };
+ struct receive_writer_arg rwa = { 0 };
+ int featureflags;
+ nvlist_t *begin_nvl = NULL;
+
+ ra.byteswap = drc->drc_byteswap;
+ ra.cksum = drc->drc_cksum;
+ ra.td = curthread;
+ ra.fp = fp;
+ ra.voff = *voffp;
+
+ if (dsl_dataset_is_zapified(drc->drc_ds)) {
+ (void) zap_lookup(drc->drc_ds->ds_dir->dd_pool->dp_meta_objset,
+ drc->drc_ds->ds_object, DS_FIELD_RESUME_BYTES,
+ sizeof (ra.bytes_read), 1, &ra.bytes_read);
+ }
+
+ objlist_create(&ra.ignore_objlist);
+
+ /* these were verified in dmu_recv_begin */
+ ASSERT3U(DMU_GET_STREAM_HDRTYPE(drc->drc_drrb->drr_versioninfo), ==,
+ DMU_SUBSTREAM);
+ ASSERT3U(drc->drc_drrb->drr_type, <, DMU_OST_NUMTYPES);
+
+ /*
+ * Open the objset we are modifying.
+ */
+ VERIFY0(dmu_objset_from_ds(drc->drc_ds, &ra.os));
+
+ ASSERT(dsl_dataset_phys(drc->drc_ds)->ds_flags & DS_FLAG_INCONSISTENT);
+
+ featureflags = DMU_GET_FEATUREFLAGS(drc->drc_drrb->drr_versioninfo);
+
+ /* if this stream is dedup'ed, set up the avl tree for guid mapping */
+ if (featureflags & DMU_BACKUP_FEATURE_DEDUP) {
+ minor_t minor;
+
+ if (cleanup_fd == -1) {
+ err = SET_ERROR(EBADF);
+ goto out;
+ }
+ err = zfs_onexit_fd_hold(cleanup_fd, &minor);
+ if (err != 0) {
+ cleanup_fd = -1;
+ goto out;
+ }
+
+ if (*action_handlep == 0) {
+ rwa.guid_to_ds_map =
+ kmem_alloc(sizeof (avl_tree_t), KM_SLEEP);
+ avl_create(rwa.guid_to_ds_map, guid_compare,
+ sizeof (guid_map_entry_t),
+ offsetof(guid_map_entry_t, avlnode));
+ err = zfs_onexit_add_cb(minor,
+ free_guid_map_onexit, rwa.guid_to_ds_map,
+ action_handlep);
+ if (err != 0)
+ goto out;
+ } else {
+ err = zfs_onexit_cb_data(minor, *action_handlep,
+ (void **)&rwa.guid_to_ds_map);
+ if (err != 0)
+ goto out;
+ }
+
+ drc->drc_guid_to_ds_map = rwa.guid_to_ds_map;
+ }
+
+ uint32_t payloadlen = drc->drc_drr_begin->drr_payloadlen;
+ void *payload = NULL;
+ if (payloadlen != 0)
+ payload = kmem_alloc(payloadlen, KM_SLEEP);
+
+ err = receive_read_payload_and_next_header(&ra, payloadlen, payload);
+ if (err != 0) {
+ if (payloadlen != 0)
+ kmem_free(payload, payloadlen);
+ goto out;
+ }
+ if (payloadlen != 0) {
+ err = nvlist_unpack(payload, payloadlen, &begin_nvl, KM_SLEEP);
+ kmem_free(payload, payloadlen);
+ if (err != 0)
+ goto out;
+ }
+
+ if (featureflags & DMU_BACKUP_FEATURE_RESUMING) {
+ err = resume_check(&ra, begin_nvl);
+ if (err != 0)
+ goto out;
+ }
+
+ (void) bqueue_init(&rwa.q, zfs_recv_queue_length,
+ offsetof(struct receive_record_arg, node));
+ cv_init(&rwa.cv, NULL, CV_DEFAULT, NULL);
+ mutex_init(&rwa.mutex, NULL, MUTEX_DEFAULT, NULL);
+ rwa.os = ra.os;
+ rwa.byteswap = drc->drc_byteswap;
+ rwa.resumable = drc->drc_resumable;
+
+ (void) thread_create(NULL, 0, receive_writer_thread, &rwa, 0, &p0,
+ TS_RUN, minclsyspri);
+ /*
+ * We're reading rwa.err without locks, which is safe since we are the
+ * only reader, and the worker thread is the only writer. It's ok if we
+ * miss a write for an iteration or two of the loop, since the writer
+ * thread will keep freeing records we send it until we send it an eos
+ * marker.
+ *
+ * We can leave this loop in 3 ways: First, if rwa.err is
+ * non-zero. In that case, the writer thread will free the rrd we just
+ * pushed. Second, if we're interrupted; in that case, either it's the
+ * first loop and ra.rrd was never allocated, or it's later, and ra.rrd
+ * has been handed off to the writer thread who will free it. Finally,
+ * if receive_read_record fails or we're at the end of the stream, then
+ * we free ra.rrd and exit.
+ */
+ while (rwa.err == 0) {
+ if (issig(JUSTLOOKING) && issig(FORREAL)) {
+ err = SET_ERROR(EINTR);
+ break;
+ }
+
+ ASSERT3P(ra.rrd, ==, NULL);
+ ra.rrd = ra.next_rrd;
+ ra.next_rrd = NULL;
+ /* Allocates and loads header into ra.next_rrd */
+ err = receive_read_record(&ra);
+
+ if (ra.rrd->header.drr_type == DRR_END || err != 0) {
+ kmem_free(ra.rrd, sizeof (*ra.rrd));
+ ra.rrd = NULL;
+ break;
+ }
+
+ bqueue_enqueue(&rwa.q, ra.rrd,
+ sizeof (struct receive_record_arg) + ra.rrd->payload_size);
+ ra.rrd = NULL;
+ }
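+ /*
+ * Enqueue an end-of-stream marker so the writer thread drains the
+ * queue and exits; reuse next_rrd if one is still allocated.
+ */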
+ if (ra.next_rrd == NULL)
+ ra.next_rrd = kmem_zalloc(sizeof (*ra.next_rrd), KM_SLEEP);
+ ra.next_rrd->eos_marker = B_TRUE;
+ bqueue_enqueue(&rwa.q, ra.next_rrd, 1);
+
+ mutex_enter(&rwa.mutex);
+ while (!rwa.done) {
+ cv_wait(&rwa.cv, &rwa.mutex);
+ }
+ mutex_exit(&rwa.mutex);
+
+ /*
+ * If we are receiving a full stream as a clone, all object IDs which
+ * are greater than the maximum ID referenced in the stream are
+ * by definition unused and must be freed. Note that it's possible that
+ * we've resumed this send and the first record we received was the END
+ * record. In that case, max_object would be 0, but we shouldn't start
+ * freeing all objects from there; instead we should start from the
+ * resumeobj.
+ */
+ if (drc->drc_clone && drc->drc_drrb->drr_fromguid == 0) {
+ uint64_t obj;
+ if (nvlist_lookup_uint64(begin_nvl, "resume_object", &obj) != 0)
+ obj = 0;
+ if (rwa.max_object > obj)
+ obj = rwa.max_object;
+ obj++;
+ int free_err = 0;
+ int next_err = 0;
+
+ while (next_err == 0) {
+ free_err = dmu_free_long_object(rwa.os, obj);
+ if (free_err != 0 && free_err != ENOENT)
+ break;
+
+ next_err = dmu_object_next(rwa.os, &obj, FALSE, 0);
+ }
+
+ if (err == 0) {
+ if (free_err != 0 && free_err != ENOENT)
+ err = free_err;
+ else if (next_err != ESRCH)
+ err = next_err;
+ }
+ }
+
+ cv_destroy(&rwa.cv);
+ mutex_destroy(&rwa.mutex);
+ bqueue_destroy(&rwa.q);
+ if (err == 0)
+ err = rwa.err;
+
+out:
+ nvlist_free(begin_nvl);
+ if ((featureflags & DMU_BACKUP_FEATURE_DEDUP) && (cleanup_fd != -1))
+ zfs_onexit_fd_rele(cleanup_fd);
+
+ if (err != 0) {
+ /*
+ * Clean up references. If receive is not resumable,
+ * destroy what we created, so we don't leave it in
+ * the inconsistent state.
+ */
+ dmu_recv_cleanup_ds(drc);
+ }
+
+ *voffp = ra.voff;
+ objlist_destroy(&ra.ignore_objlist);
+ return (err);
+}
+
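+/*
+ * Sync-task check function for dmu_recv_end(). When receiving into an
+ * existing filesystem, verify that the snapshots to be destroyed (in the
+ * forced case), the clone swap, and the new snapshot are all permissible
+ * before anything is changed; for a brand-new filesystem, only the
+ * snapshot creation needs to be checked.
+ */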
+static int
+dmu_recv_end_check(void *arg, dmu_tx_t *tx)
+{
+ dmu_recv_cookie_t *drc = arg;
+ dsl_pool_t *dp = dmu_tx_pool(tx);
+ int error;
+
+ ASSERT3P(drc->drc_ds->ds_owner, ==, dmu_recv_tag);
+
+ if (!drc->drc_newfs) {
+ dsl_dataset_t *origin_head;
+
+ error = dsl_dataset_hold(dp, drc->drc_tofs, FTAG, &origin_head);
+ if (error != 0)
+ return (error);
+ if (drc->drc_force) {
+ /*
+ * We will destroy any snapshots in tofs (i.e. before
+ * origin_head) that are after the origin (which is
+ * the snap before drc_ds, because drc_ds cannot
+ * have any snaps of its own).
+ */
+ uint64_t obj;
+
+ obj = dsl_dataset_phys(origin_head)->ds_prev_snap_obj;
+ while (obj !=
+ dsl_dataset_phys(drc->drc_ds)->ds_prev_snap_obj) {
+ dsl_dataset_t *snap;
+ error = dsl_dataset_hold_obj(dp, obj, FTAG,
+ &snap);
+ if (error != 0)
+ break;
+ if (snap->ds_dir != origin_head->ds_dir)
+ error = SET_ERROR(EINVAL);
+ if (error == 0) {
+ error = dsl_destroy_snapshot_check_impl(
+ snap, B_FALSE);
+ }
+ obj = dsl_dataset_phys(snap)->ds_prev_snap_obj;
+ dsl_dataset_rele(snap, FTAG);
+ if (error != 0)
+ break;
+ }
+ if (error != 0) {
+ dsl_dataset_rele(origin_head, FTAG);
+ return (error);
+ }
+ }
+ error = dsl_dataset_clone_swap_check_impl(drc->drc_ds,
+ origin_head, drc->drc_force, drc->drc_owner, tx);
+ if (error != 0) {
+ dsl_dataset_rele(origin_head, FTAG);
+ return (error);
+ }
+ error = dsl_dataset_snapshot_check_impl(origin_head,
+ drc->drc_tosnap, tx, B_TRUE, 1, drc->drc_cred);
+ dsl_dataset_rele(origin_head, FTAG);
+ if (error != 0)
+ return (error);
+
+ error = dsl_destroy_head_check_impl(drc->drc_ds, 1);
+ } else {
+ error = dsl_dataset_snapshot_check_impl(drc->drc_ds,
+ drc->drc_tosnap, tx, B_TRUE, 1, drc->drc_cred);
+ }
+ return (error);
+}
+
+static void
+dmu_recv_end_sync(void *arg, dmu_tx_t *tx)
+{
+ dmu_recv_cookie_t *drc = arg;
+ dsl_pool_t *dp = dmu_tx_pool(tx);
+
+ spa_history_log_internal_ds(drc->drc_ds, "finish receiving",
+ tx, "snap=%s", drc->drc_tosnap);
+
+ if (!drc->drc_newfs) {
+ dsl_dataset_t *origin_head;
+
+ VERIFY0(dsl_dataset_hold(dp, drc->drc_tofs, FTAG,
+ &origin_head));
+
+ if (drc->drc_force) {
+ /*
+ * Destroy any snapshots of drc_tofs (origin_head)
+ * after the origin (the snap before drc_ds).
+ */
+ uint64_t obj;
+
+ obj = dsl_dataset_phys(origin_head)->ds_prev_snap_obj;
+ while (obj !=
+ dsl_dataset_phys(drc->drc_ds)->ds_prev_snap_obj) {
+ dsl_dataset_t *snap;
+ VERIFY0(dsl_dataset_hold_obj(dp, obj, FTAG,
+ &snap));
+ ASSERT3P(snap->ds_dir, ==, origin_head->ds_dir);
+ obj = dsl_dataset_phys(snap)->ds_prev_snap_obj;
+ dsl_destroy_snapshot_sync_impl(snap,
+ B_FALSE, tx);
+ dsl_dataset_rele(snap, FTAG);
+ }
+ }
+ VERIFY3P(drc->drc_ds->ds_prev, ==,
+ origin_head->ds_prev);
+
+ dsl_dataset_clone_swap_sync_impl(drc->drc_ds,
+ origin_head, tx);
+ dsl_dataset_snapshot_sync_impl(origin_head,
+ drc->drc_tosnap, tx);
+
+ /* set snapshot's creation time and guid */
+ dmu_buf_will_dirty(origin_head->ds_prev->ds_dbuf, tx);
+ dsl_dataset_phys(origin_head->ds_prev)->ds_creation_time =
+ drc->drc_drrb->drr_creation_time;
+ dsl_dataset_phys(origin_head->ds_prev)->ds_guid =
+ drc->drc_drrb->drr_toguid;
+ dsl_dataset_phys(origin_head->ds_prev)->ds_flags &=
+ ~DS_FLAG_INCONSISTENT;
+
+ dmu_buf_will_dirty(origin_head->ds_dbuf, tx);
+ dsl_dataset_phys(origin_head)->ds_flags &=
+ ~DS_FLAG_INCONSISTENT;
+
+ drc->drc_newsnapobj =
+ dsl_dataset_phys(origin_head)->ds_prev_snap_obj;
+
+ dsl_dataset_rele(origin_head, FTAG);
+ dsl_destroy_head_sync_impl(drc->drc_ds, tx);
+
+ if (drc->drc_owner != NULL)
+ VERIFY3P(origin_head->ds_owner, ==, drc->drc_owner);
+ } else {
+ dsl_dataset_t *ds = drc->drc_ds;
+
+ dsl_dataset_snapshot_sync_impl(ds, drc->drc_tosnap, tx);
+
+ /* set snapshot's creation time and guid */
+ dmu_buf_will_dirty(ds->ds_prev->ds_dbuf, tx);
+ dsl_dataset_phys(ds->ds_prev)->ds_creation_time =
+ drc->drc_drrb->drr_creation_time;
+ dsl_dataset_phys(ds->ds_prev)->ds_guid =
+ drc->drc_drrb->drr_toguid;
+ dsl_dataset_phys(ds->ds_prev)->ds_flags &=
+ ~DS_FLAG_INCONSISTENT;
+
+ dmu_buf_will_dirty(ds->ds_dbuf, tx);
+ dsl_dataset_phys(ds)->ds_flags &= ~DS_FLAG_INCONSISTENT;
+ if (dsl_dataset_has_resume_receive_state(ds)) {
+ (void) zap_remove(dp->dp_meta_objset, ds->ds_object,
+ DS_FIELD_RESUME_FROMGUID, tx);
+ (void) zap_remove(dp->dp_meta_objset, ds->ds_object,
+ DS_FIELD_RESUME_OBJECT, tx);
+ (void) zap_remove(dp->dp_meta_objset, ds->ds_object,
+ DS_FIELD_RESUME_OFFSET, tx);
+ (void) zap_remove(dp->dp_meta_objset, ds->ds_object,
+ DS_FIELD_RESUME_BYTES, tx);
+ (void) zap_remove(dp->dp_meta_objset, ds->ds_object,
+ DS_FIELD_RESUME_TOGUID, tx);
+ (void) zap_remove(dp->dp_meta_objset, ds->ds_object,
+ DS_FIELD_RESUME_TONAME, tx);
+ }
+ drc->drc_newsnapobj =
+ dsl_dataset_phys(drc->drc_ds)->ds_prev_snap_obj;
+ }
+
+#if defined(__FreeBSD__) && defined(_KERNEL)
+ zvol_create_minors(dp->dp_spa, drc->drc_tofs);
+#endif
+
+ /*
+ * Release the hold from dmu_recv_begin. This must be done before
+ * we return to open context, so that when we free the dataset's dnode,
+ * we can evict its bonus buffer.
+ */
+ dsl_dataset_disown(drc->drc_ds, dmu_recv_tag);
+ drc->drc_ds = NULL;
+}
+
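+/*
+ * Add the snapshot created by this receive to the guid map, taking a
+ * long hold on it, so that subsequent DRR_WRITE_BYREF records can
+ * resolve their drr_refguid to this snapshot's dataset.
+ */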
+static int
+add_ds_to_guidmap(const char *name, avl_tree_t *guid_map, uint64_t snapobj)
+{
+ dsl_pool_t *dp;
+ dsl_dataset_t *snapds;
+ guid_map_entry_t *gmep;
+ int err;
+
+ ASSERT(guid_map != NULL);
+
+ err = dsl_pool_hold(name, FTAG, &dp);
+ if (err != 0)
+ return (err);
+ gmep = kmem_alloc(sizeof (*gmep), KM_SLEEP);
+ err = dsl_dataset_hold_obj(dp, snapobj, gmep, &snapds);
+ if (err == 0) {
+ gmep->guid = dsl_dataset_phys(snapds)->ds_guid;
+ gmep->gme_ds = snapds;
+ avl_add(guid_map, gmep);
+ dsl_dataset_long_hold(snapds, gmep);
+ } else
+ kmem_free(gmep, sizeof (*gmep));
+
+ dsl_pool_rele(dp, FTAG);
+ return (err);
+}
+
+static int dmu_recv_end_modified_blocks = 3;
+
+static int
+dmu_recv_existing_end(dmu_recv_cookie_t *drc)
+{
+#ifdef _KERNEL
+ /*
+ * We will be destroying the ds; make sure its origin is unmounted if
+ * necessary.
+ */
+ char name[ZFS_MAX_DATASET_NAME_LEN];
+ dsl_dataset_name(drc->drc_ds, name);
+ zfs_destroy_unmount_origin(name);
+#endif
+
+ return (dsl_sync_task(drc->drc_tofs,
+ dmu_recv_end_check, dmu_recv_end_sync, drc,
+ dmu_recv_end_modified_blocks, ZFS_SPACE_CHECK_NORMAL));
+}
+
+static int
+dmu_recv_new_end(dmu_recv_cookie_t *drc)
+{
+ return (dsl_sync_task(drc->drc_tofs,
+ dmu_recv_end_check, dmu_recv_end_sync, drc,
+ dmu_recv_end_modified_blocks, ZFS_SPACE_CHECK_NORMAL));
+}
+
+int
+dmu_recv_end(dmu_recv_cookie_t *drc, void *owner)
+{
+ int error;
+
+ drc->drc_owner = owner;
+
+ if (drc->drc_newfs)
+ error = dmu_recv_new_end(drc);
+ else
+ error = dmu_recv_existing_end(drc);
+
+ if (error != 0) {
+ dmu_recv_cleanup_ds(drc);
+ } else if (drc->drc_guid_to_ds_map != NULL) {
+ (void) add_ds_to_guidmap(drc->drc_tofs,
+ drc->drc_guid_to_ds_map,
+ drc->drc_newsnapobj);
+ }
+ return (error);
+}
+
+/*
+ * Return TRUE if this objset is currently being received into.
+ */
+boolean_t
+dmu_objset_is_receiving(objset_t *os)
+{
+ return (os->os_dsl_dataset != NULL &&
+ os->os_dsl_dataset->ds_owner == dmu_recv_tag);
+}
diff --git a/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/dmu_traverse.c b/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/dmu_traverse.c
new file mode 100644
index 000000000000..8ed53914ceae
--- /dev/null
+++ b/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/dmu_traverse.c
@@ -0,0 +1,712 @@
+/*
+ * CDDL HEADER START
+ *
+ * The contents of this file are subject to the terms of the
+ * Common Development and Distribution License (the "License").
+ * You may not use this file except in compliance with the License.
+ *
+ * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
+ * or http://www.opensolaris.org/os/licensing.
+ * See the License for the specific language governing permissions
+ * and limitations under the License.
+ *
+ * When distributing Covered Code, include this CDDL HEADER in each
+ * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
+ * If applicable, add the following below this CDDL HEADER, with the
+ * fields enclosed by brackets "[]" replaced with your own identifying
+ * information: Portions Copyright [yyyy] [name of copyright owner]
+ *
+ * CDDL HEADER END
+ */
+/*
+ * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved.
+ * Copyright (c) 2012, 2018 by Delphix. All rights reserved.
+ * Copyright (c) 2015 Chunwei Chen. All rights reserved.
+ */
+
+#include <sys/zfs_context.h>
+#include <sys/dmu_objset.h>
+#include <sys/dmu_traverse.h>
+#include <sys/dsl_dataset.h>
+#include <sys/dsl_dir.h>
+#include <sys/dsl_pool.h>
+#include <sys/dnode.h>
+#include <sys/spa.h>
+#include <sys/spa_impl.h>
+#include <sys/zio.h>
+#include <sys/dmu_impl.h>
+#include <sys/sa.h>
+#include <sys/sa_impl.h>
+#include <sys/callb.h>
+#include <sys/zfeature.h>
+
+int32_t zfs_pd_bytes_max = 50 * 1024 * 1024; /* 50MB */
+boolean_t send_holes_without_birth_time = B_TRUE;
+
+#ifdef _KERNEL
+SYSCTL_DECL(_vfs_zfs);
+SYSCTL_UINT(_vfs_zfs, OID_AUTO, send_holes_without_birth_time, CTLFLAG_RWTUN,
+ &send_holes_without_birth_time, 0, "Send holes without birth time");
+#endif
+
+typedef struct prefetch_data {
+ kmutex_t pd_mtx;
+ kcondvar_t pd_cv;
+ int32_t pd_bytes_fetched;
+ int pd_flags;
+ boolean_t pd_cancel;
+ boolean_t pd_exited;
+ zbookmark_phys_t pd_resume;
+} prefetch_data_t;
+
+typedef struct traverse_data {
+ spa_t *td_spa;
+ uint64_t td_objset;
+ blkptr_t *td_rootbp;
+ uint64_t td_min_txg;
+ zbookmark_phys_t *td_resume;
+ int td_flags;
+ prefetch_data_t *td_pfd;
+ boolean_t td_paused;
+ uint64_t td_hole_birth_enabled_txg;
+ blkptr_cb_t *td_func;
+ void *td_arg;
+ boolean_t td_realloc_possible;
+} traverse_data_t;
+
+static int traverse_dnode(traverse_data_t *td, const dnode_phys_t *dnp,
+ uint64_t objset, uint64_t object);
+static void prefetch_dnode_metadata(traverse_data_t *td, const dnode_phys_t *,
+ uint64_t objset, uint64_t object);
+
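+/*
+ * zil_parse() callbacks used by traverse_zil(): visit each ZIL block,
+ * and each block pointer embedded in a TX_WRITE log record, forwarding
+ * them to td_func under a ZIL bookmark.
+ */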
+static int
+traverse_zil_block(zilog_t *zilog, blkptr_t *bp, void *arg, uint64_t claim_txg)
+{
+ traverse_data_t *td = arg;
+ zbookmark_phys_t zb;
+
+ if (BP_IS_HOLE(bp))
+ return (0);
+
+ if (claim_txg == 0 && bp->blk_birth >= spa_min_claim_txg(td->td_spa))
+ return (-1);
+
+ SET_BOOKMARK(&zb, td->td_objset, ZB_ZIL_OBJECT, ZB_ZIL_LEVEL,
+ bp->blk_cksum.zc_word[ZIL_ZC_SEQ]);
+
+ (void) td->td_func(td->td_spa, zilog, bp, &zb, NULL, td->td_arg);
+
+ return (0);
+}
+
+static int
+traverse_zil_record(zilog_t *zilog, lr_t *lrc, void *arg, uint64_t claim_txg)
+{
+ traverse_data_t *td = arg;
+
+ if (lrc->lrc_txtype == TX_WRITE) {
+ lr_write_t *lr = (lr_write_t *)lrc;
+ blkptr_t *bp = &lr->lr_blkptr;
+ zbookmark_phys_t zb;
+
+ if (BP_IS_HOLE(bp))
+ return (0);
+
+ if (claim_txg == 0 || bp->blk_birth < claim_txg)
+ return (0);
+
+ SET_BOOKMARK(&zb, td->td_objset, lr->lr_foid,
+ ZB_ZIL_LEVEL, lr->lr_offset / BP_GET_LSIZE(bp));
+
+ (void) td->td_func(td->td_spa, zilog, bp, &zb, NULL,
+ td->td_arg);
+ }
+ return (0);
+}
+
+static void
+traverse_zil(traverse_data_t *td, zil_header_t *zh)
+{
+ uint64_t claim_txg = zh->zh_claim_txg;
+
+ /*
+ * We only want to visit blocks that have been claimed but not yet
+ * replayed; plus blocks that are already stable in read-only mode.
+ */
+ if (claim_txg == 0 && spa_writeable(td->td_spa))
+ return;
+
+ zilog_t *zilog = zil_alloc(spa_get_dsl(td->td_spa)->dp_meta_objset, zh);
+ (void) zil_parse(zilog, traverse_zil_block, traverse_zil_record, td,
+ claim_txg);
+ zil_free(zilog);
+}
+
+typedef enum resume_skip {
+ RESUME_SKIP_ALL,
+ RESUME_SKIP_NONE,
+ RESUME_SKIP_CHILDREN
+} resume_skip_t;
+
+/*
+ * Returns RESUME_SKIP_ALL if td indicates that we are resuming a traversal and
+ * the block indicated by zb does not need to be visited at all. Returns
+ * RESUME_SKIP_CHILDREN if we are resuming a post-order traversal and we reach the
+ * resume point. This indicates that this block should be visited but not its
+ * children (since they must have been visited in a previous traversal).
+ * Otherwise returns RESUME_SKIP_NONE.
+ */
+static resume_skip_t
+resume_skip_check(traverse_data_t *td, const dnode_phys_t *dnp,
+ const zbookmark_phys_t *zb)
+{
+ if (td->td_resume != NULL && !ZB_IS_ZERO(td->td_resume)) {
+ /*
+ * If we already visited this bp & everything below,
+ * don't bother doing it again.
+ */
+ if (zbookmark_subtree_completed(dnp, zb, td->td_resume))
+ return (RESUME_SKIP_ALL);
+
+ /*
+ * If we found the block we're trying to resume from, zero
+ * the bookmark out to indicate that we have resumed.
+ */
+ if (bcmp(zb, td->td_resume, sizeof (*zb)) == 0) {
+ bzero(td->td_resume, sizeof (*zb));
+ if (td->td_flags & TRAVERSE_POST)
+ return (RESUME_SKIP_CHILDREN);
+ }
+ }
+ return (RESUME_SKIP_NONE);
+}
+
+static void
+traverse_prefetch_metadata(traverse_data_t *td,
+ const blkptr_t *bp, const zbookmark_phys_t *zb)
+{
+ arc_flags_t flags = ARC_FLAG_NOWAIT | ARC_FLAG_PREFETCH;
+
+ if (!(td->td_flags & TRAVERSE_PREFETCH_METADATA))
+ return;
+ /*
+ * If we are in the process of resuming, don't prefetch, because
+ * some children will not be needed (and in fact may have already
+ * been freed).
+ */
+ if (td->td_resume != NULL && !ZB_IS_ZERO(td->td_resume))
+ return;
+ if (BP_IS_HOLE(bp) || bp->blk_birth <= td->td_min_txg)
+ return;
+ if (BP_GET_LEVEL(bp) == 0 && BP_GET_TYPE(bp) != DMU_OT_DNODE)
+ return;
+
+ (void) arc_read(NULL, td->td_spa, bp, NULL, NULL,
+ ZIO_PRIORITY_ASYNC_READ, ZIO_FLAG_CANFAIL, &flags, zb);
+}
+
+static boolean_t
+prefetch_needed(prefetch_data_t *pfd, const blkptr_t *bp)
+{
+ ASSERT(pfd->pd_flags & TRAVERSE_PREFETCH_DATA);
+ if (BP_IS_HOLE(bp) || BP_IS_EMBEDDED(bp) ||
+ BP_GET_TYPE(bp) == DMU_OT_INTENT_LOG)
+ return (B_FALSE);
+ return (B_TRUE);
+}
+
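+/*
+ * Visit bp and recurse into whatever it points to -- indirect blocks,
+ * a block of dnodes, or an objset -- honoring the TRAVERSE_PRE and
+ * TRAVERSE_POST callback flags, prefetch throttling, and the resume
+ * bookmark machinery.
+ */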
+static int
+traverse_visitbp(traverse_data_t *td, const dnode_phys_t *dnp,
+ const blkptr_t *bp, const zbookmark_phys_t *zb)
+{
+ zbookmark_phys_t czb;
+ int err = 0;
+ arc_buf_t *buf = NULL;
+ prefetch_data_t *pd = td->td_pfd;
+ boolean_t hard = td->td_flags & TRAVERSE_HARD;
+
+ switch (resume_skip_check(td, dnp, zb)) {
+ case RESUME_SKIP_ALL:
+ return (0);
+ case RESUME_SKIP_CHILDREN:
+ goto post;
+ case RESUME_SKIP_NONE:
+ break;
+ default:
+ ASSERT(0);
+ }
+
+ if (bp->blk_birth == 0) {
+ /*
+ * Since this block has a birth time of 0 it must be one of
+ * two things: a hole created before the
+ * SPA_FEATURE_HOLE_BIRTH feature was enabled, or a hole
+ * which has always been a hole in an object.
+ *
+ * If a file is written sparsely, then the unwritten parts of
+ * the file were "always holes" -- that is, they have been
+ * holes since this object was allocated. However, we (and
+ * our callers) can not necessarily tell when an object was
+ * allocated. Therefore, if it's possible that this object
+ * was freed and then its object number reused, we need to
+ * visit all the holes with birth==0.
+ *
+ * If it isn't possible that the object number was reused,
+ * then if SPA_FEATURE_HOLE_BIRTH was enabled before we wrote
+ * all the blocks we will visit as part of this traversal,
+ * then this hole must have always existed, so we can skip
+ * it. We visit blocks born after (exclusive) td_min_txg.
+ *
+ * Note that the meta-dnode cannot be reallocated.
+ */
+ if (!send_holes_without_birth_time &&
+ (!td->td_realloc_possible ||
+ zb->zb_object == DMU_META_DNODE_OBJECT) &&
+ td->td_hole_birth_enabled_txg <= td->td_min_txg)
+ return (0);
+ } else if (bp->blk_birth <= td->td_min_txg) {
+ return (0);
+ }
+
+ if (pd != NULL && !pd->pd_exited && prefetch_needed(pd, bp)) {
+ uint64_t size = BP_GET_LSIZE(bp);
+ mutex_enter(&pd->pd_mtx);
+ ASSERT(pd->pd_bytes_fetched >= 0);
+ while (pd->pd_bytes_fetched < size && !pd->pd_exited)
+ cv_wait(&pd->pd_cv, &pd->pd_mtx);
+ pd->pd_bytes_fetched -= size;
+ cv_broadcast(&pd->pd_cv);
+ mutex_exit(&pd->pd_mtx);
+ }
+
+ if (BP_IS_HOLE(bp)) {
+ err = td->td_func(td->td_spa, NULL, bp, zb, dnp, td->td_arg);
+ if (err != 0)
+ goto post;
+ return (0);
+ }
+
+ if (td->td_flags & TRAVERSE_PRE) {
+ err = td->td_func(td->td_spa, NULL, bp, zb, dnp,
+ td->td_arg);
+ if (err == TRAVERSE_VISIT_NO_CHILDREN)
+ return (0);
+ if (err != 0)
+ goto post;
+ }
+
+ if (BP_GET_LEVEL(bp) > 0) {
+ arc_flags_t flags = ARC_FLAG_WAIT;
+ int i;
+ blkptr_t *cbp;
+ int epb = BP_GET_LSIZE(bp) >> SPA_BLKPTRSHIFT;
+
+ err = arc_read(NULL, td->td_spa, bp, arc_getbuf_func, &buf,
+ ZIO_PRIORITY_ASYNC_READ, ZIO_FLAG_CANFAIL, &flags, zb);
+ if (err != 0)
+ goto post;
+ cbp = buf->b_data;
+
+ for (i = 0; i < epb; i++) {
+ SET_BOOKMARK(&czb, zb->zb_objset, zb->zb_object,
+ zb->zb_level - 1,
+ zb->zb_blkid * epb + i);
+ traverse_prefetch_metadata(td, &cbp[i], &czb);
+ }
+
+ /* recursively visitbp() blocks below this */
+ for (i = 0; i < epb; i++) {
+ SET_BOOKMARK(&czb, zb->zb_objset, zb->zb_object,
+ zb->zb_level - 1,
+ zb->zb_blkid * epb + i);
+ err = traverse_visitbp(td, dnp, &cbp[i], &czb);
+ if (err != 0)
+ break;
+ }
+ } else if (BP_GET_TYPE(bp) == DMU_OT_DNODE) {
+ arc_flags_t flags = ARC_FLAG_WAIT;
+ int i;
+ int epb = BP_GET_LSIZE(bp) >> DNODE_SHIFT;
+
+ err = arc_read(NULL, td->td_spa, bp, arc_getbuf_func, &buf,
+ ZIO_PRIORITY_ASYNC_READ, ZIO_FLAG_CANFAIL, &flags, zb);
+ if (err != 0)
+ goto post;
+ dnode_phys_t *child_dnp = buf->b_data;
+
+ for (i = 0; i < epb; i += child_dnp[i].dn_extra_slots + 1) {
+ prefetch_dnode_metadata(td, &child_dnp[i],
+ zb->zb_objset, zb->zb_blkid * epb + i);
+ }
+
+ /* recursively visitbp() blocks below this */
+ for (i = 0; i < epb; i += child_dnp[i].dn_extra_slots + 1) {
+ err = traverse_dnode(td, &child_dnp[i],
+ zb->zb_objset, zb->zb_blkid * epb + i);
+ if (err != 0)
+ break;
+ }
+ } else if (BP_GET_TYPE(bp) == DMU_OT_OBJSET) {
+ arc_flags_t flags = ARC_FLAG_WAIT;
+
+ err = arc_read(NULL, td->td_spa, bp, arc_getbuf_func, &buf,
+ ZIO_PRIORITY_ASYNC_READ, ZIO_FLAG_CANFAIL, &flags, zb);
+ if (err != 0)
+ goto post;
+
+ objset_phys_t *osp = buf->b_data;
+ prefetch_dnode_metadata(td, &osp->os_meta_dnode, zb->zb_objset,
+ DMU_META_DNODE_OBJECT);
+ /*
+ * See the blk_birth == 0 comment above for the goal of td_realloc_possible.
+ * If the maxblkid of the meta-dnode is 0, then we know that
+ * we've never had more than DNODES_PER_BLOCK objects in the
+ * dataset, which means we can't have reused any object ids.
+ */
+ if (osp->os_meta_dnode.dn_maxblkid == 0)
+ td->td_realloc_possible = B_FALSE;
+
+ if (arc_buf_size(buf) >= sizeof (objset_phys_t)) {
+ prefetch_dnode_metadata(td, &osp->os_groupused_dnode,
+ zb->zb_objset, DMU_GROUPUSED_OBJECT);
+ prefetch_dnode_metadata(td, &osp->os_userused_dnode,
+ zb->zb_objset, DMU_USERUSED_OBJECT);
+ }
+
+ err = traverse_dnode(td, &osp->os_meta_dnode, zb->zb_objset,
+ DMU_META_DNODE_OBJECT);
+ if (err == 0 && arc_buf_size(buf) >= sizeof (objset_phys_t)) {
+ err = traverse_dnode(td, &osp->os_groupused_dnode,
+ zb->zb_objset, DMU_GROUPUSED_OBJECT);
+ }
+ if (err == 0 && arc_buf_size(buf) >= sizeof (objset_phys_t)) {
+ err = traverse_dnode(td, &osp->os_userused_dnode,
+ zb->zb_objset, DMU_USERUSED_OBJECT);
+ }
+ }
+
+ if (buf)
+ arc_buf_destroy(buf, &buf);
+
+post:
+ if (err == 0 && (td->td_flags & TRAVERSE_POST))
+ err = td->td_func(td->td_spa, NULL, bp, zb, dnp, td->td_arg);
+
+ if (hard && (err == EIO || err == ECKSUM)) {
+ /*
+ * Ignore this disk error as requested by the HARD flag,
+ * and continue traversal.
+ */
+ err = 0;
+ }
+
+ /*
+ * If we are stopping here, set td_resume.
+ */
+ if (td->td_resume != NULL && err != 0 && !td->td_paused) {
+ td->td_resume->zb_objset = zb->zb_objset;
+ td->td_resume->zb_object = zb->zb_object;
+ td->td_resume->zb_level = 0;
+ /*
+ * If we have stopped on an indirect block (e.g. due to
+ * i/o error), we have not visited anything below it.
+ * Set the bookmark to the first level-0 block that we need
+ * to visit. This way, the resuming code does not need to
+ * deal with resuming from indirect blocks.
+ *
+ * Note, if zb_level <= 0, dnp may be NULL, so we don't want
+ * to dereference it.
+ */
+ td->td_resume->zb_blkid = zb->zb_blkid;
+ if (zb->zb_level > 0) {
+ td->td_resume->zb_blkid <<= zb->zb_level *
+ (dnp->dn_indblkshift - SPA_BLKPTRSHIFT);
+ }
+ td->td_paused = B_TRUE;
+ }
+
+ return (err);
+}
+
+static void
+prefetch_dnode_metadata(traverse_data_t *td, const dnode_phys_t *dnp,
+ uint64_t objset, uint64_t object)
+{
+ int j;
+ zbookmark_phys_t czb;
+
+ for (j = 0; j < dnp->dn_nblkptr; j++) {
+ SET_BOOKMARK(&czb, objset, object, dnp->dn_nlevels - 1, j);
+ traverse_prefetch_metadata(td, &dnp->dn_blkptr[j], &czb);
+ }
+
+ if (dnp->dn_flags & DNODE_FLAG_SPILL_BLKPTR) {
+ SET_BOOKMARK(&czb, objset, object, 0, DMU_SPILL_BLKID);
+ traverse_prefetch_metadata(td, DN_SPILL_BLKPTR(dnp), &czb);
+ }
+}
+
+static int
+traverse_dnode(traverse_data_t *td, const dnode_phys_t *dnp,
+ uint64_t objset, uint64_t object)
+{
+ int j, err = 0;
+ zbookmark_phys_t czb;
+
+ if (object != DMU_META_DNODE_OBJECT && td->td_resume != NULL &&
+ object < td->td_resume->zb_object)
+ return (0);
+
+ if (td->td_flags & TRAVERSE_PRE) {
+ SET_BOOKMARK(&czb, objset, object, ZB_DNODE_LEVEL,
+ ZB_DNODE_BLKID);
+ err = td->td_func(td->td_spa, NULL, NULL, &czb, dnp,
+ td->td_arg);
+ if (err == TRAVERSE_VISIT_NO_CHILDREN)
+ return (0);
+ if (err != 0)
+ return (err);
+ }
+
+ for (j = 0; j < dnp->dn_nblkptr; j++) {
+ SET_BOOKMARK(&czb, objset, object, dnp->dn_nlevels - 1, j);
+ err = traverse_visitbp(td, dnp, &dnp->dn_blkptr[j], &czb);
+ if (err != 0)
+ break;
+ }
+
+ if (err == 0 && (dnp->dn_flags & DNODE_FLAG_SPILL_BLKPTR)) {
+ SET_BOOKMARK(&czb, objset, object, 0, DMU_SPILL_BLKID);
+ err = traverse_visitbp(td, dnp, DN_SPILL_BLKPTR(dnp), &czb);
+ }
+
+ if (err == 0 && (td->td_flags & TRAVERSE_POST)) {
+ SET_BOOKMARK(&czb, objset, object, ZB_DNODE_LEVEL,
+ ZB_DNODE_BLKID);
+ err = td->td_func(td->td_spa, NULL, NULL, &czb, dnp,
+ td->td_arg);
+ if (err == TRAVERSE_VISIT_NO_CHILDREN)
+ return (0);
+ if (err != 0)
+ return (err);
+ }
+ return (err);
+}
+
+/* ARGSUSED */
+static int
+traverse_prefetcher(spa_t *spa, zilog_t *zilog, const blkptr_t *bp,
+ const zbookmark_phys_t *zb, const dnode_phys_t *dnp, void *arg)
+{
+ prefetch_data_t *pfd = arg;
+ arc_flags_t aflags = ARC_FLAG_NOWAIT | ARC_FLAG_PREFETCH |
+ ARC_FLAG_PRESCIENT_PREFETCH;
+
+ ASSERT(pfd->pd_bytes_fetched >= 0);
+ if (bp == NULL)
+ return (0);
+ if (pfd->pd_cancel)
+ return (SET_ERROR(EINTR));
+
+ if (!prefetch_needed(pfd, bp))
+ return (0);
+
+ mutex_enter(&pfd->pd_mtx);
+ while (!pfd->pd_cancel && pfd->pd_bytes_fetched >= zfs_pd_bytes_max)
+ cv_wait(&pfd->pd_cv, &pfd->pd_mtx);
+ pfd->pd_bytes_fetched += BP_GET_LSIZE(bp);
+ cv_broadcast(&pfd->pd_cv);
+ mutex_exit(&pfd->pd_mtx);
+
+ (void) arc_read(NULL, spa, bp, NULL, NULL, ZIO_PRIORITY_ASYNC_READ,
+ ZIO_FLAG_CANFAIL | ZIO_FLAG_SPECULATIVE, &aflags, zb);
+
+ return (0);
+}
+
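+/*
+ * Body of the prefetch thread: rerun the traversal over a private copy
+ * of the traverse_data with td_func replaced by traverse_prefetcher,
+ * then set pd_exited so the main traversal stops waiting on prefetch.
+ */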
+static void
+traverse_prefetch_thread(void *arg)
+{
+ traverse_data_t *td_main = arg;
+ traverse_data_t td = *td_main;
+ zbookmark_phys_t czb;
+
+ td.td_func = traverse_prefetcher;
+ td.td_arg = td_main->td_pfd;
+ td.td_pfd = NULL;
+ td.td_resume = &td_main->td_pfd->pd_resume;
+
+ SET_BOOKMARK(&czb, td.td_objset,
+ ZB_ROOT_OBJECT, ZB_ROOT_LEVEL, ZB_ROOT_BLKID);
+ (void) traverse_visitbp(&td, NULL, td.td_rootbp, &czb);
+
+ mutex_enter(&td_main->td_pfd->pd_mtx);
+ td_main->td_pfd->pd_exited = B_TRUE;
+ cv_broadcast(&td_main->td_pfd->pd_cv);
+ mutex_exit(&td_main->td_pfd->pd_mtx);
+}
+
+/*
+ * NB: dataset must not be changing on-disk (e.g., it is a snapshot or we are
+ * in syncing context).
+ */
+static int
+traverse_impl(spa_t *spa, dsl_dataset_t *ds, uint64_t objset, blkptr_t *rootbp,
+ uint64_t txg_start, zbookmark_phys_t *resume, int flags,
+ blkptr_cb_t func, void *arg)
+{
+ traverse_data_t td;
+ prefetch_data_t pd = { 0 };
+ zbookmark_phys_t czb;
+ int err;
+
+ ASSERT(ds == NULL || objset == ds->ds_object);
+ ASSERT(!(flags & TRAVERSE_PRE) || !(flags & TRAVERSE_POST));
+
+ td.td_spa = spa;
+ td.td_objset = objset;
+ td.td_rootbp = rootbp;
+ td.td_min_txg = txg_start;
+ td.td_resume = resume;
+ td.td_func = func;
+ td.td_arg = arg;
+ td.td_pfd = &pd;
+ td.td_flags = flags;
+ td.td_paused = B_FALSE;
+ td.td_realloc_possible = (txg_start == 0 ? B_FALSE : B_TRUE);
+
+ if (spa_feature_is_active(spa, SPA_FEATURE_HOLE_BIRTH)) {
+ VERIFY(spa_feature_enabled_txg(spa,
+ SPA_FEATURE_HOLE_BIRTH, &td.td_hole_birth_enabled_txg));
+ } else {
+ td.td_hole_birth_enabled_txg = UINT64_MAX;
+ }
+
+ pd.pd_flags = flags;
+ if (resume != NULL)
+ pd.pd_resume = *resume;
+ mutex_init(&pd.pd_mtx, NULL, MUTEX_DEFAULT, NULL);
+ cv_init(&pd.pd_cv, NULL, CV_DEFAULT, NULL);
+
+ /* See comment on ZIL traversal in dsl_scan_visitds. */
+ if (ds != NULL && !ds->ds_is_snapshot && !BP_IS_HOLE(rootbp)) {
+ arc_flags_t flags = ARC_FLAG_WAIT;
+ objset_phys_t *osp;
+ arc_buf_t *buf;
+
+ err = arc_read(NULL, td.td_spa, rootbp,
+ arc_getbuf_func, &buf,
+ ZIO_PRIORITY_ASYNC_READ, ZIO_FLAG_CANFAIL, &flags, NULL);
+ if (err != 0) {
+ mutex_destroy(&pd.pd_mtx);
+ cv_destroy(&pd.pd_cv);
+ return (err);
+ }
+
+ osp = buf->b_data;
+ traverse_zil(&td, &osp->os_zil_header);
+ arc_buf_destroy(buf, &buf);
+ }
+
+ if (!(flags & TRAVERSE_PREFETCH_DATA) ||
+ 0 == taskq_dispatch(system_taskq, traverse_prefetch_thread,
+ &td, TQ_NOQUEUE))
+ pd.pd_exited = B_TRUE;
+
+ SET_BOOKMARK(&czb, td.td_objset,
+ ZB_ROOT_OBJECT, ZB_ROOT_LEVEL, ZB_ROOT_BLKID);
+ err = traverse_visitbp(&td, NULL, rootbp, &czb);
+
+ mutex_enter(&pd.pd_mtx);
+ pd.pd_cancel = B_TRUE;
+ cv_broadcast(&pd.pd_cv);
+ while (!pd.pd_exited)
+ cv_wait(&pd.pd_cv, &pd.pd_mtx);
+ mutex_exit(&pd.pd_mtx);
+
+ mutex_destroy(&pd.pd_mtx);
+ cv_destroy(&pd.pd_cv);
+
+ return (err);
+}
+
+/*
+ * NB: dataset must not be changing on-disk (e.g., it is a snapshot or we are
+ * in syncing context).
+ */
+int
+traverse_dataset_resume(dsl_dataset_t *ds, uint64_t txg_start,
+ zbookmark_phys_t *resume,
+ int flags, blkptr_cb_t func, void *arg)
+{
+ return (traverse_impl(ds->ds_dir->dd_pool->dp_spa, ds, ds->ds_object,
+ &dsl_dataset_phys(ds)->ds_bp, txg_start, resume, flags, func, arg));
+}
+
+int
+traverse_dataset(dsl_dataset_t *ds, uint64_t txg_start,
+ int flags, blkptr_cb_t func, void *arg)
+{
+ return (traverse_dataset_resume(ds, txg_start, NULL, flags, func, arg));
+}
+
+int
+traverse_dataset_destroyed(spa_t *spa, blkptr_t *blkptr,
+ uint64_t txg_start, zbookmark_phys_t *resume, int flags,
+ blkptr_cb_t func, void *arg)
+{
+ return (traverse_impl(spa, NULL, ZB_DESTROYED_OBJSET,
+ blkptr, txg_start, resume, flags, func, arg));
+}
+
+/*
+ * NB: pool must not be changing on-disk (e.g., from zdb or sync context).
+ */
+int
+traverse_pool(spa_t *spa, uint64_t txg_start, int flags,
+ blkptr_cb_t func, void *arg)
+{
+ int err;
+ dsl_pool_t *dp = spa_get_dsl(spa);
+ objset_t *mos = dp->dp_meta_objset;
+ boolean_t hard = (flags & TRAVERSE_HARD);
+
+ /* visit the MOS */
+ err = traverse_impl(spa, NULL, 0, spa_get_rootblkptr(spa),
+ txg_start, NULL, flags, func, arg);
+ if (err != 0)
+ return (err);
+
+ /* visit each dataset */
+ for (uint64_t obj = 1; err == 0;
+ err = dmu_object_next(mos, &obj, B_FALSE, txg_start)) {
+ dmu_object_info_t doi;
+
+ err = dmu_object_info(mos, obj, &doi);
+ if (err != 0) {
+ if (hard)
+ continue;
+ break;
+ }
+
+ if (doi.doi_bonus_type == DMU_OT_DSL_DATASET) {
+ dsl_dataset_t *ds;
+ uint64_t txg = txg_start;
+
+ dsl_pool_config_enter(dp, FTAG);
+ err = dsl_dataset_hold_obj(dp, obj, FTAG, &ds);
+ dsl_pool_config_exit(dp, FTAG);
+ if (err != 0) {
+ if (hard)
+ continue;
+ break;
+ }
+ if (dsl_dataset_phys(ds)->ds_prev_snap_txg > txg)
+ txg = dsl_dataset_phys(ds)->ds_prev_snap_txg;
+ err = traverse_dataset(ds, txg, flags, func, arg);
+ dsl_dataset_rele(ds, FTAG);
+ if (err != 0)
+ break;
+ }
+ }
+ if (err == ESRCH)
+ err = 0;
+ return (err);
+}
diff --git a/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/dmu_tx.c b/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/dmu_tx.c
new file mode 100644
index 000000000000..00784ab6c4df
--- /dev/null
+++ b/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/dmu_tx.c
@@ -0,0 +1,1345 @@
+/*
+ * CDDL HEADER START
+ *
+ * The contents of this file are subject to the terms of the
+ * Common Development and Distribution License (the "License").
+ * You may not use this file except in compliance with the License.
+ *
+ * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
+ * or http://www.opensolaris.org/os/licensing.
+ * See the License for the specific language governing permissions
+ * and limitations under the License.
+ *
+ * When distributing Covered Code, include this CDDL HEADER in each
+ * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
+ * If applicable, add the following below this CDDL HEADER, with the
+ * fields enclosed by brackets "[]" replaced with your own identifying
+ * information: Portions Copyright [yyyy] [name of copyright owner]
+ *
+ * CDDL HEADER END
+ */
+/*
+ * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved.
+ * Copyright 2011 Nexenta Systems, Inc. All rights reserved.
+ * Copyright (c) 2012, 2017 by Delphix. All rights reserved.
+ * Copyright (c) 2014 Integros [integros.com]
+ */
+
+#include <sys/dmu.h>
+#include <sys/dmu_impl.h>
+#include <sys/dbuf.h>
+#include <sys/dmu_tx.h>
+#include <sys/dmu_objset.h>
+#include <sys/dsl_dataset.h>
+#include <sys/dsl_dir.h>
+#include <sys/dsl_pool.h>
+#include <sys/zap_impl.h>
+#include <sys/spa.h>
+#include <sys/sa.h>
+#include <sys/sa_impl.h>
+#include <sys/zfs_context.h>
+#include <sys/varargs.h>
+
+typedef void (*dmu_tx_hold_func_t)(dmu_tx_t *tx, struct dnode *dn,
+ uint64_t arg1, uint64_t arg2);
+
+dmu_tx_t *
+dmu_tx_create_dd(dsl_dir_t *dd)
+{
+ dmu_tx_t *tx = kmem_zalloc(sizeof (dmu_tx_t), KM_SLEEP);
+ tx->tx_dir = dd;
+ if (dd != NULL)
+ tx->tx_pool = dd->dd_pool;
+ list_create(&tx->tx_holds, sizeof (dmu_tx_hold_t),
+ offsetof(dmu_tx_hold_t, txh_node));
+ list_create(&tx->tx_callbacks, sizeof (dmu_tx_callback_t),
+ offsetof(dmu_tx_callback_t, dcb_node));
+ tx->tx_start = gethrtime();
+ return (tx);
+}
+
+dmu_tx_t *
+dmu_tx_create(objset_t *os)
+{
+ dmu_tx_t *tx = dmu_tx_create_dd(os->os_dsl_dataset->ds_dir);
+ tx->tx_objset = os;
+ return (tx);
+}
+
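+/*
+ * Create a transaction that is already assigned to the given txg, for
+ * use in syncing context; tx_anyobj marks it as one that may touch any
+ * object without taking explicit holds.
+ */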
+dmu_tx_t *
+dmu_tx_create_assigned(struct dsl_pool *dp, uint64_t txg)
+{
+ dmu_tx_t *tx = dmu_tx_create_dd(NULL);
+
+ txg_verify(dp->dp_spa, txg);
+ tx->tx_pool = dp;
+ tx->tx_txg = txg;
+ tx->tx_anyobj = TRUE;
+
+ return (tx);
+}
+
+int
+dmu_tx_is_syncing(dmu_tx_t *tx)
+{
+ return (tx->tx_anyobj);
+}
+
+int
+dmu_tx_private_ok(dmu_tx_t *tx)
+{
+ return (tx->tx_anyobj);
+}
+
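+/*
+ * Create a hold record for dn -- taking a dnode hold and, if the tx is
+ * already assigned, a tx hold -- and append it to the tx's hold list.
+ */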
+static dmu_tx_hold_t *
+dmu_tx_hold_dnode_impl(dmu_tx_t *tx, dnode_t *dn, enum dmu_tx_hold_type type,
+ uint64_t arg1, uint64_t arg2)
+{
+ dmu_tx_hold_t *txh;
+
+ if (dn != NULL) {
+ (void) zfs_refcount_add(&dn->dn_holds, tx);
+ if (tx->tx_txg != 0) {
+ mutex_enter(&dn->dn_mtx);
+ /*
+ * dn->dn_assigned_txg == tx->tx_txg doesn't pose a
+ * problem, but there's no way for it to happen (for
+ * now, at least).
+ */
+ ASSERT(dn->dn_assigned_txg == 0);
+ dn->dn_assigned_txg = tx->tx_txg;
+ (void) zfs_refcount_add(&dn->dn_tx_holds, tx);
+ mutex_exit(&dn->dn_mtx);
+ }
+ }
+
+ txh = kmem_zalloc(sizeof (dmu_tx_hold_t), KM_SLEEP);
+ txh->txh_tx = tx;
+ txh->txh_dnode = dn;
+ zfs_refcount_create(&txh->txh_space_towrite);
+ zfs_refcount_create(&txh->txh_memory_tohold);
+ txh->txh_type = type;
+ txh->txh_arg1 = arg1;
+ txh->txh_arg2 = arg2;
+ list_insert_tail(&tx->tx_holds, txh);
+
+ return (txh);
+}
+
+static dmu_tx_hold_t *
+dmu_tx_hold_object_impl(dmu_tx_t *tx, objset_t *os, uint64_t object,
+ enum dmu_tx_hold_type type, uint64_t arg1, uint64_t arg2)
+{
+ dnode_t *dn = NULL;
+ dmu_tx_hold_t *txh;
+ int err;
+
+ if (object != DMU_NEW_OBJECT) {
+ err = dnode_hold(os, object, FTAG, &dn);
+ if (err != 0) {
+ tx->tx_err = err;
+ return (NULL);
+ }
+ }
+ txh = dmu_tx_hold_dnode_impl(tx, dn, type, arg1, arg2);
+ if (dn != NULL)
+ dnode_rele(dn, FTAG);
+ return (txh);
+}
+
+void
+dmu_tx_add_new_object(dmu_tx_t *tx, dnode_t *dn)
+{
+ /*
+	 * If we're syncing, the caller can manipulate any object anyhow, and
+ * the hold on the dnode_t can cause problems.
+ */
+ if (!dmu_tx_is_syncing(tx))
+ (void) dmu_tx_hold_dnode_impl(tx, dn, THT_NEWOBJECT, 0, 0);
+}
+
+/*
+ * This function reads specified data from disk. The specified data will
+ * be needed to perform the transaction -- i.e., it will be read after
+ * we do dmu_tx_assign(). There are two reasons that we read the data now
+ * (before dmu_tx_assign()):
+ *
+ * 1. Reading it now has potentially better performance. The transaction
+ * has not yet been assigned, so the TXG is not held open, and also the
+ * caller typically has less locks held when calling dmu_tx_hold_*() than
+ * after the transaction has been assigned. This reduces the lock (and txg)
+ * hold times, thus reducing lock contention.
+ *
+ * 2. It is easier for callers (primarily the ZPL) to handle i/o errors
+ * that are detected before they start making changes to the DMU state
+ * (i.e. now). Once the transaction has been assigned, and some DMU
+ * state has been changed, it can be difficult to recover from an i/o
+ * error (e.g. to undo the changes already made in memory at the DMU
+ * layer). Typically code to do so does not exist in the caller -- it
+ * assumes that the data has already been cached and thus i/o errors are
+ * not possible.
+ *
+ * It has been observed that the i/o initiated here can be a performance
+ * problem, and it appears to be optional, because we don't look at the
+ * data which is read. However, removing this read would only serve to
+ * move the work elsewhere (after the dmu_tx_assign()), where it may
+ * have a greater impact on performance (in addition to the impact on
+ * fault tolerance noted above).
+ */
+static int
+dmu_tx_check_ioerr(zio_t *zio, dnode_t *dn, int level, uint64_t blkid)
+{
+ int err;
+ dmu_buf_impl_t *db;
+
+ rw_enter(&dn->dn_struct_rwlock, RW_READER);
+ db = dbuf_hold_level(dn, level, blkid, FTAG);
+ rw_exit(&dn->dn_struct_rwlock);
+ if (db == NULL)
+ return (SET_ERROR(EIO));
+ err = dbuf_read(db, zio, DB_RF_CANFAIL | DB_RF_NOPREFETCH);
+ dbuf_rele(db, FTAG);
+ return (err);
+}
+
+/* ARGSUSED */
+static void
+dmu_tx_count_write(dmu_tx_hold_t *txh, uint64_t off, uint64_t len)
+{
+ dnode_t *dn = txh->txh_dnode;
+ int err = 0;
+
+ if (len == 0)
+ return;
+
+ (void) zfs_refcount_add_many(&txh->txh_space_towrite, len, FTAG);
+
+ if (zfs_refcount_count(&txh->txh_space_towrite) > 2 * DMU_MAX_ACCESS)
+ err = SET_ERROR(EFBIG);
+
+ if (dn == NULL)
+ return;
+
+ /*
+ * For i/o error checking, read the blocks that will be needed
+ * to perform the write: the first and last level-0 blocks (if
+ * they are not aligned, i.e. if they are partial-block writes),
+ * and all the level-1 blocks.
+ */
+ if (dn->dn_maxblkid == 0) {
+ if (off < dn->dn_datablksz &&
+ (off > 0 || len < dn->dn_datablksz)) {
+ err = dmu_tx_check_ioerr(NULL, dn, 0, 0);
+ if (err != 0) {
+ txh->txh_tx->tx_err = err;
+ }
+ }
+ } else {
+ zio_t *zio = zio_root(dn->dn_objset->os_spa,
+ NULL, NULL, ZIO_FLAG_CANFAIL);
+
+ /* first level-0 block */
+ uint64_t start = off >> dn->dn_datablkshift;
+ if (P2PHASE(off, dn->dn_datablksz) || len < dn->dn_datablksz) {
+ err = dmu_tx_check_ioerr(zio, dn, 0, start);
+ if (err != 0) {
+ txh->txh_tx->tx_err = err;
+ }
+ }
+
+ /* last level-0 block */
+ uint64_t end = (off + len - 1) >> dn->dn_datablkshift;
+ if (end != start && end <= dn->dn_maxblkid &&
+ P2PHASE(off + len, dn->dn_datablksz)) {
+ err = dmu_tx_check_ioerr(zio, dn, 0, end);
+ if (err != 0) {
+ txh->txh_tx->tx_err = err;
+ }
+ }
+
+ /* level-1 blocks */
+ if (dn->dn_nlevels > 1) {
+ int shft = dn->dn_indblkshift - SPA_BLKPTRSHIFT;
+ for (uint64_t i = (start >> shft) + 1;
+ i < end >> shft; i++) {
+ err = dmu_tx_check_ioerr(zio, dn, 1, i);
+ if (err != 0) {
+ txh->txh_tx->tx_err = err;
+ }
+ }
+ }
+
+ err = zio_wait(zio);
+ if (err != 0) {
+ txh->txh_tx->tx_err = err;
+ }
+ }
+}
+
+static void
+dmu_tx_count_dnode(dmu_tx_hold_t *txh)
+{
+ (void) zfs_refcount_add_many(&txh->txh_space_towrite, DNODE_MIN_SIZE,
+ FTAG);
+}
+
+void
+dmu_tx_hold_write(dmu_tx_t *tx, uint64_t object, uint64_t off, int len)
+{
+ dmu_tx_hold_t *txh;
+
+ ASSERT0(tx->tx_txg);
+ ASSERT3U(len, <=, DMU_MAX_ACCESS);
+ ASSERT(len == 0 || UINT64_MAX - off >= len - 1);
+
+ txh = dmu_tx_hold_object_impl(tx, tx->tx_objset,
+ object, THT_WRITE, off, len);
+ if (txh != NULL) {
+ dmu_tx_count_write(txh, off, len);
+ dmu_tx_count_dnode(txh);
+ }
+}
+
+void
+dmu_tx_hold_remap_l1indirect(dmu_tx_t *tx, uint64_t object)
+{
+ dmu_tx_hold_t *txh;
+
+ ASSERT(tx->tx_txg == 0);
+ txh = dmu_tx_hold_object_impl(tx, tx->tx_objset,
+ object, THT_WRITE, 0, 0);
+ if (txh == NULL)
+ return;
+
+ dnode_t *dn = txh->txh_dnode;
+ (void) zfs_refcount_add_many(&txh->txh_space_towrite,
+ 1ULL << dn->dn_indblkshift, FTAG);
+ dmu_tx_count_dnode(txh);
+}
+
+void
+dmu_tx_hold_write_by_dnode(dmu_tx_t *tx, dnode_t *dn, uint64_t off, int len)
+{
+ dmu_tx_hold_t *txh;
+
+ ASSERT0(tx->tx_txg);
+ ASSERT3U(len, <=, DMU_MAX_ACCESS);
+ ASSERT(len == 0 || UINT64_MAX - off >= len - 1);
+
+ txh = dmu_tx_hold_dnode_impl(tx, dn, THT_WRITE, off, len);
+ if (txh != NULL) {
+ dmu_tx_count_write(txh, off, len);
+ dmu_tx_count_dnode(txh);
+ }
+}
+
+/*
+ * This function marks the transaction as being a "net free". The end
+ * result is that refquotas will be disabled for this transaction, and
+ * this transaction will be able to use half of the pool space overhead
+ * (see dsl_pool_adjustedsize()). Therefore this function should only
+ * be called for transactions that we expect will not cause a net increase
+ * in the amount of space used (but it's OK if that is occasionally not true).
+ */
+void
+dmu_tx_mark_netfree(dmu_tx_t *tx)
+{
+ tx->tx_netfree = B_TRUE;
+}
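+
+/*
+ * A minimal sketch of the intended use (os and object are hypothetical):
+ * a caller that only frees data marks the tx before assigning it.
+ *
+ *   dmu_tx_t *tx = dmu_tx_create(os);
+ *   dmu_tx_hold_free(tx, object, 0, DMU_OBJECT_END);
+ *   dmu_tx_mark_netfree(tx);
+ *   error = dmu_tx_assign(tx, TXG_WAIT);
+ */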
+
+static void
+dmu_tx_hold_free_impl(dmu_tx_hold_t *txh, uint64_t off, uint64_t len)
+{
+ dmu_tx_t *tx;
+ dnode_t *dn;
+ int err;
+
+ tx = txh->txh_tx;
+ ASSERT(tx->tx_txg == 0);
+
+ dn = txh->txh_dnode;
+ dmu_tx_count_dnode(txh);
+
+ if (off >= (dn->dn_maxblkid + 1) * dn->dn_datablksz)
+ return;
+ if (len == DMU_OBJECT_END)
+ len = (dn->dn_maxblkid + 1) * dn->dn_datablksz - off;
+
+ /*
+ * For i/o error checking, we read the first and last level-0
+ * blocks if they are not aligned, and all the level-1 blocks.
+ *
+ * Note: dbuf_free_range() assumes that we have not instantiated
+ * any level-0 dbufs that will be completely freed. Therefore we must
+ * exercise care to not read or count the first and last blocks
+ * if they are blocksize-aligned.
+ */
+ if (dn->dn_datablkshift == 0) {
+ if (off != 0 || len < dn->dn_datablksz)
+ dmu_tx_count_write(txh, 0, dn->dn_datablksz);
+ } else {
+ /* first block will be modified if it is not aligned */
+ if (!IS_P2ALIGNED(off, 1 << dn->dn_datablkshift))
+ dmu_tx_count_write(txh, off, 1);
+ /* last block will be modified if it is not aligned */
+ if (!IS_P2ALIGNED(off + len, 1 << dn->dn_datablkshift))
+ dmu_tx_count_write(txh, off + len, 1);
+ }
+
+ /*
+ * Check level-1 blocks.
+ */
+ if (dn->dn_nlevels > 1) {
+ int shift = dn->dn_datablkshift + dn->dn_indblkshift -
+ SPA_BLKPTRSHIFT;
+ uint64_t start = off >> shift;
+ uint64_t end = (off + len) >> shift;
+
+ ASSERT(dn->dn_indblkshift != 0);
+
+ /*
+ * dnode_reallocate() can result in an object with indirect
+ * blocks having an odd data block size. In this case,
+ * just check the single block.
+ */
+ if (dn->dn_datablkshift == 0)
+ start = end = 0;
+
+ zio_t *zio = zio_root(tx->tx_pool->dp_spa,
+ NULL, NULL, ZIO_FLAG_CANFAIL);
+ for (uint64_t i = start; i <= end; i++) {
+ uint64_t ibyte = i << shift;
+ err = dnode_next_offset(dn, 0, &ibyte, 2, 1, 0);
+ i = ibyte >> shift;
+ if (err == ESRCH || i > end)
+ break;
+ if (err != 0) {
+ tx->tx_err = err;
+ (void) zio_wait(zio);
+ return;
+ }
+
+ (void) zfs_refcount_add_many(&txh->txh_memory_tohold,
+ 1 << dn->dn_indblkshift, FTAG);
+
+ err = dmu_tx_check_ioerr(zio, dn, 1, i);
+ if (err != 0) {
+ tx->tx_err = err;
+ (void) zio_wait(zio);
+ return;
+ }
+ }
+ err = zio_wait(zio);
+ if (err != 0) {
+ tx->tx_err = err;
+ return;
+ }
+ }
+}
+
+void
+dmu_tx_hold_free(dmu_tx_t *tx, uint64_t object, uint64_t off, uint64_t len)
+{
+ dmu_tx_hold_t *txh;
+
+ txh = dmu_tx_hold_object_impl(tx, tx->tx_objset,
+ object, THT_FREE, off, len);
+ if (txh != NULL)
+ (void) dmu_tx_hold_free_impl(txh, off, len);
+}
+
+void
+dmu_tx_hold_free_by_dnode(dmu_tx_t *tx, dnode_t *dn, uint64_t off, uint64_t len)
+{
+ dmu_tx_hold_t *txh;
+
+ txh = dmu_tx_hold_dnode_impl(tx, dn, THT_FREE, off, len);
+ if (txh != NULL)
+ (void) dmu_tx_hold_free_impl(txh, off, len);
+}
+
+static void
+dmu_tx_hold_zap_impl(dmu_tx_hold_t *txh, const char *name)
+{
+ dmu_tx_t *tx = txh->txh_tx;
+ dnode_t *dn;
+ int err;
+
+ ASSERT(tx->tx_txg == 0);
+
+ dn = txh->txh_dnode;
+
+ dmu_tx_count_dnode(txh);
+
+ /*
+	 * Modifying an almost-full microzap is around the worst case (128KB).
+ *
+ * If it is a fat zap, the worst case would be 7*16KB=112KB:
+ * - 3 blocks overwritten: target leaf, ptrtbl block, header block
+ * - 4 new blocks written if adding:
+ * - 2 blocks for possibly split leaves,
+ * - 2 grown ptrtbl blocks
+ */
+ (void) zfs_refcount_add_many(&txh->txh_space_towrite,
+ MZAP_MAX_BLKSZ, FTAG);
+
+ if (dn == NULL)
+ return;
+
+ ASSERT3P(DMU_OT_BYTESWAP(dn->dn_type), ==, DMU_BSWAP_ZAP);
+
+ if (dn->dn_maxblkid == 0 || name == NULL) {
+ /*
+ * This is a microzap (only one block), or we don't know
+ * the name. Check the first block for i/o errors.
+ */
+ err = dmu_tx_check_ioerr(NULL, dn, 0, 0);
+ if (err != 0) {
+ tx->tx_err = err;
+ }
+ } else {
+ /*
+ * Access the name so that we'll check for i/o errors to
+ * the leaf blocks, etc. We ignore ENOENT, as this name
+ * may not yet exist.
+ */
+ err = zap_lookup_by_dnode(dn, name, 8, 0, NULL);
+ if (err == EIO || err == ECKSUM || err == ENXIO) {
+ tx->tx_err = err;
+ }
+ }
+}
+
+void
+dmu_tx_hold_zap(dmu_tx_t *tx, uint64_t object, int add, const char *name)
+{
+ dmu_tx_hold_t *txh;
+
+ ASSERT0(tx->tx_txg);
+
+ txh = dmu_tx_hold_object_impl(tx, tx->tx_objset,
+ object, THT_ZAP, add, (uintptr_t)name);
+ if (txh != NULL)
+ dmu_tx_hold_zap_impl(txh, name);
+}
+
+void
+dmu_tx_hold_zap_by_dnode(dmu_tx_t *tx, dnode_t *dn, int add, const char *name)
+{
+ dmu_tx_hold_t *txh;
+
+ ASSERT0(tx->tx_txg);
+ ASSERT(dn != NULL);
+
+ txh = dmu_tx_hold_dnode_impl(tx, dn, THT_ZAP, add, (uintptr_t)name);
+ if (txh != NULL)
+ dmu_tx_hold_zap_impl(txh, name);
+}
+
+void
+dmu_tx_hold_bonus(dmu_tx_t *tx, uint64_t object)
+{
+ dmu_tx_hold_t *txh;
+
+ ASSERT(tx->tx_txg == 0);
+
+ txh = dmu_tx_hold_object_impl(tx, tx->tx_objset,
+ object, THT_BONUS, 0, 0);
+ if (txh)
+ dmu_tx_count_dnode(txh);
+}
+
+void
+dmu_tx_hold_bonus_by_dnode(dmu_tx_t *tx, dnode_t *dn)
+{
+ dmu_tx_hold_t *txh;
+
+ ASSERT0(tx->tx_txg);
+
+ txh = dmu_tx_hold_dnode_impl(tx, dn, THT_BONUS, 0, 0);
+ if (txh)
+ dmu_tx_count_dnode(txh);
+}
+
+void
+dmu_tx_hold_space(dmu_tx_t *tx, uint64_t space)
+{
+ dmu_tx_hold_t *txh;
+ ASSERT(tx->tx_txg == 0);
+
+ txh = dmu_tx_hold_object_impl(tx, tx->tx_objset,
+ DMU_NEW_OBJECT, THT_SPACE, space, 0);
+
+ (void) zfs_refcount_add_many(&txh->txh_space_towrite, space, FTAG);
+}
+
+#ifdef ZFS_DEBUG
+void
+dmu_tx_dirty_buf(dmu_tx_t *tx, dmu_buf_impl_t *db)
+{
+ boolean_t match_object = B_FALSE;
+ boolean_t match_offset = B_FALSE;
+
+ DB_DNODE_ENTER(db);
+ dnode_t *dn = DB_DNODE(db);
+ ASSERT(tx->tx_txg != 0);
+ ASSERT(tx->tx_objset == NULL || dn->dn_objset == tx->tx_objset);
+ ASSERT3U(dn->dn_object, ==, db->db.db_object);
+
+ if (tx->tx_anyobj) {
+ DB_DNODE_EXIT(db);
+ return;
+ }
+
+ /* XXX No checking on the meta dnode for now */
+ if (db->db.db_object == DMU_META_DNODE_OBJECT) {
+ DB_DNODE_EXIT(db);
+ return;
+ }
+
+ for (dmu_tx_hold_t *txh = list_head(&tx->tx_holds); txh != NULL;
+ txh = list_next(&tx->tx_holds, txh)) {
+ ASSERT(dn == NULL || dn->dn_assigned_txg == tx->tx_txg);
+ if (txh->txh_dnode == dn && txh->txh_type != THT_NEWOBJECT)
+ match_object = TRUE;
+ if (txh->txh_dnode == NULL || txh->txh_dnode == dn) {
+ int datablkshift = dn->dn_datablkshift ?
+ dn->dn_datablkshift : SPA_MAXBLOCKSHIFT;
+ int epbs = dn->dn_indblkshift - SPA_BLKPTRSHIFT;
+ int shift = datablkshift + epbs * db->db_level;
+ uint64_t beginblk = shift >= 64 ? 0 :
+ (txh->txh_arg1 >> shift);
+ uint64_t endblk = shift >= 64 ? 0 :
+ ((txh->txh_arg1 + txh->txh_arg2 - 1) >> shift);
+ uint64_t blkid = db->db_blkid;
+
+ /* XXX txh_arg2 better not be zero... */
+
+ dprintf("found txh type %x beginblk=%llx endblk=%llx\n",
+ txh->txh_type, beginblk, endblk);
+
+ switch (txh->txh_type) {
+ case THT_WRITE:
+ if (blkid >= beginblk && blkid <= endblk)
+ match_offset = TRUE;
+ /*
+ * We will let this hold work for the bonus
+ * or spill buffer so that we don't need to
+ * hold it when creating a new object.
+ */
+ if (blkid == DMU_BONUS_BLKID ||
+ blkid == DMU_SPILL_BLKID)
+ match_offset = TRUE;
+ /*
+ * They might have to increase nlevels,
+				 * thus dirtying the new TLIBs. Or they
+				 * might have to change the block size,
+				 * thus dirtying the new lvl=0 blk=0.
+ */
+ if (blkid == 0)
+ match_offset = TRUE;
+ break;
+ case THT_FREE:
+ /*
+ * We will dirty all the level 1 blocks in
+ * the free range and perhaps the first and
+ * last level 0 block.
+ */
+ if (blkid >= beginblk && (blkid <= endblk ||
+ txh->txh_arg2 == DMU_OBJECT_END))
+ match_offset = TRUE;
+ break;
+ case THT_SPILL:
+ if (blkid == DMU_SPILL_BLKID)
+ match_offset = TRUE;
+ break;
+ case THT_BONUS:
+ if (blkid == DMU_BONUS_BLKID)
+ match_offset = TRUE;
+ break;
+ case THT_ZAP:
+ match_offset = TRUE;
+ break;
+ case THT_NEWOBJECT:
+ match_object = TRUE;
+ break;
+ default:
+ ASSERT(!"bad txh_type");
+ }
+ }
+ if (match_object && match_offset) {
+ DB_DNODE_EXIT(db);
+ return;
+ }
+ }
+ DB_DNODE_EXIT(db);
+ panic("dirtying dbuf obj=%llx lvl=%u blkid=%llx but not tx_held\n",
+ (u_longlong_t)db->db.db_object, db->db_level,
+ (u_longlong_t)db->db_blkid);
+}
+#endif
+
+/*
+ * If we can't do even 10 iops, something is wrong; rather than delay
+ * any longer than 100ms, let dirty data go ahead and hit
+ * zfs_dirty_data_max.
+ */
+hrtime_t zfs_delay_max_ns = MSEC2NSEC(100);
+int zfs_delay_resolution_ns = 100 * 1000; /* 100 microseconds */
+
+/*
+ * We delay transactions when we've determined that the backend storage
+ * isn't able to accommodate the rate of incoming writes.
+ *
+ * If there is already a transaction waiting, we delay relative to when
+ * that transaction finishes waiting. This way the calculated min_time
+ * is independent of the number of threads concurrently executing
+ * transactions.
+ *
+ * If we are the only waiter, wait relative to when the transaction
+ * started, rather than the current time. This credits the transaction for
+ * "time already served", e.g. reading indirect blocks.
+ *
+ * The minimum time for a transaction to take is calculated as:
+ * min_time = scale * (dirty - min) / (max - dirty)
+ * min_time is then capped at zfs_delay_max_ns.
+ *
+ * The delay has two degrees of freedom that can be adjusted via tunables.
+ * The percentage of dirty data at which we start to delay is defined by
+ * zfs_delay_min_dirty_percent. This should typically be at or above
+ * zfs_vdev_async_write_active_max_dirty_percent so that we only start to
+ * delay after writing at full speed has failed to keep up with the incoming
+ * write rate. The scale of the curve is defined by zfs_delay_scale. Roughly
+ * speaking, this variable determines the amount of delay at the midpoint of
+ * the curve.
+ *
+ * delay
+ * 10ms +-------------------------------------------------------------*+
+ * | *|
+ * 9ms + *+
+ * | *|
+ * 8ms + *+
+ * | * |
+ * 7ms + * +
+ * | * |
+ * 6ms + * +
+ * | * |
+ * 5ms + * +
+ * | * |
+ * 4ms + * +
+ * | * |
+ * 3ms + * +
+ * | * |
+ * 2ms + (midpoint) * +
+ * | | ** |
+ * 1ms + v *** +
+ * | zfs_delay_scale ----------> ******** |
+ * 0 +-------------------------------------*********----------------+
+ * 0% <- zfs_dirty_data_max -> 100%
+ *
+ * Note that since the delay is added to the outstanding time remaining on the
+ * most recent transaction, the delay is effectively the inverse of IOPS.
+ * Here the midpoint of 500us translates to 2000 IOPS. The shape of the curve
+ * was chosen such that small changes in the amount of accumulated dirty data
+ * in the first 3/4 of the curve yield relatively small differences in the
+ * amount of delay.
+ *
+ * The effects can be easier to understand when the amount of delay is
+ * represented on a log scale:
+ *
+ * delay
+ * 100ms +-------------------------------------------------------------++
+ * + +
+ * | |
+ * + *+
+ * 10ms + *+
+ * + ** +
+ * | (midpoint) ** |
+ * + | ** +
+ * 1ms + v **** +
+ * + zfs_delay_scale ----------> ***** +
+ * | **** |
+ * + **** +
+ * 100us + ** +
+ * + * +
+ * | * |
+ * + * +
+ * 10us + * +
+ * + +
+ * | |
+ * + +
+ * +--------------------------------------------------------------+
+ * 0% <- zfs_dirty_data_max -> 100%
+ *
+ * Note here that only as the amount of dirty data approaches its limit does
+ * the delay start to increase rapidly. The goal of a properly tuned system
+ * should be to keep the amount of dirty data out of that range by first
+ * ensuring that the appropriate limits are set for the I/O scheduler to reach
+ * optimal throughput on the backend storage, and then by changing the value
+ * of zfs_delay_scale to increase the steepness of the curve.
+ */
+static void
+dmu_tx_delay(dmu_tx_t *tx, uint64_t dirty)
+{
+ dsl_pool_t *dp = tx->tx_pool;
+ uint64_t delay_min_bytes =
+ zfs_dirty_data_max * zfs_delay_min_dirty_percent / 100;
+ hrtime_t wakeup, min_tx_time, now;
+
+ if (dirty <= delay_min_bytes)
+ return;
+
+ /*
+ * The caller has already waited until we are under the max.
+ * We make them pass us the amount of dirty data so we don't
+ * have to handle the case of it being >= the max, which could
+ * cause a divide-by-zero if it's == the max.
+ */
+ ASSERT3U(dirty, <, zfs_dirty_data_max);
+
+ now = gethrtime();
+ min_tx_time = zfs_delay_scale *
+ (dirty - delay_min_bytes) / (zfs_dirty_data_max - dirty);
+ if (now > tx->tx_start + min_tx_time)
+ return;
+
+ min_tx_time = MIN(min_tx_time, zfs_delay_max_ns);
+
+ DTRACE_PROBE3(delay__mintime, dmu_tx_t *, tx, uint64_t, dirty,
+ uint64_t, min_tx_time);
+
+ mutex_enter(&dp->dp_lock);
+ wakeup = MAX(tx->tx_start + min_tx_time,
+ dp->dp_last_wakeup + min_tx_time);
+ dp->dp_last_wakeup = wakeup;
+ mutex_exit(&dp->dp_lock);
+
+#ifdef _KERNEL
+#ifdef illumos
+ mutex_enter(&curthread->t_delay_lock);
+ while (cv_timedwait_hires(&curthread->t_delay_cv,
+ &curthread->t_delay_lock, wakeup, zfs_delay_resolution_ns,
+ CALLOUT_FLAG_ABSOLUTE | CALLOUT_FLAG_ROUNDUP) > 0)
+ continue;
+ mutex_exit(&curthread->t_delay_lock);
+#else
+ pause_sbt("dmu_tx_delay", nstosbt(wakeup),
+ nstosbt(zfs_delay_resolution_ns), C_ABSOLUTE);
+#endif
+#else
+ hrtime_t delta = wakeup - gethrtime();
+ struct timespec ts;
+ ts.tv_sec = delta / NANOSEC;
+ ts.tv_nsec = delta % NANOSEC;
+ (void) nanosleep(&ts, NULL);
+#endif
+}
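+
+/*
+ * A worked example of the min_time formula above, using illustrative
+ * values for the tunables defined in dsl_pool.c (zfs_dirty_data_max =
+ * 4GB, zfs_delay_min_dirty_percent = 60, zfs_delay_scale = 500,000ns):
+ *
+ *   delay_min_bytes = 4GB * 60% = 2.4GB
+ *   dirty = 3.5GB
+ *   min_tx_time = 500000 * (3.5GB - 2.4GB) / (4GB - 3.5GB)
+ *               = 500000 * 2.2 = 1,100,000ns = 1.1ms
+ *
+ * which is then capped at zfs_delay_max_ns (100ms above).
+ */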
+
+/*
+ * This routine attempts to assign the transaction to a transaction group.
+ * To do so, we must determine if there is sufficient free space on disk.
+ *
+ * If this is a "netfree" transaction (i.e. we called dmu_tx_mark_netfree()
+ * on it), then it is assumed that there is sufficient free space,
+ * unless there's insufficient slop space in the pool (see the comment
+ * above spa_slop_shift in spa_misc.c).
+ *
+ * If it is not a "netfree" transaction, then if the data already on disk
+ * is over the allowed usage (e.g. quota), this will fail with EDQUOT or
+ * ENOSPC. Otherwise, if the current rough estimate of pending changes,
+ * plus the rough estimate of this transaction's changes, may exceed the
+ * allowed usage, then this will fail with ERESTART, which will cause the
+ * caller to wait for the pending changes to be written to disk (by waiting
+ * for the next TXG to open), and then check the space usage again.
+ *
+ * The rough estimate of pending changes consists of the sum of:
+ *
+ * - this transaction's holds' txh_space_towrite
+ *
+ * - dd_tempreserved[], which is the sum of in-flight transactions'
+ * holds' txh_space_towrite (i.e. those transactions that have called
+ * dmu_tx_assign() but not yet called dmu_tx_commit()).
+ *
+ * - dd_space_towrite[], which is the amount of dirtied dbufs.
+ *
+ * Note that all of these values are inflated by spa_get_worst_case_asize(),
+ * which means that we may get ERESTART well before we are actually in danger
+ * of running out of space, but this also mitigates any small inaccuracies
+ * in the rough estimate (e.g. txh_space_towrite doesn't take into account
+ * indirect blocks, and dd_space_towrite[] doesn't take into account changes
+ * to the MOS).
+ *
+ * Note that due to this algorithm, it is possible to exceed the allowed
+ * usage by one transaction. Also, as we approach the allowed usage,
+ * we will allow a very limited amount of changes into each TXG, thus
+ * decreasing performance.
+ */
+static int
+dmu_tx_try_assign(dmu_tx_t *tx, uint64_t txg_how)
+{
+ spa_t *spa = tx->tx_pool->dp_spa;
+
+ ASSERT0(tx->tx_txg);
+
+ if (tx->tx_err)
+ return (tx->tx_err);
+
+ if (spa_suspended(spa)) {
+ /*
+ * If the user has indicated a blocking failure mode
+ * then return ERESTART which will block in dmu_tx_wait().
+ * Otherwise, return EIO so that an error can get
+ * propagated back to the VOP calls.
+ *
+ * Note that we always honor the txg_how flag regardless
+ * of the failuremode setting.
+ */
+ if (spa_get_failmode(spa) == ZIO_FAILURE_MODE_CONTINUE &&
+ !(txg_how & TXG_WAIT))
+ return (SET_ERROR(EIO));
+
+ return (SET_ERROR(ERESTART));
+ }
+
+ if (!tx->tx_dirty_delayed &&
+ dsl_pool_need_dirty_delay(tx->tx_pool)) {
+ tx->tx_wait_dirty = B_TRUE;
+ return (SET_ERROR(ERESTART));
+ }
+
+ tx->tx_txg = txg_hold_open(tx->tx_pool, &tx->tx_txgh);
+ tx->tx_needassign_txh = NULL;
+
+ /*
+ * NB: No error returns are allowed after txg_hold_open, but
+ * before processing the dnode holds, due to the
+ * dmu_tx_unassign() logic.
+ */
+
+ uint64_t towrite = 0;
+ uint64_t tohold = 0;
+ for (dmu_tx_hold_t *txh = list_head(&tx->tx_holds); txh != NULL;
+ txh = list_next(&tx->tx_holds, txh)) {
+ dnode_t *dn = txh->txh_dnode;
+ if (dn != NULL) {
+ mutex_enter(&dn->dn_mtx);
+ if (dn->dn_assigned_txg == tx->tx_txg - 1) {
+ mutex_exit(&dn->dn_mtx);
+ tx->tx_needassign_txh = txh;
+ return (SET_ERROR(ERESTART));
+ }
+ if (dn->dn_assigned_txg == 0)
+ dn->dn_assigned_txg = tx->tx_txg;
+ ASSERT3U(dn->dn_assigned_txg, ==, tx->tx_txg);
+ (void) zfs_refcount_add(&dn->dn_tx_holds, tx);
+ mutex_exit(&dn->dn_mtx);
+ }
+ towrite += zfs_refcount_count(&txh->txh_space_towrite);
+ tohold += zfs_refcount_count(&txh->txh_memory_tohold);
+ }
+
+ /* needed allocation: worst-case estimate of write space */
+ uint64_t asize = spa_get_worst_case_asize(tx->tx_pool->dp_spa, towrite);
+ /* calculate memory footprint estimate */
+ uint64_t memory = towrite + tohold;
+
+ if (tx->tx_dir != NULL && asize != 0) {
+ int err = dsl_dir_tempreserve_space(tx->tx_dir, memory,
+ asize, tx->tx_netfree, &tx->tx_tempreserve_cookie, tx);
+ if (err != 0)
+ return (err);
+ }
+
+ return (0);
+}
+
+static void
+dmu_tx_unassign(dmu_tx_t *tx)
+{
+ if (tx->tx_txg == 0)
+ return;
+
+ txg_rele_to_quiesce(&tx->tx_txgh);
+
+ /*
+ * Walk the transaction's hold list, removing the hold on the
+ * associated dnode, and notifying waiters if the refcount drops to 0.
+ */
+ for (dmu_tx_hold_t *txh = list_head(&tx->tx_holds);
+ txh != tx->tx_needassign_txh;
+ txh = list_next(&tx->tx_holds, txh)) {
+ dnode_t *dn = txh->txh_dnode;
+
+ if (dn == NULL)
+ continue;
+ mutex_enter(&dn->dn_mtx);
+ ASSERT3U(dn->dn_assigned_txg, ==, tx->tx_txg);
+
+ if (zfs_refcount_remove(&dn->dn_tx_holds, tx) == 0) {
+ dn->dn_assigned_txg = 0;
+ cv_broadcast(&dn->dn_notxholds);
+ }
+ mutex_exit(&dn->dn_mtx);
+ }
+
+ txg_rele_to_sync(&tx->tx_txgh);
+
+ tx->tx_lasttried_txg = tx->tx_txg;
+ tx->tx_txg = 0;
+}
+
+/*
+ * Assign tx to a transaction group; txg_how is a bitmask:
+ *
+ * If TXG_WAIT is set and the currently open txg is full, this function
+ * will wait until there's a new txg. This should be used when no locks
+ * are being held. With this bit set, this function will only fail if
+ * we're truly out of space (or over quota).
+ *
+ * If TXG_WAIT is *not* set and we can't assign into the currently open
+ * txg without blocking, this function will return immediately with
+ * ERESTART. This should be used whenever locks are being held. On an
+ * ERESTART error, the caller should drop all locks, call dmu_tx_wait(),
+ * and try again.
+ *
+ * If TXG_NOTHROTTLE is set, this indicates that this tx should not be
+ * delayed due to the ZFS Write Throttle (see comments in dsl_pool.c for
+ * details on the throttle). This is used by the VFS operations, after
+ * they have already called dmu_tx_wait() (though most likely on a
+ * different tx).
+ */
+int
+dmu_tx_assign(dmu_tx_t *tx, uint64_t txg_how)
+{
+ int err;
+
+ ASSERT(tx->tx_txg == 0);
+ ASSERT0(txg_how & ~(TXG_WAIT | TXG_NOTHROTTLE));
+ ASSERT(!dsl_pool_sync_context(tx->tx_pool));
+
+ /* If we might wait, we must not hold the config lock. */
+ IMPLY((txg_how & TXG_WAIT), !dsl_pool_config_held(tx->tx_pool));
+
+ if ((txg_how & TXG_NOTHROTTLE))
+ tx->tx_dirty_delayed = B_TRUE;
+
+ while ((err = dmu_tx_try_assign(tx, txg_how)) != 0) {
+ dmu_tx_unassign(tx);
+
+ if (err != ERESTART || !(txg_how & TXG_WAIT))
+ return (err);
+
+ dmu_tx_wait(tx);
+ }
+
+ txg_rele_to_quiesce(&tx->tx_txgh);
+
+ return (0);
+}
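+
+/*
+ * A minimal sketch of the caller pattern described above; os, object,
+ * off, len, and buf are hypothetical:
+ *
+ *   dmu_tx_t *tx = dmu_tx_create(os);
+ *   dmu_tx_hold_write(tx, object, off, len);
+ *   error = dmu_tx_assign(tx, TXG_WAIT);
+ *   if (error != 0) {
+ *           dmu_tx_abort(tx);
+ *           return (error);
+ *   }
+ *   dmu_write(os, object, off, len, buf, tx);
+ *   dmu_tx_commit(tx);
+ *
+ * A caller holding locks passes 0 instead of TXG_WAIT and, on ERESTART,
+ * drops its locks, calls dmu_tx_wait() and dmu_tx_abort(), and retries
+ * with a fresh tx.
+ */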
+
+void
+dmu_tx_wait(dmu_tx_t *tx)
+{
+ spa_t *spa = tx->tx_pool->dp_spa;
+ dsl_pool_t *dp = tx->tx_pool;
+
+ ASSERT(tx->tx_txg == 0);
+ ASSERT(!dsl_pool_config_held(tx->tx_pool));
+
+ if (tx->tx_wait_dirty) {
+ /*
+ * dmu_tx_try_assign() has determined that we need to wait
+ * because we've consumed much or all of the dirty buffer
+ * space.
+ */
+ mutex_enter(&dp->dp_lock);
+ while (dp->dp_dirty_total >= zfs_dirty_data_max)
+ cv_wait(&dp->dp_spaceavail_cv, &dp->dp_lock);
+ uint64_t dirty = dp->dp_dirty_total;
+ mutex_exit(&dp->dp_lock);
+
+ dmu_tx_delay(tx, dirty);
+
+ tx->tx_wait_dirty = B_FALSE;
+
+ /*
+		 * Note: setting tx_dirty_delayed only has an effect if the
+		 * caller used TXG_WAIT. Otherwise they are going to
+		 * destroy this tx and try again. The common case,
+		 * zfs_write(), uses TXG_WAIT.
+ */
+ tx->tx_dirty_delayed = B_TRUE;
+ } else if (spa_suspended(spa) || tx->tx_lasttried_txg == 0) {
+ /*
+ * If the pool is suspended we need to wait until it
+ * is resumed. Note that it's possible that the pool
+ * has become active after this thread has tried to
+ * obtain a tx. If that's the case then tx_lasttried_txg
+ * would not have been set.
+ */
+ txg_wait_synced(dp, spa_last_synced_txg(spa) + 1);
+ } else if (tx->tx_needassign_txh) {
+ /*
+ * A dnode is assigned to the quiescing txg. Wait for its
+ * transaction to complete.
+ */
+ dnode_t *dn = tx->tx_needassign_txh->txh_dnode;
+
+ mutex_enter(&dn->dn_mtx);
+ while (dn->dn_assigned_txg == tx->tx_lasttried_txg - 1)
+ cv_wait(&dn->dn_notxholds, &dn->dn_mtx);
+ mutex_exit(&dn->dn_mtx);
+ tx->tx_needassign_txh = NULL;
+ } else {
+ /*
+ * If we have a lot of dirty data just wait until we sync
+ * out a TXG at which point we'll hopefully have synced
+ * a portion of the changes.
+ */
+ txg_wait_synced(dp, spa_last_synced_txg(spa) + 1);
+ }
+}
+
+static void
+dmu_tx_destroy(dmu_tx_t *tx)
+{
+ dmu_tx_hold_t *txh;
+
+ while ((txh = list_head(&tx->tx_holds)) != NULL) {
+ dnode_t *dn = txh->txh_dnode;
+
+ list_remove(&tx->tx_holds, txh);
+ zfs_refcount_destroy_many(&txh->txh_space_towrite,
+ zfs_refcount_count(&txh->txh_space_towrite));
+ zfs_refcount_destroy_many(&txh->txh_memory_tohold,
+ zfs_refcount_count(&txh->txh_memory_tohold));
+ kmem_free(txh, sizeof (dmu_tx_hold_t));
+ if (dn != NULL)
+ dnode_rele(dn, tx);
+ }
+
+ list_destroy(&tx->tx_callbacks);
+ list_destroy(&tx->tx_holds);
+ kmem_free(tx, sizeof (dmu_tx_t));
+}
+
+void
+dmu_tx_commit(dmu_tx_t *tx)
+{
+ ASSERT(tx->tx_txg != 0);
+
+ /*
+ * Go through the transaction's hold list and remove holds on
+ * associated dnodes, notifying waiters if no holds remain.
+ */
+ for (dmu_tx_hold_t *txh = list_head(&tx->tx_holds); txh != NULL;
+ txh = list_next(&tx->tx_holds, txh)) {
+ dnode_t *dn = txh->txh_dnode;
+
+ if (dn == NULL)
+ continue;
+
+ mutex_enter(&dn->dn_mtx);
+ ASSERT3U(dn->dn_assigned_txg, ==, tx->tx_txg);
+
+ if (zfs_refcount_remove(&dn->dn_tx_holds, tx) == 0) {
+ dn->dn_assigned_txg = 0;
+ cv_broadcast(&dn->dn_notxholds);
+ }
+ mutex_exit(&dn->dn_mtx);
+ }
+
+ if (tx->tx_tempreserve_cookie)
+ dsl_dir_tempreserve_clear(tx->tx_tempreserve_cookie, tx);
+
+ if (!list_is_empty(&tx->tx_callbacks))
+ txg_register_callbacks(&tx->tx_txgh, &tx->tx_callbacks);
+
+ if (tx->tx_anyobj == FALSE)
+ txg_rele_to_sync(&tx->tx_txgh);
+
+ dmu_tx_destroy(tx);
+}
+
+void
+dmu_tx_abort(dmu_tx_t *tx)
+{
+ ASSERT(tx->tx_txg == 0);
+
+ /*
+ * Call any registered callbacks with an error code.
+ */
+ if (!list_is_empty(&tx->tx_callbacks))
+ dmu_tx_do_callbacks(&tx->tx_callbacks, ECANCELED);
+
+ dmu_tx_destroy(tx);
+}
+
+uint64_t
+dmu_tx_get_txg(dmu_tx_t *tx)
+{
+ ASSERT(tx->tx_txg != 0);
+ return (tx->tx_txg);
+}
+
+dsl_pool_t *
+dmu_tx_pool(dmu_tx_t *tx)
+{
+ ASSERT(tx->tx_pool != NULL);
+ return (tx->tx_pool);
+}
+
+void
+dmu_tx_callback_register(dmu_tx_t *tx, dmu_tx_callback_func_t *func, void *data)
+{
+ dmu_tx_callback_t *dcb;
+
+ dcb = kmem_alloc(sizeof (dmu_tx_callback_t), KM_SLEEP);
+
+ dcb->dcb_func = func;
+ dcb->dcb_data = data;
+
+ list_insert_tail(&tx->tx_callbacks, dcb);
+}
+
+/*
+ * Call all the commit callbacks on a list, with a given error code.
+ */
+void
+dmu_tx_do_callbacks(list_t *cb_list, int error)
+{
+ dmu_tx_callback_t *dcb;
+
+ while ((dcb = list_head(cb_list)) != NULL) {
+ list_remove(cb_list, dcb);
+ dcb->dcb_func(dcb->dcb_data, error);
+ kmem_free(dcb, sizeof (dmu_tx_callback_t));
+ }
+}
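+
+/*
+ * For illustration, a caller that needs to know when its changes reach
+ * stable storage can register a callback before committing (cb_done and
+ * cb_arg are hypothetical names). cb_done(arg, error) then runs with
+ * error == 0 once the txg syncs, or with ECANCELED if the tx is aborted:
+ *
+ *   dmu_tx_callback_register(tx, cb_done, cb_arg);
+ *   dmu_tx_commit(tx);
+ */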
+
+/*
+ * Interface to hold a set of attributes; used when creating new files.
+ * attrsize is the total size of all attributes to be added during
+ * object creation.
+ *
+ * For updating/adding a single attribute, dmu_tx_hold_sa() should be used.
+ */
+
+/*
+ * Hold the necessary attribute names for attribute registration.
+ * It should be a very rare case where this is needed. If it does
+ * happen, it would only happen on the first write to the file system.
+ */
+static void
+dmu_tx_sa_registration_hold(sa_os_t *sa, dmu_tx_t *tx)
+{
+ if (!sa->sa_need_attr_registration)
+ return;
+
+ for (int i = 0; i != sa->sa_num_attrs; i++) {
+ if (!sa->sa_attr_table[i].sa_registered) {
+ if (sa->sa_reg_attr_obj)
+ dmu_tx_hold_zap(tx, sa->sa_reg_attr_obj,
+ B_TRUE, sa->sa_attr_table[i].sa_name);
+ else
+ dmu_tx_hold_zap(tx, DMU_NEW_OBJECT,
+ B_TRUE, sa->sa_attr_table[i].sa_name);
+ }
+ }
+}
+
+void
+dmu_tx_hold_spill(dmu_tx_t *tx, uint64_t object)
+{
+ dmu_tx_hold_t *txh;
+
+ txh = dmu_tx_hold_object_impl(tx, tx->tx_objset, object,
+ THT_SPILL, 0, 0);
+ if (txh != NULL)
+ (void) zfs_refcount_add_many(&txh->txh_space_towrite,
+ SPA_OLD_MAXBLOCKSIZE, FTAG);
+}
+
+void
+dmu_tx_hold_sa_create(dmu_tx_t *tx, int attrsize)
+{
+ sa_os_t *sa = tx->tx_objset->os_sa;
+
+ dmu_tx_hold_bonus(tx, DMU_NEW_OBJECT);
+
+ if (tx->tx_objset->os_sa->sa_master_obj == 0)
+ return;
+
+ if (tx->tx_objset->os_sa->sa_layout_attr_obj) {
+ dmu_tx_hold_zap(tx, sa->sa_layout_attr_obj, B_TRUE, NULL);
+ } else {
+ dmu_tx_hold_zap(tx, sa->sa_master_obj, B_TRUE, SA_LAYOUTS);
+ dmu_tx_hold_zap(tx, sa->sa_master_obj, B_TRUE, SA_REGISTRY);
+ dmu_tx_hold_zap(tx, DMU_NEW_OBJECT, B_TRUE, NULL);
+ dmu_tx_hold_zap(tx, DMU_NEW_OBJECT, B_TRUE, NULL);
+ }
+
+ dmu_tx_sa_registration_hold(sa, tx);
+
+ if (attrsize <= DN_OLD_MAX_BONUSLEN && !sa->sa_force_spill)
+ return;
+
+ (void) dmu_tx_hold_object_impl(tx, tx->tx_objset, DMU_NEW_OBJECT,
+ THT_SPILL, 0, 0);
+}
+
+/*
+ * Hold an SA attribute.
+ *
+ * dmu_tx_hold_sa(dmu_tx_t *tx, sa_handle_t *hdl, boolean_t may_grow)
+ *
+ * may_grow indicates whether the write may grow variable-sized
+ * attributes; if so, the layout attribute ZAP and possibly the spill
+ * block must be held as well, since the data may no longer fit in the
+ * bonus buffer.
+ */
+void
+dmu_tx_hold_sa(dmu_tx_t *tx, sa_handle_t *hdl, boolean_t may_grow)
+{
+ uint64_t object;
+ sa_os_t *sa = tx->tx_objset->os_sa;
+
+ ASSERT(hdl != NULL);
+
+ object = sa_handle_object(hdl);
+
+ dmu_tx_hold_bonus(tx, object);
+
+ if (tx->tx_objset->os_sa->sa_master_obj == 0)
+ return;
+
+ if (tx->tx_objset->os_sa->sa_reg_attr_obj == 0 ||
+ tx->tx_objset->os_sa->sa_layout_attr_obj == 0) {
+ dmu_tx_hold_zap(tx, sa->sa_master_obj, B_TRUE, SA_LAYOUTS);
+ dmu_tx_hold_zap(tx, sa->sa_master_obj, B_TRUE, SA_REGISTRY);
+ dmu_tx_hold_zap(tx, DMU_NEW_OBJECT, B_TRUE, NULL);
+ dmu_tx_hold_zap(tx, DMU_NEW_OBJECT, B_TRUE, NULL);
+ }
+
+ dmu_tx_sa_registration_hold(sa, tx);
+
+ if (may_grow && tx->tx_objset->os_sa->sa_layout_attr_obj)
+ dmu_tx_hold_zap(tx, sa->sa_layout_attr_obj, B_TRUE, NULL);
+
+ if (sa->sa_force_spill || may_grow || hdl->sa_spill) {
+ ASSERT(tx->tx_txg == 0);
+ dmu_tx_hold_spill(tx, object);
+ } else {
+ dmu_buf_impl_t *db = (dmu_buf_impl_t *)hdl->sa_bonus;
+ dnode_t *dn;
+
+ DB_DNODE_ENTER(db);
+ dn = DB_DNODE(db);
+ if (dn->dn_have_spill) {
+ ASSERT(tx->tx_txg == 0);
+ dmu_tx_hold_spill(tx, object);
+ }
+ DB_DNODE_EXIT(db);
+ }
+}
diff --git a/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/dmu_zfetch.c b/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/dmu_zfetch.c
new file mode 100644
index 000000000000..229032530e86
--- /dev/null
+++ b/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/dmu_zfetch.c
@@ -0,0 +1,374 @@
+/*
+ * CDDL HEADER START
+ *
+ * The contents of this file are subject to the terms of the
+ * Common Development and Distribution License (the "License").
+ * You may not use this file except in compliance with the License.
+ *
+ * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
+ * or http://www.opensolaris.org/os/licensing.
+ * See the License for the specific language governing permissions
+ * and limitations under the License.
+ *
+ * When distributing Covered Code, include this CDDL HEADER in each
+ * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
+ * If applicable, add the following below this CDDL HEADER, with the
+ * fields enclosed by brackets "[]" replaced with your own identifying
+ * information: Portions Copyright [yyyy] [name of copyright owner]
+ *
+ * CDDL HEADER END
+ */
+/*
+ * Copyright 2009 Sun Microsystems, Inc. All rights reserved.
+ * Use is subject to license terms.
+ */
+
+/*
+ * Copyright (c) 2013, 2015 by Delphix. All rights reserved.
+ */
+
+#include <sys/zfs_context.h>
+#include <sys/dnode.h>
+#include <sys/dmu_objset.h>
+#include <sys/dmu_zfetch.h>
+#include <sys/dmu.h>
+#include <sys/dbuf.h>
+#include <sys/kstat.h>
+
+/*
+ * This tunable disables predictive prefetch. Note that it leaves "prescient"
+ * prefetch (e.g. prefetch for zfs send) intact. Unlike predictive prefetch,
+ * prescient prefetch never issues i/os that end up not being needed,
+ * so it can't hurt performance.
+ */
+boolean_t zfs_prefetch_disable = B_FALSE;
+
+/* max # of streams per zfetch */
+uint32_t zfetch_max_streams = 8;
+/* min time before stream reclaim */
+uint32_t zfetch_min_sec_reap = 2;
+/* max bytes to prefetch per stream (default 8MB) */
+uint32_t zfetch_max_distance = 8 * 1024 * 1024;
+/* max bytes to prefetch indirects for per stream (default 64MB) */
+uint32_t zfetch_max_idistance = 64 * 1024 * 1024;
+/* max number of bytes in an array_read in which we allow prefetching (1MB) */
+uint64_t zfetch_array_rd_sz = 1024 * 1024;
+
+SYSCTL_DECL(_vfs_zfs);
+SYSCTL_INT(_vfs_zfs, OID_AUTO, prefetch_disable, CTLFLAG_RW,
+ &zfs_prefetch_disable, 0, "Disable prefetch");
+SYSCTL_NODE(_vfs_zfs, OID_AUTO, zfetch, CTLFLAG_RW | CTLFLAG_MPSAFE, 0,
+ "ZFS ZFETCH");
+SYSCTL_UINT(_vfs_zfs_zfetch, OID_AUTO, max_streams, CTLFLAG_RWTUN,
+ &zfetch_max_streams, 0, "Max # of streams per zfetch");
+SYSCTL_UINT(_vfs_zfs_zfetch, OID_AUTO, min_sec_reap, CTLFLAG_RWTUN,
+ &zfetch_min_sec_reap, 0, "Min time before stream reclaim");
+SYSCTL_UINT(_vfs_zfs_zfetch, OID_AUTO, max_distance, CTLFLAG_RWTUN,
+ &zfetch_max_distance, 0, "Max bytes to prefetch per stream");
+SYSCTL_UINT(_vfs_zfs_zfetch, OID_AUTO, max_idistance, CTLFLAG_RWTUN,
+ &zfetch_max_idistance, 0, "Max bytes to prefetch indirects for per stream");
+SYSCTL_UQUAD(_vfs_zfs_zfetch, OID_AUTO, array_rd_sz, CTLFLAG_RWTUN,
+ &zfetch_array_rd_sz, 0,
+ "Number of bytes in a array_read at which we stop prefetching");
+
+typedef struct zfetch_stats {
+ kstat_named_t zfetchstat_hits;
+ kstat_named_t zfetchstat_misses;
+ kstat_named_t zfetchstat_max_streams;
+} zfetch_stats_t;
+
+static zfetch_stats_t zfetch_stats = {
+ { "hits", KSTAT_DATA_UINT64 },
+ { "misses", KSTAT_DATA_UINT64 },
+ { "max_streams", KSTAT_DATA_UINT64 },
+};
+
+#define ZFETCHSTAT_BUMP(stat) \
+ atomic_inc_64(&zfetch_stats.stat.value.ui64);
+
+kstat_t *zfetch_ksp;
+
+void
+zfetch_init(void)
+{
+ zfetch_ksp = kstat_create("zfs", 0, "zfetchstats", "misc",
+ KSTAT_TYPE_NAMED, sizeof (zfetch_stats) / sizeof (kstat_named_t),
+ KSTAT_FLAG_VIRTUAL);
+
+ if (zfetch_ksp != NULL) {
+ zfetch_ksp->ks_data = &zfetch_stats;
+ kstat_install(zfetch_ksp);
+ }
+}
+
+void
+zfetch_fini(void)
+{
+ if (zfetch_ksp != NULL) {
+ kstat_delete(zfetch_ksp);
+ zfetch_ksp = NULL;
+ }
+}
+
+/*
+ * This takes a pointer to a zfetch structure and a dnode. It performs the
+ * necessary setup for the zfetch structure, grokking data from the
+ * associated dnode.
+ */
+void
+dmu_zfetch_init(zfetch_t *zf, dnode_t *dno)
+{
+ if (zf == NULL)
+ return;
+
+ zf->zf_dnode = dno;
+
+ list_create(&zf->zf_stream, sizeof (zstream_t),
+ offsetof(zstream_t, zs_node));
+
+ rw_init(&zf->zf_rwlock, NULL, RW_DEFAULT, NULL);
+}
+
+static void
+dmu_zfetch_stream_remove(zfetch_t *zf, zstream_t *zs)
+{
+ ASSERT(RW_WRITE_HELD(&zf->zf_rwlock));
+ list_remove(&zf->zf_stream, zs);
+ mutex_destroy(&zs->zs_lock);
+ kmem_free(zs, sizeof (*zs));
+}
+
+/*
+ * Clean-up state associated with a zfetch structure (e.g. destroy the
+ * streams). This doesn't free the zfetch_t itself; that's left to the caller.
+ */
+void
+dmu_zfetch_fini(zfetch_t *zf)
+{
+ zstream_t *zs;
+
+ ASSERT(!RW_LOCK_HELD(&zf->zf_rwlock));
+
+ rw_enter(&zf->zf_rwlock, RW_WRITER);
+ while ((zs = list_head(&zf->zf_stream)) != NULL)
+ dmu_zfetch_stream_remove(zf, zs);
+ rw_exit(&zf->zf_rwlock);
+ list_destroy(&zf->zf_stream);
+ rw_destroy(&zf->zf_rwlock);
+
+ zf->zf_dnode = NULL;
+}
+
+/*
+ * If there aren't too many streams already, create a new stream.
+ * The "blkid" argument is the next block that we expect this stream to access.
+ * While we're here, clean up old streams (which haven't been
+ * accessed for at least zfetch_min_sec_reap seconds).
+ */
+static void
+dmu_zfetch_stream_create(zfetch_t *zf, uint64_t blkid)
+{
+ zstream_t *zs_next;
+ int numstreams = 0;
+
+ ASSERT(RW_WRITE_HELD(&zf->zf_rwlock));
+
+ /*
+ * Clean up old streams.
+ */
+ for (zstream_t *zs = list_head(&zf->zf_stream);
+ zs != NULL; zs = zs_next) {
+ zs_next = list_next(&zf->zf_stream, zs);
+ if (((gethrtime() - zs->zs_atime) / NANOSEC) >
+ zfetch_min_sec_reap)
+ dmu_zfetch_stream_remove(zf, zs);
+ else
+ numstreams++;
+ }
+
+ /*
+ * The maximum number of streams is normally zfetch_max_streams,
+ * but for small files we lower it such that it's at least possible
+ * for all the streams to be non-overlapping.
+ *
+ * If we are already at the maximum number of streams for this file,
+ * even after removing old streams, then don't create this stream.
+ */
+ uint32_t max_streams = MAX(1, MIN(zfetch_max_streams,
+ zf->zf_dnode->dn_maxblkid * zf->zf_dnode->dn_datablksz /
+ zfetch_max_distance));
+ if (numstreams >= max_streams) {
+ ZFETCHSTAT_BUMP(zfetchstat_max_streams);
+ return;
+ }
+
+ zstream_t *zs = kmem_zalloc(sizeof (*zs), KM_SLEEP);
+ zs->zs_blkid = blkid;
+ zs->zs_pf_blkid = blkid;
+ zs->zs_ipf_blkid = blkid;
+ zs->zs_atime = gethrtime();
+ mutex_init(&zs->zs_lock, NULL, MUTEX_DEFAULT, NULL);
+
+ list_insert_head(&zf->zf_stream, zs);
+}
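+
+/*
+ * For example (illustrative numbers): with the defaults above,
+ * zfetch_max_streams = 8 and zfetch_max_distance = 8MB, a ~32MB file of
+ * 128KB blocks (dn_maxblkid ~ 255) is clamped to 255 * 128KB / 8MB = 3
+ * streams, while any file smaller than one prefetch distance still gets
+ * the MAX(1, ...) minimum of a single stream.
+ */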
+
+/*
+ * This is the predictive prefetch entry point. It associates the dnode
+ * access specified by the blkid and nblks arguments with a prefetch
+ * stream, predicts further accesses based on that stream's statistics,
+ * and initiates speculative prefetch.
+ * The fetch_data argument specifies whether actual data blocks should be
+ * fetched:
+ * FALSE -- prefetch only indirect blocks for predicted data blocks;
+ * TRUE -- prefetch predicted data blocks plus following indirect blocks.
+ */
+void
+dmu_zfetch(zfetch_t *zf, uint64_t blkid, uint64_t nblks, boolean_t fetch_data)
+{
+ zstream_t *zs;
+ int64_t pf_start, ipf_start, ipf_istart, ipf_iend;
+ int64_t pf_ahead_blks, max_blks;
+ int epbs, max_dist_blks, pf_nblks, ipf_nblks;
+ uint64_t end_of_access_blkid = blkid + nblks;
+ spa_t *spa = zf->zf_dnode->dn_objset->os_spa;
+
+ if (zfs_prefetch_disable)
+ return;
+
+ /*
+ * If we haven't yet loaded the indirect vdevs' mappings, we
+ * can only read from blocks that we carefully ensure are on
+ * concrete vdevs (or previously-loaded indirect vdevs). So we
+ * can't allow the predictive prefetcher to attempt reads of other
+	 * blocks (e.g. of the MOS's dnode object).
+ */
+ if (!spa_indirect_vdevs_loaded(spa))
+ return;
+
+ /*
+ * As a fast path for small (single-block) files, ignore access
+ * to the first block.
+ */
+ if (blkid == 0)
+ return;
+
+ rw_enter(&zf->zf_rwlock, RW_READER);
+
+ /*
+ * Find matching prefetch stream. Depending on whether the accesses
+ * are block-aligned, first block of the new access may either follow
+ * the last block of the previous access, or be equal to it.
+ */
+ for (zs = list_head(&zf->zf_stream); zs != NULL;
+ zs = list_next(&zf->zf_stream, zs)) {
+ if (blkid == zs->zs_blkid || blkid + 1 == zs->zs_blkid) {
+ mutex_enter(&zs->zs_lock);
+ /*
+ * zs_blkid could have changed before we
+			 * acquired zs_lock; re-check it here.
+ */
+ if (blkid == zs->zs_blkid) {
+ break;
+ } else if (blkid + 1 == zs->zs_blkid) {
+ blkid++;
+ nblks--;
+ if (nblks == 0) {
+ /* Already prefetched this before. */
+ mutex_exit(&zs->zs_lock);
+ rw_exit(&zf->zf_rwlock);
+ return;
+ }
+ break;
+ }
+ mutex_exit(&zs->zs_lock);
+ }
+ }
+
+ if (zs == NULL) {
+ /*
+ * This access is not part of any existing stream. Create
+ * a new stream for it.
+ */
+ ZFETCHSTAT_BUMP(zfetchstat_misses);
+ if (rw_tryupgrade(&zf->zf_rwlock))
+ dmu_zfetch_stream_create(zf, end_of_access_blkid);
+ rw_exit(&zf->zf_rwlock);
+ return;
+ }
+
+ /*
+ * This access was to a block that we issued a prefetch for on
+ * behalf of this stream. Issue further prefetches for this stream.
+ *
+ * Normally, we start prefetching where we stopped
+ * prefetching last (zs_pf_blkid). But when we get our first
+	 * hit on this stream (zs_pf_blkid == zs_blkid), we don't
+ * want to prefetch the block we just accessed. In this case,
+ * start just after the block we just accessed.
+ */
+ pf_start = MAX(zs->zs_pf_blkid, end_of_access_blkid);
+
+ /*
+ * Double our amount of prefetched data, but don't let the
+ * prefetch get further ahead than zfetch_max_distance.
+ */
+ if (fetch_data) {
+ max_dist_blks =
+ zfetch_max_distance >> zf->zf_dnode->dn_datablkshift;
+ /*
+ * Previously, we were (zs_pf_blkid - blkid) ahead. We
+ * want to now be double that, so read that amount again,
+ * plus the amount we are catching up by (i.e. the amount
+ * read just now).
+ */
+ pf_ahead_blks = zs->zs_pf_blkid - blkid + nblks;
+ max_blks = max_dist_blks - (pf_start - end_of_access_blkid);
+ pf_nblks = MIN(pf_ahead_blks, max_blks);
+ } else {
+ pf_nblks = 0;
+ }
+
+ zs->zs_pf_blkid = pf_start + pf_nblks;
+
+ /*
+ * Do the same for indirects, starting from where we stopped last,
+ * or where we will stop reading data blocks (and the indirects
+ * that point to them).
+ */
+ ipf_start = MAX(zs->zs_ipf_blkid, zs->zs_pf_blkid);
+ max_dist_blks = zfetch_max_idistance >> zf->zf_dnode->dn_datablkshift;
+ /*
+ * We want to double our distance ahead of the data prefetch
+ * (or reader, if we are not prefetching data). Previously, we
+ * were (zs_ipf_blkid - blkid) ahead. To double that, we read
+ * that amount again, plus the amount we are catching up by
+ * (i.e. the amount read now + the amount of data prefetched now).
+ */
+ pf_ahead_blks = zs->zs_ipf_blkid - blkid + nblks + pf_nblks;
+ max_blks = max_dist_blks - (ipf_start - end_of_access_blkid);
+ ipf_nblks = MIN(pf_ahead_blks, max_blks);
+ zs->zs_ipf_blkid = ipf_start + ipf_nblks;
+
+ epbs = zf->zf_dnode->dn_indblkshift - SPA_BLKPTRSHIFT;
+ ipf_istart = P2ROUNDUP(ipf_start, 1 << epbs) >> epbs;
+ ipf_iend = P2ROUNDUP(zs->zs_ipf_blkid, 1 << epbs) >> epbs;
+
+ zs->zs_atime = gethrtime();
+ zs->zs_blkid = end_of_access_blkid;
+ mutex_exit(&zs->zs_lock);
+ rw_exit(&zf->zf_rwlock);
+
+ /*
+ * dbuf_prefetch() is asynchronous (even when it needs to read
+ * indirect blocks), but we still prefer to drop our locks before
+ * calling it to reduce the time we hold them.
+ */
+
+ for (int i = 0; i < pf_nblks; i++) {
+ dbuf_prefetch(zf->zf_dnode, 0, pf_start + i,
+ ZIO_PRIORITY_ASYNC_READ, ARC_FLAG_PREDICTIVE_PREFETCH);
+ }
+ for (int64_t iblk = ipf_istart; iblk < ipf_iend; iblk++) {
+ dbuf_prefetch(zf->zf_dnode, 1, iblk,
+ ZIO_PRIORITY_ASYNC_READ, ARC_FLAG_PREDICTIVE_PREFETCH);
+ }
+ ZFETCHSTAT_BUMP(zfetchstat_hits);
+}
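+
+/*
+ * A worked example of the distance doubling above, with hypothetical
+ * numbers: a stream has zs_blkid = 10 and zs_pf_blkid = 12, and the
+ * reader accesses blkid = 10 with nblks = 2 and fetch_data == TRUE.
+ * Then end_of_access_blkid = 12, pf_start = MAX(12, 12) = 12, and
+ * pf_ahead_blks = 12 - 10 + 2 = 4, so (ignoring the cap) data blocks
+ * 12..15 are prefetched and zs_pf_blkid becomes 16. The stream was 2
+ * blocks ahead and is now 4 ahead: each hit doubles the prefetch
+ * distance until zfetch_max_distance caps it.
+ */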
diff --git a/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/dnode.c b/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/dnode.c
new file mode 100644
index 000000000000..50a7338fb9e8
--- /dev/null
+++ b/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/dnode.c
@@ -0,0 +1,2418 @@
+/*
+ * CDDL HEADER START
+ *
+ * The contents of this file are subject to the terms of the
+ * Common Development and Distribution License (the "License").
+ * You may not use this file except in compliance with the License.
+ *
+ * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
+ * or http://www.opensolaris.org/os/licensing.
+ * See the License for the specific language governing permissions
+ * and limitations under the License.
+ *
+ * When distributing Covered Code, include this CDDL HEADER in each
+ * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
+ * If applicable, add the following below this CDDL HEADER, with the
+ * fields enclosed by brackets "[]" replaced with your own identifying
+ * information: Portions Copyright [yyyy] [name of copyright owner]
+ *
+ * CDDL HEADER END
+ */
+/*
+ * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved.
+ * Copyright (c) 2012, 2017 by Delphix. All rights reserved.
+ * Copyright (c) 2014 Spectra Logic Corporation, All rights reserved.
+ * Copyright (c) 2014 Integros [integros.com]
+ * Copyright 2017 RackTop Systems.
+ */
+
+#include <sys/zfs_context.h>
+#include <sys/dbuf.h>
+#include <sys/dnode.h>
+#include <sys/dmu.h>
+#include <sys/dmu_impl.h>
+#include <sys/dmu_tx.h>
+#include <sys/dmu_objset.h>
+#include <sys/dsl_dir.h>
+#include <sys/dsl_dataset.h>
+#include <sys/spa.h>
+#include <sys/zio.h>
+#include <sys/dmu_zfetch.h>
+#include <sys/range_tree.h>
+
+dnode_stats_t dnode_stats = {
+ { "dnode_hold_dbuf_hold", KSTAT_DATA_UINT64 },
+ { "dnode_hold_dbuf_read", KSTAT_DATA_UINT64 },
+ { "dnode_hold_alloc_hits", KSTAT_DATA_UINT64 },
+ { "dnode_hold_alloc_misses", KSTAT_DATA_UINT64 },
+ { "dnode_hold_alloc_interior", KSTAT_DATA_UINT64 },
+ { "dnode_hold_alloc_lock_retry", KSTAT_DATA_UINT64 },
+ { "dnode_hold_alloc_lock_misses", KSTAT_DATA_UINT64 },
+ { "dnode_hold_alloc_type_none", KSTAT_DATA_UINT64 },
+ { "dnode_hold_free_hits", KSTAT_DATA_UINT64 },
+ { "dnode_hold_free_misses", KSTAT_DATA_UINT64 },
+ { "dnode_hold_free_lock_misses", KSTAT_DATA_UINT64 },
+ { "dnode_hold_free_lock_retry", KSTAT_DATA_UINT64 },
+ { "dnode_hold_free_overflow", KSTAT_DATA_UINT64 },
+ { "dnode_hold_free_refcount", KSTAT_DATA_UINT64 },
+ { "dnode_hold_free_txg", KSTAT_DATA_UINT64 },
+ { "dnode_free_interior_lock_retry", KSTAT_DATA_UINT64 },
+ { "dnode_allocate", KSTAT_DATA_UINT64 },
+ { "dnode_reallocate", KSTAT_DATA_UINT64 },
+ { "dnode_buf_evict", KSTAT_DATA_UINT64 },
+ { "dnode_alloc_next_chunk", KSTAT_DATA_UINT64 },
+ { "dnode_alloc_race", KSTAT_DATA_UINT64 },
+ { "dnode_alloc_next_block", KSTAT_DATA_UINT64 },
+ { "dnode_move_invalid", KSTAT_DATA_UINT64 },
+ { "dnode_move_recheck1", KSTAT_DATA_UINT64 },
+ { "dnode_move_recheck2", KSTAT_DATA_UINT64 },
+ { "dnode_move_special", KSTAT_DATA_UINT64 },
+ { "dnode_move_handle", KSTAT_DATA_UINT64 },
+ { "dnode_move_rwlock", KSTAT_DATA_UINT64 },
+ { "dnode_move_active", KSTAT_DATA_UINT64 },
+};
+
+static kstat_t *dnode_ksp;
+static kmem_cache_t *dnode_cache;
+
+static dnode_phys_t dnode_phys_zero;
+
+int zfs_default_bs = SPA_MINBLOCKSHIFT;
+int zfs_default_ibs = DN_MAX_INDBLKSHIFT;
+
+SYSCTL_DECL(_vfs_zfs);
+SYSCTL_INT(_vfs_zfs, OID_AUTO, default_bs, CTLFLAG_RWTUN,
+ &zfs_default_bs, 0, "Default dnode block shift");
+SYSCTL_INT(_vfs_zfs, OID_AUTO, default_ibs, CTLFLAG_RWTUN,
+ &zfs_default_ibs, 0, "Default dnode indirect block shift");
+
+#ifdef illumos
+#ifdef _KERNEL
+static kmem_cbrc_t dnode_move(void *, void *, size_t, void *);
+#endif /* _KERNEL */
+#endif
+
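+/*
+ * Order the dnode's dbufs (the dn_dbufs AVL tree) by indirection level,
+ * then by block id. A dbuf in the transient DB_SEARCH state is a sentinel
+ * used by range searches elsewhere in this file, so it sorts ahead of any
+ * real dbuf with the same level/blkid; remaining ties are broken by
+ * pointer value to keep the ordering total.
+ */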
+static int
+dbuf_compare(const void *x1, const void *x2)
+{
+ const dmu_buf_impl_t *d1 = x1;
+ const dmu_buf_impl_t *d2 = x2;
+
+ int cmp = AVL_CMP(d1->db_level, d2->db_level);
+ if (likely(cmp))
+ return (cmp);
+
+ cmp = AVL_CMP(d1->db_blkid, d2->db_blkid);
+ if (likely(cmp))
+ return (cmp);
+
+ if (d1->db_state == DB_SEARCH) {
+ ASSERT3S(d2->db_state, !=, DB_SEARCH);
+ return (-1);
+ } else if (d2->db_state == DB_SEARCH) {
+ ASSERT3S(d1->db_state, !=, DB_SEARCH);
+ return (1);
+ }
+
+ return (AVL_PCMP(d1, d2));
+}
+
+/* ARGSUSED */
+static int
+dnode_cons(void *arg, void *unused, int kmflag)
+{
+ dnode_t *dn = arg;
+ int i;
+
+ rw_init(&dn->dn_struct_rwlock, NULL, RW_DEFAULT, NULL);
+ mutex_init(&dn->dn_mtx, NULL, MUTEX_DEFAULT, NULL);
+ mutex_init(&dn->dn_dbufs_mtx, NULL, MUTEX_DEFAULT, NULL);
+ cv_init(&dn->dn_notxholds, NULL, CV_DEFAULT, NULL);
+
+ /*
+ * Every dbuf has a reference, and dropping a tracked reference is
+ * O(number of references), so don't track dn_holds.
+ */
+ zfs_refcount_create_untracked(&dn->dn_holds);
+ zfs_refcount_create(&dn->dn_tx_holds);
+ list_link_init(&dn->dn_link);
+
+ bzero(&dn->dn_next_nblkptr[0], sizeof (dn->dn_next_nblkptr));
+ bzero(&dn->dn_next_nlevels[0], sizeof (dn->dn_next_nlevels));
+ bzero(&dn->dn_next_indblkshift[0], sizeof (dn->dn_next_indblkshift));
+ bzero(&dn->dn_next_bonustype[0], sizeof (dn->dn_next_bonustype));
+ bzero(&dn->dn_rm_spillblk[0], sizeof (dn->dn_rm_spillblk));
+ bzero(&dn->dn_next_bonuslen[0], sizeof (dn->dn_next_bonuslen));
+ bzero(&dn->dn_next_blksz[0], sizeof (dn->dn_next_blksz));
+
+ for (i = 0; i < TXG_SIZE; i++) {
+ multilist_link_init(&dn->dn_dirty_link[i]);
+ dn->dn_free_ranges[i] = NULL;
+ list_create(&dn->dn_dirty_records[i],
+ sizeof (dbuf_dirty_record_t),
+ offsetof(dbuf_dirty_record_t, dr_dirty_node));
+ }
+
+ dn->dn_allocated_txg = 0;
+ dn->dn_free_txg = 0;
+ dn->dn_assigned_txg = 0;
+ dn->dn_dirty_txg = 0;
+ dn->dn_dirtyctx = 0;
+ dn->dn_dirtyctx_firstset = NULL;
+ dn->dn_bonus = NULL;
+ dn->dn_have_spill = B_FALSE;
+ dn->dn_zio = NULL;
+ dn->dn_oldused = 0;
+ dn->dn_oldflags = 0;
+ dn->dn_olduid = 0;
+ dn->dn_oldgid = 0;
+ dn->dn_newuid = 0;
+ dn->dn_newgid = 0;
+ dn->dn_id_flags = 0;
+
+ dn->dn_dbufs_count = 0;
+ avl_create(&dn->dn_dbufs, dbuf_compare, sizeof (dmu_buf_impl_t),
+ offsetof(dmu_buf_impl_t, db_link));
+
+ dn->dn_moved = 0;
+ POINTER_INVALIDATE(&dn->dn_objset);
+ return (0);
+}
+
+/* ARGSUSED */
+static void
+dnode_dest(void *arg, void *unused)
+{
+ int i;
+ dnode_t *dn = arg;
+
+ rw_destroy(&dn->dn_struct_rwlock);
+ mutex_destroy(&dn->dn_mtx);
+ mutex_destroy(&dn->dn_dbufs_mtx);
+ cv_destroy(&dn->dn_notxholds);
+ zfs_refcount_destroy(&dn->dn_holds);
+ zfs_refcount_destroy(&dn->dn_tx_holds);
+ ASSERT(!list_link_active(&dn->dn_link));
+
+ for (i = 0; i < TXG_SIZE; i++) {
+ ASSERT(!multilist_link_active(&dn->dn_dirty_link[i]));
+ ASSERT3P(dn->dn_free_ranges[i], ==, NULL);
+ list_destroy(&dn->dn_dirty_records[i]);
+ ASSERT0(dn->dn_next_nblkptr[i]);
+ ASSERT0(dn->dn_next_nlevels[i]);
+ ASSERT0(dn->dn_next_indblkshift[i]);
+ ASSERT0(dn->dn_next_bonustype[i]);
+ ASSERT0(dn->dn_rm_spillblk[i]);
+ ASSERT0(dn->dn_next_bonuslen[i]);
+ ASSERT0(dn->dn_next_blksz[i]);
+ }
+
+ ASSERT0(dn->dn_allocated_txg);
+ ASSERT0(dn->dn_free_txg);
+ ASSERT0(dn->dn_assigned_txg);
+ ASSERT0(dn->dn_dirty_txg);
+ ASSERT0(dn->dn_dirtyctx);
+ ASSERT3P(dn->dn_dirtyctx_firstset, ==, NULL);
+ ASSERT3P(dn->dn_bonus, ==, NULL);
+ ASSERT(!dn->dn_have_spill);
+ ASSERT3P(dn->dn_zio, ==, NULL);
+ ASSERT0(dn->dn_oldused);
+ ASSERT0(dn->dn_oldflags);
+ ASSERT0(dn->dn_olduid);
+ ASSERT0(dn->dn_oldgid);
+ ASSERT0(dn->dn_newuid);
+ ASSERT0(dn->dn_newgid);
+ ASSERT0(dn->dn_id_flags);
+
+ ASSERT0(dn->dn_dbufs_count);
+ avl_destroy(&dn->dn_dbufs);
+}
+
+void
+dnode_init(void)
+{
+ ASSERT(dnode_cache == NULL);
+ dnode_cache = kmem_cache_create("dnode_t",
+ sizeof (dnode_t),
+ 0, dnode_cons, dnode_dest, NULL, NULL, NULL, 0);
+#ifdef _KERNEL
+ kmem_cache_set_move(dnode_cache, dnode_move);
+
+ dnode_ksp = kstat_create("zfs", 0, "dnodestats", "misc",
+ KSTAT_TYPE_NAMED, sizeof (dnode_stats) / sizeof (kstat_named_t),
+ KSTAT_FLAG_VIRTUAL);
+ if (dnode_ksp != NULL) {
+ dnode_ksp->ks_data = &dnode_stats;
+ kstat_install(dnode_ksp);
+ }
+#endif /* _KERNEL */
+}
+
+void
+dnode_fini(void)
+{
+ if (dnode_ksp != NULL) {
+ kstat_delete(dnode_ksp);
+ dnode_ksp = NULL;
+ }
+
+ kmem_cache_destroy(dnode_cache);
+ dnode_cache = NULL;
+}
+
+#ifdef ZFS_DEBUG
+void
+dnode_verify(dnode_t *dn)
+{
+ int drop_struct_lock = FALSE;
+
+ ASSERT(dn->dn_phys);
+ ASSERT(dn->dn_objset);
+ ASSERT(dn->dn_handle->dnh_dnode == dn);
+
+ ASSERT(DMU_OT_IS_VALID(dn->dn_phys->dn_type));
+
+ if (!(zfs_flags & ZFS_DEBUG_DNODE_VERIFY))
+ return;
+
+ if (!RW_WRITE_HELD(&dn->dn_struct_rwlock)) {
+ rw_enter(&dn->dn_struct_rwlock, RW_READER);
+ drop_struct_lock = TRUE;
+ }
+ if (dn->dn_phys->dn_type != DMU_OT_NONE || dn->dn_allocated_txg != 0) {
+ int i;
+ int max_bonuslen = DN_SLOTS_TO_BONUSLEN(dn->dn_num_slots);
+ ASSERT3U(dn->dn_indblkshift, >=, 0);
+ ASSERT3U(dn->dn_indblkshift, <=, SPA_MAXBLOCKSHIFT);
+ if (dn->dn_datablkshift) {
+ ASSERT3U(dn->dn_datablkshift, >=, SPA_MINBLOCKSHIFT);
+ ASSERT3U(dn->dn_datablkshift, <=, SPA_MAXBLOCKSHIFT);
+ ASSERT3U(1<<dn->dn_datablkshift, ==, dn->dn_datablksz);
+ }
+ ASSERT3U(dn->dn_nlevels, <=, 30);
+ ASSERT(DMU_OT_IS_VALID(dn->dn_type));
+ ASSERT3U(dn->dn_nblkptr, >=, 1);
+ ASSERT3U(dn->dn_nblkptr, <=, DN_MAX_NBLKPTR);
+ ASSERT3U(dn->dn_bonuslen, <=, max_bonuslen);
+ ASSERT3U(dn->dn_datablksz, ==,
+ dn->dn_datablkszsec << SPA_MINBLOCKSHIFT);
+ ASSERT3U(ISP2(dn->dn_datablksz), ==, dn->dn_datablkshift != 0);
+ ASSERT3U((dn->dn_nblkptr - 1) * sizeof (blkptr_t) +
+ dn->dn_bonuslen, <=, max_bonuslen);
+ for (i = 0; i < TXG_SIZE; i++) {
+ ASSERT3U(dn->dn_next_nlevels[i], <=, dn->dn_nlevels);
+ }
+ }
+ if (dn->dn_phys->dn_type != DMU_OT_NONE)
+ ASSERT3U(dn->dn_phys->dn_nlevels, <=, dn->dn_nlevels);
+ ASSERT(DMU_OBJECT_IS_SPECIAL(dn->dn_object) || dn->dn_dbuf != NULL);
+ if (dn->dn_dbuf != NULL) {
+ ASSERT3P(dn->dn_phys, ==,
+ (dnode_phys_t *)dn->dn_dbuf->db.db_data +
+ (dn->dn_object % (dn->dn_dbuf->db.db_size >> DNODE_SHIFT)));
+ }
+ if (drop_struct_lock)
+ rw_exit(&dn->dn_struct_rwlock);
+}
+#endif
+
+void
+dnode_byteswap(dnode_phys_t *dnp)
+{
+ uint64_t *buf64 = (void*)&dnp->dn_blkptr;
+ int i;
+
+ if (dnp->dn_type == DMU_OT_NONE) {
+ bzero(dnp, sizeof (dnode_phys_t));
+ return;
+ }
+
+ dnp->dn_datablkszsec = BSWAP_16(dnp->dn_datablkszsec);
+ dnp->dn_bonuslen = BSWAP_16(dnp->dn_bonuslen);
+ dnp->dn_extra_slots = BSWAP_8(dnp->dn_extra_slots);
+ dnp->dn_maxblkid = BSWAP_64(dnp->dn_maxblkid);
+ dnp->dn_used = BSWAP_64(dnp->dn_used);
+
+ /*
+ * dn_nblkptr is only one byte, so it's OK to read it in either
+ * byte order. We can't read dn_bonuslen.
+ */
+ ASSERT(dnp->dn_indblkshift <= SPA_MAXBLOCKSHIFT);
+ ASSERT(dnp->dn_nblkptr <= DN_MAX_NBLKPTR);
+ for (i = 0; i < dnp->dn_nblkptr * sizeof (blkptr_t)/8; i++)
+ buf64[i] = BSWAP_64(buf64[i]);
+
+ /*
+ * OK to check dn_bonuslen for zero, because it won't matter if
+ * we have the wrong byte order. This is necessary because the
+ * dnode dnode is smaller than a regular dnode.
+ */
+ if (dnp->dn_bonuslen != 0) {
+ /*
+ * Note that the bonus length calculated here may be
+ * longer than the actual bonus buffer. This is because
+ * we always put the bonus buffer after the last block
+ * pointer (instead of packing it against the end of the
+ * dnode buffer).
+ */
+ int off = (dnp->dn_nblkptr-1) * sizeof (blkptr_t);
+ int slots = dnp->dn_extra_slots + 1;
+ size_t len = DN_SLOTS_TO_BONUSLEN(slots) - off;
+ ASSERT(DMU_OT_IS_VALID(dnp->dn_bonustype));
+ dmu_object_byteswap_t byteswap =
+ DMU_OT_BYTESWAP(dnp->dn_bonustype);
+ dmu_ot_byteswap[byteswap].ob_func(dnp->dn_bonus + off, len);
+ }
+
+ /* Swap SPILL block if we have one */
+ if (dnp->dn_flags & DNODE_FLAG_SPILL_BLKPTR)
+ byteswap_uint64_array(DN_SPILL_BLKPTR(dnp), sizeof (blkptr_t));
+}
+
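+/*
+ * Byteswap a block of dnodes, stepping over the extra slots consumed by
+ * large dnodes so their bonus areas are not misread as separate dnodes.
+ */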
+void
+dnode_buf_byteswap(void *vbuf, size_t size)
+{
+ int i = 0;
+
+ ASSERT3U(sizeof (dnode_phys_t), ==, (1<<DNODE_SHIFT));
+ ASSERT((size & (sizeof (dnode_phys_t)-1)) == 0);
+
+ while (i < size) {
+ dnode_phys_t *dnp = (void *)(((char *)vbuf) + i);
+ dnode_byteswap(dnp);
+
+ i += DNODE_MIN_SIZE;
+ if (dnp->dn_type != DMU_OT_NONE)
+ i += dnp->dn_extra_slots * DNODE_MIN_SIZE;
+ }
+}
+
+void
+dnode_setbonuslen(dnode_t *dn, int newsize, dmu_tx_t *tx)
+{
+ ASSERT3U(zfs_refcount_count(&dn->dn_holds), >=, 1);
+
+ dnode_setdirty(dn, tx);
+ rw_enter(&dn->dn_struct_rwlock, RW_WRITER);
+ ASSERT3U(newsize, <=, DN_SLOTS_TO_BONUSLEN(dn->dn_num_slots) -
+ (dn->dn_nblkptr-1) * sizeof (blkptr_t));
+ dn->dn_bonuslen = newsize;
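+ /*
+ * Zero in dn_next_bonuslen[] means "no change this txg", so a new
+ * length of zero is recorded with the DN_ZERO_BONUSLEN sentinel.
+ */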
+ if (newsize == 0)
+ dn->dn_next_bonuslen[tx->tx_txg & TXG_MASK] = DN_ZERO_BONUSLEN;
+ else
+ dn->dn_next_bonuslen[tx->tx_txg & TXG_MASK] = dn->dn_bonuslen;
+ rw_exit(&dn->dn_struct_rwlock);
+}
+
+void
+dnode_setbonus_type(dnode_t *dn, dmu_object_type_t newtype, dmu_tx_t *tx)
+{
+ ASSERT3U(zfs_refcount_count(&dn->dn_holds), >=, 1);
+ dnode_setdirty(dn, tx);
+ rw_enter(&dn->dn_struct_rwlock, RW_WRITER);
+ dn->dn_bonustype = newtype;
+ dn->dn_next_bonustype[tx->tx_txg & TXG_MASK] = dn->dn_bonustype;
+ rw_exit(&dn->dn_struct_rwlock);
+}
+
+void
+dnode_rm_spill(dnode_t *dn, dmu_tx_t *tx)
+{
+ ASSERT3U(zfs_refcount_count(&dn->dn_holds), >=, 1);
+ ASSERT(RW_WRITE_HELD(&dn->dn_struct_rwlock));
+ dnode_setdirty(dn, tx);
+ dn->dn_rm_spillblk[tx->tx_txg&TXG_MASK] = DN_KILL_SPILLBLK;
+ dn->dn_have_spill = B_FALSE;
+}
+
+static void
+dnode_setdblksz(dnode_t *dn, int size)
+{
+ ASSERT0(P2PHASE(size, SPA_MINBLOCKSIZE));
+ ASSERT3U(size, <=, SPA_MAXBLOCKSIZE);
+ ASSERT3U(size, >=, SPA_MINBLOCKSIZE);
+ ASSERT3U(size >> SPA_MINBLOCKSHIFT, <,
+ 1<<(sizeof (dn->dn_phys->dn_datablkszsec) * 8));
+ dn->dn_datablksz = size;
+ dn->dn_datablkszsec = size >> SPA_MINBLOCKSHIFT;
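+ /*
+ * dn_datablkshift is only meaningful for power-of-two block sizes;
+ * for other sizes it stays zero and dn_datablksz must be used instead.
+ */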
+ dn->dn_datablkshift = ISP2(size) ? highbit64(size - 1) : 0;
+}
+
+static dnode_t *
+dnode_create(objset_t *os, dnode_phys_t *dnp, dmu_buf_impl_t *db,
+ uint64_t object, dnode_handle_t *dnh)
+{
+ dnode_t *dn;
+
+ dn = kmem_cache_alloc(dnode_cache, KM_SLEEP);
+#ifdef _KERNEL
+ ASSERT(!POINTER_IS_VALID(dn->dn_objset));
+#endif /* _KERNEL */
+ dn->dn_moved = 0;
+
+ /*
+ * Defer setting dn_objset until the dnode is ready to be a candidate
+ * for the dnode_move() callback.
+ */
+ dn->dn_object = object;
+ dn->dn_dbuf = db;
+ dn->dn_handle = dnh;
+ dn->dn_phys = dnp;
+
+ if (dnp->dn_datablkszsec) {
+ dnode_setdblksz(dn, dnp->dn_datablkszsec << SPA_MINBLOCKSHIFT);
+ } else {
+ dn->dn_datablksz = 0;
+ dn->dn_datablkszsec = 0;
+ dn->dn_datablkshift = 0;
+ }
+ dn->dn_indblkshift = dnp->dn_indblkshift;
+ dn->dn_nlevels = dnp->dn_nlevels;
+ dn->dn_type = dnp->dn_type;
+ dn->dn_nblkptr = dnp->dn_nblkptr;
+ dn->dn_checksum = dnp->dn_checksum;
+ dn->dn_compress = dnp->dn_compress;
+ dn->dn_bonustype = dnp->dn_bonustype;
+ dn->dn_bonuslen = dnp->dn_bonuslen;
+ dn->dn_num_slots = dnp->dn_extra_slots + 1;
+ dn->dn_maxblkid = dnp->dn_maxblkid;
+ dn->dn_have_spill = ((dnp->dn_flags & DNODE_FLAG_SPILL_BLKPTR) != 0);
+ dn->dn_id_flags = 0;
+
+ dmu_zfetch_init(&dn->dn_zfetch, dn);
+
+ ASSERT(DMU_OT_IS_VALID(dn->dn_phys->dn_type));
+ ASSERT(zrl_is_locked(&dnh->dnh_zrlock));
+ ASSERT(!DN_SLOT_IS_PTR(dnh->dnh_dnode));
+
+ mutex_enter(&os->os_lock);
+
+ /*
+ * Exclude special dnodes from os_dnodes so an empty os_dnodes
+ * signifies that the special dnodes have no references from
+ * their children (the entries in os_dnodes). This allows
+ * dnode_destroy() to easily determine if the last child has
+ * been removed and then complete eviction of the objset.
+ */
+ if (!DMU_OBJECT_IS_SPECIAL(object))
+ list_insert_head(&os->os_dnodes, dn);
+ membar_producer();
+
+ /*
+ * Everything else must be valid before assigning dn_objset
+ * makes the dnode eligible for dnode_move().
+ */
+ dn->dn_objset = os;
+
+ dnh->dnh_dnode = dn;
+ mutex_exit(&os->os_lock);
+
+ arc_space_consume(sizeof (dnode_t), ARC_SPACE_DNODE);
+
+ return (dn);
+}
+
+/*
+ * Caller must be holding the dnode handle, which is released upon return.
+ */
+static void
+dnode_destroy(dnode_t *dn)
+{
+ objset_t *os = dn->dn_objset;
+ boolean_t complete_os_eviction = B_FALSE;
+
+ ASSERT((dn->dn_id_flags & DN_ID_NEW_EXIST) == 0);
+
+ mutex_enter(&os->os_lock);
+ POINTER_INVALIDATE(&dn->dn_objset);
+ if (!DMU_OBJECT_IS_SPECIAL(dn->dn_object)) {
+ list_remove(&os->os_dnodes, dn);
+ complete_os_eviction =
+ list_is_empty(&os->os_dnodes) &&
+ list_link_active(&os->os_evicting_node);
+ }
+ mutex_exit(&os->os_lock);
+
+ /* the dnode can no longer move, so we can release the handle */
+ if (!zrl_is_locked(&dn->dn_handle->dnh_zrlock))
+ zrl_remove(&dn->dn_handle->dnh_zrlock);
+
+ dn->dn_allocated_txg = 0;
+ dn->dn_free_txg = 0;
+ dn->dn_assigned_txg = 0;
+ dn->dn_dirty_txg = 0;
+
+ dn->dn_dirtyctx = 0;
+ if (dn->dn_dirtyctx_firstset != NULL) {
+ kmem_free(dn->dn_dirtyctx_firstset, 1);
+ dn->dn_dirtyctx_firstset = NULL;
+ }
+ if (dn->dn_bonus != NULL) {
+ mutex_enter(&dn->dn_bonus->db_mtx);
+ dbuf_destroy(dn->dn_bonus);
+ dn->dn_bonus = NULL;
+ }
+ dn->dn_zio = NULL;
+
+ dn->dn_have_spill = B_FALSE;
+ dn->dn_oldused = 0;
+ dn->dn_oldflags = 0;
+ dn->dn_olduid = 0;
+ dn->dn_oldgid = 0;
+ dn->dn_newuid = 0;
+ dn->dn_newgid = 0;
+ dn->dn_id_flags = 0;
+
+ dmu_zfetch_fini(&dn->dn_zfetch);
+ kmem_cache_free(dnode_cache, dn);
+ arc_space_return(sizeof (dnode_t), ARC_SPACE_DNODE);
+
+ if (complete_os_eviction)
+ dmu_objset_evict_done(os);
+}
+
+void
+dnode_allocate(dnode_t *dn, dmu_object_type_t ot, int blocksize, int ibs,
+ dmu_object_type_t bonustype, int bonuslen, int dn_slots, dmu_tx_t *tx)
+{
+ int i;
+
+ ASSERT3U(dn_slots, >, 0);
+ ASSERT3U(dn_slots << DNODE_SHIFT, <=,
+ spa_maxdnodesize(dmu_objset_spa(dn->dn_objset)));
+ ASSERT3U(blocksize, <=,
+ spa_maxblocksize(dmu_objset_spa(dn->dn_objset)));
+ if (blocksize == 0)
+ blocksize = 1 << zfs_default_bs;
+ else
+ blocksize = P2ROUNDUP(blocksize, SPA_MINBLOCKSIZE);
+
+ if (ibs == 0)
+ ibs = zfs_default_ibs;
+
+ ibs = MIN(MAX(ibs, DN_MIN_INDBLKSHIFT), DN_MAX_INDBLKSHIFT);
+
+ dprintf("os=%p obj=%" PRIu64 " txg=%" PRIu64
+ " blocksize=%d ibs=%d dn_slots=%d\n",
+ dn->dn_objset, dn->dn_object, tx->tx_txg, blocksize, ibs, dn_slots);
+ DNODE_STAT_BUMP(dnode_allocate);
+
+ ASSERT(dn->dn_type == DMU_OT_NONE);
+ ASSERT(bcmp(dn->dn_phys, &dnode_phys_zero, sizeof (dnode_phys_t)) == 0);
+ ASSERT(dn->dn_phys->dn_type == DMU_OT_NONE);
+ ASSERT(ot != DMU_OT_NONE);
+ ASSERT(DMU_OT_IS_VALID(ot));
+ ASSERT((bonustype == DMU_OT_NONE && bonuslen == 0) ||
+ (bonustype == DMU_OT_SA && bonuslen == 0) ||
+ (bonustype != DMU_OT_NONE && bonuslen != 0));
+ ASSERT(DMU_OT_IS_VALID(bonustype));
+ ASSERT3U(bonuslen, <=, DN_SLOTS_TO_BONUSLEN(dn_slots));
+ ASSERT(dn->dn_type == DMU_OT_NONE);
+ ASSERT0(dn->dn_maxblkid);
+ ASSERT0(dn->dn_allocated_txg);
+ ASSERT0(dn->dn_dirty_txg);
+ ASSERT0(dn->dn_assigned_txg);
+ ASSERT(zfs_refcount_is_zero(&dn->dn_tx_holds));
+ ASSERT3U(zfs_refcount_count(&dn->dn_holds), <=, 1);
+ ASSERT(avl_is_empty(&dn->dn_dbufs));
+
+ for (i = 0; i < TXG_SIZE; i++) {
+ ASSERT0(dn->dn_next_nblkptr[i]);
+ ASSERT0(dn->dn_next_nlevels[i]);
+ ASSERT0(dn->dn_next_indblkshift[i]);
+ ASSERT0(dn->dn_next_bonuslen[i]);
+ ASSERT0(dn->dn_next_bonustype[i]);
+ ASSERT0(dn->dn_rm_spillblk[i]);
+ ASSERT0(dn->dn_next_blksz[i]);
+ ASSERT(!multilist_link_active(&dn->dn_dirty_link[i]));
+ ASSERT3P(list_head(&dn->dn_dirty_records[i]), ==, NULL);
+ ASSERT3P(dn->dn_free_ranges[i], ==, NULL);
+ }
+
+ dn->dn_type = ot;
+ dnode_setdblksz(dn, blocksize);
+ dn->dn_indblkshift = ibs;
+ dn->dn_nlevels = 1;
+ dn->dn_num_slots = dn_slots;
+ if (bonustype == DMU_OT_SA) /* Maximize bonus space for SA */
+ dn->dn_nblkptr = 1;
+ else {
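+ /*
+ * Fit as many block pointers as possible into the space left
+ * after reserving bonuslen bytes for the bonus buffer.
+ */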
+ dn->dn_nblkptr = MIN(DN_MAX_NBLKPTR,
+ 1 + ((DN_SLOTS_TO_BONUSLEN(dn_slots) - bonuslen) >>
+ SPA_BLKPTRSHIFT));
+ }
+
+ dn->dn_bonustype = bonustype;
+ dn->dn_bonuslen = bonuslen;
+ dn->dn_checksum = ZIO_CHECKSUM_INHERIT;
+ dn->dn_compress = ZIO_COMPRESS_INHERIT;
+ dn->dn_dirtyctx = 0;
+
+ dn->dn_free_txg = 0;
+ if (dn->dn_dirtyctx_firstset) {
+ kmem_free(dn->dn_dirtyctx_firstset, 1);
+ dn->dn_dirtyctx_firstset = NULL;
+ }
+
+ dn->dn_allocated_txg = tx->tx_txg;
+ dn->dn_id_flags = 0;
+
+ dnode_setdirty(dn, tx);
+ dn->dn_next_indblkshift[tx->tx_txg & TXG_MASK] = ibs;
+ dn->dn_next_bonuslen[tx->tx_txg & TXG_MASK] = dn->dn_bonuslen;
+ dn->dn_next_bonustype[tx->tx_txg & TXG_MASK] = dn->dn_bonustype;
+ dn->dn_next_blksz[tx->tx_txg & TXG_MASK] = dn->dn_datablksz;
+}
+
+void
+dnode_reallocate(dnode_t *dn, dmu_object_type_t ot, int blocksize,
+ dmu_object_type_t bonustype, int bonuslen, int dn_slots, dmu_tx_t *tx)
+{
+ int nblkptr;
+
+ ASSERT3U(blocksize, >=, SPA_MINBLOCKSIZE);
+ ASSERT3U(blocksize, <=,
+ spa_maxblocksize(dmu_objset_spa(dn->dn_objset)));
+ ASSERT0(blocksize % SPA_MINBLOCKSIZE);
+ ASSERT(dn->dn_object != DMU_META_DNODE_OBJECT || dmu_tx_private_ok(tx));
+ ASSERT(tx->tx_txg != 0);
+ ASSERT((bonustype == DMU_OT_NONE && bonuslen == 0) ||
+ (bonustype != DMU_OT_NONE && bonuslen != 0) ||
+ (bonustype == DMU_OT_SA && bonuslen == 0));
+ ASSERT(DMU_OT_IS_VALID(bonustype));
+ ASSERT3U(bonuslen, <=,
+ DN_BONUS_SIZE(spa_maxdnodesize(dmu_objset_spa(dn->dn_objset))));
+ ASSERT3U(bonuslen, <=, DN_BONUS_SIZE(dn_slots << DNODE_SHIFT));
+
+ dnode_free_interior_slots(dn);
+ DNODE_STAT_BUMP(dnode_reallocate);
+
+ /* clean up any unreferenced dbufs */
+ dnode_evict_dbufs(dn);
+
+ dn->dn_id_flags = 0;
+
+ rw_enter(&dn->dn_struct_rwlock, RW_WRITER);
+ dnode_setdirty(dn, tx);
+ if (dn->dn_datablksz != blocksize) {
+ /* change blocksize */
+ ASSERT(dn->dn_maxblkid == 0 &&
+ (BP_IS_HOLE(&dn->dn_phys->dn_blkptr[0]) ||
+ dnode_block_freed(dn, 0)));
+ dnode_setdblksz(dn, blocksize);
+ dn->dn_next_blksz[tx->tx_txg&TXG_MASK] = blocksize;
+ }
+ if (dn->dn_bonuslen != bonuslen)
+ dn->dn_next_bonuslen[tx->tx_txg&TXG_MASK] = bonuslen;
+
+ if (bonustype == DMU_OT_SA) /* Maximize bonus space for SA */
+ nblkptr = 1;
+ else
+ nblkptr = MIN(DN_MAX_NBLKPTR,
+ 1 + ((DN_SLOTS_TO_BONUSLEN(dn_slots) - bonuslen) >>
+ SPA_BLKPTRSHIFT));
+ if (dn->dn_bonustype != bonustype)
+ dn->dn_next_bonustype[tx->tx_txg&TXG_MASK] = bonustype;
+ if (dn->dn_nblkptr != nblkptr)
+ dn->dn_next_nblkptr[tx->tx_txg&TXG_MASK] = nblkptr;
+ if (dn->dn_phys->dn_flags & DNODE_FLAG_SPILL_BLKPTR) {
+ dbuf_rm_spill(dn, tx);
+ dnode_rm_spill(dn, tx);
+ }
+ rw_exit(&dn->dn_struct_rwlock);
+
+ /* change type */
+ dn->dn_type = ot;
+
+ /* change bonus size and type */
+ mutex_enter(&dn->dn_mtx);
+ dn->dn_bonustype = bonustype;
+ dn->dn_bonuslen = bonuslen;
+ dn->dn_num_slots = dn_slots;
+ dn->dn_nblkptr = nblkptr;
+ dn->dn_checksum = ZIO_CHECKSUM_INHERIT;
+ dn->dn_compress = ZIO_COMPRESS_INHERIT;
+ ASSERT3U(dn->dn_nblkptr, <=, DN_MAX_NBLKPTR);
+
+ /* fix up the bonus db_size */
+ if (dn->dn_bonus) {
+ dn->dn_bonus->db.db_size =
+ DN_SLOTS_TO_BONUSLEN(dn->dn_num_slots) -
+ (dn->dn_nblkptr - 1) * sizeof (blkptr_t);
+ ASSERT(dn->dn_bonuslen <= dn->dn_bonus->db.db_size);
+ }
+
+ dn->dn_allocated_txg = tx->tx_txg;
+ mutex_exit(&dn->dn_mtx);
+}
+
+#ifdef _KERNEL
+static void
+dnode_move_impl(dnode_t *odn, dnode_t *ndn)
+{
+ int i;
+
+ ASSERT(!RW_LOCK_HELD(&odn->dn_struct_rwlock));
+ ASSERT(MUTEX_NOT_HELD(&odn->dn_mtx));
+ ASSERT(MUTEX_NOT_HELD(&odn->dn_dbufs_mtx));
+ ASSERT(!RW_LOCK_HELD(&odn->dn_zfetch.zf_rwlock));
+
+ /* Copy fields. */
+ ndn->dn_objset = odn->dn_objset;
+ ndn->dn_object = odn->dn_object;
+ ndn->dn_dbuf = odn->dn_dbuf;
+ ndn->dn_handle = odn->dn_handle;
+ ndn->dn_phys = odn->dn_phys;
+ ndn->dn_type = odn->dn_type;
+ ndn->dn_bonuslen = odn->dn_bonuslen;
+ ndn->dn_bonustype = odn->dn_bonustype;
+ ndn->dn_nblkptr = odn->dn_nblkptr;
+ ndn->dn_checksum = odn->dn_checksum;
+ ndn->dn_compress = odn->dn_compress;
+ ndn->dn_nlevels = odn->dn_nlevels;
+ ndn->dn_indblkshift = odn->dn_indblkshift;
+ ndn->dn_datablkshift = odn->dn_datablkshift;
+ ndn->dn_datablkszsec = odn->dn_datablkszsec;
+ ndn->dn_datablksz = odn->dn_datablksz;
+ ndn->dn_maxblkid = odn->dn_maxblkid;
+ ndn->dn_num_slots = odn->dn_num_slots;
+ bcopy(&odn->dn_next_type[0], &ndn->dn_next_type[0],
+ sizeof (odn->dn_next_type));
+ bcopy(&odn->dn_next_nblkptr[0], &ndn->dn_next_nblkptr[0],
+ sizeof (odn->dn_next_nblkptr));
+ bcopy(&odn->dn_next_nlevels[0], &ndn->dn_next_nlevels[0],
+ sizeof (odn->dn_next_nlevels));
+ bcopy(&odn->dn_next_indblkshift[0], &ndn->dn_next_indblkshift[0],
+ sizeof (odn->dn_next_indblkshift));
+ bcopy(&odn->dn_next_bonustype[0], &ndn->dn_next_bonustype[0],
+ sizeof (odn->dn_next_bonustype));
+ bcopy(&odn->dn_rm_spillblk[0], &ndn->dn_rm_spillblk[0],
+ sizeof (odn->dn_rm_spillblk));
+ bcopy(&odn->dn_next_bonuslen[0], &ndn->dn_next_bonuslen[0],
+ sizeof (odn->dn_next_bonuslen));
+ bcopy(&odn->dn_next_blksz[0], &ndn->dn_next_blksz[0],
+ sizeof (odn->dn_next_blksz));
+ for (i = 0; i < TXG_SIZE; i++) {
+ list_move_tail(&ndn->dn_dirty_records[i],
+ &odn->dn_dirty_records[i]);
+ }
+ bcopy(&odn->dn_free_ranges[0], &ndn->dn_free_ranges[0],
+ sizeof (odn->dn_free_ranges));
+ ndn->dn_allocated_txg = odn->dn_allocated_txg;
+ ndn->dn_free_txg = odn->dn_free_txg;
+ ndn->dn_assigned_txg = odn->dn_assigned_txg;
+ ndn->dn_dirty_txg = odn->dn_dirty_txg;
+ ndn->dn_dirtyctx = odn->dn_dirtyctx;
+ ndn->dn_dirtyctx_firstset = odn->dn_dirtyctx_firstset;
+ ASSERT(zfs_refcount_count(&odn->dn_tx_holds) == 0);
+ zfs_refcount_transfer(&ndn->dn_holds, &odn->dn_holds);
+ ASSERT(avl_is_empty(&ndn->dn_dbufs));
+ avl_swap(&ndn->dn_dbufs, &odn->dn_dbufs);
+ ndn->dn_dbufs_count = odn->dn_dbufs_count;
+ ndn->dn_bonus = odn->dn_bonus;
+ ndn->dn_have_spill = odn->dn_have_spill;
+ ndn->dn_zio = odn->dn_zio;
+ ndn->dn_oldused = odn->dn_oldused;
+ ndn->dn_oldflags = odn->dn_oldflags;
+ ndn->dn_olduid = odn->dn_olduid;
+ ndn->dn_oldgid = odn->dn_oldgid;
+ ndn->dn_newuid = odn->dn_newuid;
+ ndn->dn_newgid = odn->dn_newgid;
+ ndn->dn_id_flags = odn->dn_id_flags;
+ dmu_zfetch_init(&ndn->dn_zfetch, NULL);
+ list_move_tail(&ndn->dn_zfetch.zf_stream, &odn->dn_zfetch.zf_stream);
+ ndn->dn_zfetch.zf_dnode = odn->dn_zfetch.zf_dnode;
+
+ /*
+ * Update back pointers. Updating the handle fixes the back pointer of
+ * every descendant dbuf as well as the bonus dbuf.
+ */
+ ASSERT(ndn->dn_handle->dnh_dnode == odn);
+ ndn->dn_handle->dnh_dnode = ndn;
+ if (ndn->dn_zfetch.zf_dnode == odn) {
+ ndn->dn_zfetch.zf_dnode = ndn;
+ }
+
+ /*
+ * Invalidate the original dnode by clearing all of its back pointers.
+ */
+ odn->dn_dbuf = NULL;
+ odn->dn_handle = NULL;
+ avl_create(&odn->dn_dbufs, dbuf_compare, sizeof (dmu_buf_impl_t),
+ offsetof(dmu_buf_impl_t, db_link));
+ odn->dn_dbufs_count = 0;
+ odn->dn_bonus = NULL;
+ odn->dn_zfetch.zf_dnode = NULL;
+
+ /*
+ * Set the low bit of the objset pointer to ensure that dnode_move()
+ * recognizes the dnode as invalid in any subsequent callback.
+ */
+ POINTER_INVALIDATE(&odn->dn_objset);
+
+ /*
+ * Satisfy the destructor.
+ */
+ for (i = 0; i < TXG_SIZE; i++) {
+ list_create(&odn->dn_dirty_records[i],
+ sizeof (dbuf_dirty_record_t),
+ offsetof(dbuf_dirty_record_t, dr_dirty_node));
+ odn->dn_free_ranges[i] = NULL;
+ odn->dn_next_nlevels[i] = 0;
+ odn->dn_next_indblkshift[i] = 0;
+ odn->dn_next_bonustype[i] = 0;
+ odn->dn_rm_spillblk[i] = 0;
+ odn->dn_next_bonuslen[i] = 0;
+ odn->dn_next_blksz[i] = 0;
+ }
+ odn->dn_allocated_txg = 0;
+ odn->dn_free_txg = 0;
+ odn->dn_assigned_txg = 0;
+ odn->dn_dirty_txg = 0;
+ odn->dn_dirtyctx = 0;
+ odn->dn_dirtyctx_firstset = NULL;
+ odn->dn_have_spill = B_FALSE;
+ odn->dn_zio = NULL;
+ odn->dn_oldused = 0;
+ odn->dn_oldflags = 0;
+ odn->dn_olduid = 0;
+ odn->dn_oldgid = 0;
+ odn->dn_newuid = 0;
+ odn->dn_newgid = 0;
+ odn->dn_id_flags = 0;
+
+ /*
+ * Mark the dnode.
+ */
+ ndn->dn_moved = 1;
+ odn->dn_moved = (uint8_t)-1;
+}
+
+#ifdef illumos
+/*ARGSUSED*/
+static kmem_cbrc_t
+dnode_move(void *buf, void *newbuf, size_t size, void *arg)
+{
+ dnode_t *odn = buf, *ndn = newbuf;
+ objset_t *os;
+ int64_t refcount;
+ uint32_t dbufs;
+
+ /*
+ * The dnode is on the objset's list of known dnodes if the objset
+ * pointer is valid. We set the low bit of the objset pointer when
+ * freeing the dnode to invalidate it, and the memory patterns written
+ * by kmem (baddcafe and deadbeef) set at least one of the two low bits.
+ * A newly created dnode sets the objset pointer last of all to indicate
+ * that the dnode is known and in a valid state to be moved by this
+ * function.
+ */
+ os = odn->dn_objset;
+ if (!POINTER_IS_VALID(os)) {
+ DNODE_STAT_BUMP(dnode_move_invalid);
+ return (KMEM_CBRC_DONT_KNOW);
+ }
+
+ /*
+ * Ensure that the objset does not go away during the move.
+ */
+ rw_enter(&os_lock, RW_WRITER);
+ if (os != odn->dn_objset) {
+ rw_exit(&os_lock);
+ DNODE_STAT_BUMP(dnode_move_recheck1);
+ return (KMEM_CBRC_DONT_KNOW);
+ }
+
+ /*
+ * If the dnode is still valid, then so is the objset. We know that no
+ * valid objset can be freed while we hold os_lock, so we can safely
+ * ensure that the objset remains in use.
+ */
+ mutex_enter(&os->os_lock);
+
+ /*
+ * Recheck the objset pointer in case the dnode was removed just before
+ * acquiring the lock.
+ */
+ if (os != odn->dn_objset) {
+ mutex_exit(&os->os_lock);
+ rw_exit(&os_lock);
+ DNODE_STAT_BUMP(dnode_move_recheck2);
+ return (KMEM_CBRC_DONT_KNOW);
+ }
+
+ /*
+ * At this point we know that as long as we hold os->os_lock, the dnode
+ * cannot be freed and fields within the dnode can be safely accessed.
+ * The objset listing this dnode cannot go away as long as this dnode is
+ * on its list.
+ */
+ rw_exit(&os_lock);
+ if (DMU_OBJECT_IS_SPECIAL(odn->dn_object)) {
+ mutex_exit(&os->os_lock);
+ DNODE_STAT_BUMP(dnode_move_special);
+ return (KMEM_CBRC_NO);
+ }
+ ASSERT(odn->dn_dbuf != NULL); /* only "special" dnodes have no parent */
+
+ /*
+ * Lock the dnode handle to prevent the dnode from obtaining any new
+ * holds. This also prevents the descendant dbufs and the bonus dbuf
+ * from accessing the dnode, so that we can discount their holds. The
+ * handle is safe to access because we know that while the dnode cannot
+ * go away, neither can its handle. Once we hold dnh_zrlock, we can
+ * safely move any dnode referenced only by dbufs.
+ */
+ if (!zrl_tryenter(&odn->dn_handle->dnh_zrlock)) {
+ mutex_exit(&os->os_lock);
+ DNODE_STAT_BUMP(dnode_move_handle);
+ return (KMEM_CBRC_LATER);
+ }
+
+ /*
+ * Ensure a consistent view of the dnode's holds and the dnode's dbufs.
+ * We need to guarantee that there is a hold for every dbuf in order to
+ * determine whether the dnode is actively referenced. Falsely matching
+ * a dbuf to an active hold would lead to an unsafe move. It's possible
+ * that a thread already having an active dnode hold is about to add a
+ * dbuf, and we can't compare hold and dbuf counts while the add is in
+ * progress.
+ */
+ if (!rw_tryenter(&odn->dn_struct_rwlock, RW_WRITER)) {
+ zrl_exit(&odn->dn_handle->dnh_zrlock);
+ mutex_exit(&os->os_lock);
+ DNODE_STAT_BUMP(dnode_move_rwlock);
+ return (KMEM_CBRC_LATER);
+ }
+
+ /*
+ * A dbuf may be removed (evicted) without an active dnode hold. In that
+ * case, the dbuf count is decremented under the handle lock before the
+ * dbuf's hold is released. This order ensures that if we count the hold
+ * after the dbuf is removed but before its hold is released, we will
+ * treat the unmatched hold as active and exit safely. If we count the
+ * hold before the dbuf is removed, the hold is discounted, and the
+ * removal is blocked until the move completes.
+ */
+ refcount = zfs_refcount_count(&odn->dn_holds);
+ ASSERT(refcount >= 0);
+ dbufs = DN_DBUFS_COUNT(odn);
+
+ /* We can't have more dbufs than dnode holds. */
+ ASSERT3U(dbufs, <=, refcount);
+ DTRACE_PROBE3(dnode__move, dnode_t *, odn, int64_t, refcount,
+ uint32_t, dbufs);
+
+ if (refcount > dbufs) {
+ rw_exit(&odn->dn_struct_rwlock);
+ zrl_exit(&odn->dn_handle->dnh_zrlock);
+ mutex_exit(&os->os_lock);
+ DNODE_STAT_BUMP(dnode_move_active);
+ return (KMEM_CBRC_LATER);
+ }
+
+ rw_exit(&odn->dn_struct_rwlock);
+
+ /*
+ * At this point we know that anyone with a hold on the dnode is not
+ * actively referencing it. The dnode is known and in a valid state to
+ * move. We're holding the locks needed to execute the critical section.
+ */
+ dnode_move_impl(odn, ndn);
+
+ list_link_replace(&odn->dn_link, &ndn->dn_link);
+ /* If the dnode was safe to move, the refcount cannot have changed. */
+ ASSERT(refcount == zfs_refcount_count(&ndn->dn_holds));
+ ASSERT(dbufs == DN_DBUFS_COUNT(ndn));
+ zrl_exit(&ndn->dn_handle->dnh_zrlock); /* handle has moved */
+ mutex_exit(&os->os_lock);
+
+ return (KMEM_CBRC_YES);
+}
+#endif /* illumos */
+#endif /* _KERNEL */
+
+static void
+dnode_slots_hold(dnode_children_t *children, int idx, int slots)
+{
+ ASSERT3S(idx + slots, <=, DNODES_PER_BLOCK);
+
+ for (int i = idx; i < idx + slots; i++) {
+ dnode_handle_t *dnh = &children->dnc_children[i];
+ zrl_add(&dnh->dnh_zrlock);
+ }
+}
+
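+/*
+ * Release a range of slot handles, whether they were read-held via
+ * zrl_add() or exclusively locked via a successful zrl_tryenter().
+ */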
+static void
+dnode_slots_rele(dnode_children_t *children, int idx, int slots)
+{
+ ASSERT3S(idx + slots, <=, DNODES_PER_BLOCK);
+
+ for (int i = idx; i < idx + slots; i++) {
+ dnode_handle_t *dnh = &children->dnc_children[i];
+
+ if (zrl_is_locked(&dnh->dnh_zrlock))
+ zrl_exit(&dnh->dnh_zrlock);
+ else
+ zrl_remove(&dnh->dnh_zrlock);
+ }
+}
+
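+/*
+ * Try to exclusively lock every handle in the range. On failure, back
+ * out of any locks already taken and return 0; on success return 1.
+ */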
+static int
+dnode_slots_tryenter(dnode_children_t *children, int idx, int slots)
+{
+ ASSERT3S(idx + slots, <=, DNODES_PER_BLOCK);
+
+ for (int i = idx; i < idx + slots; i++) {
+ dnode_handle_t *dnh = &children->dnc_children[i];
+
+ if (!zrl_tryenter(&dnh->dnh_zrlock)) {
+ for (int j = idx; j < i; j++) {
+ dnh = &children->dnc_children[j];
+ zrl_exit(&dnh->dnh_zrlock);
+ }
+
+ return (0);
+ }
+ }
+
+ return (1);
+}
+
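+/* Point every handle in the range at ptr (a DN_SLOT_* sentinel or dnode). */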
+static void
+dnode_set_slots(dnode_children_t *children, int idx, int slots, void *ptr)
+{
+ ASSERT3S(idx + slots, <=, DNODES_PER_BLOCK);
+
+ for (int i = idx; i < idx + slots; i++) {
+ dnode_handle_t *dnh = &children->dnc_children[i];
+ dnh->dnh_dnode = ptr;
+ }
+}
+
+static boolean_t
+dnode_check_slots_free(dnode_children_t *children, int idx, int slots)
+{
+ ASSERT3S(idx + slots, <=, DNODES_PER_BLOCK);
+
+ /*
+ * If all dnode slots in the range are either already free or
+ * evictable, return B_TRUE; otherwise return B_FALSE.
+ */
+ for (int i = idx; i < idx + slots; i++) {
+ dnode_handle_t *dnh = &children->dnc_children[i];
+ dnode_t *dn = dnh->dnh_dnode;
+
+ if (dn == DN_SLOT_FREE) {
+ continue;
+ } else if (DN_SLOT_IS_PTR(dn)) {
+ mutex_enter(&dn->dn_mtx);
+ boolean_t can_free = (dn->dn_type == DMU_OT_NONE &&
+ zfs_refcount_is_zero(&dn->dn_holds) &&
+ !DNODE_IS_DIRTY(dn));
+ mutex_exit(&dn->dn_mtx);
+
+ if (!can_free)
+ return (B_FALSE);
+ else
+ continue;
+ } else {
+ return (B_FALSE);
+ }
+ }
+
+ return (B_TRUE);
+}
+
+static void
+dnode_reclaim_slots(dnode_children_t *children, int idx, int slots)
+{
+ ASSERT3S(idx + slots, <=, DNODES_PER_BLOCK);
+
+ for (int i = idx; i < idx + slots; i++) {
+ dnode_handle_t *dnh = &children->dnc_children[i];
+
+ ASSERT(zrl_is_locked(&dnh->dnh_zrlock));
+
+ if (DN_SLOT_IS_PTR(dnh->dnh_dnode)) {
+ ASSERT3S(dnh->dnh_dnode->dn_type, ==, DMU_OT_NONE);
+ dnode_destroy(dnh->dnh_dnode);
+ dnh->dnh_dnode = DN_SLOT_FREE;
+ }
+ }
+}
+
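+/*
+ * Mark the interior slots of a multi-slot dnode free again so they can
+ * be claimed by later allocations.
+ */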
+void
+dnode_free_interior_slots(dnode_t *dn)
+{
+ dnode_children_t *children = dmu_buf_get_user(&dn->dn_dbuf->db);
+ int epb = dn->dn_dbuf->db.db_size >> DNODE_SHIFT;
+ int idx = (dn->dn_object & (epb - 1)) + 1;
+ int slots = dn->dn_num_slots - 1;
+
+ if (slots == 0)
+ return;
+
+ ASSERT3S(idx + slots, <=, DNODES_PER_BLOCK);
+
+ while (!dnode_slots_tryenter(children, idx, slots))
+ DNODE_STAT_BUMP(dnode_free_interior_lock_retry);
+
+ dnode_set_slots(children, idx, slots, DN_SLOT_FREE);
+ dnode_slots_rele(children, idx, slots);
+}
+
+void
+dnode_special_close(dnode_handle_t *dnh)
+{
+ dnode_t *dn = dnh->dnh_dnode;
+
+ /*
+ * Wait for final references to the dnode to clear. This can
+ * only happen if the arc is asynchronously evicting state that
+ * has a hold on this dnode while we are trying to evict this
+ * dnode.
+ */
+ while (zfs_refcount_count(&dn->dn_holds) > 0)
+ delay(1);
+ ASSERT(dn->dn_dbuf == NULL ||
+ dmu_buf_get_user(&dn->dn_dbuf->db) == NULL);
+ zrl_add(&dnh->dnh_zrlock);
+ dnode_destroy(dn); /* implicit zrl_remove() */
+ zrl_destroy(&dnh->dnh_zrlock);
+ dnh->dnh_dnode = NULL;
+}
+
+void
+dnode_special_open(objset_t *os, dnode_phys_t *dnp, uint64_t object,
+ dnode_handle_t *dnh)
+{
+ dnode_t *dn;
+
+ zrl_init(&dnh->dnh_zrlock);
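+ /* The zrlock was just initialized, so this tryenter cannot fail. */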
+ zrl_tryenter(&dnh->dnh_zrlock);
+
+ dn = dnode_create(os, dnp, NULL, object, dnh);
+ DNODE_VERIFY(dn);
+
+ zrl_exit(&dnh->dnh_zrlock);
+}
+
+static void
+dnode_buf_evict_async(void *dbu)
+{
+ dnode_children_t *dnc = dbu;
+
+ DNODE_STAT_BUMP(dnode_buf_evict);
+
+ for (int i = 0; i < dnc->dnc_count; i++) {
+ dnode_handle_t *dnh = &dnc->dnc_children[i];
+ dnode_t *dn;
+
+ /*
+ * The dnode handle lock guards against the dnode moving to
+ * another valid address, so there is no need here to guard
+ * against changes to or from NULL.
+ */
+ if (!DN_SLOT_IS_PTR(dnh->dnh_dnode)) {
+ zrl_destroy(&dnh->dnh_zrlock);
+ dnh->dnh_dnode = DN_SLOT_UNINIT;
+ continue;
+ }
+
+ zrl_add(&dnh->dnh_zrlock);
+ dn = dnh->dnh_dnode;
+ /*
+ * If there are holds on this dnode, then there should
+ * be holds on the dnode's containing dbuf as well; thus
+ * it wouldn't be eligible for eviction and this function
+ * would not have been called.
+ */
+ ASSERT(zfs_refcount_is_zero(&dn->dn_holds));
+ ASSERT(zfs_refcount_is_zero(&dn->dn_tx_holds));
+
+ dnode_destroy(dn); /* implicit zrl_remove() for first slot */
+ zrl_destroy(&dnh->dnh_zrlock);
+ dnh->dnh_dnode = DN_SLOT_UNINIT;
+ }
+ kmem_free(dnc, sizeof (dnode_children_t) +
+ dnc->dnc_count * sizeof (dnode_handle_t));
+}
+
+/*
+ * When the DNODE_MUST_BE_FREE flag is set, the "slots" parameter is used
+ * to ensure the hole at the specified object offset is large enough to
+ * hold the dnode being created. The slots parameter is also used to ensure
+ * a dnode does not span multiple dnode blocks. In both of these cases, if
+ * a failure occurs, ENOSPC is returned. Keep in mind, these failure cases
+ * are only possible when using DNODE_MUST_BE_FREE.
+ *
+ * If the DNODE_MUST_BE_ALLOCATED flag is set, "slots" must be 0.
+ * dnode_hold_impl() will check if the requested dnode is already consumed
+ * as an extra dnode slot by a large dnode, in which case it returns
+ * ENOENT.
+ *
+ * errors:
+ * EINVAL - invalid object number or flags.
+ * ENOSPC - hole too small to fulfill "slots" request (DNODE_MUST_BE_FREE)
+ * EEXIST - Refers to an allocated dnode (DNODE_MUST_BE_FREE)
+ * - Refers to a freeing dnode (DNODE_MUST_BE_FREE)
+ * - Refers to an interior dnode slot (DNODE_MUST_BE_ALLOCATED)
+ * ENOENT - The requested dnode is not allocated (DNODE_MUST_BE_ALLOCATED)
+ * - The requested dnode is being freed (DNODE_MUST_BE_ALLOCATED)
+ * EIO - I/O error when reading the meta dnode dbuf.
+ *
+ * Note that the hold can succeed even for free dnodes (DNODE_MUST_BE_FREE).
+ */
+int
+dnode_hold_impl(objset_t *os, uint64_t object, int flag, int slots,
+ void *tag, dnode_t **dnp)
+{
+ int epb, idx, err, i;
+ int drop_struct_lock = FALSE;
+ int type;
+ uint64_t blk;
+ dnode_t *mdn, *dn;
+ dmu_buf_impl_t *db;
+ dnode_children_t *dnc;
+ dnode_phys_t *dn_block;
+ dnode_phys_t *dn_block_begin;
+ dnode_handle_t *dnh;
+
+ ASSERT(!(flag & DNODE_MUST_BE_ALLOCATED) || (slots == 0));
+ ASSERT(!(flag & DNODE_MUST_BE_FREE) || (slots > 0));
+
+ /*
+ * If you are holding the spa config lock as writer, you shouldn't
+ * be asking the DMU to do *anything* unless it's the root pool
+ * which may require us to read from the root filesystem while
+ * holding some (not all) of the locks as writer.
+ */
+ ASSERT(spa_config_held(os->os_spa, SCL_ALL, RW_WRITER) == 0 ||
+ (spa_is_root(os->os_spa) &&
+ spa_config_held(os->os_spa, SCL_STATE, RW_WRITER)));
+
+ ASSERT((flag & DNODE_MUST_BE_ALLOCATED) || (flag & DNODE_MUST_BE_FREE));
+
+ if (object == DMU_USERUSED_OBJECT || object == DMU_GROUPUSED_OBJECT) {
+ dn = (object == DMU_USERUSED_OBJECT) ?
+ DMU_USERUSED_DNODE(os) : DMU_GROUPUSED_DNODE(os);
+ if (dn == NULL)
+ return (SET_ERROR(ENOENT));
+ type = dn->dn_type;
+ if ((flag & DNODE_MUST_BE_ALLOCATED) && type == DMU_OT_NONE)
+ return (SET_ERROR(ENOENT));
+ if ((flag & DNODE_MUST_BE_FREE) && type != DMU_OT_NONE)
+ return (SET_ERROR(EEXIST));
+ DNODE_VERIFY(dn);
+ (void) zfs_refcount_add(&dn->dn_holds, tag);
+ *dnp = dn;
+ return (0);
+ }
+
+ if (object == 0 || object >= DN_MAX_OBJECT)
+ return (SET_ERROR(EINVAL));
+
+ mdn = DMU_META_DNODE(os);
+ ASSERT(mdn->dn_object == DMU_META_DNODE_OBJECT);
+
+ DNODE_VERIFY(mdn);
+
+ if (!RW_WRITE_HELD(&mdn->dn_struct_rwlock)) {
+ rw_enter(&mdn->dn_struct_rwlock, RW_READER);
+ drop_struct_lock = TRUE;
+ }
+
+ blk = dbuf_whichblock(mdn, 0, object * sizeof (dnode_phys_t));
+
+ db = dbuf_hold(mdn, blk, FTAG);
+ if (drop_struct_lock)
+ rw_exit(&mdn->dn_struct_rwlock);
+ if (db == NULL) {
+ DNODE_STAT_BUMP(dnode_hold_dbuf_hold);
+ return (SET_ERROR(EIO));
+ }
+ err = dbuf_read(db, NULL, DB_RF_CANFAIL);
+ if (err) {
+ DNODE_STAT_BUMP(dnode_hold_dbuf_read);
+ dbuf_rele(db, FTAG);
+ return (err);
+ }
+
+ ASSERT3U(db->db.db_size, >=, 1<<DNODE_SHIFT);
+ epb = db->db.db_size >> DNODE_SHIFT;
+
+ idx = object & (epb - 1);
+ dn_block = (dnode_phys_t *)db->db.db_data;
+
+ ASSERT(DB_DNODE(db)->dn_type == DMU_OT_DNODE);
+ dnc = dmu_buf_get_user(&db->db);
+ dnh = NULL;
+ if (dnc == NULL) {
+ dnode_children_t *winner;
+ int skip = 0;
+
+ dnc = kmem_zalloc(sizeof (dnode_children_t) +
+ epb * sizeof (dnode_handle_t), KM_SLEEP);
+ dnc->dnc_count = epb;
+ dnh = &dnc->dnc_children[0];
+
+ /* Initialize dnode slot status from dnode_phys_t */
+ for (int i = 0; i < epb; i++) {
+ zrl_init(&dnh[i].dnh_zrlock);
+
+ if (skip) {
+ skip--;
+ continue;
+ }
+
+ if (dn_block[i].dn_type != DMU_OT_NONE) {
+ int interior = dn_block[i].dn_extra_slots;
+
+ dnode_set_slots(dnc, i, 1, DN_SLOT_ALLOCATED);
+ dnode_set_slots(dnc, i + 1, interior,
+ DN_SLOT_INTERIOR);
+ skip = interior;
+ } else {
+ dnh[i].dnh_dnode = DN_SLOT_FREE;
+ skip = 0;
+ }
+ }
+
+ dmu_buf_init_user(&dnc->dnc_dbu, NULL,
+ dnode_buf_evict_async, NULL);
+ winner = dmu_buf_set_user(&db->db, &dnc->dnc_dbu);
+ if (winner != NULL) {
+ for (int i = 0; i < epb; i++)
+ zrl_destroy(&dnh[i].dnh_zrlock);
+
+ kmem_free(dnc, sizeof (dnode_children_t) +
+ epb * sizeof (dnode_handle_t));
+ dnc = winner;
+ }
+ }
+
+ ASSERT(dnc->dnc_count == epb);
+ dn = DN_SLOT_UNINIT;
+
+ if (flag & DNODE_MUST_BE_ALLOCATED) {
+ slots = 1;
+
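+ /*
+ * An allocated dnode is always held through its first slot; an
+ * index landing on an interior slot of a large dnode is rejected
+ * with EEXIST below.
+ */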
+ while (dn == DN_SLOT_UNINIT) {
+ dnode_slots_hold(dnc, idx, slots);
+ dnh = &dnc->dnc_children[idx];
+
+ if (DN_SLOT_IS_PTR(dnh->dnh_dnode)) {
+ dn = dnh->dnh_dnode;
+ break;
+ } else if (dnh->dnh_dnode == DN_SLOT_INTERIOR) {
+ DNODE_STAT_BUMP(dnode_hold_alloc_interior);
+ dnode_slots_rele(dnc, idx, slots);
+ dbuf_rele(db, FTAG);
+ return (SET_ERROR(EEXIST));
+ } else if (dnh->dnh_dnode != DN_SLOT_ALLOCATED) {
+ DNODE_STAT_BUMP(dnode_hold_alloc_misses);
+ dnode_slots_rele(dnc, idx, slots);
+ dbuf_rele(db, FTAG);
+ return (SET_ERROR(ENOENT));
+ }
+
+ dnode_slots_rele(dnc, idx, slots);
+ if (!dnode_slots_tryenter(dnc, idx, slots)) {
+ DNODE_STAT_BUMP(dnode_hold_alloc_lock_retry);
+ continue;
+ }
+
+ /*
+ * Someone else won the race and called dnode_create()
+ * after we checked DN_SLOT_IS_PTR() above but before
+ * we acquired the lock.
+ */
+ if (DN_SLOT_IS_PTR(dnh->dnh_dnode)) {
+ DNODE_STAT_BUMP(dnode_hold_alloc_lock_misses);
+ dn = dnh->dnh_dnode;
+ } else {
+ dn = dnode_create(os, dn_block + idx, db,
+ object, dnh);
+ }
+ }
+
+ mutex_enter(&dn->dn_mtx);
+ if (dn->dn_type == DMU_OT_NONE || dn->dn_free_txg != 0) {
+ DNODE_STAT_BUMP(dnode_hold_alloc_type_none);
+ mutex_exit(&dn->dn_mtx);
+ dnode_slots_rele(dnc, idx, slots);
+ dbuf_rele(db, FTAG);
+ return (SET_ERROR(ENOENT));
+ }
+
+ DNODE_STAT_BUMP(dnode_hold_alloc_hits);
+ } else if (flag & DNODE_MUST_BE_FREE) {
+
+ if (idx + slots - 1 >= DNODES_PER_BLOCK) {
+ DNODE_STAT_BUMP(dnode_hold_free_overflow);
+ dbuf_rele(db, FTAG);
+ return (SET_ERROR(ENOSPC));
+ }
+
+ while (dn == DN_SLOT_UNINIT) {
+ dnode_slots_hold(dnc, idx, slots);
+
+ if (!dnode_check_slots_free(dnc, idx, slots)) {
+ DNODE_STAT_BUMP(dnode_hold_free_misses);
+ dnode_slots_rele(dnc, idx, slots);
+ dbuf_rele(db, FTAG);
+ return (SET_ERROR(ENOSPC));
+ }
+
+ dnode_slots_rele(dnc, idx, slots);
+ if (!dnode_slots_tryenter(dnc, idx, slots)) {
+ DNODE_STAT_BUMP(dnode_hold_free_lock_retry);
+ continue;
+ }
+
+ if (!dnode_check_slots_free(dnc, idx, slots)) {
+ DNODE_STAT_BUMP(dnode_hold_free_lock_misses);
+ dnode_slots_rele(dnc, idx, slots);
+ dbuf_rele(db, FTAG);
+ return (SET_ERROR(ENOSPC));
+ }
+
+ /*
+ * Allocated but otherwise free dnodes which would
+ * be in the interior of a multi-slot dnode need
+ * to be freed. Single slot dnodes can be safely
+ * re-purposed as a performance optimization.
+ */
+ if (slots > 1)
+ dnode_reclaim_slots(dnc, idx + 1, slots - 1);
+
+ dnh = &dnc->dnc_children[idx];
+ if (DN_SLOT_IS_PTR(dnh->dnh_dnode)) {
+ dn = dnh->dnh_dnode;
+ } else {
+ dn = dnode_create(os, dn_block + idx, db,
+ object, dnh);
+ }
+ }
+
+ mutex_enter(&dn->dn_mtx);
+ if (!zfs_refcount_is_zero(&dn->dn_holds) || dn->dn_free_txg) {
+ DNODE_STAT_BUMP(dnode_hold_free_refcount);
+ mutex_exit(&dn->dn_mtx);
+ dnode_slots_rele(dnc, idx, slots);
+ dbuf_rele(db, FTAG);
+ return (SET_ERROR(EEXIST));
+ }
+
+ dnode_set_slots(dnc, idx + 1, slots - 1, DN_SLOT_INTERIOR);
+ DNODE_STAT_BUMP(dnode_hold_free_hits);
+ } else {
+ dbuf_rele(db, FTAG);
+ return (SET_ERROR(EINVAL));
+ }
+
+ if (dn->dn_free_txg) {
+ DNODE_STAT_BUMP(dnode_hold_free_txg);
+ type = dn->dn_type;
+ mutex_exit(&dn->dn_mtx);
+ dnode_slots_rele(dnc, idx, slots);
+ dbuf_rele(db, FTAG);
+ return (SET_ERROR((flag & DNODE_MUST_BE_ALLOCATED) ?
+ ENOENT : EEXIST));
+ }
+
+ if (zfs_refcount_add(&dn->dn_holds, tag) == 1)
+ dbuf_add_ref(db, dnh);
+
+ mutex_exit(&dn->dn_mtx);
+
+ /* Now we can rely on the hold to prevent the dnode from moving. */
+ dnode_slots_rele(dnc, idx, slots);
+
+ DNODE_VERIFY(dn);
+ ASSERT3P(dn->dn_dbuf, ==, db);
+ ASSERT3U(dn->dn_object, ==, object);
+ dbuf_rele(db, FTAG);
+
+ *dnp = dn;
+ return (0);
+}
+
+/*
+ * Return held dnode if the object is allocated, NULL if not.
+ */
+int
+dnode_hold(objset_t *os, uint64_t object, void *tag, dnode_t **dnp)
+{
+ return (dnode_hold_impl(os, object, DNODE_MUST_BE_ALLOCATED, 0, tag,
+ dnp));
+}
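+
+/*
+ * A minimal usage sketch (hypothetical caller): hold the object, use the
+ * dnode, then release the hold with the same tag.
+ *
+ *	dnode_t *dn;
+ *	if (dnode_hold(os, object, FTAG, &dn) == 0) {
+ *		... operate on dn ...
+ *		dnode_rele(dn, FTAG);
+ *	}
+ */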
+
+/*
+ * Can only add a reference if there is already at least one
+ * reference on the dnode. Returns FALSE if unable to add a
+ * new reference.
+ */
+boolean_t
+dnode_add_ref(dnode_t *dn, void *tag)
+{
+ mutex_enter(&dn->dn_mtx);
+ if (zfs_refcount_is_zero(&dn->dn_holds)) {
+ mutex_exit(&dn->dn_mtx);
+ return (FALSE);
+ }
+ VERIFY(1 < zfs_refcount_add(&dn->dn_holds, tag));
+ mutex_exit(&dn->dn_mtx);
+ return (TRUE);
+}
+
+void
+dnode_rele(dnode_t *dn, void *tag)
+{
+ mutex_enter(&dn->dn_mtx);
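+ /* dnode_rele_and_unlock() drops dn_mtx for us. */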
+ dnode_rele_and_unlock(dn, tag, B_FALSE);
+}
+
+void
+dnode_rele_and_unlock(dnode_t *dn, void *tag, boolean_t evicting)
+{
+ uint64_t refs;
+ /* Get while the hold prevents the dnode from moving. */
+ dmu_buf_impl_t *db = dn->dn_dbuf;
+ dnode_handle_t *dnh = dn->dn_handle;
+
+ refs = zfs_refcount_remove(&dn->dn_holds, tag);
+ mutex_exit(&dn->dn_mtx);
+
+ /*
+ * It's unsafe to release the last hold on a dnode by dnode_rele() or
+ * indirectly by dbuf_rele() while relying on the dnode handle to
+ * prevent the dnode from moving, since releasing the last hold could
+ * result in the dnode's parent dbuf evicting its dnode handles. For
+ * that reason anyone calling dnode_rele() or dbuf_rele() without some
+ * other direct or indirect hold on the dnode must first drop the dnode
+ * handle.
+ */
+ ASSERT(refs > 0 || dnh->dnh_zrlock.zr_owner != curthread);
+
+ /* NOTE: the DNODE_DNODE does not have a dn_dbuf */
+ if (refs == 0 && db != NULL) {
+ /*
+ * Another thread could add a hold to the dnode handle in
+ * dnode_hold_impl() while holding the parent dbuf. Since the
+ * hold on the parent dbuf prevents the handle from being
+ * destroyed, the hold on the handle is OK. We can't yet assert
+ * that the handle has zero references, but that will be
+ * asserted anyway when the handle gets destroyed.
+ */
+ mutex_enter(&db->db_mtx);
+ dbuf_rele_and_unlock(db, dnh, evicting);
+ }
+}
+
+void
+dnode_setdirty(dnode_t *dn, dmu_tx_t *tx)
+{
+ objset_t *os = dn->dn_objset;
+ uint64_t txg = tx->tx_txg;
+
+ if (DMU_OBJECT_IS_SPECIAL(dn->dn_object)) {
+ dsl_dataset_dirty(os->os_dsl_dataset, tx);
+ return;
+ }
+
+ DNODE_VERIFY(dn);
+
+#ifdef ZFS_DEBUG
+ mutex_enter(&dn->dn_mtx);
+ ASSERT(dn->dn_phys->dn_type || dn->dn_allocated_txg);
+ ASSERT(dn->dn_free_txg == 0 || dn->dn_free_txg >= txg);
+ mutex_exit(&dn->dn_mtx);
+#endif
+
+ /*
+ * Determine old uid/gid when necessary
+ */
+ dmu_objset_userquota_get_ids(dn, B_TRUE, tx);
+
+ multilist_t *dirtylist = os->os_dirty_dnodes[txg & TXG_MASK];
+ multilist_sublist_t *mls = multilist_sublist_lock_obj(dirtylist, dn);
+
+ /*
+ * If we are already marked dirty, we're done.
+ */
+ if (multilist_link_active(&dn->dn_dirty_link[txg & TXG_MASK])) {
+ multilist_sublist_unlock(mls);
+ return;
+ }
+
+ ASSERT(!zfs_refcount_is_zero(&dn->dn_holds) ||
+ !avl_is_empty(&dn->dn_dbufs));
+ ASSERT(dn->dn_datablksz != 0);
+ ASSERT0(dn->dn_next_bonuslen[txg&TXG_MASK]);
+ ASSERT0(dn->dn_next_blksz[txg&TXG_MASK]);
+ ASSERT0(dn->dn_next_bonustype[txg&TXG_MASK]);
+
+ dprintf_ds(os->os_dsl_dataset, "obj=%llu txg=%llu\n",
+ dn->dn_object, txg);
+
+ multilist_sublist_insert_head(mls, dn);
+
+ multilist_sublist_unlock(mls);
+
+ /*
+ * The dnode maintains a hold on its containing dbuf as
+ * long as there are holds on it. Each instantiated child
+ * dbuf maintains a hold on the dnode. When the last child
+ * drops its hold, the dnode will drop its hold on the
+ * containing dbuf. We add a "dirty hold" here so that the
+ * dnode will hang around after we finish processing its
+ * children.
+ */
+ VERIFY(dnode_add_ref(dn, (void *)(uintptr_t)tx->tx_txg));
+
+ (void) dbuf_dirty(dn->dn_dbuf, tx);
+
+ dsl_dataset_dirty(os->os_dsl_dataset, tx);
+}
+
+void
+dnode_free(dnode_t *dn, dmu_tx_t *tx)
+{
+ mutex_enter(&dn->dn_mtx);
+ if (dn->dn_type == DMU_OT_NONE || dn->dn_free_txg) {
+ mutex_exit(&dn->dn_mtx);
+ return;
+ }
+ dn->dn_free_txg = tx->tx_txg;
+ mutex_exit(&dn->dn_mtx);
+
+ dnode_setdirty(dn, tx);
+}
+
+/*
+ * Try to change the block size for the indicated dnode. This can only
+ * succeed if there are no blocks allocated or dirty beyond the first block.
+ */
+int
+dnode_set_blksz(dnode_t *dn, uint64_t size, int ibs, dmu_tx_t *tx)
+{
+ dmu_buf_impl_t *db;
+ int err;
+
+ ASSERT3U(size, <=, spa_maxblocksize(dmu_objset_spa(dn->dn_objset)));
+ if (size == 0)
+ size = SPA_MINBLOCKSIZE;
+ else
+ size = P2ROUNDUP(size, SPA_MINBLOCKSIZE);
+
+ if (ibs == dn->dn_indblkshift)
+ ibs = 0;
+
+ if (size >> SPA_MINBLOCKSHIFT == dn->dn_datablkszsec && ibs == 0)
+ return (0);
+
+ rw_enter(&dn->dn_struct_rwlock, RW_WRITER);
+
+ /* Check for any allocated blocks beyond the first */
+ if (dn->dn_maxblkid != 0)
+ goto fail;
+
+ mutex_enter(&dn->dn_dbufs_mtx);
+ for (db = avl_first(&dn->dn_dbufs); db != NULL;
+ db = AVL_NEXT(&dn->dn_dbufs, db)) {
+ if (db->db_blkid != 0 && db->db_blkid != DMU_BONUS_BLKID &&
+ db->db_blkid != DMU_SPILL_BLKID) {
+ mutex_exit(&dn->dn_dbufs_mtx);
+ goto fail;
+ }
+ }
+ mutex_exit(&dn->dn_dbufs_mtx);
+
+ if (ibs && dn->dn_nlevels != 1)
+ goto fail;
+
+ /* resize the old block */
+ err = dbuf_hold_impl(dn, 0, 0, TRUE, FALSE, FTAG, &db);
+ if (err == 0)
+ dbuf_new_size(db, size, tx);
+ else if (err != ENOENT)
+ goto fail;
+
+ dnode_setdblksz(dn, size);
+ dnode_setdirty(dn, tx);
+ dn->dn_next_blksz[tx->tx_txg&TXG_MASK] = size;
+ if (ibs) {
+ dn->dn_indblkshift = ibs;
+ dn->dn_next_indblkshift[tx->tx_txg&TXG_MASK] = ibs;
+ }
+ /* rele after we have fixed the blocksize in the dnode */
+ if (db)
+ dbuf_rele(db, FTAG);
+
+ rw_exit(&dn->dn_struct_rwlock);
+ return (0);
+
+fail:
+ rw_exit(&dn->dn_struct_rwlock);
+ return (SET_ERROR(ENOTSUP));
+}
+
+/* read-holding callers must not rely on the lock being continuously held */
+void
+dnode_new_blkid(dnode_t *dn, uint64_t blkid, dmu_tx_t *tx, boolean_t have_read)
+{
+ uint64_t txgoff = tx->tx_txg & TXG_MASK;
+ int epbs, new_nlevels;
+ uint64_t sz;
+
+ ASSERT(blkid != DMU_BONUS_BLKID);
+
+ ASSERT(have_read ?
+ RW_READ_HELD(&dn->dn_struct_rwlock) :
+ RW_WRITE_HELD(&dn->dn_struct_rwlock));
+
+ /*
+ * if we have a read-lock, check to see if we need to do any work
+ * before upgrading to a write-lock.
+ */
+ if (have_read) {
+ if (blkid <= dn->dn_maxblkid)
+ return;
+
+ if (!rw_tryupgrade(&dn->dn_struct_rwlock)) {
+ rw_exit(&dn->dn_struct_rwlock);
+ rw_enter(&dn->dn_struct_rwlock, RW_WRITER);
+ }
+ }
+
+ if (blkid <= dn->dn_maxblkid)
+ goto out;
+
+ dn->dn_maxblkid = blkid;
+
+ /*
+ * Compute the number of levels necessary to support the new maxblkid.
+ */
+ new_nlevels = 1;
+ epbs = dn->dn_indblkshift - SPA_BLKPTRSHIFT;
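+ /* the sz >= dn_nblkptr test terminates the loop if sz wraps past 2^64 */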
+ for (sz = dn->dn_nblkptr;
+ sz <= blkid && sz >= dn->dn_nblkptr; sz <<= epbs)
+ new_nlevels++;
+
+ if (new_nlevels > dn->dn_nlevels) {
+ int old_nlevels = dn->dn_nlevels;
+ dmu_buf_impl_t *db;
+ list_t *list;
+ dbuf_dirty_record_t *new, *dr, *dr_next;
+
+ dn->dn_nlevels = new_nlevels;
+
+ ASSERT3U(new_nlevels, >, dn->dn_next_nlevels[txgoff]);
+ dn->dn_next_nlevels[txgoff] = new_nlevels;
+
+ /* dirty the left indirects */
+ db = dbuf_hold_level(dn, old_nlevels, 0, FTAG);
+ ASSERT(db != NULL);
+ new = dbuf_dirty(db, tx);
+ dbuf_rele(db, FTAG);
+
+ /* transfer the dirty records to the new indirect */
+ mutex_enter(&dn->dn_mtx);
+ mutex_enter(&new->dt.di.dr_mtx);
+ list = &dn->dn_dirty_records[txgoff];
+ for (dr = list_head(list); dr; dr = dr_next) {
+ dr_next = list_next(&dn->dn_dirty_records[txgoff], dr);
+ if (dr->dr_dbuf->db_level != new_nlevels-1 &&
+ dr->dr_dbuf->db_blkid != DMU_BONUS_BLKID &&
+ dr->dr_dbuf->db_blkid != DMU_SPILL_BLKID) {
+ ASSERT(dr->dr_dbuf->db_level == old_nlevels-1);
+ list_remove(&dn->dn_dirty_records[txgoff], dr);
+ list_insert_tail(&new->dt.di.dr_children, dr);
+ dr->dr_parent = new;
+ }
+ }
+ mutex_exit(&new->dt.di.dr_mtx);
+ mutex_exit(&dn->dn_mtx);
+ }
+
+out:
+ if (have_read)
+ rw_downgrade(&dn->dn_struct_rwlock);
+}
+
+static void
+dnode_dirty_l1(dnode_t *dn, uint64_t l1blkid, dmu_tx_t *tx)
+{
+ dmu_buf_impl_t *db = dbuf_hold_level(dn, 1, l1blkid, FTAG);
+ if (db != NULL) {
+ dmu_buf_will_dirty(&db->db, tx);
+ dbuf_rele(db, FTAG);
+ }
+}
+
+/*
+ * Dirty all the in-core level-1 dbufs in the range specified by start_blkid
+ * and end_blkid (both exclusive; the caller dirties the endpoints itself).
+ */
+static void
+dnode_dirty_l1range(dnode_t *dn, uint64_t start_blkid, uint64_t end_blkid,
+ dmu_tx_t *tx)
+{
+ dmu_buf_impl_t db_search;
+ dmu_buf_impl_t *db;
+ avl_index_t where;
+
+ mutex_enter(&dn->dn_dbufs_mtx);
+
+ db_search.db_level = 1;
+ db_search.db_blkid = start_blkid + 1;
+ db_search.db_state = DB_SEARCH;
+ for (;;) {
+
+ db = avl_find(&dn->dn_dbufs, &db_search, &where);
+ if (db == NULL)
+ db = avl_nearest(&dn->dn_dbufs, where, AVL_AFTER);
+
+ if (db == NULL || db->db_level != 1 ||
+ db->db_blkid >= end_blkid) {
+ break;
+ }
+
+ /*
+ * Setup the next blkid we want to search for.
+ */
+ db_search.db_blkid = db->db_blkid + 1;
+ ASSERT3U(db->db_blkid, >=, start_blkid);
+
+ /*
+ * If the dbuf transitions to DB_EVICTING while we're trying
+ * to dirty it, then we will be unable to discover it in
+ * the dbuf hash table. This will result in a call to
+ * dbuf_create() which needs to acquire the dn_dbufs_mtx
+ * lock. To avoid a deadlock, we drop the lock before
+ * dirtying the level-1 dbuf.
+ */
+ mutex_exit(&dn->dn_dbufs_mtx);
+ dnode_dirty_l1(dn, db->db_blkid, tx);
+ mutex_enter(&dn->dn_dbufs_mtx);
+ }
+
+#ifdef ZFS_DEBUG
+ /*
+ * Walk all the in-core level-1 dbufs and verify they have been dirtied.
+ */
+ db_search.db_level = 1;
+ db_search.db_blkid = start_blkid + 1;
+ db_search.db_state = DB_SEARCH;
+ db = avl_find(&dn->dn_dbufs, &db_search, &where);
+ if (db == NULL)
+ db = avl_nearest(&dn->dn_dbufs, where, AVL_AFTER);
+ for (; db != NULL; db = AVL_NEXT(&dn->dn_dbufs, db)) {
+ if (db->db_level != 1 || db->db_blkid >= end_blkid)
+ break;
+ ASSERT(db->db_dirtycnt > 0);
+ }
+#endif
+ mutex_exit(&dn->dn_dbufs_mtx);
+}
+
+void
+dnode_free_range(dnode_t *dn, uint64_t off, uint64_t len, dmu_tx_t *tx)
+{
+ dmu_buf_impl_t *db;
+ uint64_t blkoff, blkid, nblks;
+ int blksz, blkshift, head, tail;
+ int trunc = FALSE;
+ int epbs;
+
+ rw_enter(&dn->dn_struct_rwlock, RW_WRITER);
+ blksz = dn->dn_datablksz;
+ blkshift = dn->dn_datablkshift;
+ epbs = dn->dn_indblkshift - SPA_BLKPTRSHIFT;
+
+ if (len == DMU_OBJECT_END) {
+ len = UINT64_MAX - off;
+ trunc = TRUE;
+ }
+
+ /*
+ * First, block align the region to free:
+ */
+ if (ISP2(blksz)) {
+ head = P2NPHASE(off, blksz);
+ blkoff = P2PHASE(off, blksz);
+ if ((off >> blkshift) > dn->dn_maxblkid)
+ goto out;
+ } else {
+ ASSERT(dn->dn_maxblkid == 0);
+ if (off == 0 && len >= blksz) {
+ /*
+ * Freeing the whole block; fast-track this request.
+ */
+ blkid = 0;
+ nblks = 1;
+ if (dn->dn_nlevels > 1)
+ dnode_dirty_l1(dn, 0, tx);
+ goto done;
+ } else if (off >= blksz) {
+ /* Freeing past end-of-data */
+ goto out;
+ } else {
+ /* Freeing part of the block. */
+ head = blksz - off;
+ ASSERT3U(head, >, 0);
+ }
+ blkoff = off;
+ }
+ /* zero out any partial block data at the start of the range */
+ if (head) {
+ ASSERT3U(blkoff + head, ==, blksz);
+ if (len < head)
+ head = len;
+ if (dbuf_hold_impl(dn, 0, dbuf_whichblock(dn, 0, off),
+ TRUE, FALSE, FTAG, &db) == 0) {
+ caddr_t data;
+
+ /* don't dirty if it isn't on disk and isn't dirty */
+ if (db->db_last_dirty ||
+ (db->db_blkptr && !BP_IS_HOLE(db->db_blkptr))) {
+ rw_exit(&dn->dn_struct_rwlock);
+ dmu_buf_will_dirty(&db->db, tx);
+ rw_enter(&dn->dn_struct_rwlock, RW_WRITER);
+ data = db->db.db_data;
+ bzero(data + blkoff, head);
+ }
+ dbuf_rele(db, FTAG);
+ }
+ off += head;
+ len -= head;
+ }
+
+ /* If the range was less than one block, we're done */
+ if (len == 0)
+ goto out;
+
+ /* If the remaining range is past end of file, we're done */
+ if ((off >> blkshift) > dn->dn_maxblkid)
+ goto out;
+
+ ASSERT(ISP2(blksz));
+ if (trunc)
+ tail = 0;
+ else
+ tail = P2PHASE(len, blksz);
+
+ ASSERT0(P2PHASE(off, blksz));
+ /* zero out any partial block data at the end of the range */
+ if (tail) {
+ if (len < tail)
+ tail = len;
+ if (dbuf_hold_impl(dn, 0, dbuf_whichblock(dn, 0, off+len),
+ TRUE, FALSE, FTAG, &db) == 0) {
+ /* don't dirty if not on disk and not dirty */
+ if (db->db_last_dirty ||
+ (db->db_blkptr && !BP_IS_HOLE(db->db_blkptr))) {
+ rw_exit(&dn->dn_struct_rwlock);
+ dmu_buf_will_dirty(&db->db, tx);
+ rw_enter(&dn->dn_struct_rwlock, RW_WRITER);
+ bzero(db->db.db_data, tail);
+ }
+ dbuf_rele(db, FTAG);
+ }
+ len -= tail;
+ }
+
+ /* If the range did not include a full block, we are done */
+ if (len == 0)
+ goto out;
+
+ ASSERT(IS_P2ALIGNED(off, blksz));
+ ASSERT(trunc || IS_P2ALIGNED(len, blksz));
+ blkid = off >> blkshift;
+ nblks = len >> blkshift;
+ if (trunc)
+ nblks += 1;
+
+ /*
+ * Dirty all the indirect blocks in this range. Note that only
+ * the first and last indirect blocks can actually be written
+ * (if they were partially freed) -- they must be dirtied, even if
+ * they do not exist on disk yet. The interior blocks will
+ * be freed by free_children(), so they will not actually be written.
+ * Even though these interior blocks will not be written, we
+ * dirty them for two reasons:
+ *
+ * - It ensures that the indirect blocks remain in memory until
+ * syncing context. (They have already been prefetched by
+ * dmu_tx_hold_free(), so we don't have to worry about reading
+ * them serially here.)
+ *
+ * - The dirty space accounting will put pressure on the txg sync
+ * mechanism to begin syncing, and to delay transactions if there
+ * is a large amount of freeing. Even though these indirect
+ * blocks will not be written, we could need to write the same
+ * amount of space if we copy the freed BPs into deadlists.
+ */
+ if (dn->dn_nlevels > 1) {
+ uint64_t first, last;
+
+ first = blkid >> epbs;
+ dnode_dirty_l1(dn, first, tx);
+ if (trunc)
+ last = dn->dn_maxblkid >> epbs;
+ else
+ last = (blkid + nblks - 1) >> epbs;
+ if (last != first)
+ dnode_dirty_l1(dn, last, tx);
+
+ dnode_dirty_l1range(dn, first, last, tx);
+
+ int shift = dn->dn_datablkshift + dn->dn_indblkshift -
+ SPA_BLKPTRSHIFT;
+ for (uint64_t i = first + 1; i < last; i++) {
+ /*
+ * Set i to the blockid of the next non-hole
+ * level-1 indirect block at or after i. Note
+ * that dnode_next_offset() operates in terms of
+ * level-0-equivalent bytes.
+ */
+ uint64_t ibyte = i << shift;
+ int err = dnode_next_offset(dn, DNODE_FIND_HAVELOCK,
+ &ibyte, 2, 1, 0);
+ i = ibyte >> shift;
+ if (i >= last)
+ break;
+
+ /*
+ * Normally we should not see an error, either
+ * from dnode_next_offset() or dbuf_hold_level()
+ * (except for ESRCH from dnode_next_offset).
+ * If there is an i/o error, then when we read
+ * this block in syncing context, it will use
+ * ZIO_FLAG_MUSTSUCCEED, and thus hang/panic according
+ * to the "failmode" property. dnode_next_offset()
+ * doesn't have a flag to indicate MUSTSUCCEED.
+ */
+ if (err != 0)
+ break;
+
+ dnode_dirty_l1(dn, i, tx);
+ }
+ }
+
+done:
+ /*
+ * Add this range to the dnode range list.
+ * We will finish up this free operation in the syncing phase.
+ */
+ mutex_enter(&dn->dn_mtx);
+ int txgoff = tx->tx_txg & TXG_MASK;
+ if (dn->dn_free_ranges[txgoff] == NULL) {
+ dn->dn_free_ranges[txgoff] = range_tree_create(NULL, NULL);
+ }
+ range_tree_clear(dn->dn_free_ranges[txgoff], blkid, nblks);
+ range_tree_add(dn->dn_free_ranges[txgoff], blkid, nblks);
+ dprintf_dnode(dn, "blkid=%llu nblks=%llu txg=%llu\n",
+ blkid, nblks, tx->tx_txg);
+ mutex_exit(&dn->dn_mtx);
+
+ dbuf_free_range(dn, blkid, blkid + nblks - 1, tx);
+ dnode_setdirty(dn, tx);
+out:
+
+ rw_exit(&dn->dn_struct_rwlock);
+}
+
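+/*
+ * Return B_TRUE if the spill block is scheduled to be freed in any
+ * open (not yet synced) txg.
+ */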
+static boolean_t
+dnode_spill_freed(dnode_t *dn)
+{
+ int i;
+
+ mutex_enter(&dn->dn_mtx);
+ for (i = 0; i < TXG_SIZE; i++) {
+ if (dn->dn_rm_spillblk[i] == DN_KILL_SPILLBLK)
+ break;
+ }
+ mutex_exit(&dn->dn_mtx);
+ return (i < TXG_SIZE);
+}
+
+/* Return TRUE if this blkid was freed in an open txg, or FALSE if it wasn't. */
+uint64_t
+dnode_block_freed(dnode_t *dn, uint64_t blkid)
+{
+ void *dp = spa_get_dsl(dn->dn_objset->os_spa);
+ int i;
+
+ if (blkid == DMU_BONUS_BLKID)
+ return (FALSE);
+
+ /*
+ * If we're in the process of opening the pool, dp will not be
+ * set yet, but there shouldn't be anything dirty.
+ */
+ if (dp == NULL)
+ return (FALSE);
+
+ if (dn->dn_free_txg)
+ return (TRUE);
+
+ if (blkid == DMU_SPILL_BLKID)
+ return (dnode_spill_freed(dn));
+
+ mutex_enter(&dn->dn_mtx);
+ for (i = 0; i < TXG_SIZE; i++) {
+ if (dn->dn_free_ranges[i] != NULL &&
+ range_tree_contains(dn->dn_free_ranges[i], blkid, 1))
+ break;
+ }
+ mutex_exit(&dn->dn_mtx);
+ return (i < TXG_SIZE);
+}
+
+/* call from syncing context when we actually write/free space for this dnode */
+void
+dnode_diduse_space(dnode_t *dn, int64_t delta)
+{
+ uint64_t space;
+ dprintf_dnode(dn, "dn=%p dnp=%p used=%llu delta=%lld\n",
+ dn, dn->dn_phys,
+ (u_longlong_t)dn->dn_phys->dn_used,
+ (longlong_t)delta);
+
+ mutex_enter(&dn->dn_mtx);
+ space = DN_USED_BYTES(dn->dn_phys);
+ if (delta > 0) {
+ ASSERT3U(space + delta, >=, space); /* no overflow */
+ } else {
+ ASSERT3U(space, >=, -delta); /* no underflow */
+ }
+ space += delta;
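+ /*
+ * Pools older than SPA_VERSION_DNODE_BYTES store dn_used in units of
+ * 512-byte sectors; newer pools store bytes and flag it as such.
+ */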
+ if (spa_version(dn->dn_objset->os_spa) < SPA_VERSION_DNODE_BYTES) {
+ ASSERT((dn->dn_phys->dn_flags & DNODE_FLAG_USED_BYTES) == 0);
+ ASSERT0(P2PHASE(space, 1<<DEV_BSHIFT));
+ dn->dn_phys->dn_used = space >> DEV_BSHIFT;
+ } else {
+ dn->dn_phys->dn_used = space;
+ dn->dn_phys->dn_flags |= DNODE_FLAG_USED_BYTES;
+ }
+ mutex_exit(&dn->dn_mtx);
+}
+
+/*
+ * Scans a block at the indicated "level" looking for a hole or data,
+ * depending on 'flags'.
+ *
+ * If level > 0, then we are scanning an indirect block looking at its
+ * pointers. If level == 0, then we are looking at a block of dnodes.
+ *
+ * If we don't find what we are looking for in the block, we return ESRCH.
+ * Otherwise, return with *offset pointing to the beginning (if searching
+ * forwards) or end (if searching backwards) of the range covered by the
+ * block pointer we matched on (or dnode).
+ *
+ * The basic search algorithm used below by dnode_next_offset() is to
+ * use this function to search up the block tree (widen the search) until
+ * we find something (i.e., we don't return ESRCH) and then search back
+ * down the tree (narrow the search) until we reach our original search
+ * level.
+ */
+static int
+dnode_next_offset_level(dnode_t *dn, int flags, uint64_t *offset,
+ int lvl, uint64_t blkfill, uint64_t txg)
+{
+ dmu_buf_impl_t *db = NULL;
+ void *data = NULL;
+ uint64_t epbs = dn->dn_phys->dn_indblkshift - SPA_BLKPTRSHIFT;
+ uint64_t epb = 1ULL << epbs;
+ uint64_t minfill, maxfill;
+ boolean_t hole;
+ int i, inc, error, span;
+
+ dprintf("probing object %llu offset %llx level %d of %u\n",
+ dn->dn_object, *offset, lvl, dn->dn_phys->dn_nlevels);
+
+ hole = ((flags & DNODE_FIND_HOLE) != 0);
+ inc = (flags & DNODE_FIND_BACKWARDS) ? -1 : 1;
+ ASSERT(txg == 0 || !hole);
+
+ if (lvl == dn->dn_phys->dn_nlevels) {
+ error = 0;
+ epb = dn->dn_phys->dn_nblkptr;
+ data = dn->dn_phys->dn_blkptr;
+ } else {
+ uint64_t blkid = dbuf_whichblock(dn, lvl, *offset);
+ error = dbuf_hold_impl(dn, lvl, blkid, TRUE, FALSE, FTAG, &db);
+ if (error) {
+ if (error != ENOENT)
+ return (error);
+ if (hole)
+ return (0);
+ /*
+ * This can only happen when we are searching up
+ * the block tree for data. We don't really need to
+ * adjust the offset, as we will just end up looking
+ * at the pointer to this block in its parent, and it's
+ * going to be unallocated, so we will skip over it.
+ */
+ return (SET_ERROR(ESRCH));
+ }
+ error = dbuf_read(db, NULL, DB_RF_CANFAIL | DB_RF_HAVESTRUCT);
+ if (error) {
+ dbuf_rele(db, FTAG);
+ return (error);
+ }
+ data = db->db.db_data;
+ }
+
+ if (db != NULL && txg != 0 && (db->db_blkptr == NULL ||
+ db->db_blkptr->blk_birth <= txg ||
+ BP_IS_HOLE(db->db_blkptr))) {
+ /*
+ * This can only happen when we are searching up the tree
+ * and these conditions mean that we need to keep climbing.
+ */
+ error = SET_ERROR(ESRCH);
+ } else if (lvl == 0) {
+ dnode_phys_t *dnp = data;
+
+ ASSERT(dn->dn_type == DMU_OT_DNODE);
+ ASSERT(!(flags & DNODE_FIND_BACKWARDS));
+
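+ /*
+ * Stepping by dn_extra_slots + 1 skips the interior slots of
+ * large dnodes instead of misreading them as dnodes.
+ */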
+ for (i = (*offset >> DNODE_SHIFT) & (blkfill - 1);
+ i < blkfill; i += dnp[i].dn_extra_slots + 1) {
+ if ((dnp[i].dn_type == DMU_OT_NONE) == hole)
+ break;
+ }
+
+ if (i == blkfill)
+ error = SET_ERROR(ESRCH);
+
+ *offset = (*offset & ~(DNODE_BLOCK_SIZE - 1)) +
+ (i << DNODE_SHIFT);
+ } else {
+ blkptr_t *bp = data;
+ uint64_t start = *offset;
+ span = (lvl - 1) * epbs + dn->dn_datablkshift;
+ minfill = 0;
+ maxfill = blkfill << ((lvl - 1) * epbs);
+
+ if (hole)
+ maxfill--;
+ else
+ minfill++;
+
+ *offset = *offset >> span;
+ for (i = BF64_GET(*offset, 0, epbs);
+ i >= 0 && i < epb; i += inc) {
+ if (BP_GET_FILL(&bp[i]) >= minfill &&
+ BP_GET_FILL(&bp[i]) <= maxfill &&
+ (hole || bp[i].blk_birth > txg))
+ break;
+ if (inc > 0 || *offset > 0)
+ *offset += inc;
+ }
+ *offset = *offset << span;
+ if (inc < 0) {
+ /* traversing backwards; position offset at the end */
+ ASSERT3U(*offset, <=, start);
+ *offset = MIN(*offset + (1ULL << span) - 1, start);
+ } else if (*offset < start) {
+ *offset = start;
+ }
+ if (i < 0 || i >= epb)
+ error = SET_ERROR(ESRCH);
+ }
+
+ if (db)
+ dbuf_rele(db, FTAG);
+
+ return (error);
+}
+
+/*
+ * Find the next hole, data, or sparse region at or after *offset.
+ * The value 'blkfill' tells us how many items we expect to find
+ * in an L0 data block; this value is 1 for normal objects,
+ * DNODES_PER_BLOCK for the meta dnode, and some fraction of
+ * DNODES_PER_BLOCK when searching for sparse regions thereof.
+ *
+ * Examples:
+ *
+ * dnode_next_offset(dn, flags, offset, 1, 1, 0);
+ * Finds the next/previous hole/data in a file.
+ * Used in dmu_offset_next().
+ *
+ * dnode_next_offset(mdn, flags, offset, 0, DNODES_PER_BLOCK, txg);
+ *	Finds the next free/allocated dnode in an objset's meta-dnode.
+ *	Only finds objects that have new contents since txg (i.e.,
+ *	bonus buffer changes and content removal are ignored).
+ * Used in dmu_object_next().
+ *
+ * dnode_next_offset(mdn, DNODE_FIND_HOLE, offset, 2, DNODES_PER_BLOCK >> 2, 0);
+ * Finds the next L2 meta-dnode bp that's at most 1/4 full.
+ * Used in dmu_object_alloc().
+ */
+int
+dnode_next_offset(dnode_t *dn, int flags, uint64_t *offset,
+ int minlvl, uint64_t blkfill, uint64_t txg)
+{
+ uint64_t initial_offset = *offset;
+ int lvl, maxlvl;
+ int error = 0;
+
+ if (!(flags & DNODE_FIND_HAVELOCK))
+ rw_enter(&dn->dn_struct_rwlock, RW_READER);
+
+ if (dn->dn_phys->dn_nlevels == 0) {
+ error = SET_ERROR(ESRCH);
+ goto out;
+ }
+
+ if (dn->dn_datablkshift == 0) {
+ if (*offset < dn->dn_datablksz) {
+ if (flags & DNODE_FIND_HOLE)
+ *offset = dn->dn_datablksz;
+ } else {
+ error = SET_ERROR(ESRCH);
+ }
+ goto out;
+ }
+
+ maxlvl = dn->dn_phys->dn_nlevels;
+
+ for (lvl = minlvl; lvl <= maxlvl; lvl++) {
+ error = dnode_next_offset_level(dn,
+ flags, offset, lvl, blkfill, txg);
+ if (error != ESRCH)
+ break;
+ }
+
+ while (error == 0 && --lvl >= minlvl) {
+ error = dnode_next_offset_level(dn,
+ flags, offset, lvl, blkfill, txg);
+ }
+
+ /*
+ * There's always a "virtual hole" at the end of the object, even
+ * if all BP's which physically exist are non-holes.
+ */
+ if ((flags & DNODE_FIND_HOLE) && error == ESRCH && txg == 0 &&
+ minlvl == 1 && blkfill == 1 && !(flags & DNODE_FIND_BACKWARDS)) {
+ error = 0;
+ }
+
+ if (error == 0 && (flags & DNODE_FIND_BACKWARDS ?
+ initial_offset < *offset : initial_offset > *offset))
+ error = SET_ERROR(ESRCH);
+out:
+ if (!(flags & DNODE_FIND_HAVELOCK))
+ rw_exit(&dn->dn_struct_rwlock);
+
+ return (error);
+}
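+
+/*
+ * Illustrative caller (editor's sketch, not part of this import): this
+ * mirrors the dmu_offset_next() usage from the examples above; "dn" is
+ * assumed to be a dnode already held by the caller.
+ *
+ *	uint64_t off = start;
+ *	int err = dnode_next_offset(dn, 0, &off, 1, 1, 0);
+ *	if (err == 0)
+ *		... off is within the next data region at or after start ...
+ *	else if (err == ESRCH)
+ *		... no more data at or after start ...
+ */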
diff --git a/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/dnode_sync.c b/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/dnode_sync.c
new file mode 100644
index 000000000000..9283356608aa
--- /dev/null
+++ b/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/dnode_sync.c
@@ -0,0 +1,779 @@
+/*
+ * CDDL HEADER START
+ *
+ * The contents of this file are subject to the terms of the
+ * Common Development and Distribution License (the "License").
+ * You may not use this file except in compliance with the License.
+ *
+ * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
+ * or http://www.opensolaris.org/os/licensing.
+ * See the License for the specific language governing permissions
+ * and limitations under the License.
+ *
+ * When distributing Covered Code, include this CDDL HEADER in each
+ * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
+ * If applicable, add the following below this CDDL HEADER, with the
+ * fields enclosed by brackets "[]" replaced with your own identifying
+ * information: Portions Copyright [yyyy] [name of copyright owner]
+ *
+ * CDDL HEADER END
+ */
+
+/*
+ * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved.
+ * Copyright (c) 2012, 2018 by Delphix. All rights reserved.
+ * Copyright (c) 2014 Spectra Logic Corporation, All rights reserved.
+ */
+
+#include <sys/zfs_context.h>
+#include <sys/dbuf.h>
+#include <sys/dnode.h>
+#include <sys/dmu.h>
+#include <sys/dmu_tx.h>
+#include <sys/dmu_objset.h>
+#include <sys/dsl_dataset.h>
+#include <sys/spa.h>
+#include <sys/range_tree.h>
+#include <sys/zfeature.h>
+
+static void
+dnode_increase_indirection(dnode_t *dn, dmu_tx_t *tx)
+{
+ dmu_buf_impl_t *db;
+ int txgoff = tx->tx_txg & TXG_MASK;
+ int nblkptr = dn->dn_phys->dn_nblkptr;
+ int old_toplvl = dn->dn_phys->dn_nlevels - 1;
+ int new_level = dn->dn_next_nlevels[txgoff];
+ int i;
+
+ rw_enter(&dn->dn_struct_rwlock, RW_WRITER);
+
+ /* this dnode can't be paged out because it's dirty */
+ ASSERT(dn->dn_phys->dn_type != DMU_OT_NONE);
+ ASSERT(RW_WRITE_HELD(&dn->dn_struct_rwlock));
+ ASSERT(new_level > 1 && dn->dn_phys->dn_nlevels > 0);
+
+ db = dbuf_hold_level(dn, dn->dn_phys->dn_nlevels, 0, FTAG);
+ ASSERT(db != NULL);
+
+ dn->dn_phys->dn_nlevels = new_level;
+ dprintf("os=%p obj=%llu, increase to %d\n", dn->dn_objset,
+ dn->dn_object, dn->dn_phys->dn_nlevels);
+
+ /* transfer dnode's block pointers to new indirect block */
+ (void) dbuf_read(db, NULL, DB_RF_MUST_SUCCEED|DB_RF_HAVESTRUCT);
+ ASSERT(db->db.db_data);
+ ASSERT(arc_released(db->db_buf));
+ ASSERT3U(sizeof (blkptr_t) * nblkptr, <=, db->db.db_size);
+ bcopy(dn->dn_phys->dn_blkptr, db->db.db_data,
+ sizeof (blkptr_t) * nblkptr);
+ arc_buf_freeze(db->db_buf);
+
+ /* set dbuf's parent pointers to new indirect buf */
+ for (i = 0; i < nblkptr; i++) {
+ dmu_buf_impl_t *child =
+ dbuf_find(dn->dn_objset, dn->dn_object, old_toplvl, i);
+
+ if (child == NULL)
+ continue;
+#ifdef DEBUG
+ DB_DNODE_ENTER(child);
+ ASSERT3P(DB_DNODE(child), ==, dn);
+ DB_DNODE_EXIT(child);
+#endif /* DEBUG */
+ if (child->db_parent && child->db_parent != dn->dn_dbuf) {
+ ASSERT(child->db_parent->db_level == db->db_level);
+ ASSERT(child->db_blkptr !=
+ &dn->dn_phys->dn_blkptr[child->db_blkid]);
+ mutex_exit(&child->db_mtx);
+ continue;
+ }
+ ASSERT(child->db_parent == NULL ||
+ child->db_parent == dn->dn_dbuf);
+
+ child->db_parent = db;
+ dbuf_add_ref(db, child);
+ if (db->db.db_data)
+ child->db_blkptr = (blkptr_t *)db->db.db_data + i;
+ else
+ child->db_blkptr = NULL;
+ dprintf_dbuf_bp(child, child->db_blkptr,
+ "changed db_blkptr to new indirect %s", "");
+
+ mutex_exit(&child->db_mtx);
+ }
+
+ bzero(dn->dn_phys->dn_blkptr, sizeof (blkptr_t) * nblkptr);
+
+ dbuf_rele(db, FTAG);
+
+ rw_exit(&dn->dn_struct_rwlock);
+}
+
+static void
+free_blocks(dnode_t *dn, blkptr_t *bp, int num, dmu_tx_t *tx)
+{
+ dsl_dataset_t *ds = dn->dn_objset->os_dsl_dataset;
+ uint64_t bytesfreed = 0;
+
+ dprintf("ds=%p obj=%llx num=%d\n", ds, dn->dn_object, num);
+
+ for (int i = 0; i < num; i++, bp++) {
+ if (BP_IS_HOLE(bp))
+ continue;
+
+ bytesfreed += dsl_dataset_block_kill(ds, bp, tx, B_FALSE);
+ ASSERT3U(bytesfreed, <=, DN_USED_BYTES(dn->dn_phys));
+
+ /*
+ * Save some useful information on the holes being
+ * punched, including logical size, type, and indirection
+ * level. Retaining birth time enables detection of when
+ * holes are punched for reducing the number of free
+ * records transmitted during a zfs send.
+ */
+
+ uint64_t lsize = BP_GET_LSIZE(bp);
+ dmu_object_type_t type = BP_GET_TYPE(bp);
+ uint64_t lvl = BP_GET_LEVEL(bp);
+
+ bzero(bp, sizeof (blkptr_t));
+
+ if (spa_feature_is_active(dn->dn_objset->os_spa,
+ SPA_FEATURE_HOLE_BIRTH)) {
+ BP_SET_LSIZE(bp, lsize);
+ BP_SET_TYPE(bp, type);
+ BP_SET_LEVEL(bp, lvl);
+ BP_SET_BIRTH(bp, dmu_tx_get_txg(tx), 0);
+ }
+ }
+ dnode_diduse_space(dn, -bytesfreed);
+}
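+
+/*
+ * For example (editor's note): with the hole_birth feature active, a
+ * freed 128K L0 data block above becomes a hole BP whose DVAs are all
+ * zero but which keeps BP_GET_LSIZE() == 128K, its type and level, and
+ * a birth time equal to the freeing txg.  A later incremental zfs send
+ * can then see that the hole was punched after the previous snapshot
+ * and send a free record for it, per the comment above.
+ */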
+
+#ifdef ZFS_DEBUG
+static void
+free_verify(dmu_buf_impl_t *db, uint64_t start, uint64_t end, dmu_tx_t *tx)
+{
+ int off, num;
+ int i, err, epbs;
+ uint64_t txg = tx->tx_txg;
+ dnode_t *dn;
+
+ DB_DNODE_ENTER(db);
+ dn = DB_DNODE(db);
+ epbs = dn->dn_phys->dn_indblkshift - SPA_BLKPTRSHIFT;
+	off = start - (db->db_blkid << epbs);
+ num = end - start + 1;
+
+ ASSERT3U(off, >=, 0);
+ ASSERT3U(num, >=, 0);
+ ASSERT3U(db->db_level, >, 0);
+ ASSERT3U(db->db.db_size, ==, 1 << dn->dn_phys->dn_indblkshift);
+ ASSERT3U(off+num, <=, db->db.db_size >> SPA_BLKPTRSHIFT);
+ ASSERT(db->db_blkptr != NULL);
+
+ for (i = off; i < off+num; i++) {
+ uint64_t *buf;
+ dmu_buf_impl_t *child;
+ dbuf_dirty_record_t *dr;
+ int j;
+
+ ASSERT(db->db_level == 1);
+
+ rw_enter(&dn->dn_struct_rwlock, RW_READER);
+ err = dbuf_hold_impl(dn, db->db_level-1,
+ (db->db_blkid << epbs) + i, TRUE, FALSE, FTAG, &child);
+ rw_exit(&dn->dn_struct_rwlock);
+ if (err == ENOENT)
+ continue;
+ ASSERT(err == 0);
+ ASSERT(child->db_level == 0);
+ dr = child->db_last_dirty;
+ while (dr && dr->dr_txg > txg)
+ dr = dr->dr_next;
+ ASSERT(dr == NULL || dr->dr_txg == txg);
+
+ /* data_old better be zeroed */
+ if (dr) {
+ buf = dr->dt.dl.dr_data->b_data;
+ for (j = 0; j < child->db.db_size >> 3; j++) {
+ if (buf[j] != 0) {
+ panic("freed data not zero: "
+ "child=%p i=%d off=%d num=%d\n",
+ (void *)child, i, off, num);
+ }
+ }
+ }
+
+ /*
+ * db_data better be zeroed unless it's dirty in a
+ * future txg.
+ */
+ mutex_enter(&child->db_mtx);
+ buf = child->db.db_data;
+ if (buf != NULL && child->db_state != DB_FILL &&
+ child->db_last_dirty == NULL) {
+ for (j = 0; j < child->db.db_size >> 3; j++) {
+ if (buf[j] != 0) {
+ panic("freed data not zero: "
+ "child=%p i=%d off=%d num=%d\n",
+ (void *)child, i, off, num);
+ }
+ }
+ }
+ mutex_exit(&child->db_mtx);
+
+ dbuf_rele(child, FTAG);
+ }
+ DB_DNODE_EXIT(db);
+}
+#endif
+
+/*
+ * We don't usually free the indirect blocks here. If in one txg we have a
+ * free_range and a write to the same indirect block, it's important that we
+ * preserve the hole's birth times. Therefore, we don't free any indirect
+ * blocks in free_children(). If an indirect block happens to turn into all
+ * holes, it will be freed by dbuf_write_children_ready(), which happens at a
+ * point in the syncing process where we know for certain the contents of the
+ * indirect block.
+ *
+ * However, if we're freeing a dnode, its space accounting must go to zero
+ * before we actually try to free the dnode, or we will trip an assertion. In
+ * addition, we know the case described above cannot occur, because the dnode is
+ * being freed. Therefore, we free the indirect blocks immediately in that
+ * case.
+ */
+static void
+free_children(dmu_buf_impl_t *db, uint64_t blkid, uint64_t nblks,
+ boolean_t free_indirects, dmu_tx_t *tx)
+{
+ dnode_t *dn;
+ blkptr_t *bp;
+ dmu_buf_impl_t *subdb;
+ uint64_t start, end, dbstart, dbend;
+ unsigned int epbs, shift, i;
+
+ /*
+ * There is a small possibility that this block will not be cached:
+ * 1 - if level > 1 and there are no children with level <= 1
+ * 2 - if this block was evicted since we read it from
+ * dmu_tx_hold_free().
+ */
+ if (db->db_state != DB_CACHED)
+ (void) dbuf_read(db, NULL, DB_RF_MUST_SUCCEED);
+
+ /*
+ * If we modify this indirect block, and we are not freeing the
+ * dnode (!free_indirects), then this indirect block needs to get
+ * written to disk by dbuf_write(). If it is dirty, we know it will
+ * be written (otherwise, we would have incorrect on-disk state
+ * because the space would be freed but still referenced by the BP
+ * in this indirect block). Therefore we VERIFY that it is
+ * dirty.
+ *
+ * Our VERIFY covers some cases that do not actually have to be
+ * dirty, but the open-context code happens to dirty. E.g. if the
+ * blocks we are freeing are all holes, because in that case, we
+ * are only freeing part of this indirect block, so it is an
+ * ancestor of the first or last block to be freed. The first and
+ * last L1 indirect blocks are always dirtied by dnode_free_range().
+ */
+ VERIFY(BP_GET_FILL(db->db_blkptr) == 0 || db->db_dirtycnt > 0);
+
+ dbuf_release_bp(db);
+ bp = db->db.db_data;
+
+ DB_DNODE_ENTER(db);
+ dn = DB_DNODE(db);
+ epbs = dn->dn_phys->dn_indblkshift - SPA_BLKPTRSHIFT;
+ ASSERT3U(epbs, <, 31);
+ shift = (db->db_level - 1) * epbs;
+ dbstart = db->db_blkid << epbs;
+ start = blkid >> shift;
+ if (dbstart < start) {
+ bp += start - dbstart;
+ } else {
+ start = dbstart;
+ }
+ dbend = ((db->db_blkid + 1) << epbs) - 1;
+ end = (blkid + nblks - 1) >> shift;
+ if (dbend <= end)
+ end = dbend;
+
+ ASSERT3U(start, <=, end);
+
+ if (db->db_level == 1) {
+ FREE_VERIFY(db, start, end, tx);
+ free_blocks(dn, bp, end-start+1, tx);
+ } else {
+ for (uint64_t id = start; id <= end; id++, bp++) {
+ if (BP_IS_HOLE(bp))
+ continue;
+ rw_enter(&dn->dn_struct_rwlock, RW_READER);
+ VERIFY0(dbuf_hold_impl(dn, db->db_level - 1,
+ id, TRUE, FALSE, FTAG, &subdb));
+ rw_exit(&dn->dn_struct_rwlock);
+ ASSERT3P(bp, ==, subdb->db_blkptr);
+
+ free_children(subdb, blkid, nblks, free_indirects, tx);
+ dbuf_rele(subdb, FTAG);
+ }
+ }
+
+ if (free_indirects) {
+ for (i = 0, bp = db->db.db_data; i < 1 << epbs; i++, bp++)
+ ASSERT(BP_IS_HOLE(bp));
+ bzero(db->db.db_data, db->db.db_size);
+ free_blocks(dn, db->db_blkptr, 1, tx);
+ }
+
+ DB_DNODE_EXIT(db);
+ arc_buf_freeze(db->db_buf);
+}
+
+/*
+ * Traverse the indicated range of the provided file
+ * and "free" all the blocks contained there.
+ */
+static void
+dnode_sync_free_range_impl(dnode_t *dn, uint64_t blkid, uint64_t nblks,
+ boolean_t free_indirects, dmu_tx_t *tx)
+{
+ blkptr_t *bp = dn->dn_phys->dn_blkptr;
+ int dnlevel = dn->dn_phys->dn_nlevels;
+ boolean_t trunc = B_FALSE;
+
+ if (blkid > dn->dn_phys->dn_maxblkid)
+ return;
+
+ ASSERT(dn->dn_phys->dn_maxblkid < UINT64_MAX);
+ if (blkid + nblks > dn->dn_phys->dn_maxblkid) {
+ nblks = dn->dn_phys->dn_maxblkid - blkid + 1;
+ trunc = B_TRUE;
+ }
+
+ /* There are no indirect blocks in the object */
+ if (dnlevel == 1) {
+ if (blkid >= dn->dn_phys->dn_nblkptr) {
+ /* this range was never made persistent */
+ return;
+ }
+ ASSERT3U(blkid + nblks, <=, dn->dn_phys->dn_nblkptr);
+ free_blocks(dn, bp + blkid, nblks, tx);
+ } else {
+ int shift = (dnlevel - 1) *
+ (dn->dn_phys->dn_indblkshift - SPA_BLKPTRSHIFT);
+ int start = blkid >> shift;
+ int end = (blkid + nblks - 1) >> shift;
+ dmu_buf_impl_t *db;
+
+ ASSERT(start < dn->dn_phys->dn_nblkptr);
+ bp += start;
+ for (int i = start; i <= end; i++, bp++) {
+ if (BP_IS_HOLE(bp))
+ continue;
+ rw_enter(&dn->dn_struct_rwlock, RW_READER);
+ VERIFY0(dbuf_hold_impl(dn, dnlevel - 1, i,
+ TRUE, FALSE, FTAG, &db));
+ rw_exit(&dn->dn_struct_rwlock);
+
+ free_children(db, blkid, nblks, free_indirects, tx);
+ dbuf_rele(db, FTAG);
+ }
+ }
+
+ if (trunc) {
+ dn->dn_phys->dn_maxblkid = blkid == 0 ? 0 : blkid - 1;
+
+ uint64_t off = (dn->dn_phys->dn_maxblkid + 1) *
+ (dn->dn_phys->dn_datablkszsec << SPA_MINBLOCKSHIFT);
+ ASSERT(off < dn->dn_phys->dn_maxblkid ||
+ dn->dn_phys->dn_maxblkid == 0 ||
+ dnode_next_offset(dn, 0, &off, 1, 1, 0) != 0);
+ }
+}
+
+typedef struct dnode_sync_free_range_arg {
+ dnode_t *dsfra_dnode;
+ dmu_tx_t *dsfra_tx;
+ boolean_t dsfra_free_indirects;
+} dnode_sync_free_range_arg_t;
+
+static void
+dnode_sync_free_range(void *arg, uint64_t blkid, uint64_t nblks)
+{
+ dnode_sync_free_range_arg_t *dsfra = arg;
+ dnode_t *dn = dsfra->dsfra_dnode;
+
+ mutex_exit(&dn->dn_mtx);
+ dnode_sync_free_range_impl(dn, blkid, nblks,
+ dsfra->dsfra_free_indirects, dsfra->dsfra_tx);
+ mutex_enter(&dn->dn_mtx);
+}
+
+/*
+ * Try to kick all the dnode's dbufs out of the cache...
+ */
+void
+dnode_evict_dbufs(dnode_t *dn)
+{
+ dmu_buf_impl_t db_marker;
+ dmu_buf_impl_t *db, *db_next;
+
+ mutex_enter(&dn->dn_dbufs_mtx);
+ for (db = avl_first(&dn->dn_dbufs); db != NULL; db = db_next) {
+
+#ifdef DEBUG
+ DB_DNODE_ENTER(db);
+ ASSERT3P(DB_DNODE(db), ==, dn);
+ DB_DNODE_EXIT(db);
+#endif /* DEBUG */
+
+ mutex_enter(&db->db_mtx);
+ if (db->db_state != DB_EVICTING &&
+ zfs_refcount_is_zero(&db->db_holds)) {
+ db_marker.db_level = db->db_level;
+ db_marker.db_blkid = db->db_blkid;
+ db_marker.db_state = DB_SEARCH;
+ avl_insert_here(&dn->dn_dbufs, &db_marker, db,
+ AVL_BEFORE);
+
+ /*
+ * We need to use the "marker" dbuf rather than
+ * simply getting the next dbuf, because
+ * dbuf_destroy() may actually remove multiple dbufs.
+ * It can call itself recursively on the parent dbuf,
+ * which may also be removed from dn_dbufs. The code
+ * flow would look like:
+ *
+ * dbuf_destroy():
+ * dnode_rele_and_unlock(parent_dbuf, evicting=TRUE):
+ * if (!cacheable || pending_evict)
+ * dbuf_destroy()
+ */
+ dbuf_destroy(db);
+
+ db_next = AVL_NEXT(&dn->dn_dbufs, &db_marker);
+ avl_remove(&dn->dn_dbufs, &db_marker);
+ } else {
+ db->db_pending_evict = TRUE;
+ mutex_exit(&db->db_mtx);
+ db_next = AVL_NEXT(&dn->dn_dbufs, db);
+ }
+ }
+ mutex_exit(&dn->dn_dbufs_mtx);
+
+ dnode_evict_bonus(dn);
+}
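+
+/*
+ * The marker trick above is a general pattern for walking an AVL tree
+ * while the current node (and possibly others) may be removed mid-walk.
+ * A minimal sketch (editor's illustration; node_t and destroy_node are
+ * hypothetical):
+ *
+ *	node_t marker;
+ *	marker.state = MARKER;	// analogous to DB_SEARCH above
+ *	avl_insert_here(&tree, &marker, node, AVL_BEFORE);
+ *	destroy_node(node);	// may remove several nodes
+ *	next = AVL_NEXT(&tree, &marker);
+ *	avl_remove(&tree, &marker);
+ */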
+
+void
+dnode_evict_bonus(dnode_t *dn)
+{
+ rw_enter(&dn->dn_struct_rwlock, RW_WRITER);
+ if (dn->dn_bonus != NULL) {
+ if (zfs_refcount_is_zero(&dn->dn_bonus->db_holds)) {
+ mutex_enter(&dn->dn_bonus->db_mtx);
+ dbuf_destroy(dn->dn_bonus);
+ dn->dn_bonus = NULL;
+ } else {
+ dn->dn_bonus->db_pending_evict = TRUE;
+ }
+ }
+ rw_exit(&dn->dn_struct_rwlock);
+}
+
+static void
+dnode_undirty_dbufs(list_t *list)
+{
+ dbuf_dirty_record_t *dr;
+
+	while ((dr = list_head(list)) != NULL) {
+ dmu_buf_impl_t *db = dr->dr_dbuf;
+ uint64_t txg = dr->dr_txg;
+
+ if (db->db_level != 0)
+ dnode_undirty_dbufs(&dr->dt.di.dr_children);
+
+ mutex_enter(&db->db_mtx);
+ /* XXX - use dbuf_undirty()? */
+ list_remove(list, dr);
+ ASSERT(db->db_last_dirty == dr);
+ db->db_last_dirty = NULL;
+ db->db_dirtycnt -= 1;
+ if (db->db_level == 0) {
+ ASSERT(db->db_blkid == DMU_BONUS_BLKID ||
+ dr->dt.dl.dr_data == db->db_buf);
+ dbuf_unoverride(dr);
+ } else {
+ mutex_destroy(&dr->dt.di.dr_mtx);
+ list_destroy(&dr->dt.di.dr_children);
+ }
+ kmem_free(dr, sizeof (dbuf_dirty_record_t));
+ dbuf_rele_and_unlock(db, (void *)(uintptr_t)txg, B_FALSE);
+ }
+}
+
+static void
+dnode_sync_free(dnode_t *dn, dmu_tx_t *tx)
+{
+ int txgoff = tx->tx_txg & TXG_MASK;
+
+ ASSERT(dmu_tx_is_syncing(tx));
+
+ /*
+ * Our contents should have been freed in dnode_sync() by the
+ * free range record inserted by the caller of dnode_free().
+ */
+ ASSERT0(DN_USED_BYTES(dn->dn_phys));
+ ASSERT(BP_IS_HOLE(dn->dn_phys->dn_blkptr));
+
+ dnode_undirty_dbufs(&dn->dn_dirty_records[txgoff]);
+ dnode_evict_dbufs(dn);
+
+ /*
+ * XXX - It would be nice to assert this, but we may still
+ * have residual holds from async evictions from the arc...
+ *
+ * zfs_obj_to_path() also depends on this being
+ * commented out.
+ *
+ * ASSERT3U(zfs_refcount_count(&dn->dn_holds), ==, 1);
+ */
+
+ /* Undirty next bits */
+ dn->dn_next_nlevels[txgoff] = 0;
+ dn->dn_next_indblkshift[txgoff] = 0;
+ dn->dn_next_blksz[txgoff] = 0;
+
+ /* ASSERT(blkptrs are zero); */
+ ASSERT(dn->dn_phys->dn_type != DMU_OT_NONE);
+ ASSERT(dn->dn_type != DMU_OT_NONE);
+
+ ASSERT(dn->dn_free_txg > 0);
+ if (dn->dn_allocated_txg != dn->dn_free_txg)
+ dmu_buf_will_dirty(&dn->dn_dbuf->db, tx);
+ bzero(dn->dn_phys, sizeof (dnode_phys_t) * dn->dn_num_slots);
+ dnode_free_interior_slots(dn);
+
+ mutex_enter(&dn->dn_mtx);
+ dn->dn_type = DMU_OT_NONE;
+ dn->dn_maxblkid = 0;
+ dn->dn_allocated_txg = 0;
+ dn->dn_free_txg = 0;
+ dn->dn_have_spill = B_FALSE;
+ dn->dn_num_slots = 1;
+ mutex_exit(&dn->dn_mtx);
+
+ ASSERT(dn->dn_object != DMU_META_DNODE_OBJECT);
+
+ dnode_rele(dn, (void *)(uintptr_t)tx->tx_txg);
+ /*
+ * Now that we've released our hold, the dnode may
+	 * be evicted, so we mustn't access it.
+ */
+}
+
+/*
+ * Write out the dnode's dirty buffers.
+ */
+void
+dnode_sync(dnode_t *dn, dmu_tx_t *tx)
+{
+ dnode_phys_t *dnp = dn->dn_phys;
+ int txgoff = tx->tx_txg & TXG_MASK;
+ list_t *list = &dn->dn_dirty_records[txgoff];
+ static const dnode_phys_t zerodn = { 0 };
+ boolean_t kill_spill = B_FALSE;
+
+ ASSERT(dmu_tx_is_syncing(tx));
+ ASSERT(dnp->dn_type != DMU_OT_NONE || dn->dn_allocated_txg);
+ ASSERT(dnp->dn_type != DMU_OT_NONE ||
+ bcmp(dnp, &zerodn, DNODE_MIN_SIZE) == 0);
+ DNODE_VERIFY(dn);
+
+ ASSERT(dn->dn_dbuf == NULL || arc_released(dn->dn_dbuf->db_buf));
+
+ if (dmu_objset_userused_enabled(dn->dn_objset) &&
+ !DMU_OBJECT_IS_SPECIAL(dn->dn_object)) {
+ mutex_enter(&dn->dn_mtx);
+ dn->dn_oldused = DN_USED_BYTES(dn->dn_phys);
+ dn->dn_oldflags = dn->dn_phys->dn_flags;
+ dn->dn_phys->dn_flags |= DNODE_FLAG_USERUSED_ACCOUNTED;
+ mutex_exit(&dn->dn_mtx);
+ dmu_objset_userquota_get_ids(dn, B_FALSE, tx);
+ } else {
+ /* Once we account for it, we should always account for it. */
+ ASSERT(!(dn->dn_phys->dn_flags &
+ DNODE_FLAG_USERUSED_ACCOUNTED));
+ }
+
+ mutex_enter(&dn->dn_mtx);
+ if (dn->dn_allocated_txg == tx->tx_txg) {
+ /* The dnode is newly allocated or reallocated */
+ if (dnp->dn_type == DMU_OT_NONE) {
+ /* this is a first alloc, not a realloc */
+ dnp->dn_nlevels = 1;
+ dnp->dn_nblkptr = dn->dn_nblkptr;
+ }
+
+ dnp->dn_type = dn->dn_type;
+ dnp->dn_bonustype = dn->dn_bonustype;
+ dnp->dn_bonuslen = dn->dn_bonuslen;
+ }
+
+ dnp->dn_extra_slots = dn->dn_num_slots - 1;
+
+ ASSERT(dnp->dn_nlevels > 1 ||
+ BP_IS_HOLE(&dnp->dn_blkptr[0]) ||
+ BP_IS_EMBEDDED(&dnp->dn_blkptr[0]) ||
+ BP_GET_LSIZE(&dnp->dn_blkptr[0]) ==
+ dnp->dn_datablkszsec << SPA_MINBLOCKSHIFT);
+ ASSERT(dnp->dn_nlevels < 2 ||
+ BP_IS_HOLE(&dnp->dn_blkptr[0]) ||
+ BP_GET_LSIZE(&dnp->dn_blkptr[0]) == 1 << dnp->dn_indblkshift);
+
+ if (dn->dn_next_type[txgoff] != 0) {
+ dnp->dn_type = dn->dn_type;
+ dn->dn_next_type[txgoff] = 0;
+ }
+
+ if (dn->dn_next_blksz[txgoff] != 0) {
+ ASSERT(P2PHASE(dn->dn_next_blksz[txgoff],
+ SPA_MINBLOCKSIZE) == 0);
+ ASSERT(BP_IS_HOLE(&dnp->dn_blkptr[0]) ||
+ dn->dn_maxblkid == 0 || list_head(list) != NULL ||
+ dn->dn_next_blksz[txgoff] >> SPA_MINBLOCKSHIFT ==
+ dnp->dn_datablkszsec ||
+ !range_tree_is_empty(dn->dn_free_ranges[txgoff]));
+ dnp->dn_datablkszsec =
+ dn->dn_next_blksz[txgoff] >> SPA_MINBLOCKSHIFT;
+ dn->dn_next_blksz[txgoff] = 0;
+ }
+
+ if (dn->dn_next_bonuslen[txgoff] != 0) {
+ if (dn->dn_next_bonuslen[txgoff] == DN_ZERO_BONUSLEN)
+ dnp->dn_bonuslen = 0;
+ else
+ dnp->dn_bonuslen = dn->dn_next_bonuslen[txgoff];
+ ASSERT(dnp->dn_bonuslen <=
+ DN_SLOTS_TO_BONUSLEN(dnp->dn_extra_slots + 1));
+ dn->dn_next_bonuslen[txgoff] = 0;
+ }
+
+ if (dn->dn_next_bonustype[txgoff] != 0) {
+ ASSERT(DMU_OT_IS_VALID(dn->dn_next_bonustype[txgoff]));
+ dnp->dn_bonustype = dn->dn_next_bonustype[txgoff];
+ dn->dn_next_bonustype[txgoff] = 0;
+ }
+
+ boolean_t freeing_dnode = dn->dn_free_txg > 0 &&
+ dn->dn_free_txg <= tx->tx_txg;
+
+ /*
+ * Remove the spill block if we have been explicitly asked to
+ * remove it, or if the object is being removed.
+ */
+ if (dn->dn_rm_spillblk[txgoff] || freeing_dnode) {
+ if (dnp->dn_flags & DNODE_FLAG_SPILL_BLKPTR)
+ kill_spill = B_TRUE;
+ dn->dn_rm_spillblk[txgoff] = 0;
+ }
+
+ if (dn->dn_next_indblkshift[txgoff] != 0) {
+ ASSERT(dnp->dn_nlevels == 1);
+ dnp->dn_indblkshift = dn->dn_next_indblkshift[txgoff];
+ dn->dn_next_indblkshift[txgoff] = 0;
+ }
+
+ /*
+ * Just take the live (open-context) values for checksum and compress.
+ * Strictly speaking it's a future leak, but nothing bad happens if we
+ * start using the new checksum or compress algorithm a little early.
+ */
+ dnp->dn_checksum = dn->dn_checksum;
+ dnp->dn_compress = dn->dn_compress;
+
+ mutex_exit(&dn->dn_mtx);
+
+ if (kill_spill) {
+ free_blocks(dn, DN_SPILL_BLKPTR(dn->dn_phys), 1, tx);
+ mutex_enter(&dn->dn_mtx);
+ dnp->dn_flags &= ~DNODE_FLAG_SPILL_BLKPTR;
+ mutex_exit(&dn->dn_mtx);
+ }
+
+ /* process all the "freed" ranges in the file */
+ if (dn->dn_free_ranges[txgoff] != NULL) {
+ dnode_sync_free_range_arg_t dsfra;
+ dsfra.dsfra_dnode = dn;
+ dsfra.dsfra_tx = tx;
+ dsfra.dsfra_free_indirects = freeing_dnode;
+ if (freeing_dnode) {
+ ASSERT(range_tree_contains(dn->dn_free_ranges[txgoff],
+ 0, dn->dn_maxblkid + 1));
+ }
+ mutex_enter(&dn->dn_mtx);
+ range_tree_vacate(dn->dn_free_ranges[txgoff],
+ dnode_sync_free_range, &dsfra);
+ range_tree_destroy(dn->dn_free_ranges[txgoff]);
+ dn->dn_free_ranges[txgoff] = NULL;
+ mutex_exit(&dn->dn_mtx);
+ }
+
+ if (freeing_dnode) {
+ dn->dn_objset->os_freed_dnodes++;
+ dnode_sync_free(dn, tx);
+ return;
+ }
+
+ if (dn->dn_num_slots > DNODE_MIN_SLOTS) {
+ dsl_dataset_t *ds = dn->dn_objset->os_dsl_dataset;
+ mutex_enter(&ds->ds_lock);
+ ds->ds_feature_activation_needed[SPA_FEATURE_LARGE_DNODE] =
+ B_TRUE;
+ mutex_exit(&ds->ds_lock);
+ }
+
+ if (dn->dn_next_nlevels[txgoff]) {
+ dnode_increase_indirection(dn, tx);
+ dn->dn_next_nlevels[txgoff] = 0;
+ }
+
+ if (dn->dn_next_nblkptr[txgoff]) {
+ /* this should only happen on a realloc */
+ ASSERT(dn->dn_allocated_txg == tx->tx_txg);
+ if (dn->dn_next_nblkptr[txgoff] > dnp->dn_nblkptr) {
+ /* zero the new blkptrs we are gaining */
+ bzero(dnp->dn_blkptr + dnp->dn_nblkptr,
+ sizeof (blkptr_t) *
+ (dn->dn_next_nblkptr[txgoff] - dnp->dn_nblkptr));
+#ifdef ZFS_DEBUG
+ } else {
+ int i;
+ ASSERT(dn->dn_next_nblkptr[txgoff] < dnp->dn_nblkptr);
+ /* the blkptrs we are losing better be unallocated */
+ for (i = dn->dn_next_nblkptr[txgoff];
+ i < dnp->dn_nblkptr; i++)
+ ASSERT(BP_IS_HOLE(&dnp->dn_blkptr[i]));
+#endif
+ }
+ mutex_enter(&dn->dn_mtx);
+ dnp->dn_nblkptr = dn->dn_next_nblkptr[txgoff];
+ dn->dn_next_nblkptr[txgoff] = 0;
+ mutex_exit(&dn->dn_mtx);
+ }
+
+ dbuf_sync_list(list, dn->dn_phys->dn_nlevels - 1, tx);
+
+ if (!DMU_OBJECT_IS_SPECIAL(dn->dn_object)) {
+ ASSERT3P(list_head(list), ==, NULL);
+ dnode_rele(dn, (void *)(uintptr_t)tx->tx_txg);
+ }
+
+ /*
+ * Although we have dropped our reference to the dnode, it
+	 * can't be evicted until it's written, and we haven't yet
+ * initiated the IO for the dnode's dbuf.
+ */
+}
diff --git a/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/dsl_bookmark.c b/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/dsl_bookmark.c
new file mode 100644
index 000000000000..cae6d00ca2ce
--- /dev/null
+++ b/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/dsl_bookmark.c
@@ -0,0 +1,566 @@
+/*
+ * CDDL HEADER START
+ *
+ * This file and its contents are supplied under the terms of the
+ * Common Development and Distribution License ("CDDL"), version 1.0.
+ * You may only use this file in accordance with the terms of version
+ * 1.0 of the CDDL.
+ *
+ * A full copy of the text of the CDDL should have accompanied this
+ * source. A copy of the CDDL is also available via the Internet at
+ * http://www.illumos.org/license/CDDL.
+ *
+ * CDDL HEADER END
+ */
+
+/*
+ * Copyright (c) 2013, 2014 by Delphix. All rights reserved.
+ * Copyright 2017 Nexenta Systems, Inc.
+ */
+
+#include <sys/zfs_context.h>
+#include <sys/dsl_dataset.h>
+#include <sys/dsl_dir.h>
+#include <sys/dsl_prop.h>
+#include <sys/dsl_synctask.h>
+#include <sys/dmu_impl.h>
+#include <sys/dmu_tx.h>
+#include <sys/arc.h>
+#include <sys/zap.h>
+#include <sys/zfeature.h>
+#include <sys/spa.h>
+#include <sys/dsl_bookmark.h>
+#include <zfs_namecheck.h>
+
+static int
+dsl_bookmark_hold_ds(dsl_pool_t *dp, const char *fullname,
+ dsl_dataset_t **dsp, void *tag, char **shortnamep)
+{
+ char buf[ZFS_MAX_DATASET_NAME_LEN];
+ char *hashp;
+
+ if (strlen(fullname) >= ZFS_MAX_DATASET_NAME_LEN)
+ return (SET_ERROR(ENAMETOOLONG));
+ hashp = strchr(fullname, '#');
+ if (hashp == NULL)
+ return (SET_ERROR(EINVAL));
+
+ *shortnamep = hashp + 1;
+ if (zfs_component_namecheck(*shortnamep, NULL, NULL))
+ return (SET_ERROR(EINVAL));
+ (void) strlcpy(buf, fullname, hashp - fullname + 1);
+ return (dsl_dataset_hold(dp, buf, tag, dsp));
+}
+
+/*
+ * Returns ESRCH if bookmark is not found.
+ */
+static int
+dsl_dataset_bmark_lookup(dsl_dataset_t *ds, const char *shortname,
+ zfs_bookmark_phys_t *bmark_phys)
+{
+ objset_t *mos = ds->ds_dir->dd_pool->dp_meta_objset;
+ uint64_t bmark_zapobj = ds->ds_bookmarks;
+ matchtype_t mt = 0;
+ int err;
+
+ if (bmark_zapobj == 0)
+ return (SET_ERROR(ESRCH));
+
+ if (dsl_dataset_phys(ds)->ds_flags & DS_FLAG_CI_DATASET)
+ mt = MT_NORMALIZE;
+
+ err = zap_lookup_norm(mos, bmark_zapobj, shortname, sizeof (uint64_t),
+ sizeof (*bmark_phys) / sizeof (uint64_t), bmark_phys, mt,
+ NULL, 0, NULL);
+
+ return (err == ENOENT ? ESRCH : err);
+}
+
+/*
+ * If later_ds is non-NULL, this will return EXDEV if the specified bookmark
+ * does not represent an earlier point in later_ds's timeline.
+ *
+ * Returns ENOENT if the dataset containing the bookmark does not exist.
+ * Returns ESRCH if the dataset exists but the bookmark was not found in it.
+ */
+int
+dsl_bookmark_lookup(dsl_pool_t *dp, const char *fullname,
+ dsl_dataset_t *later_ds, zfs_bookmark_phys_t *bmp)
+{
+ char *shortname;
+ dsl_dataset_t *ds;
+ int error;
+
+ error = dsl_bookmark_hold_ds(dp, fullname, &ds, FTAG, &shortname);
+ if (error != 0)
+ return (error);
+
+ error = dsl_dataset_bmark_lookup(ds, shortname, bmp);
+ if (error == 0 && later_ds != NULL) {
+ if (!dsl_dataset_is_before(later_ds, ds, bmp->zbm_creation_txg))
+ error = SET_ERROR(EXDEV);
+ }
+ dsl_dataset_rele(ds, FTAG);
+ return (error);
+}
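+
+/*
+ * Illustrative lookup (editor's sketch; the pool "dp" must already be
+ * held, and the bookmark name is hypothetical):
+ *
+ *	zfs_bookmark_phys_t bmp;
+ *	int err = dsl_bookmark_lookup(dp, "pool/fs#mark", NULL, &bmp);
+ *	// ENOENT: the dataset "pool/fs" does not exist
+ *	// ESRCH: the dataset exists but has no such bookmark
+ *	// 0: bmp now holds the guid, creation txg, and creation time
+ */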
+
+typedef struct dsl_bookmark_create_arg {
+ nvlist_t *dbca_bmarks;
+ nvlist_t *dbca_errors;
+} dsl_bookmark_create_arg_t;
+
+static int
+dsl_bookmark_create_check_impl(dsl_dataset_t *snapds, const char *bookmark_name,
+ dmu_tx_t *tx)
+{
+ dsl_pool_t *dp = dmu_tx_pool(tx);
+ dsl_dataset_t *bmark_fs;
+ char *shortname;
+ int error;
+ zfs_bookmark_phys_t bmark_phys;
+
+ if (!snapds->ds_is_snapshot)
+ return (SET_ERROR(EINVAL));
+
+ error = dsl_bookmark_hold_ds(dp, bookmark_name,
+ &bmark_fs, FTAG, &shortname);
+ if (error != 0)
+ return (error);
+
+ if (!dsl_dataset_is_before(bmark_fs, snapds, 0)) {
+ dsl_dataset_rele(bmark_fs, FTAG);
+ return (SET_ERROR(EINVAL));
+ }
+
+ error = dsl_dataset_bmark_lookup(bmark_fs, shortname,
+ &bmark_phys);
+ dsl_dataset_rele(bmark_fs, FTAG);
+ if (error == 0)
+ return (SET_ERROR(EEXIST));
+ if (error == ESRCH)
+ return (0);
+ return (error);
+}
+
+static int
+dsl_bookmark_create_check(void *arg, dmu_tx_t *tx)
+{
+ dsl_bookmark_create_arg_t *dbca = arg;
+ dsl_pool_t *dp = dmu_tx_pool(tx);
+ int rv = 0;
+
+ if (!spa_feature_is_enabled(dp->dp_spa, SPA_FEATURE_BOOKMARKS))
+ return (SET_ERROR(ENOTSUP));
+
+ for (nvpair_t *pair = nvlist_next_nvpair(dbca->dbca_bmarks, NULL);
+ pair != NULL; pair = nvlist_next_nvpair(dbca->dbca_bmarks, pair)) {
+ dsl_dataset_t *snapds;
+ int error;
+
+ /* note: validity of nvlist checked by ioctl layer */
+ error = dsl_dataset_hold(dp, fnvpair_value_string(pair),
+ FTAG, &snapds);
+ if (error == 0) {
+ error = dsl_bookmark_create_check_impl(snapds,
+ nvpair_name(pair), tx);
+ dsl_dataset_rele(snapds, FTAG);
+ }
+ if (error != 0) {
+ fnvlist_add_int32(dbca->dbca_errors,
+ nvpair_name(pair), error);
+ rv = error;
+ }
+ }
+
+ return (rv);
+}
+
+static void
+dsl_bookmark_create_sync(void *arg, dmu_tx_t *tx)
+{
+ dsl_bookmark_create_arg_t *dbca = arg;
+ dsl_pool_t *dp = dmu_tx_pool(tx);
+ objset_t *mos = dp->dp_meta_objset;
+
+ ASSERT(spa_feature_is_enabled(dp->dp_spa, SPA_FEATURE_BOOKMARKS));
+
+ for (nvpair_t *pair = nvlist_next_nvpair(dbca->dbca_bmarks, NULL);
+ pair != NULL; pair = nvlist_next_nvpair(dbca->dbca_bmarks, pair)) {
+ dsl_dataset_t *snapds, *bmark_fs;
+ zfs_bookmark_phys_t bmark_phys;
+ char *shortname;
+
+ VERIFY0(dsl_dataset_hold(dp, fnvpair_value_string(pair),
+ FTAG, &snapds));
+ VERIFY0(dsl_bookmark_hold_ds(dp, nvpair_name(pair),
+ &bmark_fs, FTAG, &shortname));
+ if (bmark_fs->ds_bookmarks == 0) {
+ bmark_fs->ds_bookmarks =
+ zap_create_norm(mos, U8_TEXTPREP_TOUPPER,
+ DMU_OTN_ZAP_METADATA, DMU_OT_NONE, 0, tx);
+ spa_feature_incr(dp->dp_spa, SPA_FEATURE_BOOKMARKS, tx);
+
+ dsl_dataset_zapify(bmark_fs, tx);
+ VERIFY0(zap_add(mos, bmark_fs->ds_object,
+ DS_FIELD_BOOKMARK_NAMES,
+ sizeof (bmark_fs->ds_bookmarks), 1,
+ &bmark_fs->ds_bookmarks, tx));
+ }
+
+ bmark_phys.zbm_guid = dsl_dataset_phys(snapds)->ds_guid;
+ bmark_phys.zbm_creation_txg =
+ dsl_dataset_phys(snapds)->ds_creation_txg;
+ bmark_phys.zbm_creation_time =
+ dsl_dataset_phys(snapds)->ds_creation_time;
+
+ VERIFY0(zap_add(mos, bmark_fs->ds_bookmarks,
+ shortname, sizeof (uint64_t),
+ sizeof (zfs_bookmark_phys_t) / sizeof (uint64_t),
+ &bmark_phys, tx));
+
+ spa_history_log_internal_ds(bmark_fs, "bookmark", tx,
+ "name=%s creation_txg=%llu target_snap=%llu",
+ shortname,
+ (longlong_t)bmark_phys.zbm_creation_txg,
+ (longlong_t)snapds->ds_object);
+
+ dsl_dataset_rele(bmark_fs, FTAG);
+ dsl_dataset_rele(snapds, FTAG);
+ }
+}
+
+/*
+ * The bookmarks must all be in the same pool.
+ */
+int
+dsl_bookmark_create(nvlist_t *bmarks, nvlist_t *errors)
+{
+ nvpair_t *pair;
+ dsl_bookmark_create_arg_t dbca;
+
+ pair = nvlist_next_nvpair(bmarks, NULL);
+ if (pair == NULL)
+ return (0);
+
+ dbca.dbca_bmarks = bmarks;
+ dbca.dbca_errors = errors;
+
+ return (dsl_sync_task(nvpair_name(pair), dsl_bookmark_create_check,
+ dsl_bookmark_create_sync, &dbca,
+ fnvlist_num_pairs(bmarks), ZFS_SPACE_CHECK_NORMAL));
+}
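+
+/*
+ * Illustrative caller (editor's sketch mirroring what the ioctl layer
+ * does; the dataset and bookmark names are hypothetical).  The input
+ * nvlist maps each new bookmark's full name to its target snapshot:
+ *
+ *	nvlist_t *bmarks = fnvlist_alloc();
+ *	nvlist_t *errors = fnvlist_alloc();
+ *
+ *	fnvlist_add_string(bmarks, "pool/fs#mark", "pool/fs@snap");
+ *	int err = dsl_bookmark_create(bmarks, errors);
+ *	// on failure, "errors" maps bookmark names to errno values
+ *	fnvlist_free(bmarks);
+ *	fnvlist_free(errors);
+ */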
+
+int
+dsl_get_bookmarks_impl(dsl_dataset_t *ds, nvlist_t *props, nvlist_t *outnvl)
+{
+ int err = 0;
+ zap_cursor_t zc;
+ zap_attribute_t attr;
+ dsl_pool_t *dp = ds->ds_dir->dd_pool;
+
+ uint64_t bmark_zapobj = ds->ds_bookmarks;
+ if (bmark_zapobj == 0)
+ return (0);
+
+ for (zap_cursor_init(&zc, dp->dp_meta_objset, bmark_zapobj);
+ zap_cursor_retrieve(&zc, &attr) == 0;
+ zap_cursor_advance(&zc)) {
+ char *bmark_name = attr.za_name;
+ zfs_bookmark_phys_t bmark_phys;
+
+ err = dsl_dataset_bmark_lookup(ds, bmark_name, &bmark_phys);
+ ASSERT3U(err, !=, ENOENT);
+ if (err != 0)
+ break;
+
+ nvlist_t *out_props = fnvlist_alloc();
+ if (nvlist_exists(props,
+ zfs_prop_to_name(ZFS_PROP_GUID))) {
+ dsl_prop_nvlist_add_uint64(out_props,
+ ZFS_PROP_GUID, bmark_phys.zbm_guid);
+ }
+ if (nvlist_exists(props,
+ zfs_prop_to_name(ZFS_PROP_CREATETXG))) {
+ dsl_prop_nvlist_add_uint64(out_props,
+ ZFS_PROP_CREATETXG, bmark_phys.zbm_creation_txg);
+ }
+ if (nvlist_exists(props,
+ zfs_prop_to_name(ZFS_PROP_CREATION))) {
+ dsl_prop_nvlist_add_uint64(out_props,
+ ZFS_PROP_CREATION, bmark_phys.zbm_creation_time);
+ }
+
+ fnvlist_add_nvlist(outnvl, bmark_name, out_props);
+ fnvlist_free(out_props);
+ }
+ zap_cursor_fini(&zc);
+ return (err);
+}
+
+/*
+ * Retrieve the bookmarks that exist in the specified dataset, and the
+ * requested properties of each bookmark.
+ *
+ * The "props" nvlist specifies which properties are requested.
+ * See lzc_get_bookmarks() for the list of valid properties.
+ */
+int
+dsl_get_bookmarks(const char *dsname, nvlist_t *props, nvlist_t *outnvl)
+{
+ dsl_pool_t *dp;
+ dsl_dataset_t *ds;
+ int err;
+
+ err = dsl_pool_hold(dsname, FTAG, &dp);
+ if (err != 0)
+ return (err);
+ err = dsl_dataset_hold(dp, dsname, FTAG, &ds);
+ if (err != 0) {
+ dsl_pool_rele(dp, FTAG);
+ return (err);
+ }
+
+ err = dsl_get_bookmarks_impl(ds, props, outnvl);
+
+ dsl_dataset_rele(ds, FTAG);
+ dsl_pool_rele(dp, FTAG);
+ return (err);
+}
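+
+/*
+ * Illustrative caller (editor's sketch; the property names come from
+ * the checks in dsl_get_bookmarks_impl() above):
+ *
+ *	nvlist_t *props = fnvlist_alloc();
+ *	nvlist_t *out = fnvlist_alloc();
+ *
+ *	fnvlist_add_boolean(props, zfs_prop_to_name(ZFS_PROP_GUID));
+ *	fnvlist_add_boolean(props, zfs_prop_to_name(ZFS_PROP_CREATETXG));
+ *	int err = dsl_get_bookmarks("pool/fs", props, out);
+ *	// on success, "out" maps each bookmark name to an nvlist of the
+ *	// requested property values
+ */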
+
+typedef struct dsl_bookmark_destroy_arg {
+ nvlist_t *dbda_bmarks;
+ nvlist_t *dbda_success;
+ nvlist_t *dbda_errors;
+} dsl_bookmark_destroy_arg_t;
+
+static int
+dsl_dataset_bookmark_remove(dsl_dataset_t *ds, const char *name, dmu_tx_t *tx)
+{
+ objset_t *mos = ds->ds_dir->dd_pool->dp_meta_objset;
+ uint64_t bmark_zapobj = ds->ds_bookmarks;
+ matchtype_t mt = 0;
+
+ if (dsl_dataset_phys(ds)->ds_flags & DS_FLAG_CI_DATASET)
+ mt = MT_NORMALIZE;
+
+ return (zap_remove_norm(mos, bmark_zapobj, name, mt, tx));
+}
+
+static int
+dsl_bookmark_destroy_check(void *arg, dmu_tx_t *tx)
+{
+ dsl_bookmark_destroy_arg_t *dbda = arg;
+ dsl_pool_t *dp = dmu_tx_pool(tx);
+ int rv = 0;
+
+ ASSERT(nvlist_empty(dbda->dbda_success));
+ ASSERT(nvlist_empty(dbda->dbda_errors));
+
+ if (!spa_feature_is_enabled(dp->dp_spa, SPA_FEATURE_BOOKMARKS))
+ return (0);
+
+ for (nvpair_t *pair = nvlist_next_nvpair(dbda->dbda_bmarks, NULL);
+ pair != NULL; pair = nvlist_next_nvpair(dbda->dbda_bmarks, pair)) {
+ const char *fullname = nvpair_name(pair);
+ dsl_dataset_t *ds;
+ zfs_bookmark_phys_t bm;
+ int error;
+ char *shortname;
+
+ error = dsl_bookmark_hold_ds(dp, fullname, &ds,
+ FTAG, &shortname);
+ if (error == ENOENT) {
+ /* ignore it; the bookmark is "already destroyed" */
+ continue;
+ }
+ if (error == 0) {
+ error = dsl_dataset_bmark_lookup(ds, shortname, &bm);
+ dsl_dataset_rele(ds, FTAG);
+ if (error == ESRCH) {
+ /*
+ * ignore it; the bookmark is
+ * "already destroyed"
+ */
+ continue;
+ }
+ }
+ if (error == 0) {
+ if (dmu_tx_is_syncing(tx)) {
+ fnvlist_add_boolean(dbda->dbda_success,
+ fullname);
+ }
+ } else {
+ fnvlist_add_int32(dbda->dbda_errors, fullname, error);
+ rv = error;
+ }
+ }
+ return (rv);
+}
+
+static void
+dsl_bookmark_destroy_sync(void *arg, dmu_tx_t *tx)
+{
+ dsl_bookmark_destroy_arg_t *dbda = arg;
+ dsl_pool_t *dp = dmu_tx_pool(tx);
+ objset_t *mos = dp->dp_meta_objset;
+
+ for (nvpair_t *pair = nvlist_next_nvpair(dbda->dbda_success, NULL);
+ pair != NULL; pair = nvlist_next_nvpair(dbda->dbda_success, pair)) {
+ dsl_dataset_t *ds;
+ char *shortname;
+ uint64_t zap_cnt;
+
+ VERIFY0(dsl_bookmark_hold_ds(dp, nvpair_name(pair),
+ &ds, FTAG, &shortname));
+ VERIFY0(dsl_dataset_bookmark_remove(ds, shortname, tx));
+
+ /*
+ * If all of this dataset's bookmarks have been destroyed,
+ * free the zap object and decrement the feature's use count.
+ */
+ VERIFY0(zap_count(mos, ds->ds_bookmarks,
+ &zap_cnt));
+ if (zap_cnt == 0) {
+ dmu_buf_will_dirty(ds->ds_dbuf, tx);
+ VERIFY0(zap_destroy(mos, ds->ds_bookmarks, tx));
+ ds->ds_bookmarks = 0;
+ spa_feature_decr(dp->dp_spa, SPA_FEATURE_BOOKMARKS, tx);
+ VERIFY0(zap_remove(mos, ds->ds_object,
+ DS_FIELD_BOOKMARK_NAMES, tx));
+ }
+
+ spa_history_log_internal_ds(ds, "remove bookmark", tx,
+ "name=%s", shortname);
+
+ dsl_dataset_rele(ds, FTAG);
+ }
+}
+
+/*
+ * The bookmarks must all be in the same pool.
+ */
+int
+dsl_bookmark_destroy(nvlist_t *bmarks, nvlist_t *errors)
+{
+ int rv;
+ dsl_bookmark_destroy_arg_t dbda;
+ nvpair_t *pair = nvlist_next_nvpair(bmarks, NULL);
+ if (pair == NULL)
+ return (0);
+
+ dbda.dbda_bmarks = bmarks;
+ dbda.dbda_errors = errors;
+ dbda.dbda_success = fnvlist_alloc();
+
+ rv = dsl_sync_task(nvpair_name(pair), dsl_bookmark_destroy_check,
+ dsl_bookmark_destroy_sync, &dbda, fnvlist_num_pairs(bmarks),
+ ZFS_SPACE_CHECK_RESERVED);
+ fnvlist_free(dbda.dbda_success);
+ return (rv);
+}
+
+typedef struct dsl_bookmark_rename_arg {
+ const char *dbra_fsname;
+ const char *dbra_oldname;
+ const char *dbra_newname;
+} dsl_bookmark_rename_arg_t;
+
+static int
+dsl_bookmark_rename_check(void *arg, dmu_tx_t *tx)
+{
+ dsl_bookmark_rename_arg_t *dbra = arg;
+ dsl_pool_t *dp = dmu_tx_pool(tx);
+ dsl_dataset_t *ds;
+ zfs_bookmark_phys_t bmark_phys;
+ int error;
+
+ if (!spa_feature_is_enabled(dp->dp_spa, SPA_FEATURE_BOOKMARKS))
+ return (SET_ERROR(ENOTSUP));
+
+ /* Check validity and the full length of the new bookmark name. */
+ if (zfs_component_namecheck(dbra->dbra_newname, NULL, NULL))
+ return (SET_ERROR(EINVAL));
+ if (strlen(dbra->dbra_fsname) + strlen(dbra->dbra_newname) + 1 >=
+ ZFS_MAX_DATASET_NAME_LEN)
+ return (SET_ERROR(ENAMETOOLONG));
+
+ error = dsl_dataset_hold(dp, dbra->dbra_fsname, FTAG, &ds);
+ if (error != 0)
+ return (error);
+ if (ds->ds_is_snapshot) {
+ dsl_dataset_rele(ds, FTAG);
+ return (SET_ERROR(EINVAL));
+ }
+ error = dsl_dataset_bmark_lookup(ds, dbra->dbra_oldname, &bmark_phys);
+ if (error != 0) {
+ dsl_dataset_rele(ds, FTAG);
+ return (error);
+ }
+
+ error = dsl_dataset_bmark_lookup(ds, dbra->dbra_newname, &bmark_phys);
+ dsl_dataset_rele(ds, FTAG);
+ if (error == 0)
+ return (SET_ERROR(EEXIST));
+ if (error != ESRCH)
+ return (error);
+ return (0);
+}
+
+static void
+dsl_bookmark_rename_sync(void *arg, dmu_tx_t *tx)
+{
+ zfs_bookmark_phys_t bmark_phys;
+ dsl_bookmark_rename_arg_t *dbra = arg;
+ dsl_pool_t *dp = dmu_tx_pool(tx);
+ objset_t *mos;
+ dsl_dataset_t *ds;
+ uint64_t bmark_zapobj;
+ uint64_t int_size, num_ints;
+ matchtype_t mt = 0;
+ int error;
+
+ ASSERT(spa_feature_is_enabled(dp->dp_spa, SPA_FEATURE_BOOKMARKS));
+ VERIFY0(dsl_dataset_hold(dp, dbra->dbra_fsname, FTAG, &ds));
+
+ mos = ds->ds_dir->dd_pool->dp_meta_objset;
+ bmark_zapobj = ds->ds_bookmarks;
+
+ if (dsl_dataset_phys(ds)->ds_flags & DS_FLAG_CI_DATASET)
+ mt = MT_NORMALIZE;
+
+ VERIFY0(zap_length(mos, bmark_zapobj, dbra->dbra_oldname,
+ &int_size, &num_ints));
+ ASSERT3U(int_size, ==, sizeof (uint64_t));
+ VERIFY0(zap_lookup_norm(mos, bmark_zapobj, dbra->dbra_oldname, int_size,
+ num_ints, &bmark_phys, mt, NULL, 0, NULL));
+ VERIFY0(zap_remove_norm(mos, bmark_zapobj, dbra->dbra_oldname, mt, tx));
+
+ VERIFY0(zap_add(mos, bmark_zapobj, dbra->dbra_newname, int_size,
+ num_ints, &bmark_phys, tx));
+
+ spa_history_log_internal_ds(ds, "rename bookmark", tx,
+ "#%s -> #%s creation_txg=%llu",
+ dbra->dbra_oldname, dbra->dbra_newname,
+ (longlong_t)bmark_phys.zbm_creation_txg);
+
+ dsl_dataset_rele(ds, FTAG);
+}
+
+/*
+ * The bookmarks must all be in the same pool.
+ */
+int
+dsl_bookmark_rename(const char *fsname, const char *oldbmark,
+ const char *newbmark)
+{
+ dsl_bookmark_rename_arg_t dbra;
+
+ dbra.dbra_fsname = fsname;
+ dbra.dbra_oldname = oldbmark;
+ dbra.dbra_newname = newbmark;
+
+ return (dsl_sync_task(fsname, dsl_bookmark_rename_check,
+ dsl_bookmark_rename_sync, &dbra, 1, ZFS_SPACE_CHECK_NORMAL));
+}
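+
+/*
+ * Illustrative rename (editor's sketch; names hypothetical).  Note that
+ * the bookmark arguments are short names, without the '#' separator:
+ *
+ *	int err = dsl_bookmark_rename("pool/fs", "oldmark", "newmark");
+ *	// EEXIST if "newmark" already exists; ESRCH if "oldmark" does not
+ */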
+
diff --git a/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/dsl_dataset.c b/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/dsl_dataset.c
new file mode 100644
index 000000000000..f226c0244004
--- /dev/null
+++ b/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/dsl_dataset.c
@@ -0,0 +1,4252 @@
+/*
+ * CDDL HEADER START
+ *
+ * The contents of this file are subject to the terms of the
+ * Common Development and Distribution License (the "License").
+ * You may not use this file except in compliance with the License.
+ *
+ * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
+ * or http://www.opensolaris.org/os/licensing.
+ * See the License for the specific language governing permissions
+ * and limitations under the License.
+ *
+ * When distributing Covered Code, include this CDDL HEADER in each
+ * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
+ * If applicable, add the following below this CDDL HEADER, with the
+ * fields enclosed by brackets "[]" replaced with your own identifying
+ * information: Portions Copyright [yyyy] [name of copyright owner]
+ *
+ * CDDL HEADER END
+ */
+
+/*
+ * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved.
+ * Portions Copyright (c) 2011 Martin Matuska <mm@FreeBSD.org>
+ * Copyright (c) 2011, 2017 by Delphix. All rights reserved.
+ * Copyright (c) 2014, Joyent, Inc. All rights reserved.
+ * Copyright (c) 2014 RackTop Systems.
+ * Copyright (c) 2014 Spectra Logic Corporation, All rights reserved.
+ * Copyright (c) 2014 Integros [integros.com]
+ * Copyright 2016, OmniTI Computer Consulting, Inc. All rights reserved.
+ * Copyright 2017 Nexenta Systems, Inc.
+ */
+
+#include <sys/dmu_objset.h>
+#include <sys/dsl_dataset.h>
+#include <sys/dsl_dir.h>
+#include <sys/dsl_prop.h>
+#include <sys/dsl_synctask.h>
+#include <sys/dmu_traverse.h>
+#include <sys/dmu_impl.h>
+#include <sys/dmu_send.h>
+#include <sys/dmu_tx.h>
+#include <sys/arc.h>
+#include <sys/zio.h>
+#include <sys/zap.h>
+#include <sys/zfeature.h>
+#include <sys/unique.h>
+#include <sys/zfs_context.h>
+#include <sys/zfs_ioctl.h>
+#include <sys/spa.h>
+#include <sys/spa_impl.h>
+#include <sys/vdev.h>
+#include <sys/zfs_znode.h>
+#include <sys/zfs_onexit.h>
+#include <sys/zvol.h>
+#include <sys/dsl_scan.h>
+#include <sys/dsl_deadlist.h>
+#include <sys/dsl_destroy.h>
+#include <sys/dsl_userhold.h>
+#include <sys/dsl_bookmark.h>
+#include <sys/dmu_send.h>
+#include <sys/zio_checksum.h>
+#include <sys/zio_compress.h>
+#include <zfs_fletcher.h>
+
+SYSCTL_DECL(_vfs_zfs);
+
+/*
+ * The SPA supports block sizes up to 16MB. However, very large blocks
+ * can have an impact on i/o latency (e.g. tying up a spinning disk for
+ * ~300ms), and also potentially on the memory allocator. Therefore,
+ * we do not allow the recordsize to be set larger than zfs_max_recordsize
+ * (default 1MB). Larger blocks can be created by changing this tunable,
+ * and pools with larger blocks can always be imported and used, regardless
+ * of this setting.
+ */
+int zfs_max_recordsize = 1 * 1024 * 1024;
+SYSCTL_INT(_vfs_zfs, OID_AUTO, max_recordsize, CTLFLAG_RWTUN,
+ &zfs_max_recordsize, 0,
+ "Maximum block size. Expect dragons when tuning this.");
+
+#define SWITCH64(x, y) \
+ { \
+ uint64_t __tmp = (x); \
+ (x) = (y); \
+ (y) = __tmp; \
+ }
+
+#define DS_REF_MAX (1ULL << 62)
+
+extern inline dsl_dataset_phys_t *dsl_dataset_phys(dsl_dataset_t *ds);
+
+static void dsl_dataset_set_remap_deadlist_object(dsl_dataset_t *ds,
+ uint64_t obj, dmu_tx_t *tx);
+static void dsl_dataset_unset_remap_deadlist_object(dsl_dataset_t *ds,
+ dmu_tx_t *tx);
+
+extern int spa_asize_inflation;
+
+static zil_header_t zero_zil;
+
+/*
+ * Figure out how much of this delta should be propagated to the dsl_dir
+ * layer. If there's a refreservation, that space has already been
+ * partially accounted for in our ancestors.
+ */
+static int64_t
+parent_delta(dsl_dataset_t *ds, int64_t delta)
+{
+ dsl_dataset_phys_t *ds_phys;
+ uint64_t old_bytes, new_bytes;
+
+ if (ds->ds_reserved == 0)
+ return (delta);
+
+ ds_phys = dsl_dataset_phys(ds);
+ old_bytes = MAX(ds_phys->ds_unique_bytes, ds->ds_reserved);
+ new_bytes = MAX(ds_phys->ds_unique_bytes + delta, ds->ds_reserved);
+
+ ASSERT3U(ABS((int64_t)(new_bytes - old_bytes)), <=, ABS(delta));
+ return (new_bytes - old_bytes);
+}
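+
+/*
+ * For example (editor's illustration): with ds_reserved = 10 and
+ * ds_unique_bytes = 8, a delta of +1 gives old_bytes = MAX(8, 10) = 10
+ * and new_bytes = MAX(9, 10) = 10, so nothing is propagated; that space
+ * was already charged to our ancestors by the refreservation.  Once
+ * unique_bytes exceeds the reservation, deltas pass through unchanged.
+ */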
+
+void
+dsl_dataset_block_born(dsl_dataset_t *ds, const blkptr_t *bp, dmu_tx_t *tx)
+{
+ int used = bp_get_dsize_sync(tx->tx_pool->dp_spa, bp);
+ int compressed = BP_GET_PSIZE(bp);
+ int uncompressed = BP_GET_UCSIZE(bp);
+ int64_t delta;
+
+ dprintf_bp(bp, "ds=%p", ds);
+
+ ASSERT(dmu_tx_is_syncing(tx));
+ /* It could have been compressed away to nothing */
+ if (BP_IS_HOLE(bp))
+ return;
+ ASSERT(BP_GET_TYPE(bp) != DMU_OT_NONE);
+ ASSERT(DMU_OT_IS_VALID(BP_GET_TYPE(bp)));
+ if (ds == NULL) {
+ dsl_pool_mos_diduse_space(tx->tx_pool,
+ used, compressed, uncompressed);
+ return;
+ }
+
+ ASSERT3U(bp->blk_birth, >, dsl_dataset_phys(ds)->ds_prev_snap_txg);
+ dmu_buf_will_dirty(ds->ds_dbuf, tx);
+ mutex_enter(&ds->ds_lock);
+ delta = parent_delta(ds, used);
+ dsl_dataset_phys(ds)->ds_referenced_bytes += used;
+ dsl_dataset_phys(ds)->ds_compressed_bytes += compressed;
+ dsl_dataset_phys(ds)->ds_uncompressed_bytes += uncompressed;
+ dsl_dataset_phys(ds)->ds_unique_bytes += used;
+
+ if (BP_GET_LSIZE(bp) > SPA_OLD_MAXBLOCKSIZE) {
+ ds->ds_feature_activation_needed[SPA_FEATURE_LARGE_BLOCKS] =
+ B_TRUE;
+ }
+
+ spa_feature_t f = zio_checksum_to_feature(BP_GET_CHECKSUM(bp));
+ if (f != SPA_FEATURE_NONE)
+ ds->ds_feature_activation_needed[f] = B_TRUE;
+
+ mutex_exit(&ds->ds_lock);
+ dsl_dir_diduse_space(ds->ds_dir, DD_USED_HEAD, delta,
+ compressed, uncompressed, tx);
+ dsl_dir_transfer_space(ds->ds_dir, used - delta,
+ DD_USED_REFRSRV, DD_USED_HEAD, NULL);
+}
+
+/*
+ * Called when the specified segment has been remapped, and is thus no
+ * longer referenced in the head dataset. The vdev must be indirect.
+ *
+ * If the segment is referenced by a snapshot, put it on the remap deadlist.
+ * Otherwise, add this segment to the obsolete spacemap.
+ */
+void
+dsl_dataset_block_remapped(dsl_dataset_t *ds, uint64_t vdev, uint64_t offset,
+ uint64_t size, uint64_t birth, dmu_tx_t *tx)
+{
+ spa_t *spa = ds->ds_dir->dd_pool->dp_spa;
+
+ ASSERT(dmu_tx_is_syncing(tx));
+ ASSERT(birth <= tx->tx_txg);
+ ASSERT(!ds->ds_is_snapshot);
+
+ if (birth > dsl_dataset_phys(ds)->ds_prev_snap_txg) {
+ spa_vdev_indirect_mark_obsolete(spa, vdev, offset, size, tx);
+ } else {
+ blkptr_t fakebp;
+ dva_t *dva = &fakebp.blk_dva[0];
+
+ ASSERT(ds != NULL);
+
+ mutex_enter(&ds->ds_remap_deadlist_lock);
+ if (!dsl_dataset_remap_deadlist_exists(ds)) {
+ dsl_dataset_create_remap_deadlist(ds, tx);
+ }
+ mutex_exit(&ds->ds_remap_deadlist_lock);
+
+ BP_ZERO(&fakebp);
+ fakebp.blk_birth = birth;
+ DVA_SET_VDEV(dva, vdev);
+ DVA_SET_OFFSET(dva, offset);
+ DVA_SET_ASIZE(dva, size);
+
+ dsl_deadlist_insert(&ds->ds_remap_deadlist, &fakebp, tx);
+ }
+}
+
+int
+dsl_dataset_block_kill(dsl_dataset_t *ds, const blkptr_t *bp, dmu_tx_t *tx,
+ boolean_t async)
+{
+ spa_t *spa = dmu_tx_pool(tx)->dp_spa;
+
+ int used = bp_get_dsize_sync(spa, bp);
+ int compressed = BP_GET_PSIZE(bp);
+ int uncompressed = BP_GET_UCSIZE(bp);
+
+ if (BP_IS_HOLE(bp))
+ return (0);
+
+ ASSERT(dmu_tx_is_syncing(tx));
+ ASSERT(bp->blk_birth <= tx->tx_txg);
+
+ if (ds == NULL) {
+ dsl_free(tx->tx_pool, tx->tx_txg, bp);
+ dsl_pool_mos_diduse_space(tx->tx_pool,
+ -used, -compressed, -uncompressed);
+ return (used);
+ }
+ ASSERT3P(tx->tx_pool, ==, ds->ds_dir->dd_pool);
+
+ ASSERT(!ds->ds_is_snapshot);
+ dmu_buf_will_dirty(ds->ds_dbuf, tx);
+
+ if (bp->blk_birth > dsl_dataset_phys(ds)->ds_prev_snap_txg) {
+ int64_t delta;
+
+ dprintf_bp(bp, "freeing ds=%llu", ds->ds_object);
+ dsl_free(tx->tx_pool, tx->tx_txg, bp);
+
+ mutex_enter(&ds->ds_lock);
+ ASSERT(dsl_dataset_phys(ds)->ds_unique_bytes >= used ||
+ !DS_UNIQUE_IS_ACCURATE(ds));
+ delta = parent_delta(ds, -used);
+ dsl_dataset_phys(ds)->ds_unique_bytes -= used;
+ mutex_exit(&ds->ds_lock);
+ dsl_dir_diduse_space(ds->ds_dir, DD_USED_HEAD,
+ delta, -compressed, -uncompressed, tx);
+ dsl_dir_transfer_space(ds->ds_dir, -used - delta,
+ DD_USED_REFRSRV, DD_USED_HEAD, NULL);
+ } else {
+ dprintf_bp(bp, "putting on dead list: %s", "");
+ if (async) {
+ /*
+ * We are here as part of zio's write done callback,
+ * which means we're a zio interrupt thread. We can't
+ * call dsl_deadlist_insert() now because it may block
+ * waiting for I/O. Instead, put bp on the deferred
+ * queue and let dsl_pool_sync() finish the job.
+ */
+ bplist_append(&ds->ds_pending_deadlist, bp);
+ } else {
+ dsl_deadlist_insert(&ds->ds_deadlist, bp, tx);
+ }
+ ASSERT3U(ds->ds_prev->ds_object, ==,
+ dsl_dataset_phys(ds)->ds_prev_snap_obj);
+ ASSERT(dsl_dataset_phys(ds->ds_prev)->ds_num_children > 0);
+ /* if (bp->blk_birth > prev prev snap txg) prev unique += bs */
+ if (dsl_dataset_phys(ds->ds_prev)->ds_next_snap_obj ==
+ ds->ds_object && bp->blk_birth >
+ dsl_dataset_phys(ds->ds_prev)->ds_prev_snap_txg) {
+ dmu_buf_will_dirty(ds->ds_prev->ds_dbuf, tx);
+ mutex_enter(&ds->ds_prev->ds_lock);
+ dsl_dataset_phys(ds->ds_prev)->ds_unique_bytes += used;
+ mutex_exit(&ds->ds_prev->ds_lock);
+ }
+ if (bp->blk_birth > ds->ds_dir->dd_origin_txg) {
+ dsl_dir_transfer_space(ds->ds_dir, used,
+ DD_USED_HEAD, DD_USED_SNAP, tx);
+ }
+ }
+ mutex_enter(&ds->ds_lock);
+ ASSERT3U(dsl_dataset_phys(ds)->ds_referenced_bytes, >=, used);
+ dsl_dataset_phys(ds)->ds_referenced_bytes -= used;
+ ASSERT3U(dsl_dataset_phys(ds)->ds_compressed_bytes, >=, compressed);
+ dsl_dataset_phys(ds)->ds_compressed_bytes -= compressed;
+ ASSERT3U(dsl_dataset_phys(ds)->ds_uncompressed_bytes, >=, uncompressed);
+ dsl_dataset_phys(ds)->ds_uncompressed_bytes -= uncompressed;
+ mutex_exit(&ds->ds_lock);
+
+ return (used);
+}
+
+/*
+ * We have to release the fsid synchronously or we risk that a subsequent
+ * mount of the same dataset will fail to unique_insert the fsid. This
+ * failure would manifest itself as the fsid of this dataset changing
+ * between mounts, which makes NFS clients quite unhappy.
+ */
+static void
+dsl_dataset_evict_sync(void *dbu)
+{
+ dsl_dataset_t *ds = dbu;
+
+ ASSERT(ds->ds_owner == NULL);
+
+ unique_remove(ds->ds_fsid_guid);
+}
+
+static void
+dsl_dataset_evict_async(void *dbu)
+{
+ dsl_dataset_t *ds = dbu;
+
+ ASSERT(ds->ds_owner == NULL);
+
+ ds->ds_dbuf = NULL;
+
+ if (ds->ds_objset != NULL)
+ dmu_objset_evict(ds->ds_objset);
+
+ if (ds->ds_prev) {
+ dsl_dataset_rele(ds->ds_prev, ds);
+ ds->ds_prev = NULL;
+ }
+
+ bplist_destroy(&ds->ds_pending_deadlist);
+ if (dsl_deadlist_is_open(&ds->ds_deadlist))
+ dsl_deadlist_close(&ds->ds_deadlist);
+ if (dsl_deadlist_is_open(&ds->ds_remap_deadlist))
+ dsl_deadlist_close(&ds->ds_remap_deadlist);
+ if (ds->ds_dir)
+ dsl_dir_async_rele(ds->ds_dir, ds);
+
+ ASSERT(!list_link_active(&ds->ds_synced_link));
+
+ list_destroy(&ds->ds_prop_cbs);
+ if (mutex_owned(&ds->ds_lock))
+ mutex_exit(&ds->ds_lock);
+ mutex_destroy(&ds->ds_lock);
+ if (mutex_owned(&ds->ds_opening_lock))
+ mutex_exit(&ds->ds_opening_lock);
+ mutex_destroy(&ds->ds_opening_lock);
+ mutex_destroy(&ds->ds_sendstream_lock);
+ mutex_destroy(&ds->ds_remap_deadlist_lock);
+ zfs_refcount_destroy(&ds->ds_longholds);
+ rrw_destroy(&ds->ds_bp_rwlock);
+
+ kmem_free(ds, sizeof (dsl_dataset_t));
+}
+
+int
+dsl_dataset_get_snapname(dsl_dataset_t *ds)
+{
+ dsl_dataset_phys_t *headphys;
+ int err;
+ dmu_buf_t *headdbuf;
+ dsl_pool_t *dp = ds->ds_dir->dd_pool;
+ objset_t *mos = dp->dp_meta_objset;
+
+ if (ds->ds_snapname[0])
+ return (0);
+ if (dsl_dataset_phys(ds)->ds_next_snap_obj == 0)
+ return (0);
+
+ err = dmu_bonus_hold(mos, dsl_dir_phys(ds->ds_dir)->dd_head_dataset_obj,
+ FTAG, &headdbuf);
+ if (err != 0)
+ return (err);
+ headphys = headdbuf->db_data;
+ err = zap_value_search(dp->dp_meta_objset,
+ headphys->ds_snapnames_zapobj, ds->ds_object, 0, ds->ds_snapname);
+ dmu_buf_rele(headdbuf, FTAG);
+ return (err);
+}
+
+int
+dsl_dataset_snap_lookup(dsl_dataset_t *ds, const char *name, uint64_t *value)
+{
+ objset_t *mos = ds->ds_dir->dd_pool->dp_meta_objset;
+ uint64_t snapobj = dsl_dataset_phys(ds)->ds_snapnames_zapobj;
+ matchtype_t mt = 0;
+ int err;
+
+ if (dsl_dataset_phys(ds)->ds_flags & DS_FLAG_CI_DATASET)
+ mt = MT_NORMALIZE;
+
+ err = zap_lookup_norm(mos, snapobj, name, 8, 1,
+ value, mt, NULL, 0, NULL);
+ if (err == ENOTSUP && (mt & MT_NORMALIZE))
+ err = zap_lookup(mos, snapobj, name, 8, 1, value);
+ return (err);
+}
+
+int
+dsl_dataset_snap_remove(dsl_dataset_t *ds, const char *name, dmu_tx_t *tx,
+ boolean_t adj_cnt)
+{
+ objset_t *mos = ds->ds_dir->dd_pool->dp_meta_objset;
+ uint64_t snapobj = dsl_dataset_phys(ds)->ds_snapnames_zapobj;
+ matchtype_t mt = 0;
+ int err;
+
+ dsl_dir_snap_cmtime_update(ds->ds_dir);
+
+ if (dsl_dataset_phys(ds)->ds_flags & DS_FLAG_CI_DATASET)
+ mt = MT_NORMALIZE;
+
+ err = zap_remove_norm(mos, snapobj, name, mt, tx);
+ if (err == ENOTSUP && (mt & MT_NORMALIZE))
+ err = zap_remove(mos, snapobj, name, tx);
+
+ if (err == 0 && adj_cnt)
+ dsl_fs_ss_count_adjust(ds->ds_dir, -1,
+ DD_FIELD_SNAPSHOT_COUNT, tx);
+
+ return (err);
+}
+
+boolean_t
+dsl_dataset_try_add_ref(dsl_pool_t *dp, dsl_dataset_t *ds, void *tag)
+{
+ dmu_buf_t *dbuf = ds->ds_dbuf;
+ boolean_t result = B_FALSE;
+
+ if (dbuf != NULL && dmu_buf_try_add_ref(dbuf, dp->dp_meta_objset,
+ ds->ds_object, DMU_BONUS_BLKID, tag)) {
+
+ if (ds == dmu_buf_get_user(dbuf))
+ result = B_TRUE;
+ else
+ dmu_buf_rele(dbuf, tag);
+ }
+
+ return (result);
+}
+
+int
+dsl_dataset_hold_obj(dsl_pool_t *dp, uint64_t dsobj, void *tag,
+ dsl_dataset_t **dsp)
+{
+ objset_t *mos = dp->dp_meta_objset;
+ dmu_buf_t *dbuf;
+ dsl_dataset_t *ds;
+ int err;
+ dmu_object_info_t doi;
+
+ ASSERT(dsl_pool_config_held(dp));
+
+ err = dmu_bonus_hold(mos, dsobj, tag, &dbuf);
+ if (err != 0)
+ return (err);
+
+ /* Make sure dsobj has the correct object type. */
+ dmu_object_info_from_db(dbuf, &doi);
+ if (doi.doi_bonus_type != DMU_OT_DSL_DATASET) {
+ dmu_buf_rele(dbuf, tag);
+ return (SET_ERROR(EINVAL));
+ }
+
+ ds = dmu_buf_get_user(dbuf);
+ if (ds == NULL) {
+ dsl_dataset_t *winner = NULL;
+
+ ds = kmem_zalloc(sizeof (dsl_dataset_t), KM_SLEEP);
+ ds->ds_dbuf = dbuf;
+ ds->ds_object = dsobj;
+ ds->ds_is_snapshot = dsl_dataset_phys(ds)->ds_num_children != 0;
+
+ err = dsl_dir_hold_obj(dp, dsl_dataset_phys(ds)->ds_dir_obj,
+ NULL, ds, &ds->ds_dir);
+ if (err != 0) {
+ kmem_free(ds, sizeof (dsl_dataset_t));
+ dmu_buf_rele(dbuf, tag);
+ return (err);
+ }
+
+ mutex_init(&ds->ds_lock, NULL, MUTEX_DEFAULT, NULL);
+ mutex_init(&ds->ds_opening_lock, NULL, MUTEX_DEFAULT, NULL);
+ mutex_init(&ds->ds_sendstream_lock, NULL, MUTEX_DEFAULT, NULL);
+ mutex_init(&ds->ds_remap_deadlist_lock,
+ NULL, MUTEX_DEFAULT, NULL);
+ rrw_init(&ds->ds_bp_rwlock, B_FALSE);
+ zfs_refcount_create(&ds->ds_longholds);
+
+ bplist_create(&ds->ds_pending_deadlist);
+
+ list_create(&ds->ds_sendstreams, sizeof (dmu_sendarg_t),
+ offsetof(dmu_sendarg_t, dsa_link));
+
+ list_create(&ds->ds_prop_cbs, sizeof (dsl_prop_cb_record_t),
+ offsetof(dsl_prop_cb_record_t, cbr_ds_node));
+
+ if (doi.doi_type == DMU_OTN_ZAP_METADATA) {
+ for (spa_feature_t f = 0; f < SPA_FEATURES; f++) {
+ if (!(spa_feature_table[f].fi_flags &
+ ZFEATURE_FLAG_PER_DATASET))
+ continue;
+ err = zap_contains(mos, dsobj,
+ spa_feature_table[f].fi_guid);
+ if (err == 0) {
+ ds->ds_feature_inuse[f] = B_TRUE;
+ } else {
+ ASSERT3U(err, ==, ENOENT);
+ err = 0;
+ }
+ }
+ }
+
+ if (!ds->ds_is_snapshot) {
+ ds->ds_snapname[0] = '\0';
+ if (dsl_dataset_phys(ds)->ds_prev_snap_obj != 0) {
+ err = dsl_dataset_hold_obj(dp,
+ dsl_dataset_phys(ds)->ds_prev_snap_obj,
+ ds, &ds->ds_prev);
+ }
+ if (doi.doi_type == DMU_OTN_ZAP_METADATA) {
+ int zaperr = zap_lookup(mos, ds->ds_object,
+ DS_FIELD_BOOKMARK_NAMES,
+ sizeof (ds->ds_bookmarks), 1,
+ &ds->ds_bookmarks);
+ if (zaperr != ENOENT)
+ VERIFY0(zaperr);
+ }
+ } else {
+ if (zfs_flags & ZFS_DEBUG_SNAPNAMES)
+ err = dsl_dataset_get_snapname(ds);
+ if (err == 0 &&
+ dsl_dataset_phys(ds)->ds_userrefs_obj != 0) {
+ err = zap_count(
+ ds->ds_dir->dd_pool->dp_meta_objset,
+ dsl_dataset_phys(ds)->ds_userrefs_obj,
+ &ds->ds_userrefs);
+ }
+ }
+
+ if (err == 0 && !ds->ds_is_snapshot) {
+ err = dsl_prop_get_int_ds(ds,
+ zfs_prop_to_name(ZFS_PROP_REFRESERVATION),
+ &ds->ds_reserved);
+ if (err == 0) {
+ err = dsl_prop_get_int_ds(ds,
+ zfs_prop_to_name(ZFS_PROP_REFQUOTA),
+ &ds->ds_quota);
+ }
+ } else {
+ ds->ds_reserved = ds->ds_quota = 0;
+ }
+
+ dsl_deadlist_open(&ds->ds_deadlist,
+ mos, dsl_dataset_phys(ds)->ds_deadlist_obj);
+ uint64_t remap_deadlist_obj =
+ dsl_dataset_get_remap_deadlist_object(ds);
+ if (remap_deadlist_obj != 0) {
+ dsl_deadlist_open(&ds->ds_remap_deadlist, mos,
+ remap_deadlist_obj);
+ }
+
+ dmu_buf_init_user(&ds->ds_dbu, dsl_dataset_evict_sync,
+ dsl_dataset_evict_async, &ds->ds_dbuf);
+ if (err == 0)
+ winner = dmu_buf_set_user_ie(dbuf, &ds->ds_dbu);
+
+ if (err != 0 || winner != NULL) {
+ bplist_destroy(&ds->ds_pending_deadlist);
+ dsl_deadlist_close(&ds->ds_deadlist);
+ if (dsl_deadlist_is_open(&ds->ds_remap_deadlist))
+ dsl_deadlist_close(&ds->ds_remap_deadlist);
+ if (ds->ds_prev)
+ dsl_dataset_rele(ds->ds_prev, ds);
+ dsl_dir_rele(ds->ds_dir, ds);
+ list_destroy(&ds->ds_prop_cbs);
+ list_destroy(&ds->ds_sendstreams);
+ mutex_destroy(&ds->ds_lock);
+ mutex_destroy(&ds->ds_opening_lock);
+ mutex_destroy(&ds->ds_sendstream_lock);
+ mutex_destroy(&ds->ds_remap_deadlist_lock);
+ zfs_refcount_destroy(&ds->ds_longholds);
+ rrw_destroy(&ds->ds_bp_rwlock);
+ kmem_free(ds, sizeof (dsl_dataset_t));
+ if (err != 0) {
+ dmu_buf_rele(dbuf, tag);
+ return (err);
+ }
+ ds = winner;
+ } else {
+ ds->ds_fsid_guid =
+ unique_insert(dsl_dataset_phys(ds)->ds_fsid_guid);
+ if (ds->ds_fsid_guid !=
+ dsl_dataset_phys(ds)->ds_fsid_guid) {
+ zfs_dbgmsg("ds_fsid_guid changed from "
+ "%llx to %llx for pool %s dataset id %llu",
+ (long long)
+ dsl_dataset_phys(ds)->ds_fsid_guid,
+ (long long)ds->ds_fsid_guid,
+ spa_name(dp->dp_spa),
+ dsobj);
+ }
+ }
+ }
+ ASSERT3P(ds->ds_dbuf, ==, dbuf);
+ ASSERT3P(dsl_dataset_phys(ds), ==, dbuf->db_data);
+ ASSERT(dsl_dataset_phys(ds)->ds_prev_snap_obj != 0 ||
+ spa_version(dp->dp_spa) < SPA_VERSION_ORIGIN ||
+ dp->dp_origin_snap == NULL || ds == dp->dp_origin_snap);
+ *dsp = ds;
+ return (0);
+}
+
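+/*
+ * Hold a dataset by name; a name of the form "pool/fs@snap" resolves
+ * to the snapshot.  A minimal usage sketch, assuming the caller
+ * already holds the pool config lock ("pool/fs" is a hypothetical
+ * name):
+ *
+ *	dsl_dataset_t *ds;
+ *	int err = dsl_dataset_hold(dp, "pool/fs", FTAG, &ds);
+ *	if (err == 0) {
+ *		... use the dataset ...
+ *		dsl_dataset_rele(ds, FTAG);
+ *	}
+ */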
+int
+dsl_dataset_hold(dsl_pool_t *dp, const char *name,
+ void *tag, dsl_dataset_t **dsp)
+{
+ dsl_dir_t *dd;
+ const char *snapname;
+ uint64_t obj;
+ int err = 0;
+ dsl_dataset_t *ds;
+
+ err = dsl_dir_hold(dp, name, FTAG, &dd, &snapname);
+ if (err != 0)
+ return (err);
+
+ ASSERT(dsl_pool_config_held(dp));
+ obj = dsl_dir_phys(dd)->dd_head_dataset_obj;
+ if (obj != 0)
+ err = dsl_dataset_hold_obj(dp, obj, tag, &ds);
+ else
+ err = SET_ERROR(ENOENT);
+
+ /* we may be looking for a snapshot */
+ if (err == 0 && snapname != NULL) {
+ dsl_dataset_t *snap_ds;
+
+ if (*snapname++ != '@') {
+ dsl_dataset_rele(ds, tag);
+ dsl_dir_rele(dd, FTAG);
+ return (SET_ERROR(ENOENT));
+ }
+
+ dprintf("looking for snapshot '%s'\n", snapname);
+ err = dsl_dataset_snap_lookup(ds, snapname, &obj);
+ if (err == 0)
+ err = dsl_dataset_hold_obj(dp, obj, tag, &snap_ds);
+ dsl_dataset_rele(ds, tag);
+
+ if (err == 0) {
+ mutex_enter(&snap_ds->ds_lock);
+ if (snap_ds->ds_snapname[0] == 0)
+ (void) strlcpy(snap_ds->ds_snapname, snapname,
+ sizeof (snap_ds->ds_snapname));
+ mutex_exit(&snap_ds->ds_lock);
+ ds = snap_ds;
+ }
+ }
+ if (err == 0)
+ *dsp = ds;
+ dsl_dir_rele(dd, FTAG);
+ return (err);
+}
+
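+/*
+ * Owning a dataset is a hold plus exclusive ownership: the tag is
+ * recorded in ds_owner and a long hold is taken, so at most one owner
+ * can exist at a time (EBUSY otherwise).  Release with
+ * dsl_dataset_disown().
+ */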
+int
+dsl_dataset_own_obj(dsl_pool_t *dp, uint64_t dsobj,
+ void *tag, dsl_dataset_t **dsp)
+{
+ int err = dsl_dataset_hold_obj(dp, dsobj, tag, dsp);
+ if (err != 0)
+ return (err);
+ if (!dsl_dataset_tryown(*dsp, tag)) {
+ dsl_dataset_rele(*dsp, tag);
+ *dsp = NULL;
+ return (SET_ERROR(EBUSY));
+ }
+ return (0);
+}
+
+int
+dsl_dataset_own(dsl_pool_t *dp, const char *name,
+ void *tag, dsl_dataset_t **dsp)
+{
+ int err = dsl_dataset_hold(dp, name, tag, dsp);
+ if (err != 0)
+ return (err);
+ if (!dsl_dataset_tryown(*dsp, tag)) {
+ dsl_dataset_rele(*dsp, tag);
+ return (SET_ERROR(EBUSY));
+ }
+ return (0);
+}
+
+/*
+ * See the comment above dsl_pool_hold() for details. In summary, a long
+ * hold is used to prevent destruction of a dataset while the pool hold
+ * is dropped, allowing other concurrent operations (e.g. spa_sync()).
+ *
+ * The dataset and pool must be held when this function is called. After it
+ * is called, the pool hold may be released while the dataset is still held
+ * and accessed.
+ */
+void
+dsl_dataset_long_hold(dsl_dataset_t *ds, void *tag)
+{
+ ASSERT(dsl_pool_config_held(ds->ds_dir->dd_pool));
+ (void) zfs_refcount_add(&ds->ds_longholds, tag);
+}
+
+void
+dsl_dataset_long_rele(dsl_dataset_t *ds, void *tag)
+{
+ (void) zfs_refcount_remove(&ds->ds_longholds, tag);
+}
+
+/* Return B_TRUE if there are any long holds on this dataset. */
+boolean_t
+dsl_dataset_long_held(dsl_dataset_t *ds)
+{
+ return (!zfs_refcount_is_zero(&ds->ds_longholds));
+}
+
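+/*
+ * Render the dataset's full name ("dir/path[@snapname]") into 'name',
+ * which must hold at least ZFS_MAX_DATASET_NAME_LEN bytes; the MOS is
+ * reported as "mos".
+ */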
+void
+dsl_dataset_name(dsl_dataset_t *ds, char *name)
+{
+ if (ds == NULL) {
+ (void) strcpy(name, "mos");
+ } else {
+ dsl_dir_name(ds->ds_dir, name);
+ VERIFY0(dsl_dataset_get_snapname(ds));
+ if (ds->ds_snapname[0]) {
+ VERIFY3U(strlcat(name, "@", ZFS_MAX_DATASET_NAME_LEN),
+ <, ZFS_MAX_DATASET_NAME_LEN);
+ /*
+ * We use a "recursive" mutex so that we
+ * can call dprintf_ds() with ds_lock held.
+ */
+ if (!MUTEX_HELD(&ds->ds_lock)) {
+ mutex_enter(&ds->ds_lock);
+ VERIFY3U(strlcat(name, ds->ds_snapname,
+ ZFS_MAX_DATASET_NAME_LEN), <,
+ ZFS_MAX_DATASET_NAME_LEN);
+ mutex_exit(&ds->ds_lock);
+ } else {
+ VERIFY3U(strlcat(name, ds->ds_snapname,
+ ZFS_MAX_DATASET_NAME_LEN), <,
+ ZFS_MAX_DATASET_NAME_LEN);
+ }
+ }
+ }
+}
+
+int
+dsl_dataset_namelen(dsl_dataset_t *ds)
+{
+ VERIFY0(dsl_dataset_get_snapname(ds));
+ mutex_enter(&ds->ds_lock);
+ int len = dsl_dir_namelen(ds->ds_dir) + 1 + strlen(ds->ds_snapname);
+ mutex_exit(&ds->ds_lock);
+ return (len);
+}
+
+void
+dsl_dataset_rele(dsl_dataset_t *ds, void *tag)
+{
+ dmu_buf_rele(ds->ds_dbuf, tag);
+}
+
+void
+dsl_dataset_disown(dsl_dataset_t *ds, void *tag)
+{
+ ASSERT3P(ds->ds_owner, ==, tag);
+ ASSERT(ds->ds_dbuf != NULL);
+
+ mutex_enter(&ds->ds_lock);
+ ds->ds_owner = NULL;
+ mutex_exit(&ds->ds_lock);
+ dsl_dataset_long_rele(ds, tag);
+ dsl_dataset_rele(ds, tag);
+}
+
+boolean_t
+dsl_dataset_tryown(dsl_dataset_t *ds, void *tag)
+{
+ boolean_t gotit = FALSE;
+
+ ASSERT(dsl_pool_config_held(ds->ds_dir->dd_pool));
+ mutex_enter(&ds->ds_lock);
+ if (ds->ds_owner == NULL && !DS_IS_INCONSISTENT(ds)) {
+ ds->ds_owner = tag;
+ dsl_dataset_long_hold(ds, tag);
+ gotit = TRUE;
+ }
+ mutex_exit(&ds->ds_lock);
+ return (gotit);
+}
+
+boolean_t
+dsl_dataset_has_owner(dsl_dataset_t *ds)
+{
+ boolean_t rv;
+ mutex_enter(&ds->ds_lock);
+ rv = (ds->ds_owner != NULL);
+ mutex_exit(&ds->ds_lock);
+ return (rv);
+}
+
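+/*
+ * Activate a per-dataset feature: bump the pool-wide feature refcount
+ * and record the feature GUID as a ZAP entry on the (zapified) dataset
+ * object -- the same entry dsl_dataset_hold_obj() later probes with
+ * zap_contains() to populate ds_feature_inuse[].
+ */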
+static void
+dsl_dataset_activate_feature(uint64_t dsobj, spa_feature_t f, dmu_tx_t *tx)
+{
+ spa_t *spa = dmu_tx_pool(tx)->dp_spa;
+ objset_t *mos = dmu_tx_pool(tx)->dp_meta_objset;
+ uint64_t zero = 0;
+
+ VERIFY(spa_feature_table[f].fi_flags & ZFEATURE_FLAG_PER_DATASET);
+
+ spa_feature_incr(spa, f, tx);
+ dmu_object_zapify(mos, dsobj, DMU_OT_DSL_DATASET, tx);
+
+ VERIFY0(zap_add(mos, dsobj, spa_feature_table[f].fi_guid,
+ sizeof (zero), 1, &zero, tx));
+}
+
+void
+dsl_dataset_deactivate_feature(uint64_t dsobj, spa_feature_t f, dmu_tx_t *tx)
+{
+ spa_t *spa = dmu_tx_pool(tx)->dp_spa;
+ objset_t *mos = dmu_tx_pool(tx)->dp_meta_objset;
+
+ VERIFY(spa_feature_table[f].fi_flags & ZFEATURE_FLAG_PER_DATASET);
+
+ VERIFY0(zap_remove(mos, dsobj, spa_feature_table[f].fi_guid, tx));
+ spa_feature_decr(spa, f, tx);
+}
+
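+/*
+ * Allocate and initialize the on-disk dataset object for 'dd' in
+ * syncing context.  When cloning, the new dataset inherits the
+ * origin's block pointer, space accounting, selected flags and active
+ * per-dataset features, and is registered in the origin's
+ * next_clones/dd_clones ZAPs.  Returns the new object number.
+ */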
+uint64_t
+dsl_dataset_create_sync_dd(dsl_dir_t *dd, dsl_dataset_t *origin,
+ uint64_t flags, dmu_tx_t *tx)
+{
+ dsl_pool_t *dp = dd->dd_pool;
+ dmu_buf_t *dbuf;
+ dsl_dataset_phys_t *dsphys;
+ uint64_t dsobj;
+ objset_t *mos = dp->dp_meta_objset;
+
+ if (origin == NULL)
+ origin = dp->dp_origin_snap;
+
+ ASSERT(origin == NULL || origin->ds_dir->dd_pool == dp);
+ ASSERT(origin == NULL || dsl_dataset_phys(origin)->ds_num_children > 0);
+ ASSERT(dmu_tx_is_syncing(tx));
+ ASSERT(dsl_dir_phys(dd)->dd_head_dataset_obj == 0);
+
+ dsobj = dmu_object_alloc(mos, DMU_OT_DSL_DATASET, 0,
+ DMU_OT_DSL_DATASET, sizeof (dsl_dataset_phys_t), tx);
+ VERIFY0(dmu_bonus_hold(mos, dsobj, FTAG, &dbuf));
+ dmu_buf_will_dirty(dbuf, tx);
+ dsphys = dbuf->db_data;
+ bzero(dsphys, sizeof (dsl_dataset_phys_t));
+ dsphys->ds_dir_obj = dd->dd_object;
+ dsphys->ds_flags = flags;
+ dsphys->ds_fsid_guid = unique_create();
+ do {
+ (void) random_get_pseudo_bytes((void*)&dsphys->ds_guid,
+ sizeof (dsphys->ds_guid));
+ } while (dsphys->ds_guid == 0);
+ dsphys->ds_snapnames_zapobj =
+ zap_create_norm(mos, U8_TEXTPREP_TOUPPER, DMU_OT_DSL_DS_SNAP_MAP,
+ DMU_OT_NONE, 0, tx);
+ dsphys->ds_creation_time = gethrestime_sec();
+ dsphys->ds_creation_txg = tx->tx_txg == TXG_INITIAL ? 1 : tx->tx_txg;
+
+ if (origin == NULL) {
+ dsphys->ds_deadlist_obj = dsl_deadlist_alloc(mos, tx);
+ } else {
+ dsl_dataset_t *ohds; /* head of the origin snapshot */
+
+ dsphys->ds_prev_snap_obj = origin->ds_object;
+ dsphys->ds_prev_snap_txg =
+ dsl_dataset_phys(origin)->ds_creation_txg;
+ dsphys->ds_referenced_bytes =
+ dsl_dataset_phys(origin)->ds_referenced_bytes;
+ dsphys->ds_compressed_bytes =
+ dsl_dataset_phys(origin)->ds_compressed_bytes;
+ dsphys->ds_uncompressed_bytes =
+ dsl_dataset_phys(origin)->ds_uncompressed_bytes;
+ rrw_enter(&origin->ds_bp_rwlock, RW_READER, FTAG);
+ dsphys->ds_bp = dsl_dataset_phys(origin)->ds_bp;
+ rrw_exit(&origin->ds_bp_rwlock, FTAG);
+
+ /*
+ * Inherit flags that describe the dataset's contents
+ * (INCONSISTENT) or properties (Case Insensitive).
+ */
+ dsphys->ds_flags |= dsl_dataset_phys(origin)->ds_flags &
+ (DS_FLAG_INCONSISTENT | DS_FLAG_CI_DATASET);
+
+ for (spa_feature_t f = 0; f < SPA_FEATURES; f++) {
+ if (origin->ds_feature_inuse[f])
+ dsl_dataset_activate_feature(dsobj, f, tx);
+ }
+
+ dmu_buf_will_dirty(origin->ds_dbuf, tx);
+ dsl_dataset_phys(origin)->ds_num_children++;
+
+ VERIFY0(dsl_dataset_hold_obj(dp,
+ dsl_dir_phys(origin->ds_dir)->dd_head_dataset_obj,
+ FTAG, &ohds));
+ dsphys->ds_deadlist_obj = dsl_deadlist_clone(&ohds->ds_deadlist,
+ dsphys->ds_prev_snap_txg, dsphys->ds_prev_snap_obj, tx);
+ dsl_dataset_rele(ohds, FTAG);
+
+ if (spa_version(dp->dp_spa) >= SPA_VERSION_NEXT_CLONES) {
+ if (dsl_dataset_phys(origin)->ds_next_clones_obj == 0) {
+ dsl_dataset_phys(origin)->ds_next_clones_obj =
+ zap_create(mos,
+ DMU_OT_NEXT_CLONES, DMU_OT_NONE, 0, tx);
+ }
+ VERIFY0(zap_add_int(mos,
+ dsl_dataset_phys(origin)->ds_next_clones_obj,
+ dsobj, tx));
+ }
+
+ dmu_buf_will_dirty(dd->dd_dbuf, tx);
+ dsl_dir_phys(dd)->dd_origin_obj = origin->ds_object;
+ if (spa_version(dp->dp_spa) >= SPA_VERSION_DIR_CLONES) {
+ if (dsl_dir_phys(origin->ds_dir)->dd_clones == 0) {
+ dmu_buf_will_dirty(origin->ds_dir->dd_dbuf, tx);
+ dsl_dir_phys(origin->ds_dir)->dd_clones =
+ zap_create(mos,
+ DMU_OT_DSL_CLONES, DMU_OT_NONE, 0, tx);
+ }
+ VERIFY0(zap_add_int(mos,
+ dsl_dir_phys(origin->ds_dir)->dd_clones,
+ dsobj, tx));
+ }
+ }
+
+ if (spa_version(dp->dp_spa) >= SPA_VERSION_UNIQUE_ACCURATE)
+ dsphys->ds_flags |= DS_FLAG_UNIQUE_ACCURATE;
+
+ dmu_buf_rele(dbuf, FTAG);
+
+ dmu_buf_will_dirty(dd->dd_dbuf, tx);
+ dsl_dir_phys(dd)->dd_head_dataset_obj = dsobj;
+
+ return (dsobj);
+}
+
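+/*
+ * If the objset carries a non-empty ZIL header (stale data inherited
+ * from a clone origin), zero it and immediately sync the dataset so
+ * the cleared header reaches disk.
+ */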
+static void
+dsl_dataset_zero_zil(dsl_dataset_t *ds, dmu_tx_t *tx)
+{
+ objset_t *os;
+
+ VERIFY0(dmu_objset_from_ds(ds, &os));
+ if (bcmp(&os->os_zil_header, &zero_zil, sizeof (zero_zil)) != 0) {
+ dsl_pool_t *dp = ds->ds_dir->dd_pool;
+ zio_t *zio;
+
+ bzero(&os->os_zil_header, sizeof (os->os_zil_header));
+
+ zio = zio_root(dp->dp_spa, NULL, NULL, ZIO_FLAG_MUSTSUCCEED);
+ dsl_dataset_sync(ds, zio, tx);
+ VERIFY0(zio_wait(zio));
+
+ /* dsl_dataset_sync_done will drop this reference. */
+ dmu_buf_add_ref(ds->ds_dbuf, ds);
+ dsl_dataset_sync_done(ds, tx);
+ }
+}
+
+uint64_t
+dsl_dataset_create_sync(dsl_dir_t *pdd, const char *lastname,
+ dsl_dataset_t *origin, uint64_t flags, cred_t *cr, dmu_tx_t *tx)
+{
+ dsl_pool_t *dp = pdd->dd_pool;
+ uint64_t dsobj, ddobj;
+ dsl_dir_t *dd;
+
+ ASSERT(dmu_tx_is_syncing(tx));
+ ASSERT(lastname[0] != '@');
+
+ ddobj = dsl_dir_create_sync(dp, pdd, lastname, tx);
+ VERIFY0(dsl_dir_hold_obj(dp, ddobj, lastname, FTAG, &dd));
+
+ dsobj = dsl_dataset_create_sync_dd(dd, origin,
+ flags & ~DS_CREATE_FLAG_NODIRTY, tx);
+
+ dsl_deleg_set_create_perms(dd, tx, cr);
+
+ /*
+ * Since we're creating a new node we know it's a leaf, so we can
+ * initialize the counts if the limit feature is active.
+ */
+ if (spa_feature_is_active(dp->dp_spa, SPA_FEATURE_FS_SS_LIMIT)) {
+ uint64_t cnt = 0;
+ objset_t *os = dd->dd_pool->dp_meta_objset;
+
+ dsl_dir_zapify(dd, tx);
+ VERIFY0(zap_add(os, dd->dd_object, DD_FIELD_FILESYSTEM_COUNT,
+ sizeof (cnt), 1, &cnt, tx));
+ VERIFY0(zap_add(os, dd->dd_object, DD_FIELD_SNAPSHOT_COUNT,
+ sizeof (cnt), 1, &cnt, tx));
+ }
+
+ dsl_dir_rele(dd, FTAG);
+
+ /*
+ * If we are creating a clone, make sure we zero out any stale
+ * data from the origin snapshot's ZIL header.
+ */
+ if (origin != NULL && !(flags & DS_CREATE_FLAG_NODIRTY)) {
+ dsl_dataset_t *ds;
+
+ VERIFY0(dsl_dataset_hold_obj(dp, dsobj, FTAG, &ds));
+ dsl_dataset_zero_zil(ds, tx);
+ dsl_dataset_rele(ds, FTAG);
+ }
+
+ return (dsobj);
+}
+
+#ifdef __FreeBSD__
+/* FreeBSD ioctl compat begin */
+struct destroyarg {
+ nvlist_t *nvl;
+ const char *snapname;
+};
+
+static int
+dsl_check_snap_cb(const char *name, void *arg)
+{
+	struct destroyarg *da = arg;
+	char *dsname;
+
+ dsname = kmem_asprintf("%s@%s", name, da->snapname);
+ fnvlist_add_boolean(da->nvl, dsname);
+ kmem_free(dsname, strlen(dsname) + 1);
+
+ return (0);
+}
+
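+/*
+ * Build an nvlist of "<fs>@<snapname>" names covering 'fsname' and
+ * every filesystem below it; used by the FreeBSD ioctl compatibility
+ * shims.
+ */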
+int
+dmu_get_recursive_snaps_nvl(char *fsname, const char *snapname,
+ nvlist_t *snaps)
+{
+ struct destroyarg *da;
+ int err;
+
+ da = kmem_zalloc(sizeof (struct destroyarg), KM_SLEEP);
+ da->nvl = snaps;
+ da->snapname = snapname;
+ err = dmu_objset_find(fsname, dsl_check_snap_cb, da,
+ DS_FIND_CHILDREN);
+ kmem_free(da, sizeof (struct destroyarg));
+
+ return (err);
+}
+/* FreeBSD ioctl compat end */
+#endif /* __FreeBSD__ */
+
+/*
+ * The unique space in the head dataset can be calculated by subtracting
+ * the space used by the most recent snapshot that is still in use
+ * in this file system from the space currently in use. To figure out
+ * the space in the most recent snapshot still in use, we need to take
+ * the total space used in the snapshot and subtract out the space that
+ * has been freed up since the snapshot was taken.
+ */
+void
+dsl_dataset_recalc_head_uniq(dsl_dataset_t *ds)
+{
+ uint64_t mrs_used;
+ uint64_t dlused, dlcomp, dluncomp;
+
+ ASSERT(!ds->ds_is_snapshot);
+
+ if (dsl_dataset_phys(ds)->ds_prev_snap_obj != 0)
+ mrs_used = dsl_dataset_phys(ds->ds_prev)->ds_referenced_bytes;
+ else
+ mrs_used = 0;
+
+ dsl_deadlist_space(&ds->ds_deadlist, &dlused, &dlcomp, &dluncomp);
+
+ ASSERT3U(dlused, <=, mrs_used);
+ dsl_dataset_phys(ds)->ds_unique_bytes =
+ dsl_dataset_phys(ds)->ds_referenced_bytes - (mrs_used - dlused);
+
+ if (spa_version(ds->ds_dir->dd_pool->dp_spa) >=
+ SPA_VERSION_UNIQUE_ACCURATE)
+ dsl_dataset_phys(ds)->ds_flags |= DS_FLAG_UNIQUE_ACCURATE;
+}
+
+void
+dsl_dataset_remove_from_next_clones(dsl_dataset_t *ds, uint64_t obj,
+ dmu_tx_t *tx)
+{
+ objset_t *mos = ds->ds_dir->dd_pool->dp_meta_objset;
+ uint64_t count;
+ int err;
+
+ ASSERT(dsl_dataset_phys(ds)->ds_num_children >= 2);
+ err = zap_remove_int(mos, dsl_dataset_phys(ds)->ds_next_clones_obj,
+ obj, tx);
+ /*
+ * The err should not be ENOENT, but a bug in a previous version
+ * of the code could cause upgrade_clones_cb() to not set
+ * ds_next_snap_obj when it should, leading to a missing entry.
+ * If we knew that the pool was created after
+ * SPA_VERSION_NEXT_CLONES, we could assert that it isn't
+ * ENOENT. However, at least we can check that we don't have
+ * too many entries in the next_clones_obj even after failing to
+ * remove this one.
+ */
+ if (err != ENOENT)
+ VERIFY0(err);
+ ASSERT0(zap_count(mos, dsl_dataset_phys(ds)->ds_next_clones_obj,
+ &count));
+ ASSERT3U(count, <=, dsl_dataset_phys(ds)->ds_num_children - 2);
+}
+
+
+blkptr_t *
+dsl_dataset_get_blkptr(dsl_dataset_t *ds)
+{
+ return (&dsl_dataset_phys(ds)->ds_bp);
+}
+
+spa_t *
+dsl_dataset_get_spa(dsl_dataset_t *ds)
+{
+ return (ds->ds_dir->dd_pool->dp_spa);
+}
+
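+/*
+ * Mark the dataset dirty in this txg.  The first time it is added to
+ * the txg's dirty list an extra dbuf hold is taken so the dataset
+ * stays around until it is written out; snapshots must never be
+ * dirtied.
+ */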
+void
+dsl_dataset_dirty(dsl_dataset_t *ds, dmu_tx_t *tx)
+{
+ dsl_pool_t *dp;
+
+ if (ds == NULL) /* this is the meta-objset */
+ return;
+
+ ASSERT(ds->ds_objset != NULL);
+
+ if (dsl_dataset_phys(ds)->ds_next_snap_obj != 0)
+ panic("dirtying snapshot!");
+
+ /* Must not dirty a dataset in the same txg where it got snapshotted. */
+ ASSERT3U(tx->tx_txg, >, dsl_dataset_phys(ds)->ds_prev_snap_txg);
+
+ dp = ds->ds_dir->dd_pool;
+ if (txg_list_add(&dp->dp_dirty_datasets, ds, tx->tx_txg)) {
+ /* up the hold count until we can be written out */
+ dmu_buf_add_ref(ds->ds_dbuf, ds);
+ }
+}
+
+boolean_t
+dsl_dataset_is_dirty(dsl_dataset_t *ds)
+{
+ for (int t = 0; t < TXG_SIZE; t++) {
+ if (txg_list_member(&ds->ds_dir->dd_pool->dp_dirty_datasets,
+ ds, t))
+ return (B_TRUE);
+ }
+ return (B_FALSE);
+}
+
+static int
+dsl_dataset_snapshot_reserve_space(dsl_dataset_t *ds, dmu_tx_t *tx)
+{
+ uint64_t asize;
+
+ if (!dmu_tx_is_syncing(tx))
+ return (0);
+
+ /*
+ * If there's an fs-only reservation, any blocks that might become
+ * owned by the snapshot dataset must be accommodated by space
+ * outside of the reservation.
+ */
+ ASSERT(ds->ds_reserved == 0 || DS_UNIQUE_IS_ACCURATE(ds));
+ asize = MIN(dsl_dataset_phys(ds)->ds_unique_bytes, ds->ds_reserved);
+ if (asize > dsl_dir_space_available(ds->ds_dir, NULL, 0, TRUE))
+ return (SET_ERROR(ENOSPC));
+
+ /*
+ * Propagate any reserved space for this snapshot to other
+ * snapshot checks in this sync group.
+ */
+ if (asize > 0)
+ dsl_dir_willuse_space(ds->ds_dir, asize, tx);
+
+ return (0);
+}
+
+int
+dsl_dataset_snapshot_check_impl(dsl_dataset_t *ds, const char *snapname,
+ dmu_tx_t *tx, boolean_t recv, uint64_t cnt, cred_t *cr)
+{
+ int error;
+ uint64_t value;
+
+ ds->ds_trysnap_txg = tx->tx_txg;
+
+ if (!dmu_tx_is_syncing(tx))
+ return (0);
+
+ /*
+ * We don't allow multiple snapshots of the same txg. If there
+ * is already one, try again.
+ */
+ if (dsl_dataset_phys(ds)->ds_prev_snap_txg >= tx->tx_txg)
+ return (SET_ERROR(EAGAIN));
+
+ /*
+ * Check for conflicting snapshot name.
+ */
+ error = dsl_dataset_snap_lookup(ds, snapname, &value);
+ if (error == 0)
+ return (SET_ERROR(EEXIST));
+ if (error != ENOENT)
+ return (error);
+
+ /*
+ * We don't allow taking snapshots of inconsistent datasets, such as
+ * those into which we are currently receiving. However, if we are
+ * creating this snapshot as part of a receive, this check will be
+ * executed atomically with respect to the completion of the receive
+ * itself but prior to the clearing of DS_FLAG_INCONSISTENT; in this
+ * case we ignore this, knowing it will be fixed up for us shortly in
+ * dmu_recv_end_sync().
+ */
+ if (!recv && DS_IS_INCONSISTENT(ds))
+ return (SET_ERROR(EBUSY));
+
+ /*
+ * Skip the check for temporary snapshots or if we have already checked
+ * the counts in dsl_dataset_snapshot_check. This means we really only
+ * check the count here when we're receiving a stream.
+ */
+ if (cnt != 0 && cr != NULL) {
+ error = dsl_fs_ss_limit_check(ds->ds_dir, cnt,
+ ZFS_PROP_SNAPSHOT_LIMIT, NULL, cr);
+ if (error != 0)
+ return (error);
+ }
+
+ error = dsl_dataset_snapshot_reserve_space(ds, tx);
+ if (error != 0)
+ return (error);
+
+ return (0);
+}
+
+int
+dsl_dataset_snapshot_check(void *arg, dmu_tx_t *tx)
+{
+ dsl_dataset_snapshot_arg_t *ddsa = arg;
+ dsl_pool_t *dp = dmu_tx_pool(tx);
+ nvpair_t *pair;
+ int rv = 0;
+
+ /*
+ * Pre-compute how many total new snapshots will be created for each
+ * level in the tree and below. This is needed for validating the
+ * snapshot limit when either taking a recursive snapshot or when
+ * taking multiple snapshots.
+ *
+ * The problem is that the counts are not actually adjusted when
+ * we are checking, only when we finally sync. For a single snapshot,
+ * this is easy: the count will increase by 1 at each node up the tree,
+ * but it's more complicated for the recursive/multiple snapshot case.
+ *
+ * The dsl_fs_ss_limit_check function does recursively check the count
+ * at each level up the tree but since it is validating each snapshot
+ * independently we need to be sure that we are validating the complete
+ * count for the entire set of snapshots. We do this by rolling up the
+ * counts for each component of the name into an nvlist and then
+ * checking each of those cases with the aggregated count.
+ *
+ * This approach properly handles not only the recursive snapshot
+ * case (where we get all of those on the ddsa_snaps list) but also
+ * the sibling case (e.g. snapshot a/b and a/c so that we will also
+ * validate the limit on 'a' using a count of 2).
+ *
+ * We validate the snapshot names in the third loop and only report
+ * name errors once.
+ */
+ if (dmu_tx_is_syncing(tx)) {
+ nvlist_t *cnt_track = NULL;
+ cnt_track = fnvlist_alloc();
+
+ /* Rollup aggregated counts into the cnt_track list */
+ for (pair = nvlist_next_nvpair(ddsa->ddsa_snaps, NULL);
+ pair != NULL;
+ pair = nvlist_next_nvpair(ddsa->ddsa_snaps, pair)) {
+ char *pdelim;
+ uint64_t val;
+ char nm[MAXPATHLEN];
+
+ (void) strlcpy(nm, nvpair_name(pair), sizeof (nm));
+ pdelim = strchr(nm, '@');
+ if (pdelim == NULL)
+ continue;
+ *pdelim = '\0';
+
+ do {
+ if (nvlist_lookup_uint64(cnt_track, nm,
+ &val) == 0) {
+ /* update existing entry */
+ fnvlist_add_uint64(cnt_track, nm,
+ val + 1);
+ } else {
+ /* add to list */
+ fnvlist_add_uint64(cnt_track, nm, 1);
+ }
+
+ pdelim = strrchr(nm, '/');
+ if (pdelim != NULL)
+ *pdelim = '\0';
+ } while (pdelim != NULL);
+ }
+
+ /* Check aggregated counts at each level */
+ for (pair = nvlist_next_nvpair(cnt_track, NULL);
+ pair != NULL; pair = nvlist_next_nvpair(cnt_track, pair)) {
+ int error = 0;
+ char *name;
+ uint64_t cnt = 0;
+ dsl_dataset_t *ds;
+
+ name = nvpair_name(pair);
+ cnt = fnvpair_value_uint64(pair);
+ ASSERT(cnt > 0);
+
+ error = dsl_dataset_hold(dp, name, FTAG, &ds);
+ if (error == 0) {
+ error = dsl_fs_ss_limit_check(ds->ds_dir, cnt,
+ ZFS_PROP_SNAPSHOT_LIMIT, NULL,
+ ddsa->ddsa_cr);
+ dsl_dataset_rele(ds, FTAG);
+ }
+
+ if (error != 0) {
+ if (ddsa->ddsa_errors != NULL)
+ fnvlist_add_int32(ddsa->ddsa_errors,
+ name, error);
+ rv = error;
+ /* only report one error for this check */
+ break;
+ }
+ }
+ nvlist_free(cnt_track);
+ }
+
+ for (pair = nvlist_next_nvpair(ddsa->ddsa_snaps, NULL);
+ pair != NULL; pair = nvlist_next_nvpair(ddsa->ddsa_snaps, pair)) {
+ int error = 0;
+ dsl_dataset_t *ds;
+ char *name, *atp;
+ char dsname[ZFS_MAX_DATASET_NAME_LEN];
+
+ name = nvpair_name(pair);
+ if (strlen(name) >= ZFS_MAX_DATASET_NAME_LEN)
+ error = SET_ERROR(ENAMETOOLONG);
+ if (error == 0) {
+ atp = strchr(name, '@');
+ if (atp == NULL)
+ error = SET_ERROR(EINVAL);
+ if (error == 0)
+ (void) strlcpy(dsname, name, atp - name + 1);
+ }
+ if (error == 0)
+ error = dsl_dataset_hold(dp, dsname, FTAG, &ds);
+ if (error == 0) {
+ /* passing 0/NULL skips dsl_fs_ss_limit_check */
+ error = dsl_dataset_snapshot_check_impl(ds,
+ atp + 1, tx, B_FALSE, 0, NULL);
+ dsl_dataset_rele(ds, FTAG);
+ }
+
+ if (error != 0) {
+ if (ddsa->ddsa_errors != NULL) {
+ fnvlist_add_int32(ddsa->ddsa_errors,
+ name, error);
+ }
+ rv = error;
+ }
+ }
+
+ return (rv);
+}
+
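+/*
+ * Create the snapshot in syncing context: allocate a dataset object
+ * that copies the head's current block pointer and space accounting,
+ * splice it in front of the head in the prev/next snapshot chain,
+ * hand the head's deadlist (and any remap deadlist) to the snapshot,
+ * and reset the head's unique space to zero.
+ */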
+void
+dsl_dataset_snapshot_sync_impl(dsl_dataset_t *ds, const char *snapname,
+ dmu_tx_t *tx)
+{
+ dsl_pool_t *dp = ds->ds_dir->dd_pool;
+ dmu_buf_t *dbuf;
+ dsl_dataset_phys_t *dsphys;
+ uint64_t dsobj, crtxg;
+ objset_t *mos = dp->dp_meta_objset;
+ objset_t *os;
+
+ ASSERT(RRW_WRITE_HELD(&dp->dp_config_rwlock));
+
+ /*
+ * If we are on an old pool, the zil must not be active, in which
+ * case it will be zeroed. Usually zil_suspend() accomplishes this.
+ */
+ ASSERT(spa_version(dmu_tx_pool(tx)->dp_spa) >= SPA_VERSION_FAST_SNAP ||
+ dmu_objset_from_ds(ds, &os) != 0 ||
+ bcmp(&os->os_phys->os_zil_header, &zero_zil,
+ sizeof (zero_zil)) == 0);
+
+ /* Should not snapshot a dirty dataset. */
+ ASSERT(!txg_list_member(&ds->ds_dir->dd_pool->dp_dirty_datasets,
+ ds, tx->tx_txg));
+
+ dsl_fs_ss_count_adjust(ds->ds_dir, 1, DD_FIELD_SNAPSHOT_COUNT, tx);
+
+ /*
+ * The origin's ds_creation_txg has to be < TXG_INITIAL
+ */
+ if (strcmp(snapname, ORIGIN_DIR_NAME) == 0)
+ crtxg = 1;
+ else
+ crtxg = tx->tx_txg;
+
+ dsobj = dmu_object_alloc(mos, DMU_OT_DSL_DATASET, 0,
+ DMU_OT_DSL_DATASET, sizeof (dsl_dataset_phys_t), tx);
+ VERIFY0(dmu_bonus_hold(mos, dsobj, FTAG, &dbuf));
+ dmu_buf_will_dirty(dbuf, tx);
+ dsphys = dbuf->db_data;
+ bzero(dsphys, sizeof (dsl_dataset_phys_t));
+ dsphys->ds_dir_obj = ds->ds_dir->dd_object;
+ dsphys->ds_fsid_guid = unique_create();
+ do {
+ (void) random_get_pseudo_bytes((void*)&dsphys->ds_guid,
+ sizeof (dsphys->ds_guid));
+ } while (dsphys->ds_guid == 0);
+ dsphys->ds_prev_snap_obj = dsl_dataset_phys(ds)->ds_prev_snap_obj;
+ dsphys->ds_prev_snap_txg = dsl_dataset_phys(ds)->ds_prev_snap_txg;
+ dsphys->ds_next_snap_obj = ds->ds_object;
+ dsphys->ds_num_children = 1;
+ dsphys->ds_creation_time = gethrestime_sec();
+ dsphys->ds_creation_txg = crtxg;
+ dsphys->ds_deadlist_obj = dsl_dataset_phys(ds)->ds_deadlist_obj;
+ dsphys->ds_referenced_bytes = dsl_dataset_phys(ds)->ds_referenced_bytes;
+ dsphys->ds_compressed_bytes = dsl_dataset_phys(ds)->ds_compressed_bytes;
+ dsphys->ds_uncompressed_bytes =
+ dsl_dataset_phys(ds)->ds_uncompressed_bytes;
+ dsphys->ds_flags = dsl_dataset_phys(ds)->ds_flags;
+ rrw_enter(&ds->ds_bp_rwlock, RW_READER, FTAG);
+ dsphys->ds_bp = dsl_dataset_phys(ds)->ds_bp;
+ rrw_exit(&ds->ds_bp_rwlock, FTAG);
+ dmu_buf_rele(dbuf, FTAG);
+
+ for (spa_feature_t f = 0; f < SPA_FEATURES; f++) {
+ if (ds->ds_feature_inuse[f])
+ dsl_dataset_activate_feature(dsobj, f, tx);
+ }
+
+ ASSERT3U(ds->ds_prev != 0, ==,
+ dsl_dataset_phys(ds)->ds_prev_snap_obj != 0);
+ if (ds->ds_prev) {
+ uint64_t next_clones_obj =
+ dsl_dataset_phys(ds->ds_prev)->ds_next_clones_obj;
+ ASSERT(dsl_dataset_phys(ds->ds_prev)->ds_next_snap_obj ==
+ ds->ds_object ||
+ dsl_dataset_phys(ds->ds_prev)->ds_num_children > 1);
+ if (dsl_dataset_phys(ds->ds_prev)->ds_next_snap_obj ==
+ ds->ds_object) {
+ dmu_buf_will_dirty(ds->ds_prev->ds_dbuf, tx);
+ ASSERT3U(dsl_dataset_phys(ds)->ds_prev_snap_txg, ==,
+ dsl_dataset_phys(ds->ds_prev)->ds_creation_txg);
+ dsl_dataset_phys(ds->ds_prev)->ds_next_snap_obj = dsobj;
+ } else if (next_clones_obj != 0) {
+ dsl_dataset_remove_from_next_clones(ds->ds_prev,
+ dsphys->ds_next_snap_obj, tx);
+ VERIFY0(zap_add_int(mos,
+ next_clones_obj, dsobj, tx));
+ }
+ }
+
+ /*
+ * If we have a reference-reservation on this dataset, we will
+ * need to increase the amount of refreservation being charged
+ * since our unique space is going to zero.
+ */
+ if (ds->ds_reserved) {
+ int64_t delta;
+ ASSERT(DS_UNIQUE_IS_ACCURATE(ds));
+ delta = MIN(dsl_dataset_phys(ds)->ds_unique_bytes,
+ ds->ds_reserved);
+ dsl_dir_diduse_space(ds->ds_dir, DD_USED_REFRSRV,
+ delta, 0, 0, tx);
+ }
+
+ dmu_buf_will_dirty(ds->ds_dbuf, tx);
+ dsl_dataset_phys(ds)->ds_deadlist_obj =
+ dsl_deadlist_clone(&ds->ds_deadlist, UINT64_MAX,
+ dsl_dataset_phys(ds)->ds_prev_snap_obj, tx);
+ dsl_deadlist_close(&ds->ds_deadlist);
+ dsl_deadlist_open(&ds->ds_deadlist, mos,
+ dsl_dataset_phys(ds)->ds_deadlist_obj);
+ dsl_deadlist_add_key(&ds->ds_deadlist,
+ dsl_dataset_phys(ds)->ds_prev_snap_txg, tx);
+
+ if (dsl_dataset_remap_deadlist_exists(ds)) {
+ uint64_t remap_deadlist_obj =
+ dsl_dataset_get_remap_deadlist_object(ds);
+ /*
+ * Move the remap_deadlist to the snapshot. The head
+ * will create a new remap deadlist on demand, from
+ * dsl_dataset_block_remapped().
+ */
+ dsl_dataset_unset_remap_deadlist_object(ds, tx);
+ dsl_deadlist_close(&ds->ds_remap_deadlist);
+
+ dmu_object_zapify(mos, dsobj, DMU_OT_DSL_DATASET, tx);
+ VERIFY0(zap_add(mos, dsobj, DS_FIELD_REMAP_DEADLIST,
+ sizeof (remap_deadlist_obj), 1, &remap_deadlist_obj, tx));
+ }
+
+ ASSERT3U(dsl_dataset_phys(ds)->ds_prev_snap_txg, <, tx->tx_txg);
+ dsl_dataset_phys(ds)->ds_prev_snap_obj = dsobj;
+ dsl_dataset_phys(ds)->ds_prev_snap_txg = crtxg;
+ dsl_dataset_phys(ds)->ds_unique_bytes = 0;
+
+ if (spa_version(dp->dp_spa) >= SPA_VERSION_UNIQUE_ACCURATE)
+ dsl_dataset_phys(ds)->ds_flags |= DS_FLAG_UNIQUE_ACCURATE;
+
+ VERIFY0(zap_add(mos, dsl_dataset_phys(ds)->ds_snapnames_zapobj,
+ snapname, 8, 1, &dsobj, tx));
+
+ if (ds->ds_prev)
+ dsl_dataset_rele(ds->ds_prev, ds);
+ VERIFY0(dsl_dataset_hold_obj(dp,
+ dsl_dataset_phys(ds)->ds_prev_snap_obj, ds, &ds->ds_prev));
+
+ dsl_scan_ds_snapshotted(ds, tx);
+
+ dsl_dir_snap_cmtime_update(ds->ds_dir);
+
+ spa_history_log_internal_ds(ds->ds_prev, "snapshot", tx, "");
+}
+
+void
+dsl_dataset_snapshot_sync(void *arg, dmu_tx_t *tx)
+{
+ dsl_dataset_snapshot_arg_t *ddsa = arg;
+ dsl_pool_t *dp = dmu_tx_pool(tx);
+ nvpair_t *pair;
+
+ for (pair = nvlist_next_nvpair(ddsa->ddsa_snaps, NULL);
+ pair != NULL; pair = nvlist_next_nvpair(ddsa->ddsa_snaps, pair)) {
+ dsl_dataset_t *ds;
+ char *name, *atp;
+ char dsname[ZFS_MAX_DATASET_NAME_LEN];
+
+ name = nvpair_name(pair);
+ atp = strchr(name, '@');
+ (void) strlcpy(dsname, name, atp - name + 1);
+ VERIFY0(dsl_dataset_hold(dp, dsname, FTAG, &ds));
+
+ dsl_dataset_snapshot_sync_impl(ds, atp + 1, tx);
+ if (ddsa->ddsa_props != NULL) {
+ dsl_props_set_sync_impl(ds->ds_prev,
+ ZPROP_SRC_LOCAL, ddsa->ddsa_props, tx);
+ }
+#if defined(__FreeBSD__) && defined(_KERNEL)
+ zvol_create_minors(dp->dp_spa, name);
+#endif
+ dsl_dataset_rele(ds, FTAG);
+ }
+}
+
+/*
+ * The snapshots must all be in the same pool.
+ * All-or-nothing: if there are any failures, nothing will be modified.
+ */
+int
+dsl_dataset_snapshot(nvlist_t *snaps, nvlist_t *props, nvlist_t *errors)
+{
+ dsl_dataset_snapshot_arg_t ddsa;
+ nvpair_t *pair;
+ boolean_t needsuspend;
+ int error;
+ spa_t *spa;
+ char *firstname;
+ nvlist_t *suspended = NULL;
+
+ pair = nvlist_next_nvpair(snaps, NULL);
+ if (pair == NULL)
+ return (0);
+ firstname = nvpair_name(pair);
+
+ error = spa_open(firstname, &spa, FTAG);
+ if (error != 0)
+ return (error);
+ needsuspend = (spa_version(spa) < SPA_VERSION_FAST_SNAP);
+ spa_close(spa, FTAG);
+
+ if (needsuspend) {
+ suspended = fnvlist_alloc();
+ for (pair = nvlist_next_nvpair(snaps, NULL); pair != NULL;
+ pair = nvlist_next_nvpair(snaps, pair)) {
+ char fsname[ZFS_MAX_DATASET_NAME_LEN];
+ char *snapname = nvpair_name(pair);
+ char *atp;
+ void *cookie;
+
+ atp = strchr(snapname, '@');
+ if (atp == NULL) {
+ error = SET_ERROR(EINVAL);
+ break;
+ }
+ (void) strlcpy(fsname, snapname, atp - snapname + 1);
+
+ error = zil_suspend(fsname, &cookie);
+ if (error != 0)
+ break;
+ fnvlist_add_uint64(suspended, fsname,
+ (uintptr_t)cookie);
+ }
+ }
+
+ ddsa.ddsa_snaps = snaps;
+ ddsa.ddsa_props = props;
+ ddsa.ddsa_errors = errors;
+ ddsa.ddsa_cr = CRED();
+
+ if (error == 0) {
+ error = dsl_sync_task(firstname, dsl_dataset_snapshot_check,
+ dsl_dataset_snapshot_sync, &ddsa,
+ fnvlist_num_pairs(snaps) * 3, ZFS_SPACE_CHECK_NORMAL);
+ }
+
+ if (suspended != NULL) {
+ for (pair = nvlist_next_nvpair(suspended, NULL); pair != NULL;
+ pair = nvlist_next_nvpair(suspended, pair)) {
+ zil_resume((void *)(uintptr_t)
+ fnvpair_value_uint64(pair));
+ }
+ fnvlist_free(suspended);
+ }
+
+ return (error);
+}
+
+typedef struct dsl_dataset_snapshot_tmp_arg {
+ const char *ddsta_fsname;
+ const char *ddsta_snapname;
+ minor_t ddsta_cleanup_minor;
+ const char *ddsta_htag;
+} dsl_dataset_snapshot_tmp_arg_t;
+
+static int
+dsl_dataset_snapshot_tmp_check(void *arg, dmu_tx_t *tx)
+{
+ dsl_dataset_snapshot_tmp_arg_t *ddsta = arg;
+ dsl_pool_t *dp = dmu_tx_pool(tx);
+ dsl_dataset_t *ds;
+ int error;
+
+ error = dsl_dataset_hold(dp, ddsta->ddsta_fsname, FTAG, &ds);
+ if (error != 0)
+ return (error);
+
+ /* NULL cred means no limit check for tmp snapshot */
+ error = dsl_dataset_snapshot_check_impl(ds, ddsta->ddsta_snapname,
+ tx, B_FALSE, 0, NULL);
+ if (error != 0) {
+ dsl_dataset_rele(ds, FTAG);
+ return (error);
+ }
+
+ if (spa_version(dp->dp_spa) < SPA_VERSION_USERREFS) {
+ dsl_dataset_rele(ds, FTAG);
+ return (SET_ERROR(ENOTSUP));
+ }
+ error = dsl_dataset_user_hold_check_one(NULL, ddsta->ddsta_htag,
+ B_TRUE, tx);
+ if (error != 0) {
+ dsl_dataset_rele(ds, FTAG);
+ return (error);
+ }
+
+ dsl_dataset_rele(ds, FTAG);
+ return (0);
+}
+
+static void
+dsl_dataset_snapshot_tmp_sync(void *arg, dmu_tx_t *tx)
+{
+ dsl_dataset_snapshot_tmp_arg_t *ddsta = arg;
+ dsl_pool_t *dp = dmu_tx_pool(tx);
+ dsl_dataset_t *ds;
+
+ VERIFY0(dsl_dataset_hold(dp, ddsta->ddsta_fsname, FTAG, &ds));
+
+ dsl_dataset_snapshot_sync_impl(ds, ddsta->ddsta_snapname, tx);
+ dsl_dataset_user_hold_sync_one(ds->ds_prev, ddsta->ddsta_htag,
+ ddsta->ddsta_cleanup_minor, gethrestime_sec(), tx);
+ dsl_destroy_snapshot_sync_impl(ds->ds_prev, B_TRUE, tx);
+
+ dsl_dataset_rele(ds, FTAG);
+}
+
+int
+dsl_dataset_snapshot_tmp(const char *fsname, const char *snapname,
+ minor_t cleanup_minor, const char *htag)
+{
+ dsl_dataset_snapshot_tmp_arg_t ddsta;
+ int error;
+ spa_t *spa;
+ boolean_t needsuspend;
+ void *cookie;
+
+ ddsta.ddsta_fsname = fsname;
+ ddsta.ddsta_snapname = snapname;
+ ddsta.ddsta_cleanup_minor = cleanup_minor;
+ ddsta.ddsta_htag = htag;
+
+ error = spa_open(fsname, &spa, FTAG);
+ if (error != 0)
+ return (error);
+ needsuspend = (spa_version(spa) < SPA_VERSION_FAST_SNAP);
+ spa_close(spa, FTAG);
+
+ if (needsuspend) {
+ error = zil_suspend(fsname, &cookie);
+ if (error != 0)
+ return (error);
+ }
+
+ error = dsl_sync_task(fsname, dsl_dataset_snapshot_tmp_check,
+ dsl_dataset_snapshot_tmp_sync, &ddsta, 3, ZFS_SPACE_CHECK_RESERVED);
+
+ if (needsuspend)
+ zil_resume(cookie);
+ return (error);
+}
+
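+/*
+ * Write out this dataset's dirty state for the current txg: persist
+ * ds_fsid_guid and any resumable-receive cursor (object/offset/bytes),
+ * sync the objset itself, and activate any per-dataset features that
+ * were flagged as needed during the txg.
+ */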
+void
+dsl_dataset_sync(dsl_dataset_t *ds, zio_t *zio, dmu_tx_t *tx)
+{
+ ASSERT(dmu_tx_is_syncing(tx));
+ ASSERT(ds->ds_objset != NULL);
+ ASSERT(dsl_dataset_phys(ds)->ds_next_snap_obj == 0);
+
+ /*
+ * in case we had to change ds_fsid_guid when we opened it,
+ * sync it out now.
+ */
+ dmu_buf_will_dirty(ds->ds_dbuf, tx);
+ dsl_dataset_phys(ds)->ds_fsid_guid = ds->ds_fsid_guid;
+
+ if (ds->ds_resume_bytes[tx->tx_txg & TXG_MASK] != 0) {
+ VERIFY0(zap_update(tx->tx_pool->dp_meta_objset,
+ ds->ds_object, DS_FIELD_RESUME_OBJECT, 8, 1,
+ &ds->ds_resume_object[tx->tx_txg & TXG_MASK], tx));
+ VERIFY0(zap_update(tx->tx_pool->dp_meta_objset,
+ ds->ds_object, DS_FIELD_RESUME_OFFSET, 8, 1,
+ &ds->ds_resume_offset[tx->tx_txg & TXG_MASK], tx));
+ VERIFY0(zap_update(tx->tx_pool->dp_meta_objset,
+ ds->ds_object, DS_FIELD_RESUME_BYTES, 8, 1,
+ &ds->ds_resume_bytes[tx->tx_txg & TXG_MASK], tx));
+ ds->ds_resume_object[tx->tx_txg & TXG_MASK] = 0;
+ ds->ds_resume_offset[tx->tx_txg & TXG_MASK] = 0;
+ ds->ds_resume_bytes[tx->tx_txg & TXG_MASK] = 0;
+ }
+
+ dmu_objset_sync(ds->ds_objset, zio, tx);
+
+ for (spa_feature_t f = 0; f < SPA_FEATURES; f++) {
+ if (ds->ds_feature_activation_needed[f]) {
+ if (ds->ds_feature_inuse[f])
+ continue;
+ dsl_dataset_activate_feature(ds->ds_object, f, tx);
+ ds->ds_feature_inuse[f] = B_TRUE;
+ }
+ }
+}
+
+static int
+deadlist_enqueue_cb(void *arg, const blkptr_t *bp, dmu_tx_t *tx)
+{
+ dsl_deadlist_t *dl = arg;
+ dsl_deadlist_insert(dl, bp, tx);
+ return (0);
+}
+
+void
+dsl_dataset_sync_done(dsl_dataset_t *ds, dmu_tx_t *tx)
+{
+ objset_t *os = ds->ds_objset;
+
+ bplist_iterate(&ds->ds_pending_deadlist,
+ deadlist_enqueue_cb, &ds->ds_deadlist, tx);
+
+ if (os->os_synced_dnodes != NULL) {
+ multilist_destroy(os->os_synced_dnodes);
+ os->os_synced_dnodes = NULL;
+ }
+
+ ASSERT(!dmu_objset_is_dirty(os, dmu_tx_get_txg(tx)));
+
+ dmu_buf_rele(ds->ds_dbuf, ds);
+}
+
+int
+get_clones_stat_impl(dsl_dataset_t *ds, nvlist_t *val)
+{
+ uint64_t count = 0;
+ objset_t *mos = ds->ds_dir->dd_pool->dp_meta_objset;
+ zap_cursor_t zc;
+ zap_attribute_t za;
+
+ ASSERT(dsl_pool_config_held(ds->ds_dir->dd_pool));
+
+ /*
+ * There may be missing entries in ds_next_clones_obj
+ * due to a bug in a previous version of the code.
+ * Only trust it if it has the right number of entries.
+ */
+ if (dsl_dataset_phys(ds)->ds_next_clones_obj != 0) {
+ VERIFY0(zap_count(mos, dsl_dataset_phys(ds)->ds_next_clones_obj,
+ &count));
+ }
+ if (count != dsl_dataset_phys(ds)->ds_num_children - 1) {
+ return (ENOENT);
+ }
+ for (zap_cursor_init(&zc, mos,
+ dsl_dataset_phys(ds)->ds_next_clones_obj);
+ zap_cursor_retrieve(&zc, &za) == 0;
+ zap_cursor_advance(&zc)) {
+ dsl_dataset_t *clone;
+ char buf[ZFS_MAX_DATASET_NAME_LEN];
+ VERIFY0(dsl_dataset_hold_obj(ds->ds_dir->dd_pool,
+ za.za_first_integer, FTAG, &clone));
+ dsl_dir_name(clone->ds_dir, buf);
+ fnvlist_add_boolean(val, buf);
+ dsl_dataset_rele(clone, FTAG);
+ }
+ zap_cursor_fini(&zc);
+ return (0);
+}
+
+void
+get_clones_stat(dsl_dataset_t *ds, nvlist_t *nv)
+{
+ nvlist_t *propval = fnvlist_alloc();
+ nvlist_t *val;
+
+ /*
+ * We use nvlist_alloc() instead of fnvlist_alloc() because the
+ * latter would allocate the list with NV_UNIQUE_NAME flag.
+ * As a result, every time a clone name is appended to the list
+ * it would be (linearly) searched for a duplicate name.
+ * We already know that all clone names must be unique and we
+ * want to avoid the quadratic complexity of double-checking that
+ * because we can have a large number of clones.
+ */
+ VERIFY0(nvlist_alloc(&val, 0, KM_SLEEP));
+
+ if (get_clones_stat_impl(ds, val) == 0) {
+ fnvlist_add_nvlist(propval, ZPROP_VALUE, val);
+ fnvlist_add_nvlist(nv, zfs_prop_to_name(ZFS_PROP_CLONES),
+ propval);
+ }
+
+ nvlist_free(val);
+ nvlist_free(propval);
+}
+
+/*
+ * Returns a string that represents the receive resume stats token. It should
+ * be freed with strfree().
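+ *
+ * The token (assembled below) has the form
+ * "<version>-<fletcher4 word 0>-<packed size>-<hex payload>", where the
+ * payload is a gzip-compressed, packed nvlist of the resume state.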
+ */
+char *
+get_receive_resume_stats_impl(dsl_dataset_t *ds)
+{
+ dsl_pool_t *dp = ds->ds_dir->dd_pool;
+
+ if (dsl_dataset_has_resume_receive_state(ds)) {
+ char *str;
+ void *packed;
+ uint8_t *compressed;
+ uint64_t val;
+ nvlist_t *token_nv = fnvlist_alloc();
+ size_t packed_size, compressed_size;
+
+ if (zap_lookup(dp->dp_meta_objset, ds->ds_object,
+ DS_FIELD_RESUME_FROMGUID, sizeof (val), 1, &val) == 0) {
+ fnvlist_add_uint64(token_nv, "fromguid", val);
+ }
+ if (zap_lookup(dp->dp_meta_objset, ds->ds_object,
+ DS_FIELD_RESUME_OBJECT, sizeof (val), 1, &val) == 0) {
+ fnvlist_add_uint64(token_nv, "object", val);
+ }
+ if (zap_lookup(dp->dp_meta_objset, ds->ds_object,
+ DS_FIELD_RESUME_OFFSET, sizeof (val), 1, &val) == 0) {
+ fnvlist_add_uint64(token_nv, "offset", val);
+ }
+ if (zap_lookup(dp->dp_meta_objset, ds->ds_object,
+ DS_FIELD_RESUME_BYTES, sizeof (val), 1, &val) == 0) {
+ fnvlist_add_uint64(token_nv, "bytes", val);
+ }
+ if (zap_lookup(dp->dp_meta_objset, ds->ds_object,
+ DS_FIELD_RESUME_TOGUID, sizeof (val), 1, &val) == 0) {
+ fnvlist_add_uint64(token_nv, "toguid", val);
+ }
+ char buf[256];
+ if (zap_lookup(dp->dp_meta_objset, ds->ds_object,
+ DS_FIELD_RESUME_TONAME, 1, sizeof (buf), buf) == 0) {
+ fnvlist_add_string(token_nv, "toname", buf);
+ }
+ if (zap_contains(dp->dp_meta_objset, ds->ds_object,
+ DS_FIELD_RESUME_LARGEBLOCK) == 0) {
+ fnvlist_add_boolean(token_nv, "largeblockok");
+ }
+ if (zap_contains(dp->dp_meta_objset, ds->ds_object,
+ DS_FIELD_RESUME_EMBEDOK) == 0) {
+ fnvlist_add_boolean(token_nv, "embedok");
+ }
+ if (zap_contains(dp->dp_meta_objset, ds->ds_object,
+ DS_FIELD_RESUME_COMPRESSOK) == 0) {
+ fnvlist_add_boolean(token_nv, "compressok");
+ }
+ packed = fnvlist_pack(token_nv, &packed_size);
+ fnvlist_free(token_nv);
+ compressed = kmem_alloc(packed_size, KM_SLEEP);
+
+ compressed_size = gzip_compress(packed, compressed,
+ packed_size, packed_size, 6);
+
+ zio_cksum_t cksum;
+ fletcher_4_native(compressed, compressed_size, NULL, &cksum);
+
+ str = kmem_alloc(compressed_size * 2 + 1, KM_SLEEP);
+ for (int i = 0; i < compressed_size; i++) {
+ (void) sprintf(str + i * 2, "%02x", compressed[i]);
+ }
+ str[compressed_size * 2] = '\0';
+ char *propval = kmem_asprintf("%u-%llx-%llx-%s",
+ ZFS_SEND_RESUME_TOKEN_VERSION,
+ (longlong_t)cksum.zc_word[0],
+ (longlong_t)packed_size, str);
+ kmem_free(packed, packed_size);
+ kmem_free(str, compressed_size * 2 + 1);
+ kmem_free(compressed, packed_size);
+ return (propval);
+ }
+ return (spa_strdup(""));
+}
+
+/*
+ * Returns a string that represents the receive resume stats token of the
+ * dataset's child. It should be freed with strfree().
+ */
+char *
+get_child_receive_stats(dsl_dataset_t *ds)
+{
+ char recvname[ZFS_MAX_DATASET_NAME_LEN + 6];
+ dsl_dataset_t *recv_ds;
+ dsl_dataset_name(ds, recvname);
+ if (strlcat(recvname, "/", sizeof (recvname)) <
+ sizeof (recvname) &&
+ strlcat(recvname, recv_clone_name, sizeof (recvname)) <
+ sizeof (recvname) &&
+ dsl_dataset_hold(ds->ds_dir->dd_pool, recvname, FTAG,
+ &recv_ds) == 0) {
+ char *propval = get_receive_resume_stats_impl(recv_ds);
+ dsl_dataset_rele(recv_ds, FTAG);
+ return (propval);
+ }
+ return (spa_strdup(""));
+}
+
+static void
+get_receive_resume_stats(dsl_dataset_t *ds, nvlist_t *nv)
+{
+ char *propval = get_receive_resume_stats_impl(ds);
+ if (strcmp(propval, "") != 0) {
+ dsl_prop_nvlist_add_string(nv,
+ ZFS_PROP_RECEIVE_RESUME_TOKEN, propval);
+ } else {
+ char *childval = get_child_receive_stats(ds);
+ if (strcmp(childval, "") != 0) {
+ dsl_prop_nvlist_add_string(nv,
+ ZFS_PROP_RECEIVE_RESUME_TOKEN, childval);
+ }
+ strfree(childval);
+ }
+ strfree(propval);
+}
+
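+/*
+ * The ratio is scaled by 100: e.g. a return value of 250 means the
+ * referenced data compresses 2.50x (uncompressed / compressed bytes).
+ */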
+uint64_t
+dsl_get_refratio(dsl_dataset_t *ds)
+{
+ uint64_t ratio = dsl_dataset_phys(ds)->ds_compressed_bytes == 0 ? 100 :
+ (dsl_dataset_phys(ds)->ds_uncompressed_bytes * 100 /
+ dsl_dataset_phys(ds)->ds_compressed_bytes);
+ return (ratio);
+}
+
+uint64_t
+dsl_get_logicalreferenced(dsl_dataset_t *ds)
+{
+ return (dsl_dataset_phys(ds)->ds_uncompressed_bytes);
+}
+
+uint64_t
+dsl_get_compressratio(dsl_dataset_t *ds)
+{
+ if (ds->ds_is_snapshot) {
+ return (dsl_get_refratio(ds));
+ } else {
+ dsl_dir_t *dd = ds->ds_dir;
+ mutex_enter(&dd->dd_lock);
+ uint64_t val = dsl_dir_get_compressratio(dd);
+ mutex_exit(&dd->dd_lock);
+ return (val);
+ }
+}
+
+uint64_t
+dsl_get_used(dsl_dataset_t *ds)
+{
+ if (ds->ds_is_snapshot) {
+ return (dsl_dataset_phys(ds)->ds_unique_bytes);
+ } else {
+ dsl_dir_t *dd = ds->ds_dir;
+ mutex_enter(&dd->dd_lock);
+ uint64_t val = dsl_dir_get_used(dd);
+ mutex_exit(&dd->dd_lock);
+ return (val);
+ }
+}
+
+uint64_t
+dsl_get_creation(dsl_dataset_t *ds)
+{
+ return (dsl_dataset_phys(ds)->ds_creation_time);
+}
+
+uint64_t
+dsl_get_creationtxg(dsl_dataset_t *ds)
+{
+ return (dsl_dataset_phys(ds)->ds_creation_txg);
+}
+
+uint64_t
+dsl_get_refquota(dsl_dataset_t *ds)
+{
+ return (ds->ds_quota);
+}
+
+uint64_t
+dsl_get_refreservation(dsl_dataset_t *ds)
+{
+ return (ds->ds_reserved);
+}
+
+uint64_t
+dsl_get_guid(dsl_dataset_t *ds)
+{
+ return (dsl_dataset_phys(ds)->ds_guid);
+}
+
+uint64_t
+dsl_get_unique(dsl_dataset_t *ds)
+{
+ return (dsl_dataset_phys(ds)->ds_unique_bytes);
+}
+
+uint64_t
+dsl_get_objsetid(dsl_dataset_t *ds)
+{
+ return (ds->ds_object);
+}
+
+uint64_t
+dsl_get_userrefs(dsl_dataset_t *ds)
+{
+ return (ds->ds_userrefs);
+}
+
+uint64_t
+dsl_get_defer_destroy(dsl_dataset_t *ds)
+{
+ return (DS_IS_DEFER_DESTROY(ds) ? 1 : 0);
+}
+
+uint64_t
+dsl_get_referenced(dsl_dataset_t *ds)
+{
+ return (dsl_dataset_phys(ds)->ds_referenced_bytes);
+}
+
+uint64_t
+dsl_get_numclones(dsl_dataset_t *ds)
+{
+ ASSERT(ds->ds_is_snapshot);
+ return (dsl_dataset_phys(ds)->ds_num_children - 1);
+}
+
+uint64_t
+dsl_get_inconsistent(dsl_dataset_t *ds)
+{
+ return ((dsl_dataset_phys(ds)->ds_flags & DS_FLAG_INCONSISTENT) ?
+ 1 : 0);
+}
+
+uint64_t
+dsl_get_available(dsl_dataset_t *ds)
+{
+ uint64_t refdbytes = dsl_get_referenced(ds);
+ uint64_t availbytes = dsl_dir_space_available(ds->ds_dir,
+ NULL, 0, TRUE);
+ if (ds->ds_reserved > dsl_dataset_phys(ds)->ds_unique_bytes) {
+ availbytes +=
+ ds->ds_reserved - dsl_dataset_phys(ds)->ds_unique_bytes;
+ }
+ if (ds->ds_quota != 0) {
+ /*
+ * Adjust available bytes according to refquota
+ */
+ if (refdbytes < ds->ds_quota) {
+ availbytes = MIN(availbytes,
+ ds->ds_quota - refdbytes);
+ } else {
+ availbytes = 0;
+ }
+ }
+ return (availbytes);
+}
+
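+/*
+ * Compute the "written" property: bytes written since the most recent
+ * snapshot, by diffing this dataset against its ds_prev_snap_obj via
+ * dsl_dataset_space_written().
+ */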
+int
+dsl_get_written(dsl_dataset_t *ds, uint64_t *written)
+{
+ dsl_pool_t *dp = ds->ds_dir->dd_pool;
+ dsl_dataset_t *prev;
+ int err = dsl_dataset_hold_obj(dp,
+ dsl_dataset_phys(ds)->ds_prev_snap_obj, FTAG, &prev);
+ if (err == 0) {
+ uint64_t comp, uncomp;
+ err = dsl_dataset_space_written(prev, ds, written,
+ &comp, &uncomp);
+ dsl_dataset_rele(prev, FTAG);
+ }
+ return (err);
+}
+
+/*
+ * 'snap' should be a buffer of size ZFS_MAX_DATASET_NAME_LEN.
+ */
+int
+dsl_get_prev_snap(dsl_dataset_t *ds, char *snap)
+{
+ dsl_pool_t *dp = ds->ds_dir->dd_pool;
+ if (ds->ds_prev != NULL && ds->ds_prev != dp->dp_origin_snap) {
+ dsl_dataset_name(ds->ds_prev, snap);
+ return (0);
+ } else {
+ return (ENOENT);
+ }
+}
+
+/*
+ * Returns the mountpoint property and source for the given dataset in the
+ * value and source buffers. The value buffer must be at least as large as
+ * ZAP_MAXVALUELEN and the source buffer at least as large as
+ * ZFS_MAX_DATASET_NAME_LEN.
+ * Returns 0 on success and an error on failure.
+ */
+int
+dsl_get_mountpoint(dsl_dataset_t *ds, const char *dsname, char *value,
+ char *source)
+{
+ int error;
+ dsl_pool_t *dp = ds->ds_dir->dd_pool;
+
+	/* Retrieve the mountpoint value stored in the ZAP object */
+ error = dsl_prop_get_ds(ds, zfs_prop_to_name(ZFS_PROP_MOUNTPOINT), 1,
+ ZAP_MAXVALUELEN, value, source);
+ if (error != 0) {
+ return (error);
+ }
+
+ /*
+ * Process the dsname and source to find the full mountpoint string.
+ * Can be skipped for 'legacy' or 'none'.
+ */
+ if (value[0] == '/') {
+ char *buf = kmem_alloc(ZAP_MAXVALUELEN, KM_SLEEP);
+ char *root = buf;
+ const char *relpath;
+
+ /*
+ * If we inherit the mountpoint, even from a dataset
+ * with a received value, the source will be the path of
+ * the dataset we inherit from. If source is
+ * ZPROP_SOURCE_VAL_RECVD, the received value is not
+ * inherited.
+ */
+ if (strcmp(source, ZPROP_SOURCE_VAL_RECVD) == 0) {
+ relpath = "";
+ } else {
+ ASSERT0(strncmp(dsname, source, strlen(source)));
+ relpath = dsname + strlen(source);
+ if (relpath[0] == '/')
+ relpath++;
+ }
+
+ spa_altroot(dp->dp_spa, root, ZAP_MAXVALUELEN);
+
+ /*
+ * Special case an alternate root of '/'. This will
+ * avoid having multiple leading slashes in the
+ * mountpoint path.
+ */
+ if (strcmp(root, "/") == 0)
+ root++;
+
+ /*
+ * If the mountpoint is '/' then skip over this
+ * if we are obtaining either an alternate root or
+ * an inherited mountpoint.
+ */
+ char *mnt = value;
+ if (value[1] == '\0' && (root[0] != '\0' ||
+ relpath[0] != '\0'))
+ mnt = value + 1;
+
+ if (relpath[0] == '\0') {
+ (void) snprintf(value, ZAP_MAXVALUELEN, "%s%s",
+ root, mnt);
+ } else {
+ (void) snprintf(value, ZAP_MAXVALUELEN, "%s%s%s%s",
+ root, mnt, relpath[0] == '@' ? "" : "/",
+ relpath);
+ }
+ kmem_free(buf, ZAP_MAXVALUELEN);
+ }
+
+ return (0);
+}
+
+void
+dsl_dataset_stats(dsl_dataset_t *ds, nvlist_t *nv)
+{
+ dsl_pool_t *dp = ds->ds_dir->dd_pool;
+
+ ASSERT(dsl_pool_config_held(dp));
+
+ dsl_prop_nvlist_add_uint64(nv, ZFS_PROP_REFRATIO,
+ dsl_get_refratio(ds));
+ dsl_prop_nvlist_add_uint64(nv, ZFS_PROP_LOGICALREFERENCED,
+ dsl_get_logicalreferenced(ds));
+ dsl_prop_nvlist_add_uint64(nv, ZFS_PROP_COMPRESSRATIO,
+ dsl_get_compressratio(ds));
+ dsl_prop_nvlist_add_uint64(nv, ZFS_PROP_USED,
+ dsl_get_used(ds));
+
+ if (ds->ds_is_snapshot) {
+ get_clones_stat(ds, nv);
+ } else {
+ char buf[ZFS_MAX_DATASET_NAME_LEN];
+ if (dsl_get_prev_snap(ds, buf) == 0)
+ dsl_prop_nvlist_add_string(nv, ZFS_PROP_PREV_SNAP,
+ buf);
+ dsl_dir_stats(ds->ds_dir, nv);
+ }
+
+ dsl_prop_nvlist_add_uint64(nv, ZFS_PROP_AVAILABLE,
+ dsl_get_available(ds));
+ dsl_prop_nvlist_add_uint64(nv, ZFS_PROP_REFERENCED,
+ dsl_get_referenced(ds));
+ dsl_prop_nvlist_add_uint64(nv, ZFS_PROP_CREATION,
+ dsl_get_creation(ds));
+ dsl_prop_nvlist_add_uint64(nv, ZFS_PROP_CREATETXG,
+ dsl_get_creationtxg(ds));
+ dsl_prop_nvlist_add_uint64(nv, ZFS_PROP_REFQUOTA,
+ dsl_get_refquota(ds));
+ dsl_prop_nvlist_add_uint64(nv, ZFS_PROP_REFRESERVATION,
+ dsl_get_refreservation(ds));
+ dsl_prop_nvlist_add_uint64(nv, ZFS_PROP_GUID,
+ dsl_get_guid(ds));
+ dsl_prop_nvlist_add_uint64(nv, ZFS_PROP_UNIQUE,
+ dsl_get_unique(ds));
+ dsl_prop_nvlist_add_uint64(nv, ZFS_PROP_OBJSETID,
+ dsl_get_objsetid(ds));
+ dsl_prop_nvlist_add_uint64(nv, ZFS_PROP_USERREFS,
+ dsl_get_userrefs(ds));
+ dsl_prop_nvlist_add_uint64(nv, ZFS_PROP_DEFER_DESTROY,
+ dsl_get_defer_destroy(ds));
+
+ if (dsl_dataset_phys(ds)->ds_prev_snap_obj != 0) {
+ uint64_t written;
+ if (dsl_get_written(ds, &written) == 0) {
+ dsl_prop_nvlist_add_uint64(nv, ZFS_PROP_WRITTEN,
+ written);
+ }
+ }
+
+ if (!dsl_dataset_is_snapshot(ds)) {
+ /*
+ * A failed "newfs" (e.g. full) resumable receive leaves
+ * the stats set on this dataset. Check here for the prop.
+ */
+ get_receive_resume_stats(ds, nv);
+
+ /*
+ * A failed incremental resumable receive leaves the
+ * stats set on our child named "%recv". Check the child
+ * for the prop.
+ */
+ /* 6 extra bytes for /%recv */
+ char recvname[ZFS_MAX_DATASET_NAME_LEN + 6];
+ dsl_dataset_t *recv_ds;
+ dsl_dataset_name(ds, recvname);
+ if (strlcat(recvname, "/", sizeof (recvname)) <
+ sizeof (recvname) &&
+ strlcat(recvname, recv_clone_name, sizeof (recvname)) <
+ sizeof (recvname) &&
+ dsl_dataset_hold(dp, recvname, FTAG, &recv_ds) == 0) {
+ get_receive_resume_stats(recv_ds, nv);
+ dsl_dataset_rele(recv_ds, FTAG);
+ }
+ }
+}
+
+void
+dsl_dataset_fast_stat(dsl_dataset_t *ds, dmu_objset_stats_t *stat)
+{
+ dsl_pool_t *dp = ds->ds_dir->dd_pool;
+ ASSERT(dsl_pool_config_held(dp));
+
+ stat->dds_creation_txg = dsl_get_creationtxg(ds);
+ stat->dds_inconsistent = dsl_get_inconsistent(ds);
+ stat->dds_guid = dsl_get_guid(ds);
+ stat->dds_origin[0] = '\0';
+ if (ds->ds_is_snapshot) {
+ stat->dds_is_snapshot = B_TRUE;
+ stat->dds_num_clones = dsl_get_numclones(ds);
+ } else {
+ stat->dds_is_snapshot = B_FALSE;
+ stat->dds_num_clones = 0;
+
+ if (dsl_dir_is_clone(ds->ds_dir)) {
+ dsl_dir_get_origin(ds->ds_dir, stat->dds_origin);
+ }
+ }
+}
+
+uint64_t
+dsl_dataset_fsid_guid(dsl_dataset_t *ds)
+{
+ return (ds->ds_fsid_guid);
+}
+
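+/*
+ * Report referenced and available bytes plus used and available object
+ * counts, adjusting the available figure for any refreservation and
+ * refquota in effect.
+ */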
+void
+dsl_dataset_space(dsl_dataset_t *ds,
+ uint64_t *refdbytesp, uint64_t *availbytesp,
+ uint64_t *usedobjsp, uint64_t *availobjsp)
+{
+ *refdbytesp = dsl_dataset_phys(ds)->ds_referenced_bytes;
+ *availbytesp = dsl_dir_space_available(ds->ds_dir, NULL, 0, TRUE);
+ if (ds->ds_reserved > dsl_dataset_phys(ds)->ds_unique_bytes)
+ *availbytesp +=
+ ds->ds_reserved - dsl_dataset_phys(ds)->ds_unique_bytes;
+ if (ds->ds_quota != 0) {
+ /*
+ * Adjust available bytes according to refquota
+ */
+ if (*refdbytesp < ds->ds_quota)
+ *availbytesp = MIN(*availbytesp,
+ ds->ds_quota - *refdbytesp);
+ else
+ *availbytesp = 0;
+ }
+ rrw_enter(&ds->ds_bp_rwlock, RW_READER, FTAG);
+ *usedobjsp = BP_GET_FILL(&dsl_dataset_phys(ds)->ds_bp);
+ rrw_exit(&ds->ds_bp_rwlock, FTAG);
+ *availobjsp = DN_MAX_OBJECT - *usedobjsp;
+}
+
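+/*
+ * Returns B_TRUE if the head has diverged from 'snap': the birth txg
+ * of the head's root block is compared first, then the meta-dnodes,
+ * so that a mere ZIL reset does not count as a modification.
+ */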
+boolean_t
+dsl_dataset_modified_since_snap(dsl_dataset_t *ds, dsl_dataset_t *snap)
+{
+ dsl_pool_t *dp = ds->ds_dir->dd_pool;
+ uint64_t birth;
+
+ ASSERT(dsl_pool_config_held(dp));
+ if (snap == NULL)
+ return (B_FALSE);
+ rrw_enter(&ds->ds_bp_rwlock, RW_READER, FTAG);
+ birth = dsl_dataset_get_blkptr(ds)->blk_birth;
+ rrw_exit(&ds->ds_bp_rwlock, FTAG);
+ if (birth > dsl_dataset_phys(snap)->ds_creation_txg) {
+ objset_t *os, *os_snap;
+ /*
+ * It may be that only the ZIL differs, because it was
+ * reset in the head. Don't count that as being
+ * modified.
+ */
+ if (dmu_objset_from_ds(ds, &os) != 0)
+ return (B_TRUE);
+ if (dmu_objset_from_ds(snap, &os_snap) != 0)
+ return (B_TRUE);
+ return (bcmp(&os->os_phys->os_meta_dnode,
+ &os_snap->os_phys->os_meta_dnode,
+ sizeof (os->os_phys->os_meta_dnode)) != 0);
+ }
+ return (B_FALSE);
+}
+
+typedef struct dsl_dataset_rename_snapshot_arg {
+ const char *ddrsa_fsname;
+ const char *ddrsa_oldsnapname;
+ const char *ddrsa_newsnapname;
+ boolean_t ddrsa_recursive;
+ dmu_tx_t *ddrsa_tx;
+} dsl_dataset_rename_snapshot_arg_t;
+
+/* ARGSUSED */
+static int
+dsl_dataset_rename_snapshot_check_impl(dsl_pool_t *dp,
+ dsl_dataset_t *hds, void *arg)
+{
+ dsl_dataset_rename_snapshot_arg_t *ddrsa = arg;
+ int error;
+ uint64_t val;
+
+ error = dsl_dataset_snap_lookup(hds, ddrsa->ddrsa_oldsnapname, &val);
+ if (error != 0) {
+ /* ignore nonexistent snapshots */
+ return (error == ENOENT ? 0 : error);
+ }
+
+ /* new name should not exist */
+ error = dsl_dataset_snap_lookup(hds, ddrsa->ddrsa_newsnapname, &val);
+ if (error == 0)
+ error = SET_ERROR(EEXIST);
+ else if (error == ENOENT)
+ error = 0;
+
+ /* dataset name + 1 for the "@" + the new snapshot name must fit */
+ if (dsl_dir_namelen(hds->ds_dir) + 1 +
+ strlen(ddrsa->ddrsa_newsnapname) >= ZFS_MAX_DATASET_NAME_LEN)
+ error = SET_ERROR(ENAMETOOLONG);
+
+ return (error);
+}
+
+static int
+dsl_dataset_rename_snapshot_check(void *arg, dmu_tx_t *tx)
+{
+ dsl_dataset_rename_snapshot_arg_t *ddrsa = arg;
+ dsl_pool_t *dp = dmu_tx_pool(tx);
+ dsl_dataset_t *hds;
+ int error;
+
+ error = dsl_dataset_hold(dp, ddrsa->ddrsa_fsname, FTAG, &hds);
+ if (error != 0)
+ return (error);
+
+ if (ddrsa->ddrsa_recursive) {
+ error = dmu_objset_find_dp(dp, hds->ds_dir->dd_object,
+ dsl_dataset_rename_snapshot_check_impl, ddrsa,
+ DS_FIND_CHILDREN);
+ } else {
+ error = dsl_dataset_rename_snapshot_check_impl(dp, hds, ddrsa);
+ }
+ dsl_dataset_rele(hds, FTAG);
+ return (error);
+}
+
+static int
+dsl_dataset_rename_snapshot_sync_impl(dsl_pool_t *dp,
+ dsl_dataset_t *hds, void *arg)
+{
+#ifdef __FreeBSD__
+#ifdef _KERNEL
+ char *oldname, *newname;
+#endif
+#endif
+ dsl_dataset_rename_snapshot_arg_t *ddrsa = arg;
+ dsl_dataset_t *ds;
+ uint64_t val;
+ dmu_tx_t *tx = ddrsa->ddrsa_tx;
+ int error;
+
+ error = dsl_dataset_snap_lookup(hds, ddrsa->ddrsa_oldsnapname, &val);
+ ASSERT(error == 0 || error == ENOENT);
+ if (error == ENOENT) {
+ /* ignore nonexistent snapshots */
+ return (0);
+ }
+
+ VERIFY0(dsl_dataset_hold_obj(dp, val, FTAG, &ds));
+
+ /* log before we change the name */
+ spa_history_log_internal_ds(ds, "rename", tx,
+ "-> @%s", ddrsa->ddrsa_newsnapname);
+
+ VERIFY0(dsl_dataset_snap_remove(hds, ddrsa->ddrsa_oldsnapname, tx,
+ B_FALSE));
+ mutex_enter(&ds->ds_lock);
+ (void) strcpy(ds->ds_snapname, ddrsa->ddrsa_newsnapname);
+ mutex_exit(&ds->ds_lock);
+ VERIFY0(zap_add(dp->dp_meta_objset,
+ dsl_dataset_phys(hds)->ds_snapnames_zapobj,
+ ds->ds_snapname, 8, 1, &ds->ds_object, tx));
+
+#ifdef __FreeBSD__
+#ifdef _KERNEL
+ oldname = kmem_alloc(ZFS_MAX_DATASET_NAME_LEN, KM_SLEEP);
+ newname = kmem_alloc(ZFS_MAX_DATASET_NAME_LEN, KM_SLEEP);
+ snprintf(oldname, ZFS_MAX_DATASET_NAME_LEN, "%s@%s",
+ ddrsa->ddrsa_fsname, ddrsa->ddrsa_oldsnapname);
+ snprintf(newname, ZFS_MAX_DATASET_NAME_LEN, "%s@%s",
+ ddrsa->ddrsa_fsname, ddrsa->ddrsa_newsnapname);
+ zfsvfs_update_fromname(oldname, newname);
+ zvol_rename_minors(dp->dp_spa, oldname, newname);
+ kmem_free(newname, ZFS_MAX_DATASET_NAME_LEN);
+ kmem_free(oldname, ZFS_MAX_DATASET_NAME_LEN);
+#endif
+#endif
+ dsl_dataset_rele(ds, FTAG);
+
+ return (0);
+}
+
+static void
+dsl_dataset_rename_snapshot_sync(void *arg, dmu_tx_t *tx)
+{
+ dsl_dataset_rename_snapshot_arg_t *ddrsa = arg;
+ dsl_pool_t *dp = dmu_tx_pool(tx);
+ dsl_dataset_t *hds;
+
+ VERIFY0(dsl_dataset_hold(dp, ddrsa->ddrsa_fsname, FTAG, &hds));
+ ddrsa->ddrsa_tx = tx;
+ if (ddrsa->ddrsa_recursive) {
+ VERIFY0(dmu_objset_find_dp(dp, hds->ds_dir->dd_object,
+ dsl_dataset_rename_snapshot_sync_impl, ddrsa,
+ DS_FIND_CHILDREN));
+ } else {
+ VERIFY0(dsl_dataset_rename_snapshot_sync_impl(dp, hds, ddrsa));
+ }
+ dsl_dataset_rele(hds, FTAG);
+}
+
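+/*
+ * Entry point for renaming a snapshot; runs the check/sync functions
+ * above as a single sync task.  A minimal sketch, with hypothetical
+ * names:
+ *
+ *	error = dsl_dataset_rename_snapshot("pool/fs",
+ *	    "oldsnap", "newsnap", B_FALSE);
+ */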
+int
+dsl_dataset_rename_snapshot(const char *fsname,
+ const char *oldsnapname, const char *newsnapname, boolean_t recursive)
+{
+ dsl_dataset_rename_snapshot_arg_t ddrsa;
+
+ ddrsa.ddrsa_fsname = fsname;
+ ddrsa.ddrsa_oldsnapname = oldsnapname;
+ ddrsa.ddrsa_newsnapname = newsnapname;
+ ddrsa.ddrsa_recursive = recursive;
+
+ return (dsl_sync_task(fsname, dsl_dataset_rename_snapshot_check,
+ dsl_dataset_rename_snapshot_sync, &ddrsa,
+ 1, ZFS_SPACE_CHECK_RESERVED));
+}
+
+/*
+ * If we're doing an ownership handoff, we need to make sure that there is
+ * only one long hold on the dataset. We're not allowed to change anything
+ * here, so we don't permanently release the long hold or the regular hold.
+ * We want to do this check only when syncing, to avoid the dataset
+ * unexpectedly going away when we release the long hold.
+ */
+static int
+dsl_dataset_handoff_check(dsl_dataset_t *ds, void *owner, dmu_tx_t *tx)
+{
+ boolean_t held;
+
+ if (!dmu_tx_is_syncing(tx))
+ return (0);
+
+ if (owner != NULL) {
+ VERIFY3P(ds->ds_owner, ==, owner);
+ dsl_dataset_long_rele(ds, owner);
+ }
+
+ held = dsl_dataset_long_held(ds);
+
+ if (owner != NULL)
+ dsl_dataset_long_hold(ds, owner);
+
+ if (held)
+ return (SET_ERROR(EBUSY));
+
+ return (0);
+}
+
+int
+dsl_dataset_rollback_check(void *arg, dmu_tx_t *tx)
+{
+ dsl_dataset_rollback_arg_t *ddra = arg;
+ dsl_pool_t *dp = dmu_tx_pool(tx);
+ dsl_dataset_t *ds;
+ int64_t unused_refres_delta;
+ int error;
+
+ error = dsl_dataset_hold(dp, ddra->ddra_fsname, FTAG, &ds);
+ if (error != 0)
+ return (error);
+
+ /* must not be a snapshot */
+ if (ds->ds_is_snapshot) {
+ dsl_dataset_rele(ds, FTAG);
+ return (SET_ERROR(EINVAL));
+ }
+
+ /* must have a most recent snapshot */
+ if (dsl_dataset_phys(ds)->ds_prev_snap_txg < TXG_INITIAL) {
+ dsl_dataset_rele(ds, FTAG);
+ return (SET_ERROR(ESRCH));
+ }
+
+ /*
+ * No rollback to a snapshot created in the current txg, because
+ * the rollback may dirty the dataset and create blocks that are
+ * not reachable from the rootbp while having a birth txg that
+ * falls into the snapshot's range.
+ */
+ if (dmu_tx_is_syncing(tx) &&
+ dsl_dataset_phys(ds)->ds_prev_snap_txg >= tx->tx_txg) {
+ dsl_dataset_rele(ds, FTAG);
+ return (SET_ERROR(EAGAIN));
+ }
+
+ /*
+ * If the expected target snapshot is specified, then check that
+ * the latest snapshot is it.
+ */
+ if (ddra->ddra_tosnap != NULL) {
+ dsl_dataset_t *snapds;
+
+ /* Check if the target snapshot exists at all. */
+ error = dsl_dataset_hold(dp, ddra->ddra_tosnap, FTAG, &snapds);
+ if (error != 0) {
+ /*
+ * ESRCH is used to signal that the target snapshot does
+ * not exist, while ENOENT is used to report that
+ * the rolled back dataset does not exist.
+ * ESRCH is also used to cover other cases where the
+ * target snapshot is not related to the dataset being
+ * rolled back such as being in a different pool.
+ */
+ if (error == ENOENT || error == EXDEV)
+ error = SET_ERROR(ESRCH);
+ dsl_dataset_rele(ds, FTAG);
+ return (error);
+ }
+ ASSERT(snapds->ds_is_snapshot);
+
+ /* Check if the snapshot is the latest snapshot indeed. */
+ if (snapds != ds->ds_prev) {
+ /*
+ * Distinguish between the case where the only problem
+ * is intervening snapshots (EEXIST) vs the snapshot
+ * not being a valid target for rollback (ESRCH).
+ */
+ if (snapds->ds_dir == ds->ds_dir ||
+ (dsl_dir_is_clone(ds->ds_dir) &&
+ dsl_dir_phys(ds->ds_dir)->dd_origin_obj ==
+ snapds->ds_object)) {
+ error = SET_ERROR(EEXIST);
+ } else {
+ error = SET_ERROR(ESRCH);
+ }
+ dsl_dataset_rele(snapds, FTAG);
+ dsl_dataset_rele(ds, FTAG);
+ return (error);
+ }
+ dsl_dataset_rele(snapds, FTAG);
+ }
+
+ /* must not have any bookmarks after the most recent snapshot */
+ nvlist_t *proprequest = fnvlist_alloc();
+ fnvlist_add_boolean(proprequest, zfs_prop_to_name(ZFS_PROP_CREATETXG));
+ nvlist_t *bookmarks = fnvlist_alloc();
+ error = dsl_get_bookmarks_impl(ds, proprequest, bookmarks);
+ fnvlist_free(proprequest);
+ if (error != 0) {
+ dsl_dataset_rele(ds, FTAG);
+ return (error);
+ }
+ for (nvpair_t *pair = nvlist_next_nvpair(bookmarks, NULL);
+ pair != NULL; pair = nvlist_next_nvpair(bookmarks, pair)) {
+ nvlist_t *valuenv =
+ fnvlist_lookup_nvlist(fnvpair_value_nvlist(pair),
+ zfs_prop_to_name(ZFS_PROP_CREATETXG));
+ uint64_t createtxg = fnvlist_lookup_uint64(valuenv, "value");
+ if (createtxg > dsl_dataset_phys(ds)->ds_prev_snap_txg) {
+ fnvlist_free(bookmarks);
+ dsl_dataset_rele(ds, FTAG);
+ return (SET_ERROR(EEXIST));
+ }
+ }
+ fnvlist_free(bookmarks);
+
+ error = dsl_dataset_handoff_check(ds, ddra->ddra_owner, tx);
+ if (error != 0) {
+ dsl_dataset_rele(ds, FTAG);
+ return (error);
+ }
+
+ /*
+ * Check if the snap we are rolling back to uses more than
+ * the refquota.
+ */
+ if (ds->ds_quota != 0 &&
+ dsl_dataset_phys(ds->ds_prev)->ds_referenced_bytes > ds->ds_quota) {
+ dsl_dataset_rele(ds, FTAG);
+ return (SET_ERROR(EDQUOT));
+ }
+
+ /*
+ * When we do the clone swap, we will temporarily use more space
+ * due to the refreservation (the head will no longer have any
+ * unique space, so the entire amount of the refreservation will need
+ * to be free). We will immediately destroy the clone, freeing
+ * this space, but the freeing happens over many txg's.
+ */
+ unused_refres_delta = (int64_t)MIN(ds->ds_reserved,
+ dsl_dataset_phys(ds)->ds_unique_bytes);
+
+ if (unused_refres_delta > 0 &&
+ unused_refres_delta >
+ dsl_dir_space_available(ds->ds_dir, NULL, 0, TRUE)) {
+ dsl_dataset_rele(ds, FTAG);
+ return (SET_ERROR(ENOSPC));
+ }
+
+ dsl_dataset_rele(ds, FTAG);
+ return (0);
+}
+
+void
+dsl_dataset_rollback_sync(void *arg, dmu_tx_t *tx)
+{
+ dsl_dataset_rollback_arg_t *ddra = arg;
+ dsl_pool_t *dp = dmu_tx_pool(tx);
+ dsl_dataset_t *ds, *clone;
+ uint64_t cloneobj;
+ char namebuf[ZFS_MAX_DATASET_NAME_LEN];
+
+ VERIFY0(dsl_dataset_hold(dp, ddra->ddra_fsname, FTAG, &ds));
+
+ dsl_dataset_name(ds->ds_prev, namebuf);
+ fnvlist_add_string(ddra->ddra_result, "target", namebuf);
+
+ cloneobj = dsl_dataset_create_sync(ds->ds_dir, "%rollback",
+ ds->ds_prev, DS_CREATE_FLAG_NODIRTY, kcred, tx);
+
+ VERIFY0(dsl_dataset_hold_obj(dp, cloneobj, FTAG, &clone));
+
+ dsl_dataset_clone_swap_sync_impl(clone, ds, tx);
+ dsl_dataset_zero_zil(ds, tx);
+
+ dsl_destroy_head_sync_impl(clone, tx);
+
+ dsl_dataset_rele(clone, FTAG);
+ dsl_dataset_rele(ds, FTAG);
+}
+
+/*
+ * Rolls back the given filesystem or volume to the most recent snapshot.
+ * The name of the most recent snapshot will be returned under key "target"
+ * in the result nvlist.
+ *
+ * If owner != NULL:
+ * - The existing dataset MUST be owned by the specified owner at entry
+ * - Upon return, dataset will still be held by the same owner, whether we
+ * succeed or not.
+ *
+ * This mode is required any time the existing filesystem is mounted. See
+ * notes above zfs_suspend_fs() for further details.
+ */
+int
+dsl_dataset_rollback(const char *fsname, const char *tosnap, void *owner,
+ nvlist_t *result)
+{
+ dsl_dataset_rollback_arg_t ddra;
+
+ ddra.ddra_fsname = fsname;
+ ddra.ddra_tosnap = tosnap;
+ ddra.ddra_owner = owner;
+ ddra.ddra_result = result;
+
+ return (dsl_sync_task(fsname, dsl_dataset_rollback_check,
+ dsl_dataset_rollback_sync, &ddra,
+ 1, ZFS_SPACE_CHECK_RESERVED));
+}
+
+struct promotenode {
+ list_node_t link;
+ dsl_dataset_t *ds;
+};
+
+static int snaplist_space(list_t *l, uint64_t mintxg, uint64_t *spacep);
+static int promote_hold(dsl_dataset_promote_arg_t *ddpa, dsl_pool_t *dp,
+ void *tag);
+static void promote_rele(dsl_dataset_promote_arg_t *ddpa, void *tag);
+
+int
+dsl_dataset_promote_check(void *arg, dmu_tx_t *tx)
+{
+ dsl_dataset_promote_arg_t *ddpa = arg;
+ dsl_pool_t *dp = dmu_tx_pool(tx);
+ dsl_dataset_t *hds;
+ struct promotenode *snap;
+ dsl_dataset_t *origin_ds;
+ int err;
+ uint64_t unused;
+ uint64_t ss_mv_cnt;
+ size_t max_snap_len;
+ boolean_t conflicting_snaps;
+
+ err = promote_hold(ddpa, dp, FTAG);
+ if (err != 0)
+ return (err);
+
+ hds = ddpa->ddpa_clone;
+ snap = list_head(&ddpa->shared_snaps);
+ origin_ds = snap->ds;
+ max_snap_len = MAXNAMELEN - strlen(ddpa->ddpa_clonename) - 1;
+
+ snap = list_head(&ddpa->origin_snaps);
+
+ if (dsl_dataset_phys(hds)->ds_flags & DS_FLAG_NOPROMOTE) {
+ promote_rele(ddpa, FTAG);
+ return (SET_ERROR(EXDEV));
+ }
+
+ /*
+ * Compute and check the amount of space to transfer. Since this is
+ * so expensive, skip it during the preliminary open-context check
+ * and only do it when the sync task actually runs.
+ */
+ if (!dmu_tx_is_syncing(tx)) {
+ promote_rele(ddpa, FTAG);
+ return (0);
+ }
+
+ /* compute origin's new unique space */
+ snap = list_tail(&ddpa->clone_snaps);
+ ASSERT3U(dsl_dataset_phys(snap->ds)->ds_prev_snap_obj, ==,
+ origin_ds->ds_object);
+ dsl_deadlist_space_range(&snap->ds->ds_deadlist,
+ dsl_dataset_phys(origin_ds)->ds_prev_snap_txg, UINT64_MAX,
+ &ddpa->unique, &unused, &unused);
+
+ /*
+ * Walk the snapshots that we are moving
+ *
+ * Compute space to transfer. Consider the incremental changes
+ * to used by each snapshot:
+ * (my used) = (prev's used) + (blocks born) - (blocks killed)
+ * So each snapshot gave birth to:
+ * (blocks born) = (my used) - (prev's used) + (blocks killed)
+ * So a sequence would look like:
+ * (uN - u(N-1) + kN) + ... + (u1 - u0 + k1) + (u0 - 0 + k0)
+ * Which simplifies to:
+ * uN + kN + k(N-1) + ... + k1 + k0
+ * Note however, if we stop before we reach the ORIGIN we get:
+ * uN + kN + k(N-1) + ... + kM - u(M-1)
+ */
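+ /*
+ * Illustrative example (hypothetical numbers, not from this change):
+ * with three snapshots moving and u2 = 5G, k2 = 1G, k1 = 2G, k0 = 1G,
+ * the telescoping sum above gives 5G + 1G + 2G + 1G = 9G to transfer.
+ * If the walk stopped at M = 1 short of the ORIGIN, we would instead
+ * subtract u0 (say 3G): 5G + 1G + 2G - 3G = 5G.
+ */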
+ conflicting_snaps = B_FALSE;
+ ss_mv_cnt = 0;
+ ddpa->used = dsl_dataset_phys(origin_ds)->ds_referenced_bytes;
+ ddpa->comp = dsl_dataset_phys(origin_ds)->ds_compressed_bytes;
+ ddpa->uncomp = dsl_dataset_phys(origin_ds)->ds_uncompressed_bytes;
+ for (snap = list_head(&ddpa->shared_snaps); snap;
+ snap = list_next(&ddpa->shared_snaps, snap)) {
+ uint64_t val, dlused, dlcomp, dluncomp;
+ dsl_dataset_t *ds = snap->ds;
+
+ ss_mv_cnt++;
+
+ /*
+ * If there are long holds, we won't be able to evict
+ * the objset.
+ */
+ if (dsl_dataset_long_held(ds)) {
+ err = SET_ERROR(EBUSY);
+ goto out;
+ }
+
+ /* Check that the snapshot name does not conflict */
+ VERIFY0(dsl_dataset_get_snapname(ds));
+ if (strlen(ds->ds_snapname) >= max_snap_len) {
+ err = SET_ERROR(ENAMETOOLONG);
+ goto out;
+ }
+ err = dsl_dataset_snap_lookup(hds, ds->ds_snapname, &val);
+ if (err == 0) {
+ fnvlist_add_boolean(ddpa->err_ds,
+ snap->ds->ds_snapname);
+ conflicting_snaps = B_TRUE;
+ } else if (err != ENOENT) {
+ goto out;
+ }
+
+ /* The very first snapshot does not have a deadlist */
+ if (dsl_dataset_phys(ds)->ds_prev_snap_obj == 0)
+ continue;
+
+ dsl_deadlist_space(&ds->ds_deadlist,
+ &dlused, &dlcomp, &dluncomp);
+ ddpa->used += dlused;
+ ddpa->comp += dlcomp;
+ ddpa->uncomp += dluncomp;
+ }
+
+ /*
+ * In order to return the full list of conflicting snapshots, we check
+ * whether there was a conflict after traversing all of them.
+ */
+ if (conflicting_snaps) {
+ err = SET_ERROR(EEXIST);
+ goto out;
+ }
+
+ /*
+ * If we are a clone of a clone then we never reached ORIGIN,
+ * so we need to subtract out the clone origin's used space.
+ */
+ if (ddpa->origin_origin) {
+ ddpa->used -=
+ dsl_dataset_phys(ddpa->origin_origin)->ds_referenced_bytes;
+ ddpa->comp -=
+ dsl_dataset_phys(ddpa->origin_origin)->ds_compressed_bytes;
+ ddpa->uncomp -=
+ dsl_dataset_phys(ddpa->origin_origin)->
+ ds_uncompressed_bytes;
+ }
+
+ /* Check that there is enough space and limit headroom here */
+ err = dsl_dir_transfer_possible(origin_ds->ds_dir, hds->ds_dir,
+ 0, ss_mv_cnt, ddpa->used, ddpa->cr);
+ if (err != 0)
+ goto out;
+
+ /*
+ * Compute the amounts of space that will be used by snapshots
+ * after the promotion (for both origin and clone). For each,
+ * it is the amount of space that will be on all of their
+ * deadlists (that was not born before their new origin).
+ */
+ if (dsl_dir_phys(hds->ds_dir)->dd_flags & DD_FLAG_USED_BREAKDOWN) {
+ uint64_t space;
+
+ /*
+ * Note, typically this will not be a clone of a clone,
+ * so dd_origin_txg will be < TXG_INITIAL, so
+ * these snaplist_space() -> dsl_deadlist_space_range()
+ * calls will be fast because they do not have to
+ * iterate over all bps.
+ */
+ snap = list_head(&ddpa->origin_snaps);
+ err = snaplist_space(&ddpa->shared_snaps,
+ snap->ds->ds_dir->dd_origin_txg, &ddpa->cloneusedsnap);
+ if (err != 0)
+ goto out;
+
+ err = snaplist_space(&ddpa->clone_snaps,
+ snap->ds->ds_dir->dd_origin_txg, &space);
+ if (err != 0)
+ goto out;
+ ddpa->cloneusedsnap += space;
+ }
+ if (dsl_dir_phys(origin_ds->ds_dir)->dd_flags &
+ DD_FLAG_USED_BREAKDOWN) {
+ err = snaplist_space(&ddpa->origin_snaps,
+ dsl_dataset_phys(origin_ds)->ds_creation_txg,
+ &ddpa->originusedsnap);
+ if (err != 0)
+ goto out;
+ }
+
+out:
+ promote_rele(ddpa, FTAG);
+ return (err);
+}
+
+void
+dsl_dataset_promote_sync(void *arg, dmu_tx_t *tx)
+{
+ dsl_dataset_promote_arg_t *ddpa = arg;
+ dsl_pool_t *dp = dmu_tx_pool(tx);
+ dsl_dataset_t *hds;
+ struct promotenode *snap;
+ dsl_dataset_t *origin_ds;
+ dsl_dataset_t *origin_head;
+ dsl_dir_t *dd;
+ dsl_dir_t *odd = NULL;
+ uint64_t oldnext_obj;
+ int64_t delta;
+#if defined(__FreeBSD__) && defined(_KERNEL)
+ char *oldname, *newname;
+#endif
+
+ VERIFY0(promote_hold(ddpa, dp, FTAG));
+ hds = ddpa->ddpa_clone;
+
+ ASSERT0(dsl_dataset_phys(hds)->ds_flags & DS_FLAG_NOPROMOTE);
+
+ snap = list_head(&ddpa->shared_snaps);
+ origin_ds = snap->ds;
+ dd = hds->ds_dir;
+
+ snap = list_head(&ddpa->origin_snaps);
+ origin_head = snap->ds;
+
+ /*
+ * We need to explicitly open odd, since origin_ds's dd will be
+ * changing.
+ */
+ VERIFY0(dsl_dir_hold_obj(dp, origin_ds->ds_dir->dd_object,
+ NULL, FTAG, &odd));
+
+ /* change origin's next snap */
+ dmu_buf_will_dirty(origin_ds->ds_dbuf, tx);
+ oldnext_obj = dsl_dataset_phys(origin_ds)->ds_next_snap_obj;
+ snap = list_tail(&ddpa->clone_snaps);
+ ASSERT3U(dsl_dataset_phys(snap->ds)->ds_prev_snap_obj, ==,
+ origin_ds->ds_object);
+ dsl_dataset_phys(origin_ds)->ds_next_snap_obj = snap->ds->ds_object;
+
+ /* change the origin's next clone */
+ if (dsl_dataset_phys(origin_ds)->ds_next_clones_obj) {
+ dsl_dataset_remove_from_next_clones(origin_ds,
+ snap->ds->ds_object, tx);
+ VERIFY0(zap_add_int(dp->dp_meta_objset,
+ dsl_dataset_phys(origin_ds)->ds_next_clones_obj,
+ oldnext_obj, tx));
+ }
+
+ /* change origin */
+ dmu_buf_will_dirty(dd->dd_dbuf, tx);
+ ASSERT3U(dsl_dir_phys(dd)->dd_origin_obj, ==, origin_ds->ds_object);
+ dsl_dir_phys(dd)->dd_origin_obj = dsl_dir_phys(odd)->dd_origin_obj;
+ dd->dd_origin_txg = origin_head->ds_dir->dd_origin_txg;
+ dmu_buf_will_dirty(odd->dd_dbuf, tx);
+ dsl_dir_phys(odd)->dd_origin_obj = origin_ds->ds_object;
+ origin_head->ds_dir->dd_origin_txg =
+ dsl_dataset_phys(origin_ds)->ds_creation_txg;
+
+ /* change dd_clone entries */
+ if (spa_version(dp->dp_spa) >= SPA_VERSION_DIR_CLONES) {
+ VERIFY0(zap_remove_int(dp->dp_meta_objset,
+ dsl_dir_phys(odd)->dd_clones, hds->ds_object, tx));
+ VERIFY0(zap_add_int(dp->dp_meta_objset,
+ dsl_dir_phys(ddpa->origin_origin->ds_dir)->dd_clones,
+ hds->ds_object, tx));
+
+ VERIFY0(zap_remove_int(dp->dp_meta_objset,
+ dsl_dir_phys(ddpa->origin_origin->ds_dir)->dd_clones,
+ origin_head->ds_object, tx));
+ if (dsl_dir_phys(dd)->dd_clones == 0) {
+ dsl_dir_phys(dd)->dd_clones =
+ zap_create(dp->dp_meta_objset, DMU_OT_DSL_CLONES,
+ DMU_OT_NONE, 0, tx);
+ }
+ VERIFY0(zap_add_int(dp->dp_meta_objset,
+ dsl_dir_phys(dd)->dd_clones, origin_head->ds_object, tx));
+ }
+
+#if defined(__FreeBSD__) && defined(_KERNEL)
+ oldname = kmem_alloc(ZFS_MAX_DATASET_NAME_LEN, KM_SLEEP);
+ newname = kmem_alloc(ZFS_MAX_DATASET_NAME_LEN, KM_SLEEP);
+#endif
+
+ /* move snapshots to this dir */
+ for (snap = list_head(&ddpa->shared_snaps); snap;
+ snap = list_next(&ddpa->shared_snaps, snap)) {
+ dsl_dataset_t *ds = snap->ds;
+
+ /*
+ * Property callbacks are registered to a particular
+ * dsl_dir. Since ours is changing, evict the objset
+ * so that they will be unregistered from the old dsl_dir.
+ */
+ if (ds->ds_objset) {
+ dmu_objset_evict(ds->ds_objset);
+ ds->ds_objset = NULL;
+ }
+
+#if defined(__FreeBSD__) && defined(_KERNEL)
+ dsl_dataset_name(ds, oldname);
+#endif
+
+ /* move snap name entry */
+ VERIFY0(dsl_dataset_get_snapname(ds));
+ VERIFY0(dsl_dataset_snap_remove(origin_head,
+ ds->ds_snapname, tx, B_TRUE));
+ VERIFY0(zap_add(dp->dp_meta_objset,
+ dsl_dataset_phys(hds)->ds_snapnames_zapobj, ds->ds_snapname,
+ 8, 1, &ds->ds_object, tx));
+ dsl_fs_ss_count_adjust(hds->ds_dir, 1,
+ DD_FIELD_SNAPSHOT_COUNT, tx);
+
+ /* change containing dsl_dir */
+ dmu_buf_will_dirty(ds->ds_dbuf, tx);
+ ASSERT3U(dsl_dataset_phys(ds)->ds_dir_obj, ==, odd->dd_object);
+ dsl_dataset_phys(ds)->ds_dir_obj = dd->dd_object;
+ ASSERT3P(ds->ds_dir, ==, odd);
+ dsl_dir_rele(ds->ds_dir, ds);
+ VERIFY0(dsl_dir_hold_obj(dp, dd->dd_object,
+ NULL, ds, &ds->ds_dir));
+
+#if defined(__FreeBSD__) && defined(_KERNEL)
+ dsl_dataset_name(ds, newname);
+ zfsvfs_update_fromname(oldname, newname);
+ zvol_rename_minors(dp->dp_spa, oldname, newname);
+#endif
+
+ /* move any clone references */
+ if (dsl_dataset_phys(ds)->ds_next_clones_obj &&
+ spa_version(dp->dp_spa) >= SPA_VERSION_DIR_CLONES) {
+ zap_cursor_t zc;
+ zap_attribute_t za;
+
+ for (zap_cursor_init(&zc, dp->dp_meta_objset,
+ dsl_dataset_phys(ds)->ds_next_clones_obj);
+ zap_cursor_retrieve(&zc, &za) == 0;
+ zap_cursor_advance(&zc)) {
+ dsl_dataset_t *cnds;
+ uint64_t o;
+
+ if (za.za_first_integer == oldnext_obj) {
+ /*
+ * We've already moved the
+ * origin's reference.
+ */
+ continue;
+ }
+
+ VERIFY0(dsl_dataset_hold_obj(dp,
+ za.za_first_integer, FTAG, &cnds));
+ o = dsl_dir_phys(cnds->ds_dir)->
+ dd_head_dataset_obj;
+
+ VERIFY0(zap_remove_int(dp->dp_meta_objset,
+ dsl_dir_phys(odd)->dd_clones, o, tx));
+ VERIFY0(zap_add_int(dp->dp_meta_objset,
+ dsl_dir_phys(dd)->dd_clones, o, tx));
+ dsl_dataset_rele(cnds, FTAG);
+ }
+ zap_cursor_fini(&zc);
+ }
+
+ ASSERT(!dsl_prop_hascb(ds));
+ }
+
+#if defined(__FreeBSD__) && defined(_KERNEL)
+ kmem_free(newname, ZFS_MAX_DATASET_NAME_LEN);
+ kmem_free(oldname, ZFS_MAX_DATASET_NAME_LEN);
+#endif
+ /*
+ * Change space accounting.
+ * Note, ddpa->*usedsnap and dd_used_breakdown[SNAP] will either
+ * both be valid, or both be 0 (resulting in delta == 0). This
+ * is true for each of {clone,origin} independently.
+ */
+
+ delta = ddpa->cloneusedsnap -
+ dsl_dir_phys(dd)->dd_used_breakdown[DD_USED_SNAP];
+ ASSERT3S(delta, >=, 0);
+ ASSERT3U(ddpa->used, >=, delta);
+ dsl_dir_diduse_space(dd, DD_USED_SNAP, delta, 0, 0, tx);
+ dsl_dir_diduse_space(dd, DD_USED_HEAD,
+ ddpa->used - delta, ddpa->comp, ddpa->uncomp, tx);
+
+ delta = ddpa->originusedsnap -
+ dsl_dir_phys(odd)->dd_used_breakdown[DD_USED_SNAP];
+ ASSERT3S(delta, <=, 0);
+ ASSERT3U(ddpa->used, >=, -delta);
+ dsl_dir_diduse_space(odd, DD_USED_SNAP, delta, 0, 0, tx);
+ dsl_dir_diduse_space(odd, DD_USED_HEAD,
+ -ddpa->used - delta, -ddpa->comp, -ddpa->uncomp, tx);
+
+ dsl_dataset_phys(origin_ds)->ds_unique_bytes = ddpa->unique;
+
+ /* log history record */
+ spa_history_log_internal_ds(hds, "promote", tx, "");
+
+ dsl_dir_rele(odd, FTAG);
+ promote_rele(ddpa, FTAG);
+}
+
+/*
+ * Make a list of dsl_dataset_t's for the snapshots between first_obj
+ * (exclusive) and last_obj (inclusive). The list will be in reverse
+ * order (last_obj will be the list_head()). If first_obj == 0, do all
+ * snapshots back to this dataset's origin.
+ */
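+/*
+ * For example (hypothetical object numbers): given a snapshot chain
+ * 10 -> 20 -> 30 -> 40 (oldest to newest) with first_obj = 10 and
+ * last_obj = 40, the resulting list is 40, 30, 20 from head to tail;
+ * 10 itself is excluded because first_obj is exclusive. With
+ * first_obj == 0 the walk continues back to this dataset's origin.
+ */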
+static int
+snaplist_make(dsl_pool_t *dp,
+ uint64_t first_obj, uint64_t last_obj, list_t *l, void *tag)
+{
+ uint64_t obj = last_obj;
+
+ list_create(l, sizeof (struct promotenode),
+ offsetof(struct promotenode, link));
+
+ while (obj != first_obj) {
+ dsl_dataset_t *ds;
+ struct promotenode *snap;
+ int err;
+
+ err = dsl_dataset_hold_obj(dp, obj, tag, &ds);
+ ASSERT(err != ENOENT);
+ if (err != 0)
+ return (err);
+
+ if (first_obj == 0)
+ first_obj = dsl_dir_phys(ds->ds_dir)->dd_origin_obj;
+
+ snap = kmem_alloc(sizeof (*snap), KM_SLEEP);
+ snap->ds = ds;
+ list_insert_tail(l, snap);
+ obj = dsl_dataset_phys(ds)->ds_prev_snap_obj;
+ }
+
+ return (0);
+}
+
+static int
+snaplist_space(list_t *l, uint64_t mintxg, uint64_t *spacep)
+{
+ struct promotenode *snap;
+
+ *spacep = 0;
+ for (snap = list_head(l); snap; snap = list_next(l, snap)) {
+ uint64_t used, comp, uncomp;
+ dsl_deadlist_space_range(&snap->ds->ds_deadlist,
+ mintxg, UINT64_MAX, &used, &comp, &uncomp);
+ *spacep += used;
+ }
+ return (0);
+}
+
+static void
+snaplist_destroy(list_t *l, void *tag)
+{
+ struct promotenode *snap;
+
+ if (l == NULL || !list_link_active(&l->list_head))
+ return;
+
+ while ((snap = list_tail(l)) != NULL) {
+ list_remove(l, snap);
+ dsl_dataset_rele(snap->ds, tag);
+ kmem_free(snap, sizeof (*snap));
+ }
+ list_destroy(l);
+}
+
+static int
+promote_hold(dsl_dataset_promote_arg_t *ddpa, dsl_pool_t *dp, void *tag)
+{
+ int error;
+ dsl_dir_t *dd;
+ struct promotenode *snap;
+
+ error = dsl_dataset_hold(dp, ddpa->ddpa_clonename, tag,
+ &ddpa->ddpa_clone);
+ if (error != 0)
+ return (error);
+ dd = ddpa->ddpa_clone->ds_dir;
+
+ if (ddpa->ddpa_clone->ds_is_snapshot ||
+ !dsl_dir_is_clone(dd)) {
+ dsl_dataset_rele(ddpa->ddpa_clone, tag);
+ return (SET_ERROR(EINVAL));
+ }
+
+ error = snaplist_make(dp, 0, dsl_dir_phys(dd)->dd_origin_obj,
+ &ddpa->shared_snaps, tag);
+ if (error != 0)
+ goto out;
+
+ error = snaplist_make(dp, 0, ddpa->ddpa_clone->ds_object,
+ &ddpa->clone_snaps, tag);
+ if (error != 0)
+ goto out;
+
+ snap = list_head(&ddpa->shared_snaps);
+ ASSERT3U(snap->ds->ds_object, ==, dsl_dir_phys(dd)->dd_origin_obj);
+ error = snaplist_make(dp, dsl_dir_phys(dd)->dd_origin_obj,
+ dsl_dir_phys(snap->ds->ds_dir)->dd_head_dataset_obj,
+ &ddpa->origin_snaps, tag);
+ if (error != 0)
+ goto out;
+
+ if (dsl_dir_phys(snap->ds->ds_dir)->dd_origin_obj != 0) {
+ error = dsl_dataset_hold_obj(dp,
+ dsl_dir_phys(snap->ds->ds_dir)->dd_origin_obj,
+ tag, &ddpa->origin_origin);
+ if (error != 0)
+ goto out;
+ }
+out:
+ if (error != 0)
+ promote_rele(ddpa, tag);
+ return (error);
+}
+
+static void
+promote_rele(dsl_dataset_promote_arg_t *ddpa, void *tag)
+{
+ snaplist_destroy(&ddpa->shared_snaps, tag);
+ snaplist_destroy(&ddpa->clone_snaps, tag);
+ snaplist_destroy(&ddpa->origin_snaps, tag);
+ if (ddpa->origin_origin != NULL)
+ dsl_dataset_rele(ddpa->origin_origin, tag);
+ dsl_dataset_rele(ddpa->ddpa_clone, tag);
+}
+
+/*
+ * Promote a clone.
+ *
+ * If it fails due to a conflicting snapshot name, "conflsnap" will be filled
+ * in with the name. (It must be at least ZFS_MAX_DATASET_NAME_LEN bytes long.)
+ */
+int
+dsl_dataset_promote(const char *name, char *conflsnap)
+{
+ dsl_dataset_promote_arg_t ddpa = { 0 };
+ uint64_t numsnaps;
+ int error;
+ nvpair_t *snap_pair;
+ objset_t *os;
+
+ /*
+ * We will modify space proportional to the number of
+ * snapshots. Compute numsnaps.
+ */
+ error = dmu_objset_hold(name, FTAG, &os);
+ if (error != 0)
+ return (error);
+ error = zap_count(dmu_objset_pool(os)->dp_meta_objset,
+ dsl_dataset_phys(dmu_objset_ds(os))->ds_snapnames_zapobj,
+ &numsnaps);
+ dmu_objset_rele(os, FTAG);
+ if (error != 0)
+ return (error);
+
+ ddpa.ddpa_clonename = name;
+ ddpa.err_ds = fnvlist_alloc();
+ ddpa.cr = CRED();
+
+ error = dsl_sync_task(name, dsl_dataset_promote_check,
+ dsl_dataset_promote_sync, &ddpa,
+ 2 + numsnaps, ZFS_SPACE_CHECK_RESERVED);
+
+ /*
+ * Return the first conflicting snapshot found.
+ */
+ snap_pair = nvlist_next_nvpair(ddpa.err_ds, NULL);
+ if (snap_pair != NULL && conflsnap != NULL)
+ (void) strcpy(conflsnap, nvpair_name(snap_pair));
+
+ fnvlist_free(ddpa.err_ds);
+ return (error);
+}
+
+int
+dsl_dataset_clone_swap_check_impl(dsl_dataset_t *clone,
+ dsl_dataset_t *origin_head, boolean_t force, void *owner, dmu_tx_t *tx)
+{
+ /*
+ * "slack" factor for received datasets with refquota set on them.
+ * See the bottom of this function for details on its use.
+ */
+ uint64_t refquota_slack = DMU_MAX_ACCESS * spa_asize_inflation;
+ int64_t unused_refres_delta;
+
+ /* they should both be heads */
+ if (clone->ds_is_snapshot ||
+ origin_head->ds_is_snapshot)
+ return (SET_ERROR(EINVAL));
+
+ /* if we are not forcing, the branch point should be just before them */
+ if (!force && clone->ds_prev != origin_head->ds_prev)
+ return (SET_ERROR(EINVAL));
+
+ /* clone should be the clone (unless they are unrelated) */
+ if (clone->ds_prev != NULL &&
+ clone->ds_prev != clone->ds_dir->dd_pool->dp_origin_snap &&
+ origin_head->ds_dir != clone->ds_prev->ds_dir)
+ return (SET_ERROR(EINVAL));
+
+ /* the clone should be a child of the origin */
+ if (clone->ds_dir->dd_parent != origin_head->ds_dir)
+ return (SET_ERROR(EINVAL));
+
+ /* origin_head shouldn't be modified unless 'force' */
+ if (!force &&
+ dsl_dataset_modified_since_snap(origin_head, origin_head->ds_prev))
+ return (SET_ERROR(ETXTBSY));
+
+ /* origin_head should have no long holds (e.g. is not mounted) */
+ if (dsl_dataset_handoff_check(origin_head, owner, tx))
+ return (SET_ERROR(EBUSY));
+
+ /* check amount of any unconsumed refreservation */
+ unused_refres_delta =
+ (int64_t)MIN(origin_head->ds_reserved,
+ dsl_dataset_phys(origin_head)->ds_unique_bytes) -
+ (int64_t)MIN(origin_head->ds_reserved,
+ dsl_dataset_phys(clone)->ds_unique_bytes);
+
+ if (unused_refres_delta > 0 &&
+ unused_refres_delta >
+ dsl_dir_space_available(origin_head->ds_dir, NULL, 0, TRUE))
+ return (SET_ERROR(ENOSPC));
+
+ /*
+ * The clone can't be too much over the head's refquota.
+ *
+ * To ensure that the entire refquota can be used, we allow one
+ * transaction to exceed the refquota. Therefore, this check
+ * needs to also allow for the space referenced to be more than the
+ * refquota. The maximum amount of space that one transaction can use
+ * on disk is DMU_MAX_ACCESS * spa_asize_inflation. Allowing this
+ * overage ensures that we are able to receive a filesystem that
+ * exceeds the refquota on the source system.
+ *
+ * So that overage is the refquota_slack we use below.
+ */
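+ /*
+ * Worked example (hypothetical sizes): with a 10G refquota and a
+ * refquota_slack of, say, 1.5G on some system, a received clone
+ * referencing 11G passes this check, while one referencing 12G
+ * fails with EDQUOT.
+ */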
+ if (origin_head->ds_quota != 0 &&
+ dsl_dataset_phys(clone)->ds_referenced_bytes >
+ origin_head->ds_quota + refquota_slack)
+ return (SET_ERROR(EDQUOT));
+
+ return (0);
+}
+
+static void
+dsl_dataset_swap_remap_deadlists(dsl_dataset_t *clone,
+ dsl_dataset_t *origin, dmu_tx_t *tx)
+{
+ uint64_t clone_remap_dl_obj, origin_remap_dl_obj;
+ dsl_pool_t *dp = dmu_tx_pool(tx);
+
+ ASSERT(dsl_pool_sync_context(dp));
+
+ clone_remap_dl_obj = dsl_dataset_get_remap_deadlist_object(clone);
+ origin_remap_dl_obj = dsl_dataset_get_remap_deadlist_object(origin);
+
+ if (clone_remap_dl_obj != 0) {
+ dsl_deadlist_close(&clone->ds_remap_deadlist);
+ dsl_dataset_unset_remap_deadlist_object(clone, tx);
+ }
+ if (origin_remap_dl_obj != 0) {
+ dsl_deadlist_close(&origin->ds_remap_deadlist);
+ dsl_dataset_unset_remap_deadlist_object(origin, tx);
+ }
+
+ if (clone_remap_dl_obj != 0) {
+ dsl_dataset_set_remap_deadlist_object(origin,
+ clone_remap_dl_obj, tx);
+ dsl_deadlist_open(&origin->ds_remap_deadlist,
+ dp->dp_meta_objset, clone_remap_dl_obj);
+ }
+ if (origin_remap_dl_obj != 0) {
+ dsl_dataset_set_remap_deadlist_object(clone,
+ origin_remap_dl_obj, tx);
+ dsl_deadlist_open(&clone->ds_remap_deadlist,
+ dp->dp_meta_objset, origin_remap_dl_obj);
+ }
+}
+
+void
+dsl_dataset_clone_swap_sync_impl(dsl_dataset_t *clone,
+ dsl_dataset_t *origin_head, dmu_tx_t *tx)
+{
+ dsl_pool_t *dp = dmu_tx_pool(tx);
+ int64_t unused_refres_delta;
+
+ ASSERT(clone->ds_reserved == 0);
+ /*
+ * NOTE: On DEBUG kernels there could be a race between this and
+ * the check function if spa_asize_inflation is adjusted...
+ */
+ ASSERT(origin_head->ds_quota == 0 ||
+ dsl_dataset_phys(clone)->ds_unique_bytes <= origin_head->ds_quota +
+ DMU_MAX_ACCESS * spa_asize_inflation);
+ ASSERT3P(clone->ds_prev, ==, origin_head->ds_prev);
+
+ /*
+ * Swap per-dataset feature flags.
+ */
+ for (spa_feature_t f = 0; f < SPA_FEATURES; f++) {
+ if (!(spa_feature_table[f].fi_flags &
+ ZFEATURE_FLAG_PER_DATASET)) {
+ ASSERT(!clone->ds_feature_inuse[f]);
+ ASSERT(!origin_head->ds_feature_inuse[f]);
+ continue;
+ }
+
+ boolean_t clone_inuse = clone->ds_feature_inuse[f];
+ boolean_t origin_head_inuse = origin_head->ds_feature_inuse[f];
+
+ if (clone_inuse) {
+ dsl_dataset_deactivate_feature(clone->ds_object, f, tx);
+ clone->ds_feature_inuse[f] = B_FALSE;
+ }
+ if (origin_head_inuse) {
+ dsl_dataset_deactivate_feature(origin_head->ds_object,
+ f, tx);
+ origin_head->ds_feature_inuse[f] = B_FALSE;
+ }
+ if (clone_inuse) {
+ dsl_dataset_activate_feature(origin_head->ds_object,
+ f, tx);
+ origin_head->ds_feature_inuse[f] = B_TRUE;
+ }
+ if (origin_head_inuse) {
+ dsl_dataset_activate_feature(clone->ds_object, f, tx);
+ clone->ds_feature_inuse[f] = B_TRUE;
+ }
+ }
+
+ dmu_buf_will_dirty(clone->ds_dbuf, tx);
+ dmu_buf_will_dirty(origin_head->ds_dbuf, tx);
+
+ if (clone->ds_objset != NULL) {
+ dmu_objset_evict(clone->ds_objset);
+ clone->ds_objset = NULL;
+ }
+
+ if (origin_head->ds_objset != NULL) {
+ dmu_objset_evict(origin_head->ds_objset);
+ origin_head->ds_objset = NULL;
+ }
+
+ unused_refres_delta =
+ (int64_t)MIN(origin_head->ds_reserved,
+ dsl_dataset_phys(origin_head)->ds_unique_bytes) -
+ (int64_t)MIN(origin_head->ds_reserved,
+ dsl_dataset_phys(clone)->ds_unique_bytes);
+
+ /*
+ * Reset origin's unique bytes, if it exists.
+ */
+ if (clone->ds_prev) {
+ dsl_dataset_t *origin = clone->ds_prev;
+ uint64_t comp, uncomp;
+
+ dmu_buf_will_dirty(origin->ds_dbuf, tx);
+ dsl_deadlist_space_range(&clone->ds_deadlist,
+ dsl_dataset_phys(origin)->ds_prev_snap_txg, UINT64_MAX,
+ &dsl_dataset_phys(origin)->ds_unique_bytes, &comp, &uncomp);
+ }
+
+ /* swap blkptrs */
+ {
+ rrw_enter(&clone->ds_bp_rwlock, RW_WRITER, FTAG);
+ rrw_enter(&origin_head->ds_bp_rwlock, RW_WRITER, FTAG);
+ blkptr_t tmp;
+ tmp = dsl_dataset_phys(origin_head)->ds_bp;
+ dsl_dataset_phys(origin_head)->ds_bp =
+ dsl_dataset_phys(clone)->ds_bp;
+ dsl_dataset_phys(clone)->ds_bp = tmp;
+ rrw_exit(&origin_head->ds_bp_rwlock, FTAG);
+ rrw_exit(&clone->ds_bp_rwlock, FTAG);
+ }
+
+ /* set dd_*_bytes */
+ {
+ int64_t dused, dcomp, duncomp;
+ uint64_t cdl_used, cdl_comp, cdl_uncomp;
+ uint64_t odl_used, odl_comp, odl_uncomp;
+
+ ASSERT3U(dsl_dir_phys(clone->ds_dir)->
+ dd_used_breakdown[DD_USED_SNAP], ==, 0);
+
+ dsl_deadlist_space(&clone->ds_deadlist,
+ &cdl_used, &cdl_comp, &cdl_uncomp);
+ dsl_deadlist_space(&origin_head->ds_deadlist,
+ &odl_used, &odl_comp, &odl_uncomp);
+
+ dused = dsl_dataset_phys(clone)->ds_referenced_bytes +
+ cdl_used -
+ (dsl_dataset_phys(origin_head)->ds_referenced_bytes +
+ odl_used);
+ dcomp = dsl_dataset_phys(clone)->ds_compressed_bytes +
+ cdl_comp -
+ (dsl_dataset_phys(origin_head)->ds_compressed_bytes +
+ odl_comp);
+ duncomp = dsl_dataset_phys(clone)->ds_uncompressed_bytes +
+ cdl_uncomp -
+ (dsl_dataset_phys(origin_head)->ds_uncompressed_bytes +
+ odl_uncomp);
+
+ dsl_dir_diduse_space(origin_head->ds_dir, DD_USED_HEAD,
+ dused, dcomp, duncomp, tx);
+ dsl_dir_diduse_space(clone->ds_dir, DD_USED_HEAD,
+ -dused, -dcomp, -duncomp, tx);
+
+ /*
+ * The difference in the space used by snapshots is the
+ * difference in snapshot space due to the head's
+ * deadlist (since that's the only thing that's
+ * changing that affects the snapused).
+ */
+ dsl_deadlist_space_range(&clone->ds_deadlist,
+ origin_head->ds_dir->dd_origin_txg, UINT64_MAX,
+ &cdl_used, &cdl_comp, &cdl_uncomp);
+ dsl_deadlist_space_range(&origin_head->ds_deadlist,
+ origin_head->ds_dir->dd_origin_txg, UINT64_MAX,
+ &odl_used, &odl_comp, &odl_uncomp);
+ dsl_dir_transfer_space(origin_head->ds_dir, cdl_used - odl_used,
+ DD_USED_HEAD, DD_USED_SNAP, NULL);
+ }
+
+ /* swap ds_*_bytes */
+ SWITCH64(dsl_dataset_phys(origin_head)->ds_referenced_bytes,
+ dsl_dataset_phys(clone)->ds_referenced_bytes);
+ SWITCH64(dsl_dataset_phys(origin_head)->ds_compressed_bytes,
+ dsl_dataset_phys(clone)->ds_compressed_bytes);
+ SWITCH64(dsl_dataset_phys(origin_head)->ds_uncompressed_bytes,
+ dsl_dataset_phys(clone)->ds_uncompressed_bytes);
+ SWITCH64(dsl_dataset_phys(origin_head)->ds_unique_bytes,
+ dsl_dataset_phys(clone)->ds_unique_bytes);
+
+ /* apply any parent delta for change in unconsumed refreservation */
+ dsl_dir_diduse_space(origin_head->ds_dir, DD_USED_REFRSRV,
+ unused_refres_delta, 0, 0, tx);
+
+ /*
+ * Swap deadlists.
+ */
+ dsl_deadlist_close(&clone->ds_deadlist);
+ dsl_deadlist_close(&origin_head->ds_deadlist);
+ SWITCH64(dsl_dataset_phys(origin_head)->ds_deadlist_obj,
+ dsl_dataset_phys(clone)->ds_deadlist_obj);
+ dsl_deadlist_open(&clone->ds_deadlist, dp->dp_meta_objset,
+ dsl_dataset_phys(clone)->ds_deadlist_obj);
+ dsl_deadlist_open(&origin_head->ds_deadlist, dp->dp_meta_objset,
+ dsl_dataset_phys(origin_head)->ds_deadlist_obj);
+ dsl_dataset_swap_remap_deadlists(clone, origin_head, tx);
+
+ dsl_scan_ds_clone_swapped(origin_head, clone, tx);
+
+ spa_history_log_internal_ds(clone, "clone swap", tx,
+ "parent=%s", origin_head->ds_dir->dd_myname);
+}
+
+/*
+ * Given a pool name and a dataset object number in that pool,
+ * return the name of that dataset.
+ */
+int
+dsl_dsobj_to_dsname(char *pname, uint64_t obj, char *buf)
+{
+ dsl_pool_t *dp;
+ dsl_dataset_t *ds;
+ int error;
+
+ error = dsl_pool_hold(pname, FTAG, &dp);
+ if (error != 0)
+ return (error);
+
+ error = dsl_dataset_hold_obj(dp, obj, FTAG, &ds);
+ if (error == 0) {
+ dsl_dataset_name(ds, buf);
+ dsl_dataset_rele(ds, FTAG);
+ }
+ dsl_pool_rele(dp, FTAG);
+
+ return (error);
+}
+
+int
+dsl_dataset_check_quota(dsl_dataset_t *ds, boolean_t check_quota,
+ uint64_t asize, uint64_t inflight, uint64_t *used, uint64_t *ref_rsrv)
+{
+ int error = 0;
+
+ ASSERT3S(asize, >, 0);
+
+ /*
+ * *ref_rsrv is the portion of asize that will come from any
+ * unconsumed refreservation space.
+ */
+ *ref_rsrv = 0;
+
+ mutex_enter(&ds->ds_lock);
+ /*
+ * Make a space adjustment for reserved bytes.
+ */
+ if (ds->ds_reserved > dsl_dataset_phys(ds)->ds_unique_bytes) {
+ ASSERT3U(*used, >=,
+ ds->ds_reserved - dsl_dataset_phys(ds)->ds_unique_bytes);
+ *used -=
+ (ds->ds_reserved - dsl_dataset_phys(ds)->ds_unique_bytes);
+ *ref_rsrv =
+ asize - MIN(asize, parent_delta(ds, asize + inflight));
+ }
+
+ if (!check_quota || ds->ds_quota == 0) {
+ mutex_exit(&ds->ds_lock);
+ return (0);
+ }
+ /*
+ * If they are requesting more space, and our current estimate
+ * is over quota, they get to try again unless the actual
+ * on-disk is over quota and there are no pending changes (which
+ * may free up space for us).
+ */
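+ /*
+ * E.g. (hypothetical sizes): with a 10G quota, 9.9G referenced on
+ * disk, and 0.2G inflight, the estimate (10.1G) is over quota but
+ * inflight > 0, so the caller gets ERESTART (try again) rather
+ * than a hard EDQUOT.
+ */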
+ if (dsl_dataset_phys(ds)->ds_referenced_bytes + inflight >=
+ ds->ds_quota) {
+ if (inflight > 0 ||
+ dsl_dataset_phys(ds)->ds_referenced_bytes < ds->ds_quota)
+ error = SET_ERROR(ERESTART);
+ else
+ error = SET_ERROR(EDQUOT);
+ }
+ mutex_exit(&ds->ds_lock);
+
+ return (error);
+}
+
+typedef struct dsl_dataset_set_qr_arg {
+ const char *ddsqra_name;
+ zprop_source_t ddsqra_source;
+ uint64_t ddsqra_value;
+} dsl_dataset_set_qr_arg_t;
+
+
+/* ARGSUSED */
+static int
+dsl_dataset_set_refquota_check(void *arg, dmu_tx_t *tx)
+{
+ dsl_dataset_set_qr_arg_t *ddsqra = arg;
+ dsl_pool_t *dp = dmu_tx_pool(tx);
+ dsl_dataset_t *ds;
+ int error;
+ uint64_t newval;
+
+ if (spa_version(dp->dp_spa) < SPA_VERSION_REFQUOTA)
+ return (SET_ERROR(ENOTSUP));
+
+ error = dsl_dataset_hold(dp, ddsqra->ddsqra_name, FTAG, &ds);
+ if (error != 0)
+ return (error);
+
+ if (ds->ds_is_snapshot) {
+ dsl_dataset_rele(ds, FTAG);
+ return (SET_ERROR(EINVAL));
+ }
+
+ error = dsl_prop_predict(ds->ds_dir,
+ zfs_prop_to_name(ZFS_PROP_REFQUOTA),
+ ddsqra->ddsqra_source, ddsqra->ddsqra_value, &newval);
+ if (error != 0) {
+ dsl_dataset_rele(ds, FTAG);
+ return (error);
+ }
+
+ if (newval == 0) {
+ dsl_dataset_rele(ds, FTAG);
+ return (0);
+ }
+
+ if (newval < dsl_dataset_phys(ds)->ds_referenced_bytes ||
+ newval < ds->ds_reserved) {
+ dsl_dataset_rele(ds, FTAG);
+ return (SET_ERROR(ENOSPC));
+ }
+
+ dsl_dataset_rele(ds, FTAG);
+ return (0);
+}
+
+static void
+dsl_dataset_set_refquota_sync(void *arg, dmu_tx_t *tx)
+{
+ dsl_dataset_set_qr_arg_t *ddsqra = arg;
+ dsl_pool_t *dp = dmu_tx_pool(tx);
+ dsl_dataset_t *ds;
+ uint64_t newval;
+
+ VERIFY0(dsl_dataset_hold(dp, ddsqra->ddsqra_name, FTAG, &ds));
+
+ dsl_prop_set_sync_impl(ds,
+ zfs_prop_to_name(ZFS_PROP_REFQUOTA),
+ ddsqra->ddsqra_source, sizeof (ddsqra->ddsqra_value), 1,
+ &ddsqra->ddsqra_value, tx);
+
+ VERIFY0(dsl_prop_get_int_ds(ds,
+ zfs_prop_to_name(ZFS_PROP_REFQUOTA), &newval));
+
+ if (ds->ds_quota != newval) {
+ dmu_buf_will_dirty(ds->ds_dbuf, tx);
+ ds->ds_quota = newval;
+ }
+ dsl_dataset_rele(ds, FTAG);
+}
+
+int
+dsl_dataset_set_refquota(const char *dsname, zprop_source_t source,
+ uint64_t refquota)
+{
+ dsl_dataset_set_qr_arg_t ddsqra;
+
+ ddsqra.ddsqra_name = dsname;
+ ddsqra.ddsqra_source = source;
+ ddsqra.ddsqra_value = refquota;
+
+ return (dsl_sync_task(dsname, dsl_dataset_set_refquota_check,
+ dsl_dataset_set_refquota_sync, &ddsqra, 0,
+ ZFS_SPACE_CHECK_EXTRA_RESERVED));
+}
+
+static int
+dsl_dataset_set_refreservation_check(void *arg, dmu_tx_t *tx)
+{
+ dsl_dataset_set_qr_arg_t *ddsqra = arg;
+ dsl_pool_t *dp = dmu_tx_pool(tx);
+ dsl_dataset_t *ds;
+ int error;
+ uint64_t newval, unique;
+
+ if (spa_version(dp->dp_spa) < SPA_VERSION_REFRESERVATION)
+ return (SET_ERROR(ENOTSUP));
+
+ error = dsl_dataset_hold(dp, ddsqra->ddsqra_name, FTAG, &ds);
+ if (error != 0)
+ return (error);
+
+ if (ds->ds_is_snapshot) {
+ dsl_dataset_rele(ds, FTAG);
+ return (SET_ERROR(EINVAL));
+ }
+
+ error = dsl_prop_predict(ds->ds_dir,
+ zfs_prop_to_name(ZFS_PROP_REFRESERVATION),
+ ddsqra->ddsqra_source, ddsqra->ddsqra_value, &newval);
+ if (error != 0) {
+ dsl_dataset_rele(ds, FTAG);
+ return (error);
+ }
+
+ /*
+ * If we are doing the preliminary check in open context, the
+ * space estimates may be inaccurate.
+ */
+ if (!dmu_tx_is_syncing(tx)) {
+ dsl_dataset_rele(ds, FTAG);
+ return (0);
+ }
+
+ mutex_enter(&ds->ds_lock);
+ if (!DS_UNIQUE_IS_ACCURATE(ds))
+ dsl_dataset_recalc_head_uniq(ds);
+ unique = dsl_dataset_phys(ds)->ds_unique_bytes;
+ mutex_exit(&ds->ds_lock);
+
+ if (MAX(unique, newval) > MAX(unique, ds->ds_reserved)) {
+ uint64_t delta = MAX(unique, newval) -
+ MAX(unique, ds->ds_reserved);
+
+ if (delta >
+ dsl_dir_space_available(ds->ds_dir, NULL, 0, B_TRUE) ||
+ (ds->ds_quota > 0 && newval > ds->ds_quota)) {
+ dsl_dataset_rele(ds, FTAG);
+ return (SET_ERROR(ENOSPC));
+ }
+ }
+
+ dsl_dataset_rele(ds, FTAG);
+ return (0);
+}
+
+void
+dsl_dataset_set_refreservation_sync_impl(dsl_dataset_t *ds,
+ zprop_source_t source, uint64_t value, dmu_tx_t *tx)
+{
+ uint64_t newval;
+ uint64_t unique;
+ int64_t delta;
+
+ dsl_prop_set_sync_impl(ds, zfs_prop_to_name(ZFS_PROP_REFRESERVATION),
+ source, sizeof (value), 1, &value, tx);
+
+ VERIFY0(dsl_prop_get_int_ds(ds,
+ zfs_prop_to_name(ZFS_PROP_REFRESERVATION), &newval));
+
+ dmu_buf_will_dirty(ds->ds_dbuf, tx);
+ mutex_enter(&ds->ds_dir->dd_lock);
+ mutex_enter(&ds->ds_lock);
+ ASSERT(DS_UNIQUE_IS_ACCURATE(ds));
+ unique = dsl_dataset_phys(ds)->ds_unique_bytes;
+ delta = MAX(0, (int64_t)(newval - unique)) -
+ MAX(0, (int64_t)(ds->ds_reserved - unique));
+ ds->ds_reserved = newval;
+ mutex_exit(&ds->ds_lock);
+
+ dsl_dir_diduse_space(ds->ds_dir, DD_USED_REFRSRV, delta, 0, 0, tx);
+ mutex_exit(&ds->ds_dir->dd_lock);
+}
+
+static void
+dsl_dataset_set_refreservation_sync(void *arg, dmu_tx_t *tx)
+{
+ dsl_dataset_set_qr_arg_t *ddsqra = arg;
+ dsl_pool_t *dp = dmu_tx_pool(tx);
+ dsl_dataset_t *ds;
+
+ VERIFY0(dsl_dataset_hold(dp, ddsqra->ddsqra_name, FTAG, &ds));
+ dsl_dataset_set_refreservation_sync_impl(ds,
+ ddsqra->ddsqra_source, ddsqra->ddsqra_value, tx);
+ dsl_dataset_rele(ds, FTAG);
+}
+
+int
+dsl_dataset_set_refreservation(const char *dsname, zprop_source_t source,
+ uint64_t refreservation)
+{
+ dsl_dataset_set_qr_arg_t ddsqra;
+
+ ddsqra.ddsqra_name = dsname;
+ ddsqra.ddsqra_source = source;
+ ddsqra.ddsqra_value = refreservation;
+
+ return (dsl_sync_task(dsname, dsl_dataset_set_refreservation_check,
+ dsl_dataset_set_refreservation_sync, &ddsqra, 0,
+ ZFS_SPACE_CHECK_EXTRA_RESERVED));
+}
+
+/*
+ * Return (in *usedp) the amount of space written in new that is not
+ * present in oldsnap. New may be a snapshot or the head. Old must be
+ * a snapshot before new, in new's filesystem (or its origin). If not then
+ * fail and return EINVAL.
+ *
+ * The written space is calculated by considering two components: First, we
+ * ignore any freed space, and calculate the written space as new's used
+ * space minus old's used space. Next, we add back the amount of space that
+ * was freed between the two snapshots, since that freeing is what reduced
+ * new's used space relative to old's. Specifically, this is the space that
+ * was born before old->ds_creation_txg, and freed before new (i.e. it is
+ * on new's deadlist or a previous deadlist).
+ *
+ * space freed [---------------------]
+ * snapshots ---O-------O--------O-------O------
+ * oldsnap new
+ */
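+/*
+ * Worked example (hypothetical sizes): if new references 12G and oldsnap
+ * referenced 10G, the first component is 2G. If 3G of data born before
+ * oldsnap's creation txg was freed between the two snapshots, the deadlist
+ * walk below adds that back, reporting 5G written in total.
+ */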
+int
+dsl_dataset_space_written(dsl_dataset_t *oldsnap, dsl_dataset_t *new,
+ uint64_t *usedp, uint64_t *compp, uint64_t *uncompp)
+{
+ int err = 0;
+ uint64_t snapobj;
+ dsl_pool_t *dp = new->ds_dir->dd_pool;
+
+ ASSERT(dsl_pool_config_held(dp));
+
+ *usedp = 0;
+ *usedp += dsl_dataset_phys(new)->ds_referenced_bytes;
+ *usedp -= dsl_dataset_phys(oldsnap)->ds_referenced_bytes;
+
+ *compp = 0;
+ *compp += dsl_dataset_phys(new)->ds_compressed_bytes;
+ *compp -= dsl_dataset_phys(oldsnap)->ds_compressed_bytes;
+
+ *uncompp = 0;
+ *uncompp += dsl_dataset_phys(new)->ds_uncompressed_bytes;
+ *uncompp -= dsl_dataset_phys(oldsnap)->ds_uncompressed_bytes;
+
+ snapobj = new->ds_object;
+ while (snapobj != oldsnap->ds_object) {
+ dsl_dataset_t *snap;
+ uint64_t used, comp, uncomp;
+
+ if (snapobj == new->ds_object) {
+ snap = new;
+ } else {
+ err = dsl_dataset_hold_obj(dp, snapobj, FTAG, &snap);
+ if (err != 0)
+ break;
+ }
+
+ if (dsl_dataset_phys(snap)->ds_prev_snap_txg ==
+ dsl_dataset_phys(oldsnap)->ds_creation_txg) {
+ /*
+ * The blocks in the deadlist can not be born after
+ * ds_prev_snap_txg, so get the whole deadlist space,
+ * which is more efficient (especially for old-format
+ * deadlists). Unfortunately the deadlist code
+ * doesn't have enough information to make this
+ * optimization itself.
+ */
+ dsl_deadlist_space(&snap->ds_deadlist,
+ &used, &comp, &uncomp);
+ } else {
+ dsl_deadlist_space_range(&snap->ds_deadlist,
+ 0, dsl_dataset_phys(oldsnap)->ds_creation_txg,
+ &used, &comp, &uncomp);
+ }
+ *usedp += used;
+ *compp += comp;
+ *uncompp += uncomp;
+
+ /*
+ * If we get to the beginning of the chain of snapshots
+ * (ds_prev_snap_obj == 0) before oldsnap, then oldsnap
+ * was not a snapshot of/before new.
+ */
+ snapobj = dsl_dataset_phys(snap)->ds_prev_snap_obj;
+ if (snap != new)
+ dsl_dataset_rele(snap, FTAG);
+ if (snapobj == 0) {
+ err = SET_ERROR(EINVAL);
+ break;
+ }
+ }
+ return (err);
+}
+
+/*
+ * Return (in *usedp) the amount of space that will be reclaimed if firstsnap,
+ * lastsnap, and all snapshots in between are deleted.
+ *
+ * blocks that would be freed [---------------------------]
+ * snapshots ---O-------O--------O-------O--------O
+ * firstsnap lastsnap
+ *
+ * This is the set of blocks that were born after the snap before firstsnap
+ * (birth > firstsnap->prev_snap_txg) and died before the snap after the
+ * last snap (i.e., on lastsnap->ds_next->ds_deadlist or an earlier deadlist).
+ * We calculate this by iterating over the relevant deadlists (from the snap
+ * after lastsnap, backward to the snap after firstsnap), summing up the
+ * space on the deadlist that was born after the snap before firstsnap.
+ */
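+/*
+ * For example (hypothetical sizes): if the deadlists walked below (from the
+ * snap after lastsnap back to the snap after firstsnap) hold 1G, 2G, and 1G
+ * of blocks born after the snap before firstsnap, 4G would be reclaimed.
+ */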
+int
+dsl_dataset_space_wouldfree(dsl_dataset_t *firstsnap,
+ dsl_dataset_t *lastsnap,
+ uint64_t *usedp, uint64_t *compp, uint64_t *uncompp)
+{
+ int err = 0;
+ uint64_t snapobj;
+ dsl_pool_t *dp = firstsnap->ds_dir->dd_pool;
+
+ ASSERT(firstsnap->ds_is_snapshot);
+ ASSERT(lastsnap->ds_is_snapshot);
+
+ /*
+ * Check that the snapshots are in the same dsl_dir, and firstsnap
+ * is before lastsnap.
+ */
+ if (firstsnap->ds_dir != lastsnap->ds_dir ||
+ dsl_dataset_phys(firstsnap)->ds_creation_txg >
+ dsl_dataset_phys(lastsnap)->ds_creation_txg)
+ return (SET_ERROR(EINVAL));
+
+ *usedp = *compp = *uncompp = 0;
+
+ snapobj = dsl_dataset_phys(lastsnap)->ds_next_snap_obj;
+ while (snapobj != firstsnap->ds_object) {
+ dsl_dataset_t *ds;
+ uint64_t used, comp, uncomp;
+
+ err = dsl_dataset_hold_obj(dp, snapobj, FTAG, &ds);
+ if (err != 0)
+ break;
+
+ dsl_deadlist_space_range(&ds->ds_deadlist,
+ dsl_dataset_phys(firstsnap)->ds_prev_snap_txg, UINT64_MAX,
+ &used, &comp, &uncomp);
+ *usedp += used;
+ *compp += comp;
+ *uncompp += uncomp;
+
+ snapobj = dsl_dataset_phys(ds)->ds_prev_snap_obj;
+ ASSERT3U(snapobj, !=, 0);
+ dsl_dataset_rele(ds, FTAG);
+ }
+ return (err);
+}
+
+/*
+ * Return TRUE if 'earlier' is an earlier snapshot in 'later's timeline.
+ * For example, they could both be snapshots of the same filesystem, and
+ * 'earlier' is before 'later'. Or 'earlier' could be the origin of
+ * 'later's filesystem. Or 'earlier' could be an older snapshot in the origin's
+ * filesystem. Or 'earlier' could be the origin's origin.
+ *
+ * If non-zero, earlier_txg is used instead of earlier's ds_creation_txg.
+ */
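+/*
+ * For example (hypothetical datasets): if pool/clone was cloned from
+ * pool/fs@snap1, then snap1 and any older pool/fs snapshot are "earlier"
+ * in pool/clone's timeline; the recursion below walks the origin chain
+ * (origin, origin's origin, ...) to establish this.
+ */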
+boolean_t
+dsl_dataset_is_before(dsl_dataset_t *later, dsl_dataset_t *earlier,
+ uint64_t earlier_txg)
+{
+ dsl_pool_t *dp = later->ds_dir->dd_pool;
+ int error;
+ boolean_t ret;
+
+ ASSERT(dsl_pool_config_held(dp));
+ ASSERT(earlier->ds_is_snapshot || earlier_txg != 0);
+
+ if (earlier_txg == 0)
+ earlier_txg = dsl_dataset_phys(earlier)->ds_creation_txg;
+
+ if (later->ds_is_snapshot &&
+ earlier_txg >= dsl_dataset_phys(later)->ds_creation_txg)
+ return (B_FALSE);
+
+ if (later->ds_dir == earlier->ds_dir)
+ return (B_TRUE);
+ if (!dsl_dir_is_clone(later->ds_dir))
+ return (B_FALSE);
+
+ if (dsl_dir_phys(later->ds_dir)->dd_origin_obj == earlier->ds_object)
+ return (B_TRUE);
+ dsl_dataset_t *origin;
+ error = dsl_dataset_hold_obj(dp,
+ dsl_dir_phys(later->ds_dir)->dd_origin_obj, FTAG, &origin);
+ if (error != 0)
+ return (B_FALSE);
+ ret = dsl_dataset_is_before(origin, earlier, earlier_txg);
+ dsl_dataset_rele(origin, FTAG);
+ return (ret);
+}
+
+void
+dsl_dataset_zapify(dsl_dataset_t *ds, dmu_tx_t *tx)
+{
+ objset_t *mos = ds->ds_dir->dd_pool->dp_meta_objset;
+ dmu_object_zapify(mos, ds->ds_object, DMU_OT_DSL_DATASET, tx);
+}
+
+boolean_t
+dsl_dataset_is_zapified(dsl_dataset_t *ds)
+{
+ dmu_object_info_t doi;
+
+ dmu_object_info_from_db(ds->ds_dbuf, &doi);
+ return (doi.doi_type == DMU_OTN_ZAP_METADATA);
+}
+
+boolean_t
+dsl_dataset_has_resume_receive_state(dsl_dataset_t *ds)
+{
+ return (dsl_dataset_is_zapified(ds) &&
+ zap_contains(ds->ds_dir->dd_pool->dp_meta_objset,
+ ds->ds_object, DS_FIELD_RESUME_TOGUID) == 0);
+}
+
+uint64_t
+dsl_dataset_get_remap_deadlist_object(dsl_dataset_t *ds)
+{
+ uint64_t remap_deadlist_obj;
+ int err;
+
+ if (!dsl_dataset_is_zapified(ds))
+ return (0);
+
+ err = zap_lookup(ds->ds_dir->dd_pool->dp_meta_objset, ds->ds_object,
+ DS_FIELD_REMAP_DEADLIST, sizeof (remap_deadlist_obj), 1,
+ &remap_deadlist_obj);
+
+ if (err != 0) {
+ VERIFY3S(err, ==, ENOENT);
+ return (0);
+ }
+
+ ASSERT(remap_deadlist_obj != 0);
+ return (remap_deadlist_obj);
+}
+
+boolean_t
+dsl_dataset_remap_deadlist_exists(dsl_dataset_t *ds)
+{
+ EQUIV(dsl_deadlist_is_open(&ds->ds_remap_deadlist),
+ dsl_dataset_get_remap_deadlist_object(ds) != 0);
+ return (dsl_deadlist_is_open(&ds->ds_remap_deadlist));
+}
+
+static void
+dsl_dataset_set_remap_deadlist_object(dsl_dataset_t *ds, uint64_t obj,
+ dmu_tx_t *tx)
+{
+ ASSERT(obj != 0);
+ dsl_dataset_zapify(ds, tx);
+ VERIFY0(zap_add(ds->ds_dir->dd_pool->dp_meta_objset, ds->ds_object,
+ DS_FIELD_REMAP_DEADLIST, sizeof (obj), 1, &obj, tx));
+}
+
+static void
+dsl_dataset_unset_remap_deadlist_object(dsl_dataset_t *ds, dmu_tx_t *tx)
+{
+ VERIFY0(zap_remove(ds->ds_dir->dd_pool->dp_meta_objset,
+ ds->ds_object, DS_FIELD_REMAP_DEADLIST, tx));
+}
+
+void
+dsl_dataset_destroy_remap_deadlist(dsl_dataset_t *ds, dmu_tx_t *tx)
+{
+ uint64_t remap_deadlist_object;
+ spa_t *spa = ds->ds_dir->dd_pool->dp_spa;
+
+ ASSERT(dmu_tx_is_syncing(tx));
+ ASSERT(dsl_dataset_remap_deadlist_exists(ds));
+
+ remap_deadlist_object = ds->ds_remap_deadlist.dl_object;
+ dsl_deadlist_close(&ds->ds_remap_deadlist);
+ dsl_deadlist_free(spa_meta_objset(spa), remap_deadlist_object, tx);
+ dsl_dataset_unset_remap_deadlist_object(ds, tx);
+ spa_feature_decr(spa, SPA_FEATURE_OBSOLETE_COUNTS, tx);
+}
+
+void
+dsl_dataset_create_remap_deadlist(dsl_dataset_t *ds, dmu_tx_t *tx)
+{
+ uint64_t remap_deadlist_obj;
+ spa_t *spa = ds->ds_dir->dd_pool->dp_spa;
+
+ ASSERT(dmu_tx_is_syncing(tx));
+ ASSERT(MUTEX_HELD(&ds->ds_remap_deadlist_lock));
+ /*
+ * Currently we only create remap deadlists when there are indirect
+ * vdevs with referenced mappings.
+ */
+ ASSERT(spa_feature_is_active(spa, SPA_FEATURE_DEVICE_REMOVAL));
+
+ remap_deadlist_obj = dsl_deadlist_clone(
+ &ds->ds_deadlist, UINT64_MAX,
+ dsl_dataset_phys(ds)->ds_prev_snap_obj, tx);
+ dsl_dataset_set_remap_deadlist_object(ds,
+ remap_deadlist_obj, tx);
+ dsl_deadlist_open(&ds->ds_remap_deadlist, spa_meta_objset(spa),
+ remap_deadlist_obj);
+ spa_feature_incr(spa, SPA_FEATURE_OBSOLETE_COUNTS, tx);
+}
diff --git a/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/dsl_deadlist.c b/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/dsl_deadlist.c
new file mode 100644
index 000000000000..2f3647bc8e86
--- /dev/null
+++ b/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/dsl_deadlist.c
@@ -0,0 +1,561 @@
+/*
+ * CDDL HEADER START
+ *
+ * The contents of this file are subject to the terms of the
+ * Common Development and Distribution License (the "License").
+ * You may not use this file except in compliance with the License.
+ *
+ * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
+ * or http://www.opensolaris.org/os/licensing.
+ * See the License for the specific language governing permissions
+ * and limitations under the License.
+ *
+ * When distributing Covered Code, include this CDDL HEADER in each
+ * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
+ * If applicable, add the following below this CDDL HEADER, with the
+ * fields enclosed by brackets "[]" replaced with your own identifying
+ * information: Portions Copyright [yyyy] [name of copyright owner]
+ *
+ * CDDL HEADER END
+ */
+/*
+ * Copyright (c) 2010, Oracle and/or its affiliates. All rights reserved.
+ * Copyright (c) 2012, 2015 by Delphix. All rights reserved.
+ * Copyright (c) 2014 Spectra Logic Corporation, All rights reserved.
+ * Copyright (c) 2014 Integros [integros.com]
+ */
+
+#include <sys/dsl_dataset.h>
+#include <sys/dmu.h>
+#include <sys/refcount.h>
+#include <sys/zap.h>
+#include <sys/zfs_context.h>
+#include <sys/dsl_pool.h>
+
+/*
+ * Deadlist concurrency:
+ *
+ * Deadlists can only be modified from the syncing thread.
+ *
+ * Except for dsl_deadlist_insert(), a deadlist can only be modified with
+ * the dp_config_rwlock held with RW_WRITER.
+ *
+ * The accessors (dsl_deadlist_space() and dsl_deadlist_space_range()) can
+ * be called concurrently, from open context, with the dp_config_rwlock held
+ * with RW_READER.
+ *
+ * Therefore, we only need to provide locking between dsl_deadlist_insert() and
+ * the accessors, protecting:
+ * dl_phys->dl_used,comp,uncomp
+ * and protecting the dl_tree from being loaded.
+ * The locking is provided by dl_lock. Note that locking on the bpobj_t
+ * provides its own locking, and dl_oldfmt is immutable.
+ */
+
+static int
+dsl_deadlist_compare(const void *arg1, const void *arg2)
+{
+ const dsl_deadlist_entry_t *dle1 = (const dsl_deadlist_entry_t *)arg1;
+ const dsl_deadlist_entry_t *dle2 = (const dsl_deadlist_entry_t *)arg2;
+
+ return (AVL_CMP(dle1->dle_mintxg, dle2->dle_mintxg));
+}
+
+static void
+dsl_deadlist_load_tree(dsl_deadlist_t *dl)
+{
+ zap_cursor_t zc;
+ zap_attribute_t za;
+
+ ASSERT(MUTEX_HELD(&dl->dl_lock));
+
+ ASSERT(!dl->dl_oldfmt);
+ if (dl->dl_havetree)
+ return;
+
+ avl_create(&dl->dl_tree, dsl_deadlist_compare,
+ sizeof (dsl_deadlist_entry_t),
+ offsetof(dsl_deadlist_entry_t, dle_node));
+ for (zap_cursor_init(&zc, dl->dl_os, dl->dl_object);
+ zap_cursor_retrieve(&zc, &za) == 0;
+ zap_cursor_advance(&zc)) {
+ dsl_deadlist_entry_t *dle = kmem_alloc(sizeof (*dle), KM_SLEEP);
+ dle->dle_mintxg = zfs_strtonum(za.za_name, NULL);
+ VERIFY3U(0, ==, bpobj_open(&dle->dle_bpobj, dl->dl_os,
+ za.za_first_integer));
+ avl_add(&dl->dl_tree, dle);
+ }
+ zap_cursor_fini(&zc);
+ dl->dl_havetree = B_TRUE;
+}
+
+void
+dsl_deadlist_open(dsl_deadlist_t *dl, objset_t *os, uint64_t object)
+{
+ dmu_object_info_t doi;
+
+ ASSERT(!dsl_deadlist_is_open(dl));
+
+ mutex_init(&dl->dl_lock, NULL, MUTEX_DEFAULT, NULL);
+ dl->dl_os = os;
+ dl->dl_object = object;
+ VERIFY3U(0, ==, dmu_bonus_hold(os, object, dl, &dl->dl_dbuf));
+ dmu_object_info_from_db(dl->dl_dbuf, &doi);
+ if (doi.doi_type == DMU_OT_BPOBJ) {
+ dmu_buf_rele(dl->dl_dbuf, dl);
+ dl->dl_dbuf = NULL;
+ dl->dl_oldfmt = B_TRUE;
+ VERIFY3U(0, ==, bpobj_open(&dl->dl_bpobj, os, object));
+ return;
+ }
+
+ dl->dl_oldfmt = B_FALSE;
+ dl->dl_phys = dl->dl_dbuf->db_data;
+ dl->dl_havetree = B_FALSE;
+}
+
+boolean_t
+dsl_deadlist_is_open(dsl_deadlist_t *dl)
+{
+ return (dl->dl_os != NULL);
+}
+
+void
+dsl_deadlist_close(dsl_deadlist_t *dl)
+{
+ void *cookie = NULL;
+ dsl_deadlist_entry_t *dle;
+
+ ASSERT(dsl_deadlist_is_open(dl));
+
+ if (dl->dl_oldfmt) {
+ dl->dl_oldfmt = B_FALSE;
+ bpobj_close(&dl->dl_bpobj);
+ dl->dl_os = NULL;
+ dl->dl_object = 0;
+ return;
+ }
+
+ if (dl->dl_havetree) {
+ while ((dle = avl_destroy_nodes(&dl->dl_tree, &cookie))
+ != NULL) {
+ bpobj_close(&dle->dle_bpobj);
+ kmem_free(dle, sizeof (*dle));
+ }
+ avl_destroy(&dl->dl_tree);
+ }
+ dmu_buf_rele(dl->dl_dbuf, dl);
+ mutex_destroy(&dl->dl_lock);
+ dl->dl_dbuf = NULL;
+ dl->dl_phys = NULL;
+ dl->dl_os = NULL;
+ dl->dl_object = 0;
+}
+
+uint64_t
+dsl_deadlist_alloc(objset_t *os, dmu_tx_t *tx)
+{
+ if (spa_version(dmu_objset_spa(os)) < SPA_VERSION_DEADLISTS)
+ return (bpobj_alloc(os, SPA_OLD_MAXBLOCKSIZE, tx));
+ return (zap_create(os, DMU_OT_DEADLIST, DMU_OT_DEADLIST_HDR,
+ sizeof (dsl_deadlist_phys_t), tx));
+}
+
+void
+dsl_deadlist_free(objset_t *os, uint64_t dlobj, dmu_tx_t *tx)
+{
+ dmu_object_info_t doi;
+ zap_cursor_t zc;
+ zap_attribute_t za;
+
+ VERIFY3U(0, ==, dmu_object_info(os, dlobj, &doi));
+ if (doi.doi_type == DMU_OT_BPOBJ) {
+ bpobj_free(os, dlobj, tx);
+ return;
+ }
+
+ for (zap_cursor_init(&zc, os, dlobj);
+ zap_cursor_retrieve(&zc, &za) == 0;
+ zap_cursor_advance(&zc)) {
+ uint64_t obj = za.za_first_integer;
+ if (obj == dmu_objset_pool(os)->dp_empty_bpobj)
+ bpobj_decr_empty(os, tx);
+ else
+ bpobj_free(os, obj, tx);
+ }
+ zap_cursor_fini(&zc);
+ VERIFY3U(0, ==, dmu_object_free(os, dlobj, tx));
+}
+
+static void
+dle_enqueue(dsl_deadlist_t *dl, dsl_deadlist_entry_t *dle,
+ const blkptr_t *bp, dmu_tx_t *tx)
+{
+ ASSERT(MUTEX_HELD(&dl->dl_lock));
+ if (dle->dle_bpobj.bpo_object ==
+ dmu_objset_pool(dl->dl_os)->dp_empty_bpobj) {
+ uint64_t obj = bpobj_alloc(dl->dl_os, SPA_OLD_MAXBLOCKSIZE, tx);
+ bpobj_close(&dle->dle_bpobj);
+ bpobj_decr_empty(dl->dl_os, tx);
+ VERIFY3U(0, ==, bpobj_open(&dle->dle_bpobj, dl->dl_os, obj));
+ VERIFY3U(0, ==, zap_update_int_key(dl->dl_os, dl->dl_object,
+ dle->dle_mintxg, obj, tx));
+ }
+ bpobj_enqueue(&dle->dle_bpobj, bp, tx);
+}
+
+static void
+dle_enqueue_subobj(dsl_deadlist_t *dl, dsl_deadlist_entry_t *dle,
+ uint64_t obj, dmu_tx_t *tx)
+{
+ ASSERT(MUTEX_HELD(&dl->dl_lock));
+ if (dle->dle_bpobj.bpo_object !=
+ dmu_objset_pool(dl->dl_os)->dp_empty_bpobj) {
+ bpobj_enqueue_subobj(&dle->dle_bpobj, obj, tx);
+ } else {
+ bpobj_close(&dle->dle_bpobj);
+ bpobj_decr_empty(dl->dl_os, tx);
+ VERIFY3U(0, ==, bpobj_open(&dle->dle_bpobj, dl->dl_os, obj));
+ VERIFY3U(0, ==, zap_update_int_key(dl->dl_os, dl->dl_object,
+ dle->dle_mintxg, obj, tx));
+ }
+}
+
+void
+dsl_deadlist_insert(dsl_deadlist_t *dl, const blkptr_t *bp, dmu_tx_t *tx)
+{
+ dsl_deadlist_entry_t dle_tofind;
+ dsl_deadlist_entry_t *dle;
+ avl_index_t where;
+
+ if (dl->dl_oldfmt) {
+ bpobj_enqueue(&dl->dl_bpobj, bp, tx);
+ return;
+ }
+
+ mutex_enter(&dl->dl_lock);
+ dsl_deadlist_load_tree(dl);
+
+ dmu_buf_will_dirty(dl->dl_dbuf, tx);
+ dl->dl_phys->dl_used +=
+ bp_get_dsize_sync(dmu_objset_spa(dl->dl_os), bp);
+ dl->dl_phys->dl_comp += BP_GET_PSIZE(bp);
+ dl->dl_phys->dl_uncomp += BP_GET_UCSIZE(bp);
+
+ dle_tofind.dle_mintxg = bp->blk_birth;
+ dle = avl_find(&dl->dl_tree, &dle_tofind, &where);
+ if (dle == NULL)
+ dle = avl_nearest(&dl->dl_tree, where, AVL_BEFORE);
+ else
+ dle = AVL_PREV(&dl->dl_tree, dle);
+ dle_enqueue(dl, dle, bp, tx);
+ mutex_exit(&dl->dl_lock);
+}
+
+/*
+ * Insert a new key into the deadlist; the key must be greater than all
+ * current entries. mintxg is not inclusive.
+ */
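+/*
+ * For example (hypothetical txgs): on a deadlist with keys {0, 100},
+ * adding key 200 creates an empty bpobj that will hold blocks with
+ * birth txg > 200, and records it in the deadlist ZAP under 200.
+ */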
+void
+dsl_deadlist_add_key(dsl_deadlist_t *dl, uint64_t mintxg, dmu_tx_t *tx)
+{
+ uint64_t obj;
+ dsl_deadlist_entry_t *dle;
+
+ if (dl->dl_oldfmt)
+ return;
+
+ dle = kmem_alloc(sizeof (*dle), KM_SLEEP);
+ dle->dle_mintxg = mintxg;
+
+ mutex_enter(&dl->dl_lock);
+ dsl_deadlist_load_tree(dl);
+
+ obj = bpobj_alloc_empty(dl->dl_os, SPA_OLD_MAXBLOCKSIZE, tx);
+ VERIFY3U(0, ==, bpobj_open(&dle->dle_bpobj, dl->dl_os, obj));
+ avl_add(&dl->dl_tree, dle);
+
+ VERIFY3U(0, ==, zap_add_int_key(dl->dl_os, dl->dl_object,
+ mintxg, obj, tx));
+ mutex_exit(&dl->dl_lock);
+}
+
+/*
+ * Remove this key, merging its entries into the previous key.
+ */
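+/*
+ * Continuing the example above (hypothetical txgs): removing key 200
+ * from {0, 100, 200} merges the bpobj stored under 200 into key 100's
+ * bpobj via dle_enqueue_subobj(), then deletes the 200 entry from both
+ * the ZAP and the AVL tree.
+ */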
+void
+dsl_deadlist_remove_key(dsl_deadlist_t *dl, uint64_t mintxg, dmu_tx_t *tx)
+{
+ dsl_deadlist_entry_t dle_tofind;
+ dsl_deadlist_entry_t *dle, *dle_prev;
+
+ if (dl->dl_oldfmt)
+ return;
+
+ mutex_enter(&dl->dl_lock);
+ dsl_deadlist_load_tree(dl);
+
+ dle_tofind.dle_mintxg = mintxg;
+ dle = avl_find(&dl->dl_tree, &dle_tofind, NULL);
+ dle_prev = AVL_PREV(&dl->dl_tree, dle);
+
+ dle_enqueue_subobj(dl, dle_prev, dle->dle_bpobj.bpo_object, tx);
+
+ avl_remove(&dl->dl_tree, dle);
+ bpobj_close(&dle->dle_bpobj);
+ kmem_free(dle, sizeof (*dle));
+
+ VERIFY3U(0, ==, zap_remove_int(dl->dl_os, dl->dl_object, mintxg, tx));
+ mutex_exit(&dl->dl_lock);
+}
+
+/*
+ * Walk ds's snapshots to regenerate the ZAP & AVL.
+ */
+static void
+dsl_deadlist_regenerate(objset_t *os, uint64_t dlobj,
+ uint64_t mrs_obj, dmu_tx_t *tx)
+{
+ dsl_deadlist_t dl = { 0 };
+ dsl_pool_t *dp = dmu_objset_pool(os);
+
+ dsl_deadlist_open(&dl, os, dlobj);
+ if (dl.dl_oldfmt) {
+ dsl_deadlist_close(&dl);
+ return;
+ }
+
+ while (mrs_obj != 0) {
+ dsl_dataset_t *ds;
+ VERIFY3U(0, ==, dsl_dataset_hold_obj(dp, mrs_obj, FTAG, &ds));
+ dsl_deadlist_add_key(&dl,
+ dsl_dataset_phys(ds)->ds_prev_snap_txg, tx);
+ mrs_obj = dsl_dataset_phys(ds)->ds_prev_snap_obj;
+ dsl_dataset_rele(ds, FTAG);
+ }
+ dsl_deadlist_close(&dl);
+}
+
+uint64_t
+dsl_deadlist_clone(dsl_deadlist_t *dl, uint64_t maxtxg,
+ uint64_t mrs_obj, dmu_tx_t *tx)
+{
+ dsl_deadlist_entry_t *dle;
+ uint64_t newobj;
+
+ newobj = dsl_deadlist_alloc(dl->dl_os, tx);
+
+ if (dl->dl_oldfmt) {
+ dsl_deadlist_regenerate(dl->dl_os, newobj, mrs_obj, tx);
+ return (newobj);
+ }
+
+ mutex_enter(&dl->dl_lock);
+ dsl_deadlist_load_tree(dl);
+
+ for (dle = avl_first(&dl->dl_tree); dle;
+ dle = AVL_NEXT(&dl->dl_tree, dle)) {
+ uint64_t obj;
+
+ if (dle->dle_mintxg >= maxtxg)
+ break;
+
+ obj = bpobj_alloc_empty(dl->dl_os, SPA_OLD_MAXBLOCKSIZE, tx);
+ VERIFY3U(0, ==, zap_add_int_key(dl->dl_os, newobj,
+ dle->dle_mintxg, obj, tx));
+ }
+ mutex_exit(&dl->dl_lock);
+ return (newobj);
+}
+
+void
+dsl_deadlist_space(dsl_deadlist_t *dl,
+ uint64_t *usedp, uint64_t *compp, uint64_t *uncompp)
+{
+ ASSERT(dsl_deadlist_is_open(dl));
+ if (dl->dl_oldfmt) {
+ VERIFY3U(0, ==, bpobj_space(&dl->dl_bpobj,
+ usedp, compp, uncompp));
+ return;
+ }
+
+ mutex_enter(&dl->dl_lock);
+ *usedp = dl->dl_phys->dl_used;
+ *compp = dl->dl_phys->dl_comp;
+ *uncompp = dl->dl_phys->dl_uncomp;
+ mutex_exit(&dl->dl_lock);
+}
+
+/*
+ * Return the space used in the range (mintxg, maxtxg].
+ * Includes maxtxg, does not include mintxg.
+ * mintxg and maxtxg must both be keys in the deadlist (unless maxtxg is
+ * larger than any bp in the deadlist (eg. UINT64_MAX)).
+ */
+void
+dsl_deadlist_space_range(dsl_deadlist_t *dl, uint64_t mintxg, uint64_t maxtxg,
+ uint64_t *usedp, uint64_t *compp, uint64_t *uncompp)
+{
+ dsl_deadlist_entry_t *dle;
+ dsl_deadlist_entry_t dle_tofind;
+ avl_index_t where;
+
+ if (dl->dl_oldfmt) {
+ VERIFY3U(0, ==, bpobj_space_range(&dl->dl_bpobj,
+ mintxg, maxtxg, usedp, compp, uncompp));
+ return;
+ }
+
+ *usedp = *compp = *uncompp = 0;
+
+ mutex_enter(&dl->dl_lock);
+ dsl_deadlist_load_tree(dl);
+ dle_tofind.dle_mintxg = mintxg;
+ dle = avl_find(&dl->dl_tree, &dle_tofind, &where);
+ /*
+ * If we don't find this mintxg, there shouldn't be anything
+ * after it either.
+ */
+ ASSERT(dle != NULL ||
+ avl_nearest(&dl->dl_tree, where, AVL_AFTER) == NULL);
+
+ for (; dle && dle->dle_mintxg < maxtxg;
+ dle = AVL_NEXT(&dl->dl_tree, dle)) {
+ uint64_t used, comp, uncomp;
+
+ VERIFY3U(0, ==, bpobj_space(&dle->dle_bpobj,
+ &used, &comp, &uncomp));
+
+ *usedp += used;
+ *compp += comp;
+ *uncompp += uncomp;
+ }
+ mutex_exit(&dl->dl_lock);
+}
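+
+/*
+ * Worked example (editorial addition; numbers hypothetical): with
+ * entry keys { 0, 10, 20 }, mintxg == 10 and maxtxg == UINT64_MAX,
+ * the loop above sums the key-10 and key-20 bpobjs, i.e. all space
+ * born after txg 10.
+ */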
+
+static void
+dsl_deadlist_insert_bpobj(dsl_deadlist_t *dl, uint64_t obj, uint64_t birth,
+ dmu_tx_t *tx)
+{
+ dsl_deadlist_entry_t dle_tofind;
+ dsl_deadlist_entry_t *dle;
+ avl_index_t where;
+ uint64_t used, comp, uncomp;
+ bpobj_t bpo;
+
+ ASSERT(MUTEX_HELD(&dl->dl_lock));
+
+ VERIFY3U(0, ==, bpobj_open(&bpo, dl->dl_os, obj));
+ VERIFY3U(0, ==, bpobj_space(&bpo, &used, &comp, &uncomp));
+ bpobj_close(&bpo);
+
+ dsl_deadlist_load_tree(dl);
+
+ dmu_buf_will_dirty(dl->dl_dbuf, tx);
+ dl->dl_phys->dl_used += used;
+ dl->dl_phys->dl_comp += comp;
+ dl->dl_phys->dl_uncomp += uncomp;
+
+ dle_tofind.dle_mintxg = birth;
+ dle = avl_find(&dl->dl_tree, &dle_tofind, &where);
+ if (dle == NULL)
+ dle = avl_nearest(&dl->dl_tree, where, AVL_BEFORE);
+ dle_enqueue_subobj(dl, dle, obj, tx);
+}
+
+static int
+dsl_deadlist_insert_cb(void *arg, const blkptr_t *bp, dmu_tx_t *tx)
+{
+ dsl_deadlist_t *dl = arg;
+ dsl_deadlist_insert(dl, bp, tx);
+ return (0);
+}
+
+/*
+ * Merge the deadlist pointed to by 'obj' into dl. obj will be left as
+ * an empty deadlist.
+ */
+void
+dsl_deadlist_merge(dsl_deadlist_t *dl, uint64_t obj, dmu_tx_t *tx)
+{
+ zap_cursor_t zc;
+ zap_attribute_t za;
+ dmu_buf_t *bonus;
+ dsl_deadlist_phys_t *dlp;
+ dmu_object_info_t doi;
+
+ VERIFY3U(0, ==, dmu_object_info(dl->dl_os, obj, &doi));
+ if (doi.doi_type == DMU_OT_BPOBJ) {
+ bpobj_t bpo;
+ VERIFY3U(0, ==, bpobj_open(&bpo, dl->dl_os, obj));
+ VERIFY3U(0, ==, bpobj_iterate(&bpo,
+ dsl_deadlist_insert_cb, dl, tx));
+ bpobj_close(&bpo);
+ return;
+ }
+
+ mutex_enter(&dl->dl_lock);
+ for (zap_cursor_init(&zc, dl->dl_os, obj);
+ zap_cursor_retrieve(&zc, &za) == 0;
+ zap_cursor_advance(&zc)) {
+ uint64_t mintxg = zfs_strtonum(za.za_name, NULL);
+ dsl_deadlist_insert_bpobj(dl, za.za_first_integer, mintxg, tx);
+ VERIFY3U(0, ==, zap_remove_int(dl->dl_os, obj, mintxg, tx));
+ }
+ zap_cursor_fini(&zc);
+
+ VERIFY3U(0, ==, dmu_bonus_hold(dl->dl_os, obj, FTAG, &bonus));
+ dlp = bonus->db_data;
+ dmu_buf_will_dirty(bonus, tx);
+ bzero(dlp, sizeof (*dlp));
+ dmu_buf_rele(bonus, FTAG);
+ mutex_exit(&dl->dl_lock);
+}
+
+/*
+ * Remove entries on dl that are >= mintxg, and put them on the bpobj.
+ */
+void
+dsl_deadlist_move_bpobj(dsl_deadlist_t *dl, bpobj_t *bpo, uint64_t mintxg,
+ dmu_tx_t *tx)
+{
+ dsl_deadlist_entry_t dle_tofind;
+ dsl_deadlist_entry_t *dle;
+ avl_index_t where;
+
+ ASSERT(!dl->dl_oldfmt);
+
+ mutex_enter(&dl->dl_lock);
+ dmu_buf_will_dirty(dl->dl_dbuf, tx);
+ dsl_deadlist_load_tree(dl);
+
+ dle_tofind.dle_mintxg = mintxg;
+ dle = avl_find(&dl->dl_tree, &dle_tofind, &where);
+ if (dle == NULL)
+ dle = avl_nearest(&dl->dl_tree, where, AVL_AFTER);
+ while (dle) {
+ uint64_t used, comp, uncomp;
+ dsl_deadlist_entry_t *dle_next;
+
+ bpobj_enqueue_subobj(bpo, dle->dle_bpobj.bpo_object, tx);
+
+ VERIFY3U(0, ==, bpobj_space(&dle->dle_bpobj,
+ &used, &comp, &uncomp));
+ ASSERT3U(dl->dl_phys->dl_used, >=, used);
+ ASSERT3U(dl->dl_phys->dl_comp, >=, comp);
+ ASSERT3U(dl->dl_phys->dl_uncomp, >=, uncomp);
+ dl->dl_phys->dl_used -= used;
+ dl->dl_phys->dl_comp -= comp;
+ dl->dl_phys->dl_uncomp -= uncomp;
+
+ VERIFY3U(0, ==, zap_remove_int(dl->dl_os, dl->dl_object,
+ dle->dle_mintxg, tx));
+
+ dle_next = AVL_NEXT(&dl->dl_tree, dle);
+ avl_remove(&dl->dl_tree, dle);
+ bpobj_close(&dle->dle_bpobj);
+ kmem_free(dle, sizeof (*dle));
+ dle = dle_next;
+ }
+ mutex_exit(&dl->dl_lock);
+}
diff --git a/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/dsl_deleg.c b/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/dsl_deleg.c
new file mode 100644
index 000000000000..0ad658f910ec
--- /dev/null
+++ b/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/dsl_deleg.c
@@ -0,0 +1,760 @@
+/*
+ * CDDL HEADER START
+ *
+ * The contents of this file are subject to the terms of the
+ * Common Development and Distribution License (the "License").
+ * You may not use this file except in compliance with the License.
+ *
+ * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
+ * or http://www.opensolaris.org/os/licensing.
+ * See the License for the specific language governing permissions
+ * and limitations under the License.
+ *
+ * When distributing Covered Code, include this CDDL HEADER in each
+ * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
+ * If applicable, add the following below this CDDL HEADER, with the
+ * fields enclosed by brackets "[]" replaced with your own identifying
+ * information: Portions Copyright [yyyy] [name of copyright owner]
+ *
+ * CDDL HEADER END
+ */
+/*
+ * Copyright (c) 2007, 2010, Oracle and/or its affiliates. All rights reserved.
+ * Copyright (c) 2011, 2015 by Delphix. All rights reserved.
+ */
+
+/*
+ * DSL permissions are stored in a two level zap attribute
+ * mechanism. The first level identifies the "class" of
+ * entry. The class is identified by the first 2 letters of
+ * the attribute. The second letter "l" or "d" identifies whether
+ * it is a local or descendent permission. The first letter
+ * identifies the type of entry.
+ *
+ * ul$<id> identifies permissions granted locally for this userid.
+ * ud$<id> identifies permissions granted on descendent datasets for
+ * this userid.
+ * Ul$<id> identifies permission sets granted locally for this userid.
+ * Ud$<id> identifies permission sets granted on descendent datasets for
+ * this userid.
+ * gl$<id> identifies permissions granted locally for this groupid.
+ * gd$<id> identifies permissions granted on descendent datasets for
+ * this groupid.
+ * Gl$<id> identifies permission sets granted locally for this groupid.
+ * Gd$<id> identifies permission sets granted on descendent datasets for
+ * this groupid.
+ * el$ identifies permissions granted locally for everyone.
+ * ed$ identifies permissions granted on descendent datasets
+ * for everyone.
+ * El$ identifies permission sets granted locally for everyone.
+ * Ed$ identifies permission sets granted to descendent datasets for
+ * everyone.
+ * c-$ identifies permission to create at dataset creation time.
+ * C-$ identifies permission sets to grant locally at dataset creation
+ * time.
+ * s-$@<name> permissions defined in specified set @<name>
+ * S-$@<name> Sets defined in named set @<name>
+ *
+ * Each of the above entities points to another zap attribute that contains one
+ * attribute for each allowed permission, such as create, destroy,...
+ * All of the "upper" case class types will specify permission set names
+ * rather than permissions.
+ *
+ * Basically it looks something like this:
+ * ul$12 -> ZAP OBJ -> permissions...
+ *
+ * The ZAP OBJ is referred to as the jump object.
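+ *
+ * For example (editorial sketch; the id is hypothetical): granting
+ * uid 1001 "create,mount" locally and the set @backup on descendents
+ * yields entries like
+ *
+ *	ul$1001 -> { create = 0, mount = 0 }
+ *	Ud$1001 -> { @backup = 0 }
+ *
+ * where each jump object's attribute names are the permissions and
+ * the 8-byte values are unused (dsl_deleg_set_sync() writes them
+ * as 0).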
+ */
+
+#include <sys/dmu.h>
+#include <sys/dmu_objset.h>
+#include <sys/dmu_tx.h>
+#include <sys/dsl_dataset.h>
+#include <sys/dsl_dir.h>
+#include <sys/dsl_prop.h>
+#include <sys/dsl_synctask.h>
+#include <sys/dsl_deleg.h>
+#include <sys/spa.h>
+#include <sys/zap.h>
+#include <sys/fs/zfs.h>
+#include <sys/cred.h>
+#include <sys/sunddi.h>
+
+#include "zfs_deleg.h"
+
+/*
+ * Validate that user is allowed to delegate specified permissions.
+ *
+ * In order to delegate "create" you must have "create"
+ * and "allow".
+ */
+int
+dsl_deleg_can_allow(char *ddname, nvlist_t *nvp, cred_t *cr)
+{
+ nvpair_t *whopair = NULL;
+ int error;
+
+ if ((error = dsl_deleg_access(ddname, ZFS_DELEG_PERM_ALLOW, cr)) != 0)
+ return (error);
+
+ while (whopair = nvlist_next_nvpair(nvp, whopair)) {
+ nvlist_t *perms;
+ nvpair_t *permpair = NULL;
+
+ VERIFY(nvpair_value_nvlist(whopair, &perms) == 0);
+
+ while (permpair = nvlist_next_nvpair(perms, permpair)) {
+ const char *perm = nvpair_name(permpair);
+
+ if (strcmp(perm, ZFS_DELEG_PERM_ALLOW) == 0)
+ return (SET_ERROR(EPERM));
+
+ if ((error = dsl_deleg_access(ddname, perm, cr)) != 0)
+ return (error);
+ }
+ }
+ return (0);
+}
+
+/*
+ * Validate that user is allowed to unallow specified permissions. They
+ * must have the 'allow' permission, and even then can only unallow
+ * perms for their uid.
+ */
+int
+dsl_deleg_can_unallow(char *ddname, nvlist_t *nvp, cred_t *cr)
+{
+ nvpair_t *whopair = NULL;
+ int error;
+ char idstr[32];
+
+ if ((error = dsl_deleg_access(ddname, ZFS_DELEG_PERM_ALLOW, cr)) != 0)
+ return (error);
+
+ (void) snprintf(idstr, sizeof (idstr), "%lld",
+ (longlong_t)crgetuid(cr));
+
+ while (whopair = nvlist_next_nvpair(nvp, whopair)) {
+ zfs_deleg_who_type_t type = nvpair_name(whopair)[0];
+
+ if (type != ZFS_DELEG_USER &&
+ type != ZFS_DELEG_USER_SETS)
+ return (SET_ERROR(EPERM));
+
+ if (strcmp(idstr, &nvpair_name(whopair)[3]) != 0)
+ return (SET_ERROR(EPERM));
+ }
+ return (0);
+}
+
+typedef struct dsl_deleg_arg {
+ const char *dda_name;
+ nvlist_t *dda_nvlist;
+} dsl_deleg_arg_t;
+
+static void
+dsl_deleg_set_sync(void *arg, dmu_tx_t *tx)
+{
+ dsl_deleg_arg_t *dda = arg;
+ dsl_dir_t *dd;
+ dsl_pool_t *dp = dmu_tx_pool(tx);
+ objset_t *mos = dp->dp_meta_objset;
+ nvpair_t *whopair = NULL;
+ uint64_t zapobj;
+
+ VERIFY0(dsl_dir_hold(dp, dda->dda_name, FTAG, &dd, NULL));
+
+ zapobj = dsl_dir_phys(dd)->dd_deleg_zapobj;
+ if (zapobj == 0) {
+ dmu_buf_will_dirty(dd->dd_dbuf, tx);
+ zapobj = dsl_dir_phys(dd)->dd_deleg_zapobj = zap_create(mos,
+ DMU_OT_DSL_PERMS, DMU_OT_NONE, 0, tx);
+ }
+
+ while (whopair = nvlist_next_nvpair(dda->dda_nvlist, whopair)) {
+ const char *whokey = nvpair_name(whopair);
+ nvlist_t *perms;
+ nvpair_t *permpair = NULL;
+ uint64_t jumpobj;
+
+ perms = fnvpair_value_nvlist(whopair);
+
+ if (zap_lookup(mos, zapobj, whokey, 8, 1, &jumpobj) != 0) {
+ jumpobj = zap_create_link(mos, DMU_OT_DSL_PERMS,
+ zapobj, whokey, tx);
+ }
+
+ while (permpair = nvlist_next_nvpair(perms, permpair)) {
+ const char *perm = nvpair_name(permpair);
+ uint64_t n = 0;
+
+ VERIFY(zap_update(mos, jumpobj,
+ perm, 8, 1, &n, tx) == 0);
+ spa_history_log_internal_dd(dd, "permission update", tx,
+ "%s %s", whokey, perm);
+ }
+ }
+ dsl_dir_rele(dd, FTAG);
+}
+
+static void
+dsl_deleg_unset_sync(void *arg, dmu_tx_t *tx)
+{
+ dsl_deleg_arg_t *dda = arg;
+ dsl_dir_t *dd;
+ dsl_pool_t *dp = dmu_tx_pool(tx);
+ objset_t *mos = dp->dp_meta_objset;
+ nvpair_t *whopair = NULL;
+ uint64_t zapobj;
+
+ VERIFY0(dsl_dir_hold(dp, dda->dda_name, FTAG, &dd, NULL));
+ zapobj = dsl_dir_phys(dd)->dd_deleg_zapobj;
+ if (zapobj == 0) {
+ dsl_dir_rele(dd, FTAG);
+ return;
+ }
+
+ while (whopair = nvlist_next_nvpair(dda->dda_nvlist, whopair)) {
+ const char *whokey = nvpair_name(whopair);
+ nvlist_t *perms;
+ nvpair_t *permpair = NULL;
+ uint64_t jumpobj;
+
+ if (nvpair_value_nvlist(whopair, &perms) != 0) {
+ if (zap_lookup(mos, zapobj, whokey, 8,
+ 1, &jumpobj) == 0) {
+ (void) zap_remove(mos, zapobj, whokey, tx);
+ VERIFY(0 == zap_destroy(mos, jumpobj, tx));
+ }
+ spa_history_log_internal_dd(dd, "permission who remove",
+ tx, "%s", whokey);
+ continue;
+ }
+
+ if (zap_lookup(mos, zapobj, whokey, 8, 1, &jumpobj) != 0)
+ continue;
+
+ while (permpair = nvlist_next_nvpair(perms, permpair)) {
+ const char *perm = nvpair_name(permpair);
+ uint64_t n = 0;
+
+ (void) zap_remove(mos, jumpobj, perm, tx);
+ if (zap_count(mos, jumpobj, &n) == 0 && n == 0) {
+ (void) zap_remove(mos, zapobj,
+ whokey, tx);
+ VERIFY(0 == zap_destroy(mos,
+ jumpobj, tx));
+ }
+ spa_history_log_internal_dd(dd, "permission remove", tx,
+ "%s %s", whokey, perm);
+ }
+ }
+ dsl_dir_rele(dd, FTAG);
+}
+
+static int
+dsl_deleg_check(void *arg, dmu_tx_t *tx)
+{
+ dsl_deleg_arg_t *dda = arg;
+ dsl_dir_t *dd;
+ int error;
+
+ if (spa_version(dmu_tx_pool(tx)->dp_spa) <
+ SPA_VERSION_DELEGATED_PERMS) {
+ return (SET_ERROR(ENOTSUP));
+ }
+
+ error = dsl_dir_hold(dmu_tx_pool(tx), dda->dda_name, FTAG, &dd, NULL);
+ if (error == 0)
+ dsl_dir_rele(dd, FTAG);
+ return (error);
+}
+
+int
+dsl_deleg_set(const char *ddname, nvlist_t *nvp, boolean_t unset)
+{
+ dsl_deleg_arg_t dda;
+
+ /* nvp must already have been verified to be valid */
+
+ dda.dda_name = ddname;
+ dda.dda_nvlist = nvp;
+
+ return (dsl_sync_task(ddname, dsl_deleg_check,
+ unset ? dsl_deleg_unset_sync : dsl_deleg_set_sync,
+ &dda, fnvlist_num_pairs(nvp), ZFS_SPACE_CHECK_RESERVED));
+}
+
+/*
+ * Find all 'allow' permissions from a given point and then continue
+ * traversing up to the root.
+ *
+ * This function constructs an nvlist of nvlists: each setpoint maps
+ * to an nvlist of whokeys, each of which maps to an nvlist of the
+ * individual user/group/everyone/create permissions.
+ *
+ * The nvlist will look like this.
+ *
+ * { source fsname -> { whokeys { permissions,...}, ...}}
+ *
+ * The fsname nvpairs will be arranged in a bottom up order. For example,
+ * if we have the following structure a/b/c then the nvpairs for the fsnames
+ * will be ordered a/b/c, a/b, a.
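+ *
+ * For example (editorial sketch; names and ids hypothetical), a call
+ * on "pool/a/b" with delegations at both levels might return:
+ *
+ *	{ "pool/a/b" -> { "ul$1001" -> { create, mount } },
+ *	  "pool/a"   -> { "ud$1001" -> { destroy } } }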
+ */
+int
+dsl_deleg_get(const char *ddname, nvlist_t **nvp)
+{
+ dsl_dir_t *dd, *startdd;
+ dsl_pool_t *dp;
+ int error;
+ objset_t *mos;
+
+ error = dsl_pool_hold(ddname, FTAG, &dp);
+ if (error != 0)
+ return (error);
+
+ error = dsl_dir_hold(dp, ddname, FTAG, &startdd, NULL);
+ if (error != 0) {
+ dsl_pool_rele(dp, FTAG);
+ return (error);
+ }
+
+ dp = startdd->dd_pool;
+ mos = dp->dp_meta_objset;
+
+ VERIFY(nvlist_alloc(nvp, NV_UNIQUE_NAME, KM_SLEEP) == 0);
+
+ for (dd = startdd; dd != NULL; dd = dd->dd_parent) {
+ zap_cursor_t basezc;
+ zap_attribute_t baseza;
+ nvlist_t *sp_nvp;
+ uint64_t n;
+ char source[ZFS_MAX_DATASET_NAME_LEN];
+
+ if (dsl_dir_phys(dd)->dd_deleg_zapobj == 0 ||
+ zap_count(mos,
+ dsl_dir_phys(dd)->dd_deleg_zapobj, &n) != 0 || n == 0)
+ continue;
+
+ sp_nvp = fnvlist_alloc();
+ for (zap_cursor_init(&basezc, mos,
+ dsl_dir_phys(dd)->dd_deleg_zapobj);
+ zap_cursor_retrieve(&basezc, &baseza) == 0;
+ zap_cursor_advance(&basezc)) {
+ zap_cursor_t zc;
+ zap_attribute_t za;
+ nvlist_t *perms_nvp;
+
+ ASSERT(baseza.za_integer_length == 8);
+ ASSERT(baseza.za_num_integers == 1);
+
+ perms_nvp = fnvlist_alloc();
+ for (zap_cursor_init(&zc, mos, baseza.za_first_integer);
+ zap_cursor_retrieve(&zc, &za) == 0;
+ zap_cursor_advance(&zc)) {
+ fnvlist_add_boolean(perms_nvp, za.za_name);
+ }
+ zap_cursor_fini(&zc);
+ fnvlist_add_nvlist(sp_nvp, baseza.za_name, perms_nvp);
+ fnvlist_free(perms_nvp);
+ }
+
+ zap_cursor_fini(&basezc);
+
+ dsl_dir_name(dd, source);
+ fnvlist_add_nvlist(*nvp, source, sp_nvp);
+ nvlist_free(sp_nvp);
+ }
+
+ dsl_dir_rele(startdd, FTAG);
+ dsl_pool_rele(dp, FTAG);
+ return (0);
+}
+
+/*
+ * Routines for dsl_deleg_access() -- access checking.
+ */
+typedef struct perm_set {
+ avl_node_t p_node;
+ boolean_t p_matched;
+ char p_setname[ZFS_MAX_DELEG_NAME];
+} perm_set_t;
+
+static int
+perm_set_compare(const void *arg1, const void *arg2)
+{
+ const perm_set_t *node1 = (const perm_set_t *)arg1;
+ const perm_set_t *node2 = (const perm_set_t *)arg2;
+ int val;
+
+ val = strcmp(node1->p_setname, node2->p_setname);
+
+ return (AVL_ISIGN(val));
+}
+
+/*
+ * Determine whether a specified permission exists.
+ *
+ * First the base attribute has to be retrieved, e.g. ul$12.
+ * Once the base object has been retrieved, the actual permission
+ * is looked up in the zap object the base object points to.
+ *
+ * Return 0 if permission exists, ENOENT if there is no whokey, EPERM if
+ * there is no perm in that jumpobj.
+ */
+static int
+dsl_check_access(objset_t *mos, uint64_t zapobj,
+ char type, char checkflag, void *valp, const char *perm)
+{
+ int error;
+ uint64_t jumpobj, zero;
+ char whokey[ZFS_MAX_DELEG_NAME];
+
+ zfs_deleg_whokey(whokey, type, checkflag, valp);
+ error = zap_lookup(mos, zapobj, whokey, 8, 1, &jumpobj);
+ if (error == 0) {
+ error = zap_lookup(mos, jumpobj, perm, 8, 1, &zero);
+ if (error == ENOENT)
+ error = SET_ERROR(EPERM);
+ }
+ return (error);
+}
+
+/*
+ * check a specified user/group for a requested permission
+ */
+static int
+dsl_check_user_access(objset_t *mos, uint64_t zapobj, const char *perm,
+ int checkflag, cred_t *cr)
+{
+ const gid_t *gids;
+ int ngids;
+ int i;
+ uint64_t id;
+
+ /* check for user */
+ id = crgetuid(cr);
+ if (dsl_check_access(mos, zapobj,
+ ZFS_DELEG_USER, checkflag, &id, perm) == 0)
+ return (0);
+
+ /* check for user's primary group */
+ id = crgetgid(cr);
+ if (dsl_check_access(mos, zapobj,
+ ZFS_DELEG_GROUP, checkflag, &id, perm) == 0)
+ return (0);
+
+ /* check for everyone entry */
+ id = -1;
+ if (dsl_check_access(mos, zapobj,
+ ZFS_DELEG_EVERYONE, checkflag, &id, perm) == 0)
+ return (0);
+
+ /* check each supplemental group user is a member of */
+ ngids = crgetngroups(cr);
+ gids = crgetgroups(cr);
+ for (i = 0; i != ngids; i++) {
+ id = gids[i];
+ if (dsl_check_access(mos, zapobj,
+ ZFS_DELEG_GROUP, checkflag, &id, perm) == 0)
+ return (0);
+ }
+
+ return (SET_ERROR(EPERM));
+}
+
+/*
+ * Iterate over the sets listed in the specified zapobj
+ * and load them into the permsets avl tree.
+ */
+static int
+dsl_load_sets(objset_t *mos, uint64_t zapobj,
+ char type, char checkflag, void *valp, avl_tree_t *avl)
+{
+ zap_cursor_t zc;
+ zap_attribute_t za;
+ perm_set_t *permnode;
+ avl_index_t idx;
+ uint64_t jumpobj;
+ int error;
+ char whokey[ZFS_MAX_DELEG_NAME];
+
+ zfs_deleg_whokey(whokey, type, checkflag, valp);
+
+ error = zap_lookup(mos, zapobj, whokey, 8, 1, &jumpobj);
+ if (error != 0)
+ return (error);
+
+ for (zap_cursor_init(&zc, mos, jumpobj);
+ zap_cursor_retrieve(&zc, &za) == 0;
+ zap_cursor_advance(&zc)) {
+ permnode = kmem_alloc(sizeof (perm_set_t), KM_SLEEP);
+ (void) strlcpy(permnode->p_setname, za.za_name,
+ sizeof (permnode->p_setname));
+ permnode->p_matched = B_FALSE;
+
+ if (avl_find(avl, permnode, &idx) == NULL) {
+ avl_insert(avl, permnode, idx);
+ } else {
+ kmem_free(permnode, sizeof (perm_set_t));
+ }
+ }
+ zap_cursor_fini(&zc);
+ return (0);
+}
+
+/*
+ * Load all the permission sets that apply to the user, based on cred.
+ */
+static void
+dsl_load_user_sets(objset_t *mos, uint64_t zapobj, avl_tree_t *avl,
+ char checkflag, cred_t *cr)
+{
+ const gid_t *gids;
+ int ngids, i;
+ uint64_t id;
+
+ id = crgetuid(cr);
+ (void) dsl_load_sets(mos, zapobj,
+ ZFS_DELEG_USER_SETS, checkflag, &id, avl);
+
+ id = crgetgid(cr);
+ (void) dsl_load_sets(mos, zapobj,
+ ZFS_DELEG_GROUP_SETS, checkflag, &id, avl);
+
+ (void) dsl_load_sets(mos, zapobj,
+ ZFS_DELEG_EVERYONE_SETS, checkflag, NULL, avl);
+
+ ngids = crgetngroups(cr);
+ gids = crgetgroups(cr);
+ for (i = 0; i != ngids; i++) {
+ id = gids[i];
+ (void) dsl_load_sets(mos, zapobj,
+ ZFS_DELEG_GROUP_SETS, checkflag, &id, avl);
+ }
+}
+
+/*
+ * Check if user has requested permission.
+ */
+int
+dsl_deleg_access_impl(dsl_dataset_t *ds, const char *perm, cred_t *cr)
+{
+ dsl_dir_t *dd;
+ dsl_pool_t *dp;
+ void *cookie;
+ int error;
+ char checkflag;
+ objset_t *mos;
+ avl_tree_t permsets;
+ perm_set_t *setnode;
+
+ dp = ds->ds_dir->dd_pool;
+ mos = dp->dp_meta_objset;
+
+ if (dsl_delegation_on(mos) == B_FALSE)
+ return (SET_ERROR(ECANCELED));
+
+ if (spa_version(dmu_objset_spa(dp->dp_meta_objset)) <
+ SPA_VERSION_DELEGATED_PERMS)
+ return (SET_ERROR(EPERM));
+
+ if (ds->ds_is_snapshot) {
+ /*
+ * Snapshots are treated as descendents only;
+ * local permissions do not apply.
+ */
+ checkflag = ZFS_DELEG_DESCENDENT;
+ } else {
+ checkflag = ZFS_DELEG_LOCAL;
+ }
+
+ avl_create(&permsets, perm_set_compare, sizeof (perm_set_t),
+ offsetof(perm_set_t, p_node));
+
+ ASSERT(dsl_pool_config_held(dp));
+ for (dd = ds->ds_dir; dd != NULL; dd = dd->dd_parent,
+ checkflag = ZFS_DELEG_DESCENDENT) {
+ uint64_t zapobj;
+ boolean_t expanded;
+
+ /*
+ * If not in global zone then make sure
+ * the zoned property is set
+ */
+ if (!INGLOBALZONE(curthread)) {
+ uint64_t zoned;
+
+ if (dsl_prop_get_dd(dd,
+ zfs_prop_to_name(ZFS_PROP_ZONED),
+ 8, 1, &zoned, NULL, B_FALSE) != 0)
+ break;
+ if (!zoned)
+ break;
+ }
+ zapobj = dsl_dir_phys(dd)->dd_deleg_zapobj;
+
+ if (zapobj == 0)
+ continue;
+
+ dsl_load_user_sets(mos, zapobj, &permsets, checkflag, cr);
+again:
+ expanded = B_FALSE;
+ for (setnode = avl_first(&permsets); setnode;
+ setnode = AVL_NEXT(&permsets, setnode)) {
+ if (setnode->p_matched == B_TRUE)
+ continue;
+
+ /* See if this set directly grants this permission */
+ error = dsl_check_access(mos, zapobj,
+ ZFS_DELEG_NAMED_SET, 0, setnode->p_setname, perm);
+ if (error == 0)
+ goto success;
+ if (error == EPERM)
+ setnode->p_matched = B_TRUE;
+
+ /* See if this set includes other sets */
+ error = dsl_load_sets(mos, zapobj,
+ ZFS_DELEG_NAMED_SET_SETS, 0,
+ setnode->p_setname, &permsets);
+ if (error == 0)
+ setnode->p_matched = expanded = B_TRUE;
+ }
+ /*
+ * If we expanded any sets, that will define more sets,
+ * which we need to check.
+ */
+ if (expanded)
+ goto again;
+
+ error = dsl_check_user_access(mos, zapobj, perm, checkflag, cr);
+ if (error == 0)
+ goto success;
+ }
+ error = SET_ERROR(EPERM);
+success:
+
+ cookie = NULL;
+ while ((setnode = avl_destroy_nodes(&permsets, &cookie)) != NULL)
+ kmem_free(setnode, sizeof (perm_set_t));
+
+ return (error);
+}
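+
+/*
+ * Illustrative sketch of the set expansion above (editorial addition;
+ * set names hypothetical): suppose Ul$<uid> lists @ops, S-$@ops lists
+ * @basic, and s-$@basic grants the requested perm.  The first pass
+ * fails to match @ops directly but expands it to @basic; the second
+ * pass ("goto again") matches @basic and access is granted.
+ */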
+
+int
+dsl_deleg_access(const char *dsname, const char *perm, cred_t *cr)
+{
+ dsl_pool_t *dp;
+ dsl_dataset_t *ds;
+ int error;
+
+ error = dsl_pool_hold(dsname, FTAG, &dp);
+ if (error != 0)
+ return (error);
+ error = dsl_dataset_hold(dp, dsname, FTAG, &ds);
+ if (error == 0) {
+ error = dsl_deleg_access_impl(ds, perm, cr);
+ dsl_dataset_rele(ds, FTAG);
+ }
+ dsl_pool_rele(dp, FTAG);
+
+ return (error);
+}
+
+/*
+ * Other routines.
+ */
+
+static void
+copy_create_perms(dsl_dir_t *dd, uint64_t pzapobj,
+ boolean_t dosets, uint64_t uid, dmu_tx_t *tx)
+{
+ objset_t *mos = dd->dd_pool->dp_meta_objset;
+ uint64_t jumpobj, pjumpobj;
+ uint64_t zapobj = dsl_dir_phys(dd)->dd_deleg_zapobj;
+ zap_cursor_t zc;
+ zap_attribute_t za;
+ char whokey[ZFS_MAX_DELEG_NAME];
+
+ zfs_deleg_whokey(whokey,
+ dosets ? ZFS_DELEG_CREATE_SETS : ZFS_DELEG_CREATE,
+ ZFS_DELEG_LOCAL, NULL);
+ if (zap_lookup(mos, pzapobj, whokey, 8, 1, &pjumpobj) != 0)
+ return;
+
+ if (zapobj == 0) {
+ dmu_buf_will_dirty(dd->dd_dbuf, tx);
+ zapobj = dsl_dir_phys(dd)->dd_deleg_zapobj = zap_create(mos,
+ DMU_OT_DSL_PERMS, DMU_OT_NONE, 0, tx);
+ }
+
+ zfs_deleg_whokey(whokey,
+ dosets ? ZFS_DELEG_USER_SETS : ZFS_DELEG_USER,
+ ZFS_DELEG_LOCAL, &uid);
+ if (zap_lookup(mos, zapobj, whokey, 8, 1, &jumpobj) == ENOENT) {
+ jumpobj = zap_create(mos, DMU_OT_DSL_PERMS, DMU_OT_NONE, 0, tx);
+ VERIFY(zap_add(mos, zapobj, whokey, 8, 1, &jumpobj, tx) == 0);
+ }
+
+ for (zap_cursor_init(&zc, mos, pjumpobj);
+ zap_cursor_retrieve(&zc, &za) == 0;
+ zap_cursor_advance(&zc)) {
+ uint64_t zero = 0;
+ ASSERT(za.za_integer_length == 8 && za.za_num_integers == 1);
+
+ VERIFY(zap_update(mos, jumpobj, za.za_name,
+ 8, 1, &zero, tx) == 0);
+ }
+ zap_cursor_fini(&zc);
+}
+
+/*
+ * Set all create-time permissions on the new dataset.
+ */
+void
+dsl_deleg_set_create_perms(dsl_dir_t *sdd, dmu_tx_t *tx, cred_t *cr)
+{
+ dsl_dir_t *dd;
+ uint64_t uid = crgetuid(cr);
+
+ if (spa_version(dmu_objset_spa(sdd->dd_pool->dp_meta_objset)) <
+ SPA_VERSION_DELEGATED_PERMS)
+ return;
+
+ for (dd = sdd->dd_parent; dd != NULL; dd = dd->dd_parent) {
+ uint64_t pzapobj = dsl_dir_phys(dd)->dd_deleg_zapobj;
+
+ if (pzapobj == 0)
+ continue;
+
+ copy_create_perms(sdd, pzapobj, B_FALSE, uid, tx);
+ copy_create_perms(sdd, pzapobj, B_TRUE, uid, tx);
+ }
+}
+
+int
+dsl_deleg_destroy(objset_t *mos, uint64_t zapobj, dmu_tx_t *tx)
+{
+ zap_cursor_t zc;
+ zap_attribute_t za;
+
+ if (zapobj == 0)
+ return (0);
+
+ for (zap_cursor_init(&zc, mos, zapobj);
+ zap_cursor_retrieve(&zc, &za) == 0;
+ zap_cursor_advance(&zc)) {
+ ASSERT(za.za_integer_length == 8 && za.za_num_integers == 1);
+ VERIFY(0 == zap_destroy(mos, za.za_first_integer, tx));
+ }
+ zap_cursor_fini(&zc);
+ VERIFY(0 == zap_destroy(mos, zapobj, tx));
+ return (0);
+}
+
+boolean_t
+dsl_delegation_on(objset_t *os)
+{
+ return (!!spa_delegation(os->os_spa));
+}
diff --git a/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/dsl_destroy.c b/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/dsl_destroy.c
new file mode 100644
index 000000000000..41b016a1d8ae
--- /dev/null
+++ b/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/dsl_destroy.c
@@ -0,0 +1,1097 @@
+/*
+ * CDDL HEADER START
+ *
+ * The contents of this file are subject to the terms of the
+ * Common Development and Distribution License (the "License").
+ * You may not use this file except in compliance with the License.
+ *
+ * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
+ * or http://www.opensolaris.org/os/licensing.
+ * See the License for the specific language governing permissions
+ * and limitations under the License.
+ *
+ * When distributing Covered Code, include this CDDL HEADER in each
+ * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
+ * If applicable, add the following below this CDDL HEADER, with the
+ * fields enclosed by brackets "[]" replaced with your own identifying
+ * information: Portions Copyright [yyyy] [name of copyright owner]
+ *
+ * CDDL HEADER END
+ */
+/*
+ * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved.
+ * Copyright (c) 2012, 2017 by Delphix. All rights reserved.
+ * Copyright (c) 2013 Steven Hartland. All rights reserved.
+ * Copyright (c) 2013 by Joyent, Inc. All rights reserved.
+ * Copyright (c) 2014 Integros [integros.com]
+ */
+
+#include <sys/zfs_context.h>
+#include <sys/dsl_userhold.h>
+#include <sys/dsl_dataset.h>
+#include <sys/dsl_synctask.h>
+#include <sys/dsl_destroy.h>
+#include <sys/dmu_tx.h>
+#include <sys/dsl_pool.h>
+#include <sys/dsl_dir.h>
+#include <sys/dmu_traverse.h>
+#include <sys/dsl_scan.h>
+#include <sys/dmu_objset.h>
+#include <sys/zap.h>
+#include <sys/zfeature.h>
+#include <sys/zfs_ioctl.h>
+#include <sys/dsl_deleg.h>
+#include <sys/dmu_impl.h>
+#include <sys/zcp.h>
+#if defined(__FreeBSD__) && defined(_KERNEL)
+#include <sys/zvol.h>
+#endif
+
+
+int
+dsl_destroy_snapshot_check_impl(dsl_dataset_t *ds, boolean_t defer)
+{
+ if (!ds->ds_is_snapshot)
+ return (SET_ERROR(EINVAL));
+
+ if (dsl_dataset_long_held(ds))
+ return (SET_ERROR(EBUSY));
+
+ /*
+ * Only allow deferred destroy on pools that support it.
+ * NOTE: deferred destroy is only supported on snapshots.
+ */
+ if (defer) {
+ if (spa_version(ds->ds_dir->dd_pool->dp_spa) <
+ SPA_VERSION_USERREFS)
+ return (SET_ERROR(ENOTSUP));
+ return (0);
+ }
+
+ /*
+ * If this snapshot has an elevated user reference count,
+ * we can't destroy it yet.
+ */
+ if (ds->ds_userrefs > 0)
+ return (SET_ERROR(EBUSY));
+
+ /*
+ * Can't delete a branch point.
+ */
+ if (dsl_dataset_phys(ds)->ds_num_children > 1)
+ return (SET_ERROR(EEXIST));
+
+ return (0);
+}
+
+int
+dsl_destroy_snapshot_check(void *arg, dmu_tx_t *tx)
+{
+ dsl_destroy_snapshot_arg_t *ddsa = arg;
+ const char *dsname = ddsa->ddsa_name;
+ boolean_t defer = ddsa->ddsa_defer;
+
+ dsl_pool_t *dp = dmu_tx_pool(tx);
+ int error = 0;
+ dsl_dataset_t *ds;
+
+ error = dsl_dataset_hold(dp, dsname, FTAG, &ds);
+
+ /*
+ * If the snapshot does not exist, silently ignore it, and
+ * dsl_destroy_snapshot_sync() will be a no-op
+ * (it's "already destroyed").
+ */
+ if (error == ENOENT)
+ return (0);
+
+ if (error == 0) {
+ error = dsl_destroy_snapshot_check_impl(ds, defer);
+ dsl_dataset_rele(ds, FTAG);
+ }
+
+ return (error);
+}
+
+struct process_old_arg {
+ dsl_dataset_t *ds;
+ dsl_dataset_t *ds_prev;
+ boolean_t after_branch_point;
+ zio_t *pio;
+ uint64_t used, comp, uncomp;
+};
+
+static int
+process_old_cb(void *arg, const blkptr_t *bp, dmu_tx_t *tx)
+{
+ struct process_old_arg *poa = arg;
+ dsl_pool_t *dp = poa->ds->ds_dir->dd_pool;
+
+ ASSERT(!BP_IS_HOLE(bp));
+
+ if (bp->blk_birth <= dsl_dataset_phys(poa->ds)->ds_prev_snap_txg) {
+ dsl_deadlist_insert(&poa->ds->ds_deadlist, bp, tx);
+ if (poa->ds_prev && !poa->after_branch_point &&
+ bp->blk_birth >
+ dsl_dataset_phys(poa->ds_prev)->ds_prev_snap_txg) {
+ dsl_dataset_phys(poa->ds_prev)->ds_unique_bytes +=
+ bp_get_dsize_sync(dp->dp_spa, bp);
+ }
+ } else {
+ poa->used += bp_get_dsize_sync(dp->dp_spa, bp);
+ poa->comp += BP_GET_PSIZE(bp);
+ poa->uncomp += BP_GET_UCSIZE(bp);
+ dsl_free_sync(poa->pio, dp, tx->tx_txg, bp);
+ }
+ return (0);
+}
+
+static void
+process_old_deadlist(dsl_dataset_t *ds, dsl_dataset_t *ds_prev,
+ dsl_dataset_t *ds_next, boolean_t after_branch_point, dmu_tx_t *tx)
+{
+ struct process_old_arg poa = { 0 };
+ dsl_pool_t *dp = ds->ds_dir->dd_pool;
+ objset_t *mos = dp->dp_meta_objset;
+ uint64_t deadlist_obj;
+
+ ASSERT(ds->ds_deadlist.dl_oldfmt);
+ ASSERT(ds_next->ds_deadlist.dl_oldfmt);
+
+ poa.ds = ds;
+ poa.ds_prev = ds_prev;
+ poa.after_branch_point = after_branch_point;
+ poa.pio = zio_root(dp->dp_spa, NULL, NULL, ZIO_FLAG_MUSTSUCCEED);
+ VERIFY0(bpobj_iterate(&ds_next->ds_deadlist.dl_bpobj,
+ process_old_cb, &poa, tx));
+ VERIFY0(zio_wait(poa.pio));
+ ASSERT3U(poa.used, ==, dsl_dataset_phys(ds)->ds_unique_bytes);
+
+ /* change snapused */
+ dsl_dir_diduse_space(ds->ds_dir, DD_USED_SNAP,
+ -poa.used, -poa.comp, -poa.uncomp, tx);
+
+ /* swap next's deadlist to our deadlist */
+ dsl_deadlist_close(&ds->ds_deadlist);
+ dsl_deadlist_close(&ds_next->ds_deadlist);
+ deadlist_obj = dsl_dataset_phys(ds)->ds_deadlist_obj;
+ dsl_dataset_phys(ds)->ds_deadlist_obj =
+ dsl_dataset_phys(ds_next)->ds_deadlist_obj;
+ dsl_dataset_phys(ds_next)->ds_deadlist_obj = deadlist_obj;
+ dsl_deadlist_open(&ds->ds_deadlist, mos,
+ dsl_dataset_phys(ds)->ds_deadlist_obj);
+ dsl_deadlist_open(&ds_next->ds_deadlist, mos,
+ dsl_dataset_phys(ds_next)->ds_deadlist_obj);
+}
+
+static void
+dsl_dataset_remove_clones_key(dsl_dataset_t *ds, uint64_t mintxg, dmu_tx_t *tx)
+{
+ objset_t *mos = ds->ds_dir->dd_pool->dp_meta_objset;
+ zap_cursor_t zc;
+ zap_attribute_t za;
+
+ /*
+ * If it is the old version, dd_clones doesn't exist so we can't
+ * find the clones, but dsl_deadlist_remove_key() is a no-op so it
+ * doesn't matter.
+ */
+ if (dsl_dir_phys(ds->ds_dir)->dd_clones == 0)
+ return;
+
+ for (zap_cursor_init(&zc, mos, dsl_dir_phys(ds->ds_dir)->dd_clones);
+ zap_cursor_retrieve(&zc, &za) == 0;
+ zap_cursor_advance(&zc)) {
+ dsl_dataset_t *clone;
+
+ VERIFY0(dsl_dataset_hold_obj(ds->ds_dir->dd_pool,
+ za.za_first_integer, FTAG, &clone));
+ if (clone->ds_dir->dd_origin_txg > mintxg) {
+ dsl_deadlist_remove_key(&clone->ds_deadlist,
+ mintxg, tx);
+ if (dsl_dataset_remap_deadlist_exists(clone)) {
+ dsl_deadlist_remove_key(
+ &clone->ds_remap_deadlist, mintxg, tx);
+ }
+ dsl_dataset_remove_clones_key(clone, mintxg, tx);
+ }
+ dsl_dataset_rele(clone, FTAG);
+ }
+ zap_cursor_fini(&zc);
+}
+
+static void
+dsl_destroy_snapshot_handle_remaps(dsl_dataset_t *ds, dsl_dataset_t *ds_next,
+ dmu_tx_t *tx)
+{
+ dsl_pool_t *dp = ds->ds_dir->dd_pool;
+
+ /* Move blocks to be obsoleted to pool's obsolete list. */
+ if (dsl_dataset_remap_deadlist_exists(ds_next)) {
+ if (!bpobj_is_open(&dp->dp_obsolete_bpobj))
+ dsl_pool_create_obsolete_bpobj(dp, tx);
+
+ dsl_deadlist_move_bpobj(&ds_next->ds_remap_deadlist,
+ &dp->dp_obsolete_bpobj,
+ dsl_dataset_phys(ds)->ds_prev_snap_txg, tx);
+ }
+
+ /* Merge our deadlist into next's and free it. */
+ if (dsl_dataset_remap_deadlist_exists(ds)) {
+ uint64_t remap_deadlist_object =
+ dsl_dataset_get_remap_deadlist_object(ds);
+ ASSERT(remap_deadlist_object != 0);
+
+ mutex_enter(&ds_next->ds_remap_deadlist_lock);
+ if (!dsl_dataset_remap_deadlist_exists(ds_next))
+ dsl_dataset_create_remap_deadlist(ds_next, tx);
+ mutex_exit(&ds_next->ds_remap_deadlist_lock);
+
+ dsl_deadlist_merge(&ds_next->ds_remap_deadlist,
+ remap_deadlist_object, tx);
+ dsl_dataset_destroy_remap_deadlist(ds, tx);
+ }
+}
+
+void
+dsl_destroy_snapshot_sync_impl(dsl_dataset_t *ds, boolean_t defer, dmu_tx_t *tx)
+{
+ int err;
+ int after_branch_point = FALSE;
+ dsl_pool_t *dp = ds->ds_dir->dd_pool;
+ objset_t *mos = dp->dp_meta_objset;
+ dsl_dataset_t *ds_prev = NULL;
+ uint64_t obj;
+
+ ASSERT(RRW_WRITE_HELD(&dp->dp_config_rwlock));
+ rrw_enter(&ds->ds_bp_rwlock, RW_READER, FTAG);
+ ASSERT3U(dsl_dataset_phys(ds)->ds_bp.blk_birth, <=, tx->tx_txg);
+ rrw_exit(&ds->ds_bp_rwlock, FTAG);
+ ASSERT(zfs_refcount_is_zero(&ds->ds_longholds));
+
+ if (defer &&
+ (ds->ds_userrefs > 0 ||
+ dsl_dataset_phys(ds)->ds_num_children > 1)) {
+ ASSERT(spa_version(dp->dp_spa) >= SPA_VERSION_USERREFS);
+ dmu_buf_will_dirty(ds->ds_dbuf, tx);
+ dsl_dataset_phys(ds)->ds_flags |= DS_FLAG_DEFER_DESTROY;
+ spa_history_log_internal_ds(ds, "defer_destroy", tx, "");
+ return;
+ }
+
+ ASSERT3U(dsl_dataset_phys(ds)->ds_num_children, <=, 1);
+
+ /* We need to log before removing it from the namespace. */
+ spa_history_log_internal_ds(ds, "destroy", tx, "");
+
+ dsl_scan_ds_destroyed(ds, tx);
+
+ obj = ds->ds_object;
+
+ for (spa_feature_t f = 0; f < SPA_FEATURES; f++) {
+ if (ds->ds_feature_inuse[f]) {
+ dsl_dataset_deactivate_feature(obj, f, tx);
+ ds->ds_feature_inuse[f] = B_FALSE;
+ }
+ }
+ if (dsl_dataset_phys(ds)->ds_prev_snap_obj != 0) {
+ ASSERT3P(ds->ds_prev, ==, NULL);
+ VERIFY0(dsl_dataset_hold_obj(dp,
+ dsl_dataset_phys(ds)->ds_prev_snap_obj, FTAG, &ds_prev));
+ after_branch_point =
+ (dsl_dataset_phys(ds_prev)->ds_next_snap_obj != obj);
+
+ dmu_buf_will_dirty(ds_prev->ds_dbuf, tx);
+ if (after_branch_point &&
+ dsl_dataset_phys(ds_prev)->ds_next_clones_obj != 0) {
+ dsl_dataset_remove_from_next_clones(ds_prev, obj, tx);
+ if (dsl_dataset_phys(ds)->ds_next_snap_obj != 0) {
+ VERIFY0(zap_add_int(mos,
+ dsl_dataset_phys(ds_prev)->
+ ds_next_clones_obj,
+ dsl_dataset_phys(ds)->ds_next_snap_obj,
+ tx));
+ }
+ }
+ if (!after_branch_point) {
+ dsl_dataset_phys(ds_prev)->ds_next_snap_obj =
+ dsl_dataset_phys(ds)->ds_next_snap_obj;
+ }
+ }
+
+ dsl_dataset_t *ds_next;
+ uint64_t old_unique;
+ uint64_t used = 0, comp = 0, uncomp = 0;
+
+ VERIFY0(dsl_dataset_hold_obj(dp,
+ dsl_dataset_phys(ds)->ds_next_snap_obj, FTAG, &ds_next));
+ ASSERT3U(dsl_dataset_phys(ds_next)->ds_prev_snap_obj, ==, obj);
+
+ old_unique = dsl_dataset_phys(ds_next)->ds_unique_bytes;
+
+ dmu_buf_will_dirty(ds_next->ds_dbuf, tx);
+ dsl_dataset_phys(ds_next)->ds_prev_snap_obj =
+ dsl_dataset_phys(ds)->ds_prev_snap_obj;
+ dsl_dataset_phys(ds_next)->ds_prev_snap_txg =
+ dsl_dataset_phys(ds)->ds_prev_snap_txg;
+ ASSERT3U(dsl_dataset_phys(ds)->ds_prev_snap_txg, ==,
+ ds_prev ? dsl_dataset_phys(ds_prev)->ds_creation_txg : 0);
+
+ if (ds_next->ds_deadlist.dl_oldfmt) {
+ process_old_deadlist(ds, ds_prev, ds_next,
+ after_branch_point, tx);
+ } else {
+ /* Adjust prev's unique space. */
+ if (ds_prev && !after_branch_point) {
+ dsl_deadlist_space_range(&ds_next->ds_deadlist,
+ dsl_dataset_phys(ds_prev)->ds_prev_snap_txg,
+ dsl_dataset_phys(ds)->ds_prev_snap_txg,
+ &used, &comp, &uncomp);
+ dsl_dataset_phys(ds_prev)->ds_unique_bytes += used;
+ }
+
+ /* Adjust snapused. */
+ dsl_deadlist_space_range(&ds_next->ds_deadlist,
+ dsl_dataset_phys(ds)->ds_prev_snap_txg, UINT64_MAX,
+ &used, &comp, &uncomp);
+ dsl_dir_diduse_space(ds->ds_dir, DD_USED_SNAP,
+ -used, -comp, -uncomp, tx);
+
+ /* Move blocks to be freed to pool's free list. */
+ dsl_deadlist_move_bpobj(&ds_next->ds_deadlist,
+ &dp->dp_free_bpobj, dsl_dataset_phys(ds)->ds_prev_snap_txg,
+ tx);
+ dsl_dir_diduse_space(tx->tx_pool->dp_free_dir,
+ DD_USED_HEAD, used, comp, uncomp, tx);
+
+ /* Merge our deadlist into next's and free it. */
+ dsl_deadlist_merge(&ds_next->ds_deadlist,
+ dsl_dataset_phys(ds)->ds_deadlist_obj, tx);
+ }
+
+ dsl_deadlist_close(&ds->ds_deadlist);
+ dsl_deadlist_free(mos, dsl_dataset_phys(ds)->ds_deadlist_obj, tx);
+ dmu_buf_will_dirty(ds->ds_dbuf, tx);
+ dsl_dataset_phys(ds)->ds_deadlist_obj = 0;
+
+ dsl_destroy_snapshot_handle_remaps(ds, ds_next, tx);
+
+ /* Collapse range in clone heads */
+ dsl_dataset_remove_clones_key(ds,
+ dsl_dataset_phys(ds)->ds_creation_txg, tx);
+
+ if (ds_next->ds_is_snapshot) {
+ dsl_dataset_t *ds_nextnext;
+
+ /*
+ * Update next's unique to include blocks which
+ * were previously shared by only this snapshot
+ * and it. Those blocks will be born after the
+ * prev snap and before this snap, and will have
+ * died after the next snap and before the one
+ * after that (ie. be on the snap after next's
+ * deadlist).
+ */
+ VERIFY0(dsl_dataset_hold_obj(dp,
+ dsl_dataset_phys(ds_next)->ds_next_snap_obj,
+ FTAG, &ds_nextnext));
+ dsl_deadlist_space_range(&ds_nextnext->ds_deadlist,
+ dsl_dataset_phys(ds)->ds_prev_snap_txg,
+ dsl_dataset_phys(ds)->ds_creation_txg,
+ &used, &comp, &uncomp);
+ dsl_dataset_phys(ds_next)->ds_unique_bytes += used;
+ dsl_dataset_rele(ds_nextnext, FTAG);
+ ASSERT3P(ds_next->ds_prev, ==, NULL);
+
+ /* Collapse range in this head. */
+ dsl_dataset_t *hds;
+ VERIFY0(dsl_dataset_hold_obj(dp,
+ dsl_dir_phys(ds->ds_dir)->dd_head_dataset_obj, FTAG, &hds));
+ dsl_deadlist_remove_key(&hds->ds_deadlist,
+ dsl_dataset_phys(ds)->ds_creation_txg, tx);
+ if (dsl_dataset_remap_deadlist_exists(hds)) {
+ dsl_deadlist_remove_key(&hds->ds_remap_deadlist,
+ dsl_dataset_phys(ds)->ds_creation_txg, tx);
+ }
+ dsl_dataset_rele(hds, FTAG);
+
+ } else {
+ ASSERT3P(ds_next->ds_prev, ==, ds);
+ dsl_dataset_rele(ds_next->ds_prev, ds_next);
+ ds_next->ds_prev = NULL;
+ if (ds_prev) {
+ VERIFY0(dsl_dataset_hold_obj(dp,
+ dsl_dataset_phys(ds)->ds_prev_snap_obj,
+ ds_next, &ds_next->ds_prev));
+ }
+
+ dsl_dataset_recalc_head_uniq(ds_next);
+
+ /*
+ * Reduce the amount of our unconsumed refreservation
+ * being charged to our parent by the amount of
+ * new unique data we have gained.
+ */
+ if (old_unique < ds_next->ds_reserved) {
+ int64_t mrsdelta;
+ uint64_t new_unique =
+ dsl_dataset_phys(ds_next)->ds_unique_bytes;
+
+ ASSERT(old_unique <= new_unique);
+ mrsdelta = MIN(new_unique - old_unique,
+ ds_next->ds_reserved - old_unique);
+ dsl_dir_diduse_space(ds->ds_dir,
+ DD_USED_REFRSRV, -mrsdelta, 0, 0, tx);
+ }
+ }
+ dsl_dataset_rele(ds_next, FTAG);
+
+ /*
+ * This must be done after the dsl_traverse(), because it will
+ * re-open the objset.
+ */
+ if (ds->ds_objset) {
+ dmu_objset_evict(ds->ds_objset);
+ ds->ds_objset = NULL;
+ }
+
+ /* remove from snapshot namespace */
+ dsl_dataset_t *ds_head;
+ ASSERT(dsl_dataset_phys(ds)->ds_snapnames_zapobj == 0);
+ VERIFY0(dsl_dataset_hold_obj(dp,
+ dsl_dir_phys(ds->ds_dir)->dd_head_dataset_obj, FTAG, &ds_head));
+ VERIFY0(dsl_dataset_get_snapname(ds));
+#ifdef ZFS_DEBUG
+ {
+ uint64_t val;
+
+ err = dsl_dataset_snap_lookup(ds_head,
+ ds->ds_snapname, &val);
+ ASSERT0(err);
+ ASSERT3U(val, ==, obj);
+ }
+#endif
+ VERIFY0(dsl_dataset_snap_remove(ds_head, ds->ds_snapname, tx, B_TRUE));
+ dsl_dataset_rele(ds_head, FTAG);
+
+ if (ds_prev != NULL)
+ dsl_dataset_rele(ds_prev, FTAG);
+
+ spa_prop_clear_bootfs(dp->dp_spa, ds->ds_object, tx);
+
+ if (dsl_dataset_phys(ds)->ds_next_clones_obj != 0) {
+ uint64_t count;
+ ASSERT0(zap_count(mos,
+ dsl_dataset_phys(ds)->ds_next_clones_obj, &count) &&
+ count == 0);
+ VERIFY0(dmu_object_free(mos,
+ dsl_dataset_phys(ds)->ds_next_clones_obj, tx));
+ }
+ if (dsl_dataset_phys(ds)->ds_props_obj != 0)
+ VERIFY0(zap_destroy(mos, dsl_dataset_phys(ds)->ds_props_obj,
+ tx));
+ if (dsl_dataset_phys(ds)->ds_userrefs_obj != 0)
+ VERIFY0(zap_destroy(mos, dsl_dataset_phys(ds)->ds_userrefs_obj,
+ tx));
+
+#if defined(__FreeBSD__) && defined(_KERNEL)
+ char dsname[ZFS_MAX_DATASET_NAME_LEN];
+
+ dsl_dataset_name(ds, dsname);
+ zvol_remove_minors(dp->dp_spa, dsname);
+#endif
+
+ dsl_dir_rele(ds->ds_dir, ds);
+ ds->ds_dir = NULL;
+ dmu_object_free_zapified(mos, obj, tx);
+}
+
+void
+dsl_destroy_snapshot_sync(void *arg, dmu_tx_t *tx)
+{
+ dsl_destroy_snapshot_arg_t *ddsa = arg;
+ const char *dsname = ddsa->ddsa_name;
+ boolean_t defer = ddsa->ddsa_defer;
+
+ dsl_pool_t *dp = dmu_tx_pool(tx);
+ dsl_dataset_t *ds;
+
+ int error = dsl_dataset_hold(dp, dsname, FTAG, &ds);
+ if (error == ENOENT)
+ return;
+ ASSERT0(error);
+ dsl_destroy_snapshot_sync_impl(ds, defer, tx);
+ dsl_dataset_rele(ds, FTAG);
+}
+
+/*
+ * The semantics of this function are described in the comment above
+ * lzc_destroy_snaps(). To summarize:
+ *
+ * The snapshots must all be in the same pool.
+ *
+ * Snapshots that don't exist will be silently ignored (considered to be
+ * "already deleted").
+ *
+ * On success, all snaps will be destroyed and this will return 0.
+ * On failure, no snaps will be destroyed, the errlist will be filled in,
+ * and this will return an errno.
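+ *
+ * Caller-side sketch (editorial addition; dataset names hypothetical):
+ *
+ *	nvlist_t *snaps = fnvlist_alloc();
+ *	nvlist_t *errlist = fnvlist_alloc();
+ *	fnvlist_add_boolean(snaps, "pool/fs@snap1");
+ *	fnvlist_add_boolean(snaps, "pool/fs@snap2");
+ *	error = dsl_destroy_snapshots_nvl(snaps, B_TRUE, errlist);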
+ */
+int
+dsl_destroy_snapshots_nvl(nvlist_t *snaps, boolean_t defer,
+ nvlist_t *errlist)
+{
+ if (nvlist_next_nvpair(snaps, NULL) == NULL)
+ return (0);
+
+ /*
+ * lzc_destroy_snaps() is documented to take an nvlist whose
+ * values "don't matter". We need to convert that nvlist to
+ * one that we know can be converted to LUA. We also don't
+ * care about any duplicate entries because the nvlist will
+ * be converted to a LUA table which should take care of this.
+ */
+ nvlist_t *snaps_normalized;
+ VERIFY0(nvlist_alloc(&snaps_normalized, 0, KM_SLEEP));
+ for (nvpair_t *pair = nvlist_next_nvpair(snaps, NULL);
+ pair != NULL; pair = nvlist_next_nvpair(snaps, pair)) {
+ fnvlist_add_boolean_value(snaps_normalized,
+ nvpair_name(pair), B_TRUE);
+ }
+
+ nvlist_t *arg;
+ VERIFY0(nvlist_alloc(&arg, 0, KM_SLEEP));
+ fnvlist_add_nvlist(arg, "snaps", snaps_normalized);
+ fnvlist_free(snaps_normalized);
+ fnvlist_add_boolean_value(arg, "defer", defer);
+
+ nvlist_t *wrapper;
+ VERIFY0(nvlist_alloc(&wrapper, 0, KM_SLEEP));
+ fnvlist_add_nvlist(wrapper, ZCP_ARG_ARGLIST, arg);
+ fnvlist_free(arg);
+
+ const char *program =
+ "arg = ...\n"
+ "snaps = arg['snaps']\n"
+ "defer = arg['defer']\n"
+ "errors = { }\n"
+ "has_errors = false\n"
+ "for snap, v in pairs(snaps) do\n"
+ " errno = zfs.check.destroy{snap, defer=defer}\n"
+ " zfs.debug('snap: ' .. snap .. ' errno: ' .. errno)\n"
+ " if errno == ENOENT then\n"
+ " snaps[snap] = nil\n"
+ " elseif errno ~= 0 then\n"
+ " errors[snap] = errno\n"
+ " has_errors = true\n"
+ " end\n"
+ "end\n"
+ "if has_errors then\n"
+ " return errors\n"
+ "end\n"
+ "for snap, v in pairs(snaps) do\n"
+ " errno = zfs.sync.destroy{snap, defer=defer}\n"
+ " assert(errno == 0)\n"
+ "end\n"
+ "return { }\n";
+
+ nvlist_t *result = fnvlist_alloc();
+ int error = zcp_eval(nvpair_name(nvlist_next_nvpair(snaps, NULL)),
+ program,
+ B_TRUE,
+ 0,
+ zfs_lua_max_memlimit,
+ nvlist_next_nvpair(wrapper, NULL), result);
+ if (error != 0) {
+ char *errorstr = NULL;
+ (void) nvlist_lookup_string(result, ZCP_RET_ERROR, &errorstr);
+ if (errorstr != NULL) {
+ zfs_dbgmsg(errorstr);
+ }
+ return (error);
+ }
+ fnvlist_free(wrapper);
+
+ /*
+ * lzc_destroy_snaps() is documented to fill the errlist with
+ * int32 values, so we need to convert the int64 values that are
+ * returned from LUA.
+ */
+ int rv = 0;
+ nvlist_t *errlist_raw = fnvlist_lookup_nvlist(result, ZCP_RET_RETURN);
+ for (nvpair_t *pair = nvlist_next_nvpair(errlist_raw, NULL);
+ pair != NULL; pair = nvlist_next_nvpair(errlist_raw, pair)) {
+ int32_t val = (int32_t)fnvpair_value_int64(pair);
+ if (rv == 0)
+ rv = val;
+ fnvlist_add_int32(errlist, nvpair_name(pair), val);
+ }
+ fnvlist_free(result);
+ return (rv);
+}
+
+int
+dsl_destroy_snapshot(const char *name, boolean_t defer)
+{
+ int error;
+ nvlist_t *nvl = fnvlist_alloc();
+ nvlist_t *errlist = fnvlist_alloc();
+
+ fnvlist_add_boolean(nvl, name);
+ error = dsl_destroy_snapshots_nvl(nvl, defer, errlist);
+ fnvlist_free(errlist);
+ fnvlist_free(nvl);
+ return (error);
+}
+
+struct killarg {
+ dsl_dataset_t *ds;
+ dmu_tx_t *tx;
+};
+
+/* ARGSUSED */
+static int
+kill_blkptr(spa_t *spa, zilog_t *zilog, const blkptr_t *bp,
+ const zbookmark_phys_t *zb, const dnode_phys_t *dnp, void *arg)
+{
+ struct killarg *ka = arg;
+ dmu_tx_t *tx = ka->tx;
+
+ if (bp == NULL || BP_IS_HOLE(bp) || BP_IS_EMBEDDED(bp))
+ return (0);
+
+ if (zb->zb_level == ZB_ZIL_LEVEL) {
+ ASSERT(zilog != NULL);
+ /*
+ * It's a block in the intent log. It has no
+ * accounting, so just free it.
+ */
+ dsl_free(ka->tx->tx_pool, ka->tx->tx_txg, bp);
+ } else {
+ ASSERT(zilog == NULL);
+ ASSERT3U(bp->blk_birth, >,
+ dsl_dataset_phys(ka->ds)->ds_prev_snap_txg);
+ (void) dsl_dataset_block_kill(ka->ds, bp, tx, B_FALSE);
+ }
+
+ return (0);
+}
+
+static void
+old_synchronous_dataset_destroy(dsl_dataset_t *ds, dmu_tx_t *tx)
+{
+ struct killarg ka;
+
+ /*
+ * Free everything that we point to (that's born after
+ * the previous snapshot, if we are a clone)
+ *
+ * NB: this should be very quick, because we already
+ * freed all the objects in open context.
+ */
+ ka.ds = ds;
+ ka.tx = tx;
+ VERIFY0(traverse_dataset(ds,
+ dsl_dataset_phys(ds)->ds_prev_snap_txg, TRAVERSE_POST,
+ kill_blkptr, &ka));
+ ASSERT(!DS_UNIQUE_IS_ACCURATE(ds) ||
+ dsl_dataset_phys(ds)->ds_unique_bytes == 0);
+}
+
+int
+dsl_destroy_head_check_impl(dsl_dataset_t *ds, int expected_holds)
+{
+ int error;
+ uint64_t count;
+ objset_t *mos;
+
+ ASSERT(!ds->ds_is_snapshot);
+ if (ds->ds_is_snapshot)
+ return (SET_ERROR(EINVAL));
+
+ if (zfs_refcount_count(&ds->ds_longholds) != expected_holds)
+ return (SET_ERROR(EBUSY));
+
+ mos = ds->ds_dir->dd_pool->dp_meta_objset;
+
+ /*
+ * Can't delete a head dataset if there are snapshots of it.
+ * (Except if the only snapshots are from the branch we cloned
+ * from.)
+ */
+ if (ds->ds_prev != NULL &&
+ dsl_dataset_phys(ds->ds_prev)->ds_next_snap_obj == ds->ds_object)
+ return (SET_ERROR(EBUSY));
+
+ /*
+ * Can't delete if there are children of this fs.
+ */
+ error = zap_count(mos,
+ dsl_dir_phys(ds->ds_dir)->dd_child_dir_zapobj, &count);
+ if (error != 0)
+ return (error);
+ if (count != 0)
+ return (SET_ERROR(EEXIST));
+
+ if (dsl_dir_is_clone(ds->ds_dir) && DS_IS_DEFER_DESTROY(ds->ds_prev) &&
+ dsl_dataset_phys(ds->ds_prev)->ds_num_children == 2 &&
+ ds->ds_prev->ds_userrefs == 0) {
+ /* We need to remove the origin snapshot as well. */
+ if (!zfs_refcount_is_zero(&ds->ds_prev->ds_longholds))
+ return (SET_ERROR(EBUSY));
+ }
+ return (0);
+}
+
+int
+dsl_destroy_head_check(void *arg, dmu_tx_t *tx)
+{
+ dsl_destroy_head_arg_t *ddha = arg;
+ dsl_pool_t *dp = dmu_tx_pool(tx);
+ dsl_dataset_t *ds;
+ int error;
+
+ error = dsl_dataset_hold(dp, ddha->ddha_name, FTAG, &ds);
+ if (error != 0)
+ return (error);
+
+ error = dsl_destroy_head_check_impl(ds, 0);
+ dsl_dataset_rele(ds, FTAG);
+ return (error);
+}
+
+static void
+dsl_dir_destroy_sync(uint64_t ddobj, dmu_tx_t *tx)
+{
+ dsl_dir_t *dd;
+ dsl_pool_t *dp = dmu_tx_pool(tx);
+ objset_t *mos = dp->dp_meta_objset;
+ dd_used_t t;
+
+ ASSERT(RRW_WRITE_HELD(&dmu_tx_pool(tx)->dp_config_rwlock));
+
+ VERIFY0(dsl_dir_hold_obj(dp, ddobj, NULL, FTAG, &dd));
+
+ ASSERT0(dsl_dir_phys(dd)->dd_head_dataset_obj);
+
+ /*
+ * Decrement the filesystem count for all parent filesystems.
+ *
+ * When we receive an incremental stream into a filesystem that already
+ * exists, a temporary clone is created. We never count this temporary
+ * clone, whose name begins with a '%'.
+ */
+ if (dd->dd_myname[0] != '%' && dd->dd_parent != NULL)
+ dsl_fs_ss_count_adjust(dd->dd_parent, -1,
+ DD_FIELD_FILESYSTEM_COUNT, tx);
+
+ /*
+ * Remove our reservation. The impl() routine avoids setting the
+ * actual property, which would require the (already destroyed) ds.
+ */
+ dsl_dir_set_reservation_sync_impl(dd, 0, tx);
+
+ ASSERT0(dsl_dir_phys(dd)->dd_used_bytes);
+ ASSERT0(dsl_dir_phys(dd)->dd_reserved);
+ for (t = 0; t < DD_USED_NUM; t++)
+ ASSERT0(dsl_dir_phys(dd)->dd_used_breakdown[t]);
+
+ VERIFY0(zap_destroy(mos, dsl_dir_phys(dd)->dd_child_dir_zapobj, tx));
+ VERIFY0(zap_destroy(mos, dsl_dir_phys(dd)->dd_props_zapobj, tx));
+ if (dsl_dir_phys(dd)->dd_clones != 0)
+ VERIFY0(zap_destroy(mos, dsl_dir_phys(dd)->dd_clones, tx));
+ VERIFY0(dsl_deleg_destroy(mos, dsl_dir_phys(dd)->dd_deleg_zapobj, tx));
+ VERIFY0(zap_remove(mos,
+ dsl_dir_phys(dd->dd_parent)->dd_child_dir_zapobj,
+ dd->dd_myname, tx));
+
+ dsl_dir_rele(dd, FTAG);
+ dmu_object_free_zapified(mos, ddobj, tx);
+}
+
+void
+dsl_destroy_head_sync_impl(dsl_dataset_t *ds, dmu_tx_t *tx)
+{
+ dsl_pool_t *dp = dmu_tx_pool(tx);
+ objset_t *mos = dp->dp_meta_objset;
+ uint64_t obj, ddobj, prevobj = 0;
+ boolean_t rmorigin;
+
+ ASSERT3U(dsl_dataset_phys(ds)->ds_num_children, <=, 1);
+ ASSERT(ds->ds_prev == NULL ||
+ dsl_dataset_phys(ds->ds_prev)->ds_next_snap_obj != ds->ds_object);
+ rrw_enter(&ds->ds_bp_rwlock, RW_READER, FTAG);
+ ASSERT3U(dsl_dataset_phys(ds)->ds_bp.blk_birth, <=, tx->tx_txg);
+ rrw_exit(&ds->ds_bp_rwlock, FTAG);
+ ASSERT(RRW_WRITE_HELD(&dp->dp_config_rwlock));
+
+ /* We need to log before removing it from the namespace. */
+ spa_history_log_internal_ds(ds, "destroy", tx, "");
+
+ rmorigin = (dsl_dir_is_clone(ds->ds_dir) &&
+ DS_IS_DEFER_DESTROY(ds->ds_prev) &&
+ dsl_dataset_phys(ds->ds_prev)->ds_num_children == 2 &&
+ ds->ds_prev->ds_userrefs == 0);
+
+ /* Remove our reservation. */
+ if (ds->ds_reserved != 0) {
+ dsl_dataset_set_refreservation_sync_impl(ds,
+ (ZPROP_SRC_NONE | ZPROP_SRC_LOCAL | ZPROP_SRC_RECEIVED),
+ 0, tx);
+ ASSERT0(ds->ds_reserved);
+ }
+
+ obj = ds->ds_object;
+
+ for (spa_feature_t f = 0; f < SPA_FEATURES; f++) {
+ if (ds->ds_feature_inuse[f]) {
+ dsl_dataset_deactivate_feature(obj, f, tx);
+ ds->ds_feature_inuse[f] = B_FALSE;
+ }
+ }
+
+ dsl_scan_ds_destroyed(ds, tx);
+
+ if (dsl_dataset_phys(ds)->ds_prev_snap_obj != 0) {
+ /* This is a clone */
+ ASSERT(ds->ds_prev != NULL);
+ ASSERT3U(dsl_dataset_phys(ds->ds_prev)->ds_next_snap_obj, !=,
+ obj);
+ ASSERT0(dsl_dataset_phys(ds)->ds_next_snap_obj);
+
+ dmu_buf_will_dirty(ds->ds_prev->ds_dbuf, tx);
+ if (dsl_dataset_phys(ds->ds_prev)->ds_next_clones_obj != 0) {
+ dsl_dataset_remove_from_next_clones(ds->ds_prev,
+ obj, tx);
+ }
+
+ ASSERT3U(dsl_dataset_phys(ds->ds_prev)->ds_num_children, >, 1);
+ dsl_dataset_phys(ds->ds_prev)->ds_num_children--;
+ }
+
+ /*
+ * Destroy the deadlist. Unless it's a clone, the
+ * deadlist should be empty since the dataset has no snapshots.
+ * (If it's a clone, it's safe to ignore the deadlist contents
+ * since they are still referenced by the origin snapshot.)
+ */
+ dsl_deadlist_close(&ds->ds_deadlist);
+ dsl_deadlist_free(mos, dsl_dataset_phys(ds)->ds_deadlist_obj, tx);
+ dmu_buf_will_dirty(ds->ds_dbuf, tx);
+ dsl_dataset_phys(ds)->ds_deadlist_obj = 0;
+
+ if (dsl_dataset_remap_deadlist_exists(ds))
+ dsl_dataset_destroy_remap_deadlist(ds, tx);
+
+ objset_t *os;
+ VERIFY0(dmu_objset_from_ds(ds, &os));
+
+ if (!spa_feature_is_enabled(dp->dp_spa, SPA_FEATURE_ASYNC_DESTROY)) {
+ old_synchronous_dataset_destroy(ds, tx);
+ } else {
+ /*
+ * Move the bptree into the pool's list of trees to
+ * clean up and update space accounting information.
+ */
+ uint64_t used, comp, uncomp;
+
+ zil_destroy_sync(dmu_objset_zil(os), tx);
+
+ if (!spa_feature_is_active(dp->dp_spa,
+ SPA_FEATURE_ASYNC_DESTROY)) {
+ dsl_scan_t *scn = dp->dp_scan;
+ spa_feature_incr(dp->dp_spa, SPA_FEATURE_ASYNC_DESTROY,
+ tx);
+ dp->dp_bptree_obj = bptree_alloc(mos, tx);
+ VERIFY0(zap_add(mos,
+ DMU_POOL_DIRECTORY_OBJECT,
+ DMU_POOL_BPTREE_OBJ, sizeof (uint64_t), 1,
+ &dp->dp_bptree_obj, tx));
+ ASSERT(!scn->scn_async_destroying);
+ scn->scn_async_destroying = B_TRUE;
+ }
+
+ used = dsl_dir_phys(ds->ds_dir)->dd_used_bytes;
+ comp = dsl_dir_phys(ds->ds_dir)->dd_compressed_bytes;
+ uncomp = dsl_dir_phys(ds->ds_dir)->dd_uncompressed_bytes;
+
+ ASSERT(!DS_UNIQUE_IS_ACCURATE(ds) ||
+ dsl_dataset_phys(ds)->ds_unique_bytes == used);
+
+ rrw_enter(&ds->ds_bp_rwlock, RW_READER, FTAG);
+ bptree_add(mos, dp->dp_bptree_obj,
+ &dsl_dataset_phys(ds)->ds_bp,
+ dsl_dataset_phys(ds)->ds_prev_snap_txg,
+ used, comp, uncomp, tx);
+ rrw_exit(&ds->ds_bp_rwlock, FTAG);
+ dsl_dir_diduse_space(ds->ds_dir, DD_USED_HEAD,
+ -used, -comp, -uncomp, tx);
+ dsl_dir_diduse_space(dp->dp_free_dir, DD_USED_HEAD,
+ used, comp, uncomp, tx);
+ }
+
+ if (ds->ds_prev != NULL) {
+ if (spa_version(dp->dp_spa) >= SPA_VERSION_DIR_CLONES) {
+ VERIFY0(zap_remove_int(mos,
+ dsl_dir_phys(ds->ds_prev->ds_dir)->dd_clones,
+ ds->ds_object, tx));
+ }
+ prevobj = ds->ds_prev->ds_object;
+ dsl_dataset_rele(ds->ds_prev, ds);
+ ds->ds_prev = NULL;
+ }
+
+ /*
+ * This must be done after the dsl_traverse(), because it will
+ * re-open the objset.
+ */
+ if (ds->ds_objset) {
+ dmu_objset_evict(ds->ds_objset);
+ ds->ds_objset = NULL;
+ }
+
+ /* Erase the link in the dir */
+ dmu_buf_will_dirty(ds->ds_dir->dd_dbuf, tx);
+ dsl_dir_phys(ds->ds_dir)->dd_head_dataset_obj = 0;
+ ddobj = ds->ds_dir->dd_object;
+ ASSERT(dsl_dataset_phys(ds)->ds_snapnames_zapobj != 0);
+ VERIFY0(zap_destroy(mos,
+ dsl_dataset_phys(ds)->ds_snapnames_zapobj, tx));
+
+ if (ds->ds_bookmarks != 0) {
+ VERIFY0(zap_destroy(mos, ds->ds_bookmarks, tx));
+ spa_feature_decr(dp->dp_spa, SPA_FEATURE_BOOKMARKS, tx);
+ }
+
+ spa_prop_clear_bootfs(dp->dp_spa, ds->ds_object, tx);
+
+ ASSERT0(dsl_dataset_phys(ds)->ds_next_clones_obj);
+ ASSERT0(dsl_dataset_phys(ds)->ds_props_obj);
+ ASSERT0(dsl_dataset_phys(ds)->ds_userrefs_obj);
+ dsl_dir_rele(ds->ds_dir, ds);
+ ds->ds_dir = NULL;
+ dmu_object_free_zapified(mos, obj, tx);
+
+ dsl_dir_destroy_sync(ddobj, tx);
+
+ if (rmorigin) {
+ dsl_dataset_t *prev;
+ VERIFY0(dsl_dataset_hold_obj(dp, prevobj, FTAG, &prev));
+ dsl_destroy_snapshot_sync_impl(prev, B_FALSE, tx);
+ dsl_dataset_rele(prev, FTAG);
+ }
+}
+
+void
+dsl_destroy_head_sync(void *arg, dmu_tx_t *tx)
+{
+ dsl_destroy_head_arg_t *ddha = arg;
+ dsl_pool_t *dp = dmu_tx_pool(tx);
+ dsl_dataset_t *ds;
+
+ VERIFY0(dsl_dataset_hold(dp, ddha->ddha_name, FTAG, &ds));
+ dsl_destroy_head_sync_impl(ds, tx);
+#if defined(__FreeBSD__) && defined(_KERNEL)
+ zvol_remove_minors(dp->dp_spa, ddha->ddha_name);
+#endif
+ dsl_dataset_rele(ds, FTAG);
+}
+
+static void
+dsl_destroy_head_begin_sync(void *arg, dmu_tx_t *tx)
+{
+ dsl_destroy_head_arg_t *ddha = arg;
+ dsl_pool_t *dp = dmu_tx_pool(tx);
+ dsl_dataset_t *ds;
+
+ VERIFY0(dsl_dataset_hold(dp, ddha->ddha_name, FTAG, &ds));
+
+ /* Mark it as inconsistent on-disk, in case we crash */
+ dmu_buf_will_dirty(ds->ds_dbuf, tx);
+ dsl_dataset_phys(ds)->ds_flags |= DS_FLAG_INCONSISTENT;
+
+ spa_history_log_internal_ds(ds, "destroy begin", tx, "");
+ dsl_dataset_rele(ds, FTAG);
+}
+
+int
+dsl_destroy_head(const char *name)
+{
+ dsl_destroy_head_arg_t ddha;
+ int error;
+ spa_t *spa;
+ boolean_t isenabled;
+
+#ifdef _KERNEL
+ zfs_destroy_unmount_origin(name);
+#endif
+
+ error = spa_open(name, &spa, FTAG);
+ if (error != 0)
+ return (error);
+ isenabled = spa_feature_is_enabled(spa, SPA_FEATURE_ASYNC_DESTROY);
+ spa_close(spa, FTAG);
+
+ ddha.ddha_name = name;
+
+ if (!isenabled) {
+ objset_t *os;
+
+ error = dsl_sync_task(name, dsl_destroy_head_check,
+ dsl_destroy_head_begin_sync, &ddha,
+ 0, ZFS_SPACE_CHECK_DESTROY);
+ if (error != 0)
+ return (error);
+
+ /*
+ * Head deletion is processed in one txg on old pools;
+ * remove the objects from open context so that the txg sync
+ * is not too long.
+ */
+ error = dmu_objset_own(name, DMU_OST_ANY, B_FALSE, FTAG, &os);
+ if (error == 0) {
+ uint64_t prev_snap_txg =
+ dsl_dataset_phys(dmu_objset_ds(os))->
+ ds_prev_snap_txg;
+ for (uint64_t obj = 0; error == 0;
+ error = dmu_object_next(os, &obj, FALSE,
+ prev_snap_txg))
+ (void) dmu_free_long_object(os, obj);
+ /* sync out all frees */
+ txg_wait_synced(dmu_objset_pool(os), 0);
+ dmu_objset_disown(os, FTAG);
+ }
+ }
+
+ return (dsl_sync_task(name, dsl_destroy_head_check,
+ dsl_destroy_head_sync, &ddha, 0, ZFS_SPACE_CHECK_DESTROY));
+}
+
+/*
+ * Note, this function is used as the callback for dmu_objset_find(). We
+ * always return 0 so that we will continue to find and process
+ * inconsistent datasets, even if we encounter an error trying to
+ * process one of them.
+ */
+/* ARGSUSED */
+int
+dsl_destroy_inconsistent(const char *dsname, void *arg)
+{
+ objset_t *os;
+
+ if (dmu_objset_hold(dsname, FTAG, &os) == 0) {
+ boolean_t need_destroy = DS_IS_INCONSISTENT(dmu_objset_ds(os));
+
+ /*
+ * If the dataset is inconsistent because a resumable receive
+ * has failed, then do not destroy it.
+ */
+ if (dsl_dataset_has_resume_receive_state(dmu_objset_ds(os)))
+ need_destroy = B_FALSE;
+
+ dmu_objset_rele(os, FTAG);
+ if (need_destroy)
+ (void) dsl_destroy_head(dsname);
+ }
+ return (0);
+}
diff --git a/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/dsl_dir.c b/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/dsl_dir.c
new file mode 100644
index 000000000000..2f43aabf7c82
--- /dev/null
+++ b/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/dsl_dir.c
@@ -0,0 +1,2184 @@
+/*
+ * CDDL HEADER START
+ *
+ * The contents of this file are subject to the terms of the
+ * Common Development and Distribution License (the "License").
+ * You may not use this file except in compliance with the License.
+ *
+ * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
+ * or http://www.opensolaris.org/os/licensing.
+ * See the License for the specific language governing permissions
+ * and limitations under the License.
+ *
+ * When distributing Covered Code, include this CDDL HEADER in each
+ * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
+ * If applicable, add the following below this CDDL HEADER, with the
+ * fields enclosed by brackets "[]" replaced with your own identifying
+ * information: Portions Copyright [yyyy] [name of copyright owner]
+ *
+ * CDDL HEADER END
+ */
+/*
+ * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved.
+ * Copyright (c) 2011 Pawel Jakub Dawidek <pawel@dawidek.net>.
+ * All rights reserved.
+ * Copyright (c) 2012, 2017 by Delphix. All rights reserved.
+ * Copyright (c) 2014 Joyent, Inc. All rights reserved.
+ * Copyright (c) 2014 Spectra Logic Corporation, All rights reserved.
+ * Copyright 2015 Nexenta Systems, Inc. All rights reserved.
+ * Copyright (c) 2018, loli10K <ezomori.nozomu@gmail.com>. All rights reserved.
+ */
+
+#include <sys/dmu.h>
+#include <sys/dmu_objset.h>
+#include <sys/dmu_tx.h>
+#include <sys/dsl_dataset.h>
+#include <sys/dsl_dir.h>
+#include <sys/dsl_prop.h>
+#include <sys/dsl_synctask.h>
+#include <sys/dsl_deleg.h>
+#include <sys/dmu_impl.h>
+#include <sys/spa.h>
+#include <sys/metaslab.h>
+#include <sys/zap.h>
+#include <sys/zio.h>
+#include <sys/arc.h>
+#include <sys/sunddi.h>
+#include <sys/zvol.h>
+#ifdef _KERNEL
+#include <sys/zfs_vfsops.h>
+#endif
+#include <sys/zfeature.h>
+#include <sys/policy.h>
+#include <sys/zfs_znode.h>
+#include "zfs_namecheck.h"
+#include "zfs_prop.h"
+
+/*
+ * Filesystem and Snapshot Limits
+ * ------------------------------
+ *
+ * These limits are used to restrict the number of filesystems and/or snapshots
+ * that can be created at a given level in the tree or below. A typical
+ * use-case is with a delegated dataset where the administrator wants to ensure
+ * that a user within the zone is not creating too many additional filesystems
+ * or snapshots, even though they're not exceeding their space quota.
+ *
+ * The filesystem and snapshot counts are stored as extensible properties. This
+ * capability is controlled by a feature flag and must be enabled to be used.
+ * Once enabled, the feature is not active until the first limit is set. At
+ * that point, future operations to create/destroy filesystems or snapshots
+ * will validate and update the counts.
+ *
+ * Because the count properties will not exist before the feature is active,
+ * the counts are updated when a limit is first set on an uninitialized
+ * dsl_dir node in the tree (The filesystem/snapshot count on a node includes
+ * all of the nested filesystems/snapshots. Thus, a new leaf node has a
+ * filesystem count of 0 and a snapshot count of 0. Non-existent filesystem and
+ * snapshot count properties on a node indicate uninitialized counts on that
+ * node.) When first setting a limit on an uninitialized node, the code starts
+ * at the filesystem with the new limit and descends into all sub-filesystems
+ * to add the count properties.
+ *
+ * In practice this is lightweight since a limit is typically set when the
+ * filesystem is created and thus has no children. Once valid, changing the
+ * limit value won't require a re-traversal since the counts are already valid.
+ * When recursively fixing the counts, if a node with a limit is encountered
+ * during the descent, the counts are known to be valid and there is no need to
+ * descend into that filesystem's children. The counts on filesystems above the
+ * one with the new limit will still be uninitialized, unless a limit is
+ * eventually set on one of those filesystems. The counts are always recursively
+ * updated when a limit is set on a dataset, unless there is already a limit.
+ * When a new limit value is set on a filesystem with an existing limit, it is
+ * possible for the new limit to be less than the current count at that level
+ * since a user who can change the limit is also allowed to exceed the limit.
+ *
+ * Once the feature is active, then whenever a filesystem or snapshot is
+ * created, the code recurses up the tree, validating the new count against the
+ * limit at each initialized level. In practice, most levels will not have a
+ * limit set. If there is a limit at any initialized level up the tree, the
+ * check must pass or the creation will fail. Likewise, when a filesystem or
+ * snapshot is destroyed, the counts are recursively adjusted all the way up
+ * the initialized nodes in the tree. Renaming a filesystem to a different point
+ * in the tree will first validate, then update the counts on each branch up to
+ * the common ancestor. A receive will also validate the counts and then update
+ * them.
+ *
+ * An exception to the above behavior is that the limit is not enforced if the
+ * user has permission to modify the limit. This is primarily so that
+ * recursive snapshots in the global zone always work. We want to prevent a
+ * denial-of-service in which a lower level delegated dataset could max out its
+ * limit and thus block recursive snapshots from being taken in the global zone.
+ * Because of this, it is possible for the snapshot count to be over the limit
+ * and snapshots taken in the global zone could cause a lower level dataset to
+ * hit or exceed its limit. The administrator taking the global zone recursive
+ * snapshot should be aware of this side-effect and behave accordingly.
+ * For consistency, the filesystem limit is also not enforced if the user can
+ * modify the limit.
+ *
+ * The filesystem and snapshot limits are validated by dsl_fs_ss_limit_check()
+ * and updated by dsl_fs_ss_count_adjust(). A new limit value is setup in
+ * dsl_dir_activate_fs_ss_limit() and the counts are adjusted, if necessary, by
+ * dsl_dir_init_fs_ss_count().
+ *
+ * There is a special case when we receive a filesystem that already exists. In
+ * this case a temporary clone name of %X is created (see dmu_recv_begin). We
+ * never update the filesystem counts for temporary clones.
+ *
+ * Likewise, we do not update the snapshot counts for temporary snapshots,
+ * such as those created by zfs diff.
+ */
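+
+/*
+ * As a minimal sketch of the call pattern described above (illustrative
+ * only; the real callers live in the create/destroy/rename paths), a
+ * creation would first validate against every initialized limit up the
+ * tree and only then bump the counts:
+ *
+ *    error = dsl_fs_ss_limit_check(dd, 1, ZFS_PROP_FILESYSTEM_LIMIT,
+ *        NULL, cr);
+ *    if (error == 0)
+ *        dsl_fs_ss_count_adjust(dd, 1, DD_FIELD_FILESYSTEM_COUNT, tx);
+ *
+ * The check may run in open context; the adjustment runs only in
+ * syncing context.
+ */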
+
+extern inline dsl_dir_phys_t *dsl_dir_phys(dsl_dir_t *dd);
+
+static uint64_t dsl_dir_space_towrite(dsl_dir_t *dd);
+
+typedef struct ddulrt_arg {
+ dsl_dir_t *ddulrta_dd;
+ uint64_t ddlrta_txg;
+} ddulrt_arg_t;
+
+static void
+dsl_dir_evict_async(void *dbu)
+{
+ dsl_dir_t *dd = dbu;
+ dsl_pool_t *dp = dd->dd_pool;
+ int t;
+
+ dd->dd_dbuf = NULL;
+
+ for (t = 0; t < TXG_SIZE; t++) {
+ ASSERT(!txg_list_member(&dp->dp_dirty_dirs, dd, t));
+ ASSERT(dd->dd_tempreserved[t] == 0);
+ ASSERT(dd->dd_space_towrite[t] == 0);
+ }
+
+ if (dd->dd_parent)
+ dsl_dir_async_rele(dd->dd_parent, dd);
+
+ spa_async_close(dd->dd_pool->dp_spa, dd);
+
+ dsl_prop_fini(dd);
+ mutex_destroy(&dd->dd_lock);
+ kmem_free(dd, sizeof (dsl_dir_t));
+}
+
+int
+dsl_dir_hold_obj(dsl_pool_t *dp, uint64_t ddobj,
+ const char *tail, void *tag, dsl_dir_t **ddp)
+{
+ dmu_buf_t *dbuf;
+ dsl_dir_t *dd;
+ int err;
+
+ ASSERT(dsl_pool_config_held(dp));
+
+ err = dmu_bonus_hold(dp->dp_meta_objset, ddobj, tag, &dbuf);
+ if (err != 0)
+ return (err);
+ dd = dmu_buf_get_user(dbuf);
+#ifdef ZFS_DEBUG
+ {
+ dmu_object_info_t doi;
+ dmu_object_info_from_db(dbuf, &doi);
+ ASSERT3U(doi.doi_bonus_type, ==, DMU_OT_DSL_DIR);
+ ASSERT3U(doi.doi_bonus_size, >=, sizeof (dsl_dir_phys_t));
+ }
+#endif
+ if (dd == NULL) {
+ dsl_dir_t *winner;
+
+ dd = kmem_zalloc(sizeof (dsl_dir_t), KM_SLEEP);
+ dd->dd_object = ddobj;
+ dd->dd_dbuf = dbuf;
+ dd->dd_pool = dp;
+ mutex_init(&dd->dd_lock, NULL, MUTEX_DEFAULT, NULL);
+ dsl_prop_init(dd);
+
+ dsl_dir_snap_cmtime_update(dd);
+
+ if (dsl_dir_phys(dd)->dd_parent_obj) {
+ err = dsl_dir_hold_obj(dp,
+ dsl_dir_phys(dd)->dd_parent_obj, NULL, dd,
+ &dd->dd_parent);
+ if (err != 0)
+ goto errout;
+ if (tail) {
+#ifdef ZFS_DEBUG
+ uint64_t foundobj;
+
+ err = zap_lookup(dp->dp_meta_objset,
+ dsl_dir_phys(dd->dd_parent)->
+ dd_child_dir_zapobj, tail,
+ sizeof (foundobj), 1, &foundobj);
+ ASSERT(err || foundobj == ddobj);
+#endif
+ (void) strcpy(dd->dd_myname, tail);
+ } else {
+ err = zap_value_search(dp->dp_meta_objset,
+ dsl_dir_phys(dd->dd_parent)->
+ dd_child_dir_zapobj,
+ ddobj, 0, dd->dd_myname);
+ }
+ if (err != 0)
+ goto errout;
+ } else {
+ (void) strcpy(dd->dd_myname, spa_name(dp->dp_spa));
+ }
+
+ if (dsl_dir_is_clone(dd)) {
+ dmu_buf_t *origin_bonus;
+ dsl_dataset_phys_t *origin_phys;
+
+ /*
+ * We can't open the origin dataset, because
+ * that would require opening this dsl_dir.
+ * Just look at its phys directly instead.
+ */
+ err = dmu_bonus_hold(dp->dp_meta_objset,
+ dsl_dir_phys(dd)->dd_origin_obj, FTAG,
+ &origin_bonus);
+ if (err != 0)
+ goto errout;
+ origin_phys = origin_bonus->db_data;
+ dd->dd_origin_txg =
+ origin_phys->ds_creation_txg;
+ dmu_buf_rele(origin_bonus, FTAG);
+ }
+
+ dmu_buf_init_user(&dd->dd_dbu, NULL, dsl_dir_evict_async,
+ &dd->dd_dbuf);
+ winner = dmu_buf_set_user_ie(dbuf, &dd->dd_dbu);
+ if (winner != NULL) {
+ if (dd->dd_parent)
+ dsl_dir_rele(dd->dd_parent, dd);
+ dsl_prop_fini(dd);
+ mutex_destroy(&dd->dd_lock);
+ kmem_free(dd, sizeof (dsl_dir_t));
+ dd = winner;
+ } else {
+ spa_open_ref(dp->dp_spa, dd);
+ }
+ }
+
+ /*
+ * The dsl_dir_t has both open-to-close and instantiate-to-evict
+ * holds on the spa. We need the open-to-close holds because
+ * otherwise the spa_refcnt wouldn't change when we open a
+ * dir which the spa also has open, so we could incorrectly
+ * think it was OK to unload/export/destroy the pool. We need
+ * the instantiate-to-evict hold because the dsl_dir_t has a
+ * pointer to the dd_pool, which has a pointer to the spa_t.
+ */
+ spa_open_ref(dp->dp_spa, tag);
+ ASSERT3P(dd->dd_pool, ==, dp);
+ ASSERT3U(dd->dd_object, ==, ddobj);
+ ASSERT3P(dd->dd_dbuf, ==, dbuf);
+ *ddp = dd;
+ return (0);
+
+errout:
+ if (dd->dd_parent)
+ dsl_dir_rele(dd->dd_parent, dd);
+ dsl_prop_fini(dd);
+ mutex_destroy(&dd->dd_lock);
+ kmem_free(dd, sizeof (dsl_dir_t));
+ dmu_buf_rele(dbuf, tag);
+ return (err);
+}
+
+void
+dsl_dir_rele(dsl_dir_t *dd, void *tag)
+{
+ dprintf_dd(dd, "%s\n", "");
+ spa_close(dd->dd_pool->dp_spa, tag);
+ dmu_buf_rele(dd->dd_dbuf, tag);
+}
+
+/*
+ * Remove a reference to the given dsl dir that is being asynchronously
+ * released. Async releases occur from a taskq performing eviction of
+ * dsl datasets and dirs. This process is identical to a normal release
+ * with the exception of using the async API for releasing the reference on
+ * the spa.
+ */
+void
+dsl_dir_async_rele(dsl_dir_t *dd, void *tag)
+{
+ dprintf_dd(dd, "%s\n", "");
+ spa_async_close(dd->dd_pool->dp_spa, tag);
+ dmu_buf_rele(dd->dd_dbuf, tag);
+}
+
+/* buf must be at least ZFS_MAX_DATASET_NAME_LEN bytes */
+void
+dsl_dir_name(dsl_dir_t *dd, char *buf)
+{
+ if (dd->dd_parent) {
+ dsl_dir_name(dd->dd_parent, buf);
+ VERIFY3U(strlcat(buf, "/", ZFS_MAX_DATASET_NAME_LEN), <,
+ ZFS_MAX_DATASET_NAME_LEN);
+ } else {
+ buf[0] = '\0';
+ }
+ if (!MUTEX_HELD(&dd->dd_lock)) {
+ /*
+ * recursive mutex so that we can use
+ * dprintf_dd() with dd_lock held
+ */
+ mutex_enter(&dd->dd_lock);
+ VERIFY3U(strlcat(buf, dd->dd_myname, ZFS_MAX_DATASET_NAME_LEN),
+ <, ZFS_MAX_DATASET_NAME_LEN);
+ mutex_exit(&dd->dd_lock);
+ } else {
+ VERIFY3U(strlcat(buf, dd->dd_myname, ZFS_MAX_DATASET_NAME_LEN),
+ <, ZFS_MAX_DATASET_NAME_LEN);
+ }
+}
+
+/* Calculate name length, avoiding all the strcat calls of dsl_dir_name */
+int
+dsl_dir_namelen(dsl_dir_t *dd)
+{
+ int result = 0;
+
+ if (dd->dd_parent) {
+ /* parent's name + 1 for the "/" */
+ result = dsl_dir_namelen(dd->dd_parent) + 1;
+ }
+
+ if (!MUTEX_HELD(&dd->dd_lock)) {
+ /* see dsl_dir_name */
+ mutex_enter(&dd->dd_lock);
+ result += strlen(dd->dd_myname);
+ mutex_exit(&dd->dd_lock);
+ } else {
+ result += strlen(dd->dd_myname);
+ }
+
+ return (result);
+}
+
+static int
+getcomponent(const char *path, char *component, const char **nextp)
+{
+ char *p;
+
+ if ((path == NULL) || (path[0] == '\0'))
+ return (SET_ERROR(ENOENT));
+ /* This would be a good place to reserve some namespace... */
+ p = strpbrk(path, "/@");
+ if (p && (p[1] == '/' || p[1] == '@')) {
+ /* two separators in a row */
+ return (SET_ERROR(EINVAL));
+ }
+ if (p == NULL || p == path) {
+ /*
+ * if the first thing is an @ or /, it had better be an
+ * @ and it had better not have any more ats or slashes,
+ * and it had better have something after the @.
+ */
+ if (p != NULL &&
+ (p[0] != '@' || strpbrk(path+1, "/@") || p[1] == '\0'))
+ return (SET_ERROR(EINVAL));
+ if (strlen(path) >= ZFS_MAX_DATASET_NAME_LEN)
+ return (SET_ERROR(ENAMETOOLONG));
+ (void) strcpy(component, path);
+ p = NULL;
+ } else if (p[0] == '/') {
+ if (p - path >= ZFS_MAX_DATASET_NAME_LEN)
+ return (SET_ERROR(ENAMETOOLONG));
+ (void) strncpy(component, path, p - path);
+ component[p - path] = '\0';
+ p++;
+ } else if (p[0] == '@') {
+ /*
+ * if the next separator is an @, there better not be
+ * any more slashes.
+ */
+ if (strchr(path, '/'))
+ return (SET_ERROR(EINVAL));
+ if (p - path >= ZFS_MAX_DATASET_NAME_LEN)
+ return (SET_ERROR(ENAMETOOLONG));
+ (void) strncpy(component, path, p - path);
+ component[p - path] = '\0';
+ } else {
+ panic("invalid p=%p", (void *)p);
+ }
+ *nextp = p;
+ return (0);
+}
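+
+/*
+ * For illustration, getcomponent() peels off one component per call:
+ * given "pool/fs@snap" it copies "pool" into component and points *nextp
+ * at "fs@snap"; the next call copies "fs" and leaves *nextp at "@snap";
+ * a final call on "@snap" copies the whole string and sets *nextp to
+ * NULL.
+ */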
+
+/*
+ * Return the dsl_dir_t in *ddp, and possibly the last component which
+ * couldn't be found in *tailp. The name must be in the specified
+ * dsl_pool_t. This thread must hold the dp_config_rwlock for the pool.
+ * Returns an error if the path is bogus, or if tailp==NULL and we
+ * couldn't parse the whole name. (*tailp)[0] == '@' means that the last
+ * component is a snapshot.
+ */
+int
+dsl_dir_hold(dsl_pool_t *dp, const char *name, void *tag,
+ dsl_dir_t **ddp, const char **tailp)
+{
+ char buf[ZFS_MAX_DATASET_NAME_LEN];
+ const char *spaname, *next, *nextnext = NULL;
+ int err;
+ dsl_dir_t *dd;
+ uint64_t ddobj;
+
+ err = getcomponent(name, buf, &next);
+ if (err != 0)
+ return (err);
+
+ /* Make sure the name is in the specified pool. */
+ spaname = spa_name(dp->dp_spa);
+ if (strcmp(buf, spaname) != 0)
+ return (SET_ERROR(EXDEV));
+
+ ASSERT(dsl_pool_config_held(dp));
+
+ err = dsl_dir_hold_obj(dp, dp->dp_root_dir_obj, NULL, tag, &dd);
+ if (err != 0) {
+ return (err);
+ }
+
+ while (next != NULL) {
+ dsl_dir_t *child_dd;
+ err = getcomponent(next, buf, &nextnext);
+ if (err != 0)
+ break;
+ ASSERT(next[0] != '\0');
+ if (next[0] == '@')
+ break;
+ dprintf("looking up %s in obj%lld\n",
+ buf, dsl_dir_phys(dd)->dd_child_dir_zapobj);
+
+ err = zap_lookup(dp->dp_meta_objset,
+ dsl_dir_phys(dd)->dd_child_dir_zapobj,
+ buf, sizeof (ddobj), 1, &ddobj);
+ if (err != 0) {
+ if (err == ENOENT)
+ err = 0;
+ break;
+ }
+
+ err = dsl_dir_hold_obj(dp, ddobj, buf, tag, &child_dd);
+ if (err != 0)
+ break;
+ dsl_dir_rele(dd, tag);
+ dd = child_dd;
+ next = nextnext;
+ }
+
+ if (err != 0) {
+ dsl_dir_rele(dd, tag);
+ return (err);
+ }
+
+ /*
+ * It's an error if there's more than one component left, or
+ * tailp==NULL and there's any component left.
+ */
+ if (next != NULL &&
+ (tailp == NULL || (nextnext && nextnext[0] != '\0'))) {
+ /* bad path name */
+ dsl_dir_rele(dd, tag);
+ dprintf("next=%p (%s) tail=%p\n", next, next?next:"", tailp);
+ err = SET_ERROR(ENOENT);
+ }
+ if (tailp != NULL)
+ *tailp = next;
+ *ddp = dd;
+ return (err);
+}
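+
+/*
+ * For example (hypothetical names), holding "tank/a/b@snap" with a
+ * non-NULL tailp returns the dsl_dir_t for "tank/a/b" and sets *tailp to
+ * "@snap", while holding "tank/a/nonexistent" returns the dsl_dir_t for
+ * "tank/a" with *tailp set to "nonexistent".
+ */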
+
+/*
+ * If the counts are already initialized for this filesystem and its
+ * descendants then do nothing, otherwise initialize the counts.
+ *
+ * The counts on this filesystem, and those below, may be uninitialized due to
+ * either the use of a pre-existing pool which did not support the
+ * filesystem/snapshot limit feature, or one in which the feature had not yet
+ * been enabled.
+ *
+ * Recursively descend the filesystem tree and update the filesystem/snapshot
+ * counts on each filesystem below, then update the cumulative count on the
+ * current filesystem. If the filesystem already has a count set on it,
+ * then we know that its counts, and the counts on the filesystems below it,
+ * are already correct, so we don't have to update this filesystem.
+ */
+static void
+dsl_dir_init_fs_ss_count(dsl_dir_t *dd, dmu_tx_t *tx)
+{
+ uint64_t my_fs_cnt = 0;
+ uint64_t my_ss_cnt = 0;
+ dsl_pool_t *dp = dd->dd_pool;
+ objset_t *os = dp->dp_meta_objset;
+ zap_cursor_t *zc;
+ zap_attribute_t *za;
+ dsl_dataset_t *ds;
+
+ ASSERT(spa_feature_is_active(dp->dp_spa, SPA_FEATURE_FS_SS_LIMIT));
+ ASSERT(dsl_pool_config_held(dp));
+ ASSERT(dmu_tx_is_syncing(tx));
+
+ dsl_dir_zapify(dd, tx);
+
+ /*
+ * If the filesystem count has already been initialized then we
+ * don't need to recurse down any further.
+ */
+ if (zap_contains(os, dd->dd_object, DD_FIELD_FILESYSTEM_COUNT) == 0)
+ return;
+
+ zc = kmem_alloc(sizeof (zap_cursor_t), KM_SLEEP);
+ za = kmem_alloc(sizeof (zap_attribute_t), KM_SLEEP);
+
+ /* Iterate my child dirs */
+ for (zap_cursor_init(zc, os, dsl_dir_phys(dd)->dd_child_dir_zapobj);
+ zap_cursor_retrieve(zc, za) == 0; zap_cursor_advance(zc)) {
+ dsl_dir_t *chld_dd;
+ uint64_t count;
+
+ VERIFY0(dsl_dir_hold_obj(dp, za->za_first_integer, NULL, FTAG,
+ &chld_dd));
+
+ /*
+ * Ignore hidden ($FREE, $MOS & $ORIGIN) objsets and
+ * temporary datasets.
+ */
+ if (chld_dd->dd_myname[0] == '$' ||
+ chld_dd->dd_myname[0] == '%') {
+ dsl_dir_rele(chld_dd, FTAG);
+ continue;
+ }
+
+ my_fs_cnt++; /* count this child */
+
+ dsl_dir_init_fs_ss_count(chld_dd, tx);
+
+ VERIFY0(zap_lookup(os, chld_dd->dd_object,
+ DD_FIELD_FILESYSTEM_COUNT, sizeof (count), 1, &count));
+ my_fs_cnt += count;
+ VERIFY0(zap_lookup(os, chld_dd->dd_object,
+ DD_FIELD_SNAPSHOT_COUNT, sizeof (count), 1, &count));
+ my_ss_cnt += count;
+
+ dsl_dir_rele(chld_dd, FTAG);
+ }
+ zap_cursor_fini(zc);
+ /* Count my snapshots (we counted children's snapshots above) */
+ VERIFY0(dsl_dataset_hold_obj(dd->dd_pool,
+ dsl_dir_phys(dd)->dd_head_dataset_obj, FTAG, &ds));
+
+ for (zap_cursor_init(zc, os, dsl_dataset_phys(ds)->ds_snapnames_zapobj);
+ zap_cursor_retrieve(zc, za) == 0;
+ zap_cursor_advance(zc)) {
+ /* Don't count temporary snapshots */
+ if (za->za_name[0] != '%')
+ my_ss_cnt++;
+ }
+ zap_cursor_fini(zc);
+
+ dsl_dataset_rele(ds, FTAG);
+
+ kmem_free(zc, sizeof (zap_cursor_t));
+ kmem_free(za, sizeof (zap_attribute_t));
+
+ /* we're in a sync task, update counts */
+ dmu_buf_will_dirty(dd->dd_dbuf, tx);
+ VERIFY0(zap_add(os, dd->dd_object, DD_FIELD_FILESYSTEM_COUNT,
+ sizeof (my_fs_cnt), 1, &my_fs_cnt, tx));
+ VERIFY0(zap_add(os, dd->dd_object, DD_FIELD_SNAPSHOT_COUNT,
+ sizeof (my_ss_cnt), 1, &my_ss_cnt, tx));
+}
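+
+/*
+ * For example (hypothetical tree): with filesystems a, a/b, a/b/c, a/d
+ * and a single snapshot a/b@s, the descent records a/b/c = {0 fs, 0 ss},
+ * a/d = {0, 0}, a/b = {1 fs, 1 ss}, and finally a = {3 fs, 1 ss}, since
+ * each node counts its descendants but not itself.
+ */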
+
+static int
+dsl_dir_actv_fs_ss_limit_check(void *arg, dmu_tx_t *tx)
+{
+ char *ddname = (char *)arg;
+ dsl_pool_t *dp = dmu_tx_pool(tx);
+ dsl_dataset_t *ds;
+ dsl_dir_t *dd;
+ int error;
+
+ error = dsl_dataset_hold(dp, ddname, FTAG, &ds);
+ if (error != 0)
+ return (error);
+
+ if (!spa_feature_is_enabled(dp->dp_spa, SPA_FEATURE_FS_SS_LIMIT)) {
+ dsl_dataset_rele(ds, FTAG);
+ return (SET_ERROR(ENOTSUP));
+ }
+
+ dd = ds->ds_dir;
+ if (spa_feature_is_active(dp->dp_spa, SPA_FEATURE_FS_SS_LIMIT) &&
+ dsl_dir_is_zapified(dd) &&
+ zap_contains(dp->dp_meta_objset, dd->dd_object,
+ DD_FIELD_FILESYSTEM_COUNT) == 0) {
+ dsl_dataset_rele(ds, FTAG);
+ return (SET_ERROR(EALREADY));
+ }
+
+ dsl_dataset_rele(ds, FTAG);
+ return (0);
+}
+
+static void
+dsl_dir_actv_fs_ss_limit_sync(void *arg, dmu_tx_t *tx)
+{
+ char *ddname = (char *)arg;
+ dsl_pool_t *dp = dmu_tx_pool(tx);
+ dsl_dataset_t *ds;
+ spa_t *spa;
+
+ VERIFY0(dsl_dataset_hold(dp, ddname, FTAG, &ds));
+
+ spa = dsl_dataset_get_spa(ds);
+
+ if (!spa_feature_is_active(spa, SPA_FEATURE_FS_SS_LIMIT)) {
+ /*
+ * Since the feature was not active and we're now setting a
+ * limit, increment the feature-active counter so that the
+ * feature becomes active for the first time.
+ *
+ * We are already in a sync task so we can update the MOS.
+ */
+ spa_feature_incr(spa, SPA_FEATURE_FS_SS_LIMIT, tx);
+ }
+
+ /*
+ * Since we are now setting a non-UINT64_MAX limit on the filesystem,
+ * we need to ensure the counts are correct. Descend down the tree from
+ * this point and update all of the counts to be accurate.
+ */
+ dsl_dir_init_fs_ss_count(ds->ds_dir, tx);
+
+ dsl_dataset_rele(ds, FTAG);
+}
+
+/*
+ * Make sure the feature is enabled and activate it if necessary.
+ * Since we're setting a limit, ensure the on-disk counts are valid.
+ * This is only called by the ioctl path when setting a limit value.
+ *
+ * We do not need to validate the new limit, since users who can change the
+ * limit are also allowed to exceed the limit.
+ */
+int
+dsl_dir_activate_fs_ss_limit(const char *ddname)
+{
+ int error;
+
+ error = dsl_sync_task(ddname, dsl_dir_actv_fs_ss_limit_check,
+ dsl_dir_actv_fs_ss_limit_sync, (void *)ddname, 0,
+ ZFS_SPACE_CHECK_RESERVED);
+
+ if (error == EALREADY)
+ error = 0;
+
+ return (error);
+}
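+
+/*
+ * A minimal usage sketch (hypothetical dataset name): the ioctl path
+ * setting filesystem_limit or snapshot_limit is expected to call
+ *
+ *    error = dsl_dir_activate_fs_ss_limit("tank/delegated");
+ *
+ * before storing the new property value, so that the counts below that
+ * point are initialized first.
+ */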
+
+/*
+ * Used to determine if the filesystem_limit or snapshot_limit should be
+ * enforced. We allow the limit to be exceeded if the user has permission to
+ * write the property value. We pass in the creds that we got in the open
+ * context since we will always be the GZ root in syncing context. We also have
+ * to handle the case where we are allowed to change the limit on the current
+ * dataset, but there may be another limit in the tree above.
+ *
+ * We can never modify these two properties within a non-global zone. In
+ * addition, the other checks are modeled on zfs_secpolicy_write_perms. We
+ * can't use that function since we are already holding the dp_config_rwlock.
+ * In addition, we already have the dd and dealing with snapshots is simplified
+ * in this code.
+ */
+
+typedef enum {
+ ENFORCE_ALWAYS,
+ ENFORCE_NEVER,
+ ENFORCE_ABOVE
+} enforce_res_t;
+
+static enforce_res_t
+dsl_enforce_ds_ss_limits(dsl_dir_t *dd, zfs_prop_t prop, cred_t *cr)
+{
+ enforce_res_t enforce = ENFORCE_ALWAYS;
+ uint64_t obj;
+ dsl_dataset_t *ds;
+ uint64_t zoned;
+
+ ASSERT(prop == ZFS_PROP_FILESYSTEM_LIMIT ||
+ prop == ZFS_PROP_SNAPSHOT_LIMIT);
+
+#ifdef _KERNEL
+#ifdef __FreeBSD__
+ if (jailed(cr))
+#else
+ if (crgetzoneid(cr) != GLOBAL_ZONEID)
+#endif
+ return (ENFORCE_ALWAYS);
+
+ if (secpolicy_zfs(cr) == 0)
+ return (ENFORCE_NEVER);
+#endif
+
+ if ((obj = dsl_dir_phys(dd)->dd_head_dataset_obj) == 0)
+ return (ENFORCE_ALWAYS);
+
+ ASSERT(dsl_pool_config_held(dd->dd_pool));
+
+ if (dsl_dataset_hold_obj(dd->dd_pool, obj, FTAG, &ds) != 0)
+ return (ENFORCE_ALWAYS);
+
+ if (dsl_prop_get_ds(ds, "zoned", 8, 1, &zoned, NULL) || zoned) {
+ /* Only root can access zoned fs's from the GZ */
+ enforce = ENFORCE_ALWAYS;
+ } else {
+ if (dsl_deleg_access_impl(ds, zfs_prop_to_name(prop), cr) == 0)
+ enforce = ENFORCE_ABOVE;
+ }
+
+ dsl_dataset_rele(ds, FTAG);
+ return (enforce);
+}
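+
+/*
+ * For example, a jailed (or non-global-zone) caller always gets
+ * ENFORCE_ALWAYS, an all-powerful caller (secpolicy_zfs() == 0) gets
+ * ENFORCE_NEVER, and a user who is merely delegated the limit property
+ * on a non-zoned dataset gets ENFORCE_ABOVE, so that limits set higher
+ * in the tree still apply.
+ */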
+
+static void
+dsl_dir_update_last_remap_txg_sync(void *varg, dmu_tx_t *tx)
+{
+ ddulrt_arg_t *arg = varg;
+ uint64_t last_remap_txg;
+ dsl_dir_t *dd = arg->ddulrta_dd;
+ objset_t *mos = dd->dd_pool->dp_meta_objset;
+
+ dsl_dir_zapify(dd, tx);
+ if (zap_lookup(mos, dd->dd_object, DD_FIELD_LAST_REMAP_TXG,
+ sizeof (last_remap_txg), 1, &last_remap_txg) != 0 ||
+ last_remap_txg < arg->ddlrta_txg) {
+ VERIFY0(zap_update(mos, dd->dd_object, DD_FIELD_LAST_REMAP_TXG,
+ sizeof (arg->ddlrta_txg), 1, &arg->ddlrta_txg, tx));
+ }
+}
+
+int
+dsl_dir_update_last_remap_txg(dsl_dir_t *dd, uint64_t txg)
+{
+ ddulrt_arg_t arg;
+ arg.ddulrta_dd = dd;
+ arg.ddlrta_txg = txg;
+
+ return (dsl_sync_task(spa_name(dd->dd_pool->dp_spa),
+ NULL, dsl_dir_update_last_remap_txg_sync, &arg,
+ 1, ZFS_SPACE_CHECK_RESERVED));
+}
+
+/*
+ * Check if adding additional child filesystem(s) would exceed any filesystem
+ * limits or adding additional snapshot(s) would exceed any snapshot limits.
+ * The prop argument indicates which limit to check.
+ *
+ * Note that all filesystem limits up to the root (or the highest
+ * initialized) filesystem or the given ancestor must be satisfied.
+ */
+int
+dsl_fs_ss_limit_check(dsl_dir_t *dd, uint64_t delta, zfs_prop_t prop,
+ dsl_dir_t *ancestor, cred_t *cr)
+{
+ objset_t *os = dd->dd_pool->dp_meta_objset;
+ uint64_t limit, count;
+ char *count_prop;
+ enforce_res_t enforce;
+ int err = 0;
+
+ ASSERT(dsl_pool_config_held(dd->dd_pool));
+ ASSERT(prop == ZFS_PROP_FILESYSTEM_LIMIT ||
+ prop == ZFS_PROP_SNAPSHOT_LIMIT);
+
+ /*
+ * If we're allowed to change the limit, don't enforce it; e.g. this
+ * can happen if a snapshot is taken by an administrative
+ * user in the global zone (i.e. a recursive snapshot by root).
+ * However, we must handle the case of delegated permissions where we
+ * are allowed to change the limit on the current dataset, but there
+ * is another limit in the tree above.
+ */
+ enforce = dsl_enforce_ds_ss_limits(dd, prop, cr);
+ if (enforce == ENFORCE_NEVER)
+ return (0);
+
+ /*
+ * e.g. if renaming a dataset with no snapshots, count adjustment
+ * is 0.
+ */
+ if (delta == 0)
+ return (0);
+
+ if (prop == ZFS_PROP_SNAPSHOT_LIMIT) {
+ /*
+ * We don't enforce the limit for temporary snapshots. This is
+ * indicated by a NULL cred_t argument.
+ */
+ if (cr == NULL)
+ return (0);
+
+ count_prop = DD_FIELD_SNAPSHOT_COUNT;
+ } else {
+ count_prop = DD_FIELD_FILESYSTEM_COUNT;
+ }
+
+ /*
+ * If an ancestor has been provided, stop checking the limit once we
+ * hit that dir. We need this during rename so that we don't overcount
+ * the check once we recurse up to the common ancestor.
+ */
+ if (ancestor == dd)
+ return (0);
+
+ /*
+ * If we hit an uninitialized node while recursing up the tree, we can
+ * stop since we know there is no limit here (or above). The counts are
+ * not valid on this node and we know we won't touch this node's counts.
+ */
+ if (!dsl_dir_is_zapified(dd) || zap_lookup(os, dd->dd_object,
+ count_prop, sizeof (count), 1, &count) == ENOENT)
+ return (0);
+
+ err = dsl_prop_get_dd(dd, zfs_prop_to_name(prop), 8, 1, &limit, NULL,
+ B_FALSE);
+ if (err != 0)
+ return (err);
+
+ /* Is there a limit which we've hit? */
+ if (enforce == ENFORCE_ALWAYS && (count + delta) > limit)
+ return (SET_ERROR(EDQUOT));
+
+ if (dd->dd_parent != NULL)
+ err = dsl_fs_ss_limit_check(dd->dd_parent, delta, prop,
+ ancestor, cr);
+
+ return (err);
+}
+
+/*
+ * Adjust the filesystem or snapshot count for the specified dsl_dir_t and all
+ * parents. When a new filesystem/snapshot is created, increment the count on
+ * all parents, and when a filesystem/snapshot is destroyed, decrement the
+ * count.
+ */
+void
+dsl_fs_ss_count_adjust(dsl_dir_t *dd, int64_t delta, const char *prop,
+ dmu_tx_t *tx)
+{
+ int err;
+ objset_t *os = dd->dd_pool->dp_meta_objset;
+ uint64_t count;
+
+ ASSERT(dsl_pool_config_held(dd->dd_pool));
+ ASSERT(dmu_tx_is_syncing(tx));
+ ASSERT(strcmp(prop, DD_FIELD_FILESYSTEM_COUNT) == 0 ||
+ strcmp(prop, DD_FIELD_SNAPSHOT_COUNT) == 0);
+
+ /*
+ * When we receive an incremental stream into a filesystem that already
+ * exists, a temporary clone is created. We don't count this temporary
+ * clone, whose name begins with a '%'. We also ignore hidden ($FREE,
+ * $MOS & $ORIGIN) objsets.
+ */
+ if ((dd->dd_myname[0] == '%' || dd->dd_myname[0] == '$') &&
+ strcmp(prop, DD_FIELD_FILESYSTEM_COUNT) == 0)
+ return;
+
+ /*
+ * e.g. if renaming a dataset with no snapshots, count adjustment is 0
+ */
+ if (delta == 0)
+ return;
+
+ /*
+ * If we hit an uninitialized node while recursing up the tree, we can
+ * stop since we know the counts are not valid on this node and we
+ * know we shouldn't touch this node's counts. An uninitialized count
+ * on the node indicates that either the feature has not yet been
+ * activated or there are no limits on this part of the tree.
+ */
+ if (!dsl_dir_is_zapified(dd) || (err = zap_lookup(os, dd->dd_object,
+ prop, sizeof (count), 1, &count)) == ENOENT)
+ return;
+ VERIFY0(err);
+
+ count += delta;
+ /* Use a signed verify to make sure the count didn't go negative. */
+ VERIFY3S(count, >=, 0);
+
+ VERIFY0(zap_update(os, dd->dd_object, prop, sizeof (count), 1, &count,
+ tx));
+
+ /* Roll up this additional count into our ancestors */
+ if (dd->dd_parent != NULL)
+ dsl_fs_ss_count_adjust(dd->dd_parent, delta, prop, tx);
+}
+
+uint64_t
+dsl_dir_create_sync(dsl_pool_t *dp, dsl_dir_t *pds, const char *name,
+ dmu_tx_t *tx)
+{
+ objset_t *mos = dp->dp_meta_objset;
+ uint64_t ddobj;
+ dsl_dir_phys_t *ddphys;
+ dmu_buf_t *dbuf;
+
+ ddobj = dmu_object_alloc(mos, DMU_OT_DSL_DIR, 0,
+ DMU_OT_DSL_DIR, sizeof (dsl_dir_phys_t), tx);
+ if (pds) {
+ VERIFY0(zap_add(mos, dsl_dir_phys(pds)->dd_child_dir_zapobj,
+ name, sizeof (uint64_t), 1, &ddobj, tx));
+ } else {
+ /* it's the root dir */
+ VERIFY0(zap_add(mos, DMU_POOL_DIRECTORY_OBJECT,
+ DMU_POOL_ROOT_DATASET, sizeof (uint64_t), 1, &ddobj, tx));
+ }
+ VERIFY0(dmu_bonus_hold(mos, ddobj, FTAG, &dbuf));
+ dmu_buf_will_dirty(dbuf, tx);
+ ddphys = dbuf->db_data;
+
+ ddphys->dd_creation_time = gethrestime_sec();
+ if (pds) {
+ ddphys->dd_parent_obj = pds->dd_object;
+
+ /* update the filesystem counts */
+ dsl_fs_ss_count_adjust(pds, 1, DD_FIELD_FILESYSTEM_COUNT, tx);
+ }
+ ddphys->dd_props_zapobj = zap_create(mos,
+ DMU_OT_DSL_PROPS, DMU_OT_NONE, 0, tx);
+ ddphys->dd_child_dir_zapobj = zap_create(mos,
+ DMU_OT_DSL_DIR_CHILD_MAP, DMU_OT_NONE, 0, tx);
+ if (spa_version(dp->dp_spa) >= SPA_VERSION_USED_BREAKDOWN)
+ ddphys->dd_flags |= DD_FLAG_USED_BREAKDOWN;
+ dmu_buf_rele(dbuf, FTAG);
+
+ return (ddobj);
+}
+
+boolean_t
+dsl_dir_is_clone(dsl_dir_t *dd)
+{
+ return (dsl_dir_phys(dd)->dd_origin_obj &&
+ (dd->dd_pool->dp_origin_snap == NULL ||
+ dsl_dir_phys(dd)->dd_origin_obj !=
+ dd->dd_pool->dp_origin_snap->ds_object));
+}
+
+
+uint64_t
+dsl_dir_get_used(dsl_dir_t *dd)
+{
+ return (dsl_dir_phys(dd)->dd_used_bytes);
+}
+
+uint64_t
+dsl_dir_get_compressed(dsl_dir_t *dd)
+{
+ return (dsl_dir_phys(dd)->dd_compressed_bytes);
+}
+
+uint64_t
+dsl_dir_get_quota(dsl_dir_t *dd)
+{
+ return (dsl_dir_phys(dd)->dd_quota);
+}
+
+uint64_t
+dsl_dir_get_reservation(dsl_dir_t *dd)
+{
+ return (dsl_dir_phys(dd)->dd_reserved);
+}
+
+uint64_t
+dsl_dir_get_compressratio(dsl_dir_t *dd)
+{
+ /* a fixed point number, 100x the ratio */
+ return (dsl_dir_phys(dd)->dd_compressed_bytes == 0 ? 100 :
+ (dsl_dir_phys(dd)->dd_uncompressed_bytes * 100 /
+ dsl_dir_phys(dd)->dd_compressed_bytes));
+}
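+
+/*
+ * E.g. 2.5GB of logical data stored in 1GB on disk yields
+ * (2.5G * 100) / 1G == 250, i.e. a 2.50x compression ratio.
+ */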
+
+uint64_t
+dsl_dir_get_logicalused(dsl_dir_t *dd)
+{
+ return (dsl_dir_phys(dd)->dd_uncompressed_bytes);
+}
+
+uint64_t
+dsl_dir_get_usedsnap(dsl_dir_t *dd)
+{
+ return (dsl_dir_phys(dd)->dd_used_breakdown[DD_USED_SNAP]);
+}
+
+uint64_t
+dsl_dir_get_usedds(dsl_dir_t *dd)
+{
+ return (dsl_dir_phys(dd)->dd_used_breakdown[DD_USED_HEAD]);
+}
+
+uint64_t
+dsl_dir_get_usedrefreserv(dsl_dir_t *dd)
+{
+ return (dsl_dir_phys(dd)->dd_used_breakdown[DD_USED_REFRSRV]);
+}
+
+uint64_t
+dsl_dir_get_usedchild(dsl_dir_t *dd)
+{
+ return (dsl_dir_phys(dd)->dd_used_breakdown[DD_USED_CHILD] +
+ dsl_dir_phys(dd)->dd_used_breakdown[DD_USED_CHILD_RSRV]);
+}
+
+void
+dsl_dir_get_origin(dsl_dir_t *dd, char *buf)
+{
+ dsl_dataset_t *ds;
+ VERIFY0(dsl_dataset_hold_obj(dd->dd_pool,
+ dsl_dir_phys(dd)->dd_origin_obj, FTAG, &ds));
+
+ dsl_dataset_name(ds, buf);
+
+ dsl_dataset_rele(ds, FTAG);
+}
+
+int
+dsl_dir_get_filesystem_count(dsl_dir_t *dd, uint64_t *count)
+{
+ if (dsl_dir_is_zapified(dd)) {
+ objset_t *os = dd->dd_pool->dp_meta_objset;
+ return (zap_lookup(os, dd->dd_object, DD_FIELD_FILESYSTEM_COUNT,
+ sizeof (*count), 1, count));
+ } else {
+ return (ENOENT);
+ }
+}
+
+int
+dsl_dir_get_snapshot_count(dsl_dir_t *dd, uint64_t *count)
+{
+ if (dsl_dir_is_zapified(dd)) {
+ objset_t *os = dd->dd_pool->dp_meta_objset;
+ return (zap_lookup(os, dd->dd_object, DD_FIELD_SNAPSHOT_COUNT,
+ sizeof (*count), 1, count));
+ } else {
+ return (ENOENT);
+ }
+}
+
+int
+dsl_dir_get_remaptxg(dsl_dir_t *dd, uint64_t *count)
+{
+ if (dsl_dir_is_zapified(dd)) {
+ objset_t *os = dd->dd_pool->dp_meta_objset;
+ return (zap_lookup(os, dd->dd_object, DD_FIELD_LAST_REMAP_TXG,
+ sizeof (*count), 1, count));
+ } else {
+ return (ENOENT);
+ }
+}
+
+void
+dsl_dir_stats(dsl_dir_t *dd, nvlist_t *nv)
+{
+ mutex_enter(&dd->dd_lock);
+ dsl_prop_nvlist_add_uint64(nv, ZFS_PROP_QUOTA,
+ dsl_dir_get_quota(dd));
+ dsl_prop_nvlist_add_uint64(nv, ZFS_PROP_RESERVATION,
+ dsl_dir_get_reservation(dd));
+ dsl_prop_nvlist_add_uint64(nv, ZFS_PROP_LOGICALUSED,
+ dsl_dir_get_logicalused(dd));
+ if (dsl_dir_phys(dd)->dd_flags & DD_FLAG_USED_BREAKDOWN) {
+ dsl_prop_nvlist_add_uint64(nv, ZFS_PROP_USEDSNAP,
+ dsl_dir_get_usedsnap(dd));
+ dsl_prop_nvlist_add_uint64(nv, ZFS_PROP_USEDDS,
+ dsl_dir_get_usedds(dd));
+ dsl_prop_nvlist_add_uint64(nv, ZFS_PROP_USEDREFRESERV,
+ dsl_dir_get_usedrefreserv(dd));
+ dsl_prop_nvlist_add_uint64(nv, ZFS_PROP_USEDCHILD,
+ dsl_dir_get_usedchild(dd));
+ }
+ mutex_exit(&dd->dd_lock);
+
+ uint64_t count;
+ if (dsl_dir_get_filesystem_count(dd, &count) == 0) {
+ dsl_prop_nvlist_add_uint64(nv, ZFS_PROP_FILESYSTEM_COUNT,
+ count);
+ }
+ if (dsl_dir_get_snapshot_count(dd, &count) == 0) {
+ dsl_prop_nvlist_add_uint64(nv, ZFS_PROP_SNAPSHOT_COUNT,
+ count);
+ }
+ if (dsl_dir_get_remaptxg(dd, &count) == 0) {
+ dsl_prop_nvlist_add_uint64(nv, ZFS_PROP_REMAPTXG,
+ count);
+ }
+
+ if (dsl_dir_is_clone(dd)) {
+ char buf[ZFS_MAX_DATASET_NAME_LEN];
+ dsl_dir_get_origin(dd, buf);
+ dsl_prop_nvlist_add_string(nv, ZFS_PROP_ORIGIN, buf);
+ }
+}
+
+void
+dsl_dir_dirty(dsl_dir_t *dd, dmu_tx_t *tx)
+{
+ dsl_pool_t *dp = dd->dd_pool;
+
+ ASSERT(dsl_dir_phys(dd));
+
+ if (txg_list_add(&dp->dp_dirty_dirs, dd, tx->tx_txg)) {
+ /* up the hold count until we can be written out */
+ dmu_buf_add_ref(dd->dd_dbuf, dd);
+ }
+}
+
+static int64_t
+parent_delta(dsl_dir_t *dd, uint64_t used, int64_t delta)
+{
+ uint64_t old_accounted = MAX(used, dsl_dir_phys(dd)->dd_reserved);
+ uint64_t new_accounted =
+ MAX(used + delta, dsl_dir_phys(dd)->dd_reserved);
+ return (new_accounted - old_accounted);
+}
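+
+/*
+ * For example, with dd_reserved == 100: growing used from 80 to 120
+ * (delta == 40) only charges the parent 20, because the first 100 bytes
+ * were already accounted for by the reservation; shrinking from 120 back
+ * to 80 likewise returns -20.
+ */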
+
+void
+dsl_dir_sync(dsl_dir_t *dd, dmu_tx_t *tx)
+{
+ ASSERT(dmu_tx_is_syncing(tx));
+
+ mutex_enter(&dd->dd_lock);
+ ASSERT0(dd->dd_tempreserved[tx->tx_txg&TXG_MASK]);
+ dprintf_dd(dd, "txg=%llu towrite=%lluK\n", tx->tx_txg,
+ dd->dd_space_towrite[tx->tx_txg&TXG_MASK] / 1024);
+ dd->dd_space_towrite[tx->tx_txg&TXG_MASK] = 0;
+ mutex_exit(&dd->dd_lock);
+
+ /* release the hold from dsl_dir_dirty */
+ dmu_buf_rele(dd->dd_dbuf, dd);
+}
+
+static uint64_t
+dsl_dir_space_towrite(dsl_dir_t *dd)
+{
+ uint64_t space = 0;
+
+ ASSERT(MUTEX_HELD(&dd->dd_lock));
+
+ for (int i = 0; i < TXG_SIZE; i++) {
+ space += dd->dd_space_towrite[i & TXG_MASK];
+ ASSERT3U(dd->dd_space_towrite[i & TXG_MASK], >=, 0);
+ }
+ return (space);
+}
+
+/*
+ * How much space would dd have available if ancestor had delta applied
+ * to it? If ondiskonly is set, we're only interested in what's
+ * on-disk, not estimated pending changes.
+ */
+uint64_t
+dsl_dir_space_available(dsl_dir_t *dd,
+ dsl_dir_t *ancestor, int64_t delta, int ondiskonly)
+{
+ uint64_t parentspace, myspace, quota, used;
+
+ /*
+ * If there are no restrictions otherwise, assume we have
+ * unlimited space available.
+ */
+ quota = UINT64_MAX;
+ parentspace = UINT64_MAX;
+
+ if (dd->dd_parent != NULL) {
+ parentspace = dsl_dir_space_available(dd->dd_parent,
+ ancestor, delta, ondiskonly);
+ }
+
+ mutex_enter(&dd->dd_lock);
+ if (dsl_dir_phys(dd)->dd_quota != 0)
+ quota = dsl_dir_phys(dd)->dd_quota;
+ used = dsl_dir_phys(dd)->dd_used_bytes;
+ if (!ondiskonly)
+ used += dsl_dir_space_towrite(dd);
+
+ if (dd->dd_parent == NULL) {
+ uint64_t poolsize = dsl_pool_adjustedsize(dd->dd_pool,
+ ZFS_SPACE_CHECK_NORMAL);
+ quota = MIN(quota, poolsize);
+ }
+
+ if (dsl_dir_phys(dd)->dd_reserved > used && parentspace != UINT64_MAX) {
+ /*
+ * We have some space reserved, in addition to what our
+ * parent gave us.
+ */
+ parentspace += dsl_dir_phys(dd)->dd_reserved - used;
+ }
+
+ if (dd == ancestor) {
+ ASSERT(delta <= 0);
+ ASSERT(used >= -delta);
+ used += delta;
+ if (parentspace != UINT64_MAX)
+ parentspace -= delta;
+ }
+
+ if (used > quota) {
+ /* over quota */
+ myspace = 0;
+ } else {
+ /*
+ * the lesser of the space provided by our parent and
+ * the space left in our quota
+ */
+ myspace = MIN(parentspace, quota - used);
+ }
+
+ mutex_exit(&dd->dd_lock);
+
+ return (myspace);
+}
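+
+/*
+ * For example, with a 10G quota, 4G used, no reservation or pending
+ * writes, and a parent offering 100G, this returns
+ * MIN(100G, 10G - 4G) == 6G.
+ */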
+
+struct tempreserve {
+ list_node_t tr_node;
+ dsl_dir_t *tr_ds;
+ uint64_t tr_size;
+};
+
+static int
+dsl_dir_tempreserve_impl(dsl_dir_t *dd, uint64_t asize, boolean_t netfree,
+ boolean_t ignorequota, list_t *tr_list,
+ dmu_tx_t *tx, boolean_t first)
+{
+ uint64_t txg = tx->tx_txg;
+ uint64_t quota;
+ struct tempreserve *tr;
+ int retval = EDQUOT;
+ uint64_t ref_rsrv = 0;
+
+ ASSERT3U(txg, !=, 0);
+ ASSERT3S(asize, >, 0);
+
+ mutex_enter(&dd->dd_lock);
+
+ /*
+ * Check against the dsl_dir's quota. We don't add in the delta
+ * when checking for over-quota because they get one free hit.
+ */
+ uint64_t est_inflight = dsl_dir_space_towrite(dd);
+ for (int i = 0; i < TXG_SIZE; i++)
+ est_inflight += dd->dd_tempreserved[i];
+ uint64_t used_on_disk = dsl_dir_phys(dd)->dd_used_bytes;
+
+ /*
+ * On the first iteration, fetch the dataset's used-on-disk and
+ * refreservation values. Also, unless this is a net-free transaction
+ * (netfree), test if allocating this space would exceed the dataset's
+ * refquota.
+ */
+ if (first && tx->tx_objset) {
+ int error;
+ dsl_dataset_t *ds = tx->tx_objset->os_dsl_dataset;
+
+ error = dsl_dataset_check_quota(ds, !netfree,
+ asize, est_inflight, &used_on_disk, &ref_rsrv);
+ if (error != 0) {
+ mutex_exit(&dd->dd_lock);
+ return (error);
+ }
+ }
+
+ /*
+ * If this transaction will result in a net free of space,
+ * we want to let it through.
+ */
+ if (ignorequota || netfree || dsl_dir_phys(dd)->dd_quota == 0)
+ quota = UINT64_MAX;
+ else
+ quota = dsl_dir_phys(dd)->dd_quota;
+
+ /*
+ * Adjust the quota against the actual pool size at the root
+ * minus any outstanding deferred frees.
+ * To ensure that it's possible to remove files from a full
+ * pool without inducing transient overcommits, we throttle
+ * netfree transactions against a quota that is slightly larger,
+ * but still within the pool's allocation slop. In cases where
+ * we're very close to full, this will allow a steady trickle of
+ * removes to get through.
+ */
+ uint64_t deferred = 0;
+ if (dd->dd_parent == NULL) {
+ uint64_t avail = dsl_pool_unreserved_space(dd->dd_pool,
+ (netfree) ?
+ ZFS_SPACE_CHECK_RESERVED : ZFS_SPACE_CHECK_NORMAL);
+
+ if (avail < quota) {
+ quota = avail;
+ retval = ENOSPC;
+ }
+ }
+
+ /*
+ * If they are requesting more space, and our current estimate
+ * is over quota, they get to try again unless the actual
+ * on-disk is over quota and there are no pending changes (which
+ * may free up space for us).
+ */
+ if (used_on_disk + est_inflight >= quota) {
+ if (est_inflight > 0 || used_on_disk < quota ||
+ (retval == ENOSPC && used_on_disk < quota + deferred))
+ retval = ERESTART;
+ dprintf_dd(dd, "failing: used=%lluK inflight = %lluK "
+ "quota=%lluK tr=%lluK err=%d\n",
+ used_on_disk>>10, est_inflight>>10,
+ quota>>10, asize>>10, retval);
+ mutex_exit(&dd->dd_lock);
+ return (SET_ERROR(retval));
+ }
+
+ /* We need to up our estimated delta before dropping dd_lock */
+ dd->dd_tempreserved[txg & TXG_MASK] += asize;
+
+ uint64_t parent_rsrv = parent_delta(dd, used_on_disk + est_inflight,
+ asize - ref_rsrv);
+ mutex_exit(&dd->dd_lock);
+
+ tr = kmem_zalloc(sizeof (struct tempreserve), KM_SLEEP);
+ tr->tr_ds = dd;
+ tr->tr_size = asize;
+ list_insert_tail(tr_list, tr);
+
+ /* see if it's OK with our parent */
+ if (dd->dd_parent != NULL && parent_rsrv != 0) {
+ boolean_t ismos = (dsl_dir_phys(dd)->dd_head_dataset_obj == 0);
+
+ return (dsl_dir_tempreserve_impl(dd->dd_parent,
+ parent_rsrv, netfree, ismos, tr_list, tx, B_FALSE));
+ } else {
+ return (0);
+ }
+}
+
+/*
+ * Reserve space in this dsl_dir, to be used in this tx's txg.
+ * After the space has been dirtied (and dsl_dir_willuse_space()
+ * has been called), the reservation should be canceled, using
+ * dsl_dir_tempreserve_clear().
+ */
+int
+dsl_dir_tempreserve_space(dsl_dir_t *dd, uint64_t lsize, uint64_t asize,
+ boolean_t netfree, void **tr_cookiep, dmu_tx_t *tx)
+{
+ int err;
+ list_t *tr_list;
+
+ if (asize == 0) {
+ *tr_cookiep = NULL;
+ return (0);
+ }
+
+ tr_list = kmem_alloc(sizeof (list_t), KM_SLEEP);
+ list_create(tr_list, sizeof (struct tempreserve),
+ offsetof(struct tempreserve, tr_node));
+ ASSERT3S(asize, >, 0);
+
+ err = arc_tempreserve_space(dd->dd_pool->dp_spa, lsize, tx->tx_txg);
+ if (err == 0) {
+ struct tempreserve *tr;
+
+ tr = kmem_zalloc(sizeof (struct tempreserve), KM_SLEEP);
+ tr->tr_size = lsize;
+ list_insert_tail(tr_list, tr);
+ } else {
+ if (err == EAGAIN) {
+ /*
+ * If arc_memory_throttle() detected that pageout
+ * is running and we are low on memory, we delay new
+ * non-pageout transactions to give pageout an
+ * advantage.
+ *
+ * It is unfortunate to be delaying while the caller's
+ * locks are held.
+ */
+ txg_delay(dd->dd_pool, tx->tx_txg,
+ MSEC2NSEC(10), MSEC2NSEC(10));
+ err = SET_ERROR(ERESTART);
+ }
+ }
+
+ if (err == 0) {
+ err = dsl_dir_tempreserve_impl(dd, asize, netfree,
+ B_FALSE, tr_list, tx, B_TRUE);
+ }
+
+ if (err != 0)
+ dsl_dir_tempreserve_clear(tr_list, tx);
+ else
+ *tr_cookiep = tr_list;
+
+ return (err);
+}
+
+/*
+ * Clear a temporary reservation that we previously made with
+ * dsl_dir_tempreserve_space().
+ */
+void
+dsl_dir_tempreserve_clear(void *tr_cookie, dmu_tx_t *tx)
+{
+ int txgidx = tx->tx_txg & TXG_MASK;
+ list_t *tr_list = tr_cookie;
+ struct tempreserve *tr;
+
+ ASSERT3U(tx->tx_txg, !=, 0);
+
+ if (tr_cookie == NULL)
+ return;
+
+ while ((tr = list_head(tr_list)) != NULL) {
+ if (tr->tr_ds) {
+ mutex_enter(&tr->tr_ds->dd_lock);
+ ASSERT3U(tr->tr_ds->dd_tempreserved[txgidx], >=,
+ tr->tr_size);
+ tr->tr_ds->dd_tempreserved[txgidx] -= tr->tr_size;
+ mutex_exit(&tr->tr_ds->dd_lock);
+ } else {
+ arc_tempreserve_clear(tr->tr_size);
+ }
+ list_remove(tr_list, tr);
+ kmem_free(tr, sizeof (struct tempreserve));
+ }
+
+ kmem_free(tr_list, sizeof (list_t));
+}
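+
+/*
+ * A minimal sketch of the reserve/clear pairing (error handling and the
+ * actual dirtying elided):
+ *
+ *    void *cookie;
+ *    int error = dsl_dir_tempreserve_space(dd, lsize, asize, B_FALSE,
+ *        &cookie, tx);
+ *    if (error == 0) {
+ *        ... dirty the space ...
+ *        dsl_dir_tempreserve_clear(cookie, tx);
+ *    }
+ */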
+
+/*
+ * This should be called from open context when we think we're going to write
+ * or free space, for example when dirtying data. Be conservative; it's okay
+ * to write less space or free more, but we don't want to write more or free
+ * less than the amount specified.
+ */
+void
+dsl_dir_willuse_space(dsl_dir_t *dd, int64_t space, dmu_tx_t *tx)
+{
+ int64_t parent_space;
+ uint64_t est_used;
+
+ mutex_enter(&dd->dd_lock);
+ if (space > 0)
+ dd->dd_space_towrite[tx->tx_txg & TXG_MASK] += space;
+
+ est_used = dsl_dir_space_towrite(dd) + dsl_dir_phys(dd)->dd_used_bytes;
+ parent_space = parent_delta(dd, est_used, space);
+ mutex_exit(&dd->dd_lock);
+
+ /* Make sure that we clean up dd_space_to* */
+ dsl_dir_dirty(dd, tx);
+
+ /* XXX this is potentially expensive and unnecessary... */
+ if (parent_space && dd->dd_parent)
+ dsl_dir_willuse_space(dd->dd_parent, parent_space, tx);
+}
+
+/* call from syncing context when we actually write/free space for this dd */
+void
+dsl_dir_diduse_space(dsl_dir_t *dd, dd_used_t type,
+ int64_t used, int64_t compressed, int64_t uncompressed, dmu_tx_t *tx)
+{
+ int64_t accounted_delta;
+
+ /*
+ * dsl_dataset_set_refreservation_sync_impl() calls this with
+ * dd_lock held, so that it can atomically update
+ * ds->ds_reserved and the dsl_dir accounting, so that
+ * dsl_dataset_check_quota() can see dataset and dir accounting
+ * consistently.
+ */
+ boolean_t needlock = !MUTEX_HELD(&dd->dd_lock);
+
+ ASSERT(dmu_tx_is_syncing(tx));
+ ASSERT(type < DD_USED_NUM);
+
+ dmu_buf_will_dirty(dd->dd_dbuf, tx);
+
+ if (needlock)
+ mutex_enter(&dd->dd_lock);
+ accounted_delta =
+ parent_delta(dd, dsl_dir_phys(dd)->dd_used_bytes, used);
+ ASSERT(used >= 0 || dsl_dir_phys(dd)->dd_used_bytes >= -used);
+ ASSERT(compressed >= 0 ||
+ dsl_dir_phys(dd)->dd_compressed_bytes >= -compressed);
+ ASSERT(uncompressed >= 0 ||
+ dsl_dir_phys(dd)->dd_uncompressed_bytes >= -uncompressed);
+ dsl_dir_phys(dd)->dd_used_bytes += used;
+ dsl_dir_phys(dd)->dd_uncompressed_bytes += uncompressed;
+ dsl_dir_phys(dd)->dd_compressed_bytes += compressed;
+
+ if (dsl_dir_phys(dd)->dd_flags & DD_FLAG_USED_BREAKDOWN) {
+ ASSERT(used > 0 ||
+ dsl_dir_phys(dd)->dd_used_breakdown[type] >= -used);
+ dsl_dir_phys(dd)->dd_used_breakdown[type] += used;
+#ifdef DEBUG
+ dd_used_t t;
+ uint64_t u = 0;
+ for (t = 0; t < DD_USED_NUM; t++)
+ u += dsl_dir_phys(dd)->dd_used_breakdown[t];
+ ASSERT3U(u, ==, dsl_dir_phys(dd)->dd_used_bytes);
+#endif
+ }
+ if (needlock)
+ mutex_exit(&dd->dd_lock);
+
+ if (dd->dd_parent != NULL) {
+ dsl_dir_diduse_space(dd->dd_parent, DD_USED_CHILD,
+ accounted_delta, compressed, uncompressed, tx);
+ dsl_dir_transfer_space(dd->dd_parent,
+ used - accounted_delta,
+ DD_USED_CHILD_RSRV, DD_USED_CHILD, NULL);
+ }
+}
+
+void
+dsl_dir_transfer_space(dsl_dir_t *dd, int64_t delta,
+ dd_used_t oldtype, dd_used_t newtype, dmu_tx_t *tx)
+{
+ ASSERT(tx == NULL || dmu_tx_is_syncing(tx));
+ ASSERT(oldtype < DD_USED_NUM);
+ ASSERT(newtype < DD_USED_NUM);
+
+ if (delta == 0 ||
+ !(dsl_dir_phys(dd)->dd_flags & DD_FLAG_USED_BREAKDOWN))
+ return;
+
+ if (tx != NULL)
+ dmu_buf_will_dirty(dd->dd_dbuf, tx);
+ mutex_enter(&dd->dd_lock);
+ ASSERT(delta > 0 ?
+ dsl_dir_phys(dd)->dd_used_breakdown[oldtype] >= delta :
+ dsl_dir_phys(dd)->dd_used_breakdown[newtype] >= -delta);
+ ASSERT(dsl_dir_phys(dd)->dd_used_bytes >= ABS(delta));
+ dsl_dir_phys(dd)->dd_used_breakdown[oldtype] -= delta;
+ dsl_dir_phys(dd)->dd_used_breakdown[newtype] += delta;
+ mutex_exit(&dd->dd_lock);
+}
+
+typedef struct dsl_dir_set_qr_arg {
+ const char *ddsqra_name;
+ zprop_source_t ddsqra_source;
+ uint64_t ddsqra_value;
+} dsl_dir_set_qr_arg_t;
+
+static int
+dsl_dir_set_quota_check(void *arg, dmu_tx_t *tx)
+{
+ dsl_dir_set_qr_arg_t *ddsqra = arg;
+ dsl_pool_t *dp = dmu_tx_pool(tx);
+ dsl_dataset_t *ds;
+ int error;
+ uint64_t towrite, newval;
+
+ error = dsl_dataset_hold(dp, ddsqra->ddsqra_name, FTAG, &ds);
+ if (error != 0)
+ return (error);
+
+ error = dsl_prop_predict(ds->ds_dir, "quota",
+ ddsqra->ddsqra_source, ddsqra->ddsqra_value, &newval);
+ if (error != 0) {
+ dsl_dataset_rele(ds, FTAG);
+ return (error);
+ }
+
+ if (newval == 0) {
+ dsl_dataset_rele(ds, FTAG);
+ return (0);
+ }
+
+ mutex_enter(&ds->ds_dir->dd_lock);
+ /*
+ * If we are doing the preliminary check in open context, and
+ * there are pending changes, then don't fail it, since the
+ * pending changes could under-estimate the amount of space to be
+ * freed up.
+ */
+ towrite = dsl_dir_space_towrite(ds->ds_dir);
+ if ((dmu_tx_is_syncing(tx) || towrite == 0) &&
+ (newval < dsl_dir_phys(ds->ds_dir)->dd_reserved ||
+ newval < dsl_dir_phys(ds->ds_dir)->dd_used_bytes + towrite)) {
+ error = SET_ERROR(ENOSPC);
+ }
+ mutex_exit(&ds->ds_dir->dd_lock);
+ dsl_dataset_rele(ds, FTAG);
+ return (error);
+}
+
+static void
+dsl_dir_set_quota_sync(void *arg, dmu_tx_t *tx)
+{
+ dsl_dir_set_qr_arg_t *ddsqra = arg;
+ dsl_pool_t *dp = dmu_tx_pool(tx);
+ dsl_dataset_t *ds;
+ uint64_t newval;
+
+ VERIFY0(dsl_dataset_hold(dp, ddsqra->ddsqra_name, FTAG, &ds));
+
+ if (spa_version(dp->dp_spa) >= SPA_VERSION_RECVD_PROPS) {
+ dsl_prop_set_sync_impl(ds, zfs_prop_to_name(ZFS_PROP_QUOTA),
+ ddsqra->ddsqra_source, sizeof (ddsqra->ddsqra_value), 1,
+ &ddsqra->ddsqra_value, tx);
+
+ VERIFY0(dsl_prop_get_int_ds(ds,
+ zfs_prop_to_name(ZFS_PROP_QUOTA), &newval));
+ } else {
+ newval = ddsqra->ddsqra_value;
+ spa_history_log_internal_ds(ds, "set", tx, "%s=%lld",
+ zfs_prop_to_name(ZFS_PROP_QUOTA), (longlong_t)newval);
+ }
+
+ dmu_buf_will_dirty(ds->ds_dir->dd_dbuf, tx);
+ mutex_enter(&ds->ds_dir->dd_lock);
+ dsl_dir_phys(ds->ds_dir)->dd_quota = newval;
+ mutex_exit(&ds->ds_dir->dd_lock);
+ dsl_dataset_rele(ds, FTAG);
+}
+
+int
+dsl_dir_set_quota(const char *ddname, zprop_source_t source, uint64_t quota)
+{
+ dsl_dir_set_qr_arg_t ddsqra;
+
+ ddsqra.ddsqra_name = ddname;
+ ddsqra.ddsqra_source = source;
+ ddsqra.ddsqra_value = quota;
+
+ return (dsl_sync_task(ddname, dsl_dir_set_quota_check,
+ dsl_dir_set_quota_sync, &ddsqra, 0,
+ ZFS_SPACE_CHECK_EXTRA_RESERVED));
+}
+
+int
+dsl_dir_set_reservation_check(void *arg, dmu_tx_t *tx)
+{
+ dsl_dir_set_qr_arg_t *ddsqra = arg;
+ dsl_pool_t *dp = dmu_tx_pool(tx);
+ dsl_dataset_t *ds;
+ dsl_dir_t *dd;
+ uint64_t newval, used, avail;
+ int error;
+
+ error = dsl_dataset_hold(dp, ddsqra->ddsqra_name, FTAG, &ds);
+ if (error != 0)
+ return (error);
+ dd = ds->ds_dir;
+
+ /*
+ * If we are doing the preliminary check in open context, the
+ * space estimates may be inaccurate.
+ */
+ if (!dmu_tx_is_syncing(tx)) {
+ dsl_dataset_rele(ds, FTAG);
+ return (0);
+ }
+
+ error = dsl_prop_predict(ds->ds_dir,
+ zfs_prop_to_name(ZFS_PROP_RESERVATION),
+ ddsqra->ddsqra_source, ddsqra->ddsqra_value, &newval);
+ if (error != 0) {
+ dsl_dataset_rele(ds, FTAG);
+ return (error);
+ }
+
+ mutex_enter(&dd->dd_lock);
+ used = dsl_dir_phys(dd)->dd_used_bytes;
+ mutex_exit(&dd->dd_lock);
+
+ if (dd->dd_parent) {
+ avail = dsl_dir_space_available(dd->dd_parent,
+ NULL, 0, FALSE);
+ } else {
+ avail = dsl_pool_adjustedsize(dd->dd_pool,
+ ZFS_SPACE_CHECK_NORMAL) - used;
+ }
+
+ if (MAX(used, newval) > MAX(used, dsl_dir_phys(dd)->dd_reserved)) {
+ uint64_t delta = MAX(used, newval) -
+ MAX(used, dsl_dir_phys(dd)->dd_reserved);
+
+ if (delta > avail ||
+ (dsl_dir_phys(dd)->dd_quota > 0 &&
+ newval > dsl_dir_phys(dd)->dd_quota))
+ error = SET_ERROR(ENOSPC);
+ }
+
+ dsl_dataset_rele(ds, FTAG);
+ return (error);
+}
+
+void
+dsl_dir_set_reservation_sync_impl(dsl_dir_t *dd, uint64_t value, dmu_tx_t *tx)
+{
+ uint64_t used;
+ int64_t delta;
+
+ dmu_buf_will_dirty(dd->dd_dbuf, tx);
+
+ mutex_enter(&dd->dd_lock);
+ used = dsl_dir_phys(dd)->dd_used_bytes;
+ delta = MAX(used, value) - MAX(used, dsl_dir_phys(dd)->dd_reserved);
+ dsl_dir_phys(dd)->dd_reserved = value;
+
+ if (dd->dd_parent != NULL) {
+ /* Roll up this additional usage into our ancestors */
+ dsl_dir_diduse_space(dd->dd_parent, DD_USED_CHILD_RSRV,
+ delta, 0, 0, tx);
+ }
+ mutex_exit(&dd->dd_lock);
+}
+
+static void
+dsl_dir_set_reservation_sync(void *arg, dmu_tx_t *tx)
+{
+ dsl_dir_set_qr_arg_t *ddsqra = arg;
+ dsl_pool_t *dp = dmu_tx_pool(tx);
+ dsl_dataset_t *ds;
+ uint64_t newval;
+
+ VERIFY0(dsl_dataset_hold(dp, ddsqra->ddsqra_name, FTAG, &ds));
+
+ if (spa_version(dp->dp_spa) >= SPA_VERSION_RECVD_PROPS) {
+ dsl_prop_set_sync_impl(ds,
+ zfs_prop_to_name(ZFS_PROP_RESERVATION),
+ ddsqra->ddsqra_source, sizeof (ddsqra->ddsqra_value), 1,
+ &ddsqra->ddsqra_value, tx);
+
+ VERIFY0(dsl_prop_get_int_ds(ds,
+ zfs_prop_to_name(ZFS_PROP_RESERVATION), &newval));
+ } else {
+ newval = ddsqra->ddsqra_value;
+ spa_history_log_internal_ds(ds, "set", tx, "%s=%lld",
+ zfs_prop_to_name(ZFS_PROP_RESERVATION),
+ (longlong_t)newval);
+ }
+
+ dsl_dir_set_reservation_sync_impl(ds->ds_dir, newval, tx);
+ dsl_dataset_rele(ds, FTAG);
+}
+
+int
+dsl_dir_set_reservation(const char *ddname, zprop_source_t source,
+ uint64_t reservation)
+{
+ dsl_dir_set_qr_arg_t ddsqra;
+
+ ddsqra.ddsqra_name = ddname;
+ ddsqra.ddsqra_source = source;
+ ddsqra.ddsqra_value = reservation;
+
+ return (dsl_sync_task(ddname, dsl_dir_set_reservation_check,
+ dsl_dir_set_reservation_sync, &ddsqra, 0,
+ ZFS_SPACE_CHECK_EXTRA_RESERVED));
+}
+
+static dsl_dir_t *
+closest_common_ancestor(dsl_dir_t *ds1, dsl_dir_t *ds2)
+{
+ for (; ds1; ds1 = ds1->dd_parent) {
+ dsl_dir_t *dd;
+ for (dd = ds2; dd; dd = dd->dd_parent) {
+ if (ds1 == dd)
+ return (dd);
+ }
+ }
+ return (NULL);
+}
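+
+/*
+ * E.g. for the dirs of "a/b/c" and "a/b/d" (hypothetical names) the
+ * closest common ancestor is the dir of "a/b"; for "a" and "a/b" it is
+ * "a" itself.
+ */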
+
+/*
+ * If delta is applied to dd, how much of that delta would be applied to
+ * ancestor? Syncing context only.
+ */
+static int64_t
+would_change(dsl_dir_t *dd, int64_t delta, dsl_dir_t *ancestor)
+{
+ if (dd == ancestor)
+ return (delta);
+
+ mutex_enter(&dd->dd_lock);
+ delta = parent_delta(dd, dsl_dir_phys(dd)->dd_used_bytes, delta);
+ mutex_exit(&dd->dd_lock);
+ return (would_change(dd->dd_parent, delta, ancestor));
+}
+
+typedef struct dsl_dir_rename_arg {
+ const char *ddra_oldname;
+ const char *ddra_newname;
+ cred_t *ddra_cred;
+} dsl_dir_rename_arg_t;
+
+typedef struct dsl_valid_rename_arg {
+ int char_delta;
+ int nest_delta;
+} dsl_valid_rename_arg_t;
+
+/* ARGSUSED */
+static int
+dsl_valid_rename(dsl_pool_t *dp, dsl_dataset_t *ds, void *arg)
+{
+ dsl_valid_rename_arg_t *dvra = arg;
+ char namebuf[ZFS_MAX_DATASET_NAME_LEN];
+
+ dsl_dataset_name(ds, namebuf);
+
+ ASSERT3U(strnlen(namebuf, ZFS_MAX_DATASET_NAME_LEN),
+ <, ZFS_MAX_DATASET_NAME_LEN);
+ int namelen = strlen(namebuf) + dvra->char_delta;
+ int depth = get_dataset_depth(namebuf) + dvra->nest_delta;
+
+ if (namelen >= ZFS_MAX_DATASET_NAME_LEN)
+ return (SET_ERROR(ENAMETOOLONG));
+ if (dvra->nest_delta > 0 && depth >= zfs_max_dataset_nesting)
+ return (SET_ERROR(ENAMETOOLONG));
+ return (0);
+}
+
+static int
+dsl_dir_rename_check(void *arg, dmu_tx_t *tx)
+{
+ dsl_dir_rename_arg_t *ddra = arg;
+ dsl_pool_t *dp = dmu_tx_pool(tx);
+ dsl_dir_t *dd, *newparent;
+ dsl_valid_rename_arg_t dvra;
+ dsl_dataset_t *parentds;
+ objset_t *parentos;
+ const char *mynewname;
+ int error;
+
+ /* target dir should exist */
+ error = dsl_dir_hold(dp, ddra->ddra_oldname, FTAG, &dd, NULL);
+ if (error != 0)
+ return (error);
+
+ /* new parent should exist */
+ error = dsl_dir_hold(dp, ddra->ddra_newname, FTAG,
+ &newparent, &mynewname);
+ if (error != 0) {
+ dsl_dir_rele(dd, FTAG);
+ return (error);
+ }
+
+ /* can't rename to different pool */
+ if (dd->dd_pool != newparent->dd_pool) {
+ dsl_dir_rele(newparent, FTAG);
+ dsl_dir_rele(dd, FTAG);
+ return (SET_ERROR(EXDEV));
+ }
+
+ /* new name should not already exist */
+ if (mynewname == NULL) {
+ dsl_dir_rele(newparent, FTAG);
+ dsl_dir_rele(dd, FTAG);
+ return (SET_ERROR(EEXIST));
+ }
+
+ /* can't rename below anything but filesystems (e.g. no ZVOLs) */
+ error = dsl_dataset_hold_obj(newparent->dd_pool,
+ dsl_dir_phys(newparent)->dd_head_dataset_obj, FTAG, &parentds);
+ if (error != 0) {
+ dsl_dir_rele(newparent, FTAG);
+ dsl_dir_rele(dd, FTAG);
+ return (error);
+ }
+ error = dmu_objset_from_ds(parentds, &parentos);
+ if (error != 0) {
+ dsl_dataset_rele(parentds, FTAG);
+ dsl_dir_rele(newparent, FTAG);
+ dsl_dir_rele(dd, FTAG);
+ return (error);
+ }
+ if (dmu_objset_type(parentos) != DMU_OST_ZFS) {
+ dsl_dataset_rele(parentds, FTAG);
+ dsl_dir_rele(newparent, FTAG);
+ dsl_dir_rele(dd, FTAG);
+ return (SET_ERROR(ZFS_ERR_WRONG_PARENT));
+ }
+ dsl_dataset_rele(parentds, FTAG);
+
+ ASSERT3U(strnlen(ddra->ddra_newname, ZFS_MAX_DATASET_NAME_LEN),
+ <, ZFS_MAX_DATASET_NAME_LEN);
+ ASSERT3U(strnlen(ddra->ddra_oldname, ZFS_MAX_DATASET_NAME_LEN),
+ <, ZFS_MAX_DATASET_NAME_LEN);
+ dvra.char_delta = strlen(ddra->ddra_newname)
+ - strlen(ddra->ddra_oldname);
+ dvra.nest_delta = get_dataset_depth(ddra->ddra_newname)
+ - get_dataset_depth(ddra->ddra_oldname);
+
+ /* if the name length is growing, validate child name lengths */
+ if (dvra.char_delta > 0 || dvra.nest_delta > 0) {
+ error = dmu_objset_find_dp(dp, dd->dd_object, dsl_valid_rename,
+ &dvra, DS_FIND_CHILDREN | DS_FIND_SNAPSHOTS);
+ if (error != 0) {
+ dsl_dir_rele(newparent, FTAG);
+ dsl_dir_rele(dd, FTAG);
+ return (error);
+ }
+ }
+
+ if (dmu_tx_is_syncing(tx)) {
+ if (spa_feature_is_active(dp->dp_spa,
+ SPA_FEATURE_FS_SS_LIMIT)) {
+ /*
+ * Although this is the check function and we don't
+ * normally make on-disk changes in check functions,
+ * we need to do that here.
+ *
+ * Ensure this portion of the tree's counts have been
+ * initialized in case the new parent has limits set.
+ */
+ dsl_dir_init_fs_ss_count(dd, tx);
+ }
+ }
+
+ if (newparent != dd->dd_parent) {
+ /* is there enough space? */
+ uint64_t myspace =
+ MAX(dsl_dir_phys(dd)->dd_used_bytes,
+ dsl_dir_phys(dd)->dd_reserved);
+ objset_t *os = dd->dd_pool->dp_meta_objset;
+ uint64_t fs_cnt = 0;
+ uint64_t ss_cnt = 0;
+
+ if (dsl_dir_is_zapified(dd)) {
+ int err;
+
+ err = zap_lookup(os, dd->dd_object,
+ DD_FIELD_FILESYSTEM_COUNT, sizeof (fs_cnt), 1,
+ &fs_cnt);
+ if (err != ENOENT && err != 0) {
+ dsl_dir_rele(newparent, FTAG);
+ dsl_dir_rele(dd, FTAG);
+ return (err);
+ }
+
+ /*
+ * have to add 1 for the filesystem itself that we're
+ * moving
+ */
+ fs_cnt++;
+
+ err = zap_lookup(os, dd->dd_object,
+ DD_FIELD_SNAPSHOT_COUNT, sizeof (ss_cnt), 1,
+ &ss_cnt);
+ if (err != ENOENT && err != 0) {
+ dsl_dir_rele(newparent, FTAG);
+ dsl_dir_rele(dd, FTAG);
+ return (err);
+ }
+ }
+
+ /* no rename into our descendant */
+ if (closest_common_ancestor(dd, newparent) == dd) {
+ dsl_dir_rele(newparent, FTAG);
+ dsl_dir_rele(dd, FTAG);
+ return (SET_ERROR(EINVAL));
+ }
+
+ error = dsl_dir_transfer_possible(dd->dd_parent,
+ newparent, fs_cnt, ss_cnt, myspace, ddra->ddra_cred);
+ if (error != 0) {
+ dsl_dir_rele(newparent, FTAG);
+ dsl_dir_rele(dd, FTAG);
+ return (error);
+ }
+ }
+
+ dsl_dir_rele(newparent, FTAG);
+ dsl_dir_rele(dd, FTAG);
+ return (0);
+}
+
+static void
+dsl_dir_rename_sync(void *arg, dmu_tx_t *tx)
+{
+ dsl_dir_rename_arg_t *ddra = arg;
+ dsl_pool_t *dp = dmu_tx_pool(tx);
+ dsl_dir_t *dd, *newparent;
+ const char *mynewname;
+ int error;
+ objset_t *mos = dp->dp_meta_objset;
+
+ VERIFY0(dsl_dir_hold(dp, ddra->ddra_oldname, FTAG, &dd, NULL));
+ VERIFY0(dsl_dir_hold(dp, ddra->ddra_newname, FTAG, &newparent,
+ &mynewname));
+
+ /* Log this before we change the name. */
+ spa_history_log_internal_dd(dd, "rename", tx,
+ "-> %s", ddra->ddra_newname);
+
+ if (newparent != dd->dd_parent) {
+ objset_t *os = dd->dd_pool->dp_meta_objset;
+ uint64_t fs_cnt = 0;
+ uint64_t ss_cnt = 0;
+
+ /*
+ * We already made sure the dd counts were initialized in the
+ * check function.
+ */
+ if (spa_feature_is_active(dp->dp_spa,
+ SPA_FEATURE_FS_SS_LIMIT)) {
+ VERIFY0(zap_lookup(os, dd->dd_object,
+ DD_FIELD_FILESYSTEM_COUNT, sizeof (fs_cnt), 1,
+ &fs_cnt));
+ /* add 1 for the filesystem itself that we're moving */
+ fs_cnt++;
+
+ VERIFY0(zap_lookup(os, dd->dd_object,
+ DD_FIELD_SNAPSHOT_COUNT, sizeof (ss_cnt), 1,
+ &ss_cnt));
+ }
+
+ dsl_fs_ss_count_adjust(dd->dd_parent, -fs_cnt,
+ DD_FIELD_FILESYSTEM_COUNT, tx);
+ dsl_fs_ss_count_adjust(newparent, fs_cnt,
+ DD_FIELD_FILESYSTEM_COUNT, tx);
+
+ dsl_fs_ss_count_adjust(dd->dd_parent, -ss_cnt,
+ DD_FIELD_SNAPSHOT_COUNT, tx);
+ dsl_fs_ss_count_adjust(newparent, ss_cnt,
+ DD_FIELD_SNAPSHOT_COUNT, tx);
+
+ dsl_dir_diduse_space(dd->dd_parent, DD_USED_CHILD,
+ -dsl_dir_phys(dd)->dd_used_bytes,
+ -dsl_dir_phys(dd)->dd_compressed_bytes,
+ -dsl_dir_phys(dd)->dd_uncompressed_bytes, tx);
+ dsl_dir_diduse_space(newparent, DD_USED_CHILD,
+ dsl_dir_phys(dd)->dd_used_bytes,
+ dsl_dir_phys(dd)->dd_compressed_bytes,
+ dsl_dir_phys(dd)->dd_uncompressed_bytes, tx);
+
+ if (dsl_dir_phys(dd)->dd_reserved >
+ dsl_dir_phys(dd)->dd_used_bytes) {
+ uint64_t unused_rsrv = dsl_dir_phys(dd)->dd_reserved -
+ dsl_dir_phys(dd)->dd_used_bytes;
+
+ dsl_dir_diduse_space(dd->dd_parent, DD_USED_CHILD_RSRV,
+ -unused_rsrv, 0, 0, tx);
+ dsl_dir_diduse_space(newparent, DD_USED_CHILD_RSRV,
+ unused_rsrv, 0, 0, tx);
+ }
+ }
+
+ dmu_buf_will_dirty(dd->dd_dbuf, tx);
+
+ /* remove from old parent zapobj */
+ error = zap_remove(mos,
+ dsl_dir_phys(dd->dd_parent)->dd_child_dir_zapobj,
+ dd->dd_myname, tx);
+ ASSERT0(error);
+
+ (void) strcpy(dd->dd_myname, mynewname);
+ dsl_dir_rele(dd->dd_parent, dd);
+ dsl_dir_phys(dd)->dd_parent_obj = newparent->dd_object;
+ VERIFY0(dsl_dir_hold_obj(dp,
+ newparent->dd_object, NULL, dd, &dd->dd_parent));
+
+ /* add to new parent zapobj */
+ VERIFY0(zap_add(mos, dsl_dir_phys(newparent)->dd_child_dir_zapobj,
+ dd->dd_myname, 8, 1, &dd->dd_object, tx));
+
+#ifdef __FreeBSD__
+#ifdef _KERNEL
+ zfsvfs_update_fromname(ddra->ddra_oldname, ddra->ddra_newname);
+ zvol_rename_minors(dp->dp_spa, ddra->ddra_oldname, ddra->ddra_newname);
+#endif
+#endif
+
+ dsl_prop_notify_all(dd);
+
+ dsl_dir_rele(newparent, FTAG);
+ dsl_dir_rele(dd, FTAG);
+}
+
+int
+dsl_dir_rename(const char *oldname, const char *newname)
+{
+ dsl_dir_rename_arg_t ddra;
+
+ ddra.ddra_oldname = oldname;
+ ddra.ddra_newname = newname;
+ ddra.ddra_cred = CRED();
+
+ return (dsl_sync_task(oldname,
+ dsl_dir_rename_check, dsl_dir_rename_sync, &ddra,
+ 3, ZFS_SPACE_CHECK_RESERVED));
+}
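+
+/*
+ * Illustrative example: renaming "tank/a/fs" to "tank/b/fs" runs
+ * dsl_dir_rename_check() and dsl_dir_rename_sync() above as one synctask.
+ * The check phase rejects cross-pool moves (EXDEV), a destination that
+ * already exists (EEXIST), a new parent that is not a filesystem (e.g. a
+ * ZVOL), and a rename into one of our own descendants (EINVAL).
+ */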
+
+int
+dsl_dir_transfer_possible(dsl_dir_t *sdd, dsl_dir_t *tdd,
+ uint64_t fs_cnt, uint64_t ss_cnt, uint64_t space, cred_t *cr)
+{
+ dsl_dir_t *ancestor;
+ int64_t adelta;
+ uint64_t avail;
+ int err;
+
+ ancestor = closest_common_ancestor(sdd, tdd);
+ adelta = would_change(sdd, -space, ancestor);
+ avail = dsl_dir_space_available(tdd, ancestor, adelta, FALSE);
+ if (avail < space)
+ return (SET_ERROR(ENOSPC));
+
+ err = dsl_fs_ss_limit_check(tdd, fs_cnt, ZFS_PROP_FILESYSTEM_LIMIT,
+ ancestor, cr);
+ if (err != 0)
+ return (err);
+ err = dsl_fs_ss_limit_check(tdd, ss_cnt, ZFS_PROP_SNAPSHOT_LIMIT,
+ ancestor, cr);
+ if (err != 0)
+ return (err);
+
+ return (0);
+}
+
+timestruc_t
+dsl_dir_snap_cmtime(dsl_dir_t *dd)
+{
+ timestruc_t t;
+
+ mutex_enter(&dd->dd_lock);
+ t = dd->dd_snap_cmtime;
+ mutex_exit(&dd->dd_lock);
+
+ return (t);
+}
+
+void
+dsl_dir_snap_cmtime_update(dsl_dir_t *dd)
+{
+ timestruc_t t;
+
+ gethrestime(&t);
+ mutex_enter(&dd->dd_lock);
+ dd->dd_snap_cmtime = t;
+ mutex_exit(&dd->dd_lock);
+}
+
+void
+dsl_dir_zapify(dsl_dir_t *dd, dmu_tx_t *tx)
+{
+ objset_t *mos = dd->dd_pool->dp_meta_objset;
+ dmu_object_zapify(mos, dd->dd_object, DMU_OT_DSL_DIR, tx);
+}
+
+boolean_t
+dsl_dir_is_zapified(dsl_dir_t *dd)
+{
+ dmu_object_info_t doi;
+
+ dmu_object_info_from_db(dd->dd_dbuf, &doi);
+ return (doi.doi_type == DMU_OTN_ZAP_METADATA);
+}
diff --git a/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/dsl_pool.c b/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/dsl_pool.c
new file mode 100644
index 000000000000..ee0ba4793aad
--- /dev/null
+++ b/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/dsl_pool.c
@@ -0,0 +1,1372 @@
+/*
+ * CDDL HEADER START
+ *
+ * The contents of this file are subject to the terms of the
+ * Common Development and Distribution License (the "License").
+ * You may not use this file except in compliance with the License.
+ *
+ * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
+ * or http://www.opensolaris.org/os/licensing.
+ * See the License for the specific language governing permissions
+ * and limitations under the License.
+ *
+ * When distributing Covered Code, include this CDDL HEADER in each
+ * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
+ * If applicable, add the following below this CDDL HEADER, with the
+ * fields enclosed by brackets "[]" replaced with your own identifying
+ * information: Portions Copyright [yyyy] [name of copyright owner]
+ *
+ * CDDL HEADER END
+ */
+/*
+ * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved.
+ * Copyright (c) 2011, 2017 by Delphix. All rights reserved.
+ * Copyright (c) 2013 Steven Hartland. All rights reserved.
+ * Copyright (c) 2014 Spectra Logic Corporation, All rights reserved.
+ * Copyright (c) 2014 Integros [integros.com]
+ * Copyright 2016 Nexenta Systems, Inc. All rights reserved.
+ */
+
+#include <sys/dsl_pool.h>
+#include <sys/dsl_dataset.h>
+#include <sys/dsl_prop.h>
+#include <sys/dsl_dir.h>
+#include <sys/dsl_synctask.h>
+#include <sys/dsl_scan.h>
+#include <sys/dnode.h>
+#include <sys/dmu_tx.h>
+#include <sys/dmu_objset.h>
+#include <sys/arc.h>
+#include <sys/zap.h>
+#include <sys/zio.h>
+#include <sys/zfs_context.h>
+#include <sys/fs/zfs.h>
+#include <sys/zfs_znode.h>
+#include <sys/spa_impl.h>
+#include <sys/dsl_deadlist.h>
+#include <sys/vdev_impl.h>
+#include <sys/metaslab_impl.h>
+#include <sys/bptree.h>
+#include <sys/zfeature.h>
+#include <sys/zil_impl.h>
+#include <sys/dsl_userhold.h>
+#include <sys/mmp.h>
+
+#if defined(__FreeBSD__) && defined(_KERNEL)
+#include <sys/types.h>
+#include <sys/sysctl.h>
+#endif
+
+/*
+ * ZFS Write Throttle
+ * ------------------
+ *
+ * ZFS must limit the rate of incoming writes to the rate at which it is able
+ * to sync data modifications to the backend storage. Throttling by too much
+ * creates an artificial limit; throttling by too little can only be sustained
+ * for short periods and would lead to highly lumpy performance. On a per-pool
+ * basis, ZFS tracks the amount of modified (dirty) data. As operations change
+ * data, the amount of dirty data increases; as ZFS syncs out data, the amount
+ * of dirty data decreases. When the amount of dirty data exceeds a
+ * predetermined threshold further modifications are blocked until the amount
+ * of dirty data decreases (as data is synced out).
+ *
+ * The limit on dirty data is tunable, and should be adjusted according to
+ * both the IO capacity and available memory of the system. The larger the
+ * window, the more ZFS is able to aggregate and amortize metadata (and data)
+ * changes. However, memory is a limited resource, and allowing for more dirty
+ * data comes at the cost of keeping other useful data in memory (for example
+ * ZFS data cached by the ARC).
+ *
+ * Implementation
+ *
+ * As buffers are modified dsl_pool_dirty_space() increments both the per-
+ * txg (dp_dirty_pertxg[]) and poolwide (dp_dirty_total) accounting of
+ * dirty space used; dsl_pool_undirty_space() decrements those values as
+ * data is synced out from dsl_pool_sync(). While only the poolwide value is
+ * relevant, the per-txg value is useful for debugging. The tunable
+ * zfs_dirty_data_max determines the dirty space limit. Once that value is
+ * exceeded, new writes are halted until space frees up.
+ *
+ * The zfs_dirty_data_sync_pct tunable dictates the threshold at which we
+ * ensure that there is a txg syncing (see the comment in txg.c for a full
+ * description of transaction group stages).
+ *
+ * The IO scheduler uses both the dirty space limit and current amount of
+ * dirty data as inputs. Those values affect the number of concurrent IOs ZFS
+ * issues. See the comment in vdev_queue.c for details of the IO scheduler.
+ *
+ * The delay is also calculated based on the amount of dirty data. See the
+ * comment above dmu_tx_delay() for details.
+ */
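+
+/*
+ * Worked example (illustrative only; uses the default tunables defined
+ * below on a machine with 16 GiB of physical memory):
+ *
+ * zfs_dirty_data_max = MIN(16 GiB * 10%, 4 GiB) = ~1.6 GiB
+ * txg-sync threshold = 1.6 GiB * 20% = ~328 MiB
+ * delay threshold = 1.6 GiB * 60% = ~983 MiB
+ *
+ * Writes begin to be delayed near 983 MiB of dirty data and block outright
+ * once 1.6 GiB is dirty.
+ */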
+
+/*
+ * zfs_dirty_data_max will be set to zfs_dirty_data_max_percent% of all memory,
+ * capped at zfs_dirty_data_max_max. It can also be overridden at boot
+ * time (on FreeBSD, via the vfs.zfs.dirty_data_max loader tunable).
+ */
+uint64_t zfs_dirty_data_max;
+uint64_t zfs_dirty_data_max_max = 4ULL * 1024 * 1024 * 1024;
+int zfs_dirty_data_max_percent = 10;
+
+/*
+ * If there's at least this much dirty data (as a percentage of
+ * zfs_dirty_data_max), push out a txg. This should be less than
+ * zfs_vdev_async_write_active_min_dirty_percent.
+ */
+uint64_t zfs_dirty_data_sync_pct = 20;
+
+/*
+ * Once there is this amount of dirty data, the dmu_tx_delay() will kick in
+ * and delay each transaction.
+ * This value should be >= zfs_vdev_async_write_active_max_dirty_percent.
+ */
+int zfs_delay_min_dirty_percent = 60;
+
+/*
+ * This controls how quickly the delay approaches infinity.
+ * Larger values cause it to delay more for a given amount of dirty data.
+ * Therefore larger values will cause there to be less dirty data for a
+ * given throughput.
+ *
+ * For the smoothest delay, this value should be about 1 billion divided
+ * by the maximum number of operations per second. This will smoothly
+ * handle between 10x and 1/10th this number.
+ *
+ * Note: zfs_delay_scale * zfs_dirty_data_max must be < 2^64, due to the
+ * multiply in dmu_tx_delay().
+ */
+uint64_t zfs_delay_scale = 1000 * 1000 * 1000 / 2000;
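+
+/*
+ * For example, the default above is 10^9 / 2000, i.e. it is tuned for a
+ * backend that can sustain roughly 2,000 write operations per second.
+ */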
+
+/*
+ * This determines the number of threads used by the dp_sync_taskq.
+ */
+int zfs_sync_taskq_batch_pct = 75;
+
+/*
+ * These tunables determine the behavior of how zil_itxg_clean() is
+ * called via zil_clean() in the context of spa_sync(). When an itxg
+ * list needs to be cleaned, TQ_NOSLEEP will be used when dispatching.
+ * If the dispatch fails, the call to zil_itxg_clean() will occur
+ * synchronously in the context of spa_sync(), which can negatively
+ * impact the performance of spa_sync() (e.g. in the case of the itxg
+ * list having a large number of itxs that needs to be cleaned).
+ *
+ * Thus, these tunables can be used to manipulate the behavior of the
+ * taskq used by zil_clean(); they determine the number of taskq entries
+ * that are pre-populated when the taskq is first created (via the
+ * "zfs_zil_clean_taskq_minalloc" tunable) and the maximum number of
+ * taskq entries that are cached after an on-demand allocation (via the
+ * "zfs_zil_clean_taskq_maxalloc").
+ *
+ * The idea being, we want to try reasonably hard to ensure there will
+ * already be a taskq entry pre-allocated by the time that it is needed
+ * by zil_clean(). This way, we can avoid the possibility of an
+ * on-demand allocation of a new taskq entry from failing, which would
+ * result in zil_itxg_clean() being called synchronously from zil_clean()
+ * (which can adversely affect performance of spa_sync()).
+ *
+ * Additionally, the number of threads used by the taskq can be
+ * configured via the "zfs_zil_clean_taskq_nthr_pct" tunable.
+ */
+int zfs_zil_clean_taskq_nthr_pct = 100;
+int zfs_zil_clean_taskq_minalloc = 1024;
+int zfs_zil_clean_taskq_maxalloc = 1024 * 1024;
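+
+/*
+ * Sketch of the dispatch pattern described above (simplified from
+ * zil_clean(); illustrative only):
+ *
+ * id = taskq_dispatch(dp->dp_zil_clean_taskq,
+ *     (task_func_t *)zil_itxg_clean, itxs, TQ_NOSLEEP);
+ * if (id == 0)
+ *     zil_itxg_clean(itxs);  (runs inline in spa_sync()'s thread)
+ */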
+
+#if defined(__FreeBSD__) && defined(_KERNEL)
+
+extern int zfs_vdev_async_write_active_max_dirty_percent;
+
+SYSCTL_DECL(_vfs_zfs);
+
+SYSCTL_UQUAD(_vfs_zfs, OID_AUTO, dirty_data_max, CTLFLAG_RWTUN,
+ &zfs_dirty_data_max, 0,
+ "The maximum amount of dirty data in bytes after which new writes are "
+ "halted until space becomes available");
+
+SYSCTL_UQUAD(_vfs_zfs, OID_AUTO, dirty_data_max_max, CTLFLAG_RDTUN,
+ &zfs_dirty_data_max_max, 0,
+ "The absolute cap on dirty_data_max when auto calculating");
+
+static int sysctl_zfs_dirty_data_max_percent(SYSCTL_HANDLER_ARGS);
+SYSCTL_PROC(_vfs_zfs, OID_AUTO, dirty_data_max_percent,
+ CTLTYPE_INT | CTLFLAG_MPSAFE | CTLFLAG_RWTUN, 0, sizeof(int),
+ sysctl_zfs_dirty_data_max_percent, "I",
+ "The percent of physical memory used to auto calculate dirty_data_max");
+
+SYSCTL_UQUAD(_vfs_zfs, OID_AUTO, dirty_data_sync_pct, CTLFLAG_RWTUN,
+ &zfs_dirty_data_sync_pct, 0,
+ "Force a txg if the percent of dirty buffer bytes exceed this value");
+
+static int sysctl_zfs_delay_min_dirty_percent(SYSCTL_HANDLER_ARGS);
+/* No zfs_delay_min_dirty_percent tunable due to limit requirements */
+SYSCTL_PROC(_vfs_zfs, OID_AUTO, delay_min_dirty_percent,
+ CTLTYPE_INT | CTLFLAG_MPSAFE | CTLFLAG_RW, 0, sizeof(int),
+ sysctl_zfs_delay_min_dirty_percent, "I",
+ "The limit of outstanding dirty data before transactions are delayed");
+
+static int sysctl_zfs_delay_scale(SYSCTL_HANDLER_ARGS);
+/* No zfs_delay_scale tunable due to limit requirements */
+SYSCTL_PROC(_vfs_zfs, OID_AUTO, delay_scale,
+ CTLTYPE_U64 | CTLFLAG_MPSAFE | CTLFLAG_RW, 0, sizeof(uint64_t),
+ sysctl_zfs_delay_scale, "QU",
+ "Controls how quickly the delay approaches infinity");
+
+static int
+sysctl_zfs_dirty_data_max_percent(SYSCTL_HANDLER_ARGS)
+{
+ int val, err;
+
+ val = zfs_dirty_data_max_percent;
+ err = sysctl_handle_int(oidp, &val, 0, req);
+ if (err != 0 || req->newptr == NULL)
+ return (err);
+
+ if (val < 0 || val > 100)
+ return (EINVAL);
+
+ zfs_dirty_data_max_percent = val;
+
+ return (0);
+}
+
+static int
+sysctl_zfs_delay_min_dirty_percent(SYSCTL_HANDLER_ARGS)
+{
+ int val, err;
+
+ val = zfs_delay_min_dirty_percent;
+ err = sysctl_handle_int(oidp, &val, 0, req);
+ if (err != 0 || req->newptr == NULL)
+ return (err);
+
+ if (val < zfs_vdev_async_write_active_max_dirty_percent)
+ return (EINVAL);
+
+ zfs_delay_min_dirty_percent = val;
+
+ return (0);
+}
+
+static int
+sysctl_zfs_delay_scale(SYSCTL_HANDLER_ARGS)
+{
+ uint64_t val;
+ int err;
+
+ val = zfs_delay_scale;
+ err = sysctl_handle_64(oidp, &val, 0, req);
+ if (err != 0 || req->newptr == NULL)
+ return (err);
+
+ if (val > UINT64_MAX / zfs_dirty_data_max)
+ return (EINVAL);
+
+ zfs_delay_scale = val;
+
+ return (0);
+}
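+
+/*
+ * Illustrative runtime usage of the handlers above:
+ *
+ * # sysctl vfs.zfs.delay_min_dirty_percent=70
+ *
+ * The update is rejected with EINVAL if the new value would fall below
+ * zfs_vdev_async_write_active_max_dirty_percent.
+ */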
+#endif
+
+int
+dsl_pool_open_special_dir(dsl_pool_t *dp, const char *name, dsl_dir_t **ddp)
+{
+ uint64_t obj;
+ int err;
+
+ err = zap_lookup(dp->dp_meta_objset,
+ dsl_dir_phys(dp->dp_root_dir)->dd_child_dir_zapobj,
+ name, sizeof (obj), 1, &obj);
+ if (err)
+ return (err);
+
+ return (dsl_dir_hold_obj(dp, obj, name, dp, ddp));
+}
+
+static dsl_pool_t *
+dsl_pool_open_impl(spa_t *spa, uint64_t txg)
+{
+ dsl_pool_t *dp;
+ blkptr_t *bp = spa_get_rootblkptr(spa);
+
+ dp = kmem_zalloc(sizeof (dsl_pool_t), KM_SLEEP);
+ dp->dp_spa = spa;
+ dp->dp_meta_rootbp = *bp;
+ rrw_init(&dp->dp_config_rwlock, B_TRUE);
+ txg_init(dp, txg);
+ mmp_init(spa);
+
+ txg_list_create(&dp->dp_dirty_datasets, spa,
+ offsetof(dsl_dataset_t, ds_dirty_link));
+ txg_list_create(&dp->dp_dirty_zilogs, spa,
+ offsetof(zilog_t, zl_dirty_link));
+ txg_list_create(&dp->dp_dirty_dirs, spa,
+ offsetof(dsl_dir_t, dd_dirty_link));
+ txg_list_create(&dp->dp_sync_tasks, spa,
+ offsetof(dsl_sync_task_t, dst_node));
+ txg_list_create(&dp->dp_early_sync_tasks, spa,
+ offsetof(dsl_sync_task_t, dst_node));
+
+ dp->dp_sync_taskq = taskq_create("dp_sync_taskq",
+ zfs_sync_taskq_batch_pct, minclsyspri, 1, INT_MAX,
+ TASKQ_THREADS_CPU_PCT);
+
+ dp->dp_zil_clean_taskq = taskq_create("dp_zil_clean_taskq",
+ zfs_zil_clean_taskq_nthr_pct, minclsyspri,
+ zfs_zil_clean_taskq_minalloc,
+ zfs_zil_clean_taskq_maxalloc,
+ TASKQ_PREPOPULATE | TASKQ_THREADS_CPU_PCT);
+
+ mutex_init(&dp->dp_lock, NULL, MUTEX_DEFAULT, NULL);
+ cv_init(&dp->dp_spaceavail_cv, NULL, CV_DEFAULT, NULL);
+
+ dp->dp_vnrele_taskq = taskq_create("zfs_vn_rele_taskq", 1, minclsyspri,
+ 1, 4, 0);
+
+ return (dp);
+}
+
+int
+dsl_pool_init(spa_t *spa, uint64_t txg, dsl_pool_t **dpp)
+{
+ int err;
+ dsl_pool_t *dp = dsl_pool_open_impl(spa, txg);
+
+ err = dmu_objset_open_impl(spa, NULL, &dp->dp_meta_rootbp,
+ &dp->dp_meta_objset);
+ if (err != 0)
+ dsl_pool_close(dp);
+ else
+ *dpp = dp;
+
+ return (err);
+}
+
+int
+dsl_pool_open(dsl_pool_t *dp)
+{
+ int err;
+ dsl_dir_t *dd;
+ dsl_dataset_t *ds;
+ uint64_t obj;
+
+ rrw_enter(&dp->dp_config_rwlock, RW_WRITER, FTAG);
+ err = zap_lookup(dp->dp_meta_objset, DMU_POOL_DIRECTORY_OBJECT,
+ DMU_POOL_ROOT_DATASET, sizeof (uint64_t), 1,
+ &dp->dp_root_dir_obj);
+ if (err)
+ goto out;
+
+ err = dsl_dir_hold_obj(dp, dp->dp_root_dir_obj,
+ NULL, dp, &dp->dp_root_dir);
+ if (err)
+ goto out;
+
+ err = dsl_pool_open_special_dir(dp, MOS_DIR_NAME, &dp->dp_mos_dir);
+ if (err)
+ goto out;
+
+ if (spa_version(dp->dp_spa) >= SPA_VERSION_ORIGIN) {
+ err = dsl_pool_open_special_dir(dp, ORIGIN_DIR_NAME, &dd);
+ if (err)
+ goto out;
+ err = dsl_dataset_hold_obj(dp,
+ dsl_dir_phys(dd)->dd_head_dataset_obj, FTAG, &ds);
+ if (err == 0) {
+ err = dsl_dataset_hold_obj(dp,
+ dsl_dataset_phys(ds)->ds_prev_snap_obj, dp,
+ &dp->dp_origin_snap);
+ dsl_dataset_rele(ds, FTAG);
+ }
+ dsl_dir_rele(dd, dp);
+ if (err)
+ goto out;
+ }
+
+ if (spa_version(dp->dp_spa) >= SPA_VERSION_DEADLISTS) {
+ err = dsl_pool_open_special_dir(dp, FREE_DIR_NAME,
+ &dp->dp_free_dir);
+ if (err)
+ goto out;
+
+ err = zap_lookup(dp->dp_meta_objset, DMU_POOL_DIRECTORY_OBJECT,
+ DMU_POOL_FREE_BPOBJ, sizeof (uint64_t), 1, &obj);
+ if (err)
+ goto out;
+ VERIFY0(bpobj_open(&dp->dp_free_bpobj,
+ dp->dp_meta_objset, obj));
+ }
+
+ if (spa_feature_is_active(dp->dp_spa, SPA_FEATURE_OBSOLETE_COUNTS)) {
+ err = zap_lookup(dp->dp_meta_objset, DMU_POOL_DIRECTORY_OBJECT,
+ DMU_POOL_OBSOLETE_BPOBJ, sizeof (uint64_t), 1, &obj);
+ if (err == 0) {
+ VERIFY0(bpobj_open(&dp->dp_obsolete_bpobj,
+ dp->dp_meta_objset, obj));
+ } else if (err == ENOENT) {
+ /*
+ * We might not have created the remap bpobj yet.
+ */
+ err = 0;
+ } else {
+ goto out;
+ }
+ }
+
+ /*
+ * Note: errors ignored, because these special dirs, used for
+ * space accounting, are only created on demand.
+ */
+ (void) dsl_pool_open_special_dir(dp, LEAK_DIR_NAME,
+ &dp->dp_leak_dir);
+
+ if (spa_feature_is_active(dp->dp_spa, SPA_FEATURE_ASYNC_DESTROY)) {
+ err = zap_lookup(dp->dp_meta_objset, DMU_POOL_DIRECTORY_OBJECT,
+ DMU_POOL_BPTREE_OBJ, sizeof (uint64_t), 1,
+ &dp->dp_bptree_obj);
+ if (err != 0)
+ goto out;
+ }
+
+ if (spa_feature_is_active(dp->dp_spa, SPA_FEATURE_EMPTY_BPOBJ)) {
+ err = zap_lookup(dp->dp_meta_objset, DMU_POOL_DIRECTORY_OBJECT,
+ DMU_POOL_EMPTY_BPOBJ, sizeof (uint64_t), 1,
+ &dp->dp_empty_bpobj);
+ if (err != 0)
+ goto out;
+ }
+
+ err = zap_lookup(dp->dp_meta_objset, DMU_POOL_DIRECTORY_OBJECT,
+ DMU_POOL_TMP_USERREFS, sizeof (uint64_t), 1,
+ &dp->dp_tmp_userrefs_obj);
+ if (err == ENOENT)
+ err = 0;
+ if (err)
+ goto out;
+
+ err = dsl_scan_init(dp, dp->dp_tx.tx_open_txg);
+
+out:
+ rrw_exit(&dp->dp_config_rwlock, FTAG);
+ return (err);
+}
+
+void
+dsl_pool_close(dsl_pool_t *dp)
+{
+ /*
+ * Drop our references from dsl_pool_open().
+ *
+ * Since we held the origin_snap from "syncing" context (which
+ * includes pool-opening context), it actually only got a "ref"
+ * and not a hold, so just drop that here.
+ */
+ if (dp->dp_origin_snap != NULL)
+ dsl_dataset_rele(dp->dp_origin_snap, dp);
+ if (dp->dp_mos_dir != NULL)
+ dsl_dir_rele(dp->dp_mos_dir, dp);
+ if (dp->dp_free_dir != NULL)
+ dsl_dir_rele(dp->dp_free_dir, dp);
+ if (dp->dp_leak_dir != NULL)
+ dsl_dir_rele(dp->dp_leak_dir, dp);
+ if (dp->dp_root_dir != NULL)
+ dsl_dir_rele(dp->dp_root_dir, dp);
+
+ bpobj_close(&dp->dp_free_bpobj);
+ bpobj_close(&dp->dp_obsolete_bpobj);
+
+ /* undo the dmu_objset_open_impl(mos) from dsl_pool_open() */
+ if (dp->dp_meta_objset != NULL)
+ dmu_objset_evict(dp->dp_meta_objset);
+
+ txg_list_destroy(&dp->dp_dirty_datasets);
+ txg_list_destroy(&dp->dp_dirty_zilogs);
+ txg_list_destroy(&dp->dp_sync_tasks);
+ txg_list_destroy(&dp->dp_early_sync_tasks);
+ txg_list_destroy(&dp->dp_dirty_dirs);
+
+ taskq_destroy(dp->dp_zil_clean_taskq);
+ taskq_destroy(dp->dp_sync_taskq);
+
+ /*
+ * We can't set retry to TRUE since we're explicitly specifying
+ * a spa to flush. This is good enough; any missed buffers for
+ * this spa won't cause trouble, and they'll eventually fall
+ * out of the ARC just like any other unused buffer.
+ */
+ arc_flush(dp->dp_spa, FALSE);
+
+ mmp_fini(dp->dp_spa);
+ txg_fini(dp);
+ dsl_scan_fini(dp);
+ dmu_buf_user_evict_wait();
+
+ rrw_destroy(&dp->dp_config_rwlock);
+ mutex_destroy(&dp->dp_lock);
+ taskq_destroy(dp->dp_vnrele_taskq);
+ if (dp->dp_blkstats != NULL) {
+ mutex_destroy(&dp->dp_blkstats->zab_lock);
+ kmem_free(dp->dp_blkstats, sizeof (zfs_all_blkstats_t));
+ }
+ kmem_free(dp, sizeof (dsl_pool_t));
+}
+
+void
+dsl_pool_create_obsolete_bpobj(dsl_pool_t *dp, dmu_tx_t *tx)
+{
+ uint64_t obj;
+ /*
+ * Currently, we only create the obsolete_bpobj where there are
+ * indirect vdevs with referenced mappings.
+ */
+ ASSERT(spa_feature_is_active(dp->dp_spa, SPA_FEATURE_DEVICE_REMOVAL));
+ /* create and open the obsolete_bpobj */
+ obj = bpobj_alloc(dp->dp_meta_objset, SPA_OLD_MAXBLOCKSIZE, tx);
+ VERIFY0(bpobj_open(&dp->dp_obsolete_bpobj, dp->dp_meta_objset, obj));
+ VERIFY0(zap_add(dp->dp_meta_objset, DMU_POOL_DIRECTORY_OBJECT,
+ DMU_POOL_OBSOLETE_BPOBJ, sizeof (uint64_t), 1, &obj, tx));
+ spa_feature_incr(dp->dp_spa, SPA_FEATURE_OBSOLETE_COUNTS, tx);
+}
+
+void
+dsl_pool_destroy_obsolete_bpobj(dsl_pool_t *dp, dmu_tx_t *tx)
+{
+ spa_feature_decr(dp->dp_spa, SPA_FEATURE_OBSOLETE_COUNTS, tx);
+ VERIFY0(zap_remove(dp->dp_meta_objset,
+ DMU_POOL_DIRECTORY_OBJECT,
+ DMU_POOL_OBSOLETE_BPOBJ, tx));
+ bpobj_free(dp->dp_meta_objset,
+ dp->dp_obsolete_bpobj.bpo_object, tx);
+ bpobj_close(&dp->dp_obsolete_bpobj);
+}
+
+dsl_pool_t *
+dsl_pool_create(spa_t *spa, nvlist_t *zplprops, uint64_t txg)
+{
+ int err;
+ dsl_pool_t *dp = dsl_pool_open_impl(spa, txg);
+ dmu_tx_t *tx = dmu_tx_create_assigned(dp, txg);
+ dsl_dataset_t *ds;
+ uint64_t obj;
+
+ rrw_enter(&dp->dp_config_rwlock, RW_WRITER, FTAG);
+
+ /* create and open the MOS (meta-objset) */
+ dp->dp_meta_objset = dmu_objset_create_impl(spa,
+ NULL, &dp->dp_meta_rootbp, DMU_OST_META, tx);
+
+ /* create the pool directory */
+ err = zap_create_claim(dp->dp_meta_objset, DMU_POOL_DIRECTORY_OBJECT,
+ DMU_OT_OBJECT_DIRECTORY, DMU_OT_NONE, 0, tx);
+ ASSERT0(err);
+
+ /* Initialize scan structures */
+ VERIFY0(dsl_scan_init(dp, txg));
+
+ /* create and open the root dir */
+ dp->dp_root_dir_obj = dsl_dir_create_sync(dp, NULL, NULL, tx);
+ VERIFY0(dsl_dir_hold_obj(dp, dp->dp_root_dir_obj,
+ NULL, dp, &dp->dp_root_dir));
+
+ /* create and open the meta-objset dir */
+ (void) dsl_dir_create_sync(dp, dp->dp_root_dir, MOS_DIR_NAME, tx);
+ VERIFY0(dsl_pool_open_special_dir(dp,
+ MOS_DIR_NAME, &dp->dp_mos_dir));
+
+ if (spa_version(spa) >= SPA_VERSION_DEADLISTS) {
+ /* create and open the free dir */
+ (void) dsl_dir_create_sync(dp, dp->dp_root_dir,
+ FREE_DIR_NAME, tx);
+ VERIFY0(dsl_pool_open_special_dir(dp,
+ FREE_DIR_NAME, &dp->dp_free_dir));
+
+ /* create and open the free_bplist */
+ obj = bpobj_alloc(dp->dp_meta_objset, SPA_OLD_MAXBLOCKSIZE, tx);
+ VERIFY0(zap_add(dp->dp_meta_objset, DMU_POOL_DIRECTORY_OBJECT,
+ DMU_POOL_FREE_BPOBJ, sizeof (uint64_t), 1, &obj, tx));
+ VERIFY0(bpobj_open(&dp->dp_free_bpobj,
+ dp->dp_meta_objset, obj));
+ }
+
+ if (spa_version(spa) >= SPA_VERSION_DSL_SCRUB)
+ dsl_pool_create_origin(dp, tx);
+
+ /* create the root dataset */
+ obj = dsl_dataset_create_sync_dd(dp->dp_root_dir, NULL, 0, tx);
+
+ /* create the root objset */
+ VERIFY0(dsl_dataset_hold_obj(dp, obj, FTAG, &ds));
+#ifdef _KERNEL
+ {
+ objset_t *os;
+ rrw_enter(&ds->ds_bp_rwlock, RW_READER, FTAG);
+ os = dmu_objset_create_impl(dp->dp_spa, ds,
+ dsl_dataset_get_blkptr(ds), DMU_OST_ZFS, tx);
+ rrw_exit(&ds->ds_bp_rwlock, FTAG);
+ zfs_create_fs(os, kcred, zplprops, tx);
+ }
+#endif
+ dsl_dataset_rele(ds, FTAG);
+
+ dmu_tx_commit(tx);
+
+ rrw_exit(&dp->dp_config_rwlock, FTAG);
+
+ return (dp);
+}
+
+/*
+ * Account for the meta-objset space in its placeholder dsl_dir.
+ */
+void
+dsl_pool_mos_diduse_space(dsl_pool_t *dp,
+ int64_t used, int64_t comp, int64_t uncomp)
+{
+ ASSERT3U(comp, ==, uncomp); /* it's all metadata */
+ mutex_enter(&dp->dp_lock);
+ dp->dp_mos_used_delta += used;
+ dp->dp_mos_compressed_delta += comp;
+ dp->dp_mos_uncompressed_delta += uncomp;
+ mutex_exit(&dp->dp_lock);
+}
+
+static void
+dsl_pool_sync_mos(dsl_pool_t *dp, dmu_tx_t *tx)
+{
+ zio_t *zio = zio_root(dp->dp_spa, NULL, NULL, ZIO_FLAG_MUSTSUCCEED);
+ dmu_objset_sync(dp->dp_meta_objset, zio, tx);
+ VERIFY0(zio_wait(zio));
+ dprintf_bp(&dp->dp_meta_rootbp, "meta objset rootbp is %s", "");
+ spa_set_rootblkptr(dp->dp_spa, &dp->dp_meta_rootbp);
+}
+
+static void
+dsl_pool_dirty_delta(dsl_pool_t *dp, int64_t delta)
+{
+ ASSERT(MUTEX_HELD(&dp->dp_lock));
+
+ if (delta < 0)
+ ASSERT3U(-delta, <=, dp->dp_dirty_total);
+
+ dp->dp_dirty_total += delta;
+
+ /*
+ * Note: we signal even when increasing dp_dirty_total.
+ * This ensures forward progress -- each thread wakes the next waiter.
+ */
+ if (dp->dp_dirty_total < zfs_dirty_data_max)
+ cv_signal(&dp->dp_spaceavail_cv);
+}
+
+static boolean_t
+dsl_early_sync_task_verify(dsl_pool_t *dp, uint64_t txg)
+{
+ spa_t *spa = dp->dp_spa;
+ vdev_t *rvd = spa->spa_root_vdev;
+
+ for (uint64_t c = 0; c < rvd->vdev_children; c++) {
+ vdev_t *vd = rvd->vdev_child[c];
+ txg_list_t *tl = &vd->vdev_ms_list;
+ metaslab_t *ms;
+
+ for (ms = txg_list_head(tl, TXG_CLEAN(txg)); ms;
+ ms = txg_list_next(tl, ms, TXG_CLEAN(txg))) {
+ VERIFY(range_tree_is_empty(ms->ms_freeing));
+ VERIFY(range_tree_is_empty(ms->ms_checkpointing));
+ }
+ }
+
+ return (B_TRUE);
+}
+
+void
+dsl_pool_sync(dsl_pool_t *dp, uint64_t txg)
+{
+ zio_t *zio;
+ dmu_tx_t *tx;
+ dsl_dir_t *dd;
+ dsl_dataset_t *ds;
+ objset_t *mos = dp->dp_meta_objset;
+ list_t synced_datasets;
+
+ list_create(&synced_datasets, sizeof (dsl_dataset_t),
+ offsetof(dsl_dataset_t, ds_synced_link));
+
+ tx = dmu_tx_create_assigned(dp, txg);
+
+ /*
+ * Run all early sync tasks before writing out any dirty blocks.
+ * For more info on early sync tasks see block comment in
+ * dsl_early_sync_task().
+ */
+ if (!txg_list_empty(&dp->dp_early_sync_tasks, txg)) {
+ dsl_sync_task_t *dst;
+
+ ASSERT3U(spa_sync_pass(dp->dp_spa), ==, 1);
+ while ((dst =
+ txg_list_remove(&dp->dp_early_sync_tasks, txg)) != NULL) {
+ ASSERT(dsl_early_sync_task_verify(dp, txg));
+ dsl_sync_task_sync(dst, tx);
+ }
+ ASSERT(dsl_early_sync_task_verify(dp, txg));
+ }
+
+ /*
+ * Write out all dirty blocks of dirty datasets.
+ */
+ zio = zio_root(dp->dp_spa, NULL, NULL, ZIO_FLAG_MUSTSUCCEED);
+ while ((ds = txg_list_remove(&dp->dp_dirty_datasets, txg)) != NULL) {
+ /*
+ * We must not sync any non-MOS datasets twice, because
+ * we may have taken a snapshot of them. However, we
+ * may sync newly-created datasets on pass 2.
+ */
+ ASSERT(!list_link_active(&ds->ds_synced_link));
+ list_insert_tail(&synced_datasets, ds);
+ dsl_dataset_sync(ds, zio, tx);
+ }
+ VERIFY0(zio_wait(zio));
+
+ /*
+ * We have written all of the accounted dirty data, so our
+ * dp_space_towrite should now be zero. However, some seldom-used
+ * code paths do not adhere to this (e.g. dbuf_undirty(); there is also
+ * rounding error in dbuf_write_physdone()).
+ * Shore up the accounting of any dirtied space now.
+ */
+ dsl_pool_undirty_space(dp, dp->dp_dirty_pertxg[txg & TXG_MASK], txg);
+
+ /*
+ * Update the long range free counter after
+ * we're done syncing user data
+ */
+ mutex_enter(&dp->dp_lock);
+ ASSERT(spa_sync_pass(dp->dp_spa) == 1 ||
+ dp->dp_long_free_dirty_pertxg[txg & TXG_MASK] == 0);
+ dp->dp_long_free_dirty_pertxg[txg & TXG_MASK] = 0;
+ mutex_exit(&dp->dp_lock);
+
+ /*
+ * After the data blocks have been written (ensured by the zio_wait()
+ * above), update the user/group space accounting. This happens
+ * in tasks dispatched to dp_sync_taskq, so wait for them before
+ * continuing.
+ */
+ for (ds = list_head(&synced_datasets); ds != NULL;
+ ds = list_next(&synced_datasets, ds)) {
+ dmu_objset_do_userquota_updates(ds->ds_objset, tx);
+ }
+ taskq_wait(dp->dp_sync_taskq);
+
+ /*
+ * Sync the datasets again to push out the changes due to
+ * userspace updates. This must be done before we process the
+ * sync tasks, so that any snapshots will have the correct
+ * user accounting information (and we won't get confused
+ * about which blocks are part of the snapshot).
+ */
+ zio = zio_root(dp->dp_spa, NULL, NULL, ZIO_FLAG_MUSTSUCCEED);
+ while ((ds = txg_list_remove(&dp->dp_dirty_datasets, txg)) != NULL) {
+ ASSERT(list_link_active(&ds->ds_synced_link));
+ dmu_buf_rele(ds->ds_dbuf, ds);
+ dsl_dataset_sync(ds, zio, tx);
+ }
+ VERIFY0(zio_wait(zio));
+
+ /*
+ * Now that the datasets have been completely synced, we can
+ * clean up our in-memory structures accumulated while syncing:
+ *
+ * - move dead blocks from the pending deadlist to the on-disk deadlist
+ * - release hold from dsl_dataset_dirty()
+ */
+ while ((ds = list_remove_head(&synced_datasets)) != NULL) {
+ dsl_dataset_sync_done(ds, tx);
+ }
+ while ((dd = txg_list_remove(&dp->dp_dirty_dirs, txg)) != NULL) {
+ dsl_dir_sync(dd, tx);
+ }
+
+ /*
+ * The MOS's space is accounted for in the pool/$MOS
+ * (dp_mos_dir). We can't modify the mos while we're syncing
+ * it, so we remember the deltas and apply them here.
+ */
+ if (dp->dp_mos_used_delta != 0 || dp->dp_mos_compressed_delta != 0 ||
+ dp->dp_mos_uncompressed_delta != 0) {
+ dsl_dir_diduse_space(dp->dp_mos_dir, DD_USED_HEAD,
+ dp->dp_mos_used_delta,
+ dp->dp_mos_compressed_delta,
+ dp->dp_mos_uncompressed_delta, tx);
+ dp->dp_mos_used_delta = 0;
+ dp->dp_mos_compressed_delta = 0;
+ dp->dp_mos_uncompressed_delta = 0;
+ }
+
+ if (!multilist_is_empty(mos->os_dirty_dnodes[txg & TXG_MASK])) {
+ dsl_pool_sync_mos(dp, tx);
+ }
+
+ /*
+ * If we modify a dataset in the same txg that we want to destroy it,
+ * its dsl_dir's dd_dbuf will be dirty, and thus have a hold on it.
+ * dsl_dir_destroy_check() will fail if there are unexpected holds.
+ * Therefore, we want to sync the MOS (thus syncing the dd_dbuf
+ * and clearing the hold on it) before we process the sync_tasks.
+ * The MOS data dirtied by the sync_tasks will be synced on the next
+ * pass.
+ */
+ if (!txg_list_empty(&dp->dp_sync_tasks, txg)) {
+ dsl_sync_task_t *dst;
+ /*
+ * No more sync tasks should have been added while we
+ * were syncing.
+ */
+ ASSERT3U(spa_sync_pass(dp->dp_spa), ==, 1);
+ while ((dst = txg_list_remove(&dp->dp_sync_tasks, txg)) != NULL)
+ dsl_sync_task_sync(dst, tx);
+ }
+
+ dmu_tx_commit(tx);
+
+ DTRACE_PROBE2(dsl_pool_sync__done, dsl_pool_t *dp, dp, uint64_t, txg);
+}
+
+void
+dsl_pool_sync_done(dsl_pool_t *dp, uint64_t txg)
+{
+ zilog_t *zilog;
+
+ while ((zilog = txg_list_head(&dp->dp_dirty_zilogs, txg)) != NULL) {
+ dsl_dataset_t *ds = dmu_objset_ds(zilog->zl_os);
+ /*
+ * We don't remove the zilog from the dp_dirty_zilogs
+ * list until after we've cleaned it. This ensures that
+ * callers of zilog_is_dirty() receive an accurate
+ * answer when they are racing with the spa sync thread.
+ */
+ zil_clean(zilog, txg);
+ (void) txg_list_remove_this(&dp->dp_dirty_zilogs, zilog, txg);
+ ASSERT(!dmu_objset_is_dirty(zilog->zl_os, txg));
+ dmu_buf_rele(ds->ds_dbuf, zilog);
+ }
+ ASSERT(!dmu_objset_is_dirty(dp->dp_meta_objset, txg));
+}
+
+/*
+ * TRUE if the current thread is the tx_sync_thread or if we
+ * are being called from SPA context during pool initialization.
+ */
+int
+dsl_pool_sync_context(dsl_pool_t *dp)
+{
+ return (curthread == dp->dp_tx.tx_sync_thread ||
+ spa_is_initializing(dp->dp_spa) ||
+ taskq_member(dp->dp_sync_taskq, curthread));
+}
+
+/*
+ * This function returns the amount of allocatable space in the pool
+ * minus whatever space is currently reserved by ZFS for specific
+ * purposes. Specifically:
+ *
+ * 1] Any reserved SLOP space
+ * 2] Any space used by the checkpoint
+ * 3] Any space used for deferred frees
+ *
+ * The latter 2 are especially important because they are needed to
+ * rectify the SPA's and DMU's different understanding of how much space
+ * is used. Now the DMU is aware of that extra space tracked by the SPA
+ * without having to maintain a separate special dir (e.g similar to
+ * $MOS, $FREEING, and $LEAKED).
+ *
+ * Note: By deferred frees here, we mean the frees that were deferred
+ * in spa_sync() after sync pass 1 (spa_deferred_bpobj), and not the
+ * segments placed in ms_defer trees during metaslab_sync_done().
+ */
+uint64_t
+dsl_pool_adjustedsize(dsl_pool_t *dp, zfs_space_check_t slop_policy)
+{
+ spa_t *spa = dp->dp_spa;
+ uint64_t space, resv, adjustedsize;
+ uint64_t spa_deferred_frees =
+ spa->spa_deferred_bpobj.bpo_phys->bpo_bytes;
+
+ space = spa_get_dspace(spa)
+ - spa_get_checkpoint_space(spa) - spa_deferred_frees;
+ resv = spa_get_slop_space(spa);
+
+ switch (slop_policy) {
+ case ZFS_SPACE_CHECK_NORMAL:
+ break;
+ case ZFS_SPACE_CHECK_RESERVED:
+ resv >>= 1;
+ break;
+ case ZFS_SPACE_CHECK_EXTRA_RESERVED:
+ resv >>= 2;
+ break;
+ case ZFS_SPACE_CHECK_NONE:
+ resv = 0;
+ break;
+ default:
+ panic("invalid slop policy value: %d", slop_policy);
+ break;
+ }
+ adjustedsize = (space >= resv) ? (space - resv) : 0;
+
+ return (adjustedsize);
+}
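+
+/*
+ * Worked example (illustrative, assuming the default slop of 1/32 of pool
+ * space and no checkpoint or deferred frees): on a 64 GiB pool the slop is
+ * 2 GiB, so ZFS_SPACE_CHECK_NORMAL reports 62 GiB, ZFS_SPACE_CHECK_RESERVED
+ * (resv >>= 1) reports 63 GiB, ZFS_SPACE_CHECK_EXTRA_RESERVED (resv >>= 2)
+ * reports 63.5 GiB, and ZFS_SPACE_CHECK_NONE reports the full 64 GiB.
+ */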
+
+uint64_t
+dsl_pool_unreserved_space(dsl_pool_t *dp, zfs_space_check_t slop_policy)
+{
+ uint64_t poolsize = dsl_pool_adjustedsize(dp, slop_policy);
+ uint64_t deferred =
+ metaslab_class_get_deferred(spa_normal_class(dp->dp_spa));
+ uint64_t quota = (poolsize >= deferred) ? (poolsize - deferred) : 0;
+ return (quota);
+}
+
+boolean_t
+dsl_pool_need_dirty_delay(dsl_pool_t *dp)
+{
+ uint64_t delay_min_bytes =
+ zfs_dirty_data_max * zfs_delay_min_dirty_percent / 100;
+ uint64_t dirty_min_bytes =
+ zfs_dirty_data_max * zfs_dirty_data_sync_pct / 100;
+ boolean_t rv;
+
+ mutex_enter(&dp->dp_lock);
+ if (dp->dp_dirty_total > dirty_min_bytes)
+ txg_kick(dp);
+ rv = (dp->dp_dirty_total > delay_min_bytes);
+ mutex_exit(&dp->dp_lock);
+ return (rv);
+}
+
+void
+dsl_pool_dirty_space(dsl_pool_t *dp, int64_t space, dmu_tx_t *tx)
+{
+ if (space > 0) {
+ mutex_enter(&dp->dp_lock);
+ dp->dp_dirty_pertxg[tx->tx_txg & TXG_MASK] += space;
+ dsl_pool_dirty_delta(dp, space);
+ mutex_exit(&dp->dp_lock);
+ }
+}
+
+void
+dsl_pool_undirty_space(dsl_pool_t *dp, int64_t space, uint64_t txg)
+{
+ ASSERT3S(space, >=, 0);
+ if (space == 0)
+ return;
+ mutex_enter(&dp->dp_lock);
+ if (dp->dp_dirty_pertxg[txg & TXG_MASK] < space) {
+ /* XXX writing something we didn't dirty? */
+ space = dp->dp_dirty_pertxg[txg & TXG_MASK];
+ }
+ ASSERT3U(dp->dp_dirty_pertxg[txg & TXG_MASK], >=, space);
+ dp->dp_dirty_pertxg[txg & TXG_MASK] -= space;
+ ASSERT3U(dp->dp_dirty_total, >=, space);
+ dsl_pool_dirty_delta(dp, -space);
+ mutex_exit(&dp->dp_lock);
+}
+
+/* ARGSUSED */
+static int
+upgrade_clones_cb(dsl_pool_t *dp, dsl_dataset_t *hds, void *arg)
+{
+ dmu_tx_t *tx = arg;
+ dsl_dataset_t *ds, *prev = NULL;
+ int err;
+
+ err = dsl_dataset_hold_obj(dp, hds->ds_object, FTAG, &ds);
+ if (err)
+ return (err);
+
+ while (dsl_dataset_phys(ds)->ds_prev_snap_obj != 0) {
+ err = dsl_dataset_hold_obj(dp,
+ dsl_dataset_phys(ds)->ds_prev_snap_obj, FTAG, &prev);
+ if (err) {
+ dsl_dataset_rele(ds, FTAG);
+ return (err);
+ }
+
+ if (dsl_dataset_phys(prev)->ds_next_snap_obj != ds->ds_object)
+ break;
+ dsl_dataset_rele(ds, FTAG);
+ ds = prev;
+ prev = NULL;
+ }
+
+ if (prev == NULL) {
+ prev = dp->dp_origin_snap;
+
+ /*
+ * The $ORIGIN can't have any data, or the accounting
+ * will be wrong.
+ */
+ rrw_enter(&ds->ds_bp_rwlock, RW_READER, FTAG);
+ ASSERT0(dsl_dataset_phys(prev)->ds_bp.blk_birth);
+ rrw_exit(&ds->ds_bp_rwlock, FTAG);
+
+ /* The origin doesn't get attached to itself */
+ if (ds->ds_object == prev->ds_object) {
+ dsl_dataset_rele(ds, FTAG);
+ return (0);
+ }
+
+ dmu_buf_will_dirty(ds->ds_dbuf, tx);
+ dsl_dataset_phys(ds)->ds_prev_snap_obj = prev->ds_object;
+ dsl_dataset_phys(ds)->ds_prev_snap_txg =
+ dsl_dataset_phys(prev)->ds_creation_txg;
+
+ dmu_buf_will_dirty(ds->ds_dir->dd_dbuf, tx);
+ dsl_dir_phys(ds->ds_dir)->dd_origin_obj = prev->ds_object;
+
+ dmu_buf_will_dirty(prev->ds_dbuf, tx);
+ dsl_dataset_phys(prev)->ds_num_children++;
+
+ if (dsl_dataset_phys(ds)->ds_next_snap_obj == 0) {
+ ASSERT(ds->ds_prev == NULL);
+ VERIFY0(dsl_dataset_hold_obj(dp,
+ dsl_dataset_phys(ds)->ds_prev_snap_obj,
+ ds, &ds->ds_prev));
+ }
+ }
+
+ ASSERT3U(dsl_dir_phys(ds->ds_dir)->dd_origin_obj, ==, prev->ds_object);
+ ASSERT3U(dsl_dataset_phys(ds)->ds_prev_snap_obj, ==, prev->ds_object);
+
+ if (dsl_dataset_phys(prev)->ds_next_clones_obj == 0) {
+ dmu_buf_will_dirty(prev->ds_dbuf, tx);
+ dsl_dataset_phys(prev)->ds_next_clones_obj =
+ zap_create(dp->dp_meta_objset,
+ DMU_OT_NEXT_CLONES, DMU_OT_NONE, 0, tx);
+ }
+ VERIFY0(zap_add_int(dp->dp_meta_objset,
+ dsl_dataset_phys(prev)->ds_next_clones_obj, ds->ds_object, tx));
+
+ dsl_dataset_rele(ds, FTAG);
+ if (prev != dp->dp_origin_snap)
+ dsl_dataset_rele(prev, FTAG);
+ return (0);
+}
+
+void
+dsl_pool_upgrade_clones(dsl_pool_t *dp, dmu_tx_t *tx)
+{
+ ASSERT(dmu_tx_is_syncing(tx));
+ ASSERT(dp->dp_origin_snap != NULL);
+
+ VERIFY0(dmu_objset_find_dp(dp, dp->dp_root_dir_obj, upgrade_clones_cb,
+ tx, DS_FIND_CHILDREN | DS_FIND_SERIALIZE));
+}
+
+/* ARGSUSED */
+static int
+upgrade_dir_clones_cb(dsl_pool_t *dp, dsl_dataset_t *ds, void *arg)
+{
+ dmu_tx_t *tx = arg;
+ objset_t *mos = dp->dp_meta_objset;
+
+ if (dsl_dir_phys(ds->ds_dir)->dd_origin_obj != 0) {
+ dsl_dataset_t *origin;
+
+ VERIFY0(dsl_dataset_hold_obj(dp,
+ dsl_dir_phys(ds->ds_dir)->dd_origin_obj, FTAG, &origin));
+
+ if (dsl_dir_phys(origin->ds_dir)->dd_clones == 0) {
+ dmu_buf_will_dirty(origin->ds_dir->dd_dbuf, tx);
+ dsl_dir_phys(origin->ds_dir)->dd_clones =
+ zap_create(mos, DMU_OT_DSL_CLONES, DMU_OT_NONE,
+ 0, tx);
+ }
+
+ VERIFY0(zap_add_int(dp->dp_meta_objset,
+ dsl_dir_phys(origin->ds_dir)->dd_clones,
+ ds->ds_object, tx));
+
+ dsl_dataset_rele(origin, FTAG);
+ }
+ return (0);
+}
+
+void
+dsl_pool_upgrade_dir_clones(dsl_pool_t *dp, dmu_tx_t *tx)
+{
+ ASSERT(dmu_tx_is_syncing(tx));
+ uint64_t obj;
+
+ (void) dsl_dir_create_sync(dp, dp->dp_root_dir, FREE_DIR_NAME, tx);
+ VERIFY0(dsl_pool_open_special_dir(dp,
+ FREE_DIR_NAME, &dp->dp_free_dir));
+
+ /*
+ * We can't use bpobj_alloc(), because spa_version() still
+ * returns the old version, and we need a new-version bpobj with
+ * subobj support. So call dmu_object_alloc() directly.
+ */
+ obj = dmu_object_alloc(dp->dp_meta_objset, DMU_OT_BPOBJ,
+ SPA_OLD_MAXBLOCKSIZE, DMU_OT_BPOBJ_HDR, sizeof (bpobj_phys_t), tx);
+ VERIFY0(zap_add(dp->dp_meta_objset, DMU_POOL_DIRECTORY_OBJECT,
+ DMU_POOL_FREE_BPOBJ, sizeof (uint64_t), 1, &obj, tx));
+ VERIFY0(bpobj_open(&dp->dp_free_bpobj, dp->dp_meta_objset, obj));
+
+ VERIFY0(dmu_objset_find_dp(dp, dp->dp_root_dir_obj,
+ upgrade_dir_clones_cb, tx, DS_FIND_CHILDREN | DS_FIND_SERIALIZE));
+}
+
+void
+dsl_pool_create_origin(dsl_pool_t *dp, dmu_tx_t *tx)
+{
+ uint64_t dsobj;
+ dsl_dataset_t *ds;
+
+ ASSERT(dmu_tx_is_syncing(tx));
+ ASSERT(dp->dp_origin_snap == NULL);
+ ASSERT(rrw_held(&dp->dp_config_rwlock, RW_WRITER));
+
+ /* create the origin dir, ds, & snap-ds */
+ dsobj = dsl_dataset_create_sync(dp->dp_root_dir, ORIGIN_DIR_NAME,
+ NULL, 0, kcred, tx);
+ VERIFY0(dsl_dataset_hold_obj(dp, dsobj, FTAG, &ds));
+ dsl_dataset_snapshot_sync_impl(ds, ORIGIN_DIR_NAME, tx);
+ VERIFY0(dsl_dataset_hold_obj(dp, dsl_dataset_phys(ds)->ds_prev_snap_obj,
+ dp, &dp->dp_origin_snap));
+ dsl_dataset_rele(ds, FTAG);
+}
+
+taskq_t *
+dsl_pool_vnrele_taskq(dsl_pool_t *dp)
+{
+ return (dp->dp_vnrele_taskq);
+}
+
+/*
+ * Walk through the pool-wide zap object of temporary snapshot user holds
+ * and release them.
+ */
+void
+dsl_pool_clean_tmp_userrefs(dsl_pool_t *dp)
+{
+ zap_attribute_t za;
+ zap_cursor_t zc;
+ objset_t *mos = dp->dp_meta_objset;
+ uint64_t zapobj = dp->dp_tmp_userrefs_obj;
+ nvlist_t *holds;
+
+ if (zapobj == 0)
+ return;
+ ASSERT(spa_version(dp->dp_spa) >= SPA_VERSION_USERREFS);
+
+ holds = fnvlist_alloc();
+
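+ /*
+ * Hold entries are keyed "<dsobj in hex>-<tag>" (see
+ * dsl_pool_user_hold_rele_impl()), so each name is split at the
+ * first '-' to recover the dataset object number and the tag.
+ */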
+ for (zap_cursor_init(&zc, mos, zapobj);
+ zap_cursor_retrieve(&zc, &za) == 0;
+ zap_cursor_advance(&zc)) {
+ char *htag;
+ nvlist_t *tags;
+
+ htag = strchr(za.za_name, '-');
+ *htag = '\0';
+ ++htag;
+ if (nvlist_lookup_nvlist(holds, za.za_name, &tags) != 0) {
+ tags = fnvlist_alloc();
+ fnvlist_add_boolean(tags, htag);
+ fnvlist_add_nvlist(holds, za.za_name, tags);
+ fnvlist_free(tags);
+ } else {
+ fnvlist_add_boolean(tags, htag);
+ }
+ }
+ dsl_dataset_user_release_tmp(dp, holds);
+ fnvlist_free(holds);
+ zap_cursor_fini(&zc);
+}
+
+/*
+ * Create the pool-wide zap object for storing temporary snapshot holds.
+ */
+void
+dsl_pool_user_hold_create_obj(dsl_pool_t *dp, dmu_tx_t *tx)
+{
+ objset_t *mos = dp->dp_meta_objset;
+
+ ASSERT(dp->dp_tmp_userrefs_obj == 0);
+ ASSERT(dmu_tx_is_syncing(tx));
+
+ dp->dp_tmp_userrefs_obj = zap_create_link(mos, DMU_OT_USERREFS,
+ DMU_POOL_DIRECTORY_OBJECT, DMU_POOL_TMP_USERREFS, tx);
+}
+
+static int
+dsl_pool_user_hold_rele_impl(dsl_pool_t *dp, uint64_t dsobj,
+ const char *tag, uint64_t now, dmu_tx_t *tx, boolean_t holding)
+{
+ objset_t *mos = dp->dp_meta_objset;
+ uint64_t zapobj = dp->dp_tmp_userrefs_obj;
+ char *name;
+ int error;
+
+ ASSERT(spa_version(dp->dp_spa) >= SPA_VERSION_USERREFS);
+ ASSERT(dmu_tx_is_syncing(tx));
+
+ /*
+ * If the pool was created prior to SPA_VERSION_USERREFS, the
+ * zap object for temporary holds might not exist yet.
+ */
+ if (zapobj == 0) {
+ if (holding) {
+ dsl_pool_user_hold_create_obj(dp, tx);
+ zapobj = dp->dp_tmp_userrefs_obj;
+ } else {
+ return (SET_ERROR(ENOENT));
+ }
+ }
+
+ name = kmem_asprintf("%llx-%s", (u_longlong_t)dsobj, tag);
+ if (holding)
+ error = zap_add(mos, zapobj, name, 8, 1, &now, tx);
+ else
+ error = zap_remove(mos, zapobj, name, tx);
+ strfree(name);
+
+ return (error);
+}
+
+/*
+ * Add a temporary hold for the given dataset object and tag.
+ */
+int
+dsl_pool_user_hold(dsl_pool_t *dp, uint64_t dsobj, const char *tag,
+ uint64_t now, dmu_tx_t *tx)
+{
+ return (dsl_pool_user_hold_rele_impl(dp, dsobj, tag, now, tx, B_TRUE));
+}
+
+/*
+ * Release a temporary hold for the given dataset object and tag.
+ */
+int
+dsl_pool_user_release(dsl_pool_t *dp, uint64_t dsobj, const char *tag,
+ dmu_tx_t *tx)
+{
+ return (dsl_pool_user_hold_rele_impl(dp, dsobj, tag, 0, tx, B_FALSE));
+}
+
+/*
+ * DSL Pool Configuration Lock
+ *
+ * The dp_config_rwlock protects against changes to DSL state (e.g. dataset
+ * creation / destruction / rename / property setting). It must be held for
+ * read to hold a dataset or dsl_dir. I.e. you must call
+ * dsl_pool_config_enter() or dsl_pool_hold() before calling
+ * dsl_{dataset,dir}_hold{_obj}. In most circumstances, the dp_config_rwlock
+ * must be held continuously until all datasets and dsl_dirs are released.
+ *
+ * The only exception to this rule is that if a "long hold" is placed on
+ * a dataset, then the dp_config_rwlock may be dropped while the dataset
+ * is still held. The long hold will prevent the dataset from being
+ * destroyed -- the destroy will fail with EBUSY. A long hold can be
+ * obtained by calling dsl_dataset_long_hold(), or by "owning" a dataset
+ * (by calling dsl_{dataset,objset}_{try}own{_obj}).
+ *
+ * Legitimate long-holders (including owners) are long-running, cancelable
+ * tasks whose holds should cause "zfs destroy" to fail. This includes DMU
+ * consumers (i.e. a ZPL filesystem being mounted or ZVOL being open),
+ * "zfs send", and "zfs diff". There are several other long-holders whose
+ * uses are suboptimal (e.g. "zfs promote", and zil_suspend()).
+ *
+ * The usual formula for long-holding would be:
+ * dsl_pool_hold()
+ * dsl_dataset_hold()
+ * ... perform checks ...
+ * dsl_dataset_long_hold()
+ * dsl_pool_rele()
+ * ... perform long-running task ...
+ * dsl_dataset_long_rele()
+ * dsl_dataset_rele()
+ *
+ * Note that when the long hold is released, the dataset is still held but
+ * the pool is not held. The dataset may change arbitrarily during this time
+ * (e.g. it could be destroyed). Therefore you shouldn't do anything to the
+ * dataset except release it.
+ *
+ * User-initiated operations (e.g. ioctls, zfs_ioc_*()) are either read-only
+ * or modifying operations.
+ *
+ * Modifying operations should generally use dsl_sync_task(). The synctask
+ * infrastructure enforces proper locking strategy with respect to the
+ * dp_config_rwlock. See the comment above dsl_sync_task() for details.
+ *
+ * Read-only operations will manually hold the pool, then the dataset, obtain
+ * information from the dataset, then release the pool and dataset.
+ * dmu_objset_{hold,rele}() are convenience routines that also do the pool
+ * hold/rele.
+ */
+
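+/*
+ * Sketch of the read-only pattern described above (illustrative only):
+ *
+ * dsl_pool_t *dp;
+ * dsl_dataset_t *ds;
+ *
+ * error = dsl_pool_hold(name, FTAG, &dp);
+ * if (error != 0)
+ *     return (error);
+ * error = dsl_dataset_hold(dp, name, FTAG, &ds);
+ * if (error != 0) {
+ *     dsl_pool_rele(dp, FTAG);
+ *     return (error);
+ * }
+ * ... read what we need from ds ...
+ * dsl_dataset_rele(ds, FTAG);
+ * dsl_pool_rele(dp, FTAG);
+ */
+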
+int
+dsl_pool_hold(const char *name, void *tag, dsl_pool_t **dp)
+{
+ spa_t *spa;
+ int error;
+
+ error = spa_open(name, &spa, tag);
+ if (error == 0) {
+ *dp = spa_get_dsl(spa);
+ dsl_pool_config_enter(*dp, tag);
+ }
+ return (error);
+}
+
+void
+dsl_pool_rele(dsl_pool_t *dp, void *tag)
+{
+ dsl_pool_config_exit(dp, tag);
+ spa_close(dp->dp_spa, tag);
+}
+
+void
+dsl_pool_config_enter(dsl_pool_t *dp, void *tag)
+{
+ /*
+ * We use a "reentrant" reader-writer lock, but not reentrantly.
+ *
+ * The rrwlock can (with the track_all flag) track all reading threads,
+ * which is very useful for debugging which code path failed to release
+ * the lock, and for verifying that the *current* thread does hold
+ * the lock.
+ *
+ * (Unlike a rwlock, which knows that N threads hold it for
+ * read, but not *which* threads, so rw_held(RW_READER) returns TRUE
+ * if any thread holds it for read, even if this thread doesn't).
+ */
+ ASSERT(!rrw_held(&dp->dp_config_rwlock, RW_READER));
+ rrw_enter(&dp->dp_config_rwlock, RW_READER, tag);
+}
+
+void
+dsl_pool_config_enter_prio(dsl_pool_t *dp, void *tag)
+{
+ ASSERT(!rrw_held(&dp->dp_config_rwlock, RW_READER));
+ rrw_enter_read_prio(&dp->dp_config_rwlock, tag);
+}
+
+void
+dsl_pool_config_exit(dsl_pool_t *dp, void *tag)
+{
+ rrw_exit(&dp->dp_config_rwlock, tag);
+}
+
+boolean_t
+dsl_pool_config_held(dsl_pool_t *dp)
+{
+ return (RRW_LOCK_HELD(&dp->dp_config_rwlock));
+}
+
+boolean_t
+dsl_pool_config_held_writer(dsl_pool_t *dp)
+{
+ return (RRW_WRITE_HELD(&dp->dp_config_rwlock));
+}
diff --git a/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/dsl_prop.c b/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/dsl_prop.c
new file mode 100644
index 000000000000..50aef5b618f9
--- /dev/null
+++ b/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/dsl_prop.c
@@ -0,0 +1,1211 @@
+/*
+ * CDDL HEADER START
+ *
+ * The contents of this file are subject to the terms of the
+ * Common Development and Distribution License (the "License").
+ * You may not use this file except in compliance with the License.
+ *
+ * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
+ * or http://www.opensolaris.org/os/licensing.
+ * See the License for the specific language governing permissions
+ * and limitations under the License.
+ *
+ * When distributing Covered Code, include this CDDL HEADER in each
+ * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
+ * If applicable, add the following below this CDDL HEADER, with the
+ * fields enclosed by brackets "[]" replaced with your own identifying
+ * information: Portions Copyright [yyyy] [name of copyright owner]
+ *
+ * CDDL HEADER END
+ */
+/*
+ * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved.
+ * Copyright (c) 2012, 2015 by Delphix. All rights reserved.
+ * Copyright 2015, Joyent, Inc.
+ */
+
+#include <sys/zfs_context.h>
+#include <sys/dmu.h>
+#include <sys/dmu_objset.h>
+#include <sys/dmu_tx.h>
+#include <sys/dsl_dataset.h>
+#include <sys/dsl_dir.h>
+#include <sys/dsl_prop.h>
+#include <sys/dsl_synctask.h>
+#include <sys/spa.h>
+#include <sys/zap.h>
+#include <sys/fs/zfs.h>
+
+#include "zfs_prop.h"
+
+#define ZPROP_INHERIT_SUFFIX "$inherit"
+#define ZPROP_RECVD_SUFFIX "$recvd"
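+
+/*
+ * For example, a property "compression" may be represented by up to three
+ * entries in a props ZAP: "compression" (a local value),
+ * "compression$inherit" (an explicit inheritance marker), and
+ * "compression$recvd" (a received value).
+ */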
+
+static int
+dodefault(zfs_prop_t prop, int intsz, int numints, void *buf)
+{
+ /*
+ * The setonce properties are read-only, BUT they still
+ * have a default value that can be used as the initial
+ * value.
+ */
+ if (prop == ZPROP_INVAL ||
+ (zfs_prop_readonly(prop) && !zfs_prop_setonce(prop)))
+ return (SET_ERROR(ENOENT));
+
+ if (zfs_prop_get_type(prop) == PROP_TYPE_STRING) {
+ if (zfs_prop_default_string(prop) == NULL)
+ return (SET_ERROR(ENOENT));
+ if (intsz != 1)
+ return (SET_ERROR(EOVERFLOW));
+ (void) strncpy(buf, zfs_prop_default_string(prop),
+ numints);
+ } else {
+ if (intsz != 8 || numints < 1)
+ return (SET_ERROR(EOVERFLOW));
+
+ *(uint64_t *)buf = zfs_prop_default_numeric(prop);
+ }
+
+ return (0);
+}
+
+int
+dsl_prop_get_dd(dsl_dir_t *dd, const char *propname,
+ int intsz, int numints, void *buf, char *setpoint, boolean_t snapshot)
+{
+ int err = ENOENT;
+ dsl_dir_t *target = dd;
+ objset_t *mos = dd->dd_pool->dp_meta_objset;
+ zfs_prop_t prop;
+ boolean_t inheritable;
+ boolean_t inheriting = B_FALSE;
+ char *inheritstr;
+ char *recvdstr;
+
+ ASSERT(dsl_pool_config_held(dd->dd_pool));
+
+ if (setpoint)
+ setpoint[0] = '\0';
+
+ prop = zfs_name_to_prop(propname);
+ inheritable = (prop == ZPROP_INVAL || zfs_prop_inheritable(prop));
+ inheritstr = kmem_asprintf("%s%s", propname, ZPROP_INHERIT_SUFFIX);
+ recvdstr = kmem_asprintf("%s%s", propname, ZPROP_RECVD_SUFFIX);
+
+ /*
+ * Note: dd may become NULL, therefore we shouldn't dereference it
+ * after this loop.
+ */
+ for (; dd != NULL; dd = dd->dd_parent) {
+ if (dd != target || snapshot) {
+ if (!inheritable)
+ break;
+ inheriting = B_TRUE;
+ }
+
+ /* Check for a local value. */
+ err = zap_lookup(mos, dsl_dir_phys(dd)->dd_props_zapobj,
+ propname, intsz, numints, buf);
+ if (err != ENOENT) {
+ if (setpoint != NULL && err == 0)
+ dsl_dir_name(dd, setpoint);
+ break;
+ }
+
+ /*
+ * Skip the check for a received value if there is an explicit
+ * inheritance entry.
+ */
+ err = zap_contains(mos, dsl_dir_phys(dd)->dd_props_zapobj,
+ inheritstr);
+ if (err != 0 && err != ENOENT)
+ break;
+
+ if (err == ENOENT) {
+ /* Check for a received value. */
+ err = zap_lookup(mos, dsl_dir_phys(dd)->dd_props_zapobj,
+ recvdstr, intsz, numints, buf);
+ if (err != ENOENT) {
+ if (setpoint != NULL && err == 0) {
+ if (inheriting) {
+ dsl_dir_name(dd, setpoint);
+ } else {
+ (void) strcpy(setpoint,
+ ZPROP_SOURCE_VAL_RECVD);
+ }
+ }
+ break;
+ }
+ }
+
+ /*
+ * If we found an explicit inheritance entry, err is zero even
+ * though we haven't yet found the value, so reinitializing err
+ * at the end of the loop (instead of at the beginning) ensures
+ * that err has a valid post-loop value.
+ */
+ err = SET_ERROR(ENOENT);
+ }
+
+ if (err == ENOENT)
+ err = dodefault(prop, intsz, numints, buf);
+
+ strfree(inheritstr);
+ strfree(recvdstr);
+
+ return (err);
+}
+
+int
+dsl_prop_get_ds(dsl_dataset_t *ds, const char *propname,
+ int intsz, int numints, void *buf, char *setpoint)
+{
+ zfs_prop_t prop = zfs_name_to_prop(propname);
+ boolean_t inheritable;
+ uint64_t zapobj;
+
+ ASSERT(dsl_pool_config_held(ds->ds_dir->dd_pool));
+ inheritable = (prop == ZPROP_INVAL || zfs_prop_inheritable(prop));
+ zapobj = dsl_dataset_phys(ds)->ds_props_obj;
+
+ if (zapobj != 0) {
+ objset_t *mos = ds->ds_dir->dd_pool->dp_meta_objset;
+ int err;
+
+ ASSERT(ds->ds_is_snapshot);
+
+ /* Check for a local value. */
+ err = zap_lookup(mos, zapobj, propname, intsz, numints, buf);
+ if (err != ENOENT) {
+ if (setpoint != NULL && err == 0)
+ dsl_dataset_name(ds, setpoint);
+ return (err);
+ }
+
+ /*
+ * Skip the check for a received value if there is an explicit
+ * inheritance entry.
+ */
+ if (inheritable) {
+ char *inheritstr = kmem_asprintf("%s%s", propname,
+ ZPROP_INHERIT_SUFFIX);
+ err = zap_contains(mos, zapobj, inheritstr);
+ strfree(inheritstr);
+ if (err != 0 && err != ENOENT)
+ return (err);
+ }
+
+ if (err == ENOENT) {
+ /* Check for a received value. */
+ char *recvdstr = kmem_asprintf("%s%s", propname,
+ ZPROP_RECVD_SUFFIX);
+ err = zap_lookup(mos, zapobj, recvdstr,
+ intsz, numints, buf);
+ strfree(recvdstr);
+ if (err != ENOENT) {
+ if (setpoint != NULL && err == 0)
+ (void) strcpy(setpoint,
+ ZPROP_SOURCE_VAL_RECVD);
+ return (err);
+ }
+ }
+ }
+
+ return (dsl_prop_get_dd(ds->ds_dir, propname,
+ intsz, numints, buf, setpoint, ds->ds_is_snapshot));
+}
+
+static dsl_prop_record_t *
+dsl_prop_record_find(dsl_dir_t *dd, const char *propname)
+{
+ dsl_prop_record_t *pr = NULL;
+
+ ASSERT(MUTEX_HELD(&dd->dd_lock));
+
+ for (pr = list_head(&dd->dd_props);
+ pr != NULL; pr = list_next(&dd->dd_props, pr)) {
+ if (strcmp(pr->pr_propname, propname) == 0)
+ break;
+ }
+
+ return (pr);
+}
+
+static dsl_prop_record_t *
+dsl_prop_record_create(dsl_dir_t *dd, const char *propname)
+{
+ dsl_prop_record_t *pr;
+
+ ASSERT(MUTEX_HELD(&dd->dd_lock));
+
+ pr = kmem_alloc(sizeof (dsl_prop_record_t), KM_SLEEP);
+ pr->pr_propname = spa_strdup(propname);
+ list_create(&pr->pr_cbs, sizeof (dsl_prop_cb_record_t),
+ offsetof(dsl_prop_cb_record_t, cbr_pr_node));
+ list_insert_head(&dd->dd_props, pr);
+
+ return (pr);
+}
+
+void
+dsl_prop_init(dsl_dir_t *dd)
+{
+ list_create(&dd->dd_props, sizeof (dsl_prop_record_t),
+ offsetof(dsl_prop_record_t, pr_node));
+}
+
+void
+dsl_prop_fini(dsl_dir_t *dd)
+{
+ dsl_prop_record_t *pr;
+
+ while ((pr = list_remove_head(&dd->dd_props)) != NULL) {
+ list_destroy(&pr->pr_cbs);
+ strfree((char *)pr->pr_propname);
+ kmem_free(pr, sizeof (dsl_prop_record_t));
+ }
+ list_destroy(&dd->dd_props);
+}
+
+/*
+ * Register interest in the named property. We'll call the callback
+ * once to notify it of the current property value, and again each time
+ * the property changes, until this callback is unregistered.
+ *
+ * Return 0 on success, errno if the prop is not an integer value.
+ */
+int
+dsl_prop_register(dsl_dataset_t *ds, const char *propname,
+ dsl_prop_changed_cb_t *callback, void *cbarg)
+{
+ dsl_dir_t *dd = ds->ds_dir;
+ dsl_pool_t *dp = dd->dd_pool;
+ uint64_t value;
+ dsl_prop_record_t *pr;
+ dsl_prop_cb_record_t *cbr;
+ int err;
+
+ ASSERT(dsl_pool_config_held(dp));
+
+ err = dsl_prop_get_int_ds(ds, propname, &value);
+ if (err != 0)
+ return (err);
+
+ cbr = kmem_alloc(sizeof (dsl_prop_cb_record_t), KM_SLEEP);
+ cbr->cbr_ds = ds;
+ cbr->cbr_func = callback;
+ cbr->cbr_arg = cbarg;
+
+ mutex_enter(&dd->dd_lock);
+ pr = dsl_prop_record_find(dd, propname);
+ if (pr == NULL)
+ pr = dsl_prop_record_create(dd, propname);
+ cbr->cbr_pr = pr;
+ list_insert_head(&pr->pr_cbs, cbr);
+ list_insert_head(&ds->ds_prop_cbs, cbr);
+ mutex_exit(&dd->dd_lock);
+
+ cbr->cbr_func(cbr->cbr_arg, value);
+ return (0);
+}
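+
+/*
+ * An illustrative (hypothetical) consumer, similar in spirit to the
+ * objset property callbacks in dmu_objset.c:
+ *
+ *	static void
+ *	example_recordsize_cb(void *arg, uint64_t newval)
+ *	{
+ *		uint64_t *cached = arg;
+ *		*cached = newval;
+ *	}
+ *
+ *	error = dsl_prop_register(ds, "recordsize",
+ *	    example_recordsize_cb, &cached_recordsize);
+ *
+ * The callback fires immediately with the current value and again on
+ * every change until dsl_prop_unregister_all(ds, &cached_recordsize).
+ */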
+
+int
+dsl_prop_get(const char *dsname, const char *propname,
+ int intsz, int numints, void *buf, char *setpoint)
+{
+ objset_t *os;
+ int error;
+
+ error = dmu_objset_hold(dsname, FTAG, &os);
+ if (error != 0)
+ return (error);
+
+ error = dsl_prop_get_ds(dmu_objset_ds(os), propname,
+ intsz, numints, buf, setpoint);
+
+ dmu_objset_rele(os, FTAG);
+ return (error);
+}
+
+/*
+ * Get the current property value. It may have changed by the time this
+ * function returns, so it is NOT safe to follow up with
+ * dsl_prop_register() and assume that the value has not changed in
+ * between.
+ *
+ * Return 0 on success, ENOENT if ddname is invalid.
+ */
+int
+dsl_prop_get_integer(const char *ddname, const char *propname,
+ uint64_t *valuep, char *setpoint)
+{
+ return (dsl_prop_get(ddname, propname, 8, 1, valuep, setpoint));
+}
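+
+/*
+ * E.g. (sketch): reading a value and then registering a callback is
+ * racy, since the value may change between the two calls:
+ *
+ *	(void) dsl_prop_get_integer(ddname, "recordsize", &val, NULL);
+ *	... value may change here ...
+ *	error = dsl_prop_register(ds, "recordsize", cb, arg);
+ *
+ * Registering first avoids the race, because dsl_prop_register()
+ * delivers the value current at registration time via the callback.
+ */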
+
+int
+dsl_prop_get_int_ds(dsl_dataset_t *ds, const char *propname,
+ uint64_t *valuep)
+{
+ return (dsl_prop_get_ds(ds, propname, 8, 1, valuep, NULL));
+}
+
+/*
+ * Predict the effective value of the given special property if it were set
+ * with the given value and source. This is not a general purpose function.
+ * It exists only to handle the special requirements of the quota and
+ * reservation properties. The fact that these properties are non-inheritable
+ * greatly simplifies the prediction logic.
+ *
+ * Returns 0 on success, a positive error code on failure, or -1 if called
+ * with a property not handled by this function.
+ */
+int
+dsl_prop_predict(dsl_dir_t *dd, const char *propname,
+ zprop_source_t source, uint64_t value, uint64_t *newvalp)
+{
+ zfs_prop_t prop = zfs_name_to_prop(propname);
+ objset_t *mos;
+ uint64_t zapobj;
+ uint64_t version;
+ char *recvdstr;
+ int err = 0;
+
+ switch (prop) {
+ case ZFS_PROP_QUOTA:
+ case ZFS_PROP_RESERVATION:
+ case ZFS_PROP_REFQUOTA:
+ case ZFS_PROP_REFRESERVATION:
+ break;
+ default:
+ return (-1);
+ }
+
+ mos = dd->dd_pool->dp_meta_objset;
+ zapobj = dsl_dir_phys(dd)->dd_props_zapobj;
+ recvdstr = kmem_asprintf("%s%s", propname, ZPROP_RECVD_SUFFIX);
+
+ version = spa_version(dd->dd_pool->dp_spa);
+ if (version < SPA_VERSION_RECVD_PROPS) {
+ if (source & ZPROP_SRC_NONE)
+ source = ZPROP_SRC_NONE;
+ else if (source & ZPROP_SRC_RECEIVED)
+ source = ZPROP_SRC_LOCAL;
+ }
+
+ switch (source) {
+ case ZPROP_SRC_NONE:
+ /* Revert to the received value, if any. */
+ err = zap_lookup(mos, zapobj, recvdstr, 8, 1, newvalp);
+ if (err == ENOENT)
+ *newvalp = 0;
+ break;
+ case ZPROP_SRC_LOCAL:
+ *newvalp = value;
+ break;
+ case ZPROP_SRC_RECEIVED:
+ /*
+ * If there's no local setting, then the new received value will
+ * be the effective value.
+ */
+ err = zap_lookup(mos, zapobj, propname, 8, 1, newvalp);
+ if (err == ENOENT)
+ *newvalp = value;
+ break;
+ case (ZPROP_SRC_NONE | ZPROP_SRC_RECEIVED):
+ /*
+ * We're clearing the received value, so the local setting (if
+ * it exists) remains the effective value.
+ */
+ err = zap_lookup(mos, zapobj, propname, 8, 1, newvalp);
+ if (err == ENOENT)
+ *newvalp = 0;
+ break;
+ default:
+ panic("unexpected property source: %d", source);
+ }
+
+ strfree(recvdstr);
+
+ if (err == ENOENT)
+ return (0);
+
+ return (err);
+}
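+
+/*
+ * E.g. (sketch): with "quota" set locally to 10G on top of a received
+ * value of 20G, dsl_prop_predict(dd, "quota", ZPROP_SRC_NONE, 0,
+ * &newval) looks up "quota$recvd" and predicts newval == 20G; if no
+ * received value exists it predicts 0 (no quota).
+ */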
+
+/*
+ * Unregister all callbacks that are registered with the
+ * given callback argument.
+ */
+void
+dsl_prop_unregister_all(dsl_dataset_t *ds, void *cbarg)
+{
+ dsl_prop_cb_record_t *cbr, *next_cbr;
+
+ dsl_dir_t *dd = ds->ds_dir;
+
+ mutex_enter(&dd->dd_lock);
+ next_cbr = list_head(&ds->ds_prop_cbs);
+ while (next_cbr != NULL) {
+ cbr = next_cbr;
+ next_cbr = list_next(&ds->ds_prop_cbs, cbr);
+ if (cbr->cbr_arg == cbarg) {
+ list_remove(&ds->ds_prop_cbs, cbr);
+ list_remove(&cbr->cbr_pr->pr_cbs, cbr);
+ kmem_free(cbr, sizeof (dsl_prop_cb_record_t));
+ }
+ }
+ mutex_exit(&dd->dd_lock);
+}
+
+boolean_t
+dsl_prop_hascb(dsl_dataset_t *ds)
+{
+ return (!list_is_empty(&ds->ds_prop_cbs));
+}
+
+/* ARGSUSED */
+static int
+dsl_prop_notify_all_cb(dsl_pool_t *dp, dsl_dataset_t *ds, void *arg)
+{
+ dsl_dir_t *dd = ds->ds_dir;
+ dsl_prop_record_t *pr;
+ dsl_prop_cb_record_t *cbr;
+
+ mutex_enter(&dd->dd_lock);
+ for (pr = list_head(&dd->dd_props);
+ pr; pr = list_next(&dd->dd_props, pr)) {
+ for (cbr = list_head(&pr->pr_cbs); cbr;
+ cbr = list_next(&pr->pr_cbs, cbr)) {
+ uint64_t value;
+
+ /*
+ * Callback entries do not have holds on their
+ * datasets so that datasets with registered
+ * callbacks are still eligible for eviction.
+ * Unlike operations to update properties on a
+ * single dataset, we are performing a recursive
+ * descent of related head datasets. The caller
+ * of this function only has a dataset hold on
+ * the passed in head dataset, not the snapshots
+ * associated with this dataset. Without a hold,
+ * the dataset pointer within callback records
+ * for snapshots can be invalidated by eviction
+ * at any time.
+ *
+ * Use dsl_dataset_try_add_ref() to verify
+ * that the dataset for a snapshot has not
+ * begun eviction processing and to prevent
+ * eviction from occurring for the duration of
+ * the callback. If the hold attempt fails,
+ * this object is already being evicted and the
+ * callback can be safely ignored.
+ */
+ if (ds != cbr->cbr_ds &&
+ !dsl_dataset_try_add_ref(dp, cbr->cbr_ds, FTAG))
+ continue;
+
+ if (dsl_prop_get_ds(cbr->cbr_ds,
+ cbr->cbr_pr->pr_propname, sizeof (value), 1,
+ &value, NULL) == 0)
+ cbr->cbr_func(cbr->cbr_arg, value);
+
+ if (ds != cbr->cbr_ds)
+ dsl_dataset_rele(cbr->cbr_ds, FTAG);
+ }
+ }
+ mutex_exit(&dd->dd_lock);
+
+ return (0);
+}
+
+/*
+ * Update all property values for ddobj & its descendants. This is used
+ * when renaming the dir.
+ */
+void
+dsl_prop_notify_all(dsl_dir_t *dd)
+{
+ dsl_pool_t *dp = dd->dd_pool;
+ ASSERT(RRW_WRITE_HELD(&dp->dp_config_rwlock));
+ (void) dmu_objset_find_dp(dp, dd->dd_object, dsl_prop_notify_all_cb,
+ NULL, DS_FIND_CHILDREN);
+}
+
+static void
+dsl_prop_changed_notify(dsl_pool_t *dp, uint64_t ddobj,
+ const char *propname, uint64_t value, int first)
+{
+ dsl_dir_t *dd;
+ dsl_prop_record_t *pr;
+ dsl_prop_cb_record_t *cbr;
+ objset_t *mos = dp->dp_meta_objset;
+ zap_cursor_t zc;
+ zap_attribute_t *za;
+ int err;
+
+ ASSERT(RRW_WRITE_HELD(&dp->dp_config_rwlock));
+ err = dsl_dir_hold_obj(dp, ddobj, NULL, FTAG, &dd);
+ if (err)
+ return;
+
+ if (!first) {
+ /*
+ * If the prop is set here, then this change is not
+ * being inherited here or below; stop the recursion.
+ */
+ err = zap_contains(mos, dsl_dir_phys(dd)->dd_props_zapobj,
+ propname);
+ if (err == 0) {
+ dsl_dir_rele(dd, FTAG);
+ return;
+ }
+ ASSERT3U(err, ==, ENOENT);
+ }
+
+ mutex_enter(&dd->dd_lock);
+ pr = dsl_prop_record_find(dd, propname);
+ if (pr != NULL) {
+ for (cbr = list_head(&pr->pr_cbs); cbr;
+ cbr = list_next(&pr->pr_cbs, cbr)) {
+ uint64_t propobj;
+
+ /*
+ * cbr->cbr_ds may be invalidated due to eviction,
+ * requiring the use of dsl_dataset_try_add_ref().
+ * See comment block in dsl_prop_notify_all_cb()
+ * for details.
+ */
+ if (!dsl_dataset_try_add_ref(dp, cbr->cbr_ds, FTAG))
+ continue;
+
+ propobj = dsl_dataset_phys(cbr->cbr_ds)->ds_props_obj;
+
+ /*
+ * If the property is not set on this ds, then it is
+ * inherited here; call the callback.
+ */
+ if (propobj == 0 ||
+ zap_contains(mos, propobj, propname) != 0)
+ cbr->cbr_func(cbr->cbr_arg, value);
+
+ dsl_dataset_rele(cbr->cbr_ds, FTAG);
+ }
+ }
+ mutex_exit(&dd->dd_lock);
+
+ za = kmem_alloc(sizeof (zap_attribute_t), KM_SLEEP);
+ for (zap_cursor_init(&zc, mos,
+ dsl_dir_phys(dd)->dd_child_dir_zapobj);
+ zap_cursor_retrieve(&zc, za) == 0;
+ zap_cursor_advance(&zc)) {
+ dsl_prop_changed_notify(dp, za->za_first_integer,
+ propname, value, FALSE);
+ }
+ kmem_free(za, sizeof (zap_attribute_t));
+ zap_cursor_fini(&zc);
+ dsl_dir_rele(dd, FTAG);
+}
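+
+/*
+ * E.g. (sketch): for pool/a/b with "recordsize" set locally on pool/a,
+ * changing the property on pool notifies callbacks on pool only: the
+ * zap_contains() check above finds the local entry on pool/a and stops
+ * the recursion there, since that setting masks the inherited change
+ * for pool/a and everything below it.
+ */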
+
+void
+dsl_prop_set_sync_impl(dsl_dataset_t *ds, const char *propname,
+ zprop_source_t source, int intsz, int numints, const void *value,
+ dmu_tx_t *tx)
+{
+ objset_t *mos = ds->ds_dir->dd_pool->dp_meta_objset;
+ uint64_t zapobj, intval, dummy;
+ int isint;
+ char valbuf[32];
+ const char *valstr = NULL;
+ char *inheritstr;
+ char *recvdstr;
+ char *tbuf = NULL;
+ int err;
+ uint64_t version = spa_version(ds->ds_dir->dd_pool->dp_spa);
+
+ isint = (dodefault(zfs_name_to_prop(propname), 8, 1, &intval) == 0);
+
+ if (ds->ds_is_snapshot) {
+ ASSERT(version >= SPA_VERSION_SNAP_PROPS);
+ if (dsl_dataset_phys(ds)->ds_props_obj == 0) {
+ dmu_buf_will_dirty(ds->ds_dbuf, tx);
+ dsl_dataset_phys(ds)->ds_props_obj =
+ zap_create(mos,
+ DMU_OT_DSL_PROPS, DMU_OT_NONE, 0, tx);
+ }
+ zapobj = dsl_dataset_phys(ds)->ds_props_obj;
+ } else {
+ zapobj = dsl_dir_phys(ds->ds_dir)->dd_props_zapobj;
+ }
+
+ if (version < SPA_VERSION_RECVD_PROPS) {
+ if (source & ZPROP_SRC_NONE)
+ source = ZPROP_SRC_NONE;
+ else if (source & ZPROP_SRC_RECEIVED)
+ source = ZPROP_SRC_LOCAL;
+ }
+
+ inheritstr = kmem_asprintf("%s%s", propname, ZPROP_INHERIT_SUFFIX);
+ recvdstr = kmem_asprintf("%s%s", propname, ZPROP_RECVD_SUFFIX);
+
+ switch (source) {
+ case ZPROP_SRC_NONE:
+ /*
+ * revert to received value, if any (inherit -S)
+ * - remove propname
+ * - remove propname$inherit
+ */
+ err = zap_remove(mos, zapobj, propname, tx);
+ ASSERT(err == 0 || err == ENOENT);
+ err = zap_remove(mos, zapobj, inheritstr, tx);
+ ASSERT(err == 0 || err == ENOENT);
+ break;
+ case ZPROP_SRC_LOCAL:
+ /*
+ * remove propname$inherit
+ * set propname -> value
+ */
+ err = zap_remove(mos, zapobj, inheritstr, tx);
+ ASSERT(err == 0 || err == ENOENT);
+ VERIFY0(zap_update(mos, zapobj, propname,
+ intsz, numints, value, tx));
+ break;
+ case ZPROP_SRC_INHERITED:
+ /*
+ * explicitly inherit
+ * - remove propname
+ * - set propname$inherit
+ */
+ err = zap_remove(mos, zapobj, propname, tx);
+ ASSERT(err == 0 || err == ENOENT);
+ if (version >= SPA_VERSION_RECVD_PROPS &&
+ dsl_prop_get_int_ds(ds, ZPROP_HAS_RECVD, &dummy) == 0) {
+ dummy = 0;
+ VERIFY0(zap_update(mos, zapobj, inheritstr,
+ 8, 1, &dummy, tx));
+ }
+ break;
+ case ZPROP_SRC_RECEIVED:
+ /*
+ * set propname$recvd -> value
+ */
+ err = zap_update(mos, zapobj, recvdstr,
+ intsz, numints, value, tx);
+ ASSERT(err == 0);
+ break;
+ case (ZPROP_SRC_NONE | ZPROP_SRC_LOCAL | ZPROP_SRC_RECEIVED):
+ /*
+ * clear local and received settings
+ * - remove propname
+ * - remove propname$inherit
+ * - remove propname$recvd
+ */
+ err = zap_remove(mos, zapobj, propname, tx);
+ ASSERT(err == 0 || err == ENOENT);
+ err = zap_remove(mos, zapobj, inheritstr, tx);
+ ASSERT(err == 0 || err == ENOENT);
+ /* FALLTHRU */
+ case (ZPROP_SRC_NONE | ZPROP_SRC_RECEIVED):
+ /*
+ * remove propname$recvd
+ */
+ err = zap_remove(mos, zapobj, recvdstr, tx);
+ ASSERT(err == 0 || err == ENOENT);
+ break;
+ default:
+ cmn_err(CE_PANIC, "unexpected property source: %d", source);
+ }
+
+ strfree(inheritstr);
+ strfree(recvdstr);
+
+ if (isint) {
+ VERIFY0(dsl_prop_get_int_ds(ds, propname, &intval));
+
+ if (ds->ds_is_snapshot) {
+ dsl_prop_cb_record_t *cbr;
+ /*
+ * It's a snapshot; nothing can inherit this
+ * property, so just look for callbacks on this
+ * ds here.
+ */
+ mutex_enter(&ds->ds_dir->dd_lock);
+ for (cbr = list_head(&ds->ds_prop_cbs); cbr;
+ cbr = list_next(&ds->ds_prop_cbs, cbr)) {
+ if (strcmp(cbr->cbr_pr->pr_propname,
+ propname) == 0)
+ cbr->cbr_func(cbr->cbr_arg, intval);
+ }
+ mutex_exit(&ds->ds_dir->dd_lock);
+ } else {
+ dsl_prop_changed_notify(ds->ds_dir->dd_pool,
+ ds->ds_dir->dd_object, propname, intval, TRUE);
+ }
+
+ (void) snprintf(valbuf, sizeof (valbuf),
+ "%lld", (longlong_t)intval);
+ valstr = valbuf;
+ } else {
+ if (source == ZPROP_SRC_LOCAL) {
+ valstr = value;
+ } else {
+ tbuf = kmem_alloc(ZAP_MAXVALUELEN, KM_SLEEP);
+ if (dsl_prop_get_ds(ds, propname, 1,
+ ZAP_MAXVALUELEN, tbuf, NULL) == 0)
+ valstr = tbuf;
+ }
+ }
+
+ spa_history_log_internal_ds(ds, (source == ZPROP_SRC_NONE ||
+ source == ZPROP_SRC_INHERITED) ? "inherit" : "set", tx,
+ "%s=%s", propname, (valstr == NULL ? "" : valstr));
+
+ if (tbuf != NULL)
+ kmem_free(tbuf, ZAP_MAXVALUELEN);
+}
+
+int
+dsl_prop_set_int(const char *dsname, const char *propname,
+ zprop_source_t source, uint64_t value)
+{
+ nvlist_t *nvl = fnvlist_alloc();
+ int error;
+
+ fnvlist_add_uint64(nvl, propname, value);
+ error = dsl_props_set(dsname, source, nvl);
+ fnvlist_free(nvl);
+ return (error);
+}
+
+int
+dsl_prop_set_string(const char *dsname, const char *propname,
+ zprop_source_t source, const char *value)
+{
+ nvlist_t *nvl = fnvlist_alloc();
+ int error;
+
+ fnvlist_add_string(nvl, propname, value);
+ error = dsl_props_set(dsname, source, nvl);
+ fnvlist_free(nvl);
+ return (error);
+}
+
+int
+dsl_prop_inherit(const char *dsname, const char *propname,
+ zprop_source_t source)
+{
+ nvlist_t *nvl = fnvlist_alloc();
+ int error;
+
+ fnvlist_add_boolean(nvl, propname);
+ error = dsl_props_set(dsname, source, nvl);
+ fnvlist_free(nvl);
+ return (error);
+}
+
+typedef struct dsl_props_set_arg {
+ const char *dpsa_dsname;
+ zprop_source_t dpsa_source;
+ nvlist_t *dpsa_props;
+} dsl_props_set_arg_t;
+
+static int
+dsl_props_set_check(void *arg, dmu_tx_t *tx)
+{
+ dsl_props_set_arg_t *dpsa = arg;
+ dsl_pool_t *dp = dmu_tx_pool(tx);
+ dsl_dataset_t *ds;
+ uint64_t version;
+ nvpair_t *elem = NULL;
+ int err;
+
+ err = dsl_dataset_hold(dp, dpsa->dpsa_dsname, FTAG, &ds);
+ if (err != 0)
+ return (err);
+
+ version = spa_version(ds->ds_dir->dd_pool->dp_spa);
+ while ((elem = nvlist_next_nvpair(dpsa->dpsa_props, elem)) != NULL) {
+ if (strlen(nvpair_name(elem)) >= ZAP_MAXNAMELEN) {
+ dsl_dataset_rele(ds, FTAG);
+ return (SET_ERROR(ENAMETOOLONG));
+ }
+ if (nvpair_type(elem) == DATA_TYPE_STRING) {
+ char *valstr = fnvpair_value_string(elem);
+ if (strlen(valstr) >= (version <
+ SPA_VERSION_STMF_PROP ?
+ ZAP_OLDMAXVALUELEN : ZAP_MAXVALUELEN)) {
+ dsl_dataset_rele(ds, FTAG);
+ return (E2BIG);
+ }
+ }
+ }
+
+ if (ds->ds_is_snapshot && version < SPA_VERSION_SNAP_PROPS) {
+ dsl_dataset_rele(ds, FTAG);
+ return (SET_ERROR(ENOTSUP));
+ }
+ dsl_dataset_rele(ds, FTAG);
+ return (0);
+}
+
+void
+dsl_props_set_sync_impl(dsl_dataset_t *ds, zprop_source_t source,
+ nvlist_t *props, dmu_tx_t *tx)
+{
+ nvpair_t *elem = NULL;
+
+ while ((elem = nvlist_next_nvpair(props, elem)) != NULL) {
+ nvpair_t *pair = elem;
+ const char *name = nvpair_name(pair);
+
+ if (nvpair_type(pair) == DATA_TYPE_NVLIST) {
+ /*
+ * This usually happens when we reuse the nvlist_t data
+ * returned by the counterpart dsl_prop_get_all_impl().
+ * For instance we do this to restore the original
+ * received properties when an error occurs in the
+ * zfs_ioc_recv() codepath.
+ */
+ nvlist_t *attrs = fnvpair_value_nvlist(pair);
+ pair = fnvlist_lookup_nvpair(attrs, ZPROP_VALUE);
+ }
+
+ if (nvpair_type(pair) == DATA_TYPE_STRING) {
+ const char *value = fnvpair_value_string(pair);
+ dsl_prop_set_sync_impl(ds, name,
+ source, 1, strlen(value) + 1, value, tx);
+ } else if (nvpair_type(pair) == DATA_TYPE_UINT64) {
+ uint64_t intval = fnvpair_value_uint64(pair);
+ dsl_prop_set_sync_impl(ds, name,
+ source, sizeof (intval), 1, &intval, tx);
+ } else if (nvpair_type(pair) == DATA_TYPE_BOOLEAN) {
+ dsl_prop_set_sync_impl(ds, name,
+ source, 0, 0, NULL, tx);
+ } else {
+ panic("invalid nvpair type");
+ }
+ }
+}
+
+static void
+dsl_props_set_sync(void *arg, dmu_tx_t *tx)
+{
+ dsl_props_set_arg_t *dpsa = arg;
+ dsl_pool_t *dp = dmu_tx_pool(tx);
+ dsl_dataset_t *ds;
+
+ VERIFY0(dsl_dataset_hold(dp, dpsa->dpsa_dsname, FTAG, &ds));
+ dsl_props_set_sync_impl(ds, dpsa->dpsa_source, dpsa->dpsa_props, tx);
+ dsl_dataset_rele(ds, FTAG);
+}
+
+/*
+ * All-or-nothing; if any prop can't be set, nothing will be modified.
+ */
+int
+dsl_props_set(const char *dsname, zprop_source_t source, nvlist_t *props)
+{
+ dsl_props_set_arg_t dpsa;
+ int nblks = 0;
+
+ dpsa.dpsa_dsname = dsname;
+ dpsa.dpsa_source = source;
+ dpsa.dpsa_props = props;
+
+ /*
+ * If the source includes NONE, then we will only be removing entries
+ * from the ZAP object. In that case don't check for ENOSPC.
+ */
+ if ((source & ZPROP_SRC_NONE) == 0)
+ nblks = 2 * fnvlist_num_pairs(props);
+
+ return (dsl_sync_task(dsname, dsl_props_set_check, dsl_props_set_sync,
+ &dpsa, nblks, ZFS_SPACE_CHECK_RESERVED));
+}
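+
+/*
+ * E.g. (sketch): setting two properties atomically; if either fails
+ * dsl_props_set_check() (e.g. ENAMETOOLONG, E2BIG), neither is applied:
+ *
+ *	nvlist_t *nvl = fnvlist_alloc();
+ *	fnvlist_add_uint64(nvl, "quota", 10ULL << 30);
+ *	fnvlist_add_string(nvl, "org.example:note", "some value");
+ *	error = dsl_props_set("pool/fs", ZPROP_SRC_LOCAL, nvl);
+ *	fnvlist_free(nvl);
+ */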
+
+typedef enum dsl_prop_getflags {
+ DSL_PROP_GET_INHERITING = 0x1, /* searching parent of target ds */
+ DSL_PROP_GET_SNAPSHOT = 0x2, /* snapshot dataset */
+ DSL_PROP_GET_LOCAL = 0x4, /* local properties */
+ DSL_PROP_GET_RECEIVED = 0x8 /* received properties */
+} dsl_prop_getflags_t;
+
+static int
+dsl_prop_get_all_impl(objset_t *mos, uint64_t propobj,
+ const char *setpoint, dsl_prop_getflags_t flags, nvlist_t *nv)
+{
+ zap_cursor_t zc;
+ zap_attribute_t za;
+ int err = 0;
+
+ for (zap_cursor_init(&zc, mos, propobj);
+ (err = zap_cursor_retrieve(&zc, &za)) == 0;
+ zap_cursor_advance(&zc)) {
+ nvlist_t *propval;
+ zfs_prop_t prop;
+ char buf[ZAP_MAXNAMELEN];
+ char *valstr;
+ const char *suffix;
+ const char *propname;
+ const char *source;
+
+ suffix = strchr(za.za_name, '$');
+
+ if (suffix == NULL) {
+ /*
+ * Skip local properties if we only want received
+ * properties.
+ */
+ if (flags & DSL_PROP_GET_RECEIVED)
+ continue;
+
+ propname = za.za_name;
+ source = setpoint;
+ } else if (strcmp(suffix, ZPROP_INHERIT_SUFFIX) == 0) {
+ /* Skip explicitly inherited entries. */
+ continue;
+ } else if (strcmp(suffix, ZPROP_RECVD_SUFFIX) == 0) {
+ if (flags & DSL_PROP_GET_LOCAL)
+ continue;
+
+ (void) strncpy(buf, za.za_name, (suffix - za.za_name));
+ buf[suffix - za.za_name] = '\0';
+ propname = buf;
+
+ if (!(flags & DSL_PROP_GET_RECEIVED)) {
+ /* Skip if locally overridden. */
+ err = zap_contains(mos, propobj, propname);
+ if (err == 0)
+ continue;
+ if (err != ENOENT)
+ break;
+
+ /* Skip if explicitly inherited. */
+ valstr = kmem_asprintf("%s%s", propname,
+ ZPROP_INHERIT_SUFFIX);
+ err = zap_contains(mos, propobj, valstr);
+ strfree(valstr);
+ if (err == 0)
+ continue;
+ if (err != ENOENT)
+ break;
+ }
+
+ source = ((flags & DSL_PROP_GET_INHERITING) ?
+ setpoint : ZPROP_SOURCE_VAL_RECVD);
+ } else {
+ /*
+ * For backward compatibility, skip suffixes we don't
+ * recognize.
+ */
+ continue;
+ }
+
+ prop = zfs_name_to_prop(propname);
+
+ /* Skip non-inheritable properties. */
+ if ((flags & DSL_PROP_GET_INHERITING) && prop != ZPROP_INVAL &&
+ !zfs_prop_inheritable(prop))
+ continue;
+
+ /* Skip properties not valid for this type. */
+ if ((flags & DSL_PROP_GET_SNAPSHOT) && prop != ZPROP_INVAL &&
+ !zfs_prop_valid_for_type(prop, ZFS_TYPE_SNAPSHOT))
+ continue;
+
+ /* Skip properties already defined. */
+ if (nvlist_exists(nv, propname))
+ continue;
+
+ VERIFY(nvlist_alloc(&propval, NV_UNIQUE_NAME, KM_SLEEP) == 0);
+ if (za.za_integer_length == 1) {
+ /*
+ * String property
+ */
+ char *tmp = kmem_alloc(za.za_num_integers,
+ KM_SLEEP);
+ err = zap_lookup(mos, propobj,
+ za.za_name, 1, za.za_num_integers, tmp);
+ if (err != 0) {
+ kmem_free(tmp, za.za_num_integers);
+ break;
+ }
+ VERIFY(nvlist_add_string(propval, ZPROP_VALUE,
+ tmp) == 0);
+ kmem_free(tmp, za.za_num_integers);
+ } else {
+ /*
+ * Integer property
+ */
+ ASSERT(za.za_integer_length == 8);
+ (void) nvlist_add_uint64(propval, ZPROP_VALUE,
+ za.za_first_integer);
+ }
+
+ VERIFY(nvlist_add_string(propval, ZPROP_SOURCE, source) == 0);
+ VERIFY(nvlist_add_nvlist(nv, propname, propval) == 0);
+ nvlist_free(propval);
+ }
+ zap_cursor_fini(&zc);
+ if (err == ENOENT)
+ err = 0;
+ return (err);
+}
+
+/*
+ * Iterate over all properties for this dataset and return them in an nvlist.
+ */
+static int
+dsl_prop_get_all_ds(dsl_dataset_t *ds, nvlist_t **nvp,
+ dsl_prop_getflags_t flags)
+{
+ dsl_dir_t *dd = ds->ds_dir;
+ dsl_pool_t *dp = dd->dd_pool;
+ objset_t *mos = dp->dp_meta_objset;
+ int err = 0;
+ char setpoint[ZFS_MAX_DATASET_NAME_LEN];
+
+ VERIFY(nvlist_alloc(nvp, NV_UNIQUE_NAME, KM_SLEEP) == 0);
+
+ if (ds->ds_is_snapshot)
+ flags |= DSL_PROP_GET_SNAPSHOT;
+
+ ASSERT(dsl_pool_config_held(dp));
+
+ if (dsl_dataset_phys(ds)->ds_props_obj != 0) {
+ ASSERT(flags & DSL_PROP_GET_SNAPSHOT);
+ dsl_dataset_name(ds, setpoint);
+ err = dsl_prop_get_all_impl(mos,
+ dsl_dataset_phys(ds)->ds_props_obj, setpoint, flags, *nvp);
+ if (err)
+ goto out;
+ }
+
+ for (; dd != NULL; dd = dd->dd_parent) {
+ if (dd != ds->ds_dir || (flags & DSL_PROP_GET_SNAPSHOT)) {
+ if (flags & (DSL_PROP_GET_LOCAL |
+ DSL_PROP_GET_RECEIVED))
+ break;
+ flags |= DSL_PROP_GET_INHERITING;
+ }
+ dsl_dir_name(dd, setpoint);
+ err = dsl_prop_get_all_impl(mos,
+ dsl_dir_phys(dd)->dd_props_zapobj, setpoint, flags, *nvp);
+ if (err)
+ break;
+ }
+out:
+ return (err);
+}
+
+boolean_t
+dsl_prop_get_hasrecvd(const char *dsname)
+{
+ uint64_t dummy;
+
+ return (0 ==
+ dsl_prop_get_integer(dsname, ZPROP_HAS_RECVD, &dummy, NULL));
+}
+
+static int
+dsl_prop_set_hasrecvd_impl(const char *dsname, zprop_source_t source)
+{
+ uint64_t version;
+ spa_t *spa;
+ int error = 0;
+
+ VERIFY0(spa_open(dsname, &spa, FTAG));
+ version = spa_version(spa);
+ spa_close(spa, FTAG);
+
+ if (version >= SPA_VERSION_RECVD_PROPS)
+ error = dsl_prop_set_int(dsname, ZPROP_HAS_RECVD, source, 0);
+ return (error);
+}
+
+/*
+ * Call after successfully receiving properties to ensure that only the first
+ * receive on or after SPA_VERSION_RECVD_PROPS blows away local properties.
+ */
+int
+dsl_prop_set_hasrecvd(const char *dsname)
+{
+ int error = 0;
+ if (!dsl_prop_get_hasrecvd(dsname))
+ error = dsl_prop_set_hasrecvd_impl(dsname, ZPROP_SRC_LOCAL);
+ return (error);
+}
+
+void
+dsl_prop_unset_hasrecvd(const char *dsname)
+{
+ VERIFY0(dsl_prop_set_hasrecvd_impl(dsname, ZPROP_SRC_NONE));
+}
+
+int
+dsl_prop_get_all(objset_t *os, nvlist_t **nvp)
+{
+ return (dsl_prop_get_all_ds(os->os_dsl_dataset, nvp, 0));
+}
+
+int
+dsl_prop_get_received(const char *dsname, nvlist_t **nvp)
+{
+ objset_t *os;
+ int error;
+
+ /*
+ * Received properties are not distinguishable from local properties
+ * until the dataset has received properties on or after
+ * SPA_VERSION_RECVD_PROPS.
+ */
+ dsl_prop_getflags_t flags = (dsl_prop_get_hasrecvd(dsname) ?
+ DSL_PROP_GET_RECEIVED : DSL_PROP_GET_LOCAL);
+
+ error = dmu_objset_hold(dsname, FTAG, &os);
+ if (error != 0)
+ return (error);
+ error = dsl_prop_get_all_ds(os->os_dsl_dataset, nvp, flags);
+ dmu_objset_rele(os, FTAG);
+ return (error);
+}
+
+void
+dsl_prop_nvlist_add_uint64(nvlist_t *nv, zfs_prop_t prop, uint64_t value)
+{
+ nvlist_t *propval;
+ const char *propname = zfs_prop_to_name(prop);
+ uint64_t default_value;
+
+ if (nvlist_lookup_nvlist(nv, propname, &propval) == 0) {
+ VERIFY(nvlist_add_uint64(propval, ZPROP_VALUE, value) == 0);
+ return;
+ }
+
+ VERIFY(nvlist_alloc(&propval, NV_UNIQUE_NAME, KM_SLEEP) == 0);
+ VERIFY(nvlist_add_uint64(propval, ZPROP_VALUE, value) == 0);
+ /* Indicate the default source if we can. */
+ if (dodefault(prop, 8, 1, &default_value) == 0 &&
+ value == default_value) {
+ VERIFY(nvlist_add_string(propval, ZPROP_SOURCE, "") == 0);
+ }
+ VERIFY(nvlist_add_nvlist(nv, propname, propval) == 0);
+ nvlist_free(propval);
+}
+
+void
+dsl_prop_nvlist_add_string(nvlist_t *nv, zfs_prop_t prop, const char *value)
+{
+ nvlist_t *propval;
+ const char *propname = zfs_prop_to_name(prop);
+
+ if (nvlist_lookup_nvlist(nv, propname, &propval) == 0) {
+ VERIFY(nvlist_add_string(propval, ZPROP_VALUE, value) == 0);
+ return;
+ }
+
+ VERIFY(nvlist_alloc(&propval, NV_UNIQUE_NAME, KM_SLEEP) == 0);
+ VERIFY(nvlist_add_string(propval, ZPROP_VALUE, value) == 0);
+ VERIFY(nvlist_add_nvlist(nv, propname, propval) == 0);
+ nvlist_free(propval);
+}
diff --git a/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/dsl_scan.c b/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/dsl_scan.c
new file mode 100644
index 000000000000..f87a0539e9bb
--- /dev/null
+++ b/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/dsl_scan.c
@@ -0,0 +1,4001 @@
+/*
+ * CDDL HEADER START
+ *
+ * The contents of this file are subject to the terms of the
+ * Common Development and Distribution License (the "License").
+ * You may not use this file except in compliance with the License.
+ *
+ * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
+ * or http://www.opensolaris.org/os/licensing.
+ * See the License for the specific language governing permissions
+ * and limitations under the License.
+ *
+ * When distributing Covered Code, include this CDDL HEADER in each
+ * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
+ * If applicable, add the following below this CDDL HEADER, with the
+ * fields enclosed by brackets "[]" replaced with your own identifying
+ * information: Portions Copyright [yyyy] [name of copyright owner]
+ *
+ * CDDL HEADER END
+ */
+/*
+ * Copyright (c) 2008, 2010, Oracle and/or its affiliates. All rights reserved.
+ * Copyright (c) 2011, 2018 by Delphix. All rights reserved.
+ * Copyright 2016 Gary Mills
+ * Copyright (c) 2011, 2017 by Delphix. All rights reserved.
+ * Copyright 2017 Joyent, Inc.
+ * Copyright (c) 2017 Datto Inc.
+ */
+
+#include <sys/dsl_scan.h>
+#include <sys/dsl_pool.h>
+#include <sys/dsl_dataset.h>
+#include <sys/dsl_prop.h>
+#include <sys/dsl_dir.h>
+#include <sys/dsl_synctask.h>
+#include <sys/dnode.h>
+#include <sys/dmu_tx.h>
+#include <sys/dmu_objset.h>
+#include <sys/arc.h>
+#include <sys/zap.h>
+#include <sys/zio.h>
+#include <sys/zfs_context.h>
+#include <sys/fs/zfs.h>
+#include <sys/zfs_znode.h>
+#include <sys/spa_impl.h>
+#include <sys/vdev_impl.h>
+#include <sys/zil_impl.h>
+#include <sys/zio_checksum.h>
+#include <sys/ddt.h>
+#include <sys/sa.h>
+#include <sys/sa_impl.h>
+#include <sys/zfeature.h>
+#include <sys/abd.h>
+#include <sys/range_tree.h>
+#ifdef _KERNEL
+#include <sys/zfs_vfsops.h>
+#endif
+
+/*
+ * Grand theory statement on scan queue sorting
+ *
+ * Scanning is implemented by recursively traversing all indirection levels
+ * in an object and reading all blocks referenced from said objects. This
+ * results in us approximately traversing the object from lowest logical
+ * offset to the highest. For best performance, we would want the logical
+ * blocks to be physically contiguous. However, this is frequently not the
+ * case with pools given the allocation patterns of copy-on-write filesystems.
+ * So instead, we put the I/Os into a reordering queue and issue them in a
+ * way that will most benefit physical disks (LBA-order).
+ *
+ * Queue management:
+ *
+ * Ideally, we would want to scan all metadata and queue up all block I/O
+ * prior to starting to issue it, because that allows us to do an optimal
+ * sorting job. This can however consume large amounts of memory. Therefore
+ * we continuously monitor the size of the queues and constrain them to 5%
+ * (zfs_scan_mem_lim_fact) of physmem. If the queues grow larger than this
+ * limit, we clear out a few of the largest extents at the head of the queues
+ * to make room for more scanning. Hopefully, these extents will be fairly
+ * large and contiguous, allowing us to approach sequential I/O throughput
+ * even without a fully sorted tree.
+ *
+ * Metadata scanning takes place in dsl_scan_visit(), which is called from
+ * dsl_scan_sync() every spa_sync(). If we have either fully scanned all
+ * metadata on the pool, or we need to make room in memory because our
+ * queues are too large, dsl_scan_visit() is postponed and
+ * scan_io_queues_run() is called from dsl_scan_sync() instead. This implies
+ * that metadata scanning and queued I/O issuing are mutually exclusive. This
+ * allows us to provide maximum sequential I/O throughput for the majority
+ * of I/Os issued, since sequential I/O performance suffers significantly
+ * when it is interleaved with random I/O.
+ *
+ * Implementation Notes
+ *
+ * One side effect of the queued scanning algorithm is that the scanning code
+ * needs to be notified whenever a block is freed. This is needed to allow
+ * the scanning code to remove these I/Os from the issuing queue. Additionally,
+ * we do not attempt to queue gang blocks to be issued sequentially since this
+ * is very hard to do and would have an extremely limited performance benefit.
+ * Instead, we simply issue gang I/Os as soon as we find them using the legacy
+ * algorithm.
+ *
+ * Backwards compatibility
+ *
+ * This new algorithm is backwards compatible with the legacy on-disk data
+ * structures (and therefore does not require a new feature flag).
+ * Periodically during scanning (see zfs_scan_checkpoint_intval), the scan
+ * will stop scanning metadata (in logical order) and wait for all outstanding
+ * sorted I/O to complete. Once this is done, we write out a checkpoint
+ * bookmark, indicating that we have scanned everything logically before it.
+ * If the pool is imported on a machine without the new sorting algorithm,
+ * the scan simply resumes from the last checkpoint using the legacy algorithm.
+ */
+
+typedef int (scan_cb_t)(dsl_pool_t *, const blkptr_t *,
+ const zbookmark_phys_t *);
+
+static scan_cb_t dsl_scan_scrub_cb;
+
+static int scan_ds_queue_compare(const void *a, const void *b);
+static int scan_prefetch_queue_compare(const void *a, const void *b);
+static void scan_ds_queue_clear(dsl_scan_t *scn);
+static boolean_t scan_ds_queue_contains(dsl_scan_t *scn, uint64_t dsobj,
+ uint64_t *txg);
+static void scan_ds_queue_insert(dsl_scan_t *scn, uint64_t dsobj, uint64_t txg);
+static void scan_ds_queue_remove(dsl_scan_t *scn, uint64_t dsobj);
+static void scan_ds_queue_sync(dsl_scan_t *scn, dmu_tx_t *tx);
+static uint64_t dsl_scan_count_leaves(vdev_t *vd);
+
+extern int zfs_vdev_async_write_active_min_dirty_percent;
+
+/*
+ * By default zfs will check to ensure it is not over the hard memory
+ * limit before each txg. If finer-grained control of this is needed
+ * this value can be set to 1 to enable checking before scanning each
+ * block.
+ */
+int zfs_scan_strict_mem_lim = B_FALSE;
+
+unsigned int zfs_resilver_delay = 2; /* number of ticks to delay resilver -- 2 is a good number */
+unsigned int zfs_scrub_delay = 4; /* number of ticks to delay scrub -- 4 is a good number */
+unsigned int zfs_scan_idle = 50; /* idle window in clock ticks */
+
+/*
+ * Maximum number of concurrently in-flight bytes per leaf vdev. We attempt
+ * to strike a balance here between keeping the vdev queues full of I/Os
+ * at all times and not overflowing the queues to cause long latency,
+ * which would cause long txg sync times. No matter what, we will not
+ * overload the drives with I/O, since that is protected by
+ * zfs_vdev_scrub_max_active.
+ */
+unsigned long zfs_scan_vdev_limit = 4 << 20;
+
+int zfs_scan_issue_strategy = 0;
+int zfs_scan_legacy = B_FALSE; /* don't queue & sort zios, go direct */
+uint64_t zfs_scan_max_ext_gap = 2 << 20; /* in bytes */
+
+unsigned int zfs_scan_checkpoint_intval = 7200; /* seconds */
+#define ZFS_SCAN_CHECKPOINT_INTVAL SEC_TO_TICK(zfs_scan_checkpoint_intval)
+
+/*
+ * fill_weight is non-tunable at runtime, so we copy it at module init from
+ * zfs_scan_fill_weight. Runtime adjustments to zfs_scan_fill_weight would
+ * break queue sorting.
+ */
+uint64_t zfs_scan_fill_weight = 3;
+static uint64_t fill_weight;
+
+/* See dsl_scan_should_clear() for details on the memory limit tunables */
+uint64_t zfs_scan_mem_lim_min = 16 << 20; /* bytes */
+uint64_t zfs_scan_mem_lim_soft_max = 128 << 20; /* bytes */
+int zfs_scan_mem_lim_fact = 20; /* fraction of physmem */
+int zfs_scan_mem_lim_soft_fact = 20; /* fraction of mem lim above */
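+
+/*
+ * E.g.: with 16 GiB of physmem and the defaults above, the hard limit
+ * comes to 16 GiB / 20 = ~819 MiB (well above the 16 MiB floor), and
+ * the soft limit, derived from the hard limit via
+ * zfs_scan_mem_lim_soft_fact, to roughly 819 - 819 / 20 = ~778 MiB.
+ * See dsl_scan_should_clear() for the authoritative computation.
+ */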
+
+unsigned int zfs_scrub_min_time_ms = 1000; /* min millisecs to scrub per txg */
+unsigned int zfs_free_min_time_ms = 1000; /* min millisecs to free per txg */
+unsigned int zfs_obsolete_min_time_ms = 500; /* min millisecs to obsolete per txg */
+unsigned int zfs_resilver_min_time_ms = 3000; /* min millisecs to resilver per txg */
+boolean_t zfs_no_scrub_io = B_FALSE; /* set to disable scrub i/o */
+boolean_t zfs_no_scrub_prefetch = B_FALSE; /* set to disable scrub prefetch */
+
+SYSCTL_DECL(_vfs_zfs);
+SYSCTL_UINT(_vfs_zfs, OID_AUTO, resilver_delay, CTLFLAG_RWTUN,
+ &zfs_resilver_delay, 0, "Number of ticks to delay resilver");
+SYSCTL_UINT(_vfs_zfs, OID_AUTO, scrub_delay, CTLFLAG_RWTUN,
+ &zfs_scrub_delay, 0, "Number of ticks to delay scrub");
+SYSCTL_UINT(_vfs_zfs, OID_AUTO, scan_idle, CTLFLAG_RWTUN,
+ &zfs_scan_idle, 0, "Idle scan window in clock ticks");
+SYSCTL_UINT(_vfs_zfs, OID_AUTO, scan_min_time_ms, CTLFLAG_RWTUN,
+ &zfs_scrub_min_time_ms, 0, "Min millisecs to scrub per txg");
+SYSCTL_UINT(_vfs_zfs, OID_AUTO, free_min_time_ms, CTLFLAG_RWTUN,
+ &zfs_free_min_time_ms, 0, "Min millisecs to free per txg");
+SYSCTL_UINT(_vfs_zfs, OID_AUTO, resilver_min_time_ms, CTLFLAG_RWTUN,
+ &zfs_resilver_min_time_ms, 0, "Min millisecs to resilver per txg");
+SYSCTL_INT(_vfs_zfs, OID_AUTO, no_scrub_io, CTLFLAG_RWTUN,
+ &zfs_no_scrub_io, 0, "Disable scrub I/O");
+SYSCTL_INT(_vfs_zfs, OID_AUTO, no_scrub_prefetch, CTLFLAG_RWTUN,
+ &zfs_no_scrub_prefetch, 0, "Disable scrub prefetching");
+SYSCTL_UINT(_vfs_zfs, OID_AUTO, zfs_scan_legacy, CTLFLAG_RWTUN,
+ &zfs_scan_legacy, 0, "Scrub using legacy non-sequential method");
+SYSCTL_UINT(_vfs_zfs, OID_AUTO, zfs_scan_checkpoint_interval, CTLFLAG_RWTUN,
+ &zfs_scan_checkpoint_intval, 0, "Scan progress on-disk checkpointing interval");
+
+enum ddt_class zfs_scrub_ddt_class_max = DDT_CLASS_DUPLICATE;
+/* max number of blocks to free in a single TXG */
+uint64_t zfs_async_block_max_blocks = UINT64_MAX;
+SYSCTL_UQUAD(_vfs_zfs, OID_AUTO, free_max_blocks, CTLFLAG_RWTUN,
+ &zfs_async_block_max_blocks, 0, "Maximum number of blocks to free in one TXG");
+
+/*
+ * We wait a few txgs after importing a pool to begin scanning so that
+ * the import / mounting code isn't held up by scrub / resilver IO.
+ * Unfortunately, it is a bit difficult to determine exactly how long
+ * this will take since userspace will trigger fs mounts asynchronously
+ * and the kernel will create zvol minors asynchronously. As a result,
+ * the value provided here is a bit arbitrary, but represents a
+ * reasonable estimate of how many txgs it will take to finish fully
+ * importing a pool
+ */
+#define SCAN_IMPORT_WAIT_TXGS 5
+
+#define DSL_SCAN_IS_SCRUB_RESILVER(scn) \
+ ((scn)->scn_phys.scn_func == POOL_SCAN_SCRUB || \
+ (scn)->scn_phys.scn_func == POOL_SCAN_RESILVER)
+
+extern int zfs_txg_timeout;
+
+/*
+ * Enable/disable the processing of the free_bpobj object.
+ */
+boolean_t zfs_free_bpobj_enabled = B_TRUE;
+
+SYSCTL_INT(_vfs_zfs, OID_AUTO, free_bpobj_enabled, CTLFLAG_RWTUN,
+ &zfs_free_bpobj_enabled, 0, "Enable free_bpobj processing");
+
+/* the order has to match pool_scan_type */
+static scan_cb_t *scan_funcs[POOL_SCAN_FUNCS] = {
+ NULL,
+ dsl_scan_scrub_cb, /* POOL_SCAN_SCRUB */
+ dsl_scan_scrub_cb, /* POOL_SCAN_RESILVER */
+};
+
+/* In core node for the scn->scn_queue. Represents a dataset to be scanned */
+typedef struct {
+ uint64_t sds_dsobj;
+ uint64_t sds_txg;
+ avl_node_t sds_node;
+} scan_ds_t;
+
+/*
+ * This controls what conditions are placed on dsl_scan_sync_state():
+ * SYNC_OPTIONAL) write out scn_phys iff scn_bytes_pending == 0
+ * SYNC_MANDATORY) write out scn_phys always. scn_bytes_pending must be 0.
+ * SYNC_CACHED) if scn_bytes_pending == 0, write out scn_phys. Otherwise
+ * write out the scn_phys_cached version.
+ * See dsl_scan_sync_state for details.
+ */
+typedef enum {
+ SYNC_OPTIONAL,
+ SYNC_MANDATORY,
+ SYNC_CACHED
+} state_sync_type_t;
+
+/*
+ * This struct represents the minimum information needed to reconstruct a
+ * zio for sequential scanning. This is useful because many of these will
+ * accumulate in the sequential IO queues before being issued, so saving
+ * memory matters here.
+ */
+typedef struct scan_io {
+ /* fields from blkptr_t */
+ uint64_t sio_offset;
+ uint64_t sio_blk_prop;
+ uint64_t sio_phys_birth;
+ uint64_t sio_birth;
+ zio_cksum_t sio_cksum;
+ uint32_t sio_asize;
+
+ /* fields from zio_t */
+ int sio_flags;
+ zbookmark_phys_t sio_zb;
+
+ /* members for queue sorting */
+ union {
+		avl_node_t sio_addr_node; /* link into issuing queue */
+ list_node_t sio_list_node; /* link for issuing to disk */
+ } sio_nodes;
+} scan_io_t;
+
+struct dsl_scan_io_queue {
+ dsl_scan_t *q_scn; /* associated dsl_scan_t */
+ vdev_t *q_vd; /* top-level vdev that this queue represents */
+
+ /* trees used for sorting I/Os and extents of I/Os */
+ range_tree_t *q_exts_by_addr;
+ avl_tree_t q_exts_by_size;
+ avl_tree_t q_sios_by_addr;
+
+ /* members for zio rate limiting */
+ uint64_t q_maxinflight_bytes;
+ uint64_t q_inflight_bytes;
+ kcondvar_t q_zio_cv; /* used under vd->vdev_scan_io_queue_lock */
+
+ /* per txg statistics */
+ uint64_t q_total_seg_size_this_txg;
+ uint64_t q_segs_this_txg;
+ uint64_t q_total_zio_size_this_txg;
+ uint64_t q_zios_this_txg;
+};
+
+/* private data for dsl_scan_prefetch_cb() */
+typedef struct scan_prefetch_ctx {
+ zfs_refcount_t spc_refcnt; /* refcount for memory management */
+ dsl_scan_t *spc_scn; /* dsl_scan_t for the pool */
+ boolean_t spc_root; /* is this prefetch for an objset? */
+ uint8_t spc_indblkshift; /* dn_indblkshift of current dnode */
+ uint16_t spc_datablkszsec; /* dn_idatablkszsec of current dnode */
+} scan_prefetch_ctx_t;
+
+/* private data for dsl_scan_prefetch() */
+typedef struct scan_prefetch_issue_ctx {
+ avl_node_t spic_avl_node; /* link into scn->scn_prefetch_queue */
+ scan_prefetch_ctx_t *spic_spc; /* spc for the callback */
+ blkptr_t spic_bp; /* bp to prefetch */
+ zbookmark_phys_t spic_zb; /* bookmark to prefetch */
+} scan_prefetch_issue_ctx_t;
+
+static void scan_exec_io(dsl_pool_t *dp, const blkptr_t *bp, int zio_flags,
+ const zbookmark_phys_t *zb, dsl_scan_io_queue_t *queue);
+static void scan_io_queue_insert_impl(dsl_scan_io_queue_t *queue,
+ scan_io_t *sio);
+
+static dsl_scan_io_queue_t *scan_io_queue_create(vdev_t *vd);
+static void scan_io_queues_destroy(dsl_scan_t *scn);
+
+static kmem_cache_t *sio_cache;
+
+void
+scan_init(void)
+{
+ /*
+ * This is used in ext_size_compare() to weight segments
+ * based on how sparse they are. This cannot be changed
+ * mid-scan and the tree comparison functions don't currently
+	 * have a mechanism for passing additional context to the
+	 * compare functions. Thus we store this value globally and
+	 * we only allow it to be set at module initialization time.
+ */
+ fill_weight = zfs_scan_fill_weight;
+
+ sio_cache = kmem_cache_create("sio_cache",
+ sizeof (scan_io_t), 0, NULL, NULL, NULL, NULL, NULL, 0);
+}
+
+void
+scan_fini(void)
+{
+ kmem_cache_destroy(sio_cache);
+}
+
+static inline boolean_t
+dsl_scan_is_running(const dsl_scan_t *scn)
+{
+ return (scn->scn_phys.scn_state == DSS_SCANNING);
+}
+
+boolean_t
+dsl_scan_resilvering(dsl_pool_t *dp)
+{
+ return (dsl_scan_is_running(dp->dp_scan) &&
+ dp->dp_scan->scn_phys.scn_func == POOL_SCAN_RESILVER);
+}
+
+static inline void
+sio2bp(const scan_io_t *sio, blkptr_t *bp, uint64_t vdev_id)
+{
+ bzero(bp, sizeof (*bp));
+ DVA_SET_ASIZE(&bp->blk_dva[0], sio->sio_asize);
+ DVA_SET_VDEV(&bp->blk_dva[0], vdev_id);
+ DVA_SET_OFFSET(&bp->blk_dva[0], sio->sio_offset);
+ bp->blk_prop = sio->sio_blk_prop;
+ bp->blk_phys_birth = sio->sio_phys_birth;
+ bp->blk_birth = sio->sio_birth;
+ bp->blk_fill = 1; /* we always only work with data pointers */
+ bp->blk_cksum = sio->sio_cksum;
+}
+
+static inline void
+bp2sio(const blkptr_t *bp, scan_io_t *sio, int dva_i)
+{
+ /* we discard the vdev id, since we can deduce it from the queue */
+ sio->sio_offset = DVA_GET_OFFSET(&bp->blk_dva[dva_i]);
+ sio->sio_asize = DVA_GET_ASIZE(&bp->blk_dva[dva_i]);
+ sio->sio_blk_prop = bp->blk_prop;
+ sio->sio_phys_birth = bp->blk_phys_birth;
+ sio->sio_birth = bp->blk_birth;
+ sio->sio_cksum = bp->blk_cksum;
+}
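+
+/*
+ * E.g. (sketch): for a queued single-DVA block pointer, the round trip
+ *
+ *	bp2sio(bp, &sio, 0);
+ *	sio2bp(&sio, &bp2, DVA_GET_VDEV(&bp->blk_dva[0]));
+ *
+ * reconstructs the offset, asize, birth txgs, checksum and blk_prop;
+ * only the vdev id, discarded by bp2sio(), must be supplied again (it
+ * is deduced from the owning queue).
+ */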
+
+void
+dsl_scan_global_init(void)
+{
+ /*
+ * This is used in ext_size_compare() to weight segments
+ * based on how sparse they are. This cannot be changed
+ * mid-scan and the tree comparison functions don't currently
+	 * have a mechanism for passing additional context to the
+	 * compare functions. Thus we store this value globally and
+	 * we only allow it to be set at module initialization time.
+ */
+ fill_weight = zfs_scan_fill_weight;
+}
+
+int
+dsl_scan_init(dsl_pool_t *dp, uint64_t txg)
+{
+ int err;
+ dsl_scan_t *scn;
+ spa_t *spa = dp->dp_spa;
+ uint64_t f;
+
+ scn = dp->dp_scan = kmem_zalloc(sizeof (dsl_scan_t), KM_SLEEP);
+ scn->scn_dp = dp;
+
+ /*
+ * It's possible that we're resuming a scan after a reboot so
+ * make sure that the scan_async_destroying flag is initialized
+ * appropriately.
+ */
+ ASSERT(!scn->scn_async_destroying);
+ scn->scn_async_destroying = spa_feature_is_active(dp->dp_spa,
+ SPA_FEATURE_ASYNC_DESTROY);
+
+ /*
+ * Calculate the max number of in-flight bytes for pool-wide
+ * scanning operations (minimum 1MB). Limits for the issuing
+ * phase are done per top-level vdev and are handled separately.
+ */
+ scn->scn_maxinflight_bytes = MAX(zfs_scan_vdev_limit *
+ dsl_scan_count_leaves(spa->spa_root_vdev), 1ULL << 20);
+
+ avl_create(&scn->scn_queue, scan_ds_queue_compare, sizeof (scan_ds_t),
+ offsetof(scan_ds_t, sds_node));
+ avl_create(&scn->scn_prefetch_queue, scan_prefetch_queue_compare,
+ sizeof (scan_prefetch_issue_ctx_t),
+ offsetof(scan_prefetch_issue_ctx_t, spic_avl_node));
+
+ err = zap_lookup(dp->dp_meta_objset, DMU_POOL_DIRECTORY_OBJECT,
+ "scrub_func", sizeof (uint64_t), 1, &f);
+ if (err == 0) {
+ /*
+ * There was an old-style scrub in progress. Restart a
+ * new-style scrub from the beginning.
+ */
+ scn->scn_restart_txg = txg;
+ zfs_dbgmsg("old-style scrub was in progress; "
+ "restarting new-style scrub in txg %llu",
+ (longlong_t)scn->scn_restart_txg);
+
+ /*
+ * Load the queue obj from the old location so that it
+ * can be freed by dsl_scan_done().
+ */
+ (void) zap_lookup(dp->dp_meta_objset, DMU_POOL_DIRECTORY_OBJECT,
+ "scrub_queue", sizeof (uint64_t), 1,
+ &scn->scn_phys.scn_queue_obj);
+ } else {
+ err = zap_lookup(dp->dp_meta_objset, DMU_POOL_DIRECTORY_OBJECT,
+ DMU_POOL_SCAN, sizeof (uint64_t), SCAN_PHYS_NUMINTS,
+ &scn->scn_phys);
+ if (err == ENOENT)
+ return (0);
+ else if (err)
+ return (err);
+
+ /*
+ * We might be restarting after a reboot, so jump the issued
+ * counter to how far we've scanned. We know we're consistent
+ * up to here.
+ */
+ scn->scn_issued_before_pass = scn->scn_phys.scn_examined;
+
+ if (dsl_scan_is_running(scn) &&
+ spa_prev_software_version(dp->dp_spa) < SPA_VERSION_SCAN) {
+ /*
+ * A new-type scrub was in progress on an old
+ * pool, and the pool was accessed by old
+ * software. Restart from the beginning, since
+ * the old software may have changed the pool in
+ * the meantime.
+ */
+ scn->scn_restart_txg = txg;
+ zfs_dbgmsg("new-style scrub was modified "
+ "by old software; restarting in txg %llu",
+ (longlong_t)scn->scn_restart_txg);
+ }
+ }
+
+ bcopy(&scn->scn_phys, &scn->scn_phys_cached, sizeof (scn->scn_phys));
+
+ /* reload the queue into the in-core state */
+ if (scn->scn_phys.scn_queue_obj != 0) {
+ zap_cursor_t zc;
+ zap_attribute_t za;
+
+ for (zap_cursor_init(&zc, dp->dp_meta_objset,
+ scn->scn_phys.scn_queue_obj);
+ zap_cursor_retrieve(&zc, &za) == 0;
+ (void) zap_cursor_advance(&zc)) {
+ scan_ds_queue_insert(scn,
+ zfs_strtonum(za.za_name, NULL),
+ za.za_first_integer);
+ }
+ zap_cursor_fini(&zc);
+ }
+
+ spa_scan_stat_init(spa);
+ return (0);
+}
+
+void
+dsl_scan_fini(dsl_pool_t *dp)
+{
+ if (dp->dp_scan != NULL) {
+ dsl_scan_t *scn = dp->dp_scan;
+
+ if (scn->scn_taskq != NULL)
+ taskq_destroy(scn->scn_taskq);
+ scan_ds_queue_clear(scn);
+ avl_destroy(&scn->scn_queue);
+ avl_destroy(&scn->scn_prefetch_queue);
+
+ kmem_free(dp->dp_scan, sizeof (dsl_scan_t));
+ dp->dp_scan = NULL;
+ }
+}
+
+static boolean_t
+dsl_scan_restarting(dsl_scan_t *scn, dmu_tx_t *tx)
+{
+ return (scn->scn_restart_txg != 0 &&
+ scn->scn_restart_txg <= tx->tx_txg);
+}
+
+boolean_t
+dsl_scan_scrubbing(const dsl_pool_t *dp)
+{
+ dsl_scan_phys_t *scn_phys = &dp->dp_scan->scn_phys;
+
+ return (scn_phys->scn_state == DSS_SCANNING &&
+ scn_phys->scn_func == POOL_SCAN_SCRUB);
+}
+
+boolean_t
+dsl_scan_is_paused_scrub(const dsl_scan_t *scn)
+{
+ return (dsl_scan_scrubbing(scn->scn_dp) &&
+ scn->scn_phys.scn_flags & DSF_SCRUB_PAUSED);
+}
+
+/*
+ * Writes out a persistent dsl_scan_phys_t record to the pool directory.
+ * Because we can be running in the block sorting algorithm, we do not always
+ * want to write out the record, only when it is "safe" to do so. This safety
+ * condition is achieved by making sure that the sorting queues are empty
+ * (scn_bytes_pending == 0). When this condition is not true, the sync'd state
+ * is inconsistent with how much actual scanning progress has been made. The
+ * kind of sync to be performed is specified by the sync_type argument. If the
+ * sync is optional, we only sync if the queues are empty. If the sync is
+ * mandatory, we do a hard ASSERT to make sure that the queues are empty. The
+ * third possible state is a "cached" sync. This is done in response to:
+ * 1) The dataset that was in the last sync'd dsl_scan_phys_t having been
+ * destroyed, so we wouldn't be able to restart scanning from it.
+ * 2) The snapshot that was in the last sync'd dsl_scan_phys_t having been
+ * superseded by a newer snapshot.
+ * 3) The dataset that was in the last sync'd dsl_scan_phys_t having been
+ * swapped with its clone.
+ * In all cases, a cached sync simply rewrites the last record we've written,
+ * just slightly modified. For the modifications that are performed to the
+ * last written dsl_scan_phys_t, see dsl_scan_ds_destroyed,
+ * dsl_scan_ds_snapshotted and dsl_scan_ds_clone_swapped.
+ */
+static void
+dsl_scan_sync_state(dsl_scan_t *scn, dmu_tx_t *tx, state_sync_type_t sync_type)
+{
+ int i;
+ spa_t *spa = scn->scn_dp->dp_spa;
+
+ ASSERT(sync_type != SYNC_MANDATORY || scn->scn_bytes_pending == 0);
+ if (scn->scn_bytes_pending == 0) {
+ for (i = 0; i < spa->spa_root_vdev->vdev_children; i++) {
+ vdev_t *vd = spa->spa_root_vdev->vdev_child[i];
+ dsl_scan_io_queue_t *q = vd->vdev_scan_io_queue;
+
+ if (q == NULL)
+ continue;
+
+ mutex_enter(&vd->vdev_scan_io_queue_lock);
+ ASSERT3P(avl_first(&q->q_sios_by_addr), ==, NULL);
+ ASSERT3P(avl_first(&q->q_exts_by_size), ==, NULL);
+ ASSERT3P(range_tree_first(q->q_exts_by_addr), ==, NULL);
+ mutex_exit(&vd->vdev_scan_io_queue_lock);
+ }
+
+ if (scn->scn_phys.scn_queue_obj != 0)
+ scan_ds_queue_sync(scn, tx);
+ VERIFY0(zap_update(scn->scn_dp->dp_meta_objset,
+ DMU_POOL_DIRECTORY_OBJECT,
+ DMU_POOL_SCAN, sizeof (uint64_t), SCAN_PHYS_NUMINTS,
+ &scn->scn_phys, tx));
+ bcopy(&scn->scn_phys, &scn->scn_phys_cached,
+ sizeof (scn->scn_phys));
+
+ if (scn->scn_checkpointing)
+ zfs_dbgmsg("finish scan checkpoint");
+
+ scn->scn_checkpointing = B_FALSE;
+ scn->scn_last_checkpoint = ddi_get_lbolt();
+ } else if (sync_type == SYNC_CACHED) {
+ VERIFY0(zap_update(scn->scn_dp->dp_meta_objset,
+ DMU_POOL_DIRECTORY_OBJECT,
+ DMU_POOL_SCAN, sizeof (uint64_t), SCAN_PHYS_NUMINTS,
+ &scn->scn_phys_cached, tx));
+ }
+}
+
+/* ARGSUSED */
+static int
+dsl_scan_setup_check(void *arg, dmu_tx_t *tx)
+{
+ dsl_scan_t *scn = dmu_tx_pool(tx)->dp_scan;
+
+ if (dsl_scan_is_running(scn))
+ return (SET_ERROR(EBUSY));
+
+ return (0);
+}
+
+static void
+dsl_scan_setup_sync(void *arg, dmu_tx_t *tx)
+{
+ dsl_scan_t *scn = dmu_tx_pool(tx)->dp_scan;
+ pool_scan_func_t *funcp = arg;
+ dmu_object_type_t ot = 0;
+ dsl_pool_t *dp = scn->scn_dp;
+ spa_t *spa = dp->dp_spa;
+
+ ASSERT(!dsl_scan_is_running(scn));
+ ASSERT(*funcp > POOL_SCAN_NONE && *funcp < POOL_SCAN_FUNCS);
+ bzero(&scn->scn_phys, sizeof (scn->scn_phys));
+ scn->scn_phys.scn_func = *funcp;
+ scn->scn_phys.scn_state = DSS_SCANNING;
+ scn->scn_phys.scn_min_txg = 0;
+ scn->scn_phys.scn_max_txg = tx->tx_txg;
+ scn->scn_phys.scn_ddt_class_max = DDT_CLASSES - 1; /* the entire DDT */
+ scn->scn_phys.scn_start_time = gethrestime_sec();
+ scn->scn_phys.scn_errors = 0;
+ scn->scn_phys.scn_to_examine = spa->spa_root_vdev->vdev_stat.vs_alloc;
+ scn->scn_issued_before_pass = 0;
+ scn->scn_restart_txg = 0;
+ scn->scn_done_txg = 0;
+ scn->scn_last_checkpoint = 0;
+ scn->scn_checkpointing = B_FALSE;
+ spa_scan_stat_init(spa);
+
+ if (DSL_SCAN_IS_SCRUB_RESILVER(scn)) {
+ scn->scn_phys.scn_ddt_class_max = zfs_scrub_ddt_class_max;
+
+ /* rewrite all disk labels */
+ vdev_config_dirty(spa->spa_root_vdev);
+
+ if (vdev_resilver_needed(spa->spa_root_vdev,
+ &scn->scn_phys.scn_min_txg, &scn->scn_phys.scn_max_txg)) {
+ spa_event_notify(spa, NULL, NULL,
+ ESC_ZFS_RESILVER_START);
+ } else {
+ spa_event_notify(spa, NULL, NULL, ESC_ZFS_SCRUB_START);
+ }
+
+ spa->spa_scrub_started = B_TRUE;
+ /*
+ * If this is an incremental scrub, limit the DDT scrub phase
+ * to just the auto-ditto class (for correctness); the rest
+ * of the scrub should go faster using top-down pruning.
+ */
+ if (scn->scn_phys.scn_min_txg > TXG_INITIAL)
+ scn->scn_phys.scn_ddt_class_max = DDT_CLASS_DITTO;
+
+ }
+
+ /* back to the generic stuff */
+
+ if (dp->dp_blkstats == NULL) {
+ dp->dp_blkstats =
+ kmem_alloc(sizeof (zfs_all_blkstats_t), KM_SLEEP);
+ mutex_init(&dp->dp_blkstats->zab_lock, NULL,
+ MUTEX_DEFAULT, NULL);
+ }
+ bzero(&dp->dp_blkstats->zab_type, sizeof (dp->dp_blkstats->zab_type));
+
+ if (spa_version(spa) < SPA_VERSION_DSL_SCRUB)
+ ot = DMU_OT_ZAP_OTHER;
+
+ scn->scn_phys.scn_queue_obj = zap_create(dp->dp_meta_objset,
+ ot ? ot : DMU_OT_SCAN_QUEUE, DMU_OT_NONE, 0, tx);
+
+ bcopy(&scn->scn_phys, &scn->scn_phys_cached, sizeof (scn->scn_phys));
+
+ dsl_scan_sync_state(scn, tx, SYNC_MANDATORY);
+
+ spa_history_log_internal(spa, "scan setup", tx,
+ "func=%u mintxg=%llu maxtxg=%llu",
+ *funcp, scn->scn_phys.scn_min_txg, scn->scn_phys.scn_max_txg);
+}
+
+/*
+ * Called by the ZFS_IOC_POOL_SCAN ioctl to start a scrub or resilver.
+ * Can also be called to resume a paused scrub.
+ */
+int
+dsl_scan(dsl_pool_t *dp, pool_scan_func_t func)
+{
+ spa_t *spa = dp->dp_spa;
+ dsl_scan_t *scn = dp->dp_scan;
+
+ /*
+ * Purge all vdev caches and probe all devices. We do this here
+ * rather than in sync context because this requires a writer lock
+ * on the spa_config lock, which we can't do from sync context. The
+ * spa_scrub_reopen flag indicates that vdev_open() should not
+ * attempt to start another scrub.
+ */
+ spa_vdev_state_enter(spa, SCL_NONE);
+ spa->spa_scrub_reopen = B_TRUE;
+ vdev_reopen(spa->spa_root_vdev);
+ spa->spa_scrub_reopen = B_FALSE;
+ (void) spa_vdev_state_exit(spa, NULL, 0);
+
+ if (func == POOL_SCAN_SCRUB && dsl_scan_is_paused_scrub(scn)) {
+ /* got scrub start cmd, resume paused scrub */
+ int err = dsl_scrub_set_pause_resume(scn->scn_dp,
+ POOL_SCRUB_NORMAL);
+ if (err == 0) {
+ spa_event_notify(spa, NULL, NULL, ESC_ZFS_SCRUB_RESUME);
+ return (ECANCELED);
+ }
+ return (SET_ERROR(err));
+ }
+
+ return (dsl_sync_task(spa_name(spa), dsl_scan_setup_check,
+ dsl_scan_setup_sync, &func, 0, ZFS_SPACE_CHECK_EXTRA_RESERVED));
+}
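+
+/*
+ * E.g. (sketch): the ioctl path starts a scrub with something like
+ *
+ *	error = dsl_scan(spa_get_dsl(spa), POOL_SCAN_SCRUB);
+ *
+ * where ECANCELED indicates that a paused scrub was resumed rather
+ * than a new one started.
+ */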
+
+/* ARGSUSED */
+static void
+dsl_scan_done(dsl_scan_t *scn, boolean_t complete, dmu_tx_t *tx)
+{
+ static const char *old_names[] = {
+ "scrub_bookmark",
+ "scrub_ddt_bookmark",
+ "scrub_ddt_class_max",
+ "scrub_queue",
+ "scrub_min_txg",
+ "scrub_max_txg",
+ "scrub_func",
+ "scrub_errors",
+ NULL
+ };
+
+ dsl_pool_t *dp = scn->scn_dp;
+ spa_t *spa = dp->dp_spa;
+ int i;
+
+ /* Remove any remnants of an old-style scrub. */
+ for (i = 0; old_names[i]; i++) {
+ (void) zap_remove(dp->dp_meta_objset,
+ DMU_POOL_DIRECTORY_OBJECT, old_names[i], tx);
+ }
+
+ if (scn->scn_phys.scn_queue_obj != 0) {
+ VERIFY0(dmu_object_free(dp->dp_meta_objset,
+ scn->scn_phys.scn_queue_obj, tx));
+ scn->scn_phys.scn_queue_obj = 0;
+ }
+ scan_ds_queue_clear(scn);
+
+ scn->scn_phys.scn_flags &= ~DSF_SCRUB_PAUSED;
+
+ /*
+ * If we were "restarted" from a stopped state, don't bother
+ * with anything else.
+ */
+ if (!dsl_scan_is_running(scn)) {
+ ASSERT(!scn->scn_is_sorted);
+ return;
+ }
+
+ if (scn->scn_is_sorted) {
+ scan_io_queues_destroy(scn);
+ scn->scn_is_sorted = B_FALSE;
+
+ if (scn->scn_taskq != NULL) {
+ taskq_destroy(scn->scn_taskq);
+ scn->scn_taskq = NULL;
+ }
+ }
+
+ scn->scn_phys.scn_state = complete ? DSS_FINISHED : DSS_CANCELED;
+
+ if (dsl_scan_restarting(scn, tx))
+ spa_history_log_internal(spa, "scan aborted, restarting", tx,
+ "errors=%llu", spa_get_errlog_size(spa));
+ else if (!complete)
+ spa_history_log_internal(spa, "scan cancelled", tx,
+ "errors=%llu", spa_get_errlog_size(spa));
+ else
+ spa_history_log_internal(spa, "scan done", tx,
+ "errors=%llu", spa_get_errlog_size(spa));
+
+ if (DSL_SCAN_IS_SCRUB_RESILVER(scn)) {
+ spa->spa_scrub_started = B_FALSE;
+ spa->spa_scrub_active = B_FALSE;
+
+ /*
+ * If the scrub/resilver completed, update all DTLs to
+ * reflect this. Whether it succeeded or not, vacate
+ * all temporary scrub DTLs.
+ *
+ * As the scrub does not currently support traversing
+ * data that have been freed but are part of a checkpoint,
+ * we don't mark the scrub as done in the DTLs as faults
+ * may still exist in those vdevs.
+ */
+ if (complete &&
+ !spa_feature_is_active(spa, SPA_FEATURE_POOL_CHECKPOINT)) {
+ vdev_dtl_reassess(spa->spa_root_vdev, tx->tx_txg,
+ scn->scn_phys.scn_max_txg, B_TRUE);
+
+ spa_event_notify(spa, NULL, NULL,
+ scn->scn_phys.scn_min_txg ?
+ ESC_ZFS_RESILVER_FINISH : ESC_ZFS_SCRUB_FINISH);
+ } else {
+ vdev_dtl_reassess(spa->spa_root_vdev, tx->tx_txg,
+ 0, B_TRUE);
+ }
+ spa_errlog_rotate(spa);
+
+ /*
+ * We may have finished replacing a device.
+ * Let the async thread assess this and handle the detach.
+ */
+ spa_async_request(spa, SPA_ASYNC_RESILVER_DONE);
+ }
+
+ scn->scn_phys.scn_end_time = gethrestime_sec();
+
+ ASSERT(!dsl_scan_is_running(scn));
+}
+
+/* ARGSUSED */
+static int
+dsl_scan_cancel_check(void *arg, dmu_tx_t *tx)
+{
+ dsl_scan_t *scn = dmu_tx_pool(tx)->dp_scan;
+
+ if (!dsl_scan_is_running(scn))
+ return (SET_ERROR(ENOENT));
+ return (0);
+}
+
+/* ARGSUSED */
+static void
+dsl_scan_cancel_sync(void *arg, dmu_tx_t *tx)
+{
+ dsl_scan_t *scn = dmu_tx_pool(tx)->dp_scan;
+
+ dsl_scan_done(scn, B_FALSE, tx);
+ dsl_scan_sync_state(scn, tx, SYNC_MANDATORY);
+ spa_event_notify(scn->scn_dp->dp_spa, NULL, NULL, ESC_ZFS_SCRUB_ABORT);
+}
+
+int
+dsl_scan_cancel(dsl_pool_t *dp)
+{
+ return (dsl_sync_task(spa_name(dp->dp_spa), dsl_scan_cancel_check,
+ dsl_scan_cancel_sync, NULL, 3, ZFS_SPACE_CHECK_RESERVED));
+}
+
+static int
+dsl_scrub_pause_resume_check(void *arg, dmu_tx_t *tx)
+{
+ pool_scrub_cmd_t *cmd = arg;
+ dsl_pool_t *dp = dmu_tx_pool(tx);
+ dsl_scan_t *scn = dp->dp_scan;
+
+ if (*cmd == POOL_SCRUB_PAUSE) {
+ /* can't pause a scrub when there is no in-progress scrub */
+ if (!dsl_scan_scrubbing(dp))
+ return (SET_ERROR(ENOENT));
+
+ /* can't pause a paused scrub */
+ if (dsl_scan_is_paused_scrub(scn))
+ return (SET_ERROR(EBUSY));
+ } else if (*cmd != POOL_SCRUB_NORMAL) {
+ return (SET_ERROR(ENOTSUP));
+ }
+
+ return (0);
+}
+
+static void
+dsl_scrub_pause_resume_sync(void *arg, dmu_tx_t *tx)
+{
+ pool_scrub_cmd_t *cmd = arg;
+ dsl_pool_t *dp = dmu_tx_pool(tx);
+ spa_t *spa = dp->dp_spa;
+ dsl_scan_t *scn = dp->dp_scan;
+
+ if (*cmd == POOL_SCRUB_PAUSE) {
+		/* pause the in-progress scrub (validated by the check func) */
+ spa->spa_scan_pass_scrub_pause = gethrestime_sec();
+ scn->scn_phys.scn_flags |= DSF_SCRUB_PAUSED;
+ scn->scn_phys_cached.scn_flags |= DSF_SCRUB_PAUSED;
+ dsl_scan_sync_state(scn, tx, SYNC_CACHED);
+ spa_event_notify(spa, NULL, NULL, ESC_ZFS_SCRUB_PAUSED);
+ } else {
+ ASSERT3U(*cmd, ==, POOL_SCRUB_NORMAL);
+ if (dsl_scan_is_paused_scrub(scn)) {
+ /*
+ * We need to keep track of how much time we spend
+ * paused per pass so that we can adjust the scrub rate
+ * shown in the output of 'zpool status'
+ */
+ spa->spa_scan_pass_scrub_spent_paused +=
+ gethrestime_sec() - spa->spa_scan_pass_scrub_pause;
+ spa->spa_scan_pass_scrub_pause = 0;
+ scn->scn_phys.scn_flags &= ~DSF_SCRUB_PAUSED;
+ scn->scn_phys_cached.scn_flags &= ~DSF_SCRUB_PAUSED;
+ dsl_scan_sync_state(scn, tx, SYNC_CACHED);
+ }
+ }
+}
+
+/*
+ * Set scrub pause/resume state if it makes sense to do so
+ */
+int
+dsl_scrub_set_pause_resume(const dsl_pool_t *dp, pool_scrub_cmd_t cmd)
+{
+ return (dsl_sync_task(spa_name(dp->dp_spa),
+ dsl_scrub_pause_resume_check, dsl_scrub_pause_resume_sync, &cmd, 3,
+ ZFS_SPACE_CHECK_RESERVED));
+}
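+
+/*
+ * E.g. (sketch): pausing and later resuming an active scrub:
+ *
+ *	error = dsl_scrub_set_pause_resume(dp, POOL_SCRUB_PAUSE);
+ *	...
+ *	error = dsl_scrub_set_pause_resume(dp, POOL_SCRUB_NORMAL);
+ *
+ * ENOENT and EBUSY from the check function indicate no in-progress
+ * scrub and an already-paused scrub, respectively.
+ */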
+
+/* start a new scan, or restart an existing one. */
+void
+dsl_resilver_restart(dsl_pool_t *dp, uint64_t txg)
+{
+ if (txg == 0) {
+ dmu_tx_t *tx;
+ tx = dmu_tx_create_dd(dp->dp_mos_dir);
+ VERIFY(0 == dmu_tx_assign(tx, TXG_WAIT));
+
+ txg = dmu_tx_get_txg(tx);
+ dp->dp_scan->scn_restart_txg = txg;
+ dmu_tx_commit(tx);
+ } else {
+ dp->dp_scan->scn_restart_txg = txg;
+ }
+ zfs_dbgmsg("restarting resilver txg=%llu", txg);
+}
+
+void
+dsl_free(dsl_pool_t *dp, uint64_t txg, const blkptr_t *bp)
+{
+ zio_free(dp->dp_spa, txg, bp);
+}
+
+void
+dsl_free_sync(zio_t *pio, dsl_pool_t *dp, uint64_t txg, const blkptr_t *bpp)
+{
+ ASSERT(dsl_pool_sync_context(dp));
+ zio_nowait(zio_free_sync(pio, dp->dp_spa, txg, bpp, BP_GET_PSIZE(bpp),
+ pio->io_flags));
+}
+
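+/*
+ * AVL comparator for the in-memory dataset work queue; entries are
+ * ordered by dataset object number.
+ */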
+static int
+scan_ds_queue_compare(const void *a, const void *b)
+{
+ const scan_ds_t *sds_a = a, *sds_b = b;
+
+ if (sds_a->sds_dsobj < sds_b->sds_dsobj)
+ return (-1);
+ if (sds_a->sds_dsobj == sds_b->sds_dsobj)
+ return (0);
+ return (1);
+}
+
+static void
+scan_ds_queue_clear(dsl_scan_t *scn)
+{
+ void *cookie = NULL;
+ scan_ds_t *sds;
+ while ((sds = avl_destroy_nodes(&scn->scn_queue, &cookie)) != NULL) {
+ kmem_free(sds, sizeof (*sds));
+ }
+}
+
+static boolean_t
+scan_ds_queue_contains(dsl_scan_t *scn, uint64_t dsobj, uint64_t *txg)
+{
+ scan_ds_t srch, *sds;
+
+ srch.sds_dsobj = dsobj;
+ sds = avl_find(&scn->scn_queue, &srch, NULL);
+ if (sds != NULL && txg != NULL)
+ *txg = sds->sds_txg;
+ return (sds != NULL);
+}
+
+static void
+scan_ds_queue_insert(dsl_scan_t *scn, uint64_t dsobj, uint64_t txg)
+{
+ scan_ds_t *sds;
+ avl_index_t where;
+
+ sds = kmem_zalloc(sizeof (*sds), KM_SLEEP);
+ sds->sds_dsobj = dsobj;
+ sds->sds_txg = txg;
+
+ VERIFY3P(avl_find(&scn->scn_queue, sds, &where), ==, NULL);
+ avl_insert(&scn->scn_queue, sds, where);
+}
+
+static void
+scan_ds_queue_remove(dsl_scan_t *scn, uint64_t dsobj)
+{
+ scan_ds_t srch, *sds;
+
+ srch.sds_dsobj = dsobj;
+
+ sds = avl_find(&scn->scn_queue, &srch, NULL);
+ VERIFY(sds != NULL);
+ avl_remove(&scn->scn_queue, sds);
+ kmem_free(sds, sizeof (*sds));
+}
+
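+/*
+ * Writes the in-memory dataset work queue out to the on-disk scan queue
+ * object. The old ZAP is freed and a new one created rather than updated
+ * in place, since the queue is only synced at checkpoints.
+ */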
+static void
+scan_ds_queue_sync(dsl_scan_t *scn, dmu_tx_t *tx)
+{
+ dsl_pool_t *dp = scn->scn_dp;
+ spa_t *spa = dp->dp_spa;
+ dmu_object_type_t ot = (spa_version(spa) >= SPA_VERSION_DSL_SCRUB) ?
+ DMU_OT_SCAN_QUEUE : DMU_OT_ZAP_OTHER;
+
+ ASSERT0(scn->scn_bytes_pending);
+ ASSERT(scn->scn_phys.scn_queue_obj != 0);
+
+ VERIFY0(dmu_object_free(dp->dp_meta_objset,
+ scn->scn_phys.scn_queue_obj, tx));
+ scn->scn_phys.scn_queue_obj = zap_create(dp->dp_meta_objset, ot,
+ DMU_OT_NONE, 0, tx);
+ for (scan_ds_t *sds = avl_first(&scn->scn_queue);
+ sds != NULL; sds = AVL_NEXT(&scn->scn_queue, sds)) {
+ VERIFY0(zap_add_int_key(dp->dp_meta_objset,
+ scn->scn_phys.scn_queue_obj, sds->sds_dsobj,
+ sds->sds_txg, tx));
+ }
+}
+
+/*
+ * Computes the memory limit state that we're currently in. A sorted scan
+ * needs quite a bit of memory to hold the sorting queue, so we need to
+ * reasonably constrain the size so it doesn't impact overall system
+ * performance. We compute two limits:
+ * 1) Hard memory limit: if the amount of memory used by the sorting
+ * queues on a pool gets above this value, we stop the metadata
+ * scanning portion and start issuing the queued up and sorted
+ * I/Os to reduce memory usage.
+ * This limit is calculated as a fraction of physmem (by default 5%).
+ * We constrain the lower bound of the hard limit to an absolute
+ * minimum of zfs_scan_mem_lim_min (default: 16 MiB). We also constrain
+ * the upper bound to 5% of the total pool size - no chance we'll
+ * ever need that much memory, but just to keep the value in check.
+ * 2) Soft memory limit: once we hit the hard memory limit, we start
+ * issuing I/O to reduce queue memory usage, but we don't want to
+ * completely empty out the queues, since we might be able to find I/Os
+ * that will fill in the gaps of our non-sequential IOs at some point
+ * in the future. So we stop the issuing of I/Os once the amount of
+ * memory used drops below the soft limit (at which point we stop issuing
+ * I/O and start scanning metadata again).
+ *
+ * This limit is calculated by subtracting a fraction of the hard
+ * limit from the hard limit. By default this fraction is 5%, so
+ * the soft limit is 95% of the hard limit. We cap the size of the
+ * difference between the hard and soft limits at an absolute
+ * maximum of zfs_scan_mem_lim_soft_max (default: 128 MiB) - this is
+ * sufficient to not cause too frequent switching between the
+ * metadata scan and I/O issue (even at 2k recordsize, 128 MiB's
+ * worth of queues is about 1.2 GiB of on-pool data, so scanning
+ * that should take at least a decent fraction of a second).
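+ *
+ * As a worked example, assume the default tunables described above and
+ * a hypothetical system with 8 GiB of physical memory:
+ * hard limit = MAX(8 GiB / 20, 16 MiB) = ~410 MiB (further capped at
+ * 5% of the pool's allocated space), and
+ * soft limit = 410 MiB - MIN(410 MiB / 20, 128 MiB) = ~389 MiB.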
+ */
+static boolean_t
+dsl_scan_should_clear(dsl_scan_t *scn)
+{
+ spa_t *spa = scn->scn_dp->dp_spa;
+ vdev_t *rvd = scn->scn_dp->dp_spa->spa_root_vdev;
+ uint64_t alloc, mlim_hard, mlim_soft, mused;
+
+ alloc = metaslab_class_get_alloc(spa_normal_class(spa));
+ alloc += metaslab_class_get_alloc(spa_special_class(spa));
+ alloc += metaslab_class_get_alloc(spa_dedup_class(spa));
+
+ mlim_hard = MAX((physmem / zfs_scan_mem_lim_fact) * PAGESIZE,
+ zfs_scan_mem_lim_min);
+ mlim_hard = MIN(mlim_hard, alloc / 20);
+ mlim_soft = mlim_hard - MIN(mlim_hard / zfs_scan_mem_lim_soft_fact,
+ zfs_scan_mem_lim_soft_max);
+ mused = 0;
+ for (uint64_t i = 0; i < rvd->vdev_children; i++) {
+ vdev_t *tvd = rvd->vdev_child[i];
+ dsl_scan_io_queue_t *queue;
+
+ mutex_enter(&tvd->vdev_scan_io_queue_lock);
+ queue = tvd->vdev_scan_io_queue;
+ if (queue != NULL) {
+ /* #extents in exts_by_size = # in exts_by_addr */
+ mused += avl_numnodes(&queue->q_exts_by_size) *
+ sizeof (range_seg_t) +
+ avl_numnodes(&queue->q_sios_by_addr) *
+ sizeof (scan_io_t);
+ }
+ mutex_exit(&tvd->vdev_scan_io_queue_lock);
+ }
+
+ dprintf("current scan memory usage: %llu bytes\n", (longlong_t)mused);
+
+ if (mused == 0)
+ ASSERT0(scn->scn_bytes_pending);
+
+ /*
+ * If we are above our hard limit, we need to clear out memory.
+ * If we are below our soft limit, we need to accumulate sequential IOs.
+ * Otherwise, we should keep doing whatever we are currently doing.
+ */
+ if (mused >= mlim_hard)
+ return (B_TRUE);
+ else if (mused < mlim_soft)
+ return (B_FALSE);
+ else
+ return (scn->scn_clearing);
+}
+
+static boolean_t
+dsl_scan_check_suspend(dsl_scan_t *scn, const zbookmark_phys_t *zb)
+{
+ /* we never skip user/group accounting objects */
+ if (zb && (int64_t)zb->zb_object < 0)
+ return (B_FALSE);
+
+ if (scn->scn_suspending)
+ return (B_TRUE); /* we're already suspending */
+
+ if (!ZB_IS_ZERO(&scn->scn_phys.scn_bookmark))
+ return (B_FALSE); /* we're resuming */
+
+ /* We only know how to resume from level-0 blocks. */
+ if (zb && zb->zb_level != 0)
+ return (B_FALSE);
+
+ /*
+ * We suspend if:
+	 *  - we have scanned for at least the minimum time (default 1 sec
+	 *    for scrub, 3 sec for resilver), and either we have sufficient
+	 *    dirty data that we are starting to write more quickly
+	 *    (default 30%), someone is explicitly waiting for this txg
+	 *    to complete, or we have used up all of the time in the txg
+	 *    timeout (default 5 sec).
+ * or
+ * - the spa is shutting down because this pool is being exported
+ * or the machine is rebooting.
+ * or
+ * - the scan queue has reached its memory use limit
+ */
+ uint64_t curr_time_ns = gethrtime();
+ uint64_t scan_time_ns = curr_time_ns - scn->scn_sync_start_time;
+ uint64_t sync_time_ns = curr_time_ns -
+ scn->scn_dp->dp_spa->spa_sync_starttime;
+
+ int dirty_pct = scn->scn_dp->dp_dirty_total * 100 / zfs_dirty_data_max;
+ int mintime = (scn->scn_phys.scn_func == POOL_SCAN_RESILVER) ?
+ zfs_resilver_min_time_ms : zfs_scrub_min_time_ms;
+
+ if ((NSEC2MSEC(scan_time_ns) > mintime &&
+ (dirty_pct >= zfs_vdev_async_write_active_min_dirty_percent ||
+ txg_sync_waiting(scn->scn_dp) ||
+ NSEC2SEC(sync_time_ns) >= zfs_txg_timeout)) ||
+ spa_shutting_down(scn->scn_dp->dp_spa) ||
+ (zfs_scan_strict_mem_lim && dsl_scan_should_clear(scn))) {
+ if (zb) {
+ dprintf("suspending at bookmark %llx/%llx/%llx/%llx\n",
+ (longlong_t)zb->zb_objset,
+ (longlong_t)zb->zb_object,
+ (longlong_t)zb->zb_level,
+ (longlong_t)zb->zb_blkid);
+ scn->scn_phys.scn_bookmark = *zb;
+ } else {
+ dsl_scan_phys_t *scnp = &scn->scn_phys;
+
+ dprintf("suspending at at DDT bookmark "
+ "%llx/%llx/%llx/%llx\n",
+ (longlong_t)scnp->scn_ddt_bookmark.ddb_class,
+ (longlong_t)scnp->scn_ddt_bookmark.ddb_type,
+ (longlong_t)scnp->scn_ddt_bookmark.ddb_checksum,
+ (longlong_t)scnp->scn_ddt_bookmark.ddb_cursor);
+ }
+ scn->scn_suspending = B_TRUE;
+ return (B_TRUE);
+ }
+ return (B_FALSE);
+}
+
+typedef struct zil_scan_arg {
+ dsl_pool_t *zsa_dp;
+ zil_header_t *zsa_zh;
+} zil_scan_arg_t;
+
+/* ARGSUSED */
+static int
+dsl_scan_zil_block(zilog_t *zilog, blkptr_t *bp, void *arg, uint64_t claim_txg)
+{
+ zil_scan_arg_t *zsa = arg;
+ dsl_pool_t *dp = zsa->zsa_dp;
+ dsl_scan_t *scn = dp->dp_scan;
+ zil_header_t *zh = zsa->zsa_zh;
+ zbookmark_phys_t zb;
+
+ if (BP_IS_HOLE(bp) || bp->blk_birth <= scn->scn_phys.scn_cur_min_txg)
+ return (0);
+
+ /*
+ * One block ("stubby") can be allocated a long time ago; we
+ * want to visit that one because it has been allocated
+ * (on-disk) even if it hasn't been claimed (even though for
+ * scrub there's nothing to do to it).
+ */
+ if (claim_txg == 0 && bp->blk_birth >= spa_min_claim_txg(dp->dp_spa))
+ return (0);
+
+ SET_BOOKMARK(&zb, zh->zh_log.blk_cksum.zc_word[ZIL_ZC_OBJSET],
+ ZB_ZIL_OBJECT, ZB_ZIL_LEVEL, bp->blk_cksum.zc_word[ZIL_ZC_SEQ]);
+
+ VERIFY(0 == scan_funcs[scn->scn_phys.scn_func](dp, bp, &zb));
+ return (0);
+}
+
+/* ARGSUSED */
+static int
+dsl_scan_zil_record(zilog_t *zilog, lr_t *lrc, void *arg, uint64_t claim_txg)
+{
+ if (lrc->lrc_txtype == TX_WRITE) {
+ zil_scan_arg_t *zsa = arg;
+ dsl_pool_t *dp = zsa->zsa_dp;
+ dsl_scan_t *scn = dp->dp_scan;
+ zil_header_t *zh = zsa->zsa_zh;
+ lr_write_t *lr = (lr_write_t *)lrc;
+ blkptr_t *bp = &lr->lr_blkptr;
+ zbookmark_phys_t zb;
+
+ if (BP_IS_HOLE(bp) ||
+ bp->blk_birth <= scn->scn_phys.scn_cur_min_txg)
+ return (0);
+
+ /*
+ * birth can be < claim_txg if this record's txg is
+ * already txg sync'ed (but this log block contains
+ * other records that are not synced)
+ */
+ if (claim_txg == 0 || bp->blk_birth < claim_txg)
+ return (0);
+
+ SET_BOOKMARK(&zb, zh->zh_log.blk_cksum.zc_word[ZIL_ZC_OBJSET],
+ lr->lr_foid, ZB_ZIL_LEVEL,
+ lr->lr_offset / BP_GET_LSIZE(bp));
+
+ VERIFY(0 == scan_funcs[scn->scn_phys.scn_func](dp, bp, &zb));
+ }
+ return (0);
+}
+
+static void
+dsl_scan_zil(dsl_pool_t *dp, zil_header_t *zh)
+{
+ uint64_t claim_txg = zh->zh_claim_txg;
+ zil_scan_arg_t zsa = { dp, zh };
+ zilog_t *zilog;
+
+ ASSERT(spa_writeable(dp->dp_spa));
+
+ /*
+ * We only want to visit blocks that have been claimed
+ * but not yet replayed.
+ */
+ if (claim_txg == 0)
+ return;
+
+ zilog = zil_alloc(dp->dp_meta_objset, zh);
+
+ (void) zil_parse(zilog, dsl_scan_zil_block, dsl_scan_zil_record, &zsa,
+ claim_txg);
+
+ zil_free(zilog);
+}
+
+/*
+ * We compare scan_prefetch_issue_ctx_t's based on their bookmarks. The idea
+ * here is to sort the AVL tree by the order each block will be needed.
+ */
+static int
+scan_prefetch_queue_compare(const void *a, const void *b)
+{
+ const scan_prefetch_issue_ctx_t *spic_a = a, *spic_b = b;
+ const scan_prefetch_ctx_t *spc_a = spic_a->spic_spc;
+ const scan_prefetch_ctx_t *spc_b = spic_b->spic_spc;
+
+ return (zbookmark_compare(spc_a->spc_datablkszsec,
+ spc_a->spc_indblkshift, spc_b->spc_datablkszsec,
+ spc_b->spc_indblkshift, &spic_a->spic_zb, &spic_b->spic_zb));
+}
+
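+/*
+ * scan_prefetch_ctx_t's are reference counted: the scan and each queued
+ * prefetch hold a reference, and the context is freed when the last
+ * reference is released.
+ */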
+static void
+scan_prefetch_ctx_rele(scan_prefetch_ctx_t *spc, void *tag)
+{
+ if (zfs_refcount_remove(&spc->spc_refcnt, tag) == 0) {
+ zfs_refcount_destroy(&spc->spc_refcnt);
+ kmem_free(spc, sizeof (scan_prefetch_ctx_t));
+ }
+}
+
+static scan_prefetch_ctx_t *
+scan_prefetch_ctx_create(dsl_scan_t *scn, dnode_phys_t *dnp, void *tag)
+{
+ scan_prefetch_ctx_t *spc;
+
+ spc = kmem_alloc(sizeof (scan_prefetch_ctx_t), KM_SLEEP);
+ zfs_refcount_create(&spc->spc_refcnt);
+ zfs_refcount_add(&spc->spc_refcnt, tag);
+ spc->spc_scn = scn;
+ if (dnp != NULL) {
+ spc->spc_datablkszsec = dnp->dn_datablkszsec;
+ spc->spc_indblkshift = dnp->dn_indblkshift;
+ spc->spc_root = B_FALSE;
+ } else {
+ spc->spc_datablkszsec = 0;
+ spc->spc_indblkshift = 0;
+ spc->spc_root = B_TRUE;
+ }
+
+ return (spc);
+}
+
+static void
+scan_prefetch_ctx_add_ref(scan_prefetch_ctx_t *spc, void *tag)
+{
+ zfs_refcount_add(&spc->spc_refcnt, tag);
+}
+
+static boolean_t
+dsl_scan_check_prefetch_resume(scan_prefetch_ctx_t *spc,
+ const zbookmark_phys_t *zb)
+{
+ zbookmark_phys_t *last_zb = &spc->spc_scn->scn_prefetch_bookmark;
+ dnode_phys_t tmp_dnp;
+ dnode_phys_t *dnp = (spc->spc_root) ? NULL : &tmp_dnp;
+
+ if (zb->zb_objset != last_zb->zb_objset)
+ return (B_TRUE);
+ if ((int64_t)zb->zb_object < 0)
+ return (B_FALSE);
+
+ tmp_dnp.dn_datablkszsec = spc->spc_datablkszsec;
+ tmp_dnp.dn_indblkshift = spc->spc_indblkshift;
+
+ if (zbookmark_subtree_completed(dnp, zb, last_zb))
+ return (B_TRUE);
+
+ return (B_FALSE);
+}
+
+static void
+dsl_scan_prefetch(scan_prefetch_ctx_t *spc, blkptr_t *bp, zbookmark_phys_t *zb)
+{
+ avl_index_t idx;
+ dsl_scan_t *scn = spc->spc_scn;
+ spa_t *spa = scn->scn_dp->dp_spa;
+ scan_prefetch_issue_ctx_t *spic;
+
+ if (zfs_no_scrub_prefetch)
+ return;
+
+ if (BP_IS_HOLE(bp) || bp->blk_birth <= scn->scn_phys.scn_cur_min_txg ||
+ (BP_GET_LEVEL(bp) == 0 && BP_GET_TYPE(bp) != DMU_OT_DNODE &&
+ BP_GET_TYPE(bp) != DMU_OT_OBJSET))
+ return;
+
+ if (dsl_scan_check_prefetch_resume(spc, zb))
+ return;
+
+ scan_prefetch_ctx_add_ref(spc, scn);
+ spic = kmem_alloc(sizeof (scan_prefetch_issue_ctx_t), KM_SLEEP);
+ spic->spic_spc = spc;
+ spic->spic_bp = *bp;
+ spic->spic_zb = *zb;
+
+ /*
+ * Add the IO to the queue of blocks to prefetch. This allows us to
+ * prioritize blocks that we will need first for the main traversal
+ * thread.
+ */
+ mutex_enter(&spa->spa_scrub_lock);
+ if (avl_find(&scn->scn_prefetch_queue, spic, &idx) != NULL) {
+ /* this block is already queued for prefetch */
+ kmem_free(spic, sizeof (scan_prefetch_issue_ctx_t));
+ scan_prefetch_ctx_rele(spc, scn);
+ mutex_exit(&spa->spa_scrub_lock);
+ return;
+ }
+
+ avl_insert(&scn->scn_prefetch_queue, spic, idx);
+ cv_broadcast(&spa->spa_scrub_io_cv);
+ mutex_exit(&spa->spa_scrub_lock);
+}
+
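+/*
+ * Queues prefetches for every block pointer in a dnode, including the
+ * spill block, if any.
+ */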
+static void
+dsl_scan_prefetch_dnode(dsl_scan_t *scn, dnode_phys_t *dnp,
+ uint64_t objset, uint64_t object)
+{
+ int i;
+ zbookmark_phys_t zb;
+ scan_prefetch_ctx_t *spc;
+
+ if (dnp->dn_nblkptr == 0 && !(dnp->dn_flags & DNODE_FLAG_SPILL_BLKPTR))
+ return;
+
+ SET_BOOKMARK(&zb, objset, object, 0, 0);
+
+ spc = scan_prefetch_ctx_create(scn, dnp, FTAG);
+
+ for (i = 0; i < dnp->dn_nblkptr; i++) {
+ zb.zb_level = BP_GET_LEVEL(&dnp->dn_blkptr[i]);
+ zb.zb_blkid = i;
+ dsl_scan_prefetch(spc, &dnp->dn_blkptr[i], &zb);
+ }
+
+ if (dnp->dn_flags & DNODE_FLAG_SPILL_BLKPTR) {
+ zb.zb_level = 0;
+ zb.zb_blkid = DMU_SPILL_BLKID;
+ dsl_scan_prefetch(spc, &dnp->dn_spill, &zb);
+ }
+
+ scan_prefetch_ctx_rele(spc, FTAG);
+}
+
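+/*
+ * arc_read() completion callback for prefetch I/Os: accounts the
+ * completed bytes against spa_scrub_inflight and, if the buffer read is
+ * an indirect block, dnode block or objset, queues prefetches for the
+ * blocks it references.
+ */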
+void
+dsl_scan_prefetch_cb(zio_t *zio, const zbookmark_phys_t *zb, const blkptr_t *bp,
+ arc_buf_t *buf, void *private)
+{
+ scan_prefetch_ctx_t *spc = private;
+ dsl_scan_t *scn = spc->spc_scn;
+ spa_t *spa = scn->scn_dp->dp_spa;
+
+	/* broadcast that the IO has completed for rate limiting purposes */
+ mutex_enter(&spa->spa_scrub_lock);
+ ASSERT3U(spa->spa_scrub_inflight, >=, BP_GET_PSIZE(bp));
+ spa->spa_scrub_inflight -= BP_GET_PSIZE(bp);
+ cv_broadcast(&spa->spa_scrub_io_cv);
+ mutex_exit(&spa->spa_scrub_lock);
+
+ /* if there was an error or we are done prefetching, just cleanup */
+ if (buf == NULL || scn->scn_suspending)
+ goto out;
+
+ if (BP_GET_LEVEL(bp) > 0) {
+ int i;
+ blkptr_t *cbp;
+ int epb = BP_GET_LSIZE(bp) >> SPA_BLKPTRSHIFT;
+ zbookmark_phys_t czb;
+
+ for (i = 0, cbp = buf->b_data; i < epb; i++, cbp++) {
+ SET_BOOKMARK(&czb, zb->zb_objset, zb->zb_object,
+ zb->zb_level - 1, zb->zb_blkid * epb + i);
+ dsl_scan_prefetch(spc, cbp, &czb);
+ }
+ } else if (BP_GET_TYPE(bp) == DMU_OT_DNODE) {
+		dnode_phys_t *cdnp;
+ int i;
+ int epb = BP_GET_LSIZE(bp) >> DNODE_SHIFT;
+
+ for (i = 0, cdnp = buf->b_data; i < epb;
+ i += cdnp->dn_extra_slots + 1,
+ cdnp += cdnp->dn_extra_slots + 1) {
+ dsl_scan_prefetch_dnode(scn, cdnp,
+ zb->zb_objset, zb->zb_blkid * epb + i);
+ }
+ } else if (BP_GET_TYPE(bp) == DMU_OT_OBJSET) {
+ objset_phys_t *osp = buf->b_data;
+
+ dsl_scan_prefetch_dnode(scn, &osp->os_meta_dnode,
+ zb->zb_objset, DMU_META_DNODE_OBJECT);
+
+ if (OBJSET_BUF_HAS_USERUSED(buf)) {
+ dsl_scan_prefetch_dnode(scn,
+ &osp->os_groupused_dnode, zb->zb_objset,
+ DMU_GROUPUSED_OBJECT);
+ dsl_scan_prefetch_dnode(scn,
+ &osp->os_userused_dnode, zb->zb_objset,
+ DMU_USERUSED_OBJECT);
+ }
+ }
+
+out:
+ if (buf != NULL)
+ arc_buf_destroy(buf, private);
+ scan_prefetch_ctx_rele(spc, scn);
+}
+
+static void
+dsl_scan_prefetch_thread(void *arg)
+{
+ dsl_scan_t *scn = arg;
+ spa_t *spa = scn->scn_dp->dp_spa;
+ vdev_t *rvd = spa->spa_root_vdev;
+ scan_prefetch_issue_ctx_t *spic;
+
+ /* loop until we are told to stop */
+ while (!scn->scn_prefetch_stop) {
+ arc_flags_t flags = ARC_FLAG_NOWAIT |
+ ARC_FLAG_PRESCIENT_PREFETCH | ARC_FLAG_PREFETCH;
+ int zio_flags = ZIO_FLAG_CANFAIL | ZIO_FLAG_SCAN_THREAD;
+
+ mutex_enter(&spa->spa_scrub_lock);
+
+ /*
+ * Wait until we have an IO to issue and are not above our
+ * maximum in flight limit.
+ */
+ while (!scn->scn_prefetch_stop &&
+ (avl_numnodes(&scn->scn_prefetch_queue) == 0 ||
+ spa->spa_scrub_inflight >= scn->scn_maxinflight_bytes)) {
+ cv_wait(&spa->spa_scrub_io_cv, &spa->spa_scrub_lock);
+ }
+
+ /* recheck if we should stop since we waited for the cv */
+ if (scn->scn_prefetch_stop) {
+ mutex_exit(&spa->spa_scrub_lock);
+ break;
+ }
+
+ /* remove the prefetch IO from the tree */
+ spic = avl_first(&scn->scn_prefetch_queue);
+ spa->spa_scrub_inflight += BP_GET_PSIZE(&spic->spic_bp);
+ avl_remove(&scn->scn_prefetch_queue, spic);
+
+ mutex_exit(&spa->spa_scrub_lock);
+
+ /* issue the prefetch asynchronously */
+ (void) arc_read(scn->scn_zio_root, scn->scn_dp->dp_spa,
+ &spic->spic_bp, dsl_scan_prefetch_cb, spic->spic_spc,
+ ZIO_PRIORITY_SCRUB, zio_flags, &flags, &spic->spic_zb);
+
+ kmem_free(spic, sizeof (scan_prefetch_issue_ctx_t));
+ }
+
+ ASSERT(scn->scn_prefetch_stop);
+
+ /* free any prefetches we didn't get to complete */
+ mutex_enter(&spa->spa_scrub_lock);
+ while ((spic = avl_first(&scn->scn_prefetch_queue)) != NULL) {
+ avl_remove(&scn->scn_prefetch_queue, spic);
+ scan_prefetch_ctx_rele(spic->spic_spc, scn);
+ kmem_free(spic, sizeof (scan_prefetch_issue_ctx_t));
+ }
+ ASSERT0(avl_numnodes(&scn->scn_prefetch_queue));
+ mutex_exit(&spa->spa_scrub_lock);
+}
+
+static boolean_t
+dsl_scan_check_resume(dsl_scan_t *scn, const dnode_phys_t *dnp,
+ const zbookmark_phys_t *zb)
+{
+ /*
+ * We never skip over user/group accounting objects (obj<0)
+ */
+ if (!ZB_IS_ZERO(&scn->scn_phys.scn_bookmark) &&
+ (int64_t)zb->zb_object >= 0) {
+ /*
+ * If we already visited this bp & everything below (in
+ * a prior txg sync), don't bother doing it again.
+ */
+ if (zbookmark_subtree_completed(dnp, zb,
+ &scn->scn_phys.scn_bookmark))
+ return (B_TRUE);
+
+ /*
+ * If we found the block we're trying to resume from, or
+ * we went past it to a different object, zero it out to
+ * indicate that it's OK to start checking for suspending
+ * again.
+ */
+ if (bcmp(zb, &scn->scn_phys.scn_bookmark, sizeof (*zb)) == 0 ||
+ zb->zb_object > scn->scn_phys.scn_bookmark.zb_object) {
+ dprintf("resuming at %llx/%llx/%llx/%llx\n",
+ (longlong_t)zb->zb_objset,
+ (longlong_t)zb->zb_object,
+ (longlong_t)zb->zb_level,
+ (longlong_t)zb->zb_blkid);
+ bzero(&scn->scn_phys.scn_bookmark, sizeof (*zb));
+ }
+ }
+ return (B_FALSE);
+}
+
+static void dsl_scan_visitbp(blkptr_t *bp, const zbookmark_phys_t *zb,
+ dnode_phys_t *dnp, dsl_dataset_t *ds, dsl_scan_t *scn,
+ dmu_objset_type_t ostype, dmu_tx_t *tx);
+static void dsl_scan_visitdnode(
+ dsl_scan_t *, dsl_dataset_t *ds, dmu_objset_type_t ostype,
+ dnode_phys_t *dnp, uint64_t object, dmu_tx_t *tx);
+
+/*
+ * Return nonzero on i/o error.
+ */
+static int
+dsl_scan_recurse(dsl_scan_t *scn, dsl_dataset_t *ds, dmu_objset_type_t ostype,
+ dnode_phys_t *dnp, const blkptr_t *bp,
+ const zbookmark_phys_t *zb, dmu_tx_t *tx)
+{
+ dsl_pool_t *dp = scn->scn_dp;
+ int zio_flags = ZIO_FLAG_CANFAIL | ZIO_FLAG_SCAN_THREAD;
+ int err;
+
+ if (BP_GET_LEVEL(bp) > 0) {
+ arc_flags_t flags = ARC_FLAG_WAIT;
+ int i;
+ blkptr_t *cbp;
+ int epb = BP_GET_LSIZE(bp) >> SPA_BLKPTRSHIFT;
+ arc_buf_t *buf;
+
+ err = arc_read(NULL, dp->dp_spa, bp, arc_getbuf_func, &buf,
+ ZIO_PRIORITY_SCRUB, zio_flags, &flags, zb);
+ if (err) {
+ scn->scn_phys.scn_errors++;
+ return (err);
+ }
+ for (i = 0, cbp = buf->b_data; i < epb; i++, cbp++) {
+ zbookmark_phys_t czb;
+
+ SET_BOOKMARK(&czb, zb->zb_objset, zb->zb_object,
+ zb->zb_level - 1,
+ zb->zb_blkid * epb + i);
+ dsl_scan_visitbp(cbp, &czb, dnp,
+ ds, scn, ostype, tx);
+ }
+ arc_buf_destroy(buf, &buf);
+ } else if (BP_GET_TYPE(bp) == DMU_OT_DNODE) {
+ arc_flags_t flags = ARC_FLAG_WAIT;
+ dnode_phys_t *cdnp;
+ int i;
+ int epb = BP_GET_LSIZE(bp) >> DNODE_SHIFT;
+ arc_buf_t *buf;
+
+ err = arc_read(NULL, dp->dp_spa, bp, arc_getbuf_func, &buf,
+ ZIO_PRIORITY_SCRUB, zio_flags, &flags, zb);
+ if (err) {
+ scn->scn_phys.scn_errors++;
+ return (err);
+ }
+ for (i = 0, cdnp = buf->b_data; i < epb;
+ i += cdnp->dn_extra_slots + 1,
+ cdnp += cdnp->dn_extra_slots + 1) {
+ dsl_scan_visitdnode(scn, ds, ostype,
+ cdnp, zb->zb_blkid * epb + i, tx);
+ }
+
+ arc_buf_destroy(buf, &buf);
+ } else if (BP_GET_TYPE(bp) == DMU_OT_OBJSET) {
+ arc_flags_t flags = ARC_FLAG_WAIT;
+ objset_phys_t *osp;
+ arc_buf_t *buf;
+
+ err = arc_read(NULL, dp->dp_spa, bp, arc_getbuf_func, &buf,
+ ZIO_PRIORITY_SCRUB, zio_flags, &flags, zb);
+ if (err) {
+ scn->scn_phys.scn_errors++;
+ return (err);
+ }
+
+ osp = buf->b_data;
+
+ dsl_scan_visitdnode(scn, ds, osp->os_type,
+ &osp->os_meta_dnode, DMU_META_DNODE_OBJECT, tx);
+
+ if (OBJSET_BUF_HAS_USERUSED(buf)) {
+ /*
+ * We also always visit user/group accounting
+ * objects, and never skip them, even if we are
+ * suspending. This is necessary so that the space
+ * deltas from this txg get integrated.
+ */
+ dsl_scan_visitdnode(scn, ds, osp->os_type,
+ &osp->os_groupused_dnode,
+ DMU_GROUPUSED_OBJECT, tx);
+ dsl_scan_visitdnode(scn, ds, osp->os_type,
+ &osp->os_userused_dnode,
+ DMU_USERUSED_OBJECT, tx);
+ }
+ arc_buf_destroy(buf, &buf);
+ }
+
+ return (0);
+}
+
+static void
+dsl_scan_visitdnode(dsl_scan_t *scn, dsl_dataset_t *ds,
+ dmu_objset_type_t ostype, dnode_phys_t *dnp,
+ uint64_t object, dmu_tx_t *tx)
+{
+ int j;
+
+ for (j = 0; j < dnp->dn_nblkptr; j++) {
+ zbookmark_phys_t czb;
+
+ SET_BOOKMARK(&czb, ds ? ds->ds_object : 0, object,
+ dnp->dn_nlevels - 1, j);
+ dsl_scan_visitbp(&dnp->dn_blkptr[j],
+ &czb, dnp, ds, scn, ostype, tx);
+ }
+
+ if (dnp->dn_flags & DNODE_FLAG_SPILL_BLKPTR) {
+ zbookmark_phys_t czb;
+ SET_BOOKMARK(&czb, ds ? ds->ds_object : 0, object,
+ 0, DMU_SPILL_BLKID);
+ dsl_scan_visitbp(DN_SPILL_BLKPTR(dnp),
+ &czb, dnp, ds, scn, ostype, tx);
+ }
+}
+
+/*
+ * The arguments are in this order because mdb can only print the
+ * first 5; we want them to be useful.
+ */
+static void
+dsl_scan_visitbp(blkptr_t *bp, const zbookmark_phys_t *zb,
+ dnode_phys_t *dnp, dsl_dataset_t *ds, dsl_scan_t *scn,
+ dmu_objset_type_t ostype, dmu_tx_t *tx)
+{
+ dsl_pool_t *dp = scn->scn_dp;
+ blkptr_t *bp_toread = NULL;
+
+ if (dsl_scan_check_suspend(scn, zb))
+ return;
+
+ if (dsl_scan_check_resume(scn, dnp, zb))
+ return;
+
+ scn->scn_visited_this_txg++;
+
+ dprintf_bp(bp,
+ "visiting ds=%p/%llu zb=%llx/%llx/%llx/%llx bp=%p",
+ ds, ds ? ds->ds_object : 0,
+ zb->zb_objset, zb->zb_object, zb->zb_level, zb->zb_blkid,
+ bp);
+
+ if (BP_IS_HOLE(bp)) {
+ scn->scn_holes_this_txg++;
+ return;
+ }
+
+ if (bp->blk_birth <= scn->scn_phys.scn_cur_min_txg) {
+ scn->scn_lt_min_this_txg++;
+ return;
+ }
+
+ bp_toread = kmem_alloc(sizeof (blkptr_t), KM_SLEEP);
+ *bp_toread = *bp;
+
+ if (dsl_scan_recurse(scn, ds, ostype, dnp, bp_toread, zb, tx) != 0)
+ goto out;
+
+ /*
+ * If dsl_scan_ddt() has already visited this block, it will have
+ * already done any translations or scrubbing, so don't call the
+ * callback again.
+ */
+ if (ddt_class_contains(dp->dp_spa,
+ scn->scn_phys.scn_ddt_class_max, bp)) {
+ scn->scn_ddt_contained_this_txg++;
+ goto out;
+ }
+
+ /*
+ * If this block is from the future (after cur_max_txg), then we
+ * are doing this on behalf of a deleted snapshot, and we will
+ * revisit the future block on the next pass of this dataset.
+ * Don't scan it now unless we need to because something
+ * under it was modified.
+ */
+ if (BP_PHYSICAL_BIRTH(bp) > scn->scn_phys.scn_cur_max_txg) {
+ scn->scn_gt_max_this_txg++;
+ goto out;
+ }
+
+ scan_funcs[scn->scn_phys.scn_func](dp, bp, zb);
+out:
+ kmem_free(bp_toread, sizeof (blkptr_t));
+}
+
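+/*
+ * Begins the traversal of one objset: primes the prefetcher with the
+ * root bp and then visits it synchronously.
+ */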
+static void
+dsl_scan_visit_rootbp(dsl_scan_t *scn, dsl_dataset_t *ds, blkptr_t *bp,
+ dmu_tx_t *tx)
+{
+ zbookmark_phys_t zb;
+ scan_prefetch_ctx_t *spc;
+
+ SET_BOOKMARK(&zb, ds ? ds->ds_object : DMU_META_OBJSET,
+ ZB_ROOT_OBJECT, ZB_ROOT_LEVEL, ZB_ROOT_BLKID);
+
+ if (ZB_IS_ZERO(&scn->scn_phys.scn_bookmark)) {
+ SET_BOOKMARK(&scn->scn_prefetch_bookmark,
+ zb.zb_objset, 0, 0, 0);
+ } else {
+ scn->scn_prefetch_bookmark = scn->scn_phys.scn_bookmark;
+ }
+
+ scn->scn_objsets_visited_this_txg++;
+
+ spc = scan_prefetch_ctx_create(scn, NULL, FTAG);
+ dsl_scan_prefetch(spc, bp, &zb);
+ scan_prefetch_ctx_rele(spc, FTAG);
+
+ dsl_scan_visitbp(bp, &zb, NULL, ds, scn, DMU_OST_NONE, tx);
+
+ dprintf_ds(ds, "finished scan%s", "");
+}
+
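+/*
+ * If the scan bookmark currently points at the dataset being destroyed,
+ * advance it: for a snapshot, move on to the next snapshot in line; for
+ * a head dataset, mark the objset destroyed so a new one is chosen.
+ */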
+static void
+ds_destroyed_scn_phys(dsl_dataset_t *ds, dsl_scan_phys_t *scn_phys)
+{
+ if (scn_phys->scn_bookmark.zb_objset == ds->ds_object) {
+ if (ds->ds_is_snapshot) {
+ /*
+ * Note:
+ * - scn_cur_{min,max}_txg stays the same.
+ * - Setting the flag is not really necessary if
+ * scn_cur_max_txg == scn_max_txg, because there
+ * is nothing after this snapshot that we care
+ * about. However, we set it anyway and then
+ * ignore it when we retraverse it in
+ * dsl_scan_visitds().
+ */
+ scn_phys->scn_bookmark.zb_objset =
+ dsl_dataset_phys(ds)->ds_next_snap_obj;
+ zfs_dbgmsg("destroying ds %llu; currently traversing; "
+ "reset zb_objset to %llu",
+ (u_longlong_t)ds->ds_object,
+ (u_longlong_t)dsl_dataset_phys(ds)->
+ ds_next_snap_obj);
+ scn_phys->scn_flags |= DSF_VISIT_DS_AGAIN;
+ } else {
+ SET_BOOKMARK(&scn_phys->scn_bookmark,
+ ZB_DESTROYED_OBJSET, 0, 0, 0);
+ zfs_dbgmsg("destroying ds %llu; currently traversing; "
+ "reset bookmark to -1,0,0,0",
+ (u_longlong_t)ds->ds_object);
+ }
+ }
+}
+
+/*
+ * Invoked when a dataset is destroyed. We need to make sure that:
+ *
+ * 1) If it is the dataset that was currently being scanned, we write
+ *	a new dsl_scan_phys_t and mark the objset reference in it
+ * as destroyed.
+ * 2) Remove it from the work queue, if it was present.
+ *
+ * If the dataset was actually a snapshot, instead of marking the dataset
+ * as destroyed, we instead substitute the next snapshot in line.
+ */
+void
+dsl_scan_ds_destroyed(dsl_dataset_t *ds, dmu_tx_t *tx)
+{
+ dsl_pool_t *dp = ds->ds_dir->dd_pool;
+ dsl_scan_t *scn = dp->dp_scan;
+ uint64_t mintxg;
+
+ if (!dsl_scan_is_running(scn))
+ return;
+
+ ds_destroyed_scn_phys(ds, &scn->scn_phys);
+ ds_destroyed_scn_phys(ds, &scn->scn_phys_cached);
+
+ if (scan_ds_queue_contains(scn, ds->ds_object, &mintxg)) {
+ scan_ds_queue_remove(scn, ds->ds_object);
+ if (ds->ds_is_snapshot)
+ scan_ds_queue_insert(scn,
+ dsl_dataset_phys(ds)->ds_next_snap_obj, mintxg);
+ }
+
+ if (zap_lookup_int_key(dp->dp_meta_objset, scn->scn_phys.scn_queue_obj,
+ ds->ds_object, &mintxg) == 0) {
+ ASSERT3U(dsl_dataset_phys(ds)->ds_num_children, <=, 1);
+ VERIFY3U(0, ==, zap_remove_int(dp->dp_meta_objset,
+ scn->scn_phys.scn_queue_obj, ds->ds_object, tx));
+ if (ds->ds_is_snapshot) {
+ /*
+ * We keep the same mintxg; it could be >
+ * ds_creation_txg if the previous snapshot was
+ * deleted too.
+ */
+ VERIFY(zap_add_int_key(dp->dp_meta_objset,
+ scn->scn_phys.scn_queue_obj,
+ dsl_dataset_phys(ds)->ds_next_snap_obj,
+ mintxg, tx) == 0);
+ zfs_dbgmsg("destroying ds %llu; in queue; "
+ "replacing with %llu",
+ (u_longlong_t)ds->ds_object,
+ (u_longlong_t)dsl_dataset_phys(ds)->
+ ds_next_snap_obj);
+ } else {
+ zfs_dbgmsg("destroying ds %llu; in queue; removing",
+ (u_longlong_t)ds->ds_object);
+ }
+ }
+
+ /*
+ * dsl_scan_sync() should be called after this, and should sync
+ * out our changed state, but just to be safe, do it here.
+ */
+ dsl_scan_sync_state(scn, tx, SYNC_CACHED);
+}
+
+static void
+ds_snapshotted_bookmark(dsl_dataset_t *ds, zbookmark_phys_t *scn_bookmark)
+{
+ if (scn_bookmark->zb_objset == ds->ds_object) {
+ scn_bookmark->zb_objset =
+ dsl_dataset_phys(ds)->ds_prev_snap_obj;
+ zfs_dbgmsg("snapshotting ds %llu; currently traversing; "
+ "reset zb_objset to %llu",
+ (u_longlong_t)ds->ds_object,
+ (u_longlong_t)dsl_dataset_phys(ds)->ds_prev_snap_obj);
+ }
+}
+
+/*
+ * Called when a dataset is snapshotted. If we were currently traversing
+ * this snapshot, we reset our bookmark to point at the newly created
+ * snapshot. We also modify our work queue to remove the old snapshot and
+ * replace it with the new one.
+ */
+void
+dsl_scan_ds_snapshotted(dsl_dataset_t *ds, dmu_tx_t *tx)
+{
+ dsl_pool_t *dp = ds->ds_dir->dd_pool;
+ dsl_scan_t *scn = dp->dp_scan;
+ uint64_t mintxg;
+
+ if (!dsl_scan_is_running(scn))
+ return;
+
+ ASSERT(dsl_dataset_phys(ds)->ds_prev_snap_obj != 0);
+
+ ds_snapshotted_bookmark(ds, &scn->scn_phys.scn_bookmark);
+ ds_snapshotted_bookmark(ds, &scn->scn_phys_cached.scn_bookmark);
+
+ if (scan_ds_queue_contains(scn, ds->ds_object, &mintxg)) {
+ scan_ds_queue_remove(scn, ds->ds_object);
+ scan_ds_queue_insert(scn,
+ dsl_dataset_phys(ds)->ds_prev_snap_obj, mintxg);
+ }
+
+ if (zap_lookup_int_key(dp->dp_meta_objset, scn->scn_phys.scn_queue_obj,
+ ds->ds_object, &mintxg) == 0) {
+ VERIFY3U(0, ==, zap_remove_int(dp->dp_meta_objset,
+ scn->scn_phys.scn_queue_obj, ds->ds_object, tx));
+ VERIFY(zap_add_int_key(dp->dp_meta_objset,
+ scn->scn_phys.scn_queue_obj,
+ dsl_dataset_phys(ds)->ds_prev_snap_obj, mintxg, tx) == 0);
+ zfs_dbgmsg("snapshotting ds %llu; in queue; "
+ "replacing with %llu",
+ (u_longlong_t)ds->ds_object,
+ (u_longlong_t)dsl_dataset_phys(ds)->ds_prev_snap_obj);
+ }
+
+ dsl_scan_sync_state(scn, tx, SYNC_CACHED);
+}
+
+static void
+ds_clone_swapped_bookmark(dsl_dataset_t *ds1, dsl_dataset_t *ds2,
+ zbookmark_phys_t *scn_bookmark)
+{
+ if (scn_bookmark->zb_objset == ds1->ds_object) {
+ scn_bookmark->zb_objset = ds2->ds_object;
+ zfs_dbgmsg("clone_swap ds %llu; currently traversing; "
+ "reset zb_objset to %llu",
+ (u_longlong_t)ds1->ds_object,
+ (u_longlong_t)ds2->ds_object);
+ } else if (scn_bookmark->zb_objset == ds2->ds_object) {
+ scn_bookmark->zb_objset = ds1->ds_object;
+ zfs_dbgmsg("clone_swap ds %llu; currently traversing; "
+ "reset zb_objset to %llu",
+ (u_longlong_t)ds2->ds_object,
+ (u_longlong_t)ds1->ds_object);
+ }
+}
+
+/*
+ * Called when an origin dataset and its clone are swapped. If we were
+ * currently traversing the dataset, we need to switch to traversing the
+ * newly promoted clone.
+ */
+void
+dsl_scan_ds_clone_swapped(dsl_dataset_t *ds1, dsl_dataset_t *ds2, dmu_tx_t *tx)
+{
+ dsl_pool_t *dp = ds1->ds_dir->dd_pool;
+ dsl_scan_t *scn = dp->dp_scan;
+ uint64_t mintxg1, mintxg2;
+ boolean_t ds1_queued, ds2_queued;
+
+ if (!dsl_scan_is_running(scn))
+ return;
+
+ ds_clone_swapped_bookmark(ds1, ds2, &scn->scn_phys.scn_bookmark);
+ ds_clone_swapped_bookmark(ds1, ds2, &scn->scn_phys_cached.scn_bookmark);
+
+ /*
+ * Handle the in-memory scan queue.
+ */
+ ds1_queued = scan_ds_queue_contains(scn, ds1->ds_object, &mintxg1);
+ ds2_queued = scan_ds_queue_contains(scn, ds2->ds_object, &mintxg2);
+
+ /* Sanity checking. */
+ if (ds1_queued) {
+ ASSERT3U(mintxg1, ==, dsl_dataset_phys(ds1)->ds_prev_snap_txg);
+ ASSERT3U(mintxg1, ==, dsl_dataset_phys(ds2)->ds_prev_snap_txg);
+ }
+ if (ds2_queued) {
+ ASSERT3U(mintxg2, ==, dsl_dataset_phys(ds1)->ds_prev_snap_txg);
+ ASSERT3U(mintxg2, ==, dsl_dataset_phys(ds2)->ds_prev_snap_txg);
+ }
+
+ if (ds1_queued && ds2_queued) {
+ /*
+ * If both are queued, we don't need to do anything.
+ * The swapping code below would not handle this case correctly,
+ * since we can't insert ds2 if it is already there. That's
+ * because scan_ds_queue_insert() prohibits a duplicate insert
+ * and panics.
+ */
+ } else if (ds1_queued) {
+ scan_ds_queue_remove(scn, ds1->ds_object);
+ scan_ds_queue_insert(scn, ds2->ds_object, mintxg1);
+ } else if (ds2_queued) {
+ scan_ds_queue_remove(scn, ds2->ds_object);
+ scan_ds_queue_insert(scn, ds1->ds_object, mintxg2);
+ }
+
+ /*
+ * Handle the on-disk scan queue.
+ * The on-disk state is an out-of-date version of the in-memory state,
+ * so the in-memory and on-disk values for ds1_queued and ds2_queued may
+ * be different. Therefore we need to apply the swap logic to the
+ * on-disk state independently of the in-memory state.
+ */
+ ds1_queued = zap_lookup_int_key(dp->dp_meta_objset,
+ scn->scn_phys.scn_queue_obj, ds1->ds_object, &mintxg1) == 0;
+ ds2_queued = zap_lookup_int_key(dp->dp_meta_objset,
+ scn->scn_phys.scn_queue_obj, ds2->ds_object, &mintxg2) == 0;
+
+ /* Sanity checking. */
+ if (ds1_queued) {
+ ASSERT3U(mintxg1, ==, dsl_dataset_phys(ds1)->ds_prev_snap_txg);
+ ASSERT3U(mintxg1, ==, dsl_dataset_phys(ds2)->ds_prev_snap_txg);
+ }
+ if (ds2_queued) {
+ ASSERT3U(mintxg2, ==, dsl_dataset_phys(ds1)->ds_prev_snap_txg);
+ ASSERT3U(mintxg2, ==, dsl_dataset_phys(ds2)->ds_prev_snap_txg);
+ }
+
+ if (ds1_queued && ds2_queued) {
+ /*
+ * If both are queued, we don't need to do anything.
+ * Alternatively, we could check for EEXIST from
+ * zap_add_int_key() and back out to the original state, but
+ * that would be more work than checking for this case upfront.
+ */
+ } else if (ds1_queued) {
+ VERIFY3S(0, ==, zap_remove_int(dp->dp_meta_objset,
+ scn->scn_phys.scn_queue_obj, ds1->ds_object, tx));
+ VERIFY3S(0, ==, zap_add_int_key(dp->dp_meta_objset,
+ scn->scn_phys.scn_queue_obj, ds2->ds_object, mintxg1, tx));
+ zfs_dbgmsg("clone_swap ds %llu; in queue; "
+ "replacing with %llu",
+ (u_longlong_t)ds1->ds_object,
+ (u_longlong_t)ds2->ds_object);
+ } else if (ds2_queued) {
+ VERIFY3S(0, ==, zap_remove_int(dp->dp_meta_objset,
+ scn->scn_phys.scn_queue_obj, ds2->ds_object, tx));
+ VERIFY3S(0, ==, zap_add_int_key(dp->dp_meta_objset,
+ scn->scn_phys.scn_queue_obj, ds1->ds_object, mintxg2, tx));
+ zfs_dbgmsg("clone_swap ds %llu; in queue; "
+ "replacing with %llu",
+ (u_longlong_t)ds2->ds_object,
+ (u_longlong_t)ds1->ds_object);
+ }
+
+ dsl_scan_sync_state(scn, tx, SYNC_CACHED);
+}
+
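+/*
+ * Callback for dmu_objset_find_dp(): walks each clone of the given
+ * origin back to its first snapshot after the origin and enqueues that
+ * dataset for scanning.
+ */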
+static int
+enqueue_clones_cb(dsl_pool_t *dp, dsl_dataset_t *hds, void *arg)
+{
+ uint64_t originobj = *(uint64_t *)arg;
+ dsl_dataset_t *ds;
+ int err;
+ dsl_scan_t *scn = dp->dp_scan;
+
+ if (dsl_dir_phys(hds->ds_dir)->dd_origin_obj != originobj)
+ return (0);
+
+ err = dsl_dataset_hold_obj(dp, hds->ds_object, FTAG, &ds);
+ if (err)
+ return (err);
+
+ while (dsl_dataset_phys(ds)->ds_prev_snap_obj != originobj) {
+ dsl_dataset_t *prev;
+ err = dsl_dataset_hold_obj(dp,
+ dsl_dataset_phys(ds)->ds_prev_snap_obj, FTAG, &prev);
+
+ dsl_dataset_rele(ds, FTAG);
+ if (err)
+ return (err);
+ ds = prev;
+ }
+ scan_ds_queue_insert(scn, ds->ds_object,
+ dsl_dataset_phys(ds)->ds_prev_snap_txg);
+ dsl_dataset_rele(ds, FTAG);
+ return (0);
+}
+
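+/*
+ * Visits a single dataset: scans its ZIL (for head datasets), recursively
+ * visits every block reachable from its root bp, and then enqueues its
+ * next snapshot and clones for later passes.
+ */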
+static void
+dsl_scan_visitds(dsl_scan_t *scn, uint64_t dsobj, dmu_tx_t *tx)
+{
+ dsl_pool_t *dp = scn->scn_dp;
+ dsl_dataset_t *ds;
+
+ VERIFY3U(0, ==, dsl_dataset_hold_obj(dp, dsobj, FTAG, &ds));
+
+ if (scn->scn_phys.scn_cur_min_txg >=
+ scn->scn_phys.scn_max_txg) {
+ /*
+ * This can happen if this snapshot was created after the
+ * scan started, and we already completed a previous snapshot
+ * that was created after the scan started. This snapshot
+ * only references blocks with:
+ *
+ * birth < our ds_creation_txg
+ * cur_min_txg is no less than ds_creation_txg.
+ * We have already visited these blocks.
+ * or
+ * birth > scn_max_txg
+ * The scan requested not to visit these blocks.
+ *
+ * Subsequent snapshots (and clones) can reference our
+ * blocks, or blocks with even higher birth times.
+ * Therefore we do not need to visit them either,
+ * so we do not add them to the work queue.
+ *
+ * Note that checking for cur_min_txg >= cur_max_txg
+ * is not sufficient, because in that case we may need to
+ * visit subsequent snapshots. This happens when min_txg > 0,
+ * which raises cur_min_txg. In this case we will visit
+ * this dataset but skip all of its blocks, because the
+ * rootbp's birth time is < cur_min_txg. Then we will
+ * add the next snapshots/clones to the work queue.
+ */
+		char *dsname = kmem_alloc(ZFS_MAX_DATASET_NAME_LEN, KM_SLEEP);
+ dsl_dataset_name(ds, dsname);
+ zfs_dbgmsg("scanning dataset %llu (%s) is unnecessary because "
+ "cur_min_txg (%llu) >= max_txg (%llu)",
+ (longlong_t)dsobj, dsname,
+ (longlong_t)scn->scn_phys.scn_cur_min_txg,
+ (longlong_t)scn->scn_phys.scn_max_txg);
+		kmem_free(dsname, ZFS_MAX_DATASET_NAME_LEN);
+
+ goto out;
+ }
+
+ /*
+ * Only the ZIL in the head (non-snapshot) is valid. Even though
+ * snapshots can have ZIL block pointers (which may be the same
+ * BP as in the head), they must be ignored. In addition, $ORIGIN
+	 * doesn't have an objset (i.e. its ds_bp is a hole) so we don't
+	 * need to look for a ZIL in it either. So we traverse the ZIL here,
+	 * rather than in dsl_scan_recurse(), because the regular snapshot
+ * block-sharing rules don't apply to it.
+ */
+	if (DSL_SCAN_IS_SCRUB_RESILVER(scn) && !ds->ds_is_snapshot &&
+ (dp->dp_origin_snap == NULL ||
+ ds->ds_dir != dp->dp_origin_snap->ds_dir)) {
+ objset_t *os;
+ if (dmu_objset_from_ds(ds, &os) != 0) {
+ goto out;
+ }
+ dsl_scan_zil(dp, &os->os_zil_header);
+ }
+
+ /*
+ * Iterate over the bps in this ds.
+ */
+ dmu_buf_will_dirty(ds->ds_dbuf, tx);
+ rrw_enter(&ds->ds_bp_rwlock, RW_READER, FTAG);
+ dsl_scan_visit_rootbp(scn, ds, &dsl_dataset_phys(ds)->ds_bp, tx);
+ rrw_exit(&ds->ds_bp_rwlock, FTAG);
+
+ char *dsname = kmem_alloc(ZFS_MAX_DATASET_NAME_LEN, KM_SLEEP);
+ dsl_dataset_name(ds, dsname);
+ zfs_dbgmsg("scanned dataset %llu (%s) with min=%llu max=%llu; "
+ "suspending=%u",
+ (longlong_t)dsobj, dsname,
+ (longlong_t)scn->scn_phys.scn_cur_min_txg,
+ (longlong_t)scn->scn_phys.scn_cur_max_txg,
+ (int)scn->scn_suspending);
+ kmem_free(dsname, ZFS_MAX_DATASET_NAME_LEN);
+
+ if (scn->scn_suspending)
+ goto out;
+
+ /*
+ * We've finished this pass over this dataset.
+ */
+
+ /*
+ * If we did not completely visit this dataset, do another pass.
+ */
+ if (scn->scn_phys.scn_flags & DSF_VISIT_DS_AGAIN) {
+ zfs_dbgmsg("incomplete pass; visiting again");
+ scn->scn_phys.scn_flags &= ~DSF_VISIT_DS_AGAIN;
+ scan_ds_queue_insert(scn, ds->ds_object,
+ scn->scn_phys.scn_cur_max_txg);
+ goto out;
+ }
+
+ /*
+	 * Add descendant datasets to the work queue.
+ */
+ if (dsl_dataset_phys(ds)->ds_next_snap_obj != 0) {
+ scan_ds_queue_insert(scn,
+ dsl_dataset_phys(ds)->ds_next_snap_obj,
+ dsl_dataset_phys(ds)->ds_creation_txg);
+ }
+ if (dsl_dataset_phys(ds)->ds_num_children > 1) {
+ boolean_t usenext = B_FALSE;
+ if (dsl_dataset_phys(ds)->ds_next_clones_obj != 0) {
+ uint64_t count;
+ /*
+ * A bug in a previous version of the code could
+ * cause upgrade_clones_cb() to not set
+ * ds_next_snap_obj when it should, leading to a
+ * missing entry. Therefore we can only use the
+ * next_clones_obj when its count is correct.
+ */
+ int err = zap_count(dp->dp_meta_objset,
+ dsl_dataset_phys(ds)->ds_next_clones_obj, &count);
+ if (err == 0 &&
+ count == dsl_dataset_phys(ds)->ds_num_children - 1)
+ usenext = B_TRUE;
+ }
+
+ if (usenext) {
+ zap_cursor_t zc;
+ zap_attribute_t za;
+ for (zap_cursor_init(&zc, dp->dp_meta_objset,
+ dsl_dataset_phys(ds)->ds_next_clones_obj);
+ zap_cursor_retrieve(&zc, &za) == 0;
+ (void) zap_cursor_advance(&zc)) {
+ scan_ds_queue_insert(scn,
+ zfs_strtonum(za.za_name, NULL),
+ dsl_dataset_phys(ds)->ds_creation_txg);
+ }
+ zap_cursor_fini(&zc);
+ } else {
+ VERIFY0(dmu_objset_find_dp(dp, dp->dp_root_dir_obj,
+ enqueue_clones_cb, &ds->ds_object,
+ DS_FIND_CHILDREN));
+ }
+ }
+
+out:
+ dsl_dataset_rele(ds, FTAG);
+}
+
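+/*
+ * Callback for dmu_objset_find_dp(): walks a dataset back to its oldest
+ * snapshot and enqueues it; clones are skipped here since they are
+ * reached through their origin.
+ */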
+/* ARGSUSED */
+static int
+enqueue_cb(dsl_pool_t *dp, dsl_dataset_t *hds, void *arg)
+{
+ dsl_dataset_t *ds;
+ int err;
+ dsl_scan_t *scn = dp->dp_scan;
+
+ err = dsl_dataset_hold_obj(dp, hds->ds_object, FTAG, &ds);
+ if (err)
+ return (err);
+
+ while (dsl_dataset_phys(ds)->ds_prev_snap_obj != 0) {
+ dsl_dataset_t *prev;
+ err = dsl_dataset_hold_obj(dp,
+ dsl_dataset_phys(ds)->ds_prev_snap_obj, FTAG, &prev);
+ if (err) {
+ dsl_dataset_rele(ds, FTAG);
+ return (err);
+ }
+
+ /*
+ * If this is a clone, we don't need to worry about it for now.
+ */
+ if (dsl_dataset_phys(prev)->ds_next_snap_obj != ds->ds_object) {
+ dsl_dataset_rele(ds, FTAG);
+ dsl_dataset_rele(prev, FTAG);
+ return (0);
+ }
+ dsl_dataset_rele(ds, FTAG);
+ ds = prev;
+ }
+
+ scan_ds_queue_insert(scn, ds->ds_object,
+ dsl_dataset_phys(ds)->ds_prev_snap_txg);
+ dsl_dataset_rele(ds, FTAG);
+ return (0);
+}
+
+/* ARGSUSED */
+void
+dsl_scan_ddt_entry(dsl_scan_t *scn, enum zio_checksum checksum,
+ ddt_entry_t *dde, dmu_tx_t *tx)
+{
+ const ddt_key_t *ddk = &dde->dde_key;
+ ddt_phys_t *ddp = dde->dde_phys;
+ blkptr_t bp;
+ zbookmark_phys_t zb = { 0 };
+ int p;
+
+ if (!dsl_scan_is_running(scn))
+ return;
+
+ /*
+ * This function is special because it is the only thing
+ * that can add scan_io_t's to the vdev scan queues from
+ * outside dsl_scan_sync(). For the most part this is ok
+ * as long as it is called from within syncing context.
+ * However, dsl_scan_sync() expects that no new sio's will
+ * be added between when all the work for a scan is done
+ * and the next txg when the scan is actually marked as
+ * completed. This check ensures we do not issue new sio's
+ * during this period.
+ */
+ if (scn->scn_done_txg != 0)
+ return;
+
+ for (p = 0; p < DDT_PHYS_TYPES; p++, ddp++) {
+ if (ddp->ddp_phys_birth == 0 ||
+ ddp->ddp_phys_birth > scn->scn_phys.scn_max_txg)
+ continue;
+ ddt_bp_create(checksum, ddk, ddp, &bp);
+
+ scn->scn_visited_this_txg++;
+ scan_funcs[scn->scn_phys.scn_func](scn->scn_dp, &bp, &zb);
+ }
+}
+
+/*
+ * Scrub/dedup interaction.
+ *
+ * If there are N references to a deduped block, we don't want to scrub it
+ * N times -- ideally, we should scrub it exactly once.
+ *
+ * We leverage the fact that the dde's replication class (enum ddt_class)
+ * is ordered from highest replication class (DDT_CLASS_DITTO) to lowest
+ * (DDT_CLASS_UNIQUE) so that we may walk the DDT in that order.
+ *
+ * To prevent excess scrubbing, the scrub begins by walking the DDT
+ * to find all blocks with refcnt > 1, and scrubs each of these once.
+ * Since there are two replication classes which contain blocks with
+ * refcnt > 1, we scrub the highest replication class (DDT_CLASS_DITTO) first.
+ * Finally the top-down scrub begins, only visiting blocks with refcnt == 1.
+ *
+ * There would be nothing more to say if a block's refcnt couldn't change
+ * during a scrub, but of course it can so we must account for changes
+ * in a block's replication class.
+ *
+ * Here's an example of what can occur:
+ *
+ * If a block has refcnt > 1 during the DDT scrub phase, but has refcnt == 1
+ * when visited during the top-down scrub phase, it will be scrubbed twice.
+ * This negates our scrub optimization, but is otherwise harmless.
+ *
+ * If a block has refcnt == 1 during the DDT scrub phase, but has refcnt > 1
+ * on each visit during the top-down scrub phase, it will never be scrubbed.
+ * To catch this, ddt_sync_entry() notifies the scrub code whenever a block's
+ * reference class transitions to a higher level (i.e. DDT_CLASS_UNIQUE to
+ * DDT_CLASS_DUPLICATE); if it transitions from refcnt == 1 to refcnt > 1
+ * while a scrub is in progress, it scrubs the block right then.
+ */
+static void
+dsl_scan_ddt(dsl_scan_t *scn, dmu_tx_t *tx)
+{
+ ddt_bookmark_t *ddb = &scn->scn_phys.scn_ddt_bookmark;
+ ddt_entry_t dde = { 0 };
+ int error;
+ uint64_t n = 0;
+
+ while ((error = ddt_walk(scn->scn_dp->dp_spa, ddb, &dde)) == 0) {
+ ddt_t *ddt;
+
+ if (ddb->ddb_class > scn->scn_phys.scn_ddt_class_max)
+ break;
+ dprintf("visiting ddb=%llu/%llu/%llu/%llx\n",
+ (longlong_t)ddb->ddb_class,
+ (longlong_t)ddb->ddb_type,
+ (longlong_t)ddb->ddb_checksum,
+ (longlong_t)ddb->ddb_cursor);
+
+ /* There should be no pending changes to the dedup table */
+ ddt = scn->scn_dp->dp_spa->spa_ddt[ddb->ddb_checksum];
+ ASSERT(avl_first(&ddt->ddt_tree) == NULL);
+
+ dsl_scan_ddt_entry(scn, ddb->ddb_checksum, &dde, tx);
+ n++;
+
+ if (dsl_scan_check_suspend(scn, NULL))
+ break;
+ }
+
+ zfs_dbgmsg("scanned %llu ddt entries with class_max = %u; "
+ "suspending=%u", (longlong_t)n,
+ (int)scn->scn_phys.scn_ddt_class_max, (int)scn->scn_suspending);
+
+ ASSERT(error == 0 || error == ENOENT);
+ ASSERT(error != ENOENT ||
+ ddb->ddb_class > scn->scn_phys.scn_ddt_class_max);
+}
+
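+/*
+ * Returns the maximum txg up to which a dataset should be scanned: the
+ * scan's max_txg, capped for snapshots at the snapshot's creation txg.
+ */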
+static uint64_t
+dsl_scan_ds_maxtxg(dsl_dataset_t *ds)
+{
+ uint64_t smt = ds->ds_dir->dd_pool->dp_scan->scn_phys.scn_max_txg;
+ if (ds->ds_is_snapshot)
+ return (MIN(smt, dsl_dataset_phys(ds)->ds_creation_txg));
+ return (smt);
+}
+
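+/*
+ * Top-level metadata traversal for one txg: visit the DDT classes first,
+ * then the MOS and $ORIGIN, then pull datasets off the work queue until
+ * it is empty or we suspend.
+ */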
+static void
+dsl_scan_visit(dsl_scan_t *scn, dmu_tx_t *tx)
+{
+ scan_ds_t *sds;
+ dsl_pool_t *dp = scn->scn_dp;
+
+ if (scn->scn_phys.scn_ddt_bookmark.ddb_class <=
+ scn->scn_phys.scn_ddt_class_max) {
+ scn->scn_phys.scn_cur_min_txg = scn->scn_phys.scn_min_txg;
+ scn->scn_phys.scn_cur_max_txg = scn->scn_phys.scn_max_txg;
+ dsl_scan_ddt(scn, tx);
+ if (scn->scn_suspending)
+ return;
+ }
+
+ if (scn->scn_phys.scn_bookmark.zb_objset == DMU_META_OBJSET) {
+ /* First do the MOS & ORIGIN */
+
+ scn->scn_phys.scn_cur_min_txg = scn->scn_phys.scn_min_txg;
+ scn->scn_phys.scn_cur_max_txg = scn->scn_phys.scn_max_txg;
+ dsl_scan_visit_rootbp(scn, NULL,
+ &dp->dp_meta_rootbp, tx);
+ spa_set_rootblkptr(dp->dp_spa, &dp->dp_meta_rootbp);
+ if (scn->scn_suspending)
+ return;
+
+ if (spa_version(dp->dp_spa) < SPA_VERSION_DSL_SCRUB) {
+ VERIFY0(dmu_objset_find_dp(dp, dp->dp_root_dir_obj,
+ enqueue_cb, NULL, DS_FIND_CHILDREN));
+ } else {
+ dsl_scan_visitds(scn,
+ dp->dp_origin_snap->ds_object, tx);
+ }
+ ASSERT(!scn->scn_suspending);
+ } else if (scn->scn_phys.scn_bookmark.zb_objset !=
+ ZB_DESTROYED_OBJSET) {
+ uint64_t dsobj = scn->scn_phys.scn_bookmark.zb_objset;
+ /*
+ * If we were suspended, continue from here. Note if the
+ * ds we were suspended on was deleted, the zb_objset may
+ * be -1, so we will skip this and find a new objset
+ * below.
+ */
+ dsl_scan_visitds(scn, dsobj, tx);
+ if (scn->scn_suspending)
+ return;
+ }
+
+ /*
+ * In case we suspended right at the end of the ds, zero the
+ * bookmark so we don't think that we're still trying to resume.
+ */
+ bzero(&scn->scn_phys.scn_bookmark, sizeof (zbookmark_phys_t));
+
+ /*
+	 * Keep pulling things out of the in-memory dataset AVL queue. Updates
+	 * to the persistent ZAP-object-as-queue happen only at checkpoints.
+ */
+ while ((sds = avl_first(&scn->scn_queue)) != NULL) {
+ dsl_dataset_t *ds;
+ uint64_t dsobj = sds->sds_dsobj;
+ uint64_t txg = sds->sds_txg;
+
+ /* dequeue and free the ds from the queue */
+ scan_ds_queue_remove(scn, dsobj);
+ sds = NULL; /* must not be touched after removal */
+
+ /* Set up min / max txg */
+ VERIFY3U(0, ==, dsl_dataset_hold_obj(dp, dsobj, FTAG, &ds));
+ if (txg != 0) {
+ scn->scn_phys.scn_cur_min_txg =
+ MAX(scn->scn_phys.scn_min_txg, txg);
+ } else {
+ scn->scn_phys.scn_cur_min_txg =
+ MAX(scn->scn_phys.scn_min_txg,
+ dsl_dataset_phys(ds)->ds_prev_snap_txg);
+ }
+ scn->scn_phys.scn_cur_max_txg = dsl_scan_ds_maxtxg(ds);
+ dsl_dataset_rele(ds, FTAG);
+
+ dsl_scan_visitds(scn, dsobj, tx);
+ if (scn->scn_suspending)
+ return;
+ }
+ /* No more objsets to fetch, we're done */
+ scn->scn_phys.scn_bookmark.zb_objset = ZB_DESTROYED_OBJSET;
+ ASSERT0(scn->scn_suspending);
+}
+
+static uint64_t
+dsl_scan_count_leaves(vdev_t *vd)
+{
+ uint64_t i, leaves = 0;
+
+ /* we only count leaves that belong to the main pool and are readable */
+ if (vd->vdev_islog || vd->vdev_isspare ||
+ vd->vdev_isl2cache || !vdev_readable(vd))
+ return (0);
+
+ if (vd->vdev_ops->vdev_op_leaf)
+ return (1);
+
+ for (i = 0; i < vd->vdev_children; i++) {
+ leaves += dsl_scan_count_leaves(vd->vdev_child[i]);
+ }
+
+ return (leaves);
+}
+
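+/*
+ * Per-txg statistics helpers: record the size and count of each zio
+ * issued and each extent processed by a scan queue, feeding the averages
+ * computed in dsl_scan_update_stats().
+ */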
+static void
+scan_io_queues_update_zio_stats(dsl_scan_io_queue_t *q, const blkptr_t *bp)
+{
+ int i;
+ uint64_t cur_size = 0;
+
+ for (i = 0; i < BP_GET_NDVAS(bp); i++) {
+ cur_size += DVA_GET_ASIZE(&bp->blk_dva[i]);
+ }
+
+ q->q_total_zio_size_this_txg += cur_size;
+ q->q_zios_this_txg++;
+}
+
+static void
+scan_io_queues_update_seg_stats(dsl_scan_io_queue_t *q, uint64_t start,
+ uint64_t end)
+{
+ q->q_total_seg_size_this_txg += end - start;
+ q->q_segs_this_txg++;
+}
+
+static boolean_t
+scan_io_queue_check_suspend(dsl_scan_t *scn)
+{
+ /* See comment in dsl_scan_check_suspend() */
+ uint64_t curr_time_ns = gethrtime();
+ uint64_t scan_time_ns = curr_time_ns - scn->scn_sync_start_time;
+ uint64_t sync_time_ns = curr_time_ns -
+ scn->scn_dp->dp_spa->spa_sync_starttime;
+ int dirty_pct = scn->scn_dp->dp_dirty_total * 100 / zfs_dirty_data_max;
+ int mintime = (scn->scn_phys.scn_func == POOL_SCAN_RESILVER) ?
+ zfs_resilver_min_time_ms : zfs_scrub_min_time_ms;
+
+ return ((NSEC2MSEC(scan_time_ns) > mintime &&
+ (dirty_pct >= zfs_vdev_async_write_active_min_dirty_percent ||
+ txg_sync_waiting(scn->scn_dp) ||
+ NSEC2SEC(sync_time_ns) >= zfs_txg_timeout)) ||
+ spa_shutting_down(scn->scn_dp->dp_spa));
+}
+
+/*
+ * Given a list of scan_io_t's in io_list, this issues the io's out to
+ * disk. This consumes the io_list and frees the scan_io_t's. This is
+ * called when emptying queues, either when we're up against the memory
+ * limit or when we have finished scanning. Returns B_TRUE if we stopped
+ * processing the list before we finished. Any zios that were not issued
+ * will remain in the io_list.
+ */
+static boolean_t
+scan_io_queue_issue(dsl_scan_io_queue_t *queue, list_t *io_list)
+{
+ dsl_scan_t *scn = queue->q_scn;
+ scan_io_t *sio;
+ int64_t bytes_issued = 0;
+ boolean_t suspended = B_FALSE;
+
+ while ((sio = list_head(io_list)) != NULL) {
+ blkptr_t bp;
+
+ if (scan_io_queue_check_suspend(scn)) {
+ suspended = B_TRUE;
+ break;
+ }
+
+ sio2bp(sio, &bp, queue->q_vd->vdev_id);
+ bytes_issued += sio->sio_asize;
+ scan_exec_io(scn->scn_dp, &bp, sio->sio_flags,
+ &sio->sio_zb, queue);
+ (void) list_remove_head(io_list);
+ scan_io_queues_update_zio_stats(queue, &bp);
+ kmem_free(sio, sizeof (*sio));
+ }
+
+ atomic_add_64(&scn->scn_bytes_pending, -bytes_issued);
+
+ return (suspended);
+}
+
+/*
+ * Given a range_seg_t (extent) and a list, this function passes over a
+ * scan queue and gathers up the appropriate ios which fit into that
+ * scan seg (starting from lowest LBA). At the end, we remove the segment
+ * from the q_exts_by_addr range tree.
+ */
+static boolean_t
+scan_io_queue_gather(dsl_scan_io_queue_t *queue, range_seg_t *rs, list_t *list)
+{
+ scan_io_t srch_sio, *sio, *next_sio;
+ avl_index_t idx;
+ uint_t num_sios = 0;
+ int64_t bytes_issued = 0;
+
+ ASSERT(rs != NULL);
+ ASSERT(MUTEX_HELD(&queue->q_vd->vdev_scan_io_queue_lock));
+
+ srch_sio.sio_offset = rs->rs_start;
+
+ /*
+ * The exact start of the extent might not contain any matching zios,
+ * so if that's the case, examine the next one in the tree.
+ */
+ sio = avl_find(&queue->q_sios_by_addr, &srch_sio, &idx);
+ if (sio == NULL)
+ sio = avl_nearest(&queue->q_sios_by_addr, idx, AVL_AFTER);
+
+ while (sio != NULL && sio->sio_offset < rs->rs_end && num_sios <= 32) {
+ ASSERT3U(sio->sio_offset, >=, rs->rs_start);
+ ASSERT3U(sio->sio_offset + sio->sio_asize, <=, rs->rs_end);
+
+ next_sio = AVL_NEXT(&queue->q_sios_by_addr, sio);
+ avl_remove(&queue->q_sios_by_addr, sio);
+
+ bytes_issued += sio->sio_asize;
+ num_sios++;
+ list_insert_tail(list, sio);
+ sio = next_sio;
+ }
+
+ /*
+ * We limit the number of sios we process at once to 32 to avoid
+ * biting off more than we can chew. If we didn't take everything
+ * in the segment we update it to reflect the work we were able to
+ * complete. Otherwise, we remove it from the range tree entirely.
+ */
+ if (sio != NULL && sio->sio_offset < rs->rs_end) {
+ range_tree_adjust_fill(queue->q_exts_by_addr, rs,
+ -bytes_issued);
+ range_tree_resize_segment(queue->q_exts_by_addr, rs,
+ sio->sio_offset, rs->rs_end - sio->sio_offset);
+
+ return (B_TRUE);
+ } else {
+ range_tree_remove(queue->q_exts_by_addr, rs->rs_start,
+ rs->rs_end - rs->rs_start);
+ return (B_FALSE);
+ }
+}
+
+/*
+ * This is called from the queue emptying thread and selects the next
+ * extent from which we are to issue io's. The behavior of this function
+ * depends on the state of the scan, the current memory consumption and
+ * whether or not we are performing a scan shutdown.
+ * 1) We select extents in an elevator algorithm (LBA-order) if the scan
+ * needs to perform a checkpoint
+ * 2) We select the largest available extent if we are up against the
+ * memory limit.
+ * 3) Otherwise we don't select any extents.
+ */
+static const range_seg_t *
+scan_io_queue_fetch_ext(dsl_scan_io_queue_t *queue)
+{
+ dsl_scan_t *scn = queue->q_scn;
+
+ ASSERT(MUTEX_HELD(&queue->q_vd->vdev_scan_io_queue_lock));
+ ASSERT(scn->scn_is_sorted);
+
+ /* handle tunable overrides */
+ if (scn->scn_checkpointing || scn->scn_clearing) {
+ if (zfs_scan_issue_strategy == 1) {
+ return (range_tree_first(queue->q_exts_by_addr));
+ } else if (zfs_scan_issue_strategy == 2) {
+ return (avl_first(&queue->q_exts_by_size));
+ }
+ }
+
+ /*
+ * During normal clearing, we want to issue our largest segments
+ * first, keeping IO as sequential as possible, and leaving the
+ * smaller extents for later with the hope that they might eventually
+ * grow to larger sequential segments. However, when the scan is
+ * checkpointing, no new extents will be added to the sorting queue,
+ * so the way we are sorted now is as good as it will ever get.
+ * In this case, we instead switch to issuing extents in LBA order.
+ */
+ if (scn->scn_checkpointing) {
+ return (range_tree_first(queue->q_exts_by_addr));
+ } else if (scn->scn_clearing) {
+ return (avl_first(&queue->q_exts_by_size));
+ } else {
+ return (NULL);
+ }
+}
+
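+/*
+ * Taskq worker for one top-level vdev's scan queue: fetches extents and
+ * issues their sios until the queue is drained or a suspend is needed.
+ */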
+static void
+scan_io_queues_run_one(void *arg)
+{
+ dsl_scan_io_queue_t *queue = arg;
+ kmutex_t *q_lock = &queue->q_vd->vdev_scan_io_queue_lock;
+ boolean_t suspended = B_FALSE;
+ range_seg_t *rs = NULL;
+ scan_io_t *sio = NULL;
+ list_t sio_list;
+ uint64_t bytes_per_leaf = zfs_scan_vdev_limit;
+ uint64_t nr_leaves = dsl_scan_count_leaves(queue->q_vd);
+
+ ASSERT(queue->q_scn->scn_is_sorted);
+
+ list_create(&sio_list, sizeof (scan_io_t),
+ offsetof(scan_io_t, sio_nodes.sio_list_node));
+ mutex_enter(q_lock);
+
+ /* calculate maximum in-flight bytes for this txg (min 1MB) */
+ queue->q_maxinflight_bytes =
+ MAX(nr_leaves * bytes_per_leaf, 1ULL << 20);
+
+ /* reset per-queue scan statistics for this txg */
+ queue->q_total_seg_size_this_txg = 0;
+ queue->q_segs_this_txg = 0;
+ queue->q_total_zio_size_this_txg = 0;
+ queue->q_zios_this_txg = 0;
+
+ /* loop until we have run out of time or sios */
+	while ((rs = (range_seg_t *)scan_io_queue_fetch_ext(queue)) != NULL) {
+ uint64_t seg_start = 0, seg_end = 0;
+ boolean_t more_left = B_TRUE;
+
+ ASSERT(list_is_empty(&sio_list));
+
+ /* loop while we still have sios left to process in this rs */
+ while (more_left) {
+ scan_io_t *first_sio, *last_sio;
+
+ /*
+ * We have selected which extent needs to be
+ * processed next. Gather up the corresponding sios.
+ */
+ more_left = scan_io_queue_gather(queue, rs, &sio_list);
+ ASSERT(!list_is_empty(&sio_list));
+ first_sio = list_head(&sio_list);
+ last_sio = list_tail(&sio_list);
+
+ seg_end = last_sio->sio_offset + last_sio->sio_asize;
+ if (seg_start == 0)
+ seg_start = first_sio->sio_offset;
+
+ /*
+ * Issuing sios can take a long time so drop the
+ * queue lock. The sio queue won't be updated by
+ * other threads since we're in syncing context so
+ * we can be sure that our trees will remain exactly
+ * as we left them.
+ */
+ mutex_exit(q_lock);
+ suspended = scan_io_queue_issue(queue, &sio_list);
+ mutex_enter(q_lock);
+
+ if (suspended)
+ break;
+ }
+ /* update statistics for debugging purposes */
+ scan_io_queues_update_seg_stats(queue, seg_start, seg_end);
+
+ if (suspended)
+ break;
+ }
+
+	/*
+	 * If we were suspended in the middle of processing, requeue any
+	 * unfinished sios and exit.
+	 */
+ while ((sio = list_head(&sio_list)) != NULL) {
+ list_remove(&sio_list, sio);
+ scan_io_queue_insert_impl(queue, sio);
+ }
+
+ mutex_exit(q_lock);
+ list_destroy(&sio_list);
+}
+
+/*
+ * Performs an emptying run on all scan queues in the pool. This just
+ * punches out one thread per top-level vdev, each of which processes
+ * only that vdev's scan queue. We can parallelize the I/O here because
+ * we know that each queue's io's only affect its own top-level vdev.
+ *
+ * This function waits for the queue runs to complete, and must be
+ * called from dsl_scan_sync (or in general, syncing context).
+ */
+static void
+scan_io_queues_run(dsl_scan_t *scn)
+{
+ spa_t *spa = scn->scn_dp->dp_spa;
+
+ ASSERT(scn->scn_is_sorted);
+ ASSERT(spa_config_held(spa, SCL_CONFIG, RW_READER));
+
+ if (scn->scn_bytes_pending == 0)
+ return;
+
+ if (scn->scn_taskq == NULL) {
+ char *tq_name = kmem_zalloc(ZFS_MAX_DATASET_NAME_LEN + 16,
+ KM_SLEEP);
+ int nthreads = spa->spa_root_vdev->vdev_children;
+
+ /*
+ * We need to make this taskq *always* execute as many
+ * threads in parallel as we have top-level vdevs and no
+ * less, otherwise strange serialization of the calls to
+ * scan_io_queues_run_one can occur during spa_sync runs
+ * and that significantly impacts performance.
+ */
+ (void) snprintf(tq_name, ZFS_MAX_DATASET_NAME_LEN + 16,
+ "dsl_scan_tq_%s", spa->spa_name);
+ scn->scn_taskq = taskq_create(tq_name, nthreads, minclsyspri,
+ nthreads, nthreads, TASKQ_PREPOPULATE);
+ kmem_free(tq_name, ZFS_MAX_DATASET_NAME_LEN + 16);
+ }
+
+ for (uint64_t i = 0; i < spa->spa_root_vdev->vdev_children; i++) {
+ vdev_t *vd = spa->spa_root_vdev->vdev_child[i];
+
+ mutex_enter(&vd->vdev_scan_io_queue_lock);
+ if (vd->vdev_scan_io_queue != NULL) {
+ VERIFY(taskq_dispatch(scn->scn_taskq,
+ scan_io_queues_run_one, vd->vdev_scan_io_queue,
+ TQ_SLEEP) != TASKQID_INVALID);
+ }
+ mutex_exit(&vd->vdev_scan_io_queue_lock);
+ }
+
+ /*
+	 * Wait for the queues to finish issuing their IOs for this run
+ * before we return. There may still be IOs in flight at this
+ * point.
+ */
+ taskq_wait(scn->scn_taskq);
+}
+
+static boolean_t
+dsl_scan_async_block_should_pause(dsl_scan_t *scn)
+{
+ uint64_t elapsed_nanosecs;
+
+ if (zfs_recover)
+ return (B_FALSE);
+
+ if (scn->scn_visited_this_txg >= zfs_async_block_max_blocks)
+ return (B_TRUE);
+
+ elapsed_nanosecs = gethrtime() - scn->scn_sync_start_time;
+ return (elapsed_nanosecs / NANOSEC > zfs_txg_timeout ||
+ (NSEC2MSEC(elapsed_nanosecs) > scn->scn_async_block_min_time_ms &&
+ txg_sync_waiting(scn->scn_dp)) ||
+ spa_shutting_down(scn->scn_dp->dp_spa));
+}
+
+static int
+dsl_scan_free_block_cb(void *arg, const blkptr_t *bp, dmu_tx_t *tx)
+{
+ dsl_scan_t *scn = arg;
+
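+
+	/*
+	 * bptree traversals may only be suspended at level-0, non-objset
+	 * blocks; plain bpobj traversals may be suspended anywhere.
+	 */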
+ if (!scn->scn_is_bptree ||
+ (BP_GET_LEVEL(bp) == 0 && BP_GET_TYPE(bp) != DMU_OT_OBJSET)) {
+ if (dsl_scan_async_block_should_pause(scn))
+ return (SET_ERROR(ERESTART));
+ }
+
+ zio_nowait(zio_free_sync(scn->scn_zio_root, scn->scn_dp->dp_spa,
+ dmu_tx_get_txg(tx), bp, BP_GET_PSIZE(bp), 0));
+ dsl_dir_diduse_space(tx->tx_pool->dp_free_dir, DD_USED_HEAD,
+ -bp_get_dsize_sync(scn->scn_dp->dp_spa, bp),
+ -BP_GET_PSIZE(bp), -BP_GET_UCSIZE(bp), tx);
+ scn->scn_visited_this_txg++;
+ return (0);
+}
+
+static void
+dsl_scan_update_stats(dsl_scan_t *scn)
+{
+ spa_t *spa = scn->scn_dp->dp_spa;
+ uint64_t i;
+ uint64_t seg_size_total = 0, zio_size_total = 0;
+ uint64_t seg_count_total = 0, zio_count_total = 0;
+
+ for (i = 0; i < spa->spa_root_vdev->vdev_children; i++) {
+ vdev_t *vd = spa->spa_root_vdev->vdev_child[i];
+ dsl_scan_io_queue_t *queue = vd->vdev_scan_io_queue;
+
+ if (queue == NULL)
+ continue;
+
+ seg_size_total += queue->q_total_seg_size_this_txg;
+ zio_size_total += queue->q_total_zio_size_this_txg;
+ seg_count_total += queue->q_segs_this_txg;
+ zio_count_total += queue->q_zios_this_txg;
+ }
+
+ if (seg_count_total == 0 || zio_count_total == 0) {
+ scn->scn_avg_seg_size_this_txg = 0;
+ scn->scn_avg_zio_size_this_txg = 0;
+ scn->scn_segs_this_txg = 0;
+ scn->scn_zios_this_txg = 0;
+ return;
+ }
+
+ scn->scn_avg_seg_size_this_txg = seg_size_total / seg_count_total;
+ scn->scn_avg_zio_size_this_txg = zio_size_total / zio_count_total;
+ scn->scn_segs_this_txg = seg_count_total;
+ scn->scn_zios_this_txg = zio_count_total;
+}
+
+static int
+dsl_scan_obsolete_block_cb(void *arg, const blkptr_t *bp, dmu_tx_t *tx)
+{
+ dsl_scan_t *scn = arg;
+ const dva_t *dva = &bp->blk_dva[0];
+
+ if (dsl_scan_async_block_should_pause(scn))
+ return (SET_ERROR(ERESTART));
+
+ spa_vdev_indirect_mark_obsolete(scn->scn_dp->dp_spa,
+ DVA_GET_VDEV(dva), DVA_GET_OFFSET(dva),
+ DVA_GET_ASIZE(dva), tx);
+ scn->scn_visited_this_txg++;
+ return (0);
+}
+
+boolean_t
+dsl_scan_active(dsl_scan_t *scn)
+{
+ spa_t *spa = scn->scn_dp->dp_spa;
+ uint64_t used = 0, comp, uncomp;
+
+ if (spa->spa_load_state != SPA_LOAD_NONE)
+ return (B_FALSE);
+ if (spa_shutting_down(spa))
+ return (B_FALSE);
+ if ((dsl_scan_is_running(scn) && !dsl_scan_is_paused_scrub(scn)) ||
+ (scn->scn_async_destroying && !scn->scn_async_stalled))
+ return (B_TRUE);
+
+ if (spa_version(scn->scn_dp->dp_spa) >= SPA_VERSION_DEADLISTS) {
+ (void) bpobj_space(&scn->scn_dp->dp_free_bpobj,
+ &used, &comp, &uncomp);
+ }
+ return (used != 0);
+}
+
+static boolean_t
+dsl_scan_need_resilver(spa_t *spa, const dva_t *dva, size_t psize,
+ uint64_t phys_birth)
+{
+ vdev_t *vd;
+
+ vd = vdev_lookup_top(spa, DVA_GET_VDEV(dva));
+
+ if (vd->vdev_ops == &vdev_indirect_ops) {
+ /*
+ * The indirect vdev can point to multiple
+ * vdevs. For simplicity, always create
+ * the resilver zio_t. zio_vdev_io_start()
+ * will bypass the child resilver i/o's if
+ * they are on vdevs that don't have DTL's.
+ */
+ return (B_TRUE);
+ }
+
+ if (DVA_GET_GANG(dva)) {
+ /*
+ * Gang members may be spread across multiple
+ * vdevs, so the best estimate we have is the
+ * scrub range, which has already been checked.
+ * XXX -- it would be better to change our
+ * allocation policy to ensure that all
+ * gang members reside on the same vdev.
+ */
+ return (B_TRUE);
+ }
+
+ /*
+ * Check if the txg falls within the range which must be
+ * resilvered. DVAs outside this range can always be skipped.
+ */
+ if (!vdev_dtl_contains(vd, DTL_PARTIAL, phys_birth, 1))
+ return (B_FALSE);
+
+ /*
+ * Check if the top-level vdev must resilver this offset.
+ * When the offset does not intersect with a dirty leaf DTL
+ * then it may be possible to skip the resilver IO. The psize
+ * is provided instead of asize to simplify the check for RAIDZ.
+ */
+ if (!vdev_dtl_need_resilver(vd, DVA_GET_OFFSET(dva), psize))
+ return (B_FALSE);
+
+ return (B_TRUE);
+}
+
+static int
+dsl_process_async_destroys(dsl_pool_t *dp, dmu_tx_t *tx)
+{
+ int err = 0;
+ dsl_scan_t *scn = dp->dp_scan;
+ spa_t *spa = dp->dp_spa;
+
+ if (spa_suspend_async_destroy(spa))
+ return (0);
+
+ if (zfs_free_bpobj_enabled &&
+ spa_version(spa) >= SPA_VERSION_DEADLISTS) {
+ scn->scn_is_bptree = B_FALSE;
+ scn->scn_async_block_min_time_ms = zfs_free_min_time_ms;
+ scn->scn_zio_root = zio_root(spa, NULL,
+ NULL, ZIO_FLAG_MUSTSUCCEED);
+ err = bpobj_iterate(&dp->dp_free_bpobj,
+ dsl_scan_free_block_cb, scn, tx);
+ VERIFY0(zio_wait(scn->scn_zio_root));
+ scn->scn_zio_root = NULL;
+
+ if (err != 0 && err != ERESTART)
+ zfs_panic_recover("error %u from bpobj_iterate()", err);
+ }
+
+ if (err == 0 && spa_feature_is_active(spa, SPA_FEATURE_ASYNC_DESTROY)) {
+ ASSERT(scn->scn_async_destroying);
+ scn->scn_is_bptree = B_TRUE;
+ scn->scn_zio_root = zio_root(spa, NULL,
+ NULL, ZIO_FLAG_MUSTSUCCEED);
+ err = bptree_iterate(dp->dp_meta_objset,
+ dp->dp_bptree_obj, B_TRUE, dsl_scan_free_block_cb, scn, tx);
+ VERIFY0(zio_wait(scn->scn_zio_root));
+ scn->scn_zio_root = NULL;
+
+ if (err == EIO || err == ECKSUM) {
+ err = 0;
+ } else if (err != 0 && err != ERESTART) {
+ zfs_panic_recover("error %u from "
+ "traverse_dataset_destroyed()", err);
+ }
+
+ if (bptree_is_empty(dp->dp_meta_objset, dp->dp_bptree_obj)) {
+ /* finished; deactivate async destroy feature */
+ spa_feature_decr(spa, SPA_FEATURE_ASYNC_DESTROY, tx);
+ ASSERT(!spa_feature_is_active(spa,
+ SPA_FEATURE_ASYNC_DESTROY));
+ VERIFY0(zap_remove(dp->dp_meta_objset,
+ DMU_POOL_DIRECTORY_OBJECT,
+ DMU_POOL_BPTREE_OBJ, tx));
+ VERIFY0(bptree_free(dp->dp_meta_objset,
+ dp->dp_bptree_obj, tx));
+ dp->dp_bptree_obj = 0;
+ scn->scn_async_destroying = B_FALSE;
+ scn->scn_async_stalled = B_FALSE;
+ } else {
+ /*
+ * If we didn't make progress, mark the async
+ * destroy as stalled, so that we will not initiate
+ * a spa_sync() on its behalf. Note that we only
+ * check this if we are not finished, because if the
+ * bptree had no blocks for us to visit, we can
+ * finish without "making progress".
+ */
+ scn->scn_async_stalled =
+ (scn->scn_visited_this_txg == 0);
+ }
+ }
+ if (scn->scn_visited_this_txg) {
+ zfs_dbgmsg("freed %llu blocks in %llums from "
+ "free_bpobj/bptree txg %llu; err=%d",
+ (longlong_t)scn->scn_visited_this_txg,
+ (longlong_t)
+ NSEC2MSEC(gethrtime() - scn->scn_sync_start_time),
+ (longlong_t)tx->tx_txg, err);
+ scn->scn_visited_this_txg = 0;
+
+ /*
+ * Write out changes to the DDT that may be required as a
+ * result of the blocks freed. This ensures that the DDT
+ * is clean when a scrub/resilver runs.
+ */
+ ddt_sync(spa, tx->tx_txg);
+ }
+ if (err != 0)
+ return (err);
+ if (dp->dp_free_dir != NULL && !scn->scn_async_destroying &&
+ zfs_free_leak_on_eio &&
+ (dsl_dir_phys(dp->dp_free_dir)->dd_used_bytes != 0 ||
+ dsl_dir_phys(dp->dp_free_dir)->dd_compressed_bytes != 0 ||
+ dsl_dir_phys(dp->dp_free_dir)->dd_uncompressed_bytes != 0)) {
+ /*
+ * We have finished background destroying, but there is still
+ * some space left in the dp_free_dir. Transfer this leaked
+ * space to the dp_leak_dir.
+ */
+ if (dp->dp_leak_dir == NULL) {
+ rrw_enter(&dp->dp_config_rwlock, RW_WRITER, FTAG);
+ (void) dsl_dir_create_sync(dp, dp->dp_root_dir,
+ LEAK_DIR_NAME, tx);
+ VERIFY0(dsl_pool_open_special_dir(dp,
+ LEAK_DIR_NAME, &dp->dp_leak_dir));
+ rrw_exit(&dp->dp_config_rwlock, FTAG);
+ }
+ dsl_dir_diduse_space(dp->dp_leak_dir, DD_USED_HEAD,
+ dsl_dir_phys(dp->dp_free_dir)->dd_used_bytes,
+ dsl_dir_phys(dp->dp_free_dir)->dd_compressed_bytes,
+ dsl_dir_phys(dp->dp_free_dir)->dd_uncompressed_bytes, tx);
+ dsl_dir_diduse_space(dp->dp_free_dir, DD_USED_HEAD,
+ -dsl_dir_phys(dp->dp_free_dir)->dd_used_bytes,
+ -dsl_dir_phys(dp->dp_free_dir)->dd_compressed_bytes,
+ -dsl_dir_phys(dp->dp_free_dir)->dd_uncompressed_bytes, tx);
+ }
+
+ if (dp->dp_free_dir != NULL && !scn->scn_async_destroying) {
+ /* finished; verify that space accounting went to zero */
+ ASSERT0(dsl_dir_phys(dp->dp_free_dir)->dd_used_bytes);
+ ASSERT0(dsl_dir_phys(dp->dp_free_dir)->dd_compressed_bytes);
+ ASSERT0(dsl_dir_phys(dp->dp_free_dir)->dd_uncompressed_bytes);
+ }
+
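+	/*
+	 * The obsolete bpobj must be open exactly when its entry exists in
+	 * the MOS directory; assert that the two views agree.
+	 */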
+ EQUIV(bpobj_is_open(&dp->dp_obsolete_bpobj),
+ 0 == zap_contains(dp->dp_meta_objset, DMU_POOL_DIRECTORY_OBJECT,
+ DMU_POOL_OBSOLETE_BPOBJ));
+ if (err == 0 && bpobj_is_open(&dp->dp_obsolete_bpobj)) {
+ ASSERT(spa_feature_is_active(dp->dp_spa,
+ SPA_FEATURE_OBSOLETE_COUNTS));
+
+ scn->scn_is_bptree = B_FALSE;
+ scn->scn_async_block_min_time_ms = zfs_obsolete_min_time_ms;
+ err = bpobj_iterate(&dp->dp_obsolete_bpobj,
+ dsl_scan_obsolete_block_cb, scn, tx);
+ if (err != 0 && err != ERESTART)
+ zfs_panic_recover("error %u from bpobj_iterate()", err);
+
+ if (bpobj_is_empty(&dp->dp_obsolete_bpobj))
+ dsl_pool_destroy_obsolete_bpobj(dp, tx);
+ }
+
+ return (0);
+}
+
+/*
+ * This is the primary entry point for scans that is called from syncing
+ * context. Scans must happen entirely during syncing context so that we
+ * can guarantee that blocks we are currently scanning will not change out
+ * from under us. While a scan is active, this function controls how quickly
+ * transaction groups proceed, instead of the normal handling provided by
+ * txg_sync_thread().
+ */
+void
+dsl_scan_sync(dsl_pool_t *dp, dmu_tx_t *tx)
+{
+ dsl_scan_t *scn = dp->dp_scan;
+ spa_t *spa = dp->dp_spa;
+ int err = 0;
+ state_sync_type_t sync_type = SYNC_OPTIONAL;
+
+ /*
+ * Check for scn_restart_txg before checking spa_load_state, so
+ * that we can restart an old-style scan while the pool is being
+ * imported (see dsl_scan_init).
+ */
+ if (dsl_scan_restarting(scn, tx)) {
+ pool_scan_func_t func = POOL_SCAN_SCRUB;
+ dsl_scan_done(scn, B_FALSE, tx);
+ if (vdev_resilver_needed(spa->spa_root_vdev, NULL, NULL))
+ func = POOL_SCAN_RESILVER;
+ zfs_dbgmsg("restarting scan func=%u txg=%llu",
+ func, (longlong_t)tx->tx_txg);
+ dsl_scan_setup_sync(&func, tx);
+ }
+
+ /*
+ * Only process scans in sync pass 1.
+ */
+ if (spa_sync_pass(dp->dp_spa) > 1)
+ return;
+
+ /*
+ * If the spa is shutting down, then stop scanning. This will
+ * ensure that the scan does not dirty any new data during the
+ * shutdown phase.
+ */
+ if (spa_shutting_down(spa))
+ return;
+
+ /*
+ * If the scan is inactive due to a stalled async destroy, try again.
+ */
+ if (!scn->scn_async_stalled && !dsl_scan_active(scn))
+ return;
+
+ /* reset scan statistics */
+ scn->scn_visited_this_txg = 0;
+ scn->scn_holes_this_txg = 0;
+ scn->scn_lt_min_this_txg = 0;
+ scn->scn_gt_max_this_txg = 0;
+ scn->scn_ddt_contained_this_txg = 0;
+ scn->scn_objsets_visited_this_txg = 0;
+ scn->scn_avg_seg_size_this_txg = 0;
+ scn->scn_segs_this_txg = 0;
+ scn->scn_avg_zio_size_this_txg = 0;
+ scn->scn_zios_this_txg = 0;
+ scn->scn_suspending = B_FALSE;
+ scn->scn_sync_start_time = gethrtime();
+ spa->spa_scrub_active = B_TRUE;
+
+ /*
+ * First process the async destroys. If we pause, don't do
+ * any scrubbing or resilvering. This ensures that there are no
+ * async destroys while we are scanning, so the scan code doesn't
+ * have to worry about traversing it. It is also faster to free the
+ * blocks than to scrub them.
+ */
+ err = dsl_process_async_destroys(dp, tx);
+ if (err != 0)
+ return;
+
+ if (!dsl_scan_is_running(scn) || dsl_scan_is_paused_scrub(scn))
+ return;
+
+ /*
+ * Wait a few txgs after importing to begin scanning so that
+ * we can get the pool imported quickly.
+ */
+ if (spa->spa_syncing_txg < spa->spa_first_txg + SCAN_IMPORT_WAIT_TXGS)
+ return;
+
+ /*
+ * It is possible to switch from unsorted to sorted at any time,
+ * but afterwards the scan will remain sorted unless reloaded from
+ * a checkpoint after a reboot.
+ */
+ if (!zfs_scan_legacy) {
+ scn->scn_is_sorted = B_TRUE;
+ if (scn->scn_last_checkpoint == 0)
+ scn->scn_last_checkpoint = ddi_get_lbolt();
+ }
+
+ /*
+ * For sorted scans, determine what kind of work we will be doing
+ * this txg based on our memory limitations and whether or not we
+ * need to perform a checkpoint.
+ */
+ if (scn->scn_is_sorted) {
+ /*
+ * If we are over our checkpoint interval, set scn_clearing
+ * so that we can begin checkpointing immediately. The
+		 * checkpoint allows us to save a consistent bookmark
+		 * representing how much data we have scrubbed so far.
+		 * Otherwise, use the memory limit to determine if we should
+		 * scan for metadata or start issuing scrub IOs. We accumulate
+ * metadata until we hit our hard memory limit at which point
+ * we issue scrub IOs until we are at our soft memory limit.
+ */
+ if (scn->scn_checkpointing ||
+ ddi_get_lbolt() - scn->scn_last_checkpoint >
+ SEC_TO_TICK(zfs_scan_checkpoint_intval)) {
+ if (!scn->scn_checkpointing)
+ zfs_dbgmsg("begin scan checkpoint");
+
+ scn->scn_checkpointing = B_TRUE;
+ scn->scn_clearing = B_TRUE;
+ } else {
+ boolean_t should_clear = dsl_scan_should_clear(scn);
+ if (should_clear && !scn->scn_clearing) {
+ zfs_dbgmsg("begin scan clearing");
+ scn->scn_clearing = B_TRUE;
+ } else if (!should_clear && scn->scn_clearing) {
+ zfs_dbgmsg("finish scan clearing");
+ scn->scn_clearing = B_FALSE;
+ }
+ }
+ } else {
+ ASSERT0(scn->scn_checkpointing);
+ ASSERT0(scn->scn_clearing);
+ }
+
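+	/*
+	 * Each txg we do one of three things: scan metadata to fill the
+	 * per-vdev sorting queues, issue the scrub IOs that have been
+	 * queued, or (once issuing has caught up) mark the scan complete.
+	 */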
+ if (!scn->scn_clearing && scn->scn_done_txg == 0) {
+ /* Need to scan metadata for more blocks to scrub */
+ dsl_scan_phys_t *scnp = &scn->scn_phys;
+ taskqid_t prefetch_tqid;
+ uint64_t bytes_per_leaf = zfs_scan_vdev_limit;
+ uint64_t nr_leaves = dsl_scan_count_leaves(spa->spa_root_vdev);
+
+ /*
+ * Recalculate the max number of in-flight bytes for pool-wide
+ * scanning operations (minimum 1MB). Limits for the issuing
+ * phase are done per top-level vdev and are handled separately.
+ */
+ scn->scn_maxinflight_bytes =
+ MAX(nr_leaves * bytes_per_leaf, 1ULL << 20);
+
+ if (scnp->scn_ddt_bookmark.ddb_class <=
+ scnp->scn_ddt_class_max) {
+ ASSERT(ZB_IS_ZERO(&scnp->scn_bookmark));
+ zfs_dbgmsg("doing scan sync txg %llu; "
+ "ddt bm=%llu/%llu/%llu/%llx",
+ (longlong_t)tx->tx_txg,
+ (longlong_t)scnp->scn_ddt_bookmark.ddb_class,
+ (longlong_t)scnp->scn_ddt_bookmark.ddb_type,
+ (longlong_t)scnp->scn_ddt_bookmark.ddb_checksum,
+ (longlong_t)scnp->scn_ddt_bookmark.ddb_cursor);
+ } else {
+ zfs_dbgmsg("doing scan sync txg %llu; "
+ "bm=%llu/%llu/%llu/%llu",
+ (longlong_t)tx->tx_txg,
+ (longlong_t)scnp->scn_bookmark.zb_objset,
+ (longlong_t)scnp->scn_bookmark.zb_object,
+ (longlong_t)scnp->scn_bookmark.zb_level,
+ (longlong_t)scnp->scn_bookmark.zb_blkid);
+ }
+
+ scn->scn_zio_root = zio_root(dp->dp_spa, NULL,
+ NULL, ZIO_FLAG_CANFAIL);
+
+ scn->scn_prefetch_stop = B_FALSE;
+ prefetch_tqid = taskq_dispatch(dp->dp_sync_taskq,
+ dsl_scan_prefetch_thread, scn, TQ_SLEEP);
+ ASSERT(prefetch_tqid != TASKQID_INVALID);
+
+ dsl_pool_config_enter(dp, FTAG);
+ dsl_scan_visit(scn, tx);
+ dsl_pool_config_exit(dp, FTAG);
+
+ mutex_enter(&dp->dp_spa->spa_scrub_lock);
+ scn->scn_prefetch_stop = B_TRUE;
+ cv_broadcast(&spa->spa_scrub_io_cv);
+ mutex_exit(&dp->dp_spa->spa_scrub_lock);
+
+ taskq_wait_id(dp->dp_sync_taskq, prefetch_tqid);
+ (void) zio_wait(scn->scn_zio_root);
+ scn->scn_zio_root = NULL;
+
+ zfs_dbgmsg("scan visited %llu blocks in %llums "
+ "(%llu os's, %llu holes, %llu < mintxg, "
+ "%llu in ddt, %llu > maxtxg)",
+ (longlong_t)scn->scn_visited_this_txg,
+ (longlong_t)NSEC2MSEC(gethrtime() -
+ scn->scn_sync_start_time),
+ (longlong_t)scn->scn_objsets_visited_this_txg,
+ (longlong_t)scn->scn_holes_this_txg,
+ (longlong_t)scn->scn_lt_min_this_txg,
+ (longlong_t)scn->scn_ddt_contained_this_txg,
+ (longlong_t)scn->scn_gt_max_this_txg);
+
+ if (!scn->scn_suspending) {
+ ASSERT0(avl_numnodes(&scn->scn_queue));
+ scn->scn_done_txg = tx->tx_txg + 1;
+ if (scn->scn_is_sorted) {
+ scn->scn_checkpointing = B_TRUE;
+ scn->scn_clearing = B_TRUE;
+ }
+ zfs_dbgmsg("scan complete txg %llu",
+ (longlong_t)tx->tx_txg);
+ }
+ } else if (scn->scn_is_sorted && scn->scn_bytes_pending != 0) {
+ ASSERT(scn->scn_clearing);
+
+ /* need to issue scrubbing IOs from per-vdev queues */
+ scn->scn_zio_root = zio_root(dp->dp_spa, NULL,
+ NULL, ZIO_FLAG_CANFAIL);
+ scan_io_queues_run(scn);
+ (void) zio_wait(scn->scn_zio_root);
+ scn->scn_zio_root = NULL;
+
+ /* calculate and dprintf the current memory usage */
+ (void) dsl_scan_should_clear(scn);
+ dsl_scan_update_stats(scn);
+
+ zfs_dbgmsg("scrubbed %llu blocks (%llu segs) in %llums "
+ "(avg_block_size = %llu, avg_seg_size = %llu)",
+ (longlong_t)scn->scn_zios_this_txg,
+ (longlong_t)scn->scn_segs_this_txg,
+ (longlong_t)NSEC2MSEC(gethrtime() -
+ scn->scn_sync_start_time),
+ (longlong_t)scn->scn_avg_zio_size_this_txg,
+ (longlong_t)scn->scn_avg_seg_size_this_txg);
+ } else if (scn->scn_done_txg != 0 && scn->scn_done_txg <= tx->tx_txg) {
+ /* Finished with everything. Mark the scrub as complete */
+ zfs_dbgmsg("scan issuing complete txg %llu",
+ (longlong_t)tx->tx_txg);
+ ASSERT3U(scn->scn_done_txg, !=, 0);
+ ASSERT0(spa->spa_scrub_inflight);
+ ASSERT0(scn->scn_bytes_pending);
+ dsl_scan_done(scn, B_TRUE, tx);
+ sync_type = SYNC_MANDATORY;
+ }
+
+ dsl_scan_sync_state(scn, tx, sync_type);
+}
+
+static void
+count_block(dsl_scan_t *scn, zfs_all_blkstats_t *zab, const blkptr_t *bp)
+{
+ int i;
+
+ /* update the spa's stats on how many bytes we have issued */
+ for (i = 0; i < BP_GET_NDVAS(bp); i++) {
+ atomic_add_64(&scn->scn_dp->dp_spa->spa_scan_pass_issued,
+ DVA_GET_ASIZE(&bp->blk_dva[i]));
+ }
+
+ /*
+ * If we resume after a reboot, zab will be NULL; don't record
+ * incomplete stats in that case.
+ */
+ if (zab == NULL)
+ return;
+
+ mutex_enter(&zab->zab_lock);
+
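+	/*
+	 * Each block is counted in four buckets: (level, DMU_OT_TOTAL),
+	 * (level, type), (DN_MAX_LEVELS, DMU_OT_TOTAL) and
+	 * (DN_MAX_LEVELS, type), so per-level and aggregate histograms
+	 * are maintained in a single pass.
+	 */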
+ for (i = 0; i < 4; i++) {
+ int l = (i < 2) ? BP_GET_LEVEL(bp) : DN_MAX_LEVELS;
+ int t = (i & 1) ? BP_GET_TYPE(bp) : DMU_OT_TOTAL;
+ if (t & DMU_OT_NEWTYPE)
+ t = DMU_OT_OTHER;
+ zfs_blkstat_t *zb = &zab->zab_type[l][t];
+ int equal;
+
+ zb->zb_count++;
+ zb->zb_asize += BP_GET_ASIZE(bp);
+ zb->zb_lsize += BP_GET_LSIZE(bp);
+ zb->zb_psize += BP_GET_PSIZE(bp);
+ zb->zb_gangs += BP_COUNT_GANG(bp);
+
+ switch (BP_GET_NDVAS(bp)) {
+ case 2:
+ if (DVA_GET_VDEV(&bp->blk_dva[0]) ==
+ DVA_GET_VDEV(&bp->blk_dva[1]))
+ zb->zb_ditto_2_of_2_samevdev++;
+ break;
+ case 3:
+ equal = (DVA_GET_VDEV(&bp->blk_dva[0]) ==
+ DVA_GET_VDEV(&bp->blk_dva[1])) +
+ (DVA_GET_VDEV(&bp->blk_dva[0]) ==
+ DVA_GET_VDEV(&bp->blk_dva[2])) +
+ (DVA_GET_VDEV(&bp->blk_dva[1]) ==
+ DVA_GET_VDEV(&bp->blk_dva[2]));
+ if (equal == 1)
+ zb->zb_ditto_2_of_3_samevdev++;
+ else if (equal == 3)
+ zb->zb_ditto_3_of_3_samevdev++;
+ break;
+ }
+ }
+
+ mutex_exit(&zab->zab_lock);
+}
+
+static void
+scan_io_queue_insert_impl(dsl_scan_io_queue_t *queue, scan_io_t *sio)
+{
+ avl_index_t idx;
+ int64_t asize = sio->sio_asize;
+ dsl_scan_t *scn = queue->q_scn;
+
+ ASSERT(MUTEX_HELD(&queue->q_vd->vdev_scan_io_queue_lock));
+
+ if (avl_find(&queue->q_sios_by_addr, sio, &idx) != NULL) {
+ /* block is already scheduled for reading */
+ atomic_add_64(&scn->scn_bytes_pending, -asize);
+ kmem_free(sio, sizeof (*sio));
+ return;
+ }
+ avl_insert(&queue->q_sios_by_addr, sio, idx);
+ range_tree_add(queue->q_exts_by_addr, sio->sio_offset, asize);
+}
+
+/*
+ * Given all the info we got from our metadata scanning process, we
+ * construct a scan_io_t and insert it into the scan sorting queue. The
+ * I/O must already be suitable for us to process. This is controlled
+ * by dsl_scan_enqueue().
+ */
+static void
+scan_io_queue_insert(dsl_scan_io_queue_t *queue, const blkptr_t *bp, int dva_i,
+ int zio_flags, const zbookmark_phys_t *zb)
+{
+ dsl_scan_t *scn = queue->q_scn;
+ scan_io_t *sio = kmem_zalloc(sizeof (*sio), KM_SLEEP);
+
+ ASSERT0(BP_IS_GANG(bp));
+ ASSERT(MUTEX_HELD(&queue->q_vd->vdev_scan_io_queue_lock));
+
+ bp2sio(bp, sio, dva_i);
+ sio->sio_flags = zio_flags;
+ sio->sio_zb = *zb;
+
+ /*
+ * Increment the bytes pending counter now so that we can't
+ * get an integer underflow in case the worker processes the
+ * zio before we get to incrementing this counter.
+ */
+ atomic_add_64(&scn->scn_bytes_pending, sio->sio_asize);
+
+ scan_io_queue_insert_impl(queue, sio);
+}
+
+/*
+ * Given a set of I/O parameters as discovered by the metadata traversal
+ * process, attempts to place the I/O into the sorted queues (if allowed),
+ * or immediately executes the I/O.
+ */
+static void
+dsl_scan_enqueue(dsl_pool_t *dp, const blkptr_t *bp, int zio_flags,
+ const zbookmark_phys_t *zb)
+{
+ spa_t *spa = dp->dp_spa;
+
+ ASSERT(!BP_IS_EMBEDDED(bp));
+
+ /*
+ * Gang blocks are hard to issue sequentially, so we just issue them
+ * here immediately instead of queuing them.
+ */
+ if (!dp->dp_scan->scn_is_sorted || BP_IS_GANG(bp)) {
+ scan_exec_io(dp, bp, zio_flags, zb, NULL);
+ return;
+ }
+ for (int i = 0; i < BP_GET_NDVAS(bp); i++) {
+ dva_t dva;
+ vdev_t *vdev;
+
+ dva = bp->blk_dva[i];
+ vdev = vdev_lookup_top(spa, DVA_GET_VDEV(&dva));
+ ASSERT(vdev != NULL);
+
+ mutex_enter(&vdev->vdev_scan_io_queue_lock);
+ if (vdev->vdev_scan_io_queue == NULL)
+ vdev->vdev_scan_io_queue = scan_io_queue_create(vdev);
+ ASSERT(dp->dp_scan != NULL);
+ scan_io_queue_insert(vdev->vdev_scan_io_queue, bp,
+ i, zio_flags, zb);
+ mutex_exit(&vdev->vdev_scan_io_queue_lock);
+ }
+}
+
+static int
+dsl_scan_scrub_cb(dsl_pool_t *dp,
+ const blkptr_t *bp, const zbookmark_phys_t *zb)
+{
+ dsl_scan_t *scn = dp->dp_scan;
+ spa_t *spa = dp->dp_spa;
+ uint64_t phys_birth = BP_PHYSICAL_BIRTH(bp);
+ size_t psize = BP_GET_PSIZE(bp);
+ boolean_t needs_io;
+ int zio_flags = ZIO_FLAG_SCAN_THREAD | ZIO_FLAG_RAW | ZIO_FLAG_CANFAIL;
+ int d;
+
+ if (phys_birth <= scn->scn_phys.scn_min_txg ||
+ phys_birth >= scn->scn_phys.scn_max_txg) {
+ count_block(scn, dp->dp_blkstats, bp);
+ return (0);
+ }
+
+ /* Embedded BP's have phys_birth==0, so we reject them above. */
+ ASSERT(!BP_IS_EMBEDDED(bp));
+
+ ASSERT(DSL_SCAN_IS_SCRUB_RESILVER(scn));
+ if (scn->scn_phys.scn_func == POOL_SCAN_SCRUB) {
+ zio_flags |= ZIO_FLAG_SCRUB;
+ needs_io = B_TRUE;
+ } else {
+ ASSERT3U(scn->scn_phys.scn_func, ==, POOL_SCAN_RESILVER);
+ zio_flags |= ZIO_FLAG_RESILVER;
+ needs_io = B_FALSE;
+ }
+
+ /* If it's an intent log block, failure is expected. */
+ if (zb->zb_level == ZB_ZIL_LEVEL)
+ zio_flags |= ZIO_FLAG_SPECULATIVE;
+
+ for (d = 0; d < BP_GET_NDVAS(bp); d++) {
+ const dva_t *dva = &bp->blk_dva[d];
+
+ /*
+ * Keep track of how much data we've examined so that
+ * zpool(1M) status can make useful progress reports.
+ */
+ scn->scn_phys.scn_examined += DVA_GET_ASIZE(dva);
+ spa->spa_scan_pass_exam += DVA_GET_ASIZE(dva);
+
+ /* if it's a resilver, this may not be in the target range */
+ if (!needs_io)
+ needs_io = dsl_scan_need_resilver(spa, dva, psize,
+ phys_birth);
+ }
+
+ if (needs_io && !zfs_no_scrub_io) {
+ dsl_scan_enqueue(dp, bp, zio_flags, zb);
+ } else {
+ count_block(scn, dp->dp_blkstats, bp);
+ }
+
+ /* do not relocate this block */
+ return (0);
+}
+
+static void
+dsl_scan_scrub_done(zio_t *zio)
+{
+ spa_t *spa = zio->io_spa;
+ blkptr_t *bp = zio->io_bp;
+ dsl_scan_io_queue_t *queue = zio->io_private;
+
+ abd_free(zio->io_abd);
+
+ if (queue == NULL) {
+ mutex_enter(&spa->spa_scrub_lock);
+ ASSERT3U(spa->spa_scrub_inflight, >=, BP_GET_PSIZE(bp));
+ spa->spa_scrub_inflight -= BP_GET_PSIZE(bp);
+ cv_broadcast(&spa->spa_scrub_io_cv);
+ mutex_exit(&spa->spa_scrub_lock);
+ } else {
+ mutex_enter(&queue->q_vd->vdev_scan_io_queue_lock);
+ ASSERT3U(queue->q_inflight_bytes, >=, BP_GET_PSIZE(bp));
+ queue->q_inflight_bytes -= BP_GET_PSIZE(bp);
+ cv_broadcast(&queue->q_zio_cv);
+ mutex_exit(&queue->q_vd->vdev_scan_io_queue_lock);
+ }
+
+ if (zio->io_error && (zio->io_error != ECKSUM ||
+ !(zio->io_flags & ZIO_FLAG_SPECULATIVE))) {
+ atomic_inc_64(&spa->spa_dsl_pool->dp_scan->scn_phys.scn_errors);
+ }
+}
+
+/*
+ * Given a scanning zio's information, executes the zio. The zio need
+ * not necessarily be sortable; this function simply executes the
+ * zio, no matter what it is. The optional queue argument allows the
+ * caller to specify that they want per top level vdev IO rate limiting
+ * instead of the legacy global limiting.
+ */
+static void
+scan_exec_io(dsl_pool_t *dp, const blkptr_t *bp, int zio_flags,
+ const zbookmark_phys_t *zb, dsl_scan_io_queue_t *queue)
+{
+ spa_t *spa = dp->dp_spa;
+ dsl_scan_t *scn = dp->dp_scan;
+ size_t size = BP_GET_PSIZE(bp);
+ abd_t *data = abd_alloc_for_io(size, B_FALSE);
+ unsigned int scan_delay = 0;
+
+ ASSERT3U(scn->scn_maxinflight_bytes, >, 0);
+
+ if (queue == NULL) {
+ mutex_enter(&spa->spa_scrub_lock);
+ while (spa->spa_scrub_inflight >= scn->scn_maxinflight_bytes)
+ cv_wait(&spa->spa_scrub_io_cv, &spa->spa_scrub_lock);
+ spa->spa_scrub_inflight += BP_GET_PSIZE(bp);
+ mutex_exit(&spa->spa_scrub_lock);
+ } else {
+ kmutex_t *q_lock = &queue->q_vd->vdev_scan_io_queue_lock;
+
+ mutex_enter(q_lock);
+ while (queue->q_inflight_bytes >= queue->q_maxinflight_bytes)
+ cv_wait(&queue->q_zio_cv, q_lock);
+ queue->q_inflight_bytes += BP_GET_PSIZE(bp);
+ mutex_exit(q_lock);
+ }
+
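+	/*
+	 * If the pool has seen non-scan I/O within the last zfs_scan_idle
+	 * ticks, throttle this scan I/O by the resilver- or scrub-specific
+	 * delay below.
+	 */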
+ if (zio_flags & ZIO_FLAG_RESILVER)
+ scan_delay = zfs_resilver_delay;
+ else {
+ ASSERT(zio_flags & ZIO_FLAG_SCRUB);
+ scan_delay = zfs_scrub_delay;
+ }
+
+ if (scan_delay && (ddi_get_lbolt64() - spa->spa_last_io <= zfs_scan_idle))
+ delay(MAX((int)scan_delay, 0));
+
+ count_block(dp->dp_scan, dp->dp_blkstats, bp);
+ zio_nowait(zio_read(dp->dp_scan->scn_zio_root, spa, bp, data, size,
+ dsl_scan_scrub_done, queue, ZIO_PRIORITY_SCRUB, zio_flags, zb));
+}
+
+/*
+ * This is the primary extent sorting algorithm. We balance two parameters:
+ * 1) how many bytes of I/O are in an extent
+ * 2) how well the extent is filled with I/O (as a fraction of its total size)
+ * Since we allow extents to have gaps between their constituent I/Os, it's
+ * possible to have a fairly large extent that contains the same amount of
+ * I/O bytes as a much smaller extent, which just packs the I/O more tightly.
+ * The algorithm sorts based on a score calculated from the extent's size,
+ * the relative fill volume (in %) and a "fill weight" parameter that controls
+ * the split between whether we prefer larger extents or more well populated
+ * extents:
+ *
+ * SCORE = FILL_IN_BYTES + (FILL_IN_PERCENT * FILL_IN_BYTES * FILL_WEIGHT)
+ *
+ * Example:
+ * 1) assume extsz = 64 MiB
+ * 2) assume fill = 32 MiB (extent is half full)
+ * 3) assume fill_weight = 3
+ * 4) SCORE = 32M + (((32M * 100) / 64M) * 3 * 32M) / 100
+ * SCORE = 32M + (50 * 3 * 32M) / 100
+ * SCORE = 32M + (4800M / 100)
+ * SCORE = 32M + 48M
+ * ^ ^
+ * | +--- final total relative fill-based score
+ * +--------- final total fill-based score
+ * SCORE = 80M
+ *
+ * As can be seen, at fill_weight=3, the algorithm is slightly biased towards
+ * extents that are more completely filled (in a 3:2 ratio) vs just larger.
+ * Note that as an optimization, we replace multiplication and division by
+ * 100 with bitshifting by 7 (which effectively multiplies and divides by 128).
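+ *
+ * Using the shifted form on the example above:
+ * ((32M << 7) / 64M) = 64, and (64 * fill_weight * 32M) >> 7 = 48M,
+ * giving the same SCORE = 32M + 48M = 80M.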
+ */
+static int
+ext_size_compare(const void *x, const void *y)
+{
+ const range_seg_t *rsa = x, *rsb = y;
+ uint64_t sa = rsa->rs_end - rsa->rs_start,
+ sb = rsb->rs_end - rsb->rs_start;
+ uint64_t score_a, score_b;
+
+ score_a = rsa->rs_fill + ((((rsa->rs_fill << 7) / sa) *
+ fill_weight * rsa->rs_fill) >> 7);
+ score_b = rsb->rs_fill + ((((rsb->rs_fill << 7) / sb) *
+ fill_weight * rsb->rs_fill) >> 7);
+
+ if (score_a > score_b)
+ return (-1);
+ if (score_a == score_b) {
+ if (rsa->rs_start < rsb->rs_start)
+ return (-1);
+ if (rsa->rs_start == rsb->rs_start)
+ return (0);
+ return (1);
+ }
+ return (1);
+}
+
+/*
+ * Comparator for the q_sios_by_addr tree. Sorting is simply performed
+ * based on LBA-order (from lowest to highest).
+ */
+static int
+io_addr_compare(const void *x, const void *y)
+{
+ const scan_io_t *a = x, *b = y;
+
+ if (a->sio_offset < b->sio_offset)
+ return (-1);
+ if (a->sio_offset == b->sio_offset)
+ return (0);
+ return (1);
+}
+
+/* IO queues are created on demand when they are needed. */
+static dsl_scan_io_queue_t *
+scan_io_queue_create(vdev_t *vd)
+{
+ dsl_scan_t *scn = vd->vdev_spa->spa_dsl_pool->dp_scan;
+ dsl_scan_io_queue_t *q = kmem_zalloc(sizeof (*q), KM_SLEEP);
+
+ q->q_scn = scn;
+ q->q_vd = vd;
+ cv_init(&q->q_zio_cv, NULL, CV_DEFAULT, NULL);
+ q->q_exts_by_addr = range_tree_create_impl(&rt_avl_ops,
+ &q->q_exts_by_size, ext_size_compare, zfs_scan_max_ext_gap);
+ avl_create(&q->q_sios_by_addr, io_addr_compare,
+ sizeof (scan_io_t), offsetof(scan_io_t, sio_nodes.sio_addr_node));
+
+ return (q);
+}
+
+/*
+ * Destroys a scan queue and all segments and scan_io_t's contained in it.
+ * No further execution of I/O occurs, anything pending in the queue is
+ * simply freed without being executed.
+ */
+void
+dsl_scan_io_queue_destroy(dsl_scan_io_queue_t *queue)
+{
+ dsl_scan_t *scn = queue->q_scn;
+ scan_io_t *sio;
+ void *cookie = NULL;
+ int64_t bytes_dequeued = 0;
+
+ ASSERT(MUTEX_HELD(&queue->q_vd->vdev_scan_io_queue_lock));
+
+ while ((sio = avl_destroy_nodes(&queue->q_sios_by_addr, &cookie)) !=
+ NULL) {
+ ASSERT(range_tree_contains(queue->q_exts_by_addr,
+ sio->sio_offset, sio->sio_asize));
+ bytes_dequeued += sio->sio_asize;
+ kmem_free(sio, sizeof (*sio));
+ }
+
+ atomic_add_64(&scn->scn_bytes_pending, -bytes_dequeued);
+ range_tree_vacate(queue->q_exts_by_addr, NULL, queue);
+ range_tree_destroy(queue->q_exts_by_addr);
+ avl_destroy(&queue->q_sios_by_addr);
+ cv_destroy(&queue->q_zio_cv);
+
+ kmem_free(queue, sizeof (*queue));
+}
+
+/*
+ * Properly transfers a dsl_scan_io_queue_t from `svd' to `tvd'. This is
+ * called on behalf of vdev_top_transfer when creating or destroying
+ * a mirror vdev due to zpool attach/detach.
+ */
+void
+dsl_scan_io_queue_vdev_xfer(vdev_t *svd, vdev_t *tvd)
+{
+ mutex_enter(&svd->vdev_scan_io_queue_lock);
+ mutex_enter(&tvd->vdev_scan_io_queue_lock);
+
+ VERIFY3P(tvd->vdev_scan_io_queue, ==, NULL);
+ tvd->vdev_scan_io_queue = svd->vdev_scan_io_queue;
+ svd->vdev_scan_io_queue = NULL;
+ if (tvd->vdev_scan_io_queue != NULL)
+ tvd->vdev_scan_io_queue->q_vd = tvd;
+
+ mutex_exit(&tvd->vdev_scan_io_queue_lock);
+ mutex_exit(&svd->vdev_scan_io_queue_lock);
+}
+
+static void
+scan_io_queues_destroy(dsl_scan_t *scn)
+{
+ vdev_t *rvd = scn->scn_dp->dp_spa->spa_root_vdev;
+
+ for (uint64_t i = 0; i < rvd->vdev_children; i++) {
+ vdev_t *tvd = rvd->vdev_child[i];
+
+ mutex_enter(&tvd->vdev_scan_io_queue_lock);
+ if (tvd->vdev_scan_io_queue != NULL)
+ dsl_scan_io_queue_destroy(tvd->vdev_scan_io_queue);
+ tvd->vdev_scan_io_queue = NULL;
+ mutex_exit(&tvd->vdev_scan_io_queue_lock);
+ }
+}
+
+static void
+dsl_scan_freed_dva(spa_t *spa, const blkptr_t *bp, int dva_i)
+{
+ dsl_pool_t *dp = spa->spa_dsl_pool;
+ dsl_scan_t *scn = dp->dp_scan;
+ vdev_t *vdev;
+ kmutex_t *q_lock;
+ dsl_scan_io_queue_t *queue;
+ scan_io_t srch, *sio;
+ avl_index_t idx;
+ uint64_t start, size;
+
+ vdev = vdev_lookup_top(spa, DVA_GET_VDEV(&bp->blk_dva[dva_i]));
+ ASSERT(vdev != NULL);
+ q_lock = &vdev->vdev_scan_io_queue_lock;
+ queue = vdev->vdev_scan_io_queue;
+
+ mutex_enter(q_lock);
+ if (queue == NULL) {
+ mutex_exit(q_lock);
+ return;
+ }
+
+ bp2sio(bp, &srch, dva_i);
+ start = srch.sio_offset;
+ size = srch.sio_asize;
+
+ /*
+ * We can find the zio in two states:
+ * 1) Cold, just sitting in the queue of zio's to be issued at
+ * some point in the future. In this case, all we do is
+ * remove the zio from the q_sios_by_addr tree, decrement
+ * its data volume from the containing range_seg_t and
+ * resort the q_exts_by_size tree to reflect that the
+ * range_seg_t has lost some of its 'fill'. We don't shorten
+ * the range_seg_t - this is usually rare enough not to be
+	 *    worth the extra hassle of trying to keep track of precise
+ * extent boundaries.
+ * 2) Hot, where the zio is currently in-flight in
+ * dsl_scan_issue_ios. In this case, we can't simply
+ * reach in and stop the in-flight zio's, so we instead
+ * block the caller. Eventually, dsl_scan_issue_ios will
+ * be done with issuing the zio's it gathered and will
+ * signal us.
+ */
+ sio = avl_find(&queue->q_sios_by_addr, &srch, &idx);
+ if (sio != NULL) {
+ int64_t asize = sio->sio_asize;
+ blkptr_t tmpbp;
+
+ /* Got it while it was cold in the queue */
+ ASSERT3U(start, ==, sio->sio_offset);
+ ASSERT3U(size, ==, asize);
+ avl_remove(&queue->q_sios_by_addr, sio);
+
+ ASSERT(range_tree_contains(queue->q_exts_by_addr, start, size));
+ range_tree_remove_fill(queue->q_exts_by_addr, start, size);
+
+ /*
+ * We only update scn_bytes_pending in the cold path,
+ * otherwise it will already have been accounted for as
+ * part of the zio's execution.
+ */
+ atomic_add_64(&scn->scn_bytes_pending, -asize);
+
+ /* count the block as though we issued it */
+ sio2bp(sio, &tmpbp, dva_i);
+ count_block(scn, dp->dp_blkstats, &tmpbp);
+
+ kmem_free(sio, sizeof (*sio));
+ }
+ mutex_exit(q_lock);
+}
+
+/*
+ * Callback invoked when a zio_free() zio is executing. This needs to be
+ * intercepted to prevent the zio from deallocating a portion of disk
+ * space that could then be reallocated and written to while we still
+ * have it queued up for processing.
+ */
+void
+dsl_scan_freed(spa_t *spa, const blkptr_t *bp)
+{
+ dsl_pool_t *dp = spa->spa_dsl_pool;
+ dsl_scan_t *scn = dp->dp_scan;
+
+ ASSERT(!BP_IS_EMBEDDED(bp));
+ ASSERT(scn != NULL);
+ if (!dsl_scan_is_running(scn))
+ return;
+
+ for (int i = 0; i < BP_GET_NDVAS(bp); i++)
+ dsl_scan_freed_dva(spa, bp, i);
+}
diff --git a/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/dsl_synctask.c b/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/dsl_synctask.c
new file mode 100644
index 000000000000..a78b4cb030cf
--- /dev/null
+++ b/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/dsl_synctask.c
@@ -0,0 +1,256 @@
+/*
+ * CDDL HEADER START
+ *
+ * The contents of this file are subject to the terms of the
+ * Common Development and Distribution License (the "License").
+ * You may not use this file except in compliance with the License.
+ *
+ * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
+ * or http://www.opensolaris.org/os/licensing.
+ * See the License for the specific language governing permissions
+ * and limitations under the License.
+ *
+ * When distributing Covered Code, include this CDDL HEADER in each
+ * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
+ * If applicable, add the following below this CDDL HEADER, with the
+ * fields enclosed by brackets "[]" replaced with your own identifying
+ * information: Portions Copyright [yyyy] [name of copyright owner]
+ *
+ * CDDL HEADER END
+ */
+/*
+ * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved.
+ * Copyright (c) 2012, 2017 by Delphix. All rights reserved.
+ */
+
+#include <sys/dmu.h>
+#include <sys/dmu_tx.h>
+#include <sys/dsl_pool.h>
+#include <sys/dsl_dir.h>
+#include <sys/dsl_synctask.h>
+#include <sys/metaslab.h>
+
+#define DST_AVG_BLKSHIFT 14
+
+/* ARGSUSED */
+static int
+dsl_null_checkfunc(void *arg, dmu_tx_t *tx)
+{
+ return (0);
+}
+
+static int
+dsl_sync_task_common(const char *pool, dsl_checkfunc_t *checkfunc,
+ dsl_syncfunc_t *syncfunc, dsl_sigfunc_t *sigfunc, void *arg,
+ int blocks_modified, zfs_space_check_t space_check, boolean_t early)
+{
+ spa_t *spa;
+ dmu_tx_t *tx;
+ int err;
+ dsl_sync_task_t dst = { 0 };
+ dsl_pool_t *dp;
+
+ err = spa_open(pool, &spa, FTAG);
+ if (err != 0)
+ return (err);
+ dp = spa_get_dsl(spa);
+
+top:
+ tx = dmu_tx_create_dd(dp->dp_mos_dir);
+ VERIFY0(dmu_tx_assign(tx, TXG_WAIT));
+
+ dst.dst_pool = dp;
+ dst.dst_txg = dmu_tx_get_txg(tx);
+ dst.dst_space = blocks_modified << DST_AVG_BLKSHIFT;
+ dst.dst_space_check = space_check;
+ dst.dst_checkfunc = checkfunc != NULL ? checkfunc : dsl_null_checkfunc;
+ dst.dst_syncfunc = syncfunc;
+ dst.dst_arg = arg;
+ dst.dst_error = 0;
+ dst.dst_nowaiter = B_FALSE;
+
+ dsl_pool_config_enter(dp, FTAG);
+ err = dst.dst_checkfunc(arg, tx);
+ dsl_pool_config_exit(dp, FTAG);
+
+ if (err != 0) {
+ dmu_tx_commit(tx);
+ spa_close(spa, FTAG);
+ return (err);
+ }
+
+ txg_list_t *task_list = (early) ?
+ &dp->dp_early_sync_tasks : &dp->dp_sync_tasks;
+ VERIFY(txg_list_add_tail(task_list, &dst, dst.dst_txg));
+
+ dmu_tx_commit(tx);
+
+ if (sigfunc != NULL && txg_wait_synced_sig(dp, dst.dst_txg)) {
+ /* current contract is to call func once */
+ sigfunc(arg, tx);
+ sigfunc = NULL; /* in case of an EAGAIN retry */
+ }
+ txg_wait_synced(dp, dst.dst_txg);
+
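+	/*
+	 * EAGAIN asks us to retry: wait an extra TXG_DEFER_SIZE txgs
+	 * (e.g. so deferred frees can be processed) and re-run the task.
+	 */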
+ if (dst.dst_error == EAGAIN) {
+ txg_wait_synced(dp, dst.dst_txg + TXG_DEFER_SIZE);
+ goto top;
+ }
+
+ spa_close(spa, FTAG);
+ return (dst.dst_error);
+}
+
+/*
+ * Called from open context to perform a callback in syncing context. Waits
+ * for the operation to complete.
+ *
+ * The checkfunc will be called from open context as a preliminary check
+ * which can quickly fail. If it succeeds, it will be called again from
+ * syncing context. The checkfunc should generally be designed to work
+ * properly in either context, but if necessary it can check
+ * dmu_tx_is_syncing(tx).
+ *
+ * The synctask infrastructure enforces proper locking strategy with respect
+ * to the dp_config_rwlock -- the lock will always be held when the callbacks
+ * are called. It will be held for read during the open-context (preliminary)
+ * call to the checkfunc, and then held for write from syncing context during
+ * the calls to the check and sync funcs.
+ *
+ * A dataset or pool name can be passed as the first argument. Typically,
+ * the check func will hold, check the return value of the hold, and then
+ * release the dataset. The sync func will VERIFY0(hold()) the dataset.
+ * This is safe because no changes can be made between the check and sync funcs,
+ * and the sync func will only be called if the check func successfully opened
+ * the dataset.
+ */
+int
+dsl_sync_task(const char *pool, dsl_checkfunc_t *checkfunc,
+ dsl_syncfunc_t *syncfunc, void *arg,
+ int blocks_modified, zfs_space_check_t space_check)
+{
+ return (dsl_sync_task_common(pool, checkfunc, syncfunc, NULL, arg,
+ blocks_modified, space_check, B_FALSE));
+}
+
+/*
+ * An early synctask works exactly as a standard synctask with one important
+ * difference on the way it is handled during syncing context. Standard
+ * synctasks run after we've written out all the dirty blocks of dirty
+ * datasets. Early synctasks are executed before writing out any dirty data,
+ * and thus before standard synctasks.
+ *
+ * For that reason, early synctasks can affect the process of writing dirty
+ * changes to disk for the txg that they run and should be used with caution.
+ * In addition, early synctasks should not dirty any metaslabs as this would
+ * invalidate the precondition/invariant for subsequent early synctasks.
+ * [see dsl_pool_sync() and dsl_early_sync_task_verify()]
+ */
+int
+dsl_early_sync_task(const char *pool, dsl_checkfunc_t *checkfunc,
+ dsl_syncfunc_t *syncfunc, void *arg,
+ int blocks_modified, zfs_space_check_t space_check)
+{
+ return (dsl_sync_task_common(pool, checkfunc, syncfunc, NULL, arg,
+ blocks_modified, space_check, B_TRUE));
+}
+
+/*
+ * A standard synctask that can be interrupted from a signal. The sigfunc
+ * is called once if a signal occurred while waiting for the task to sync.
+ */
+int
+dsl_sync_task_sig(const char *pool, dsl_checkfunc_t *checkfunc,
+ dsl_syncfunc_t *syncfunc, dsl_sigfunc_t *sigfunc, void *arg,
+ int blocks_modified, zfs_space_check_t space_check)
+{
+ return (dsl_sync_task_common(pool, checkfunc, syncfunc, sigfunc, arg,
+ blocks_modified, space_check, B_FALSE));
+}
+
+static void
+dsl_sync_task_nowait_common(dsl_pool_t *dp, dsl_syncfunc_t *syncfunc, void *arg,
+ int blocks_modified, zfs_space_check_t space_check, dmu_tx_t *tx,
+ boolean_t early)
+{
+ dsl_sync_task_t *dst = kmem_zalloc(sizeof (*dst), KM_SLEEP);
+
+ dst->dst_pool = dp;
+ dst->dst_txg = dmu_tx_get_txg(tx);
+ dst->dst_space = blocks_modified << DST_AVG_BLKSHIFT;
+ dst->dst_space_check = space_check;
+ dst->dst_checkfunc = dsl_null_checkfunc;
+ dst->dst_syncfunc = syncfunc;
+ dst->dst_arg = arg;
+ dst->dst_error = 0;
+ dst->dst_nowaiter = B_TRUE;
+
+ txg_list_t *task_list = (early) ?
+ &dp->dp_early_sync_tasks : &dp->dp_sync_tasks;
+ VERIFY(txg_list_add_tail(task_list, dst, dst->dst_txg));
+}
+
+void
+dsl_sync_task_nowait(dsl_pool_t *dp, dsl_syncfunc_t *syncfunc, void *arg,
+ int blocks_modified, zfs_space_check_t space_check, dmu_tx_t *tx)
+{
+ dsl_sync_task_nowait_common(dp, syncfunc, arg,
+ blocks_modified, space_check, tx, B_FALSE);
+}
+
+void
+dsl_early_sync_task_nowait(dsl_pool_t *dp, dsl_syncfunc_t *syncfunc, void *arg,
+ int blocks_modified, zfs_space_check_t space_check, dmu_tx_t *tx)
+{
+ dsl_sync_task_nowait_common(dp, syncfunc, arg,
+ blocks_modified, space_check, tx, B_TRUE);
+}
+
+/*
+ * Called in syncing context to execute the synctask.
+ */
+void
+dsl_sync_task_sync(dsl_sync_task_t *dst, dmu_tx_t *tx)
+{
+ dsl_pool_t *dp = dst->dst_pool;
+
+ ASSERT0(dst->dst_error);
+
+ /*
+ * Check for sufficient space.
+ *
+ * When the sync task was created, the caller specified the
+ * type of space checking required. See the comment in
+ * zfs_space_check_t for details on the semantics of each
+ * type of space checking.
+ *
+ * We just check against what's on-disk; we don't want any
+ * in-flight accounting to get in our way, because open context
+ * may have already used up various in-core limits
+ * (arc_tempreserve, dsl_pool_tempreserve).
+ */
+ if (dst->dst_space_check != ZFS_SPACE_CHECK_NONE) {
+ uint64_t quota = dsl_pool_unreserved_space(dp,
+ dst->dst_space_check);
+ uint64_t used = dsl_dir_phys(dp->dp_root_dir)->dd_used_bytes;
+
+ /* MOS space is triple-dittoed, so we multiply by 3. */
+ if (used + dst->dst_space * 3 > quota) {
+ dst->dst_error = SET_ERROR(ENOSPC);
+ if (dst->dst_nowaiter)
+ kmem_free(dst, sizeof (*dst));
+ return;
+ }
+ }
+
+ /*
+ * Check for errors by calling checkfunc.
+ */
+ rrw_enter(&dp->dp_config_rwlock, RW_WRITER, FTAG);
+ dst->dst_error = dst->dst_checkfunc(dst->dst_arg, tx);
+ if (dst->dst_error == 0)
+ dst->dst_syncfunc(dst->dst_arg, tx);
+ rrw_exit(&dp->dp_config_rwlock, FTAG);
+ if (dst->dst_nowaiter)
+ kmem_free(dst, sizeof (*dst));
+}
diff --git a/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/dsl_userhold.c b/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/dsl_userhold.c
new file mode 100644
index 000000000000..d0274dc4ce39
--- /dev/null
+++ b/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/dsl_userhold.c
@@ -0,0 +1,667 @@
+/*
+ * CDDL HEADER START
+ *
+ * The contents of this file are subject to the terms of the
+ * Common Development and Distribution License (the "License").
+ * You may not use this file except in compliance with the License.
+ *
+ * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
+ * or http://www.opensolaris.org/os/licensing.
+ * See the License for the specific language governing permissions
+ * and limitations under the License.
+ *
+ * When distributing Covered Code, include this CDDL HEADER in each
+ * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
+ * If applicable, add the following below this CDDL HEADER, with the
+ * fields enclosed by brackets "[]" replaced with your own identifying
+ * information: Portions Copyright [yyyy] [name of copyright owner]
+ *
+ * CDDL HEADER END
+ */
+/*
+ * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved.
+ * Copyright (c) 2012, 2017 by Delphix. All rights reserved.
+ * Copyright (c) 2013 Steven Hartland. All rights reserved.
+ */
+
+#include <sys/zfs_context.h>
+#include <sys/dsl_userhold.h>
+#include <sys/dsl_dataset.h>
+#include <sys/dsl_destroy.h>
+#include <sys/dsl_synctask.h>
+#include <sys/dmu_tx.h>
+#include <sys/zfs_onexit.h>
+#include <sys/dsl_pool.h>
+#include <sys/dsl_dir.h>
+#include <sys/zfs_ioctl.h>
+#include <sys/zap.h>
+
+typedef struct dsl_dataset_user_hold_arg {
+ nvlist_t *dduha_holds;
+ nvlist_t *dduha_chkholds;
+ nvlist_t *dduha_errlist;
+ minor_t dduha_minor;
+} dsl_dataset_user_hold_arg_t;
+
+/*
+ * If you add new checks here, you may need to add additional checks to the
+ * "temporary" case in snapshot_check() in dmu_objset.c.
+ */
+int
+dsl_dataset_user_hold_check_one(dsl_dataset_t *ds, const char *htag,
+ boolean_t temphold, dmu_tx_t *tx)
+{
+ dsl_pool_t *dp = dmu_tx_pool(tx);
+ objset_t *mos = dp->dp_meta_objset;
+ int error = 0;
+
+ ASSERT(dsl_pool_config_held(dp));
+
+ if (strlen(htag) > MAXNAMELEN)
+ return (SET_ERROR(E2BIG));
+ /* Tempholds have a more restricted length */
+ if (temphold && strlen(htag) + MAX_TAG_PREFIX_LEN >= MAXNAMELEN)
+ return (SET_ERROR(E2BIG));
+
+ /* tags must be unique (if ds already exists) */
+ if (ds != NULL && dsl_dataset_phys(ds)->ds_userrefs_obj != 0) {
+ uint64_t value;
+
+ error = zap_lookup(mos, dsl_dataset_phys(ds)->ds_userrefs_obj,
+ htag, 8, 1, &value);
+ if (error == 0)
+ error = SET_ERROR(EEXIST);
+ else if (error == ENOENT)
+ error = 0;
+ }
+
+ return (error);
+}
+
+static int
+dsl_dataset_user_hold_check(void *arg, dmu_tx_t *tx)
+{
+ dsl_dataset_user_hold_arg_t *dduha = arg;
+ dsl_pool_t *dp = dmu_tx_pool(tx);
+
+ if (spa_version(dp->dp_spa) < SPA_VERSION_USERREFS)
+ return (SET_ERROR(ENOTSUP));
+
+ if (!dmu_tx_is_syncing(tx))
+ return (0);
+
+ for (nvpair_t *pair = nvlist_next_nvpair(dduha->dduha_holds, NULL);
+ pair != NULL; pair = nvlist_next_nvpair(dduha->dduha_holds, pair)) {
+ dsl_dataset_t *ds;
+ int error = 0;
+ char *htag, *name;
+
+ /* must be a snapshot */
+ name = nvpair_name(pair);
+ if (strchr(name, '@') == NULL)
+ error = SET_ERROR(EINVAL);
+
+ if (error == 0)
+ error = nvpair_value_string(pair, &htag);
+
+ if (error == 0)
+ error = dsl_dataset_hold(dp, name, FTAG, &ds);
+
+ if (error == 0) {
+ error = dsl_dataset_user_hold_check_one(ds, htag,
+ dduha->dduha_minor != 0, tx);
+ dsl_dataset_rele(ds, FTAG);
+ }
+
+ if (error == 0) {
+ fnvlist_add_string(dduha->dduha_chkholds, name, htag);
+ } else {
+ /*
+ * We register ENOENT errors so they can be correctly
+ * reported if needed, such as when all holds fail.
+ */
+ fnvlist_add_int32(dduha->dduha_errlist, name, error);
+ if (error != ENOENT)
+ return (error);
+ }
+ }
+
+ return (0);
+}
+
+
+static void
+dsl_dataset_user_hold_sync_one_impl(nvlist_t *tmpholds, dsl_dataset_t *ds,
+ const char *htag, minor_t minor, uint64_t now, dmu_tx_t *tx)
+{
+ dsl_pool_t *dp = ds->ds_dir->dd_pool;
+ objset_t *mos = dp->dp_meta_objset;
+ uint64_t zapobj;
+
+ ASSERT(RRW_WRITE_HELD(&dp->dp_config_rwlock));
+
+ if (dsl_dataset_phys(ds)->ds_userrefs_obj == 0) {
+ /*
+ * This is the first user hold for this dataset. Create
+ * the userrefs zap object.
+ */
+ dmu_buf_will_dirty(ds->ds_dbuf, tx);
+ zapobj = dsl_dataset_phys(ds)->ds_userrefs_obj =
+ zap_create(mos, DMU_OT_USERREFS, DMU_OT_NONE, 0, tx);
+ } else {
+ zapobj = dsl_dataset_phys(ds)->ds_userrefs_obj;
+ }
+ ds->ds_userrefs++;
+
+ VERIFY0(zap_add(mos, zapobj, htag, 8, 1, &now, tx));
+
+ if (minor != 0) {
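+		/*
+		 * Record this temporary hold under the dataset's object
+		 * number (as a hex string) so the onexit cleanup callback
+		 * can locate and release it when the minor is closed.
+		 */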
+ char name[MAXNAMELEN];
+ nvlist_t *tags;
+
+ VERIFY0(dsl_pool_user_hold(dp, ds->ds_object,
+ htag, now, tx));
+ (void) snprintf(name, sizeof (name), "%llx",
+ (u_longlong_t)ds->ds_object);
+
+ if (nvlist_lookup_nvlist(tmpholds, name, &tags) != 0) {
+ tags = fnvlist_alloc();
+ fnvlist_add_boolean(tags, htag);
+ fnvlist_add_nvlist(tmpholds, name, tags);
+ fnvlist_free(tags);
+ } else {
+ fnvlist_add_boolean(tags, htag);
+ }
+ }
+
+ spa_history_log_internal_ds(ds, "hold", tx,
+ "tag=%s temp=%d refs=%llu",
+ htag, minor != 0, ds->ds_userrefs);
+}
+
+typedef struct zfs_hold_cleanup_arg {
+ char zhca_spaname[ZFS_MAX_DATASET_NAME_LEN];
+ uint64_t zhca_spa_load_guid;
+ nvlist_t *zhca_holds;
+} zfs_hold_cleanup_arg_t;
+
+static void
+dsl_dataset_user_release_onexit(void *arg)
+{
+ zfs_hold_cleanup_arg_t *ca = arg;
+ spa_t *spa;
+ int error;
+
+ error = spa_open(ca->zhca_spaname, &spa, FTAG);
+ if (error != 0) {
+ zfs_dbgmsg("couldn't release holds on pool=%s "
+ "because pool is no longer loaded",
+ ca->zhca_spaname);
+ return;
+ }
+ if (spa_load_guid(spa) != ca->zhca_spa_load_guid) {
+ zfs_dbgmsg("couldn't release holds on pool=%s "
+ "because pool is no longer loaded (guid doesn't match)",
+ ca->zhca_spaname);
+ spa_close(spa, FTAG);
+ return;
+ }
+
+ (void) dsl_dataset_user_release_tmp(spa_get_dsl(spa), ca->zhca_holds);
+ fnvlist_free(ca->zhca_holds);
+ kmem_free(ca, sizeof (zfs_hold_cleanup_arg_t));
+ spa_close(spa, FTAG);
+}
+
+static void
+dsl_onexit_hold_cleanup(spa_t *spa, nvlist_t *holds, minor_t minor)
+{
+ zfs_hold_cleanup_arg_t *ca;
+
+ if (minor == 0 || nvlist_empty(holds)) {
+ fnvlist_free(holds);
+ return;
+ }
+
+ ASSERT(spa != NULL);
+ ca = kmem_alloc(sizeof (*ca), KM_SLEEP);
+
+ (void) strlcpy(ca->zhca_spaname, spa_name(spa),
+ sizeof (ca->zhca_spaname));
+ ca->zhca_spa_load_guid = spa_load_guid(spa);
+ ca->zhca_holds = holds;
+ VERIFY0(zfs_onexit_add_cb(minor,
+ dsl_dataset_user_release_onexit, ca, NULL));
+}
+
+void
+dsl_dataset_user_hold_sync_one(dsl_dataset_t *ds, const char *htag,
+ minor_t minor, uint64_t now, dmu_tx_t *tx)
+{
+ nvlist_t *tmpholds;
+
+ if (minor != 0)
+ tmpholds = fnvlist_alloc();
+ else
+ tmpholds = NULL;
+ dsl_dataset_user_hold_sync_one_impl(tmpholds, ds, htag, minor, now, tx);
+ dsl_onexit_hold_cleanup(dsl_dataset_get_spa(ds), tmpholds, minor);
+}
+
+static void
+dsl_dataset_user_hold_sync(void *arg, dmu_tx_t *tx)
+{
+ dsl_dataset_user_hold_arg_t *dduha = arg;
+ dsl_pool_t *dp = dmu_tx_pool(tx);
+ nvlist_t *tmpholds;
+ uint64_t now = gethrestime_sec();
+
+ if (dduha->dduha_minor != 0)
+ tmpholds = fnvlist_alloc();
+ else
+ tmpholds = NULL;
+ for (nvpair_t *pair = nvlist_next_nvpair(dduha->dduha_chkholds, NULL);
+ pair != NULL;
+ pair = nvlist_next_nvpair(dduha->dduha_chkholds, pair)) {
+ dsl_dataset_t *ds;
+
+ VERIFY0(dsl_dataset_hold(dp, nvpair_name(pair), FTAG, &ds));
+ dsl_dataset_user_hold_sync_one_impl(tmpholds, ds,
+ fnvpair_value_string(pair), dduha->dduha_minor, now, tx);
+ dsl_dataset_rele(ds, FTAG);
+ }
+ dsl_onexit_hold_cleanup(dp->dp_spa, tmpholds, dduha->dduha_minor);
+}
+
+/*
+ * The full semantics of this function are described in the comment above
+ * lzc_hold().
+ *
+ * To summarize:
+ * holds is nvl of snapname -> holdname
+ * errlist will be filled in with snapname -> error
+ *
+ * The snapshots must all be in the same pool.
+ *
+ * Holds for snapshots that don't exist will be skipped.
+ *
+ * If none of the snapshots for requested holds exist then ENOENT will be
+ * returned.
+ *
+ * If cleanup_minor is not 0, the holds will be temporary, which will be cleaned
+ * up when the process exits.
+ *
+ * On success all the holds, for snapshots that existed, will be created and 0
+ * will be returned.
+ *
+ * On failure no holds will be created, the errlist will be filled in,
+ * and an errno will be returned.
+ *
+ * In all cases the errlist will contain entries for holds where the snapshot
+ * didn't exist.
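+ *
+ * For example (illustrative names), an nvlist of
+ *   { "pool/fs@snap1" -> "tag1", "pool/fs@snap2" -> "tag1" }
+ * with cleanup_minor == 0 places a permanent hold named "tag1" on each
+ * snapshot.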
+ */
+int
+dsl_dataset_user_hold(nvlist_t *holds, minor_t cleanup_minor, nvlist_t *errlist)
+{
+ dsl_dataset_user_hold_arg_t dduha;
+ nvpair_t *pair;
+ int ret;
+
+ pair = nvlist_next_nvpair(holds, NULL);
+ if (pair == NULL)
+ return (0);
+
+ dduha.dduha_holds = holds;
+ dduha.dduha_chkholds = fnvlist_alloc();
+ dduha.dduha_errlist = errlist;
+ dduha.dduha_minor = cleanup_minor;
+
+ ret = dsl_sync_task(nvpair_name(pair), dsl_dataset_user_hold_check,
+ dsl_dataset_user_hold_sync, &dduha,
+ fnvlist_num_pairs(holds), ZFS_SPACE_CHECK_RESERVED);
+ fnvlist_free(dduha.dduha_chkholds);
+
+ return (ret);
+}
+
+typedef int (dsl_holdfunc_t)(dsl_pool_t *dp, const char *name, void *tag,
+ dsl_dataset_t **dsp);
+
+typedef struct dsl_dataset_user_release_arg {
+ dsl_holdfunc_t *ddura_holdfunc;
+ nvlist_t *ddura_holds;
+ nvlist_t *ddura_todelete;
+ nvlist_t *ddura_errlist;
+ nvlist_t *ddura_chkholds;
+} dsl_dataset_user_release_arg_t;
+
+/* Place a dataset hold on the snapshot identified by passed dsobj string */
+static int
+dsl_dataset_hold_obj_string(dsl_pool_t *dp, const char *dsobj, void *tag,
+ dsl_dataset_t **dsp)
+{
+ return (dsl_dataset_hold_obj(dp, zfs_strtonum(dsobj, NULL), tag, dsp));
+}
+
+static int
+dsl_dataset_user_release_check_one(dsl_dataset_user_release_arg_t *ddura,
+ dsl_dataset_t *ds, nvlist_t *holds, const char *snapname)
+{
+ uint64_t zapobj;
+ nvlist_t *holds_found;
+ objset_t *mos;
+ int numholds;
+
+ if (!ds->ds_is_snapshot)
+ return (SET_ERROR(EINVAL));
+
+ if (nvlist_empty(holds))
+ return (0);
+
+ numholds = 0;
+ mos = ds->ds_dir->dd_pool->dp_meta_objset;
+ zapobj = dsl_dataset_phys(ds)->ds_userrefs_obj;
+ holds_found = fnvlist_alloc();
+
+ for (nvpair_t *pair = nvlist_next_nvpair(holds, NULL); pair != NULL;
+ pair = nvlist_next_nvpair(holds, pair)) {
+ uint64_t tmp;
+ int error;
+ const char *holdname = nvpair_name(pair);
+
+ if (zapobj != 0)
+ error = zap_lookup(mos, zapobj, holdname, 8, 1, &tmp);
+ else
+ error = SET_ERROR(ENOENT);
+
+ /*
+ * Non-existent holds are put on the errlist, but don't
+ * cause an overall failure.
+ */
+ if (error == ENOENT) {
+ if (ddura->ddura_errlist != NULL) {
+ char *errtag = kmem_asprintf("%s#%s",
+ snapname, holdname);
+ fnvlist_add_int32(ddura->ddura_errlist, errtag,
+ ENOENT);
+ strfree(errtag);
+ }
+ continue;
+ }
+
+ if (error != 0) {
+ fnvlist_free(holds_found);
+ return (error);
+ }
+
+ fnvlist_add_boolean(holds_found, holdname);
+ numholds++;
+ }
+
+ if (DS_IS_DEFER_DESTROY(ds) &&
+ dsl_dataset_phys(ds)->ds_num_children == 1 &&
+ ds->ds_userrefs == numholds) {
+ /* we need to destroy the snapshot as well */
+ if (dsl_dataset_long_held(ds)) {
+ fnvlist_free(holds_found);
+ return (SET_ERROR(EBUSY));
+ }
+ fnvlist_add_boolean(ddura->ddura_todelete, snapname);
+ }
+
+ if (numholds != 0) {
+ fnvlist_add_nvlist(ddura->ddura_chkholds, snapname,
+ holds_found);
+ }
+ fnvlist_free(holds_found);
+
+ return (0);
+}
+
+static int
+dsl_dataset_user_release_check(void *arg, dmu_tx_t *tx)
+{
+ dsl_dataset_user_release_arg_t *ddura;
+ dsl_holdfunc_t *holdfunc;
+ dsl_pool_t *dp;
+
+ if (!dmu_tx_is_syncing(tx))
+ return (0);
+
+ dp = dmu_tx_pool(tx);
+
+ ASSERT(RRW_WRITE_HELD(&dp->dp_config_rwlock));
+
+ ddura = arg;
+ holdfunc = ddura->ddura_holdfunc;
+
+ for (nvpair_t *pair = nvlist_next_nvpair(ddura->ddura_holds, NULL);
+ pair != NULL; pair = nvlist_next_nvpair(ddura->ddura_holds, pair)) {
+ int error;
+ dsl_dataset_t *ds;
+ nvlist_t *holds;
+ const char *snapname = nvpair_name(pair);
+
+ error = nvpair_value_nvlist(pair, &holds);
+ if (error != 0)
+ error = (SET_ERROR(EINVAL));
+ else
+ error = holdfunc(dp, snapname, FTAG, &ds);
+ if (error == 0) {
+ error = dsl_dataset_user_release_check_one(ddura, ds,
+ holds, snapname);
+ dsl_dataset_rele(ds, FTAG);
+ }
+ if (error != 0) {
+ if (ddura->ddura_errlist != NULL) {
+ fnvlist_add_int32(ddura->ddura_errlist,
+ snapname, error);
+ }
+ /*
+ * Non-existent snapshots are put on the errlist,
+ * but don't cause an overall failure.
+ */
+ if (error != ENOENT)
+ return (error);
+ }
+ }
+
+ return (0);
+}
+
+static void
+dsl_dataset_user_release_sync_one(dsl_dataset_t *ds, nvlist_t *holds,
+ dmu_tx_t *tx)
+{
+ dsl_pool_t *dp = ds->ds_dir->dd_pool;
+ objset_t *mos = dp->dp_meta_objset;
+
+ for (nvpair_t *pair = nvlist_next_nvpair(holds, NULL); pair != NULL;
+ pair = nvlist_next_nvpair(holds, pair)) {
+ int error;
+ const char *holdname = nvpair_name(pair);
+
+ /* Remove temporary hold if one exists. */
+ error = dsl_pool_user_release(dp, ds->ds_object, holdname, tx);
+ VERIFY(error == 0 || error == ENOENT);
+
+ VERIFY0(zap_remove(mos, dsl_dataset_phys(ds)->ds_userrefs_obj,
+ holdname, tx));
+ ds->ds_userrefs--;
+
+ spa_history_log_internal_ds(ds, "release", tx,
+ "tag=%s refs=%lld", holdname, (longlong_t)ds->ds_userrefs);
+ }
+}
+
+static void
+dsl_dataset_user_release_sync(void *arg, dmu_tx_t *tx)
+{
+ dsl_dataset_user_release_arg_t *ddura = arg;
+ dsl_holdfunc_t *holdfunc = ddura->ddura_holdfunc;
+ dsl_pool_t *dp = dmu_tx_pool(tx);
+
+ ASSERT(RRW_WRITE_HELD(&dp->dp_config_rwlock));
+
+ for (nvpair_t *pair = nvlist_next_nvpair(ddura->ddura_chkholds, NULL);
+ pair != NULL; pair = nvlist_next_nvpair(ddura->ddura_chkholds,
+ pair)) {
+ dsl_dataset_t *ds;
+ const char *name = nvpair_name(pair);
+
+ VERIFY0(holdfunc(dp, name, FTAG, &ds));
+
+ dsl_dataset_user_release_sync_one(ds,
+ fnvpair_value_nvlist(pair), tx);
+ if (nvlist_exists(ddura->ddura_todelete, name)) {
+ ASSERT(ds->ds_userrefs == 0 &&
+ dsl_dataset_phys(ds)->ds_num_children == 1 &&
+ DS_IS_DEFER_DESTROY(ds));
+ dsl_destroy_snapshot_sync_impl(ds, B_FALSE, tx);
+ }
+ dsl_dataset_rele(ds, FTAG);
+ }
+}
+
+/*
+ * The full semantics of this function are described in the comment above
+ * lzc_release().
+ *
+ * To summarize:
+ * Releases holds specified in the nvl holds.
+ *
+ * holds is nvl of snapname -> { holdname, ... }
+ * errlist will be filled in with snapname -> error
+ *
+ * If tmpdp is not NULL the names for holds should be the dsobj's of snapshots,
+ * otherwise they should be the names of snapshots.
+ *
+ * As a release may cause snapshots to be destroyed, this tries to ensure they
+ * aren't mounted.
+ *
+ * The release of non-existent holds is skipped.
+ *
+ * At least one hold must have been released for this function to succeed
+ * and return 0.
+ */
+static int
+dsl_dataset_user_release_impl(nvlist_t *holds, nvlist_t *errlist,
+ dsl_pool_t *tmpdp)
+{
+ dsl_dataset_user_release_arg_t ddura;
+ nvpair_t *pair;
+ char *pool;
+ int error;
+
+ pair = nvlist_next_nvpair(holds, NULL);
+ if (pair == NULL)
+ return (0);
+
+ /*
+ * The release may cause snapshots to be destroyed; make sure they
+ * are not mounted.
+ */
+ if (tmpdp != NULL) {
+ /* Temporary holds are specified by dsobj string. */
+ ddura.ddura_holdfunc = dsl_dataset_hold_obj_string;
+ pool = spa_name(tmpdp->dp_spa);
+#ifdef _KERNEL
+ for (pair = nvlist_next_nvpair(holds, NULL); pair != NULL;
+ pair = nvlist_next_nvpair(holds, pair)) {
+ dsl_dataset_t *ds;
+
+ dsl_pool_config_enter(tmpdp, FTAG);
+ error = dsl_dataset_hold_obj_string(tmpdp,
+ nvpair_name(pair), FTAG, &ds);
+ if (error == 0) {
+ char name[ZFS_MAX_DATASET_NAME_LEN];
+ dsl_dataset_name(ds, name);
+ dsl_pool_config_exit(tmpdp, FTAG);
+ dsl_dataset_rele(ds, FTAG);
+ (void) zfs_unmount_snap(name);
+ } else {
+ dsl_pool_config_exit(tmpdp, FTAG);
+ }
+ }
+#endif
+ } else {
+ /* Non-temporary holds are specified by name. */
+ ddura.ddura_holdfunc = dsl_dataset_hold;
+ pool = nvpair_name(pair);
+#ifdef _KERNEL
+ for (pair = nvlist_next_nvpair(holds, NULL); pair != NULL;
+ pair = nvlist_next_nvpair(holds, pair)) {
+ (void) zfs_unmount_snap(nvpair_name(pair));
+ }
+#endif
+ }
+
+ ddura.ddura_holds = holds;
+ ddura.ddura_errlist = errlist;
+ ddura.ddura_todelete = fnvlist_alloc();
+ ddura.ddura_chkholds = fnvlist_alloc();
+
+ error = dsl_sync_task(pool, dsl_dataset_user_release_check,
+ dsl_dataset_user_release_sync, &ddura, 0,
+ ZFS_SPACE_CHECK_EXTRA_RESERVED);
+ fnvlist_free(ddura.ddura_todelete);
+ fnvlist_free(ddura.ddura_chkholds);
+
+ return (error);
+}
+
+/*
+ * holds is nvl of snapname -> { holdname, ... }
+ * errlist will be filled in with snapname -> error
+ */
+int
+dsl_dataset_user_release(nvlist_t *holds, nvlist_t *errlist)
+{
+ return (dsl_dataset_user_release_impl(holds, errlist, NULL));
+}
+
+/*
+ * holds is nvl of snapdsobj -> { holdname, ... }
+ */
+void
+dsl_dataset_user_release_tmp(struct dsl_pool *dp, nvlist_t *holds)
+{
+ ASSERT(dp != NULL);
+ (void) dsl_dataset_user_release_impl(holds, NULL, dp);
+}
+
+int
+dsl_dataset_get_holds(const char *dsname, nvlist_t *nvl)
+{
+ dsl_pool_t *dp;
+ dsl_dataset_t *ds;
+ int err;
+
+ err = dsl_pool_hold(dsname, FTAG, &dp);
+ if (err != 0)
+ return (err);
+ err = dsl_dataset_hold(dp, dsname, FTAG, &ds);
+ if (err != 0) {
+ dsl_pool_rele(dp, FTAG);
+ return (err);
+ }
+
+ if (dsl_dataset_phys(ds)->ds_userrefs_obj != 0) {
+ zap_attribute_t *za;
+ zap_cursor_t zc;
+
+ za = kmem_alloc(sizeof (zap_attribute_t), KM_SLEEP);
+ for (zap_cursor_init(&zc, ds->ds_dir->dd_pool->dp_meta_objset,
+ dsl_dataset_phys(ds)->ds_userrefs_obj);
+ zap_cursor_retrieve(&zc, za) == 0;
+ zap_cursor_advance(&zc)) {
+ fnvlist_add_uint64(nvl, za->za_name,
+ za->za_first_integer);
+ }
+ zap_cursor_fini(&zc);
+ kmem_free(za, sizeof (zap_attribute_t));
+ }
+ dsl_dataset_rele(ds, FTAG);
+ dsl_pool_rele(dp, FTAG);
+ return (0);
+}
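
For context, a minimal userland sketch of driving this release path through libzfs_core's lzc_release(), whose holds nvlist has the snapname -> { holdname, ... } shape consumed by dsl_dataset_user_release() above. The pool, snapshot, and hold names are illustrative and error handling is abbreviated:

#include <libzfs_core.h>
#include <libnvpair.h>

static int
release_one_hold(void)
{
	nvlist_t *holds = fnvlist_alloc();
	nvlist_t *snap_holds = fnvlist_alloc();
	nvlist_t *errlist = NULL;
	int error;

	(void) libzfs_core_init();

	/* holds: "pool/fs@snap" -> { "backup" } */
	fnvlist_add_boolean(snap_holds, "backup");
	fnvlist_add_nvlist(holds, "pool/fs@snap", snap_holds);

	/* on partial failure, errlist is filled with snapname -> errno */
	error = lzc_release(holds, &errlist);

	fnvlist_free(snap_holds);
	fnvlist_free(holds);
	if (errlist != NULL)
		fnvlist_free(errlist);
	libzfs_core_fini();
	return (error);
}
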
diff --git a/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/edonr_zfs.c b/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/edonr_zfs.c
new file mode 100644
index 000000000000..9a3430d94668
--- /dev/null
+++ b/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/edonr_zfs.c
@@ -0,0 +1,114 @@
+/*
+ * CDDL HEADER START
+ *
+ * The contents of this file are subject to the terms of the
+ * Common Development and Distribution License (the "License").
+ * You may not use this file except in compliance with the License.
+ *
+ * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
+ * or http://opensource.org/licenses/CDDL-1.0.
+ * See the License for the specific language governing permissions
+ * and limitations under the License.
+ *
+ * When distributing Covered Code, include this CDDL HEADER in each
+ * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
+ * If applicable, add the following below this CDDL HEADER, with the
+ * fields enclosed by brackets "[]" replaced with your own identifying
+ * information: Portions Copyright [yyyy] [name of copyright owner]
+ *
+ * CDDL HEADER END
+ */
+/*
+ * Copyright 2013 Saso Kiselkov. All rights reserved.
+ * Use is subject to license terms.
+ */
+/*
+ * Copyright (c) 2016 by Delphix. All rights reserved.
+ */
+#include <sys/zfs_context.h>
+#include <sys/zio.h>
+#include <sys/edonr.h>
+#include <sys/abd.h>
+
+#define EDONR_MODE 512
+#define EDONR_BLOCK_SIZE EdonR512_BLOCK_SIZE
+
+static int
+edonr_incremental(void *buf, size_t size, void *arg)
+{
+ EdonRState *ctx = arg;
+ EdonRUpdate(ctx, buf, size * 8);
+ return (0);
+}
+
+/*
+ * Native zio_checksum interface for the Edon-R hash function.
+ */
+/*ARGSUSED*/
+void
+abd_checksum_edonr_native(abd_t *abd, uint64_t size,
+ const void *ctx_template, zio_cksum_t *zcp)
+{
+ uint8_t digest[EDONR_MODE / 8];
+ EdonRState ctx;
+
+ ASSERT(ctx_template != NULL);
+ bcopy(ctx_template, &ctx, sizeof (ctx));
+ (void) abd_iterate_func(abd, 0, size, edonr_incremental, &ctx);
+ EdonRFinal(&ctx, digest);
+ bcopy(digest, zcp->zc_word, sizeof (zcp->zc_word));
+}
+
+/*
+ * Byteswapped zio_checksum interface for the Edon-R hash function.
+ */
+void
+abd_checksum_edonr_byteswap(abd_t *abd, uint64_t size,
+ const void *ctx_template, zio_cksum_t *zcp)
+{
+ zio_cksum_t tmp;
+
+ abd_checksum_edonr_native(abd, size, ctx_template, &tmp);
+	zcp->zc_word[0] = BSWAP_64(tmp.zc_word[0]);
+	zcp->zc_word[1] = BSWAP_64(tmp.zc_word[1]);
+	zcp->zc_word[2] = BSWAP_64(tmp.zc_word[2]);
+	zcp->zc_word[3] = BSWAP_64(tmp.zc_word[3]);
+}
+
+void *
+abd_checksum_edonr_tmpl_init(const zio_cksum_salt_t *salt)
+{
+ EdonRState *ctx;
+ uint8_t salt_block[EDONR_BLOCK_SIZE];
+
+ /*
+ * Edon-R needs all but the last hash invocation to be on full-size
+ * blocks, but the salt is too small. Rather than simply padding it
+ * with zeros, we expand the salt into a new salt block of proper
+ * size by double-hashing it (the new salt block will be composed of
+ * H(salt) || H(H(salt))).
+ */
+ CTASSERT(EDONR_BLOCK_SIZE == 2 * (EDONR_MODE / 8));
+ EdonRHash(EDONR_MODE, salt->zcs_bytes, sizeof (salt->zcs_bytes) * 8,
+ salt_block);
+ EdonRHash(EDONR_MODE, salt_block, EDONR_MODE, salt_block +
+ EDONR_MODE / 8);
+
+ /*
+ * Feed the new salt block into the hash function - this will serve
+ * as our MAC key.
+ */
+ ctx = kmem_zalloc(sizeof (*ctx), KM_SLEEP);
+ EdonRInit(ctx, EDONR_MODE);
+ EdonRUpdate(ctx, salt_block, sizeof (salt_block) * 8);
+ return (ctx);
+}
+
+void
+abd_checksum_edonr_tmpl_free(void *ctx_template)
+{
+ EdonRState *ctx = ctx_template;
+
+ bzero(ctx, sizeof (*ctx));
+ kmem_free(ctx, sizeof (*ctx));
+}
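
A sketch of the intended lifecycle of the salted template above, using only the functions defined in this file; the abd and salt arguments are assumed to come from the caller:

static void
edonr_checksum_example(abd_t *abd, uint64_t size,
    const zio_cksum_salt_t *salt)
{
	zio_cksum_t cksum;
	void *tmpl;

	/* expand the salt into a MAC key (H(salt) || H(H(salt))) */
	tmpl = abd_checksum_edonr_tmpl_init(salt);

	/* ctx_template must be non-NULL; it is copied, not consumed */
	abd_checksum_edonr_native(abd, size, tmpl, &cksum);

	abd_checksum_edonr_tmpl_free(tmpl);
}
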
diff --git a/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/gzip.c b/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/gzip.c
new file mode 100644
index 000000000000..b257d4af753c
--- /dev/null
+++ b/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/gzip.c
@@ -0,0 +1,69 @@
+/*
+ * CDDL HEADER START
+ *
+ * The contents of this file are subject to the terms of the
+ * Common Development and Distribution License (the "License").
+ * You may not use this file except in compliance with the License.
+ *
+ * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
+ * or http://www.opensolaris.org/os/licensing.
+ * See the License for the specific language governing permissions
+ * and limitations under the License.
+ *
+ * When distributing Covered Code, include this CDDL HEADER in each
+ * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
+ * If applicable, add the following below this CDDL HEADER, with the
+ * fields enclosed by brackets "[]" replaced with your own identifying
+ * information: Portions Copyright [yyyy] [name of copyright owner]
+ *
+ * CDDL HEADER END
+ */
+
+/*
+ * Copyright 2007 Sun Microsystems, Inc. All rights reserved.
+ * Use is subject to license terms.
+ */
+
+#pragma ident "%Z%%M% %I% %E% SMI"
+
+#include <sys/debug.h>
+#include <sys/types.h>
+#include <sys/zmod.h>
+
+#ifdef _KERNEL
+#include <sys/systm.h>
+#else
+#include <strings.h>
+#endif
+
+size_t
+gzip_compress(void *s_start, void *d_start, size_t s_len, size_t d_len, int n)
+{
+ size_t dstlen = d_len;
+
+ ASSERT(d_len <= s_len);
+
+ if (z_compress_level(d_start, &dstlen, s_start, s_len, n) != Z_OK) {
+ if (d_len != s_len)
+ return (s_len);
+
+ bcopy(s_start, d_start, s_len);
+ return (s_len);
+ }
+
+ return (dstlen);
+}
+
+/*ARGSUSED*/
+int
+gzip_decompress(void *s_start, void *d_start, size_t s_len, size_t d_len, int n)
+{
+ size_t dstlen = d_len;
+
+ ASSERT(d_len >= s_len);
+
+ if (z_uncompress(d_start, &dstlen, s_start, s_len) != Z_OK)
+ return (-1);
+
+ return (0);
+}
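
A sketch of the fallback contract: gzip_compress() returns s_len whenever zlib fails or cannot shrink the input (copying the source verbatim only when d_len == s_len), so callers compare the return value against s_len to detect incompressible data. Buffer names are illustrative:

static void
gzip_roundtrip(void *src, size_t s_len, void *cbuf, void *dbuf)
{
	size_t c_len;

	/* level 6 is a typical middle-of-the-road gzip level */
	c_len = gzip_compress(src, cbuf, s_len, s_len, 6);
	if (c_len == s_len)
		return;		/* stored uncompressed; nothing to inflate */

	VERIFY0(gzip_decompress(cbuf, dbuf, c_len, s_len, 6));
}
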
diff --git a/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/lua/README.zfs b/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/lua/README.zfs
new file mode 100644
index 000000000000..0e22de7a4a18
--- /dev/null
+++ b/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/lua/README.zfs
@@ -0,0 +1,80 @@
+#
+# CDDL HEADER START
+#
+# This file and its contents are supplied under the terms of the
+# Common Development and Distribution License ("CDDL"), version 1.0.
+# You may only use this file in accordance with the terms of version
+# 1.0 of the CDDL.
+#
+# A full copy of the text of the CDDL should have accompanied this
+# source. A copy of the CDDL is also available via the Internet at
+# http://www.illumos.org/license/CDDL.
+#
+# CDDL HEADER END
+#
+
+#
+# Copyright (c) 2017 by Delphix. All rights reserved.
+#
+
+Introduction
+------------
+
+This README describes the Lua interpreter source code that lives in the ZFS
+source tree to enable execution of ZFS channel programs, including its
+maintenance policy, the modifications that have been made to it, and how it
+should (and should not) be used.
+
+For a description of the Lua language and features exposed by ZFS channel
+programs, please refer to the zfs-program(1m) man page instead.
+
+
+Maintenance policy
+------------------
+
+The Lua runtime is considered stable software. Channel programs don't need much
+complicated logic, so updates to the Lua runtime from upstream are viewed as
+nice-to-have, but not required for channel programs to be well-supported. As
+such, the Lua runtime in ZFS should be updated on an as-needed basis for
+security vulnerabilities, but not much else.
+
+
+Modifications to Lua
+--------------------
+
+The version of the Lua runtime we're using in ZFS has been modified in a variety
+of ways to make it more useful for the specific purpose of running channel
+programs. These changes include:
+
+1. "Normal" Lua uses floating point for all numbers it stores, but those aren't
+ useful inside ZFS / the kernel. We have changed the runtime to use int64_t
+ throughout for all numbers.
+2. Some of the Lua standard libraries do file I/O or spawn processes, but
+ neither of these make sense from inside channel programs. We have removed
+ those libraries rather than reimplementing them using kernel APIs.
+3. The "normal" Lua runtime handles errors by failing fatally, but since this
+ version of Lua runs inside the kernel we must handle these failures and
+ return meaningful error codes to userland. We have customized the Lua
+ failure paths so that they aren't fatal.
+4. Running poorly-vetted code inside the kernel is always a risk; even if the
+ ability to do so is restricted to the root user, it's still possible to write
+ an incorrect program that results in an infinite loop or massive memory use.
+ We've added new protections into the Lua interpreter to limit the runtime
+ (measured in number of Lua instructions run) and memory overhead of running
+ a channel program.
+5. The Lua bytecode is not designed to be secure / safe, so it would be easy to
+ pass invalid bytecode which can panic the kernel. By comparison, the parser
+ is hardened and fails gracefully on invalid input. Therefore, we only accept
+ Lua source code at the ioctl level and then interpret it inside the kernel.
+
+Each of these modifications has been tested in the zfs-test suite. If / when
+new modifications are made, new tests should be added to the suite located in
+zfs-tests/tests/functional/channel_program/lua_core.
+
+
+How to use this Lua interpreter
+-------------------------------
+
+From the above, it should be clear that this is not a general-purpose Lua
+interpreter. Additional work would be required to extricate this custom version
+of Lua from ZFS and make it usable by other areas of the kernel.
diff --git a/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/lua/lapi.c b/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/lua/lapi.c
new file mode 100644
index 000000000000..34820a2d8b44
--- /dev/null
+++ b/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/lua/lapi.c
@@ -0,0 +1,1283 @@
+/*
+** $Id: lapi.c,v 2.171.1.1 2013/04/12 18:48:47 roberto Exp $
+** Lua API
+** See Copyright Notice in lua.h
+*/
+
+
+#include <sys/zfs_context.h>
+
+#define lapi_c
+#define LUA_CORE
+
+#include "lua.h"
+
+#include "lapi.h"
+#include "ldebug.h"
+#include "ldo.h"
+#include "lfunc.h"
+#include "lgc.h"
+#include "lmem.h"
+#include "lobject.h"
+#include "lstate.h"
+#include "lstring.h"
+#include "ltable.h"
+#include "ltm.h"
+#include "lundump.h"
+#include "lvm.h"
+
+
+
+const char lua_ident[] =
+ "$LuaVersion: " LUA_COPYRIGHT " $"
+ "$LuaAuthors: " LUA_AUTHORS " $";
+
+
+/* value at a non-valid index */
+#define NONVALIDVALUE cast(TValue *, luaO_nilobject)
+
+/* corresponding test */
+#define isvalid(o) ((o) != luaO_nilobject)
+
+/* test for pseudo index */
+#define ispseudo(i) ((i) <= LUA_REGISTRYINDEX)
+
+/* test for valid but not pseudo index */
+#define isstackindex(i, o) (isvalid(o) && !ispseudo(i))
+
+#define api_checkvalidindex(L, o) api_check(L, isvalid(o), "invalid index")
+
+#define api_checkstackindex(L, i, o) \
+ api_check(L, isstackindex(i, o), "index not in the stack")
+
+
+static TValue *index2addr (lua_State *L, int idx) {
+ CallInfo *ci = L->ci;
+ if (idx > 0) {
+ TValue *o = ci->func + idx;
+ api_check(L, idx <= ci->top - (ci->func + 1), "unacceptable index");
+ if (o >= L->top) return NONVALIDVALUE;
+ else return o;
+ }
+ else if (!ispseudo(idx)) { /* negative index */
+ api_check(L, idx != 0 && -idx <= L->top - (ci->func + 1), "invalid index");
+ return L->top + idx;
+ }
+ else if (idx == LUA_REGISTRYINDEX)
+ return &G(L)->l_registry;
+ else { /* upvalues */
+ idx = LUA_REGISTRYINDEX - idx;
+ api_check(L, idx <= MAXUPVAL + 1, "upvalue index too large");
+ if (ttislcf(ci->func)) /* light C function? */
+ return NONVALIDVALUE; /* it has no upvalues */
+ else {
+ CClosure *func = clCvalue(ci->func);
+ return (idx <= func->nupvalues) ? &func->upvalue[idx-1] : NONVALIDVALUE;
+ }
+ }
+}
+
+
+/*
+** to be called by 'lua_checkstack' in protected mode, to grow stack
+** capturing memory errors
+*/
+static void growstack (lua_State *L, void *ud) {
+ int size = *(int *)ud;
+ luaD_growstack(L, size);
+}
+
+
+LUA_API int lua_checkstack (lua_State *L, int size) {
+ int res;
+ CallInfo *ci = L->ci;
+ lua_lock(L);
+ if (L->stack_last - L->top > size) /* stack large enough? */
+ res = 1; /* yes; check is OK */
+ else { /* no; need to grow stack */
+ int inuse = cast_int(L->top - L->stack) + EXTRA_STACK;
+ if (inuse > LUAI_MAXSTACK - size) /* can grow without overflow? */
+ res = 0; /* no */
+ else /* try to grow stack */
+ res = (luaD_rawrunprotected(L, &growstack, &size) == LUA_OK);
+ }
+ if (res && ci->top < L->top + size)
+ ci->top = L->top + size; /* adjust frame top */
+ lua_unlock(L);
+ return res;
+}
+
+
+LUA_API void lua_xmove (lua_State *from, lua_State *to, int n) {
+ int i;
+ if (from == to) return;
+ lua_lock(to);
+ api_checknelems(from, n);
+ api_check(from, G(from) == G(to), "moving among independent states");
+ api_check(from, to->ci->top - to->top >= n, "not enough elements to move");
+ from->top -= n;
+ for (i = 0; i < n; i++) {
+ setobj2s(to, to->top++, from->top + i);
+ }
+ lua_unlock(to);
+}
+
+
+LUA_API lua_CFunction lua_atpanic (lua_State *L, lua_CFunction panicf) {
+ lua_CFunction old;
+ lua_lock(L);
+ old = G(L)->panic;
+ G(L)->panic = panicf;
+ lua_unlock(L);
+ return old;
+}
+
+
+LUA_API const lua_Number *lua_version (lua_State *L) {
+ static const lua_Number version = LUA_VERSION_NUM;
+ if (L == NULL) return &version;
+ else return G(L)->version;
+}
+
+
+
+/*
+** basic stack manipulation
+*/
+
+
+/*
+** convert an acceptable stack index into an absolute index
+*/
+LUA_API int lua_absindex (lua_State *L, int idx) {
+ return (idx > 0 || ispseudo(idx))
+ ? idx
+ : cast_int(L->top - L->ci->func + idx);
+}
+
+
+LUA_API int lua_gettop (lua_State *L) {
+ return cast_int(L->top - (L->ci->func + 1));
+}
+
+
+LUA_API void lua_settop (lua_State *L, int idx) {
+ StkId func = L->ci->func;
+ lua_lock(L);
+ if (idx >= 0) {
+ api_check(L, idx <= L->stack_last - (func + 1), "new top too large");
+ while (L->top < (func + 1) + idx)
+ setnilvalue(L->top++);
+ L->top = (func + 1) + idx;
+ }
+ else {
+ api_check(L, -(idx+1) <= (L->top - (func + 1)), "invalid new top");
+ L->top += idx+1; /* `subtract' index (index is negative) */
+ }
+ lua_unlock(L);
+}
+
+
+LUA_API void lua_remove (lua_State *L, int idx) {
+ StkId p;
+ lua_lock(L);
+ p = index2addr(L, idx);
+ api_checkstackindex(L, idx, p);
+ while (++p < L->top) setobjs2s(L, p-1, p);
+ L->top--;
+ lua_unlock(L);
+}
+
+
+LUA_API void lua_insert (lua_State *L, int idx) {
+ StkId p;
+ StkId q;
+ lua_lock(L);
+ p = index2addr(L, idx);
+ api_checkstackindex(L, idx, p);
+ for (q = L->top; q > p; q--) /* use L->top as a temporary */
+ setobjs2s(L, q, q - 1);
+ setobjs2s(L, p, L->top);
+ lua_unlock(L);
+}
+
+
+static void moveto (lua_State *L, TValue *fr, int idx) {
+ TValue *to = index2addr(L, idx);
+ api_checkvalidindex(L, to);
+ setobj(L, to, fr);
+ if (idx < LUA_REGISTRYINDEX) /* function upvalue? */
+ luaC_barrier(L, clCvalue(L->ci->func), fr);
+ /* LUA_REGISTRYINDEX does not need gc barrier
+ (collector revisits it before finishing collection) */
+}
+
+
+LUA_API void lua_replace (lua_State *L, int idx) {
+ lua_lock(L);
+ api_checknelems(L, 1);
+ moveto(L, L->top - 1, idx);
+ L->top--;
+ lua_unlock(L);
+}
+
+
+LUA_API void lua_copy (lua_State *L, int fromidx, int toidx) {
+ TValue *fr;
+ lua_lock(L);
+ fr = index2addr(L, fromidx);
+ moveto(L, fr, toidx);
+ lua_unlock(L);
+}
+
+
+LUA_API void lua_pushvalue (lua_State *L, int idx) {
+ lua_lock(L);
+ setobj2s(L, L->top, index2addr(L, idx));
+ api_incr_top(L);
+ lua_unlock(L);
+}
+
+
+
+/*
+** access functions (stack -> C)
+*/
+
+
+LUA_API int lua_type (lua_State *L, int idx) {
+ StkId o = index2addr(L, idx);
+ return (isvalid(o) ? ttypenv(o) : LUA_TNONE);
+}
+
+
+LUA_API const char *lua_typename (lua_State *L, int t) {
+ UNUSED(L);
+ return ttypename(t);
+}
+
+
+LUA_API int lua_iscfunction (lua_State *L, int idx) {
+ StkId o = index2addr(L, idx);
+ return (ttislcf(o) || (ttisCclosure(o)));
+}
+
+
+LUA_API int lua_isnumber (lua_State *L, int idx) {
+ TValue n;
+ const TValue *o = index2addr(L, idx);
+ return tonumber(o, &n);
+}
+
+
+LUA_API int lua_isstring (lua_State *L, int idx) {
+ int t = lua_type(L, idx);
+ return (t == LUA_TSTRING || t == LUA_TNUMBER);
+}
+
+
+LUA_API int lua_isuserdata (lua_State *L, int idx) {
+ const TValue *o = index2addr(L, idx);
+ return (ttisuserdata(o) || ttislightuserdata(o));
+}
+
+
+LUA_API int lua_rawequal (lua_State *L, int index1, int index2) {
+ StkId o1 = index2addr(L, index1);
+ StkId o2 = index2addr(L, index2);
+ return (isvalid(o1) && isvalid(o2)) ? luaV_rawequalobj(o1, o2) : 0;
+}
+
+
+LUA_API void lua_arith (lua_State *L, int op) {
+ StkId o1; /* 1st operand */
+ StkId o2; /* 2nd operand */
+ lua_lock(L);
+ if (op != LUA_OPUNM) /* all other operations expect two operands */
+ api_checknelems(L, 2);
+ else { /* for unary minus, add fake 2nd operand */
+ api_checknelems(L, 1);
+ setobjs2s(L, L->top, L->top - 1);
+ L->top++;
+ }
+ o1 = L->top - 2;
+ o2 = L->top - 1;
+ if (ttisnumber(o1) && ttisnumber(o2)) {
+ setnvalue(o1, luaO_arith(op, nvalue(o1), nvalue(o2)));
+ }
+ else
+ luaV_arith(L, o1, o1, o2, cast(TMS, op - LUA_OPADD + TM_ADD));
+ L->top--;
+ lua_unlock(L);
+}
+
+
+LUA_API int lua_compare (lua_State *L, int index1, int index2, int op) {
+ StkId o1, o2;
+ int i = 0;
+ lua_lock(L); /* may call tag method */
+ o1 = index2addr(L, index1);
+ o2 = index2addr(L, index2);
+ if (isvalid(o1) && isvalid(o2)) {
+ switch (op) {
+ case LUA_OPEQ: i = equalobj(L, o1, o2); break;
+ case LUA_OPLT: i = luaV_lessthan(L, o1, o2); break;
+ case LUA_OPLE: i = luaV_lessequal(L, o1, o2); break;
+ default: api_check(L, 0, "invalid option");
+ }
+ }
+ lua_unlock(L);
+ return i;
+}
+
+
+LUA_API lua_Number lua_tonumberx (lua_State *L, int idx, int *isnum) {
+ TValue n;
+ const TValue *o = index2addr(L, idx);
+ if (tonumber(o, &n)) {
+ if (isnum) *isnum = 1;
+ return nvalue(o);
+ }
+ else {
+ if (isnum) *isnum = 0;
+ return 0;
+ }
+}
+
+
+LUA_API lua_Integer lua_tointegerx (lua_State *L, int idx, int *isnum) {
+ TValue n;
+ const TValue *o = index2addr(L, idx);
+ if (tonumber(o, &n)) {
+ lua_Integer res;
+ lua_Number num = nvalue(o);
+ lua_number2integer(res, num);
+ if (isnum) *isnum = 1;
+ return res;
+ }
+ else {
+ if (isnum) *isnum = 0;
+ return 0;
+ }
+}
+
+
+LUA_API lua_Unsigned lua_tounsignedx (lua_State *L, int idx, int *isnum) {
+ TValue n;
+ const TValue *o = index2addr(L, idx);
+ if (tonumber(o, &n)) {
+ lua_Unsigned res;
+ lua_Number num = nvalue(o);
+ lua_number2unsigned(res, num);
+ if (isnum) *isnum = 1;
+ return res;
+ }
+ else {
+ if (isnum) *isnum = 0;
+ return 0;
+ }
+}
+
+
+LUA_API int lua_toboolean (lua_State *L, int idx) {
+ const TValue *o = index2addr(L, idx);
+ return !l_isfalse(o);
+}
+
+
+LUA_API const char *lua_tolstring (lua_State *L, int idx, size_t *len) {
+ StkId o = index2addr(L, idx);
+ if (!ttisstring(o)) {
+ lua_lock(L); /* `luaV_tostring' may create a new string */
+ if (!luaV_tostring(L, o)) { /* conversion failed? */
+ if (len != NULL) *len = 0;
+ lua_unlock(L);
+ return NULL;
+ }
+ luaC_checkGC(L);
+ o = index2addr(L, idx); /* previous call may reallocate the stack */
+ lua_unlock(L);
+ }
+ if (len != NULL) *len = tsvalue(o)->len;
+ return svalue(o);
+}
+
+
+LUA_API size_t lua_rawlen (lua_State *L, int idx) {
+ StkId o = index2addr(L, idx);
+ switch (ttypenv(o)) {
+ case LUA_TSTRING: return tsvalue(o)->len;
+ case LUA_TUSERDATA: return uvalue(o)->len;
+ case LUA_TTABLE: return luaH_getn(hvalue(o));
+ default: return 0;
+ }
+}
+
+
+LUA_API lua_CFunction lua_tocfunction (lua_State *L, int idx) {
+ StkId o = index2addr(L, idx);
+ if (ttislcf(o)) return fvalue(o);
+ else if (ttisCclosure(o))
+ return clCvalue(o)->f;
+ else return NULL; /* not a C function */
+}
+
+
+LUA_API void *lua_touserdata (lua_State *L, int idx) {
+ StkId o = index2addr(L, idx);
+ switch (ttypenv(o)) {
+ case LUA_TUSERDATA: return (rawuvalue(o) + 1);
+ case LUA_TLIGHTUSERDATA: return pvalue(o);
+ default: return NULL;
+ }
+}
+
+
+LUA_API lua_State *lua_tothread (lua_State *L, int idx) {
+ StkId o = index2addr(L, idx);
+ return (!ttisthread(o)) ? NULL : thvalue(o);
+}
+
+
+LUA_API const void *lua_topointer (lua_State *L, int idx) {
+ StkId o = index2addr(L, idx);
+ switch (ttype(o)) {
+ case LUA_TTABLE: return hvalue(o);
+ case LUA_TLCL: return clLvalue(o);
+ case LUA_TCCL: return clCvalue(o);
+ case LUA_TLCF: return cast(void *, cast(size_t, fvalue(o)));
+ case LUA_TTHREAD: return thvalue(o);
+ case LUA_TUSERDATA:
+ case LUA_TLIGHTUSERDATA:
+ return lua_touserdata(L, idx);
+ default: return NULL;
+ }
+}
+
+
+
+/*
+** push functions (C -> stack)
+*/
+
+
+LUA_API void lua_pushnil (lua_State *L) {
+ lua_lock(L);
+ setnilvalue(L->top);
+ api_incr_top(L);
+ lua_unlock(L);
+}
+
+
+LUA_API void lua_pushnumber (lua_State *L, lua_Number n) {
+ lua_lock(L);
+ setnvalue(L->top, n);
+ luai_checknum(L, L->top,
+ luaG_runerror(L, "C API - attempt to push a signaling NaN"));
+ api_incr_top(L);
+ lua_unlock(L);
+}
+
+
+LUA_API void lua_pushinteger (lua_State *L, lua_Integer n) {
+ lua_lock(L);
+ setnvalue(L->top, cast_num(n));
+ api_incr_top(L);
+ lua_unlock(L);
+}
+
+
+LUA_API void lua_pushunsigned (lua_State *L, lua_Unsigned u) {
+ lua_Number n;
+ lua_lock(L);
+ n = lua_unsigned2number(u);
+ setnvalue(L->top, n);
+ api_incr_top(L);
+ lua_unlock(L);
+}
+
+
+LUA_API const char *lua_pushlstring (lua_State *L, const char *s, size_t len) {
+ TString *ts;
+ lua_lock(L);
+ luaC_checkGC(L);
+ ts = luaS_newlstr(L, s, len);
+ setsvalue2s(L, L->top, ts);
+ api_incr_top(L);
+ lua_unlock(L);
+ return getstr(ts);
+}
+
+
+LUA_API const char *lua_pushstring (lua_State *L, const char *s) {
+ if (s == NULL) {
+ lua_pushnil(L);
+ return NULL;
+ }
+ else {
+ TString *ts;
+ lua_lock(L);
+ luaC_checkGC(L);
+ ts = luaS_new(L, s);
+ setsvalue2s(L, L->top, ts);
+ api_incr_top(L);
+ lua_unlock(L);
+ return getstr(ts);
+ }
+}
+
+
+LUA_API const char *lua_pushvfstring (lua_State *L, const char *fmt,
+ va_list argp) {
+ const char *ret;
+ lua_lock(L);
+ luaC_checkGC(L);
+ ret = luaO_pushvfstring(L, fmt, argp);
+ lua_unlock(L);
+ return ret;
+}
+
+
+LUA_API const char *lua_pushfstring (lua_State *L, const char *fmt, ...) {
+ const char *ret;
+ va_list argp;
+ lua_lock(L);
+ luaC_checkGC(L);
+ va_start(argp, fmt);
+ ret = luaO_pushvfstring(L, fmt, argp);
+ va_end(argp);
+ lua_unlock(L);
+ return ret;
+}
+
+
+LUA_API void lua_pushcclosure (lua_State *L, lua_CFunction fn, int n) {
+ lua_lock(L);
+ if (n == 0) {
+ setfvalue(L->top, fn);
+ }
+ else {
+ Closure *cl;
+ api_checknelems(L, n);
+ api_check(L, n <= MAXUPVAL, "upvalue index too large");
+ luaC_checkGC(L);
+ cl = luaF_newCclosure(L, n);
+ cl->c.f = fn;
+ L->top -= n;
+ while (n--)
+ setobj2n(L, &cl->c.upvalue[n], L->top + n);
+ setclCvalue(L, L->top, cl);
+ }
+ api_incr_top(L);
+ lua_unlock(L);
+}
+
+
+LUA_API void lua_pushboolean (lua_State *L, int b) {
+ lua_lock(L);
+ setbvalue(L->top, (b != 0)); /* ensure that true is 1 */
+ api_incr_top(L);
+ lua_unlock(L);
+}
+
+
+LUA_API void lua_pushlightuserdata (lua_State *L, void *p) {
+ lua_lock(L);
+ setpvalue(L->top, p);
+ api_incr_top(L);
+ lua_unlock(L);
+}
+
+
+LUA_API int lua_pushthread (lua_State *L) {
+ lua_lock(L);
+ setthvalue(L, L->top, L);
+ api_incr_top(L);
+ lua_unlock(L);
+ return (G(L)->mainthread == L);
+}
+
+
+
+/*
+** get functions (Lua -> stack)
+*/
+
+
+LUA_API void lua_getglobal (lua_State *L, const char *var) {
+ Table *reg = hvalue(&G(L)->l_registry);
+ const TValue *gt; /* global table */
+ lua_lock(L);
+ gt = luaH_getint(reg, LUA_RIDX_GLOBALS);
+ setsvalue2s(L, L->top++, luaS_new(L, var));
+ luaV_gettable(L, gt, L->top - 1, L->top - 1);
+ lua_unlock(L);
+}
+
+
+LUA_API void lua_gettable (lua_State *L, int idx) {
+ StkId t;
+ lua_lock(L);
+ t = index2addr(L, idx);
+ luaV_gettable(L, t, L->top - 1, L->top - 1);
+ lua_unlock(L);
+}
+
+
+LUA_API void lua_getfield (lua_State *L, int idx, const char *k) {
+ StkId t;
+ lua_lock(L);
+ t = index2addr(L, idx);
+ setsvalue2s(L, L->top, luaS_new(L, k));
+ api_incr_top(L);
+ luaV_gettable(L, t, L->top - 1, L->top - 1);
+ lua_unlock(L);
+}
+
+
+LUA_API void lua_rawget (lua_State *L, int idx) {
+ StkId t;
+ lua_lock(L);
+ t = index2addr(L, idx);
+ api_check(L, ttistable(t), "table expected");
+ setobj2s(L, L->top - 1, luaH_get(hvalue(t), L->top - 1));
+ lua_unlock(L);
+}
+
+
+LUA_API void lua_rawgeti (lua_State *L, int idx, int n) {
+ StkId t;
+ lua_lock(L);
+ t = index2addr(L, idx);
+ api_check(L, ttistable(t), "table expected");
+ setobj2s(L, L->top, luaH_getint(hvalue(t), n));
+ api_incr_top(L);
+ lua_unlock(L);
+}
+
+
+LUA_API void lua_rawgetp (lua_State *L, int idx, const void *p) {
+ StkId t;
+ TValue k;
+ lua_lock(L);
+ t = index2addr(L, idx);
+ api_check(L, ttistable(t), "table expected");
+ setpvalue(&k, cast(void *, p));
+ setobj2s(L, L->top, luaH_get(hvalue(t), &k));
+ api_incr_top(L);
+ lua_unlock(L);
+}
+
+
+LUA_API void lua_createtable (lua_State *L, int narray, int nrec) {
+ Table *t;
+ lua_lock(L);
+ luaC_checkGC(L);
+ t = luaH_new(L);
+ sethvalue(L, L->top, t);
+ api_incr_top(L);
+ if (narray > 0 || nrec > 0)
+ luaH_resize(L, t, narray, nrec);
+ lua_unlock(L);
+}
+
+
+LUA_API int lua_getmetatable (lua_State *L, int objindex) {
+ const TValue *obj;
+ Table *mt = NULL;
+ int res;
+ lua_lock(L);
+ obj = index2addr(L, objindex);
+ switch (ttypenv(obj)) {
+ case LUA_TTABLE:
+ mt = hvalue(obj)->metatable;
+ break;
+ case LUA_TUSERDATA:
+ mt = uvalue(obj)->metatable;
+ break;
+ default:
+ mt = G(L)->mt[ttypenv(obj)];
+ break;
+ }
+ if (mt == NULL)
+ res = 0;
+ else {
+ sethvalue(L, L->top, mt);
+ api_incr_top(L);
+ res = 1;
+ }
+ lua_unlock(L);
+ return res;
+}
+
+
+LUA_API void lua_getuservalue (lua_State *L, int idx) {
+ StkId o;
+ lua_lock(L);
+ o = index2addr(L, idx);
+ api_check(L, ttisuserdata(o), "userdata expected");
+ if (uvalue(o)->env) {
+ sethvalue(L, L->top, uvalue(o)->env);
+ } else
+ setnilvalue(L->top);
+ api_incr_top(L);
+ lua_unlock(L);
+}
+
+
+/*
+** set functions (stack -> Lua)
+*/
+
+
+LUA_API void lua_setglobal (lua_State *L, const char *var) {
+ Table *reg = hvalue(&G(L)->l_registry);
+ const TValue *gt; /* global table */
+ lua_lock(L);
+ api_checknelems(L, 1);
+ gt = luaH_getint(reg, LUA_RIDX_GLOBALS);
+ setsvalue2s(L, L->top++, luaS_new(L, var));
+ luaV_settable(L, gt, L->top - 1, L->top - 2);
+ L->top -= 2; /* pop value and key */
+ lua_unlock(L);
+}
+
+
+LUA_API void lua_settable (lua_State *L, int idx) {
+ StkId t;
+ lua_lock(L);
+ api_checknelems(L, 2);
+ t = index2addr(L, idx);
+ luaV_settable(L, t, L->top - 2, L->top - 1);
+ L->top -= 2; /* pop index and value */
+ lua_unlock(L);
+}
+
+
+LUA_API void lua_setfield (lua_State *L, int idx, const char *k) {
+ StkId t;
+ lua_lock(L);
+ api_checknelems(L, 1);
+ t = index2addr(L, idx);
+ setsvalue2s(L, L->top++, luaS_new(L, k));
+ luaV_settable(L, t, L->top - 1, L->top - 2);
+ L->top -= 2; /* pop value and key */
+ lua_unlock(L);
+}
+
+
+LUA_API void lua_rawset (lua_State *L, int idx) {
+ StkId t;
+ lua_lock(L);
+ api_checknelems(L, 2);
+ t = index2addr(L, idx);
+ api_check(L, ttistable(t), "table expected");
+ setobj2t(L, luaH_set(L, hvalue(t), L->top-2), L->top-1);
+ invalidateTMcache(hvalue(t));
+ luaC_barrierback(L, gcvalue(t), L->top-1);
+ L->top -= 2;
+ lua_unlock(L);
+}
+
+
+LUA_API void lua_rawseti (lua_State *L, int idx, int n) {
+ StkId t;
+ lua_lock(L);
+ api_checknelems(L, 1);
+ t = index2addr(L, idx);
+ api_check(L, ttistable(t), "table expected");
+ luaH_setint(L, hvalue(t), n, L->top - 1);
+ luaC_barrierback(L, gcvalue(t), L->top-1);
+ L->top--;
+ lua_unlock(L);
+}
+
+
+LUA_API void lua_rawsetp (lua_State *L, int idx, const void *p) {
+ StkId t;
+ TValue k;
+ lua_lock(L);
+ api_checknelems(L, 1);
+ t = index2addr(L, idx);
+ api_check(L, ttistable(t), "table expected");
+ setpvalue(&k, cast(void *, p));
+ setobj2t(L, luaH_set(L, hvalue(t), &k), L->top - 1);
+ luaC_barrierback(L, gcvalue(t), L->top - 1);
+ L->top--;
+ lua_unlock(L);
+}
+
+
+LUA_API int lua_setmetatable (lua_State *L, int objindex) {
+ TValue *obj;
+ Table *mt;
+ lua_lock(L);
+ api_checknelems(L, 1);
+ obj = index2addr(L, objindex);
+ if (ttisnil(L->top - 1))
+ mt = NULL;
+ else {
+ api_check(L, ttistable(L->top - 1), "table expected");
+ mt = hvalue(L->top - 1);
+ }
+ switch (ttypenv(obj)) {
+ case LUA_TTABLE: {
+ hvalue(obj)->metatable = mt;
+ if (mt) {
+ luaC_objbarrierback(L, gcvalue(obj), mt);
+ luaC_checkfinalizer(L, gcvalue(obj), mt);
+ }
+ break;
+ }
+ case LUA_TUSERDATA: {
+ uvalue(obj)->metatable = mt;
+ if (mt) {
+ luaC_objbarrier(L, rawuvalue(obj), mt);
+ luaC_checkfinalizer(L, gcvalue(obj), mt);
+ }
+ break;
+ }
+ default: {
+ G(L)->mt[ttypenv(obj)] = mt;
+ break;
+ }
+ }
+ L->top--;
+ lua_unlock(L);
+ return 1;
+}
+
+
+LUA_API void lua_setuservalue (lua_State *L, int idx) {
+ StkId o;
+ lua_lock(L);
+ api_checknelems(L, 1);
+ o = index2addr(L, idx);
+ api_check(L, ttisuserdata(o), "userdata expected");
+ if (ttisnil(L->top - 1))
+ uvalue(o)->env = NULL;
+ else {
+ api_check(L, ttistable(L->top - 1), "table expected");
+ uvalue(o)->env = hvalue(L->top - 1);
+ luaC_objbarrier(L, gcvalue(o), hvalue(L->top - 1));
+ }
+ L->top--;
+ lua_unlock(L);
+}
+
+
+/*
+** `load' and `call' functions (run Lua code)
+*/
+
+
+#define checkresults(L,na,nr) \
+ api_check(L, (nr) == LUA_MULTRET || (L->ci->top - L->top >= (nr) - (na)), \
+ "results from function overflow current stack size")
+
+
+LUA_API int lua_getctx (lua_State *L, int *ctx) {
+ if (L->ci->callstatus & CIST_YIELDED) {
+ if (ctx) *ctx = L->ci->u.c.ctx;
+ return L->ci->u.c.status;
+ }
+ else return LUA_OK;
+}
+
+
+LUA_API void lua_callk (lua_State *L, int nargs, int nresults, int ctx,
+ lua_CFunction k) {
+ StkId func;
+ lua_lock(L);
+ api_check(L, k == NULL || !isLua(L->ci),
+ "cannot use continuations inside hooks");
+ api_checknelems(L, nargs+1);
+ api_check(L, L->status == LUA_OK, "cannot do calls on non-normal thread");
+ checkresults(L, nargs, nresults);
+ func = L->top - (nargs+1);
+ if (k != NULL && L->nny == 0) { /* need to prepare continuation? */
+ L->ci->u.c.k = k; /* save continuation */
+ L->ci->u.c.ctx = ctx; /* save context */
+ luaD_call(L, func, nresults, 1); /* do the call */
+ }
+ else /* no continuation or no yieldable */
+ luaD_call(L, func, nresults, 0); /* just do the call */
+ adjustresults(L, nresults);
+ lua_unlock(L);
+}
+
+
+
+/*
+** Execute a protected call.
+*/
+struct CallS { /* data to `f_call' */
+ StkId func;
+ int nresults;
+};
+
+
+static void f_call (lua_State *L, void *ud) {
+ struct CallS *c = cast(struct CallS *, ud);
+ luaD_call(L, c->func, c->nresults, 0);
+}
+
+
+
+LUA_API int lua_pcallk (lua_State *L, int nargs, int nresults, int errfunc,
+ int ctx, lua_CFunction k) {
+ struct CallS c;
+ int status;
+ ptrdiff_t func;
+ lua_lock(L);
+ api_check(L, k == NULL || !isLua(L->ci),
+ "cannot use continuations inside hooks");
+ api_checknelems(L, nargs+1);
+ api_check(L, L->status == LUA_OK, "cannot do calls on non-normal thread");
+ checkresults(L, nargs, nresults);
+ if (errfunc == 0)
+ func = 0;
+ else {
+ StkId o = index2addr(L, errfunc);
+ api_checkstackindex(L, errfunc, o);
+ func = savestack(L, o);
+ }
+ c.func = L->top - (nargs+1); /* function to be called */
+ if (k == NULL || L->nny > 0) { /* no continuation or no yieldable? */
+ c.nresults = nresults; /* do a 'conventional' protected call */
+ status = luaD_pcall(L, f_call, &c, savestack(L, c.func), func);
+ }
+ else { /* prepare continuation (call is already protected by 'resume') */
+ CallInfo *ci = L->ci;
+ ci->u.c.k = k; /* save continuation */
+ ci->u.c.ctx = ctx; /* save context */
+ /* save information for error recovery */
+ ci->extra = savestack(L, c.func);
+ ci->u.c.old_allowhook = L->allowhook;
+ ci->u.c.old_errfunc = L->errfunc;
+ L->errfunc = func;
+ /* mark that function may do error recovery */
+ ci->callstatus |= CIST_YPCALL;
+ luaD_call(L, c.func, nresults, 1); /* do the call */
+ ci->callstatus &= ~CIST_YPCALL;
+ L->errfunc = ci->u.c.old_errfunc;
+ status = LUA_OK; /* if it is here, there were no errors */
+ }
+ adjustresults(L, nresults);
+ lua_unlock(L);
+ return status;
+}
+
+
+LUA_API int lua_load (lua_State *L, lua_Reader reader, void *data,
+ const char *chunkname, const char *mode) {
+ ZIO z;
+ int status;
+ lua_lock(L);
+ if (!chunkname) chunkname = "?";
+ luaZ_init(L, &z, reader, data);
+ status = luaD_protectedparser(L, &z, chunkname, mode);
+ if (status == LUA_OK) { /* no errors? */
+ LClosure *f = clLvalue(L->top - 1); /* get newly created function */
+ if (f->nupvalues == 1) { /* does it have one upvalue? */
+ /* get global table from registry */
+ Table *reg = hvalue(&G(L)->l_registry);
+ const TValue *gt = luaH_getint(reg, LUA_RIDX_GLOBALS);
+ /* set global table as 1st upvalue of 'f' (may be LUA_ENV) */
+ setobj(L, f->upvals[0]->v, gt);
+ luaC_barrier(L, f->upvals[0], gt);
+ }
+ }
+ lua_unlock(L);
+ return status;
+}
+
+
+LUA_API int lua_dump (lua_State *L, lua_Writer writer, void *data) {
+ int status;
+ TValue *o;
+ lua_lock(L);
+ api_checknelems(L, 1);
+ o = L->top - 1;
+ if (isLfunction(o))
+ status = luaU_dump(L, getproto(o), writer, data, 0);
+ else
+ status = 1;
+ lua_unlock(L);
+ return status;
+}
+
+
+LUA_API int lua_status (lua_State *L) {
+ return L->status;
+}
+
+
+/*
+** Garbage-collection function
+*/
+
+LUA_API int lua_gc (lua_State *L, int what, int data) {
+ int res = 0;
+ global_State *g;
+ lua_lock(L);
+ g = G(L);
+ switch (what) {
+ case LUA_GCSTOP: {
+ g->gcrunning = 0;
+ break;
+ }
+ case LUA_GCRESTART: {
+ luaE_setdebt(g, 0);
+ g->gcrunning = 1;
+ break;
+ }
+ case LUA_GCCOLLECT: {
+ luaC_fullgc(L, 0);
+ break;
+ }
+ case LUA_GCCOUNT: {
+ /* GC values are expressed in Kbytes: #bytes/2^10 */
+ res = cast_int(gettotalbytes(g) >> 10);
+ break;
+ }
+ case LUA_GCCOUNTB: {
+ res = cast_int(gettotalbytes(g) & 0x3ff);
+ break;
+ }
+ case LUA_GCSTEP: {
+ if (g->gckind == KGC_GEN) { /* generational mode? */
+ res = (g->GCestimate == 0); /* true if it will do major collection */
+ luaC_forcestep(L); /* do a single step */
+ }
+ else {
+ lu_mem debt = cast(lu_mem, data) * 1024 - GCSTEPSIZE;
+ if (g->gcrunning)
+ debt += g->GCdebt; /* include current debt */
+ luaE_setdebt(g, debt);
+ luaC_forcestep(L);
+ if (g->gcstate == GCSpause) /* end of cycle? */
+ res = 1; /* signal it */
+ }
+ break;
+ }
+ case LUA_GCSETPAUSE: {
+ res = g->gcpause;
+ g->gcpause = data;
+ break;
+ }
+ case LUA_GCSETMAJORINC: {
+ res = g->gcmajorinc;
+ g->gcmajorinc = data;
+ break;
+ }
+ case LUA_GCSETSTEPMUL: {
+ res = g->gcstepmul;
+ g->gcstepmul = data;
+ break;
+ }
+ case LUA_GCISRUNNING: {
+ res = g->gcrunning;
+ break;
+ }
+ case LUA_GCGEN: { /* change collector to generational mode */
+ luaC_changemode(L, KGC_GEN);
+ break;
+ }
+ case LUA_GCINC: { /* change collector to incremental mode */
+ luaC_changemode(L, KGC_NORMAL);
+ break;
+ }
+ default: res = -1; /* invalid option */
+ }
+ lua_unlock(L);
+ return res;
+}
+
+
+
+/*
+** miscellaneous functions
+*/
+
+
+LUA_API int lua_error (lua_State *L) {
+ lua_lock(L);
+ api_checknelems(L, 1);
+ luaG_errormsg(L);
+ /* code unreachable; will unlock when control actually leaves the kernel */
+ return 0; /* to avoid warnings */
+}
+
+
+LUA_API int lua_next (lua_State *L, int idx) {
+ StkId t;
+ int more;
+ lua_lock(L);
+ t = index2addr(L, idx);
+ api_check(L, ttistable(t), "table expected");
+ more = luaH_next(L, hvalue(t), L->top - 1);
+ if (more) {
+ api_incr_top(L);
+ }
+ else /* no more elements */
+ L->top -= 1; /* remove key */
+ lua_unlock(L);
+ return more;
+}
+
+
+LUA_API void lua_concat (lua_State *L, int n) {
+ lua_lock(L);
+ api_checknelems(L, n);
+ if (n >= 2) {
+ luaC_checkGC(L);
+ luaV_concat(L, n);
+ }
+ else if (n == 0) { /* push empty string */
+ setsvalue2s(L, L->top, luaS_newlstr(L, "", 0));
+ api_incr_top(L);
+ }
+ /* else n == 1; nothing to do */
+ lua_unlock(L);
+}
+
+
+LUA_API void lua_len (lua_State *L, int idx) {
+ StkId t;
+ lua_lock(L);
+ t = index2addr(L, idx);
+ luaV_objlen(L, L->top, t);
+ api_incr_top(L);
+ lua_unlock(L);
+}
+
+
+LUA_API lua_Alloc lua_getallocf (lua_State *L, void **ud) {
+ lua_Alloc f;
+ lua_lock(L);
+ if (ud) *ud = G(L)->ud;
+ f = G(L)->frealloc;
+ lua_unlock(L);
+ return f;
+}
+
+
+LUA_API void lua_setallocf (lua_State *L, lua_Alloc f, void *ud) {
+ lua_lock(L);
+ G(L)->ud = ud;
+ G(L)->frealloc = f;
+ lua_unlock(L);
+}
+
+
+LUA_API void *lua_newuserdata (lua_State *L, size_t size) {
+ Udata *u;
+ lua_lock(L);
+ luaC_checkGC(L);
+ u = luaS_newudata(L, size, NULL);
+ setuvalue(L, L->top, u);
+ api_incr_top(L);
+ lua_unlock(L);
+ return u + 1;
+}
+
+
+
+static const char *aux_upvalue (StkId fi, int n, TValue **val,
+ GCObject **owner) {
+ switch (ttype(fi)) {
+ case LUA_TCCL: { /* C closure */
+ CClosure *f = clCvalue(fi);
+ if (!(1 <= n && n <= f->nupvalues)) return NULL;
+ *val = &f->upvalue[n-1];
+ if (owner) *owner = obj2gco(f);
+ return "";
+ }
+ case LUA_TLCL: { /* Lua closure */
+ LClosure *f = clLvalue(fi);
+ TString *name;
+ Proto *p = f->p;
+ if (!(1 <= n && n <= p->sizeupvalues)) return NULL;
+ *val = f->upvals[n-1]->v;
+ if (owner) *owner = obj2gco(f->upvals[n - 1]);
+ name = p->upvalues[n-1].name;
+ return (name == NULL) ? "" : getstr(name);
+ }
+ default: return NULL; /* not a closure */
+ }
+}
+
+
+LUA_API const char *lua_getupvalue (lua_State *L, int funcindex, int n) {
+ const char *name;
+ TValue *val = NULL; /* to avoid warnings */
+ lua_lock(L);
+ name = aux_upvalue(index2addr(L, funcindex), n, &val, NULL);
+ if (name) {
+ setobj2s(L, L->top, val);
+ api_incr_top(L);
+ }
+ lua_unlock(L);
+ return name;
+}
+
+
+LUA_API const char *lua_setupvalue (lua_State *L, int funcindex, int n) {
+ const char *name;
+ TValue *val = NULL; /* to avoid warnings */
+ GCObject *owner = NULL; /* to avoid warnings */
+ StkId fi;
+ lua_lock(L);
+ fi = index2addr(L, funcindex);
+ api_checknelems(L, 1);
+ name = aux_upvalue(fi, n, &val, &owner);
+ if (name) {
+ L->top--;
+ setobj(L, val, L->top);
+ luaC_barrier(L, owner, L->top);
+ }
+ lua_unlock(L);
+ return name;
+}
+
+
+static UpVal **getupvalref (lua_State *L, int fidx, int n, LClosure **pf) {
+ LClosure *f;
+ StkId fi = index2addr(L, fidx);
+ api_check(L, ttisLclosure(fi), "Lua function expected");
+ f = clLvalue(fi);
+ api_check(L, (1 <= n && n <= f->p->sizeupvalues), "invalid upvalue index");
+ if (pf) *pf = f;
+ return &f->upvals[n - 1]; /* get its upvalue pointer */
+}
+
+
+LUA_API void *lua_upvalueid (lua_State *L, int fidx, int n) {
+ StkId fi = index2addr(L, fidx);
+ switch (ttype(fi)) {
+ case LUA_TLCL: { /* lua closure */
+ return *getupvalref(L, fidx, n, NULL);
+ }
+ case LUA_TCCL: { /* C closure */
+ CClosure *f = clCvalue(fi);
+ api_check(L, 1 <= n && n <= f->nupvalues, "invalid upvalue index");
+ return &f->upvalue[n - 1];
+ }
+ default: {
+ api_check(L, 0, "closure expected");
+ return NULL;
+ }
+ }
+}
+
+
+LUA_API void lua_upvaluejoin (lua_State *L, int fidx1, int n1,
+ int fidx2, int n2) {
+ LClosure *f1;
+ UpVal **up1 = getupvalref(L, fidx1, n1, &f1);
+ UpVal **up2 = getupvalref(L, fidx2, n2, NULL);
+ *up1 = *up2;
+ luaC_objbarrier(L, f1, *up2);
+}
+
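For orientation, a short sketch of the stack discipline these functions implement (standard Lua 5.2 C API usage; the caller is assumed to own L):

static void
stack_example(lua_State *L)
{
	int isnum;

	(void) lua_checkstack(L, 3);	/* ensure room for three pushes */
	lua_pushinteger(L, 42);		/* stack: 42 */
	lua_pushstring(L, "hello");	/* stack: 42 "hello" */
	lua_pushvalue(L, -2);		/* stack: 42 "hello" 42 */
	(void) lua_tointegerx(L, -1, &isnum);	/* isnum == 1 */
	lua_settop(L, 0);		/* drop everything */
}
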
diff --git a/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/lua/lapi.h b/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/lua/lapi.h
new file mode 100644
index 000000000000..c7d34ad84866
--- /dev/null
+++ b/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/lua/lapi.h
@@ -0,0 +1,24 @@
+/*
+** $Id: lapi.h,v 2.7.1.1 2013/04/12 18:48:47 roberto Exp $
+** Auxiliary functions from Lua API
+** See Copyright Notice in lua.h
+*/
+
+#ifndef lapi_h
+#define lapi_h
+
+
+#include "llimits.h"
+#include "lstate.h"
+
+#define api_incr_top(L) {L->top++; api_check(L, L->top <= L->ci->top, \
+ "stack overflow");}
+
+#define adjustresults(L,nres) \
+ { if ((nres) == LUA_MULTRET && L->ci->top < L->top) L->ci->top = L->top; }
+
+#define api_checknelems(L,n) api_check(L, (n) < (L->top - L->ci->func), \
+ "not enough elements in the stack")
+
+
+#endif
diff --git a/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/lua/lauxlib.c b/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/lua/lauxlib.c
new file mode 100644
index 000000000000..4bd13788b459
--- /dev/null
+++ b/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/lua/lauxlib.c
@@ -0,0 +1,791 @@
+/*
+** $Id: lauxlib.c,v 1.248.1.1 2013/04/12 18:48:47 roberto Exp $
+** Auxiliary functions for building Lua libraries
+** See Copyright Notice in lua.h
+*/
+
+
+#include <sys/zfs_context.h>
+
+/* This file uses only the official API of Lua.
+** Any function declared here could be written as an application function.
+*/
+
+#define lauxlib_c
+#define LUA_LIB
+
+#include "lua.h"
+
+#include "lauxlib.h"
+
+
+/*
+** {======================================================
+** Traceback
+** =======================================================
+*/
+
+
+#define LEVELS1 12 /* size of the first part of the stack */
+#define LEVELS2 10 /* size of the second part of the stack */
+
+
+
+/*
+** search for 'objidx' in table at index -1.
+** return 1 + string at top if it finds a good name.
+*/
+static int findfield (lua_State *L, int objidx, int level) {
+ if (level == 0 || !lua_istable(L, -1))
+ return 0; /* not found */
+ lua_pushnil(L); /* start 'next' loop */
+ while (lua_next(L, -2)) { /* for each pair in table */
+ if (lua_type(L, -2) == LUA_TSTRING) { /* ignore non-string keys */
+ if (lua_rawequal(L, objidx, -1)) { /* found object? */
+ lua_pop(L, 1); /* remove value (but keep name) */
+ return 1;
+ }
+ else if (findfield(L, objidx, level - 1)) { /* try recursively */
+ lua_remove(L, -2); /* remove table (but keep name) */
+ lua_pushliteral(L, ".");
+ lua_insert(L, -2); /* place '.' between the two names */
+ lua_concat(L, 3);
+ return 1;
+ }
+ }
+ lua_pop(L, 1); /* remove value */
+ }
+ return 0; /* not found */
+}
+
+
+static int pushglobalfuncname (lua_State *L, lua_Debug *ar) {
+ int top = lua_gettop(L);
+ lua_getinfo(L, "f", ar); /* push function */
+ lua_pushglobaltable(L);
+ if (findfield(L, top + 1, 2)) {
+ lua_copy(L, -1, top + 1); /* move name to proper place */
+ lua_pop(L, 2); /* remove pushed values */
+ return 1;
+ }
+ else {
+ lua_settop(L, top); /* remove function and global table */
+ return 0;
+ }
+}
+
+
+static void pushfuncname (lua_State *L, lua_Debug *ar) {
+ if (*ar->namewhat != '\0') /* is there a name? */
+ lua_pushfstring(L, "function " LUA_QS, ar->name);
+ else if (*ar->what == 'm') /* main? */
+ lua_pushliteral(L, "main chunk");
+ else if (*ar->what == 'C') {
+ if (pushglobalfuncname(L, ar)) {
+ lua_pushfstring(L, "function " LUA_QS, lua_tostring(L, -1));
+ lua_remove(L, -2); /* remove name */
+ }
+ else
+ lua_pushliteral(L, "?");
+ }
+ else
+ lua_pushfstring(L, "function <%s:%d>", ar->short_src, ar->linedefined);
+}
+
+
+static int countlevels (lua_State *L) {
+ lua_Debug ar;
+ int li = 1, le = 1;
+ /* find an upper bound */
+ while (lua_getstack(L, le, &ar)) { li = le; le *= 2; }
+ /* do a binary search */
+ while (li < le) {
+ int m = (li + le)/2;
+ if (lua_getstack(L, m, &ar)) li = m + 1;
+ else le = m;
+ }
+ return le - 1;
+}
+
+
+LUALIB_API void luaL_traceback (lua_State *L, lua_State *L1,
+ const char *msg, int level) {
+ lua_Debug ar;
+ int top = lua_gettop(L);
+ int numlevels = countlevels(L1);
+ int mark = (numlevels > LEVELS1 + LEVELS2) ? LEVELS1 : 0;
+ if (msg) lua_pushfstring(L, "%s\n", msg);
+ lua_pushliteral(L, "stack traceback:");
+ while (lua_getstack(L1, level++, &ar)) {
+ if (level == mark) { /* too many levels? */
+ lua_pushliteral(L, "\n\t..."); /* add a '...' */
+ level = numlevels - LEVELS2; /* and skip to last ones */
+ }
+ else {
+ lua_getinfo(L1, "Slnt", &ar);
+ lua_pushfstring(L, "\n\t%s:", ar.short_src);
+ if (ar.currentline > 0)
+ lua_pushfstring(L, "%d:", ar.currentline);
+ lua_pushliteral(L, " in ");
+ pushfuncname(L, &ar);
+ if (ar.istailcall)
+ lua_pushliteral(L, "\n\t(...tail calls...)");
+ lua_concat(L, lua_gettop(L) - top);
+ }
+ }
+ lua_concat(L, lua_gettop(L) - top);
+}
+
+/* }====================================================== */
+
+
+/*
+** {======================================================
+** Error-report functions
+** =======================================================
+*/
+
+LUALIB_API int luaL_argerror (lua_State *L, int narg, const char *extramsg) {
+ lua_Debug ar;
+ if (!lua_getstack(L, 0, &ar)) /* no stack frame? */
+ return luaL_error(L, "bad argument #%d (%s)", narg, extramsg);
+ lua_getinfo(L, "n", &ar);
+ if (strcmp(ar.namewhat, "method") == 0) {
+ narg--; /* do not count `self' */
+ if (narg == 0) /* error is in the self argument itself? */
+ return luaL_error(L, "calling " LUA_QS " on bad self (%s)",
+ ar.name, extramsg);
+ }
+ if (ar.name == NULL)
+ ar.name = (pushglobalfuncname(L, &ar)) ? lua_tostring(L, -1) : "?";
+ return luaL_error(L, "bad argument #%d to " LUA_QS " (%s)",
+ narg, ar.name, extramsg);
+}
+
+
+static int typeerror (lua_State *L, int narg, const char *tname) {
+ const char *msg = lua_pushfstring(L, "%s expected, got %s",
+ tname, luaL_typename(L, narg));
+ return luaL_argerror(L, narg, msg);
+}
+
+
+static void tag_error (lua_State *L, int narg, int tag) {
+ typeerror(L, narg, lua_typename(L, tag));
+}
+
+
+LUALIB_API void luaL_where (lua_State *L, int level) {
+ lua_Debug ar;
+ if (lua_getstack(L, level, &ar)) { /* check function at level */
+ lua_getinfo(L, "Sl", &ar); /* get info about it */
+ if (ar.currentline > 0) { /* is there info? */
+ lua_pushfstring(L, "%s:%d: ", ar.short_src, ar.currentline);
+ return;
+ }
+ }
+ lua_pushliteral(L, ""); /* else, no information available... */
+}
+
+
+LUALIB_API int luaL_error (lua_State *L, const char *fmt, ...) {
+ va_list argp;
+ va_start(argp, fmt);
+ luaL_where(L, 1);
+ lua_pushvfstring(L, fmt, argp);
+ va_end(argp);
+ lua_concat(L, 2);
+ return lua_error(L);
+}
+
+
+#if !defined(inspectstat) /* { */
+
+#if defined(LUA_USE_POSIX)
+
+#include <sys/wait.h>
+
+/*
+** use appropriate macros to interpret 'pclose' return status
+*/
+#define inspectstat(stat,what) \
+ if (WIFEXITED(stat)) { stat = WEXITSTATUS(stat); } \
+ else if (WIFSIGNALED(stat)) { stat = WTERMSIG(stat); what = "signal"; }
+
+#else
+
+#define inspectstat(stat,what) /* no op */
+
+#endif
+
+#endif /* } */
+
+
+/* }====================================================== */
+
+
+/*
+** {======================================================
+** Userdata's metatable manipulation
+** =======================================================
+*/
+
+LUALIB_API int luaL_newmetatable (lua_State *L, const char *tname) {
+ luaL_getmetatable(L, tname); /* try to get metatable */
+ if (!lua_isnil(L, -1)) /* name already in use? */
+ return 0; /* leave previous value on top, but return 0 */
+ lua_pop(L, 1);
+ lua_newtable(L); /* create metatable */
+ lua_pushvalue(L, -1);
+ lua_setfield(L, LUA_REGISTRYINDEX, tname); /* registry.name = metatable */
+ return 1;
+}
+
+
+LUALIB_API void luaL_setmetatable (lua_State *L, const char *tname) {
+ luaL_getmetatable(L, tname);
+ lua_setmetatable(L, -2);
+}
+
+
+LUALIB_API void *luaL_testudata (lua_State *L, int ud, const char *tname) {
+ void *p = lua_touserdata(L, ud);
+ if (p != NULL) { /* value is a userdata? */
+ if (lua_getmetatable(L, ud)) { /* does it have a metatable? */
+ luaL_getmetatable(L, tname); /* get correct metatable */
+ if (!lua_rawequal(L, -1, -2)) /* not the same? */
+ p = NULL; /* value is a userdata with wrong metatable */
+ lua_pop(L, 2); /* remove both metatables */
+ return p;
+ }
+ }
+ return NULL; /* value is not a userdata with a metatable */
+}
+
+
+LUALIB_API void *luaL_checkudata (lua_State *L, int ud, const char *tname) {
+ void *p = luaL_testudata(L, ud, tname);
+ if (p == NULL) typeerror(L, ud, tname);
+ return p;
+}
+
+/* }====================================================== */
+
+
+/*
+** {======================================================
+** Argument check functions
+** =======================================================
+*/
+
+LUALIB_API int luaL_checkoption (lua_State *L, int narg, const char *def,
+ const char *const lst[]) {
+ const char *name = (def) ? luaL_optstring(L, narg, def) :
+ luaL_checkstring(L, narg);
+ int i;
+ for (i=0; lst[i]; i++)
+ if (strcmp(lst[i], name) == 0)
+ return i;
+ return luaL_argerror(L, narg,
+ lua_pushfstring(L, "invalid option " LUA_QS, name));
+}
+
+
+LUALIB_API void luaL_checkstack (lua_State *L, int space, const char *msg) {
+ /* keep some extra space to run error routines, if needed */
+ const int extra = LUA_MINSTACK;
+ if (!lua_checkstack(L, space + extra)) {
+ if (msg)
+ luaL_error(L, "stack overflow (%s)", msg);
+ else
+ luaL_error(L, "stack overflow");
+ }
+}
+
+
+LUALIB_API void luaL_checktype (lua_State *L, int narg, int t) {
+ if (lua_type(L, narg) != t)
+ tag_error(L, narg, t);
+}
+
+
+LUALIB_API void luaL_checkany (lua_State *L, int narg) {
+ if (lua_type(L, narg) == LUA_TNONE)
+ luaL_argerror(L, narg, "value expected");
+}
+
+
+LUALIB_API const char *luaL_checklstring (lua_State *L, int narg, size_t *len) {
+ const char *s = lua_tolstring(L, narg, len);
+ if (!s) tag_error(L, narg, LUA_TSTRING);
+ return s;
+}
+
+
+LUALIB_API const char *luaL_optlstring (lua_State *L, int narg,
+ const char *def, size_t *len) {
+ if (lua_isnoneornil(L, narg)) {
+ if (len)
+ *len = (def ? strlen(def) : 0);
+ return def;
+ }
+ else return luaL_checklstring(L, narg, len);
+}
+
+
+LUALIB_API lua_Number luaL_checknumber (lua_State *L, int narg) {
+ int isnum;
+ lua_Number d = lua_tonumberx(L, narg, &isnum);
+ if (!isnum)
+ tag_error(L, narg, LUA_TNUMBER);
+ return d;
+}
+
+
+LUALIB_API lua_Number luaL_optnumber (lua_State *L, int narg, lua_Number def) {
+ return luaL_opt(L, luaL_checknumber, narg, def);
+}
+
+
+LUALIB_API lua_Integer luaL_checkinteger (lua_State *L, int narg) {
+ int isnum;
+ lua_Integer d = lua_tointegerx(L, narg, &isnum);
+ if (!isnum)
+ tag_error(L, narg, LUA_TNUMBER);
+ return d;
+}
+
+
+LUALIB_API lua_Unsigned luaL_checkunsigned (lua_State *L, int narg) {
+ int isnum;
+ lua_Unsigned d = lua_tounsignedx(L, narg, &isnum);
+ if (!isnum)
+ tag_error(L, narg, LUA_TNUMBER);
+ return d;
+}
+
+
+LUALIB_API lua_Integer luaL_optinteger (lua_State *L, int narg,
+ lua_Integer def) {
+ return luaL_opt(L, luaL_checkinteger, narg, def);
+}
+
+
+LUALIB_API lua_Unsigned luaL_optunsigned (lua_State *L, int narg,
+ lua_Unsigned def) {
+ return luaL_opt(L, luaL_checkunsigned, narg, def);
+}
+
+/* }====================================================== */
+
+
+/*
+** {======================================================
+** Generic Buffer manipulation
+** =======================================================
+*/
+
+/*
+** check whether buffer is using a userdata on the stack as a temporary
+** buffer
+*/
+#define buffonstack(B) ((B)->b != (B)->initb)
+
+
+/*
+** returns a pointer to a free area with at least 'sz' bytes
+*/
+LUALIB_API char *luaL_prepbuffsize (luaL_Buffer *B, size_t sz) {
+ lua_State *L = B->L;
+ if (B->size - B->n < sz) { /* not enough space? */
+ char *newbuff;
+ size_t newsize = B->size * 2; /* double buffer size */
+ if (newsize - B->n < sz) /* not big enough? */
+ newsize = B->n + sz;
+ if (newsize < B->n || newsize - B->n < sz)
+ luaL_error(L, "buffer too large");
+ /* create larger buffer */
+ newbuff = (char *)lua_newuserdata(L, newsize * sizeof(char));
+ /* move content to new buffer */
+ memcpy(newbuff, B->b, B->n * sizeof(char));
+ if (buffonstack(B))
+ lua_remove(L, -2); /* remove old buffer */
+ B->b = newbuff;
+ B->size = newsize;
+ }
+ return &B->b[B->n];
+}
+
+
+LUALIB_API void luaL_addlstring (luaL_Buffer *B, const char *s, size_t l) {
+ char *b = luaL_prepbuffsize(B, l);
+ memcpy(b, s, l * sizeof(char));
+ luaL_addsize(B, l);
+}
+
+
+LUALIB_API void luaL_addstring (luaL_Buffer *B, const char *s) {
+ luaL_addlstring(B, s, strlen(s));
+}
+
+
+LUALIB_API void luaL_pushresult (luaL_Buffer *B) {
+ lua_State *L = B->L;
+ lua_pushlstring(L, B->b, B->n);
+ if (buffonstack(B))
+ lua_remove(L, -2); /* remove old buffer */
+}
+
+
+LUALIB_API void luaL_pushresultsize (luaL_Buffer *B, size_t sz) {
+ luaL_addsize(B, sz);
+ luaL_pushresult(B);
+}
+
+
+LUALIB_API void luaL_addvalue (luaL_Buffer *B) {
+ lua_State *L = B->L;
+ size_t l;
+ const char *s = lua_tolstring(L, -1, &l);
+ if (buffonstack(B))
+ lua_insert(L, -2); /* put value below buffer */
+ luaL_addlstring(B, s, l);
+ lua_remove(L, (buffonstack(B)) ? -2 : -1); /* remove value */
+}
+
+
+LUALIB_API void luaL_buffinit (lua_State *L, luaL_Buffer *B) {
+ B->L = L;
+ B->b = B->initb;
+ B->n = 0;
+ B->size = LUAL_BUFFERSIZE;
+}
+
+
+LUALIB_API char *luaL_buffinitsize (lua_State *L, luaL_Buffer *B, size_t sz) {
+ luaL_buffinit(L, B);
+ return luaL_prepbuffsize(B, sz);
+}
+
+/* }====================================================== */
+
+
+/*
+** {======================================================
+** Reference system
+** =======================================================
+*/
+
+/* index of free-list header */
+#define freelist 0
+
+
+LUALIB_API int luaL_ref (lua_State *L, int t) {
+ int ref;
+ if (lua_isnil(L, -1)) {
+ lua_pop(L, 1); /* remove from stack */
+ return LUA_REFNIL; /* `nil' has a unique fixed reference */
+ }
+ t = lua_absindex(L, t);
+ lua_rawgeti(L, t, freelist); /* get first free element */
+ ref = (int)lua_tointeger(L, -1); /* ref = t[freelist] */
+ lua_pop(L, 1); /* remove it from stack */
+ if (ref != 0) { /* any free element? */
+ lua_rawgeti(L, t, ref); /* remove it from list */
+ lua_rawseti(L, t, freelist); /* (t[freelist] = t[ref]) */
+ }
+ else /* no free elements */
+ ref = (int)lua_rawlen(L, t) + 1; /* get a new reference */
+ lua_rawseti(L, t, ref);
+ return ref;
+}
+
+
+LUALIB_API void luaL_unref (lua_State *L, int t, int ref) {
+ if (ref >= 0) {
+ t = lua_absindex(L, t);
+ lua_rawgeti(L, t, freelist);
+ lua_rawseti(L, t, ref); /* t[ref] = t[freelist] */
+ lua_pushinteger(L, ref);
+ lua_rawseti(L, t, freelist); /* t[freelist] = ref */
+ }
+}
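+
+/*
+** Editor's illustration (not part of the upstream sources): typical use
+** of the reference system with the registry -- pin the value on top of
+** the stack, fetch it back later, then release the slot for reuse.
+** Disabled with #if 0 on purpose; it is a sketch, not committed code.
+*/
+#if 0
+static void example_pin (lua_State *L) {
+  int ref = luaL_ref(L, LUA_REGISTRYINDEX);  /* pops value, returns slot */
+  lua_rawgeti(L, LUA_REGISTRYINDEX, ref);    /* push the value again */
+  lua_pop(L, 1);
+  luaL_unref(L, LUA_REGISTRYINDEX, ref);     /* slot goes to free list */
+}
+#endif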
+
+/* }====================================================== */
+
+
+/*
+** {======================================================
+** Load functions
+** =======================================================
+*/
+
+typedef struct LoadS {
+ const char *s;
+ size_t size;
+} LoadS;
+
+
+static const char *getS (lua_State *L, void *ud, size_t *size) {
+ LoadS *ls = (LoadS *)ud;
+ (void)L; /* not used */
+ if (ls->size == 0) return NULL;
+ *size = ls->size;
+ ls->size = 0;
+ return ls->s;
+}
+
+
+LUALIB_API int luaL_loadbufferx (lua_State *L, const char *buff, size_t size,
+ const char *name, const char *mode) {
+ LoadS ls;
+ ls.s = buff;
+ ls.size = size;
+ return lua_load(L, getS, &ls, name, mode);
+}
+
+
+LUALIB_API int luaL_loadstring (lua_State *L, const char *s) {
+ return luaL_loadbuffer(L, s, strlen(s), s);
+}
+
+/* }====================================================== */
+
+
+
+LUALIB_API int luaL_getmetafield (lua_State *L, int obj, const char *event) {
+ if (!lua_getmetatable(L, obj)) /* no metatable? */
+ return 0;
+ lua_pushstring(L, event);
+ lua_rawget(L, -2);
+ if (lua_isnil(L, -1)) {
+ lua_pop(L, 2); /* remove metatable and metafield */
+ return 0;
+ }
+ else {
+ lua_remove(L, -2); /* remove only metatable */
+ return 1;
+ }
+}
+
+
+LUALIB_API int luaL_callmeta (lua_State *L, int obj, const char *event) {
+ obj = lua_absindex(L, obj);
+ if (!luaL_getmetafield(L, obj, event)) /* no metafield? */
+ return 0;
+ lua_pushvalue(L, obj);
+ lua_call(L, 1, 1);
+ return 1;
+}
+
+
+LUALIB_API int luaL_len (lua_State *L, int idx) {
+ int l;
+ int isnum;
+ lua_len(L, idx);
+ l = (int)lua_tointegerx(L, -1, &isnum);
+ if (!isnum)
+ luaL_error(L, "object length is not a number");
+  lua_pop(L, 1);  /* remove the pushed length */
+ return l;
+}
+
+
+LUALIB_API const char *luaL_tolstring (lua_State *L, int idx, size_t *len) {
+ if (!luaL_callmeta(L, idx, "__tostring")) { /* no metafield? */
+ switch (lua_type(L, idx)) {
+ case LUA_TNUMBER:
+ case LUA_TSTRING:
+ lua_pushvalue(L, idx);
+ break;
+ case LUA_TBOOLEAN:
+ lua_pushstring(L, (lua_toboolean(L, idx) ? "true" : "false"));
+ break;
+ case LUA_TNIL:
+ lua_pushliteral(L, "nil");
+ break;
+ default:
+ lua_pushfstring(L, "%s: %p", luaL_typename(L, idx),
+ lua_topointer(L, idx));
+ break;
+ }
+ }
+ return lua_tolstring(L, -1, len);
+}
+
+
+/*
+** {======================================================
+** Compatibility with 5.1 module functions
+** =======================================================
+*/
+#if defined(LUA_COMPAT_MODULE)
+
+static const char *luaL_findtable (lua_State *L, int idx,
+ const char *fname, int szhint) {
+ const char *e;
+ if (idx) lua_pushvalue(L, idx);
+ do {
+ e = strchr(fname, '.');
+ if (e == NULL) e = fname + strlen(fname);
+ lua_pushlstring(L, fname, e - fname);
+ lua_rawget(L, -2);
+ if (lua_isnil(L, -1)) { /* no such field? */
+ lua_pop(L, 1); /* remove this nil */
+ lua_createtable(L, 0, (*e == '.' ? 1 : szhint)); /* new table for field */
+ lua_pushlstring(L, fname, e - fname);
+ lua_pushvalue(L, -2);
+ lua_settable(L, -4); /* set new table into field */
+ }
+ else if (!lua_istable(L, -1)) { /* field has a non-table value? */
+ lua_pop(L, 2); /* remove table and value */
+ return fname; /* return problematic part of the name */
+ }
+ lua_remove(L, -2); /* remove previous table */
+ fname = e + 1;
+ } while (*e == '.');
+ return NULL;
+}
+
+
+/*
+** Count number of elements in a luaL_Reg list.
+*/
+static int libsize (const luaL_Reg *l) {
+ int size = 0;
+ for (; l && l->name; l++) size++;
+ return size;
+}
+
+
+/*
+** Find or create a module table with a given name. The function
+** first looks at the _LOADED table and, if that fails, tries a
+** global variable with that name. In either case, it leaves the
+** module table on the stack.
+*/
+LUALIB_API void luaL_pushmodule (lua_State *L, const char *modname,
+ int sizehint) {
+ luaL_findtable(L, LUA_REGISTRYINDEX, "_LOADED", 1); /* get _LOADED table */
+ lua_getfield(L, -1, modname); /* get _LOADED[modname] */
+ if (!lua_istable(L, -1)) { /* not found? */
+ lua_pop(L, 1); /* remove previous result */
+ /* try global variable (and create one if it does not exist) */
+ lua_pushglobaltable(L);
+ if (luaL_findtable(L, 0, modname, sizehint) != NULL)
+ luaL_error(L, "name conflict for module " LUA_QS, modname);
+ lua_pushvalue(L, -1);
+ lua_setfield(L, -3, modname); /* _LOADED[modname] = new table */
+ }
+ lua_remove(L, -2); /* remove _LOADED table */
+}
+
+
+LUALIB_API void luaL_openlib (lua_State *L, const char *libname,
+ const luaL_Reg *l, int nup) {
+ luaL_checkversion(L);
+ if (libname) {
+ luaL_pushmodule(L, libname, libsize(l)); /* get/create library table */
+ lua_insert(L, -(nup + 1)); /* move library table to below upvalues */
+ }
+ if (l)
+ luaL_setfuncs(L, l, nup);
+ else
+ lua_pop(L, nup); /* remove upvalues */
+}
+
+#endif
+/* }====================================================== */
+
+/*
+** set functions from list 'l' into table at top - 'nup'; each
+** function gets the 'nup' elements at the top as upvalues.
+** Returns with only the table on the stack.
+*/
+LUALIB_API void luaL_setfuncs (lua_State *L, const luaL_Reg *l, int nup) {
+ luaL_checkversion(L);
+ luaL_checkstack(L, nup, "too many upvalues");
+ for (; l->name != NULL; l++) { /* fill the table with given functions */
+ int i;
+ for (i = 0; i < nup; i++) /* copy upvalues to the top */
+ lua_pushvalue(L, -nup);
+ lua_pushcclosure(L, l->func, nup); /* closure with those upvalues */
+ lua_setfield(L, -(nup + 2), l->name);
+ }
+ lua_pop(L, nup); /* remove upvalues */
+}
+
+
+/*
+** ensure that stack[idx][fname] is a table and push that table
+** onto the stack
+*/
+LUALIB_API int luaL_getsubtable (lua_State *L, int idx, const char *fname) {
+ lua_getfield(L, idx, fname);
+ if (lua_istable(L, -1)) return 1; /* table already there */
+ else {
+ lua_pop(L, 1); /* remove previous result */
+ idx = lua_absindex(L, idx);
+ lua_newtable(L);
+ lua_pushvalue(L, -1); /* copy to be left at top */
+ lua_setfield(L, idx, fname); /* assign new table to field */
+ return 0; /* false, because did not find table there */
+ }
+}
+
+
+/*
+** stripped-down 'require'. Calls 'openf' to open a module,
+** registers the result in the 'package.loaded' table and, if 'glb'
+** is true, also registers the result in the global table.
+** Leaves the resulting module on top of the stack.
+*/
+LUALIB_API void luaL_requiref (lua_State *L, const char *modname,
+ lua_CFunction openf, int glb) {
+ lua_pushcfunction(L, openf);
+ lua_pushstring(L, modname); /* argument to open function */
+ lua_call(L, 1, 1); /* open module */
+ luaL_getsubtable(L, LUA_REGISTRYINDEX, "_LOADED");
+ lua_pushvalue(L, -2); /* make copy of module (call result) */
+ lua_setfield(L, -2, modname); /* _LOADED[modname] = module */
+ lua_pop(L, 1); /* remove _LOADED table */
+ if (glb) {
+ lua_pushvalue(L, -1); /* copy of 'mod' */
+ lua_setglobal(L, modname); /* _G[modname] = module */
+ }
+}
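+
+/*
+** Editor's note (illustrative): an embedder typically opens a library
+** with, e.g., luaL_requiref(L, "bit32", luaopen_bit32, 1) followed by
+** lua_pop(L, 1), which registers the module both in _LOADED and as a
+** global and then drops the copy left on top.
+*/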
+
+
+LUALIB_API const char *luaL_gsub (lua_State *L, const char *s, const char *p,
+ const char *r) {
+ const char *wild;
+ size_t l = strlen(p);
+ luaL_Buffer b;
+ luaL_buffinit(L, &b);
+ while ((wild = strstr(s, p)) != NULL) {
+ luaL_addlstring(&b, s, wild - s); /* push prefix */
+ luaL_addstring(&b, r); /* push replacement in place of pattern */
+ s = wild + l; /* continue after `p' */
+ }
+ luaL_addstring(&b, s); /* push last suffix */
+ luaL_pushresult(&b);
+ return lua_tostring(L, -1);
+}
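+
+/*
+** Editor's note (illustrative): the replacement above is literal, not a
+** pattern match; e.g. luaL_gsub(L, "a.b.c", ".", "/") pushes and returns
+** the string "a/b/c".
+*/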
+
+
+LUALIB_API void luaL_checkversion_ (lua_State *L, lua_Number ver) {
+ const lua_Number *v = lua_version(L);
+ if (v != lua_version(NULL))
+ luaL_error(L, "multiple Lua VMs detected");
+ else if (*v != ver)
+ luaL_error(L, "version mismatch: app. needs %f, Lua core provides %f",
+ ver, *v);
+ /* check conversions number -> integer types */
+ lua_pushnumber(L, -(lua_Number)0x1234);
+ if (lua_tointeger(L, -1) != -0x1234 ||
+ lua_tounsigned(L, -1) != (lua_Unsigned)-0x1234)
+ luaL_error(L, "bad conversion number->int;"
+ " must recompile Lua with proper settings");
+ lua_pop(L, 1);
+}
+
diff --git a/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/lua/lauxlib.h b/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/lua/lauxlib.h
new file mode 100644
index 000000000000..f6fdac14f50b
--- /dev/null
+++ b/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/lua/lauxlib.h
@@ -0,0 +1,176 @@
+/*
+** $Id: lauxlib.h,v 1.120.1.1 2013/04/12 18:48:47 roberto Exp $
+** Auxiliary functions for building Lua libraries
+** See Copyright Notice in lua.h
+*/
+
+
+#ifndef lauxlib_h
+#define lauxlib_h
+
+
+#include <sys/zfs_context.h>
+
+#include "lua.h"
+
+
+
+/* extra error code for `luaL_load' */
+#define LUA_ERRFILE (LUA_ERRERR+1)
+
+
+typedef struct luaL_Reg {
+ const char *name;
+ lua_CFunction func;
+} luaL_Reg;
+
+
+LUALIB_API void (luaL_checkversion_) (lua_State *L, lua_Number ver);
+#define luaL_checkversion(L) luaL_checkversion_(L, LUA_VERSION_NUM)
+
+LUALIB_API int (luaL_getmetafield) (lua_State *L, int obj, const char *e);
+LUALIB_API int (luaL_callmeta) (lua_State *L, int obj, const char *e);
+LUALIB_API const char *(luaL_tolstring) (lua_State *L, int idx, size_t *len);
+LUALIB_API int (luaL_argerror) (lua_State *L, int numarg, const char *extramsg);
+LUALIB_API const char *(luaL_checklstring) (lua_State *L, int numArg,
+ size_t *l);
+LUALIB_API const char *(luaL_optlstring) (lua_State *L, int numArg,
+ const char *def, size_t *l);
+LUALIB_API lua_Number (luaL_checknumber) (lua_State *L, int numArg);
+LUALIB_API lua_Number (luaL_optnumber) (lua_State *L, int nArg, lua_Number def);
+
+LUALIB_API lua_Integer (luaL_checkinteger) (lua_State *L, int numArg);
+LUALIB_API lua_Integer (luaL_optinteger) (lua_State *L, int nArg,
+ lua_Integer def);
+LUALIB_API lua_Unsigned (luaL_checkunsigned) (lua_State *L, int numArg);
+LUALIB_API lua_Unsigned (luaL_optunsigned) (lua_State *L, int numArg,
+ lua_Unsigned def);
+
+LUALIB_API void (luaL_checkstack) (lua_State *L, int sz, const char *msg);
+LUALIB_API void (luaL_checktype) (lua_State *L, int narg, int t);
+LUALIB_API void (luaL_checkany) (lua_State *L, int narg);
+
+LUALIB_API int (luaL_newmetatable) (lua_State *L, const char *tname);
+LUALIB_API void (luaL_setmetatable) (lua_State *L, const char *tname);
+LUALIB_API void *(luaL_testudata) (lua_State *L, int ud, const char *tname);
+LUALIB_API void *(luaL_checkudata) (lua_State *L, int ud, const char *tname);
+
+LUALIB_API void (luaL_where) (lua_State *L, int lvl);
+LUALIB_API int (luaL_error) (lua_State *L, const char *fmt, ...);
+
+LUALIB_API int (luaL_checkoption) (lua_State *L, int narg, const char *def,
+ const char *const lst[]);
+
+/* pre-defined references */
+#define LUA_NOREF (-2)
+#define LUA_REFNIL (-1)
+
+LUALIB_API int (luaL_ref) (lua_State *L, int t);
+LUALIB_API void (luaL_unref) (lua_State *L, int t, int ref);
+
+LUALIB_API int (luaL_loadbufferx) (lua_State *L, const char *buff, size_t sz,
+ const char *name, const char *mode);
+LUALIB_API int (luaL_loadstring) (lua_State *L, const char *s);
+
+LUALIB_API int (luaL_len) (lua_State *L, int idx);
+
+LUALIB_API const char *(luaL_gsub) (lua_State *L, const char *s, const char *p,
+ const char *r);
+
+LUALIB_API void (luaL_setfuncs) (lua_State *L, const luaL_Reg *l, int nup);
+
+LUALIB_API int (luaL_getsubtable) (lua_State *L, int idx, const char *fname);
+
+LUALIB_API void (luaL_traceback) (lua_State *L, lua_State *L1,
+ const char *msg, int level);
+
+LUALIB_API void (luaL_requiref) (lua_State *L, const char *modname,
+ lua_CFunction openf, int glb);
+
+/*
+** ===============================================================
+** some useful macros
+** ===============================================================
+*/
+
+
+#define luaL_newlibtable(L,l) \
+ lua_createtable(L, 0, sizeof(l)/sizeof((l)[0]) - 1)
+
+#define luaL_newlib(L,l) (luaL_newlibtable(L,l), luaL_setfuncs(L,l,0))
+
+#define luaL_argcheck(L, cond,numarg,extramsg) \
+ ((void)((cond) || luaL_argerror(L, (numarg), (extramsg))))
+#define luaL_checkstring(L,n) (luaL_checklstring(L, (n), NULL))
+#define luaL_optstring(L,n,d) (luaL_optlstring(L, (n), (d), NULL))
+#define luaL_checkint(L,n) ((int)luaL_checkinteger(L, (n)))
+#define luaL_optint(L,n,d) ((int)luaL_optinteger(L, (n), (d)))
+#define luaL_checklong(L,n) ((long)luaL_checkinteger(L, (n)))
+#define luaL_optlong(L,n,d) ((long)luaL_optinteger(L, (n), (d)))
+
+#define luaL_typename(L,i) lua_typename(L, lua_type(L,(i)))
+
+#define luaL_dofile(L, fn) \
+ (luaL_loadfile(L, fn) || lua_pcall(L, 0, LUA_MULTRET, 0))
+
+#define luaL_dostring(L, s) \
+ (luaL_loadstring(L, s) || lua_pcall(L, 0, LUA_MULTRET, 0))
+
+#define luaL_getmetatable(L,n) (lua_getfield(L, LUA_REGISTRYINDEX, (n)))
+
+#define luaL_opt(L,f,n,d) (lua_isnoneornil(L,(n)) ? (d) : f(L,(n)))
+
+#define luaL_loadbuffer(L,s,sz,n) luaL_loadbufferx(L,s,sz,n,NULL)
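+
+/*
+** Editor's illustration (not upstream): a typical C function guarding
+** its arguments with the macros above; the function name is invented
+** for exposition and the block is disabled with #if 0 on purpose.
+*/
+#if 0
+static int example_clamp (lua_State *L) {
+  int v = luaL_checkint(L, 1);            /* arg #1 must be a number */
+  int lo = luaL_optint(L, 2, 0);          /* arg #2 defaults to 0 */
+  int hi = luaL_optint(L, 3, 100);        /* arg #3 defaults to 100 */
+  luaL_argcheck(L, lo <= hi, 3, "empty range");
+  lua_pushinteger(L, (v < lo) ? lo : (v > hi) ? hi : v);
+  return 1;  /* one result: the clamped value */
+}
+#endif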
+
+
+/*
+** {======================================================
+** Generic Buffer manipulation
+** =======================================================
+*/
+
+typedef struct luaL_Buffer {
+ char *b; /* buffer address */
+ size_t size; /* buffer size */
+ size_t n; /* number of characters in buffer */
+ lua_State *L;
+ char initb[LUAL_BUFFERSIZE]; /* initial buffer */
+} luaL_Buffer;
+
+
+#define luaL_addchar(B,c) \
+ ((void)((B)->n < (B)->size || luaL_prepbuffsize((B), 1)), \
+ ((B)->b[(B)->n++] = (c)))
+
+#define luaL_addsize(B,s) ((B)->n += (s))
+
+LUALIB_API void (luaL_buffinit) (lua_State *L, luaL_Buffer *B);
+LUALIB_API char *(luaL_prepbuffsize) (luaL_Buffer *B, size_t sz);
+LUALIB_API void (luaL_addlstring) (luaL_Buffer *B, const char *s, size_t l);
+LUALIB_API void (luaL_addstring) (luaL_Buffer *B, const char *s);
+LUALIB_API void (luaL_addvalue) (luaL_Buffer *B);
+LUALIB_API void (luaL_pushresult) (luaL_Buffer *B);
+LUALIB_API void (luaL_pushresultsize) (luaL_Buffer *B, size_t sz);
+LUALIB_API char *(luaL_buffinitsize) (lua_State *L, luaL_Buffer *B, size_t sz);
+
+#define luaL_prepbuffer(B) luaL_prepbuffsize(B, LUAL_BUFFERSIZE)
+
+/* }====================================================== */
+
+
+/* compatibility with old module system */
+#if defined(LUA_COMPAT_MODULE)
+
+LUALIB_API void (luaL_pushmodule) (lua_State *L, const char *modname,
+ int sizehint);
+LUALIB_API void (luaL_openlib) (lua_State *L, const char *libname,
+ const luaL_Reg *l, int nup);
+
+#define luaL_register(L,n,l) (luaL_openlib(L,(n),(l),0))
+
+#endif
+
+
+#endif
+
+
diff --git a/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/lua/lbaselib.c b/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/lua/lbaselib.c
new file mode 100644
index 000000000000..b580cee1f955
--- /dev/null
+++ b/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/lua/lbaselib.c
@@ -0,0 +1,296 @@
+/*
+** $Id: lbaselib.c,v 1.276.1.1 2013/04/12 18:48:47 roberto Exp $
+** Basic library
+** See Copyright Notice in lua.h
+*/
+
+/* The following built-in Lua functions have been removed and are not available
+ * for use in ZFS channel programs:
+ *
+ * dofile
+ * loadfile
+ * load
+ * pcall
+ * print
+ * xpcall
+ */
+
+#include <sys/zfs_context.h>
+#include <sys/ctype.h>
+#ifdef illumos
+#define toupper(C) (((C) >= 'a' && (C) <= 'z')? (C) - 'a' + 'A': (C))
+#else
+#define isalnum(C) (isalpha(C) || isdigit(C))
+#endif
+
+#define lbaselib_c
+#define LUA_LIB
+
+#include "lua.h"
+
+#include "lauxlib.h"
+#include "lualib.h"
+
+#define SPACECHARS " \f\n\r\t\v"
+
+static int luaB_tonumber (lua_State *L) {
+ if (lua_isnoneornil(L, 2)) { /* standard conversion */
+ int isnum;
+ lua_Number n = lua_tonumberx(L, 1, &isnum);
+ if (isnum) {
+ lua_pushnumber(L, n);
+ return 1;
+ } /* else not a number; must be something */
+ luaL_checkany(L, 1);
+ }
+ else {
+ size_t l;
+ const char *s = luaL_checklstring(L, 1, &l);
+ const char *e = s + l; /* end point for 's' */
+ int base = luaL_checkint(L, 2);
+ int neg = 0;
+ luaL_argcheck(L, 2 <= base && base <= 36, 2, "base out of range");
+ s += strspn(s, SPACECHARS); /* skip initial spaces */
+    if (*s == '-') { s++; neg = 1; } /* handle sign */
+ else if (*s == '+') s++;
+ if (isalnum((unsigned char)*s)) {
+ lua_Number n = 0;
+ do {
+ int digit = (isdigit((unsigned char)*s)) ? *s - '0'
+ : toupper((unsigned char)*s) - 'A' + 10;
+ if (digit >= base) break; /* invalid numeral; force a fail */
+ n = n * (lua_Number)base + (lua_Number)digit;
+ s++;
+ } while (isalnum((unsigned char)*s));
+ s += strspn(s, SPACECHARS); /* skip trailing spaces */
+ if (s == e) { /* no invalid trailing characters? */
+ lua_pushnumber(L, (neg) ? -n : n);
+ return 1;
+ } /* else not a number */
+ } /* else not a number */
+ }
+ lua_pushnil(L); /* not a number */
+ return 1;
+}
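+
+/*
+** Editor's note (illustrative): with an explicit base the loop above
+** converts digit by digit, so tonumber("ff", 16) yields 15*16+15 == 255,
+** while tonumber("19", 8) stops at '9' (digit >= base) and returns nil
+** because of the trailing character.
+*/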
+
+
+static int luaB_error (lua_State *L) {
+ int level = luaL_optint(L, 2, 1);
+ lua_settop(L, 1);
+ if (lua_isstring(L, 1) && level > 0) { /* add extra information? */
+ luaL_where(L, level);
+ lua_pushvalue(L, 1);
+ lua_concat(L, 2);
+ }
+ return lua_error(L);
+}
+
+
+static int luaB_getmetatable (lua_State *L) {
+ luaL_checkany(L, 1);
+ if (!lua_getmetatable(L, 1)) {
+ lua_pushnil(L);
+ return 1; /* no metatable */
+ }
+ luaL_getmetafield(L, 1, "__metatable");
+ return 1; /* returns either __metatable field (if present) or metatable */
+}
+
+
+static int luaB_setmetatable (lua_State *L) {
+ int t = lua_type(L, 2);
+ luaL_checktype(L, 1, LUA_TTABLE);
+ luaL_argcheck(L, t == LUA_TNIL || t == LUA_TTABLE, 2,
+ "nil or table expected");
+ if (luaL_getmetafield(L, 1, "__metatable"))
+ return luaL_error(L, "cannot change a protected metatable");
+ lua_settop(L, 2);
+ lua_setmetatable(L, 1);
+ return 1;
+}
+
+
+static int luaB_rawequal (lua_State *L) {
+ luaL_checkany(L, 1);
+ luaL_checkany(L, 2);
+ lua_pushboolean(L, lua_rawequal(L, 1, 2));
+ return 1;
+}
+
+
+static int luaB_rawlen (lua_State *L) {
+ int t = lua_type(L, 1);
+ luaL_argcheck(L, t == LUA_TTABLE || t == LUA_TSTRING, 1,
+ "table or string expected");
+ lua_pushinteger(L, lua_rawlen(L, 1));
+ return 1;
+}
+
+
+static int luaB_rawget (lua_State *L) {
+ luaL_checktype(L, 1, LUA_TTABLE);
+ luaL_checkany(L, 2);
+ lua_settop(L, 2);
+ lua_rawget(L, 1);
+ return 1;
+}
+
+static int luaB_rawset (lua_State *L) {
+ luaL_checktype(L, 1, LUA_TTABLE);
+ luaL_checkany(L, 2);
+ luaL_checkany(L, 3);
+ lua_settop(L, 3);
+ lua_rawset(L, 1);
+ return 1;
+}
+
+
+static int luaB_collectgarbage (lua_State *L) {
+ static const char *const opts[] = {"stop", "restart", "collect",
+ "count", "step", "setpause", "setstepmul",
+ "setmajorinc", "isrunning", "generational", "incremental", NULL};
+ static const int optsnum[] = {LUA_GCSTOP, LUA_GCRESTART, LUA_GCCOLLECT,
+ LUA_GCCOUNT, LUA_GCSTEP, LUA_GCSETPAUSE, LUA_GCSETSTEPMUL,
+ LUA_GCSETMAJORINC, LUA_GCISRUNNING, LUA_GCGEN, LUA_GCINC};
+ int o = optsnum[luaL_checkoption(L, 1, "collect", opts)];
+ int ex = luaL_optint(L, 2, 0);
+ int res = lua_gc(L, o, ex);
+ switch (o) {
+ case LUA_GCCOUNT: {
+ int b = lua_gc(L, LUA_GCCOUNTB, 0);
+ lua_pushnumber(L, res + ((lua_Number)b/1024));
+ lua_pushinteger(L, b);
+ return 2;
+ }
+ case LUA_GCSTEP: case LUA_GCISRUNNING: {
+ lua_pushboolean(L, res);
+ return 1;
+ }
+ default: {
+ lua_pushinteger(L, res);
+ return 1;
+ }
+ }
+}
+
+
+static int luaB_type (lua_State *L) {
+ luaL_checkany(L, 1);
+ lua_pushstring(L, luaL_typename(L, 1));
+ return 1;
+}
+
+
+static int pairsmeta (lua_State *L, const char *method, int iszero,
+ lua_CFunction iter) {
+ if (!luaL_getmetafield(L, 1, method)) { /* no metamethod? */
+ luaL_checktype(L, 1, LUA_TTABLE); /* argument must be a table */
+ lua_pushcfunction(L, iter); /* will return generator, */
+ lua_pushvalue(L, 1); /* state, */
+ if (iszero) lua_pushinteger(L, 0); /* and initial value */
+ else lua_pushnil(L);
+ }
+ else {
+ lua_pushvalue(L, 1); /* argument 'self' to metamethod */
+ lua_call(L, 1, 3); /* get 3 values from metamethod */
+ }
+ return 3;
+}
+
+
+static int luaB_next (lua_State *L) {
+ luaL_checktype(L, 1, LUA_TTABLE);
+ lua_settop(L, 2); /* create a 2nd argument if there isn't one */
+ if (lua_next(L, 1))
+ return 2;
+ else {
+ lua_pushnil(L);
+ return 1;
+ }
+}
+
+
+static int luaB_pairs (lua_State *L) {
+ return pairsmeta(L, "__pairs", 0, luaB_next);
+}
+
+
+static int ipairsaux (lua_State *L) {
+ int i = luaL_checkint(L, 2);
+ luaL_checktype(L, 1, LUA_TTABLE);
+ i++; /* next value */
+ lua_pushinteger(L, i);
+ lua_rawgeti(L, 1, i);
+ return (lua_isnil(L, -1)) ? 1 : 2;
+}
+
+
+static int luaB_ipairs (lua_State *L) {
+ return pairsmeta(L, "__ipairs", 1, ipairsaux);
+}
+
+
+static int luaB_assert (lua_State *L) {
+ if (!lua_toboolean(L, 1))
+ return luaL_error(L, "%s", luaL_optstring(L, 2, "assertion failed!"));
+ return lua_gettop(L);
+}
+
+
+static int luaB_select (lua_State *L) {
+ int n = lua_gettop(L);
+ if (lua_type(L, 1) == LUA_TSTRING && *lua_tostring(L, 1) == '#') {
+ lua_pushinteger(L, n-1);
+ return 1;
+ }
+ else {
+ int i = luaL_checkint(L, 1);
+ if (i < 0) i = n + i;
+ else if (i > n) i = n;
+ luaL_argcheck(L, 1 <= i, 1, "index out of range");
+ return n - i;
+ }
+}
+
+static int luaB_tostring (lua_State *L) {
+ luaL_checkany(L, 1);
+ luaL_tolstring(L, 1, NULL);
+ return 1;
+}
+
+static const luaL_Reg base_funcs[] = {
+ {"assert", luaB_assert},
+ {"collectgarbage", luaB_collectgarbage},
+ {"error", luaB_error},
+ {"getmetatable", luaB_getmetatable},
+ {"ipairs", luaB_ipairs},
+ {"next", luaB_next},
+ {"pairs", luaB_pairs},
+ {"rawequal", luaB_rawequal},
+ {"rawlen", luaB_rawlen},
+ {"rawget", luaB_rawget},
+ {"rawset", luaB_rawset},
+ {"select", luaB_select},
+ {"setmetatable", luaB_setmetatable},
+ {"tonumber", luaB_tonumber},
+ {"tostring", luaB_tostring},
+ {"type", luaB_type},
+ {NULL, NULL}
+};
+
+
+LUAMOD_API int luaopen_base (lua_State *L) {
+ /* set global _G */
+ lua_pushglobaltable(L);
+ lua_pushglobaltable(L);
+ lua_setfield(L, -2, "_G");
+ /* open lib into global table */
+ luaL_setfuncs(L, base_funcs, 0);
+ lua_pushliteral(L, LUA_VERSION);
+ lua_setfield(L, -2, "_VERSION"); /* set global _VERSION */
+ return 1;
+}
+
diff --git a/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/lua/lbitlib.c b/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/lua/lbitlib.c
new file mode 100644
index 000000000000..31c7b66f1290
--- /dev/null
+++ b/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/lua/lbitlib.c
@@ -0,0 +1,212 @@
+/*
+** $Id: lbitlib.c,v 1.18.1.2 2013/07/09 18:01:41 roberto Exp $
+** Standard library for bitwise operations
+** See Copyright Notice in lua.h
+*/
+
+#define lbitlib_c
+#define LUA_LIB
+
+#include "lua.h"
+
+#include "lauxlib.h"
+#include "lualib.h"
+
+
+/* number of bits to consider in a number */
+#if !defined(LUA_NBITS)
+#define LUA_NBITS 32
+#endif
+
+
+#define ALLONES (~(((~(lua_Unsigned)0) << (LUA_NBITS - 1)) << 1))
+
+/* macro to trim extra bits */
+#define trim(x) ((x) & ALLONES)
+
+
+/* builds a number with 'n' ones (1 <= n <= LUA_NBITS) */
+#define mask(n) (~((ALLONES << 1) << ((n) - 1)))
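+
+/*
+** Editor's note (illustrative): with LUA_NBITS == 32, ALLONES is
+** 0xffffffff, trim() clears everything above bit 31, mask(1) == 0x1,
+** mask(8) == 0xff and mask(32) == ALLONES; the two-step shifts avoid an
+** undefined shift by 32 on a 32-bit lua_Unsigned.
+*/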
+
+
+typedef lua_Unsigned b_uint;
+
+
+
+static b_uint andaux (lua_State *L) {
+ int i, n = lua_gettop(L);
+ b_uint r = ~(b_uint)0;
+ for (i = 1; i <= n; i++)
+ r &= luaL_checkunsigned(L, i);
+ return trim(r);
+}
+
+
+static int b_and (lua_State *L) {
+ b_uint r = andaux(L);
+ lua_pushunsigned(L, r);
+ return 1;
+}
+
+
+static int b_test (lua_State *L) {
+ b_uint r = andaux(L);
+ lua_pushboolean(L, r != 0);
+ return 1;
+}
+
+
+static int b_or (lua_State *L) {
+ int i, n = lua_gettop(L);
+ b_uint r = 0;
+ for (i = 1; i <= n; i++)
+ r |= luaL_checkunsigned(L, i);
+ lua_pushunsigned(L, trim(r));
+ return 1;
+}
+
+
+static int b_xor (lua_State *L) {
+ int i, n = lua_gettop(L);
+ b_uint r = 0;
+ for (i = 1; i <= n; i++)
+ r ^= luaL_checkunsigned(L, i);
+ lua_pushunsigned(L, trim(r));
+ return 1;
+}
+
+
+static int b_not (lua_State *L) {
+ b_uint r = ~luaL_checkunsigned(L, 1);
+ lua_pushunsigned(L, trim(r));
+ return 1;
+}
+
+
+static int b_shift (lua_State *L, b_uint r, int i) {
+ if (i < 0) { /* shift right? */
+ i = -i;
+ r = trim(r);
+ if (i >= LUA_NBITS) r = 0;
+ else r >>= i;
+ }
+ else { /* shift left */
+ if (i >= LUA_NBITS) r = 0;
+ else r <<= i;
+ r = trim(r);
+ }
+ lua_pushunsigned(L, r);
+ return 1;
+}
+
+
+static int b_lshift (lua_State *L) {
+ return b_shift(L, luaL_checkunsigned(L, 1), luaL_checkint(L, 2));
+}
+
+
+static int b_rshift (lua_State *L) {
+ return b_shift(L, luaL_checkunsigned(L, 1), -luaL_checkint(L, 2));
+}
+
+
+static int b_arshift (lua_State *L) {
+ b_uint r = luaL_checkunsigned(L, 1);
+ int i = luaL_checkint(L, 2);
+ if (i < 0 || !(r & ((b_uint)1 << (LUA_NBITS - 1))))
+ return b_shift(L, r, -i);
+ else { /* arithmetic shift for 'negative' number */
+ if (i >= LUA_NBITS) r = ALLONES;
+ else
+      r = trim((r >> i) | ~(~(b_uint)0 >> i)); /* add sign bit */
+ lua_pushunsigned(L, r);
+ return 1;
+ }
+}
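+
+/*
+** Editor's note (illustrative): with a 32-bit lua_Unsigned, arshift of
+** r == 0x80000000 by i == 4 takes the branch above and yields
+** (r >> 4) | ~(~0 >> 4) == 0xf8000000, i.e. the vacated high bits are
+** filled with copies of the sign bit, as in a two's-complement shift.
+*/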
+
+
+static int b_rot (lua_State *L, int i) {
+ b_uint r = luaL_checkunsigned(L, 1);
+ i &= (LUA_NBITS - 1); /* i = i % NBITS */
+ r = trim(r);
+ if (i != 0) /* avoid undefined shift of LUA_NBITS when i == 0 */
+ r = (r << i) | (r >> (LUA_NBITS - i));
+ lua_pushunsigned(L, trim(r));
+ return 1;
+}
+
+
+static int b_lrot (lua_State *L) {
+ return b_rot(L, luaL_checkint(L, 2));
+}
+
+
+static int b_rrot (lua_State *L) {
+ return b_rot(L, -luaL_checkint(L, 2));
+}
+
+
+/*
+** get field and width arguments for field-manipulation functions,
+** checking whether they are valid.
+** ('luaL_error' called without 'return' to avoid later warnings about
+** 'width' being used uninitialized.)
+*/
+static int fieldargs (lua_State *L, int farg, int *width) {
+ int f = luaL_checkint(L, farg);
+ int w = luaL_optint(L, farg + 1, 1);
+ luaL_argcheck(L, 0 <= f, farg, "field cannot be negative");
+ luaL_argcheck(L, 0 < w, farg + 1, "width must be positive");
+ if (f + w > LUA_NBITS)
+ luaL_error(L, "trying to access non-existent bits");
+ *width = w;
+ return f;
+}
+
+
+static int b_extract (lua_State *L) {
+ int w;
+ b_uint r = luaL_checkunsigned(L, 1);
+ int f = fieldargs(L, 2, &w);
+ r = (r >> f) & mask(w);
+ lua_pushunsigned(L, r);
+ return 1;
+}
+
+
+static int b_replace (lua_State *L) {
+ int w;
+ b_uint r = luaL_checkunsigned(L, 1);
+ b_uint v = luaL_checkunsigned(L, 2);
+ int f = fieldargs(L, 3, &w);
+ int m = mask(w);
+ v &= m; /* erase bits outside given width */
+ r = (r & ~(m << f)) | (v << f);
+ lua_pushunsigned(L, r);
+ return 1;
+}
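+
+/*
+** Editor's note (illustrative): bit32.extract(0xabcd, 4, 8) shifts right
+** by 4 and masks 8 bits, giving 0xbc; bit32.replace(0xabcd, 0x5, 4, 8)
+** clears that field and inserts the new value, giving 0xa05d.
+*/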
+
+
+static const luaL_Reg bitlib[] = {
+ {"arshift", b_arshift},
+ {"band", b_and},
+ {"bnot", b_not},
+ {"bor", b_or},
+ {"bxor", b_xor},
+ {"btest", b_test},
+ {"extract", b_extract},
+ {"lrotate", b_lrot},
+ {"lshift", b_lshift},
+ {"replace", b_replace},
+ {"rrotate", b_rrot},
+ {"rshift", b_rshift},
+ {NULL, NULL}
+};
+
+
+
+LUAMOD_API int luaopen_bit32 (lua_State *L) {
+ luaL_newlib(L, bitlib);
+ return 1;
+}
+
diff --git a/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/lua/lcode.c b/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/lua/lcode.c
new file mode 100644
index 000000000000..f155014d12c4
--- /dev/null
+++ b/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/lua/lcode.c
@@ -0,0 +1,885 @@
+/*
+** $Id: lcode.c,v 2.62.1.1 2013/04/12 18:48:47 roberto Exp $
+** Code generator for Lua
+** See Copyright Notice in lua.h
+*/
+
+#include <sys/zfs_context.h>
+
+#define lcode_c
+#define LUA_CORE
+
+#include "lua.h"
+
+#include "lcode.h"
+#include "ldebug.h"
+#include "ldo.h"
+#include "lgc.h"
+#include "llex.h"
+#include "lmem.h"
+#include "lobject.h"
+#include "lopcodes.h"
+#include "lparser.h"
+#include "lstring.h"
+#include "ltable.h"
+#include "lvm.h"
+
+
+#define hasjumps(e) ((e)->t != (e)->f)
+
+
+static int isnumeral(expdesc *e) {
+ return (e->k == VKNUM && e->t == NO_JUMP && e->f == NO_JUMP);
+}
+
+
+void luaK_nil (FuncState *fs, int from, int n) {
+ Instruction *previous;
+ int l = from + n - 1; /* last register to set nil */
+ if (fs->pc > fs->lasttarget) { /* no jumps to current position? */
+ previous = &fs->f->code[fs->pc-1];
+ if (GET_OPCODE(*previous) == OP_LOADNIL) {
+ int pfrom = GETARG_A(*previous);
+ int pl = pfrom + GETARG_B(*previous);
+ if ((pfrom <= from && from <= pl + 1) ||
+ (from <= pfrom && pfrom <= l + 1)) { /* can connect both? */
+ if (pfrom < from) from = pfrom; /* from = min(from, pfrom) */
+ if (pl > l) l = pl; /* l = max(l, pl) */
+ SETARG_A(*previous, from);
+ SETARG_B(*previous, l - from);
+ return;
+ }
+ } /* else go through */
+ }
+ luaK_codeABC(fs, OP_LOADNIL, from, n - 1, 0); /* else no optimization */
+}
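+
+/*
+** Editor's note (illustrative): the peephole above merges adjacent nil
+** loads; e.g. "local a,b" emits LOADNIL 0 1, and a following "local c"
+** extends it to LOADNIL 0 2 instead of emitting a second instruction,
+** provided no jump targets the position in between.
+*/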
+
+
+int luaK_jump (FuncState *fs) {
+ int jpc = fs->jpc; /* save list of jumps to here */
+ int j;
+ fs->jpc = NO_JUMP;
+ j = luaK_codeAsBx(fs, OP_JMP, 0, NO_JUMP);
+ luaK_concat(fs, &j, jpc); /* keep them on hold */
+ return j;
+}
+
+
+void luaK_ret (FuncState *fs, int first, int nret) {
+ luaK_codeABC(fs, OP_RETURN, first, nret+1, 0);
+}
+
+
+static int condjump (FuncState *fs, OpCode op, int A, int B, int C) {
+ luaK_codeABC(fs, op, A, B, C);
+ return luaK_jump(fs);
+}
+
+
+static void fixjump (FuncState *fs, int pc, int dest) {
+ Instruction *jmp = &fs->f->code[pc];
+ int offset = dest-(pc+1);
+ lua_assert(dest != NO_JUMP);
+ if (abs(offset) > MAXARG_sBx)
+ luaX_syntaxerror(fs->ls, "control structure too long");
+ SETARG_sBx(*jmp, offset);
+}
+
+
+/*
+** returns current `pc' and marks it as a jump target (to avoid wrong
+** optimizations with consecutive instructions not in the same basic block).
+*/
+int luaK_getlabel (FuncState *fs) {
+ fs->lasttarget = fs->pc;
+ return fs->pc;
+}
+
+
+static int getjump (FuncState *fs, int pc) {
+ int offset = GETARG_sBx(fs->f->code[pc]);
+ if (offset == NO_JUMP) /* point to itself represents end of list */
+ return NO_JUMP; /* end of list */
+ else
+ return (pc+1)+offset; /* turn offset into absolute position */
+}
+
+
+static Instruction *getjumpcontrol (FuncState *fs, int pc) {
+ Instruction *pi = &fs->f->code[pc];
+ if (pc >= 1 && testTMode(GET_OPCODE(*(pi-1))))
+ return pi-1;
+ else
+ return pi;
+}
+
+
+/*
+** check whether list has any jumps that do not produce a value
+** (or that produce an inverted value)
+*/
+static int need_value (FuncState *fs, int list) {
+ for (; list != NO_JUMP; list = getjump(fs, list)) {
+ Instruction i = *getjumpcontrol(fs, list);
+ if (GET_OPCODE(i) != OP_TESTSET) return 1;
+ }
+ return 0; /* not found */
+}
+
+
+static int patchtestreg (FuncState *fs, int node, int reg) {
+ Instruction *i = getjumpcontrol(fs, node);
+ if (GET_OPCODE(*i) != OP_TESTSET)
+ return 0; /* cannot patch other instructions */
+ if (reg != NO_REG && reg != GETARG_B(*i))
+ SETARG_A(*i, reg);
+ else /* no register to put value or register already has the value */
+ *i = CREATE_ABC(OP_TEST, GETARG_B(*i), 0, GETARG_C(*i));
+
+ return 1;
+}
+
+
+static void removevalues (FuncState *fs, int list) {
+ for (; list != NO_JUMP; list = getjump(fs, list))
+ patchtestreg(fs, list, NO_REG);
+}
+
+
+static void patchlistaux (FuncState *fs, int list, int vtarget, int reg,
+ int dtarget) {
+ while (list != NO_JUMP) {
+ int next = getjump(fs, list);
+ if (patchtestreg(fs, list, reg))
+ fixjump(fs, list, vtarget);
+ else
+ fixjump(fs, list, dtarget); /* jump to default target */
+ list = next;
+ }
+}
+
+
+static void dischargejpc (FuncState *fs) {
+ patchlistaux(fs, fs->jpc, fs->pc, NO_REG, fs->pc);
+ fs->jpc = NO_JUMP;
+}
+
+
+void luaK_patchlist (FuncState *fs, int list, int target) {
+ if (target == fs->pc)
+ luaK_patchtohere(fs, list);
+ else {
+ lua_assert(target < fs->pc);
+ patchlistaux(fs, list, target, NO_REG, target);
+ }
+}
+
+
+LUAI_FUNC void luaK_patchclose (FuncState *fs, int list, int level) {
+ level++; /* argument is +1 to reserve 0 as non-op */
+ while (list != NO_JUMP) {
+ int next = getjump(fs, list);
+ lua_assert(GET_OPCODE(fs->f->code[list]) == OP_JMP &&
+ (GETARG_A(fs->f->code[list]) == 0 ||
+ GETARG_A(fs->f->code[list]) >= level));
+ SETARG_A(fs->f->code[list], level);
+ list = next;
+ }
+}
+
+
+void luaK_patchtohere (FuncState *fs, int list) {
+ luaK_getlabel(fs);
+ luaK_concat(fs, &fs->jpc, list);
+}
+
+
+void luaK_concat (FuncState *fs, int *l1, int l2) {
+ if (l2 == NO_JUMP) return;
+ else if (*l1 == NO_JUMP)
+ *l1 = l2;
+ else {
+ int list = *l1;
+ int next;
+ while ((next = getjump(fs, list)) != NO_JUMP) /* find last element */
+ list = next;
+ fixjump(fs, list, l2);
+ }
+}
+
+
+static int luaK_code (FuncState *fs, Instruction i) {
+ Proto *f = fs->f;
+ dischargejpc(fs); /* `pc' will change */
+ /* put new instruction in code array */
+ luaM_growvector(fs->ls->L, f->code, fs->pc, f->sizecode, Instruction,
+ MAX_INT, "opcodes");
+ f->code[fs->pc] = i;
+ /* save corresponding line information */
+ luaM_growvector(fs->ls->L, f->lineinfo, fs->pc, f->sizelineinfo, int,
+ MAX_INT, "opcodes");
+ f->lineinfo[fs->pc] = fs->ls->lastline;
+ return fs->pc++;
+}
+
+
+int luaK_codeABC (FuncState *fs, OpCode o, int a, int b, int c) {
+ lua_assert(getOpMode(o) == iABC);
+ lua_assert(getBMode(o) != OpArgN || b == 0);
+ lua_assert(getCMode(o) != OpArgN || c == 0);
+ lua_assert(a <= MAXARG_A && b <= MAXARG_B && c <= MAXARG_C);
+ return luaK_code(fs, CREATE_ABC(o, a, b, c));
+}
+
+
+int luaK_codeABx (FuncState *fs, OpCode o, int a, unsigned int bc) {
+ lua_assert(getOpMode(o) == iABx || getOpMode(o) == iAsBx);
+ lua_assert(getCMode(o) == OpArgN);
+ lua_assert(a <= MAXARG_A && bc <= MAXARG_Bx);
+ return luaK_code(fs, CREATE_ABx(o, a, bc));
+}
+
+
+static int codeextraarg (FuncState *fs, int a) {
+ lua_assert(a <= MAXARG_Ax);
+ return luaK_code(fs, CREATE_Ax(OP_EXTRAARG, a));
+}
+
+
+int luaK_codek (FuncState *fs, int reg, int k) {
+ if (k <= MAXARG_Bx)
+ return luaK_codeABx(fs, OP_LOADK, reg, k);
+ else {
+ int p = luaK_codeABx(fs, OP_LOADKX, reg, 0);
+ codeextraarg(fs, k);
+ return p;
+ }
+}
+
+
+void luaK_checkstack (FuncState *fs, int n) {
+ int newstack = fs->freereg + n;
+ if (newstack > fs->f->maxstacksize) {
+ if (newstack >= MAXSTACK)
+ luaX_syntaxerror(fs->ls, "function or expression too complex");
+ fs->f->maxstacksize = cast_byte(newstack);
+ }
+}
+
+
+void luaK_reserveregs (FuncState *fs, int n) {
+ luaK_checkstack(fs, n);
+ fs->freereg += n;
+}
+
+
+static void freereg (FuncState *fs, int reg) {
+ if (!ISK(reg) && reg >= fs->nactvar) {
+ fs->freereg--;
+ lua_assert(reg == fs->freereg);
+ }
+}
+
+
+static void freeexp (FuncState *fs, expdesc *e) {
+ if (e->k == VNONRELOC)
+ freereg(fs, e->u.info);
+}
+
+
+static int addk (FuncState *fs, TValue *key, TValue *v) {
+ lua_State *L = fs->ls->L;
+ TValue *idx = luaH_set(L, fs->h, key);
+ Proto *f = fs->f;
+ int k, oldsize;
+ if (ttisnumber(idx)) {
+ lua_Number n = nvalue(idx);
+ lua_number2int(k, n);
+ if (luaV_rawequalobj(&f->k[k], v))
+ return k;
+ /* else may be a collision (e.g., between 0.0 and "\0\0\0\0\0\0\0\0");
+ go through and create a new entry for this value */
+ }
+ /* constant not found; create a new entry */
+ oldsize = f->sizek;
+ k = fs->nk;
+ /* numerical value does not need GC barrier;
+ table has no metatable, so it does not need to invalidate cache */
+ setnvalue(idx, cast_num(k));
+ luaM_growvector(L, f->k, k, f->sizek, TValue, MAXARG_Ax, "constants");
+ while (oldsize < f->sizek) setnilvalue(&f->k[oldsize++]);
+ setobj(L, &f->k[k], v);
+ fs->nk++;
+ luaC_barrier(L, f, v);
+ return k;
+}
+
+
+int luaK_stringK (FuncState *fs, TString *s) {
+ TValue o;
+ setsvalue(fs->ls->L, &o, s);
+ return addk(fs, &o, &o);
+}
+
+
+int luaK_numberK (FuncState *fs, lua_Number r) {
+ int n;
+ lua_State *L = fs->ls->L;
+ TValue o;
+ setnvalue(&o, r);
+ if (r == 0 || luai_numisnan(NULL, r)) { /* handle -0 and NaN */
+ /* use raw representation as key to avoid numeric problems */
+ setsvalue(L, L->top++, luaS_newlstr(L, (char *)&r, sizeof(r)));
+ n = addk(fs, L->top - 1, &o);
+ L->top--;
+ }
+ else
+ n = addk(fs, &o, &o); /* regular case */
+ return n;
+}
+
+
+static int boolK (FuncState *fs, int b) {
+ TValue o;
+ setbvalue(&o, b);
+ return addk(fs, &o, &o);
+}
+
+
+static int nilK (FuncState *fs) {
+ TValue k, v;
+ setnilvalue(&v);
+ /* cannot use nil as key; instead use table itself to represent nil */
+ sethvalue(fs->ls->L, &k, fs->h);
+ return addk(fs, &k, &v);
+}
+
+
+void luaK_setreturns (FuncState *fs, expdesc *e, int nresults) {
+ if (e->k == VCALL) { /* expression is an open function call? */
+ SETARG_C(getcode(fs, e), nresults+1);
+ }
+ else if (e->k == VVARARG) {
+ SETARG_B(getcode(fs, e), nresults+1);
+ SETARG_A(getcode(fs, e), fs->freereg);
+ luaK_reserveregs(fs, 1);
+ }
+}
+
+
+void luaK_setoneret (FuncState *fs, expdesc *e) {
+ if (e->k == VCALL) { /* expression is an open function call? */
+ e->k = VNONRELOC;
+ e->u.info = GETARG_A(getcode(fs, e));
+ }
+ else if (e->k == VVARARG) {
+ SETARG_B(getcode(fs, e), 2);
+ e->k = VRELOCABLE; /* can relocate its simple result */
+ }
+}
+
+
+void luaK_dischargevars (FuncState *fs, expdesc *e) {
+ switch (e->k) {
+ case VLOCAL: {
+ e->k = VNONRELOC;
+ break;
+ }
+ case VUPVAL: {
+ e->u.info = luaK_codeABC(fs, OP_GETUPVAL, 0, e->u.info, 0);
+ e->k = VRELOCABLE;
+ break;
+ }
+ case VINDEXED: {
+ OpCode op = OP_GETTABUP; /* assume 't' is in an upvalue */
+ freereg(fs, e->u.ind.idx);
+ if (e->u.ind.vt == VLOCAL) { /* 't' is in a register? */
+ freereg(fs, e->u.ind.t);
+ op = OP_GETTABLE;
+ }
+ e->u.info = luaK_codeABC(fs, op, 0, e->u.ind.t, e->u.ind.idx);
+ e->k = VRELOCABLE;
+ break;
+ }
+ case VVARARG:
+ case VCALL: {
+ luaK_setoneret(fs, e);
+ break;
+ }
+ default: break; /* there is one value available (somewhere) */
+ }
+}
+
+
+static int code_label (FuncState *fs, int A, int b, int jump) {
+ luaK_getlabel(fs); /* those instructions may be jump targets */
+ return luaK_codeABC(fs, OP_LOADBOOL, A, b, jump);
+}
+
+
+static void discharge2reg (FuncState *fs, expdesc *e, int reg) {
+ luaK_dischargevars(fs, e);
+ switch (e->k) {
+ case VNIL: {
+ luaK_nil(fs, reg, 1);
+ break;
+ }
+ case VFALSE: case VTRUE: {
+ luaK_codeABC(fs, OP_LOADBOOL, reg, e->k == VTRUE, 0);
+ break;
+ }
+ case VK: {
+ luaK_codek(fs, reg, e->u.info);
+ break;
+ }
+ case VKNUM: {
+ luaK_codek(fs, reg, luaK_numberK(fs, e->u.nval));
+ break;
+ }
+ case VRELOCABLE: {
+ Instruction *pc = &getcode(fs, e);
+ SETARG_A(*pc, reg);
+ break;
+ }
+ case VNONRELOC: {
+ if (reg != e->u.info)
+ luaK_codeABC(fs, OP_MOVE, reg, e->u.info, 0);
+ break;
+ }
+ default: {
+ lua_assert(e->k == VVOID || e->k == VJMP);
+ return; /* nothing to do... */
+ }
+ }
+ e->u.info = reg;
+ e->k = VNONRELOC;
+}
+
+
+static void discharge2anyreg (FuncState *fs, expdesc *e) {
+ if (e->k != VNONRELOC) {
+ luaK_reserveregs(fs, 1);
+ discharge2reg(fs, e, fs->freereg-1);
+ }
+}
+
+
+static void exp2reg (FuncState *fs, expdesc *e, int reg) {
+ discharge2reg(fs, e, reg);
+ if (e->k == VJMP)
+ luaK_concat(fs, &e->t, e->u.info); /* put this jump in `t' list */
+ if (hasjumps(e)) {
+ int final; /* position after whole expression */
+ int p_f = NO_JUMP; /* position of an eventual LOAD false */
+ int p_t = NO_JUMP; /* position of an eventual LOAD true */
+ if (need_value(fs, e->t) || need_value(fs, e->f)) {
+ int fj = (e->k == VJMP) ? NO_JUMP : luaK_jump(fs);
+ p_f = code_label(fs, reg, 0, 1);
+ p_t = code_label(fs, reg, 1, 0);
+ luaK_patchtohere(fs, fj);
+ }
+ final = luaK_getlabel(fs);
+ patchlistaux(fs, e->f, final, reg, p_f);
+ patchlistaux(fs, e->t, final, reg, p_t);
+ }
+ e->f = e->t = NO_JUMP;
+ e->u.info = reg;
+ e->k = VNONRELOC;
+}
+
+
+void luaK_exp2nextreg (FuncState *fs, expdesc *e) {
+ luaK_dischargevars(fs, e);
+ freeexp(fs, e);
+ luaK_reserveregs(fs, 1);
+ exp2reg(fs, e, fs->freereg - 1);
+}
+
+
+int luaK_exp2anyreg (FuncState *fs, expdesc *e) {
+ luaK_dischargevars(fs, e);
+ if (e->k == VNONRELOC) {
+ if (!hasjumps(e)) return e->u.info; /* exp is already in a register */
+ if (e->u.info >= fs->nactvar) { /* reg. is not a local? */
+ exp2reg(fs, e, e->u.info); /* put value on it */
+ return e->u.info;
+ }
+ }
+ luaK_exp2nextreg(fs, e); /* default */
+ return e->u.info;
+}
+
+
+void luaK_exp2anyregup (FuncState *fs, expdesc *e) {
+ if (e->k != VUPVAL || hasjumps(e))
+ luaK_exp2anyreg(fs, e);
+}
+
+
+void luaK_exp2val (FuncState *fs, expdesc *e) {
+ if (hasjumps(e))
+ luaK_exp2anyreg(fs, e);
+ else
+ luaK_dischargevars(fs, e);
+}
+
+
+int luaK_exp2RK (FuncState *fs, expdesc *e) {
+ luaK_exp2val(fs, e);
+ switch (e->k) {
+ case VTRUE:
+ case VFALSE:
+ case VNIL: {
+ if (fs->nk <= MAXINDEXRK) { /* constant fits in RK operand? */
+ e->u.info = (e->k == VNIL) ? nilK(fs) : boolK(fs, (e->k == VTRUE));
+ e->k = VK;
+ return RKASK(e->u.info);
+ }
+ else break;
+ }
+ case VKNUM: {
+ e->u.info = luaK_numberK(fs, e->u.nval);
+ e->k = VK;
+ /* go through */
+ }
+ case VK: {
+ if (e->u.info <= MAXINDEXRK) /* constant fits in argC? */
+ return RKASK(e->u.info);
+ else break;
+ }
+ default: break;
+ }
+ /* not a constant in the right range: put it in a register */
+ return luaK_exp2anyreg(fs, e);
+}
+
+
+void luaK_storevar (FuncState *fs, expdesc *var, expdesc *ex) {
+ switch (var->k) {
+ case VLOCAL: {
+ freeexp(fs, ex);
+ exp2reg(fs, ex, var->u.info);
+ return;
+ }
+ case VUPVAL: {
+ int e = luaK_exp2anyreg(fs, ex);
+ luaK_codeABC(fs, OP_SETUPVAL, e, var->u.info, 0);
+ break;
+ }
+ case VINDEXED: {
+ OpCode op = (var->u.ind.vt == VLOCAL) ? OP_SETTABLE : OP_SETTABUP;
+ int e = luaK_exp2RK(fs, ex);
+ luaK_codeABC(fs, op, var->u.ind.t, var->u.ind.idx, e);
+ break;
+ }
+ default: {
+ lua_assert(0); /* invalid var kind to store */
+ break;
+ }
+ }
+ freeexp(fs, ex);
+}
+
+
+void luaK_self (FuncState *fs, expdesc *e, expdesc *key) {
+ int ereg;
+ luaK_exp2anyreg(fs, e);
+ ereg = e->u.info; /* register where 'e' was placed */
+ freeexp(fs, e);
+ e->u.info = fs->freereg; /* base register for op_self */
+ e->k = VNONRELOC;
+ luaK_reserveregs(fs, 2); /* function and 'self' produced by op_self */
+ luaK_codeABC(fs, OP_SELF, e->u.info, ereg, luaK_exp2RK(fs, key));
+ freeexp(fs, key);
+}
+
+
+static void invertjump (FuncState *fs, expdesc *e) {
+ Instruction *pc = getjumpcontrol(fs, e->u.info);
+ lua_assert(testTMode(GET_OPCODE(*pc)) && GET_OPCODE(*pc) != OP_TESTSET &&
+ GET_OPCODE(*pc) != OP_TEST);
+ SETARG_A(*pc, !(GETARG_A(*pc)));
+}
+
+
+static int jumponcond (FuncState *fs, expdesc *e, int cond) {
+ if (e->k == VRELOCABLE) {
+ Instruction ie = getcode(fs, e);
+ if (GET_OPCODE(ie) == OP_NOT) {
+ fs->pc--; /* remove previous OP_NOT */
+ return condjump(fs, OP_TEST, GETARG_B(ie), 0, !cond);
+ }
+ /* else go through */
+ }
+ discharge2anyreg(fs, e);
+ freeexp(fs, e);
+ return condjump(fs, OP_TESTSET, NO_REG, e->u.info, cond);
+}
+
+
+void luaK_goiftrue (FuncState *fs, expdesc *e) {
+ int pc; /* pc of last jump */
+ luaK_dischargevars(fs, e);
+ switch (e->k) {
+ case VJMP: {
+ invertjump(fs, e);
+ pc = e->u.info;
+ break;
+ }
+ case VK: case VKNUM: case VTRUE: {
+ pc = NO_JUMP; /* always true; do nothing */
+ break;
+ }
+ default: {
+ pc = jumponcond(fs, e, 0);
+ break;
+ }
+ }
+ luaK_concat(fs, &e->f, pc); /* insert last jump in `f' list */
+ luaK_patchtohere(fs, e->t);
+ e->t = NO_JUMP;
+}
+
+
+void luaK_goiffalse (FuncState *fs, expdesc *e) {
+ int pc; /* pc of last jump */
+ luaK_dischargevars(fs, e);
+ switch (e->k) {
+ case VJMP: {
+ pc = e->u.info;
+ break;
+ }
+ case VNIL: case VFALSE: {
+ pc = NO_JUMP; /* always false; do nothing */
+ break;
+ }
+ default: {
+ pc = jumponcond(fs, e, 1);
+ break;
+ }
+ }
+ luaK_concat(fs, &e->t, pc); /* insert last jump in `t' list */
+ luaK_patchtohere(fs, e->f);
+ e->f = NO_JUMP;
+}
+
+
+static void codenot (FuncState *fs, expdesc *e) {
+ luaK_dischargevars(fs, e);
+ switch (e->k) {
+ case VNIL: case VFALSE: {
+ e->k = VTRUE;
+ break;
+ }
+ case VK: case VKNUM: case VTRUE: {
+ e->k = VFALSE;
+ break;
+ }
+ case VJMP: {
+ invertjump(fs, e);
+ break;
+ }
+ case VRELOCABLE:
+ case VNONRELOC: {
+ discharge2anyreg(fs, e);
+ freeexp(fs, e);
+ e->u.info = luaK_codeABC(fs, OP_NOT, 0, e->u.info, 0);
+ e->k = VRELOCABLE;
+ break;
+ }
+ default: {
+ lua_assert(0); /* cannot happen */
+ break;
+ }
+ }
+ /* interchange true and false lists */
+ { int temp = e->f; e->f = e->t; e->t = temp; }
+ removevalues(fs, e->f);
+ removevalues(fs, e->t);
+}
+
+
+void luaK_indexed (FuncState *fs, expdesc *t, expdesc *k) {
+ lua_assert(!hasjumps(t));
+ t->u.ind.t = t->u.info;
+ t->u.ind.idx = luaK_exp2RK(fs, k);
+ t->u.ind.vt = (t->k == VUPVAL) ? VUPVAL
+ : check_exp(vkisinreg(t->k), VLOCAL);
+ t->k = VINDEXED;
+}
+
+
+static int constfolding (OpCode op, expdesc *e1, expdesc *e2) {
+ lua_Number r;
+ if (!isnumeral(e1) || !isnumeral(e2)) return 0;
+ if ((op == OP_DIV || op == OP_MOD) && e2->u.nval == 0)
+ return 0; /* do not attempt to divide by 0 */
+ /*
+ * Patched: check for MIN_INT / -1
+ */
+ if (op == OP_DIV && e1->u.nval == INT64_MIN && e2->u.nval == -1)
+ return 0;
+ r = luaO_arith(op - OP_ADD + LUA_OPADD, e1->u.nval, e2->u.nval);
+ e1->u.nval = r;
+ return 1;
+}
+
+
+static void codearith (FuncState *fs, OpCode op,
+ expdesc *e1, expdesc *e2, int line) {
+ if (constfolding(op, e1, e2))
+ return;
+ else {
+ int o2 = (op != OP_UNM && op != OP_LEN) ? luaK_exp2RK(fs, e2) : 0;
+ int o1 = luaK_exp2RK(fs, e1);
+ if (o1 > o2) {
+ freeexp(fs, e1);
+ freeexp(fs, e2);
+ }
+ else {
+ freeexp(fs, e2);
+ freeexp(fs, e1);
+ }
+ e1->u.info = luaK_codeABC(fs, op, 0, o1, o2);
+ e1->k = VRELOCABLE;
+ luaK_fixline(fs, line);
+ }
+}
+
+
+static void codecomp (FuncState *fs, OpCode op, int cond, expdesc *e1,
+ expdesc *e2) {
+ int o1 = luaK_exp2RK(fs, e1);
+ int o2 = luaK_exp2RK(fs, e2);
+ freeexp(fs, e2);
+ freeexp(fs, e1);
+ if (cond == 0 && op != OP_EQ) {
+ int temp; /* exchange args to replace by `<' or `<=' */
+ temp = o1; o1 = o2; o2 = temp; /* o1 <==> o2 */
+ cond = 1;
+ }
+ e1->u.info = condjump(fs, op, cond, o1, o2);
+ e1->k = VJMP;
+}
+
+
+void luaK_prefix (FuncState *fs, UnOpr op, expdesc *e, int line) {
+ expdesc e2;
+ e2.t = e2.f = NO_JUMP; e2.k = VKNUM; e2.u.nval = 0;
+ switch (op) {
+ case OPR_MINUS: {
+ if (isnumeral(e)) /* minus constant? */
+ e->u.nval = luai_numunm(NULL, e->u.nval); /* fold it */
+ else {
+ luaK_exp2anyreg(fs, e);
+ codearith(fs, OP_UNM, e, &e2, line);
+ }
+ break;
+ }
+ case OPR_NOT: codenot(fs, e); break;
+ case OPR_LEN: {
+ luaK_exp2anyreg(fs, e); /* cannot operate on constants */
+ codearith(fs, OP_LEN, e, &e2, line);
+ break;
+ }
+ default: lua_assert(0);
+ }
+}
+
+
+void luaK_infix (FuncState *fs, BinOpr op, expdesc *v) {
+ switch (op) {
+ case OPR_AND: {
+ luaK_goiftrue(fs, v);
+ break;
+ }
+ case OPR_OR: {
+ luaK_goiffalse(fs, v);
+ break;
+ }
+ case OPR_CONCAT: {
+ luaK_exp2nextreg(fs, v); /* operand must be on the `stack' */
+ break;
+ }
+ case OPR_ADD: case OPR_SUB: case OPR_MUL: case OPR_DIV:
+ case OPR_MOD: case OPR_POW: {
+ if (!isnumeral(v)) luaK_exp2RK(fs, v);
+ break;
+ }
+ default: {
+ luaK_exp2RK(fs, v);
+ break;
+ }
+ }
+}
+
+
+void luaK_posfix (FuncState *fs, BinOpr op,
+ expdesc *e1, expdesc *e2, int line) {
+ switch (op) {
+ case OPR_AND: {
+ lua_assert(e1->t == NO_JUMP); /* list must be closed */
+ luaK_dischargevars(fs, e2);
+ luaK_concat(fs, &e2->f, e1->f);
+ *e1 = *e2;
+ break;
+ }
+ case OPR_OR: {
+ lua_assert(e1->f == NO_JUMP); /* list must be closed */
+ luaK_dischargevars(fs, e2);
+ luaK_concat(fs, &e2->t, e1->t);
+ *e1 = *e2;
+ break;
+ }
+ case OPR_CONCAT: {
+ luaK_exp2val(fs, e2);
+ if (e2->k == VRELOCABLE && GET_OPCODE(getcode(fs, e2)) == OP_CONCAT) {
+ lua_assert(e1->u.info == GETARG_B(getcode(fs, e2))-1);
+ freeexp(fs, e1);
+ SETARG_B(getcode(fs, e2), e1->u.info);
+ e1->k = VRELOCABLE; e1->u.info = e2->u.info;
+ }
+ else {
+ luaK_exp2nextreg(fs, e2); /* operand must be on the 'stack' */
+ codearith(fs, OP_CONCAT, e1, e2, line);
+ }
+ break;
+ }
+ case OPR_ADD: case OPR_SUB: case OPR_MUL: case OPR_DIV:
+ case OPR_MOD: case OPR_POW: {
+ codearith(fs, cast(OpCode, op - OPR_ADD + OP_ADD), e1, e2, line);
+ break;
+ }
+ case OPR_EQ: case OPR_LT: case OPR_LE: {
+ codecomp(fs, cast(OpCode, op - OPR_EQ + OP_EQ), 1, e1, e2);
+ break;
+ }
+ case OPR_NE: case OPR_GT: case OPR_GE: {
+ codecomp(fs, cast(OpCode, op - OPR_NE + OP_EQ), 0, e1, e2);
+ break;
+ }
+ default: lua_assert(0);
+ }
+}
+
+
+void luaK_fixline (FuncState *fs, int line) {
+ fs->f->lineinfo[fs->pc - 1] = line;
+}
+
+
+void luaK_setlist (FuncState *fs, int base, int nelems, int tostore) {
+ int c = (nelems - 1)/LFIELDS_PER_FLUSH + 1;
+ int b = (tostore == LUA_MULTRET) ? 0 : tostore;
+ lua_assert(tostore != 0);
+ if (c <= MAXARG_C)
+ luaK_codeABC(fs, OP_SETLIST, base, b, c);
+ else if (c <= MAXARG_Ax) {
+ luaK_codeABC(fs, OP_SETLIST, base, b, 0);
+ codeextraarg(fs, c);
+ }
+ else
+ luaX_syntaxerror(fs->ls, "constructor too long");
+ fs->freereg = base + 1; /* free registers with list values */
+}
+
diff --git a/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/lua/lcode.h b/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/lua/lcode.h
new file mode 100644
index 000000000000..6a1424cf5a73
--- /dev/null
+++ b/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/lua/lcode.h
@@ -0,0 +1,83 @@
+/*
+** $Id: lcode.h,v 1.58.1.1 2013/04/12 18:48:47 roberto Exp $
+** Code generator for Lua
+** See Copyright Notice in lua.h
+*/
+
+#ifndef lcode_h
+#define lcode_h
+
+#include "llex.h"
+#include "lobject.h"
+#include "lopcodes.h"
+#include "lparser.h"
+
+
+/*
+** Marks the end of a patch list. It is an invalid value both as an absolute
+** address, and as a list link (would link an element to itself).
+*/
+#define NO_JUMP (-1)
+
+
+/*
+** grep "ORDER OPR" if you change these enums (ORDER OP)
+*/
+typedef enum BinOpr {
+ OPR_ADD, OPR_SUB, OPR_MUL, OPR_DIV, OPR_MOD, OPR_POW,
+ OPR_CONCAT,
+ OPR_EQ, OPR_LT, OPR_LE,
+ OPR_NE, OPR_GT, OPR_GE,
+ OPR_AND, OPR_OR,
+ OPR_NOBINOPR
+} BinOpr;
+
+
+typedef enum UnOpr { OPR_MINUS, OPR_NOT, OPR_LEN, OPR_NOUNOPR } UnOpr;
+
+
+#define getcode(fs,e) ((fs)->f->code[(e)->u.info])
+
+#define luaK_codeAsBx(fs,o,A,sBx) luaK_codeABx(fs,o,A,(sBx)+MAXARG_sBx)
+
+#define luaK_setmultret(fs,e) luaK_setreturns(fs, e, LUA_MULTRET)
+
+#define luaK_jumpto(fs,t) luaK_patchlist(fs, luaK_jump(fs), t)
+
+LUAI_FUNC int luaK_codeABx (FuncState *fs, OpCode o, int A, unsigned int Bx);
+LUAI_FUNC int luaK_codeABC (FuncState *fs, OpCode o, int A, int B, int C);
+LUAI_FUNC int luaK_codek (FuncState *fs, int reg, int k);
+LUAI_FUNC void luaK_fixline (FuncState *fs, int line);
+LUAI_FUNC void luaK_nil (FuncState *fs, int from, int n);
+LUAI_FUNC void luaK_reserveregs (FuncState *fs, int n);
+LUAI_FUNC void luaK_checkstack (FuncState *fs, int n);
+LUAI_FUNC int luaK_stringK (FuncState *fs, TString *s);
+LUAI_FUNC int luaK_numberK (FuncState *fs, lua_Number r);
+LUAI_FUNC void luaK_dischargevars (FuncState *fs, expdesc *e);
+LUAI_FUNC int luaK_exp2anyreg (FuncState *fs, expdesc *e);
+LUAI_FUNC void luaK_exp2anyregup (FuncState *fs, expdesc *e);
+LUAI_FUNC void luaK_exp2nextreg (FuncState *fs, expdesc *e);
+LUAI_FUNC void luaK_exp2val (FuncState *fs, expdesc *e);
+LUAI_FUNC int luaK_exp2RK (FuncState *fs, expdesc *e);
+LUAI_FUNC void luaK_self (FuncState *fs, expdesc *e, expdesc *key);
+LUAI_FUNC void luaK_indexed (FuncState *fs, expdesc *t, expdesc *k);
+LUAI_FUNC void luaK_goiftrue (FuncState *fs, expdesc *e);
+LUAI_FUNC void luaK_goiffalse (FuncState *fs, expdesc *e);
+LUAI_FUNC void luaK_storevar (FuncState *fs, expdesc *var, expdesc *e);
+LUAI_FUNC void luaK_setreturns (FuncState *fs, expdesc *e, int nresults);
+LUAI_FUNC void luaK_setoneret (FuncState *fs, expdesc *e);
+LUAI_FUNC int luaK_jump (FuncState *fs);
+LUAI_FUNC void luaK_ret (FuncState *fs, int first, int nret);
+LUAI_FUNC void luaK_patchlist (FuncState *fs, int list, int target);
+LUAI_FUNC void luaK_patchtohere (FuncState *fs, int list);
+LUAI_FUNC void luaK_patchclose (FuncState *fs, int list, int level);
+LUAI_FUNC void luaK_concat (FuncState *fs, int *l1, int l2);
+LUAI_FUNC int luaK_getlabel (FuncState *fs);
+LUAI_FUNC void luaK_prefix (FuncState *fs, UnOpr op, expdesc *v, int line);
+LUAI_FUNC void luaK_infix (FuncState *fs, BinOpr op, expdesc *v);
+LUAI_FUNC void luaK_posfix (FuncState *fs, BinOpr op, expdesc *v1,
+ expdesc *v2, int line);
+LUAI_FUNC void luaK_setlist (FuncState *fs, int base, int nelems, int tostore);
+
+
+#endif
diff --git a/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/lua/lcompat.c b/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/lua/lcompat.c
new file mode 100644
index 000000000000..55564ddbd9fd
--- /dev/null
+++ b/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/lua/lcompat.c
@@ -0,0 +1,102 @@
+/*
+ * Copyright (c) 2016 by Delphix. All rights reserved.
+ */
+
+#include "lua.h"
+
+#include <sys/zfs_context.h>
+
+ssize_t
+lcompat_sprintf(char *buf, const char *fmt, ...)
+{
+ ssize_t res;
+ va_list args;
+
+ va_start(args, fmt);
+ res = vsnprintf(buf, INT_MAX, fmt, args);
+ va_end(args);
+
+ return (res);
+}
+
+int64_t
+lcompat_strtoll(const char *str, char **ptr)
+{
+ int base;
+ const char *cp;
+ int digits;
+ int64_t value;
+ boolean_t is_negative;
+
+ cp = str;
+ while (*cp == ' ' || *cp == '\t' || *cp == '\n') {
+ cp++;
+ }
+ is_negative = (*cp == '-');
+ if (is_negative) {
+ cp++;
+ }
+ base = 10;
+
+ if (*cp == '0') {
+ base = 8;
+ cp++;
+ if (*cp == 'x' || *cp == 'X') {
+ base = 16;
+ cp++;
+ }
+ }
+
+ value = 0;
+ for (; *cp != '\0'; cp++) {
+ if (*cp >= '0' && *cp <= '9') {
+ digits = *cp - '0';
+ } else if (*cp >= 'a' && *cp <= 'f') {
+ digits = *cp - 'a' + 10;
+ } else if (*cp >= 'A' && *cp <= 'F') {
+ digits = *cp - 'A' + 10;
+ } else {
+ break;
+ }
+ if (digits >= base) {
+ break;
+ }
+ value = (value * base) + digits;
+ }
+
+ if (ptr != NULL) {
+ *ptr = (char *)cp;
+ }
+ if (is_negative) {
+ value = -value;
+ }
+ return (value);
+}
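+
+/*
+ * Editor's note (illustrative): the prefix logic above means
+ * lcompat_strtoll("0x1A", NULL) == 26, "017" parses as octal 15 and
+ * "42" as decimal 42; scanning stops at the first character that is
+ * not a valid digit in the detected base.
+ */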
+
+int64_t
+lcompat_pow(int64_t x, int64_t y)
+{
+ int64_t result = 1;
+ if (y < 0)
+ return (0);
+
+ while (y) {
+ if (y & 1)
+ result *= x;
+ y >>= 1;
+ x *= x;
+ }
+ return (result);
+}
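+
+/*
+ * Editor's note (illustrative): exponentiation by squaring, so
+ * lcompat_pow(3, 5) computes result = 3, x = 9, x = 81,
+ * result = 3 * 81 == 243 in O(log y) multiplications; negative
+ * exponents return 0 since the true result would not be integral.
+ */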
+
+int
+lcompat_hashnum(int64_t x)
+{
+ x = (~x) + (x << 18);
+ x = x ^ (x >> 31);
+ x = x * 21;
+ x = x ^ (x >> 11);
+ x = x + (x << 6);
+ x = x ^ (x >> 22);
+ return ((int)x);
+}
diff --git a/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/lua/lcorolib.c b/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/lua/lcorolib.c
new file mode 100644
index 000000000000..405350bb145b
--- /dev/null
+++ b/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/lua/lcorolib.c
@@ -0,0 +1,154 @@
+/*
+** $Id: lcorolib.c,v 1.5.1.1 2013/04/12 18:48:47 roberto Exp $
+** Coroutine Library
+** See Copyright Notice in lua.h
+*/
+
+
+#include <sys/zfs_context.h>
+
+#define lcorolib_c
+#define LUA_LIB
+
+#include "lua.h"
+
+#include "lauxlib.h"
+#include "lualib.h"
+
+
+static int auxresume (lua_State *L, lua_State *co, int narg) {
+ int status;
+ if (!lua_checkstack(co, narg)) {
+ lua_pushliteral(L, "too many arguments to resume");
+ return -1; /* error flag */
+ }
+ if (lua_status(co) == LUA_OK && lua_gettop(co) == 0) {
+ lua_pushliteral(L, "cannot resume dead coroutine");
+ return -1; /* error flag */
+ }
+ lua_xmove(L, co, narg);
+ status = lua_resume(co, L, narg);
+ if (status == LUA_OK || status == LUA_YIELD) {
+ int nres = lua_gettop(co);
+ if (!lua_checkstack(L, nres + 1)) {
+ lua_pop(co, nres); /* remove results anyway */
+ lua_pushliteral(L, "too many results to resume");
+ return -1; /* error flag */
+ }
+ lua_xmove(co, L, nres); /* move yielded values */
+ return nres;
+ }
+ else {
+ lua_xmove(co, L, 1); /* move error message */
+ return -1; /* error flag */
+ }
+}
+
+
+static int luaB_coresume (lua_State *L) {
+ lua_State *co = lua_tothread(L, 1);
+ int r;
+ luaL_argcheck(L, co, 1, "coroutine expected");
+ r = auxresume(L, co, lua_gettop(L) - 1);
+ if (r < 0) {
+ lua_pushboolean(L, 0);
+ lua_insert(L, -2);
+ return 2; /* return false + error message */
+ }
+ else {
+ lua_pushboolean(L, 1);
+ lua_insert(L, -(r + 1));
+ return r + 1; /* return true + `resume' returns */
+ }
+}
+
+
+static int luaB_auxwrap (lua_State *L) {
+ lua_State *co = lua_tothread(L, lua_upvalueindex(1));
+ int r = auxresume(L, co, lua_gettop(L));
+ if (r < 0) {
+ if (lua_isstring(L, -1)) { /* error object is a string? */
+ luaL_where(L, 1); /* add extra info */
+ lua_insert(L, -2);
+ lua_concat(L, 2);
+ }
+ return lua_error(L); /* propagate error */
+ }
+ return r;
+}
+
+
+static int luaB_cocreate (lua_State *L) {
+ lua_State *NL;
+ luaL_checktype(L, 1, LUA_TFUNCTION);
+ NL = lua_newthread(L);
+ lua_pushvalue(L, 1); /* move function to top */
+ lua_xmove(L, NL, 1); /* move function from L to NL */
+ return 1;
+}
+
+
+static int luaB_cowrap (lua_State *L) {
+ luaB_cocreate(L);
+ lua_pushcclosure(L, luaB_auxwrap, 1);
+ return 1;
+}
+
+
+static int luaB_yield (lua_State *L) {
+ return lua_yield(L, lua_gettop(L));
+}
+
+
+static int luaB_costatus (lua_State *L) {
+ lua_State *co = lua_tothread(L, 1);
+ luaL_argcheck(L, co, 1, "coroutine expected");
+ if (L == co) lua_pushliteral(L, "running");
+ else {
+ switch (lua_status(co)) {
+ case LUA_YIELD:
+ lua_pushliteral(L, "suspended");
+ break;
+ case LUA_OK: {
+ lua_Debug ar;
+ if (lua_getstack(co, 0, &ar) > 0) /* does it have frames? */
+ lua_pushliteral(L, "normal"); /* it is running */
+ else if (lua_gettop(co) == 0)
+ lua_pushliteral(L, "dead");
+ else
+ lua_pushliteral(L, "suspended"); /* initial state */
+ break;
+ }
+ default: /* some error occurred */
+ lua_pushliteral(L, "dead");
+ break;
+ }
+ }
+ return 1;
+}
+
+
+static int luaB_corunning (lua_State *L) {
+ int ismain = lua_pushthread(L);
+ lua_pushboolean(L, ismain);
+ return 2;
+}
+
+
+static const luaL_Reg co_funcs[] = {
+ {"create", luaB_cocreate},
+ {"resume", luaB_coresume},
+ {"running", luaB_corunning},
+ {"status", luaB_costatus},
+ {"wrap", luaB_cowrap},
+ {"yield", luaB_yield},
+ {NULL, NULL}
+};
+
+
+
+LUAMOD_API int luaopen_coroutine (lua_State *L) {
+ luaL_newlib(L, co_funcs);
+ return 1;
+}
+
diff --git a/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/lua/lctype.c b/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/lua/lctype.c
new file mode 100644
index 000000000000..107859811bfc
--- /dev/null
+++ b/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/lua/lctype.c
@@ -0,0 +1,52 @@
+/*
+** $Id: lctype.c,v 1.11.1.1 2013/04/12 18:48:47 roberto Exp $
+** 'ctype' functions for Lua
+** See Copyright Notice in lua.h
+*/
+
+#define lctype_c
+#define LUA_CORE
+
+#include "lctype.h"
+
+#if !LUA_USE_CTYPE /* { */
+
+#include <sys/zfs_context.h>
+
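+/*
+** Each entry encodes the character-class bits defined in lctype.h:
+** 0x01 alpha (letters and '_'), 0x02 digit, 0x04 printable, 0x08 space,
+** 0x10 hex digit; e.g. '0'-'9' are 0x16 (digit|print|xdigit) and
+** 'A'-'F' are 0x15 (alpha|print|xdigit). The first entry is for EOZ
+** (index -1).
+*/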
+LUAI_DDEF const lu_byte luai_ctype_[UCHAR_MAX + 2] = {
+ 0x00, /* EOZ */
+ 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0. */
+ 0x00, 0x08, 0x08, 0x08, 0x08, 0x08, 0x00, 0x00,
+ 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 1. */
+ 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
+ 0x0c, 0x04, 0x04, 0x04, 0x04, 0x04, 0x04, 0x04, /* 2. */
+ 0x04, 0x04, 0x04, 0x04, 0x04, 0x04, 0x04, 0x04,
+ 0x16, 0x16, 0x16, 0x16, 0x16, 0x16, 0x16, 0x16, /* 3. */
+ 0x16, 0x16, 0x04, 0x04, 0x04, 0x04, 0x04, 0x04,
+ 0x04, 0x15, 0x15, 0x15, 0x15, 0x15, 0x15, 0x05, /* 4. */
+ 0x05, 0x05, 0x05, 0x05, 0x05, 0x05, 0x05, 0x05,
+ 0x05, 0x05, 0x05, 0x05, 0x05, 0x05, 0x05, 0x05, /* 5. */
+ 0x05, 0x05, 0x05, 0x04, 0x04, 0x04, 0x04, 0x05,
+ 0x04, 0x15, 0x15, 0x15, 0x15, 0x15, 0x15, 0x05, /* 6. */
+ 0x05, 0x05, 0x05, 0x05, 0x05, 0x05, 0x05, 0x05,
+ 0x05, 0x05, 0x05, 0x05, 0x05, 0x05, 0x05, 0x05, /* 7. */
+ 0x05, 0x05, 0x05, 0x04, 0x04, 0x04, 0x04, 0x00,
+ 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 8. */
+ 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
+ 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 9. */
+ 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
+ 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* a. */
+ 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
+ 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* b. */
+ 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
+ 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* c. */
+ 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
+ 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* d. */
+ 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
+ 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* e. */
+ 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
+ 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* f. */
+ 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
+};
+
+#endif /* } */
diff --git a/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/lua/lctype.h b/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/lua/lctype.h
new file mode 100644
index 000000000000..299a59b92e2c
--- /dev/null
+++ b/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/lua/lctype.h
@@ -0,0 +1,93 @@
+/*
+** $Id: lctype.h,v 1.12.1.1 2013/04/12 18:48:47 roberto Exp $
+** 'ctype' functions for Lua
+** See Copyright Notice in lua.h
+*/
+
+#ifndef lctype_h
+#define lctype_h
+
+#include "lua.h"
+
+
+/*
+** WARNING: the functions defined here do not necessarily correspond
+** to the similar functions in the standard C ctype.h. They are
+** optimized for the specific needs of Lua.
+*/
+
+#if !defined(LUA_USE_CTYPE)
+
+#if 'A' == 65 && '0' == 48
+/* ASCII case: can use its own tables; faster and fixed */
+#define LUA_USE_CTYPE 0
+#else
+/* must use standard C ctype */
+#define LUA_USE_CTYPE 1
+#endif
+
+#endif
+
+
+#if !LUA_USE_CTYPE /* { */
+
+#include "llimits.h"
+
+
+#define ALPHABIT 0
+#define DIGITBIT 1
+#define PRINTBIT 2
+#define SPACEBIT 3
+#define XDIGITBIT 4
+
+
+#define MASK(B) (1 << (B))
+
+
+/*
+** add 1 to char to allow index -1 (EOZ)
+*/
+#define testprop(c,p) (luai_ctype_[(c)+1] & (p))
+
+/*
+** 'lalpha' (Lua alphabetic) and 'lalnum' (Lua alphanumeric) both include '_'
+*/
+#define lislalpha(c) testprop(c, MASK(ALPHABIT))
+#define lislalnum(c) testprop(c, (MASK(ALPHABIT) | MASK(DIGITBIT)))
+#define lisdigit(c) testprop(c, MASK(DIGITBIT))
+#define lisspace(c) testprop(c, MASK(SPACEBIT))
+#define lisprint(c) testprop(c, MASK(PRINTBIT))
+#define lisxdigit(c) testprop(c, MASK(XDIGITBIT))
+
+/*
+** this 'ltolower' only works for alphabetic characters
+*/
+#define ltolower(c) ((c) | ('A' ^ 'a'))
+
+
+/* two more entries for 0 and -1 (EOZ) */
+LUAI_DDEC const lu_byte luai_ctype_[UCHAR_MAX + 2];
+
+
+#else /* }{ */
+
+/*
+** use standard C ctypes
+*/
+
+#include <ctype.h>
+
+
+#define lislalpha(c) (isalpha(c) || (c) == '_')
+#define lislalnum(c) (isalnum(c) || (c) == '_')
+#define lisdigit(c) (isdigit(c))
+#define lisspace(c) (isspace(c))
+#define lisprint(c) (isprint(c))
+#define lisxdigit(c) (isxdigit(c))
+
+#define ltolower(c) (tolower(c))
+
+#endif /* } */
+
+#endif
+
diff --git a/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/lua/ldebug.c b/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/lua/ldebug.c
new file mode 100644
index 000000000000..b8ddcff3c6bb
--- /dev/null
+++ b/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/lua/ldebug.c
@@ -0,0 +1,607 @@
+/*
+** $Id: ldebug.c,v 2.90.1.4 2015/02/19 17:05:13 roberto Exp $
+** Debug Interface
+** See Copyright Notice in lua.h
+*/
+
+
+#include <sys/zfs_context.h>
+
+#define ldebug_c
+#define LUA_CORE
+
+#include "lua.h"
+
+#include "lapi.h"
+#include "lcode.h"
+#include "ldebug.h"
+#include "ldo.h"
+#include "lfunc.h"
+#include "lobject.h"
+#include "lopcodes.h"
+#include "lstate.h"
+#include "lstring.h"
+#include "ltable.h"
+#include "ltm.h"
+#include "lvm.h"
+
+
+
+#define noLuaClosure(f) ((f) == NULL || (f)->c.tt == LUA_TCCL)
+
+
+static const char *getfuncname (lua_State *L, CallInfo *ci, const char **name);
+
+
+static int currentpc (CallInfo *ci) {
+ lua_assert(isLua(ci));
+ return pcRel(ci->u.l.savedpc, ci_func(ci)->p);
+}
+
+
+static int currentline (CallInfo *ci) {
+ return getfuncline(ci_func(ci)->p, currentpc(ci));
+}
+
+
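+/*
+** While a thread is suspended in a yield, 'ci->func' points at the stack
+** slot protecting the yielded values and 'ci->extra' remembers the real
+** function position (see lua_yieldk in ldo.c); temporarily swapping the
+** two lets the debug functions below inspect the correct function.
+*/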
+static void swapextra (lua_State *L) {
+ if (L->status == LUA_YIELD) {
+ CallInfo *ci = L->ci; /* get function that yielded */
+ StkId temp = ci->func; /* exchange its 'func' and 'extra' values */
+ ci->func = restorestack(L, ci->extra);
+ ci->extra = savestack(L, temp);
+ }
+}
+
+
+/*
+** this function can be called asynchronously (e.g. during a signal)
+*/
+LUA_API int lua_sethook (lua_State *L, lua_Hook func, int mask, int count) {
+ if (func == NULL || mask == 0) { /* turn off hooks? */
+ mask = 0;
+ func = NULL;
+ }
+ if (isLua(L->ci))
+ L->oldpc = L->ci->u.l.savedpc;
+ L->hook = func;
+ L->basehookcount = count;
+ resethookcount(L);
+ L->hookmask = cast_byte(mask);
+ return 1;
+}
+
+
+LUA_API lua_Hook lua_gethook (lua_State *L) {
+ return L->hook;
+}
+
+
+LUA_API int lua_gethookmask (lua_State *L) {
+ return L->hookmask;
+}
+
+
+LUA_API int lua_gethookcount (lua_State *L) {
+ return L->basehookcount;
+}
+
+
+LUA_API int lua_getstack (lua_State *L, int level, lua_Debug *ar) {
+ int status;
+ CallInfo *ci;
+ if (level < 0) return 0; /* invalid (negative) level */
+ lua_lock(L);
+ for (ci = L->ci; level > 0 && ci != &L->base_ci; ci = ci->previous)
+ level--;
+ if (level == 0 && ci != &L->base_ci) { /* level found? */
+ status = 1;
+ ar->i_ci = ci;
+ }
+ else status = 0; /* no such level */
+ lua_unlock(L);
+ return status;
+}
+
+
+static const char *upvalname (Proto *p, int uv) {
+ TString *s = check_exp(uv < p->sizeupvalues, p->upvalues[uv].name);
+ if (s == NULL) return "?";
+ else return getstr(s);
+}
+
+
+static const char *findvararg (CallInfo *ci, int n, StkId *pos) {
+ int nparams = clLvalue(ci->func)->p->numparams;
+ if (n >= ci->u.l.base - ci->func - nparams)
+ return NULL; /* no such vararg */
+ else {
+ *pos = ci->func + nparams + n;
+ return "(*vararg)"; /* generic name for any vararg */
+ }
+}
+
+
+static const char *findlocal (lua_State *L, CallInfo *ci, int n,
+ StkId *pos) {
+ const char *name = NULL;
+ StkId base;
+ if (isLua(ci)) {
+ if (n < 0) /* access to vararg values? */
+ return findvararg(ci, -n, pos);
+ else {
+ base = ci->u.l.base;
+ name = luaF_getlocalname(ci_func(ci)->p, n, currentpc(ci));
+ }
+ }
+ else
+ base = ci->func + 1;
+ if (name == NULL) { /* no 'standard' name? */
+ StkId limit = (ci == L->ci) ? L->top : ci->next->func;
+ if (limit - base >= n && n > 0) /* is 'n' inside 'ci' stack? */
+ name = "(*temporary)"; /* generic name for any valid slot */
+ else
+ return NULL; /* no name */
+ }
+ *pos = base + (n - 1);
+ return name;
+}
+
+
+LUA_API const char *lua_getlocal (lua_State *L, const lua_Debug *ar, int n) {
+ const char *name;
+ lua_lock(L);
+ swapextra(L);
+ if (ar == NULL) { /* information about non-active function? */
+ if (!isLfunction(L->top - 1)) /* not a Lua function? */
+ name = NULL;
+ else /* consider live variables at function start (parameters) */
+ name = luaF_getlocalname(clLvalue(L->top - 1)->p, n, 0);
+ }
+ else { /* active function; get information through 'ar' */
+ StkId pos = 0; /* to avoid warnings */
+ name = findlocal(L, ar->i_ci, n, &pos);
+ if (name) {
+ setobj2s(L, L->top, pos);
+ api_incr_top(L);
+ }
+ }
+ swapextra(L);
+ lua_unlock(L);
+ return name;
+}
+
+
+LUA_API const char *lua_setlocal (lua_State *L, const lua_Debug *ar, int n) {
+ StkId pos = 0; /* to avoid warnings */
+ const char *name;
+ lua_lock(L);
+ swapextra(L);
+ name = findlocal(L, ar->i_ci, n, &pos);
+ if (name)
+ setobjs2s(L, pos, L->top - 1);
+ L->top--; /* pop value */
+ swapextra(L);
+ lua_unlock(L);
+ return name;
+}
+
+
+static void funcinfo (lua_Debug *ar, Closure *cl) {
+ if (noLuaClosure(cl)) {
+ ar->source = "=[C]";
+ ar->linedefined = -1;
+ ar->lastlinedefined = -1;
+ ar->what = "C";
+ }
+ else {
+ Proto *p = cl->l.p;
+ ar->source = p->source ? getstr(p->source) : "=?";
+ ar->linedefined = p->linedefined;
+ ar->lastlinedefined = p->lastlinedefined;
+ ar->what = (ar->linedefined == 0) ? "main" : "Lua";
+ }
+ luaO_chunkid(ar->short_src, ar->source, LUA_IDSIZE);
+}
+
+
+static void collectvalidlines (lua_State *L, Closure *f) {
+ if (noLuaClosure(f)) {
+ setnilvalue(L->top);
+ api_incr_top(L);
+ }
+ else {
+ int i;
+ TValue v;
+ int *lineinfo = f->l.p->lineinfo;
+ Table *t = luaH_new(L); /* new table to store active lines */
+ sethvalue(L, L->top, t); /* push it on stack */
+ api_incr_top(L);
+ setbvalue(&v, 1); /* boolean 'true' to be the value of all indices */
+ for (i = 0; i < f->l.p->sizelineinfo; i++) /* for all lines with code */
+ luaH_setint(L, t, lineinfo[i], &v); /* table[line] = true */
+ }
+}
+
+
+static int auxgetinfo (lua_State *L, const char *what, lua_Debug *ar,
+ Closure *f, CallInfo *ci) {
+ int status = 1;
+ for (; *what; what++) {
+ switch (*what) {
+ case 'S': {
+ funcinfo(ar, f);
+ break;
+ }
+ case 'l': {
+ ar->currentline = (ci && isLua(ci)) ? currentline(ci) : -1;
+ break;
+ }
+ case 'u': {
+ ar->nups = (f == NULL) ? 0 : f->c.nupvalues;
+ if (noLuaClosure(f)) {
+ ar->isvararg = 1;
+ ar->nparams = 0;
+ }
+ else {
+ ar->isvararg = f->l.p->is_vararg;
+ ar->nparams = f->l.p->numparams;
+ }
+ break;
+ }
+ case 't': {
+ ar->istailcall = (ci) ? ci->callstatus & CIST_TAIL : 0;
+ break;
+ }
+ case 'n': {
+ /* calling function is a known Lua function? */
+ if (ci && !(ci->callstatus & CIST_TAIL) && isLua(ci->previous))
+ ar->namewhat = getfuncname(L, ci->previous, &ar->name);
+ else
+ ar->namewhat = NULL;
+ if (ar->namewhat == NULL) {
+ ar->namewhat = ""; /* not found */
+ ar->name = NULL;
+ }
+ break;
+ }
+ case 'L':
+ case 'f': /* handled by lua_getinfo */
+ break;
+ default: status = 0; /* invalid option */
+ }
+ }
+ return status;
+}
+
+
+LUA_API int lua_getinfo (lua_State *L, const char *what, lua_Debug *ar) {
+ int status;
+ Closure *cl;
+ CallInfo *ci;
+ StkId func;
+ lua_lock(L);
+ swapextra(L);
+ if (*what == '>') {
+ ci = NULL;
+ func = L->top - 1;
+ api_check(L, ttisfunction(func), "function expected");
+ what++; /* skip the '>' */
+ L->top--; /* pop function */
+ }
+ else {
+ ci = ar->i_ci;
+ func = ci->func;
+ lua_assert(ttisfunction(ci->func));
+ }
+ cl = ttisclosure(func) ? clvalue(func) : NULL;
+ status = auxgetinfo(L, what, ar, cl, ci);
+ if (strchr(what, 'f')) {
+ setobjs2s(L, L->top, func);
+ api_incr_top(L);
+ }
+ swapextra(L);
+ if (strchr(what, 'L'))
+ collectvalidlines(L, cl);
+ lua_unlock(L);
+ return status;
+}
+
+
+/*
+** {======================================================
+** Symbolic Execution
+** =======================================================
+*/
+
+static const char *getobjname (Proto *p, int lastpc, int reg,
+ const char **name);
+
+
+/*
+** find a "name" for the RK value 'c'
+*/
+static void kname (Proto *p, int pc, int c, const char **name) {
+ if (ISK(c)) { /* is 'c' a constant? */
+ TValue *kvalue = &p->k[INDEXK(c)];
+ if (ttisstring(kvalue)) { /* literal constant? */
+ *name = svalue(kvalue); /* it is its own name */
+ return;
+ }
+ /* else no reasonable name found */
+ }
+ else { /* 'c' is a register */
+ const char *what = getobjname(p, pc, c, name); /* search for 'c' */
+ if (what && *what == 'c') { /* found a constant name? */
+ return; /* 'name' already filled */
+ }
+ /* else no reasonable name found */
+ }
+ *name = "?"; /* no reasonable name found */
+}
+
+
+static int filterpc (int pc, int jmptarget) {
+ if (pc < jmptarget) /* is code conditional (inside a jump)? */
+ return -1; /* cannot know who sets that register */
+ else return pc; /* current position sets that register */
+}
+
+
+/*
+** try to find last instruction before 'lastpc' that modified register 'reg'
+*/
+static int findsetreg (Proto *p, int lastpc, int reg) {
+ int pc;
+ int setreg = -1; /* keep last instruction that changed 'reg' */
+ int jmptarget = 0; /* any code before this address is conditional */
+ for (pc = 0; pc < lastpc; pc++) {
+ Instruction i = p->code[pc];
+ OpCode op = GET_OPCODE(i);
+ int a = GETARG_A(i);
+ switch (op) {
+ case OP_LOADNIL: {
+ int b = GETARG_B(i);
+ if (a <= reg && reg <= a + b) /* set registers from 'a' to 'a+b' */
+ setreg = filterpc(pc, jmptarget);
+ break;
+ }
+ case OP_TFORCALL: {
+ if (reg >= a + 2) /* affect all regs above its base */
+ setreg = filterpc(pc, jmptarget);
+ break;
+ }
+ case OP_CALL:
+ case OP_TAILCALL: {
+ if (reg >= a) /* affect all registers above base */
+ setreg = filterpc(pc, jmptarget);
+ break;
+ }
+ case OP_JMP: {
+ int b = GETARG_sBx(i);
+ int dest = pc + 1 + b;
+        /* jump is forward and does not skip `lastpc'? */
+ if (pc < dest && dest <= lastpc) {
+ if (dest > jmptarget)
+ jmptarget = dest; /* update 'jmptarget' */
+ }
+ break;
+ }
+ case OP_TEST: {
+ if (reg == a) /* jumped code can change 'a' */
+ setreg = filterpc(pc, jmptarget);
+ break;
+ }
+ default:
+ if (testAMode(op) && reg == a) /* any instruction that set A */
+ setreg = filterpc(pc, jmptarget);
+ break;
+ }
+ }
+ return setreg;
+}
+
+
+static const char *getobjname (Proto *p, int lastpc, int reg,
+ const char **name) {
+ int pc;
+ *name = luaF_getlocalname(p, reg + 1, lastpc);
+ if (*name) /* is a local? */
+ return "local";
+ /* else try symbolic execution */
+ pc = findsetreg(p, lastpc, reg);
+ if (pc != -1) { /* could find instruction? */
+ Instruction i = p->code[pc];
+ OpCode op = GET_OPCODE(i);
+ switch (op) {
+ case OP_MOVE: {
+ int b = GETARG_B(i); /* move from 'b' to 'a' */
+ if (b < GETARG_A(i))
+ return getobjname(p, pc, b, name); /* get name for 'b' */
+ break;
+ }
+ case OP_GETTABUP:
+ case OP_GETTABLE: {
+ int k = GETARG_C(i); /* key index */
+ int t = GETARG_B(i); /* table index */
+ const char *vn = (op == OP_GETTABLE) /* name of indexed variable */
+ ? luaF_getlocalname(p, t + 1, pc)
+ : upvalname(p, t);
+ kname(p, pc, k, name);
+ return (vn && strcmp(vn, LUA_ENV) == 0) ? "global" : "field";
+ }
+ case OP_GETUPVAL: {
+ *name = upvalname(p, GETARG_B(i));
+ return "upvalue";
+ }
+ case OP_LOADK:
+ case OP_LOADKX: {
+ int b = (op == OP_LOADK) ? GETARG_Bx(i)
+ : GETARG_Ax(p->code[pc + 1]);
+ if (ttisstring(&p->k[b])) {
+ *name = svalue(&p->k[b]);
+ return "constant";
+ }
+ break;
+ }
+ case OP_SELF: {
+ int k = GETARG_C(i); /* key index */
+ kname(p, pc, k, name);
+ return "method";
+ }
+ default: break; /* go through to return NULL */
+ }
+ }
+ return NULL; /* could not find reasonable name */
+}
+
+
+static const char *getfuncname (lua_State *L, CallInfo *ci, const char **name) {
+ TMS tm;
+ Proto *p = ci_func(ci)->p; /* calling function */
+ int pc = currentpc(ci); /* calling instruction index */
+ Instruction i = p->code[pc]; /* calling instruction */
+ switch (GET_OPCODE(i)) {
+ case OP_CALL:
+ case OP_TAILCALL: /* get function name */
+ return getobjname(p, pc, GETARG_A(i), name);
+ case OP_TFORCALL: { /* for iterator */
+ *name = "for iterator";
+ return "for iterator";
+ }
+ /* all other instructions can call only through metamethods */
+ case OP_SELF:
+ case OP_GETTABUP:
+ case OP_GETTABLE: tm = TM_INDEX; break;
+ case OP_SETTABUP:
+ case OP_SETTABLE: tm = TM_NEWINDEX; break;
+ case OP_EQ: tm = TM_EQ; break;
+ case OP_ADD: tm = TM_ADD; break;
+ case OP_SUB: tm = TM_SUB; break;
+ case OP_MUL: tm = TM_MUL; break;
+ case OP_DIV: tm = TM_DIV; break;
+ case OP_MOD: tm = TM_MOD; break;
+ case OP_POW: tm = TM_POW; break;
+ case OP_UNM: tm = TM_UNM; break;
+ case OP_LEN: tm = TM_LEN; break;
+ case OP_LT: tm = TM_LT; break;
+ case OP_LE: tm = TM_LE; break;
+ case OP_CONCAT: tm = TM_CONCAT; break;
+ default:
+ return NULL; /* else no useful name can be found */
+ }
+ *name = getstr(G(L)->tmname[tm]);
+ return "metamethod";
+}
+
+/* }====================================================== */
+
+
+
+/*
+** the only ANSI way to check whether a pointer points into an array
+** (used only for error messages, so efficiency is not a big concern)
+*/
+static int isinstack (CallInfo *ci, const TValue *o) {
+ StkId p;
+ for (p = ci->u.l.base; p < ci->top; p++)
+ if (o == p) return 1;
+ return 0;
+}
+
+
+static const char *getupvalname (CallInfo *ci, const TValue *o,
+ const char **name) {
+ LClosure *c = ci_func(ci);
+ int i;
+ for (i = 0; i < c->nupvalues; i++) {
+ if (c->upvals[i]->v == o) {
+ *name = upvalname(c->p, i);
+ return "upvalue";
+ }
+ }
+ return NULL;
+}
+
+
+l_noret luaG_typeerror (lua_State *L, const TValue *o, const char *op) {
+ CallInfo *ci = L->ci;
+ const char *name = NULL;
+ const char *t = objtypename(o);
+ const char *kind = NULL;
+ if (isLua(ci)) {
+ kind = getupvalname(ci, o, &name); /* check whether 'o' is an upvalue */
+ if (!kind && isinstack(ci, o)) /* no? try a register */
+ kind = getobjname(ci_func(ci)->p, currentpc(ci),
+ cast_int(o - ci->u.l.base), &name);
+ }
+ if (kind)
+ luaG_runerror(L, "attempt to %s %s " LUA_QS " (a %s value)",
+ op, kind, name, t);
+ else
+ luaG_runerror(L, "attempt to %s a %s value", op, t);
+}
+
+
+l_noret luaG_concaterror (lua_State *L, StkId p1, StkId p2) {
+ if (ttisstring(p1) || ttisnumber(p1)) p1 = p2;
+ lua_assert(!ttisstring(p1) && !ttisnumber(p1));
+ luaG_typeerror(L, p1, "concatenate");
+}
+
+
+l_noret luaG_aritherror (lua_State *L, const TValue *p1, const TValue *p2) {
+ TValue temp;
+ if (luaV_tonumber(p1, &temp) == NULL)
+ p2 = p1; /* first operand is wrong */
+ luaG_typeerror(L, p2, "perform arithmetic on");
+}
+
+
+l_noret luaG_ordererror (lua_State *L, const TValue *p1, const TValue *p2) {
+ const char *t1 = objtypename(p1);
+ const char *t2 = objtypename(p2);
+ if (t1 == t2)
+ luaG_runerror(L, "attempt to compare two %s values", t1);
+ else
+ luaG_runerror(L, "attempt to compare %s with %s", t1, t2);
+}
+
+
+static void addinfo (lua_State *L, const char *msg) {
+ CallInfo *ci = L->ci;
+ if (isLua(ci)) { /* is Lua code? */
+ char buff[LUA_IDSIZE]; /* add file:line information */
+ int line = currentline(ci);
+ TString *src = ci_func(ci)->p->source;
+ if (src)
+ luaO_chunkid(buff, getstr(src), LUA_IDSIZE);
+ else { /* no source available; use "?" instead */
+ buff[0] = '?'; buff[1] = '\0';
+ }
+ luaO_pushfstring(L, "%s:%d: %s", buff, line, msg);
+ }
+}
+
+
+l_noret luaG_errormsg (lua_State *L) {
+ if (L->errfunc != 0) { /* is there an error handling function? */
+ StkId errfunc = restorestack(L, L->errfunc);
+ if (!ttisfunction(errfunc)) luaD_throw(L, LUA_ERRERR);
+ setobjs2s(L, L->top, L->top - 1); /* move argument */
+ setobjs2s(L, L->top - 1, errfunc); /* push function */
+ L->top++;
+ luaD_call(L, L->top - 2, 1, 0); /* call it */
+ }
+ luaD_throw(L, LUA_ERRRUN);
+}
+
+
+l_noret luaG_runerror (lua_State *L, const char *fmt, ...) {
+ va_list argp;
+ va_start(argp, fmt);
+ addinfo(L, luaO_pushvfstring(L, fmt, argp));
+ va_end(argp);
+ luaG_errormsg(L);
+}
+
diff --git a/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/lua/ldebug.h b/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/lua/ldebug.h
new file mode 100644
index 000000000000..6445c763ea51
--- /dev/null
+++ b/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/lua/ldebug.h
@@ -0,0 +1,34 @@
+/*
+** $Id: ldebug.h,v 2.7.1.1 2013/04/12 18:48:47 roberto Exp $
+** Auxiliary functions from Debug Interface module
+** See Copyright Notice in lua.h
+*/
+
+#ifndef ldebug_h
+#define ldebug_h
+
+
+#include "lstate.h"
+
+
+#define pcRel(pc, p) (cast(int, (pc) - (p)->code) - 1)
+
+#define getfuncline(f,pc) (((f)->lineinfo) ? (f)->lineinfo[pc] : 0)
+
+#define resethookcount(L) (L->hookcount = L->basehookcount)
+
+/* Active Lua function (given call info) */
+#define ci_func(ci) (clLvalue((ci)->func))
+
+
+LUAI_FUNC l_noret luaG_typeerror (lua_State *L, const TValue *o,
+ const char *opname);
+LUAI_FUNC l_noret luaG_concaterror (lua_State *L, StkId p1, StkId p2);
+LUAI_FUNC l_noret luaG_aritherror (lua_State *L, const TValue *p1,
+ const TValue *p2);
+LUAI_FUNC l_noret luaG_ordererror (lua_State *L, const TValue *p1,
+ const TValue *p2);
+LUAI_FUNC l_noret luaG_runerror (lua_State *L, const char *fmt, ...);
+LUAI_FUNC l_noret luaG_errormsg (lua_State *L);
+
+#endif
diff --git a/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/lua/ldo.c b/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/lua/ldo.c
new file mode 100644
index 000000000000..cb49cb55e6cf
--- /dev/null
+++ b/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/lua/ldo.c
@@ -0,0 +1,691 @@
+/*
+** $Id: ldo.c,v 2.108.1.3 2013/11/08 18:22:50 roberto Exp $
+** Stack and Call structure of Lua
+** See Copyright Notice in lua.h
+*/
+
+
+#include <sys/zfs_context.h>
+
+#define ldo_c
+#define LUA_CORE
+
+#include "lua.h"
+
+#include "lapi.h"
+#include "ldebug.h"
+#include "ldo.h"
+#include "lfunc.h"
+#include "lgc.h"
+#include "lmem.h"
+#include "lobject.h"
+#include "lopcodes.h"
+#include "lparser.h"
+#include "lstate.h"
+#include "lstring.h"
+#include "ltable.h"
+#include "ltm.h"
+#include "lundump.h"
+#include "lvm.h"
+#include "lzio.h"
+
+
+
+
+/*
+** {======================================================
+** Error-recovery functions
+** =======================================================
+*/
+
+/*
+** LUAI_THROW/LUAI_TRY define how Lua does exception handling. By
+** default, Lua handles errors with exceptions when compiling as
+** C++ code, with _longjmp/_setjmp when asked to use them, and with
+** longjmp/setjmp otherwise.
+*/
+#if !defined(LUAI_THROW)
+
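+/*
+** In a kernel build the illumos setjmp/longjmp operate on a label_t and
+** longjmp takes no status argument, whereas other kernels provide the
+** standard jmp_buf interface.
+*/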
+#ifdef _KERNEL
+#ifdef illumos
+#define LUAI_THROW(L,c) longjmp(&(c)->b)
+#define LUAI_TRY(L,c,a) if (setjmp(&(c)->b) == 0) { a }
+#define luai_jmpbuf label_t
+#else
+#define LUAI_THROW(L,c) longjmp((c)->b, 1)
+#define LUAI_TRY(L,c,a) if (setjmp((c)->b) == 0) { a }
+#define luai_jmpbuf jmp_buf
+#endif
+#else
+#if defined(__cplusplus) && !defined(LUA_USE_LONGJMP)
+/* C++ exceptions */
+#define LUAI_THROW(L,c) throw(c)
+#define LUAI_TRY(L,c,a) \
+ try { a } catch(...) { if ((c)->status == 0) (c)->status = -1; }
+#define luai_jmpbuf int /* dummy variable */
+
+#elif defined(LUA_USE_ULONGJMP)
+/* in Unix, try _longjmp/_setjmp (more efficient) */
+#define LUAI_THROW(L,c) _longjmp((c)->b, 1)
+#define LUAI_TRY(L,c,a) if (_setjmp((c)->b) == 0) { a }
+#define luai_jmpbuf jmp_buf
+
+#else
+/* default handling with long jumps */
+#define LUAI_THROW(L,c) longjmp((c)->b, 1)
+#define LUAI_TRY(L,c,a) if (setjmp((c)->b) == 0) { a }
+#define luai_jmpbuf jmp_buf
+
+#endif
+
+#endif
+
+#endif
+
+
+/* chain list of long jump buffers */
+struct lua_longjmp {
+ struct lua_longjmp *previous;
+ luai_jmpbuf b;
+ volatile int status; /* error code */
+};
+
+
+static void seterrorobj (lua_State *L, int errcode, StkId oldtop) {
+ switch (errcode) {
+ case LUA_ERRMEM: { /* memory error? */
+ setsvalue2s(L, oldtop, G(L)->memerrmsg); /* reuse preregistered msg. */
+ break;
+ }
+ case LUA_ERRERR: {
+ setsvalue2s(L, oldtop, luaS_newliteral(L, "error in error handling"));
+ break;
+ }
+ default: {
+ setobjs2s(L, oldtop, L->top - 1); /* error message on current top */
+ break;
+ }
+ }
+ L->top = oldtop + 1;
+}
+
+
+l_noret luaD_throw (lua_State *L, int errcode) {
+ if (L->errorJmp) { /* thread has an error handler? */
+ L->errorJmp->status = errcode; /* set status */
+ LUAI_THROW(L, L->errorJmp); /* jump to it */
+ }
+ else { /* thread has no error handler */
+ L->status = cast_byte(errcode); /* mark it as dead */
+ if (G(L)->mainthread->errorJmp) { /* main thread has a handler? */
+ setobjs2s(L, G(L)->mainthread->top++, L->top - 1); /* copy error obj. */
+ luaD_throw(G(L)->mainthread, errcode); /* re-throw in main thread */
+ }
+ else { /* no handler at all; abort */
+ if (G(L)->panic) { /* panic function? */
+ lua_unlock(L);
+ G(L)->panic(L); /* call it (last chance to jump out) */
+ }
+ panic("no error handler");
+ }
+ }
+}
+
+
+int luaD_rawrunprotected (lua_State *L, Pfunc f, void *ud) {
+ unsigned short oldnCcalls = L->nCcalls;
+ struct lua_longjmp lj;
+ lj.status = LUA_OK;
+ lj.previous = L->errorJmp; /* chain new error handler */
+ L->errorJmp = &lj;
+ LUAI_TRY(L, &lj,
+ (*f)(L, ud);
+ );
+ L->errorJmp = lj.previous; /* restore old error handler */
+ L->nCcalls = oldnCcalls;
+ return lj.status;
+}
+
+/* }====================================================== */
+
+
+static void correctstack (lua_State *L, TValue *oldstack) {
+ CallInfo *ci;
+ GCObject *up;
+ L->top = (L->top - oldstack) + L->stack;
+ for (up = L->openupval; up != NULL; up = up->gch.next)
+ gco2uv(up)->v = (gco2uv(up)->v - oldstack) + L->stack;
+ for (ci = L->ci; ci != NULL; ci = ci->previous) {
+ ci->top = (ci->top - oldstack) + L->stack;
+ ci->func = (ci->func - oldstack) + L->stack;
+ if (isLua(ci))
+ ci->u.l.base = (ci->u.l.base - oldstack) + L->stack;
+ }
+}
+
+
+/* some space for error handling */
+#define ERRORSTACKSIZE (LUAI_MAXSTACK + 200)
+
+
+void luaD_reallocstack (lua_State *L, int newsize) {
+ TValue *oldstack = L->stack;
+ int lim = L->stacksize;
+ lua_assert(newsize <= LUAI_MAXSTACK || newsize == ERRORSTACKSIZE);
+ lua_assert(L->stack_last - L->stack == L->stacksize - EXTRA_STACK);
+ luaM_reallocvector(L, L->stack, L->stacksize, newsize, TValue);
+ for (; lim < newsize; lim++)
+ setnilvalue(L->stack + lim); /* erase new segment */
+ L->stacksize = newsize;
+ L->stack_last = L->stack + newsize - EXTRA_STACK;
+ correctstack(L, oldstack);
+}
+
+
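+/*
+** Grow the stack so that at least 'n' more slots fit: prefer doubling the
+** current size, clamp to LUAI_MAXSTACK, and fall back to the exact size
+** needed; if even that exceeds LUAI_MAXSTACK, switch to the oversized
+** error stack and raise "stack overflow".
+*/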
+void luaD_growstack (lua_State *L, int n) {
+ int size = L->stacksize;
+ if (size > LUAI_MAXSTACK) /* error after extra size? */
+ luaD_throw(L, LUA_ERRERR);
+ else {
+ int needed = cast_int(L->top - L->stack) + n + EXTRA_STACK;
+ int newsize = 2 * size;
+ if (newsize > LUAI_MAXSTACK) newsize = LUAI_MAXSTACK;
+ if (newsize < needed) newsize = needed;
+ if (newsize > LUAI_MAXSTACK) { /* stack overflow? */
+ luaD_reallocstack(L, ERRORSTACKSIZE);
+ luaG_runerror(L, "stack overflow");
+ }
+ else
+ luaD_reallocstack(L, newsize);
+ }
+}
+
+
+static int stackinuse (lua_State *L) {
+ CallInfo *ci;
+ StkId lim = L->top;
+ for (ci = L->ci; ci != NULL; ci = ci->previous) {
+ lua_assert(ci->top <= L->stack_last);
+ if (lim < ci->top) lim = ci->top;
+ }
+ return cast_int(lim - L->stack) + 1; /* part of stack in use */
+}
+
+
+void luaD_shrinkstack (lua_State *L) {
+ int inuse = stackinuse(L);
+ int goodsize = inuse + (inuse / 8) + 2*EXTRA_STACK;
+ if (goodsize > LUAI_MAXSTACK) goodsize = LUAI_MAXSTACK;
+ if (inuse > LUAI_MAXSTACK || /* handling stack overflow? */
+ goodsize >= L->stacksize) /* would grow instead of shrink? */
+ condmovestack(L); /* don't change stack (change only for debugging) */
+ else
+ luaD_reallocstack(L, goodsize); /* shrink it */
+}
+
+
+void luaD_hook (lua_State *L, int event, int line) {
+ lua_Hook hook = L->hook;
+ if (hook && L->allowhook) {
+ CallInfo *ci = L->ci;
+ ptrdiff_t top = savestack(L, L->top);
+ ptrdiff_t ci_top = savestack(L, ci->top);
+ lua_Debug ar;
+ ar.event = event;
+ ar.currentline = line;
+ ar.i_ci = ci;
+ luaD_checkstack(L, LUA_MINSTACK); /* ensure minimum stack size */
+ ci->top = L->top + LUA_MINSTACK;
+ lua_assert(ci->top <= L->stack_last);
+ L->allowhook = 0; /* cannot call hooks inside a hook */
+ ci->callstatus |= CIST_HOOKED;
+ lua_unlock(L);
+ (*hook)(L, &ar);
+ lua_lock(L);
+ lua_assert(!L->allowhook);
+ L->allowhook = 1;
+ ci->top = restorestack(L, ci_top);
+ L->top = restorestack(L, top);
+ ci->callstatus &= ~CIST_HOOKED;
+ }
+}
+
+
+static void callhook (lua_State *L, CallInfo *ci) {
+ int hook = LUA_HOOKCALL;
+ ci->u.l.savedpc++; /* hooks assume 'pc' is already incremented */
+ if (isLua(ci->previous) &&
+ GET_OPCODE(*(ci->previous->u.l.savedpc - 1)) == OP_TAILCALL) {
+ ci->callstatus |= CIST_TAIL;
+ hook = LUA_HOOKTAILCALL;
+ }
+ luaD_hook(L, hook, -1);
+ ci->u.l.savedpc--; /* correct 'pc' */
+}
+
+
+static StkId adjust_varargs (lua_State *L, Proto *p, int actual) {
+ int i;
+ int nfixargs = p->numparams;
+ StkId base, fixed;
+ lua_assert(actual >= nfixargs);
+ /* move fixed parameters to final position */
+ luaD_checkstack(L, p->maxstacksize); /* check again for new 'base' */
+ fixed = L->top - actual; /* first fixed argument */
+ base = L->top; /* final position of first argument */
+ for (i=0; i<nfixargs; i++) {
+ setobjs2s(L, L->top++, fixed + i);
+ setnilvalue(fixed + i);
+ }
+ return base;
+}
+
+
+static StkId tryfuncTM (lua_State *L, StkId func) {
+ const TValue *tm = luaT_gettmbyobj(L, func, TM_CALL);
+ StkId p;
+ ptrdiff_t funcr = savestack(L, func);
+ if (!ttisfunction(tm))
+ luaG_typeerror(L, func, "call");
+ /* Open a hole inside the stack at `func' */
+ for (p = L->top; p > func; p--) setobjs2s(L, p, p-1);
+ incr_top(L);
+ func = restorestack(L, funcr); /* previous call may change stack */
+ setobj2s(L, func, tm); /* tag method is the new function to be called */
+ return func;
+}
+
+
+
+#define next_ci(L) (L->ci = (L->ci->next ? L->ci->next : luaE_extendCI(L)))
+
+
+/*
+** returns true if function has been executed (C function)
+*/
+int luaD_precall (lua_State *L, StkId func, int nresults) {
+ lua_CFunction f;
+ CallInfo *ci;
+ int n; /* number of arguments (Lua) or returns (C) */
+ ptrdiff_t funcr = savestack(L, func);
+ switch (ttype(func)) {
+ case LUA_TLCF: /* light C function */
+ f = fvalue(func);
+ goto Cfunc;
+ case LUA_TCCL: { /* C closure */
+ f = clCvalue(func)->f;
+ Cfunc:
+ luaD_checkstack(L, LUA_MINSTACK); /* ensure minimum stack size */
+ ci = next_ci(L); /* now 'enter' new function */
+ ci->nresults = nresults;
+ ci->func = restorestack(L, funcr);
+ ci->top = L->top + LUA_MINSTACK;
+ lua_assert(ci->top <= L->stack_last);
+ ci->callstatus = 0;
+ luaC_checkGC(L); /* stack grow uses memory */
+ if (L->hookmask & LUA_MASKCALL)
+ luaD_hook(L, LUA_HOOKCALL, -1);
+ lua_unlock(L);
+ n = (*f)(L); /* do the actual call */
+ lua_lock(L);
+ api_checknelems(L, n);
+ luaD_poscall(L, L->top - n);
+ return 1;
+ }
+ case LUA_TLCL: { /* Lua function: prepare its call */
+ StkId base;
+ Proto *p = clLvalue(func)->p;
+ n = cast_int(L->top - func) - 1; /* number of real arguments */
+ luaD_checkstack(L, p->maxstacksize);
+ for (; n < p->numparams; n++)
+ setnilvalue(L->top++); /* complete missing arguments */
+ if (!p->is_vararg) {
+ func = restorestack(L, funcr);
+ base = func + 1;
+ }
+ else {
+ base = adjust_varargs(L, p, n);
+ func = restorestack(L, funcr); /* previous call can change stack */
+ }
+ ci = next_ci(L); /* now 'enter' new function */
+ ci->nresults = nresults;
+ ci->func = func;
+ ci->u.l.base = base;
+ ci->top = base + p->maxstacksize;
+ lua_assert(ci->top <= L->stack_last);
+ ci->u.l.savedpc = p->code; /* starting point */
+ ci->callstatus = CIST_LUA;
+ L->top = ci->top;
+ luaC_checkGC(L); /* stack grow uses memory */
+ if (L->hookmask & LUA_MASKCALL)
+ callhook(L, ci);
+ return 0;
+ }
+ default: { /* not a function */
+ func = tryfuncTM(L, func); /* retry with 'function' tag method */
+ return luaD_precall(L, func, nresults); /* now it must be a function */
+ }
+ }
+}
+
+
+int luaD_poscall (lua_State *L, StkId firstResult) {
+ StkId res;
+ int wanted, i;
+ CallInfo *ci = L->ci;
+ if (L->hookmask & (LUA_MASKRET | LUA_MASKLINE)) {
+ if (L->hookmask & LUA_MASKRET) {
+ ptrdiff_t fr = savestack(L, firstResult); /* hook may change stack */
+ luaD_hook(L, LUA_HOOKRET, -1);
+ firstResult = restorestack(L, fr);
+ }
+ L->oldpc = ci->previous->u.l.savedpc; /* 'oldpc' for caller function */
+ }
+ res = ci->func; /* res == final position of 1st result */
+ wanted = ci->nresults;
+ L->ci = ci = ci->previous; /* back to caller */
+ /* move results to correct place */
+ for (i = wanted; i != 0 && firstResult < L->top; i--)
+ setobjs2s(L, res++, firstResult++);
+ while (i-- > 0)
+ setnilvalue(res++);
+ L->top = res;
+ return (wanted - LUA_MULTRET); /* 0 iff wanted == LUA_MULTRET */
+}
+
+
+/*
+** Call a function (C or Lua). The function to be called is at *func.
+** The arguments are on the stack, right after the function.
+** When it returns, all the results are on the stack, starting at the original
+** function position.
+*/
+void luaD_call (lua_State *L, StkId func, int nResults, int allowyield) {
+ if (++L->nCcalls >= LUAI_MAXCCALLS) {
+ if (L->nCcalls == LUAI_MAXCCALLS)
+ luaG_runerror(L, "C stack overflow");
+ else if (L->nCcalls >= (LUAI_MAXCCALLS + (LUAI_MAXCCALLS>>3)))
+      luaD_throw(L, LUA_ERRERR);  /* error while handling stack error */
+ }
+ if (!allowyield) L->nny++;
+ if (!luaD_precall(L, func, nResults)) /* is a Lua function? */
+ luaV_execute(L); /* call it */
+ if (!allowyield) L->nny--;
+ L->nCcalls--;
+}
+
+
+static void finishCcall (lua_State *L) {
+ CallInfo *ci = L->ci;
+ int n;
+ lua_assert(ci->u.c.k != NULL); /* must have a continuation */
+ lua_assert(L->nny == 0);
+ if (ci->callstatus & CIST_YPCALL) { /* was inside a pcall? */
+ ci->callstatus &= ~CIST_YPCALL; /* finish 'lua_pcall' */
+ L->errfunc = ci->u.c.old_errfunc;
+ }
+ /* finish 'lua_callk'/'lua_pcall' */
+ adjustresults(L, ci->nresults);
+ /* call continuation function */
+ if (!(ci->callstatus & CIST_STAT)) /* no call status? */
+ ci->u.c.status = LUA_YIELD; /* 'default' status */
+ lua_assert(ci->u.c.status != LUA_OK);
+ ci->callstatus = (ci->callstatus & ~(CIST_YPCALL | CIST_STAT)) | CIST_YIELDED;
+ lua_unlock(L);
+ n = (*ci->u.c.k)(L);
+ lua_lock(L);
+ api_checknelems(L, n);
+ /* finish 'luaD_precall' */
+ luaD_poscall(L, L->top - n);
+}
+
+
+static void unroll (lua_State *L, void *ud) {
+ UNUSED(ud);
+ for (;;) {
+ if (L->ci == &L->base_ci) /* stack is empty? */
+ return; /* coroutine finished normally */
+ if (!isLua(L->ci)) /* C function? */
+ finishCcall(L);
+ else { /* Lua function */
+ luaV_finishOp(L); /* finish interrupted instruction */
+ luaV_execute(L); /* execute down to higher C 'boundary' */
+ }
+ }
+}
+
+
+/*
+** check whether thread has a suspended protected call
+*/
+static CallInfo *findpcall (lua_State *L) {
+ CallInfo *ci;
+ for (ci = L->ci; ci != NULL; ci = ci->previous) { /* search for a pcall */
+ if (ci->callstatus & CIST_YPCALL)
+ return ci;
+ }
+ return NULL; /* no pending pcall */
+}
+
+
+static int recover (lua_State *L, int status) {
+ StkId oldtop;
+ CallInfo *ci = findpcall(L);
+ if (ci == NULL) return 0; /* no recovery point */
+ /* "finish" luaD_pcall */
+ oldtop = restorestack(L, ci->extra);
+ luaF_close(L, oldtop);
+ seterrorobj(L, status, oldtop);
+ L->ci = ci;
+ L->allowhook = ci->u.c.old_allowhook;
+ L->nny = 0; /* should be zero to be yieldable */
+ luaD_shrinkstack(L);
+ L->errfunc = ci->u.c.old_errfunc;
+ ci->callstatus |= CIST_STAT; /* call has error status */
+ ci->u.c.status = status; /* (here it is) */
+ return 1; /* continue running the coroutine */
+}
+
+
+/*
+** signal an error in the call to 'resume', not in the execution of the
+** coroutine itself. (Such errors should not be handled by any coroutine
+** error handler and should not kill the coroutine.)
+*/
+static l_noret resume_error (lua_State *L, const char *msg, StkId firstArg) {
+ L->top = firstArg; /* remove args from the stack */
+ setsvalue2s(L, L->top, luaS_new(L, msg)); /* push error message */
+ api_incr_top(L);
+ luaD_throw(L, -1); /* jump back to 'lua_resume' */
+}
+
+
+/*
+** do the work for 'lua_resume' in protected mode
+*/
+static void resume_cb (lua_State *L, void *ud) {
+ int nCcalls = L->nCcalls;
+ StkId firstArg = cast(StkId, ud);
+ CallInfo *ci = L->ci;
+ if (nCcalls >= LUAI_MAXCCALLS)
+ resume_error(L, "C stack overflow", firstArg);
+ if (L->status == LUA_OK) { /* may be starting a coroutine */
+ if (ci != &L->base_ci) /* not in base level? */
+ resume_error(L, "cannot resume non-suspended coroutine", firstArg);
+ /* coroutine is in base level; start running it */
+ if (!luaD_precall(L, firstArg - 1, LUA_MULTRET)) /* Lua function? */
+ luaV_execute(L); /* call it */
+ }
+ else if (L->status != LUA_YIELD)
+ resume_error(L, "cannot resume dead coroutine", firstArg);
+ else { /* resuming from previous yield */
+ L->status = LUA_OK;
+ ci->func = restorestack(L, ci->extra);
+ if (isLua(ci)) /* yielded inside a hook? */
+ luaV_execute(L); /* just continue running Lua code */
+ else { /* 'common' yield */
+ if (ci->u.c.k != NULL) { /* does it have a continuation? */
+ int n;
+ ci->u.c.status = LUA_YIELD; /* 'default' status */
+ ci->callstatus |= CIST_YIELDED;
+ lua_unlock(L);
+ n = (*ci->u.c.k)(L); /* call continuation */
+ lua_lock(L);
+ api_checknelems(L, n);
+ firstArg = L->top - n; /* yield results come from continuation */
+ }
+ luaD_poscall(L, firstArg); /* finish 'luaD_precall' */
+ }
+ unroll(L, NULL);
+ }
+ lua_assert(nCcalls == L->nCcalls);
+}
+
+
+LUA_API int lua_resume (lua_State *L, lua_State *from, int nargs) {
+ int status;
+ int oldnny = L->nny; /* save 'nny' */
+ lua_lock(L);
+ luai_userstateresume(L, nargs);
+ L->nCcalls = (from) ? from->nCcalls + 1 : 1;
+ L->nny = 0; /* allow yields */
+ api_checknelems(L, (L->status == LUA_OK) ? nargs + 1 : nargs);
+ status = luaD_rawrunprotected(L, resume_cb, L->top - nargs);
+ if (status == -1) /* error calling 'lua_resume'? */
+ status = LUA_ERRRUN;
+ else { /* yield or regular error */
+ while (status != LUA_OK && status != LUA_YIELD) { /* error? */
+ if (recover(L, status)) /* recover point? */
+ status = luaD_rawrunprotected(L, unroll, NULL); /* run continuation */
+ else { /* unrecoverable error */
+ L->status = cast_byte(status); /* mark thread as `dead' */
+ seterrorobj(L, status, L->top);
+ L->ci->top = L->top;
+ break;
+ }
+ }
+ lua_assert(status == L->status);
+ }
+ L->nny = oldnny; /* restore 'nny' */
+ L->nCcalls--;
+ lua_assert(L->nCcalls == ((from) ? from->nCcalls : 0));
+ lua_unlock(L);
+ return status;
+}
+
+
+LUA_API int lua_yieldk (lua_State *L, int nresults, int ctx, lua_CFunction k) {
+ CallInfo *ci = L->ci;
+ luai_userstateyield(L, nresults);
+ lua_lock(L);
+ api_checknelems(L, nresults);
+ if (L->nny > 0) {
+ if (L != G(L)->mainthread)
+ luaG_runerror(L, "attempt to yield across a C-call boundary");
+ else
+ luaG_runerror(L, "attempt to yield from outside a coroutine");
+ }
+ L->status = LUA_YIELD;
+ ci->extra = savestack(L, ci->func); /* save current 'func' */
+ if (isLua(ci)) { /* inside a hook? */
+ api_check(L, k == NULL, "hooks cannot continue after yielding");
+ }
+ else {
+ if ((ci->u.c.k = k) != NULL) /* is there a continuation? */
+ ci->u.c.ctx = ctx; /* save context */
+ ci->func = L->top - nresults - 1; /* protect stack below results */
+ luaD_throw(L, LUA_YIELD);
+ }
+ lua_assert(ci->callstatus & CIST_HOOKED); /* must be inside a hook */
+ lua_unlock(L);
+ return 0; /* return to 'luaD_hook' */
+}
+
+
+int luaD_pcall (lua_State *L, Pfunc func, void *u,
+ ptrdiff_t old_top, ptrdiff_t ef) {
+ int status;
+ CallInfo *old_ci = L->ci;
+ lu_byte old_allowhooks = L->allowhook;
+ unsigned short old_nny = L->nny;
+ ptrdiff_t old_errfunc = L->errfunc;
+ L->errfunc = ef;
+ status = luaD_rawrunprotected(L, func, u);
+ if (status != LUA_OK) { /* an error occurred? */
+ StkId oldtop = restorestack(L, old_top);
+ luaF_close(L, oldtop); /* close possible pending closures */
+ seterrorobj(L, status, oldtop);
+ L->ci = old_ci;
+ L->allowhook = old_allowhooks;
+ L->nny = old_nny;
+ luaD_shrinkstack(L);
+ }
+ L->errfunc = old_errfunc;
+ return status;
+}
+
+
+
+/*
+** Execute a protected parser.
+*/
+struct SParser {  /* data for `f_parser' */
+ ZIO *z;
+ Mbuffer buff; /* dynamic structure used by the scanner */
+ Dyndata dyd; /* dynamic structures used by the parser */
+ const char *mode;
+ const char *name;
+};
+
+
+static void checkmode (lua_State *L, const char *mode, const char *x) {
+ if (mode && strchr(mode, x[0]) == NULL) {
+ luaO_pushfstring(L,
+ "attempt to load a %s chunk (mode is " LUA_QS ")", x, mode);
+ luaD_throw(L, LUA_ERRSYNTAX);
+ }
+}
+
+
+static void f_parser (lua_State *L, void *ud) {
+ int i;
+ Closure *cl;
+ struct SParser *p = cast(struct SParser *, ud);
+ int c = zgetc(p->z); /* read first character */
+ if (c == LUA_SIGNATURE[0]) {
+ checkmode(L, p->mode, "binary");
+ cl = luaU_undump(L, p->z, &p->buff, p->name);
+ }
+ else {
+ checkmode(L, p->mode, "text");
+ cl = luaY_parser(L, p->z, &p->buff, &p->dyd, p->name, c);
+ }
+ lua_assert(cl->l.nupvalues == cl->l.p->sizeupvalues);
+ for (i = 0; i < cl->l.nupvalues; i++) { /* initialize upvalues */
+ UpVal *up = luaF_newupval(L);
+ cl->l.upvals[i] = up;
+ luaC_objbarrier(L, cl, up);
+ }
+}
+
+
+int luaD_protectedparser (lua_State *L, ZIO *z, const char *name,
+ const char *mode) {
+ struct SParser p;
+ int status;
+ L->nny++; /* cannot yield during parsing */
+ p.z = z; p.name = name; p.mode = mode;
+ p.dyd.actvar.arr = NULL; p.dyd.actvar.size = 0;
+ p.dyd.gt.arr = NULL; p.dyd.gt.size = 0;
+ p.dyd.label.arr = NULL; p.dyd.label.size = 0;
+ luaZ_initbuffer(L, &p.buff);
+ status = luaD_pcall(L, f_parser, &p, savestack(L, L->top), L->errfunc);
+ luaZ_freebuffer(L, &p.buff);
+ luaM_freearray(L, p.dyd.actvar.arr, p.dyd.actvar.size);
+ luaM_freearray(L, p.dyd.gt.arr, p.dyd.gt.size);
+ luaM_freearray(L, p.dyd.label.arr, p.dyd.label.size);
+ L->nny--;
+ return status;
+}
+
+
diff --git a/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/lua/ldo.h b/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/lua/ldo.h
new file mode 100644
index 000000000000..d3d3082c9ba3
--- /dev/null
+++ b/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/lua/ldo.h
@@ -0,0 +1,46 @@
+/*
+** $Id: ldo.h,v 2.20.1.1 2013/04/12 18:48:47 roberto Exp $
+** Stack and Call structure of Lua
+** See Copyright Notice in lua.h
+*/
+
+#ifndef ldo_h
+#define ldo_h
+
+
+#include "lobject.h"
+#include "lstate.h"
+#include "lzio.h"
+
+
+#define luaD_checkstack(L,n) if (L->stack_last - L->top <= (n)) \
+ luaD_growstack(L, n); else condmovestack(L);
+
+
+#define incr_top(L) {L->top++; luaD_checkstack(L,0);}
+
+#define savestack(L,p) ((char *)(p) - (char *)L->stack)
+#define restorestack(L,n) ((TValue *)((char *)L->stack + (n)))
+
+
+/* type of protected functions, to be run by `runprotected' */
+typedef void (*Pfunc) (lua_State *L, void *ud);
+
+LUAI_FUNC int luaD_protectedparser (lua_State *L, ZIO *z, const char *name,
+ const char *mode);
+LUAI_FUNC void luaD_hook (lua_State *L, int event, int line);
+LUAI_FUNC int luaD_precall (lua_State *L, StkId func, int nresults);
+LUAI_FUNC void luaD_call (lua_State *L, StkId func, int nResults,
+ int allowyield);
+LUAI_FUNC int luaD_pcall (lua_State *L, Pfunc func, void *u,
+ ptrdiff_t oldtop, ptrdiff_t ef);
+LUAI_FUNC int luaD_poscall (lua_State *L, StkId firstResult);
+LUAI_FUNC void luaD_reallocstack (lua_State *L, int newsize);
+LUAI_FUNC void luaD_growstack (lua_State *L, int n);
+LUAI_FUNC void luaD_shrinkstack (lua_State *L);
+
+LUAI_FUNC l_noret luaD_throw (lua_State *L, int errcode);
+LUAI_FUNC int luaD_rawrunprotected (lua_State *L, Pfunc f, void *ud);
+
+#endif
+
diff --git a/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/lua/ldump.c b/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/lua/ldump.c
new file mode 100644
index 000000000000..64e564933268
--- /dev/null
+++ b/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/lua/ldump.c
@@ -0,0 +1,173 @@
+/*
+** $Id: ldump.c,v 2.17.1.1 2013/04/12 18:48:47 roberto Exp $
+** save precompiled Lua chunks
+** See Copyright Notice in lua.h
+*/
+
+#include <sys/zfs_context.h>
+
+#define ldump_c
+#define LUA_CORE
+
+#include "lua.h"
+
+#include "lobject.h"
+#include "lstate.h"
+#include "lundump.h"
+
+typedef struct {
+ lua_State* L;
+ lua_Writer writer;
+ void* data;
+ int strip;
+ int status;
+} DumpState;
+
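+/*
+** A precompiled chunk is written as a global header (DumpHeader) followed
+** by the top-level function (DumpFunction); each function serializes its
+** defining line range, parameter/vararg/stack metadata, bytecode,
+** constants (with nested prototypes), upvalue descriptors and, unless
+** 'strip' is set, its debug information.
+*/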
+#define DumpMem(b,n,size,D) DumpBlock(b,(n)*(size),D)
+#define DumpVar(x,D) DumpMem(&x,1,sizeof(x),D)
+
+static void DumpBlock(const void* b, size_t size, DumpState* D)
+{
+ if (D->status==0)
+ {
+ lua_unlock(D->L);
+ D->status=(*D->writer)(D->L,b,size,D->data);
+ lua_lock(D->L);
+ }
+}
+
+static void DumpChar(int y, DumpState* D)
+{
+ char x=(char)y;
+ DumpVar(x,D);
+}
+
+static void DumpInt(int x, DumpState* D)
+{
+ DumpVar(x,D);
+}
+
+static void DumpNumber(lua_Number x, DumpState* D)
+{
+ DumpVar(x,D);
+}
+
+static void DumpVector(const void* b, int n, size_t size, DumpState* D)
+{
+ DumpInt(n,D);
+ DumpMem(b,n,size,D);
+}
+
+static void DumpString(const TString* s, DumpState* D)
+{
+ if (s==NULL)
+ {
+ size_t size=0;
+ DumpVar(size,D);
+ }
+ else
+ {
+ size_t size=s->tsv.len+1; /* include trailing '\0' */
+ DumpVar(size,D);
+ DumpBlock(getstr(s),size*sizeof(char),D);
+ }
+}
+
+#define DumpCode(f,D) DumpVector(f->code,f->sizecode,sizeof(Instruction),D)
+
+static void DumpFunction(const Proto* f, DumpState* D);
+
+static void DumpConstants(const Proto* f, DumpState* D)
+{
+ int i,n=f->sizek;
+ DumpInt(n,D);
+ for (i=0; i<n; i++)
+ {
+ const TValue* o=&f->k[i];
+ DumpChar(ttypenv(o),D);
+ switch (ttypenv(o))
+ {
+ case LUA_TNIL:
+ break;
+ case LUA_TBOOLEAN:
+ DumpChar(bvalue(o),D);
+ break;
+ case LUA_TNUMBER:
+ DumpNumber(nvalue(o),D);
+ break;
+ case LUA_TSTRING:
+ DumpString(rawtsvalue(o),D);
+ break;
+ default: lua_assert(0);
+ }
+ }
+ n=f->sizep;
+ DumpInt(n,D);
+ for (i=0; i<n; i++) DumpFunction(f->p[i],D);
+}
+
+static void DumpUpvalues(const Proto* f, DumpState* D)
+{
+ int i,n=f->sizeupvalues;
+ DumpInt(n,D);
+ for (i=0; i<n; i++)
+ {
+ DumpChar(f->upvalues[i].instack,D);
+ DumpChar(f->upvalues[i].idx,D);
+ }
+}
+
+static void DumpDebug(const Proto* f, DumpState* D)
+{
+ int i,n;
+ DumpString((D->strip) ? NULL : f->source,D);
+ n= (D->strip) ? 0 : f->sizelineinfo;
+ DumpVector(f->lineinfo,n,sizeof(int),D);
+ n= (D->strip) ? 0 : f->sizelocvars;
+ DumpInt(n,D);
+ for (i=0; i<n; i++)
+ {
+ DumpString(f->locvars[i].varname,D);
+ DumpInt(f->locvars[i].startpc,D);
+ DumpInt(f->locvars[i].endpc,D);
+ }
+ n= (D->strip) ? 0 : f->sizeupvalues;
+ DumpInt(n,D);
+ for (i=0; i<n; i++) DumpString(f->upvalues[i].name,D);
+}
+
+static void DumpFunction(const Proto* f, DumpState* D)
+{
+ DumpInt(f->linedefined,D);
+ DumpInt(f->lastlinedefined,D);
+ DumpChar(f->numparams,D);
+ DumpChar(f->is_vararg,D);
+ DumpChar(f->maxstacksize,D);
+ DumpCode(f,D);
+ DumpConstants(f,D);
+ DumpUpvalues(f,D);
+ DumpDebug(f,D);
+}
+
+static void DumpHeader(DumpState* D)
+{
+ lu_byte h[LUAC_HEADERSIZE];
+ luaU_header(h);
+ DumpBlock(h,LUAC_HEADERSIZE,D);
+}
+
+/*
+** dump Lua function as precompiled chunk
+*/
+int luaU_dump (lua_State* L, const Proto* f, lua_Writer w, void* data, int strip)
+{
+ DumpState D;
+ D.L=L;
+ D.writer=w;
+ D.data=data;
+ D.strip=strip;
+ D.status=0;
+ DumpHeader(&D);
+ DumpFunction(f,&D);
+ return D.status;
+}
diff --git a/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/lua/lfunc.c b/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/lua/lfunc.c
new file mode 100644
index 000000000000..684e44709a8f
--- /dev/null
+++ b/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/lua/lfunc.c
@@ -0,0 +1,161 @@
+/*
+** $Id: lfunc.c,v 2.30.1.1 2013/04/12 18:48:47 roberto Exp $
+** Auxiliary functions to manipulate prototypes and closures
+** See Copyright Notice in lua.h
+*/
+
+
+#include <sys/zfs_context.h>
+
+#define lfunc_c
+#define LUA_CORE
+
+#include "lua.h"
+
+#include "lfunc.h"
+#include "lgc.h"
+#include "lmem.h"
+#include "lobject.h"
+#include "lstate.h"
+
+
+
+Closure *luaF_newCclosure (lua_State *L, int n) {
+ Closure *c = &luaC_newobj(L, LUA_TCCL, sizeCclosure(n), NULL, 0)->cl;
+ c->c.nupvalues = cast_byte(n);
+ return c;
+}
+
+
+Closure *luaF_newLclosure (lua_State *L, int n) {
+ Closure *c = &luaC_newobj(L, LUA_TLCL, sizeLclosure(n), NULL, 0)->cl;
+ c->l.p = NULL;
+ c->l.nupvalues = cast_byte(n);
+ while (n--) c->l.upvals[n] = NULL;
+ return c;
+}
+
+
+UpVal *luaF_newupval (lua_State *L) {
+ UpVal *uv = &luaC_newobj(L, LUA_TUPVAL, sizeof(UpVal), NULL, 0)->uv;
+ uv->v = &uv->u.value;
+ setnilvalue(uv->v);
+ return uv;
+}
+
+
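+/*
+** Find (or create) the upvalue for stack slot 'level'. The thread's open
+** upvalues are kept sorted by stack address, highest first, so the walk
+** can stop early; a dead-but-matching upvalue is resurrected, and a new
+** one is linked both into the thread's open list and into the global
+** 'uvhead' list.
+*/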
+UpVal *luaF_findupval (lua_State *L, StkId level) {
+ global_State *g = G(L);
+ GCObject **pp = &L->openupval;
+ UpVal *p;
+ UpVal *uv;
+ while (*pp != NULL && (p = gco2uv(*pp))->v >= level) {
+ GCObject *o = obj2gco(p);
+ lua_assert(p->v != &p->u.value);
+ lua_assert(!isold(o) || isold(obj2gco(L)));
+ if (p->v == level) { /* found a corresponding upvalue? */
+ if (isdead(g, o)) /* is it dead? */
+ changewhite(o); /* resurrect it */
+ return p;
+ }
+ pp = &p->next;
+ }
+ /* not found: create a new one */
+ uv = &luaC_newobj(L, LUA_TUPVAL, sizeof(UpVal), pp, 0)->uv;
+ uv->v = level; /* current value lives in the stack */
+ uv->u.l.prev = &g->uvhead; /* double link it in `uvhead' list */
+ uv->u.l.next = g->uvhead.u.l.next;
+ uv->u.l.next->u.l.prev = uv;
+ g->uvhead.u.l.next = uv;
+ lua_assert(uv->u.l.next->u.l.prev == uv && uv->u.l.prev->u.l.next == uv);
+ return uv;
+}
+
+
+static void unlinkupval (UpVal *uv) {
+ lua_assert(uv->u.l.next->u.l.prev == uv && uv->u.l.prev->u.l.next == uv);
+ uv->u.l.next->u.l.prev = uv->u.l.prev; /* remove from `uvhead' list */
+ uv->u.l.prev->u.l.next = uv->u.l.next;
+}
+
+
+void luaF_freeupval (lua_State *L, UpVal *uv) {
+ if (uv->v != &uv->u.value) /* is it open? */
+ unlinkupval(uv); /* remove from open list */
+ luaM_free(L, uv); /* free upvalue */
+}
+
+
+void luaF_close (lua_State *L, StkId level) {
+ UpVal *uv;
+ global_State *g = G(L);
+ while (L->openupval != NULL && (uv = gco2uv(L->openupval))->v >= level) {
+ GCObject *o = obj2gco(uv);
+ lua_assert(!isblack(o) && uv->v != &uv->u.value);
+ L->openupval = uv->next; /* remove from `open' list */
+ if (isdead(g, o))
+ luaF_freeupval(L, uv); /* free upvalue */
+ else {
+ unlinkupval(uv); /* remove upvalue from 'uvhead' list */
+ setobj(L, &uv->u.value, uv->v); /* move value to upvalue slot */
+ uv->v = &uv->u.value; /* now current value lives here */
+ gch(o)->next = g->allgc; /* link upvalue into 'allgc' list */
+ g->allgc = o;
+ luaC_checkupvalcolor(g, uv);
+ }
+ }
+}
+
+
+Proto *luaF_newproto (lua_State *L) {
+ Proto *f = &luaC_newobj(L, LUA_TPROTO, sizeof(Proto), NULL, 0)->p;
+ f->k = NULL;
+ f->sizek = 0;
+ f->p = NULL;
+ f->sizep = 0;
+ f->code = NULL;
+ f->cache = NULL;
+ f->sizecode = 0;
+ f->lineinfo = NULL;
+ f->sizelineinfo = 0;
+ f->upvalues = NULL;
+ f->sizeupvalues = 0;
+ f->numparams = 0;
+ f->is_vararg = 0;
+ f->maxstacksize = 0;
+ f->locvars = NULL;
+ f->sizelocvars = 0;
+ f->linedefined = 0;
+ f->lastlinedefined = 0;
+ f->source = NULL;
+ return f;
+}
+
+
+void luaF_freeproto (lua_State *L, Proto *f) {
+ luaM_freearray(L, f->code, f->sizecode);
+ luaM_freearray(L, f->p, f->sizep);
+ luaM_freearray(L, f->k, f->sizek);
+ luaM_freearray(L, f->lineinfo, f->sizelineinfo);
+ luaM_freearray(L, f->locvars, f->sizelocvars);
+ luaM_freearray(L, f->upvalues, f->sizeupvalues);
+ luaM_free(L, f);
+}
+
+
+/*
+** Look for the n-th local variable active at instruction `pc' in function `func'.
+** Returns NULL if not found.
+*/
+const char *luaF_getlocalname (const Proto *f, int local_number, int pc) {
+ int i;
+ for (i = 0; i<f->sizelocvars && f->locvars[i].startpc <= pc; i++) {
+ if (pc < f->locvars[i].endpc) { /* is variable active? */
+ local_number--;
+ if (local_number == 0)
+ return getstr(f->locvars[i].varname);
+ }
+ }
+ return NULL; /* not found */
+}
+
diff --git a/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/lua/lfunc.h b/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/lua/lfunc.h
new file mode 100644
index 000000000000..ca0d3a3e0b03
--- /dev/null
+++ b/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/lua/lfunc.h
@@ -0,0 +1,33 @@
+/*
+** $Id: lfunc.h,v 2.8.1.1 2013/04/12 18:48:47 roberto Exp $
+** Auxiliary functions to manipulate prototypes and closures
+** See Copyright Notice in lua.h
+*/
+
+#ifndef lfunc_h
+#define lfunc_h
+
+
+#include "lobject.h"
+
+
+#define sizeCclosure(n) (cast(int, sizeof(CClosure)) + \
+ cast(int, sizeof(TValue)*((n)-1)))
+
+#define sizeLclosure(n) (cast(int, sizeof(LClosure)) + \
+ cast(int, sizeof(TValue *)*((n)-1)))
+
+
+LUAI_FUNC Proto *luaF_newproto (lua_State *L);
+LUAI_FUNC Closure *luaF_newCclosure (lua_State *L, int nelems);
+LUAI_FUNC Closure *luaF_newLclosure (lua_State *L, int nelems);
+LUAI_FUNC UpVal *luaF_newupval (lua_State *L);
+LUAI_FUNC UpVal *luaF_findupval (lua_State *L, StkId level);
+LUAI_FUNC void luaF_close (lua_State *L, StkId level);
+LUAI_FUNC void luaF_freeproto (lua_State *L, Proto *f);
+LUAI_FUNC void luaF_freeupval (lua_State *L, UpVal *uv);
+LUAI_FUNC const char *luaF_getlocalname (const Proto *func, int local_number,
+ int pc);
+
+
+#endif
diff --git a/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/lua/lgc.c b/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/lua/lgc.c
new file mode 100644
index 000000000000..4a7d25af2083
--- /dev/null
+++ b/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/lua/lgc.c
@@ -0,0 +1,1220 @@
+/*
+** $Id: lgc.c,v 2.140.1.3 2014/09/01 16:55:08 roberto Exp $
+** Garbage Collector
+** See Copyright Notice in lua.h
+*/
+
+#include <sys/zfs_context.h>
+
+#define lgc_c
+#define LUA_CORE
+
+#include "lua.h"
+
+#include "ldebug.h"
+#include "ldo.h"
+#include "lfunc.h"
+#include "lgc.h"
+#include "lmem.h"
+#include "lobject.h"
+#include "lstate.h"
+#include "lstring.h"
+#include "ltable.h"
+#include "ltm.h"
+
+
+
+/*
+** cost of sweeping one element (the size of a small object divided
+** by some adjustment for the sweep speed)
+*/
+#define GCSWEEPCOST ((sizeof(TString) + 4) / 4)
+
+/* maximum number of elements to sweep in each single step */
+#define GCSWEEPMAX (cast_int((GCSTEPSIZE / GCSWEEPCOST) / 4))
+
+/* maximum number of finalizers to call in each GC step */
+#define GCFINALIZENUM 4
+
+
+/*
+** macro to adjust 'stepmul': 'stepmul' is actually used like
+** 'stepmul / STEPMULADJ' (value chosen by tests)
+*/
+#define STEPMULADJ 200
+
+
+/*
+** macro to adjust 'pause': 'pause' is actually used like
+** 'pause / PAUSEADJ' (value chosen by tests)
+*/
+#define PAUSEADJ 100
+
+
+/*
+** 'makewhite' erases all color bits plus the old bit and then
+** sets only the current white bit
+*/
+#define maskcolors (~(bit2mask(BLACKBIT, OLDBIT) | WHITEBITS))
+#define makewhite(g,x) \
+ (gch(x)->marked = cast_byte((gch(x)->marked & maskcolors) | luaC_white(g)))
+
+#define white2gray(x) resetbits(gch(x)->marked, WHITEBITS)
+#define black2gray(x) resetbit(gch(x)->marked, BLACKBIT)
+
+
+#define isfinalized(x) testbit(gch(x)->marked, FINALIZEDBIT)
+
+#define checkdeadkey(n) lua_assert(!ttisdeadkey(gkey(n)) || ttisnil(gval(n)))
+
+
+#define checkconsistency(obj) \
+ lua_longassert(!iscollectable(obj) || righttt(obj))
+
+
+#define markvalue(g,o) { checkconsistency(o); \
+ if (valiswhite(o)) reallymarkobject(g,gcvalue(o)); }
+
+#define markobject(g,t) { if ((t) && iswhite(obj2gco(t))) \
+ reallymarkobject(g, obj2gco(t)); }
+
+static void reallymarkobject (global_State *g, GCObject *o);
+
+
+/*
+** {======================================================
+** Generic functions
+** =======================================================
+*/
+
+
+/*
+** one after last element in a hash array
+*/
+#define gnodelast(h) gnode(h, cast(size_t, sizenode(h)))
+
+
+/*
+** link table 'h' into list pointed by 'p'
+*/
+#define linktable(h,p) ((h)->gclist = *(p), *(p) = obj2gco(h))
+
+
+/*
+** if key is not marked, mark its entry as dead (therefore removing it
+** from the table)
+*/
+static void removeentry (Node *n) {
+ lua_assert(ttisnil(gval(n)));
+ if (valiswhite(gkey(n)))
+ setdeadvalue(gkey(n)); /* unused and unmarked key; remove it */
+}
+
+
+/*
+** tells whether a key or value can be cleared from a weak
+** table. Non-collectable objects are never removed from weak
+** tables. Strings behave as `values', so they are never removed either.
+** For other objects: if really collected, they cannot be kept; for
+** objects being finalized, keep them in keys, but not in values
+*/
+static int iscleared (global_State *g, const TValue *o) {
+ if (!iscollectable(o)) return 0;
+ else if (ttisstring(o)) {
+ markobject(g, rawtsvalue(o)); /* strings are `values', so are never weak */
+ return 0;
+ }
+ else return iswhite(gcvalue(o));
+}
+
+
+/*
+** barrier that moves the collector forward, that is, marks the white
+** object pointed to by a black object.
+*/
+void luaC_barrier_ (lua_State *L, GCObject *o, GCObject *v) {
+ global_State *g = G(L);
+ lua_assert(isblack(o) && iswhite(v) && !isdead(g, v) && !isdead(g, o));
+ lua_assert(g->gcstate != GCSpause);
+ lua_assert(gch(o)->tt != LUA_TTABLE);
+ if (keepinvariantout(g)) /* must keep invariant? */
+ reallymarkobject(g, v); /* restore invariant */
+ else { /* sweep phase */
+ lua_assert(issweepphase(g));
+ makewhite(g, o); /* mark main obj. as white to avoid other barriers */
+ }
+}
+
+
+/*
+** barrier that moves the collector backward, that is, marks the black object
+** pointing to a white object as gray again. (Current implementation
+** only works for tables; access to 'gclist' is not uniform across
+** different types.)
+*/
+void luaC_barrierback_ (lua_State *L, GCObject *o) {
+ global_State *g = G(L);
+ lua_assert(isblack(o) && !isdead(g, o) && gch(o)->tt == LUA_TTABLE);
+ black2gray(o); /* make object gray (again) */
+ gco2t(o)->gclist = g->grayagain;
+ g->grayagain = o;
+}
+
+
+/*
+** barrier for prototypes. When creating the first closure (cache is
+** NULL), use a forward barrier; this may be the only closure of the
+** prototype (if it is a "regular" function, with a single instance)
+** and the prototype may be big, so it is better to avoid traversing
+** it again. Otherwise, use a backward barrier, to avoid marking all
+** possible instances.
+*/
+LUAI_FUNC void luaC_barrierproto_ (lua_State *L, Proto *p, Closure *c) {
+ global_State *g = G(L);
+ lua_assert(isblack(obj2gco(p)));
+ if (p->cache == NULL) { /* first time? */
+ luaC_objbarrier(L, p, c);
+ }
+ else { /* use a backward barrier */
+ black2gray(obj2gco(p)); /* make prototype gray (again) */
+ p->gclist = g->grayagain;
+ g->grayagain = obj2gco(p);
+ }
+}
+
+
+/*
+** check color (and invariants) for an upvalue that was closed,
+** i.e., moved into the 'allgc' list
+*/
+void luaC_checkupvalcolor (global_State *g, UpVal *uv) {
+ GCObject *o = obj2gco(uv);
+ lua_assert(!isblack(o)); /* open upvalues are never black */
+ if (isgray(o)) {
+ if (keepinvariant(g)) {
+ resetoldbit(o); /* see MOVE OLD rule */
+ gray2black(o); /* it is being visited now */
+ markvalue(g, uv->v);
+ }
+ else {
+ lua_assert(issweepphase(g));
+ makewhite(g, o);
+ }
+ }
+}
+
+
+/*
+** create a new collectable object (with given type and size) and link
+** it to '*list'. 'offset' tells how many bytes to allocate before the
+** object itself (used only by states).
+*/
+GCObject *luaC_newobj (lua_State *L, int tt, size_t sz, GCObject **list,
+ int offset) {
+ global_State *g = G(L);
+ char *raw = cast(char *, luaM_newobject(L, novariant(tt), sz));
+ GCObject *o = obj2gco(raw + offset);
+ if (list == NULL)
+ list = &g->allgc; /* standard list for collectable objects */
+ gch(o)->marked = luaC_white(g);
+ gch(o)->tt = tt;
+ gch(o)->next = *list;
+ *list = o;
+ return o;
+}
+
+/* }====================================================== */
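luaC_newobj links every new collectable object at the head of an intrusive singly linked list ('allgc' by default) and stamps it with the current white. A reduced stand-alone model of that discipline; MockGC is an invented stand-in for GCObject/GCheader:

    #include <stdio.h>
    #include <stdlib.h>

    typedef struct MockGC {        /* stand-in for GCObject/GCheader */
      struct MockGC *next;
      int tt;                      /* type tag */
      unsigned char marked;        /* color bits */
    } MockGC;

    /* link a freshly allocated object at the head of '*list' */
    static MockGC *newobj (int tt, unsigned char white, MockGC **list) {
      MockGC *o = malloc(sizeof(MockGC));
      o->marked = white;           /* new objects start white */
      o->tt = tt;
      o->next = *list;             /* prepend: O(1), like luaC_newobj */
      *list = o;
      return o;
    }

    int main (void) {
      MockGC *allgc = NULL;
      newobj(1, 0x1, &allgc);
      newobj(2, 0x1, &allgc);
      for (MockGC *o = allgc; o; o = o->next)
        printf("object tt=%d marked=%u\n", o->tt, o->marked);
      while (allgc) { MockGC *n = allgc->next; free(allgc); allgc = n; }
      return 0;
    }
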
+
+
+
+/*
+** {======================================================
+** Mark functions
+** =======================================================
+*/
+
+
+/*
+** mark an object. Userdata, strings, and closed upvalues are visited
+** and turned black here. Other objects are marked gray and added
+** to the appropriate list to be visited (and turned black) later. (Open
+** upvalues are already linked in 'headuv' list.)
+*/
+static void reallymarkobject (global_State *g, GCObject *o) {
+ lu_mem size;
+ white2gray(o);
+ switch (gch(o)->tt) {
+ case LUA_TSHRSTR:
+ case LUA_TLNGSTR: {
+ size = sizestring(gco2ts(o));
+ break; /* nothing else to mark; make it black */
+ }
+ case LUA_TUSERDATA: {
+ Table *mt = gco2u(o)->metatable;
+ markobject(g, mt);
+ markobject(g, gco2u(o)->env);
+ size = sizeudata(gco2u(o));
+ break;
+ }
+ case LUA_TUPVAL: {
+ UpVal *uv = gco2uv(o);
+ markvalue(g, uv->v);
+ if (uv->v != &uv->u.value) /* open? */
+ return; /* open upvalues remain gray */
+ size = sizeof(UpVal);
+ break;
+ }
+ case LUA_TLCL: {
+ gco2lcl(o)->gclist = g->gray;
+ g->gray = o;
+ return;
+ }
+ case LUA_TCCL: {
+ gco2ccl(o)->gclist = g->gray;
+ g->gray = o;
+ return;
+ }
+ case LUA_TTABLE: {
+ linktable(gco2t(o), &g->gray);
+ return;
+ }
+ case LUA_TTHREAD: {
+ gco2th(o)->gclist = g->gray;
+ g->gray = o;
+ return;
+ }
+ case LUA_TPROTO: {
+ gco2p(o)->gclist = g->gray;
+ g->gray = o;
+ return;
+ }
+ default: lua_assert(0); return;
+ }
+ gray2black(o);
+ g->GCmemtrav += size;
+}
+
+
+/*
+** mark metamethods for basic types
+*/
+static void markmt (global_State *g) {
+ int i;
+ for (i=0; i < LUA_NUMTAGS; i++)
+ markobject(g, g->mt[i]);
+}
+
+
+/*
+** mark all objects in the list of objects being finalized
+*/
+static void markbeingfnz (global_State *g) {
+ GCObject *o;
+ for (o = g->tobefnz; o != NULL; o = gch(o)->next) {
+ makewhite(g, o);
+ reallymarkobject(g, o);
+ }
+}
+
+
+/*
+** mark all values stored in marked open upvalues. (See comment in
+** 'lstate.h'.)
+*/
+static void remarkupvals (global_State *g) {
+ UpVal *uv;
+ for (uv = g->uvhead.u.l.next; uv != &g->uvhead; uv = uv->u.l.next) {
+ if (isgray(obj2gco(uv)))
+ markvalue(g, uv->v);
+ }
+}
+
+
+/*
+** mark root set and reset all gray lists, to start a new
+** incremental (or full) collection
+*/
+static void restartcollection (global_State *g) {
+ g->gray = g->grayagain = NULL;
+ g->weak = g->allweak = g->ephemeron = NULL;
+ markobject(g, g->mainthread);
+ markvalue(g, &g->l_registry);
+ markmt(g);
+ markbeingfnz(g); /* mark any finalizing object left from previous cycle */
+}
+
+/* }====================================================== */
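Taken together, the functions above implement a tri-color worklist: reallymarkobject turns white objects gray and queues them, and the propagate step (next section) blackens one gray object at a time while graying its children. A toy version over a three-object chain plus one unreachable object (all types and data invented):

    #include <stdio.h>

    enum { WHITE, GRAY, BLACK };

    typedef struct Obj {
      int color;
      struct Obj *child;          /* single outgoing reference */
      struct Obj *gclist;         /* next object in the gray list */
    } Obj;

    static Obj *gray;             /* the gray worklist */

    static void markobject (Obj *o) {
      if (o && o->color == WHITE) {    /* like reallymarkobject */
        o->color = GRAY;
        o->gclist = gray;
        gray = o;
      }
    }

    static void propagateall (void) {
      while (gray) {                   /* like propagatemark in a loop */
        Obj *o = gray;
        gray = o->gclist;
        o->color = BLACK;              /* object visited... */
        markobject(o->child);          /* ...and its references grayed */
      }
    }

    int main (void) {
      Obj c = {WHITE, 0, 0}, b = {WHITE, &c, 0}, a = {WHITE, &b, 0};
      Obj dead = {WHITE, 0, 0};        /* unreachable from the root */
      markobject(&a);                  /* mark root set */
      propagateall();
      printf("a=%d b=%d c=%d dead=%d\n", a.color, b.color, c.color, dead.color);
      return 0;                        /* prints a=2 b=2 c=2 dead=0 */
    }
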
+
+
+/*
+** {======================================================
+** Traverse functions
+** =======================================================
+*/
+
+static void traverseweakvalue (global_State *g, Table *h) {
+ Node *n, *limit = gnodelast(h);
+ /* if there is array part, assume it may have white values (do not
+ traverse it just to check) */
+ int hasclears = (h->sizearray > 0);
+ for (n = gnode(h, 0); n < limit; n++) {
+ checkdeadkey(n);
+ if (ttisnil(gval(n))) /* entry is empty? */
+ removeentry(n); /* remove it */
+ else {
+ lua_assert(!ttisnil(gkey(n)));
+ markvalue(g, gkey(n)); /* mark key */
+ if (!hasclears && iscleared(g, gval(n))) /* is there a white value? */
+ hasclears = 1; /* table will have to be cleared */
+ }
+ }
+ if (hasclears)
+ linktable(h, &g->weak); /* has to be cleared later */
+ else /* no white values */
+ linktable(h, &g->grayagain); /* no need to clean */
+}
+
+
+static int traverseephemeron (global_State *g, Table *h) {
+ int marked = 0; /* true if an object is marked in this traversal */
+ int hasclears = 0; /* true if table has white keys */
+ int prop = 0; /* true if table has entry "white-key -> white-value" */
+ Node *n, *limit = gnodelast(h);
+ int i;
+ /* traverse array part (numeric keys are 'strong') */
+ for (i = 0; i < h->sizearray; i++) {
+ if (valiswhite(&h->array[i])) {
+ marked = 1;
+ reallymarkobject(g, gcvalue(&h->array[i]));
+ }
+ }
+ /* traverse hash part */
+ for (n = gnode(h, 0); n < limit; n++) {
+ checkdeadkey(n);
+ if (ttisnil(gval(n))) /* entry is empty? */
+ removeentry(n); /* remove it */
+ else if (iscleared(g, gkey(n))) { /* key is not marked (yet)? */
+ hasclears = 1; /* table must be cleared */
+ if (valiswhite(gval(n))) /* value not marked yet? */
+ prop = 1; /* must propagate again */
+ }
+ else if (valiswhite(gval(n))) { /* value not marked yet? */
+ marked = 1;
+ reallymarkobject(g, gcvalue(gval(n))); /* mark it now */
+ }
+ }
+ if (g->gcstate != GCSatomic || prop)
+ linktable(h, &g->ephemeron); /* have to propagate again */
+ else if (hasclears) /* does table have white keys? */
+ linktable(h, &g->allweak); /* may have to clean white keys */
+ else /* no white keys */
+ linktable(h, &g->grayagain); /* no need to clean */
+ return marked;
+}
+
+
+static void traversestrongtable (global_State *g, Table *h) {
+ Node *n, *limit = gnodelast(h);
+ int i;
+ for (i = 0; i < h->sizearray; i++) /* traverse array part */
+ markvalue(g, &h->array[i]);
+ for (n = gnode(h, 0); n < limit; n++) { /* traverse hash part */
+ checkdeadkey(n);
+ if (ttisnil(gval(n))) /* entry is empty? */
+ removeentry(n); /* remove it */
+ else {
+ lua_assert(!ttisnil(gkey(n)));
+ markvalue(g, gkey(n)); /* mark key */
+ markvalue(g, gval(n)); /* mark value */
+ }
+ }
+}
+
+
+static lu_mem traversetable (global_State *g, Table *h) {
+ const char *weakkey, *weakvalue;
+ const TValue *mode = gfasttm(g, h->metatable, TM_MODE);
+ markobject(g, h->metatable);
+ if (mode && ttisstring(mode) && /* is there a weak mode? */
+ ((weakkey = strchr(svalue(mode), 'k')),
+ (weakvalue = strchr(svalue(mode), 'v')),
+ (weakkey || weakvalue))) { /* is really weak? */
+ black2gray(obj2gco(h)); /* keep table gray */
+ if (!weakkey) /* strong keys? */
+ traverseweakvalue(g, h);
+ else if (!weakvalue) /* strong values? */
+ traverseephemeron(g, h);
+ else /* all weak */
+ linktable(h, &g->allweak); /* nothing to traverse now */
+ }
+ else /* not weak */
+ traversestrongtable(g, h);
+ return sizeof(Table) + sizeof(TValue) * h->sizearray +
+ sizeof(Node) * cast(size_t, sizenode(h));
+}
+
+
+static int traverseproto (global_State *g, Proto *f) {
+ int i;
+ if (f->cache && iswhite(obj2gco(f->cache)))
+ f->cache = NULL; /* allow cache to be collected */
+ markobject(g, f->source);
+ for (i = 0; i < f->sizek; i++) /* mark literals */
+ markvalue(g, &f->k[i]);
+ for (i = 0; i < f->sizeupvalues; i++) /* mark upvalue names */
+ markobject(g, f->upvalues[i].name);
+ for (i = 0; i < f->sizep; i++) /* mark nested protos */
+ markobject(g, f->p[i]);
+ for (i = 0; i < f->sizelocvars; i++) /* mark local-variable names */
+ markobject(g, f->locvars[i].varname);
+ return sizeof(Proto) + sizeof(Instruction) * f->sizecode +
+ sizeof(Proto *) * f->sizep +
+ sizeof(TValue) * f->sizek +
+ sizeof(int) * f->sizelineinfo +
+ sizeof(LocVar) * f->sizelocvars +
+ sizeof(Upvaldesc) * f->sizeupvalues;
+}
+
+
+static lu_mem traverseCclosure (global_State *g, CClosure *cl) {
+ int i;
+ for (i = 0; i < cl->nupvalues; i++) /* mark its upvalues */
+ markvalue(g, &cl->upvalue[i]);
+ return sizeCclosure(cl->nupvalues);
+}
+
+static lu_mem traverseLclosure (global_State *g, LClosure *cl) {
+ int i;
+ markobject(g, cl->p); /* mark its prototype */
+ for (i = 0; i < cl->nupvalues; i++) /* mark its upvalues */
+ markobject(g, cl->upvals[i]);
+ return sizeLclosure(cl->nupvalues);
+}
+
+
+static lu_mem traversestack (global_State *g, lua_State *th) {
+ int n = 0;
+ StkId o = th->stack;
+ if (o == NULL)
+ return 1; /* stack not completely built yet */
+ for (; o < th->top; o++) /* mark live elements in the stack */
+ markvalue(g, o);
+ if (g->gcstate == GCSatomic) { /* final traversal? */
+ StkId lim = th->stack + th->stacksize; /* real end of stack */
+ for (; o < lim; o++) /* clear not-marked stack slice */
+ setnilvalue(o);
+ }
+ else { /* count call infos to compute size */
+ CallInfo *ci;
+ for (ci = &th->base_ci; ci != th->ci; ci = ci->next)
+ n++;
+ }
+ return sizeof(lua_State) + sizeof(TValue) * th->stacksize +
+ sizeof(CallInfo) * n;
+}
+
+
+/*
+** traverse one gray object, turning it to black (except for threads,
+** which are always gray).
+*/
+static void propagatemark (global_State *g) {
+ lu_mem size;
+ GCObject *o = g->gray;
+ lua_assert(isgray(o));
+ gray2black(o);
+ switch (gch(o)->tt) {
+ case LUA_TTABLE: {
+ Table *h = gco2t(o);
+ g->gray = h->gclist; /* remove from 'gray' list */
+ size = traversetable(g, h);
+ break;
+ }
+ case LUA_TLCL: {
+ LClosure *cl = gco2lcl(o);
+ g->gray = cl->gclist; /* remove from 'gray' list */
+ size = traverseLclosure(g, cl);
+ break;
+ }
+ case LUA_TCCL: {
+ CClosure *cl = gco2ccl(o);
+ g->gray = cl->gclist; /* remove from 'gray' list */
+ size = traverseCclosure(g, cl);
+ break;
+ }
+ case LUA_TTHREAD: {
+ lua_State *th = gco2th(o);
+ g->gray = th->gclist; /* remove from 'gray' list */
+ th->gclist = g->grayagain;
+ g->grayagain = o; /* insert into 'grayagain' list */
+ black2gray(o);
+ size = traversestack(g, th);
+ break;
+ }
+ case LUA_TPROTO: {
+ Proto *p = gco2p(o);
+ g->gray = p->gclist; /* remove from 'gray' list */
+ size = traverseproto(g, p);
+ break;
+ }
+ default: lua_assert(0); return;
+ }
+ g->GCmemtrav += size;
+}
+
+
+static void propagateall (global_State *g) {
+ while (g->gray) propagatemark(g);
+}
+
+
+static void propagatelist (global_State *g, GCObject *l) {
+ lua_assert(g->gray == NULL); /* no grays left */
+ g->gray = l;
+ propagateall(g); /* traverse all elements from 'l' */
+}
+
+/*
+** retraverse all gray lists. Because tables may be reinserted in other
+** lists when traversed, traverse the original lists to avoid traversing
+** the same table twice (which is not wrong, but inefficient)
+*/
+static void retraversegrays (global_State *g) {
+ GCObject *weak = g->weak; /* save original lists */
+ GCObject *grayagain = g->grayagain;
+ GCObject *ephemeron = g->ephemeron;
+ g->weak = g->grayagain = g->ephemeron = NULL;
+ propagateall(g); /* traverse main gray list */
+ propagatelist(g, grayagain);
+ propagatelist(g, weak);
+ propagatelist(g, ephemeron);
+}
+
+
+static void convergeephemerons (global_State *g) {
+ int changed;
+ do {
+ GCObject *w;
+ GCObject *next = g->ephemeron; /* get ephemeron list */
+ g->ephemeron = NULL; /* tables will return to this list when traversed */
+ changed = 0;
+ while ((w = next) != NULL) {
+ next = gco2t(w)->gclist;
+ if (traverseephemeron(g, gco2t(w))) { /* traverse marked some value? */
+ propagateall(g); /* propagate changes */
+ changed = 1; /* will have to revisit all ephemeron tables */
+ }
+ }
+ } while (changed);
+}
+
+/* }====================================================== */
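convergeephemerons has to iterate because marking a value through one ephemeron entry can make another entry's key reachable. A compact stand-alone model of that fixpoint, with table entries reduced to index pairs into a mark[] array (all data invented):

    #include <stdio.h>

    int mark[3] = {1, 0, 0};      /* object 0 is a root; 1 and 2 start white */

    typedef struct { int key, val; } Entry;   /* indices into mark[] */

    /* mark values whose keys are marked; report whether anything changed */
    static int traverse_ephemerons (const Entry *e, int n) {
      int changed = 0;
      for (int i = 0; i < n; i++)
        if (mark[e[i].key] && !mark[e[i].val]) {
          mark[e[i].val] = 1;     /* value reachable through a marked key */
          changed = 1;
        }
      return changed;
    }

    int main (void) {
      /* entry 0's key only becomes marked via entry 1's value */
      Entry t[] = { {1, 2}, {0, 1} };
      int passes = 0;
      while (traverse_ephemerons(t, 2))   /* like convergeephemerons */
        passes++;
      printf("passes=%d marks=%d%d%d\n", passes, mark[0], mark[1], mark[2]);
      return 0;                           /* two passes reach the fixpoint */
    }
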
+
+
+/*
+** {======================================================
+** Sweep Functions
+** =======================================================
+*/
+
+
+/*
+** clear entries with unmarked keys from all weak tables in list 'l' up
+** to element 'f'
+*/
+static void clearkeys (global_State *g, GCObject *l, GCObject *f) {
+ for (; l != f; l = gco2t(l)->gclist) {
+ Table *h = gco2t(l);
+ Node *n, *limit = gnodelast(h);
+ for (n = gnode(h, 0); n < limit; n++) {
+ if (!ttisnil(gval(n)) && (iscleared(g, gkey(n)))) {
+ setnilvalue(gval(n)); /* remove value ... */
+ removeentry(n); /* and remove entry from table */
+ }
+ }
+ }
+}
+
+
+/*
+** clear entries with unmarked values from all weak tables in list 'l' up
+** to element 'f'
+*/
+static void clearvalues (global_State *g, GCObject *l, GCObject *f) {
+ for (; l != f; l = gco2t(l)->gclist) {
+ Table *h = gco2t(l);
+ Node *n, *limit = gnodelast(h);
+ int i;
+ for (i = 0; i < h->sizearray; i++) {
+ TValue *o = &h->array[i];
+ if (iscleared(g, o)) /* value was collected? */
+ setnilvalue(o); /* remove value */
+ }
+ for (n = gnode(h, 0); n < limit; n++) {
+ if (!ttisnil(gval(n)) && iscleared(g, gval(n))) {
+ setnilvalue(gval(n)); /* remove value ... */
+ removeentry(n); /* and remove entry from table */
+ }
+ }
+ }
+}
+
+
+static void freeobj (lua_State *L, GCObject *o) {
+ switch (gch(o)->tt) {
+ case LUA_TPROTO: luaF_freeproto(L, gco2p(o)); break;
+ case LUA_TLCL: {
+ luaM_freemem(L, o, sizeLclosure(gco2lcl(o)->nupvalues));
+ break;
+ }
+ case LUA_TCCL: {
+ luaM_freemem(L, o, sizeCclosure(gco2ccl(o)->nupvalues));
+ break;
+ }
+ case LUA_TUPVAL: luaF_freeupval(L, gco2uv(o)); break;
+ case LUA_TTABLE: luaH_free(L, gco2t(o)); break;
+ case LUA_TTHREAD: luaE_freethread(L, gco2th(o)); break;
+ case LUA_TUSERDATA: luaM_freemem(L, o, sizeudata(gco2u(o))); break;
+ case LUA_TSHRSTR:
+ G(L)->strt.nuse--;
+ /* FALLTHROUGH */
+ case LUA_TLNGSTR: {
+ luaM_freemem(L, o, sizestring(gco2ts(o)));
+ break;
+ }
+ default: lua_assert(0);
+ }
+}
+
+
+#define sweepwholelist(L,p) sweeplist(L,p,MAX_LUMEM)
+static GCObject **sweeplist (lua_State *L, GCObject **p, lu_mem count);
+
+
+/*
+** sweep the (open) upvalues of a thread and resize its stack and
+** list of call-info structures.
+*/
+static void sweepthread (lua_State *L, lua_State *L1) {
+ if (L1->stack == NULL) return; /* stack not completely built yet */
+ sweepwholelist(L, &L1->openupval); /* sweep open upvalues */
+ luaE_freeCI(L1); /* free extra CallInfo slots */
+ /* should not change the stack during an emergency gc cycle */
+ if (G(L)->gckind != KGC_EMERGENCY)
+ luaD_shrinkstack(L1);
+}
+
+
+/*
+** sweep at most 'count' elements from a list of GCObjects erasing dead
+** objects, where a dead (not alive) object is one marked with the "old"
+** (non current) white and not fixed.
+** In non-generational mode, change all non-dead objects back to white,
+** preparing for the next collection cycle.
+** In generational mode, keep black objects black, and also mark them as
+** old; stop when hitting an old object, as all objects after that
+** one will be old too.
+** When an object is a thread, sweep its list of open upvalues too.
+*/
+static GCObject **sweeplist (lua_State *L, GCObject **p, lu_mem count) {
+ global_State *g = G(L);
+ int ow = otherwhite(g);
+ int toclear, toset; /* bits to clear and to set in all live objects */
+ int tostop; /* stop sweep when this is true */
+ if (isgenerational(g)) { /* generational mode? */
+ toclear = ~0; /* clear nothing */
+ toset = bitmask(OLDBIT); /* set the old bit of all surviving objects */
+ tostop = bitmask(OLDBIT); /* do not sweep old generation */
+ }
+ else { /* normal mode */
+ toclear = maskcolors; /* clear all color bits + old bit */
+ toset = luaC_white(g); /* make object white */
+ tostop = 0; /* do not stop */
+ }
+ while (*p != NULL && count-- > 0) {
+ GCObject *curr = *p;
+ int marked = gch(curr)->marked;
+ if (isdeadm(ow, marked)) { /* is 'curr' dead? */
+ *p = gch(curr)->next; /* remove 'curr' from list */
+ freeobj(L, curr); /* erase 'curr' */
+ }
+ else {
+ if (testbits(marked, tostop))
+ return NULL; /* stop sweeping this list */
+ if (gch(curr)->tt == LUA_TTHREAD)
+ sweepthread(L, gco2th(curr)); /* sweep thread's upvalues */
+ /* update marks */
+ gch(curr)->marked = cast_byte((marked & toclear) | toset);
+ p = &gch(curr)->next; /* go to next element */
+ }
+ }
+ return (*p == NULL) ? NULL : p;
+}
+
+
+/*
+** sweep a list until a live object (or end of list)
+*/
+static GCObject **sweeptolive (lua_State *L, GCObject **p, int *n) {
+ GCObject ** old = p;
+ int i = 0;
+ do {
+ i++;
+ p = sweeplist(L, p, 1);
+ } while (p == old);
+ if (n) *n += i;
+ return p;
+}
+
+/* }====================================================== */
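The sweep relies on Lua's two-white scheme: isdeadm (defined in lgc.h) declares an object dead only if it carries the other (previous cycle's) white, so objects allocated during the current cycle survive it. A bit-level check of that test, with the relevant lgc.h macro definitions copied in:

    #include <stdio.h>

    #define WHITE0BIT 0
    #define WHITE1BIT 1
    #define bitmask(b)      (1 << (b))
    #define bit2mask(b1,b2) (bitmask(b1) | bitmask(b2))
    #define WHITEBITS       bit2mask(WHITE0BIT, WHITE1BIT)

    /* same test as isdeadm(): dead if it carries only the other white */
    #define isdeadm(ow,m)   (!(((m) ^ WHITEBITS) & (ow)))

    int main (void) {
      int currentwhite = bitmask(WHITE0BIT);        /* this cycle's white */
      int otherwhite   = currentwhite ^ WHITEBITS;  /* last cycle's white */
      int oldobj = otherwhite;      /* marked with the old white: dead */
      int newobj = currentwhite;    /* allocated this cycle: alive */
      printf("old dead=%d new dead=%d\n",
             isdeadm(otherwhite, oldobj), isdeadm(otherwhite, newobj));
      return 0;                     /* prints: old dead=1 new dead=0 */
    }
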
+
+
+/*
+** {======================================================
+** Finalization
+** =======================================================
+*/
+
+static void checkSizes (lua_State *L) {
+ global_State *g = G(L);
+ if (g->gckind != KGC_EMERGENCY) { /* do not change sizes in emergency */
+ int hs = g->strt.size / 2; /* half the size of the string table */
+ if (g->strt.nuse < cast(lu_int32, hs)) /* using less than that half? */
+ luaS_resize(L, hs); /* halve its size */
+ luaZ_freebuffer(L, &g->buff); /* free concatenation buffer */
+ }
+}
+
+
+static GCObject *udata2finalize (global_State *g) {
+ GCObject *o = g->tobefnz; /* get first element */
+ lua_assert(isfinalized(o));
+ g->tobefnz = gch(o)->next; /* remove it from 'tobefnz' list */
+ gch(o)->next = g->allgc; /* return it to 'allgc' list */
+ g->allgc = o;
+ resetbit(gch(o)->marked, SEPARATED); /* mark that it is not in 'tobefnz' */
+ lua_assert(!isold(o)); /* see MOVE OLD rule */
+ if (!keepinvariantout(g)) /* not keeping invariant? */
+ makewhite(g, o); /* "sweep" object */
+ return o;
+}
+
+
+static void dothecall (lua_State *L, void *ud) {
+ UNUSED(ud);
+ luaD_call(L, L->top - 2, 0, 0);
+}
+
+
+static void GCTM (lua_State *L, int propagateerrors) {
+ global_State *g = G(L);
+ const TValue *tm;
+ TValue v;
+ setgcovalue(L, &v, udata2finalize(g));
+ tm = luaT_gettmbyobj(L, &v, TM_GC);
+ if (tm != NULL && ttisfunction(tm)) { /* is there a finalizer? */
+ int status;
+ lu_byte oldah = L->allowhook;
+ int running = g->gcrunning;
+ L->allowhook = 0; /* stop debug hooks during GC metamethod */
+ g->gcrunning = 0; /* avoid GC steps */
+ setobj2s(L, L->top, tm); /* push finalizer... */
+ setobj2s(L, L->top + 1, &v); /* ... and its argument */
+ L->top += 2; /* and (next line) call the finalizer */
+ status = luaD_pcall(L, dothecall, NULL, savestack(L, L->top - 2), 0);
+ L->allowhook = oldah; /* restore hooks */
+ g->gcrunning = running; /* restore state */
+ if (status != LUA_OK && propagateerrors) { /* error while running __gc? */
+ if (status == LUA_ERRRUN) { /* is there an error object? */
+ const char *msg = (ttisstring(L->top - 1))
+ ? svalue(L->top - 1)
+ : "no message";
+ luaO_pushfstring(L, "error in __gc metamethod (%s)", msg);
+ status = LUA_ERRGCMM; /* error in __gc metamethod */
+ }
+ luaD_throw(L, status); /* re-throw error */
+ }
+ }
+}
+
+
+/*
+** move all unreachable objects (or 'all' objects) that need
+** finalization from list 'finobj' to list 'tobefnz' (to be finalized)
+*/
+static void separatetobefnz (lua_State *L, int all) {
+ global_State *g = G(L);
+ GCObject **p = &g->finobj;
+ GCObject *curr;
+ GCObject **lastnext = &g->tobefnz;
+  /* find last 'next' field in 'tobefnz' list (to add elements at its end) */
+ while (*lastnext != NULL)
+ lastnext = &gch(*lastnext)->next;
+ while ((curr = *p) != NULL) { /* traverse all finalizable objects */
+ lua_assert(!isfinalized(curr));
+ lua_assert(testbit(gch(curr)->marked, SEPARATED));
+ if (!(iswhite(curr) || all)) /* not being collected? */
+ p = &gch(curr)->next; /* don't bother with it */
+ else {
+ l_setbit(gch(curr)->marked, FINALIZEDBIT); /* won't be finalized again */
+ *p = gch(curr)->next; /* remove 'curr' from 'finobj' list */
+ gch(curr)->next = *lastnext; /* link at the end of 'tobefnz' list */
+ *lastnext = curr;
+ lastnext = &gch(curr)->next;
+ }
+ }
+}
+
+
+/*
+** if object 'o' has a finalizer, remove it from 'allgc' list (must
+** search the list to find it) and link it in 'finobj' list.
+*/
+void luaC_checkfinalizer (lua_State *L, GCObject *o, Table *mt) {
+ global_State *g = G(L);
+ if (testbit(gch(o)->marked, SEPARATED) || /* obj. is already separated... */
+ isfinalized(o) || /* ... or is finalized... */
+ gfasttm(g, mt, TM_GC) == NULL) /* or has no finalizer? */
+ return; /* nothing to be done */
+ else { /* move 'o' to 'finobj' list */
+ GCObject **p;
+ GCheader *ho = gch(o);
+ if (g->sweepgc == &ho->next) { /* avoid removing current sweep object */
+ lua_assert(issweepphase(g));
+ g->sweepgc = sweeptolive(L, g->sweepgc, NULL);
+ }
+ /* search for pointer pointing to 'o' */
+ for (p = &g->allgc; *p != o; p = &gch(*p)->next) { /* empty */ }
+ *p = ho->next; /* remove 'o' from root list */
+ ho->next = g->finobj; /* link it in list 'finobj' */
+ g->finobj = o;
+ l_setbit(ho->marked, SEPARATED); /* mark it as such */
+ if (!keepinvariantout(g)) /* not keeping invariant? */
+ makewhite(g, o); /* "sweep" object */
+ else
+ resetoldbit(o); /* see MOVE OLD rule */
+ }
+}
+
+/* }====================================================== */
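separatetobefnz appends to the 'tobefnz' list through a pointer-to-pointer cursor ('lastnext'), which handles the empty-list and end-of-list cases with no special branches. The same idiom in isolation; Node is an invented stand-in, and the real code also adjusts GC mark bits as it splices:

    #include <stdio.h>

    typedef struct Node { int v; struct Node *next; } Node;

    /* append 'n' at the end of the list via a pointer-to-pointer cursor */
    static void append (Node **lastnext, Node *n) {
      while (*lastnext != NULL)          /* find the last 'next' field */
        lastnext = &(*lastnext)->next;
      n->next = NULL;
      *lastnext = n;
    }

    int main (void) {
      Node *tobefnz = NULL;
      Node a = {1, NULL}, b = {2, NULL};
      append(&tobefnz, &a);              /* works for the empty list... */
      append(&tobefnz, &b);              /* ...and for a non-empty one */
      for (Node *p = tobefnz; p; p = p->next)
        printf("%d\n", p->v);
      return 0;
    }
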
+
+
+/*
+** {======================================================
+** GC control
+** =======================================================
+*/
+
+
+/*
+** set a reasonable "time" to wait before starting a new GC cycle;
+** the cycle will start when memory use hits the threshold
+*/
+static void setpause (global_State *g, l_mem estimate) {
+ l_mem debt, threshold;
+ estimate = estimate / PAUSEADJ; /* adjust 'estimate' */
+ threshold = (g->gcpause < MAX_LMEM / estimate) /* overflow? */
+ ? estimate * g->gcpause /* no overflow */
+ : MAX_LMEM; /* overflow; truncate to maximum */
+ debt = -cast(l_mem, threshold - gettotalbytes(g));
+ luaE_setdebt(g, debt);
+}
+
+
+#define sweepphases \
+ (bitmask(GCSsweepstring) | bitmask(GCSsweepudata) | bitmask(GCSsweep))
+
+
+/*
+** enter first sweep phase (strings) and prepare pointers for other
+** sweep phases. The calls to 'sweeptolive' make pointers point to an
+** object inside the list (instead of to the header), so that the real
+** sweep does not need to skip objects created between "now" and the start
+** of the real sweep.
+** Returns how many objects it swept.
+*/
+static int entersweep (lua_State *L) {
+ global_State *g = G(L);
+ int n = 0;
+ g->gcstate = GCSsweepstring;
+ lua_assert(g->sweepgc == NULL && g->sweepfin == NULL);
+ /* prepare to sweep strings, finalizable objects, and regular objects */
+ g->sweepstrgc = 0;
+ g->sweepfin = sweeptolive(L, &g->finobj, &n);
+ g->sweepgc = sweeptolive(L, &g->allgc, &n);
+ return n;
+}
+
+
+/*
+** change GC mode
+*/
+void luaC_changemode (lua_State *L, int mode) {
+ global_State *g = G(L);
+ if (mode == g->gckind) return; /* nothing to change */
+ if (mode == KGC_GEN) { /* change to generational mode */
+ /* make sure gray lists are consistent */
+ luaC_runtilstate(L, bitmask(GCSpropagate));
+ g->GCestimate = gettotalbytes(g);
+ g->gckind = KGC_GEN;
+ }
+ else { /* change to incremental mode */
+ /* sweep all objects to turn them back to white
+ (as white has not changed, nothing extra will be collected) */
+ g->gckind = KGC_NORMAL;
+ entersweep(L);
+ luaC_runtilstate(L, ~sweepphases);
+ }
+}
+
+
+/*
+** call all pending finalizers
+*/
+static void callallpendingfinalizers (lua_State *L, int propagateerrors) {
+ global_State *g = G(L);
+ while (g->tobefnz) {
+ resetoldbit(g->tobefnz);
+ GCTM(L, propagateerrors);
+ }
+}
+
+
+void luaC_freeallobjects (lua_State *L) {
+ global_State *g = G(L);
+ int i;
+ separatetobefnz(L, 1); /* separate all objects with finalizers */
+ lua_assert(g->finobj == NULL);
+ callallpendingfinalizers(L, 0);
+ g->currentwhite = WHITEBITS; /* this "white" makes all objects look dead */
+ g->gckind = KGC_NORMAL;
+ sweepwholelist(L, &g->finobj); /* finalizers can create objs. in 'finobj' */
+ sweepwholelist(L, &g->allgc);
+ for (i = 0; i < g->strt.size; i++) /* free all string lists */
+ sweepwholelist(L, &g->strt.hash[i]);
+ lua_assert(g->strt.nuse == 0);
+}
+
+
+static l_mem atomic (lua_State *L) {
+ global_State *g = G(L);
+ l_mem work = -cast(l_mem, g->GCmemtrav); /* start counting work */
+ GCObject *origweak, *origall;
+ lua_assert(!iswhite(obj2gco(g->mainthread)));
+ markobject(g, L); /* mark running thread */
+ /* registry and global metatables may be changed by API */
+ markvalue(g, &g->l_registry);
+ markmt(g); /* mark basic metatables */
+ /* remark occasional upvalues of (maybe) dead threads */
+ remarkupvals(g);
+ propagateall(g); /* propagate changes */
+ work += g->GCmemtrav; /* stop counting (do not (re)count grays) */
+ /* traverse objects caught by write barrier and by 'remarkupvals' */
+ retraversegrays(g);
+ work -= g->GCmemtrav; /* restart counting */
+ convergeephemerons(g);
+ /* at this point, all strongly accessible objects are marked. */
+ /* clear values from weak tables, before checking finalizers */
+ clearvalues(g, g->weak, NULL);
+ clearvalues(g, g->allweak, NULL);
+ origweak = g->weak; origall = g->allweak;
+ work += g->GCmemtrav; /* stop counting (objects being finalized) */
+ separatetobefnz(L, 0); /* separate objects to be finalized */
+ markbeingfnz(g); /* mark objects that will be finalized */
+ propagateall(g); /* remark, to propagate `preserveness' */
+ work -= g->GCmemtrav; /* restart counting */
+ convergeephemerons(g);
+ /* at this point, all resurrected objects are marked. */
+ /* remove dead objects from weak tables */
+ clearkeys(g, g->ephemeron, NULL); /* clear keys from all ephemeron tables */
+ clearkeys(g, g->allweak, NULL); /* clear keys from all allweak tables */
+ /* clear values from resurrected weak tables */
+ clearvalues(g, g->weak, origweak);
+ clearvalues(g, g->allweak, origall);
+ g->currentwhite = cast_byte(otherwhite(g)); /* flip current white */
+ work += g->GCmemtrav; /* complete counting */
+ return work; /* estimate of memory marked by 'atomic' */
+}
+
+
+static lu_mem singlestep (lua_State *L) {
+ global_State *g = G(L);
+ switch (g->gcstate) {
+ case GCSpause: {
+ /* start to count memory traversed */
+ g->GCmemtrav = g->strt.size * sizeof(GCObject*);
+ lua_assert(!isgenerational(g));
+ restartcollection(g);
+ g->gcstate = GCSpropagate;
+ return g->GCmemtrav;
+ }
+ case GCSpropagate: {
+ if (g->gray) {
+ lu_mem oldtrav = g->GCmemtrav;
+ propagatemark(g);
+ return g->GCmemtrav - oldtrav; /* memory traversed in this step */
+ }
+ else { /* no more `gray' objects */
+ lu_mem work;
+ int sw;
+ g->gcstate = GCSatomic; /* finish mark phase */
+      g->GCestimate = g->GCmemtrav;  /* save what was counted */
+ work = atomic(L); /* add what was traversed by 'atomic' */
+ g->GCestimate += work; /* estimate of total memory traversed */
+ sw = entersweep(L);
+ return work + sw * GCSWEEPCOST;
+ }
+ }
+ case GCSsweepstring: {
+ int i;
+ for (i = 0; i < GCSWEEPMAX && g->sweepstrgc + i < g->strt.size; i++)
+ sweepwholelist(L, &g->strt.hash[g->sweepstrgc + i]);
+ g->sweepstrgc += i;
+ if (g->sweepstrgc >= g->strt.size) /* no more strings to sweep? */
+ g->gcstate = GCSsweepudata;
+ return i * GCSWEEPCOST;
+ }
+ case GCSsweepudata: {
+ if (g->sweepfin) {
+ g->sweepfin = sweeplist(L, g->sweepfin, GCSWEEPMAX);
+ return GCSWEEPMAX*GCSWEEPCOST;
+ }
+ else {
+ g->gcstate = GCSsweep;
+ return 0;
+ }
+ }
+ case GCSsweep: {
+ if (g->sweepgc) {
+ g->sweepgc = sweeplist(L, g->sweepgc, GCSWEEPMAX);
+ return GCSWEEPMAX*GCSWEEPCOST;
+ }
+ else {
+ /* sweep main thread */
+ GCObject *mt = obj2gco(g->mainthread);
+ sweeplist(L, &mt, 1);
+ checkSizes(L);
+ g->gcstate = GCSpause; /* finish collection */
+ return GCSWEEPCOST;
+ }
+ }
+ default: lua_assert(0); return 0;
+ }
+}
+
+
+/*
+** advances the garbage collector until it reaches a state allowed
+** by 'statesmask'
+*/
+void luaC_runtilstate (lua_State *L, int statesmask) {
+ global_State *g = G(L);
+ while (!testbit(statesmask, g->gcstate))
+ singlestep(L);
+}
+
+
+static void generationalcollection (lua_State *L) {
+ global_State *g = G(L);
+ lua_assert(g->gcstate == GCSpropagate);
+ if (g->GCestimate == 0) { /* signal for another major collection? */
+ luaC_fullgc(L, 0); /* perform a full regular collection */
+ g->GCestimate = gettotalbytes(g); /* update control */
+ }
+ else {
+ lu_mem estimate = g->GCestimate;
+ luaC_runtilstate(L, bitmask(GCSpause)); /* run complete (minor) cycle */
+ g->gcstate = GCSpropagate; /* skip restart */
+ if (gettotalbytes(g) > (estimate / 100) * g->gcmajorinc)
+ g->GCestimate = 0; /* signal for a major collection */
+ else
+ g->GCestimate = estimate; /* keep estimate from last major coll. */
+
+ }
+ setpause(g, gettotalbytes(g));
+ lua_assert(g->gcstate == GCSpropagate);
+}
+
+
+static void incstep (lua_State *L) {
+ global_State *g = G(L);
+ l_mem debt = g->GCdebt;
+ int stepmul = g->gcstepmul;
+  if (stepmul < 40) stepmul = 40; /* avoid ridiculously low values (and 0) */
+ /* convert debt from Kb to 'work units' (avoid zero debt and overflows) */
+ debt = (debt / STEPMULADJ) + 1;
+ debt = (debt < MAX_LMEM / stepmul) ? debt * stepmul : MAX_LMEM;
+ do { /* always perform at least one single step */
+ lu_mem work = singlestep(L); /* do some work */
+ debt -= work;
+ } while (debt > -GCSTEPSIZE && g->gcstate != GCSpause);
+ if (g->gcstate == GCSpause)
+ setpause(g, g->GCestimate); /* pause until next cycle */
+ else {
+ debt = (debt / stepmul) * STEPMULADJ; /* convert 'work units' to Kb */
+ luaE_setdebt(g, debt);
+ }
+}
+
+
+/*
+** performs a basic GC step
+*/
+void luaC_forcestep (lua_State *L) {
+ global_State *g = G(L);
+ int i;
+ if (isgenerational(g)) generationalcollection(L);
+ else incstep(L);
+ /* run a few finalizers (or all of them at the end of a collect cycle) */
+ for (i = 0; g->tobefnz && (i < GCFINALIZENUM || g->gcstate == GCSpause); i++)
+ GCTM(L, 1); /* call one finalizer */
+}
+
+
+/*
+** performs a basic GC step only if collector is running
+*/
+void luaC_step (lua_State *L) {
+ global_State *g = G(L);
+ if (g->gcrunning) luaC_forcestep(L);
+ else luaE_setdebt(g, -GCSTEPSIZE); /* avoid being called too often */
+}
+
+
+
+/*
+** performs a full GC cycle; if "isemergency", does not call
+** finalizers (which could change stack positions)
+*/
+void luaC_fullgc (lua_State *L, int isemergency) {
+ global_State *g = G(L);
+ int origkind = g->gckind;
+ lua_assert(origkind != KGC_EMERGENCY);
+ if (isemergency) /* do not run finalizers during emergency GC */
+ g->gckind = KGC_EMERGENCY;
+ else {
+ g->gckind = KGC_NORMAL;
+ callallpendingfinalizers(L, 1);
+ }
+ if (keepinvariant(g)) { /* may there be some black objects? */
+ /* must sweep all objects to turn them back to white
+ (as white has not changed, nothing will be collected) */
+ entersweep(L);
+ }
+ /* finish any pending sweep phase to start a new cycle */
+ luaC_runtilstate(L, bitmask(GCSpause));
+ luaC_runtilstate(L, ~bitmask(GCSpause)); /* start new collection */
+ luaC_runtilstate(L, bitmask(GCSpause)); /* run entire collection */
+ if (origkind == KGC_GEN) { /* generational mode? */
+ /* generational mode must be kept in propagate phase */
+ luaC_runtilstate(L, bitmask(GCSpropagate));
+ }
+ g->gckind = origkind;
+ setpause(g, gettotalbytes(g));
+ if (!isemergency) /* do not run finalizers during emergency GC */
+ callallpendingfinalizers(L, 1);
+}
+
+/* }====================================================== */
+
+
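As a worked example of setpause's arithmetic (all values invented): with PAUSEADJ = 100, the default gcpause of 200, and an estimate of 1,000,000 bytes, the threshold is (1000000/100)*200 = 2,000,000 bytes, so the debt is set so that roughly one more estimate's worth of allocation happens before the next cycle starts. The MAX_LMEM overflow guard is omitted in this simplified sketch:

    #include <stdio.h>

    #define PAUSEADJ 100

    int main (void) {
      long estimate = 1000000;   /* bytes marked last cycle (invented) */
      long gcpause  = 200;       /* default pause: 200% */
      long total    = 1000000;   /* current gettotalbytes (invented) */
      long threshold = (estimate / PAUSEADJ) * gcpause;
      long debt = -(threshold - total);   /* same formula as setpause */
      printf("threshold=%ld debt=%ld\n", threshold, debt);
      return 0;   /* debt=-1000000: next cycle after ~1MB more allocation */
    }
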
diff --git a/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/lua/lgc.h b/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/lua/lgc.h
new file mode 100644
index 000000000000..84bb1cdf99fa
--- /dev/null
+++ b/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/lua/lgc.h
@@ -0,0 +1,157 @@
+/*
+** $Id: lgc.h,v 2.58.1.1 2013/04/12 18:48:47 roberto Exp $
+** Garbage Collector
+** See Copyright Notice in lua.h
+*/
+
+#ifndef lgc_h
+#define lgc_h
+
+
+#include "lobject.h"
+#include "lstate.h"
+
+/*
+** Collectable objects may have one of three colors: white, which
+** means the object is not marked; gray, which means the
+** object is marked, but its references may not be marked; and
+** black, which means that the object and all its references are marked.
+** The main invariant of the garbage collector, while marking objects,
+** is that a black object can never point to a white one. Moreover,
+** any gray object must be in a "gray list" (gray, grayagain, weak,
+** allweak, ephemeron) so that it can be visited again before finishing
+** the collection cycle. These lists have no meaning when the invariant
+** is not being enforced (e.g., sweep phase).
+*/
+
+
+
+/* how much to allocate before next GC step */
+#if !defined(GCSTEPSIZE)
+/* ~100 small strings */
+#define GCSTEPSIZE (cast_int(100 * sizeof(TString)))
+#endif
+
+
+/*
+** Possible states of the Garbage Collector
+*/
+#define GCSpropagate 0
+#define GCSatomic 1
+#define GCSsweepstring 2
+#define GCSsweepudata 3
+#define GCSsweep 4
+#define GCSpause 5
+
+
+#define issweepphase(g) \
+ (GCSsweepstring <= (g)->gcstate && (g)->gcstate <= GCSsweep)
+
+#define isgenerational(g) ((g)->gckind == KGC_GEN)
+
+/*
+** macros to tell when the main invariant (black objects cannot point to
+** white ones) must be kept. During a non-generational collection, the
+** sweep phase may break the invariant, as objects turned back to white
+** may still be pointed to by black objects. The invariant is restored
+** when the sweep ends and all objects are white again. During a
+** generational collection, the invariant must be kept at all times.
+*/
+
+#define keepinvariant(g) (isgenerational(g) || g->gcstate <= GCSatomic)
+
+
+/*
+** Outside the collector, the state in generational mode is kept in
+** 'propagate', so 'keepinvariant' is always true.
+*/
+#define keepinvariantout(g) \
+ check_exp(g->gcstate == GCSpropagate || !isgenerational(g), \
+ g->gcstate <= GCSatomic)
+
+
+/*
+** some useful bit tricks
+*/
+#define resetbits(x,m) ((x) &= cast(lu_byte, ~(m)))
+#define setbits(x,m) ((x) |= (m))
+#define testbits(x,m) ((x) & (m))
+#define bitmask(b) (1<<(b))
+#define bit2mask(b1,b2) (bitmask(b1) | bitmask(b2))
+#define l_setbit(x,b) setbits(x, bitmask(b))
+#define resetbit(x,b) resetbits(x, bitmask(b))
+#define testbit(x,b) testbits(x, bitmask(b))
+
+
+/* Layout for bit use in `marked' field: */
+#define WHITE0BIT 0 /* object is white (type 0) */
+#define WHITE1BIT 1 /* object is white (type 1) */
+#define BLACKBIT 2 /* object is black */
+#define FINALIZEDBIT 3 /* object has been separated for finalization */
+#define SEPARATED 4 /* object is in 'finobj' list or in 'tobefnz' */
+#define FIXEDBIT 5 /* object is fixed (should not be collected) */
+#define OLDBIT 6 /* object is old (only in generational mode) */
+/* bit 7 is currently used by tests (luaL_checkmemory) */
+
+#define WHITEBITS bit2mask(WHITE0BIT, WHITE1BIT)
+
+
+#define iswhite(x) testbits((x)->gch.marked, WHITEBITS)
+#define isblack(x) testbit((x)->gch.marked, BLACKBIT)
+#define isgray(x) /* neither white nor black */ \
+ (!testbits((x)->gch.marked, WHITEBITS | bitmask(BLACKBIT)))
+
+#define isold(x) testbit((x)->gch.marked, OLDBIT)
+
+/* MOVE OLD rule: whenever an object is moved to the beginning of
+ a GC list, its old bit must be cleared */
+#define resetoldbit(o) resetbit((o)->gch.marked, OLDBIT)
+
+#define otherwhite(g) (g->currentwhite ^ WHITEBITS)
+#define isdeadm(ow,m) (!(((m) ^ WHITEBITS) & (ow)))
+#define isdead(g,v) isdeadm(otherwhite(g), (v)->gch.marked)
+
+#define changewhite(x) ((x)->gch.marked ^= WHITEBITS)
+#define gray2black(x) l_setbit((x)->gch.marked, BLACKBIT)
+
+#define valiswhite(x) (iscollectable(x) && iswhite(gcvalue(x)))
+
+#define luaC_white(g) cast(lu_byte, (g)->currentwhite & WHITEBITS)
+
+
+#define luaC_condGC(L,c) \
+ {if (G(L)->GCdebt > 0) {c;}; condchangemem(L);}
+#define luaC_checkGC(L) luaC_condGC(L, luaC_step(L);)
+
+
+#define luaC_barrier(L,p,v) { if (valiswhite(v) && isblack(obj2gco(p))) \
+ luaC_barrier_(L,obj2gco(p),gcvalue(v)); }
+
+#define luaC_barrierback(L,p,v) { if (valiswhite(v) && isblack(obj2gco(p))) \
+ luaC_barrierback_(L,p); }
+
+#define luaC_objbarrier(L,p,o) \
+ { if (iswhite(obj2gco(o)) && isblack(obj2gco(p))) \
+ luaC_barrier_(L,obj2gco(p),obj2gco(o)); }
+
+#define luaC_objbarrierback(L,p,o) \
+ { if (iswhite(obj2gco(o)) && isblack(obj2gco(p))) luaC_barrierback_(L,p); }
+
+#define luaC_barrierproto(L,p,c) \
+ { if (isblack(obj2gco(p))) luaC_barrierproto_(L,p,c); }
+
+LUAI_FUNC void luaC_freeallobjects (lua_State *L);
+LUAI_FUNC void luaC_step (lua_State *L);
+LUAI_FUNC void luaC_forcestep (lua_State *L);
+LUAI_FUNC void luaC_runtilstate (lua_State *L, int statesmask);
+LUAI_FUNC void luaC_fullgc (lua_State *L, int isemergency);
+LUAI_FUNC GCObject *luaC_newobj (lua_State *L, int tt, size_t sz,
+ GCObject **list, int offset);
+LUAI_FUNC void luaC_barrier_ (lua_State *L, GCObject *o, GCObject *v);
+LUAI_FUNC void luaC_barrierback_ (lua_State *L, GCObject *o);
+LUAI_FUNC void luaC_barrierproto_ (lua_State *L, Proto *p, Closure *c);
+LUAI_FUNC void luaC_checkfinalizer (lua_State *L, GCObject *o, Table *mt);
+LUAI_FUNC void luaC_checkupvalcolor (global_State *g, UpVal *uv);
+LUAI_FUNC void luaC_changemode (lua_State *L, int mode);
+
+#endif
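The color predicates encode gray as "neither white nor black", so white2gray and gray2black in lgc.c are pure bit flips. A quick stand-alone check, applying reduced copies of the macros above to a plain int instead of a GCheader's 'marked' byte:

    #include <stdio.h>

    #define resetbits(x,m)  ((x) &= ~(m))
    #define setbits(x,m)    ((x) |= (m))
    #define testbits(x,m)   ((x) & (m))
    #define bitmask(b)      (1 << (b))
    #define bit2mask(b1,b2) (bitmask(b1) | bitmask(b2))

    #define WHITE0BIT 0
    #define WHITE1BIT 1
    #define BLACKBIT  2
    #define WHITEBITS bit2mask(WHITE0BIT, WHITE1BIT)

    #define iswhite(m) testbits(m, WHITEBITS)
    #define isblack(m) testbits(m, bitmask(BLACKBIT))
    #define isgray(m)  (!testbits(m, WHITEBITS | bitmask(BLACKBIT)))

    int main (void) {
      int marked = bitmask(WHITE0BIT);            /* a fresh white object */
      printf("white=%d gray=%d black=%d\n",
             !!iswhite(marked), !!isgray(marked), !!isblack(marked));
      resetbits(marked, WHITEBITS);               /* white2gray */
      printf("white=%d gray=%d black=%d\n",
             !!iswhite(marked), !!isgray(marked), !!isblack(marked));
      setbits(marked, bitmask(BLACKBIT));         /* gray2black */
      printf("white=%d gray=%d black=%d\n",
             !!iswhite(marked), !!isgray(marked), !!isblack(marked));
      return 0;
    }
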
diff --git a/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/lua/llex.c b/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/lua/llex.c
new file mode 100644
index 000000000000..dfac7aef8645
--- /dev/null
+++ b/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/lua/llex.c
@@ -0,0 +1,529 @@
+/*
+** $Id: llex.c,v 2.63.1.3 2015/02/09 17:56:34 roberto Exp $
+** Lexical Analyzer
+** See Copyright Notice in lua.h
+*/
+
+#include <sys/zfs_context.h>
+
+#define llex_c
+#define LUA_CORE
+
+#include "lua.h"
+
+#include "lctype.h"
+#include "ldo.h"
+#include "llex.h"
+#include "lobject.h"
+#include "lparser.h"
+#include "lstate.h"
+#include "lstring.h"
+#include "ltable.h"
+#include "lzio.h"
+
+
+
+#define next(ls) (ls->current = zgetc(ls->z))
+
+
+
+#define currIsNewline(ls) (ls->current == '\n' || ls->current == '\r')
+
+
+/* ORDER RESERVED */
+static const char *const luaX_tokens [] = {
+ "and", "break", "do", "else", "elseif",
+ "end", "false", "for", "function", "goto", "if",
+ "in", "local", "nil", "not", "or", "repeat",
+ "return", "then", "true", "until", "while",
+ "..", "...", "==", ">=", "<=", "~=", "::", "<eof>",
+ "<number>", "<name>", "<string>"
+};
+
+
+#define save_and_next(ls) (save(ls, ls->current), next(ls))
+
+
+static l_noret lexerror (LexState *ls, const char *msg, int token);
+
+
+static void save (LexState *ls, int c) {
+ Mbuffer *b = ls->buff;
+ if (luaZ_bufflen(b) + 1 > luaZ_sizebuffer(b)) {
+ size_t newsize;
+ if (luaZ_sizebuffer(b) >= MAX_SIZET/2)
+ lexerror(ls, "lexical element too long", 0);
+ newsize = luaZ_sizebuffer(b) * 2;
+ luaZ_resizebuffer(ls->L, b, newsize);
+ }
+ b->buffer[luaZ_bufflen(b)++] = cast(char, c);
+}
+
+
+void luaX_init (lua_State *L) {
+ int i;
+ for (i=0; i<NUM_RESERVED; i++) {
+ TString *ts = luaS_new(L, luaX_tokens[i]);
+ luaS_fix(ts); /* reserved words are never collected */
+ ts->tsv.extra = cast_byte(i+1); /* reserved word */
+ }
+}
+
+
+const char *luaX_token2str (LexState *ls, int token) {
+ if (token < FIRST_RESERVED) { /* single-byte symbols? */
+ lua_assert(token == cast(unsigned char, token));
+ return (lisprint(token)) ? luaO_pushfstring(ls->L, LUA_QL("%c"), token) :
+ luaO_pushfstring(ls->L, "char(%d)", token);
+ }
+ else {
+ const char *s = luaX_tokens[token - FIRST_RESERVED];
+ if (token < TK_EOS) /* fixed format (symbols and reserved words)? */
+ return luaO_pushfstring(ls->L, LUA_QS, s);
+ else /* names, strings, and numerals */
+ return s;
+ }
+}
+
+
+static const char *txtToken (LexState *ls, int token) {
+ switch (token) {
+ case TK_NAME:
+ case TK_STRING:
+ case TK_NUMBER:
+ save(ls, '\0');
+ return luaO_pushfstring(ls->L, LUA_QS, luaZ_buffer(ls->buff));
+ default:
+ return luaX_token2str(ls, token);
+ }
+}
+
+
+static l_noret lexerror (LexState *ls, const char *msg, int token) {
+ char buff[LUA_IDSIZE];
+ luaO_chunkid(buff, getstr(ls->source), LUA_IDSIZE);
+ msg = luaO_pushfstring(ls->L, "%s:%d: %s", buff, ls->linenumber, msg);
+ if (token)
+ luaO_pushfstring(ls->L, "%s near %s", msg, txtToken(ls, token));
+ luaD_throw(ls->L, LUA_ERRSYNTAX);
+}
+
+
+l_noret luaX_syntaxerror (LexState *ls, const char *msg) {
+ lexerror(ls, msg, ls->t.token);
+}
+
+
+/*
+** creates a new string and anchors it in the function's table so that
+** it will not be collected until the end of the function's compilation
+** (by that time it should be anchored in the function's prototype)
+*/
+TString *luaX_newstring (LexState *ls, const char *str, size_t l) {
+ lua_State *L = ls->L;
+ TValue *o; /* entry for `str' */
+ TString *ts = luaS_newlstr(L, str, l); /* create new string */
+ setsvalue2s(L, L->top++, ts); /* temporarily anchor it in stack */
+ o = luaH_set(L, ls->fs->h, L->top - 1);
+ if (ttisnil(o)) { /* not in use yet? (see 'addK') */
+ /* boolean value does not need GC barrier;
+ table has no metatable, so it does not need to invalidate cache */
+ setbvalue(o, 1); /* t[string] = true */
+ luaC_checkGC(L);
+ }
+ else { /* string already present */
+ ts = rawtsvalue(keyfromval(o)); /* re-use value previously stored */
+ }
+ L->top--; /* remove string from stack */
+ return ts;
+}
+
+
+/*
+** increments the line number and skips the newline sequence (any of
+** \n, \r, \n\r, or \r\n)
+*/
+static void inclinenumber (LexState *ls) {
+ int old = ls->current;
+ lua_assert(currIsNewline(ls));
+ next(ls); /* skip `\n' or `\r' */
+ if (currIsNewline(ls) && ls->current != old)
+ next(ls); /* skip `\n\r' or `\r\n' */
+ if (++ls->linenumber >= MAX_INT)
+ lexerror(ls, "chunk has too many lines", 0);
+}
+
+
+void luaX_setinput (lua_State *L, LexState *ls, ZIO *z, TString *source,
+ int firstchar) {
+ ls->decpoint = '.';
+ ls->L = L;
+ ls->current = firstchar;
+ ls->lookahead.token = TK_EOS; /* no look-ahead token */
+ ls->z = z;
+ ls->fs = NULL;
+ ls->linenumber = 1;
+ ls->lastline = 1;
+ ls->source = source;
+ ls->envn = luaS_new(L, LUA_ENV); /* create env name */
+ luaS_fix(ls->envn); /* never collect this name */
+ luaZ_resizebuffer(ls->L, ls->buff, LUA_MINBUFFER); /* initialize buffer */
+}
+
+
+
+/*
+** =======================================================
+** LEXICAL ANALYZER
+** =======================================================
+*/
+
+
+
+static int check_next (LexState *ls, const char *set) {
+ if (ls->current == '\0' || !strchr(set, ls->current))
+ return 0;
+ save_and_next(ls);
+ return 1;
+}
+
+
+/*
+** change all occurrences of character 'from' in the buffer to 'to'
+*/
+static void buffreplace (LexState *ls, char from, char to) {
+ size_t n = luaZ_bufflen(ls->buff);
+ char *p = luaZ_buffer(ls->buff);
+ while (n--)
+ if (p[n] == from) p[n] = to;
+}
+
+
+#if !defined(getlocaledecpoint)
+#define getlocaledecpoint() (localeconv()->decimal_point[0])
+#endif
+
+
+#define buff2d(b,e) luaO_str2d(luaZ_buffer(b), luaZ_bufflen(b) - 1, e)
+
+/*
+** in case of format error, try to change decimal point separator to
+** the one defined in the current locale and check again
+*/
+static void trydecpoint (LexState *ls, SemInfo *seminfo) {
+ char old = ls->decpoint;
+ ls->decpoint = getlocaledecpoint();
+ buffreplace(ls, old, ls->decpoint); /* try new decimal separator */
+ if (!buff2d(ls->buff, &seminfo->r)) {
+ /* format error with correct decimal point: no more options */
+ buffreplace(ls, ls->decpoint, '.'); /* undo change (for error message) */
+ lexerror(ls, "malformed number", TK_NUMBER);
+ }
+}
+
+
+/* LUA_NUMBER */
+/*
+** this function is quite liberal in what it accepts, as 'luaO_str2d'
+** will reject ill-formed numerals.
+*/
+static void read_numeral (LexState *ls, SemInfo *seminfo) {
+ const char *expo = "Ee";
+ int first = ls->current;
+ lua_assert(lisdigit(ls->current));
+ save_and_next(ls);
+ if (first == '0' && check_next(ls, "Xx")) /* hexadecimal? */
+ expo = "Pp";
+ for (;;) {
+ if (check_next(ls, expo)) /* exponent part? */
+ check_next(ls, "+-"); /* optional exponent sign */
+ if (lisxdigit(ls->current) || ls->current == '.')
+ save_and_next(ls);
+ else break;
+ }
+ save(ls, '\0');
+ buffreplace(ls, '.', ls->decpoint); /* follow locale for decimal point */
+ if (!buff2d(ls->buff, &seminfo->r)) /* format error? */
+ trydecpoint(ls, seminfo); /* try to update decimal point separator */
+}
+
+
+/*
+** skip a sequence '[=*[' or ']=*]' and return its number of '='s, or
+** a negative number (-(count + 1)) if the sequence is malformed
+*/
+static int skip_sep (LexState *ls) {
+ int count = 0;
+ int s = ls->current;
+ lua_assert(s == '[' || s == ']');
+ save_and_next(ls);
+ while (ls->current == '=') {
+ save_and_next(ls);
+ count++;
+ }
+ return (ls->current == s) ? count : (-count) - 1;
+}
+
+
+static void read_long_string (LexState *ls, SemInfo *seminfo, int sep) {
+ save_and_next(ls); /* skip 2nd `[' */
+ if (currIsNewline(ls)) /* string starts with a newline? */
+ inclinenumber(ls); /* skip it */
+ for (;;) {
+ switch (ls->current) {
+ case EOZ:
+ lexerror(ls, (seminfo) ? "unfinished long string" :
+ "unfinished long comment", TK_EOS);
+ break; /* to avoid warnings */
+ case ']': {
+ if (skip_sep(ls) == sep) {
+ save_and_next(ls); /* skip 2nd `]' */
+ goto endloop;
+ }
+ break;
+ }
+ case '\n': case '\r': {
+ save(ls, '\n');
+ inclinenumber(ls);
+ if (!seminfo) luaZ_resetbuffer(ls->buff); /* avoid wasting space */
+ break;
+ }
+ default: {
+ if (seminfo) save_and_next(ls);
+ else next(ls);
+ }
+ }
+ } endloop:
+ if (seminfo)
+ seminfo->ts = luaX_newstring(ls, luaZ_buffer(ls->buff) + (2 + sep),
+ luaZ_bufflen(ls->buff) - 2*(2 + sep));
+}
+
+
+static void escerror (LexState *ls, int *c, int n, const char *msg) {
+ int i;
+ luaZ_resetbuffer(ls->buff); /* prepare error message */
+ save(ls, '\\');
+ for (i = 0; i < n && c[i] != EOZ; i++)
+ save(ls, c[i]);
+ lexerror(ls, msg, TK_STRING);
+}
+
+
+static int readhexaesc (LexState *ls) {
+ int c[3], i; /* keep input for error message */
+ int r = 0; /* result accumulator */
+ c[0] = 'x'; /* for error message */
+ for (i = 1; i < 3; i++) { /* read two hexadecimal digits */
+ c[i] = next(ls);
+ if (!lisxdigit(c[i]))
+ escerror(ls, c, i + 1, "hexadecimal digit expected");
+ r = (r << 4) + luaO_hexavalue(c[i]);
+ }
+ return r;
+}
+
+
+static int readdecesc (LexState *ls) {
+ int c[3], i;
+ int r = 0; /* result accumulator */
+ for (i = 0; i < 3 && lisdigit(ls->current); i++) { /* read up to 3 digits */
+ c[i] = ls->current;
+ r = 10*r + c[i] - '0';
+ next(ls);
+ }
+ if (r > UCHAR_MAX)
+ escerror(ls, c, i, "decimal escape too large");
+ return r;
+}
+
+
+static void read_string (LexState *ls, int del, SemInfo *seminfo) {
+ save_and_next(ls); /* keep delimiter (for error messages) */
+ while (ls->current != del) {
+ switch (ls->current) {
+ case EOZ:
+ lexerror(ls, "unfinished string", TK_EOS);
+ break; /* to avoid warnings */
+ case '\n':
+ case '\r':
+ lexerror(ls, "unfinished string", TK_STRING);
+ break; /* to avoid warnings */
+ case '\\': { /* escape sequences */
+ int c; /* final character to be saved */
+ next(ls); /* do not save the `\' */
+ switch (ls->current) {
+ case 'a': c = '\a'; goto read_save;
+ case 'b': c = '\b'; goto read_save;
+ case 'f': c = '\f'; goto read_save;
+ case 'n': c = '\n'; goto read_save;
+ case 'r': c = '\r'; goto read_save;
+ case 't': c = '\t'; goto read_save;
+ case 'v': c = '\v'; goto read_save;
+ case 'x': c = readhexaesc(ls); goto read_save;
+ case '\n': case '\r':
+ inclinenumber(ls); c = '\n'; goto only_save;
+ case '\\': case '\"': case '\'':
+ c = ls->current; goto read_save;
+ case EOZ: goto no_save; /* will raise an error next loop */
+ case 'z': { /* zap following span of spaces */
+ next(ls); /* skip the 'z' */
+ while (lisspace(ls->current)) {
+ if (currIsNewline(ls)) inclinenumber(ls);
+ else next(ls);
+ }
+ goto no_save;
+ }
+ default: {
+ if (!lisdigit(ls->current))
+ escerror(ls, &ls->current, 1, "invalid escape sequence");
+          /* decimal escape \ddd */
+ c = readdecesc(ls);
+ goto only_save;
+ }
+ }
+ read_save: next(ls); /* read next character */
+ only_save: save(ls, c); /* save 'c' */
+ no_save: break;
+ }
+ default:
+ save_and_next(ls);
+ }
+ }
+ save_and_next(ls); /* skip delimiter */
+ seminfo->ts = luaX_newstring(ls, luaZ_buffer(ls->buff) + 1,
+ luaZ_bufflen(ls->buff) - 2);
+}
+
+
+static int llex (LexState *ls, SemInfo *seminfo) {
+ luaZ_resetbuffer(ls->buff);
+ for (;;) {
+ switch (ls->current) {
+ case '\n': case '\r': { /* line breaks */
+ inclinenumber(ls);
+ break;
+ }
+ case ' ': case '\f': case '\t': case '\v': { /* spaces */
+ next(ls);
+ break;
+ }
+ case '-': { /* '-' or '--' (comment) */
+ next(ls);
+ if (ls->current != '-') return '-';
+ /* else is a comment */
+ next(ls);
+ if (ls->current == '[') { /* long comment? */
+ int sep = skip_sep(ls);
+ luaZ_resetbuffer(ls->buff); /* `skip_sep' may dirty the buffer */
+ if (sep >= 0) {
+ read_long_string(ls, NULL, sep); /* skip long comment */
+ luaZ_resetbuffer(ls->buff); /* previous call may dirty the buff. */
+ break;
+ }
+ }
+ /* else short comment */
+ while (!currIsNewline(ls) && ls->current != EOZ)
+ next(ls); /* skip until end of line (or end of file) */
+ break;
+ }
+ case '[': { /* long string or simply '[' */
+ int sep = skip_sep(ls);
+ if (sep >= 0) {
+ read_long_string(ls, seminfo, sep);
+ return TK_STRING;
+ }
+ else if (sep == -1) return '[';
+ else lexerror(ls, "invalid long string delimiter", TK_STRING);
+ }
+ case '=': {
+ next(ls);
+ if (ls->current != '=') return '=';
+ else { next(ls); return TK_EQ; }
+ }
+ case '<': {
+ next(ls);
+ if (ls->current != '=') return '<';
+ else { next(ls); return TK_LE; }
+ }
+ case '>': {
+ next(ls);
+ if (ls->current != '=') return '>';
+ else { next(ls); return TK_GE; }
+ }
+ case '~': {
+ next(ls);
+ if (ls->current != '=') return '~';
+ else { next(ls); return TK_NE; }
+ }
+ case ':': {
+ next(ls);
+ if (ls->current != ':') return ':';
+ else { next(ls); return TK_DBCOLON; }
+ }
+ case '"': case '\'': { /* short literal strings */
+ read_string(ls, ls->current, seminfo);
+ return TK_STRING;
+ }
+ case '.': { /* '.', '..', '...', or number */
+ save_and_next(ls);
+ if (check_next(ls, ".")) {
+ if (check_next(ls, "."))
+ return TK_DOTS; /* '...' */
+ else return TK_CONCAT; /* '..' */
+ }
+ else if (!lisdigit(ls->current)) return '.';
+ /* else go through */
+ }
+ /* FALLTHROUGH */
+ case '0': case '1': case '2': case '3': case '4':
+ case '5': case '6': case '7': case '8': case '9': {
+ read_numeral(ls, seminfo);
+ return TK_NUMBER;
+ }
+ case EOZ: {
+ return TK_EOS;
+ }
+ default: {
+ if (lislalpha(ls->current)) { /* identifier or reserved word? */
+ TString *ts;
+ do {
+ save_and_next(ls);
+ } while (lislalnum(ls->current));
+ ts = luaX_newstring(ls, luaZ_buffer(ls->buff),
+ luaZ_bufflen(ls->buff));
+ seminfo->ts = ts;
+ if (isreserved(ts)) /* reserved word? */
+ return ts->tsv.extra - 1 + FIRST_RESERVED;
+ else {
+ return TK_NAME;
+ }
+ }
+ else { /* single-char tokens (+ - / ...) */
+ int c = ls->current;
+ next(ls);
+ return c;
+ }
+ }
+ }
+ }
+}
+
+
+void luaX_next (LexState *ls) {
+ ls->lastline = ls->linenumber;
+ if (ls->lookahead.token != TK_EOS) { /* is there a look-ahead token? */
+ ls->t = ls->lookahead; /* use this one */
+ ls->lookahead.token = TK_EOS; /* and discharge it */
+ }
+ else
+ ls->t.token = llex(ls, &ls->t.seminfo); /* read next token */
+}
+
+
+int luaX_lookahead (LexState *ls) {
+ lua_assert(ls->lookahead.token == TK_EOS);
+ ls->lookahead.token = llex(ls, &ls->lookahead.seminfo);
+ return ls->lookahead.token;
+}
+
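skip_sep's return convention packs two answers into one int: a count of '='s for a well-formed long-bracket opener, and -(count+1) otherwise (so -1 means a plain '[' and -2 means '[=' with no following bracket). A stand-alone re-creation that reads from a C string instead of the lexer's ZIO stream:

    #include <stdio.h>

    /* count '='s after '[' or ']'; negative means "not a long bracket" */
    static int skip_sep (const char **s) {
      int count = 0;
      char open = **s;                   /* '[' or ']' */
      (*s)++;
      while (**s == '=') { (*s)++; count++; }
      return (**s == open) ? count : (-count) - 1;
    }

    int main (void) {
      const char *a = "[==[", *b = "[x", *c = "[=x";
      printf("%d %d %d\n", skip_sep(&a), skip_sep(&b), skip_sep(&c));
      return 0;                          /* prints: 2 -1 -2 */
    }
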
diff --git a/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/lua/llex.h b/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/lua/llex.h
new file mode 100644
index 000000000000..a4acdd30218a
--- /dev/null
+++ b/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/lua/llex.h
@@ -0,0 +1,78 @@
+/*
+** $Id: llex.h,v 1.72.1.1 2013/04/12 18:48:47 roberto Exp $
+** Lexical Analyzer
+** See Copyright Notice in lua.h
+*/
+
+#ifndef llex_h
+#define llex_h
+
+#include "lobject.h"
+#include "lzio.h"
+
+
+#define FIRST_RESERVED 257
+
+
+
+/*
+* WARNING: if you change the order of this enumeration,
+* grep "ORDER RESERVED"
+*/
+enum RESERVED {
+ /* terminal symbols denoted by reserved words */
+ TK_AND = FIRST_RESERVED, TK_BREAK,
+ TK_DO, TK_ELSE, TK_ELSEIF, TK_END, TK_FALSE, TK_FOR, TK_FUNCTION,
+ TK_GOTO, TK_IF, TK_IN, TK_LOCAL, TK_NIL, TK_NOT, TK_OR, TK_REPEAT,
+ TK_RETURN, TK_THEN, TK_TRUE, TK_UNTIL, TK_WHILE,
+ /* other terminal symbols */
+ TK_CONCAT, TK_DOTS, TK_EQ, TK_GE, TK_LE, TK_NE, TK_DBCOLON, TK_EOS,
+ TK_NUMBER, TK_NAME, TK_STRING
+};
+
+/* number of reserved words */
+#define NUM_RESERVED (cast(int, TK_WHILE-FIRST_RESERVED+1))
+
+
+typedef union {
+ lua_Number r;
+ TString *ts;
+} SemInfo; /* semantics information */
+
+
+typedef struct Token {
+ int token;
+ SemInfo seminfo;
+} Token;
+
+
+/* state of the lexer plus state of the parser, shared by all parsing
+   functions */
+typedef struct LexState {
+ int current; /* current character (charint) */
+ int linenumber; /* input line counter */
+ int lastline; /* line of last token `consumed' */
+ Token t; /* current token */
+ Token lookahead; /* look ahead token */
+ struct FuncState *fs; /* current function (parser) */
+ struct lua_State *L;
+ ZIO *z; /* input stream */
+ Mbuffer *buff; /* buffer for tokens */
+ struct Dyndata *dyd; /* dynamic structures used by the parser */
+ TString *source; /* current source name */
+ TString *envn; /* environment variable name */
+ char decpoint; /* locale decimal point */
+} LexState;
+
+
+LUAI_FUNC void luaX_init (lua_State *L);
+LUAI_FUNC void luaX_setinput (lua_State *L, LexState *ls, ZIO *z,
+ TString *source, int firstchar);
+LUAI_FUNC TString *luaX_newstring (LexState *ls, const char *str, size_t l);
+LUAI_FUNC void luaX_next (LexState *ls);
+LUAI_FUNC int luaX_lookahead (LexState *ls);
+LUAI_FUNC l_noret luaX_syntaxerror (LexState *ls, const char *s);
+LUAI_FUNC const char *luaX_token2str (LexState *ls, int token);
+
+
+#endif
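A note on the numbering scheme above: single-character tokens are encoded
as their own byte values, so the named tokens start at FIRST_RESERVED
(257), safely above any char. A reserved word's TString stores its 1-based
index into this enum in the `extra' field, which is why llex.c returns
`ts->tsv.extra - 1 + FIRST_RESERVED'. A standalone sketch of that mapping
(the table and the TK_NAME stand-in are illustrative, not the real data
structures):

#include <stdio.h>
#include <string.h>

#define FIRST_RESERVED 257

/* must match the order of enum RESERVED ("ORDER RESERVED") */
static const char *const reserved[] = {
  "and", "break", "do", "else", "elseif", "end", "false", "for",
  "function", "goto", "if", "in", "local", "nil", "not", "or",
  "repeat", "return", "then", "true", "until", "while"
};

/* token code for a name: a reserved-word token if it matches, else
   TK_NAME (22 reserved words plus 9 other tokens precede TK_NAME) */
static int name2token(const char *name) {
  size_t i;
  for (i = 0; i < sizeof(reserved)/sizeof(reserved[0]); i++)
    if (strcmp(name, reserved[i]) == 0)
      return (int)i + FIRST_RESERVED;  /* cf. extra - 1 + FIRST_RESERVED */
  return FIRST_RESERVED + 22 + 9;      /* TK_NAME */
}

int main(void) {
  printf("and   -> %d\n", name2token("and"));    /* 257, i.e. TK_AND */
  printf("while -> %d\n", name2token("while"));  /* 278, i.e. TK_WHILE */
  printf("foo   -> %d\n", name2token("foo"));    /* 288, i.e. TK_NAME */
  return 0;
}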
diff --git a/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/lua/llimits.h b/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/lua/llimits.h
new file mode 100644
index 000000000000..4277c1fd03db
--- /dev/null
+++ b/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/lua/llimits.h
@@ -0,0 +1,308 @@
+/*
+** $Id: llimits.h,v 1.103.1.1 2013/04/12 18:48:47 roberto Exp $
+** Limits, basic types, and some other `installation-dependent' definitions
+** See Copyright Notice in lua.h
+*/
+
+#ifndef llimits_h
+#define llimits_h
+
+
+#include <sys/zfs_context.h>
+
+#include "lua.h"
+
+
+typedef unsigned LUA_INT32 lu_int32;
+
+typedef LUAI_UMEM lu_mem;
+
+typedef LUAI_MEM l_mem;
+
+
+
+/* chars used as small naturals (so that `char' is reserved for characters) */
+typedef unsigned char lu_byte;
+
+
+#define MAX_SIZET ((size_t)(~(size_t)0)-2)
+
+#define MAX_LUMEM ((lu_mem)(~(lu_mem)0)-2)
+
+#define MAX_LMEM ((l_mem) ((MAX_LUMEM >> 1) - 2))
+
+
+#define MAX_INT (INT_MAX-2) /* maximum value of an int (-2 for safety) */
+
+/*
+** conversion of pointer to integer
+** this is for hashing only; there is no problem if the integer
+** cannot hold the whole pointer value
+*/
+#define IntPoint(p) ((unsigned int)(lu_mem)(p))
+
+
+
+/* type to ensure maximum alignment */
+#if !defined(LUAI_USER_ALIGNMENT_T)
+#define LUAI_USER_ALIGNMENT_T union { double u; void *s; long l; }
+#endif
+
+typedef LUAI_USER_ALIGNMENT_T L_Umaxalign;
+
+
+/* result of a `usual argument conversion' over lua_Number */
+typedef LUAI_UACNUMBER l_uacNumber;
+
+
+/* internal assertions for in-house debugging */
+#if defined(lua_assert)
+#define check_exp(c,e) (lua_assert(c), (e))
+/* to avoid problems with conditions too long */
+#define lua_longassert(c) { if (!(c)) lua_assert(0); }
+#else
+#define lua_assert(c) ((void)0)
+#define check_exp(c,e) (e)
+#define lua_longassert(c) ((void)0)
+#endif
+
+/*
+** assertion for checking API calls
+*/
+#if !defined(luai_apicheck)
+
+#if defined(LUA_USE_APICHECK)
+#include <assert.h>
+#define luai_apicheck(L,e) assert(e)
+#else
+#define luai_apicheck(L,e) lua_assert(e)
+#endif
+
+#endif
+
+#define api_check(l,e,msg) luai_apicheck(l,(e) && msg)
+
+
+#if !defined(UNUSED)
+#define UNUSED(x) ((void)(x)) /* to avoid warnings */
+#endif
+
+
+#define cast(t, exp) ((t)(exp))
+
+#define cast_byte(i) cast(lu_byte, (i))
+#define cast_num(i) cast(lua_Number, (i))
+#define cast_int(i) cast(int, (i))
+#define cast_uchar(i) cast(unsigned char, (i))
+
+
+/*
+** non-return type
+*/
+#if defined(__GNUC__)
+#define l_noret void __attribute__((noreturn))
+#elif defined(_MSC_VER)
+#define l_noret void __declspec(noreturn)
+#else
+#define l_noret void
+#endif
+
+
+
+/*
+** maximum depth for nested C calls and syntactical nested non-terminals
+** in a program. (Value must fit in an unsigned short int.)
+**
+** Note: on the amd64 platform the limit has been measured to be 45. We set
+** the maximum lower to leave a margin for changes in the amount of stack
+** used by the various functions involved in parsing and executing code.
+*/
+#if !defined(LUAI_MAXCCALLS)
+#define LUAI_MAXCCALLS 20
+#endif
+
+/*
+** maximum number of upvalues in a closure (both C and Lua). (Value
+** must fit in an unsigned char.)
+*/
+#define MAXUPVAL UCHAR_MAX
+
+
+/*
+** type for virtual-machine instructions
+** must be an unsigned with (at least) 4 bytes (see details in lopcodes.h)
+*/
+typedef lu_int32 Instruction;
+
+
+
+/* maximum stack for a Lua function */
+#define MAXSTACK 250
+
+
+
+/* minimum size for the string table (must be power of 2) */
+#if !defined(MINSTRTABSIZE)
+#define MINSTRTABSIZE 32
+#endif
+
+
+/* minimum size for string buffer */
+#if !defined(LUA_MINBUFFER)
+#define LUA_MINBUFFER 32
+#endif
+
+
+#if !defined(lua_lock)
+#define lua_lock(L) ((void) 0)
+#define lua_unlock(L) ((void) 0)
+#endif
+
+#if !defined(luai_threadyield)
+#define luai_threadyield(L) {lua_unlock(L); lua_lock(L);}
+#endif
+
+
+/*
+** these macros allow user-specific actions on threads when you define
+** LUAI_EXTRASPACE and need to do something extra when a thread is
+** created/deleted/resumed/yielded.
+*/
+#if !defined(luai_userstateopen)
+#define luai_userstateopen(L) ((void)L)
+#endif
+
+#if !defined(luai_userstateclose)
+#define luai_userstateclose(L) ((void)L)
+#endif
+
+#if !defined(luai_userstatethread)
+#define luai_userstatethread(L,L1) ((void)L)
+#endif
+
+#if !defined(luai_userstatefree)
+#define luai_userstatefree(L,L1) ((void)L)
+#endif
+
+#if !defined(luai_userstateresume)
+#define luai_userstateresume(L,n) ((void)L)
+#endif
+
+#if !defined(luai_userstateyield)
+#define luai_userstateyield(L,n) ((void)L)
+#endif
+
+/*
+** lua_number2int is a macro to convert lua_Number to int.
+** lua_number2integer is a macro to convert lua_Number to lua_Integer.
+** lua_number2unsigned is a macro to convert a lua_Number to a lua_Unsigned.
+** lua_unsigned2number is a macro to convert a lua_Unsigned to a lua_Number.
+** luai_hashnum is a macro to hash a lua_Number value into an integer.
+** The hash must be deterministic and give reasonable values for
+** both small and large values (outside the range of integers).
+*/
+
+#if defined(MS_ASMTRICK) || defined(LUA_MSASMTRICK) /* { */
+/* trick with Microsoft assembler for X86 */
+
+#define lua_number2int(i,n) __asm {__asm fld n __asm fistp i}
+#define lua_number2integer(i,n) lua_number2int(i, n)
+#define lua_number2unsigned(i,n) \
+ {__int64 l; __asm {__asm fld n __asm fistp l} i = (unsigned int)l;}
+
+
+#elif defined(LUA_IEEE754TRICK) /* }{ */
+/* the next trick should work on any machine using IEEE754 with
+ a 32-bit int type */
+
+union luai_Cast { double l_d; LUA_INT32 l_p[2]; };
+
+#if !defined(LUA_IEEEENDIAN) /* { */
+#define LUAI_EXTRAIEEE \
+ static const union luai_Cast ieeeendian = {-(33.0 + 6755399441055744.0)};
+#define LUA_IEEEENDIANLOC (ieeeendian.l_p[1] == 33)
+#else
+#define LUA_IEEEENDIANLOC LUA_IEEEENDIAN
+#define LUAI_EXTRAIEEE /* empty */
+#endif /* } */
+
+#define lua_number2int32(i,n,t) \
+ { LUAI_EXTRAIEEE \
+ volatile union luai_Cast u; u.l_d = (n) + 6755399441055744.0; \
+ (i) = (t)u.l_p[LUA_IEEEENDIANLOC]; }
+
+#define luai_hashnum(i,n) \
+ { volatile union luai_Cast u; u.l_d = (n) + 1.0; /* avoid -0 */ \
+ (i) = u.l_p[0]; (i) += u.l_p[1]; } /* add both halves of the double for its hash */
+
+#define lua_number2int(i,n) lua_number2int32(i, n, int)
+#define lua_number2unsigned(i,n) lua_number2int32(i, n, lua_Unsigned)
+
+/* the trick can be expanded to lua_Integer when it is a 32-bit value */
+#if defined(LUA_IEEELL)
+#define lua_number2integer(i,n) lua_number2int32(i, n, lua_Integer)
+#endif
+
+#endif /* } */
+
+
+/* the following definitions always work, but may be slow */
+
+#if !defined(lua_number2int)
+#define lua_number2int(i,n) ((i)=(int)(n))
+#endif
+
+#if !defined(lua_number2integer)
+#define lua_number2integer(i,n) ((i)=(lua_Integer)(n))
+#endif
+
+#if !defined(lua_number2unsigned) /* { */
+/* the following definition ensures proper modulo behavior */
+#if defined(LUA_NUMBER_DOUBLE) || defined(LUA_NUMBER_FLOAT)
+#include <math.h>
+#define SUPUNSIGNED ((lua_Number)(~(lua_Unsigned)0) + 1)
+#define lua_number2unsigned(i,n) \
+ ((i)=(lua_Unsigned)((n) - floor((n)/SUPUNSIGNED)*SUPUNSIGNED))
+#else
+#define lua_number2unsigned(i,n) ((i)=(lua_Unsigned)(n))
+#endif
+#endif /* } */
+
+
+#if !defined(lua_unsigned2number)
+/* on several machines, coercion from unsigned to double is slow,
+   so it may be worth avoiding */
+#define lua_unsigned2number(u) \
+ (((u) <= (lua_Unsigned)INT_MAX) ? (lua_Number)(int)(u) : (lua_Number)(u))
+#endif
+
+
+
+#if defined(ltable_c) && !defined(luai_hashnum)
+
+extern int lcompat_hashnum(int64_t);
+
+#define luai_hashnum(i,n) (i = lcompat_hashnum(n))
+
+#endif
+
+
+
+/*
+** macro to control inclusion of some hard tests on stack reallocation
+*/
+#if !defined(HARDSTACKTESTS)
+#define condmovestack(L) ((void)0)
+#else
+/* realloc stack keeping its size */
+#define condmovestack(L) luaD_reallocstack((L), (L)->stacksize)
+#endif
+
+#if !defined(HARDMEMTESTS)
+#define condchangemem(L) condmovestack(L)
+#else
+#define condchangemem(L) \
+ ((void)(!(G(L)->gcrunning) || (luaC_fullgc(L, 0), 1)))
+#endif
+
+#endif
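The LUA_IEEE754TRICK branch above converts a double to an integer without
a slow FPU store-and-truncate: adding 2^52 + 2^51 (6755399441055744.0)
shifts the integer part of the value into the low mantissa bits. A
standalone sketch of the trick; it assumes little-endian IEEE754 doubles
with 32-bit ints, and note that, unlike a C cast, it rounds to nearest
instead of truncating:

#include <stdio.h>
#include <stdint.h>

#define MAGIC 6755399441055744.0  /* 2^52 + 2^51 */

union dcast { double d; uint32_t p[2]; };

static int number2int(double n) {
  volatile union dcast u;  /* volatile defeats over-eager optimizers */
  u.d = n + MAGIC;
  return (int)u.p[0];      /* low word on a little-endian machine */
}

int main(void) {
  printf("%d\n", number2int(1234.0));  /* 1234 */
  printf("%d\n", number2int(-56.0));   /* -56 */
  printf("%d\n", number2int(3.7));     /* 4: rounds, does not truncate */
  return 0;
}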
diff --git a/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/lua/lmem.c b/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/lua/lmem.c
new file mode 100644
index 000000000000..0d070fbde83c
--- /dev/null
+++ b/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/lua/lmem.c
@@ -0,0 +1,99 @@
+/*
+** $Id: lmem.c,v 1.84.1.1 2013/04/12 18:48:47 roberto Exp $
+** Interface to Memory Manager
+** See Copyright Notice in lua.h
+*/
+
+
+#include <sys/zfs_context.h>
+
+#define lmem_c
+#define LUA_CORE
+
+#include "lua.h"
+
+#include "ldebug.h"
+#include "ldo.h"
+#include "lgc.h"
+#include "lmem.h"
+#include "lobject.h"
+#include "lstate.h"
+
+
+
+/*
+** About the realloc function:
+** void * frealloc (void *ud, void *ptr, size_t osize, size_t nsize);
+** (`osize' is the old size, `nsize' is the new size)
+**
+** * frealloc(ud, NULL, x, s) creates a new block of size `s'
+** (regardless of `x').
+**
+** * frealloc(ud, p, x, 0) frees the block `p'
+** (in this specific case, frealloc must return NULL);
+** particularly, frealloc(ud, NULL, 0, 0) does nothing
+** (which is equivalent to free(NULL) in ANSI C)
+**
+** frealloc returns NULL if it cannot create or reallocate the area
+** (any reallocation to an equal or smaller size cannot fail!)
+*/
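/*
** A sketch of an allocator satisfying the contract above (illustrative
** only, not part of this commit); it is essentially what a libc-backed
** lua_Alloc looks like. `ud' and `osize' are unused here.
*/
#include <stdlib.h>

static void *demo_frealloc(void *ud, void *ptr, size_t osize, size_t nsize) {
  (void)ud; (void)osize;
  if (nsize == 0) {            /* request to free: must return NULL */
    free(ptr);                 /* free(NULL) is a no-op, as required */
    return NULL;
  }
  return realloc(ptr, nsize);  /* create (ptr == NULL) or resize */
}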
+
+
+
+#define MINSIZEARRAY 4
+
+
+void *luaM_growaux_ (lua_State *L, void *block, int *size, size_t size_elems,
+ int limit, const char *what) {
+ void *newblock;
+ int newsize;
+ if (*size >= limit/2) { /* cannot double it? */
+ if (*size >= limit) /* cannot grow even a little? */
+ luaG_runerror(L, "too many %s (limit is %d)", what, limit);
+ newsize = limit; /* still have at least one free place */
+ }
+ else {
+ newsize = (*size)*2;
+ if (newsize < MINSIZEARRAY)
+ newsize = MINSIZEARRAY; /* minimum size */
+ }
+ newblock = luaM_reallocv(L, block, *size, newsize, size_elems);
+ *size = newsize; /* update only when everything else is OK */
+ return newblock;
+}
+
+
+l_noret luaM_toobig (lua_State *L) {
+ luaG_runerror(L, "memory allocation error: block too big");
+}
+
+
+
+/*
+** generic allocation routine.
+*/
+void *luaM_realloc_ (lua_State *L, void *block, size_t osize, size_t nsize) {
+ void *newblock;
+ global_State *g = G(L);
+ size_t realosize = (block) ? osize : 0;
+ lua_assert((realosize == 0) == (block == NULL));
+#if defined(HARDMEMTESTS)
+ if (nsize > realosize && g->gcrunning)
+ luaC_fullgc(L, 1); /* force a GC whenever possible */
+#endif
+ newblock = (*g->frealloc)(g->ud, block, osize, nsize);
+ if (newblock == NULL && nsize > 0) {
+ api_check(L, nsize > realosize,
+ "realloc cannot fail when shrinking a block");
+ if (g->gcrunning) {
+ luaC_fullgc(L, 1); /* try to free some memory... */
+ newblock = (*g->frealloc)(g->ud, block, osize, nsize); /* try again */
+ }
+ if (newblock == NULL)
+ luaD_throw(L, LUA_ERRMEM);
+ }
+ lua_assert((nsize == 0) == (newblock == NULL));
+ g->GCdebt = (g->GCdebt + nsize) - realosize;
+ return newblock;
+}
+
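The growth policy in luaM_growaux_ above doubles the capacity until that
would overshoot the limit, then clamps to the limit and only errors out
once the array is completely full. A standalone sketch of just the size
computation (nextsize() is a hypothetical helper, not a Lua function):

#include <stdio.h>

#define MINSIZEARRAY 4

/* next capacity, computed the way luaM_growaux_ does; returns -1 where
   the real function would raise "too many %s (limit is %d)" */
static int nextsize(int size, int limit) {
  if (size >= limit/2) {   /* cannot double it? */
    if (size >= limit)     /* cannot grow even a little? */
      return -1;
    return limit;          /* still leaves at least one free place */
  }
  else {
    int n = size * 2;
    return (n < MINSIZEARRAY) ? MINSIZEARRAY : n;
  }
}

int main(void) {
  int cap = 0;
  while (cap < 200) {
    cap = nextsize(cap, 200);
    printf("%d ", cap);    /* prints: 4 8 16 32 64 128 200 */
  }
  printf("\n");
  return 0;
}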
diff --git a/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/lua/lmem.h b/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/lua/lmem.h
new file mode 100644
index 000000000000..c75a3d50984a
--- /dev/null
+++ b/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/lua/lmem.h
@@ -0,0 +1,57 @@
+/*
+** $Id: lmem.h,v 1.40.1.1 2013/04/12 18:48:47 roberto Exp $
+** Interface to Memory Manager
+** See Copyright Notice in lua.h
+*/
+
+#ifndef lmem_h
+#define lmem_h
+
+
+#include <sys/zfs_context.h>
+
+#include "llimits.h"
+#include "lua.h"
+
+
+/*
+** This macro avoids the runtime division MAX_SIZET/(e), as 'e' is
+** always constant.
+** The macro is somewhat complex to avoid warnings:
+** +1 avoids warnings of "comparison has constant result";
+** cast to 'void' avoids warnings of "value unused".
+*/
+#define luaM_reallocv(L,b,on,n,e) \
+ (cast(void, \
+ (cast(size_t, (n)+1) > MAX_SIZET/(e)) ? (luaM_toobig(L), 0) : 0), \
+ luaM_realloc_(L, (b), (on)*(e), (n)*(e)))
+
+#define luaM_freemem(L, b, s) luaM_realloc_(L, (b), (s), 0)
+#define luaM_free(L, b) luaM_realloc_(L, (b), sizeof(*(b)), 0)
+#define luaM_freearray(L, b, n) luaM_reallocv(L, (b), n, 0, sizeof((b)[0]))
+
+#define luaM_malloc(L,s) luaM_realloc_(L, NULL, 0, (s))
+#define luaM_new(L,t) cast(t *, luaM_malloc(L, sizeof(t)))
+#define luaM_newvector(L,n,t) \
+ cast(t *, luaM_reallocv(L, NULL, 0, n, sizeof(t)))
+
+#define luaM_newobject(L,tag,s) luaM_realloc_(L, NULL, tag, (s))
+
+#define luaM_growvector(L,v,nelems,size,t,limit,e) \
+ if ((nelems)+1 > (size)) \
+ ((v)=cast(t *, luaM_growaux_(L,v,&(size),sizeof(t),limit,e)))
+
+#define luaM_reallocvector(L, v,oldn,n,t) \
+ ((v)=cast(t *, luaM_reallocv(L, v, oldn, n, sizeof(t))))
+
+LUAI_FUNC l_noret luaM_toobig (lua_State *L);
+
+/* not to be called directly */
+LUAI_FUNC void *luaM_realloc_ (lua_State *L, void *block, size_t oldsize,
+ size_t size);
+LUAI_FUNC void *luaM_growaux_ (lua_State *L, void *block, int *size,
+ size_t size_elem, int limit,
+ const char *what);
+
+#endif
+
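The `(n)+1 > MAX_SIZET/(e)' test in luaM_reallocv above rejects any element
count whose total byte size would overflow size_t, before the
multiplication `(n)*(e)' is ever evaluated. A standalone sketch of the same
guard (alloc_array() is a hypothetical helper):

#include <stdio.h>
#include <stdlib.h>
#include <stdint.h>

/* overflow-checked array allocation in the spirit of luaM_reallocv */
static void *alloc_array(size_t n, size_t elem) {
  if (n + 1 > SIZE_MAX / elem)  /* same shape as the macro's test */
    return NULL;                /* where Lua would call luaM_toobig */
  return malloc(n * elem);
}

int main(void) {
  void *ok = alloc_array(1000, sizeof(double));
  void *bad = alloc_array(SIZE_MAX / 2, sizeof(double));
  printf("ok=%s bad=%s\n", ok ? "non-NULL" : "NULL",
         bad ? "non-NULL" : "NULL");  /* ok=non-NULL bad=NULL */
  free(ok);
  return 0;
}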
diff --git a/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/lua/lobject.c b/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/lua/lobject.c
new file mode 100644
index 000000000000..339c84d21d79
--- /dev/null
+++ b/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/lua/lobject.c
@@ -0,0 +1,283 @@
+/*
+** $Id: lobject.c,v 2.58.1.1 2013/04/12 18:48:47 roberto Exp $
+** Some generic functions over Lua objects
+** See Copyright Notice in lua.h
+*/
+
+#include <sys/zfs_context.h>
+
+#define lobject_c
+#define LUA_CORE
+
+#include "lua.h"
+
+#include "lctype.h"
+#include "ldebug.h"
+#include "ldo.h"
+#include "lmem.h"
+#include "lobject.h"
+#include "lstate.h"
+#include "lstring.h"
+#include "lvm.h"
+
+
+
+LUAI_DDEF const TValue luaO_nilobject_ = {NILCONSTANT};
+
+
+/*
+** converts an integer to a "floating point byte", represented as
+** (eeeeexxx), where the real value is (1xxx) * 2^(eeeee - 1) if
+** eeeee != 0 and (xxx) otherwise.
+*/
+int luaO_int2fb (unsigned int x) {
+ int e = 0; /* exponent */
+ if (x < 8) return x;
+ while (x >= 0x10) {
+ x = (x+1) >> 1;
+ e++;
+ }
+ return ((e+1) << 3) | (cast_int(x) - 8);
+}
+
+
+/* converts back */
+int luaO_fb2int (int x) {
+ int e = (x >> 3) & 0x1f;
+ if (e == 0) return x;
+ else return ((x & 7) + 8) << (e - 1);
+}
+
+
+int luaO_ceillog2 (unsigned int x) {
+ static const lu_byte log_2[256] = {
+ 0,1,2,2,3,3,3,3,4,4,4,4,4,4,4,4,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,
+ 6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,
+ 7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,
+ 7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,
+ 8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,
+ 8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,
+ 8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,
+ 8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8
+ };
+ int l = 0;
+ x--;
+ while (x >= 256) { l += 8; x >>= 8; }
+ return l + log_2[x];
+}
+
+
+lua_Number luaO_arith (int op, lua_Number v1, lua_Number v2) {
+ switch (op) {
+ case LUA_OPADD: return luai_numadd(NULL, v1, v2);
+ case LUA_OPSUB: return luai_numsub(NULL, v1, v2);
+ case LUA_OPMUL: return luai_nummul(NULL, v1, v2);
+ case LUA_OPDIV: return luai_numdiv(NULL, v1, v2);
+ case LUA_OPMOD: return luai_nummod(NULL, v1, v2);
+ case LUA_OPPOW: return luai_numpow(NULL, v1, v2);
+ case LUA_OPUNM: return luai_numunm(NULL, v1);
+ default: lua_assert(0); return 0;
+ }
+}
+
+
+int luaO_hexavalue (int c) {
+ if (lisdigit(c)) return c - '0';
+ else return ltolower(c) - 'a' + 10;
+}
+
+
+#if !defined(lua_strx2number)
+
+
+
+static int isneg (const char **s) {
+ if (**s == '-') { (*s)++; return 1; }
+ else if (**s == '+') (*s)++;
+ return 0;
+}
+
+
+static lua_Number readhexa (const char **s, lua_Number r, int *count) {
+ for (; lisxdigit(cast_uchar(**s)); (*s)++) { /* read integer part */
+ r = (r * cast_num(16.0)) + cast_num(luaO_hexavalue(cast_uchar(**s)));
+ (*count)++;
+ }
+ return r;
+}
+
+
+/*
+** convert a hexadecimal numeric string to a number, following
+** C99 specification for 'strtod'
+*/
+static lua_Number lua_strx2number (const char *s, char **endptr) {
+ lua_Number r = 0.0;
+ int e = 0, i = 0;
+ int neg = 0; /* 1 if number is negative */
+ *endptr = cast(char *, s); /* nothing is valid yet */
+ while (lisspace(cast_uchar(*s))) s++; /* skip initial spaces */
+ neg = isneg(&s); /* check sign */
+ if (!(*s == '0' && (*(s + 1) == 'x' || *(s + 1) == 'X'))) /* check '0x' */
+ return 0.0; /* invalid format (no '0x') */
+ s += 2; /* skip '0x' */
+ r = readhexa(&s, r, &i); /* read integer part */
+ if (*s == '.') {
+ s++; /* skip dot */
+ r = readhexa(&s, r, &e); /* read fractional part */
+ }
+ if (i == 0 && e == 0)
+ return 0.0; /* invalid format (no digit) */
+ e *= -4; /* each fractional digit multiplies the value by 2^-4 */
+ *endptr = cast(char *, s); /* valid up to here */
+ if (*s == 'p' || *s == 'P') { /* exponent part? */
+ int exp1 = 0;
+ int neg1;
+ s++; /* skip 'p' */
+ neg1 = isneg(&s); /* sign of exponent */
+ if (!lisdigit(cast_uchar(*s)))
+ goto ret; /* must have at least one digit */
+ while (lisdigit(cast_uchar(*s))) /* read exponent */
+ exp1 = exp1 * 10 + *(s++) - '0';
+ if (neg1) exp1 = -exp1;
+ e += exp1;
+ }
+ *endptr = cast(char *, s); /* valid up to here */
+ ret:
+ if (neg) r = -r;
+ return (r * (1 << e));
+}
+
+#endif
+
+
+int luaO_str2d (const char *s, size_t len, lua_Number *result) {
+ char *endptr;
+ if (strpbrk(s, "nN")) /* reject 'inf' and 'nan' */
+ return 0;
+ else if (strpbrk(s, "xX")) /* hexa? */
+ *result = lua_strx2number(s, &endptr);
+ else
+ *result = lua_str2number(s, &endptr);
+ if (endptr == s) return 0; /* nothing recognized */
+ while (lisspace(cast_uchar(*endptr))) endptr++;
+ return (endptr == s + len); /* OK if no trailing characters */
+}
+
+
+
+static void pushstr (lua_State *L, const char *str, size_t l) {
+ setsvalue2s(L, L->top++, luaS_newlstr(L, str, l));
+}
+
+
+/* this function handles only `%d', `%c', `%f', `%p', and `%s' formats */
+const char *luaO_pushvfstring (lua_State *L, const char *fmt, va_list argp) {
+ int n = 0;
+ for (;;) {
+ const char *e = strchr(fmt, '%');
+ if (e == NULL) break;
+ luaD_checkstack(L, 2); /* fmt + item */
+ pushstr(L, fmt, e - fmt);
+ switch (*(e+1)) {
+ case 's': {
+ const char *s = va_arg(argp, char *);
+ if (s == NULL) s = "(null)";
+ pushstr(L, s, strlen(s));
+ break;
+ }
+ case 'c': {
+ char buff;
+ buff = cast(char, va_arg(argp, int));
+ pushstr(L, &buff, 1);
+ break;
+ }
+ case 'd': {
+ setnvalue(L->top++, cast_num(va_arg(argp, int)));
+ break;
+ }
+ case 'f': {
+ setnvalue(L->top++, cast_num(va_arg(argp, l_uacNumber)));
+ break;
+ }
+ case 'p': {
+ char buff[4*sizeof(void *) + 8]; /* should be enough space for a `%p' */
+ int l = lcompat_sprintf(buff, "%p", va_arg(argp, void *));
+ pushstr(L, buff, l);
+ break;
+ }
+ case '%': {
+ pushstr(L, "%", 1);
+ break;
+ }
+ default: {
+ luaG_runerror(L,
+ "invalid option " LUA_QL("%%%c") " to " LUA_QL("lua_pushfstring"),
+ *(e + 1));
+ }
+ }
+ n += 2;
+ fmt = e+2;
+ }
+ luaD_checkstack(L, 1);
+ pushstr(L, fmt, strlen(fmt));
+ if (n > 0) luaV_concat(L, n + 1);
+ return svalue(L->top - 1);
+}
+
+
+const char *luaO_pushfstring (lua_State *L, const char *fmt, ...) {
+ const char *msg;
+ va_list argp;
+ va_start(argp, fmt);
+ msg = luaO_pushvfstring(L, fmt, argp);
+ va_end(argp);
+ return msg;
+}
+
+
+/* number of chars of a literal string without the ending \0 */
+#define LL(x) (sizeof(x)/sizeof(char) - 1)
+
+#define RETS "..."
+#define PRE "[string \""
+#define POS "\"]"
+
+#define addstr(a,b,l) ( memcpy(a,b,(l) * sizeof(char)), a += (l) )
+
+void luaO_chunkid (char *out, const char *source, size_t bufflen) {
+ size_t l = strlen(source);
+ if (*source == '=') { /* 'literal' source */
+ if (l <= bufflen) /* small enough? */
+ memcpy(out, source + 1, l * sizeof(char));
+ else { /* truncate it */
+ addstr(out, source + 1, bufflen - 1);
+ *out = '\0';
+ }
+ }
+ else if (*source == '@') { /* file name */
+ if (l <= bufflen) /* small enough? */
+ memcpy(out, source + 1, l * sizeof(char));
+ else { /* add '...' before rest of name */
+ addstr(out, RETS, LL(RETS));
+ bufflen -= LL(RETS);
+ memcpy(out, source + 1 + l - bufflen, bufflen * sizeof(char));
+ }
+ }
+ else { /* string; format as [string "source"] */
+ const char *nl = strchr(source, '\n'); /* find first new line (if any) */
+ addstr(out, PRE, LL(PRE)); /* add prefix */
+ bufflen -= LL(PRE RETS POS) + 1; /* save space for prefix+suffix+'\0' */
+ if (l < bufflen && nl == NULL) { /* small one-line source? */
+ addstr(out, source, l); /* keep it */
+ }
+ else {
+ if (nl != NULL) l = nl - source; /* stop at first newline */
+ if (l > bufflen) l = bufflen;
+ addstr(out, source, l);
+ addstr(out, RETS, LL(RETS));
+ }
+ memcpy(out, POS, (LL(POS) + 1) * sizeof(char));
+ }
+}
+
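The "floating point byte" above packs a size into 8 bits: a 3-bit mantissa
with an implied leading 1 and a 5-bit exponent, rounding up while halving
so the decoded value is never smaller than the original (Lua uses this for
table size hints in OP_NEWTABLE). A standalone round-trip demo using local
copies of the two functions:

#include <stdio.h>

static int int2fb(unsigned int x) {  /* copy of luaO_int2fb */
  int e = 0;                         /* exponent */
  if (x < 8) return x;
  while (x >= 0x10) {                /* keep 4 significant bits */
    x = (x + 1) >> 1;                /* round up while halving */
    e++;
  }
  return ((e + 1) << 3) | ((int)x - 8);
}

static int fb2int(int x) {           /* copy of luaO_fb2int */
  int e = (x >> 3) & 0x1f;
  if (e == 0) return x;
  return ((x & 7) + 8) << (e - 1);
}

int main(void) {
  unsigned int v[] = { 5, 8, 100, 1000 };
  int i;
  for (i = 0; i < 4; i++) {
    int fb = int2fb(v[i]);
    printf("%u -> fb=0x%02x -> %d\n", v[i], fb, fb2int(fb));
  }
  /* prints: 5->5, 8->8, 100->104, 1000->1024: never smaller */
  return 0;
}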
diff --git a/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/lua/lobject.h b/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/lua/lobject.h
new file mode 100644
index 000000000000..9c9f23542867
--- /dev/null
+++ b/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/lua/lobject.h
@@ -0,0 +1,606 @@
+/*
+** $Id: lobject.h,v 2.71.1.2 2014/05/07 14:14:58 roberto Exp $
+** Type definitions for Lua objects
+** See Copyright Notice in lua.h
+*/
+
+
+#ifndef lobject_h
+#define lobject_h
+
+
+#include <sys/zfs_context.h>
+
+#include "llimits.h"
+#include "lua.h"
+
+
+/*
+** Extra tags for non-values
+*/
+#define LUA_TPROTO LUA_NUMTAGS
+#define LUA_TUPVAL (LUA_NUMTAGS+1)
+#define LUA_TDEADKEY (LUA_NUMTAGS+2)
+
+/*
+** number of all possible tags (including LUA_TNONE but excluding DEADKEY)
+*/
+#define LUA_TOTALTAGS (LUA_TUPVAL+2)
+
+
+/*
+** tags for Tagged Values have the following use of bits:
+** bits 0-3: actual tag (a LUA_T* value)
+** bits 4-5: variant bits
+** bit 6: whether value is collectable
+*/
+
+#define VARBITS (3 << 4)
+
+
+/*
+** LUA_TFUNCTION variants:
+** 0 - Lua function
+** 1 - light C function
+** 2 - regular C function (closure)
+*/
+
+/* Variant tags for functions */
+#define LUA_TLCL (LUA_TFUNCTION | (0 << 4)) /* Lua closure */
+#define LUA_TLCF (LUA_TFUNCTION | (1 << 4)) /* light C function */
+#define LUA_TCCL (LUA_TFUNCTION | (2 << 4)) /* C closure */
+
+
+/* Variant tags for strings */
+#define LUA_TSHRSTR (LUA_TSTRING | (0 << 4)) /* short strings */
+#define LUA_TLNGSTR (LUA_TSTRING | (1 << 4)) /* long strings */
+
+
+/* Bit mark for collectable types */
+#define BIT_ISCOLLECTABLE (1 << 6)
+
+/* mark a tag as collectable */
+#define ctb(t) ((t) | BIT_ISCOLLECTABLE)
+
+
+/*
+** Union of all collectable objects
+*/
+typedef union GCObject GCObject;
+
+
+/*
+** Common Header for all collectable objects (in macro form, to be
+** included in other objects)
+*/
+#define CommonHeader GCObject *next; lu_byte tt; lu_byte marked
+
+
+/*
+** Common header in struct form
+*/
+typedef struct GCheader {
+ CommonHeader;
+} GCheader;
+
+
+
+/*
+** Union of all Lua values
+*/
+typedef union Value Value;
+
+
+#define numfield lua_Number n; /* numbers */
+
+
+
+/*
+** Tagged Values. This is the basic representation of values in Lua,
+** an actual value plus a tag with its type.
+*/
+
+#define TValuefields Value value_; int tt_
+
+typedef struct lua_TValue TValue;
+
+
+/* macro defining a nil value */
+#define NILCONSTANT {NULL}, LUA_TNIL
+
+
+#define val_(o) ((o)->value_)
+#define num_(o) (val_(o).n)
+
+
+/* raw type tag of a TValue */
+#define rttype(o) ((o)->tt_)
+
+/* tag with no variants (bits 0-3) */
+#define novariant(x) ((x) & 0x0F)
+
+/* type tag of a TValue (bits 0-3 for tags + variant bits 4-5) */
+#define ttype(o) (rttype(o) & 0x3F)
+
+/* type tag of a TValue with no variants (bits 0-3) */
+#define ttypenv(o) (novariant(rttype(o)))
+
+
+/* Macros to test type */
+#define checktag(o,t) (rttype(o) == (t))
+#define checktype(o,t) (ttypenv(o) == (t))
+#define ttisnumber(o) checktag((o), LUA_TNUMBER)
+#define ttisnil(o) checktag((o), LUA_TNIL)
+#define ttisboolean(o) checktag((o), LUA_TBOOLEAN)
+#define ttislightuserdata(o) checktag((o), LUA_TLIGHTUSERDATA)
+#define ttisstring(o) checktype((o), LUA_TSTRING)
+#define ttisshrstring(o) checktag((o), ctb(LUA_TSHRSTR))
+#define ttislngstring(o) checktag((o), ctb(LUA_TLNGSTR))
+#define ttistable(o) checktag((o), ctb(LUA_TTABLE))
+#define ttisfunction(o) checktype(o, LUA_TFUNCTION)
+#define ttisclosure(o) ((rttype(o) & 0x1F) == LUA_TFUNCTION)
+#define ttisCclosure(o) checktag((o), ctb(LUA_TCCL))
+#define ttisLclosure(o) checktag((o), ctb(LUA_TLCL))
+#define ttislcf(o) checktag((o), LUA_TLCF)
+#define ttisuserdata(o) checktag((o), ctb(LUA_TUSERDATA))
+#define ttisthread(o) checktag((o), ctb(LUA_TTHREAD))
+#define ttisdeadkey(o) checktag((o), LUA_TDEADKEY)
+
+#define ttisequal(o1,o2) (rttype(o1) == rttype(o2))
+
+/* Macros to access values */
+#define nvalue(o) check_exp(ttisnumber(o), num_(o))
+#define gcvalue(o) check_exp(iscollectable(o), val_(o).gc)
+#define pvalue(o) check_exp(ttislightuserdata(o), val_(o).p)
+#define rawtsvalue(o) check_exp(ttisstring(o), &val_(o).gc->ts)
+#define tsvalue(o) (&rawtsvalue(o)->tsv)
+#define rawuvalue(o) check_exp(ttisuserdata(o), &val_(o).gc->u)
+#define uvalue(o) (&rawuvalue(o)->uv)
+#define clvalue(o) check_exp(ttisclosure(o), &val_(o).gc->cl)
+#define clLvalue(o) check_exp(ttisLclosure(o), &val_(o).gc->cl.l)
+#define clCvalue(o) check_exp(ttisCclosure(o), &val_(o).gc->cl.c)
+#define fvalue(o) check_exp(ttislcf(o), val_(o).f)
+#define hvalue(o) check_exp(ttistable(o), &val_(o).gc->h)
+#define bvalue(o) check_exp(ttisboolean(o), val_(o).b)
+#define thvalue(o) check_exp(ttisthread(o), &val_(o).gc->th)
+/* a dead value may keep its 'gc' field, but its contents must not be accessed */
+#define deadvalue(o) check_exp(ttisdeadkey(o), cast(void *, val_(o).gc))
+
+#define l_isfalse(o) (ttisnil(o) || (ttisboolean(o) && bvalue(o) == 0))
+
+
+#define iscollectable(o) (rttype(o) & BIT_ISCOLLECTABLE)
+
+
+/* Macros for internal tests */
+#define righttt(obj) (ttype(obj) == gcvalue(obj)->gch.tt)
+
+#define checkliveness(g,obj) \
+ lua_longassert(!iscollectable(obj) || \
+ (righttt(obj) && !isdead(g,gcvalue(obj))))
+
+
+/* Macros to set values */
+#define settt_(o,t) ((o)->tt_=(t))
+
+#define setnvalue(obj,x) \
+ { TValue *io=(obj); num_(io)=(x); settt_(io, LUA_TNUMBER); }
+
+#define setnilvalue(obj) settt_(obj, LUA_TNIL)
+
+#define setfvalue(obj,x) \
+ { TValue *io=(obj); val_(io).f=(x); settt_(io, LUA_TLCF); }
+
+#define setpvalue(obj,x) \
+ { TValue *io=(obj); val_(io).p=(x); settt_(io, LUA_TLIGHTUSERDATA); }
+
+#define setbvalue(obj,x) \
+ { TValue *io=(obj); val_(io).b=(x); settt_(io, LUA_TBOOLEAN); }
+
+#define setgcovalue(L,obj,x) \
+ { TValue *io=(obj); GCObject *i_g=(x); \
+ val_(io).gc=i_g; settt_(io, ctb(gch(i_g)->tt)); }
+
+#define setsvalue(L,obj,x) \
+ { TValue *io=(obj); \
+ TString *x_ = (x); \
+ val_(io).gc=cast(GCObject *, x_); settt_(io, ctb(x_->tsv.tt)); \
+ checkliveness(G(L),io); }
+
+#define setuvalue(L,obj,x) \
+ { TValue *io=(obj); \
+ val_(io).gc=cast(GCObject *, (x)); settt_(io, ctb(LUA_TUSERDATA)); \
+ checkliveness(G(L),io); }
+
+#define setthvalue(L,obj,x) \
+ { TValue *io=(obj); \
+ val_(io).gc=cast(GCObject *, (x)); settt_(io, ctb(LUA_TTHREAD)); \
+ checkliveness(G(L),io); }
+
+#define setclLvalue(L,obj,x) \
+ { TValue *io=(obj); \
+ val_(io).gc=cast(GCObject *, (x)); settt_(io, ctb(LUA_TLCL)); \
+ checkliveness(G(L),io); }
+
+#define setclCvalue(L,obj,x) \
+ { TValue *io=(obj); \
+ val_(io).gc=cast(GCObject *, (x)); settt_(io, ctb(LUA_TCCL)); \
+ checkliveness(G(L),io); }
+
+#define sethvalue(L,obj,x) \
+ { TValue *io=(obj); \
+ val_(io).gc=cast(GCObject *, (x)); settt_(io, ctb(LUA_TTABLE)); \
+ checkliveness(G(L),io); }
+
+#define setdeadvalue(obj) settt_(obj, LUA_TDEADKEY)
+
+
+
+#define setobj(L,obj1,obj2) \
+ { const TValue *io2=(obj2); TValue *io1=(obj1); \
+ io1->value_ = io2->value_; io1->tt_ = io2->tt_; \
+ checkliveness(G(L),io1); }
+
+
+/*
+** different types of assignments, according to destination
+*/
+
+/* from stack to (same) stack */
+#define setobjs2s setobj
+/* to stack (not from same stack) */
+#define setobj2s setobj
+#define setsvalue2s setsvalue
+#define sethvalue2s sethvalue
+#define setptvalue2s setptvalue
+/* from table to same table */
+#define setobjt2t setobj
+/* to table */
+#define setobj2t setobj
+/* to new object */
+#define setobj2n setobj
+#define setsvalue2n setsvalue
+
+
+/* check whether a number is valid (useful only for NaN trick) */
+#define luai_checknum(L,o,c) { /* empty */ }
+
+
+/*
+** {======================================================
+** NaN Trick
+** =======================================================
+*/
+#if defined(LUA_NANTRICK)
+
+/*
+** numbers are represented in the 'd_' field. All other values have the
+** value (NNMARK | tag) in 'tt__'. A number with such a pattern would be
+** a "signaling NaN", which is never generated by regular CPU operations
+** (nor by 'strtod')
+*/
+
+/* allows for external implementation for part of the trick */
+#if !defined(NNMARK) /* { */
+
+
+#if !defined(LUA_IEEEENDIAN)
+#error option 'LUA_NANTRICK' needs 'LUA_IEEEENDIAN'
+#endif
+
+
+#define NNMARK 0x7FF7A500
+#define NNMASK 0x7FFFFF00
+
+#undef TValuefields
+#undef NILCONSTANT
+
+#if (LUA_IEEEENDIAN == 0) /* { */
+
+/* little endian */
+#define TValuefields \
+ union { struct { Value v__; int tt__; } i; double d__; } u
+#define NILCONSTANT {{{NULL}, tag2tt(LUA_TNIL)}}
+/* field-access macros */
+#define v_(o) ((o)->u.i.v__)
+#define d_(o) ((o)->u.d__)
+#define tt_(o) ((o)->u.i.tt__)
+
+#else /* }{ */
+
+/* big endian */
+#define TValuefields \
+ union { struct { int tt__; Value v__; } i; double d__; } u
+#define NILCONSTANT {{tag2tt(LUA_TNIL), {NULL}}}
+/* field-access macros */
+#define v_(o) ((o)->u.i.v__)
+#define d_(o) ((o)->u.d__)
+#define tt_(o) ((o)->u.i.tt__)
+
+#endif /* } */
+
+#endif /* } */
+
+
+/* correspondence with standard representation */
+#undef val_
+#define val_(o) v_(o)
+#undef num_
+#define num_(o) d_(o)
+
+
+#undef numfield
+#define numfield /* no such field; numbers are the entire struct */
+
+/* basic check to distinguish numbers from non-numbers */
+#undef ttisnumber
+#define ttisnumber(o) ((tt_(o) & NNMASK) != NNMARK)
+
+#define tag2tt(t) (NNMARK | (t))
+
+#undef rttype
+#define rttype(o) (ttisnumber(o) ? LUA_TNUMBER : tt_(o) & 0xff)
+
+#undef settt_
+#define settt_(o,t) (tt_(o) = tag2tt(t))
+
+#undef setnvalue
+#define setnvalue(obj,x) \
+ { TValue *io_=(obj); num_(io_)=(x); lua_assert(ttisnumber(io_)); }
+
+#undef setobj
+#define setobj(L,obj1,obj2) \
+ { const TValue *o2_=(obj2); TValue *o1_=(obj1); \
+ o1_->u = o2_->u; \
+ checkliveness(G(L),o1_); }
+
+
+/*
+** these redefinitions are not mandatory, but these forms are more efficient
+*/
+
+#undef checktag
+#undef checktype
+#define checktag(o,t) (tt_(o) == tag2tt(t))
+#define checktype(o,t) (ctb(tt_(o) | VARBITS) == ctb(tag2tt(t) | VARBITS))
+
+#undef ttisequal
+#define ttisequal(o1,o2) \
+ (ttisnumber(o1) ? ttisnumber(o2) : (tt_(o1) == tt_(o2)))
+
+
+#undef luai_checknum
+#define luai_checknum(L,o,c) { if (!ttisnumber(o)) c; }
+
+#endif
+/* }====================================================== */
+
+
+
+/*
+** {======================================================
+** types and prototypes
+** =======================================================
+*/
+
+
+union Value {
+ GCObject *gc; /* collectable objects */
+ void *p; /* light userdata */
+ int b; /* booleans */
+ lua_CFunction f; /* light C functions */
+ numfield /* numbers */
+};
+
+
+struct lua_TValue {
+ TValuefields;
+};
+
+
+typedef TValue *StkId; /* index to stack elements */
+
+
+
+
+/*
+** Header for string value; string bytes follow the end of this structure
+*/
+typedef union TString {
+ L_Umaxalign dummy; /* ensures maximum alignment for strings */
+ struct {
+ CommonHeader;
+ lu_byte extra; /* reserved words for short strings; "has hash" for longs */
+ unsigned int hash;
+ size_t len; /* number of characters in string */
+ } tsv;
+} TString;
+
+
+/* get the actual string (array of bytes) from a TString */
+#define getstr(ts) cast(const char *, (ts) + 1)
+
+/* get the actual string (array of bytes) from a Lua value */
+#define svalue(o) getstr(rawtsvalue(o))
+
+
+/*
+** Header for userdata; memory area follows the end of this structure
+*/
+typedef union Udata {
+ L_Umaxalign dummy; /* ensures maximum alignment for `local' udata */
+ struct {
+ CommonHeader;
+ struct Table *metatable;
+ struct Table *env;
+ size_t len; /* number of bytes */
+ } uv;
+} Udata;
+
+
+
+/*
+** Description of an upvalue for function prototypes
+*/
+typedef struct Upvaldesc {
+ TString *name; /* upvalue name (for debug information) */
+ lu_byte instack; /* whether it is in stack */
+ lu_byte idx; /* index of upvalue (in stack or in outer function's list) */
+} Upvaldesc;
+
+
+/*
+** Description of a local variable for function prototypes
+** (used for debug information)
+*/
+typedef struct LocVar {
+ TString *varname;
+ int startpc; /* first point where variable is active */
+ int endpc; /* first point where variable is dead */
+} LocVar;
+
+
+/*
+** Function Prototypes
+*/
+typedef struct Proto {
+ CommonHeader;
+ TValue *k; /* constants used by the function */
+ Instruction *code;
+ struct Proto **p; /* functions defined inside the function */
+ int *lineinfo; /* map from opcodes to source lines (debug information) */
+ LocVar *locvars; /* information about local variables (debug information) */
+ Upvaldesc *upvalues; /* upvalue information */
+ union Closure *cache; /* last created closure with this prototype */
+ TString *source; /* used for debug information */
+ int sizeupvalues; /* size of 'upvalues' */
+ int sizek; /* size of `k' */
+ int sizecode;
+ int sizelineinfo;
+ int sizep; /* size of `p' */
+ int sizelocvars;
+ int linedefined;
+ int lastlinedefined;
+ GCObject *gclist;
+ lu_byte numparams; /* number of fixed parameters */
+ lu_byte is_vararg;
+ lu_byte maxstacksize; /* maximum stack used by this function */
+} Proto;
+
+
+
+/*
+** Lua Upvalues
+*/
+typedef struct UpVal {
+ CommonHeader;
+ TValue *v; /* points to stack or to its own value */
+ union {
+ TValue value; /* the value (when closed) */
+ struct { /* double linked list (when open) */
+ struct UpVal *prev;
+ struct UpVal *next;
+ } l;
+ } u;
+} UpVal;
+
+
+/*
+** Closures
+*/
+
+#define ClosureHeader \
+ CommonHeader; lu_byte nupvalues; GCObject *gclist
+
+typedef struct CClosure {
+ ClosureHeader;
+ lua_CFunction f;
+ TValue upvalue[1]; /* list of upvalues */
+} CClosure;
+
+
+typedef struct LClosure {
+ ClosureHeader;
+ struct Proto *p;
+ UpVal *upvals[1]; /* list of upvalues */
+} LClosure;
+
+
+typedef union Closure {
+ CClosure c;
+ LClosure l;
+} Closure;
+
+
+#define isLfunction(o) ttisLclosure(o)
+
+#define getproto(o) (clLvalue(o)->p)
+
+
+/*
+** Tables
+*/
+
+typedef union TKey {
+ struct {
+ TValuefields;
+ struct Node *next; /* for chaining */
+ } nk;
+ TValue tvk;
+} TKey;
+
+
+typedef struct Node {
+ TValue i_val;
+ TKey i_key;
+} Node;
+
+
+typedef struct Table {
+ CommonHeader;
+ lu_byte flags; /* 1<<p means tagmethod(p) is not present */
+ lu_byte lsizenode; /* log2 of size of `node' array */
+ int sizearray; /* size of `array' array */
+ TValue *array; /* array part */
+ Node *node;
+ Node *lastfree; /* any free position is before this position */
+ struct Table *metatable;
+ GCObject *gclist;
+} Table;
+
+
+
+/*
+** `modulo' operation for hashing (size is always a power of 2)
+*/
+#define lmod(s,size) \
+ (check_exp((size&(size-1))==0, (cast(int, (s) & ((size)-1)))))
+
+
+#define twoto(x) (1<<(x))
+#define sizenode(t) (twoto((t)->lsizenode))
+
+
+/*
+** (address of) a fixed nil value
+*/
+#define luaO_nilobject (&luaO_nilobject_)
+
+
+LUAI_DDEC const TValue luaO_nilobject_;
+
+
+LUAI_FUNC int luaO_int2fb (unsigned int x);
+LUAI_FUNC int luaO_fb2int (int x);
+LUAI_FUNC int luaO_ceillog2 (unsigned int x);
+LUAI_FUNC lua_Number luaO_arith (int op, lua_Number v1, lua_Number v2);
+LUAI_FUNC int luaO_str2d (const char *s, size_t len, lua_Number *result);
+LUAI_FUNC int luaO_hexavalue (int c);
+LUAI_FUNC const char *luaO_pushvfstring (lua_State *L, const char *fmt,
+ va_list argp);
+LUAI_FUNC const char *luaO_pushfstring (lua_State *L, const char *fmt, ...);
+LUAI_FUNC void luaO_chunkid (char *out, const char *source, size_t len);
+
+
+#endif
+
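The tag layout described above (bits 0-3 basic type, bits 4-5 variant,
bit 6 collectable) can be exercised in isolation. A standalone sketch;
T_STRING mirrors LUA_TSTRING's value of 4, and the other macros mirror
their counterparts in this header:

#include <stdio.h>

#define T_STRING          4               /* mirrors LUA_TSTRING */
#define V_SHRSTR          (T_STRING | (0 << 4))
#define V_LNGSTR          (T_STRING | (1 << 4))
#define BIT_ISCOLLECTABLE (1 << 6)
#define ctb(t)            ((t) | BIT_ISCOLLECTABLE)
#define novariant(t)      ((t) & 0x0F)

int main(void) {
  int tag = ctb(V_LNGSTR);  /* a collectable long string */
  printf("raw=0x%02x basic=%d variant=%d collectable=%d\n",
         tag, novariant(tag), (tag >> 4) & 3,
         (tag & BIT_ISCOLLECTABLE) != 0);
  /* prints: raw=0x54 basic=4 variant=1 collectable=1 */
  return 0;
}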
diff --git a/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/lua/lopcodes.c b/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/lua/lopcodes.c
new file mode 100644
index 000000000000..4190dc762428
--- /dev/null
+++ b/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/lua/lopcodes.c
@@ -0,0 +1,107 @@
+/*
+** $Id: lopcodes.c,v 1.49.1.1 2013/04/12 18:48:47 roberto Exp $
+** Opcodes for Lua virtual machine
+** See Copyright Notice in lua.h
+*/
+
+
+#define lopcodes_c
+#define LUA_CORE
+
+
+#include "lopcodes.h"
+
+
+/* ORDER OP */
+
+LUAI_DDEF const char *const luaP_opnames[NUM_OPCODES+1] = {
+ "MOVE",
+ "LOADK",
+ "LOADKX",
+ "LOADBOOL",
+ "LOADNIL",
+ "GETUPVAL",
+ "GETTABUP",
+ "GETTABLE",
+ "SETTABUP",
+ "SETUPVAL",
+ "SETTABLE",
+ "NEWTABLE",
+ "SELF",
+ "ADD",
+ "SUB",
+ "MUL",
+ "DIV",
+ "MOD",
+ "POW",
+ "UNM",
+ "NOT",
+ "LEN",
+ "CONCAT",
+ "JMP",
+ "EQ",
+ "LT",
+ "LE",
+ "TEST",
+ "TESTSET",
+ "CALL",
+ "TAILCALL",
+ "RETURN",
+ "FORLOOP",
+ "FORPREP",
+ "TFORCALL",
+ "TFORLOOP",
+ "SETLIST",
+ "CLOSURE",
+ "VARARG",
+ "EXTRAARG",
+ NULL
+};
+
+
+#define opmode(t,a,b,c,m) (((t)<<7) | ((a)<<6) | ((b)<<4) | ((c)<<2) | (m))
+
+LUAI_DDEF const lu_byte luaP_opmodes[NUM_OPCODES] = {
+/* T A B C mode opcode */
+ opmode(0, 1, OpArgR, OpArgN, iABC) /* OP_MOVE */
+ ,opmode(0, 1, OpArgK, OpArgN, iABx) /* OP_LOADK */
+ ,opmode(0, 1, OpArgN, OpArgN, iABx) /* OP_LOADKX */
+ ,opmode(0, 1, OpArgU, OpArgU, iABC) /* OP_LOADBOOL */
+ ,opmode(0, 1, OpArgU, OpArgN, iABC) /* OP_LOADNIL */
+ ,opmode(0, 1, OpArgU, OpArgN, iABC) /* OP_GETUPVAL */
+ ,opmode(0, 1, OpArgU, OpArgK, iABC) /* OP_GETTABUP */
+ ,opmode(0, 1, OpArgR, OpArgK, iABC) /* OP_GETTABLE */
+ ,opmode(0, 0, OpArgK, OpArgK, iABC) /* OP_SETTABUP */
+ ,opmode(0, 0, OpArgU, OpArgN, iABC) /* OP_SETUPVAL */
+ ,opmode(0, 0, OpArgK, OpArgK, iABC) /* OP_SETTABLE */
+ ,opmode(0, 1, OpArgU, OpArgU, iABC) /* OP_NEWTABLE */
+ ,opmode(0, 1, OpArgR, OpArgK, iABC) /* OP_SELF */
+ ,opmode(0, 1, OpArgK, OpArgK, iABC) /* OP_ADD */
+ ,opmode(0, 1, OpArgK, OpArgK, iABC) /* OP_SUB */
+ ,opmode(0, 1, OpArgK, OpArgK, iABC) /* OP_MUL */
+ ,opmode(0, 1, OpArgK, OpArgK, iABC) /* OP_DIV */
+ ,opmode(0, 1, OpArgK, OpArgK, iABC) /* OP_MOD */
+ ,opmode(0, 1, OpArgK, OpArgK, iABC) /* OP_POW */
+ ,opmode(0, 1, OpArgR, OpArgN, iABC) /* OP_UNM */
+ ,opmode(0, 1, OpArgR, OpArgN, iABC) /* OP_NOT */
+ ,opmode(0, 1, OpArgR, OpArgN, iABC) /* OP_LEN */
+ ,opmode(0, 1, OpArgR, OpArgR, iABC) /* OP_CONCAT */
+ ,opmode(0, 0, OpArgR, OpArgN, iAsBx) /* OP_JMP */
+ ,opmode(1, 0, OpArgK, OpArgK, iABC) /* OP_EQ */
+ ,opmode(1, 0, OpArgK, OpArgK, iABC) /* OP_LT */
+ ,opmode(1, 0, OpArgK, OpArgK, iABC) /* OP_LE */
+ ,opmode(1, 0, OpArgN, OpArgU, iABC) /* OP_TEST */
+ ,opmode(1, 1, OpArgR, OpArgU, iABC) /* OP_TESTSET */
+ ,opmode(0, 1, OpArgU, OpArgU, iABC) /* OP_CALL */
+ ,opmode(0, 1, OpArgU, OpArgU, iABC) /* OP_TAILCALL */
+ ,opmode(0, 0, OpArgU, OpArgN, iABC) /* OP_RETURN */
+ ,opmode(0, 1, OpArgR, OpArgN, iAsBx) /* OP_FORLOOP */
+ ,opmode(0, 1, OpArgR, OpArgN, iAsBx) /* OP_FORPREP */
+ ,opmode(0, 0, OpArgN, OpArgU, iABC) /* OP_TFORCALL */
+ ,opmode(0, 1, OpArgR, OpArgN, iAsBx) /* OP_TFORLOOP */
+ ,opmode(0, 0, OpArgU, OpArgU, iABC) /* OP_SETLIST */
+ ,opmode(0, 1, OpArgU, OpArgN, iABx) /* OP_CLOSURE */
+ ,opmode(0, 1, OpArgU, OpArgN, iABC) /* OP_VARARG */
+ ,opmode(0, 0, OpArgU, OpArgU, iAx) /* OP_EXTRAARG */
+};
+
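Each luaP_opmodes entry above packs five properties into one byte, per the
comment in lopcodes.h: bit 7 test flag, bit 6 sets-register-A, bits 4-5
B-arg mode, bits 2-3 C-arg mode, bits 0-1 instruction format. A standalone
sketch that packs OP_EQ's entry and unpacks it with the same shifts the
header's accessor macros use:

#include <stdio.h>

enum OpArgMask { OpArgN, OpArgU, OpArgR, OpArgK };
enum OpMode { iABC, iABx, iAsBx, iAx };

#define opmode(t,a,b,c,m) (((t)<<7) | ((a)<<6) | ((b)<<4) | ((c)<<2) | (m))

int main(void) {
  /* OP_EQ: a test instruction, does not set A, K-type B and C, iABC */
  unsigned char m = opmode(1, 0, OpArgK, OpArgK, iABC);
  printf("mode=%d B=%d C=%d setsA=%d isTest=%d\n",
         m & 3,               /* getOpMode */
         (m >> 4) & 3,        /* getBMode */
         (m >> 2) & 3,        /* getCMode */
         (m & (1 << 6)) != 0, /* testAMode */
         (m & (1 << 7)) != 0  /* testTMode */);
  /* prints: mode=0 B=3 C=3 setsA=0 isTest=1 */
  return 0;
}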
diff --git a/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/lua/lopcodes.h b/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/lua/lopcodes.h
new file mode 100644
index 000000000000..8e2f80a13141
--- /dev/null
+++ b/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/lua/lopcodes.h
@@ -0,0 +1,288 @@
+/*
+** $Id: lopcodes.h,v 1.142.1.2 2014/10/20 18:32:09 roberto Exp $
+** Opcodes for Lua virtual machine
+** See Copyright Notice in lua.h
+*/
+
+#ifndef lopcodes_h
+#define lopcodes_h
+
+#include "llimits.h"
+
+
+/*===========================================================================
+ We assume that instructions are unsigned numbers.
+ All instructions have an opcode in the first 6 bits.
+ Instructions can have the following fields:
+ `A' : 8 bits
+ `B' : 9 bits
+ `C' : 9 bits
+ 'Ax' : 26 bits ('A', 'B', and 'C' together)
+ `Bx' : 18 bits (`B' and `C' together)
+ `sBx' : signed Bx
+
+ A signed argument is represented in excess K; that is, the number
+ value is the unsigned value minus K. K is exactly the maximum value
+ for that argument (so that -max is represented by 0, and +max is
+ represented by 2*max), which is half the maximum for the corresponding
+ unsigned argument.
+===========================================================================*/
+
+
+enum OpMode {iABC, iABx, iAsBx, iAx}; /* basic instruction format */
+
+
+/*
+** size and position of opcode arguments.
+*/
+#define SIZE_C 9
+#define SIZE_B 9
+#define SIZE_Bx (SIZE_C + SIZE_B)
+#define SIZE_A 8
+#define SIZE_Ax (SIZE_C + SIZE_B + SIZE_A)
+
+#define SIZE_OP 6
+
+#define POS_OP 0
+#define POS_A (POS_OP + SIZE_OP)
+#define POS_C (POS_A + SIZE_A)
+#define POS_B (POS_C + SIZE_C)
+#define POS_Bx POS_C
+#define POS_Ax POS_A
+
+
+/*
+** limits for opcode arguments.
+** we use (signed) int to manipulate most arguments,
+** so they must fit in LUAI_BITSINT-1 bits (-1 for sign)
+*/
+#if SIZE_Bx < LUAI_BITSINT-1
+#define MAXARG_Bx ((1<<SIZE_Bx)-1)
+#define MAXARG_sBx (MAXARG_Bx>>1) /* `sBx' is signed */
+#else
+#define MAXARG_Bx MAX_INT
+#define MAXARG_sBx MAX_INT
+#endif
+
+#if SIZE_Ax < LUAI_BITSINT-1
+#define MAXARG_Ax ((1<<SIZE_Ax)-1)
+#else
+#define MAXARG_Ax MAX_INT
+#endif
+
+
+#define MAXARG_A ((1<<SIZE_A)-1)
+#define MAXARG_B ((1<<SIZE_B)-1)
+#define MAXARG_C ((1<<SIZE_C)-1)
+
+
+/* creates a mask with `n' 1 bits at position `p' */
+#define MASK1(n,p) ((~((~(Instruction)0)<<(n)))<<(p))
+
+/* creates a mask with `n' 0 bits at position `p' */
+#define MASK0(n,p) (~MASK1(n,p))
+
+/*
+** the following macros help to manipulate instructions
+*/
+
+#define GET_OPCODE(i) (cast(OpCode, ((i)>>POS_OP) & MASK1(SIZE_OP,0)))
+#define SET_OPCODE(i,o) ((i) = (((i)&MASK0(SIZE_OP,POS_OP)) | \
+ ((cast(Instruction, o)<<POS_OP)&MASK1(SIZE_OP,POS_OP))))
+
+#define getarg(i,pos,size) (cast(int, ((i)>>pos) & MASK1(size,0)))
+#define setarg(i,v,pos,size) ((i) = (((i)&MASK0(size,pos)) | \
+ ((cast(Instruction, v)<<pos)&MASK1(size,pos))))
+
+#define GETARG_A(i) getarg(i, POS_A, SIZE_A)
+#define SETARG_A(i,v) setarg(i, v, POS_A, SIZE_A)
+
+#define GETARG_B(i) getarg(i, POS_B, SIZE_B)
+#define SETARG_B(i,v) setarg(i, v, POS_B, SIZE_B)
+
+#define GETARG_C(i) getarg(i, POS_C, SIZE_C)
+#define SETARG_C(i,v) setarg(i, v, POS_C, SIZE_C)
+
+#define GETARG_Bx(i) getarg(i, POS_Bx, SIZE_Bx)
+#define SETARG_Bx(i,v) setarg(i, v, POS_Bx, SIZE_Bx)
+
+#define GETARG_Ax(i) getarg(i, POS_Ax, SIZE_Ax)
+#define SETARG_Ax(i,v) setarg(i, v, POS_Ax, SIZE_Ax)
+
+#define GETARG_sBx(i) (GETARG_Bx(i)-MAXARG_sBx)
+#define SETARG_sBx(i,b) SETARG_Bx((i),cast(unsigned int, (b)+MAXARG_sBx))
+
+
+#define CREATE_ABC(o,a,b,c) ((cast(Instruction, o)<<POS_OP) \
+ | (cast(Instruction, a)<<POS_A) \
+ | (cast(Instruction, b)<<POS_B) \
+ | (cast(Instruction, c)<<POS_C))
+
+#define CREATE_ABx(o,a,bc) ((cast(Instruction, o)<<POS_OP) \
+ | (cast(Instruction, a)<<POS_A) \
+ | (cast(Instruction, bc)<<POS_Bx))
+
+#define CREATE_Ax(o,a) ((cast(Instruction, o)<<POS_OP) \
+ | (cast(Instruction, a)<<POS_Ax))
+
+
+/*
+** Macros to operate RK indices
+*/
+
+/* this bit 1 means constant (0 means register) */
+#define BITRK (1 << (SIZE_B - 1))
+
+/* test whether value is a constant */
+#define ISK(x) ((x) & BITRK)
+
+/* gets the index of the constant */
+#define INDEXK(r) ((int)(r) & ~BITRK)
+
+#define MAXINDEXRK (BITRK - 1)
+
+/* code a constant index as a RK value */
+#define RKASK(x) ((x) | BITRK)
+
+
+/*
+** invalid register that fits in 8 bits
+*/
+#define NO_REG MAXARG_A
+
+
+/*
+** R(x) - register
+** Kst(x) - constant (in constant table)
+** RK(x) == if ISK(x) then Kst(INDEXK(x)) else R(x)
+*/
+
+
+/*
+** grep "ORDER OP" if you change these enums
+*/
+
+typedef enum {
+/*----------------------------------------------------------------------
+name args description
+------------------------------------------------------------------------*/
+OP_MOVE,/* A B R(A) := R(B) */
+OP_LOADK,/* A Bx R(A) := Kst(Bx) */
+OP_LOADKX,/* A R(A) := Kst(extra arg) */
+OP_LOADBOOL,/* A B C R(A) := (Bool)B; if (C) pc++ */
+OP_LOADNIL,/* A B R(A), R(A+1), ..., R(A+B) := nil */
+OP_GETUPVAL,/* A B R(A) := UpValue[B] */
+
+OP_GETTABUP,/* A B C R(A) := UpValue[B][RK(C)] */
+OP_GETTABLE,/* A B C R(A) := R(B)[RK(C)] */
+
+OP_SETTABUP,/* A B C UpValue[A][RK(B)] := RK(C) */
+OP_SETUPVAL,/* A B UpValue[B] := R(A) */
+OP_SETTABLE,/* A B C R(A)[RK(B)] := RK(C) */
+
+OP_NEWTABLE,/* A B C R(A) := {} (size = B,C) */
+
+OP_SELF,/* A B C R(A+1) := R(B); R(A) := R(B)[RK(C)] */
+
+OP_ADD,/* A B C R(A) := RK(B) + RK(C) */
+OP_SUB,/* A B C R(A) := RK(B) - RK(C) */
+OP_MUL,/* A B C R(A) := RK(B) * RK(C) */
+OP_DIV,/* A B C R(A) := RK(B) / RK(C) */
+OP_MOD,/* A B C R(A) := RK(B) % RK(C) */
+OP_POW,/* A B C R(A) := RK(B) ^ RK(C) */
+OP_UNM,/* A B R(A) := -R(B) */
+OP_NOT,/* A B R(A) := not R(B) */
+OP_LEN,/* A B R(A) := length of R(B) */
+
+OP_CONCAT,/* A B C R(A) := R(B).. ... ..R(C) */
+
+OP_JMP,/* A sBx pc+=sBx; if (A) close all upvalues >= R(A - 1) */
+OP_EQ,/* A B C if ((RK(B) == RK(C)) ~= A) then pc++ */
+OP_LT,/* A B C if ((RK(B) < RK(C)) ~= A) then pc++ */
+OP_LE,/* A B C if ((RK(B) <= RK(C)) ~= A) then pc++ */
+
+OP_TEST,/* A C if not (R(A) <=> C) then pc++ */
+OP_TESTSET,/* A B C if (R(B) <=> C) then R(A) := R(B) else pc++ */
+
+OP_CALL,/* A B C R(A), ... ,R(A+C-2) := R(A)(R(A+1), ... ,R(A+B-1)) */
+OP_TAILCALL,/* A B C return R(A)(R(A+1), ... ,R(A+B-1)) */
+OP_RETURN,/* A B return R(A), ... ,R(A+B-2) (see note) */
+
+OP_FORLOOP,/* A sBx R(A)+=R(A+2);
+ if R(A) <?= R(A+1) then { pc+=sBx; R(A+3)=R(A) }*/
+OP_FORPREP,/* A sBx R(A)-=R(A+2); pc+=sBx */
+
+OP_TFORCALL,/* A C R(A+3), ... ,R(A+2+C) := R(A)(R(A+1), R(A+2)); */
+OP_TFORLOOP,/* A sBx if R(A+1) ~= nil then { R(A)=R(A+1); pc += sBx }*/
+
+OP_SETLIST,/* A B C R(A)[(C-1)*FPF+i] := R(A+i), 1 <= i <= B */
+
+OP_CLOSURE,/* A Bx R(A) := closure(KPROTO[Bx]) */
+
+OP_VARARG,/* A B R(A), R(A+1), ..., R(A+B-2) = vararg */
+
+OP_EXTRAARG/* Ax extra (larger) argument for previous opcode */
+} OpCode;
+
+
+#define NUM_OPCODES (cast(int, OP_EXTRAARG) + 1)
+
+
+
+/*===========================================================================
+ Notes:
+ (*) In OP_CALL, if (B == 0) then B = top. If (C == 0), then `top' is
+ set to last_result+1, so next open instruction (OP_CALL, OP_RETURN,
+ OP_SETLIST) may use `top'.
+
+ (*) In OP_VARARG, if (B == 0) then use actual number of varargs and
+ set top (like in OP_CALL with C == 0).
+
+ (*) In OP_RETURN, if (B == 0) then return up to `top'.
+
+ (*) In OP_SETLIST, if (B == 0) then B = `top'; if (C == 0) then next
+ 'instruction' is EXTRAARG(real C).
+
+ (*) In OP_LOADKX, the next 'instruction' is always EXTRAARG.
+
+ (*) For comparisons, A specifies what condition the test should accept
+ (true or false).
+
+ (*) All `skips' (pc++) assume that next instruction is a jump.
+
+===========================================================================*/
+
+
+/*
+** masks for instruction properties. The format is:
+** bits 0-1: op mode
+** bits 2-3: C arg mode
+** bits 4-5: B arg mode
+** bit 6: instruction set register A
+** bit 7: operator is a test (next instruction must be a jump)
+*/
+
+enum OpArgMask {
+ OpArgN, /* argument is not used */
+ OpArgU, /* argument is used */
+ OpArgR, /* argument is a register or a jump offset */
+ OpArgK /* argument is a constant or register/constant */
+};
+
+LUAI_DDEC const lu_byte luaP_opmodes[NUM_OPCODES];
+
+#define getOpMode(m) (cast(enum OpMode, luaP_opmodes[m] & 3))
+#define getBMode(m) (cast(enum OpArgMask, (luaP_opmodes[m] >> 4) & 3))
+#define getCMode(m) (cast(enum OpArgMask, (luaP_opmodes[m] >> 2) & 3))
+#define testAMode(m) (luaP_opmodes[m] & (1 << 6))
+#define testTMode(m) (luaP_opmodes[m] & (1 << 7))
+
+
+LUAI_DDEC const char *const luaP_opnames[NUM_OPCODES+1]; /* opcode names */
+
+
+/* number of list items to accumulate before a SETLIST instruction */
+#define LFIELDS_PER_FLUSH 50
+
+
+#endif
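Putting the layout above together: an instruction is op(6) | A(8) | C(9) |
B(9) from the low bits up, and a signed sBx is stored with an excess-K bias
of MAXARG_sBx so that 0 encodes the most negative offset. A standalone
sketch that encodes and decodes an iAsBx jump (the helper names are
illustrative, not the header's macros):

#include <stdio.h>
#include <stdint.h>

typedef uint32_t Instruction;

#define POS_OP 0
#define POS_A  6
#define POS_C  14                          /* Bx = B and C together */
#define MAXARG_sBx (((1 << 18) - 1) >> 1)  /* excess-K bias */

static Instruction create_AsBx(int op, int a, int sbx) {
  uint32_t bx = (uint32_t)(sbx + MAXARG_sBx);  /* bias the signed field */
  return ((uint32_t)op << POS_OP) | ((uint32_t)a << POS_A) | (bx << POS_C);
}

static int get_sBx(Instruction i) {
  int bx = (int)((i >> POS_C) & ((1 << 18) - 1));
  return bx - MAXARG_sBx;                      /* remove the bias */
}

int main(void) {
  Instruction jmp = create_AsBx(23, 0, -5);    /* OP_JMP, 5 back */
  printf("opcode=%u sBx=%d\n", (unsigned)(jmp & 0x3f), get_sBx(jmp));
  /* prints: opcode=23 sBx=-5 */
  return 0;
}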
diff --git a/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/lua/lparser.c b/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/lua/lparser.c
new file mode 100644
index 000000000000..73f1af64f834
--- /dev/null
+++ b/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/lua/lparser.c
@@ -0,0 +1,1637 @@
+/*
+** $Id: lparser.c,v 2.130.1.1 2013/04/12 18:48:47 roberto Exp $
+** Lua Parser
+** See Copyright Notice in lua.h
+*/
+
+#include <sys/zfs_context.h>
+
+#define lparser_c
+#define LUA_CORE
+
+#include "lua.h"
+
+#include "lcode.h"
+#include "ldebug.h"
+#include "ldo.h"
+#include "lfunc.h"
+#include "llex.h"
+#include "lmem.h"
+#include "lobject.h"
+#include "lopcodes.h"
+#include "lparser.h"
+#include "lstate.h"
+#include "lstring.h"
+#include "ltable.h"
+
+
+
+/* maximum number of local variables per function (must be smaller
+ than 250, due to the bytecode format) */
+#define MAXVARS 200
+
+
+#define hasmultret(k) ((k) == VCALL || (k) == VVARARG)
+
+
+
+/*
+** nodes for block list (list of active blocks)
+*/
+typedef struct BlockCnt {
+ struct BlockCnt *previous; /* chain */
+ short firstlabel; /* index of first label in this block */
+ short firstgoto; /* index of first pending goto in this block */
+ lu_byte nactvar; /* # active locals outside the block */
+ lu_byte upval; /* true if some variable in the block is an upvalue */
+ lu_byte isloop; /* true if `block' is a loop */
+} BlockCnt;
+
+
+
+/*
+** prototypes for recursive non-terminal functions
+*/
+static void statement (LexState *ls);
+static void expr (LexState *ls, expdesc *v);
+
+
+static void anchor_token (LexState *ls) {
+ /* last token from outer function must be EOS */
+ lua_assert(ls->fs != NULL || ls->t.token == TK_EOS);
+ if (ls->t.token == TK_NAME || ls->t.token == TK_STRING) {
+ TString *ts = ls->t.seminfo.ts;
+ luaX_newstring(ls, getstr(ts), ts->tsv.len);
+ }
+}
+
+
+/* semantic error */
+static l_noret semerror (LexState *ls, const char *msg) {
+ ls->t.token = 0; /* remove 'near to' from final message */
+ luaX_syntaxerror(ls, msg);
+}
+
+
+static l_noret error_expected (LexState *ls, int token) {
+ luaX_syntaxerror(ls,
+ luaO_pushfstring(ls->L, "%s expected", luaX_token2str(ls, token)));
+}
+
+
+static l_noret errorlimit (FuncState *fs, int limit, const char *what) {
+ lua_State *L = fs->ls->L;
+ const char *msg;
+ int line = fs->f->linedefined;
+ const char *where = (line == 0)
+ ? "main function"
+ : luaO_pushfstring(L, "function at line %d", line);
+ msg = luaO_pushfstring(L, "too many %s (limit is %d) in %s",
+ what, limit, where);
+ luaX_syntaxerror(fs->ls, msg);
+}
+
+
+static void checklimit (FuncState *fs, int v, int l, const char *what) {
+ if (v > l) errorlimit(fs, l, what);
+}
+
+
+static int testnext (LexState *ls, int c) {
+ if (ls->t.token == c) {
+ luaX_next(ls);
+ return 1;
+ }
+ else return 0;
+}
+
+
+static void check (LexState *ls, int c) {
+ if (ls->t.token != c)
+ error_expected(ls, c);
+}
+
+
+static void checknext (LexState *ls, int c) {
+ check(ls, c);
+ luaX_next(ls);
+}
+
+
+#define check_condition(ls,c,msg) { if (!(c)) luaX_syntaxerror(ls, msg); }
+
+
+
+static void check_match (LexState *ls, int what, int who, int where) {
+ if (!testnext(ls, what)) {
+ if (where == ls->linenumber)
+ error_expected(ls, what);
+ else {
+ luaX_syntaxerror(ls, luaO_pushfstring(ls->L,
+ "%s expected (to close %s at line %d)",
+ luaX_token2str(ls, what), luaX_token2str(ls, who), where));
+ }
+ }
+}
+
+
+static TString *str_checkname (LexState *ls) {
+ TString *ts;
+ check(ls, TK_NAME);
+ ts = ls->t.seminfo.ts;
+ luaX_next(ls);
+ return ts;
+}
+
+
+static void init_exp (expdesc *e, expkind k, int i) {
+ e->f = e->t = NO_JUMP;
+ e->k = k;
+ e->u.info = i;
+}
+
+
+static void codestring (LexState *ls, expdesc *e, TString *s) {
+ init_exp(e, VK, luaK_stringK(ls->fs, s));
+}
+
+
+static void checkname (LexState *ls, expdesc *e) {
+ codestring(ls, e, str_checkname(ls));
+}
+
+
+static int registerlocalvar (LexState *ls, TString *varname) {
+ FuncState *fs = ls->fs;
+ Proto *f = fs->f;
+ int oldsize = f->sizelocvars;
+ luaM_growvector(ls->L, f->locvars, fs->nlocvars, f->sizelocvars,
+ LocVar, SHRT_MAX, "local variables");
+ while (oldsize < f->sizelocvars) f->locvars[oldsize++].varname = NULL;
+ f->locvars[fs->nlocvars].varname = varname;
+ luaC_objbarrier(ls->L, f, varname);
+ return fs->nlocvars++;
+}
+
+
+static void new_localvar (LexState *ls, TString *name) {
+ FuncState *fs = ls->fs;
+ Dyndata *dyd = ls->dyd;
+ int reg = registerlocalvar(ls, name);
+ checklimit(fs, dyd->actvar.n + 1 - fs->firstlocal,
+ MAXVARS, "local variables");
+ luaM_growvector(ls->L, dyd->actvar.arr, dyd->actvar.n + 1,
+ dyd->actvar.size, Vardesc, MAX_INT, "local variables");
+ dyd->actvar.arr[dyd->actvar.n++].idx = cast(short, reg);
+}
+
+
+static void new_localvarliteral_ (LexState *ls, const char *name, size_t sz) {
+ new_localvar(ls, luaX_newstring(ls, name, sz));
+}
+
+#define new_localvarliteral(ls,v) \
+ new_localvarliteral_(ls, "" v, (sizeof(v)/sizeof(char))-1)
+
+
+static LocVar *getlocvar (FuncState *fs, int i) {
+ int idx = fs->ls->dyd->actvar.arr[fs->firstlocal + i].idx;
+ lua_assert(idx < fs->nlocvars);
+ return &fs->f->locvars[idx];
+}
+
+
+static void adjustlocalvars (LexState *ls, int nvars) {
+ FuncState *fs = ls->fs;
+ fs->nactvar = cast_byte(fs->nactvar + nvars);
+ for (; nvars; nvars--) {
+ getlocvar(fs, fs->nactvar - nvars)->startpc = fs->pc;
+ }
+}
+
+
+static void removevars (FuncState *fs, int tolevel) {
+ fs->ls->dyd->actvar.n -= (fs->nactvar - tolevel);
+ while (fs->nactvar > tolevel)
+ getlocvar(fs, --fs->nactvar)->endpc = fs->pc;
+}
+
+
+static int searchupvalue (FuncState *fs, TString *name) {
+ int i;
+ Upvaldesc *up = fs->f->upvalues;
+ for (i = 0; i < fs->nups; i++) {
+ if (luaS_eqstr(up[i].name, name)) return i;
+ }
+ return -1; /* not found */
+}
+
+
+static int newupvalue (FuncState *fs, TString *name, expdesc *v) {
+ Proto *f = fs->f;
+ int oldsize = f->sizeupvalues;
+ checklimit(fs, fs->nups + 1, MAXUPVAL, "upvalues");
+ luaM_growvector(fs->ls->L, f->upvalues, fs->nups, f->sizeupvalues,
+ Upvaldesc, MAXUPVAL, "upvalues");
+ while (oldsize < f->sizeupvalues) f->upvalues[oldsize++].name = NULL;
+ f->upvalues[fs->nups].instack = (v->k == VLOCAL);
+ f->upvalues[fs->nups].idx = cast_byte(v->u.info);
+ f->upvalues[fs->nups].name = name;
+ luaC_objbarrier(fs->ls->L, f, name);
+ return fs->nups++;
+}
+
+
+static int searchvar (FuncState *fs, TString *n) {
+ int i;
+ for (i = cast_int(fs->nactvar) - 1; i >= 0; i--) {
+ if (luaS_eqstr(n, getlocvar(fs, i)->varname))
+ return i;
+ }
+ return -1; /* not found */
+}
+
+
+/*
+ Mark block where variable at given level was defined
+ (to emit close instructions later).
+*/
+static void markupval (FuncState *fs, int level) {
+ BlockCnt *bl = fs->bl;
+ while (bl->nactvar > level) bl = bl->previous;
+ bl->upval = 1;
+}
+
+
+/*
+ Find variable with given name 'n'. If it is an upvalue, add this
+ upvalue into all intermediate functions.
+*/
+static int singlevaraux (FuncState *fs, TString *n, expdesc *var, int base) {
+ if (fs == NULL) /* no more levels? */
+ return VVOID; /* default is global */
+ else {
+ int v = searchvar(fs, n); /* look up locals at current level */
+ if (v >= 0) { /* found? */
+ init_exp(var, VLOCAL, v); /* variable is local */
+ if (!base)
+ markupval(fs, v); /* local will be used as an upval */
+ return VLOCAL;
+ }
+ else { /* not found as local at current level; try upvalues */
+ int idx = searchupvalue(fs, n); /* try existing upvalues */
+ if (idx < 0) { /* not found? */
+ if (singlevaraux(fs->prev, n, var, 0) == VVOID) /* try upper levels */
+ return VVOID; /* not found; is a global */
+ /* else was LOCAL or UPVAL */
+ idx = newupvalue(fs, n, var); /* will be a new upvalue */
+ }
+ init_exp(var, VUPVAL, idx);
+ return VUPVAL;
+ }
+ }
+}
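+
+/*
+** Illustrative example (not part of upstream Lua): in
+**
+**   local x
+**   function f () return function () return x end end
+**
+** resolving 'x' inside the innermost function finds nothing local at
+** that level, recurses into 'f' (again nothing), and finally finds the
+** local in the main chunk; unwinding then creates one upvalue entry in
+** 'f' and another in the inner function, each pointing one level up.
+*/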
+
+
+static void singlevar (LexState *ls, expdesc *var) {
+ TString *varname = str_checkname(ls);
+ FuncState *fs = ls->fs;
+ if (singlevaraux(fs, varname, var, 1) == VVOID) { /* global name? */
+ expdesc key;
+ singlevaraux(fs, ls->envn, var, 1); /* get environment variable */
+ lua_assert(var->k == VLOCAL || var->k == VUPVAL);
+ codestring(ls, &key, varname); /* key is variable name */
+ luaK_indexed(fs, var, &key); /* env[varname] */
+ }
+}
+
+
+static void adjust_assign (LexState *ls, int nvars, int nexps, expdesc *e) {
+ FuncState *fs = ls->fs;
+ int extra = nvars - nexps;
+ if (hasmultret(e->k)) {
+ extra++; /* includes call itself */
+ if (extra < 0) extra = 0;
+ luaK_setreturns(fs, e, extra); /* last exp. provides the difference */
+ if (extra > 1) luaK_reserveregs(fs, extra-1);
+ }
+ else {
+ if (e->k != VVOID) luaK_exp2nextreg(fs, e); /* close last expression */
+ if (extra > 0) {
+ int reg = fs->freereg;
+ luaK_reserveregs(fs, extra);
+ luaK_nil(fs, reg, extra);
+ }
+ }
+}
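+
+/*
+** Example (illustrative, not from upstream): in 'local a, b, c = f()',
+** nvars is 3 and nexps is 1 with a multi-return call as the last
+** expression, so the call is patched to produce the 3 missing values
+** ('extra' counts the call itself).  In 'local a, b = 1' the last
+** expression is single-valued, so the one missing register is filled
+** by luaK_nil.
+*/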
+
+
+static void enterlevel (LexState *ls) {
+ lua_State *L = ls->L;
+ ++L->nCcalls;
+ checklimit(ls->fs, L->nCcalls, LUAI_MAXCCALLS, "C levels");
+}
+
+
+#define leavelevel(ls) ((ls)->L->nCcalls--)
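+
+/*
+** Illustrative note (not from upstream): enterlevel/leavelevel bound
+** the recursion depth of the parser itself, so deeply nested input
+** (e.g. an expression wrapped in thousands of parentheses) fails with
+** a "too many C levels" syntax error instead of overflowing the C
+** stack.
+*/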
+
+
+static void closegoto (LexState *ls, int g, Labeldesc *label) {
+ int i;
+ FuncState *fs = ls->fs;
+ Labellist *gl = &ls->dyd->gt;
+ Labeldesc *gt = &gl->arr[g];
+ lua_assert(luaS_eqstr(gt->name, label->name));
+ if (gt->nactvar < label->nactvar) {
+ TString *vname = getlocvar(fs, gt->nactvar)->varname;
+ const char *msg = luaO_pushfstring(ls->L,
+ "<goto %s> at line %d jumps into the scope of local " LUA_QS,
+ getstr(gt->name), gt->line, getstr(vname));
+ semerror(ls, msg);
+ }
+ luaK_patchlist(fs, gt->pc, label->pc);
+ /* remove goto from pending list */
+ for (i = g; i < gl->n - 1; i++)
+ gl->arr[i] = gl->arr[i + 1];
+ gl->n--;
+}
+
+
+/*
+** try to close a goto with existing labels; this solves backward jumps
+*/
+static int findlabel (LexState *ls, int g) {
+ int i;
+ BlockCnt *bl = ls->fs->bl;
+ Dyndata *dyd = ls->dyd;
+ Labeldesc *gt = &dyd->gt.arr[g];
+ /* check labels in current block for a match */
+ for (i = bl->firstlabel; i < dyd->label.n; i++) {
+ Labeldesc *lb = &dyd->label.arr[i];
+ if (luaS_eqstr(lb->name, gt->name)) { /* correct label? */
+ if (gt->nactvar > lb->nactvar &&
+ (bl->upval || dyd->label.n > bl->firstlabel))
+ luaK_patchclose(ls->fs, gt->pc, lb->nactvar);
+ closegoto(ls, g, lb); /* close it */
+ return 1;
+ }
+ }
+ return 0; /* label not found; cannot close goto */
+}
+
+
+static int newlabelentry (LexState *ls, Labellist *l, TString *name,
+ int line, int pc) {
+ int n = l->n;
+ luaM_growvector(ls->L, l->arr, n, l->size,
+ Labeldesc, SHRT_MAX, "labels/gotos");
+ l->arr[n].name = name;
+ l->arr[n].line = line;
+ l->arr[n].nactvar = ls->fs->nactvar;
+ l->arr[n].pc = pc;
+ l->n++;
+ return n;
+}
+
+
+/*
+** check whether new label 'lb' matches any pending gotos in current
+** block; solves forward jumps
+*/
+static void findgotos (LexState *ls, Labeldesc *lb) {
+ Labellist *gl = &ls->dyd->gt;
+ int i = ls->fs->bl->firstgoto;
+ while (i < gl->n) {
+ if (luaS_eqstr(gl->arr[i].name, lb->name))
+ closegoto(ls, i, lb);
+ else
+ i++;
+ }
+}
+
+
+/*
+** "export" pending gotos to outer level, to check them against
+** outer labels; if the block being exited has upvalues, and
+** the goto exits the scope of any variable (which can be the
+** upvalue), close those variables as they go out of scope.
+*/
+static void movegotosout (FuncState *fs, BlockCnt *bl) {
+ int i = bl->firstgoto;
+ Labellist *gl = &fs->ls->dyd->gt;
+  /* correct pending gotos to current block and try to close them
+ with visible labels */
+ while (i < gl->n) {
+ Labeldesc *gt = &gl->arr[i];
+ if (gt->nactvar > bl->nactvar) {
+ if (bl->upval)
+ luaK_patchclose(fs, gt->pc, bl->nactvar);
+ gt->nactvar = bl->nactvar;
+ }
+ if (!findlabel(fs->ls, i))
+ i++; /* move to next one */
+ }
+}
+
+
+static void enterblock (FuncState *fs, BlockCnt *bl, lu_byte isloop) {
+ bl->isloop = isloop;
+ bl->nactvar = fs->nactvar;
+ bl->firstlabel = fs->ls->dyd->label.n;
+ bl->firstgoto = fs->ls->dyd->gt.n;
+ bl->upval = 0;
+ bl->previous = fs->bl;
+ fs->bl = bl;
+ lua_assert(fs->freereg == fs->nactvar);
+}
+
+
+/*
+** create a label named "break" to resolve break statements
+*/
+static void breaklabel (LexState *ls) {
+ TString *n = luaS_new(ls->L, "break");
+ int l = newlabelentry(ls, &ls->dyd->label, n, 0, ls->fs->pc);
+ findgotos(ls, &ls->dyd->label.arr[l]);
+}
+
+/*
+** generates an error for an undefined 'goto'; chooses an appropriate
+** message when the label name is a reserved word (which can only be 'break')
+*/
+static l_noret undefgoto (LexState *ls, Labeldesc *gt) {
+ const char *msg = isreserved(gt->name)
+ ? "<%s> at line %d not inside a loop"
+ : "no visible label " LUA_QS " for <goto> at line %d";
+ msg = luaO_pushfstring(ls->L, msg, getstr(gt->name), gt->line);
+ semerror(ls, msg);
+}
+
+
+static void leaveblock (FuncState *fs) {
+ BlockCnt *bl = fs->bl;
+ LexState *ls = fs->ls;
+ if (bl->previous && bl->upval) {
+ /* create a 'jump to here' to close upvalues */
+ int j = luaK_jump(fs);
+ luaK_patchclose(fs, j, bl->nactvar);
+ luaK_patchtohere(fs, j);
+ }
+ if (bl->isloop)
+ breaklabel(ls); /* close pending breaks */
+ fs->bl = bl->previous;
+ removevars(fs, bl->nactvar);
+ lua_assert(bl->nactvar == fs->nactvar);
+ fs->freereg = fs->nactvar; /* free registers */
+ ls->dyd->label.n = bl->firstlabel; /* remove local labels */
+ if (bl->previous) /* inner block? */
+ movegotosout(fs, bl); /* update pending gotos to outer block */
+ else if (bl->firstgoto < ls->dyd->gt.n) /* pending gotos in outer block? */
+ undefgoto(ls, &ls->dyd->gt.arr[bl->firstgoto]); /* error */
+}
+
+
+/*
+** adds a new prototype into list of prototypes
+*/
+static Proto *addprototype (LexState *ls) {
+ Proto *clp;
+ lua_State *L = ls->L;
+ FuncState *fs = ls->fs;
+ Proto *f = fs->f; /* prototype of current function */
+ if (fs->np >= f->sizep) {
+ int oldsize = f->sizep;
+ luaM_growvector(L, f->p, fs->np, f->sizep, Proto *, MAXARG_Bx, "functions");
+ while (oldsize < f->sizep) f->p[oldsize++] = NULL;
+ }
+ f->p[fs->np++] = clp = luaF_newproto(L);
+ luaC_objbarrier(L, f, clp);
+ return clp;
+}
+
+
+/*
+** codes instruction to create new closure in parent function.
+** The OP_CLOSURE instruction must use the last available register,
+** so that, if it invokes the GC, the GC knows which registers
+** are in use at that time.
+*/
+static void codeclosure (LexState *ls, expdesc *v) {
+ FuncState *fs = ls->fs->prev;
+ init_exp(v, VRELOCABLE, luaK_codeABx(fs, OP_CLOSURE, 0, fs->np - 1));
+ luaK_exp2nextreg(fs, v); /* fix it at the last register */
+}
+
+
+static void open_func (LexState *ls, FuncState *fs, BlockCnt *bl) {
+ lua_State *L = ls->L;
+ Proto *f;
+ fs->prev = ls->fs; /* linked list of funcstates */
+ fs->ls = ls;
+ ls->fs = fs;
+ fs->pc = 0;
+ fs->lasttarget = 0;
+ fs->jpc = NO_JUMP;
+ fs->freereg = 0;
+ fs->nk = 0;
+ fs->np = 0;
+ fs->nups = 0;
+ fs->nlocvars = 0;
+ fs->nactvar = 0;
+ fs->firstlocal = ls->dyd->actvar.n;
+ fs->bl = NULL;
+ f = fs->f;
+ f->source = ls->source;
+ f->maxstacksize = 2; /* registers 0/1 are always valid */
+ fs->h = luaH_new(L);
+ /* anchor table of constants (to avoid being collected) */
+ sethvalue2s(L, L->top, fs->h);
+ incr_top(L);
+ enterblock(fs, bl, 0);
+}
+
+
+static void close_func (LexState *ls) {
+ lua_State *L = ls->L;
+ FuncState *fs = ls->fs;
+ Proto *f = fs->f;
+ luaK_ret(fs, 0, 0); /* final return */
+ leaveblock(fs);
+ luaM_reallocvector(L, f->code, f->sizecode, fs->pc, Instruction);
+ f->sizecode = fs->pc;
+ luaM_reallocvector(L, f->lineinfo, f->sizelineinfo, fs->pc, int);
+ f->sizelineinfo = fs->pc;
+ luaM_reallocvector(L, f->k, f->sizek, fs->nk, TValue);
+ f->sizek = fs->nk;
+ luaM_reallocvector(L, f->p, f->sizep, fs->np, Proto *);
+ f->sizep = fs->np;
+ luaM_reallocvector(L, f->locvars, f->sizelocvars, fs->nlocvars, LocVar);
+ f->sizelocvars = fs->nlocvars;
+ luaM_reallocvector(L, f->upvalues, f->sizeupvalues, fs->nups, Upvaldesc);
+ f->sizeupvalues = fs->nups;
+ lua_assert(fs->bl == NULL);
+ ls->fs = fs->prev;
+ /* last token read was anchored in defunct function; must re-anchor it */
+ anchor_token(ls);
+ L->top--; /* pop table of constants */
+ luaC_checkGC(L);
+}
+
+
+
+/*============================================================*/
+/* GRAMMAR RULES */
+/*============================================================*/
+
+
+/*
+** check whether current token is in the follow set of a block.
+** 'until' closes syntactical blocks, but does not close scope,
+** so it is handled separately.
+*/
+static int block_follow (LexState *ls, int withuntil) {
+ switch (ls->t.token) {
+ case TK_ELSE: case TK_ELSEIF:
+ case TK_END: case TK_EOS:
+ return 1;
+ case TK_UNTIL: return withuntil;
+ default: return 0;
+ }
+}
+
+
+static void statlist (LexState *ls) {
+ /* statlist -> { stat [`;'] } */
+ while (!block_follow(ls, 1)) {
+ if (ls->t.token == TK_RETURN) {
+ statement(ls);
+ return; /* 'return' must be last statement */
+ }
+ statement(ls);
+ }
+}
+
+
+static void fieldsel (LexState *ls, expdesc *v) {
+ /* fieldsel -> ['.' | ':'] NAME */
+ FuncState *fs = ls->fs;
+ expdesc key;
+ luaK_exp2anyregup(fs, v);
+ luaX_next(ls); /* skip the dot or colon */
+ checkname(ls, &key);
+ luaK_indexed(fs, v, &key);
+}
+
+
+static void yindex (LexState *ls, expdesc *v) {
+ /* index -> '[' expr ']' */
+ luaX_next(ls); /* skip the '[' */
+ expr(ls, v);
+ luaK_exp2val(ls->fs, v);
+ checknext(ls, ']');
+}
+
+
+/*
+** {======================================================================
+** Rules for Constructors
+** =======================================================================
+*/
+
+
+struct ConsControl {
+ expdesc v; /* last list item read */
+ expdesc *t; /* table descriptor */
+ int nh; /* total number of `record' elements */
+ int na; /* total number of array elements */
+ int tostore; /* number of array elements pending to be stored */
+};
+
+
+static void recfield (LexState *ls, struct ConsControl *cc) {
+ /* recfield -> (NAME | `['exp1`]') = exp1 */
+ FuncState *fs = ls->fs;
+ int reg = ls->fs->freereg;
+ expdesc key, val;
+ int rkkey;
+ if (ls->t.token == TK_NAME) {
+ checklimit(fs, cc->nh, MAX_INT, "items in a constructor");
+ checkname(ls, &key);
+ }
+ else /* ls->t.token == '[' */
+ yindex(ls, &key);
+ cc->nh++;
+ checknext(ls, '=');
+ rkkey = luaK_exp2RK(fs, &key);
+ expr(ls, &val);
+ luaK_codeABC(fs, OP_SETTABLE, cc->t->u.info, rkkey, luaK_exp2RK(fs, &val));
+ fs->freereg = reg; /* free registers */
+}
+
+
+static void closelistfield (FuncState *fs, struct ConsControl *cc) {
+ if (cc->v.k == VVOID) return; /* there is no list item */
+ luaK_exp2nextreg(fs, &cc->v);
+ cc->v.k = VVOID;
+ if (cc->tostore == LFIELDS_PER_FLUSH) {
+ luaK_setlist(fs, cc->t->u.info, cc->na, cc->tostore); /* flush */
+ cc->tostore = 0; /* no more items pending */
+ }
+}
+
+
+static void lastlistfield (FuncState *fs, struct ConsControl *cc) {
+ if (cc->tostore == 0) return;
+ if (hasmultret(cc->v.k)) {
+ luaK_setmultret(fs, &cc->v);
+ luaK_setlist(fs, cc->t->u.info, cc->na, LUA_MULTRET);
+ cc->na--; /* do not count last expression (unknown number of elements) */
+ }
+ else {
+ if (cc->v.k != VVOID)
+ luaK_exp2nextreg(fs, &cc->v);
+ luaK_setlist(fs, cc->t->u.info, cc->na, cc->tostore);
+ }
+}
+
+
+static void listfield (LexState *ls, struct ConsControl *cc) {
+ /* listfield -> exp */
+ expr(ls, &cc->v);
+ checklimit(ls->fs, cc->na, MAX_INT, "items in a constructor");
+ cc->na++;
+ cc->tostore++;
+}
+
+
+static void field (LexState *ls, struct ConsControl *cc) {
+ /* field -> listfield | recfield */
+ switch(ls->t.token) {
+ case TK_NAME: { /* may be 'listfield' or 'recfield' */
+ if (luaX_lookahead(ls) != '=') /* expression? */
+ listfield(ls, cc);
+ else
+ recfield(ls, cc);
+ break;
+ }
+ case '[': {
+ recfield(ls, cc);
+ break;
+ }
+ default: {
+ listfield(ls, cc);
+ break;
+ }
+ }
+}
+
+
+static void constructor (LexState *ls, expdesc *t) {
+ /* constructor -> '{' [ field { sep field } [sep] ] '}'
+ sep -> ',' | ';' */
+ FuncState *fs = ls->fs;
+ int line = ls->linenumber;
+ int pc = luaK_codeABC(fs, OP_NEWTABLE, 0, 0, 0);
+ struct ConsControl cc;
+ cc.na = cc.nh = cc.tostore = 0;
+ cc.t = t;
+ init_exp(t, VRELOCABLE, pc);
+ init_exp(&cc.v, VVOID, 0); /* no value (yet) */
+ luaK_exp2nextreg(ls->fs, t); /* fix it at stack top */
+ checknext(ls, '{');
+ do {
+ lua_assert(cc.v.k == VVOID || cc.tostore > 0);
+ if (ls->t.token == '}') break;
+ closelistfield(fs, &cc);
+ field(ls, &cc);
+ } while (testnext(ls, ',') || testnext(ls, ';'));
+ check_match(ls, '}', '{', line);
+ lastlistfield(fs, &cc);
+ SETARG_B(fs->f->code[pc], luaO_int2fb(cc.na)); /* set initial array size */
+ SETARG_C(fs->f->code[pc], luaO_int2fb(cc.nh)); /* set initial table size */
+}
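+
+/*
+** Illustrative example (not from upstream): in 't = {1, 2, x = 3, 4}'
+** the three array items flow through listfield/closelistfield and are
+** flushed with OP_SETLIST (in batches of LFIELDS_PER_FLUSH), while
+** 'x = 3' goes through recfield and a direct OP_SETTABLE.  The final
+** counts (na = 3, nh = 1) are encoded back into OP_NEWTABLE as size
+** hints via luaO_int2fb.
+*/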
+
+/* }====================================================================== */
+
+
+
+static void parlist (LexState *ls) {
+ /* parlist -> [ param { `,' param } ] */
+ FuncState *fs = ls->fs;
+ Proto *f = fs->f;
+ int nparams = 0;
+ f->is_vararg = 0;
+ if (ls->t.token != ')') { /* is `parlist' not empty? */
+ do {
+ switch (ls->t.token) {
+ case TK_NAME: { /* param -> NAME */
+ new_localvar(ls, str_checkname(ls));
+ nparams++;
+ break;
+ }
+ case TK_DOTS: { /* param -> `...' */
+ luaX_next(ls);
+ f->is_vararg = 1;
+ break;
+ }
+ default: luaX_syntaxerror(ls, "<name> or " LUA_QL("...") " expected");
+ }
+ } while (!f->is_vararg && testnext(ls, ','));
+ }
+ adjustlocalvars(ls, nparams);
+ f->numparams = cast_byte(fs->nactvar);
+  luaK_reserveregs(fs, fs->nactvar);  /* reserve registers for parameters */
+}
+
+
+static void body (LexState *ls, expdesc *e, int ismethod, int line) {
+ /* body -> `(' parlist `)' block END */
+ FuncState new_fs;
+ BlockCnt bl;
+ new_fs.f = addprototype(ls);
+ new_fs.f->linedefined = line;
+ open_func(ls, &new_fs, &bl);
+ checknext(ls, '(');
+ if (ismethod) {
+ new_localvarliteral(ls, "self"); /* create 'self' parameter */
+ adjustlocalvars(ls, 1);
+ }
+ parlist(ls);
+ checknext(ls, ')');
+ statlist(ls);
+ new_fs.f->lastlinedefined = ls->linenumber;
+ check_match(ls, TK_END, TK_FUNCTION, line);
+ codeclosure(ls, e);
+ close_func(ls);
+}
+
+
+static int explist (LexState *ls, expdesc *v) {
+ /* explist -> expr { `,' expr } */
+ int n = 1; /* at least one expression */
+ expr(ls, v);
+ while (testnext(ls, ',')) {
+ luaK_exp2nextreg(ls->fs, v);
+ expr(ls, v);
+ n++;
+ }
+ return n;
+}
+
+
+static void funcargs (LexState *ls, expdesc *f, int line) {
+ FuncState *fs = ls->fs;
+ expdesc args;
+ int base, nparams;
+ switch (ls->t.token) {
+ case '(': { /* funcargs -> `(' [ explist ] `)' */
+ luaX_next(ls);
+ if (ls->t.token == ')') /* arg list is empty? */
+ args.k = VVOID;
+ else {
+ explist(ls, &args);
+ luaK_setmultret(fs, &args);
+ }
+ check_match(ls, ')', '(', line);
+ break;
+ }
+ case '{': { /* funcargs -> constructor */
+ constructor(ls, &args);
+ break;
+ }
+ case TK_STRING: { /* funcargs -> STRING */
+ codestring(ls, &args, ls->t.seminfo.ts);
+ luaX_next(ls); /* must use `seminfo' before `next' */
+ break;
+ }
+ default: {
+ luaX_syntaxerror(ls, "function arguments expected");
+ }
+ }
+ lua_assert(f->k == VNONRELOC);
+ base = f->u.info; /* base register for call */
+ if (hasmultret(args.k))
+ nparams = LUA_MULTRET; /* open call */
+ else {
+ if (args.k != VVOID)
+ luaK_exp2nextreg(fs, &args); /* close last argument */
+ nparams = fs->freereg - (base+1);
+ }
+ init_exp(f, VCALL, luaK_codeABC(fs, OP_CALL, base, nparams+1, 2));
+ luaK_fixline(fs, line);
+  fs->freereg = base+1;  /* call removes function and arguments and leaves
+ (unless changed) one result */
+}
+
+
+
+
+/*
+** {======================================================================
+** Expression parsing
+** =======================================================================
+*/
+
+
+static void primaryexp (LexState *ls, expdesc *v) {
+ /* primaryexp -> NAME | '(' expr ')' */
+ switch (ls->t.token) {
+ case '(': {
+ int line = ls->linenumber;
+ luaX_next(ls);
+ expr(ls, v);
+ check_match(ls, ')', '(', line);
+ luaK_dischargevars(ls->fs, v);
+ return;
+ }
+ case TK_NAME: {
+ singlevar(ls, v);
+ return;
+ }
+ default: {
+ luaX_syntaxerror(ls, "unexpected symbol");
+ }
+ }
+}
+
+
+static void suffixedexp (LexState *ls, expdesc *v) {
+ /* suffixedexp ->
+ primaryexp { '.' NAME | '[' exp ']' | ':' NAME funcargs | funcargs } */
+ FuncState *fs = ls->fs;
+ int line = ls->linenumber;
+ primaryexp(ls, v);
+ for (;;) {
+ switch (ls->t.token) {
+ case '.': { /* fieldsel */
+ fieldsel(ls, v);
+ break;
+ }
+ case '[': { /* `[' exp1 `]' */
+ expdesc key;
+ luaK_exp2anyregup(fs, v);
+ yindex(ls, &key);
+ luaK_indexed(fs, v, &key);
+ break;
+ }
+ case ':': { /* `:' NAME funcargs */
+ expdesc key;
+ luaX_next(ls);
+ checkname(ls, &key);
+ luaK_self(fs, v, &key);
+ funcargs(ls, v, line);
+ break;
+ }
+ case '(': case TK_STRING: case '{': { /* funcargs */
+ luaK_exp2nextreg(fs, v);
+ funcargs(ls, v, line);
+ break;
+ }
+ default: return;
+ }
+ }
+}
+
+
+static void simpleexp (LexState *ls, expdesc *v) {
+ /* simpleexp -> NUMBER | STRING | NIL | TRUE | FALSE | ... |
+ constructor | FUNCTION body | suffixedexp */
+ switch (ls->t.token) {
+ case TK_NUMBER: {
+ init_exp(v, VKNUM, 0);
+ v->u.nval = ls->t.seminfo.r;
+ break;
+ }
+ case TK_STRING: {
+ codestring(ls, v, ls->t.seminfo.ts);
+ break;
+ }
+ case TK_NIL: {
+ init_exp(v, VNIL, 0);
+ break;
+ }
+ case TK_TRUE: {
+ init_exp(v, VTRUE, 0);
+ break;
+ }
+ case TK_FALSE: {
+ init_exp(v, VFALSE, 0);
+ break;
+ }
+ case TK_DOTS: { /* vararg */
+ FuncState *fs = ls->fs;
+ check_condition(ls, fs->f->is_vararg,
+ "cannot use " LUA_QL("...") " outside a vararg function");
+ init_exp(v, VVARARG, luaK_codeABC(fs, OP_VARARG, 0, 1, 0));
+ break;
+ }
+ case '{': { /* constructor */
+ constructor(ls, v);
+ return;
+ }
+ case TK_FUNCTION: {
+ luaX_next(ls);
+ body(ls, v, 0, ls->linenumber);
+ return;
+ }
+ default: {
+ suffixedexp(ls, v);
+ return;
+ }
+ }
+ luaX_next(ls);
+}
+
+
+static UnOpr getunopr (int op) {
+ switch (op) {
+ case TK_NOT: return OPR_NOT;
+ case '-': return OPR_MINUS;
+ case '#': return OPR_LEN;
+ default: return OPR_NOUNOPR;
+ }
+}
+
+
+static BinOpr getbinopr (int op) {
+ switch (op) {
+ case '+': return OPR_ADD;
+ case '-': return OPR_SUB;
+ case '*': return OPR_MUL;
+ case '/': return OPR_DIV;
+ case '%': return OPR_MOD;
+ case '^': return OPR_POW;
+ case TK_CONCAT: return OPR_CONCAT;
+ case TK_NE: return OPR_NE;
+ case TK_EQ: return OPR_EQ;
+ case '<': return OPR_LT;
+ case TK_LE: return OPR_LE;
+ case '>': return OPR_GT;
+ case TK_GE: return OPR_GE;
+ case TK_AND: return OPR_AND;
+ case TK_OR: return OPR_OR;
+ default: return OPR_NOBINOPR;
+ }
+}
+
+
+static const struct {
+ lu_byte left; /* left priority for each binary operator */
+ lu_byte right; /* right priority */
+} priority[] = { /* ORDER OPR */
+ {6, 6}, {6, 6}, {7, 7}, {7, 7}, {7, 7}, /* `+' `-' `*' `/' `%' */
+ {10, 9}, {5, 4}, /* ^, .. (right associative) */
+ {3, 3}, {3, 3}, {3, 3}, /* ==, <, <= */
+ {3, 3}, {3, 3}, {3, 3}, /* ~=, >, >= */
+ {2, 2}, {1, 1} /* and, or */
+};
+
+#define UNARY_PRIORITY 8 /* priority for unary operators */
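+
+/*
+** Illustrative note (not from upstream): a right priority lower than
+** the left one makes an operator right-associative, so '2^3^2' parses
+** as '2^(3^2)' and 'a..b..c' as 'a..(b..c)'.  Because UNARY_PRIORITY
+** (8) is below the left priority of '^' (10), '-x^2' parses as
+** '-(x^2)'.
+*/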
+
+
+/*
+** subexpr -> (simpleexp | unop subexpr) { binop subexpr }
+** where `binop' is any binary operator with a priority higher than `limit'
+*/
+static BinOpr subexpr (LexState *ls, expdesc *v, int limit) {
+ BinOpr op;
+ UnOpr uop;
+ enterlevel(ls);
+ uop = getunopr(ls->t.token);
+ if (uop != OPR_NOUNOPR) {
+ int line = ls->linenumber;
+ luaX_next(ls);
+ subexpr(ls, v, UNARY_PRIORITY);
+ luaK_prefix(ls->fs, uop, v, line);
+ }
+ else simpleexp(ls, v);
+ /* expand while operators have priorities higher than `limit' */
+ op = getbinopr(ls->t.token);
+ while (op != OPR_NOBINOPR && priority[op].left > limit) {
+ expdesc v2;
+ BinOpr nextop;
+ int line = ls->linenumber;
+ luaX_next(ls);
+ luaK_infix(ls->fs, op, v);
+ /* read sub-expression with higher priority */
+ nextop = subexpr(ls, &v2, priority[op].right);
+ luaK_posfix(ls->fs, op, v, &v2, line);
+ op = nextop;
+ }
+ leavelevel(ls);
+ return op; /* return first untreated operator */
+}
+
+
+static void expr (LexState *ls, expdesc *v) {
+ subexpr(ls, v, 0);
+}
+
+/* }==================================================================== */
+
+
+
+/*
+** {======================================================================
+** Rules for Statements
+** =======================================================================
+*/
+
+
+static void block (LexState *ls) {
+ /* block -> statlist */
+ FuncState *fs = ls->fs;
+ BlockCnt bl;
+ enterblock(fs, &bl, 0);
+ statlist(ls);
+ leaveblock(fs);
+}
+
+
+/*
+** structure to chain all variables in the left-hand side of an
+** assignment
+*/
+struct LHS_assign {
+ struct LHS_assign *prev;
+ expdesc v; /* variable (global, local, upvalue, or indexed) */
+};
+
+
+/*
+** check whether, in an assignment to an upvalue/local variable, the
+** upvalue/local variable is being used in a previous assignment to a
+** table. If so, save original upvalue/local value in a safe place and
+** use this safe copy in the previous assignment.
+*/
+static void check_conflict (LexState *ls, struct LHS_assign *lh, expdesc *v) {
+ FuncState *fs = ls->fs;
+ int extra = fs->freereg; /* eventual position to save local variable */
+ int conflict = 0;
+ for (; lh; lh = lh->prev) { /* check all previous assignments */
+ if (lh->v.k == VINDEXED) { /* assigning to a table? */
+ /* table is the upvalue/local being assigned now? */
+ if (lh->v.u.ind.vt == v->k && lh->v.u.ind.t == v->u.info) {
+ conflict = 1;
+ lh->v.u.ind.vt = VLOCAL;
+ lh->v.u.ind.t = extra; /* previous assignment will use safe copy */
+ }
+ /* index is the local being assigned? (index cannot be upvalue) */
+ if (v->k == VLOCAL && lh->v.u.ind.idx == v->u.info) {
+ conflict = 1;
+ lh->v.u.ind.idx = extra; /* previous assignment will use safe copy */
+ }
+ }
+ }
+ if (conflict) {
+ /* copy upvalue/local value to a temporary (in position 'extra') */
+ OpCode op = (v->k == VLOCAL) ? OP_MOVE : OP_GETUPVAL;
+ luaK_codeABC(fs, op, extra, v->u.info, 0);
+ luaK_reserveregs(fs, 1);
+ }
+}
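+
+/*
+** Illustrative example (not from upstream): in
+**
+**   local a, i = {}, 1
+**   a[i], i = 20, 30
+**
+** the store to 'i' happens before the store to 'a[i]' (assignments are
+** performed last to first), so the original value of 'i' is first
+** copied to a scratch register with OP_MOVE and the pending 'a[i]'
+** store is redirected to that safe copy.
+*/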
+
+
+static void assignment (LexState *ls, struct LHS_assign *lh, int nvars) {
+ expdesc e;
+ check_condition(ls, vkisvar(lh->v.k), "syntax error");
+ if (testnext(ls, ',')) { /* assignment -> ',' suffixedexp assignment */
+ struct LHS_assign nv;
+ nv.prev = lh;
+ suffixedexp(ls, &nv.v);
+ if (nv.v.k != VINDEXED)
+ check_conflict(ls, lh, &nv.v);
+ checklimit(ls->fs, nvars + ls->L->nCcalls, LUAI_MAXCCALLS,
+ "C levels");
+ assignment(ls, &nv, nvars+1);
+ }
+ else { /* assignment -> `=' explist */
+ int nexps;
+ checknext(ls, '=');
+ nexps = explist(ls, &e);
+ if (nexps != nvars) {
+ adjust_assign(ls, nvars, nexps, &e);
+ if (nexps > nvars)
+ ls->fs->freereg -= nexps - nvars; /* remove extra values */
+ }
+ else {
+ luaK_setoneret(ls->fs, &e); /* close last expression */
+ luaK_storevar(ls->fs, &lh->v, &e);
+ return; /* avoid default */
+ }
+ }
+ init_exp(&e, VNONRELOC, ls->fs->freereg-1); /* default assignment */
+ luaK_storevar(ls->fs, &lh->v, &e);
+}
+
+
+static int cond (LexState *ls) {
+ /* cond -> exp */
+ expdesc v;
+ expr(ls, &v); /* read condition */
+ if (v.k == VNIL) v.k = VFALSE; /* `falses' are all equal here */
+ luaK_goiftrue(ls->fs, &v);
+ return v.f;
+}
+
+
+static void gotostat (LexState *ls, int pc) {
+ int line = ls->linenumber;
+ TString *label;
+ int g;
+ if (testnext(ls, TK_GOTO))
+ label = str_checkname(ls);
+ else {
+ luaX_next(ls); /* skip break */
+ label = luaS_new(ls->L, "break");
+ }
+ g = newlabelentry(ls, &ls->dyd->gt, label, line, pc);
+ findlabel(ls, g); /* close it if label already defined */
+}
+
+
+/* check for repeated labels on the same block */
+static void checkrepeated (FuncState *fs, Labellist *ll, TString *label) {
+ int i;
+ for (i = fs->bl->firstlabel; i < ll->n; i++) {
+ if (luaS_eqstr(label, ll->arr[i].name)) {
+ const char *msg = luaO_pushfstring(fs->ls->L,
+ "label " LUA_QS " already defined on line %d",
+ getstr(label), ll->arr[i].line);
+ semerror(fs->ls, msg);
+ }
+ }
+}
+
+
+/* skip no-op statements */
+static void skipnoopstat (LexState *ls) {
+ while (ls->t.token == ';' || ls->t.token == TK_DBCOLON)
+ statement(ls);
+}
+
+
+static void labelstat (LexState *ls, TString *label, int line) {
+ /* label -> '::' NAME '::' */
+ FuncState *fs = ls->fs;
+ Labellist *ll = &ls->dyd->label;
+ int l; /* index of new label being created */
+ checkrepeated(fs, ll, label); /* check for repeated labels */
+ checknext(ls, TK_DBCOLON); /* skip double colon */
+ /* create new entry for this label */
+ l = newlabelentry(ls, ll, label, line, fs->pc);
+ skipnoopstat(ls); /* skip other no-op statements */
+ if (block_follow(ls, 0)) { /* label is last no-op statement in the block? */
+ /* assume that locals are already out of scope */
+ ll->arr[l].nactvar = fs->bl->nactvar;
+ }
+ findgotos(ls, &ll->arr[l]);
+}
+
+
+static void whilestat (LexState *ls, int line) {
+ /* whilestat -> WHILE cond DO block END */
+ FuncState *fs = ls->fs;
+ int whileinit;
+ int condexit;
+ BlockCnt bl;
+ luaX_next(ls); /* skip WHILE */
+ whileinit = luaK_getlabel(fs);
+ condexit = cond(ls);
+ enterblock(fs, &bl, 1);
+ checknext(ls, TK_DO);
+ block(ls);
+ luaK_jumpto(fs, whileinit);
+ check_match(ls, TK_END, TK_WHILE, line);
+ leaveblock(fs);
+ luaK_patchtohere(fs, condexit); /* false conditions finish the loop */
+}
+
+
+static void repeatstat (LexState *ls, int line) {
+ /* repeatstat -> REPEAT block UNTIL cond */
+ int condexit;
+ FuncState *fs = ls->fs;
+ int repeat_init = luaK_getlabel(fs);
+ BlockCnt bl1, bl2;
+ enterblock(fs, &bl1, 1); /* loop block */
+ enterblock(fs, &bl2, 0); /* scope block */
+ luaX_next(ls); /* skip REPEAT */
+ statlist(ls);
+ check_match(ls, TK_UNTIL, TK_REPEAT, line);
+ condexit = cond(ls); /* read condition (inside scope block) */
+ if (bl2.upval) /* upvalues? */
+ luaK_patchclose(fs, condexit, bl2.nactvar);
+ leaveblock(fs); /* finish scope */
+ luaK_patchlist(fs, condexit, repeat_init); /* close the loop */
+ leaveblock(fs); /* finish loop */
+}
+
+
+static int exp1 (LexState *ls) {
+ expdesc e;
+ int reg;
+ expr(ls, &e);
+ luaK_exp2nextreg(ls->fs, &e);
+ lua_assert(e.k == VNONRELOC);
+ reg = e.u.info;
+ return reg;
+}
+
+
+static void forbody (LexState *ls, int base, int line, int nvars, int isnum) {
+ /* forbody -> DO block */
+ BlockCnt bl;
+ FuncState *fs = ls->fs;
+ int prep, endfor;
+ adjustlocalvars(ls, 3); /* control variables */
+ checknext(ls, TK_DO);
+ prep = isnum ? luaK_codeAsBx(fs, OP_FORPREP, base, NO_JUMP) : luaK_jump(fs);
+ enterblock(fs, &bl, 0); /* scope for declared variables */
+ adjustlocalvars(ls, nvars);
+ luaK_reserveregs(fs, nvars);
+ block(ls);
+ leaveblock(fs); /* end of scope for declared variables */
+ luaK_patchtohere(fs, prep);
+ if (isnum) /* numeric for? */
+ endfor = luaK_codeAsBx(fs, OP_FORLOOP, base, NO_JUMP);
+ else { /* generic for */
+ luaK_codeABC(fs, OP_TFORCALL, base, 0, nvars);
+ luaK_fixline(fs, line);
+ endfor = luaK_codeAsBx(fs, OP_TFORLOOP, base + 2, NO_JUMP);
+ }
+ luaK_patchlist(fs, endfor, prep + 1);
+ luaK_fixline(fs, line);
+}
+
+
+static void fornum (LexState *ls, TString *varname, int line) {
+ /* fornum -> NAME = exp1,exp1[,exp1] forbody */
+ FuncState *fs = ls->fs;
+ int base = fs->freereg;
+ new_localvarliteral(ls, "(for index)");
+ new_localvarliteral(ls, "(for limit)");
+ new_localvarliteral(ls, "(for step)");
+ new_localvar(ls, varname);
+ checknext(ls, '=');
+ exp1(ls); /* initial value */
+ checknext(ls, ',');
+ exp1(ls); /* limit */
+ if (testnext(ls, ','))
+ exp1(ls); /* optional step */
+ else { /* default step = 1 */
+ luaK_codek(fs, fs->freereg, luaK_numberK(fs, 1));
+ luaK_reserveregs(fs, 1);
+ }
+ forbody(ls, base, line, 1, 1);
+}
+
+
+static void forlist (LexState *ls, TString *indexname) {
+ /* forlist -> NAME {,NAME} IN explist forbody */
+ FuncState *fs = ls->fs;
+ expdesc e;
+ int nvars = 4; /* gen, state, control, plus at least one declared var */
+ int line;
+ int base = fs->freereg;
+ /* create control variables */
+ new_localvarliteral(ls, "(for generator)");
+ new_localvarliteral(ls, "(for state)");
+ new_localvarliteral(ls, "(for control)");
+ /* create declared variables */
+ new_localvar(ls, indexname);
+ while (testnext(ls, ',')) {
+ new_localvar(ls, str_checkname(ls));
+ nvars++;
+ }
+ checknext(ls, TK_IN);
+ line = ls->linenumber;
+ adjust_assign(ls, 3, explist(ls, &e), &e);
+ luaK_checkstack(fs, 3); /* extra space to call generator */
+ forbody(ls, base, line, nvars - 3, 0);
+}
+
+
+static void forstat (LexState *ls, int line) {
+ /* forstat -> FOR (fornum | forlist) END */
+ FuncState *fs = ls->fs;
+ TString *varname;
+ BlockCnt bl;
+ enterblock(fs, &bl, 1); /* scope for loop and control variables */
+ luaX_next(ls); /* skip `for' */
+ varname = str_checkname(ls); /* first variable name */
+ switch (ls->t.token) {
+ case '=': fornum(ls, varname, line); break;
+ case ',': case TK_IN: forlist(ls, varname); break;
+ default: luaX_syntaxerror(ls, LUA_QL("=") " or " LUA_QL("in") " expected");
+ }
+ check_match(ls, TK_END, TK_FOR, line);
+ leaveblock(fs); /* loop scope (`break' jumps to this point) */
+}
+
+
+static void test_then_block (LexState *ls, int *escapelist) {
+ /* test_then_block -> [IF | ELSEIF] cond THEN block */
+ BlockCnt bl;
+ FuncState *fs = ls->fs;
+ expdesc v;
+ int jf; /* instruction to skip 'then' code (if condition is false) */
+ luaX_next(ls); /* skip IF or ELSEIF */
+ expr(ls, &v); /* read condition */
+ checknext(ls, TK_THEN);
+ if (ls->t.token == TK_GOTO || ls->t.token == TK_BREAK) {
+ luaK_goiffalse(ls->fs, &v); /* will jump to label if condition is true */
+ enterblock(fs, &bl, 0); /* must enter block before 'goto' */
+ gotostat(ls, v.t); /* handle goto/break */
+ skipnoopstat(ls); /* skip other no-op statements */
+ if (block_follow(ls, 0)) { /* 'goto' is the entire block? */
+ leaveblock(fs);
+ return; /* and that is it */
+ }
+ else /* must skip over 'then' part if condition is false */
+ jf = luaK_jump(fs);
+ }
+ else { /* regular case (not goto/break) */
+ luaK_goiftrue(ls->fs, &v); /* skip over block if condition is false */
+ enterblock(fs, &bl, 0);
+ jf = v.f;
+ }
+ statlist(ls); /* `then' part */
+ leaveblock(fs);
+ if (ls->t.token == TK_ELSE ||
+ ls->t.token == TK_ELSEIF) /* followed by 'else'/'elseif'? */
+ luaK_concat(fs, escapelist, luaK_jump(fs)); /* must jump over it */
+ luaK_patchtohere(fs, jf);
+}
+
+
+static void ifstat (LexState *ls, int line) {
+ /* ifstat -> IF cond THEN block {ELSEIF cond THEN block} [ELSE block] END */
+ FuncState *fs = ls->fs;
+ int escapelist = NO_JUMP; /* exit list for finished parts */
+ test_then_block(ls, &escapelist); /* IF cond THEN block */
+ while (ls->t.token == TK_ELSEIF)
+ test_then_block(ls, &escapelist); /* ELSEIF cond THEN block */
+ if (testnext(ls, TK_ELSE))
+ block(ls); /* `else' part */
+ check_match(ls, TK_END, TK_IF, line);
+ luaK_patchtohere(fs, escapelist); /* patch escape list to 'if' end */
+}
+
+
+static void localfunc (LexState *ls) {
+ expdesc b;
+ FuncState *fs = ls->fs;
+ new_localvar(ls, str_checkname(ls)); /* new local variable */
+ adjustlocalvars(ls, 1); /* enter its scope */
+ body(ls, &b, 0, ls->linenumber); /* function created in next register */
+ /* debug information will only see the variable after this point! */
+ getlocvar(fs, b.u.info)->startpc = fs->pc;
+}
+
+
+static void localstat (LexState *ls) {
+ /* stat -> LOCAL NAME {`,' NAME} [`=' explist] */
+ int nvars = 0;
+ int nexps;
+ expdesc e;
+ do {
+ new_localvar(ls, str_checkname(ls));
+ nvars++;
+ } while (testnext(ls, ','));
+ if (testnext(ls, '='))
+ nexps = explist(ls, &e);
+ else {
+ e.k = VVOID;
+ nexps = 0;
+ }
+ adjust_assign(ls, nvars, nexps, &e);
+ adjustlocalvars(ls, nvars);
+}
+
+
+static int funcname (LexState *ls, expdesc *v) {
+ /* funcname -> NAME {fieldsel} [`:' NAME] */
+ int ismethod = 0;
+ singlevar(ls, v);
+ while (ls->t.token == '.')
+ fieldsel(ls, v);
+ if (ls->t.token == ':') {
+ ismethod = 1;
+ fieldsel(ls, v);
+ }
+ return ismethod;
+}
+
+
+static void funcstat (LexState *ls, int line) {
+ /* funcstat -> FUNCTION funcname body */
+ int ismethod;
+ expdesc v, b;
+ luaX_next(ls); /* skip FUNCTION */
+ ismethod = funcname(ls, &v);
+ body(ls, &b, ismethod, line);
+ luaK_storevar(ls->fs, &v, &b);
+ luaK_fixline(ls->fs, line); /* definition `happens' in the first line */
+}
+
+
+static void exprstat (LexState *ls) {
+ /* stat -> func | assignment */
+ FuncState *fs = ls->fs;
+ struct LHS_assign v;
+ suffixedexp(ls, &v.v);
+ if (ls->t.token == '=' || ls->t.token == ',') { /* stat -> assignment ? */
+ v.prev = NULL;
+ assignment(ls, &v, 1);
+ }
+ else { /* stat -> func */
+ check_condition(ls, v.v.k == VCALL, "syntax error");
+ SETARG_C(getcode(fs, &v.v), 1); /* call statement uses no results */
+ }
+}
+
+
+static void retstat (LexState *ls) {
+ /* stat -> RETURN [explist] [';'] */
+ FuncState *fs = ls->fs;
+ expdesc e;
+ int first, nret; /* registers with returned values */
+ if (block_follow(ls, 1) || ls->t.token == ';')
+ first = nret = 0; /* return no values */
+ else {
+ nret = explist(ls, &e); /* optional return values */
+ if (hasmultret(e.k)) {
+ luaK_setmultret(fs, &e);
+ if (e.k == VCALL && nret == 1) { /* tail call? */
+ SET_OPCODE(getcode(fs,&e), OP_TAILCALL);
+ lua_assert(GETARG_A(getcode(fs,&e)) == fs->nactvar);
+ }
+ first = fs->nactvar;
+ nret = LUA_MULTRET; /* return all values */
+ }
+ else {
+ if (nret == 1) /* only one single value? */
+ first = luaK_exp2anyreg(fs, &e);
+ else {
+ luaK_exp2nextreg(fs, &e); /* values must go to the `stack' */
+ first = fs->nactvar; /* return all `active' values */
+ lua_assert(nret == fs->freereg - first);
+ }
+ }
+ }
+ luaK_ret(fs, first, nret);
+ testnext(ls, ';'); /* skip optional semicolon */
+}
+
+
+static void statement (LexState *ls) {
+ int line = ls->linenumber; /* may be needed for error messages */
+ enterlevel(ls);
+ switch (ls->t.token) {
+ case ';': { /* stat -> ';' (empty statement) */
+ luaX_next(ls); /* skip ';' */
+ break;
+ }
+ case TK_IF: { /* stat -> ifstat */
+ ifstat(ls, line);
+ break;
+ }
+ case TK_WHILE: { /* stat -> whilestat */
+ whilestat(ls, line);
+ break;
+ }
+ case TK_DO: { /* stat -> DO block END */
+ luaX_next(ls); /* skip DO */
+ block(ls);
+ check_match(ls, TK_END, TK_DO, line);
+ break;
+ }
+ case TK_FOR: { /* stat -> forstat */
+ forstat(ls, line);
+ break;
+ }
+ case TK_REPEAT: { /* stat -> repeatstat */
+ repeatstat(ls, line);
+ break;
+ }
+ case TK_FUNCTION: { /* stat -> funcstat */
+ funcstat(ls, line);
+ break;
+ }
+ case TK_LOCAL: { /* stat -> localstat */
+ luaX_next(ls); /* skip LOCAL */
+ if (testnext(ls, TK_FUNCTION)) /* local function? */
+ localfunc(ls);
+ else
+ localstat(ls);
+ break;
+ }
+ case TK_DBCOLON: { /* stat -> label */
+ luaX_next(ls); /* skip double colon */
+ labelstat(ls, str_checkname(ls), line);
+ break;
+ }
+ case TK_RETURN: { /* stat -> retstat */
+ luaX_next(ls); /* skip RETURN */
+ retstat(ls);
+ break;
+ }
+ case TK_BREAK: /* stat -> breakstat */
+ case TK_GOTO: { /* stat -> 'goto' NAME */
+ gotostat(ls, luaK_jump(ls->fs));
+ break;
+ }
+ default: { /* stat -> func | assignment */
+ exprstat(ls);
+ break;
+ }
+ }
+ lua_assert(ls->fs->f->maxstacksize >= ls->fs->freereg &&
+ ls->fs->freereg >= ls->fs->nactvar);
+ ls->fs->freereg = ls->fs->nactvar; /* free registers */
+ leavelevel(ls);
+}
+
+/* }====================================================================== */
+
+
+/*
+** compiles the main function, which is a regular vararg function with an
+** upvalue named LUA_ENV
+*/
+static void mainfunc (LexState *ls, FuncState *fs) {
+ BlockCnt bl;
+ expdesc v;
+ open_func(ls, fs, &bl);
+ fs->f->is_vararg = 1; /* main function is always vararg */
+ init_exp(&v, VLOCAL, 0); /* create and... */
+ newupvalue(fs, ls->envn, &v); /* ...set environment upvalue */
+ luaX_next(ls); /* read first token */
+ statlist(ls); /* parse main body */
+ check(ls, TK_EOS);
+ close_func(ls);
+}
+
+
+Closure *luaY_parser (lua_State *L, ZIO *z, Mbuffer *buff,
+ Dyndata *dyd, const char *name, int firstchar) {
+ LexState lexstate;
+ FuncState funcstate;
+ Closure *cl = luaF_newLclosure(L, 1); /* create main closure */
+ /* anchor closure (to avoid being collected) */
+ setclLvalue(L, L->top, cl);
+ incr_top(L);
+ funcstate.f = cl->l.p = luaF_newproto(L);
+ funcstate.f->source = luaS_new(L, name); /* create and anchor TString */
+ lexstate.buff = buff;
+ lexstate.dyd = dyd;
+ dyd->actvar.n = dyd->gt.n = dyd->label.n = 0;
+ luaX_setinput(L, &lexstate, z, funcstate.f->source, firstchar);
+ mainfunc(&lexstate, &funcstate);
+ lua_assert(!funcstate.prev && funcstate.nups == 1 && !lexstate.fs);
+ /* all scopes should be correctly finished */
+ lua_assert(dyd->actvar.n == 0 && dyd->gt.n == 0 && dyd->label.n == 0);
+ return cl; /* it's on the stack too */
+}
+
diff --git a/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/lua/lparser.h b/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/lua/lparser.h
new file mode 100644
index 000000000000..0346e3c41a80
--- /dev/null
+++ b/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/lua/lparser.h
@@ -0,0 +1,119 @@
+/*
+** $Id: lparser.h,v 1.70.1.1 2013/04/12 18:48:47 roberto Exp $
+** Lua Parser
+** See Copyright Notice in lua.h
+*/
+
+#ifndef lparser_h
+#define lparser_h
+
+#include "llimits.h"
+#include "lobject.h"
+#include "lzio.h"
+
+
+/*
+** Expression descriptor
+*/
+
+typedef enum {
+ VVOID, /* no value */
+ VNIL,
+ VTRUE,
+ VFALSE,
+ VK, /* info = index of constant in `k' */
+ VKNUM, /* nval = numerical value */
+ VNONRELOC, /* info = result register */
+ VLOCAL, /* info = local register */
+ VUPVAL, /* info = index of upvalue in 'upvalues' */
+ VINDEXED, /* t = table register/upvalue; idx = index R/K */
+ VJMP, /* info = instruction pc */
+ VRELOCABLE, /* info = instruction pc */
+ VCALL, /* info = instruction pc */
+ VVARARG /* info = instruction pc */
+} expkind;
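+
+/*
+** Illustrative note (not from upstream): VNONRELOC means the value is
+** already fixed in register 'info', while VRELOCABLE refers to an
+** instruction whose destination register (field A) has not been
+** assigned yet and can still be patched by the code generator.
+*/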
+
+
+#define vkisvar(k) (VLOCAL <= (k) && (k) <= VINDEXED)
+#define vkisinreg(k) ((k) == VNONRELOC || (k) == VLOCAL)
+
+typedef struct expdesc {
+ expkind k;
+ union {
+ struct { /* for indexed variables (VINDEXED) */
+ short idx; /* index (R/K) */
+ lu_byte t; /* table (register or upvalue) */
+ lu_byte vt; /* whether 't' is register (VLOCAL) or upvalue (VUPVAL) */
+ } ind;
+ int info; /* for generic use */
+ lua_Number nval; /* for VKNUM */
+ } u;
+ int t; /* patch list of `exit when true' */
+ int f; /* patch list of `exit when false' */
+} expdesc;
+
+
+/* description of active local variable */
+typedef struct Vardesc {
+ short idx; /* variable index in stack */
+} Vardesc;
+
+
+/* description of pending goto statements and label statements */
+typedef struct Labeldesc {
+ TString *name; /* label identifier */
+ int pc; /* position in code */
+ int line; /* line where it appeared */
+ lu_byte nactvar; /* local level where it appears in current block */
+} Labeldesc;
+
+
+/* list of labels or gotos */
+typedef struct Labellist {
+ Labeldesc *arr; /* array */
+ int n; /* number of entries in use */
+ int size; /* array size */
+} Labellist;
+
+
+/* dynamic structures used by the parser */
+typedef struct Dyndata {
+ struct { /* list of active local variables */
+ Vardesc *arr;
+ int n;
+ int size;
+ } actvar;
+ Labellist gt; /* list of pending gotos */
+ Labellist label; /* list of active labels */
+} Dyndata;
+
+
+/* control of blocks */
+struct BlockCnt; /* defined in lparser.c */
+
+
+/* state needed to generate code for a given function */
+typedef struct FuncState {
+ Proto *f; /* current function header */
+ Table *h; /* table to find (and reuse) elements in `k' */
+ struct FuncState *prev; /* enclosing function */
+ struct LexState *ls; /* lexical state */
+ struct BlockCnt *bl; /* chain of current blocks */
+ int pc; /* next position to code (equivalent to `ncode') */
+ int lasttarget; /* 'label' of last 'jump label' */
+ int jpc; /* list of pending jumps to `pc' */
+ int nk; /* number of elements in `k' */
+ int np; /* number of elements in `p' */
+ int firstlocal; /* index of first local var (in Dyndata array) */
+ short nlocvars; /* number of elements in 'f->locvars' */
+ lu_byte nactvar; /* number of active local variables */
+ lu_byte nups; /* number of upvalues */
+ lu_byte freereg; /* first free register */
+} FuncState;
+
+
+LUAI_FUNC Closure *luaY_parser (lua_State *L, ZIO *z, Mbuffer *buff,
+ Dyndata *dyd, const char *name, int firstchar);
+
+
+#endif
diff --git a/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/lua/lstate.c b/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/lua/lstate.c
new file mode 100644
index 000000000000..b98ce5c2b52b
--- /dev/null
+++ b/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/lua/lstate.c
@@ -0,0 +1,321 @@
+/*
+** $Id: lstate.c,v 2.99.1.2 2013/11/08 17:45:31 roberto Exp $
+** Global State
+** See Copyright Notice in lua.h
+*/
+
+
+#include <sys/zfs_context.h>
+
+#define lstate_c
+#define LUA_CORE
+
+#include "lua.h"
+
+#include "lapi.h"
+#include "ldebug.h"
+#include "ldo.h"
+#include "lfunc.h"
+#include "lgc.h"
+#include "llex.h"
+#include "lmem.h"
+#include "lstate.h"
+#include "lstring.h"
+#include "ltable.h"
+#include "ltm.h"
+
+
+#if !defined(LUAI_GCPAUSE)
+#define LUAI_GCPAUSE 200 /* 200% */
+#endif
+
+#if !defined(LUAI_GCMAJOR)
+#define LUAI_GCMAJOR 200 /* 200% */
+#endif
+
+#if !defined(LUAI_GCMUL)
+#define LUAI_GCMUL 200 /* GC runs 'twice the speed' of memory allocation */
+#endif
+
+
+#define MEMERRMSG "not enough memory"
+
+
+/*
+** a macro to help the creation of a unique random seed when a state is
+** created; the seed is used to randomize hashes.
+*/
+#if !defined(luai_makeseed)
+#define luai_makeseed() cast(unsigned int, gethrtime())
+#endif
+
+
+
+/*
+** thread state + extra space
+*/
+typedef struct LX {
+#if defined(LUAI_EXTRASPACE)
+ char buff[LUAI_EXTRASPACE];
+#endif
+ lua_State l;
+} LX;
+
+
+/*
+** Main thread combines a thread state and the global state
+*/
+typedef struct LG {
+ LX l;
+ global_State g;
+} LG;
+
+
+
+#define fromstate(L) (cast(LX *, cast(lu_byte *, (L)) - offsetof(LX, l)))
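+
+/*
+** Illustrative note (not from upstream): fromstate() recovers the LX
+** wrapper from a public lua_State pointer by subtracting the offset of
+** the embedded 'l' member, so 'fromstate(L)->buff' (when
+** LUAI_EXTRASPACE is defined) reaches the extra space laid out just
+** before every thread, and '&fromstate(L)->l' round-trips to L.
+*/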
+
+
+/*
+** Compute an initial seed as random as possible. In ANSI, rely on
+** Address Space Layout Randomization (if present) to increase
+** randomness.
+*/
+#define addbuff(b,p,e) \
+ { size_t t = cast(size_t, e); \
+ memcpy(buff + p, &t, sizeof(t)); p += sizeof(t); }
+
+static unsigned int makeseed (lua_State *L) {
+ char buff[4 * sizeof(size_t)];
+ unsigned int h = luai_makeseed();
+ int p = 0;
+ addbuff(buff, p, L); /* heap variable */
+ addbuff(buff, p, &h); /* local variable */
+ addbuff(buff, p, luaO_nilobject); /* global variable */
+ addbuff(buff, p, &lua_newstate); /* public function */
+ lua_assert(p == sizeof(buff));
+ return luaS_hash(buff, p, h);
+}
+
+
+/*
+** set GCdebt to a new value keeping the value (totalbytes + GCdebt)
+** invariant
+*/
+void luaE_setdebt (global_State *g, l_mem debt) {
+ g->totalbytes -= (debt - g->GCdebt);
+ g->GCdebt = debt;
+}
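+
+/*
+** Illustrative note (not from upstream): the true allocation count is
+** always gettotalbytes(g) == totalbytes + GCdebt; luaE_setdebt merely
+** shifts value between the two fields (e.g. setting the debt to 0
+** folds the current debt into totalbytes without changing the total).
+*/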
+
+
+CallInfo *luaE_extendCI (lua_State *L) {
+ CallInfo *ci = luaM_new(L, CallInfo);
+ lua_assert(L->ci->next == NULL);
+ L->ci->next = ci;
+ ci->previous = L->ci;
+ ci->next = NULL;
+ return ci;
+}
+
+
+void luaE_freeCI (lua_State *L) {
+ CallInfo *ci = L->ci;
+ CallInfo *next = ci->next;
+ ci->next = NULL;
+ while ((ci = next) != NULL) {
+ next = ci->next;
+ luaM_free(L, ci);
+ }
+}
+
+
+static void stack_init (lua_State *L1, lua_State *L) {
+ int i; CallInfo *ci;
+ /* initialize stack array */
+ L1->stack = luaM_newvector(L, BASIC_STACK_SIZE, TValue);
+ L1->stacksize = BASIC_STACK_SIZE;
+ for (i = 0; i < BASIC_STACK_SIZE; i++)
+ setnilvalue(L1->stack + i); /* erase new stack */
+ L1->top = L1->stack;
+ L1->stack_last = L1->stack + L1->stacksize - EXTRA_STACK;
+ /* initialize first ci */
+ ci = &L1->base_ci;
+ ci->next = ci->previous = NULL;
+ ci->callstatus = 0;
+ ci->func = L1->top;
+ setnilvalue(L1->top++); /* 'function' entry for this 'ci' */
+ ci->top = L1->top + LUA_MINSTACK;
+ L1->ci = ci;
+}
+
+
+static void freestack (lua_State *L) {
+ if (L->stack == NULL)
+ return; /* stack not completely built yet */
+ L->ci = &L->base_ci; /* free the entire 'ci' list */
+ luaE_freeCI(L);
+ luaM_freearray(L, L->stack, L->stacksize); /* free stack array */
+}
+
+
+/*
+** Create registry table and its predefined values
+*/
+static void init_registry (lua_State *L, global_State *g) {
+ TValue mt;
+ /* create registry */
+ Table *registry = luaH_new(L);
+ sethvalue(L, &g->l_registry, registry);
+ luaH_resize(L, registry, LUA_RIDX_LAST, 0);
+ /* registry[LUA_RIDX_MAINTHREAD] = L */
+ setthvalue(L, &mt, L);
+ luaH_setint(L, registry, LUA_RIDX_MAINTHREAD, &mt);
+ /* registry[LUA_RIDX_GLOBALS] = table of globals */
+ sethvalue(L, &mt, luaH_new(L));
+ luaH_setint(L, registry, LUA_RIDX_GLOBALS, &mt);
+}
+
+
+/*
+** open parts of the state that may cause memory-allocation errors
+*/
+static void f_luaopen (lua_State *L, void *ud) {
+ global_State *g = G(L);
+ UNUSED(ud);
+ stack_init(L, L); /* init stack */
+ init_registry(L, g);
+ luaS_resize(L, MINSTRTABSIZE); /* initial size of string table */
+ luaT_init(L);
+ luaX_init(L);
+ /* pre-create memory-error message */
+ g->memerrmsg = luaS_newliteral(L, MEMERRMSG);
+ luaS_fix(g->memerrmsg); /* it should never be collected */
+ g->gcrunning = 1; /* allow gc */
+ g->version = lua_version(NULL);
+ luai_userstateopen(L);
+}
+
+
+/*
+** preinitialize a state with consistent values without allocating
+** any memory (to avoid errors)
+*/
+static void preinit_state (lua_State *L, global_State *g) {
+ G(L) = g;
+ L->stack = NULL;
+ L->ci = NULL;
+ L->stacksize = 0;
+ L->errorJmp = NULL;
+ L->nCcalls = 0;
+ L->hook = NULL;
+ L->hookmask = 0;
+ L->basehookcount = 0;
+ L->allowhook = 1;
+ resethookcount(L);
+ L->openupval = NULL;
+ L->nny = 1;
+ L->status = LUA_OK;
+ L->errfunc = 0;
+}
+
+
+static void close_state (lua_State *L) {
+ global_State *g = G(L);
+ luaF_close(L, L->stack); /* close all upvalues for this thread */
+ luaC_freeallobjects(L); /* collect all objects */
+ if (g->version) /* closing a fully built state? */
+ luai_userstateclose(L);
+ luaM_freearray(L, G(L)->strt.hash, G(L)->strt.size);
+ luaZ_freebuffer(L, &g->buff);
+ freestack(L);
+ lua_assert(gettotalbytes(g) == sizeof(LG));
+ (*g->frealloc)(g->ud, fromstate(L), sizeof(LG), 0); /* free main block */
+}
+
+
+LUA_API lua_State *lua_newthread (lua_State *L) {
+ lua_State *L1;
+ lua_lock(L);
+ luaC_checkGC(L);
+ L1 = &luaC_newobj(L, LUA_TTHREAD, sizeof(LX), NULL, offsetof(LX, l))->th;
+ setthvalue(L, L->top, L1);
+ api_incr_top(L);
+ preinit_state(L1, G(L));
+ L1->hookmask = L->hookmask;
+ L1->basehookcount = L->basehookcount;
+ L1->hook = L->hook;
+ resethookcount(L1);
+ luai_userstatethread(L, L1);
+ stack_init(L1, L); /* init stack */
+ lua_unlock(L);
+ return L1;
+}
+
+
+void luaE_freethread (lua_State *L, lua_State *L1) {
+ LX *l = fromstate(L1);
+ luaF_close(L1, L1->stack); /* close all upvalues for this thread */
+ lua_assert(L1->openupval == NULL);
+ luai_userstatefree(L, L1);
+ freestack(L1);
+ luaM_free(L, l);
+}
+
+
+LUA_API lua_State *lua_newstate (lua_Alloc f, void *ud) {
+ int i;
+ lua_State *L;
+ global_State *g;
+ LG *l = cast(LG *, (*f)(ud, NULL, LUA_TTHREAD, sizeof(LG)));
+ if (l == NULL) return NULL;
+ L = &l->l.l;
+ g = &l->g;
+ L->next = NULL;
+ L->tt = LUA_TTHREAD;
+ g->currentwhite = bit2mask(WHITE0BIT, FIXEDBIT);
+ L->marked = luaC_white(g);
+ g->gckind = KGC_NORMAL;
+ preinit_state(L, g);
+ g->frealloc = f;
+ g->ud = ud;
+ g->mainthread = L;
+ g->seed = makeseed(L);
+ g->uvhead.u.l.prev = &g->uvhead;
+ g->uvhead.u.l.next = &g->uvhead;
+ g->gcrunning = 0; /* no GC while building state */
+ g->GCestimate = 0;
+ g->strt.size = 0;
+ g->strt.nuse = 0;
+ g->strt.hash = NULL;
+ setnilvalue(&g->l_registry);
+ luaZ_initbuffer(L, &g->buff);
+ g->panic = NULL;
+ g->version = NULL;
+ g->gcstate = GCSpause;
+ g->allgc = NULL;
+ g->finobj = NULL;
+ g->tobefnz = NULL;
+ g->sweepgc = g->sweepfin = NULL;
+ g->gray = g->grayagain = NULL;
+ g->weak = g->ephemeron = g->allweak = NULL;
+ g->totalbytes = sizeof(LG);
+ g->GCdebt = 0;
+ g->gcpause = LUAI_GCPAUSE;
+ g->gcmajorinc = LUAI_GCMAJOR;
+ g->gcstepmul = LUAI_GCMUL;
+ for (i=0; i < LUA_NUMTAGS; i++) g->mt[i] = NULL;
+ if (luaD_rawrunprotected(L, f_luaopen, NULL) != LUA_OK) {
+ /* memory allocation error: free partial state */
+ close_state(L);
+ L = NULL;
+ }
+ return L;
+}
+
+
+LUA_API void lua_close (lua_State *L) {
+ L = G(L)->mainthread; /* only the main thread can be closed */
+ lua_lock(L);
+ close_state(L);
+}
+
+
diff --git a/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/lua/lstate.h b/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/lua/lstate.h
new file mode 100644
index 000000000000..daffd9aacfbb
--- /dev/null
+++ b/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/lua/lstate.h
@@ -0,0 +1,228 @@
+/*
+** $Id: lstate.h,v 2.82.1.1 2013/04/12 18:48:47 roberto Exp $
+** Global State
+** See Copyright Notice in lua.h
+*/
+
+#ifndef lstate_h
+#define lstate_h
+
+#include "lua.h"
+
+#include "lobject.h"
+#include "ltm.h"
+#include "lzio.h"
+
+
+/*
+
+** Some notes about garbage-collected objects: All objects in Lua must
+** be kept somehow accessible until being freed.
+**
+** Lua keeps most objects linked in list g->allgc. The link uses field
+** 'next' of the CommonHeader.
+**
+** Strings are kept in several lists headed by the array g->strt.hash.
+**
+** Open upvalues are not subject to independent garbage collection. They
+** are collected together with their respective threads. Lua keeps a
+** double-linked list with all open upvalues (g->uvhead) so that it can
+** mark objects referred by them. (They are always gray, so they must
+** be remarked in the atomic step. Usually their contents would be marked
+** when traversing the respective threads, but the thread may already be
+** dead, while the upvalue is still accessible through closures.)
+**
+** Objects with finalizers are kept in the list g->finobj.
+**
+** The list g->tobefnz links all objects being finalized.
+
+*/
+
+
+struct lua_longjmp; /* defined in ldo.c */
+
+
+
+/* extra stack space to handle TM calls and some other extras */
+#define EXTRA_STACK 5
+
+
+#define BASIC_STACK_SIZE (2*LUA_MINSTACK)
+
+
+/* kinds of Garbage Collection */
+#define KGC_NORMAL 0
+#define KGC_EMERGENCY 1 /* gc was forced by an allocation failure */
+#define KGC_GEN 2 /* generational collection */
+
+
+typedef struct stringtable {
+ GCObject **hash;
+ lu_int32 nuse; /* number of elements */
+ int size;
+} stringtable;
+
+
+/*
+** information about a call
+*/
+typedef struct CallInfo {
+ StkId func; /* function index in the stack */
+ StkId top; /* top for this function */
+ struct CallInfo *previous, *next; /* dynamic call link */
+ short nresults; /* expected number of results from this function */
+ lu_byte callstatus;
+ ptrdiff_t extra;
+ union {
+ struct { /* only for Lua functions */
+ StkId base; /* base for this function */
+ const Instruction *savedpc;
+ } l;
+ struct { /* only for C functions */
+ int ctx; /* context info. in case of yields */
+ lua_CFunction k; /* continuation in case of yields */
+ ptrdiff_t old_errfunc;
+ lu_byte old_allowhook;
+ lu_byte status;
+ } c;
+ } u;
+} CallInfo;
+
+
+/*
+** Bits in CallInfo status
+*/
+#define CIST_LUA (1<<0) /* call is running a Lua function */
+#define CIST_HOOKED (1<<1) /* call is running a debug hook */
+#define CIST_REENTRY (1<<2) /* call is running on same invocation of
+ luaV_execute of previous call */
+#define CIST_YIELDED (1<<3) /* call reentered after suspension */
+#define CIST_YPCALL (1<<4) /* call is a yieldable protected call */
+#define CIST_STAT (1<<5) /* call has an error status (pcall) */
+#define CIST_TAIL (1<<6) /* call was tail called */
+#define CIST_HOOKYIELD (1<<7) /* last hook called yielded */
+
+
+#define isLua(ci) ((ci)->callstatus & CIST_LUA)
+
+
+/*
+** `global state', shared by all threads of this state
+*/
+typedef struct global_State {
+ lua_Alloc frealloc; /* function to reallocate memory */
+ void *ud; /* auxiliary data to `frealloc' */
+ lu_mem totalbytes; /* number of bytes currently allocated - GCdebt */
+ l_mem GCdebt; /* bytes allocated not yet compensated by the collector */
+ lu_mem GCmemtrav; /* memory traversed by the GC */
+ lu_mem GCestimate; /* an estimate of the non-garbage memory in use */
+ stringtable strt; /* hash table for strings */
+ TValue l_registry;
+ unsigned int seed; /* randomized seed for hashes */
+ lu_byte currentwhite;
+ lu_byte gcstate; /* state of garbage collector */
+ lu_byte gckind; /* kind of GC running */
+ lu_byte gcrunning; /* true if GC is running */
+ int sweepstrgc; /* position of sweep in `strt' */
+ GCObject *allgc; /* list of all collectable objects */
+ GCObject *finobj; /* list of collectable objects with finalizers */
+ GCObject **sweepgc; /* current position of sweep in list 'allgc' */
+ GCObject **sweepfin; /* current position of sweep in list 'finobj' */
+ GCObject *gray; /* list of gray objects */
+ GCObject *grayagain; /* list of objects to be traversed atomically */
+ GCObject *weak; /* list of tables with weak values */
+ GCObject *ephemeron; /* list of ephemeron tables (weak keys) */
+ GCObject *allweak; /* list of all-weak tables */
+ GCObject *tobefnz; /* list of userdata to be GC */
+ UpVal uvhead; /* head of double-linked list of all open upvalues */
+ Mbuffer buff; /* temporary buffer for string concatenation */
+ int gcpause; /* size of pause between successive GCs */
+ int gcmajorinc; /* pause between major collections (only in gen. mode) */
+ int gcstepmul; /* GC `granularity' */
+ lua_CFunction panic; /* to be called in unprotected errors */
+ struct lua_State *mainthread;
+ const lua_Number *version; /* pointer to version number */
+ TString *memerrmsg; /* memory-error message */
+ TString *tmname[TM_N]; /* array with tag-method names */
+ struct Table *mt[LUA_NUMTAGS]; /* metatables for basic types */
+} global_State;
+
+
+/*
+** `per thread' state
+*/
+struct lua_State {
+ CommonHeader;
+ lu_byte status;
+ StkId top; /* first free slot in the stack */
+ global_State *l_G;
+ CallInfo *ci; /* call info for current function */
+ const Instruction *oldpc; /* last pc traced */
+ StkId stack_last; /* last free slot in the stack */
+ StkId stack; /* stack base */
+ int stacksize;
+ unsigned short nny; /* number of non-yieldable calls in stack */
+ unsigned short nCcalls; /* number of nested C calls */
+ lu_byte hookmask;
+ lu_byte allowhook;
+ int basehookcount;
+ int hookcount;
+ lua_Hook hook;
+ GCObject *openupval; /* list of open upvalues in this stack */
+ GCObject *gclist;
+ struct lua_longjmp *errorJmp; /* current error recover point */
+ ptrdiff_t errfunc; /* current error handling function (stack index) */
+ CallInfo base_ci; /* CallInfo for first level (C calling Lua) */
+};
+
+
+#define G(L) (L->l_G)
+
+
+/*
+** Union of all collectable objects
+*/
+union GCObject {
+ GCheader gch; /* common header */
+ union TString ts;
+ union Udata u;
+ union Closure cl;
+ struct Table h;
+ struct Proto p;
+ struct UpVal uv;
+ struct lua_State th; /* thread */
+};
+
+
+#define gch(o) (&(o)->gch)
+
+/* macros to convert a GCObject into a specific value */
+#define rawgco2ts(o) \
+ check_exp(novariant((o)->gch.tt) == LUA_TSTRING, &((o)->ts))
+#define gco2ts(o) (&rawgco2ts(o)->tsv)
+#define rawgco2u(o) check_exp((o)->gch.tt == LUA_TUSERDATA, &((o)->u))
+#define gco2u(o) (&rawgco2u(o)->uv)
+#define gco2lcl(o) check_exp((o)->gch.tt == LUA_TLCL, &((o)->cl.l))
+#define gco2ccl(o) check_exp((o)->gch.tt == LUA_TCCL, &((o)->cl.c))
+#define gco2cl(o) \
+ check_exp(novariant((o)->gch.tt) == LUA_TFUNCTION, &((o)->cl))
+#define gco2t(o) check_exp((o)->gch.tt == LUA_TTABLE, &((o)->h))
+#define gco2p(o) check_exp((o)->gch.tt == LUA_TPROTO, &((o)->p))
+#define gco2uv(o) check_exp((o)->gch.tt == LUA_TUPVAL, &((o)->uv))
+#define gco2th(o) check_exp((o)->gch.tt == LUA_TTHREAD, &((o)->th))
+
+/* macro to convert any Lua object into a GCObject */
+#define obj2gco(v) (cast(GCObject *, (v)))
+
+
+/* actual number of total bytes allocated */
+#define gettotalbytes(g) ((g)->totalbytes + (g)->GCdebt)
+
+LUAI_FUNC void luaE_setdebt (global_State *g, l_mem debt);
+LUAI_FUNC void luaE_freethread (lua_State *L, lua_State *L1);
+LUAI_FUNC CallInfo *luaE_extendCI (lua_State *L);
+LUAI_FUNC void luaE_freeCI (lua_State *L);
+
+
+#endif
+
diff --git a/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/lua/lstring.c b/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/lua/lstring.c
new file mode 100644
index 000000000000..e20ab04b12de
--- /dev/null
+++ b/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/lua/lstring.c
@@ -0,0 +1,185 @@
+/*
+** $Id: lstring.c,v 2.26.1.1 2013/04/12 18:48:47 roberto Exp $
+** String table (keeps all strings handled by Lua)
+** See Copyright Notice in lua.h
+*/
+
+
+#include <sys/zfs_context.h>
+
+#define lstring_c
+#define LUA_CORE
+
+#include "lua.h"
+
+#include "lmem.h"
+#include "lobject.h"
+#include "lstate.h"
+#include "lstring.h"
+
+
+/*
+** Lua will use at most ~(2^LUAI_HASHLIMIT) bytes from a string to
+** compute its hash
+*/
+#if !defined(LUAI_HASHLIMIT)
+#define LUAI_HASHLIMIT 5
+#endif
+
+
+/*
+** equality for long strings
+*/
+int luaS_eqlngstr (TString *a, TString *b) {
+ size_t len = a->tsv.len;
+ lua_assert(a->tsv.tt == LUA_TLNGSTR && b->tsv.tt == LUA_TLNGSTR);
+ return (a == b) || /* same instance or... */
+ ((len == b->tsv.len) && /* equal length and ... */
+ (memcmp(getstr(a), getstr(b), len) == 0)); /* equal contents */
+}
+
+
+/*
+** equality for strings
+*/
+int luaS_eqstr (TString *a, TString *b) {
+ return (a->tsv.tt == b->tsv.tt) &&
+ (a->tsv.tt == LUA_TSHRSTR ? eqshrstr(a, b) : luaS_eqlngstr(a, b));
+}
+
+
+unsigned int luaS_hash (const char *str, size_t l, unsigned int seed) {
+ unsigned int h = seed ^ cast(unsigned int, l);
+ size_t l1;
+ size_t step = (l >> LUAI_HASHLIMIT) + 1;
+ for (l1 = l; l1 >= step; l1 -= step)
+ h = h ^ ((h<<5) + (h>>2) + cast_byte(str[l1 - 1]));
+ return h;
+}
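+
+/*
+** Worked example (annotation, not part of the upstream sources): with
+** LUAI_HASHLIMIT == 5 and l == 1000, the loop above uses
+** step = (1000 >> 5) + 1 == 32 and therefore mixes in only about 31 of
+** the 1000 bytes (positions 999, 967, ...), so hashing stays cheap even
+** for very long strings, at the cost of more collisions among strings
+** that differ only in the skipped bytes.
+*/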
+
+
+/*
+** resizes the string table
+*/
+void luaS_resize (lua_State *L, int newsize) {
+ int i;
+ stringtable *tb = &G(L)->strt;
+ /* cannot resize while GC is traversing strings */
+ luaC_runtilstate(L, ~bitmask(GCSsweepstring));
+ if (newsize > tb->size) {
+ luaM_reallocvector(L, tb->hash, tb->size, newsize, GCObject *);
+ for (i = tb->size; i < newsize; i++) tb->hash[i] = NULL;
+ }
+ /* rehash */
+ for (i=0; i<tb->size; i++) {
+ GCObject *p = tb->hash[i];
+ tb->hash[i] = NULL;
+ while (p) { /* for each node in the list */
+ GCObject *next = gch(p)->next; /* save next */
+ unsigned int h = lmod(gco2ts(p)->hash, newsize); /* new position */
+ gch(p)->next = tb->hash[h]; /* chain it */
+ tb->hash[h] = p;
+ resetoldbit(p); /* see MOVE OLD rule */
+ p = next;
+ }
+ }
+ if (newsize < tb->size) {
+ /* shrinking slice must be empty */
+ lua_assert(tb->hash[newsize] == NULL && tb->hash[tb->size - 1] == NULL);
+ luaM_reallocvector(L, tb->hash, tb->size, newsize, GCObject *);
+ }
+ tb->size = newsize;
+}
+
+
+/*
+** creates a new string object
+*/
+static TString *createstrobj (lua_State *L, const char *str, size_t l,
+ int tag, unsigned int h, GCObject **list) {
+ TString *ts;
+ size_t totalsize; /* total size of TString object */
+ totalsize = sizeof(TString) + ((l + 1) * sizeof(char));
+ ts = &luaC_newobj(L, tag, totalsize, list, 0)->ts;
+ ts->tsv.len = l;
+ ts->tsv.hash = h;
+ ts->tsv.extra = 0;
+ memcpy(ts+1, str, l*sizeof(char));
+ ((char *)(ts+1))[l] = '\0'; /* ending 0 */
+ return ts;
+}
+
+
+/*
+** creates a new short string, inserting it into the string table
+*/
+static TString *newshrstr (lua_State *L, const char *str, size_t l,
+ unsigned int h) {
+ GCObject **list; /* (pointer to) list where it will be inserted */
+ stringtable *tb = &G(L)->strt;
+ TString *s;
+ if (tb->nuse >= cast(lu_int32, tb->size) && tb->size <= MAX_INT/2)
+ luaS_resize(L, tb->size*2); /* too crowded */
+ list = &tb->hash[lmod(h, tb->size)];
+ s = createstrobj(L, str, l, LUA_TSHRSTR, h, list);
+ tb->nuse++;
+ return s;
+}
+
+
+/*
+** checks whether short string exists and reuses it or creates a new one
+*/
+static TString *internshrstr (lua_State *L, const char *str, size_t l) {
+ GCObject *o;
+ global_State *g = G(L);
+ unsigned int h = luaS_hash(str, l, g->seed);
+ for (o = g->strt.hash[lmod(h, g->strt.size)];
+ o != NULL;
+ o = gch(o)->next) {
+ TString *ts = rawgco2ts(o);
+ if (h == ts->tsv.hash &&
+ l == ts->tsv.len &&
+ (memcmp(str, getstr(ts), l * sizeof(char)) == 0)) {
+ if (isdead(G(L), o)) /* string is dead (but was not collected yet)? */
+ changewhite(o); /* resurrect it */
+ return ts;
+ }
+ }
+ return newshrstr(L, str, l, h); /* not found; create a new string */
+}
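+
+/*
+** Illustration (annotation, not part of the upstream sources; excluded
+** from the build with #if 0): because every live short string is
+** interned above, building the same byte sequence twice yields the same
+** object, which is what makes the pointer comparison in eqshrstr()
+** sound.
+*/
+#if 0
+static void intern_demo (lua_State *L) {
+  TString *a = luaS_newlstr(L, "zfs", 3);
+  TString *b = luaS_newlstr(L, "zfs", 3);  /* found in 'strt'; reused */
+  lua_assert(a == b);  /* same object, so equality is a pointer test */
+}
+#endif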
+
+
+/*
+** new string (with explicit length)
+*/
+TString *luaS_newlstr (lua_State *L, const char *str, size_t l) {
+ if (l <= LUAI_MAXSHORTLEN) /* short string? */
+ return internshrstr(L, str, l);
+ else {
+ if (l + 1 > (MAX_SIZET - sizeof(TString))/sizeof(char))
+ luaM_toobig(L);
+ return createstrobj(L, str, l, LUA_TLNGSTR, G(L)->seed, NULL);
+ }
+}
+
+
+/*
+** new zero-terminated string
+*/
+TString *luaS_new (lua_State *L, const char *str) {
+ return luaS_newlstr(L, str, strlen(str));
+}
+
+
+Udata *luaS_newudata (lua_State *L, size_t s, Table *e) {
+ Udata *u;
+ if (s > MAX_SIZET - sizeof(Udata))
+ luaM_toobig(L);
+ u = &luaC_newobj(L, LUA_TUSERDATA, sizeof(Udata) + s, NULL, 0)->u;
+ u->uv.len = s;
+ u->uv.metatable = NULL;
+ u->uv.env = e;
+ return u;
+}
+
diff --git a/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/lua/lstring.h b/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/lua/lstring.h
new file mode 100644
index 000000000000..260e7f169bd0
--- /dev/null
+++ b/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/lua/lstring.h
@@ -0,0 +1,46 @@
+/*
+** $Id: lstring.h,v 1.49.1.1 2013/04/12 18:48:47 roberto Exp $
+** String table (keeps all strings handled by Lua)
+** See Copyright Notice in lua.h
+*/
+
+#ifndef lstring_h
+#define lstring_h
+
+#include "lgc.h"
+#include "lobject.h"
+#include "lstate.h"
+
+
+#define sizestring(s) (sizeof(union TString)+((s)->len+1)*sizeof(char))
+
+#define sizeudata(u) (sizeof(union Udata)+(u)->len)
+
+#define luaS_newliteral(L, s) (luaS_newlstr(L, "" s, \
+ (sizeof(s)/sizeof(char))-1))
+
+#define luaS_fix(s) l_setbit((s)->tsv.marked, FIXEDBIT)
+
+
+/*
+** test whether a string is a reserved word
+*/
+#define isreserved(s) ((s)->tsv.tt == LUA_TSHRSTR && (s)->tsv.extra > 0)
+
+
+/*
+** equality for short strings, which are always internalized
+*/
+#define eqshrstr(a,b) check_exp((a)->tsv.tt == LUA_TSHRSTR, (a) == (b))
+
+
+LUAI_FUNC unsigned int luaS_hash (const char *str, size_t l, unsigned int seed);
+LUAI_FUNC int luaS_eqlngstr (TString *a, TString *b);
+LUAI_FUNC int luaS_eqstr (TString *a, TString *b);
+LUAI_FUNC void luaS_resize (lua_State *L, int newsize);
+LUAI_FUNC Udata *luaS_newudata (lua_State *L, size_t s, Table *e);
+LUAI_FUNC TString *luaS_newlstr (lua_State *L, const char *str, size_t l);
+LUAI_FUNC TString *luaS_new (lua_State *L, const char *str);
+
+
+#endif
diff --git a/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/lua/lstrlib.c b/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/lua/lstrlib.c
new file mode 100644
index 000000000000..589752d3690e
--- /dev/null
+++ b/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/lua/lstrlib.c
@@ -0,0 +1,1050 @@
+/*
+** $Id: lstrlib.c,v 1.178.1.1 2013/04/12 18:48:47 roberto Exp $
+** Standard library for string operations and pattern-matching
+** See Copyright Notice in lua.h
+*/
+
+
+#include <sys/ctype.h>
+#include <sys/zfs_context.h>
+
+#define lstrlib_c
+#define LUA_LIB
+
+#include "lua.h"
+
+#include "lauxlib.h"
+#include "lualib.h"
+
+
+/*
+** maximum number of captures that a pattern can do during
+** pattern-matching. This limit is arbitrary.
+*/
+#if !defined(LUA_MAXCAPTURES)
+#define LUA_MAXCAPTURES 32
+#endif
+
+
+/* macro to `unsign' a character */
+#define uchar(c) ((unsigned char)(c))
+
+/*
+ * PATCHED: add missing character macros.
+ */
+#ifdef illumos
+#define tolower(C) (((C) >= 'A' && (C) <= 'Z') ? (C) - 'A' + 'a' : (C))
+#define toupper(C) (((C) >= 'a' && (C) <= 'z') ? (C) - 'a' + 'A': (C))
+#define iscntrl(C) ((((C) >= 0) && ((C) <= 0x1f)) || ((C) == 0x7f))
+#else
+#define isalnum(C) (isalpha(C) || isdigit(C))
+#define iscntrl(C) (uchar(C) <= 0x1f || uchar(C) == 0x7f)
+#endif
+#define isgraph(C) ((C) >= 0x21 && (C) <= 0x7E)
+#define ispunct(C) (((C) >= 0x21 && (C) <= 0x2F) || \
+ ((C) >= 0x3A && (C) <= 0x40) || \
+ ((C) >= 0x5B && (C) <= 0x60) || \
+ ((C) >= 0x7B && (C) <= 0x7E))
+
+/*
+ * The provided version of sprintf returns a char *, but str_format expects
+ * it to return the number of characters printed. This version has the expected
+ * behavior.
+ */
+static size_t str_sprintf(char *buf, const char *fmt, ...) {
+ va_list args;
+ size_t len;
+
+ va_start(args, fmt);
+ len = vsnprintf(buf, INT_MAX, fmt, args);
+ va_end(args);
+
+ return len;
+}
+
+
+static int str_len (lua_State *L) {
+ size_t l;
+ luaL_checklstring(L, 1, &l);
+ lua_pushinteger(L, (lua_Integer)l);
+ return 1;
+}
+
+
+/* translate a relative string position: negative means back from end */
+static size_t posrelat (ptrdiff_t pos, size_t len) {
+ if (pos >= 0) return (size_t)pos;
+ else if (0u - (size_t)pos > len) return 0;
+ else return len - ((size_t)-pos) + 1;
+}
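+
+/*
+** Examples (annotation, not part of the upstream sources): for a string
+** of length 5, posrelat(3, 5) == 3, posrelat(-1, 5) == 5 (the last
+** byte), and posrelat(-9, 5) == 0 (clamped before the start).
+*/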
+
+
+static int str_sub (lua_State *L) {
+ size_t l;
+ const char *s = luaL_checklstring(L, 1, &l);
+ size_t start = posrelat(luaL_checkinteger(L, 2), l);
+ size_t end = posrelat(luaL_optinteger(L, 3, -1), l);
+ if (start < 1) start = 1;
+ if (end > l) end = l;
+ if (start <= end)
+ lua_pushlstring(L, s + start - 1, end - start + 1);
+ else lua_pushliteral(L, "");
+ return 1;
+}
+
+
+static int str_reverse (lua_State *L) {
+ size_t l, i;
+ luaL_Buffer b;
+ const char *s = luaL_checklstring(L, 1, &l);
+ char *p = luaL_buffinitsize(L, &b, l);
+ for (i = 0; i < l; i++)
+ p[i] = s[l - i - 1];
+ luaL_pushresultsize(&b, l);
+ return 1;
+}
+
+
+static int str_lower (lua_State *L) {
+ size_t l;
+ size_t i;
+ luaL_Buffer b;
+ const char *s = luaL_checklstring(L, 1, &l);
+ char *p = luaL_buffinitsize(L, &b, l);
+ for (i=0; i<l; i++)
+ p[i] = tolower(uchar(s[i]));
+ luaL_pushresultsize(&b, l);
+ return 1;
+}
+
+
+static int str_upper (lua_State *L) {
+ size_t l;
+ size_t i;
+ luaL_Buffer b;
+ const char *s = luaL_checklstring(L, 1, &l);
+ char *p = luaL_buffinitsize(L, &b, l);
+ for (i=0; i<l; i++)
+ p[i] = toupper(uchar(s[i]));
+ luaL_pushresultsize(&b, l);
+ return 1;
+}
+
+
+/* reasonable limit to avoid arithmetic overflow */
+#define MAXSIZE ((~(size_t)0) >> 1)
+
+static int str_rep (lua_State *L) {
+ size_t l, lsep;
+ const char *s = luaL_checklstring(L, 1, &l);
+ int n = luaL_checkint(L, 2);
+ const char *sep = luaL_optlstring(L, 3, "", &lsep);
+ if (n <= 0) lua_pushliteral(L, "");
+ else if (l + lsep < l || l + lsep >= MAXSIZE / n) /* may overflow? */
+ return luaL_error(L, "resulting string too large");
+ else {
+ size_t totallen = n * l + (n - 1) * lsep;
+ luaL_Buffer b;
+ char *p = luaL_buffinitsize(L, &b, totallen);
+ while (n-- > 1) { /* first n-1 copies (followed by separator) */
+ memcpy(p, s, l * sizeof(char)); p += l;
+ if (lsep > 0) { /* avoid empty 'memcpy' (may be expensive) */
+ memcpy(p, sep, lsep * sizeof(char)); p += lsep;
+ }
+ }
+ memcpy(p, s, l * sizeof(char)); /* last copy (not followed by separator) */
+ luaL_pushresultsize(&b, totallen);
+ }
+ return 1;
+}
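+
+/*
+** Worked example (annotation, not part of the upstream sources): the
+** Lua call string.rep("ab", 3, "-") reaches this function with l == 2,
+** n == 3 and lsep == 1, so totallen == 3*2 + 2*1 == 8 and the result is
+** "ab-ab-ab"; the guard above rejects any n for which that total could
+** overflow.
+*/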
+
+
+static int str_byte (lua_State *L) {
+ size_t l;
+ const char *s = luaL_checklstring(L, 1, &l);
+ size_t posi = posrelat(luaL_optinteger(L, 2, 1), l);
+ size_t pose = posrelat(luaL_optinteger(L, 3, posi), l);
+ int n, i;
+ if (posi < 1) posi = 1;
+ if (pose > l) pose = l;
+ if (posi > pose) return 0; /* empty interval; return no values */
+ n = (int)(pose - posi + 1);
+ if (posi + n <= pose) /* (size_t -> int) overflow? */
+ return luaL_error(L, "string slice too long");
+ luaL_checkstack(L, n, "string slice too long");
+ for (i=0; i<n; i++)
+ lua_pushinteger(L, uchar(s[posi+i-1]));
+ return n;
+}
+
+
+static int str_char (lua_State *L) {
+ int n = lua_gettop(L); /* number of arguments */
+ int i;
+ luaL_Buffer b;
+ char *p = luaL_buffinitsize(L, &b, n);
+ for (i=1; i<=n; i++) {
+ int c = luaL_checkint(L, i);
+ luaL_argcheck(L, uchar(c) == c, i, "value out of range");
+ p[i - 1] = uchar(c);
+ }
+ luaL_pushresultsize(&b, n);
+ return 1;
+}
+
+
+static int writer (lua_State *L, const void* b, size_t size, void* B) {
+ (void)L;
+ luaL_addlstring((luaL_Buffer*) B, (const char *)b, size);
+ return 0;
+}
+
+
+static int str_dump (lua_State *L) {
+ luaL_Buffer b;
+ luaL_checktype(L, 1, LUA_TFUNCTION);
+ lua_settop(L, 1);
+ luaL_buffinit(L,&b);
+ if (lua_dump(L, writer, &b) != 0)
+ return luaL_error(L, "unable to dump given function");
+ luaL_pushresult(&b);
+ return 1;
+}
+
+
+
+/*
+** {======================================================
+** PATTERN MATCHING
+** =======================================================
+*/
+
+
+#define CAP_UNFINISHED (-1)
+#define CAP_POSITION (-2)
+
+
+typedef struct MatchState {
+ int matchdepth; /* control for recursive depth (to avoid C stack overflow) */
+ const char *src_init; /* init of source string */
+ const char *src_end; /* end ('\0') of source string */
+ const char *p_end; /* end ('\0') of pattern */
+ lua_State *L;
+ int level; /* total number of captures (finished or unfinished) */
+ struct {
+ const char *init;
+ ptrdiff_t len;
+ } capture[LUA_MAXCAPTURES];
+} MatchState;
+
+
+/* recursive function */
+static const char *match (MatchState *ms, const char *s, const char *p);
+
+
+/* maximum recursion depth for 'match' */
+#if !defined(MAXCCALLS)
+#define MAXCCALLS 200
+#endif
+
+
+#define L_ESC '%'
+#define SPECIALS "^$*+?.([%-"
+
+
+static int check_capture (MatchState *ms, int l) {
+ l -= '1';
+ if (l < 0 || l >= ms->level || ms->capture[l].len == CAP_UNFINISHED)
+ return luaL_error(ms->L, "invalid capture index %%%d", l + 1);
+ return l;
+}
+
+
+static int capture_to_close (MatchState *ms) {
+ int level = ms->level;
+ for (level--; level>=0; level--)
+ if (ms->capture[level].len == CAP_UNFINISHED) return level;
+ return luaL_error(ms->L, "invalid pattern capture");
+}
+
+
+static const char *classend (MatchState *ms, const char *p) {
+ switch (*p++) {
+ case L_ESC: {
+ if (p == ms->p_end)
+ luaL_error(ms->L, "malformed pattern (ends with " LUA_QL("%%") ")");
+ return p+1;
+ }
+ case '[': {
+ if (*p == '^') p++;
+ do { /* look for a `]' */
+ if (p == ms->p_end)
+ luaL_error(ms->L, "malformed pattern (missing " LUA_QL("]") ")");
+ if (*(p++) == L_ESC && p < ms->p_end)
+ p++; /* skip escapes (e.g. `%]') */
+ } while (*p != ']');
+ return p+1;
+ }
+ default: {
+ return p;
+ }
+ }
+}
+
+
+static int match_class (int c, int cl) {
+ int res;
+ switch (tolower(cl)) {
+ case 'a' : res = isalpha(c); break;
+ case 'c' : res = iscntrl(c); break;
+ case 'd' : res = isdigit(c); break;
+ case 'g' : res = isgraph(c); break;
+ case 'l' : res = islower(c); break;
+ case 'p' : res = ispunct(c); break;
+ case 's' : res = isspace(c); break;
+ case 'u' : res = isupper(c); break;
+ case 'w' : res = isalnum(c); break;
+ case 'x' : res = isxdigit(c); break;
+ case 'z' : res = (c == 0); break; /* deprecated option */
+ default: return (cl == c);
+ }
+ return (islower(cl) ? res : !res);
+}
+
+
+static int matchbracketclass (int c, const char *p, const char *ec) {
+ int sig = 1;
+ if (*(p+1) == '^') {
+ sig = 0;
+ p++; /* skip the `^' */
+ }
+ while (++p < ec) {
+ if (*p == L_ESC) {
+ p++;
+ if (match_class(c, uchar(*p)))
+ return sig;
+ }
+ else if ((*(p+1) == '-') && (p+2 < ec)) {
+ p+=2;
+ if (uchar(*(p-2)) <= c && c <= uchar(*p))
+ return sig;
+ }
+ else if (uchar(*p) == c) return sig;
+ }
+ return !sig;
+}
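+
+/*
+** Example (annotation, not part of the upstream sources): for the class
+** "[^%s%d]", p starts at the '[' and ec is the closing ']'; the '^'
+** flips sig to 0, so a space or digit returns sig (no match) and every
+** other byte falls through to return !sig (match).
+*/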
+
+
+static int singlematch (MatchState *ms, const char *s, const char *p,
+ const char *ep) {
+ if (s >= ms->src_end)
+ return 0;
+ else {
+ int c = uchar(*s);
+ switch (*p) {
+ case '.': return 1; /* matches any char */
+ case L_ESC: return match_class(c, uchar(*(p+1)));
+ case '[': return matchbracketclass(c, p, ep-1);
+ default: return (uchar(*p) == c);
+ }
+ }
+}
+
+
+static const char *matchbalance (MatchState *ms, const char *s,
+ const char *p) {
+ if (p >= ms->p_end - 1)
+ luaL_error(ms->L, "malformed pattern "
+ "(missing arguments to " LUA_QL("%%b") ")");
+ if (*s != *p) return NULL;
+ else {
+ int b = *p;
+ int e = *(p+1);
+ int cont = 1;
+ while (++s < ms->src_end) {
+ if (*s == e) {
+ if (--cont == 0) return s+1;
+ }
+ else if (*s == b) cont++;
+ }
+ }
+ return NULL; /* string ends out of balance */
+}
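+
+/*
+** Example (annotation, not part of the upstream sources): matching
+** "%b()" against "(a(b)c)d" starts at the first '(' with cont == 1;
+** cont rises to 2 at the inner '(' and drops at each ')', and the match
+** returns just past the ')' that brings cont back to 0, capturing
+** "(a(b)c)".
+*/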
+
+
+static const char *max_expand (MatchState *ms, const char *s,
+ const char *p, const char *ep) {
+ ptrdiff_t i = 0; /* counts maximum expand for item */
+ while (singlematch(ms, s + i, p, ep))
+ i++;
+ /* keeps trying to match with the maximum repetitions */
+ while (i>=0) {
+ const char *res = match(ms, (s+i), ep+1);
+ if (res) return res;
+ i--; /* else didn't match; reduce 1 repetition to try again */
+ }
+ return NULL;
+}
+
+
+static const char *min_expand (MatchState *ms, const char *s,
+ const char *p, const char *ep) {
+ for (;;) {
+ const char *res = match(ms, s, ep+1);
+ if (res != NULL)
+ return res;
+ else if (singlematch(ms, s, p, ep))
+ s++; /* try with one more repetition */
+ else return NULL;
+ }
+}
+
+
+static const char *start_capture (MatchState *ms, const char *s,
+ const char *p, int what) {
+ const char *res;
+ int level = ms->level;
+ if (level >= LUA_MAXCAPTURES) luaL_error(ms->L, "too many captures");
+ ms->capture[level].init = s;
+ ms->capture[level].len = what;
+ ms->level = level+1;
+ if ((res=match(ms, s, p)) == NULL) /* match failed? */
+ ms->level--; /* undo capture */
+ return res;
+}
+
+
+static const char *end_capture (MatchState *ms, const char *s,
+ const char *p) {
+ int l = capture_to_close(ms);
+ const char *res;
+ ms->capture[l].len = s - ms->capture[l].init; /* close capture */
+ if ((res = match(ms, s, p)) == NULL) /* match failed? */
+ ms->capture[l].len = CAP_UNFINISHED; /* undo capture */
+ return res;
+}
+
+
+static const char *match_capture (MatchState *ms, const char *s, int l) {
+ size_t len;
+ l = check_capture(ms, l);
+ len = ms->capture[l].len;
+ if ((size_t)(ms->src_end-s) >= len &&
+ memcmp(ms->capture[l].init, s, len) == 0)
+ return s+len;
+ else return NULL;
+}
+
+
+static const char *match (MatchState *ms, const char *s, const char *p) {
+ if (ms->matchdepth-- == 0)
+ luaL_error(ms->L, "pattern too complex");
+ init: /* using goto's to optimize tail recursion */
+ if (p != ms->p_end) { /* end of pattern? */
+ switch (*p) {
+ case '(': { /* start capture */
+ if (*(p + 1) == ')') /* position capture? */
+ s = start_capture(ms, s, p + 2, CAP_POSITION);
+ else
+ s = start_capture(ms, s, p + 1, CAP_UNFINISHED);
+ break;
+ }
+ case ')': { /* end capture */
+ s = end_capture(ms, s, p + 1);
+ break;
+ }
+ case '$': {
+ if ((p + 1) != ms->p_end) /* is the `$' the last char in pattern? */
+ goto dflt; /* no; go to default */
+ s = (s == ms->src_end) ? s : NULL; /* check end of string */
+ break;
+ }
+ case L_ESC: { /* escaped sequences not in the format class[*+?-]? */
+ switch (*(p + 1)) {
+ case 'b': { /* balanced string? */
+ s = matchbalance(ms, s, p + 2);
+ if (s != NULL) {
+ p += 4; goto init; /* return match(ms, s, p + 4); */
+ } /* else fail (s == NULL) */
+ break;
+ }
+ case 'f': { /* frontier? */
+ const char *ep; char previous;
+ p += 2;
+ if (*p != '[')
+ luaL_error(ms->L, "missing " LUA_QL("[") " after "
+ LUA_QL("%%f") " in pattern");
+ ep = classend(ms, p); /* points to what is next */
+ previous = (s == ms->src_init) ? '\0' : *(s - 1);
+ if (!matchbracketclass(uchar(previous), p, ep - 1) &&
+ matchbracketclass(uchar(*s), p, ep - 1)) {
+ p = ep; goto init; /* return match(ms, s, ep); */
+ }
+ s = NULL; /* match failed */
+ break;
+ }
+ case '0': case '1': case '2': case '3':
+ case '4': case '5': case '6': case '7':
+ case '8': case '9': { /* capture results (%0-%9)? */
+ s = match_capture(ms, s, uchar(*(p + 1)));
+ if (s != NULL) {
+ p += 2; goto init; /* return match(ms, s, p + 2) */
+ }
+ break;
+ }
+ default: goto dflt;
+ }
+ break;
+ }
+ default: dflt: { /* pattern class plus optional suffix */
+ const char *ep = classend(ms, p); /* points to optional suffix */
+ /* does not match at least once? */
+ if (!singlematch(ms, s, p, ep)) {
+ if (*ep == '*' || *ep == '?' || *ep == '-') { /* accept empty? */
+ p = ep + 1; goto init; /* return match(ms, s, ep + 1); */
+ }
+ else /* '+' or no suffix */
+ s = NULL; /* fail */
+ }
+ else { /* matched once */
+ switch (*ep) { /* handle optional suffix */
+ case '?': { /* optional */
+ const char *res;
+ if ((res = match(ms, s + 1, ep + 1)) != NULL)
+ s = res;
+ else {
+ p = ep + 1; goto init; /* else return match(ms, s, ep + 1); */
+ }
+ break;
+ }
+ case '+': /* 1 or more repetitions */
+ s++; /* 1 match already done */
+ /* FALLTHROUGH */
+ case '*': /* 0 or more repetitions */
+ s = max_expand(ms, s, p, ep);
+ break;
+ case '-': /* 0 or more repetitions (minimum) */
+ s = min_expand(ms, s, p, ep);
+ break;
+ default: /* no suffix */
+ s++; p = ep; goto init; /* return match(ms, s + 1, ep); */
+ }
+ }
+ break;
+ }
+ }
+ }
+ ms->matchdepth++;
+ return s;
+}
+
+
+
+static const char *lmemfind (const char *s1, size_t l1,
+ const char *s2, size_t l2) {
+ if (l2 == 0) return s1; /* empty strings are everywhere */
+ else if (l2 > l1) return NULL; /* avoids a negative `l1' */
+ else {
+ const char *init; /* to search for a `*s2' inside `s1' */
+ l2--; /* 1st char will be checked by `memchr' */
+ l1 = l1-l2; /* `s2' cannot be found after that */
+ while (l1 > 0 && (init = (const char *)memchr(s1, *s2, l1)) != NULL) {
+ init++; /* 1st char is already checked */
+ if (memcmp(init, s2+1, l2) == 0)
+ return init-1;
+ else { /* correct `l1' and `s1' to try again */
+ l1 -= init-s1;
+ s1 = init;
+ }
+ }
+ return NULL; /* not found */
+ }
+}
+
+
+static void push_onecapture (MatchState *ms, int i, const char *s,
+ const char *e) {
+ if (i >= ms->level) {
+ if (i == 0) /* ms->level == 0, too */
+ lua_pushlstring(ms->L, s, e - s); /* add whole match */
+ else
+ luaL_error(ms->L, "invalid capture index");
+ }
+ else {
+ ptrdiff_t l = ms->capture[i].len;
+ if (l == CAP_UNFINISHED) luaL_error(ms->L, "unfinished capture");
+ if (l == CAP_POSITION)
+ lua_pushinteger(ms->L, ms->capture[i].init - ms->src_init + 1);
+ else
+ lua_pushlstring(ms->L, ms->capture[i].init, l);
+ }
+}
+
+
+static int push_captures (MatchState *ms, const char *s, const char *e) {
+ int i;
+ int nlevels = (ms->level == 0 && s) ? 1 : ms->level;
+ luaL_checkstack(ms->L, nlevels, "too many captures");
+ for (i = 0; i < nlevels; i++)
+ push_onecapture(ms, i, s, e);
+ return nlevels; /* number of strings pushed */
+}
+
+
+/* check whether pattern has no special characters */
+static int nospecials (const char *p, size_t l) {
+ size_t upto = 0;
+ do {
+ if (strpbrk(p + upto, SPECIALS))
+ return 0; /* pattern has a special character */
+ upto += strlen(p + upto) + 1; /* may have more after \0 */
+ } while (upto <= l);
+ return 1; /* no special chars found */
+}
+
+
+static int str_find_aux (lua_State *L, int find) {
+ size_t ls, lp;
+ const char *s = luaL_checklstring(L, 1, &ls);
+ const char *p = luaL_checklstring(L, 2, &lp);
+ size_t init = posrelat(luaL_optinteger(L, 3, 1), ls);
+ if (init < 1) init = 1;
+ else if (init > ls + 1) { /* start after string's end? */
+ lua_pushnil(L); /* cannot find anything */
+ return 1;
+ }
+ /* explicit request or no special characters? */
+ if (find && (lua_toboolean(L, 4) || nospecials(p, lp))) {
+ /* do a plain search */
+ const char *s2 = lmemfind(s + init - 1, ls - init + 1, p, lp);
+ if (s2) {
+ lua_pushinteger(L, s2 - s + 1);
+ lua_pushinteger(L, s2 - s + lp);
+ return 2;
+ }
+ }
+ else {
+ MatchState ms;
+ const char *s1 = s + init - 1;
+ int anchor = (*p == '^');
+ if (anchor) {
+ p++; lp--; /* skip anchor character */
+ }
+ ms.L = L;
+ ms.matchdepth = MAXCCALLS;
+ ms.src_init = s;
+ ms.src_end = s + ls;
+ ms.p_end = p + lp;
+ do {
+ const char *res;
+ ms.level = 0;
+ lua_assert(ms.matchdepth == MAXCCALLS);
+ if ((res=match(&ms, s1, p)) != NULL) {
+ if (find) {
+ lua_pushinteger(L, s1 - s + 1); /* start */
+ lua_pushinteger(L, res - s); /* end */
+ return push_captures(&ms, NULL, 0) + 2;
+ }
+ else
+ return push_captures(&ms, s1, res);
+ }
+ } while (s1++ < ms.src_end && !anchor);
+ }
+ lua_pushnil(L); /* not found */
+ return 1;
+}
+
+
+static int str_find (lua_State *L) {
+ return str_find_aux(L, 1);
+}
+
+
+static int str_match (lua_State *L) {
+ return str_find_aux(L, 0);
+}
+
+
+static int gmatch_aux (lua_State *L) {
+ MatchState ms;
+ size_t ls, lp;
+ const char *s = lua_tolstring(L, lua_upvalueindex(1), &ls);
+ const char *p = lua_tolstring(L, lua_upvalueindex(2), &lp);
+ const char *src;
+ ms.L = L;
+ ms.matchdepth = MAXCCALLS;
+ ms.src_init = s;
+ ms.src_end = s+ls;
+ ms.p_end = p + lp;
+ for (src = s + (size_t)lua_tointeger(L, lua_upvalueindex(3));
+ src <= ms.src_end;
+ src++) {
+ const char *e;
+ ms.level = 0;
+ lua_assert(ms.matchdepth == MAXCCALLS);
+ if ((e = match(&ms, src, p)) != NULL) {
+ lua_Integer newstart = e-s;
+ if (e == src) newstart++; /* empty match? go at least one position */
+ lua_pushinteger(L, newstart);
+ lua_replace(L, lua_upvalueindex(3));
+ return push_captures(&ms, src, e);
+ }
+ }
+ return 0; /* not found */
+}
+
+
+static int str_gmatch (lua_State *L) {
+ luaL_checkstring(L, 1);
+ luaL_checkstring(L, 2);
+ lua_settop(L, 2);
+ lua_pushinteger(L, 0);
+ lua_pushcclosure(L, gmatch_aux, 3);
+ return 1;
+}
+
+
+static void add_s (MatchState *ms, luaL_Buffer *b, const char *s,
+ const char *e) {
+ size_t l, i;
+ const char *news = lua_tolstring(ms->L, 3, &l);
+ for (i = 0; i < l; i++) {
+ if (news[i] != L_ESC)
+ luaL_addchar(b, news[i]);
+ else {
+ i++; /* skip ESC */
+ if (!isdigit(uchar(news[i]))) {
+ if (news[i] != L_ESC)
+ luaL_error(ms->L, "invalid use of " LUA_QL("%c")
+ " in replacement string", L_ESC);
+ luaL_addchar(b, news[i]);
+ }
+ else if (news[i] == '0')
+ luaL_addlstring(b, s, e - s);
+ else {
+ push_onecapture(ms, news[i] - '1', s, e);
+ luaL_addvalue(b); /* add capture to accumulated result */
+ }
+ }
+ }
+}
+
+
+static void add_value (MatchState *ms, luaL_Buffer *b, const char *s,
+ const char *e, int tr) {
+ lua_State *L = ms->L;
+ switch (tr) {
+ case LUA_TFUNCTION: {
+ int n;
+ lua_pushvalue(L, 3);
+ n = push_captures(ms, s, e);
+ lua_call(L, n, 1);
+ break;
+ }
+ case LUA_TTABLE: {
+ push_onecapture(ms, 0, s, e);
+ lua_gettable(L, 3);
+ break;
+ }
+ default: { /* LUA_TNUMBER or LUA_TSTRING */
+ add_s(ms, b, s, e);
+ return;
+ }
+ }
+ if (!lua_toboolean(L, -1)) { /* nil or false? */
+ lua_pop(L, 1);
+ lua_pushlstring(L, s, e - s); /* keep original text */
+ }
+ else if (!lua_isstring(L, -1))
+ luaL_error(L, "invalid replacement value (a %s)", luaL_typename(L, -1));
+ luaL_addvalue(b); /* add result to accumulator */
+}
+
+
+static int str_gsub (lua_State *L) {
+ size_t srcl, lp;
+ const char *src = luaL_checklstring(L, 1, &srcl);
+ const char *p = luaL_checklstring(L, 2, &lp);
+ int tr = lua_type(L, 3);
+ size_t max_s = luaL_optinteger(L, 4, srcl+1);
+ int anchor = (*p == '^');
+ size_t n = 0;
+ MatchState ms;
+ luaL_Buffer b;
+ luaL_argcheck(L, tr == LUA_TNUMBER || tr == LUA_TSTRING ||
+ tr == LUA_TFUNCTION || tr == LUA_TTABLE, 3,
+ "string/function/table expected");
+ luaL_buffinit(L, &b);
+ if (anchor) {
+ p++; lp--; /* skip anchor character */
+ }
+ ms.L = L;
+ ms.matchdepth = MAXCCALLS;
+ ms.src_init = src;
+ ms.src_end = src+srcl;
+ ms.p_end = p + lp;
+ while (n < max_s) {
+ const char *e;
+ ms.level = 0;
+ lua_assert(ms.matchdepth == MAXCCALLS);
+ e = match(&ms, src, p);
+ if (e) {
+ n++;
+ add_value(&ms, &b, src, e, tr);
+ }
+ if (e && e>src) /* non-empty match? */
+ src = e; /* skip it */
+ else if (src < ms.src_end)
+ luaL_addchar(&b, *src++);
+ else break;
+ if (anchor) break;
+ }
+ luaL_addlstring(&b, src, ms.src_end-src);
+ luaL_pushresult(&b);
+ lua_pushinteger(L, n); /* number of substitutions */
+ return 2;
+}
+
+/* }====================================================== */
+
+
+
+/*
+** {======================================================
+** STRING FORMAT
+** =======================================================
+*/
+
+/*
+** LUA_INTFRMLEN is the length modifier for integer conversions in
+** 'string.format'; LUA_INTFRM_T is the integer type corresponding to
+** the previous length
+*/
+#if !defined(LUA_INTFRMLEN) /* { */
+#if defined(LUA_USE_LONGLONG)
+
+#define LUA_INTFRMLEN "ll"
+#define LUA_INTFRM_T long long
+
+#else
+
+#define LUA_INTFRMLEN "l"
+#define LUA_INTFRM_T long
+
+#endif
+#endif /* } */
+
+
+/*
+** LUA_FLTFRMLEN is the length modifier for float conversions in
+** 'string.format'; LUA_FLTFRM_T is the float type corresponding to
+** the previous length
+*/
+#if !defined(LUA_FLTFRMLEN)
+
+#define LUA_FLTFRMLEN ""
+#define LUA_FLTFRM_T double
+
+#endif
+
+
+/* maximum size of each formatted item (> len(format('%99.99f', -1e308))) */
+#define MAX_ITEM 512
+/* valid flags in a format specification */
+#define FLAGS "-+ #0"
+/*
+** maximum size of each format specification (such as '%-099.99d')
+** (+10 accounts for %99.99x plus margin of error)
+*/
+#define MAX_FORMAT (sizeof(FLAGS) + sizeof(LUA_INTFRMLEN) + 10)
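+
+/*
+** Size check (annotation, not part of the upstream sources): with FLAGS
+** "-+ #0" (sizeof == 6, counting the NUL) and LUA_INTFRMLEN "ll"
+** (sizeof == 3), MAX_FORMAT == 6 + 3 + 10 == 19, which holds the
+** longest accepted item such as "%-+ #099.99lld" (15 bytes including
+** the terminating NUL).
+*/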
+
+
+static void addquoted (lua_State *L, luaL_Buffer *b, int arg) {
+ size_t l;
+ const char *s = luaL_checklstring(L, arg, &l);
+ luaL_addchar(b, '"');
+ while (l--) {
+ if (*s == '"' || *s == '\\' || *s == '\n') {
+ luaL_addchar(b, '\\');
+ luaL_addchar(b, *s);
+ }
+ else if (*s == '\0' || iscntrl(uchar(*s))) {
+ char buff[10];
+ if (!isdigit(uchar(*(s+1))))
+ sprintf(buff, "\\%d", (int)uchar(*s));
+ else
+ sprintf(buff, "\\%03d", (int)uchar(*s));
+ luaL_addstring(b, buff);
+ }
+ else
+ luaL_addchar(b, *s);
+ s++;
+ }
+ luaL_addchar(b, '"');
+}
+
+static const char *scanformat (lua_State *L, const char *strfrmt, char *form) {
+ const char *p = strfrmt;
+ while (*p != '\0' && strchr(FLAGS, *p) != NULL) p++; /* skip flags */
+ if ((size_t)(p - strfrmt) >= sizeof(FLAGS)/sizeof(char))
+ luaL_error(L, "invalid format (repeated flags)");
+ if (isdigit(uchar(*p))) p++; /* skip width */
+ if (isdigit(uchar(*p))) p++; /* (2 digits at most) */
+ if (*p == '.') {
+ p++;
+ if (isdigit(uchar(*p))) p++; /* skip precision */
+ if (isdigit(uchar(*p))) p++; /* (2 digits at most) */
+ }
+ if (isdigit(uchar(*p)))
+ luaL_error(L, "invalid format (width or precision too long)");
+ *(form++) = '%';
+ memcpy(form, strfrmt, (p - strfrmt + 1) * sizeof(char));
+ form += p - strfrmt + 1;
+ *form = '\0';
+ return p;
+}
+
+
+/*
+** add length modifier into formats
+*/
+static void addlenmod (char *form, const char *lenmod) {
+ size_t l = strlen(form);
+ size_t lm = strlen(lenmod);
+ char spec = form[l - 1];
+ strcpy(form + l - 1, lenmod);
+ form[l + lm - 1] = spec;
+ form[l + lm] = '\0';
+}
+
+
+static int str_format (lua_State *L) {
+ int top = lua_gettop(L);
+ int arg = 1;
+ size_t sfl;
+ const char *strfrmt = luaL_checklstring(L, arg, &sfl);
+ const char *strfrmt_end = strfrmt+sfl;
+ luaL_Buffer b;
+ luaL_buffinit(L, &b);
+ while (strfrmt < strfrmt_end) {
+ if (*strfrmt != L_ESC)
+ luaL_addchar(&b, *strfrmt++);
+ else if (*++strfrmt == L_ESC)
+ luaL_addchar(&b, *strfrmt++); /* %% */
+ else { /* format item */
+ char form[MAX_FORMAT]; /* to store the format (`%...') */
+ char *buff = luaL_prepbuffsize(&b, MAX_ITEM); /* to put formatted item */
+ int nb = 0; /* number of bytes in added item */
+ if (++arg > top)
+ luaL_argerror(L, arg, "no value");
+ strfrmt = scanformat(L, strfrmt, form);
+ switch (*strfrmt++) {
+ case 'c': {
+ nb = str_sprintf(buff, form, luaL_checkint(L, arg));
+ break;
+ }
+ case 'd': case 'i': {
+ lua_Number n = luaL_checknumber(L, arg);
+ LUA_INTFRM_T ni = (LUA_INTFRM_T)n;
+ lua_Number diff = n - (lua_Number)ni;
+ luaL_argcheck(L, -1 < diff && diff < 1, arg,
+ "not a number in proper range");
+ addlenmod(form, LUA_INTFRMLEN);
+ nb = str_sprintf(buff, form, ni);
+ break;
+ }
+ case 'o': case 'u': case 'x': case 'X': {
+ lua_Number n = luaL_checknumber(L, arg);
+ unsigned LUA_INTFRM_T ni = (unsigned LUA_INTFRM_T)n;
+ lua_Number diff = n - (lua_Number)ni;
+ luaL_argcheck(L, -1 < diff && diff < 1, arg,
+ "not a non-negative number in proper range");
+ addlenmod(form, LUA_INTFRMLEN);
+ nb = str_sprintf(buff, form, ni);
+ break;
+ }
+#if defined(LUA_USE_FLOAT_FORMATS)
+ case 'e': case 'E': case 'f':
+#if defined(LUA_USE_AFORMAT)
+ case 'a': case 'A':
+#endif
+ case 'g': case 'G': {
+ addlenmod(form, LUA_FLTFRMLEN);
+ nb = str_sprintf(buff, form, (LUA_FLTFRM_T)luaL_checknumber(L, arg));
+ break;
+ }
+#endif
+ case 'q': {
+ addquoted(L, &b, arg);
+ break;
+ }
+ case 's': {
+ size_t l;
+ const char *s = luaL_tolstring(L, arg, &l);
+ if (!strchr(form, '.') && l >= 100) {
+ /* no precision and string is too long to be formatted;
+ keep original string */
+ luaL_addvalue(&b);
+ break;
+ }
+ else {
+ nb = str_sprintf(buff, form, s);
+ lua_pop(L, 1); /* remove result from 'luaL_tolstring' */
+ break;
+ }
+ }
+ default: { /* also treat cases `pnLlh' */
+ return luaL_error(L, "invalid option " LUA_QL("%%%c") " to "
+ LUA_QL("format"), *(strfrmt - 1));
+ }
+ }
+ luaL_addsize(&b, nb);
+ }
+ }
+ luaL_pushresult(&b);
+ return 1;
+}
+
+/* }====================================================== */
+
+
+static const luaL_Reg strlib[] = {
+ {"byte", str_byte},
+ {"char", str_char},
+ {"dump", str_dump},
+ {"find", str_find},
+ {"format", str_format},
+ {"gmatch", str_gmatch},
+ {"gsub", str_gsub},
+ {"len", str_len},
+ {"lower", str_lower},
+ {"match", str_match},
+ {"rep", str_rep},
+ {"reverse", str_reverse},
+ {"sub", str_sub},
+ {"upper", str_upper},
+ {NULL, NULL}
+};
+
+
+static void createmetatable (lua_State *L) {
+ lua_createtable(L, 0, 1); /* table to be metatable for strings */
+ lua_pushliteral(L, ""); /* dummy string */
+ lua_pushvalue(L, -2); /* copy table */
+ lua_setmetatable(L, -2); /* set table as metatable for strings */
+ lua_pop(L, 1); /* pop dummy string */
+ lua_pushvalue(L, -2); /* get string library */
+ lua_setfield(L, -2, "__index"); /* metatable.__index = string */
+ lua_pop(L, 1); /* pop metatable */
+}
+
+
+/*
+** Open string library
+*/
+LUAMOD_API int luaopen_string (lua_State *L) {
+ luaL_newlib(L, strlib);
+ createmetatable(L);
+ return 1;
+}
+
diff --git a/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/lua/ltable.c b/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/lua/ltable.c
new file mode 100644
index 000000000000..4f8ab1b16733
--- /dev/null
+++ b/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/lua/ltable.c
@@ -0,0 +1,589 @@
+/*
+** $Id: ltable.c,v 2.72.1.1 2013/04/12 18:48:47 roberto Exp $
+** Lua tables (hash)
+** See Copyright Notice in lua.h
+*/
+
+
+/*
+** Implementation of tables (aka arrays, objects, or hash tables).
+** Tables keep their elements in two parts: an array part and a hash part.
+** Non-negative integer keys are all candidates to be kept in the array
+** part. The actual size of the array is the largest `n' such that at
+** least half the slots between 0 and n are in use.
+** The hash part uses a chained scatter table with Brent's variation.
+** A main invariant of these tables is that, if an element is not
+** in its main position (i.e. the `original' position that its hash gives
+** to it), then the colliding element is in its own main position.
+** Hence even when the load factor reaches 100%, performance remains good.
+*/
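+
+/*
+** Worked example (annotation, not part of the upstream sources): for a
+** table holding the keys 1, 2, 3 and 5, the rehash below settles on an
+** array part of size 4, since keys 1-3 fill 3 of 4 slots (more than
+** half); an array of 8 slots would be exactly half full, failing the
+** "more than half" test, so key 5 stays in the hash part.
+*/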
+
+#include <sys/zfs_context.h>
+
+#define ltable_c
+#define LUA_CORE
+
+#include "lua.h"
+
+#include "ldebug.h"
+#include "ldo.h"
+#include "lgc.h"
+#include "lmem.h"
+#include "lobject.h"
+#include "lstate.h"
+#include "lstring.h"
+#include "ltable.h"
+#include "lvm.h"
+
+
+/*
+** max size of array part is 2^MAXBITS
+*/
+#if LUAI_BITSINT >= 32
+#define MAXBITS 30
+#else
+#define MAXBITS (LUAI_BITSINT-2)
+#endif
+
+#define MAXASIZE (1 << MAXBITS)
+
+
+#define hashpow2(t,n) (gnode(t, lmod((n), sizenode(t))))
+
+#define hashstr(t,str) hashpow2(t, (str)->tsv.hash)
+#define hashboolean(t,p) hashpow2(t, p)
+
+
+/*
+** for some types, it is better to avoid modulus by power of 2, as
+** they tend to have many 2 factors.
+*/
+#define hashmod(t,n) (gnode(t, ((n) % ((sizenode(t)-1)|1))))
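+
+/*
+** Example (annotation, not part of the upstream sources): with
+** sizenode(t) == 8 the divisor is (8-1)|1 == 7, an odd number, so
+** pointer keys, which are typically multiples of 8 or 16, spread over
+** all buckets instead of clustering on a few even slots.
+*/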
+
+
+#define hashpointer(t,p) hashmod(t, IntPoint(p))
+
+
+#define dummynode (&dummynode_)
+
+#define isdummy(n) ((n) == dummynode)
+
+static const Node dummynode_ = {
+ {NILCONSTANT}, /* value */
+ {{NILCONSTANT, NULL}} /* key */
+};
+
+
+/*
+** hash for lua_Numbers
+*/
+static Node *hashnum (const Table *t, lua_Number n) {
+ int i;
+ luai_hashnum(i, n);
+ if (i < 0) {
+ if (cast(unsigned int, i) == 0u - i) /* use unsigned to avoid overflows */
+ i = 0; /* handle INT_MIN */
+ i = -i; /* must be a positive value */
+ }
+ return hashmod(t, i);
+}
+
+
+
+/*
+** returns the `main' position of an element in a table (that is, the index
+** of its hash value)
+*/
+static Node *mainposition (const Table *t, const TValue *key) {
+ switch (ttype(key)) {
+ case LUA_TNUMBER:
+ return hashnum(t, nvalue(key));
+ case LUA_TLNGSTR: {
+ TString *s = rawtsvalue(key);
+ if (s->tsv.extra == 0) { /* no hash? */
+ s->tsv.hash = luaS_hash(getstr(s), s->tsv.len, s->tsv.hash);
+ s->tsv.extra = 1; /* now it has its hash */
+ }
+ return hashstr(t, rawtsvalue(key));
+ }
+ case LUA_TSHRSTR:
+ return hashstr(t, rawtsvalue(key));
+ case LUA_TBOOLEAN:
+ return hashboolean(t, bvalue(key));
+ case LUA_TLIGHTUSERDATA:
+ return hashpointer(t, pvalue(key));
+ case LUA_TLCF:
+ return hashpointer(t, fvalue(key));
+ default:
+ return hashpointer(t, gcvalue(key));
+ }
+}
+
+
+/*
+** returns the index for `key' if `key' is an appropriate key to live in
+** the array part of the table, -1 otherwise.
+*/
+static int arrayindex (const TValue *key) {
+ if (ttisnumber(key)) {
+ lua_Number n = nvalue(key);
+ int k;
+ lua_number2int(k, n);
+ if (luai_numeq(cast_num(k), n))
+ return k;
+ }
+ return -1; /* `key' did not match some condition */
+}
+
+
+/*
+** returns the index of a `key' for table traversals. First come all
+** elements in the array part, then the elements in the hash part. The
+** beginning of a traversal is signaled by -1.
+*/
+static int findindex (lua_State *L, Table *t, StkId key) {
+ int i;
+ if (ttisnil(key)) return -1; /* first iteration */
+ i = arrayindex(key);
+ if (0 < i && i <= t->sizearray) /* is `key' inside array part? */
+ return i-1; /* yes; that's the index (corrected to C) */
+ else {
+ Node *n = mainposition(t, key);
+ for (;;) { /* check whether `key' is somewhere in the chain */
+ /* key may be dead already, but it is ok to use it in `next' */
+ if (luaV_rawequalobj(gkey(n), key) ||
+ (ttisdeadkey(gkey(n)) && iscollectable(key) &&
+ deadvalue(gkey(n)) == gcvalue(key))) {
+ i = cast_int(n - gnode(t, 0)); /* key index in hash table */
+ /* hash elements are numbered after array ones */
+ return i + t->sizearray;
+ }
+ else n = gnext(n);
+ if (n == NULL)
+ luaG_runerror(L, "invalid key to " LUA_QL("next")); /* key not found */
+ }
+ }
+}
+
+
+int luaH_next (lua_State *L, Table *t, StkId key) {
+ int i = findindex(L, t, key); /* find original element */
+ for (i++; i < t->sizearray; i++) { /* try first array part */
+ if (!ttisnil(&t->array[i])) { /* a non-nil value? */
+ setnvalue(key, cast_num(i+1));
+ setobj2s(L, key+1, &t->array[i]);
+ return 1;
+ }
+ }
+ for (i -= t->sizearray; i < sizenode(t); i++) { /* then hash part */
+ if (!ttisnil(gval(gnode(t, i)))) { /* a non-nil value? */
+ setobj2s(L, key, gkey(gnode(t, i)));
+ setobj2s(L, key+1, gval(gnode(t, i)));
+ return 1;
+ }
+ }
+ return 0; /* no more elements */
+}
+
+
+/*
+** {=============================================================
+** Rehash
+** ==============================================================
+*/
+
+
+static int computesizes (int nums[], int *narray) {
+ int i;
+ int twotoi; /* 2^i */
+ int a = 0; /* number of elements smaller than 2^i */
+ int na = 0; /* number of elements to go to array part */
+ int n = 0; /* optimal size for array part */
+ for (i = 0, twotoi = 1; twotoi/2 < *narray; i++, twotoi *= 2) {
+ if (nums[i] > 0) {
+ a += nums[i];
+ if (a > twotoi/2) { /* more than half the elements present? */
+ n = twotoi; /* optimal size (till now) */
+ na = a; /* all elements smaller than n will go to array part */
+ }
+ }
+ if (a == *narray) break; /* all elements already counted */
+ }
+ *narray = n;
+ lua_assert(*narray/2 <= na && na <= *narray);
+ return na;
+}
+
+
+static int countint (const TValue *key, int *nums) {
+ int k = arrayindex(key);
+ if (0 < k && k <= MAXASIZE) { /* is `key' an appropriate array index? */
+ nums[luaO_ceillog2(k)]++; /* count as such */
+ return 1;
+ }
+ else
+ return 0;
+}
+
+
+static int numusearray (const Table *t, int *nums) {
+ int lg;
+ int ttlg; /* 2^lg */
+ int ause = 0; /* summation of `nums' */
+ int i = 1; /* count to traverse all array keys */
+ for (lg=0, ttlg=1; lg<=MAXBITS; lg++, ttlg*=2) { /* for each slice */
+ int lc = 0; /* counter */
+ int lim = ttlg;
+ if (lim > t->sizearray) {
+ lim = t->sizearray; /* adjust upper limit */
+ if (i > lim)
+ break; /* no more elements to count */
+ }
+ /* count elements in range (2^(lg-1), 2^lg] */
+ for (; i <= lim; i++) {
+ if (!ttisnil(&t->array[i-1]))
+ lc++;
+ }
+ nums[lg] += lc;
+ ause += lc;
+ }
+ return ause;
+}
+
+
+static int numusehash (const Table *t, int *nums, int *pnasize) {
+ int totaluse = 0; /* total number of elements */
+ int ause = 0; /* summation of `nums' */
+ int i = sizenode(t);
+ while (i--) {
+ Node *n = &t->node[i];
+ if (!ttisnil(gval(n))) {
+ ause += countint(gkey(n), nums);
+ totaluse++;
+ }
+ }
+ *pnasize += ause;
+ return totaluse;
+}
+
+
+static void setarrayvector (lua_State *L, Table *t, int size) {
+ int i;
+ luaM_reallocvector(L, t->array, t->sizearray, size, TValue);
+ for (i=t->sizearray; i<size; i++)
+ setnilvalue(&t->array[i]);
+ t->sizearray = size;
+}
+
+
+static void setnodevector (lua_State *L, Table *t, int size) {
+ int lsize;
+ if (size == 0) { /* no elements to hash part? */
+ t->node = cast(Node *, dummynode); /* use common `dummynode' */
+ lsize = 0;
+ }
+ else {
+ int i;
+ lsize = luaO_ceillog2(size);
+ if (lsize > MAXBITS)
+ luaG_runerror(L, "table overflow");
+ size = twoto(lsize);
+ t->node = luaM_newvector(L, size, Node);
+ for (i=0; i<size; i++) {
+ Node *n = gnode(t, i);
+ gnext(n) = NULL;
+ setnilvalue(gkey(n));
+ setnilvalue(gval(n));
+ }
+ }
+ t->lsizenode = cast_byte(lsize);
+ t->lastfree = gnode(t, size); /* all positions are free */
+}
+
+
+void luaH_resize (lua_State *L, Table *t, int nasize, int nhsize) {
+ int i;
+ int oldasize = t->sizearray;
+ int oldhsize = t->lsizenode;
+ Node *nold = t->node; /* save old hash ... */
+ if (nasize > oldasize) /* array part must grow? */
+ setarrayvector(L, t, nasize);
+ /* create new hash part with appropriate size */
+ setnodevector(L, t, nhsize);
+ if (nasize < oldasize) { /* array part must shrink? */
+ t->sizearray = nasize;
+ /* re-insert elements from vanishing slice */
+ for (i=nasize; i<oldasize; i++) {
+ if (!ttisnil(&t->array[i]))
+ luaH_setint(L, t, i + 1, &t->array[i]);
+ }
+ /* shrink array */
+ luaM_reallocvector(L, t->array, oldasize, nasize, TValue);
+ }
+ /* re-insert elements from hash part */
+ for (i = twoto(oldhsize) - 1; i >= 0; i--) {
+ Node *old = nold+i;
+ if (!ttisnil(gval(old))) {
+ /* no GC barrier or cache invalidation needed, as the entry was
+ already present in the table */
+ setobjt2t(L, luaH_set(L, t, gkey(old)), gval(old));
+ }
+ }
+ if (!isdummy(nold))
+ luaM_freearray(L, nold, cast(size_t, twoto(oldhsize))); /* free old array */
+}
+
+
+void luaH_resizearray (lua_State *L, Table *t, int nasize) {
+ int nsize = isdummy(t->node) ? 0 : sizenode(t);
+ luaH_resize(L, t, nasize, nsize);
+}
+
+
+static void rehash (lua_State *L, Table *t, const TValue *ek) {
+ int nasize, na;
+ int nums[MAXBITS+1]; /* nums[i] = number of keys with 2^(i-1) < k <= 2^i */
+ int i;
+ int totaluse;
+ for (i=0; i<=MAXBITS; i++) nums[i] = 0; /* reset counts */
+ nasize = numusearray(t, nums); /* count keys in array part */
+ totaluse = nasize; /* all those keys are integer keys */
+ totaluse += numusehash(t, nums, &nasize); /* count keys in hash part */
+ /* count extra key */
+ nasize += countint(ek, nums);
+ totaluse++;
+ /* compute new size for array part */
+ na = computesizes(nums, &nasize);
+ /* resize the table to new computed sizes */
+ luaH_resize(L, t, nasize, totaluse - na);
+}
+
+
+
+/*
+** }=============================================================
+*/
+
+
+Table *luaH_new (lua_State *L) {
+ Table *t = &luaC_newobj(L, LUA_TTABLE, sizeof(Table), NULL, 0)->h;
+ t->metatable = NULL;
+ t->flags = cast_byte(~0);
+ t->array = NULL;
+ t->sizearray = 0;
+ setnodevector(L, t, 0);
+ return t;
+}
+
+
+void luaH_free (lua_State *L, Table *t) {
+ if (!isdummy(t->node))
+ luaM_freearray(L, t->node, cast(size_t, sizenode(t)));
+ luaM_freearray(L, t->array, t->sizearray);
+ luaM_free(L, t);
+}
+
+
+static Node *getfreepos (Table *t) {
+ while (t->lastfree > t->node) {
+ t->lastfree--;
+ if (ttisnil(gkey(t->lastfree)))
+ return t->lastfree;
+ }
+ return NULL; /* could not find a free place */
+}
+
+
+
+/*
+** inserts a new key into a hash table; first, check whether key's main
+** position is free. If not, check whether colliding node is in its main
+** position or not: if it is not, move colliding node to an empty place and
+** put new key in its main position; otherwise (colliding node is in its main
+** position), new key goes to an empty position.
+*/
+TValue *luaH_newkey (lua_State *L, Table *t, const TValue *key) {
+ Node *mp;
+ if (ttisnil(key)) luaG_runerror(L, "table index is nil");
+ else if (ttisnumber(key) && luai_numisnan(L, nvalue(key)))
+ luaG_runerror(L, "table index is NaN");
+ mp = mainposition(t, key);
+ if (!ttisnil(gval(mp)) || isdummy(mp)) { /* main position is taken? */
+ Node *othern;
+ Node *n = getfreepos(t); /* get a free place */
+ if (n == NULL) { /* cannot find a free place? */
+ rehash(L, t, key); /* grow table */
+ /* whatever called 'newkey' must take care of the TM cache and GC barrier */
+ return luaH_set(L, t, key); /* insert key into grown table */
+ }
+ lua_assert(!isdummy(n));
+ othern = mainposition(t, gkey(mp));
+ if (othern != mp) { /* is colliding node out of its main position? */
+ /* yes; move colliding node into free position */
+ while (gnext(othern) != mp) othern = gnext(othern); /* find previous */
+ gnext(othern) = n; /* redo the chain with `n' in place of `mp' */
+ *n = *mp; /* copy colliding node into free pos. (mp->next also goes) */
+ gnext(mp) = NULL; /* now `mp' is free */
+ setnilvalue(gval(mp));
+ }
+ else { /* colliding node is in its own main position */
+ /* new node will go into free position */
+ gnext(n) = gnext(mp); /* chain new position */
+ gnext(mp) = n;
+ mp = n;
+ }
+ }
+ setobj2t(L, gkey(mp), key);
+ luaC_barrierback(L, obj2gco(t), key);
+ lua_assert(ttisnil(gval(mp)));
+ return gval(mp);
+}
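+
+/*
+** Walkthrough (annotation, not part of the upstream sources): suppose
+** keys A and B both hash to node 3. A arrives first and takes node 3,
+** its main position. When B arrives, node 3 is occupied and A *is* in
+** its main position, so B is stored in a free node and chained right
+** after node 3. If instead node 3 held a key whose main position is
+** elsewhere, that squatter would be moved to the free node and B would
+** take node 3 directly, preserving the invariant described above.
+*/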
+
+
+/*
+** search function for integers
+*/
+const TValue *luaH_getint (Table *t, int key) {
+ /* (1 <= key && key <= t->sizearray) */
+ if (cast(unsigned int, key-1) < cast(unsigned int, t->sizearray))
+ return &t->array[key-1];
+ else {
+ lua_Number nk = cast_num(key);
+ Node *n = hashnum(t, nk);
+ do { /* check whether `key' is somewhere in the chain */
+ if (ttisnumber(gkey(n)) && luai_numeq(nvalue(gkey(n)), nk))
+ return gval(n); /* that's it */
+ else n = gnext(n);
+ } while (n);
+ return luaO_nilobject;
+ }
+}
+
+
+/*
+** search function for short strings
+*/
+const TValue *luaH_getstr (Table *t, TString *key) {
+ Node *n = hashstr(t, key);
+ lua_assert(key->tsv.tt == LUA_TSHRSTR);
+ do { /* check whether `key' is somewhere in the chain */
+ if (ttisshrstring(gkey(n)) && eqshrstr(rawtsvalue(gkey(n)), key))
+ return gval(n); /* that's it */
+ else n = gnext(n);
+ } while (n);
+ return luaO_nilobject;
+}
+
+
+/*
+** main search function
+*/
+const TValue *luaH_get (Table *t, const TValue *key) {
+ switch (ttype(key)) {
+ case LUA_TSHRSTR: return luaH_getstr(t, rawtsvalue(key));
+ case LUA_TNIL: return luaO_nilobject;
+ case LUA_TNUMBER: {
+ int k;
+ lua_Number n = nvalue(key);
+ lua_number2int(k, n);
+ if (luai_numeq(cast_num(k), n)) /* index is int? */
+ return luaH_getint(t, k); /* use specialized version */
+ /* else go through */
+ }
+ /* FALLTHROUGH */
+ default: {
+ Node *n = mainposition(t, key);
+ do { /* check whether `key' is somewhere in the chain */
+ if (luaV_rawequalobj(gkey(n), key))
+ return gval(n); /* that's it */
+ else n = gnext(n);
+ } while (n);
+ return luaO_nilobject;
+ }
+ }
+}
+
+
+/*
+** beware: when using this function you probably need to check a GC
+** barrier and invalidate the TM cache.
+*/
+TValue *luaH_set (lua_State *L, Table *t, const TValue *key) {
+ const TValue *p = luaH_get(t, key);
+ if (p != luaO_nilobject)
+ return cast(TValue *, p);
+ else return luaH_newkey(L, t, key);
+}
+
+
+void luaH_setint (lua_State *L, Table *t, int key, TValue *value) {
+ const TValue *p = luaH_getint(t, key);
+ TValue *cell;
+ if (p != luaO_nilobject)
+ cell = cast(TValue *, p);
+ else {
+ TValue k;
+ setnvalue(&k, cast_num(key));
+ cell = luaH_newkey(L, t, &k);
+ }
+ setobj2t(L, cell, value);
+}
+
+
+static int unbound_search (Table *t, unsigned int j) {
+ unsigned int i = j; /* i is zero or a present index */
+ j++;
+ /* find `i' and `j' such that i is present and j is not */
+ while (!ttisnil(luaH_getint(t, j))) {
+ i = j;
+ j *= 2;
+ if (j > cast(unsigned int, MAX_INT)) { /* overflow? */
+ /* table was built to defeat the doubling search: resort to a linear one */
+ i = 1;
+ while (!ttisnil(luaH_getint(t, i))) i++;
+ return i - 1;
+ }
+ }
+ /* now do a binary search between them */
+ while (j - i > 1) {
+ unsigned int m = (i+j)/2;
+ if (ttisnil(luaH_getint(t, m))) j = m;
+ else i = m;
+ }
+ return i;
+}
+
+
+/*
+** Try to find a boundary in table `t'. A `boundary' is an integer index
+** such that t[i] is non-nil and t[i+1] is nil (and 0 if t[1] is nil).
+*/
+int luaH_getn (Table *t) {
+ unsigned int j = t->sizearray;
+ if (j > 0 && ttisnil(&t->array[j - 1])) {
+ /* there is a boundary in the array part: (binary) search for it */
+ unsigned int i = 0;
+ while (j - i > 1) {
+ unsigned int m = (i+j)/2;
+ if (ttisnil(&t->array[m - 1])) j = m;
+ else i = m;
+ }
+ return i;
+ }
+ /* else must find a boundary in hash part */
+ else if (isdummy(t->node)) /* hash part is empty? */
+ return j; /* that is easy... */
+ else return unbound_search(t, j);
+}
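+
+/*
+** Example (annotation, not part of the upstream sources): for an array
+** part of size 4 holding {x, y, nil, nil}, j == 4 and t->array[3] is
+** nil, so the binary search narrows (i, j) from (0, 4) to (2, 4) to
+** (2, 3) and returns 2: t[2] is non-nil and t[3] is nil, a valid
+** boundary.
+*/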
+
+
+
+#if defined(LUA_DEBUG)
+
+Node *luaH_mainposition (const Table *t, const TValue *key) {
+ return mainposition(t, key);
+}
+
+int luaH_isdummy (Node *n) { return isdummy(n); }
+
+#endif
diff --git a/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/lua/ltable.h b/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/lua/ltable.h
new file mode 100644
index 000000000000..d69449b2b863
--- /dev/null
+++ b/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/lua/ltable.h
@@ -0,0 +1,45 @@
+/*
+** $Id: ltable.h,v 2.16.1.2 2013/08/30 15:49:41 roberto Exp $
+** Lua tables (hash)
+** See Copyright Notice in lua.h
+*/
+
+#ifndef ltable_h
+#define ltable_h
+
+#include "lobject.h"
+
+
+#define gnode(t,i) (&(t)->node[i])
+#define gkey(n) (&(n)->i_key.tvk)
+#define gval(n) (&(n)->i_val)
+#define gnext(n) ((n)->i_key.nk.next)
+
+#define invalidateTMcache(t) ((t)->flags = 0)
+
+/* returns the key, given the value of a table entry */
+#define keyfromval(v) \
+ (gkey(cast(Node *, cast(char *, (v)) - offsetof(Node, i_val))))
+
+
+LUAI_FUNC const TValue *luaH_getint (Table *t, int key);
+LUAI_FUNC void luaH_setint (lua_State *L, Table *t, int key, TValue *value);
+LUAI_FUNC const TValue *luaH_getstr (Table *t, TString *key);
+LUAI_FUNC const TValue *luaH_get (Table *t, const TValue *key);
+LUAI_FUNC TValue *luaH_newkey (lua_State *L, Table *t, const TValue *key);
+LUAI_FUNC TValue *luaH_set (lua_State *L, Table *t, const TValue *key);
+LUAI_FUNC Table *luaH_new (lua_State *L);
+LUAI_FUNC void luaH_resize (lua_State *L, Table *t, int nasize, int nhsize);
+LUAI_FUNC void luaH_resizearray (lua_State *L, Table *t, int nasize);
+LUAI_FUNC void luaH_free (lua_State *L, Table *t);
+LUAI_FUNC int luaH_next (lua_State *L, Table *t, StkId key);
+LUAI_FUNC int luaH_getn (Table *t);
+
+
+#if defined(LUA_DEBUG)
+LUAI_FUNC Node *luaH_mainposition (const Table *t, const TValue *key);
+LUAI_FUNC int luaH_isdummy (Node *n);
+#endif
+
+
+#endif
diff --git a/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/lua/ltablib.c b/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/lua/ltablib.c
new file mode 100644
index 000000000000..ac9a662448fa
--- /dev/null
+++ b/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/lua/ltablib.c
@@ -0,0 +1,284 @@
+/*
+** $Id: ltablib.c,v 1.65.1.2 2014/05/07 16:32:55 roberto Exp $
+** Library for Table Manipulation
+** See Copyright Notice in lua.h
+*/
+
+
+#include <sys/zfs_context.h>
+
+#define ltablib_c
+#define LUA_LIB
+
+#include "lua.h"
+
+#include "lauxlib.h"
+#include "lualib.h"
+
+
+#define aux_getn(L,n) (luaL_checktype(L, n, LUA_TTABLE), luaL_len(L, n))
+
+
+
+#if defined(LUA_COMPAT_MAXN)
+static int maxn (lua_State *L) {
+ lua_Number max = 0;
+ luaL_checktype(L, 1, LUA_TTABLE);
+ lua_pushnil(L); /* first key */
+ while (lua_next(L, 1)) {
+ lua_pop(L, 1); /* remove value */
+ if (lua_type(L, -1) == LUA_TNUMBER) {
+ lua_Number v = lua_tonumber(L, -1);
+ if (v > max) max = v;
+ }
+ }
+ lua_pushnumber(L, max);
+ return 1;
+}
+#endif
+
+
+static int tinsert (lua_State *L) {
+ int e = aux_getn(L, 1) + 1; /* first empty element */
+ int pos; /* where to insert new element */
+ switch (lua_gettop(L)) {
+ case 2: { /* called with only 2 arguments */
+ pos = e; /* insert new element at the end */
+ break;
+ }
+ case 3: {
+ int i;
+ pos = luaL_checkint(L, 2); /* 2nd argument is the position */
+ luaL_argcheck(L, 1 <= pos && pos <= e, 2, "position out of bounds");
+ for (i = e; i > pos; i--) { /* move up elements */
+ lua_rawgeti(L, 1, i-1);
+ lua_rawseti(L, 1, i); /* t[i] = t[i-1] */
+ }
+ break;
+ }
+ default: {
+ return luaL_error(L, "wrong number of arguments to " LUA_QL("insert"));
+ }
+ }
+ lua_rawseti(L, 1, pos); /* t[pos] = v */
+ return 0;
+}
+
+
+static int tremove (lua_State *L) {
+ int size = aux_getn(L, 1);
+ int pos = luaL_optint(L, 2, size);
+ if (pos != size) /* validate 'pos' if given */
+ luaL_argcheck(L, 1 <= pos && pos <= size + 1, 1, "position out of bounds");
+ lua_rawgeti(L, 1, pos); /* result = t[pos] */
+ for ( ; pos < size; pos++) {
+ lua_rawgeti(L, 1, pos+1);
+ lua_rawseti(L, 1, pos); /* t[pos] = t[pos+1] */
+ }
+ lua_pushnil(L);
+ lua_rawseti(L, 1, pos); /* t[pos] = nil */
+ return 1;
+}
+
+
+static void addfield (lua_State *L, luaL_Buffer *b, int i) {
+ lua_rawgeti(L, 1, i);
+ if (!lua_isstring(L, -1))
+ luaL_error(L, "invalid value (%s) at index %d in table for "
+ LUA_QL("concat"), luaL_typename(L, -1), i);
+ luaL_addvalue(b);
+}
+
+
+static int tconcat (lua_State *L) {
+ luaL_Buffer b;
+ size_t lsep;
+ int i, last;
+ const char *sep = luaL_optlstring(L, 2, "", &lsep);
+ luaL_checktype(L, 1, LUA_TTABLE);
+ i = luaL_optint(L, 3, 1);
+ last = luaL_opt(L, luaL_checkint, 4, luaL_len(L, 1));
+ luaL_buffinit(L, &b);
+ for (; i < last; i++) {
+ addfield(L, &b, i);
+ luaL_addlstring(&b, sep, lsep);
+ }
+ if (i == last) /* add last value (if interval was not empty) */
+ addfield(L, &b, i);
+ luaL_pushresult(&b);
+ return 1;
+}
+
+
+/*
+** {======================================================
+** Pack/unpack
+** =======================================================
+*/
+
+static int pack (lua_State *L) {
+ int n = lua_gettop(L); /* number of elements to pack */
+ lua_createtable(L, n, 1); /* create result table */
+ lua_pushinteger(L, n);
+ lua_setfield(L, -2, "n"); /* t.n = number of elements */
+ if (n > 0) { /* at least one element? */
+ int i;
+ lua_pushvalue(L, 1);
+ lua_rawseti(L, -2, 1); /* insert first element */
+ lua_replace(L, 1); /* move table into index 1 */
+ for (i = n; i >= 2; i--) /* assign other elements */
+ lua_rawseti(L, 1, i);
+ }
+ return 1; /* return table */
+}
+
+
+static int unpack (lua_State *L) {
+ int i, e;
+ unsigned int n;
+ luaL_checktype(L, 1, LUA_TTABLE);
+ i = luaL_optint(L, 2, 1);
+ e = luaL_opt(L, luaL_checkint, 3, luaL_len(L, 1));
+ if (i > e) return 0; /* empty range */
+ n = (unsigned int)e - (unsigned int)i; /* number of elements minus 1 */
+ if (n > (INT_MAX - 10) || !lua_checkstack(L, ++n))
+ return luaL_error(L, "too many results to unpack");
+ lua_rawgeti(L, 1, i); /* push arg[i] (avoiding overflow problems) */
+ while (i++ < e) /* push arg[i + 1...e] */
+ lua_rawgeti(L, 1, i);
+ return n;
+}
+
+/* }====================================================== */
+
+
+
+/*
+** {======================================================
+** Quicksort
+** (based on `Algorithms in MODULA-3', Robert Sedgewick;
+** Addison-Wesley, 1993.)
+** =======================================================
+*/
+
+
+static void set2 (lua_State *L, int i, int j) {
+ lua_rawseti(L, 1, i);
+ lua_rawseti(L, 1, j);
+}
+
+static int sort_comp (lua_State *L, int a, int b) {
+ if (!lua_isnil(L, 2)) { /* function? */
+ int res;
+ lua_pushvalue(L, 2);
+ lua_pushvalue(L, a-1); /* -1 to compensate function */
+ lua_pushvalue(L, b-2); /* -2 to compensate function and `a' */
+ lua_call(L, 2, 1);
+ res = lua_toboolean(L, -1);
+ lua_pop(L, 1);
+ return res;
+ }
+ else /* a < b? */
+ return lua_compare(L, a, b, LUA_OPLT);
+}
+
+static void auxsort (lua_State *L, int l, int u) {
+ while (l < u) { /* for tail recursion */
+ int i, j;
+ /* sort elements a[l], a[(l+u)/2] and a[u] */
+ lua_rawgeti(L, 1, l);
+ lua_rawgeti(L, 1, u);
+ if (sort_comp(L, -1, -2)) /* a[u] < a[l]? */
+ set2(L, l, u); /* swap a[l] - a[u] */
+ else
+ lua_pop(L, 2);
+ if (u-l == 1) break; /* only 2 elements */
+ i = (l+u)/2;
+ lua_rawgeti(L, 1, i);
+ lua_rawgeti(L, 1, l);
+ if (sort_comp(L, -2, -1)) /* a[i]<a[l]? */
+ set2(L, i, l);
+ else {
+ lua_pop(L, 1); /* remove a[l] */
+ lua_rawgeti(L, 1, u);
+ if (sort_comp(L, -1, -2)) /* a[u]<a[i]? */
+ set2(L, i, u);
+ else
+ lua_pop(L, 2);
+ }
+ if (u-l == 2) break; /* only 3 elements */
+ lua_rawgeti(L, 1, i); /* Pivot */
+ lua_pushvalue(L, -1);
+ lua_rawgeti(L, 1, u-1);
+ set2(L, i, u-1);
+ /* a[l] <= P == a[u-1] <= a[u], only need to sort from l+1 to u-2 */
+ i = l; j = u-1;
+ for (;;) { /* invariant: a[l..i] <= P <= a[j..u] */
+ /* repeat ++i until a[i] >= P */
+ while (lua_rawgeti(L, 1, ++i), sort_comp(L, -1, -2)) {
+ if (i>=u) luaL_error(L, "invalid order function for sorting");
+ lua_pop(L, 1); /* remove a[i] */
+ }
+ /* repeat --j until a[j] <= P */
+ while (lua_rawgeti(L, 1, --j), sort_comp(L, -3, -1)) {
+ if (j<=l) luaL_error(L, "invalid order function for sorting");
+ lua_pop(L, 1); /* remove a[j] */
+ }
+ if (j<i) {
+ lua_pop(L, 3); /* pop pivot, a[i], a[j] */
+ break;
+ }
+ set2(L, i, j);
+ }
+ lua_rawgeti(L, 1, u-1);
+ lua_rawgeti(L, 1, i);
+ set2(L, u-1, i); /* swap pivot (a[u-1]) with a[i] */
+ /* a[l..i-1] <= a[i] == P <= a[i+1..u] */
+ /* adjust so that smaller half is in [j..i] and larger one in [l..u] */
+ if (i-l < u-i) {
+ j=l; i=i-1; l=i+2;
+ }
+ else {
+ j=i+1; i=u; u=j-2;
+ }
+ auxsort(L, j, i); /* call recursively the smaller one */
+ } /* repeat the routine for the larger one */
+}
+
+static int sort (lua_State *L) {
+ int n = aux_getn(L, 1);
+ luaL_checkstack(L, 40, ""); /* assume array is smaller than 2^40 */
+ if (!lua_isnoneornil(L, 2)) /* is there a 2nd argument? */
+ luaL_checktype(L, 2, LUA_TFUNCTION);
+  lua_settop(L, 2);  /* make sure there are two arguments */
+ auxsort(L, 1, n);
+ return 0;
+}
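
The 40-slot reservation corresponds to auxsort recursing only into the smaller partition and looping on the larger one, which bounds C-stack depth by roughly log2 of the array length. A usage sketch with a custom comparator (same stock-Lua userland assumptions as above):

    #include "lua.h"
    #include "lauxlib.h"
    #include "lualib.h"

    int main(void) {
      lua_State *L = luaL_newstate();
      luaL_openlibs(L);
      (void)luaL_dostring(L,
          "local t = {3, 1, 2}\n"
          "table.sort(t, function (a, b) return a > b end)\n"
          "print(table.concat(t, ','))");   /* prints 3,2,1 */
      lua_close(L);
      return 0;
    }
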
+
+/* }====================================================== */
+
+
+static const luaL_Reg tab_funcs[] = {
+ {"concat", tconcat},
+#if defined(LUA_COMPAT_MAXN)
+ {"maxn", maxn},
+#endif
+ {"insert", tinsert},
+ {"pack", pack},
+ {"unpack", unpack},
+ {"remove", tremove},
+ {"sort", sort},
+ {NULL, NULL}
+};
+
+
+LUAMOD_API int luaopen_table (lua_State *L) {
+ luaL_newlib(L, tab_funcs);
+#if defined(LUA_COMPAT_UNPACK)
+ /* _G.unpack = table.unpack */
+ lua_getfield(L, -1, "unpack");
+ lua_setglobal(L, "unpack");
+#endif
+ return 1;
+}
+
diff --git a/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/lua/ltm.c b/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/lua/ltm.c
new file mode 100644
index 000000000000..671ff82505b1
--- /dev/null
+++ b/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/lua/ltm.c
@@ -0,0 +1,77 @@
+/*
+** $Id: ltm.c,v 2.14.1.1 2013/04/12 18:48:47 roberto Exp $
+** Tag methods
+** See Copyright Notice in lua.h
+*/
+
+
+#include <sys/zfs_context.h>
+
+#define ltm_c
+#define LUA_CORE
+
+#include "lua.h"
+
+#include "lobject.h"
+#include "lstate.h"
+#include "lstring.h"
+#include "ltable.h"
+#include "ltm.h"
+
+
+static const char udatatypename[] = "userdata";
+
+LUAI_DDEF const char *const luaT_typenames_[LUA_TOTALTAGS] = {
+ "no value",
+ "nil", "boolean", udatatypename, "number",
+ "string", "table", "function", udatatypename, "thread",
+ "proto", "upval" /* these last two cases are used for tests only */
+};
+
+
+void luaT_init (lua_State *L) {
+ static const char *const luaT_eventname[] = { /* ORDER TM */
+ "__index", "__newindex",
+ "__gc", "__mode", "__len", "__eq",
+ "__add", "__sub", "__mul", "__div", "__mod",
+ "__pow", "__unm", "__lt", "__le",
+ "__concat", "__call"
+ };
+ int i;
+ for (i=0; i<TM_N; i++) {
+ G(L)->tmname[i] = luaS_new(L, luaT_eventname[i]);
+ luaS_fix(G(L)->tmname[i]); /* never collect these names */
+ }
+}
+
+
+/*
+** function to be used with macro "fasttm": optimized for absence of
+** tag methods
+*/
+const TValue *luaT_gettm (Table *events, TMS event, TString *ename) {
+ const TValue *tm = luaH_getstr(events, ename);
+ lua_assert(event <= TM_EQ);
+ if (ttisnil(tm)) { /* no tag method? */
+ events->flags |= cast_byte(1u<<event); /* cache this fact */
+ return NULL;
+ }
+ else return tm;
+}
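
luaT_gettm records a metamethod's absence by setting bit 'event' in the metatable's flags byte, so the gfasttm macro (ltm.h, below) can short-circuit with one branch instead of a hash lookup on later queries. The test gfasttm expands to, written long-hand for TM_GC as a hypothetical helper (assumes lstate.h for global_State and Table):

    static const TValue *
    gc_metamethod (global_State *g, Table *mt) {
      if (mt == NULL)
        return NULL;                        /* no metatable at all */
      if (mt->flags & (1u << TM_GC))
        return NULL;                        /* absence cached: skip lookup */
      return luaT_gettm(mt, TM_GC, g->tmname[TM_GC]);
    }
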
+
+
+const TValue *luaT_gettmbyobj (lua_State *L, const TValue *o, TMS event) {
+ Table *mt;
+ switch (ttypenv(o)) {
+ case LUA_TTABLE:
+ mt = hvalue(o)->metatable;
+ break;
+ case LUA_TUSERDATA:
+ mt = uvalue(o)->metatable;
+ break;
+ default:
+ mt = G(L)->mt[ttypenv(o)];
+ }
+ return (mt ? luaH_getstr(mt, G(L)->tmname[event]) : luaO_nilobject);
+}
+
diff --git a/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/lua/ltm.h b/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/lua/ltm.h
new file mode 100644
index 000000000000..7f89c841f9c0
--- /dev/null
+++ b/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/lua/ltm.h
@@ -0,0 +1,57 @@
+/*
+** $Id: ltm.h,v 2.11.1.1 2013/04/12 18:48:47 roberto Exp $
+** Tag methods
+** See Copyright Notice in lua.h
+*/
+
+#ifndef ltm_h
+#define ltm_h
+
+
+#include "lobject.h"
+
+
+/*
+* WARNING: if you change the order of this enumeration,
+* grep "ORDER TM"
+*/
+typedef enum {
+ TM_INDEX,
+ TM_NEWINDEX,
+ TM_GC,
+ TM_MODE,
+ TM_LEN,
+ TM_EQ, /* last tag method with `fast' access */
+ TM_ADD,
+ TM_SUB,
+ TM_MUL,
+ TM_DIV,
+ TM_MOD,
+ TM_POW,
+ TM_UNM,
+ TM_LT,
+ TM_LE,
+ TM_CONCAT,
+ TM_CALL,
+ TM_N /* number of elements in the enum */
+} TMS;
+
+
+
+#define gfasttm(g,et,e) ((et) == NULL ? NULL : \
+ ((et)->flags & (1u<<(e))) ? NULL : luaT_gettm(et, e, (g)->tmname[e]))
+
+#define fasttm(l,et,e) gfasttm(G(l), et, e)
+
+#define ttypename(x) luaT_typenames_[(x) + 1]
+#define objtypename(x) ttypename(ttypenv(x))
+
+LUAI_DDEC const char *const luaT_typenames_[LUA_TOTALTAGS];
+
+
+LUAI_FUNC const TValue *luaT_gettm (Table *events, TMS event, TString *ename);
+LUAI_FUNC const TValue *luaT_gettmbyobj (lua_State *L, const TValue *o,
+ TMS event);
+LUAI_FUNC void luaT_init (lua_State *L);
+
+#endif
diff --git a/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/lua/lua.h b/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/lua/lua.h
new file mode 100644
index 000000000000..4610dad45ed8
--- /dev/null
+++ b/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/lua/lua.h
@@ -0,0 +1,443 @@
+/*
+** $Id: lua.h,v 1.285.1.4 2015/02/21 14:04:50 roberto Exp $
+** Lua - A Scripting Language
+** Lua.org, PUC-Rio, Brazil (http://www.lua.org)
+** See Copyright Notice at the end of this file
+*/
+
+
+#ifndef lua_h
+#define lua_h
+
+#include <sys/zfs_context.h>
+
+#include "luaconf.h"
+
+
+#define LUA_VERSION_MAJOR "5"
+#define LUA_VERSION_MINOR "2"
+#define LUA_VERSION_NUM 502
+#define LUA_VERSION_RELEASE "4"
+
+#define LUA_VERSION "Lua " LUA_VERSION_MAJOR "." LUA_VERSION_MINOR
+#define LUA_RELEASE LUA_VERSION "." LUA_VERSION_RELEASE
+#define LUA_COPYRIGHT LUA_RELEASE " Copyright (C) 1994-2015 Lua.org, PUC-Rio"
+#define LUA_AUTHORS "R. Ierusalimschy, L. H. de Figueiredo, W. Celes"
+
+
+/* mark for precompiled code ('<esc>Lua') */
+#define LUA_SIGNATURE "\033Lua"
+
+/* option for multiple returns in 'lua_pcall' and 'lua_call' */
+#define LUA_MULTRET (-1)
+
+
+/*
+** pseudo-indices
+*/
+#define LUA_REGISTRYINDEX LUAI_FIRSTPSEUDOIDX
+#define lua_upvalueindex(i) (LUA_REGISTRYINDEX - (i))
+
+
+/* thread status */
+#define LUA_OK 0
+#define LUA_YIELD 1
+#define LUA_ERRRUN 2
+#define LUA_ERRSYNTAX 3
+#define LUA_ERRMEM 4
+#define LUA_ERRGCMM 5
+#define LUA_ERRERR 6
+
+
+typedef struct lua_State lua_State;
+
+typedef int (*lua_CFunction) (lua_State *L);
+
+
+/*
+** functions that read/write blocks when loading/dumping Lua chunks
+*/
+typedef const char * (*lua_Reader) (lua_State *L, void *ud, size_t *sz);
+
+typedef int (*lua_Writer) (lua_State *L, const void* p, size_t sz, void* ud);
+
+
+/*
+** prototype for memory-allocation functions
+*/
+typedef void * (*lua_Alloc) (void *ud, void *ptr, size_t osize, size_t nsize);
+
+
+/*
+** basic types
+*/
+#define LUA_TNONE (-1)
+
+#define LUA_TNIL 0
+#define LUA_TBOOLEAN 1
+#define LUA_TLIGHTUSERDATA 2
+#define LUA_TNUMBER 3
+#define LUA_TSTRING 4
+#define LUA_TTABLE 5
+#define LUA_TFUNCTION 6
+#define LUA_TUSERDATA 7
+#define LUA_TTHREAD 8
+
+#define LUA_NUMTAGS 9
+
+
+
+/* minimum Lua stack available to a C function */
+#define LUA_MINSTACK 20
+
+
+/* predefined values in the registry */
+#define LUA_RIDX_MAINTHREAD 1
+#define LUA_RIDX_GLOBALS 2
+#define LUA_RIDX_LAST LUA_RIDX_GLOBALS
+
+
+/* type of numbers in Lua */
+typedef LUA_NUMBER lua_Number;
+
+
+/* type for integer functions */
+typedef LUA_INTEGER lua_Integer;
+
+/* unsigned integer type */
+typedef LUA_UNSIGNED lua_Unsigned;
+
+
+
+
+/*
+** generic extra include file
+*/
+#if defined(LUA_USER_H)
+#include LUA_USER_H
+#endif
+
+
+/*
+** RCS ident string
+*/
+extern const char lua_ident[];
+
+
+/*
+** state manipulation
+*/
+LUA_API lua_State *(lua_newstate) (lua_Alloc f, void *ud);
+LUA_API void (lua_close) (lua_State *L);
+LUA_API lua_State *(lua_newthread) (lua_State *L);
+
+LUA_API lua_CFunction (lua_atpanic) (lua_State *L, lua_CFunction panicf);
+
+
+LUA_API const lua_Number *(lua_version) (lua_State *L);
+
+
+/*
+** basic stack manipulation
+*/
+LUA_API int (lua_absindex) (lua_State *L, int idx);
+LUA_API int (lua_gettop) (lua_State *L);
+LUA_API void (lua_settop) (lua_State *L, int idx);
+LUA_API void (lua_pushvalue) (lua_State *L, int idx);
+LUA_API void (lua_remove) (lua_State *L, int idx);
+LUA_API void (lua_insert) (lua_State *L, int idx);
+LUA_API void (lua_replace) (lua_State *L, int idx);
+LUA_API void (lua_copy) (lua_State *L, int fromidx, int toidx);
+LUA_API int (lua_checkstack) (lua_State *L, int sz);
+
+LUA_API void (lua_xmove) (lua_State *from, lua_State *to, int n);
+
+
+/*
+** access functions (stack -> C)
+*/
+
+LUA_API int (lua_isnumber) (lua_State *L, int idx);
+LUA_API int (lua_isstring) (lua_State *L, int idx);
+LUA_API int (lua_iscfunction) (lua_State *L, int idx);
+LUA_API int (lua_isuserdata) (lua_State *L, int idx);
+LUA_API int (lua_type) (lua_State *L, int idx);
+LUA_API const char *(lua_typename) (lua_State *L, int tp);
+
+LUA_API lua_Number (lua_tonumberx) (lua_State *L, int idx, int *isnum);
+LUA_API lua_Integer (lua_tointegerx) (lua_State *L, int idx, int *isnum);
+LUA_API lua_Unsigned (lua_tounsignedx) (lua_State *L, int idx, int *isnum);
+LUA_API int (lua_toboolean) (lua_State *L, int idx);
+LUA_API const char *(lua_tolstring) (lua_State *L, int idx, size_t *len);
+LUA_API size_t (lua_rawlen) (lua_State *L, int idx);
+LUA_API lua_CFunction (lua_tocfunction) (lua_State *L, int idx);
+LUA_API void *(lua_touserdata) (lua_State *L, int idx);
+LUA_API lua_State *(lua_tothread) (lua_State *L, int idx);
+LUA_API const void *(lua_topointer) (lua_State *L, int idx);
+
+
+/*
+** Comparison and arithmetic functions
+*/
+
+#define LUA_OPADD 0 /* ORDER TM */
+#define LUA_OPSUB 1
+#define LUA_OPMUL 2
+#define LUA_OPDIV 3
+#define LUA_OPMOD 4
+#define LUA_OPPOW 5
+#define LUA_OPUNM 6
+
+LUA_API void (lua_arith) (lua_State *L, int op);
+
+#define LUA_OPEQ 0
+#define LUA_OPLT 1
+#define LUA_OPLE 2
+
+LUA_API int (lua_rawequal) (lua_State *L, int idx1, int idx2);
+LUA_API int (lua_compare) (lua_State *L, int idx1, int idx2, int op);
+
+
+/*
+** push functions (C -> stack)
+*/
+LUA_API void (lua_pushnil) (lua_State *L);
+LUA_API void (lua_pushnumber) (lua_State *L, lua_Number n);
+LUA_API void (lua_pushinteger) (lua_State *L, lua_Integer n);
+LUA_API void (lua_pushunsigned) (lua_State *L, lua_Unsigned n);
+LUA_API const char *(lua_pushlstring) (lua_State *L, const char *s, size_t l);
+LUA_API const char *(lua_pushstring) (lua_State *L, const char *s);
+LUA_API const char *(lua_pushvfstring) (lua_State *L, const char *fmt,
+ va_list argp);
+LUA_API const char *(lua_pushfstring) (lua_State *L, const char *fmt, ...);
+LUA_API void (lua_pushcclosure) (lua_State *L, lua_CFunction fn, int n);
+LUA_API void (lua_pushboolean) (lua_State *L, int b);
+LUA_API void (lua_pushlightuserdata) (lua_State *L, void *p);
+LUA_API int (lua_pushthread) (lua_State *L);
+
+
+/*
+** get functions (Lua -> stack)
+*/
+LUA_API void (lua_getglobal) (lua_State *L, const char *var);
+LUA_API void (lua_gettable) (lua_State *L, int idx);
+LUA_API void (lua_getfield) (lua_State *L, int idx, const char *k);
+LUA_API void (lua_rawget) (lua_State *L, int idx);
+LUA_API void (lua_rawgeti) (lua_State *L, int idx, int n);
+LUA_API void (lua_rawgetp) (lua_State *L, int idx, const void *p);
+LUA_API void (lua_createtable) (lua_State *L, int narr, int nrec);
+LUA_API void *(lua_newuserdata) (lua_State *L, size_t sz);
+LUA_API int (lua_getmetatable) (lua_State *L, int objindex);
+LUA_API void (lua_getuservalue) (lua_State *L, int idx);
+
+
+/*
+** set functions (stack -> Lua)
+*/
+LUA_API void (lua_setglobal) (lua_State *L, const char *var);
+LUA_API void (lua_settable) (lua_State *L, int idx);
+LUA_API void (lua_setfield) (lua_State *L, int idx, const char *k);
+LUA_API void (lua_rawset) (lua_State *L, int idx);
+LUA_API void (lua_rawseti) (lua_State *L, int idx, int n);
+LUA_API void (lua_rawsetp) (lua_State *L, int idx, const void *p);
+LUA_API int (lua_setmetatable) (lua_State *L, int objindex);
+LUA_API void (lua_setuservalue) (lua_State *L, int idx);
+
+
+/*
+** 'load' and 'call' functions (load and run Lua code)
+*/
+LUA_API void (lua_callk) (lua_State *L, int nargs, int nresults, int ctx,
+ lua_CFunction k);
+#define lua_call(L,n,r) lua_callk(L, (n), (r), 0, NULL)
+
+LUA_API int (lua_getctx) (lua_State *L, int *ctx);
+
+LUA_API int (lua_pcallk) (lua_State *L, int nargs, int nresults, int errfunc,
+ int ctx, lua_CFunction k);
+#define lua_pcall(L,n,r,f) lua_pcallk(L, (n), (r), (f), 0, NULL)
+
+LUA_API int (lua_load) (lua_State *L, lua_Reader reader, void *dt,
+ const char *chunkname,
+ const char *mode);
+
+LUA_API int (lua_dump) (lua_State *L, lua_Writer writer, void *data);
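
lua_pcall is the usual entry point for running code with error capture; on failure the error object replaces the would-be results on the stack. A sketch of the canonical pattern ('report' is a hypothetical userland helper; the ZFS kernel consumer does its own error plumbing instead of stderr):

    #include <stdio.h>
    #include "lua.h"

    static void
    report (lua_State *L, int status) {
      if (status != LUA_OK) {
        fprintf(stderr, "lua: %s\n", lua_tostring(L, -1));
        lua_pop(L, 1);                      /* drop the error object */
      }
    }

    /* push a function and nargs arguments, then:
     *   report(L, lua_pcall(L, nargs, LUA_MULTRET, 0));
     * on LUA_OK the results replace function and arguments on the stack. */
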
+
+
+/*
+** coroutine functions
+*/
+LUA_API int (lua_yieldk) (lua_State *L, int nresults, int ctx,
+ lua_CFunction k);
+#define lua_yield(L,n) lua_yieldk(L, (n), 0, NULL)
+LUA_API int (lua_resume) (lua_State *L, lua_State *from, int narg);
+LUA_API int (lua_status) (lua_State *L);
+
+/*
+** garbage-collection function and options
+*/
+
+#define LUA_GCSTOP 0
+#define LUA_GCRESTART 1
+#define LUA_GCCOLLECT 2
+#define LUA_GCCOUNT 3
+#define LUA_GCCOUNTB 4
+#define LUA_GCSTEP 5
+#define LUA_GCSETPAUSE 6
+#define LUA_GCSETSTEPMUL 7
+#define LUA_GCSETMAJORINC 8
+#define LUA_GCISRUNNING 9
+#define LUA_GCGEN 10
+#define LUA_GCINC 11
+
+LUA_API int (lua_gc) (lua_State *L, int what, int data);
+
+
+/*
+** miscellaneous functions
+*/
+
+LUA_API int (lua_error) (lua_State *L);
+
+LUA_API int (lua_next) (lua_State *L, int idx);
+
+LUA_API void (lua_concat) (lua_State *L, int n);
+LUA_API void (lua_len) (lua_State *L, int idx);
+
+LUA_API lua_Alloc (lua_getallocf) (lua_State *L, void **ud);
+LUA_API void (lua_setallocf) (lua_State *L, lua_Alloc f, void *ud);
+
+
+
+/*
+** ===============================================================
+** some useful macros
+** ===============================================================
+*/
+
+#define lua_tonumber(L,i) lua_tonumberx(L,i,NULL)
+#define lua_tointeger(L,i) lua_tointegerx(L,i,NULL)
+#define lua_tounsigned(L,i) lua_tounsignedx(L,i,NULL)
+
+#define lua_pop(L,n) lua_settop(L, -(n)-1)
+
+#define lua_newtable(L) lua_createtable(L, 0, 0)
+
+#define lua_register(L,n,f) (lua_pushcfunction(L, (f)), lua_setglobal(L, (n)))
+
+#define lua_pushcfunction(L,f) lua_pushcclosure(L, (f), 0)
+
+#define lua_isfunction(L,n) (lua_type(L, (n)) == LUA_TFUNCTION)
+#define lua_istable(L,n) (lua_type(L, (n)) == LUA_TTABLE)
+#define lua_islightuserdata(L,n) (lua_type(L, (n)) == LUA_TLIGHTUSERDATA)
+#define lua_isnil(L,n) (lua_type(L, (n)) == LUA_TNIL)
+#define lua_isboolean(L,n) (lua_type(L, (n)) == LUA_TBOOLEAN)
+#define lua_isthread(L,n) (lua_type(L, (n)) == LUA_TTHREAD)
+#define lua_isnone(L,n) (lua_type(L, (n)) == LUA_TNONE)
+#define lua_isnoneornil(L, n) (lua_type(L, (n)) <= 0)
+
+#define lua_pushliteral(L, s) \
+ lua_pushlstring(L, "" s, (sizeof(s)/sizeof(char))-1)
+
+#define lua_pushglobaltable(L) \
+ lua_rawgeti(L, LUA_REGISTRYINDEX, LUA_RIDX_GLOBALS)
+
+#define lua_tostring(L,i) lua_tolstring(L, (i), NULL)
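
lua_pop above is pure index arithmetic: negative indices count from the top (-1 is the topmost value), so settop(-(n)-1) discards exactly n values. A small sketch (hypothetical helper, not in the diff):

    #include "lua.h"

    static void
    drop_two (lua_State *L) {
      /* precondition: lua_gettop(L) >= 2; with 5 values on the stack,
       * this leaves 3, since lua_pop(L, 2) expands to lua_settop(L, -3) */
      lua_pop(L, 2);
    }
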
+
+
+
+/*
+** {======================================================================
+** Debug API
+** =======================================================================
+*/
+
+
+/*
+** Event codes
+*/
+#define LUA_HOOKCALL 0
+#define LUA_HOOKRET 1
+#define LUA_HOOKLINE 2
+#define LUA_HOOKCOUNT 3
+#define LUA_HOOKTAILCALL 4
+
+
+/*
+** Event masks
+*/
+#define LUA_MASKCALL (1 << LUA_HOOKCALL)
+#define LUA_MASKRET (1 << LUA_HOOKRET)
+#define LUA_MASKLINE (1 << LUA_HOOKLINE)
+#define LUA_MASKCOUNT (1 << LUA_HOOKCOUNT)
+
+typedef struct lua_Debug lua_Debug; /* activation record */
+
+
+/* Functions to be called by the debugger in specific events */
+typedef void (*lua_Hook) (lua_State *L, lua_Debug *ar);
+
+
+LUA_API int (lua_getstack) (lua_State *L, int level, lua_Debug *ar);
+LUA_API int (lua_getinfo) (lua_State *L, const char *what, lua_Debug *ar);
+LUA_API const char *(lua_getlocal) (lua_State *L, const lua_Debug *ar, int n);
+LUA_API const char *(lua_setlocal) (lua_State *L, const lua_Debug *ar, int n);
+LUA_API const char *(lua_getupvalue) (lua_State *L, int funcindex, int n);
+LUA_API const char *(lua_setupvalue) (lua_State *L, int funcindex, int n);
+
+LUA_API void *(lua_upvalueid) (lua_State *L, int fidx, int n);
+LUA_API void (lua_upvaluejoin) (lua_State *L, int fidx1, int n1,
+ int fidx2, int n2);
+
+LUA_API int (lua_sethook) (lua_State *L, lua_Hook func, int mask, int count);
+LUA_API lua_Hook (lua_gethook) (lua_State *L);
+LUA_API int (lua_gethookmask) (lua_State *L);
+LUA_API int (lua_gethookcount) (lua_State *L);
+
+
+struct lua_Debug {
+ int event;
+ const char *name; /* (n) */
+ const char *namewhat; /* (n) 'global', 'local', 'field', 'method' */
+ const char *what; /* (S) 'Lua', 'C', 'main', 'tail' */
+ const char *source; /* (S) */
+ int currentline; /* (l) */
+ int linedefined; /* (S) */
+ int lastlinedefined; /* (S) */
+ unsigned char nups; /* (u) number of upvalues */
+ unsigned char nparams;/* (u) number of parameters */
+ char isvararg; /* (u) */
+ char istailcall; /* (t) */
+ char short_src[LUA_IDSIZE]; /* (S) */
+ /* private part */
+ struct CallInfo *i_ci; /* active function */
+};
+
+/* }====================================================================== */
+
+
+/******************************************************************************
+* Copyright (C) 1994-2015 Lua.org, PUC-Rio.
+*
+* Permission is hereby granted, free of charge, to any person obtaining
+* a copy of this software and associated documentation files (the
+* "Software"), to deal in the Software without restriction, including
+* without limitation the rights to use, copy, modify, merge, publish,
+* distribute, sublicense, and/or sell copies of the Software, and to
+* permit persons to whom the Software is furnished to do so, subject to
+* the following conditions:
+*
+* The above copyright notice and this permission notice shall be
+* included in all copies or substantial portions of the Software.
+*
+* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+* EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+* MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.
+* IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY
+* CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT,
+* TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE
+* SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
+******************************************************************************/
+
+
+#endif
diff --git a/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/lua/luaconf.h b/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/lua/luaconf.h
new file mode 100644
index 000000000000..e856eee264ff
--- /dev/null
+++ b/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/lua/luaconf.h
@@ -0,0 +1,555 @@
+/*
+** $Id: luaconf.h,v 1.176.1.2 2013/11/21 17:26:16 roberto Exp $
+** Configuration file for Lua
+** See Copyright Notice in lua.h
+*/
+
+
+#ifndef lconfig_h
+#define lconfig_h
+
+#include <sys/zfs_context.h>
+#ifdef illumos
+#include <sys/int_fmtio.h>
+#else
+#include <machine/_inttypes.h>
+#endif
+
+extern ssize_t lcompat_sprintf(char *, const char *, ...);
+extern int64_t lcompat_strtoll(const char *, char **);
+extern int64_t lcompat_pow(int64_t, int64_t);
+
+/*
+** ==================================================================
+** Search for "@@" to find all configurable definitions.
+** ===================================================================
+*/
+
+
+/*
+@@ LUA_ANSI controls the use of non-ansi features.
+** CHANGE it (define it) if you want Lua to avoid the use of any
+** non-ansi feature or library.
+*/
+#if !defined(LUA_ANSI) && defined(__STRICT_ANSI__)
+#define LUA_ANSI
+#endif
+
+
+#if !defined(LUA_ANSI) && defined(_WIN32) && !defined(_WIN32_WCE)
+#define LUA_WIN /* enable goodies for regular Windows platforms */
+#endif
+
+#if defined(LUA_WIN)
+#define LUA_DL_DLL
+#define LUA_USE_AFORMAT /* assume 'printf' handles 'aA' specifiers */
+#endif
+
+
+
+#if defined(LUA_USE_LINUX)
+#define LUA_USE_POSIX
+#define LUA_USE_DLOPEN /* needs an extra library: -ldl */
+#define LUA_USE_READLINE /* needs some extra libraries */
+#define LUA_USE_STRTODHEX /* assume 'strtod' handles hex formats */
+#define LUA_USE_AFORMAT /* assume 'printf' handles 'aA' specifiers */
+#define LUA_USE_LONGLONG /* assume support for long long */
+#endif
+
+#if defined(LUA_USE_MACOSX)
+#define LUA_USE_POSIX
+#define LUA_USE_DLOPEN /* does not need -ldl */
+#define LUA_USE_READLINE /* needs an extra library: -lreadline */
+#define LUA_USE_STRTODHEX /* assume 'strtod' handles hex formats */
+#define LUA_USE_AFORMAT /* assume 'printf' handles 'aA' specifiers */
+#define LUA_USE_LONGLONG /* assume support for long long */
+#endif
+
+
+
+/*
+@@ LUA_USE_POSIX includes all functionality listed as X/Open System
+@* Interfaces Extension (XSI).
+** CHANGE it (define it) if your system is XSI compatible.
+*/
+#if defined(LUA_USE_POSIX)
+#define LUA_USE_MKSTEMP
+#define LUA_USE_ISATTY
+#define LUA_USE_POPEN
+#define LUA_USE_ULONGJMP
+#define LUA_USE_GMTIME_R
+#endif
+
+
+
+/*
+@@ LUA_PATH_DEFAULT is the default path that Lua uses to look for
+@* Lua libraries.
+@@ LUA_CPATH_DEFAULT is the default path that Lua uses to look for
+@* C libraries.
+** CHANGE them if your machine has a non-conventional directory
+** hierarchy or if you want to install your libraries in
+** non-conventional directories.
+*/
+#if defined(_WIN32) /* { */
+/*
+** In Windows, any exclamation mark ('!') in the path is replaced by the
+** path of the directory of the executable file of the current process.
+*/
+#define LUA_LDIR "!\\lua\\"
+#define LUA_CDIR "!\\"
+#define LUA_PATH_DEFAULT \
+ LUA_LDIR"?.lua;" LUA_LDIR"?\\init.lua;" \
+ LUA_CDIR"?.lua;" LUA_CDIR"?\\init.lua;" ".\\?.lua"
+#define LUA_CPATH_DEFAULT \
+ LUA_CDIR"?.dll;" LUA_CDIR"loadall.dll;" ".\\?.dll"
+
+#else /* }{ */
+
+#define LUA_VDIR LUA_VERSION_MAJOR "." LUA_VERSION_MINOR "/"
+#define LUA_ROOT "/usr/local/"
+#define LUA_LDIR LUA_ROOT "share/lua/" LUA_VDIR
+#define LUA_CDIR LUA_ROOT "lib/lua/" LUA_VDIR
+#define LUA_PATH_DEFAULT \
+ LUA_LDIR"?.lua;" LUA_LDIR"?/init.lua;" \
+ LUA_CDIR"?.lua;" LUA_CDIR"?/init.lua;" "./?.lua"
+#define LUA_CPATH_DEFAULT \
+ LUA_CDIR"?.so;" LUA_CDIR"loadall.so;" "./?.so"
+#endif /* } */
+
+
+/*
+@@ LUA_DIRSEP is the directory separator (for submodules).
+** CHANGE it if your machine does not use "/" as the directory separator
+** and is not Windows. (On Windows Lua automatically uses "\".)
+*/
+#if defined(_WIN32)
+#define LUA_DIRSEP "\\"
+#else
+#define LUA_DIRSEP "/"
+#endif
+
+
+/*
+@@ LUA_ENV is the name of the variable that holds the current
+@@ environment, used to access global names.
+** CHANGE it if you do not like this name.
+*/
+#define LUA_ENV "_ENV"
+
+
+/*
+@@ LUA_API is a mark for all core API functions.
+@@ LUALIB_API is a mark for all auxiliary library functions.
+@@ LUAMOD_API is a mark for all standard library opening functions.
+** CHANGE them if you need to define those functions in some special way.
+** For instance, if you want to create one Windows DLL with the core and
+** the libraries, you may want to use the following definition (define
+** LUA_BUILD_AS_DLL to get it).
+*/
+#if defined(LUA_BUILD_AS_DLL) /* { */
+
+#if defined(LUA_CORE) || defined(LUA_LIB) /* { */
+#define LUA_API __declspec(dllexport)
+#else /* }{ */
+#define LUA_API __declspec(dllimport)
+#endif /* } */
+
+#else /* }{ */
+
+#define LUA_API extern
+
+#endif /* } */
+
+
+/* more often than not the libs go together with the core */
+#define LUALIB_API LUA_API
+#define LUAMOD_API LUALIB_API
+
+
+/*
+@@ LUAI_FUNC is a mark for all extern functions that are not to be
+@* exported to outside modules.
+@@ LUAI_DDEF and LUAI_DDEC are marks for all extern (const) variables
+@* that are not to be exported to outside modules (LUAI_DDEF for
+@* definitions and LUAI_DDEC for declarations).
+** CHANGE them if you need to mark them in some special way. Elf/gcc
+** (versions 3.2 and later) mark them as "hidden" to optimize access
+** when Lua is compiled as a shared library. Not all elf targets support
+** this attribute. Unfortunately, gcc does not offer a way to check
+** whether the target offers that support, and those without support
+** give a warning about it. To avoid these warnings, change to the
+** default definition.
+*/
+#if defined(__GNUC__) && ((__GNUC__*100 + __GNUC_MINOR__) >= 302) && \
+ defined(__ELF__) /* { */
+#define LUAI_FUNC __attribute__((visibility("hidden"))) extern
+#define LUAI_DDEC LUAI_FUNC
+#define LUAI_DDEF /* empty */
+
+#else /* }{ */
+#define LUAI_FUNC extern
+#define LUAI_DDEC extern
+#define LUAI_DDEF /* empty */
+#endif /* } */
+
+
+
+/*
+@@ LUA_QL describes how error messages quote program elements.
+** CHANGE it if you want a different appearance.
+*/
+#define LUA_QL(x) "'" x "'"
+#define LUA_QS LUA_QL("%s")
+
+
+/*
+@@ LUA_IDSIZE gives the maximum size for the description of the source
+@* of a function in debug information.
+** CHANGE it if you want a different size.
+*/
+#define LUA_IDSIZE 60
+
+
+/*
+@@ luai_writestringerror defines how to print error messages.
+** (A format string with one argument is enough for Lua...)
+*/
+#ifdef _KERNEL
+#define luai_writestringerror(s,p) \
+ (zfs_dbgmsg((s), (p)))
+#else
+#define luai_writestringerror(s,p) \
+ (fprintf(stderr, (s), (p)), fflush(stderr))
+#endif
+
+
+/*
+@@ LUAI_MAXSHORTLEN is the maximum length for short strings, that is,
+** strings that are internalized. (Cannot be smaller than reserved words
+** or tags for metamethods, as these strings must be internalized;
+** #("function") = 8, #("__newindex") = 10.)
+*/
+#define LUAI_MAXSHORTLEN 40
+
+
+
+/*
+** {==================================================================
+** Compatibility with previous versions
+** ===================================================================
+*/
+
+/*
+@@ LUA_COMPAT_ALL controls all compatibility options.
+** You can define it to get all options, or change specific options
+** to fit your specific needs.
+*/
+#if defined(LUA_COMPAT_ALL) /* { */
+
+/*
+@@ LUA_COMPAT_UNPACK controls the presence of global 'unpack'.
+** You can replace it with 'table.unpack'.
+*/
+#define LUA_COMPAT_UNPACK
+
+/*
+@@ LUA_COMPAT_LOADERS controls the presence of table 'package.loaders'.
+** You can replace it with 'package.searchers'.
+*/
+#define LUA_COMPAT_LOADERS
+
+/*
+@@ macro 'lua_cpcall' emulates deprecated function lua_cpcall.
+** You can call your C function directly (with light C functions).
+*/
+#define lua_cpcall(L,f,u) \
+ (lua_pushcfunction(L, (f)), \
+ lua_pushlightuserdata(L,(u)), \
+ lua_pcall(L,1,0,0))
+
+
+/*
+@@ LUA_COMPAT_LOG10 defines the function 'log10' in the math library.
+** You can rewrite 'log10(x)' as 'log(x, 10)'.
+*/
+#define LUA_COMPAT_LOG10
+
+/*
+@@ LUA_COMPAT_LOADSTRING defines the function 'loadstring' in the base
+** library. You can rewrite 'loadstring(s)' as 'load(s)'.
+*/
+#define LUA_COMPAT_LOADSTRING
+
+/*
+@@ LUA_COMPAT_MAXN defines the function 'maxn' in the table library.
+*/
+#define LUA_COMPAT_MAXN
+
+/*
+@@ The following macros supply trivial compatibility for some
+** changes in the API. The macros themselves document how to
+** change your code to avoid using them.
+*/
+#define lua_strlen(L,i) lua_rawlen(L, (i))
+
+#define lua_objlen(L,i) lua_rawlen(L, (i))
+
+#define lua_equal(L,idx1,idx2) lua_compare(L,(idx1),(idx2),LUA_OPEQ)
+#define lua_lessthan(L,idx1,idx2) lua_compare(L,(idx1),(idx2),LUA_OPLT)
+
+/*
+@@ LUA_COMPAT_MODULE controls compatibility with previous
+** module functions 'module' (Lua) and 'luaL_register' (C).
+*/
+#define LUA_COMPAT_MODULE
+
+#endif /* } */
+
+/* }================================================================== */
+
+
+
+/*
+@@ LUAI_BITSINT defines the number of bits in an int.
+** CHANGE here if Lua cannot automatically detect the number of bits of
+** your machine. Probably you do not need to change this.
+*/
+/* avoid overflows in comparison */
+#if INT_MAX-20 < 32760 /* { */
+#define LUAI_BITSINT 16
+#elif INT_MAX > 2147483640L /* }{ */
+/* int has at least 32 bits */
+#define LUAI_BITSINT 32
+#else /* }{ */
+#error "you must define LUA_BITSINT with number of bits in an integer"
+#endif /* } */
+
+
+/*
+@@ LUA_INT32 is a signed integer with exactly 32 bits.
+@@ LUAI_UMEM is an unsigned integer big enough to count the total
+@* memory used by Lua.
+@@ LUAI_MEM is a signed integer big enough to count the total memory
+@* used by Lua.
+** CHANGE here if for some weird reason the default definitions are not
+** good enough for your machine. Probably you do not need to change
+** this.
+*/
+#if LUAI_BITSINT >= 32 /* { */
+#define LUA_INT32 int
+#define LUAI_UMEM size_t
+#define LUAI_MEM ptrdiff_t
+#else /* }{ */
+/* 16-bit ints */
+#define LUA_INT32 long
+#define LUAI_UMEM unsigned long
+#define LUAI_MEM long
+#endif /* } */
+
+
+/*
+@@ LUAI_MAXSTACK limits the size of the Lua stack.
+** CHANGE it if you need a different limit. This limit is arbitrary;
+** its only purpose is to stop Lua from consuming unlimited stack
+** space (and to reserve some numbers for pseudo-indices).
+*/
+#if LUAI_BITSINT >= 32
+#define LUAI_MAXSTACK 1000000
+#else
+#define LUAI_MAXSTACK 15000
+#endif
+
+/* reserve some space for error handling */
+#define LUAI_FIRSTPSEUDOIDX (-LUAI_MAXSTACK - 1000)
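
Under the LUAI_BITSINT >= 32 branch used here, these definitions give concrete values: LUA_REGISTRYINDEX (lua.h) equals LUAI_FIRSTPSEUDOIDX, i.e. -(1000000 + 1000), and upvalue pseudo-indices count down from it. A sketch (hypothetical check functions, not in the diff):

    #include "lua.h"

    static int registry_idx (void) { return LUA_REGISTRYINDEX;   /* -1001000 */ }
    static int first_upval  (void) { return lua_upvalueindex(1); /* -1001001 */ }
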
+
+
+
+
+/*
+@@ LUAL_BUFFERSIZE is the buffer size used by the lauxlib buffer system.
+** CHANGE it if it uses too much C-stack space.
+*/
+#define LUAL_BUFFERSIZE 1024
+
+
+
+
+/*
+** {==================================================================
+@@ LUA_NUMBER is the type of numbers in Lua.
+** CHANGE the following definitions only if you want to build Lua
+** with a number type different from double. You may also need to
+** change lua_number2int & lua_number2integer.
+** ===================================================================
+*/
+
+#define LUA_NUMBER int64_t
+
+/*
+@@ LUAI_UACNUMBER is the result of an 'usual argument conversion'
+@* over a number.
+*/
+#define LUAI_UACNUMBER int64_t
+
+
+/*
+@@ LUA_NUMBER_SCAN is the format for reading numbers.
+@@ LUA_NUMBER_FMT is the format for writing numbers.
+@@ lua_number2str converts a number to a string.
+@@ LUAI_MAXNUMBER2STR is maximum size of previous conversion.
+*/
+#define LUA_NUMBER_FMT "%" PRId64
+#define lua_number2str(s,n) lcompat_sprintf((s), LUA_NUMBER_FMT, (n))
+#define LUAI_MAXNUMBER2STR	32 /* enough for a 64-bit integer, sign, and \0 */
+
+
+/*
+@@ l_mathop allows the addition of an 'l' or 'f' to all math operations
+*/
+#define l_mathop(x) (x ## l)
+
+
+/*
+@@ lua_str2number converts a decimal numeric string to a number.
+@@ lua_strx2number converts a hexadecimal numeric string to a number.
+** In C99, 'strtod' does both conversions. C89, however, has no function
+** to convert floating hexadecimal strings to numbers. For these
+** systems, you can leave 'lua_strx2number' undefined and Lua will
+** provide its own implementation.
+*/
+#define lua_str2number(s,p) lcompat_strtoll((s), (p))
+
+#if defined(LUA_USE_STRTODHEX)
+#define lua_strx2number(s,p) lcompat_strtoll((s), (p))
+#endif
+
+
+/*
+@@ The luai_num* macros define the primitive operations over numbers.
+*/
+
+/* the following operations need the math library */
+#if defined(lobject_c) || defined(lvm_c)
+#define luai_nummod(L,a,b) ((a) % (b))
+#define luai_numpow(L,a,b) (lcompat_pow((a),(b)))
+#endif
+
+/* these are quite standard operations */
+#if defined(LUA_CORE)
+#define luai_numadd(L,a,b) ((a)+(b))
+#define luai_numsub(L,a,b) ((a)-(b))
+#define luai_nummul(L,a,b) ((a)*(b))
+#define luai_numdiv(L,a,b) ((a)/(b))
+#define luai_numunm(L,a) (-(a))
+#define luai_numeq(a,b) ((a)==(b))
+#define luai_numlt(L,a,b) ((a)<(b))
+#define luai_numle(L,a,b) ((a)<=(b))
+#define luai_numisnan(L,a) (!luai_numeq((a), (a)))
+#endif
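
Because LUA_NUMBER is int64_t in this build (not double as in stock Lua 5.2), the luai_num* macros reduce to plain 64-bit integer operations, luai_numisnan can never be true, and there is no floating point in the VM; division and modulus from the VM are additionally routed through the patched luaV_div/luaV_mod in lvm.c (later in this diff). A sketch of what the macros reduce to:

    #include <stdint.h>

    static int64_t num_add (int64_t a, int64_t b) { return a + b; } /* luai_numadd */
    static int     num_lt  (int64_t a, int64_t b) { return a < b; } /* luai_numlt  */
    static int     num_nan (int64_t a) { (void)a; return 0; } /* a != a: never true */
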
+
+
+
+/*
+@@ LUA_INTEGER is the integral type used by lua_pushinteger/lua_tointeger.
+** CHANGE that if ptrdiff_t is not adequate on your machine. (On most
+** machines, ptrdiff_t gives a good choice between int or long.)
+*/
+#define LUA_INTEGER ptrdiff_t
+
+/*
+@@ LUA_UNSIGNED is the integral type used by lua_pushunsigned/lua_tounsigned.
+** It must have at least 32 bits.
+*/
+#define LUA_UNSIGNED uint64_t
+
+
+
+/*
+** Some tricks with doubles
+*/
+
+#if defined(LUA_NUMBER_DOUBLE) && !defined(LUA_ANSI) /* { */
+/*
+** The next definitions activate some tricks to speed up the
+** conversion from doubles to integer types, mainly to LUA_UNSIGNED.
+**
+@@ LUA_MSASMTRICK uses Microsoft assembler to avoid clashes with a
+** DirectX idiosyncrasy.
+**
+@@ LUA_IEEE754TRICK uses a trick that should work on any machine
+** using IEEE754 with a 32-bit integer type.
+**
+@@ LUA_IEEELL extends the trick to LUA_INTEGER; should only be
+** defined when LUA_INTEGER is a 32-bit integer.
+**
+@@ LUA_IEEEENDIAN is the endianness of doubles in your machine
+** (0 for little endian, 1 for big endian); if not defined, Lua will
+** check it dynamically for LUA_IEEE754TRICK (but not for LUA_NANTRICK).
+**
+@@ LUA_NANTRICK controls the use of a trick to pack all types into
+** a single double value, using NaN values to represent non-number
+** values. The trick only works on 32-bit machines (ints and pointers
+** are 32-bit values) with numbers represented as IEEE 754-2008 doubles
+** with conventional endianness (12345678 or 87654321), in CPUs that do
+** not produce signaling NaN values (all NaNs are quiet).
+*/
+
+/* Microsoft compiler on a Pentium (32 bit) ? */
+#if defined(LUA_WIN) && defined(_MSC_VER) && defined(_M_IX86) /* { */
+
+#define LUA_MSASMTRICK
+#define LUA_IEEEENDIAN 0
+#define LUA_NANTRICK
+
+
+/* pentium 32 bits? */
+#elif defined(__i386__) || defined(__i386) || defined(__X86__) /* }{ */
+
+#define LUA_IEEE754TRICK
+#define LUA_IEEELL
+#define LUA_IEEEENDIAN 0
+#define LUA_NANTRICK
+
+/* pentium 64 bits? */
+#elif defined(__x86_64) /* }{ */
+
+#define LUA_IEEE754TRICK
+#define LUA_IEEEENDIAN 0
+
+#elif defined(__POWERPC__) || defined(__ppc__) /* }{ */
+
+#define LUA_IEEE754TRICK
+#define LUA_IEEEENDIAN 1
+
+#else /* }{ */
+
+/* assume IEEE754 and a 32-bit integer type */
+#define LUA_IEEE754TRICK
+
+#endif /* } */
+
+#endif /* } */
+
+/* }================================================================== */
+
+
+
+
+/* =================================================================== */
+
+/*
+** Local configuration. You can use this space to add your redefinitions
+** without modifying the main part of the file.
+*/
+
+#define getlocaledecpoint() ('.')
+
+#define abs(x) (((x) < 0) ? -(x) : (x))
+
+#if !defined(UCHAR_MAX)
+#define UCHAR_MAX (0xff)
+#endif
+
+#endif
+
diff --git a/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/lua/lualib.h b/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/lua/lualib.h
new file mode 100644
index 000000000000..da82005c9de2
--- /dev/null
+++ b/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/lua/lualib.h
@@ -0,0 +1,55 @@
+/*
+** $Id: lualib.h,v 1.43.1.1 2013/04/12 18:48:47 roberto Exp $
+** Lua standard libraries
+** See Copyright Notice in lua.h
+*/
+
+
+#ifndef lualib_h
+#define lualib_h
+
+#include "lua.h"
+
+
+
+LUAMOD_API int (luaopen_base) (lua_State *L);
+
+#define LUA_COLIBNAME "coroutine"
+LUAMOD_API int (luaopen_coroutine) (lua_State *L);
+
+#define LUA_TABLIBNAME "table"
+LUAMOD_API int (luaopen_table) (lua_State *L);
+
+#define LUA_IOLIBNAME "io"
+LUAMOD_API int (luaopen_io) (lua_State *L);
+
+#define LUA_OSLIBNAME "os"
+LUAMOD_API int (luaopen_os) (lua_State *L);
+
+#define LUA_STRLIBNAME "string"
+LUAMOD_API int (luaopen_string) (lua_State *L);
+
+#define LUA_BITLIBNAME "bit32"
+LUAMOD_API int (luaopen_bit32) (lua_State *L);
+
+#define LUA_MATHLIBNAME "math"
+LUAMOD_API int (luaopen_math) (lua_State *L);
+
+#define LUA_DBLIBNAME "debug"
+LUAMOD_API int (luaopen_debug) (lua_State *L);
+
+#define LUA_LOADLIBNAME "package"
+LUAMOD_API int (luaopen_package) (lua_State *L);
+
+
+/* open all previous libraries */
+LUALIB_API void (luaL_openlibs) (lua_State *L);
+
+
+
+#if !defined(lua_assert)
+#define lua_assert(x) ((void)0)
+#endif
+
+
+#endif
diff --git a/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/lua/lundump.c b/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/lua/lundump.c
new file mode 100644
index 000000000000..4d53749a0273
--- /dev/null
+++ b/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/lua/lundump.c
@@ -0,0 +1,258 @@
+/*
+** $Id: lundump.c,v 2.22.1.1 2013/04/12 18:48:47 roberto Exp $
+** load precompiled Lua chunks
+** See Copyright Notice in lua.h
+*/
+
+#include <sys/zfs_context.h>
+
+#define lundump_c
+#define LUA_CORE
+
+#include "lua.h"
+
+#include "ldebug.h"
+#include "ldo.h"
+#include "lfunc.h"
+#include "lmem.h"
+#include "lobject.h"
+#include "lstring.h"
+#include "lundump.h"
+#include "lzio.h"
+
+typedef struct {
+ lua_State* L;
+ ZIO* Z;
+ Mbuffer* b;
+ const char* name;
+} LoadState;
+
+static l_noret error(LoadState* S, const char* why)
+{
+ luaO_pushfstring(S->L,"%s: %s precompiled chunk",S->name,why);
+ luaD_throw(S->L,LUA_ERRSYNTAX);
+}
+
+#define LoadMem(S,b,n,size) LoadBlock(S,b,(n)*(size))
+#define LoadByte(S) (lu_byte)LoadChar(S)
+#define LoadVar(S,x) LoadMem(S,&x,1,sizeof(x))
+#define LoadVector(S,b,n,size) LoadMem(S,b,n,size)
+
+#if !defined(luai_verifycode)
+#define luai_verifycode(L,b,f) /* empty */
+#endif
+
+static void LoadBlock(LoadState* S, void* b, size_t size)
+{
+ if (luaZ_read(S->Z,b,size)!=0) error(S,"truncated");
+}
+
+static int LoadChar(LoadState* S)
+{
+ char x;
+ LoadVar(S,x);
+ return x;
+}
+
+static int LoadInt(LoadState* S)
+{
+ int x;
+ LoadVar(S,x);
+ if (x<0) error(S,"corrupted");
+ return x;
+}
+
+static lua_Number LoadNumber(LoadState* S)
+{
+ lua_Number x;
+ LoadVar(S,x);
+ return x;
+}
+
+static TString* LoadString(LoadState* S)
+{
+ size_t size;
+ LoadVar(S,size);
+ if (size==0)
+ return NULL;
+ else
+ {
+ char* s=luaZ_openspace(S->L,S->b,size);
+ LoadBlock(S,s,size*sizeof(char));
+ return luaS_newlstr(S->L,s,size-1); /* remove trailing '\0' */
+ }
+}
+
+static void LoadCode(LoadState* S, Proto* f)
+{
+ int n=LoadInt(S);
+ f->code=luaM_newvector(S->L,n,Instruction);
+ f->sizecode=n;
+ LoadVector(S,f->code,n,sizeof(Instruction));
+}
+
+static void LoadFunction(LoadState* S, Proto* f);
+
+static void LoadConstants(LoadState* S, Proto* f)
+{
+ int i,n;
+ n=LoadInt(S);
+ f->k=luaM_newvector(S->L,n,TValue);
+ f->sizek=n;
+ for (i=0; i<n; i++) setnilvalue(&f->k[i]);
+ for (i=0; i<n; i++)
+ {
+ TValue* o=&f->k[i];
+ int t=LoadChar(S);
+ switch (t)
+ {
+ case LUA_TNIL:
+ setnilvalue(o);
+ break;
+ case LUA_TBOOLEAN:
+ setbvalue(o,LoadChar(S));
+ break;
+ case LUA_TNUMBER:
+ setnvalue(o,LoadNumber(S));
+ break;
+ case LUA_TSTRING:
+ setsvalue2n(S->L,o,LoadString(S));
+ break;
+ default: lua_assert(0);
+ }
+ }
+ n=LoadInt(S);
+ f->p=luaM_newvector(S->L,n,Proto*);
+ f->sizep=n;
+ for (i=0; i<n; i++) f->p[i]=NULL;
+ for (i=0; i<n; i++)
+ {
+ f->p[i]=luaF_newproto(S->L);
+ LoadFunction(S,f->p[i]);
+ }
+}
+
+static void LoadUpvalues(LoadState* S, Proto* f)
+{
+ int i,n;
+ n=LoadInt(S);
+ f->upvalues=luaM_newvector(S->L,n,Upvaldesc);
+ f->sizeupvalues=n;
+ for (i=0; i<n; i++) f->upvalues[i].name=NULL;
+ for (i=0; i<n; i++)
+ {
+ f->upvalues[i].instack=LoadByte(S);
+ f->upvalues[i].idx=LoadByte(S);
+ }
+}
+
+static void LoadDebug(LoadState* S, Proto* f)
+{
+ int i,n;
+ f->source=LoadString(S);
+ n=LoadInt(S);
+ f->lineinfo=luaM_newvector(S->L,n,int);
+ f->sizelineinfo=n;
+ LoadVector(S,f->lineinfo,n,sizeof(int));
+ n=LoadInt(S);
+ f->locvars=luaM_newvector(S->L,n,LocVar);
+ f->sizelocvars=n;
+ for (i=0; i<n; i++) f->locvars[i].varname=NULL;
+ for (i=0; i<n; i++)
+ {
+ f->locvars[i].varname=LoadString(S);
+ f->locvars[i].startpc=LoadInt(S);
+ f->locvars[i].endpc=LoadInt(S);
+ }
+ n=LoadInt(S);
+ for (i=0; i<n; i++) f->upvalues[i].name=LoadString(S);
+}
+
+static void LoadFunction(LoadState* S, Proto* f)
+{
+ f->linedefined=LoadInt(S);
+ f->lastlinedefined=LoadInt(S);
+ f->numparams=LoadByte(S);
+ f->is_vararg=LoadByte(S);
+ f->maxstacksize=LoadByte(S);
+ LoadCode(S,f);
+ LoadConstants(S,f);
+ LoadUpvalues(S,f);
+ LoadDebug(S,f);
+}
+
+/* the code below must be consistent with the code in luaU_header */
+#define N0 LUAC_HEADERSIZE
+#define N1 (sizeof(LUA_SIGNATURE)-sizeof(char))
+#define N2 N1+2
+#define N3 N2+6
+
+static void LoadHeader(LoadState* S)
+{
+ lu_byte h[LUAC_HEADERSIZE];
+ lu_byte s[LUAC_HEADERSIZE];
+ luaU_header(h);
+ memcpy(s,h,sizeof(char)); /* first char already read */
+ LoadBlock(S,s+sizeof(char),LUAC_HEADERSIZE-sizeof(char));
+ if (memcmp(h,s,N0)==0) return;
+ if (memcmp(h,s,N1)!=0) error(S,"not a");
+ if (memcmp(h,s,N2)!=0) error(S,"version mismatch in");
+ if (memcmp(h,s,N3)!=0) error(S,"incompatible"); else error(S,"corrupted");
+}
+
+/*
+** load precompiled chunk
+*/
+Closure* luaU_undump (lua_State* L, ZIO* Z, Mbuffer* buff, const char* name)
+{
+ LoadState S;
+ Closure* cl;
+ if (*name=='@' || *name=='=')
+ S.name=name+1;
+ else if (*name==LUA_SIGNATURE[0])
+ S.name="binary string";
+ else
+ S.name=name;
+ S.L=L;
+ S.Z=Z;
+ S.b=buff;
+ LoadHeader(&S);
+ cl=luaF_newLclosure(L,1);
+ setclLvalue(L,L->top,cl); incr_top(L);
+ cl->l.p=luaF_newproto(L);
+ LoadFunction(&S,cl->l.p);
+ if (cl->l.p->sizeupvalues != 1)
+ {
+ Proto* p=cl->l.p;
+ cl=luaF_newLclosure(L,cl->l.p->sizeupvalues);
+ cl->l.p=p;
+ setclLvalue(L,L->top-1,cl);
+ }
+ luai_verifycode(L,buff,cl->l.p);
+ return cl;
+}
+
+#define MYINT(s) (s[0]-'0')
+#define VERSION MYINT(LUA_VERSION_MAJOR)*16+MYINT(LUA_VERSION_MINOR)
+#define FORMAT 0 /* this is the official format */
+
+/*
+* make header for precompiled chunks
+* if you change the code below be sure to update LoadHeader and FORMAT above
+* and LUAC_HEADERSIZE in lundump.h
+*/
+void luaU_header (lu_byte* h)
+{
+ int x=1;
+ memcpy(h,LUA_SIGNATURE,sizeof(LUA_SIGNATURE)-sizeof(char));
+ h+=sizeof(LUA_SIGNATURE)-sizeof(char);
+ *h++=cast_byte(VERSION);
+ *h++=cast_byte(FORMAT);
+ *h++=cast_byte(*(char*)&x); /* endianness */
+ *h++=cast_byte(sizeof(int));
+ *h++=cast_byte(sizeof(size_t));
+ *h++=cast_byte(sizeof(Instruction));
+ *h++=cast_byte(sizeof(lua_Number));
+ *h++=cast_byte(((lua_Number)0.5)==0); /* is lua_Number integral? */
+ memcpy(h,LUAC_TAIL,sizeof(LUAC_TAIL)-sizeof(char));
+}
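
A sketch of the 18 header bytes this routine emits under the present configuration (assuming amd64, little-endian; lua_Number is int64_t here, so the final "integral" flag is 1, unlike stock Lua 5.2 where it is 0):

    #include "lundump.h"

    static void
    show_header (void) {
      lu_byte h[LUAC_HEADERSIZE];           /* 18 bytes with this config */
      luaU_header(h);
      /* h now holds:
       *   "\033Lua"              signature (4 bytes)
       *   0x52 0x00              version 5.2, official format
       *   0x01                   little-endian
       *   0x04 0x08 0x04 0x08    sizeof int, size_t, Instruction, lua_Number
       *   0x01                   lua_Number is integral
       *   "\x19\x93\r\n\x1a\n"   conversion-catching tail (6 bytes) */
    }
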
diff --git a/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/lua/lundump.h b/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/lua/lundump.h
new file mode 100644
index 000000000000..5255db259dfe
--- /dev/null
+++ b/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/lua/lundump.h
@@ -0,0 +1,28 @@
+/*
+** $Id: lundump.h,v 1.39.1.1 2013/04/12 18:48:47 roberto Exp $
+** load precompiled Lua chunks
+** See Copyright Notice in lua.h
+*/
+
+#ifndef lundump_h
+#define lundump_h
+
+#include "lobject.h"
+#include "lzio.h"
+
+/* load one chunk; from lundump.c */
+LUAI_FUNC Closure* luaU_undump (lua_State* L, ZIO* Z, Mbuffer* buff, const char* name);
+
+/* make header; from lundump.c */
+LUAI_FUNC void luaU_header (lu_byte* h);
+
+/* dump one chunk; from ldump.c */
+LUAI_FUNC int luaU_dump (lua_State* L, const Proto* f, lua_Writer w, void* data, int strip);
+
+/* data to catch conversion errors */
+#define LUAC_TAIL "\x19\x93\r\n\x1a\n"
+
+/* size in bytes of header of binary files */
+#define LUAC_HEADERSIZE (sizeof(LUA_SIGNATURE)-sizeof(char)+2+6+sizeof(LUAC_TAIL)-sizeof(char))
+
+#endif
diff --git a/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/lua/lvm.c b/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/lua/lvm.c
new file mode 100644
index 000000000000..a06e36e5ceae
--- /dev/null
+++ b/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/lua/lvm.c
@@ -0,0 +1,930 @@
+/*
+** $Id: lvm.c,v 2.155.1.1 2013/04/12 18:48:47 roberto Exp $
+** Lua virtual machine
+** See Copyright Notice in lua.h
+*/
+
+
+#include <sys/zfs_context.h>
+
+#define strcoll(l,r) (strcmp((l),(r)))
+
+#define lvm_c
+#define LUA_CORE
+
+#include "lua.h"
+
+#include "ldebug.h"
+#include "ldo.h"
+#include "lfunc.h"
+#include "lgc.h"
+#include "lobject.h"
+#include "lopcodes.h"
+#include "lstate.h"
+#include "lstring.h"
+#include "ltable.h"
+#include "ltm.h"
+#include "lvm.h"
+
+
+
+/* limit for table tag-method chains (to avoid loops) */
+#define MAXTAGLOOP 100
+
+
+const TValue *luaV_tonumber (const TValue *obj, TValue *n) {
+ lua_Number num;
+ if (ttisnumber(obj)) return obj;
+ if (ttisstring(obj) && luaO_str2d(svalue(obj), tsvalue(obj)->len, &num)) {
+ setnvalue(n, num);
+ return n;
+ }
+ else
+ return NULL;
+}
+
+
+int luaV_tostring (lua_State *L, StkId obj) {
+ if (!ttisnumber(obj))
+ return 0;
+ else {
+ char s[LUAI_MAXNUMBER2STR];
+ lua_Number n = nvalue(obj);
+ int l = lua_number2str(s, n);
+ setsvalue2s(L, obj, luaS_newlstr(L, s, l));
+ return 1;
+ }
+}
+
+
+static void traceexec (lua_State *L) {
+ CallInfo *ci = L->ci;
+ lu_byte mask = L->hookmask;
+ int counthook = ((mask & LUA_MASKCOUNT) && L->hookcount == 0);
+ if (counthook)
+ resethookcount(L); /* reset count */
+ if (ci->callstatus & CIST_HOOKYIELD) { /* called hook last time? */
+ ci->callstatus &= ~CIST_HOOKYIELD; /* erase mark */
+ return; /* do not call hook again (VM yielded, so it did not move) */
+ }
+ if (counthook)
+ luaD_hook(L, LUA_HOOKCOUNT, -1); /* call count hook */
+ if (mask & LUA_MASKLINE) {
+ Proto *p = ci_func(ci)->p;
+ int npc = pcRel(ci->u.l.savedpc, p);
+ int newline = getfuncline(p, npc);
+    if (npc == 0 ||  /* call line hook when entering a new function, */
+ ci->u.l.savedpc <= L->oldpc || /* when jump back (loop), or when */
+ newline != getfuncline(p, pcRel(L->oldpc, p))) /* enter a new line */
+ luaD_hook(L, LUA_HOOKLINE, newline); /* call line hook */
+ }
+ L->oldpc = ci->u.l.savedpc;
+ if (L->status == LUA_YIELD) { /* did hook yield? */
+ if (counthook)
+ L->hookcount = 1; /* undo decrement to zero */
+ ci->u.l.savedpc--; /* undo increment (resume will increment it again) */
+ ci->callstatus |= CIST_HOOKYIELD; /* mark that it yielded */
+ ci->func = L->top - 1; /* protect stack below results */
+ luaD_throw(L, LUA_YIELD);
+ }
+}
+
+
+static void callTM (lua_State *L, const TValue *f, const TValue *p1,
+ const TValue *p2, TValue *p3, int hasres) {
+ ptrdiff_t result = savestack(L, p3);
+ setobj2s(L, L->top++, f); /* push function */
+ setobj2s(L, L->top++, p1); /* 1st argument */
+ setobj2s(L, L->top++, p2); /* 2nd argument */
+ if (!hasres) /* no result? 'p3' is third argument */
+ setobj2s(L, L->top++, p3); /* 3rd argument */
+ /* metamethod may yield only when called from Lua code */
+ luaD_call(L, L->top - (4 - hasres), hasres, isLua(L->ci));
+ if (hasres) { /* if has result, move it to its place */
+ p3 = restorestack(L, result);
+ setobjs2s(L, p3, --L->top);
+ }
+}
+
+
+void luaV_gettable (lua_State *L, const TValue *t, TValue *key, StkId val) {
+ int loop;
+ for (loop = 0; loop < MAXTAGLOOP; loop++) {
+ const TValue *tm;
+ if (ttistable(t)) { /* `t' is a table? */
+ Table *h = hvalue(t);
+ const TValue *res = luaH_get(h, key); /* do a primitive get */
+ if (!ttisnil(res) || /* result is not nil? */
+ (tm = fasttm(L, h->metatable, TM_INDEX)) == NULL) { /* or no TM? */
+ setobj2s(L, val, res);
+ return;
+ }
+ /* else will try the tag method */
+ }
+ else if (ttisnil(tm = luaT_gettmbyobj(L, t, TM_INDEX)))
+ luaG_typeerror(L, t, "index");
+ if (ttisfunction(tm)) {
+ callTM(L, tm, t, key, val, 1);
+ return;
+ }
+ t = tm; /* else repeat with 'tm' */
+ }
+ luaG_runerror(L, "loop in gettable");
+}
+
+
+void luaV_settable (lua_State *L, const TValue *t, TValue *key, StkId val) {
+ int loop;
+ for (loop = 0; loop < MAXTAGLOOP; loop++) {
+ const TValue *tm;
+ if (ttistable(t)) { /* `t' is a table? */
+ Table *h = hvalue(t);
+ TValue *oldval = cast(TValue *, luaH_get(h, key));
+ /* if previous value is not nil, there must be a previous entry
+ in the table; moreover, a metamethod has no relevance */
+ if (!ttisnil(oldval) ||
+ /* previous value is nil; must check the metamethod */
+ ((tm = fasttm(L, h->metatable, TM_NEWINDEX)) == NULL &&
+ /* no metamethod; is there a previous entry in the table? */
+ (oldval != luaO_nilobject ||
+ /* no previous entry; must create one. (The next test is
+ always true; we only need the assignment.) */
+ (oldval = luaH_newkey(L, h, key), 1)))) {
+ /* no metamethod and (now) there is an entry with given key */
+ setobj2t(L, oldval, val); /* assign new value to that entry */
+ invalidateTMcache(h);
+ luaC_barrierback(L, obj2gco(h), val);
+ return;
+ }
+ /* else will try the metamethod */
+ }
+ else /* not a table; check metamethod */
+ if (ttisnil(tm = luaT_gettmbyobj(L, t, TM_NEWINDEX)))
+ luaG_typeerror(L, t, "index");
+ /* there is a metamethod */
+ if (ttisfunction(tm)) {
+ callTM(L, tm, t, key, val, 0);
+ return;
+ }
+ t = tm; /* else repeat with 'tm' */
+ }
+ luaG_runerror(L, "loop in settable");
+}
+
+
+static int call_binTM (lua_State *L, const TValue *p1, const TValue *p2,
+ StkId res, TMS event) {
+ const TValue *tm = luaT_gettmbyobj(L, p1, event); /* try first operand */
+ if (ttisnil(tm))
+ tm = luaT_gettmbyobj(L, p2, event); /* try second operand */
+ if (ttisnil(tm)) return 0;
+ callTM(L, tm, p1, p2, res, 1);
+ return 1;
+}
+
+
+static const TValue *get_equalTM (lua_State *L, Table *mt1, Table *mt2,
+ TMS event) {
+ const TValue *tm1 = fasttm(L, mt1, event);
+ const TValue *tm2;
+ if (tm1 == NULL) return NULL; /* no metamethod */
+ if (mt1 == mt2) return tm1; /* same metatables => same metamethods */
+ tm2 = fasttm(L, mt2, event);
+ if (tm2 == NULL) return NULL; /* no metamethod */
+ if (luaV_rawequalobj(tm1, tm2)) /* same metamethods? */
+ return tm1;
+ return NULL;
+}
+
+
+static int call_orderTM (lua_State *L, const TValue *p1, const TValue *p2,
+ TMS event) {
+ if (!call_binTM(L, p1, p2, L->top, event))
+ return -1; /* no metamethod */
+ else
+ return !l_isfalse(L->top);
+}
+
+
+static int l_strcmp (const TString *ls, const TString *rs) {
+ const char *l = getstr(ls);
+ size_t ll = ls->tsv.len;
+ const char *r = getstr(rs);
+ size_t lr = rs->tsv.len;
+ for (;;) {
+ int temp = strcoll(l, r);
+ if (temp != 0) return temp;
+ else { /* strings are equal up to a `\0' */
+ size_t len = strlen(l); /* index of first `\0' in both strings */
+ if (len == lr) /* r is finished? */
+ return (len == ll) ? 0 : 1;
+ else if (len == ll) /* l is finished? */
+ return -1; /* l is smaller than r (because r is not finished) */
+ /* both strings longer than `len'; go on comparing (after the `\0') */
+ len++;
+ l += len; ll -= len; r += len; lr -= len;
+ }
+ }
+}
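
The segment walk matters because strcoll is #defined to strcmp at the top of this file, and strcmp stops at the first '\0', while Lua strings may contain embedded NULs; comparison must continue past each one using the recorded lengths. A self-contained demonstration (not part of the import):

    #include <string.h>
    #include <assert.h>

    static void
    embedded_nul_demo (void) {
      const char l[] = "a\0b";              /* Lua length 3 */
      const char r[] = "a\0c";
      assert(strcmp(l, r) == 0);            /* C sees only "a" == "a" */
      assert(memcmp(l, r, 3) < 0);          /* the bytes after '\0' differ */
      /* l_strcmp: first segment ties, len = strlen(l) = 1, neither string
       * is finished, so it advances past the '\0' and compares "b" < "c" */
    }
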
+
+
+int luaV_lessthan (lua_State *L, const TValue *l, const TValue *r) {
+ int res;
+ if (ttisnumber(l) && ttisnumber(r))
+ return luai_numlt(L, nvalue(l), nvalue(r));
+ else if (ttisstring(l) && ttisstring(r))
+ return l_strcmp(rawtsvalue(l), rawtsvalue(r)) < 0;
+ else if ((res = call_orderTM(L, l, r, TM_LT)) < 0)
+ luaG_ordererror(L, l, r);
+ return res;
+}
+
+
+int luaV_lessequal (lua_State *L, const TValue *l, const TValue *r) {
+ int res;
+ if (ttisnumber(l) && ttisnumber(r))
+ return luai_numle(L, nvalue(l), nvalue(r));
+ else if (ttisstring(l) && ttisstring(r))
+ return l_strcmp(rawtsvalue(l), rawtsvalue(r)) <= 0;
+ else if ((res = call_orderTM(L, l, r, TM_LE)) >= 0) /* first try `le' */
+ return res;
+ else if ((res = call_orderTM(L, r, l, TM_LT)) < 0) /* else try `lt' */
+ luaG_ordererror(L, l, r);
+ return !res;
+}
+
+
+/*
+** equality of Lua values. L == NULL means raw equality (no metamethods)
+*/
+int luaV_equalobj_ (lua_State *L, const TValue *t1, const TValue *t2) {
+ const TValue *tm;
+ lua_assert(ttisequal(t1, t2));
+ switch (ttype(t1)) {
+ case LUA_TNIL: return 1;
+ case LUA_TNUMBER: return luai_numeq(nvalue(t1), nvalue(t2));
+ case LUA_TBOOLEAN: return bvalue(t1) == bvalue(t2); /* true must be 1 !! */
+ case LUA_TLIGHTUSERDATA: return pvalue(t1) == pvalue(t2);
+ case LUA_TLCF: return fvalue(t1) == fvalue(t2);
+ case LUA_TSHRSTR: return eqshrstr(rawtsvalue(t1), rawtsvalue(t2));
+ case LUA_TLNGSTR: return luaS_eqlngstr(rawtsvalue(t1), rawtsvalue(t2));
+ case LUA_TUSERDATA: {
+ if (uvalue(t1) == uvalue(t2)) return 1;
+ else if (L == NULL) return 0;
+ tm = get_equalTM(L, uvalue(t1)->metatable, uvalue(t2)->metatable, TM_EQ);
+ break; /* will try TM */
+ }
+ case LUA_TTABLE: {
+ if (hvalue(t1) == hvalue(t2)) return 1;
+ else if (L == NULL) return 0;
+ tm = get_equalTM(L, hvalue(t1)->metatable, hvalue(t2)->metatable, TM_EQ);
+ break; /* will try TM */
+ }
+ default:
+ lua_assert(iscollectable(t1));
+ return gcvalue(t1) == gcvalue(t2);
+ }
+ if (tm == NULL) return 0; /* no TM? */
+ callTM(L, tm, t1, t2, L->top, 1); /* call TM */
+ return !l_isfalse(L->top);
+}
+
+
+void luaV_concat (lua_State *L, int total) {
+ lua_assert(total >= 2);
+ do {
+ StkId top = L->top;
+ int n = 2; /* number of elements handled in this pass (at least 2) */
+ if (!(ttisstring(top-2) || ttisnumber(top-2)) || !tostring(L, top-1)) {
+ if (!call_binTM(L, top-2, top-1, top-2, TM_CONCAT))
+ luaG_concaterror(L, top-2, top-1);
+ }
+ else if (tsvalue(top-1)->len == 0) /* second operand is empty? */
+ (void)tostring(L, top - 2); /* result is first operand */
+ else if (ttisstring(top-2) && tsvalue(top-2)->len == 0) {
+ setobjs2s(L, top - 2, top - 1); /* result is second op. */
+ }
+ else {
+ /* at least two non-empty string values; get as many as possible */
+ size_t tl = tsvalue(top-1)->len;
+ char *buffer;
+ int i;
+ /* collect total length */
+ for (i = 1; i < total && tostring(L, top-i-1); i++) {
+ size_t l = tsvalue(top-i-1)->len;
+ if (l >= (MAX_SIZET/sizeof(char)) - tl)
+ luaG_runerror(L, "string length overflow");
+ tl += l;
+ }
+ buffer = luaZ_openspace(L, &G(L)->buff, tl);
+ tl = 0;
+ n = i;
+ do { /* concat all strings */
+ size_t l = tsvalue(top-i)->len;
+ memcpy(buffer+tl, svalue(top-i), l * sizeof(char));
+ tl += l;
+ } while (--i > 0);
+ setsvalue2s(L, top-n, luaS_newlstr(L, buffer, tl));
+ }
+ total -= n-1; /* got 'n' strings to create 1 new */
+ L->top -= n-1; /* popped 'n' strings and pushed one */
+ } while (total > 1); /* repeat until only 1 result left */
+}
+
+
+void luaV_objlen (lua_State *L, StkId ra, const TValue *rb) {
+ const TValue *tm;
+ switch (ttypenv(rb)) {
+ case LUA_TTABLE: {
+ Table *h = hvalue(rb);
+ tm = fasttm(L, h->metatable, TM_LEN);
+ if (tm) break; /* metamethod? break switch to call it */
+ setnvalue(ra, cast_num(luaH_getn(h))); /* else primitive len */
+ return;
+ }
+ case LUA_TSTRING: {
+ setnvalue(ra, cast_num(tsvalue(rb)->len));
+ return;
+ }
+ default: { /* try metamethod */
+ tm = luaT_gettmbyobj(L, rb, TM_LEN);
+ if (ttisnil(tm)) /* no metamethod? */
+ luaG_typeerror(L, rb, "get length of");
+ break;
+ }
+ }
+ callTM(L, tm, rb, rb, ra, 1);
+}
+
+/*
+ * luaV_div and luaV_mod patched in from Lua 5.3.2 in order to properly handle
+ * div/mod by zero (instead of crashing, which is the default behavior in
+ * Lua 5.2)
+ */
+
+/*
+** Integer division; return 'm // n', that is, floor(m/n).
+** C division truncates its result (rounds towards zero).
+** 'floor(q) == trunc(q)' when 'q >= 0' or when 'q' is integer,
+** otherwise 'floor(q) == trunc(q) - 1'.
+*/
+static lua_Number luaV_div (lua_State *L, lua_Number m, lua_Number n) {
+ if ((lua_Unsigned)(n) + 1u <= 1u) { /* special cases: -1 or 0 */
+ if (n == 0)
+ luaG_runerror(L, "attempt to divide by zero");
+ return (0 - m); /* n==-1; avoid overflow with 0x80000...//-1 */
+ }
+ else {
+ lua_Number q = m / n; /* perform C division */
+ if ((m ^ n) < 0 && m % n != 0) /* 'm/n' would be negative non-integer? */
+ q -= 1; /* correct result for different rounding */
+ return q;
+ }
+}
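+
+/*
+** Example of the correction above: for m == -7 and n == 2, C computes
+** trunc(-7/2) == -3 with remainder -1; the operands differ in sign and
+** the remainder is nonzero, so 'q' is adjusted to -4 == floor(-7/2).
+*/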
+
+
+/*
+** Integer modulus; return 'm % n'. (Assume that C '%' with
+** negative operands follows C99 behavior. See previous comment
+** about luaV_div.)
+*/
+static lua_Number luaV_mod (lua_State *L, lua_Number m, lua_Number n) {
+ if ((lua_Unsigned)(n) + 1u <= 1u) { /* special cases: -1 or 0 */
+ if (n == 0)
+ luaG_runerror(L, "attempt to perform 'n%%0'");
+ return 0; /* m % -1 == 0; avoid overflow with 0x80000...%-1 */
+ }
+ else {
+ lua_Number r = m % n;
+ if (r != 0 && (m ^ n) < 0) /* 'm/n' would be non-integer negative? */
+ r += n; /* correct result for different rounding */
+ return r;
+ }
+}
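+
+/*
+** Example of the correction above: for m == -7 and n == 2, C99 '%' yields
+** r == -1; the signs differ, so 'r += n' gives 1, matching Lua's rule
+** that the remainder takes the sign of the divisor.
+*/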
+
+/*
+ * End patch from 5.3.2
+ */
+
+void luaV_arith (lua_State *L, StkId ra, const TValue *rb,
+ const TValue *rc, TMS op) {
+ TValue tempb, tempc;
+ const TValue *b, *c;
+ if ((b = luaV_tonumber(rb, &tempb)) != NULL &&
+ (c = luaV_tonumber(rc, &tempc)) != NULL) {
+ /*
+ * Patched: if dividing or modding, use patched functions from 5.3
+ */
+ lua_Number res;
+ int lop = op - TM_ADD + LUA_OPADD;
+ if (lop == LUA_OPDIV) {
+ res = luaV_div(L, nvalue(b), nvalue(c));
+ } else if (lop == LUA_OPMOD) {
+ res = luaV_mod(L, nvalue(b), nvalue(c));
+ } else {
+ res = luaO_arith(op - TM_ADD + LUA_OPADD, nvalue(b), nvalue(c));
+ }
+ setnvalue(ra, res);
+ }
+ else if (!call_binTM(L, rb, rc, ra, op))
+ luaG_aritherror(L, rb, rc);
+}
+
+
+/*
+** check whether cached closure in prototype 'p' may be reused, that is,
+** whether there is a cached closure with the same upvalues needed by
+** new closure to be created.
+*/
+static Closure *getcached (Proto *p, UpVal **encup, StkId base) {
+ Closure *c = p->cache;
+ if (c != NULL) { /* is there a cached closure? */
+ int nup = p->sizeupvalues;
+ Upvaldesc *uv = p->upvalues;
+ int i;
+ for (i = 0; i < nup; i++) { /* check whether it has right upvalues */
+ TValue *v = uv[i].instack ? base + uv[i].idx : encup[uv[i].idx]->v;
+ if (c->l.upvals[i]->v != v)
+ return NULL; /* wrong upvalue; cannot reuse closure */
+ }
+ }
+ return c; /* return cached closure (or NULL if no cached closure) */
+}
+
+
+/*
+** create a new Lua closure, push it in the stack, and initialize
+** its upvalues. Note that the call to 'luaC_barrierproto' must come
+** before the assignment to 'p->cache', as the function needs the
+** original value of that field.
+*/
+static void pushclosure (lua_State *L, Proto *p, UpVal **encup, StkId base,
+ StkId ra) {
+ int nup = p->sizeupvalues;
+ Upvaldesc *uv = p->upvalues;
+ int i;
+ Closure *ncl = luaF_newLclosure(L, nup);
+ ncl->l.p = p;
+ setclLvalue(L, ra, ncl); /* anchor new closure in stack */
+ for (i = 0; i < nup; i++) { /* fill in its upvalues */
+ if (uv[i].instack) /* upvalue refers to local variable? */
+ ncl->l.upvals[i] = luaF_findupval(L, base + uv[i].idx);
+ else /* get upvalue from enclosing function */
+ ncl->l.upvals[i] = encup[uv[i].idx];
+ }
+ luaC_barrierproto(L, p, ncl);
+ p->cache = ncl; /* save it on cache for reuse */
+}
+
+
+/*
+** finish execution of an opcode interrupted by a yield
+*/
+void luaV_finishOp (lua_State *L) {
+ CallInfo *ci = L->ci;
+ StkId base = ci->u.l.base;
+ Instruction inst = *(ci->u.l.savedpc - 1); /* interrupted instruction */
+ OpCode op = GET_OPCODE(inst);
+ switch (op) { /* finish its execution */
+ case OP_ADD: case OP_SUB: case OP_MUL: case OP_DIV:
+ case OP_MOD: case OP_POW: case OP_UNM: case OP_LEN:
+ case OP_GETTABUP: case OP_GETTABLE: case OP_SELF: {
+ setobjs2s(L, base + GETARG_A(inst), --L->top);
+ break;
+ }
+ case OP_LE: case OP_LT: case OP_EQ: {
+ int res = !l_isfalse(L->top - 1);
+ L->top--;
+ /* metamethod should not be called when operand is K */
+ lua_assert(!ISK(GETARG_B(inst)));
+ if (op == OP_LE && /* "<=" using "<" instead? */
+ ttisnil(luaT_gettmbyobj(L, base + GETARG_B(inst), TM_LE)))
+ res = !res; /* invert result */
+ lua_assert(GET_OPCODE(*ci->u.l.savedpc) == OP_JMP);
+ if (res != GETARG_A(inst)) /* condition failed? */
+ ci->u.l.savedpc++; /* skip jump instruction */
+ break;
+ }
+ case OP_CONCAT: {
+ StkId top = L->top - 1; /* top when 'call_binTM' was called */
+ int b = GETARG_B(inst); /* first element to concatenate */
+ int total = cast_int(top - 1 - (base + b)); /* yet to concatenate */
+ setobj2s(L, top - 2, top); /* put TM result in proper position */
+ if (total > 1) { /* are there elements to concat? */
+ L->top = top - 1; /* top is one after last element (at top-2) */
+ luaV_concat(L, total); /* concat them (may yield again) */
+ }
+ /* move final result to final position */
+ setobj2s(L, ci->u.l.base + GETARG_A(inst), L->top - 1);
+ L->top = ci->top; /* restore top */
+ break;
+ }
+ case OP_TFORCALL: {
+ lua_assert(GET_OPCODE(*ci->u.l.savedpc) == OP_TFORLOOP);
+ L->top = ci->top; /* correct top */
+ break;
+ }
+ case OP_CALL: {
+ if (GETARG_C(inst) - 1 >= 0) /* nresults >= 0? */
+ L->top = ci->top; /* adjust results */
+ break;
+ }
+ case OP_TAILCALL: case OP_SETTABUP: case OP_SETTABLE:
+ break;
+ default: lua_assert(0);
+ }
+}
+
+
+
+/*
+** some macros for common tasks in `luaV_execute'
+*/
+
+#if !defined luai_runtimecheck
+#define luai_runtimecheck(L, c) /* void */
+#endif
+
+
+#define RA(i) (base+GETARG_A(i))
+/* to be used after possible stack reallocation */
+#define RB(i) check_exp(getBMode(GET_OPCODE(i)) == OpArgR, base+GETARG_B(i))
+#define RC(i) check_exp(getCMode(GET_OPCODE(i)) == OpArgR, base+GETARG_C(i))
+#define RKB(i) check_exp(getBMode(GET_OPCODE(i)) == OpArgK, \
+ ISK(GETARG_B(i)) ? k+INDEXK(GETARG_B(i)) : base+GETARG_B(i))
+#define RKC(i) check_exp(getCMode(GET_OPCODE(i)) == OpArgK, \
+ ISK(GETARG_C(i)) ? k+INDEXK(GETARG_C(i)) : base+GETARG_C(i))
+#define KBx(i) \
+ (k + (GETARG_Bx(i) != 0 ? GETARG_Bx(i) - 1 : GETARG_Ax(*ci->u.l.savedpc++)))
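+
+/*
+** Note: ISK tests the constant bit of a B/C operand; when the bit is set,
+** INDEXK strips it and the operand indexes the constant table 'k',
+** otherwise the operand names a register relative to 'base'.
+*/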
+
+
+/* execute a jump instruction */
+#define dojump(ci,i,e) \
+ { int a = GETARG_A(i); \
+ if (a > 0) luaF_close(L, ci->u.l.base + a - 1); \
+ ci->u.l.savedpc += GETARG_sBx(i) + e; }
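+/* (a nonzero A argument additionally closes all upvalues at or above
+** register A-1 before the jump is taken) */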
+
+/* for test instructions, execute the jump instruction that follows it */
+#define donextjump(ci) { i = *ci->u.l.savedpc; dojump(ci, i, 1); }
+
+
+#define Protect(x) { {x;}; base = ci->u.l.base; }
+
+#define checkGC(L,c) \
+ Protect( luaC_condGC(L,{L->top = (c); /* limit of live values */ \
+ luaC_step(L); \
+ L->top = ci->top;}) /* restore top */ \
+ luai_threadyield(L); )
+
+
+#define arith_op(op,tm) { \
+ TValue *rb = RKB(i); \
+ TValue *rc = RKC(i); \
+ if (ttisnumber(rb) && ttisnumber(rc)) { \
+ lua_Number nb = nvalue(rb), nc = nvalue(rc); \
+ setnvalue(ra, op(L, nb, nc)); \
+ } \
+ else { Protect(luaV_arith(L, ra, rb, rc, tm)); } }
+
+
+#define vmdispatch(o) switch(o)
+#define vmcase(l,b) case l: {b} break;
+#define vmcasenb(l,b) case l: {b} /* nb = no break */
+
+void luaV_execute (lua_State *L) {
+ CallInfo *ci = L->ci;
+ LClosure *cl;
+ TValue *k;
+ StkId base;
+ newframe: /* reentry point when frame changes (call/return) */
+ lua_assert(ci == L->ci);
+ cl = clLvalue(ci->func);
+ k = cl->p->k;
+ base = ci->u.l.base;
+ /* main loop of interpreter */
+ for (;;) {
+ Instruction i = *(ci->u.l.savedpc++);
+ StkId ra;
+ if ((L->hookmask & (LUA_MASKLINE | LUA_MASKCOUNT)) &&
+ (--L->hookcount == 0 || L->hookmask & LUA_MASKLINE)) {
+ Protect(traceexec(L));
+ }
+ /* WARNING: several calls may realloc the stack and invalidate `ra' */
+ ra = RA(i);
+ lua_assert(base == ci->u.l.base);
+ lua_assert(base <= L->top && L->top < L->stack + L->stacksize);
+ vmdispatch (GET_OPCODE(i)) {
+ vmcase(OP_MOVE,
+ setobjs2s(L, ra, RB(i));
+ )
+ vmcase(OP_LOADK,
+ TValue *rb = k + GETARG_Bx(i);
+ setobj2s(L, ra, rb);
+ )
+ vmcase(OP_LOADKX,
+ TValue *rb;
+ lua_assert(GET_OPCODE(*ci->u.l.savedpc) == OP_EXTRAARG);
+ rb = k + GETARG_Ax(*ci->u.l.savedpc++);
+ setobj2s(L, ra, rb);
+ )
+ vmcase(OP_LOADBOOL,
+ setbvalue(ra, GETARG_B(i));
+ if (GETARG_C(i)) ci->u.l.savedpc++; /* skip next instruction (if C) */
+ )
+ vmcase(OP_LOADNIL,
+ int b = GETARG_B(i);
+ do {
+ setnilvalue(ra++);
+ } while (b--);
+ )
+ vmcase(OP_GETUPVAL,
+ int b = GETARG_B(i);
+ setobj2s(L, ra, cl->upvals[b]->v);
+ )
+ vmcase(OP_GETTABUP,
+ int b = GETARG_B(i);
+ Protect(luaV_gettable(L, cl->upvals[b]->v, RKC(i), ra));
+ )
+ vmcase(OP_GETTABLE,
+ Protect(luaV_gettable(L, RB(i), RKC(i), ra));
+ )
+ vmcase(OP_SETTABUP,
+ int a = GETARG_A(i);
+ Protect(luaV_settable(L, cl->upvals[a]->v, RKB(i), RKC(i)));
+ )
+ vmcase(OP_SETUPVAL,
+ UpVal *uv = cl->upvals[GETARG_B(i)];
+ setobj(L, uv->v, ra);
+ luaC_barrier(L, uv, ra);
+ )
+ vmcase(OP_SETTABLE,
+ Protect(luaV_settable(L, ra, RKB(i), RKC(i)));
+ )
+ vmcase(OP_NEWTABLE,
+ int b = GETARG_B(i);
+ int c = GETARG_C(i);
+ Table *t = luaH_new(L);
+ sethvalue(L, ra, t);
+ if (b != 0 || c != 0)
+ luaH_resize(L, t, luaO_fb2int(b), luaO_fb2int(c));
+ checkGC(L, ra + 1);
+ )
+ vmcase(OP_SELF,
+ StkId rb = RB(i);
+ setobjs2s(L, ra+1, rb);
+ Protect(luaV_gettable(L, rb, RKC(i), ra));
+ )
+ vmcase(OP_ADD,
+ arith_op(luai_numadd, TM_ADD);
+ )
+ vmcase(OP_SUB,
+ arith_op(luai_numsub, TM_SUB);
+ )
+ vmcase(OP_MUL,
+ arith_op(luai_nummul, TM_MUL);
+ )
+ /*
+ * Patched: use luaV_* instead of luai_* to handle div/mod by 0
+ */
+ vmcase(OP_DIV,
+ arith_op(luaV_div, TM_DIV);
+ )
+ vmcase(OP_MOD,
+ arith_op(luaV_mod, TM_MOD);
+ )
+ vmcase(OP_POW,
+ arith_op(luai_numpow, TM_POW);
+ )
+ vmcase(OP_UNM,
+ TValue *rb = RB(i);
+ if (ttisnumber(rb)) {
+ lua_Number nb = nvalue(rb);
+ setnvalue(ra, luai_numunm(L, nb));
+ }
+ else {
+ Protect(luaV_arith(L, ra, rb, rb, TM_UNM));
+ }
+ )
+ vmcase(OP_NOT,
+ TValue *rb = RB(i);
+ int res = l_isfalse(rb); /* next assignment may change this value */
+ setbvalue(ra, res);
+ )
+ vmcase(OP_LEN,
+ Protect(luaV_objlen(L, ra, RB(i)));
+ )
+ vmcase(OP_CONCAT,
+ int b = GETARG_B(i);
+ int c = GETARG_C(i);
+ StkId rb;
+ L->top = base + c + 1; /* mark the end of concat operands */
+ Protect(luaV_concat(L, c - b + 1));
+        ra = RA(i);  /* 'luaV_concat' may invoke TMs and move the stack */
+ rb = b + base;
+ setobjs2s(L, ra, rb);
+ checkGC(L, (ra >= rb ? ra + 1 : rb));
+ L->top = ci->top; /* restore top */
+ )
+ vmcase(OP_JMP,
+ dojump(ci, i, 0);
+ )
+ vmcase(OP_EQ,
+ TValue *rb = RKB(i);
+ TValue *rc = RKC(i);
+ Protect(
+ if (cast_int(equalobj(L, rb, rc)) != GETARG_A(i))
+ ci->u.l.savedpc++;
+ else
+ donextjump(ci);
+ )
+ )
+ vmcase(OP_LT,
+ Protect(
+ if (luaV_lessthan(L, RKB(i), RKC(i)) != GETARG_A(i))
+ ci->u.l.savedpc++;
+ else
+ donextjump(ci);
+ )
+ )
+ vmcase(OP_LE,
+ Protect(
+ if (luaV_lessequal(L, RKB(i), RKC(i)) != GETARG_A(i))
+ ci->u.l.savedpc++;
+ else
+ donextjump(ci);
+ )
+ )
+ vmcase(OP_TEST,
+ if (GETARG_C(i) ? l_isfalse(ra) : !l_isfalse(ra))
+ ci->u.l.savedpc++;
+ else
+ donextjump(ci);
+ )
+ vmcase(OP_TESTSET,
+ TValue *rb = RB(i);
+ if (GETARG_C(i) ? l_isfalse(rb) : !l_isfalse(rb))
+ ci->u.l.savedpc++;
+ else {
+ setobjs2s(L, ra, rb);
+ donextjump(ci);
+ }
+ )
+ vmcase(OP_CALL,
+ int b = GETARG_B(i);
+ int nresults = GETARG_C(i) - 1;
+ if (b != 0) L->top = ra+b; /* else previous instruction set top */
+ if (luaD_precall(L, ra, nresults)) { /* C function? */
+ if (nresults >= 0) L->top = ci->top; /* adjust results */
+ base = ci->u.l.base;
+ }
+ else { /* Lua function */
+ ci = L->ci;
+ ci->callstatus |= CIST_REENTRY;
+ goto newframe; /* restart luaV_execute over new Lua function */
+ }
+ )
+ vmcase(OP_TAILCALL,
+ int b = GETARG_B(i);
+ if (b != 0) L->top = ra+b; /* else previous instruction set top */
+ lua_assert(GETARG_C(i) - 1 == LUA_MULTRET);
+ if (luaD_precall(L, ra, LUA_MULTRET)) /* C function? */
+ base = ci->u.l.base;
+ else {
+ /* tail call: put called frame (n) in place of caller one (o) */
+ CallInfo *nci = L->ci; /* called frame */
+ CallInfo *oci = nci->previous; /* caller frame */
+ StkId nfunc = nci->func; /* called function */
+ StkId ofunc = oci->func; /* caller function */
+ /* last stack slot filled by 'precall' */
+ StkId lim = nci->u.l.base + getproto(nfunc)->numparams;
+ int aux;
+ /* close all upvalues from previous call */
+ if (cl->p->sizep > 0) luaF_close(L, oci->u.l.base);
+ /* move new frame into old one */
+ for (aux = 0; nfunc + aux < lim; aux++)
+ setobjs2s(L, ofunc + aux, nfunc + aux);
+ oci->u.l.base = ofunc + (nci->u.l.base - nfunc); /* correct base */
+ oci->top = L->top = ofunc + (L->top - nfunc); /* correct top */
+ oci->u.l.savedpc = nci->u.l.savedpc;
+ oci->callstatus |= CIST_TAIL; /* function was tail called */
+ ci = L->ci = oci; /* remove new frame */
+ lua_assert(L->top == oci->u.l.base + getproto(ofunc)->maxstacksize);
+ goto newframe; /* restart luaV_execute over new Lua function */
+ }
+ )
+ vmcasenb(OP_RETURN,
+ int b = GETARG_B(i);
+ if (b != 0) L->top = ra+b-1;
+ if (cl->p->sizep > 0) luaF_close(L, base);
+ b = luaD_poscall(L, ra);
+ if (!(ci->callstatus & CIST_REENTRY)) /* 'ci' still the called one */
+ return; /* external invocation: return */
+ else { /* invocation via reentry: continue execution */
+ ci = L->ci;
+ if (b) L->top = ci->top;
+ lua_assert(isLua(ci));
+ lua_assert(GET_OPCODE(*((ci)->u.l.savedpc - 1)) == OP_CALL);
+ goto newframe; /* restart luaV_execute over new Lua function */
+ }
+ )
+ vmcase(OP_FORLOOP,
+ lua_Number step = nvalue(ra+2);
+ lua_Number idx = luai_numadd(L, nvalue(ra), step); /* increment index */
+ lua_Number limit = nvalue(ra+1);
+ if (luai_numlt(L, 0, step) ? luai_numle(L, idx, limit)
+ : luai_numle(L, limit, idx)) {
+ ci->u.l.savedpc += GETARG_sBx(i); /* jump back */
+ setnvalue(ra, idx); /* update internal index... */
+ setnvalue(ra+3, idx); /* ...and external index */
+ }
+ )
+ vmcase(OP_FORPREP,
+ const TValue *init = ra;
+ const TValue *plimit = ra+1;
+ const TValue *pstep = ra+2;
+ if (!tonumber(init, ra))
+ luaG_runerror(L, LUA_QL("for") " initial value must be a number");
+ else if (!tonumber(plimit, ra+1))
+ luaG_runerror(L, LUA_QL("for") " limit must be a number");
+ else if (!tonumber(pstep, ra+2))
+ luaG_runerror(L, LUA_QL("for") " step must be a number");
+ setnvalue(ra, luai_numsub(L, nvalue(ra), nvalue(pstep)));
+ ci->u.l.savedpc += GETARG_sBx(i);
+ )
+ vmcasenb(OP_TFORCALL,
+ StkId cb = ra + 3; /* call base */
+ setobjs2s(L, cb+2, ra+2);
+ setobjs2s(L, cb+1, ra+1);
+ setobjs2s(L, cb, ra);
+ L->top = cb + 3; /* func. + 2 args (state and index) */
+ Protect(luaD_call(L, cb, GETARG_C(i), 1));
+ L->top = ci->top;
+ i = *(ci->u.l.savedpc++); /* go to next instruction */
+ ra = RA(i);
+ lua_assert(GET_OPCODE(i) == OP_TFORLOOP);
+ goto l_tforloop;
+ )
+ vmcase(OP_TFORLOOP,
+ l_tforloop:
+ if (!ttisnil(ra + 1)) { /* continue loop? */
+ setobjs2s(L, ra, ra + 1); /* save control variable */
+ ci->u.l.savedpc += GETARG_sBx(i); /* jump back */
+ }
+ )
+ vmcase(OP_SETLIST,
+ int n = GETARG_B(i);
+ int c = GETARG_C(i);
+ int last;
+ Table *h;
+ if (n == 0) n = cast_int(L->top - ra) - 1;
+ if (c == 0) {
+ lua_assert(GET_OPCODE(*ci->u.l.savedpc) == OP_EXTRAARG);
+ c = GETARG_Ax(*ci->u.l.savedpc++);
+ }
+ luai_runtimecheck(L, ttistable(ra));
+ h = hvalue(ra);
+ last = ((c-1)*LFIELDS_PER_FLUSH) + n;
+ if (last > h->sizearray) /* needs more space? */
+ luaH_resizearray(L, h, last); /* pre-allocate it at once */
+ for (; n > 0; n--) {
+ TValue *val = ra+n;
+ luaH_setint(L, h, last--, val);
+ luaC_barrierback(L, obj2gco(h), val);
+ }
+ L->top = ci->top; /* correct top (in case of previous open call) */
+ )
+ vmcase(OP_CLOSURE,
+ Proto *p = cl->p->p[GETARG_Bx(i)];
+ Closure *ncl = getcached(p, cl->upvals, base); /* cached closure */
+ if (ncl == NULL) /* no match? */
+ pushclosure(L, p, cl->upvals, base, ra); /* create a new one */
+ else
+          setclLvalue(L, ra, ncl);  /* push cached closure */
+ checkGC(L, ra + 1);
+ )
+ vmcase(OP_VARARG,
+ int b = GETARG_B(i) - 1;
+ int j;
+ int n = cast_int(base - ci->func) - cl->p->numparams - 1;
+ if (b < 0) { /* B == 0? */
+ b = n; /* get all var. arguments */
+ Protect(luaD_checkstack(L, n));
+ ra = RA(i); /* previous call may change the stack */
+ L->top = ra + n;
+ }
+ for (j = 0; j < b; j++) {
+ if (j < n) {
+ setobjs2s(L, ra + j, base - n + j);
+ }
+ else {
+ setnilvalue(ra + j);
+ }
+ }
+ )
+ vmcase(OP_EXTRAARG,
+ lua_assert(0);
+ )
+ }
+ }
+}
+
diff --git a/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/lua/lvm.h b/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/lua/lvm.h
new file mode 100644
index 000000000000..5380270da63d
--- /dev/null
+++ b/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/lua/lvm.h
@@ -0,0 +1,44 @@
+/*
+** $Id: lvm.h,v 2.18.1.1 2013/04/12 18:48:47 roberto Exp $
+** Lua virtual machine
+** See Copyright Notice in lua.h
+*/
+
+#ifndef lvm_h
+#define lvm_h
+
+
+#include "ldo.h"
+#include "lobject.h"
+#include "ltm.h"
+
+
+#define tostring(L,o) (ttisstring(o) || (luaV_tostring(L, o)))
+
+#define tonumber(o,n) (ttisnumber(o) || (((o) = luaV_tonumber(o,n)) != NULL))
+
+#define equalobj(L,o1,o2) (ttisequal(o1, o2) && luaV_equalobj_(L, o1, o2))
+
+#define luaV_rawequalobj(o1,o2) equalobj(NULL,o1,o2)
+
+
+/* not to be called directly */
+LUAI_FUNC int luaV_equalobj_ (lua_State *L, const TValue *t1, const TValue *t2);
+
+
+LUAI_FUNC int luaV_lessthan (lua_State *L, const TValue *l, const TValue *r);
+LUAI_FUNC int luaV_lessequal (lua_State *L, const TValue *l, const TValue *r);
+LUAI_FUNC const TValue *luaV_tonumber (const TValue *obj, TValue *n);
+LUAI_FUNC int luaV_tostring (lua_State *L, StkId obj);
+LUAI_FUNC void luaV_gettable (lua_State *L, const TValue *t, TValue *key,
+ StkId val);
+LUAI_FUNC void luaV_settable (lua_State *L, const TValue *t, TValue *key,
+ StkId val);
+LUAI_FUNC void luaV_finishOp (lua_State *L);
+LUAI_FUNC void luaV_execute (lua_State *L);
+LUAI_FUNC void luaV_concat (lua_State *L, int total);
+LUAI_FUNC void luaV_arith (lua_State *L, StkId ra, const TValue *rb,
+ const TValue *rc, TMS op);
+LUAI_FUNC void luaV_objlen (lua_State *L, StkId ra, const TValue *rb);
+
+#endif
diff --git a/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/lua/lzio.c b/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/lua/lzio.c
new file mode 100644
index 000000000000..53e6a3daeb5a
--- /dev/null
+++ b/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/lua/lzio.c
@@ -0,0 +1,76 @@
+/*
+** $Id: lzio.c,v 1.35.1.1 2013/04/12 18:48:47 roberto Exp $
+** Buffered streams
+** See Copyright Notice in lua.h
+*/
+
+
+#include <sys/zfs_context.h>
+
+#define lzio_c
+#define LUA_CORE
+
+#include "lua.h"
+
+#include "llimits.h"
+#include "lmem.h"
+#include "lstate.h"
+#include "lzio.h"
+
+
+int luaZ_fill (ZIO *z) {
+ size_t size;
+ lua_State *L = z->L;
+ const char *buff;
+ lua_unlock(L);
+ buff = z->reader(L, z->data, &size);
+ lua_lock(L);
+ if (buff == NULL || size == 0)
+ return EOZ;
+ z->n = size - 1; /* discount char being returned */
+ z->p = buff;
+ return cast_uchar(*(z->p++));
+}
+
+
+void luaZ_init (lua_State *L, ZIO *z, lua_Reader reader, void *data) {
+ z->L = L;
+ z->reader = reader;
+ z->data = data;
+ z->n = 0;
+ z->p = NULL;
+}
+
+
+/* --------------------------------------------------------------- read --- */
+size_t luaZ_read (ZIO *z, void *b, size_t n) {
+ while (n) {
+ size_t m;
+ if (z->n == 0) { /* no bytes in buffer? */
+ if (luaZ_fill(z) == EOZ) /* try to read more */
+ return n; /* no more input; return number of missing bytes */
+ else {
+ z->n++; /* luaZ_fill consumed first byte; put it back */
+ z->p--;
+ }
+ }
+ m = (n <= z->n) ? n : z->n; /* min. between n and z->n */
+ memcpy(b, z->p, m);
+ z->n -= m;
+ z->p += m;
+ b = (char *)b + m;
+ n -= m;
+ }
+ return 0;
+}
+
+/* ------------------------------------------------------------------------ */
+char *luaZ_openspace (lua_State *L, Mbuffer *buff, size_t n) {
+ if (n > buff->buffsize) {
+ if (n < LUA_MINBUFFER) n = LUA_MINBUFFER;
+ luaZ_resizebuffer(L, buff, n);
+ }
+ return buff->buffer;
+}
+
+
diff --git a/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/lua/lzio.h b/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/lua/lzio.h
new file mode 100644
index 000000000000..441f7479cb14
--- /dev/null
+++ b/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/lua/lzio.h
@@ -0,0 +1,65 @@
+/*
+** $Id: lzio.h,v 1.26.1.1 2013/04/12 18:48:47 roberto Exp $
+** Buffered streams
+** See Copyright Notice in lua.h
+*/
+
+
+#ifndef lzio_h
+#define lzio_h
+
+#include "lua.h"
+
+#include "lmem.h"
+
+
+#define EOZ (-1) /* end of stream */
+
+typedef struct Zio ZIO;
+
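+/* zgetc consumes one buffered byte when available; on an empty buffer it
+** calls luaZ_fill, which refills from the reader and returns the first
+** new byte (or EOZ at end of stream) */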
+#define zgetc(z) (((z)->n--)>0 ? cast_uchar(*(z)->p++) : luaZ_fill(z))
+
+
+typedef struct Mbuffer {
+ char *buffer;
+ size_t n;
+ size_t buffsize;
+} Mbuffer;
+
+#define luaZ_initbuffer(L, buff) ((buff)->buffer = NULL, (buff)->buffsize = 0)
+
+#define luaZ_buffer(buff) ((buff)->buffer)
+#define luaZ_sizebuffer(buff) ((buff)->buffsize)
+#define luaZ_bufflen(buff) ((buff)->n)
+
+#define luaZ_resetbuffer(buff) ((buff)->n = 0)
+
+
+#define luaZ_resizebuffer(L, buff, size) \
+ (luaM_reallocvector(L, (buff)->buffer, (buff)->buffsize, size, char), \
+ (buff)->buffsize = size)
+
+#define luaZ_freebuffer(L, buff) luaZ_resizebuffer(L, buff, 0)
+
+
+LUAI_FUNC char *luaZ_openspace (lua_State *L, Mbuffer *buff, size_t n);
+LUAI_FUNC void luaZ_init (lua_State *L, ZIO *z, lua_Reader reader,
+ void *data);
+LUAI_FUNC size_t luaZ_read (ZIO* z, void* b, size_t n); /* read next n bytes */
+
+
+
+/* --------- Private Part ------------------ */
+
+struct Zio {
+ size_t n; /* bytes still unread */
+ const char *p; /* current position in buffer */
+ lua_Reader reader; /* reader function */
+ void* data; /* additional data */
+ lua_State *L; /* Lua state (for reader) */
+};
+
+
+LUAI_FUNC int luaZ_fill (ZIO *z);
+
+#endif
diff --git a/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/lzjb.c b/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/lzjb.c
new file mode 100644
index 000000000000..699373ad4d43
--- /dev/null
+++ b/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/lzjb.c
@@ -0,0 +1,129 @@
+/*
+ * CDDL HEADER START
+ *
+ * The contents of this file are subject to the terms of the
+ * Common Development and Distribution License (the "License").
+ * You may not use this file except in compliance with the License.
+ *
+ * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
+ * or http://www.opensolaris.org/os/licensing.
+ * See the License for the specific language governing permissions
+ * and limitations under the License.
+ *
+ * When distributing Covered Code, include this CDDL HEADER in each
+ * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
+ * If applicable, add the following below this CDDL HEADER, with the
+ * fields enclosed by brackets "[]" replaced with your own identifying
+ * information: Portions Copyright [yyyy] [name of copyright owner]
+ *
+ * CDDL HEADER END
+ */
+
+/*
+ * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved.
+ */
+
+/*
+ * We keep our own copy of this algorithm for 3 main reasons:
+ * 1. If we didn't, anyone modifying common/os/compress.c would
+ * directly break our on disk format
+ * 2. Our version of lzjb does not have a number of checks that the
+ * common/os version needs and uses
+ * 3. We initialize the lempel to ensure deterministic results,
+ * so that identical blocks can always be deduplicated.
+ * In particular, we are adding the "feature" that compress() can
+ * take a destination buffer size and return the compressed length, or the
+ * source length if compression would overflow the destination buffer.
+ */
+
+#include <sys/zfs_context.h>
+#include <sys/types.h>
+#include <sys/param.h>
+
+#define MATCH_BITS 6
+#define MATCH_MIN 3
+#define MATCH_MAX ((1 << MATCH_BITS) + (MATCH_MIN - 1))
+#define OFFSET_MASK ((1 << (16 - MATCH_BITS)) - 1)
+#define LEMPEL_SIZE 1024
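+
+/*
+ * Output layout: items are emitted in groups of NBBY (8), each group
+ * preceded by a copymap byte whose bits mark which items are matches.
+ * A match is a two-byte token: the high MATCH_BITS (6) bits of the first
+ * byte hold (mlen - MATCH_MIN) and the remaining 10 bits hold the
+ * backward offset, so matches cover 3..66 bytes at offsets up to 1023.
+ */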
+
+/*ARGSUSED*/
+size_t
+lzjb_compress(void *s_start, void *d_start, size_t s_len, size_t d_len, int n)
+{
+ uchar_t *src = s_start;
+ uchar_t *dst = d_start;
+ uchar_t *cpy;
+ uchar_t *copymap = NULL;
+ int copymask = 1 << (NBBY - 1);
+ int mlen, offset, hash;
+ uint16_t *hp;
+ uint16_t lempel[LEMPEL_SIZE] = { 0 };
+
+ while (src < (uchar_t *)s_start + s_len) {
+ if ((copymask <<= 1) == (1 << NBBY)) {
+ if (dst >= (uchar_t *)d_start + d_len - 1 - 2 * NBBY)
+ return (s_len);
+ copymask = 1;
+ copymap = dst;
+ *dst++ = 0;
+ }
+ if (src > (uchar_t *)s_start + s_len - MATCH_MAX) {
+ *dst++ = *src++;
+ continue;
+ }
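+		/*
+		 * Hash the next three input bytes to pick the lempel[] slot
+		 * that remembers the last position of this trigram.
+		 */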
+ hash = (src[0] << 16) + (src[1] << 8) + src[2];
+ hash += hash >> 9;
+ hash += hash >> 5;
+ hp = &lempel[hash & (LEMPEL_SIZE - 1)];
+ offset = (intptr_t)(src - *hp) & OFFSET_MASK;
+ *hp = (uint16_t)(uintptr_t)src;
+ cpy = src - offset;
+ if (cpy >= (uchar_t *)s_start && cpy != src &&
+ src[0] == cpy[0] && src[1] == cpy[1] && src[2] == cpy[2]) {
+ *copymap |= copymask;
+ for (mlen = MATCH_MIN; mlen < MATCH_MAX; mlen++)
+ if (src[mlen] != cpy[mlen])
+ break;
+ *dst++ = ((mlen - MATCH_MIN) << (NBBY - MATCH_BITS)) |
+ (offset >> NBBY);
+ *dst++ = (uchar_t)offset;
+ src += mlen;
+ } else {
+ *dst++ = *src++;
+ }
+ }
+ return (dst - (uchar_t *)d_start);
+}
+
+/*ARGSUSED*/
+int
+lzjb_decompress(void *s_start, void *d_start, size_t s_len, size_t d_len, int n)
+{
+ uchar_t *src = s_start;
+ uchar_t *dst = d_start;
+ uchar_t *d_end = (uchar_t *)d_start + d_len;
+ uchar_t *cpy;
+ uchar_t copymap = 0;
+ int copymask = 1 << (NBBY - 1);
+
+ while (dst < d_end) {
+ if ((copymask <<= 1) == (1 << NBBY)) {
+ copymask = 1;
+ copymap = *src++;
+ }
+ if (copymap & copymask) {
+ int mlen = (src[0] >> (NBBY - MATCH_BITS)) + MATCH_MIN;
+ int offset = ((src[0] << NBBY) | src[1]) & OFFSET_MASK;
+ src += 2;
+ if ((cpy = dst - offset) < (uchar_t *)d_start)
+ return (-1);
+ if (mlen > (d_end - dst))
+ mlen = d_end - dst;
+ while (--mlen >= 0)
+ *dst++ = *cpy++;
+ } else {
+ *dst++ = *src++;
+ }
+ }
+ return (0);
+}
diff --git a/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/metaslab.c b/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/metaslab.c
new file mode 100644
index 000000000000..6cd862baff30
--- /dev/null
+++ b/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/metaslab.c
@@ -0,0 +1,4624 @@
+/*
+ * CDDL HEADER START
+ *
+ * The contents of this file are subject to the terms of the
+ * Common Development and Distribution License (the "License").
+ * You may not use this file except in compliance with the License.
+ *
+ * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
+ * or http://www.opensolaris.org/os/licensing.
+ * See the License for the specific language governing permissions
+ * and limitations under the License.
+ *
+ * When distributing Covered Code, include this CDDL HEADER in each
+ * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
+ * If applicable, add the following below this CDDL HEADER, with the
+ * fields enclosed by brackets "[]" replaced with your own identifying
+ * information: Portions Copyright [yyyy] [name of copyright owner]
+ *
+ * CDDL HEADER END
+ */
+/*
+ * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved.
+ * Copyright (c) 2011, 2018 by Delphix. All rights reserved.
+ * Copyright (c) 2013 by Saso Kiselkov. All rights reserved.
+ * Copyright (c) 2014 Integros [integros.com]
+ * Copyright (c) 2017, Intel Corporation.
+ */
+
+#include <sys/zfs_context.h>
+#include <sys/dmu.h>
+#include <sys/dmu_tx.h>
+#include <sys/space_map.h>
+#include <sys/metaslab_impl.h>
+#include <sys/vdev_impl.h>
+#include <sys/zio.h>
+#include <sys/spa_impl.h>
+#include <sys/zfeature.h>
+#include <sys/vdev_indirect_mapping.h>
+#include <sys/zap.h>
+
+SYSCTL_DECL(_vfs_zfs);
+SYSCTL_NODE(_vfs_zfs, OID_AUTO, metaslab, CTLFLAG_RW | CTLFLAG_MPSAFE, 0,
+ "ZFS metaslab");
+
+#define GANG_ALLOCATION(flags) \
+ ((flags) & (METASLAB_GANG_CHILD | METASLAB_GANG_HEADER))
+
+uint64_t metaslab_aliquot = 512ULL << 10;
+uint64_t metaslab_force_ganging = SPA_MAXBLOCKSIZE + 1; /* force gang blocks */
+SYSCTL_QUAD(_vfs_zfs_metaslab, OID_AUTO, force_ganging, CTLFLAG_RWTUN,
+ &metaslab_force_ganging, 0,
+ "Force gang block allocation for blocks larger than or equal to this value");
+
+/*
+ * Since we can touch multiple metaslabs (and their respective space maps)
+ * with each transaction group, we benefit from having a smaller space map
+ * block size since it allows us to issue more I/O operations scattered
+ * around the disk.
+ */
+int zfs_metaslab_sm_blksz = (1 << 12);
+SYSCTL_INT(_vfs_zfs, OID_AUTO, metaslab_sm_blksz, CTLFLAG_RDTUN,
+ &zfs_metaslab_sm_blksz, 0,
+ "Block size for metaslab DTL space map. Power of 2 and greater than 4096.");
+
+/*
+ * The in-core space map representation is more compact than its on-disk form.
+ * The zfs_condense_pct determines how much more compact the in-core
+ * space map representation must be before we compact it on-disk.
+ * Values should be greater than or equal to 100.
+ */
+int zfs_condense_pct = 200;
+SYSCTL_INT(_vfs_zfs, OID_AUTO, condense_pct, CTLFLAG_RWTUN,
+ &zfs_condense_pct, 0,
+ "Condense on-disk spacemap when it is more than this many percents"
+ " of in-memory counterpart");
+
+/*
+ * Condensing a metaslab is not guaranteed to actually reduce the amount of
+ * space used on disk. In particular, a space map uses data in increments of
+ * MAX(1 << ashift, space_map_blksize), so a metaslab might use the
+ * same number of blocks after condensing. Since the goal of condensing is to
+ * reduce the number of IOPs required to read the space map, we only want to
+ * condense when we can be sure we will reduce the number of blocks used by the
+ * space map. Unfortunately, we cannot precisely compute whether or not this is
+ * the case in metaslab_should_condense since we are holding ms_lock. Instead,
+ * we apply the following heuristic: do not condense a spacemap unless the
+ * uncondensed size consumes greater than zfs_metaslab_condense_block_threshold
+ * blocks.
+ */
+int zfs_metaslab_condense_block_threshold = 4;
+
+/*
+ * The zfs_mg_noalloc_threshold defines which metaslab groups should
+ * be eligible for allocation. The value is defined as a percentage of
+ * free space. Metaslab groups that have more free space than
+ * zfs_mg_noalloc_threshold are always eligible for allocations. Once
+ * a metaslab group's free space is less than or equal to the
+ * zfs_mg_noalloc_threshold the allocator will avoid allocating to that
+ * group unless all groups in the pool have reached zfs_mg_noalloc_threshold.
+ * Once all groups in the pool reach zfs_mg_noalloc_threshold then all
+ * groups are allowed to accept allocations. Gang blocks are always
+ * eligible to allocate on any metaslab group. The default value of 0 means
+ * no metaslab group will be excluded based on this criterion.
+ */
+int zfs_mg_noalloc_threshold = 0;
+SYSCTL_INT(_vfs_zfs, OID_AUTO, mg_noalloc_threshold, CTLFLAG_RWTUN,
+ &zfs_mg_noalloc_threshold, 0,
+ "Percentage of metaslab group size that should be free"
+ " to make it eligible for allocation");
+
+/*
+ * Metaslab groups are considered eligible for allocations if their
+ * fragmentation metric (measured as a percentage) is less than or equal to
+ * zfs_mg_fragmentation_threshold. If a metaslab group exceeds this threshold
+ * then it will be skipped unless all metaslab groups within the metaslab
+ * class have also crossed this threshold.
+ */
+int zfs_mg_fragmentation_threshold = 85;
+SYSCTL_INT(_vfs_zfs, OID_AUTO, mg_fragmentation_threshold, CTLFLAG_RWTUN,
+ &zfs_mg_fragmentation_threshold, 0,
+ "Percentage of metaslab group size that should be considered "
+ "eligible for allocations unless all metaslab groups within the metaslab class "
+ "have also crossed this threshold");
+
+/*
+ * Allow metaslabs to keep their active state as long as their fragmentation
+ * percentage is less than or equal to zfs_metaslab_fragmentation_threshold. An
+ * active metaslab that exceeds this threshold will no longer keep its active
+ * status allowing better metaslabs to be selected.
+ */
+int zfs_metaslab_fragmentation_threshold = 70;
+SYSCTL_INT(_vfs_zfs_metaslab, OID_AUTO, fragmentation_threshold, CTLFLAG_RWTUN,
+ &zfs_metaslab_fragmentation_threshold, 0,
+ "Maximum percentage of metaslab fragmentation level to keep their active state");
+
+/*
+ * When set will load all metaslabs when pool is first opened.
+ */
+int metaslab_debug_load = 0;
+SYSCTL_INT(_vfs_zfs_metaslab, OID_AUTO, debug_load, CTLFLAG_RWTUN,
+ &metaslab_debug_load, 0,
+ "Load all metaslabs when pool is first opened");
+
+/*
+ * When set will prevent metaslabs from being unloaded.
+ */
+int metaslab_debug_unload = 0;
+SYSCTL_INT(_vfs_zfs_metaslab, OID_AUTO, debug_unload, CTLFLAG_RWTUN,
+ &metaslab_debug_unload, 0,
+ "Prevent metaslabs from being unloaded");
+
+/*
+ * Minimum size which forces the dynamic allocator to change
+ * its allocation strategy. Once the space map cannot satisfy
+ * an allocation of this size then it switches to using a more
+ * aggressive strategy (i.e. search by size rather than offset).
+ */
+uint64_t metaslab_df_alloc_threshold = SPA_OLD_MAXBLOCKSIZE;
+SYSCTL_QUAD(_vfs_zfs_metaslab, OID_AUTO, df_alloc_threshold, CTLFLAG_RWTUN,
+ &metaslab_df_alloc_threshold, 0,
+ "Minimum size which forces the dynamic allocator to change it's allocation strategy");
+
+/*
+ * The minimum free space, in percent, which must be available
+ * in a space map to continue allocations in a first-fit fashion.
+ * Once the space map's free space drops below this level we dynamically
+ * switch to using best-fit allocations.
+ */
+int metaslab_df_free_pct = 4;
+SYSCTL_INT(_vfs_zfs_metaslab, OID_AUTO, df_free_pct, CTLFLAG_RWTUN,
+ &metaslab_df_free_pct, 0,
+ "The minimum free space, in percent, which must be available in a "
+ "space map to continue allocations in a first-fit fashion");
+
+/*
+ * A metaslab is considered "free" if it contains a contiguous
+ * segment which is greater than metaslab_min_alloc_size.
+ */
+uint64_t metaslab_min_alloc_size = DMU_MAX_ACCESS;
+SYSCTL_QUAD(_vfs_zfs_metaslab, OID_AUTO, min_alloc_size, CTLFLAG_RWTUN,
+ &metaslab_min_alloc_size, 0,
+ "A metaslab is considered \"free\" if it contains a contiguous "
+ "segment which is greater than vfs.zfs.metaslab.min_alloc_size");
+
+/*
+ * Percentage of all cpus that can be used by the metaslab taskq.
+ */
+int metaslab_load_pct = 50;
+SYSCTL_INT(_vfs_zfs_metaslab, OID_AUTO, load_pct, CTLFLAG_RWTUN,
+ &metaslab_load_pct, 0,
+ "Percentage of cpus that can be used by the metaslab taskq");
+
+/*
+ * Determines how many txgs a metaslab may remain loaded without having any
+ * allocations from it. As long as a metaslab continues to be used we will
+ * keep it loaded.
+ */
+int metaslab_unload_delay = TXG_SIZE * 2;
+SYSCTL_INT(_vfs_zfs_metaslab, OID_AUTO, unload_delay, CTLFLAG_RWTUN,
+ &metaslab_unload_delay, 0,
+ "Number of TXGs that an unused metaslab can be kept in memory");
+
+/*
+ * Max number of metaslabs per group to preload.
+ */
+int metaslab_preload_limit = SPA_DVAS_PER_BP;
+SYSCTL_INT(_vfs_zfs_metaslab, OID_AUTO, preload_limit, CTLFLAG_RWTUN,
+ &metaslab_preload_limit, 0,
+ "Max number of metaslabs per group to preload");
+
+/*
+ * Enable/disable preloading of metaslabs.
+ */
+boolean_t metaslab_preload_enabled = B_TRUE;
+SYSCTL_INT(_vfs_zfs_metaslab, OID_AUTO, preload_enabled, CTLFLAG_RWTUN,
+ &metaslab_preload_enabled, 0,
+ "Max number of metaslabs per group to preload");
+
+/*
+ * Enable/disable fragmentation weighting on metaslabs.
+ */
+boolean_t metaslab_fragmentation_factor_enabled = B_TRUE;
+SYSCTL_INT(_vfs_zfs_metaslab, OID_AUTO, fragmentation_factor_enabled, CTLFLAG_RWTUN,
+ &metaslab_fragmentation_factor_enabled, 0,
+ "Enable fragmentation weighting on metaslabs");
+
+/*
+ * Enable/disable lba weighting (i.e. outer tracks are given preference).
+ */
+boolean_t metaslab_lba_weighting_enabled = B_TRUE;
+SYSCTL_INT(_vfs_zfs_metaslab, OID_AUTO, lba_weighting_enabled, CTLFLAG_RWTUN,
+ &metaslab_lba_weighting_enabled, 0,
+ "Enable LBA weighting (i.e. outer tracks are given preference)");
+
+/*
+ * Enable/disable metaslab group biasing.
+ */
+boolean_t metaslab_bias_enabled = B_TRUE;
+SYSCTL_INT(_vfs_zfs_metaslab, OID_AUTO, bias_enabled, CTLFLAG_RWTUN,
+ &metaslab_bias_enabled, 0,
+ "Enable metaslab group biasing");
+
+/*
+ * Enable/disable remapping of indirect DVAs to their concrete vdevs.
+ */
+boolean_t zfs_remap_blkptr_enable = B_TRUE;
+
+/*
+ * Enable/disable segment-based metaslab selection.
+ */
+boolean_t zfs_metaslab_segment_weight_enabled = B_TRUE;
+
+/*
+ * When using segment-based metaslab selection, we will continue
+ * allocating from the active metaslab until we have exhausted
+ * zfs_metaslab_switch_threshold of its buckets.
+ */
+int zfs_metaslab_switch_threshold = 2;
+
+/*
+ * Internal switch to enable/disable the metaslab allocation tracing
+ * facility.
+ */
+#ifdef _METASLAB_TRACING
+boolean_t metaslab_trace_enabled = B_TRUE;
+#endif
+
+/*
+ * Maximum entries that the metaslab allocation tracing facility will keep
+ * in a given list when running in non-debug mode. We limit the number
+ * of entries in non-debug mode to prevent us from using up too much memory.
+ * The limit should be sufficiently large that we don't expect any allocation
+ * to ever exceed this value. In debug mode, the system will panic if this
+ * limit is ever reached allowing for further investigation.
+ */
+#ifdef _METASLAB_TRACING
+uint64_t metaslab_trace_max_entries = 5000;
+#endif
+
+static uint64_t metaslab_weight(metaslab_t *);
+static void metaslab_set_fragmentation(metaslab_t *);
+static void metaslab_free_impl(vdev_t *, uint64_t, uint64_t, boolean_t);
+static void metaslab_check_free_impl(vdev_t *, uint64_t, uint64_t);
+static void metaslab_passivate(metaslab_t *msp, uint64_t weight);
+static uint64_t metaslab_weight_from_range_tree(metaslab_t *msp);
+#ifdef _METASLAB_TRACING
+kmem_cache_t *metaslab_alloc_trace_cache;
+#endif
+
+/*
+ * ==========================================================================
+ * Metaslab classes
+ * ==========================================================================
+ */
+metaslab_class_t *
+metaslab_class_create(spa_t *spa, metaslab_ops_t *ops)
+{
+ metaslab_class_t *mc;
+
+ mc = kmem_zalloc(sizeof (metaslab_class_t), KM_SLEEP);
+
+ mc->mc_spa = spa;
+ mc->mc_rotor = NULL;
+ mc->mc_ops = ops;
+ mutex_init(&mc->mc_lock, NULL, MUTEX_DEFAULT, NULL);
+ mc->mc_alloc_slots = kmem_zalloc(spa->spa_alloc_count *
+ sizeof (zfs_refcount_t), KM_SLEEP);
+ mc->mc_alloc_max_slots = kmem_zalloc(spa->spa_alloc_count *
+ sizeof (uint64_t), KM_SLEEP);
+ for (int i = 0; i < spa->spa_alloc_count; i++)
+ zfs_refcount_create_tracked(&mc->mc_alloc_slots[i]);
+
+ return (mc);
+}
+
+void
+metaslab_class_destroy(metaslab_class_t *mc)
+{
+ ASSERT(mc->mc_rotor == NULL);
+ ASSERT(mc->mc_alloc == 0);
+ ASSERT(mc->mc_deferred == 0);
+ ASSERT(mc->mc_space == 0);
+ ASSERT(mc->mc_dspace == 0);
+
+ for (int i = 0; i < mc->mc_spa->spa_alloc_count; i++)
+ zfs_refcount_destroy(&mc->mc_alloc_slots[i]);
+ kmem_free(mc->mc_alloc_slots, mc->mc_spa->spa_alloc_count *
+ sizeof (zfs_refcount_t));
+ kmem_free(mc->mc_alloc_max_slots, mc->mc_spa->spa_alloc_count *
+ sizeof (uint64_t));
+ mutex_destroy(&mc->mc_lock);
+ kmem_free(mc, sizeof (metaslab_class_t));
+}
+
+int
+metaslab_class_validate(metaslab_class_t *mc)
+{
+ metaslab_group_t *mg;
+ vdev_t *vd;
+
+ /*
+ * Must hold one of the spa_config locks.
+ */
+ ASSERT(spa_config_held(mc->mc_spa, SCL_ALL, RW_READER) ||
+ spa_config_held(mc->mc_spa, SCL_ALL, RW_WRITER));
+
+ if ((mg = mc->mc_rotor) == NULL)
+ return (0);
+
+ do {
+ vd = mg->mg_vd;
+ ASSERT(vd->vdev_mg != NULL);
+ ASSERT3P(vd->vdev_top, ==, vd);
+ ASSERT3P(mg->mg_class, ==, mc);
+ ASSERT3P(vd->vdev_ops, !=, &vdev_hole_ops);
+ } while ((mg = mg->mg_next) != mc->mc_rotor);
+
+ return (0);
+}
+
+static void
+metaslab_class_space_update(metaslab_class_t *mc, int64_t alloc_delta,
+ int64_t defer_delta, int64_t space_delta, int64_t dspace_delta)
+{
+ atomic_add_64(&mc->mc_alloc, alloc_delta);
+ atomic_add_64(&mc->mc_deferred, defer_delta);
+ atomic_add_64(&mc->mc_space, space_delta);
+ atomic_add_64(&mc->mc_dspace, dspace_delta);
+}
+
+void
+metaslab_class_minblocksize_update(metaslab_class_t *mc)
+{
+ metaslab_group_t *mg;
+ vdev_t *vd;
+ uint64_t minashift = UINT64_MAX;
+
+ if ((mg = mc->mc_rotor) == NULL) {
+ mc->mc_minblocksize = SPA_MINBLOCKSIZE;
+ return;
+ }
+
+ do {
+ vd = mg->mg_vd;
+ if (vd->vdev_ashift < minashift)
+ minashift = vd->vdev_ashift;
+ } while ((mg = mg->mg_next) != mc->mc_rotor);
+
+ mc->mc_minblocksize = 1ULL << minashift;
+}
+
+uint64_t
+metaslab_class_get_alloc(metaslab_class_t *mc)
+{
+ return (mc->mc_alloc);
+}
+
+uint64_t
+metaslab_class_get_deferred(metaslab_class_t *mc)
+{
+ return (mc->mc_deferred);
+}
+
+uint64_t
+metaslab_class_get_space(metaslab_class_t *mc)
+{
+ return (mc->mc_space);
+}
+
+uint64_t
+metaslab_class_get_dspace(metaslab_class_t *mc)
+{
+ return (spa_deflate(mc->mc_spa) ? mc->mc_dspace : mc->mc_space);
+}
+
+uint64_t
+metaslab_class_get_minblocksize(metaslab_class_t *mc)
+{
+ return (mc->mc_minblocksize);
+}
+
+void
+metaslab_class_histogram_verify(metaslab_class_t *mc)
+{
+ spa_t *spa = mc->mc_spa;
+ vdev_t *rvd = spa->spa_root_vdev;
+ uint64_t *mc_hist;
+ int i;
+
+ if ((zfs_flags & ZFS_DEBUG_HISTOGRAM_VERIFY) == 0)
+ return;
+
+ mc_hist = kmem_zalloc(sizeof (uint64_t) * RANGE_TREE_HISTOGRAM_SIZE,
+ KM_SLEEP);
+
+ for (int c = 0; c < rvd->vdev_children; c++) {
+ vdev_t *tvd = rvd->vdev_child[c];
+ metaslab_group_t *mg = tvd->vdev_mg;
+
+ /*
+ * Skip any holes, uninitialized top-levels, or
+		 * vdevs that are not in this metaslab class.
+ */
+ if (!vdev_is_concrete(tvd) || tvd->vdev_ms_shift == 0 ||
+ mg->mg_class != mc) {
+ continue;
+ }
+
+ for (i = 0; i < RANGE_TREE_HISTOGRAM_SIZE; i++)
+ mc_hist[i] += mg->mg_histogram[i];
+ }
+
+ for (i = 0; i < RANGE_TREE_HISTOGRAM_SIZE; i++)
+ VERIFY3U(mc_hist[i], ==, mc->mc_histogram[i]);
+
+ kmem_free(mc_hist, sizeof (uint64_t) * RANGE_TREE_HISTOGRAM_SIZE);
+}
+
+/*
+ * Calculate the metaslab class's fragmentation metric. The metric
+ * is weighted based on the space contribution of each metaslab group.
+ * The return value will be a number between 0 and 100 (inclusive), or
+ * ZFS_FRAG_INVALID if the metric has not been set. See comment above the
+ * zfs_frag_table for more information about the metric.
+ */
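+/*
+ * Concretely, the weighted average computed below is
+ *   sum(mg_fragmentation * metaslab_group_get_space(mg)) /
+ *       metaslab_class_get_space(mc)
+ * so larger metaslab groups pull the class metric toward their own value.
+ */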
+uint64_t
+metaslab_class_fragmentation(metaslab_class_t *mc)
+{
+ vdev_t *rvd = mc->mc_spa->spa_root_vdev;
+ uint64_t fragmentation = 0;
+
+ spa_config_enter(mc->mc_spa, SCL_VDEV, FTAG, RW_READER);
+
+ for (int c = 0; c < rvd->vdev_children; c++) {
+ vdev_t *tvd = rvd->vdev_child[c];
+ metaslab_group_t *mg = tvd->vdev_mg;
+
+ /*
+ * Skip any holes, uninitialized top-levels,
+		 * or vdevs that are not in this metaslab class.
+ */
+ if (!vdev_is_concrete(tvd) || tvd->vdev_ms_shift == 0 ||
+ mg->mg_class != mc) {
+ continue;
+ }
+
+ /*
+ * If a metaslab group does not contain a fragmentation
+ * metric then just bail out.
+ */
+ if (mg->mg_fragmentation == ZFS_FRAG_INVALID) {
+ spa_config_exit(mc->mc_spa, SCL_VDEV, FTAG);
+ return (ZFS_FRAG_INVALID);
+ }
+
+ /*
+ * Determine how much this metaslab_group is contributing
+ * to the overall pool fragmentation metric.
+ */
+ fragmentation += mg->mg_fragmentation *
+ metaslab_group_get_space(mg);
+ }
+ fragmentation /= metaslab_class_get_space(mc);
+
+ ASSERT3U(fragmentation, <=, 100);
+ spa_config_exit(mc->mc_spa, SCL_VDEV, FTAG);
+ return (fragmentation);
+}
+
+/*
+ * Calculate the amount of expandable space that is available in
+ * this metaslab class. If a device is expanded then its expandable
+ * space will be the amount of allocatable space that is currently not
+ * part of this metaslab class.
+ */
+uint64_t
+metaslab_class_expandable_space(metaslab_class_t *mc)
+{
+ vdev_t *rvd = mc->mc_spa->spa_root_vdev;
+ uint64_t space = 0;
+
+ spa_config_enter(mc->mc_spa, SCL_VDEV, FTAG, RW_READER);
+ for (int c = 0; c < rvd->vdev_children; c++) {
+ uint64_t tspace;
+ vdev_t *tvd = rvd->vdev_child[c];
+ metaslab_group_t *mg = tvd->vdev_mg;
+
+ if (!vdev_is_concrete(tvd) || tvd->vdev_ms_shift == 0 ||
+ mg->mg_class != mc) {
+ continue;
+ }
+
+ /*
+ * Calculate if we have enough space to add additional
+ * metaslabs. We report the expandable space in terms
+ * of the metaslab size since that's the unit of expansion.
+ * Adjust by efi system partition size.
+ */
+ tspace = tvd->vdev_max_asize - tvd->vdev_asize;
+ if (tspace > mc->mc_spa->spa_bootsize) {
+ tspace -= mc->mc_spa->spa_bootsize;
+ }
+ space += P2ALIGN(tspace, 1ULL << tvd->vdev_ms_shift);
+ }
+ spa_config_exit(mc->mc_spa, SCL_VDEV, FTAG);
+ return (space);
+}
+
+static int
+metaslab_compare(const void *x1, const void *x2)
+{
+ const metaslab_t *m1 = (const metaslab_t *)x1;
+ const metaslab_t *m2 = (const metaslab_t *)x2;
+
+ int sort1 = 0;
+ int sort2 = 0;
+ if (m1->ms_allocator != -1 && m1->ms_primary)
+ sort1 = 1;
+ else if (m1->ms_allocator != -1 && !m1->ms_primary)
+ sort1 = 2;
+ if (m2->ms_allocator != -1 && m2->ms_primary)
+ sort2 = 1;
+ else if (m2->ms_allocator != -1 && !m2->ms_primary)
+ sort2 = 2;
+
+ /*
+ * Sort inactive metaslabs first, then primaries, then secondaries. When
+ * selecting a metaslab to allocate from, an allocator first tries its
+ * primary, then secondary active metaslab. If it doesn't have active
+ * metaslabs, or can't allocate from them, it searches for an inactive
+ * metaslab to activate. If it can't find a suitable one, it will steal
+ * a primary or secondary metaslab from another allocator.
+ */
+ if (sort1 < sort2)
+ return (-1);
+ if (sort1 > sort2)
+ return (1);
+
+ int cmp = AVL_CMP(m2->ms_weight, m1->ms_weight);
+ if (likely(cmp))
+ return (cmp);
+
+ IMPLY(AVL_CMP(m1->ms_start, m2->ms_start) == 0, m1 == m2);
+
+ return (AVL_CMP(m1->ms_start, m2->ms_start));
+}
+
+uint64_t
+metaslab_allocated_space(metaslab_t *msp)
+{
+ return (msp->ms_allocated_space);
+}
+
+/*
+ * Verify that the space accounting on disk matches the in-core range_trees.
+ */
+static void
+metaslab_verify_space(metaslab_t *msp, uint64_t txg)
+{
+ spa_t *spa = msp->ms_group->mg_vd->vdev_spa;
+ uint64_t allocating = 0;
+ uint64_t sm_free_space, msp_free_space;
+
+ ASSERT(MUTEX_HELD(&msp->ms_lock));
+ ASSERT(!msp->ms_condensing);
+
+ if ((zfs_flags & ZFS_DEBUG_METASLAB_VERIFY) == 0)
+ return;
+
+ /*
+ * We can only verify the metaslab space when we're called
+ * from syncing context with a loaded metaslab that has an
+ * allocated space map. Calling this in non-syncing context
+ * does not provide a consistent view of the metaslab since
+ * we're performing allocations in the future.
+ */
+ if (txg != spa_syncing_txg(spa) || msp->ms_sm == NULL ||
+ !msp->ms_loaded)
+ return;
+
+ /*
+ * Even though the smp_alloc field can get negative (e.g.
+ * see vdev_checkpoint_sm), that should never be the case
+	 * when it comes to a metaslab's space map.
+ */
+ ASSERT3S(space_map_allocated(msp->ms_sm), >=, 0);
+
+ sm_free_space = msp->ms_size - metaslab_allocated_space(msp);
+
+ /*
+ * Account for future allocations since we would have
+ * already deducted that space from the ms_allocatable.
+ */
+ for (int t = 0; t < TXG_CONCURRENT_STATES; t++) {
+ allocating +=
+ range_tree_space(msp->ms_allocating[(txg + t) & TXG_MASK]);
+ }
+
+ ASSERT3U(msp->ms_deferspace, ==,
+ range_tree_space(msp->ms_defer[0]) +
+ range_tree_space(msp->ms_defer[1]));
+
+ msp_free_space = range_tree_space(msp->ms_allocatable) + allocating +
+ msp->ms_deferspace + range_tree_space(msp->ms_freed);
+
+ VERIFY3U(sm_free_space, ==, msp_free_space);
+}
+
+/*
+ * ==========================================================================
+ * Metaslab groups
+ * ==========================================================================
+ */
+/*
+ * Update the allocatable flag and the metaslab group's capacity.
+ * The allocatable flag is set to true if the capacity is below
+ * the zfs_mg_noalloc_threshold or has a fragmentation value that is
+ * greater than zfs_mg_fragmentation_threshold. If a metaslab group
+ * transitions from allocatable to non-allocatable or vice versa then the
+ * metaslab group's class is updated to reflect the transition.
+ */
+static void
+metaslab_group_alloc_update(metaslab_group_t *mg)
+{
+ vdev_t *vd = mg->mg_vd;
+ metaslab_class_t *mc = mg->mg_class;
+ vdev_stat_t *vs = &vd->vdev_stat;
+ boolean_t was_allocatable;
+ boolean_t was_initialized;
+
+ ASSERT(vd == vd->vdev_top);
+ ASSERT3U(spa_config_held(mc->mc_spa, SCL_ALLOC, RW_READER), ==,
+ SCL_ALLOC);
+
+ mutex_enter(&mg->mg_lock);
+ was_allocatable = mg->mg_allocatable;
+ was_initialized = mg->mg_initialized;
+
+ mg->mg_free_capacity = ((vs->vs_space - vs->vs_alloc) * 100) /
+ (vs->vs_space + 1);
+
+ mutex_enter(&mc->mc_lock);
+
+ /*
+ * If the metaslab group was just added then it won't
+ * have any space until we finish syncing out this txg.
+ * At that point we will consider it initialized and available
+ * for allocations. We also don't consider non-activated
+ * metaslab groups (e.g. vdevs that are in the middle of being removed)
+ * to be initialized, because they can't be used for allocation.
+ */
+ mg->mg_initialized = metaslab_group_initialized(mg);
+ if (!was_initialized && mg->mg_initialized) {
+ mc->mc_groups++;
+ } else if (was_initialized && !mg->mg_initialized) {
+ ASSERT3U(mc->mc_groups, >, 0);
+ mc->mc_groups--;
+ }
+ if (mg->mg_initialized)
+ mg->mg_no_free_space = B_FALSE;
+
+ /*
+ * A metaslab group is considered allocatable if it has plenty
+ * of free space or is not heavily fragmented. We only take
+ * fragmentation into account if the metaslab group has a valid
+ * fragmentation metric (i.e. a value between 0 and 100).
+ */
+ mg->mg_allocatable = (mg->mg_activation_count > 0 &&
+ mg->mg_free_capacity > zfs_mg_noalloc_threshold &&
+ (mg->mg_fragmentation == ZFS_FRAG_INVALID ||
+ mg->mg_fragmentation <= zfs_mg_fragmentation_threshold));
+
+ /*
+ * The mc_alloc_groups maintains a count of the number of
+ * groups in this metaslab class that are still above the
+ * zfs_mg_noalloc_threshold. This is used by the allocating
+ * threads to determine if they should avoid allocations to
+ * a given group. The allocator will avoid allocations to a group
+ * if that group has reached or is below the zfs_mg_noalloc_threshold
+ * and there are still other groups that are above the threshold.
+ * When a group transitions from allocatable to non-allocatable or
+ * vice versa we update the metaslab class to reflect that change.
+ * When the mc_alloc_groups value drops to 0 that means that all
+ * groups have reached the zfs_mg_noalloc_threshold making all groups
+ * eligible for allocations. This effectively means that all devices
+ * are balanced again.
+ */
+ if (was_allocatable && !mg->mg_allocatable)
+ mc->mc_alloc_groups--;
+ else if (!was_allocatable && mg->mg_allocatable)
+ mc->mc_alloc_groups++;
+ mutex_exit(&mc->mc_lock);
+
+ mutex_exit(&mg->mg_lock);
+}
+
+metaslab_group_t *
+metaslab_group_create(metaslab_class_t *mc, vdev_t *vd, int allocators)
+{
+ metaslab_group_t *mg;
+
+ mg = kmem_zalloc(sizeof (metaslab_group_t), KM_SLEEP);
+ mutex_init(&mg->mg_lock, NULL, MUTEX_DEFAULT, NULL);
+ mutex_init(&mg->mg_ms_initialize_lock, NULL, MUTEX_DEFAULT, NULL);
+ cv_init(&mg->mg_ms_initialize_cv, NULL, CV_DEFAULT, NULL);
+ mg->mg_primaries = kmem_zalloc(allocators * sizeof (metaslab_t *),
+ KM_SLEEP);
+ mg->mg_secondaries = kmem_zalloc(allocators * sizeof (metaslab_t *),
+ KM_SLEEP);
+ avl_create(&mg->mg_metaslab_tree, metaslab_compare,
+ sizeof (metaslab_t), offsetof(struct metaslab, ms_group_node));
+ mg->mg_vd = vd;
+ mg->mg_class = mc;
+ mg->mg_activation_count = 0;
+ mg->mg_initialized = B_FALSE;
+ mg->mg_no_free_space = B_TRUE;
+ mg->mg_allocators = allocators;
+
+ mg->mg_alloc_queue_depth = kmem_zalloc(allocators *
+ sizeof (zfs_refcount_t), KM_SLEEP);
+ mg->mg_cur_max_alloc_queue_depth = kmem_zalloc(allocators *
+ sizeof (uint64_t), KM_SLEEP);
+ for (int i = 0; i < allocators; i++) {
+ zfs_refcount_create_tracked(&mg->mg_alloc_queue_depth[i]);
+ mg->mg_cur_max_alloc_queue_depth[i] = 0;
+ }
+
+ mg->mg_taskq = taskq_create("metaslab_group_taskq", metaslab_load_pct,
+ minclsyspri, 10, INT_MAX, TASKQ_THREADS_CPU_PCT);
+
+ return (mg);
+}
+
+void
+metaslab_group_destroy(metaslab_group_t *mg)
+{
+ ASSERT(mg->mg_prev == NULL);
+ ASSERT(mg->mg_next == NULL);
+ /*
+ * We may have gone below zero with the activation count
+ * either because we never activated in the first place or
+ * because we're done, and possibly removing the vdev.
+ */
+ ASSERT(mg->mg_activation_count <= 0);
+
+ taskq_destroy(mg->mg_taskq);
+ avl_destroy(&mg->mg_metaslab_tree);
+ kmem_free(mg->mg_primaries, mg->mg_allocators * sizeof (metaslab_t *));
+ kmem_free(mg->mg_secondaries, mg->mg_allocators *
+ sizeof (metaslab_t *));
+ mutex_destroy(&mg->mg_lock);
+ mutex_destroy(&mg->mg_ms_initialize_lock);
+ cv_destroy(&mg->mg_ms_initialize_cv);
+
+ for (int i = 0; i < mg->mg_allocators; i++) {
+ zfs_refcount_destroy(&mg->mg_alloc_queue_depth[i]);
+ mg->mg_cur_max_alloc_queue_depth[i] = 0;
+ }
+ kmem_free(mg->mg_alloc_queue_depth, mg->mg_allocators *
+ sizeof (zfs_refcount_t));
+ kmem_free(mg->mg_cur_max_alloc_queue_depth, mg->mg_allocators *
+ sizeof (uint64_t));
+
+ kmem_free(mg, sizeof (metaslab_group_t));
+}
+
+void
+metaslab_group_activate(metaslab_group_t *mg)
+{
+ metaslab_class_t *mc = mg->mg_class;
+ metaslab_group_t *mgprev, *mgnext;
+
+ ASSERT3U(spa_config_held(mc->mc_spa, SCL_ALLOC, RW_WRITER), !=, 0);
+
+ ASSERT(mc->mc_rotor != mg);
+ ASSERT(mg->mg_prev == NULL);
+ ASSERT(mg->mg_next == NULL);
+ ASSERT(mg->mg_activation_count <= 0);
+
+ if (++mg->mg_activation_count <= 0)
+ return;
+
+ mg->mg_aliquot = metaslab_aliquot * MAX(1, mg->mg_vd->vdev_children);
+ metaslab_group_alloc_update(mg);
+
+ if ((mgprev = mc->mc_rotor) == NULL) {
+ mg->mg_prev = mg;
+ mg->mg_next = mg;
+ } else {
+ mgnext = mgprev->mg_next;
+ mg->mg_prev = mgprev;
+ mg->mg_next = mgnext;
+ mgprev->mg_next = mg;
+ mgnext->mg_prev = mg;
+ }
+ mc->mc_rotor = mg;
+ metaslab_class_minblocksize_update(mc);
+}
+
+/*
+ * Passivate a metaslab group and remove it from the allocation rotor.
+ * Callers must hold both the SCL_ALLOC and SCL_ZIO lock prior to passivating
+ * a metaslab group. This function will momentarily drop spa_config_locks
+ * that are lower than the SCL_ALLOC lock (see comment below).
+ */
+void
+metaslab_group_passivate(metaslab_group_t *mg)
+{
+ metaslab_class_t *mc = mg->mg_class;
+ spa_t *spa = mc->mc_spa;
+ metaslab_group_t *mgprev, *mgnext;
+ int locks = spa_config_held(spa, SCL_ALL, RW_WRITER);
+
+ ASSERT3U(spa_config_held(spa, SCL_ALLOC | SCL_ZIO, RW_WRITER), ==,
+ (SCL_ALLOC | SCL_ZIO));
+
+ if (--mg->mg_activation_count != 0) {
+ ASSERT(mc->mc_rotor != mg);
+ ASSERT(mg->mg_prev == NULL);
+ ASSERT(mg->mg_next == NULL);
+ ASSERT(mg->mg_activation_count < 0);
+ return;
+ }
+
+ /*
+ * The spa_config_lock is an array of rwlocks, ordered as
+ * follows (from highest to lowest):
+ * SCL_CONFIG > SCL_STATE > SCL_L2ARC > SCL_ALLOC >
+ * SCL_ZIO > SCL_FREE > SCL_VDEV
+ * (For more information about the spa_config_lock see spa_misc.c)
+ * The higher the lock, the broader its coverage. When we passivate
+ * a metaslab group, we must hold both the SCL_ALLOC and the SCL_ZIO
+ * config locks. However, the metaslab group's taskq might be trying
+ * to preload metaslabs so we must drop the SCL_ZIO lock and any
+ * lower locks to allow the I/O to complete. At a minimum,
+ * we continue to hold the SCL_ALLOC lock, which prevents any future
+ * allocations from taking place and any changes to the vdev tree.
+ */
+ spa_config_exit(spa, locks & ~(SCL_ZIO - 1), spa);
+ taskq_wait(mg->mg_taskq);
+ spa_config_enter(spa, locks & ~(SCL_ZIO - 1), spa, RW_WRITER);
+ metaslab_group_alloc_update(mg);
+ for (int i = 0; i < mg->mg_allocators; i++) {
+ metaslab_t *msp = mg->mg_primaries[i];
+ if (msp != NULL) {
+ mutex_enter(&msp->ms_lock);
+ metaslab_passivate(msp,
+ metaslab_weight_from_range_tree(msp));
+ mutex_exit(&msp->ms_lock);
+ }
+ msp = mg->mg_secondaries[i];
+ if (msp != NULL) {
+ mutex_enter(&msp->ms_lock);
+ metaslab_passivate(msp,
+ metaslab_weight_from_range_tree(msp));
+ mutex_exit(&msp->ms_lock);
+ }
+ }
+
+ mgprev = mg->mg_prev;
+ mgnext = mg->mg_next;
+
+ if (mg == mgnext) {
+ mc->mc_rotor = NULL;
+ } else {
+ mc->mc_rotor = mgnext;
+ mgprev->mg_next = mgnext;
+ mgnext->mg_prev = mgprev;
+ }
+
+ mg->mg_prev = NULL;
+ mg->mg_next = NULL;
+ metaslab_class_minblocksize_update(mc);
+}
+
+boolean_t
+metaslab_group_initialized(metaslab_group_t *mg)
+{
+ vdev_t *vd = mg->mg_vd;
+ vdev_stat_t *vs = &vd->vdev_stat;
+
+ return (vs->vs_space != 0 && mg->mg_activation_count > 0);
+}
+
+uint64_t
+metaslab_group_get_space(metaslab_group_t *mg)
+{
+ return ((1ULL << mg->mg_vd->vdev_ms_shift) * mg->mg_vd->vdev_ms_count);
+}
+
+void
+metaslab_group_histogram_verify(metaslab_group_t *mg)
+{
+ uint64_t *mg_hist;
+ vdev_t *vd = mg->mg_vd;
+ uint64_t ashift = vd->vdev_ashift;
+ int i;
+
+ if ((zfs_flags & ZFS_DEBUG_HISTOGRAM_VERIFY) == 0)
+ return;
+
+ mg_hist = kmem_zalloc(sizeof (uint64_t) * RANGE_TREE_HISTOGRAM_SIZE,
+ KM_SLEEP);
+
+ ASSERT3U(RANGE_TREE_HISTOGRAM_SIZE, >=,
+ SPACE_MAP_HISTOGRAM_SIZE + ashift);
+
+ for (int m = 0; m < vd->vdev_ms_count; m++) {
+ metaslab_t *msp = vd->vdev_ms[m];
+ ASSERT(msp != NULL);
+
+ /* skip if not active or not a member */
+ if (msp->ms_sm == NULL || msp->ms_group != mg)
+ continue;
+
+ for (i = 0; i < SPACE_MAP_HISTOGRAM_SIZE; i++)
+ mg_hist[i + ashift] +=
+ msp->ms_sm->sm_phys->smp_histogram[i];
+ }
+
+ for (i = 0; i < RANGE_TREE_HISTOGRAM_SIZE; i++)
+ VERIFY3U(mg_hist[i], ==, mg->mg_histogram[i]);
+
+ kmem_free(mg_hist, sizeof (uint64_t) * RANGE_TREE_HISTOGRAM_SIZE);
+}
+
+static void
+metaslab_group_histogram_add(metaslab_group_t *mg, metaslab_t *msp)
+{
+ metaslab_class_t *mc = mg->mg_class;
+ uint64_t ashift = mg->mg_vd->vdev_ashift;
+
+ ASSERT(MUTEX_HELD(&msp->ms_lock));
+ if (msp->ms_sm == NULL)
+ return;
+
+ mutex_enter(&mg->mg_lock);
+ for (int i = 0; i < SPACE_MAP_HISTOGRAM_SIZE; i++) {
+ mg->mg_histogram[i + ashift] +=
+ msp->ms_sm->sm_phys->smp_histogram[i];
+ mc->mc_histogram[i + ashift] +=
+ msp->ms_sm->sm_phys->smp_histogram[i];
+ }
+ mutex_exit(&mg->mg_lock);
+}
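+
+/*
+ * Worked example for the histogram offset above (hypothetical numbers,
+ * assuming sm_shift == vdev_ashift == 9): space map histogram bucket 0
+ * counts free segments in [512B, 1KB), so a metaslab with
+ * smp_histogram[0] == 4 contributes 4 to mg_histogram[9]. The +ashift
+ * offset lines the group and class buckets up with absolute
+ * power-of-two sizes (2^9 == 512B) across vdevs of differing ashift.
+ */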
+
+void
+metaslab_group_histogram_remove(metaslab_group_t *mg, metaslab_t *msp)
+{
+ metaslab_class_t *mc = mg->mg_class;
+ uint64_t ashift = mg->mg_vd->vdev_ashift;
+
+ ASSERT(MUTEX_HELD(&msp->ms_lock));
+ if (msp->ms_sm == NULL)
+ return;
+
+ mutex_enter(&mg->mg_lock);
+ for (int i = 0; i < SPACE_MAP_HISTOGRAM_SIZE; i++) {
+ ASSERT3U(mg->mg_histogram[i + ashift], >=,
+ msp->ms_sm->sm_phys->smp_histogram[i]);
+ ASSERT3U(mc->mc_histogram[i + ashift], >=,
+ msp->ms_sm->sm_phys->smp_histogram[i]);
+
+ mg->mg_histogram[i + ashift] -=
+ msp->ms_sm->sm_phys->smp_histogram[i];
+ mc->mc_histogram[i + ashift] -=
+ msp->ms_sm->sm_phys->smp_histogram[i];
+ }
+ mutex_exit(&mg->mg_lock);
+}
+
+static void
+metaslab_group_add(metaslab_group_t *mg, metaslab_t *msp)
+{
+ ASSERT(msp->ms_group == NULL);
+ mutex_enter(&mg->mg_lock);
+ msp->ms_group = mg;
+ msp->ms_weight = 0;
+ avl_add(&mg->mg_metaslab_tree, msp);
+ mutex_exit(&mg->mg_lock);
+
+ mutex_enter(&msp->ms_lock);
+ metaslab_group_histogram_add(mg, msp);
+ mutex_exit(&msp->ms_lock);
+}
+
+static void
+metaslab_group_remove(metaslab_group_t *mg, metaslab_t *msp)
+{
+ mutex_enter(&msp->ms_lock);
+ metaslab_group_histogram_remove(mg, msp);
+ mutex_exit(&msp->ms_lock);
+
+ mutex_enter(&mg->mg_lock);
+ ASSERT(msp->ms_group == mg);
+ avl_remove(&mg->mg_metaslab_tree, msp);
+ msp->ms_group = NULL;
+ mutex_exit(&mg->mg_lock);
+}
+
+static void
+metaslab_group_sort_impl(metaslab_group_t *mg, metaslab_t *msp, uint64_t weight)
+{
+ ASSERT(MUTEX_HELD(&mg->mg_lock));
+ ASSERT(msp->ms_group == mg);
+ avl_remove(&mg->mg_metaslab_tree, msp);
+ msp->ms_weight = weight;
+ avl_add(&mg->mg_metaslab_tree, msp);
+}
+
+static void
+metaslab_group_sort(metaslab_group_t *mg, metaslab_t *msp, uint64_t weight)
+{
+ /*
+ * Although in principle the weight can be any value, in
+ * practice we do not use values in the range [1, 511].
+ */
+ ASSERT(weight >= SPA_MINBLOCKSIZE || weight == 0);
+ ASSERT(MUTEX_HELD(&msp->ms_lock));
+
+ mutex_enter(&mg->mg_lock);
+ metaslab_group_sort_impl(mg, msp, weight);
+ mutex_exit(&mg->mg_lock);
+}
+
+/*
+ * Calculate the fragmentation for a given metaslab group. We can use
+ * a simple average here since all metaslabs within the group must have
+ * the same size. The return value will be a value between 0 and 100
+ * (inclusive), or ZFS_FRAG_INVALID if less than half of the metaslabs in this
+ * group have a fragmentation metric.
+ */
+uint64_t
+metaslab_group_fragmentation(metaslab_group_t *mg)
+{
+ vdev_t *vd = mg->mg_vd;
+ uint64_t fragmentation = 0;
+ uint64_t valid_ms = 0;
+
+ for (int m = 0; m < vd->vdev_ms_count; m++) {
+ metaslab_t *msp = vd->vdev_ms[m];
+
+ if (msp->ms_fragmentation == ZFS_FRAG_INVALID)
+ continue;
+ if (msp->ms_group != mg)
+ continue;
+
+ valid_ms++;
+ fragmentation += msp->ms_fragmentation;
+ }
+
+ if (valid_ms <= mg->mg_vd->vdev_ms_count / 2)
+ return (ZFS_FRAG_INVALID);
+
+ fragmentation /= valid_ms;
+ ASSERT3U(fragmentation, <=, 100);
+ return (fragmentation);
+}
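+
+/*
+ * Worked example (hypothetical numbers): a group whose vdev has 6
+ * metaslabs, 4 of them with fragmentation 10, 20, 30 and 40, averages
+ * to (10 + 20 + 30 + 40) / 4 = 25. Had only 3 of the 6 carried a valid
+ * metric (3 <= 6 / 2), the group would report ZFS_FRAG_INVALID instead.
+ */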
+
+/*
+ * Determine if a given metaslab group should skip allocations. A metaslab
+ * group should avoid allocations if its free capacity is less than the
+ * zfs_mg_noalloc_threshold or its fragmentation metric is greater than
+ * zfs_mg_fragmentation_threshold and there is at least one metaslab group
+ * that can still handle allocations. If the allocation throttle is enabled
+ * then we skip allocations to devices that have reached their maximum
+ * allocation queue depth unless the selected metaslab group is the only
+ * eligible group remaining.
+ */
+static boolean_t
+metaslab_group_allocatable(metaslab_group_t *mg, metaslab_group_t *rotor,
+ uint64_t psize, int allocator, int d)
+{
+ spa_t *spa = mg->mg_vd->vdev_spa;
+ metaslab_class_t *mc = mg->mg_class;
+
+ /*
+ * We can only consider skipping this metaslab group if it's
+ * in one of the primary metaslab classes (normal, special, or
+ * dedup) and there are other metaslab groups to select from.
+ * Otherwise, we always consider it eligible for allocations.
+ */
+ if ((mc != spa_normal_class(spa) &&
+ mc != spa_special_class(spa) &&
+ mc != spa_dedup_class(spa)) ||
+ mc->mc_groups <= 1)
+ return (B_TRUE);
+
+ /*
+ * If the metaslab group's mg_allocatable flag is set (see comments
+ * in metaslab_group_alloc_update() for more information) and
+ * the allocation throttle is disabled then allow allocations to this
+ * device. However, if the allocation throttle is enabled then
+ * check if we have reached our allocation limit (mg_alloc_queue_depth)
+ * to determine if we should allow allocations to this metaslab group.
+ * If all metaslab groups are no longer considered allocatable
+ * (mc_alloc_groups == 0) or we're trying to allocate the smallest
+ * gang block size then we allow allocations on this metaslab group
+ * regardless of the mg_allocatable or throttle settings.
+ */
+ if (mg->mg_allocatable) {
+ metaslab_group_t *mgp;
+ int64_t qdepth;
+ uint64_t qmax = mg->mg_cur_max_alloc_queue_depth[allocator];
+
+ if (!mc->mc_alloc_throttle_enabled)
+ return (B_TRUE);
+
+ /*
+ * If this metaslab group does not have any free space, then
+ * there is no point in looking further.
+ */
+ if (mg->mg_no_free_space)
+ return (B_FALSE);
+
+ /*
+ * Relax allocation throttling for ditto blocks. Due to random
+ * imbalances in allocation, strict throttling tends to push all
+ * copies toward the single vdev that happens to look slightly
+ * better at the moment.
+ */
+ qmax = qmax * (4 + d) / 4;
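+
+ /*
+ * E.g. (hypothetical numbers): with a base qmax of 64, d == 0
+ * (first DVA) leaves qmax at 64, while d == 2 (third copy)
+ * relaxes it to 64 * 6 / 4 == 96.
+ */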
+
+ qdepth = zfs_refcount_count(
+ &mg->mg_alloc_queue_depth[allocator]);
+
+ /*
+ * If this metaslab group is below its qmax or it's
+ * the only allocatable metaslab group, then attempt
+ * to allocate from it.
+ */
+ if (qdepth < qmax || mc->mc_alloc_groups == 1)
+ return (B_TRUE);
+ ASSERT3U(mc->mc_alloc_groups, >, 1);
+
+ /*
+ * Since this metaslab group is at or over its qmax, we
+ * need to determine if there are metaslab groups after this
+ * one that might be able to handle this allocation. This is
+ * racy since we can't hold the locks for all metaslab
+ * groups at the same time when we make this check.
+ */
+ for (mgp = mg->mg_next; mgp != rotor; mgp = mgp->mg_next) {
+ qmax = mgp->mg_cur_max_alloc_queue_depth[allocator];
+ qmax = qmax * (4 + d) / 4;
+ qdepth = zfs_refcount_count(
+ &mgp->mg_alloc_queue_depth[allocator]);
+
+ /*
+ * If there is another metaslab group that
+ * might be able to handle the allocation, then
+ * we return false so that we skip this group.
+ */
+ if (qdepth < qmax && !mgp->mg_no_free_space)
+ return (B_FALSE);
+ }
+
+ /*
+ * We didn't find another group to handle the allocation
+ * so we can't skip this metaslab group even though
+ * we are at or over our qmax.
+ */
+ return (B_TRUE);
+
+ } else if (mc->mc_alloc_groups == 0 || psize == SPA_MINBLOCKSIZE) {
+ return (B_TRUE);
+ }
+ return (B_FALSE);
+}
+
+/*
+ * ==========================================================================
+ * Range tree callbacks
+ * ==========================================================================
+ */
+
+/*
+ * Comparison function for the private size-ordered tree. Tree is sorted
+ * by size, larger sizes at the end of the tree.
+ */
+static int
+metaslab_rangesize_compare(const void *x1, const void *x2)
+{
+ const range_seg_t *r1 = x1;
+ const range_seg_t *r2 = x2;
+ uint64_t rs_size1 = r1->rs_end - r1->rs_start;
+ uint64_t rs_size2 = r2->rs_end - r2->rs_start;
+
+ int cmp = AVL_CMP(rs_size1, rs_size2);
+ if (likely(cmp))
+ return (cmp);
+
+ return (AVL_CMP(r1->rs_start, r2->rs_start));
+}
+
+/*
+ * ==========================================================================
+ * Common allocator routines
+ * ==========================================================================
+ */
+
+/*
+ * Return the maximum contiguous segment within the metaslab.
+ */
+uint64_t
+metaslab_block_maxsize(metaslab_t *msp)
+{
+ avl_tree_t *t = &msp->ms_allocatable_by_size;
+ range_seg_t *rs;
+
+ if (t == NULL || (rs = avl_last(t)) == NULL)
+ return (0ULL);
+
+ return (rs->rs_end - rs->rs_start);
+}
+
+static range_seg_t *
+metaslab_block_find(avl_tree_t *t, uint64_t start, uint64_t size)
+{
+ range_seg_t *rs, rsearch;
+ avl_index_t where;
+
+ rsearch.rs_start = start;
+ rsearch.rs_end = start + size;
+
+ rs = avl_find(t, &rsearch, &where);
+ if (rs == NULL) {
+ rs = avl_nearest(t, where, AVL_AFTER);
+ }
+
+ return (rs);
+}
+
+/*
+ * This is a helper function that the block allocators use to find a
+ * suitable region to allocate from. It searches the given AVL tree
+ * for a segment that satisfies the requested size and alignment.
+ */
+static uint64_t
+metaslab_block_picker(avl_tree_t *t, uint64_t *cursor, uint64_t size,
+ uint64_t align)
+{
+ range_seg_t *rs = metaslab_block_find(t, *cursor, size);
+
+ while (rs != NULL) {
+ uint64_t offset = P2ROUNDUP(rs->rs_start, align);
+
+ if (offset + size <= rs->rs_end) {
+ *cursor = offset + size;
+ return (offset);
+ }
+ rs = AVL_NEXT(t, rs);
+ }
+
+ /*
+ * If we know we've searched the whole map (*cursor == 0), give up.
+ * Otherwise, reset the cursor to the beginning and try again.
+ */
+ if (*cursor == 0)
+ return (-1ULL);
+
+ *cursor = 0;
+ return (metaslab_block_picker(t, cursor, size, align));
+}
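+
+/*
+ * Worked example (hypothetical numbers): with *cursor == 0x20000,
+ * size == 0x2000 and align == 0x2000, suppose the first segment at or
+ * after the cursor is [0x21000, 0x26000). P2ROUNDUP(0x21000, 0x2000)
+ * yields 0x22000, and since 0x22000 + 0x2000 <= 0x26000 the allocation
+ * succeeds at 0x22000 and the cursor advances to 0x24000. If no
+ * segment fits, the cursor resets to 0 and the map is scanned once
+ * more before giving up with -1ULL.
+ */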
+
+/*
+ * ==========================================================================
+ * The first-fit block allocator
+ * ==========================================================================
+ */
+static uint64_t
+metaslab_ff_alloc(metaslab_t *msp, uint64_t size)
+{
+ /*
+ * Find the largest power of 2 block size that evenly divides the
+ * requested size. This is used to try to allocate blocks with similar
+ * alignment from the same area of the metaslab (i.e. same cursor
+ * bucket), but it does not guarantee that blocks of other sizes
+ * will not also be allocated in the same region.
+ */
+ uint64_t align = size & -size;
+ uint64_t *cursor = &msp->ms_lbas[highbit64(align) - 1];
+ avl_tree_t *t = &msp->ms_allocatable->rt_root;
+
+ return (metaslab_block_picker(t, cursor, size, align));
+}
+
+static metaslab_ops_t metaslab_ff_ops = {
+ metaslab_ff_alloc
+};
+
+/*
+ * ==========================================================================
+ * Dynamic block allocator -
+ * Uses the first-fit allocation scheme until space gets low, then
+ * switches to a best-fit allocation method. Uses metaslab_df_alloc_threshold
+ * and metaslab_df_free_pct to determine when to switch the allocation scheme.
+ * ==========================================================================
+ */
+static uint64_t
+metaslab_df_alloc(metaslab_t *msp, uint64_t size)
+{
+ /*
+ * Find the largest power of 2 block size that evenly divides the
+ * requested size. This is used to try to allocate blocks with similar
+ * alignment from the same area of the metaslab (i.e. same cursor
+ * bucket), but it does not guarantee that blocks of other sizes
+ * will not also be allocated in the same region.
+ */
+ uint64_t align = size & -size;
+ uint64_t *cursor = &msp->ms_lbas[highbit64(align) - 1];
+ range_tree_t *rt = msp->ms_allocatable;
+ avl_tree_t *t = &rt->rt_root;
+ uint64_t max_size = metaslab_block_maxsize(msp);
+ int free_pct = range_tree_space(rt) * 100 / msp->ms_size;
+
+ ASSERT(MUTEX_HELD(&msp->ms_lock));
+ ASSERT3U(avl_numnodes(t), ==,
+ avl_numnodes(&msp->ms_allocatable_by_size));
+
+ if (max_size < size)
+ return (-1ULL);
+
+ /*
+ * If we're running low on space, switch to using the size-sorted
+ * AVL tree (best-fit).
+ */
+ if (max_size < metaslab_df_alloc_threshold ||
+ free_pct < metaslab_df_free_pct) {
+ t = &msp->ms_allocatable_by_size;
+ *cursor = 0;
+ }
+
+ return (metaslab_block_picker(t, cursor, size, 1ULL));
+}
+
+static metaslab_ops_t metaslab_df_ops = {
+ metaslab_df_alloc
+};
+
+/*
+ * ==========================================================================
+ * Cursor fit block allocator -
+ * Select the largest region in the metaslab, set the cursor to the beginning
+ * of the range, and the cursor_end to the end of the range. As allocations
+ * are made, advance the cursor. Continue allocating from the cursor until
+ * the range is exhausted and then find a new range.
+ * ==========================================================================
+ */
+static uint64_t
+metaslab_cf_alloc(metaslab_t *msp, uint64_t size)
+{
+ range_tree_t *rt = msp->ms_allocatable;
+ avl_tree_t *t = &msp->ms_allocatable_by_size;
+ uint64_t *cursor = &msp->ms_lbas[0];
+ uint64_t *cursor_end = &msp->ms_lbas[1];
+ uint64_t offset = 0;
+
+ ASSERT(MUTEX_HELD(&msp->ms_lock));
+ ASSERT3U(avl_numnodes(t), ==, avl_numnodes(&rt->rt_root));
+
+ ASSERT3U(*cursor_end, >=, *cursor);
+
+ if ((*cursor + size) > *cursor_end) {
+ range_seg_t *rs;
+
+ rs = avl_last(&msp->ms_allocatable_by_size);
+ if (rs == NULL || (rs->rs_end - rs->rs_start) < size)
+ return (-1ULL);
+
+ *cursor = rs->rs_start;
+ *cursor_end = rs->rs_end;
+ }
+
+ offset = *cursor;
+ *cursor += size;
+
+ return (offset);
+}
+
+static metaslab_ops_t metaslab_cf_ops = {
+ metaslab_cf_alloc
+};
+
+/*
+ * ==========================================================================
+ * New dynamic fit allocator -
+ * Select a region that is large enough to allocate 2^metaslab_ndf_clump_shift
+ * contiguous blocks. If no region is found then just use the largest segment
+ * that remains.
+ * ==========================================================================
+ */
+
+/*
+ * Determines desired number of contiguous blocks (2^metaslab_ndf_clump_shift)
+ * to request from the allocator.
+ */
+uint64_t metaslab_ndf_clump_shift = 4;
+
+static uint64_t
+metaslab_ndf_alloc(metaslab_t *msp, uint64_t size)
+{
+ avl_tree_t *t = &msp->ms_allocatable->rt_root;
+ avl_index_t where;
+ range_seg_t *rs, rsearch;
+ uint64_t hbit = highbit64(size);
+ uint64_t *cursor = &msp->ms_lbas[hbit - 1];
+ uint64_t max_size = metaslab_block_maxsize(msp);
+
+ ASSERT(MUTEX_HELD(&msp->ms_lock));
+ ASSERT3U(avl_numnodes(t), ==,
+ avl_numnodes(&msp->ms_allocatable_by_size));
+
+ if (max_size < size)
+ return (-1ULL);
+
+ rsearch.rs_start = *cursor;
+ rsearch.rs_end = *cursor + size;
+
+ rs = avl_find(t, &rsearch, &where);
+ if (rs == NULL || (rs->rs_end - rs->rs_start) < size) {
+ t = &msp->ms_allocatable_by_size;
+
+ rsearch.rs_start = 0;
+ rsearch.rs_end = MIN(max_size,
+ 1ULL << (hbit + metaslab_ndf_clump_shift));
+ rs = avl_find(t, &rsearch, &where);
+ if (rs == NULL)
+ rs = avl_nearest(t, where, AVL_AFTER);
+ ASSERT(rs != NULL);
+ }
+
+ if ((rs->rs_end - rs->rs_start) >= size) {
+ *cursor = rs->rs_start + size;
+ return (rs->rs_start);
+ }
+ return (-1ULL);
+}
+
+static metaslab_ops_t metaslab_ndf_ops = {
+ metaslab_ndf_alloc
+};
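+
+/*
+ * Worked example for the clump search above (hypothetical numbers): an
+ * 8K request gives hbit == highbit64(8192) == 14, so on a cursor miss
+ * the size-sorted tree is searched for a segment of up to
+ * MIN(max_size, 1 << (14 + metaslab_ndf_clump_shift)) == 256K with the
+ * default clump shift of 4, i.e. room for roughly 32 such blocks.
+ */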
+
+metaslab_ops_t *zfs_metaslab_ops = &metaslab_df_ops;
+
+/*
+ * ==========================================================================
+ * Metaslabs
+ * ==========================================================================
+ */
+
+static void
+metaslab_aux_histograms_clear(metaslab_t *msp)
+{
+ /*
+ * Auxiliary histograms are only cleared when resetting them,
+ * which can only happen while the metaslab is loaded.
+ */
+ ASSERT(msp->ms_loaded);
+
+ bzero(msp->ms_synchist, sizeof (msp->ms_synchist));
+ for (int t = 0; t < TXG_DEFER_SIZE; t++)
+ bzero(msp->ms_deferhist[t], sizeof (msp->ms_deferhist[t]));
+}
+
+static void
+metaslab_aux_histogram_add(uint64_t *histogram, uint64_t shift,
+ range_tree_t *rt)
+{
+ /*
+ * This is modeled after space_map_histogram_add(), so refer to that
+ * function for implementation details. We want this to work like
+ * the space map histogram, and not the range tree histogram, as we
+ * are essentially constructing a delta that will be later subtracted
+ * from the space map histogram.
+ */
+ int idx = 0;
+ for (int i = shift; i < RANGE_TREE_HISTOGRAM_SIZE; i++) {
+ ASSERT3U(i, >=, idx + shift);
+ histogram[idx] += rt->rt_histogram[i] << (i - idx - shift);
+
+ if (idx < SPACE_MAP_HISTOGRAM_SIZE - 1) {
+ ASSERT3U(idx + shift, ==, i);
+ idx++;
+ ASSERT3U(idx, <, SPACE_MAP_HISTOGRAM_SIZE);
+ }
+ }
+}
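+
+/*
+ * Worked example (hypothetical numbers, assuming shift == 9 and a
+ * 32-bucket space map histogram): range tree buckets 9..40 map one to
+ * one onto histogram[0..31]. Once idx saturates at 31, larger segments
+ * are folded into the last bucket scaled by 2^(i - idx - shift), so a
+ * single segment in range tree bucket 41 adds 2 to histogram[31],
+ * approximating the space it represents.
+ */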
+
+/*
+ * Called at every sync pass in which the metaslab gets synced.
+ *
+ * The reason is that we want our auxiliary histograms to be updated
+ * whenever the metaslab's space map histogram is updated. This way
+ * we stay consistent about which parts of the metaslab space map's
+ * histogram are currently not available for allocations (e.g. because
+ * they are in the defer, freed, and freeing trees).
+ */
+static void
+metaslab_aux_histograms_update(metaslab_t *msp)
+{
+ space_map_t *sm = msp->ms_sm;
+ ASSERT(sm != NULL);
+
+ /*
+ * This is similar to the metaslab's space map histogram updates
+ * that take place in metaslab_sync(). The only difference is that
+ * we only care about segments that haven't made it into the
+ * ms_allocatable tree yet.
+ */
+ if (msp->ms_loaded) {
+ metaslab_aux_histograms_clear(msp);
+
+ metaslab_aux_histogram_add(msp->ms_synchist,
+ sm->sm_shift, msp->ms_freed);
+
+ for (int t = 0; t < TXG_DEFER_SIZE; t++) {
+ metaslab_aux_histogram_add(msp->ms_deferhist[t],
+ sm->sm_shift, msp->ms_defer[t]);
+ }
+ }
+
+ metaslab_aux_histogram_add(msp->ms_synchist,
+ sm->sm_shift, msp->ms_freeing);
+}
+
+/*
+ * Called every time we are done syncing (writing to) the metaslab,
+ * i.e. at the end of each sync pass.
+ * [see the comment in metaslab_impl.h for ms_synchist, ms_deferhist]
+ */
+static void
+metaslab_aux_histograms_update_done(metaslab_t *msp, boolean_t defer_allowed)
+{
+ spa_t *spa = msp->ms_group->mg_vd->vdev_spa;
+ space_map_t *sm = msp->ms_sm;
+
+ if (sm == NULL) {
+ /*
+ * We came here from metaslab_init() when creating/opening a
+ * pool, looking at a metaslab that hasn't had any allocations
+ * yet.
+ */
+ return;
+ }
+
+ /*
+ * This is similar to the actions that we take for the ms_freed
+ * and ms_defer trees in metaslab_sync_done().
+ */
+ uint64_t hist_index = spa_syncing_txg(spa) % TXG_DEFER_SIZE;
+ if (defer_allowed) {
+ bcopy(msp->ms_synchist, msp->ms_deferhist[hist_index],
+ sizeof (msp->ms_synchist));
+ } else {
+ bzero(msp->ms_deferhist[hist_index],
+ sizeof (msp->ms_deferhist[hist_index]));
+ }
+ bzero(msp->ms_synchist, sizeof (msp->ms_synchist));
+}
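+
+/*
+ * Illustrative note: with TXG_DEFER_SIZE == 2, hist_index alternates
+ * between 0 and 1 from txg to txg, so the synced histogram written in
+ * txg N lands in the deferhist slot whose contents were deferred two
+ * txgs ago and have just been returned to ms_allocatable, mirroring
+ * the rotation of the ms_defer trees themselves.
+ */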
+
+/*
+ * Ensure that the metaslab's weight and fragmentation are consistent
+ * with the contents of the histogram (either the range tree's histogram
+ * or the space map's, depending on whether the metaslab is loaded).
+ */
+static void
+metaslab_verify_weight_and_frag(metaslab_t *msp)
+{
+ ASSERT(MUTEX_HELD(&msp->ms_lock));
+
+ if ((zfs_flags & ZFS_DEBUG_METASLAB_VERIFY) == 0)
+ return;
+
+ /* see comment in metaslab_verify_unflushed_changes() */
+ if (msp->ms_group == NULL)
+ return;
+
+ /*
+ * Devices being removed always return a weight of 0 and leave
+ * fragmentation and ms_max_size as is - there is nothing for
+ * us to verify here.
+ */
+ vdev_t *vd = msp->ms_group->mg_vd;
+ if (vd->vdev_removing)
+ return;
+
+ /*
+ * If the metaslab is dirty it probably means that we've done
+ * some allocations or frees that have changed our histograms
+ * and thus the weight.
+ */
+ for (int t = 0; t < TXG_SIZE; t++) {
+ if (txg_list_member(&vd->vdev_ms_list, msp, t))
+ return;
+ }
+
+ /*
+ * This verification checks that our in-memory state is consistent
+ * with what's on disk. If the pool is read-only then there aren't
+ * any changes and we just have the initially-loaded state.
+ */
+ if (!spa_writeable(msp->ms_group->mg_vd->vdev_spa))
+ return;
+
+ /* some extra verification of the in-core tree, when possible */
+ if (msp->ms_loaded) {
+ range_tree_stat_verify(msp->ms_allocatable);
+ VERIFY(space_map_histogram_verify(msp->ms_sm,
+ msp->ms_allocatable));
+ }
+
+ uint64_t weight = msp->ms_weight;
+ uint64_t was_active = msp->ms_weight & METASLAB_ACTIVE_MASK;
+ boolean_t space_based = WEIGHT_IS_SPACEBASED(msp->ms_weight);
+ uint64_t frag = msp->ms_fragmentation;
+ uint64_t max_segsize = msp->ms_max_size;
+
+ msp->ms_weight = 0;
+ msp->ms_fragmentation = 0;
+ msp->ms_max_size = 0;
+
+ /*
+ * This function is used for verification purposes. Regardless of
+ * whether metaslab_weight() thinks this metaslab should be active or
+ * not, we want to ensure that the actual weight (and therefore the
+ * value of ms_weight) would be the same if it was to be recalculated
+ * at this point.
+ */
+ msp->ms_weight = metaslab_weight(msp) | was_active;
+
+ VERIFY3U(max_segsize, ==, msp->ms_max_size);
+
+ /*
+ * If the weight type changed then there is no point in doing
+ * verification. Revert fields to their original values.
+ */
+ if ((space_based && !WEIGHT_IS_SPACEBASED(msp->ms_weight)) ||
+ (!space_based && WEIGHT_IS_SPACEBASED(msp->ms_weight))) {
+ msp->ms_fragmentation = frag;
+ msp->ms_weight = weight;
+ return;
+ }
+
+ VERIFY3U(msp->ms_fragmentation, ==, frag);
+ VERIFY3U(msp->ms_weight, ==, weight);
+}
+
+/*
+ * Wait for any in-progress metaslab loads to complete.
+ */
+static void
+metaslab_load_wait(metaslab_t *msp)
+{
+ ASSERT(MUTEX_HELD(&msp->ms_lock));
+
+ while (msp->ms_loading) {
+ ASSERT(!msp->ms_loaded);
+ cv_wait(&msp->ms_load_cv, &msp->ms_lock);
+ }
+}
+
+static int
+metaslab_load_impl(metaslab_t *msp)
+{
+ int error = 0;
+
+ ASSERT(MUTEX_HELD(&msp->ms_lock));
+ ASSERT(msp->ms_loading);
+ ASSERT(!msp->ms_condensing);
+
+ /*
+ * We temporarily drop the lock to unblock other operations while we
+ * are reading the space map. Therefore, metaslab_sync() and
+ * metaslab_sync_done() can run at the same time as we do.
+ *
+ * metaslab_sync() can append to the space map while we are loading.
+ * Therefore we load only entries that existed when we started the
+ * load. Additionally, metaslab_sync_done() has to wait for the load
+ * to complete because there are potential races like metaslab_load()
+ * loading parts of the space map that are currently being appended
+ * by metaslab_sync(). If we didn't, the ms_allocatable would have
+ * entries that metaslab_sync_done() would try to re-add later.
+ *
+ * That's why before dropping the lock we remember the synced length
+ * of the metaslab and read up to that point of the space map,
+ * ignoring entries appended by metaslab_sync() that happen after we
+ * drop the lock.
+ */
+ uint64_t length = msp->ms_synced_length;
+ mutex_exit(&msp->ms_lock);
+
+ if (msp->ms_sm != NULL) {
+ error = space_map_load_length(msp->ms_sm, msp->ms_allocatable,
+ SM_FREE, length);
+ } else {
+ /*
+ * The space map has not been allocated yet, so treat
+ * all the space in the metaslab as free and add it to the
+ * ms_allocatable tree.
+ */
+ range_tree_add(msp->ms_allocatable,
+ msp->ms_start, msp->ms_size);
+ }
+
+ /*
+ * We need to grab the ms_sync_lock to prevent metaslab_sync() from
+ * changing the ms_sm and the metaslab's range trees while we are
+ * about to use them and populate the ms_allocatable. The ms_lock
+ * is insufficient for this because metaslab_sync() doesn't hold
+ * the ms_lock while writing the ms_checkpointing tree to disk.
+ */
+ mutex_enter(&msp->ms_sync_lock);
+ mutex_enter(&msp->ms_lock);
+ ASSERT(!msp->ms_condensing);
+
+ if (error != 0) {
+ mutex_exit(&msp->ms_sync_lock);
+ return (error);
+ }
+
+ ASSERT3P(msp->ms_group, !=, NULL);
+ msp->ms_loaded = B_TRUE;
+
+ /*
+ * The ms_allocatable contains the segments that exist in the
+ * ms_defer trees [see ms_synced_length]. Thus we need to remove
+ * them from ms_allocatable as they will be added again in
+ * metaslab_sync_done().
+ */
+ for (int t = 0; t < TXG_DEFER_SIZE; t++) {
+ range_tree_walk(msp->ms_defer[t],
+ range_tree_remove, msp->ms_allocatable);
+ }
+
+ /*
+ * Call metaslab_recalculate_weight_and_sort() now that the
+ * metaslab is loaded so we get the metaslab's real weight.
+ *
+ * Unless this metaslab was created with older software and
+ * has not yet been converted to use segment-based weight, we
+ * expect the new weight to be better or equal to the weight
+ * that the metaslab had while it was not loaded. This is
+ * because the old weight does not take into account the
+ * consolidation of adjacent segments between TXGs. [see
+ * comment for ms_synchist and ms_deferhist[] for more info]
+ */
+ uint64_t weight = msp->ms_weight;
+ metaslab_recalculate_weight_and_sort(msp);
+ if (!WEIGHT_IS_SPACEBASED(weight))
+ ASSERT3U(weight, <=, msp->ms_weight);
+ msp->ms_max_size = metaslab_block_maxsize(msp);
+
+ spa_t *spa = msp->ms_group->mg_vd->vdev_spa;
+ metaslab_verify_space(msp, spa_syncing_txg(spa));
+ mutex_exit(&msp->ms_sync_lock);
+
+ return (0);
+}
+
+int
+metaslab_load(metaslab_t *msp)
+{
+ ASSERT(MUTEX_HELD(&msp->ms_lock));
+
+ /*
+ * There may be another thread loading the same metaslab; if that's
+ * the case, just wait until the other thread is done and return.
+ */
+ metaslab_load_wait(msp);
+ if (msp->ms_loaded)
+ return (0);
+ VERIFY(!msp->ms_loading);
+ ASSERT(!msp->ms_condensing);
+
+ msp->ms_loading = B_TRUE;
+ int error = metaslab_load_impl(msp);
+ msp->ms_loading = B_FALSE;
+ cv_broadcast(&msp->ms_load_cv);
+
+ return (error);
+}
+
+void
+metaslab_unload(metaslab_t *msp)
+{
+ ASSERT(MUTEX_HELD(&msp->ms_lock));
+
+ metaslab_verify_weight_and_frag(msp);
+
+ range_tree_vacate(msp->ms_allocatable, NULL, NULL);
+ msp->ms_loaded = B_FALSE;
+
+ msp->ms_weight &= ~METASLAB_ACTIVE_MASK;
+ msp->ms_max_size = 0;
+
+ /*
+ * We explicitly recalculate the metaslab's weight based on its space
+ * map (as it is now not loaded). We want unloaded metaslabs to always
+ * have their weights calculated from the space map histograms, while
+ * loaded ones have it calculated from their in-core range tree
+ * [see metaslab_load()]. This way, the weight reflects the information
+ * available in-core, whether it is loaded or not.
+ *
+ * If ms_group == NULL, it means we came here from metaslab_fini(),
+ * at which point it doesn't make sense for us to do the recalculation
+ * and the sorting.
+ */
+ if (msp->ms_group != NULL)
+ metaslab_recalculate_weight_and_sort(msp);
+}
+
+static void
+metaslab_space_update(vdev_t *vd, metaslab_class_t *mc, int64_t alloc_delta,
+ int64_t defer_delta, int64_t space_delta)
+{
+ vdev_space_update(vd, alloc_delta, defer_delta, space_delta);
+
+ ASSERT3P(vd->vdev_spa->spa_root_vdev, ==, vd->vdev_parent);
+ ASSERT(vd->vdev_ms_count != 0);
+
+ metaslab_class_space_update(mc, alloc_delta, defer_delta, space_delta,
+ vdev_deflated_space(vd, space_delta));
+}
+
+int
+metaslab_init(metaslab_group_t *mg, uint64_t id, uint64_t object, uint64_t txg,
+ metaslab_t **msp)
+{
+ vdev_t *vd = mg->mg_vd;
+ spa_t *spa = vd->vdev_spa;
+ objset_t *mos = spa->spa_meta_objset;
+ metaslab_t *ms;
+ int error;
+
+ ms = kmem_zalloc(sizeof (metaslab_t), KM_SLEEP);
+ mutex_init(&ms->ms_lock, NULL, MUTEX_DEFAULT, NULL);
+ mutex_init(&ms->ms_sync_lock, NULL, MUTEX_DEFAULT, NULL);
+ cv_init(&ms->ms_load_cv, NULL, CV_DEFAULT, NULL);
+
+ ms->ms_id = id;
+ ms->ms_start = id << vd->vdev_ms_shift;
+ ms->ms_size = 1ULL << vd->vdev_ms_shift;
+ ms->ms_allocator = -1;
+ ms->ms_new = B_TRUE;
+
+ /*
+ * We only open space map objects that already exist. All others
+ * will be opened when we finally allocate an object for it.
+ *
+ * Note:
+ * When called from vdev_expand(), we can't call into the DMU as
+ * we are holding the spa_config_lock as a writer and we would
+ * deadlock [see relevant comment in vdev_metaslab_init()]. In
+ * that case, the object parameter is zero though, so we won't
+ * call into the DMU.
+ */
+ if (object != 0) {
+ error = space_map_open(&ms->ms_sm, mos, object, ms->ms_start,
+ ms->ms_size, vd->vdev_ashift);
+
+ if (error != 0) {
+ kmem_free(ms, sizeof (metaslab_t));
+ return (error);
+ }
+
+ ASSERT(ms->ms_sm != NULL);
+ ASSERT3S(space_map_allocated(ms->ms_sm), >=, 0);
+ ms->ms_allocated_space = space_map_allocated(ms->ms_sm);
+ }
+
+ /*
+ * We create the ms_allocatable here, but we don't create the
+ * other range trees until metaslab_sync_done(). This serves
+ * two purposes: it allows metaslab_sync_done() to detect the
+ * addition of new space; and for debugging, it ensures that
+ * we'd data fault on any attempt to use this metaslab before
+ * it's ready.
+ */
+ ms->ms_allocatable = range_tree_create_impl(&rt_avl_ops,
+     &ms->ms_allocatable_by_size, metaslab_rangesize_compare, 0);
+ metaslab_group_add(mg, ms);
+
+ metaslab_set_fragmentation(ms);
+
+ /*
+ * If we're opening an existing pool (txg == 0) or creating
+ * a new one (txg == TXG_INITIAL), all space is available now.
+ * If we're adding space to an existing pool, the new space
+ * does not become available until after this txg has synced.
+ * The metaslab's weight will also be initialized when we sync
+ * out this txg. This ensures that we don't attempt to allocate
+ * from it before we have initialized it completely.
+ */
+ if (txg <= TXG_INITIAL) {
+ metaslab_sync_done(ms, 0);
+ metaslab_space_update(vd, mg->mg_class,
+ metaslab_allocated_space(ms), 0, 0);
+ }
+
+ /*
+ * If metaslab_debug_load is set and we're initializing a metaslab
+ * that has an allocated space map object then load the space map
+ * so that we can verify frees.
+ */
+ if (metaslab_debug_load && ms->ms_sm != NULL) {
+ mutex_enter(&ms->ms_lock);
+ VERIFY0(metaslab_load(ms));
+ mutex_exit(&ms->ms_lock);
+ }
+
+ if (txg != 0) {
+ vdev_dirty(vd, 0, NULL, txg);
+ vdev_dirty(vd, VDD_METASLAB, ms, txg);
+ }
+
+ *msp = ms;
+
+ return (0);
+}
+
+void
+metaslab_fini(metaslab_t *msp)
+{
+ metaslab_group_t *mg = msp->ms_group;
+ vdev_t *vd = mg->mg_vd;
+
+ metaslab_group_remove(mg, msp);
+
+ mutex_enter(&msp->ms_lock);
+ VERIFY(msp->ms_group == NULL);
+ metaslab_space_update(vd, mg->mg_class,
+ -metaslab_allocated_space(msp), 0, -msp->ms_size);
+
+ space_map_close(msp->ms_sm);
+
+ metaslab_unload(msp);
+
+ range_tree_destroy(msp->ms_allocatable);
+ range_tree_destroy(msp->ms_freeing);
+ range_tree_destroy(msp->ms_freed);
+
+ for (int t = 0; t < TXG_SIZE; t++) {
+ range_tree_destroy(msp->ms_allocating[t]);
+ }
+
+ for (int t = 0; t < TXG_DEFER_SIZE; t++) {
+ range_tree_destroy(msp->ms_defer[t]);
+ }
+ ASSERT0(msp->ms_deferspace);
+
+ range_tree_destroy(msp->ms_checkpointing);
+
+ for (int t = 0; t < TXG_SIZE; t++)
+ ASSERT(!txg_list_member(&vd->vdev_ms_list, msp, t));
+
+ mutex_exit(&msp->ms_lock);
+ cv_destroy(&msp->ms_load_cv);
+ mutex_destroy(&msp->ms_lock);
+ mutex_destroy(&msp->ms_sync_lock);
+ ASSERT3U(msp->ms_allocator, ==, -1);
+
+ kmem_free(msp, sizeof (metaslab_t));
+}
+
+#define FRAGMENTATION_TABLE_SIZE 17
+
+/*
+ * This table defines a segment size based fragmentation metric that will
+ * allow each metaslab to derive its own fragmentation value. This is done
+ * by calculating the space in each bucket of the spacemap histogram and
+ * multiplying that by the fragmentation metric in this table. Doing
+ * this for all buckets and dividing it by the total amount of free
+ * space in this metaslab (i.e. the total free space in all buckets) gives
+ * us the fragmentation metric. This means that a high fragmentation metric
+ * equates to most of the free space being comprised of small segments.
+ * Conversely, if the metric is low, then most of the free space is in
+ * large segments. A 10% change in fragmentation equates to approximately
+ * double the number of segments.
+ *
+ * This table defines 0% fragmented space using 16MB segments. Testing has
+ * shown that segments that are greater than or equal to 16MB do not suffer
+ * from drastic performance problems. Using this value, we derive the rest
+ * of the table. Since the fragmentation value is never stored on disk, it
+ * is possible to change these calculations in the future.
+ */
+int zfs_frag_table[FRAGMENTATION_TABLE_SIZE] = {
+ 100, /* 512B */
+ 100, /* 1K */
+ 98, /* 2K */
+ 95, /* 4K */
+ 90, /* 8K */
+ 80, /* 16K */
+ 70, /* 32K */
+ 60, /* 64K */
+ 50, /* 128K */
+ 40, /* 256K */
+ 30, /* 512K */
+ 20, /* 1M */
+ 15, /* 2M */
+ 10, /* 4M */
+ 5, /* 8M */
+ 0 /* 16M */
+};
+
+/*
+ * Calculate the metaslab's fragmentation metric and set ms_fragmentation.
+ * Setting this value to ZFS_FRAG_INVALID means that the metaslab has not
+ * been upgraded and does not support this metric. Otherwise, the value
+ * set should be in the range [0, 100].
+ */
+static void
+metaslab_set_fragmentation(metaslab_t *msp)
+{
+ spa_t *spa = msp->ms_group->mg_vd->vdev_spa;
+ uint64_t fragmentation = 0;
+ uint64_t total = 0;
+ boolean_t feature_enabled = spa_feature_is_enabled(spa,
+ SPA_FEATURE_SPACEMAP_HISTOGRAM);
+
+ if (!feature_enabled) {
+ msp->ms_fragmentation = ZFS_FRAG_INVALID;
+ return;
+ }
+
+ /*
+ * A null space map means that the entire metaslab is free
+ * and thus is not fragmented.
+ */
+ if (msp->ms_sm == NULL) {
+ msp->ms_fragmentation = 0;
+ return;
+ }
+
+ /*
+ * If this metaslab's space map has not been upgraded, flag it
+ * so that we upgrade next time we encounter it.
+ */
+ if (msp->ms_sm->sm_dbuf->db_size != sizeof (space_map_phys_t)) {
+ uint64_t txg = spa_syncing_txg(spa);
+ vdev_t *vd = msp->ms_group->mg_vd;
+
+ /*
+ * If we've reached the final dirty txg, then we must
+ * be shutting down the pool. We don't want to dirty
+ * any data past this point so skip setting the condense
+ * flag. We can retry this action the next time the pool
+ * is imported.
+ */
+ if (spa_writeable(spa) && txg < spa_final_dirty_txg(spa)) {
+ msp->ms_condense_wanted = B_TRUE;
+ vdev_dirty(vd, VDD_METASLAB, msp, txg + 1);
+ zfs_dbgmsg("txg %llu, requesting force condense: "
+ "ms_id %llu, vdev_id %llu", txg, msp->ms_id,
+ vd->vdev_id);
+ }
+ msp->ms_fragmentation = ZFS_FRAG_INVALID;
+ return;
+ }
+
+ for (int i = 0; i < SPACE_MAP_HISTOGRAM_SIZE; i++) {
+ uint64_t space = 0;
+ uint8_t shift = msp->ms_sm->sm_shift;
+
+ int idx = MIN(shift - SPA_MINBLOCKSHIFT + i,
+ FRAGMENTATION_TABLE_SIZE - 1);
+
+ if (msp->ms_sm->sm_phys->smp_histogram[i] == 0)
+ continue;
+
+ space = msp->ms_sm->sm_phys->smp_histogram[i] << (i + shift);
+ total += space;
+
+ ASSERT3U(idx, <, FRAGMENTATION_TABLE_SIZE);
+ fragmentation += space * zfs_frag_table[idx];
+ }
+
+ if (total > 0)
+ fragmentation /= total;
+ ASSERT3U(fragmentation, <=, 100);
+
+ msp->ms_fragmentation = fragmentation;
+}
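+
+/*
+ * Worked example (hypothetical numbers, sm_shift == 9): a metaslab with
+ * 1MB free as 256 4K segments (histogram bucket i == 3, table index 3,
+ * factor 95) and 1MB free as one 1M segment (bucket i == 11, factor 20)
+ * yields (1M * 95 + 1M * 20) / 2M == 57, i.e. moderately fragmented.
+ */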
+
+/*
+ * Compute a weight -- a selection preference value -- for the given metaslab.
+ * This is based on the amount of free space, the level of fragmentation,
+ * the LBA range, and whether the metaslab is loaded.
+ */
+static uint64_t
+metaslab_space_weight(metaslab_t *msp)
+{
+ metaslab_group_t *mg = msp->ms_group;
+ vdev_t *vd = mg->mg_vd;
+ uint64_t weight, space;
+
+ ASSERT(MUTEX_HELD(&msp->ms_lock));
+ ASSERT(!vd->vdev_removing);
+
+ /*
+ * The baseline weight is the metaslab's free space.
+ */
+ space = msp->ms_size - metaslab_allocated_space(msp);
+
+ if (metaslab_fragmentation_factor_enabled &&
+ msp->ms_fragmentation != ZFS_FRAG_INVALID) {
+ /*
+ * Use the fragmentation information to inversely scale
+ * down the baseline weight. We need to ensure that we
+ * don't exclude this metaslab completely when it's 100%
+ * fragmented. To avoid this we reduce the fragmented value
+ * by 1.
+ */
+ space = (space * (100 - (msp->ms_fragmentation - 1))) / 100;
+
+ /*
+ * If space < SPA_MINBLOCKSIZE, then we will not allocate from
+ * this metaslab again. The fragmentation metric may have
+ * decreased the space to something smaller than
+ * SPA_MINBLOCKSIZE, so reset the space to SPA_MINBLOCKSIZE
+ * so that we can consume any remaining space.
+ */
+ if (space > 0 && space < SPA_MINBLOCKSIZE)
+ space = SPA_MINBLOCKSIZE;
+ }
+ weight = space;
+
+ /*
+ * Modern disks have uniform bit density and constant angular velocity.
+ * Therefore, the outer recording zones are faster (higher bandwidth)
+ * than the inner zones by the ratio of outer to inner track diameter,
+ * which is typically around 2:1. We account for this by assigning
+ * higher weight to lower metaslabs (multiplier ranging from 2x to 1x).
+ * In effect, this means that we'll select the metaslab with the most
+ * free bandwidth rather than simply the one with the most free space.
+ */
+ if (!vd->vdev_nonrot && metaslab_lba_weighting_enabled) {
+ weight = 2 * weight - (msp->ms_id * weight) / vd->vdev_ms_count;
+ ASSERT(weight >= space && weight <= 2 * space);
+ }
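+
+ /*
+ * E.g. (hypothetical numbers): on a rotational vdev with 100
+ * metaslabs, metaslab 0 gets a 2.0x multiplier, metaslab 50
+ * about 1.5x, and metaslab 99 about 1.01x, biasing allocations
+ * toward the faster outer tracks.
+ */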
+
+ /*
+ * If this metaslab is one we're actively using, adjust its
+ * weight to make it preferable to any inactive metaslab so
+ * we'll polish it off. If the fragmentation on this metaslab
+ * has exceeded our threshold, then don't mark it active.
+ */
+ if (msp->ms_loaded && msp->ms_fragmentation != ZFS_FRAG_INVALID &&
+ msp->ms_fragmentation <= zfs_metaslab_fragmentation_threshold) {
+ weight |= (msp->ms_weight & METASLAB_ACTIVE_MASK);
+ }
+
+ WEIGHT_SET_SPACEBASED(weight);
+ return (weight);
+}
+
+/*
+ * Return the weight of the specified metaslab, according to the segment-based
+ * weighting algorithm. The metaslab must be loaded. This function can
+ * be called within a sync pass since it relies only on the metaslab's
+ * range tree which is always accurate when the metaslab is loaded.
+ */
+static uint64_t
+metaslab_weight_from_range_tree(metaslab_t *msp)
+{
+ uint64_t weight = 0;
+ uint32_t segments = 0;
+
+ ASSERT(msp->ms_loaded);
+
+ for (int i = RANGE_TREE_HISTOGRAM_SIZE - 1; i >= SPA_MINBLOCKSHIFT;
+ i--) {
+ uint8_t shift = msp->ms_group->mg_vd->vdev_ashift;
+ int max_idx = SPACE_MAP_HISTOGRAM_SIZE + shift - 1;
+
+ segments <<= 1;
+ segments += msp->ms_allocatable->rt_histogram[i];
+
+ /*
+ * The range tree provides more precision than the space map
+ * and must be downgraded so that all values fit within the
+ * space map's histogram. This allows us to compare loaded
+ * vs. unloaded metaslabs to determine which metaslab is
+ * considered "best".
+ */
+ if (i > max_idx)
+ continue;
+
+ if (segments != 0) {
+ WEIGHT_SET_COUNT(weight, segments);
+ WEIGHT_SET_INDEX(weight, i);
+ WEIGHT_SET_ACTIVE(weight, 0);
+ break;
+ }
+ }
+ return (weight);
+}
+
+/*
+ * Calculate the weight based on the on-disk histogram. This should only
+ * be called after a sync pass has completely finished since the on-disk
+ * information is updated in metaslab_sync().
+ */
+static uint64_t
+metaslab_weight_from_spacemap(metaslab_t *msp)
+{
+ space_map_t *sm = msp->ms_sm;
+ ASSERT(!msp->ms_loaded);
+ ASSERT(sm != NULL);
+ ASSERT3U(space_map_object(sm), !=, 0);
+ ASSERT3U(sm->sm_dbuf->db_size, ==, sizeof (space_map_phys_t));
+
+ /*
+ * Create a joint histogram from all the segments that have made
+ * it to the metaslab's space map histogram, that are not yet
+ * available for allocation because they are still in the freeing
+ * pipeline (e.g. freeing, freed, and defer trees). Then subtract
+ * these segments from the space map's histogram to get a more
+ * accurate weight.
+ */
+ uint64_t deferspace_histogram[SPACE_MAP_HISTOGRAM_SIZE] = {0};
+ for (int i = 0; i < SPACE_MAP_HISTOGRAM_SIZE; i++)
+ deferspace_histogram[i] += msp->ms_synchist[i];
+ for (int t = 0; t < TXG_DEFER_SIZE; t++) {
+ for (int i = 0; i < SPACE_MAP_HISTOGRAM_SIZE; i++) {
+ deferspace_histogram[i] += msp->ms_deferhist[t][i];
+ }
+ }
+
+ uint64_t weight = 0;
+ for (int i = SPACE_MAP_HISTOGRAM_SIZE - 1; i >= 0; i--) {
+ ASSERT3U(sm->sm_phys->smp_histogram[i], >=,
+ deferspace_histogram[i]);
+ uint64_t count =
+ sm->sm_phys->smp_histogram[i] - deferspace_histogram[i];
+ if (count != 0) {
+ WEIGHT_SET_COUNT(weight, count);
+ WEIGHT_SET_INDEX(weight, i + sm->sm_shift);
+ WEIGHT_SET_ACTIVE(weight, 0);
+ break;
+ }
+ }
+ return (weight);
+}
+
+/*
+ * Compute a segment-based weight for the specified metaslab. The weight
+ * is determined by the highest bucket in the histogram. The information
+ * for the highest bucket is encoded into the weight value.
+ */
+static uint64_t
+metaslab_segment_weight(metaslab_t *msp)
+{
+ metaslab_group_t *mg = msp->ms_group;
+ uint64_t weight = 0;
+ uint8_t shift = mg->mg_vd->vdev_ashift;
+
+ ASSERT(MUTEX_HELD(&msp->ms_lock));
+
+ /*
+ * The metaslab is completely free.
+ */
+ if (metaslab_allocated_space(msp) == 0) {
+ int idx = highbit64(msp->ms_size) - 1;
+ int max_idx = SPACE_MAP_HISTOGRAM_SIZE + shift - 1;
+
+ if (idx < max_idx) {
+ WEIGHT_SET_COUNT(weight, 1ULL);
+ WEIGHT_SET_INDEX(weight, idx);
+ } else {
+ WEIGHT_SET_COUNT(weight, 1ULL << (idx - max_idx));
+ WEIGHT_SET_INDEX(weight, max_idx);
+ }
+ WEIGHT_SET_ACTIVE(weight, 0);
+ ASSERT(!WEIGHT_IS_SPACEBASED(weight));
+
+ return (weight);
+ }
+
+ ASSERT3U(msp->ms_sm->sm_dbuf->db_size, ==, sizeof (space_map_phys_t));
+
+ /*
+ * If the metaslab is fully allocated then just make the weight 0.
+ */
+ if (metaslab_allocated_space(msp) == msp->ms_size)
+ return (0);
+ /*
+ * If the metaslab is already loaded, then use the range tree to
+ * determine the weight. Otherwise, we rely on the space map information
+ * to generate the weight.
+ */
+ if (msp->ms_loaded) {
+ weight = metaslab_weight_from_range_tree(msp);
+ } else {
+ weight = metaslab_weight_from_spacemap(msp);
+ }
+
+ /*
+ * If the metaslab was active the last time we calculated its weight
+ * then keep it active. We want to consume the entire region that
+ * is associated with this weight.
+ */
+ if (msp->ms_activation_weight != 0 && weight != 0)
+ WEIGHT_SET_ACTIVE(weight, WEIGHT_GET_ACTIVE(msp->ms_weight));
+ return (weight);
+}
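+
+/*
+ * Worked example for the completely-free case above (hypothetical
+ * numbers, ashift == 9 and a 32-bucket space map histogram, so
+ * max_idx == 40): an 8G metaslab has idx == highbit64(2^33) - 1 == 33,
+ * which is below max_idx, and encodes one free segment at index 33; a
+ * (hypothetical) 8T metaslab with idx == 43 would saturate and encode
+ * 2^3 == 8 segments at index 40.
+ */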
+
+/*
+ * Determine if we should attempt to allocate from this metaslab. If the
+ * metaslab has a maximum size then we can quickly determine if the desired
+ * allocation size can be satisfied. Otherwise, if we're using segment-based
+ * weighting then we can determine the maximum allocation that this metaslab
+ * can accommodate based on the index encoded in the weight. If we're using
+ * space-based weights then rely on the entire weight (excluding the weight
+ * type bit).
+ */
+boolean_t
+metaslab_should_allocate(metaslab_t *msp, uint64_t asize)
+{
+ boolean_t should_allocate;
+
+ if (msp->ms_max_size != 0)
+ return (msp->ms_max_size >= asize);
+
+ if (!WEIGHT_IS_SPACEBASED(msp->ms_weight)) {
+ /*
+ * The metaslab segment weight indicates segments in the
+ * range [2^i, 2^(i+1)), where i is the index in the weight.
+ * Since the asize might be in the middle of the range, we
+ * should attempt the allocation if asize < 2^(i+1).
+ */
+ should_allocate = (asize <
+ 1ULL << (WEIGHT_GET_INDEX(msp->ms_weight) + 1));
+ } else {
+ should_allocate = (asize <=
+ (msp->ms_weight & ~METASLAB_WEIGHT_TYPE));
+ }
+ return (should_allocate);
+}
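+
+/*
+ * Worked example (hypothetical numbers): a segment-based weight with
+ * index 17 promises free segments somewhere in [128K, 256K), so a 200K
+ * request is attempted (200K < 2^18) while a 300K request is not. For
+ * space-based weights the comparison is simply against the remaining
+ * free space encoded in the weight.
+ */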
+
+static uint64_t
+metaslab_weight(metaslab_t *msp)
+{
+ vdev_t *vd = msp->ms_group->mg_vd;
+ spa_t *spa = vd->vdev_spa;
+ uint64_t weight;
+
+ ASSERT(MUTEX_HELD(&msp->ms_lock));
+
+ /*
+ * If this vdev is in the process of being removed, there is nothing
+ * for us to do here.
+ */
+ if (vd->vdev_removing)
+ return (0);
+
+ metaslab_set_fragmentation(msp);
+
+ /*
+ * Update the maximum size if the metaslab is loaded. This will
+ * ensure that we get an accurate maximum size if newly freed space
+ * has been added back into the free tree.
+ */
+ if (msp->ms_loaded)
+ msp->ms_max_size = metaslab_block_maxsize(msp);
+ else
+ ASSERT0(msp->ms_max_size);
+
+ /*
+ * Segment-based weighting requires space map histogram support.
+ */
+ if (zfs_metaslab_segment_weight_enabled &&
+ spa_feature_is_enabled(spa, SPA_FEATURE_SPACEMAP_HISTOGRAM) &&
+ (msp->ms_sm == NULL || msp->ms_sm->sm_dbuf->db_size ==
+ sizeof (space_map_phys_t))) {
+ weight = metaslab_segment_weight(msp);
+ } else {
+ weight = metaslab_space_weight(msp);
+ }
+ return (weight);
+}
+
+void
+metaslab_recalculate_weight_and_sort(metaslab_t *msp)
+{
+ /* note: we preserve the mask (e.g. indication of primary, etc.) */
+ uint64_t was_active = msp->ms_weight & METASLAB_ACTIVE_MASK;
+ metaslab_group_sort(msp->ms_group, msp,
+ metaslab_weight(msp) | was_active);
+}
+
+static int
+metaslab_activate_allocator(metaslab_group_t *mg, metaslab_t *msp,
+ int allocator, uint64_t activation_weight)
+{
+ /*
+ * If we're activating for the claim code, we don't want to actually
+ * set the metaslab up for a specific allocator.
+ */
+ if (activation_weight == METASLAB_WEIGHT_CLAIM)
+ return (0);
+ metaslab_t **arr = (activation_weight == METASLAB_WEIGHT_PRIMARY ?
+ mg->mg_primaries : mg->mg_secondaries);
+
+ ASSERT(MUTEX_HELD(&msp->ms_lock));
+ mutex_enter(&mg->mg_lock);
+ if (arr[allocator] != NULL) {
+ mutex_exit(&mg->mg_lock);
+ return (EEXIST);
+ }
+
+ arr[allocator] = msp;
+ ASSERT3S(msp->ms_allocator, ==, -1);
+ msp->ms_allocator = allocator;
+ msp->ms_primary = (activation_weight == METASLAB_WEIGHT_PRIMARY);
+ mutex_exit(&mg->mg_lock);
+
+ return (0);
+}
+
+static int
+metaslab_activate(metaslab_t *msp, int allocator, uint64_t activation_weight)
+{
+ ASSERT(MUTEX_HELD(&msp->ms_lock));
+
+ if ((msp->ms_weight & METASLAB_ACTIVE_MASK) == 0) {
+ int error = metaslab_load(msp);
+ if (error != 0) {
+ metaslab_group_sort(msp->ms_group, msp, 0);
+ return (error);
+ }
+ if ((msp->ms_weight & METASLAB_ACTIVE_MASK) != 0) {
+ /*
+ * The metaslab was activated for another allocator
+ * while we were waiting, we should reselect.
+ */
+ return (EBUSY);
+ }
+ if ((error = metaslab_activate_allocator(msp->ms_group, msp,
+ allocator, activation_weight)) != 0) {
+ return (error);
+ }
+
+ msp->ms_activation_weight = msp->ms_weight;
+ metaslab_group_sort(msp->ms_group, msp,
+ msp->ms_weight | activation_weight);
+ }
+ ASSERT(msp->ms_loaded);
+ ASSERT(msp->ms_weight & METASLAB_ACTIVE_MASK);
+
+ return (0);
+}
+
+static void
+metaslab_passivate_allocator(metaslab_group_t *mg, metaslab_t *msp,
+ uint64_t weight)
+{
+ ASSERT(MUTEX_HELD(&msp->ms_lock));
+ if (msp->ms_weight & METASLAB_WEIGHT_CLAIM) {
+ metaslab_group_sort(mg, msp, weight);
+ return;
+ }
+
+ mutex_enter(&mg->mg_lock);
+ ASSERT3P(msp->ms_group, ==, mg);
+ if (msp->ms_primary) {
+ ASSERT3U(0, <=, msp->ms_allocator);
+ ASSERT3U(msp->ms_allocator, <, mg->mg_allocators);
+ ASSERT3P(mg->mg_primaries[msp->ms_allocator], ==, msp);
+ ASSERT(msp->ms_weight & METASLAB_WEIGHT_PRIMARY);
+ mg->mg_primaries[msp->ms_allocator] = NULL;
+ } else {
+ ASSERT(msp->ms_weight & METASLAB_WEIGHT_SECONDARY);
+ ASSERT3P(mg->mg_secondaries[msp->ms_allocator], ==, msp);
+ mg->mg_secondaries[msp->ms_allocator] = NULL;
+ }
+ msp->ms_allocator = -1;
+ metaslab_group_sort_impl(mg, msp, weight);
+ mutex_exit(&mg->mg_lock);
+}
+
+static void
+metaslab_passivate(metaslab_t *msp, uint64_t weight)
+{
+ uint64_t size = weight & ~METASLAB_WEIGHT_TYPE;
+
+ /*
+ * If size < SPA_MINBLOCKSIZE, then we will not allocate from
+ * this metaslab again. In that case, it had better be empty,
+ * or we would be leaving space on the table.
+ */
+ ASSERT(size >= SPA_MINBLOCKSIZE ||
+ range_tree_is_empty(msp->ms_allocatable));
+ ASSERT0(weight & METASLAB_ACTIVE_MASK);
+
+ msp->ms_activation_weight = 0;
+ metaslab_passivate_allocator(msp->ms_group, msp, weight);
+ ASSERT((msp->ms_weight & METASLAB_ACTIVE_MASK) == 0);
+}
+
+/*
+ * Segment-based metaslabs are activated once and remain active until
+ * we either fail an allocation attempt (similar to space-based metaslabs)
+ * or have exhausted the free space in zfs_metaslab_switch_threshold
+ * buckets since the metaslab was activated. This function checks to see
+ * if we've exhausted the zfs_metaslab_switch_threshold buckets in the
+ * metaslab and passivates it proactively. This will allow us to select a
+ * metaslab with a larger contiguous region, if any remain within this
+ * metaslab group. If we're in sync pass > 1, then we continue using this
+ * metaslab so that we don't dirty more blocks and cause more sync passes.
+ */
+void
+metaslab_segment_may_passivate(metaslab_t *msp)
+{
+ spa_t *spa = msp->ms_group->mg_vd->vdev_spa;
+
+ if (WEIGHT_IS_SPACEBASED(msp->ms_weight) || spa_sync_pass(spa) > 1)
+ return;
+
+ /*
+ * Since we are in the middle of a sync pass, the most accurate
+ * information that is accessible to us is the in-core range tree
+ * histogram; calculate the new weight based on that information.
+ */
+ uint64_t weight = metaslab_weight_from_range_tree(msp);
+ int activation_idx = WEIGHT_GET_INDEX(msp->ms_activation_weight);
+ int current_idx = WEIGHT_GET_INDEX(weight);
+
+ if (current_idx <= activation_idx - zfs_metaslab_switch_threshold)
+ metaslab_passivate(msp, weight);
+}
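+
+/*
+ * Worked example (hypothetical numbers, zfs_metaslab_switch_threshold
+ * == 2): a metaslab activated at weight index 20 (largest free
+ * segments in [1M, 2M)) is proactively passivated once its recomputed
+ * index drops to 18 or below, i.e. once the largest remaining free
+ * segments fall below 512K.
+ */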
+
+static void
+metaslab_preload(void *arg)
+{
+ metaslab_t *msp = arg;
+ spa_t *spa = msp->ms_group->mg_vd->vdev_spa;
+
+ ASSERT(!MUTEX_HELD(&msp->ms_group->mg_lock));
+
+ mutex_enter(&msp->ms_lock);
+ (void) metaslab_load(msp);
+ msp->ms_selected_txg = spa_syncing_txg(spa);
+ mutex_exit(&msp->ms_lock);
+}
+
+static void
+metaslab_group_preload(metaslab_group_t *mg)
+{
+ spa_t *spa = mg->mg_vd->vdev_spa;
+ metaslab_t *msp;
+ avl_tree_t *t = &mg->mg_metaslab_tree;
+ int m = 0;
+
+ if (spa_shutting_down(spa) || !metaslab_preload_enabled) {
+ taskq_wait(mg->mg_taskq);
+ return;
+ }
+
+ mutex_enter(&mg->mg_lock);
+
+ /*
+ * Load the next potential metaslabs
+ */
+ for (msp = avl_first(t); msp != NULL; msp = AVL_NEXT(t, msp)) {
+ ASSERT3P(msp->ms_group, ==, mg);
+
+ /*
+ * We preload only the maximum number of metaslabs specified
+ * by metaslab_preload_limit. If a metaslab is being forced
+ * to condense then we preload it too. This will ensure
+ * that force condensing happens in the next txg.
+ */
+ if (++m > metaslab_preload_limit && !msp->ms_condense_wanted) {
+ continue;
+ }
+
+ VERIFY(taskq_dispatch(mg->mg_taskq, metaslab_preload,
+ msp, TQ_SLEEP) != 0);
+ }
+ mutex_exit(&mg->mg_lock);
+}
+
+/*
+ * Determine if the space map's on-disk footprint is past our tolerance
+ * for inefficiency. We would like to use the following criteria to make
+ * our decision:
+ *
+ * 1. The size of the space map object should not dramatically increase as a
+ * result of writing out the free space range tree.
+ *
+ * 2. The minimal on-disk space map representation is zfs_condense_pct/100
+ * times the size of the free space range tree representation
+ * (i.e. zfs_condense_pct = 110 and in-core = 1MB, minimal = 1.1MB).
+ *
+ * 3. The on-disk size of the space map should actually decrease.
+ *
+ * Unfortunately, we cannot compute the on-disk size of the space map in this
+ * context because we cannot accurately compute the effects of compression, etc.
+ * Instead, we apply the heuristic described in the block comment for
+ * zfs_metaslab_condense_block_threshold - we only condense if the space used
+ * is greater than a threshold number of blocks.
+ */
+static boolean_t
+metaslab_should_condense(metaslab_t *msp)
+{
+ space_map_t *sm = msp->ms_sm;
+ vdev_t *vd = msp->ms_group->mg_vd;
+ uint64_t vdev_blocksize = 1 << vd->vdev_ashift;
+ uint64_t current_txg = spa_syncing_txg(vd->vdev_spa);
+
+ ASSERT(MUTEX_HELD(&msp->ms_lock));
+ ASSERT(msp->ms_loaded);
+
+ /*
+ * Allocations and frees in early passes are generally more space
+ * efficient (in terms of blocks described in space map entries)
+ * than the ones in later passes (e.g. we don't compress after
+ * sync pass 5) and condensing a metaslab multiple times in a txg
+ * could degrade performance.
+ *
+ * Thus we prefer condensing each metaslab at most once every txg at
+ * the earliest sync pass possible. If a metaslab is eligible for
+ * condensing again after being considered for condensing within the
+ * same txg, it will hopefully be dirty in the next txg where it will
+ * be condensed at an earlier pass.
+ */
+ if (msp->ms_condense_checked_txg == current_txg)
+ return (B_FALSE);
+ msp->ms_condense_checked_txg = current_txg;
+
+ /*
+ * We always condense metaslabs that are empty and metaslabs for
+ * which a condense request has been made.
+ */
+ if (avl_is_empty(&msp->ms_allocatable_by_size) ||
+ msp->ms_condense_wanted)
+ return (B_TRUE);
+
+ uint64_t object_size = space_map_length(msp->ms_sm);
+ uint64_t optimal_size = space_map_estimate_optimal_size(sm,
+ msp->ms_allocatable, SM_NO_VDEVID);
+
+ dmu_object_info_t doi;
+ dmu_object_info_from_db(sm->sm_dbuf, &doi);
+ uint64_t record_size = MAX(doi.doi_data_block_size, vdev_blocksize);
+
+ return (object_size >= (optimal_size * zfs_condense_pct / 100) &&
+ object_size > zfs_metaslab_condense_block_threshold * record_size);
+}
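+
+/*
+ * Worked example (hypothetical numbers): with zfs_condense_pct == 110
+ * and an estimated optimal size of 1MB, a loaded, non-empty metaslab is
+ * condensed only if its on-disk space map is at least 1.1MB long and
+ * also exceeds zfs_metaslab_condense_block_threshold records of
+ * MAX(doi_data_block_size, 1 << vdev_ashift) bytes each.
+ */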
+
+/*
+ * Condense the on-disk space map representation to its minimized form.
+ * The minimized form consists of a small number of allocations followed by
+ * the entries of the free range tree.
+ */
+static void
+metaslab_condense(metaslab_t *msp, uint64_t txg, dmu_tx_t *tx)
+{
+ range_tree_t *condense_tree;
+ space_map_t *sm = msp->ms_sm;
+
+ ASSERT(MUTEX_HELD(&msp->ms_lock));
+ ASSERT(msp->ms_loaded);
+
+ zfs_dbgmsg("condensing: txg %llu, msp[%llu] %p, vdev id %llu, "
+ "spa %s, smp size %llu, segments %lu, forcing condense=%s", txg,
+ msp->ms_id, msp, msp->ms_group->mg_vd->vdev_id,
+ msp->ms_group->mg_vd->vdev_spa->spa_name,
+ space_map_length(msp->ms_sm),
+ avl_numnodes(&msp->ms_allocatable->rt_root),
+ msp->ms_condense_wanted ? "TRUE" : "FALSE");
+
+ msp->ms_condense_wanted = B_FALSE;
+
+ /*
+ * Create a range tree that is 100% allocated. We remove segments
+ * that have been freed in this txg, any deferred frees that exist,
+ * and any allocation in the future. Removing segments should be
+ * a relatively inexpensive operation since we expect these trees to
+ * have a small number of nodes.
+ */
+ condense_tree = range_tree_create(NULL, NULL);
+ range_tree_add(condense_tree, msp->ms_start, msp->ms_size);
+
+ range_tree_walk(msp->ms_freeing, range_tree_remove, condense_tree);
+ range_tree_walk(msp->ms_freed, range_tree_remove, condense_tree);
+
+ for (int t = 0; t < TXG_DEFER_SIZE; t++) {
+ range_tree_walk(msp->ms_defer[t],
+ range_tree_remove, condense_tree);
+ }
+
+ for (int t = 1; t < TXG_CONCURRENT_STATES; t++) {
+ range_tree_walk(msp->ms_allocating[(txg + t) & TXG_MASK],
+ range_tree_remove, condense_tree);
+ }
+
+ /*
+ * We're about to drop the metaslab's lock, thus allowing
+ * other consumers to change its content. Set the
+ * metaslab's ms_condensing flag to ensure that
+ * allocations on this metaslab do not occur while we're
+ * in the middle of committing it to disk. This is only critical
+ * for ms_allocatable as all other range trees use per txg
+ * views of their content.
+ */
+ msp->ms_condensing = B_TRUE;
+
+ mutex_exit(&msp->ms_lock);
+ space_map_truncate(sm, zfs_metaslab_sm_blksz, tx);
+
+ /*
+ * While we would ideally like to create a space map representation
+ * that consists only of allocation records, doing so can be
+ * prohibitively expensive because the in-core free tree can be
+ * large, and therefore computationally expensive to subtract
+ * from the condense_tree. Instead we sync out two trees, a cheap
+ * allocation only tree followed by the in-core free tree. While not
+ * optimal, this is typically close to optimal, and much cheaper to
+ * compute.
+ */
+ space_map_write(sm, condense_tree, SM_ALLOC, SM_NO_VDEVID, tx);
+ range_tree_vacate(condense_tree, NULL, NULL);
+ range_tree_destroy(condense_tree);
+
+ space_map_write(sm, msp->ms_allocatable, SM_FREE, SM_NO_VDEVID, tx);
+ mutex_enter(&msp->ms_lock);
+ msp->ms_condensing = B_FALSE;
+}
+
+/*
+ * Write a metaslab to disk in the context of the specified transaction group.
+ */
+void
+metaslab_sync(metaslab_t *msp, uint64_t txg)
+{
+ metaslab_group_t *mg = msp->ms_group;
+ vdev_t *vd = mg->mg_vd;
+ spa_t *spa = vd->vdev_spa;
+ objset_t *mos = spa_meta_objset(spa);
+ range_tree_t *alloctree = msp->ms_allocating[txg & TXG_MASK];
+ dmu_tx_t *tx;
+ uint64_t object = space_map_object(msp->ms_sm);
+
+ ASSERT(!vd->vdev_ishole);
+
+ /*
+ * This metaslab has just been added so there's no work to do now.
+ */
+ if (msp->ms_freeing == NULL) {
+ ASSERT3P(alloctree, ==, NULL);
+ return;
+ }
+
+ ASSERT3P(alloctree, !=, NULL);
+ ASSERT3P(msp->ms_freeing, !=, NULL);
+ ASSERT3P(msp->ms_freed, !=, NULL);
+ ASSERT3P(msp->ms_checkpointing, !=, NULL);
+
+ /*
+ * Normally, we don't want to process a metaslab if there are no
+ * allocations or frees to perform. However, if the metaslab is being
+ * forced to condense and it's loaded, we need to let it through.
+ */
+ if (range_tree_is_empty(alloctree) &&
+ range_tree_is_empty(msp->ms_freeing) &&
+ range_tree_is_empty(msp->ms_checkpointing) &&
+ !(msp->ms_loaded && msp->ms_condense_wanted))
+ return;
+
+ VERIFY(txg <= spa_final_dirty_txg(spa));
+
+ /*
+ * The only state that can actually be changing concurrently
+ * with metaslab_sync() is the metaslab's ms_allocatable. No
+ * other thread can be modifying this txg's alloc, freeing,
+ * freed, or space_map_phys_t. We drop ms_lock whenever we
+ * could call into the DMU, because the DMU can call down to
+ * us (e.g. via zio_free()) at any time.
+ *
+ * The spa_vdev_remove_thread() can be reading metaslab state
+ * concurrently, and it is locked out by the ms_sync_lock.
+ * Note that the ms_lock is insufficient for this, because it
+ * is dropped by space_map_write().
+ */
+ tx = dmu_tx_create_assigned(spa_get_dsl(spa), txg);
+
+ if (msp->ms_sm == NULL) {
+ uint64_t new_object;
+
+ new_object = space_map_alloc(mos, zfs_metaslab_sm_blksz, tx);
+ VERIFY3U(new_object, !=, 0);
+
+ VERIFY0(space_map_open(&msp->ms_sm, mos, new_object,
+ msp->ms_start, msp->ms_size, vd->vdev_ashift));
+
+ ASSERT(msp->ms_sm != NULL);
+ ASSERT0(metaslab_allocated_space(msp));
+ }
+
+ if (!range_tree_is_empty(msp->ms_checkpointing) &&
+ vd->vdev_checkpoint_sm == NULL) {
+ ASSERT(spa_has_checkpoint(spa));
+
+ uint64_t new_object = space_map_alloc(mos,
+ vdev_standard_sm_blksz, tx);
+ VERIFY3U(new_object, !=, 0);
+
+ VERIFY0(space_map_open(&vd->vdev_checkpoint_sm,
+ mos, new_object, 0, vd->vdev_asize, vd->vdev_ashift));
+ ASSERT3P(vd->vdev_checkpoint_sm, !=, NULL);
+
+ /*
+ * We save the space map object as an entry in vdev_top_zap
+ * so it can be retrieved when the pool is reopened after an
+ * export or through zdb.
+ */
+ VERIFY0(zap_add(vd->vdev_spa->spa_meta_objset,
+ vd->vdev_top_zap, VDEV_TOP_ZAP_POOL_CHECKPOINT_SM,
+ sizeof (new_object), 1, &new_object, tx));
+ }
+
+ mutex_enter(&msp->ms_sync_lock);
+ mutex_enter(&msp->ms_lock);
+
+ /*
+ * Note: metaslab_condense() clears the space map's histogram.
+ * Therefore we must verify and remove this histogram before
+ * condensing.
+ */
+ metaslab_group_histogram_verify(mg);
+ metaslab_class_histogram_verify(mg->mg_class);
+ metaslab_group_histogram_remove(mg, msp);
+
+ if (msp->ms_loaded && metaslab_should_condense(msp)) {
+ metaslab_condense(msp, txg, tx);
+ } else {
+ mutex_exit(&msp->ms_lock);
+ space_map_write(msp->ms_sm, alloctree, SM_ALLOC,
+ SM_NO_VDEVID, tx);
+ space_map_write(msp->ms_sm, msp->ms_freeing, SM_FREE,
+ SM_NO_VDEVID, tx);
+ mutex_enter(&msp->ms_lock);
+ }
+
+ msp->ms_allocated_space += range_tree_space(alloctree);
+ ASSERT3U(msp->ms_allocated_space, >=,
+ range_tree_space(msp->ms_freeing));
+ msp->ms_allocated_space -= range_tree_space(msp->ms_freeing);
+
+ if (!range_tree_is_empty(msp->ms_checkpointing)) {
+ ASSERT(spa_has_checkpoint(spa));
+ ASSERT3P(vd->vdev_checkpoint_sm, !=, NULL);
+
+ /*
+ * Since we are doing writes to disk and the ms_checkpointing
+ * tree won't be changing during that time, we drop the
+ * ms_lock while writing to the checkpoint space map.
+ */
+ mutex_exit(&msp->ms_lock);
+ space_map_write(vd->vdev_checkpoint_sm,
+ msp->ms_checkpointing, SM_FREE, SM_NO_VDEVID, tx);
+ mutex_enter(&msp->ms_lock);
+
+ spa->spa_checkpoint_info.sci_dspace +=
+ range_tree_space(msp->ms_checkpointing);
+ vd->vdev_stat.vs_checkpoint_space +=
+ range_tree_space(msp->ms_checkpointing);
+ ASSERT3U(vd->vdev_stat.vs_checkpoint_space, ==,
+ -space_map_allocated(vd->vdev_checkpoint_sm));
+
+ range_tree_vacate(msp->ms_checkpointing, NULL, NULL);
+ }
+
+ if (msp->ms_loaded) {
+ /*
+ * When the space map is loaded, we have an accurate
+ * histogram in the range tree. This gives us an opportunity
+ * to bring the space map's histogram up-to-date so we clear
+ * it first before updating it.
+ */
+ space_map_histogram_clear(msp->ms_sm);
+ space_map_histogram_add(msp->ms_sm, msp->ms_allocatable, tx);
+
+ /*
+ * Since we've cleared the histogram we need to add back
+ * any free space that has already been processed, plus
+ * any deferred space. This allows the on-disk histogram
+ * to accurately reflect all free space even if some space
+ * is not yet available for allocation (i.e. deferred).
+ */
+ space_map_histogram_add(msp->ms_sm, msp->ms_freed, tx);
+
+ /*
+ * Add back any deferred free space that has not been
+ * added back into the in-core free tree yet. This will
+ * ensure that we don't end up with a space map histogram
+ * that is completely empty unless the metaslab is fully
+ * allocated.
+ */
+ for (int t = 0; t < TXG_DEFER_SIZE; t++) {
+ space_map_histogram_add(msp->ms_sm,
+ msp->ms_defer[t], tx);
+ }
+ }
+
+ /*
+ * Always add the free space from this sync pass to the space
+ * map histogram. We want to make sure that the on-disk histogram
+ * accounts for all free space. If the space map is not loaded,
+ * then we will lose some accuracy but will correct it the next
+ * time we load the space map.
+ */
+ space_map_histogram_add(msp->ms_sm, msp->ms_freeing, tx);
+ metaslab_aux_histograms_update(msp);
+
+ metaslab_group_histogram_add(mg, msp);
+ metaslab_group_histogram_verify(mg);
+ metaslab_class_histogram_verify(mg->mg_class);
+
+ /*
+ * For sync pass 1, we avoid traversing this txg's free range tree
+ * and instead will just swap the pointers for freeing and freed.
+ * We can safely do this since the freed_tree is guaranteed to be
+ * empty on the initial pass.
+ */
+ if (spa_sync_pass(spa) == 1) {
+ range_tree_swap(&msp->ms_freeing, &msp->ms_freed);
+ ASSERT0(msp->ms_allocated_this_txg);
+ } else {
+ range_tree_vacate(msp->ms_freeing,
+ range_tree_add, msp->ms_freed);
+ }
+ msp->ms_allocated_this_txg += range_tree_space(alloctree);
+ range_tree_vacate(alloctree, NULL, NULL);
+
+ ASSERT0(range_tree_space(msp->ms_allocating[txg & TXG_MASK]));
+ ASSERT0(range_tree_space(msp->ms_allocating[TXG_CLEAN(txg)
+ & TXG_MASK]));
+ ASSERT0(range_tree_space(msp->ms_freeing));
+ ASSERT0(range_tree_space(msp->ms_checkpointing));
+
+ mutex_exit(&msp->ms_lock);
+
+ if (object != space_map_object(msp->ms_sm)) {
+ object = space_map_object(msp->ms_sm);
+ dmu_write(mos, vd->vdev_ms_array, sizeof (uint64_t) *
+ msp->ms_id, sizeof (uint64_t), &object, tx);
+ }
+ mutex_exit(&msp->ms_sync_lock);
+ dmu_tx_commit(tx);
+}
+
+/*
+ * Called after a transaction group has completely synced to mark
+ * all of the metaslab's free space as usable.
+ */
+void
+metaslab_sync_done(metaslab_t *msp, uint64_t txg)
+{
+ metaslab_group_t *mg = msp->ms_group;
+ vdev_t *vd = mg->mg_vd;
+ spa_t *spa = vd->vdev_spa;
+ range_tree_t **defer_tree;
+ int64_t alloc_delta, defer_delta;
+ boolean_t defer_allowed = B_TRUE;
+
+ ASSERT(!vd->vdev_ishole);
+
+ mutex_enter(&msp->ms_lock);
+
+ /*
+ * If this metaslab is just becoming available, initialize its
+ * range trees and add its capacity to the vdev.
+ */
+ if (msp->ms_freed == NULL) {
+ for (int t = 0; t < TXG_SIZE; t++) {
+ ASSERT(msp->ms_allocating[t] == NULL);
+
+ msp->ms_allocating[t] = range_tree_create(NULL, NULL);
+ }
+
+ ASSERT3P(msp->ms_freeing, ==, NULL);
+ msp->ms_freeing = range_tree_create(NULL, NULL);
+
+ ASSERT3P(msp->ms_freed, ==, NULL);
+ msp->ms_freed = range_tree_create(NULL, NULL);
+
+ for (int t = 0; t < TXG_DEFER_SIZE; t++) {
+ ASSERT(msp->ms_defer[t] == NULL);
+
+ msp->ms_defer[t] = range_tree_create(NULL, NULL);
+ }
+
+ ASSERT3P(msp->ms_checkpointing, ==, NULL);
+ msp->ms_checkpointing = range_tree_create(NULL, NULL);
+
+ metaslab_space_update(vd, mg->mg_class, 0, 0, msp->ms_size);
+ }
+ ASSERT0(range_tree_space(msp->ms_freeing));
+ ASSERT0(range_tree_space(msp->ms_checkpointing));
+
+ defer_tree = &msp->ms_defer[txg % TXG_DEFER_SIZE];
+
+ uint64_t free_space = metaslab_class_get_space(spa_normal_class(spa)) -
+ metaslab_class_get_alloc(spa_normal_class(spa));
+ if (free_space <= spa_get_slop_space(spa) || vd->vdev_removing) {
+ defer_allowed = B_FALSE;
+ }
+
+ defer_delta = 0;
+ alloc_delta = msp->ms_allocated_this_txg -
+ range_tree_space(msp->ms_freed);
+ if (defer_allowed) {
+ defer_delta = range_tree_space(msp->ms_freed) -
+ range_tree_space(*defer_tree);
+ } else {
+ defer_delta -= range_tree_space(*defer_tree);
+ }
+
+ metaslab_space_update(vd, mg->mg_class, alloc_delta + defer_delta,
+ defer_delta, 0);
+
+ /*
+ * If there's a metaslab_load() in progress, wait for it to complete
+ * so that we have a consistent view of the in-core space map.
+ */
+ metaslab_load_wait(msp);
+
+ /*
+ * Move the frees from the defer_tree back to the free
+ * range tree (if it's loaded). Swap the freed_tree and
+ * the defer_tree -- this is safe to do because we've
+ * just emptied out the defer_tree.
+ */
+ range_tree_vacate(*defer_tree,
+ msp->ms_loaded ? range_tree_add : NULL, msp->ms_allocatable);
+ if (defer_allowed) {
+ range_tree_swap(&msp->ms_freed, defer_tree);
+ } else {
+ range_tree_vacate(msp->ms_freed,
+ msp->ms_loaded ? range_tree_add : NULL,
+ msp->ms_allocatable);
+ }
+
+ msp->ms_synced_length = space_map_length(msp->ms_sm);
+
+ msp->ms_deferspace += defer_delta;
+ ASSERT3S(msp->ms_deferspace, >=, 0);
+ ASSERT3S(msp->ms_deferspace, <=, msp->ms_size);
+ if (msp->ms_deferspace != 0) {
+ /*
+ * Keep syncing this metaslab until all deferred frees
+ * are back in circulation.
+ */
+ vdev_dirty(vd, VDD_METASLAB, msp, txg + 1);
+ }
+ metaslab_aux_histograms_update_done(msp, defer_allowed);
+
+ if (msp->ms_new) {
+ msp->ms_new = B_FALSE;
+ mutex_enter(&mg->mg_lock);
+ mg->mg_ms_ready++;
+ mutex_exit(&mg->mg_lock);
+ }
+
+ /*
+ * Re-sort metaslab within its group now that we've adjusted
+ * its allocatable space.
+ */
+ metaslab_recalculate_weight_and_sort(msp);
+
+ /*
+ * If the metaslab is loaded and we've not tried to load or allocate
+ * from it in 'metaslab_unload_delay' txgs, then unload it.
+ */
+ if (msp->ms_loaded &&
+ msp->ms_initializing == 0 &&
+ msp->ms_selected_txg + metaslab_unload_delay < txg) {
+ for (int t = 1; t < TXG_CONCURRENT_STATES; t++) {
+ VERIFY0(range_tree_space(
+ msp->ms_allocating[(txg + t) & TXG_MASK]));
+ }
+ if (msp->ms_allocator != -1) {
+ metaslab_passivate(msp, msp->ms_weight &
+ ~METASLAB_ACTIVE_MASK);
+ }
+
+ if (!metaslab_debug_unload)
+ metaslab_unload(msp);
+ }
+
+ ASSERT0(range_tree_space(msp->ms_allocating[txg & TXG_MASK]));
+ ASSERT0(range_tree_space(msp->ms_freeing));
+ ASSERT0(range_tree_space(msp->ms_freed));
+ ASSERT0(range_tree_space(msp->ms_checkpointing));
+
+ msp->ms_allocated_this_txg = 0;
+ mutex_exit(&msp->ms_lock);
+}
+
+void
+metaslab_sync_reassess(metaslab_group_t *mg)
+{
+ spa_t *spa = mg->mg_class->mc_spa;
+
+ spa_config_enter(spa, SCL_ALLOC, FTAG, RW_READER);
+ metaslab_group_alloc_update(mg);
+ mg->mg_fragmentation = metaslab_group_fragmentation(mg);
+
+ /*
+ * Preload the next potential metaslabs but only on active
+ * metaslab groups. We can get into a state where the metaslab
+	 * is no longer active since we dirty metaslabs as we remove
+	 * a device, thus potentially making the metaslab group eligible
+ * for preloading.
+ */
+ if (mg->mg_activation_count > 0) {
+ metaslab_group_preload(mg);
+ }
+ spa_config_exit(spa, SCL_ALLOC, FTAG);
+}
+
+/*
+ * When writing a ditto block (i.e. more than one DVA for a given BP) on
+ * the same vdev as an existing DVA of this BP, then try to allocate it
+ * on a different metaslab than existing DVAs (i.e. a unique metaslab).
+ */
+static boolean_t
+metaslab_is_unique(metaslab_t *msp, dva_t *dva)
+{
+ uint64_t dva_ms_id;
+
+ if (DVA_GET_ASIZE(dva) == 0)
+ return (B_TRUE);
+
+ if (msp->ms_group->mg_vd->vdev_id != DVA_GET_VDEV(dva))
+ return (B_TRUE);
+
+ dva_ms_id = DVA_GET_OFFSET(dva) >> msp->ms_group->mg_vd->vdev_ms_shift;
+
+ return (msp->ms_id != dva_ms_id);
+}
+
+/*
+ * ==========================================================================
+ * Metaslab allocation tracing facility
+ * ==========================================================================
+ */
+#ifdef _METASLAB_TRACING
+kstat_t *metaslab_trace_ksp;
+kstat_named_t metaslab_trace_over_limit;
+
+void
+metaslab_alloc_trace_init(void)
+{
+ ASSERT(metaslab_alloc_trace_cache == NULL);
+ metaslab_alloc_trace_cache = kmem_cache_create(
+ "metaslab_alloc_trace_cache", sizeof (metaslab_alloc_trace_t),
+ 0, NULL, NULL, NULL, NULL, NULL, 0);
+ metaslab_trace_ksp = kstat_create("zfs", 0, "metaslab_trace_stats",
+ "misc", KSTAT_TYPE_NAMED, 1, KSTAT_FLAG_VIRTUAL);
+ if (metaslab_trace_ksp != NULL) {
+ metaslab_trace_ksp->ks_data = &metaslab_trace_over_limit;
+ kstat_named_init(&metaslab_trace_over_limit,
+ "metaslab_trace_over_limit", KSTAT_DATA_UINT64);
+ kstat_install(metaslab_trace_ksp);
+ }
+}
+
+void
+metaslab_alloc_trace_fini(void)
+{
+ if (metaslab_trace_ksp != NULL) {
+ kstat_delete(metaslab_trace_ksp);
+ metaslab_trace_ksp = NULL;
+ }
+ kmem_cache_destroy(metaslab_alloc_trace_cache);
+ metaslab_alloc_trace_cache = NULL;
+}
+
+/*
+ * Add an allocation trace element to the allocation tracing list.
+ */
+static void
+metaslab_trace_add(zio_alloc_list_t *zal, metaslab_group_t *mg,
+ metaslab_t *msp, uint64_t psize, uint32_t dva_id, uint64_t offset,
+ int allocator)
+{
+ if (!metaslab_trace_enabled)
+ return;
+
+ /*
+ * When the tracing list reaches its maximum we remove
+ * the second element in the list before adding a new one.
+ * By removing the second element we preserve the original
+	 * entry as a clue to what allocation steps have already been
+ * performed.
+ */
+ if (zal->zal_size == metaslab_trace_max_entries) {
+ metaslab_alloc_trace_t *mat_next;
+#ifdef DEBUG
+ panic("too many entries in allocation list");
+#endif
+ atomic_inc_64(&metaslab_trace_over_limit.value.ui64);
+ zal->zal_size--;
+ mat_next = list_next(&zal->zal_list, list_head(&zal->zal_list));
+ list_remove(&zal->zal_list, mat_next);
+ kmem_cache_free(metaslab_alloc_trace_cache, mat_next);
+ }
+
+ metaslab_alloc_trace_t *mat =
+ kmem_cache_alloc(metaslab_alloc_trace_cache, KM_SLEEP);
+ list_link_init(&mat->mat_list_node);
+ mat->mat_mg = mg;
+ mat->mat_msp = msp;
+ mat->mat_size = psize;
+ mat->mat_dva_id = dva_id;
+ mat->mat_offset = offset;
+ mat->mat_weight = 0;
+ mat->mat_allocator = allocator;
+
+ if (msp != NULL)
+ mat->mat_weight = msp->ms_weight;
+
+ /*
+ * The list is part of the zio so locking is not required. Only
+ * a single thread will perform allocations for a given zio.
+ */
+ list_insert_tail(&zal->zal_list, mat);
+ zal->zal_size++;
+
+ ASSERT3U(zal->zal_size, <=, metaslab_trace_max_entries);
+}
+
+void
+metaslab_trace_init(zio_alloc_list_t *zal)
+{
+ list_create(&zal->zal_list, sizeof (metaslab_alloc_trace_t),
+ offsetof(metaslab_alloc_trace_t, mat_list_node));
+ zal->zal_size = 0;
+}
+
+void
+metaslab_trace_fini(zio_alloc_list_t *zal)
+{
+ metaslab_alloc_trace_t *mat;
+
+ while ((mat = list_remove_head(&zal->zal_list)) != NULL)
+ kmem_cache_free(metaslab_alloc_trace_cache, mat);
+ list_destroy(&zal->zal_list);
+ zal->zal_size = 0;
+}
+
+#else
+
+#define metaslab_trace_add(zal, mg, msp, psize, id, off, alloc)
+
+void
+metaslab_alloc_trace_init(void)
+{
+}
+
+void
+metaslab_alloc_trace_fini(void)
+{
+}
+
+void
+metaslab_trace_init(zio_alloc_list_t *zal)
+{
+}
+
+void
+metaslab_trace_fini(zio_alloc_list_t *zal)
+{
+}
+
+#endif /* _METASLAB_TRACING */
+
+/*
+ * ==========================================================================
+ * Metaslab block operations
+ * ==========================================================================
+ */
+
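+/*
+ * Account a pending async allocation against the metaslab group's
+ * per-allocator queue depth so the allocation throttle can track how many
+ * allocations are outstanding to this group. This is a no-op for
+ * synchronous or unthrottled allocations.
+ */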
+static void
+metaslab_group_alloc_increment(spa_t *spa, uint64_t vdev, void *tag, int flags,
+ int allocator)
+{
+ if (!(flags & METASLAB_ASYNC_ALLOC) ||
+ (flags & METASLAB_DONT_THROTTLE))
+ return;
+
+ metaslab_group_t *mg = vdev_lookup_top(spa, vdev)->vdev_mg;
+ if (!mg->mg_class->mc_alloc_throttle_enabled)
+ return;
+
+ (void) zfs_refcount_add(&mg->mg_alloc_queue_depth[allocator], tag);
+}
+
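+/*
+ * Lock-free bump of the group's current maximum queue depth toward its
+ * configured maximum. The atomic_cas_64() loop retries if another thread
+ * raced us; each successful bump also grows the class-wide allocator
+ * slot count.
+ */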
+static void
+metaslab_group_increment_qdepth(metaslab_group_t *mg, int allocator)
+{
+ uint64_t max = mg->mg_max_alloc_queue_depth;
+ uint64_t cur = mg->mg_cur_max_alloc_queue_depth[allocator];
+ while (cur < max) {
+ if (atomic_cas_64(&mg->mg_cur_max_alloc_queue_depth[allocator],
+ cur, cur + 1) == cur) {
+ atomic_inc_64(
+ &mg->mg_class->mc_alloc_max_slots[allocator]);
+ return;
+ }
+ cur = mg->mg_cur_max_alloc_queue_depth[allocator];
+ }
+}
+
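+/*
+ * Drop the queue-depth reference taken by metaslab_group_alloc_increment().
+ * On I/O completion, also give the group an opportunity to grow its
+ * maximum queue depth.
+ */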
+void
+metaslab_group_alloc_decrement(spa_t *spa, uint64_t vdev, void *tag, int flags,
+ int allocator, boolean_t io_complete)
+{
+ if (!(flags & METASLAB_ASYNC_ALLOC) ||
+ (flags & METASLAB_DONT_THROTTLE))
+ return;
+
+ metaslab_group_t *mg = vdev_lookup_top(spa, vdev)->vdev_mg;
+ if (!mg->mg_class->mc_alloc_throttle_enabled)
+ return;
+
+ (void) zfs_refcount_remove(&mg->mg_alloc_queue_depth[allocator], tag);
+ if (io_complete)
+ metaslab_group_increment_qdepth(mg, allocator);
+}
+
+void
+metaslab_group_alloc_verify(spa_t *spa, const blkptr_t *bp, void *tag,
+ int allocator)
+{
+#ifdef ZFS_DEBUG
+ const dva_t *dva = bp->blk_dva;
+ int ndvas = BP_GET_NDVAS(bp);
+
+ for (int d = 0; d < ndvas; d++) {
+ uint64_t vdev = DVA_GET_VDEV(&dva[d]);
+ metaslab_group_t *mg = vdev_lookup_top(spa, vdev)->vdev_mg;
+ VERIFY(zfs_refcount_not_held(
+ &mg->mg_alloc_queue_depth[allocator], tag));
+ }
+#endif
+}
+
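+/*
+ * Attempt to allocate 'size' bytes from this metaslab using the class's
+ * allocator ops. On success, move the range from ms_allocatable into this
+ * txg's ms_allocating tree and dirty the metaslab. Returns the offset of
+ * the allocation, or -1ULL on failure.
+ */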
+static uint64_t
+metaslab_block_alloc(metaslab_t *msp, uint64_t size, uint64_t txg)
+{
+ uint64_t start;
+ range_tree_t *rt = msp->ms_allocatable;
+ metaslab_class_t *mc = msp->ms_group->mg_class;
+
+ VERIFY(!msp->ms_condensing);
+ VERIFY0(msp->ms_initializing);
+
+ start = mc->mc_ops->msop_alloc(msp, size);
+ if (start != -1ULL) {
+ metaslab_group_t *mg = msp->ms_group;
+ vdev_t *vd = mg->mg_vd;
+
+ VERIFY0(P2PHASE(start, 1ULL << vd->vdev_ashift));
+ VERIFY0(P2PHASE(size, 1ULL << vd->vdev_ashift));
+ VERIFY3U(range_tree_space(rt) - size, <=, msp->ms_size);
+ range_tree_remove(rt, start, size);
+
+ if (range_tree_is_empty(msp->ms_allocating[txg & TXG_MASK]))
+ vdev_dirty(mg->mg_vd, VDD_METASLAB, msp, txg);
+
+ range_tree_add(msp->ms_allocating[txg & TXG_MASK], start, size);
+
+ /* Track the last successful allocation */
+ msp->ms_alloc_txg = txg;
+ metaslab_verify_space(msp, txg);
+ }
+
+ /*
+ * Now that we've attempted the allocation we need to update the
+ * metaslab's maximum block size since it may have changed.
+ */
+ msp->ms_max_size = metaslab_block_maxsize(msp);
+ return (start);
+}
+
+/*
+ * Find the metaslab with the highest weight that is less than what we've
+ * already tried. In the common case, this means that we will examine each
+ * metaslab at most once. Note that concurrent callers could reorder metaslabs
+ * by activation/passivation once we have dropped the mg_lock. If a metaslab is
+ * activated by another thread, and we fail to allocate from the metaslab we
+ * have selected, we may not try the newly-activated metaslab, and instead
+ * activate another metaslab. This is not optimal, but generally does not cause
+ * any problems (a possible exception being if every metaslab is completely
+ * full except for the newly-activated metaslab, which we fail to examine).
+ */
+static metaslab_t *
+find_valid_metaslab(metaslab_group_t *mg, uint64_t activation_weight,
+ dva_t *dva, int d, boolean_t want_unique, uint64_t asize, int allocator,
+ zio_alloc_list_t *zal, metaslab_t *search, boolean_t *was_active)
+{
+ avl_index_t idx;
+ avl_tree_t *t = &mg->mg_metaslab_tree;
+ metaslab_t *msp = avl_find(t, search, &idx);
+ if (msp == NULL)
+ msp = avl_nearest(t, idx, AVL_AFTER);
+
+ for (; msp != NULL; msp = AVL_NEXT(t, msp)) {
+ int i;
+ if (!metaslab_should_allocate(msp, asize)) {
+ metaslab_trace_add(zal, mg, msp, asize, d,
+ TRACE_TOO_SMALL, allocator);
+ continue;
+ }
+
+ /*
+ * If the selected metaslab is condensing or being
+ * initialized, skip it.
+ */
+ if (msp->ms_condensing || msp->ms_initializing > 0)
+ continue;
+
+ *was_active = msp->ms_allocator != -1;
+ /*
+ * If we're activating as primary, this is our first allocation
+ * from this disk, so we don't need to check how close we are.
+ * If the metaslab under consideration was already active,
+ * we're getting desperate enough to steal another allocator's
+ * metaslab, so we still don't care about distances.
+ */
+ if (activation_weight == METASLAB_WEIGHT_PRIMARY || *was_active)
+ break;
+
+ for (i = 0; i < d; i++) {
+ if (want_unique &&
+ !metaslab_is_unique(msp, &dva[i]))
+ break; /* try another metaslab */
+ }
+ if (i == d)
+ break;
+ }
+
+ if (msp != NULL) {
+ search->ms_weight = msp->ms_weight;
+ search->ms_start = msp->ms_start + 1;
+ search->ms_allocator = msp->ms_allocator;
+ search->ms_primary = msp->ms_primary;
+ }
+ return (msp);
+}
+
+/* ARGSUSED */
+static uint64_t
+metaslab_group_alloc_normal(metaslab_group_t *mg, zio_alloc_list_t *zal,
+ uint64_t asize, uint64_t txg, boolean_t want_unique, dva_t *dva,
+ int d, int allocator)
+{
+ metaslab_t *msp = NULL;
+ uint64_t offset = -1ULL;
+ uint64_t activation_weight;
+
+ activation_weight = METASLAB_WEIGHT_PRIMARY;
+ for (int i = 0; i < d; i++) {
+ if (activation_weight == METASLAB_WEIGHT_PRIMARY &&
+ DVA_GET_VDEV(&dva[i]) == mg->mg_vd->vdev_id) {
+ activation_weight = METASLAB_WEIGHT_SECONDARY;
+ } else if (activation_weight == METASLAB_WEIGHT_SECONDARY &&
+ DVA_GET_VDEV(&dva[i]) == mg->mg_vd->vdev_id) {
+ activation_weight = METASLAB_WEIGHT_CLAIM;
+ break;
+ }
+ }
+
+ /*
+ * If we don't have enough metaslabs active to fill the entire array, we
+ * just use the 0th slot.
+ */
+ if (mg->mg_ms_ready < mg->mg_allocators * 3)
+ allocator = 0;
+
+ ASSERT3U(mg->mg_vd->vdev_ms_count, >=, 2);
+
+ metaslab_t *search = kmem_alloc(sizeof (*search), KM_SLEEP);
+ search->ms_weight = UINT64_MAX;
+ search->ms_start = 0;
+ /*
+ * At the end of the metaslab tree are the already-active metaslabs,
+ * first the primaries, then the secondaries. When we resume searching
+ * through the tree, we need to consider ms_allocator and ms_primary so
+ * we start in the location right after where we left off, and don't
+ * accidentally loop forever considering the same metaslabs.
+ */
+ search->ms_allocator = -1;
+ search->ms_primary = B_TRUE;
+ for (;;) {
+ boolean_t was_active = B_FALSE;
+
+ mutex_enter(&mg->mg_lock);
+
+ if (activation_weight == METASLAB_WEIGHT_PRIMARY &&
+ mg->mg_primaries[allocator] != NULL) {
+ msp = mg->mg_primaries[allocator];
+ was_active = B_TRUE;
+ } else if (activation_weight == METASLAB_WEIGHT_SECONDARY &&
+ mg->mg_secondaries[allocator] != NULL) {
+ msp = mg->mg_secondaries[allocator];
+ was_active = B_TRUE;
+ } else {
+ msp = find_valid_metaslab(mg, activation_weight, dva, d,
+ want_unique, asize, allocator, zal, search,
+ &was_active);
+ }
+
+ mutex_exit(&mg->mg_lock);
+ if (msp == NULL) {
+ kmem_free(search, sizeof (*search));
+ return (-1ULL);
+ }
+
+ mutex_enter(&msp->ms_lock);
+ /*
+ * Ensure that the metaslab we have selected is still
+ * capable of handling our request. It's possible that
+ * another thread may have changed the weight while we
+ * were blocked on the metaslab lock. We check the
+ * active status first to see if we need to reselect
+ * a new metaslab.
+ */
+ if (was_active && !(msp->ms_weight & METASLAB_ACTIVE_MASK)) {
+ mutex_exit(&msp->ms_lock);
+ continue;
+ }
+
+ /*
+ * If the metaslab is freshly activated for an allocator that
+ * isn't the one we're allocating from, or if it's a primary and
+ * we're seeking a secondary (or vice versa), we go back and
+ * select a new metaslab.
+ */
+ if (!was_active && (msp->ms_weight & METASLAB_ACTIVE_MASK) &&
+ (msp->ms_allocator != -1) &&
+ (msp->ms_allocator != allocator || ((activation_weight ==
+ METASLAB_WEIGHT_PRIMARY) != msp->ms_primary))) {
+ mutex_exit(&msp->ms_lock);
+ continue;
+ }
+
+ if (msp->ms_weight & METASLAB_WEIGHT_CLAIM &&
+ activation_weight != METASLAB_WEIGHT_CLAIM) {
+ metaslab_passivate(msp, msp->ms_weight &
+ ~METASLAB_WEIGHT_CLAIM);
+ mutex_exit(&msp->ms_lock);
+ continue;
+ }
+
+ if (metaslab_activate(msp, allocator, activation_weight) != 0) {
+ mutex_exit(&msp->ms_lock);
+ continue;
+ }
+
+ msp->ms_selected_txg = txg;
+
+ /*
+ * Now that we have the lock, recheck to see if we should
+		 * continue to use this metaslab for this allocation. The
+		 * metaslab is now loaded so metaslab_should_allocate() can
+ * accurately determine if the allocation attempt should
+ * proceed.
+ */
+ if (!metaslab_should_allocate(msp, asize)) {
+ /* Passivate this metaslab and select a new one. */
+ metaslab_trace_add(zal, mg, msp, asize, d,
+ TRACE_TOO_SMALL, allocator);
+ goto next;
+ }
+
+ /*
+ * If this metaslab is currently condensing then pick again as
+ * we can't manipulate this metaslab until it's committed
+ * to disk. If this metaslab is being initialized, we shouldn't
+ * allocate from it since the allocated region might be
+ * overwritten after allocation.
+ */
+ if (msp->ms_condensing) {
+ metaslab_trace_add(zal, mg, msp, asize, d,
+ TRACE_CONDENSING, allocator);
+ metaslab_passivate(msp, msp->ms_weight &
+ ~METASLAB_ACTIVE_MASK);
+ mutex_exit(&msp->ms_lock);
+ continue;
+ } else if (msp->ms_initializing > 0) {
+ metaslab_trace_add(zal, mg, msp, asize, d,
+ TRACE_INITIALIZING, allocator);
+ metaslab_passivate(msp, msp->ms_weight &
+ ~METASLAB_ACTIVE_MASK);
+ mutex_exit(&msp->ms_lock);
+ continue;
+ }
+
+ offset = metaslab_block_alloc(msp, asize, txg);
+ metaslab_trace_add(zal, mg, msp, asize, d, offset, allocator);
+
+ if (offset != -1ULL) {
+ /* Proactively passivate the metaslab, if needed */
+ metaslab_segment_may_passivate(msp);
+ break;
+ }
+next:
+ ASSERT(msp->ms_loaded);
+
+ /*
+ * We were unable to allocate from this metaslab so determine
+ * a new weight for this metaslab. Now that we have loaded
+ * the metaslab we can provide a better hint to the metaslab
+ * selector.
+ *
+ * For space-based metaslabs, we use the maximum block size.
+ * This information is only available when the metaslab
+ * is loaded and is more accurate than the generic free
+ * space weight that was calculated by metaslab_weight().
+ * This information allows us to quickly compare the maximum
+ * available allocation in the metaslab to the allocation
+ * size being requested.
+ *
+ * For segment-based metaslabs, determine the new weight
+ * based on the highest bucket in the range tree. We
+ * explicitly use the loaded segment weight (i.e. the range
+ * tree histogram) since it contains the space that is
+ * currently available for allocation and is accurate
+ * even within a sync pass.
+ */
+ if (WEIGHT_IS_SPACEBASED(msp->ms_weight)) {
+ uint64_t weight = metaslab_block_maxsize(msp);
+ WEIGHT_SET_SPACEBASED(weight);
+ metaslab_passivate(msp, weight);
+ } else {
+ metaslab_passivate(msp,
+ metaslab_weight_from_range_tree(msp));
+ }
+
+ /*
+ * We have just failed an allocation attempt, check
+ * that metaslab_should_allocate() agrees. Otherwise,
+ * we may end up in an infinite loop retrying the same
+ * metaslab.
+ */
+ ASSERT(!metaslab_should_allocate(msp, asize));
+
+ mutex_exit(&msp->ms_lock);
+ }
+ mutex_exit(&msp->ms_lock);
+ kmem_free(search, sizeof (*search));
+ return (offset);
+}
+
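+/*
+ * Wrapper around metaslab_group_alloc_normal() that records per-group
+ * allocation statistics and marks the group as out of space when even the
+ * minimum gang block size cannot be allocated.
+ */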
+static uint64_t
+metaslab_group_alloc(metaslab_group_t *mg, zio_alloc_list_t *zal,
+ uint64_t asize, uint64_t txg, boolean_t want_unique, dva_t *dva,
+ int d, int allocator)
+{
+ uint64_t offset;
+ ASSERT(mg->mg_initialized);
+
+ offset = metaslab_group_alloc_normal(mg, zal, asize, txg, want_unique,
+ dva, d, allocator);
+
+ mutex_enter(&mg->mg_lock);
+ if (offset == -1ULL) {
+ mg->mg_failed_allocations++;
+ metaslab_trace_add(zal, mg, NULL, asize, d,
+ TRACE_GROUP_FAILURE, allocator);
+ if (asize == SPA_GANGBLOCKSIZE) {
+ /*
+ * This metaslab group was unable to allocate
+ * the minimum gang block size so it must be out of
+ * space. We must notify the allocation throttle
+ * to start skipping allocation attempts to this
+ * metaslab group until more space becomes available.
+ * Note: this failure cannot be caused by the
+ * allocation throttle since the allocation throttle
+ * is only responsible for skipping devices and
+ * not failing block allocations.
+ */
+ mg->mg_no_free_space = B_TRUE;
+ }
+ }
+ mg->mg_allocations++;
+ mutex_exit(&mg->mg_lock);
+ return (offset);
+}
+
+/*
+ * Allocate a block for the specified i/o.
+ */
+int
+metaslab_alloc_dva(spa_t *spa, metaslab_class_t *mc, uint64_t psize,
+ dva_t *dva, int d, dva_t *hintdva, uint64_t txg, int flags,
+ zio_alloc_list_t *zal, int allocator)
+{
+ metaslab_group_t *mg, *rotor;
+ vdev_t *vd;
+ boolean_t try_hard = B_FALSE;
+
+ ASSERT(!DVA_IS_VALID(&dva[d]));
+
+ /*
+ * For testing, make some blocks above a certain size be gang blocks.
+ * This will also test spilling from special to normal.
+ */
+ if (psize >= metaslab_force_ganging && (ddi_get_lbolt() & 3) == 0) {
+ metaslab_trace_add(zal, NULL, NULL, psize, d, TRACE_FORCE_GANG,
+ allocator);
+ return (SET_ERROR(ENOSPC));
+ }
+
+ /*
+ * Start at the rotor and loop through all mgs until we find something.
+ * Note that there's no locking on mc_rotor or mc_aliquot because
+ * nothing actually breaks if we miss a few updates -- we just won't
+ * allocate quite as evenly. It all balances out over time.
+ *
+ * If we are doing ditto or log blocks, try to spread them across
+ * consecutive vdevs. If we're forced to reuse a vdev before we've
+ * allocated all of our ditto blocks, then try and spread them out on
+ * that vdev as much as possible. If it turns out to not be possible,
+ * gradually lower our standards until anything becomes acceptable.
+ * Also, allocating on consecutive vdevs (as opposed to random vdevs)
+ * gives us hope of containing our fault domains to something we're
+ * able to reason about. Otherwise, any two top-level vdev failures
+ * will guarantee the loss of data. With consecutive allocation,
+ * only two adjacent top-level vdev failures will result in data loss.
+ *
+ * If we are doing gang blocks (hintdva is non-NULL), try to keep
+ * ourselves on the same vdev as our gang block header. That
+ * way, we can hope for locality in vdev_cache, plus it makes our
+ * fault domains something tractable.
+ */
+ if (hintdva) {
+ vd = vdev_lookup_top(spa, DVA_GET_VDEV(&hintdva[d]));
+
+ /*
+ * It's possible the vdev we're using as the hint no
+ * longer exists or its mg has been closed (e.g. by
+ * device removal). Consult the rotor when
+ * all else fails.
+ */
+ if (vd != NULL && vd->vdev_mg != NULL) {
+ mg = vd->vdev_mg;
+
+ if (flags & METASLAB_HINTBP_AVOID &&
+ mg->mg_next != NULL)
+ mg = mg->mg_next;
+ } else {
+ mg = mc->mc_rotor;
+ }
+ } else if (d != 0) {
+ vd = vdev_lookup_top(spa, DVA_GET_VDEV(&dva[d - 1]));
+ mg = vd->vdev_mg->mg_next;
+ } else {
+ ASSERT(mc->mc_rotor != NULL);
+ mg = mc->mc_rotor;
+ }
+
+ /*
+ * If the hint put us into the wrong metaslab class, or into a
+ * metaslab group that has been passivated, just follow the rotor.
+ */
+ if (mg->mg_class != mc || mg->mg_activation_count <= 0)
+ mg = mc->mc_rotor;
+
+ rotor = mg;
+top:
+ do {
+ boolean_t allocatable;
+
+ ASSERT(mg->mg_activation_count == 1);
+ vd = mg->mg_vd;
+
+ /*
+ * Don't allocate from faulted devices.
+ */
+ if (try_hard) {
+ spa_config_enter(spa, SCL_ZIO, FTAG, RW_READER);
+ allocatable = vdev_allocatable(vd);
+ spa_config_exit(spa, SCL_ZIO, FTAG);
+ } else {
+ allocatable = vdev_allocatable(vd);
+ }
+
+ /*
+ * Determine if the selected metaslab group is eligible
+ * for allocations. If we're ganging then don't allow
+ * this metaslab group to skip allocations since that would
+ * inadvertently return ENOSPC and suspend the pool
+ * even though space is still available.
+ */
+ if (allocatable && !GANG_ALLOCATION(flags) && !try_hard) {
+ allocatable = metaslab_group_allocatable(mg, rotor,
+ psize, allocator, d);
+ }
+
+ if (!allocatable) {
+ metaslab_trace_add(zal, mg, NULL, psize, d,
+ TRACE_NOT_ALLOCATABLE, allocator);
+ goto next;
+ }
+
+ ASSERT(mg->mg_initialized);
+
+ /*
+ * Avoid writing single-copy data to a failing,
+ * non-redundant vdev, unless we've already tried all
+ * other vdevs.
+ */
+ if ((vd->vdev_stat.vs_write_errors > 0 ||
+ vd->vdev_state < VDEV_STATE_HEALTHY) &&
+ d == 0 && !try_hard && vd->vdev_children == 0) {
+ metaslab_trace_add(zal, mg, NULL, psize, d,
+ TRACE_VDEV_ERROR, allocator);
+ goto next;
+ }
+
+ ASSERT(mg->mg_class == mc);
+
+ uint64_t asize = vdev_psize_to_asize(vd, psize);
+ ASSERT(P2PHASE(asize, 1ULL << vd->vdev_ashift) == 0);
+
+ /*
+ * If we don't need to try hard, then require that the
+		 * block be on a different metaslab from any other DVAs
+ * in this BP (unique=true). If we are trying hard, then
+ * allow any metaslab to be used (unique=false).
+ */
+ uint64_t offset = metaslab_group_alloc(mg, zal, asize, txg,
+ !try_hard, dva, d, allocator);
+
+ if (offset != -1ULL) {
+ /*
+ * If we've just selected this metaslab group,
+ * figure out whether the corresponding vdev is
+ * over- or under-used relative to the pool,
+ * and set an allocation bias to even it out.
+ */
+ if (mc->mc_aliquot == 0 && metaslab_bias_enabled) {
+ vdev_stat_t *vs = &vd->vdev_stat;
+ int64_t vu, cu;
+
+ vu = (vs->vs_alloc * 100) / (vs->vs_space + 1);
+ cu = (mc->mc_alloc * 100) / (mc->mc_space + 1);
+
+ /*
+ * Calculate how much more or less we should
+ * try to allocate from this device during
+ * this iteration around the rotor.
+ * For example, if a device is 80% full
+ * and the pool is 20% full then we should
+ * reduce allocations by 60% on this device.
+ *
+ * mg_bias = (20 - 80) * 512K / 100 = -307K
+ *
+ * This reduces allocations by 307K for this
+ * iteration.
+ */
+ mg->mg_bias = ((cu - vu) *
+ (int64_t)mg->mg_aliquot) / 100;
+ } else if (!metaslab_bias_enabled) {
+ mg->mg_bias = 0;
+ }
+
+ if (atomic_add_64_nv(&mc->mc_aliquot, asize) >=
+ mg->mg_aliquot + mg->mg_bias) {
+ mc->mc_rotor = mg->mg_next;
+ mc->mc_aliquot = 0;
+ }
+
+ DVA_SET_VDEV(&dva[d], vd->vdev_id);
+ DVA_SET_OFFSET(&dva[d], offset);
+ DVA_SET_GANG(&dva[d], !!(flags & METASLAB_GANG_HEADER));
+ DVA_SET_ASIZE(&dva[d], asize);
+
+ return (0);
+ }
+next:
+ mc->mc_rotor = mg->mg_next;
+ mc->mc_aliquot = 0;
+ } while ((mg = mg->mg_next) != rotor);
+
+ /*
+ * If we haven't tried hard, do so now.
+ */
+ if (!try_hard) {
+ try_hard = B_TRUE;
+ goto top;
+ }
+
+ bzero(&dva[d], sizeof (dva_t));
+
+ metaslab_trace_add(zal, rotor, NULL, psize, d, TRACE_ENOSPC, allocator);
+ return (SET_ERROR(ENOSPC));
+}
+
+void
+metaslab_free_concrete(vdev_t *vd, uint64_t offset, uint64_t asize,
+ boolean_t checkpoint)
+{
+ metaslab_t *msp;
+ spa_t *spa = vd->vdev_spa;
+
+ ASSERT(vdev_is_concrete(vd));
+ ASSERT3U(spa_config_held(spa, SCL_ALL, RW_READER), !=, 0);
+ ASSERT3U(offset >> vd->vdev_ms_shift, <, vd->vdev_ms_count);
+
+ msp = vd->vdev_ms[offset >> vd->vdev_ms_shift];
+
+ VERIFY(!msp->ms_condensing);
+ VERIFY3U(offset, >=, msp->ms_start);
+ VERIFY3U(offset + asize, <=, msp->ms_start + msp->ms_size);
+ VERIFY0(P2PHASE(offset, 1ULL << vd->vdev_ashift));
+ VERIFY0(P2PHASE(asize, 1ULL << vd->vdev_ashift));
+
+ metaslab_check_free_impl(vd, offset, asize);
+
+ mutex_enter(&msp->ms_lock);
+ if (range_tree_is_empty(msp->ms_freeing) &&
+ range_tree_is_empty(msp->ms_checkpointing)) {
+ vdev_dirty(vd, VDD_METASLAB, msp, spa_syncing_txg(spa));
+ }
+
+ if (checkpoint) {
+ ASSERT(spa_has_checkpoint(spa));
+ range_tree_add(msp->ms_checkpointing, offset, asize);
+ } else {
+ range_tree_add(msp->ms_freeing, offset, asize);
+ }
+ mutex_exit(&msp->ms_lock);
+}
+
+/* ARGSUSED */
+void
+metaslab_free_impl_cb(uint64_t inner_offset, vdev_t *vd, uint64_t offset,
+ uint64_t size, void *arg)
+{
+ boolean_t *checkpoint = arg;
+
+ ASSERT3P(checkpoint, !=, NULL);
+
+ if (vd->vdev_ops->vdev_op_remap != NULL)
+ vdev_indirect_mark_obsolete(vd, offset, size);
+ else
+ metaslab_free_impl(vd, offset, size, *checkpoint);
+}
+
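+/*
+ * Free a segment on the given vdev, routing it to the device-removal code
+ * if the vdev is being removed, remapping it if the vdev is indirect, and
+ * otherwise freeing it in the concrete metaslab.
+ */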
+static void
+metaslab_free_impl(vdev_t *vd, uint64_t offset, uint64_t size,
+ boolean_t checkpoint)
+{
+ spa_t *spa = vd->vdev_spa;
+
+ ASSERT3U(spa_config_held(spa, SCL_ALL, RW_READER), !=, 0);
+
+ if (spa_syncing_txg(spa) > spa_freeze_txg(spa))
+ return;
+
+ if (spa->spa_vdev_removal != NULL &&
+ spa->spa_vdev_removal->svr_vdev_id == vd->vdev_id &&
+ vdev_is_concrete(vd)) {
+ /*
+ * Note: we check if the vdev is concrete because when
+ * we complete the removal, we first change the vdev to be
+ * an indirect vdev (in open context), and then (in syncing
+ * context) clear spa_vdev_removal.
+ */
+ free_from_removing_vdev(vd, offset, size);
+ } else if (vd->vdev_ops->vdev_op_remap != NULL) {
+ vdev_indirect_mark_obsolete(vd, offset, size);
+ vd->vdev_ops->vdev_op_remap(vd, offset, size,
+ metaslab_free_impl_cb, &checkpoint);
+ } else {
+ metaslab_free_concrete(vd, offset, size, checkpoint);
+ }
+}
+
+typedef struct remap_blkptr_cb_arg {
+ blkptr_t *rbca_bp;
+ spa_remap_cb_t rbca_cb;
+ vdev_t *rbca_remap_vd;
+ uint64_t rbca_remap_offset;
+ void *rbca_cb_arg;
+} remap_blkptr_cb_arg_t;
+
+void
+remap_blkptr_cb(uint64_t inner_offset, vdev_t *vd, uint64_t offset,
+ uint64_t size, void *arg)
+{
+ remap_blkptr_cb_arg_t *rbca = arg;
+ blkptr_t *bp = rbca->rbca_bp;
+
+ /* We can not remap split blocks. */
+ if (size != DVA_GET_ASIZE(&bp->blk_dva[0]))
+ return;
+ ASSERT0(inner_offset);
+
+ if (rbca->rbca_cb != NULL) {
+ /*
+ * At this point we know that we are not handling split
+ * blocks and we invoke the callback on the previous
+ * vdev which must be indirect.
+ */
+ ASSERT3P(rbca->rbca_remap_vd->vdev_ops, ==, &vdev_indirect_ops);
+
+ rbca->rbca_cb(rbca->rbca_remap_vd->vdev_id,
+ rbca->rbca_remap_offset, size, rbca->rbca_cb_arg);
+
+ /* set up remap_blkptr_cb_arg for the next call */
+ rbca->rbca_remap_vd = vd;
+ rbca->rbca_remap_offset = offset;
+ }
+
+ /*
+ * The phys birth time is that of dva[0]. This ensures that we know
+ * when each dva was written, so that resilver can determine which
+ * blocks need to be scrubbed (i.e. those written during the time
+ * the vdev was offline). It also ensures that the key used in
+ * the ARC hash table is unique (i.e. dva[0] + phys_birth). If
+ * we didn't change the phys_birth, a lookup in the ARC for a
+ * remapped BP could find the data that was previously stored at
+ * this vdev + offset.
+ */
+ vdev_t *oldvd = vdev_lookup_top(vd->vdev_spa,
+ DVA_GET_VDEV(&bp->blk_dva[0]));
+ vdev_indirect_births_t *vib = oldvd->vdev_indirect_births;
+ bp->blk_phys_birth = vdev_indirect_births_physbirth(vib,
+ DVA_GET_OFFSET(&bp->blk_dva[0]), DVA_GET_ASIZE(&bp->blk_dva[0]));
+
+ DVA_SET_VDEV(&bp->blk_dva[0], vd->vdev_id);
+ DVA_SET_OFFSET(&bp->blk_dva[0], offset);
+}
+
+/*
+ * If the block pointer contains any indirect DVAs, modify them to refer to
+ * concrete DVAs. Note that this will sometimes not be possible, leaving
+ * the indirect DVA in place. This happens if the indirect DVA spans multiple
+ * segments in the mapping (i.e. it is a "split block").
+ *
+ * If the BP was remapped, calls the callback on the original dva (note the
+ * callback can be called multiple times if the original indirect DVA refers
+ * to another indirect DVA, etc).
+ *
+ * Returns TRUE if the BP was remapped.
+ */
+boolean_t
+spa_remap_blkptr(spa_t *spa, blkptr_t *bp, spa_remap_cb_t callback, void *arg)
+{
+ remap_blkptr_cb_arg_t rbca;
+
+ if (!zfs_remap_blkptr_enable)
+ return (B_FALSE);
+
+ if (!spa_feature_is_enabled(spa, SPA_FEATURE_OBSOLETE_COUNTS))
+ return (B_FALSE);
+
+ /*
+ * Dedup BP's can not be remapped, because ddt_phys_select() depends
+ * on DVA[0] being the same in the BP as in the DDT (dedup table).
+ */
+ if (BP_GET_DEDUP(bp))
+ return (B_FALSE);
+
+ /*
+ * Gang blocks can not be remapped, because
+ * zio_checksum_gang_verifier() depends on the DVA[0] that's in
+ * the BP used to read the gang block header (GBH) being the same
+ * as the DVA[0] that we allocated for the GBH.
+ */
+ if (BP_IS_GANG(bp))
+ return (B_FALSE);
+
+ /*
+ * Embedded BP's have no DVA to remap.
+ */
+ if (BP_GET_NDVAS(bp) < 1)
+ return (B_FALSE);
+
+ /*
+ * Note: we only remap dva[0]. If we remapped other dvas, we
+ * would no longer know what their phys birth txg is.
+ */
+ dva_t *dva = &bp->blk_dva[0];
+
+ uint64_t offset = DVA_GET_OFFSET(dva);
+ uint64_t size = DVA_GET_ASIZE(dva);
+ vdev_t *vd = vdev_lookup_top(spa, DVA_GET_VDEV(dva));
+
+ if (vd->vdev_ops->vdev_op_remap == NULL)
+ return (B_FALSE);
+
+ rbca.rbca_bp = bp;
+ rbca.rbca_cb = callback;
+ rbca.rbca_remap_vd = vd;
+ rbca.rbca_remap_offset = offset;
+ rbca.rbca_cb_arg = arg;
+
+ /*
+ * remap_blkptr_cb() will be called in order for each level of
+ * indirection, until a concrete vdev is reached or a split block is
+	 * encountered. rbca_remap_vd and rbca_remap_offset are updated within
+	 * the callback as we go from one indirect vdev to the next one (either
+	 * concrete or indirect again) in that order.
+ */
+ vd->vdev_ops->vdev_op_remap(vd, offset, size, remap_blkptr_cb, &rbca);
+
+ /* Check if the DVA wasn't remapped because it is a split block */
+ if (DVA_GET_VDEV(&rbca.rbca_bp->blk_dva[0]) == vd->vdev_id)
+ return (B_FALSE);
+
+ return (B_TRUE);
+}
+
+/*
+ * Undo the allocation of a DVA which happened in the given transaction group.
+ */
+void
+metaslab_unalloc_dva(spa_t *spa, const dva_t *dva, uint64_t txg)
+{
+ metaslab_t *msp;
+ vdev_t *vd;
+ uint64_t vdev = DVA_GET_VDEV(dva);
+ uint64_t offset = DVA_GET_OFFSET(dva);
+ uint64_t size = DVA_GET_ASIZE(dva);
+
+ ASSERT(DVA_IS_VALID(dva));
+ ASSERT3U(spa_config_held(spa, SCL_ALL, RW_READER), !=, 0);
+
+ if (txg > spa_freeze_txg(spa))
+ return;
+
+ if ((vd = vdev_lookup_top(spa, vdev)) == NULL ||
+ (offset >> vd->vdev_ms_shift) >= vd->vdev_ms_count) {
+ cmn_err(CE_WARN, "metaslab_free_dva(): bad DVA %llu:%llu",
+ (u_longlong_t)vdev, (u_longlong_t)offset);
+ ASSERT(0);
+ return;
+ }
+
+ ASSERT(!vd->vdev_removing);
+ ASSERT(vdev_is_concrete(vd));
+ ASSERT0(vd->vdev_indirect_config.vic_mapping_object);
+ ASSERT3P(vd->vdev_indirect_mapping, ==, NULL);
+
+ if (DVA_GET_GANG(dva))
+ size = vdev_psize_to_asize(vd, SPA_GANGBLOCKSIZE);
+
+ msp = vd->vdev_ms[offset >> vd->vdev_ms_shift];
+
+ mutex_enter(&msp->ms_lock);
+ range_tree_remove(msp->ms_allocating[txg & TXG_MASK],
+ offset, size);
+
+ VERIFY(!msp->ms_condensing);
+ VERIFY3U(offset, >=, msp->ms_start);
+ VERIFY3U(offset + size, <=, msp->ms_start + msp->ms_size);
+ VERIFY3U(range_tree_space(msp->ms_allocatable) + size, <=,
+ msp->ms_size);
+ VERIFY0(P2PHASE(offset, 1ULL << vd->vdev_ashift));
+ VERIFY0(P2PHASE(size, 1ULL << vd->vdev_ashift));
+ range_tree_add(msp->ms_allocatable, offset, size);
+ mutex_exit(&msp->ms_lock);
+}
+
+/*
+ * Free the block represented by the given DVA.
+ */
+void
+metaslab_free_dva(spa_t *spa, const dva_t *dva, boolean_t checkpoint)
+{
+ uint64_t vdev = DVA_GET_VDEV(dva);
+ uint64_t offset = DVA_GET_OFFSET(dva);
+ uint64_t size = DVA_GET_ASIZE(dva);
+ vdev_t *vd = vdev_lookup_top(spa, vdev);
+
+ ASSERT(DVA_IS_VALID(dva));
+ ASSERT3U(spa_config_held(spa, SCL_ALL, RW_READER), !=, 0);
+
+ if (DVA_GET_GANG(dva)) {
+ size = vdev_psize_to_asize(vd, SPA_GANGBLOCKSIZE);
+ }
+
+ metaslab_free_impl(vd, offset, size, checkpoint);
+}
+
+/*
+ * Reserve some allocation slots. The reservation system must be called
+ * before we call into the allocator. If there aren't any available slots
+ * then the I/O will be throttled until an I/O completes and its slots are
+ * freed up. The function returns true if it was successful in placing
+ * the reservation.
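+ *
+ * For example (illustrative numbers): with mc_alloc_max_slots of 10 for
+ * an allocator and 8 slots already reserved, a 3-slot request is refused
+ * and the zio throttled, while a gang allocation or one flagged
+ * METASLAB_MUST_RESERVE may exceed the maximum.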
+ */
+boolean_t
+metaslab_class_throttle_reserve(metaslab_class_t *mc, int slots, int allocator,
+ zio_t *zio, int flags)
+{
+ uint64_t available_slots = 0;
+ boolean_t slot_reserved = B_FALSE;
+ uint64_t max = mc->mc_alloc_max_slots[allocator];
+
+ ASSERT(mc->mc_alloc_throttle_enabled);
+ mutex_enter(&mc->mc_lock);
+
+ uint64_t reserved_slots =
+ zfs_refcount_count(&mc->mc_alloc_slots[allocator]);
+ if (reserved_slots < max)
+ available_slots = max - reserved_slots;
+
+ if (slots <= available_slots || GANG_ALLOCATION(flags) ||
+ flags & METASLAB_MUST_RESERVE) {
+ /*
+ * We reserve the slots individually so that we can unreserve
+ * them individually when an I/O completes.
+ */
+ for (int d = 0; d < slots; d++) {
+ reserved_slots =
+ zfs_refcount_add(&mc->mc_alloc_slots[allocator],
+ zio);
+ }
+ zio->io_flags |= ZIO_FLAG_IO_ALLOCATING;
+ slot_reserved = B_TRUE;
+ }
+
+ mutex_exit(&mc->mc_lock);
+ return (slot_reserved);
+}
+
+void
+metaslab_class_throttle_unreserve(metaslab_class_t *mc, int slots,
+ int allocator, zio_t *zio)
+{
+ ASSERT(mc->mc_alloc_throttle_enabled);
+ mutex_enter(&mc->mc_lock);
+ for (int d = 0; d < slots; d++) {
+ (void) zfs_refcount_remove(&mc->mc_alloc_slots[allocator],
+ zio);
+ }
+ mutex_exit(&mc->mc_lock);
+}
+
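+/*
+ * Claim a specific segment of a concrete vdev's metaslab: activate the
+ * metaslab if necessary, verify the range is still free, and (unless this
+ * is a dry run with txg == 0) move it into this txg's allocating tree.
+ */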
+static int
+metaslab_claim_concrete(vdev_t *vd, uint64_t offset, uint64_t size,
+ uint64_t txg)
+{
+ metaslab_t *msp;
+ spa_t *spa = vd->vdev_spa;
+ int error = 0;
+
+ if (offset >> vd->vdev_ms_shift >= vd->vdev_ms_count)
+ return (ENXIO);
+
+ ASSERT3P(vd->vdev_ms, !=, NULL);
+ msp = vd->vdev_ms[offset >> vd->vdev_ms_shift];
+
+ mutex_enter(&msp->ms_lock);
+
+ if ((txg != 0 && spa_writeable(spa)) || !msp->ms_loaded)
+ error = metaslab_activate(msp, 0, METASLAB_WEIGHT_CLAIM);
+ /*
+ * No need to fail in that case; someone else has activated the
+ * metaslab, but that doesn't preclude us from using it.
+ */
+ if (error == EBUSY)
+ error = 0;
+
+ if (error == 0 &&
+ !range_tree_contains(msp->ms_allocatable, offset, size))
+ error = SET_ERROR(ENOENT);
+
+ if (error || txg == 0) { /* txg == 0 indicates dry run */
+ mutex_exit(&msp->ms_lock);
+ return (error);
+ }
+
+ VERIFY(!msp->ms_condensing);
+ VERIFY0(P2PHASE(offset, 1ULL << vd->vdev_ashift));
+ VERIFY0(P2PHASE(size, 1ULL << vd->vdev_ashift));
+ VERIFY3U(range_tree_space(msp->ms_allocatable) - size, <=,
+ msp->ms_size);
+ range_tree_remove(msp->ms_allocatable, offset, size);
+
+ if (spa_writeable(spa)) { /* don't dirty if we're zdb(1M) */
+ if (range_tree_is_empty(msp->ms_allocating[txg & TXG_MASK]))
+ vdev_dirty(vd, VDD_METASLAB, msp, txg);
+ range_tree_add(msp->ms_allocating[txg & TXG_MASK],
+ offset, size);
+ }
+
+ mutex_exit(&msp->ms_lock);
+
+ return (0);
+}
+
+typedef struct metaslab_claim_cb_arg_t {
+ uint64_t mcca_txg;
+ int mcca_error;
+} metaslab_claim_cb_arg_t;
+
+/* ARGSUSED */
+static void
+metaslab_claim_impl_cb(uint64_t inner_offset, vdev_t *vd, uint64_t offset,
+ uint64_t size, void *arg)
+{
+ metaslab_claim_cb_arg_t *mcca_arg = arg;
+
+ if (mcca_arg->mcca_error == 0) {
+ mcca_arg->mcca_error = metaslab_claim_concrete(vd, offset,
+ size, mcca_arg->mcca_txg);
+ }
+}
+
+int
+metaslab_claim_impl(vdev_t *vd, uint64_t offset, uint64_t size, uint64_t txg)
+{
+ if (vd->vdev_ops->vdev_op_remap != NULL) {
+ metaslab_claim_cb_arg_t arg;
+
+ /*
+ * Only zdb(1M) can claim on indirect vdevs. This is used
+ * to detect leaks of mapped space (that are not accounted
+ * for in the obsolete counts, spacemap, or bpobj).
+ */
+ ASSERT(!spa_writeable(vd->vdev_spa));
+ arg.mcca_error = 0;
+ arg.mcca_txg = txg;
+
+ vd->vdev_ops->vdev_op_remap(vd, offset, size,
+ metaslab_claim_impl_cb, &arg);
+
+ if (arg.mcca_error == 0) {
+ arg.mcca_error = metaslab_claim_concrete(vd,
+ offset, size, txg);
+ }
+ return (arg.mcca_error);
+ } else {
+ return (metaslab_claim_concrete(vd, offset, size, txg));
+ }
+}
+
+/*
+ * Intent log support: upon opening the pool after a crash, notify the SPA
+ * of blocks that the intent log has allocated for immediate write, but
+ * which are still considered free by the SPA because the last transaction
+ * group didn't commit yet.
+ */
+static int
+metaslab_claim_dva(spa_t *spa, const dva_t *dva, uint64_t txg)
+{
+ uint64_t vdev = DVA_GET_VDEV(dva);
+ uint64_t offset = DVA_GET_OFFSET(dva);
+ uint64_t size = DVA_GET_ASIZE(dva);
+ vdev_t *vd;
+
+ if ((vd = vdev_lookup_top(spa, vdev)) == NULL) {
+ return (SET_ERROR(ENXIO));
+ }
+
+ ASSERT(DVA_IS_VALID(dva));
+
+ if (DVA_GET_GANG(dva))
+ size = vdev_psize_to_asize(vd, SPA_GANGBLOCKSIZE);
+
+ return (metaslab_claim_impl(vd, offset, size, txg));
+}
+
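+/*
+ * Allocate all DVAs for a block pointer, unwinding any partial allocations
+ * on failure and updating the allocation throttle's queue depths as each
+ * DVA is placed.
+ */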
+int
+metaslab_alloc(spa_t *spa, metaslab_class_t *mc, uint64_t psize, blkptr_t *bp,
+ int ndvas, uint64_t txg, blkptr_t *hintbp, int flags,
+ zio_alloc_list_t *zal, zio_t *zio, int allocator)
+{
+ dva_t *dva = bp->blk_dva;
+ dva_t *hintdva = (hintbp != NULL) ? hintbp->blk_dva : NULL;
+ int error = 0;
+
+ ASSERT(bp->blk_birth == 0);
+ ASSERT(BP_PHYSICAL_BIRTH(bp) == 0);
+
+ spa_config_enter(spa, SCL_ALLOC, FTAG, RW_READER);
+
+ if (mc->mc_rotor == NULL) { /* no vdevs in this class */
+ spa_config_exit(spa, SCL_ALLOC, FTAG);
+ return (SET_ERROR(ENOSPC));
+ }
+
+ ASSERT(ndvas > 0 && ndvas <= spa_max_replication(spa));
+ ASSERT(BP_GET_NDVAS(bp) == 0);
+ ASSERT(hintbp == NULL || ndvas <= BP_GET_NDVAS(hintbp));
+ ASSERT3P(zal, !=, NULL);
+
+ for (int d = 0; d < ndvas; d++) {
+ error = metaslab_alloc_dva(spa, mc, psize, dva, d, hintdva,
+ txg, flags, zal, allocator);
+ if (error != 0) {
+ for (d--; d >= 0; d--) {
+ metaslab_unalloc_dva(spa, &dva[d], txg);
+ metaslab_group_alloc_decrement(spa,
+ DVA_GET_VDEV(&dva[d]), zio, flags,
+ allocator, B_FALSE);
+ bzero(&dva[d], sizeof (dva_t));
+ }
+ spa_config_exit(spa, SCL_ALLOC, FTAG);
+ return (error);
+ } else {
+ /*
+ * Update the metaslab group's queue depth
+ * based on the newly allocated dva.
+ */
+ metaslab_group_alloc_increment(spa,
+ DVA_GET_VDEV(&dva[d]), zio, flags, allocator);
+ }
+ }
+ ASSERT(error == 0);
+ ASSERT(BP_GET_NDVAS(bp) == ndvas);
+
+ spa_config_exit(spa, SCL_ALLOC, FTAG);
+
+ BP_SET_BIRTH(bp, txg, txg);
+
+ return (0);
+}
+
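+/*
+ * Free every DVA in a block pointer, either undoing this txg's allocation
+ * ('now' is true) or queueing the frees to sync, with blocks that predate
+ * a pool checkpoint directed to the checkpoint's accounting.
+ */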
+void
+metaslab_free(spa_t *spa, const blkptr_t *bp, uint64_t txg, boolean_t now)
+{
+ const dva_t *dva = bp->blk_dva;
+ int ndvas = BP_GET_NDVAS(bp);
+
+ ASSERT(!BP_IS_HOLE(bp));
+ ASSERT(!now || bp->blk_birth >= spa_syncing_txg(spa));
+
+ /*
+ * If we have a checkpoint for the pool we need to make sure that
+ * the blocks that we free that are part of the checkpoint won't be
+ * reused until the checkpoint is discarded or we revert to it.
+ *
+ * The checkpoint flag is passed down the metaslab_free code path
+ * and is set whenever we want to add a block to the checkpoint's
+ * accounting. That is, we "checkpoint" blocks that existed at the
+ * time the checkpoint was created and are therefore referenced by
+ * the checkpointed uberblock.
+ *
+ * Note that, we don't checkpoint any blocks if the current
+ * syncing txg <= spa_checkpoint_txg. We want these frees to sync
+ * normally as they will be referenced by the checkpointed uberblock.
+ */
+ boolean_t checkpoint = B_FALSE;
+ if (bp->blk_birth <= spa->spa_checkpoint_txg &&
+ spa_syncing_txg(spa) > spa->spa_checkpoint_txg) {
+ /*
+ * At this point, if the block is part of the checkpoint
+ * there is no way it was created in the current txg.
+ */
+ ASSERT(!now);
+ ASSERT3U(spa_syncing_txg(spa), ==, txg);
+ checkpoint = B_TRUE;
+ }
+
+ spa_config_enter(spa, SCL_FREE, FTAG, RW_READER);
+
+ for (int d = 0; d < ndvas; d++) {
+ if (now) {
+ metaslab_unalloc_dva(spa, &dva[d], txg);
+ } else {
+ ASSERT3U(txg, ==, spa_syncing_txg(spa));
+ metaslab_free_dva(spa, &dva[d], checkpoint);
+ }
+ }
+
+ spa_config_exit(spa, SCL_FREE, FTAG);
+}
+
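+/*
+ * Claim every DVA in a block pointer, performing a dry run first (when
+ * txg != 0) so that a partial failure never needs to be unwound.
+ */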
+int
+metaslab_claim(spa_t *spa, const blkptr_t *bp, uint64_t txg)
+{
+ const dva_t *dva = bp->blk_dva;
+ int ndvas = BP_GET_NDVAS(bp);
+ int error = 0;
+
+ ASSERT(!BP_IS_HOLE(bp));
+
+ if (txg != 0) {
+ /*
+ * First do a dry run to make sure all DVAs are claimable,
+ * so we don't have to unwind from partial failures below.
+ */
+ if ((error = metaslab_claim(spa, bp, 0)) != 0)
+ return (error);
+ }
+
+ spa_config_enter(spa, SCL_ALLOC, FTAG, RW_READER);
+
+ for (int d = 0; d < ndvas; d++) {
+ error = metaslab_claim_dva(spa, &dva[d], txg);
+ if (error != 0)
+ break;
+ }
+
+ spa_config_exit(spa, SCL_ALLOC, FTAG);
+
+ ASSERT(error == 0 || txg == 0);
+
+ return (error);
+}
+
+/* ARGSUSED */
+static void
+metaslab_check_free_impl_cb(uint64_t inner, vdev_t *vd, uint64_t offset,
+ uint64_t size, void *arg)
+{
+ if (vd->vdev_ops == &vdev_indirect_ops)
+ return;
+
+ metaslab_check_free_impl(vd, offset, size);
+}
+
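+/*
+ * Debug-only check (ZFS_DEBUG_ZIO_FREE) that the range being freed is not
+ * already present in any of the metaslab's free-side range trees, which
+ * would indicate a double free.
+ */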
+static void
+metaslab_check_free_impl(vdev_t *vd, uint64_t offset, uint64_t size)
+{
+ metaslab_t *msp;
+ spa_t *spa = vd->vdev_spa;
+
+ if ((zfs_flags & ZFS_DEBUG_ZIO_FREE) == 0)
+ return;
+
+ if (vd->vdev_ops->vdev_op_remap != NULL) {
+ vd->vdev_ops->vdev_op_remap(vd, offset, size,
+ metaslab_check_free_impl_cb, NULL);
+ return;
+ }
+
+ ASSERT(vdev_is_concrete(vd));
+ ASSERT3U(offset >> vd->vdev_ms_shift, <, vd->vdev_ms_count);
+ ASSERT3U(spa_config_held(spa, SCL_ALL, RW_READER), !=, 0);
+
+ msp = vd->vdev_ms[offset >> vd->vdev_ms_shift];
+
+ mutex_enter(&msp->ms_lock);
+ if (msp->ms_loaded) {
+ range_tree_verify_not_present(msp->ms_allocatable,
+ offset, size);
+ }
+
+ range_tree_verify_not_present(msp->ms_freeing, offset, size);
+ range_tree_verify_not_present(msp->ms_checkpointing, offset, size);
+ range_tree_verify_not_present(msp->ms_freed, offset, size);
+ for (int j = 0; j < TXG_DEFER_SIZE; j++)
+ range_tree_verify_not_present(msp->ms_defer[j], offset, size);
+ mutex_exit(&msp->ms_lock);
+}
+
+void
+metaslab_check_free(spa_t *spa, const blkptr_t *bp)
+{
+ if ((zfs_flags & ZFS_DEBUG_ZIO_FREE) == 0)
+ return;
+
+ spa_config_enter(spa, SCL_VDEV, FTAG, RW_READER);
+ for (int i = 0; i < BP_GET_NDVAS(bp); i++) {
+ uint64_t vdev = DVA_GET_VDEV(&bp->blk_dva[i]);
+ vdev_t *vd = vdev_lookup_top(spa, vdev);
+ uint64_t offset = DVA_GET_OFFSET(&bp->blk_dva[i]);
+ uint64_t size = DVA_GET_ASIZE(&bp->blk_dva[i]);
+
+ if (DVA_GET_GANG(&bp->blk_dva[i]))
+ size = vdev_psize_to_asize(vd, SPA_GANGBLOCKSIZE);
+
+ ASSERT3P(vd, !=, NULL);
+
+ metaslab_check_free_impl(vd, offset, size);
+ }
+ spa_config_exit(spa, SCL_VDEV, FTAG);
+}
diff --git a/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/mmp.c b/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/mmp.c
new file mode 100644
index 000000000000..f22af0b40146
--- /dev/null
+++ b/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/mmp.c
@@ -0,0 +1,750 @@
+/*
+ * CDDL HEADER START
+ *
+ * The contents of this file are subject to the terms of the
+ * Common Development and Distribution License (the "License").
+ * You may not use this file except in compliance with the License.
+ *
+ * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
+ * or http://www.opensolaris.org/os/licensing.
+ * See the License for the specific language governing permissions
+ * and limitations under the License.
+ *
+ * When distributing Covered Code, include this CDDL HEADER in each
+ * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
+ * If applicable, add the following below this CDDL HEADER, with the
+ * fields enclosed by brackets "[]" replaced with your own identifying
+ * information: Portions Copyright [yyyy] [name of copyright owner]
+ *
+ * CDDL HEADER END
+ */
+/*
+ * Copyright (c) 2017 by Lawrence Livermore National Security, LLC.
+ * Copyright 2019 Joyent, Inc.
+ */
+
+#include <sys/abd.h>
+#include <sys/mmp.h>
+#include <sys/spa.h>
+#include <sys/spa_impl.h>
+#include <sys/time.h>
+#include <sys/vdev.h>
+#include <sys/vdev_impl.h>
+#include <sys/zfs_context.h>
+#include <sys/callb.h>
+
+/*
+ * Multi-Modifier Protection (MMP) attempts to prevent a user from importing
+ * or opening a pool on more than one host at a time. In particular, it
+ * prevents "zpool import -f" on a host from succeeding while the pool is
+ * already imported on another host. There are many other ways in which a
+ * device could be used by two hosts for different purposes at the same time
+ * resulting in pool damage. This implementation does not attempt to detect
+ * those cases.
+ *
+ * MMP operates by ensuring there are frequent visible changes on disk (a
+ * "heartbeat") at all times. And by altering the import process to check
+ * for these changes and failing the import when they are detected. This
+ * functionality is enabled by setting the 'multihost' pool property to on.
+ *
+ * Uberblocks written by the txg_sync thread always go into the first
+ * (N-MMP_BLOCKS_PER_LABEL) slots, the remaining slots are reserved for MMP.
+ * They are used to hold uberblocks which are exactly the same as the last
+ * synced uberblock except that the ub_timestamp and mmp_config are frequently
+ * updated. Like all other uberblocks, the slot is written with an embedded
+ * checksum, and slots with invalid checksums are ignored. This provides the
+ * "heartbeat", with no risk of overwriting good uberblocks that must be
+ * preserved, e.g. previous txgs and associated block pointers.
+ *
+ * Three optional fields are added to the uberblock structure: ub_mmp_magic,
+ * ub_mmp_config, and ub_mmp_delay. The ub_mmp_magic value allows zfs to tell
+ * whether the other ub_mmp_* fields are valid. The ub_mmp_config field tells
+ * the importing host the settings of zfs_multihost_interval and
+ * zfs_multihost_fail_intervals on the host which last had (or currently has)
+ * the pool imported. These determine how long a host must wait to detect
+ * activity in the pool before concluding the pool is not in use. The
+ * mmp_delay field is a decaying average of the amount of time between
+ * completion of successive MMP writes, in nanoseconds. It indicates whether
+ * MMP is enabled.
+ *
+ * During import an activity test may now be performed to determine if
+ * the pool is in use. The activity test is typically required if the
+ * ZPOOL_CONFIG_HOSTID does not match the system hostid, the pool state is
+ * POOL_STATE_ACTIVE, and the pool is not a root pool.
+ *
+ * The activity test finds the "best" uberblock (highest txg, timestamp, and, if
+ * ub_mmp_magic is valid, sequence number from ub_mmp_config). It then waits
+ * some time, and finds the "best" uberblock again. If any of the mentioned
+ * fields have different values in the newly read uberblock, the pool is in use
+ * by another host and the import fails. To ensure the accuracy of the
+ * activity test, the default values result in an activity test duration of 20x
+ * the mmp write interval.
+ *
+ * The duration of the "zpool import" activity test depends on the information
+ * available in the "best" uberblock:
+ *
+ * 1) If uberblock was written by zfs-0.8 or newer and fail_intervals > 0:
+ * ub_mmp_config.fail_intervals * ub_mmp_config.multihost_interval * 2
+ *
+ * In this case, a weak guarantee is provided. Since the host which last had
+ * the pool imported will suspend the pool if no mmp writes land within
+ * fail_intervals * multihost_interval ms, the absence of writes during that
+ * time means either the pool is not imported, or it is imported but the pool
+ * is suspended and no further writes will occur.
+ *
+ * Note that resuming the suspended pool on the remote host would invalidate
+ * this guarantee, and so it is not allowed.
+ *
+ * The factor of 2 provides a conservative safety factor and derives from
+ * MMP_IMPORT_SAFETY_FACTOR.
+ *
+ * 2) If uberblock was written by zfs-0.8 or newer and fail_intervals == 0:
+ * (ub_mmp_config.multihost_interval + ub_mmp_delay) *
+ * zfs_multihost_import_intervals
+ *
+ * In this case no guarantee can be provided. However, as long as some devices
+ * are healthy and connected, it is likely that at least one write will land
+ * within (multihost_interval + mmp_delay) because multihost_interval is
+ * enough time for a write to be attempted to each leaf vdev, and mmp_delay
+ * is enough for one to land, based on past delays. Multiplying by
+ * zfs_multihost_import_intervals provides a conservative safety factor.
+ *
+ * 3) If uberblock was written by zfs-0.7:
+ * (zfs_multihost_interval + ub_mmp_delay) * zfs_multihost_import_intervals
+ *
+ * The same logic as case #2 applies, but we do not know remote tunables.
+ *
+ * We use the local value for zfs_multihost_interval because the original MMP
+ * did not record this value in the uberblock.
+ *
+ * ub_mmp_delay >= (zfs_multihost_interval / leaves), so if the other host
+ * has a much larger zfs_multihost_interval set, ub_mmp_delay will reflect
+ * that. We will have waited enough time for zfs_multihost_import_intervals
+ * writes to be issued and all but one to land.
+ *
+ * Single-device pool example delays:
+ *
+ * import_delay = (1 + 1) * 20 = 40s #defaults, no I/O delay
+ * import_delay = (1 + 10) * 20 = 220s #defaults, 10s I/O delay
+ * import_delay = (10 + 10) * 20 = 400s #10s multihost_interval,
+ * no I/O delay
+ *
+ * 100-device pool example delays:
+ *
+ * import_delay = (1 + .01) * 20 = 20s #defaults, no I/O delay
+ * import_delay = (1 + 10) * 20 = 220s #defaults, 10s I/O delay
+ * import_delay = (10 + .1) * 20 = 202s #10s multihost_interval,
+ * no I/O delay
+ *
+ * 4) Otherwise, this uberblock was written by a pre-MMP zfs:
+ * zfs_multihost_import_intervals * zfs_multihost_interval
+ *
+ * In this case local tunables are used. By default this product = 20s, long
+ * enough for a pool with any activity at all to write at least one
+ * uberblock. No guarantee can be provided.
+ *
+ * Additionally, the duration is then extended by a random 25% to attempt to
+ * detect simultaneous imports, e.g. if both partner hosts are rebooted at
+ * the same time and automatically attempt to import the pool.
+ */
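+/*
+ * Summarized as a sketch (shorthand names, not actual identifiers; the
+ * real selection happens in the import code, and the random 25%
+ * extension described above is applied on top):
+ *
+ *	if (ub_mmp_magic is valid && ub.fail_intervals > 0)
+ *		import_delay = ub.fail_intervals * ub.multihost_interval * 2;
+ *	else if (ub_mmp_magic is valid)
+ *		import_delay = (ub.multihost_interval + ub_mmp_delay) *
+ *		    zfs_multihost_import_intervals;
+ *	else if (ub_mmp_delay != 0)		(written by zfs-0.7)
+ *		import_delay = (zfs_multihost_interval + ub_mmp_delay) *
+ *		    zfs_multihost_import_intervals;
+ *	else					(pre-MMP zfs)
+ *		import_delay = zfs_multihost_import_intervals *
+ *		    zfs_multihost_interval;
+ */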
+
+/*
+ * Used to control the frequency of mmp writes which are performed when the
+ * 'multihost' pool property is on. This is one factor used to determine the
+ * length of the activity check during import.
+ *
+ * On average an mmp write will be issued for each leaf vdev every
+ * zfs_multihost_interval milliseconds. In practice, the observed period can
+ * vary with the I/O load; this observed value is the ub_mmp_delay that is
+ * stored in the uberblock. The minimum allowed value is 100 ms.
+ */
+ulong_t zfs_multihost_interval = MMP_DEFAULT_INTERVAL;
+#ifdef __FreeBSD__
+SYSCTL_DECL(_vfs_zfs);
+SYSCTL_ULONG(_vfs_zfs, OID_AUTO, multihost_interval, CTLFLAG_RWTUN,
+ &zfs_multihost_interval, 0, "Interval between MMP writes, milliseconds");
+#endif
+
+/*
+ * Used to control the duration of the activity test on import. Smaller values
+ * of zfs_multihost_import_intervals will reduce the import time but increase
+ * the risk of failing to detect an active pool. The total activity check time
+ * is never allowed to drop below one second. A value of 0 is ignored and
+ * treated as if it was set to 1.
+ */
+uint_t zfs_multihost_import_intervals = MMP_DEFAULT_IMPORT_INTERVALS;
+#ifdef __FreeBSD__
+SYSCTL_UINT(_vfs_zfs, OID_AUTO, multihost_import_intervals, CTLFLAG_RWTUN,
+ &zfs_multihost_import_intervals, 0,
+ "MMP activity check period for pool import, "
+ "in units of multihost_interval");
+#endif
+
+/*
+ * Controls the behavior of the pool when mmp write failures or delays are
+ * detected.
+ *
+ * When zfs_multihost_fail_intervals = 0, mmp write failures or delays are
+ * ignored. The failures will still be reported to the ZED, which, depending
+ * on its configuration, may take action such as suspending the pool or
+ * taking a device offline.
+ *
+ * When zfs_multihost_fail_intervals > 0, the pool will be suspended if
+ * zfs_multihost_fail_intervals * zfs_multihost_interval milliseconds pass
+ * without a successful mmp write. This guarantees the activity test will see
+ * mmp writes if the pool is imported. A value of 1 is ignored and treated as
+ * if it was set to 2, because a single leaf vdev pool will issue a write once
+ * per multihost_interval and thus any variation in latency would cause the
+ * pool to be suspended.
+ */
+uint_t zfs_multihost_fail_intervals = MMP_DEFAULT_FAIL_INTERVALS;
+#ifdef __FreeBSD__
+SYSCTL_UINT(_vfs_zfs, OID_AUTO, multihost_fail_intervals, CTLFLAG_RWTUN,
+ &zfs_multihost_fail_intervals, 0,
+ "How long to tolerate MMP write failures before suspending a pool, "
+ "in units of multihost_interval");
+#endif
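+
+/*
+ * On FreeBSD the tunables above are exposed as vfs.zfs.* sysctls, so an
+ * administrator could, for example, slow the heartbeat and widen the
+ * failure window with (illustrative values, not recommendations):
+ *
+ *	sysctl vfs.zfs.multihost_interval=2000
+ *	sysctl vfs.zfs.multihost_fail_intervals=20
+ */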
+
+char *mmp_tag = "mmp_write_uberblock";
+static void mmp_thread(void *arg);
+
+void
+mmp_init(spa_t *spa)
+{
+ mmp_thread_t *mmp = &spa->spa_mmp;
+
+ mutex_init(&mmp->mmp_thread_lock, NULL, MUTEX_DEFAULT, NULL);
+ cv_init(&mmp->mmp_thread_cv, NULL, CV_DEFAULT, NULL);
+ mutex_init(&mmp->mmp_io_lock, NULL, MUTEX_DEFAULT, NULL);
+ mmp->mmp_kstat_id = 1;
+
+ /*
+ * mmp_write_done() calculates mmp_delay based on prior mmp_delay and
+ * the elapsed time since the last write. For the first mmp write,
+ * there is no "last write", so we start with fake non-zero values.
+ */
+ mmp->mmp_last_write = gethrtime();
+ mmp->mmp_delay = MSEC2NSEC(MMP_INTERVAL_OK(zfs_multihost_interval));
+}
+
+void
+mmp_fini(spa_t *spa)
+{
+ mmp_thread_t *mmp = &spa->spa_mmp;
+
+ mutex_destroy(&mmp->mmp_thread_lock);
+ cv_destroy(&mmp->mmp_thread_cv);
+ mutex_destroy(&mmp->mmp_io_lock);
+}
+
+static void
+mmp_thread_enter(mmp_thread_t *mmp, callb_cpr_t *cpr)
+{
+ CALLB_CPR_INIT(cpr, &mmp->mmp_thread_lock, callb_generic_cpr, FTAG);
+ mutex_enter(&mmp->mmp_thread_lock);
+}
+
+static void
+mmp_thread_exit(mmp_thread_t *mmp, kthread_t **mpp, callb_cpr_t *cpr)
+{
+ ASSERT(*mpp != NULL);
+ *mpp = NULL;
+ cv_broadcast(&mmp->mmp_thread_cv);
+ CALLB_CPR_EXIT(cpr); /* drops &mmp->mmp_thread_lock */
+ thread_exit();
+}
+
+void
+mmp_thread_start(spa_t *spa)
+{
+ mmp_thread_t *mmp = &spa->spa_mmp;
+
+ if (spa_writeable(spa)) {
+ mutex_enter(&mmp->mmp_thread_lock);
+ if (!mmp->mmp_thread) {
+ mmp->mmp_thread = thread_create(NULL, 0, mmp_thread,
+ spa, 0, &p0, TS_RUN, minclsyspri);
+ zfs_dbgmsg("MMP thread started pool '%s' "
+ "gethrtime %llu", spa_name(spa), gethrtime());
+ }
+ mutex_exit(&mmp->mmp_thread_lock);
+ }
+}
+
+void
+mmp_thread_stop(spa_t *spa)
+{
+ mmp_thread_t *mmp = &spa->spa_mmp;
+
+ mutex_enter(&mmp->mmp_thread_lock);
+ mmp->mmp_thread_exiting = 1;
+ cv_broadcast(&mmp->mmp_thread_cv);
+
+ while (mmp->mmp_thread) {
+ cv_wait(&mmp->mmp_thread_cv, &mmp->mmp_thread_lock);
+ }
+ mutex_exit(&mmp->mmp_thread_lock);
+ zfs_dbgmsg("MMP thread stopped pool '%s' gethrtime %llu",
+ spa_name(spa), gethrtime());
+
+ ASSERT(mmp->mmp_thread == NULL);
+ mmp->mmp_thread_exiting = 0;
+}
+
+typedef enum mmp_vdev_state_flag {
+ MMP_FAIL_NOT_WRITABLE = (1 << 0),
+ MMP_FAIL_WRITE_PENDING = (1 << 1),
+} mmp_vdev_state_flag_t;
+
+/*
+ * Find a leaf vdev to write an MMP block to. It must not have an outstanding
+ * mmp write (if so, a new write will also likely block). If there is no usable
+ * leaf, a nonzero error value is returned. The error value returned is a bit
+ * field.
+ *
+ * MMP_FAIL_WRITE_PENDING One or more leaf vdevs are writeable, but have an
+ * outstanding MMP write.
+ * MMP_FAIL_NOT_WRITABLE One or more leaf vdevs are not writeable.
+ */
+
+static int
+mmp_next_leaf(spa_t *spa)
+{
+ vdev_t *leaf;
+ vdev_t *starting_leaf;
+ int fail_mask = 0;
+
+ ASSERT(MUTEX_HELD(&spa->spa_mmp.mmp_io_lock));
+ ASSERT(spa_config_held(spa, SCL_STATE, RW_READER));
+ ASSERT(list_link_active(&spa->spa_leaf_list.list_head) == B_TRUE);
+ ASSERT(!list_is_empty(&spa->spa_leaf_list));
+
+ if (spa->spa_mmp.mmp_leaf_last_gen != spa->spa_leaf_list_gen) {
+ spa->spa_mmp.mmp_last_leaf = list_head(&spa->spa_leaf_list);
+ spa->spa_mmp.mmp_leaf_last_gen = spa->spa_leaf_list_gen;
+ }
+
+ leaf = spa->spa_mmp.mmp_last_leaf;
+ if (leaf == NULL)
+ leaf = list_head(&spa->spa_leaf_list);
+ starting_leaf = leaf;
+
+ do {
+ leaf = list_next(&spa->spa_leaf_list, leaf);
+ if (leaf == NULL)
+ leaf = list_head(&spa->spa_leaf_list);
+
+ if (!vdev_writeable(leaf)) {
+ fail_mask |= MMP_FAIL_NOT_WRITABLE;
+ } else if (leaf->vdev_mmp_pending != 0) {
+ fail_mask |= MMP_FAIL_WRITE_PENDING;
+ } else {
+ spa->spa_mmp.mmp_last_leaf = leaf;
+ return (0);
+ }
+ } while (leaf != starting_leaf);
+
+ ASSERT(fail_mask);
+
+ return (fail_mask);
+}
+
+/*
+ * MMP writes are issued on a fixed schedule, but may complete at variable,
+ * much longer, intervals. The mmp_delay captures long periods between
+ * successful writes for any reason, including disk latency, scheduling delays,
+ * etc.
+ *
+ * The mmp_delay is usually calculated as a decaying average, but if the latest
+ * delay is higher we do not average it, so that we do not hide sudden spikes
+ * which the importing host must wait for.
+ *
+ * If writes are occurring frequently, such as due to a high rate of txg syncs,
+ * the mmp_delay could become very small. Since those short delays depend on
+ * activity we cannot count on, we never allow mmp_delay to drop below the
+ * rate expected if only mmp_thread writes occur.
+ *
+ * If an mmp write was skipped or fails, and we have already waited longer than
+ * mmp_delay, we need to update it so the next write reflects the longer delay.
+ *
+ * Do not set mmp_delay if the multihost property is not on, so as not to
+ * trigger an activity check on import.
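+ *
+ * As a worked illustration of the decay used below (values in ms for
+ * readability): with a current mmp_delay average of 100 and a new
+ * sample of 50, the new average is MAX((50 + 100 * 127) / 128,
+ * min_delay) ~= 99.6, so one fast write barely moves the average,
+ * while one slow sample (say 500) replaces the average outright via
+ * the "delay > mmp_delay" check.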
+ */
+static void
+mmp_delay_update(spa_t *spa, boolean_t write_completed)
+{
+ mmp_thread_t *mts = &spa->spa_mmp;
+ hrtime_t delay = gethrtime() - mts->mmp_last_write;
+
+ ASSERT(MUTEX_HELD(&mts->mmp_io_lock));
+
+ if (spa_multihost(spa) == B_FALSE) {
+ mts->mmp_delay = 0;
+ return;
+ }
+
+ if (delay > mts->mmp_delay)
+ mts->mmp_delay = delay;
+
+ if (write_completed == B_FALSE)
+ return;
+
+ mts->mmp_last_write = gethrtime();
+
+ /*
+ * strictly less than, in case delay was changed above.
+ */
+ if (delay < mts->mmp_delay) {
+ hrtime_t min_delay =
+ MSEC2NSEC(MMP_INTERVAL_OK(zfs_multihost_interval)) /
+ MAX(1, vdev_count_leaves(spa));
+ mts->mmp_delay = MAX(((delay + mts->mmp_delay * 127) / 128),
+ min_delay);
+ }
+}
+
+static void
+mmp_write_done(zio_t *zio)
+{
+ spa_t *spa = zio->io_spa;
+ vdev_t *vd = zio->io_vd;
+ mmp_thread_t *mts = zio->io_private;
+
+ mutex_enter(&mts->mmp_io_lock);
+ uint64_t mmp_kstat_id = vd->vdev_mmp_kstat_id;
+ hrtime_t mmp_write_duration = gethrtime() - vd->vdev_mmp_pending;
+
+ mmp_delay_update(spa, (zio->io_error == 0));
+
+ vd->vdev_mmp_pending = 0;
+ vd->vdev_mmp_kstat_id = 0;
+
+ mutex_exit(&mts->mmp_io_lock);
+ spa_config_exit(spa, SCL_STATE, mmp_tag);
+
+ abd_free(zio->io_abd);
+}
+
+/*
+ * When the uberblock on-disk is updated by a spa_sync,
+ * creating a new "best" uberblock, update the one stored
+ * in the mmp thread state, used for mmp writes.
+ */
+void
+mmp_update_uberblock(spa_t *spa, uberblock_t *ub)
+{
+ mmp_thread_t *mmp = &spa->spa_mmp;
+
+ mutex_enter(&mmp->mmp_io_lock);
+ mmp->mmp_ub = *ub;
+ mmp->mmp_seq = 1;
+ mmp->mmp_ub.ub_timestamp = gethrestime_sec();
+ mmp_delay_update(spa, B_TRUE);
+ mutex_exit(&mmp->mmp_io_lock);
+}
+
+/*
+ * Choose a leaf vdev (round-robin, via mmp_next_leaf()), then a random
+ * label and MMP block within it, and write over that block with a copy
+ * of the last-synced uberblock, whose timestamp has been updated to
+ * reflect that the pool is in use.
+ */
+static void
+mmp_write_uberblock(spa_t *spa)
+{
+ int flags = ZIO_FLAG_CONFIG_WRITER | ZIO_FLAG_CANFAIL;
+ mmp_thread_t *mmp = &spa->spa_mmp;
+ uberblock_t *ub;
+ vdev_t *vd = NULL;
+ int label, error;
+ uint64_t offset;
+
+ hrtime_t lock_acquire_time = gethrtime();
+ spa_config_enter(spa, SCL_STATE, mmp_tag, RW_READER);
+ lock_acquire_time = gethrtime() - lock_acquire_time;
+ if (lock_acquire_time > (MSEC2NSEC(MMP_MIN_INTERVAL) / 10))
+ zfs_dbgmsg("MMP SCL_STATE acquisition pool '%s' took %llu ns "
+ "gethrtime %llu", spa_name(spa), lock_acquire_time,
+ gethrtime());
+
+ mutex_enter(&mmp->mmp_io_lock);
+
+ error = mmp_next_leaf(spa);
+
+ /*
+ * spa_mmp_history has two types of entries:
+ * Issued MMP write: records time issued, error status, etc.
+ * Skipped MMP write: an MMP write could not be issued because no
+ * suitable leaf vdev was available. See comment above struct
+ * spa_mmp_history for details.
+ */
+
+ if (error) {
+ mmp_delay_update(spa, B_FALSE);
+ if (mmp->mmp_skip_error == error) {
+ /*
+ * ZoL porting note: the following is TBD
+ * spa_mmp_history_set_skip(spa, mmp->mmp_kstat_id - 1);
+ */
+ } else {
+ mmp->mmp_skip_error = error;
+ /*
+ * ZoL porting note: the following is TBD
+ * spa_mmp_history_add(spa, mmp->mmp_ub.ub_txg,
+ * gethrestime_sec(), mmp->mmp_delay, NULL, 0,
+ * mmp->mmp_kstat_id++, error);
+ */
+ zfs_dbgmsg("MMP error choosing leaf pool '%s' "
+ "gethrtime %llu fail_mask %#x", spa_name(spa),
+ gethrtime(), error);
+ }
+ mutex_exit(&mmp->mmp_io_lock);
+ spa_config_exit(spa, SCL_STATE, mmp_tag);
+ return;
+ }
+
+ vd = spa->spa_mmp.mmp_last_leaf;
+ if (mmp->mmp_skip_error != 0) {
+ mmp->mmp_skip_error = 0;
+		zfs_dbgmsg("MMP write after skipping due to unavailable "
+		    "leaves, pool '%s' gethrtime %llu leaf %llu",
+		    spa_name(spa), gethrtime(), vd->vdev_guid);
+ }
+
+ if (mmp->mmp_zio_root == NULL)
+ mmp->mmp_zio_root = zio_root(spa, NULL, NULL,
+ flags | ZIO_FLAG_GODFATHER);
+
+ if (mmp->mmp_ub.ub_timestamp != gethrestime_sec()) {
+ /*
+		 * We want to reset mmp_seq when the timestamp advances
+		 * because, after an mmp_seq wrap, new values would not be
+		 * chosen by uberblock_compare() as the "best".
+ */
+ mmp->mmp_ub.ub_timestamp = gethrestime_sec();
+ mmp->mmp_seq = 1;
+ }
+
+ ub = &mmp->mmp_ub;
+ ub->ub_mmp_magic = MMP_MAGIC;
+ ub->ub_mmp_delay = mmp->mmp_delay;
+ ub->ub_mmp_config = MMP_SEQ_SET(mmp->mmp_seq) |
+ MMP_INTERVAL_SET(MMP_INTERVAL_OK(zfs_multihost_interval)) |
+ MMP_FAIL_INT_SET(MMP_FAIL_INTVS_OK(
+ zfs_multihost_fail_intervals));
+ vd->vdev_mmp_pending = gethrtime();
+ vd->vdev_mmp_kstat_id = mmp->mmp_kstat_id;
+
+ zio_t *zio = zio_null(mmp->mmp_zio_root, spa, NULL, NULL, NULL, flags);
+ abd_t *ub_abd = abd_alloc_for_io(VDEV_UBERBLOCK_SIZE(vd), B_TRUE);
+ abd_zero(ub_abd, VDEV_UBERBLOCK_SIZE(vd));
+ abd_copy_from_buf(ub_abd, ub, sizeof (uberblock_t));
+
+ mmp->mmp_seq++;
+ mmp->mmp_kstat_id++;
+ mutex_exit(&mmp->mmp_io_lock);
+
+ offset = VDEV_UBERBLOCK_OFFSET(vd, VDEV_UBERBLOCK_COUNT(vd) -
+ MMP_BLOCKS_PER_LABEL + spa_get_random(MMP_BLOCKS_PER_LABEL));
+
+ label = spa_get_random(VDEV_LABELS);
+ vdev_label_write(zio, vd, label, ub_abd, offset,
+ VDEV_UBERBLOCK_SIZE(vd), mmp_write_done, mmp,
+ flags | ZIO_FLAG_DONT_PROPAGATE);
+
+ /*
+ * ZoL porting note: the following is TBD
+ * (void) spa_mmp_history_add(spa, ub->ub_txg, ub->ub_timestamp,
+ * ub->ub_mmp_delay, vd, label, vd->vdev_mmp_kstat_id, 0);
+ */
+
+ zio_nowait(zio);
+}
+
+static void
+mmp_thread(void *arg)
+{
+ spa_t *spa = (spa_t *)arg;
+ mmp_thread_t *mmp = &spa->spa_mmp;
+ boolean_t suspended = spa_suspended(spa);
+ boolean_t multihost = spa_multihost(spa);
+ uint64_t mmp_interval = MSEC2NSEC(MMP_INTERVAL_OK(
+ zfs_multihost_interval));
+ uint32_t mmp_fail_intervals = MMP_FAIL_INTVS_OK(
+ zfs_multihost_fail_intervals);
+ hrtime_t mmp_fail_ns = mmp_fail_intervals * mmp_interval;
+ boolean_t last_spa_suspended = suspended;
+ boolean_t last_spa_multihost = multihost;
+ uint64_t last_mmp_interval = mmp_interval;
+ uint32_t last_mmp_fail_intervals = mmp_fail_intervals;
+ hrtime_t last_mmp_fail_ns = mmp_fail_ns;
+ callb_cpr_t cpr;
+ int skip_wait = 0;
+
+ mmp_thread_enter(mmp, &cpr);
+
+ while (!mmp->mmp_thread_exiting) {
+ hrtime_t next_time = gethrtime() +
+ MSEC2NSEC(MMP_DEFAULT_INTERVAL);
+ int leaves = MAX(vdev_count_leaves(spa), 1);
+
+ /* Detect changes in tunables or state */
+
+ last_spa_suspended = suspended;
+ last_spa_multihost = multihost;
+ suspended = spa_suspended(spa);
+ multihost = spa_multihost(spa);
+
+ last_mmp_interval = mmp_interval;
+ last_mmp_fail_intervals = mmp_fail_intervals;
+ last_mmp_fail_ns = mmp_fail_ns;
+ mmp_interval = MSEC2NSEC(MMP_INTERVAL_OK(
+ zfs_multihost_interval));
+ mmp_fail_intervals = MMP_FAIL_INTVS_OK(
+ zfs_multihost_fail_intervals);
+
+ /* Smooth so pool is not suspended when reducing tunables */
+ if (mmp_fail_intervals * mmp_interval < mmp_fail_ns) {
+ mmp_fail_ns = (mmp_fail_ns * 31 +
+ mmp_fail_intervals * mmp_interval) / 32;
+ } else {
+ mmp_fail_ns = mmp_fail_intervals *
+ mmp_interval;
+ }
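+		/*
+		 * Worked illustration of the smoothing above: dropping the
+		 * failure window from 20s to 2s moves mmp_fail_ns only 1/32
+		 * of the way per iteration, e.g.
+		 * (20s * 31 + 2s) / 32 = 19.4s after the first pass, so a
+		 * gap between successful writes that was acceptable under
+		 * the old window does not immediately suspend the pool.
+		 */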
+
+ if (mmp_interval != last_mmp_interval ||
+ mmp_fail_intervals != last_mmp_fail_intervals) {
+ /*
+ * We want other hosts to see new tunables as quickly as
+ * possible. Write out at higher frequency than usual.
+ */
+ skip_wait += leaves;
+ }
+
+ if (multihost)
+ next_time = gethrtime() + mmp_interval / leaves;
+
+ if (mmp_fail_ns != last_mmp_fail_ns) {
+ zfs_dbgmsg("MMP interval change pool '%s' "
+ "gethrtime %llu last_mmp_interval %llu "
+ "mmp_interval %llu last_mmp_fail_intervals %u "
+ "mmp_fail_intervals %u mmp_fail_ns %llu "
+ "skip_wait %d leaves %d next_time %llu",
+ spa_name(spa), gethrtime(), last_mmp_interval,
+ mmp_interval, last_mmp_fail_intervals,
+ mmp_fail_intervals, mmp_fail_ns, skip_wait, leaves,
+ next_time);
+ }
+
+ /*
+ * MMP off => on, or suspended => !suspended:
+ * No writes occurred recently. Update mmp_last_write to give
+ * us some time to try.
+ */
+ if ((!last_spa_multihost && multihost) ||
+ (last_spa_suspended && !suspended)) {
+ zfs_dbgmsg("MMP state change pool '%s': gethrtime %llu "
+ "last_spa_multihost %u multihost %u "
+ "last_spa_suspended %u suspended %u",
+			    spa_name(spa), gethrtime(), last_spa_multihost,
+			    multihost, last_spa_suspended, suspended);
+ mutex_enter(&mmp->mmp_io_lock);
+ mmp->mmp_last_write = gethrtime();
+ mmp->mmp_delay = mmp_interval;
+ mutex_exit(&mmp->mmp_io_lock);
+ }
+
+ /*
+ * MMP on => off:
+ * mmp_delay == 0 tells importing node to skip activity check.
+ */
+ if (last_spa_multihost && !multihost) {
+ mutex_enter(&mmp->mmp_io_lock);
+ mmp->mmp_delay = 0;
+ mutex_exit(&mmp->mmp_io_lock);
+ }
+
+ /*
+ * Suspend the pool if no MMP write has succeeded in over
+ * mmp_interval * mmp_fail_intervals nanoseconds.
+ */
+ if (multihost && !suspended && mmp_fail_intervals &&
+ (gethrtime() - mmp->mmp_last_write) > mmp_fail_ns) {
+ zfs_dbgmsg("MMP suspending pool '%s': gethrtime %llu "
+ "mmp_last_write %llu mmp_interval %llu "
+ "mmp_fail_intervals %llu mmp_fail_ns %llu",
+ spa_name(spa), (u_longlong_t)gethrtime(),
+ (u_longlong_t)mmp->mmp_last_write,
+ (u_longlong_t)mmp_interval,
+ (u_longlong_t)mmp_fail_intervals,
+ (u_longlong_t)mmp_fail_ns);
+ cmn_err(CE_WARN, "MMP writes to pool '%s' have not "
+ "succeeded in over %llu ms; suspending pool. "
+ "Hrtime %llu",
+ spa_name(spa),
+ NSEC2MSEC(gethrtime() - mmp->mmp_last_write),
+ gethrtime());
+ zio_suspend(spa, NULL, ZIO_SUSPEND_MMP);
+ }
+
+ if (multihost && !suspended)
+ mmp_write_uberblock(spa);
+
+ if (skip_wait > 0) {
+ next_time = gethrtime() + MSEC2NSEC(MMP_MIN_INTERVAL) /
+ leaves;
+ skip_wait--;
+ }
+
+ CALLB_CPR_SAFE_BEGIN(&cpr);
+#if defined(illumos)
+ (void) cv_timedwait_sig_hrtime(&mmp->mmp_thread_cv,
+ &mmp->mmp_thread_lock, next_time);
+#elif defined(_KERNEL)
+ (void) cv_timedwait_sig_sbt(&mmp->mmp_thread_cv,
+ &mmp->mmp_thread_lock, nstosbt(next_time),
+ 100 * SBT_1US, C_ABSOLUTE);
+#else
+ (void) cv_timedwait_sig_hires(&mmp->mmp_thread_cv,
+ &mmp->mmp_thread_lock, next_time, USEC2NSEC(100),
+ CALLOUT_FLAG_ABSOLUTE);
+#endif
+ CALLB_CPR_SAFE_END(&cpr, &mmp->mmp_thread_lock);
+ }
+
+ /* Outstanding writes are allowed to complete. */
+ if (mmp->mmp_zio_root)
+ zio_wait(mmp->mmp_zio_root);
+
+ mmp->mmp_zio_root = NULL;
+ mmp_thread_exit(mmp, &mmp->mmp_thread, &cpr);
+}
+
+/*
+ * Signal the MMP thread to wake it when it is sleeping on its cv, e.g.
+ * when some module parameter has changed and we want the thread to know
+ * about it. Only signal if the pool is active and the mmp thread is
+ * running; otherwise there is no thread to wake.
+ */
+static void
+mmp_signal_thread(spa_t *spa)
+{
+ mmp_thread_t *mmp = &spa->spa_mmp;
+
+ mutex_enter(&mmp->mmp_thread_lock);
+ if (mmp->mmp_thread)
+ cv_broadcast(&mmp->mmp_thread_cv);
+ mutex_exit(&mmp->mmp_thread_lock);
+}
+
+void
+mmp_signal_all_threads(void)
+{
+ spa_t *spa = NULL;
+
+ mutex_enter(&spa_namespace_lock);
+ while ((spa = spa_next(spa))) {
+ if (spa->spa_state == POOL_STATE_ACTIVE)
+ mmp_signal_thread(spa);
+ }
+ mutex_exit(&spa_namespace_lock);
+}
diff --git a/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/multilist.c b/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/multilist.c
new file mode 100644
index 000000000000..f517454d3d6d
--- /dev/null
+++ b/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/multilist.c
@@ -0,0 +1,423 @@
+/*
+ * CDDL HEADER START
+ *
+ * This file and its contents are supplied under the terms of the
+ * Common Development and Distribution License ("CDDL"), version 1.0.
+ * You may only use this file in accordance with the terms of version
+ * 1.0 of the CDDL.
+ *
+ * A full copy of the text of the CDDL should have accompanied this
+ * source. A copy of the CDDL is also available via the Internet at
+ * http://www.illumos.org/license/CDDL.
+ *
+ * CDDL HEADER END
+ */
+/*
+ * Copyright (c) 2013, 2017 by Delphix. All rights reserved.
+ */
+
+#include <sys/zfs_context.h>
+#include <sys/multilist.h>
+
+/* needed for spa_get_random() */
+#include <sys/spa.h>
+
+/*
+ * This overrides the number of sublists in each multilist_t, which defaults
+ * to the number of CPUs in the system (see multilist_create()).
+ */
+int zfs_multilist_num_sublists = 0;
+
+/*
+ * Given an object stored on the list, return a pointer to the
+ * multilist_node_t structure embedded within it.
+ */
+static multilist_node_t *
+multilist_d2l(multilist_t *ml, void *obj)
+{
+ return ((multilist_node_t *)((char *)obj + ml->ml_offset));
+}
+
+/*
+ * Initialize a new multilist using the parameters specified.
+ *
+ * - 'size' denotes the size of the structure containing the
+ * multilist_node_t.
+ * - 'offset' denotes the byte offset of the multilist_node_t within
+ * the structure that contains it.
+ * - 'num' specifies the number of internal sublists to create.
+ * - 'index_func' is used to determine which sublist to insert into
+ *   when the multilist_insert() function is called, as well as which
+ *   sublist to remove from when multilist_remove() is called. The
+ *   requirements this function must meet are the following:
+ *
+ * - It must always return the same value when called on the same
+ * object (to ensure the object is removed from the list it was
+ * inserted into).
+ *
+ * - It must return a value in the range [0, number of sublists).
+ * The multilist_get_num_sublists() function may be used to
+ * determine the number of sublists in the multilist.
+ *
+ * Also, in order to reduce internal contention between the sublists
+ * during insertion and removal, this function should choose evenly
+ * between all available sublists when inserting. This isn't a hard
+ * requirement, but a general rule of thumb in order to garner the
+ * best multi-threaded performance out of the data structure.
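+ *
+ * For example, a hypothetical pointer-hash index function meeting the
+ * requirements above could be:
+ *
+ *	static unsigned int
+ *	my_index_func(multilist_t *ml, void *obj)
+ *	{
+ *		return (((uintptr_t)obj >> 8) %
+ *		    multilist_get_num_sublists(ml));
+ *	}
+ *
+ * It is stable for a given object and always in range, and the shift
+ * discards low bits that are identical due to allocator alignment.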
+ */
+static multilist_t *
+multilist_create_impl(size_t size, size_t offset,
+ unsigned int num, multilist_sublist_index_func_t *index_func)
+{
+ ASSERT3U(size, >, 0);
+ ASSERT3U(size, >=, offset + sizeof (multilist_node_t));
+ ASSERT3U(num, >, 0);
+ ASSERT3P(index_func, !=, NULL);
+
+ multilist_t *ml = kmem_alloc(sizeof (*ml), KM_SLEEP);
+ ml->ml_offset = offset;
+ ml->ml_num_sublists = num;
+ ml->ml_index_func = index_func;
+
+ ml->ml_sublists = kmem_zalloc(sizeof (multilist_sublist_t) *
+ ml->ml_num_sublists, KM_SLEEP);
+
+ ASSERT3P(ml->ml_sublists, !=, NULL);
+
+ for (int i = 0; i < ml->ml_num_sublists; i++) {
+ multilist_sublist_t *mls = &ml->ml_sublists[i];
+ mutex_init(&mls->mls_lock, NULL, MUTEX_DEFAULT, NULL);
+ list_create(&mls->mls_list, size, offset);
+ }
+ return (ml);
+}
+
+/*
+ * Allocate a new multilist, using the default number of sublists:
+ * the number of CPUs (but at least 4), unless overridden by the
+ * zfs_multilist_num_sublists tunable.
+ */
+multilist_t *
+multilist_create(size_t size, size_t offset,
+ multilist_sublist_index_func_t *index_func)
+{
+ int num_sublists;
+
+ if (zfs_multilist_num_sublists > 0) {
+ num_sublists = zfs_multilist_num_sublists;
+ } else {
+ num_sublists = MAX(max_ncpus, 4);
+ }
+
+ return (multilist_create_impl(size, offset, num_sublists, index_func));
+}
+
+/*
+ * Destroy the given multilist object, and free up any memory it holds.
+ */
+void
+multilist_destroy(multilist_t *ml)
+{
+ ASSERT(multilist_is_empty(ml));
+
+ for (int i = 0; i < ml->ml_num_sublists; i++) {
+ multilist_sublist_t *mls = &ml->ml_sublists[i];
+
+ ASSERT(list_is_empty(&mls->mls_list));
+
+ list_destroy(&mls->mls_list);
+ mutex_destroy(&mls->mls_lock);
+ }
+
+ ASSERT3P(ml->ml_sublists, !=, NULL);
+ kmem_free(ml->ml_sublists,
+ sizeof (multilist_sublist_t) * ml->ml_num_sublists);
+
+ ml->ml_num_sublists = 0;
+ ml->ml_offset = 0;
+ kmem_free(ml, sizeof (multilist_t));
+}
+
+/*
+ * Insert the given object into the multilist.
+ *
+ * This function will insert the object specified into the sublist
+ * determined using the function given at multilist creation time.
+ *
+ * The sublist locks are automatically acquired if not already held, to
+ * ensure consistency when inserting and removing from multiple threads.
+ */
+void
+multilist_insert(multilist_t *ml, void *obj)
+{
+ unsigned int sublist_idx = ml->ml_index_func(ml, obj);
+ multilist_sublist_t *mls;
+ boolean_t need_lock;
+
+ DTRACE_PROBE3(multilist__insert, multilist_t *, ml,
+ unsigned int, sublist_idx, void *, obj);
+
+ ASSERT3U(sublist_idx, <, ml->ml_num_sublists);
+
+ mls = &ml->ml_sublists[sublist_idx];
+
+ /*
+ * Note: Callers may already hold the sublist lock by calling
+ * multilist_sublist_lock(). Here we rely on MUTEX_HELD()
+ * returning TRUE if and only if the current thread holds the
+ * lock. While it's a little ugly to make the lock recursive in
+ * this way, it works and allows the calling code to be much
+ * simpler -- otherwise it would have to pass around a flag
+ * indicating that it already has the lock.
+ */
+ need_lock = !MUTEX_HELD(&mls->mls_lock);
+
+ if (need_lock)
+ mutex_enter(&mls->mls_lock);
+
+ ASSERT(!multilist_link_active(multilist_d2l(ml, obj)));
+
+ multilist_sublist_insert_head(mls, obj);
+
+ if (need_lock)
+ mutex_exit(&mls->mls_lock);
+}
+
+/*
+ * Remove the given object from the multilist.
+ *
+ * This function will remove the object specified from the sublist
+ * determined using the function given at multilist creation time.
+ *
+ * The necessary sublist locks are automatically acquired, to ensure
+ * consistency when inserting and removing from multiple threads.
+ */
+void
+multilist_remove(multilist_t *ml, void *obj)
+{
+ unsigned int sublist_idx = ml->ml_index_func(ml, obj);
+ multilist_sublist_t *mls;
+ boolean_t need_lock;
+
+ DTRACE_PROBE3(multilist__remove, multilist_t *, ml,
+ unsigned int, sublist_idx, void *, obj);
+
+ ASSERT3U(sublist_idx, <, ml->ml_num_sublists);
+
+ mls = &ml->ml_sublists[sublist_idx];
+ /* See comment in multilist_insert(). */
+ need_lock = !MUTEX_HELD(&mls->mls_lock);
+
+ if (need_lock)
+ mutex_enter(&mls->mls_lock);
+
+ ASSERT(multilist_link_active(multilist_d2l(ml, obj)));
+
+ multilist_sublist_remove(mls, obj);
+
+ if (need_lock)
+ mutex_exit(&mls->mls_lock);
+}
+
+/*
+ * Check to see if this multilist object is empty.
+ *
+ * This will return TRUE if it finds all of the sublists of this
+ * multilist to be empty, and FALSE otherwise. Each sublist lock will be
+ * automatically acquired as necessary.
+ *
+ * If concurrent insertions and removals are occurring, the semantics
+ * of this function become a little fuzzy. Instead of locking all
+ * sublists for the entire call time of the function, each sublist is
+ * only locked as it is individually checked for emptiness. Thus, it's
+ * possible for this function to return TRUE with non-empty sublists at
+ * the time the function returns. This would be due to another thread
+ * inserting into a given sublist, after that specific sublist was checked
+ * and deemed empty, but before all sublists have been checked.
+ */
+int
+multilist_is_empty(multilist_t *ml)
+{
+ for (int i = 0; i < ml->ml_num_sublists; i++) {
+ multilist_sublist_t *mls = &ml->ml_sublists[i];
+ /* See comment in multilist_insert(). */
+ boolean_t need_lock = !MUTEX_HELD(&mls->mls_lock);
+
+ if (need_lock)
+ mutex_enter(&mls->mls_lock);
+
+ if (!list_is_empty(&mls->mls_list)) {
+ if (need_lock)
+ mutex_exit(&mls->mls_lock);
+
+ return (FALSE);
+ }
+
+ if (need_lock)
+ mutex_exit(&mls->mls_lock);
+ }
+
+ return (TRUE);
+}
+
+/* Return the number of sublists composing this multilist */
+unsigned int
+multilist_get_num_sublists(multilist_t *ml)
+{
+ return (ml->ml_num_sublists);
+}
+
+/* Return a randomly selected, valid sublist index for this multilist */
+unsigned int
+multilist_get_random_index(multilist_t *ml)
+{
+ return (spa_get_random(ml->ml_num_sublists));
+}
+
+/* Lock and return the sublist specified at the given index */
+multilist_sublist_t *
+multilist_sublist_lock(multilist_t *ml, unsigned int sublist_idx)
+{
+ multilist_sublist_t *mls;
+
+ ASSERT3U(sublist_idx, <, ml->ml_num_sublists);
+ mls = &ml->ml_sublists[sublist_idx];
+ mutex_enter(&mls->mls_lock);
+
+ return (mls);
+}
+
+/* Lock and return the sublist that would be used to store the specified obj */
+multilist_sublist_t *
+multilist_sublist_lock_obj(multilist_t *ml, void *obj)
+{
+ return (multilist_sublist_lock(ml, ml->ml_index_func(ml, obj)));
+}
+
+void
+multilist_sublist_unlock(multilist_sublist_t *mls)
+{
+ mutex_exit(&mls->mls_lock);
+}
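+
+/*
+ * A common pattern (illustrative sketch; process_obj() is hypothetical)
+ * for walking a single sublist under its lock:
+ *
+ *	multilist_sublist_t *mls = multilist_sublist_lock(ml, idx);
+ *	for (void *obj = multilist_sublist_head(mls); obj != NULL;
+ *	    obj = multilist_sublist_next(mls, obj))
+ *		process_obj(obj);
+ *	multilist_sublist_unlock(mls);
+ */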
+
+/*
+ * We're allowing any object to be inserted into this specific sublist,
+ * but this can lead to trouble if multilist_remove() is called to
+ * remove this object. Specifically, if calling ml_index_func on this
+ * object returns a sublist index different from the one passed as a
+ * parameter here, any later call to multilist_remove() on this newly
+ * inserted object is undefined: multilist_remove() will try to remove
+ * the object from a list that does not contain it.
+ */
+void
+multilist_sublist_insert_head(multilist_sublist_t *mls, void *obj)
+{
+ ASSERT(MUTEX_HELD(&mls->mls_lock));
+ list_insert_head(&mls->mls_list, obj);
+}
+
+/* please see comment above multilist_sublist_insert_head */
+void
+multilist_sublist_insert_tail(multilist_sublist_t *mls, void *obj)
+{
+ ASSERT(MUTEX_HELD(&mls->mls_lock));
+ list_insert_tail(&mls->mls_list, obj);
+}
+
+/*
+ * Move the object one element forward in the list.
+ *
+ * This function will move the given object forward in the list (towards
+ * the head) by one object. So, in essence, it will swap its position in
+ * the list with its "prev" pointer. If the given object is already at the
+ * head of the list, it cannot be moved forward any more than it already
+ * is, so no action is taken.
+ *
+ * NOTE: This function **must not** remove any object from the list other
+ * than the object given as the parameter. This is relied upon in
+ * arc_evict_state_impl().
+ */
+void
+multilist_sublist_move_forward(multilist_sublist_t *mls, void *obj)
+{
+ void *prev = list_prev(&mls->mls_list, obj);
+
+ ASSERT(MUTEX_HELD(&mls->mls_lock));
+ ASSERT(!list_is_empty(&mls->mls_list));
+
+ /* 'obj' must be at the head of the list, nothing to do */
+ if (prev == NULL)
+ return;
+
+ list_remove(&mls->mls_list, obj);
+ list_insert_before(&mls->mls_list, prev, obj);
+}
+
+void
+multilist_sublist_remove(multilist_sublist_t *mls, void *obj)
+{
+ ASSERT(MUTEX_HELD(&mls->mls_lock));
+ list_remove(&mls->mls_list, obj);
+}
+
+int
+multilist_sublist_is_empty(multilist_sublist_t *mls)
+{
+ ASSERT(MUTEX_HELD(&mls->mls_lock));
+ return (list_is_empty(&mls->mls_list));
+}
+
+int
+multilist_sublist_is_empty_idx(multilist_t *ml, unsigned int sublist_idx)
+{
+ multilist_sublist_t *mls;
+ int empty;
+
+ ASSERT3U(sublist_idx, <, ml->ml_num_sublists);
+ mls = &ml->ml_sublists[sublist_idx];
+ ASSERT(!MUTEX_HELD(&mls->mls_lock));
+ mutex_enter(&mls->mls_lock);
+ empty = list_is_empty(&mls->mls_list);
+ mutex_exit(&mls->mls_lock);
+ return (empty);
+}
+
+void *
+multilist_sublist_head(multilist_sublist_t *mls)
+{
+ ASSERT(MUTEX_HELD(&mls->mls_lock));
+ return (list_head(&mls->mls_list));
+}
+
+void *
+multilist_sublist_tail(multilist_sublist_t *mls)
+{
+ ASSERT(MUTEX_HELD(&mls->mls_lock));
+ return (list_tail(&mls->mls_list));
+}
+
+void *
+multilist_sublist_next(multilist_sublist_t *mls, void *obj)
+{
+ ASSERT(MUTEX_HELD(&mls->mls_lock));
+ return (list_next(&mls->mls_list, obj));
+}
+
+void *
+multilist_sublist_prev(multilist_sublist_t *mls, void *obj)
+{
+ ASSERT(MUTEX_HELD(&mls->mls_lock));
+ return (list_prev(&mls->mls_list, obj));
+}
+
+void
+multilist_link_init(multilist_node_t *link)
+{
+ list_link_init(link);
+}
+
+int
+multilist_link_active(multilist_node_t *link)
+{
+ return (list_link_active(link));
+}
diff --git a/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/range_tree.c b/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/range_tree.c
new file mode 100644
index 000000000000..fc705e37964d
--- /dev/null
+++ b/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/range_tree.c
@@ -0,0 +1,670 @@
+/*
+ * CDDL HEADER START
+ *
+ * The contents of this file are subject to the terms of the
+ * Common Development and Distribution License (the "License").
+ * You may not use this file except in compliance with the License.
+ *
+ * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
+ * or http://www.opensolaris.org/os/licensing.
+ * See the License for the specific language governing permissions
+ * and limitations under the License.
+ *
+ * When distributing Covered Code, include this CDDL HEADER in each
+ * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
+ * If applicable, add the following below this CDDL HEADER, with the
+ * fields enclosed by brackets "[]" replaced with your own identifying
+ * information: Portions Copyright [yyyy] [name of copyright owner]
+ *
+ * CDDL HEADER END
+ */
+/*
+ * Copyright 2009 Sun Microsystems, Inc. All rights reserved.
+ * Use is subject to license terms.
+ */
+/*
+ * Copyright (c) 2013, 2017 by Delphix. All rights reserved.
+ */
+
+#include <sys/zfs_context.h>
+#include <sys/spa.h>
+#include <sys/dmu.h>
+#include <sys/dnode.h>
+#include <sys/zio.h>
+#include <sys/range_tree.h>
+
+/*
+ * Range trees are tree-based data structures that can be used to
+ * track free space or generally any space allocation information.
+ * A range tree keeps track of individual segments and automatically
+ * provides facilities such as adjacent extent merging and extent
+ * splitting in response to range add/remove requests.
+ *
+ * A range tree starts out completely empty, with no segments in it.
+ * Adding an allocation via range_tree_add to the range tree can:
+ * 1) create a new extent
+ * 2) extend an adjacent extent
+ * 3) merge two adjacent extents
+ * Conversely, removing an allocation via range_tree_remove can:
+ * 1) completely remove an extent
+ * 2) shorten an extent (if the allocation was near one of its ends)
+ * 3) split an extent into two extents, in effect punching a hole
+ *
+ * A range tree is also capable of 'bridging' gaps when adding
+ * allocations. This is useful for cases when close proximity of
+ * allocations is an important detail that needs to be represented
+ * in the range tree. See range_tree_set_gap(). The default behavior
+ * is not to bridge gaps (i.e. the maximum allowed gap size is 0).
+ *
+ * In order to traverse a range tree, use either the range_tree_walk()
+ * or range_tree_vacate() functions.
+ *
+ * To obtain more accurate information on individual segment
+ * operations that the range tree performs "under the hood", you can
+ * specify a set of callbacks by passing a range_tree_ops_t structure
+ * to the range_tree_create function. Any callbacks that are non-NULL
+ * are then called at the appropriate times.
+ *
+ * The range tree code also supports a special variant of range trees
+ * that can bridge small gaps between segments. This kind of tree is used
+ * by the dsl scanning code to group I/Os into mostly sequential chunks to
+ * optimize disk performance. The code here attempts to do this with as
+ * little memory and computational overhead as possible. One limitation of
+ * this implementation is that segments of range trees with gaps can only
+ * support removing complete segments.
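+ *
+ * A minimal usage sketch (illustrative; no ops callbacks, no gap):
+ *
+ *	range_tree_t *rt = range_tree_create(NULL, NULL);
+ *	range_tree_add(rt, 0x1000, 0x1000);	new segment [0x1000, 0x2000)
+ *	range_tree_add(rt, 0x2000, 0x1000);	merges to [0x1000, 0x3000)
+ *	range_tree_remove(rt, 0x1800, 0x400);	punches a hole, two segments
+ *	range_tree_vacate(rt, NULL, NULL);	empties the tree
+ *	range_tree_destroy(rt);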
+ */
+
+kmem_cache_t *range_seg_cache;
+
+/* Generic ops for managing an AVL tree alongside a range tree */
+struct range_tree_ops rt_avl_ops = {
+ .rtop_create = rt_avl_create,
+ .rtop_destroy = rt_avl_destroy,
+ .rtop_add = rt_avl_add,
+ .rtop_remove = rt_avl_remove,
+ .rtop_vacate = rt_avl_vacate,
+};
+
+void
+range_tree_init(void)
+{
+ ASSERT(range_seg_cache == NULL);
+ range_seg_cache = kmem_cache_create("range_seg_cache",
+ sizeof (range_seg_t), 0, NULL, NULL, NULL, NULL, NULL, 0);
+}
+
+void
+range_tree_fini(void)
+{
+ kmem_cache_destroy(range_seg_cache);
+ range_seg_cache = NULL;
+}
+
+void
+range_tree_stat_verify(range_tree_t *rt)
+{
+ range_seg_t *rs;
+ uint64_t hist[RANGE_TREE_HISTOGRAM_SIZE] = { 0 };
+ int i;
+
+ for (rs = avl_first(&rt->rt_root); rs != NULL;
+ rs = AVL_NEXT(&rt->rt_root, rs)) {
+ uint64_t size = rs->rs_end - rs->rs_start;
+ int idx = highbit64(size) - 1;
+
+ hist[idx]++;
+ ASSERT3U(hist[idx], !=, 0);
+ }
+
+ for (i = 0; i < RANGE_TREE_HISTOGRAM_SIZE; i++) {
+ if (hist[i] != rt->rt_histogram[i]) {
+			zfs_dbgmsg("i=%d, hist=%p, hist[i]=%llu, "
+			    "rt_hist[i]=%llu",
+			    i, hist, hist[i], rt->rt_histogram[i]);
+ }
+ VERIFY3U(hist[i], ==, rt->rt_histogram[i]);
+ }
+}
+
+static void
+range_tree_stat_incr(range_tree_t *rt, range_seg_t *rs)
+{
+ uint64_t size = rs->rs_end - rs->rs_start;
+ int idx = highbit64(size) - 1;
+
+ ASSERT(size != 0);
+ ASSERT3U(idx, <,
+ sizeof (rt->rt_histogram) / sizeof (*rt->rt_histogram));
+
+ rt->rt_histogram[idx]++;
+ ASSERT3U(rt->rt_histogram[idx], !=, 0);
+}
+
+static void
+range_tree_stat_decr(range_tree_t *rt, range_seg_t *rs)
+{
+ uint64_t size = rs->rs_end - rs->rs_start;
+ int idx = highbit64(size) - 1;
+
+ ASSERT(size != 0);
+ ASSERT3U(idx, <,
+ sizeof (rt->rt_histogram) / sizeof (*rt->rt_histogram));
+
+ ASSERT3U(rt->rt_histogram[idx], !=, 0);
+ rt->rt_histogram[idx]--;
+}
+
+/*
+ * NOTE: caller is responsible for all locking.
+ */
+static int
+range_tree_seg_compare(const void *x1, const void *x2)
+{
+ const range_seg_t *r1 = (const range_seg_t *)x1;
+ const range_seg_t *r2 = (const range_seg_t *)x2;
+
+ ASSERT3U(r1->rs_start, <=, r1->rs_end);
+ ASSERT3U(r2->rs_start, <=, r2->rs_end);
+
+ return ((r1->rs_start >= r2->rs_end) - (r1->rs_end <= r2->rs_start));
+}
+
+range_tree_t *
+range_tree_create_impl(range_tree_ops_t *ops, void *arg,
+ int (*avl_compare) (const void *, const void *), uint64_t gap)
+{
+ range_tree_t *rt = kmem_zalloc(sizeof (range_tree_t), KM_SLEEP);
+
+ avl_create(&rt->rt_root, range_tree_seg_compare,
+ sizeof (range_seg_t), offsetof(range_seg_t, rs_node));
+
+ rt->rt_ops = ops;
+ rt->rt_arg = arg;
+ rt->rt_gap = gap;
+ rt->rt_avl_compare = avl_compare;
+
+ if (rt->rt_ops != NULL && rt->rt_ops->rtop_create != NULL)
+ rt->rt_ops->rtop_create(rt, rt->rt_arg);
+
+ return (rt);
+}
+
+range_tree_t *
+range_tree_create(range_tree_ops_t *ops, void *arg)
+{
+ return (range_tree_create_impl(ops, arg, NULL, 0));
+}
+
+void
+range_tree_destroy(range_tree_t *rt)
+{
+ VERIFY0(rt->rt_space);
+
+ if (rt->rt_ops != NULL && rt->rt_ops->rtop_destroy != NULL)
+ rt->rt_ops->rtop_destroy(rt, rt->rt_arg);
+
+ avl_destroy(&rt->rt_root);
+ kmem_free(rt, sizeof (*rt));
+}
+
+void
+range_tree_adjust_fill(range_tree_t *rt, range_seg_t *rs, int64_t delta)
+{
+ ASSERT3U(rs->rs_fill + delta, !=, 0);
+ ASSERT3U(rs->rs_fill + delta, <=, rs->rs_end - rs->rs_start);
+
+ if (rt->rt_ops != NULL && rt->rt_ops->rtop_remove != NULL)
+ rt->rt_ops->rtop_remove(rt, rs, rt->rt_arg);
+ rs->rs_fill += delta;
+ if (rt->rt_ops != NULL && rt->rt_ops->rtop_add != NULL)
+ rt->rt_ops->rtop_add(rt, rs, rt->rt_arg);
+}
+
+static void
+range_tree_add_impl(void *arg, uint64_t start, uint64_t size, uint64_t fill)
+{
+ range_tree_t *rt = arg;
+ avl_index_t where;
+ range_seg_t rsearch, *rs_before, *rs_after, *rs;
+ uint64_t end = start + size, gap = rt->rt_gap;
+ uint64_t bridge_size = 0;
+ boolean_t merge_before, merge_after;
+
+ ASSERT3U(size, !=, 0);
+ ASSERT3U(fill, <=, size);
+
+ rsearch.rs_start = start;
+ rsearch.rs_end = end;
+ rs = avl_find(&rt->rt_root, &rsearch, &where);
+
+ if (gap == 0 && rs != NULL &&
+ rs->rs_start <= start && rs->rs_end >= end) {
+		zfs_panic_recover("zfs: allocating allocated segment "
+		    "(offset=%llu size=%llu) of (offset=%llu size=%llu)\n",
+ (longlong_t)start, (longlong_t)size,
+ (longlong_t)rs->rs_start,
+ (longlong_t)rs->rs_end - rs->rs_start);
+ return;
+ }
+
+ /*
+ * If this is a gap-supporting range tree, it is possible that we
+ * are inserting into an existing segment. In this case simply
+ * bump the fill count and call the remove / add callbacks. If the
+ * new range will extend an existing segment, we remove the
+ * existing one, apply the new extent to it and re-insert it using
+ * the normal code paths.
+ */
+ if (rs != NULL) {
+ ASSERT3U(gap, !=, 0);
+ if (rs->rs_start <= start && rs->rs_end >= end) {
+ range_tree_adjust_fill(rt, rs, fill);
+ return;
+ }
+
+ avl_remove(&rt->rt_root, rs);
+ if (rt->rt_ops != NULL && rt->rt_ops->rtop_remove != NULL)
+ rt->rt_ops->rtop_remove(rt, rs, rt->rt_arg);
+
+ range_tree_stat_decr(rt, rs);
+ rt->rt_space -= rs->rs_end - rs->rs_start;
+
+ fill += rs->rs_fill;
+ start = MIN(start, rs->rs_start);
+ end = MAX(end, rs->rs_end);
+ size = end - start;
+
+ range_tree_add_impl(rt, start, size, fill);
+
+ kmem_cache_free(range_seg_cache, rs);
+ return;
+ }
+
+ ASSERT3P(rs, ==, NULL);
+
+ /*
+ * Determine whether or not we will have to merge with our neighbors.
+ * If gap != 0, we might need to merge with our neighbors even if we
+ * aren't directly touching.
+ */
+ rs_before = avl_nearest(&rt->rt_root, where, AVL_BEFORE);
+ rs_after = avl_nearest(&rt->rt_root, where, AVL_AFTER);
+
+ merge_before = (rs_before != NULL && rs_before->rs_end >= start - gap);
+ merge_after = (rs_after != NULL && rs_after->rs_start <= end + gap);
+
+ if (merge_before && gap != 0)
+ bridge_size += start - rs_before->rs_end;
+ if (merge_after && gap != 0)
+ bridge_size += rs_after->rs_start - end;
+
+ if (merge_before && merge_after) {
+ avl_remove(&rt->rt_root, rs_before);
+ if (rt->rt_ops != NULL && rt->rt_ops->rtop_remove != NULL) {
+ rt->rt_ops->rtop_remove(rt, rs_before, rt->rt_arg);
+ rt->rt_ops->rtop_remove(rt, rs_after, rt->rt_arg);
+ }
+
+ range_tree_stat_decr(rt, rs_before);
+ range_tree_stat_decr(rt, rs_after);
+
+ rs_after->rs_fill += rs_before->rs_fill + fill;
+ rs_after->rs_start = rs_before->rs_start;
+ kmem_cache_free(range_seg_cache, rs_before);
+ rs = rs_after;
+ } else if (merge_before) {
+ if (rt->rt_ops != NULL && rt->rt_ops->rtop_remove != NULL)
+ rt->rt_ops->rtop_remove(rt, rs_before, rt->rt_arg);
+
+ range_tree_stat_decr(rt, rs_before);
+
+ rs_before->rs_fill += fill;
+ rs_before->rs_end = end;
+ rs = rs_before;
+ } else if (merge_after) {
+ if (rt->rt_ops != NULL && rt->rt_ops->rtop_remove != NULL)
+ rt->rt_ops->rtop_remove(rt, rs_after, rt->rt_arg);
+
+ range_tree_stat_decr(rt, rs_after);
+
+ rs_after->rs_fill += fill;
+ rs_after->rs_start = start;
+ rs = rs_after;
+ } else {
+ rs = kmem_cache_alloc(range_seg_cache, KM_SLEEP);
+
+ rs->rs_fill = fill;
+ rs->rs_start = start;
+ rs->rs_end = end;
+ avl_insert(&rt->rt_root, rs, where);
+ }
+
+ if (gap != 0)
+ ASSERT3U(rs->rs_fill, <=, rs->rs_end - rs->rs_start);
+ else
+ ASSERT3U(rs->rs_fill, ==, rs->rs_end - rs->rs_start);
+
+ if (rt->rt_ops != NULL && rt->rt_ops->rtop_add != NULL)
+ rt->rt_ops->rtop_add(rt, rs, rt->rt_arg);
+
+ range_tree_stat_incr(rt, rs);
+ rt->rt_space += size + bridge_size;
+}
+
+void
+range_tree_add(void *arg, uint64_t start, uint64_t size)
+{
+ range_tree_add_impl(arg, start, size, size);
+}
+
+static void
+range_tree_remove_impl(range_tree_t *rt, uint64_t start, uint64_t size,
+ boolean_t do_fill)
+{
+ avl_index_t where;
+ range_seg_t rsearch, *rs, *newseg;
+ uint64_t end = start + size;
+ boolean_t left_over, right_over;
+
+ VERIFY3U(size, !=, 0);
+ VERIFY3U(size, <=, rt->rt_space);
+
+ rsearch.rs_start = start;
+ rsearch.rs_end = end;
+ rs = avl_find(&rt->rt_root, &rsearch, &where);
+
+ /* Make sure we completely overlap with someone */
+ if (rs == NULL) {
+ zfs_panic_recover("zfs: freeing free segment "
+ "(offset=%llu size=%llu)",
+ (longlong_t)start, (longlong_t)size);
+ return;
+ }
+
+ /*
+ * Range trees with gap support must only remove complete segments
+ * from the tree. This allows us to maintain accurate fill accounting
+ * and to ensure that bridged sections are not leaked. If we need to
+ * remove less than the full segment, we can only adjust the fill count.
+ */
+ if (rt->rt_gap != 0) {
+ if (do_fill) {
+ if (rs->rs_fill == size) {
+ start = rs->rs_start;
+ end = rs->rs_end;
+ size = end - start;
+ } else {
+ range_tree_adjust_fill(rt, rs, -size);
+ return;
+ }
+ } else if (rs->rs_start != start || rs->rs_end != end) {
+ zfs_panic_recover("zfs: freeing partial segment of "
+ "gap tree (offset=%llu size=%llu) of "
+ "(offset=%llu size=%llu)",
+ (longlong_t)start, (longlong_t)size,
+ (longlong_t)rs->rs_start,
+ (longlong_t)rs->rs_end - rs->rs_start);
+ return;
+ }
+ }
+
+ VERIFY3U(rs->rs_start, <=, start);
+ VERIFY3U(rs->rs_end, >=, end);
+
+ left_over = (rs->rs_start != start);
+ right_over = (rs->rs_end != end);
+
+ range_tree_stat_decr(rt, rs);
+
+ if (rt->rt_ops != NULL && rt->rt_ops->rtop_remove != NULL)
+ rt->rt_ops->rtop_remove(rt, rs, rt->rt_arg);
+
+ if (left_over && right_over) {
+ newseg = kmem_cache_alloc(range_seg_cache, KM_SLEEP);
+ newseg->rs_start = end;
+ newseg->rs_end = rs->rs_end;
+ newseg->rs_fill = newseg->rs_end - newseg->rs_start;
+ range_tree_stat_incr(rt, newseg);
+
+ rs->rs_end = start;
+
+ avl_insert_here(&rt->rt_root, newseg, rs, AVL_AFTER);
+ if (rt->rt_ops != NULL && rt->rt_ops->rtop_add != NULL)
+ rt->rt_ops->rtop_add(rt, newseg, rt->rt_arg);
+ } else if (left_over) {
+ rs->rs_end = start;
+ } else if (right_over) {
+ rs->rs_start = end;
+ } else {
+ avl_remove(&rt->rt_root, rs);
+ kmem_cache_free(range_seg_cache, rs);
+ rs = NULL;
+ }
+
+ if (rs != NULL) {
+ /*
+ * The fill of the leftover segment will always be equal to
+ * the size, since we do not support removing partial segments
+ * of range trees with gaps.
+ */
+ rs->rs_fill = rs->rs_end - rs->rs_start;
+ range_tree_stat_incr(rt, rs);
+
+ if (rt->rt_ops != NULL && rt->rt_ops->rtop_add != NULL)
+ rt->rt_ops->rtop_add(rt, rs, rt->rt_arg);
+ }
+
+ rt->rt_space -= size;
+}
+
+void
+range_tree_remove(void *arg, uint64_t start, uint64_t size)
+{
+ range_tree_remove_impl(arg, start, size, B_FALSE);
+}
+
+void
+range_tree_remove_fill(range_tree_t *rt, uint64_t start, uint64_t size)
+{
+ range_tree_remove_impl(rt, start, size, B_TRUE);
+}
+
+void
+range_tree_resize_segment(range_tree_t *rt, range_seg_t *rs,
+ uint64_t newstart, uint64_t newsize)
+{
+ int64_t delta = newsize - (rs->rs_end - rs->rs_start);
+
+ range_tree_stat_decr(rt, rs);
+ if (rt->rt_ops != NULL && rt->rt_ops->rtop_remove != NULL)
+ rt->rt_ops->rtop_remove(rt, rs, rt->rt_arg);
+
+ rs->rs_start = newstart;
+ rs->rs_end = newstart + newsize;
+
+ range_tree_stat_incr(rt, rs);
+ if (rt->rt_ops != NULL && rt->rt_ops->rtop_add != NULL)
+ rt->rt_ops->rtop_add(rt, rs, rt->rt_arg);
+
+ rt->rt_space += delta;
+}
+
+static range_seg_t *
+range_tree_find_impl(range_tree_t *rt, uint64_t start, uint64_t size)
+{
+ range_seg_t rsearch;
+ uint64_t end = start + size;
+
+ VERIFY(size != 0);
+
+ rsearch.rs_start = start;
+ rsearch.rs_end = end;
+ return (avl_find(&rt->rt_root, &rsearch, NULL));
+}
+
+range_seg_t *
+range_tree_find(range_tree_t *rt, uint64_t start, uint64_t size)
+{
+ range_seg_t *rs = range_tree_find_impl(rt, start, size);
+ if (rs != NULL && rs->rs_start <= start && rs->rs_end >= start + size)
+ return (rs);
+ return (NULL);
+}
+
+void
+range_tree_verify_not_present(range_tree_t *rt, uint64_t off, uint64_t size)
+{
+ range_seg_t *rs = range_tree_find(rt, off, size);
+ if (rs != NULL)
+ panic("segment already in tree; rs=%p", (void *)rs);
+}
+
+boolean_t
+range_tree_contains(range_tree_t *rt, uint64_t start, uint64_t size)
+{
+ return (range_tree_find(rt, start, size) != NULL);
+}
+
+/*
+ * Ensure that this range is not in the tree after the call, removing
+ * whatever portions of it are currently present.
+ */
+void
+range_tree_clear(range_tree_t *rt, uint64_t start, uint64_t size)
+{
+ range_seg_t *rs;
+
+ if (size == 0)
+ return;
+
+ while ((rs = range_tree_find_impl(rt, start, size)) != NULL) {
+ uint64_t free_start = MAX(rs->rs_start, start);
+ uint64_t free_end = MIN(rs->rs_end, start + size);
+ range_tree_remove(rt, free_start, free_end - free_start);
+ }
+}
+
+void
+range_tree_swap(range_tree_t **rtsrc, range_tree_t **rtdst)
+{
+ range_tree_t *rt;
+
+ ASSERT0(range_tree_space(*rtdst));
+ ASSERT0(avl_numnodes(&(*rtdst)->rt_root));
+
+ rt = *rtsrc;
+ *rtsrc = *rtdst;
+ *rtdst = rt;
+}
+
+void
+range_tree_vacate(range_tree_t *rt, range_tree_func_t *func, void *arg)
+{
+ range_seg_t *rs;
+ void *cookie = NULL;
+
+ if (rt->rt_ops != NULL && rt->rt_ops->rtop_vacate != NULL)
+ rt->rt_ops->rtop_vacate(rt, rt->rt_arg);
+
+ while ((rs = avl_destroy_nodes(&rt->rt_root, &cookie)) != NULL) {
+ if (func != NULL)
+ func(arg, rs->rs_start, rs->rs_end - rs->rs_start);
+ kmem_cache_free(range_seg_cache, rs);
+ }
+
+ bzero(rt->rt_histogram, sizeof (rt->rt_histogram));
+ rt->rt_space = 0;
+}
+
+void
+range_tree_walk(range_tree_t *rt, range_tree_func_t *func, void *arg)
+{
+ range_seg_t *rs;
+
+ for (rs = avl_first(&rt->rt_root); rs; rs = AVL_NEXT(&rt->rt_root, rs))
+ func(arg, rs->rs_start, rs->rs_end - rs->rs_start);
+}
+
+range_seg_t *
+range_tree_first(range_tree_t *rt)
+{
+ return (avl_first(&rt->rt_root));
+}
+
+uint64_t
+range_tree_space(range_tree_t *rt)
+{
+ return (rt->rt_space);
+}
+
+/* Generic range tree functions for maintaining segments in an AVL tree. */
+void
+rt_avl_create(range_tree_t *rt, void *arg)
+{
+ avl_tree_t *tree = arg;
+
+ avl_create(tree, rt->rt_avl_compare, sizeof (range_seg_t),
+ offsetof(range_seg_t, rs_pp_node));
+}
+
+void
+rt_avl_destroy(range_tree_t *rt, void *arg)
+{
+ avl_tree_t *tree = arg;
+
+ ASSERT0(avl_numnodes(tree));
+ avl_destroy(tree);
+}
+
+void
+rt_avl_add(range_tree_t *rt, range_seg_t *rs, void *arg)
+{
+ avl_tree_t *tree = arg;
+ avl_add(tree, rs);
+}
+
+void
+rt_avl_remove(range_tree_t *rt, range_seg_t *rs, void *arg)
+{
+ avl_tree_t *tree = arg;
+ avl_remove(tree, rs);
+}
+
+void
+rt_avl_vacate(range_tree_t *rt, void *arg)
+{
+ /*
+ * Normally one would walk the tree freeing nodes along the way.
+ * Since the nodes are shared with the range trees we can avoid
+ * walking all nodes and just reinitialize the avl tree. The nodes
+ * will be freed by the range tree, so we don't want to free them here.
+ */
+ rt_avl_create(rt, arg);
+}
+
+boolean_t
+range_tree_is_empty(range_tree_t *rt)
+{
+ ASSERT(rt != NULL);
+ return (range_tree_space(rt) == 0);
+}
+
+uint64_t
+range_tree_min(range_tree_t *rt)
+{
+ range_seg_t *rs = avl_first(&rt->rt_root);
+ return (rs != NULL ? rs->rs_start : 0);
+}
+
+uint64_t
+range_tree_max(range_tree_t *rt)
+{
+ range_seg_t *rs = avl_last(&rt->rt_root);
+ return (rs != NULL ? rs->rs_end : 0);
+}
+
+uint64_t
+range_tree_span(range_tree_t *rt)
+{
+ return (range_tree_max(rt) - range_tree_min(rt));
+}
diff --git a/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/refcount.c b/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/refcount.c
new file mode 100644
index 000000000000..b03a3c4abd45
--- /dev/null
+++ b/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/refcount.c
@@ -0,0 +1,321 @@
+/*
+ * CDDL HEADER START
+ *
+ * The contents of this file are subject to the terms of the
+ * Common Development and Distribution License (the "License").
+ * You may not use this file except in compliance with the License.
+ *
+ * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
+ * or http://www.opensolaris.org/os/licensing.
+ * See the License for the specific language governing permissions
+ * and limitations under the License.
+ *
+ * When distributing Covered Code, include this CDDL HEADER in each
+ * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
+ * If applicable, add the following below this CDDL HEADER, with the
+ * fields enclosed by brackets "[]" replaced with your own identifying
+ * information: Portions Copyright [yyyy] [name of copyright owner]
+ *
+ * CDDL HEADER END
+ */
+/*
+ * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved.
+ * Copyright (c) 2012, 2015 by Delphix. All rights reserved.
+ */
+
+#include <sys/zfs_context.h>
+#include <sys/refcount.h>
+
+#ifdef ZFS_DEBUG
+
+#ifdef _KERNEL
+int reference_tracking_enable = FALSE; /* runs out of memory too easily */
+SYSCTL_DECL(_vfs_zfs);
+SYSCTL_INT(_vfs_zfs, OID_AUTO, reference_tracking_enable, CTLFLAG_RDTUN,
+ &reference_tracking_enable, 0,
+ "Track reference holders to refcount_t objects, used mostly by ZFS");
+#else
+int reference_tracking_enable = TRUE;
+#endif
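+
+/*
+ * Note: CTLFLAG_RDTUN above makes this a boot-time tunable on FreeBSD;
+ * it would be set from loader.conf, e.g. (illustrative):
+ *
+ *	vfs.zfs.reference_tracking_enable=1
+ */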
+int reference_history = 3; /* tunable */
+
+static kmem_cache_t *reference_cache;
+static kmem_cache_t *reference_history_cache;
+
+void
+zfs_refcount_init(void)
+{
+ reference_cache = kmem_cache_create("reference_cache",
+ sizeof (reference_t), 0, NULL, NULL, NULL, NULL, NULL, 0);
+
+ reference_history_cache = kmem_cache_create("reference_history_cache",
+ sizeof (uint64_t), 0, NULL, NULL, NULL, NULL, NULL, 0);
+}
+
+void
+zfs_refcount_fini(void)
+{
+ kmem_cache_destroy(reference_cache);
+ kmem_cache_destroy(reference_history_cache);
+}
+
+void
+zfs_refcount_create(zfs_refcount_t *rc)
+{
+ mutex_init(&rc->rc_mtx, NULL, MUTEX_DEFAULT, NULL);
+ list_create(&rc->rc_list, sizeof (reference_t),
+ offsetof(reference_t, ref_link));
+ list_create(&rc->rc_removed, sizeof (reference_t),
+ offsetof(reference_t, ref_link));
+ rc->rc_count = 0;
+ rc->rc_removed_count = 0;
+ rc->rc_tracked = reference_tracking_enable;
+}
+
+void
+zfs_refcount_create_tracked(zfs_refcount_t *rc)
+{
+ zfs_refcount_create(rc);
+ rc->rc_tracked = B_TRUE;
+}
+
+void
+zfs_refcount_create_untracked(zfs_refcount_t *rc)
+{
+ zfs_refcount_create(rc);
+ rc->rc_tracked = B_FALSE;
+}
+
+void
+zfs_refcount_destroy_many(zfs_refcount_t *rc, uint64_t number)
+{
+ reference_t *ref;
+
+ ASSERT(rc->rc_count == number);
+	while ((ref = list_head(&rc->rc_list)) != NULL) {
+ list_remove(&rc->rc_list, ref);
+ kmem_cache_free(reference_cache, ref);
+ }
+ list_destroy(&rc->rc_list);
+
+	while ((ref = list_head(&rc->rc_removed)) != NULL) {
+ list_remove(&rc->rc_removed, ref);
+ kmem_cache_free(reference_history_cache, ref->ref_removed);
+ kmem_cache_free(reference_cache, ref);
+ }
+ list_destroy(&rc->rc_removed);
+ mutex_destroy(&rc->rc_mtx);
+}
+
+void
+zfs_refcount_destroy(zfs_refcount_t *rc)
+{
+ zfs_refcount_destroy_many(rc, 0);
+}
+
+int
+zfs_refcount_is_zero(zfs_refcount_t *rc)
+{
+ return (rc->rc_count == 0);
+}
+
+int64_t
+zfs_refcount_count(zfs_refcount_t *rc)
+{
+ return (rc->rc_count);
+}
+
+int64_t
+zfs_refcount_add_many(zfs_refcount_t *rc, uint64_t number, void *holder)
+{
+ reference_t *ref = NULL;
+ int64_t count;
+
+ if (rc->rc_tracked) {
+ ref = kmem_cache_alloc(reference_cache, KM_SLEEP);
+ ref->ref_holder = holder;
+ ref->ref_number = number;
+ }
+ mutex_enter(&rc->rc_mtx);
+ ASSERT(rc->rc_count >= 0);
+ if (rc->rc_tracked)
+ list_insert_head(&rc->rc_list, ref);
+ rc->rc_count += number;
+ count = rc->rc_count;
+ mutex_exit(&rc->rc_mtx);
+
+ return (count);
+}
+
+int64_t
+zfs_refcount_add(zfs_refcount_t *rc, void *holder)
+{
+ return (zfs_refcount_add_many(rc, 1, holder));
+}
+
+int64_t
+zfs_refcount_remove_many(zfs_refcount_t *rc, uint64_t number, void *holder)
+{
+ reference_t *ref;
+ int64_t count;
+
+ mutex_enter(&rc->rc_mtx);
+ ASSERT(rc->rc_count >= number);
+
+ if (!rc->rc_tracked) {
+ rc->rc_count -= number;
+ count = rc->rc_count;
+ mutex_exit(&rc->rc_mtx);
+ return (count);
+ }
+
+ for (ref = list_head(&rc->rc_list); ref;
+ ref = list_next(&rc->rc_list, ref)) {
+ if (ref->ref_holder == holder && ref->ref_number == number) {
+ list_remove(&rc->rc_list, ref);
+ if (reference_history > 0) {
+ ref->ref_removed =
+ kmem_cache_alloc(reference_history_cache,
+ KM_SLEEP);
+ list_insert_head(&rc->rc_removed, ref);
+ rc->rc_removed_count++;
+ if (rc->rc_removed_count > reference_history) {
+ ref = list_tail(&rc->rc_removed);
+ list_remove(&rc->rc_removed, ref);
+ kmem_cache_free(reference_history_cache,
+ ref->ref_removed);
+ kmem_cache_free(reference_cache, ref);
+ rc->rc_removed_count--;
+ }
+ } else {
+ kmem_cache_free(reference_cache, ref);
+ }
+ rc->rc_count -= number;
+ count = rc->rc_count;
+ mutex_exit(&rc->rc_mtx);
+ return (count);
+ }
+ }
+ panic("No such hold %p on refcount %llx", holder,
+ (u_longlong_t)(uintptr_t)rc);
+ return (-1);
+}
+
+int64_t
+zfs_refcount_remove(zfs_refcount_t *rc, void *holder)
+{
+ return (zfs_refcount_remove_many(rc, 1, holder));
+}
+
+void
+zfs_refcount_transfer(zfs_refcount_t *dst, zfs_refcount_t *src)
+{
+ int64_t count, removed_count;
+ list_t list, removed;
+
+ list_create(&list, sizeof (reference_t),
+ offsetof(reference_t, ref_link));
+ list_create(&removed, sizeof (reference_t),
+ offsetof(reference_t, ref_link));
+
+ mutex_enter(&src->rc_mtx);
+ count = src->rc_count;
+ removed_count = src->rc_removed_count;
+ src->rc_count = 0;
+ src->rc_removed_count = 0;
+ list_move_tail(&list, &src->rc_list);
+ list_move_tail(&removed, &src->rc_removed);
+ mutex_exit(&src->rc_mtx);
+
+ mutex_enter(&dst->rc_mtx);
+ dst->rc_count += count;
+ dst->rc_removed_count += removed_count;
+ list_move_tail(&dst->rc_list, &list);
+ list_move_tail(&dst->rc_removed, &removed);
+ mutex_exit(&dst->rc_mtx);
+
+ list_destroy(&list);
+ list_destroy(&removed);
+}
+
+void
+zfs_refcount_transfer_ownership(zfs_refcount_t *rc, void *current_holder,
+ void *new_holder)
+{
+ reference_t *ref;
+ boolean_t found = B_FALSE;
+
+ mutex_enter(&rc->rc_mtx);
+ if (!rc->rc_tracked) {
+ mutex_exit(&rc->rc_mtx);
+ return;
+ }
+
+ for (ref = list_head(&rc->rc_list); ref;
+ ref = list_next(&rc->rc_list, ref)) {
+ if (ref->ref_holder == current_holder) {
+ ref->ref_holder = new_holder;
+ found = B_TRUE;
+ break;
+ }
+ }
+ ASSERT(found);
+ mutex_exit(&rc->rc_mtx);
+}
+
+/*
+ * If tracking is enabled, return true if a reference exists that matches
+ * the "holder" tag. If tracking is disabled, then return true if a reference
+ * might be held.
+ */
+boolean_t
+zfs_refcount_held(zfs_refcount_t *rc, void *holder)
+{
+ reference_t *ref;
+
+ mutex_enter(&rc->rc_mtx);
+
+ if (!rc->rc_tracked) {
+ mutex_exit(&rc->rc_mtx);
+ return (rc->rc_count > 0);
+ }
+
+ for (ref = list_head(&rc->rc_list); ref;
+ ref = list_next(&rc->rc_list, ref)) {
+ if (ref->ref_holder == holder) {
+ mutex_exit(&rc->rc_mtx);
+ return (B_TRUE);
+ }
+ }
+ mutex_exit(&rc->rc_mtx);
+ return (B_FALSE);
+}
+
+/*
+ * If tracking is enabled, return true if a reference does not exist that
+ * matches the "holder" tag. If tracking is disabled, always return true
+ * since the reference might not be held.
+ */
+boolean_t
+zfs_refcount_not_held(zfs_refcount_t *rc, void *holder)
+{
+ reference_t *ref;
+
+ mutex_enter(&rc->rc_mtx);
+
+ if (!rc->rc_tracked) {
+ mutex_exit(&rc->rc_mtx);
+ return (B_TRUE);
+ }
+
+ for (ref = list_head(&rc->rc_list); ref;
+ ref = list_next(&rc->rc_list, ref)) {
+ if (ref->ref_holder == holder) {
+ mutex_exit(&rc->rc_mtx);
+ return (B_FALSE);
+ }
+ }
+ mutex_exit(&rc->rc_mtx);
+ return (B_TRUE);
+}
+#endif /* ZFS_DEBUG */
diff --git a/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/rrwlock.c b/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/rrwlock.c
new file mode 100644
index 000000000000..6e7456efb2d5
--- /dev/null
+++ b/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/rrwlock.c
@@ -0,0 +1,396 @@
+/*
+ * CDDL HEADER START
+ *
+ * The contents of this file are subject to the terms of the
+ * Common Development and Distribution License (the "License").
+ * You may not use this file except in compliance with the License.
+ *
+ * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
+ * or http://www.opensolaris.org/os/licensing.
+ * See the License for the specific language governing permissions
+ * and limitations under the License.
+ *
+ * When distributing Covered Code, include this CDDL HEADER in each
+ * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
+ * If applicable, add the following below this CDDL HEADER, with the
+ * fields enclosed by brackets "[]" replaced with your own identifying
+ * information: Portions Copyright [yyyy] [name of copyright owner]
+ *
+ * CDDL HEADER END
+ */
+/*
+ * Copyright 2009 Sun Microsystems, Inc. All rights reserved.
+ * Use is subject to license terms.
+ */
+/*
+ * Copyright (c) 2012 by Delphix. All rights reserved.
+ */
+
+#include <sys/refcount.h>
+#include <sys/rrwlock.h>
+
+/*
+ * This file contains the implementation of a re-entrant-read
+ * reader/writer lock (aka "rrwlock").
+ *
+ * This is a normal reader/writer lock with the additional feature
+ * of allowing threads who have already obtained a read lock to
+ * re-enter another read lock (re-entrant read) - even if there are
+ * waiting writers.
+ *
+ * Callers who have not obtained a read lock give waiting writers priority.
+ *
+ * The rrwlock_t lock does not allow re-entrant writers, nor does it
+ * allow a re-entrant mix of reads and writes (that is, it does not
+ * allow a caller who has already obtained a read lock to be able to
+ * then grab a write lock without first dropping all read locks, and
+ * vice versa).
+ *
+ * The rrwlock_t uses tsd (thread specific data) to keep a list of
+ * nodes (rrw_node_t), where each node keeps track of which specific
+ * lock (rrw_node_t::rn_rrl) the thread has grabbed. Since re-entering
+ * should be rare, a thread that grabs multiple reads on the same rrwlock_t
+ * will store multiple rrw_node_ts with the same 'rn_rrl'. Nodes on the
+ * tsd list can each represent a different rrwlock_t. This allows a thread
+ * to hold read locks on multiple distinct rrwlock_ts at the same time.
+ *
+ * Since using tsd imposes some overhead, the rrwlock_t only needs to
+ * keep tsd data when writers are waiting. If no writers are waiting, then
+ * a reader just bumps the anonymous read count (rr_anon_rcount) - no tsd
+ * is needed. Once a writer attempts to grab the lock, readers then
+ * keep tsd data and bump the linked readers count (rr_linked_rcount).
+ *
+ * If there are waiting writers and there are anonymous readers, then a
+ * reader doesn't know if its acquisition is re-entrant. But since it may
+ * be one, we allow the read to proceed (otherwise it could deadlock).
+ * Once writers are waiting, readers no longer bump the anonymous count,
+ * so the anonymous readers will eventually flush themselves out. At that
+ * point, each reader can tell whether it is re-entrant (it has a
+ * rrw_node_t entry for the lock) or not. If it is re-entrant, we must
+ * let it proceed; if it is not, the reader blocks until the waiting
+ * writers are done. Hence, we do not starve writers.
+ */
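+
+/*
+ * Illustrative example: a thread that already holds the lock for read
+ * may re-enter it even while a writer is waiting:
+ *
+ *	rrw_enter_read(&rrl, FTAG);
+ *	...			(another thread calls rrw_enter_write()
+ *				and blocks, setting rr_writer_wanted)
+ *	rrw_enter_read(&rrl, FTAG);	re-entrant; proceeds via tsd node
+ *	rrw_exit(&rrl, FTAG);
+ *	rrw_exit(&rrl, FTAG);		last exit wakes the waiting writer
+ *
+ * A fresh reader arriving after rr_writer_wanted is set would instead
+ * block in rrw_enter_read() until the writer is done.
+ */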
+
+/* global key for TSD */
+uint_t rrw_tsd_key;
+
+typedef struct rrw_node {
+ struct rrw_node *rn_next;
+ rrwlock_t *rn_rrl;
+ void *rn_tag;
+} rrw_node_t;
+
+static rrw_node_t *
+rrn_find(rrwlock_t *rrl)
+{
+ rrw_node_t *rn;
+
+ if (zfs_refcount_count(&rrl->rr_linked_rcount) == 0)
+ return (NULL);
+
+ for (rn = tsd_get(rrw_tsd_key); rn != NULL; rn = rn->rn_next) {
+ if (rn->rn_rrl == rrl)
+ return (rn);
+ }
+ return (NULL);
+}
+
+/*
+ * Add a node to the head of the singly linked list.
+ */
+static void
+rrn_add(rrwlock_t *rrl, void *tag)
+{
+ rrw_node_t *rn;
+
+ rn = kmem_alloc(sizeof (*rn), KM_SLEEP);
+ rn->rn_rrl = rrl;
+ rn->rn_next = tsd_get(rrw_tsd_key);
+ rn->rn_tag = tag;
+ VERIFY(tsd_set(rrw_tsd_key, rn) == 0);
+}
+
+/*
+ * If a node is found for 'rrl', then remove the node from this
+ * thread's list and return TRUE; otherwise return FALSE.
+ */
+static boolean_t
+rrn_find_and_remove(rrwlock_t *rrl, void *tag)
+{
+ rrw_node_t *rn;
+ rrw_node_t *prev = NULL;
+
+ if (zfs_refcount_count(&rrl->rr_linked_rcount) == 0)
+ return (B_FALSE);
+
+ for (rn = tsd_get(rrw_tsd_key); rn != NULL; rn = rn->rn_next) {
+ if (rn->rn_rrl == rrl && rn->rn_tag == tag) {
+ if (prev)
+ prev->rn_next = rn->rn_next;
+ else
+ VERIFY(tsd_set(rrw_tsd_key, rn->rn_next) == 0);
+ kmem_free(rn, sizeof (*rn));
+ return (B_TRUE);
+ }
+ prev = rn;
+ }
+ return (B_FALSE);
+}
+
+void
+rrw_init(rrwlock_t *rrl, boolean_t track_all)
+{
+ mutex_init(&rrl->rr_lock, NULL, MUTEX_DEFAULT, NULL);
+ cv_init(&rrl->rr_cv, NULL, CV_DEFAULT, NULL);
+ rrl->rr_writer = NULL;
+ zfs_refcount_create(&rrl->rr_anon_rcount);
+ zfs_refcount_create(&rrl->rr_linked_rcount);
+ rrl->rr_writer_wanted = B_FALSE;
+ rrl->rr_track_all = track_all;
+}
+
+void
+rrw_destroy(rrwlock_t *rrl)
+{
+ mutex_destroy(&rrl->rr_lock);
+ cv_destroy(&rrl->rr_cv);
+ ASSERT(rrl->rr_writer == NULL);
+ zfs_refcount_destroy(&rrl->rr_anon_rcount);
+ zfs_refcount_destroy(&rrl->rr_linked_rcount);
+}
+
+static void
+rrw_enter_read_impl(rrwlock_t *rrl, boolean_t prio, void *tag)
+{
+ mutex_enter(&rrl->rr_lock);
+#if !defined(DEBUG) && defined(_KERNEL)
+ if (rrl->rr_writer == NULL && !rrl->rr_writer_wanted &&
+ !rrl->rr_track_all) {
+ rrl->rr_anon_rcount.rc_count++;
+ mutex_exit(&rrl->rr_lock);
+ return;
+ }
+ DTRACE_PROBE(zfs__rrwfastpath__rdmiss);
+#endif
+ ASSERT(rrl->rr_writer != curthread);
+ ASSERT(zfs_refcount_count(&rrl->rr_anon_rcount) >= 0);
+
+ while (rrl->rr_writer != NULL || (rrl->rr_writer_wanted &&
+ zfs_refcount_is_zero(&rrl->rr_anon_rcount) && !prio &&
+ rrn_find(rrl) == NULL))
+ cv_wait(&rrl->rr_cv, &rrl->rr_lock);
+
+ if (rrl->rr_writer_wanted || rrl->rr_track_all) {
+ /* may or may not be a re-entrant enter */
+ rrn_add(rrl, tag);
+ (void) zfs_refcount_add(&rrl->rr_linked_rcount, tag);
+ } else {
+ (void) zfs_refcount_add(&rrl->rr_anon_rcount, tag);
+ }
+ ASSERT(rrl->rr_writer == NULL);
+ mutex_exit(&rrl->rr_lock);
+}
+
+void
+rrw_enter_read(rrwlock_t *rrl, void *tag)
+{
+ rrw_enter_read_impl(rrl, B_FALSE, tag);
+}
+
+/*
+ * Take a read lock even if there are pending write lock requests. If we
+ * want to take a lock reentrantly, but from different threads (that have
+ * a relationship to each other), the normal detection mechanism to
+ * overrule the pending writer does not work, so we have to give an
+ * explicit hint here.
+ */
+void
+rrw_enter_read_prio(rrwlock_t *rrl, void *tag)
+{
+ rrw_enter_read_impl(rrl, B_TRUE, tag);
+}
+
+void
+rrw_enter_write(rrwlock_t *rrl)
+{
+ mutex_enter(&rrl->rr_lock);
+ ASSERT(rrl->rr_writer != curthread);
+
+ while (zfs_refcount_count(&rrl->rr_anon_rcount) > 0 ||
+ zfs_refcount_count(&rrl->rr_linked_rcount) > 0 ||
+ rrl->rr_writer != NULL) {
+ rrl->rr_writer_wanted = B_TRUE;
+ cv_wait(&rrl->rr_cv, &rrl->rr_lock);
+ }
+ rrl->rr_writer_wanted = B_FALSE;
+ rrl->rr_writer = curthread;
+ mutex_exit(&rrl->rr_lock);
+}
+
+void
+rrw_enter(rrwlock_t *rrl, krw_t rw, void *tag)
+{
+ if (rw == RW_READER)
+ rrw_enter_read(rrl, tag);
+ else
+ rrw_enter_write(rrl);
+}
+
+void
+rrw_exit(rrwlock_t *rrl, void *tag)
+{
+ mutex_enter(&rrl->rr_lock);
+#if !defined(DEBUG) && defined(_KERNEL)
+ if (!rrl->rr_writer && rrl->rr_linked_rcount.rc_count == 0) {
+ rrl->rr_anon_rcount.rc_count--;
+ if (rrl->rr_anon_rcount.rc_count == 0)
+ cv_broadcast(&rrl->rr_cv);
+ mutex_exit(&rrl->rr_lock);
+ return;
+ }
+ DTRACE_PROBE(zfs__rrwfastpath__exitmiss);
+#endif
+ ASSERT(!zfs_refcount_is_zero(&rrl->rr_anon_rcount) ||
+ !zfs_refcount_is_zero(&rrl->rr_linked_rcount) ||
+ rrl->rr_writer != NULL);
+
+ if (rrl->rr_writer == NULL) {
+ int64_t count;
+ if (rrn_find_and_remove(rrl, tag)) {
+ count = zfs_refcount_remove(
+ &rrl->rr_linked_rcount, tag);
+ } else {
+ ASSERT(!rrl->rr_track_all);
+ count = zfs_refcount_remove(&rrl->rr_anon_rcount, tag);
+ }
+ if (count == 0)
+ cv_broadcast(&rrl->rr_cv);
+ } else {
+ ASSERT(rrl->rr_writer == curthread);
+ ASSERT(zfs_refcount_is_zero(&rrl->rr_anon_rcount) &&
+ zfs_refcount_is_zero(&rrl->rr_linked_rcount));
+ rrl->rr_writer = NULL;
+ cv_broadcast(&rrl->rr_cv);
+ }
+ mutex_exit(&rrl->rr_lock);
+}
+
+/*
+ * If the lock was created with track_all, rrw_held(RW_READER) will return
+ * B_TRUE iff the current thread has the lock for reader. Otherwise it may
+ * return B_TRUE if any thread has the lock for reader.
+ */
+boolean_t
+rrw_held(rrwlock_t *rrl, krw_t rw)
+{
+ boolean_t held;
+
+ mutex_enter(&rrl->rr_lock);
+ if (rw == RW_WRITER) {
+ held = (rrl->rr_writer == curthread);
+ } else {
+ held = (!zfs_refcount_is_zero(&rrl->rr_anon_rcount) ||
+ rrn_find(rrl) != NULL);
+ }
+ mutex_exit(&rrl->rr_lock);
+
+ return (held);
+}
+
+void
+rrw_tsd_destroy(void *arg)
+{
+ rrw_node_t *rn = arg;
+ if (rn != NULL) {
+ panic("thread %p terminating with rrw lock %p held",
+ (void *)curthread, (void *)rn->rn_rrl);
+ }
+}
+
+/*
+ * A reader-mostly lock implementation, tuned for highly parallel read
+ * acquisitions at the cost of pessimizing writes.
+ *
+ * The idea is to split a single busy lock into an array of locks, so that
+ * each reader locks only one of them for read, chosen by the result of a
+ * simple hash function. That proportionally reduces lock congestion. A
+ * writer, in turn, has to sequentially acquire write on all the locks.
+ * That makes write acquisition proportionally slower, but in the places
+ * where it is used (filesystem unmount) performance is not critical.
+ *
+ * All the functions below are direct wrappers around functions above.
+ */
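+
+/*
+ * Sketch of the resulting cost model: rrm_enter_read() takes exactly one
+ * of the underlying rrw locks (chosen by RRM_TD_LOCK() below), while
+ * rrm_enter_write() must take all of them:
+ *
+ *	rrm_enter_read(&rrm, FTAG);	locks locks[RRM_TD_LOCK()]
+ *	rrm_enter_write(&rrm);		locks locks[0 .. RRM_NUM_LOCKS - 1]
+ *
+ * Readers on different threads therefore usually contend on different
+ * locks, at the price of RRM_NUM_LOCKS sequential acquisitions per write.
+ */
+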
+void
+rrm_init(rrmlock_t *rrl, boolean_t track_all)
+{
+ int i;
+
+ for (i = 0; i < RRM_NUM_LOCKS; i++)
+ rrw_init(&rrl->locks[i], track_all);
+}
+
+void
+rrm_destroy(rrmlock_t *rrl)
+{
+ int i;
+
+ for (i = 0; i < RRM_NUM_LOCKS; i++)
+ rrw_destroy(&rrl->locks[i]);
+}
+
+void
+rrm_enter(rrmlock_t *rrl, krw_t rw, void *tag)
+{
+ if (rw == RW_READER)
+ rrm_enter_read(rrl, tag);
+ else
+ rrm_enter_write(rrl);
+}
+
+/*
+ * This maps the current thread to a specific lock. Note that the lock
+ * must be released by the same thread that acquired it. We do this
+ * mapping by taking the thread pointer mod a prime number. We examine
+ * only the low 32 bits of the thread pointer, because 32-bit division
+ * is faster than 64-bit division, and the high 32 bits have little
+ * entropy anyway.
+ */
+#define RRM_TD_LOCK() (((uint32_t)(uintptr_t)(curthread)) % RRM_NUM_LOCKS)
+
+void
+rrm_enter_read(rrmlock_t *rrl, void *tag)
+{
+ rrw_enter_read(&rrl->locks[RRM_TD_LOCK()], tag);
+}
+
+void
+rrm_enter_write(rrmlock_t *rrl)
+{
+ int i;
+
+ for (i = 0; i < RRM_NUM_LOCKS; i++)
+ rrw_enter_write(&rrl->locks[i]);
+}
+
+void
+rrm_exit(rrmlock_t *rrl, void *tag)
+{
+ int i;
+
+ if (rrl->locks[0].rr_writer == curthread) {
+ for (i = 0; i < RRM_NUM_LOCKS; i++)
+ rrw_exit(&rrl->locks[i], tag);
+ } else {
+ rrw_exit(&rrl->locks[RRM_TD_LOCK()], tag);
+ }
+}
+
+boolean_t
+rrm_held(rrmlock_t *rrl, krw_t rw)
+{
+ if (rw == RW_WRITER) {
+ return (rrw_held(&rrl->locks[0], rw));
+ } else {
+ return (rrw_held(&rrl->locks[RRM_TD_LOCK()], rw));
+ }
+}
diff --git a/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sa.c b/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sa.c
new file mode 100644
index 000000000000..dfc9d012b08d
--- /dev/null
+++ b/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sa.c
@@ -0,0 +1,2012 @@
+/*
+ * CDDL HEADER START
+ *
+ * The contents of this file are subject to the terms of the
+ * Common Development and Distribution License (the "License").
+ * You may not use this file except in compliance with the License.
+ *
+ * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
+ * or http://www.opensolaris.org/os/licensing.
+ * See the License for the specific language governing permissions
+ * and limitations under the License.
+ *
+ * When distributing Covered Code, include this CDDL HEADER in each
+ * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
+ * If applicable, add the following below this CDDL HEADER, with the
+ * fields enclosed by brackets "[]" replaced with your own identifying
+ * information: Portions Copyright [yyyy] [name of copyright owner]
+ *
+ * CDDL HEADER END
+ */
+
+/*
+ * Copyright (c) 2010, Oracle and/or its affiliates. All rights reserved.
+ * Portions Copyright 2011 iXsystems, Inc
+ * Copyright (c) 2013, 2017 by Delphix. All rights reserved.
+ * Copyright (c) 2014 Spectra Logic Corporation, All rights reserved.
+ * Copyright (c) 2014 Integros [integros.com]
+ */
+
+#include <sys/zfs_context.h>
+#include <sys/types.h>
+#include <sys/param.h>
+#include <sys/systm.h>
+#include <sys/sysmacros.h>
+#include <sys/dmu.h>
+#include <sys/dmu_impl.h>
+#include <sys/dmu_objset.h>
+#include <sys/dmu_tx.h>
+#include <sys/dbuf.h>
+#include <sys/dnode.h>
+#include <sys/zap.h>
+#include <sys/sa.h>
+#include <sys/sunddi.h>
+#include <sys/sa_impl.h>
+#include <sys/dnode.h>
+#include <sys/errno.h>
+#include <sys/zfs_context.h>
+
+/*
+ * ZFS System attributes:
+ *
+ * A generic mechanism to allow for arbitrary attributes
+ * to be stored in a dnode. The data will be stored in the bonus buffer of
+ * the dnode and if necessary a special "spill" block will be used to handle
+ * overflow situations. The spill block will be sized to fit the data
+ * from 512 - 128K. When a spill block is used the BP (blkptr_t) for the
+ * spill block is stored at the end of the current bonus buffer. Any
+ * attributes that would be in the way of the blkptr_t will be relocated
+ * into the spill block.
+ *
+ * Attribute registration:
+ *
+ * A mapping between attribute "string" names and their actual attribute
+ * numeric values, lengths, and byteswap functions is stored persistently
+ * on a per-dataset basis. The names are only used during registration.
+ * All attributes are known by their unique attribute id value. If an
+ * attribute can have a variable size then the value 0 will be used to
+ * indicate this.
+ *
+ * Attribute Layout:
+ *
+ * Attribute layouts are a way to compactly store multiple attributes, but
+ * without taking the overhead associated with managing each attribute
+ * individually. Since the same set of attributes is typically stored in
+ * the same order, a single table is used to represent that layout. The
+ * ZPL, for example, will usually have only about 10 different layouts
+ * (regular files, device files, symlinks, regular files + scanstamp,
+ * files/dirs with extended attributes, and then each of those minus the
+ * ACL, because it would be kicked out into the spill block).
+ *
+ * Layouts are simply an array of the attributes and their
+ * ordering i.e. [0, 1, 4, 5, 2]
+ *
+ * Each distinct layout is given a unique layout number and that is what is
+ * stored in the header at the beginning of the SA data buffer.
+ *
+ * A layout only covers a single dbuf (bonus or spill). If a set of
+ * attributes is split up between the bonus buffer and a spill buffer then
+ * two different layouts will be used. This allows us to byteswap the
+ * spill without looking at the bonus buffer and keeps the on disk format of
+ * the bonus and spill buffer the same.
+ *
+ * Adding a single attribute will cause the entire set of attributes to
+ * be rewritten and could result in a new layout number being constructed
+ * as part of the rewrite if no such layout exists for the new set of
+ * attributes. The new attribute will be appended to the end of the already
+ * existing attributes.
+ *
+ * Both the attribute registration and attribute layout information are
+ * stored in normal ZAP attributes. There should be a small number of
+ * known layouts and the set of attributes is assumed to typically be quite
+ * small.
+ *
+ * The registered attributes and layout "table" information is maintained
+ * in core and a special "sa_os_t" is attached to the objset_t.
+ *
+ * A special interface is provided to allow for quickly applying
+ * a large set of attributes at once. sa_replace_all_by_template() is
+ * used to set an array of attributes. This is used by the ZPL when
+ * creating a brand new file. The template that is passed into the function
+ * specifies the attribute, size for variable length attributes, location of
+ * data and special "data locator" function if the data isn't in a contiguous
+ * location.
+ *
+ * Byteswap implications:
+ *
+ * Since the SA attributes are not entirely self-describing, we can't do
+ * the normal byteswap processing. The special ZAP layout attribute and
+ * the attribute registration attributes define the byteswap function and
+ * the size of each attribute, unless it is variable sized. The normal
+ * ZFS byteswapping infrastructure assumes you don't need to read any
+ * objects in order to do the necessary byteswapping, whereas SA
+ * attributes can only be properly byteswapped if the dataset is opened
+ * and the layout/attribute ZAP attributes are available. Because of this,
+ * the SA attributes will be byteswapped when they are first accessed by
+ * the SA code that reads the SA data.
+ */
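+
+/*
+ * Illustrative sketch of the bulk interface (SA_ADD_BULK_ATTR and the
+ * sa_bulk_lookup() prototype come from sys/sa.h; the 'sa_table' index
+ * names here are hypothetical, standing in for the user table returned
+ * by sa_setup()):
+ *
+ *	sa_bulk_attr_t bulk[2];
+ *	uint64_t size, links;
+ *	int count = 0, error;
+ *
+ *	SA_ADD_BULK_ATTR(bulk, count, sa_table[ZPL_SIZE], NULL,
+ *	    &size, sizeof (size));
+ *	SA_ADD_BULK_ATTR(bulk, count, sa_table[ZPL_LINKS], NULL,
+ *	    &links, sizeof (links));
+ *	error = sa_bulk_lookup(hdl, bulk, count);
+ *
+ * Each lookup/update ultimately funnels through sa_attr_op() below.
+ */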
+
+typedef void (sa_iterfunc_t)(void *hdr, void *addr, sa_attr_type_t,
+ uint16_t length, int length_idx, boolean_t, void *userp);
+
+static int sa_build_index(sa_handle_t *hdl, sa_buf_type_t buftype);
+static void sa_idx_tab_hold(objset_t *os, sa_idx_tab_t *idx_tab);
+static sa_idx_tab_t *sa_find_idx_tab(objset_t *os, dmu_object_type_t bonustype,
+ sa_hdr_phys_t *hdr);
+static void sa_idx_tab_rele(objset_t *os, void *arg);
+static void sa_copy_data(sa_data_locator_t *func, void *start, void *target,
+ int buflen);
+static int sa_modify_attrs(sa_handle_t *hdl, sa_attr_type_t newattr,
+ sa_data_op_t action, sa_data_locator_t *locator, void *datastart,
+ uint16_t buflen, dmu_tx_t *tx);
+
+arc_byteswap_func_t *sa_bswap_table[] = {
+ byteswap_uint64_array,
+ byteswap_uint32_array,
+ byteswap_uint16_array,
+ byteswap_uint8_array,
+ zfs_acl_byteswap,
+};
+
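+/*
+ * Copy 'l' bytes from 's' to 't'. The common fixed sizes (8 and 16
+ * bytes, i.e. one or two uint64_ts) are copied inline; other lengths
+ * fall back to bcopy(), and a non-NULL data locator function 'f' is
+ * handled by sa_copy_data().
+ */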
+#define SA_COPY_DATA(f, s, t, l) \
+ { \
+ if (f == NULL) { \
+ if (l == 8) { \
+ *(uint64_t *)t = *(uint64_t *)s; \
+ } else if (l == 16) { \
+ *(uint64_t *)t = *(uint64_t *)s; \
+ *(uint64_t *)((uintptr_t)t + 8) = \
+ *(uint64_t *)((uintptr_t)s + 8); \
+ } else { \
+ bcopy(s, t, l); \
+ } \
+ } else \
+ sa_copy_data(f, s, t, l); \
+ }
+
+/*
+ * This table is fixed and cannot be changed. Its purpose is to
+ * allow the SA code to work with both old/new ZPL file systems.
+ * It contains the list of legacy attributes. These attributes aren't
+ * stored in the "attribute" registry zap objects, since older ZPL file systems
+ * won't have the registry. Only objsets of type ZFS_TYPE_FILESYSTEM will
+ * use this static table.
+ */
+sa_attr_reg_t sa_legacy_attrs[] = {
+ {"ZPL_ATIME", sizeof (uint64_t) * 2, SA_UINT64_ARRAY, 0},
+ {"ZPL_MTIME", sizeof (uint64_t) * 2, SA_UINT64_ARRAY, 1},
+ {"ZPL_CTIME", sizeof (uint64_t) * 2, SA_UINT64_ARRAY, 2},
+ {"ZPL_CRTIME", sizeof (uint64_t) * 2, SA_UINT64_ARRAY, 3},
+ {"ZPL_GEN", sizeof (uint64_t), SA_UINT64_ARRAY, 4},
+ {"ZPL_MODE", sizeof (uint64_t), SA_UINT64_ARRAY, 5},
+ {"ZPL_SIZE", sizeof (uint64_t), SA_UINT64_ARRAY, 6},
+ {"ZPL_PARENT", sizeof (uint64_t), SA_UINT64_ARRAY, 7},
+ {"ZPL_LINKS", sizeof (uint64_t), SA_UINT64_ARRAY, 8},
+ {"ZPL_XATTR", sizeof (uint64_t), SA_UINT64_ARRAY, 9},
+ {"ZPL_RDEV", sizeof (uint64_t), SA_UINT64_ARRAY, 10},
+ {"ZPL_FLAGS", sizeof (uint64_t), SA_UINT64_ARRAY, 11},
+ {"ZPL_UID", sizeof (uint64_t), SA_UINT64_ARRAY, 12},
+ {"ZPL_GID", sizeof (uint64_t), SA_UINT64_ARRAY, 13},
+ {"ZPL_PAD", sizeof (uint64_t) * 4, SA_UINT64_ARRAY, 14},
+ {"ZPL_ZNODE_ACL", 88, SA_UINT8_ARRAY, 15},
+};
+
+/*
+ * This is only used for objects of type DMU_OT_ZNODE
+ */
+sa_attr_type_t sa_legacy_zpl_layout[] = {
+ 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15
+};
+
+/*
+ * Special dummy layout used for buffers with no attributes.
+ */
+sa_attr_type_t sa_dummy_zpl_layout[] = { 0 };
+
+static int sa_legacy_attr_count = 16;
+static kmem_cache_t *sa_cache = NULL;
+
+/*ARGSUSED*/
+static int
+sa_cache_constructor(void *buf, void *unused, int kmflag)
+{
+ sa_handle_t *hdl = buf;
+
+ mutex_init(&hdl->sa_lock, NULL, MUTEX_DEFAULT, NULL);
+ return (0);
+}
+
+/*ARGSUSED*/
+static void
+sa_cache_destructor(void *buf, void *unused)
+{
+ sa_handle_t *hdl = buf;
+ mutex_destroy(&hdl->sa_lock);
+}
+
+void
+sa_cache_init(void)
+{
+ sa_cache = kmem_cache_create("sa_cache",
+ sizeof (sa_handle_t), 0, sa_cache_constructor,
+ sa_cache_destructor, NULL, NULL, NULL, 0);
+}
+
+void
+sa_cache_fini(void)
+{
+ if (sa_cache)
+ kmem_cache_destroy(sa_cache);
+}
+
+static int
+layout_num_compare(const void *arg1, const void *arg2)
+{
+ const sa_lot_t *node1 = (const sa_lot_t *)arg1;
+ const sa_lot_t *node2 = (const sa_lot_t *)arg2;
+
+ return (AVL_CMP(node1->lot_num, node2->lot_num));
+}
+
+static int
+layout_hash_compare(const void *arg1, const void *arg2)
+{
+ const sa_lot_t *node1 = (const sa_lot_t *)arg1;
+ const sa_lot_t *node2 = (const sa_lot_t *)arg2;
+
+ int cmp = AVL_CMP(node1->lot_hash, node2->lot_hash);
+ if (likely(cmp))
+ return (cmp);
+
+ return (AVL_CMP(node1->lot_instance, node2->lot_instance));
+}
+
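+/*
+ * Compare a layout's attribute list against 'attrs'. Note that, despite
+ * its name, this returns 0 (memcmp-style) when they match and 1 when
+ * they differ.
+ */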
+boolean_t
+sa_layout_equal(sa_lot_t *tbf, sa_attr_type_t *attrs, int count)
+{
+ int i;
+
+ if (count != tbf->lot_attr_count)
+ return (1);
+
+ for (i = 0; i != count; i++) {
+ if (attrs[i] != tbf->lot_attrs[i])
+ return (1);
+ }
+ return (0);
+}
+
+#define SA_ATTR_HASH(attr) (zfs_crc64_table[(-1ULL ^ attr) & 0xFF])
+
+static uint64_t
+sa_layout_info_hash(sa_attr_type_t *attrs, int attr_count)
+{
+ int i;
+ uint64_t crc = -1ULL;
+
+ for (i = 0; i != attr_count; i++)
+ crc ^= SA_ATTR_HASH(attrs[i]);
+
+ return (crc);
+}
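+
+/*
+ * Note that because the per-attribute hashes are combined with XOR,
+ * sa_layout_info_hash() is order-independent: the distinct layouts
+ * [0, 1, 4, 5, 2] and [0, 1, 2, 4, 5] hash identically. This is why
+ * sa_add_layout_entry() disambiguates collisions with lot_instance and
+ * sa_find_layout() confirms a match with sa_layout_equal() before
+ * reusing a layout.
+ */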
+
+static int
+sa_get_spill(sa_handle_t *hdl)
+{
+ int rc;
+ if (hdl->sa_spill == NULL) {
+ if ((rc = dmu_spill_hold_existing(hdl->sa_bonus, NULL,
+ &hdl->sa_spill)) == 0)
+ VERIFY(0 == sa_build_index(hdl, SA_SPILL));
+ } else {
+ rc = 0;
+ }
+
+ return (rc);
+}
+
+/*
+ * Main attribute lookup/update function; returns 0 on success or a
+ * non-zero errno on failure.
+ *
+ * Operates on a bulk array; the first failure aborts further processing.
+ */
+int
+sa_attr_op(sa_handle_t *hdl, sa_bulk_attr_t *bulk, int count,
+ sa_data_op_t data_op, dmu_tx_t *tx)
+{
+ sa_os_t *sa = hdl->sa_os->os_sa;
+ int i;
+ int error = 0;
+ sa_buf_type_t buftypes;
+
+ buftypes = 0;
+
+ ASSERT(count > 0);
+ for (i = 0; i != count; i++) {
+ ASSERT(bulk[i].sa_attr <= hdl->sa_os->os_sa->sa_num_attrs);
+
+ bulk[i].sa_addr = NULL;
+ /* First check the bonus buffer */
+
+ if (hdl->sa_bonus_tab && TOC_ATTR_PRESENT(
+ hdl->sa_bonus_tab->sa_idx_tab[bulk[i].sa_attr])) {
+ SA_ATTR_INFO(sa, hdl->sa_bonus_tab,
+ SA_GET_HDR(hdl, SA_BONUS),
+ bulk[i].sa_attr, bulk[i], SA_BONUS, hdl);
+ if (tx && !(buftypes & SA_BONUS)) {
+ dmu_buf_will_dirty(hdl->sa_bonus, tx);
+ buftypes |= SA_BONUS;
+ }
+ }
+ if (bulk[i].sa_addr == NULL &&
+ ((error = sa_get_spill(hdl)) == 0)) {
+ if (TOC_ATTR_PRESENT(
+ hdl->sa_spill_tab->sa_idx_tab[bulk[i].sa_attr])) {
+ SA_ATTR_INFO(sa, hdl->sa_spill_tab,
+ SA_GET_HDR(hdl, SA_SPILL),
+ bulk[i].sa_attr, bulk[i], SA_SPILL, hdl);
+ if (tx && !(buftypes & SA_SPILL) &&
+ bulk[i].sa_size == bulk[i].sa_length) {
+ dmu_buf_will_dirty(hdl->sa_spill, tx);
+ buftypes |= SA_SPILL;
+ }
+ }
+ }
+ if (error && error != ENOENT) {
+ return ((error == ECKSUM) ? EIO : error);
+ }
+
+ switch (data_op) {
+ case SA_LOOKUP:
+ if (bulk[i].sa_addr == NULL)
+ return (SET_ERROR(ENOENT));
+ if (bulk[i].sa_data) {
+ SA_COPY_DATA(bulk[i].sa_data_func,
+ bulk[i].sa_addr, bulk[i].sa_data,
+ bulk[i].sa_size);
+ }
+ continue;
+
+ case SA_UPDATE:
+ /* existing rewrite of attr */
+ if (bulk[i].sa_addr &&
+ bulk[i].sa_size == bulk[i].sa_length) {
+ SA_COPY_DATA(bulk[i].sa_data_func,
+ bulk[i].sa_data, bulk[i].sa_addr,
+ bulk[i].sa_length);
+ continue;
+ } else if (bulk[i].sa_addr) { /* attr size change */
+ error = sa_modify_attrs(hdl, bulk[i].sa_attr,
+ SA_REPLACE, bulk[i].sa_data_func,
+ bulk[i].sa_data, bulk[i].sa_length, tx);
+ } else { /* adding new attribute */
+ error = sa_modify_attrs(hdl, bulk[i].sa_attr,
+ SA_ADD, bulk[i].sa_data_func,
+ bulk[i].sa_data, bulk[i].sa_length, tx);
+ }
+ if (error)
+ return (error);
+ break;
+ }
+ }
+ return (error);
+}
+
+static sa_lot_t *
+sa_add_layout_entry(objset_t *os, sa_attr_type_t *attrs, int attr_count,
+ uint64_t lot_num, uint64_t hash, boolean_t zapadd, dmu_tx_t *tx)
+{
+ sa_os_t *sa = os->os_sa;
+ sa_lot_t *tb, *findtb;
+ int i;
+ avl_index_t loc;
+
+ ASSERT(MUTEX_HELD(&sa->sa_lock));
+ tb = kmem_zalloc(sizeof (sa_lot_t), KM_SLEEP);
+ tb->lot_attr_count = attr_count;
+ tb->lot_attrs = kmem_alloc(sizeof (sa_attr_type_t) * attr_count,
+ KM_SLEEP);
+ bcopy(attrs, tb->lot_attrs, sizeof (sa_attr_type_t) * attr_count);
+ tb->lot_num = lot_num;
+ tb->lot_hash = hash;
+ tb->lot_instance = 0;
+
+ if (zapadd) {
+ char attr_name[8];
+
+ if (sa->sa_layout_attr_obj == 0) {
+ sa->sa_layout_attr_obj = zap_create_link(os,
+ DMU_OT_SA_ATTR_LAYOUTS,
+ sa->sa_master_obj, SA_LAYOUTS, tx);
+ }
+
+ (void) snprintf(attr_name, sizeof (attr_name),
+ "%d", (int)lot_num);
+ VERIFY(0 == zap_update(os, os->os_sa->sa_layout_attr_obj,
+ attr_name, 2, attr_count, attrs, tx));
+ }
+
+ list_create(&tb->lot_idx_tab, sizeof (sa_idx_tab_t),
+ offsetof(sa_idx_tab_t, sa_next));
+
+ for (i = 0; i != attr_count; i++) {
+ if (sa->sa_attr_table[tb->lot_attrs[i]].sa_length == 0)
+ tb->lot_var_sizes++;
+ }
+
+ avl_add(&sa->sa_layout_num_tree, tb);
+
+ /* verify we don't have a hash collision */
+ if ((findtb = avl_find(&sa->sa_layout_hash_tree, tb, &loc)) != NULL) {
+ for (; findtb && findtb->lot_hash == hash;
+ findtb = AVL_NEXT(&sa->sa_layout_hash_tree, findtb)) {
+ if (findtb->lot_instance != tb->lot_instance)
+ break;
+ tb->lot_instance++;
+ }
+ }
+ avl_add(&sa->sa_layout_hash_tree, tb);
+ return (tb);
+}
+
+static void
+sa_find_layout(objset_t *os, uint64_t hash, sa_attr_type_t *attrs,
+ int count, dmu_tx_t *tx, sa_lot_t **lot)
+{
+ sa_lot_t *tb, tbsearch;
+ avl_index_t loc;
+ sa_os_t *sa = os->os_sa;
+ boolean_t found = B_FALSE;
+
+ mutex_enter(&sa->sa_lock);
+ tbsearch.lot_hash = hash;
+ tbsearch.lot_instance = 0;
+ tb = avl_find(&sa->sa_layout_hash_tree, &tbsearch, &loc);
+ if (tb) {
+ for (; tb && tb->lot_hash == hash;
+ tb = AVL_NEXT(&sa->sa_layout_hash_tree, tb)) {
+ if (sa_layout_equal(tb, attrs, count) == 0) {
+ found = B_TRUE;
+ break;
+ }
+ }
+ }
+ if (!found) {
+ tb = sa_add_layout_entry(os, attrs, count,
+ avl_numnodes(&sa->sa_layout_num_tree), hash, B_TRUE, tx);
+ }
+ mutex_exit(&sa->sa_lock);
+ *lot = tb;
+}
+
+static int
+sa_resize_spill(sa_handle_t *hdl, uint32_t size, dmu_tx_t *tx)
+{
+ int error;
+ uint32_t blocksize;
+
+ if (size == 0) {
+ blocksize = SPA_MINBLOCKSIZE;
+ } else if (size > SPA_OLD_MAXBLOCKSIZE) {
+ ASSERT(0);
+ return (SET_ERROR(EFBIG));
+ } else {
+ blocksize = P2ROUNDUP_TYPED(size, SPA_MINBLOCKSIZE, uint32_t);
+ }
+
+ error = dbuf_spill_set_blksz(hdl->sa_spill, blocksize, tx);
+ ASSERT(error == 0);
+ return (error);
+}
+
+static void
+sa_copy_data(sa_data_locator_t *func, void *datastart, void *target, int buflen)
+{
+ if (func == NULL) {
+ bcopy(datastart, target, buflen);
+ } else {
+ boolean_t start;
+ int bytes;
+ void *dataptr;
+ void *saptr = target;
+ uint32_t length;
+
+ start = B_TRUE;
+ bytes = 0;
+ while (bytes < buflen) {
+ func(&dataptr, &length, buflen, start, datastart);
+ bcopy(dataptr, saptr, length);
+ saptr = (void *)((caddr_t)saptr + length);
+ bytes += length;
+ start = B_FALSE;
+ }
+ }
+}
+
+/*
+ * Determine several different sizes:
+ * the SA header size (the return value),
+ * the number of bytes to be stored (*total), and,
+ * if a spill would occur, the index into the attribute array at which
+ * it happens (*index).
+ *
+ * The boolean *will_spill is set when spilling is necessary. It is
+ * only set when the buftype is SA_BONUS.
+ */
+static int
+sa_find_sizes(sa_os_t *sa, sa_bulk_attr_t *attr_desc, int attr_count,
+ dmu_buf_t *db, sa_buf_type_t buftype, int full_space, int *index,
+ int *total, boolean_t *will_spill)
+{
+ int var_size = 0;
+ int i;
+ int hdrsize;
+ int extra_hdrsize;
+
+ if (buftype == SA_BONUS && sa->sa_force_spill) {
+ *total = 0;
+ *index = 0;
+ *will_spill = B_TRUE;
+ return (0);
+ }
+
+ *index = -1;
+ *total = 0;
+ *will_spill = B_FALSE;
+
+ extra_hdrsize = 0;
+ hdrsize = (SA_BONUSTYPE_FROM_DB(db) == DMU_OT_ZNODE) ? 0 :
+ sizeof (sa_hdr_phys_t);
+
+ ASSERT(IS_P2ALIGNED(full_space, 8));
+
+ for (i = 0; i != attr_count; i++) {
+ boolean_t is_var_sz;
+
+ *total = P2ROUNDUP(*total, 8);
+ *total += attr_desc[i].sa_length;
+ if (*will_spill)
+ continue;
+
+ is_var_sz = (SA_REGISTERED_LEN(sa, attr_desc[i].sa_attr) == 0);
+ if (is_var_sz) {
+ var_size++;
+ }
+
+ if (is_var_sz && var_size > 1) {
+ /*
+ * Don't worry that the spill block might overflow.
+ * It will be resized if needed in sa_build_layouts().
+ */
+ if (buftype == SA_SPILL ||
+ P2ROUNDUP(hdrsize + sizeof (uint16_t), 8) +
+ *total < full_space) {
+ /*
+ * Account for header space used by array of
+ * optional sizes of variable-length attributes.
+ * Record the extra header size in case this
+ * increase needs to be reversed due to
+ * spill-over.
+ */
+ hdrsize += sizeof (uint16_t);
+ if (*index != -1)
+ extra_hdrsize += sizeof (uint16_t);
+ } else {
+ ASSERT(buftype == SA_BONUS);
+ if (*index == -1)
+ *index = i;
+ *will_spill = B_TRUE;
+ continue;
+ }
+ }
+
+ /*
+ * Find the index of where a spill *could* occur, then
+ * continue to count the remaining attribute space. The
+ * sum is used later for sizing the bonus and spill
+ * buffers.
+ */
+ if (buftype == SA_BONUS && *index == -1 &&
+ (*total + P2ROUNDUP(hdrsize, 8)) >
+ (full_space - sizeof (blkptr_t))) {
+ *index = i;
+ }
+
+ if ((*total + P2ROUNDUP(hdrsize, 8)) > full_space &&
+ buftype == SA_BONUS)
+ *will_spill = B_TRUE;
+ }
+
+ if (*will_spill)
+ hdrsize -= extra_hdrsize;
+
+ hdrsize = P2ROUNDUP(hdrsize, 8);
+ return (hdrsize);
+}
+
+#define BUF_SPACE_NEEDED(total, header) (total + header)
+
+/*
+ * Find layout that corresponds to ordering of attributes
+ * If not found a new layout number is created and added to
+ * persistent layout tables.
+ */
+static int
+sa_build_layouts(sa_handle_t *hdl, sa_bulk_attr_t *attr_desc, int attr_count,
+ dmu_tx_t *tx)
+{
+ sa_os_t *sa = hdl->sa_os->os_sa;
+ uint64_t hash;
+ sa_buf_type_t buftype;
+ sa_hdr_phys_t *sahdr;
+ void *data_start;
+ int buf_space;
+ sa_attr_type_t *attrs, *attrs_start;
+ int i, lot_count;
+ int dnodesize;
+ int hdrsize;
+ int spillhdrsize = 0;
+ int used;
+ dmu_object_type_t bonustype;
+ sa_lot_t *lot;
+ int len_idx;
+ int spill_used;
+ int bonuslen;
+ boolean_t spilling;
+
+ dmu_buf_will_dirty(hdl->sa_bonus, tx);
+ bonustype = SA_BONUSTYPE_FROM_DB(hdl->sa_bonus);
+ dmu_object_dnsize_from_db(hdl->sa_bonus, &dnodesize);
+ bonuslen = DN_BONUS_SIZE(dnodesize);
+
+ /* first determine bonus header size and sum of all attributes */
+ hdrsize = sa_find_sizes(sa, attr_desc, attr_count, hdl->sa_bonus,
+ SA_BONUS, bonuslen, &i, &used, &spilling);
+
+ if (used > SPA_OLD_MAXBLOCKSIZE)
+ return (SET_ERROR(EFBIG));
+
+ VERIFY(0 == dmu_set_bonus(hdl->sa_bonus, spilling ?
+ MIN(bonuslen - sizeof (blkptr_t), used + hdrsize) :
+ used + hdrsize, tx));
+
+ ASSERT((bonustype == DMU_OT_ZNODE && spilling == 0) ||
+ bonustype == DMU_OT_SA);
+
+ /* setup and size spill buffer when needed */
+ if (spilling) {
+ boolean_t dummy;
+
+ if (hdl->sa_spill == NULL) {
+ VERIFY(dmu_spill_hold_by_bonus(hdl->sa_bonus, NULL,
+ &hdl->sa_spill) == 0);
+ }
+ dmu_buf_will_dirty(hdl->sa_spill, tx);
+
+ spillhdrsize = sa_find_sizes(sa, &attr_desc[i],
+ attr_count - i, hdl->sa_spill, SA_SPILL,
+ hdl->sa_spill->db_size, &i, &spill_used, &dummy);
+
+ if (spill_used > SPA_OLD_MAXBLOCKSIZE)
+ return (SET_ERROR(EFBIG));
+
+ buf_space = hdl->sa_spill->db_size - spillhdrsize;
+ if (BUF_SPACE_NEEDED(spill_used, spillhdrsize) >
+ hdl->sa_spill->db_size)
+ VERIFY(0 == sa_resize_spill(hdl,
+ BUF_SPACE_NEEDED(spill_used, spillhdrsize), tx));
+ }
+
+ /* setup starting pointers to lay down data */
+ data_start = (void *)((uintptr_t)hdl->sa_bonus->db_data + hdrsize);
+ sahdr = (sa_hdr_phys_t *)hdl->sa_bonus->db_data;
+ buftype = SA_BONUS;
+
+ if (spilling)
+ buf_space = (sa->sa_force_spill) ?
+ 0 : SA_BLKPTR_SPACE - hdrsize;
+ else
+ buf_space = hdl->sa_bonus->db_size - hdrsize;
+
+ attrs_start = attrs = kmem_alloc(sizeof (sa_attr_type_t) * attr_count,
+ KM_SLEEP);
+ lot_count = 0;
+
+ for (i = 0, len_idx = 0, hash = -1ULL; i != attr_count; i++) {
+ uint16_t length;
+
+ ASSERT(IS_P2ALIGNED(data_start, 8));
+ ASSERT(IS_P2ALIGNED(buf_space, 8));
+ attrs[i] = attr_desc[i].sa_attr;
+ length = SA_REGISTERED_LEN(sa, attrs[i]);
+ if (length == 0)
+ length = attr_desc[i].sa_length;
+ else
+ VERIFY(length == attr_desc[i].sa_length);
+
+ if (buf_space < length) { /* switch to spill buffer */
+ VERIFY(spilling);
+ VERIFY(bonustype == DMU_OT_SA);
+ if (buftype == SA_BONUS && !sa->sa_force_spill) {
+ sa_find_layout(hdl->sa_os, hash, attrs_start,
+ lot_count, tx, &lot);
+ SA_SET_HDR(sahdr, lot->lot_num, hdrsize);
+ }
+
+ buftype = SA_SPILL;
+ hash = -1ULL;
+ len_idx = 0;
+
+ sahdr = (sa_hdr_phys_t *)hdl->sa_spill->db_data;
+ sahdr->sa_magic = SA_MAGIC;
+ data_start = (void *)((uintptr_t)sahdr +
+ spillhdrsize);
+ attrs_start = &attrs[i];
+ buf_space = hdl->sa_spill->db_size - spillhdrsize;
+ lot_count = 0;
+ }
+ hash ^= SA_ATTR_HASH(attrs[i]);
+ attr_desc[i].sa_addr = data_start;
+ attr_desc[i].sa_size = length;
+ SA_COPY_DATA(attr_desc[i].sa_data_func, attr_desc[i].sa_data,
+ data_start, length);
+ if (sa->sa_attr_table[attrs[i]].sa_length == 0) {
+ sahdr->sa_lengths[len_idx++] = length;
+ }
+ VERIFY((uintptr_t)data_start % 8 == 0);
+ data_start = (void *)P2ROUNDUP(((uintptr_t)data_start +
+ length), 8);
+ buf_space -= P2ROUNDUP(length, 8);
+ lot_count++;
+ }
+
+ sa_find_layout(hdl->sa_os, hash, attrs_start, lot_count, tx, &lot);
+
+ /*
+ * Verify that old znodes always have layout number 0.
+ * Must be DMU_OT_SA for arbitrary layouts
+ */
+ VERIFY((bonustype == DMU_OT_ZNODE && lot->lot_num == 0) ||
+ (bonustype == DMU_OT_SA && lot->lot_num > 1));
+
+ if (bonustype == DMU_OT_SA) {
+ SA_SET_HDR(sahdr, lot->lot_num,
+ buftype == SA_BONUS ? hdrsize : spillhdrsize);
+ }
+
+ kmem_free(attrs, sizeof (sa_attr_type_t) * attr_count);
+ if (hdl->sa_bonus_tab) {
+ sa_idx_tab_rele(hdl->sa_os, hdl->sa_bonus_tab);
+ hdl->sa_bonus_tab = NULL;
+ }
+ if (!sa->sa_force_spill)
+ VERIFY(0 == sa_build_index(hdl, SA_BONUS));
+ if (hdl->sa_spill) {
+ sa_idx_tab_rele(hdl->sa_os, hdl->sa_spill_tab);
+ if (!spilling) {
+ /*
+ * remove spill block that is no longer needed.
+ */
+ dmu_buf_rele(hdl->sa_spill, NULL);
+ hdl->sa_spill = NULL;
+ hdl->sa_spill_tab = NULL;
+ VERIFY(0 == dmu_rm_spill(hdl->sa_os,
+ sa_handle_object(hdl), tx));
+ } else {
+ VERIFY(0 == sa_build_index(hdl, SA_SPILL));
+ }
+ }
+
+ return (0);
+}
+
+static void
+sa_free_attr_table(sa_os_t *sa)
+{
+ int i;
+
+ if (sa->sa_attr_table == NULL)
+ return;
+
+ for (i = 0; i != sa->sa_num_attrs; i++) {
+ if (sa->sa_attr_table[i].sa_name)
+ kmem_free(sa->sa_attr_table[i].sa_name,
+ strlen(sa->sa_attr_table[i].sa_name) + 1);
+ }
+
+ kmem_free(sa->sa_attr_table,
+ sizeof (sa_attr_table_t) * sa->sa_num_attrs);
+
+ sa->sa_attr_table = NULL;
+}
+
+static int
+sa_attr_table_setup(objset_t *os, sa_attr_reg_t *reg_attrs, int count)
+{
+ sa_os_t *sa = os->os_sa;
+ uint64_t sa_attr_count = 0;
+ uint64_t sa_reg_count = 0;
+ int error = 0;
+ uint64_t attr_value;
+ sa_attr_table_t *tb;
+ zap_cursor_t zc;
+ zap_attribute_t za;
+ int registered_count = 0;
+ int i;
+ dmu_objset_type_t ostype = dmu_objset_type(os);
+
+ sa->sa_user_table =
+ kmem_zalloc(count * sizeof (sa_attr_type_t), KM_SLEEP);
+ sa->sa_user_table_sz = count * sizeof (sa_attr_type_t);
+
+ if (sa->sa_reg_attr_obj != 0) {
+ error = zap_count(os, sa->sa_reg_attr_obj,
+ &sa_attr_count);
+
+ /*
+ * Make sure we retrieved a count and that it isn't zero
+ */
+ if (error != 0 || sa_attr_count == 0) {
+ if (error == 0)
+ error = SET_ERROR(EINVAL);
+ goto bail;
+ }
+ sa_reg_count = sa_attr_count;
+ }
+
+ if (ostype == DMU_OST_ZFS && sa_attr_count == 0)
+ sa_attr_count += sa_legacy_attr_count;
+
+ /* Allocate attribute numbers for attributes that aren't registered */
+ for (i = 0; i != count; i++) {
+ boolean_t found = B_FALSE;
+ int j;
+
+ if (ostype == DMU_OST_ZFS) {
+ for (j = 0; j != sa_legacy_attr_count; j++) {
+ if (strcmp(reg_attrs[i].sa_name,
+ sa_legacy_attrs[j].sa_name) == 0) {
+ sa->sa_user_table[i] =
+ sa_legacy_attrs[j].sa_attr;
+ found = B_TRUE;
+ }
+ }
+ }
+ if (found)
+ continue;
+
+ if (sa->sa_reg_attr_obj)
+ error = zap_lookup(os, sa->sa_reg_attr_obj,
+ reg_attrs[i].sa_name, 8, 1, &attr_value);
+ else
+ error = SET_ERROR(ENOENT);
+ switch (error) {
+ case ENOENT:
+ sa->sa_user_table[i] = (sa_attr_type_t)sa_attr_count;
+ sa_attr_count++;
+ break;
+ case 0:
+ sa->sa_user_table[i] = ATTR_NUM(attr_value);
+ break;
+ default:
+ goto bail;
+ }
+ }
+
+ sa->sa_num_attrs = sa_attr_count;
+ tb = sa->sa_attr_table =
+ kmem_zalloc(sizeof (sa_attr_table_t) * sa_attr_count, KM_SLEEP);
+
+ /*
+ * Attribute table is constructed from requested attribute list,
+ * previously foreign registered attributes, and also the legacy
+ * ZPL set of attributes.
+ */
+
+ if (sa->sa_reg_attr_obj) {
+ for (zap_cursor_init(&zc, os, sa->sa_reg_attr_obj);
+ (error = zap_cursor_retrieve(&zc, &za)) == 0;
+ zap_cursor_advance(&zc)) {
+ uint64_t value;
+ value = za.za_first_integer;
+
+ registered_count++;
+ tb[ATTR_NUM(value)].sa_attr = ATTR_NUM(value);
+ tb[ATTR_NUM(value)].sa_length = ATTR_LENGTH(value);
+ tb[ATTR_NUM(value)].sa_byteswap = ATTR_BSWAP(value);
+ tb[ATTR_NUM(value)].sa_registered = B_TRUE;
+
+ if (tb[ATTR_NUM(value)].sa_name) {
+ continue;
+ }
+ tb[ATTR_NUM(value)].sa_name =
+ kmem_zalloc(strlen(za.za_name) + 1, KM_SLEEP);
+ (void) strlcpy(tb[ATTR_NUM(value)].sa_name, za.za_name,
+ strlen(za.za_name) + 1);
+ }
+ zap_cursor_fini(&zc);
+ /*
+ * Make sure we processed the correct number of registered
+ * attributes
+ */
+ if (registered_count != sa_reg_count) {
+ ASSERT(error != 0);
+ goto bail;
+ }
+
+ }
+
+ if (ostype == DMU_OST_ZFS) {
+ for (i = 0; i != sa_legacy_attr_count; i++) {
+ if (tb[i].sa_name)
+ continue;
+ tb[i].sa_attr = sa_legacy_attrs[i].sa_attr;
+ tb[i].sa_length = sa_legacy_attrs[i].sa_length;
+ tb[i].sa_byteswap = sa_legacy_attrs[i].sa_byteswap;
+ tb[i].sa_registered = B_FALSE;
+ tb[i].sa_name =
+ kmem_zalloc(strlen(sa_legacy_attrs[i].sa_name) + 1,
+ KM_SLEEP);
+ (void) strlcpy(tb[i].sa_name,
+ sa_legacy_attrs[i].sa_name,
+ strlen(sa_legacy_attrs[i].sa_name) + 1);
+ }
+ }
+
+ for (i = 0; i != count; i++) {
+ sa_attr_type_t attr_id;
+
+ attr_id = sa->sa_user_table[i];
+ if (tb[attr_id].sa_name)
+ continue;
+
+ tb[attr_id].sa_length = reg_attrs[i].sa_length;
+ tb[attr_id].sa_byteswap = reg_attrs[i].sa_byteswap;
+ tb[attr_id].sa_attr = attr_id;
+ tb[attr_id].sa_name =
+ kmem_zalloc(strlen(reg_attrs[i].sa_name) + 1, KM_SLEEP);
+ (void) strlcpy(tb[attr_id].sa_name, reg_attrs[i].sa_name,
+ strlen(reg_attrs[i].sa_name) + 1);
+ }
+
+ sa->sa_need_attr_registration =
+ (sa_attr_count != registered_count);
+
+ return (0);
+bail:
+ kmem_free(sa->sa_user_table, count * sizeof (sa_attr_type_t));
+ sa->sa_user_table = NULL;
+ sa_free_attr_table(sa);
+ return ((error != 0) ? error : EINVAL);
+}
+
+int
+sa_setup(objset_t *os, uint64_t sa_obj, sa_attr_reg_t *reg_attrs, int count,
+ sa_attr_type_t **user_table)
+{
+ zap_cursor_t zc;
+ zap_attribute_t za;
+ sa_os_t *sa;
+ dmu_objset_type_t ostype = dmu_objset_type(os);
+ sa_attr_type_t *tb;
+ int error;
+
+ mutex_enter(&os->os_user_ptr_lock);
+ if (os->os_sa) {
+ mutex_enter(&os->os_sa->sa_lock);
+ mutex_exit(&os->os_user_ptr_lock);
+ tb = os->os_sa->sa_user_table;
+ mutex_exit(&os->os_sa->sa_lock);
+ *user_table = tb;
+ return (0);
+ }
+
+ sa = kmem_zalloc(sizeof (sa_os_t), KM_SLEEP);
+ mutex_init(&sa->sa_lock, NULL, MUTEX_DEFAULT, NULL);
+ sa->sa_master_obj = sa_obj;
+
+ os->os_sa = sa;
+ mutex_enter(&sa->sa_lock);
+ mutex_exit(&os->os_user_ptr_lock);
+ avl_create(&sa->sa_layout_num_tree, layout_num_compare,
+ sizeof (sa_lot_t), offsetof(sa_lot_t, lot_num_node));
+ avl_create(&sa->sa_layout_hash_tree, layout_hash_compare,
+ sizeof (sa_lot_t), offsetof(sa_lot_t, lot_hash_node));
+
+ if (sa_obj) {
+ error = zap_lookup(os, sa_obj, SA_LAYOUTS,
+ 8, 1, &sa->sa_layout_attr_obj);
+ if (error != 0 && error != ENOENT)
+ goto fail;
+ error = zap_lookup(os, sa_obj, SA_REGISTRY,
+ 8, 1, &sa->sa_reg_attr_obj);
+ if (error != 0 && error != ENOENT)
+ goto fail;
+ }
+
+ if ((error = sa_attr_table_setup(os, reg_attrs, count)) != 0)
+ goto fail;
+
+ if (sa->sa_layout_attr_obj != 0) {
+ uint64_t layout_count;
+
+ error = zap_count(os, sa->sa_layout_attr_obj,
+ &layout_count);
+
+ /*
+ * Layout number count should be > 0
+ */
+ if (error != 0 || layout_count == 0) {
+ if (error == 0)
+ error = SET_ERROR(EINVAL);
+ goto fail;
+ }
+
+ for (zap_cursor_init(&zc, os, sa->sa_layout_attr_obj);
+ (error = zap_cursor_retrieve(&zc, &za)) == 0;
+ zap_cursor_advance(&zc)) {
+ sa_attr_type_t *lot_attrs;
+ uint64_t lot_num;
+
+ lot_attrs = kmem_zalloc(sizeof (sa_attr_type_t) *
+ za.za_num_integers, KM_SLEEP);
+
+ if ((error = (zap_lookup(os, sa->sa_layout_attr_obj,
+ za.za_name, 2, za.za_num_integers,
+ lot_attrs))) != 0) {
+ kmem_free(lot_attrs, sizeof (sa_attr_type_t) *
+ za.za_num_integers);
+ break;
+ }
+ VERIFY(ddi_strtoull(za.za_name, NULL, 10,
+ (unsigned long long *)&lot_num) == 0);
+
+ (void) sa_add_layout_entry(os, lot_attrs,
+ za.za_num_integers, lot_num,
+ sa_layout_info_hash(lot_attrs,
+ za.za_num_integers), B_FALSE, NULL);
+ kmem_free(lot_attrs, sizeof (sa_attr_type_t) *
+ za.za_num_integers);
+ }
+ zap_cursor_fini(&zc);
+
+ /*
+ * Make sure layout count matches number of entries added
+ * to AVL tree
+ */
+ if (avl_numnodes(&sa->sa_layout_num_tree) != layout_count) {
+ ASSERT(error != 0);
+ goto fail;
+ }
+ }
+
+ /* Add special layout number for old ZNODES */
+ if (ostype == DMU_OST_ZFS) {
+ (void) sa_add_layout_entry(os, sa_legacy_zpl_layout,
+ sa_legacy_attr_count, 0,
+ sa_layout_info_hash(sa_legacy_zpl_layout,
+ sa_legacy_attr_count), B_FALSE, NULL);
+
+ (void) sa_add_layout_entry(os, sa_dummy_zpl_layout, 0, 1,
+ 0, B_FALSE, NULL);
+ }
+ *user_table = os->os_sa->sa_user_table;
+ mutex_exit(&sa->sa_lock);
+ return (0);
+fail:
+ os->os_sa = NULL;
+ sa_free_attr_table(sa);
+ if (sa->sa_user_table)
+ kmem_free(sa->sa_user_table, sa->sa_user_table_sz);
+ mutex_exit(&sa->sa_lock);
+ avl_destroy(&sa->sa_layout_hash_tree);
+ avl_destroy(&sa->sa_layout_num_tree);
+ mutex_destroy(&sa->sa_lock);
+ kmem_free(sa, sizeof (sa_os_t));
+ return ((error == ECKSUM) ? EIO : error);
+}
+
+void
+sa_tear_down(objset_t *os)
+{
+ sa_os_t *sa = os->os_sa;
+ sa_lot_t *layout;
+ void *cookie;
+
+ kmem_free(sa->sa_user_table, sa->sa_user_table_sz);
+
+ /* Free up attr table */
+
+ sa_free_attr_table(sa);
+
+ cookie = NULL;
+ while ((layout = avl_destroy_nodes(&sa->sa_layout_hash_tree,
+ &cookie)) != NULL) {
+ sa_idx_tab_t *tab;
+ while ((tab = list_head(&layout->lot_idx_tab)) != NULL) {
+ ASSERT(zfs_refcount_count(&tab->sa_refcount));
+ sa_idx_tab_rele(os, tab);
+ }
+ }
+
+ cookie = NULL;
+ while ((layout = avl_destroy_nodes(&sa->sa_layout_num_tree,
+ &cookie)) != NULL) {
+ kmem_free(layout->lot_attrs,
+ sizeof (sa_attr_type_t) * layout->lot_attr_count);
+ kmem_free(layout, sizeof (sa_lot_t));
+ }
+
+ avl_destroy(&sa->sa_layout_hash_tree);
+ avl_destroy(&sa->sa_layout_num_tree);
+ mutex_destroy(&sa->sa_lock);
+
+ kmem_free(sa, sizeof (sa_os_t));
+ os->os_sa = NULL;
+}
+
+void
+sa_build_idx_tab(void *hdr, void *attr_addr, sa_attr_type_t attr,
+ uint16_t length, int length_idx, boolean_t var_length, void *userp)
+{
+ sa_idx_tab_t *idx_tab = userp;
+
+ if (var_length) {
+ ASSERT(idx_tab->sa_variable_lengths);
+ idx_tab->sa_variable_lengths[length_idx] = length;
+ }
+ TOC_ATTR_ENCODE(idx_tab->sa_idx_tab[attr], length_idx,
+ (uint32_t)((uintptr_t)attr_addr - (uintptr_t)hdr));
+}
+
+static void
+sa_attr_iter(objset_t *os, sa_hdr_phys_t *hdr, dmu_object_type_t type,
+ sa_iterfunc_t func, sa_lot_t *tab, void *userp)
+{
+ void *data_start;
+ sa_lot_t *tb = tab;
+ sa_lot_t search;
+ avl_index_t loc;
+ sa_os_t *sa = os->os_sa;
+ int i;
+ uint16_t *length_start = NULL;
+ uint8_t length_idx = 0;
+
+ if (tab == NULL) {
+ search.lot_num = SA_LAYOUT_NUM(hdr, type);
+ tb = avl_find(&sa->sa_layout_num_tree, &search, &loc);
+ ASSERT(tb);
+ }
+
+ if (IS_SA_BONUSTYPE(type)) {
+ data_start = (void *)P2ROUNDUP(((uintptr_t)hdr +
+ offsetof(sa_hdr_phys_t, sa_lengths) +
+ (sizeof (uint16_t) * tb->lot_var_sizes)), 8);
+ length_start = hdr->sa_lengths;
+ } else {
+ data_start = hdr;
+ }
+
+ for (i = 0; i != tb->lot_attr_count; i++) {
+ int attr_length, reg_length;
+ uint8_t idx_len;
+
+ reg_length = sa->sa_attr_table[tb->lot_attrs[i]].sa_length;
+ if (reg_length) {
+ attr_length = reg_length;
+ idx_len = 0;
+ } else {
+ attr_length = length_start[length_idx];
+ idx_len = length_idx++;
+ }
+
+ func(hdr, data_start, tb->lot_attrs[i], attr_length,
+ idx_len, reg_length == 0 ? B_TRUE : B_FALSE, userp);
+
+ data_start = (void *)P2ROUNDUP(((uintptr_t)data_start +
+ attr_length), 8);
+ }
+}
+
+/*ARGSUSED*/
+void
+sa_byteswap_cb(void *hdr, void *attr_addr, sa_attr_type_t attr,
+ uint16_t length, int length_idx, boolean_t variable_length, void *userp)
+{
+ sa_handle_t *hdl = userp;
+ sa_os_t *sa = hdl->sa_os->os_sa;
+
+ sa_bswap_table[sa->sa_attr_table[attr].sa_byteswap](attr_addr, length);
+}
+
+void
+sa_byteswap(sa_handle_t *hdl, sa_buf_type_t buftype)
+{
+ sa_hdr_phys_t *sa_hdr_phys = SA_GET_HDR(hdl, buftype);
+ dmu_buf_impl_t *db;
+ sa_os_t *sa = hdl->sa_os->os_sa;
+ int num_lengths = 1;
+ int i;
+
+ ASSERT(MUTEX_HELD(&sa->sa_lock));
+ if (sa_hdr_phys->sa_magic == SA_MAGIC)
+ return;
+
+ db = SA_GET_DB(hdl, buftype);
+
+ if (buftype == SA_SPILL) {
+ arc_release(db->db_buf, NULL);
+ arc_buf_thaw(db->db_buf);
+ }
+
+ sa_hdr_phys->sa_magic = BSWAP_32(sa_hdr_phys->sa_magic);
+ sa_hdr_phys->sa_layout_info = BSWAP_16(sa_hdr_phys->sa_layout_info);
+
+ /*
+ * Determine the number of variable lengths in the header.
+ * The standard 8-byte header has one for free, and a
+ * 16-byte header would have 4 + 1.
+ */
+ if (SA_HDR_SIZE(sa_hdr_phys) > 8)
+ num_lengths += (SA_HDR_SIZE(sa_hdr_phys) - 8) >> 1;
+ for (i = 0; i != num_lengths; i++)
+ sa_hdr_phys->sa_lengths[i] =
+ BSWAP_16(sa_hdr_phys->sa_lengths[i]);
+
+ sa_attr_iter(hdl->sa_os, sa_hdr_phys, DMU_OT_SA,
+ sa_byteswap_cb, NULL, hdl);
+
+ if (buftype == SA_SPILL)
+ arc_buf_freeze(((dmu_buf_impl_t *)hdl->sa_spill)->db_buf);
+}
+
+static int
+sa_build_index(sa_handle_t *hdl, sa_buf_type_t buftype)
+{
+ sa_hdr_phys_t *sa_hdr_phys;
+ dmu_buf_impl_t *db = SA_GET_DB(hdl, buftype);
+ dmu_object_type_t bonustype = SA_BONUSTYPE_FROM_DB(db);
+ sa_os_t *sa = hdl->sa_os->os_sa;
+ sa_idx_tab_t *idx_tab;
+
+ sa_hdr_phys = SA_GET_HDR(hdl, buftype);
+
+ mutex_enter(&sa->sa_lock);
+
+ /* Do we need to byteswap? */
+
+ /* only check if not old znode */
+ if (IS_SA_BONUSTYPE(bonustype) && sa_hdr_phys->sa_magic != SA_MAGIC &&
+ sa_hdr_phys->sa_magic != 0) {
+ VERIFY(BSWAP_32(sa_hdr_phys->sa_magic) == SA_MAGIC);
+ sa_byteswap(hdl, buftype);
+ }
+
+ idx_tab = sa_find_idx_tab(hdl->sa_os, bonustype, sa_hdr_phys);
+
+ if (buftype == SA_BONUS)
+ hdl->sa_bonus_tab = idx_tab;
+ else
+ hdl->sa_spill_tab = idx_tab;
+
+ mutex_exit(&sa->sa_lock);
+ return (0);
+}
+
+/*ARGSUSED*/
+static void
+sa_evict_sync(void *dbu)
+{
+ panic("evicting sa dbuf\n");
+}
+
+static void
+sa_idx_tab_rele(objset_t *os, void *arg)
+{
+ sa_os_t *sa = os->os_sa;
+ sa_idx_tab_t *idx_tab = arg;
+
+ if (idx_tab == NULL)
+ return;
+
+ mutex_enter(&sa->sa_lock);
+ if (zfs_refcount_remove(&idx_tab->sa_refcount, NULL) == 0) {
+ list_remove(&idx_tab->sa_layout->lot_idx_tab, idx_tab);
+ if (idx_tab->sa_variable_lengths)
+ kmem_free(idx_tab->sa_variable_lengths,
+ sizeof (uint16_t) *
+ idx_tab->sa_layout->lot_var_sizes);
+ zfs_refcount_destroy(&idx_tab->sa_refcount);
+ kmem_free(idx_tab->sa_idx_tab,
+ sizeof (uint32_t) * sa->sa_num_attrs);
+ kmem_free(idx_tab, sizeof (sa_idx_tab_t));
+ }
+ mutex_exit(&sa->sa_lock);
+}
+
+static void
+sa_idx_tab_hold(objset_t *os, sa_idx_tab_t *idx_tab)
+{
+ sa_os_t *sa = os->os_sa;
+
+ ASSERT(MUTEX_HELD(&sa->sa_lock));
+ (void) zfs_refcount_add(&idx_tab->sa_refcount, NULL);
+}
+
+void
+sa_handle_destroy(sa_handle_t *hdl)
+{
+ dmu_buf_t *db = hdl->sa_bonus;
+
+ mutex_enter(&hdl->sa_lock);
+ (void) dmu_buf_remove_user(db, &hdl->sa_dbu);
+
+ if (hdl->sa_bonus_tab)
+ sa_idx_tab_rele(hdl->sa_os, hdl->sa_bonus_tab);
+
+ if (hdl->sa_spill_tab)
+ sa_idx_tab_rele(hdl->sa_os, hdl->sa_spill_tab);
+
+ dmu_buf_rele(hdl->sa_bonus, NULL);
+
+ if (hdl->sa_spill)
+ dmu_buf_rele((dmu_buf_t *)hdl->sa_spill, NULL);
+ mutex_exit(&hdl->sa_lock);
+
+ kmem_cache_free(sa_cache, hdl);
+}
+
+int
+sa_handle_get_from_db(objset_t *os, dmu_buf_t *db, void *userp,
+ sa_handle_type_t hdl_type, sa_handle_t **handlepp)
+{
+ int error = 0;
+ dmu_object_info_t doi;
+ sa_handle_t *handle = NULL;
+
+#ifdef ZFS_DEBUG
+ dmu_object_info_from_db(db, &doi);
+ ASSERT(doi.doi_bonus_type == DMU_OT_SA ||
+ doi.doi_bonus_type == DMU_OT_ZNODE);
+#endif
+ /*
+ * Find the handle if it exists; otherwise create a new one
+ * and initialize it.
+ */
+
+ if (hdl_type == SA_HDL_SHARED)
+ handle = dmu_buf_get_user(db);
+
+ if (handle == NULL) {
+ sa_handle_t *winner = NULL;
+
+ handle = kmem_cache_alloc(sa_cache, KM_SLEEP);
+ handle->sa_dbu.dbu_evict_func_sync = NULL;
+ handle->sa_dbu.dbu_evict_func_async = NULL;
+ handle->sa_userp = userp;
+ handle->sa_bonus = db;
+ handle->sa_os = os;
+ handle->sa_spill = NULL;
+ handle->sa_bonus_tab = NULL;
+ handle->sa_spill_tab = NULL;
+
+ error = sa_build_index(handle, SA_BONUS);
+
+ if (hdl_type == SA_HDL_SHARED) {
+ dmu_buf_init_user(&handle->sa_dbu, sa_evict_sync, NULL,
+ NULL);
+ winner = dmu_buf_set_user_ie(db, &handle->sa_dbu);
+ }
+
+ if (winner != NULL) {
+ kmem_cache_free(sa_cache, handle);
+ handle = winner;
+ }
+ }
+ *handlepp = handle;
+
+ return (error);
+}
+
+int
+sa_handle_get(objset_t *objset, uint64_t objid, void *userp,
+ sa_handle_type_t hdl_type, sa_handle_t **handlepp)
+{
+ dmu_buf_t *db;
+ int error;
+
+ if ((error = dmu_bonus_hold(objset, objid, NULL, &db)) != 0)
+ return (error);
+
+ return (sa_handle_get_from_db(objset, db, userp, hdl_type,
+ handlepp));
+}
+
+int
+sa_buf_hold(objset_t *objset, uint64_t obj_num, void *tag, dmu_buf_t **db)
+{
+ return (dmu_bonus_hold(objset, obj_num, tag, db));
+}
+
+void
+sa_buf_rele(dmu_buf_t *db, void *tag)
+{
+ dmu_buf_rele(db, tag);
+}
+
+int
+sa_lookup_impl(sa_handle_t *hdl, sa_bulk_attr_t *bulk, int count)
+{
+ ASSERT(hdl);
+ ASSERT(MUTEX_HELD(&hdl->sa_lock));
+ return (sa_attr_op(hdl, bulk, count, SA_LOOKUP, NULL));
+}
+
+int
+sa_lookup(sa_handle_t *hdl, sa_attr_type_t attr, void *buf, uint32_t buflen)
+{
+ int error;
+ sa_bulk_attr_t bulk;
+
+ bulk.sa_attr = attr;
+ bulk.sa_data = buf;
+ bulk.sa_length = buflen;
+ bulk.sa_data_func = NULL;
+
+ ASSERT(hdl);
+ mutex_enter(&hdl->sa_lock);
+ error = sa_lookup_impl(hdl, &bulk, 1);
+ mutex_exit(&hdl->sa_lock);
+ return (error);
+}
+
+#ifdef _KERNEL
+int
+sa_lookup_uio(sa_handle_t *hdl, sa_attr_type_t attr, uio_t *uio)
+{
+ int error;
+ sa_bulk_attr_t bulk;
+
+ bulk.sa_data = NULL;
+ bulk.sa_attr = attr;
+ bulk.sa_data_func = NULL;
+
+ ASSERT(hdl);
+
+ mutex_enter(&hdl->sa_lock);
+ if ((error = sa_attr_op(hdl, &bulk, 1, SA_LOOKUP, NULL)) == 0) {
+ error = uiomove((void *)bulk.sa_addr, MIN(bulk.sa_size,
+ uio->uio_resid), UIO_READ, uio);
+ }
+ mutex_exit(&hdl->sa_lock);
+ return (error);
+}
+#endif
+
+static sa_idx_tab_t *
+sa_find_idx_tab(objset_t *os, dmu_object_type_t bonustype, sa_hdr_phys_t *hdr)
+{
+ sa_idx_tab_t *idx_tab;
+ sa_os_t *sa = os->os_sa;
+ sa_lot_t *tb, search;
+ avl_index_t loc;
+
+ /*
+	 * Determine the layout number. If this is an SA node and the header
+	 * is 0, then force the index table to the dummy "1" empty layout.
+	 *
+	 * The layout number would only be zero for a newly created file
+	 * that has not added any attributes yet, or with crypto enabled,
+	 * which doesn't write any attributes to the bonus buffer.
+ */
+
+ search.lot_num = SA_LAYOUT_NUM(hdr, bonustype);
+
+ tb = avl_find(&sa->sa_layout_num_tree, &search, &loc);
+
+ /* Verify header size is consistent with layout information */
+ ASSERT(tb);
+	ASSERT((IS_SA_BONUSTYPE(bonustype) &&
+	    SA_HDR_SIZE_MATCH_LAYOUT(hdr, tb)) ||
+	    !IS_SA_BONUSTYPE(bonustype) ||
+	    (IS_SA_BONUSTYPE(bonustype) && hdr->sa_layout_info == 0));
+
+ /*
+	 * See if any of the already existing TOC entries can be reused.
+ */
+
+ for (idx_tab = list_head(&tb->lot_idx_tab); idx_tab;
+ idx_tab = list_next(&tb->lot_idx_tab, idx_tab)) {
+ boolean_t valid_idx = B_TRUE;
+ int i;
+
+ if (tb->lot_var_sizes != 0 &&
+ idx_tab->sa_variable_lengths != NULL) {
+ for (i = 0; i != tb->lot_var_sizes; i++) {
+ if (hdr->sa_lengths[i] !=
+ idx_tab->sa_variable_lengths[i]) {
+ valid_idx = B_FALSE;
+ break;
+ }
+ }
+ }
+ if (valid_idx) {
+ sa_idx_tab_hold(os, idx_tab);
+ return (idx_tab);
+ }
+ }
+
+ /* No such luck, create a new entry */
+ idx_tab = kmem_zalloc(sizeof (sa_idx_tab_t), KM_SLEEP);
+ idx_tab->sa_idx_tab =
+ kmem_zalloc(sizeof (uint32_t) * sa->sa_num_attrs, KM_SLEEP);
+ idx_tab->sa_layout = tb;
+ zfs_refcount_create(&idx_tab->sa_refcount);
+ if (tb->lot_var_sizes)
+ idx_tab->sa_variable_lengths = kmem_alloc(sizeof (uint16_t) *
+ tb->lot_var_sizes, KM_SLEEP);
+
+ sa_attr_iter(os, hdr, bonustype, sa_build_idx_tab,
+ tb, idx_tab);
+ sa_idx_tab_hold(os, idx_tab); /* one hold for consumer */
+ sa_idx_tab_hold(os, idx_tab); /* one for layout */
+ list_insert_tail(&tb->lot_idx_tab, idx_tab);
+ return (idx_tab);
+}
+
+void
+sa_default_locator(void **dataptr, uint32_t *len, uint32_t total_len,
+ boolean_t start, void *userdata)
+{
+ ASSERT(start);
+
+ *dataptr = userdata;
+ *len = total_len;
+}
+
+static void
+sa_attr_register_sync(sa_handle_t *hdl, dmu_tx_t *tx)
+{
+ uint64_t attr_value = 0;
+ sa_os_t *sa = hdl->sa_os->os_sa;
+ sa_attr_table_t *tb = sa->sa_attr_table;
+ int i;
+
+ mutex_enter(&sa->sa_lock);
+
+ if (!sa->sa_need_attr_registration || sa->sa_master_obj == 0) {
+ mutex_exit(&sa->sa_lock);
+ return;
+ }
+
+ if (sa->sa_reg_attr_obj == 0) {
+ sa->sa_reg_attr_obj = zap_create_link(hdl->sa_os,
+ DMU_OT_SA_ATTR_REGISTRATION,
+ sa->sa_master_obj, SA_REGISTRY, tx);
+ }
+ for (i = 0; i != sa->sa_num_attrs; i++) {
+ if (sa->sa_attr_table[i].sa_registered)
+ continue;
+ ATTR_ENCODE(attr_value, tb[i].sa_attr, tb[i].sa_length,
+ tb[i].sa_byteswap);
+ VERIFY(0 == zap_update(hdl->sa_os, sa->sa_reg_attr_obj,
+ tb[i].sa_name, 8, 1, &attr_value, tx));
+ tb[i].sa_registered = B_TRUE;
+ }
+ sa->sa_need_attr_registration = B_FALSE;
+ mutex_exit(&sa->sa_lock);
+}
+
+/*
+ * Replace all attributes with the attributes specified in the template.
+ * If the dnode had a spill buffer then those attributes will also be
+ * replaced, possibly with just an empty spill block.
+ *
+ * This interface is intended only to be used for bulk adding of
+ * attributes for a new file. It will also be used by the ZPL
+ * when converting an old-format znode to native SA support.
+ */
+int
+sa_replace_all_by_template_locked(sa_handle_t *hdl, sa_bulk_attr_t *attr_desc,
+ int attr_count, dmu_tx_t *tx)
+{
+ sa_os_t *sa = hdl->sa_os->os_sa;
+
+ if (sa->sa_need_attr_registration)
+ sa_attr_register_sync(hdl, tx);
+ return (sa_build_layouts(hdl, attr_desc, attr_count, tx));
+}
+
+int
+sa_replace_all_by_template(sa_handle_t *hdl, sa_bulk_attr_t *attr_desc,
+ int attr_count, dmu_tx_t *tx)
+{
+ int error;
+
+ mutex_enter(&hdl->sa_lock);
+ error = sa_replace_all_by_template_locked(hdl, attr_desc,
+ attr_count, tx);
+ mutex_exit(&hdl->sa_lock);
+ return (error);
+}
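+
+/*
+ * Caller sketch for bulk-creating the attributes of a new object
+ * ("mode_attr"/"size_attr" are hypothetical registered attribute ids;
+ * hdl and a held transaction are the caller's responsibility):
+ *
+ *	sa_bulk_attr_t attrs[2];
+ *	int i = 0;
+ *
+ *	SA_ADD_BULK_ATTR(attrs, i, mode_attr, NULL, &mode, sizeof (mode));
+ *	SA_ADD_BULK_ATTR(attrs, i, size_attr, NULL, &size, sizeof (size));
+ *	error = sa_replace_all_by_template(hdl, attrs, i, tx);
+ */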
+
+/*
+ * Add/remove a single attribute or replace a variable-sized attribute value
+ * with a value of a different size, and then rewrite the entire set
+ * of attributes.
+ * Same-length attribute value replacement (including fixed-length attributes)
+ * is handled more efficiently by the upper layers.
+ */
+static int
+sa_modify_attrs(sa_handle_t *hdl, sa_attr_type_t newattr,
+ sa_data_op_t action, sa_data_locator_t *locator, void *datastart,
+ uint16_t buflen, dmu_tx_t *tx)
+{
+ sa_os_t *sa = hdl->sa_os->os_sa;
+ dmu_buf_impl_t *db = (dmu_buf_impl_t *)hdl->sa_bonus;
+ dnode_t *dn;
+ sa_bulk_attr_t *attr_desc;
+ void *old_data[2];
+ int bonus_attr_count = 0;
+ int bonus_data_size = 0;
+ int spill_data_size = 0;
+ int spill_attr_count = 0;
+ int error;
+ uint16_t length, reg_length;
+ int i, j, k, length_idx;
+ sa_hdr_phys_t *hdr;
+ sa_idx_tab_t *idx_tab;
+ int attr_count;
+ int count;
+
+ ASSERT(MUTEX_HELD(&hdl->sa_lock));
+
+	/* First make a copy of the old data */
+
+ DB_DNODE_ENTER(db);
+ dn = DB_DNODE(db);
+ if (dn->dn_bonuslen != 0) {
+ bonus_data_size = hdl->sa_bonus->db_size;
+ old_data[0] = kmem_alloc(bonus_data_size, KM_SLEEP);
+ bcopy(hdl->sa_bonus->db_data, old_data[0],
+ hdl->sa_bonus->db_size);
+ bonus_attr_count = hdl->sa_bonus_tab->sa_layout->lot_attr_count;
+ } else {
+ old_data[0] = NULL;
+ }
+ DB_DNODE_EXIT(db);
+
+ /* Bring spill buffer online if it isn't currently */
+
+ if ((error = sa_get_spill(hdl)) == 0) {
+ spill_data_size = hdl->sa_spill->db_size;
+ old_data[1] = kmem_alloc(spill_data_size, KM_SLEEP);
+ bcopy(hdl->sa_spill->db_data, old_data[1],
+ hdl->sa_spill->db_size);
+ spill_attr_count =
+ hdl->sa_spill_tab->sa_layout->lot_attr_count;
+ } else if (error && error != ENOENT) {
+ if (old_data[0])
+ kmem_free(old_data[0], bonus_data_size);
+ return (error);
+ } else {
+ old_data[1] = NULL;
+ }
+
+ /* build descriptor of all attributes */
+
+ attr_count = bonus_attr_count + spill_attr_count;
+ if (action == SA_ADD)
+ attr_count++;
+ else if (action == SA_REMOVE)
+ attr_count--;
+
+ attr_desc = kmem_zalloc(sizeof (sa_bulk_attr_t) * attr_count, KM_SLEEP);
+
+ /*
+	 * Loop through the bonus buffer, and the spill buffer if it exists,
+	 * and build up a new attribute descriptor to reset the attributes.
+ */
+ k = j = 0;
+ count = bonus_attr_count;
+ hdr = SA_GET_HDR(hdl, SA_BONUS);
+ idx_tab = SA_IDX_TAB_GET(hdl, SA_BONUS);
+ for (; k != 2; k++) {
+ /*
+ * Iterate over each attribute in layout. Fetch the
+ * size of variable-length attributes needing rewrite
+ * from sa_lengths[].
+ */
+ for (i = 0, length_idx = 0; i != count; i++) {
+ sa_attr_type_t attr;
+
+ attr = idx_tab->sa_layout->lot_attrs[i];
+ reg_length = SA_REGISTERED_LEN(sa, attr);
+ if (reg_length == 0) {
+ length = hdr->sa_lengths[length_idx];
+ length_idx++;
+ } else {
+ length = reg_length;
+ }
+ if (attr == newattr) {
+ /*
+ * There is nothing to do for SA_REMOVE,
+ * so it is just skipped.
+ */
+ if (action == SA_REMOVE)
+ continue;
+
+ /*
+				 * Duplicate attributes are not allowed, so the
+				 * action cannot be SA_ADD here.
+ */
+ ASSERT3S(action, ==, SA_REPLACE);
+
+ /*
+ * Only a variable-sized attribute can be
+ * replaced here, and its size must be changing.
+ */
+ ASSERT3U(reg_length, ==, 0);
+ ASSERT3U(length, !=, buflen);
+ SA_ADD_BULK_ATTR(attr_desc, j, attr,
+ locator, datastart, buflen);
+ } else {
+ SA_ADD_BULK_ATTR(attr_desc, j, attr,
+ NULL, (void *)
+ (TOC_OFF(idx_tab->sa_idx_tab[attr]) +
+ (uintptr_t)old_data[k]), length);
+ }
+ }
+ if (k == 0 && hdl->sa_spill) {
+ hdr = SA_GET_HDR(hdl, SA_SPILL);
+ idx_tab = SA_IDX_TAB_GET(hdl, SA_SPILL);
+ count = spill_attr_count;
+ } else {
+ break;
+ }
+ }
+ if (action == SA_ADD) {
+ reg_length = SA_REGISTERED_LEN(sa, newattr);
+ IMPLY(reg_length != 0, reg_length == buflen);
+ SA_ADD_BULK_ATTR(attr_desc, j, newattr, locator,
+ datastart, buflen);
+ }
+ ASSERT3U(j, ==, attr_count);
+
+ error = sa_build_layouts(hdl, attr_desc, attr_count, tx);
+
+ if (old_data[0])
+ kmem_free(old_data[0], bonus_data_size);
+ if (old_data[1])
+ kmem_free(old_data[1], spill_data_size);
+ kmem_free(attr_desc, sizeof (sa_bulk_attr_t) * attr_count);
+
+ return (error);
+}
+
+static int
+sa_bulk_update_impl(sa_handle_t *hdl, sa_bulk_attr_t *bulk, int count,
+ dmu_tx_t *tx)
+{
+ int error;
+ sa_os_t *sa = hdl->sa_os->os_sa;
+ dmu_object_type_t bonustype;
+
+ bonustype = SA_BONUSTYPE_FROM_DB(SA_GET_DB(hdl, SA_BONUS));
+
+ ASSERT(hdl);
+ ASSERT(MUTEX_HELD(&hdl->sa_lock));
+
+ /* sync out registration table if necessary */
+ if (sa->sa_need_attr_registration)
+ sa_attr_register_sync(hdl, tx);
+
+ error = sa_attr_op(hdl, bulk, count, SA_UPDATE, tx);
+ if (error == 0 && !IS_SA_BONUSTYPE(bonustype) && sa->sa_update_cb)
+ sa->sa_update_cb(hdl, tx);
+
+ return (error);
+}
+
+/*
+ * Update or add a new attribute.
+ */
+int
+sa_update(sa_handle_t *hdl, sa_attr_type_t type,
+ void *buf, uint32_t buflen, dmu_tx_t *tx)
+{
+ int error;
+ sa_bulk_attr_t bulk;
+
+ bulk.sa_attr = type;
+ bulk.sa_data_func = NULL;
+ bulk.sa_length = buflen;
+ bulk.sa_data = buf;
+
+ mutex_enter(&hdl->sa_lock);
+ error = sa_bulk_update_impl(hdl, &bulk, 1, tx);
+ mutex_exit(&hdl->sa_lock);
+ return (error);
+}
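+
+/*
+ * Update sketch ("mtime_attr" is a hypothetical registered attribute id
+ * and sec/nsec are placeholders; the caller must have dirtied the SA in
+ * the transaction, e.g. with dmu_tx_hold_sa()):
+ *
+ *	uint64_t mtime[2] = { sec, nsec };
+ *	error = sa_update(hdl, mtime_attr, mtime, sizeof (mtime), tx);
+ */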
+
+int
+sa_update_from_cb(sa_handle_t *hdl, sa_attr_type_t attr,
+ uint32_t buflen, sa_data_locator_t *locator, void *userdata, dmu_tx_t *tx)
+{
+ int error;
+ sa_bulk_attr_t bulk;
+
+ bulk.sa_attr = attr;
+ bulk.sa_data = userdata;
+ bulk.sa_data_func = locator;
+ bulk.sa_length = buflen;
+
+ mutex_enter(&hdl->sa_lock);
+ error = sa_bulk_update_impl(hdl, &bulk, 1, tx);
+ mutex_exit(&hdl->sa_lock);
+ return (error);
+}
+
+/*
+ * Return the size of an attribute.
+ */
+
+int
+sa_size(sa_handle_t *hdl, sa_attr_type_t attr, int *size)
+{
+ sa_bulk_attr_t bulk;
+ int error;
+
+ bulk.sa_data = NULL;
+ bulk.sa_attr = attr;
+ bulk.sa_data_func = NULL;
+
+ ASSERT(hdl);
+ mutex_enter(&hdl->sa_lock);
+ if ((error = sa_attr_op(hdl, &bulk, 1, SA_LOOKUP, NULL)) != 0) {
+ mutex_exit(&hdl->sa_lock);
+ return (error);
+ }
+ *size = bulk.sa_size;
+
+ mutex_exit(&hdl->sa_lock);
+ return (0);
+}
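+
+/*
+ * Common pattern, sketched ("attr_id" again stands for a registered
+ * attribute id): size a variable-length attribute first, then fetch it
+ * into an exactly-sized buffer.
+ *
+ *	int len;
+ *
+ *	if ((error = sa_size(hdl, attr_id, &len)) == 0) {
+ *		void *buf = kmem_alloc(len, KM_SLEEP);
+ *		error = sa_lookup(hdl, attr_id, buf, len);
+ *		...
+ *		kmem_free(buf, len);
+ *	}
+ */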
+
+int
+sa_bulk_lookup_locked(sa_handle_t *hdl, sa_bulk_attr_t *attrs, int count)
+{
+ ASSERT(hdl);
+ ASSERT(MUTEX_HELD(&hdl->sa_lock));
+ return (sa_lookup_impl(hdl, attrs, count));
+}
+
+int
+sa_bulk_lookup(sa_handle_t *hdl, sa_bulk_attr_t *attrs, int count)
+{
+ int error;
+
+ ASSERT(hdl);
+ mutex_enter(&hdl->sa_lock);
+ error = sa_bulk_lookup_locked(hdl, attrs, count);
+ mutex_exit(&hdl->sa_lock);
+ return (error);
+}
+
+int
+sa_bulk_update(sa_handle_t *hdl, sa_bulk_attr_t *attrs, int count, dmu_tx_t *tx)
+{
+ int error;
+
+ ASSERT(hdl);
+ mutex_enter(&hdl->sa_lock);
+ error = sa_bulk_update_impl(hdl, attrs, count, tx);
+ mutex_exit(&hdl->sa_lock);
+ return (error);
+}
+
+int
+sa_remove(sa_handle_t *hdl, sa_attr_type_t attr, dmu_tx_t *tx)
+{
+ int error;
+
+ mutex_enter(&hdl->sa_lock);
+ error = sa_modify_attrs(hdl, attr, SA_REMOVE, NULL,
+ NULL, 0, tx);
+ mutex_exit(&hdl->sa_lock);
+ return (error);
+}
+
+void
+sa_object_info(sa_handle_t *hdl, dmu_object_info_t *doi)
+{
+ dmu_object_info_from_db((dmu_buf_t *)hdl->sa_bonus, doi);
+}
+
+void
+sa_object_size(sa_handle_t *hdl, uint32_t *blksize, u_longlong_t *nblocks)
+{
+ dmu_object_size_from_db((dmu_buf_t *)hdl->sa_bonus,
+ blksize, nblocks);
+}
+
+void
+sa_set_userp(sa_handle_t *hdl, void *ptr)
+{
+ hdl->sa_userp = ptr;
+}
+
+dmu_buf_t *
+sa_get_db(sa_handle_t *hdl)
+{
+ return ((dmu_buf_t *)hdl->sa_bonus);
+}
+
+void *
+sa_get_userdata(sa_handle_t *hdl)
+{
+ return (hdl->sa_userp);
+}
+
+void
+sa_register_update_callback_locked(objset_t *os, sa_update_cb_t *func)
+{
+ ASSERT(MUTEX_HELD(&os->os_sa->sa_lock));
+ os->os_sa->sa_update_cb = func;
+}
+
+void
+sa_register_update_callback(objset_t *os, sa_update_cb_t *func)
+{
+ mutex_enter(&os->os_sa->sa_lock);
+ sa_register_update_callback_locked(os, func);
+ mutex_exit(&os->os_sa->sa_lock);
+}
+
+uint64_t
+sa_handle_object(sa_handle_t *hdl)
+{
+ return (hdl->sa_bonus->db_object);
+}
+
+boolean_t
+sa_enabled(objset_t *os)
+{
+ return (os->os_sa == NULL);
+}
+
+int
+sa_set_sa_object(objset_t *os, uint64_t sa_object)
+{
+ sa_os_t *sa = os->os_sa;
+
+ if (sa->sa_master_obj)
+ return (1);
+
+ sa->sa_master_obj = sa_object;
+
+ return (0);
+}
+
+int
+sa_hdrsize(void *arg)
+{
+ sa_hdr_phys_t *hdr = arg;
+
+ return (SA_HDR_SIZE(hdr));
+}
+
+void
+sa_handle_lock(sa_handle_t *hdl)
+{
+ ASSERT(hdl);
+ mutex_enter(&hdl->sa_lock);
+}
+
+void
+sa_handle_unlock(sa_handle_t *hdl)
+{
+ ASSERT(hdl);
+ mutex_exit(&hdl->sa_lock);
+}
diff --git a/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sha256.c b/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sha256.c
new file mode 100644
index 000000000000..34c909f0c71a
--- /dev/null
+++ b/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sha256.c
@@ -0,0 +1,105 @@
+/*
+ * CDDL HEADER START
+ *
+ * The contents of this file are subject to the terms of the
+ * Common Development and Distribution License (the "License").
+ * You may not use this file except in compliance with the License.
+ *
+ * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
+ * or http://www.opensolaris.org/os/licensing.
+ * See the License for the specific language governing permissions
+ * and limitations under the License.
+ *
+ * When distributing Covered Code, include this CDDL HEADER in each
+ * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
+ * If applicable, add the following below this CDDL HEADER, with the
+ * fields enclosed by brackets "[]" replaced with your own identifying
+ * information: Portions Copyright [yyyy] [name of copyright owner]
+ *
+ * CDDL HEADER END
+ */
+/*
+ * Copyright 2009 Sun Microsystems, Inc. All rights reserved.
+ * Use is subject to license terms.
+ */
+/*
+ * Copyright 2013 Saso Kiselkov. All rights reserved.
+ * Copyright (c) 2016 by Delphix. All rights reserved.
+ */
+#include <sys/zfs_context.h>
+#include <sys/zio.h>
+#ifdef _KERNEL
+#include <crypto/sha2/sha256.h>
+#include <crypto/sha2/sha512t.h>
+#else
+#include <sha256.h>
+#include <sha512t.h>
+#endif
+#include <sys/abd.h>
+
+static int
+sha256_incremental(void *buf, size_t size, void *arg)
+{
+ SHA256_CTX *ctx = arg;
+ SHA256_Update(ctx, buf, size);
+ return (0);
+}
+
+static int
+sha512_incremental(void *buf, size_t size, void *arg)
+{
+ SHA512_CTX *ctx = arg;
+ SHA512_256_Update(ctx, buf, size);
+ return (0);
+}
+
+/*ARGSUSED*/
+void
+abd_checksum_SHA256(abd_t *abd, uint64_t size,
+ const void *ctx_template, zio_cksum_t *zcp)
+{
+ SHA256_CTX ctx;
+ zio_cksum_t tmp;
+
+ SHA256_Init(&ctx);
+ (void) abd_iterate_func(abd, 0, size, sha256_incremental, &ctx);
+ SHA256_Final((unsigned char *)&tmp, &ctx);
+
+ /*
+	 * A prior implementation of this function used a private
+	 * SHA256 implementation that always wrote things out in
+	 * big-endian byte order, and there was no byteswap variant
+	 * of it. To preserve on-disk compatibility we need to
+	 * force that behavior.
+ */
+ zcp->zc_word[0] = BE_64(tmp.zc_word[0]);
+ zcp->zc_word[1] = BE_64(tmp.zc_word[1]);
+ zcp->zc_word[2] = BE_64(tmp.zc_word[2]);
+ zcp->zc_word[3] = BE_64(tmp.zc_word[3]);
+}
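+
+/*
+ * Illustrative call with a plain linear buffer, wrapped in an ABD first
+ * (abd_get_from_buf() and abd_put() are assumed from abd.c; SHA-256
+ * needs no ctx_template):
+ *
+ *	abd_t *abd = abd_get_from_buf(buf, size);
+ *	zio_cksum_t zc;
+ *
+ *	abd_checksum_SHA256(abd, size, NULL, &zc);
+ *	abd_put(abd);
+ */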
+
+/*ARGSUSED*/
+void
+abd_checksum_SHA512_native(abd_t *abd, uint64_t size,
+ const void *ctx_template, zio_cksum_t *zcp)
+{
+ SHA512_CTX ctx;
+
+ SHA512_256_Init(&ctx);
+ (void) abd_iterate_func(abd, 0, size, sha512_incremental, &ctx);
+ SHA512_256_Final((unsigned char *)zcp, &ctx);
+}
+
+/*ARGSUSED*/
+void
+abd_checksum_SHA512_byteswap(abd_t *abd, uint64_t size,
+ const void *ctx_template, zio_cksum_t *zcp)
+{
+ zio_cksum_t tmp;
+
+ abd_checksum_SHA512_native(abd, size, ctx_template, &tmp);
+ zcp->zc_word[0] = BSWAP_64(tmp.zc_word[0]);
+ zcp->zc_word[1] = BSWAP_64(tmp.zc_word[1]);
+ zcp->zc_word[2] = BSWAP_64(tmp.zc_word[2]);
+ zcp->zc_word[3] = BSWAP_64(tmp.zc_word[3]);
+}
diff --git a/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/skein_zfs.c b/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/skein_zfs.c
new file mode 100644
index 000000000000..c30f590a5fdb
--- /dev/null
+++ b/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/skein_zfs.c
@@ -0,0 +1,105 @@
+/*
+ * CDDL HEADER START
+ *
+ * The contents of this file are subject to the terms of the
+ * Common Development and Distribution License (the "License").
+ * You may not use this file except in compliance with the License.
+ *
+ * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
+ * or http://opensource.org/licenses/CDDL-1.0.
+ * See the License for the specific language governing permissions
+ * and limitations under the License.
+ *
+ * When distributing Covered Code, include this CDDL HEADER in each
+ * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
+ * If applicable, add the following below this CDDL HEADER, with the
+ * fields enclosed by brackets "[]" replaced with your own identifying
+ * information: Portions Copyright [yyyy] [name of copyright owner]
+ *
+ * CDDL HEADER END
+ */
+/*
+ * Copyright 2013 Saso Kiselkov. All rights reserved.
+ * Copyright (c) 2016 by Delphix. All rights reserved.
+ */
+#include <sys/zfs_context.h>
+#include <sys/zio.h>
+#ifdef _KERNEL
+#include <crypto/skein/skein.h>
+#else
+#include <skein.h>
+#endif
+#include <sys/abd.h>
+
+static int
+skein_incremental(void *buf, size_t size, void *arg)
+{
+ Skein_512_Ctxt_t *ctx = arg;
+ (void) Skein_512_Update(ctx, buf, size);
+ return (0);
+}
+
+/*
+ * Computes a native 256-bit skein MAC checksum. Please note that this
+ * function requires the presence of a ctx_template that should be allocated
+ * using abd_checksum_skein_tmpl_init.
+ */
+/*ARGSUSED*/
+void
+abd_checksum_skein_native(abd_t *abd, uint64_t size,
+ const void *ctx_template, zio_cksum_t *zcp)
+{
+ Skein_512_Ctxt_t ctx;
+
+ ASSERT(ctx_template != NULL);
+ bcopy(ctx_template, &ctx, sizeof (ctx));
+ (void) abd_iterate_func(abd, 0, size, skein_incremental, &ctx);
+ (void) Skein_512_Final(&ctx, (uint8_t *)zcp);
+ bzero(&ctx, sizeof (ctx));
+}
+
+/*
+ * Byteswapped version of abd_checksum_skein_native. This just invokes
+ * the native checksum function and byteswaps the resulting checksum (since
+ * skein is internally endian-insensitive).
+ */
+void
+abd_checksum_skein_byteswap(abd_t *abd, uint64_t size,
+ const void *ctx_template, zio_cksum_t *zcp)
+{
+ zio_cksum_t tmp;
+
+ abd_checksum_skein_native(abd, size, ctx_template, &tmp);
+ zcp->zc_word[0] = BSWAP_64(tmp.zc_word[0]);
+ zcp->zc_word[1] = BSWAP_64(tmp.zc_word[1]);
+ zcp->zc_word[2] = BSWAP_64(tmp.zc_word[2]);
+ zcp->zc_word[3] = BSWAP_64(tmp.zc_word[3]);
+}
+
+/*
+ * Allocates a skein MAC template suitable for using in skein MAC checksum
+ * computations and returns a pointer to it.
+ */
+void *
+abd_checksum_skein_tmpl_init(const zio_cksum_salt_t *salt)
+{
+ Skein_512_Ctxt_t *ctx;
+
+ ctx = kmem_zalloc(sizeof (*ctx), KM_SLEEP);
+ (void) Skein_512_InitExt(ctx, sizeof (zio_cksum_t) * 8, 0,
+ salt->zcs_bytes, sizeof (salt->zcs_bytes));
+ return (ctx);
+}
+
+/*
+ * Frees a skein context template previously allocated using
+ * abd_checksum_skein_tmpl_init.
+ */
+void
+abd_checksum_skein_tmpl_free(void *ctx_template)
+{
+ Skein_512_Ctxt_t *ctx = ctx_template;
+
+ bzero(ctx, sizeof (*ctx));
+ kmem_free(ctx, sizeof (*ctx));
+}
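+
+/*
+ * Template lifecycle, sketched (the salt normally comes from the pool's
+ * persistent checksum salt; abd and zc are the caller's):
+ *
+ *	void *tmpl = abd_checksum_skein_tmpl_init(&salt);
+ *	abd_checksum_skein_native(abd, size, tmpl, &zc);
+ *	abd_checksum_skein_tmpl_free(tmpl);
+ */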
diff --git a/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/spa.c b/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/spa.c
new file mode 100644
index 000000000000..65ae7904047b
--- /dev/null
+++ b/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/spa.c
@@ -0,0 +1,8972 @@
+/*
+ * CDDL HEADER START
+ *
+ * The contents of this file are subject to the terms of the
+ * Common Development and Distribution License (the "License").
+ * You may not use this file except in compliance with the License.
+ *
+ * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
+ * or http://www.opensolaris.org/os/licensing.
+ * See the License for the specific language governing permissions
+ * and limitations under the License.
+ *
+ * When distributing Covered Code, include this CDDL HEADER in each
+ * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
+ * If applicable, add the following below this CDDL HEADER, with the
+ * fields enclosed by brackets "[]" replaced with your own identifying
+ * information: Portions Copyright [yyyy] [name of copyright owner]
+ *
+ * CDDL HEADER END
+ */
+
+/*
+ * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved.
+ * Copyright (c) 2011, 2018 by Delphix. All rights reserved.
+ * Copyright (c) 2015, Nexenta Systems, Inc. All rights reserved.
+ * Copyright (c) 2013 Martin Matuska <mm@FreeBSD.org>. All rights reserved.
+ * Copyright (c) 2014 Spectra Logic Corporation, All rights reserved.
+ * Copyright 2013 Saso Kiselkov. All rights reserved.
+ * Copyright (c) 2014 Integros [integros.com]
+ * Copyright 2016 Toomas Soome <tsoome@me.com>
+ * Copyright 2018 Joyent, Inc.
+ * Copyright (c) 2017, Intel Corporation.
+ * Copyright (c) 2017 Datto Inc.
+ * Copyright 2018 OmniOS Community Edition (OmniOSce) Association.
+ * Copyright (c) 2016 Actifio, Inc. All rights reserved.
+ */
+
+/*
+ * SPA: Storage Pool Allocator
+ *
+ * This file contains all the routines used when modifying on-disk SPA state.
+ * This includes opening, importing, destroying, exporting a pool, and syncing a
+ * pool.
+ */
+
+#include <sys/zfs_context.h>
+#include <sys/fm/fs/zfs.h>
+#include <sys/spa_impl.h>
+#include <sys/zio.h>
+#include <sys/zio_checksum.h>
+#include <sys/dmu.h>
+#include <sys/dmu_tx.h>
+#include <sys/zap.h>
+#include <sys/zil.h>
+#include <sys/ddt.h>
+#include <sys/vdev_impl.h>
+#include <sys/vdev_removal.h>
+#include <sys/vdev_indirect_mapping.h>
+#include <sys/vdev_indirect_births.h>
+#include <sys/vdev_initialize.h>
+#include <sys/metaslab.h>
+#include <sys/metaslab_impl.h>
+#include <sys/mmp.h>
+#include <sys/uberblock_impl.h>
+#include <sys/txg.h>
+#include <sys/avl.h>
+#include <sys/bpobj.h>
+#include <sys/dmu_traverse.h>
+#include <sys/dmu_objset.h>
+#include <sys/unique.h>
+#include <sys/dsl_pool.h>
+#include <sys/dsl_dataset.h>
+#include <sys/dsl_dir.h>
+#include <sys/dsl_prop.h>
+#include <sys/dsl_synctask.h>
+#include <sys/fs/zfs.h>
+#include <sys/arc.h>
+#include <sys/callb.h>
+#include <sys/spa_boot.h>
+#include <sys/zfs_ioctl.h>
+#include <sys/dsl_scan.h>
+#include <sys/dmu_send.h>
+#include <sys/dsl_destroy.h>
+#include <sys/dsl_userhold.h>
+#include <sys/zfeature.h>
+#include <sys/zvol.h>
+#include <sys/trim_map.h>
+#include <sys/abd.h>
+
+#ifdef _KERNEL
+#include <sys/callb.h>
+#include <sys/cpupart.h>
+#include <sys/zone.h>
+#endif /* _KERNEL */
+
+#include "zfs_prop.h"
+#include "zfs_comutil.h"
+
+/* Check hostid on import? */
+static int check_hostid = 1;
+
+/*
+ * The interval, in seconds, at which failed configuration cache file writes
+ * should be retried.
+ */
+int zfs_ccw_retry_interval = 300;
+
+SYSCTL_DECL(_vfs_zfs);
+SYSCTL_INT(_vfs_zfs, OID_AUTO, check_hostid, CTLFLAG_RWTUN, &check_hostid, 0,
+ "Check hostid on import?");
+TUNABLE_INT("vfs.zfs.ccw_retry_interval", &zfs_ccw_retry_interval);
+SYSCTL_INT(_vfs_zfs, OID_AUTO, ccw_retry_interval, CTLFLAG_RW,
+ &zfs_ccw_retry_interval, 0,
+ "Configuration cache file write, retry after failure, interval (seconds)");
+
+typedef enum zti_modes {
+ ZTI_MODE_FIXED, /* value is # of threads (min 1) */
+ ZTI_MODE_BATCH, /* cpu-intensive; value is ignored */
+ ZTI_MODE_NULL, /* don't create a taskq */
+ ZTI_NMODES
+} zti_modes_t;
+
+#define ZTI_P(n, q) { ZTI_MODE_FIXED, (n), (q) }
+#define ZTI_BATCH { ZTI_MODE_BATCH, 0, 1 }
+#define ZTI_NULL { ZTI_MODE_NULL, 0, 0 }
+
+#define ZTI_N(n) ZTI_P(n, 1)
+#define ZTI_ONE ZTI_N(1)
+
+typedef struct zio_taskq_info {
+ zti_modes_t zti_mode;
+ uint_t zti_value;
+ uint_t zti_count;
+} zio_taskq_info_t;
+
+static const char *const zio_taskq_types[ZIO_TASKQ_TYPES] = {
+ "issue", "issue_high", "intr", "intr_high"
+};
+
+/*
+ * This table defines the taskq settings for each ZFS I/O type. When
+ * initializing a pool, we use this table to create an appropriately sized
+ * taskq. Some operations are low volume and therefore have a small, static
+ * number of threads assigned to their taskqs using the ZTI_N(#) or ZTI_ONE
+ * macros. Other operations process a large amount of data; the ZTI_BATCH
+ * macro causes us to create a taskq oriented for throughput. Some operations
+ * are so high frequency and short-lived that the taskq itself can become a
+ * point of lock contention. The ZTI_P(#, #) macro indicates that we need an
+ * additional degree of parallelism specified by the number of threads per-
+ * taskq and the number of taskqs; when dispatching an event in this case, the
+ * particular taskq is chosen at random.
+ *
+ * The different taskq priorities are to handle the different contexts (issue
+ * and interrupt) and then to reserve threads for ZIO_PRIORITY_NOW I/Os that
+ * need to be handled with minimum delay.
+ */
+const zio_taskq_info_t zio_taskqs[ZIO_TYPES][ZIO_TASKQ_TYPES] = {
+ /* ISSUE ISSUE_HIGH INTR INTR_HIGH */
+ { ZTI_ONE, ZTI_NULL, ZTI_ONE, ZTI_NULL }, /* NULL */
+ { ZTI_N(8), ZTI_NULL, ZTI_P(12, 8), ZTI_NULL }, /* READ */
+ { ZTI_BATCH, ZTI_N(5), ZTI_P(12, 8), ZTI_N(5) }, /* WRITE */
+ { ZTI_P(12, 8), ZTI_NULL, ZTI_ONE, ZTI_NULL }, /* FREE */
+ { ZTI_ONE, ZTI_NULL, ZTI_ONE, ZTI_NULL }, /* CLAIM */
+ { ZTI_ONE, ZTI_NULL, ZTI_ONE, ZTI_NULL }, /* IOCTL */
+};
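+
+/*
+ * Reading the table above: for READ, the INTR entry ZTI_P(12, 8) expands
+ * to { ZTI_MODE_FIXED, 12, 8 }, i.e. eight taskqs of twelve threads each,
+ * with dispatches spread across them at random; the WRITE row's ZTI_BATCH
+ * issue taskq is instead sized as a percentage of the CPUs present.
+ */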
+
+static void spa_sync_version(void *arg, dmu_tx_t *tx);
+static void spa_sync_props(void *arg, dmu_tx_t *tx);
+static boolean_t spa_has_active_shared_spare(spa_t *spa);
+static int spa_load_impl(spa_t *spa, spa_import_type_t type, char **ereport);
+static void spa_vdev_resilver_done(spa_t *spa);
+
+uint_t zio_taskq_batch_pct = 75; /* 1 thread per cpu in pset */
+#ifdef PSRSET_BIND
+id_t zio_taskq_psrset_bind = PS_NONE;
+#endif
+#ifdef SYSDC
+boolean_t zio_taskq_sysdc = B_TRUE; /* use SDC scheduling class */
+uint_t zio_taskq_basedc = 80; /* base duty cycle */
+#endif
+
+#ifdef _KERNEL
+#define SPA_PROCESS
+#endif
+boolean_t spa_create_process = B_TRUE; /* no process ==> no sysdc */
+
+extern int zfs_sync_pass_deferred_free;
+
+/*
+ * Report any spa_load_verify errors found, but do not fail spa_load.
+ * This is used by zdb to analyze non-idle pools.
+ */
+boolean_t spa_load_verify_dryrun = B_FALSE;
+
+/*
+ * This (illegal) pool name is used when temporarily importing a spa_t in order
+ * to get the vdev stats associated with the imported devices.
+ */
+#define TRYIMPORT_NAME "$import"
+
+/*
+ * For debugging purposes: print out vdev tree during pool import.
+ */
+int spa_load_print_vdev_tree = B_FALSE;
+
+/*
+ * A non-zero value for zfs_max_missing_tvds means that we allow importing
+ * pools with missing top-level vdevs. This is strictly intended for advanced
+ * pool recovery cases since missing data is almost inevitable. Pools with
+ * missing devices can only be imported read-only for safety reasons, and their
+ * fail-mode will be automatically set to "continue".
+ *
+ * With 1 missing vdev we should be able to import the pool and mount all
+ * datasets. User data that was not modified after the missing device has been
+ * added should be recoverable. This means that snapshots created prior to the
+ * addition of that device should be completely intact.
+ *
+ * With 2 missing vdevs, some datasets may fail to mount since there are
+ * dataset statistics that are stored as regular metadata. Some data might be
+ * recoverable if those vdevs were added recently.
+ *
+ * With 3 or more missing vdevs, the pool is severely damaged and MOS entries
+ * may be missing entirely. Chances of data recovery are very low. Note that
+ * there are also risks of performing an inadvertent rewind as we might be
+ * missing all the vdevs with the latest uberblocks.
+ */
+uint64_t zfs_max_missing_tvds = 0;
+
+/*
+ * The parameters below are similar to zfs_max_missing_tvds but are only
+ * intended for a preliminary open of the pool with an untrusted config which
+ * might be incomplete or out-dated.
+ *
+ * We are more tolerant for pools opened from a cachefile since we could have
+ * an out-dated cachefile where a device removal was not registered.
+ * We could have set the limit arbitrarily high but in the case where devices
+ * are really missing we would want to return the proper error codes; we chose
+ * SPA_DVAS_PER_BP - 1 so that some copies of the MOS would still be available
+ * and we get a chance to retrieve the trusted config.
+ */
+uint64_t zfs_max_missing_tvds_cachefile = SPA_DVAS_PER_BP - 1;
+
+/*
+ * In the case where the config was assembled by scanning device paths
+ * (/dev/dsk
+ * by default) we are less tolerant since all the existing devices should have
+ * been detected and we want spa_load to return the right error codes.
+ */
+uint64_t zfs_max_missing_tvds_scan = 0;
+
+
+SYSCTL_DECL(_vfs_zfs_zio);
+SYSCTL_INT(_vfs_zfs_zio, OID_AUTO, taskq_batch_pct, CTLFLAG_RDTUN,
+ &zio_taskq_batch_pct, 0,
+ "Percentage of CPUs to run an IO worker thread");
+SYSCTL_INT(_vfs_zfs, OID_AUTO, spa_load_print_vdev_tree, CTLFLAG_RWTUN,
+ &spa_load_print_vdev_tree, 0,
+ "print out vdev tree during pool import");
+SYSCTL_UQUAD(_vfs_zfs, OID_AUTO, max_missing_tvds, CTLFLAG_RWTUN,
+ &zfs_max_missing_tvds, 0,
+ "allow importing pools with missing top-level vdevs");
+SYSCTL_UQUAD(_vfs_zfs, OID_AUTO, max_missing_tvds_cachefile, CTLFLAG_RWTUN,
+ &zfs_max_missing_tvds_cachefile, 0,
+ "allow importing pools with missing top-level vdevs in cache file");
+SYSCTL_UQUAD(_vfs_zfs, OID_AUTO, max_missing_tvds_scan, CTLFLAG_RWTUN,
+ &zfs_max_missing_tvds_scan, 0,
+ "allow importing pools with missing top-level vdevs during scan");
+
+/*
+ * Debugging aid that pauses spa_sync() towards the end.
+ */
+boolean_t zfs_pause_spa_sync = B_FALSE;
+
+/*
+ * ==========================================================================
+ * SPA properties routines
+ * ==========================================================================
+ */
+
+/*
+ * Add a (source=src, propname=propval) list to an nvlist.
+ */
+static void
+spa_prop_add_list(nvlist_t *nvl, zpool_prop_t prop, char *strval,
+ uint64_t intval, zprop_source_t src)
+{
+ const char *propname = zpool_prop_to_name(prop);
+ nvlist_t *propval;
+
+ VERIFY(nvlist_alloc(&propval, NV_UNIQUE_NAME, KM_SLEEP) == 0);
+ VERIFY(nvlist_add_uint64(propval, ZPROP_SOURCE, src) == 0);
+
+ if (strval != NULL)
+ VERIFY(nvlist_add_string(propval, ZPROP_VALUE, strval) == 0);
+ else
+ VERIFY(nvlist_add_uint64(propval, ZPROP_VALUE, intval) == 0);
+
+ VERIFY(nvlist_add_nvlist(nvl, propname, propval) == 0);
+ nvlist_free(propval);
+}
+
+/*
+ * Get property values from the spa configuration.
+ */
+static void
+spa_prop_get_config(spa_t *spa, nvlist_t **nvp)
+{
+ vdev_t *rvd = spa->spa_root_vdev;
+ dsl_pool_t *pool = spa->spa_dsl_pool;
+ uint64_t size, alloc, cap, version;
+ zprop_source_t src = ZPROP_SRC_NONE;
+ spa_config_dirent_t *dp;
+ metaslab_class_t *mc = spa_normal_class(spa);
+
+ ASSERT(MUTEX_HELD(&spa->spa_props_lock));
+
+ if (rvd != NULL) {
+ alloc = metaslab_class_get_alloc(mc);
+ alloc += metaslab_class_get_alloc(spa_special_class(spa));
+ alloc += metaslab_class_get_alloc(spa_dedup_class(spa));
+
+ size = metaslab_class_get_space(mc);
+ size += metaslab_class_get_space(spa_special_class(spa));
+ size += metaslab_class_get_space(spa_dedup_class(spa));
+
+ spa_prop_add_list(*nvp, ZPOOL_PROP_NAME, spa_name(spa), 0, src);
+ spa_prop_add_list(*nvp, ZPOOL_PROP_SIZE, NULL, size, src);
+ spa_prop_add_list(*nvp, ZPOOL_PROP_ALLOCATED, NULL, alloc, src);
+ spa_prop_add_list(*nvp, ZPOOL_PROP_FREE, NULL,
+ size - alloc, src);
+ spa_prop_add_list(*nvp, ZPOOL_PROP_CHECKPOINT, NULL,
+ spa->spa_checkpoint_info.sci_dspace, src);
+
+ spa_prop_add_list(*nvp, ZPOOL_PROP_FRAGMENTATION, NULL,
+ metaslab_class_fragmentation(mc), src);
+ spa_prop_add_list(*nvp, ZPOOL_PROP_EXPANDSZ, NULL,
+ metaslab_class_expandable_space(mc), src);
+ spa_prop_add_list(*nvp, ZPOOL_PROP_READONLY, NULL,
+ (spa_mode(spa) == FREAD), src);
+
+ cap = (size == 0) ? 0 : (alloc * 100 / size);
+ spa_prop_add_list(*nvp, ZPOOL_PROP_CAPACITY, NULL, cap, src);
+
+ spa_prop_add_list(*nvp, ZPOOL_PROP_DEDUPRATIO, NULL,
+ ddt_get_pool_dedup_ratio(spa), src);
+
+ spa_prop_add_list(*nvp, ZPOOL_PROP_HEALTH, NULL,
+ rvd->vdev_state, src);
+
+ version = spa_version(spa);
+ if (version == zpool_prop_default_numeric(ZPOOL_PROP_VERSION))
+ src = ZPROP_SRC_DEFAULT;
+ else
+ src = ZPROP_SRC_LOCAL;
+ spa_prop_add_list(*nvp, ZPOOL_PROP_VERSION, NULL, version, src);
+ }
+
+ if (pool != NULL) {
+ /*
+		 * The $FREE directory was introduced in SPA_VERSION_DEADLISTS;
+		 * when opening pools created before this version, freedir
+		 * will be NULL.
+ */
+ if (pool->dp_free_dir != NULL) {
+ spa_prop_add_list(*nvp, ZPOOL_PROP_FREEING, NULL,
+ dsl_dir_phys(pool->dp_free_dir)->dd_used_bytes,
+ src);
+ } else {
+ spa_prop_add_list(*nvp, ZPOOL_PROP_FREEING,
+ NULL, 0, src);
+ }
+
+ if (pool->dp_leak_dir != NULL) {
+ spa_prop_add_list(*nvp, ZPOOL_PROP_LEAKED, NULL,
+ dsl_dir_phys(pool->dp_leak_dir)->dd_used_bytes,
+ src);
+ } else {
+ spa_prop_add_list(*nvp, ZPOOL_PROP_LEAKED,
+ NULL, 0, src);
+ }
+ }
+
+ spa_prop_add_list(*nvp, ZPOOL_PROP_GUID, NULL, spa_guid(spa), src);
+
+ if (spa->spa_comment != NULL) {
+ spa_prop_add_list(*nvp, ZPOOL_PROP_COMMENT, spa->spa_comment,
+ 0, ZPROP_SRC_LOCAL);
+ }
+
+ if (spa->spa_root != NULL)
+ spa_prop_add_list(*nvp, ZPOOL_PROP_ALTROOT, spa->spa_root,
+ 0, ZPROP_SRC_LOCAL);
+
+ if (spa_feature_is_enabled(spa, SPA_FEATURE_LARGE_BLOCKS)) {
+ spa_prop_add_list(*nvp, ZPOOL_PROP_MAXBLOCKSIZE, NULL,
+ MIN(zfs_max_recordsize, SPA_MAXBLOCKSIZE), ZPROP_SRC_NONE);
+ } else {
+ spa_prop_add_list(*nvp, ZPOOL_PROP_MAXBLOCKSIZE, NULL,
+ SPA_OLD_MAXBLOCKSIZE, ZPROP_SRC_NONE);
+ }
+
+ if (spa_feature_is_enabled(spa, SPA_FEATURE_LARGE_DNODE)) {
+ spa_prop_add_list(*nvp, ZPOOL_PROP_MAXDNODESIZE, NULL,
+ DNODE_MAX_SIZE, ZPROP_SRC_NONE);
+ } else {
+ spa_prop_add_list(*nvp, ZPOOL_PROP_MAXDNODESIZE, NULL,
+ DNODE_MIN_SIZE, ZPROP_SRC_NONE);
+ }
+
+ if ((dp = list_head(&spa->spa_config_list)) != NULL) {
+ if (dp->scd_path == NULL) {
+ spa_prop_add_list(*nvp, ZPOOL_PROP_CACHEFILE,
+ "none", 0, ZPROP_SRC_LOCAL);
+ } else if (strcmp(dp->scd_path, spa_config_path) != 0) {
+ spa_prop_add_list(*nvp, ZPOOL_PROP_CACHEFILE,
+ dp->scd_path, 0, ZPROP_SRC_LOCAL);
+ }
+ }
+}
+
+/*
+ * Get zpool property values.
+ */
+int
+spa_prop_get(spa_t *spa, nvlist_t **nvp)
+{
+ objset_t *mos = spa->spa_meta_objset;
+ zap_cursor_t zc;
+ zap_attribute_t za;
+ int err;
+
+ VERIFY(nvlist_alloc(nvp, NV_UNIQUE_NAME, KM_SLEEP) == 0);
+
+ mutex_enter(&spa->spa_props_lock);
+
+ /*
+ * Get properties from the spa config.
+ */
+ spa_prop_get_config(spa, nvp);
+
+	/* If no pool property object, there are no more props to get. */
+ if (mos == NULL || spa->spa_pool_props_object == 0) {
+ mutex_exit(&spa->spa_props_lock);
+ return (0);
+ }
+
+ /*
+ * Get properties from the MOS pool property object.
+ */
+ for (zap_cursor_init(&zc, mos, spa->spa_pool_props_object);
+ (err = zap_cursor_retrieve(&zc, &za)) == 0;
+ zap_cursor_advance(&zc)) {
+ uint64_t intval = 0;
+ char *strval = NULL;
+ zprop_source_t src = ZPROP_SRC_DEFAULT;
+ zpool_prop_t prop;
+
+ if ((prop = zpool_name_to_prop(za.za_name)) == ZPOOL_PROP_INVAL)
+ continue;
+
+ switch (za.za_integer_length) {
+ case 8:
+ /* integer property */
+ if (za.za_first_integer !=
+ zpool_prop_default_numeric(prop))
+ src = ZPROP_SRC_LOCAL;
+
+ if (prop == ZPOOL_PROP_BOOTFS) {
+ dsl_pool_t *dp;
+ dsl_dataset_t *ds = NULL;
+
+ dp = spa_get_dsl(spa);
+ dsl_pool_config_enter(dp, FTAG);
+ err = dsl_dataset_hold_obj(dp,
+ za.za_first_integer, FTAG, &ds);
+ if (err != 0) {
+ dsl_pool_config_exit(dp, FTAG);
+ break;
+ }
+
+ strval = kmem_alloc(ZFS_MAX_DATASET_NAME_LEN,
+ KM_SLEEP);
+ dsl_dataset_name(ds, strval);
+ dsl_dataset_rele(ds, FTAG);
+ dsl_pool_config_exit(dp, FTAG);
+ } else {
+ strval = NULL;
+ intval = za.za_first_integer;
+ }
+
+ spa_prop_add_list(*nvp, prop, strval, intval, src);
+
+ if (strval != NULL)
+ kmem_free(strval, ZFS_MAX_DATASET_NAME_LEN);
+
+ break;
+
+ case 1:
+ /* string property */
+ strval = kmem_alloc(za.za_num_integers, KM_SLEEP);
+ err = zap_lookup(mos, spa->spa_pool_props_object,
+ za.za_name, 1, za.za_num_integers, strval);
+ if (err) {
+ kmem_free(strval, za.za_num_integers);
+ break;
+ }
+ spa_prop_add_list(*nvp, prop, strval, 0, src);
+ kmem_free(strval, za.za_num_integers);
+ break;
+
+ default:
+ break;
+ }
+ }
+ zap_cursor_fini(&zc);
+ mutex_exit(&spa->spa_props_lock);
+out:
+ if (err && err != ENOENT) {
+ nvlist_free(*nvp);
+ *nvp = NULL;
+ return (err);
+ }
+
+ return (0);
+}
+
+/*
+ * Validate the given pool properties nvlist and modify the list
+ * for the property values to be set.
+ */
+static int
+spa_prop_validate(spa_t *spa, nvlist_t *props)
+{
+ nvpair_t *elem;
+ int error = 0, reset_bootfs = 0;
+ uint64_t objnum = 0;
+ boolean_t has_feature = B_FALSE;
+
+ elem = NULL;
+ while ((elem = nvlist_next_nvpair(props, elem)) != NULL) {
+ uint64_t intval;
+ char *strval, *slash, *check, *fname;
+ const char *propname = nvpair_name(elem);
+ zpool_prop_t prop = zpool_name_to_prop(propname);
+
+ switch (prop) {
+ case ZPOOL_PROP_INVAL:
+ if (!zpool_prop_feature(propname)) {
+ error = SET_ERROR(EINVAL);
+ break;
+ }
+
+ /*
+ * Sanitize the input.
+ */
+ if (nvpair_type(elem) != DATA_TYPE_UINT64) {
+ error = SET_ERROR(EINVAL);
+ break;
+ }
+
+ if (nvpair_value_uint64(elem, &intval) != 0) {
+ error = SET_ERROR(EINVAL);
+ break;
+ }
+
+ if (intval != 0) {
+ error = SET_ERROR(EINVAL);
+ break;
+ }
+
+ fname = strchr(propname, '@') + 1;
+ if (zfeature_lookup_name(fname, NULL) != 0) {
+ error = SET_ERROR(EINVAL);
+ break;
+ }
+
+ has_feature = B_TRUE;
+ break;
+
+ case ZPOOL_PROP_VERSION:
+ error = nvpair_value_uint64(elem, &intval);
+ if (!error &&
+ (intval < spa_version(spa) ||
+ intval > SPA_VERSION_BEFORE_FEATURES ||
+ has_feature))
+ error = SET_ERROR(EINVAL);
+ break;
+
+ case ZPOOL_PROP_DELEGATION:
+ case ZPOOL_PROP_AUTOREPLACE:
+ case ZPOOL_PROP_LISTSNAPS:
+ case ZPOOL_PROP_AUTOEXPAND:
+ error = nvpair_value_uint64(elem, &intval);
+ if (!error && intval > 1)
+ error = SET_ERROR(EINVAL);
+ break;
+
+ case ZPOOL_PROP_MULTIHOST:
+ error = nvpair_value_uint64(elem, &intval);
+ if (!error && intval > 1)
+ error = SET_ERROR(EINVAL);
+
+ if (!error && !spa_get_hostid())
+ error = SET_ERROR(ENOTSUP);
+
+ break;
+
+ case ZPOOL_PROP_BOOTFS:
+ /*
+ * If the pool version is less than SPA_VERSION_BOOTFS,
+ * or the pool is still being created (version == 0),
+ * the bootfs property cannot be set.
+ */
+ if (spa_version(spa) < SPA_VERSION_BOOTFS) {
+ error = SET_ERROR(ENOTSUP);
+ break;
+ }
+
+ /*
+ * Make sure the vdev config is bootable
+ */
+ if (!vdev_is_bootable(spa->spa_root_vdev)) {
+ error = SET_ERROR(ENOTSUP);
+ break;
+ }
+
+ reset_bootfs = 1;
+
+ error = nvpair_value_string(elem, &strval);
+
+ if (!error) {
+ objset_t *os;
+ uint64_t propval;
+
+ if (strval == NULL || strval[0] == '\0') {
+ objnum = zpool_prop_default_numeric(
+ ZPOOL_PROP_BOOTFS);
+ break;
+ }
+
+ error = dmu_objset_hold(strval, FTAG, &os);
+ if (error != 0)
+ break;
+
+ /*
+ * Must be ZPL, and its property settings
+ * must be supported.
+ */
+
+ if (dmu_objset_type(os) != DMU_OST_ZFS) {
+ error = SET_ERROR(ENOTSUP);
+ } else if ((error =
+ dsl_prop_get_int_ds(dmu_objset_ds(os),
+ zfs_prop_to_name(ZFS_PROP_COMPRESSION),
+ &propval)) == 0 &&
+ !BOOTFS_COMPRESS_VALID(propval)) {
+ error = SET_ERROR(ENOTSUP);
+ } else {
+ objnum = dmu_objset_id(os);
+ }
+ dmu_objset_rele(os, FTAG);
+ }
+ break;
+
+ case ZPOOL_PROP_FAILUREMODE:
+ error = nvpair_value_uint64(elem, &intval);
+ if (!error && (intval < ZIO_FAILURE_MODE_WAIT ||
+ intval > ZIO_FAILURE_MODE_PANIC))
+ error = SET_ERROR(EINVAL);
+
+ /*
+ * This is a special case which only occurs when
+ * the pool has completely failed. This allows
+ * the user to change the in-core failmode property
+ * without syncing it out to disk (I/Os might
+ * currently be blocked). We do this by returning
+ * EIO to the caller (spa_prop_set) to trick it
+ * into thinking we encountered a property validation
+ * error.
+ */
+ if (!error && spa_suspended(spa)) {
+ spa->spa_failmode = intval;
+ error = SET_ERROR(EIO);
+ }
+ break;
+
+ case ZPOOL_PROP_CACHEFILE:
+ if ((error = nvpair_value_string(elem, &strval)) != 0)
+ break;
+
+ if (strval[0] == '\0')
+ break;
+
+ if (strcmp(strval, "none") == 0)
+ break;
+
+ if (strval[0] != '/') {
+ error = SET_ERROR(EINVAL);
+ break;
+ }
+
+ slash = strrchr(strval, '/');
+ ASSERT(slash != NULL);
+
+ if (slash[1] == '\0' || strcmp(slash, "/.") == 0 ||
+ strcmp(slash, "/..") == 0)
+ error = SET_ERROR(EINVAL);
+ break;
+
+ case ZPOOL_PROP_COMMENT:
+ if ((error = nvpair_value_string(elem, &strval)) != 0)
+ break;
+ for (check = strval; *check != '\0'; check++) {
+ /*
+ * The kernel doesn't have an easy isprint()
+ * check. For this kernel check, we merely
+ * check ASCII apart from DEL. Fix this if
+ * there is an easy-to-use kernel isprint().
+ */
+ if (*check >= 0x7f) {
+ error = SET_ERROR(EINVAL);
+ break;
+ }
+ }
+ if (strlen(strval) > ZPROP_MAX_COMMENT)
+ error = E2BIG;
+ break;
+
+ case ZPOOL_PROP_DEDUPDITTO:
+ if (spa_version(spa) < SPA_VERSION_DEDUP)
+ error = SET_ERROR(ENOTSUP);
+ else
+ error = nvpair_value_uint64(elem, &intval);
+ if (error == 0 &&
+ intval != 0 && intval < ZIO_DEDUPDITTO_MIN)
+ error = SET_ERROR(EINVAL);
+ break;
+ }
+
+ if (error)
+ break;
+ }
+
+ if (!error && reset_bootfs) {
+ error = nvlist_remove(props,
+ zpool_prop_to_name(ZPOOL_PROP_BOOTFS), DATA_TYPE_STRING);
+
+ if (!error) {
+ error = nvlist_add_uint64(props,
+ zpool_prop_to_name(ZPOOL_PROP_BOOTFS), objnum);
+ }
+ }
+
+ return (error);
+}
+
+void
+spa_configfile_set(spa_t *spa, nvlist_t *nvp, boolean_t need_sync)
+{
+ char *cachefile;
+ spa_config_dirent_t *dp;
+
+ if (nvlist_lookup_string(nvp, zpool_prop_to_name(ZPOOL_PROP_CACHEFILE),
+ &cachefile) != 0)
+ return;
+
+ dp = kmem_alloc(sizeof (spa_config_dirent_t),
+ KM_SLEEP);
+
+ if (cachefile[0] == '\0')
+ dp->scd_path = spa_strdup(spa_config_path);
+ else if (strcmp(cachefile, "none") == 0)
+ dp->scd_path = NULL;
+ else
+ dp->scd_path = spa_strdup(cachefile);
+
+ list_insert_head(&spa->spa_config_list, dp);
+ if (need_sync)
+ spa_async_request(spa, SPA_ASYNC_CONFIG_UPDATE);
+}
+
+int
+spa_prop_set(spa_t *spa, nvlist_t *nvp)
+{
+ int error;
+ nvpair_t *elem = NULL;
+ boolean_t need_sync = B_FALSE;
+
+ if ((error = spa_prop_validate(spa, nvp)) != 0)
+ return (error);
+
+ while ((elem = nvlist_next_nvpair(nvp, elem)) != NULL) {
+ zpool_prop_t prop = zpool_name_to_prop(nvpair_name(elem));
+
+ if (prop == ZPOOL_PROP_CACHEFILE ||
+ prop == ZPOOL_PROP_ALTROOT ||
+ prop == ZPOOL_PROP_READONLY)
+ continue;
+
+ if (prop == ZPOOL_PROP_VERSION || prop == ZPOOL_PROP_INVAL) {
+ uint64_t ver;
+
+ if (prop == ZPOOL_PROP_VERSION) {
+ VERIFY(nvpair_value_uint64(elem, &ver) == 0);
+ } else {
+ ASSERT(zpool_prop_feature(nvpair_name(elem)));
+ ver = SPA_VERSION_FEATURES;
+ need_sync = B_TRUE;
+ }
+
+ /* Save time if the version is already set. */
+ if (ver == spa_version(spa))
+ continue;
+
+ /*
+ * In addition to the pool directory object, we might
+ * create the pool properties object, the features for
+ * read object, the features for write object, or the
+ * feature descriptions object.
+ */
+ error = dsl_sync_task(spa->spa_name, NULL,
+ spa_sync_version, &ver,
+ 6, ZFS_SPACE_CHECK_RESERVED);
+ if (error)
+ return (error);
+ continue;
+ }
+
+ need_sync = B_TRUE;
+ break;
+ }
+
+ if (need_sync) {
+ return (dsl_sync_task(spa->spa_name, NULL, spa_sync_props,
+ nvp, 6, ZFS_SPACE_CHECK_RESERVED));
+ }
+
+ return (0);
+}
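+
+/*
+ * Caller sketch (illustrative; autoexpand chosen arbitrarily, error
+ * handling elided):
+ *
+ *	nvlist_t *props;
+ *
+ *	VERIFY(nvlist_alloc(&props, NV_UNIQUE_NAME, KM_SLEEP) == 0);
+ *	VERIFY(nvlist_add_uint64(props,
+ *	    zpool_prop_to_name(ZPOOL_PROP_AUTOEXPAND), 1) == 0);
+ *	error = spa_prop_set(spa, props);
+ *	nvlist_free(props);
+ */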
+
+/*
+ * If the bootfs property value is dsobj, clear it.
+ */
+void
+spa_prop_clear_bootfs(spa_t *spa, uint64_t dsobj, dmu_tx_t *tx)
+{
+ if (spa->spa_bootfs == dsobj && spa->spa_pool_props_object != 0) {
+ VERIFY(zap_remove(spa->spa_meta_objset,
+ spa->spa_pool_props_object,
+ zpool_prop_to_name(ZPOOL_PROP_BOOTFS), tx) == 0);
+ spa->spa_bootfs = 0;
+ }
+}
+
+/*ARGSUSED*/
+static int
+spa_change_guid_check(void *arg, dmu_tx_t *tx)
+{
+ uint64_t *newguid = arg;
+ spa_t *spa = dmu_tx_pool(tx)->dp_spa;
+ vdev_t *rvd = spa->spa_root_vdev;
+ uint64_t vdev_state;
+
+ if (spa_feature_is_active(spa, SPA_FEATURE_POOL_CHECKPOINT)) {
+ int error = (spa_has_checkpoint(spa)) ?
+ ZFS_ERR_CHECKPOINT_EXISTS : ZFS_ERR_DISCARDING_CHECKPOINT;
+ return (SET_ERROR(error));
+ }
+
+ spa_config_enter(spa, SCL_STATE, FTAG, RW_READER);
+ vdev_state = rvd->vdev_state;
+ spa_config_exit(spa, SCL_STATE, FTAG);
+
+ if (vdev_state != VDEV_STATE_HEALTHY)
+ return (SET_ERROR(ENXIO));
+
+ ASSERT3U(spa_guid(spa), !=, *newguid);
+
+ return (0);
+}
+
+static void
+spa_change_guid_sync(void *arg, dmu_tx_t *tx)
+{
+ uint64_t *newguid = arg;
+ spa_t *spa = dmu_tx_pool(tx)->dp_spa;
+ uint64_t oldguid;
+ vdev_t *rvd = spa->spa_root_vdev;
+
+ oldguid = spa_guid(spa);
+
+ spa_config_enter(spa, SCL_STATE, FTAG, RW_READER);
+ rvd->vdev_guid = *newguid;
+ rvd->vdev_guid_sum += (*newguid - oldguid);
+ vdev_config_dirty(rvd);
+ spa_config_exit(spa, SCL_STATE, FTAG);
+
+ spa_history_log_internal(spa, "guid change", tx, "old=%llu new=%llu",
+ oldguid, *newguid);
+}
+
+/*
+ * Change the GUID for the pool. This is done so that we can later
+ * re-import a pool built from a clone of our own vdevs. We will modify
+ * the root vdev's guid, our own pool guid, and then mark all of our
+ * vdevs dirty. Note that we must make sure that all our vdevs are
+ * online when we do this, or else any vdevs that weren't present
+ * would be orphaned from our pool. We are also going to issue a
+ * sysevent to update any watchers.
+ */
+int
+spa_change_guid(spa_t *spa)
+{
+ int error;
+ uint64_t guid;
+
+ mutex_enter(&spa->spa_vdev_top_lock);
+ mutex_enter(&spa_namespace_lock);
+ guid = spa_generate_guid(NULL);
+
+ error = dsl_sync_task(spa->spa_name, spa_change_guid_check,
+ spa_change_guid_sync, &guid, 5, ZFS_SPACE_CHECK_RESERVED);
+
+ if (error == 0) {
+ spa_write_cachefile(spa, B_FALSE, B_TRUE);
+ spa_event_notify(spa, NULL, NULL, ESC_ZFS_POOL_REGUID);
+ }
+
+ mutex_exit(&spa_namespace_lock);
+ mutex_exit(&spa->spa_vdev_top_lock);
+
+ return (error);
+}
+
+/*
+ * ==========================================================================
+ * SPA state manipulation (open/create/destroy/import/export)
+ * ==========================================================================
+ */
+
+static int
+spa_error_entry_compare(const void *a, const void *b)
+{
+ const spa_error_entry_t *sa = (const spa_error_entry_t *)a;
+ const spa_error_entry_t *sb = (const spa_error_entry_t *)b;
+ int ret;
+
+ ret = memcmp(&sa->se_bookmark, &sb->se_bookmark,
+ sizeof (zbookmark_phys_t));
+
+ return (AVL_ISIGN(ret));
+}
+
+/*
+ * Utility function which retrieves copies of the current logs and
+ * re-initializes them in the process.
+ */
+void
+spa_get_errlists(spa_t *spa, avl_tree_t *last, avl_tree_t *scrub)
+{
+ ASSERT(MUTEX_HELD(&spa->spa_errlist_lock));
+
+ bcopy(&spa->spa_errlist_last, last, sizeof (avl_tree_t));
+ bcopy(&spa->spa_errlist_scrub, scrub, sizeof (avl_tree_t));
+
+ avl_create(&spa->spa_errlist_scrub,
+ spa_error_entry_compare, sizeof (spa_error_entry_t),
+ offsetof(spa_error_entry_t, se_avl));
+ avl_create(&spa->spa_errlist_last,
+ spa_error_entry_compare, sizeof (spa_error_entry_t),
+ offsetof(spa_error_entry_t, se_avl));
+}
+
+static void
+spa_taskqs_init(spa_t *spa, zio_type_t t, zio_taskq_type_t q)
+{
+ const zio_taskq_info_t *ztip = &zio_taskqs[t][q];
+ enum zti_modes mode = ztip->zti_mode;
+ uint_t value = ztip->zti_value;
+ uint_t count = ztip->zti_count;
+ spa_taskqs_t *tqs = &spa->spa_zio_taskq[t][q];
+ char name[32];
+ uint_t flags = 0;
+ boolean_t batch = B_FALSE;
+
+ if (mode == ZTI_MODE_NULL) {
+ tqs->stqs_count = 0;
+ tqs->stqs_taskq = NULL;
+ return;
+ }
+
+ ASSERT3U(count, >, 0);
+
+ tqs->stqs_count = count;
+ tqs->stqs_taskq = kmem_alloc(count * sizeof (taskq_t *), KM_SLEEP);
+
+ switch (mode) {
+ case ZTI_MODE_FIXED:
+ ASSERT3U(value, >=, 1);
+ value = MAX(value, 1);
+ break;
+
+ case ZTI_MODE_BATCH:
+ batch = B_TRUE;
+ flags |= TASKQ_THREADS_CPU_PCT;
+ value = zio_taskq_batch_pct;
+ break;
+
+ default:
+ panic("unrecognized mode for %s_%s taskq (%u:%u) in "
+ "spa_activate()",
+ zio_type_name[t], zio_taskq_types[q], mode, value);
+ break;
+ }
+
+ for (uint_t i = 0; i < count; i++) {
+ taskq_t *tq;
+
+ if (count > 1) {
+ (void) snprintf(name, sizeof (name), "%s_%s_%u",
+ zio_type_name[t], zio_taskq_types[q], i);
+ } else {
+ (void) snprintf(name, sizeof (name), "%s_%s",
+ zio_type_name[t], zio_taskq_types[q]);
+ }
+
+#ifdef SYSDC
+ if (zio_taskq_sysdc && spa->spa_proc != &p0) {
+ if (batch)
+ flags |= TASKQ_DC_BATCH;
+
+ tq = taskq_create_sysdc(name, value, 50, INT_MAX,
+ spa->spa_proc, zio_taskq_basedc, flags);
+ } else {
+#endif
+ pri_t pri = maxclsyspri;
+ /*
+ * The write issue taskq can be extremely CPU
+ * intensive. Run it at slightly lower priority
+ * than the other taskqs.
+ * FreeBSD notes:
+ * - numerically higher priorities are lower priorities;
+ * - if priorities divided by four (RQ_PPQ) are equal
+ * then a difference between them is insignificant.
+ */
+ if (t == ZIO_TYPE_WRITE && q == ZIO_TASKQ_ISSUE)
+#ifdef illumos
+ pri--;
+#else
+ pri += 4;
+#endif
+
+ tq = taskq_create_proc(name, value, pri, 50,
+ INT_MAX, spa->spa_proc, flags);
+#ifdef SYSDC
+ }
+#endif
+
+ tqs->stqs_taskq[i] = tq;
+ }
+}
+
+static void
+spa_taskqs_fini(spa_t *spa, zio_type_t t, zio_taskq_type_t q)
+{
+ spa_taskqs_t *tqs = &spa->spa_zio_taskq[t][q];
+
+ if (tqs->stqs_taskq == NULL) {
+ ASSERT0(tqs->stqs_count);
+ return;
+ }
+
+ for (uint_t i = 0; i < tqs->stqs_count; i++) {
+ ASSERT3P(tqs->stqs_taskq[i], !=, NULL);
+ taskq_destroy(tqs->stqs_taskq[i]);
+ }
+
+ kmem_free(tqs->stqs_taskq, tqs->stqs_count * sizeof (taskq_t *));
+ tqs->stqs_taskq = NULL;
+}
+
+/*
+ * Dispatch a task to the appropriate taskq for the ZFS I/O type and priority.
+ * Note that a type may have multiple discrete taskqs to avoid lock contention
+ * on the taskq itself. In that case we choose a taskq at random using
+ * the low bits of gethrtime().
+ */
+void
+spa_taskq_dispatch_ent(spa_t *spa, zio_type_t t, zio_taskq_type_t q,
+ task_func_t *func, void *arg, uint_t flags, taskq_ent_t *ent)
+{
+ spa_taskqs_t *tqs = &spa->spa_zio_taskq[t][q];
+ taskq_t *tq;
+
+ ASSERT3P(tqs->stqs_taskq, !=, NULL);
+ ASSERT3U(tqs->stqs_count, !=, 0);
+
+ if (tqs->stqs_count == 1) {
+ tq = tqs->stqs_taskq[0];
+ } else {
+#ifdef _KERNEL
+ tq = tqs->stqs_taskq[(u_int)(sbinuptime() + curcpu) %
+ tqs->stqs_count];
+#else
+ tq = tqs->stqs_taskq[gethrtime() % tqs->stqs_count];
+#endif
+ }
+
+ taskq_dispatch_ent(tq, func, arg, flags, ent);
+}
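+
+/*
+ * Dispatch sketch: the taskq_ent_t must outlive the task, so it normally
+ * lives in the dispatched object itself (e.g. a zio's io_tqent);
+ * "my_func"/"my_arg" are placeholders.
+ *
+ *	spa_taskq_dispatch_ent(spa, ZIO_TYPE_READ, ZIO_TASKQ_ISSUE,
+ *	    my_func, my_arg, TQ_SLEEP, &ent);
+ */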
+
+static void
+spa_create_zio_taskqs(spa_t *spa)
+{
+ for (int t = 0; t < ZIO_TYPES; t++) {
+ for (int q = 0; q < ZIO_TASKQ_TYPES; q++) {
+ spa_taskqs_init(spa, t, q);
+ }
+ }
+}
+
+#ifdef SPA_PROCESS
+static int
+newproc(void (*pc)(void *), void *arg, id_t cid, int pri,
+ void **ct, pid_t pid)
+{
+ spa_t *spa = (spa_t *)arg; /* XXX */
+ struct proc *newp;
+ struct thread *td;
+ int error;
+
+ ASSERT(ct == NULL);
+ ASSERT(pid == 0);
+ ASSERT(cid == syscid);
+
+ error = kproc_create(pc, arg, &newp, 0, 0, "zpool-%s", spa->spa_name);
+ if (error != 0)
+ return (error);
+ td = FIRST_THREAD_IN_PROC(newp);
+ thread_lock(td);
+ sched_prio(td, pri);
+ thread_unlock(td);
+ return (0);
+}
+
+static void
+spa_thread(void *arg)
+{
+ callb_cpr_t cprinfo;
+
+ spa_t *spa = arg;
+#ifdef illumos
+ user_t *pu = PTOU(curproc);
+#endif
+ CALLB_CPR_INIT(&cprinfo, &spa->spa_proc_lock, callb_generic_cpr,
+ spa->spa_name);
+
+ ASSERT(curproc != &p0);
+#ifdef illumos
+ (void) snprintf(pu->u_psargs, sizeof (pu->u_psargs),
+ "zpool-%s", spa->spa_name);
+ (void) strlcpy(pu->u_comm, pu->u_psargs, sizeof (pu->u_comm));
+#endif
+
+#ifdef PSRSET_BIND
+ /* bind this thread to the requested psrset */
+ if (zio_taskq_psrset_bind != PS_NONE) {
+ pool_lock();
+ mutex_enter(&cpu_lock);
+ mutex_enter(&pidlock);
+ mutex_enter(&curproc->p_lock);
+
+ if (cpupart_bind_thread(curthread, zio_taskq_psrset_bind,
+ 0, NULL, NULL) == 0) {
+ curthread->t_bind_pset = zio_taskq_psrset_bind;
+ } else {
+ cmn_err(CE_WARN,
+ "Couldn't bind process for zfs pool \"%s\" to "
+ "pset %d\n", spa->spa_name, zio_taskq_psrset_bind);
+ }
+
+ mutex_exit(&curproc->p_lock);
+ mutex_exit(&pidlock);
+ mutex_exit(&cpu_lock);
+ pool_unlock();
+ }
+#endif
+
+#ifdef SYSDC
+ if (zio_taskq_sysdc) {
+ sysdc_thread_enter(curthread, 100, 0);
+ }
+#endif
+
+ spa->spa_proc = curproc;
+ spa->spa_did = curthread->t_did;
+
+ spa_create_zio_taskqs(spa);
+
+ mutex_enter(&spa->spa_proc_lock);
+ ASSERT(spa->spa_proc_state == SPA_PROC_CREATED);
+
+ spa->spa_proc_state = SPA_PROC_ACTIVE;
+ cv_broadcast(&spa->spa_proc_cv);
+
+ CALLB_CPR_SAFE_BEGIN(&cprinfo);
+ while (spa->spa_proc_state == SPA_PROC_ACTIVE)
+ cv_wait(&spa->spa_proc_cv, &spa->spa_proc_lock);
+ CALLB_CPR_SAFE_END(&cprinfo, &spa->spa_proc_lock);
+
+ ASSERT(spa->spa_proc_state == SPA_PROC_DEACTIVATE);
+ spa->spa_proc_state = SPA_PROC_GONE;
+ spa->spa_proc = &p0;
+ cv_broadcast(&spa->spa_proc_cv);
+ CALLB_CPR_EXIT(&cprinfo); /* drops spa_proc_lock */
+
+#ifdef illumos
+ mutex_enter(&curproc->p_lock);
+ lwp_exit();
+#else
+ kthread_exit();
+#endif
+}
+#endif /* SPA_PROCESS */
+
+/*
+ * Activate an uninitialized pool.
+ */
+static void
+spa_activate(spa_t *spa, int mode)
+{
+ ASSERT(spa->spa_state == POOL_STATE_UNINITIALIZED);
+
+ spa->spa_state = POOL_STATE_ACTIVE;
+ spa->spa_mode = mode;
+
+ spa->spa_normal_class = metaslab_class_create(spa, zfs_metaslab_ops);
+ spa->spa_log_class = metaslab_class_create(spa, zfs_metaslab_ops);
+ spa->spa_special_class = metaslab_class_create(spa, zfs_metaslab_ops);
+ spa->spa_dedup_class = metaslab_class_create(spa, zfs_metaslab_ops);
+
+ /* Try to create a covering process */
+ mutex_enter(&spa->spa_proc_lock);
+ ASSERT(spa->spa_proc_state == SPA_PROC_NONE);
+ ASSERT(spa->spa_proc == &p0);
+ spa->spa_did = 0;
+
+#ifdef SPA_PROCESS
+ /* Only create a process if we're going to be around a while. */
+ if (spa_create_process && strcmp(spa->spa_name, TRYIMPORT_NAME) != 0) {
+ if (newproc(spa_thread, (caddr_t)spa, syscid, maxclsyspri,
+ NULL, 0) == 0) {
+ spa->spa_proc_state = SPA_PROC_CREATED;
+ while (spa->spa_proc_state == SPA_PROC_CREATED) {
+ cv_wait(&spa->spa_proc_cv,
+ &spa->spa_proc_lock);
+ }
+ ASSERT(spa->spa_proc_state == SPA_PROC_ACTIVE);
+ ASSERT(spa->spa_proc != &p0);
+ ASSERT(spa->spa_did != 0);
+ } else {
+#ifdef _KERNEL
+ cmn_err(CE_WARN,
+ "Couldn't create process for zfs pool \"%s\"\n",
+ spa->spa_name);
+#endif
+ }
+ }
+#endif /* SPA_PROCESS */
+ mutex_exit(&spa->spa_proc_lock);
+
+ /* If we didn't create a process, we need to create our taskqs. */
+#ifndef SPA_PROCESS
+ ASSERT(spa->spa_proc == &p0);
+#endif /* SPA_PROCESS */
+ if (spa->spa_proc == &p0) {
+ spa_create_zio_taskqs(spa);
+ }
+
+ /*
+ * Start TRIM thread.
+ */
+ trim_thread_create(spa);
+
+ /*
+ * This taskq is used to perform zvol-minor-related tasks
+ * asynchronously. This has several advantages, including easy
+ * resolution of various deadlocks (zfsonlinux bug #3681).
+ *
+ * The taskq must be single threaded to ensure tasks are always
+ * processed in the order in which they were dispatched.
+ *
+ * A taskq per pool allows one to keep the pools independent.
+ * This way if one pool is suspended, it will not impact another.
+ *
+ * The preferred location to dispatch a zvol minor task is a sync
+ * task. In this context, there is easy access to the spa_t and minimal
+ * error handling is required because the sync task must succeed.
+ */
+ spa->spa_zvol_taskq = taskq_create("z_zvol", 1, minclsyspri,
+ 1, INT_MAX, 0);
+
+ for (size_t i = 0; i < TXG_SIZE; i++) {
+ spa->spa_txg_zio[i] = zio_root(spa, NULL, NULL,
+ ZIO_FLAG_CANFAIL);
+ }
+
+ list_create(&spa->spa_config_dirty_list, sizeof (vdev_t),
+ offsetof(vdev_t, vdev_config_dirty_node));
+ list_create(&spa->spa_evicting_os_list, sizeof (objset_t),
+ offsetof(objset_t, os_evicting_node));
+ list_create(&spa->spa_state_dirty_list, sizeof (vdev_t),
+ offsetof(vdev_t, vdev_state_dirty_node));
+
+ txg_list_create(&spa->spa_vdev_txg_list, spa,
+ offsetof(struct vdev, vdev_txg_node));
+
+ avl_create(&spa->spa_errlist_scrub,
+ spa_error_entry_compare, sizeof (spa_error_entry_t),
+ offsetof(spa_error_entry_t, se_avl));
+ avl_create(&spa->spa_errlist_last,
+ spa_error_entry_compare, sizeof (spa_error_entry_t),
+ offsetof(spa_error_entry_t, se_avl));
+}
+
+/*
+ * Opposite of spa_activate().
+ */
+static void
+spa_deactivate(spa_t *spa)
+{
+ ASSERT(spa->spa_sync_on == B_FALSE);
+ ASSERT(spa->spa_dsl_pool == NULL);
+ ASSERT(spa->spa_root_vdev == NULL);
+ ASSERT(spa->spa_async_zio_root == NULL);
+ ASSERT(spa->spa_state != POOL_STATE_UNINITIALIZED);
+
+ /*
+ * Stop TRIM thread in case spa_unload() wasn't called directly
+ * before spa_deactivate().
+ */
+ trim_thread_destroy(spa);
+
+ spa_evicting_os_wait(spa);
+
+ if (spa->spa_zvol_taskq) {
+ taskq_destroy(spa->spa_zvol_taskq);
+ spa->spa_zvol_taskq = NULL;
+ }
+
+ txg_list_destroy(&spa->spa_vdev_txg_list);
+
+ list_destroy(&spa->spa_config_dirty_list);
+ list_destroy(&spa->spa_evicting_os_list);
+ list_destroy(&spa->spa_state_dirty_list);
+
+ for (int t = 0; t < ZIO_TYPES; t++) {
+ for (int q = 0; q < ZIO_TASKQ_TYPES; q++) {
+ spa_taskqs_fini(spa, t, q);
+ }
+ }
+
+ for (size_t i = 0; i < TXG_SIZE; i++) {
+ ASSERT3P(spa->spa_txg_zio[i], !=, NULL);
+ VERIFY0(zio_wait(spa->spa_txg_zio[i]));
+ spa->spa_txg_zio[i] = NULL;
+ }
+
+ metaslab_class_destroy(spa->spa_normal_class);
+ spa->spa_normal_class = NULL;
+
+ metaslab_class_destroy(spa->spa_log_class);
+ spa->spa_log_class = NULL;
+
+ metaslab_class_destroy(spa->spa_special_class);
+ spa->spa_special_class = NULL;
+
+ metaslab_class_destroy(spa->spa_dedup_class);
+ spa->spa_dedup_class = NULL;
+
+ /*
+ * If this was part of an import or the open otherwise failed, we may
+ * still have errors left in the queues. Empty them just in case.
+ */
+ spa_errlog_drain(spa);
+
+ avl_destroy(&spa->spa_errlist_scrub);
+ avl_destroy(&spa->spa_errlist_last);
+
+ spa->spa_state = POOL_STATE_UNINITIALIZED;
+
+ mutex_enter(&spa->spa_proc_lock);
+ if (spa->spa_proc_state != SPA_PROC_NONE) {
+ ASSERT(spa->spa_proc_state == SPA_PROC_ACTIVE);
+ spa->spa_proc_state = SPA_PROC_DEACTIVATE;
+ cv_broadcast(&spa->spa_proc_cv);
+ while (spa->spa_proc_state == SPA_PROC_DEACTIVATE) {
+ ASSERT(spa->spa_proc != &p0);
+ cv_wait(&spa->spa_proc_cv, &spa->spa_proc_lock);
+ }
+ ASSERT(spa->spa_proc_state == SPA_PROC_GONE);
+ spa->spa_proc_state = SPA_PROC_NONE;
+ }
+ ASSERT(spa->spa_proc == &p0);
+ mutex_exit(&spa->spa_proc_lock);
+
+#ifdef SPA_PROCESS
+#ifdef illumos
+ /*
+ * We want to make sure spa_thread() has actually exited the ZFS
+ * module, so that the module can't be unloaded out from underneath
+ * it.
+ */
+ if (spa->spa_did != 0) {
+ thread_join(spa->spa_did);
+ spa->spa_did = 0;
+ }
+#endif
+#endif /* SPA_PROCESS */
+}
+
+/*
+ * Verify a pool configuration, and construct the vdev tree appropriately. This
+ * will create all the necessary vdevs in the appropriate layout, with each vdev
+ * in the CLOSED state. This will prep the pool before open/creation/import.
+ * All vdev validation is done by the vdev_alloc() routine.
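+ *
+ * As a sketch, the config nvlist for a two-disk mirror nests roughly as
+ * follows (only the keys relevant to parsing are shown):
+ *
+ *     type=root
+ *         children[0]: type=mirror
+ *             children[0]: type=disk
+ *             children[1]: type=disk
+ *
+ * Each nvlist level is allocated as one vdev_t, and ZPOOL_CONFIG_CHILDREN
+ * is recursed into until a leaf vdev is reached.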
+ */
+static int
+spa_config_parse(spa_t *spa, vdev_t **vdp, nvlist_t *nv, vdev_t *parent,
+ uint_t id, int atype)
+{
+ nvlist_t **child;
+ uint_t children;
+ int error;
+
+ if ((error = vdev_alloc(spa, vdp, nv, parent, id, atype)) != 0)
+ return (error);
+
+ if ((*vdp)->vdev_ops->vdev_op_leaf)
+ return (0);
+
+ error = nvlist_lookup_nvlist_array(nv, ZPOOL_CONFIG_CHILDREN,
+ &child, &children);
+
+ if (error == ENOENT)
+ return (0);
+
+ if (error) {
+ vdev_free(*vdp);
+ *vdp = NULL;
+ return (SET_ERROR(EINVAL));
+ }
+
+ for (int c = 0; c < children; c++) {
+ vdev_t *vd;
+ if ((error = spa_config_parse(spa, &vd, child[c], *vdp, c,
+ atype)) != 0) {
+ vdev_free(*vdp);
+ *vdp = NULL;
+ return (error);
+ }
+ }
+
+ ASSERT(*vdp != NULL);
+
+ return (0);
+}
+
+/*
+ * Opposite of spa_load().
+ */
+static void
+spa_unload(spa_t *spa)
+{
+ int i;
+
+ ASSERT(MUTEX_HELD(&spa_namespace_lock));
+
+ spa_load_note(spa, "UNLOADING");
+
+ /*
+ * Stop TRIM thread.
+ */
+ trim_thread_destroy(spa);
+
+ /*
+ * Stop async tasks.
+ */
+ spa_async_suspend(spa);
+
+ if (spa->spa_root_vdev) {
+ vdev_initialize_stop_all(spa->spa_root_vdev,
+ VDEV_INITIALIZE_ACTIVE);
+ }
+
+ /*
+ * Stop syncing.
+ */
+ if (spa->spa_sync_on) {
+ txg_sync_stop(spa->spa_dsl_pool);
+ spa->spa_sync_on = B_FALSE;
+ }
+
+ /*
+ * Even though vdev_free() also calls vdev_metaslab_fini(), we need
+ * to call it earlier, before we wait for async I/O to complete.
+ * This ensures that there is no async metaslab prefetching
+ * (vdev_metaslab_fini() calls taskq_wait(mg_taskq)).
+ */
+ if (spa->spa_root_vdev != NULL) {
+ spa_config_enter(spa, SCL_ALL, spa, RW_WRITER);
+ for (int c = 0; c < spa->spa_root_vdev->vdev_children; c++)
+ vdev_metaslab_fini(spa->spa_root_vdev->vdev_child[c]);
+ spa_config_exit(spa, SCL_ALL, spa);
+ }
+
+ if (spa->spa_mmp.mmp_thread)
+ mmp_thread_stop(spa);
+
+ /*
+ * Wait for any outstanding async I/O to complete.
+ */
+ if (spa->spa_async_zio_root != NULL) {
+ for (int i = 0; i < max_ncpus; i++)
+ (void) zio_wait(spa->spa_async_zio_root[i]);
+ kmem_free(spa->spa_async_zio_root, max_ncpus * sizeof (void *));
+ spa->spa_async_zio_root = NULL;
+ }
+
+ if (spa->spa_vdev_removal != NULL) {
+ spa_vdev_removal_destroy(spa->spa_vdev_removal);
+ spa->spa_vdev_removal = NULL;
+ }
+
+ if (spa->spa_condense_zthr != NULL) {
+ zthr_destroy(spa->spa_condense_zthr);
+ spa->spa_condense_zthr = NULL;
+ }
+
+ if (spa->spa_checkpoint_discard_zthr != NULL) {
+ zthr_destroy(spa->spa_checkpoint_discard_zthr);
+ spa->spa_checkpoint_discard_zthr = NULL;
+ }
+
+ spa_condense_fini(spa);
+
+ bpobj_close(&spa->spa_deferred_bpobj);
+
+ spa_config_enter(spa, SCL_ALL, spa, RW_WRITER);
+
+ /*
+ * Close all vdevs.
+ */
+ if (spa->spa_root_vdev)
+ vdev_free(spa->spa_root_vdev);
+ ASSERT(spa->spa_root_vdev == NULL);
+
+ /*
+ * Close the dsl pool.
+ */
+ if (spa->spa_dsl_pool) {
+ dsl_pool_close(spa->spa_dsl_pool);
+ spa->spa_dsl_pool = NULL;
+ spa->spa_meta_objset = NULL;
+ }
+
+ ddt_unload(spa);
+
+ /*
+ * Drop and purge level 2 cache
+ */
+ spa_l2cache_drop(spa);
+
+ for (i = 0; i < spa->spa_spares.sav_count; i++)
+ vdev_free(spa->spa_spares.sav_vdevs[i]);
+ if (spa->spa_spares.sav_vdevs) {
+ kmem_free(spa->spa_spares.sav_vdevs,
+ spa->spa_spares.sav_count * sizeof (void *));
+ spa->spa_spares.sav_vdevs = NULL;
+ }
+ if (spa->spa_spares.sav_config) {
+ nvlist_free(spa->spa_spares.sav_config);
+ spa->spa_spares.sav_config = NULL;
+ }
+ spa->spa_spares.sav_count = 0;
+
+ for (i = 0; i < spa->spa_l2cache.sav_count; i++) {
+ vdev_clear_stats(spa->spa_l2cache.sav_vdevs[i]);
+ vdev_free(spa->spa_l2cache.sav_vdevs[i]);
+ }
+ if (spa->spa_l2cache.sav_vdevs) {
+ kmem_free(spa->spa_l2cache.sav_vdevs,
+ spa->spa_l2cache.sav_count * sizeof (void *));
+ spa->spa_l2cache.sav_vdevs = NULL;
+ }
+ if (spa->spa_l2cache.sav_config) {
+ nvlist_free(spa->spa_l2cache.sav_config);
+ spa->spa_l2cache.sav_config = NULL;
+ }
+ spa->spa_l2cache.sav_count = 0;
+
+ spa->spa_async_suspended = 0;
+
+ spa->spa_indirect_vdevs_loaded = B_FALSE;
+
+ if (spa->spa_comment != NULL) {
+ spa_strfree(spa->spa_comment);
+ spa->spa_comment = NULL;
+ }
+
+ spa_config_exit(spa, SCL_ALL, spa);
+}
+
+/*
+ * Load (or re-load) the current list of vdevs describing the active spares for
+ * this pool. When this is called, we have some form of basic information in
+ * 'spa_spares.sav_config'. We parse this into vdevs, try to open them, and
+ * then re-generate a more complete list including status information.
+ */
+void
+spa_load_spares(spa_t *spa)
+{
+ nvlist_t **spares;
+ uint_t nspares;
+ int i;
+ vdev_t *vd, *tvd;
+
+#ifndef _KERNEL
+ /*
+ * zdb opens both the current state of the pool and the
+ * checkpointed state (if present), with a different spa_t.
+ *
+ * As spare vdevs are shared among open pools, we skip loading
+ * them when we load the checkpointed state of the pool.
+ */
+ if (!spa_writeable(spa))
+ return;
+#endif
+
+ ASSERT(spa_config_held(spa, SCL_ALL, RW_WRITER) == SCL_ALL);
+
+ /*
+ * First, close and free any existing spare vdevs.
+ */
+ for (i = 0; i < spa->spa_spares.sav_count; i++) {
+ vd = spa->spa_spares.sav_vdevs[i];
+
+ /* Undo the call to spa_spare_activate() below */
+ if ((tvd = spa_lookup_by_guid(spa, vd->vdev_guid,
+ B_FALSE)) != NULL && tvd->vdev_isspare)
+ spa_spare_remove(tvd);
+ vdev_close(vd);
+ vdev_free(vd);
+ }
+
+ if (spa->spa_spares.sav_vdevs)
+ kmem_free(spa->spa_spares.sav_vdevs,
+ spa->spa_spares.sav_count * sizeof (void *));
+
+ if (spa->spa_spares.sav_config == NULL)
+ nspares = 0;
+ else
+ VERIFY(nvlist_lookup_nvlist_array(spa->spa_spares.sav_config,
+ ZPOOL_CONFIG_SPARES, &spares, &nspares) == 0);
+
+ spa->spa_spares.sav_count = (int)nspares;
+ spa->spa_spares.sav_vdevs = NULL;
+
+ if (nspares == 0)
+ return;
+
+ /*
+ * Construct the array of vdevs, opening them to get status in the
+ * process. For each spare, there are potentially two different vdev_t
+ * structures associated with it: one in the list of spares (used only
+ * for basic validation purposes) and one in the active vdev
+ * configuration (if it's spared in). During this phase we open and
+ * validate each vdev on the spare list. If the vdev also exists in the
+ * active configuration, then we also mark this vdev as an active spare.
+ */
+ spa->spa_spares.sav_vdevs = kmem_alloc(nspares * sizeof (void *),
+ KM_SLEEP);
+ for (i = 0; i < spa->spa_spares.sav_count; i++) {
+ VERIFY(spa_config_parse(spa, &vd, spares[i], NULL, 0,
+ VDEV_ALLOC_SPARE) == 0);
+ ASSERT(vd != NULL);
+
+ spa->spa_spares.sav_vdevs[i] = vd;
+
+ if ((tvd = spa_lookup_by_guid(spa, vd->vdev_guid,
+ B_FALSE)) != NULL) {
+ if (!tvd->vdev_isspare)
+ spa_spare_add(tvd);
+
+ /*
+ * We only mark the spare active if we were successfully
+ * able to load the vdev. Otherwise, importing a pool
+ * with a bad active spare would result in strange
+ * behavior, because multiple pools would think the spare
+ * is actively in use.
+ *
+ * There is a vulnerability here to an equally bizarre
+ * circumstance, where a dead active spare is later
+ * brought back to life (onlined or otherwise). Given
+ * the rarity of this scenario, and the extra complexity
+ * it adds, we ignore the possibility.
+ */
+ if (!vdev_is_dead(tvd))
+ spa_spare_activate(tvd);
+ }
+
+ vd->vdev_top = vd;
+ vd->vdev_aux = &spa->spa_spares;
+
+ if (vdev_open(vd) != 0)
+ continue;
+
+ if (vdev_validate_aux(vd) == 0)
+ spa_spare_add(vd);
+ }
+
+ /*
+ * Recompute the stashed list of spares, with status information
+ * this time.
+ */
+ VERIFY(nvlist_remove(spa->spa_spares.sav_config, ZPOOL_CONFIG_SPARES,
+ DATA_TYPE_NVLIST_ARRAY) == 0);
+
+ spares = kmem_alloc(spa->spa_spares.sav_count * sizeof (void *),
+ KM_SLEEP);
+ for (i = 0; i < spa->spa_spares.sav_count; i++)
+ spares[i] = vdev_config_generate(spa,
+ spa->spa_spares.sav_vdevs[i], B_TRUE, VDEV_CONFIG_SPARE);
+ VERIFY(nvlist_add_nvlist_array(spa->spa_spares.sav_config,
+ ZPOOL_CONFIG_SPARES, spares, spa->spa_spares.sav_count) == 0);
+ for (i = 0; i < spa->spa_spares.sav_count; i++)
+ nvlist_free(spares[i]);
+ kmem_free(spares, spa->spa_spares.sav_count * sizeof (void *));
+}
+
+/*
+ * Load (or re-load) the current list of vdevs describing the active l2cache for
+ * this pool. When this is called, we have some form of basic information in
+ * 'spa_l2cache.sav_config'. We parse this into vdevs, try to open them, and
+ * then re-generate a more complete list including status information.
+ * Devices which are already active have their details maintained, and are
+ * not re-opened.
+ */
+void
+spa_load_l2cache(spa_t *spa)
+{
+ nvlist_t **l2cache;
+ uint_t nl2cache;
+ int i, j, oldnvdevs;
+ uint64_t guid;
+ vdev_t *vd, **oldvdevs, **newvdevs;
+ spa_aux_vdev_t *sav = &spa->spa_l2cache;
+
+#ifndef _KERNEL
+ /*
+ * zdb opens both the current state of the pool and the
+ * checkpointed state (if present), with a different spa_t.
+ *
+ * As L2 caches are part of the ARC which is shared among open
+ * pools, we skip loading them when we load the checkpointed
+ * state of the pool.
+ */
+ if (!spa_writeable(spa))
+ return;
+#endif
+
+ ASSERT(spa_config_held(spa, SCL_ALL, RW_WRITER) == SCL_ALL);
+
+ if (sav->sav_config != NULL) {
+ VERIFY(nvlist_lookup_nvlist_array(sav->sav_config,
+ ZPOOL_CONFIG_L2CACHE, &l2cache, &nl2cache) == 0);
+ newvdevs = kmem_alloc(nl2cache * sizeof (void *), KM_SLEEP);
+ } else {
+ nl2cache = 0;
+ newvdevs = NULL;
+ }
+
+ oldvdevs = sav->sav_vdevs;
+ oldnvdevs = sav->sav_count;
+ sav->sav_vdevs = NULL;
+ sav->sav_count = 0;
+
+ /*
+ * Process new nvlist of vdevs.
+ */
+ for (i = 0; i < nl2cache; i++) {
+ VERIFY(nvlist_lookup_uint64(l2cache[i], ZPOOL_CONFIG_GUID,
+ &guid) == 0);
+
+ newvdevs[i] = NULL;
+ for (j = 0; j < oldnvdevs; j++) {
+ vd = oldvdevs[j];
+ if (vd != NULL && guid == vd->vdev_guid) {
+ /*
+ * Retain previous vdev for add/remove ops.
+ */
+ newvdevs[i] = vd;
+ oldvdevs[j] = NULL;
+ break;
+ }
+ }
+
+ if (newvdevs[i] == NULL) {
+ /*
+ * Create new vdev
+ */
+ VERIFY(spa_config_parse(spa, &vd, l2cache[i], NULL, 0,
+ VDEV_ALLOC_L2CACHE) == 0);
+ ASSERT(vd != NULL);
+ newvdevs[i] = vd;
+
+ /*
+ * Commit this vdev as an l2cache device,
+ * even if it fails to open.
+ */
+ spa_l2cache_add(vd);
+
+ vd->vdev_top = vd;
+ vd->vdev_aux = sav;
+
+ spa_l2cache_activate(vd);
+
+ if (vdev_open(vd) != 0)
+ continue;
+
+ (void) vdev_validate_aux(vd);
+
+ if (!vdev_is_dead(vd))
+ l2arc_add_vdev(spa, vd);
+ }
+ }
+
+ /*
+ * Purge vdevs that were dropped
+ */
+ for (i = 0; i < oldnvdevs; i++) {
+ uint64_t pool;
+
+ vd = oldvdevs[i];
+ if (vd != NULL) {
+ ASSERT(vd->vdev_isl2cache);
+
+ if (spa_l2cache_exists(vd->vdev_guid, &pool) &&
+ pool != 0ULL && l2arc_vdev_present(vd))
+ l2arc_remove_vdev(vd);
+ vdev_clear_stats(vd);
+ vdev_free(vd);
+ }
+ }
+
+ if (oldvdevs)
+ kmem_free(oldvdevs, oldnvdevs * sizeof (void *));
+
+ if (sav->sav_config == NULL)
+ goto out;
+
+ sav->sav_vdevs = newvdevs;
+ sav->sav_count = (int)nl2cache;
+
+ /*
+ * Recompute the stashed list of l2cache devices, with status
+ * information this time.
+ */
+ VERIFY(nvlist_remove(sav->sav_config, ZPOOL_CONFIG_L2CACHE,
+ DATA_TYPE_NVLIST_ARRAY) == 0);
+
+ l2cache = kmem_alloc(sav->sav_count * sizeof (void *), KM_SLEEP);
+ for (i = 0; i < sav->sav_count; i++)
+ l2cache[i] = vdev_config_generate(spa,
+ sav->sav_vdevs[i], B_TRUE, VDEV_CONFIG_L2CACHE);
+ VERIFY(nvlist_add_nvlist_array(sav->sav_config,
+ ZPOOL_CONFIG_L2CACHE, l2cache, sav->sav_count) == 0);
+out:
+ for (i = 0; i < sav->sav_count; i++)
+ nvlist_free(l2cache[i]);
+ if (sav->sav_count)
+ kmem_free(l2cache, sav->sav_count * sizeof (void *));
+}
+
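+/*
+ * Read a packed nvlist from MOS object 'obj': the object's bonus buffer
+ * holds the packed size, the object body holds the packed bytes, which
+ * are unpacked into '*value'.  The caller must free the resulting nvlist.
+ */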
+static int
+load_nvlist(spa_t *spa, uint64_t obj, nvlist_t **value)
+{
+ dmu_buf_t *db;
+ char *packed = NULL;
+ size_t nvsize = 0;
+ int error;
+ *value = NULL;
+
+ error = dmu_bonus_hold(spa->spa_meta_objset, obj, FTAG, &db);
+ if (error != 0)
+ return (error);
+
+ nvsize = *(uint64_t *)db->db_data;
+ dmu_buf_rele(db, FTAG);
+
+ packed = kmem_alloc(nvsize, KM_SLEEP);
+ error = dmu_read(spa->spa_meta_objset, obj, 0, nvsize, packed,
+ DMU_READ_PREFETCH);
+ if (error == 0)
+ error = nvlist_unpack(packed, nvsize, value, 0);
+ kmem_free(packed, nvsize);
+
+ return (error);
+}
+
+/*
+ * Concrete top-level vdevs that are not missing and are not logs. At every
+ * spa_sync we write new uberblocks to at least SPA_SYNC_MIN_VDEVS core tvds.
+ */
+static uint64_t
+spa_healthy_core_tvds(spa_t *spa)
+{
+ vdev_t *rvd = spa->spa_root_vdev;
+ uint64_t tvds = 0;
+
+ for (uint64_t i = 0; i < rvd->vdev_children; i++) {
+ vdev_t *vd = rvd->vdev_child[i];
+ if (vd->vdev_islog)
+ continue;
+ if (vdev_is_concrete(vd) && !vdev_is_dead(vd))
+ tvds++;
+ }
+
+ return (tvds);
+}
+
+/*
+ * Check whether the given vdev (or any of its children) could not be opened;
+ * if so, post a sysevent to notify the autoreplace code that the device has
+ * been removed.
+ */
+static void
+spa_check_removed(vdev_t *vd)
+{
+ for (uint64_t c = 0; c < vd->vdev_children; c++)
+ spa_check_removed(vd->vdev_child[c]);
+
+ if (vd->vdev_ops->vdev_op_leaf && vdev_is_dead(vd) &&
+ vdev_is_concrete(vd)) {
+ zfs_post_autoreplace(vd->vdev_spa, vd);
+ spa_event_notify(vd->vdev_spa, vd, NULL, ESC_ZFS_VDEV_CHECK);
+ }
+}
+
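+/*
+ * Build up diagnostic information about any unopenable log devices and
+ * pass it up via spa_load_info, or, if ZFS_IMPORT_MISSING_LOG was
+ * requested, arrange for the ZIL to be cleared and dropped instead.
+ */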
+static int
+spa_check_for_missing_logs(spa_t *spa)
+{
+ vdev_t *rvd = spa->spa_root_vdev;
+
+ /*
+ * If we're doing a normal import, then build up any additional
+ * diagnostic information about missing log devices.
+ * We'll pass this up to the user for further processing.
+ */
+ if (!(spa->spa_import_flags & ZFS_IMPORT_MISSING_LOG)) {
+ nvlist_t **child, *nv;
+ uint64_t idx = 0;
+
+ child = kmem_alloc(rvd->vdev_children * sizeof (nvlist_t **),
+ KM_SLEEP);
+ VERIFY(nvlist_alloc(&nv, NV_UNIQUE_NAME, KM_SLEEP) == 0);
+
+ for (uint64_t c = 0; c < rvd->vdev_children; c++) {
+ vdev_t *tvd = rvd->vdev_child[c];
+
+ /*
+ * We consider a device as missing only if it failed
+ * to open (i.e., a device that is offline or faulted
+ * is not considered missing).
+ */
+ if (tvd->vdev_islog &&
+ tvd->vdev_state == VDEV_STATE_CANT_OPEN) {
+ child[idx++] = vdev_config_generate(spa, tvd,
+ B_FALSE, VDEV_CONFIG_MISSING);
+ }
+ }
+
+ if (idx > 0) {
+ fnvlist_add_nvlist_array(nv,
+ ZPOOL_CONFIG_CHILDREN, child, idx);
+ fnvlist_add_nvlist(spa->spa_load_info,
+ ZPOOL_CONFIG_MISSING_DEVICES, nv);
+
+ for (uint64_t i = 0; i < idx; i++)
+ nvlist_free(child[i]);
+ }
+ nvlist_free(nv);
+ kmem_free(child, rvd->vdev_children * sizeof (nvlist_t **));
+
+ if (idx > 0) {
+ spa_load_failed(spa, "some log devices are missing");
+ vdev_dbgmsg_print_tree(rvd, 2);
+ return (SET_ERROR(ENXIO));
+ }
+ } else {
+ for (uint64_t c = 0; c < rvd->vdev_children; c++) {
+ vdev_t *tvd = rvd->vdev_child[c];
+
+ if (tvd->vdev_islog &&
+ tvd->vdev_state == VDEV_STATE_CANT_OPEN) {
+ spa_set_log_state(spa, SPA_LOG_CLEAR);
+ spa_load_note(spa, "some log devices are "
+ "missing, ZIL is dropped.");
+ vdev_dbgmsg_print_tree(rvd, 2);
+ break;
+ }
+ }
+ }
+
+ return (0);
+}
+
+/*
+ * Check for missing log devices
+ */
+static boolean_t
+spa_check_logs(spa_t *spa)
+{
+ boolean_t rv = B_FALSE;
+ dsl_pool_t *dp = spa_get_dsl(spa);
+
+ switch (spa->spa_log_state) {
+ case SPA_LOG_MISSING:
+ /* need to recheck in case slog has been restored */
+ case SPA_LOG_UNKNOWN:
+ rv = (dmu_objset_find_dp(dp, dp->dp_root_dir_obj,
+ zil_check_log_chain, NULL, DS_FIND_CHILDREN) != 0);
+ if (rv)
+ spa_set_log_state(spa, SPA_LOG_MISSING);
+ break;
+ }
+ return (rv);
+}
+
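+/*
+ * Passivate the metaslab groups of all log vdevs so that no further
+ * allocations are made from them.  Returns B_TRUE if any slog devices
+ * were found.
+ */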
+static boolean_t
+spa_passivate_log(spa_t *spa)
+{
+ vdev_t *rvd = spa->spa_root_vdev;
+ boolean_t slog_found = B_FALSE;
+
+ ASSERT(spa_config_held(spa, SCL_ALLOC, RW_WRITER));
+
+ if (!spa_has_slogs(spa))
+ return (B_FALSE);
+
+ for (int c = 0; c < rvd->vdev_children; c++) {
+ vdev_t *tvd = rvd->vdev_child[c];
+ metaslab_group_t *mg = tvd->vdev_mg;
+
+ if (tvd->vdev_islog) {
+ metaslab_group_passivate(mg);
+ slog_found = B_TRUE;
+ }
+ }
+
+ return (slog_found);
+}
+
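+/*
+ * Opposite of spa_passivate_log(): make the metaslab groups of all log
+ * vdevs eligible for allocations again.
+ */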
+static void
+spa_activate_log(spa_t *spa)
+{
+ vdev_t *rvd = spa->spa_root_vdev;
+
+ ASSERT(spa_config_held(spa, SCL_ALLOC, RW_WRITER));
+
+ for (int c = 0; c < rvd->vdev_children; c++) {
+ vdev_t *tvd = rvd->vdev_child[c];
+ metaslab_group_t *mg = tvd->vdev_mg;
+
+ if (tvd->vdev_islog)
+ metaslab_group_activate(mg);
+ }
+}
+
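+/*
+ * Reset the ZIL of every dataset in the pool via zil_reset(), called
+ * when offlining a log device.
+ */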
+int
+spa_reset_logs(spa_t *spa)
+{
+ int error;
+
+ error = dmu_objset_find(spa_name(spa), zil_reset,
+ NULL, DS_FIND_CHILDREN);
+ if (error == 0) {
+ /*
+ * We successfully offlined the log device, sync out the
+ * current txg so that the "stubby" block can be removed
+ * by zil_sync().
+ */
+ txg_wait_synced(spa->spa_dsl_pool, 0);
+ }
+ return (error);
+}
+
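+/*
+ * Apply spa_check_removed() to every vdev on the given aux list
+ * (spares or l2cache).
+ */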
+static void
+spa_aux_check_removed(spa_aux_vdev_t *sav)
+{
+ int i;
+
+ for (i = 0; i < sav->sav_count; i++)
+ spa_check_removed(sav->sav_vdevs[i]);
+}
+
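+/*
+ * Completion callback for ZIL block claim zios: track the largest birth
+ * txg seen among successfully claimed blocks in spa_claim_max_txg.
+ */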
+void
+spa_claim_notify(zio_t *zio)
+{
+ spa_t *spa = zio->io_spa;
+
+ if (zio->io_error)
+ return;
+
+ mutex_enter(&spa->spa_props_lock); /* any mutex will do */
+ if (spa->spa_claim_max_txg < zio->io_bp->blk_birth)
+ spa->spa_claim_max_txg = zio->io_bp->blk_birth;
+ mutex_exit(&spa->spa_props_lock);
+}
+
+typedef struct spa_load_error {
+ uint64_t sle_meta_count;
+ uint64_t sle_data_count;
+} spa_load_error_t;
+
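+/*
+ * Completion callback for the verification reads issued from
+ * spa_load_verify_cb(): tally the error as metadata or data damage and
+ * release one slot of the in-flight I/O throttle.
+ */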
+static void
+spa_load_verify_done(zio_t *zio)
+{
+ blkptr_t *bp = zio->io_bp;
+ spa_load_error_t *sle = zio->io_private;
+ dmu_object_type_t type = BP_GET_TYPE(bp);
+ int error = zio->io_error;
+ spa_t *spa = zio->io_spa;
+
+ abd_free(zio->io_abd);
+ if (error) {
+ if ((BP_GET_LEVEL(bp) != 0 || DMU_OT_IS_METADATA(type)) &&
+ type != DMU_OT_INTENT_LOG)
+ atomic_inc_64(&sle->sle_meta_count);
+ else
+ atomic_inc_64(&sle->sle_data_count);
+ }
+
+ mutex_enter(&spa->spa_scrub_lock);
+ spa->spa_load_verify_ios--;
+ cv_broadcast(&spa->spa_scrub_io_cv);
+ mutex_exit(&spa->spa_scrub_lock);
+}
+
+/*
+ * Maximum number of concurrent scrub I/Os to create while verifying
+ * a pool during import.
+ */
+int spa_load_verify_maxinflight = 10000;
+boolean_t spa_load_verify_metadata = B_TRUE;
+boolean_t spa_load_verify_data = B_TRUE;
+
+SYSCTL_INT(_vfs_zfs, OID_AUTO, spa_load_verify_maxinflight, CTLFLAG_RWTUN,
+ &spa_load_verify_maxinflight, 0,
+ "Maximum number of concurrent scrub I/Os to create while verifying a "
+ "pool while importing it");
+
+SYSCTL_INT(_vfs_zfs, OID_AUTO, spa_load_verify_metadata, CTLFLAG_RWTUN,
+ &spa_load_verify_metadata, 0,
+ "Check metadata on import?");
+
+SYSCTL_INT(_vfs_zfs, OID_AUTO, spa_load_verify_data, CTLFLAG_RWTUN,
+ &spa_load_verify_data, 0,
+ "Check user data on import?");
+
+/* ARGSUSED */
+static int
+spa_load_verify_cb(spa_t *spa, zilog_t *zilog, const blkptr_t *bp,
+ const zbookmark_phys_t *zb, const dnode_phys_t *dnp, void *arg)
+{
+ if (bp == NULL || BP_IS_HOLE(bp) || BP_IS_EMBEDDED(bp))
+ return (0);
+ /*
+ * Note: normally this routine will not be called if
+ * spa_load_verify_metadata is not set. However, the flag may be
+ * cleared manually after the traversal has begun, so we re-check
+ * it here and stop issuing verification I/Os if it has been.
+ */
+ if (!spa_load_verify_metadata)
+ return (0);
+ if (!BP_IS_METADATA(bp) && !spa_load_verify_data)
+ return (0);
+
+ zio_t *rio = arg;
+ size_t size = BP_GET_PSIZE(bp);
+
+ mutex_enter(&spa->spa_scrub_lock);
+ while (spa->spa_load_verify_ios >= spa_load_verify_maxinflight)
+ cv_wait(&spa->spa_scrub_io_cv, &spa->spa_scrub_lock);
+ spa->spa_load_verify_ios++;
+ mutex_exit(&spa->spa_scrub_lock);
+
+ zio_nowait(zio_read(rio, spa, bp, abd_alloc_for_io(size, B_FALSE), size,
+ spa_load_verify_done, rio->io_private, ZIO_PRIORITY_SCRUB,
+ ZIO_FLAG_SPECULATIVE | ZIO_FLAG_CANFAIL |
+ ZIO_FLAG_SCRUB | ZIO_FLAG_RAW, zb));
+ return (0);
+}
+
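+/*
+ * dmu_objset_find_dp() callback used by spa_load_verify() to reject
+ * pools containing dataset names that are too long.
+ */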
+/* ARGSUSED */
+int
+verify_dataset_name_len(dsl_pool_t *dp, dsl_dataset_t *ds, void *arg)
+{
+ if (dsl_dataset_namelen(ds) >= ZFS_MAX_DATASET_NAME_LEN)
+ return (SET_ERROR(ENAMETOOLONG));
+
+ return (0);
+}
+
+static int
+spa_load_verify(spa_t *spa)
+{
+ zio_t *rio;
+ spa_load_error_t sle = { 0 };
+ zpool_load_policy_t policy;
+ boolean_t verify_ok = B_FALSE;
+ int error = 0;
+
+ zpool_get_load_policy(spa->spa_config, &policy);
+
+ if (policy.zlp_rewind & ZPOOL_NEVER_REWIND)
+ return (0);
+
+ dsl_pool_config_enter(spa->spa_dsl_pool, FTAG);
+ error = dmu_objset_find_dp(spa->spa_dsl_pool,
+ spa->spa_dsl_pool->dp_root_dir_obj, verify_dataset_name_len, NULL,
+ DS_FIND_CHILDREN);
+ dsl_pool_config_exit(spa->spa_dsl_pool, FTAG);
+ if (error != 0)
+ return (error);
+
+ rio = zio_root(spa, NULL, &sle,
+ ZIO_FLAG_CANFAIL | ZIO_FLAG_SPECULATIVE);
+
+ if (spa_load_verify_metadata) {
+ if (spa->spa_extreme_rewind) {
+ spa_load_note(spa, "performing a complete scan of the "
+ "pool since extreme rewind is on. This may take "
+ "a very long time.\n (spa_load_verify_data=%u, "
+ "spa_load_verify_metadata=%u)",
+ spa_load_verify_data, spa_load_verify_metadata);
+ }
+ error = traverse_pool(spa, spa->spa_verify_min_txg,
+ TRAVERSE_PRE | TRAVERSE_PREFETCH_METADATA,
+ spa_load_verify_cb, rio);
+ }
+
+ (void) zio_wait(rio);
+
+ spa->spa_load_meta_errors = sle.sle_meta_count;
+ spa->spa_load_data_errors = sle.sle_data_count;
+
+ if (sle.sle_meta_count != 0 || sle.sle_data_count != 0) {
+ spa_load_note(spa, "spa_load_verify found %llu metadata errors "
+ "and %llu data errors", (u_longlong_t)sle.sle_meta_count,
+ (u_longlong_t)sle.sle_data_count);
+ }
+
+ if (spa_load_verify_dryrun ||
+ (!error && sle.sle_meta_count <= policy.zlp_maxmeta &&
+ sle.sle_data_count <= policy.zlp_maxdata)) {
+ int64_t loss = 0;
+
+ verify_ok = B_TRUE;
+ spa->spa_load_txg = spa->spa_uberblock.ub_txg;
+ spa->spa_load_txg_ts = spa->spa_uberblock.ub_timestamp;
+
+ loss = spa->spa_last_ubsync_txg_ts - spa->spa_load_txg_ts;
+ VERIFY(nvlist_add_uint64(spa->spa_load_info,
+ ZPOOL_CONFIG_LOAD_TIME, spa->spa_load_txg_ts) == 0);
+ VERIFY(nvlist_add_int64(spa->spa_load_info,
+ ZPOOL_CONFIG_REWIND_TIME, loss) == 0);
+ VERIFY(nvlist_add_uint64(spa->spa_load_info,
+ ZPOOL_CONFIG_LOAD_DATA_ERRORS, sle.sle_data_count) == 0);
+ } else {
+ spa->spa_load_max_txg = spa->spa_uberblock.ub_txg;
+ }
+
+ if (spa_load_verify_dryrun)
+ return (0);
+
+ if (error) {
+ if (error != ENXIO && error != EIO)
+ error = SET_ERROR(EIO);
+ return (error);
+ }
+
+ return (verify_ok ? 0 : EIO);
+}
+
+/*
+ * Find a value in the pool props object.
+ */
+static void
+spa_prop_find(spa_t *spa, zpool_prop_t prop, uint64_t *val)
+{
+ (void) zap_lookup(spa->spa_meta_objset, spa->spa_pool_props_object,
+ zpool_prop_to_name(prop), sizeof (uint64_t), 1, val);
+}
+
+/*
+ * Find a value in the pool directory object.
+ */
+static int
+spa_dir_prop(spa_t *spa, const char *name, uint64_t *val, boolean_t log_enoent)
+{
+ int error = zap_lookup(spa->spa_meta_objset, DMU_POOL_DIRECTORY_OBJECT,
+ name, sizeof (uint64_t), 1, val);
+
+ if (error != 0 && (error != ENOENT || log_enoent)) {
+ spa_load_failed(spa, "couldn't get '%s' value in MOS directory "
+ "[error=%d]", name, error);
+ }
+
+ return (error);
+}
+
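+/*
+ * Mark the given vdev tree unopenable with the supplied aux reason and
+ * return the supplied error.
+ */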
+static int
+spa_vdev_err(vdev_t *vdev, vdev_aux_t aux, int err)
+{
+ vdev_set_state(vdev, B_TRUE, VDEV_STATE_CANT_OPEN, aux);
+ return (SET_ERROR(err));
+}
+
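+/*
+ * Start the auxiliary zthrs that work on behalf of a writeable pool:
+ * the indirect-mapping condense thread and the checkpoint discard
+ * thread.
+ */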
+static void
+spa_spawn_aux_threads(spa_t *spa)
+{
+ ASSERT(spa_writeable(spa));
+
+ ASSERT(MUTEX_HELD(&spa_namespace_lock));
+
+ spa_start_indirect_condensing_thread(spa);
+
+ ASSERT3P(spa->spa_checkpoint_discard_zthr, ==, NULL);
+ spa->spa_checkpoint_discard_zthr =
+ zthr_create(spa_checkpoint_discard_thread_check,
+ spa_checkpoint_discard_thread, spa);
+}
+
+/*
+ * Fix up config after a partly-completed split. This is done with the
+ * ZPOOL_CONFIG_SPLIT nvlist. Both the splitting pool and the split-off
+ * pool have that entry in their config, but only the splitting one contains
+ * a list of all the guids of the vdevs that are being split off.
+ *
+ * This function determines what to do with that list: either rejoin
+ * all the disks to the pool, or complete the splitting process. To attempt
+ * the rejoin, each disk that is offlined is marked online again, and
+ * we do a reopen() call. If the vdev label for every disk that was
+ * marked online indicates it was successfully split off (VDEV_AUX_SPLIT_POOL)
+ * then we call vdev_split() on each disk, and complete the split.
+ *
+ * Otherwise we leave the config alone, with all the vdevs in place in
+ * the original pool.
+ */
+static void
+spa_try_repair(spa_t *spa, nvlist_t *config)
+{
+ uint_t extracted;
+ uint64_t *glist;
+ uint_t i, gcount;
+ nvlist_t *nvl;
+ vdev_t **vd;
+ boolean_t attempt_reopen;
+
+ if (nvlist_lookup_nvlist(config, ZPOOL_CONFIG_SPLIT, &nvl) != 0)
+ return;
+
+ /* check that the config is complete */
+ if (nvlist_lookup_uint64_array(nvl, ZPOOL_CONFIG_SPLIT_LIST,
+ &glist, &gcount) != 0)
+ return;
+
+ vd = kmem_zalloc(gcount * sizeof (vdev_t *), KM_SLEEP);
+
+ /* attempt to online all the vdevs & validate */
+ attempt_reopen = B_TRUE;
+ for (i = 0; i < gcount; i++) {
+ if (glist[i] == 0) /* vdev is hole */
+ continue;
+
+ vd[i] = spa_lookup_by_guid(spa, glist[i], B_FALSE);
+ if (vd[i] == NULL) {
+ /*
+ * Don't bother attempting to reopen the disks;
+ * just do the split.
+ */
+ attempt_reopen = B_FALSE;
+ } else {
+ /* attempt to re-online it */
+ vd[i]->vdev_offline = B_FALSE;
+ }
+ }
+
+ if (attempt_reopen) {
+ vdev_reopen(spa->spa_root_vdev);
+
+ /* check each device to see what state it's in */
+ for (extracted = 0, i = 0; i < gcount; i++) {
+ if (vd[i] != NULL &&
+ vd[i]->vdev_stat.vs_aux != VDEV_AUX_SPLIT_POOL)
+ break;
+ ++extracted;
+ }
+ }
+
+ /*
+ * If every disk has been moved to the new pool, or if we never
+ * even attempted to look at them, then we split them off for
+ * good.
+ */
+ if (!attempt_reopen || gcount == extracted) {
+ for (i = 0; i < gcount; i++)
+ if (vd[i] != NULL)
+ vdev_split(vd[i]);
+ vdev_reopen(spa->spa_root_vdev);
+ }
+
+ kmem_free(vd, gcount * sizeof (vdev_t *));
+}
+
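+/*
+ * Load an existing storage pool, using the config provided in
+ * spa->spa_config.  This is a thin wrapper around spa_load_impl() that
+ * tracks the load state, records the load time, and posts an ereport
+ * if the load fails.
+ */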
+static int
+spa_load(spa_t *spa, spa_load_state_t state, spa_import_type_t type)
+{
+ char *ereport = FM_EREPORT_ZFS_POOL;
+ int error;
+
+ spa->spa_load_state = state;
+
+ gethrestime(&spa->spa_loaded_ts);
+ error = spa_load_impl(spa, type, &ereport);
+
+ /*
+ * Don't count references from objsets that are already closed
+ * and are making their way through the eviction process.
+ */
+ spa_evicting_os_wait(spa);
+ spa->spa_minref = zfs_refcount_count(&spa->spa_refcount);
+ if (error) {
+ if (error != EEXIST) {
+ spa->spa_loaded_ts.tv_sec = 0;
+ spa->spa_loaded_ts.tv_nsec = 0;
+ }
+ if (error != EBADF) {
+ zfs_ereport_post(ereport, spa, NULL, NULL, 0, 0);
+ }
+ }
+ spa->spa_load_state = error ? SPA_LOAD_ERROR : SPA_LOAD_NONE;
+ spa->spa_ena = 0;
+
+ return (error);
+}
+
+/*
+ * Count the number of per-vdev ZAPs associated with all of the vdevs in the
+ * vdev tree rooted in the given vd, and ensure that each ZAP is present in the
+ * spa's per-vdev ZAP list.
+ */
+static uint64_t
+vdev_count_verify_zaps(vdev_t *vd)
+{
+ spa_t *spa = vd->vdev_spa;
+ uint64_t total = 0;
+ if (vd->vdev_top_zap != 0) {
+ total++;
+ ASSERT0(zap_lookup_int(spa->spa_meta_objset,
+ spa->spa_all_vdev_zaps, vd->vdev_top_zap));
+ }
+ if (vd->vdev_leaf_zap != 0) {
+ total++;
+ ASSERT0(zap_lookup_int(spa->spa_meta_objset,
+ spa->spa_all_vdev_zaps, vd->vdev_leaf_zap));
+ }
+
+ for (uint64_t i = 0; i < vd->vdev_children; i++) {
+ total += vdev_count_verify_zaps(vd->vdev_child[i]);
+ }
+
+ return (total);
+}
+
+/*
+ * Determine whether the activity check is required.
+ */
+static boolean_t
+spa_activity_check_required(spa_t *spa, uberblock_t *ub, nvlist_t *label,
+ nvlist_t *config)
+{
+ uint64_t state = 0;
+ uint64_t hostid = 0;
+ uint64_t tryconfig_txg = 0;
+ uint64_t tryconfig_timestamp = 0;
+ uint16_t tryconfig_mmp_seq = 0;
+ nvlist_t *nvinfo;
+
+ if (nvlist_exists(config, ZPOOL_CONFIG_LOAD_INFO)) {
+ nvinfo = fnvlist_lookup_nvlist(config, ZPOOL_CONFIG_LOAD_INFO);
+ (void) nvlist_lookup_uint64(nvinfo, ZPOOL_CONFIG_MMP_TXG,
+ &tryconfig_txg);
+ (void) nvlist_lookup_uint64(config, ZPOOL_CONFIG_TIMESTAMP,
+ &tryconfig_timestamp);
+ (void) nvlist_lookup_uint16(nvinfo, ZPOOL_CONFIG_MMP_SEQ,
+ &tryconfig_mmp_seq);
+ }
+
+ (void) nvlist_lookup_uint64(config, ZPOOL_CONFIG_POOL_STATE, &state);
+
+ /*
+ * Disable the MMP activity check. This is used by zdb, which
+ * is intended to be used on potentially active pools.
+ */
+ if (spa->spa_import_flags & ZFS_IMPORT_SKIP_MMP)
+ return (B_FALSE);
+
+ /*
+ * Skip the activity check when the MMP feature is disabled.
+ */
+ if (ub->ub_mmp_magic == MMP_MAGIC && ub->ub_mmp_delay == 0)
+ return (B_FALSE);
+
+ /*
+ * If the tryconfig_ values are nonzero, they are the results of an
+ * earlier tryimport. If they all match the uberblock we just found,
+ * then the pool has not changed and we return false so we do not test
+ * a second time.
+ */
+ if (tryconfig_txg && tryconfig_txg == ub->ub_txg &&
+ tryconfig_timestamp && tryconfig_timestamp == ub->ub_timestamp &&
+ tryconfig_mmp_seq && tryconfig_mmp_seq ==
+ (MMP_SEQ_VALID(ub) ? MMP_SEQ(ub) : 0))
+ return (B_FALSE);
+
+ /*
+ * Allow the activity check to be skipped when importing the pool
+ * on the same host which last imported it. Since the hostid from
+ * configuration may be stale, use the one read from the label.
+ */
+ if (nvlist_exists(label, ZPOOL_CONFIG_HOSTID))
+ hostid = fnvlist_lookup_uint64(label, ZPOOL_CONFIG_HOSTID);
+
+ if (hostid == spa_get_hostid())
+ return (B_FALSE);
+
+ /*
+ * Skip the activity test when the pool was cleanly exported.
+ */
+ if (state != POOL_STATE_ACTIVE)
+ return (B_FALSE);
+
+ return (B_TRUE);
+}
+
+/*
+ * Nanoseconds the activity check must watch for changes on-disk.
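+ *
+ * For example (a sketch with illustrative tunable values): with
+ * zfs_multihost_interval = 1000ms and zfs_multihost_import_intervals = 10,
+ * the baseline computed below is MAX(NANOSEC, 10 * 1s) = 10 seconds,
+ * before any of the uberblock-derived adjustments.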
+ */
+static uint64_t
+spa_activity_check_duration(spa_t *spa, uberblock_t *ub)
+{
+ uint64_t import_intervals = MAX(zfs_multihost_import_intervals, 1);
+ uint64_t multihost_interval = MSEC2NSEC(
+ MMP_INTERVAL_OK(zfs_multihost_interval));
+ uint64_t import_delay = MAX(NANOSEC, import_intervals *
+ multihost_interval);
+
+ /*
+ * Local tunables determine a minimum duration except for the case
+ * where we know when the remote host will suspend the pool if MMP
+ * writes do not land.
+ *
+ * See Big Theory comment at the top of mmp.c for the reasoning behind
+ * these cases and times.
+ */
+
+ ASSERT(MMP_IMPORT_SAFETY_FACTOR >= 100);
+
+ if (MMP_INTERVAL_VALID(ub) && MMP_FAIL_INT_VALID(ub) &&
+ MMP_FAIL_INT(ub) > 0) {
+
+ /* MMP on remote host will suspend pool after failed writes */
+ import_delay = MMP_FAIL_INT(ub) * MSEC2NSEC(MMP_INTERVAL(ub)) *
+ MMP_IMPORT_SAFETY_FACTOR / 100;
+
+ zfs_dbgmsg("fail_intvals>0 import_delay=%llu ub_mmp "
+ "mmp_fails=%llu ub_mmp mmp_interval=%llu "
+ "import_intervals=%u", import_delay, MMP_FAIL_INT(ub),
+ MMP_INTERVAL(ub), import_intervals);
+
+ } else if (MMP_INTERVAL_VALID(ub) && MMP_FAIL_INT_VALID(ub) &&
+ MMP_FAIL_INT(ub) == 0) {
+
+ /* MMP on remote host will never suspend pool */
+ import_delay = MAX(import_delay, (MSEC2NSEC(MMP_INTERVAL(ub)) +
+ ub->ub_mmp_delay) * import_intervals);
+
+ zfs_dbgmsg("fail_intvals=0 import_delay=%llu ub_mmp "
+ "mmp_interval=%llu ub_mmp_delay=%llu "
+ "import_intervals=%u", import_delay, MMP_INTERVAL(ub),
+ ub->ub_mmp_delay, import_intervals);
+
+ } else if (MMP_VALID(ub)) {
+ /*
+ * zfs-0.7 compatibility case
+ */
+
+ import_delay = MAX(import_delay, (multihost_interval +
+ ub->ub_mmp_delay) * import_intervals);
+
+ zfs_dbgmsg("import_delay=%llu ub_mmp_delay=%llu "
+ "import_intervals=%u leaves=%u", import_delay,
+ ub->ub_mmp_delay, import_intervals,
+ vdev_count_leaves(spa));
+ } else {
+ /* Using local tunings is the only reasonable option */
+ zfs_dbgmsg("pool last imported on non-MMP aware "
+ "host using import_delay=%llu multihost_interval=%llu "
+ "import_intervals=%u", import_delay, multihost_interval,
+ import_intervals);
+ }
+
+ return (import_delay);
+}
+
+/*
+ * Perform the import activity check. If the user canceled the import or
+ * we detected activity, then fail.
+ */
+static int
+spa_activity_check(spa_t *spa, uberblock_t *ub, nvlist_t *config)
+{
+ uint64_t txg = ub->ub_txg;
+ uint64_t timestamp = ub->ub_timestamp;
+ uint64_t mmp_config = ub->ub_mmp_config;
+ uint16_t mmp_seq = MMP_SEQ_VALID(ub) ? MMP_SEQ(ub) : 0;
+ uint64_t import_delay;
+ hrtime_t import_expire;
+ nvlist_t *mmp_label = NULL;
+ vdev_t *rvd = spa->spa_root_vdev;
+ kcondvar_t cv;
+ kmutex_t mtx;
+ int error = 0;
+
+ cv_init(&cv, NULL, CV_DEFAULT, NULL);
+ mutex_init(&mtx, NULL, MUTEX_DEFAULT, NULL);
+ mutex_enter(&mtx);
+
+ /*
+ * If ZPOOL_CONFIG_MMP_TXG is present, an activity check was performed
+ * during the earlier tryimport. If the txg recorded there is 0 then
+ * the pool is known to be active on another host.
+ *
+ * Otherwise, the pool might be in use on another host. Check for
+ * changes in the uberblocks on disk if necessary.
+ */
+ if (nvlist_exists(config, ZPOOL_CONFIG_LOAD_INFO)) {
+ nvlist_t *nvinfo = fnvlist_lookup_nvlist(config,
+ ZPOOL_CONFIG_LOAD_INFO);
+
+ if (nvlist_exists(nvinfo, ZPOOL_CONFIG_MMP_TXG) &&
+ fnvlist_lookup_uint64(nvinfo, ZPOOL_CONFIG_MMP_TXG) == 0) {
+ vdev_uberblock_load(rvd, ub, &mmp_label);
+ error = SET_ERROR(EREMOTEIO);
+ goto out;
+ }
+ }
+
+ import_delay = spa_activity_check_duration(spa, ub);
+
+ /* Add a small random factor in case of simultaneous imports (0-25%) */
+ import_delay += import_delay * spa_get_random(250) / 1000;
+
+ import_expire = gethrtime() + import_delay;
+
+ while (gethrtime() < import_expire) {
+ vdev_uberblock_load(rvd, ub, &mmp_label);
+
+ if (txg != ub->ub_txg || timestamp != ub->ub_timestamp ||
+ mmp_seq != (MMP_SEQ_VALID(ub) ? MMP_SEQ(ub) : 0)) {
+ zfs_dbgmsg("multihost activity detected "
+ "txg %llu ub_txg %llu "
+ "timestamp %llu ub_timestamp %llu "
+ "mmp_config %#llx ub_mmp_config %#llx",
+ txg, ub->ub_txg, timestamp, ub->ub_timestamp,
+ mmp_config, ub->ub_mmp_config);
+
+ error = SET_ERROR(EREMOTEIO);
+ break;
+ }
+
+ if (mmp_label) {
+ nvlist_free(mmp_label);
+ mmp_label = NULL;
+ }
+ error = cv_timedwait_sig(&cv, &mtx, hz);
+#if defined(illumos) || !defined(_KERNEL)
+ if (error != -1) {
+#else
+ if (error != EWOULDBLOCK) {
+#endif
+ error = SET_ERROR(EINTR);
+ break;
+ }
+ error = 0;
+ }
+
+out:
+ mutex_exit(&mtx);
+ mutex_destroy(&mtx);
+ cv_destroy(&cv);
+
+ /*
+ * If the pool is determined to be active, store the status in the
+ * spa->spa_load_info nvlist. If the remote hostname or hostid are
+ * available from the configuration read from disk, store them as well.
+ * This allows 'zpool import' to generate a more useful message.
+ *
+ * ZPOOL_CONFIG_MMP_STATE - observed pool status (mandatory)
+ * ZPOOL_CONFIG_MMP_HOSTNAME - hostname from the active pool
+ * ZPOOL_CONFIG_MMP_HOSTID - hostid from the active pool
+ */
+ if (error == EREMOTEIO) {
+ char *hostname = "<unknown>";
+ uint64_t hostid = 0;
+
+ if (mmp_label) {
+ if (nvlist_exists(mmp_label, ZPOOL_CONFIG_HOSTNAME)) {
+ hostname = fnvlist_lookup_string(mmp_label,
+ ZPOOL_CONFIG_HOSTNAME);
+ fnvlist_add_string(spa->spa_load_info,
+ ZPOOL_CONFIG_MMP_HOSTNAME, hostname);
+ }
+
+ if (nvlist_exists(mmp_label, ZPOOL_CONFIG_HOSTID)) {
+ hostid = fnvlist_lookup_uint64(mmp_label,
+ ZPOOL_CONFIG_HOSTID);
+ fnvlist_add_uint64(spa->spa_load_info,
+ ZPOOL_CONFIG_MMP_HOSTID, hostid);
+ }
+ }
+
+ fnvlist_add_uint64(spa->spa_load_info,
+ ZPOOL_CONFIG_MMP_STATE, MMP_STATE_ACTIVE);
+ fnvlist_add_uint64(spa->spa_load_info,
+ ZPOOL_CONFIG_MMP_TXG, 0);
+
+ error = spa_vdev_err(rvd, VDEV_AUX_ACTIVE, EREMOTEIO);
+ }
+
+ if (mmp_label)
+ nvlist_free(mmp_label);
+
+ return (error);
+}
+
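+/*
+ * Compare the hostid recorded in the MOS config against ours; fail with
+ * EBADF if the pool was last accessed by another system.
+ */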
+static int
+spa_verify_host(spa_t *spa, nvlist_t *mos_config)
+{
+ uint64_t hostid;
+ char *hostname;
+ uint64_t myhostid = 0;
+
+ if (!spa_is_root(spa) && nvlist_lookup_uint64(mos_config,
+ ZPOOL_CONFIG_HOSTID, &hostid) == 0) {
+ hostname = fnvlist_lookup_string(mos_config,
+ ZPOOL_CONFIG_HOSTNAME);
+
+ myhostid = zone_get_hostid(NULL);
+
+ if (hostid != 0 && myhostid != 0 && hostid != myhostid) {
+ cmn_err(CE_WARN, "pool '%s' could not be "
+ "loaded as it was last accessed by "
+ "another system (host: %s hostid: 0x%llx). "
+ "See: http://illumos.org/msg/ZFS-8000-EY",
+ spa_name(spa), hostname, (u_longlong_t)hostid);
+ spa_load_failed(spa, "hostid verification failed: pool "
+ "last accessed by host: %s (hostid: 0x%llx)",
+ hostname, (u_longlong_t)hostid);
+ return (SET_ERROR(EBADF));
+ }
+ }
+
+ return (0);
+}
+
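+/*
+ * Parse the config provided in spa->spa_config: validate the pool guid,
+ * stash ancillary information (comment, split config, pool txg), create
+ * the godfather zios, and construct the (as yet untrusted) vdev tree.
+ */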
+static int
+spa_ld_parse_config(spa_t *spa, spa_import_type_t type)
+{
+ int error = 0;
+ nvlist_t *nvtree, *nvl, *config = spa->spa_config;
+ int parse;
+ vdev_t *rvd;
+ uint64_t pool_guid;
+ char *comment;
+
+ /*
+ * Versioning wasn't explicitly added to the label until later, so if
+ * it's not present treat it as the initial version.
+ */
+ if (nvlist_lookup_uint64(config, ZPOOL_CONFIG_VERSION,
+ &spa->spa_ubsync.ub_version) != 0)
+ spa->spa_ubsync.ub_version = SPA_VERSION_INITIAL;
+
+ if (nvlist_lookup_uint64(config, ZPOOL_CONFIG_POOL_GUID, &pool_guid)) {
+ spa_load_failed(spa, "invalid config provided: '%s' missing",
+ ZPOOL_CONFIG_POOL_GUID);
+ return (SET_ERROR(EINVAL));
+ }
+
+ /*
+ * If we are doing an import, ensure that the pool is not already
+ * imported by checking if its pool guid already exists in the
+ * spa namespace.
+ *
+ * The only case that we allow an already imported pool to be
+ * imported again, is when the pool is checkpointed and we want to
+ * look at its checkpointed state from userland tools like zdb.
+ */
+#ifdef _KERNEL
+ if ((spa->spa_load_state == SPA_LOAD_IMPORT ||
+ spa->spa_load_state == SPA_LOAD_TRYIMPORT) &&
+ spa_guid_exists(pool_guid, 0)) {
+#else
+ if ((spa->spa_load_state == SPA_LOAD_IMPORT ||
+ spa->spa_load_state == SPA_LOAD_TRYIMPORT) &&
+ spa_guid_exists(pool_guid, 0) &&
+ !spa_importing_readonly_checkpoint(spa)) {
+#endif
+ spa_load_failed(spa, "a pool with guid %llu is already open",
+ (u_longlong_t)pool_guid);
+ return (SET_ERROR(EEXIST));
+ }
+
+ spa->spa_config_guid = pool_guid;
+
+ nvlist_free(spa->spa_load_info);
+ spa->spa_load_info = fnvlist_alloc();
+
+ ASSERT(spa->spa_comment == NULL);
+ if (nvlist_lookup_string(config, ZPOOL_CONFIG_COMMENT, &comment) == 0)
+ spa->spa_comment = spa_strdup(comment);
+
+ (void) nvlist_lookup_uint64(config, ZPOOL_CONFIG_POOL_TXG,
+ &spa->spa_config_txg);
+
+ if (nvlist_lookup_nvlist(config, ZPOOL_CONFIG_SPLIT, &nvl) == 0)
+ spa->spa_config_splitting = fnvlist_dup(nvl);
+
+ if (nvlist_lookup_nvlist(config, ZPOOL_CONFIG_VDEV_TREE, &nvtree)) {
+ spa_load_failed(spa, "invalid config provided: '%s' missing",
+ ZPOOL_CONFIG_VDEV_TREE);
+ return (SET_ERROR(EINVAL));
+ }
+
+ /*
+ * Create "The Godfather" zio to hold all async IOs
+ */
+ spa->spa_async_zio_root = kmem_alloc(max_ncpus * sizeof (void *),
+ KM_SLEEP);
+ for (int i = 0; i < max_ncpus; i++) {
+ spa->spa_async_zio_root[i] = zio_root(spa, NULL, NULL,
+ ZIO_FLAG_CANFAIL | ZIO_FLAG_SPECULATIVE |
+ ZIO_FLAG_GODFATHER);
+ }
+
+ /*
+ * Parse the configuration into a vdev tree. We explicitly set the
+ * value that will be returned by spa_version() since parsing the
+ * configuration requires knowing the version number.
+ */
+ spa_config_enter(spa, SCL_ALL, FTAG, RW_WRITER);
+ parse = (type == SPA_IMPORT_EXISTING ?
+ VDEV_ALLOC_LOAD : VDEV_ALLOC_SPLIT);
+ error = spa_config_parse(spa, &rvd, nvtree, NULL, 0, parse);
+ spa_config_exit(spa, SCL_ALL, FTAG);
+
+ if (error != 0) {
+ spa_load_failed(spa, "unable to parse config [error=%d]",
+ error);
+ return (error);
+ }
+
+ ASSERT(spa->spa_root_vdev == rvd);
+ ASSERT3U(spa->spa_min_ashift, >=, SPA_MINBLOCKSHIFT);
+ ASSERT3U(spa->spa_max_ashift, <=, SPA_MAXBLOCKSHIFT);
+
+ if (type != SPA_IMPORT_ASSEMBLE) {
+ ASSERT(spa_guid(spa) == pool_guid);
+ }
+
+ return (0);
+}
+
+/*
+ * Recursively open all vdevs in the vdev tree. This function is called twice:
+ * first with the untrusted config, then with the trusted config.
+ */
+static int
+spa_ld_open_vdevs(spa_t *spa)
+{
+ int error = 0;
+
+ /*
+ * spa_missing_tvds_allowed defines how many top-level vdevs can be
+ * missing/unopenable for the root vdev to still be considered openable.
+ */
+ if (spa->spa_trust_config) {
+ spa->spa_missing_tvds_allowed = zfs_max_missing_tvds;
+ } else if (spa->spa_config_source == SPA_CONFIG_SRC_CACHEFILE) {
+ spa->spa_missing_tvds_allowed = zfs_max_missing_tvds_cachefile;
+ } else if (spa->spa_config_source == SPA_CONFIG_SRC_SCAN) {
+ spa->spa_missing_tvds_allowed = zfs_max_missing_tvds_scan;
+ } else {
+ spa->spa_missing_tvds_allowed = 0;
+ }
+
+ spa->spa_missing_tvds_allowed =
+ MAX(zfs_max_missing_tvds, spa->spa_missing_tvds_allowed);
+
+ spa_config_enter(spa, SCL_ALL, FTAG, RW_WRITER);
+ error = vdev_open(spa->spa_root_vdev);
+ spa_config_exit(spa, SCL_ALL, FTAG);
+
+ if (spa->spa_missing_tvds != 0) {
+ spa_load_note(spa, "vdev tree has %lld missing top-level "
+ "vdevs.", (u_longlong_t)spa->spa_missing_tvds);
+ if (spa->spa_trust_config && (spa->spa_mode & FWRITE)) {
+ /*
+ * Although theoretically we could allow users to open
+ * incomplete pools in RW mode, we'd need to add a lot
+ * of extra logic (e.g. adjust pool space to account
+ * for missing vdevs).
+ * This limitation also prevents users from accidentally
+ * opening the pool in RW mode during data recovery and
+ * damaging it further.
+ */
+ spa_load_note(spa, "pools with missing top-level "
+ "vdevs can only be opened in read-only mode.");
+ error = SET_ERROR(ENXIO);
+ } else {
+ spa_load_note(spa, "current settings allow for maximum "
+ "%lld missing top-level vdevs at this stage.",
+ (u_longlong_t)spa->spa_missing_tvds_allowed);
+ }
+ }
+ if (error != 0) {
+ spa_load_failed(spa, "unable to open vdev tree [error=%d]",
+ error);
+ }
+ if (spa->spa_missing_tvds != 0 || error != 0)
+ vdev_dbgmsg_print_tree(spa->spa_root_vdev, 2);
+
+ return (error);
+}
+
+/*
+ * We need to validate the vdev labels against the configuration that
+ * we have in hand. This function is called twice: first with an untrusted
+ * config, then with a trusted config. The validation is more strict when the
+ * config is trusted.
+ */
+static int
+spa_ld_validate_vdevs(spa_t *spa)
+{
+ int error = 0;
+ vdev_t *rvd = spa->spa_root_vdev;
+
+ spa_config_enter(spa, SCL_ALL, FTAG, RW_WRITER);
+ error = vdev_validate(rvd);
+ spa_config_exit(spa, SCL_ALL, FTAG);
+
+ if (error != 0) {
+ spa_load_failed(spa, "vdev_validate failed [error=%d]", error);
+ return (error);
+ }
+
+ if (rvd->vdev_state <= VDEV_STATE_CANT_OPEN) {
+ spa_load_failed(spa, "cannot open vdev tree after invalidating "
+ "some vdevs");
+ vdev_dbgmsg_print_tree(rvd, 2);
+ return (SET_ERROR(ENXIO));
+ }
+
+ return (0);
+}
+
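+/*
+ * Record the selected uberblock in the in-core spa state and derive the
+ * txg bounds (verify_min, first, claim_max) used by the rest of the
+ * load.
+ */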
+static void
+spa_ld_select_uberblock_done(spa_t *spa, uberblock_t *ub)
+{
+ spa->spa_state = POOL_STATE_ACTIVE;
+ spa->spa_ubsync = spa->spa_uberblock;
+ spa->spa_verify_min_txg = spa->spa_extreme_rewind ?
+ TXG_INITIAL - 1 : spa_last_synced_txg(spa) - TXG_DEFER_SIZE - 1;
+ spa->spa_first_txg = spa->spa_last_ubsync_txg ?
+ spa->spa_last_ubsync_txg : spa_last_synced_txg(spa) + 1;
+ spa->spa_claim_max_txg = spa->spa_first_txg;
+ spa->spa_prev_software_version = ub->ub_software_version;
+}
+
+static int
+spa_ld_select_uberblock(spa_t *spa, spa_import_type_t type)
+{
+ vdev_t *rvd = spa->spa_root_vdev;
+ nvlist_t *label;
+ uberblock_t *ub = &spa->spa_uberblock;
+ boolean_t activity_check = B_FALSE;
+
+ /*
+ * If we are opening the checkpointed state of the pool by
+ * rewinding to it, at this point we will have written the
+ * checkpointed uberblock to the vdev labels, so searching
+ * the labels will find the right uberblock. However, if
+ * we are opening the checkpointed state read-only, we have
+ * not modified the labels. Therefore, we must ignore the
+ * labels and continue using the spa_uberblock that was set
+ * by spa_ld_checkpoint_rewind.
+ *
+ * Note that it would be fine to ignore the labels when
+ * rewinding (opening writeable) as well. However, if we
+ * crash just after writing the labels, we will end up
+ * searching the labels. Doing so in the common case means
+ * that this code path gets exercised normally, rather than
+ * just in the edge case.
+ */
+ if (ub->ub_checkpoint_txg != 0 &&
+ spa_importing_readonly_checkpoint(spa)) {
+ spa_ld_select_uberblock_done(spa, ub);
+ return (0);
+ }
+
+ /*
+ * Find the best uberblock.
+ */
+ vdev_uberblock_load(rvd, ub, &label);
+
+ /*
+ * If we weren't able to find a single valid uberblock, return failure.
+ */
+ if (ub->ub_txg == 0) {
+ nvlist_free(label);
+ spa_load_failed(spa, "no valid uberblock found");
+ return (spa_vdev_err(rvd, VDEV_AUX_CORRUPT_DATA, ENXIO));
+ }
+
+ spa_load_note(spa, "using uberblock with txg=%llu",
+ (u_longlong_t)ub->ub_txg);
+
+ /*
+ * For pools which have the multihost property on, determine if the
+ * pool is truly inactive and can be safely imported. Prevent
+ * hosts which don't have a hostid set from importing the pool.
+ */
+ activity_check = spa_activity_check_required(spa, ub, label,
+ spa->spa_config);
+ if (activity_check) {
+ if (ub->ub_mmp_magic == MMP_MAGIC && ub->ub_mmp_delay &&
+ spa_get_hostid() == 0) {
+ nvlist_free(label);
+ fnvlist_add_uint64(spa->spa_load_info,
+ ZPOOL_CONFIG_MMP_STATE, MMP_STATE_NO_HOSTID);
+ return (spa_vdev_err(rvd, VDEV_AUX_ACTIVE, EREMOTEIO));
+ }
+
+ int error = spa_activity_check(spa, ub, spa->spa_config);
+ if (error) {
+ nvlist_free(label);
+ return (error);
+ }
+
+ fnvlist_add_uint64(spa->spa_load_info,
+ ZPOOL_CONFIG_MMP_STATE, MMP_STATE_INACTIVE);
+ fnvlist_add_uint64(spa->spa_load_info,
+ ZPOOL_CONFIG_MMP_TXG, ub->ub_txg);
+ fnvlist_add_uint16(spa->spa_load_info,
+ ZPOOL_CONFIG_MMP_SEQ,
+ (MMP_SEQ_VALID(ub) ? MMP_SEQ(ub) : 0));
+ }
+
+ /*
+ * If the pool has an unsupported version we can't open it.
+ */
+ if (!SPA_VERSION_IS_SUPPORTED(ub->ub_version)) {
+ nvlist_free(label);
+ spa_load_failed(spa, "version %llu is not supported",
+ (u_longlong_t)ub->ub_version);
+ return (spa_vdev_err(rvd, VDEV_AUX_VERSION_NEWER, ENOTSUP));
+ }
+
+ if (ub->ub_version >= SPA_VERSION_FEATURES) {
+ nvlist_t *features;
+
+ /*
+ * If we weren't able to find what's necessary for reading the
+ * MOS in the label, return failure.
+ */
+ if (label == NULL) {
+ spa_load_failed(spa, "label config unavailable");
+ return (spa_vdev_err(rvd, VDEV_AUX_CORRUPT_DATA,
+ ENXIO));
+ }
+
+ if (nvlist_lookup_nvlist(label, ZPOOL_CONFIG_FEATURES_FOR_READ,
+ &features) != 0) {
+ nvlist_free(label);
+ spa_load_failed(spa, "invalid label: '%s' missing",
+ ZPOOL_CONFIG_FEATURES_FOR_READ);
+ return (spa_vdev_err(rvd, VDEV_AUX_CORRUPT_DATA,
+ ENXIO));
+ }
+
+ /*
+ * Update our in-core representation with the definitive values
+ * from the label.
+ */
+ nvlist_free(spa->spa_label_features);
+ VERIFY(nvlist_dup(features, &spa->spa_label_features, 0) == 0);
+ }
+
+ nvlist_free(label);
+
+ /*
+ * Look through entries in the label nvlist's features_for_read. If
+ * there is a feature listed there which we don't understand then we
+ * cannot open a pool.
+ */
+ if (ub->ub_version >= SPA_VERSION_FEATURES) {
+ nvlist_t *unsup_feat;
+
+ VERIFY(nvlist_alloc(&unsup_feat, NV_UNIQUE_NAME, KM_SLEEP) ==
+ 0);
+
+ for (nvpair_t *nvp = nvlist_next_nvpair(spa->spa_label_features,
+ NULL); nvp != NULL;
+ nvp = nvlist_next_nvpair(spa->spa_label_features, nvp)) {
+ if (!zfeature_is_supported(nvpair_name(nvp))) {
+ VERIFY(nvlist_add_string(unsup_feat,
+ nvpair_name(nvp), "") == 0);
+ }
+ }
+
+ if (!nvlist_empty(unsup_feat)) {
+ VERIFY(nvlist_add_nvlist(spa->spa_load_info,
+ ZPOOL_CONFIG_UNSUP_FEAT, unsup_feat) == 0);
+ nvlist_free(unsup_feat);
+ spa_load_failed(spa, "some features are unsupported");
+ return (spa_vdev_err(rvd, VDEV_AUX_UNSUP_FEAT,
+ ENOTSUP));
+ }
+
+ nvlist_free(unsup_feat);
+ }
+
+ if (type != SPA_IMPORT_ASSEMBLE && spa->spa_config_splitting) {
+ spa_config_enter(spa, SCL_ALL, FTAG, RW_WRITER);
+ spa_try_repair(spa, spa->spa_config);
+ spa_config_exit(spa, SCL_ALL, FTAG);
+ nvlist_free(spa->spa_config_splitting);
+ spa->spa_config_splitting = NULL;
+ }
+
+ /*
+ * Initialize internal SPA structures.
+ */
+ spa_ld_select_uberblock_done(spa, ub);
+
+ return (0);
+}
+
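+/*
+ * Initialize the dsl_pool from the root block pointer referenced by the
+ * selected uberblock, giving us access to the pool's meta-objset.
+ */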
+static int
+spa_ld_open_rootbp(spa_t *spa)
+{
+ int error = 0;
+ vdev_t *rvd = spa->spa_root_vdev;
+
+ error = dsl_pool_init(spa, spa->spa_first_txg, &spa->spa_dsl_pool);
+ if (error != 0) {
+ spa_load_failed(spa, "unable to open rootbp in dsl_pool_init "
+ "[error=%d]", error);
+ return (spa_vdev_err(rvd, VDEV_AUX_CORRUPT_DATA, EIO));
+ }
+ spa->spa_meta_objset = spa->spa_dsl_pool->dp_meta_objset;
+
+ return (0);
+}
+
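+/*
+ * Replace the untrusted config used so far with the trusted config
+ * stored in the MOS: rebuild the vdev tree from it (preserving the
+ * up-to-date vdev paths of the provided config), then re-open and
+ * re-validate the new tree.  May return EAGAIN to request a reload.
+ */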
+static int
+spa_ld_trusted_config(spa_t *spa, spa_import_type_t type,
+ boolean_t reloading)
+{
+ vdev_t *mrvd, *rvd = spa->spa_root_vdev;
+ nvlist_t *nv, *mos_config, *policy;
+ int error = 0, copy_error;
+ uint64_t healthy_tvds, healthy_tvds_mos;
+ uint64_t mos_config_txg;
+
+ if (spa_dir_prop(spa, DMU_POOL_CONFIG, &spa->spa_config_object, B_TRUE)
+ != 0)
+ return (spa_vdev_err(rvd, VDEV_AUX_CORRUPT_DATA, EIO));
+
+ /*
+ * If we're assembling a pool from a split, the config provided is
+ * already trusted so there is nothing to do.
+ */
+ if (type == SPA_IMPORT_ASSEMBLE)
+ return (0);
+
+ healthy_tvds = spa_healthy_core_tvds(spa);
+
+ if (load_nvlist(spa, spa->spa_config_object, &mos_config)
+ != 0) {
+ spa_load_failed(spa, "unable to retrieve MOS config");
+ return (spa_vdev_err(rvd, VDEV_AUX_CORRUPT_DATA, EIO));
+ }
+
+ /*
+ * If we are doing an open, the pool owner wasn't verified yet, so
+ * do the verification here.
+ */
+ if (spa->spa_load_state == SPA_LOAD_OPEN) {
+ error = spa_verify_host(spa, mos_config);
+ if (error != 0) {
+ nvlist_free(mos_config);
+ return (error);
+ }
+ }
+
+ nv = fnvlist_lookup_nvlist(mos_config, ZPOOL_CONFIG_VDEV_TREE);
+
+ spa_config_enter(spa, SCL_ALL, FTAG, RW_WRITER);
+
+ /*
+ * Build a new vdev tree from the trusted config
+ */
+ VERIFY(spa_config_parse(spa, &mrvd, nv, NULL, 0, VDEV_ALLOC_LOAD) == 0);
+
+ /*
+ * Vdev paths in the MOS may be obsolete. If the untrusted config was
+ * obtained by scanning /dev/dsk, then it will have the right vdev
+ * paths. We update the trusted MOS config with this information.
+ * We first try to copy the paths with vdev_copy_path_strict, which
+ * succeeds only when both configs have exactly the same vdev tree.
+ * If that fails, we fall back to a more flexible method that has a
+ * best effort policy.
+ */
+ copy_error = vdev_copy_path_strict(rvd, mrvd);
+ if (copy_error != 0 || spa_load_print_vdev_tree) {
+ spa_load_note(spa, "provided vdev tree:");
+ vdev_dbgmsg_print_tree(rvd, 2);
+ spa_load_note(spa, "MOS vdev tree:");
+ vdev_dbgmsg_print_tree(mrvd, 2);
+ }
+ if (copy_error != 0) {
+ spa_load_note(spa, "vdev_copy_path_strict failed, falling "
+ "back to vdev_copy_path_relaxed");
+ vdev_copy_path_relaxed(rvd, mrvd);
+ }
+
+ vdev_close(rvd);
+ vdev_free(rvd);
+ spa->spa_root_vdev = mrvd;
+ rvd = mrvd;
+ spa_config_exit(spa, SCL_ALL, FTAG);
+
+ /*
+ * We will use spa_config if we decide to reload the spa or if spa_load
+ * fails and we rewind. We must thus regenerate the config using the
+ * MOS information with the updated paths. ZPOOL_LOAD_POLICY is used to
+ * pass settings on how to load the pool and is not stored in the MOS.
+ * We copy it over to our new, trusted config.
+ */
+ mos_config_txg = fnvlist_lookup_uint64(mos_config,
+ ZPOOL_CONFIG_POOL_TXG);
+ nvlist_free(mos_config);
+ mos_config = spa_config_generate(spa, NULL, mos_config_txg, B_FALSE);
+ if (nvlist_lookup_nvlist(spa->spa_config, ZPOOL_LOAD_POLICY,
+ &policy) == 0)
+ fnvlist_add_nvlist(mos_config, ZPOOL_LOAD_POLICY, policy);
+ spa_config_set(spa, mos_config);
+ spa->spa_config_source = SPA_CONFIG_SRC_MOS;
+
+ /*
+ * Now that we have the config from the MOS, we should be more strict
+ * in checking blkptrs and can make assumptions about the consistency
+ * of the vdev tree. spa_trust_config must be set to true before opening
+ * vdevs in order for them to be writeable.
+ */
+ spa->spa_trust_config = B_TRUE;
+
+ /*
+ * Open and validate the new vdev tree
+ */
+ error = spa_ld_open_vdevs(spa);
+ if (error != 0)
+ return (error);
+
+ error = spa_ld_validate_vdevs(spa);
+ if (error != 0)
+ return (error);
+
+ if (copy_error != 0 || spa_load_print_vdev_tree) {
+ spa_load_note(spa, "final vdev tree:");
+ vdev_dbgmsg_print_tree(rvd, 2);
+ }
+
+ if (spa->spa_load_state != SPA_LOAD_TRYIMPORT &&
+ !spa->spa_extreme_rewind && zfs_max_missing_tvds == 0) {
+ /*
+ * Sanity check to make sure that we are indeed loading the
+ * latest uberblock. If we missed SPA_SYNC_MIN_VDEVS tvds
+ * in the config provided and they happened to be the only ones
+ * to have the latest uberblock, we could involuntarily perform
+ * an extreme rewind.
+ */
+ healthy_tvds_mos = spa_healthy_core_tvds(spa);
+ if (healthy_tvds_mos - healthy_tvds >=
+ SPA_SYNC_MIN_VDEVS) {
+ spa_load_note(spa, "config provided misses too many "
+ "top-level vdevs compared to MOS (%lld vs %lld). ",
+ (u_longlong_t)healthy_tvds,
+ (u_longlong_t)healthy_tvds_mos);
+ spa_load_note(spa, "vdev tree:");
+ vdev_dbgmsg_print_tree(rvd, 2);
+ if (reloading) {
+ spa_load_failed(spa, "config was already "
+ "provided from MOS. Aborting.");
+ return (spa_vdev_err(rvd,
+ VDEV_AUX_CORRUPT_DATA, EIO));
+ }
+ spa_load_note(spa, "spa must be reloaded using MOS "
+ "config");
+ return (SET_ERROR(EAGAIN));
+ }
+ }
+
+ error = spa_check_for_missing_logs(spa);
+ if (error != 0)
+ return (spa_vdev_err(rvd, VDEV_AUX_BAD_GUID_SUM, ENXIO));
+
+ if (rvd->vdev_guid_sum != spa->spa_uberblock.ub_guid_sum) {
+ spa_load_failed(spa, "uberblock guid sum doesn't match MOS "
+ "guid sum (%llu != %llu)",
+ (u_longlong_t)spa->spa_uberblock.ub_guid_sum,
+ (u_longlong_t)rvd->vdev_guid_sum);
+ return (spa_vdev_err(rvd, VDEV_AUX_BAD_GUID_SUM,
+ ENXIO));
+ }
+
+ return (0);
+}
+
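+/*
+ * Load the metadata needed to handle removed (indirect) vdevs: the
+ * removal state and the indirect-mapping condense state.
+ */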
+static int
+spa_ld_open_indirect_vdev_metadata(spa_t *spa)
+{
+ int error = 0;
+ vdev_t *rvd = spa->spa_root_vdev;
+
+ /*
+ * Everything that we read before spa_remove_init() must be stored
+ * on concrete vdevs. Therefore we do this as early as possible.
+ */
+ error = spa_remove_init(spa);
+ if (error != 0) {
+ spa_load_failed(spa, "spa_remove_init failed [error=%d]",
+ error);
+ return (spa_vdev_err(rvd, VDEV_AUX_CORRUPT_DATA, EIO));
+ }
+
+ /*
+ * Retrieve information needed to condense indirect vdev mappings.
+ */
+ error = spa_condense_init(spa);
+ if (error != 0) {
+ spa_load_failed(spa, "spa_condense_init failed [error=%d]",
+ error);
+ return (spa_vdev_err(rvd, VDEV_AUX_CORRUPT_DATA, error));
+ }
+
+ return (0);
+}
+
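+/*
+ * Verify that this host supports all of the features the pool uses for
+ * read (and, if writeable, for write), record the result in
+ * spa_load_info, and warm the in-core feature refcount cache.
+ */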
+static int
+spa_ld_check_features(spa_t *spa, boolean_t *missing_feat_writep)
+{
+ int error = 0;
+ vdev_t *rvd = spa->spa_root_vdev;
+
+ if (spa_version(spa) >= SPA_VERSION_FEATURES) {
+ boolean_t missing_feat_read = B_FALSE;
+ nvlist_t *unsup_feat, *enabled_feat;
+
+ if (spa_dir_prop(spa, DMU_POOL_FEATURES_FOR_READ,
+ &spa->spa_feat_for_read_obj, B_TRUE) != 0) {
+ return (spa_vdev_err(rvd, VDEV_AUX_CORRUPT_DATA, EIO));
+ }
+
+ if (spa_dir_prop(spa, DMU_POOL_FEATURES_FOR_WRITE,
+ &spa->spa_feat_for_write_obj, B_TRUE) != 0) {
+ return (spa_vdev_err(rvd, VDEV_AUX_CORRUPT_DATA, EIO));
+ }
+
+ if (spa_dir_prop(spa, DMU_POOL_FEATURE_DESCRIPTIONS,
+ &spa->spa_feat_desc_obj, B_TRUE) != 0) {
+ return (spa_vdev_err(rvd, VDEV_AUX_CORRUPT_DATA, EIO));
+ }
+
+ enabled_feat = fnvlist_alloc();
+ unsup_feat = fnvlist_alloc();
+
+ if (!spa_features_check(spa, B_FALSE,
+ unsup_feat, enabled_feat))
+ missing_feat_read = B_TRUE;
+
+ if (spa_writeable(spa) ||
+ spa->spa_load_state == SPA_LOAD_TRYIMPORT) {
+ if (!spa_features_check(spa, B_TRUE,
+ unsup_feat, enabled_feat)) {
+ *missing_feat_writep = B_TRUE;
+ }
+ }
+
+ fnvlist_add_nvlist(spa->spa_load_info,
+ ZPOOL_CONFIG_ENABLED_FEAT, enabled_feat);
+
+ if (!nvlist_empty(unsup_feat)) {
+ fnvlist_add_nvlist(spa->spa_load_info,
+ ZPOOL_CONFIG_UNSUP_FEAT, unsup_feat);
+ }
+
+ fnvlist_free(enabled_feat);
+ fnvlist_free(unsup_feat);
+
+ if (!missing_feat_read) {
+ fnvlist_add_boolean(spa->spa_load_info,
+ ZPOOL_CONFIG_CAN_RDONLY);
+ }
+
+ /*
+ * If the state is SPA_LOAD_TRYIMPORT, our objective is
+ * twofold: to determine whether the pool is available for
+ * import in read-write mode and (if it is not) whether the
+ * pool is available for import in read-only mode. If the pool
+ * is available for import in read-write mode, it is displayed
+ * as available in userland; if it is not available for import
+ * in read-only mode, it is displayed as unavailable in
+ * userland. If the pool is available for import in read-only
+ * mode but not read-write mode, it is displayed as unavailable
+ * in userland with a special note that the pool is actually
+ * available for open in read-only mode.
+ *
+ * As a result, if the state is SPA_LOAD_TRYIMPORT and we are
+ * missing a feature for write, we must first determine whether
+ * the pool can be opened read-only before returning to
+ * userland in order to know whether to display the
+ * abovementioned note.
+ */
+ if (missing_feat_read || (*missing_feat_writep &&
+ spa_writeable(spa))) {
+ spa_load_failed(spa, "pool uses unsupported features");
+ return (spa_vdev_err(rvd, VDEV_AUX_UNSUP_FEAT,
+ ENOTSUP));
+ }
+
+ /*
+ * Load refcounts for ZFS features from disk into an in-memory
+ * cache during SPA initialization.
+ */
+ for (spa_feature_t i = 0; i < SPA_FEATURES; i++) {
+ uint64_t refcount;
+
+ error = feature_get_refcount_from_disk(spa,
+ &spa_feature_table[i], &refcount);
+ if (error == 0) {
+ spa->spa_feat_refcount_cache[i] = refcount;
+ } else if (error == ENOTSUP) {
+ spa->spa_feat_refcount_cache[i] =
+ SPA_FEATURE_DISABLED;
+ } else {
+ spa_load_failed(spa, "error getting refcount "
+ "for feature %s [error=%d]",
+ spa_feature_table[i].fi_guid, error);
+ return (spa_vdev_err(rvd,
+ VDEV_AUX_CORRUPT_DATA, EIO));
+ }
+ }
+ }
+
+ if (spa_feature_is_active(spa, SPA_FEATURE_ENABLED_TXG)) {
+ if (spa_dir_prop(spa, DMU_POOL_FEATURE_ENABLED_TXG,
+ &spa->spa_feat_enabled_txg_obj, B_TRUE) != 0)
+ return (spa_vdev_err(rvd, VDEV_AUX_CORRUPT_DATA, EIO));
+ }
+
+ return (0);
+}
+
+static int
+spa_ld_load_special_directories(spa_t *spa)
+{
+ int error = 0;
+ vdev_t *rvd = spa->spa_root_vdev;
+
+ spa->spa_is_initializing = B_TRUE;
+ error = dsl_pool_open(spa->spa_dsl_pool);
+ spa->spa_is_initializing = B_FALSE;
+ if (error != 0) {
+ spa_load_failed(spa, "dsl_pool_open failed [error=%d]", error);
+ return (spa_vdev_err(rvd, VDEV_AUX_CORRUPT_DATA, EIO));
+ }
+
+ return (0);
+}
+
+static int
+spa_ld_get_props(spa_t *spa)
+{
+ int error = 0;
+ uint64_t obj;
+ vdev_t *rvd = spa->spa_root_vdev;
+
+ /* Grab the secret checksum salt from the MOS. */
+ error = zap_lookup(spa->spa_meta_objset, DMU_POOL_DIRECTORY_OBJECT,
+ DMU_POOL_CHECKSUM_SALT, 1,
+ sizeof (spa->spa_cksum_salt.zcs_bytes),
+ spa->spa_cksum_salt.zcs_bytes);
+ if (error == ENOENT) {
+ /* Generate a new salt for subsequent use */
+ (void) random_get_pseudo_bytes(spa->spa_cksum_salt.zcs_bytes,
+ sizeof (spa->spa_cksum_salt.zcs_bytes));
+ } else if (error != 0) {
+ spa_load_failed(spa, "unable to retrieve checksum salt from "
+ "MOS [error=%d]", error);
+ return (spa_vdev_err(rvd, VDEV_AUX_CORRUPT_DATA, EIO));
+ }
+
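+ /* Open the bpobj that tracks blocks whose frees were deferred. */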
+ if (spa_dir_prop(spa, DMU_POOL_SYNC_BPOBJ, &obj, B_TRUE) != 0)
+ return (spa_vdev_err(rvd, VDEV_AUX_CORRUPT_DATA, EIO));
+ error = bpobj_open(&spa->spa_deferred_bpobj, spa->spa_meta_objset, obj);
+ if (error != 0) {
+ spa_load_failed(spa, "error opening deferred-frees bpobj "
+ "[error=%d]", error);
+ return (spa_vdev_err(rvd, VDEV_AUX_CORRUPT_DATA, EIO));
+ }
+
+ /*
+ * Load the bit that tells us to use the new accounting function
+ * (raid-z deflation). If we have an older pool, this will not
+ * be present.
+ */
+ error = spa_dir_prop(spa, DMU_POOL_DEFLATE, &spa->spa_deflate, B_FALSE);
+ if (error != 0 && error != ENOENT)
+ return (spa_vdev_err(rvd, VDEV_AUX_CORRUPT_DATA, EIO));
+
+ error = spa_dir_prop(spa, DMU_POOL_CREATION_VERSION,
+ &spa->spa_creation_version, B_FALSE);
+ if (error != 0 && error != ENOENT)
+ return (spa_vdev_err(rvd, VDEV_AUX_CORRUPT_DATA, EIO));
+
+ /*
+ * Load the persistent error log. If we have an older pool, this will
+ * not be present.
+ */
+ error = spa_dir_prop(spa, DMU_POOL_ERRLOG_LAST, &spa->spa_errlog_last,
+ B_FALSE);
+ if (error != 0 && error != ENOENT)
+ return (spa_vdev_err(rvd, VDEV_AUX_CORRUPT_DATA, EIO));
+
+ error = spa_dir_prop(spa, DMU_POOL_ERRLOG_SCRUB,
+ &spa->spa_errlog_scrub, B_FALSE);
+ if (error != 0 && error != ENOENT)
+ return (spa_vdev_err(rvd, VDEV_AUX_CORRUPT_DATA, EIO));
+
+ /*
+ * Load the history object. If we have an older pool, this
+ * will not be present.
+ */
+ error = spa_dir_prop(spa, DMU_POOL_HISTORY, &spa->spa_history, B_FALSE);
+ if (error != 0 && error != ENOENT)
+ return (spa_vdev_err(rvd, VDEV_AUX_CORRUPT_DATA, EIO));
+
+ /*
+ * Load the per-vdev ZAP map. If we have an older pool, this will not
+ * be present; in this case, defer its creation to a later time to
+ * avoid dirtying the MOS this early / out of sync context. See
+ * spa_sync_config_object.
+ */
+
+ /* The sentinel is only available in the MOS config. */
+ nvlist_t *mos_config;
+ if (load_nvlist(spa, spa->spa_config_object, &mos_config) != 0) {
+ spa_load_failed(spa, "unable to retrieve MOS config");
+ return (spa_vdev_err(rvd, VDEV_AUX_CORRUPT_DATA, EIO));
+ }
+
+ error = spa_dir_prop(spa, DMU_POOL_VDEV_ZAP_MAP,
+ &spa->spa_all_vdev_zaps, B_FALSE);
+
+ if (error == ENOENT) {
+ VERIFY(!nvlist_exists(mos_config,
+ ZPOOL_CONFIG_HAS_PER_VDEV_ZAPS));
+ spa->spa_avz_action = AVZ_ACTION_INITIALIZE;
+ ASSERT0(vdev_count_verify_zaps(spa->spa_root_vdev));
+ } else if (error != 0) {
+ return (spa_vdev_err(rvd, VDEV_AUX_CORRUPT_DATA, EIO));
+ } else if (!nvlist_exists(mos_config, ZPOOL_CONFIG_HAS_PER_VDEV_ZAPS)) {
+ /*
+ * An older version of ZFS overwrote the sentinel value, so
+ * we have orphaned per-vdev ZAPs in the MOS. Defer their
+ * destruction to later; see spa_sync_config_object.
+ */
+ spa->spa_avz_action = AVZ_ACTION_DESTROY;
+ /*
+ * We're assuming that no vdevs have had their ZAPs created
+ * before this. Better be sure of it.
+ */
+ ASSERT0(vdev_count_verify_zaps(spa->spa_root_vdev));
+ }
+ nvlist_free(mos_config);
+
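+ /* Start with the default and let on-disk properties override it. */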
+ spa->spa_delegation = zpool_prop_default_numeric(ZPOOL_PROP_DELEGATION);
+
+ error = spa_dir_prop(spa, DMU_POOL_PROPS, &spa->spa_pool_props_object,
+ B_FALSE);
+ if (error && error != ENOENT)
+ return (spa_vdev_err(rvd, VDEV_AUX_CORRUPT_DATA, EIO));
+
+ if (error == 0) {
+ uint64_t autoreplace;
+
+ spa_prop_find(spa, ZPOOL_PROP_BOOTFS, &spa->spa_bootfs);
+ spa_prop_find(spa, ZPOOL_PROP_AUTOREPLACE, &autoreplace);
+ spa_prop_find(spa, ZPOOL_PROP_DELEGATION, &spa->spa_delegation);
+ spa_prop_find(spa, ZPOOL_PROP_FAILUREMODE, &spa->spa_failmode);
+ spa_prop_find(spa, ZPOOL_PROP_AUTOEXPAND, &spa->spa_autoexpand);
+ spa_prop_find(spa, ZPOOL_PROP_MULTIHOST, &spa->spa_multihost);
+ spa_prop_find(spa, ZPOOL_PROP_DEDUPDITTO,
+ &spa->spa_dedup_ditto);
+
+ spa->spa_autoreplace = (autoreplace != 0);
+ }
+
+ /*
+ * If we are importing a pool with missing top-level vdevs,
+ * we enforce that the pool doesn't panic or get suspended on
+ * error since the likelihood of missing data is extremely high.
+ */
+ if (spa->spa_missing_tvds > 0 &&
+ spa->spa_failmode != ZIO_FAILURE_MODE_CONTINUE &&
+ spa->spa_load_state != SPA_LOAD_TRYIMPORT) {
+ spa_load_note(spa, "forcing failmode to 'continue' "
+ "as some top level vdevs are missing");
+ spa->spa_failmode = ZIO_FAILURE_MODE_CONTINUE;
+ }
+
+ return (0);
+}
+
+static int
+spa_ld_open_aux_vdevs(spa_t *spa, spa_import_type_t type)
+{
+ int error = 0;
+ vdev_t *rvd = spa->spa_root_vdev;
+
+ /*
+ * If we're assembling the pool from the split-off vdevs of
+ * an existing pool, we don't want to attach the spares & cache
+ * devices.
+ */
+
+ /*
+ * Load any hot spares for this pool.
+ */
+ error = spa_dir_prop(spa, DMU_POOL_SPARES, &spa->spa_spares.sav_object,
+ B_FALSE);
+ if (error != 0 && error != ENOENT)
+ return (spa_vdev_err(rvd, VDEV_AUX_CORRUPT_DATA, EIO));
+ if (error == 0 && type != SPA_IMPORT_ASSEMBLE) {
+ ASSERT(spa_version(spa) >= SPA_VERSION_SPARES);
+ if (load_nvlist(spa, spa->spa_spares.sav_object,
+ &spa->spa_spares.sav_config) != 0) {
+ spa_load_failed(spa, "error loading spares nvlist");
+ return (spa_vdev_err(rvd, VDEV_AUX_CORRUPT_DATA, EIO));
+ }
+
+ spa_config_enter(spa, SCL_ALL, FTAG, RW_WRITER);
+ spa_load_spares(spa);
+ spa_config_exit(spa, SCL_ALL, FTAG);
+ } else if (error == 0) {
+ spa->spa_spares.sav_sync = B_TRUE;
+ }
+
+ /*
+ * Load any level 2 ARC devices for this pool.
+ */
+ error = spa_dir_prop(spa, DMU_POOL_L2CACHE,
+ &spa->spa_l2cache.sav_object, B_FALSE);
+ if (error != 0 && error != ENOENT)
+ return (spa_vdev_err(rvd, VDEV_AUX_CORRUPT_DATA, EIO));
+ if (error == 0 && type != SPA_IMPORT_ASSEMBLE) {
+ ASSERT(spa_version(spa) >= SPA_VERSION_L2CACHE);
+ if (load_nvlist(spa, spa->spa_l2cache.sav_object,
+ &spa->spa_l2cache.sav_config) != 0) {
+ spa_load_failed(spa, "error loading l2cache nvlist");
+ return (spa_vdev_err(rvd, VDEV_AUX_CORRUPT_DATA, EIO));
+ }
+
+ spa_config_enter(spa, SCL_ALL, FTAG, RW_WRITER);
+ spa_load_l2cache(spa);
+ spa_config_exit(spa, SCL_ALL, FTAG);
+ } else if (error == 0) {
+ spa->spa_l2cache.sav_sync = B_TRUE;
+ }
+
+ return (0);
+}
+
+static int
+spa_ld_load_vdev_metadata(spa_t *spa)
+{
+ int error = 0;
+ vdev_t *rvd = spa->spa_root_vdev;
+
+ /*
+ * If the 'multihost' property is set, then never allow a pool to
+ * be imported when the system hostid is zero. The exception to
+ * this rule is zdb, which is always allowed to access pools.
+ */
+ if (spa_multihost(spa) && spa_get_hostid() == 0 &&
+ (spa->spa_import_flags & ZFS_IMPORT_SKIP_MMP) == 0) {
+ fnvlist_add_uint64(spa->spa_load_info,
+ ZPOOL_CONFIG_MMP_STATE, MMP_STATE_NO_HOSTID);
+ return (spa_vdev_err(rvd, VDEV_AUX_ACTIVE, EREMOTEIO));
+ }
+
+ /*
+ * If the 'autoreplace' property is set, then post a resource notifying
+ * the ZFS DE that it should not issue any faults for unopenable
+ * devices. We also iterate over the vdevs, and post a sysevent for any
+ * unopenable vdevs so that the normal autoreplace handler can take
+ * over.
+ */
+ if (spa->spa_autoreplace && spa->spa_load_state != SPA_LOAD_TRYIMPORT) {
+ spa_check_removed(spa->spa_root_vdev);
+ /*
+ * For the import case, this is done in spa_import(), because
+ * at this point we're using the spare definitions from
+ * the MOS config, not necessarily from the userland config.
+ */
+ if (spa->spa_load_state != SPA_LOAD_IMPORT) {
+ spa_aux_check_removed(&spa->spa_spares);
+ spa_aux_check_removed(&spa->spa_l2cache);
+ }
+ }
+
+ /*
+ * Load the vdev metadata such as metaslabs, DTLs, spacemap object, etc.
+ */
+ error = vdev_load(rvd);
+ if (error != 0) {
+ spa_load_failed(spa, "vdev_load failed [error=%d]", error);
+ return (spa_vdev_err(rvd, VDEV_AUX_CORRUPT_DATA, error));
+ }
+
+ /*
+ * Propagate the leaf DTLs we just loaded all the way up the vdev tree.
+ */
+ spa_config_enter(spa, SCL_ALL, FTAG, RW_WRITER);
+ vdev_dtl_reassess(rvd, 0, 0, B_FALSE);
+ spa_config_exit(spa, SCL_ALL, FTAG);
+
+ return (0);
+}
+
+static int
+spa_ld_load_dedup_tables(spa_t *spa)
+{
+ int error = 0;
+ vdev_t *rvd = spa->spa_root_vdev;
+
+ error = ddt_load(spa);
+ if (error != 0) {
+ spa_load_failed(spa, "ddt_load failed [error=%d]", error);
+ return (spa_vdev_err(rvd, VDEV_AUX_CORRUPT_DATA, EIO));
+ }
+
+ return (0);
+}
+
+static int
+spa_ld_verify_logs(spa_t *spa, spa_import_type_t type, char **ereport)
+{
+ vdev_t *rvd = spa->spa_root_vdev;
+
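+ /*
+ * If the log chains cannot be verified, drop the logs when some
+ * top-level vdevs are already missing; otherwise fail the load and
+ * post a log-replay ereport.
+ */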
+ if (type != SPA_IMPORT_ASSEMBLE && spa_writeable(spa)) {
+ boolean_t missing = spa_check_logs(spa);
+ if (missing) {
+ if (spa->spa_missing_tvds != 0) {
+ spa_load_note(spa, "spa_check_logs failed "
+ "so dropping the logs");
+ } else {
+ *ereport = FM_EREPORT_ZFS_LOG_REPLAY;
+ spa_load_failed(spa, "spa_check_logs failed");
+ return (spa_vdev_err(rvd, VDEV_AUX_BAD_LOG,
+ ENXIO));
+ }
+ }
+ }
+
+ return (0);
+}
+
+static int
+spa_ld_verify_pool_data(spa_t *spa)
+{
+ int error = 0;
+ vdev_t *rvd = spa->spa_root_vdev;
+
+ /*
+ * We've successfully opened the pool, verify that we're ready
+ * to start pushing transactions.
+ */
+ if (spa->spa_load_state != SPA_LOAD_TRYIMPORT) {
+ error = spa_load_verify(spa);
+ if (error != 0) {
+ spa_load_failed(spa, "spa_load_verify failed "
+ "[error=%d]", error);
+ return (spa_vdev_err(rvd, VDEV_AUX_CORRUPT_DATA,
+ error));
+ }
+ }
+
+ return (0);
+}
+
+static void
+spa_ld_claim_log_blocks(spa_t *spa)
+{
+ dmu_tx_t *tx;
+ dsl_pool_t *dp = spa_get_dsl(spa);
+
+ /*
+ * Claim log blocks that haven't been committed yet.
+ * This must all happen in a single txg.
+ * Note: spa_claim_max_txg is updated by spa_claim_notify(),
+ * invoked from zil_claim_log_block()'s i/o done callback.
+ * Price of rollback is that we abandon the log.
+ */
+ spa->spa_claiming = B_TRUE;
+
+ tx = dmu_tx_create_assigned(dp, spa_first_txg(spa));
+ (void) dmu_objset_find_dp(dp, dp->dp_root_dir_obj,
+ zil_claim, tx, DS_FIND_CHILDREN);
+ dmu_tx_commit(tx);
+
+ spa->spa_claiming = B_FALSE;
+
+ spa_set_log_state(spa, SPA_LOG_GOOD);
+}
+
+static void
+spa_ld_check_for_config_update(spa_t *spa, uint64_t config_cache_txg,
+ boolean_t update_config_cache)
+{
+ vdev_t *rvd = spa->spa_root_vdev;
+ int need_update = B_FALSE;
+
+ /*
+ * If the config cache is stale, or we have uninitialized
+ * metaslabs (see spa_vdev_add()), then update the config.
+ *
+ * If this is a verbatim import, trust the current
+ * in-core spa_config and update the disk labels.
+ */
+ if (update_config_cache || config_cache_txg != spa->spa_config_txg ||
+ spa->spa_load_state == SPA_LOAD_IMPORT ||
+ spa->spa_load_state == SPA_LOAD_RECOVER ||
+ (spa->spa_import_flags & ZFS_IMPORT_VERBATIM))
+ need_update = B_TRUE;
+
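+ /* Top-level vdevs with no metaslab array yet also force an update. */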
+ for (int c = 0; c < rvd->vdev_children; c++)
+ if (rvd->vdev_child[c]->vdev_ms_array == 0)
+ need_update = B_TRUE;
+
+ /*
+ * Update the config cache asynchronously in case we're the
+ * root pool, in which case the config cache isn't writable yet.
+ */
+ if (need_update)
+ spa_async_request(spa, SPA_ASYNC_CONFIG_UPDATE);
+}
+
+static void
+spa_ld_prepare_for_reload(spa_t *spa)
+{
+ int mode = spa->spa_mode;
+ int async_suspended = spa->spa_async_suspended;
+
+ spa_unload(spa);
+ spa_deactivate(spa);
+ spa_activate(spa, mode);
+
+ /*
+ * We save the value of spa_async_suspended as it gets reset to 0 by
+ * spa_unload(). We want to restore it to the original value before
+ * returning as we might be calling spa_async_resume() later.
+ */
+ spa->spa_async_suspended = async_suspended;
+}
+
+static int
+spa_ld_read_checkpoint_txg(spa_t *spa)
+{
+ uberblock_t checkpoint;
+ int error = 0;
+
+ ASSERT0(spa->spa_checkpoint_txg);
+ ASSERT(MUTEX_HELD(&spa_namespace_lock));
+
+ error = zap_lookup(spa->spa_meta_objset, DMU_POOL_DIRECTORY_OBJECT,
+ DMU_POOL_ZPOOL_CHECKPOINT, sizeof (uint64_t),
+ sizeof (uberblock_t) / sizeof (uint64_t), &checkpoint);
+
+ if (error == ENOENT)
+ return (0);
+
+ if (error != 0)
+ return (error);
+
+ ASSERT3U(checkpoint.ub_txg, !=, 0);
+ ASSERT3U(checkpoint.ub_checkpoint_txg, !=, 0);
+ ASSERT3U(checkpoint.ub_timestamp, !=, 0);
+ spa->spa_checkpoint_txg = checkpoint.ub_txg;
+ spa->spa_checkpoint_info.sci_timestamp = checkpoint.ub_timestamp;
+
+ return (0);
+}
+
+static int
+spa_ld_mos_init(spa_t *spa, spa_import_type_t type)
+{
+ int error = 0;
+
+ ASSERT(MUTEX_HELD(&spa_namespace_lock));
+ ASSERT(spa->spa_config_source != SPA_CONFIG_SRC_NONE);
+
+ /*
+ * Never trust the config that is provided unless we are assembling
+ * a pool following a split.
+ * This means we don't trust blkptrs or the vdev tree in general. This
+ * also effectively puts the spa in read-only mode, since
+ * spa_writeable() checks that spa_trust_config is true.
+ * We will later load a trusted config from the MOS.
+ */
+ if (type != SPA_IMPORT_ASSEMBLE)
+ spa->spa_trust_config = B_FALSE;
+
+ /*
+ * Parse the config provided to create a vdev tree.
+ */
+ error = spa_ld_parse_config(spa, type);
+ if (error != 0)
+ return (error);
+
+ /*
+ * Now that we have the vdev tree, try to open each vdev. This involves
+ * opening the underlying physical device, retrieving its geometry and
+ * probing the vdev with a dummy I/O. The state of each vdev will be set
+ * based on the success of those operations. After this we'll be ready
+ * to read from the vdevs.
+ */
+ error = spa_ld_open_vdevs(spa);
+ if (error != 0)
+ return (error);
+
+ /*
+ * Read the label of each vdev and make sure that the GUIDs stored
+ * there match the GUIDs in the config provided.
+ * If we're assembling a new pool that's been split off from an
+ * existing pool, the labels haven't yet been updated so we skip
+ * validation for now.
+ */
+ if (type != SPA_IMPORT_ASSEMBLE) {
+ error = spa_ld_validate_vdevs(spa);
+ if (error != 0)
+ return (error);
+ }
+
+ /*
+ * Read all vdev labels to find the best uberblock (i.e. latest,
+ * unless spa_load_max_txg is set) and store it in spa_uberblock. We
+ * get the list of features required to read blkptrs in the MOS from
+ * the vdev label with the best uberblock and verify that our version
+ * of zfs supports them all.
+ */
+ error = spa_ld_select_uberblock(spa, type);
+ if (error != 0)
+ return (error);
+
+ /*
+ * Pass that uberblock to the dsl_pool layer which will open the root
+ * blkptr. This blkptr points to the latest version of the MOS and will
+ * allow us to read its contents.
+ */
+ error = spa_ld_open_rootbp(spa);
+ if (error != 0)
+ return (error);
+
+ return (0);
+}
+
+static int
+spa_ld_checkpoint_rewind(spa_t *spa)
+{
+ uberblock_t checkpoint;
+ int error = 0;
+
+ ASSERT(MUTEX_HELD(&spa_namespace_lock));
+ ASSERT(spa->spa_import_flags & ZFS_IMPORT_CHECKPOINT);
+
+ error = zap_lookup(spa->spa_meta_objset, DMU_POOL_DIRECTORY_OBJECT,
+ DMU_POOL_ZPOOL_CHECKPOINT, sizeof (uint64_t),
+ sizeof (uberblock_t) / sizeof (uint64_t), &checkpoint);
+
+ if (error != 0) {
+ spa_load_failed(spa, "unable to retrieve checkpointed "
+ "uberblock from the MOS config [error=%d]", error);
+
+ if (error == ENOENT)
+ error = ZFS_ERR_NO_CHECKPOINT;
+
+ return (error);
+ }
+
+ ASSERT3U(checkpoint.ub_txg, <, spa->spa_uberblock.ub_txg);
+ ASSERT3U(checkpoint.ub_txg, ==, checkpoint.ub_checkpoint_txg);
+
+ /*
+ * We need to update the txg and timestamp of the checkpointed
+ * uberblock to be higher than the latest one. This ensures that
+ * the checkpointed uberblock is selected if we were to close and
+ * reopen the pool right after we've written it to the vdev labels.
+ * (also see block comment in vdev_uberblock_compare)
+ */
+ checkpoint.ub_txg = spa->spa_uberblock.ub_txg + 1;
+ checkpoint.ub_timestamp = gethrestime_sec();
+
+ /*
+ * Set current uberblock to be the checkpointed uberblock.
+ */
+ spa->spa_uberblock = checkpoint;
+
+ /*
+ * If we are doing a normal rewind, then the pool is open for
+ * writing and we sync the "updated" checkpointed uberblock to
+ * disk. Once this is done, we've basically rewound the whole
+ * pool and there is no way back.
+ *
+ * There are cases when we don't want to attempt to sync the
+ * checkpointed uberblock to disk because we are opening a
+ * pool as read-only. Specifically, verifying the checkpointed
+ * state with zdb, and importing the checkpointed state to get
+ * a "preview" of its content.
+ */
+ if (spa_writeable(spa)) {
+ vdev_t *rvd = spa->spa_root_vdev;
+
+ spa_config_enter(spa, SCL_ALL, FTAG, RW_WRITER);
+ vdev_t *svd[SPA_SYNC_MIN_VDEVS] = { NULL };
+ int svdcount = 0;
+ int children = rvd->vdev_children;
+ int c0 = spa_get_random(children);
+
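+ /*
+ * Pick up to SPA_SYNC_MIN_VDEVS concrete, non-log top-level vdevs,
+ * starting at a random child, to receive the updated uberblock.
+ */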
+ for (int c = 0; c < children; c++) {
+ vdev_t *vd = rvd->vdev_child[(c0 + c) % children];
+
+ /* Stop when revisiting the first vdev */
+ if (c > 0 && svd[0] == vd)
+ break;
+
+ if (vd->vdev_ms_array == 0 || vd->vdev_islog ||
+ !vdev_is_concrete(vd))
+ continue;
+
+ svd[svdcount++] = vd;
+ if (svdcount == SPA_SYNC_MIN_VDEVS)
+ break;
+ }
+ error = vdev_config_sync(svd, svdcount, spa->spa_first_txg);
+ if (error == 0)
+ spa->spa_last_synced_guid = rvd->vdev_guid;
+ spa_config_exit(spa, SCL_ALL, FTAG);
+
+ if (error != 0) {
+ spa_load_failed(spa, "failed to write checkpointed "
+ "uberblock to the vdev labels [error=%d]", error);
+ return (error);
+ }
+ }
+
+ return (0);
+}
+
+static int
+spa_ld_mos_with_trusted_config(spa_t *spa, spa_import_type_t type,
+ boolean_t *update_config_cache)
+{
+ int error;
+
+ /*
+ * Parse the config for pool, open and validate vdevs,
+ * select an uberblock, and use that uberblock to open
+ * the MOS.
+ */
+ error = spa_ld_mos_init(spa, type);
+ if (error != 0)
+ return (error);
+
+ /*
+ * Retrieve the trusted config stored in the MOS and use it to create
+ * a new, exact version of the vdev tree, then reopen all vdevs.
+ */
+ error = spa_ld_trusted_config(spa, type, B_FALSE);
+ if (error == EAGAIN) {
+ if (update_config_cache != NULL)
+ *update_config_cache = B_TRUE;
+
+ /*
+ * Redo the loading process with the trusted config if it is
+ * too different from the untrusted config.
+ */
+ spa_ld_prepare_for_reload(spa);
+ spa_load_note(spa, "RELOADING");
+ error = spa_ld_mos_init(spa, type);
+ if (error != 0)
+ return (error);
+
+ error = spa_ld_trusted_config(spa, type, B_TRUE);
+ if (error != 0)
+ return (error);
+
+ } else if (error != 0) {
+ return (error);
+ }
+
+ return (0);
+}
+
+/*
+ * Load an existing storage pool, using the config provided. This config
+ * describes which vdevs are part of the pool and is later validated against
+ * partial configs present in each vdev's label and an entire copy of the
+ * config stored in the MOS.
+ */
+static int
+spa_load_impl(spa_t *spa, spa_import_type_t type, char **ereport)
+{
+ int error = 0;
+ boolean_t missing_feat_write = B_FALSE;
+ boolean_t checkpoint_rewind =
+ (spa->spa_import_flags & ZFS_IMPORT_CHECKPOINT);
+ boolean_t update_config_cache = B_FALSE;
+
+ ASSERT(MUTEX_HELD(&spa_namespace_lock));
+ ASSERT(spa->spa_config_source != SPA_CONFIG_SRC_NONE);
+
+ spa_load_note(spa, "LOADING");
+
+ error = spa_ld_mos_with_trusted_config(spa, type, &update_config_cache);
+ if (error != 0)
+ return (error);
+
+ /*
+ * If we are rewinding to the checkpoint then we need to repeat
+ * everything we've done so far in this function but this time
+ * selecting the checkpointed uberblock and using that to open
+ * the MOS.
+ */
+ if (checkpoint_rewind) {
+ /*
+ * If we are rewinding to the checkpoint update config cache
+ * anyway.
+ */
+ update_config_cache = B_TRUE;
+
+ /*
+ * Extract the checkpointed uberblock from the current MOS
+ * and use this as the pool's uberblock from now on. If the
+ * pool is imported as writeable we also write the checkpoint
+ * uberblock to the labels, making the rewind permanent.
+ */
+ error = spa_ld_checkpoint_rewind(spa);
+ if (error != 0)
+ return (error);
+
+ /*
+ * Redo the loading process again with the
+ * checkpointed uberblock.
+ */
+ spa_ld_prepare_for_reload(spa);
+ spa_load_note(spa, "LOADING checkpointed uberblock");
+ error = spa_ld_mos_with_trusted_config(spa, type, NULL);
+ if (error != 0)
+ return (error);
+ }
+
+ /*
+ * Retrieve the checkpoint txg if the pool has a checkpoint.
+ */
+ error = spa_ld_read_checkpoint_txg(spa);
+ if (error != 0)
+ return (error);
+
+ /*
+ * Retrieve the mapping of indirect vdevs. Those vdevs were removed
+ * from the pool and their contents were re-mapped to other vdevs. Note
+ * that everything that we read before this step must have been
+ * rewritten on concrete vdevs after the last device removal was
+ * initiated. Otherwise we could be reading from indirect vdevs before
+ * we have loaded their mappings.
+ */
+ error = spa_ld_open_indirect_vdev_metadata(spa);
+ if (error != 0)
+ return (error);
+
+ /*
+ * Retrieve the full list of active features from the MOS and check if
+ * they are all supported.
+ */
+ error = spa_ld_check_features(spa, &missing_feat_write);
+ if (error != 0)
+ return (error);
+
+ /*
+ * Load several special directories from the MOS needed by the dsl_pool
+ * layer.
+ */
+ error = spa_ld_load_special_directories(spa);
+ if (error != 0)
+ return (error);
+
+ /*
+ * Retrieve pool properties from the MOS.
+ */
+ error = spa_ld_get_props(spa);
+ if (error != 0)
+ return (error);
+
+ /*
+ * Retrieve the list of auxiliary devices - cache devices and spares -
+ * and open them.
+ */
+ error = spa_ld_open_aux_vdevs(spa, type);
+ if (error != 0)
+ return (error);
+
+ /*
+ * Load the metadata for all vdevs. Also check if unopenable devices
+ * should be autoreplaced.
+ */
+ error = spa_ld_load_vdev_metadata(spa);
+ if (error != 0)
+ return (error);
+
+ error = spa_ld_load_dedup_tables(spa);
+ if (error != 0)
+ return (error);
+
+ /*
+ * Verify the logs now to make sure we don't have any unexpected errors
+ * when we claim log blocks later.
+ */
+ error = spa_ld_verify_logs(spa, type, ereport);
+ if (error != 0)
+ return (error);
+
+ if (missing_feat_write) {
+ ASSERT(spa->spa_load_state == SPA_LOAD_TRYIMPORT);
+
+ /*
+ * At this point, we know that we can open the pool in
+ * read-only mode but not read-write mode. We now have enough
+ * information and can return to userland.
+ */
+ return (spa_vdev_err(spa->spa_root_vdev, VDEV_AUX_UNSUP_FEAT,
+ ENOTSUP));
+ }
+
+ /*
+ * Traverse the last txgs to make sure the pool was left off in a safe
+ * state. When performing an extreme rewind, we verify the whole pool,
+ * which can take a very long time.
+ */
+ error = spa_ld_verify_pool_data(spa);
+ if (error != 0)
+ return (error);
+
+ /*
+ * Calculate the deflated space for the pool. This must be done before
+ * we write anything to the pool because we'd need to update the space
+ * accounting using the deflated sizes.
+ */
+ spa_update_dspace(spa);
+
+ /*
+ * We have now retrieved all the information we needed to open the
+ * pool. If we are importing the pool in read-write mode, a few
+ * additional steps must be performed to finish the import.
+ */
+ if (spa_writeable(spa) && (spa->spa_load_state == SPA_LOAD_RECOVER ||
+ spa->spa_load_max_txg == UINT64_MAX)) {
+ uint64_t config_cache_txg = spa->spa_config_txg;
+
+ ASSERT(spa->spa_load_state != SPA_LOAD_TRYIMPORT);
+
+ /*
+ * In case of a checkpoint rewind, log the original txg
+ * of the checkpointed uberblock.
+ */
+ if (checkpoint_rewind) {
+ spa_history_log_internal(spa, "checkpoint rewind",
+ NULL, "rewound state to txg=%llu",
+ (u_longlong_t)spa->spa_uberblock.ub_checkpoint_txg);
+ }
+
+ /*
+ * Traverse the ZIL and claim all blocks.
+ */
+ spa_ld_claim_log_blocks(spa);
+
+ /*
+ * Kick-off the syncing thread.
+ */
+ spa->spa_sync_on = B_TRUE;
+ txg_sync_start(spa->spa_dsl_pool);
+ mmp_thread_start(spa);
+
+ /*
+ * Wait for all claims to sync. We sync up to the highest
+ * claimed log block birth time so that claimed log blocks
+ * don't appear to be from the future. spa_claim_max_txg
+ * will have been set for us by ZIL traversal operations
+ * performed above.
+ */
+ txg_wait_synced(spa->spa_dsl_pool, spa->spa_claim_max_txg);
+
+ /*
+ * Check if we need to request an update of the config. On the
+ * next sync, we would update the config stored in vdev labels
+ * and the cachefile (by default /etc/zfs/zpool.cache).
+ */
+ spa_ld_check_for_config_update(spa, config_cache_txg,
+ update_config_cache);
+
+ /*
+ * Check all DTLs to see if anything needs resilvering.
+ */
+ if (!dsl_scan_resilvering(spa->spa_dsl_pool) &&
+ vdev_resilver_needed(spa->spa_root_vdev, NULL, NULL))
+ spa_async_request(spa, SPA_ASYNC_RESILVER);
+
+ /*
+ * Log the fact that we booted up (so that we can detect if
+ * we rebooted in the middle of an operation).
+ */
+ spa_history_log_version(spa, "open");
+
+ spa_restart_removal(spa);
+ spa_spawn_aux_threads(spa);
+
+ /*
+ * Delete any inconsistent datasets.
+ *
+ * Note:
+ * Since we may be issuing deletes for clones here,
+ * we make sure to do so after we've spawned all the
+ * auxiliary threads above (which the livelist
+ * deletion zthr is part of).
+ */
+ (void) dmu_objset_find(spa_name(spa),
+ dsl_destroy_inconsistent, NULL, DS_FIND_CHILDREN);
+
+ /*
+ * Clean up any stale temporary dataset userrefs.
+ */
+ dsl_pool_clean_tmp_userrefs(spa->spa_dsl_pool);
+
+ spa_config_enter(spa, SCL_CONFIG, FTAG, RW_READER);
+ vdev_initialize_restart(spa->spa_root_vdev);
+ spa_config_exit(spa, SCL_CONFIG, FTAG);
+ }
+
+ spa_load_note(spa, "LOADED");
+
+ return (0);
+}
+
+static int
+spa_load_retry(spa_t *spa, spa_load_state_t state)
+{
+ int mode = spa->spa_mode;
+
+ spa_unload(spa);
+ spa_deactivate(spa);
+
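+ /* Cap the next load attempt at the txg just before the failed one. */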
+ spa->spa_load_max_txg = spa->spa_uberblock.ub_txg - 1;
+
+ spa_activate(spa, mode);
+ spa_async_suspend(spa);
+
+ spa_load_note(spa, "spa_load_retry: rewind, max txg: %llu",
+ (u_longlong_t)spa->spa_load_max_txg);
+
+ return (spa_load(spa, state, SPA_IMPORT_EXISTING));
+}
+
+/*
+ * If spa_load() fails this function will try loading prior txg's. If
+ * 'state' is SPA_LOAD_RECOVER and one of these loads succeeds the pool
+ * will be rewound to that txg. If 'state' is not SPA_LOAD_RECOVER this
+ * function will not rewind the pool and will return the same error as
+ * spa_load().
+ */
+static int
+spa_load_best(spa_t *spa, spa_load_state_t state, uint64_t max_request,
+ int rewind_flags)
+{
+ nvlist_t *loadinfo = NULL;
+ nvlist_t *config = NULL;
+ int load_error, rewind_error;
+ uint64_t safe_rewind_txg;
+ uint64_t min_txg;
+
+ if (spa->spa_load_txg && state == SPA_LOAD_RECOVER) {
+ spa->spa_load_max_txg = spa->spa_load_txg;
+ spa_set_log_state(spa, SPA_LOG_CLEAR);
+ } else {
+ spa->spa_load_max_txg = max_request;
+ if (max_request != UINT64_MAX)
+ spa->spa_extreme_rewind = B_TRUE;
+ }
+
+ load_error = rewind_error = spa_load(spa, state, SPA_IMPORT_EXISTING);
+ if (load_error == 0)
+ return (0);
+ if (load_error == ZFS_ERR_NO_CHECKPOINT) {
+ /*
+ * When attempting checkpoint-rewind on a pool with no
+ * checkpoint, we should not attempt to load uberblocks
+ * from previous txgs when spa_load fails.
+ */
+ ASSERT(spa->spa_import_flags & ZFS_IMPORT_CHECKPOINT);
+ return (load_error);
+ }
+
+ if (spa->spa_root_vdev != NULL)
+ config = spa_config_generate(spa, NULL, -1ULL, B_TRUE);
+
+ spa->spa_last_ubsync_txg = spa->spa_uberblock.ub_txg;
+ spa->spa_last_ubsync_txg_ts = spa->spa_uberblock.ub_timestamp;
+
+ if (rewind_flags & ZPOOL_NEVER_REWIND) {
+ nvlist_free(config);
+ return (load_error);
+ }
+
+ if (state == SPA_LOAD_RECOVER) {
+ /* Price of rolling back is discarding txgs, including log */
+ spa_set_log_state(spa, SPA_LOG_CLEAR);
+ } else {
+ /*
+ * If we aren't rolling back, save the load info from our first
+ * import attempt so that we can restore it after attempting
+ * to rewind.
+ */
+ loadinfo = spa->spa_load_info;
+ spa->spa_load_info = fnvlist_alloc();
+ }
+
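+ /*
+ * Frees are deferred for TXG_DEFER_SIZE txgs, so rewinding within
+ * that many txgs of the last synced uberblock is considered safe;
+ * rewinding further requires an extreme rewind.
+ */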
+ spa->spa_load_max_txg = spa->spa_last_ubsync_txg;
+ safe_rewind_txg = spa->spa_last_ubsync_txg - TXG_DEFER_SIZE;
+ min_txg = (rewind_flags & ZPOOL_EXTREME_REWIND) ?
+ TXG_INITIAL : safe_rewind_txg;
+
+ /*
+ * Continue as long as we're finding errors, we're still within
+ * the acceptable rewind range, and we're still finding uberblocks
+ */
+ while (rewind_error && spa->spa_uberblock.ub_txg >= min_txg &&
+ spa->spa_uberblock.ub_txg <= spa->spa_load_max_txg) {
+ if (spa->spa_load_max_txg < safe_rewind_txg)
+ spa->spa_extreme_rewind = B_TRUE;
+ rewind_error = spa_load_retry(spa, state);
+ }
+
+ spa->spa_extreme_rewind = B_FALSE;
+ spa->spa_load_max_txg = UINT64_MAX;
+
+ if (config && (rewind_error || state != SPA_LOAD_RECOVER))
+ spa_config_set(spa, config);
+ else
+ nvlist_free(config);
+
+ if (state == SPA_LOAD_RECOVER) {
+ ASSERT3P(loadinfo, ==, NULL);
+ return (rewind_error);
+ } else {
+ /* Store the rewind info as part of the initial load info */
+ fnvlist_add_nvlist(loadinfo, ZPOOL_CONFIG_REWIND_INFO,
+ spa->spa_load_info);
+
+ /* Restore the initial load info */
+ fnvlist_free(spa->spa_load_info);
+ spa->spa_load_info = loadinfo;
+
+ return (load_error);
+ }
+}
+
+/*
+ * Pool Open/Import
+ *
+ * The import case is identical to an open except that the configuration is sent
+ * down from userland, instead of grabbed from the configuration cache. For the
+ * case of an open, the pool configuration will exist in the
+ * POOL_STATE_UNINITIALIZED state.
+ *
+ * The stats information (gen/count/ustats) is used to gather vdev statistics at
+ * the same time we open the pool, without having to keep around the spa_t in
+ * some ambiguous state.
+ */
+static int
+spa_open_common(const char *pool, spa_t **spapp, void *tag, nvlist_t *nvpolicy,
+ nvlist_t **config)
+{
+ spa_t *spa;
+ spa_load_state_t state = SPA_LOAD_OPEN;
+ int error;
+ int locked = B_FALSE;
+ int firstopen = B_FALSE;
+
+ *spapp = NULL;
+
+ /*
+ * As disgusting as this is, we need to support recursive calls to this
+ * function because dsl_dir_open() is called during spa_load(), and ends
+ * up calling spa_open() again. The real fix is to figure out how to
+ * avoid dsl_dir_open() calling this in the first place.
+ */
+ if (mutex_owner(&spa_namespace_lock) != curthread) {
+ mutex_enter(&spa_namespace_lock);
+ locked = B_TRUE;
+ }
+
+ if ((spa = spa_lookup(pool)) == NULL) {
+ if (locked)
+ mutex_exit(&spa_namespace_lock);
+ return (SET_ERROR(ENOENT));
+ }
+
+ if (spa->spa_state == POOL_STATE_UNINITIALIZED) {
+ zpool_load_policy_t policy;
+
+ firstopen = B_TRUE;
+
+ zpool_get_load_policy(nvpolicy ? nvpolicy : spa->spa_config,
+ &policy);
+ if (policy.zlp_rewind & ZPOOL_DO_REWIND)
+ state = SPA_LOAD_RECOVER;
+
+ spa_activate(spa, spa_mode_global);
+
+ if (state != SPA_LOAD_RECOVER)
+ spa->spa_last_ubsync_txg = spa->spa_load_txg = 0;
+ spa->spa_config_source = SPA_CONFIG_SRC_CACHEFILE;
+
+ zfs_dbgmsg("spa_open_common: opening %s", pool);
+ error = spa_load_best(spa, state, policy.zlp_txg,
+ policy.zlp_rewind);
+
+ if (error == EBADF) {
+ /*
+ * If vdev_validate() returns failure (indicated by
+ * EBADF), it means that one of the vdevs indicates
+ * that the pool has been exported or destroyed. If
+ * this is the case, the config cache is out of sync and
+ * we should remove the pool from the namespace.
+ */
+ spa_unload(spa);
+ spa_deactivate(spa);
+ spa_write_cachefile(spa, B_TRUE, B_TRUE);
+ spa_remove(spa);
+ if (locked)
+ mutex_exit(&spa_namespace_lock);
+ return (SET_ERROR(ENOENT));
+ }
+
+ if (error) {
+ /*
+ * We can't open the pool, but we still have useful
+ * information: the state of each vdev after the
+ * attempted vdev_open(). Return this to the user.
+ */
+ if (config != NULL && spa->spa_config) {
+ VERIFY(nvlist_dup(spa->spa_config, config,
+ KM_SLEEP) == 0);
+ VERIFY(nvlist_add_nvlist(*config,
+ ZPOOL_CONFIG_LOAD_INFO,
+ spa->spa_load_info) == 0);
+ }
+ spa_unload(spa);
+ spa_deactivate(spa);
+ spa->spa_last_open_failed = error;
+ if (locked)
+ mutex_exit(&spa_namespace_lock);
+ *spapp = NULL;
+ return (error);
+ }
+ }
+
+ spa_open_ref(spa, tag);
+
+ if (config != NULL)
+ *config = spa_config_generate(spa, NULL, -1ULL, B_TRUE);
+
+ /*
+ * If we've recovered the pool, pass back any information we
+ * gathered while doing the load.
+ */
+ if (state == SPA_LOAD_RECOVER) {
+ VERIFY(nvlist_add_nvlist(*config, ZPOOL_CONFIG_LOAD_INFO,
+ spa->spa_load_info) == 0);
+ }
+
+ if (locked) {
+ spa->spa_last_open_failed = 0;
+ spa->spa_last_ubsync_txg = 0;
+ spa->spa_load_txg = 0;
+ mutex_exit(&spa_namespace_lock);
+#ifdef __FreeBSD__
+#ifdef _KERNEL
+ if (firstopen)
+ zvol_create_minors(spa, spa->spa_name);
+#endif
+#endif
+ }
+
+ *spapp = spa;
+
+ return (0);
+}
+
+int
+spa_open_rewind(const char *name, spa_t **spapp, void *tag, nvlist_t *policy,
+ nvlist_t **config)
+{
+ return (spa_open_common(name, spapp, tag, policy, config));
+}
+
+int
+spa_open(const char *name, spa_t **spapp, void *tag)
+{
+ return (spa_open_common(name, spapp, tag, NULL, NULL));
+}
+
+/*
+ * Lookup the given spa_t, incrementing the inject count in the process,
+ * preventing it from being exported or destroyed.
+ */
+spa_t *
+spa_inject_addref(char *name)
+{
+ spa_t *spa;
+
+ mutex_enter(&spa_namespace_lock);
+ if ((spa = spa_lookup(name)) == NULL) {
+ mutex_exit(&spa_namespace_lock);
+ return (NULL);
+ }
+ spa->spa_inject_ref++;
+ mutex_exit(&spa_namespace_lock);
+
+ return (spa);
+}
+
+void
+spa_inject_delref(spa_t *spa)
+{
+ mutex_enter(&spa_namespace_lock);
+ spa->spa_inject_ref--;
+ mutex_exit(&spa_namespace_lock);
+}
+
+/*
+ * Add spare device information to the nvlist.
+ */
+static void
+spa_add_spares(spa_t *spa, nvlist_t *config)
+{
+ nvlist_t **spares;
+ uint_t i, nspares;
+ nvlist_t *nvroot;
+ uint64_t guid;
+ vdev_stat_t *vs;
+ uint_t vsc;
+ uint64_t pool;
+
+ ASSERT(spa_config_held(spa, SCL_CONFIG, RW_READER));
+
+ if (spa->spa_spares.sav_count == 0)
+ return;
+
+ VERIFY(nvlist_lookup_nvlist(config,
+ ZPOOL_CONFIG_VDEV_TREE, &nvroot) == 0);
+ VERIFY(nvlist_lookup_nvlist_array(spa->spa_spares.sav_config,
+ ZPOOL_CONFIG_SPARES, &spares, &nspares) == 0);
+ if (nspares != 0) {
+ VERIFY(nvlist_add_nvlist_array(nvroot,
+ ZPOOL_CONFIG_SPARES, spares, nspares) == 0);
+ VERIFY(nvlist_lookup_nvlist_array(nvroot,
+ ZPOOL_CONFIG_SPARES, &spares, &nspares) == 0);
+
+ /*
+ * Go through and find any spares which have since been
+ * repurposed as an active spare. If this is the case, update
+ * their status appropriately.
+ */
+ for (i = 0; i < nspares; i++) {
+ VERIFY(nvlist_lookup_uint64(spares[i],
+ ZPOOL_CONFIG_GUID, &guid) == 0);
+ if (spa_spare_exists(guid, &pool, NULL) &&
+ pool != 0ULL) {
+ VERIFY(nvlist_lookup_uint64_array(
+ spares[i], ZPOOL_CONFIG_VDEV_STATS,
+ (uint64_t **)&vs, &vsc) == 0);
+ vs->vs_state = VDEV_STATE_CANT_OPEN;
+ vs->vs_aux = VDEV_AUX_SPARED;
+ }
+ }
+ }
+}
+
+/*
+ * Add l2cache device information to the nvlist, including vdev stats.
+ */
+static void
+spa_add_l2cache(spa_t *spa, nvlist_t *config)
+{
+ nvlist_t **l2cache;
+ uint_t i, j, nl2cache;
+ nvlist_t *nvroot;
+ uint64_t guid;
+ vdev_t *vd;
+ vdev_stat_t *vs;
+ uint_t vsc;
+
+ ASSERT(spa_config_held(spa, SCL_CONFIG, RW_READER));
+
+ if (spa->spa_l2cache.sav_count == 0)
+ return;
+
+ VERIFY(nvlist_lookup_nvlist(config,
+ ZPOOL_CONFIG_VDEV_TREE, &nvroot) == 0);
+ VERIFY(nvlist_lookup_nvlist_array(spa->spa_l2cache.sav_config,
+ ZPOOL_CONFIG_L2CACHE, &l2cache, &nl2cache) == 0);
+ if (nl2cache != 0) {
+ VERIFY(nvlist_add_nvlist_array(nvroot,
+ ZPOOL_CONFIG_L2CACHE, l2cache, nl2cache) == 0);
+ VERIFY(nvlist_lookup_nvlist_array(nvroot,
+ ZPOOL_CONFIG_L2CACHE, &l2cache, &nl2cache) == 0);
+
+ /*
+ * Update level 2 cache device stats.
+ */
+
+ for (i = 0; i < nl2cache; i++) {
+ VERIFY(nvlist_lookup_uint64(l2cache[i],
+ ZPOOL_CONFIG_GUID, &guid) == 0);
+
+ vd = NULL;
+ for (j = 0; j < spa->spa_l2cache.sav_count; j++) {
+ if (guid ==
+ spa->spa_l2cache.sav_vdevs[j]->vdev_guid) {
+ vd = spa->spa_l2cache.sav_vdevs[j];
+ break;
+ }
+ }
+ ASSERT(vd != NULL);
+
+ VERIFY(nvlist_lookup_uint64_array(l2cache[i],
+ ZPOOL_CONFIG_VDEV_STATS, (uint64_t **)&vs, &vsc)
+ == 0);
+ vdev_get_stats(vd, vs);
+ }
+ }
+}
+
+static void
+spa_feature_stats_from_disk(spa_t *spa, nvlist_t *features)
+{
+ zap_cursor_t zc;
+ zap_attribute_t za;
+
+ /* We may be unable to read features if the pool is suspended. */
+ if (spa_suspended(spa))
+ return;
+
+ if (spa->spa_feat_for_read_obj != 0) {
+ for (zap_cursor_init(&zc, spa->spa_meta_objset,
+ spa->spa_feat_for_read_obj);
+ zap_cursor_retrieve(&zc, &za) == 0;
+ zap_cursor_advance(&zc)) {
+ ASSERT(za.za_integer_length == sizeof (uint64_t) &&
+ za.za_num_integers == 1);
+ VERIFY0(nvlist_add_uint64(features, za.za_name,
+ za.za_first_integer));
+ }
+ zap_cursor_fini(&zc);
+ }
+
+ if (spa->spa_feat_for_write_obj != 0) {
+ for (zap_cursor_init(&zc, spa->spa_meta_objset,
+ spa->spa_feat_for_write_obj);
+ zap_cursor_retrieve(&zc, &za) == 0;
+ zap_cursor_advance(&zc)) {
+ ASSERT(za.za_integer_length == sizeof (uint64_t) &&
+ za.za_num_integers == 1);
+ VERIFY0(nvlist_add_uint64(features, za.za_name,
+ za.za_first_integer));
+ }
+ zap_cursor_fini(&zc);
+ }
+}
+
+static void
+spa_feature_stats_from_cache(spa_t *spa, nvlist_t *features)
+{
+ int i;
+
+ for (i = 0; i < SPA_FEATURES; i++) {
+ zfeature_info_t feature = spa_feature_table[i];
+ uint64_t refcount;
+
+ if (feature_get_refcount(spa, &feature, &refcount) != 0)
+ continue;
+
+ VERIFY0(nvlist_add_uint64(features, feature.fi_guid, refcount));
+ }
+}
+
+/*
+ * Store a list of pool features and their reference counts in the
+ * config.
+ *
+ * The first time this is called on a spa, allocate a new nvlist, fetch
+ * the pool features and reference counts from disk, then save the list
+ * in the spa. In subsequent calls on the same spa, use the saved nvlist
+ * and refresh its values from the cached reference counts. This
+ * ensures we don't block here on I/O on a suspended pool so 'zpool
+ * clear' can resume the pool.
+ */
+static void
+spa_add_feature_stats(spa_t *spa, nvlist_t *config)
+{
+ nvlist_t *features;
+
+ ASSERT(spa_config_held(spa, SCL_CONFIG, RW_READER));
+
+ mutex_enter(&spa->spa_feat_stats_lock);
+ features = spa->spa_feat_stats;
+
+ if (features != NULL) {
+ spa_feature_stats_from_cache(spa, features);
+ } else {
+ VERIFY0(nvlist_alloc(&features, NV_UNIQUE_NAME, KM_SLEEP));
+ spa->spa_feat_stats = features;
+ spa_feature_stats_from_disk(spa, features);
+ }
+
+ VERIFY0(nvlist_add_nvlist(config, ZPOOL_CONFIG_FEATURE_STATS,
+ features));
+
+ mutex_exit(&spa->spa_feat_stats_lock);
+}
+
+int
+spa_get_stats(const char *name, nvlist_t **config,
+ char *altroot, size_t buflen)
+{
+ int error;
+ spa_t *spa;
+
+ *config = NULL;
+ error = spa_open_common(name, &spa, FTAG, NULL, config);
+
+ if (spa != NULL) {
+ /*
+ * This still leaves a window of inconsistency where the spares
+ * or l2cache devices could change and the config would be
+ * self-inconsistent.
+ */
+ spa_config_enter(spa, SCL_CONFIG, FTAG, RW_READER);
+
+ if (*config != NULL) {
+ uint64_t loadtimes[2];
+
+ loadtimes[0] = spa->spa_loaded_ts.tv_sec;
+ loadtimes[1] = spa->spa_loaded_ts.tv_nsec;
+ VERIFY(nvlist_add_uint64_array(*config,
+ ZPOOL_CONFIG_LOADED_TIME, loadtimes, 2) == 0);
+
+ VERIFY(nvlist_add_uint64(*config,
+ ZPOOL_CONFIG_ERRCOUNT,
+ spa_get_errlog_size(spa)) == 0);
+
+ if (spa_suspended(spa)) {
+ VERIFY(nvlist_add_uint64(*config,
+ ZPOOL_CONFIG_SUSPENDED,
+ spa->spa_failmode) == 0);
+ VERIFY(nvlist_add_uint64(*config,
+ ZPOOL_CONFIG_SUSPENDED_REASON,
+ spa->spa_suspended) == 0);
+ }
+
+ spa_add_spares(spa, *config);
+ spa_add_l2cache(spa, *config);
+ spa_add_feature_stats(spa, *config);
+ }
+ }
+
+ /*
+ * We want to get the alternate root even for faulted pools, so we cheat
+ * and call spa_lookup() directly.
+ */
+ if (altroot) {
+ if (spa == NULL) {
+ mutex_enter(&spa_namespace_lock);
+ spa = spa_lookup(name);
+ if (spa)
+ spa_altroot(spa, altroot, buflen);
+ else
+ altroot[0] = '\0';
+ spa = NULL;
+ mutex_exit(&spa_namespace_lock);
+ } else {
+ spa_altroot(spa, altroot, buflen);
+ }
+ }
+
+ if (spa != NULL) {
+ spa_config_exit(spa, SCL_CONFIG, FTAG);
+ spa_close(spa, FTAG);
+ }
+
+ return (error);
+}
+
+/*
+ * Validate that the auxiliary device array is well formed. We must have an
+ * array of nvlists, each of which describes a valid leaf vdev. If this is an
+ * import (mode is VDEV_ALLOC_SPARE), then we allow corrupted spares to be
+ * specified, as long as they are well-formed.
+ */
+static int
+spa_validate_aux_devs(spa_t *spa, nvlist_t *nvroot, uint64_t crtxg, int mode,
+ spa_aux_vdev_t *sav, const char *config, uint64_t version,
+ vdev_labeltype_t label)
+{
+ nvlist_t **dev;
+ uint_t i, ndev;
+ vdev_t *vd;
+ int error;
+
+ ASSERT(spa_config_held(spa, SCL_ALL, RW_WRITER) == SCL_ALL);
+
+ /*
+ * It's acceptable to have no devs specified.
+ */
+ if (nvlist_lookup_nvlist_array(nvroot, config, &dev, &ndev) != 0)
+ return (0);
+
+ if (ndev == 0)
+ return (SET_ERROR(EINVAL));
+
+ /*
+ * Make sure the pool is formatted with a version that supports this
+ * device type.
+ */
+ if (spa_version(spa) < version)
+ return (SET_ERROR(ENOTSUP));
+
+ /*
+ * Set the pending device list so we correctly handle device in-use
+ * checking.
+ */
+ sav->sav_pending = dev;
+ sav->sav_npending = ndev;
+
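+ /*
+ * Parse, open and label-init each device to verify that it is a
+ * usable leaf vdev; the vdev_t itself is only needed transiently.
+ */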
+ for (i = 0; i < ndev; i++) {
+ if ((error = spa_config_parse(spa, &vd, dev[i], NULL, 0,
+ mode)) != 0)
+ goto out;
+
+ if (!vd->vdev_ops->vdev_op_leaf) {
+ vdev_free(vd);
+ error = SET_ERROR(EINVAL);
+ goto out;
+ }
+
+ vd->vdev_top = vd;
+
+ if ((error = vdev_open(vd)) == 0 &&
+ (error = vdev_label_init(vd, crtxg, label)) == 0) {
+ VERIFY(nvlist_add_uint64(dev[i], ZPOOL_CONFIG_GUID,
+ vd->vdev_guid) == 0);
+ }
+
+ vdev_free(vd);
+
+ if (error &&
+ (mode != VDEV_ALLOC_SPARE && mode != VDEV_ALLOC_L2CACHE))
+ goto out;
+ else
+ error = 0;
+ }
+
+out:
+ sav->sav_pending = NULL;
+ sav->sav_npending = 0;
+ return (error);
+}
+
+static int
+spa_validate_aux(spa_t *spa, nvlist_t *nvroot, uint64_t crtxg, int mode)
+{
+ int error;
+
+ ASSERT(spa_config_held(spa, SCL_ALL, RW_WRITER) == SCL_ALL);
+
+ if ((error = spa_validate_aux_devs(spa, nvroot, crtxg, mode,
+ &spa->spa_spares, ZPOOL_CONFIG_SPARES, SPA_VERSION_SPARES,
+ VDEV_LABEL_SPARE)) != 0) {
+ return (error);
+ }
+
+ return (spa_validate_aux_devs(spa, nvroot, crtxg, mode,
+ &spa->spa_l2cache, ZPOOL_CONFIG_L2CACHE, SPA_VERSION_L2CACHE,
+ VDEV_LABEL_L2CACHE));
+}
+
+static void
+spa_set_aux_vdevs(spa_aux_vdev_t *sav, nvlist_t **devs, int ndevs,
+ const char *config)
+{
+ int i;
+
+ if (sav->sav_config != NULL) {
+ nvlist_t **olddevs;
+ uint_t oldndevs;
+ nvlist_t **newdevs;
+
+ /*
+ * Generate a new dev list by concatenating with the
+ * current dev list.
+ */
+ VERIFY(nvlist_lookup_nvlist_array(sav->sav_config, config,
+ &olddevs, &oldndevs) == 0);
+
+ newdevs = kmem_alloc(sizeof (void *) *
+ (ndevs + oldndevs), KM_SLEEP);
+ for (i = 0; i < oldndevs; i++)
+ VERIFY(nvlist_dup(olddevs[i], &newdevs[i],
+ KM_SLEEP) == 0);
+ for (i = 0; i < ndevs; i++)
+ VERIFY(nvlist_dup(devs[i], &newdevs[i + oldndevs],
+ KM_SLEEP) == 0);
+
+ VERIFY(nvlist_remove(sav->sav_config, config,
+ DATA_TYPE_NVLIST_ARRAY) == 0);
+
+ VERIFY(nvlist_add_nvlist_array(sav->sav_config,
+ config, newdevs, ndevs + oldndevs) == 0);
+ for (i = 0; i < oldndevs + ndevs; i++)
+ nvlist_free(newdevs[i]);
+ kmem_free(newdevs, (oldndevs + ndevs) * sizeof (void *));
+ } else {
+ /*
+ * Generate a new dev list.
+ */
+ VERIFY(nvlist_alloc(&sav->sav_config, NV_UNIQUE_NAME,
+ KM_SLEEP) == 0);
+ VERIFY(nvlist_add_nvlist_array(sav->sav_config, config,
+ devs, ndevs) == 0);
+ }
+}
+
+/*
+ * Stop and drop level 2 ARC devices
+ */
+void
+spa_l2cache_drop(spa_t *spa)
+{
+ vdev_t *vd;
+ int i;
+ spa_aux_vdev_t *sav = &spa->spa_l2cache;
+
+ for (i = 0; i < sav->sav_count; i++) {
+ uint64_t pool;
+
+ vd = sav->sav_vdevs[i];
+ ASSERT(vd != NULL);
+
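+ /*
+ * Detach the device from the L2ARC only if it still belongs to a
+ * pool and is actually registered with the L2ARC.
+ */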
+ if (spa_l2cache_exists(vd->vdev_guid, &pool) &&
+ pool != 0ULL && l2arc_vdev_present(vd))
+ l2arc_remove_vdev(vd);
+ }
+}
+
+/*
+ * Pool Creation
+ */
+int
+spa_create(const char *pool, nvlist_t *nvroot, nvlist_t *props,
+ nvlist_t *zplprops)
+{
+ spa_t *spa;
+ char *altroot = NULL;
+ vdev_t *rvd;
+ dsl_pool_t *dp;
+ dmu_tx_t *tx;
+ int error = 0;
+ uint64_t txg = TXG_INITIAL;
+ nvlist_t **spares, **l2cache;
+ uint_t nspares, nl2cache;
+ uint64_t version, obj;
+ boolean_t has_features;
+ char *poolname;
+ nvlist_t *nvl;
+
+ if (props == NULL ||
+ nvlist_lookup_string(props,
+ zpool_prop_to_name(ZPOOL_PROP_TNAME), &poolname) != 0)
+ poolname = (char *)pool;
+
+ /*
+ * If this pool already exists, return failure.
+ */
+ mutex_enter(&spa_namespace_lock);
+ if (spa_lookup(poolname) != NULL) {
+ mutex_exit(&spa_namespace_lock);
+ return (SET_ERROR(EEXIST));
+ }
+
+ /*
+ * Allocate a new spa_t structure.
+ */
+ nvl = fnvlist_alloc();
+ fnvlist_add_string(nvl, ZPOOL_CONFIG_POOL_NAME, pool);
+ (void) nvlist_lookup_string(props,
+ zpool_prop_to_name(ZPOOL_PROP_ALTROOT), &altroot);
+ spa = spa_add(poolname, nvl, altroot);
+ fnvlist_free(nvl);
+ spa_activate(spa, spa_mode_global);
+
+ if (props && (error = spa_prop_validate(spa, props))) {
+ spa_deactivate(spa);
+ spa_remove(spa);
+ mutex_exit(&spa_namespace_lock);
+ return (error);
+ }
+
+ /*
+ * Temporary pool names should never be written to disk.
+ */
+ if (poolname != pool)
+ spa->spa_import_flags |= ZFS_IMPORT_TEMP_NAME;
+
+ has_features = B_FALSE;
+ for (nvpair_t *elem = nvlist_next_nvpair(props, NULL);
+ elem != NULL; elem = nvlist_next_nvpair(props, elem)) {
+ if (zpool_prop_feature(nvpair_name(elem)))
+ has_features = B_TRUE;
+ }
+
+ if (has_features || nvlist_lookup_uint64(props,
+ zpool_prop_to_name(ZPOOL_PROP_VERSION), &version) != 0) {
+ version = SPA_VERSION;
+ }
+ ASSERT(SPA_VERSION_IS_SUPPORTED(version));
+
+ spa->spa_first_txg = txg;
+ spa->spa_uberblock.ub_txg = txg - 1;
+ spa->spa_uberblock.ub_version = version;
+ spa->spa_ubsync = spa->spa_uberblock;
+ spa->spa_load_state = SPA_LOAD_CREATE;
+ spa->spa_removing_phys.sr_state = DSS_NONE;
+ spa->spa_removing_phys.sr_removing_vdev = -1;
+ spa->spa_removing_phys.sr_prev_indirect_vdev = -1;
+ spa->spa_indirect_vdevs_loaded = B_TRUE;
+
+ /*
+ * Create "The Godfather" zio to hold all async IOs
+ */
+ spa->spa_async_zio_root = kmem_alloc(max_ncpus * sizeof (void *),
+ KM_SLEEP);
+ for (int i = 0; i < max_ncpus; i++) {
+ spa->spa_async_zio_root[i] = zio_root(spa, NULL, NULL,
+ ZIO_FLAG_CANFAIL | ZIO_FLAG_SPECULATIVE |
+ ZIO_FLAG_GODFATHER);
+ }
+
+ /*
+ * Create the root vdev.
+ */
+ spa_config_enter(spa, SCL_ALL, FTAG, RW_WRITER);
+
+ error = spa_config_parse(spa, &rvd, nvroot, NULL, 0, VDEV_ALLOC_ADD);
+
+ ASSERT(error != 0 || rvd != NULL);
+ ASSERT(error != 0 || spa->spa_root_vdev == rvd);
+
+ if (error == 0 && !zfs_allocatable_devs(nvroot))
+ error = SET_ERROR(EINVAL);
+
+ if (error == 0 &&
+ (error = vdev_create(rvd, txg, B_FALSE)) == 0 &&
+ (error = spa_validate_aux(spa, nvroot, txg,
+ VDEV_ALLOC_ADD)) == 0) {
+ /*
+ * Instantiate the metaslab groups (this will dirty the vdevs);
+ * we can no longer error exit past this point.
+ */
+ for (int c = 0; error == 0 && c < rvd->vdev_children; c++) {
+ vdev_t *vd = rvd->vdev_child[c];
+
+ vdev_ashift_optimize(vd);
+ vdev_metaslab_set_size(vd);
+ vdev_expand(vd, txg);
+ }
+ }
+
+ spa_config_exit(spa, SCL_ALL, FTAG);
+
+ if (error != 0) {
+ spa_unload(spa);
+ spa_deactivate(spa);
+ spa_remove(spa);
+ mutex_exit(&spa_namespace_lock);
+ return (error);
+ }
+
+ /*
+ * Get the list of spares, if specified.
+ */
+ if (nvlist_lookup_nvlist_array(nvroot, ZPOOL_CONFIG_SPARES,
+ &spares, &nspares) == 0) {
+ VERIFY(nvlist_alloc(&spa->spa_spares.sav_config, NV_UNIQUE_NAME,
+ KM_SLEEP) == 0);
+ VERIFY(nvlist_add_nvlist_array(spa->spa_spares.sav_config,
+ ZPOOL_CONFIG_SPARES, spares, nspares) == 0);
+ spa_config_enter(spa, SCL_ALL, FTAG, RW_WRITER);
+ spa_load_spares(spa);
+ spa_config_exit(spa, SCL_ALL, FTAG);
+ spa->spa_spares.sav_sync = B_TRUE;
+ }
+
+ /*
+ * Get the list of level 2 cache devices, if specified.
+ */
+ if (nvlist_lookup_nvlist_array(nvroot, ZPOOL_CONFIG_L2CACHE,
+ &l2cache, &nl2cache) == 0) {
+ VERIFY(nvlist_alloc(&spa->spa_l2cache.sav_config,
+ NV_UNIQUE_NAME, KM_SLEEP) == 0);
+ VERIFY(nvlist_add_nvlist_array(spa->spa_l2cache.sav_config,
+ ZPOOL_CONFIG_L2CACHE, l2cache, nl2cache) == 0);
+ spa_config_enter(spa, SCL_ALL, FTAG, RW_WRITER);
+ spa_load_l2cache(spa);
+ spa_config_exit(spa, SCL_ALL, FTAG);
+ spa->spa_l2cache.sav_sync = B_TRUE;
+ }
+
+ spa->spa_is_initializing = B_TRUE;
+ spa->spa_dsl_pool = dp = dsl_pool_create(spa, zplprops, txg);
+ spa->spa_meta_objset = dp->dp_meta_objset;
+ spa->spa_is_initializing = B_FALSE;
+
+ /*
+ * Create DDTs (dedup tables).
+ */
+ ddt_create(spa);
+
+ spa_update_dspace(spa);
+
+ tx = dmu_tx_create_assigned(dp, txg);
+
+ /*
+ * Create the pool config object.
+ */
+ spa->spa_config_object = dmu_object_alloc(spa->spa_meta_objset,
+ DMU_OT_PACKED_NVLIST, SPA_CONFIG_BLOCKSIZE,
+ DMU_OT_PACKED_NVLIST_SIZE, sizeof (uint64_t), tx);
+
+ if (zap_add(spa->spa_meta_objset,
+ DMU_POOL_DIRECTORY_OBJECT, DMU_POOL_CONFIG,
+ sizeof (uint64_t), 1, &spa->spa_config_object, tx) != 0) {
+ cmn_err(CE_PANIC, "failed to add pool config");
+ }
+
+ if (spa_version(spa) >= SPA_VERSION_FEATURES)
+ spa_feature_create_zap_objects(spa, tx);
+
+ if (zap_add(spa->spa_meta_objset,
+ DMU_POOL_DIRECTORY_OBJECT, DMU_POOL_CREATION_VERSION,
+ sizeof (uint64_t), 1, &version, tx) != 0) {
+ cmn_err(CE_PANIC, "failed to add pool version");
+ }
+
+ /* Newly created pools with the right version are always deflated. */
+ if (version >= SPA_VERSION_RAIDZ_DEFLATE) {
+ spa->spa_deflate = TRUE;
+ if (zap_add(spa->spa_meta_objset,
+ DMU_POOL_DIRECTORY_OBJECT, DMU_POOL_DEFLATE,
+ sizeof (uint64_t), 1, &spa->spa_deflate, tx) != 0) {
+ cmn_err(CE_PANIC, "failed to add deflate");
+ }
+ }
+
+ /*
+ * Create the deferred-free bpobj. Turn off compression
+ * because sync-to-convergence takes longer if the blocksize
+ * keeps changing.
+ */
+ obj = bpobj_alloc(spa->spa_meta_objset, 1 << 14, tx);
+ dmu_object_set_compress(spa->spa_meta_objset, obj,
+ ZIO_COMPRESS_OFF, tx);
+ if (zap_add(spa->spa_meta_objset,
+ DMU_POOL_DIRECTORY_OBJECT, DMU_POOL_SYNC_BPOBJ,
+ sizeof (uint64_t), 1, &obj, tx) != 0) {
+ cmn_err(CE_PANIC, "failed to add bpobj");
+ }
+ VERIFY3U(0, ==, bpobj_open(&spa->spa_deferred_bpobj,
+ spa->spa_meta_objset, obj));
+
+ /*
+ * Create the pool's history object.
+ */
+ if (version >= SPA_VERSION_ZPOOL_HISTORY)
+ spa_history_create_obj(spa, tx);
+
+ /*
+ * Generate some random noise for salted checksums to operate on.
+ */
+ (void) random_get_pseudo_bytes(spa->spa_cksum_salt.zcs_bytes,
+ sizeof (spa->spa_cksum_salt.zcs_bytes));
+
+ /*
+ * Set pool properties.
+ */
+ spa->spa_bootfs = zpool_prop_default_numeric(ZPOOL_PROP_BOOTFS);
+ spa->spa_delegation = zpool_prop_default_numeric(ZPOOL_PROP_DELEGATION);
+ spa->spa_failmode = zpool_prop_default_numeric(ZPOOL_PROP_FAILUREMODE);
+ spa->spa_autoexpand = zpool_prop_default_numeric(ZPOOL_PROP_AUTOEXPAND);
+ spa->spa_multihost = zpool_prop_default_numeric(ZPOOL_PROP_MULTIHOST);
+
+ if (props != NULL) {
+ spa_configfile_set(spa, props, B_FALSE);
+ spa_sync_props(props, tx);
+ }
+
+ dmu_tx_commit(tx);
+
+ spa->spa_sync_on = B_TRUE;
+ txg_sync_start(spa->spa_dsl_pool);
+ mmp_thread_start(spa);
+
+ /*
+ * We explicitly wait for the first transaction to complete so that our
+ * bean counters are appropriately updated.
+ */
+ txg_wait_synced(spa->spa_dsl_pool, txg);
+
+ spa_spawn_aux_threads(spa);
+
+ spa_write_cachefile(spa, B_FALSE, B_TRUE);
+ spa_event_notify(spa, NULL, NULL, ESC_ZFS_POOL_CREATE);
+
+ spa_history_log_version(spa, "create");
+
+ /*
+ * Don't count references from objsets that are already closed
+ * and are making their way through the eviction process.
+ */
+ spa_evicting_os_wait(spa);
+ spa->spa_minref = zfs_refcount_count(&spa->spa_refcount);
+ spa->spa_load_state = SPA_LOAD_NONE;
+
+ mutex_exit(&spa_namespace_lock);
+
+ return (0);
+}
+
+#ifdef _KERNEL
+#ifdef illumos
+/*
+ * Get the root pool information from the root disk, then import the root pool
+ * at system boot time.
+ */
+extern int vdev_disk_read_rootlabel(char *, char *, nvlist_t **);
+
+static nvlist_t *
+spa_generate_rootconf(char *devpath, char *devid, uint64_t *guid)
+{
+ nvlist_t *config;
+ nvlist_t *nvtop, *nvroot;
+ uint64_t pgid;
+
+ if (vdev_disk_read_rootlabel(devpath, devid, &config) != 0)
+ return (NULL);
+
+ /*
+ * Add this top-level vdev to the child array.
+ */
+ VERIFY(nvlist_lookup_nvlist(config, ZPOOL_CONFIG_VDEV_TREE,
+ &nvtop) == 0);
+ VERIFY(nvlist_lookup_uint64(config, ZPOOL_CONFIG_POOL_GUID,
+ &pgid) == 0);
+ VERIFY(nvlist_lookup_uint64(config, ZPOOL_CONFIG_GUID, guid) == 0);
+
+ /*
+ * Put this pool's top-level vdevs into a root vdev.
+ */
+ VERIFY(nvlist_alloc(&nvroot, NV_UNIQUE_NAME, KM_SLEEP) == 0);
+ VERIFY(nvlist_add_string(nvroot, ZPOOL_CONFIG_TYPE,
+ VDEV_TYPE_ROOT) == 0);
+ VERIFY(nvlist_add_uint64(nvroot, ZPOOL_CONFIG_ID, 0ULL) == 0);
+ VERIFY(nvlist_add_uint64(nvroot, ZPOOL_CONFIG_GUID, pgid) == 0);
+ VERIFY(nvlist_add_nvlist_array(nvroot, ZPOOL_CONFIG_CHILDREN,
+ &nvtop, 1) == 0);
+
+ /*
+ * Replace the existing vdev_tree with the new root vdev in
+ * this pool's configuration (remove the old, add the new).
+ */
+ VERIFY(nvlist_add_nvlist(config, ZPOOL_CONFIG_VDEV_TREE, nvroot) == 0);
+ nvlist_free(nvroot);
+ return (config);
+}
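+
+/*
+ * The net effect of spa_generate_rootconf() is a pool config whose
+ * vdev tree has been rewritten around the single labeled boot device.
+ * A sketch of the resulting nvlist shape (names abbreviated, values
+ * illustrative):
+ *
+ *	pool_guid=<pgid>
+ *	vdev_tree={
+ *		type="root" id=0 guid=<pgid>
+ *		children[0]={ <top-level vdev copied from the label> }
+ *	}
+ */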
+
+/*
+ * Walk the vdev tree and see if we can find a device with "better"
+ * configuration. A configuration is "better" if the label on that
+ * device has a more recent txg.
+ */
+static void
+spa_alt_rootvdev(vdev_t *vd, vdev_t **avd, uint64_t *txg)
+{
+ for (int c = 0; c < vd->vdev_children; c++)
+ spa_alt_rootvdev(vd->vdev_child[c], avd, txg);
+
+ if (vd->vdev_ops->vdev_op_leaf) {
+ nvlist_t *label;
+ uint64_t label_txg;
+
+ if (vdev_disk_read_rootlabel(vd->vdev_physpath, vd->vdev_devid,
+ &label) != 0)
+ return;
+
+ VERIFY(nvlist_lookup_uint64(label, ZPOOL_CONFIG_POOL_TXG,
+ &label_txg) == 0);
+
+ /*
+ * Do we have a better boot device?
+ */
+ if (label_txg > *txg) {
+ *txg = label_txg;
+ *avd = vd;
+ }
+ nvlist_free(label);
+ }
+}
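+
+/*
+ * Typical use, as in spa_import_rootpool() below: seed *avd and *txg
+ * from the device we actually booted from, and let the walk replace
+ * them if any leaf's label carries a newer txg (a sketch):
+ *
+ *	avd = bvd;
+ *	spa_alt_rootvdev(rvd, &avd, &txg);
+ *	if (avd != bvd)
+ *		... boot device label is stale; suggest avd->vdev_path ...
+ */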
+
+/*
+ * Import a root pool.
+ *
+ * For x86, devpath_list will consist of the devid and/or physpath name of
+ * the vdev (e.g. "id1,sd@SSEAGATE..." or "/pci@1f,0/ide@d/disk@0,0:a").
+ * The GRUB "findroot" command will return the vdev we should boot.
+ *
+ * For SPARC, devpath_list consists of the physpath name of the booting
+ * device, no matter whether the root pool is a single-device pool or a
+ * mirrored pool.
+ * e.g.
+ * "/pci@1f,0/ide@d/disk@0,0:a"
+ */
+int
+spa_import_rootpool(char *devpath, char *devid)
+{
+ spa_t *spa;
+ vdev_t *rvd, *bvd, *avd = NULL;
+ nvlist_t *config, *nvtop;
+ uint64_t guid, txg;
+ char *pname;
+ int error;
+
+ /*
+ * Read the label from the boot device and generate a configuration.
+ */
+ config = spa_generate_rootconf(devpath, devid, &guid);
+#if defined(_OBP) && defined(_KERNEL)
+ if (config == NULL) {
+ if (strstr(devpath, "/iscsi/ssd") != NULL) {
+ /* iscsi boot */
+ get_iscsi_bootpath_phy(devpath);
+ config = spa_generate_rootconf(devpath, devid, &guid);
+ }
+ }
+#endif
+ if (config == NULL) {
+ cmn_err(CE_NOTE, "Cannot read the pool label from '%s'",
+ devpath);
+ return (SET_ERROR(EIO));
+ }
+
+ VERIFY(nvlist_lookup_string(config, ZPOOL_CONFIG_POOL_NAME,
+ &pname) == 0);
+ VERIFY(nvlist_lookup_uint64(config, ZPOOL_CONFIG_POOL_TXG, &txg) == 0);
+
+ mutex_enter(&spa_namespace_lock);
+ if ((spa = spa_lookup(pname)) != NULL) {
+ /*
+ * Remove the existing root pool from the namespace so that we
+ * can replace it with the correct config we just read in.
+ */
+ spa_remove(spa);
+ }
+
+ spa = spa_add(pname, config, NULL);
+ spa->spa_is_root = B_TRUE;
+ spa->spa_import_flags = ZFS_IMPORT_VERBATIM;
+ if (nvlist_lookup_uint64(config, ZPOOL_CONFIG_VERSION,
+ &spa->spa_ubsync.ub_version) != 0)
+ spa->spa_ubsync.ub_version = SPA_VERSION_INITIAL;
+
+ /*
+ * Build up a vdev tree based on the boot device's label config.
+ */
+ VERIFY(nvlist_lookup_nvlist(config, ZPOOL_CONFIG_VDEV_TREE,
+ &nvtop) == 0);
+ spa_config_enter(spa, SCL_ALL, FTAG, RW_WRITER);
+ error = spa_config_parse(spa, &rvd, nvtop, NULL, 0,
+ VDEV_ALLOC_ROOTPOOL);
+ spa_config_exit(spa, SCL_ALL, FTAG);
+ if (error) {
+ mutex_exit(&spa_namespace_lock);
+ nvlist_free(config);
+ cmn_err(CE_NOTE, "Can not parse the config for pool '%s'",
+ pname);
+ return (error);
+ }
+
+ /*
+ * Get the boot vdev.
+ */
+ if ((bvd = vdev_lookup_by_guid(rvd, guid)) == NULL) {
+ cmn_err(CE_NOTE, "Can not find the boot vdev for guid %llu",
+ (u_longlong_t)guid);
+ error = SET_ERROR(ENOENT);
+ goto out;
+ }
+
+ /*
+ * Determine if there is a better boot device.
+ */
+ avd = bvd;
+ spa_alt_rootvdev(rvd, &avd, &txg);
+ if (avd != bvd) {
+ cmn_err(CE_NOTE, "The boot device is 'degraded'. Please "
+ "try booting from '%s'", avd->vdev_path);
+ error = SET_ERROR(EINVAL);
+ goto out;
+ }
+
+ /*
+ * If the boot device is part of a spare vdev then ensure that
+ * we're booting off the active spare.
+ */
+ if (bvd->vdev_parent->vdev_ops == &vdev_spare_ops &&
+ !bvd->vdev_isspare) {
+ cmn_err(CE_NOTE, "The boot device is currently spared. Please "
+ "try booting from '%s'",
+ bvd->vdev_parent->
+ vdev_child[bvd->vdev_parent->vdev_children - 1]->vdev_path);
+ error = SET_ERROR(EINVAL);
+ goto out;
+ }
+
+ error = 0;
+out:
+ spa_config_enter(spa, SCL_ALL, FTAG, RW_WRITER);
+ vdev_free(rvd);
+ spa_config_exit(spa, SCL_ALL, FTAG);
+ mutex_exit(&spa_namespace_lock);
+
+ nvlist_free(config);
+ return (error);
+}
+
+#else /* !illumos */
+
+extern int vdev_geom_read_pool_label(const char *name, nvlist_t ***configs,
+ uint64_t *count);
+
+static nvlist_t *
+spa_generate_rootconf(const char *name)
+{
+ nvlist_t **configs, **tops;
+ nvlist_t *config;
+ nvlist_t *best_cfg, *nvtop, *nvroot;
+ uint64_t *holes;
+ uint64_t best_txg;
+ uint64_t nchildren;
+ uint64_t pgid;
+ uint64_t count;
+ uint64_t i;
+ uint_t nholes;
+
+ if (vdev_geom_read_pool_label(name, &configs, &count) != 0)
+ return (NULL);
+
+ ASSERT3U(count, !=, 0);
+ best_txg = 0;
+ for (i = 0; i < count; i++) {
+ uint64_t txg;
+
+ VERIFY(nvlist_lookup_uint64(configs[i], ZPOOL_CONFIG_POOL_TXG,
+ &txg) == 0);
+ if (txg > best_txg) {
+ best_txg = txg;
+ best_cfg = configs[i];
+ }
+ }
+
+ nchildren = 1;
+ nvlist_lookup_uint64(best_cfg, ZPOOL_CONFIG_VDEV_CHILDREN, &nchildren);
+ holes = NULL;
+ nvlist_lookup_uint64_array(best_cfg, ZPOOL_CONFIG_HOLE_ARRAY,
+ &holes, &nholes);
+
+ tops = kmem_zalloc(nchildren * sizeof(void *), KM_SLEEP);
+ for (i = 0; i < nchildren; i++) {
+ if (i >= count)
+ break;
+ if (configs[i] == NULL)
+ continue;
+ VERIFY(nvlist_lookup_nvlist(configs[i], ZPOOL_CONFIG_VDEV_TREE,
+ &nvtop) == 0);
+ nvlist_dup(nvtop, &tops[i], KM_SLEEP);
+ }
+ for (i = 0; holes != NULL && i < nholes; i++) {
+ if (i >= nchildren)
+ continue;
+ if (tops[holes[i]] != NULL)
+ continue;
+ nvlist_alloc(&tops[holes[i]], NV_UNIQUE_NAME, KM_SLEEP);
+ VERIFY(nvlist_add_string(tops[holes[i]], ZPOOL_CONFIG_TYPE,
+ VDEV_TYPE_HOLE) == 0);
+ VERIFY(nvlist_add_uint64(tops[holes[i]], ZPOOL_CONFIG_ID,
+ holes[i]) == 0);
+ VERIFY(nvlist_add_uint64(tops[holes[i]], ZPOOL_CONFIG_GUID,
+ 0) == 0);
+ }
+ for (i = 0; i < nchildren; i++) {
+ if (tops[i] != NULL)
+ continue;
+ nvlist_alloc(&tops[i], NV_UNIQUE_NAME, KM_SLEEP);
+ VERIFY(nvlist_add_string(tops[i], ZPOOL_CONFIG_TYPE,
+ VDEV_TYPE_MISSING) == 0);
+ VERIFY(nvlist_add_uint64(tops[i], ZPOOL_CONFIG_ID,
+ i) == 0);
+ VERIFY(nvlist_add_uint64(tops[i], ZPOOL_CONFIG_GUID,
+ 0) == 0);
+ }
+
+ /*
+ * Create pool config based on the best vdev config.
+ */
+ nvlist_dup(best_cfg, &config, KM_SLEEP);
+
+ /*
+ * Put this pool's top-level vdevs into a root vdev.
+ */
+ VERIFY(nvlist_lookup_uint64(config, ZPOOL_CONFIG_POOL_GUID,
+ &pgid) == 0);
+ VERIFY(nvlist_alloc(&nvroot, NV_UNIQUE_NAME, KM_SLEEP) == 0);
+ VERIFY(nvlist_add_string(nvroot, ZPOOL_CONFIG_TYPE,
+ VDEV_TYPE_ROOT) == 0);
+ VERIFY(nvlist_add_uint64(nvroot, ZPOOL_CONFIG_ID, 0ULL) == 0);
+ VERIFY(nvlist_add_uint64(nvroot, ZPOOL_CONFIG_GUID, pgid) == 0);
+ VERIFY(nvlist_add_nvlist_array(nvroot, ZPOOL_CONFIG_CHILDREN,
+ tops, nchildren) == 0);
+
+ /*
+ * Replace the existing vdev_tree with the new root vdev in
+ * this pool's configuration (remove the old, add the new).
+ */
+ VERIFY(nvlist_add_nvlist(config, ZPOOL_CONFIG_VDEV_TREE, nvroot) == 0);
+
+ /*
+ * Drop vdev config elements that should not be present at pool level.
+ */
+ nvlist_remove(config, ZPOOL_CONFIG_GUID, DATA_TYPE_UINT64);
+ nvlist_remove(config, ZPOOL_CONFIG_TOP_GUID, DATA_TYPE_UINT64);
+
+ for (i = 0; i < count; i++)
+ nvlist_free(configs[i]);
+ kmem_free(configs, count * sizeof(void *));
+ for (i = 0; i < nchildren; i++)
+ nvlist_free(tops[i]);
+ kmem_free(tops, nchildren * sizeof(void *));
+ nvlist_free(nvroot);
+ return (config);
+}
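+
+/*
+ * Unlike the illumos variant above, this version reassembles the whole
+ * top-level child array from every label vdev_geom_read_pool_label()
+ * could find.  Slots that cannot be filled are stubbed out so that the
+ * vdev ids stay stable.  A sketch for a three-wide pool where only
+ * child 0 had a readable label and child 1 is a known hole:
+ *
+ *	children[0]={ <vdev_tree from the label> }
+ *	children[1]={ type="hole"    id=1 guid=0 }
+ *	children[2]={ type="missing" id=2 guid=0 }
+ */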
+
+int
+spa_import_rootpool(const char *name, bool checkpointrewind)
+{
+ spa_t *spa;
+ vdev_t *rvd, *bvd, *avd = NULL;
+ nvlist_t *config, *nvtop;
+ uint64_t txg;
+ char *pname;
+ int error;
+
+ /*
+ * Read the label from the boot device and generate a configuration.
+ */
+ config = spa_generate_rootconf(name);
+
+ mutex_enter(&spa_namespace_lock);
+ if (config != NULL) {
+ VERIFY(nvlist_lookup_string(config, ZPOOL_CONFIG_POOL_NAME,
+ &pname) == 0 && strcmp(name, pname) == 0);
+ VERIFY(nvlist_lookup_uint64(config, ZPOOL_CONFIG_POOL_TXG, &txg)
+ == 0);
+
+ if ((spa = spa_lookup(pname)) != NULL) {
+ /*
+ * The pool could already be imported,
+ * e.g., after reboot -r.
+ */
+ if (spa->spa_state == POOL_STATE_ACTIVE) {
+ mutex_exit(&spa_namespace_lock);
+ nvlist_free(config);
+ return (0);
+ }
+
+ /*
+ * Remove the existing root pool from the namespace so
+ * that we can replace it with the correct config
+ * we just read in.
+ */
+ spa_remove(spa);
+ }
+ spa = spa_add(pname, config, NULL);
+
+ /*
+ * Set spa_ubsync.ub_version as it can be used in vdev_alloc()
+ * via spa_version().
+ */
+ if (nvlist_lookup_uint64(config, ZPOOL_CONFIG_VERSION,
+ &spa->spa_ubsync.ub_version) != 0)
+ spa->spa_ubsync.ub_version = SPA_VERSION_INITIAL;
+ } else if ((spa = spa_lookup(name)) == NULL) {
+ mutex_exit(&spa_namespace_lock);
+ nvlist_free(config);
+ cmn_err(CE_NOTE, "Cannot find the pool label for '%s'",
+ name);
+ return (EIO);
+ } else {
+ VERIFY(nvlist_dup(spa->spa_config, &config, KM_SLEEP) == 0);
+ }
+ spa->spa_is_root = B_TRUE;
+ spa->spa_import_flags = ZFS_IMPORT_VERBATIM;
+ if (checkpointrewind) {
+ spa->spa_import_flags |= ZFS_IMPORT_CHECKPOINT;
+ }
+
+ /*
+ * Build up a vdev tree based on the boot device's label config.
+ */
+ VERIFY(nvlist_lookup_nvlist(config, ZPOOL_CONFIG_VDEV_TREE,
+ &nvtop) == 0);
+ spa_config_enter(spa, SCL_ALL, FTAG, RW_WRITER);
+ error = spa_config_parse(spa, &rvd, nvtop, NULL, 0,
+ VDEV_ALLOC_ROOTPOOL);
+ spa_config_exit(spa, SCL_ALL, FTAG);
+ if (error) {
+ mutex_exit(&spa_namespace_lock);
+ nvlist_free(config);
+ cmn_err(CE_NOTE, "Can not parse the config for pool '%s'",
+ pname);
+ return (error);
+ }
+
+ spa_config_enter(spa, SCL_ALL, FTAG, RW_WRITER);
+ vdev_free(rvd);
+ spa_config_exit(spa, SCL_ALL, FTAG);
+ mutex_exit(&spa_namespace_lock);
+
+ nvlist_free(config);
+ return (0);
+}
+
+#endif /* illumos */
+#endif /* _KERNEL */
+
+/*
+ * Import a non-root pool into the system.
+ */
+int
+spa_import(const char *pool, nvlist_t *config, nvlist_t *props, uint64_t flags)
+{
+ spa_t *spa;
+ char *altroot = NULL;
+ spa_load_state_t state = SPA_LOAD_IMPORT;
+ zpool_load_policy_t policy;
+ uint64_t mode = spa_mode_global;
+ uint64_t readonly = B_FALSE;
+ int error;
+ nvlist_t *nvroot;
+ nvlist_t **spares, **l2cache;
+ uint_t nspares, nl2cache;
+
+ /*
+ * If a pool with this name exists, return failure.
+ */
+ mutex_enter(&spa_namespace_lock);
+ if (spa_lookup(pool) != NULL) {
+ mutex_exit(&spa_namespace_lock);
+ return (SET_ERROR(EEXIST));
+ }
+
+ /*
+ * Create and initialize the spa structure.
+ */
+ (void) nvlist_lookup_string(props,
+ zpool_prop_to_name(ZPOOL_PROP_ALTROOT), &altroot);
+ (void) nvlist_lookup_uint64(props,
+ zpool_prop_to_name(ZPOOL_PROP_READONLY), &readonly);
+ if (readonly)
+ mode = FREAD;
+ spa = spa_add(pool, config, altroot);
+ spa->spa_import_flags = flags;
+
+ /*
+ * Verbatim import - Take a pool and insert it into the namespace
+ * as if it had been loaded at boot.
+ */
+ if (spa->spa_import_flags & ZFS_IMPORT_VERBATIM) {
+ if (props != NULL)
+ spa_configfile_set(spa, props, B_FALSE);
+
+ spa_write_cachefile(spa, B_FALSE, B_TRUE);
+ spa_event_notify(spa, NULL, NULL, ESC_ZFS_POOL_IMPORT);
+ zfs_dbgmsg("spa_import: verbatim import of %s", pool);
+ mutex_exit(&spa_namespace_lock);
+ return (0);
+ }
+
+ spa_activate(spa, mode);
+
+ /*
+ * Don't start async tasks until we know everything is healthy.
+ */
+ spa_async_suspend(spa);
+
+ zpool_get_load_policy(config, &policy);
+ if (policy.zlp_rewind & ZPOOL_DO_REWIND)
+ state = SPA_LOAD_RECOVER;
+
+ spa->spa_config_source = SPA_CONFIG_SRC_TRYIMPORT;
+
+ if (state != SPA_LOAD_RECOVER) {
+ spa->spa_last_ubsync_txg = spa->spa_load_txg = 0;
+ zfs_dbgmsg("spa_import: importing %s", pool);
+ } else {
+ zfs_dbgmsg("spa_import: importing %s, max_txg=%lld "
+ "(RECOVERY MODE)", pool, (longlong_t)policy.zlp_txg);
+ }
+ error = spa_load_best(spa, state, policy.zlp_txg, policy.zlp_rewind);
+
+ /*
+ * Propagate anything learned while loading the pool and pass it
+ * back to caller (i.e. rewind info, missing devices, etc).
+ */
+ VERIFY(nvlist_add_nvlist(config, ZPOOL_CONFIG_LOAD_INFO,
+ spa->spa_load_info) == 0);
+
+ spa_config_enter(spa, SCL_ALL, FTAG, RW_WRITER);
+ /*
+ * Toss any existing sparelist, as it doesn't have any validity
+ * anymore, and conflicts with spa_has_spare().
+ */
+ if (spa->spa_spares.sav_config) {
+ nvlist_free(spa->spa_spares.sav_config);
+ spa->spa_spares.sav_config = NULL;
+ spa_load_spares(spa);
+ }
+ if (spa->spa_l2cache.sav_config) {
+ nvlist_free(spa->spa_l2cache.sav_config);
+ spa->spa_l2cache.sav_config = NULL;
+ spa_load_l2cache(spa);
+ }
+
+ VERIFY(nvlist_lookup_nvlist(config, ZPOOL_CONFIG_VDEV_TREE,
+ &nvroot) == 0);
+ if (error == 0)
+ error = spa_validate_aux(spa, nvroot, -1ULL,
+ VDEV_ALLOC_SPARE);
+ if (error == 0)
+ error = spa_validate_aux(spa, nvroot, -1ULL,
+ VDEV_ALLOC_L2CACHE);
+ spa_config_exit(spa, SCL_ALL, FTAG);
+
+ if (props != NULL)
+ spa_configfile_set(spa, props, B_FALSE);
+
+ if (error != 0 || (props && spa_writeable(spa) &&
+ (error = spa_prop_set(spa, props)))) {
+ spa_unload(spa);
+ spa_deactivate(spa);
+ spa_remove(spa);
+ mutex_exit(&spa_namespace_lock);
+ return (error);
+ }
+
+ spa_async_resume(spa);
+
+ /*
+ * Override any spares and level 2 cache devices as specified by
+ * the user, as these may have correct device names/devids, etc.
+ */
+ if (nvlist_lookup_nvlist_array(nvroot, ZPOOL_CONFIG_SPARES,
+ &spares, &nspares) == 0) {
+ if (spa->spa_spares.sav_config)
+ VERIFY(nvlist_remove(spa->spa_spares.sav_config,
+ ZPOOL_CONFIG_SPARES, DATA_TYPE_NVLIST_ARRAY) == 0);
+ else
+ VERIFY(nvlist_alloc(&spa->spa_spares.sav_config,
+ NV_UNIQUE_NAME, KM_SLEEP) == 0);
+ VERIFY(nvlist_add_nvlist_array(spa->spa_spares.sav_config,
+ ZPOOL_CONFIG_SPARES, spares, nspares) == 0);
+ spa_config_enter(spa, SCL_ALL, FTAG, RW_WRITER);
+ spa_load_spares(spa);
+ spa_config_exit(spa, SCL_ALL, FTAG);
+ spa->spa_spares.sav_sync = B_TRUE;
+ }
+ if (nvlist_lookup_nvlist_array(nvroot, ZPOOL_CONFIG_L2CACHE,
+ &l2cache, &nl2cache) == 0) {
+ if (spa->spa_l2cache.sav_config)
+ VERIFY(nvlist_remove(spa->spa_l2cache.sav_config,
+ ZPOOL_CONFIG_L2CACHE, DATA_TYPE_NVLIST_ARRAY) == 0);
+ else
+ VERIFY(nvlist_alloc(&spa->spa_l2cache.sav_config,
+ NV_UNIQUE_NAME, KM_SLEEP) == 0);
+ VERIFY(nvlist_add_nvlist_array(spa->spa_l2cache.sav_config,
+ ZPOOL_CONFIG_L2CACHE, l2cache, nl2cache) == 0);
+ spa_config_enter(spa, SCL_ALL, FTAG, RW_WRITER);
+ spa_load_l2cache(spa);
+ spa_config_exit(spa, SCL_ALL, FTAG);
+ spa->spa_l2cache.sav_sync = B_TRUE;
+ }
+
+ /*
+ * Check for any removed devices.
+ */
+ if (spa->spa_autoreplace) {
+ spa_aux_check_removed(&spa->spa_spares);
+ spa_aux_check_removed(&spa->spa_l2cache);
+ }
+
+ if (spa_writeable(spa)) {
+ /*
+ * Update the config cache to include the newly-imported pool.
+ */
+ spa_config_update(spa, SPA_CONFIG_UPDATE_POOL);
+ }
+
+ /*
+ * It's possible that the pool was expanded while it was exported.
+ * We kick off an async task to handle this for us.
+ */
+ spa_async_request(spa, SPA_ASYNC_AUTOEXPAND);
+
+ spa_history_log_version(spa, "import");
+
+ spa_event_notify(spa, NULL, NULL, ESC_ZFS_POOL_IMPORT);
+
+ mutex_exit(&spa_namespace_lock);
+
+#ifdef __FreeBSD__
+#ifdef _KERNEL
+ zvol_create_minors(spa, pool);
+#endif
+#endif
+ return (0);
+}
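+
+/*
+ * A minimal caller sketch (the pool name and the source of the config
+ * are illustrative; a config would normally come from a label scan or
+ * from spa_tryimport() below):
+ *
+ *	nvlist_t *config = ...;
+ *	error = spa_import("tank", config, NULL, ZFS_IMPORT_NORMAL);
+ *
+ * Passing ZFS_IMPORT_VERBATIM instead skips spa_load() entirely and,
+ * as the code above shows, only reinserts the pool into the namespace.
+ */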
+
+nvlist_t *
+spa_tryimport(nvlist_t *tryconfig)
+{
+ nvlist_t *config = NULL;
+ char *poolname, *cachefile;
+ spa_t *spa;
+ uint64_t state;
+ int error;
+ zpool_load_policy_t policy;
+
+ if (nvlist_lookup_string(tryconfig, ZPOOL_CONFIG_POOL_NAME, &poolname))
+ return (NULL);
+
+ if (nvlist_lookup_uint64(tryconfig, ZPOOL_CONFIG_POOL_STATE, &state))
+ return (NULL);
+
+ /*
+ * Create and initialize the spa structure.
+ */
+ mutex_enter(&spa_namespace_lock);
+ spa = spa_add(TRYIMPORT_NAME, tryconfig, NULL);
+ spa_activate(spa, FREAD);
+
+ /*
+ * Rewind pool if a max txg was provided.
+ */
+ zpool_get_load_policy(spa->spa_config, &policy);
+ if (policy.zlp_txg != UINT64_MAX) {
+ spa->spa_load_max_txg = policy.zlp_txg;
+ spa->spa_extreme_rewind = B_TRUE;
+ zfs_dbgmsg("spa_tryimport: importing %s, max_txg=%lld",
+ poolname, (longlong_t)policy.zlp_txg);
+ } else {
+ zfs_dbgmsg("spa_tryimport: importing %s", poolname);
+ }
+
+ if (nvlist_lookup_string(tryconfig, ZPOOL_CONFIG_CACHEFILE, &cachefile)
+ == 0) {
+ zfs_dbgmsg("spa_tryimport: using cachefile '%s'", cachefile);
+ spa->spa_config_source = SPA_CONFIG_SRC_CACHEFILE;
+ } else {
+ spa->spa_config_source = SPA_CONFIG_SRC_SCAN;
+ }
+
+ error = spa_load(spa, SPA_LOAD_TRYIMPORT, SPA_IMPORT_EXISTING);
+
+ /*
+ * If 'tryconfig' was at least parsable, return the current config.
+ */
+ if (spa->spa_root_vdev != NULL) {
+ config = spa_config_generate(spa, NULL, -1ULL, B_TRUE);
+ VERIFY(nvlist_add_string(config, ZPOOL_CONFIG_POOL_NAME,
+ poolname) == 0);
+ VERIFY(nvlist_add_uint64(config, ZPOOL_CONFIG_POOL_STATE,
+ state) == 0);
+ VERIFY(nvlist_add_uint64(config, ZPOOL_CONFIG_TIMESTAMP,
+ spa->spa_uberblock.ub_timestamp) == 0);
+ VERIFY(nvlist_add_nvlist(config, ZPOOL_CONFIG_LOAD_INFO,
+ spa->spa_load_info) == 0);
+
+ /*
+ * If the bootfs property exists on this pool then we
+ * copy it out so that external consumers can tell which
+ * pools are bootable.
+ */
+ if ((!error || error == EEXIST) && spa->spa_bootfs) {
+ char *tmpname = kmem_alloc(MAXPATHLEN, KM_SLEEP);
+
+ /*
+ * We have to play games with the name since the
+ * pool was opened as TRYIMPORT_NAME.
+ */
+ if (dsl_dsobj_to_dsname(spa_name(spa),
+ spa->spa_bootfs, tmpname) == 0) {
+ char *cp;
+ char *dsname = kmem_alloc(MAXPATHLEN, KM_SLEEP);
+
+ cp = strchr(tmpname, '/');
+ if (cp == NULL) {
+ (void) strlcpy(dsname, tmpname,
+ MAXPATHLEN);
+ } else {
+ (void) snprintf(dsname, MAXPATHLEN,
+ "%s/%s", poolname, ++cp);
+ }
+ VERIFY(nvlist_add_string(config,
+ ZPOOL_CONFIG_BOOTFS, dsname) == 0);
+ kmem_free(dsname, MAXPATHLEN);
+ }
+ kmem_free(tmpname, MAXPATHLEN);
+ }
+
+ /*
+ * Add the list of hot spares and level 2 cache devices.
+ */
+ spa_config_enter(spa, SCL_CONFIG, FTAG, RW_READER);
+ spa_add_spares(spa, config);
+ spa_add_l2cache(spa, config);
+ spa_config_exit(spa, SCL_CONFIG, FTAG);
+ }
+
+ spa_unload(spa);
+ spa_deactivate(spa);
+ spa_remove(spa);
+ mutex_exit(&spa_namespace_lock);
+
+ return (config);
+}
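+
+/*
+ * spa_tryimport() is the read-only half of the import handshake
+ * (typically driven by the ZFS_IOC_POOL_TRYIMPORT ioctl).  A sketch of
+ * the expected usage:
+ *
+ *	nvlist_t *tried = spa_tryimport(scanned_config);
+ *	if (tried != NULL)
+ *		... inspect ZPOOL_CONFIG_LOAD_INFO, then spa_import() ...
+ */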
+
+/*
+ * Pool export/destroy
+ *
+ * The act of destroying or exporting a pool is very simple. We make sure there
+ * is no more pending I/O and any references to the pool are gone. Then, we
+ * update the pool state and sync all the labels to disk, removing the
+ * configuration from the cache afterwards. If the 'hardforce' flag is set, then
+ * we don't sync the labels or remove the configuration cache.
+ */
+static int
+spa_export_common(char *pool, int new_state, nvlist_t **oldconfig,
+ boolean_t force, boolean_t hardforce)
+{
+ spa_t *spa;
+
+ if (oldconfig)
+ *oldconfig = NULL;
+
+ if (!(spa_mode_global & FWRITE))
+ return (SET_ERROR(EROFS));
+
+ mutex_enter(&spa_namespace_lock);
+ if ((spa = spa_lookup(pool)) == NULL) {
+ mutex_exit(&spa_namespace_lock);
+ return (SET_ERROR(ENOENT));
+ }
+
+ /*
+ * Put a hold on the pool, drop the namespace lock, stop async tasks,
+ * reacquire the namespace lock, and see if we can export.
+ */
+ spa_open_ref(spa, FTAG);
+ mutex_exit(&spa_namespace_lock);
+ spa_async_suspend(spa);
+ if (spa->spa_zvol_taskq) {
+#ifdef _KERNEL
+ zvol_remove_minors(spa, spa_name(spa));
+#endif
+ taskq_wait(spa->spa_zvol_taskq);
+ }
+ mutex_enter(&spa_namespace_lock);
+ spa_close(spa, FTAG);
+
+ /*
+ * The pool will be in core if it's openable,
+ * in which case we can modify its state.
+ */
+ if (spa->spa_state != POOL_STATE_UNINITIALIZED && spa->spa_sync_on) {
+
+ /*
+ * Objsets may be open only because they're dirty, so we
+ * have to force a txg sync before checking spa_refcnt.
+ */
+ txg_wait_synced(spa->spa_dsl_pool, 0);
+ spa_evicting_os_wait(spa);
+
+ /*
+ * A pool cannot be exported or destroyed if there are active
+ * references. If we are resetting a pool, allow references by
+ * fault injection handlers.
+ */
+ if (!spa_refcount_zero(spa) ||
+ (spa->spa_inject_ref != 0 &&
+ new_state != POOL_STATE_UNINITIALIZED)) {
+ spa_async_resume(spa);
+ mutex_exit(&spa_namespace_lock);
+ return (SET_ERROR(EBUSY));
+ }
+
+ /*
+ * A pool cannot be exported if it has an active shared spare.
+ * This is to prevent other pools from stealing the active spare
+ * from an exported pool. The user can still forcibly export
+ * such a pool with the 'force' flag.
+ */
+ if (!force && new_state == POOL_STATE_EXPORTED &&
+ spa_has_active_shared_spare(spa)) {
+ spa_async_resume(spa);
+ mutex_exit(&spa_namespace_lock);
+ return (SET_ERROR(EXDEV));
+ }
+
+ /*
+ * We're about to export or destroy this pool. Make sure
+ * we stop all initialization activity here before we
+ * set the spa_final_txg. This will ensure that all
+ * dirty data resulting from the initialization is
+ * committed to disk before we unload the pool.
+ */
+ if (spa->spa_root_vdev != NULL) {
+ vdev_initialize_stop_all(spa->spa_root_vdev,
+ VDEV_INITIALIZE_ACTIVE);
+ }
+
+ /*
+ * We want this to be reflected on every label,
+ * so mark them all dirty. spa_unload() will do the
+ * final sync that pushes these changes out.
+ */
+ if (new_state != POOL_STATE_UNINITIALIZED && !hardforce) {
+ spa_config_enter(spa, SCL_ALL, FTAG, RW_WRITER);
+ spa->spa_state = new_state;
+ spa->spa_final_txg = spa_last_synced_txg(spa) +
+ TXG_DEFER_SIZE + 1;
+ vdev_config_dirty(spa->spa_root_vdev);
+ spa_config_exit(spa, SCL_ALL, FTAG);
+ }
+ }
+
+ spa_event_notify(spa, NULL, NULL, ESC_ZFS_POOL_DESTROY);
+
+ if (spa->spa_state != POOL_STATE_UNINITIALIZED) {
+ spa_unload(spa);
+ spa_deactivate(spa);
+ }
+
+ if (oldconfig && spa->spa_config)
+ VERIFY(nvlist_dup(spa->spa_config, oldconfig, 0) == 0);
+
+ if (new_state != POOL_STATE_UNINITIALIZED) {
+ if (!hardforce)
+ spa_write_cachefile(spa, B_TRUE, B_TRUE);
+ spa_remove(spa);
+ }
+ mutex_exit(&spa_namespace_lock);
+
+ return (0);
+}
+
+/*
+ * Destroy a storage pool.
+ */
+int
+spa_destroy(char *pool)
+{
+ return (spa_export_common(pool, POOL_STATE_DESTROYED, NULL,
+ B_FALSE, B_FALSE));
+}
+
+/*
+ * Export a storage pool.
+ */
+int
+spa_export(char *pool, nvlist_t **oldconfig, boolean_t force,
+ boolean_t hardforce)
+{
+ return (spa_export_common(pool, POOL_STATE_EXPORTED, oldconfig,
+ force, hardforce));
+}
+
+/*
+ * Similar to spa_export(), this unloads the spa_t without actually removing it
+ * from the namespace in any way.
+ */
+int
+spa_reset(char *pool)
+{
+ return (spa_export_common(pool, POOL_STATE_UNINITIALIZED, NULL,
+ B_FALSE, B_FALSE));
+}
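+
+/*
+ * Summary of the spa_export_common() wrappers above (sketch):
+ *
+ *	spa_destroy()	POOL_STATE_DESTROYED	pool is gone for good
+ *	spa_export()	POOL_STATE_EXPORTED	pool can be reimported
+ *	spa_reset()	POOL_STATE_UNINITIALIZED
+ *						unload only; the spa_t stays
+ *						in the namespace
+ */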
+
+/*
+ * ==========================================================================
+ * Device manipulation
+ * ==========================================================================
+ */
+
+/*
+ * Add a device to a storage pool.
+ */
+int
+spa_vdev_add(spa_t *spa, nvlist_t *nvroot)
+{
+ uint64_t txg, id;
+ int error;
+ vdev_t *rvd = spa->spa_root_vdev;
+ vdev_t *vd, *tvd;
+ nvlist_t **spares, **l2cache;
+ uint_t nspares, nl2cache;
+
+ ASSERT(spa_writeable(spa));
+
+ txg = spa_vdev_enter(spa);
+
+ if ((error = spa_config_parse(spa, &vd, nvroot, NULL, 0,
+ VDEV_ALLOC_ADD)) != 0)
+ return (spa_vdev_exit(spa, NULL, txg, error));
+
+ spa->spa_pending_vdev = vd; /* spa_vdev_exit() will clear this */
+
+ if (nvlist_lookup_nvlist_array(nvroot, ZPOOL_CONFIG_SPARES, &spares,
+ &nspares) != 0)
+ nspares = 0;
+
+ if (nvlist_lookup_nvlist_array(nvroot, ZPOOL_CONFIG_L2CACHE, &l2cache,
+ &nl2cache) != 0)
+ nl2cache = 0;
+
+ if (vd->vdev_children == 0 && nspares == 0 && nl2cache == 0)
+ return (spa_vdev_exit(spa, vd, txg, EINVAL));
+
+ if (vd->vdev_children != 0 &&
+ (error = vdev_create(vd, txg, B_FALSE)) != 0)
+ return (spa_vdev_exit(spa, vd, txg, error));
+
+ /*
+ * We must validate the spares and l2cache devices after checking the
+ * children. Otherwise, vdev_inuse() will blindly overwrite the spare.
+ */
+ if ((error = spa_validate_aux(spa, nvroot, txg, VDEV_ALLOC_ADD)) != 0)
+ return (spa_vdev_exit(spa, vd, txg, error));
+
+ /*
+ * If we are in the middle of a device removal, we can only add
+ * devices which match the existing devices in the pool.
+ * If we are in the middle of a removal, or have some indirect
+ * vdevs, we cannot add raidz toplevels.
+ */
+ if (spa->spa_vdev_removal != NULL ||
+ spa->spa_removing_phys.sr_prev_indirect_vdev != -1) {
+ for (int c = 0; c < vd->vdev_children; c++) {
+ tvd = vd->vdev_child[c];
+ if (spa->spa_vdev_removal != NULL &&
+ tvd->vdev_ashift != spa->spa_max_ashift) {
+ return (spa_vdev_exit(spa, vd, txg, EINVAL));
+ }
+ /* Fail if top level vdev is raidz */
+ if (tvd->vdev_ops == &vdev_raidz_ops) {
+ return (spa_vdev_exit(spa, vd, txg, EINVAL));
+ }
+ /*
+ * A top-level mirror must be a
+ * mirror of leaf vdevs only.
+ */
+ if (tvd->vdev_ops == &vdev_mirror_ops) {
+ for (uint64_t cid = 0;
+ cid < tvd->vdev_children; cid++) {
+ vdev_t *cvd = tvd->vdev_child[cid];
+ if (!cvd->vdev_ops->vdev_op_leaf) {
+ return (spa_vdev_exit(spa, vd,
+ txg, EINVAL));
+ }
+ }
+ }
+ }
+ }
+
+ for (int c = 0; c < vd->vdev_children; c++) {
+
+ /*
+ * Set the vdev id to the first hole, if one exists.
+ */
+ for (id = 0; id < rvd->vdev_children; id++) {
+ if (rvd->vdev_child[id]->vdev_ishole) {
+ vdev_free(rvd->vdev_child[id]);
+ break;
+ }
+ }
+ tvd = vd->vdev_child[c];
+ vdev_remove_child(vd, tvd);
+ tvd->vdev_id = id;
+ vdev_add_child(rvd, tvd);
+ vdev_config_dirty(tvd);
+ }
+
+ if (nspares != 0) {
+ spa_set_aux_vdevs(&spa->spa_spares, spares, nspares,
+ ZPOOL_CONFIG_SPARES);
+ spa_load_spares(spa);
+ spa->spa_spares.sav_sync = B_TRUE;
+ }
+
+ if (nl2cache != 0) {
+ spa_set_aux_vdevs(&spa->spa_l2cache, l2cache, nl2cache,
+ ZPOOL_CONFIG_L2CACHE);
+ spa_load_l2cache(spa);
+ spa->spa_l2cache.sav_sync = B_TRUE;
+ }
+
+ /*
+ * We have to be careful when adding new vdevs to an existing pool.
+ * If other threads start allocating from these vdevs before we
+ * sync the config cache, and we lose power, then upon reboot we may
+ * fail to open the pool because there are DVAs that the config cache
+ * can't translate. Therefore, we first add the vdevs without
+ * initializing metaslabs; sync the config cache (via spa_vdev_exit());
+ * and then let spa_config_update() initialize the new metaslabs.
+ *
+ * spa_load() checks for added-but-not-initialized vdevs, so that
+ * if we lose power at any point in this sequence, the remaining
+ * steps will be completed the next time we load the pool.
+ */
+ (void) spa_vdev_exit(spa, vd, txg, 0);
+
+ mutex_enter(&spa_namespace_lock);
+ spa_config_update(spa, SPA_CONFIG_UPDATE_POOL);
+ spa_event_notify(spa, NULL, NULL, ESC_ZFS_VDEV_ADD);
+ mutex_exit(&spa_namespace_lock);
+
+ return (0);
+}
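+
+/*
+ * The nvroot argument mirrors what "zpool add" builds in userland: a
+ * root vdev whose children are the new top-level vdevs, plus optional
+ * spare and l2cache arrays.  A sketch (device paths illustrative):
+ *
+ *	{ type="root"
+ *	  children[0]={ type="disk" path="/dev/da2" }
+ *	  spares=[ { type="disk" path="/dev/da3" } ] }
+ */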
+
+/*
+ * Attach a device to a mirror. The arguments are the path to any device
+ * in the mirror, and the nvroot for the new device. If the path specifies
+ * a device that is not mirrored, we automatically insert the mirror vdev.
+ *
+ * If 'replacing' is specified, the new device is intended to replace the
+ * existing device; in this case the two devices are made into their own
+ * mirror using the 'replacing' vdev, which is functionally identical to
+ * the mirror vdev (it actually reuses all the same ops) but has a few
+ * extra rules: you can't attach to it after it's been created, and upon
+ * completion of resilvering, the first disk (the one being replaced)
+ * is automatically detached.
+ */
+int
+spa_vdev_attach(spa_t *spa, uint64_t guid, nvlist_t *nvroot, int replacing)
+{
+ uint64_t txg, dtl_max_txg;
+ vdev_t *rvd = spa->spa_root_vdev;
+ vdev_t *oldvd, *newvd, *newrootvd, *pvd, *tvd;
+ vdev_ops_t *pvops;
+ char *oldvdpath, *newvdpath;
+ int newvd_isspare;
+ int error;
+
+ ASSERT(spa_writeable(spa));
+
+ txg = spa_vdev_enter(spa);
+
+ oldvd = spa_lookup_by_guid(spa, guid, B_FALSE);
+
+ ASSERT(MUTEX_HELD(&spa_namespace_lock));
+ if (spa_feature_is_active(spa, SPA_FEATURE_POOL_CHECKPOINT)) {
+ error = (spa_has_checkpoint(spa)) ?
+ ZFS_ERR_CHECKPOINT_EXISTS : ZFS_ERR_DISCARDING_CHECKPOINT;
+ return (spa_vdev_exit(spa, NULL, txg, error));
+ }
+
+ if (spa->spa_vdev_removal != NULL)
+ return (spa_vdev_exit(spa, NULL, txg, EBUSY));
+
+ if (oldvd == NULL)
+ return (spa_vdev_exit(spa, NULL, txg, ENODEV));
+
+ if (!oldvd->vdev_ops->vdev_op_leaf)
+ return (spa_vdev_exit(spa, NULL, txg, ENOTSUP));
+
+ pvd = oldvd->vdev_parent;
+
+ if ((error = spa_config_parse(spa, &newrootvd, nvroot, NULL, 0,
+ VDEV_ALLOC_ATTACH)) != 0)
+ return (spa_vdev_exit(spa, NULL, txg, EINVAL));
+
+ if (newrootvd->vdev_children != 1)
+ return (spa_vdev_exit(spa, newrootvd, txg, EINVAL));
+
+ newvd = newrootvd->vdev_child[0];
+
+ if (!newvd->vdev_ops->vdev_op_leaf)
+ return (spa_vdev_exit(spa, newrootvd, txg, EINVAL));
+
+ if ((error = vdev_create(newrootvd, txg, replacing)) != 0)
+ return (spa_vdev_exit(spa, newrootvd, txg, error));
+
+ /*
+ * Spares can't replace logs
+ */
+ if (oldvd->vdev_top->vdev_islog && newvd->vdev_isspare)
+ return (spa_vdev_exit(spa, newrootvd, txg, ENOTSUP));
+
+ if (!replacing) {
+ /*
+ * For attach, the only allowable parent is a mirror or the root
+ * vdev.
+ */
+ if (pvd->vdev_ops != &vdev_mirror_ops &&
+ pvd->vdev_ops != &vdev_root_ops)
+ return (spa_vdev_exit(spa, newrootvd, txg, ENOTSUP));
+
+ pvops = &vdev_mirror_ops;
+ } else {
+ /*
+ * Active hot spares can only be replaced by inactive hot
+ * spares.
+ */
+ if (pvd->vdev_ops == &vdev_spare_ops &&
+ oldvd->vdev_isspare &&
+ !spa_has_spare(spa, newvd->vdev_guid))
+ return (spa_vdev_exit(spa, newrootvd, txg, ENOTSUP));
+
+ /*
+ * If the source is a hot spare, and the parent isn't already a
+ * spare, then we want to create a new hot spare. Otherwise, we
+ * want to create a replacing vdev. The user is not allowed to
+ * attach to a spared vdev child unless the 'isspare' state is
+ * the same (spare replaces spare, non-spare replaces
+ * non-spare).
+ */
+ if (pvd->vdev_ops == &vdev_replacing_ops &&
+ spa_version(spa) < SPA_VERSION_MULTI_REPLACE) {
+ return (spa_vdev_exit(spa, newrootvd, txg, ENOTSUP));
+ } else if (pvd->vdev_ops == &vdev_spare_ops &&
+ newvd->vdev_isspare != oldvd->vdev_isspare) {
+ return (spa_vdev_exit(spa, newrootvd, txg, ENOTSUP));
+ }
+
+ if (newvd->vdev_isspare)
+ pvops = &vdev_spare_ops;
+ else
+ pvops = &vdev_replacing_ops;
+ }
+
+ /*
+ * Make sure the new device is big enough.
+ */
+ if (newvd->vdev_asize < vdev_get_min_asize(oldvd))
+ return (spa_vdev_exit(spa, newrootvd, txg, EOVERFLOW));
+
+ /*
+ * The new device cannot have a higher alignment requirement
+ * than the top-level vdev.
+ */
+ if (newvd->vdev_ashift > oldvd->vdev_top->vdev_ashift)
+ return (spa_vdev_exit(spa, newrootvd, txg, EDOM));
+
+ /*
+ * If this is an in-place replacement, update oldvd's path and devid
+ * to make it distinguishable from newvd, and unopenable from now on.
+ */
+ if (strcmp(oldvd->vdev_path, newvd->vdev_path) == 0) {
+ spa_strfree(oldvd->vdev_path);
+ oldvd->vdev_path = kmem_alloc(strlen(newvd->vdev_path) + 5,
+ KM_SLEEP);
+ (void) sprintf(oldvd->vdev_path, "%s/%s",
+ newvd->vdev_path, "old");
+ if (oldvd->vdev_devid != NULL) {
+ spa_strfree(oldvd->vdev_devid);
+ oldvd->vdev_devid = NULL;
+ }
+ }
+
+ /* mark the device being resilvered */
+ newvd->vdev_resilver_txg = txg;
+
+ /*
+ * If the parent is not a mirror, or if we're replacing, insert the new
+ * mirror/replacing/spare vdev above oldvd.
+ */
+ if (pvd->vdev_ops != pvops)
+ pvd = vdev_add_parent(oldvd, pvops);
+
+ ASSERT(pvd->vdev_top->vdev_parent == rvd);
+ ASSERT(pvd->vdev_ops == pvops);
+ ASSERT(oldvd->vdev_parent == pvd);
+
+ /*
+ * Extract the new device from its root and add it to pvd.
+ */
+ vdev_remove_child(newrootvd, newvd);
+ newvd->vdev_id = pvd->vdev_children;
+ newvd->vdev_crtxg = oldvd->vdev_crtxg;
+ vdev_add_child(pvd, newvd);
+
+ tvd = newvd->vdev_top;
+ ASSERT(pvd->vdev_top == tvd);
+ ASSERT(tvd->vdev_parent == rvd);
+
+ vdev_config_dirty(tvd);
+
+ /*
+ * Set newvd's DTL to [TXG_INITIAL, dtl_max_txg) so that we account
+ * for any dmu_sync-ed blocks. It will propagate upward when
+ * spa_vdev_exit() calls vdev_dtl_reassess().
+ */
+ dtl_max_txg = txg + TXG_CONCURRENT_STATES;
+
+ vdev_dtl_dirty(newvd, DTL_MISSING, TXG_INITIAL,
+ dtl_max_txg - TXG_INITIAL);
+
+ if (newvd->vdev_isspare) {
+ spa_spare_activate(newvd);
+ spa_event_notify(spa, newvd, NULL, ESC_ZFS_VDEV_SPARE);
+ }
+
+ oldvdpath = spa_strdup(oldvd->vdev_path);
+ newvdpath = spa_strdup(newvd->vdev_path);
+ newvd_isspare = newvd->vdev_isspare;
+
+ /*
+ * Mark newvd's DTL dirty in this txg.
+ */
+ vdev_dirty(tvd, VDD_DTL, newvd, txg);
+
+ /*
+ * Schedule the resilver to restart in the future. We do this to
+ * ensure that dmu_sync-ed blocks have been stitched into the
+ * respective datasets.
+ */
+ dsl_resilver_restart(spa->spa_dsl_pool, dtl_max_txg);
+
+ if (spa->spa_bootfs)
+ spa_event_notify(spa, newvd, NULL, ESC_ZFS_BOOTFS_VDEV_ATTACH);
+
+ spa_event_notify(spa, newvd, NULL, ESC_ZFS_VDEV_ATTACH);
+
+ /*
+ * Commit the config
+ */
+ (void) spa_vdev_exit(spa, newrootvd, dtl_max_txg, 0);
+
+ spa_history_log_internal(spa, "vdev attach", NULL,
+ "%s vdev=%s %s vdev=%s",
+ replacing && newvd_isspare ? "spare in" :
+ replacing ? "replace" : "attach", newvdpath,
+ replacing ? "for" : "to", oldvdpath);
+
+ spa_strfree(oldvdpath);
+ spa_strfree(newvdpath);
+
+ return (0);
+}
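+
+/*
+ * The tree transformations performed above (sketch; A is oldvd, B is
+ * newvd):
+ *
+ *	attach:    A            =>  mirror(A, B)
+ *	replace:   A            =>  replacing(A, B)
+ *	spare in:  A (failing)  =>  spare(A, B)
+ *
+ * Once the resilver completes, the replacing/spare interposition is
+ * collapsed again by spa_vdev_resilver_done() below.
+ */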
+
+/*
+ * Detach a device from a mirror or replacing vdev.
+ *
+ * If 'replace_done' is specified, only detach if the parent
+ * is a replacing vdev.
+ */
+int
+spa_vdev_detach(spa_t *spa, uint64_t guid, uint64_t pguid, int replace_done)
+{
+ uint64_t txg;
+ int error;
+ vdev_t *rvd = spa->spa_root_vdev;
+ vdev_t *vd, *pvd, *cvd, *tvd;
+ boolean_t unspare = B_FALSE;
+ uint64_t unspare_guid = 0;
+ char *vdpath;
+
+ ASSERT(spa_writeable(spa));
+
+ txg = spa_vdev_enter(spa);
+
+ vd = spa_lookup_by_guid(spa, guid, B_FALSE);
+
+ /*
+ * Besides being called directly from userland through the
+ * ioctl interface, spa_vdev_detach() can potentially be called
+ * at the end of spa_vdev_resilver_done().
+ *
+ * In the regular case, when we have a checkpoint this shouldn't
+ * happen as we never empty the DTLs of a vdev during the scrub
+ * [see comment in dsl_scan_done()]. Thus spa_vdev_resilver_done()
+ * should never get here when we have a checkpoint.
+ *
+ * That said, even if we checkpoint the pool at the exact moment
+ * spa_vdev_resilver_done() calls this function, everything
+ * should be fine, as the resilver will return right away.
+ */
+ ASSERT(MUTEX_HELD(&spa_namespace_lock));
+ if (spa_feature_is_active(spa, SPA_FEATURE_POOL_CHECKPOINT)) {
+ error = (spa_has_checkpoint(spa)) ?
+ ZFS_ERR_CHECKPOINT_EXISTS : ZFS_ERR_DISCARDING_CHECKPOINT;
+ return (spa_vdev_exit(spa, NULL, txg, error));
+ }
+
+ if (vd == NULL)
+ return (spa_vdev_exit(spa, NULL, txg, ENODEV));
+
+ if (!vd->vdev_ops->vdev_op_leaf)
+ return (spa_vdev_exit(spa, NULL, txg, ENOTSUP));
+
+ pvd = vd->vdev_parent;
+
+ /*
+ * If the parent/child relationship is not as expected, don't do it.
+ * Consider M(A,R(B,C)) -- that is, a mirror of A with a replacing
+ * vdev that's replacing B with C. The user's intent in replacing
+ * is to go from M(A,B) to M(A,C). If the user decides to cancel
+ * the replace by detaching C, the expected behavior is to end up
+ * M(A,B). But suppose that right after deciding to detach C,
+ * the replacement of B completes. We would have M(A,C), and then
+ * ask to detach C, which would leave us with just A -- not what
+ * the user wanted. To prevent this, we make sure that the
+ * parent/child relationship hasn't changed -- in this example,
+ * that C's parent is still the replacing vdev R.
+ */
+ if (pvd->vdev_guid != pguid && pguid != 0)
+ return (spa_vdev_exit(spa, NULL, txg, EBUSY));
+
+ /*
+ * Only 'replacing' or 'spare' vdevs can be replaced.
+ */
+ if (replace_done && pvd->vdev_ops != &vdev_replacing_ops &&
+ pvd->vdev_ops != &vdev_spare_ops)
+ return (spa_vdev_exit(spa, NULL, txg, ENOTSUP));
+
+ ASSERT(pvd->vdev_ops != &vdev_spare_ops ||
+ spa_version(spa) >= SPA_VERSION_SPARES);
+
+ /*
+ * Only mirror, replacing, and spare vdevs support detach.
+ */
+ if (pvd->vdev_ops != &vdev_replacing_ops &&
+ pvd->vdev_ops != &vdev_mirror_ops &&
+ pvd->vdev_ops != &vdev_spare_ops)
+ return (spa_vdev_exit(spa, NULL, txg, ENOTSUP));
+
+ /*
+ * If this device has the only valid copy of some data,
+ * we cannot safely detach it.
+ */
+ if (vdev_dtl_required(vd))
+ return (spa_vdev_exit(spa, NULL, txg, EBUSY));
+
+ ASSERT(pvd->vdev_children >= 2);
+
+ /*
+ * If we are detaching the second disk from a replacing vdev, then
+ * check to see if we changed the original vdev's path to have "/old"
+ * at the end in spa_vdev_attach(). If so, undo that change now.
+ */
+ if (pvd->vdev_ops == &vdev_replacing_ops && vd->vdev_id > 0 &&
+ vd->vdev_path != NULL) {
+ size_t len = strlen(vd->vdev_path);
+
+ for (int c = 0; c < pvd->vdev_children; c++) {
+ cvd = pvd->vdev_child[c];
+
+ if (cvd == vd || cvd->vdev_path == NULL)
+ continue;
+
+ if (strncmp(cvd->vdev_path, vd->vdev_path, len) == 0 &&
+ strcmp(cvd->vdev_path + len, "/old") == 0) {
+ spa_strfree(cvd->vdev_path);
+ cvd->vdev_path = spa_strdup(vd->vdev_path);
+ break;
+ }
+ }
+ }
+
+ /*
+ * If we are detaching the original disk from a spare, then it implies
+ * that the spare should become a real disk, and be removed from the
+ * active spare list for the pool.
+ */
+ if (pvd->vdev_ops == &vdev_spare_ops &&
+ vd->vdev_id == 0 &&
+ pvd->vdev_child[pvd->vdev_children - 1]->vdev_isspare)
+ unspare = B_TRUE;
+
+ /*
+ * Erase the disk labels so the disk can be used for other things.
+ * This must be done after all other error cases are handled,
+ * but before we disembowel vd (so we can still do I/O to it).
+ * But if we can't do it, don't treat the error as fatal --
+ * it may be that the unwritability of the disk is the reason
+ * it's being detached!
+ */
+ error = vdev_label_init(vd, 0, VDEV_LABEL_REMOVE);
+
+ /*
+ * Remove vd from its parent and compact the parent's children.
+ */
+ vdev_remove_child(pvd, vd);
+ vdev_compact_children(pvd);
+
+ /*
+ * Remember one of the remaining children so we can get tvd below.
+ */
+ cvd = pvd->vdev_child[pvd->vdev_children - 1];
+
+ /*
+ * If we need to remove the remaining child from the list of hot spares,
+ * do it now, marking the vdev as no longer a spare in the process.
+ * We must do this before vdev_remove_parent(), because that can
+ * change the GUID if it creates a new toplevel GUID. For a similar
+ * reason, we must remove the spare now, in the same txg as the detach;
+ * otherwise someone could attach a new sibling, change the GUID, and
+ * the subsequent attempt to spa_vdev_remove(unspare_guid) would fail.
+ */
+ if (unspare) {
+ ASSERT(cvd->vdev_isspare);
+ spa_spare_remove(cvd);
+ unspare_guid = cvd->vdev_guid;
+ (void) spa_vdev_remove(spa, unspare_guid, B_TRUE);
+ cvd->vdev_unspare = B_TRUE;
+ }
+
+ /*
+ * If the parent mirror/replacing vdev only has one child,
+ * the parent is no longer needed. Remove it from the tree.
+ */
+ if (pvd->vdev_children == 1) {
+ if (pvd->vdev_ops == &vdev_spare_ops)
+ cvd->vdev_unspare = B_FALSE;
+ vdev_remove_parent(cvd);
+ }
+
+ /*
+ * We don't set tvd until now because the parent we just removed
+ * may have been the previous top-level vdev.
+ */
+ tvd = cvd->vdev_top;
+ ASSERT(tvd->vdev_parent == rvd);
+
+ /*
+ * Reevaluate the parent vdev state.
+ */
+ vdev_propagate_state(cvd);
+
+ /*
+ * If the 'autoexpand' property is set on the pool then automatically
+ * try to expand the size of the pool. For example if the device we
+ * just detached was smaller than the others, it may be possible to
+ * add metaslabs (i.e. grow the pool). We need to reopen the vdev
+ * first so that we can obtain the updated sizes of the leaf vdevs.
+ */
+ if (spa->spa_autoexpand) {
+ vdev_reopen(tvd);
+ vdev_expand(tvd, txg);
+ }
+
+ vdev_config_dirty(tvd);
+
+ /*
+ * Mark vd's DTL as dirty in this txg. vdev_dtl_sync() will see that
+ * vd->vdev_detached is set and free vd's DTL object in syncing context.
+ * But first make sure we're not on any *other* txg's DTL list, to
+ * prevent vd from being accessed after it's freed.
+ */
+ vdpath = spa_strdup(vd->vdev_path);
+ for (int t = 0; t < TXG_SIZE; t++)
+ (void) txg_list_remove_this(&tvd->vdev_dtl_list, vd, t);
+ vd->vdev_detached = B_TRUE;
+ vdev_dirty(tvd, VDD_DTL, vd, txg);
+
+ spa_event_notify(spa, vd, NULL, ESC_ZFS_VDEV_REMOVE);
+
+ /* hang on to the spa before we release the lock */
+ spa_open_ref(spa, FTAG);
+
+ error = spa_vdev_exit(spa, vd, txg, 0);
+
+ spa_history_log_internal(spa, "detach", NULL,
+ "vdev=%s", vdpath);
+ spa_strfree(vdpath);
+
+ /*
+ * If this was the removal of the original device in a hot spare vdev,
+ * then we want to go through and remove the device from the hot spare
+ * list of every other pool.
+ */
+ if (unspare) {
+ spa_t *altspa = NULL;
+
+ mutex_enter(&spa_namespace_lock);
+ while ((altspa = spa_next(altspa)) != NULL) {
+ if (altspa->spa_state != POOL_STATE_ACTIVE ||
+ altspa == spa)
+ continue;
+
+ spa_open_ref(altspa, FTAG);
+ mutex_exit(&spa_namespace_lock);
+ (void) spa_vdev_remove(altspa, unspare_guid, B_TRUE);
+ mutex_enter(&spa_namespace_lock);
+ spa_close(altspa, FTAG);
+ }
+ mutex_exit(&spa_namespace_lock);
+
+ /* search the rest of the vdevs for spares to remove */
+ spa_vdev_resilver_done(spa);
+ }
+
+ /* all done with the spa; OK to release */
+ mutex_enter(&spa_namespace_lock);
+ spa_close(spa, FTAG);
+ mutex_exit(&spa_namespace_lock);
+
+ return (error);
+}
+
+int
+spa_vdev_initialize(spa_t *spa, uint64_t guid, uint64_t cmd_type)
+{
+ /*
+ * We hold the namespace lock through the whole function
+ * to prevent any changes to the pool while we're starting or
+ * stopping initialization. The config and state locks are held so that
+ * we can properly assess the vdev state before we commit to
+ * the initializing operation.
+ */
+ mutex_enter(&spa_namespace_lock);
+ spa_config_enter(spa, SCL_CONFIG | SCL_STATE, FTAG, RW_READER);
+
+ /* Look up vdev and ensure it's a leaf. */
+ vdev_t *vd = spa_lookup_by_guid(spa, guid, B_FALSE);
+ if (vd == NULL || vd->vdev_detached) {
+ spa_config_exit(spa, SCL_CONFIG | SCL_STATE, FTAG);
+ mutex_exit(&spa_namespace_lock);
+ return (SET_ERROR(ENODEV));
+ } else if (!vd->vdev_ops->vdev_op_leaf || !vdev_is_concrete(vd)) {
+ spa_config_exit(spa, SCL_CONFIG | SCL_STATE, FTAG);
+ mutex_exit(&spa_namespace_lock);
+ return (SET_ERROR(EINVAL));
+ } else if (!vdev_writeable(vd)) {
+ spa_config_exit(spa, SCL_CONFIG | SCL_STATE, FTAG);
+ mutex_exit(&spa_namespace_lock);
+ return (SET_ERROR(EROFS));
+ }
+ mutex_enter(&vd->vdev_initialize_lock);
+ spa_config_exit(spa, SCL_CONFIG | SCL_STATE, FTAG);
+
+ /*
+ * When we activate an initialize action we check to see
+ * if the vdev_initialize_thread is NULL. We do this instead
+ * of using the vdev_initialize_state since there might be
+ * a previous initialization process which has completed but
+ * whose thread has not yet exited.
+ */
+ if (cmd_type == POOL_INITIALIZE_DO &&
+ (vd->vdev_initialize_thread != NULL ||
+ vd->vdev_top->vdev_removing)) {
+ mutex_exit(&vd->vdev_initialize_lock);
+ mutex_exit(&spa_namespace_lock);
+ return (SET_ERROR(EBUSY));
+ } else if (cmd_type == POOL_INITIALIZE_CANCEL &&
+ (vd->vdev_initialize_state != VDEV_INITIALIZE_ACTIVE &&
+ vd->vdev_initialize_state != VDEV_INITIALIZE_SUSPENDED)) {
+ mutex_exit(&vd->vdev_initialize_lock);
+ mutex_exit(&spa_namespace_lock);
+ return (SET_ERROR(ESRCH));
+ } else if (cmd_type == POOL_INITIALIZE_SUSPEND &&
+ vd->vdev_initialize_state != VDEV_INITIALIZE_ACTIVE) {
+ mutex_exit(&vd->vdev_initialize_lock);
+ mutex_exit(&spa_namespace_lock);
+ return (SET_ERROR(ESRCH));
+ }
+
+ switch (cmd_type) {
+ case POOL_INITIALIZE_DO:
+ vdev_initialize(vd);
+ break;
+ case POOL_INITIALIZE_CANCEL:
+ vdev_initialize_stop(vd, VDEV_INITIALIZE_CANCELED);
+ break;
+ case POOL_INITIALIZE_SUSPEND:
+ vdev_initialize_stop(vd, VDEV_INITIALIZE_SUSPENDED);
+ break;
+ default:
+ panic("invalid cmd_type %llu", (unsigned long long)cmd_type);
+ }
+ mutex_exit(&vd->vdev_initialize_lock);
+
+ /* Sync out the initializing state */
+ txg_wait_synced(spa->spa_dsl_pool, 0);
+ mutex_exit(&spa_namespace_lock);
+
+ return (0);
+}
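+
+/*
+ * The cmd_type to state transitions handled above (sketch):
+ *
+ *	POOL_INITIALIZE_DO	-> vdev_initialize()  (starts the thread)
+ *	POOL_INITIALIZE_SUSPEND	-> VDEV_INITIALIZE_SUSPENDED
+ *	POOL_INITIALIZE_CANCEL	-> VDEV_INITIALIZE_CANCELED
+ *
+ * The trailing txg_wait_synced() ensures the new state is persisted
+ * before the caller sees the return value.
+ */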
+
+/*
+ * Split a set of devices from their mirrors, and create a new pool from them.
+ */
+int
+spa_vdev_split_mirror(spa_t *spa, char *newname, nvlist_t *config,
+ nvlist_t *props, boolean_t exp)
+{
+ int error = 0;
+ uint64_t txg, *glist;
+ spa_t *newspa;
+ uint_t c, children, lastlog;
+ nvlist_t **child, *nvl, *tmp;
+ dmu_tx_t *tx;
+ char *altroot = NULL;
+ vdev_t *rvd, **vml = NULL; /* vdev modify list */
+ boolean_t activate_slog;
+
+ ASSERT(spa_writeable(spa));
+
+ txg = spa_vdev_enter(spa);
+
+ ASSERT(MUTEX_HELD(&spa_namespace_lock));
+ if (spa_feature_is_active(spa, SPA_FEATURE_POOL_CHECKPOINT)) {
+ error = (spa_has_checkpoint(spa)) ?
+ ZFS_ERR_CHECKPOINT_EXISTS : ZFS_ERR_DISCARDING_CHECKPOINT;
+ return (spa_vdev_exit(spa, NULL, txg, error));
+ }
+
+ /* clear the log and flush everything up to now */
+ activate_slog = spa_passivate_log(spa);
+ (void) spa_vdev_config_exit(spa, NULL, txg, 0, FTAG);
+ error = spa_reset_logs(spa);
+ txg = spa_vdev_config_enter(spa);
+
+ if (activate_slog)
+ spa_activate_log(spa);
+
+ if (error != 0)
+ return (spa_vdev_exit(spa, NULL, txg, error));
+
+ /* check new spa name before going any further */
+ if (spa_lookup(newname) != NULL)
+ return (spa_vdev_exit(spa, NULL, txg, EEXIST));
+
+ /*
+ * scan through all the children to ensure they're all mirrors
+ */
+ if (nvlist_lookup_nvlist(config, ZPOOL_CONFIG_VDEV_TREE, &nvl) != 0 ||
+ nvlist_lookup_nvlist_array(nvl, ZPOOL_CONFIG_CHILDREN, &child,
+ &children) != 0)
+ return (spa_vdev_exit(spa, NULL, txg, EINVAL));
+
+ /* first, check to ensure we've got the right child count */
+ rvd = spa->spa_root_vdev;
+ lastlog = 0;
+ for (c = 0; c < rvd->vdev_children; c++) {
+ vdev_t *vd = rvd->vdev_child[c];
+
+ /* don't count the holes & logs as children */
+ if (vd->vdev_islog || !vdev_is_concrete(vd)) {
+ if (lastlog == 0)
+ lastlog = c;
+ continue;
+ }
+
+ lastlog = 0;
+ }
+ if (children != (lastlog != 0 ? lastlog : rvd->vdev_children))
+ return (spa_vdev_exit(spa, NULL, txg, EINVAL));
+
+ /* next, ensure no spare or cache devices are part of the split */
+ if (nvlist_lookup_nvlist(nvl, ZPOOL_CONFIG_SPARES, &tmp) == 0 ||
+ nvlist_lookup_nvlist(nvl, ZPOOL_CONFIG_L2CACHE, &tmp) == 0)
+ return (spa_vdev_exit(spa, NULL, txg, EINVAL));
+
+ vml = kmem_zalloc(children * sizeof (vdev_t *), KM_SLEEP);
+ glist = kmem_zalloc(children * sizeof (uint64_t), KM_SLEEP);
+
+ /* then, loop over each vdev and validate it */
+ for (c = 0; c < children; c++) {
+ uint64_t is_hole = 0;
+
+ (void) nvlist_lookup_uint64(child[c], ZPOOL_CONFIG_IS_HOLE,
+ &is_hole);
+
+ if (is_hole != 0) {
+ if (spa->spa_root_vdev->vdev_child[c]->vdev_ishole ||
+ spa->spa_root_vdev->vdev_child[c]->vdev_islog) {
+ continue;
+ } else {
+ error = SET_ERROR(EINVAL);
+ break;
+ }
+ }
+
+ /* which disk is going to be split? */
+ if (nvlist_lookup_uint64(child[c], ZPOOL_CONFIG_GUID,
+ &glist[c]) != 0) {
+ error = SET_ERROR(EINVAL);
+ break;
+ }
+
+ /* look it up in the spa */
+ vml[c] = spa_lookup_by_guid(spa, glist[c], B_FALSE);
+ if (vml[c] == NULL) {
+ error = SET_ERROR(ENODEV);
+ break;
+ }
+
+ /* make sure there's nothing stopping the split */
+ if (vml[c]->vdev_parent->vdev_ops != &vdev_mirror_ops ||
+ vml[c]->vdev_islog ||
+ !vdev_is_concrete(vml[c]) ||
+ vml[c]->vdev_isspare ||
+ vml[c]->vdev_isl2cache ||
+ !vdev_writeable(vml[c]) ||
+ vml[c]->vdev_children != 0 ||
+ vml[c]->vdev_state != VDEV_STATE_HEALTHY ||
+ c != spa->spa_root_vdev->vdev_child[c]->vdev_id) {
+ error = SET_ERROR(EINVAL);
+ break;
+ }
+
+ if (vdev_dtl_required(vml[c])) {
+ error = SET_ERROR(EBUSY);
+ break;
+ }
+
+ /* we need certain info from the top level */
+ VERIFY(nvlist_add_uint64(child[c], ZPOOL_CONFIG_METASLAB_ARRAY,
+ vml[c]->vdev_top->vdev_ms_array) == 0);
+ VERIFY(nvlist_add_uint64(child[c], ZPOOL_CONFIG_METASLAB_SHIFT,
+ vml[c]->vdev_top->vdev_ms_shift) == 0);
+ VERIFY(nvlist_add_uint64(child[c], ZPOOL_CONFIG_ASIZE,
+ vml[c]->vdev_top->vdev_asize) == 0);
+ VERIFY(nvlist_add_uint64(child[c], ZPOOL_CONFIG_ASHIFT,
+ vml[c]->vdev_top->vdev_ashift) == 0);
+
+ /* transfer per-vdev ZAPs */
+ ASSERT3U(vml[c]->vdev_leaf_zap, !=, 0);
+ VERIFY0(nvlist_add_uint64(child[c],
+ ZPOOL_CONFIG_VDEV_LEAF_ZAP, vml[c]->vdev_leaf_zap));
+
+ ASSERT3U(vml[c]->vdev_top->vdev_top_zap, !=, 0);
+ VERIFY0(nvlist_add_uint64(child[c],
+ ZPOOL_CONFIG_VDEV_TOP_ZAP,
+ vml[c]->vdev_parent->vdev_top_zap));
+ }
+
+ if (error != 0) {
+ kmem_free(vml, children * sizeof (vdev_t *));
+ kmem_free(glist, children * sizeof (uint64_t));
+ return (spa_vdev_exit(spa, NULL, txg, error));
+ }
+
+ /* stop writers from using the disks */
+ for (c = 0; c < children; c++) {
+ if (vml[c] != NULL)
+ vml[c]->vdev_offline = B_TRUE;
+ }
+ vdev_reopen(spa->spa_root_vdev);
+
+ /*
+ * Temporarily record the splitting vdevs in the spa config. This
+ * will disappear once the config is regenerated.
+ */
+ VERIFY(nvlist_alloc(&nvl, NV_UNIQUE_NAME, KM_SLEEP) == 0);
+ VERIFY(nvlist_add_uint64_array(nvl, ZPOOL_CONFIG_SPLIT_LIST,
+ glist, children) == 0);
+ kmem_free(glist, children * sizeof (uint64_t));
+
+ mutex_enter(&spa->spa_props_lock);
+ VERIFY(nvlist_add_nvlist(spa->spa_config, ZPOOL_CONFIG_SPLIT,
+ nvl) == 0);
+ mutex_exit(&spa->spa_props_lock);
+ spa->spa_config_splitting = nvl;
+ vdev_config_dirty(spa->spa_root_vdev);
+
+ /* configure and create the new pool */
+ VERIFY(nvlist_add_string(config, ZPOOL_CONFIG_POOL_NAME, newname) == 0);
+ VERIFY(nvlist_add_uint64(config, ZPOOL_CONFIG_POOL_STATE,
+ exp ? POOL_STATE_EXPORTED : POOL_STATE_ACTIVE) == 0);
+ VERIFY(nvlist_add_uint64(config, ZPOOL_CONFIG_VERSION,
+ spa_version(spa)) == 0);
+ VERIFY(nvlist_add_uint64(config, ZPOOL_CONFIG_POOL_TXG,
+ spa->spa_config_txg) == 0);
+ VERIFY(nvlist_add_uint64(config, ZPOOL_CONFIG_POOL_GUID,
+ spa_generate_guid(NULL)) == 0);
+ VERIFY0(nvlist_add_boolean(config, ZPOOL_CONFIG_HAS_PER_VDEV_ZAPS));
+ (void) nvlist_lookup_string(props,
+ zpool_prop_to_name(ZPOOL_PROP_ALTROOT), &altroot);
+
+ /* add the new pool to the namespace */
+ newspa = spa_add(newname, config, altroot);
+ newspa->spa_avz_action = AVZ_ACTION_REBUILD;
+ newspa->spa_config_txg = spa->spa_config_txg;
+ spa_set_log_state(newspa, SPA_LOG_CLEAR);
+
+ /* release the spa config lock, retaining the namespace lock */
+ spa_vdev_config_exit(spa, NULL, txg, 0, FTAG);
+
+ if (zio_injection_enabled)
+ zio_handle_panic_injection(spa, FTAG, 1);
+
+ spa_activate(newspa, spa_mode_global);
+ spa_async_suspend(newspa);
+
+ for (c = 0; c < children; c++) {
+ if (vml[c] != NULL) {
+ /*
+ * Temporarily stop the initializing activity. We set
+ * the state to ACTIVE so that we know to resume
+ * the initializing once the split has completed.
+ */
+ mutex_enter(&vml[c]->vdev_initialize_lock);
+ vdev_initialize_stop(vml[c], VDEV_INITIALIZE_ACTIVE);
+ mutex_exit(&vml[c]->vdev_initialize_lock);
+ }
+ }
+
+#ifndef illumos
+ /* mark that we are creating a new spa by splitting */
+ newspa->spa_splitting_newspa = B_TRUE;
+#endif
+ newspa->spa_config_source = SPA_CONFIG_SRC_SPLIT;
+
+ /* create the new pool from the disks of the original pool */
+ error = spa_load(newspa, SPA_LOAD_IMPORT, SPA_IMPORT_ASSEMBLE);
+#ifndef illumos
+ newspa->spa_splitting_newspa = B_FALSE;
+#endif
+ if (error)
+ goto out;
+
+ /* if that worked, generate a real config for the new pool */
+ if (newspa->spa_root_vdev != NULL) {
+ VERIFY(nvlist_alloc(&newspa->spa_config_splitting,
+ NV_UNIQUE_NAME, KM_SLEEP) == 0);
+ VERIFY(nvlist_add_uint64(newspa->spa_config_splitting,
+ ZPOOL_CONFIG_SPLIT_GUID, spa_guid(spa)) == 0);
+ spa_config_set(newspa, spa_config_generate(newspa, NULL, -1ULL,
+ B_TRUE));
+ }
+
+ /* set the props */
+ if (props != NULL) {
+ spa_configfile_set(newspa, props, B_FALSE);
+ error = spa_prop_set(newspa, props);
+ if (error)
+ goto out;
+ }
+
+ /* flush everything */
+ txg = spa_vdev_config_enter(newspa);
+ vdev_config_dirty(newspa->spa_root_vdev);
+ (void) spa_vdev_config_exit(newspa, NULL, txg, 0, FTAG);
+
+ if (zio_injection_enabled)
+ zio_handle_panic_injection(spa, FTAG, 2);
+
+ spa_async_resume(newspa);
+
+ /* finally, update the original pool's config */
+ txg = spa_vdev_config_enter(spa);
+ tx = dmu_tx_create_dd(spa_get_dsl(spa)->dp_mos_dir);
+ error = dmu_tx_assign(tx, TXG_WAIT);
+ if (error != 0)
+ dmu_tx_abort(tx);
+ for (c = 0; c < children; c++) {
+ if (vml[c] != NULL) {
+ vdev_split(vml[c]);
+ if (error == 0)
+ spa_history_log_internal(spa, "detach", tx,
+ "vdev=%s", vml[c]->vdev_path);
+
+ vdev_free(vml[c]);
+ }
+ }
+ spa->spa_avz_action = AVZ_ACTION_REBUILD;
+ vdev_config_dirty(spa->spa_root_vdev);
+ spa->spa_config_splitting = NULL;
+ nvlist_free(nvl);
+ if (error == 0)
+ dmu_tx_commit(tx);
+ (void) spa_vdev_exit(spa, NULL, txg, 0);
+
+ if (zio_injection_enabled)
+ zio_handle_panic_injection(spa, FTAG, 3);
+
+ /* split is complete; log a history record */
+ spa_history_log_internal(newspa, "split", NULL,
+ "from pool %s", spa_name(spa));
+
+ kmem_free(vml, children * sizeof (vdev_t *));
+
+ /* if we're not going to mount the filesystems in userland, export */
+ if (exp)
+ error = spa_export_common(newname, POOL_STATE_EXPORTED, NULL,
+ B_FALSE, B_FALSE);
+
+ return (error);
+
+out:
+ spa_unload(newspa);
+ spa_deactivate(newspa);
+ spa_remove(newspa);
+
+ txg = spa_vdev_config_enter(spa);
+
+ /* re-online all offlined disks */
+ for (c = 0; c < children; c++) {
+ if (vml[c] != NULL)
+ vml[c]->vdev_offline = B_FALSE;
+ }
+
+ /* restart initializing disks as necessary */
+ spa_async_request(spa, SPA_ASYNC_INITIALIZE_RESTART);
+
+ vdev_reopen(spa->spa_root_vdev);
+
+ nvlist_free(spa->spa_config_splitting);
+ spa->spa_config_splitting = NULL;
+ (void) spa_vdev_exit(spa, NULL, txg, error);
+
+ kmem_free(vml, children * sizeof (vdev_t *));
+ return (error);
+}
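+
+/*
+ * End-to-end effect of a split (a sketch; pool and device names are
+ * illustrative):
+ *
+ *	before:  tank    = mirror(da0, da1)
+ *	after:   tank    = da0
+ *	         newtank = da1
+ *
+ * "zpool split tank newtank" drives this path, picking one child from
+ * each top-level mirror of the original pool.
+ */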
+
+/*
+ * Find any device that's done replacing, or a vdev marked 'unspare' that's
+ * currently spared, so we can detach it.
+ */
+static vdev_t *
+spa_vdev_resilver_done_hunt(vdev_t *vd)
+{
+ vdev_t *newvd, *oldvd;
+
+ for (int c = 0; c < vd->vdev_children; c++) {
+ oldvd = spa_vdev_resilver_done_hunt(vd->vdev_child[c]);
+ if (oldvd != NULL)
+ return (oldvd);
+ }
+
+ /*
+ * Check for a completed replacement. We always consider the first
+ * vdev in the list to be the oldest vdev, and the last one to be
+ * the newest (see spa_vdev_attach() for how that works). In
+ * the case where the newest vdev is faulted, we will not automatically
+ * remove it after a resilver completes. This is OK as it will require
+ * user intervention to determine which disk the admin wishes to keep.
+ */
+ if (vd->vdev_ops == &vdev_replacing_ops) {
+ ASSERT(vd->vdev_children > 1);
+
+ newvd = vd->vdev_child[vd->vdev_children - 1];
+ oldvd = vd->vdev_child[0];
+
+ if (vdev_dtl_empty(newvd, DTL_MISSING) &&
+ vdev_dtl_empty(newvd, DTL_OUTAGE) &&
+ !vdev_dtl_required(oldvd))
+ return (oldvd);
+ }
+
+ /*
+ * Check for a completed resilver with the 'unspare' flag set.
+ * Also potentially update faulted state.
+ */
+ if (vd->vdev_ops == &vdev_spare_ops) {
+ vdev_t *first = vd->vdev_child[0];
+ vdev_t *last = vd->vdev_child[vd->vdev_children - 1];
+
+ if (last->vdev_unspare) {
+ oldvd = first;
+ newvd = last;
+ } else if (first->vdev_unspare) {
+ oldvd = last;
+ newvd = first;
+ } else {
+ oldvd = NULL;
+ }
+
+ if (oldvd != NULL &&
+ vdev_dtl_empty(newvd, DTL_MISSING) &&
+ vdev_dtl_empty(newvd, DTL_OUTAGE) &&
+ !vdev_dtl_required(oldvd))
+ return (oldvd);
+
+ vdev_propagate_state(vd);
+
+ /*
+ * If there are more than two spares attached to a disk,
+ * and those spares are not required, then we want to
+ * attempt to free them up now so that they can be used
+ * by other pools. Once we're back down to a single
+ * disk+spare, we stop removing them.
+ */
+ if (vd->vdev_children > 2) {
+ newvd = vd->vdev_child[1];
+
+ if (newvd->vdev_isspare && last->vdev_isspare &&
+ vdev_dtl_empty(last, DTL_MISSING) &&
+ vdev_dtl_empty(last, DTL_OUTAGE) &&
+ !vdev_dtl_required(newvd))
+ return (newvd);
+ }
+ }
+
+ return (NULL);
+}
+
+static void
+spa_vdev_resilver_done(spa_t *spa)
+{
+ vdev_t *vd, *pvd, *ppvd;
+ uint64_t guid, sguid, pguid, ppguid;
+
+ spa_config_enter(spa, SCL_ALL, FTAG, RW_WRITER);
+
+ while ((vd = spa_vdev_resilver_done_hunt(spa->spa_root_vdev)) != NULL) {
+ pvd = vd->vdev_parent;
+ ppvd = pvd->vdev_parent;
+ guid = vd->vdev_guid;
+ pguid = pvd->vdev_guid;
+ ppguid = ppvd->vdev_guid;
+ sguid = 0;
+ /*
+ * If we have just finished replacing a hot spared device, then
+ * we need to detach the parent's first child (the original hot
+ * spare) as well.
+ */
+ if (ppvd->vdev_ops == &vdev_spare_ops && pvd->vdev_id == 0 &&
+ ppvd->vdev_children == 2) {
+ ASSERT(pvd->vdev_ops == &vdev_replacing_ops);
+ sguid = ppvd->vdev_child[1]->vdev_guid;
+ }
+ ASSERT(vd->vdev_resilver_txg == 0 || !vdev_dtl_required(vd));
+
+ spa_config_exit(spa, SCL_ALL, FTAG);
+ if (spa_vdev_detach(spa, guid, pguid, B_TRUE) != 0)
+ return;
+ if (sguid && spa_vdev_detach(spa, sguid, ppguid, B_TRUE) != 0)
+ return;
+ spa_config_enter(spa, SCL_ALL, FTAG, RW_WRITER);
+ }
+
+ spa_config_exit(spa, SCL_ALL, FTAG);
+}
+
+/*
+ * Update the stored path or FRU for this vdev.
+ */
+int
+spa_vdev_set_common(spa_t *spa, uint64_t guid, const char *value,
+ boolean_t ispath)
+{
+ vdev_t *vd;
+ boolean_t sync = B_FALSE;
+
+ ASSERT(spa_writeable(spa));
+
+ spa_vdev_state_enter(spa, SCL_ALL);
+
+ if ((vd = spa_lookup_by_guid(spa, guid, B_TRUE)) == NULL)
+ return (spa_vdev_state_exit(spa, NULL, ENOENT));
+
+ if (!vd->vdev_ops->vdev_op_leaf)
+ return (spa_vdev_state_exit(spa, NULL, ENOTSUP));
+
+ if (ispath) {
+ if (strcmp(value, vd->vdev_path) != 0) {
+ spa_strfree(vd->vdev_path);
+ vd->vdev_path = spa_strdup(value);
+ sync = B_TRUE;
+ }
+ } else {
+ if (vd->vdev_fru == NULL) {
+ vd->vdev_fru = spa_strdup(value);
+ sync = B_TRUE;
+ } else if (strcmp(value, vd->vdev_fru) != 0) {
+ spa_strfree(vd->vdev_fru);
+ vd->vdev_fru = spa_strdup(value);
+ sync = B_TRUE;
+ }
+ }
+
+ return (spa_vdev_state_exit(spa, sync ? vd : NULL, 0));
+}
+
+int
+spa_vdev_setpath(spa_t *spa, uint64_t guid, const char *newpath)
+{
+ return (spa_vdev_set_common(spa, guid, newpath, B_TRUE));
+}
+
+int
+spa_vdev_setfru(spa_t *spa, uint64_t guid, const char *newfru)
+{
+ return (spa_vdev_set_common(spa, guid, newfru, B_FALSE));
+}
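+
+/*
+ * Illustrative usage sketch (hypothetical helper and device path): a
+ * caller holding a reference on the spa can update a leaf vdev's stored
+ * path with a single call. spa_vdev_set_common() takes and drops the
+ * vdev state lock itself, and returns ENOENT for an unknown guid or
+ * ENOTSUP for a non-leaf vdev.
+ */
+static int
+example_update_vdev_path(spa_t *spa, uint64_t guid)
+{
+	return (spa_vdev_setpath(spa, guid, "/dev/dsk/c0t0d0s0"));
+}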
+
+/*
+ * ==========================================================================
+ * SPA Scanning
+ * ==========================================================================
+ */
+int
+spa_scrub_pause_resume(spa_t *spa, pool_scrub_cmd_t cmd)
+{
+ ASSERT(spa_config_held(spa, SCL_ALL, RW_WRITER) == 0);
+
+ if (dsl_scan_resilvering(spa->spa_dsl_pool))
+ return (SET_ERROR(EBUSY));
+
+ return (dsl_scrub_set_pause_resume(spa->spa_dsl_pool, cmd));
+}
+
+int
+spa_scan_stop(spa_t *spa)
+{
+ ASSERT(spa_config_held(spa, SCL_ALL, RW_WRITER) == 0);
+ if (dsl_scan_resilvering(spa->spa_dsl_pool))
+ return (SET_ERROR(EBUSY));
+ return (dsl_scan_cancel(spa->spa_dsl_pool));
+}
+
+int
+spa_scan(spa_t *spa, pool_scan_func_t func)
+{
+ ASSERT(spa_config_held(spa, SCL_ALL, RW_WRITER) == 0);
+
+ if (func >= POOL_SCAN_FUNCS || func == POOL_SCAN_NONE)
+ return (SET_ERROR(ENOTSUP));
+
+ /*
+ * If a resilver was requested, but there is no DTL on a
+ * writeable leaf device, we have nothing to do.
+ */
+ if (func == POOL_SCAN_RESILVER &&
+ !vdev_resilver_needed(spa->spa_root_vdev, NULL, NULL)) {
+ spa_async_request(spa, SPA_ASYNC_RESILVER_DONE);
+ return (0);
+ }
+
+ return (dsl_scan(spa->spa_dsl_pool, func));
+}
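+
+/*
+ * Illustrative usage sketch (hypothetical helper; assumes POOL_SCAN_SCRUB
+ * from pool_scan_func_t): starting a scrub is a plain spa_scan() call,
+ * while a resilver request with no DTLs on writeable leaves
+ * short-circuits through SPA_ASYNC_RESILVER_DONE as shown above.
+ */
+static int
+example_start_scrub(spa_t *spa)
+{
+	return (spa_scan(spa, POOL_SCAN_SCRUB));
+}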
+
+/*
+ * ==========================================================================
+ * SPA async task processing
+ * ==========================================================================
+ */
+
+static void
+spa_async_remove(spa_t *spa, vdev_t *vd)
+{
+ if (vd->vdev_remove_wanted) {
+ vd->vdev_remove_wanted = B_FALSE;
+ vd->vdev_delayed_close = B_FALSE;
+ vdev_set_state(vd, B_FALSE, VDEV_STATE_REMOVED, VDEV_AUX_NONE);
+
+ /*
+ * We want to clear the stats, but we don't want to do a full
+ * vdev_clear() as that will cause us to throw away
+ * degraded/faulted state as well as attempt to reopen the
+ * device, all of which is a waste.
+ */
+ vd->vdev_stat.vs_read_errors = 0;
+ vd->vdev_stat.vs_write_errors = 0;
+ vd->vdev_stat.vs_checksum_errors = 0;
+
+ vdev_state_dirty(vd->vdev_top);
+ /* Tell userspace that the vdev is gone. */
+ zfs_post_remove(spa, vd);
+ }
+
+ for (int c = 0; c < vd->vdev_children; c++)
+ spa_async_remove(spa, vd->vdev_child[c]);
+}
+
+static void
+spa_async_probe(spa_t *spa, vdev_t *vd)
+{
+ if (vd->vdev_probe_wanted) {
+ vd->vdev_probe_wanted = B_FALSE;
+ vdev_reopen(vd); /* vdev_open() does the actual probe */
+ }
+
+ for (int c = 0; c < vd->vdev_children; c++)
+ spa_async_probe(spa, vd->vdev_child[c]);
+}
+
+static void
+spa_async_autoexpand(spa_t *spa, vdev_t *vd)
+{
+ sysevent_id_t eid;
+ nvlist_t *attr;
+ char *physpath;
+
+ if (!spa->spa_autoexpand)
+ return;
+
+ for (int c = 0; c < vd->vdev_children; c++) {
+ vdev_t *cvd = vd->vdev_child[c];
+ spa_async_autoexpand(spa, cvd);
+ }
+
+ if (!vd->vdev_ops->vdev_op_leaf || vd->vdev_physpath == NULL)
+ return;
+
+ physpath = kmem_zalloc(MAXPATHLEN, KM_SLEEP);
+ (void) snprintf(physpath, MAXPATHLEN, "/devices%s", vd->vdev_physpath);
+
+ VERIFY(nvlist_alloc(&attr, NV_UNIQUE_NAME, KM_SLEEP) == 0);
+ VERIFY(nvlist_add_string(attr, DEV_PHYS_PATH, physpath) == 0);
+
+ (void) ddi_log_sysevent(zfs_dip, SUNW_VENDOR, EC_DEV_STATUS,
+ ESC_ZFS_VDEV_AUTOEXPAND, attr, &eid, DDI_SLEEP);
+
+ nvlist_free(attr);
+ kmem_free(physpath, MAXPATHLEN);
+}
+
+static void
+spa_async_thread(void *arg)
+{
+ spa_t *spa = (spa_t *)arg;
+ int tasks;
+
+ ASSERT(spa->spa_sync_on);
+
+ mutex_enter(&spa->spa_async_lock);
+ tasks = spa->spa_async_tasks;
+ spa->spa_async_tasks &= SPA_ASYNC_REMOVE;
+ mutex_exit(&spa->spa_async_lock);
+
+ /*
+ * See if the config needs to be updated.
+ */
+ if (tasks & SPA_ASYNC_CONFIG_UPDATE) {
+ uint64_t old_space, new_space;
+
+ mutex_enter(&spa_namespace_lock);
+ old_space = metaslab_class_get_space(spa_normal_class(spa));
+ old_space += metaslab_class_get_space(spa_special_class(spa));
+ old_space += metaslab_class_get_space(spa_dedup_class(spa));
+
+ spa_config_update(spa, SPA_CONFIG_UPDATE_POOL);
+
+ new_space = metaslab_class_get_space(spa_normal_class(spa));
+ new_space += metaslab_class_get_space(spa_special_class(spa));
+ new_space += metaslab_class_get_space(spa_dedup_class(spa));
+ mutex_exit(&spa_namespace_lock);
+
+ /*
+ * If the pool grew as a result of the config update,
+ * then log an internal history event.
+ */
+ if (new_space != old_space) {
+ spa_history_log_internal(spa, "vdev online", NULL,
+ "pool '%s' size: %llu(+%llu)",
+ spa_name(spa), new_space, new_space - old_space);
+ }
+ }
+
+ if ((tasks & SPA_ASYNC_AUTOEXPAND) && !spa_suspended(spa)) {
+ spa_config_enter(spa, SCL_CONFIG, FTAG, RW_READER);
+ spa_async_autoexpand(spa, spa->spa_root_vdev);
+ spa_config_exit(spa, SCL_CONFIG, FTAG);
+ }
+
+ /*
+ * See if any devices need to be probed.
+ */
+ if (tasks & SPA_ASYNC_PROBE) {
+ spa_vdev_state_enter(spa, SCL_NONE);
+ spa_async_probe(spa, spa->spa_root_vdev);
+ (void) spa_vdev_state_exit(spa, NULL, 0);
+ }
+
+ /*
+ * If any devices are done replacing, detach them.
+ */
+ if (tasks & SPA_ASYNC_RESILVER_DONE)
+ spa_vdev_resilver_done(spa);
+
+ /*
+ * Kick off a resilver.
+ */
+ if (tasks & SPA_ASYNC_RESILVER)
+ dsl_resilver_restart(spa->spa_dsl_pool, 0);
+
+ if (tasks & SPA_ASYNC_INITIALIZE_RESTART) {
+ mutex_enter(&spa_namespace_lock);
+ spa_config_enter(spa, SCL_CONFIG, FTAG, RW_READER);
+ vdev_initialize_restart(spa->spa_root_vdev);
+ spa_config_exit(spa, SCL_CONFIG, FTAG);
+ mutex_exit(&spa_namespace_lock);
+ }
+
+ /*
+ * Let the world know that we're done.
+ */
+ mutex_enter(&spa->spa_async_lock);
+ spa->spa_async_thread = NULL;
+ cv_broadcast(&spa->spa_async_cv);
+ mutex_exit(&spa->spa_async_lock);
+ thread_exit();
+}
+
+static void
+spa_async_thread_vd(void *arg)
+{
+ spa_t *spa = arg;
+ int tasks;
+
+ mutex_enter(&spa->spa_async_lock);
+ tasks = spa->spa_async_tasks;
+retry:
+ spa->spa_async_tasks &= ~SPA_ASYNC_REMOVE;
+ mutex_exit(&spa->spa_async_lock);
+
+ /*
+ * See if any devices need to be marked REMOVED.
+ */
+ if (tasks & SPA_ASYNC_REMOVE) {
+ spa_vdev_state_enter(spa, SCL_NONE);
+ spa_async_remove(spa, spa->spa_root_vdev);
+ for (int i = 0; i < spa->spa_l2cache.sav_count; i++)
+ spa_async_remove(spa, spa->spa_l2cache.sav_vdevs[i]);
+ for (int i = 0; i < spa->spa_spares.sav_count; i++)
+ spa_async_remove(spa, spa->spa_spares.sav_vdevs[i]);
+ (void) spa_vdev_state_exit(spa, NULL, 0);
+ }
+
+ /*
+ * Let the world know that we're done.
+ */
+ mutex_enter(&spa->spa_async_lock);
+ tasks = spa->spa_async_tasks;
+ if ((tasks & SPA_ASYNC_REMOVE) != 0)
+ goto retry;
+ spa->spa_async_thread_vd = NULL;
+ cv_broadcast(&spa->spa_async_cv);
+ mutex_exit(&spa->spa_async_lock);
+ thread_exit();
+}
+
+void
+spa_async_suspend(spa_t *spa)
+{
+ mutex_enter(&spa->spa_async_lock);
+ spa->spa_async_suspended++;
+ while (spa->spa_async_thread != NULL ||
+ spa->spa_async_thread_vd != NULL)
+ cv_wait(&spa->spa_async_cv, &spa->spa_async_lock);
+ mutex_exit(&spa->spa_async_lock);
+
+ spa_vdev_remove_suspend(spa);
+
+ zthr_t *condense_thread = spa->spa_condense_zthr;
+ if (condense_thread != NULL)
+ zthr_cancel(condense_thread);
+
+ zthr_t *discard_thread = spa->spa_checkpoint_discard_zthr;
+ if (discard_thread != NULL)
+ zthr_cancel(discard_thread);
+}
+
+void
+spa_async_resume(spa_t *spa)
+{
+ mutex_enter(&spa->spa_async_lock);
+ ASSERT(spa->spa_async_suspended != 0);
+ spa->spa_async_suspended--;
+ mutex_exit(&spa->spa_async_lock);
+ spa_restart_removal(spa);
+
+ zthr_t *condense_thread = spa->spa_condense_zthr;
+ if (condense_thread != NULL)
+ zthr_resume(condense_thread);
+
+ zthr_t *discard_thread = spa->spa_checkpoint_discard_zthr;
+ if (discard_thread != NULL)
+ zthr_resume(discard_thread);
+}
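+
+/*
+ * Illustrative pairing sketch (hypothetical caller): spa_async_suspend()
+ * and spa_async_resume() nest via the spa_async_suspended counter, so
+ * every suspend must be balanced by exactly one resume.
+ */
+static void
+example_without_async_races(spa_t *spa)
+{
+	spa_async_suspend(spa);
+	/* ... work that must not race with async tasks ... */
+	spa_async_resume(spa);
+}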
+
+static boolean_t
+spa_async_tasks_pending(spa_t *spa)
+{
+ uint_t non_config_tasks;
+ uint_t config_task;
+ boolean_t config_task_suspended;
+
+ non_config_tasks = spa->spa_async_tasks & ~(SPA_ASYNC_CONFIG_UPDATE |
+ SPA_ASYNC_REMOVE);
+ config_task = spa->spa_async_tasks & SPA_ASYNC_CONFIG_UPDATE;
+ if (spa->spa_ccw_fail_time == 0) {
+ config_task_suspended = B_FALSE;
+ } else {
+ config_task_suspended =
+ (gethrtime() - spa->spa_ccw_fail_time) <
+ (zfs_ccw_retry_interval * NANOSEC);
+ }
+
+ return (non_config_tasks || (config_task && !config_task_suspended));
+}
+
+static void
+spa_async_dispatch(spa_t *spa)
+{
+ mutex_enter(&spa->spa_async_lock);
+ if (spa_async_tasks_pending(spa) &&
+ !spa->spa_async_suspended &&
+ spa->spa_async_thread == NULL &&
+ rootdir != NULL)
+ spa->spa_async_thread = thread_create(NULL, 0,
+ spa_async_thread, spa, 0, &p0, TS_RUN, maxclsyspri);
+ mutex_exit(&spa->spa_async_lock);
+}
+
+static void
+spa_async_dispatch_vd(spa_t *spa)
+{
+ mutex_enter(&spa->spa_async_lock);
+ if ((spa->spa_async_tasks & SPA_ASYNC_REMOVE) != 0 &&
+ !spa->spa_async_suspended &&
+ spa->spa_async_thread_vd == NULL &&
+ rootdir != NULL)
+ spa->spa_async_thread_vd = thread_create(NULL, 0,
+ spa_async_thread_vd, spa, 0, &p0, TS_RUN, maxclsyspri);
+ mutex_exit(&spa->spa_async_lock);
+}
+
+void
+spa_async_request(spa_t *spa, int task)
+{
+ zfs_dbgmsg("spa=%s async request task=%u", spa->spa_name, task);
+ mutex_enter(&spa->spa_async_lock);
+ spa->spa_async_tasks |= task;
+ mutex_exit(&spa->spa_async_lock);
+ spa_async_dispatch_vd(spa);
+}
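+
+/*
+ * Illustrative usage sketch (hypothetical helper): the task argument is
+ * a bitmask, so independent requests can be coalesced into one call and
+ * are then split between spa_async_thread() and spa_async_thread_vd().
+ */
+static void
+example_request_probe_and_autoexpand(spa_t *spa)
+{
+	spa_async_request(spa, SPA_ASYNC_PROBE | SPA_ASYNC_AUTOEXPAND);
+}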
+
+/*
+ * ==========================================================================
+ * SPA syncing routines
+ * ==========================================================================
+ */
+
+static int
+bpobj_enqueue_cb(void *arg, const blkptr_t *bp, dmu_tx_t *tx)
+{
+ bpobj_t *bpo = arg;
+ bpobj_enqueue(bpo, bp, tx);
+ return (0);
+}
+
+static int
+spa_free_sync_cb(void *arg, const blkptr_t *bp, dmu_tx_t *tx)
+{
+ zio_t *zio = arg;
+
+ zio_nowait(zio_free_sync(zio, zio->io_spa, dmu_tx_get_txg(tx), bp,
+ BP_GET_PSIZE(bp), zio->io_flags));
+ return (0);
+}
+
+/*
+ * Note: this simple function is not inlined to make it easier to dtrace the
+ * amount of time spent syncing frees.
+ */
+static void
+spa_sync_frees(spa_t *spa, bplist_t *bpl, dmu_tx_t *tx)
+{
+ zio_t *zio = zio_root(spa, NULL, NULL, 0);
+ bplist_iterate(bpl, spa_free_sync_cb, zio, tx);
+ VERIFY(zio_wait(zio) == 0);
+}
+
+/*
+ * Note: this simple function is not inlined to make it easier to dtrace the
+ * amount of time spent syncing deferred frees.
+ */
+static void
+spa_sync_deferred_frees(spa_t *spa, dmu_tx_t *tx)
+{
+ zio_t *zio = zio_root(spa, NULL, NULL, 0);
+ VERIFY3U(bpobj_iterate(&spa->spa_deferred_bpobj,
+ spa_free_sync_cb, zio, tx), ==, 0);
+ VERIFY0(zio_wait(zio));
+}
+
+
+static void
+spa_sync_nvlist(spa_t *spa, uint64_t obj, nvlist_t *nv, dmu_tx_t *tx)
+{
+ char *packed = NULL;
+ size_t bufsize;
+ size_t nvsize = 0;
+ dmu_buf_t *db;
+
+ VERIFY(nvlist_size(nv, &nvsize, NV_ENCODE_XDR) == 0);
+
+ /*
+ * Write full (SPA_CONFIG_BLOCKSIZE) blocks of configuration
+ * information. This avoids the dmu_buf_will_dirty() path and
+ * saves us a pre-read to get data we don't actually care about.
+ */
+ bufsize = P2ROUNDUP((uint64_t)nvsize, SPA_CONFIG_BLOCKSIZE);
+ packed = kmem_alloc(bufsize, KM_SLEEP);
+
+ VERIFY(nvlist_pack(nv, &packed, &nvsize, NV_ENCODE_XDR,
+ KM_SLEEP) == 0);
+ bzero(packed + nvsize, bufsize - nvsize);
+
+ dmu_write(spa->spa_meta_objset, obj, 0, bufsize, packed, tx);
+
+ kmem_free(packed, bufsize);
+
+ VERIFY(0 == dmu_bonus_hold(spa->spa_meta_objset, obj, FTAG, &db));
+ dmu_buf_will_dirty(db, tx);
+ *(uint64_t *)db->db_data = nvsize;
+ dmu_buf_rele(db, FTAG);
+}
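+
+/*
+ * Illustrative arithmetic sketch (hypothetical helper; assumes
+ * SPA_CONFIG_BLOCKSIZE is 1 << 14): the buffer sizing above rounds the
+ * packed nvlist up to whole blocks, e.g. a 5000-byte nvlist becomes a
+ * 16384-byte zero-padded write, so dmu_write() never has to
+ * read-modify-write a partial block.
+ */
+static size_t
+example_config_bufsize(size_t nvsize)
+{
+	return (P2ROUNDUP((uint64_t)nvsize, SPA_CONFIG_BLOCKSIZE));
+}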
+
+static void
+spa_sync_aux_dev(spa_t *spa, spa_aux_vdev_t *sav, dmu_tx_t *tx,
+ const char *config, const char *entry)
+{
+ nvlist_t *nvroot;
+ nvlist_t **list;
+ int i;
+
+ if (!sav->sav_sync)
+ return;
+
+ /*
+ * Update the MOS nvlist describing the list of available devices.
+ * spa_validate_aux() will have already made sure this nvlist is
+ * valid and the vdevs are labeled appropriately.
+ */
+ if (sav->sav_object == 0) {
+ sav->sav_object = dmu_object_alloc(spa->spa_meta_objset,
+ DMU_OT_PACKED_NVLIST, 1 << 14, DMU_OT_PACKED_NVLIST_SIZE,
+ sizeof (uint64_t), tx);
+ VERIFY(zap_update(spa->spa_meta_objset,
+ DMU_POOL_DIRECTORY_OBJECT, entry, sizeof (uint64_t), 1,
+ &sav->sav_object, tx) == 0);
+ }
+
+ VERIFY(nvlist_alloc(&nvroot, NV_UNIQUE_NAME, KM_SLEEP) == 0);
+ if (sav->sav_count == 0) {
+ VERIFY(nvlist_add_nvlist_array(nvroot, config, NULL, 0) == 0);
+ } else {
+ list = kmem_alloc(sav->sav_count * sizeof (void *), KM_SLEEP);
+ for (i = 0; i < sav->sav_count; i++)
+ list[i] = vdev_config_generate(spa, sav->sav_vdevs[i],
+ B_FALSE, VDEV_CONFIG_L2CACHE);
+ VERIFY(nvlist_add_nvlist_array(nvroot, config, list,
+ sav->sav_count) == 0);
+ for (i = 0; i < sav->sav_count; i++)
+ nvlist_free(list[i]);
+ kmem_free(list, sav->sav_count * sizeof (void *));
+ }
+
+ spa_sync_nvlist(spa, sav->sav_object, nvroot, tx);
+ nvlist_free(nvroot);
+
+ sav->sav_sync = B_FALSE;
+}
+
+/*
+ * Rebuild spa's all-vdev ZAP from the vdev ZAPs indicated in each vdev_t.
+ * The all-vdev ZAP must be empty.
+ */
+static void
+spa_avz_build(vdev_t *vd, uint64_t avz, dmu_tx_t *tx)
+{
+ spa_t *spa = vd->vdev_spa;
+ if (vd->vdev_top_zap != 0) {
+ VERIFY0(zap_add_int(spa->spa_meta_objset, avz,
+ vd->vdev_top_zap, tx));
+ }
+ if (vd->vdev_leaf_zap != 0) {
+ VERIFY0(zap_add_int(spa->spa_meta_objset, avz,
+ vd->vdev_leaf_zap, tx));
+ }
+ for (uint64_t i = 0; i < vd->vdev_children; i++) {
+ spa_avz_build(vd->vdev_child[i], avz, tx);
+ }
+}
+
+static void
+spa_sync_config_object(spa_t *spa, dmu_tx_t *tx)
+{
+ nvlist_t *config;
+
+ /*
+ * If the pool is being imported from a pre-per-vdev-ZAP version of ZFS,
+ * its config may not be dirty but we still need to build per-vdev ZAPs.
+ * Similarly, if the pool is being assembled (e.g. after a split), we
+ * need to rebuild the AVZ although the config may not be dirty.
+ */
+ if (list_is_empty(&spa->spa_config_dirty_list) &&
+ spa->spa_avz_action == AVZ_ACTION_NONE)
+ return;
+
+ spa_config_enter(spa, SCL_STATE, FTAG, RW_READER);
+
+ ASSERT(spa->spa_avz_action == AVZ_ACTION_NONE ||
+ spa->spa_avz_action == AVZ_ACTION_INITIALIZE ||
+ spa->spa_all_vdev_zaps != 0);
+
+ if (spa->spa_avz_action == AVZ_ACTION_REBUILD) {
+ /* Make and build the new AVZ */
+ uint64_t new_avz = zap_create(spa->spa_meta_objset,
+ DMU_OTN_ZAP_METADATA, DMU_OT_NONE, 0, tx);
+ spa_avz_build(spa->spa_root_vdev, new_avz, tx);
+
+ /* Diff old AVZ with new one */
+ zap_cursor_t zc;
+ zap_attribute_t za;
+
+ for (zap_cursor_init(&zc, spa->spa_meta_objset,
+ spa->spa_all_vdev_zaps);
+ zap_cursor_retrieve(&zc, &za) == 0;
+ zap_cursor_advance(&zc)) {
+ uint64_t vdzap = za.za_first_integer;
+ if (zap_lookup_int(spa->spa_meta_objset, new_avz,
+ vdzap) == ENOENT) {
+ /*
+ * ZAP is listed in old AVZ but not in new one;
+ * destroy it
+ */
+ VERIFY0(zap_destroy(spa->spa_meta_objset, vdzap,
+ tx));
+ }
+ }
+
+ zap_cursor_fini(&zc);
+
+ /* Destroy the old AVZ */
+ VERIFY0(zap_destroy(spa->spa_meta_objset,
+ spa->spa_all_vdev_zaps, tx));
+
+ /* Replace the old AVZ in the dir obj with the new one */
+ VERIFY0(zap_update(spa->spa_meta_objset,
+ DMU_POOL_DIRECTORY_OBJECT, DMU_POOL_VDEV_ZAP_MAP,
+ sizeof (new_avz), 1, &new_avz, tx));
+
+ spa->spa_all_vdev_zaps = new_avz;
+ } else if (spa->spa_avz_action == AVZ_ACTION_DESTROY) {
+ zap_cursor_t zc;
+ zap_attribute_t za;
+
+ /* Walk through the AVZ and destroy all listed ZAPs */
+ for (zap_cursor_init(&zc, spa->spa_meta_objset,
+ spa->spa_all_vdev_zaps);
+ zap_cursor_retrieve(&zc, &za) == 0;
+ zap_cursor_advance(&zc)) {
+ uint64_t zap = za.za_first_integer;
+ VERIFY0(zap_destroy(spa->spa_meta_objset, zap, tx));
+ }
+
+ zap_cursor_fini(&zc);
+
+ /* Destroy and unlink the AVZ itself */
+ VERIFY0(zap_destroy(spa->spa_meta_objset,
+ spa->spa_all_vdev_zaps, tx));
+ VERIFY0(zap_remove(spa->spa_meta_objset,
+ DMU_POOL_DIRECTORY_OBJECT, DMU_POOL_VDEV_ZAP_MAP, tx));
+ spa->spa_all_vdev_zaps = 0;
+ }
+
+ if (spa->spa_all_vdev_zaps == 0) {
+ spa->spa_all_vdev_zaps = zap_create_link(spa->spa_meta_objset,
+ DMU_OTN_ZAP_METADATA, DMU_POOL_DIRECTORY_OBJECT,
+ DMU_POOL_VDEV_ZAP_MAP, tx);
+ }
+ spa->spa_avz_action = AVZ_ACTION_NONE;
+
+ /* Create ZAPs for vdevs that don't have them. */
+ vdev_construct_zaps(spa->spa_root_vdev, tx);
+
+ config = spa_config_generate(spa, spa->spa_root_vdev,
+ dmu_tx_get_txg(tx), B_FALSE);
+
+ /*
+ * If we're upgrading the spa version then make sure that
+ * the config object gets updated with the correct version.
+ */
+ if (spa->spa_ubsync.ub_version < spa->spa_uberblock.ub_version)
+ fnvlist_add_uint64(config, ZPOOL_CONFIG_VERSION,
+ spa->spa_uberblock.ub_version);
+
+ spa_config_exit(spa, SCL_STATE, FTAG);
+
+ nvlist_free(spa->spa_config_syncing);
+ spa->spa_config_syncing = config;
+
+ spa_sync_nvlist(spa, spa->spa_config_object, config, tx);
+}
+
+static void
+spa_sync_version(void *arg, dmu_tx_t *tx)
+{
+ uint64_t *versionp = arg;
+ uint64_t version = *versionp;
+ spa_t *spa = dmu_tx_pool(tx)->dp_spa;
+
+ /*
+ * Setting the version is special cased when first creating the pool.
+ */
+ ASSERT(tx->tx_txg != TXG_INITIAL);
+
+ ASSERT(SPA_VERSION_IS_SUPPORTED(version));
+ ASSERT(version >= spa_version(spa));
+
+ spa->spa_uberblock.ub_version = version;
+ vdev_config_dirty(spa->spa_root_vdev);
+ spa_history_log_internal(spa, "set", tx, "version=%lld", version);
+}
+
+/*
+ * Set zpool properties.
+ */
+static void
+spa_sync_props(void *arg, dmu_tx_t *tx)
+{
+ nvlist_t *nvp = arg;
+ spa_t *spa = dmu_tx_pool(tx)->dp_spa;
+ objset_t *mos = spa->spa_meta_objset;
+ nvpair_t *elem = NULL;
+
+ mutex_enter(&spa->spa_props_lock);
+
+ while ((elem = nvlist_next_nvpair(nvp, elem))) {
+ uint64_t intval;
+ char *strval, *fname;
+ zpool_prop_t prop;
+ const char *propname;
+ zprop_type_t proptype;
+ spa_feature_t fid;
+
+ switch (prop = zpool_name_to_prop(nvpair_name(elem))) {
+ case ZPOOL_PROP_INVAL:
+ /*
+ * We checked this earlier in spa_prop_validate().
+ */
+ ASSERT(zpool_prop_feature(nvpair_name(elem)));
+
+ fname = strchr(nvpair_name(elem), '@') + 1;
+ VERIFY0(zfeature_lookup_name(fname, &fid));
+
+ spa_feature_enable(spa, fid, tx);
+ spa_history_log_internal(spa, "set", tx,
+ "%s=enabled", nvpair_name(elem));
+ break;
+
+ case ZPOOL_PROP_VERSION:
+ intval = fnvpair_value_uint64(elem);
+ /*
+			 * The version is synced separately before other
+ * properties and should be correct by now.
+ */
+ ASSERT3U(spa_version(spa), >=, intval);
+ break;
+
+ case ZPOOL_PROP_ALTROOT:
+ /*
+ * 'altroot' is a non-persistent property. It should
+ * have been set temporarily at creation or import time.
+ */
+ ASSERT(spa->spa_root != NULL);
+ break;
+
+ case ZPOOL_PROP_READONLY:
+ case ZPOOL_PROP_CACHEFILE:
+ /*
+			 * 'readonly' and 'cachefile' are also non-persistent
+ * properties.
+ */
+ break;
+ case ZPOOL_PROP_COMMENT:
+ strval = fnvpair_value_string(elem);
+ if (spa->spa_comment != NULL)
+ spa_strfree(spa->spa_comment);
+ spa->spa_comment = spa_strdup(strval);
+ /*
+ * We need to dirty the configuration on all the vdevs
+ * so that their labels get updated. It's unnecessary
+ * to do this for pool creation since the vdev's
+			 * configuration has already been dirtied.
+ */
+ if (tx->tx_txg != TXG_INITIAL)
+ vdev_config_dirty(spa->spa_root_vdev);
+ spa_history_log_internal(spa, "set", tx,
+ "%s=%s", nvpair_name(elem), strval);
+ break;
+ default:
+ /*
+ * Set pool property values in the poolprops mos object.
+ */
+ if (spa->spa_pool_props_object == 0) {
+ spa->spa_pool_props_object =
+ zap_create_link(mos, DMU_OT_POOL_PROPS,
+ DMU_POOL_DIRECTORY_OBJECT, DMU_POOL_PROPS,
+ tx);
+ }
+
+ /* normalize the property name */
+ propname = zpool_prop_to_name(prop);
+ proptype = zpool_prop_get_type(prop);
+
+ if (nvpair_type(elem) == DATA_TYPE_STRING) {
+ ASSERT(proptype == PROP_TYPE_STRING);
+ strval = fnvpair_value_string(elem);
+ VERIFY0(zap_update(mos,
+ spa->spa_pool_props_object, propname,
+ 1, strlen(strval) + 1, strval, tx));
+ spa_history_log_internal(spa, "set", tx,
+ "%s=%s", nvpair_name(elem), strval);
+ } else if (nvpair_type(elem) == DATA_TYPE_UINT64) {
+ intval = fnvpair_value_uint64(elem);
+
+ if (proptype == PROP_TYPE_INDEX) {
+ const char *unused;
+ VERIFY0(zpool_prop_index_to_string(
+ prop, intval, &unused));
+ }
+ VERIFY0(zap_update(mos,
+ spa->spa_pool_props_object, propname,
+ 8, 1, &intval, tx));
+ spa_history_log_internal(spa, "set", tx,
+ "%s=%lld", nvpair_name(elem), intval);
+ } else {
+ ASSERT(0); /* not allowed */
+ }
+
+ switch (prop) {
+ case ZPOOL_PROP_DELEGATION:
+ spa->spa_delegation = intval;
+ break;
+ case ZPOOL_PROP_BOOTFS:
+ spa->spa_bootfs = intval;
+ break;
+ case ZPOOL_PROP_FAILUREMODE:
+ spa->spa_failmode = intval;
+ break;
+ case ZPOOL_PROP_AUTOEXPAND:
+ spa->spa_autoexpand = intval;
+ if (tx->tx_txg != TXG_INITIAL)
+ spa_async_request(spa,
+ SPA_ASYNC_AUTOEXPAND);
+ break;
+ case ZPOOL_PROP_MULTIHOST:
+ spa->spa_multihost = intval;
+ break;
+ case ZPOOL_PROP_DEDUPDITTO:
+ spa->spa_dedup_ditto = intval;
+ break;
+ default:
+ break;
+ }
+ }
+
+ }
+
+ mutex_exit(&spa->spa_props_lock);
+}
+
+/*
+ * Perform one-time upgrade on-disk changes. spa_version() does not
+ * reflect the new version this txg, so there must be no changes this
+ * txg to anything that the upgrade code depends on after it executes.
+ * Therefore this must be called after dsl_pool_sync() does the sync
+ * tasks.
+ */
+static void
+spa_sync_upgrades(spa_t *spa, dmu_tx_t *tx)
+{
+ dsl_pool_t *dp = spa->spa_dsl_pool;
+
+ ASSERT(spa->spa_sync_pass == 1);
+
+ rrw_enter(&dp->dp_config_rwlock, RW_WRITER, FTAG);
+
+ if (spa->spa_ubsync.ub_version < SPA_VERSION_ORIGIN &&
+ spa->spa_uberblock.ub_version >= SPA_VERSION_ORIGIN) {
+ dsl_pool_create_origin(dp, tx);
+
+ /* Keeping the origin open increases spa_minref */
+ spa->spa_minref += 3;
+ }
+
+ if (spa->spa_ubsync.ub_version < SPA_VERSION_NEXT_CLONES &&
+ spa->spa_uberblock.ub_version >= SPA_VERSION_NEXT_CLONES) {
+ dsl_pool_upgrade_clones(dp, tx);
+ }
+
+ if (spa->spa_ubsync.ub_version < SPA_VERSION_DIR_CLONES &&
+ spa->spa_uberblock.ub_version >= SPA_VERSION_DIR_CLONES) {
+ dsl_pool_upgrade_dir_clones(dp, tx);
+
+ /* Keeping the freedir open increases spa_minref */
+ spa->spa_minref += 3;
+ }
+
+ if (spa->spa_ubsync.ub_version < SPA_VERSION_FEATURES &&
+ spa->spa_uberblock.ub_version >= SPA_VERSION_FEATURES) {
+ spa_feature_create_zap_objects(spa, tx);
+ }
+
+ /*
+	 * The LZ4_COMPRESS feature's behavior was changed to
+	 * activate_on_enable when the ability to use lz4 compression for
+	 * metadata was added. Old pools that have this feature enabled
+	 * must be upgraded to have the feature active.
+ */
+ if (spa->spa_uberblock.ub_version >= SPA_VERSION_FEATURES) {
+ boolean_t lz4_en = spa_feature_is_enabled(spa,
+ SPA_FEATURE_LZ4_COMPRESS);
+ boolean_t lz4_ac = spa_feature_is_active(spa,
+ SPA_FEATURE_LZ4_COMPRESS);
+
+ if (lz4_en && !lz4_ac)
+ spa_feature_incr(spa, SPA_FEATURE_LZ4_COMPRESS, tx);
+ }
+
+ /*
+ * If we haven't written the salt, do so now. Note that the
+ * feature may not be activated yet, but that's fine since
+ * the presence of this ZAP entry is backwards compatible.
+ */
+ if (zap_contains(spa->spa_meta_objset, DMU_POOL_DIRECTORY_OBJECT,
+ DMU_POOL_CHECKSUM_SALT) == ENOENT) {
+ VERIFY0(zap_add(spa->spa_meta_objset,
+ DMU_POOL_DIRECTORY_OBJECT, DMU_POOL_CHECKSUM_SALT, 1,
+ sizeof (spa->spa_cksum_salt.zcs_bytes),
+ spa->spa_cksum_salt.zcs_bytes, tx));
+ }
+
+ rrw_exit(&dp->dp_config_rwlock, FTAG);
+}
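+
+/*
+ * Illustrative predicate sketch (hypothetical helper) for the
+ * version-gating pattern used throughout spa_sync_upgrades(): each
+ * upgrade step runs exactly once, in the txg where the synced on-disk
+ * version crosses the relevant threshold.
+ */
+static boolean_t
+example_crossing_version(spa_t *spa, uint64_t ver)
+{
+	return (spa->spa_ubsync.ub_version < ver &&
+	    spa->spa_uberblock.ub_version >= ver);
+}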
+
+static void
+vdev_indirect_state_sync_verify(vdev_t *vd)
+{
+ vdev_indirect_mapping_t *vim = vd->vdev_indirect_mapping;
+ vdev_indirect_births_t *vib = vd->vdev_indirect_births;
+
+ if (vd->vdev_ops == &vdev_indirect_ops) {
+ ASSERT(vim != NULL);
+ ASSERT(vib != NULL);
+ }
+
+ if (vdev_obsolete_sm_object(vd) != 0) {
+ ASSERT(vd->vdev_obsolete_sm != NULL);
+ ASSERT(vd->vdev_removing ||
+ vd->vdev_ops == &vdev_indirect_ops);
+ ASSERT(vdev_indirect_mapping_num_entries(vim) > 0);
+ ASSERT(vdev_indirect_mapping_bytes_mapped(vim) > 0);
+
+ ASSERT3U(vdev_obsolete_sm_object(vd), ==,
+ space_map_object(vd->vdev_obsolete_sm));
+ ASSERT3U(vdev_indirect_mapping_bytes_mapped(vim), >=,
+ space_map_allocated(vd->vdev_obsolete_sm));
+ }
+ ASSERT(vd->vdev_obsolete_segments != NULL);
+
+ /*
+ * Since frees / remaps to an indirect vdev can only
+ * happen in syncing context, the obsolete segments
+ * tree must be empty when we start syncing.
+ */
+ ASSERT0(range_tree_space(vd->vdev_obsolete_segments));
+}
+
+/*
+ * Sync the specified transaction group. New blocks may be dirtied as
+ * part of the process, so we iterate until it converges.
+ */
+void
+spa_sync(spa_t *spa, uint64_t txg)
+{
+ dsl_pool_t *dp = spa->spa_dsl_pool;
+ objset_t *mos = spa->spa_meta_objset;
+ bplist_t *free_bpl = &spa->spa_free_bplist[txg & TXG_MASK];
+ metaslab_class_t *normal = spa_normal_class(spa);
+ metaslab_class_t *special = spa_special_class(spa);
+ metaslab_class_t *dedup = spa_dedup_class(spa);
+ vdev_t *rvd = spa->spa_root_vdev;
+ vdev_t *vd;
+ dmu_tx_t *tx;
+ int error;
+ uint32_t max_queue_depth = zfs_vdev_async_write_max_active *
+ zfs_vdev_queue_depth_pct / 100;
+
+ VERIFY(spa_writeable(spa));
+
+ /*
+ * Wait for i/os issued in open context that need to complete
+ * before this txg syncs.
+ */
+ (void) zio_wait(spa->spa_txg_zio[txg & TXG_MASK]);
+ spa->spa_txg_zio[txg & TXG_MASK] = zio_root(spa, NULL, NULL,
+ ZIO_FLAG_CANFAIL);
+
+ /*
+ * Lock out configuration changes.
+ */
+ spa_config_enter(spa, SCL_CONFIG, FTAG, RW_READER);
+
+ spa->spa_syncing_txg = txg;
+ spa->spa_sync_pass = 0;
+
+ for (int i = 0; i < spa->spa_alloc_count; i++) {
+ mutex_enter(&spa->spa_alloc_locks[i]);
+ VERIFY0(avl_numnodes(&spa->spa_alloc_trees[i]));
+ mutex_exit(&spa->spa_alloc_locks[i]);
+ }
+
+ /*
+ * If there are any pending vdev state changes, convert them
+ * into config changes that go out with this transaction group.
+ */
+ spa_config_enter(spa, SCL_STATE, FTAG, RW_READER);
+ while (list_head(&spa->spa_state_dirty_list) != NULL) {
+ /*
+ * We need the write lock here because, for aux vdevs,
+ * calling vdev_config_dirty() modifies sav_config.
+ * This is ugly and will become unnecessary when we
+ * eliminate the aux vdev wart by integrating all vdevs
+ * into the root vdev tree.
+ */
+ spa_config_exit(spa, SCL_CONFIG | SCL_STATE, FTAG);
+ spa_config_enter(spa, SCL_CONFIG | SCL_STATE, FTAG, RW_WRITER);
+ while ((vd = list_head(&spa->spa_state_dirty_list)) != NULL) {
+ vdev_state_clean(vd);
+ vdev_config_dirty(vd);
+ }
+ spa_config_exit(spa, SCL_CONFIG | SCL_STATE, FTAG);
+ spa_config_enter(spa, SCL_CONFIG | SCL_STATE, FTAG, RW_READER);
+ }
+ spa_config_exit(spa, SCL_STATE, FTAG);
+
+ tx = dmu_tx_create_assigned(dp, txg);
+
+ spa->spa_sync_starttime = gethrtime();
+#ifdef illumos
+ VERIFY(cyclic_reprogram(spa->spa_deadman_cycid,
+ spa->spa_sync_starttime + spa->spa_deadman_synctime));
+#else /* !illumos */
+#ifdef _KERNEL
+ callout_schedule(&spa->spa_deadman_cycid,
+ hz * spa->spa_deadman_synctime / NANOSEC);
+#endif
+#endif /* illumos */
+
+ /*
+ * If we are upgrading to SPA_VERSION_RAIDZ_DEFLATE this txg,
+ * set spa_deflate if we have no raid-z vdevs.
+ */
+ if (spa->spa_ubsync.ub_version < SPA_VERSION_RAIDZ_DEFLATE &&
+ spa->spa_uberblock.ub_version >= SPA_VERSION_RAIDZ_DEFLATE) {
+ int i;
+
+ for (i = 0; i < rvd->vdev_children; i++) {
+ vd = rvd->vdev_child[i];
+ if (vd->vdev_deflate_ratio != SPA_MINBLOCKSIZE)
+ break;
+ }
+ if (i == rvd->vdev_children) {
+ spa->spa_deflate = TRUE;
+ VERIFY(0 == zap_add(spa->spa_meta_objset,
+ DMU_POOL_DIRECTORY_OBJECT, DMU_POOL_DEFLATE,
+ sizeof (uint64_t), 1, &spa->spa_deflate, tx));
+ }
+ }
+
+ /*
+ * Set the top-level vdev's max queue depth. Evaluate each
+ * top-level's async write queue depth in case it changed.
+ * The max queue depth will not change in the middle of syncing
+ * out this txg.
+ */
+ uint64_t slots_per_allocator = 0;
+ for (int c = 0; c < rvd->vdev_children; c++) {
+ vdev_t *tvd = rvd->vdev_child[c];
+ metaslab_group_t *mg = tvd->vdev_mg;
+ metaslab_class_t *mc;
+
+ if (mg == NULL || !metaslab_group_initialized(mg))
+ continue;
+
+ mc = mg->mg_class;
+ if (mc != normal && mc != special && mc != dedup)
+ continue;
+
+ /*
+ * It is safe to do a lock-free check here because only async
+ * allocations look at mg_max_alloc_queue_depth, and async
+ * allocations all happen from spa_sync().
+ */
+ for (int i = 0; i < spa->spa_alloc_count; i++)
+ ASSERT0(zfs_refcount_count(
+ &(mg->mg_alloc_queue_depth[i])));
+ mg->mg_max_alloc_queue_depth = max_queue_depth;
+
+ for (int i = 0; i < spa->spa_alloc_count; i++) {
+ mg->mg_cur_max_alloc_queue_depth[i] =
+ zfs_vdev_def_queue_depth;
+ }
+ slots_per_allocator += zfs_vdev_def_queue_depth;
+ }
+
+ for (int i = 0; i < spa->spa_alloc_count; i++) {
+ ASSERT0(zfs_refcount_count(&normal->mc_alloc_slots[i]));
+ ASSERT0(zfs_refcount_count(&special->mc_alloc_slots[i]));
+ ASSERT0(zfs_refcount_count(&dedup->mc_alloc_slots[i]));
+ normal->mc_alloc_max_slots[i] = slots_per_allocator;
+ special->mc_alloc_max_slots[i] = slots_per_allocator;
+ dedup->mc_alloc_max_slots[i] = slots_per_allocator;
+ }
+ normal->mc_alloc_throttle_enabled = zio_dva_throttle_enabled;
+ special->mc_alloc_throttle_enabled = zio_dva_throttle_enabled;
+ dedup->mc_alloc_throttle_enabled = zio_dva_throttle_enabled;
+
+ for (int c = 0; c < rvd->vdev_children; c++) {
+ vdev_t *vd = rvd->vdev_child[c];
+ vdev_indirect_state_sync_verify(vd);
+
+ if (vdev_indirect_should_condense(vd)) {
+ spa_condense_indirect_start_sync(vd, tx);
+ break;
+ }
+ }
+
+ /*
+ * Iterate to convergence.
+ */
+ do {
+ int pass = ++spa->spa_sync_pass;
+
+ spa_sync_config_object(spa, tx);
+ spa_sync_aux_dev(spa, &spa->spa_spares, tx,
+ ZPOOL_CONFIG_SPARES, DMU_POOL_SPARES);
+ spa_sync_aux_dev(spa, &spa->spa_l2cache, tx,
+ ZPOOL_CONFIG_L2CACHE, DMU_POOL_L2CACHE);
+ spa_errlog_sync(spa, txg);
+ dsl_pool_sync(dp, txg);
+
+ if (pass < zfs_sync_pass_deferred_free) {
+ spa_sync_frees(spa, free_bpl, tx);
+ } else {
+ /*
+			 * We cannot defer frees in pass 1, because
+			 * we sync the deferred frees later in pass 1.
+ */
+ ASSERT3U(pass, >, 1);
+ bplist_iterate(free_bpl, bpobj_enqueue_cb,
+ &spa->spa_deferred_bpobj, tx);
+ }
+
+ ddt_sync(spa, txg);
+ dsl_scan_sync(dp, tx);
+
+ if (spa->spa_vdev_removal != NULL)
+ svr_sync(spa, tx);
+
+ while ((vd = txg_list_remove(&spa->spa_vdev_txg_list, txg))
+ != NULL)
+ vdev_sync(vd, txg);
+
+ if (pass == 1) {
+ spa_sync_upgrades(spa, tx);
+ ASSERT3U(txg, >=,
+ spa->spa_uberblock.ub_rootbp.blk_birth);
+ /*
+ * Note: We need to check if the MOS is dirty
+ * because we could have marked the MOS dirty
+ * without updating the uberblock (e.g. if we
+ * have sync tasks but no dirty user data). We
+ * need to check the uberblock's rootbp because
+ * it is updated if we have synced out dirty
+ * data (though in this case the MOS will most
+ * likely also be dirty due to second order
+ * effects, we don't want to rely on that here).
+ */
+ if (spa->spa_uberblock.ub_rootbp.blk_birth < txg &&
+ !dmu_objset_is_dirty(mos, txg)) {
+ /*
+ * Nothing changed on the first pass,
+ * therefore this TXG is a no-op. Avoid
+ * syncing deferred frees, so that we
+ * can keep this TXG as a no-op.
+ */
+ ASSERT(txg_list_empty(&dp->dp_dirty_datasets,
+ txg));
+ ASSERT(txg_list_empty(&dp->dp_dirty_dirs, txg));
+ ASSERT(txg_list_empty(&dp->dp_sync_tasks, txg));
+ ASSERT(txg_list_empty(&dp->dp_early_sync_tasks,
+ txg));
+ break;
+ }
+ spa_sync_deferred_frees(spa, tx);
+ }
+
+ } while (dmu_objset_is_dirty(mos, txg));
+
+ if (!list_is_empty(&spa->spa_config_dirty_list)) {
+ /*
+ * Make sure that the number of ZAPs for all the vdevs matches
+ * the number of ZAPs in the per-vdev ZAP list. This only gets
+ * called if the config is dirty; otherwise there may be
+ * outstanding AVZ operations that weren't completed in
+ * spa_sync_config_object.
+ */
+ uint64_t all_vdev_zap_entry_count;
+ ASSERT0(zap_count(spa->spa_meta_objset,
+ spa->spa_all_vdev_zaps, &all_vdev_zap_entry_count));
+ ASSERT3U(vdev_count_verify_zaps(spa->spa_root_vdev), ==,
+ all_vdev_zap_entry_count);
+ }
+
+ if (spa->spa_vdev_removal != NULL) {
+ ASSERT0(spa->spa_vdev_removal->svr_bytes_done[txg & TXG_MASK]);
+ }
+
+ /*
+ * Rewrite the vdev configuration (which includes the uberblock)
+ * to commit the transaction group.
+ *
+ * If there are no dirty vdevs, we sync the uberblock to a few
+ * random top-level vdevs that are known to be visible in the
+ * config cache (see spa_vdev_add() for a complete description).
+ * If there *are* dirty vdevs, sync the uberblock to all vdevs.
+ */
+ for (;;) {
+ /*
+ * We hold SCL_STATE to prevent vdev open/close/etc.
+ * while we're attempting to write the vdev labels.
+ */
+ spa_config_enter(spa, SCL_STATE, FTAG, RW_READER);
+
+ if (list_is_empty(&spa->spa_config_dirty_list)) {
+ vdev_t *svd[SPA_SYNC_MIN_VDEVS] = { NULL };
+ int svdcount = 0;
+ int children = rvd->vdev_children;
+ int c0 = spa_get_random(children);
+
+ for (int c = 0; c < children; c++) {
+ vd = rvd->vdev_child[(c0 + c) % children];
+
+ /* Stop when revisiting the first vdev */
+ if (c > 0 && svd[0] == vd)
+ break;
+
+ if (vd->vdev_ms_array == 0 || vd->vdev_islog ||
+ !vdev_is_concrete(vd))
+ continue;
+
+ svd[svdcount++] = vd;
+ if (svdcount == SPA_SYNC_MIN_VDEVS)
+ break;
+ }
+ error = vdev_config_sync(svd, svdcount, txg);
+ } else {
+ error = vdev_config_sync(rvd->vdev_child,
+ rvd->vdev_children, txg);
+ }
+
+ if (error == 0)
+ spa->spa_last_synced_guid = rvd->vdev_guid;
+
+ spa_config_exit(spa, SCL_STATE, FTAG);
+
+ if (error == 0)
+ break;
+ zio_suspend(spa, NULL, ZIO_SUSPEND_IOERR);
+ zio_resume_wait(spa);
+ }
+ dmu_tx_commit(tx);
+
+#ifdef illumos
+ VERIFY(cyclic_reprogram(spa->spa_deadman_cycid, CY_INFINITY));
+#else /* !illumos */
+#ifdef _KERNEL
+ callout_drain(&spa->spa_deadman_cycid);
+#endif
+#endif /* illumos */
+
+ /*
+ * Clear the dirty config list.
+ */
+ while ((vd = list_head(&spa->spa_config_dirty_list)) != NULL)
+ vdev_config_clean(vd);
+
+ /*
+ * Now that the new config has synced transactionally,
+ * let it become visible to the config cache.
+ */
+ if (spa->spa_config_syncing != NULL) {
+ spa_config_set(spa, spa->spa_config_syncing);
+ spa->spa_config_txg = txg;
+ spa->spa_config_syncing = NULL;
+ }
+
+ dsl_pool_sync_done(dp, txg);
+
+ for (int i = 0; i < spa->spa_alloc_count; i++) {
+ mutex_enter(&spa->spa_alloc_locks[i]);
+ VERIFY0(avl_numnodes(&spa->spa_alloc_trees[i]));
+ mutex_exit(&spa->spa_alloc_locks[i]);
+ }
+
+ /*
+ * Update usable space statistics.
+ */
+ while ((vd = txg_list_remove(&spa->spa_vdev_txg_list, TXG_CLEAN(txg)))
+ != NULL)
+ vdev_sync_done(vd, txg);
+
+ spa_update_dspace(spa);
+
+ /*
+ * It had better be the case that we didn't dirty anything
+ * since vdev_config_sync().
+ */
+ ASSERT(txg_list_empty(&dp->dp_dirty_datasets, txg));
+ ASSERT(txg_list_empty(&dp->dp_dirty_dirs, txg));
+ ASSERT(txg_list_empty(&spa->spa_vdev_txg_list, txg));
+
+ while (zfs_pause_spa_sync)
+ delay(1);
+
+ spa->spa_sync_pass = 0;
+
+ /*
+ * Update the last synced uberblock here. We want to do this at
+ * the end of spa_sync() so that consumers of spa_last_synced_txg()
+ * will be guaranteed that all the processing associated with
+ * that txg has been completed.
+ */
+ spa->spa_ubsync = spa->spa_uberblock;
+ spa_config_exit(spa, SCL_CONFIG, FTAG);
+
+ spa_handle_ignored_writes(spa);
+
+ /*
+ * If any async tasks have been requested, kick them off.
+ */
+ spa_async_dispatch(spa);
+ spa_async_dispatch_vd(spa);
+}
+
+/*
+ * Sync all pools. We don't want to hold the namespace lock across these
+ * operations, so we take a reference on the spa_t and drop the lock during the
+ * sync.
+ */
+void
+spa_sync_allpools(void)
+{
+ spa_t *spa = NULL;
+ mutex_enter(&spa_namespace_lock);
+ while ((spa = spa_next(spa)) != NULL) {
+ if (spa_state(spa) != POOL_STATE_ACTIVE ||
+ !spa_writeable(spa) || spa_suspended(spa))
+ continue;
+ spa_open_ref(spa, FTAG);
+ mutex_exit(&spa_namespace_lock);
+ txg_wait_synced(spa_get_dsl(spa), 0);
+ mutex_enter(&spa_namespace_lock);
+ spa_close(spa, FTAG);
+ }
+ mutex_exit(&spa_namespace_lock);
+}
+
+/*
+ * ==========================================================================
+ * Miscellaneous routines
+ * ==========================================================================
+ */
+
+/*
+ * Remove all pools in the system.
+ */
+void
+spa_evict_all(void)
+{
+ spa_t *spa;
+
+ /*
+ * Remove all cached state. All pools should be closed now,
+ * so every spa in the AVL tree should be unreferenced.
+ */
+ mutex_enter(&spa_namespace_lock);
+ while ((spa = spa_next(NULL)) != NULL) {
+ /*
+ * Stop async tasks. The async thread may need to detach
+ * a device that's been replaced, which requires grabbing
+ * spa_namespace_lock, so we must drop it here.
+ */
+ spa_open_ref(spa, FTAG);
+ mutex_exit(&spa_namespace_lock);
+ spa_async_suspend(spa);
+ mutex_enter(&spa_namespace_lock);
+ spa_close(spa, FTAG);
+
+ if (spa->spa_state != POOL_STATE_UNINITIALIZED) {
+ spa_unload(spa);
+ spa_deactivate(spa);
+ }
+ spa_remove(spa);
+ }
+ mutex_exit(&spa_namespace_lock);
+}
+
+vdev_t *
+spa_lookup_by_guid(spa_t *spa, uint64_t guid, boolean_t aux)
+{
+ vdev_t *vd;
+ int i;
+
+ if ((vd = vdev_lookup_by_guid(spa->spa_root_vdev, guid)) != NULL)
+ return (vd);
+
+ if (aux) {
+ for (i = 0; i < spa->spa_l2cache.sav_count; i++) {
+ vd = spa->spa_l2cache.sav_vdevs[i];
+ if (vd->vdev_guid == guid)
+ return (vd);
+ }
+
+ for (i = 0; i < spa->spa_spares.sav_count; i++) {
+ vd = spa->spa_spares.sav_vdevs[i];
+ if (vd->vdev_guid == guid)
+ return (vd);
+ }
+ }
+
+ return (NULL);
+}
+
+void
+spa_upgrade(spa_t *spa, uint64_t version)
+{
+ ASSERT(spa_writeable(spa));
+
+ spa_config_enter(spa, SCL_ALL, FTAG, RW_WRITER);
+
+ /*
+ * This should only be called for a non-faulted pool, and since a
+ * future version would result in an unopenable pool, this shouldn't be
+ * possible.
+ */
+ ASSERT(SPA_VERSION_IS_SUPPORTED(spa->spa_uberblock.ub_version));
+ ASSERT3U(version, >=, spa->spa_uberblock.ub_version);
+
+ spa->spa_uberblock.ub_version = version;
+ vdev_config_dirty(spa->spa_root_vdev);
+
+ spa_config_exit(spa, SCL_ALL, FTAG);
+
+ txg_wait_synced(spa_get_dsl(spa), 0);
+}
+
+boolean_t
+spa_has_spare(spa_t *spa, uint64_t guid)
+{
+ int i;
+ uint64_t spareguid;
+ spa_aux_vdev_t *sav = &spa->spa_spares;
+
+ for (i = 0; i < sav->sav_count; i++)
+ if (sav->sav_vdevs[i]->vdev_guid == guid)
+ return (B_TRUE);
+
+ for (i = 0; i < sav->sav_npending; i++) {
+ if (nvlist_lookup_uint64(sav->sav_pending[i], ZPOOL_CONFIG_GUID,
+ &spareguid) == 0 && spareguid == guid)
+ return (B_TRUE);
+ }
+
+ return (B_FALSE);
+}
+
+/*
+ * Check if a pool has an active shared spare device.
+ * Note: the reference count of an active spare is 2: once as a spare and
+ * once as a replacement.
+ */
+static boolean_t
+spa_has_active_shared_spare(spa_t *spa)
+{
+ int i, refcnt;
+ uint64_t pool;
+ spa_aux_vdev_t *sav = &spa->spa_spares;
+
+ for (i = 0; i < sav->sav_count; i++) {
+ if (spa_spare_exists(sav->sav_vdevs[i]->vdev_guid, &pool,
+ &refcnt) && pool != 0ULL && pool == spa_guid(spa) &&
+ refcnt > 2)
+ return (B_TRUE);
+ }
+
+ return (B_FALSE);
+}
+
+sysevent_t *
+spa_event_create(spa_t *spa, vdev_t *vd, nvlist_t *hist_nvl, const char *name)
+{
+ sysevent_t *ev = NULL;
+#ifdef _KERNEL
+ sysevent_attr_list_t *attr = NULL;
+ sysevent_value_t value;
+
+ ev = sysevent_alloc(EC_ZFS, (char *)name, SUNW_KERN_PUB "zfs",
+ SE_SLEEP);
+ ASSERT(ev != NULL);
+
+ value.value_type = SE_DATA_TYPE_STRING;
+ value.value.sv_string = spa_name(spa);
+ if (sysevent_add_attr(&attr, ZFS_EV_POOL_NAME, &value, SE_SLEEP) != 0)
+ goto done;
+
+ value.value_type = SE_DATA_TYPE_UINT64;
+ value.value.sv_uint64 = spa_guid(spa);
+ if (sysevent_add_attr(&attr, ZFS_EV_POOL_GUID, &value, SE_SLEEP) != 0)
+ goto done;
+
+ if (vd) {
+ value.value_type = SE_DATA_TYPE_UINT64;
+ value.value.sv_uint64 = vd->vdev_guid;
+ if (sysevent_add_attr(&attr, ZFS_EV_VDEV_GUID, &value,
+ SE_SLEEP) != 0)
+ goto done;
+
+ if (vd->vdev_path) {
+ value.value_type = SE_DATA_TYPE_STRING;
+ value.value.sv_string = vd->vdev_path;
+ if (sysevent_add_attr(&attr, ZFS_EV_VDEV_PATH,
+ &value, SE_SLEEP) != 0)
+ goto done;
+ }
+ }
+
+ if (hist_nvl != NULL) {
+ fnvlist_merge((nvlist_t *)attr, hist_nvl);
+ }
+
+ if (sysevent_attach_attributes(ev, attr) != 0)
+ goto done;
+ attr = NULL;
+
+done:
+ if (attr)
+ sysevent_free_attr(attr);
+
+#endif
+ return (ev);
+}
+
+void
+spa_event_post(sysevent_t *ev)
+{
+#ifdef _KERNEL
+ sysevent_id_t eid;
+
+ (void) log_sysevent(ev, SE_SLEEP, &eid);
+ sysevent_free(ev);
+#endif
+}
+
+void
+spa_event_discard(sysevent_t *ev)
+{
+#ifdef _KERNEL
+ sysevent_free(ev);
+#endif
+}
+
+/*
+ * Post a sysevent corresponding to the given event. The 'name' must be one of
+ * the event definitions in sys/sysevent/eventdefs.h. The payload will be
+ * filled in from the spa and (optionally) the vdev and history nvl. This
+ * doesn't do anything in the userland libzpool, as we don't want consumers to
+ * misinterpret ztest or zdb as real changes.
+ */
+void
+spa_event_notify(spa_t *spa, vdev_t *vd, nvlist_t *hist_nvl, const char *name)
+{
+ spa_event_post(spa_event_create(spa, vd, hist_nvl, name));
+}
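+
+/*
+ * Illustrative usage sketch (hypothetical helper; assumes
+ * ESC_ZFS_VDEV_CHECK is among the definitions in
+ * sys/sysevent/eventdefs.h): posting a vdev-scoped event needs only the
+ * spa, the vdev, and the event name; the payload is assembled by
+ * spa_event_create().
+ */
+static void
+example_notify_vdev_check(spa_t *spa, vdev_t *vd)
+{
+	spa_event_notify(spa, vd, NULL, ESC_ZFS_VDEV_CHECK);
+}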
diff --git a/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/spa_checkpoint.c b/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/spa_checkpoint.c
new file mode 100644
index 000000000000..62c3137cd590
--- /dev/null
+++ b/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/spa_checkpoint.c
@@ -0,0 +1,623 @@
+/*
+ * CDDL HEADER START
+ *
+ * The contents of this file are subject to the terms of the
+ * Common Development and Distribution License (the "License").
+ * You may not use this file except in compliance with the License.
+ *
+ * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
+ * or http://www.opensolaris.org/os/licensing.
+ * See the License for the specific language governing permissions
+ * and limitations under the License.
+ *
+ * When distributing Covered Code, include this CDDL HEADER in each
+ * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
+ * If applicable, add the following below this CDDL HEADER, with the
+ * fields enclosed by brackets "[]" replaced with your own identifying
+ * information: Portions Copyright [yyyy] [name of copyright owner]
+ *
+ * CDDL HEADER END
+ */
+
+/*
+ * Copyright (c) 2017 by Delphix. All rights reserved.
+ */
+
+/*
+ * Storage Pool Checkpoint
+ *
+ * A storage pool checkpoint can be thought of as a pool-wide snapshot or
+ * a stable version of extreme rewind that guarantees no blocks from the
+ * checkpointed state will have been overwritten. It remembers the entire
+ * state of the storage pool (e.g. snapshots, dataset names, etc.) from the
+ * point that it was taken and the user can rewind back to that point even if
+ * they applied destructive operations on their datasets or even enabled new
+ * zpool on-disk features. If a pool has a checkpoint that is no longer
+ * needed, the user can discard it.
+ *
+ * == On disk data structures used ==
+ *
+ * - The pool has a new feature flag and a new entry in the MOS. The feature
+ * flag is set to active when we create the checkpoint and remains active
+ * until the checkpoint is fully discarded. The entry in the MOS config
+ * (DMU_POOL_ZPOOL_CHECKPOINT) is populated with the uberblock that
+ * references the state of the pool when we take the checkpoint. The entry
+ * remains populated until we start discarding the checkpoint or we rewind
+ * back to it.
+ *
+ * - Each vdev contains a vdev-wide space map while the pool has a checkpoint,
+ * which persists until the checkpoint is fully discarded. The space map
+ * contains entries that have been freed in the current state of the pool
+ * but we want to keep around in case we decide to rewind to the checkpoint.
+ * [see vdev_checkpoint_sm]
+ *
+ * - Each metaslab's ms_sm space map behaves the same as without the
+ * checkpoint, with the only exception being the scenario when we free
+ * blocks that belong to the checkpoint. In this case, these blocks remain
+ * ALLOCATED in the metaslab's space map and they are added as FREE in the
+ * vdev's checkpoint space map.
+ *
+ * - Each uberblock has a field (ub_checkpoint_txg) which holds the txg in
+ *   which the uberblock was checkpointed. For normal uberblocks this field
+ *   is 0.
+ *
+ * == Overview of operations ==
+ *
+ * - To create a checkpoint, we first wait for the current TXG to be synced,
+ * so we can use the most recently synced uberblock (spa_ubsync) as the
+ * checkpointed uberblock. Then we use an early synctask to place that
+ *   uberblock in the MOS config, increment the feature flag for the
+ *   checkpoint (marking it active), and set spa_checkpoint_txg (see its
+ *   use below) to the TXG of the checkpointed uberblock. We use an early
+ *   synctask for the aforementioned operations to ensure that no blocks
+ *   were dirtied between the current TXG and the TXG of the checkpointed
+ *   uberblock (i.e. the previous txg).
+ *
+ * - When a checkpoint exists, we need to ensure that the blocks that
+ * belong to the checkpoint are freed but never reused. This means that
+ * these blocks should never end up in the ms_allocatable or the ms_freeing
+ * trees of a metaslab. Therefore, whenever there is a checkpoint the new
+ * ms_checkpointing tree is used in addition to the aforementioned ones.
+ *
+ * Whenever a block is freed and we find out that it is referenced by the
+ * checkpoint (we find out by comparing its birth to spa_checkpoint_txg),
+ *   we place it in the ms_checkpointing tree instead of the ms_freeing
+ *   tree.
+ * This way, we divide the blocks that are being freed into checkpointed
+ * and not-checkpointed blocks.
+ *
+ * In order to persist these frees, we write the extents from the
+ *   ms_freeing tree to the ms_sm as usual, and the extents from the
+ * ms_checkpointing tree to the vdev_checkpoint_sm. This way, these
+ * checkpointed extents will remain allocated in the metaslab's ms_sm space
+ * map, and therefore won't be reused [see metaslab_sync()]. In addition,
+ * when we discard the checkpoint, we can find the entries that have
+ * actually been freed in vdev_checkpoint_sm.
+ * [see spa_checkpoint_discard_thread_sync()]
+ *
+ * - To discard the checkpoint we use an early synctask to delete the
+ * checkpointed uberblock from the MOS config, set spa_checkpoint_txg to 0,
+ *   and wake up the discarding zthr thread (an open-context async thread).
+ * We use an early synctask to ensure that the operation happens before any
+ * new data end up in the checkpoint's data structures.
+ *
+ * Once the synctask is done and the discarding zthr is awake, we discard
+ *   the checkpointed data over multiple TXGs by having the zthr prefetch
+ *   entries from vdev_checkpoint_sm and then start a synctask that places
+ *   them as free blocks into their respective ms_allocatable and ms_sm
+ * structures.
+ * [see spa_checkpoint_discard_thread()]
+ *
+ * When there are no entries left in the vdev_checkpoint_sm of all
+ * top-level vdevs, a final synctask runs that decrements the feature flag.
+ *
+ * - To rewind to the checkpoint, we first use the current uberblock and
+ * open the MOS so we can access the checkpointed uberblock from the MOS
+ * config. After we retrieve the checkpointed uberblock, we use it as the
+ * current uberblock for the pool by writing it to disk with an updated
+ * TXG, opening its version of the MOS, and moving on as usual from there.
+ * [see spa_ld_checkpoint_rewind()]
+ *
+ * An important note on rewinding to the checkpoint has to do with how we
+ * handle ZIL blocks. In the scenario of a rewind, we clear out any ZIL
+ * blocks that have not been claimed by the time we took the checkpoint
+ * as they should no longer be valid.
+ * [see comment in zil_claim()]
+ *
+ * == Miscellaneous information ==
+ *
+ * - In the hypothetical event that we take a checkpoint, remove a vdev,
+ * and attempt to rewind, the rewind would fail as the checkpointed
+ * uberblock would reference data in the removed device. For this reason
+ * and others of similar nature, we disallow the following operations that
+ * can change the config:
+ * vdev removal and attach/detach, mirror splitting, and pool reguid.
+ *
+ * - As most of the checkpoint logic is implemented in the SPA and doesn't
+ * distinguish datasets when it comes to space accounting, having a
+ * checkpoint can potentially break the boundaries set by dataset
+ * reservations.
+ */
+
+#include <sys/dmu_tx.h>
+#include <sys/dsl_dir.h>
+#include <sys/dsl_synctask.h>
+#include <sys/metaslab_impl.h>
+#include <sys/spa.h>
+#include <sys/spa_impl.h>
+#include <sys/spa_checkpoint.h>
+#include <sys/vdev_impl.h>
+#include <sys/zap.h>
+#include <sys/zfeature.h>
+
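+/*
+ * Illustrative routing sketch (hypothetical helper; the in-tree check
+ * lives in the metaslab free path): while a checkpoint exists, a block
+ * born at or before spa_checkpoint_txg is still referenced by the
+ * checkpoint, so its extent belongs in ms_checkpointing rather than
+ * ms_freeing.
+ */
+static boolean_t
+example_free_is_checkpointed(spa_t *spa, uint64_t blk_birth)
+{
+	return (spa->spa_checkpoint_txg != 0 &&
+	    blk_birth <= spa->spa_checkpoint_txg);
+}
+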
+/*
+ * The following parameter limits the amount of memory to be used for the
+ * prefetching of the checkpoint space map done on each vdev while
+ * discarding the checkpoint.
+ *
+ * It exists because prefetching the long checkpoint space maps of
+ * top-level vdevs can consume a lot of memory, depending on the amount
+ * of checkpointed data that has been freed within them while the pool
+ * had a checkpoint.
+ */
+uint64_t zfs_spa_discard_memory_limit = 16 * 1024 * 1024;
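+
+/*
+ * Illustrative arithmetic sketch (hypothetical helper) mirroring the
+ * computation in spa_checkpoint_discard_thread_sync(): with the default
+ * 16 MB limit and worst-case two-word (16-byte) entries, at most
+ * (16 MB / 16) / 2 == 524288 entries are destroyed per synctask.
+ */
+static uint64_t
+example_max_entry_limit(uint64_t memory_limit)
+{
+	return ((memory_limit / (2 * sizeof (uint64_t))) >> 1);
+}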
+
+int
+spa_checkpoint_get_stats(spa_t *spa, pool_checkpoint_stat_t *pcs)
+{
+ if (!spa_feature_is_active(spa, SPA_FEATURE_POOL_CHECKPOINT))
+ return (SET_ERROR(ZFS_ERR_NO_CHECKPOINT));
+
+ bzero(pcs, sizeof (pool_checkpoint_stat_t));
+
+ int error = zap_contains(spa_meta_objset(spa),
+ DMU_POOL_DIRECTORY_OBJECT, DMU_POOL_ZPOOL_CHECKPOINT);
+ ASSERT(error == 0 || error == ENOENT);
+
+ if (error == ENOENT)
+ pcs->pcs_state = CS_CHECKPOINT_DISCARDING;
+ else
+ pcs->pcs_state = CS_CHECKPOINT_EXISTS;
+
+ pcs->pcs_space = spa->spa_checkpoint_info.sci_dspace;
+ pcs->pcs_start_time = spa->spa_checkpoint_info.sci_timestamp;
+
+ return (0);
+}
+
+static void
+spa_checkpoint_discard_complete_sync(void *arg, dmu_tx_t *tx)
+{
+ spa_t *spa = arg;
+
+ spa->spa_checkpoint_info.sci_timestamp = 0;
+
+ spa_feature_decr(spa, SPA_FEATURE_POOL_CHECKPOINT, tx);
+
+ spa_history_log_internal(spa, "spa discard checkpoint", tx,
+ "finished discarding checkpointed state from the pool");
+}
+
+typedef struct spa_checkpoint_discard_sync_callback_arg {
+ vdev_t *sdc_vd;
+ uint64_t sdc_txg;
+ uint64_t sdc_entry_limit;
+} spa_checkpoint_discard_sync_callback_arg_t;
+
+static int
+spa_checkpoint_discard_sync_callback(space_map_entry_t *sme, void *arg)
+{
+ spa_checkpoint_discard_sync_callback_arg_t *sdc = arg;
+ vdev_t *vd = sdc->sdc_vd;
+ metaslab_t *ms = vd->vdev_ms[sme->sme_offset >> vd->vdev_ms_shift];
+ uint64_t end = sme->sme_offset + sme->sme_run;
+
+ if (sdc->sdc_entry_limit == 0)
+ return (EINTR);
+
+ /*
+ * Since the space map is not condensed, we know that
+ * none of its entries is crossing the boundaries of
+ * its respective metaslab.
+ *
+ * That said, there is no fundamental requirement that
+ * the checkpoint's space map entries should not cross
+ * metaslab boundaries. So if needed we could add code
+ * that handles metaslab-crossing segments in the future.
+ */
+ VERIFY3U(sme->sme_type, ==, SM_FREE);
+ VERIFY3U(sme->sme_offset, >=, ms->ms_start);
+ VERIFY3U(end, <=, ms->ms_start + ms->ms_size);
+
+ /*
+ * At this point we should not be processing any
+ * other frees concurrently, so the lock is technically
+ * unnecessary. We use the lock anyway though to
+ * potentially save ourselves from future headaches.
+ */
+ mutex_enter(&ms->ms_lock);
+ if (range_tree_is_empty(ms->ms_freeing))
+ vdev_dirty(vd, VDD_METASLAB, ms, sdc->sdc_txg);
+ range_tree_add(ms->ms_freeing, sme->sme_offset, sme->sme_run);
+ mutex_exit(&ms->ms_lock);
+
+ ASSERT3U(vd->vdev_spa->spa_checkpoint_info.sci_dspace, >=,
+ sme->sme_run);
+ ASSERT3U(vd->vdev_stat.vs_checkpoint_space, >=, sme->sme_run);
+
+ vd->vdev_spa->spa_checkpoint_info.sci_dspace -= sme->sme_run;
+ vd->vdev_stat.vs_checkpoint_space -= sme->sme_run;
+ sdc->sdc_entry_limit--;
+
+ return (0);
+}
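+
+/*
+ * Illustrative arithmetic sketch (hypothetical helper): the callback
+ * above maps a space map entry to its metaslab by shifting the entry's
+ * offset right by vdev_ms_shift, i.e. dividing by the metaslab size.
+ */
+static metaslab_t *
+example_metaslab_for_offset(vdev_t *vd, uint64_t offset)
+{
+	return (vd->vdev_ms[offset >> vd->vdev_ms_shift]);
+}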
+
+static void
+spa_checkpoint_accounting_verify(spa_t *spa)
+{
+ vdev_t *rvd = spa->spa_root_vdev;
+ uint64_t ckpoint_sm_space_sum = 0;
+ uint64_t vs_ckpoint_space_sum = 0;
+
+ for (uint64_t c = 0; c < rvd->vdev_children; c++) {
+ vdev_t *vd = rvd->vdev_child[c];
+
+ if (vd->vdev_checkpoint_sm != NULL) {
+ ckpoint_sm_space_sum +=
+ -space_map_allocated(vd->vdev_checkpoint_sm);
+ vs_ckpoint_space_sum +=
+ vd->vdev_stat.vs_checkpoint_space;
+ ASSERT3U(ckpoint_sm_space_sum, ==,
+ vs_ckpoint_space_sum);
+ } else {
+ ASSERT0(vd->vdev_stat.vs_checkpoint_space);
+ }
+ }
+ ASSERT3U(spa->spa_checkpoint_info.sci_dspace, ==, ckpoint_sm_space_sum);
+}
+
+static void
+spa_checkpoint_discard_thread_sync(void *arg, dmu_tx_t *tx)
+{
+ vdev_t *vd = arg;
+ int error;
+
+ /*
+ * The space map callback is applied only to non-debug entries.
+	 * Because the number of debug entries is less than or equal to
+	 * the number of non-debug entries, we want to ensure that we only
+	 * read what we prefetched from open context.
+	 *
+	 * Thus, we cap the number of entries that the space map callback
+	 * will be applied to at half the entries that could fit in the
+	 * imposed memory limit.
+	 *
+	 * Note that since this is a conservative estimate, we also
+	 * assume the worst-case scenario in our computation, where each
+	 * entry is two words.
+ */
+ uint64_t max_entry_limit =
+ (zfs_spa_discard_memory_limit / (2 * sizeof (uint64_t))) >> 1;
+
+ /*
+ * Iterate from the end of the space map towards the beginning,
+ * placing its entries on ms_freeing and removing them from the
+ * space map. The iteration stops if one of the following
+ * conditions is true:
+ *
+ * 1] We reached the beginning of the space map. At this point
+ * the space map should be completely empty and
+ * space_map_incremental_destroy should have returned 0.
+ * The next step would be to free and close the space map
+ * and remove its entry from its vdev's top zap. This allows
+ * spa_checkpoint_discard_thread() to move on to the next vdev.
+ *
+ * 2] We reached the memory limit (amount of memory used to hold
+ * space map entries in memory) and space_map_incremental_destroy
+ * returned EINTR. This means that there are entries remaining
+ * in the space map that will be cleared in a future invocation
+ * of this function by spa_checkpoint_discard_thread().
+ */
+ spa_checkpoint_discard_sync_callback_arg_t sdc;
+ sdc.sdc_vd = vd;
+ sdc.sdc_txg = tx->tx_txg;
+ sdc.sdc_entry_limit = max_entry_limit;
+
+ uint64_t words_before =
+ space_map_length(vd->vdev_checkpoint_sm) / sizeof (uint64_t);
+
+ error = space_map_incremental_destroy(vd->vdev_checkpoint_sm,
+ spa_checkpoint_discard_sync_callback, &sdc, tx);
+
+ uint64_t words_after =
+ space_map_length(vd->vdev_checkpoint_sm) / sizeof (uint64_t);
+
+#ifdef DEBUG
+ spa_checkpoint_accounting_verify(vd->vdev_spa);
+#endif
+
+ zfs_dbgmsg("discarding checkpoint: txg %llu, vdev id %d, "
+ "deleted %llu words - %llu words are left",
+ tx->tx_txg, vd->vdev_id, (words_before - words_after),
+ words_after);
+
+ if (error != EINTR) {
+ if (error != 0) {
+ zfs_panic_recover("zfs: error %d was returned "
+ "while incrementally destroying the checkpoint "
+ "space map of vdev %llu\n",
+ error, vd->vdev_id);
+ }
+ ASSERT0(words_after);
+ ASSERT0(space_map_allocated(vd->vdev_checkpoint_sm));
+ ASSERT0(space_map_length(vd->vdev_checkpoint_sm));
+
+ space_map_free(vd->vdev_checkpoint_sm, tx);
+ space_map_close(vd->vdev_checkpoint_sm);
+ vd->vdev_checkpoint_sm = NULL;
+
+ VERIFY0(zap_remove(spa_meta_objset(vd->vdev_spa),
+ vd->vdev_top_zap, VDEV_TOP_ZAP_POOL_CHECKPOINT_SM, tx));
+ }
+}
+
+static boolean_t
+spa_checkpoint_discard_is_done(spa_t *spa)
+{
+ vdev_t *rvd = spa->spa_root_vdev;
+
+ ASSERT(!spa_has_checkpoint(spa));
+ ASSERT(spa_feature_is_active(spa, SPA_FEATURE_POOL_CHECKPOINT));
+
+ for (uint64_t c = 0; c < rvd->vdev_children; c++) {
+ if (rvd->vdev_child[c]->vdev_checkpoint_sm != NULL)
+ return (B_FALSE);
+ ASSERT0(rvd->vdev_child[c]->vdev_stat.vs_checkpoint_space);
+ }
+
+ return (B_TRUE);
+}
+
+/* ARGSUSED */
+boolean_t
+spa_checkpoint_discard_thread_check(void *arg, zthr_t *zthr)
+{
+ spa_t *spa = arg;
+
+ if (!spa_feature_is_active(spa, SPA_FEATURE_POOL_CHECKPOINT))
+ return (B_FALSE);
+
+ if (spa_has_checkpoint(spa))
+ return (B_FALSE);
+
+ return (B_TRUE);
+}
+
+void
+spa_checkpoint_discard_thread(void *arg, zthr_t *zthr)
+{
+ spa_t *spa = arg;
+ vdev_t *rvd = spa->spa_root_vdev;
+
+ for (uint64_t c = 0; c < rvd->vdev_children; c++) {
+ vdev_t *vd = rvd->vdev_child[c];
+
+ while (vd->vdev_checkpoint_sm != NULL) {
+ space_map_t *checkpoint_sm = vd->vdev_checkpoint_sm;
+ int numbufs;
+ dmu_buf_t **dbp;
+
+ if (zthr_iscancelled(zthr))
+ return;
+
+ ASSERT3P(vd->vdev_ops, !=, &vdev_indirect_ops);
+
+ uint64_t size = MIN(space_map_length(checkpoint_sm),
+ zfs_spa_discard_memory_limit);
+ uint64_t offset =
+ space_map_length(checkpoint_sm) - size;
+
+ /*
+ * Ensure that the part of the space map that will
+ * be destroyed by the synctask, is prefetched in
+ * memory before the synctask runs.
+ */
+ int error = dmu_buf_hold_array_by_bonus(
+ checkpoint_sm->sm_dbuf, offset, size,
+ B_TRUE, FTAG, &numbufs, &dbp);
+ if (error != 0) {
+ zfs_panic_recover("zfs: error %d was returned "
+ "while prefetching checkpoint space map "
+ "entries of vdev %llu\n",
+ error, vd->vdev_id);
+ }
+
+ VERIFY0(dsl_sync_task(spa->spa_name, NULL,
+ spa_checkpoint_discard_thread_sync, vd,
+ 0, ZFS_SPACE_CHECK_NONE));
+
+ dmu_buf_rele_array(dbp, numbufs, FTAG);
+ }
+ }
+
+ VERIFY(spa_checkpoint_discard_is_done(spa));
+ VERIFY0(spa->spa_checkpoint_info.sci_dspace);
+ VERIFY0(dsl_sync_task(spa->spa_name, NULL,
+ spa_checkpoint_discard_complete_sync, spa,
+ 0, ZFS_SPACE_CHECK_NONE));
+}
+
+
+/* ARGSUSED */
+static int
+spa_checkpoint_check(void *arg, dmu_tx_t *tx)
+{
+ spa_t *spa = dmu_tx_pool(tx)->dp_spa;
+
+ if (!spa_feature_is_enabled(spa, SPA_FEATURE_POOL_CHECKPOINT))
+ return (SET_ERROR(ENOTSUP));
+
+ if (!spa_top_vdevs_spacemap_addressable(spa))
+ return (SET_ERROR(ZFS_ERR_VDEV_TOO_BIG));
+
+ if (spa->spa_vdev_removal != NULL)
+ return (SET_ERROR(ZFS_ERR_DEVRM_IN_PROGRESS));
+
+ if (spa->spa_checkpoint_txg != 0)
+ return (SET_ERROR(ZFS_ERR_CHECKPOINT_EXISTS));
+
+ if (spa_feature_is_active(spa, SPA_FEATURE_POOL_CHECKPOINT))
+ return (SET_ERROR(ZFS_ERR_DISCARDING_CHECKPOINT));
+
+ return (0);
+}
+
+/* ARGSUSED */
+static void
+spa_checkpoint_sync(void *arg, dmu_tx_t *tx)
+{
+ dsl_pool_t *dp = dmu_tx_pool(tx);
+ spa_t *spa = dp->dp_spa;
+ uberblock_t checkpoint = spa->spa_ubsync;
+
+ /*
+ * At this point, there should not be a checkpoint in the MOS.
+ */
+ ASSERT3U(zap_contains(spa_meta_objset(spa), DMU_POOL_DIRECTORY_OBJECT,
+ DMU_POOL_ZPOOL_CHECKPOINT), ==, ENOENT);
+
+ ASSERT0(spa->spa_checkpoint_info.sci_timestamp);
+ ASSERT0(spa->spa_checkpoint_info.sci_dspace);
+
+ /*
+ * Since the checkpointed uberblock is the one that just got synced
+ * (we use spa_ubsync), its txg must be equal to the txg number of
+ * the txg we are syncing, minus 1.
+ */
+ ASSERT3U(checkpoint.ub_txg, ==, spa->spa_syncing_txg - 1);
+
+ /*
+ * Once the checkpoint is in place, we need to ensure that none of
+ * its blocks will be marked for reuse after it has been freed.
+ * When there is a checkpoint and a block is freed, we compare its
+ * birth txg to the txg of the checkpointed uberblock to see if the
+ * block is part of the checkpoint or not. Therefore, we have to set
+	 * spa_checkpoint_txg before any frees happen in this txg (which is
+	 * why this is done as an early synctask, as explained in the
+	 * comment in spa_checkpoint()).
+ */
+ spa->spa_checkpoint_txg = checkpoint.ub_txg;
+ spa->spa_checkpoint_info.sci_timestamp = checkpoint.ub_timestamp;
+
+ checkpoint.ub_checkpoint_txg = checkpoint.ub_txg;
+ VERIFY0(zap_add(spa->spa_dsl_pool->dp_meta_objset,
+ DMU_POOL_DIRECTORY_OBJECT, DMU_POOL_ZPOOL_CHECKPOINT,
+ sizeof (uint64_t), sizeof (uberblock_t) / sizeof (uint64_t),
+ &checkpoint, tx));
+
+ /*
+ * Increment the feature refcount and thus activate the feature.
+ * Note that the feature will be deactivated when we've
+ * completely discarded all checkpointed state (both vdev
+ * space maps and uberblock).
+ */
+ spa_feature_incr(spa, SPA_FEATURE_POOL_CHECKPOINT, tx);
+
+ spa_history_log_internal(spa, "spa checkpoint", tx,
+ "checkpointed uberblock txg=%llu", checkpoint.ub_txg);
+}
+
+/*
+ * Create a checkpoint for the pool.
+ */
+int
+spa_checkpoint(const char *pool)
+{
+ int error;
+ spa_t *spa;
+
+ error = spa_open(pool, &spa, FTAG);
+ if (error != 0)
+ return (error);
+
+ mutex_enter(&spa->spa_vdev_top_lock);
+
+ /*
+ * Wait for current syncing txg to finish so the latest synced
+ * uberblock (spa_ubsync) has all the changes that we expect
+ * to see if we were to revert later to the checkpoint. In other
+ * words we want the checkpointed uberblock to include/reference
+ * all the changes that were pending at the time that we issued
+ * the checkpoint command.
+ */
+ txg_wait_synced(spa_get_dsl(spa), 0);
+
+ /*
+	 * As the checkpointed uberblock references blocks from the previous
+	 * txg (spa_ubsync), we want to ensure that we are not freeing any
+	 * of these blocks in the same txg that the following synctask will
+ * run. Thus, we run it as an early synctask, so the dirty changes
+ * that are synced to disk afterwards during zios and other synctasks
+ * do not reuse checkpointed blocks.
+ */
+ error = dsl_early_sync_task(pool, spa_checkpoint_check,
+ spa_checkpoint_sync, NULL, 0, ZFS_SPACE_CHECK_NORMAL);
+
+ mutex_exit(&spa->spa_vdev_top_lock);
+
+ spa_close(spa, FTAG);
+ return (error);
+}
+
+/* ARGSUSED */
+static int
+spa_checkpoint_discard_check(void *arg, dmu_tx_t *tx)
+{
+ spa_t *spa = dmu_tx_pool(tx)->dp_spa;
+
+ if (!spa_feature_is_active(spa, SPA_FEATURE_POOL_CHECKPOINT))
+ return (SET_ERROR(ZFS_ERR_NO_CHECKPOINT));
+
+ if (spa->spa_checkpoint_txg == 0)
+ return (SET_ERROR(ZFS_ERR_DISCARDING_CHECKPOINT));
+
+ VERIFY0(zap_contains(spa_meta_objset(spa),
+ DMU_POOL_DIRECTORY_OBJECT, DMU_POOL_ZPOOL_CHECKPOINT));
+
+ return (0);
+}
+
+/* ARGSUSED */
+static void
+spa_checkpoint_discard_sync(void *arg, dmu_tx_t *tx)
+{
+ spa_t *spa = dmu_tx_pool(tx)->dp_spa;
+
+ VERIFY0(zap_remove(spa_meta_objset(spa), DMU_POOL_DIRECTORY_OBJECT,
+ DMU_POOL_ZPOOL_CHECKPOINT, tx));
+
+ spa->spa_checkpoint_txg = 0;
+
+ zthr_wakeup(spa->spa_checkpoint_discard_zthr);
+
+ spa_history_log_internal(spa, "spa discard checkpoint", tx,
+ "started discarding checkpointed state from the pool");
+}
+
+/*
+ * Discard the checkpoint from a pool.
+ */
+int
+spa_checkpoint_discard(const char *pool)
+{
+ /*
+ * Similarly to spa_checkpoint(), we want our synctask to run
+ * before any pending dirty data are written to disk so they
+ * won't end up in the checkpoint's data structures (e.g.
+ * ms_checkpointing and vdev_checkpoint_sm) and re-create any
+ * space maps that the discarding open-context thread has
+ * deleted.
+	 * [see spa_checkpoint_discard_sync and spa_checkpoint_discard_thread]
+ */
+ return (dsl_early_sync_task(pool, spa_checkpoint_discard_check,
+ spa_checkpoint_discard_sync, NULL, 0,
+ ZFS_SPACE_CHECK_DISCARD_CHECKPOINT));
+}
diff --git a/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/spa_config.c b/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/spa_config.c
new file mode 100644
index 000000000000..b616a439f7b8
--- /dev/null
+++ b/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/spa_config.c
@@ -0,0 +1,594 @@
+/*
+ * CDDL HEADER START
+ *
+ * The contents of this file are subject to the terms of the
+ * Common Development and Distribution License (the "License").
+ * You may not use this file except in compliance with the License.
+ *
+ * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
+ * or http://www.opensolaris.org/os/licensing.
+ * See the License for the specific language governing permissions
+ * and limitations under the License.
+ *
+ * When distributing Covered Code, include this CDDL HEADER in each
+ * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
+ * If applicable, add the following below this CDDL HEADER, with the
+ * fields enclosed by brackets "[]" replaced with your own identifying
+ * information: Portions Copyright [yyyy] [name of copyright owner]
+ *
+ * CDDL HEADER END
+ */
+
+/*
+ * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved.
+ * Copyright 2011 Nexenta Systems, Inc. All rights reserved.
+ * Copyright (c) 2011, 2018 by Delphix. All rights reserved.
+ * Copyright 2017 Joyent, Inc.
+ */
+
+#include <sys/zfs_context.h>
+#include <sys/spa.h>
+#include <sys/fm/fs/zfs.h>
+#include <sys/spa_impl.h>
+#include <sys/nvpair.h>
+#include <sys/uio.h>
+#include <sys/fs/zfs.h>
+#include <sys/vdev_impl.h>
+#include <sys/zfs_ioctl.h>
+#include <sys/utsname.h>
+#include <sys/sunddi.h>
+#include <sys/zfeature.h>
+#ifdef _KERNEL
+#include <sys/kobj.h>
+#include <sys/zone.h>
+#endif
+
+/*
+ * Pool configuration repository.
+ *
+ * Pool configuration is stored as a packed nvlist on the filesystem. By
+ * default, all pools are stored in /etc/zfs/zpool.cache and loaded on boot
+ * (when the ZFS module is loaded). Pools can also have the 'cachefile'
+ * property set that allows them to be stored in an alternate location under
+ * the control of external software.
+ *
+ * For each cache file, we have a single nvlist which holds all the
+ * configuration information. When the module loads, we read this information
+ * from /etc/zfs/zpool.cache and populate the SPA namespace. This namespace is
+ * maintained independently in spa.c. Whenever the namespace is modified, or
+ * the configuration of a pool is changed, we call spa_write_cachefile(), which
+ * walks through all the active pools and writes the configuration to disk.
+ */
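+
+/*
+ * Schematically (pool names illustrative), the packed nvlist in a cache
+ * file has one top-level pair per pool:
+ *
+ *   { "tank" -> { ...pool config... }, "backup" -> { ...pool config... } }
+ *
+ * which is why spa_config_load() below simply walks the top-level pairs
+ * and calls spa_add() for each one.
+ */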
+
+static uint64_t spa_config_generation = 1;
+
+/*
+ * This can be overridden in userland to preserve an alternate namespace for
+ * userland pools when doing testing.
+ */
+const char *spa_config_path = ZPOOL_CACHE;
+
+/*
+ * Called when the module is first loaded, this routine loads the configuration
+ * file into the SPA namespace. It does not actually open or load the pools; it
+ * only populates the namespace.
+ */
+void
+spa_config_load(void)
+{
+ void *buf = NULL;
+ nvlist_t *nvlist, *child;
+ nvpair_t *nvpair;
+ char *pathname;
+ struct _buf *file;
+ uint64_t fsize;
+
+ /*
+ * Open the configuration file.
+ */
+ pathname = kmem_alloc(MAXPATHLEN, KM_SLEEP);
+
+ (void) snprintf(pathname, MAXPATHLEN, "%s", spa_config_path);
+
+ file = kobj_open_file(pathname);
+
+ kmem_free(pathname, MAXPATHLEN);
+
+ if (file == (struct _buf *)-1)
+ return;
+
+ if (kobj_get_filesize(file, &fsize) != 0)
+ goto out;
+
+ buf = kmem_alloc(fsize, KM_SLEEP);
+
+ /*
+ * Read the nvlist from the file.
+ */
+ if (kobj_read_file(file, buf, fsize, 0) < 0)
+ goto out;
+
+ /*
+ * Unpack the nvlist.
+ */
+ if (nvlist_unpack(buf, fsize, &nvlist, KM_SLEEP) != 0)
+ goto out;
+
+ /*
+ * Iterate over all elements in the nvlist, creating a new spa_t for
+ * each one with the specified configuration.
+ */
+ mutex_enter(&spa_namespace_lock);
+ nvpair = NULL;
+ while ((nvpair = nvlist_next_nvpair(nvlist, nvpair)) != NULL) {
+ if (nvpair_type(nvpair) != DATA_TYPE_NVLIST)
+ continue;
+
+ child = fnvpair_value_nvlist(nvpair);
+
+ if (spa_lookup(nvpair_name(nvpair)) != NULL)
+ continue;
+ (void) spa_add(nvpair_name(nvpair), child, NULL);
+ }
+ mutex_exit(&spa_namespace_lock);
+
+ nvlist_free(nvlist);
+
+out:
+ if (buf != NULL)
+ kmem_free(buf, fsize);
+
+ kobj_close_file(file);
+}
+
+static void
+spa_config_clean(nvlist_t *nvl)
+{
+ nvlist_t **child;
+ nvlist_t *nvroot = NULL;
+ uint_t c, children;
+
+ if (nvlist_lookup_nvlist_array(nvl, ZPOOL_CONFIG_CHILDREN, &child,
+ &children) == 0) {
+ for (c = 0; c < children; c++)
+ spa_config_clean(child[c]);
+ }
+
+ if (nvlist_lookup_nvlist(nvl, ZPOOL_CONFIG_VDEV_TREE, &nvroot) == 0)
+ spa_config_clean(nvroot);
+
+ nvlist_remove(nvl, ZPOOL_CONFIG_VDEV_STATS, DATA_TYPE_UINT64_ARRAY);
+ nvlist_remove(nvl, ZPOOL_CONFIG_SCAN_STATS, DATA_TYPE_UINT64_ARRAY);
+}
+
+static int
+spa_config_write(spa_config_dirent_t *dp, nvlist_t *nvl)
+{
+ size_t buflen;
+ char *buf;
+ vnode_t *vp;
+ int oflags = FWRITE | FTRUNC | FCREAT | FOFFMAX;
+ char *temp;
+ int err;
+
+ /*
+ * If the nvlist is empty (NULL), then remove the old cachefile.
+ */
+ if (nvl == NULL) {
+ err = vn_remove(dp->scd_path, UIO_SYSSPACE, RMFILE);
+ return (err);
+ }
+
+ /*
+ * Pack the configuration into a buffer.
+ */
+ buf = fnvlist_pack(nvl, &buflen);
+ temp = kmem_zalloc(MAXPATHLEN, KM_SLEEP);
+
+ /*
+ * Write the configuration to disk. We need to do the traditional
+ * 'write to temporary file, sync, move over original' to make sure we
+ * always have a consistent view of the data.
+ */
+ (void) snprintf(temp, MAXPATHLEN, "%s.tmp", dp->scd_path);
+
+ err = vn_open(temp, UIO_SYSSPACE, oflags, 0644, &vp, CRCREAT, 0);
+ if (err == 0) {
+ err = vn_rdwr(UIO_WRITE, vp, buf, buflen, 0, UIO_SYSSPACE,
+ 0, RLIM64_INFINITY, kcred, NULL);
+ if (err == 0)
+ err = VOP_FSYNC(vp, FSYNC, kcred, NULL);
+ if (err == 0)
+ err = vn_rename(temp, dp->scd_path, UIO_SYSSPACE);
+ (void) VOP_CLOSE(vp, oflags, 1, 0, kcred, NULL);
+ }
+
+ (void) vn_remove(temp, UIO_SYSSPACE, RMFILE);
+
+ fnvlist_pack_free(buf, buflen);
+ kmem_free(temp, MAXPATHLEN);
+ return (err);
+}
+
+/*
+ * Synchronize pool configuration to disk. This must be called with the
+ * namespace lock held. Synchronizing the pool cache is typically done after
+ * the configuration has been synced to the MOS. This exposes a window where
+ * the MOS config will have been updated but the cache file has not yet been.
+ * If the system were to crash at that instant, then the cached config may
+ * not contain the correct information to open the pool, and an explicit
+ * import would be required.
+ */
+void
+spa_write_cachefile(spa_t *target, boolean_t removing, boolean_t postsysevent)
+{
+ spa_config_dirent_t *dp, *tdp;
+ nvlist_t *nvl;
+ boolean_t ccw_failure;
+ int error;
+ char *pool_name;
+
+ ASSERT(MUTEX_HELD(&spa_namespace_lock));
+
+ if (rootdir == NULL || !(spa_mode_global & FWRITE))
+ return;
+
+ /*
+ * Iterate over all cachefiles for the pool, past or present. When the
+ * cachefile is changed, the new one is pushed onto this list, allowing
+ * us to update previous cachefiles that no longer contain this pool.
+ */
+ ccw_failure = B_FALSE;
+ for (dp = list_head(&target->spa_config_list); dp != NULL;
+ dp = list_next(&target->spa_config_list, dp)) {
+ spa_t *spa = NULL;
+ if (dp->scd_path == NULL)
+ continue;
+
+ /*
+ * Iterate over all pools, adding any matching pools to 'nvl'.
+ */
+ nvl = NULL;
+ while ((spa = spa_next(spa)) != NULL) {
+ nvlist_t *nvroot = NULL;
+ /*
+ * Skip over our own pool if we're about to remove
+ * ourselves from the spa namespace or any pool that
+ * is readonly. Since we cannot guarantee that a
+ * readonly pool would successfully import upon reboot,
+			 * we don't allow it to be written to the cache file.
+ */
+ if ((spa == target && removing) ||
+ (spa_state(spa) == POOL_STATE_ACTIVE &&
+ !spa_writeable(spa)))
+ continue;
+
+ mutex_enter(&spa->spa_props_lock);
+ tdp = list_head(&spa->spa_config_list);
+ if (spa->spa_config == NULL ||
+ tdp->scd_path == NULL ||
+ strcmp(tdp->scd_path, dp->scd_path) != 0) {
+ mutex_exit(&spa->spa_props_lock);
+ continue;
+ }
+
+ if (nvl == NULL)
+ nvl = fnvlist_alloc();
+
+ if (spa->spa_import_flags & ZFS_IMPORT_TEMP_NAME) {
+ pool_name = fnvlist_lookup_string(
+ spa->spa_config, ZPOOL_CONFIG_POOL_NAME);
+ } else {
+ pool_name = spa_name(spa);
+ }
+
+ fnvlist_add_nvlist(nvl, pool_name,
+ spa->spa_config);
+ mutex_exit(&spa->spa_props_lock);
+
+ if (nvlist_lookup_nvlist(nvl, pool_name, &nvroot) == 0)
+ spa_config_clean(nvroot);
+ }
+
+ error = spa_config_write(dp, nvl);
+ if (error != 0)
+ ccw_failure = B_TRUE;
+ nvlist_free(nvl);
+ }
+
+ if (ccw_failure) {
+ /*
+ * Keep trying so that configuration data is
+ * written if/when any temporary filesystem
+ * resource issues are resolved.
+ */
+ if (target->spa_ccw_fail_time == 0) {
+ zfs_ereport_post(FM_EREPORT_ZFS_CONFIG_CACHE_WRITE,
+ target, NULL, NULL, 0, 0);
+ }
+ target->spa_ccw_fail_time = gethrtime();
+ spa_async_request(target, SPA_ASYNC_CONFIG_UPDATE);
+ } else {
+ /*
+ * Do not rate limit future attempts to update
+ * the config cache.
+ */
+ target->spa_ccw_fail_time = 0;
+ }
+
+ /*
+ * Remove any config entries older than the current one.
+ */
+ dp = list_head(&target->spa_config_list);
+ while ((tdp = list_next(&target->spa_config_list, dp)) != NULL) {
+ list_remove(&target->spa_config_list, tdp);
+ if (tdp->scd_path != NULL)
+ spa_strfree(tdp->scd_path);
+ kmem_free(tdp, sizeof (spa_config_dirent_t));
+ }
+
+ spa_config_generation++;
+
+ if (postsysevent)
+ spa_event_notify(target, NULL, NULL, ESC_ZFS_CONFIG_SYNC);
+}
+
+/*
+ * Sigh. Inside a local zone, we don't have access to /etc/zfs/zpool.cache,
+ * and we don't want to allow the local zone to see all the pools anyway.
+ * So we have to invent the ZFS_IOC_CONFIG ioctl to grab the configuration
+ * information for all pool visible within the zone.
+ */
+nvlist_t *
+spa_all_configs(uint64_t *generation)
+{
+ nvlist_t *pools;
+ spa_t *spa = NULL;
+
+ if (*generation == spa_config_generation)
+ return (NULL);
+
+ pools = fnvlist_alloc();
+
+ mutex_enter(&spa_namespace_lock);
+ while ((spa = spa_next(spa)) != NULL) {
+ if (INGLOBALZONE(curthread) ||
+ zone_dataset_visible(spa_name(spa), NULL)) {
+ mutex_enter(&spa->spa_props_lock);
+ fnvlist_add_nvlist(pools, spa_name(spa),
+ spa->spa_config);
+ mutex_exit(&spa->spa_props_lock);
+ }
+ }
+ *generation = spa_config_generation;
+ mutex_exit(&spa_namespace_lock);
+
+ return (pools);
+}
+
+void
+spa_config_set(spa_t *spa, nvlist_t *config)
+{
+ mutex_enter(&spa->spa_props_lock);
+ if (spa->spa_config != NULL && spa->spa_config != config)
+ nvlist_free(spa->spa_config);
+ spa->spa_config = config;
+ mutex_exit(&spa->spa_props_lock);
+}
+
+/*
+ * Generate the pool's configuration based on the current in-core state.
+ *
+ * We infer whether to generate a complete config or just one top-level config
+ * based on whether vd is the root vdev.
+ */
+nvlist_t *
+spa_config_generate(spa_t *spa, vdev_t *vd, uint64_t txg, int getstats)
+{
+ nvlist_t *config, *nvroot;
+ vdev_t *rvd = spa->spa_root_vdev;
+ unsigned long hostid = 0;
+ boolean_t locked = B_FALSE;
+ uint64_t split_guid;
+ char *pool_name;
+
+ if (vd == NULL) {
+ vd = rvd;
+ locked = B_TRUE;
+ spa_config_enter(spa, SCL_CONFIG | SCL_STATE, FTAG, RW_READER);
+ }
+
+ ASSERT(spa_config_held(spa, SCL_CONFIG | SCL_STATE, RW_READER) ==
+ (SCL_CONFIG | SCL_STATE));
+
+ /*
+ * If txg is -1, report the current value of spa->spa_config_txg.
+ */
+ if (txg == -1ULL)
+ txg = spa->spa_config_txg;
+
+ /*
+ * Originally, users had to handle spa namespace collisions by either
+ * exporting the already imported pool or by specifying a new name for
+ * the pool with a conflicting name. In the case of root pools from
+ * virtual guests, neither approach to collision resolution is
+ * reasonable. This is addressed by extending the new name syntax with
+ * an option to specify that the new name is temporary. When specified,
+ * ZFS_IMPORT_TEMP_NAME will be set in spa->spa_import_flags to tell us
+ * to use the previous name, which we do below.
+ */
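+	/*
+	 * (As a hypothetical example of how such a temporary name arises:
+	 * "zpool import -t <pool> <tempname>" imports a pool under a
+	 * temporary name while keeping the original name in the config,
+	 * which is what we recover here.)
+	 */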
+ if (spa->spa_import_flags & ZFS_IMPORT_TEMP_NAME) {
+ pool_name = fnvlist_lookup_string(spa->spa_config,
+ ZPOOL_CONFIG_POOL_NAME);
+ } else {
+ pool_name = spa_name(spa);
+ }
+
+ config = fnvlist_alloc();
+
+ fnvlist_add_uint64(config, ZPOOL_CONFIG_VERSION, spa_version(spa));
+ fnvlist_add_string(config, ZPOOL_CONFIG_POOL_NAME, pool_name);
+ fnvlist_add_uint64(config, ZPOOL_CONFIG_POOL_STATE, spa_state(spa));
+ fnvlist_add_uint64(config, ZPOOL_CONFIG_POOL_TXG, txg);
+ fnvlist_add_uint64(config, ZPOOL_CONFIG_POOL_GUID, spa_guid(spa));
+ if (spa->spa_comment != NULL) {
+ fnvlist_add_string(config, ZPOOL_CONFIG_COMMENT,
+ spa->spa_comment);
+ }
+
+ hostid = spa_get_hostid();
+ if (hostid != 0) {
+ fnvlist_add_uint64(config, ZPOOL_CONFIG_HOSTID, hostid);
+ }
+ fnvlist_add_string(config, ZPOOL_CONFIG_HOSTNAME, utsname.nodename);
+
+ int config_gen_flags = 0;
+ if (vd != rvd) {
+ fnvlist_add_uint64(config, ZPOOL_CONFIG_TOP_GUID,
+ vd->vdev_top->vdev_guid);
+ fnvlist_add_uint64(config, ZPOOL_CONFIG_GUID,
+ vd->vdev_guid);
+ if (vd->vdev_isspare) {
+ fnvlist_add_uint64(config,
+ ZPOOL_CONFIG_IS_SPARE, 1ULL);
+ }
+ if (vd->vdev_islog) {
+ fnvlist_add_uint64(config,
+ ZPOOL_CONFIG_IS_LOG, 1ULL);
+ }
+ vd = vd->vdev_top; /* label contains top config */
+ } else {
+ /*
+ * Only add the (potentially large) split information
+ * in the mos config, and not in the vdev labels
+ */
+ if (spa->spa_config_splitting != NULL)
+ fnvlist_add_nvlist(config, ZPOOL_CONFIG_SPLIT,
+ spa->spa_config_splitting);
+ fnvlist_add_boolean(config,
+ ZPOOL_CONFIG_HAS_PER_VDEV_ZAPS);
+
+ config_gen_flags |= VDEV_CONFIG_MOS;
+ }
+
+ /*
+ * Add the top-level config. We even add this on pools which
+ * don't support holes in the namespace.
+ */
+ vdev_top_config_generate(spa, config);
+
+ /*
+ * If we're splitting, record the original pool's guid.
+ */
+ if (spa->spa_config_splitting != NULL &&
+ nvlist_lookup_uint64(spa->spa_config_splitting,
+ ZPOOL_CONFIG_SPLIT_GUID, &split_guid) == 0) {
+ fnvlist_add_uint64(config, ZPOOL_CONFIG_SPLIT_GUID,
+ split_guid);
+ }
+
+ nvroot = vdev_config_generate(spa, vd, getstats, config_gen_flags);
+ fnvlist_add_nvlist(config, ZPOOL_CONFIG_VDEV_TREE, nvroot);
+ nvlist_free(nvroot);
+
+ /*
+ * Store what's necessary for reading the MOS in the label.
+ */
+ fnvlist_add_nvlist(config, ZPOOL_CONFIG_FEATURES_FOR_READ,
+ spa->spa_label_features);
+
+ if (getstats && spa_load_state(spa) == SPA_LOAD_NONE) {
+ ddt_histogram_t *ddh;
+ ddt_stat_t *dds;
+ ddt_object_t *ddo;
+
+ ddh = kmem_zalloc(sizeof (ddt_histogram_t), KM_SLEEP);
+ ddt_get_dedup_histogram(spa, ddh);
+ fnvlist_add_uint64_array(config,
+ ZPOOL_CONFIG_DDT_HISTOGRAM,
+ (uint64_t *)ddh, sizeof (*ddh) / sizeof (uint64_t));
+ kmem_free(ddh, sizeof (ddt_histogram_t));
+
+ ddo = kmem_zalloc(sizeof (ddt_object_t), KM_SLEEP);
+ ddt_get_dedup_object_stats(spa, ddo);
+ fnvlist_add_uint64_array(config,
+ ZPOOL_CONFIG_DDT_OBJ_STATS,
+ (uint64_t *)ddo, sizeof (*ddo) / sizeof (uint64_t));
+ kmem_free(ddo, sizeof (ddt_object_t));
+
+ dds = kmem_zalloc(sizeof (ddt_stat_t), KM_SLEEP);
+ ddt_get_dedup_stats(spa, dds);
+ fnvlist_add_uint64_array(config,
+ ZPOOL_CONFIG_DDT_STATS,
+ (uint64_t *)dds, sizeof (*dds) / sizeof (uint64_t));
+ kmem_free(dds, sizeof (ddt_stat_t));
+ }
+
+ if (locked)
+ spa_config_exit(spa, SCL_CONFIG | SCL_STATE, FTAG);
+
+ return (config);
+}
+
+/*
+ * Update all disk labels, generate a fresh config based on the current
+ * in-core state, and sync the global config cache (do not sync the config
+ * cache if this is a booting rootpool).
+ */
+void
+spa_config_update(spa_t *spa, int what)
+{
+ vdev_t *rvd = spa->spa_root_vdev;
+ uint64_t txg;
+ int c;
+
+ ASSERT(MUTEX_HELD(&spa_namespace_lock));
+
+ spa_config_enter(spa, SCL_ALL, FTAG, RW_WRITER);
+ txg = spa_last_synced_txg(spa) + 1;
+ if (what == SPA_CONFIG_UPDATE_POOL) {
+ vdev_config_dirty(rvd);
+ } else {
+ /*
+ * If we have top-level vdevs that were added but have
+ * not yet been prepared for allocation, do that now.
+ * (It's safe now because the config cache is up to date,
+ * so it will be able to translate the new DVAs.)
+ * See comments in spa_vdev_add() for full details.
+ */
+ for (c = 0; c < rvd->vdev_children; c++) {
+ vdev_t *tvd = rvd->vdev_child[c];
+
+ /*
+ * Explicitly skip vdevs that are indirect or
+ * log vdevs that are being removed. The reason
+ * is that both of those can have vdev_ms_array
+ * set to 0 and we wouldn't want to change their
+ * metaslab size nor call vdev_expand() on them.
+ */
+ if (!vdev_is_concrete(tvd) ||
+ (tvd->vdev_islog && tvd->vdev_removing))
+ continue;
+
+ if (tvd->vdev_ms_array == 0) {
+ vdev_ashift_optimize(tvd);
+ vdev_metaslab_set_size(tvd);
+ }
+ vdev_expand(tvd, txg);
+ }
+ }
+ spa_config_exit(spa, SCL_ALL, FTAG);
+
+ /*
+ * Wait for the mosconfig to be regenerated and synced.
+ */
+ txg_wait_synced(spa->spa_dsl_pool, txg);
+
+ /*
+ * Update the global config cache to reflect the new mosconfig.
+ */
+ spa_write_cachefile(spa, B_FALSE, what != SPA_CONFIG_UPDATE_POOL);
+
+ if (what == SPA_CONFIG_UPDATE_POOL)
+ spa_config_update(spa, SPA_CONFIG_UPDATE_VDEVS);
+}
diff --git a/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/spa_errlog.c b/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/spa_errlog.c
new file mode 100644
index 000000000000..8ce780537abb
--- /dev/null
+++ b/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/spa_errlog.c
@@ -0,0 +1,406 @@
+/*
+ * CDDL HEADER START
+ *
+ * The contents of this file are subject to the terms of the
+ * Common Development and Distribution License (the "License").
+ * You may not use this file except in compliance with the License.
+ *
+ * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
+ * or http://www.opensolaris.org/os/licensing.
+ * See the License for the specific language governing permissions
+ * and limitations under the License.
+ *
+ * When distributing Covered Code, include this CDDL HEADER in each
+ * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
+ * If applicable, add the following below this CDDL HEADER, with the
+ * fields enclosed by brackets "[]" replaced with your own identifying
+ * information: Portions Copyright [yyyy] [name of copyright owner]
+ *
+ * CDDL HEADER END
+ */
+/*
+ * Copyright (c) 2006, 2010, Oracle and/or its affiliates. All rights reserved.
+ * Copyright (c) 2013, 2014 by Delphix. All rights reserved.
+ */
+
+/*
+ * Routines to manage the on-disk persistent error log.
+ *
+ * Each pool stores a log of all logical data errors seen during normal
+ * operation. This is actually the union of two distinct logs: the last log,
+ * and the current log. All errors seen are logged to the current log. When a
+ * scrub completes, the old last log is thrown out, the current log becomes
+ * the last log, and a new current log is initialized. This way, if an error
+ * is somehow corrected, a new scrub will show that it no longer exists, and
+ * it will be deleted from the log when the scrub completes.
+ *
+ * The log is stored using a ZAP object whose key is a string form of the
+ * zbookmark_phys tuple (objset, object, level, blkid), and whose value is an
+ * optional 'objset:object' human-readable string describing the data. When an
+ * error is first logged, this string will be empty, indicating that no name is
+ * known. This prevents us from having to issue a potentially large amount of
+ * I/O to discover the object name during an error path. Instead, we do the
+ * calculation when the data is requested, storing the result so future queries
+ * will be faster.
+ *
+ * This log is then shipped into an nvlist where the key is the dataset name and
+ * the value is the object name. Userland is then responsible for uniquifying
+ * this list and displaying it to the user.
+ */
+
+#include <sys/dmu_tx.h>
+#include <sys/spa.h>
+#include <sys/spa_impl.h>
+#include <sys/zap.h>
+#include <sys/zio.h>
+
+
+/*
+ * Convert a bookmark to a string.
+ */
+static void
+bookmark_to_name(zbookmark_phys_t *zb, char *buf, size_t len)
+{
+ (void) snprintf(buf, len, "%llx:%llx:%llx:%llx",
+ (u_longlong_t)zb->zb_objset, (u_longlong_t)zb->zb_object,
+ (u_longlong_t)zb->zb_level, (u_longlong_t)zb->zb_blkid);
+}
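+
+/*
+ * For example (values illustrative), the bookmark { zb_objset = 0x21,
+ * zb_object = 0x5, zb_level = 0, zb_blkid = 0x1a } is rendered as the
+ * string "21:5:0:1a"; name_to_bookmark() below parses such a string
+ * back into the same tuple.
+ */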
+
+/*
+ * Convert a string to a bookmark
+ */
+#ifdef _KERNEL
+static void
+name_to_bookmark(char *buf, zbookmark_phys_t *zb)
+{
+ zb->zb_objset = zfs_strtonum(buf, &buf);
+ ASSERT(*buf == ':');
+ zb->zb_object = zfs_strtonum(buf + 1, &buf);
+ ASSERT(*buf == ':');
+ zb->zb_level = (int)zfs_strtonum(buf + 1, &buf);
+ ASSERT(*buf == ':');
+ zb->zb_blkid = zfs_strtonum(buf + 1, &buf);
+ ASSERT(*buf == '\0');
+}
+#endif
+
+/*
+ * Log an uncorrectable error to the persistent error log. We add it to the
+ * spa's list of pending errors. The changes are actually synced out to disk
+ * during spa_errlog_sync().
+ */
+void
+spa_log_error(spa_t *spa, zio_t *zio)
+{
+ zbookmark_phys_t *zb = &zio->io_logical->io_bookmark;
+ spa_error_entry_t search;
+ spa_error_entry_t *new;
+ avl_tree_t *tree;
+ avl_index_t where;
+
+ /*
+ * If we are trying to import a pool, ignore any errors, as we won't be
+ * writing to the pool any time soon.
+ */
+ if (spa_load_state(spa) == SPA_LOAD_TRYIMPORT)
+ return;
+
+ mutex_enter(&spa->spa_errlist_lock);
+
+ /*
+ * If we have had a request to rotate the log, log it to the next list
+ * instead of the current one.
+ */
+ if (spa->spa_scrub_active || spa->spa_scrub_finished)
+ tree = &spa->spa_errlist_scrub;
+ else
+ tree = &spa->spa_errlist_last;
+
+ search.se_bookmark = *zb;
+ if (avl_find(tree, &search, &where) != NULL) {
+ mutex_exit(&spa->spa_errlist_lock);
+ return;
+ }
+
+ new = kmem_zalloc(sizeof (spa_error_entry_t), KM_SLEEP);
+ new->se_bookmark = *zb;
+ avl_insert(tree, new, where);
+
+ mutex_exit(&spa->spa_errlist_lock);
+}
+
+/*
+ * Return the number of errors currently in the error log. This is actually the
+ * sum of both the last log and the current log, since we don't know the union
+ * of these logs until we reach userland.
+ */
+uint64_t
+spa_get_errlog_size(spa_t *spa)
+{
+ uint64_t total = 0, count;
+
+ mutex_enter(&spa->spa_errlog_lock);
+ if (spa->spa_errlog_scrub != 0 &&
+ zap_count(spa->spa_meta_objset, spa->spa_errlog_scrub,
+ &count) == 0)
+ total += count;
+
+ if (spa->spa_errlog_last != 0 && !spa->spa_scrub_finished &&
+ zap_count(spa->spa_meta_objset, spa->spa_errlog_last,
+ &count) == 0)
+ total += count;
+ mutex_exit(&spa->spa_errlog_lock);
+
+ mutex_enter(&spa->spa_errlist_lock);
+ total += avl_numnodes(&spa->spa_errlist_last);
+ total += avl_numnodes(&spa->spa_errlist_scrub);
+ mutex_exit(&spa->spa_errlist_lock);
+
+ return (total);
+}
+
+#ifdef _KERNEL
+static int
+process_error_log(spa_t *spa, uint64_t obj, void *addr, size_t *count)
+{
+ zap_cursor_t zc;
+ zap_attribute_t za;
+ zbookmark_phys_t zb;
+
+ if (obj == 0)
+ return (0);
+
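+	/*
+	 * The user buffer is filled from the end toward the beginning:
+	 * each bookmark is copied into slot (*count - 1) and *count is
+	 * then decremented, so on return *count is the number of unused
+	 * slots remaining at the front of the buffer.
+	 */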
+ for (zap_cursor_init(&zc, spa->spa_meta_objset, obj);
+ zap_cursor_retrieve(&zc, &za) == 0;
+ zap_cursor_advance(&zc)) {
+
+ if (*count == 0) {
+ zap_cursor_fini(&zc);
+ return (SET_ERROR(ENOMEM));
+ }
+
+ name_to_bookmark(za.za_name, &zb);
+
+ if (copyout(&zb, (char *)addr +
+ (*count - 1) * sizeof (zbookmark_phys_t),
+ sizeof (zbookmark_phys_t)) != 0) {
+ zap_cursor_fini(&zc);
+ return (SET_ERROR(EFAULT));
+ }
+
+ *count -= 1;
+ }
+
+ zap_cursor_fini(&zc);
+
+ return (0);
+}
+
+static int
+process_error_list(avl_tree_t *list, void *addr, size_t *count)
+{
+ spa_error_entry_t *se;
+
+ for (se = avl_first(list); se != NULL; se = AVL_NEXT(list, se)) {
+
+ if (*count == 0)
+ return (SET_ERROR(ENOMEM));
+
+ if (copyout(&se->se_bookmark, (char *)addr +
+ (*count - 1) * sizeof (zbookmark_phys_t),
+ sizeof (zbookmark_phys_t)) != 0)
+ return (SET_ERROR(EFAULT));
+
+ *count -= 1;
+ }
+
+ return (0);
+}
+#endif
+
+/*
+ * Copy all known errors to userland as an array of bookmarks. This is
+ * actually a union of the on-disk last log and current log, as well as any
+ * pending error requests.
+ *
+ * Because the act of reading the on-disk log could cause errors to be
+ * generated, we have two separate locks: one for the error log and one for the
+ * in-core error lists. We only need the error list lock to log an error, so
+ * we grab the error log lock while we read the on-disk logs, and only pick up
+ * the error list lock when we are finished.
+ */
+int
+spa_get_errlog(spa_t *spa, void *uaddr, size_t *count)
+{
+ int ret = 0;
+
+#ifdef _KERNEL
+ mutex_enter(&spa->spa_errlog_lock);
+
+ ret = process_error_log(spa, spa->spa_errlog_scrub, uaddr, count);
+
+ if (!ret && !spa->spa_scrub_finished)
+ ret = process_error_log(spa, spa->spa_errlog_last, uaddr,
+ count);
+
+ mutex_enter(&spa->spa_errlist_lock);
+ if (!ret)
+ ret = process_error_list(&spa->spa_errlist_scrub, uaddr,
+ count);
+ if (!ret)
+ ret = process_error_list(&spa->spa_errlist_last, uaddr,
+ count);
+ mutex_exit(&spa->spa_errlist_lock);
+
+ mutex_exit(&spa->spa_errlog_lock);
+#endif
+
+ return (ret);
+}
+
+/*
+ * Called when a scrub completes. This simply sets a bit which tells us which
+ * AVL tree to add new errors to. spa_errlog_sync() is responsible for actually
+ * syncing the changes to the underlying objects.
+ */
+void
+spa_errlog_rotate(spa_t *spa)
+{
+ mutex_enter(&spa->spa_errlist_lock);
+ spa->spa_scrub_finished = B_TRUE;
+ mutex_exit(&spa->spa_errlist_lock);
+}
+
+/*
+ * Discard any pending errors from the spa_t. Called when unloading a faulted
+ * pool, as the errors encountered during the open cannot be synced to disk.
+ */
+void
+spa_errlog_drain(spa_t *spa)
+{
+ spa_error_entry_t *se;
+ void *cookie;
+
+ mutex_enter(&spa->spa_errlist_lock);
+
+ cookie = NULL;
+ while ((se = avl_destroy_nodes(&spa->spa_errlist_last,
+ &cookie)) != NULL)
+ kmem_free(se, sizeof (spa_error_entry_t));
+ cookie = NULL;
+ while ((se = avl_destroy_nodes(&spa->spa_errlist_scrub,
+ &cookie)) != NULL)
+ kmem_free(se, sizeof (spa_error_entry_t));
+
+ mutex_exit(&spa->spa_errlist_lock);
+}
+
+/*
+ * Process a list of errors into the current on-disk log.
+ */
+static void
+sync_error_list(spa_t *spa, avl_tree_t *t, uint64_t *obj, dmu_tx_t *tx)
+{
+ spa_error_entry_t *se;
+ char buf[64];
+ void *cookie;
+
+ if (avl_numnodes(t) != 0) {
+ /* create log if necessary */
+ if (*obj == 0)
+ *obj = zap_create(spa->spa_meta_objset,
+ DMU_OT_ERROR_LOG, DMU_OT_NONE,
+ 0, tx);
+
+ /* add errors to the current log */
+ for (se = avl_first(t); se != NULL; se = AVL_NEXT(t, se)) {
+ char *name = se->se_name ? se->se_name : "";
+
+ bookmark_to_name(&se->se_bookmark, buf, sizeof (buf));
+
+ (void) zap_update(spa->spa_meta_objset,
+ *obj, buf, 1, strlen(name) + 1, name, tx);
+ }
+
+ /* purge the error list */
+ cookie = NULL;
+ while ((se = avl_destroy_nodes(t, &cookie)) != NULL)
+ kmem_free(se, sizeof (spa_error_entry_t));
+ }
+}
+
+/*
+ * Sync the error log out to disk. This is a little tricky because the act of
+ * writing the error log requires the spa_errlist_lock. So, we need to lock the
+ * error lists, take a copy of the lists, and then reinitialize them. Then, we
+ * drop the error list lock and take the error log lock, at which point we
+ * do the errlog processing. Then, if we encounter an I/O error during this
+ * process, we can successfully add the error to the list. Note that this will
+ * result in the perpetual recycling of errors, but it is an unlikely situation
+ * and not a performance critical operation.
+ */
+void
+spa_errlog_sync(spa_t *spa, uint64_t txg)
+{
+ dmu_tx_t *tx;
+ avl_tree_t scrub, last;
+ int scrub_finished;
+
+ mutex_enter(&spa->spa_errlist_lock);
+
+ /*
+ * Bail out early under normal circumstances.
+ */
+ if (avl_numnodes(&spa->spa_errlist_scrub) == 0 &&
+ avl_numnodes(&spa->spa_errlist_last) == 0 &&
+ !spa->spa_scrub_finished) {
+ mutex_exit(&spa->spa_errlist_lock);
+ return;
+ }
+
+ spa_get_errlists(spa, &last, &scrub);
+ scrub_finished = spa->spa_scrub_finished;
+ spa->spa_scrub_finished = B_FALSE;
+
+ mutex_exit(&spa->spa_errlist_lock);
+ mutex_enter(&spa->spa_errlog_lock);
+
+ tx = dmu_tx_create_assigned(spa->spa_dsl_pool, txg);
+
+ /*
+ * Sync out the current list of errors.
+ */
+ sync_error_list(spa, &last, &spa->spa_errlog_last, tx);
+
+ /*
+ * Rotate the log if necessary.
+ */
+ if (scrub_finished) {
+ if (spa->spa_errlog_last != 0)
+			VERIFY0(dmu_object_free(spa->spa_meta_objset,
+			    spa->spa_errlog_last, tx));
+ spa->spa_errlog_last = spa->spa_errlog_scrub;
+ spa->spa_errlog_scrub = 0;
+
+ sync_error_list(spa, &scrub, &spa->spa_errlog_last, tx);
+ }
+
+ /*
+ * Sync out any pending scrub errors.
+ */
+ sync_error_list(spa, &scrub, &spa->spa_errlog_scrub, tx);
+
+ /*
+ * Update the MOS to reflect the new values.
+ */
+ (void) zap_update(spa->spa_meta_objset, DMU_POOL_DIRECTORY_OBJECT,
+ DMU_POOL_ERRLOG_LAST, sizeof (uint64_t), 1,
+ &spa->spa_errlog_last, tx);
+ (void) zap_update(spa->spa_meta_objset, DMU_POOL_DIRECTORY_OBJECT,
+ DMU_POOL_ERRLOG_SCRUB, sizeof (uint64_t), 1,
+ &spa->spa_errlog_scrub, tx);
+
+ dmu_tx_commit(tx);
+
+ mutex_exit(&spa->spa_errlog_lock);
+}
diff --git a/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/spa_history.c b/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/spa_history.c
new file mode 100644
index 000000000000..4b080fc48cdf
--- /dev/null
+++ b/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/spa_history.c
@@ -0,0 +1,628 @@
+/*
+ * CDDL HEADER START
+ *
+ * The contents of this file are subject to the terms of the
+ * Common Development and Distribution License (the "License").
+ * You may not use this file except in compliance with the License.
+ *
+ * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
+ * or http://www.opensolaris.org/os/licensing.
+ * See the License for the specific language governing permissions
+ * and limitations under the License.
+ *
+ * When distributing Covered Code, include this CDDL HEADER in each
+ * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
+ * If applicable, add the following below this CDDL HEADER, with the
+ * fields enclosed by brackets "[]" replaced with your own identifying
+ * information: Portions Copyright [yyyy] [name of copyright owner]
+ *
+ * CDDL HEADER END
+ */
+
+/*
+ * Copyright (c) 2006, 2010, Oracle and/or its affiliates. All rights reserved.
+ * Copyright (c) 2011, 2015 by Delphix. All rights reserved.
+ * Copyright (c) 2014 Integros [integros.com]
+ * Copyright 2017 Joyent, Inc.
+ */
+
+#include <sys/spa.h>
+#include <sys/spa_impl.h>
+#include <sys/zap.h>
+#include <sys/dsl_synctask.h>
+#include <sys/dmu_tx.h>
+#include <sys/dmu_objset.h>
+#include <sys/dsl_dataset.h>
+#include <sys/dsl_dir.h>
+#include <sys/utsname.h>
+#include <sys/sunddi.h>
+#include <sys/cred.h>
+#include "zfs_comutil.h"
+#ifdef _KERNEL
+#include <sys/cmn_err.h>
+#include <sys/zone.h>
+#endif
+
+/*
+ * Routines to manage the on-disk history log.
+ *
+ * The history log is stored as a dmu object containing
+ * <packed record length, record nvlist> tuples.
+ *
+ * Where "record nvlist" is a nvlist containing uint64_ts and strings, and
+ * "packed record length" is the packed length of the "record nvlist" stored
+ * as a little endian uint64_t.
+ *
+ * The log is implemented as a ring buffer, though the original creation
+ * of the pool ('zpool create') is never overwritten.
+ *
+ * The history log is tracked as object 'spa_t::spa_history'. The bonus buffer
+ * of 'spa_history' stores the offsets for logging/retrieving history as
+ * 'spa_history_phys_t'. 'sh_pool_create_len' is the ending offset in bytes of
+ * where the 'zpool create' record is stored. This allows us to never
+ * overwrite the original creation of the pool. 'sh_phys_max_off' is the
+ * physical ending offset in bytes of the log. This tells you the length of
+ * the buffer. 'sh_eof' is the logical EOF (in bytes). Whenever a record
+ * is added, 'sh_eof' is incremented by the size of the record.
+ * 'sh_eof' is never decremented. 'sh_bof' is the logical BOF (in bytes).
+ * This is where the consumer should start reading from after reading in
+ * the 'zpool create' portion of the log.
+ *
+ * 'sh_records_lost' keeps track of how many records have been overwritten
+ * and permanently lost.
+ */
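+
+/*
+ * A sketch of the record layout implied above (field widths not to
+ * scale):
+ *
+ *   +--------------------------+---------------------------------+
+ *   | record length (LE 64bit) | packed record nvlist (length B) |
+ *   +--------------------------+---------------------------------+
+ */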
+
+/* convert a logical offset to physical */
+static uint64_t
+spa_history_log_to_phys(uint64_t log_off, spa_history_phys_t *shpp)
+{
+ uint64_t phys_len;
+
+ phys_len = shpp->sh_phys_max_off - shpp->sh_pool_create_len;
+ return ((log_off - shpp->sh_pool_create_len) % phys_len
+ + shpp->sh_pool_create_len);
+}
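+
+/*
+ * A worked example with illustrative numbers: if sh_pool_create_len is
+ * 1000 and sh_phys_max_off is 5000, the ring portion is 4000 bytes, and
+ * a logical offset of 5200 maps to ((5200 - 1000) % 4000) + 1000 = 1200,
+ * i.e. the log wraps back to just past the preserved 'zpool create'
+ * record.
+ */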
+
+void
+spa_history_create_obj(spa_t *spa, dmu_tx_t *tx)
+{
+ dmu_buf_t *dbp;
+ spa_history_phys_t *shpp;
+ objset_t *mos = spa->spa_meta_objset;
+
+ ASSERT(spa->spa_history == 0);
+ spa->spa_history = dmu_object_alloc(mos, DMU_OT_SPA_HISTORY,
+ SPA_OLD_MAXBLOCKSIZE, DMU_OT_SPA_HISTORY_OFFSETS,
+ sizeof (spa_history_phys_t), tx);
+
+	VERIFY0(zap_add(mos, DMU_POOL_DIRECTORY_OBJECT,
+	    DMU_POOL_HISTORY, sizeof (uint64_t), 1,
+	    &spa->spa_history, tx));
+
+	VERIFY0(dmu_bonus_hold(mos, spa->spa_history, FTAG, &dbp));
+ ASSERT(dbp->db_size >= sizeof (spa_history_phys_t));
+
+ shpp = dbp->db_data;
+ dmu_buf_will_dirty(dbp, tx);
+
+ /*
+ * Figure out maximum size of history log. We set it at
+ * 0.1% of pool size, with a max of 1G and min of 128KB.
+ */
+ shpp->sh_phys_max_off =
+ metaslab_class_get_dspace(spa_normal_class(spa)) / 1000;
+ shpp->sh_phys_max_off = MIN(shpp->sh_phys_max_off, 1<<30);
+ shpp->sh_phys_max_off = MAX(shpp->sh_phys_max_off, 128<<10);
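+	/*
+	 * For example (hypothetical pool sizes): a 50GB pool gets a ~50MB
+	 * log, a 64MB pool is clamped up to the 128KB minimum, and pools
+	 * larger than 1TB are clamped down to the 1GB maximum.
+	 */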
+
+ dmu_buf_rele(dbp, FTAG);
+}
+
+/*
+ * Change 'sh_bof' to the beginning of the next record.
+ */
+static int
+spa_history_advance_bof(spa_t *spa, spa_history_phys_t *shpp)
+{
+ objset_t *mos = spa->spa_meta_objset;
+ uint64_t firstread, reclen, phys_bof;
+ char buf[sizeof (reclen)];
+ int err;
+
+ phys_bof = spa_history_log_to_phys(shpp->sh_bof, shpp);
+ firstread = MIN(sizeof (reclen), shpp->sh_phys_max_off - phys_bof);
+
+ if ((err = dmu_read(mos, spa->spa_history, phys_bof, firstread,
+ buf, DMU_READ_PREFETCH)) != 0)
+ return (err);
+ if (firstread != sizeof (reclen)) {
+ if ((err = dmu_read(mos, spa->spa_history,
+ shpp->sh_pool_create_len, sizeof (reclen) - firstread,
+ buf + firstread, DMU_READ_PREFETCH)) != 0)
+ return (err);
+ }
+
+ reclen = LE_64(*((uint64_t *)buf));
+ shpp->sh_bof += reclen + sizeof (reclen);
+ shpp->sh_records_lost++;
+ return (0);
+}
+
+static int
+spa_history_write(spa_t *spa, void *buf, uint64_t len, spa_history_phys_t *shpp,
+ dmu_tx_t *tx)
+{
+ uint64_t firstwrite, phys_eof;
+ objset_t *mos = spa->spa_meta_objset;
+ int err;
+
+ ASSERT(MUTEX_HELD(&spa->spa_history_lock));
+
+ /* see if we need to reset logical BOF */
+ while (shpp->sh_phys_max_off - shpp->sh_pool_create_len -
+ (shpp->sh_eof - shpp->sh_bof) <= len) {
+ if ((err = spa_history_advance_bof(spa, shpp)) != 0) {
+ return (err);
+ }
+ }
+
+ phys_eof = spa_history_log_to_phys(shpp->sh_eof, shpp);
+ firstwrite = MIN(len, shpp->sh_phys_max_off - phys_eof);
+ shpp->sh_eof += len;
+ dmu_write(mos, spa->spa_history, phys_eof, firstwrite, buf, tx);
+
+ len -= firstwrite;
+ if (len > 0) {
+ /* write out the rest at the beginning of physical file */
+ dmu_write(mos, spa->spa_history, shpp->sh_pool_create_len,
+ len, (char *)buf + firstwrite, tx);
+ }
+
+ return (0);
+}
+
+static char *
+spa_history_zone(void)
+{
+#ifdef _KERNEL
+ /* XXX: pr_hostname can be changed by default from within a jail! */
+ if (jailed(curthread->td_ucred))
+ return (curthread->td_ucred->cr_prison->pr_hostname);
+#endif
+ return (NULL);
+}
+
+/*
+ * Post a history sysevent.
+ *
+ * The nvlist_t* passed into this function will be transformed into a new
+ * nvlist where:
+ *
+ * 1. Nested nvlists will be flattened to a single level
+ * 2. Keys will have their names normalized (to remove any problematic
+ * characters, such as whitespace)
+ *
+ * The nvlist_t passed into this function will be duplicated and should be
+ * freed by the caller.
+ */
+static void
+spa_history_log_notify(spa_t *spa, nvlist_t *nvl)
+{
+ nvlist_t *hist_nvl = fnvlist_alloc();
+ uint64_t uint64;
+ char *string;
+
+ if (nvlist_lookup_string(nvl, ZPOOL_HIST_CMD, &string) == 0)
+ fnvlist_add_string(hist_nvl, ZFS_EV_HIST_CMD, string);
+
+ if (nvlist_lookup_string(nvl, ZPOOL_HIST_INT_NAME, &string) == 0)
+ fnvlist_add_string(hist_nvl, ZFS_EV_HIST_INT_NAME, string);
+
+ if (nvlist_lookup_string(nvl, ZPOOL_HIST_ZONE, &string) == 0)
+ fnvlist_add_string(hist_nvl, ZFS_EV_HIST_ZONE, string);
+
+ if (nvlist_lookup_string(nvl, ZPOOL_HIST_HOST, &string) == 0)
+ fnvlist_add_string(hist_nvl, ZFS_EV_HIST_HOST, string);
+
+ if (nvlist_lookup_string(nvl, ZPOOL_HIST_DSNAME, &string) == 0)
+ fnvlist_add_string(hist_nvl, ZFS_EV_HIST_DSNAME, string);
+
+ if (nvlist_lookup_string(nvl, ZPOOL_HIST_INT_STR, &string) == 0)
+ fnvlist_add_string(hist_nvl, ZFS_EV_HIST_INT_STR, string);
+
+ if (nvlist_lookup_string(nvl, ZPOOL_HIST_IOCTL, &string) == 0)
+ fnvlist_add_string(hist_nvl, ZFS_EV_HIST_IOCTL, string);
+
+ if (nvlist_lookup_uint64(nvl, ZPOOL_HIST_DSID, &uint64) == 0)
+ fnvlist_add_uint64(hist_nvl, ZFS_EV_HIST_DSID, uint64);
+
+ if (nvlist_lookup_uint64(nvl, ZPOOL_HIST_TXG, &uint64) == 0)
+ fnvlist_add_uint64(hist_nvl, ZFS_EV_HIST_TXG, uint64);
+
+ if (nvlist_lookup_uint64(nvl, ZPOOL_HIST_TIME, &uint64) == 0)
+ fnvlist_add_uint64(hist_nvl, ZFS_EV_HIST_TIME, uint64);
+
+ if (nvlist_lookup_uint64(nvl, ZPOOL_HIST_WHO, &uint64) == 0)
+ fnvlist_add_uint64(hist_nvl, ZFS_EV_HIST_WHO, uint64);
+
+ if (nvlist_lookup_uint64(nvl, ZPOOL_HIST_INT_EVENT, &uint64) == 0)
+ fnvlist_add_uint64(hist_nvl, ZFS_EV_HIST_INT_EVENT, uint64);
+
+ spa_event_notify(spa, NULL, hist_nvl, ESC_ZFS_HISTORY_EVENT);
+
+ nvlist_free(hist_nvl);
+}
+
+/*
+ * Write out a history event.
+ */
+/*ARGSUSED*/
+static void
+spa_history_log_sync(void *arg, dmu_tx_t *tx)
+{
+ nvlist_t *nvl = arg;
+ spa_t *spa = dmu_tx_pool(tx)->dp_spa;
+ objset_t *mos = spa->spa_meta_objset;
+ dmu_buf_t *dbp;
+ spa_history_phys_t *shpp;
+ size_t reclen;
+ uint64_t le_len;
+ char *record_packed = NULL;
+ int ret;
+
+ /*
+ * If we have an older pool that doesn't have a command
+ * history object, create it now.
+ */
+ mutex_enter(&spa->spa_history_lock);
+ if (!spa->spa_history)
+ spa_history_create_obj(spa, tx);
+ mutex_exit(&spa->spa_history_lock);
+
+ /*
+ * Get the offset of where we need to write via the bonus buffer.
+ * Update the offset when the write completes.
+ */
+ VERIFY0(dmu_bonus_hold(mos, spa->spa_history, FTAG, &dbp));
+ shpp = dbp->db_data;
+
+ dmu_buf_will_dirty(dbp, tx);
+
+#ifdef ZFS_DEBUG
+ {
+ dmu_object_info_t doi;
+ dmu_object_info_from_db(dbp, &doi);
+ ASSERT3U(doi.doi_bonus_type, ==, DMU_OT_SPA_HISTORY_OFFSETS);
+ }
+#endif
+
+ fnvlist_add_uint64(nvl, ZPOOL_HIST_TIME, gethrestime_sec());
+#ifdef _KERNEL
+ fnvlist_add_string(nvl, ZPOOL_HIST_HOST, utsname.nodename);
+#endif
+ if (nvlist_exists(nvl, ZPOOL_HIST_CMD)) {
+ zfs_dbgmsg("command: %s",
+ fnvlist_lookup_string(nvl, ZPOOL_HIST_CMD));
+ } else if (nvlist_exists(nvl, ZPOOL_HIST_INT_NAME)) {
+ if (nvlist_exists(nvl, ZPOOL_HIST_DSNAME)) {
+ zfs_dbgmsg("txg %lld %s %s (id %llu) %s",
+ fnvlist_lookup_uint64(nvl, ZPOOL_HIST_TXG),
+ fnvlist_lookup_string(nvl, ZPOOL_HIST_INT_NAME),
+ fnvlist_lookup_string(nvl, ZPOOL_HIST_DSNAME),
+ fnvlist_lookup_uint64(nvl, ZPOOL_HIST_DSID),
+ fnvlist_lookup_string(nvl, ZPOOL_HIST_INT_STR));
+ } else {
+ zfs_dbgmsg("txg %lld %s %s",
+ fnvlist_lookup_uint64(nvl, ZPOOL_HIST_TXG),
+ fnvlist_lookup_string(nvl, ZPOOL_HIST_INT_NAME),
+ fnvlist_lookup_string(nvl, ZPOOL_HIST_INT_STR));
+ }
+ /*
+ * The history sysevent is posted only for internal history
+ * messages to show what has happened, not how it happened. For
+ * example, the following command:
+ *
+ * # zfs destroy -r tank/foo
+ *
+ * will result in one sysevent posted per dataset that is
+ * destroyed as a result of the command - which could be more
+ * than one event in total. By contrast, if the sysevent was
+ * posted as a result of the ZPOOL_HIST_CMD key being present
+ * it would result in only one sysevent being posted with the
+ * full command line arguments, requiring the consumer to know
+ * how to parse and understand zfs(1M) command invocations.
+ */
+ spa_history_log_notify(spa, nvl);
+ } else if (nvlist_exists(nvl, ZPOOL_HIST_IOCTL)) {
+ zfs_dbgmsg("ioctl %s",
+ fnvlist_lookup_string(nvl, ZPOOL_HIST_IOCTL));
+ }
+
+ record_packed = fnvlist_pack(nvl, &reclen);
+
+ mutex_enter(&spa->spa_history_lock);
+
+ /* write out the packed length as little endian */
+ le_len = LE_64((uint64_t)reclen);
+ ret = spa_history_write(spa, &le_len, sizeof (le_len), shpp, tx);
+ if (!ret)
+ ret = spa_history_write(spa, record_packed, reclen, shpp, tx);
+
+ /* The first command is the create, which we keep forever */
+ if (ret == 0 && shpp->sh_pool_create_len == 0 &&
+ nvlist_exists(nvl, ZPOOL_HIST_CMD)) {
+ shpp->sh_pool_create_len = shpp->sh_bof = shpp->sh_eof;
+ }
+
+ mutex_exit(&spa->spa_history_lock);
+ fnvlist_pack_free(record_packed, reclen);
+ dmu_buf_rele(dbp, FTAG);
+ fnvlist_free(nvl);
+}
+
+/*
+ * Write out a history event.
+ */
+int
+spa_history_log(spa_t *spa, const char *msg)
+{
+ int err;
+ nvlist_t *nvl = fnvlist_alloc();
+
+ fnvlist_add_string(nvl, ZPOOL_HIST_CMD, msg);
+ err = spa_history_log_nvl(spa, nvl);
+ fnvlist_free(nvl);
+ return (err);
+}
+
+int
+spa_history_log_nvl(spa_t *spa, nvlist_t *nvl)
+{
+ int err = 0;
+ dmu_tx_t *tx;
+ nvlist_t *nvarg;
+
+ if (spa_version(spa) < SPA_VERSION_ZPOOL_HISTORY || !spa_writeable(spa))
+ return (SET_ERROR(EINVAL));
+
+ tx = dmu_tx_create_dd(spa_get_dsl(spa)->dp_mos_dir);
+ err = dmu_tx_assign(tx, TXG_WAIT);
+ if (err) {
+ dmu_tx_abort(tx);
+ return (err);
+ }
+
+ nvarg = fnvlist_dup(nvl);
+ if (spa_history_zone() != NULL) {
+ fnvlist_add_string(nvarg, ZPOOL_HIST_ZONE,
+ spa_history_zone());
+ }
+ fnvlist_add_uint64(nvarg, ZPOOL_HIST_WHO, crgetruid(CRED()));
+
+ /* Kick this off asynchronously; errors are ignored. */
+ dsl_sync_task_nowait(spa_get_dsl(spa), spa_history_log_sync,
+ nvarg, 0, ZFS_SPACE_CHECK_NONE, tx);
+ dmu_tx_commit(tx);
+
+ /* spa_history_log_sync will free nvl */
+ return (err);
+}
+
+/*
+ * Read out the command history.
+ */
+int
+spa_history_get(spa_t *spa, uint64_t *offp, uint64_t *len, char *buf)
+{
+ objset_t *mos = spa->spa_meta_objset;
+ dmu_buf_t *dbp;
+ uint64_t read_len, phys_read_off, phys_eof;
+ uint64_t leftover = 0;
+ spa_history_phys_t *shpp;
+ int err;
+
+ /*
+ * If the command history doesn't exist (older pool),
+ * that's ok, just return ENOENT.
+ */
+ if (!spa->spa_history)
+ return (SET_ERROR(ENOENT));
+
+ /*
+ * The history is logged asynchronously, so when they request
+ * the first chunk of history, make sure everything has been
+ * synced to disk so that we get it.
+ */
+ if (*offp == 0 && spa_writeable(spa))
+ txg_wait_synced(spa_get_dsl(spa), 0);
+
+ if ((err = dmu_bonus_hold(mos, spa->spa_history, FTAG, &dbp)) != 0)
+ return (err);
+ shpp = dbp->db_data;
+
+#ifdef ZFS_DEBUG
+ {
+ dmu_object_info_t doi;
+ dmu_object_info_from_db(dbp, &doi);
+ ASSERT3U(doi.doi_bonus_type, ==, DMU_OT_SPA_HISTORY_OFFSETS);
+ }
+#endif
+
+ mutex_enter(&spa->spa_history_lock);
+ phys_eof = spa_history_log_to_phys(shpp->sh_eof, shpp);
+
+ if (*offp < shpp->sh_pool_create_len) {
+ /* read in just the zpool create history */
+ phys_read_off = *offp;
+ read_len = MIN(*len, shpp->sh_pool_create_len -
+ phys_read_off);
+ } else {
+ /*
+		 * Need to reset the passed-in offset to the BOF if it has
+		 * since been overwritten.
+ */
+ *offp = MAX(*offp, shpp->sh_bof);
+ phys_read_off = spa_history_log_to_phys(*offp, shpp);
+
+ /*
+ * Read up to the minimum of what the user passed down or
+ * the EOF (physical or logical). If we hit physical EOF,
+ * use 'leftover' to read from the physical BOF.
+ */
+ if (phys_read_off <= phys_eof) {
+ read_len = MIN(*len, phys_eof - phys_read_off);
+ } else {
+ read_len = MIN(*len,
+ shpp->sh_phys_max_off - phys_read_off);
+ if (phys_read_off + *len > shpp->sh_phys_max_off) {
+ leftover = MIN(*len - read_len,
+ phys_eof - shpp->sh_pool_create_len);
+ }
+ }
+ }
+
+ /* offset for consumer to use next */
+ *offp += read_len + leftover;
+
+ /* tell the consumer how much you actually read */
+ *len = read_len + leftover;
+
+ if (read_len == 0) {
+ mutex_exit(&spa->spa_history_lock);
+ dmu_buf_rele(dbp, FTAG);
+ return (0);
+ }
+
+ err = dmu_read(mos, spa->spa_history, phys_read_off, read_len, buf,
+ DMU_READ_PREFETCH);
+ if (leftover && err == 0) {
+ err = dmu_read(mos, spa->spa_history, shpp->sh_pool_create_len,
+ leftover, buf + read_len, DMU_READ_PREFETCH);
+ }
+ mutex_exit(&spa->spa_history_lock);
+
+ dmu_buf_rele(dbp, FTAG);
+ return (err);
+}
+
+/*
+ * The nvlist will be consumed by this call.
+ */
+static void
+log_internal(nvlist_t *nvl, const char *operation, spa_t *spa,
+ dmu_tx_t *tx, const char *fmt, va_list adx)
+{
+ char *msg;
+ va_list adx2;
+
+ /*
+ * If this is part of creating a pool, not everything is
+ * initialized yet, so don't bother logging the internal events.
+ * Likewise if the pool is not writeable.
+ */
+ if (tx->tx_txg == TXG_INITIAL || !spa_writeable(spa)) {
+ fnvlist_free(nvl);
+ return;
+ }
+
+ va_copy(adx2, adx);
+
+ msg = kmem_alloc(vsnprintf(NULL, 0, fmt, adx) + 1, KM_SLEEP);
+ (void) vsprintf(msg, fmt, adx2);
+ fnvlist_add_string(nvl, ZPOOL_HIST_INT_STR, msg);
+ strfree(msg);
+
+ va_end(adx2);
+
+ fnvlist_add_string(nvl, ZPOOL_HIST_INT_NAME, operation);
+ fnvlist_add_uint64(nvl, ZPOOL_HIST_TXG, tx->tx_txg);
+
+ if (dmu_tx_is_syncing(tx)) {
+ spa_history_log_sync(nvl, tx);
+ } else {
+ dsl_sync_task_nowait(spa_get_dsl(spa),
+ spa_history_log_sync, nvl, 0, ZFS_SPACE_CHECK_NONE, tx);
+ }
+ /* spa_history_log_sync() will free nvl */
+}
+
+void
+spa_history_log_internal(spa_t *spa, const char *operation,
+ dmu_tx_t *tx, const char *fmt, ...)
+{
+ dmu_tx_t *htx = tx;
+ va_list adx;
+
+ /* create a tx if we didn't get one */
+ if (tx == NULL) {
+ htx = dmu_tx_create_dd(spa_get_dsl(spa)->dp_mos_dir);
+ if (dmu_tx_assign(htx, TXG_WAIT) != 0) {
+ dmu_tx_abort(htx);
+ return;
+ }
+ }
+
+ va_start(adx, fmt);
+ log_internal(fnvlist_alloc(), operation, spa, htx, fmt, adx);
+ va_end(adx);
+
+ /* if we didn't get a tx from the caller, commit the one we made */
+ if (tx == NULL)
+ dmu_tx_commit(htx);
+}
+
+void
+spa_history_log_internal_ds(dsl_dataset_t *ds, const char *operation,
+ dmu_tx_t *tx, const char *fmt, ...)
+{
+ va_list adx;
+ char namebuf[ZFS_MAX_DATASET_NAME_LEN];
+ nvlist_t *nvl = fnvlist_alloc();
+
+ ASSERT(tx != NULL);
+
+ dsl_dataset_name(ds, namebuf);
+ fnvlist_add_string(nvl, ZPOOL_HIST_DSNAME, namebuf);
+ fnvlist_add_uint64(nvl, ZPOOL_HIST_DSID, ds->ds_object);
+
+ va_start(adx, fmt);
+ log_internal(nvl, operation, dsl_dataset_get_spa(ds), tx, fmt, adx);
+ va_end(adx);
+}
+
+void
+spa_history_log_internal_dd(dsl_dir_t *dd, const char *operation,
+ dmu_tx_t *tx, const char *fmt, ...)
+{
+ va_list adx;
+ char namebuf[ZFS_MAX_DATASET_NAME_LEN];
+ nvlist_t *nvl = fnvlist_alloc();
+
+ ASSERT(tx != NULL);
+
+ dsl_dir_name(dd, namebuf);
+ fnvlist_add_string(nvl, ZPOOL_HIST_DSNAME, namebuf);
+ fnvlist_add_uint64(nvl, ZPOOL_HIST_DSID,
+ dsl_dir_phys(dd)->dd_head_dataset_obj);
+
+ va_start(adx, fmt);
+ log_internal(nvl, operation, dd->dd_pool->dp_spa, tx, fmt, adx);
+ va_end(adx);
+}
+
+void
+spa_history_log_version(spa_t *spa, const char *operation)
+{
+ spa_history_log_internal(spa, operation, NULL,
+ "pool version %llu; software version %llu/%llu; uts %s %s %s %s",
+ (u_longlong_t)spa_version(spa), SPA_VERSION, ZPL_VERSION,
+ utsname.nodename, utsname.release, utsname.version,
+ utsname.machine);
+}
diff --git a/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/spa_misc.c b/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/spa_misc.c
new file mode 100644
index 000000000000..0706767a9d1f
--- /dev/null
+++ b/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/spa_misc.c
@@ -0,0 +1,2523 @@
+/*
+ * CDDL HEADER START
+ *
+ * The contents of this file are subject to the terms of the
+ * Common Development and Distribution License (the "License").
+ * You may not use this file except in compliance with the License.
+ *
+ * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
+ * or http://www.opensolaris.org/os/licensing.
+ * See the License for the specific language governing permissions
+ * and limitations under the License.
+ *
+ * When distributing Covered Code, include this CDDL HEADER in each
+ * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
+ * If applicable, add the following below this CDDL HEADER, with the
+ * fields enclosed by brackets "[]" replaced with your own identifying
+ * information: Portions Copyright [yyyy] [name of copyright owner]
+ *
+ * CDDL HEADER END
+ */
+/*
+ * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved.
+ * Copyright (c) 2011, 2018 by Delphix. All rights reserved.
+ * Copyright 2015 Nexenta Systems, Inc. All rights reserved.
+ * Copyright 2013 Martin Matuska <mm@FreeBSD.org>. All rights reserved.
+ * Copyright (c) 2014 Spectra Logic Corporation, All rights reserved.
+ * Copyright 2013 Saso Kiselkov. All rights reserved.
+ * Copyright (c) 2014 Integros [integros.com]
+ * Copyright (c) 2017 Datto Inc.
+ * Copyright (c) 2017, Intel Corporation.
+ */
+
+#include <sys/zfs_context.h>
+#include <sys/spa_impl.h>
+#include <sys/spa_boot.h>
+#include <sys/zio.h>
+#include <sys/zio_checksum.h>
+#include <sys/zio_compress.h>
+#include <sys/dmu.h>
+#include <sys/dmu_tx.h>
+#include <sys/zap.h>
+#include <sys/zil.h>
+#include <sys/vdev_impl.h>
+#include <sys/vdev_file.h>
+#include <sys/vdev_initialize.h>
+#include <sys/metaslab.h>
+#include <sys/uberblock_impl.h>
+#include <sys/txg.h>
+#include <sys/avl.h>
+#include <sys/unique.h>
+#include <sys/dsl_pool.h>
+#include <sys/dsl_dir.h>
+#include <sys/dsl_prop.h>
+#include <sys/dsl_scan.h>
+#include <sys/fs/zfs.h>
+#include <sys/metaslab_impl.h>
+#include <sys/arc.h>
+#include <sys/ddt.h>
+#include "zfs_prop.h"
+#include <sys/zfeature.h>
+
+#if defined(__FreeBSD__) && defined(_KERNEL)
+#include <sys/types.h>
+#include <sys/sysctl.h>
+#endif
+
+/*
+ * SPA locking
+ *
+ * There are four basic locks for managing spa_t structures:
+ *
+ * spa_namespace_lock (global mutex)
+ *
+ * This lock must be acquired to do any of the following:
+ *
+ * - Lookup a spa_t by name
+ * - Add or remove a spa_t from the namespace
+ * - Increase spa_refcount from non-zero
+ * - Check if spa_refcount is zero
+ * - Rename a spa_t
+ * - add/remove/attach/detach devices
+ * - Held for the duration of create/destroy/import/export
+ *
+ * It does not need to handle recursion. A create or destroy may
+ * reference objects (files or zvols) in other pools, but by
+ * definition they must have an existing reference, and will never need
+ * to lookup a spa_t by name.
+ *
+ * spa_refcount (per-spa zfs_refcount_t protected by mutex)
+ *
+ * This reference count keeps track of any active users of the spa_t. The
+ * spa_t cannot be destroyed or freed while this is non-zero. Internally,
+ * the refcount is never really 'zero' - opening a pool implicitly keeps
+ * some references in the DMU. Internally we check against spa_minref, but
+ * present the image of a zero/non-zero value to consumers.
+ *
+ * spa_config_lock[] (per-spa array of rwlocks)
+ *
+ * This protects the spa_t from config changes, and must be held in
+ * the following circumstances:
+ *
+ * - RW_READER to perform I/O to the spa
+ * - RW_WRITER to change the vdev config
+ *
+ * The locking order is fairly straightforward:
+ *
+ * spa_namespace_lock -> spa_refcount
+ *
+ * The namespace lock must be acquired to increase the refcount from 0
+ * or to check if it is zero.
+ *
+ * spa_refcount -> spa_config_lock[]
+ *
+ * There must be at least one valid reference on the spa_t to acquire
+ * the config lock.
+ *
+ * spa_namespace_lock -> spa_config_lock[]
+ *
+ * The namespace lock must always be taken before the config lock.
+ *
+ *
+ * The spa_namespace_lock can be acquired directly and is globally visible.
+ *
+ * The namespace is manipulated using the following functions, all of which
+ * require the spa_namespace_lock to be held.
+ *
+ * spa_lookup() Lookup a spa_t by name.
+ *
+ * spa_add() Create a new spa_t in the namespace.
+ *
+ * spa_remove() Remove a spa_t from the namespace. This also
+ * frees up any memory associated with the spa_t.
+ *
+ * spa_next() Returns the next spa_t in the system, or the
+ * first if NULL is passed.
+ *
+ * spa_evict_all() Shutdown and remove all spa_t structures in
+ * the system.
+ *
+ * spa_guid_exists() Determine whether a pool/device guid exists.
+ *
+ * The spa_refcount is manipulated using the following functions:
+ *
+ * spa_open_ref() Adds a reference to the given spa_t. Must be
+ * called with spa_namespace_lock held if the
+ * refcount is currently zero.
+ *
+ * spa_close() Remove a reference from the spa_t. This will
+ * not free the spa_t or remove it from the
+ * namespace. No locking is required.
+ *
+ * spa_refcount_zero() Returns true if the refcount is currently
+ * zero. Must be called with spa_namespace_lock
+ * held.
+ *
+ * The spa_config_lock[] is an array of rwlocks, ordered as follows:
+ * SCL_CONFIG > SCL_STATE > SCL_ALLOC > SCL_ZIO > SCL_FREE > SCL_VDEV.
+ * spa_config_lock[] is manipulated with spa_config_{enter,exit,held}().
+ *
+ * To read the configuration, it suffices to hold one of these locks as reader.
+ * To modify the configuration, you must hold all locks as writer. To modify
+ * vdev state without altering the vdev tree's topology (e.g. online/offline),
+ * you must hold SCL_STATE and SCL_ZIO as writer.
+ *
+ * We use these distinct config locks to avoid recursive lock entry.
+ * For example, spa_sync() (which holds SCL_CONFIG as reader) induces
+ * block allocations (SCL_ALLOC), which may require reading space maps
+ * from disk (dmu_read() -> zio_read() -> SCL_ZIO).
+ *
+ * The spa config locks cannot be normal rwlocks because we need the
+ * ability to hand off ownership. For example, SCL_ZIO is acquired
+ * by the issuing thread and later released by an interrupt thread.
+ * They do, however, obey the usual write-wanted semantics to prevent
+ * writer (i.e. system administrator) starvation.
+ *
+ * The lock acquisition rules are as follows:
+ *
+ * SCL_CONFIG
+ * Protects changes to the vdev tree topology, such as vdev
+ * add/remove/attach/detach. Protects the dirty config list
+ * (spa_config_dirty_list) and the set of spares and l2arc devices.
+ *
+ * SCL_STATE
+ * Protects changes to pool state and vdev state, such as vdev
+ * online/offline/fault/degrade/clear. Protects the dirty state list
+ * (spa_state_dirty_list) and global pool state (spa_state).
+ *
+ * SCL_ALLOC
+ * Protects changes to metaslab groups and classes.
+ * Held as reader by metaslab_alloc() and metaslab_claim().
+ *
+ * SCL_ZIO
+ * Held by bp-level zios (those which have no io_vd upon entry)
+ * to prevent changes to the vdev tree. The bp-level zio implicitly
+ * protects all of its vdev child zios, which do not hold SCL_ZIO.
+ *
+ * SCL_FREE
+ * Protects changes to metaslab groups and classes.
+ * Held as reader by metaslab_free(). SCL_FREE is distinct from
+ * SCL_ALLOC, and lower than SCL_ZIO, so that we can safely free
+ * blocks in zio_done() while another i/o that holds either
+ * SCL_ALLOC or SCL_ZIO is waiting for this i/o to complete.
+ *
+ * SCL_VDEV
+ * Held as reader to prevent changes to the vdev tree during trivial
+ * inquiries such as bp_get_dsize(). SCL_VDEV is distinct from the
+ * other locks, and lower than all of them, to ensure that it's safe
+ * to acquire regardless of caller context.
+ *
+ * In addition, the following rules apply:
+ *
+ * (a) spa_props_lock protects pool properties, spa_config and spa_config_list.
+ * The lock ordering is SCL_CONFIG > spa_props_lock.
+ *
+ * (b) I/O operations on leaf vdevs. For any zio operation that takes
+ * an explicit vdev_t argument -- such as zio_ioctl(), zio_read_phys(),
+ * or zio_write_phys() -- the caller must ensure that the config cannot
+ * change in the interim, and that the vdev cannot be reopened.
+ * SCL_STATE as reader suffices for both.
+ *
+ * The vdev configuration is protected by spa_vdev_enter() / spa_vdev_exit().
+ *
+ * spa_vdev_enter() Acquire the namespace lock and the config lock
+ * for writing.
+ *
+ * spa_vdev_exit() Release the config lock, wait for all I/O
+ * to complete, sync the updated configs to the
+ * cache, and release the namespace lock.
+ *
+ * vdev state is protected by spa_vdev_state_enter() / spa_vdev_state_exit().
+ * Like spa_vdev_enter/exit, these are convenience wrappers -- the actual
+ * locking is, always, based on spa_namespace_lock and spa_config_lock[].
+ */
+
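+/*
+ * For illustration only, a sketch of the documented ordering (a
+ * hypothetical caller, not part of this file, hence #if 0): look the
+ * pool up under the namespace lock, take a reference, then take the
+ * config lock as reader for the actual inquiry.
+ */
+#if 0
+static void
+example_inspect_pool(const char *name)
+{
+	spa_t *spa;
+
+	mutex_enter(&spa_namespace_lock);
+	if ((spa = spa_lookup(name)) == NULL) {
+		mutex_exit(&spa_namespace_lock);
+		return;
+	}
+	spa_open_ref(spa, FTAG);	/* namespace lock held, so 0->1 is safe */
+	mutex_exit(&spa_namespace_lock);
+
+	spa_config_enter(spa, SCL_CONFIG, FTAG, RW_READER);
+	/* ... read the vdev configuration ... */
+	spa_config_exit(spa, SCL_CONFIG, FTAG);
+
+	spa_close(spa, FTAG);
+}
+#endif
+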
+static avl_tree_t spa_namespace_avl;
+kmutex_t spa_namespace_lock;
+static kcondvar_t spa_namespace_cv;
+static int spa_active_count;
+int spa_max_replication_override = SPA_DVAS_PER_BP;
+
+static kmutex_t spa_spare_lock;
+static avl_tree_t spa_spare_avl;
+static kmutex_t spa_l2cache_lock;
+static avl_tree_t spa_l2cache_avl;
+
+kmem_cache_t *spa_buffer_pool;
+int spa_mode_global;
+
+#ifdef ZFS_DEBUG
+/*
+ * Everything except dprintf and indirect_remap is on by default
+ * in debug builds.
+ */
+int zfs_flags = ~(ZFS_DEBUG_DPRINTF | ZFS_DEBUG_INDIRECT_REMAP);
+#else
+int zfs_flags = 0;
+#endif
+
+/*
+ * zfs_recover can be set to nonzero to attempt to recover from
+ * otherwise-fatal errors, typically caused by on-disk corruption. When
+ * set, calls to zfs_panic_recover() will turn into warning messages.
+ * This should only be used as a last resort, as it typically results
+ * in leaked space, or worse.
+ */
+boolean_t zfs_recover = B_FALSE;
+
+/*
+ * If destroy encounters an EIO while reading metadata (e.g. indirect
+ * blocks), space referenced by the missing metadata can not be freed.
+ * Normally this causes the background destroy to become "stalled", as
+ * it is unable to make forward progress. While in this stalled state,
+ * all remaining space to free from the error-encountering filesystem is
+ * "temporarily leaked". Set this flag to cause it to ignore the EIO,
+ * permanently leak the space from indirect blocks that can not be read,
+ * and continue to free everything else that it can.
+ *
+ * The default, "stalling" behavior is useful if the storage partially
+ * fails (i.e. some but not all i/os fail), and then later recovers. In
+ * this case, we will be able to continue pool operations while it is
+ * partially failed, and when it recovers, we can continue to free the
+ * space, with no leaks. However, note that this case is actually
+ * fairly rare.
+ *
+ * Typically pools either (a) fail completely (but perhaps temporarily,
+ * e.g. a top-level vdev going offline), or (b) have localized,
+ * permanent errors (e.g. disk returns the wrong data due to bit flip or
+ * firmware bug). In case (a), this setting does not matter because the
+ * pool will be suspended and the sync thread will not be able to make
+ * forward progress regardless. In case (b), because the error is
+ * permanent, the best we can do is leak the minimum amount of space,
+ * which is what setting this flag will do. Therefore, it is reasonable
+ * for this flag to normally be set, but we chose the more conservative
+ * approach of not setting it, so that there is no possibility of
+ * leaking space in the "partial temporary" failure case.
+ */
+boolean_t zfs_free_leak_on_eio = B_FALSE;
+
+/*
+ * Expiration time in milliseconds. This value has two meanings. First, it
+ * determines when the spa_deadman() logic should fire; by default,
+ * spa_deadman() fires if spa_sync() has not completed in 1000 seconds.
+ * Second, the value determines whether an I/O is considered "hung". Any I/O
+ * that has not completed in zfs_deadman_synctime_ms is considered "hung",
+ * resulting in a system panic.
+ */
+uint64_t zfs_deadman_synctime_ms = 1000000ULL;
+
+/*
+ * Check time in milliseconds. This defines the frequency at which we check
+ * for hung I/O.
+ */
+uint64_t zfs_deadman_checktime_ms = 5000ULL;
+
+/*
+ * The default value of -1 for zfs_deadman_enabled is resolved in
+ * zfs_deadman_init().
+ */
+int zfs_deadman_enabled = -1;
+
+/*
+ * The worst case is single-sector max-parity RAID-Z blocks, in which
+ * case the space requirement is exactly (VDEV_RAIDZ_MAXPARITY + 1)
+ * times the size; so just assume that. Add to this the fact that
+ * we can have up to 3 DVAs per bp, and one more factor of 2 because
+ * the block may be dittoed with up to 3 DVAs by ddt_sync(). All together,
+ * the worst case is:
+ * (VDEV_RAIDZ_MAXPARITY + 1) * SPA_DVAS_PER_BP * 2 == 24
+ */
+int spa_asize_inflation = 24;
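+
+/*
+ * As a worked example of the factor above: with triple-parity RAID-Z
+ * (VDEV_RAIDZ_MAXPARITY == 3), a single-sector block may require
+ * (3 + 1) == 4 sectors per copy, times SPA_DVAS_PER_BP (3) copies, times
+ * one more factor of 2 for ddt_sync() dittoing: 4 * 3 * 2 == 24.  A 4KB
+ * logical write can thus be charged up to 96KB of worst-case asize; see
+ * spa_get_worst_case_asize() below.
+ */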
+
+#if defined(__FreeBSD__) && defined(_KERNEL)
+SYSCTL_DECL(_vfs_zfs);
+SYSCTL_INT(_vfs_zfs, OID_AUTO, recover, CTLFLAG_RWTUN, &zfs_recover, 0,
+ "Try to recover from otherwise-fatal errors.");
+
+static int
+sysctl_vfs_zfs_debug_flags(SYSCTL_HANDLER_ARGS)
+{
+ int err, val;
+
+ val = zfs_flags;
+ err = sysctl_handle_int(oidp, &val, 0, req);
+ if (err != 0 || req->newptr == NULL)
+ return (err);
+
+ /*
+ * ZFS_DEBUG_MODIFY must be enabled prior to boot so all
+ * arc buffers in the system have the necessary additional
+ * checksum data. However, it is safe to disable at any
+ * time.
+ */
+ if (!(zfs_flags & ZFS_DEBUG_MODIFY))
+ val &= ~ZFS_DEBUG_MODIFY;
+ zfs_flags = val;
+
+ return (0);
+}
+
+SYSCTL_PROC(_vfs_zfs, OID_AUTO, debugflags,
+ CTLTYPE_UINT | CTLFLAG_MPSAFE | CTLFLAG_RWTUN, 0, sizeof(int),
+ sysctl_vfs_zfs_debug_flags, "IU", "Debug flags for ZFS testing.");
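+
+/*
+ * For example, writing vfs.zfs.debugflags with sysctl(8) at runtime can
+ * clear ZFS_DEBUG_MODIFY but can never set it; the handler above strips
+ * the flag unless it was already enabled (as a boot-time tunable), since
+ * only then do all arc buffers carry the extra checksum data.
+ */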
+
+SYSCTL_UQUAD(_vfs_zfs, OID_AUTO, deadman_synctime_ms, CTLFLAG_RWTUN,
+ &zfs_deadman_synctime_ms, 0,
+ "Stalled ZFS I/O expiration time in milliseconds");
+SYSCTL_UQUAD(_vfs_zfs, OID_AUTO, deadman_checktime_ms, CTLFLAG_RWTUN,
+ &zfs_deadman_checktime_ms, 0,
+ "Period of checks for stalled ZFS I/O in milliseconds");
+SYSCTL_INT(_vfs_zfs, OID_AUTO, deadman_enabled, CTLFLAG_RWTUN,
+ &zfs_deadman_enabled, 0, "Kernel panic on stalled ZFS I/O");
+SYSCTL_INT(_vfs_zfs, OID_AUTO, spa_asize_inflation, CTLFLAG_RWTUN,
+ &spa_asize_inflation, 0, "Worst case inflation factor for single sector writes");
+#endif
+
+#ifndef illumos
+#ifdef _KERNEL
+static void
+zfs_deadman_init()
+{
+ /*
+	 * Disable the ZFS deadman thread by default unless we are
+	 * running on bare-metal i386 or amd64.
+ */
+ if (zfs_deadman_enabled == -1) {
+#if defined(__amd64__) || defined(__i386__)
+ zfs_deadman_enabled = (vm_guest == VM_GUEST_NO) ? 1 : 0;
+#else
+ zfs_deadman_enabled = 0;
+#endif
+ }
+}
+#endif /* _KERNEL */
+#endif /* !illumos */
+
+/*
+ * Normally, we don't allow the last 3.1% (1/(2^spa_slop_shift)) of space in
+ * the pool to be consumed. This ensures that we don't run the pool
+ * completely out of space, due to unaccounted changes (e.g. to the MOS).
+ * It also limits the worst-case time to allocate space. If we have
+ * less than this amount of free space, most ZPL operations (e.g. write,
+ * create) will return ENOSPC.
+ *
+ * Certain operations (e.g. file removal, most administrative actions) can
+ * use half the slop space. They will only return ENOSPC if less than half
+ * the slop space is free. Typically, once the pool has less than the slop
+ * space free, the user will use these operations to free up space in the pool.
+ * These are the operations that call dsl_pool_adjustedsize() with the netfree
+ * argument set to TRUE.
+ *
+ * Operations that are almost guaranteed to free up space in the absence of
+ * a pool checkpoint can use up to three quarters of the slop space
+ * (e.g. zfs destroy).
+ *
+ * A very restricted set of operations are always permitted, regardless of
+ * the amount of free space. These are the operations that call
+ * dsl_sync_task(ZFS_SPACE_CHECK_NONE). If these operations result in a net
+ * increase in the amount of space used, it is possible to run the pool
+ * completely out of space, causing it to be permanently read-only.
+ *
+ * Note that on very small pools, the slop space will be larger than
+ * 3.1%, in an effort to have it be at least spa_min_slop (128MB),
+ * but we never allow it to be more than half the pool size.
+ *
+ * See also the comments in zfs_space_check_t.
+ */
+int spa_slop_shift = 5;
+SYSCTL_INT(_vfs_zfs, OID_AUTO, spa_slop_shift, CTLFLAG_RWTUN,
+ &spa_slop_shift, 0,
+ "Shift value of reserved space (1/(2^spa_slop_shift)).");
+uint64_t spa_min_slop = 128 * 1024 * 1024;
+SYSCTL_UQUAD(_vfs_zfs, OID_AUTO, spa_min_slop, CTLFLAG_RWTUN,
+    &spa_min_slop, 0,
+    "Minimum amount of reserved space");
+
+int spa_allocators = 4;
+
+SYSCTL_INT(_vfs_zfs, OID_AUTO, spa_allocators, CTLFLAG_RWTUN,
+ &spa_allocators, 0,
+ "Number of allocators per metaslab group");
+
+/*PRINTFLIKE2*/
+void
+spa_load_failed(spa_t *spa, const char *fmt, ...)
+{
+ va_list adx;
+ char buf[256];
+
+ va_start(adx, fmt);
+ (void) vsnprintf(buf, sizeof (buf), fmt, adx);
+ va_end(adx);
+
+ zfs_dbgmsg("spa_load(%s, config %s): FAILED: %s", spa->spa_name,
+ spa->spa_trust_config ? "trusted" : "untrusted", buf);
+}
+
+/*PRINTFLIKE2*/
+void
+spa_load_note(spa_t *spa, const char *fmt, ...)
+{
+ va_list adx;
+ char buf[256];
+
+ va_start(adx, fmt);
+ (void) vsnprintf(buf, sizeof (buf), fmt, adx);
+ va_end(adx);
+
+ zfs_dbgmsg("spa_load(%s, config %s): %s", spa->spa_name,
+ spa->spa_trust_config ? "trusted" : "untrusted", buf);
+}
+
+/*
+ * By default, DDT data and user-data indirect blocks land in the special class.
+ */
+int zfs_ddt_data_is_special = B_TRUE;
+int zfs_user_indirect_is_special = B_TRUE;
+
+/*
+ * The percentage of special-class space reserved exclusively for metadata.
+ * Once 100 - zfs_special_class_metadata_reserve_pct percent of the class is
+ * allocated, only metadata is admitted into the class.
+ */
+int zfs_special_class_metadata_reserve_pct = 25;
+
+#if defined(__FreeBSD__) && defined(_KERNEL)
+SYSCTL_INT(_vfs_zfs, OID_AUTO, ddt_data_is_special, CTLFLAG_RWTUN,
+ &zfs_ddt_data_is_special, 0,
+ "Whether DDT data is eligible for the special class vdevs");
+SYSCTL_INT(_vfs_zfs, OID_AUTO, user_indirect_is_special, CTLFLAG_RWTUN,
+ &zfs_user_indirect_is_special, 0,
+ "Whether indirect blocks are eligible for the special class vdevs");
+SYSCTL_INT(_vfs_zfs, OID_AUTO, special_class_metadata_reserve_pct,
+ CTLFLAG_RWTUN, &zfs_special_class_metadata_reserve_pct, 0,
+ "Percentage of space in the special class reserved solely for metadata");
+#endif
+
+/*
+ * ==========================================================================
+ * SPA config locking
+ * ==========================================================================
+ */
+static void
+spa_config_lock_init(spa_t *spa)
+{
+ for (int i = 0; i < SCL_LOCKS; i++) {
+ spa_config_lock_t *scl = &spa->spa_config_lock[i];
+ mutex_init(&scl->scl_lock, NULL, MUTEX_DEFAULT, NULL);
+ cv_init(&scl->scl_cv, NULL, CV_DEFAULT, NULL);
+ zfs_refcount_create_untracked(&scl->scl_count);
+ scl->scl_writer = NULL;
+ scl->scl_write_wanted = 0;
+ }
+}
+
+static void
+spa_config_lock_destroy(spa_t *spa)
+{
+ for (int i = 0; i < SCL_LOCKS; i++) {
+ spa_config_lock_t *scl = &spa->spa_config_lock[i];
+ mutex_destroy(&scl->scl_lock);
+ cv_destroy(&scl->scl_cv);
+ zfs_refcount_destroy(&scl->scl_count);
+ ASSERT(scl->scl_writer == NULL);
+ ASSERT(scl->scl_write_wanted == 0);
+ }
+}
+
+int
+spa_config_tryenter(spa_t *spa, int locks, void *tag, krw_t rw)
+{
+ for (int i = 0; i < SCL_LOCKS; i++) {
+ spa_config_lock_t *scl = &spa->spa_config_lock[i];
+ if (!(locks & (1 << i)))
+ continue;
+ mutex_enter(&scl->scl_lock);
+ if (rw == RW_READER) {
+ if (scl->scl_writer || scl->scl_write_wanted) {
+ mutex_exit(&scl->scl_lock);
+ spa_config_exit(spa, locks & ((1 << i) - 1),
+ tag);
+ return (0);
+ }
+ } else {
+ ASSERT(scl->scl_writer != curthread);
+ if (!zfs_refcount_is_zero(&scl->scl_count)) {
+ mutex_exit(&scl->scl_lock);
+ spa_config_exit(spa, locks & ((1 << i) - 1),
+ tag);
+ return (0);
+ }
+ scl->scl_writer = curthread;
+ }
+ (void) zfs_refcount_add(&scl->scl_count, tag);
+ mutex_exit(&scl->scl_lock);
+ }
+ return (1);
+}
+
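+/*
+ * For illustration only (hypothetical caller, hence #if 0): the
+ * non-blocking pattern for the function above, falling back when a
+ * writer holds or wants any of the requested locks.
+ */
+#if 0
+static boolean_t
+example_try_inspect(spa_t *spa)
+{
+	if (!spa_config_tryenter(spa, SCL_VDEV, FTAG, RW_READER))
+		return (B_FALSE);	/* try again later */
+	/* ... walk the vdev tree ... */
+	spa_config_exit(spa, SCL_VDEV, FTAG);
+	return (B_TRUE);
+}
+#endif
+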
+void
+spa_config_enter(spa_t *spa, int locks, void *tag, krw_t rw)
+{
+ int wlocks_held = 0;
+
+ ASSERT3U(SCL_LOCKS, <, sizeof (wlocks_held) * NBBY);
+
+ for (int i = 0; i < SCL_LOCKS; i++) {
+ spa_config_lock_t *scl = &spa->spa_config_lock[i];
+ if (scl->scl_writer == curthread)
+ wlocks_held |= (1 << i);
+ if (!(locks & (1 << i)))
+ continue;
+ mutex_enter(&scl->scl_lock);
+ if (rw == RW_READER) {
+ while (scl->scl_writer || scl->scl_write_wanted) {
+ cv_wait(&scl->scl_cv, &scl->scl_lock);
+ }
+ } else {
+ ASSERT(scl->scl_writer != curthread);
+ while (!zfs_refcount_is_zero(&scl->scl_count)) {
+ scl->scl_write_wanted++;
+ cv_wait(&scl->scl_cv, &scl->scl_lock);
+ scl->scl_write_wanted--;
+ }
+ scl->scl_writer = curthread;
+ }
+ (void) zfs_refcount_add(&scl->scl_count, tag);
+ mutex_exit(&scl->scl_lock);
+ }
+ ASSERT3U(wlocks_held, <=, locks);
+}
+
+void
+spa_config_exit(spa_t *spa, int locks, void *tag)
+{
+ for (int i = SCL_LOCKS - 1; i >= 0; i--) {
+ spa_config_lock_t *scl = &spa->spa_config_lock[i];
+ if (!(locks & (1 << i)))
+ continue;
+ mutex_enter(&scl->scl_lock);
+ ASSERT(!zfs_refcount_is_zero(&scl->scl_count));
+ if (zfs_refcount_remove(&scl->scl_count, tag) == 0) {
+ ASSERT(scl->scl_writer == NULL ||
+ scl->scl_writer == curthread);
+ scl->scl_writer = NULL; /* OK in either case */
+ cv_broadcast(&scl->scl_cv);
+ }
+ mutex_exit(&scl->scl_lock);
+ }
+}
+
+int
+spa_config_held(spa_t *spa, int locks, krw_t rw)
+{
+ int locks_held = 0;
+
+ for (int i = 0; i < SCL_LOCKS; i++) {
+ spa_config_lock_t *scl = &spa->spa_config_lock[i];
+ if (!(locks & (1 << i)))
+ continue;
+ if ((rw == RW_READER &&
+ !zfs_refcount_is_zero(&scl->scl_count)) ||
+ (rw == RW_WRITER && scl->scl_writer == curthread))
+ locks_held |= 1 << i;
+ }
+
+ return (locks_held);
+}
+
+/*
+ * ==========================================================================
+ * SPA namespace functions
+ * ==========================================================================
+ */
+
+/*
+ * Lookup the named spa_t in the AVL tree. The spa_namespace_lock must be held.
+ * Returns NULL if no matching spa_t is found.
+ */
+spa_t *
+spa_lookup(const char *name)
+{
+ static spa_t search; /* spa_t is large; don't allocate on stack */
+ spa_t *spa;
+ avl_index_t where;
+ char *cp;
+
+ ASSERT(MUTEX_HELD(&spa_namespace_lock));
+
+ (void) strlcpy(search.spa_name, name, sizeof (search.spa_name));
+
+ /*
+ * If it's a full dataset name, figure out the pool name and
+ * just use that.
+ */
+ cp = strpbrk(search.spa_name, "/@#");
+ if (cp != NULL)
+ *cp = '\0';
+
+ spa = avl_find(&spa_namespace_avl, &search, &where);
+
+ return (spa);
+}
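+
+/*
+ * For example, spa_lookup("tank/home@yesterday") truncates its search key
+ * at the first '/', '@', or '#' and returns the spa_t for pool "tank", if
+ * one exists in the namespace.
+ */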
+
+/*
+ * Fires when spa_sync has not completed within zfs_deadman_synctime_ms.
+ * If the zfs_deadman_enabled flag is set then it inspects all vdev queues
+ * looking for potentially hung I/Os.
+ */
+static void
+spa_deadman(void *arg, int pending)
+{
+ spa_t *spa = arg;
+
+ /*
+ * Disable the deadman timer if the pool is suspended.
+ */
+ if (spa_suspended(spa)) {
+#ifdef illumos
+ VERIFY(cyclic_reprogram(spa->spa_deadman_cycid, CY_INFINITY));
+#else
+		/* Nothing to do; just don't schedule any future callouts. */
+#endif
+ return;
+ }
+
+ zfs_dbgmsg("slow spa_sync: started %llu seconds ago, calls %llu",
+ (gethrtime() - spa->spa_sync_starttime) / NANOSEC,
+ ++spa->spa_deadman_calls);
+ if (zfs_deadman_enabled)
+ vdev_deadman(spa->spa_root_vdev);
+#ifdef __FreeBSD__
+#ifdef _KERNEL
+ callout_schedule(&spa->spa_deadman_cycid,
+ hz * zfs_deadman_checktime_ms / MILLISEC);
+#endif
+#endif
+}
+
+#if defined(__FreeBSD__) && defined(_KERNEL)
+static void
+spa_deadman_timeout(void *arg)
+{
+ spa_t *spa = arg;
+
+ taskqueue_enqueue(taskqueue_thread, &spa->spa_deadman_task);
+}
+#endif
+
+/*
+ * Create an uninitialized spa_t with the given name. Requires
+ * spa_namespace_lock. The caller must ensure that the spa_t doesn't already
+ * exist by calling spa_lookup() first.
+ */
+spa_t *
+spa_add(const char *name, nvlist_t *config, const char *altroot)
+{
+ spa_t *spa;
+ spa_config_dirent_t *dp;
+#ifdef illumos
+ cyc_handler_t hdlr;
+ cyc_time_t when;
+#endif
+
+ ASSERT(MUTEX_HELD(&spa_namespace_lock));
+
+ spa = kmem_zalloc(sizeof (spa_t), KM_SLEEP);
+
+ mutex_init(&spa->spa_async_lock, NULL, MUTEX_DEFAULT, NULL);
+ mutex_init(&spa->spa_errlist_lock, NULL, MUTEX_DEFAULT, NULL);
+ mutex_init(&spa->spa_errlog_lock, NULL, MUTEX_DEFAULT, NULL);
+ mutex_init(&spa->spa_evicting_os_lock, NULL, MUTEX_DEFAULT, NULL);
+ mutex_init(&spa->spa_history_lock, NULL, MUTEX_DEFAULT, NULL);
+ mutex_init(&spa->spa_proc_lock, NULL, MUTEX_DEFAULT, NULL);
+ mutex_init(&spa->spa_props_lock, NULL, MUTEX_DEFAULT, NULL);
+ mutex_init(&spa->spa_cksum_tmpls_lock, NULL, MUTEX_DEFAULT, NULL);
+ mutex_init(&spa->spa_scrub_lock, NULL, MUTEX_DEFAULT, NULL);
+ mutex_init(&spa->spa_suspend_lock, NULL, MUTEX_DEFAULT, NULL);
+ mutex_init(&spa->spa_vdev_top_lock, NULL, MUTEX_DEFAULT, NULL);
+ mutex_init(&spa->spa_feat_stats_lock, NULL, MUTEX_DEFAULT, NULL);
+
+ cv_init(&spa->spa_async_cv, NULL, CV_DEFAULT, NULL);
+ cv_init(&spa->spa_evicting_os_cv, NULL, CV_DEFAULT, NULL);
+ cv_init(&spa->spa_proc_cv, NULL, CV_DEFAULT, NULL);
+ cv_init(&spa->spa_scrub_io_cv, NULL, CV_DEFAULT, NULL);
+ cv_init(&spa->spa_suspend_cv, NULL, CV_DEFAULT, NULL);
+
+ for (int t = 0; t < TXG_SIZE; t++)
+ bplist_create(&spa->spa_free_bplist[t]);
+
+ (void) strlcpy(spa->spa_name, name, sizeof (spa->spa_name));
+ spa->spa_state = POOL_STATE_UNINITIALIZED;
+ spa->spa_freeze_txg = UINT64_MAX;
+ spa->spa_final_txg = UINT64_MAX;
+ spa->spa_load_max_txg = UINT64_MAX;
+ spa->spa_proc = &p0;
+ spa->spa_proc_state = SPA_PROC_NONE;
+ spa->spa_trust_config = B_TRUE;
+
+#ifdef illumos
+ hdlr.cyh_func = spa_deadman;
+ hdlr.cyh_arg = spa;
+ hdlr.cyh_level = CY_LOW_LEVEL;
+#endif
+
+ spa->spa_deadman_synctime = MSEC2NSEC(zfs_deadman_synctime_ms);
+
+#ifdef illumos
+ /*
+ * This determines how often we need to check for hung I/Os after
+ * the cyclic has already fired. Since checking for hung I/Os is
+ * an expensive operation we don't want to check too frequently.
+ * Instead wait for 5 seconds before checking again.
+ */
+ when.cyt_interval = MSEC2NSEC(zfs_deadman_checktime_ms);
+ when.cyt_when = CY_INFINITY;
+ mutex_enter(&cpu_lock);
+ spa->spa_deadman_cycid = cyclic_add(&hdlr, &when);
+ mutex_exit(&cpu_lock);
+#else /* !illumos */
+#ifdef _KERNEL
+ /*
+ * callout(9) does not provide a way to initialize a callout with
+ * a function and an argument, so we use callout_reset() to schedule
+ * the callout in the very distant future. Even if that event ever
+	 * fires, it should be okay as we won't have any active zios.
+ * But normally spa_sync() will reschedule the callout with a proper
+ * timeout.
+ * callout(9) does not allow the callback function to sleep but
+ * vdev_deadman() needs to acquire vq_lock and illumos mutexes are
+ * emulated using sx(9). For this reason spa_deadman_timeout()
+ * will schedule spa_deadman() as task on a taskqueue that allows
+ * sleeping.
+ */
+ TASK_INIT(&spa->spa_deadman_task, 0, spa_deadman, spa);
+ callout_init(&spa->spa_deadman_cycid, 1);
+ callout_reset_sbt(&spa->spa_deadman_cycid, SBT_MAX, 0,
+ spa_deadman_timeout, spa, 0);
+#endif
+#endif
+ zfs_refcount_create(&spa->spa_refcount);
+ spa_config_lock_init(spa);
+
+ avl_add(&spa_namespace_avl, spa);
+
+ /*
+ * Set the alternate root, if there is one.
+ */
+ if (altroot) {
+ spa->spa_root = spa_strdup(altroot);
+ spa_active_count++;
+ }
+
+ spa->spa_alloc_count = spa_allocators;
+ spa->spa_alloc_locks = kmem_zalloc(spa->spa_alloc_count *
+ sizeof (kmutex_t), KM_SLEEP);
+ spa->spa_alloc_trees = kmem_zalloc(spa->spa_alloc_count *
+ sizeof (avl_tree_t), KM_SLEEP);
+ for (int i = 0; i < spa->spa_alloc_count; i++) {
+ mutex_init(&spa->spa_alloc_locks[i], NULL, MUTEX_DEFAULT, NULL);
+ avl_create(&spa->spa_alloc_trees[i], zio_bookmark_compare,
+ sizeof (zio_t), offsetof(zio_t, io_alloc_node));
+ }
+
+ /*
+ * Every pool starts with the default cachefile
+ */
+ list_create(&spa->spa_config_list, sizeof (spa_config_dirent_t),
+ offsetof(spa_config_dirent_t, scd_link));
+
+ dp = kmem_zalloc(sizeof (spa_config_dirent_t), KM_SLEEP);
+ dp->scd_path = altroot ? NULL : spa_strdup(spa_config_path);
+ list_insert_head(&spa->spa_config_list, dp);
+
+ VERIFY(nvlist_alloc(&spa->spa_load_info, NV_UNIQUE_NAME,
+ KM_SLEEP) == 0);
+
+ if (config != NULL) {
+ nvlist_t *features;
+
+ if (nvlist_lookup_nvlist(config, ZPOOL_CONFIG_FEATURES_FOR_READ,
+ &features) == 0) {
+ VERIFY(nvlist_dup(features, &spa->spa_label_features,
+ 0) == 0);
+ }
+
+ VERIFY(nvlist_dup(config, &spa->spa_config, 0) == 0);
+ }
+
+ if (spa->spa_label_features == NULL) {
+ VERIFY(nvlist_alloc(&spa->spa_label_features, NV_UNIQUE_NAME,
+ KM_SLEEP) == 0);
+ }
+
+ spa->spa_min_ashift = INT_MAX;
+ spa->spa_max_ashift = 0;
+
+ /*
+ * As a pool is being created, treat all features as disabled by
+ * setting SPA_FEATURE_DISABLED for all entries in the feature
+ * refcount cache.
+ */
+ for (int i = 0; i < SPA_FEATURES; i++) {
+ spa->spa_feat_refcount_cache[i] = SPA_FEATURE_DISABLED;
+ }
+
+ list_create(&spa->spa_leaf_list, sizeof (vdev_t),
+ offsetof(vdev_t, vdev_leaf_node));
+
+ return (spa);
+}
+
+/*
+ * Removes a spa_t from the namespace, freeing up any memory used. Requires
+ * spa_namespace_lock. This is called only after the spa_t has been closed and
+ * deactivated.
+ */
+void
+spa_remove(spa_t *spa)
+{
+ spa_config_dirent_t *dp;
+
+ ASSERT(MUTEX_HELD(&spa_namespace_lock));
+ ASSERT(spa->spa_state == POOL_STATE_UNINITIALIZED);
+ ASSERT3U(zfs_refcount_count(&spa->spa_refcount), ==, 0);
+
+ nvlist_free(spa->spa_config_splitting);
+
+ avl_remove(&spa_namespace_avl, spa);
+ cv_broadcast(&spa_namespace_cv);
+
+ if (spa->spa_root) {
+ spa_strfree(spa->spa_root);
+ spa_active_count--;
+ }
+
+ while ((dp = list_head(&spa->spa_config_list)) != NULL) {
+ list_remove(&spa->spa_config_list, dp);
+ if (dp->scd_path != NULL)
+ spa_strfree(dp->scd_path);
+ kmem_free(dp, sizeof (spa_config_dirent_t));
+ }
+
+ for (int i = 0; i < spa->spa_alloc_count; i++) {
+ avl_destroy(&spa->spa_alloc_trees[i]);
+ mutex_destroy(&spa->spa_alloc_locks[i]);
+ }
+ kmem_free(spa->spa_alloc_locks, spa->spa_alloc_count *
+ sizeof (kmutex_t));
+ kmem_free(spa->spa_alloc_trees, spa->spa_alloc_count *
+ sizeof (avl_tree_t));
+
+ list_destroy(&spa->spa_config_list);
+ list_destroy(&spa->spa_leaf_list);
+
+ nvlist_free(spa->spa_label_features);
+ nvlist_free(spa->spa_load_info);
+ nvlist_free(spa->spa_feat_stats);
+ spa_config_set(spa, NULL);
+
+#ifdef illumos
+ mutex_enter(&cpu_lock);
+ if (spa->spa_deadman_cycid != CYCLIC_NONE)
+ cyclic_remove(spa->spa_deadman_cycid);
+ mutex_exit(&cpu_lock);
+ spa->spa_deadman_cycid = CYCLIC_NONE;
+#else /* !illumos */
+#ifdef _KERNEL
+ callout_drain(&spa->spa_deadman_cycid);
+ taskqueue_drain(taskqueue_thread, &spa->spa_deadman_task);
+#endif
+#endif
+
+ zfs_refcount_destroy(&spa->spa_refcount);
+
+ spa_config_lock_destroy(spa);
+
+ for (int t = 0; t < TXG_SIZE; t++)
+ bplist_destroy(&spa->spa_free_bplist[t]);
+
+ zio_checksum_templates_free(spa);
+
+ cv_destroy(&spa->spa_async_cv);
+ cv_destroy(&spa->spa_evicting_os_cv);
+ cv_destroy(&spa->spa_proc_cv);
+ cv_destroy(&spa->spa_scrub_io_cv);
+ cv_destroy(&spa->spa_suspend_cv);
+
+ mutex_destroy(&spa->spa_async_lock);
+ mutex_destroy(&spa->spa_errlist_lock);
+ mutex_destroy(&spa->spa_errlog_lock);
+ mutex_destroy(&spa->spa_evicting_os_lock);
+ mutex_destroy(&spa->spa_history_lock);
+ mutex_destroy(&spa->spa_proc_lock);
+ mutex_destroy(&spa->spa_props_lock);
+ mutex_destroy(&spa->spa_cksum_tmpls_lock);
+ mutex_destroy(&spa->spa_scrub_lock);
+ mutex_destroy(&spa->spa_suspend_lock);
+ mutex_destroy(&spa->spa_vdev_top_lock);
+ mutex_destroy(&spa->spa_feat_stats_lock);
+
+ kmem_free(spa, sizeof (spa_t));
+}
+
+/*
+ * Given a pool, return the next pool in the namespace, or NULL if there is
+ * none. If 'prev' is NULL, return the first pool.
+ */
+spa_t *
+spa_next(spa_t *prev)
+{
+ ASSERT(MUTEX_HELD(&spa_namespace_lock));
+
+ if (prev)
+ return (AVL_NEXT(&spa_namespace_avl, prev));
+ else
+ return (avl_first(&spa_namespace_avl));
+}
+
+/*
+ * ==========================================================================
+ * SPA refcount functions
+ * ==========================================================================
+ */
+
+/*
+ * Add a reference to the given spa_t. Must have at least one reference, or
+ * have the namespace lock held.
+ */
+void
+spa_open_ref(spa_t *spa, void *tag)
+{
+ ASSERT(zfs_refcount_count(&spa->spa_refcount) >= spa->spa_minref ||
+ MUTEX_HELD(&spa_namespace_lock));
+ (void) zfs_refcount_add(&spa->spa_refcount, tag);
+}
+
+/*
+ * Remove a reference to the given spa_t. Must have at least one reference, or
+ * have the namespace lock held.
+ */
+void
+spa_close(spa_t *spa, void *tag)
+{
+ ASSERT(zfs_refcount_count(&spa->spa_refcount) > spa->spa_minref ||
+ MUTEX_HELD(&spa_namespace_lock));
+ (void) zfs_refcount_remove(&spa->spa_refcount, tag);
+}
+
+/*
+ * Remove a reference to the given spa_t held by a dsl dir that is
+ * being asynchronously released. Async releases occur from a taskq
+ * performing eviction of dsl datasets and dirs. The namespace lock
+ * isn't held and the hold by the object being evicted may contribute to
+ * spa_minref (e.g. dataset or directory released during pool export),
+ * so the asserts in spa_close() do not apply.
+ */
+void
+spa_async_close(spa_t *spa, void *tag)
+{
+ (void) zfs_refcount_remove(&spa->spa_refcount, tag);
+}
+
+/*
+ * Check to see if the spa refcount is zero. Must be called with
+ * spa_namespace_lock held. We really compare against spa_minref, which is the
+ * number of references acquired when opening a pool.
+ */
+boolean_t
+spa_refcount_zero(spa_t *spa)
+{
+ ASSERT(MUTEX_HELD(&spa_namespace_lock));
+
+ return (zfs_refcount_count(&spa->spa_refcount) == spa->spa_minref);
+}
+
+/*
+ * ==========================================================================
+ * SPA spare and l2cache tracking
+ * ==========================================================================
+ */
+
+/*
+ * Hot spares and cache devices are tracked using the same code below,
+ * for 'auxiliary' devices.
+ */
+
+typedef struct spa_aux {
+ uint64_t aux_guid;
+ uint64_t aux_pool;
+ avl_node_t aux_avl;
+ int aux_count;
+} spa_aux_t;
+
+static inline int
+spa_aux_compare(const void *a, const void *b)
+{
+ const spa_aux_t *sa = (const spa_aux_t *)a;
+ const spa_aux_t *sb = (const spa_aux_t *)b;
+
+ return (AVL_CMP(sa->aux_guid, sb->aux_guid));
+}
+
+void
+spa_aux_add(vdev_t *vd, avl_tree_t *avl)
+{
+ avl_index_t where;
+ spa_aux_t search;
+ spa_aux_t *aux;
+
+ search.aux_guid = vd->vdev_guid;
+ if ((aux = avl_find(avl, &search, &where)) != NULL) {
+ aux->aux_count++;
+ } else {
+ aux = kmem_zalloc(sizeof (spa_aux_t), KM_SLEEP);
+ aux->aux_guid = vd->vdev_guid;
+ aux->aux_count = 1;
+ avl_insert(avl, aux, where);
+ }
+}
+
+void
+spa_aux_remove(vdev_t *vd, avl_tree_t *avl)
+{
+ spa_aux_t search;
+ spa_aux_t *aux;
+ avl_index_t where;
+
+ search.aux_guid = vd->vdev_guid;
+ aux = avl_find(avl, &search, &where);
+
+ ASSERT(aux != NULL);
+
+ if (--aux->aux_count == 0) {
+ avl_remove(avl, aux);
+ kmem_free(aux, sizeof (spa_aux_t));
+ } else if (aux->aux_pool == spa_guid(vd->vdev_spa)) {
+ aux->aux_pool = 0ULL;
+ }
+}
+
+boolean_t
+spa_aux_exists(uint64_t guid, uint64_t *pool, int *refcnt, avl_tree_t *avl)
+{
+ spa_aux_t search, *found;
+
+ search.aux_guid = guid;
+ found = avl_find(avl, &search, NULL);
+
+ if (pool) {
+ if (found)
+ *pool = found->aux_pool;
+ else
+ *pool = 0ULL;
+ }
+
+ if (refcnt) {
+ if (found)
+ *refcnt = found->aux_count;
+ else
+ *refcnt = 0;
+ }
+
+ return (found != NULL);
+}
+
+void
+spa_aux_activate(vdev_t *vd, avl_tree_t *avl)
+{
+ spa_aux_t search, *found;
+ avl_index_t where;
+
+ search.aux_guid = vd->vdev_guid;
+ found = avl_find(avl, &search, &where);
+ ASSERT(found != NULL);
+ ASSERT(found->aux_pool == 0ULL);
+
+ found->aux_pool = spa_guid(vd->vdev_spa);
+}
+
+/*
+ * Spares are tracked globally due to the following constraints:
+ *
+ * - A spare may be part of multiple pools.
+ * - A spare may be added to a pool even if it's actively in use within
+ * another pool.
+ * - A spare in use in any pool can only be the source of a replacement if
+ * the target is a spare in the same pool.
+ *
+ * We keep track of all spares on the system through the use of a reference
+ * counted AVL tree. When a vdev is added as a spare, or used as a replacement
+ * spare, then we bump the reference count in the AVL tree. In addition, we set
+ * the 'vdev_isspare' member to indicate that the device is a spare (active or
+ * inactive). When a spare is made active (used to replace a device in the
+ * pool), we also keep track of which pool it's been made a part of.
+ *
+ * The 'spa_spare_lock' protects the AVL tree. These functions are normally
+ * called under the spa_namespace lock as part of vdev reconfiguration. The
+ * separate spare lock exists for the status query path, which does not need to
+ * be completely consistent with respect to other vdev configuration changes.
+ */
+
+static int
+spa_spare_compare(const void *a, const void *b)
+{
+ return (spa_aux_compare(a, b));
+}
+
+void
+spa_spare_add(vdev_t *vd)
+{
+ mutex_enter(&spa_spare_lock);
+ ASSERT(!vd->vdev_isspare);
+ spa_aux_add(vd, &spa_spare_avl);
+ vd->vdev_isspare = B_TRUE;
+ mutex_exit(&spa_spare_lock);
+}
+
+void
+spa_spare_remove(vdev_t *vd)
+{
+ mutex_enter(&spa_spare_lock);
+ ASSERT(vd->vdev_isspare);
+ spa_aux_remove(vd, &spa_spare_avl);
+ vd->vdev_isspare = B_FALSE;
+ mutex_exit(&spa_spare_lock);
+}
+
+boolean_t
+spa_spare_exists(uint64_t guid, uint64_t *pool, int *refcnt)
+{
+ boolean_t found;
+
+ mutex_enter(&spa_spare_lock);
+ found = spa_aux_exists(guid, pool, refcnt, &spa_spare_avl);
+ mutex_exit(&spa_spare_lock);
+
+ return (found);
+}
+
+void
+spa_spare_activate(vdev_t *vd)
+{
+ mutex_enter(&spa_spare_lock);
+ ASSERT(vd->vdev_isspare);
+ spa_aux_activate(vd, &spa_spare_avl);
+ mutex_exit(&spa_spare_lock);
+}
+
+/*
+ * Level 2 ARC devices are tracked globally for the same reasons as spares.
+ * Cache devices currently only support one pool per cache device, and so
+ * for these devices the aux reference count is currently unused beyond 1.
+ */
+
+static int
+spa_l2cache_compare(const void *a, const void *b)
+{
+ return (spa_aux_compare(a, b));
+}
+
+void
+spa_l2cache_add(vdev_t *vd)
+{
+ mutex_enter(&spa_l2cache_lock);
+ ASSERT(!vd->vdev_isl2cache);
+ spa_aux_add(vd, &spa_l2cache_avl);
+ vd->vdev_isl2cache = B_TRUE;
+ mutex_exit(&spa_l2cache_lock);
+}
+
+void
+spa_l2cache_remove(vdev_t *vd)
+{
+ mutex_enter(&spa_l2cache_lock);
+ ASSERT(vd->vdev_isl2cache);
+ spa_aux_remove(vd, &spa_l2cache_avl);
+ vd->vdev_isl2cache = B_FALSE;
+ mutex_exit(&spa_l2cache_lock);
+}
+
+boolean_t
+spa_l2cache_exists(uint64_t guid, uint64_t *pool)
+{
+ boolean_t found;
+
+ mutex_enter(&spa_l2cache_lock);
+ found = spa_aux_exists(guid, pool, NULL, &spa_l2cache_avl);
+ mutex_exit(&spa_l2cache_lock);
+
+ return (found);
+}
+
+void
+spa_l2cache_activate(vdev_t *vd)
+{
+ mutex_enter(&spa_l2cache_lock);
+ ASSERT(vd->vdev_isl2cache);
+ spa_aux_activate(vd, &spa_l2cache_avl);
+ mutex_exit(&spa_l2cache_lock);
+}
+
+/*
+ * ==========================================================================
+ * SPA vdev locking
+ * ==========================================================================
+ */
+
+/*
+ * Lock the given spa_t for the purpose of adding or removing a vdev.
+ * Grabs the global spa_namespace_lock plus the spa config lock for writing.
+ * It returns the next transaction group for the spa_t.
+ */
+uint64_t
+spa_vdev_enter(spa_t *spa)
+{
+ mutex_enter(&spa->spa_vdev_top_lock);
+ mutex_enter(&spa_namespace_lock);
+ return (spa_vdev_config_enter(spa));
+}
+
+/*
+ * Internal implementation for spa_vdev_enter(). Used when a vdev
+ * operation requires multiple syncs (e.g. removing a device) while
+ * keeping the spa_namespace_lock held.
+ */
+uint64_t
+spa_vdev_config_enter(spa_t *spa)
+{
+ ASSERT(MUTEX_HELD(&spa_namespace_lock));
+
+ spa_config_enter(spa, SCL_ALL, spa, RW_WRITER);
+
+ return (spa_last_synced_txg(spa) + 1);
+}
+
+/*
+ * Used in combination with spa_vdev_config_enter() to allow the syncing
+ * of multiple transactions without releasing the spa_namespace_lock.
+ */
+void
+spa_vdev_config_exit(spa_t *spa, vdev_t *vd, uint64_t txg, int error, char *tag)
+{
+ ASSERT(MUTEX_HELD(&spa_namespace_lock));
+
+ int config_changed = B_FALSE;
+
+ ASSERT(txg > spa_last_synced_txg(spa));
+
+ spa->spa_pending_vdev = NULL;
+
+ /*
+ * Reassess the DTLs.
+ */
+ vdev_dtl_reassess(spa->spa_root_vdev, 0, 0, B_FALSE);
+
+ if (error == 0 && !list_is_empty(&spa->spa_config_dirty_list)) {
+ config_changed = B_TRUE;
+ spa->spa_config_generation++;
+ }
+
+ /*
+ * Verify the metaslab classes.
+ */
+ ASSERT(metaslab_class_validate(spa_normal_class(spa)) == 0);
+ ASSERT(metaslab_class_validate(spa_log_class(spa)) == 0);
+ ASSERT(metaslab_class_validate(spa_special_class(spa)) == 0);
+ ASSERT(metaslab_class_validate(spa_dedup_class(spa)) == 0);
+
+ spa_config_exit(spa, SCL_ALL, spa);
+
+ /*
+ * Panic the system if the specified tag requires it. This
+ * is useful for ensuring that configurations are updated
+ * transactionally.
+ */
+ if (zio_injection_enabled)
+ zio_handle_panic_injection(spa, tag, 0);
+
+ /*
+ * Note: this txg_wait_synced() is important because it ensures
+ * that there won't be more than one config change per txg.
+ * This allows us to use the txg as the generation number.
+ */
+ if (error == 0)
+ txg_wait_synced(spa->spa_dsl_pool, txg);
+
+ if (vd != NULL) {
+ ASSERT(!vd->vdev_detached || vd->vdev_dtl_sm == NULL);
+ if (vd->vdev_ops->vdev_op_leaf) {
+ mutex_enter(&vd->vdev_initialize_lock);
+ vdev_initialize_stop(vd, VDEV_INITIALIZE_CANCELED);
+ mutex_exit(&vd->vdev_initialize_lock);
+ }
+
+ spa_config_enter(spa, SCL_ALL, spa, RW_WRITER);
+ vdev_free(vd);
+ spa_config_exit(spa, SCL_ALL, spa);
+ }
+
+ /*
+ * If the config changed, update the config cache.
+ */
+ if (config_changed)
+ spa_write_cachefile(spa, B_FALSE, B_TRUE);
+}
+
+/*
+ * Unlock the spa_t after adding or removing a vdev. Besides undoing the
+ * locking of spa_vdev_enter(), we also want to make sure the transactions have
+ * synced to disk, and then update the global configuration cache with the new
+ * information.
+ */
+int
+spa_vdev_exit(spa_t *spa, vdev_t *vd, uint64_t txg, int error)
+{
+ spa_vdev_config_exit(spa, vd, txg, error, FTAG);
+ mutex_exit(&spa_namespace_lock);
+ mutex_exit(&spa->spa_vdev_top_lock);
+
+ return (error);
+}
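+
+/*
+ * For illustration only (hypothetical caller, hence #if 0): the canonical
+ * shape of a top-level vdev change using the enter/exit pair above.
+ */
+#if 0
+static int
+example_vdev_op(spa_t *spa)
+{
+	uint64_t txg = spa_vdev_enter(spa);
+	int error = 0;
+
+	/* ... modify the vdev tree while holding SCL_ALL as writer ... */
+
+	/* drops the locks, waits for txg to sync, updates the config cache */
+	return (spa_vdev_exit(spa, NULL, txg, error));
+}
+#endif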
+
+/*
+ * Lock the given spa_t for the purpose of changing vdev state.
+ */
+void
+spa_vdev_state_enter(spa_t *spa, int oplocks)
+{
+ int locks = SCL_STATE_ALL | oplocks;
+
+ /*
+	 * Root pools may need to read from the underlying devfs filesystem
+ * when opening up a vdev. Unfortunately if we're holding the
+ * SCL_ZIO lock it will result in a deadlock when we try to issue
+ * the read from the root filesystem. Instead we "prefetch"
+ * the associated vnodes that we need prior to opening the
+ * underlying devices and cache them so that we can prevent
+ * any I/O when we are doing the actual open.
+ */
+ if (spa_is_root(spa)) {
+ int low = locks & ~(SCL_ZIO - 1);
+ int high = locks & ~low;
+
+ spa_config_enter(spa, high, spa, RW_WRITER);
+ vdev_hold(spa->spa_root_vdev);
+ spa_config_enter(spa, low, spa, RW_WRITER);
+ } else {
+ spa_config_enter(spa, locks, spa, RW_WRITER);
+ }
+ spa->spa_vdev_locks = locks;
+}
+
+int
+spa_vdev_state_exit(spa_t *spa, vdev_t *vd, int error)
+{
+ boolean_t config_changed = B_FALSE;
+
+ if (vd != NULL || error == 0)
+ vdev_dtl_reassess(vd ? vd->vdev_top : spa->spa_root_vdev,
+ 0, 0, B_FALSE);
+
+ if (vd != NULL) {
+ vdev_state_dirty(vd->vdev_top);
+ config_changed = B_TRUE;
+ spa->spa_config_generation++;
+ }
+
+ if (spa_is_root(spa))
+ vdev_rele(spa->spa_root_vdev);
+
+ ASSERT3U(spa->spa_vdev_locks, >=, SCL_STATE_ALL);
+ spa_config_exit(spa, spa->spa_vdev_locks, spa);
+
+ /*
+ * If anything changed, wait for it to sync. This ensures that,
+ * from the system administrator's perspective, zpool(1M) commands
+ * are synchronous. This is important for things like zpool offline:
+ * when the command completes, you expect no further I/O from ZFS.
+ */
+ if (vd != NULL)
+ txg_wait_synced(spa->spa_dsl_pool, 0);
+
+ /*
+ * If the config changed, update the config cache.
+ */
+ if (config_changed) {
+ mutex_enter(&spa_namespace_lock);
+ spa_write_cachefile(spa, B_FALSE, B_TRUE);
+ mutex_exit(&spa_namespace_lock);
+ }
+
+ return (error);
+}
+
+/*
+ * ==========================================================================
+ * Miscellaneous functions
+ * ==========================================================================
+ */
+
+void
+spa_activate_mos_feature(spa_t *spa, const char *feature, dmu_tx_t *tx)
+{
+ if (!nvlist_exists(spa->spa_label_features, feature)) {
+ fnvlist_add_boolean(spa->spa_label_features, feature);
+ /*
+ * When we are creating the pool (tx_txg==TXG_INITIAL), we can't
+ * dirty the vdev config because lock SCL_CONFIG is not held.
+ * Thankfully, in this case we don't need to dirty the config
+ * because it will be written out anyway when we finish
+ * creating the pool.
+ */
+ if (tx->tx_txg != TXG_INITIAL)
+ vdev_config_dirty(spa->spa_root_vdev);
+ }
+}
+
+void
+spa_deactivate_mos_feature(spa_t *spa, const char *feature)
+{
+ if (nvlist_remove_all(spa->spa_label_features, feature) == 0)
+ vdev_config_dirty(spa->spa_root_vdev);
+}
+
+/*
+ * Return the spa_t associated with the given pool_guid, if it exists. If
+ * device_guid is non-zero, determine whether the pool exists *and* contains
+ * a device with the specified device_guid.
+ */
+spa_t *
+spa_by_guid(uint64_t pool_guid, uint64_t device_guid)
+{
+ spa_t *spa;
+ avl_tree_t *t = &spa_namespace_avl;
+
+ ASSERT(MUTEX_HELD(&spa_namespace_lock));
+
+ for (spa = avl_first(t); spa != NULL; spa = AVL_NEXT(t, spa)) {
+ if (spa->spa_state == POOL_STATE_UNINITIALIZED)
+ continue;
+ if (spa->spa_root_vdev == NULL)
+ continue;
+ if (spa_guid(spa) == pool_guid) {
+ if (device_guid == 0)
+ break;
+
+ if (vdev_lookup_by_guid(spa->spa_root_vdev,
+ device_guid) != NULL)
+ break;
+
+ /*
+ * Check any devices we may be in the process of adding.
+ */
+ if (spa->spa_pending_vdev) {
+ if (vdev_lookup_by_guid(spa->spa_pending_vdev,
+ device_guid) != NULL)
+ break;
+ }
+ }
+ }
+
+ return (spa);
+}
+
+/*
+ * Determine whether a pool with the given pool_guid exists.
+ */
+boolean_t
+spa_guid_exists(uint64_t pool_guid, uint64_t device_guid)
+{
+ return (spa_by_guid(pool_guid, device_guid) != NULL);
+}
+
+char *
+spa_strdup(const char *s)
+{
+ size_t len;
+ char *new;
+
+ len = strlen(s);
+ new = kmem_alloc(len + 1, KM_SLEEP);
+ bcopy(s, new, len);
+ new[len] = '\0';
+
+ return (new);
+}
+
+void
+spa_strfree(char *s)
+{
+ kmem_free(s, strlen(s) + 1);
+}
+
+uint64_t
+spa_get_random(uint64_t range)
+{
+ uint64_t r;
+
+ ASSERT(range != 0);
+
+ if (range == 1)
+ return (0);
+
+ (void) random_get_pseudo_bytes((void *)&r, sizeof (uint64_t));
+
+ return (r % range);
+}
+
+uint64_t
+spa_generate_guid(spa_t *spa)
+{
+ uint64_t guid = spa_get_random(-1ULL);
+
+ if (spa != NULL) {
+ while (guid == 0 || spa_guid_exists(spa_guid(spa), guid))
+ guid = spa_get_random(-1ULL);
+ } else {
+ while (guid == 0 || spa_guid_exists(guid, 0))
+ guid = spa_get_random(-1ULL);
+ }
+
+ return (guid);
+}
+
+void
+snprintf_blkptr(char *buf, size_t buflen, const blkptr_t *bp)
+{
+ char type[256];
+ char *checksum = NULL;
+ char *compress = NULL;
+
+ if (bp != NULL) {
+ if (BP_GET_TYPE(bp) & DMU_OT_NEWTYPE) {
+ dmu_object_byteswap_t bswap =
+ DMU_OT_BYTESWAP(BP_GET_TYPE(bp));
+ (void) snprintf(type, sizeof (type), "bswap %s %s",
+ DMU_OT_IS_METADATA(BP_GET_TYPE(bp)) ?
+ "metadata" : "data",
+ dmu_ot_byteswap[bswap].ob_name);
+ } else {
+ (void) strlcpy(type, dmu_ot[BP_GET_TYPE(bp)].ot_name,
+ sizeof (type));
+ }
+ if (!BP_IS_EMBEDDED(bp)) {
+ checksum =
+ zio_checksum_table[BP_GET_CHECKSUM(bp)].ci_name;
+ }
+ compress = zio_compress_table[BP_GET_COMPRESS(bp)].ci_name;
+ }
+
+ SNPRINTF_BLKPTR(snprintf, ' ', buf, buflen, bp, type, checksum,
+ compress);
+}
+
+void
+spa_freeze(spa_t *spa)
+{
+ uint64_t freeze_txg = 0;
+
+ spa_config_enter(spa, SCL_ALL, FTAG, RW_WRITER);
+ if (spa->spa_freeze_txg == UINT64_MAX) {
+ freeze_txg = spa_last_synced_txg(spa) + TXG_SIZE;
+ spa->spa_freeze_txg = freeze_txg;
+ }
+ spa_config_exit(spa, SCL_ALL, FTAG);
+ if (freeze_txg != 0)
+ txg_wait_synced(spa_get_dsl(spa), freeze_txg);
+}
+
+void
+zfs_panic_recover(const char *fmt, ...)
+{
+ va_list adx;
+
+ va_start(adx, fmt);
+ vcmn_err(zfs_recover ? CE_WARN : CE_PANIC, fmt, adx);
+ va_end(adx);
+}
+
+/*
+ * This is a stripped-down version of strtoull, suitable only for converting
+ * lowercase hexadecimal numbers that don't overflow.
+ */
+uint64_t
+zfs_strtonum(const char *str, char **nptr)
+{
+ uint64_t val = 0;
+ char c;
+ int digit;
+
+ while ((c = *str) != '\0') {
+ if (c >= '0' && c <= '9')
+ digit = c - '0';
+ else if (c >= 'a' && c <= 'f')
+ digit = 10 + c - 'a';
+ else
+ break;
+
+ val *= 16;
+ val += digit;
+
+ str++;
+ }
+
+ if (nptr)
+ *nptr = (char *)str;
+
+ return (val);
+}
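+
+/*
+ * For example, zfs_strtonum("1a2b/9", &cp) returns 0x1a2b and leaves cp
+ * pointing at the '/'; uppercase digits and "0x" prefixes are not
+ * accepted.
+ */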
+
+void
+spa_activate_allocation_classes(spa_t *spa, dmu_tx_t *tx)
+{
+ /*
+	 * We bump the feature refcount for each special vdev added to the pool.
+ */
+ ASSERT(spa_feature_is_enabled(spa, SPA_FEATURE_ALLOCATION_CLASSES));
+ spa_feature_incr(spa, SPA_FEATURE_ALLOCATION_CLASSES, tx);
+}
+
+/*
+ * ==========================================================================
+ * Accessor functions
+ * ==========================================================================
+ */
+
+boolean_t
+spa_shutting_down(spa_t *spa)
+{
+ return (spa->spa_async_suspended);
+}
+
+dsl_pool_t *
+spa_get_dsl(spa_t *spa)
+{
+ return (spa->spa_dsl_pool);
+}
+
+boolean_t
+spa_is_initializing(spa_t *spa)
+{
+ return (spa->spa_is_initializing);
+}
+
+boolean_t
+spa_indirect_vdevs_loaded(spa_t *spa)
+{
+ return (spa->spa_indirect_vdevs_loaded);
+}
+
+blkptr_t *
+spa_get_rootblkptr(spa_t *spa)
+{
+ return (&spa->spa_ubsync.ub_rootbp);
+}
+
+void
+spa_set_rootblkptr(spa_t *spa, const blkptr_t *bp)
+{
+ spa->spa_uberblock.ub_rootbp = *bp;
+}
+
+void
+spa_altroot(spa_t *spa, char *buf, size_t buflen)
+{
+ if (spa->spa_root == NULL)
+ buf[0] = '\0';
+ else
+ (void) strncpy(buf, spa->spa_root, buflen);
+}
+
+int
+spa_sync_pass(spa_t *spa)
+{
+ return (spa->spa_sync_pass);
+}
+
+char *
+spa_name(spa_t *spa)
+{
+ return (spa->spa_name);
+}
+
+uint64_t
+spa_guid(spa_t *spa)
+{
+ dsl_pool_t *dp = spa_get_dsl(spa);
+ uint64_t guid;
+
+ /*
+ * If we fail to parse the config during spa_load(), we can go through
+ * the error path (which posts an ereport) and end up here with no root
+ * vdev. We stash the original pool guid in 'spa_config_guid' to handle
+ * this case.
+ */
+ if (spa->spa_root_vdev == NULL)
+ return (spa->spa_config_guid);
+
+ guid = spa->spa_last_synced_guid != 0 ?
+ spa->spa_last_synced_guid : spa->spa_root_vdev->vdev_guid;
+
+ /*
+ * Return the most recently synced out guid unless we're
+ * in syncing context.
+ */
+ if (dp && dsl_pool_sync_context(dp))
+ return (spa->spa_root_vdev->vdev_guid);
+ else
+ return (guid);
+}
+
+uint64_t
+spa_load_guid(spa_t *spa)
+{
+ /*
+ * This is a GUID that exists solely as a reference for the
+ * purposes of the arc. It is generated at load time, and
+ * is never written to persistent storage.
+ */
+ return (spa->spa_load_guid);
+}
+
+uint64_t
+spa_last_synced_txg(spa_t *spa)
+{
+ return (spa->spa_ubsync.ub_txg);
+}
+
+uint64_t
+spa_first_txg(spa_t *spa)
+{
+ return (spa->spa_first_txg);
+}
+
+uint64_t
+spa_syncing_txg(spa_t *spa)
+{
+ return (spa->spa_syncing_txg);
+}
+
+/*
+ * Return the last txg where data can be dirtied. The final txgs
+ * are used only to clear out any deferred frees that remain.
+ */
+uint64_t
+spa_final_dirty_txg(spa_t *spa)
+{
+ return (spa->spa_final_txg - TXG_DEFER_SIZE);
+}
+
+pool_state_t
+spa_state(spa_t *spa)
+{
+ return (spa->spa_state);
+}
+
+spa_load_state_t
+spa_load_state(spa_t *spa)
+{
+ return (spa->spa_load_state);
+}
+
+uint64_t
+spa_freeze_txg(spa_t *spa)
+{
+ return (spa->spa_freeze_txg);
+}
+
+/* ARGSUSED */
+uint64_t
+spa_get_worst_case_asize(spa_t *spa, uint64_t lsize)
+{
+ return (lsize * spa_asize_inflation);
+}
+
+/*
+ * Return the amount of slop space in bytes. It is 1/32 of the pool (3.1%),
+ * or at least 128MB, unless that would cause it to be more than half the
+ * pool size.
+ *
+ * See the comment above spa_slop_shift for details.
+ */
+uint64_t
+spa_get_slop_space(spa_t *spa)
+{
+ uint64_t space = spa_get_dspace(spa);
+ return (MAX(space >> spa_slop_shift, MIN(space >> 1, spa_min_slop)));
+}
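+
+/*
+ * Worked examples of the expression above with the default spa_slop_shift
+ * of 5: a 10TB pool reserves 10TB / 32 == 320GB.  A 1GB pool would reserve
+ * only 32MB by the shift, so the spa_min_slop floor applies, capped at
+ * half the pool: MAX(32MB, MIN(512MB, 128MB)) == 128MB.
+ */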
+
+uint64_t
+spa_get_dspace(spa_t *spa)
+{
+ return (spa->spa_dspace);
+}
+
+uint64_t
+spa_get_checkpoint_space(spa_t *spa)
+{
+ return (spa->spa_checkpoint_info.sci_dspace);
+}
+
+void
+spa_update_dspace(spa_t *spa)
+{
+ spa->spa_dspace = metaslab_class_get_dspace(spa_normal_class(spa)) +
+ ddt_get_dedup_dspace(spa);
+ if (spa->spa_vdev_removal != NULL) {
+ /*
+ * We can't allocate from the removing device, so
+ * subtract its size. This prevents the DMU/DSL from
+ * filling up the (now smaller) pool while we are in the
+ * middle of removing the device.
+ *
+ * Note that the DMU/DSL doesn't actually know or care
+ * how much space is allocated (it does its own tracking
+ * of how much space has been logically used). So it
+ * doesn't matter that the data we are moving may be
+ * allocated twice (on the old device and the new
+ * device).
+ */
+ spa_config_enter(spa, SCL_VDEV, FTAG, RW_READER);
+ vdev_t *vd =
+ vdev_lookup_top(spa, spa->spa_vdev_removal->svr_vdev_id);
+ spa->spa_dspace -= spa_deflate(spa) ?
+ vd->vdev_stat.vs_dspace : vd->vdev_stat.vs_space;
+ spa_config_exit(spa, SCL_VDEV, FTAG);
+ }
+}
+
+/*
+ * Return the failure mode that has been set for this pool. The default
+ * behavior will be to block all I/Os when a complete failure occurs.
+ */
+uint8_t
+spa_get_failmode(spa_t *spa)
+{
+ return (spa->spa_failmode);
+}
+
+boolean_t
+spa_suspended(spa_t *spa)
+{
+ return (spa->spa_suspended != ZIO_SUSPEND_NONE);
+}
+
+uint64_t
+spa_version(spa_t *spa)
+{
+ return (spa->spa_ubsync.ub_version);
+}
+
+boolean_t
+spa_deflate(spa_t *spa)
+{
+ return (spa->spa_deflate);
+}
+
+metaslab_class_t *
+spa_normal_class(spa_t *spa)
+{
+ return (spa->spa_normal_class);
+}
+
+metaslab_class_t *
+spa_log_class(spa_t *spa)
+{
+ return (spa->spa_log_class);
+}
+
+metaslab_class_t *
+spa_special_class(spa_t *spa)
+{
+ return (spa->spa_special_class);
+}
+
+metaslab_class_t *
+spa_dedup_class(spa_t *spa)
+{
+ return (spa->spa_dedup_class);
+}
+
+/*
+ * Locate an appropriate allocation class.
+ */
+metaslab_class_t *
+spa_preferred_class(spa_t *spa, uint64_t size, dmu_object_type_t objtype,
+ uint_t level, uint_t special_smallblk)
+{
+ if (DMU_OT_IS_ZIL(objtype)) {
+ if (spa->spa_log_class->mc_groups != 0)
+ return (spa_log_class(spa));
+ else
+ return (spa_normal_class(spa));
+ }
+
+ boolean_t has_special_class = spa->spa_special_class->mc_groups != 0;
+
+ if (DMU_OT_IS_DDT(objtype)) {
+ if (spa->spa_dedup_class->mc_groups != 0)
+ return (spa_dedup_class(spa));
+ else if (has_special_class && zfs_ddt_data_is_special)
+ return (spa_special_class(spa));
+ else
+ return (spa_normal_class(spa));
+ }
+
+ /* Indirect blocks for user data can land in special if allowed */
+ if (level > 0 && (DMU_OT_IS_FILE(objtype) || objtype == DMU_OT_ZVOL)) {
+ if (has_special_class && zfs_user_indirect_is_special)
+ return (spa_special_class(spa));
+ else
+ return (spa_normal_class(spa));
+ }
+
+ if (DMU_OT_IS_METADATA(objtype) || level > 0) {
+ if (has_special_class)
+ return (spa_special_class(spa));
+ else
+ return (spa_normal_class(spa));
+ }
+
+ /*
+	 * Allow small file blocks in the special class in some cases (like
+ * for the dRAID vdev feature). But always leave a reserve of
+ * zfs_special_class_metadata_reserve_pct exclusively for metadata.
+ */
+ if (DMU_OT_IS_FILE(objtype) &&
+ has_special_class && size <= special_smallblk) {
+ metaslab_class_t *special = spa_special_class(spa);
+ uint64_t alloc = metaslab_class_get_alloc(special);
+ uint64_t space = metaslab_class_get_space(special);
+ uint64_t limit =
+ (space * (100 - zfs_special_class_metadata_reserve_pct))
+ / 100;
+
+ if (alloc < limit)
+ return (special);
+ }
+
+ return (spa_normal_class(spa));
+}
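+
+/*
+ * For example, with a 100GB special class and the default 25% metadata
+ * reserve, qualifying small file blocks are steered to the special class
+ * only while its allocations stay below 75GB; beyond that they fall back
+ * to the normal class.
+ */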
+
+void
+spa_evicting_os_register(spa_t *spa, objset_t *os)
+{
+ mutex_enter(&spa->spa_evicting_os_lock);
+ list_insert_head(&spa->spa_evicting_os_list, os);
+ mutex_exit(&spa->spa_evicting_os_lock);
+}
+
+void
+spa_evicting_os_deregister(spa_t *spa, objset_t *os)
+{
+ mutex_enter(&spa->spa_evicting_os_lock);
+ list_remove(&spa->spa_evicting_os_list, os);
+ cv_broadcast(&spa->spa_evicting_os_cv);
+ mutex_exit(&spa->spa_evicting_os_lock);
+}
+
+void
+spa_evicting_os_wait(spa_t *spa)
+{
+ mutex_enter(&spa->spa_evicting_os_lock);
+ while (!list_is_empty(&spa->spa_evicting_os_list))
+ cv_wait(&spa->spa_evicting_os_cv, &spa->spa_evicting_os_lock);
+ mutex_exit(&spa->spa_evicting_os_lock);
+
+ dmu_buf_user_evict_wait();
+}
+
+int
+spa_max_replication(spa_t *spa)
+{
+ /*
+ * As of SPA_VERSION == SPA_VERSION_DITTO_BLOCKS, we are able to
+ * handle BPs with more than one DVA allocated. Set our max
+ * replication level accordingly.
+ */
+ if (spa_version(spa) < SPA_VERSION_DITTO_BLOCKS)
+ return (1);
+ return (MIN(SPA_DVAS_PER_BP, spa_max_replication_override));
+}
+
+int
+spa_prev_software_version(spa_t *spa)
+{
+ return (spa->spa_prev_software_version);
+}
+
+uint64_t
+spa_deadman_synctime(spa_t *spa)
+{
+ return (spa->spa_deadman_synctime);
+}
+
+struct proc *
+spa_proc(spa_t *spa)
+{
+ return (spa->spa_proc);
+}
+
+uint64_t
+dva_get_dsize_sync(spa_t *spa, const dva_t *dva)
+{
+ uint64_t asize = DVA_GET_ASIZE(dva);
+ uint64_t dsize = asize;
+
+ ASSERT(spa_config_held(spa, SCL_ALL, RW_READER) != 0);
+
+ if (asize != 0 && spa->spa_deflate) {
+ uint64_t vdev = DVA_GET_VDEV(dva);
+ vdev_t *vd = vdev_lookup_top(spa, vdev);
+ if (vd == NULL) {
+ panic(
+ "dva_get_dsize_sync(): bad DVA %llu:%llu",
+ (u_longlong_t)vdev, (u_longlong_t)asize);
+ }
+ dsize = (asize >> SPA_MINBLOCKSHIFT) * vd->vdev_deflate_ratio;
+ }
+
+ return (dsize);
+}
+
+uint64_t
+bp_get_dsize_sync(spa_t *spa, const blkptr_t *bp)
+{
+ uint64_t dsize = 0;
+
+ for (int d = 0; d < BP_GET_NDVAS(bp); d++)
+ dsize += dva_get_dsize_sync(spa, &bp->blk_dva[d]);
+
+ return (dsize);
+}
+
+uint64_t
+bp_get_dsize(spa_t *spa, const blkptr_t *bp)
+{
+ uint64_t dsize = 0;
+
+ spa_config_enter(spa, SCL_VDEV, FTAG, RW_READER);
+
+ for (int d = 0; d < BP_GET_NDVAS(bp); d++)
+ dsize += dva_get_dsize_sync(spa, &bp->blk_dva[d]);
+
+ spa_config_exit(spa, SCL_VDEV, FTAG);
+
+ return (dsize);
+}
+
+uint64_t
+spa_dirty_data(spa_t *spa)
+{
+ return (spa->spa_dsl_pool->dp_dirty_total);
+}
+
+/*
+ * ==========================================================================
+ * Initialization and Termination
+ * ==========================================================================
+ */
+
+static int
+spa_name_compare(const void *a1, const void *a2)
+{
+ const spa_t *s1 = a1;
+ const spa_t *s2 = a2;
+ int s;
+
+ s = strcmp(s1->spa_name, s2->spa_name);
+
+ return (AVL_ISIGN(s));
+}
+
+int
+spa_busy(void)
+{
+ return (spa_active_count);
+}
+
+void
+spa_boot_init(void)
+{
+ spa_config_load();
+}
+
+#ifdef _KERNEL
+EVENTHANDLER_DEFINE(mountroot, spa_boot_init, NULL, 0);
+#endif
+
+void
+spa_init(int mode)
+{
+ mutex_init(&spa_namespace_lock, NULL, MUTEX_DEFAULT, NULL);
+ mutex_init(&spa_spare_lock, NULL, MUTEX_DEFAULT, NULL);
+ mutex_init(&spa_l2cache_lock, NULL, MUTEX_DEFAULT, NULL);
+ cv_init(&spa_namespace_cv, NULL, CV_DEFAULT, NULL);
+
+ avl_create(&spa_namespace_avl, spa_name_compare, sizeof (spa_t),
+ offsetof(spa_t, spa_avl));
+
+ avl_create(&spa_spare_avl, spa_spare_compare, sizeof (spa_aux_t),
+ offsetof(spa_aux_t, aux_avl));
+
+ avl_create(&spa_l2cache_avl, spa_l2cache_compare, sizeof (spa_aux_t),
+ offsetof(spa_aux_t, aux_avl));
+
+ spa_mode_global = mode;
+
+#ifdef illumos
+#ifdef _KERNEL
+ spa_arch_init();
+#else
+ if (spa_mode_global != FREAD && dprintf_find_string("watch")) {
+ arc_procfd = open("/proc/self/ctl", O_WRONLY);
+ if (arc_procfd == -1) {
+ perror("could not enable watchpoints: "
+ "opening /proc/self/ctl failed: ");
+ } else {
+ arc_watch = B_TRUE;
+ }
+ }
+#endif
+#endif /* illumos */
+
+ zfs_refcount_init();
+ unique_init();
+ range_tree_init();
+ metaslab_alloc_trace_init();
+ zio_init();
+ lz4_init();
+ dmu_init();
+ zil_init();
+ vdev_cache_stat_init();
+ vdev_file_init();
+ zfs_prop_init();
+ zpool_prop_init();
+ zpool_feature_init();
+ spa_config_load();
+ l2arc_start();
+ scan_init();
+ dsl_scan_global_init();
+#ifndef illumos
+#ifdef _KERNEL
+ zfs_deadman_init();
+#endif
+#endif /* !illumos */
+}
+
+void
+spa_fini(void)
+{
+ l2arc_stop();
+
+ spa_evict_all();
+
+ vdev_file_fini();
+ vdev_cache_stat_fini();
+ zil_fini();
+ dmu_fini();
+ lz4_fini();
+ zio_fini();
+ metaslab_alloc_trace_fini();
+ range_tree_fini();
+ unique_fini();
+ zfs_refcount_fini();
+ scan_fini();
+
+ avl_destroy(&spa_namespace_avl);
+ avl_destroy(&spa_spare_avl);
+ avl_destroy(&spa_l2cache_avl);
+
+ cv_destroy(&spa_namespace_cv);
+ mutex_destroy(&spa_namespace_lock);
+ mutex_destroy(&spa_spare_lock);
+ mutex_destroy(&spa_l2cache_lock);
+}
+
+/*
+ * Return whether this pool has slogs. No locking needed.
+ * It's not a problem if the wrong answer is returned, as it's only for
+ * performance and not correctness.
+ */
+boolean_t
+spa_has_slogs(spa_t *spa)
+{
+ return (spa->spa_log_class->mc_rotor != NULL);
+}
+
+spa_log_state_t
+spa_get_log_state(spa_t *spa)
+{
+ return (spa->spa_log_state);
+}
+
+void
+spa_set_log_state(spa_t *spa, spa_log_state_t state)
+{
+ spa->spa_log_state = state;
+}
+
+boolean_t
+spa_is_root(spa_t *spa)
+{
+ return (spa->spa_is_root);
+}
+
+boolean_t
+spa_writeable(spa_t *spa)
+{
+ return (!!(spa->spa_mode & FWRITE) && spa->spa_trust_config);
+}
+
+/*
+ * Returns true if there is a pending sync task in any of the current
+ * syncing txg, the current quiescing txg, or the current open txg.
+ */
+boolean_t
+spa_has_pending_synctask(spa_t *spa)
+{
+ return (!txg_all_lists_empty(&spa->spa_dsl_pool->dp_sync_tasks) ||
+ !txg_all_lists_empty(&spa->spa_dsl_pool->dp_early_sync_tasks));
+}
+
+int
+spa_mode(spa_t *spa)
+{
+ return (spa->spa_mode);
+}
+
+uint64_t
+spa_bootfs(spa_t *spa)
+{
+ return (spa->spa_bootfs);
+}
+
+uint64_t
+spa_delegation(spa_t *spa)
+{
+ return (spa->spa_delegation);
+}
+
+objset_t *
+spa_meta_objset(spa_t *spa)
+{
+ return (spa->spa_meta_objset);
+}
+
+enum zio_checksum
+spa_dedup_checksum(spa_t *spa)
+{
+ return (spa->spa_dedup_checksum);
+}
+
+/*
+ * Reset pool scan stat per scan pass (or reboot).
+ */
+void
+spa_scan_stat_init(spa_t *spa)
+{
+ /* data not stored on disk */
+ spa->spa_scan_pass_start = gethrestime_sec();
+ if (dsl_scan_is_paused_scrub(spa->spa_dsl_pool->dp_scan))
+ spa->spa_scan_pass_scrub_pause = spa->spa_scan_pass_start;
+ else
+ spa->spa_scan_pass_scrub_pause = 0;
+ spa->spa_scan_pass_scrub_spent_paused = 0;
+ spa->spa_scan_pass_exam = 0;
+ spa->spa_scan_pass_issued = 0;
+ vdev_scan_stat_init(spa->spa_root_vdev);
+}
+
+/*
+ * Get scan stats for zpool status reports
+ */
+int
+spa_scan_get_stats(spa_t *spa, pool_scan_stat_t *ps)
+{
+ dsl_scan_t *scn = spa->spa_dsl_pool ? spa->spa_dsl_pool->dp_scan : NULL;
+
+ if (scn == NULL || scn->scn_phys.scn_func == POOL_SCAN_NONE)
+ return (SET_ERROR(ENOENT));
+ bzero(ps, sizeof (pool_scan_stat_t));
+
+ /* data stored on disk */
+ ps->pss_func = scn->scn_phys.scn_func;
+ ps->pss_state = scn->scn_phys.scn_state;
+ ps->pss_start_time = scn->scn_phys.scn_start_time;
+ ps->pss_end_time = scn->scn_phys.scn_end_time;
+ ps->pss_to_examine = scn->scn_phys.scn_to_examine;
+ ps->pss_to_process = scn->scn_phys.scn_to_process;
+ ps->pss_processed = scn->scn_phys.scn_processed;
+ ps->pss_errors = scn->scn_phys.scn_errors;
+ ps->pss_examined = scn->scn_phys.scn_examined;
+ ps->pss_issued =
+ scn->scn_issued_before_pass + spa->spa_scan_pass_issued;
+ /* data not stored on disk */
+ ps->pss_pass_start = spa->spa_scan_pass_start;
+ ps->pss_pass_exam = spa->spa_scan_pass_exam;
+ ps->pss_pass_issued = spa->spa_scan_pass_issued;
+ ps->pss_pass_scrub_pause = spa->spa_scan_pass_scrub_pause;
+ ps->pss_pass_scrub_spent_paused = spa->spa_scan_pass_scrub_spent_paused;
+
+ return (0);
+}
+
+int
+spa_maxblocksize(spa_t *spa)
+{
+ if (spa_feature_is_enabled(spa, SPA_FEATURE_LARGE_BLOCKS))
+ return (SPA_MAXBLOCKSIZE);
+ else
+ return (SPA_OLD_MAXBLOCKSIZE);
+}
+
+int
+spa_maxdnodesize(spa_t *spa)
+{
+ if (spa_feature_is_enabled(spa, SPA_FEATURE_LARGE_DNODE))
+ return (DNODE_MAX_SIZE);
+ else
+ return (DNODE_MIN_SIZE);
+}
+
+boolean_t
+spa_multihost(spa_t *spa)
+{
+ return (spa->spa_multihost ? B_TRUE : B_FALSE);
+}
+
+unsigned long
+spa_get_hostid(void)
+{
+ unsigned long myhostid;
+
+#ifdef _KERNEL
+ myhostid = zone_get_hostid(NULL);
+#else /* _KERNEL */
+ /*
+ * We're emulating the system's hostid in userland, so
+ * we can't use zone_get_hostid().
+ */
+ (void) ddi_strtoul(hw_serial, NULL, 10, &myhostid);
+#endif /* _KERNEL */
+
+ return (myhostid);
+}
+
+/*
+ * Returns the txg in which the last device removal completed. No
+ * indirect mappings have been added since this txg.
+ */
+uint64_t
+spa_get_last_removal_txg(spa_t *spa)
+{
+ uint64_t vdevid;
+ uint64_t ret = -1ULL;
+
+ spa_config_enter(spa, SCL_VDEV, FTAG, RW_READER);
+ /*
+ * sr_prev_indirect_vdev is only modified while holding all the
+ * config locks, so it is sufficient to hold SCL_VDEV as reader when
+ * examining it.
+ */
+ vdevid = spa->spa_removing_phys.sr_prev_indirect_vdev;
+
+ while (vdevid != -1ULL) {
+ vdev_t *vd = vdev_lookup_top(spa, vdevid);
+ vdev_indirect_births_t *vib = vd->vdev_indirect_births;
+
+ ASSERT3P(vd->vdev_ops, ==, &vdev_indirect_ops);
+
+ /*
+ * If the removal did not remap any data, we don't care.
+ */
+ if (vdev_indirect_births_count(vib) != 0) {
+ ret = vdev_indirect_births_last_entry_txg(vib);
+ break;
+ }
+
+ vdevid = vd->vdev_indirect_config.vic_prev_indirect_vdev;
+ }
+ spa_config_exit(spa, SCL_VDEV, FTAG);
+
+ IMPLY(ret != -1ULL,
+ spa_feature_is_active(spa, SPA_FEATURE_DEVICE_REMOVAL));
+
+ return (ret);
+}
+
+boolean_t
+spa_trust_config(spa_t *spa)
+{
+ return (spa->spa_trust_config);
+}
+
+uint64_t
+spa_missing_tvds_allowed(spa_t *spa)
+{
+ return (spa->spa_missing_tvds_allowed);
+}
+
+void
+spa_set_missing_tvds(spa_t *spa, uint64_t missing)
+{
+ spa->spa_missing_tvds = missing;
+}
+
+boolean_t
+spa_top_vdevs_spacemap_addressable(spa_t *spa)
+{
+ vdev_t *rvd = spa->spa_root_vdev;
+ for (uint64_t c = 0; c < rvd->vdev_children; c++) {
+ if (!vdev_is_spacemap_addressable(rvd->vdev_child[c]))
+ return (B_FALSE);
+ }
+ return (B_TRUE);
+}
+
+boolean_t
+spa_has_checkpoint(spa_t *spa)
+{
+ return (spa->spa_checkpoint_txg != 0);
+}
+
+boolean_t
+spa_importing_readonly_checkpoint(spa_t *spa)
+{
+ return ((spa->spa_import_flags & ZFS_IMPORT_CHECKPOINT) &&
+ spa->spa_mode == FREAD);
+}
+
+uint64_t
+spa_min_claim_txg(spa_t *spa)
+{
+ uint64_t checkpoint_txg = spa->spa_uberblock.ub_checkpoint_txg;
+
+ if (checkpoint_txg != 0)
+ return (checkpoint_txg + 1);
+
+ return (spa->spa_first_txg);
+}
+
+/*
+ * If there is a checkpoint, async destroys may consume more space from
+ * the pool instead of freeing it. In an attempt to save the pool from
+ * getting suspended when it is about to run out of space, we stop
+ * processing async destroys.
+ */
+boolean_t
+spa_suspend_async_destroy(spa_t *spa)
+{
+ dsl_pool_t *dp = spa_get_dsl(spa);
+
+ uint64_t unreserved = dsl_pool_unreserved_space(dp,
+ ZFS_SPACE_CHECK_EXTRA_RESERVED);
+ uint64_t used = dsl_dir_phys(dp->dp_root_dir)->dd_used_bytes;
+ uint64_t avail = (unreserved > used) ? (unreserved - used) : 0;
+
+ if (spa_has_checkpoint(spa) && avail == 0)
+ return (B_TRUE);
+
+ return (B_FALSE);
+}
diff --git a/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/space_map.c b/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/space_map.c
new file mode 100644
index 000000000000..9ed7a1f4b761
--- /dev/null
+++ b/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/space_map.c
@@ -0,0 +1,1073 @@
+/*
+ * CDDL HEADER START
+ *
+ * The contents of this file are subject to the terms of the
+ * Common Development and Distribution License (the "License").
+ * You may not use this file except in compliance with the License.
+ *
+ * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
+ * or http://www.opensolaris.org/os/licensing.
+ * See the License for the specific language governing permissions
+ * and limitations under the License.
+ *
+ * When distributing Covered Code, include this CDDL HEADER in each
+ * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
+ * If applicable, add the following below this CDDL HEADER, with the
+ * fields enclosed by brackets "[]" replaced with your own identifying
+ * information: Portions Copyright [yyyy] [name of copyright owner]
+ *
+ * CDDL HEADER END
+ */
+/*
+ * Copyright 2009 Sun Microsystems, Inc. All rights reserved.
+ * Use is subject to license terms.
+ */
+/*
+ * Copyright (c) 2012, 2018 by Delphix. All rights reserved.
+ */
+
+#include <sys/zfs_context.h>
+#include <sys/spa.h>
+#include <sys/dmu.h>
+#include <sys/dmu_tx.h>
+#include <sys/dnode.h>
+#include <sys/dsl_pool.h>
+#include <sys/zio.h>
+#include <sys/space_map.h>
+#include <sys/refcount.h>
+#include <sys/zfeature.h>
+
+SYSCTL_DECL(_vfs_zfs);
+
+/*
+ * Note on space map block size:
+ *
+ * The data for a given space map can be kept on blocks of any size.
+ * Larger blocks entail fewer I/O operations, but they also cause the
+ * DMU to keep more data in-core, and also to waste more I/O bandwidth
+ * when only a few blocks have changed since the last transaction group.
+ */
+
+/*
+ * Enabled whenever we want to stress test the use of double-word
+ * space map entries.
+ */
+boolean_t zfs_force_some_double_word_sm_entries = B_FALSE;
+
+/*
+ * Override the default indirect block size of 128K, instead using 16K for
+ * spacemaps (2^14 bytes). This dramatically reduces write inflation since
+ * appending to a spacemap typically has to write one data block (4KB) and one
+ * or two indirect blocks (16K-32K, rather than 128K).
+ */
+int space_map_ibs = 14;
+
+SYSCTL_INT(_vfs_zfs, OID_AUTO, space_map_ibs, CTLFLAG_RWTUN,
+ &space_map_ibs, 0, "Space map indirect block shift");
+
+boolean_t
+sm_entry_is_debug(uint64_t e)
+{
+ return (SM_PREFIX_DECODE(e) == SM_DEBUG_PREFIX);
+}
+
+boolean_t
+sm_entry_is_single_word(uint64_t e)
+{
+ uint8_t prefix = SM_PREFIX_DECODE(e);
+ return (prefix != SM_DEBUG_PREFIX && prefix != SM2_PREFIX);
+}
+
+boolean_t
+sm_entry_is_double_word(uint64_t e)
+{
+ return (SM_PREFIX_DECODE(e) == SM2_PREFIX);
+}
+
+/*
+ * Iterate through the space map, invoking the callback on each (non-debug)
+ * space map entry. Stop after reading 'end' bytes of the space map.
+ */
+int
+space_map_iterate(space_map_t *sm, uint64_t end, sm_cb_t callback, void *arg)
+{
+ uint64_t blksz = sm->sm_blksz;
+
+ ASSERT3U(blksz, !=, 0);
+ ASSERT3U(end, <=, space_map_length(sm));
+ ASSERT0(P2PHASE(end, sizeof (uint64_t)));
+
+ dmu_prefetch(sm->sm_os, space_map_object(sm), 0, 0, end,
+ ZIO_PRIORITY_SYNC_READ);
+
+ int error = 0;
+ for (uint64_t block_base = 0; block_base < end && error == 0;
+ block_base += blksz) {
+ dmu_buf_t *db;
+ error = dmu_buf_hold(sm->sm_os, space_map_object(sm),
+ block_base, FTAG, &db, DMU_READ_PREFETCH);
+ if (error != 0)
+ return (error);
+
+ uint64_t *block_start = db->db_data;
+ uint64_t block_length = MIN(end - block_base, blksz);
+ uint64_t *block_end = block_start +
+ (block_length / sizeof (uint64_t));
+
+ VERIFY0(P2PHASE(block_length, sizeof (uint64_t)));
+ VERIFY3U(block_length, !=, 0);
+ ASSERT3U(blksz, ==, db->db_size);
+
+ for (uint64_t *block_cursor = block_start;
+ block_cursor < block_end && error == 0; block_cursor++) {
+ uint64_t e = *block_cursor;
+
+ if (sm_entry_is_debug(e)) /* Skip debug entries */
+ continue;
+
+ uint64_t raw_offset, raw_run, vdev_id;
+ maptype_t type;
+ if (sm_entry_is_single_word(e)) {
+ type = SM_TYPE_DECODE(e);
+ vdev_id = SM_NO_VDEVID;
+ raw_offset = SM_OFFSET_DECODE(e);
+ raw_run = SM_RUN_DECODE(e);
+ } else {
+ /* it is a two-word entry */
+ ASSERT(sm_entry_is_double_word(e));
+ raw_run = SM2_RUN_DECODE(e);
+ vdev_id = SM2_VDEV_DECODE(e);
+
+ /* move on to the second word */
+ block_cursor++;
+ VERIFY3P(block_cursor, <, block_end);
+ e = *block_cursor;
+
+ type = SM2_TYPE_DECODE(e);
+ raw_offset = SM2_OFFSET_DECODE(e);
+ }
+
+ uint64_t entry_offset = (raw_offset << sm->sm_shift) +
+ sm->sm_start;
+ uint64_t entry_run = raw_run << sm->sm_shift;
+
+ VERIFY0(P2PHASE(entry_offset, 1ULL << sm->sm_shift));
+ VERIFY0(P2PHASE(entry_run, 1ULL << sm->sm_shift));
+ ASSERT3U(entry_offset, >=, sm->sm_start);
+ ASSERT3U(entry_offset, <, sm->sm_start + sm->sm_size);
+ ASSERT3U(entry_run, <=, sm->sm_size);
+ ASSERT3U(entry_offset + entry_run, <=,
+ sm->sm_start + sm->sm_size);
+
+ space_map_entry_t sme = {
+ .sme_type = type,
+ .sme_vdev = vdev_id,
+ .sme_offset = entry_offset,
+ .sme_run = entry_run
+ };
+ error = callback(&sme, arg);
+ }
+ dmu_buf_rele(db, FTAG);
+ }
+ return (error);
+}
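+
+/*
+ * A minimal consumer sketch for space_map_iterate() (the sm_tally
+ * names are hypothetical and not part of this interface). The
+ * callback tallies how many bytes each entry type covers; returning
+ * non-zero from a callback stops the walk. A caller would walk the
+ * whole map with:
+ *
+ *    sm_tally_t tally = { 0 };
+ *    error = space_map_iterate(sm, space_map_length(sm),
+ *        sm_tally_cb, &tally);
+ */
+typedef struct sm_tally {
+ uint64_t smt_alloc; /* bytes covered by SM_ALLOC entries */
+ uint64_t smt_free; /* bytes covered by SM_FREE entries */
+} sm_tally_t;
+
+static int
+sm_tally_cb(space_map_entry_t *sme, void *arg)
+{
+ sm_tally_t *tally = arg;
+
+ if (sme->sme_type == SM_ALLOC)
+ tally->smt_alloc += sme->sme_run;
+ else
+ tally->smt_free += sme->sme_run;
+
+ return (0);
+}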
+
+/*
+ * Reads the entries from the last block of the space map into
+ * buf in reverse order. Populates nwords with the number of
+ * words in the last block.
+ *
+ * Refer to block comment within space_map_incremental_destroy()
+ * to understand why this function is needed.
+ */
+static int
+space_map_reversed_last_block_entries(space_map_t *sm, uint64_t *buf,
+ uint64_t bufsz, uint64_t *nwords)
+{
+ int error = 0;
+ dmu_buf_t *db;
+
+ /*
+ * Find the offset of the last word in the space map and use
+ * that to read the last block of the space map with
+ * dmu_buf_hold().
+ */
+ uint64_t last_word_offset =
+ sm->sm_phys->smp_length - sizeof (uint64_t);
+ error = dmu_buf_hold(sm->sm_os, space_map_object(sm), last_word_offset,
+ FTAG, &db, DMU_READ_NO_PREFETCH);
+ if (error != 0)
+ return (error);
+
+ ASSERT3U(sm->sm_object, ==, db->db_object);
+ ASSERT3U(sm->sm_blksz, ==, db->db_size);
+ ASSERT3U(bufsz, >=, db->db_size);
+ ASSERT(nwords != NULL);
+
+ uint64_t *words = db->db_data;
+ *nwords =
+ (sm->sm_phys->smp_length - db->db_offset) / sizeof (uint64_t);
+
+ ASSERT3U(*nwords, <=, bufsz / sizeof (uint64_t));
+
+ uint64_t n = *nwords;
+ uint64_t j = n - 1;
+ for (uint64_t i = 0; i < n; i++) {
+ uint64_t entry = words[i];
+ if (sm_entry_is_double_word(entry)) {
+ /*
+ * Since we are populating the buffer backwards
+ * we have to be extra careful and add the two
+ * words of the double-word entry in the right
+ * order.
+ */
+ ASSERT3U(j, >, 0);
+ buf[j - 1] = entry;
+
+ i++;
+ ASSERT3U(i, <, n);
+ entry = words[i];
+ buf[j] = entry;
+ j -= 2;
+ } else {
+ ASSERT(sm_entry_is_debug(entry) ||
+ sm_entry_is_single_word(entry));
+ buf[j] = entry;
+ j--;
+ }
+ }
+
+ /*
+ * Assert that we wrote backwards all the
+ * way to the beginning of the buffer.
+ */
+ ASSERT3S(j, ==, -1);
+
+ dmu_buf_rele(db, FTAG);
+ return (error);
+}
+
+/*
+ * Note: This function performs destructive actions - specifically
+ * it deletes entries from the end of the space map. Thus, callers
+ * should ensure that they are holding the appropriate locks for
+ * the space map that they provide.
+ */
+int
+space_map_incremental_destroy(space_map_t *sm, sm_cb_t callback, void *arg,
+ dmu_tx_t *tx)
+{
+ uint64_t bufsz = MAX(sm->sm_blksz, SPA_MINBLOCKSIZE);
+ uint64_t *buf = zio_buf_alloc(bufsz);
+
+ dmu_buf_will_dirty(sm->sm_dbuf, tx);
+
+ /*
+ * Ideally we would want to iterate from the beginning of the
+ * space map to the end in incremental steps. The issue with this
+ * approach is that we don't have any field on-disk that points
+ * us where to start between each step. We could try zeroing out
+ * entries that we've destroyed, but this doesn't work either as
+ * an entry that is 0 is a valid one (ALLOC for range [0x0:0x200]).
+ *
+	 * As a result, we destroy the map's entries incrementally, starting
+	 * from the end, applying the callback to each entry before deleting
+	 * it.
+ *
+ * The problem with this approach is that we cannot literally
+ * iterate through the words in the space map backwards as we
+ * can't distinguish two-word space map entries from their second
+ * word. Thus we do the following:
+ *
+ * 1] We get all the entries from the last block of the space map
+ * and put them into a buffer in reverse order. This way the
+ * last entry comes first in the buffer, the second to last is
+ * second, etc.
+	 * 2] We iterate through the entries in the buffer and we apply
+	 *    the callback to each one. As we move from entry to entry,
+	 *    we decrease the size of the space map, effectively deleting
+	 *    each entry.
+ * 3] If there are no more entries in the space map or the callback
+ * returns a value other than 0, we stop iterating over the
+ * space map. If there are entries remaining and the callback
+ * returned 0, we go back to step [1].
+ */
+ int error = 0;
+ while (space_map_length(sm) > 0 && error == 0) {
+ uint64_t nwords = 0;
+ error = space_map_reversed_last_block_entries(sm, buf, bufsz,
+ &nwords);
+ if (error != 0)
+ break;
+
+ ASSERT3U(nwords, <=, bufsz / sizeof (uint64_t));
+
+ for (uint64_t i = 0; i < nwords; i++) {
+ uint64_t e = buf[i];
+
+ if (sm_entry_is_debug(e)) {
+ sm->sm_phys->smp_length -= sizeof (uint64_t);
+ continue;
+ }
+
+ int words = 1;
+ uint64_t raw_offset, raw_run, vdev_id;
+ maptype_t type;
+ if (sm_entry_is_single_word(e)) {
+ type = SM_TYPE_DECODE(e);
+ vdev_id = SM_NO_VDEVID;
+ raw_offset = SM_OFFSET_DECODE(e);
+ raw_run = SM_RUN_DECODE(e);
+ } else {
+ ASSERT(sm_entry_is_double_word(e));
+ words = 2;
+
+ raw_run = SM2_RUN_DECODE(e);
+ vdev_id = SM2_VDEV_DECODE(e);
+
+ /* move to the second word */
+ i++;
+ ASSERT3U(i, <, nwords);
+ e = buf[i];
+
+ type = SM2_TYPE_DECODE(e);
+ raw_offset = SM2_OFFSET_DECODE(e);
+ }
+
+ uint64_t entry_offset =
+ (raw_offset << sm->sm_shift) + sm->sm_start;
+ uint64_t entry_run = raw_run << sm->sm_shift;
+
+ VERIFY0(P2PHASE(entry_offset, 1ULL << sm->sm_shift));
+ VERIFY0(P2PHASE(entry_run, 1ULL << sm->sm_shift));
+ VERIFY3U(entry_offset, >=, sm->sm_start);
+ VERIFY3U(entry_offset, <, sm->sm_start + sm->sm_size);
+ VERIFY3U(entry_run, <=, sm->sm_size);
+ VERIFY3U(entry_offset + entry_run, <=,
+ sm->sm_start + sm->sm_size);
+
+ space_map_entry_t sme = {
+ .sme_type = type,
+ .sme_vdev = vdev_id,
+ .sme_offset = entry_offset,
+ .sme_run = entry_run
+ };
+ error = callback(&sme, arg);
+ if (error != 0)
+ break;
+
+ if (type == SM_ALLOC)
+ sm->sm_phys->smp_alloc -= entry_run;
+ else
+ sm->sm_phys->smp_alloc += entry_run;
+ sm->sm_phys->smp_length -= words * sizeof (uint64_t);
+ }
+ }
+
+ if (space_map_length(sm) == 0) {
+ ASSERT0(error);
+ ASSERT0(space_map_allocated(sm));
+ }
+
+ zio_buf_free(buf, bufsz);
+ return (error);
+}
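+
+/*
+ * A minimal callback sketch for space_map_incremental_destroy()
+ * (sm_count_destroyed_cb is hypothetical; real callers would
+ * typically return each destroyed segment to an allocator rather
+ * than just count it). Invoked in syncing context with the space
+ * map's lock held:
+ *
+ *    uint64_t bytes = 0;
+ *    VERIFY0(space_map_incremental_destroy(sm,
+ *        sm_count_destroyed_cb, &bytes, tx));
+ *    ASSERT0(space_map_length(sm));
+ */
+static int
+sm_count_destroyed_cb(space_map_entry_t *sme, void *arg)
+{
+ uint64_t *bytes = arg;
+
+ *bytes += sme->sme_run;
+ return (0);
+}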
+
+typedef struct space_map_load_arg {
+ space_map_t *smla_sm;
+ range_tree_t *smla_rt;
+ maptype_t smla_type;
+} space_map_load_arg_t;
+
+static int
+space_map_load_callback(space_map_entry_t *sme, void *arg)
+{
+ space_map_load_arg_t *smla = arg;
+ if (sme->sme_type == smla->smla_type) {
+ VERIFY3U(range_tree_space(smla->smla_rt) + sme->sme_run, <=,
+ smla->smla_sm->sm_size);
+ range_tree_add(smla->smla_rt, sme->sme_offset, sme->sme_run);
+ } else {
+ range_tree_remove(smla->smla_rt, sme->sme_offset, sme->sme_run);
+ }
+
+ return (0);
+}
+
+/*
+ * Load the space map into the range tree, as in space_map_load(), but
+ * only read the first 'length' bytes of the space map.
+ */
+int
+space_map_load_length(space_map_t *sm, range_tree_t *rt, maptype_t maptype,
+ uint64_t length)
+{
+ space_map_load_arg_t smla;
+
+ VERIFY0(range_tree_space(rt));
+
+ if (maptype == SM_FREE)
+ range_tree_add(rt, sm->sm_start, sm->sm_size);
+
+ smla.smla_rt = rt;
+ smla.smla_sm = sm;
+ smla.smla_type = maptype;
+ int err = space_map_iterate(sm, length,
+ space_map_load_callback, &smla);
+
+ if (err != 0)
+ range_tree_vacate(rt, NULL, NULL);
+
+ return (err);
+}
+
+/*
+ * Load the on-disk space map into the specified range tree. Segments
+ * of maptype are added to the range tree; other segment types are
+ * removed.
+ */
+int
+space_map_load(space_map_t *sm, range_tree_t *rt, maptype_t maptype)
+{
+ return (space_map_load_length(sm, rt, maptype, space_map_length(sm)));
+}
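+
+/*
+ * A minimal lifecycle sketch around space_map_load() (the
+ * sm_load_free_segments wrapper is hypothetical; 'rt' is assumed to
+ * be an empty range tree created by the caller).
+ */
+static int
+sm_load_free_segments(objset_t *os, uint64_t smobj, uint64_t start,
+ uint64_t size, uint8_t shift, range_tree_t *rt)
+{
+ space_map_t *sm = NULL;
+ int error;
+
+ error = space_map_open(&sm, os, smobj, start, size, shift);
+ if (error != 0)
+ return (error);
+
+ /* Populate 'rt' with every free segment of the space map. */
+ error = space_map_load(sm, rt, SM_FREE);
+
+ space_map_close(sm);
+ return (error);
+}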
+
+void
+space_map_histogram_clear(space_map_t *sm)
+{
+ if (sm->sm_dbuf->db_size != sizeof (space_map_phys_t))
+ return;
+
+ bzero(sm->sm_phys->smp_histogram, sizeof (sm->sm_phys->smp_histogram));
+}
+
+boolean_t
+space_map_histogram_verify(space_map_t *sm, range_tree_t *rt)
+{
+ /*
+ * Verify that the in-core range tree does not have any
+	 * ranges smaller than 2^sm_shift bytes.
+ */
+ for (int i = 0; i < sm->sm_shift; i++) {
+ if (rt->rt_histogram[i] != 0)
+ return (B_FALSE);
+ }
+ return (B_TRUE);
+}
+
+void
+space_map_histogram_add(space_map_t *sm, range_tree_t *rt, dmu_tx_t *tx)
+{
+ int idx = 0;
+
+ ASSERT(dmu_tx_is_syncing(tx));
+ VERIFY3U(space_map_object(sm), !=, 0);
+
+ if (sm->sm_dbuf->db_size != sizeof (space_map_phys_t))
+ return;
+
+ dmu_buf_will_dirty(sm->sm_dbuf, tx);
+
+ ASSERT(space_map_histogram_verify(sm, rt));
+ /*
+ * Transfer the content of the range tree histogram to the space
+	 * map histogram. The space map histogram contains 32 buckets ranging
+	 * from 2^sm_shift to 2^(32+sm_shift-1). The range tree,
+	 * however, can represent ranges from 2^0 to 2^63. Since the space
+	 * map only cares about allocatable blocks (minimum of 2^sm_shift) we
+	 * can safely ignore all ranges in the range tree smaller than 2^sm_shift.
+ */
+ for (int i = sm->sm_shift; i < RANGE_TREE_HISTOGRAM_SIZE; i++) {
+
+ /*
+ * Since the largest histogram bucket in the space map is
+ * 2^(32+sm_shift-1), we need to normalize the values in
+ * the range tree for any bucket larger than that size. For
+ * example given an sm_shift of 9, ranges larger than 2^40
+ * would get normalized as if they were 1TB ranges. Assume
+ * the range tree had a count of 5 in the 2^44 (16TB) bucket,
+ * the calculation below would normalize this to 5 * 2^4 (16).
+ */
+ ASSERT3U(i, >=, idx + sm->sm_shift);
+ sm->sm_phys->smp_histogram[idx] +=
+ rt->rt_histogram[i] << (i - idx - sm->sm_shift);
+
+ /*
+ * Increment the space map's index as long as we haven't
+ * reached the maximum bucket size. Accumulate all ranges
+ * larger than the max bucket size into the last bucket.
+ */
+ if (idx < SPACE_MAP_HISTOGRAM_SIZE - 1) {
+ ASSERT3U(idx + sm->sm_shift, ==, i);
+ idx++;
+ ASSERT3U(idx, <, SPACE_MAP_HISTOGRAM_SIZE);
+ }
+ }
+}
+
+static void
+space_map_write_intro_debug(space_map_t *sm, maptype_t maptype, dmu_tx_t *tx)
+{
+ dmu_buf_will_dirty(sm->sm_dbuf, tx);
+
+ uint64_t dentry = SM_PREFIX_ENCODE(SM_DEBUG_PREFIX) |
+ SM_DEBUG_ACTION_ENCODE(maptype) |
+ SM_DEBUG_SYNCPASS_ENCODE(spa_sync_pass(tx->tx_pool->dp_spa)) |
+ SM_DEBUG_TXG_ENCODE(dmu_tx_get_txg(tx));
+
+ dmu_write(sm->sm_os, space_map_object(sm), sm->sm_phys->smp_length,
+ sizeof (dentry), &dentry, tx);
+
+ sm->sm_phys->smp_length += sizeof (dentry);
+}
+
+/*
+ * Writes one or more entries given a segment.
+ *
+ * Note: The function may release the dbuf from the pointer initially
+ * passed to it, and return a different dbuf. Also, the space map's
+ * dbuf must be dirty for the changes in sm_phys to take effect.
+ */
+static void
+space_map_write_seg(space_map_t *sm, range_seg_t *rs, maptype_t maptype,
+ uint64_t vdev_id, uint8_t words, dmu_buf_t **dbp, void *tag, dmu_tx_t *tx)
+{
+ ASSERT3U(words, !=, 0);
+ ASSERT3U(words, <=, 2);
+
+ /* ensure the vdev_id can be represented by the space map */
+ ASSERT3U(vdev_id, <=, SM_NO_VDEVID);
+
+ /*
+ * if this is a single word entry, ensure that no vdev was
+ * specified.
+ */
+ IMPLY(words == 1, vdev_id == SM_NO_VDEVID);
+
+ dmu_buf_t *db = *dbp;
+ ASSERT3U(db->db_size, ==, sm->sm_blksz);
+
+ uint64_t *block_base = db->db_data;
+ uint64_t *block_end = block_base + (sm->sm_blksz / sizeof (uint64_t));
+ uint64_t *block_cursor = block_base +
+ (sm->sm_phys->smp_length - db->db_offset) / sizeof (uint64_t);
+
+ ASSERT3P(block_cursor, <=, block_end);
+
+ uint64_t size = (rs->rs_end - rs->rs_start) >> sm->sm_shift;
+ uint64_t start = (rs->rs_start - sm->sm_start) >> sm->sm_shift;
+ uint64_t run_max = (words == 2) ? SM2_RUN_MAX : SM_RUN_MAX;
+
+ ASSERT3U(rs->rs_start, >=, sm->sm_start);
+ ASSERT3U(rs->rs_start, <, sm->sm_start + sm->sm_size);
+ ASSERT3U(rs->rs_end - rs->rs_start, <=, sm->sm_size);
+ ASSERT3U(rs->rs_end, <=, sm->sm_start + sm->sm_size);
+
+ while (size != 0) {
+ ASSERT3P(block_cursor, <=, block_end);
+
+ /*
+		 * If we are at the end of this block, release it and continue
+		 * writing from the beginning of the next block.
+ */
+ if (block_cursor == block_end) {
+ dmu_buf_rele(db, tag);
+
+ uint64_t next_word_offset = sm->sm_phys->smp_length;
+ VERIFY0(dmu_buf_hold(sm->sm_os,
+ space_map_object(sm), next_word_offset,
+ tag, &db, DMU_READ_PREFETCH));
+ dmu_buf_will_dirty(db, tx);
+
+ /* update caller's dbuf */
+ *dbp = db;
+
+ ASSERT3U(db->db_size, ==, sm->sm_blksz);
+
+ block_base = db->db_data;
+ block_cursor = block_base;
+ block_end = block_base +
+ (db->db_size / sizeof (uint64_t));
+ }
+
+ /*
+ * If we are writing a two-word entry and we only have one
+ * word left on this block, just pad it with an empty debug
+ * entry and write the two-word entry in the next block.
+ */
+ uint64_t *next_entry = block_cursor + 1;
+ if (next_entry == block_end && words > 1) {
+ ASSERT3U(words, ==, 2);
+ *block_cursor = SM_PREFIX_ENCODE(SM_DEBUG_PREFIX) |
+ SM_DEBUG_ACTION_ENCODE(0) |
+ SM_DEBUG_SYNCPASS_ENCODE(0) |
+ SM_DEBUG_TXG_ENCODE(0);
+ block_cursor++;
+ sm->sm_phys->smp_length += sizeof (uint64_t);
+ ASSERT3P(block_cursor, ==, block_end);
+ continue;
+ }
+
+ uint64_t run_len = MIN(size, run_max);
+ switch (words) {
+ case 1:
+ *block_cursor = SM_OFFSET_ENCODE(start) |
+ SM_TYPE_ENCODE(maptype) |
+ SM_RUN_ENCODE(run_len);
+ block_cursor++;
+ break;
+ case 2:
+ /* write the first word of the entry */
+ *block_cursor = SM_PREFIX_ENCODE(SM2_PREFIX) |
+ SM2_RUN_ENCODE(run_len) |
+ SM2_VDEV_ENCODE(vdev_id);
+ block_cursor++;
+
+ /* move on to the second word of the entry */
+ ASSERT3P(block_cursor, <, block_end);
+ *block_cursor = SM2_TYPE_ENCODE(maptype) |
+ SM2_OFFSET_ENCODE(start);
+ block_cursor++;
+ break;
+ default:
+ panic("%d-word space map entries are not supported",
+ words);
+ break;
+ }
+ sm->sm_phys->smp_length += words * sizeof (uint64_t);
+
+ start += run_len;
+ size -= run_len;
+ }
+ ASSERT0(size);
+}
+
+/*
+ * Note: The space map's dbuf must be dirty for the changes in sm_phys to
+ * take effect.
+ */
+static void
+space_map_write_impl(space_map_t *sm, range_tree_t *rt, maptype_t maptype,
+ uint64_t vdev_id, dmu_tx_t *tx)
+{
+ spa_t *spa = tx->tx_pool->dp_spa;
+ dmu_buf_t *db;
+
+ space_map_write_intro_debug(sm, maptype, tx);
+
+#ifdef DEBUG
+ /*
+ * We do this right after we write the intro debug entry
+ * because the estimate does not take it into account.
+ */
+ uint64_t initial_objsize = sm->sm_phys->smp_length;
+ uint64_t estimated_growth =
+ space_map_estimate_optimal_size(sm, rt, SM_NO_VDEVID);
+ uint64_t estimated_final_objsize = initial_objsize + estimated_growth;
+#endif
+
+ /*
+ * Find the offset right after the last word in the space map
+ * and use that to get a hold of the last block, so we can
+ * start appending to it.
+ */
+ uint64_t next_word_offset = sm->sm_phys->smp_length;
+ VERIFY0(dmu_buf_hold(sm->sm_os, space_map_object(sm),
+ next_word_offset, FTAG, &db, DMU_READ_PREFETCH));
+ ASSERT3U(db->db_size, ==, sm->sm_blksz);
+
+ dmu_buf_will_dirty(db, tx);
+
+ avl_tree_t *t = &rt->rt_root;
+ for (range_seg_t *rs = avl_first(t); rs != NULL; rs = AVL_NEXT(t, rs)) {
+ uint64_t offset = (rs->rs_start - sm->sm_start) >> sm->sm_shift;
+ uint64_t length = (rs->rs_end - rs->rs_start) >> sm->sm_shift;
+ uint8_t words = 1;
+
+ /*
+ * We only write two-word entries when both of the following
+ * are true:
+ *
+ * [1] The feature is enabled.
+ * [2] The offset or run is too big for a single-word entry,
+ * or the vdev_id is set (meaning not equal to
+ * SM_NO_VDEVID).
+ *
+ * Note that for purposes of testing we've added the case that
+ * we write two-word entries occasionally when the feature is
+ * enabled and zfs_force_some_double_word_sm_entries has been
+ * set.
+ */
+ if (spa_feature_is_active(spa, SPA_FEATURE_SPACEMAP_V2) &&
+ (offset >= (1ULL << SM_OFFSET_BITS) ||
+ length > SM_RUN_MAX ||
+ vdev_id != SM_NO_VDEVID ||
+ (zfs_force_some_double_word_sm_entries &&
+ spa_get_random(100) == 0)))
+ words = 2;
+
+ space_map_write_seg(sm, rs, maptype, vdev_id, words,
+ &db, FTAG, tx);
+ }
+
+ dmu_buf_rele(db, FTAG);
+
+#ifdef DEBUG
+ /*
+ * We expect our estimation to be based on the worst case
+ * scenario [see comment in space_map_estimate_optimal_size()].
+	 * Therefore we expect the actual objsize to be equal to or less
+ * than whatever we estimated it to be.
+ */
+ ASSERT3U(estimated_final_objsize, >=, sm->sm_phys->smp_length);
+#endif
+}
+
+/*
+ * Note: This function manipulates the state of the given space map but
+ * does not hold any locks implicitly. Thus the caller is responsible
+ * for synchronizing writes to the space map.
+ */
+void
+space_map_write(space_map_t *sm, range_tree_t *rt, maptype_t maptype,
+ uint64_t vdev_id, dmu_tx_t *tx)
+{
+ objset_t *os = sm->sm_os;
+
+ ASSERT(dsl_pool_sync_context(dmu_objset_pool(os)));
+ VERIFY3U(space_map_object(sm), !=, 0);
+
+ dmu_buf_will_dirty(sm->sm_dbuf, tx);
+
+ /*
+ * This field is no longer necessary since the in-core space map
+	 * now contains the object number, but it is maintained for backwards
+ * compatibility.
+ */
+ sm->sm_phys->smp_object = sm->sm_object;
+
+ if (range_tree_is_empty(rt)) {
+ VERIFY3U(sm->sm_object, ==, sm->sm_phys->smp_object);
+ return;
+ }
+
+ if (maptype == SM_ALLOC)
+ sm->sm_phys->smp_alloc += range_tree_space(rt);
+ else
+ sm->sm_phys->smp_alloc -= range_tree_space(rt);
+
+ uint64_t nodes = avl_numnodes(&rt->rt_root);
+ uint64_t rt_space = range_tree_space(rt);
+
+ space_map_write_impl(sm, rt, maptype, vdev_id, tx);
+
+ /*
+ * Ensure that the space_map's accounting wasn't changed
+ * while we were in the middle of writing it out.
+ */
+ VERIFY3U(nodes, ==, avl_numnodes(&rt->rt_root));
+ VERIFY3U(range_tree_space(rt), ==, rt_space);
+}
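+
+/*
+ * A minimal call sketch for space_map_write() (the sync-context
+ * plumbing around it is hypothetical). This appends every segment of
+ * the range tree 'frees' to the space map as FREE entries and then
+ * empties the tree; per the note above, the caller serializes
+ * writers:
+ *
+ *    space_map_write(sm, frees, SM_FREE, SM_NO_VDEVID, tx);
+ *    range_tree_vacate(frees, NULL, NULL);
+ */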
+
+static int
+space_map_open_impl(space_map_t *sm)
+{
+ int error;
+ u_longlong_t blocks;
+
+ error = dmu_bonus_hold(sm->sm_os, sm->sm_object, sm, &sm->sm_dbuf);
+ if (error)
+ return (error);
+
+ dmu_object_size_from_db(sm->sm_dbuf, &sm->sm_blksz, &blocks);
+ sm->sm_phys = sm->sm_dbuf->db_data;
+ return (0);
+}
+
+int
+space_map_open(space_map_t **smp, objset_t *os, uint64_t object,
+ uint64_t start, uint64_t size, uint8_t shift)
+{
+ space_map_t *sm;
+ int error;
+
+ ASSERT(*smp == NULL);
+ ASSERT(os != NULL);
+ ASSERT(object != 0);
+
+ sm = kmem_zalloc(sizeof (space_map_t), KM_SLEEP);
+
+ sm->sm_start = start;
+ sm->sm_size = size;
+ sm->sm_shift = shift;
+ sm->sm_os = os;
+ sm->sm_object = object;
+
+ error = space_map_open_impl(sm);
+ if (error != 0) {
+ space_map_close(sm);
+ return (error);
+ }
+ *smp = sm;
+
+ return (0);
+}
+
+void
+space_map_close(space_map_t *sm)
+{
+ if (sm == NULL)
+ return;
+
+ if (sm->sm_dbuf != NULL)
+ dmu_buf_rele(sm->sm_dbuf, sm);
+ sm->sm_dbuf = NULL;
+ sm->sm_phys = NULL;
+
+ kmem_free(sm, sizeof (*sm));
+}
+
+void
+space_map_truncate(space_map_t *sm, int blocksize, dmu_tx_t *tx)
+{
+ objset_t *os = sm->sm_os;
+ spa_t *spa = dmu_objset_spa(os);
+ dmu_object_info_t doi;
+
+ ASSERT(dsl_pool_sync_context(dmu_objset_pool(os)));
+ ASSERT(dmu_tx_is_syncing(tx));
+ VERIFY3U(dmu_tx_get_txg(tx), <=, spa_final_dirty_txg(spa));
+
+ dmu_object_info_from_db(sm->sm_dbuf, &doi);
+
+ /*
+ * If the space map has the wrong bonus size (because
+ * SPA_FEATURE_SPACEMAP_HISTOGRAM has recently been enabled), or
+ * the wrong block size (because space_map_blksz has changed),
+ * free and re-allocate its object with the updated sizes.
+ *
+ * Otherwise, just truncate the current object.
+ */
+ if ((spa_feature_is_enabled(spa, SPA_FEATURE_SPACEMAP_HISTOGRAM) &&
+ doi.doi_bonus_size != sizeof (space_map_phys_t)) ||
+ doi.doi_data_block_size != blocksize ||
+ doi.doi_metadata_block_size != 1 << space_map_ibs) {
+ zfs_dbgmsg("txg %llu, spa %s, sm %p, reallocating "
+ "object[%llu]: old bonus %u, old blocksz %u",
+ dmu_tx_get_txg(tx), spa_name(spa), sm, sm->sm_object,
+ doi.doi_bonus_size, doi.doi_data_block_size);
+
+ space_map_free(sm, tx);
+ dmu_buf_rele(sm->sm_dbuf, sm);
+
+ sm->sm_object = space_map_alloc(sm->sm_os, blocksize, tx);
+ VERIFY0(space_map_open_impl(sm));
+ } else {
+ VERIFY0(dmu_free_range(os, space_map_object(sm), 0, -1ULL, tx));
+
+ /*
+ * If the spacemap is reallocated, its histogram
+ * will be reset. Do the same in the common case so that
+ * bugs related to the uncommon case do not go unnoticed.
+ */
+ bzero(sm->sm_phys->smp_histogram,
+ sizeof (sm->sm_phys->smp_histogram));
+ }
+
+ dmu_buf_will_dirty(sm->sm_dbuf, tx);
+ sm->sm_phys->smp_length = 0;
+ sm->sm_phys->smp_alloc = 0;
+}
+
+uint64_t
+space_map_alloc(objset_t *os, int blocksize, dmu_tx_t *tx)
+{
+ spa_t *spa = dmu_objset_spa(os);
+ uint64_t object;
+ int bonuslen;
+
+ if (spa_feature_is_enabled(spa, SPA_FEATURE_SPACEMAP_HISTOGRAM)) {
+ spa_feature_incr(spa, SPA_FEATURE_SPACEMAP_HISTOGRAM, tx);
+ bonuslen = sizeof (space_map_phys_t);
+ ASSERT3U(bonuslen, <=, dmu_bonus_max());
+ } else {
+ bonuslen = SPACE_MAP_SIZE_V0;
+ }
+
+ object = dmu_object_alloc_ibs(os, DMU_OT_SPACE_MAP, blocksize,
+ space_map_ibs, DMU_OT_SPACE_MAP_HEADER, bonuslen, tx);
+
+ return (object);
+}
+
+void
+space_map_free_obj(objset_t *os, uint64_t smobj, dmu_tx_t *tx)
+{
+ spa_t *spa = dmu_objset_spa(os);
+ if (spa_feature_is_enabled(spa, SPA_FEATURE_SPACEMAP_HISTOGRAM)) {
+ dmu_object_info_t doi;
+
+ VERIFY0(dmu_object_info(os, smobj, &doi));
+ if (doi.doi_bonus_size != SPACE_MAP_SIZE_V0) {
+ spa_feature_decr(spa,
+ SPA_FEATURE_SPACEMAP_HISTOGRAM, tx);
+ }
+ }
+
+ VERIFY0(dmu_object_free(os, smobj, tx));
+}
+
+void
+space_map_free(space_map_t *sm, dmu_tx_t *tx)
+{
+ if (sm == NULL)
+ return;
+
+ space_map_free_obj(sm->sm_os, space_map_object(sm), tx);
+ sm->sm_object = 0;
+}
+
+/*
+ * Given a range tree, make a worst-case estimate of how much space
+ * the tree's segments would take if they were written to the given
+ * space map.
+ */
+uint64_t
+space_map_estimate_optimal_size(space_map_t *sm, range_tree_t *rt,
+ uint64_t vdev_id)
+{
+ spa_t *spa = dmu_objset_spa(sm->sm_os);
+ uint64_t shift = sm->sm_shift;
+ uint64_t *histogram = rt->rt_histogram;
+ uint64_t entries_for_seg = 0;
+
+ /*
+ * In order to get a quick estimate of the optimal size that this
+ * range tree would have on-disk as a space map, we iterate through
+ * its histogram buckets instead of iterating through its nodes.
+ *
+ * Note that this is a highest-bound/worst-case estimate for the
+ * following reasons:
+ *
+ * 1] We assume that we always add a debug padding for each block
+ * we write and we also assume that we start at the last word
+ * of a block attempting to write a two-word entry.
+	 * 2] Round-up errors due to the way segments are distributed
+ * in the buckets of the range tree's histogram.
+ * 3] The activation of zfs_force_some_double_word_sm_entries
+ * (tunable) when testing.
+ *
+ * = Math and Rounding Errors =
+ *
+ * rt_histogram[i] bucket of a range tree represents the number
+ * of entries in [2^i, (2^(i+1))-1] of that range_tree. Given
+ * that, we want to divide the buckets into groups: Buckets that
+ * can be represented using a single-word entry, ones that can
+ * be represented with a double-word entry, and ones that can
+ * only be represented with multiple two-word entries.
+ *
+ * [Note that if the new encoding feature is not enabled there
+ * are only two groups: single-word entry buckets and multiple
+	 * single-word entry buckets. The information below assumes
+	 * two-word entries are enabled, but it can easily be applied
+	 * when the feature is not enabled.]
+ *
+ * To find the highest bucket that can be represented with a
+ * single-word entry we look at the maximum run that such entry
+ * can have, which is 2^(SM_RUN_BITS + sm_shift) [remember that
+ * the run of a space map entry is shifted by sm_shift, thus we
+ * add it to the exponent]. This way, excluding the value of the
+ * maximum run that can be represented by a single-word entry,
+ * all runs that are smaller exist in buckets 0 to
+ * SM_RUN_BITS + shift - 1.
+ *
+ * To find the highest bucket that can be represented with a
+ * double-word entry, we follow the same approach. Finally, any
+ * bucket higher than that are represented with multiple two-word
+ * entries. To be more specific, if the highest bucket whose
+ * segments can be represented with a single two-word entry is X,
+ * then bucket X+1 will need 2 two-word entries for each of its
+ * segments, X+2 will need 4, X+3 will need 8, ...etc.
+ *
+ * With all of the above we make our estimation based on bucket
+ * groups. There is a rounding error though. As we mentioned in
+ * the example with the one-word entry, the maximum run that can
+ * be represented in a one-word entry 2^(SM_RUN_BITS + shift) is
+ * not part of bucket SM_RUN_BITS + shift - 1. Thus, segments of
+ * that length fall into the next bucket (and bucket group) where
+ * we start counting two-word entries and this is one more reason
+ * why the estimated size may end up being bigger than the actual
+ * size written.
+ */
+ uint64_t size = 0;
+ uint64_t idx = 0;
+
+ if (!spa_feature_is_enabled(spa, SPA_FEATURE_SPACEMAP_V2) ||
+ (vdev_id == SM_NO_VDEVID && sm->sm_size < SM_OFFSET_MAX)) {
+
+ /*
+		 * If we are trying to force some double-word entries, just
+		 * assume the worst case of every single-word entry being
+		 * written as a double-word entry.
+ */
+ uint64_t entry_size =
+ (spa_feature_is_enabled(spa, SPA_FEATURE_SPACEMAP_V2) &&
+ zfs_force_some_double_word_sm_entries) ?
+ (2 * sizeof (uint64_t)) : sizeof (uint64_t);
+
+ uint64_t single_entry_max_bucket = SM_RUN_BITS + shift - 1;
+ for (; idx <= single_entry_max_bucket; idx++)
+ size += histogram[idx] * entry_size;
+
+ if (!spa_feature_is_enabled(spa, SPA_FEATURE_SPACEMAP_V2)) {
+ for (; idx < RANGE_TREE_HISTOGRAM_SIZE; idx++) {
+ ASSERT3U(idx, >=, single_entry_max_bucket);
+ entries_for_seg =
+ 1ULL << (idx - single_entry_max_bucket);
+ size += histogram[idx] *
+ entries_for_seg * entry_size;
+ }
+ return (size);
+ }
+ }
+
+ ASSERT(spa_feature_is_enabled(spa, SPA_FEATURE_SPACEMAP_V2));
+
+ uint64_t double_entry_max_bucket = SM2_RUN_BITS + shift - 1;
+ for (; idx <= double_entry_max_bucket; idx++)
+ size += histogram[idx] * 2 * sizeof (uint64_t);
+
+ for (; idx < RANGE_TREE_HISTOGRAM_SIZE; idx++) {
+ ASSERT3U(idx, >=, double_entry_max_bucket);
+ entries_for_seg = 1ULL << (idx - double_entry_max_bucket);
+ size += histogram[idx] *
+ entries_for_seg * 2 * sizeof (uint64_t);
+ }
+
+ /*
+ * Assume the worst case where we start with the padding at the end
+ * of the current block and we add an extra padding entry at the end
+ * of all subsequent blocks.
+ */
+ size += ((size / sm->sm_blksz) + 1) * sizeof (uint64_t);
+
+ return (size);
+}
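+
+/*
+ * Worked example for the bucket boundaries above (assuming the
+ * current encodings, SM_RUN_BITS = 15 and SM2_RUN_BITS = 36). With
+ * sm_shift = 9:
+ *
+ *    single_entry_max_bucket = 15 + 9 - 1 = 23, so segments shorter
+ *    than 2^24 bytes (16MB) cost one word each;
+ *    double_entry_max_bucket = 36 + 9 - 1 = 44, so segments shorter
+ *    than 2^45 bytes (32TB) cost two words each;
+ *    a segment in bucket 46 (64TB-128TB) needs 2^(46-44) = 4
+ *    two-word entries, i.e. 8 words.
+ */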
+
+uint64_t
+space_map_object(space_map_t *sm)
+{
+ return (sm != NULL ? sm->sm_object : 0);
+}
+
+int64_t
+space_map_allocated(space_map_t *sm)
+{
+ return (sm != NULL ? sm->sm_phys->smp_alloc : 0);
+}
+
+uint64_t
+space_map_length(space_map_t *sm)
+{
+ return (sm != NULL ? sm->sm_phys->smp_length : 0);
+}
diff --git a/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/space_reftree.c b/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/space_reftree.c
new file mode 100644
index 000000000000..aa289ba1061d
--- /dev/null
+++ b/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/space_reftree.c
@@ -0,0 +1,149 @@
+/*
+ * CDDL HEADER START
+ *
+ * The contents of this file are subject to the terms of the
+ * Common Development and Distribution License (the "License").
+ * You may not use this file except in compliance with the License.
+ *
+ * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
+ * or http://www.opensolaris.org/os/licensing.
+ * See the License for the specific language governing permissions
+ * and limitations under the License.
+ *
+ * When distributing Covered Code, include this CDDL HEADER in each
+ * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
+ * If applicable, add the following below this CDDL HEADER, with the
+ * fields enclosed by brackets "[]" replaced with your own identifying
+ * information: Portions Copyright [yyyy] [name of copyright owner]
+ *
+ * CDDL HEADER END
+ */
+/*
+ * Copyright 2009 Sun Microsystems, Inc. All rights reserved.
+ * Use is subject to license terms.
+ */
+/*
+ * Copyright (c) 2013, 2015 by Delphix. All rights reserved.
+ */
+
+#include <sys/zfs_context.h>
+#include <sys/range_tree.h>
+#include <sys/space_reftree.h>
+
+/*
+ * Space reference trees.
+ *
+ * A range tree is a collection of integers. Every integer is either
+ * in the tree, or it's not. A space reference tree generalizes
+ * the idea: it allows its members to have arbitrary reference counts,
+ * as opposed to the implicit reference count of 0 or 1 in a range tree.
+ * This representation comes in handy when computing the union or
+ * intersection of multiple space maps. For example, the union of
+ * N range trees is the subset of the reference tree with refcnt >= 1.
+ * The intersection of N range trees is the subset with refcnt >= N.
+ *
+ * [It's very much like a Fourier transform. Unions and intersections
+ * are hard to perform in the 'range tree domain', so we convert the trees
+ * into the 'reference count domain', where it's trivial, then invert.]
+ *
+ * vdev_dtl_reassess() uses computations of this form to determine
+ * DTL_MISSING and DTL_OUTAGE for interior vdevs -- e.g. a RAID-Z vdev
+ * has an outage wherever refcnt >= vdev_nparity + 1, and a mirror vdev
+ * has an outage wherever refcnt >= vdev_children.
+ */
+static int
+space_reftree_compare(const void *x1, const void *x2)
+{
+ const space_ref_t *sr1 = (const space_ref_t *)x1;
+ const space_ref_t *sr2 = (const space_ref_t *)x2;
+
+ int cmp = AVL_CMP(sr1->sr_offset, sr2->sr_offset);
+ if (likely(cmp))
+ return (cmp);
+
+ return (AVL_PCMP(sr1, sr2));
+}
+
+void
+space_reftree_create(avl_tree_t *t)
+{
+ avl_create(t, space_reftree_compare,
+ sizeof (space_ref_t), offsetof(space_ref_t, sr_node));
+}
+
+void
+space_reftree_destroy(avl_tree_t *t)
+{
+ space_ref_t *sr;
+ void *cookie = NULL;
+
+ while ((sr = avl_destroy_nodes(t, &cookie)) != NULL)
+ kmem_free(sr, sizeof (*sr));
+
+ avl_destroy(t);
+}
+
+static void
+space_reftree_add_node(avl_tree_t *t, uint64_t offset, int64_t refcnt)
+{
+ space_ref_t *sr;
+
+ sr = kmem_alloc(sizeof (*sr), KM_SLEEP);
+ sr->sr_offset = offset;
+ sr->sr_refcnt = refcnt;
+
+ avl_add(t, sr);
+}
+
+void
+space_reftree_add_seg(avl_tree_t *t, uint64_t start, uint64_t end,
+ int64_t refcnt)
+{
+ space_reftree_add_node(t, start, refcnt);
+ space_reftree_add_node(t, end, -refcnt);
+}
+
+/*
+ * Convert (or add) a range tree into a reference tree.
+ */
+void
+space_reftree_add_map(avl_tree_t *t, range_tree_t *rt, int64_t refcnt)
+{
+ range_seg_t *rs;
+
+ for (rs = avl_first(&rt->rt_root); rs; rs = AVL_NEXT(&rt->rt_root, rs))
+ space_reftree_add_seg(t, rs->rs_start, rs->rs_end, refcnt);
+}
+
+/*
+ * Convert a reference tree into a range tree. The range tree will contain
+ * all members of the reference tree for which refcnt >= minref.
+ */
+void
+space_reftree_generate_map(avl_tree_t *t, range_tree_t *rt, int64_t minref)
+{
+ uint64_t start = -1ULL;
+ int64_t refcnt = 0;
+ space_ref_t *sr;
+
+ range_tree_vacate(rt, NULL, NULL);
+
+ for (sr = avl_first(t); sr != NULL; sr = AVL_NEXT(t, sr)) {
+ refcnt += sr->sr_refcnt;
+ if (refcnt >= minref) {
+ if (start == -1ULL) {
+ start = sr->sr_offset;
+ }
+ } else {
+ if (start != -1ULL) {
+ uint64_t end = sr->sr_offset;
+ ASSERT(start <= end);
+ if (end > start)
+ range_tree_add(rt, start, end - start);
+ start = -1ULL;
+ }
+ }
+ }
+ ASSERT(refcnt == 0);
+ ASSERT(start == -1ULL);
+}
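+
+/*
+ * A minimal sketch (range_tree_intersect is hypothetical) of the
+ * union/intersection computation described above: convert both range
+ * trees into the reference-count domain, then keep only the members
+ * with refcnt >= 2. 'result' is an empty range tree created by the
+ * caller.
+ */
+static void
+range_tree_intersect(range_tree_t *rt1, range_tree_t *rt2,
+ range_tree_t *result)
+{
+ avl_tree_t reftree;
+
+ space_reftree_create(&reftree);
+ space_reftree_add_map(&reftree, rt1, 1);
+ space_reftree_add_map(&reftree, rt2, 1);
+
+ /* Members with refcnt >= 2 lie in both input trees. */
+ space_reftree_generate_map(&reftree, result, 2);
+ space_reftree_destroy(&reftree);
+}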
diff --git a/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/abd.h b/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/abd.h
new file mode 100644
index 000000000000..9689f931fb29
--- /dev/null
+++ b/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/abd.h
@@ -0,0 +1,154 @@
+/*
+ * This file and its contents are supplied under the terms of the
+ * Common Development and Distribution License ("CDDL"), version 1.0.
+ * You may only use this file in accordance with the terms of version
+ * 1.0 of the CDDL.
+ *
+ * A full copy of the text of the CDDL should have accompanied this
+ * source. A copy of the CDDL is also available via the Internet at
+ * http://www.illumos.org/license/CDDL.
+ */
+
+/*
+ * Copyright (c) 2014 by Chunwei Chen. All rights reserved.
+ * Copyright (c) 2016 by Delphix. All rights reserved.
+ */
+
+#ifndef _ABD_H
+#define _ABD_H
+
+#include <sys/isa_defs.h>
+#ifdef illumos
+#include <sys/int_types.h>
+#else
+#include <sys/stdint.h>
+#endif
+#include <sys/debug.h>
+#include <sys/refcount.h>
+#ifdef _KERNEL
+#include <sys/uio.h>
+#endif
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+typedef enum abd_flags {
+ ABD_FLAG_LINEAR = 1 << 0, /* is buffer linear (or scattered)? */
+ ABD_FLAG_OWNER = 1 << 1, /* does it own its data buffers? */
+ ABD_FLAG_META = 1 << 2 /* does this represent FS metadata? */
+} abd_flags_t;
+
+typedef struct abd {
+ abd_flags_t abd_flags;
+ uint_t abd_size; /* excludes scattered abd_offset */
+ struct abd *abd_parent;
+ zfs_refcount_t abd_children;
+ union {
+ struct abd_scatter {
+ uint_t abd_offset;
+ uint_t abd_chunk_size;
+ void *abd_chunks[];
+ } abd_scatter;
+ struct abd_linear {
+ void *abd_buf;
+ } abd_linear;
+ } abd_u;
+} abd_t;
+
+typedef int abd_iter_func_t(void *, size_t, void *);
+typedef int abd_iter_func2_t(void *, void *, size_t, void *);
+
+extern boolean_t zfs_abd_scatter_enabled;
+
+inline boolean_t
+abd_is_linear(abd_t *abd)
+{
+ return ((abd->abd_flags & ABD_FLAG_LINEAR) != 0 ? B_TRUE : B_FALSE);
+}
+
+/*
+ * Allocations and deallocations
+ */
+
+abd_t *abd_alloc(size_t, boolean_t);
+abd_t *abd_alloc_linear(size_t, boolean_t);
+abd_t *abd_alloc_for_io(size_t, boolean_t);
+abd_t *abd_alloc_sametype(abd_t *, size_t);
+void abd_free(abd_t *);
+abd_t *abd_get_offset(abd_t *, size_t);
+abd_t *abd_get_from_buf(void *, size_t);
+void abd_put(abd_t *);
+
+/*
+ * Conversion to and from a normal buffer
+ */
+
+void *abd_to_buf(abd_t *);
+void *abd_borrow_buf(abd_t *, size_t);
+void *abd_borrow_buf_copy(abd_t *, size_t);
+void abd_return_buf(abd_t *, void *, size_t);
+void abd_return_buf_copy(abd_t *, void *, size_t);
+void abd_take_ownership_of_buf(abd_t *, boolean_t);
+void abd_release_ownership_of_buf(abd_t *);
+
+/*
+ * ABD operations
+ */
+
+int abd_iterate_func(abd_t *, size_t, size_t, abd_iter_func_t *, void *);
+int abd_iterate_func2(abd_t *, abd_t *, size_t, size_t, size_t,
+ abd_iter_func2_t *, void *);
+void abd_copy_off(abd_t *, abd_t *, size_t, size_t, size_t);
+void abd_copy_from_buf_off(abd_t *, const void *, size_t, size_t);
+void abd_copy_to_buf_off(void *, abd_t *, size_t, size_t);
+int abd_cmp(abd_t *, abd_t *, size_t);
+int abd_cmp_buf_off(abd_t *, const void *, size_t, size_t);
+void abd_zero_off(abd_t *, size_t, size_t);
+
+/*
+ * Wrappers for calls with offsets of 0
+ */
+
+inline void
+abd_copy(abd_t *dabd, abd_t *sabd, size_t size)
+{
+ abd_copy_off(dabd, sabd, 0, 0, size);
+}
+
+inline void
+abd_copy_from_buf(abd_t *abd, const void *buf, size_t size)
+{
+ abd_copy_from_buf_off(abd, buf, 0, size);
+}
+
+inline void
+abd_copy_to_buf(void *buf, abd_t *abd, size_t size)
+{
+ abd_copy_to_buf_off(buf, abd, 0, size);
+}
+
+inline int
+abd_cmp_buf(abd_t *abd, const void *buf, size_t size)
+{
+ return (abd_cmp_buf_off(abd, buf, 0, size));
+}
+
+inline void
+abd_zero(abd_t *abd, size_t size)
+{
+ abd_zero_off(abd, 0, size);
+}
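+
+/*
+ * A minimal usage sketch (hypothetical caller): round-trip a linear
+ * buffer through an ABD. abd_alloc() picks a scattered or linear
+ * representation (based on zfs_abd_scatter_enabled and the size);
+ * B_FALSE marks the buffer as data rather than metadata.
+ *
+ *    char src[512] = { 0 };
+ *    abd_t *abd = abd_alloc(sizeof (src), B_FALSE);
+ *
+ *    abd_copy_from_buf(abd, src, sizeof (src));
+ *    ASSERT0(abd_cmp_buf(abd, src, sizeof (src)));
+ *    abd_free(abd);
+ */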
+
+/*
+ * Module lifecycle
+ */
+
+void abd_init(void);
+void abd_fini(void);
+
+#ifdef __cplusplus
+}
+#endif
+
+#endif /* _ABD_H */
diff --git a/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/aggsum.h b/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/aggsum.h
new file mode 100644
index 000000000000..2ae0835e55a2
--- /dev/null
+++ b/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/aggsum.h
@@ -0,0 +1,58 @@
+/*
+ * CDDL HEADER START
+ *
+ * This file and its contents are supplied under the terms of the
+ * Common Development and Distribution License ("CDDL"), version 1.0.
+ * You may only use this file in accordance with the terms of version
+ * 1.0 of the CDDL.
+ *
+ * A full copy of the text of the CDDL should have accompanied this
+ * source. A copy of the CDDL is also available via the Internet at
+ * http://www.illumos.org/license/CDDL.
+ *
+ * CDDL HEADER END
+ */
+/*
+ * Copyright (c) 2017 by Delphix. All rights reserved.
+ */
+
+#ifndef _SYS_AGGSUM_H
+#define _SYS_AGGSUM_H
+
+#include <sys/zfs_context.h>
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+typedef struct aggsum_bucket {
+ kmutex_t asc_lock;
+ int64_t asc_delta;
+ uint64_t asc_borrowed;
+ uint64_t asc_pad[2]; /* pad out to cache line (64 bytes) */
+} aggsum_bucket_t __aligned(CACHE_LINE_SIZE);
+
+/*
+ * Fan the aggregate sum out over multiple buckets to reduce lock
+ * contention between concurrent updaters.
+ */
+typedef struct aggsum {
+ kmutex_t as_lock;
+ int64_t as_lower_bound;
+ int64_t as_upper_bound;
+ uint_t as_numbuckets;
+ aggsum_bucket_t *as_buckets;
+} aggsum_t;
+
+void aggsum_init(aggsum_t *, uint64_t);
+void aggsum_fini(aggsum_t *);
+int64_t aggsum_lower_bound(aggsum_t *);
+int64_t aggsum_upper_bound(aggsum_t *);
+int aggsum_compare(aggsum_t *, uint64_t);
+uint64_t aggsum_value(aggsum_t *);
+void aggsum_add(aggsum_t *, int64_t);
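+
+/*
+ * A minimal usage sketch (hypothetical caller): a counter updated
+ * concurrently from many CPUs. aggsum_add() typically takes only a
+ * per-bucket lock, while aggsum_value() is a precise but expensive
+ * read; callers that can tolerate slack should prefer
+ * aggsum_lower_bound(), aggsum_upper_bound() or aggsum_compare().
+ *
+ *    aggsum_t as;
+ *
+ *    aggsum_init(&as, 0);
+ *    aggsum_add(&as, 123);
+ *    aggsum_add(&as, -23);
+ *    ASSERT3U(aggsum_value(&as), ==, 100);
+ *    aggsum_fini(&as);
+ */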
+
+#ifdef __cplusplus
+}
+#endif
+
+#endif /* _SYS_AGGSUM_H */
diff --git a/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/arc.h b/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/arc.h
new file mode 100644
index 000000000000..95513f0cba21
--- /dev/null
+++ b/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/arc.h
@@ -0,0 +1,290 @@
+/*
+ * CDDL HEADER START
+ *
+ * The contents of this file are subject to the terms of the
+ * Common Development and Distribution License (the "License").
+ * You may not use this file except in compliance with the License.
+ *
+ * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
+ * or http://www.opensolaris.org/os/licensing.
+ * See the License for the specific language governing permissions
+ * and limitations under the License.
+ *
+ * When distributing Covered Code, include this CDDL HEADER in each
+ * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
+ * If applicable, add the following below this CDDL HEADER, with the
+ * fields enclosed by brackets "[]" replaced with your own identifying
+ * information: Portions Copyright [yyyy] [name of copyright owner]
+ *
+ * CDDL HEADER END
+ */
+/*
+ * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved.
+ * Copyright (c) 2012, 2017 by Delphix. All rights reserved.
+ * Copyright (c) 2013 by Saso Kiselkov. All rights reserved.
+ */
+
+#ifndef _SYS_ARC_H
+#define _SYS_ARC_H
+
+#include <sys/zfs_context.h>
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+#include <sys/zio.h>
+#include <sys/dmu.h>
+#include <sys/spa.h>
+
+/*
+ * Used by arc_flush() to inform arc_evict_state() that it should evict
+ * all available buffers from the arc state being passed in.
+ */
+#define ARC_EVICT_ALL -1ULL
+
+#define HDR_SET_LSIZE(hdr, x) do { \
+ ASSERT(IS_P2ALIGNED(x, 1U << SPA_MINBLOCKSHIFT)); \
+ (hdr)->b_lsize = ((x) >> SPA_MINBLOCKSHIFT); \
+_NOTE(CONSTCOND) } while (0)
+
+#define HDR_SET_PSIZE(hdr, x) do { \
+ ASSERT(IS_P2ALIGNED((x), 1U << SPA_MINBLOCKSHIFT)); \
+ (hdr)->b_psize = ((x) >> SPA_MINBLOCKSHIFT); \
+_NOTE(CONSTCOND) } while (0)
+
+#define HDR_GET_LSIZE(hdr) ((hdr)->b_lsize << SPA_MINBLOCKSHIFT)
+#define HDR_GET_PSIZE(hdr) ((hdr)->b_psize << SPA_MINBLOCKSHIFT)
+
+typedef struct arc_buf_hdr arc_buf_hdr_t;
+typedef struct arc_buf arc_buf_t;
+typedef struct arc_prune arc_prune_t;
+
+/*
+ * Because the ARC can store encrypted data, errors (not due to bugs) may arise
+ * while transforming data into its desired format - specifically, when
+ * decrypting, the key may not be present, or the HMAC may not be correct,
+ * which signifies deliberate tampering with the on-disk state
+ * (assuming that the checksum was correct). If any error occurs, the "buf"
+ * parameter will be NULL.
+ */
+typedef void arc_read_done_func_t(zio_t *zio, const zbookmark_phys_t *zb,
+ const blkptr_t *bp, arc_buf_t *buf, void *priv);
+typedef void arc_write_done_func_t(zio_t *zio, arc_buf_t *buf, void *priv);
+typedef void arc_prune_func_t(int64_t bytes, void *priv);
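+
+/*
+ * A minimal sketch of an arc_read_done_func_t (my_read_done is
+ * hypothetical). Per the note above, 'buf' is NULL when the read or a
+ * transform on it failed:
+ *
+ *    static void
+ *    my_read_done(zio_t *zio, const zbookmark_phys_t *zb,
+ *        const blkptr_t *bp, arc_buf_t *buf, void *priv)
+ *    {
+ *        if (buf == NULL)
+ *            return;
+ *        ... consume buf->b_data, then arc_buf_destroy(buf, priv) ...
+ *    }
+ */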
+
+/* Shared module parameters */
+extern uint64_t zfs_arc_average_blocksize;
+
+/* generic arc_done_func_t's which you can use */
+arc_read_done_func_t arc_bcopy_func;
+arc_read_done_func_t arc_getbuf_func;
+
+/* generic arc_prune_func_t wrapper for callbacks */
+struct arc_prune {
+ arc_prune_func_t *p_pfunc;
+ void *p_private;
+ uint64_t p_adjust;
+ list_node_t p_node;
+ zfs_refcount_t p_refcnt;
+};
+
+typedef enum arc_strategy {
+ ARC_STRATEGY_META_ONLY = 0, /* Evict only meta data buffers */
+ ARC_STRATEGY_META_BALANCED = 1, /* Evict data buffers if needed */
+} arc_strategy_t;
+
+typedef enum arc_flags
+{
+ /*
+ * Public flags that can be passed into the ARC by external consumers.
+ */
+ ARC_FLAG_WAIT = 1 << 0, /* perform sync I/O */
+ ARC_FLAG_NOWAIT = 1 << 1, /* perform async I/O */
+ ARC_FLAG_PREFETCH = 1 << 2, /* I/O is a prefetch */
+ ARC_FLAG_CACHED = 1 << 3, /* I/O was in cache */
+ ARC_FLAG_L2CACHE = 1 << 4, /* cache in L2ARC */
+ ARC_FLAG_PREDICTIVE_PREFETCH = 1 << 5, /* I/O from zfetch */
+ ARC_FLAG_PRESCIENT_PREFETCH = 1 << 6, /* long min lifespan */
+
+ /*
+	 * Private ARC flags. These are ARC-internal flags that
+	 * will show up in b_flags in the arc_buf_hdr_t. These flags should
+ * only be set by ARC code.
+ */
+ ARC_FLAG_IN_HASH_TABLE = 1 << 7, /* buffer is hashed */
+ ARC_FLAG_IO_IN_PROGRESS = 1 << 8, /* I/O in progress */
+ ARC_FLAG_IO_ERROR = 1 << 9, /* I/O failed for buf */
+ ARC_FLAG_INDIRECT = 1 << 10, /* indirect block */
+ /* Indicates that block was read with ASYNC priority. */
+ ARC_FLAG_PRIO_ASYNC_READ = 1 << 11,
+ ARC_FLAG_L2_WRITING = 1 << 12, /* write in progress */
+ ARC_FLAG_L2_EVICTED = 1 << 13, /* evicted during I/O */
+ ARC_FLAG_L2_WRITE_HEAD = 1 << 14, /* head of write list */
+ /* indicates that the buffer contains metadata (otherwise, data) */
+ ARC_FLAG_BUFC_METADATA = 1 << 15,
+
+ /* Flags specifying whether optional hdr struct fields are defined */
+ ARC_FLAG_HAS_L1HDR = 1 << 16,
+ ARC_FLAG_HAS_L2HDR = 1 << 17,
+
+ /*
+ * Indicates the arc_buf_hdr_t's b_pdata matches the on-disk data.
+ * This allows the l2arc to use the blkptr's checksum to verify
+ * the data without having to store the checksum in the hdr.
+ */
+ ARC_FLAG_COMPRESSED_ARC = 1 << 18,
+ ARC_FLAG_SHARED_DATA = 1 << 19,
+
+ /*
+ * The arc buffer's compression mode is stored in the top 7 bits of the
+ * flags field, so these dummy flags are included to let MDB
+ * interpret the enum properly.
+ */
+ ARC_FLAG_COMPRESS_0 = 1 << 24,
+ ARC_FLAG_COMPRESS_1 = 1 << 25,
+ ARC_FLAG_COMPRESS_2 = 1 << 26,
+ ARC_FLAG_COMPRESS_3 = 1 << 27,
+ ARC_FLAG_COMPRESS_4 = 1 << 28,
+ ARC_FLAG_COMPRESS_5 = 1 << 29,
+ ARC_FLAG_COMPRESS_6 = 1 << 30
+
+} arc_flags_t;
+
+typedef enum arc_buf_flags {
+ ARC_BUF_FLAG_SHARED = 1 << 0,
+ ARC_BUF_FLAG_COMPRESSED = 1 << 1
+} arc_buf_flags_t;
+
+struct arc_buf {
+ arc_buf_hdr_t *b_hdr;
+ arc_buf_t *b_next;
+ kmutex_t b_evict_lock;
+ void *b_data;
+ arc_buf_flags_t b_flags;
+};
+
+typedef enum arc_buf_contents {
+ ARC_BUFC_INVALID, /* invalid type */
+ ARC_BUFC_DATA, /* buffer contains data */
+ ARC_BUFC_METADATA, /* buffer contains metadata */
+ ARC_BUFC_NUMTYPES
+} arc_buf_contents_t;
+
+/*
+ * The following breakdowns of arc_size exist for kstat only.
+ */
+typedef enum arc_space_type {
+ ARC_SPACE_DATA,
+ ARC_SPACE_META,
+ ARC_SPACE_HDRS,
+ ARC_SPACE_L2HDRS,
+ ARC_SPACE_DBUF,
+ ARC_SPACE_DNODE,
+ ARC_SPACE_BONUS,
+ ARC_SPACE_NUMTYPES
+} arc_space_type_t;
+
+typedef enum arc_state_type {
+ ARC_STATE_ANON,
+ ARC_STATE_MRU,
+ ARC_STATE_MRU_GHOST,
+ ARC_STATE_MFU,
+ ARC_STATE_MFU_GHOST,
+ ARC_STATE_L2C_ONLY,
+ ARC_STATE_NUMTYPES
+} arc_state_type_t;
+
+typedef struct arc_buf_info {
+ arc_state_type_t abi_state_type;
+ arc_buf_contents_t abi_state_contents;
+ uint64_t abi_state_index;
+ uint32_t abi_flags;
+ uint32_t abi_bufcnt;
+ uint64_t abi_size;
+ uint64_t abi_spa;
+ uint64_t abi_access;
+ uint32_t abi_mru_hits;
+ uint32_t abi_mru_ghost_hits;
+ uint32_t abi_mfu_hits;
+ uint32_t abi_mfu_ghost_hits;
+ uint32_t abi_l2arc_hits;
+ uint32_t abi_holds;
+ uint64_t abi_l2arc_dattr;
+ uint64_t abi_l2arc_asize;
+ enum zio_compress abi_l2arc_compress;
+} arc_buf_info_t;
+
+void arc_space_consume(uint64_t space, arc_space_type_t type);
+void arc_space_return(uint64_t space, arc_space_type_t type);
+boolean_t arc_is_metadata(arc_buf_t *buf);
+enum zio_compress arc_get_compression(arc_buf_t *buf);
+int arc_decompress(arc_buf_t *buf);
+arc_buf_t *arc_alloc_buf(spa_t *spa, void *tag, arc_buf_contents_t type,
+ int32_t size);
+arc_buf_t *arc_alloc_compressed_buf(spa_t *spa, void *tag,
+ uint64_t psize, uint64_t lsize, enum zio_compress compression_type);
+arc_buf_t *arc_loan_buf(spa_t *spa, boolean_t is_metadata, int size);
+arc_buf_t *arc_loan_compressed_buf(spa_t *spa, uint64_t psize, uint64_t lsize,
+ enum zio_compress compression_type);
+void arc_return_buf(arc_buf_t *buf, void *tag);
+void arc_loan_inuse_buf(arc_buf_t *buf, void *tag);
+void arc_buf_destroy(arc_buf_t *buf, void *tag);
+void arc_buf_info(arc_buf_t *buf, arc_buf_info_t *abi, int state_index);
+int arc_buf_size(arc_buf_t *buf);
+int arc_buf_lsize(arc_buf_t *buf);
+void arc_buf_access(arc_buf_t *buf);
+void arc_release(arc_buf_t *buf, void *tag);
+int arc_released(arc_buf_t *buf);
+void arc_buf_freeze(arc_buf_t *buf);
+void arc_buf_thaw(arc_buf_t *buf);
+#ifdef ZFS_DEBUG
+int arc_referenced(arc_buf_t *buf);
+#endif
+
+int arc_read(zio_t *pio, spa_t *spa, const blkptr_t *bp,
+ arc_read_done_func_t *done, void *priv, zio_priority_t priority,
+ int flags, arc_flags_t *arc_flags, const zbookmark_phys_t *zb);
+zio_t *arc_write(zio_t *pio, spa_t *spa, uint64_t txg,
+ blkptr_t *bp, arc_buf_t *buf, boolean_t l2arc, const zio_prop_t *zp,
+ arc_write_done_func_t *ready, arc_write_done_func_t *child_ready,
+ arc_write_done_func_t *physdone, arc_write_done_func_t *done,
+ void *priv, zio_priority_t priority, int zio_flags,
+ const zbookmark_phys_t *zb);
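+
+/*
+ * Synchronous read sketch (hypothetical consumer; pairs with the
+ * my_read_done() example above, and assumes ZIO_PRIORITY_SYNC_READ and
+ * ZIO_FLAG_CANFAIL from zio.h):
+ *
+ *	arc_flags_t aflags = ARC_FLAG_WAIT;
+ *	boolean_t failed = B_FALSE;
+ *	int err = arc_read(NULL, spa, bp, my_read_done, &failed,
+ *	    ZIO_PRIORITY_SYNC_READ, ZIO_FLAG_CANFAIL, &aflags, zb);
+ */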
+
+arc_prune_t *arc_add_prune_callback(arc_prune_func_t *func, void *priv);
+void arc_remove_prune_callback(arc_prune_t *p);
+void arc_freed(spa_t *spa, const blkptr_t *bp);
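+
+/*
+ * Prune-callback sketch (hypothetical consumer): the registered function
+ * is asked to drop references to roughly 'bytes' worth of metadata.
+ */
+#if 0
+static void
+my_prune(int64_t bytes, void *priv)
+{
+	/* release cached metadata references so the ARC can reclaim them */
+}
+
+static void
+my_prune_setup(void)
+{
+	arc_prune_t *ap = arc_add_prune_callback(my_prune, NULL);
+
+	/* ... on teardown ... */
+	arc_remove_prune_callback(ap);
+}
+#endif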
+
+void arc_flush(spa_t *spa, boolean_t retry);
+void arc_tempreserve_clear(uint64_t reserve);
+int arc_tempreserve_space(spa_t *spa, uint64_t reserve, uint64_t txg);
+
+uint64_t arc_max_bytes(void);
+void arc_init(void);
+void arc_fini(void);
+
+/*
+ * Level 2 ARC
+ */
+
+void l2arc_add_vdev(spa_t *spa, vdev_t *vd);
+void l2arc_remove_vdev(vdev_t *vd);
+boolean_t l2arc_vdev_present(vdev_t *vd);
+void l2arc_init(void);
+void l2arc_fini(void);
+void l2arc_start(void);
+void l2arc_stop(void);
+
+#ifdef illumos
+#ifndef _KERNEL
+extern boolean_t arc_watch;
+extern int arc_procfd;
+#endif
+#endif /* illumos */
+
+#ifdef __cplusplus
+}
+#endif
+
+#endif /* _SYS_ARC_H */
diff --git a/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/blkptr.h b/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/blkptr.h
new file mode 100644
index 000000000000..77b1b827ac37
--- /dev/null
+++ b/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/blkptr.h
@@ -0,0 +1,39 @@
+/*
+ * CDDL HEADER START
+ *
+ * This file and its contents are supplied under the terms of the
+ * Common Development and Distribution License ("CDDL"), version 1.0.
+ * You may only use this file in accordance with the terms of version
+ * 1.0 of the CDDL.
+ *
+ * A full copy of the text of the CDDL should have accompanied this
+ * source. A copy of the CDDL is also available via the Internet at
+ * http://www.illumos.org/license/CDDL.
+ *
+ * CDDL HEADER END
+ */
+
+/*
+ * Copyright (c) 2013 by Delphix. All rights reserved.
+ */
+
+#ifndef _SYS_BLKPTR_H
+#define _SYS_BLKPTR_H
+
+#include <sys/spa.h>
+#include <sys/zio.h>
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+void encode_embedded_bp_compressed(blkptr_t *, void *,
+ enum zio_compress, int, int);
+void decode_embedded_bp_compressed(const blkptr_t *, void *);
+int decode_embedded_bp(const blkptr_t *, void *, int);
+
+#ifdef __cplusplus
+}
+#endif
+
+#endif /* _SYS_BLKPTR_H */
diff --git a/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/bplist.h b/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/bplist.h
new file mode 100644
index 000000000000..471be9047ec2
--- /dev/null
+++ b/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/bplist.h
@@ -0,0 +1,57 @@
+/*
+ * CDDL HEADER START
+ *
+ * The contents of this file are subject to the terms of the
+ * Common Development and Distribution License (the "License").
+ * You may not use this file except in compliance with the License.
+ *
+ * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
+ * or http://www.opensolaris.org/os/licensing.
+ * See the License for the specific language governing permissions
+ * and limitations under the License.
+ *
+ * When distributing Covered Code, include this CDDL HEADER in each
+ * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
+ * If applicable, add the following below this CDDL HEADER, with the
+ * fields enclosed by brackets "[]" replaced with your own identifying
+ * information: Portions Copyright [yyyy] [name of copyright owner]
+ *
+ * CDDL HEADER END
+ */
+/*
+ * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved.
+ */
+
+#ifndef _SYS_BPLIST_H
+#define _SYS_BPLIST_H
+
+#include <sys/zfs_context.h>
+#include <sys/spa.h>
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+typedef struct bplist_entry {
+ blkptr_t bpe_blk;
+ list_node_t bpe_node;
+} bplist_entry_t;
+
+typedef struct bplist {
+ kmutex_t bpl_lock;
+ list_t bpl_list;
+} bplist_t;
+
+typedef int bplist_itor_t(void *arg, const blkptr_t *bp, dmu_tx_t *tx);
+
+void bplist_create(bplist_t *bpl);
+void bplist_destroy(bplist_t *bpl);
+void bplist_append(bplist_t *bpl, const blkptr_t *bp);
+void bplist_iterate(bplist_t *bpl, bplist_itor_t *func,
+ void *arg, dmu_tx_t *tx);
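+
+/*
+ * Iterator sketch (hypothetical consumer): bplist_iterate() calls the
+ * callback once per appended blkptr_t, draining the list as it goes.
+ */
+#if 0
+static int
+count_bps(void *arg, const blkptr_t *bp, dmu_tx_t *tx)
+{
+	uint64_t *count = arg;
+
+	(*count)++;
+	return (0);
+}
+
+static void
+count_all(bplist_t *bpl, dmu_tx_t *tx)
+{
+	uint64_t count = 0;
+
+	bplist_iterate(bpl, count_bps, &count, tx);
+}
+#endif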
+
+#ifdef __cplusplus
+}
+#endif
+
+#endif /* _SYS_BPLIST_H */
diff --git a/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/bpobj.h b/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/bpobj.h
new file mode 100644
index 000000000000..d425e239f6a6
--- /dev/null
+++ b/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/bpobj.h
@@ -0,0 +1,95 @@
+/*
+ * CDDL HEADER START
+ *
+ * The contents of this file are subject to the terms of the
+ * Common Development and Distribution License (the "License").
+ * You may not use this file except in compliance with the License.
+ *
+ * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
+ * or http://www.opensolaris.org/os/licensing.
+ * See the License for the specific language governing permissions
+ * and limitations under the License.
+ *
+ * When distributing Covered Code, include this CDDL HEADER in each
+ * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
+ * If applicable, add the following below this CDDL HEADER, with the
+ * fields enclosed by brackets "[]" replaced with your own identifying
+ * information: Portions Copyright [yyyy] [name of copyright owner]
+ *
+ * CDDL HEADER END
+ */
+/*
+ * Copyright (c) 2010, Oracle and/or its affiliates. All rights reserved.
+ * Copyright (c) 2012, 2015 by Delphix. All rights reserved.
+ */
+
+#ifndef _SYS_BPOBJ_H
+#define _SYS_BPOBJ_H
+
+#include <sys/dmu.h>
+#include <sys/spa.h>
+#include <sys/txg.h>
+#include <sys/zio.h>
+#include <sys/zfs_context.h>
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+typedef struct bpobj_phys {
+ /*
+ * This is the bonus buffer for the dead lists. The object's
+ * contents are an array of bpo_num_blkptrs blkptr_t's, representing
+ * a total of bpo_bytes physical space.
+ */
+ uint64_t bpo_num_blkptrs;
+ uint64_t bpo_bytes;
+ uint64_t bpo_comp;
+ uint64_t bpo_uncomp;
+ uint64_t bpo_subobjs;
+ uint64_t bpo_num_subobjs;
+} bpobj_phys_t;
+
+#define BPOBJ_SIZE_V0 (2 * sizeof (uint64_t))
+#define BPOBJ_SIZE_V1 (4 * sizeof (uint64_t))
+
+typedef struct bpobj {
+ kmutex_t bpo_lock;
+ objset_t *bpo_os;
+ uint64_t bpo_object;
+ int bpo_epb;
+ uint8_t bpo_havecomp;
+ uint8_t bpo_havesubobj;
+ bpobj_phys_t *bpo_phys;
+ dmu_buf_t *bpo_dbuf;
+ dmu_buf_t *bpo_cached_dbuf;
+} bpobj_t;
+
+typedef int bpobj_itor_t(void *arg, const blkptr_t *bp, dmu_tx_t *tx);
+
+uint64_t bpobj_alloc(objset_t *mos, int blocksize, dmu_tx_t *tx);
+uint64_t bpobj_alloc_empty(objset_t *os, int blocksize, dmu_tx_t *tx);
+void bpobj_free(objset_t *os, uint64_t obj, dmu_tx_t *tx);
+void bpobj_decr_empty(objset_t *os, dmu_tx_t *tx);
+
+int bpobj_open(bpobj_t *bpo, objset_t *mos, uint64_t object);
+void bpobj_close(bpobj_t *bpo);
+boolean_t bpobj_is_open(const bpobj_t *bpo);
+
+int bpobj_iterate(bpobj_t *bpo, bpobj_itor_t func, void *arg, dmu_tx_t *tx);
+int bpobj_iterate_nofree(bpobj_t *bpo, bpobj_itor_t func, void *, dmu_tx_t *);
+
+void bpobj_enqueue_subobj(bpobj_t *bpo, uint64_t subobj, dmu_tx_t *tx);
+void bpobj_enqueue(bpobj_t *bpo, const blkptr_t *bp, dmu_tx_t *tx);
+
+int bpobj_space(bpobj_t *bpo,
+ uint64_t *usedp, uint64_t *compp, uint64_t *uncompp);
+int bpobj_space_range(bpobj_t *bpo, uint64_t mintxg, uint64_t maxtxg,
+ uint64_t *usedp, uint64_t *compp, uint64_t *uncompp);
+boolean_t bpobj_is_empty(bpobj_t *bpo);
+
+#ifdef __cplusplus
+}
+#endif
+
+#endif /* _SYS_BPOBJ_H */
diff --git a/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/bptree.h b/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/bptree.h
new file mode 100644
index 000000000000..327c128bf493
--- /dev/null
+++ b/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/bptree.h
@@ -0,0 +1,65 @@
+/*
+ * CDDL HEADER START
+ *
+ * The contents of this file are subject to the terms of the
+ * Common Development and Distribution License (the "License").
+ * You may not use this file except in compliance with the License.
+ *
+ * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
+ * or http://www.opensolaris.org/os/licensing.
+ * See the License for the specific language governing permissions
+ * and limitations under the License.
+ *
+ * When distributing Covered Code, include this CDDL HEADER in each
+ * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
+ * If applicable, add the following below this CDDL HEADER, with the
+ * fields enclosed by brackets "[]" replaced with your own identifying
+ * information: Portions Copyright [yyyy] [name of copyright owner]
+ *
+ * CDDL HEADER END
+ */
+/*
+ * Copyright (c) 2012, 2014 by Delphix. All rights reserved.
+ */
+
+#ifndef _SYS_BPTREE_H
+#define _SYS_BPTREE_H
+
+#include <sys/spa.h>
+#include <sys/zio.h>
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+typedef struct bptree_phys {
+ uint64_t bt_begin;
+ uint64_t bt_end;
+ uint64_t bt_bytes;
+ uint64_t bt_comp;
+ uint64_t bt_uncomp;
+} bptree_phys_t;
+
+typedef struct bptree_entry_phys {
+ blkptr_t be_bp;
+ uint64_t be_birth_txg; /* only delete blocks born after this txg */
+ zbookmark_phys_t be_zb; /* holds traversal resume point if needed */
+} bptree_entry_phys_t;
+
+typedef int bptree_itor_t(void *arg, const blkptr_t *bp, dmu_tx_t *tx);
+
+uint64_t bptree_alloc(objset_t *os, dmu_tx_t *tx);
+int bptree_free(objset_t *os, uint64_t obj, dmu_tx_t *tx);
+boolean_t bptree_is_empty(objset_t *os, uint64_t obj);
+
+void bptree_add(objset_t *os, uint64_t obj, blkptr_t *bp, uint64_t birth_txg,
+ uint64_t bytes, uint64_t comp, uint64_t uncomp, dmu_tx_t *tx);
+
+int bptree_iterate(objset_t *os, uint64_t obj, boolean_t free,
+ bptree_itor_t func, void *arg, dmu_tx_t *tx);
+
+#ifdef __cplusplus
+}
+#endif
+
+#endif /* _SYS_BPTREE_H */
diff --git a/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/bqueue.h b/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/bqueue.h
new file mode 100644
index 000000000000..63722df1bbf3
--- /dev/null
+++ b/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/bqueue.h
@@ -0,0 +1,54 @@
+/*
+ * CDDL HEADER START
+ *
+ * This file and its contents are supplied under the terms of the
+ * Common Development and Distribution License ("CDDL"), version 1.0.
+ * You may only use this file in accordance with the terms of version
+ * 1.0 of the CDDL.
+ *
+ * A full copy of the text of the CDDL should have accompanied this
+ * source. A copy of the CDDL is also available via the Internet at
+ * http://www.illumos.org/license/CDDL.
+ *
+ * CDDL HEADER END
+ */
+/*
+ * Copyright (c) 2014 by Delphix. All rights reserved.
+ */
+
+#ifndef _BQUEUE_H
+#define _BQUEUE_H
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+#include <sys/zfs_context.h>
+
+typedef struct bqueue {
+ list_t bq_list;
+ kmutex_t bq_lock;
+ kcondvar_t bq_add_cv;
+ kcondvar_t bq_pop_cv;
+ uint64_t bq_size;
+ uint64_t bq_maxsize;
+ size_t bq_node_offset;
+} bqueue_t;
+
+typedef struct bqueue_node {
+ list_node_t bqn_node;
+ uint64_t bqn_size;
+} bqueue_node_t;
+
+
+int bqueue_init(bqueue_t *, uint64_t, size_t);
+void bqueue_destroy(bqueue_t *);
+void bqueue_enqueue(bqueue_t *, void *, uint64_t);
+void *bqueue_dequeue(bqueue_t *);
+boolean_t bqueue_empty(bqueue_t *);
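+
+/*
+ * Usage sketch (hypothetical consumer): entries embed a bqueue_node_t,
+ * and the queue is told its offset so dequeued pointers reference the
+ * enclosing entry.
+ */
+#if 0
+typedef struct my_entry {
+	uint64_t me_payload;
+	bqueue_node_t me_bqnode;
+} my_entry_t;
+
+static void
+roundtrip(my_entry_t *e)
+{
+	bqueue_t q;
+
+	VERIFY0(bqueue_init(&q, 16 * 1024 * 1024,
+	    offsetof(my_entry_t, me_bqnode)));
+	bqueue_enqueue(&q, e, sizeof (*e));
+	e = bqueue_dequeue(&q);
+	bqueue_destroy(&q);
+}
+#endif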
+
+#ifdef __cplusplus
+}
+#endif
+
+#endif /* _BQUEUE_H */
diff --git a/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/cityhash.h b/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/cityhash.h
new file mode 100644
index 000000000000..33c3b7bc2532
--- /dev/null
+++ b/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/cityhash.h
@@ -0,0 +1,41 @@
+// Copyright (c) 2011 Google, Inc.
+//
+// Permission is hereby granted, free of charge, to any person obtaining a copy
+// of this software and associated documentation files (the "Software"), to deal
+// in the Software without restriction, including without limitation the rights
+// to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+// copies of the Software, and to permit persons to whom the Software is
+// furnished to do so, subject to the following conditions:
+//
+// The above copyright notice and this permission notice shall be included in
+// all copies or substantial portions of the Software.
+//
+// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+// IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+// FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+// AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+// LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+// OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
+// THE SOFTWARE.
+
+
+/*
+ * Copyright (c) 2017 by Delphix. All rights reserved.
+ */
+
+#ifndef _SYS_CITYHASH_H
+#define _SYS_CITYHASH_H
+
+#include <sys/zfs_context.h>
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+uint64_t cityhash4(uint64_t, uint64_t, uint64_t, uint64_t);
+
+#ifdef __cplusplus
+}
+#endif
+
+#endif /* _SYS_CITYHASH_H */
diff --git a/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/dbuf.h b/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/dbuf.h
new file mode 100644
index 000000000000..4b1a9e11b165
--- /dev/null
+++ b/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/dbuf.h
@@ -0,0 +1,417 @@
+/*
+ * CDDL HEADER START
+ *
+ * The contents of this file are subject to the terms of the
+ * Common Development and Distribution License (the "License").
+ * You may not use this file except in compliance with the License.
+ *
+ * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
+ * or http://www.opensolaris.org/os/licensing.
+ * See the License for the specific language governing permissions
+ * and limitations under the License.
+ *
+ * When distributing Covered Code, include this CDDL HEADER in each
+ * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
+ * If applicable, add the following below this CDDL HEADER, with the
+ * fields enclosed by brackets "[]" replaced with your own identifying
+ * information: Portions Copyright [yyyy] [name of copyright owner]
+ *
+ * CDDL HEADER END
+ */
+/*
+ * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved.
+ * Copyright (c) 2012, 2018 by Delphix. All rights reserved.
+ * Copyright (c) 2013 by Saso Kiselkov. All rights reserved.
+ * Copyright (c) 2014 Spectra Logic Corporation, All rights reserved.
+ */
+
+#ifndef _SYS_DBUF_H
+#define _SYS_DBUF_H
+
+#include <sys/dmu.h>
+#include <sys/spa.h>
+#include <sys/txg.h>
+#include <sys/zio.h>
+#include <sys/arc.h>
+#include <sys/zfs_context.h>
+#include <sys/refcount.h>
+#include <sys/zrlock.h>
+#include <sys/multilist.h>
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+#define IN_DMU_SYNC 2
+
+/*
+ * define flags for dbuf_read
+ */
+
+#define DB_RF_MUST_SUCCEED (1 << 0)
+#define DB_RF_CANFAIL (1 << 1)
+#define DB_RF_HAVESTRUCT (1 << 2)
+#define DB_RF_NOPREFETCH (1 << 3)
+#define DB_RF_NEVERWAIT (1 << 4)
+#define DB_RF_CACHED (1 << 5)
+
+/*
+ * The simplified state transition diagram for dbufs looks like:
+ *
+ * +----> READ ----+
+ * | |
+ * | V
+ * (alloc)-->UNCACHED CACHED-->EVICTING-->(free)
+ * | ^ ^
+ * | | |
+ * +----> FILL ----+ |
+ * | |
+ * | |
+ * +--------> NOFILL -------+
+ *
+ * DB_SEARCH is an invalid state for a dbuf. It is used by dbuf_free_range
+ * to find all dbufs in a range of a dnode and must be less than any other
+ * dbuf_states_t (see comment on dn_dbufs in dnode.h).
+ */
+typedef enum dbuf_states {
+ DB_SEARCH = -1,
+ DB_UNCACHED,
+ DB_FILL,
+ DB_NOFILL,
+ DB_READ,
+ DB_CACHED,
+ DB_EVICTING
+} dbuf_states_t;
+
+typedef enum dbuf_cached_state {
+ DB_NO_CACHE = -1,
+ DB_DBUF_CACHE,
+ DB_DBUF_METADATA_CACHE,
+ DB_CACHE_MAX
+} dbuf_cached_state_t;
+
+struct dnode;
+struct dmu_tx;
+
+/*
+ * level = 0 means the user data
+ * level = 1 means the single indirect block
+ * etc.
+ */
+
+struct dmu_buf_impl;
+
+typedef enum override_states {
+ DR_NOT_OVERRIDDEN,
+ DR_IN_DMU_SYNC,
+ DR_OVERRIDDEN
+} override_states_t;
+
+typedef struct dbuf_dirty_record {
+ /* link on our parent's dirty list */
+ list_node_t dr_dirty_node;
+
+ /* transaction group this data will sync in */
+ uint64_t dr_txg;
+
+ /* zio of outstanding write IO */
+ zio_t *dr_zio;
+
+ /* pointer back to our dbuf */
+ struct dmu_buf_impl *dr_dbuf;
+
+ /* pointer to next dirty record */
+ struct dbuf_dirty_record *dr_next;
+
+ /* pointer to parent dirty record */
+ struct dbuf_dirty_record *dr_parent;
+
+ /* How much space was changed to dsl_pool_dirty_space() for this? */
+ unsigned int dr_accounted;
+
+ /* A copy of the bp that points to us */
+ blkptr_t dr_bp_copy;
+
+ union dirty_types {
+ struct dirty_indirect {
+
+ /* protect access to list */
+ kmutex_t dr_mtx;
+
+ /* Our list of dirty children */
+ list_t dr_children;
+ } di;
+ struct dirty_leaf {
+
+ /*
+ * dr_data is set when we dirty the buffer
+ * so that we can retain the pointer even if it
+ * gets COW'd in a subsequent transaction group.
+ */
+ arc_buf_t *dr_data;
+ blkptr_t dr_overridden_by;
+ override_states_t dr_override_state;
+ uint8_t dr_copies;
+ boolean_t dr_nopwrite;
+ } dl;
+ } dt;
+} dbuf_dirty_record_t;
+
+typedef struct dmu_buf_impl {
+ /*
+ * The following members are immutable, with the exception of
+ * db.db_data, which is protected by db_mtx.
+ */
+
+ /* the publicly visible structure */
+ dmu_buf_t db;
+
+ /* the objset we belong to */
+ struct objset *db_objset;
+
+ /*
+ * handle to safely access the dnode we belong to (NULL when evicted)
+ */
+ struct dnode_handle *db_dnode_handle;
+
+ /*
+ * our parent buffer; if the dnode points to us directly,
+ * db_parent == db_dnode_handle->dnh_dnode->dn_dbuf
+ * only accessed by sync thread ???
+ * (NULL when evicted)
+ * May change from NULL to non-NULL under the protection of db_mtx
+ * (see dbuf_check_blkptr())
+ */
+ struct dmu_buf_impl *db_parent;
+
+ /*
+ * link for hash table of all dmu_buf_impl_t's
+ */
+ struct dmu_buf_impl *db_hash_next;
+
+ /*
+ * Our link on the owner dnode's dn_dbufs list.
+ * Protected by its dn_dbufs_mtx. Should be on the same cache line
+ * as db_level and db_blkid for the best avl_add() performance.
+ */
+ avl_node_t db_link;
+
+ /* our block number */
+ uint64_t db_blkid;
+
+ /*
+ * Pointer to the blkptr_t which points to us. May be NULL if we
+ * don't have one yet. (NULL when evicted)
+ */
+ blkptr_t *db_blkptr;
+
+ /*
+ * Our indirection level. Data buffers have db_level==0.
+ * Indirect buffers which point to data buffers have
+ * db_level==1, and so on. Buffers which contain dnodes have
+ * db_level==0, since the dnodes are stored in a file.
+ */
+ uint8_t db_level;
+
+ /* db_mtx protects the members below */
+ kmutex_t db_mtx;
+
+ /*
+ * Current state of the buffer
+ */
+ dbuf_states_t db_state;
+
+ /*
+ * Refcount accessed by dmu_buf_{hold,rele}.
+ * If nonzero, the buffer can't be destroyed.
+ * Protected by db_mtx.
+ */
+ zfs_refcount_t db_holds;
+
+ /* buffer holding our data */
+ arc_buf_t *db_buf;
+
+ kcondvar_t db_changed;
+ dbuf_dirty_record_t *db_data_pending;
+
+ /* pointer to most recent dirty record for this buffer */
+ dbuf_dirty_record_t *db_last_dirty;
+
+ /* Link in dbuf_cache or dbuf_metadata_cache */
+ multilist_node_t db_cache_link;
+
+ /* Tells us which dbuf cache this dbuf is in, if any */
+ dbuf_cached_state_t db_caching_status;
+
+ /* Data which is unique to data (leaf) blocks: */
+
+ /* User callback information. */
+ dmu_buf_user_t *db_user;
+
+ /*
+ * Evict user data as soon as the dirty and reference
+ * counts are equal.
+ */
+ uint8_t db_user_immediate_evict;
+
+ /*
+ * This block was freed while a read or write was
+ * active.
+ */
+ uint8_t db_freed_in_flight;
+
+ /*
+ * dnode_evict_dbufs() or dnode_evict_bonus() tried to
+ * evict this dbuf, but couldn't due to outstanding
+ * references. Evict once the refcount drops to 0.
+ */
+ uint8_t db_pending_evict;
+
+ uint8_t db_dirtycnt;
+} dmu_buf_impl_t;
+
+/* Note: the dbuf hash table is exposed only for the mdb module */
+#define DBUF_MUTEXES 256
+#define DBUF_HASH_MUTEX(h, idx) (&(h)->hash_mutexes[(idx) & (DBUF_MUTEXES-1)])
+typedef struct dbuf_hash_table {
+ uint64_t hash_table_mask;
+ dmu_buf_impl_t **hash_table;
+ kmutex_t hash_mutexes[DBUF_MUTEXES];
+} dbuf_hash_table_t;
+
+uint64_t dbuf_whichblock(struct dnode *di, int64_t level, uint64_t offset);
+
+dmu_buf_impl_t *dbuf_create_tlib(struct dnode *dn, char *data);
+void dbuf_create_bonus(struct dnode *dn);
+int dbuf_spill_set_blksz(dmu_buf_t *db, uint64_t blksz, dmu_tx_t *tx);
+void dbuf_spill_hold(struct dnode *dn, dmu_buf_impl_t **dbp, void *tag);
+
+void dbuf_rm_spill(struct dnode *dn, dmu_tx_t *tx);
+
+dmu_buf_impl_t *dbuf_hold(struct dnode *dn, uint64_t blkid, void *tag);
+dmu_buf_impl_t *dbuf_hold_level(struct dnode *dn, int level, uint64_t blkid,
+ void *tag);
+int dbuf_hold_impl(struct dnode *dn, uint8_t level, uint64_t blkid,
+ boolean_t fail_sparse, boolean_t fail_uncached,
+ void *tag, dmu_buf_impl_t **dbp);
+
+void dbuf_prefetch(struct dnode *dn, int64_t level, uint64_t blkid,
+ zio_priority_t prio, arc_flags_t aflags);
+
+void dbuf_add_ref(dmu_buf_impl_t *db, void *tag);
+boolean_t dbuf_try_add_ref(dmu_buf_t *db, objset_t *os, uint64_t obj,
+ uint64_t blkid, void *tag);
+uint64_t dbuf_refcount(dmu_buf_impl_t *db);
+
+void dbuf_rele(dmu_buf_impl_t *db, void *tag);
+void dbuf_rele_and_unlock(dmu_buf_impl_t *db, void *tag, boolean_t evicting);
+
+dmu_buf_impl_t *dbuf_find(struct objset *os, uint64_t object, uint8_t level,
+ uint64_t blkid);
+
+int dbuf_read(dmu_buf_impl_t *db, zio_t *zio, uint32_t flags);
+void dmu_buf_will_not_fill(dmu_buf_t *db, dmu_tx_t *tx);
+void dmu_buf_will_fill(dmu_buf_t *db, dmu_tx_t *tx);
+void dmu_buf_fill_done(dmu_buf_t *db, dmu_tx_t *tx);
+void dbuf_assign_arcbuf(dmu_buf_impl_t *db, arc_buf_t *buf, dmu_tx_t *tx);
+dbuf_dirty_record_t *dbuf_dirty(dmu_buf_impl_t *db, dmu_tx_t *tx);
+arc_buf_t *dbuf_loan_arcbuf(dmu_buf_impl_t *db);
+void dmu_buf_write_embedded(dmu_buf_t *dbuf, void *data,
+ bp_embedded_type_t etype, enum zio_compress comp,
+ int uncompressed_size, int compressed_size, int byteorder, dmu_tx_t *tx);
+
+void dbuf_destroy(dmu_buf_impl_t *db);
+
+void dbuf_setdirty(dmu_buf_impl_t *db, dmu_tx_t *tx);
+void dbuf_unoverride(dbuf_dirty_record_t *dr);
+void dbuf_sync_list(list_t *list, int level, dmu_tx_t *tx);
+void dbuf_release_bp(dmu_buf_impl_t *db);
+
+boolean_t dbuf_can_remap(const dmu_buf_impl_t *buf);
+
+void dbuf_free_range(struct dnode *dn, uint64_t start, uint64_t end,
+ struct dmu_tx *);
+
+void dbuf_new_size(dmu_buf_impl_t *db, int size, dmu_tx_t *tx);
+
+void dbuf_stats_init(dbuf_hash_table_t *hash);
+void dbuf_stats_destroy(void);
+
+#define DB_DNODE(_db) ((_db)->db_dnode_handle->dnh_dnode)
+#define DB_DNODE_LOCK(_db) ((_db)->db_dnode_handle->dnh_zrlock)
+#define DB_DNODE_ENTER(_db) (zrl_add(&DB_DNODE_LOCK(_db)))
+#define DB_DNODE_EXIT(_db) (zrl_remove(&DB_DNODE_LOCK(_db)))
+#define DB_DNODE_HELD(_db) (!zrl_is_zero(&DB_DNODE_LOCK(_db)))
+
+void dbuf_init(void);
+void dbuf_fini(void);
+
+boolean_t dbuf_is_metadata(dmu_buf_impl_t *db);
+
+#define DBUF_GET_BUFC_TYPE(_db) \
+ (dbuf_is_metadata(_db) ? ARC_BUFC_METADATA : ARC_BUFC_DATA)
+
+#define DBUF_IS_CACHEABLE(_db) \
+ ((_db)->db_objset->os_primary_cache == ZFS_CACHE_ALL || \
+ (dbuf_is_metadata(_db) && \
+ ((_db)->db_objset->os_primary_cache == ZFS_CACHE_METADATA)))
+
+#define DBUF_IS_L2CACHEABLE(_db) \
+ ((_db)->db_objset->os_secondary_cache == ZFS_CACHE_ALL || \
+ (dbuf_is_metadata(_db) && \
+ ((_db)->db_objset->os_secondary_cache == ZFS_CACHE_METADATA)))
+
+#define DNODE_LEVEL_IS_L2CACHEABLE(_dn, _level) \
+ ((_dn)->dn_objset->os_secondary_cache == ZFS_CACHE_ALL || \
+ (((_level) > 0 || \
+ DMU_OT_IS_METADATA((_dn)->dn_handle->dnh_dnode->dn_type)) && \
+ ((_dn)->dn_objset->os_secondary_cache == ZFS_CACHE_METADATA)))
+
+#ifdef ZFS_DEBUG
+
+/*
+ * There should be a ## between the string literal and fmt, to make it
+ * clear that we're joining two strings together, but gcc does not
+ * support that preprocessor token.
+ */
+#define dprintf_dbuf(dbuf, fmt, ...) do { \
+ if (zfs_flags & ZFS_DEBUG_DPRINTF) { \
+ char __db_buf[32]; \
+ uint64_t __db_obj = (dbuf)->db.db_object; \
+ if (__db_obj == DMU_META_DNODE_OBJECT) \
+ (void) strcpy(__db_buf, "mdn"); \
+ else \
+ (void) snprintf(__db_buf, sizeof (__db_buf), "%lld", \
+ (u_longlong_t)__db_obj); \
+ dprintf_ds((dbuf)->db_objset->os_dsl_dataset, \
+ "obj=%s lvl=%u blkid=%lld " fmt, \
+ __db_buf, (dbuf)->db_level, \
+ (u_longlong_t)(dbuf)->db_blkid, __VA_ARGS__); \
+ } \
+_NOTE(CONSTCOND) } while (0)
+
+#define dprintf_dbuf_bp(db, bp, fmt, ...) do { \
+ if (zfs_flags & ZFS_DEBUG_DPRINTF) { \
+ char *__blkbuf = kmem_alloc(BP_SPRINTF_LEN, KM_SLEEP); \
+ snprintf_blkptr(__blkbuf, BP_SPRINTF_LEN, bp); \
+ dprintf_dbuf(db, fmt " %s\n", __VA_ARGS__, __blkbuf); \
+ kmem_free(__blkbuf, BP_SPRINTF_LEN); \
+ } \
+_NOTE(CONSTCOND) } while (0)
+
+#define DBUF_VERIFY(db) dbuf_verify(db)
+
+#else
+
+#define dprintf_dbuf(db, fmt, ...)
+#define dprintf_dbuf_bp(db, bp, fmt, ...)
+#define DBUF_VERIFY(db)
+
+#endif
+
+
+#ifdef __cplusplus
+}
+#endif
+
+#endif /* _SYS_DBUF_H */
diff --git a/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/ddt.h b/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/ddt.h
new file mode 100644
index 000000000000..2468a1485fd3
--- /dev/null
+++ b/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/ddt.h
@@ -0,0 +1,248 @@
+/*
+ * CDDL HEADER START
+ *
+ * The contents of this file are subject to the terms of the
+ * Common Development and Distribution License (the "License").
+ * You may not use this file except in compliance with the License.
+ *
+ * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
+ * or http://www.opensolaris.org/os/licensing.
+ * See the License for the specific language governing permissions
+ * and limitations under the License.
+ *
+ * When distributing Covered Code, include this CDDL HEADER in each
+ * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
+ * If applicable, add the following below this CDDL HEADER, with the
+ * fields enclosed by brackets "[]" replaced with your own identifying
+ * information: Portions Copyright [yyyy] [name of copyright owner]
+ *
+ * CDDL HEADER END
+ */
+/*
+ * Copyright (c) 2009, 2010, Oracle and/or its affiliates. All rights reserved.
+ * Copyright (c) 2016 by Delphix. All rights reserved.
+ */
+
+#ifndef _SYS_DDT_H
+#define _SYS_DDT_H
+
+#include <sys/sysmacros.h>
+#include <sys/types.h>
+#include <sys/fs/zfs.h>
+#include <sys/zio.h>
+#include <sys/dmu.h>
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+struct abd;
+
+/*
+ * On-disk DDT formats, in the desired search order (newest version first).
+ */
+enum ddt_type {
+ DDT_TYPE_ZAP = 0,
+ DDT_TYPES
+};
+
+/*
+ * DDT classes, in the desired search order (highest replication level first).
+ */
+enum ddt_class {
+ DDT_CLASS_DITTO = 0,
+ DDT_CLASS_DUPLICATE,
+ DDT_CLASS_UNIQUE,
+ DDT_CLASSES
+};
+
+#define DDT_TYPE_CURRENT 0
+
+#define DDT_COMPRESS_BYTEORDER_MASK 0x80
+#define DDT_COMPRESS_FUNCTION_MASK 0x7f
+
+/*
+ * On-disk ddt entry: key (name) and physical storage (value).
+ */
+typedef struct ddt_key {
+ zio_cksum_t ddk_cksum; /* 256-bit block checksum */
+ /*
+ * Encoded with logical & physical size, and compression, as follows:
+ * +-------+-------+-------+-------+-------+-------+-------+-------+
+ * | 0 | 0 | 0 | comp | PSIZE | LSIZE |
+ * +-------+-------+-------+-------+-------+-------+-------+-------+
+ */
+ uint64_t ddk_prop;
+} ddt_key_t;
+
+#define DDK_GET_LSIZE(ddk) \
+ BF64_GET_SB((ddk)->ddk_prop, 0, 16, SPA_MINBLOCKSHIFT, 1)
+#define DDK_SET_LSIZE(ddk, x) \
+ BF64_SET_SB((ddk)->ddk_prop, 0, 16, SPA_MINBLOCKSHIFT, 1, x)
+
+#define DDK_GET_PSIZE(ddk) \
+ BF64_GET_SB((ddk)->ddk_prop, 16, 16, SPA_MINBLOCKSHIFT, 1)
+#define DDK_SET_PSIZE(ddk, x) \
+ BF64_SET_SB((ddk)->ddk_prop, 16, 16, SPA_MINBLOCKSHIFT, 1, x)
+
+#define DDK_GET_COMPRESS(ddk) BF64_GET((ddk)->ddk_prop, 32, 8)
+#define DDK_SET_COMPRESS(ddk, x) BF64_SET((ddk)->ddk_prop, 32, 8, x)
+
+#define DDT_KEY_WORDS (sizeof (ddt_key_t) / sizeof (uint64_t))
+
+typedef struct ddt_phys {
+ dva_t ddp_dva[SPA_DVAS_PER_BP];
+ uint64_t ddp_refcnt;
+ uint64_t ddp_phys_birth;
+} ddt_phys_t;
+
+enum ddt_phys_type {
+ DDT_PHYS_DITTO = 0,
+ DDT_PHYS_SINGLE = 1,
+ DDT_PHYS_DOUBLE = 2,
+ DDT_PHYS_TRIPLE = 3,
+ DDT_PHYS_TYPES
+};
+
+/*
+ * In-core ddt entry
+ */
+struct ddt_entry {
+ ddt_key_t dde_key;
+ ddt_phys_t dde_phys[DDT_PHYS_TYPES];
+ zio_t *dde_lead_zio[DDT_PHYS_TYPES];
+ struct abd *dde_repair_abd;
+ enum ddt_type dde_type;
+ enum ddt_class dde_class;
+ uint8_t dde_loading;
+ uint8_t dde_loaded;
+ kcondvar_t dde_cv;
+ avl_node_t dde_node;
+};
+
+/*
+ * In-core ddt
+ */
+struct ddt {
+ kmutex_t ddt_lock;
+ avl_tree_t ddt_tree;
+ avl_tree_t ddt_repair_tree;
+ enum zio_checksum ddt_checksum;
+ spa_t *ddt_spa;
+ objset_t *ddt_os;
+ uint64_t ddt_stat_object;
+ uint64_t ddt_object[DDT_TYPES][DDT_CLASSES];
+ ddt_histogram_t ddt_histogram[DDT_TYPES][DDT_CLASSES];
+ ddt_histogram_t ddt_histogram_cache[DDT_TYPES][DDT_CLASSES];
+ ddt_object_t ddt_object_stats[DDT_TYPES][DDT_CLASSES];
+ avl_node_t ddt_node;
+};
+
+/*
+ * In-core and on-disk bookmark for DDT walks
+ */
+typedef struct ddt_bookmark {
+ uint64_t ddb_class;
+ uint64_t ddb_type;
+ uint64_t ddb_checksum;
+ uint64_t ddb_cursor;
+} ddt_bookmark_t;
+
+/*
+ * Ops vector to access a specific DDT object type.
+ */
+typedef struct ddt_ops {
+ char ddt_op_name[32];
+ int (*ddt_op_create)(objset_t *os, uint64_t *object, dmu_tx_t *tx,
+ boolean_t prehash);
+ int (*ddt_op_destroy)(objset_t *os, uint64_t object, dmu_tx_t *tx);
+ int (*ddt_op_lookup)(objset_t *os, uint64_t object, ddt_entry_t *dde);
+ void (*ddt_op_prefetch)(objset_t *os, uint64_t object,
+ ddt_entry_t *dde);
+ int (*ddt_op_update)(objset_t *os, uint64_t object, ddt_entry_t *dde,
+ dmu_tx_t *tx);
+ int (*ddt_op_remove)(objset_t *os, uint64_t object, ddt_entry_t *dde,
+ dmu_tx_t *tx);
+ int (*ddt_op_walk)(objset_t *os, uint64_t object, ddt_entry_t *dde,
+ uint64_t *walk);
+ int (*ddt_op_count)(objset_t *os, uint64_t object, uint64_t *count);
+} ddt_ops_t;
+
+#define DDT_NAMELEN 80
+
+extern void ddt_object_name(ddt_t *ddt, enum ddt_type type,
+ enum ddt_class cls, char *name);
+extern int ddt_object_walk(ddt_t *ddt, enum ddt_type type,
+ enum ddt_class cls, uint64_t *walk, ddt_entry_t *dde);
+extern int ddt_object_count(ddt_t *ddt, enum ddt_type type,
+ enum ddt_class cls, uint64_t *count);
+extern int ddt_object_info(ddt_t *ddt, enum ddt_type type,
+ enum ddt_class cls, dmu_object_info_t *);
+extern boolean_t ddt_object_exists(ddt_t *ddt, enum ddt_type type,
+ enum ddt_class cls);
+
+extern void ddt_bp_fill(const ddt_phys_t *ddp, blkptr_t *bp,
+ uint64_t txg);
+extern void ddt_bp_create(enum zio_checksum checksum, const ddt_key_t *ddk,
+ const ddt_phys_t *ddp, blkptr_t *bp);
+
+extern void ddt_key_fill(ddt_key_t *ddk, const blkptr_t *bp);
+
+extern void ddt_phys_fill(ddt_phys_t *ddp, const blkptr_t *bp);
+extern void ddt_phys_clear(ddt_phys_t *ddp);
+extern void ddt_phys_addref(ddt_phys_t *ddp);
+extern void ddt_phys_decref(ddt_phys_t *ddp);
+extern void ddt_phys_free(ddt_t *ddt, ddt_key_t *ddk, ddt_phys_t *ddp,
+ uint64_t txg);
+extern ddt_phys_t *ddt_phys_select(const ddt_entry_t *dde, const blkptr_t *bp);
+extern uint64_t ddt_phys_total_refcnt(const ddt_entry_t *dde);
+
+extern void ddt_stat_add(ddt_stat_t *dst, const ddt_stat_t *src, uint64_t neg);
+
+extern void ddt_histogram_add(ddt_histogram_t *dst, const ddt_histogram_t *src);
+extern void ddt_histogram_stat(ddt_stat_t *dds, const ddt_histogram_t *ddh);
+extern boolean_t ddt_histogram_empty(const ddt_histogram_t *ddh);
+extern void ddt_get_dedup_object_stats(spa_t *spa, ddt_object_t *ddo);
+extern void ddt_get_dedup_histogram(spa_t *spa, ddt_histogram_t *ddh);
+extern void ddt_get_dedup_stats(spa_t *spa, ddt_stat_t *dds_total);
+
+extern uint64_t ddt_get_dedup_dspace(spa_t *spa);
+extern uint64_t ddt_get_pool_dedup_ratio(spa_t *spa);
+
+extern int ddt_ditto_copies_needed(ddt_t *ddt, ddt_entry_t *dde,
+ ddt_phys_t *ddp_willref);
+extern int ddt_ditto_copies_present(ddt_entry_t *dde);
+
+extern size_t ddt_compress(void *src, uchar_t *dst, size_t s_len, size_t d_len);
+extern void ddt_decompress(uchar_t *src, void *dst, size_t s_len, size_t d_len);
+
+extern ddt_t *ddt_select(spa_t *spa, const blkptr_t *bp);
+extern void ddt_enter(ddt_t *ddt);
+extern void ddt_exit(ddt_t *ddt);
+extern ddt_entry_t *ddt_lookup(ddt_t *ddt, const blkptr_t *bp, boolean_t add);
+extern void ddt_prefetch(spa_t *spa, const blkptr_t *bp);
+extern void ddt_remove(ddt_t *ddt, ddt_entry_t *dde);
+
+extern boolean_t ddt_class_contains(spa_t *spa, enum ddt_class max_class,
+ const blkptr_t *bp);
+
+extern ddt_entry_t *ddt_repair_start(ddt_t *ddt, const blkptr_t *bp);
+extern void ddt_repair_done(ddt_t *ddt, ddt_entry_t *dde);
+
+extern int ddt_entry_compare(const void *x1, const void *x2);
+
+extern void ddt_create(spa_t *spa);
+extern int ddt_load(spa_t *spa);
+extern void ddt_unload(spa_t *spa);
+extern void ddt_sync(spa_t *spa, uint64_t txg);
+extern int ddt_walk(spa_t *spa, ddt_bookmark_t *ddb, ddt_entry_t *dde);
+extern int ddt_object_update(ddt_t *ddt, enum ddt_type type,
+ enum ddt_class cls, ddt_entry_t *dde, dmu_tx_t *tx);
+
+extern const ddt_ops_t ddt_zap_ops;
+
+#ifdef __cplusplus
+}
+#endif
+
+#endif /* _SYS_DDT_H */
diff --git a/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/dmu.h b/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/dmu.h
new file mode 100644
index 000000000000..1f5a837cc717
--- /dev/null
+++ b/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/dmu.h
@@ -0,0 +1,1028 @@
+/*
+ * CDDL HEADER START
+ *
+ * The contents of this file are subject to the terms of the
+ * Common Development and Distribution License (the "License").
+ * You may not use this file except in compliance with the License.
+ *
+ * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
+ * or http://www.opensolaris.org/os/licensing.
+ * See the License for the specific language governing permissions
+ * and limitations under the License.
+ *
+ * When distributing Covered Code, include this CDDL HEADER in each
+ * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
+ * If applicable, add the following below this CDDL HEADER, with the
+ * fields enclosed by brackets "[]" replaced with your own identifying
+ * information: Portions Copyright [yyyy] [name of copyright owner]
+ *
+ * CDDL HEADER END
+ */
+
+/*
+ * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved.
+ * Copyright (c) 2011, 2018 by Delphix. All rights reserved.
+ * Copyright 2011 Nexenta Systems, Inc. All rights reserved.
+ * Copyright (c) 2012, Joyent, Inc. All rights reserved.
+ * Copyright 2013 DEY Storage Systems, Inc.
+ * Copyright 2014 HybridCluster. All rights reserved.
+ * Copyright (c) 2014 Spectra Logic Corporation, All rights reserved.
+ * Copyright 2013 Saso Kiselkov. All rights reserved.
+ * Copyright (c) 2017, Intel Corporation.
+ * Copyright (c) 2014 Integros [integros.com]
+ */
+
+/* Portions Copyright 2010 Robert Milkowski */
+
+#ifndef _SYS_DMU_H
+#define _SYS_DMU_H
+
+/*
+ * This file describes the interface that the DMU provides for its
+ * consumers.
+ *
+ * The DMU also interacts with the SPA. That interface is described in
+ * dmu_spa.h.
+ */
+
+#include <sys/zfs_context.h>
+#include <sys/cred.h>
+#include <sys/fs/zfs.h>
+#include <sys/zio_compress.h>
+#include <sys/zio_priority.h>
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+struct uio;
+struct xuio;
+struct page;
+struct vnode;
+struct spa;
+struct zilog;
+struct zio;
+struct blkptr;
+struct zap_cursor;
+struct dsl_dataset;
+struct dsl_pool;
+struct dnode;
+struct drr_begin;
+struct drr_end;
+struct zbookmark_phys;
+struct spa;
+struct nvlist;
+struct arc_buf;
+struct zio_prop;
+struct sa_handle;
+struct file;
+struct locked_range;
+
+typedef struct objset objset_t;
+typedef struct dmu_tx dmu_tx_t;
+typedef struct dsl_dir dsl_dir_t;
+typedef struct dnode dnode_t;
+
+typedef enum dmu_object_byteswap {
+ DMU_BSWAP_UINT8,
+ DMU_BSWAP_UINT16,
+ DMU_BSWAP_UINT32,
+ DMU_BSWAP_UINT64,
+ DMU_BSWAP_ZAP,
+ DMU_BSWAP_DNODE,
+ DMU_BSWAP_OBJSET,
+ DMU_BSWAP_ZNODE,
+ DMU_BSWAP_OLDACL,
+ DMU_BSWAP_ACL,
+ /*
+ * Allocating a new byteswap type number makes the on-disk format
+ * incompatible with any other format that uses the same number.
+ *
+ * Data can usually be structured to work with one of the
+ * DMU_BSWAP_UINT* or DMU_BSWAP_ZAP types.
+ */
+ DMU_BSWAP_NUMFUNCS
+} dmu_object_byteswap_t;
+
+#define DMU_OT_NEWTYPE 0x80
+#define DMU_OT_METADATA 0x40
+#define DMU_OT_BYTESWAP_MASK 0x3f
+
+/*
+ * Defines a uint8_t object type. Object types specify if the data
+ * in the object is metadata (boolean) and how to byteswap the data
+ * (dmu_object_byteswap_t). All of the types created by this method
+ * are cached in the dbuf metadata cache.
+ */
+#define DMU_OT(byteswap, metadata) \
+ (DMU_OT_NEWTYPE | \
+ ((metadata) ? DMU_OT_METADATA : 0) | \
+ ((byteswap) & DMU_OT_BYTESWAP_MASK))
+
+#define DMU_OT_IS_VALID(ot) (((ot) & DMU_OT_NEWTYPE) ? \
+ ((ot) & DMU_OT_BYTESWAP_MASK) < DMU_BSWAP_NUMFUNCS : \
+ (ot) < DMU_OT_NUMTYPES)
+
+#define DMU_OT_IS_METADATA(ot) (((ot) & DMU_OT_NEWTYPE) ? \
+ ((ot) & DMU_OT_METADATA) : \
+ dmu_ot[(ot)].ot_metadata)
+
+#define DMU_OT_IS_DDT(ot) \
+ ((ot) == DMU_OT_DDT_ZAP)
+
+#define DMU_OT_IS_ZIL(ot) \
+ ((ot) == DMU_OT_INTENT_LOG)
+
+/* Note: ztest uses DMU_OT_UINT64_OTHER as a proxy for file blocks */
+#define DMU_OT_IS_FILE(ot) \
+ ((ot) == DMU_OT_PLAIN_FILE_CONTENTS || (ot) == DMU_OT_UINT64_OTHER)
+
+#define DMU_OT_IS_METADATA_CACHED(ot) (((ot) & DMU_OT_NEWTYPE) ? \
+ B_TRUE : dmu_ot[(ot)].ot_dbuf_metadata_cache)
+
+/*
+ * These object types use bp_fill != 1 for their L0 bp's. Therefore they can't
+ * have their data embedded (i.e. use a BP_IS_EMBEDDED() bp), because bp_fill
+ * is repurposed for embedded BPs.
+ */
+#define DMU_OT_HAS_FILL(ot) \
+ ((ot) == DMU_OT_DNODE || (ot) == DMU_OT_OBJSET)
+
+#define DMU_OT_BYTESWAP(ot) (((ot) & DMU_OT_NEWTYPE) ? \
+ ((ot) & DMU_OT_BYTESWAP_MASK) : \
+ dmu_ot[(ot)].ot_byteswap)
+
+typedef enum dmu_object_type {
+ DMU_OT_NONE,
+ /* general: */
+ DMU_OT_OBJECT_DIRECTORY, /* ZAP */
+ DMU_OT_OBJECT_ARRAY, /* UINT64 */
+ DMU_OT_PACKED_NVLIST, /* UINT8 (XDR by nvlist_pack/unpack) */
+ DMU_OT_PACKED_NVLIST_SIZE, /* UINT64 */
+ DMU_OT_BPOBJ, /* UINT64 */
+ DMU_OT_BPOBJ_HDR, /* UINT64 */
+ /* spa: */
+ DMU_OT_SPACE_MAP_HEADER, /* UINT64 */
+ DMU_OT_SPACE_MAP, /* UINT64 */
+ /* zil: */
+ DMU_OT_INTENT_LOG, /* UINT64 */
+ /* dmu: */
+ DMU_OT_DNODE, /* DNODE */
+ DMU_OT_OBJSET, /* OBJSET */
+ /* dsl: */
+ DMU_OT_DSL_DIR, /* UINT64 */
+ DMU_OT_DSL_DIR_CHILD_MAP, /* ZAP */
+ DMU_OT_DSL_DS_SNAP_MAP, /* ZAP */
+ DMU_OT_DSL_PROPS, /* ZAP */
+ DMU_OT_DSL_DATASET, /* UINT64 */
+ /* zpl: */
+ DMU_OT_ZNODE, /* ZNODE */
+ DMU_OT_OLDACL, /* Old ACL */
+ DMU_OT_PLAIN_FILE_CONTENTS, /* UINT8 */
+ DMU_OT_DIRECTORY_CONTENTS, /* ZAP */
+ DMU_OT_MASTER_NODE, /* ZAP */
+ DMU_OT_UNLINKED_SET, /* ZAP */
+ /* zvol: */
+ DMU_OT_ZVOL, /* UINT8 */
+ DMU_OT_ZVOL_PROP, /* ZAP */
+ /* other; for testing only! */
+ DMU_OT_PLAIN_OTHER, /* UINT8 */
+ DMU_OT_UINT64_OTHER, /* UINT64 */
+ DMU_OT_ZAP_OTHER, /* ZAP */
+ /* new object types: */
+ DMU_OT_ERROR_LOG, /* ZAP */
+ DMU_OT_SPA_HISTORY, /* UINT8 */
+ DMU_OT_SPA_HISTORY_OFFSETS, /* spa_his_phys_t */
+ DMU_OT_POOL_PROPS, /* ZAP */
+ DMU_OT_DSL_PERMS, /* ZAP */
+ DMU_OT_ACL, /* ACL */
+ DMU_OT_SYSACL, /* SYSACL */
+ DMU_OT_FUID, /* FUID table (Packed NVLIST UINT8) */
+ DMU_OT_FUID_SIZE, /* FUID table size UINT64 */
+ DMU_OT_NEXT_CLONES, /* ZAP */
+ DMU_OT_SCAN_QUEUE, /* ZAP */
+ DMU_OT_USERGROUP_USED, /* ZAP */
+ DMU_OT_USERGROUP_QUOTA, /* ZAP */
+ DMU_OT_USERREFS, /* ZAP */
+ DMU_OT_DDT_ZAP, /* ZAP */
+ DMU_OT_DDT_STATS, /* ZAP */
+ DMU_OT_SA, /* System attr */
+ DMU_OT_SA_MASTER_NODE, /* ZAP */
+ DMU_OT_SA_ATTR_REGISTRATION, /* ZAP */
+ DMU_OT_SA_ATTR_LAYOUTS, /* ZAP */
+ DMU_OT_SCAN_XLATE, /* ZAP */
+ DMU_OT_DEDUP, /* fake dedup BP from ddt_bp_create() */
+ DMU_OT_DEADLIST, /* ZAP */
+ DMU_OT_DEADLIST_HDR, /* UINT64 */
+ DMU_OT_DSL_CLONES, /* ZAP */
+ DMU_OT_BPOBJ_SUBOBJ, /* UINT64 */
+ /*
+ * Do not allocate new object types here. Doing so makes the on-disk
+ * format incompatible with any other format that uses the same object
+ * type number.
+ *
+ * When creating an object which does not have one of the above types
+ * use the DMU_OTN_* type with the correct byteswap and metadata
+ * values.
+ *
+ * The DMU_OTN_* types do not have entries in the dmu_ot table,
+ * use the DMU_OT_IS_METADATA() and DMU_OT_BYTESWAP() macros instead
+ * of indexing into dmu_ot directly (this works for both DMU_OT_* types
+ * and DMU_OTN_* types).
+ */
+ DMU_OT_NUMTYPES,
+
+ /*
+ * Names for valid types declared with DMU_OT().
+ */
+ DMU_OTN_UINT8_DATA = DMU_OT(DMU_BSWAP_UINT8, B_FALSE),
+ DMU_OTN_UINT8_METADATA = DMU_OT(DMU_BSWAP_UINT8, B_TRUE),
+ DMU_OTN_UINT16_DATA = DMU_OT(DMU_BSWAP_UINT16, B_FALSE),
+ DMU_OTN_UINT16_METADATA = DMU_OT(DMU_BSWAP_UINT16, B_TRUE),
+ DMU_OTN_UINT32_DATA = DMU_OT(DMU_BSWAP_UINT32, B_FALSE),
+ DMU_OTN_UINT32_METADATA = DMU_OT(DMU_BSWAP_UINT32, B_TRUE),
+ DMU_OTN_UINT64_DATA = DMU_OT(DMU_BSWAP_UINT64, B_FALSE),
+ DMU_OTN_UINT64_METADATA = DMU_OT(DMU_BSWAP_UINT64, B_TRUE),
+ DMU_OTN_ZAP_DATA = DMU_OT(DMU_BSWAP_ZAP, B_FALSE),
+ DMU_OTN_ZAP_METADATA = DMU_OT(DMU_BSWAP_ZAP, B_TRUE),
+} dmu_object_type_t;
+
+/*
+ * These flags are intended to be used to specify the "txg_how"
+ * parameter when calling the dmu_tx_assign() function. See the comment
+ * above dmu_tx_assign() for more details on the meaning of these flags.
+ */
+#define TXG_NOWAIT (0ULL)
+#define TXG_WAIT (1ULL<<0)
+#define TXG_NOTHROTTLE (1ULL<<1)
+
+void byteswap_uint64_array(void *buf, size_t size);
+void byteswap_uint32_array(void *buf, size_t size);
+void byteswap_uint16_array(void *buf, size_t size);
+void byteswap_uint8_array(void *buf, size_t size);
+void zap_byteswap(void *buf, size_t size);
+void zfs_oldacl_byteswap(void *buf, size_t size);
+void zfs_acl_byteswap(void *buf, size_t size);
+void zfs_znode_byteswap(void *buf, size_t size);
+
+#define DS_FIND_SNAPSHOTS (1<<0)
+#define DS_FIND_CHILDREN (1<<1)
+#define DS_FIND_SERIALIZE (1<<2)
+
+/*
+ * The maximum number of bytes that can be accessed as part of one
+ * operation, including metadata.
+ */
+#define DMU_MAX_ACCESS (32 * 1024 * 1024) /* 32MB */
+#define DMU_MAX_DELETEBLKCNT (20480) /* ~5MB of indirect blocks */
+
+#define DMU_USERUSED_OBJECT (-1ULL)
+#define DMU_GROUPUSED_OBJECT (-2ULL)
+
+/*
+ * artificial blkids for bonus buffer and spill blocks
+ */
+#define DMU_BONUS_BLKID (-1ULL)
+#define DMU_SPILL_BLKID (-2ULL)
+/*
+ * Public routines to create, destroy, open, and close objsets.
+ */
+int dmu_objset_hold(const char *name, void *tag, objset_t **osp);
+int dmu_objset_own(const char *name, dmu_objset_type_t type,
+ boolean_t readonly, void *tag, objset_t **osp);
+void dmu_objset_rele(objset_t *os, void *tag);
+void dmu_objset_disown(objset_t *os, void *tag);
+int dmu_objset_open_ds(struct dsl_dataset *ds, objset_t **osp);
+
+void dmu_objset_evict_dbufs(objset_t *os);
+int dmu_objset_create(const char *name, dmu_objset_type_t type, uint64_t flags,
+ void (*func)(objset_t *os, void *arg, cred_t *cr, dmu_tx_t *tx), void *arg);
+int dmu_get_recursive_snaps_nvl(char *fsname, const char *snapname,
+ struct nvlist *snaps);
+int dmu_objset_clone(const char *name, const char *origin);
+int dsl_destroy_snapshots_nvl(struct nvlist *snaps, boolean_t defer,
+ struct nvlist *errlist);
+int dmu_objset_snapshot_one(const char *fsname, const char *snapname);
+int dmu_objset_snapshot_tmp(const char *, const char *, int);
+int dmu_objset_find(char *name, int func(const char *, void *), void *arg,
+ int flags);
+void dmu_objset_byteswap(void *buf, size_t size);
+int dsl_dataset_rename_snapshot(const char *fsname,
+ const char *oldsnapname, const char *newsnapname, boolean_t recursive);
+int dmu_objset_remap_indirects(const char *fsname);
+
+typedef struct dmu_buf {
+ uint64_t db_object; /* object that this buffer is part of */
+ uint64_t db_offset; /* byte offset in this object */
+ uint64_t db_size; /* size of buffer in bytes */
+ void *db_data; /* data in buffer */
+} dmu_buf_t;
+
+/*
+ * The names of zap entries in the DIRECTORY_OBJECT of the MOS.
+ */
+#define DMU_POOL_DIRECTORY_OBJECT 1
+#define DMU_POOL_CONFIG "config"
+#define DMU_POOL_FEATURES_FOR_WRITE "features_for_write"
+#define DMU_POOL_FEATURES_FOR_READ "features_for_read"
+#define DMU_POOL_FEATURE_DESCRIPTIONS "feature_descriptions"
+#define DMU_POOL_FEATURE_ENABLED_TXG "feature_enabled_txg"
+#define DMU_POOL_ROOT_DATASET "root_dataset"
+#define DMU_POOL_SYNC_BPOBJ "sync_bplist"
+#define DMU_POOL_ERRLOG_SCRUB "errlog_scrub"
+#define DMU_POOL_ERRLOG_LAST "errlog_last"
+#define DMU_POOL_SPARES "spares"
+#define DMU_POOL_DEFLATE "deflate"
+#define DMU_POOL_HISTORY "history"
+#define DMU_POOL_PROPS "pool_props"
+#define DMU_POOL_L2CACHE "l2cache"
+#define DMU_POOL_TMP_USERREFS "tmp_userrefs"
+#define DMU_POOL_DDT "DDT-%s-%s-%s"
+#define DMU_POOL_DDT_STATS "DDT-statistics"
+#define DMU_POOL_CREATION_VERSION "creation_version"
+#define DMU_POOL_SCAN "scan"
+#define DMU_POOL_FREE_BPOBJ "free_bpobj"
+#define DMU_POOL_BPTREE_OBJ "bptree_obj"
+#define DMU_POOL_EMPTY_BPOBJ "empty_bpobj"
+#define DMU_POOL_CHECKSUM_SALT "org.illumos:checksum_salt"
+#define DMU_POOL_VDEV_ZAP_MAP "com.delphix:vdev_zap_map"
+#define DMU_POOL_REMOVING "com.delphix:removing"
+#define DMU_POOL_OBSOLETE_BPOBJ "com.delphix:obsolete_bpobj"
+#define DMU_POOL_CONDENSING_INDIRECT "com.delphix:condensing_indirect"
+#define DMU_POOL_ZPOOL_CHECKPOINT "com.delphix:zpool_checkpoint"
+
+/*
+ * Allocate an object from this objset. The range of object numbers
+ * available is (0, DN_MAX_OBJECT). Object 0 is the meta-dnode.
+ *
+ * The transaction must be assigned to a txg. The newly allocated
+ * object will be "held" in the transaction (ie. you can modify the
+ * newly allocated object in this transaction).
+ *
+ * dmu_object_alloc() chooses an object and returns it in *objectp.
+ *
+ * dmu_object_claim() allocates a specific object number. If that
+ * number is already allocated, it fails and returns EEXIST.
+ *
+ * Return 0 on success, or ENOSPC or EEXIST as specified above.
+ */
+uint64_t dmu_object_alloc(objset_t *os, dmu_object_type_t ot,
+ int blocksize, dmu_object_type_t bonus_type, int bonus_len, dmu_tx_t *tx);
+uint64_t dmu_object_alloc_ibs(objset_t *os, dmu_object_type_t ot, int blocksize,
+ int indirect_blockshift,
+ dmu_object_type_t bonustype, int bonuslen, dmu_tx_t *tx);
+uint64_t dmu_object_alloc_dnsize(objset_t *os, dmu_object_type_t ot,
+ int blocksize, dmu_object_type_t bonus_type, int bonus_len,
+ int dnodesize, dmu_tx_t *tx);
+int dmu_object_claim_dnsize(objset_t *os, uint64_t object, dmu_object_type_t ot,
+ int blocksize, dmu_object_type_t bonus_type, int bonus_len,
+ int dnodesize, dmu_tx_t *tx);
+int dmu_object_reclaim_dnsize(objset_t *os, uint64_t object,
+ dmu_object_type_t ot, int blocksize, dmu_object_type_t bonustype,
+ int bonuslen, int dnodesize, dmu_tx_t *txp);
+int dmu_object_claim(objset_t *os, uint64_t object, dmu_object_type_t ot,
+ int blocksize, dmu_object_type_t bonus_type, int bonus_len, dmu_tx_t *tx);
+int dmu_object_reclaim(objset_t *os, uint64_t object, dmu_object_type_t ot,
+ int blocksize, dmu_object_type_t bonustype, int bonuslen, dmu_tx_t *txp);
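+
+/*
+ * Allocation sketch (hypothetical consumer) using the simplest variant;
+ * the tx must already be assigned to a txg:
+ */
+#if 0
+static uint64_t
+alloc_plain_object(objset_t *os, dmu_tx_t *tx)
+{
+	/* default blocksize, no bonus buffer */
+	return (dmu_object_alloc(os, DMU_OT_UINT64_OTHER, 0,
+	    DMU_OT_NONE, 0, tx));
+}
+#endif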
+
+/*
+ * Free an object from this objset.
+ *
+ * The object's data will be freed as well (ie. you don't need to call
+ * dmu_free(object, 0, -1, tx)).
+ *
+ * The object need not be held in the transaction.
+ *
+ * If there are any holds on this object's buffers (via dmu_buf_hold()),
+ * or tx holds on the object (via dmu_tx_hold_object()), you can not
+ * free it; it fails and returns EBUSY.
+ *
+ * If the object is not allocated, it fails and returns ENOENT.
+ *
+ * Return 0 on success, or EBUSY or ENOENT as specified above.
+ */
+int dmu_object_free(objset_t *os, uint64_t object, dmu_tx_t *tx);
+
+/*
+ * Find the next allocated or free object.
+ *
+ * The objectp parameter is in-out. It will be updated to be the next
+ * allocated (or free, if "hole" is set) object. Objects which have not
+ * been modified since txg are ignored.
+ *
+ * XXX Can only be called on an objset with no dirty data.
+ *
+ * Returns 0 on success, or ENOENT if there are no more objects.
+ */
+int dmu_object_next(objset_t *os, uint64_t *objectp,
+ boolean_t hole, uint64_t txg);
+
+/*
+ * Set the data blocksize for an object.
+ *
+ * The object cannot have any blocks allocated beyond the first. If
+ * the first block is allocated already, the new size must be greater
+ * than the current block size. If these conditions are not met,
+ * ENOTSUP will be returned.
+ *
+ * Returns 0 on success, or EBUSY if there are any holds on the object
+ * contents, or ENOTSUP as described above.
+ */
+int dmu_object_set_blocksize(objset_t *os, uint64_t object, uint64_t size,
+ int ibs, dmu_tx_t *tx);
+
+/*
+ * Set the checksum property on a dnode. The new checksum algorithm will
+ * apply to all newly written blocks; existing blocks will not be affected.
+ */
+void dmu_object_set_checksum(objset_t *os, uint64_t object, uint8_t checksum,
+ dmu_tx_t *tx);
+
+/*
+ * Set the compress property on a dnode. The new compression algorithm will
+ * apply to all newly written blocks; existing blocks will not be affected.
+ */
+void dmu_object_set_compress(objset_t *os, uint64_t object, uint8_t compress,
+ dmu_tx_t *tx);
+
+int dmu_object_remap_indirects(objset_t *os, uint64_t object, uint64_t txg);
+
+void
+dmu_write_embedded(objset_t *os, uint64_t object, uint64_t offset,
+ void *data, uint8_t etype, uint8_t comp, int uncompressed_size,
+ int compressed_size, int byteorder, dmu_tx_t *tx);
+
+/*
+ * Decide how to write a block: checksum, compression, number of copies, etc.
+ */
+#define WP_NOFILL 0x1
+#define WP_DMU_SYNC 0x2
+#define WP_SPILL 0x4
+
+void dmu_write_policy(objset_t *os, dnode_t *dn, int level, int wp,
+ struct zio_prop *zp);
+/*
+ * The bonus data is accessed more or less like a regular buffer.
+ * You must dmu_bonus_hold() to get the buffer, which will give you a
+ * dmu_buf_t with db_offset==-1ULL, and db_size = the size of the bonus
+ * data. As with any normal buffer, you must call dmu_buf_will_dirty()
+ * before modifying it, and the
+ * object must be held in an assigned transaction before calling
+ * dmu_buf_will_dirty. You may use dmu_buf_set_user() on the bonus
+ * buffer as well. You must release your hold with dmu_buf_rele().
+ *
+ * Returns ENOENT, EIO, or 0.
+ */
+int dmu_bonus_hold(objset_t *os, uint64_t object, void *tag, dmu_buf_t **);
+int dmu_bonus_max(void);
+int dmu_set_bonus(dmu_buf_t *, int, dmu_tx_t *);
+int dmu_set_bonustype(dmu_buf_t *, dmu_object_type_t, dmu_tx_t *);
+dmu_object_type_t dmu_get_bonustype(dmu_buf_t *);
+int dmu_rm_spill(objset_t *, uint64_t, dmu_tx_t *);
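+
+/*
+ * Bonus-buffer access sketch (hypothetical consumer; dmu_buf_will_dirty()
+ * is declared further down in this file):
+ */
+#if 0
+static int
+touch_bonus(objset_t *os, uint64_t object, dmu_tx_t *tx)
+{
+	dmu_buf_t *db;
+	int err = dmu_bonus_hold(os, object, FTAG, &db);
+
+	if (err != 0)
+		return (err);
+	dmu_buf_will_dirty(db, tx);
+	/* modify up to db->db_size bytes at db->db_data */
+	dmu_buf_rele(db, FTAG);
+	return (0);
+}
+#endif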
+
+/*
+ * Special spill buffer support used by "SA" framework
+ */
+
+int dmu_spill_hold_by_bonus(dmu_buf_t *bonus, void *tag, dmu_buf_t **dbp);
+int dmu_spill_hold_by_dnode(dnode_t *dn, uint32_t flags,
+ void *tag, dmu_buf_t **dbp);
+int dmu_spill_hold_existing(dmu_buf_t *bonus, void *tag, dmu_buf_t **dbp);
+
+/*
+ * Obtain the DMU buffer from the specified object which contains the
+ * specified offset. dmu_buf_hold() puts a "hold" on the buffer, so
+ * that it will remain in memory. You must release the hold with
+ * dmu_buf_rele(). You mustn't access the dmu_buf_t after releasing your
+ * hold. You must have a hold on any dmu_buf_t* you pass to the DMU.
+ *
+ * You must call dmu_buf_read, dmu_buf_will_dirty, or dmu_buf_will_fill
+ * on the returned buffer before reading or writing the buffer's
+ * db_data. The comments for those routines describe what particular
+ * operations are valid after calling them.
+ *
+ * The object number must be a valid, allocated object number.
+ */
+int dmu_buf_hold(objset_t *os, uint64_t object, uint64_t offset,
+ void *tag, dmu_buf_t **, int flags);
+int dmu_buf_hold_by_dnode(dnode_t *dn, uint64_t offset,
+ void *tag, dmu_buf_t **dbp, int flags);
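+
+/*
+ * Hold/release sketch (hypothetical consumer; 0 requests the default
+ * read behavior via the 'flags' argument):
+ */
+#if 0
+static int
+peek_block(objset_t *os, uint64_t object, uint64_t offset)
+{
+	dmu_buf_t *db;
+	int err = dmu_buf_hold(os, object, offset, FTAG, &db, 0);
+
+	if (err != 0)
+		return (err);
+	/* db->db_data is readable per the rules described above */
+	dmu_buf_rele(db, FTAG);
+	return (0);
+}
+#endif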
+
+/*
+ * Add a reference to a dmu buffer that has already been held via
+ * dmu_buf_hold() in the current context.
+ */
+void dmu_buf_add_ref(dmu_buf_t *db, void *tag);
+
+/*
+ * Attempt to add a reference to a dmu buffer that is in an unknown state,
+ * using a pointer that may have been invalidated by eviction processing.
+ * The request will succeed if the passed in dbuf still represents the
+ * same os/object/blkid, is ineligible for eviction, and has at least
+ * one hold by a user other than the syncer.
+ */
+boolean_t dmu_buf_try_add_ref(dmu_buf_t *, objset_t *os, uint64_t object,
+ uint64_t blkid, void *tag);
+
+void dmu_buf_rele(dmu_buf_t *db, void *tag);
+uint64_t dmu_buf_refcount(dmu_buf_t *db);
+
+/*
+ * dmu_buf_hold_array holds the DMU buffers which contain all bytes in a
+ * range of an object. A pointer to an array of dmu_buf_t*'s is
+ * returned (in *dbpp).
+ *
+ * dmu_buf_rele_array releases the hold on an array of dmu_buf_t*'s, and
+ * frees the array. The hold on the array of buffers MUST be released
+ * with dmu_buf_rele_array. You can NOT release the hold on each buffer
+ * individually with dmu_buf_rele.
+ */
+int dmu_buf_hold_array_by_bonus(dmu_buf_t *db, uint64_t offset,
+ uint64_t length, boolean_t read, void *tag,
+ int *numbufsp, dmu_buf_t ***dbpp);
+int dmu_buf_hold_array_by_dnode(dnode_t *dn, uint64_t offset, uint64_t length,
+ boolean_t read, void *tag, int *numbufsp, dmu_buf_t ***dbpp,
+ uint32_t flags);
+void dmu_buf_rele_array(dmu_buf_t **, int numbufs, void *tag);
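+
+/*
+ * Illustrative sketch (not part of the original header): holding all
+ * buffers covering a range through a held bonus dbuf and releasing
+ * them as a group, per the rule above.
+ *
+ *    int numbufs;
+ *    dmu_buf_t **dbp;
+ *    int err = dmu_buf_hold_array_by_bonus(bonus, offset, length,
+ *        B_TRUE, FTAG, &numbufs, &dbp);
+ *    if (err == 0) {
+ *        ... use dbp[0] through dbp[numbufs - 1] ...
+ *        dmu_buf_rele_array(dbp, numbufs, FTAG);
+ *    }
+ */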
+
+typedef void dmu_buf_evict_func_t(void *user_ptr);
+
+/*
+ * A DMU buffer user object may be associated with a dbuf for the
+ * duration of its lifetime. This allows the user of a dbuf (client)
+ * to attach private data to a dbuf (e.g. in-core only data such as a
+ * dnode_children_t, zap_t, or zap_leaf_t) and be optionally notified
+ * when that dbuf has been evicted. Clients typically respond to the
+ * eviction notification by freeing their private data, thus ensuring
+ * the same lifetime for both dbuf and private data.
+ *
+ * The mapping from a dmu_buf_user_t to any client private data is the
+ * client's responsibility. All current consumers of the API with private
+ * data embed a dmu_buf_user_t as the first member of the structure for
+ * their private data. This allows conversions between the two types
+ * with a simple cast. Since the DMU buf user API never needs access
+ * to the private data, other strategies can be employed if necessary
+ * or convenient for the client (e.g. using container_of() to do the
+ * conversion for private data that cannot have the dmu_buf_user_t as
+ * its first member).
+ *
+ * Eviction callbacks are executed without the dbuf mutex held or any
+ * other type of mechanism to guarantee that the dbuf is still available.
+ * For this reason, users must assume the dbuf has already been freed
+ * and not reference the dbuf from the callback context.
+ *
+ * Users requesting "immediate eviction" are notified as soon as the dbuf
+ * is only referenced by dirty records (dirties == holds). Otherwise the
+ * notification occurs after eviction processing for the dbuf begins.
+ */
+typedef struct dmu_buf_user {
+ /*
+ * Asynchronous user eviction callback state.
+ */
+ taskq_ent_t dbu_tqent;
+
+ /*
+ * This instance's eviction function pointers.
+ *
+ * dbu_evict_func_sync is called synchronously and then
+ * dbu_evict_func_async is executed asynchronously on a taskq.
+ */
+ dmu_buf_evict_func_t *dbu_evict_func_sync;
+ dmu_buf_evict_func_t *dbu_evict_func_async;
+#ifdef ZFS_DEBUG
+ /*
+ * Pointer to user's dbuf pointer. NULL for clients that do
+ * not associate a dbuf with their user data.
+ *
+ * The dbuf pointer is cleared upon eviction so as to catch
+ * use-after-evict bugs in clients.
+ */
+ dmu_buf_t **dbu_clear_on_evict_dbufp;
+#endif
+} dmu_buf_user_t;
+
+/*
+ * Initialize the given dmu_buf_user_t instance with the eviction function
+ * evict_func, to be called when the user is evicted.
+ *
+ * NOTE: This function should only be called once on a given dmu_buf_user_t.
+ * To allow enforcement of this, dbu must already be zeroed on entry.
+ */
+/*ARGSUSED*/
+static inline void
+dmu_buf_init_user(dmu_buf_user_t *dbu, dmu_buf_evict_func_t *evict_func_sync,
+ dmu_buf_evict_func_t *evict_func_async, dmu_buf_t **clear_on_evict_dbufp)
+{
+ ASSERT(dbu->dbu_evict_func_sync == NULL);
+ ASSERT(dbu->dbu_evict_func_async == NULL);
+
+ /* must have at least one evict func */
+ IMPLY(evict_func_sync == NULL, evict_func_async != NULL);
+ dbu->dbu_evict_func_sync = evict_func_sync;
+ dbu->dbu_evict_func_async = evict_func_async;
+#ifdef ZFS_DEBUG
+ dbu->dbu_clear_on_evict_dbufp = clear_on_evict_dbufp;
+#endif
+}
+
+/*
+ * Attach user data to a dbuf and mark it for normal (when the dbuf's
+ * data is cleared or its reference count goes to zero) eviction processing.
+ *
+ * Returns NULL on success, or the existing user if another user currently
+ * owns the buffer.
+ */
+void *dmu_buf_set_user(dmu_buf_t *db, dmu_buf_user_t *user);
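+
+/*
+ * Illustrative sketch (not part of the original header): a client
+ * embedding dmu_buf_user_t as the first member of its private data,
+ * as described above.  "my_data_t" and "my_evict" are hypothetical
+ * names.
+ *
+ *    typedef struct my_data {
+ *        dmu_buf_user_t md_dbu;        (must be first for the cast)
+ *        ... client fields ...
+ *    } my_data_t;
+ *
+ *    static void
+ *    my_evict(void *arg)
+ *    {
+ *        my_data_t *md = arg;          (dbuf may already be freed)
+ *        kmem_free(md, sizeof (*md));
+ *    }
+ *
+ *    my_data_t *md = kmem_zalloc(sizeof (*md), KM_SLEEP);
+ *    dmu_buf_init_user(&md->md_dbu, my_evict, NULL, NULL);
+ *    if (dmu_buf_set_user(db, &md->md_dbu) != NULL)
+ *        kmem_free(md, sizeof (*md));  (another user won the race)
+ */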
+
+/*
+ * Attach user data to a dbuf and mark it for immediate (its dirty and
+ * reference counts are equal) eviction processing.
+ *
+ * Returns NULL on success, or the existing user if another user currently
+ * owns the buffer.
+ */
+void *dmu_buf_set_user_ie(dmu_buf_t *db, dmu_buf_user_t *user);
+
+/*
+ * Replace the current user of a dbuf.
+ *
+ * If given the current user of a dbuf, replaces the dbuf's user with
+ * "new_user" and returns the user data pointer that was replaced.
+ * Otherwise returns the current, and unmodified, dbuf user pointer.
+ */
+void *dmu_buf_replace_user(dmu_buf_t *db,
+ dmu_buf_user_t *old_user, dmu_buf_user_t *new_user);
+
+/*
+ * Remove the specified user data for a DMU buffer.
+ *
+ * Returns the user that was removed on success, or the current user if
+ * another user currently owns the buffer.
+ */
+void *dmu_buf_remove_user(dmu_buf_t *db, dmu_buf_user_t *user);
+
+/*
+ * Returns the user data (dmu_buf_user_t *) associated with this dbuf.
+ */
+void *dmu_buf_get_user(dmu_buf_t *db);
+
+objset_t *dmu_buf_get_objset(dmu_buf_t *db);
+dnode_t *dmu_buf_dnode_enter(dmu_buf_t *db);
+void dmu_buf_dnode_exit(dmu_buf_t *db);
+
+/* Block until any in-progress dmu buf user evictions complete. */
+void dmu_buf_user_evict_wait(void);
+
+/*
+ * Returns the blkptr associated with this dbuf, or NULL if not set.
+ */
+struct blkptr *dmu_buf_get_blkptr(dmu_buf_t *db);
+
+/*
+ * Indicate that you are going to modify the buffer's data (db_data).
+ *
+ * The transaction (tx) must be assigned to a txg (i.e. you've called
+ * dmu_tx_assign()). The buffer's object must be held in the tx
+ * (i.e. you've called dmu_tx_hold_object(tx, db->db_object)).
+ */
+void dmu_buf_will_dirty(dmu_buf_t *db, dmu_tx_t *tx);
+
+/*
+ * You must create a transaction, then hold the objects which you will
+ * (or might) modify as part of this transaction. Then you must assign
+ * the transaction to a transaction group. Once the transaction has
+ * been assigned, you can modify buffers which belong to held objects as
+ * part of this transaction. You can't modify buffers before the
+ * transaction has been assigned; you can't modify buffers which don't
+ * belong to objects which this transaction holds; you can't hold
+ * objects once the transaction has been assigned. You may hold an
+ * object which you are going to free (with dmu_object_free()), but you
+ * don't have to.
+ *
+ * You can abort the transaction before it has been assigned.
+ *
+ * Note that you may hold buffers (with dmu_buf_hold) at any time,
+ * regardless of transaction state.
+ */
+
+#define DMU_NEW_OBJECT (-1ULL)
+#define DMU_OBJECT_END (-1ULL)
+
+dmu_tx_t *dmu_tx_create(objset_t *os);
+void dmu_tx_hold_write(dmu_tx_t *tx, uint64_t object, uint64_t off, int len);
+void dmu_tx_hold_write_by_dnode(dmu_tx_t *tx, dnode_t *dn, uint64_t off,
+ int len);
+void dmu_tx_hold_free(dmu_tx_t *tx, uint64_t object, uint64_t off,
+ uint64_t len);
+void dmu_tx_hold_free_by_dnode(dmu_tx_t *tx, dnode_t *dn, uint64_t off,
+ uint64_t len);
+void dmu_tx_hold_remap_l1indirect(dmu_tx_t *tx, uint64_t object);
+void dmu_tx_hold_zap(dmu_tx_t *tx, uint64_t object, int add, const char *name);
+void dmu_tx_hold_zap_by_dnode(dmu_tx_t *tx, dnode_t *dn, int add,
+ const char *name);
+void dmu_tx_hold_bonus(dmu_tx_t *tx, uint64_t object);
+void dmu_tx_hold_bonus_by_dnode(dmu_tx_t *tx, dnode_t *dn);
+void dmu_tx_hold_spill(dmu_tx_t *tx, uint64_t object);
+void dmu_tx_hold_sa(dmu_tx_t *tx, struct sa_handle *hdl, boolean_t may_grow);
+void dmu_tx_hold_sa_create(dmu_tx_t *tx, int total_size);
+void dmu_tx_abort(dmu_tx_t *tx);
+int dmu_tx_assign(dmu_tx_t *tx, uint64_t txg_how);
+void dmu_tx_wait(dmu_tx_t *tx);
+void dmu_tx_commit(dmu_tx_t *tx);
+void dmu_tx_mark_netfree(dmu_tx_t *tx);
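+
+/*
+ * Illustrative sketch (not part of the original header): the write
+ * transaction life cycle described above.  Error handling is minimal;
+ * TXG_WAIT comes from sys/txg.h.
+ *
+ *    dmu_tx_t *tx = dmu_tx_create(os);
+ *    dmu_tx_hold_write(tx, object, offset, size);
+ *    int err = dmu_tx_assign(tx, TXG_WAIT);
+ *    if (err != 0) {
+ *        dmu_tx_abort(tx);
+ *        return (err);
+ *    }
+ *    dmu_write(os, object, offset, size, buf, tx);
+ *    dmu_tx_commit(tx);
+ */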
+
+/*
+ * To register a commit callback, dmu_tx_callback_register() must be called.
+ *
+ * dcb_data is a pointer to caller private data that is passed on as a
+ * callback parameter. The caller is responsible for properly allocating and
+ * freeing it.
+ *
+ * When registering a callback, the transaction must be already created, but
+ * it cannot be committed or aborted. It can be assigned to a txg or not.
+ *
+ * The callback will be called after the transaction has been safely written
+ * to stable storage and will also be called if the dmu_tx is aborted.
+ * If there is any error which prevents the transaction from being committed to
+ * disk, the callback will be called with a value of error != 0.
+ */
+typedef void dmu_tx_callback_func_t(void *dcb_data, int error);
+
+void dmu_tx_callback_register(dmu_tx_t *tx, dmu_tx_callback_func_t *dcb_func,
+ void *dcb_data);
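+
+/*
+ * Illustrative sketch (not part of the original header): registering a
+ * commit callback.  "my_state_t" and "my_commit_cb" are hypothetical
+ * names.
+ *
+ *    static void
+ *    my_commit_cb(void *arg, int error)
+ *    {
+ *        my_state_t *st = arg;
+ *        ... error != 0 means the tx was aborted or failed ...
+ *        kmem_free(st, sizeof (*st));
+ *    }
+ *
+ *    dmu_tx_callback_register(tx, my_commit_cb, st);
+ */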
+
+/*
+ * Free up the data blocks for a defined range of a file. If size is
+ * -1, the range from offset to end-of-file is freed.
+ */
+int dmu_free_range(objset_t *os, uint64_t object, uint64_t offset,
+ uint64_t size, dmu_tx_t *tx);
+int dmu_free_long_range(objset_t *os, uint64_t object, uint64_t offset,
+ uint64_t size);
+int dmu_free_long_object(objset_t *os, uint64_t object);
+
+/*
+ * Convenience functions.
+ *
+ * Canfail routines will return 0 on success, or an errno if there is a
+ * nonrecoverable I/O error.
+ */
+#define DMU_READ_PREFETCH 0 /* prefetch */
+#define DMU_READ_NO_PREFETCH 1 /* don't prefetch */
+int dmu_read(objset_t *os, uint64_t object, uint64_t offset, uint64_t size,
+ void *buf, uint32_t flags);
+int dmu_read_by_dnode(dnode_t *dn, uint64_t offset, uint64_t size, void *buf,
+ uint32_t flags);
+void dmu_write(objset_t *os, uint64_t object, uint64_t offset, uint64_t size,
+ const void *buf, dmu_tx_t *tx);
+void dmu_write_by_dnode(dnode_t *dn, uint64_t offset, uint64_t size,
+ const void *buf, dmu_tx_t *tx);
+void dmu_prealloc(objset_t *os, uint64_t object, uint64_t offset, uint64_t size,
+ dmu_tx_t *tx);
+int dmu_read_uio(objset_t *os, uint64_t object, struct uio *uio, uint64_t size);
+int dmu_read_uio_dbuf(dmu_buf_t *zdb, struct uio *uio, uint64_t size);
+int dmu_read_uio_dnode(dnode_t *dn, struct uio *uio, uint64_t size);
+int dmu_write_uio(objset_t *os, uint64_t object, struct uio *uio, uint64_t size,
+ dmu_tx_t *tx);
+int dmu_write_uio_dbuf(dmu_buf_t *zdb, struct uio *uio, uint64_t size,
+ dmu_tx_t *tx);
+int dmu_write_uio_dnode(dnode_t *dn, struct uio *uio, uint64_t size,
+ dmu_tx_t *tx);
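+
+/*
+ * Illustrative sketch (not part of the original header): a simple
+ * synchronous read into a caller-supplied buffer.
+ *
+ *    int err = dmu_read(os, object, offset, size, buf,
+ *        DMU_READ_PREFETCH);
+ *    if (err != 0)
+ *        ... nonrecoverable I/O error ...
+ */
+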
+#ifdef _KERNEL
+#ifdef illumos
+int dmu_write_pages(objset_t *os, uint64_t object, uint64_t offset,
+ uint64_t size, struct page *pp, dmu_tx_t *tx);
+#else
+int dmu_write_pages(objset_t *os, uint64_t object, uint64_t offset,
+ uint64_t size, struct vm_page **ppa, dmu_tx_t *tx);
+int dmu_read_pages(objset_t *os, uint64_t object, vm_page_t *ma, int count,
+ int *rbehind, int *rahead, int last_size);
+#endif
+#endif
+struct arc_buf *dmu_request_arcbuf(dmu_buf_t *handle, int size);
+void dmu_return_arcbuf(struct arc_buf *buf);
+void dmu_assign_arcbuf_dnode(dnode_t *handle, uint64_t offset,
+ struct arc_buf *buf, dmu_tx_t *tx);
+void dmu_assign_arcbuf(dmu_buf_t *handle, uint64_t offset, struct arc_buf *buf,
+ dmu_tx_t *tx);
+int dmu_xuio_init(struct xuio *uio, int niov);
+void dmu_xuio_fini(struct xuio *uio);
+int dmu_xuio_add(struct xuio *uio, struct arc_buf *abuf, offset_t off,
+ size_t n);
+int dmu_xuio_cnt(struct xuio *uio);
+struct arc_buf *dmu_xuio_arcbuf(struct xuio *uio, int i);
+void dmu_xuio_clear(struct xuio *uio, int i);
+void xuio_stat_wbuf_copied(void);
+void xuio_stat_wbuf_nocopy(void);
+
+extern boolean_t zfs_prefetch_disable;
+extern int zfs_max_recordsize;
+
+/*
+ * Asynchronously try to read in the data.
+ */
+void dmu_prefetch(objset_t *os, uint64_t object, int64_t level, uint64_t offset,
+ uint64_t len, enum zio_priority pri);
+
+typedef struct dmu_object_info {
+ /* All sizes are in bytes unless otherwise indicated. */
+ uint32_t doi_data_block_size;
+ uint32_t doi_metadata_block_size;
+ dmu_object_type_t doi_type;
+ dmu_object_type_t doi_bonus_type;
+ uint64_t doi_bonus_size;
+ uint8_t doi_indirection; /* 2 = dnode->indirect->data */
+ uint8_t doi_checksum;
+ uint8_t doi_compress;
+ uint8_t doi_nblkptr;
+ int8_t doi_pad[4];
+ uint64_t doi_dnodesize;
+ uint64_t doi_physical_blocks_512; /* data + metadata, 512b blks */
+ uint64_t doi_max_offset;
+ uint64_t doi_fill_count; /* number of non-empty blocks */
+} dmu_object_info_t;
+
+typedef void arc_byteswap_func_t(void *buf, size_t size);
+
+typedef struct dmu_object_type_info {
+ dmu_object_byteswap_t ot_byteswap;
+ boolean_t ot_metadata;
+ boolean_t ot_dbuf_metadata_cache;
+ char *ot_name;
+} dmu_object_type_info_t;
+
+typedef struct dmu_object_byteswap_info {
+ arc_byteswap_func_t *ob_func;
+ char *ob_name;
+} dmu_object_byteswap_info_t;
+
+extern const dmu_object_type_info_t dmu_ot[DMU_OT_NUMTYPES];
+extern const dmu_object_byteswap_info_t dmu_ot_byteswap[DMU_BSWAP_NUMFUNCS];
+
+/*
+ * Get information on a DMU object.
+ *
+ * Return 0 on success or ENOENT if object is not allocated.
+ *
+ * If doi is NULL, the call just indicates whether the object exists.
+ */
+int dmu_object_info(objset_t *os, uint64_t object, dmu_object_info_t *doi);
+void __dmu_object_info_from_dnode(struct dnode *dn, dmu_object_info_t *doi);
+/* Like dmu_object_info, but faster if you have a held dnode in hand. */
+void dmu_object_info_from_dnode(dnode_t *dn, dmu_object_info_t *doi);
+/* Like dmu_object_info, but faster if you have a held dbuf in hand. */
+void dmu_object_info_from_db(dmu_buf_t *db, dmu_object_info_t *doi);
+/*
+ * Like dmu_object_info_from_db, but faster still when you only care about
+ * the size. This is specifically optimized for zfs_getattr().
+ */
+void dmu_object_size_from_db(dmu_buf_t *db, uint32_t *blksize,
+ u_longlong_t *nblk512);
+
+void dmu_object_dnsize_from_db(dmu_buf_t *db, int *dnsize);
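+
+/*
+ * Illustrative sketch (not part of the original header): probing for
+ * an object and reading its data block size.
+ *
+ *    dmu_object_info_t doi;
+ *    if (dmu_object_info(os, object, &doi) == 0)
+ *        blksz = doi.doi_data_block_size;
+ *    else
+ *        ... ENOENT: object is not allocated ...
+ */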
+
+typedef struct dmu_objset_stats {
+ uint64_t dds_num_clones; /* number of clones of this */
+ uint64_t dds_creation_txg;
+ uint64_t dds_guid;
+ dmu_objset_type_t dds_type;
+ uint8_t dds_is_snapshot;
+ uint8_t dds_inconsistent;
+ char dds_origin[ZFS_MAX_DATASET_NAME_LEN];
+} dmu_objset_stats_t;
+
+/*
+ * Get stats on a dataset.
+ */
+void dmu_objset_fast_stat(objset_t *os, dmu_objset_stats_t *stat);
+
+/*
+ * Add entries to the nvlist for all the objset's properties. See
+ * zfs_prop_table[] and zfs(1m) for details on the properties.
+ */
+void dmu_objset_stats(objset_t *os, struct nvlist *nv);
+
+/*
+ * Get the space usage statistics for statvfs().
+ *
+ * refdbytes is the amount of space "referenced" by this objset.
+ * availbytes is the amount of space available to this objset, taking
+ * into account quotas & reservations, assuming that no other objsets
+ * use the space first. These values correspond to the 'referenced' and
+ * 'available' properties, described in the zfs(1m) manpage.
+ *
+ * usedobjs and availobjs are the number of objects currently allocated,
+ * and available.
+ */
+void dmu_objset_space(objset_t *os, uint64_t *refdbytesp, uint64_t *availbytesp,
+ uint64_t *usedobjsp, uint64_t *availobjsp);
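+
+/*
+ * Illustrative sketch (not part of the original header): feeding the
+ * numbers above into statvfs-style fields; "statp" and "bshift" are
+ * hypothetical.
+ *
+ *    uint64_t refd, avail, usedobjs, availobjs;
+ *    dmu_objset_space(os, &refd, &avail, &usedobjs, &availobjs);
+ *    statp->f_blocks = (refd + avail) >> bshift;
+ *    statp->f_bavail = statp->f_bfree = avail >> bshift;
+ */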
+
+/*
+ * The fsid_guid is a 56-bit ID that can change to avoid collisions.
+ * (Contrast with the ds_guid which is a 64-bit ID that will never
+ * change, so there is a small probability that it will collide.)
+ */
+uint64_t dmu_objset_fsid_guid(objset_t *os);
+
+/*
+ * Get the [cm]time for an objset's snapshot dir
+ */
+timestruc_t dmu_objset_snap_cmtime(objset_t *os);
+
+int dmu_objset_is_snapshot(objset_t *os);
+
+extern struct spa *dmu_objset_spa(objset_t *os);
+extern struct zilog *dmu_objset_zil(objset_t *os);
+extern struct dsl_pool *dmu_objset_pool(objset_t *os);
+extern struct dsl_dataset *dmu_objset_ds(objset_t *os);
+extern void dmu_objset_name(objset_t *os, char *buf);
+extern dmu_objset_type_t dmu_objset_type(objset_t *os);
+extern uint64_t dmu_objset_id(objset_t *os);
+extern uint64_t dmu_objset_dnodesize(objset_t *os);
+extern zfs_sync_type_t dmu_objset_syncprop(objset_t *os);
+extern zfs_logbias_op_t dmu_objset_logbias(objset_t *os);
+extern int dmu_snapshot_list_next(objset_t *os, int namelen, char *name,
+ uint64_t *id, uint64_t *offp, boolean_t *case_conflict);
+extern int dmu_snapshot_realname(objset_t *os, char *name, char *real,
+ int maxlen, boolean_t *conflict);
+extern int dmu_dir_list_next(objset_t *os, int namelen, char *name,
+ uint64_t *idp, uint64_t *offp);
+
+typedef int objset_used_cb_t(dmu_object_type_t bonustype,
+ void *bonus, uint64_t *userp, uint64_t *groupp);
+extern void dmu_objset_register_type(dmu_objset_type_t ost,
+ objset_used_cb_t *cb);
+extern void dmu_objset_set_user(objset_t *os, void *user_ptr);
+extern void *dmu_objset_get_user(objset_t *os);
+
+/*
+ * Return the txg number for the given assigned transaction.
+ */
+uint64_t dmu_tx_get_txg(dmu_tx_t *tx);
+
+/*
+ * Synchronous write.
+ * If a parent zio is provided this function initiates a write on the
+ * provided buffer as a child of the parent zio.
+ * In the absence of a parent zio, the write is completed synchronously.
+ * At write completion, blk is filled with the bp of the written block.
+ * Note that while the data covered by this function will be on stable
+ * storage when the write completes, this new data does not become a
+ * permanent part of the file until the associated transaction commits.
+ */
+
+/*
+ * {zfs,zvol,ztest}_get_done() args
+ */
+typedef struct zgd {
+ struct lwb *zgd_lwb;
+ struct blkptr *zgd_bp;
+ dmu_buf_t *zgd_db;
+ struct locked_range *zgd_lr;
+ void *zgd_private;
+} zgd_t;
+
+typedef void dmu_sync_cb_t(zgd_t *arg, int error);
+int dmu_sync(struct zio *zio, uint64_t txg, dmu_sync_cb_t *done, zgd_t *zgd);
+
+/*
+ * Find the next hole or data block in the file, starting at *off.
+ * The found offset is returned in *off. Returns ESRCH at end of file.
+ */
+int dmu_offset_next(objset_t *os, uint64_t object, boolean_t hole,
+ uint64_t *off);
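+
+/*
+ * Illustrative sketch (not part of the original header): a
+ * SEEK_HOLE-style probe.
+ *
+ *    uint64_t off = start;
+ *    int err = dmu_offset_next(os, object, B_TRUE, &off);
+ *    if (err == 0)
+ *        ... off is the start of the next hole ...
+ *    else if (err == ESRCH)
+ *        ... no hole before end of file ...
+ */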
+
+/*
+ * Check if a DMU object has any dirty blocks. If so, sync out
+ * all pending transaction groups. Otherwise, this function
+ * does not alter DMU state. This could be improved to only sync
+ * out the necessary transaction groups for this particular
+ * object.
+ */
+int dmu_object_wait_synced(objset_t *os, uint64_t object);
+
+/*
+ * Initial setup and final teardown.
+ */
+extern void dmu_init(void);
+extern void dmu_fini(void);
+
+typedef void (*dmu_traverse_cb_t)(objset_t *os, void *arg, struct blkptr *bp,
+ uint64_t object, uint64_t offset, int len);
+void dmu_traverse_objset(objset_t *os, uint64_t txg_start,
+ dmu_traverse_cb_t cb, void *arg);
+int dmu_diff(const char *tosnap_name, const char *fromsnap_name,
+ struct file *fp, offset_t *offp);
+
+/* CRC64 table */
+#define ZFS_CRC64_POLY 0xC96C5795D7870F42ULL /* ECMA-182, reflected form */
+extern uint64_t zfs_crc64_table[256];
+
+extern int zfs_mdcomp_disable;
+
+#ifdef __cplusplus
+}
+#endif
+
+#endif /* _SYS_DMU_H */
diff --git a/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/dmu_impl.h b/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/dmu_impl.h
new file mode 100644
index 000000000000..5cf7aea4711f
--- /dev/null
+++ b/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/dmu_impl.h
@@ -0,0 +1,315 @@
+/*
+ * CDDL HEADER START
+ *
+ * The contents of this file are subject to the terms of the
+ * Common Development and Distribution License (the "License").
+ * You may not use this file except in compliance with the License.
+ *
+ * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
+ * or http://www.opensolaris.org/os/licensing.
+ * See the License for the specific language governing permissions
+ * and limitations under the License.
+ *
+ * When distributing Covered Code, include this CDDL HEADER in each
+ * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
+ * If applicable, add the following below this CDDL HEADER, with the
+ * fields enclosed by brackets "[]" replaced with your own identifying
+ * information: Portions Copyright [yyyy] [name of copyright owner]
+ *
+ * CDDL HEADER END
+ */
+/*
+ * Copyright 2010 Sun Microsystems, Inc. All rights reserved.
+ * Use is subject to license terms.
+ */
+/*
+ * Copyright (c) 2012, Joyent, Inc. All rights reserved.
+ * Copyright (c) 2012, Martin Matuska <mm@FreeBSD.org>. All rights reserved.
+ * Copyright (c) 2013, 2015 by Delphix. All rights reserved.
+ */
+
+#ifndef _SYS_DMU_IMPL_H
+#define _SYS_DMU_IMPL_H
+
+#include <sys/txg_impl.h>
+#include <sys/zio.h>
+#include <sys/dnode.h>
+#include <sys/kstat.h>
+#include <sys/zfs_context.h>
+#include <sys/zfs_ioctl.h>
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+/*
+ * This is the locking strategy for the DMU. Numbers in parentheses are
+ * cases that use that lock order, referenced below:
+ *
+ * ARC is self-contained
+ * bplist is self-contained
+ * refcount is self-contained
+ * txg is self-contained (hopefully!)
+ * zst_lock
+ * zf_rwlock
+ *
+ * XXX try to improve evicting path?
+ *
+ * dp_config_rwlock > os_obj_lock > dn_struct_rwlock >
+ * dn_dbufs_mtx > hash_mutexes > db_mtx > dd_lock > leafs
+ *
+ * dp_config_rwlock
+ * must be held before: everything
+ * protects dd namespace changes
+ * protects property changes globally
+ * held from:
+ * dsl_dir_open/r:
+ * dsl_dir_create_sync/w:
+ * dsl_dir_sync_destroy/w:
+ * dsl_dir_rename_sync/w:
+ * dsl_prop_changed_notify/r:
+ *
+ * os_obj_lock
+ * must be held before:
+ * everything except dp_config_rwlock
+ * protects os_obj_next
+ * held from:
+ * dmu_object_alloc: dn_dbufs_mtx, db_mtx, hash_mutexes, dn_struct_rwlock
+ *
+ * dn_struct_rwlock
+ * must be held before:
+ * everything except dp_config_rwlock and os_obj_lock
+ * protects structure of dnode (e.g. nlevels)
+ * db_blkptr can change when syncing out change to nlevels
+ * dn_maxblkid
+ * dn_nlevels
+ * dn_*blksz*
+ * phys nlevels, maxblkid, physical blkptr_t's (?)
+ * held from:
+ * callers of dbuf_read_impl, dbuf_hold[_impl], dbuf_prefetch
+ * dmu_object_info_from_dnode: dn_dirty_mtx (dn_datablksz)
+ * dbuf_read_impl: db_mtx, dmu_zfetch()
+ * dmu_zfetch: zf_rwlock/r, zst_lock, dbuf_prefetch()
+ * dbuf_new_size: db_mtx
+ * dbuf_dirty: db_mtx
+ * dbuf_findbp: (callers, phys? - the real need)
+ * dbuf_create: dn_dbufs_mtx, hash_mutexes, db_mtx (phys?)
+ * dbuf_prefetch: dn_dirty_mtx, hash_mutexes, db_mtx, dn_dbufs_mtx
+ * dbuf_hold_impl: hash_mutexes, db_mtx, dn_dbufs_mtx, dbuf_findbp()
+ * dnode_sync/w (increase_indirection): db_mtx (phys)
+ * dnode_set_blksz/w: dn_dbufs_mtx (dn_*blksz*)
+ * dnode_new_blkid/w: (dn_maxblkid)
+ * dnode_free_range/w: dn_dirty_mtx (dn_maxblkid)
+ * dnode_next_offset: (phys)
+ *
+ * dn_dbufs_mtx
+ * must be held before:
+ * db_mtx, hash_mutexes
+ * protects:
+ * dn_dbufs
+ * dn_evicted
+ * held from:
+ * dmu_evict_user: db_mtx (dn_dbufs)
+ * dbuf_free_range: db_mtx (dn_dbufs)
+ * dbuf_remove_ref: db_mtx, callees:
+ * dbuf_hash_remove: hash_mutexes, db_mtx
+ * dbuf_create: hash_mutexes, db_mtx (dn_dbufs)
+ * dnode_set_blksz: (dn_dbufs)
+ *
+ * hash_mutexes (global)
+ * must be held before:
+ * db_mtx
+ * protects dbuf_hash_table (global) and db_hash_next
+ * held from:
+ * dbuf_find: db_mtx
+ * dbuf_hash_insert: db_mtx
+ * dbuf_hash_remove: db_mtx
+ *
+ * db_mtx (meta-leaf)
+ * must be held before:
+ * dn_mtx, dn_dirty_mtx, dd_lock (leaf mutexes)
+ * protects:
+ * db_state
+ * db_holds
+ * db_buf
+ * db_changed
+ * db_data_pending
+ * db_dirtied
+ * db_link
+ * db_dirty_node (??)
+ * db_dirtycnt
+ * db_d.*
+ * db.*
+ * held from:
+ * dbuf_dirty: dn_mtx, dn_dirty_mtx
+ * dbuf_dirty->dsl_dir_willuse_space: dd_lock
+ * dbuf_dirty->dbuf_new_block->dsl_dataset_block_freeable: dd_lock
+ * dbuf_undirty: dn_dirty_mtx (db_d)
+ * dbuf_write_done: dn_dirty_mtx (db_state)
+ * dbuf_*
+ * dmu_buf_update_user: none (db_d)
+ * dmu_evict_user: none (db_d) (maybe can eliminate)
+ * dbuf_find: none (db_holds)
+ * dbuf_hash_insert: none (db_holds)
+ * dmu_buf_read_array_impl: none (db_state, db_changed)
+ * dmu_sync: none (db_dirty_node, db_d)
+ * dnode_reallocate: none (db)
+ *
+ * dn_mtx (leaf)
+ * protects:
+ * dn_dirty_dbufs
+ * dn_ranges
+ * phys accounting
+ * dn_allocated_txg
+ * dn_free_txg
+ * dn_assigned_txg
+ * dn_dirty_txg
+ * dn_notxholds
+ * dn_dirtyctx
+ * dn_dirtyctx_firstset
+ * (dn_phys copy fields?)
+ * (dn_phys contents?)
+ * held from:
+ * dnode_*
+ * dbuf_dirty: none
+ * dbuf_sync: none (phys accounting)
+ * dbuf_undirty: none (dn_ranges, dn_dirty_dbufs)
+ * dbuf_write_done: none (phys accounting)
+ * dmu_object_info_from_dnode: none (accounting)
+ * dmu_tx_commit: none
+ * dmu_tx_hold_object_impl: none
+ * dmu_tx_try_assign: dn_notxholds(cv)
+ * dmu_tx_unassign: none
+ *
+ * dd_lock
+ * must be held before:
+ * ds_lock
+ * ancestors' dd_lock
+ * protects:
+ * dd_prop_cbs
+ * dd_sync_*
+ * dd_used_bytes
+ * dd_tempreserved
+ * dd_space_towrite
+ * dd_myname
+ * dd_phys accounting?
+ * held from:
+ * dsl_dir_*
+ * dsl_prop_changed_notify: none (dd_prop_cbs)
+ * dsl_prop_register: none (dd_prop_cbs)
+ * dsl_prop_unregister: none (dd_prop_cbs)
+ *
+ * os_lock (leaf)
+ * protects:
+ * os_dirty_dnodes
+ * os_free_dnodes
+ * os_dnodes
+ * os_downgraded_dbufs
+ * dn_dirtyblksz
+ * dn_dirty_link
+ * held from:
+ * dnode_create: none (os_dnodes)
+ * dnode_destroy: none (os_dnodes)
+ * dnode_setdirty: none (dn_dirtyblksz, os_*_dnodes)
+ * dnode_free: none (dn_dirtyblksz, os_*_dnodes)
+ *
+ * ds_lock
+ * protects:
+ * ds_objset
+ * ds_open_refcount
+ * ds_snapname
+ * ds_phys accounting
+ * ds_phys userrefs zapobj
+ * ds_reserved
+ * held from:
+ * dsl_dataset_*
+ *
+ * dr_mtx (leaf)
+ * protects:
+ * dr_children
+ * held from:
+ * dbuf_dirty
+ * dbuf_undirty
+ * dbuf_sync_indirect
+ * dnode_new_blkid
+ */
+
+struct objset;
+struct dmu_pool;
+
+typedef struct dmu_xuio {
+ int next;
+ int cnt;
+ struct arc_buf **bufs;
+ iovec_t *iovp;
+} dmu_xuio_t;
+
+typedef struct xuio_stats {
+ /* loaned yet not returned arc_buf */
+ kstat_named_t xuiostat_onloan_rbuf;
+ kstat_named_t xuiostat_onloan_wbuf;
+ /* whether a copy is made when loaning out a read buffer */
+ kstat_named_t xuiostat_rbuf_copied;
+ kstat_named_t xuiostat_rbuf_nocopy;
+ /* whether a copy is made when assigning a write buffer */
+ kstat_named_t xuiostat_wbuf_copied;
+ kstat_named_t xuiostat_wbuf_nocopy;
+} xuio_stats_t;
+
+static xuio_stats_t xuio_stats = {
+ { "onloan_read_buf", KSTAT_DATA_UINT64 },
+ { "onloan_write_buf", KSTAT_DATA_UINT64 },
+ { "read_buf_copied", KSTAT_DATA_UINT64 },
+ { "read_buf_nocopy", KSTAT_DATA_UINT64 },
+ { "write_buf_copied", KSTAT_DATA_UINT64 },
+ { "write_buf_nocopy", KSTAT_DATA_UINT64 }
+};
+
+#define XUIOSTAT_INCR(stat, val) \
+ atomic_add_64(&xuio_stats.stat.value.ui64, (val))
+#define XUIOSTAT_BUMP(stat) XUIOSTAT_INCR(stat, 1)
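+
+/*
+ * Illustrative usage (not part of the original header):
+ *
+ *    XUIOSTAT_BUMP(xuiostat_wbuf_copied);
+ *    XUIOSTAT_INCR(xuiostat_onloan_rbuf, -1);
+ */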
+
+/*
+ * The list of data whose inclusion in a send stream can be pending from
+ * one call to backup_cb to another. Multiple calls to dump_free() and
+ * dump_freeobjects() can be aggregated into a single DRR_FREE or
+ * DRR_FREEOBJECTS replay record.
+ */
+typedef enum {
+ PENDING_NONE,
+ PENDING_FREE,
+ PENDING_FREEOBJECTS
+} dmu_pendop_t;
+
+typedef struct dmu_sendarg {
+ list_node_t dsa_link;
+ dmu_replay_record_t *dsa_drr;
+ kthread_t *dsa_td;
+ struct file *dsa_fp;
+ int dsa_outfd;
+ struct proc *dsa_proc;
+ offset_t *dsa_off;
+ objset_t *dsa_os;
+ zio_cksum_t dsa_zc;
+ uint64_t dsa_toguid;
+ int dsa_err;
+ dmu_pendop_t dsa_pending_op;
+ uint64_t dsa_featureflags;
+ uint64_t dsa_last_data_object;
+ uint64_t dsa_last_data_offset;
+ uint64_t dsa_resume_object;
+ uint64_t dsa_resume_offset;
+ boolean_t dsa_sent_begin;
+ boolean_t dsa_sent_end;
+} dmu_sendarg_t;
+
+void dmu_object_zapify(objset_t *, uint64_t, dmu_object_type_t, dmu_tx_t *);
+void dmu_object_free_zapified(objset_t *, uint64_t, dmu_tx_t *);
+int dmu_buf_hold_noread(objset_t *, uint64_t, uint64_t,
+ void *, dmu_buf_t **);
+
+#ifdef __cplusplus
+}
+#endif
+
+#endif /* _SYS_DMU_IMPL_H */
diff --git a/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/dmu_objset.h b/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/dmu_objset.h
new file mode 100644
index 000000000000..cae1c7719a83
--- /dev/null
+++ b/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/dmu_objset.h
@@ -0,0 +1,221 @@
+/*
+ * CDDL HEADER START
+ *
+ * The contents of this file are subject to the terms of the
+ * Common Development and Distribution License (the "License").
+ * You may not use this file except in compliance with the License.
+ *
+ * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
+ * or http://www.opensolaris.org/os/licensing.
+ * See the License for the specific language governing permissions
+ * and limitations under the License.
+ *
+ * When distributing Covered Code, include this CDDL HEADER in each
+ * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
+ * If applicable, add the following below this CDDL HEADER, with the
+ * fields enclosed by brackets "[]" replaced with your own identifying
+ * information: Portions Copyright [yyyy] [name of copyright owner]
+ *
+ * CDDL HEADER END
+ */
+/*
+ * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved.
+ * Copyright (c) 2012, 2018 by Delphix. All rights reserved.
+ * Copyright (c) 2013 by Saso Kiselkov. All rights reserved.
+ * Copyright (c) 2014 Spectra Logic Corporation, All rights reserved.
+ * Copyright (c) 2014 Integros [integros.com]
+ */
+
+/* Portions Copyright 2010 Robert Milkowski */
+
+#ifndef _SYS_DMU_OBJSET_H
+#define _SYS_DMU_OBJSET_H
+
+#include <sys/spa.h>
+#include <sys/arc.h>
+#include <sys/txg.h>
+#include <sys/zfs_context.h>
+#include <sys/dnode.h>
+#include <sys/zio.h>
+#include <sys/zil.h>
+#include <sys/sa.h>
+#include <sys/zfs_ioctl.h>
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+extern krwlock_t os_lock;
+
+struct dsl_pool;
+struct dsl_dataset;
+struct dmu_tx;
+
+#define OBJSET_PHYS_SIZE 2048
+#define OBJSET_OLD_PHYS_SIZE 1024
+
+#define OBJSET_BUF_HAS_USERUSED(buf) \
+ (arc_buf_size(buf) > OBJSET_OLD_PHYS_SIZE)
+
+#define OBJSET_FLAG_USERACCOUNTING_COMPLETE (1ULL<<0)
+
+typedef struct objset_phys {
+ dnode_phys_t os_meta_dnode;
+ zil_header_t os_zil_header;
+ uint64_t os_type;
+ uint64_t os_flags;
+ char os_pad[OBJSET_PHYS_SIZE - sizeof (dnode_phys_t)*3 -
+ sizeof (zil_header_t) - sizeof (uint64_t)*2];
+ dnode_phys_t os_userused_dnode;
+ dnode_phys_t os_groupused_dnode;
+} objset_phys_t;
+
+#define OBJSET_PROP_UNINITIALIZED ((uint64_t)-1)
+struct objset {
+ /* Immutable: */
+ struct dsl_dataset *os_dsl_dataset;
+ spa_t *os_spa;
+ arc_buf_t *os_phys_buf;
+ objset_phys_t *os_phys;
+ /*
+ * The following "special" dnodes have no parent, are exempt
+ * from dnode_move(), and are not recorded in os_dnodes, but they
+ * root their descendants in this objset using handles anyway, so
+ * that all access to dnodes from dbufs consistently uses handles.
+ */
+ dnode_handle_t os_meta_dnode;
+ dnode_handle_t os_userused_dnode;
+ dnode_handle_t os_groupused_dnode;
+ zilog_t *os_zil;
+
+ list_node_t os_evicting_node;
+
+ /* can change, under dsl_dir's locks: */
+ uint64_t os_dnodesize; /* default dnode size for new objects */
+ enum zio_checksum os_checksum;
+ enum zio_compress os_compress;
+ uint8_t os_copies;
+ enum zio_checksum os_dedup_checksum;
+ boolean_t os_dedup_verify;
+ zfs_logbias_op_t os_logbias;
+ zfs_cache_type_t os_primary_cache;
+ zfs_cache_type_t os_secondary_cache;
+ zfs_sync_type_t os_sync;
+ zfs_redundant_metadata_type_t os_redundant_metadata;
+ int os_recordsize;
+ /*
+ * The next four values are used as a cache of whatever's on disk, and
+ * are initialized the first time these properties are queried. Before
+ * being initialized with their real values, their values are
+ * OBJSET_PROP_UNINITIALIZED.
+ */
+ uint64_t os_version;
+ uint64_t os_normalization;
+ uint64_t os_utf8only;
+ uint64_t os_casesensitivity;
+ /*
+ * The largest zpl file block allowed in the special class.
+ * Cached here instead of in the zfsvfs for easier access.
+ */
+ int os_zpl_special_smallblock;
+
+ /*
+ * Pointer is constant; the blkptr it points to is protected by
+ * os_dsl_dataset->ds_bp_rwlock
+ */
+ blkptr_t *os_rootbp;
+
+ /* no lock needed: */
+ struct dmu_tx *os_synctx; /* XXX sketchy */
+ zil_header_t os_zil_header;
+ multilist_t *os_synced_dnodes;
+ uint64_t os_flags;
+ uint64_t os_freed_dnodes;
+ boolean_t os_rescan_dnodes;
+
+ /* Protected by os_obj_lock */
+ kmutex_t os_obj_lock;
+ uint64_t os_obj_next_chunk;
+
+ /* Per-CPU next object to allocate, protected by atomic ops. */
+ uint64_t *os_obj_next_percpu;
+ int os_obj_next_percpu_len;
+
+ /* Protected by os_lock */
+ kmutex_t os_lock;
+ multilist_t *os_dirty_dnodes[TXG_SIZE];
+ list_t os_dnodes;
+ list_t os_downgraded_dbufs;
+
+ /* Protects changes to DMU_{USER,GROUP}USED_OBJECT */
+ kmutex_t os_userused_lock;
+
+ /* stuff we store for the user */
+ kmutex_t os_user_ptr_lock;
+ void *os_user_ptr;
+ sa_os_t *os_sa;
+};
+
+#define DMU_META_OBJSET 0
+#define DMU_META_DNODE_OBJECT 0
+#define DMU_OBJECT_IS_SPECIAL(obj) ((int64_t)(obj) <= 0)
+#define DMU_META_DNODE(os) ((os)->os_meta_dnode.dnh_dnode)
+#define DMU_USERUSED_DNODE(os) ((os)->os_userused_dnode.dnh_dnode)
+#define DMU_GROUPUSED_DNODE(os) ((os)->os_groupused_dnode.dnh_dnode)
+
+#define DMU_OS_IS_L2CACHEABLE(os) \
+ ((os)->os_secondary_cache == ZFS_CACHE_ALL || \
+ (os)->os_secondary_cache == ZFS_CACHE_METADATA)
+
+#define DMU_OS_IS_L2COMPRESSIBLE(os) (zfs_mdcomp_disable == B_FALSE)
+
+/* called from zpl */
+int dmu_objset_hold(const char *name, void *tag, objset_t **osp);
+int dmu_objset_own(const char *name, dmu_objset_type_t type,
+ boolean_t readonly, void *tag, objset_t **osp);
+int dmu_objset_own_obj(struct dsl_pool *dp, uint64_t obj,
+ dmu_objset_type_t type, boolean_t readonly, void *tag, objset_t **osp);
+void dmu_objset_refresh_ownership(struct dsl_dataset *ds,
+ struct dsl_dataset **newds, void *tag);
+void dmu_objset_rele(objset_t *os, void *tag);
+void dmu_objset_disown(objset_t *os, void *tag);
+int dmu_objset_from_ds(struct dsl_dataset *ds, objset_t **osp);
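+
+/*
+ * Illustrative sketch (not part of the original header): the usual
+ * hold/rele pairing for read-only access to an objset by name.
+ *
+ *    objset_t *os;
+ *    int err = dmu_objset_hold("pool/fs", FTAG, &os);
+ *    if (err == 0) {
+ *        ... inspect os ...
+ *        dmu_objset_rele(os, FTAG);
+ *    }
+ */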
+
+void dmu_objset_stats(objset_t *os, nvlist_t *nv);
+void dmu_objset_fast_stat(objset_t *os, dmu_objset_stats_t *stat);
+void dmu_objset_space(objset_t *os, uint64_t *refdbytesp, uint64_t *availbytesp,
+ uint64_t *usedobjsp, uint64_t *availobjsp);
+uint64_t dmu_objset_fsid_guid(objset_t *os);
+int dmu_objset_find_dp(struct dsl_pool *dp, uint64_t ddobj,
+ int func(struct dsl_pool *, struct dsl_dataset *, void *),
+ void *arg, int flags);
+int dmu_objset_prefetch(const char *name, void *arg);
+void dmu_objset_evict_dbufs(objset_t *os);
+timestruc_t dmu_objset_snap_cmtime(objset_t *os);
+
+/* called from dsl */
+void dmu_objset_sync(objset_t *os, zio_t *zio, dmu_tx_t *tx);
+boolean_t dmu_objset_is_dirty(objset_t *os, uint64_t txg);
+objset_t *dmu_objset_create_impl(spa_t *spa, struct dsl_dataset *ds,
+ blkptr_t *bp, dmu_objset_type_t type, dmu_tx_t *tx);
+int dmu_objset_open_impl(spa_t *spa, struct dsl_dataset *ds, blkptr_t *bp,
+ objset_t **osp);
+void dmu_objset_evict(objset_t *os);
+void dmu_objset_do_userquota_updates(objset_t *os, dmu_tx_t *tx);
+void dmu_objset_userquota_get_ids(dnode_t *dn, boolean_t before, dmu_tx_t *tx);
+boolean_t dmu_objset_userused_enabled(objset_t *os);
+int dmu_objset_userspace_upgrade(objset_t *os);
+boolean_t dmu_objset_userspace_present(objset_t *os);
+int dmu_fsname(const char *snapname, char *buf);
+
+void dmu_objset_evict_done(objset_t *os);
+void dmu_objset_willuse_space(objset_t *os, int64_t space, dmu_tx_t *tx);
+
+void dmu_objset_init(void);
+void dmu_objset_fini(void);
+
+#ifdef __cplusplus
+}
+#endif
+
+#endif /* _SYS_DMU_OBJSET_H */
diff --git a/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/dmu_send.h b/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/dmu_send.h
new file mode 100644
index 000000000000..1f4b1f2cde9f
--- /dev/null
+++ b/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/dmu_send.h
@@ -0,0 +1,93 @@
+/*
+ * CDDL HEADER START
+ *
+ * The contents of this file are subject to the terms of the
+ * Common Development and Distribution License (the "License").
+ * You may not use this file except in compliance with the License.
+ *
+ * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
+ * or http://www.opensolaris.org/os/licensing.
+ * See the License for the specific language governing permissions
+ * and limitations under the License.
+ *
+ * When distributing Covered Code, include this CDDL HEADER in each
+ * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
+ * If applicable, add the following below this CDDL HEADER, with the
+ * fields enclosed by brackets "[]" replaced with your own identifying
+ * information: Portions Copyright [yyyy] [name of copyright owner]
+ *
+ * CDDL HEADER END
+ */
+
+/*
+ * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved.
+ * Copyright (c) 2012, 2014 by Delphix. All rights reserved.
+ * Copyright 2011 Nexenta Systems, Inc. All rights reserved.
+ * Copyright (c) 2013, Joyent, Inc. All rights reserved.
+ * Copyright (c) 2014 Integros [integros.com]
+ */
+
+#ifndef _DMU_SEND_H
+#define _DMU_SEND_H
+
+#include <sys/spa.h>
+
+struct vnode;
+struct dsl_dataset;
+struct drr_begin;
+struct avl_tree;
+struct dmu_replay_record;
+
+extern const char *recv_clone_name;
+
+int dmu_send(const char *tosnap, const char *fromsnap, boolean_t embedok,
+ boolean_t large_block_ok, boolean_t compressok, int outfd,
+ uint64_t resumeobj, uint64_t resumeoff,
+#ifdef illumos
+ struct vnode *vp, offset_t *off);
+#else
+ struct file *fp, offset_t *off);
+#endif
+int dmu_send_estimate(struct dsl_dataset *ds, struct dsl_dataset *fromds,
+ boolean_t stream_compressed, uint64_t *sizep);
+int dmu_send_estimate_from_txg(struct dsl_dataset *ds, uint64_t fromtxg,
+ boolean_t stream_compressed, uint64_t *sizep);
+int dmu_send_obj(const char *pool, uint64_t tosnap, uint64_t fromsnap,
+ boolean_t embedok, boolean_t large_block_ok, boolean_t compressok,
+#ifdef illumos
+ int outfd, struct vnode *vp, offset_t *off);
+#else
+ int outfd, struct file *fp, offset_t *off);
+#endif
+
+typedef struct dmu_recv_cookie {
+ struct dsl_dataset *drc_ds;
+ struct dmu_replay_record *drc_drr_begin;
+ struct drr_begin *drc_drrb;
+ const char *drc_tofs;
+ const char *drc_tosnap;
+ boolean_t drc_newfs;
+ boolean_t drc_byteswap;
+ boolean_t drc_force;
+ boolean_t drc_resumable;
+ boolean_t drc_clone;
+ struct avl_tree *drc_guid_to_ds_map;
+ zio_cksum_t drc_cksum;
+ uint64_t drc_newsnapobj;
+ void *drc_owner;
+ cred_t *drc_cred;
+} dmu_recv_cookie_t;
+
+int dmu_recv_begin(char *tofs, char *tosnap,
+ struct dmu_replay_record *drr_begin,
+ boolean_t force, boolean_t resumable, char *origin, dmu_recv_cookie_t *drc);
+#ifdef illumos
+int dmu_recv_stream(dmu_recv_cookie_t *drc, struct vnode *vp, offset_t *voffp,
+#else
+int dmu_recv_stream(dmu_recv_cookie_t *drc, struct file *fp, offset_t *voffp,
+#endif
+ int cleanup_fd, uint64_t *action_handlep);
+int dmu_recv_end(dmu_recv_cookie_t *drc, void *owner);
+boolean_t dmu_objset_is_receiving(objset_t *os);
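+
+/*
+ * Illustrative sketch (not part of the original header): the receive
+ * sequence as driven by the ioctl path on FreeBSD.  Cleanup on error
+ * is elided; "fp", "off", "cleanup_fd", "ah" and "owner" are assumed
+ * to come from the caller.
+ *
+ *    dmu_recv_cookie_t drc;
+ *    int err = dmu_recv_begin(tofs, tosnap, drr_begin, force,
+ *        resumable, origin, &drc);
+ *    if (err == 0)
+ *        err = dmu_recv_stream(&drc, fp, &off, cleanup_fd, &ah);
+ *    if (err == 0)
+ *        err = dmu_recv_end(&drc, owner);
+ */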
+
+#endif /* _DMU_SEND_H */
diff --git a/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/dmu_traverse.h b/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/dmu_traverse.h
new file mode 100644
index 000000000000..c010edd440d9
--- /dev/null
+++ b/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/dmu_traverse.h
@@ -0,0 +1,69 @@
+/*
+ * CDDL HEADER START
+ *
+ * The contents of this file are subject to the terms of the
+ * Common Development and Distribution License (the "License").
+ * You may not use this file except in compliance with the License.
+ *
+ * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
+ * or http://www.opensolaris.org/os/licensing.
+ * See the License for the specific language governing permissions
+ * and limitations under the License.
+ *
+ * When distributing Covered Code, include this CDDL HEADER in each
+ * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
+ * If applicable, add the following below this CDDL HEADER, with the
+ * fields enclosed by brackets "[]" replaced with your own identifying
+ * information: Portions Copyright [yyyy] [name of copyright owner]
+ *
+ * CDDL HEADER END
+ */
+/*
+ * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved.
+ * Copyright (c) 2012, 2014 by Delphix. All rights reserved.
+ */
+
+#ifndef _SYS_DMU_TRAVERSE_H
+#define _SYS_DMU_TRAVERSE_H
+
+#include <sys/zfs_context.h>
+#include <sys/spa.h>
+#include <sys/zio.h>
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+struct dnode_phys;
+struct dsl_dataset;
+struct zilog;
+struct arc_buf;
+
+typedef int (blkptr_cb_t)(spa_t *spa, zilog_t *zilog, const blkptr_t *bp,
+ const zbookmark_phys_t *zb, const struct dnode_phys *dnp, void *arg);
+
+#define TRAVERSE_PRE (1<<0)
+#define TRAVERSE_POST (1<<1)
+#define TRAVERSE_PREFETCH_METADATA (1<<2)
+#define TRAVERSE_PREFETCH_DATA (1<<3)
+#define TRAVERSE_PREFETCH (TRAVERSE_PREFETCH_METADATA | TRAVERSE_PREFETCH_DATA)
+#define TRAVERSE_HARD (1<<4)
+
+/* Special traverse error return value to indicate skipping of children */
+#define TRAVERSE_VISIT_NO_CHILDREN -1
+
+int traverse_dataset(struct dsl_dataset *ds,
+ uint64_t txg_start, int flags, blkptr_cb_t func, void *arg);
+int traverse_dataset_resume(struct dsl_dataset *ds, uint64_t txg_start,
+ zbookmark_phys_t *resume, int flags, blkptr_cb_t func, void *arg);
+int traverse_dataset_destroyed(spa_t *spa, blkptr_t *blkptr,
+ uint64_t txg_start, zbookmark_phys_t *resume, int flags,
+ blkptr_cb_t func, void *arg);
+int traverse_pool(spa_t *spa,
+ uint64_t txg_start, int flags, blkptr_cb_t func, void *arg);
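+
+/*
+ * Illustrative sketch (not part of the original header): a minimal
+ * blkptr callback that counts non-hole blocks.  "count_cb" is a
+ * hypothetical name; BP_IS_HOLE() comes from sys/spa.h.
+ *
+ *    static int
+ *    count_cb(spa_t *spa, zilog_t *zilog, const blkptr_t *bp,
+ *        const zbookmark_phys_t *zb, const struct dnode_phys *dnp,
+ *        void *arg)
+ *    {
+ *        if (bp != NULL && !BP_IS_HOLE(bp))
+ *            (*(uint64_t *)arg)++;
+ *        return (0);
+ *    }
+ *
+ *    uint64_t n = 0;
+ *    (void) traverse_dataset(ds, 0, TRAVERSE_PRE, count_cb, &n);
+ */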
+
+#ifdef __cplusplus
+}
+#endif
+
+#endif /* _SYS_DMU_TRAVERSE_H */
diff --git a/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/dmu_tx.h b/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/dmu_tx.h
new file mode 100644
index 000000000000..82b8946e5f6d
--- /dev/null
+++ b/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/dmu_tx.h
@@ -0,0 +1,152 @@
+/*
+ * CDDL HEADER START
+ *
+ * The contents of this file are subject to the terms of the
+ * Common Development and Distribution License (the "License").
+ * You may not use this file except in compliance with the License.
+ *
+ * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
+ * or http://www.opensolaris.org/os/licensing.
+ * See the License for the specific language governing permissions
+ * and limitations under the License.
+ *
+ * When distributing Covered Code, include this CDDL HEADER in each
+ * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
+ * If applicable, add the following below this CDDL HEADER, with the
+ * fields enclosed by brackets "[]" replaced with your own identifying
+ * information: Portions Copyright [yyyy] [name of copyright owner]
+ *
+ * CDDL HEADER END
+ */
+/*
+ * Copyright 2010 Sun Microsystems, Inc. All rights reserved.
+ * Use is subject to license terms.
+ */
+/*
+ * Copyright (c) 2012, 2016 by Delphix. All rights reserved.
+ */
+
+#ifndef _SYS_DMU_TX_H
+#define _SYS_DMU_TX_H
+
+#include <sys/dmu.h>
+#include <sys/txg.h>
+#include <sys/refcount.h>
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+struct dmu_buf_impl;
+struct dmu_tx_hold;
+struct dnode_link;
+struct dsl_pool;
+struct dnode;
+struct dsl_dir;
+
+struct dmu_tx {
+ /*
+ * No synchronization is needed because a tx can only be handled
+ * by one thread.
+ */
+ list_t tx_holds; /* list of dmu_tx_hold_t */
+ objset_t *tx_objset;
+ struct dsl_dir *tx_dir;
+ struct dsl_pool *tx_pool;
+ uint64_t tx_txg;
+ uint64_t tx_lastsnap_txg;
+ uint64_t tx_lasttried_txg;
+ txg_handle_t tx_txgh;
+ void *tx_tempreserve_cookie;
+ struct dmu_tx_hold *tx_needassign_txh;
+
+ /* list of dmu_tx_callback_t on this dmu_tx */
+ list_t tx_callbacks;
+
+ /* placeholder for syncing context, doesn't need specific holds */
+ boolean_t tx_anyobj;
+
+ /* transaction is marked as being a "net free" of space */
+ boolean_t tx_netfree;
+
+ /* time this transaction was created */
+ hrtime_t tx_start;
+
+ /* need to wait for sufficient dirty space */
+ boolean_t tx_wait_dirty;
+
+ /* has this transaction already been delayed? */
+ boolean_t tx_dirty_delayed;
+
+ int tx_err;
+};
+
+enum dmu_tx_hold_type {
+ THT_NEWOBJECT,
+ THT_WRITE,
+ THT_BONUS,
+ THT_FREE,
+ THT_ZAP,
+ THT_SPACE,
+ THT_SPILL,
+ THT_NUMTYPES
+};
+
+typedef struct dmu_tx_hold {
+ dmu_tx_t *txh_tx;
+ list_node_t txh_node;
+ struct dnode *txh_dnode;
+ zfs_refcount_t txh_space_towrite;
+ zfs_refcount_t txh_memory_tohold;
+ enum dmu_tx_hold_type txh_type;
+ uint64_t txh_arg1;
+ uint64_t txh_arg2;
+} dmu_tx_hold_t;
+
+typedef struct dmu_tx_callback {
+ list_node_t dcb_node; /* linked to tx_callbacks list */
+ dmu_tx_callback_func_t *dcb_func; /* caller function pointer */
+ void *dcb_data; /* caller private data */
+} dmu_tx_callback_t;
+
+/*
+ * These routines are defined in dmu.h, and are called by the user.
+ */
+dmu_tx_t *dmu_tx_create(objset_t *dd);
+int dmu_tx_assign(dmu_tx_t *tx, uint64_t txg_how);
+void dmu_tx_commit(dmu_tx_t *tx);
+void dmu_tx_abort(dmu_tx_t *tx);
+uint64_t dmu_tx_get_txg(dmu_tx_t *tx);
+struct dsl_pool *dmu_tx_pool(dmu_tx_t *tx);
+void dmu_tx_wait(dmu_tx_t *tx);
+
+void dmu_tx_callback_register(dmu_tx_t *tx, dmu_tx_callback_func_t *dcb_func,
+ void *dcb_data);
+void dmu_tx_do_callbacks(list_t *cb_list, int error);
+
+/*
+ * These routines are defined in dmu_spa.h, and are called by the SPA.
+ */
+extern dmu_tx_t *dmu_tx_create_assigned(struct dsl_pool *dp, uint64_t txg);
+
+/*
+ * These routines are only called by the DMU.
+ */
+dmu_tx_t *dmu_tx_create_dd(dsl_dir_t *dd);
+int dmu_tx_is_syncing(dmu_tx_t *tx);
+int dmu_tx_private_ok(dmu_tx_t *tx);
+void dmu_tx_add_new_object(dmu_tx_t *tx, dnode_t *dn);
+void dmu_tx_dirty_buf(dmu_tx_t *tx, struct dmu_buf_impl *db);
+void dmu_tx_hold_space(dmu_tx_t *tx, uint64_t space);
+
+#ifdef ZFS_DEBUG
+#define DMU_TX_DIRTY_BUF(tx, db) dmu_tx_dirty_buf(tx, db)
+#else
+#define DMU_TX_DIRTY_BUF(tx, db)
+#endif
+
+#ifdef __cplusplus
+}
+#endif
+
+#endif /* _SYS_DMU_TX_H */
diff --git a/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/dmu_zfetch.h b/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/dmu_zfetch.h
new file mode 100644
index 000000000000..21a3ff3a2032
--- /dev/null
+++ b/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/dmu_zfetch.h
@@ -0,0 +1,76 @@
+/*
+ * CDDL HEADER START
+ *
+ * The contents of this file are subject to the terms of the
+ * Common Development and Distribution License (the "License").
+ * You may not use this file except in compliance with the License.
+ *
+ * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
+ * or http://www.opensolaris.org/os/licensing.
+ * See the License for the specific language governing permissions
+ * and limitations under the License.
+ *
+ * When distributing Covered Code, include this CDDL HEADER in each
+ * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
+ * If applicable, add the following below this CDDL HEADER, with the
+ * fields enclosed by brackets "[]" replaced with your own identifying
+ * information: Portions Copyright [yyyy] [name of copyright owner]
+ *
+ * CDDL HEADER END
+ */
+/*
+ * Copyright 2009 Sun Microsystems, Inc. All rights reserved.
+ * Use is subject to license terms.
+ */
+
+/*
+ * Copyright (c) 2014 by Delphix. All rights reserved.
+ */
+
+#ifndef _DMU_ZFETCH_H
+#define _DMU_ZFETCH_H
+
+#include <sys/zfs_context.h>
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+extern uint64_t zfetch_array_rd_sz;
+
+struct dnode; /* so we can reference dnode */
+
+typedef struct zstream {
+ uint64_t zs_blkid; /* expect next access at this blkid */
+ uint64_t zs_pf_blkid; /* next block to prefetch */
+
+ /*
+ * We will next prefetch the L1 indirect block of this level-0
+ * block id.
+ */
+ uint64_t zs_ipf_blkid;
+
+ kmutex_t zs_lock; /* protects stream */
+ hrtime_t zs_atime; /* time last prefetch issued */
+ list_node_t zs_node; /* link for zf_stream */
+} zstream_t;
+
+typedef struct zfetch {
+ krwlock_t zf_rwlock; /* protects zfetch structure */
+ list_t zf_stream; /* list of zstream_t's */
+ struct dnode *zf_dnode; /* dnode that owns this zfetch */
+} zfetch_t;
+
+void zfetch_init(void);
+void zfetch_fini(void);
+
+void dmu_zfetch_init(zfetch_t *, struct dnode *);
+void dmu_zfetch_fini(zfetch_t *);
+void dmu_zfetch(zfetch_t *, uint64_t, uint64_t, boolean_t);
+
+
+#ifdef __cplusplus
+}
+#endif
+
+#endif /* _DMU_ZFETCH_H */
diff --git a/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/dnode.h b/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/dnode.h
new file mode 100644
index 000000000000..b1a8547013c5
--- /dev/null
+++ b/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/dnode.h
@@ -0,0 +1,599 @@
+/*
+ * CDDL HEADER START
+ *
+ * The contents of this file are subject to the terms of the
+ * Common Development and Distribution License (the "License").
+ * You may not use this file except in compliance with the License.
+ *
+ * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
+ * or http://www.opensolaris.org/os/licensing.
+ * See the License for the specific language governing permissions
+ * and limitations under the License.
+ *
+ * When distributing Covered Code, include this CDDL HEADER in each
+ * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
+ * If applicable, add the following below this CDDL HEADER, with the
+ * fields enclosed by brackets "[]" replaced with your own identifying
+ * information: Portions Copyright [yyyy] [name of copyright owner]
+ *
+ * CDDL HEADER END
+ */
+/*
+ * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved.
+ * Copyright (c) 2012, 2018 by Delphix. All rights reserved.
+ * Copyright (c) 2014 Spectra Logic Corporation, All rights reserved.
+ */
+
+#ifndef _SYS_DNODE_H
+#define _SYS_DNODE_H
+
+#include <sys/zfs_context.h>
+#include <sys/avl.h>
+#include <sys/spa.h>
+#include <sys/txg.h>
+#include <sys/zio.h>
+#include <sys/refcount.h>
+#include <sys/dmu_zfetch.h>
+#include <sys/zrlock.h>
+#include <sys/multilist.h>
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+/*
+ * dnode_hold() flags.
+ */
+#define DNODE_MUST_BE_ALLOCATED 1
+#define DNODE_MUST_BE_FREE 2
+
+/*
+ * dnode_next_offset() flags.
+ */
+#define DNODE_FIND_HOLE 1
+#define DNODE_FIND_BACKWARDS 2
+#define DNODE_FIND_HAVELOCK 4
+
+/*
+ * Fixed constants.
+ */
+#define DNODE_SHIFT 9 /* 512 bytes */
+#define DN_MIN_INDBLKSHIFT 12 /* 4k */
+/*
+ * If we ever increase this value beyond 20, we need to revisit all logic that
+ * does x << level * ebps to handle overflow. With a 1M indirect block size,
+ * 4 levels of indirect blocks would not be able to guarantee addressing an
+ * entire object, so 5 levels will be used, but 5 * (20 - 7) = 65.
+ */
+#define DN_MAX_INDBLKSHIFT 17 /* 128k */
+#define DNODE_BLOCK_SHIFT 14 /* 16k */
+#define DNODE_CORE_SIZE 64 /* 64 bytes for dnode sans blkptrs */
+#define DN_MAX_OBJECT_SHIFT 48 /* 256 trillion (zfs_fid_t limit) */
+#define DN_MAX_OFFSET_SHIFT 64 /* 2^64 bytes in a dnode */
+
+/*
+ * dnode id flags
+ *
+ * Note: a file will never have its IDs moved from bonus to spill;
+ * only in a crypto environment would they reside in the spill block.
+ */
+#define DN_ID_CHKED_BONUS 0x1
+#define DN_ID_CHKED_SPILL 0x2
+#define DN_ID_OLD_EXIST 0x4
+#define DN_ID_NEW_EXIST 0x8
+
+/*
+ * Derived constants.
+ */
+#define DNODE_MIN_SIZE (1 << DNODE_SHIFT)
+#define DNODE_MAX_SIZE (1 << DNODE_BLOCK_SHIFT)
+#define DNODE_BLOCK_SIZE (1 << DNODE_BLOCK_SHIFT)
+#define DNODE_MIN_SLOTS (DNODE_MIN_SIZE >> DNODE_SHIFT)
+#define DNODE_MAX_SLOTS (DNODE_MAX_SIZE >> DNODE_SHIFT)
+#define DN_BONUS_SIZE(dnsize) ((dnsize) - DNODE_CORE_SIZE - \
+ (1 << SPA_BLKPTRSHIFT))
+#define DN_SLOTS_TO_BONUSLEN(slots) DN_BONUS_SIZE((slots) << DNODE_SHIFT)
+#define DN_OLD_MAX_BONUSLEN (DN_BONUS_SIZE(DNODE_MIN_SIZE))
+#define DN_MAX_NBLKPTR ((DNODE_MIN_SIZE - DNODE_CORE_SIZE) >> SPA_BLKPTRSHIFT)
+#define DN_MAX_OBJECT (1ULL << DN_MAX_OBJECT_SHIFT)
+#define DN_ZERO_BONUSLEN (DN_BONUS_SIZE(DNODE_MAX_SIZE) + 1)
+#define DN_KILL_SPILLBLK (1)
+
+#define DN_SLOT_UNINIT ((void *)NULL) /* Uninitialized */
+#define DN_SLOT_FREE ((void *)1UL) /* Free slot */
+#define DN_SLOT_ALLOCATED ((void *)2UL) /* Allocated slot */
+#define DN_SLOT_INTERIOR ((void *)3UL) /* Interior allocated slot */
+#define DN_SLOT_IS_PTR(dn) ((void *)dn > DN_SLOT_INTERIOR)
+#define DN_SLOT_IS_VALID(dn) ((void *)dn != NULL)
+
+#define DNODES_PER_BLOCK_SHIFT (DNODE_BLOCK_SHIFT - DNODE_SHIFT)
+#define DNODES_PER_BLOCK (1ULL << DNODES_PER_BLOCK_SHIFT)
+
+/*
+ * This is inaccurate if the indblkshift of the particular object is not the
+ * max. But it's only used by userland to calculate the zvol reservation.
+ */
+#define DNODES_PER_LEVEL_SHIFT (DN_MAX_INDBLKSHIFT - SPA_BLKPTRSHIFT)
+#define DNODES_PER_LEVEL (1ULL << DNODES_PER_LEVEL_SHIFT)
+
+/* The +2 here is a cheesy way to round up */
+#define DN_MAX_LEVELS (2 + ((DN_MAX_OFFSET_SHIFT - SPA_MINBLOCKSHIFT) / \
+ (DN_MIN_INDBLKSHIFT - SPA_BLKPTRSHIFT)))
+
+#define DN_BONUS(dnp) ((void*)((dnp)->dn_bonus + \
+ (((dnp)->dn_nblkptr - 1) * sizeof (blkptr_t))))
+#define DN_MAX_BONUS_LEN(dnp) \
+ ((dnp->dn_flags & DNODE_FLAG_SPILL_BLKPTR) ? \
+ (uint8_t *)DN_SPILL_BLKPTR(dnp) - (uint8_t *)DN_BONUS(dnp) : \
+ (uint8_t *)(dnp + (dnp->dn_extra_slots + 1)) - (uint8_t *)DN_BONUS(dnp))
+
+#define DN_USED_BYTES(dnp) (((dnp)->dn_flags & DNODE_FLAG_USED_BYTES) ? \
+ (dnp)->dn_used : (dnp)->dn_used << SPA_MINBLOCKSHIFT)
+
+#define EPB(blkshift, typeshift) (1 << (blkshift - typeshift))
+
+struct dmu_buf_impl;
+struct objset;
+struct zio;
+
+enum dnode_dirtycontext {
+ DN_UNDIRTIED,
+ DN_DIRTY_OPEN,
+ DN_DIRTY_SYNC
+};
+
+/* Is dn_used in bytes? If not, it's in multiples of SPA_MINBLOCKSIZE */
+#define DNODE_FLAG_USED_BYTES (1<<0)
+#define DNODE_FLAG_USERUSED_ACCOUNTED (1<<1)
+
+/* Does dnode have a SA spill blkptr in bonus? */
+#define DNODE_FLAG_SPILL_BLKPTR (1<<2)
+
+/*
+ * VARIABLE-LENGTH (LARGE) DNODES
+ *
+ * The motivation for variable-length dnodes is to eliminate the overhead
+ * associated with using spill blocks. Spill blocks are used to store
+ * system attribute data (i.e. file metadata) that does not fit in the
+ * dnode's bonus buffer. By allowing a larger bonus buffer area the use of
+ * a spill block can be avoided. Spill blocks potentially incur an
+ * additional read I/O for every dnode in a dnode block. As a worst case
+ * example, reading 32 dnodes from a 16k dnode block and all of the spill
+ * blocks could issue 33 separate reads. Now suppose those dnodes have size
+ * 1024 and therefore don't need spill blocks. Then the worst case number
+ * of blocks read is reduced from 33 to two--one per dnode block.
+ *
+ * ZFS-on-Linux systems that make heavy use of extended attributes benefit
+ * from this feature. In particular, ZFS-on-Linux supports the xattr=sa
+ * dataset property which allows file extended attribute data to be stored
+ * in the dnode bonus buffer as an alternative to the traditional
+ * directory-based format. Workloads such as SELinux and the Lustre
+ * distributed filesystem often store enough xattr data to force spill
+ * blocks when xattr=sa is in effect. Large dnodes may therefore provide a
+ * performance benefit to such systems. Other use cases that benefit from
+ * this feature include files with large ACLs and symbolic links with long
+ * target names.
+ *
+ * The size of a dnode may be a multiple of 512 bytes up to the size of a
+ * dnode block (currently 16384 bytes). The dn_extra_slots field of the
+ * on-disk dnode_phys_t structure describes the size of the physical dnode
+ * on disk. The field represents how many "extra" dnode_phys_t slots a
+ * dnode consumes in its dnode block. This convention results in a value of
+ * 0 for 512 byte dnodes which preserves on-disk format compatibility with
+ * older software which doesn't support large dnodes.
+ *
+ * Similarly, the in-memory dnode_t structure has a dn_num_slots field
+ * to represent the total number of dnode_phys_t slots consumed on disk.
+ * Thus dn->dn_num_slots is 1 greater than the corresponding
+ * dnp->dn_extra_slots. This difference in convention was adopted
+ * because, unlike on-disk structures, backward compatibility is not a
+ * concern for in-memory objects, so we used a more natural way to
+ * represent size for a dnode_t.
+ *
+ * The default size for newly created dnodes is determined by the value of
+ * the "dnodesize" dataset property. By default the property is set to
+ * "legacy" which is compatible with older software. Setting the property
+ * to "auto" will allow the filesystem to choose the most suitable dnode
+ * size. Currently this just sets the default dnode size to 1k, but future
+ * code improvements could dynamically choose a size based on observed
+ * workload patterns. Dnodes of varying sizes can coexist within the same
+ * dataset and even within the same dnode block.
+ */
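+
+/*
+ * A minimal sketch of the size convention described above, assuming the
+ * usual 512-byte slot (DNODE_SHIFT == 9); the "example_" name is
+ * illustrative and not part of the DMU interface.  For dn_extra_slots
+ * values of 0, 1 and 31 this yields 512, 1024 and 16384 bytes.
+ */
+static inline int
+example_dnode_phys_size(int dn_extra_slots)
+{
+	/* the in-memory dn_num_slots is the on-disk dn_extra_slots + 1 */
+	return ((dn_extra_slots + 1) << DNODE_SHIFT);
+}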
+
+typedef struct dnode_phys {
+ uint8_t dn_type; /* dmu_object_type_t */
+ uint8_t dn_indblkshift; /* ln2(indirect block size) */
+ uint8_t dn_nlevels; /* 1=dn_blkptr->data blocks */
+ uint8_t dn_nblkptr; /* length of dn_blkptr */
+ uint8_t dn_bonustype; /* type of data in bonus buffer */
+ uint8_t dn_checksum; /* ZIO_CHECKSUM type */
+ uint8_t dn_compress; /* ZIO_COMPRESS type */
+ uint8_t dn_flags; /* DNODE_FLAG_* */
+ uint16_t dn_datablkszsec; /* data block size in 512b sectors */
+ uint16_t dn_bonuslen; /* length of dn_bonus */
+ uint8_t dn_extra_slots; /* # of subsequent slots consumed */
+ uint8_t dn_pad2[3];
+
+ /* accounting is protected by dn_dirty_mtx */
+ uint64_t dn_maxblkid; /* largest allocated block ID */
+ uint64_t dn_used; /* bytes (or sectors) of disk space */
+
+ /*
+ * Both dn_pad2 and dn_pad3 are protected by the block's MAC. This
+ * allows us to protect any fields that might be added here in the
+ * future. In either case, developers will want to check
+ * zio_crypt_init_uios_dnode() to ensure the new field is being
+ * protected properly.
+ */
+ uint64_t dn_pad3[4];
+ union {
+ blkptr_t dn_blkptr[1+DN_OLD_MAX_BONUSLEN/sizeof (blkptr_t)];
+ struct {
+ blkptr_t __dn_ignore1;
+ uint8_t dn_bonus[DN_OLD_MAX_BONUSLEN];
+ };
+ struct {
+ blkptr_t __dn_ignore2;
+ uint8_t __dn_ignore3[DN_OLD_MAX_BONUSLEN -
+ sizeof (blkptr_t)];
+ blkptr_t dn_spill;
+ };
+ };
+} dnode_phys_t;
+
+#define DN_SPILL_BLKPTR(dnp) (blkptr_t *)((char *)(dnp) + \
+ (((dnp)->dn_extra_slots + 1) << DNODE_SHIFT) - (1 << SPA_BLKPTRSHIFT))
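+
+/*
+ * Worked example: for a legacy 512-byte dnode (dn_extra_slots == 0),
+ * with DNODE_SHIFT == 9 and a 128-byte blkptr_t (SPA_BLKPTRSHIFT == 7),
+ * DN_SPILL_BLKPTR points at the last 128 bytes of the dnode (bytes
+ * 384-511), overlaying the tail of the bonus area.
+ */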
+
+struct dnode {
+ /*
+ * Protects the structure of the dnode, including the number of levels
+ * of indirection (dn_nlevels), dn_maxblkid, and dn_next_*
+ */
+ krwlock_t dn_struct_rwlock;
+
+ /* Our link on dn_objset->os_dnodes list; protected by os_lock. */
+ list_node_t dn_link;
+
+ /* immutable: */
+ struct objset *dn_objset;
+ uint64_t dn_object;
+ struct dmu_buf_impl *dn_dbuf;
+ struct dnode_handle *dn_handle;
+ dnode_phys_t *dn_phys; /* pointer into dn->dn_dbuf->db.db_data */
+
+ /*
+ * Copies of stuff in dn_phys. They're valid in the open
+	 * context (e.g. even before the dnode is first synced).
+ * Where necessary, these are protected by dn_struct_rwlock.
+ */
+ dmu_object_type_t dn_type; /* object type */
+ uint16_t dn_bonuslen; /* bonus length */
+ uint8_t dn_bonustype; /* bonus type */
+ uint8_t dn_nblkptr; /* number of blkptrs (immutable) */
+ uint8_t dn_checksum; /* ZIO_CHECKSUM type */
+ uint8_t dn_compress; /* ZIO_COMPRESS type */
+ uint8_t dn_nlevels;
+ uint8_t dn_indblkshift;
+ uint8_t dn_datablkshift; /* zero if blksz not power of 2! */
+ uint8_t dn_moved; /* Has this dnode been moved? */
+ uint16_t dn_datablkszsec; /* in 512b sectors */
+ uint32_t dn_datablksz; /* in bytes */
+ uint64_t dn_maxblkid;
+ uint8_t dn_next_type[TXG_SIZE];
+ uint8_t dn_num_slots; /* metadnode slots consumed on disk */
+ uint8_t dn_next_nblkptr[TXG_SIZE];
+ uint8_t dn_next_nlevels[TXG_SIZE];
+ uint8_t dn_next_indblkshift[TXG_SIZE];
+ uint8_t dn_next_bonustype[TXG_SIZE];
+ uint8_t dn_rm_spillblk[TXG_SIZE]; /* for removing spill blk */
+ uint16_t dn_next_bonuslen[TXG_SIZE];
+ uint32_t dn_next_blksz[TXG_SIZE]; /* next block size in bytes */
+
+ /* protected by dn_dbufs_mtx; declared here to fill 32-bit hole */
+ uint32_t dn_dbufs_count; /* count of dn_dbufs */
+
+ /* protected by os_lock: */
+ multilist_node_t dn_dirty_link[TXG_SIZE]; /* next on dataset's dirty */
+
+ /* protected by dn_mtx: */
+ kmutex_t dn_mtx;
+ list_t dn_dirty_records[TXG_SIZE];
+ struct range_tree *dn_free_ranges[TXG_SIZE];
+ uint64_t dn_allocated_txg;
+ uint64_t dn_free_txg;
+ uint64_t dn_assigned_txg;
+ uint64_t dn_dirty_txg; /* txg dnode was last dirtied */
+ kcondvar_t dn_notxholds;
+ enum dnode_dirtycontext dn_dirtyctx;
+ uint8_t *dn_dirtyctx_firstset; /* dbg: contents meaningless */
+
+ /* protected by own devices */
+ zfs_refcount_t dn_tx_holds;
+ zfs_refcount_t dn_holds;
+
+ kmutex_t dn_dbufs_mtx;
+ /*
+ * Descendent dbufs, ordered by dbuf_compare. Note that dn_dbufs
+ * can contain multiple dbufs of the same (level, blkid) when a
+ * dbuf is marked DB_EVICTING without being removed from
+ * dn_dbufs. To maintain the avl invariant that there cannot be
+ * duplicate entries, we order the dbufs by an arbitrary value -
+ * their address in memory. This means that dn_dbufs cannot be used to
+ * directly look up a dbuf. Instead, callers must use avl_walk, have
+	 * a reference to the dbuf, or look up a non-existent node with
+ * db_state = DB_SEARCH (see dbuf_free_range for an example).
+ */
+ avl_tree_t dn_dbufs;
+
+ /* protected by dn_struct_rwlock */
+ struct dmu_buf_impl *dn_bonus; /* bonus buffer dbuf */
+
+ boolean_t dn_have_spill; /* have spill or are spilling */
+
+ /* parent IO for current sync write */
+ zio_t *dn_zio;
+
+ /* used in syncing context */
+ uint64_t dn_oldused; /* old phys used bytes */
+ uint64_t dn_oldflags; /* old phys dn_flags */
+ uint64_t dn_olduid, dn_oldgid;
+ uint64_t dn_newuid, dn_newgid;
+ int dn_id_flags;
+
+ /* holds prefetch structure */
+ struct zfetch dn_zfetch;
+};
+
+/*
+ * Since the AVL tree already has an embedded element counter, use
+ * dn_dbufs_count only for dbufs not counted there (bonus buffers) and
+ * add the two together.
+ */
+#define DN_DBUFS_COUNT(dn) ((dn)->dn_dbufs_count + \
+ avl_numnodes(&(dn)->dn_dbufs))
+
+/*
+ * Adds a level of indirection between the dbuf and the dnode to avoid
+ * iterating descendent dbufs in dnode_move(). Handles are not allocated
+ * individually, but as an array of child dnodes in dnode_hold_impl().
+ */
+typedef struct dnode_handle {
+ /* Protects dnh_dnode from modification by dnode_move(). */
+ zrlock_t dnh_zrlock;
+ dnode_t *dnh_dnode;
+} dnode_handle_t;
+
+typedef struct dnode_children {
+ dmu_buf_user_t dnc_dbu; /* User evict data */
+ size_t dnc_count; /* number of children */
+ dnode_handle_t dnc_children[]; /* sized dynamically */
+} dnode_children_t;
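+
+/*
+ * A minimal allocation sketch for the flexible dnc_children[] array,
+ * assuming the kmem_zalloc() kernel allocator; the "example_" name is
+ * illustrative.  A block of n handles is allocated in one shot, as
+ * described above.
+ */
+static inline dnode_children_t *
+example_dnode_children_alloc(size_t n)
+{
+	dnode_children_t *dnc = kmem_zalloc(sizeof (dnode_children_t) +
+	    n * sizeof (dnode_handle_t), KM_SLEEP);
+	dnc->dnc_count = n;
+	return (dnc);
+}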
+
+typedef struct free_range {
+ avl_node_t fr_node;
+ uint64_t fr_blkid;
+ uint64_t fr_nblks;
+} free_range_t;
+
+void dnode_special_open(struct objset *dd, dnode_phys_t *dnp,
+ uint64_t object, dnode_handle_t *dnh);
+void dnode_special_close(dnode_handle_t *dnh);
+
+void dnode_setbonuslen(dnode_t *dn, int newsize, dmu_tx_t *tx);
+void dnode_setbonus_type(dnode_t *dn, dmu_object_type_t, dmu_tx_t *tx);
+void dnode_rm_spill(dnode_t *dn, dmu_tx_t *tx);
+
+int dnode_hold(struct objset *dd, uint64_t object,
+ void *ref, dnode_t **dnp);
+int dnode_hold_impl(struct objset *dd, uint64_t object, int flag, int dn_slots,
+ void *ref, dnode_t **dnp);
+boolean_t dnode_add_ref(dnode_t *dn, void *ref);
+void dnode_rele(dnode_t *dn, void *ref);
+void dnode_rele_and_unlock(dnode_t *dn, void *tag, boolean_t evicting);
+void dnode_setdirty(dnode_t *dn, dmu_tx_t *tx);
+void dnode_sync(dnode_t *dn, dmu_tx_t *tx);
+void dnode_allocate(dnode_t *dn, dmu_object_type_t ot, int blocksize, int ibs,
+ dmu_object_type_t bonustype, int bonuslen, int dn_slots, dmu_tx_t *tx);
+void dnode_reallocate(dnode_t *dn, dmu_object_type_t ot, int blocksize,
+ dmu_object_type_t bonustype, int bonuslen, int dn_slots, dmu_tx_t *tx);
+void dnode_free(dnode_t *dn, dmu_tx_t *tx);
+void dnode_byteswap(dnode_phys_t *dnp);
+void dnode_buf_byteswap(void *buf, size_t size);
+void dnode_verify(dnode_t *dn);
+int dnode_set_blksz(dnode_t *dn, uint64_t size, int ibs, dmu_tx_t *tx);
+void dnode_free_range(dnode_t *dn, uint64_t off, uint64_t len, dmu_tx_t *tx);
+void dnode_diduse_space(dnode_t *dn, int64_t space);
+void dnode_new_blkid(dnode_t *dn, uint64_t blkid, dmu_tx_t *tx, boolean_t);
+uint64_t dnode_block_freed(dnode_t *dn, uint64_t blkid);
+void dnode_init(void);
+void dnode_fini(void);
+int dnode_next_offset(dnode_t *dn, int flags, uint64_t *off,
+ int minlvl, uint64_t blkfill, uint64_t txg);
+void dnode_evict_dbufs(dnode_t *dn);
+void dnode_evict_bonus(dnode_t *dn);
+void dnode_free_interior_slots(dnode_t *dn);
+boolean_t dnode_needs_remap(const dnode_t *dn);
+
+#define DNODE_IS_DIRTY(_dn) \
+ ((_dn)->dn_dirty_txg >= spa_syncing_txg((_dn)->dn_objset->os_spa))
+
+#define DNODE_IS_CACHEABLE(_dn) \
+ ((_dn)->dn_objset->os_primary_cache == ZFS_CACHE_ALL || \
+ (DMU_OT_IS_METADATA((_dn)->dn_type) && \
+ (_dn)->dn_objset->os_primary_cache == ZFS_CACHE_METADATA))
+
+#define DNODE_META_IS_CACHEABLE(_dn) \
+ ((_dn)->dn_objset->os_primary_cache == ZFS_CACHE_ALL || \
+ (_dn)->dn_objset->os_primary_cache == ZFS_CACHE_METADATA)
+
+/*
+ * Used for dnodestats kstat.
+ */
+typedef struct dnode_stats {
+ /*
+ * Number of failed attempts to hold a meta dnode dbuf.
+ */
+ kstat_named_t dnode_hold_dbuf_hold;
+ /*
+ * Number of failed attempts to read a meta dnode dbuf.
+ */
+ kstat_named_t dnode_hold_dbuf_read;
+ /*
+ * Number of times dnode_hold(..., DNODE_MUST_BE_ALLOCATED) was able
+ * to hold the requested object number which was allocated. This is
+ * the common case when looking up any allocated object number.
+ */
+ kstat_named_t dnode_hold_alloc_hits;
+ /*
+ * Number of times dnode_hold(..., DNODE_MUST_BE_ALLOCATED) was not
+	 * able to hold the requested object number because it was not allocated.
+ */
+ kstat_named_t dnode_hold_alloc_misses;
+ /*
+ * Number of times dnode_hold(..., DNODE_MUST_BE_ALLOCATED) was not
+	 * able to hold the requested object number because the object number
+ * refers to an interior large dnode slot.
+ */
+ kstat_named_t dnode_hold_alloc_interior;
+ /*
+ * Number of times dnode_hold(..., DNODE_MUST_BE_ALLOCATED) needed
+ * to retry acquiring slot zrl locks due to contention.
+ */
+ kstat_named_t dnode_hold_alloc_lock_retry;
+ /*
+ * Number of times dnode_hold(..., DNODE_MUST_BE_ALLOCATED) did not
+ * need to create the dnode because another thread did so after
+ * dropping the read lock but before acquiring the write lock.
+ */
+ kstat_named_t dnode_hold_alloc_lock_misses;
+ /*
+ * Number of times dnode_hold(..., DNODE_MUST_BE_ALLOCATED) found
+ * a free dnode instantiated by dnode_create() but not yet allocated
+ * by dnode_allocate().
+ */
+ kstat_named_t dnode_hold_alloc_type_none;
+ /*
+ * Number of times dnode_hold(..., DNODE_MUST_BE_FREE) was able
+ * to hold the requested range of free dnode slots.
+ */
+ kstat_named_t dnode_hold_free_hits;
+ /*
+ * Number of times dnode_hold(..., DNODE_MUST_BE_FREE) was not
+ * able to hold the requested range of free dnode slots because
+ * at least one slot was allocated.
+ */
+ kstat_named_t dnode_hold_free_misses;
+ /*
+ * Number of times dnode_hold(..., DNODE_MUST_BE_FREE) was not
+ * able to hold the requested range of free dnode slots because
+ * after acquiring the zrl lock at least one slot was allocated.
+ */
+ kstat_named_t dnode_hold_free_lock_misses;
+ /*
+ * Number of times dnode_hold(..., DNODE_MUST_BE_FREE) needed
+ * to retry acquiring slot zrl locks due to contention.
+ */
+ kstat_named_t dnode_hold_free_lock_retry;
+ /*
+ * Number of times dnode_hold(..., DNODE_MUST_BE_FREE) requested
+ * a range of dnode slots which were held by another thread.
+ */
+ kstat_named_t dnode_hold_free_refcount;
+ /*
+ * Number of times dnode_hold(..., DNODE_MUST_BE_FREE) requested
+ * a range of dnode slots which would overflow the dnode_phys_t.
+ */
+ kstat_named_t dnode_hold_free_overflow;
+ /*
+ * Number of times a dnode_hold(...) was attempted on a dnode
+ * which had already been unlinked in an earlier txg.
+ */
+ kstat_named_t dnode_hold_free_txg;
+ /*
+ * Number of times dnode_free_interior_slots() needed to retry
+ * acquiring a slot zrl lock due to contention.
+ */
+ kstat_named_t dnode_free_interior_lock_retry;
+ /*
+ * Number of new dnodes allocated by dnode_allocate().
+ */
+ kstat_named_t dnode_allocate;
+ /*
+ * Number of dnodes re-allocated by dnode_reallocate().
+ */
+ kstat_named_t dnode_reallocate;
+ /*
+ * Number of meta dnode dbufs evicted.
+ */
+ kstat_named_t dnode_buf_evict;
+ /*
+ * Number of times dmu_object_alloc*() reached the end of the existing
+ * object ID chunk and advanced to a new one.
+ */
+ kstat_named_t dnode_alloc_next_chunk;
+ /*
+ * Number of times multiple threads attempted to allocate a dnode
+ * from the same block of free dnodes.
+ */
+ kstat_named_t dnode_alloc_race;
+ /*
+ * Number of times dmu_object_alloc*() was forced to advance to the
+ * next meta dnode dbuf due to an error from dmu_object_next().
+ */
+ kstat_named_t dnode_alloc_next_block;
+ /*
+ * Statistics for tracking dnodes which have been moved.
+ */
+ kstat_named_t dnode_move_invalid;
+ kstat_named_t dnode_move_recheck1;
+ kstat_named_t dnode_move_recheck2;
+ kstat_named_t dnode_move_special;
+ kstat_named_t dnode_move_handle;
+ kstat_named_t dnode_move_rwlock;
+ kstat_named_t dnode_move_active;
+} dnode_stats_t;
+
+extern dnode_stats_t dnode_stats;
+
+#define DNODE_STAT_INCR(stat, val) \
+	atomic_add_64(&dnode_stats.stat.value.ui64, (val))
+#define DNODE_STAT_BUMP(stat) \
+	DNODE_STAT_INCR(stat, 1)
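+
+/*
+ * Usage sketch (counter names as in the kstat structure above; "refs"
+ * is an arbitrary local count):
+ *
+ *	DNODE_STAT_BUMP(dnode_hold_alloc_hits);
+ *	DNODE_STAT_INCR(dnode_hold_free_refcount, refs);
+ */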
+
+#ifdef ZFS_DEBUG
+
+/*
+ * There should be a ## between the string literal and fmt, to make it
+ * clear that we're joining two strings together, but gcc doesn't
+ * support that preprocessor token.
+ */
+#define dprintf_dnode(dn, fmt, ...) do { \
+ if (zfs_flags & ZFS_DEBUG_DPRINTF) { \
+ char __db_buf[32]; \
+ uint64_t __db_obj = (dn)->dn_object; \
+ if (__db_obj == DMU_META_DNODE_OBJECT) \
+ (void) strcpy(__db_buf, "mdn"); \
+ else \
+ (void) snprintf(__db_buf, sizeof (__db_buf), "%lld", \
+ (u_longlong_t)__db_obj);\
+ dprintf_ds((dn)->dn_objset->os_dsl_dataset, "obj=%s " fmt, \
+ __db_buf, __VA_ARGS__); \
+ } \
+_NOTE(CONSTCOND) } while (0)
+
+#define DNODE_VERIFY(dn) dnode_verify(dn)
+#define FREE_VERIFY(db, start, end, tx) free_verify(db, start, end, tx)
+
+#else
+
+#define dprintf_dnode(dn, fmt, ...)
+#define DNODE_VERIFY(dn)
+#define FREE_VERIFY(db, start, end, tx)
+
+#endif
+
+#ifdef __cplusplus
+}
+#endif
+
+#endif /* _SYS_DNODE_H */
diff --git a/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/dsl_bookmark.h b/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/dsl_bookmark.h
new file mode 100644
index 000000000000..e4d9ec2be033
--- /dev/null
+++ b/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/dsl_bookmark.h
@@ -0,0 +1,52 @@
+/*
+ * CDDL HEADER START
+ *
+ * This file and its contents are supplied under the terms of the
+ * Common Development and Distribution License ("CDDL"), version 1.0.
+ * You may only use this file in accordance with the terms of version
+ * 1.0 of the CDDL.
+ *
+ * A full copy of the text of the CDDL should have accompanied this
+ * source. A copy of the CDDL is also available via the Internet at
+ * http://www.illumos.org/license/CDDL.
+ *
+ * CDDL HEADER END
+ */
+/*
+ * Copyright (c) 2013 by Delphix. All rights reserved.
+ */
+
+#ifndef _SYS_DSL_BOOKMARK_H
+#define _SYS_DSL_BOOKMARK_H
+
+#include <sys/zfs_context.h>
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+struct dsl_pool;
+struct dsl_dataset;
+
+/*
+ * On disk zap object.
+ */
+typedef struct zfs_bookmark_phys {
+ uint64_t zbm_guid; /* guid of bookmarked dataset */
+ uint64_t zbm_creation_txg; /* birth transaction group */
+ uint64_t zbm_creation_time; /* bookmark creation time */
+} zfs_bookmark_phys_t;
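+
+/*
+ * Each bookmark is a zap entry keyed by the bookmark name, whose value
+ * is this struct stored as an array of uint64_t words (a sketch of the
+ * layout; see dsl_bookmark.c for the authoritative encoding).
+ */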
+
+int dsl_bookmark_create(nvlist_t *, nvlist_t *);
+int dsl_get_bookmarks(const char *, nvlist_t *, nvlist_t *);
+int dsl_get_bookmarks_impl(dsl_dataset_t *, nvlist_t *, nvlist_t *);
+int dsl_bookmark_destroy(nvlist_t *, nvlist_t *);
+int dsl_bookmark_rename(const char *fs, const char *from, const char *to);
+int dsl_bookmark_lookup(struct dsl_pool *, const char *,
+ struct dsl_dataset *, zfs_bookmark_phys_t *);
+
+#ifdef __cplusplus
+}
+#endif
+
+#endif /* _SYS_DSL_BOOKMARK_H */
diff --git a/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/dsl_dataset.h b/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/dsl_dataset.h
new file mode 100644
index 000000000000..064ff617fd2e
--- /dev/null
+++ b/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/dsl_dataset.h
@@ -0,0 +1,457 @@
+/*
+ * CDDL HEADER START
+ *
+ * The contents of this file are subject to the terms of the
+ * Common Development and Distribution License (the "License").
+ * You may not use this file except in compliance with the License.
+ *
+ * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
+ * or http://www.opensolaris.org/os/licensing.
+ * See the License for the specific language governing permissions
+ * and limitations under the License.
+ *
+ * When distributing Covered Code, include this CDDL HEADER in each
+ * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
+ * If applicable, add the following below this CDDL HEADER, with the
+ * fields enclosed by brackets "[]" replaced with your own identifying
+ * information: Portions Copyright [yyyy] [name of copyright owner]
+ *
+ * CDDL HEADER END
+ */
+/*
+ * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved.
+ * Copyright (c) 2011, 2017 by Delphix. All rights reserved.
+ * Copyright (c) 2013, Joyent, Inc. All rights reserved.
+ * Copyright (c) 2013 Steven Hartland. All rights reserved.
+ * Copyright (c) 2014 Spectra Logic Corporation, All rights reserved.
+ * Copyright (c) 2014 Integros [integros.com]
+ */
+
+#ifndef _SYS_DSL_DATASET_H
+#define _SYS_DSL_DATASET_H
+
+#include <sys/dmu.h>
+#include <sys/spa.h>
+#include <sys/txg.h>
+#include <sys/zio.h>
+#include <sys/bplist.h>
+#include <sys/dsl_synctask.h>
+#include <sys/zfs_context.h>
+#include <sys/dsl_deadlist.h>
+#include <sys/refcount.h>
+#include <sys/rrwlock.h>
+#include <zfeature_common.h>
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+struct dsl_dataset;
+struct dsl_dir;
+struct dsl_pool;
+
+#define DS_FLAG_INCONSISTENT (1ULL<<0)
+#define DS_IS_INCONSISTENT(ds) \
+ (dsl_dataset_phys(ds)->ds_flags & DS_FLAG_INCONSISTENT)
+
+/*
+ * Do not allow this dataset to be promoted.
+ */
+#define DS_FLAG_NOPROMOTE (1ULL<<1)
+
+/*
+ * DS_FLAG_UNIQUE_ACCURATE is set if ds_unique_bytes has been correctly
+ * calculated for head datasets (starting with SPA_VERSION_UNIQUE_ACCURATE,
+ * refquota/refreservations).
+ */
+#define DS_FLAG_UNIQUE_ACCURATE (1ULL<<2)
+
+/*
+ * DS_FLAG_DEFER_DESTROY is set after 'zfs destroy -d' has been called
+ * on a dataset. This allows the dataset to be destroyed using 'zfs release'.
+ */
+#define DS_FLAG_DEFER_DESTROY (1ULL<<3)
+#define DS_IS_DEFER_DESTROY(ds) \
+ (dsl_dataset_phys(ds)->ds_flags & DS_FLAG_DEFER_DESTROY)
+
+/*
+ * DS_FIELD_* are strings that are used in the "extensified" dataset zap object.
+ * They should be of the format <reverse-dns>:<field>.
+ */
+
+/*
+ * This field's value is the object ID of a zap object which contains the
+ * bookmarks of this dataset. If it is present, then this dataset is counted
+ * in the refcount of the SPA_FEATURE_BOOKMARKS feature.
+ */
+#define DS_FIELD_BOOKMARK_NAMES "com.delphix:bookmarks"
+
+/*
+ * This field is present (with value=0) if this dataset may contain large
+ * dnodes (>512B). If it is present, then this dataset is counted in the
+ * refcount of the SPA_FEATURE_LARGE_DNODE feature.
+ */
+#define DS_FIELD_LARGE_DNODE "org.zfsonlinux:large_dnode"
+
+/*
+ * These fields are set on datasets that are in the middle of a resumable
+ * receive, and allow the sender to resume the send if it is interrupted.
+ */
+#define DS_FIELD_RESUME_FROMGUID "com.delphix:resume_fromguid"
+#define DS_FIELD_RESUME_TONAME "com.delphix:resume_toname"
+#define DS_FIELD_RESUME_TOGUID "com.delphix:resume_toguid"
+#define DS_FIELD_RESUME_OBJECT "com.delphix:resume_object"
+#define DS_FIELD_RESUME_OFFSET "com.delphix:resume_offset"
+#define DS_FIELD_RESUME_BYTES "com.delphix:resume_bytes"
+#define DS_FIELD_RESUME_LARGEBLOCK "com.delphix:resume_largeblockok"
+#define DS_FIELD_RESUME_EMBEDOK "com.delphix:resume_embedok"
+#define DS_FIELD_RESUME_COMPRESSOK "com.delphix:resume_compressok"
+
+/*
+ * This field is set to the object number of the remap deadlist if one exists.
+ */
+#define DS_FIELD_REMAP_DEADLIST "com.delphix:remap_deadlist"
+
+/*
+ * DS_FLAG_CI_DATASET is set if the dataset contains a file system whose
+ * name lookups should be performed case-insensitively.
+ */
+#define DS_FLAG_CI_DATASET (1ULL<<16)
+
+#define DS_CREATE_FLAG_NODIRTY (1ULL<<24)
+
+typedef struct dsl_dataset_phys {
+ uint64_t ds_dir_obj; /* DMU_OT_DSL_DIR */
+ uint64_t ds_prev_snap_obj; /* DMU_OT_DSL_DATASET */
+ uint64_t ds_prev_snap_txg;
+ uint64_t ds_next_snap_obj; /* DMU_OT_DSL_DATASET */
+ uint64_t ds_snapnames_zapobj; /* DMU_OT_DSL_DS_SNAP_MAP 0 for snaps */
+ uint64_t ds_num_children; /* clone/snap children; ==0 for head */
+ uint64_t ds_creation_time; /* seconds since 1970 */
+ uint64_t ds_creation_txg;
+ uint64_t ds_deadlist_obj; /* DMU_OT_DEADLIST */
+ /*
+ * ds_referenced_bytes, ds_compressed_bytes, and ds_uncompressed_bytes
+ * include all blocks referenced by this dataset, including those
+ * shared with any other datasets.
+ */
+ uint64_t ds_referenced_bytes;
+ uint64_t ds_compressed_bytes;
+ uint64_t ds_uncompressed_bytes;
+ uint64_t ds_unique_bytes; /* only relevant to snapshots */
+ /*
+ * The ds_fsid_guid is a 56-bit ID that can change to avoid
+ * collisions. The ds_guid is a 64-bit ID that will never
+ * change, so there is a small probability that it will collide.
+ */
+ uint64_t ds_fsid_guid;
+ uint64_t ds_guid;
+ uint64_t ds_flags; /* DS_FLAG_* */
+ blkptr_t ds_bp;
+ uint64_t ds_next_clones_obj; /* DMU_OT_DSL_CLONES */
+ uint64_t ds_props_obj; /* DMU_OT_DSL_PROPS for snaps */
+ uint64_t ds_userrefs_obj; /* DMU_OT_USERREFS */
+ uint64_t ds_pad[5]; /* pad out to 320 bytes for good measure */
+} dsl_dataset_phys_t;
+
+typedef struct dsl_dataset {
+ dmu_buf_user_t ds_dbu;
+ rrwlock_t ds_bp_rwlock; /* Protects ds_phys->ds_bp */
+
+ /* Immutable: */
+ struct dsl_dir *ds_dir;
+ dmu_buf_t *ds_dbuf;
+ uint64_t ds_object;
+ uint64_t ds_fsid_guid;
+ boolean_t ds_is_snapshot;
+
+ /* only used in syncing context, only valid for non-snapshots: */
+ struct dsl_dataset *ds_prev;
+ uint64_t ds_bookmarks; /* DMU_OTN_ZAP_METADATA */
+
+ /* has internal locking: */
+ dsl_deadlist_t ds_deadlist;
+ bplist_t ds_pending_deadlist;
+
+ /*
+ * The remap deadlist contains blocks (DVA's, really) that are
+ * referenced by the previous snapshot and point to indirect vdevs,
+ * but in this dataset they have been remapped to point to concrete
+ * (or at least, less-indirect) vdevs. In other words, the
+ * physical DVA is referenced by the previous snapshot but not by
+ * this dataset. Logically, the DVA continues to be referenced,
+ * but we are using a different (less indirect) physical DVA.
+ * This deadlist is used to determine when physical DVAs that
+ * point to indirect vdevs are no longer referenced anywhere,
+ * and thus should be marked obsolete.
+ *
+ * This is only used if SPA_FEATURE_OBSOLETE_COUNTS is enabled.
+ */
+ dsl_deadlist_t ds_remap_deadlist;
+ /* protects creation of the ds_remap_deadlist */
+ kmutex_t ds_remap_deadlist_lock;
+
+ /* protected by lock on pool's dp_dirty_datasets list */
+ txg_node_t ds_dirty_link;
+ list_node_t ds_synced_link;
+
+ /*
+ * ds_phys->ds_<accounting> is also protected by ds_lock.
+ * Protected by ds_lock:
+ */
+ kmutex_t ds_lock;
+ objset_t *ds_objset;
+ uint64_t ds_userrefs;
+ void *ds_owner;
+
+ /*
+ * Long holds prevent the ds from being destroyed; they allow the
+ * ds to remain held even after dropping the dp_config_rwlock.
+	 * Owning counts as a long hold. See the comments above
+	 * dsl_pool_hold() for details, and the usage sketch following
+	 * this structure definition.
+ */
+ zfs_refcount_t ds_longholds;
+
+ /* no locking; only for making guesses */
+ uint64_t ds_trysnap_txg;
+
+ /* for objset_open() */
+ kmutex_t ds_opening_lock;
+
+ uint64_t ds_reserved; /* cached refreservation */
+ uint64_t ds_quota; /* cached refquota */
+
+ kmutex_t ds_sendstream_lock;
+ list_t ds_sendstreams;
+
+ /*
+ * When in the middle of a resumable receive, tracks how much
+ * progress we have made.
+ */
+ uint64_t ds_resume_object[TXG_SIZE];
+ uint64_t ds_resume_offset[TXG_SIZE];
+ uint64_t ds_resume_bytes[TXG_SIZE];
+
+ /* Protected by our dsl_dir's dd_lock */
+ list_t ds_prop_cbs;
+
+ /*
+ * For ZFEATURE_FLAG_PER_DATASET features, set if this dataset
+ * uses this feature.
+ */
+ uint8_t ds_feature_inuse[SPA_FEATURES];
+
+ /*
+ * Set if we need to activate the feature on this dataset this txg
+ * (used only in syncing context).
+ */
+ uint8_t ds_feature_activation_needed[SPA_FEATURES];
+
+ /* Protected by ds_lock; keep at end of struct for better locality */
+ char ds_snapname[ZFS_MAX_DATASET_NAME_LEN];
+} dsl_dataset_t;
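+
+/*
+ * A minimal long-hold sketch, assuming the standard FTAG tag
+ * convention; the long hold keeps the dataset alive even after the
+ * pool hold (and with it dp_config_rwlock) is dropped:
+ *
+ *	dsl_dataset_long_hold(ds, FTAG);
+ *	dsl_pool_rele(dp, FTAG);
+ *	... use ds without the pool configuration lock ...
+ *	dsl_dataset_long_rele(ds, FTAG);
+ */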
+
+static inline dsl_dataset_phys_t *
+dsl_dataset_phys(dsl_dataset_t *ds)
+{
+ return (ds->ds_dbuf->db_data);
+}
+
+typedef struct dsl_dataset_promote_arg {
+ const char *ddpa_clonename;
+ dsl_dataset_t *ddpa_clone;
+ list_t shared_snaps, origin_snaps, clone_snaps;
+ dsl_dataset_t *origin_origin; /* origin of the origin */
+ uint64_t used, comp, uncomp, unique, cloneusedsnap, originusedsnap;
+ nvlist_t *err_ds;
+ cred_t *cr;
+} dsl_dataset_promote_arg_t;
+
+typedef struct dsl_dataset_rollback_arg {
+ const char *ddra_fsname;
+ const char *ddra_tosnap;
+ void *ddra_owner;
+ nvlist_t *ddra_result;
+} dsl_dataset_rollback_arg_t;
+
+typedef struct dsl_dataset_snapshot_arg {
+ nvlist_t *ddsa_snaps;
+ nvlist_t *ddsa_props;
+ nvlist_t *ddsa_errors;
+ cred_t *ddsa_cr;
+} dsl_dataset_snapshot_arg_t;
+
+/*
+ * The max length of a temporary tag prefix is the number of hex digits
+ * required to express UINT64_MAX (16) plus one for the hyphen.
+ */
+#define MAX_TAG_PREFIX_LEN 17
+
+#define dsl_dataset_is_snapshot(ds) \
+ (dsl_dataset_phys(ds)->ds_num_children != 0)
+
+#define DS_UNIQUE_IS_ACCURATE(ds) \
+ ((dsl_dataset_phys(ds)->ds_flags & DS_FLAG_UNIQUE_ACCURATE) != 0)
+
+int dsl_dataset_hold(struct dsl_pool *dp, const char *name, void *tag,
+ dsl_dataset_t **dsp);
+boolean_t dsl_dataset_try_add_ref(struct dsl_pool *dp, dsl_dataset_t *ds,
+ void *tag);
+int dsl_dataset_hold_obj(struct dsl_pool *dp, uint64_t dsobj, void *tag,
+ dsl_dataset_t **);
+void dsl_dataset_rele(dsl_dataset_t *ds, void *tag);
+int dsl_dataset_own(struct dsl_pool *dp, const char *name,
+ void *tag, dsl_dataset_t **dsp);
+int dsl_dataset_own_obj(struct dsl_pool *dp, uint64_t dsobj,
+ void *tag, dsl_dataset_t **dsp);
+void dsl_dataset_disown(dsl_dataset_t *ds, void *tag);
+void dsl_dataset_name(dsl_dataset_t *ds, char *name);
+boolean_t dsl_dataset_tryown(dsl_dataset_t *ds, void *tag);
+int dsl_dataset_namelen(dsl_dataset_t *ds);
+boolean_t dsl_dataset_has_owner(dsl_dataset_t *ds);
+uint64_t dsl_dataset_create_sync(dsl_dir_t *pds, const char *lastname,
+ dsl_dataset_t *origin, uint64_t flags, cred_t *, dmu_tx_t *);
+uint64_t dsl_dataset_create_sync_dd(dsl_dir_t *dd, dsl_dataset_t *origin,
+ uint64_t flags, dmu_tx_t *tx);
+void dsl_dataset_snapshot_sync(void *arg, dmu_tx_t *tx);
+int dsl_dataset_snapshot_check(void *arg, dmu_tx_t *tx);
+int dsl_dataset_snapshot(nvlist_t *snaps, nvlist_t *props, nvlist_t *errors);
+void dsl_dataset_promote_sync(void *arg, dmu_tx_t *tx);
+int dsl_dataset_promote_check(void *arg, dmu_tx_t *tx);
+int dsl_dataset_promote(const char *name, char *conflsnap);
+int dsl_dataset_clone_swap(dsl_dataset_t *clone, dsl_dataset_t *origin_head,
+ boolean_t force);
+int dsl_dataset_rename_snapshot(const char *fsname,
+ const char *oldsnapname, const char *newsnapname, boolean_t recursive);
+int dsl_dataset_snapshot_tmp(const char *fsname, const char *snapname,
+ minor_t cleanup_minor, const char *htag);
+
+blkptr_t *dsl_dataset_get_blkptr(dsl_dataset_t *ds);
+
+spa_t *dsl_dataset_get_spa(dsl_dataset_t *ds);
+
+boolean_t dsl_dataset_modified_since_snap(dsl_dataset_t *ds,
+ dsl_dataset_t *snap);
+
+void dsl_dataset_sync(dsl_dataset_t *os, zio_t *zio, dmu_tx_t *tx);
+void dsl_dataset_sync_done(dsl_dataset_t *os, dmu_tx_t *tx);
+
+void dsl_dataset_block_born(dsl_dataset_t *ds, const blkptr_t *bp,
+ dmu_tx_t *tx);
+int dsl_dataset_block_kill(dsl_dataset_t *ds, const blkptr_t *bp,
+ dmu_tx_t *tx, boolean_t async);
+void dsl_dataset_block_remapped(dsl_dataset_t *ds, uint64_t vdev,
+ uint64_t offset, uint64_t size, uint64_t birth, dmu_tx_t *tx);
+
+void dsl_dataset_dirty(dsl_dataset_t *ds, dmu_tx_t *tx);
+
+int get_clones_stat_impl(dsl_dataset_t *ds, nvlist_t *val);
+char *get_receive_resume_stats_impl(dsl_dataset_t *ds);
+char *get_child_receive_stats(dsl_dataset_t *ds);
+uint64_t dsl_get_refratio(dsl_dataset_t *ds);
+uint64_t dsl_get_logicalreferenced(dsl_dataset_t *ds);
+uint64_t dsl_get_compressratio(dsl_dataset_t *ds);
+uint64_t dsl_get_used(dsl_dataset_t *ds);
+uint64_t dsl_get_creation(dsl_dataset_t *ds);
+uint64_t dsl_get_creationtxg(dsl_dataset_t *ds);
+uint64_t dsl_get_refquota(dsl_dataset_t *ds);
+uint64_t dsl_get_refreservation(dsl_dataset_t *ds);
+uint64_t dsl_get_guid(dsl_dataset_t *ds);
+uint64_t dsl_get_unique(dsl_dataset_t *ds);
+uint64_t dsl_get_objsetid(dsl_dataset_t *ds);
+uint64_t dsl_get_userrefs(dsl_dataset_t *ds);
+uint64_t dsl_get_defer_destroy(dsl_dataset_t *ds);
+uint64_t dsl_get_referenced(dsl_dataset_t *ds);
+uint64_t dsl_get_numclones(dsl_dataset_t *ds);
+uint64_t dsl_get_inconsistent(dsl_dataset_t *ds);
+uint64_t dsl_get_available(dsl_dataset_t *ds);
+int dsl_get_written(dsl_dataset_t *ds, uint64_t *written);
+int dsl_get_prev_snap(dsl_dataset_t *ds, char *snap);
+int dsl_get_mountpoint(dsl_dataset_t *ds, const char *dsname, char *value,
+ char *source);
+
+void get_clones_stat(dsl_dataset_t *ds, nvlist_t *nv);
+
+void dsl_dataset_stats(dsl_dataset_t *os, nvlist_t *nv);
+
+void dsl_dataset_fast_stat(dsl_dataset_t *ds, dmu_objset_stats_t *stat);
+void dsl_dataset_space(dsl_dataset_t *ds,
+ uint64_t *refdbytesp, uint64_t *availbytesp,
+ uint64_t *usedobjsp, uint64_t *availobjsp);
+uint64_t dsl_dataset_fsid_guid(dsl_dataset_t *ds);
+int dsl_dataset_space_written(dsl_dataset_t *oldsnap, dsl_dataset_t *new,
+ uint64_t *usedp, uint64_t *compp, uint64_t *uncompp);
+int dsl_dataset_space_wouldfree(dsl_dataset_t *firstsnap, dsl_dataset_t *last,
+ uint64_t *usedp, uint64_t *compp, uint64_t *uncompp);
+boolean_t dsl_dataset_is_dirty(dsl_dataset_t *ds);
+
+int dsl_dsobj_to_dsname(char *pname, uint64_t obj, char *buf);
+
+int dsl_dataset_check_quota(dsl_dataset_t *ds, boolean_t check_quota,
+ uint64_t asize, uint64_t inflight, uint64_t *used,
+ uint64_t *ref_rsrv);
+int dsl_dataset_set_refquota(const char *dsname, zprop_source_t source,
+ uint64_t quota);
+int dsl_dataset_set_refreservation(const char *dsname, zprop_source_t source,
+ uint64_t reservation);
+
+boolean_t dsl_dataset_is_before(dsl_dataset_t *later, dsl_dataset_t *earlier,
+ uint64_t earlier_txg);
+void dsl_dataset_long_hold(dsl_dataset_t *ds, void *tag);
+void dsl_dataset_long_rele(dsl_dataset_t *ds, void *tag);
+boolean_t dsl_dataset_long_held(dsl_dataset_t *ds);
+
+int dsl_dataset_clone_swap_check_impl(dsl_dataset_t *clone,
+ dsl_dataset_t *origin_head, boolean_t force, void *owner, dmu_tx_t *tx);
+void dsl_dataset_clone_swap_sync_impl(dsl_dataset_t *clone,
+ dsl_dataset_t *origin_head, dmu_tx_t *tx);
+int dsl_dataset_snapshot_check_impl(dsl_dataset_t *ds, const char *snapname,
+ dmu_tx_t *tx, boolean_t recv, uint64_t cnt, cred_t *cr);
+void dsl_dataset_snapshot_sync_impl(dsl_dataset_t *ds, const char *snapname,
+ dmu_tx_t *tx);
+
+void dsl_dataset_remove_from_next_clones(dsl_dataset_t *ds, uint64_t obj,
+ dmu_tx_t *tx);
+void dsl_dataset_recalc_head_uniq(dsl_dataset_t *ds);
+int dsl_dataset_get_snapname(dsl_dataset_t *ds);
+int dsl_dataset_snap_lookup(dsl_dataset_t *ds, const char *name,
+ uint64_t *value);
+int dsl_dataset_snap_remove(dsl_dataset_t *ds, const char *name, dmu_tx_t *tx,
+ boolean_t adj_cnt);
+void dsl_dataset_set_refreservation_sync_impl(dsl_dataset_t *ds,
+ zprop_source_t source, uint64_t value, dmu_tx_t *tx);
+void dsl_dataset_zapify(dsl_dataset_t *ds, dmu_tx_t *tx);
+boolean_t dsl_dataset_is_zapified(dsl_dataset_t *ds);
+boolean_t dsl_dataset_has_resume_receive_state(dsl_dataset_t *ds);
+
+int dsl_dataset_rollback_check(void *arg, dmu_tx_t *tx);
+void dsl_dataset_rollback_sync(void *arg, dmu_tx_t *tx);
+int dsl_dataset_rollback(const char *fsname, const char *tosnap, void *owner,
+ nvlist_t *result);
+
+uint64_t dsl_dataset_get_remap_deadlist_object(dsl_dataset_t *ds);
+void dsl_dataset_create_remap_deadlist(dsl_dataset_t *ds, dmu_tx_t *tx);
+boolean_t dsl_dataset_remap_deadlist_exists(dsl_dataset_t *ds);
+void dsl_dataset_destroy_remap_deadlist(dsl_dataset_t *ds, dmu_tx_t *tx);
+
+void dsl_dataset_deactivate_feature(uint64_t dsobj,
+ spa_feature_t f, dmu_tx_t *tx);
+
+#ifdef ZFS_DEBUG
+#define dprintf_ds(ds, fmt, ...) do { \
+ if (zfs_flags & ZFS_DEBUG_DPRINTF) { \
+ char *__ds_name = kmem_alloc(ZFS_MAX_DATASET_NAME_LEN, KM_SLEEP); \
+ dsl_dataset_name(ds, __ds_name); \
+ dprintf("ds=%s " fmt, __ds_name, __VA_ARGS__); \
+ kmem_free(__ds_name, ZFS_MAX_DATASET_NAME_LEN); \
+ } \
+_NOTE(CONSTCOND) } while (0)
+#else
+#define dprintf_ds(ds, fmt, ...)
+#endif
+
+#ifdef __cplusplus
+}
+#endif
+
+#endif /* _SYS_DSL_DATASET_H */
diff --git a/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/dsl_deadlist.h b/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/dsl_deadlist.h
new file mode 100644
index 000000000000..08f38233d7ab
--- /dev/null
+++ b/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/dsl_deadlist.h
@@ -0,0 +1,89 @@
+/*
+ * CDDL HEADER START
+ *
+ * The contents of this file are subject to the terms of the
+ * Common Development and Distribution License (the "License").
+ * You may not use this file except in compliance with the License.
+ *
+ * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
+ * or http://www.opensolaris.org/os/licensing.
+ * See the License for the specific language governing permissions
+ * and limitations under the License.
+ *
+ * When distributing Covered Code, include this CDDL HEADER in each
+ * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
+ * If applicable, add the following below this CDDL HEADER, with the
+ * fields enclosed by brackets "[]" replaced with your own identifying
+ * information: Portions Copyright [yyyy] [name of copyright owner]
+ *
+ * CDDL HEADER END
+ */
+/*
+ * Copyright (c) 2010, Oracle and/or its affiliates. All rights reserved.
+ * Copyright (c) 2015 by Delphix. All rights reserved.
+ */
+
+#ifndef _SYS_DSL_DEADLIST_H
+#define _SYS_DSL_DEADLIST_H
+
+#include <sys/bpobj.h>
+#include <sys/zfs_context.h>
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+struct dmu_buf;
+struct dsl_dataset;
+
+typedef struct dsl_deadlist_phys {
+ uint64_t dl_used;
+ uint64_t dl_comp;
+ uint64_t dl_uncomp;
+	uint64_t dl_pad[37]; /* pad out to 320 bytes for future expansion */
+} dsl_deadlist_phys_t;
+
+typedef struct dsl_deadlist {
+ objset_t *dl_os;
+ uint64_t dl_object;
+ avl_tree_t dl_tree;
+ boolean_t dl_havetree;
+ struct dmu_buf *dl_dbuf;
+ dsl_deadlist_phys_t *dl_phys;
+ kmutex_t dl_lock;
+
+ /* if it's the old on-disk format: */
+ bpobj_t dl_bpobj;
+ boolean_t dl_oldfmt;
+} dsl_deadlist_t;
+
+typedef struct dsl_deadlist_entry {
+ avl_node_t dle_node;
+ uint64_t dle_mintxg;
+ bpobj_t dle_bpobj;
+} dsl_deadlist_entry_t;
+
+void dsl_deadlist_open(dsl_deadlist_t *dl, objset_t *os, uint64_t object);
+void dsl_deadlist_close(dsl_deadlist_t *dl);
+uint64_t dsl_deadlist_alloc(objset_t *os, dmu_tx_t *tx);
+void dsl_deadlist_free(objset_t *os, uint64_t dlobj, dmu_tx_t *tx);
+void dsl_deadlist_insert(dsl_deadlist_t *dl, const blkptr_t *bp, dmu_tx_t *tx);
+void dsl_deadlist_add_key(dsl_deadlist_t *dl, uint64_t mintxg, dmu_tx_t *tx);
+void dsl_deadlist_remove_key(dsl_deadlist_t *dl, uint64_t mintxg, dmu_tx_t *tx);
+uint64_t dsl_deadlist_clone(dsl_deadlist_t *dl, uint64_t maxtxg,
+ uint64_t mrs_obj, dmu_tx_t *tx);
+void dsl_deadlist_space(dsl_deadlist_t *dl,
+ uint64_t *usedp, uint64_t *compp, uint64_t *uncompp);
+void dsl_deadlist_space_range(dsl_deadlist_t *dl,
+ uint64_t mintxg, uint64_t maxtxg,
+ uint64_t *usedp, uint64_t *compp, uint64_t *uncompp);
+void dsl_deadlist_merge(dsl_deadlist_t *dl, uint64_t obj, dmu_tx_t *tx);
+void dsl_deadlist_move_bpobj(dsl_deadlist_t *dl, bpobj_t *bpo, uint64_t mintxg,
+ dmu_tx_t *tx);
+boolean_t dsl_deadlist_is_open(dsl_deadlist_t *dl);
+
+#ifdef __cplusplus
+}
+#endif
+
+#endif /* _SYS_DSL_DEADLIST_H */
diff --git a/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/dsl_deleg.h b/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/dsl_deleg.h
new file mode 100644
index 000000000000..6fb6a121ade6
--- /dev/null
+++ b/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/dsl_deleg.h
@@ -0,0 +1,81 @@
+/*
+ * CDDL HEADER START
+ *
+ * The contents of this file are subject to the terms of the
+ * Common Development and Distribution License (the "License").
+ * You may not use this file except in compliance with the License.
+ *
+ * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
+ * or http://www.opensolaris.org/os/licensing.
+ * See the License for the specific language governing permissions
+ * and limitations under the License.
+ *
+ * When distributing Covered Code, include this CDDL HEADER in each
+ * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
+ * If applicable, add the following below this CDDL HEADER, with the
+ * fields enclosed by brackets "[]" replaced with your own identifying
+ * information: Portions Copyright [yyyy] [name of copyright owner]
+ *
+ * CDDL HEADER END
+ */
+/*
+ * Copyright (c) 2007, 2010, Oracle and/or its affiliates. All rights reserved.
+ * Copyright (c) 2013, 2015 by Delphix. All rights reserved.
+ */
+
+#ifndef _SYS_DSL_DELEG_H
+#define _SYS_DSL_DELEG_H
+
+#include <sys/dmu.h>
+#include <sys/dsl_pool.h>
+#include <sys/zfs_context.h>
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+#define ZFS_DELEG_PERM_NONE ""
+#define ZFS_DELEG_PERM_CREATE "create"
+#define ZFS_DELEG_PERM_DESTROY "destroy"
+#define ZFS_DELEG_PERM_SNAPSHOT "snapshot"
+#define ZFS_DELEG_PERM_ROLLBACK "rollback"
+#define ZFS_DELEG_PERM_CLONE "clone"
+#define ZFS_DELEG_PERM_PROMOTE "promote"
+#define ZFS_DELEG_PERM_RENAME "rename"
+#define ZFS_DELEG_PERM_MOUNT "mount"
+#define ZFS_DELEG_PERM_SHARE "share"
+#define ZFS_DELEG_PERM_SEND "send"
+#define ZFS_DELEG_PERM_RECEIVE "receive"
+#define ZFS_DELEG_PERM_ALLOW "allow"
+#define ZFS_DELEG_PERM_USERPROP "userprop"
+#define ZFS_DELEG_PERM_VSCAN "vscan"
+#define ZFS_DELEG_PERM_USERQUOTA "userquota"
+#define ZFS_DELEG_PERM_GROUPQUOTA "groupquota"
+#define ZFS_DELEG_PERM_USERUSED "userused"
+#define ZFS_DELEG_PERM_GROUPUSED "groupused"
+#define ZFS_DELEG_PERM_HOLD "hold"
+#define ZFS_DELEG_PERM_RELEASE "release"
+#define ZFS_DELEG_PERM_DIFF "diff"
+#define ZFS_DELEG_PERM_BOOKMARK "bookmark"
+#define ZFS_DELEG_PERM_REMAP "remap"
+
+/*
+ * Note: the names of properties that are marked delegatable are also
+ * valid delegated permissions
+ */
+
+int dsl_deleg_get(const char *ddname, nvlist_t **nvp);
+int dsl_deleg_set(const char *ddname, nvlist_t *nvp, boolean_t unset);
+int dsl_deleg_access(const char *ddname, const char *perm, cred_t *cr);
+int dsl_deleg_access_impl(struct dsl_dataset *ds, const char *perm, cred_t *cr);
+void dsl_deleg_set_create_perms(dsl_dir_t *dd, dmu_tx_t *tx, cred_t *cr);
+int dsl_deleg_can_allow(char *ddname, nvlist_t *nvp, cred_t *cr);
+int dsl_deleg_can_unallow(char *ddname, nvlist_t *nvp, cred_t *cr);
+int dsl_deleg_destroy(objset_t *os, uint64_t zapobj, dmu_tx_t *tx);
+boolean_t dsl_delegation_on(objset_t *os);
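+
+/*
+ * A minimal access-check sketch, assuming the caller holds a
+ * credential; the dataset name is illustrative:
+ *
+ *	if (dsl_deleg_access("tank/home/alice",
+ *	    ZFS_DELEG_PERM_SNAPSHOT, cr) == 0) {
+ *		... permission granted ...
+ *	}
+ */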
+
+#ifdef __cplusplus
+}
+#endif
+
+#endif /* _SYS_DSL_DELEG_H */
diff --git a/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/dsl_destroy.h b/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/dsl_destroy.h
new file mode 100644
index 000000000000..ae3ca0cfbd5e
--- /dev/null
+++ b/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/dsl_destroy.h
@@ -0,0 +1,68 @@
+/*
+ * CDDL HEADER START
+ *
+ * The contents of this file are subject to the terms of the
+ * Common Development and Distribution License (the "License").
+ * You may not use this file except in compliance with the License.
+ *
+ * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
+ * or http://www.opensolaris.org/os/licensing.
+ * See the License for the specific language governing permissions
+ * and limitations under the License.
+ *
+ * When distributing Covered Code, include this CDDL HEADER in each
+ * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
+ * If applicable, add the following below this CDDL HEADER, with the
+ * fields enclosed by brackets "[]" replaced with your own identifying
+ * information: Portions Copyright [yyyy] [name of copyright owner]
+ *
+ * CDDL HEADER END
+ */
+/*
+ * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved.
+ * Copyright (c) 2012, 2016 by Delphix. All rights reserved.
+ * Copyright (c) 2012, Joyent, Inc. All rights reserved.
+ */
+
+#ifndef _SYS_DSL_DESTROY_H
+#define _SYS_DSL_DESTROY_H
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+struct nvlist;
+struct dsl_dataset;
+struct dmu_tx;
+
+int dsl_destroy_snapshots_nvl(struct nvlist *, boolean_t,
+ struct nvlist *);
+int dsl_destroy_snapshot(const char *, boolean_t);
+int dsl_destroy_head(const char *);
+int dsl_destroy_head_check_impl(struct dsl_dataset *, int);
+void dsl_destroy_head_sync_impl(struct dsl_dataset *, struct dmu_tx *);
+int dsl_destroy_inconsistent(const char *, void *);
+int dsl_destroy_snapshot_check_impl(struct dsl_dataset *, boolean_t);
+void dsl_destroy_snapshot_sync_impl(struct dsl_dataset *,
+ boolean_t, struct dmu_tx *);
+
+typedef struct dsl_destroy_snapshot_arg {
+ const char *ddsa_name;
+ boolean_t ddsa_defer;
+} dsl_destroy_snapshot_arg_t;
+
+int dsl_destroy_snapshot_check(void *, dmu_tx_t *);
+void dsl_destroy_snapshot_sync(void *, dmu_tx_t *);
+
+typedef struct dsl_destroy_head_arg {
+ const char *ddha_name;
+} dsl_destroy_head_arg_t;
+
+int dsl_destroy_head_check(void *, dmu_tx_t *);
+void dsl_destroy_head_sync(void *, dmu_tx_t *);
+
+#ifdef __cplusplus
+}
+#endif
+
+#endif /* _SYS_DSL_DESTROY_H */
diff --git a/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/dsl_dir.h b/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/dsl_dir.h
new file mode 100644
index 000000000000..21d953cb6013
--- /dev/null
+++ b/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/dsl_dir.h
@@ -0,0 +1,209 @@
+/*
+ * CDDL HEADER START
+ *
+ * The contents of this file are subject to the terms of the
+ * Common Development and Distribution License (the "License").
+ * You may not use this file except in compliance with the License.
+ *
+ * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
+ * or http://www.opensolaris.org/os/licensing.
+ * See the License for the specific language governing permissions
+ * and limitations under the License.
+ *
+ * When distributing Covered Code, include this CDDL HEADER in each
+ * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
+ * If applicable, add the following below this CDDL HEADER, with the
+ * fields enclosed by brackets "[]" replaced with your own identifying
+ * information: Portions Copyright [yyyy] [name of copyright owner]
+ *
+ * CDDL HEADER END
+ */
+/*
+ * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved.
+ * Copyright (c) 2012, 2017 by Delphix. All rights reserved.
+ * Copyright (c) 2014, Joyent, Inc. All rights reserved.
+ * Copyright (c) 2014 Spectra Logic Corporation, All rights reserved.
+ */
+
+#ifndef _SYS_DSL_DIR_H
+#define _SYS_DSL_DIR_H
+
+#include <sys/dmu.h>
+#include <sys/dsl_pool.h>
+#include <sys/dsl_synctask.h>
+#include <sys/refcount.h>
+#include <sys/zfs_context.h>
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+struct dsl_dataset;
+
+/*
+ * DD_FIELD_* are strings that are used in the "extensified" dsl_dir zap object.
+ * They should be of the format <reverse-dns>:<field>.
+ */
+
+#define DD_FIELD_FILESYSTEM_COUNT "com.joyent:filesystem_count"
+#define DD_FIELD_SNAPSHOT_COUNT "com.joyent:snapshot_count"
+#define DD_FIELD_LAST_REMAP_TXG "com.delphix:last_remap_txg"
+
+typedef enum dd_used {
+ DD_USED_HEAD,
+ DD_USED_SNAP,
+ DD_USED_CHILD,
+ DD_USED_CHILD_RSRV,
+ DD_USED_REFRSRV,
+ DD_USED_NUM
+} dd_used_t;
+
+#define DD_FLAG_USED_BREAKDOWN (1<<0)
+
+typedef struct dsl_dir_phys {
+ uint64_t dd_creation_time; /* not actually used */
+ uint64_t dd_head_dataset_obj;
+ uint64_t dd_parent_obj;
+ uint64_t dd_origin_obj;
+ uint64_t dd_child_dir_zapobj;
+ /*
+ * how much space our children are accounting for; for leaf
+ * datasets, == physical space used by fs + snaps
+ */
+ uint64_t dd_used_bytes;
+ uint64_t dd_compressed_bytes;
+ uint64_t dd_uncompressed_bytes;
+ /* Administrative quota setting */
+ uint64_t dd_quota;
+ /* Administrative reservation setting */
+ uint64_t dd_reserved;
+ uint64_t dd_props_zapobj;
+ uint64_t dd_deleg_zapobj; /* dataset delegation permissions */
+ uint64_t dd_flags;
+ uint64_t dd_used_breakdown[DD_USED_NUM];
+ uint64_t dd_clones; /* dsl_dir objects */
+ uint64_t dd_pad[13]; /* pad out to 256 bytes for good measure */
+} dsl_dir_phys_t;
+
+struct dsl_dir {
+ dmu_buf_user_t dd_dbu;
+
+ /* These are immutable; no lock needed: */
+ uint64_t dd_object;
+ dsl_pool_t *dd_pool;
+
+ /* Stable until user eviction; no lock needed: */
+ dmu_buf_t *dd_dbuf;
+
+ /* protected by lock on pool's dp_dirty_dirs list */
+ txg_node_t dd_dirty_link;
+
+ /* protected by dp_config_rwlock */
+ dsl_dir_t *dd_parent;
+
+ /* Protected by dd_lock */
+ kmutex_t dd_lock;
+ list_t dd_props; /* list of dsl_prop_record_t's */
+ timestruc_t dd_snap_cmtime; /* last time snapshot namespace changed */
+ uint64_t dd_origin_txg;
+
+ /* gross estimate of space used by in-flight tx's */
+ uint64_t dd_tempreserved[TXG_SIZE];
+ /* amount of space we expect to write; == amount of dirty data */
+ int64_t dd_space_towrite[TXG_SIZE];
+
+ /* protected by dd_lock; keep at end of struct for better locality */
+ char dd_myname[ZFS_MAX_DATASET_NAME_LEN];
+};
+
+static inline dsl_dir_phys_t *
+dsl_dir_phys(dsl_dir_t *dd)
+{
+ return (dd->dd_dbuf->db_data);
+}
+
+void dsl_dir_rele(dsl_dir_t *dd, void *tag);
+void dsl_dir_async_rele(dsl_dir_t *dd, void *tag);
+int dsl_dir_hold(dsl_pool_t *dp, const char *name, void *tag,
+ dsl_dir_t **, const char **tail);
+int dsl_dir_hold_obj(dsl_pool_t *dp, uint64_t ddobj,
+ const char *tail, void *tag, dsl_dir_t **);
+void dsl_dir_name(dsl_dir_t *dd, char *buf);
+int dsl_dir_namelen(dsl_dir_t *dd);
+uint64_t dsl_dir_create_sync(dsl_pool_t *dp, dsl_dir_t *pds,
+ const char *name, dmu_tx_t *tx);
+
+uint64_t dsl_dir_get_used(dsl_dir_t *dd);
+uint64_t dsl_dir_get_compressed(dsl_dir_t *dd);
+uint64_t dsl_dir_get_quota(dsl_dir_t *dd);
+uint64_t dsl_dir_get_reservation(dsl_dir_t *dd);
+uint64_t dsl_dir_get_compressratio(dsl_dir_t *dd);
+uint64_t dsl_dir_get_logicalused(dsl_dir_t *dd);
+uint64_t dsl_dir_get_usedsnap(dsl_dir_t *dd);
+uint64_t dsl_dir_get_usedds(dsl_dir_t *dd);
+uint64_t dsl_dir_get_usedrefreserv(dsl_dir_t *dd);
+uint64_t dsl_dir_get_usedchild(dsl_dir_t *dd);
+void dsl_dir_get_origin(dsl_dir_t *dd, char *buf);
+int dsl_dir_get_filesystem_count(dsl_dir_t *dd, uint64_t *count);
+int dsl_dir_get_snapshot_count(dsl_dir_t *dd, uint64_t *count);
+int dsl_dir_get_remaptxg(dsl_dir_t *dd, uint64_t *count);
+
+void dsl_dir_stats(dsl_dir_t *dd, nvlist_t *nv);
+uint64_t dsl_dir_space_available(dsl_dir_t *dd,
+ dsl_dir_t *ancestor, int64_t delta, int ondiskonly);
+void dsl_dir_dirty(dsl_dir_t *dd, dmu_tx_t *tx);
+void dsl_dir_sync(dsl_dir_t *dd, dmu_tx_t *tx);
+int dsl_dir_tempreserve_space(dsl_dir_t *dd, uint64_t mem,
+ uint64_t asize, boolean_t netfree, void **tr_cookiep, dmu_tx_t *tx);
+void dsl_dir_tempreserve_clear(void *tr_cookie, dmu_tx_t *tx);
+void dsl_dir_willuse_space(dsl_dir_t *dd, int64_t space, dmu_tx_t *tx);
+void dsl_dir_diduse_space(dsl_dir_t *dd, dd_used_t type,
+ int64_t used, int64_t compressed, int64_t uncompressed, dmu_tx_t *tx);
+void dsl_dir_transfer_space(dsl_dir_t *dd, int64_t delta,
+ dd_used_t oldtype, dd_used_t newtype, dmu_tx_t *tx);
+int dsl_dir_set_quota(const char *ddname, zprop_source_t source,
+ uint64_t quota);
+int dsl_dir_set_reservation(const char *ddname, zprop_source_t source,
+ uint64_t reservation);
+int dsl_dir_activate_fs_ss_limit(const char *);
+int dsl_fs_ss_limit_check(dsl_dir_t *, uint64_t, zfs_prop_t, dsl_dir_t *,
+ cred_t *);
+void dsl_fs_ss_count_adjust(dsl_dir_t *, int64_t, const char *, dmu_tx_t *);
+int dsl_dir_update_last_remap_txg(dsl_dir_t *, uint64_t);
+int dsl_dir_rename(const char *oldname, const char *newname);
+int dsl_dir_transfer_possible(dsl_dir_t *sdd, dsl_dir_t *tdd,
+ uint64_t fs_cnt, uint64_t ss_cnt, uint64_t space, cred_t *);
+boolean_t dsl_dir_is_clone(dsl_dir_t *dd);
+void dsl_dir_new_refreservation(dsl_dir_t *dd, struct dsl_dataset *ds,
+ uint64_t reservation, cred_t *cr, dmu_tx_t *tx);
+void dsl_dir_snap_cmtime_update(dsl_dir_t *dd);
+timestruc_t dsl_dir_snap_cmtime(dsl_dir_t *dd);
+void dsl_dir_set_reservation_sync_impl(dsl_dir_t *dd, uint64_t value,
+ dmu_tx_t *tx);
+void dsl_dir_zapify(dsl_dir_t *dd, dmu_tx_t *tx);
+boolean_t dsl_dir_is_zapified(dsl_dir_t *dd);
+
+/* internal reserved dir name */
+#define MOS_DIR_NAME "$MOS"
+#define ORIGIN_DIR_NAME "$ORIGIN"
+#define FREE_DIR_NAME "$FREE"
+#define LEAK_DIR_NAME "$LEAK"
+
+#ifdef ZFS_DEBUG
+#define dprintf_dd(dd, fmt, ...) do { \
+ if (zfs_flags & ZFS_DEBUG_DPRINTF) { \
+ char *__ds_name = kmem_alloc(ZFS_MAX_DATASET_NAME_LEN, KM_SLEEP); \
+ dsl_dir_name(dd, __ds_name); \
+ dprintf("dd=%s " fmt, __ds_name, __VA_ARGS__); \
+ kmem_free(__ds_name, ZFS_MAX_DATASET_NAME_LEN); \
+ } \
+_NOTE(CONSTCOND) } while (0)
+#else
+#define dprintf_dd(dd, fmt, ...)
+#endif
+
+#ifdef __cplusplus
+}
+#endif
+
+#endif /* _SYS_DSL_DIR_H */
diff --git a/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/dsl_pool.h b/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/dsl_pool.h
new file mode 100644
index 000000000000..7dce64bfd40b
--- /dev/null
+++ b/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/dsl_pool.h
@@ -0,0 +1,191 @@
+/*
+ * CDDL HEADER START
+ *
+ * The contents of this file are subject to the terms of the
+ * Common Development and Distribution License (the "License").
+ * You may not use this file except in compliance with the License.
+ *
+ * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
+ * or http://www.opensolaris.org/os/licensing.
+ * See the License for the specific language governing permissions
+ * and limitations under the License.
+ *
+ * When distributing Covered Code, include this CDDL HEADER in each
+ * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
+ * If applicable, add the following below this CDDL HEADER, with the
+ * fields enclosed by brackets "[]" replaced with your own identifying
+ * information: Portions Copyright [yyyy] [name of copyright owner]
+ *
+ * CDDL HEADER END
+ */
+/*
+ * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved.
+ * Copyright (c) 2013, 2017 by Delphix. All rights reserved.
+ * Copyright 2016 Nexenta Systems, Inc. All rights reserved.
+ */
+
+#ifndef _SYS_DSL_POOL_H
+#define _SYS_DSL_POOL_H
+
+#include <sys/spa.h>
+#include <sys/txg.h>
+#include <sys/txg_impl.h>
+#include <sys/zfs_context.h>
+#include <sys/zio.h>
+#include <sys/dnode.h>
+#include <sys/ddt.h>
+#include <sys/arc.h>
+#include <sys/bpobj.h>
+#include <sys/bptree.h>
+#include <sys/rrwlock.h>
+#include <sys/dsl_synctask.h>
+#include <sys/mmp.h>
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+struct objset;
+struct dsl_dir;
+struct dsl_dataset;
+struct dsl_pool;
+struct dmu_tx;
+struct dsl_scan;
+
+extern uint64_t zfs_dirty_data_max;
+extern uint64_t zfs_dirty_data_max_max;
+extern uint64_t zfs_dirty_data_sync_pct;
+extern int zfs_dirty_data_max_percent;
+extern int zfs_delay_min_dirty_percent;
+extern uint64_t zfs_delay_scale;
+
+/* These macros are for indexing into the zfs_all_blkstats_t. */
+#define DMU_OT_DEFERRED DMU_OT_NONE
+#define DMU_OT_OTHER	DMU_OT_NUMTYPES /* placeholder for DMU_OT() types */
+#define DMU_OT_TOTAL (DMU_OT_NUMTYPES + 1)
+
+typedef struct zfs_blkstat {
+ uint64_t zb_count;
+ uint64_t zb_asize;
+ uint64_t zb_lsize;
+ uint64_t zb_psize;
+ uint64_t zb_gangs;
+ uint64_t zb_ditto_2_of_2_samevdev;
+ uint64_t zb_ditto_2_of_3_samevdev;
+ uint64_t zb_ditto_3_of_3_samevdev;
+} zfs_blkstat_t;
+
+typedef struct zfs_all_blkstats {
+ zfs_blkstat_t zab_type[DN_MAX_LEVELS + 1][DMU_OT_TOTAL + 1];
+ kmutex_t zab_lock;
+} zfs_all_blkstats_t;
+
+
+typedef struct dsl_pool {
+ /* Immutable */
+ spa_t *dp_spa;
+ struct objset *dp_meta_objset;
+ struct dsl_dir *dp_root_dir;
+ struct dsl_dir *dp_mos_dir;
+ struct dsl_dir *dp_free_dir;
+ struct dsl_dir *dp_leak_dir;
+ struct dsl_dataset *dp_origin_snap;
+ uint64_t dp_root_dir_obj;
+ struct taskq *dp_vnrele_taskq;
+
+ /* No lock needed - sync context only */
+ blkptr_t dp_meta_rootbp;
+ uint64_t dp_tmp_userrefs_obj;
+ bpobj_t dp_free_bpobj;
+ uint64_t dp_bptree_obj;
+ uint64_t dp_empty_bpobj;
+ bpobj_t dp_obsolete_bpobj;
+
+ struct dsl_scan *dp_scan;
+
+ /* Uses dp_lock */
+ kmutex_t dp_lock;
+ kcondvar_t dp_spaceavail_cv;
+ uint64_t dp_dirty_pertxg[TXG_SIZE];
+ uint64_t dp_dirty_total;
+ uint64_t dp_long_free_dirty_pertxg[TXG_SIZE];
+ uint64_t dp_mos_used_delta;
+ uint64_t dp_mos_compressed_delta;
+ uint64_t dp_mos_uncompressed_delta;
+
+ /*
+ * Time of most recently scheduled (furthest in the future)
+ * wakeup for delayed transactions.
+ */
+ hrtime_t dp_last_wakeup;
+
+ /* Has its own locking */
+ tx_state_t dp_tx;
+ txg_list_t dp_dirty_datasets;
+ txg_list_t dp_dirty_zilogs;
+ txg_list_t dp_dirty_dirs;
+ txg_list_t dp_sync_tasks;
+ txg_list_t dp_early_sync_tasks;
+ taskq_t *dp_sync_taskq;
+ taskq_t *dp_zil_clean_taskq;
+
+ /*
+ * Protects administrative changes (properties, namespace)
+ *
+ * It is only held for write in syncing context. Therefore
+	 * syncing context does not need to ever have it for read, since
+	 * nobody else could possibly have it for write.  A usage sketch
+	 * follows this structure definition.
+ */
+ rrwlock_t dp_config_rwlock;
+
+ zfs_all_blkstats_t *dp_blkstats;
+} dsl_pool_t;
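+
+/*
+ * A minimal configuration-lock sketch, assuming the standard FTAG tag
+ * convention; namespace reads bracket their work with the wrappers
+ * declared below:
+ *
+ *	dsl_pool_config_enter(dp, FTAG);
+ *	... read the dataset/dir namespace ...
+ *	dsl_pool_config_exit(dp, FTAG);
+ */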
+
+int dsl_pool_init(spa_t *spa, uint64_t txg, dsl_pool_t **dpp);
+int dsl_pool_open(dsl_pool_t *dp);
+void dsl_pool_close(dsl_pool_t *dp);
+dsl_pool_t *dsl_pool_create(spa_t *spa, nvlist_t *zplprops, uint64_t txg);
+void dsl_pool_sync(dsl_pool_t *dp, uint64_t txg);
+void dsl_pool_sync_done(dsl_pool_t *dp, uint64_t txg);
+int dsl_pool_sync_context(dsl_pool_t *dp);
+uint64_t dsl_pool_adjustedsize(dsl_pool_t *dp, zfs_space_check_t slop_policy);
+uint64_t dsl_pool_unreserved_space(dsl_pool_t *dp,
+ zfs_space_check_t slop_policy);
+void dsl_pool_dirty_space(dsl_pool_t *dp, int64_t space, dmu_tx_t *tx);
+void dsl_pool_undirty_space(dsl_pool_t *dp, int64_t space, uint64_t txg);
+void dsl_free(dsl_pool_t *dp, uint64_t txg, const blkptr_t *bpp);
+void dsl_free_sync(zio_t *pio, dsl_pool_t *dp, uint64_t txg,
+ const blkptr_t *bpp);
+void dsl_pool_create_origin(dsl_pool_t *dp, dmu_tx_t *tx);
+void dsl_pool_upgrade_clones(dsl_pool_t *dp, dmu_tx_t *tx);
+void dsl_pool_upgrade_dir_clones(dsl_pool_t *dp, dmu_tx_t *tx);
+void dsl_pool_mos_diduse_space(dsl_pool_t *dp,
+ int64_t used, int64_t comp, int64_t uncomp);
+void dsl_pool_ckpoint_diduse_space(dsl_pool_t *dp,
+ int64_t used, int64_t comp, int64_t uncomp);
+void dsl_pool_config_enter(dsl_pool_t *dp, void *tag);
+void dsl_pool_config_enter_prio(dsl_pool_t *dp, void *tag);
+void dsl_pool_config_exit(dsl_pool_t *dp, void *tag);
+boolean_t dsl_pool_config_held(dsl_pool_t *dp);
+boolean_t dsl_pool_config_held_writer(dsl_pool_t *dp);
+boolean_t dsl_pool_need_dirty_delay(dsl_pool_t *dp);
+
+taskq_t *dsl_pool_vnrele_taskq(dsl_pool_t *dp);
+
+int dsl_pool_user_hold(dsl_pool_t *dp, uint64_t dsobj,
+ const char *tag, uint64_t now, dmu_tx_t *tx);
+int dsl_pool_user_release(dsl_pool_t *dp, uint64_t dsobj,
+ const char *tag, dmu_tx_t *tx);
+void dsl_pool_clean_tmp_userrefs(dsl_pool_t *dp);
+int dsl_pool_open_special_dir(dsl_pool_t *dp, const char *name, dsl_dir_t **);
+int dsl_pool_hold(const char *name, void *tag, dsl_pool_t **dp);
+void dsl_pool_rele(dsl_pool_t *dp, void *tag);
+
+void dsl_pool_create_obsolete_bpobj(dsl_pool_t *dp, dmu_tx_t *tx);
+void dsl_pool_destroy_obsolete_bpobj(dsl_pool_t *dp, dmu_tx_t *tx);
+
+#ifdef __cplusplus
+}
+#endif
+
+#endif /* _SYS_DSL_POOL_H */
diff --git a/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/dsl_prop.h b/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/dsl_prop.h
new file mode 100644
index 000000000000..21e6f4674be9
--- /dev/null
+++ b/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/dsl_prop.h
@@ -0,0 +1,115 @@
+/*
+ * CDDL HEADER START
+ *
+ * The contents of this file are subject to the terms of the
+ * Common Development and Distribution License (the "License").
+ * You may not use this file except in compliance with the License.
+ *
+ * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
+ * or http://www.opensolaris.org/os/licensing.
+ * See the License for the specific language governing permissions
+ * and limitations under the License.
+ *
+ * When distributing Covered Code, include this CDDL HEADER in each
+ * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
+ * If applicable, add the following below this CDDL HEADER, with the
+ * fields enclosed by brackets "[]" replaced with your own identifying
+ * information: Portions Copyright [yyyy] [name of copyright owner]
+ *
+ * CDDL HEADER END
+ */
+/*
+ * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved.
+ * Copyright (c) 2012 by Delphix. All rights reserved.
+ */
+
+#ifndef _SYS_DSL_PROP_H
+#define _SYS_DSL_PROP_H
+
+#include <sys/dmu.h>
+#include <sys/dsl_pool.h>
+#include <sys/zfs_context.h>
+#include <sys/dsl_synctask.h>
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+struct dsl_dataset;
+struct dsl_dir;
+
+/* The callback func may not call into the DMU or DSL! */
+typedef void (dsl_prop_changed_cb_t)(void *arg, uint64_t newval);
+
+typedef struct dsl_prop_record {
+ list_node_t pr_node; /* link on dd_props */
+ const char *pr_propname;
+ list_t pr_cbs;
+} dsl_prop_record_t;
+
+typedef struct dsl_prop_cb_record {
+ list_node_t cbr_pr_node; /* link on pr_cbs */
+ list_node_t cbr_ds_node; /* link on ds_prop_cbs */
+ dsl_prop_record_t *cbr_pr;
+ struct dsl_dataset *cbr_ds;
+ dsl_prop_changed_cb_t *cbr_func;
+ void *cbr_arg;
+} dsl_prop_cb_record_t;
+
+typedef struct dsl_props_arg {
+ nvlist_t *pa_props;
+ zprop_source_t pa_source;
+} dsl_props_arg_t;
+
+void dsl_prop_init(dsl_dir_t *dd);
+void dsl_prop_fini(dsl_dir_t *dd);
+int dsl_prop_register(struct dsl_dataset *ds, const char *propname,
+ dsl_prop_changed_cb_t *callback, void *cbarg);
+void dsl_prop_unregister_all(struct dsl_dataset *ds, void *cbarg);
+void dsl_prop_notify_all(struct dsl_dir *dd);
+boolean_t dsl_prop_hascb(struct dsl_dataset *ds);
+
+int dsl_prop_get(const char *ddname, const char *propname,
+ int intsz, int numints, void *buf, char *setpoint);
+int dsl_prop_get_integer(const char *ddname, const char *propname,
+ uint64_t *valuep, char *setpoint);
+int dsl_prop_get_all(objset_t *os, nvlist_t **nvp);
+int dsl_prop_get_received(const char *dsname, nvlist_t **nvp);
+int dsl_prop_get_ds(struct dsl_dataset *ds, const char *propname,
+ int intsz, int numints, void *buf, char *setpoint);
+int dsl_prop_get_int_ds(struct dsl_dataset *ds, const char *propname,
+ uint64_t *valuep);
+int dsl_prop_get_dd(struct dsl_dir *dd, const char *propname,
+ int intsz, int numints, void *buf, char *setpoint,
+ boolean_t snapshot);
+
+void dsl_props_set_sync_impl(struct dsl_dataset *ds, zprop_source_t source,
+ nvlist_t *props, dmu_tx_t *tx);
+void dsl_prop_set_sync_impl(struct dsl_dataset *ds, const char *propname,
+ zprop_source_t source, int intsz, int numints, const void *value,
+ dmu_tx_t *tx);
+int dsl_props_set(const char *dsname, zprop_source_t source, nvlist_t *nvl);
+int dsl_prop_set_int(const char *dsname, const char *propname,
+ zprop_source_t source, uint64_t value);
+int dsl_prop_set_string(const char *dsname, const char *propname,
+ zprop_source_t source, const char *value);
+int dsl_prop_inherit(const char *dsname, const char *propname,
+ zprop_source_t source);
+
+int dsl_prop_predict(dsl_dir_t *dd, const char *propname,
+ zprop_source_t source, uint64_t value, uint64_t *newvalp);
+
+/* flag first receive on or after SPA_VERSION_RECVD_PROPS */
+boolean_t dsl_prop_get_hasrecvd(const char *dsname);
+int dsl_prop_set_hasrecvd(const char *dsname);
+void dsl_prop_unset_hasrecvd(const char *dsname);
+
+void dsl_prop_nvlist_add_uint64(nvlist_t *nv, zfs_prop_t prop, uint64_t value);
+void dsl_prop_nvlist_add_string(nvlist_t *nv,
+ zfs_prop_t prop, const char *value);
+
+#ifdef __cplusplus
+}
+#endif
+
+#endif /* _SYS_DSL_PROP_H */
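
A hedged sketch of the callback registration pattern declared above; the
property name and cached-value argument are illustrative assumptions. Per the
note above, the callback must not call back into the DMU or DSL. Teardown
would go through dsl_prop_unregister_all().

    /* Matches dsl_prop_changed_cb_t; runs on each property change. */
    static void
    example_compress_changed(void *arg, uint64_t newval)
    {
            uint64_t *cachedval = arg;

            *cachedval = newval;    /* just cache the new value */
    }

    static int
    example_watch_compress(struct dsl_dataset *ds, uint64_t *cachedval)
    {
            return (dsl_prop_register(ds, "compression",
                example_compress_changed, cachedval));
    }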
diff --git a/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/dsl_scan.h b/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/dsl_scan.h
new file mode 100644
index 000000000000..5ddffe57bf97
--- /dev/null
+++ b/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/dsl_scan.h
@@ -0,0 +1,188 @@
+/*
+ * CDDL HEADER START
+ *
+ * The contents of this file are subject to the terms of the
+ * Common Development and Distribution License (the "License").
+ * You may not use this file except in compliance with the License.
+ *
+ * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
+ * or http://www.opensolaris.org/os/licensing.
+ * See the License for the specific language governing permissions
+ * and limitations under the License.
+ *
+ * When distributing Covered Code, include this CDDL HEADER in each
+ * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
+ * If applicable, add the following below this CDDL HEADER, with the
+ * fields enclosed by brackets "[]" replaced with your own identifying
+ * information: Portions Copyright [yyyy] [name of copyright owner]
+ *
+ * CDDL HEADER END
+ */
+/*
+ * Copyright (c) 2010, Oracle and/or its affiliates. All rights reserved.
+ * Copyright (c) 2012, 2017 by Delphix. All rights reserved.
+ * Copyright (c) 2017 Datto Inc.
+ */
+
+#ifndef _SYS_DSL_SCAN_H
+#define _SYS_DSL_SCAN_H
+
+#include <sys/zfs_context.h>
+#include <sys/zio.h>
+#include <sys/ddt.h>
+#include <sys/bplist.h>
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+struct objset;
+struct dsl_dir;
+struct dsl_dataset;
+struct dsl_pool;
+struct dmu_tx;
+
+/*
+ * All members of this structure must be uint64_t, for byteswap
+ * purposes.
+ */
+typedef struct dsl_scan_phys {
+ uint64_t scn_func; /* pool_scan_func_t */
+ uint64_t scn_state; /* dsl_scan_state_t */
+ uint64_t scn_queue_obj;
+ uint64_t scn_min_txg;
+ uint64_t scn_max_txg;
+ uint64_t scn_cur_min_txg;
+ uint64_t scn_cur_max_txg;
+ uint64_t scn_start_time;
+ uint64_t scn_end_time;
+ uint64_t scn_to_examine; /* total bytes to be scanned */
+ uint64_t scn_examined; /* bytes scanned so far */
+ uint64_t scn_to_process; /* total bytes to be processed */
+ uint64_t scn_processed; /* bytes processed so far */
+ uint64_t scn_errors; /* scan I/O error count */
+ uint64_t scn_ddt_class_max;
+ ddt_bookmark_t scn_ddt_bookmark;
+ zbookmark_phys_t scn_bookmark;
+ uint64_t scn_flags; /* dsl_scan_flags_t */
+} dsl_scan_phys_t;
+
+#define SCAN_PHYS_NUMINTS (sizeof (dsl_scan_phys_t) / sizeof (uint64_t))
+
+typedef enum dsl_scan_flags {
+ DSF_VISIT_DS_AGAIN = 1<<0,
+ DSF_SCRUB_PAUSED = 1<<1,
+} dsl_scan_flags_t;
+
+/*
+ * Every pool will have one dsl_scan_t and this structure will contain
+ * in-memory information about the scan and a pointer to the on-disk
+ * representation (i.e. dsl_scan_phys_t). Most of the state of the scan
+ * is contained on-disk to allow the scan to resume in the event of a reboot
+ * or panic. This structure maintains information about the behavior of a
+ * running scan, some caching information, and how it should traverse the pool.
+ *
+ * The following members of this structure direct the behavior of the scan:
+ *
+ * scn_suspending - a scan that cannot be completed in a single txg or
+ * has exceeded its allotted time will need to suspend.
+ * When this flag is set the scanner will stop traversing
+ * the pool and write out the current state to disk.
+ *
+ * scn_restart_txg - directs the scanner to either restart or start
+ * a scan at the specified txg value.
+ *
+ * scn_done_txg - when a scan completes its traversal it will set
+ * the completion txg to the next txg. This is necessary
+ * to ensure that any blocks that were freed during
+ * the scan but have not yet been processed (i.e. deferred
+ * frees) are accounted for.
+ *
+ * This structure also maintains information about deferred frees, which are
+ * a special kind of traversal. Deferred frees can exist in either a bptree or
+ * a bpobj structure. The scn_is_bptree flag will indicate the type of
+ * deferred free that is in progress. If the deferred free is part of an
+ * asynchronous destroy then the scn_async_destroying flag will be set.
+ */
+typedef struct dsl_scan {
+ struct dsl_pool *scn_dp;
+
+ uint64_t scn_restart_txg;
+ uint64_t scn_done_txg;
+ uint64_t scn_sync_start_time;
+ uint64_t scn_issued_before_pass;
+
+ /* for freeing blocks */
+ boolean_t scn_is_bptree;
+ boolean_t scn_async_destroying;
+ boolean_t scn_async_stalled;
+ uint64_t scn_async_block_min_time_ms;
+ /* flags and stats for controlling scan state */
+ boolean_t scn_is_sorted; /* doing sequential scan */
+ boolean_t scn_clearing; /* scan is issuing sequential extents */
+ boolean_t scn_checkpointing; /* scan is issuing all queued extents */
+ boolean_t scn_suspending; /* scan is suspending until next txg */
+ uint64_t scn_last_checkpoint; /* time of last checkpoint */
+
+ /* members for thread synchronization */
+ zio_t *scn_zio_root; /* root zio for waiting on IO */
+ taskq_t *scn_taskq; /* task queue for issuing extents */
+
+ /* for controlling scan prefetch, protected by spa_scrub_lock */
+ boolean_t scn_prefetch_stop; /* prefetch should stop */
+ zbookmark_phys_t scn_prefetch_bookmark; /* prefetch start bookmark */
+ avl_tree_t scn_prefetch_queue; /* priority queue of prefetch IOs */
+ uint64_t scn_maxinflight_bytes; /* max bytes in flight for pool */
+
+ /* per txg statistics */
+ uint64_t scn_visited_this_txg; /* total bps visited this txg */
+ uint64_t scn_holes_this_txg;
+ uint64_t scn_lt_min_this_txg;
+ uint64_t scn_gt_max_this_txg;
+ uint64_t scn_ddt_contained_this_txg;
+ uint64_t scn_objsets_visited_this_txg;
+ uint64_t scn_avg_seg_size_this_txg;
+ uint64_t scn_segs_this_txg;
+ uint64_t scn_avg_zio_size_this_txg;
+ uint64_t scn_zios_this_txg;
+
+ /* members needed for syncing scan status to disk */
+ dsl_scan_phys_t scn_phys; /* on disk representation of scan */
+ dsl_scan_phys_t scn_phys_cached;
+ avl_tree_t scn_queue; /* queue of datasets to scan */
+ uint64_t scn_bytes_pending; /* outstanding data to issue */
+} dsl_scan_t;
+
+typedef struct dsl_scan_io_queue dsl_scan_io_queue_t;
+
+void dsl_scan_global_init(void);
+
+void scan_init(void);
+void scan_fini(void);
+int dsl_scan_init(struct dsl_pool *dp, uint64_t txg);
+void dsl_scan_fini(struct dsl_pool *dp);
+void dsl_scan_sync(struct dsl_pool *, dmu_tx_t *);
+int dsl_scan_cancel(struct dsl_pool *);
+int dsl_scan(struct dsl_pool *, pool_scan_func_t);
+boolean_t dsl_scan_scrubbing(const struct dsl_pool *dp);
+int dsl_scrub_set_pause_resume(const struct dsl_pool *dp, pool_scrub_cmd_t cmd);
+void dsl_resilver_restart(struct dsl_pool *, uint64_t txg);
+boolean_t dsl_scan_resilvering(struct dsl_pool *dp);
+boolean_t dsl_dataset_unstable(struct dsl_dataset *ds);
+void dsl_scan_ddt_entry(dsl_scan_t *scn, enum zio_checksum checksum,
+ ddt_entry_t *dde, dmu_tx_t *tx);
+void dsl_scan_ds_destroyed(struct dsl_dataset *ds, struct dmu_tx *tx);
+void dsl_scan_ds_snapshotted(struct dsl_dataset *ds, struct dmu_tx *tx);
+void dsl_scan_ds_clone_swapped(struct dsl_dataset *ds1, struct dsl_dataset *ds2,
+ struct dmu_tx *tx);
+boolean_t dsl_scan_active(dsl_scan_t *scn);
+boolean_t dsl_scan_is_paused_scrub(const dsl_scan_t *scn);
+void dsl_scan_freed(spa_t *spa, const blkptr_t *bp);
+void dsl_scan_io_queue_destroy(dsl_scan_io_queue_t *queue);
+void dsl_scan_io_queue_vdev_xfer(vdev_t *svd, vdev_t *tvd);
+
+#ifdef __cplusplus
+}
+#endif
+
+#endif /* _SYS_DSL_SCAN_H */
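
The all-uint64_t layout and SCAN_PHYS_NUMINTS exist so the on-disk scan state
can be stored and byteswapped as a flat integer array. A sketch of what
persistence might look like, assuming the usual ZAP helpers, the
dp_meta_objset field of dsl_pool_t, and a "scan" key in the pool directory
object; all of these are assumptions of this note, not declarations from this
header.

    static int
    example_save_scan(dsl_pool_t *dp, dsl_scan_t *scn, dmu_tx_t *tx)
    {
            /* Persist the scan state as SCAN_PHYS_NUMINTS 64-bit ints. */
            return (zap_update(dp->dp_meta_objset,
                DMU_POOL_DIRECTORY_OBJECT, "scan", sizeof (uint64_t),
                SCAN_PHYS_NUMINTS, &scn->scn_phys, tx));
    }

Loading would be the symmetric zap_lookup() with the same integer size and
count, where ENOENT would mean no scan state has ever been recorded.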
diff --git a/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/dsl_synctask.h b/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/dsl_synctask.h
new file mode 100644
index 000000000000..957963ffe553
--- /dev/null
+++ b/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/dsl_synctask.h
@@ -0,0 +1,127 @@
+/*
+ * CDDL HEADER START
+ *
+ * The contents of this file are subject to the terms of the
+ * Common Development and Distribution License (the "License").
+ * You may not use this file except in compliance with the License.
+ *
+ * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
+ * or http://www.opensolaris.org/os/licensing.
+ * See the License for the specific language governing permissions
+ * and limitations under the License.
+ *
+ * When distributing Covered Code, include this CDDL HEADER in each
+ * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
+ * If applicable, add the following below this CDDL HEADER, with the
+ * fields enclosed by brackets "[]" replaced with your own identifying
+ * information: Portions Copyright [yyyy] [name of copyright owner]
+ *
+ * CDDL HEADER END
+ */
+/*
+ * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved.
+ * Copyright (c) 2012, 2017 by Delphix. All rights reserved.
+ */
+
+#ifndef _SYS_DSL_SYNCTASK_H
+#define _SYS_DSL_SYNCTASK_H
+
+#include <sys/txg.h>
+#include <sys/zfs_context.h>
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+struct dsl_pool;
+
+typedef int (dsl_checkfunc_t)(void *, dmu_tx_t *);
+typedef void (dsl_syncfunc_t)(void *, dmu_tx_t *);
+typedef void (dsl_sigfunc_t)(void *, dmu_tx_t *);
+
+typedef enum zfs_space_check {
+ /*
+ * Normal space check: if there is less than 3.2% free space,
+ * the operation will fail. Operations which are logically
+ * creating things should use this (e.g. "zfs create", "zfs snapshot").
+ * User writes (via the ZPL / ZVOL) also fail at this point.
+ */
+ ZFS_SPACE_CHECK_NORMAL,
+
+ /*
+ * Space check allows use of half the slop space. If there
+ * is less than 1.6% free space, the operation will fail. Most
+ * operations should use this (e.g. "zfs set", "zfs rename"),
+ * because we want them to succeed even after user writes are failing,
+ * so that they can be used as part of the space recovery process.
+ */
+ ZFS_SPACE_CHECK_RESERVED,
+
+ /*
+ * Space check allows use of three quarters of the slop space.
+ * If there is less than 0.8% free space, the operation will
+ * fail.
+ */
+ ZFS_SPACE_CHECK_EXTRA_RESERVED,
+
+ /*
+ * In all cases "zfs destroy" is expected to result in an net
+ * reduction of space, except one. When the pool has a
+ * checkpoint, space freed by "zfs destroy" will not actually
+ * free anything internally. Thus, it starts failing after
+ * three quarters of the slop space is exceeded.
+ */
+ ZFS_SPACE_CHECK_DESTROY = ZFS_SPACE_CHECK_EXTRA_RESERVED,
+
+ /*
+ * A channel program can run a "zfs destroy" as part of its
+ * script and therefore has the same space_check policy when
+ * being evaluated.
+ */
+ ZFS_SPACE_CHECK_ZCP_EVAL = ZFS_SPACE_CHECK_DESTROY,
+
+ /*
+ * No space check is performed. This level of space check should
+ * be used cautiously as operations that use it can run even when
+ * less than 0.8% of capacity is left for use. In this scenario, if
+ * there is a checkpoint, async destroys are suspended and any kind
+ * of freeing can potentially add space instead of freeing it.
+ *
+ * See also the comments above spa_slop_shift.
+ */
+ ZFS_SPACE_CHECK_NONE,
+
+ ZFS_SPACE_CHECK_DISCARD_CHECKPOINT = ZFS_SPACE_CHECK_NONE,
+
+} zfs_space_check_t;
+
+typedef struct dsl_sync_task {
+ txg_node_t dst_node;
+ struct dsl_pool *dst_pool;
+ uint64_t dst_txg;
+ int dst_space;
+ zfs_space_check_t dst_space_check;
+ dsl_checkfunc_t *dst_checkfunc;
+ dsl_syncfunc_t *dst_syncfunc;
+ void *dst_arg;
+ int dst_error;
+ boolean_t dst_nowaiter;
+} dsl_sync_task_t;
+
+void dsl_sync_task_sync(dsl_sync_task_t *, dmu_tx_t *);
+int dsl_sync_task(const char *, dsl_checkfunc_t *,
+ dsl_syncfunc_t *, void *, int, zfs_space_check_t);
+void dsl_sync_task_nowait(struct dsl_pool *, dsl_syncfunc_t *,
+ void *, int, zfs_space_check_t, dmu_tx_t *);
+int dsl_early_sync_task(const char *, dsl_checkfunc_t *,
+ dsl_syncfunc_t *, void *, int, zfs_space_check_t);
+void dsl_early_sync_task_nowait(struct dsl_pool *, dsl_syncfunc_t *,
+ void *, int, zfs_space_check_t, dmu_tx_t *);
+int dsl_sync_task_sig(const char *, dsl_checkfunc_t *, dsl_syncfunc_t *,
+ dsl_sigfunc_t *, void *, int, zfs_space_check_t);
+
+#ifdef __cplusplus
+}
+#endif
+
+#endif /* _SYS_DSL_SYNCTASK_H */
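
A hedged sketch of the check/sync pairing that dsl_sync_task() expects: the
check function may fail and may run more than once, while the sync function
runs exactly once in syncing context and must not fail. The pool name,
payload, and block estimate below are illustrative assumptions.

    static int
    example_check(void *arg, dmu_tx_t *tx)
    {
            uint64_t *valp = arg;

            /* Validate; returning nonzero aborts the whole task. */
            return (*valp == 0 ? EINVAL : 0);
    }

    static void
    example_sync(void *arg, dmu_tx_t *tx)
    {
            /* Apply the change to on-disk state here; must succeed. */
    }

    static int
    example_dispatch(uint64_t *value)
    {
            /*
             * "3" estimates the number of blocks the task will modify,
             * which feeds the ZFS_SPACE_CHECK_NORMAL policy above.
             */
            return (dsl_sync_task("tank", example_check, example_sync,
                value, 3, ZFS_SPACE_CHECK_NORMAL));
    }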
diff --git a/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/dsl_userhold.h b/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/dsl_userhold.h
new file mode 100644
index 000000000000..071aeb86d1f1
--- /dev/null
+++ b/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/dsl_userhold.h
@@ -0,0 +1,57 @@
+/*
+ * CDDL HEADER START
+ *
+ * The contents of this file are subject to the terms of the
+ * Common Development and Distribution License (the "License").
+ * You may not use this file except in compliance with the License.
+ *
+ * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
+ * or http://www.opensolaris.org/os/licensing.
+ * See the License for the specific language governing permissions
+ * and limitations under the License.
+ *
+ * When distributing Covered Code, include this CDDL HEADER in each
+ * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
+ * If applicable, add the following below this CDDL HEADER, with the
+ * fields enclosed by brackets "[]" replaced with your own identifying
+ * information: Portions Copyright [yyyy] [name of copyright owner]
+ *
+ * CDDL HEADER END
+ */
+/*
+ * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved.
+ * Copyright (c) 2012 by Delphix. All rights reserved.
+ * Copyright (c) 2012, Joyent, Inc. All rights reserved.
+ * Copyright (c) 2013 Steven Hartland. All rights reserved.
+ */
+
+#ifndef _SYS_DSL_USERHOLD_H
+#define _SYS_DSL_USERHOLD_H
+
+#include <sys/nvpair.h>
+#include <sys/types.h>
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+struct dsl_pool;
+struct dsl_dataset;
+struct dmu_tx;
+
+int dsl_dataset_user_hold(nvlist_t *holds, minor_t cleanup_minor,
+ nvlist_t *errlist);
+int dsl_dataset_user_release(nvlist_t *holds, nvlist_t *errlist);
+int dsl_dataset_get_holds(const char *dsname, nvlist_t *nvl);
+void dsl_dataset_user_release_tmp(struct dsl_pool *dp, nvlist_t *holds);
+int dsl_dataset_user_hold_check_one(struct dsl_dataset *ds, const char *htag,
+ boolean_t temphold, struct dmu_tx *tx);
+void dsl_dataset_user_hold_sync_one(struct dsl_dataset *ds, const char *htag,
+ minor_t minor, uint64_t now, struct dmu_tx *tx);
+
+#ifdef __cplusplus
+}
+#endif
+
+#endif /* _SYS_DSL_USERHOLD_H */
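
A sketch of how the hold interface above is typically driven: the holds
nvlist maps full snapshot names to hold tags, and errlist reports
per-snapshot failures. The snapshot and tag names are illustrative, and the
fnvlist helpers are assumed from the nvpair library.

    static int
    example_take_hold(void)
    {
            nvlist_t *holds = fnvlist_alloc();
            nvlist_t *errlist = fnvlist_alloc();
            int error;

            /* One entry per snapshot: "dataset@snapshot" -> hold tag. */
            fnvlist_add_string(holds, "tank/fs@snap", "backup-job");

            /* cleanup_minor of 0: hold not bound to a cleanup device. */
            error = dsl_dataset_user_hold(holds, 0, errlist);

            fnvlist_free(errlist);
            fnvlist_free(holds);
            return (error);
    }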
diff --git a/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/metaslab.h b/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/metaslab.h
new file mode 100644
index 000000000000..7219dc967427
--- /dev/null
+++ b/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/metaslab.h
@@ -0,0 +1,127 @@
+/*
+ * CDDL HEADER START
+ *
+ * The contents of this file are subject to the terms of the
+ * Common Development and Distribution License (the "License").
+ * You may not use this file except in compliance with the License.
+ *
+ * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
+ * or http://www.opensolaris.org/os/licensing.
+ * See the License for the specific language governing permissions
+ * and limitations under the License.
+ *
+ * When distributing Covered Code, include this CDDL HEADER in each
+ * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
+ * If applicable, add the following below this CDDL HEADER, with the
+ * fields enclosed by brackets "[]" replaced with your own identifying
+ * information: Portions Copyright [yyyy] [name of copyright owner]
+ *
+ * CDDL HEADER END
+ */
+/*
+ * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved.
+ * Copyright (c) 2011, 2018 by Delphix. All rights reserved.
+ * Copyright (c) 2017, Intel Corporation.
+ */
+
+#ifndef _SYS_METASLAB_H
+#define _SYS_METASLAB_H
+
+#include <sys/spa.h>
+#include <sys/space_map.h>
+#include <sys/txg.h>
+#include <sys/zio.h>
+#include <sys/avl.h>
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+
+typedef struct metaslab_ops {
+ uint64_t (*msop_alloc)(metaslab_t *, uint64_t);
+} metaslab_ops_t;
+
+
+extern metaslab_ops_t *zfs_metaslab_ops;
+
+int metaslab_init(metaslab_group_t *, uint64_t, uint64_t, uint64_t,
+ metaslab_t **);
+void metaslab_fini(metaslab_t *);
+
+int metaslab_load(metaslab_t *);
+void metaslab_unload(metaslab_t *);
+
+uint64_t metaslab_allocated_space(metaslab_t *);
+
+void metaslab_sync(metaslab_t *, uint64_t);
+void metaslab_sync_done(metaslab_t *, uint64_t);
+void metaslab_sync_reassess(metaslab_group_t *);
+uint64_t metaslab_block_maxsize(metaslab_t *);
+
+/*
+ * metaslab alloc flags
+ */
+#define METASLAB_HINTBP_FAVOR 0x0
+#define METASLAB_HINTBP_AVOID 0x1
+#define METASLAB_GANG_HEADER 0x2
+#define METASLAB_GANG_CHILD 0x4
+#define METASLAB_ASYNC_ALLOC 0x8
+#define METASLAB_DONT_THROTTLE 0x10
+#define METASLAB_MUST_RESERVE 0x20
+#define METASLAB_FASTWRITE 0x40
+
+int metaslab_alloc(spa_t *, metaslab_class_t *, uint64_t,
+ blkptr_t *, int, uint64_t, blkptr_t *, int, zio_alloc_list_t *, zio_t *,
+ int);
+int metaslab_alloc_dva(spa_t *, metaslab_class_t *, uint64_t,
+ dva_t *, int, dva_t *, uint64_t, int, zio_alloc_list_t *, int);
+void metaslab_free(spa_t *, const blkptr_t *, uint64_t, boolean_t);
+void metaslab_free_concrete(vdev_t *, uint64_t, uint64_t, boolean_t);
+void metaslab_free_dva(spa_t *, const dva_t *, boolean_t);
+void metaslab_free_impl_cb(uint64_t, vdev_t *, uint64_t, uint64_t, void *);
+void metaslab_unalloc_dva(spa_t *, const dva_t *, uint64_t);
+int metaslab_claim(spa_t *, const blkptr_t *, uint64_t);
+int metaslab_claim_impl(vdev_t *, uint64_t, uint64_t, uint64_t);
+void metaslab_check_free(spa_t *, const blkptr_t *);
+
+void metaslab_alloc_trace_init(void);
+void metaslab_alloc_trace_fini(void);
+void metaslab_trace_init(zio_alloc_list_t *);
+void metaslab_trace_fini(zio_alloc_list_t *);
+
+metaslab_class_t *metaslab_class_create(spa_t *, metaslab_ops_t *);
+void metaslab_class_destroy(metaslab_class_t *);
+int metaslab_class_validate(metaslab_class_t *);
+void metaslab_class_histogram_verify(metaslab_class_t *);
+uint64_t metaslab_class_fragmentation(metaslab_class_t *);
+uint64_t metaslab_class_expandable_space(metaslab_class_t *);
+boolean_t metaslab_class_throttle_reserve(metaslab_class_t *, int, int,
+ zio_t *, int);
+void metaslab_class_throttle_unreserve(metaslab_class_t *, int, int, zio_t *);
+
+uint64_t metaslab_class_get_alloc(metaslab_class_t *);
+uint64_t metaslab_class_get_space(metaslab_class_t *);
+uint64_t metaslab_class_get_dspace(metaslab_class_t *);
+uint64_t metaslab_class_get_deferred(metaslab_class_t *);
+uint64_t metaslab_class_get_minblocksize(metaslab_class_t *mc);
+
+metaslab_group_t *metaslab_group_create(metaslab_class_t *, vdev_t *, int);
+void metaslab_group_destroy(metaslab_group_t *);
+void metaslab_group_activate(metaslab_group_t *);
+void metaslab_group_passivate(metaslab_group_t *);
+boolean_t metaslab_group_initialized(metaslab_group_t *);
+uint64_t metaslab_group_get_space(metaslab_group_t *);
+void metaslab_group_histogram_verify(metaslab_group_t *);
+uint64_t metaslab_group_fragmentation(metaslab_group_t *);
+void metaslab_group_histogram_remove(metaslab_group_t *, metaslab_t *);
+void metaslab_group_alloc_decrement(spa_t *, uint64_t, void *, int, int,
+ boolean_t);
+void metaslab_group_alloc_verify(spa_t *, const blkptr_t *, void *, int);
+void metaslab_recalculate_weight_and_sort(metaslab_t *);
+
+#ifdef __cplusplus
+}
+#endif
+
+#endif /* _SYS_METASLAB_H */
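
A hedged sketch of how the alloc flags above combine with metaslab_alloc()
and the trace-list helpers; every parameter here (spa, mc, bp, ...) is an
assumed caller-provided value, and the flag choice is illustrative.

    static int
    example_alloc(spa_t *spa, metaslab_class_t *mc, uint64_t psize,
        blkptr_t *bp, blkptr_t *hintbp, uint64_t txg, zio_t *zio,
        int allocator)
    {
            zio_alloc_list_t zal;
            int error;

            metaslab_trace_init(&zal);

            /* Steer away from hintbp's vdev; allocate a single DVA. */
            error = metaslab_alloc(spa, mc, psize, bp, 1, txg, hintbp,
                METASLAB_HINTBP_AVOID, &zal, zio, allocator);

            metaslab_trace_fini(&zal);
            return (error);
    }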
diff --git a/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/metaslab_impl.h b/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/metaslab_impl.h
new file mode 100644
index 000000000000..ae49795fec1a
--- /dev/null
+++ b/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/metaslab_impl.h
@@ -0,0 +1,501 @@
+/*
+ * CDDL HEADER START
+ *
+ * The contents of this file are subject to the terms of the
+ * Common Development and Distribution License (the "License").
+ * You may not use this file except in compliance with the License.
+ *
+ * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
+ * or http://www.opensolaris.org/os/licensing.
+ * See the License for the specific language governing permissions
+ * and limitations under the License.
+ *
+ * When distributing Covered Code, include this CDDL HEADER in each
+ * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
+ * If applicable, add the following below this CDDL HEADER, with the
+ * fields enclosed by brackets "[]" replaced with your own identifying
+ * information: Portions Copyright [yyyy] [name of copyright owner]
+ *
+ * CDDL HEADER END
+ */
+/*
+ * Copyright 2009 Sun Microsystems, Inc. All rights reserved.
+ * Use is subject to license terms.
+ */
+
+/*
+ * Copyright (c) 2011, 2018 by Delphix. All rights reserved.
+ */
+
+#ifndef _SYS_METASLAB_IMPL_H
+#define _SYS_METASLAB_IMPL_H
+
+#include <sys/metaslab.h>
+#include <sys/space_map.h>
+#include <sys/range_tree.h>
+#include <sys/vdev.h>
+#include <sys/txg.h>
+#include <sys/avl.h>
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+/*
+ * Metaslab allocation tracing record.
+ */
+typedef struct metaslab_alloc_trace {
+ list_node_t mat_list_node;
+ metaslab_group_t *mat_mg;
+ metaslab_t *mat_msp;
+ uint64_t mat_size;
+ uint64_t mat_weight;
+ uint32_t mat_dva_id;
+ uint64_t mat_offset;
+ int mat_allocator;
+} metaslab_alloc_trace_t;
+
+/*
+ * Used by the metaslab allocation tracing facility to indicate
+ * error conditions. These errors are stored to the offset member
+ * of the metaslab_alloc_trace_t record and displayed by mdb.
+ */
+typedef enum trace_alloc_type {
+ TRACE_ALLOC_FAILURE = -1ULL,
+ TRACE_TOO_SMALL = -2ULL,
+ TRACE_FORCE_GANG = -3ULL,
+ TRACE_NOT_ALLOCATABLE = -4ULL,
+ TRACE_GROUP_FAILURE = -5ULL,
+ TRACE_ENOSPC = -6ULL,
+ TRACE_CONDENSING = -7ULL,
+ TRACE_VDEV_ERROR = -8ULL,
+ TRACE_INITIALIZING = -9ULL
+} trace_alloc_type_t;
+
+#define METASLAB_WEIGHT_PRIMARY (1ULL << 63)
+#define METASLAB_WEIGHT_SECONDARY (1ULL << 62)
+#define METASLAB_WEIGHT_CLAIM (1ULL << 61)
+#define METASLAB_WEIGHT_TYPE (1ULL << 60)
+#define METASLAB_ACTIVE_MASK \
+ (METASLAB_WEIGHT_PRIMARY | METASLAB_WEIGHT_SECONDARY | \
+ METASLAB_WEIGHT_CLAIM)
+
+/*
+ * The metaslab weight is used to encode the amount of free space in a
+ * metaslab, such that the "best" metaslab appears first when sorting the
+ * metaslabs by weight. The weight (and therefore the "best" metaslab) can
+ * be determined in two different ways: by computing a weighted sum of all
+ * the free space in the metaslab (a space based weight) or by counting only
+ * the free segments of the largest size (a segment based weight). We prefer
+ * the segment based weight because it reflects how the free space is
+ * composed, but we cannot always use it -- legacy pools do not have the
+ * space map histogram information necessary to determine the largest
+ * contiguous regions. Pools that have the space map histogram determine
+ * the segment weight by looking at each bucket in the histogram and
+ * determining the free space whose size in bytes is in the range:
+ * [2^i, 2^(i+1))
+ * We then encode the largest index, i, that contains regions into the
+ * segment-weighted value.
+ *
+ * Space-based weight:
+ *
+ * 64 56 48 40 32 24 16 8 0
+ * +-------+-------+-------+-------+-------+-------+-------+-------+
+ * |PSC1| weighted-free space |
+ * +-------+-------+-------+-------+-------+-------+-------+-------+
+ *
+ * PS - indicates primary and secondary activation
+ * C - indicates activation for claimed block zio
+ * space - the fragmentation-weighted space
+ *
+ * Segment-based weight:
+ *
+ * 64 56 48 40 32 24 16 8 0
+ * +-------+-------+-------+-------+-------+-------+-------+-------+
+ * |PSC0| idx| count of segments in region |
+ * +-------+-------+-------+-------+-------+-------+-------+-------+
+ *
+ * PS - indicates primary and secondary activation
+ * C - indicates activation for claimed block zio
+ * idx - index for the highest bucket in the histogram
+ * count - number of segments in the specified bucket
+ */
+#define WEIGHT_GET_ACTIVE(weight) BF64_GET((weight), 61, 3)
+#define WEIGHT_SET_ACTIVE(weight, x) BF64_SET((weight), 61, 3, x)
+
+#define WEIGHT_IS_SPACEBASED(weight) \
+ ((weight) == 0 || BF64_GET((weight), 60, 1))
+#define WEIGHT_SET_SPACEBASED(weight) BF64_SET((weight), 60, 1, 1)
+
+/*
+ * These macros are only applicable to segment-based weighting.
+ */
+#define WEIGHT_GET_INDEX(weight) BF64_GET((weight), 54, 6)
+#define WEIGHT_SET_INDEX(weight, x) BF64_SET((weight), 54, 6, x)
+#define WEIGHT_GET_COUNT(weight) BF64_GET((weight), 0, 54)
+#define WEIGHT_SET_COUNT(weight, x) BF64_SET((weight), 0, 54, x)
+
+/*
+ * A metaslab class encompasses a category of allocatable top-level vdevs.
+ * Each top-level vdev is associated with a metaslab group which defines
+ * the allocatable region for that vdev. Examples of these categories include
+ * "normal" for data block allocations (i.e. main pool allocations) or "log"
+ * for allocations designated for intent log devices (i.e. slog devices).
+ * When a block allocation is requested from the SPA it is associated with a
+ * metaslab_class_t, and only top-level vdevs (i.e. metaslab groups) belonging
+ * to the class can be used to satisfy that request. Allocations are done
+ * by traversing the metaslab groups that are linked off of the mc_rotor field.
+ * This rotor points to the next metaslab group where allocations will be
+ * attempted. Allocating a block is a 3-step process -- select the metaslab
+ * group, select the metaslab, and then allocate the block. The metaslab
+ * class defines the low-level block allocator that will be used as the
+ * final step in allocation. These allocators are pluggable allowing each class
+ * to use a block allocator that best suits that class.
+ */
+struct metaslab_class {
+ kmutex_t mc_lock;
+ spa_t *mc_spa;
+ metaslab_group_t *mc_rotor;
+ metaslab_ops_t *mc_ops;
+ uint64_t mc_aliquot;
+
+ /*
+ * Track the number of metaslab groups that have been initialized
+ * and can accept allocations. An initialized metaslab group is
+ * one that has been completely added to the config (i.e. we have
+ * updated the MOS config and the space has been added to the pool).
+ */
+ uint64_t mc_groups;
+
+ /*
+ * Toggle to enable/disable the allocation throttle.
+ */
+ boolean_t mc_alloc_throttle_enabled;
+
+ /*
+ * The allocation throttle works on a reservation system. Whenever
+ * an asynchronous zio wants to perform an allocation it must
+ * first reserve the number of blocks that it wants to allocate.
+ * If there aren't sufficient slots available for the pending zio
+ * then that I/O is throttled until more slots free up. The current
+ * number of reserved allocations is maintained by the mc_alloc_slots
+ * refcount. The mc_alloc_max_slots value determines the maximum
+ * number of allocations that the system allows. Gang blocks are
+ * allowed to reserve slots even if we've reached the maximum
+ * number of allocations allowed.
+ */
+ uint64_t *mc_alloc_max_slots;
+ zfs_refcount_t *mc_alloc_slots;
+
+ uint64_t mc_alloc_groups; /* # of allocatable groups */
+
+ uint64_t mc_alloc; /* total allocated space */
+ uint64_t mc_deferred; /* total deferred frees */
+ uint64_t mc_space; /* total space (alloc + free) */
+ uint64_t mc_dspace; /* total deflated space */
+ uint64_t mc_minblocksize;
+ uint64_t mc_histogram[RANGE_TREE_HISTOGRAM_SIZE];
+};
+
+/*
+ * Metaslab groups encapsulate all the allocatable regions (i.e. metaslabs)
+ * of a top-level vdev. They are linked together to form a circular linked
+ * list and can belong to only one metaslab class. Metaslab groups may become
+ * ineligible for allocations for a number of reasons such as limited free
+ * space, fragmentation, or going offline. When this happens the allocator will
+ * simply find the next metaslab group in the linked list and attempt
+ * to allocate from that group instead.
+ */
+struct metaslab_group {
+ kmutex_t mg_lock;
+ metaslab_t **mg_primaries;
+ metaslab_t **mg_secondaries;
+ avl_tree_t mg_metaslab_tree;
+ uint64_t mg_aliquot;
+ boolean_t mg_allocatable; /* can we allocate? */
+ uint64_t mg_ms_ready;
+
+ /*
+ * A metaslab group is considered to be initialized only after
+ * we have updated the MOS config and added the space to the pool.
+ * We only allow allocation attempts to a metaslab group if it
+ * has been initialized.
+ */
+ boolean_t mg_initialized;
+
+ uint64_t mg_free_capacity; /* percentage free */
+ int64_t mg_bias;
+ int64_t mg_activation_count;
+ metaslab_class_t *mg_class;
+ vdev_t *mg_vd;
+ taskq_t *mg_taskq;
+ metaslab_group_t *mg_prev;
+ metaslab_group_t *mg_next;
+
+ /*
+ * In order for the allocation throttle to function properly, we cannot
+ * have too many IOs going to each disk by default; the throttle
+ * operates by allocating more work to disks that finish quickly, so
+ * allocating larger chunks to each disk reduces its effectiveness.
+ * However, if the number of IOs going to each allocator is too small,
+ * we will not perform proper aggregation at the vdev_queue layer,
+ * also resulting in decreased performance. Therefore, we will use a
+ * ramp-up strategy.
+ *
+ * Each allocator in each metaslab group has a current queue depth
+ * (mg_alloc_queue_depth[allocator]) and a current max queue depth
+ * (mg_cur_max_alloc_queue_depth[allocator]), and each metaslab group
+ * has an absolute max queue depth (mg_max_alloc_queue_depth). We
+ * add IOs to an allocator until the mg_alloc_queue_depth for that
+ * allocator hits the cur_max. Every time an IO completes for a given
+ * allocator on a given metaslab group, we increment its cur_max until
+ * it reaches mg_max_alloc_queue_depth. The cur_max resets every txg to
+ * help protect against disks that decrease in performance over time.
+ *
+ * It's possible for an allocator to handle more allocations than
+ * its max. This can occur when gang blocks are required or when other
+ * groups are unable to handle their share of allocations.
+ */
+ uint64_t mg_max_alloc_queue_depth;
+ uint64_t *mg_cur_max_alloc_queue_depth;
+ zfs_refcount_t *mg_alloc_queue_depth;
+ int mg_allocators;
+ /*
+ * A metaslab group that can no longer allocate the minimum block
+ * size will set mg_no_free_space. Once a metaslab group is out
+ * of space then its share of work must be distributed to other
+ * groups.
+ */
+ boolean_t mg_no_free_space;
+
+ uint64_t mg_allocations;
+ uint64_t mg_failed_allocations;
+ uint64_t mg_fragmentation;
+ uint64_t mg_histogram[RANGE_TREE_HISTOGRAM_SIZE];
+
+ int mg_ms_initializing;
+ boolean_t mg_initialize_updating;
+ kmutex_t mg_ms_initialize_lock;
+ kcondvar_t mg_ms_initialize_cv;
+};
+
+/*
+ * This value defines the number of elements in the ms_lbas array. The value
+ * of 64 was chosen as it covers all power-of-2 buckets up to UINT64_MAX.
+ * This is the equivalent of highbit(UINT64_MAX).
+ */
+#define MAX_LBAS 64
+
+/*
+ * Each metaslab maintains a set of in-core trees to track metaslab
+ * operations. The in-core free tree (ms_allocatable) contains the list of
+ * free segments which are eligible for allocation. As blocks are
+ * allocated, the allocated segment are removed from the ms_allocatable and
+ * added to a per txg allocation tree (ms_allocating). As blocks are
+ * freed, they are added to the free tree (ms_freeing). These trees
+ * allow us to process all allocations and frees in syncing context
+ * where it is safe to update the on-disk space maps. An additional set
+ * of in-core trees is maintained to track deferred frees
+ * (ms_defer). Once a block is freed it will move from the
+ * ms_freed to the ms_defer tree. A deferred free means that a block
+ * has been freed but cannot be used by the pool until TXG_DEFER_SIZE
+ * transaction groups later. For example, a block that is freed in txg
+ * 50 will not be available for reallocation until txg 52 (50 +
+ * TXG_DEFER_SIZE). This provides a safety net for uberblock rollback.
+ * A pool could be safely rolled back TXG_DEFER_SIZE transaction
+ * groups and ensure that no block has been reallocated.
+ *
+ * The simplified transition diagram looks like this:
+ *
+ *
+ * ALLOCATE
+ * |
+ * V
+ * free segment (ms_allocatable) -> ms_allocating[4] -> (write to space map)
+ * ^
+ * | ms_freeing <--- FREE
+ * | |
+ * | v
+ * | ms_freed
+ * | |
+ * +-------- ms_defer[2] <-------+-------> (write to space map)
+ *
+ *
+ * Each metaslab's space is tracked in a single space map in the MOS,
+ * which is only updated in syncing context. Each time we sync a txg,
+ * we append the allocs and frees from that txg to the space map. The
+ * pool space is only updated once all metaslabs have finished syncing.
+ *
+ * To load the in-core free tree we read the space map from disk. This
+ * object contains a series of alloc and free records that are combined
+ * to make up the list of all free segments in this metaslab. These
+ * segments are represented in-core by the ms_allocatable and are stored
+ * in an AVL tree.
+ *
+ * As the space map grows (as a result of the appends) it will
+ * eventually become space-inefficient. When the metaslab's in-core
+ * free tree is zfs_condense_pct/100 times the size of the minimal
+ * on-disk representation, we rewrite it in its minimized form. If a
+ * metaslab needs to condense then we must set the ms_condensing flag to
+ * ensure that allocations are not performed on the metaslab that is
+ * being written.
+ */
+struct metaslab {
+ /*
+ * This is the main lock of the metaslab and its purpose is to
+ * coordinate our allocations and frees [e.g. metaslab_block_alloc(),
+ * metaslab_free_concrete(), ..etc] with our various syncing
+ * procedures [e.g. metaslab_sync(), metaslab_sync_done(), ..etc].
+ *
+ * The lock is also used during some miscellaneous operations like
+ * using the metaslab's histogram for the metaslab group's histogram
+ * aggregation, or marking the metaslab for initialization.
+ */
+ kmutex_t ms_lock;
+
+ /*
+ * Acquired together with the ms_lock whenever we expect to
+ * write to metaslab data on-disk (i.e. flushing entries to
+ * the metaslab's space map). It helps coordinate readers of
+ * the metaslab's space map [see spa_vdev_remove_thread()]
+ * with writers [see metaslab_sync()].
+ *
+ * Note that metaslab_load(), even though a reader, uses
+ * a completely different mechanism to deal with the reading
+ * of the metaslab's space map based on ms_synced_length. That
+ * said, the function still uses the ms_sync_lock after it
+ * has read the ms_sm [see relevant comment in metaslab_load()
+ * as to why].
+ */
+ kmutex_t ms_sync_lock;
+
+ kcondvar_t ms_load_cv;
+ space_map_t *ms_sm;
+ uint64_t ms_id;
+ uint64_t ms_start;
+ uint64_t ms_size;
+ uint64_t ms_fragmentation;
+
+ range_tree_t *ms_allocating[TXG_SIZE];
+ range_tree_t *ms_allocatable;
+ uint64_t ms_allocated_this_txg;
+
+ /*
+ * The following range trees are accessed only from syncing context.
+ * ms_free*tree only have entries while syncing, and are empty
+ * between syncs.
+ */
+ range_tree_t *ms_freeing; /* to free this syncing txg */
+ range_tree_t *ms_freed; /* already freed this syncing txg */
+ range_tree_t *ms_defer[TXG_DEFER_SIZE];
+ range_tree_t *ms_checkpointing; /* to add to the checkpoint */
+
+ boolean_t ms_condensing; /* condensing? */
+ boolean_t ms_condense_wanted;
+ uint64_t ms_condense_checked_txg;
+
+ uint64_t ms_initializing; /* leaves initializing this ms */
+
+ /*
+ * We must always hold the ms_lock when modifying ms_loaded
+ * and ms_loading.
+ */
+ boolean_t ms_loaded;
+ boolean_t ms_loading;
+
+ /*
+ * The following histograms count entries that are in the
+ * metaslab's space map (and its histogram) but are not in
+ * ms_allocatable yet, because they are in ms_freed, ms_freeing,
+ * or ms_defer[].
+ *
+ * When the metaslab is not loaded, its ms_weight needs to
+ * reflect what is allocatable (i.e. what will be part of
+ * ms_allocatable if it is loaded). The weight is computed from
+ * the spacemap histogram, but that includes ranges that are
+ * not yet allocatable (because they are in ms_freed,
+ * ms_freeing, or ms_defer[]). Therefore, when calculating the
+ * weight, we need to remove those ranges.
+ *
+ * The ranges in the ms_freed and ms_defer[] range trees are all
+ * present in the spacemap. However, the spacemap may have
+ * multiple entries to represent a contiguous range, because it
+ * is written across multiple sync passes, but the changes of
+ * all sync passes are consolidated into the range trees.
+ * Adjacent ranges that are freed in different sync passes of
+ * one txg will be represented separately (as 2 or more entries)
+ * in the space map (and its histogram), but these adjacent
+ * ranges will be consolidated (represented as one entry) in the
+ * ms_freed/ms_defer[] range trees (and their histograms).
+ *
+ * When calculating the weight, we can not simply subtract the
+ * range trees' histograms from the spacemap's histogram,
+ * because the range trees' histograms may have entries in
+ * higher buckets than the spacemap, due to consolidation.
+ * Instead we must subtract the exact entries that were added to
+ * the spacemap's histogram. ms_synchist and ms_deferhist[]
+ * represent these exact entries, so we can subtract them from
+ * the spacemap's histogram when calculating ms_weight.
+ *
+ * ms_synchist represents the same ranges as ms_freeing +
+ * ms_freed, but without consolidation across sync passes.
+ *
+ * ms_deferhist[i] represents the same ranges as ms_defer[i],
+ * but without consolidation across sync passes.
+ */
+ uint64_t ms_synchist[SPACE_MAP_HISTOGRAM_SIZE];
+ uint64_t ms_deferhist[TXG_DEFER_SIZE][SPACE_MAP_HISTOGRAM_SIZE];
+
+ /*
+ * Tracks the exact amount of allocated space of this metaslab
+ * (and specifically the metaslab's space map) up to the most
+ * recently completed sync pass [see usage in metaslab_sync()].
+ */
+ uint64_t ms_allocated_space;
+ int64_t ms_deferspace; /* sum of ms_defer[] space */
+ uint64_t ms_weight; /* weight vs. others in group */
+ uint64_t ms_activation_weight; /* activation weight */
+
+ /*
+ * Tracks the txg in which this metaslab was last selected for
+ * loading or allocation. We use this value to determine how long
+ * the metaslab should stay cached.
+ */
+ uint64_t ms_selected_txg;
+
+ uint64_t ms_alloc_txg; /* last successful alloc (debug only) */
+ uint64_t ms_max_size; /* maximum allocatable size */
+
+ /*
+ * -1 if it's not active in an allocator, otherwise set to the allocator
+ * this metaslab is active for.
+ */
+ int ms_allocator;
+ boolean_t ms_primary; /* Only valid if ms_allocator is not -1 */
+
+ /*
+ * The metaslab block allocators can optionally use a size-ordered
+ * range tree and/or an array of LBAs. Not all allocators use
+ * this functionality. The ms_allocatable_by_size should always
+ * contain the same number of segments as the ms_allocatable. The
+ * only difference is that the ms_allocatable_by_size is ordered by
+ * segment sizes.
+ */
+ avl_tree_t ms_allocatable_by_size;
+ uint64_t ms_lbas[MAX_LBAS];
+
+ metaslab_group_t *ms_group; /* metaslab group */
+ avl_node_t ms_group_node; /* node in metaslab group tree */
+ txg_node_t ms_txg_node; /* per-txg dirty metaslab links */
+
+ /* updated every time we are done syncing the metaslab's space map */
+ uint64_t ms_synced_length;
+
+ boolean_t ms_new;
+};
+
+#ifdef __cplusplus
+}
+#endif
+
+#endif /* _SYS_METASLAB_IMPL_H */
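
A worked example of the segment-based weight layout documented above, using
the WEIGHT_* accessors; the ASSERT macros are assumed from zfs_context.h and
the bucket/count values are illustrative.

    static void
    example_weight(void)
    {
            uint64_t weight = 0;

            /*
             * Largest free segments fall in the 2^20 (1 MiB) bucket
             * and there are 37 of them; the metaslab is not activated.
             */
            WEIGHT_SET_INDEX(weight, 20);
            WEIGHT_SET_COUNT(weight, 37);
            WEIGHT_SET_ACTIVE(weight, 0);

            /* Bit 60 is clear, so the weight decodes as segment-based. */
            ASSERT(!WEIGHT_IS_SPACEBASED(weight));
            ASSERT3U(WEIGHT_GET_INDEX(weight), ==, 20);
            ASSERT3U(WEIGHT_GET_COUNT(weight), ==, 37);
    }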
diff --git a/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/mmp.h b/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/mmp.h
new file mode 100644
index 000000000000..527e3323b4b9
--- /dev/null
+++ b/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/mmp.h
@@ -0,0 +1,74 @@
+/*
+ * CDDL HEADER START
+ *
+ * This file and its contents are supplied under the terms of the
+ * Common Development and Distribution License ("CDDL"), version 1.0.
+ * You may only use this file in accordance with the terms of version
+ * 1.0 of the CDDL.
+ *
+ * A full copy of the text of the CDDL should have accompanied this
+ * source. A copy of the CDDL is also available via the Internet at
+ * http://www.illumos.org/license/CDDL.
+ *
+ * CDDL HEADER END
+ */
+/*
+ * Copyright (C) 2017 by Lawrence Livermore National Security, LLC.
+ */
+
+#ifndef _SYS_MMP_H
+#define _SYS_MMP_H
+
+#include <sys/spa.h>
+#include <sys/zfs_context.h>
+#include <sys/uberblock_impl.h>
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+#define MMP_MIN_INTERVAL 100 /* ms */
+#define MMP_DEFAULT_INTERVAL 1000 /* ms */
+#define MMP_DEFAULT_IMPORT_INTERVALS 20
+#define MMP_DEFAULT_FAIL_INTERVALS 10
+#define MMP_MIN_FAIL_INTERVALS 2 /* min if != 0 */
+#define MMP_IMPORT_SAFETY_FACTOR 200 /* pct */
+#define MMP_INTERVAL_OK(interval) MAX((interval), MMP_MIN_INTERVAL)
+#define MMP_FAIL_INTVS_OK(fails) ((fails) == 0 ? 0 : MAX((fails), \
+ MMP_MIN_FAIL_INTERVALS))
+
+typedef struct mmp_thread {
+ kmutex_t mmp_thread_lock; /* protect thread mgmt fields */
+ kcondvar_t mmp_thread_cv;
+ kthread_t *mmp_thread;
+ uint8_t mmp_thread_exiting;
+ kmutex_t mmp_io_lock; /* protect below */
+ hrtime_t mmp_last_write; /* last successful MMP write */
+ uint64_t mmp_delay; /* decaying avg ns between MMP writes */
+ uberblock_t mmp_ub; /* last ub written by sync */
+ zio_t *mmp_zio_root; /* root of mmp write zios */
+ uint64_t mmp_kstat_id; /* unique id for next MMP write kstat */
+ int mmp_skip_error; /* reason for last skipped write */
+ vdev_t *mmp_last_leaf; /* last leaf an mmp write was sent to */
+ uint64_t mmp_leaf_last_gen; /* leaf list generation at that write */
+ uint32_t mmp_seq; /* intra-second update counter */
+} mmp_thread_t;
+
+
+extern void mmp_init(struct spa *spa);
+extern void mmp_fini(struct spa *spa);
+extern void mmp_thread_start(struct spa *spa);
+extern void mmp_thread_stop(struct spa *spa);
+extern void mmp_update_uberblock(struct spa *spa, struct uberblock *ub);
+extern void mmp_signal_all_threads(void);
+
+/* Global tuning */
+extern ulong_t zfs_multihost_interval;
+extern uint_t zfs_multihost_fail_intervals;
+extern uint_t zfs_multihost_import_intervals;
+
+#ifdef __cplusplus
+}
+#endif
+
+#endif /* _SYS_MMP_H */
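
Worked arithmetic for the tunables above, under the usual multihost behavior
(an assumption of this note, not stated in the header): the pool suspends if
no MMP write succeeds within fail_intervals * interval milliseconds.

    static void
    example_mmp_windows(void)
    {
            /* interval = MAX(1000, 100) = 1000 ms. */
            uint64_t interval = MMP_INTERVAL_OK(MMP_DEFAULT_INTERVAL);
            /* fails = MAX(10, 2) = 10 intervals. */
            uint64_t fails = MMP_FAIL_INTVS_OK(MMP_DEFAULT_FAIL_INTERVALS);

            /*
             * Suspension window: fails * interval = 10 * 1000 ms = 10 s
             * without a successful MMP write. An importing host pads its
             * activity check by MMP_IMPORT_SAFETY_FACTOR (200 pct), i.e.
             * it waits roughly twice the expected write period.
             */
            (void) interval;
            (void) fails;
    }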
diff --git a/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/multilist.h b/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/multilist.h
new file mode 100644
index 000000000000..a3b44e60eb97
--- /dev/null
+++ b/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/multilist.h
@@ -0,0 +1,107 @@
+/*
+ * CDDL HEADER START
+ *
+ * This file and its contents are supplied under the terms of the
+ * Common Development and Distribution License ("CDDL"), version 1.0.
+ * You may only use this file in accordance with the terms of version
+ * 1.0 of the CDDL.
+ *
+ * A full copy of the text of the CDDL should have accompanied this
+ * source. A copy of the CDDL is also available via the Internet at
+ * http://www.illumos.org/license/CDDL.
+ *
+ * CDDL HEADER END
+ */
+/*
+ * Copyright (c) 2013, 2017 by Delphix. All rights reserved.
+ */
+
+#ifndef _SYS_MULTILIST_H
+#define _SYS_MULTILIST_H
+
+#include <sys/zfs_context.h>
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+typedef list_node_t multilist_node_t;
+typedef struct multilist multilist_t;
+typedef struct multilist_sublist multilist_sublist_t;
+typedef unsigned int multilist_sublist_index_func_t(multilist_t *, void *);
+
+struct multilist_sublist {
+ /*
+ * The mutex used internally to implement thread safe insertions
+ * and removals to this individual sublist. It can also be locked
+ * by a consumer using multilist_sublist_{lock,unlock}, which is
+ * useful if a consumer needs to traverse the list in a thread
+ * safe manner.
+ */
+ kmutex_t mls_lock;
+ /*
+ * The actual list object containing all objects in this sublist.
+ */
+ list_t mls_list;
+ /*
+ * Pad out to a cache line in an effort to prevent cache line
+ * contention.
+ */
+} __aligned(CACHE_LINE_SIZE);
+
+struct multilist {
+ /*
+ * This is used to get to the multilist_node_t structure given
+ * the void *object contained on the list.
+ */
+ size_t ml_offset;
+ /*
+ * The number of sublists used internally by this multilist.
+ */
+ uint64_t ml_num_sublists;
+ /*
+ * The array of pointers to the actual sublists.
+ */
+ multilist_sublist_t *ml_sublists;
+ /*
+ * Pointer to function which determines the sublist to use
+ * when inserting and removing objects from this multilist.
+ * Please see the comment above multilist_create for details.
+ */
+ multilist_sublist_index_func_t *ml_index_func;
+};
+
+void multilist_destroy(multilist_t *);
+multilist_t *multilist_create(size_t, size_t, multilist_sublist_index_func_t *);
+
+void multilist_insert(multilist_t *, void *);
+void multilist_remove(multilist_t *, void *);
+int multilist_is_empty(multilist_t *);
+
+unsigned int multilist_get_num_sublists(multilist_t *);
+unsigned int multilist_get_random_index(multilist_t *);
+
+multilist_sublist_t *multilist_sublist_lock(multilist_t *, unsigned int);
+multilist_sublist_t *multilist_sublist_lock_obj(multilist_t *, void *);
+void multilist_sublist_unlock(multilist_sublist_t *);
+
+void multilist_sublist_insert_head(multilist_sublist_t *, void *);
+void multilist_sublist_insert_tail(multilist_sublist_t *, void *);
+void multilist_sublist_move_forward(multilist_sublist_t *mls, void *obj);
+void multilist_sublist_remove(multilist_sublist_t *, void *);
+int multilist_sublist_is_empty(multilist_sublist_t *);
+int multilist_sublist_is_empty_idx(multilist_t *, unsigned int);
+
+void *multilist_sublist_head(multilist_sublist_t *);
+void *multilist_sublist_tail(multilist_sublist_t *);
+void *multilist_sublist_next(multilist_sublist_t *, void *);
+void *multilist_sublist_prev(multilist_sublist_t *, void *);
+
+void multilist_link_init(multilist_node_t *);
+int multilist_link_active(multilist_node_t *);
+
+#ifdef __cplusplus
+}
+#endif
+
+#endif /* _SYS_MULTILIST_H */
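
A sketch of the intended usage, built around an illustrative object type: the
multilist_node_t is embedded in the object, the index function
deterministically spreads objects across sublists, and traversal locks one
sublist at a time.

    typedef struct example_obj {
            multilist_node_t eo_node;       /* embedded link */
            uint64_t eo_key;
    } example_obj_t;

    /* Must return the same sublist index for a given object every time. */
    static unsigned int
    example_index_func(multilist_t *ml, void *obj)
    {
            example_obj_t *eo = obj;

            return (eo->eo_key % multilist_get_num_sublists(ml));
    }

    static multilist_t *
    example_create(void)
    {
            return (multilist_create(sizeof (example_obj_t),
                offsetof(example_obj_t, eo_node), example_index_func));
    }

    static void
    example_traverse(multilist_t *ml)
    {
            multilist_sublist_t *mls;
            void *obj;

            /* Lock sublist 0 for a thread-safe walk. */
            mls = multilist_sublist_lock(ml, 0);
            for (obj = multilist_sublist_head(mls); obj != NULL;
                obj = multilist_sublist_next(mls, obj)) {
                    /* ... visit obj ... */
            }
            multilist_sublist_unlock(mls);
    }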
diff --git a/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/range_tree.h b/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/range_tree.h
new file mode 100644
index 000000000000..bbdf66cade63
--- /dev/null
+++ b/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/range_tree.h
@@ -0,0 +1,124 @@
+/*
+ * CDDL HEADER START
+ *
+ * The contents of this file are subject to the terms of the
+ * Common Development and Distribution License (the "License").
+ * You may not use this file except in compliance with the License.
+ *
+ * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
+ * or http://www.opensolaris.org/os/licensing.
+ * See the License for the specific language governing permissions
+ * and limitations under the License.
+ *
+ * When distributing Covered Code, include this CDDL HEADER in each
+ * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
+ * If applicable, add the following below this CDDL HEADER, with the
+ * fields enclosed by brackets "[]" replaced with your own identifying
+ * information: Portions Copyright [yyyy] [name of copyright owner]
+ *
+ * CDDL HEADER END
+ */
+/*
+ * Copyright 2009 Sun Microsystems, Inc. All rights reserved.
+ * Use is subject to license terms.
+ */
+
+/*
+ * Copyright (c) 2013, 2017 by Delphix. All rights reserved.
+ */
+
+#ifndef _SYS_RANGE_TREE_H
+#define _SYS_RANGE_TREE_H
+
+#include <sys/avl.h>
+#include <sys/dmu.h>
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+#define RANGE_TREE_HISTOGRAM_SIZE 64
+
+typedef struct range_tree_ops range_tree_ops_t;
+
+/*
+ * Note: the range_tree may not be accessed concurrently; consumers
+ * must provide external locking if required.
+ */
+typedef struct range_tree {
+ avl_tree_t rt_root; /* offset-ordered segment AVL tree */
+ uint64_t rt_space; /* sum of all segments in the map */
+ range_tree_ops_t *rt_ops;
+ void *rt_arg;
+
+ uint64_t rt_gap; /* allowable inter-segment gap */
+ /* rt_avl_compare should only be set if rt_arg is an AVL tree */
+ int (*rt_avl_compare)(const void *, const void *);
+ /*
+ * The rt_histogram maintains a histogram of ranges. Each bucket,
+ * rt_histogram[i], contains the number of ranges whose size is:
+ * 2^i <= size of range in bytes < 2^(i+1)
+ */
+ uint64_t rt_histogram[RANGE_TREE_HISTOGRAM_SIZE];
+} range_tree_t;
+
+typedef struct range_seg {
+ avl_node_t rs_node; /* AVL node */
+ avl_node_t rs_pp_node; /* AVL picker-private node */
+ uint64_t rs_start; /* starting offset of this segment */
+ uint64_t rs_end; /* ending offset (non-inclusive) */
+ uint64_t rs_fill; /* actual fill if gap mode is on */
+} range_seg_t;
+
+struct range_tree_ops {
+ void (*rtop_create)(range_tree_t *rt, void *arg);
+ void (*rtop_destroy)(range_tree_t *rt, void *arg);
+ void (*rtop_add)(range_tree_t *rt, range_seg_t *rs, void *arg);
+ void (*rtop_remove)(range_tree_t *rt, range_seg_t *rs, void *arg);
+ void (*rtop_vacate)(range_tree_t *rt, void *arg);
+};
+
+typedef void range_tree_func_t(void *arg, uint64_t start, uint64_t size);
+
+void range_tree_init(void);
+void range_tree_fini(void);
+range_tree_t *range_tree_create_impl(range_tree_ops_t *ops, void *arg,
+ int (*avl_compare)(const void*, const void*), uint64_t gap);
+range_tree_t *range_tree_create(range_tree_ops_t *ops, void *arg);
+void range_tree_destroy(range_tree_t *rt);
+boolean_t range_tree_contains(range_tree_t *rt, uint64_t start, uint64_t size);
+void range_tree_verify_not_present(range_tree_t *rt,
+ uint64_t start, uint64_t size);
+range_seg_t *range_tree_find(range_tree_t *rt, uint64_t start, uint64_t size);
+void range_tree_resize_segment(range_tree_t *rt, range_seg_t *rs,
+ uint64_t newstart, uint64_t newsize);
+uint64_t range_tree_space(range_tree_t *rt);
+boolean_t range_tree_is_empty(range_tree_t *rt);
+void range_tree_swap(range_tree_t **rtsrc, range_tree_t **rtdst);
+void range_tree_stat_verify(range_tree_t *rt);
+uint64_t range_tree_min(range_tree_t *rt);
+uint64_t range_tree_max(range_tree_t *rt);
+uint64_t range_tree_span(range_tree_t *rt);
+
+void range_tree_add(void *arg, uint64_t start, uint64_t size);
+void range_tree_remove(void *arg, uint64_t start, uint64_t size);
+void range_tree_remove_fill(range_tree_t *rt, uint64_t start, uint64_t size);
+void range_tree_adjust_fill(range_tree_t *rt, range_seg_t *rs, int64_t delta);
+void range_tree_clear(range_tree_t *rt, uint64_t start, uint64_t size);
+
+void range_tree_vacate(range_tree_t *rt, range_tree_func_t *func, void *arg);
+void range_tree_walk(range_tree_t *rt, range_tree_func_t *func, void *arg);
+range_seg_t *range_tree_first(range_tree_t *rt);
+
+void rt_avl_create(range_tree_t *rt, void *arg);
+void rt_avl_destroy(range_tree_t *rt, void *arg);
+void rt_avl_add(range_tree_t *rt, range_seg_t *rs, void *arg);
+void rt_avl_remove(range_tree_t *rt, range_seg_t *rs, void *arg);
+void rt_avl_vacate(range_tree_t *rt, void *arg);
+extern struct range_tree_ops rt_avl_ops;
+
+#ifdef __cplusplus
+}
+#endif
+
+#endif /* _SYS_RANGE_TREE_H */
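
A sketch of the basic add/remove/walk flow. Note that range_tree_add() and
range_tree_remove() take a void * first argument so they can double as
rtop-style callbacks; the merging and splitting behavior in the comments is
assumed from the usual implementation.

    static void
    example_visit(void *arg, uint64_t start, uint64_t size)
    {
            /* Called once per segment: [start, start + size). */
    }

    static void
    example_range_tree(void)
    {
            range_tree_t *rt = range_tree_create(NULL, NULL);

            range_tree_add(rt, 0, 100);     /* tree: [0, 100) */
            range_tree_add(rt, 100, 50);    /* adjacent, merges: [0, 150) */
            range_tree_remove(rt, 20, 10);  /* splits: [0, 20), [30, 150) */

            range_tree_walk(rt, example_visit, NULL);

            range_tree_vacate(rt, NULL, NULL); /* empty before destroy */
            range_tree_destroy(rt);
    }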
diff --git a/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/refcount.h b/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/refcount.h
new file mode 100644
index 000000000000..f1fd04792fef
--- /dev/null
+++ b/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/refcount.h
@@ -0,0 +1,125 @@
+/*
+ * CDDL HEADER START
+ *
+ * The contents of this file are subject to the terms of the
+ * Common Development and Distribution License (the "License").
+ * You may not use this file except in compliance with the License.
+ *
+ * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
+ * or http://www.opensolaris.org/os/licensing.
+ * See the License for the specific language governing permissions
+ * and limitations under the License.
+ *
+ * When distributing Covered Code, include this CDDL HEADER in each
+ * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
+ * If applicable, add the following below this CDDL HEADER, with the
+ * fields enclosed by brackets "[]" replaced with your own identifying
+ * information: Portions Copyright [yyyy] [name of copyright owner]
+ *
+ * CDDL HEADER END
+ */
+/*
+ * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved.
+ * Copyright (c) 2012, 2015 by Delphix. All rights reserved.
+ */
+
+#ifndef _SYS_REFCOUNT_H
+#define _SYS_REFCOUNT_H
+
+#include <sys/cdefs.h>
+#include <sys/types.h>
+/* For FreeBSD refcount(9). */
+#include_next <sys/refcount.h>
+#include <sys/list.h>
+#include <sys/zfs_context.h>
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+/*
+ * If the reference is held only by the calling function and not any
+ * particular object, use FTAG (which is a string) for the holder_tag.
+ * Otherwise, use the object that holds the reference.
+ */
+#define FTAG ((char *)(uintptr_t)__func__)
+
+#ifdef ZFS_DEBUG
+typedef struct reference {
+ list_node_t ref_link;
+ void *ref_holder;
+ uint64_t ref_number;
+ uint8_t *ref_removed;
+} reference_t;
+
+typedef struct refcount {
+ kmutex_t rc_mtx;
+ boolean_t rc_tracked;
+ list_t rc_list;
+ list_t rc_removed;
+ uint64_t rc_count;
+ uint64_t rc_removed_count;
+} zfs_refcount_t;
+
+/*
+ * Note: zfs_refcount_t must be initialized with
+ * zfs_refcount_create[_untracked]()
+ */
+
+void zfs_refcount_create(zfs_refcount_t *);
+void zfs_refcount_create_untracked(zfs_refcount_t *);
+void zfs_refcount_create_tracked(zfs_refcount_t *);
+void zfs_refcount_destroy(zfs_refcount_t *);
+void zfs_refcount_destroy_many(zfs_refcount_t *, uint64_t);
+int zfs_refcount_is_zero(zfs_refcount_t *);
+int64_t zfs_refcount_count(zfs_refcount_t *);
+int64_t zfs_refcount_add(zfs_refcount_t *, void *);
+int64_t zfs_refcount_remove(zfs_refcount_t *, void *);
+int64_t zfs_refcount_add_many(zfs_refcount_t *, uint64_t, void *);
+int64_t zfs_refcount_remove_many(zfs_refcount_t *, uint64_t, void *);
+void zfs_refcount_transfer(zfs_refcount_t *, zfs_refcount_t *);
+void zfs_refcount_transfer_ownership(zfs_refcount_t *, void *, void *);
+boolean_t zfs_refcount_held(zfs_refcount_t *, void *);
+boolean_t zfs_refcount_not_held(zfs_refcount_t *, void *);
+
+void zfs_refcount_init(void);
+void zfs_refcount_fini(void);
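+
+/*
+ * Illustrative sketch (not part of the original interface): the typical
+ * lifecycle of a tracked reference count, using FTAG as the holder tag.
+ *
+ *	zfs_refcount_t rc;
+ *
+ *	zfs_refcount_create(&rc);
+ *	(void) zfs_refcount_add(&rc, FTAG);
+ *	...
+ *	(void) zfs_refcount_remove(&rc, FTAG);
+ *	zfs_refcount_destroy(&rc);
+ */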
+
+#else /* ZFS_DEBUG */
+
+typedef struct refcount {
+ uint64_t rc_count;
+} zfs_refcount_t;
+
+#define zfs_refcount_create(rc) ((rc)->rc_count = 0)
+#define zfs_refcount_create_untracked(rc) ((rc)->rc_count = 0)
+#define zfs_refcount_create_tracked(rc) ((rc)->rc_count = 0)
+#define zfs_refcount_destroy(rc) ((rc)->rc_count = 0)
+#define zfs_refcount_destroy_many(rc, number) ((rc)->rc_count = 0)
+#define zfs_refcount_is_zero(rc) ((rc)->rc_count == 0)
+#define zfs_refcount_count(rc) ((rc)->rc_count)
+#define zfs_refcount_add(rc, holder) atomic_inc_64_nv(&(rc)->rc_count)
+#define zfs_refcount_remove(rc, holder) atomic_dec_64_nv(&(rc)->rc_count)
+#define zfs_refcount_add_many(rc, number, holder) \
+ atomic_add_64_nv(&(rc)->rc_count, number)
+#define zfs_refcount_remove_many(rc, number, holder) \
+ atomic_add_64_nv(&(rc)->rc_count, -number)
+#define zfs_refcount_transfer(dst, src) { \
+ uint64_t __tmp = (src)->rc_count; \
+ atomic_add_64(&(src)->rc_count, -__tmp); \
+ atomic_add_64(&(dst)->rc_count, __tmp); \
+}
+#define zfs_refcount_transfer_ownership(rc, current_holder, new_holder) (void)0
+#define zfs_refcount_held(rc, holder) ((rc)->rc_count > 0)
+#define zfs_refcount_not_held(rc, holder) (B_TRUE)
+
+#define zfs_refcount_init()
+#define zfs_refcount_fini()
+
+#endif /* ZFS_DEBUG */
+
+#ifdef __cplusplus
+}
+#endif
+
+#endif /* _SYS_REFCOUNT_H */
diff --git a/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/rrwlock.h b/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/rrwlock.h
new file mode 100644
index 000000000000..e0898dfe0ae8
--- /dev/null
+++ b/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/rrwlock.h
@@ -0,0 +1,112 @@
+/*
+ * CDDL HEADER START
+ *
+ * The contents of this file are subject to the terms of the
+ * Common Development and Distribution License (the "License").
+ * You may not use this file except in compliance with the License.
+ *
+ * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
+ * or http://www.opensolaris.org/os/licensing.
+ * See the License for the specific language governing permissions
+ * and limitations under the License.
+ *
+ * When distributing Covered Code, include this CDDL HEADER in each
+ * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
+ * If applicable, add the following below this CDDL HEADER, with the
+ * fields enclosed by brackets "[]" replaced with your own identifying
+ * information: Portions Copyright [yyyy] [name of copyright owner]
+ *
+ * CDDL HEADER END
+ */
+/*
+ * Copyright 2007 Sun Microsystems, Inc. All rights reserved.
+ * Use is subject to license terms.
+ */
+/*
+ * Copyright (c) 2012 by Delphix. All rights reserved.
+ */
+
+#ifndef _SYS_RR_RW_LOCK_H
+#define _SYS_RR_RW_LOCK_H
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+#include <sys/zfs_context.h>
+#include <sys/refcount.h>
+
+/*
+ * A reader-writer lock implementation that allows re-entrant reads, but
+ * still gives writers priority on "new" reads.
+ *
+ * See rrwlock.c for more details about the implementation.
+ *
+ * Fields of the rrwlock_t structure:
+ * - rr_lock: protects modification and reading of rrwlock_t fields
+ * - rr_cv: cv for waking up readers or waiting writers
+ * - rr_writer: thread id of the current writer
+ * - rr_anon_rcount: number of active anonymous readers
+ * - rr_linked_rcount: total number of non-anonymous active readers
+ * - rr_writer_wanted: a writer wants the lock
+ * - rr_track_all: whether to track all readers (no anonymous holds)
+ */
+typedef struct rrwlock {
+ kmutex_t rr_lock;
+ kcondvar_t rr_cv;
+ kthread_t *rr_writer;
+ zfs_refcount_t rr_anon_rcount;
+ zfs_refcount_t rr_linked_rcount;
+ boolean_t rr_writer_wanted;
+ boolean_t rr_track_all;
+} rrwlock_t;
+
+/*
+ * 'tag' is used in reference counting tracking. The
+ * 'tag' must be the same in a rrw_enter() as in its
+ * corresponding rrw_exit().
+ */
+void rrw_init(rrwlock_t *rrl, boolean_t track_all);
+void rrw_destroy(rrwlock_t *rrl);
+void rrw_enter(rrwlock_t *rrl, krw_t rw, void *tag);
+void rrw_enter_read(rrwlock_t *rrl, void *tag);
+void rrw_enter_read_prio(rrwlock_t *rrl, void *tag);
+void rrw_enter_write(rrwlock_t *rrl);
+void rrw_exit(rrwlock_t *rrl, void *tag);
+boolean_t rrw_held(rrwlock_t *rrl, krw_t rw);
+void rrw_tsd_destroy(void *arg);
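+
+/*
+ * Illustrative sketch (not part of the original interface): taking and
+ * dropping a re-entrant read hold; the tag passed to rrw_exit() must
+ * match the one passed to rrw_enter().
+ *
+ *	rrw_enter(&rrl, RW_READER, FTAG);
+ *	...
+ *	rrw_exit(&rrl, FTAG);
+ */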
+
+#define RRW_READ_HELD(x) rrw_held(x, RW_READER)
+#define RRW_WRITE_HELD(x) rrw_held(x, RW_WRITER)
+#define RRW_LOCK_HELD(x) \
+ (rrw_held(x, RW_WRITER) || rrw_held(x, RW_READER))
+
+/*
+ * A reader-mostly lock implementation, layered above reader-writer locks
+ * and tuned for highly parallel read acquisitions, at the expense of
+ * write acquisitions.
+ *
+ * This should be a prime number. See comment in rrwlock.c near
+ * RRM_TD_LOCK() for details.
+ */
+#define RRM_NUM_LOCKS 17
+typedef struct rrmlock {
+ rrwlock_t locks[RRM_NUM_LOCKS];
+} rrmlock_t;
+
+void rrm_init(rrmlock_t *rrl, boolean_t track_all);
+void rrm_destroy(rrmlock_t *rrl);
+void rrm_enter(rrmlock_t *rrl, krw_t rw, void *tag);
+void rrm_enter_read(rrmlock_t *rrl, void *tag);
+void rrm_enter_write(rrmlock_t *rrl);
+void rrm_exit(rrmlock_t *rrl, void *tag);
+boolean_t rrm_held(rrmlock_t *rrl, krw_t rw);
+
+#define RRM_READ_HELD(x) rrm_held(x, RW_READER)
+#define RRM_WRITE_HELD(x) rrm_held(x, RW_WRITER)
+#define RRM_LOCK_HELD(x) \
+ (rrm_held(x, RW_WRITER) || rrm_held(x, RW_READER))
+
+#ifdef __cplusplus
+}
+#endif
+
+#endif /* _SYS_RR_RW_LOCK_H */
diff --git a/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/sa.h b/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/sa.h
new file mode 100644
index 000000000000..62332ea126a0
--- /dev/null
+++ b/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/sa.h
@@ -0,0 +1,170 @@
+/*
+ * CDDL HEADER START
+ *
+ * The contents of this file are subject to the terms of the
+ * Common Development and Distribution License (the "License").
+ * You may not use this file except in compliance with the License.
+ *
+ * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
+ * or http://www.opensolaris.org/os/licensing.
+ * See the License for the specific language governing permissions
+ * and limitations under the License.
+ *
+ * When distributing Covered Code, include this CDDL HEADER in each
+ * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
+ * If applicable, add the following below this CDDL HEADER, with the
+ * fields enclosed by brackets "[]" replaced with your own identifying
+ * information: Portions Copyright [yyyy] [name of copyright owner]
+ *
+ * CDDL HEADER END
+ */
+/*
+ * Copyright (c) 2010, Oracle and/or its affiliates. All rights reserved.
+ */
+
+#ifndef _SYS_SA_H
+#define _SYS_SA_H
+
+#include <sys/dmu.h>
+#include <sys/uio.h>
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+/*
+ * Currently available byteswap functions.
+ * If at all possible, new attributes should use
+ * one of the already defined byteswap functions.
+ * If a new byteswap function is added, then the
+ * ZPL/Pool version will need to be bumped.
+ */
+
+typedef enum sa_bswap_type {
+ SA_UINT64_ARRAY,
+ SA_UINT32_ARRAY,
+ SA_UINT16_ARRAY,
+ SA_UINT8_ARRAY,
+ SA_ACL,
+} sa_bswap_type_t;
+
+typedef uint16_t sa_attr_type_t;
+
+/*
+ * Attribute to register support for.
+ */
+typedef struct sa_attr_reg {
+ char *sa_name; /* attribute name */
+ uint16_t sa_length;
+	sa_bswap_type_t sa_byteswap; /* bswap function enum */
+ sa_attr_type_t sa_attr; /* filled in during registration */
+} sa_attr_reg_t;
+
+
+typedef void (sa_data_locator_t)(void **, uint32_t *, uint32_t,
+ boolean_t, void *userptr);
+
+/*
+ * Array of attributes to store.
+ *
+ * This array should be treated as opaque/private data.
+ * The SA_ADD_BULK_ATTR() macro should be used for manipulating
+ * the array.
+ *
+ * When sa_replace_all_by_template() is used, the attributes
+ * will be stored in the order defined in the array, except that
+ * the attributes may be split between the bonus and the spill buffer.
+ */
+typedef struct sa_bulk_attr {
+ void *sa_data;
+ sa_data_locator_t *sa_data_func;
+ uint16_t sa_length;
+ sa_attr_type_t sa_attr;
+ /* the following are private to the sa framework */
+ void *sa_addr;
+ uint16_t sa_buftype;
+ uint16_t sa_size;
+} sa_bulk_attr_t;
+
+
+/*
+ * Special macro for adding entries for bulk attr support.
+ * b - sa_bulk_attr_t array
+ * idx - integer index, incremented during each add
+ * attr - attribute to manipulate
+ * func - function for accessing data
+ * data - pointer to data
+ * len - length of data
+ */
+
+#define SA_ADD_BULK_ATTR(b, idx, attr, func, data, len) \
+{ \
+ b[idx].sa_attr = attr;\
+ b[idx].sa_data_func = func; \
+ b[idx].sa_data = data; \
+ b[idx++].sa_length = len; \
+}
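+
+/*
+ * Illustrative sketch (the attribute ids below are hypothetical, not
+ * defined in this header): building a bulk array and fetching both
+ * attributes in one call.
+ *
+ *	sa_bulk_attr_t bulk[2];
+ *	int count = 0;
+ *	uint64_t size, mtime[2];
+ *
+ *	SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_SIZE_ATTR, NULL, &size,
+ *	    sizeof (size));
+ *	SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_MTIME_ATTR, NULL, mtime,
+ *	    sizeof (mtime));
+ *	error = sa_bulk_lookup(hdl, bulk, count);
+ */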
+
+typedef struct sa_os sa_os_t;
+
+typedef enum sa_handle_type {
+ SA_HDL_SHARED,
+ SA_HDL_PRIVATE
+} sa_handle_type_t;
+
+struct sa_handle;
+typedef void *sa_lookup_tab_t;
+typedef struct sa_handle sa_handle_t;
+
+typedef void (sa_update_cb_t)(sa_handle_t *, dmu_tx_t *tx);
+
+int sa_handle_get(objset_t *, uint64_t, void *userp,
+ sa_handle_type_t, sa_handle_t **);
+int sa_handle_get_from_db(objset_t *, dmu_buf_t *, void *userp,
+ sa_handle_type_t, sa_handle_t **);
+void sa_handle_destroy(sa_handle_t *);
+int sa_buf_hold(objset_t *, uint64_t, void *, dmu_buf_t **);
+void sa_buf_rele(dmu_buf_t *, void *);
+int sa_lookup(sa_handle_t *, sa_attr_type_t, void *buf, uint32_t buflen);
+int sa_update(sa_handle_t *, sa_attr_type_t, void *buf,
+ uint32_t buflen, dmu_tx_t *);
+int sa_remove(sa_handle_t *, sa_attr_type_t, dmu_tx_t *);
+int sa_bulk_lookup(sa_handle_t *, sa_bulk_attr_t *, int count);
+int sa_bulk_lookup_locked(sa_handle_t *, sa_bulk_attr_t *, int count);
+int sa_bulk_update(sa_handle_t *, sa_bulk_attr_t *, int count, dmu_tx_t *);
+int sa_size(sa_handle_t *, sa_attr_type_t, int *);
+int sa_update_from_cb(sa_handle_t *, sa_attr_type_t,
+ uint32_t buflen, sa_data_locator_t *, void *userdata, dmu_tx_t *);
+void sa_object_info(sa_handle_t *, dmu_object_info_t *);
+void sa_object_size(sa_handle_t *, uint32_t *, u_longlong_t *);
+void *sa_get_userdata(sa_handle_t *);
+void sa_set_userp(sa_handle_t *, void *);
+dmu_buf_t *sa_get_db(sa_handle_t *);
+uint64_t sa_handle_object(sa_handle_t *);
+boolean_t sa_attr_would_spill(sa_handle_t *, sa_attr_type_t, int size);
+void sa_register_update_callback(objset_t *, sa_update_cb_t *);
+int sa_setup(objset_t *, uint64_t, sa_attr_reg_t *, int, sa_attr_type_t **);
+void sa_tear_down(objset_t *);
+int sa_replace_all_by_template(sa_handle_t *, sa_bulk_attr_t *,
+ int, dmu_tx_t *);
+int sa_replace_all_by_template_locked(sa_handle_t *, sa_bulk_attr_t *,
+ int, dmu_tx_t *);
+boolean_t sa_enabled(objset_t *);
+void sa_cache_init(void);
+void sa_cache_fini(void);
+int sa_set_sa_object(objset_t *, uint64_t);
+int sa_hdrsize(void *);
+void sa_handle_lock(sa_handle_t *);
+void sa_handle_unlock(sa_handle_t *);
+
+#ifdef _KERNEL
+int sa_lookup_uio(sa_handle_t *, sa_attr_type_t, uio_t *);
+#endif
+
+#ifdef __cplusplus
+}
+#endif
+
+#endif /* _SYS_SA_H */
diff --git a/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/sa_impl.h b/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/sa_impl.h
new file mode 100644
index 000000000000..50430125b253
--- /dev/null
+++ b/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/sa_impl.h
@@ -0,0 +1,291 @@
+/*
+ * CDDL HEADER START
+ *
+ * The contents of this file are subject to the terms of the
+ * Common Development and Distribution License (the "License").
+ * You may not use this file except in compliance with the License.
+ *
+ * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
+ * or http://www.opensolaris.org/os/licensing.
+ * See the License for the specific language governing permissions
+ * and limitations under the License.
+ *
+ * When distributing Covered Code, include this CDDL HEADER in each
+ * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
+ * If applicable, add the following below this CDDL HEADER, with the
+ * fields enclosed by brackets "[]" replaced with your own identifying
+ * information: Portions Copyright [yyyy] [name of copyright owner]
+ *
+ * CDDL HEADER END
+ */
+/*
+ * Copyright (c) 2010, Oracle and/or its affiliates. All rights reserved.
+ * Copyright (c) 2013 by Delphix. All rights reserved.
+ * Copyright (c) 2014 Spectra Logic Corporation, All rights reserved.
+ */
+
+#ifndef _SYS_SA_IMPL_H
+#define _SYS_SA_IMPL_H
+
+#include <sys/dmu.h>
+#include <sys/refcount.h>
+#include <sys/list.h>
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+/*
+ * Array of known attributes and their
+ * various characteristics.
+ */
+typedef struct sa_attr_table {
+ sa_attr_type_t sa_attr;
+ uint8_t sa_registered;
+ uint16_t sa_length;
+ sa_bswap_type_t sa_byteswap;
+ char *sa_name;
+} sa_attr_table_t;
+
+/*
+ * Zap attribute format for attribute registration
+ *
+ * 64 56 48 40 32 24 16 8 0
+ * +-------+-------+-------+-------+-------+-------+-------+-------+
+ * | unused | len | bswap | attr num |
+ * +-------+-------+-------+-------+-------+-------+-------+-------+
+ *
+ * Zap attribute format for layout information.
+ *
+ * layout information is stored as an array of attribute numbers
+ * The name of the attribute is the layout number (0, 1, 2, ...)
+ *
+ * 16 0
+ * +---- ---+
+ * | attr # |
+ * +--------+
+ * | attr # |
+ * +--- ----+
+ * ......
+ *
+ */
+
+#define ATTR_BSWAP(x) BF32_GET(x, 16, 8)
+#define ATTR_LENGTH(x) BF32_GET(x, 24, 16)
+#define ATTR_NUM(x) BF32_GET(x, 0, 16)
+#define ATTR_ENCODE(x, attr, length, bswap) \
+{ \
+ BF64_SET(x, 24, 16, length); \
+ BF64_SET(x, 16, 8, bswap); \
+ BF64_SET(x, 0, 16, attr); \
+}
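+
+/*
+ * Illustrative sketch (values chosen for the example only): encoding a
+ * registry entry for attribute number 5, fixed length 8, swapped as a
+ * uint64 array, then reading the fields back.
+ *
+ *	uint64_t zapval = 0;
+ *	ATTR_ENCODE(zapval, 5, 8, SA_UINT64_ARRAY);
+ *	ASSERT3U(ATTR_NUM(zapval), ==, 5);
+ *	ASSERT3U(ATTR_LENGTH(zapval), ==, 8);
+ */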
+
+#define TOC_OFF(x) BF32_GET(x, 0, 23)
+#define TOC_ATTR_PRESENT(x) BF32_GET(x, 31, 1)
+#define TOC_LEN_IDX(x) BF32_GET(x, 24, 4)
+#define TOC_ATTR_ENCODE(x, len_idx, offset) \
+{ \
+ BF32_SET(x, 31, 1, 1); \
+ BF32_SET(x, 24, 7, len_idx); \
+ BF32_SET(x, 0, 24, offset); \
+}
+
+#define SA_LAYOUTS "LAYOUTS"
+#define SA_REGISTRY "REGISTRY"
+
+/*
+ * Each unique layout has its own table,
+ * sa_lot (layout table).
+ */
+typedef struct sa_lot {
+ avl_node_t lot_num_node;
+ avl_node_t lot_hash_node;
+ uint64_t lot_num;
+ uint64_t lot_hash;
+ sa_attr_type_t *lot_attrs; /* array of attr #'s */
+ uint32_t lot_var_sizes; /* how many aren't fixed size */
+ uint32_t lot_attr_count; /* total attr count */
+ list_t lot_idx_tab; /* should be only a couple of entries */
+ int lot_instance; /* used with lot_hash to identify entry */
+} sa_lot_t;
+
+/* index table of offsets */
+typedef struct sa_idx_tab {
+ list_node_t sa_next;
+ sa_lot_t *sa_layout;
+ uint16_t *sa_variable_lengths;
+ zfs_refcount_t sa_refcount;
+ uint32_t *sa_idx_tab; /* array of offsets */
+} sa_idx_tab_t;
+
+/*
+ * Since the offset/index information into the actual data
+ * will usually be identical, we can share that information with
+ * all handles that have the exact same offsets.
+ *
+ * You would typically only have a large number of different tables of
+ * contents if you had several variable-sized attributes.
+ *
+ * Two AVL trees are used to track the attribute layout numbers.
+ * One is keyed by number and will be consulted when a DMU_OT_SA
+ * object is first read. The second tree is keyed by the hash signature
+ * of the attributes and will be consulted when an attribute is added
+ * to determine if we already have an instance of that layout. Both
+ * of these trees are interconnected. The only difference is that
+ * when an entry is found in the "hash" tree the list of attributes will
+ * need to be compared against the list of attributes you have in hand.
+ * The assumption is that typically attributes will just be updated and
+ * adding a completely new attribute is a very rare operation.
+ */
+struct sa_os {
+ kmutex_t sa_lock;
+ boolean_t sa_need_attr_registration;
+ boolean_t sa_force_spill;
+ uint64_t sa_master_obj;
+ uint64_t sa_reg_attr_obj;
+ uint64_t sa_layout_attr_obj;
+ int sa_num_attrs;
+ sa_attr_table_t *sa_attr_table; /* private attr table */
+ sa_update_cb_t *sa_update_cb;
+ avl_tree_t sa_layout_num_tree; /* keyed by layout number */
+ avl_tree_t sa_layout_hash_tree; /* keyed by layout hash value */
+ int sa_user_table_sz;
+ sa_attr_type_t *sa_user_table; /* user name->attr mapping table */
+};
+
+/*
+ * Header for all bonus and spill buffers.
+ *
+ * The header has a fixed portion, followed by a variable number
+ * of "lengths" entries, one per variable-sized attribute; how many
+ * there are is determined by the "layout number".
+ */
+
+#define SA_MAGIC 0x2F505A /* ZFS SA */
+typedef struct sa_hdr_phys {
+ uint32_t sa_magic;
+ /* BEGIN CSTYLED */
+	/*
+	 * Encoded with hdrsize and layout number as follows:
+	 * 16      10       0
+	 * +--------+-------+
+	 * | hdrsz  |layout |
+	 * +--------+-------+
+	 *
+	 * Bits 0-9 are the layout number.
+	 * Bits 10-15 are the size of the header.
+	 * The hdrsize is stored in units of 8 bytes.
+	 *
+	 * For example:
+	 * hdrsz of 1 ==> 8 byte header
+	 *          2 ==> 16 byte header
+	 *
+	 */
+ /* END CSTYLED */
+ uint16_t sa_layout_info;
+ uint16_t sa_lengths[1]; /* optional sizes for variable length attrs */
+ /* ... Data follows the lengths. */
+} sa_hdr_phys_t;
+
+#define SA_HDR_LAYOUT_NUM(hdr) BF32_GET(hdr->sa_layout_info, 0, 10)
+#define SA_HDR_SIZE(hdr) BF32_GET_SB(hdr->sa_layout_info, 10, 6, 3, 0)
+#define SA_HDR_LAYOUT_INFO_ENCODE(x, num, size) \
+{ \
+ BF32_SET_SB(x, 10, 6, 3, 0, size); \
+ BF32_SET(x, 0, 10, num); \
+}
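+
+/*
+ * Illustrative sketch (values chosen for the example only): encoding
+ * layout number 3 with a 16-byte header (the size is stored in 8-byte
+ * units, so 16 encodes as 2).
+ *
+ *	uint16_t info = 0;
+ *	SA_HDR_LAYOUT_INFO_ENCODE(info, 3, 16);
+ */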
+
+typedef enum sa_buf_type {
+ SA_BONUS = 1,
+ SA_SPILL = 2
+} sa_buf_type_t;
+
+typedef enum sa_data_op {
+ SA_LOOKUP,
+ SA_UPDATE,
+ SA_ADD,
+ SA_REPLACE,
+ SA_REMOVE
+} sa_data_op_t;
+
+/*
+ * Opaque handle used for most sa functions
+ *
+ * This needs to be kept as small as possible.
+ */
+
+struct sa_handle {
+ dmu_buf_user_t sa_dbu;
+ kmutex_t sa_lock;
+ dmu_buf_t *sa_bonus;
+ dmu_buf_t *sa_spill;
+ objset_t *sa_os;
+ void *sa_userp;
+ sa_idx_tab_t *sa_bonus_tab; /* idx of bonus */
+ sa_idx_tab_t *sa_spill_tab; /* only present if spill activated */
+};
+
+#define SA_GET_DB(hdl, type) \
+ (dmu_buf_impl_t *)((type == SA_BONUS) ? hdl->sa_bonus : hdl->sa_spill)
+
+#define SA_GET_HDR(hdl, type) \
+ ((sa_hdr_phys_t *)((dmu_buf_impl_t *)(SA_GET_DB(hdl, \
+ type))->db.db_data))
+
+#define SA_IDX_TAB_GET(hdl, type) \
+ (type == SA_BONUS ? hdl->sa_bonus_tab : hdl->sa_spill_tab)
+
+#define IS_SA_BONUSTYPE(a) \
+ ((a == DMU_OT_SA) ? B_TRUE : B_FALSE)
+
+#define SA_BONUSTYPE_FROM_DB(db) \
+ (dmu_get_bonustype((dmu_buf_t *)db))
+
+#define SA_BLKPTR_SPACE (DN_OLD_MAX_BONUSLEN - sizeof (blkptr_t))
+
+#define SA_LAYOUT_NUM(x, type) \
+ ((!IS_SA_BONUSTYPE(type) ? 0 : (((IS_SA_BONUSTYPE(type)) && \
+ ((SA_HDR_LAYOUT_NUM(x)) == 0)) ? 1 : SA_HDR_LAYOUT_NUM(x))))
+
+
+#define SA_REGISTERED_LEN(sa, attr) sa->sa_attr_table[attr].sa_length
+
+#define SA_ATTR_LEN(sa, idx, attr, hdr) ((SA_REGISTERED_LEN(sa, attr) == 0) ?\
+ hdr->sa_lengths[TOC_LEN_IDX(idx->sa_idx_tab[attr])] : \
+ SA_REGISTERED_LEN(sa, attr))
+
+#define SA_SET_HDR(hdr, num, size) \
+ { \
+ hdr->sa_magic = SA_MAGIC; \
+ SA_HDR_LAYOUT_INFO_ENCODE(hdr->sa_layout_info, num, size); \
+ }
+
+#define SA_ATTR_INFO(sa, idx, hdr, attr, bulk, type, hdl) \
+ { \
+ bulk.sa_size = SA_ATTR_LEN(sa, idx, attr, hdr); \
+ bulk.sa_buftype = type; \
+ bulk.sa_addr = \
+ (void *)((uintptr_t)TOC_OFF(idx->sa_idx_tab[attr]) + \
+ (uintptr_t)hdr); \
+}
+
+#define SA_HDR_SIZE_MATCH_LAYOUT(hdr, tb) \
+ (SA_HDR_SIZE(hdr) == (sizeof (sa_hdr_phys_t) + \
+ (tb->lot_var_sizes > 1 ? P2ROUNDUP((tb->lot_var_sizes - 1) * \
+ sizeof (uint16_t), 8) : 0)))
+
+int sa_add_impl(sa_handle_t *, sa_attr_type_t,
+ uint32_t, sa_data_locator_t, void *, dmu_tx_t *);
+
+void sa_register_update_callback_locked(objset_t *, sa_update_cb_t *);
+int sa_size_locked(sa_handle_t *, sa_attr_type_t, int *);
+
+void sa_default_locator(void **, uint32_t *, uint32_t, boolean_t, void *);
+int sa_attr_size(sa_os_t *, sa_idx_tab_t *, sa_attr_type_t,
+ uint16_t *, sa_hdr_phys_t *);
+
+#ifdef __cplusplus
+}
+#endif
+
+#endif /* _SYS_SA_IMPL_H */
diff --git a/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/spa.h b/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/spa.h
new file mode 100644
index 000000000000..5bdc4feb3d5e
--- /dev/null
+++ b/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/spa.h
@@ -0,0 +1,969 @@
+/*
+ * CDDL HEADER START
+ *
+ * The contents of this file are subject to the terms of the
+ * Common Development and Distribution License (the "License").
+ * You may not use this file except in compliance with the License.
+ *
+ * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
+ * or http://www.opensolaris.org/os/licensing.
+ * See the License for the specific language governing permissions
+ * and limitations under the License.
+ *
+ * When distributing Covered Code, include this CDDL HEADER in each
+ * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
+ * If applicable, add the following below this CDDL HEADER, with the
+ * fields enclosed by brackets "[]" replaced with your own identifying
+ * information: Portions Copyright [yyyy] [name of copyright owner]
+ *
+ * CDDL HEADER END
+ */
+/*
+ * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved.
+ * Copyright (c) 2011, 2018 by Delphix. All rights reserved.
+ * Copyright 2011 Nexenta Systems, Inc. All rights reserved.
+ * Copyright (c) 2014 Spectra Logic Corporation, All rights reserved.
+ * Copyright 2013 Saso Kiselkov. All rights reserved.
+ * Copyright (c) 2014 Integros [integros.com]
+ * Copyright 2017 Joyent, Inc.
+ * Copyright (c) 2017 Datto Inc.
+ * Copyright (c) 2017, Intel Corporation.
+ */
+
+#ifndef _SYS_SPA_H
+#define _SYS_SPA_H
+
+#include <sys/avl.h>
+#include <sys/zfs_context.h>
+#include <sys/nvpair.h>
+#include <sys/sysevent.h>
+#include <sys/sysmacros.h>
+#include <sys/types.h>
+#include <sys/fs/zfs.h>
+#include <sys/dmu.h>
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+/*
+ * Forward references that lots of things need.
+ */
+typedef struct spa spa_t;
+typedef struct vdev vdev_t;
+typedef struct metaslab metaslab_t;
+typedef struct metaslab_group metaslab_group_t;
+typedef struct metaslab_class metaslab_class_t;
+typedef struct zio zio_t;
+typedef struct zilog zilog_t;
+typedef struct spa_aux_vdev spa_aux_vdev_t;
+typedef struct ddt ddt_t;
+typedef struct ddt_entry ddt_entry_t;
+struct dsl_pool;
+struct dsl_dataset;
+
+/*
+ * General-purpose 32-bit and 64-bit bitfield encodings.
+ */
+#define BF32_DECODE(x, low, len) P2PHASE((x) >> (low), 1U << (len))
+#define BF64_DECODE(x, low, len) P2PHASE((x) >> (low), 1ULL << (len))
+#define BF32_ENCODE(x, low, len) (P2PHASE((x), 1U << (len)) << (low))
+#define BF64_ENCODE(x, low, len) (P2PHASE((x), 1ULL << (len)) << (low))
+
+#define BF32_GET(x, low, len) BF32_DECODE(x, low, len)
+#define BF64_GET(x, low, len) BF64_DECODE(x, low, len)
+
+#define BF32_SET(x, low, len, val) do { \
+ ASSERT3U(val, <, 1U << (len)); \
+ ASSERT3U(low + len, <=, 32); \
+ (x) ^= BF32_ENCODE((x >> low) ^ (val), low, len); \
+_NOTE(CONSTCOND) } while (0)
+
+#define BF64_SET(x, low, len, val) do { \
+ ASSERT3U(val, <, 1ULL << (len)); \
+ ASSERT3U(low + len, <=, 64); \
+ ((x) ^= BF64_ENCODE((x >> low) ^ (val), low, len)); \
+_NOTE(CONSTCOND) } while (0)
+
+#define BF32_GET_SB(x, low, len, shift, bias) \
+ ((BF32_GET(x, low, len) + (bias)) << (shift))
+#define BF64_GET_SB(x, low, len, shift, bias) \
+ ((BF64_GET(x, low, len) + (bias)) << (shift))
+
+#define BF32_SET_SB(x, low, len, shift, bias, val) do { \
+ ASSERT(IS_P2ALIGNED(val, 1U << shift)); \
+ ASSERT3S((val) >> (shift), >=, bias); \
+ BF32_SET(x, low, len, ((val) >> (shift)) - (bias)); \
+_NOTE(CONSTCOND) } while (0)
+#define BF64_SET_SB(x, low, len, shift, bias, val) do { \
+ ASSERT(IS_P2ALIGNED(val, 1ULL << shift)); \
+ ASSERT3S((val) >> (shift), >=, bias); \
+ BF64_SET(x, low, len, ((val) >> (shift)) - (bias)); \
+_NOTE(CONSTCOND) } while (0)
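+
+/*
+ * Illustrative sketch (values chosen for the example only): storing a
+ * 4-bit value at bit offset 8 of a 64-bit word and reading it back.
+ *
+ *	uint64_t word = 0;
+ *	BF64_SET(word, 8, 4, 0xA);
+ *	ASSERT3U(BF64_GET(word, 8, 4), ==, 0xA);
+ */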
+
+/*
+ * We currently support block sizes from 512 bytes to 16MB.
+ * The benefits of larger blocks, and thus larger IO, need to be weighed
+ * against the cost of COWing a giant block to modify one byte, and the
+ * large latency of reading or writing a large block.
+ *
+ * Note that although blocks up to 16MB are supported, the recordsize
+ * property can not be set larger than zfs_max_recordsize (default 1MB).
+ * See the comment near zfs_max_recordsize in dsl_dataset.c for details.
+ *
+ * Note that although the LSIZE field of the blkptr_t can store sizes up
+ * to 32MB, the dnode's dn_datablkszsec can only store sizes up to
+ * 32MB - 512 bytes. Therefore, we limit SPA_MAXBLOCKSIZE to 16MB.
+ */
+#define SPA_MINBLOCKSHIFT 9
+#define SPA_OLD_MAXBLOCKSHIFT 17
+#define SPA_MAXBLOCKSHIFT 24
+#define SPA_MINBLOCKSIZE (1ULL << SPA_MINBLOCKSHIFT)
+#define SPA_OLD_MAXBLOCKSIZE (1ULL << SPA_OLD_MAXBLOCKSHIFT)
+#define SPA_MAXBLOCKSIZE (1ULL << SPA_MAXBLOCKSHIFT)
+
+/*
+ * Default maximum supported logical ashift.
+ *
+ * The current 8k allocation block size limit is due to the 8k
+ * aligned/sized operations performed by vdev_probe() on
+ * vdev_label->vl_pad2. Using another "safe region" for these tests
+ * would allow the limit to be raised to 16k, at the expense of
+ * only having 8 available uberblocks in the label area.
+ */
+#define SPA_MAXASHIFT 13
+
+/*
+ * Default minimum supported logical ashift.
+ */
+#define SPA_MINASHIFT SPA_MINBLOCKSHIFT
+
+/*
+ * Size of block to hold the configuration data (a packed nvlist)
+ */
+#define SPA_CONFIG_BLOCKSIZE (1ULL << 14)
+
+/*
+ * The DVA size encodings for LSIZE and PSIZE support blocks up to 32MB.
+ * The ASIZE encoding should be at least 64 times larger (6 more bits)
+ * to support up to 4-way RAID-Z mirror mode with worst-case gang block
+ * overhead, three DVAs per bp, plus one more bit in case we do anything
+ * else that expands the ASIZE.
+ */
+#define SPA_LSIZEBITS 16 /* LSIZE up to 32M (2^16 * 512) */
+#define SPA_PSIZEBITS 16 /* PSIZE up to 32M (2^16 * 512) */
+#define SPA_ASIZEBITS 24 /* ASIZE up to 64 times larger */
+
+#define SPA_COMPRESSBITS 7
+#define SPA_VDEVBITS 24
+
+/*
+ * All SPA data is represented by 128-bit data virtual addresses (DVAs).
+ * The members of the dva_t should be considered opaque outside the SPA.
+ */
+typedef struct dva {
+ uint64_t dva_word[2];
+} dva_t;
+
+/*
+ * Each block has a 256-bit checksum -- strong enough for cryptographic hashes.
+ */
+typedef struct zio_cksum {
+ uint64_t zc_word[4];
+} zio_cksum_t;
+
+/*
+ * Some checksums/hashes need a 256-bit initialization salt. This salt is kept
+ * secret and is suitable for use in MAC algorithms as the key.
+ */
+typedef struct zio_cksum_salt {
+ uint8_t zcs_bytes[32];
+} zio_cksum_salt_t;
+
+/*
+ * Each block is described by its DVAs, time of birth, checksum, etc.
+ * The word-by-word, bit-by-bit layout of the blkptr is as follows:
+ *
+ * 64 56 48 40 32 24 16 8 0
+ * +-------+-------+-------+-------+-------+-------+-------+-------+
+ * 0 | pad | vdev1 | GRID | ASIZE |
+ * +-------+-------+-------+-------+-------+-------+-------+-------+
+ * 1 |G| offset1 |
+ * +-------+-------+-------+-------+-------+-------+-------+-------+
+ * 2 | pad | vdev2 | GRID | ASIZE |
+ * +-------+-------+-------+-------+-------+-------+-------+-------+
+ * 3 |G| offset2 |
+ * +-------+-------+-------+-------+-------+-------+-------+-------+
+ * 4 | pad | vdev3 | GRID | ASIZE |
+ * +-------+-------+-------+-------+-------+-------+-------+-------+
+ * 5 |G| offset3 |
+ * +-------+-------+-------+-------+-------+-------+-------+-------+
+ * 6 |BDX|lvl| type | cksum |E| comp| PSIZE | LSIZE |
+ * +-------+-------+-------+-------+-------+-------+-------+-------+
+ * 7 | padding |
+ * +-------+-------+-------+-------+-------+-------+-------+-------+
+ * 8 | padding |
+ * +-------+-------+-------+-------+-------+-------+-------+-------+
+ * 9 | physical birth txg |
+ * +-------+-------+-------+-------+-------+-------+-------+-------+
+ * a | logical birth txg |
+ * +-------+-------+-------+-------+-------+-------+-------+-------+
+ * b | fill count |
+ * +-------+-------+-------+-------+-------+-------+-------+-------+
+ * c | checksum[0] |
+ * +-------+-------+-------+-------+-------+-------+-------+-------+
+ * d | checksum[1] |
+ * +-------+-------+-------+-------+-------+-------+-------+-------+
+ * e | checksum[2] |
+ * +-------+-------+-------+-------+-------+-------+-------+-------+
+ * f | checksum[3] |
+ * +-------+-------+-------+-------+-------+-------+-------+-------+
+ *
+ * Legend:
+ *
+ * vdev virtual device ID
+ * offset offset into virtual device
+ * LSIZE logical size
+ * PSIZE physical size (after compression)
+ * ASIZE allocated size (including RAID-Z parity and gang block headers)
+ * GRID RAID-Z layout information (reserved for future use)
+ * cksum checksum function
+ * comp compression function
+ * G gang block indicator
+ * B byteorder (endianness)
+ * D dedup
+ * X encryption (on version 30, which is not supported)
+ * E blkptr_t contains embedded data (see below)
+ * lvl level of indirection
+ * type DMU object type
+ * phys birth txg when dva[0] was written; zero if same as logical birth txg
+ * note that typically all the dva's would be written in this
+ * txg, but they could be different if they were moved by
+ * device removal.
+ * log. birth transaction group in which the block was logically born
+ * fill count number of non-zero blocks under this bp
+ * checksum[4] 256-bit checksum of the data this bp describes
+ */
+
+/*
+ * "Embedded" blkptr_t's don't actually point to a block, instead they
+ * have a data payload embedded in the blkptr_t itself. See the comment
+ * in blkptr.c for more details.
+ *
+ * The blkptr_t is laid out as follows:
+ *
+ * 64 56 48 40 32 24 16 8 0
+ * +-------+-------+-------+-------+-------+-------+-------+-------+
+ * 0 | payload |
+ * 1 | payload |
+ * 2 | payload |
+ * 3 | payload |
+ * 4 | payload |
+ * 5 | payload |
+ * +-------+-------+-------+-------+-------+-------+-------+-------+
+ * 6 |BDX|lvl| type | etype |E| comp| PSIZE| LSIZE |
+ * +-------+-------+-------+-------+-------+-------+-------+-------+
+ * 7 | payload |
+ * 8 | payload |
+ * 9 | payload |
+ * +-------+-------+-------+-------+-------+-------+-------+-------+
+ * a | logical birth txg |
+ * +-------+-------+-------+-------+-------+-------+-------+-------+
+ * b | payload |
+ * c | payload |
+ * d | payload |
+ * e | payload |
+ * f | payload |
+ * +-------+-------+-------+-------+-------+-------+-------+-------+
+ *
+ * Legend:
+ *
+ * payload contains the embedded data
+ * B (byteorder) byteorder (endianness)
+ * D (dedup) padding (set to zero)
+ * X encryption (set to zero; see above)
+ * E (embedded) set to one
+ * lvl indirection level
+ * type DMU object type
+ * etype how to interpret embedded data (BP_EMBEDDED_TYPE_*)
+ * comp compression function of payload
+ * PSIZE size of payload after compression, in bytes
+ * LSIZE logical size of payload, in bytes
+ * note that 25 bits is enough to store the largest
+ * "normal" BP's LSIZE (2^16 * 2^9) in bytes
+ * log. birth transaction group in which the block was logically born
+ *
+ * Note that LSIZE and PSIZE are stored in bytes, whereas for non-embedded
+ * bp's they are stored in units of SPA_MINBLOCKSHIFT.
+ * Generally, the generic BP_GET_*() macros can be used on embedded BP's.
+ * The B, D, X, lvl, type, and comp fields are stored the same as with normal
+ * BP's so the BP_SET_* macros can be used with them. etype, PSIZE, LSIZE must
+ * be set with the BPE_SET_* macros. BP_SET_EMBEDDED() should be called before
+ * other macros, as they assert that they are only used on BP's of the correct
+ * "embedded-ness".
+ */
+
+#define BPE_GET_ETYPE(bp) \
+ (ASSERT(BP_IS_EMBEDDED(bp)), \
+ BF64_GET((bp)->blk_prop, 40, 8))
+#define BPE_SET_ETYPE(bp, t) do { \
+ ASSERT(BP_IS_EMBEDDED(bp)); \
+ BF64_SET((bp)->blk_prop, 40, 8, t); \
+_NOTE(CONSTCOND) } while (0)
+
+#define BPE_GET_LSIZE(bp) \
+ (ASSERT(BP_IS_EMBEDDED(bp)), \
+ BF64_GET_SB((bp)->blk_prop, 0, 25, 0, 1))
+#define BPE_SET_LSIZE(bp, x) do { \
+ ASSERT(BP_IS_EMBEDDED(bp)); \
+ BF64_SET_SB((bp)->blk_prop, 0, 25, 0, 1, x); \
+_NOTE(CONSTCOND) } while (0)
+
+#define BPE_GET_PSIZE(bp) \
+ (ASSERT(BP_IS_EMBEDDED(bp)), \
+ BF64_GET_SB((bp)->blk_prop, 25, 7, 0, 1))
+#define BPE_SET_PSIZE(bp, x) do { \
+ ASSERT(BP_IS_EMBEDDED(bp)); \
+ BF64_SET_SB((bp)->blk_prop, 25, 7, 0, 1, x); \
+_NOTE(CONSTCOND) } while (0)
+
+typedef enum bp_embedded_type {
+ BP_EMBEDDED_TYPE_DATA,
+ BP_EMBEDDED_TYPE_RESERVED, /* Reserved for an unintegrated feature. */
+ NUM_BP_EMBEDDED_TYPES = BP_EMBEDDED_TYPE_RESERVED
+} bp_embedded_type_t;
+
+#define BPE_NUM_WORDS 14
+#define BPE_PAYLOAD_SIZE (BPE_NUM_WORDS * sizeof (uint64_t))
+#define BPE_IS_PAYLOADWORD(bp, wp) \
+ ((wp) != &(bp)->blk_prop && (wp) != &(bp)->blk_birth)
+
+#define SPA_BLKPTRSHIFT 7 /* blkptr_t is 128 bytes */
+#define SPA_DVAS_PER_BP 3 /* Number of DVAs in a bp */
+#define SPA_SYNC_MIN_VDEVS 3 /* min vdevs to update during sync */
+
+/*
+ * A block is a hole when it either 1) has never been written to, or
+ * 2) is zero-filled. In both cases, ZFS can return all zeroes for all reads
+ * without physically allocating disk space. Holes are represented in the
+ * blkptr_t structure by zeroed blk_dva. Correct checking for holes is
+ * done through the BP_IS_HOLE macro. The logical size, level, DMU object
+ * type, and birth times are also stored for holes that were written to at
+ * some point (i.e. were punched after having been filled).
+ */
+typedef struct blkptr {
+ dva_t blk_dva[SPA_DVAS_PER_BP]; /* Data Virtual Addresses */
+ uint64_t blk_prop; /* size, compression, type, etc */
+ uint64_t blk_pad[2]; /* Extra space for the future */
+ uint64_t blk_phys_birth; /* txg when block was allocated */
+ uint64_t blk_birth; /* transaction group at birth */
+ uint64_t blk_fill; /* fill count */
+ zio_cksum_t blk_cksum; /* 256-bit checksum */
+} blkptr_t;
+
+/*
+ * Macros to get and set fields in a bp or DVA.
+ */
+#define DVA_GET_ASIZE(dva) \
+ BF64_GET_SB((dva)->dva_word[0], 0, SPA_ASIZEBITS, SPA_MINBLOCKSHIFT, 0)
+#define DVA_SET_ASIZE(dva, x) \
+ BF64_SET_SB((dva)->dva_word[0], 0, SPA_ASIZEBITS, \
+ SPA_MINBLOCKSHIFT, 0, x)
+
+#define DVA_GET_GRID(dva) BF64_GET((dva)->dva_word[0], 24, 8)
+#define DVA_SET_GRID(dva, x) BF64_SET((dva)->dva_word[0], 24, 8, x)
+
+#define DVA_GET_VDEV(dva) BF64_GET((dva)->dva_word[0], 32, SPA_VDEVBITS)
+#define DVA_SET_VDEV(dva, x) \
+ BF64_SET((dva)->dva_word[0], 32, SPA_VDEVBITS, x)
+
+#define DVA_GET_OFFSET(dva) \
+ BF64_GET_SB((dva)->dva_word[1], 0, 63, SPA_MINBLOCKSHIFT, 0)
+#define DVA_SET_OFFSET(dva, x) \
+ BF64_SET_SB((dva)->dva_word[1], 0, 63, SPA_MINBLOCKSHIFT, 0, x)
+
+#define DVA_GET_GANG(dva) BF64_GET((dva)->dva_word[1], 63, 1)
+#define DVA_SET_GANG(dva, x) BF64_SET((dva)->dva_word[1], 63, 1, x)
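+
+/*
+ * Illustrative sketch (not part of the original interface): decoding
+ * the identity DVA of a non-embedded block pointer (BP_IDENTITY() is
+ * defined below).
+ *
+ *	const dva_t *dva = BP_IDENTITY(bp);
+ *	uint64_t vdev = DVA_GET_VDEV(dva);
+ *	uint64_t offset = DVA_GET_OFFSET(dva);
+ *	uint64_t asize = DVA_GET_ASIZE(dva);
+ */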
+
+#define BP_GET_LSIZE(bp) \
+ (BP_IS_EMBEDDED(bp) ? \
+ (BPE_GET_ETYPE(bp) == BP_EMBEDDED_TYPE_DATA ? BPE_GET_LSIZE(bp) : 0): \
+ BF64_GET_SB((bp)->blk_prop, 0, SPA_LSIZEBITS, SPA_MINBLOCKSHIFT, 1))
+#define BP_SET_LSIZE(bp, x) do { \
+ ASSERT(!BP_IS_EMBEDDED(bp)); \
+ BF64_SET_SB((bp)->blk_prop, \
+ 0, SPA_LSIZEBITS, SPA_MINBLOCKSHIFT, 1, x); \
+_NOTE(CONSTCOND) } while (0)
+
+#define BP_GET_PSIZE(bp) \
+ (BP_IS_EMBEDDED(bp) ? 0 : \
+ BF64_GET_SB((bp)->blk_prop, 16, SPA_PSIZEBITS, SPA_MINBLOCKSHIFT, 1))
+#define BP_SET_PSIZE(bp, x) do { \
+ ASSERT(!BP_IS_EMBEDDED(bp)); \
+ BF64_SET_SB((bp)->blk_prop, \
+ 16, SPA_PSIZEBITS, SPA_MINBLOCKSHIFT, 1, x); \
+_NOTE(CONSTCOND) } while (0)
+
+#define BP_GET_COMPRESS(bp) \
+ BF64_GET((bp)->blk_prop, 32, SPA_COMPRESSBITS)
+#define BP_SET_COMPRESS(bp, x) \
+ BF64_SET((bp)->blk_prop, 32, SPA_COMPRESSBITS, x)
+
+#define BP_IS_EMBEDDED(bp) BF64_GET((bp)->blk_prop, 39, 1)
+#define BP_SET_EMBEDDED(bp, x) BF64_SET((bp)->blk_prop, 39, 1, x)
+
+#define BP_GET_CHECKSUM(bp) \
+ (BP_IS_EMBEDDED(bp) ? ZIO_CHECKSUM_OFF : \
+ BF64_GET((bp)->blk_prop, 40, 8))
+#define BP_SET_CHECKSUM(bp, x) do { \
+ ASSERT(!BP_IS_EMBEDDED(bp)); \
+ BF64_SET((bp)->blk_prop, 40, 8, x); \
+_NOTE(CONSTCOND) } while (0)
+
+#define BP_GET_TYPE(bp) BF64_GET((bp)->blk_prop, 48, 8)
+#define BP_SET_TYPE(bp, x) BF64_SET((bp)->blk_prop, 48, 8, x)
+
+#define BP_GET_LEVEL(bp) BF64_GET((bp)->blk_prop, 56, 5)
+#define BP_SET_LEVEL(bp, x) BF64_SET((bp)->blk_prop, 56, 5, x)
+
+#define BP_GET_DEDUP(bp) BF64_GET((bp)->blk_prop, 62, 1)
+#define BP_SET_DEDUP(bp, x) BF64_SET((bp)->blk_prop, 62, 1, x)
+
+#define BP_GET_BYTEORDER(bp) BF64_GET((bp)->blk_prop, 63, 1)
+#define BP_SET_BYTEORDER(bp, x) BF64_SET((bp)->blk_prop, 63, 1, x)
+
+#define BP_PHYSICAL_BIRTH(bp) \
+ (BP_IS_EMBEDDED(bp) ? 0 : \
+ (bp)->blk_phys_birth ? (bp)->blk_phys_birth : (bp)->blk_birth)
+
+#define BP_SET_BIRTH(bp, logical, physical) \
+{ \
+ ASSERT(!BP_IS_EMBEDDED(bp)); \
+ (bp)->blk_birth = (logical); \
+ (bp)->blk_phys_birth = ((logical) == (physical) ? 0 : (physical)); \
+}
+
+#define BP_GET_FILL(bp) (BP_IS_EMBEDDED(bp) ? 1 : (bp)->blk_fill)
+
+#define BP_IS_METADATA(bp) \
+ (BP_GET_LEVEL(bp) > 0 || DMU_OT_IS_METADATA(BP_GET_TYPE(bp)))
+
+#define BP_GET_ASIZE(bp) \
+ (BP_IS_EMBEDDED(bp) ? 0 : \
+ DVA_GET_ASIZE(&(bp)->blk_dva[0]) + \
+ DVA_GET_ASIZE(&(bp)->blk_dva[1]) + \
+ DVA_GET_ASIZE(&(bp)->blk_dva[2]))
+
+#define BP_GET_UCSIZE(bp) \
+ (BP_IS_METADATA(bp) ? BP_GET_PSIZE(bp) : BP_GET_LSIZE(bp))
+
+#define BP_GET_NDVAS(bp) \
+ (BP_IS_EMBEDDED(bp) ? 0 : \
+ !!DVA_GET_ASIZE(&(bp)->blk_dva[0]) + \
+ !!DVA_GET_ASIZE(&(bp)->blk_dva[1]) + \
+ !!DVA_GET_ASIZE(&(bp)->blk_dva[2]))
+
+#define BP_COUNT_GANG(bp) \
+ (BP_IS_EMBEDDED(bp) ? 0 : \
+ (DVA_GET_GANG(&(bp)->blk_dva[0]) + \
+ DVA_GET_GANG(&(bp)->blk_dva[1]) + \
+ DVA_GET_GANG(&(bp)->blk_dva[2])))
+
+#define DVA_EQUAL(dva1, dva2) \
+ ((dva1)->dva_word[1] == (dva2)->dva_word[1] && \
+ (dva1)->dva_word[0] == (dva2)->dva_word[0])
+
+#define BP_EQUAL(bp1, bp2) \
+ (BP_PHYSICAL_BIRTH(bp1) == BP_PHYSICAL_BIRTH(bp2) && \
+ (bp1)->blk_birth == (bp2)->blk_birth && \
+ DVA_EQUAL(&(bp1)->blk_dva[0], &(bp2)->blk_dva[0]) && \
+ DVA_EQUAL(&(bp1)->blk_dva[1], &(bp2)->blk_dva[1]) && \
+ DVA_EQUAL(&(bp1)->blk_dva[2], &(bp2)->blk_dva[2]))
+
+#define ZIO_CHECKSUM_EQUAL(zc1, zc2) \
+ (0 == (((zc1).zc_word[0] - (zc2).zc_word[0]) | \
+ ((zc1).zc_word[1] - (zc2).zc_word[1]) | \
+ ((zc1).zc_word[2] - (zc2).zc_word[2]) | \
+ ((zc1).zc_word[3] - (zc2).zc_word[3])))
+
+#define ZIO_CHECKSUM_IS_ZERO(zc) \
+ (0 == ((zc)->zc_word[0] | (zc)->zc_word[1] | \
+ (zc)->zc_word[2] | (zc)->zc_word[3]))
+
+#define ZIO_CHECKSUM_BSWAP(zcp) \
+{ \
+ (zcp)->zc_word[0] = BSWAP_64((zcp)->zc_word[0]); \
+ (zcp)->zc_word[1] = BSWAP_64((zcp)->zc_word[1]); \
+ (zcp)->zc_word[2] = BSWAP_64((zcp)->zc_word[2]); \
+ (zcp)->zc_word[3] = BSWAP_64((zcp)->zc_word[3]); \
+}
+
+
+#define DVA_IS_VALID(dva) (DVA_GET_ASIZE(dva) != 0)
+
+#define ZIO_SET_CHECKSUM(zcp, w0, w1, w2, w3) \
+{ \
+ (zcp)->zc_word[0] = w0; \
+ (zcp)->zc_word[1] = w1; \
+ (zcp)->zc_word[2] = w2; \
+ (zcp)->zc_word[3] = w3; \
+}
+
+#define BP_IDENTITY(bp) (ASSERT(!BP_IS_EMBEDDED(bp)), &(bp)->blk_dva[0])
+#define BP_IS_GANG(bp) \
+ (BP_IS_EMBEDDED(bp) ? B_FALSE : DVA_GET_GANG(BP_IDENTITY(bp)))
+#define DVA_IS_EMPTY(dva) ((dva)->dva_word[0] == 0ULL && \
+ (dva)->dva_word[1] == 0ULL)
+#define BP_IS_HOLE(bp) \
+ (!BP_IS_EMBEDDED(bp) && DVA_IS_EMPTY(BP_IDENTITY(bp)))
+
+/* BP_IS_RAIDZ(bp) assumes no block compression */
+#define BP_IS_RAIDZ(bp) (DVA_GET_ASIZE(&(bp)->blk_dva[0]) > \
+ BP_GET_PSIZE(bp))
+
+#define BP_ZERO(bp) \
+{ \
+ (bp)->blk_dva[0].dva_word[0] = 0; \
+ (bp)->blk_dva[0].dva_word[1] = 0; \
+ (bp)->blk_dva[1].dva_word[0] = 0; \
+ (bp)->blk_dva[1].dva_word[1] = 0; \
+ (bp)->blk_dva[2].dva_word[0] = 0; \
+ (bp)->blk_dva[2].dva_word[1] = 0; \
+ (bp)->blk_prop = 0; \
+ (bp)->blk_pad[0] = 0; \
+ (bp)->blk_pad[1] = 0; \
+ (bp)->blk_phys_birth = 0; \
+ (bp)->blk_birth = 0; \
+ (bp)->blk_fill = 0; \
+ ZIO_SET_CHECKSUM(&(bp)->blk_cksum, 0, 0, 0, 0); \
+}
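+
+/*
+ * Illustrative sketch: a freshly zeroed block pointer reads back as a
+ * hole, since its DVAs are empty and the embedded bit is clear.
+ *
+ *	blkptr_t bp;
+ *	BP_ZERO(&bp);
+ *	ASSERT(BP_IS_HOLE(&bp));
+ */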
+
+#if BYTE_ORDER == _BIG_ENDIAN
+#define ZFS_HOST_BYTEORDER (0ULL)
+#else
+#define ZFS_HOST_BYTEORDER (1ULL)
+#endif
+
+#define BP_SHOULD_BYTESWAP(bp) (BP_GET_BYTEORDER(bp) != ZFS_HOST_BYTEORDER)
+
+#define BP_SPRINTF_LEN 320
+
+/*
+ * This macro allows code sharing between zfs, libzpool, and mdb.
+ * 'func' is either snprintf() or mdb_snprintf().
+ * 'ws' (whitespace) can be ' ' for single-line format, '\n' for multi-line.
+ */
+#define SNPRINTF_BLKPTR(func, ws, buf, size, bp, type, checksum, compress) \
+{ \
+ static const char *copyname[] = \
+ { "zero", "single", "double", "triple" }; \
+ int len = 0; \
+ int copies = 0; \
+ \
+ if (bp == NULL) { \
+ len += func(buf + len, size - len, "<NULL>"); \
+ } else if (BP_IS_HOLE(bp)) { \
+ len += func(buf + len, size - len, \
+ "HOLE [L%llu %s] " \
+ "size=%llxL birth=%lluL", \
+ (u_longlong_t)BP_GET_LEVEL(bp), \
+ type, \
+ (u_longlong_t)BP_GET_LSIZE(bp), \
+ (u_longlong_t)bp->blk_birth); \
+ } else if (BP_IS_EMBEDDED(bp)) { \
+ len = func(buf + len, size - len, \
+ "EMBEDDED [L%llu %s] et=%u %s " \
+ "size=%llxL/%llxP birth=%lluL", \
+ (u_longlong_t)BP_GET_LEVEL(bp), \
+ type, \
+ (int)BPE_GET_ETYPE(bp), \
+ compress, \
+ (u_longlong_t)BPE_GET_LSIZE(bp), \
+ (u_longlong_t)BPE_GET_PSIZE(bp), \
+ (u_longlong_t)bp->blk_birth); \
+ } else { \
+ for (int d = 0; d < BP_GET_NDVAS(bp); d++) { \
+ const dva_t *dva = &bp->blk_dva[d]; \
+ if (DVA_IS_VALID(dva)) \
+ copies++; \
+ len += func(buf + len, size - len, \
+ "DVA[%d]=<%llu:%llx:%llx>%c", d, \
+ (u_longlong_t)DVA_GET_VDEV(dva), \
+ (u_longlong_t)DVA_GET_OFFSET(dva), \
+ (u_longlong_t)DVA_GET_ASIZE(dva), \
+ ws); \
+ } \
+ if (BP_IS_GANG(bp) && \
+ DVA_GET_ASIZE(&bp->blk_dva[2]) <= \
+ DVA_GET_ASIZE(&bp->blk_dva[1]) / 2) \
+ copies--; \
+ len += func(buf + len, size - len, \
+ "[L%llu %s] %s %s %s %s %s %s%c" \
+ "size=%llxL/%llxP birth=%lluL/%lluP fill=%llu%c" \
+ "cksum=%llx:%llx:%llx:%llx", \
+ (u_longlong_t)BP_GET_LEVEL(bp), \
+ type, \
+ checksum, \
+ compress, \
+ BP_GET_BYTEORDER(bp) == 0 ? "BE" : "LE", \
+ BP_IS_GANG(bp) ? "gang" : "contiguous", \
+ BP_GET_DEDUP(bp) ? "dedup" : "unique", \
+ copyname[copies], \
+ ws, \
+ (u_longlong_t)BP_GET_LSIZE(bp), \
+ (u_longlong_t)BP_GET_PSIZE(bp), \
+ (u_longlong_t)bp->blk_birth, \
+ (u_longlong_t)BP_PHYSICAL_BIRTH(bp), \
+ (u_longlong_t)BP_GET_FILL(bp), \
+ ws, \
+ (u_longlong_t)bp->blk_cksum.zc_word[0], \
+ (u_longlong_t)bp->blk_cksum.zc_word[1], \
+ (u_longlong_t)bp->blk_cksum.zc_word[2], \
+ (u_longlong_t)bp->blk_cksum.zc_word[3]); \
+ } \
+ ASSERT(len < size); \
+}
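+
+/*
+ * Illustrative sketch: in-kernel callers typically use the
+ * snprintf_blkptr() helper (declared below) rather than this macro
+ * directly.
+ *
+ *	char buf[BP_SPRINTF_LEN];
+ *	snprintf_blkptr(buf, sizeof (buf), bp);
+ */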
+
+#define BP_GET_BUFC_TYPE(bp) \
+ (BP_IS_METADATA(bp) ? ARC_BUFC_METADATA : ARC_BUFC_DATA)
+
+typedef enum spa_import_type {
+ SPA_IMPORT_EXISTING,
+ SPA_IMPORT_ASSEMBLE
+} spa_import_type_t;
+
+/* state manipulation functions */
+extern int spa_open(const char *pool, spa_t **, void *tag);
+extern int spa_open_rewind(const char *pool, spa_t **, void *tag,
+ nvlist_t *policy, nvlist_t **config);
+extern int spa_get_stats(const char *pool, nvlist_t **config, char *altroot,
+ size_t buflen);
+extern int spa_create(const char *pool, nvlist_t *config, nvlist_t *props,
+ nvlist_t *zplprops);
+#ifdef illumos
+extern int spa_import_rootpool(char *devpath, char *devid);
+#else
+extern int spa_import_rootpool(const char *name, bool checkpointrewind);
+#endif
+extern int spa_import(const char *pool, nvlist_t *config, nvlist_t *props,
+ uint64_t flags);
+extern nvlist_t *spa_tryimport(nvlist_t *tryconfig);
+extern int spa_destroy(char *pool);
+extern int spa_checkpoint(const char *pool);
+extern int spa_checkpoint_discard(const char *pool);
+extern int spa_export(char *pool, nvlist_t **oldconfig, boolean_t force,
+ boolean_t hardforce);
+extern int spa_reset(char *pool);
+extern void spa_async_request(spa_t *spa, int flag);
+extern void spa_async_unrequest(spa_t *spa, int flag);
+extern void spa_async_suspend(spa_t *spa);
+extern void spa_async_resume(spa_t *spa);
+extern spa_t *spa_inject_addref(char *pool);
+extern void spa_inject_delref(spa_t *spa);
+extern void spa_scan_stat_init(spa_t *spa);
+extern int spa_scan_get_stats(spa_t *spa, pool_scan_stat_t *ps);
+
+#define SPA_ASYNC_CONFIG_UPDATE 0x01
+#define SPA_ASYNC_REMOVE 0x02
+#define SPA_ASYNC_PROBE 0x04
+#define SPA_ASYNC_RESILVER_DONE 0x08
+#define SPA_ASYNC_RESILVER 0x10
+#define SPA_ASYNC_AUTOEXPAND 0x20
+#define SPA_ASYNC_REMOVE_DONE 0x40
+#define SPA_ASYNC_REMOVE_STOP 0x80
+#define SPA_ASYNC_INITIALIZE_RESTART 0x100
+
+/*
+ * Controls the behavior of spa_vdev_remove().
+ */
+#define SPA_REMOVE_UNSPARE 0x01
+#define SPA_REMOVE_DONE 0x02
+
+/* device manipulation */
+extern int spa_vdev_add(spa_t *spa, nvlist_t *nvroot);
+extern int spa_vdev_attach(spa_t *spa, uint64_t guid, nvlist_t *nvroot,
+ int replacing);
+extern int spa_vdev_detach(spa_t *spa, uint64_t guid, uint64_t pguid,
+ int replace_done);
+extern int spa_vdev_remove(spa_t *spa, uint64_t guid, boolean_t unspare);
+extern boolean_t spa_vdev_remove_active(spa_t *spa);
+extern int spa_vdev_initialize(spa_t *spa, uint64_t guid, uint64_t cmd_type);
+extern int spa_vdev_setpath(spa_t *spa, uint64_t guid, const char *newpath);
+extern int spa_vdev_setfru(spa_t *spa, uint64_t guid, const char *newfru);
+extern int spa_vdev_split_mirror(spa_t *spa, char *newname, nvlist_t *config,
+ nvlist_t *props, boolean_t exp);
+
+/* spare state (which is global across all pools) */
+extern void spa_spare_add(vdev_t *vd);
+extern void spa_spare_remove(vdev_t *vd);
+extern boolean_t spa_spare_exists(uint64_t guid, uint64_t *pool, int *refcnt);
+extern void spa_spare_activate(vdev_t *vd);
+
+/* L2ARC state (which is global across all pools) */
+extern void spa_l2cache_add(vdev_t *vd);
+extern void spa_l2cache_remove(vdev_t *vd);
+extern boolean_t spa_l2cache_exists(uint64_t guid, uint64_t *pool);
+extern void spa_l2cache_activate(vdev_t *vd);
+extern void spa_l2cache_drop(spa_t *spa);
+
+/* scanning */
+extern int spa_scan(spa_t *spa, pool_scan_func_t func);
+extern int spa_scan_stop(spa_t *spa);
+extern int spa_scrub_pause_resume(spa_t *spa, pool_scrub_cmd_t flag);
+
+/* spa syncing */
+extern void spa_sync(spa_t *spa, uint64_t txg); /* only for DMU use */
+extern void spa_sync_allpools(void);
+
+/* spa namespace global mutex */
+extern kmutex_t spa_namespace_lock;
+
+/*
+ * SPA configuration functions in spa_config.c
+ */
+
+#define SPA_CONFIG_UPDATE_POOL 0
+#define SPA_CONFIG_UPDATE_VDEVS 1
+
+extern void spa_write_cachefile(spa_t *, boolean_t, boolean_t);
+extern void spa_config_load(void);
+extern nvlist_t *spa_all_configs(uint64_t *);
+extern void spa_config_set(spa_t *spa, nvlist_t *config);
+extern nvlist_t *spa_config_generate(spa_t *spa, vdev_t *vd, uint64_t txg,
+ int getstats);
+extern void spa_config_update(spa_t *spa, int what);
+
+/*
+ * Miscellaneous SPA routines in spa_misc.c
+ */
+
+/* Namespace manipulation */
+extern spa_t *spa_lookup(const char *name);
+extern spa_t *spa_add(const char *name, nvlist_t *config, const char *altroot);
+extern void spa_remove(spa_t *spa);
+extern spa_t *spa_next(spa_t *prev);
+
+/* Refcount functions */
+extern void spa_open_ref(spa_t *spa, void *tag);
+extern void spa_close(spa_t *spa, void *tag);
+extern void spa_async_close(spa_t *spa, void *tag);
+extern boolean_t spa_refcount_zero(spa_t *spa);
+
+#define SCL_NONE 0x00
+#define SCL_CONFIG 0x01
+#define SCL_STATE 0x02
+#define SCL_L2ARC 0x04 /* hack until L2ARC 2.0 */
+#define SCL_ALLOC 0x08
+#define SCL_ZIO 0x10
+#define SCL_FREE 0x20
+#define SCL_VDEV 0x40
+#define SCL_LOCKS 7
+#define SCL_ALL ((1 << SCL_LOCKS) - 1)
+#define SCL_STATE_ALL (SCL_STATE | SCL_L2ARC | SCL_ZIO)
+
+/* Pool configuration locks */
+extern int spa_config_tryenter(spa_t *spa, int locks, void *tag, krw_t rw);
+extern void spa_config_enter(spa_t *spa, int locks, void *tag, krw_t rw);
+extern void spa_config_exit(spa_t *spa, int locks, void *tag);
+extern int spa_config_held(spa_t *spa, int locks, krw_t rw);
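+
+/*
+ * Illustrative sketch (FTAG comes from sys/refcount.h): the common
+ * pattern for reading vdev state under the config lock.
+ *
+ *	spa_config_enter(spa, SCL_VDEV, FTAG, RW_READER);
+ *	...
+ *	spa_config_exit(spa, SCL_VDEV, FTAG);
+ */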
+
+/* Pool vdev add/remove lock */
+extern uint64_t spa_vdev_enter(spa_t *spa);
+extern uint64_t spa_vdev_config_enter(spa_t *spa);
+extern void spa_vdev_config_exit(spa_t *spa, vdev_t *vd, uint64_t txg,
+ int error, char *tag);
+extern int spa_vdev_exit(spa_t *spa, vdev_t *vd, uint64_t txg, int error);
+
+/* Pool vdev state change lock */
+extern void spa_vdev_state_enter(spa_t *spa, int oplock);
+extern int spa_vdev_state_exit(spa_t *spa, vdev_t *vd, int error);
+
+/* Log state */
+typedef enum spa_log_state {
+ SPA_LOG_UNKNOWN = 0, /* unknown log state */
+ SPA_LOG_MISSING, /* missing log(s) */
+ SPA_LOG_CLEAR, /* clear the log(s) */
+ SPA_LOG_GOOD, /* log(s) are good */
+} spa_log_state_t;
+
+extern spa_log_state_t spa_get_log_state(spa_t *spa);
+extern void spa_set_log_state(spa_t *spa, spa_log_state_t state);
+extern int spa_reset_logs(spa_t *spa);
+
+/* Log claim callback */
+extern void spa_claim_notify(zio_t *zio);
+
+/* Accessor functions */
+extern boolean_t spa_shutting_down(spa_t *spa);
+extern struct dsl_pool *spa_get_dsl(spa_t *spa);
+extern boolean_t spa_is_initializing(spa_t *spa);
+extern boolean_t spa_indirect_vdevs_loaded(spa_t *spa);
+extern blkptr_t *spa_get_rootblkptr(spa_t *spa);
+extern void spa_set_rootblkptr(spa_t *spa, const blkptr_t *bp);
+extern void spa_altroot(spa_t *, char *, size_t);
+extern int spa_sync_pass(spa_t *spa);
+extern char *spa_name(spa_t *spa);
+extern uint64_t spa_guid(spa_t *spa);
+extern uint64_t spa_load_guid(spa_t *spa);
+extern uint64_t spa_last_synced_txg(spa_t *spa);
+extern uint64_t spa_first_txg(spa_t *spa);
+extern uint64_t spa_syncing_txg(spa_t *spa);
+extern uint64_t spa_final_dirty_txg(spa_t *spa);
+extern uint64_t spa_version(spa_t *spa);
+extern pool_state_t spa_state(spa_t *spa);
+extern spa_load_state_t spa_load_state(spa_t *spa);
+extern uint64_t spa_freeze_txg(spa_t *spa);
+extern uint64_t spa_get_worst_case_asize(spa_t *spa, uint64_t lsize);
+extern uint64_t spa_get_dspace(spa_t *spa);
+extern uint64_t spa_get_checkpoint_space(spa_t *spa);
+extern uint64_t spa_get_slop_space(spa_t *spa);
+extern void spa_update_dspace(spa_t *spa);
+extern boolean_t spa_deflate(spa_t *spa);
+extern metaslab_class_t *spa_normal_class(spa_t *spa);
+extern metaslab_class_t *spa_log_class(spa_t *spa);
+extern metaslab_class_t *spa_special_class(spa_t *spa);
+extern metaslab_class_t *spa_dedup_class(spa_t *spa);
+extern metaslab_class_t *spa_preferred_class(spa_t *spa, uint64_t size,
+ dmu_object_type_t objtype, uint_t level, uint_t special_smallblk);
+
+extern void spa_evicting_os_register(spa_t *, objset_t *os);
+extern void spa_evicting_os_deregister(spa_t *, objset_t *os);
+extern void spa_evicting_os_wait(spa_t *spa);
+extern int spa_max_replication(spa_t *spa);
+extern int spa_prev_software_version(spa_t *spa);
+extern int spa_busy(void);
+extern uint8_t spa_get_failmode(spa_t *spa);
+extern boolean_t spa_suspended(spa_t *spa);
+extern uint64_t spa_bootfs(spa_t *spa);
+extern uint64_t spa_delegation(spa_t *spa);
+extern objset_t *spa_meta_objset(spa_t *spa);
+extern uint64_t spa_deadman_synctime(spa_t *spa);
+extern struct proc *spa_proc(spa_t *spa);
+extern uint64_t spa_dirty_data(spa_t *spa);
+
+/* Miscellaneous support routines */
+extern void spa_load_failed(spa_t *spa, const char *fmt, ...);
+extern void spa_load_note(spa_t *spa, const char *fmt, ...);
+extern void spa_activate_mos_feature(spa_t *spa, const char *feature,
+ dmu_tx_t *tx);
+extern void spa_deactivate_mos_feature(spa_t *spa, const char *feature);
+extern spa_t *spa_by_guid(uint64_t pool_guid, uint64_t device_guid);
+extern boolean_t spa_guid_exists(uint64_t pool_guid, uint64_t device_guid);
+extern char *spa_strdup(const char *);
+extern void spa_strfree(char *);
+extern uint64_t spa_get_random(uint64_t range);
+extern uint64_t spa_generate_guid(spa_t *spa);
+extern void snprintf_blkptr(char *buf, size_t buflen, const blkptr_t *bp);
+extern void spa_freeze(spa_t *spa);
+extern int spa_change_guid(spa_t *spa);
+extern void spa_upgrade(spa_t *spa, uint64_t version);
+extern void spa_evict_all(void);
+extern vdev_t *spa_lookup_by_guid(spa_t *spa, uint64_t guid,
+ boolean_t l2cache);
+extern boolean_t spa_has_spare(spa_t *, uint64_t guid);
+extern uint64_t dva_get_dsize_sync(spa_t *spa, const dva_t *dva);
+extern uint64_t bp_get_dsize_sync(spa_t *spa, const blkptr_t *bp);
+extern uint64_t bp_get_dsize(spa_t *spa, const blkptr_t *bp);
+extern boolean_t spa_has_slogs(spa_t *spa);
+extern boolean_t spa_is_root(spa_t *spa);
+extern boolean_t spa_writeable(spa_t *spa);
+extern boolean_t spa_has_pending_synctask(spa_t *spa);
+extern int spa_maxblocksize(spa_t *spa);
+extern int spa_maxdnodesize(spa_t *spa);
+extern boolean_t spa_multihost(spa_t *spa);
+extern unsigned long spa_get_hostid(void);
+extern boolean_t spa_has_checkpoint(spa_t *spa);
+extern boolean_t spa_importing_readonly_checkpoint(spa_t *spa);
+extern boolean_t spa_suspend_async_destroy(spa_t *spa);
+extern uint64_t spa_min_claim_txg(spa_t *spa);
+extern void zfs_blkptr_verify(spa_t *spa, const blkptr_t *bp);
+extern boolean_t zfs_dva_valid(spa_t *spa, const dva_t *dva,
+ const blkptr_t *bp);
+typedef void (*spa_remap_cb_t)(uint64_t vdev, uint64_t offset, uint64_t size,
+ void *arg);
+extern boolean_t spa_remap_blkptr(spa_t *spa, blkptr_t *bp,
+ spa_remap_cb_t callback, void *arg);
+extern uint64_t spa_get_last_removal_txg(spa_t *spa);
+extern boolean_t spa_trust_config(spa_t *spa);
+extern uint64_t spa_missing_tvds_allowed(spa_t *spa);
+extern void spa_set_missing_tvds(spa_t *spa, uint64_t missing);
+extern boolean_t spa_top_vdevs_spacemap_addressable(spa_t *spa);
+extern void spa_activate_allocation_classes(spa_t *, dmu_tx_t *);
+
+extern int spa_mode(spa_t *spa);
+extern uint64_t zfs_strtonum(const char *str, char **nptr);
+
+extern char *spa_his_ievent_table[];
+
+extern void spa_history_create_obj(spa_t *spa, dmu_tx_t *tx);
+extern int spa_history_get(spa_t *spa, uint64_t *offset, uint64_t *len_read,
+ char *his_buf);
+extern int spa_history_log(spa_t *spa, const char *his_buf);
+extern int spa_history_log_nvl(spa_t *spa, nvlist_t *nvl);
+extern void spa_history_log_version(spa_t *spa, const char *operation);
+extern void spa_history_log_internal(spa_t *spa, const char *operation,
+ dmu_tx_t *tx, const char *fmt, ...);
+extern void spa_history_log_internal_ds(struct dsl_dataset *ds, const char *op,
+ dmu_tx_t *tx, const char *fmt, ...);
+extern void spa_history_log_internal_dd(dsl_dir_t *dd, const char *operation,
+ dmu_tx_t *tx, const char *fmt, ...);
+
+/* error handling */
+struct zbookmark_phys;
+extern void spa_log_error(spa_t *spa, zio_t *zio);
+extern void zfs_ereport_post(const char *cls, spa_t *spa, vdev_t *vd,
+ zio_t *zio, uint64_t stateoroffset, uint64_t length);
+extern void zfs_post_remove(spa_t *spa, vdev_t *vd);
+extern void zfs_post_state_change(spa_t *spa, vdev_t *vd);
+extern void zfs_post_autoreplace(spa_t *spa, vdev_t *vd);
+extern uint64_t spa_get_errlog_size(spa_t *spa);
+extern int spa_get_errlog(spa_t *spa, void *uaddr, size_t *count);
+extern void spa_errlog_rotate(spa_t *spa);
+extern void spa_errlog_drain(spa_t *spa);
+extern void spa_errlog_sync(spa_t *spa, uint64_t txg);
+extern void spa_get_errlists(spa_t *spa, avl_tree_t *last, avl_tree_t *scrub);
+
+/* vdev cache */
+extern void vdev_cache_stat_init(void);
+extern void vdev_cache_stat_fini(void);
+
+/* Initialization and termination */
+extern void spa_init(int flags);
+extern void spa_fini(void);
+extern void spa_boot_init(void);
+
+/* properties */
+extern int spa_prop_set(spa_t *spa, nvlist_t *nvp);
+extern int spa_prop_get(spa_t *spa, nvlist_t **nvp);
+extern void spa_prop_clear_bootfs(spa_t *spa, uint64_t obj, dmu_tx_t *tx);
+extern void spa_configfile_set(spa_t *, nvlist_t *, boolean_t);
+
+/* asynchronous event notification */
+extern void spa_event_notify(spa_t *spa, vdev_t *vdev, nvlist_t *hist_nvl,
+ const char *name);
+extern sysevent_t *spa_event_create(spa_t *spa, vdev_t *vd, nvlist_t *hist_nvl,
+ const char *name);
+extern void spa_event_post(sysevent_t *ev);
+extern void spa_event_discard(sysevent_t *ev);
+
+#ifdef ZFS_DEBUG
+#define dprintf_bp(bp, fmt, ...) do { \
+ if (zfs_flags & ZFS_DEBUG_DPRINTF) { \
+ char *__blkbuf = kmem_alloc(BP_SPRINTF_LEN, KM_SLEEP); \
+ snprintf_blkptr(__blkbuf, BP_SPRINTF_LEN, (bp)); \
+ dprintf(fmt " %s\n", __VA_ARGS__, __blkbuf); \
+ kmem_free(__blkbuf, BP_SPRINTF_LEN); \
+ } \
+_NOTE(CONSTCOND) } while (0)
+#else
+#define dprintf_bp(bp, fmt, ...)
+#endif
+
+extern int spa_mode_global; /* mode, e.g. FREAD | FWRITE */
+
+#ifdef __cplusplus
+}
+#endif
+
+#endif /* _SYS_SPA_H */
diff --git a/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/spa_boot.h b/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/spa_boot.h
new file mode 100644
index 000000000000..8df5072a55ef
--- /dev/null
+++ b/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/spa_boot.h
@@ -0,0 +1,48 @@
+/*
+ * CDDL HEADER START
+ *
+ * The contents of this file are subject to the terms of the
+ * Common Development and Distribution License (the "License").
+ * You may not use this file except in compliance with the License.
+ *
+ * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
+ * or http://www.opensolaris.org/os/licensing.
+ * See the License for the specific language governing permissions
+ * and limitations under the License.
+ *
+ * When distributing Covered Code, include this CDDL HEADER in each
+ * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
+ * If applicable, add the following below this CDDL HEADER, with the
+ * fields enclosed by brackets "[]" replaced with your own identifying
+ * information: Portions Copyright [yyyy] [name of copyright owner]
+ *
+ * CDDL HEADER END
+ */
+/*
+ * Copyright 2009 Sun Microsystems, Inc. All rights reserved.
+ * Use is subject to license terms.
+ */
+
+/*
+ * Copyright (c) 2012 by Delphix. All rights reserved.
+ */
+
+#ifndef _SYS_SPA_BOOT_H
+#define _SYS_SPA_BOOT_H
+
+#include <sys/nvpair.h>
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+extern char *spa_get_bootprop(char *prop);
+extern void spa_free_bootprop(char *prop);
+
+extern void spa_arch_init(void);
+
+#ifdef __cplusplus
+}
+#endif
+
+#endif /* _SYS_SPA_BOOT_H */
diff --git a/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/spa_checkpoint.h b/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/spa_checkpoint.h
new file mode 100644
index 000000000000..9be2b6eeab3c
--- /dev/null
+++ b/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/spa_checkpoint.h
@@ -0,0 +1,44 @@
+/*
+ * CDDL HEADER START
+ *
+ * The contents of this file are subject to the terms of the
+ * Common Development and Distribution License (the "License").
+ * You may not use this file except in compliance with the License.
+ *
+ * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
+ * or http://www.opensolaris.org/os/licensing.
+ * See the License for the specific language governing permissions
+ * and limitations under the License.
+ *
+ * When distributing Covered Code, include this CDDL HEADER in each
+ * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
+ * If applicable, add the following below this CDDL HEADER, with the
+ * fields enclosed by brackets "[]" replaced with your own identifying
+ * information: Portions Copyright [yyyy] [name of copyright owner]
+ *
+ * CDDL HEADER END
+ */
+
+/*
+ * Copyright (c) 2017 by Delphix. All rights reserved.
+ */
+
+#ifndef _SYS_SPA_CHECKPOINT_H
+#define _SYS_SPA_CHECKPOINT_H
+
+#include <sys/zthr.h>
+
+typedef struct spa_checkpoint_info {
+ uint64_t sci_timestamp; /* when checkpointed uberblock was synced */
+ uint64_t sci_dspace; /* disk space used by checkpoint in bytes */
+} spa_checkpoint_info_t;
+
+int spa_checkpoint(const char *);
+int spa_checkpoint_discard(const char *);
+
+boolean_t spa_checkpoint_discard_thread_check(void *, zthr_t *);
+void spa_checkpoint_discard_thread(void *, zthr_t *);
+
+int spa_checkpoint_get_stats(spa_t *, pool_checkpoint_stat_t *);
+
+#endif /* _SYS_SPA_CHECKPOINT_H */
diff --git a/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/spa_impl.h b/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/spa_impl.h
new file mode 100644
index 000000000000..11b6982798e8
--- /dev/null
+++ b/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/spa_impl.h
@@ -0,0 +1,435 @@
+/*
+ * CDDL HEADER START
+ *
+ * The contents of this file are subject to the terms of the
+ * Common Development and Distribution License (the "License").
+ * You may not use this file except in compliance with the License.
+ *
+ * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
+ * or http://www.opensolaris.org/os/licensing.
+ * See the License for the specific language governing permissions
+ * and limitations under the License.
+ *
+ * When distributing Covered Code, include this CDDL HEADER in each
+ * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
+ * If applicable, add the following below this CDDL HEADER, with the
+ * fields enclosed by brackets "[]" replaced with your own identifying
+ * information: Portions Copyright [yyyy] [name of copyright owner]
+ *
+ * CDDL HEADER END
+ */
+/*
+ * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved.
+ * Copyright (c) 2011, 2018 by Delphix. All rights reserved.
+ * Copyright 2011 Nexenta Systems, Inc. All rights reserved.
+ * Copyright 2013 Martin Matuska <mm@FreeBSD.org>. All rights reserved.
+ * Copyright (c) 2014 Spectra Logic Corporation, All rights reserved.
+ * Copyright 2013 Saso Kiselkov. All rights reserved.
+ * Copyright (c) 2017 Datto Inc.
+ * Copyright (c) 2017, Intel Corporation.
+ * Copyright (c) 2016 Actifio, Inc. All rights reserved.
+ */
+
+#ifndef _SYS_SPA_IMPL_H
+#define _SYS_SPA_IMPL_H
+
+#include <sys/spa.h>
+#include <sys/spa_checkpoint.h>
+#include <sys/vdev.h>
+#include <sys/vdev_removal.h>
+#include <sys/metaslab.h>
+#include <sys/dmu.h>
+#include <sys/dsl_pool.h>
+#include <sys/uberblock_impl.h>
+#include <sys/zfs_context.h>
+#include <sys/avl.h>
+#include <sys/refcount.h>
+#include <sys/bplist.h>
+#include <sys/bpobj.h>
+#include <sys/zfeature.h>
+#include <sys/zthr.h>
+#include <zfeature_common.h>
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+typedef struct spa_error_entry {
+ zbookmark_phys_t se_bookmark;
+ char *se_name;
+ avl_node_t se_avl;
+} spa_error_entry_t;
+
+typedef struct spa_history_phys {
+ uint64_t sh_pool_create_len; /* ending offset of zpool create */
+ uint64_t sh_phys_max_off; /* physical EOF */
+ uint64_t sh_bof; /* logical BOF */
+ uint64_t sh_eof; /* logical EOF */
+ uint64_t sh_records_lost; /* num of records overwritten */
+} spa_history_phys_t;
+
+/*
+ * All members must be uint64_t, for byteswap purposes.
+ */
+typedef struct spa_removing_phys {
+ uint64_t sr_state; /* dsl_scan_state_t */
+
+ /*
+ * The vdev ID that we most recently attempted to remove,
+ * or -1 if no removal has been attempted.
+ */
+ uint64_t sr_removing_vdev;
+
+ /*
+ * The vdev ID that we most recently successfully removed,
+ * or -1 if no devices have been removed.
+ */
+ uint64_t sr_prev_indirect_vdev;
+
+ uint64_t sr_start_time;
+ uint64_t sr_end_time;
+
+ /*
+ * Note that we cannot use the space map's or indirect mapping's
+ * accounting as a substitute for these values, because we need to
+ * count frees of not-yet-copied data as though the copy had been done.
+ * Otherwise, we could get into a situation where copied > to_copy,
+ * or we complete before copied == to_copy.
+ */
+ uint64_t sr_to_copy; /* bytes that need to be copied */
+ uint64_t sr_copied; /* bytes that have been copied or freed */
+} spa_removing_phys_t;
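
A minimal sketch of the accounting rule stated above (hypothetical helper, not part of this header): frees of not-yet-copied data advance sr_copied exactly as real copies do, so sr_copied == sr_to_copy remains a reliable completion test.

	static void
	removal_progress(spa_removing_phys_t *srp, uint64_t bytes_copied,
	    uint64_t bytes_freed_uncopied)
	{
		/* Both paths count toward sr_copied, per the comment above. */
		srp->sr_copied += bytes_copied + bytes_freed_uncopied;
		ASSERT3U(srp->sr_copied, <=, srp->sr_to_copy);
	}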
+
+/*
+ * This struct is stored as an entry in the DMU_POOL_DIRECTORY_OBJECT
+ * (with key DMU_POOL_CONDENSING_INDIRECT). It is present if a condense
+ * of an indirect vdev's mapping object is in progress.
+ */
+typedef struct spa_condensing_indirect_phys {
+ /*
+ * The vdev ID of the indirect vdev whose indirect mapping is
+ * being condensed.
+ */
+ uint64_t scip_vdev;
+
+ /*
+ * The vdev's old obsolete spacemap. This spacemap's contents are
+ * being integrated into the new mapping.
+ */
+ uint64_t scip_prev_obsolete_sm_object;
+
+ /*
+ * The new mapping object that is being created.
+ */
+ uint64_t scip_next_mapping_object;
+} spa_condensing_indirect_phys_t;
+
+struct spa_aux_vdev {
+ uint64_t sav_object; /* MOS object for device list */
+ nvlist_t *sav_config; /* cached device config */
+ vdev_t **sav_vdevs; /* devices */
+ int sav_count; /* number devices */
+ boolean_t sav_sync; /* sync the device list */
+ nvlist_t **sav_pending; /* pending device additions */
+ uint_t sav_npending; /* # pending devices */
+};
+
+typedef struct spa_config_lock {
+ kmutex_t scl_lock;
+ kthread_t *scl_writer;
+ int scl_write_wanted;
+ kcondvar_t scl_cv;
+ zfs_refcount_t scl_count;
+} spa_config_lock_t;
+
+typedef struct spa_config_dirent {
+ list_node_t scd_link;
+ char *scd_path;
+} spa_config_dirent_t;
+
+typedef enum zio_taskq_type {
+ ZIO_TASKQ_ISSUE = 0,
+ ZIO_TASKQ_ISSUE_HIGH,
+ ZIO_TASKQ_INTERRUPT,
+ ZIO_TASKQ_INTERRUPT_HIGH,
+ ZIO_TASKQ_TYPES
+} zio_taskq_type_t;
+
+/*
+ * State machine for the zpool-poolname process. The state transitions
+ * are as follows:
+ *
+ * From To Routine
+ * PROC_NONE -> PROC_CREATED spa_activate()
+ * PROC_CREATED -> PROC_ACTIVE spa_thread()
+ * PROC_ACTIVE -> PROC_DEACTIVATE spa_deactivate()
+ * PROC_DEACTIVATE -> PROC_GONE spa_thread()
+ * PROC_GONE -> PROC_NONE spa_deactivate()
+ */
+typedef enum spa_proc_state {
+ SPA_PROC_NONE, /* spa_proc = &p0, no process created */
+ SPA_PROC_CREATED, /* spa_activate() has proc, is waiting */
+ SPA_PROC_ACTIVE, /* taskqs created, spa_proc set */
+ SPA_PROC_DEACTIVATE, /* spa_deactivate() requests process exit */
+ SPA_PROC_GONE /* spa_thread() is exiting, spa_proc = &p0 */
+} spa_proc_state_t;
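
The transition table reads as a strict cycle; here is an illustrative assertion helper (not in the source) that encodes the legal moves:

	static boolean_t
	spa_proc_transition_is_valid(spa_proc_state_t from, spa_proc_state_t to)
	{
		switch (from) {
		case SPA_PROC_NONE:
			return (to == SPA_PROC_CREATED);	/* spa_activate() */
		case SPA_PROC_CREATED:
			return (to == SPA_PROC_ACTIVE);		/* spa_thread() */
		case SPA_PROC_ACTIVE:
			return (to == SPA_PROC_DEACTIVATE);	/* spa_deactivate() */
		case SPA_PROC_DEACTIVATE:
			return (to == SPA_PROC_GONE);		/* spa_thread() */
		case SPA_PROC_GONE:
			return (to == SPA_PROC_NONE);		/* spa_deactivate() */
		default:
			return (B_FALSE);
		}
	}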
+
+typedef struct spa_taskqs {
+ uint_t stqs_count;
+ taskq_t **stqs_taskq;
+} spa_taskqs_t;
+
+typedef enum spa_all_vdev_zap_action {
+ AVZ_ACTION_NONE = 0,
+ AVZ_ACTION_DESTROY, /* Destroy all per-vdev ZAPs and the AVZ. */
+ AVZ_ACTION_REBUILD, /* Populate the new AVZ, see spa_avz_rebuild */
+ AVZ_ACTION_INITIALIZE
+} spa_avz_action_t;
+
+typedef enum spa_config_source {
+ SPA_CONFIG_SRC_NONE = 0,
+ SPA_CONFIG_SRC_SCAN, /* scan of path (default: /dev/dsk) */
+ SPA_CONFIG_SRC_CACHEFILE, /* any cachefile */
+ SPA_CONFIG_SRC_TRYIMPORT, /* returned from call to tryimport */
+ SPA_CONFIG_SRC_SPLIT, /* new pool in a pool split */
+ SPA_CONFIG_SRC_MOS /* MOS, but not always from right txg */
+} spa_config_source_t;
+
+struct spa {
+ /*
+ * Fields protected by spa_namespace_lock.
+ */
+ char spa_name[ZFS_MAX_DATASET_NAME_LEN]; /* pool name */
+ char *spa_comment; /* comment */
+ avl_node_t spa_avl; /* node in spa_namespace_avl */
+ nvlist_t *spa_config; /* last synced config */
+ nvlist_t *spa_config_syncing; /* currently syncing config */
+ nvlist_t *spa_config_splitting; /* config for splitting */
+ nvlist_t *spa_load_info; /* info and errors from load */
+ uint64_t spa_config_txg; /* txg of last config change */
+ int spa_sync_pass; /* iterate-to-convergence */
+ pool_state_t spa_state; /* pool state */
+ int spa_inject_ref; /* injection references */
+ uint8_t spa_sync_on; /* sync threads are running */
+ spa_load_state_t spa_load_state; /* current load operation */
+ boolean_t spa_indirect_vdevs_loaded; /* mappings loaded? */
+ boolean_t spa_trust_config; /* do we trust vdev tree? */
+ spa_config_source_t spa_config_source; /* where the config came from */
+ uint64_t spa_import_flags; /* import specific flags */
+ spa_taskqs_t spa_zio_taskq[ZIO_TYPES][ZIO_TASKQ_TYPES];
+ dsl_pool_t *spa_dsl_pool;
+ boolean_t spa_is_initializing; /* true while opening pool */
+ metaslab_class_t *spa_normal_class; /* normal data class */
+ metaslab_class_t *spa_log_class; /* intent log data class */
+ metaslab_class_t *spa_special_class; /* special allocation class */
+ metaslab_class_t *spa_dedup_class; /* dedup allocation class */
+ uint64_t spa_first_txg; /* first txg after spa_open() */
+ uint64_t spa_final_txg; /* txg of export/destroy */
+ uint64_t spa_freeze_txg; /* freeze pool at this txg */
+ uint64_t spa_load_max_txg; /* best initial ub_txg */
+ uint64_t spa_claim_max_txg; /* highest claimed birth txg */
+ timespec_t spa_loaded_ts; /* 1st successful open time */
+ objset_t *spa_meta_objset; /* copy of dp->dp_meta_objset */
+ kmutex_t spa_evicting_os_lock; /* Evicting objset list lock */
+ list_t spa_evicting_os_list; /* Objsets being evicted. */
+ kcondvar_t spa_evicting_os_cv; /* Objset Eviction Completion */
+ txg_list_t spa_vdev_txg_list; /* per-txg dirty vdev list */
+ vdev_t *spa_root_vdev; /* top-level vdev container */
+ int spa_min_ashift; /* of vdevs in normal class */
+ int spa_max_ashift; /* of vdevs in normal class */
+ uint64_t spa_config_guid; /* config pool guid */
+ uint64_t spa_load_guid; /* spa_load initialized guid */
+ uint64_t spa_last_synced_guid; /* last synced guid */
+ list_t spa_config_dirty_list; /* vdevs with dirty config */
+ list_t spa_state_dirty_list; /* vdevs with dirty state */
+ /*
+ * spa_alloc_locks and spa_alloc_trees are arrays whose length is
+ * stored in spa_alloc_count. There is one tree and one lock for each
+ * allocator, to help improve allocation performance in write-heavy
+ * workloads.
+ */
+ kmutex_t *spa_alloc_locks;
+ avl_tree_t *spa_alloc_trees;
+ int spa_alloc_count;
+
+ spa_aux_vdev_t spa_spares; /* hot spares */
+ spa_aux_vdev_t spa_l2cache; /* L2ARC cache devices */
+ nvlist_t *spa_label_features; /* Features for reading MOS */
+ uint64_t spa_config_object; /* MOS object for pool config */
+ uint64_t spa_config_generation; /* config generation number */
+ uint64_t spa_syncing_txg; /* txg currently syncing */
+ bpobj_t spa_deferred_bpobj; /* deferred-free bplist */
+ bplist_t spa_free_bplist[TXG_SIZE]; /* bplist of stuff to free */
+ zio_cksum_salt_t spa_cksum_salt; /* secret salt for cksum */
+ /* checksum context templates */
+ kmutex_t spa_cksum_tmpls_lock;
+ void *spa_cksum_tmpls[ZIO_CHECKSUM_FUNCTIONS];
+ uberblock_t spa_ubsync; /* last synced uberblock */
+ uberblock_t spa_uberblock; /* current uberblock */
+ boolean_t spa_extreme_rewind; /* rewind past deferred frees */
+ uint64_t spa_last_io; /* lbolt of last non-scan I/O */
+ kmutex_t spa_scrub_lock; /* resilver/scrub lock */
+ uint64_t spa_scrub_inflight; /* in-flight scrub bytes */
+ uint64_t spa_load_verify_ios; /* in-flight verification IOs */
+ kcondvar_t spa_scrub_io_cv; /* scrub I/O completion */
+ uint8_t spa_scrub_active; /* active or suspended? */
+ uint8_t spa_scrub_type; /* type of scrub we're doing */
+ uint8_t spa_scrub_finished; /* indicator to rotate logs */
+ uint8_t spa_scrub_started; /* started since last boot */
+ uint8_t spa_scrub_reopen; /* scrub doing vdev_reopen */
+ uint64_t spa_scan_pass_start; /* start time per pass/reboot */
+ uint64_t spa_scan_pass_scrub_pause; /* scrub pause time */
+ uint64_t spa_scan_pass_scrub_spent_paused; /* total paused */
+ uint64_t spa_scan_pass_exam; /* examined bytes per pass */
+ uint64_t spa_scan_pass_issued; /* issued bytes per pass */
+ kmutex_t spa_async_lock; /* protect async state */
+ kthread_t *spa_async_thread; /* thread doing async task */
+ kthread_t *spa_async_thread_vd; /* thread doing vd async task */
+ int spa_async_suspended; /* async tasks suspended */
+ kcondvar_t spa_async_cv; /* wait for thread_exit() */
+ uint16_t spa_async_tasks; /* async task mask */
+ uint64_t spa_missing_tvds; /* unopenable tvds on load */
+ uint64_t spa_missing_tvds_allowed; /* allow loading spa? */
+
+ spa_removing_phys_t spa_removing_phys;
+ spa_vdev_removal_t *spa_vdev_removal;
+
+ spa_condensing_indirect_phys_t spa_condensing_indirect_phys;
+ spa_condensing_indirect_t *spa_condensing_indirect;
+ zthr_t *spa_condense_zthr; /* zthr doing condense. */
+
+ uint64_t spa_checkpoint_txg; /* the txg of the checkpoint */
+ spa_checkpoint_info_t spa_checkpoint_info; /* checkpoint accounting */
+ zthr_t *spa_checkpoint_discard_zthr;
+
+ char *spa_root; /* alternate root directory */
+ uint64_t spa_ena; /* spa-wide ereport ENA */
+ int spa_last_open_failed; /* error if last open failed */
+ uint64_t spa_last_ubsync_txg; /* "best" uberblock txg */
+ uint64_t spa_last_ubsync_txg_ts; /* timestamp from that ub */
+ uint64_t spa_load_txg; /* ub txg that loaded */
+ uint64_t spa_load_txg_ts; /* timestamp from that ub */
+ uint64_t spa_load_meta_errors; /* verify metadata err count */
+ uint64_t spa_load_data_errors; /* verify data err count */
+ uint64_t spa_verify_min_txg; /* start txg of verify scrub */
+ kmutex_t spa_errlog_lock; /* error log lock */
+ uint64_t spa_errlog_last; /* last error log object */
+ uint64_t spa_errlog_scrub; /* scrub error log object */
+ kmutex_t spa_errlist_lock; /* error list/ereport lock */
+ avl_tree_t spa_errlist_last; /* last error list */
+ avl_tree_t spa_errlist_scrub; /* scrub error list */
+ uint64_t spa_deflate; /* should we deflate? */
+ uint64_t spa_history; /* history object */
+ kmutex_t spa_history_lock; /* history lock */
+ vdev_t *spa_pending_vdev; /* pending vdev additions */
+ kmutex_t spa_props_lock; /* property lock */
+ uint64_t spa_pool_props_object; /* object for properties */
+ uint64_t spa_bootfs; /* default boot filesystem */
+ uint64_t spa_failmode; /* failure mode for the pool */
+ uint64_t spa_delegation; /* delegation on/off */
+ list_t spa_config_list; /* previous cache file(s) */
+ /* per-CPU array of async I/O roots: */
+ zio_t **spa_async_zio_root;
+ zio_t *spa_suspend_zio_root; /* root of all suspended I/O */
+ zio_t *spa_txg_zio[TXG_SIZE]; /* spa_sync() waits for this */
+ kmutex_t spa_suspend_lock; /* protects suspend_zio_root */
+ kcondvar_t spa_suspend_cv; /* notification of resume */
+ zio_suspend_reason_t spa_suspended; /* pool is suspended */
+ uint8_t spa_claiming; /* pool is doing zil_claim() */
+ boolean_t spa_is_root; /* pool is root */
+ int spa_minref; /* num refs when first opened */
+ int spa_mode; /* FREAD | FWRITE */
+ spa_log_state_t spa_log_state; /* log state */
+ uint64_t spa_autoexpand; /* lun expansion on/off */
+ uint64_t spa_bootsize; /* efi system partition size */
+ ddt_t *spa_ddt[ZIO_CHECKSUM_FUNCTIONS]; /* in-core DDTs */
+ uint64_t spa_ddt_stat_object; /* DDT statistics */
+ uint64_t spa_dedup_ditto; /* dedup ditto threshold */
+ uint64_t spa_dedup_checksum; /* default dedup checksum */
+ uint64_t spa_dspace; /* dspace in normal class */
+ kmutex_t spa_vdev_top_lock; /* dueling offline/remove */
+ kmutex_t spa_proc_lock; /* protects spa_proc* */
+ kcondvar_t spa_proc_cv; /* spa_proc_state transitions */
+ spa_proc_state_t spa_proc_state; /* see definition */
+ struct proc *spa_proc; /* "zpool-poolname" process */
+ uint64_t spa_did; /* if procp != p0, did of t1 */
+ kthread_t *spa_trim_thread; /* thread sending TRIM I/Os */
+ kmutex_t spa_trim_lock; /* protects spa_trim_cv */
+ kcondvar_t spa_trim_cv; /* used to notify TRIM thread */
+ boolean_t spa_autoreplace; /* autoreplace set in open */
+ int spa_vdev_locks; /* locks grabbed */
+ uint64_t spa_creation_version; /* version at pool creation */
+ uint64_t spa_prev_software_version; /* See ub_software_version */
+ uint64_t spa_feat_for_write_obj; /* required to write to pool */
+ uint64_t spa_feat_for_read_obj; /* required to read from pool */
+ uint64_t spa_feat_desc_obj; /* Feature descriptions */
+ uint64_t spa_feat_enabled_txg_obj; /* Feature enabled txg */
+ kmutex_t spa_feat_stats_lock; /* protects spa_feat_stats */
+ nvlist_t *spa_feat_stats; /* Cache of enabled features */
+ /* cache feature refcounts */
+ uint64_t spa_feat_refcount_cache[SPA_FEATURES];
+#ifdef illumos
+ cyclic_id_t spa_deadman_cycid; /* cyclic id */
+#else /* !illumos */
+#ifdef _KERNEL
+ struct callout spa_deadman_cycid; /* callout id */
+ struct task spa_deadman_task;
+#endif
+#endif /* illumos */
+ uint64_t spa_deadman_calls; /* number of deadman calls */
+ hrtime_t spa_sync_starttime; /* starting time of spa_sync */
+ uint64_t spa_deadman_synctime; /* deadman expiration timer */
+ uint64_t spa_all_vdev_zaps; /* ZAP of per-vd ZAP obj #s */
+ spa_avz_action_t spa_avz_action; /* destroy/rebuild AVZ? */
+
+#ifdef illumos
+ /*
+ * spa_iokstat_lock protects spa_iokstat and
+ * spa_queue_stats[].
+ */
+ kmutex_t spa_iokstat_lock;
+ struct kstat *spa_iokstat; /* kstat of io to this pool */
+ struct {
+ int spa_active;
+ int spa_queued;
+ } spa_queue_stats[ZIO_PRIORITY_NUM_QUEUEABLE];
+#endif
+ /* arc_memory_throttle() parameters during low memory condition */
+ uint64_t spa_lowmem_page_load; /* memory load during txg */
+ uint64_t spa_lowmem_last_txg; /* txg window start */
+
+ hrtime_t spa_ccw_fail_time; /* Conf cache write fail time */
+
+ taskq_t *spa_zvol_taskq; /* Taskq for minor management */
+
+ uint64_t spa_multihost; /* multihost aware (mmp) */
+ mmp_thread_t spa_mmp; /* multihost mmp thread */
+ list_t spa_leaf_list; /* list of leaf vdevs */
+ uint64_t spa_leaf_list_gen; /* track leaf_list changes */
+
+ /*
+ * spa_refcount & spa_config_lock must be the last elements
+ * because zfs_refcount_t changes size based on compilation options.
+ * In order for the MDB module to function correctly, the other
+ * fields must remain in the same location.
+ */
+ spa_config_lock_t spa_config_lock[SCL_LOCKS]; /* config changes */
+ zfs_refcount_t spa_refcount; /* number of opens */
+#ifndef illumos
+ boolean_t spa_splitting_newspa; /* creating new spa in split */
+#endif
+};
+
+extern const char *spa_config_path;
+
+extern void spa_taskq_dispatch_ent(spa_t *spa, zio_type_t t, zio_taskq_type_t q,
+ task_func_t *func, void *arg, uint_t flags, taskq_ent_t *ent);
+extern void spa_load_spares(spa_t *spa);
+extern void spa_load_l2cache(spa_t *spa);
+
+#ifdef __cplusplus
+}
+#endif
+
+#endif /* _SYS_SPA_IMPL_H */
diff --git a/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/space_map.h b/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/space_map.h
new file mode 100644
index 000000000000..2bce20b48ba5
--- /dev/null
+++ b/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/space_map.h
@@ -0,0 +1,230 @@
+/*
+ * CDDL HEADER START
+ *
+ * The contents of this file are subject to the terms of the
+ * Common Development and Distribution License (the "License").
+ * You may not use this file except in compliance with the License.
+ *
+ * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
+ * or http://www.opensolaris.org/os/licensing.
+ * See the License for the specific language governing permissions
+ * and limitations under the License.
+ *
+ * When distributing Covered Code, include this CDDL HEADER in each
+ * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
+ * If applicable, add the following below this CDDL HEADER, with the
+ * fields enclosed by brackets "[]" replaced with your own identifying
+ * information: Portions Copyright [yyyy] [name of copyright owner]
+ *
+ * CDDL HEADER END
+ */
+/*
+ * Copyright 2009 Sun Microsystems, Inc. All rights reserved.
+ * Use is subject to license terms.
+ */
+
+/*
+ * Copyright (c) 2012, 2018 by Delphix. All rights reserved.
+ */
+
+#ifndef _SYS_SPACE_MAP_H
+#define _SYS_SPACE_MAP_H
+
+#include <sys/avl.h>
+#include <sys/range_tree.h>
+#include <sys/dmu.h>
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+/*
+ * The size of the space map object has increased to include a histogram.
+ * The SPACE_MAP_SIZE_V0 designates the original size and is used to
+ * maintain backward compatibility.
+ */
+#define SPACE_MAP_SIZE_V0 (3 * sizeof (uint64_t))
+#define SPACE_MAP_HISTOGRAM_SIZE 32
+
+/*
+ * The space_map_phys is the on-disk representation of the space map.
+ * Consumers of space maps should never reference any of the members of this
+ * structure directly. These members may only be updated in syncing context.
+ *
+ * Note the smp_object is no longer used but remains in the structure
+ * for backward compatibility.
+ */
+typedef struct space_map_phys {
+ /* object number: not needed but kept for backwards compatibility */
+ uint64_t smp_object;
+
+ /* length of the object in bytes */
+ uint64_t smp_length;
+
+ /* space allocated from the map */
+ int64_t smp_alloc;
+
+ /* reserved */
+ uint64_t smp_pad[5];
+
+ /*
+ * The smp_histogram maintains a histogram of free regions. Each
+ * bucket, smp_histogram[i], contains the number of free regions
+ * whose size is:
+ * 2^(i+sm_shift) <= size of free region in bytes < 2^(i+sm_shift+1)
+ */
+ uint64_t smp_histogram[SPACE_MAP_HISTOGRAM_SIZE];
+} space_map_phys_t;
+
+/*
+ * The space map object defines a region of space, its size, how much is
+ * allocated, and the on-disk object that stores this information.
+ * Consumers of space maps may only access the members of this structure.
+ *
+ * Note: the space_map may not be accessed concurrently; consumers
+ * must provide external locking if required.
+ */
+typedef struct space_map {
+ uint64_t sm_start; /* start of map */
+ uint64_t sm_size; /* size of map */
+ uint8_t sm_shift; /* unit shift */
+ objset_t *sm_os; /* objset for this map */
+ uint64_t sm_object; /* object id for this map */
+ uint32_t sm_blksz; /* block size for space map */
+ dmu_buf_t *sm_dbuf; /* space_map_phys_t dbuf */
+ space_map_phys_t *sm_phys; /* on-disk space map */
+} space_map_t;
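
Following the histogram formula above, the bucket index for a free region is its floor-log2 size class relative to sm_shift. A sketch (assumed helper, using the highbit64() routine common in this code base):

	static int
	sm_histogram_bucket(const space_map_t *sm, uint64_t region_size)
	{
		/* 2^(i+sm_shift) <= size < 2^(i+sm_shift+1), so i = log2(size) - shift */
		int idx = highbit64(region_size) - 1 - sm->sm_shift;

		ASSERT3S(idx, >=, 0);
		return (MIN(idx, SPACE_MAP_HISTOGRAM_SIZE - 1));
	}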
+
+/*
+ * debug entry
+ *
+ * 2 2 10 50
+ * +-----+-----+------------+----------------------------------+
+ * | 1 0 | act | syncpass | txg (lower bits) |
+ * +-----+-----+------------+----------------------------------+
+ * 63 62 61 60 59 50 49 0
+ *
+ *
+ * one-word entry
+ *
+ * 1 47 1 15
+ * +-----------------------------------------------------------+
+ * | 0 | offset (sm_shift units) | type | run |
+ * +-----------------------------------------------------------+
+ * 63 62 16 15 14 0
+ *
+ *
+ * two-word entry
+ *
+ * 2 2 36 24
+ * +-----+-----+---------------------------+-------------------+
+ * | 1 1 | pad | run | vdev |
+ * +-----+-----+---------------------------+-------------------+
+ * 63 62 61 60 59 24 23 0
+ *
+ * 1 63
+ * +------+----------------------------------------------------+
+ * | type | offset |
+ * +------+----------------------------------------------------+
+ * 63 62 0
+ *
+ * Note that a two-word entry will not straddle a block boundary.
+ * If necessary, the last word of a block will be padded with a
+ * debug entry (with act = syncpass = txg = 0).
+ */
+
+typedef enum {
+ SM_ALLOC,
+ SM_FREE
+} maptype_t;
+
+typedef struct space_map_entry {
+ maptype_t sme_type;
+ uint32_t sme_vdev; /* max is 2^24-1; SM_NO_VDEVID if not present */
+ uint64_t sme_offset; /* max is 2^63-1; units of sm_shift */
+ uint64_t sme_run; /* max is 2^36; units of sm_shift */
+} space_map_entry_t;
+
+#define SM_NO_VDEVID (1 << SPA_VDEVBITS)
+
+/* one-word entry constants */
+#define SM_DEBUG_PREFIX 2
+#define SM_OFFSET_BITS 47
+#define SM_RUN_BITS 15
+
+/* two-word entry constants */
+#define SM2_PREFIX 3
+#define SM2_OFFSET_BITS 63
+#define SM2_RUN_BITS 36
+
+#define SM_PREFIX_DECODE(x) BF64_DECODE(x, 62, 2)
+#define SM_PREFIX_ENCODE(x) BF64_ENCODE(x, 62, 2)
+
+#define SM_DEBUG_ACTION_DECODE(x) BF64_DECODE(x, 60, 2)
+#define SM_DEBUG_ACTION_ENCODE(x) BF64_ENCODE(x, 60, 2)
+#define SM_DEBUG_SYNCPASS_DECODE(x) BF64_DECODE(x, 50, 10)
+#define SM_DEBUG_SYNCPASS_ENCODE(x) BF64_ENCODE(x, 50, 10)
+#define SM_DEBUG_TXG_DECODE(x) BF64_DECODE(x, 0, 50)
+#define SM_DEBUG_TXG_ENCODE(x) BF64_ENCODE(x, 0, 50)
+
+#define SM_OFFSET_DECODE(x) BF64_DECODE(x, 16, SM_OFFSET_BITS)
+#define SM_OFFSET_ENCODE(x) BF64_ENCODE(x, 16, SM_OFFSET_BITS)
+#define SM_TYPE_DECODE(x) BF64_DECODE(x, 15, 1)
+#define SM_TYPE_ENCODE(x) BF64_ENCODE(x, 15, 1)
+#define SM_RUN_DECODE(x) (BF64_DECODE(x, 0, SM_RUN_BITS) + 1)
+#define SM_RUN_ENCODE(x) BF64_ENCODE((x) - 1, 0, SM_RUN_BITS)
+#define SM_RUN_MAX SM_RUN_DECODE(~0ULL)
+#define SM_OFFSET_MAX SM_OFFSET_DECODE(~0ULL)
+
+#define SM2_RUN_DECODE(x) (BF64_DECODE(x, SPA_VDEVBITS, SM2_RUN_BITS) + 1)
+#define SM2_RUN_ENCODE(x) BF64_ENCODE((x) - 1, SPA_VDEVBITS, SM2_RUN_BITS)
+#define SM2_VDEV_DECODE(x) BF64_DECODE(x, 0, SPA_VDEVBITS)
+#define SM2_VDEV_ENCODE(x) BF64_ENCODE(x, 0, SPA_VDEVBITS)
+#define SM2_TYPE_DECODE(x) BF64_DECODE(x, SM2_OFFSET_BITS, 1)
+#define SM2_TYPE_ENCODE(x) BF64_ENCODE(x, SM2_OFFSET_BITS, 1)
+#define SM2_OFFSET_DECODE(x) BF64_DECODE(x, 0, SM2_OFFSET_BITS)
+#define SM2_OFFSET_ENCODE(x) BF64_ENCODE(x, 0, SM2_OFFSET_BITS)
+#define SM2_RUN_MAX SM2_RUN_DECODE(~0ULL)
+#define SM2_OFFSET_MAX SM2_OFFSET_DECODE(~0ULL)
+
+boolean_t sm_entry_is_debug(uint64_t e);
+boolean_t sm_entry_is_single_word(uint64_t e);
+boolean_t sm_entry_is_double_word(uint64_t e);
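
A sketch of decoding a one-word entry with the macros above (simplified: the real iterator also handles debug entries, two-word entries, and the block-boundary padding noted earlier; it assumes the on-disk type bit matches the maptype_t ordering, as the enum order suggests):

	static void
	sm_decode_one_word(uint64_t e, space_map_entry_t *sme)
	{
		ASSERT(sm_entry_is_single_word(e));

		sme->sme_type = (maptype_t)SM_TYPE_DECODE(e); /* SM_ALLOC=0, SM_FREE=1 */
		sme->sme_vdev = SM_NO_VDEVID;		/* one-word entries carry no vdev */
		sme->sme_offset = SM_OFFSET_DECODE(e);	/* in units of sm_shift */
		sme->sme_run = SM_RUN_DECODE(e);	/* stored on disk biased by -1 */
	}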
+
+typedef int (*sm_cb_t)(space_map_entry_t *sme, void *arg);
+
+int space_map_load(space_map_t *sm, range_tree_t *rt, maptype_t maptype);
+int space_map_load_length(space_map_t *sm, range_tree_t *rt, maptype_t maptype,
+ uint64_t length);
+int space_map_iterate(space_map_t *sm, uint64_t length,
+ sm_cb_t callback, void *arg);
+int space_map_incremental_destroy(space_map_t *sm, sm_cb_t callback, void *arg,
+ dmu_tx_t *tx);
+
+boolean_t space_map_histogram_verify(space_map_t *sm, range_tree_t *rt);
+void space_map_histogram_clear(space_map_t *sm);
+void space_map_histogram_add(space_map_t *sm, range_tree_t *rt,
+ dmu_tx_t *tx);
+
+uint64_t space_map_object(space_map_t *sm);
+int64_t space_map_allocated(space_map_t *sm);
+uint64_t space_map_length(space_map_t *sm);
+
+void space_map_write(space_map_t *sm, range_tree_t *rt, maptype_t maptype,
+ uint64_t vdev_id, dmu_tx_t *tx);
+uint64_t space_map_estimate_optimal_size(space_map_t *sm, range_tree_t *rt,
+ uint64_t vdev_id);
+void space_map_truncate(space_map_t *sm, int blocksize, dmu_tx_t *tx);
+uint64_t space_map_alloc(objset_t *os, int blocksize, dmu_tx_t *tx);
+void space_map_free(space_map_t *sm, dmu_tx_t *tx);
+void space_map_free_obj(objset_t *os, uint64_t smobj, dmu_tx_t *tx);
+
+int space_map_open(space_map_t **smp, objset_t *os, uint64_t object,
+ uint64_t start, uint64_t size, uint8_t shift);
+void space_map_close(space_map_t *sm);
+
+#ifdef __cplusplus
+}
+#endif
+
+#endif /* _SYS_SPACE_MAP_H */
diff --git a/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/space_reftree.h b/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/space_reftree.h
new file mode 100644
index 000000000000..249b15be6729
--- /dev/null
+++ b/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/space_reftree.h
@@ -0,0 +1,57 @@
+/*
+ * CDDL HEADER START
+ *
+ * The contents of this file are subject to the terms of the
+ * Common Development and Distribution License (the "License").
+ * You may not use this file except in compliance with the License.
+ *
+ * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
+ * or http://www.opensolaris.org/os/licensing.
+ * See the License for the specific language governing permissions
+ * and limitations under the License.
+ *
+ * When distributing Covered Code, include this CDDL HEADER in each
+ * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
+ * If applicable, add the following below this CDDL HEADER, with the
+ * fields enclosed by brackets "[]" replaced with your own identifying
+ * information: Portions Copyright [yyyy] [name of copyright owner]
+ *
+ * CDDL HEADER END
+ */
+/*
+ * Copyright 2009 Sun Microsystems, Inc. All rights reserved.
+ * Use is subject to license terms.
+ */
+
+/*
+ * Copyright (c) 2013 by Delphix. All rights reserved.
+ */
+
+#ifndef _SYS_SPACE_REFTREE_H
+#define _SYS_SPACE_REFTREE_H
+
+#include <sys/range_tree.h>
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+typedef struct space_ref {
+ avl_node_t sr_node; /* AVL node */
+ uint64_t sr_offset; /* range offset (start or end) */
+ int64_t sr_refcnt; /* associated reference count */
+} space_ref_t;
+
+void space_reftree_create(avl_tree_t *t);
+void space_reftree_destroy(avl_tree_t *t);
+void space_reftree_add_seg(avl_tree_t *t, uint64_t start, uint64_t end,
+ int64_t refcnt);
+void space_reftree_add_map(avl_tree_t *t, range_tree_t *rt, int64_t refcnt);
+void space_reftree_generate_map(avl_tree_t *t, range_tree_t *rt,
+ int64_t minref);
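
An illustrative use of the reference tree (hypothetical snippet, following the DTL-style pattern used elsewhere in ZFS): add each source map with a refcount of 1, then generate the region covered by at least minref of them.

	static void
	intersect_two_maps(range_tree_t *rt_a, range_tree_t *rt_b,
	    range_tree_t *rt_result)
	{
		avl_tree_t t;

		space_reftree_create(&t);
		space_reftree_add_map(&t, rt_a, 1);
		space_reftree_add_map(&t, rt_b, 1);
		/* minref = 2: keep only ranges present in both inputs. */
		space_reftree_generate_map(&t, rt_result, 2);
		space_reftree_destroy(&t);
	}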
+
+#ifdef __cplusplus
+}
+#endif
+
+#endif /* _SYS_SPACE_REFTREE_H */
diff --git a/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/trim_map.h b/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/trim_map.h
new file mode 100644
index 000000000000..f228d0766631
--- /dev/null
+++ b/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/trim_map.h
@@ -0,0 +1,51 @@
+/*
+ * CDDL HEADER START
+ *
+ * The contents of this file are subject to the terms of the
+ * Common Development and Distribution License (the "License").
+ * You may not use this file except in compliance with the License.
+ *
+ * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
+ * or http://www.opensolaris.org/os/licensing.
+ * See the License for the specific language governing permissions
+ * and limitations under the License.
+ *
+ * When distributing Covered Code, include this CDDL HEADER in each
+ * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
+ * If applicable, add the following below this CDDL HEADER, with the
+ * fields enclosed by brackets "[]" replaced with your own identifying
+ * information: Portions Copyright [yyyy] [name of copyright owner]
+ *
+ * CDDL HEADER END
+ */
+/*
+ * Copyright (c) 2012 Pawel Jakub Dawidek <pawel@dawidek.net>.
+ * All rights reserved.
+ */
+
+#ifndef _SYS_TRIM_MAP_H
+#define _SYS_TRIM_MAP_H
+
+#include <sys/avl.h>
+#include <sys/list.h>
+#include <sys/spa.h>
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+extern void trim_map_create(vdev_t *vd);
+extern void trim_map_destroy(vdev_t *vd);
+extern void trim_map_free(vdev_t *vd, uint64_t offset, uint64_t size, uint64_t txg);
+extern boolean_t trim_map_write_start(zio_t *zio);
+extern void trim_map_write_done(zio_t *zio);
+
+extern void trim_thread_create(spa_t *spa);
+extern void trim_thread_destroy(spa_t *spa);
+extern void trim_thread_wakeup(spa_t *spa);
+
+#ifdef __cplusplus
+}
+#endif
+
+#endif /* _SYS_TRIM_MAP_H */
diff --git a/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/txg.h b/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/txg.h
new file mode 100644
index 000000000000..d5c22e41478e
--- /dev/null
+++ b/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/txg.h
@@ -0,0 +1,136 @@
+/*
+ * CDDL HEADER START
+ *
+ * The contents of this file are subject to the terms of the
+ * Common Development and Distribution License (the "License").
+ * You may not use this file except in compliance with the License.
+ *
+ * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
+ * or http://www.opensolaris.org/os/licensing.
+ * See the License for the specific language governing permissions
+ * and limitations under the License.
+ *
+ * When distributing Covered Code, include this CDDL HEADER in each
+ * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
+ * If applicable, add the following below this CDDL HEADER, with the
+ * fields enclosed by brackets "[]" replaced with your own identifying
+ * information: Portions Copyright [yyyy] [name of copyright owner]
+ *
+ * CDDL HEADER END
+ */
+/*
+ * Copyright 2010 Sun Microsystems, Inc. All rights reserved.
+ * Use is subject to license terms.
+ */
+/*
+ * Copyright (c) 2012, 2017 by Delphix. All rights reserved.
+ */
+
+#ifndef _SYS_TXG_H
+#define _SYS_TXG_H
+
+#include <sys/spa.h>
+#include <sys/zfs_context.h>
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+#define TXG_CONCURRENT_STATES 3 /* open, quiescing, syncing */
+#define TXG_SIZE 4 /* next power of 2 */
+#define TXG_MASK (TXG_SIZE - 1) /* mask for size */
+#define TXG_INITIAL TXG_SIZE /* initial txg */
+#define TXG_IDX (txg & TXG_MASK)
+
+/* Number of txgs worth of frees we defer adding to in-core spacemaps */
+#define TXG_DEFER_SIZE 2
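
TXG_SIZE and TXG_MASK support the per-txg ring pattern used throughout this code: state for a txg lives in slot txg & TXG_MASK, which is safe because no more than TXG_CONCURRENT_STATES txgs are ever in flight at once. An illustrative sketch:

	uint64_t dirty_bytes[TXG_SIZE];		/* one slot per in-flight txg */

	static void
	account_dirty(uint64_t txg, uint64_t nbytes)
	{
		dirty_bytes[txg & TXG_MASK] += nbytes;	/* slot reused every TXG_SIZE txgs */
	}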
+
+typedef struct tx_cpu tx_cpu_t;
+
+typedef struct txg_handle {
+ tx_cpu_t *th_cpu;
+ uint64_t th_txg;
+} txg_handle_t;
+
+typedef struct txg_node {
+ struct txg_node *tn_next[TXG_SIZE];
+ uint8_t tn_member[TXG_SIZE];
+} txg_node_t;
+
+typedef struct txg_list {
+ kmutex_t tl_lock;
+ size_t tl_offset;
+ spa_t *tl_spa;
+ txg_node_t *tl_head[TXG_SIZE];
+} txg_list_t;
+
+struct dsl_pool;
+
+extern void txg_init(struct dsl_pool *dp, uint64_t txg);
+extern void txg_fini(struct dsl_pool *dp);
+extern void txg_sync_start(struct dsl_pool *dp);
+extern void txg_sync_stop(struct dsl_pool *dp);
+extern uint64_t txg_hold_open(struct dsl_pool *dp, txg_handle_t *txghp);
+extern void txg_rele_to_quiesce(txg_handle_t *txghp);
+extern void txg_rele_to_sync(txg_handle_t *txghp);
+extern void txg_register_callbacks(txg_handle_t *txghp, list_t *tx_callbacks);
+
+extern void txg_delay(struct dsl_pool *dp, uint64_t txg, hrtime_t delta,
+ hrtime_t resolution);
+extern void txg_kick(struct dsl_pool *dp);
+
+/*
+ * Wait until the given transaction group has finished syncing.
+ * Try to make this happen as soon as possible (eg. kick off any
+ * necessary syncs immediately). If txg==0, wait for the currently open
+ * txg to finish syncing.
+ */
+extern void txg_wait_synced(struct dsl_pool *dp, uint64_t txg);
+
+/*
+ * Wait as above. Returns true if the thread was signaled while waiting.
+ */
+extern boolean_t txg_wait_synced_sig(struct dsl_pool *dp, uint64_t txg);
+
+/*
+ * Wait until the given transaction group, or one after it, is
+ * the open transaction group. Try to make this happen as soon
+ * as possible (e.g. kick off any necessary syncs immediately).
+ * If txg == 0, wait for the next open txg.
+ */
+extern void txg_wait_open(struct dsl_pool *dp, uint64_t txg);
+
+/*
+ * Returns TRUE if we are "backed up" waiting for the syncing
+ * transaction to complete; otherwise returns FALSE.
+ */
+extern boolean_t txg_stalled(struct dsl_pool *dp);
+
+/* returns TRUE if someone is waiting for the next txg to sync */
+extern boolean_t txg_sync_waiting(struct dsl_pool *dp);
+
+extern void txg_verify(spa_t *spa, uint64_t txg);
+
+/*
+ * Per-txg object lists.
+ */
+
+#define TXG_CLEAN(txg) ((txg) - 1)
+
+extern void txg_list_create(txg_list_t *tl, spa_t *spa, size_t offset);
+extern void txg_list_destroy(txg_list_t *tl);
+extern boolean_t txg_list_empty(txg_list_t *tl, uint64_t txg);
+extern boolean_t txg_all_lists_empty(txg_list_t *tl);
+extern boolean_t txg_list_add(txg_list_t *tl, void *p, uint64_t txg);
+extern boolean_t txg_list_add_tail(txg_list_t *tl, void *p, uint64_t txg);
+extern void *txg_list_remove(txg_list_t *tl, uint64_t txg);
+extern void *txg_list_remove_this(txg_list_t *tl, void *p, uint64_t txg);
+extern boolean_t txg_list_member(txg_list_t *tl, void *p, uint64_t txg);
+extern void *txg_list_head(txg_list_t *tl, uint64_t txg);
+extern void *txg_list_next(txg_list_t *tl, void *p, uint64_t txg);
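
For orientation, a hypothetical caller combining the primitives above: hold the open txg, dirty in-core state for it, release the hold toward sync, then wait for that txg to reach stable storage.

	static void
	modify_and_wait(struct dsl_pool *dp)
	{
		txg_handle_t th;
		uint64_t txg = txg_hold_open(dp, &th);

		/* ... record changes that must commit in 'txg' ... */

		txg_rele_to_sync(&th);
		txg_wait_synced(dp, txg);	/* block until 'txg' is on disk */
	}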
+
+#ifdef __cplusplus
+}
+#endif
+
+#endif /* _SYS_TXG_H */
diff --git a/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/txg_impl.h b/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/txg_impl.h
new file mode 100644
index 000000000000..bf3b269d707d
--- /dev/null
+++ b/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/txg_impl.h
@@ -0,0 +1,125 @@
+/*
+ * CDDL HEADER START
+ *
+ * The contents of this file are subject to the terms of the
+ * Common Development and Distribution License (the "License").
+ * You may not use this file except in compliance with the License.
+ *
+ * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
+ * or http://www.opensolaris.org/os/licensing.
+ * See the License for the specific language governing permissions
+ * and limitations under the License.
+ *
+ * When distributing Covered Code, include this CDDL HEADER in each
+ * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
+ * If applicable, add the following below this CDDL HEADER, with the
+ * fields enclosed by brackets "[]" replaced with your own identifying
+ * information: Portions Copyright [yyyy] [name of copyright owner]
+ *
+ * CDDL HEADER END
+ */
+
+/*
+ * Copyright 2009 Sun Microsystems, Inc. All rights reserved.
+ * Use is subject to license terms.
+ */
+
+/*
+ * Copyright (c) 2013, 2017 by Delphix. All rights reserved.
+ */
+
+#ifndef _SYS_TXG_IMPL_H
+#define _SYS_TXG_IMPL_H
+
+#include <sys/spa.h>
+#include <sys/txg.h>
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+/*
+ * The tx_cpu structure is a per-cpu structure that is used to track
+ * the number of active transaction holds (tc_count). As transactions
+ * are assigned into a transaction group the appropriate tc_count is
+ * incremented to indicate that there are pending changes that have yet
+ * to quiesce. Consumers eventually call txg_rele_to_sync() to decrement
+ * the tc_count. A transaction group is not considered quiesced until all
+ * tx_cpu structures have reached a tc_count of zero.
+ *
+ * This structure is a per-cpu structure by design. Updates to this structure
+ * are frequent and concurrent. Having a single structure would result in
+ * heavy lock contention so a per-cpu design was implemented. With the fanned
+ * out mutex design, consumers only need to lock the mutex associated
+ * with the thread's cpu.
+ *
+ * The tx_cpu contains two locks, the tc_lock and tc_open_lock.
+ * The tc_lock is used to protect all members of the tx_cpu structure with
+ * the exception of the tc_open_lock. This lock should only be held for a
+ * short period of time, typically when updating the value of tc_count.
+ *
+ * The tc_open_lock protects the tx_open_txg member of the tx_state structure.
+ * This lock is used to ensure that transactions are only assigned into
+ * the current open transaction group. In order to move the current open
+ * transaction group to the quiesce phase, the txg_quiesce thread must
+ * grab all tc_open_locks, increment the tx_open_txg, and drop the locks.
+ * The tc_open_lock is held until the transaction is assigned into the
+ * transaction group. Typically, this is a short operation but if throttling
+ * is occurring it may be held for longer periods of time.
+ */
+struct tx_cpu {
+ kmutex_t tc_open_lock; /* protects tx_open_txg */
+ kmutex_t tc_lock; /* protects the rest of this struct */
+ kcondvar_t tc_cv[TXG_SIZE];
+ uint64_t tc_count[TXG_SIZE]; /* tx hold count on each txg */
+ list_t tc_callbacks[TXG_SIZE]; /* commit cb list */
+ char tc_pad[8]; /* pad to fill 3 cache lines */
+};
+
+/*
+ * The tx_state structure maintains the state information about the different
+ * stages of the pool's transaction groups. A per-pool tx_state structure
+ * is used to track this information. The tx_state structure also points to
+ * an array of tx_cpu structures (described above). Although the tx_sync_lock
+ * is used to protect the members of this structure, it is not used to
+ * protect the tx_open_txg. Instead a special lock in the tx_cpu structure
+ * is used. Readers of tx_open_txg must grab the per-cpu tc_open_lock.
+ * Any thread wishing to update tx_open_txg must grab the tc_open_lock on
+ * every cpu (see txg_quiesce()).
+ */
+typedef struct tx_state {
+ tx_cpu_t *tx_cpu; /* protects access to tx_open_txg */
+ kmutex_t tx_sync_lock; /* protects the rest of this struct */
+
+ uint64_t tx_open_txg; /* currently open txg id */
+ uint64_t tx_quiescing_txg; /* currently quiescing txg id */
+ uint64_t tx_quiesced_txg; /* quiesced txg waiting for sync */
+ uint64_t tx_syncing_txg; /* currently syncing txg id */
+ uint64_t tx_synced_txg; /* last synced txg id */
+
+ hrtime_t tx_open_time; /* start time of tx_open_txg */
+
+ uint64_t tx_sync_txg_waiting; /* txg we're waiting to sync */
+ uint64_t tx_quiesce_txg_waiting; /* txg we're waiting to open */
+
+ kcondvar_t tx_sync_more_cv;
+ kcondvar_t tx_sync_done_cv;
+ kcondvar_t tx_quiesce_more_cv;
+ kcondvar_t tx_quiesce_done_cv;
+ kcondvar_t tx_timeout_cv;
+ kcondvar_t tx_exit_cv; /* wait for all threads to exit */
+
+ uint8_t tx_threads; /* number of threads */
+ uint8_t tx_exiting; /* set when we're exiting */
+
+ kthread_t *tx_sync_thread;
+ kthread_t *tx_quiesce_thread;
+
+ taskq_t *tx_commit_cb_taskq; /* commit callback taskq */
+} tx_state_t;
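
A simplified sketch of the quiesce step described above (condensed; the real txg_quiesce() also signals condition variables and waits for holds to drain): every per-cpu tc_open_lock is held while tx_open_txg advances, so no new hold can slip into the txg being quiesced.

	static void
	advance_open_txg(tx_state_t *tx, int ncpus)
	{
		for (int c = 0; c < ncpus; c++)
			mutex_enter(&tx->tx_cpu[c].tc_open_lock);

		tx->tx_open_txg++;	/* new holds now join the next txg */

		for (int c = 0; c < ncpus; c++)
			mutex_exit(&tx->tx_cpu[c].tc_open_lock);
	}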
+
+#ifdef __cplusplus
+}
+#endif
+
+#endif /* _SYS_TXG_IMPL_H */
diff --git a/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/uberblock.h b/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/uberblock.h
new file mode 100644
index 000000000000..044e438387c0
--- /dev/null
+++ b/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/uberblock.h
@@ -0,0 +1,50 @@
+/*
+ * CDDL HEADER START
+ *
+ * The contents of this file are subject to the terms of the
+ * Common Development and Distribution License (the "License").
+ * You may not use this file except in compliance with the License.
+ *
+ * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
+ * or http://www.opensolaris.org/os/licensing.
+ * See the License for the specific language governing permissions
+ * and limitations under the License.
+ *
+ * When distributing Covered Code, include this CDDL HEADER in each
+ * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
+ * If applicable, add the following below this CDDL HEADER, with the
+ * fields enclosed by brackets "[]" replaced with your own identifying
+ * information: Portions Copyright [yyyy] [name of copyright owner]
+ *
+ * CDDL HEADER END
+ */
+/*
+ * Copyright 2009 Sun Microsystems, Inc. All rights reserved.
+ * Use is subject to license terms.
+ */
+/*
+ * Copyright (c) 2014 by Delphix. All rights reserved.
+ */
+
+#ifndef _SYS_UBERBLOCK_H
+#define _SYS_UBERBLOCK_H
+
+#include <sys/spa.h>
+#include <sys/vdev.h>
+#include <sys/zio.h>
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+typedef struct uberblock uberblock_t;
+
+extern int uberblock_verify(uberblock_t *);
+extern boolean_t uberblock_update(uberblock_t *ub, vdev_t *rvd, uint64_t txg,
+ uint64_t mmp_delay);
+
+#ifdef __cplusplus
+}
+#endif
+
+#endif /* _SYS_UBERBLOCK_H */
diff --git a/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/uberblock_impl.h b/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/uberblock_impl.h
new file mode 100644
index 000000000000..caf43957dfe4
--- /dev/null
+++ b/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/uberblock_impl.h
@@ -0,0 +1,145 @@
+/*
+ * CDDL HEADER START
+ *
+ * The contents of this file are subject to the terms of the
+ * Common Development and Distribution License (the "License").
+ * You may not use this file except in compliance with the License.
+ *
+ * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
+ * or http://www.opensolaris.org/os/licensing.
+ * See the License for the specific language governing permissions
+ * and limitations under the License.
+ *
+ * When distributing Covered Code, include this CDDL HEADER in each
+ * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
+ * If applicable, add the following below this CDDL HEADER, with the
+ * fields enclosed by brackets "[]" replaced with your own identifying
+ * information: Portions Copyright [yyyy] [name of copyright owner]
+ *
+ * CDDL HEADER END
+ */
+/*
+ * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved.
+ * Copyright (c) 2016, 2017 by Delphix. All rights reserved.
+ */
+
+#ifndef _SYS_UBERBLOCK_IMPL_H
+#define _SYS_UBERBLOCK_IMPL_H
+
+#include <sys/uberblock.h>
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+/*
+ * The uberblock version is incremented whenever an incompatible on-disk
+ * format change is made to the SPA, DMU, or ZAP.
+ *
+ * Note: the first two fields should never be moved. When a storage pool
+ * is opened, the uberblock must be read off the disk before the version
+ * can be checked. If the ub_version field is moved, we may not detect
+ * version mismatch. If the ub_magic field is moved, applications that
+ * expect the magic number in the first word won't work.
+ */
+#define UBERBLOCK_MAGIC 0x00bab10c /* oo-ba-bloc! */
+#define UBERBLOCK_SHIFT 10 /* up to 1K */
+#define MMP_MAGIC 0xa11cea11 /* all-see-all */
+
+#define MMP_INTERVAL_VALID_BIT 0x01
+#define MMP_SEQ_VALID_BIT 0x02
+#define MMP_FAIL_INT_VALID_BIT 0x04
+
+#define MMP_VALID(ubp) (ubp->ub_magic == UBERBLOCK_MAGIC && \
+ ubp->ub_mmp_magic == MMP_MAGIC)
+#define MMP_INTERVAL_VALID(ubp) (MMP_VALID(ubp) && (ubp->ub_mmp_config & \
+ MMP_INTERVAL_VALID_BIT))
+#define MMP_SEQ_VALID(ubp) (MMP_VALID(ubp) && (ubp->ub_mmp_config & \
+ MMP_SEQ_VALID_BIT))
+#define MMP_FAIL_INT_VALID(ubp) (MMP_VALID(ubp) && (ubp->ub_mmp_config & \
+ MMP_FAIL_INT_VALID_BIT))
+
+#define MMP_INTERVAL(ubp) ((ubp->ub_mmp_config & 0x00000000FFFFFF00) \
+ >> 8)
+#define MMP_SEQ(ubp) ((ubp->ub_mmp_config & 0x0000FFFF00000000) \
+ >> 32)
+#define MMP_FAIL_INT(ubp) ((ubp->ub_mmp_config & 0xFFFF000000000000) \
+ >> 48)
+
+#define MMP_INTERVAL_SET(write) \
+ (((uint64_t)(write & 0xFFFFFF) << 8) | MMP_INTERVAL_VALID_BIT)
+
+#define MMP_SEQ_SET(seq) \
+ (((uint64_t)(seq & 0xFFFF) << 32) | MMP_SEQ_VALID_BIT)
+
+#define MMP_FAIL_INT_SET(fail) \
+ (((uint64_t)(fail & 0xFFFF) << 48) | MMP_FAIL_INT_VALID_BIT)
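
A round-trip sketch of the packing macros (hypothetical values; it relies on the struct uberblock definition below): the *_SET forms build field-plus-valid-bit words that are OR-ed into ub_mmp_config, and the decode forms take an uberblock pointer.

	static void
	mmp_config_roundtrip(void)
	{
		struct uberblock ub = { 0 };

		ub.ub_magic = UBERBLOCK_MAGIC;
		ub.ub_mmp_magic = MMP_MAGIC;
		ub.ub_mmp_config = MMP_INTERVAL_SET(1000) | MMP_SEQ_SET(7);

		ASSERT(MMP_INTERVAL_VALID(&ub));
		ASSERT3U(MMP_INTERVAL(&ub), ==, 1000);	/* write interval, in ms */
		ASSERT3U(MMP_SEQ(&ub), ==, 7);
		ASSERT(!MMP_FAIL_INT_VALID(&ub));	/* fail-interval bit unset */
	}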
+
+struct uberblock {
+ uint64_t ub_magic; /* UBERBLOCK_MAGIC */
+ uint64_t ub_version; /* SPA_VERSION */
+ uint64_t ub_txg; /* txg of last sync */
+ uint64_t ub_guid_sum; /* sum of all vdev guids */
+ uint64_t ub_timestamp; /* UTC time of last sync */
+ blkptr_t ub_rootbp; /* MOS objset_phys_t */
+
+ /* highest SPA_VERSION supported by software that wrote this txg */
+ uint64_t ub_software_version;
+
+ /* May be missing in uberblocks we read, but always written */
+ uint64_t ub_mmp_magic;
+ /*
+ * If ub_mmp_delay == 0 and ub_mmp_magic is valid, MMP is off.
+ * Otherwise, nanosec since last MMP write.
+ */
+ uint64_t ub_mmp_delay;
+
+ /*
+ * The ub_mmp_config contains the multihost write interval, multihost
+ * fail intervals, sequence number for sub-second granularity, and
+ * valid bit mask. This layout is as follows:
+ *
+ * 64 56 48 40 32 24 16 8 0
+ * +-------+-------+-------+-------+-------+-------+-------+-------+
+ * 0 | Fail Intervals| Seq | Write Interval (ms) | VALID |
+ * +-------+-------+-------+-------+-------+-------+-------+-------+
+ *
+ * This allows a write_interval of (2^24/1000)s, over 4.5 hours
+ *
+ * VALID Bits:
+ * - 0x01 - Write Interval (ms)
+ * - 0x02 - Sequence number exists
+ * - 0x04 - Fail Intervals
+ * - 0xf8 - Reserved
+ */
+ uint64_t ub_mmp_config;
+
+ /*
+ * ub_checkpoint_txg indicates two things about the current uberblock:
+ *
+ * 1] If it is not zero then this uberblock is a checkpoint. If it is
+ * zero, then this uberblock is not a checkpoint.
+ *
+ * 2] On checkpointed uberblocks, the value of ub_checkpoint_txg is
+ * the ub_txg that the uberblock had at the time we moved it to
+ * the MOS config.
+ *
+ * The field is set when we checkpoint the uberblock and continues to
+ * hold that value even after we've rewound (unlike the ub_txg that
+ * is reset to a higher value).
+ *
+ * Besides checks used to determine whether we are reopening the
+ * pool from a checkpointed uberblock [see spa_ld_select_uberblock()],
+ * the value of the field is used to determine which ZIL blocks have
+ * been allocated according to the ms_sm when we are rewinding to a
+ * checkpoint. Specifically, if blk_birth > ub_checkpoint_txg, then
+ * the ZIL block is not allocated [see uses of spa_min_claim_txg()].
+ */
+ uint64_t ub_checkpoint_txg;
+};
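
The ub_checkpoint_txg rules above reduce to two small predicates; an illustrative rendering (hypothetical helpers, not part of this header):

	static boolean_t
	uberblock_is_checkpoint(const uberblock_t *ub)
	{
		return (ub->ub_checkpoint_txg != 0);
	}

	static boolean_t
	zil_block_allocated_at_checkpoint(const uberblock_t *ub,
	    uint64_t blk_birth)
	{
		/*
		 * Blocks born after the checkpointed txg were never
		 * allocated in the checkpointed state (see the uses of
		 * spa_min_claim_txg() mentioned above).
		 */
		return (blk_birth <= ub->ub_checkpoint_txg);
	}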
+
+#ifdef __cplusplus
+}
+#endif
+
+#endif /* _SYS_UBERBLOCK_IMPL_H */
diff --git a/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/unique.h b/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/unique.h
new file mode 100644
index 000000000000..d4ba32e5c642
--- /dev/null
+++ b/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/unique.h
@@ -0,0 +1,57 @@
+/*
+ * CDDL HEADER START
+ *
+ * The contents of this file are subject to the terms of the
+ * Common Development and Distribution License (the "License").
+ * You may not use this file except in compliance with the License.
+ *
+ * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
+ * or http://www.opensolaris.org/os/licensing.
+ * See the License for the specific language governing permissions
+ * and limitations under the License.
+ *
+ * When distributing Covered Code, include this CDDL HEADER in each
+ * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
+ * If applicable, add the following below this CDDL HEADER, with the
+ * fields enclosed by brackets "[]" replaced with your own identifying
+ * information: Portions Copyright [yyyy] [name of copyright owner]
+ *
+ * CDDL HEADER END
+ */
+/*
+ * Copyright 2007 Sun Microsystems, Inc. All rights reserved.
+ * Use is subject to license terms.
+ */
+
+#ifndef _SYS_UNIQUE_H
+#define _SYS_UNIQUE_H
+
+#include <sys/zfs_context.h>
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+/* The number of significant bits in each unique value. */
+#define UNIQUE_BITS 56
+
+void unique_init(void);
+void unique_fini(void);
+
+/*
+ * Return a new unique value (which will not be uniquified against until
+ * it is unique_insert()-ed).
+ */
+uint64_t unique_create(void);
+
+/* Return a unique value, which equals the one passed in if possible. */
+uint64_t unique_insert(uint64_t value);
+
+/* Indicate that this value no longer needs to be uniquified against. */
+void unique_remove(uint64_t value);
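
A hypothetical caller of this interface: create a provisional value, reserve it (the reserved value may differ if the provisional one collided), and release it when no longer needed.

	uint64_t id = unique_create();	/* provisional; not yet reserved */

	id = unique_insert(id);		/* reserved; may change on collision */
	ASSERT0(id >> UNIQUE_BITS);	/* only 56 significant bits */

	/* ... use id, e.g. as a dataset's unique value ... */

	unique_remove(id);		/* release for reuse */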
+
+#ifdef __cplusplus
+}
+#endif
+
+#endif /* _SYS_UNIQUE_H */
diff --git a/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/vdev.h b/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/vdev.h
new file mode 100644
index 000000000000..0bb266873c6c
--- /dev/null
+++ b/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/vdev.h
@@ -0,0 +1,196 @@
+/*
+ * CDDL HEADER START
+ *
+ * The contents of this file are subject to the terms of the
+ * Common Development and Distribution License (the "License").
+ * You may not use this file except in compliance with the License.
+ *
+ * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
+ * or http://www.opensolaris.org/os/licensing.
+ * See the License for the specific language governing permissions
+ * and limitations under the License.
+ *
+ * When distributing Covered Code, include this CDDL HEADER in each
+ * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
+ * If applicable, add the following below this CDDL HEADER, with the
+ * fields enclosed by brackets "[]" replaced with your own identifying
+ * information: Portions Copyright [yyyy] [name of copyright owner]
+ *
+ * CDDL HEADER END
+ */
+
+/*
+ * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved.
+ * Copyright (c) 2011, 2020 by Delphix. All rights reserved.
+ * Copyright (c) 2017, Intel Corporation.
+ */
+
+#ifndef _SYS_VDEV_H
+#define _SYS_VDEV_H
+
+#include <sys/spa.h>
+#include <sys/zio.h>
+#include <sys/dmu.h>
+#include <sys/space_map.h>
+#include <sys/fs/zfs.h>
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+typedef enum vdev_dtl_type {
+ DTL_MISSING, /* 0% replication: no copies of the data */
+ DTL_PARTIAL, /* less than 100% replication: some copies missing */
+ DTL_SCRUB, /* unable to fully repair during scrub/resilver */
+ DTL_OUTAGE, /* temporarily missing (used to attempt detach) */
+ DTL_TYPES
+} vdev_dtl_type_t;
+
+extern boolean_t zfs_nocacheflush;
+extern boolean_t zfs_trim_enabled;
+
+extern void vdev_dbgmsg(vdev_t *vd, const char *fmt, ...);
+extern void vdev_dbgmsg_print_tree(vdev_t *, int);
+extern int vdev_open(vdev_t *);
+extern void vdev_open_children(vdev_t *);
+extern boolean_t vdev_uses_zvols(vdev_t *);
+extern int vdev_validate(vdev_t *);
+extern int vdev_copy_path_strict(vdev_t *, vdev_t *);
+extern void vdev_copy_path_relaxed(vdev_t *, vdev_t *);
+extern void vdev_close(vdev_t *);
+extern int vdev_create(vdev_t *, uint64_t txg, boolean_t isreplace);
+extern void vdev_reopen(vdev_t *);
+extern int vdev_validate_aux(vdev_t *vd);
+extern zio_t *vdev_probe(vdev_t *vd, zio_t *pio);
+extern boolean_t vdev_is_concrete(vdev_t *vd);
+extern boolean_t vdev_is_bootable(vdev_t *vd);
+extern vdev_t *vdev_lookup_top(spa_t *spa, uint64_t vdev);
+extern vdev_t *vdev_lookup_by_guid(vdev_t *vd, uint64_t guid);
+extern int vdev_count_leaves(spa_t *spa);
+extern void vdev_dtl_dirty(vdev_t *vd, vdev_dtl_type_t d,
+ uint64_t txg, uint64_t size);
+extern boolean_t vdev_dtl_contains(vdev_t *vd, vdev_dtl_type_t d,
+ uint64_t txg, uint64_t size);
+extern boolean_t vdev_dtl_empty(vdev_t *vd, vdev_dtl_type_t d);
+extern boolean_t vdev_dtl_need_resilver(vdev_t *vd, uint64_t off, size_t size);
+extern void vdev_dtl_reassess(vdev_t *vd, uint64_t txg, uint64_t scrub_txg,
+ int scrub_done);
+extern boolean_t vdev_dtl_required(vdev_t *vd);
+extern boolean_t vdev_resilver_needed(vdev_t *vd,
+ uint64_t *minp, uint64_t *maxp);
+extern void vdev_destroy_unlink_zap(vdev_t *vd, uint64_t zapobj,
+ dmu_tx_t *tx);
+extern uint64_t vdev_create_link_zap(vdev_t *vd, dmu_tx_t *tx);
+extern void vdev_construct_zaps(vdev_t *vd, dmu_tx_t *tx);
+extern void vdev_destroy_spacemaps(vdev_t *vd, dmu_tx_t *tx);
+extern void vdev_indirect_mark_obsolete(vdev_t *vd, uint64_t offset,
+ uint64_t size);
+extern void spa_vdev_indirect_mark_obsolete(spa_t *spa, uint64_t vdev,
+ uint64_t offset, uint64_t size, dmu_tx_t *tx);
+
+extern void vdev_hold(vdev_t *);
+extern void vdev_rele(vdev_t *);
+
+extern int vdev_metaslab_init(vdev_t *vd, uint64_t txg);
+extern void vdev_metaslab_fini(vdev_t *vd);
+extern void vdev_metaslab_set_size(vdev_t *);
+extern void vdev_ashift_optimize(vdev_t *);
+extern void vdev_expand(vdev_t *vd, uint64_t txg);
+extern void vdev_split(vdev_t *vd);
+extern void vdev_deadman(vdev_t *vd);
+
+extern void vdev_get_stats(vdev_t *vd, vdev_stat_t *vs);
+extern void vdev_clear_stats(vdev_t *vd);
+extern void vdev_stat_update(zio_t *zio, uint64_t psize);
+extern void vdev_scan_stat_init(vdev_t *vd);
+extern void vdev_propagate_state(vdev_t *vd);
+extern void vdev_set_state(vdev_t *vd, boolean_t isopen, vdev_state_t state,
+ vdev_aux_t aux);
+extern boolean_t vdev_children_are_offline(vdev_t *vd);
+
+extern void vdev_space_update(vdev_t *vd,
+ int64_t alloc_delta, int64_t defer_delta, int64_t space_delta);
+
+extern int64_t vdev_deflated_space(vdev_t *vd, int64_t space);
+
+extern uint64_t vdev_psize_to_asize(vdev_t *vd, uint64_t psize);
+
+extern int vdev_fault(spa_t *spa, uint64_t guid, vdev_aux_t aux);
+extern int vdev_degrade(spa_t *spa, uint64_t guid, vdev_aux_t aux);
+extern int vdev_online(spa_t *spa, uint64_t guid, uint64_t flags,
+ vdev_state_t *);
+extern int vdev_offline(spa_t *spa, uint64_t guid, uint64_t flags);
+extern void vdev_clear(spa_t *spa, vdev_t *vd);
+
+extern boolean_t vdev_is_dead(vdev_t *vd);
+extern boolean_t vdev_readable(vdev_t *vd);
+extern boolean_t vdev_writeable(vdev_t *vd);
+extern boolean_t vdev_allocatable(vdev_t *vd);
+extern boolean_t vdev_accessible(vdev_t *vd, zio_t *zio);
+extern boolean_t vdev_is_spacemap_addressable(vdev_t *vd);
+
+extern void vdev_cache_init(vdev_t *vd);
+extern void vdev_cache_fini(vdev_t *vd);
+extern boolean_t vdev_cache_read(zio_t *zio);
+extern void vdev_cache_write(zio_t *zio);
+extern void vdev_cache_purge(vdev_t *vd);
+
+extern void vdev_queue_init(vdev_t *vd);
+extern void vdev_queue_fini(vdev_t *vd);
+extern zio_t *vdev_queue_io(zio_t *zio);
+extern void vdev_queue_io_done(zio_t *zio);
+extern void vdev_queue_change_io_priority(zio_t *zio, zio_priority_t priority);
+extern int vdev_queue_length(vdev_t *vd);
+extern uint64_t vdev_queue_lastoffset(vdev_t *vd);
+extern void vdev_queue_register_lastoffset(vdev_t *vd, zio_t *zio);
+
+extern void vdev_config_dirty(vdev_t *vd);
+extern void vdev_config_clean(vdev_t *vd);
+extern int vdev_config_sync(vdev_t **svd, int svdcount, uint64_t txg);
+
+extern void vdev_state_dirty(vdev_t *vd);
+extern void vdev_state_clean(vdev_t *vd);
+
+typedef enum vdev_config_flag {
+ VDEV_CONFIG_SPARE = 1 << 0,
+ VDEV_CONFIG_L2CACHE = 1 << 1,
+ VDEV_CONFIG_REMOVING = 1 << 2,
+ VDEV_CONFIG_MOS = 1 << 3,
+ VDEV_CONFIG_MISSING = 1 << 4
+} vdev_config_flag_t;
+
+extern void vdev_top_config_generate(spa_t *spa, nvlist_t *config);
+extern nvlist_t *vdev_config_generate(spa_t *spa, vdev_t *vd,
+ boolean_t getstats, vdev_config_flag_t flags);
+
+/*
+ * Label routines
+ */
+struct uberblock;
+extern uint64_t vdev_label_offset(uint64_t psize, int l, uint64_t offset);
+extern int vdev_label_number(uint64_t psize, uint64_t offset);
+extern nvlist_t *vdev_label_read_config(vdev_t *vd, uint64_t txg);
+extern void vdev_uberblock_load(vdev_t *, struct uberblock *, nvlist_t **);
+extern void vdev_label_write(zio_t *zio, vdev_t *vd, int l, abd_t *buf,
+    uint64_t offset, uint64_t size, zio_done_func_t *done, void *priv,
+    int flags);
+extern int vdev_label_read_bootenv(vdev_t *, nvlist_t *);
+extern int vdev_label_write_bootenv(vdev_t *, char *);
+
+typedef enum {
+ VDEV_LABEL_CREATE, /* create/add a new device */
+ VDEV_LABEL_REPLACE, /* replace an existing device */
+ VDEV_LABEL_SPARE, /* add a new hot spare */
+ VDEV_LABEL_REMOVE, /* remove an existing device */
+ VDEV_LABEL_L2CACHE, /* add an L2ARC cache device */
+ VDEV_LABEL_SPLIT /* generating new label for split-off dev */
+} vdev_labeltype_t;
+
+extern int vdev_label_init(vdev_t *vd, uint64_t txg, vdev_labeltype_t reason);
+
+extern int vdev_label_write_pad2(vdev_t *vd, const char *buf, size_t size);
+
+#ifdef __cplusplus
+}
+#endif
+
+#endif /* _SYS_VDEV_H */
diff --git a/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/vdev_disk.h b/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/vdev_disk.h
new file mode 100644
index 000000000000..61e2f273f0a0
--- /dev/null
+++ b/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/vdev_disk.h
@@ -0,0 +1,67 @@
+/*
+ * CDDL HEADER START
+ *
+ * The contents of this file are subject to the terms of the
+ * Common Development and Distribution License (the "License").
+ * You may not use this file except in compliance with the License.
+ *
+ * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
+ * or http://www.opensolaris.org/os/licensing.
+ * See the License for the specific language governing permissions
+ * and limitations under the License.
+ *
+ * When distributing Covered Code, include this CDDL HEADER in each
+ * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
+ * If applicable, add the following below this CDDL HEADER, with the
+ * fields enclosed by brackets "[]" replaced with your own identifying
+ * information: Portions Copyright [yyyy] [name of copyright owner]
+ *
+ * CDDL HEADER END
+ */
+/*
+ * Copyright 2008 Sun Microsystems, Inc. All rights reserved.
+ * Use is subject to license terms.
+ * Copyright (c) 2013 Joyent, Inc. All rights reserved.
+ * Copyright 2012 Nexenta Systems, Inc. All rights reserved.
+ */
+
+#ifndef _SYS_VDEV_DISK_H
+#define _SYS_VDEV_DISK_H
+
+#include <sys/vdev.h>
+#ifdef _KERNEL
+#include <sys/buf.h>
+#include <sys/ddi.h>
+#include <sys/sunldi.h>
+#include <sys/sunddi.h>
+#endif
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+#ifdef _KERNEL
+typedef struct vdev_disk {
+ ddi_devid_t vd_devid;
+ char *vd_minor;
+ ldi_handle_t vd_lh;
+ list_t vd_ldi_cbs;
+ boolean_t vd_ldi_offline;
+} vdev_disk_t;
+#endif
+
+extern int vdev_disk_physio(vdev_t *,
+ caddr_t, size_t, uint64_t, int, boolean_t);
+
+/*
+ * Since vdev_disk.c is not compiled into libzpool, this function should only be
+ * defined in the zfs kernel module.
+ */
+#ifdef _KERNEL
+extern int vdev_disk_ldi_physio(ldi_handle_t, caddr_t, size_t, uint64_t, int);
+#endif
+#ifdef __cplusplus
+}
+#endif
+
+#endif /* _SYS_VDEV_DISK_H */
diff --git a/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/vdev_file.h b/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/vdev_file.h
new file mode 100644
index 000000000000..0260b4ab4f79
--- /dev/null
+++ b/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/vdev_file.h
@@ -0,0 +1,49 @@
+/*
+ * CDDL HEADER START
+ *
+ * The contents of this file are subject to the terms of the
+ * Common Development and Distribution License, Version 1.0 only
+ * (the "License"). You may not use this file except in compliance
+ * with the License.
+ *
+ * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
+ * or http://www.opensolaris.org/os/licensing.
+ * See the License for the specific language governing permissions
+ * and limitations under the License.
+ *
+ * When distributing Covered Code, include this CDDL HEADER in each
+ * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
+ * If applicable, add the following below this CDDL HEADER, with the
+ * fields enclosed by brackets "[]" replaced with your own identifying
+ * information: Portions Copyright [yyyy] [name of copyright owner]
+ *
+ * CDDL HEADER END
+ */
+/*
+ * Copyright 2005 Sun Microsystems, Inc. All rights reserved.
+ * Use is subject to license terms.
+ */
+
+#ifndef _SYS_VDEV_FILE_H
+#define _SYS_VDEV_FILE_H
+
+#pragma ident "%Z%%M% %I% %E% SMI"
+
+#include <sys/vdev.h>
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+typedef struct vdev_file {
+ vnode_t *vf_vnode;
+} vdev_file_t;
+
+extern void vdev_file_init(void);
+extern void vdev_file_fini(void);
+
+#ifdef __cplusplus
+}
+#endif
+
+#endif /* _SYS_VDEV_FILE_H */
diff --git a/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/vdev_impl.h b/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/vdev_impl.h
new file mode 100644
index 000000000000..e40335fc73ae
--- /dev/null
+++ b/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/vdev_impl.h
@@ -0,0 +1,571 @@
+/*
+ * CDDL HEADER START
+ *
+ * The contents of this file are subject to the terms of the
+ * Common Development and Distribution License (the "License").
+ * You may not use this file except in compliance with the License.
+ *
+ * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
+ * or http://www.opensolaris.org/os/licensing.
+ * See the License for the specific language governing permissions
+ * and limitations under the License.
+ *
+ * When distributing Covered Code, include this CDDL HEADER in each
+ * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
+ * If applicable, add the following below this CDDL HEADER, with the
+ * fields enclosed by brackets "[]" replaced with your own identifying
+ * information: Portions Copyright [yyyy] [name of copyright owner]
+ *
+ * CDDL HEADER END
+ */
+/*
+ * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved.
+ * Copyright (c) 2011, 2020 by Delphix. All rights reserved.
+ * Copyright (c) 2017, Intel Corporation.
+ */
+
+#ifndef _SYS_VDEV_IMPL_H
+#define _SYS_VDEV_IMPL_H
+
+#include <sys/avl.h>
+#include <sys/bpobj.h>
+#include <sys/dmu.h>
+#include <sys/metaslab.h>
+#include <sys/nvpair.h>
+#include <sys/space_map.h>
+#include <sys/vdev.h>
+#include <sys/dkio.h>
+#include <sys/uberblock_impl.h>
+#include <sys/vdev_indirect_mapping.h>
+#include <sys/vdev_indirect_births.h>
+#include <sys/vdev_removal.h>
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+/*
+ * Virtual device descriptors.
+ *
+ * All storage pool operations go through the virtual device framework,
+ * which provides data replication and I/O scheduling.
+ */
+
+/*
+ * Forward declarations that lots of things need.
+ */
+typedef struct vdev_queue vdev_queue_t;
+typedef struct vdev_cache vdev_cache_t;
+typedef struct vdev_cache_entry vdev_cache_entry_t;
+struct abd;
+
+extern int zfs_vdev_queue_depth_pct;
+extern int zfs_vdev_def_queue_depth;
+extern uint32_t zfs_vdev_async_write_max_active;
+
+/*
+ * Virtual device operations
+ */
+typedef int vdev_open_func_t(vdev_t *vd, uint64_t *size, uint64_t *max_size,
+ uint64_t *logical_ashift, uint64_t *physical_ashift);
+typedef void vdev_close_func_t(vdev_t *vd);
+typedef uint64_t vdev_asize_func_t(vdev_t *vd, uint64_t psize);
+typedef void vdev_io_start_func_t(zio_t *zio);
+typedef void vdev_io_done_func_t(zio_t *zio);
+typedef void vdev_state_change_func_t(vdev_t *vd, int, int);
+typedef boolean_t vdev_need_resilver_func_t(vdev_t *vd, uint64_t, size_t);
+typedef void vdev_hold_func_t(vdev_t *vd);
+typedef void vdev_rele_func_t(vdev_t *vd);
+
+typedef void vdev_remap_cb_t(uint64_t inner_offset, vdev_t *vd,
+ uint64_t offset, uint64_t size, void *arg);
+typedef void vdev_remap_func_t(vdev_t *vd, uint64_t offset, uint64_t size,
+ vdev_remap_cb_t callback, void *arg);
+/*
+ * Given a target vdev, translates the logical range "in" to the physical
+ * range "res"
+ */
+typedef void vdev_xlation_func_t(vdev_t *cvd, const range_seg_t *in,
+ range_seg_t *res);
+
+typedef struct vdev_ops {
+ vdev_open_func_t *vdev_op_open;
+ vdev_close_func_t *vdev_op_close;
+ vdev_asize_func_t *vdev_op_asize;
+ vdev_io_start_func_t *vdev_op_io_start;
+ vdev_io_done_func_t *vdev_op_io_done;
+ vdev_state_change_func_t *vdev_op_state_change;
+ vdev_need_resilver_func_t *vdev_op_need_resilver;
+ vdev_hold_func_t *vdev_op_hold;
+ vdev_rele_func_t *vdev_op_rele;
+ vdev_remap_func_t *vdev_op_remap;
+ /*
+ * For translating ranges from non-leaf vdevs (e.g. raidz) to leaves.
+ * Used when initializing vdevs. Isn't used by leaf ops.
+ */
+ vdev_xlation_func_t *vdev_op_xlate;
+ char vdev_op_type[16];
+ boolean_t vdev_op_leaf;
+} vdev_ops_t;
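+
+/*
+ * Editorial sketch (guarded out; the vdev_example_* names below are
+ * placeholders, not functions in this tree): a hypothetical leaf vdev
+ * type would fill in the table like the existing ops tables do.
+ */
+#if 0
+static vdev_ops_t vdev_example_ops = {
+	vdev_example_open,	/* vdev_op_open */
+	vdev_example_close,	/* vdev_op_close */
+	vdev_default_asize,	/* vdev_op_asize */
+	vdev_example_io_start,	/* vdev_op_io_start */
+	vdev_example_io_done,	/* vdev_op_io_done */
+	NULL,			/* vdev_op_state_change */
+	NULL,			/* vdev_op_need_resilver */
+	NULL,			/* vdev_op_hold */
+	NULL,			/* vdev_op_rele */
+	NULL,			/* vdev_op_remap */
+	NULL,			/* vdev_op_xlate (unused by leaf ops) */
+	"example",		/* vdev_op_type */
+	B_TRUE			/* vdev_op_leaf */
+};
+#endif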
+
+/*
+ * Virtual device properties
+ */
+struct vdev_cache_entry {
+ struct abd *ve_abd;
+ uint64_t ve_offset;
+ uint64_t ve_lastused;
+ avl_node_t ve_offset_node;
+ avl_node_t ve_lastused_node;
+ uint32_t ve_hits;
+ uint16_t ve_missed_update;
+ zio_t *ve_fill_io;
+};
+
+struct vdev_cache {
+ avl_tree_t vc_offset_tree;
+ avl_tree_t vc_lastused_tree;
+ kmutex_t vc_lock;
+};
+
+typedef struct vdev_queue_class {
+ uint32_t vqc_active;
+
+ /*
+	 * Sorted by offset or timestamp, depending on whether the queue
+	 * is LBA-ordered or FIFO.
+ */
+ avl_tree_t vqc_queued_tree;
+} vdev_queue_class_t;
+
+struct vdev_queue {
+ vdev_t *vq_vdev;
+ vdev_queue_class_t vq_class[ZIO_PRIORITY_NUM_QUEUEABLE];
+ avl_tree_t vq_active_tree;
+ avl_tree_t vq_read_offset_tree;
+ avl_tree_t vq_write_offset_tree;
+ uint64_t vq_last_offset;
+ hrtime_t vq_io_complete_ts; /* time last i/o completed */
+ kmutex_t vq_lock;
+ uint64_t vq_lastoffset;
+};
+
+typedef enum vdev_alloc_bias {
+ VDEV_BIAS_NONE,
+ VDEV_BIAS_LOG, /* dedicated to ZIL data (SLOG) */
+ VDEV_BIAS_SPECIAL, /* dedicated to ddt, metadata, and small blks */
+ VDEV_BIAS_DEDUP /* dedicated to dedup metadata */
+} vdev_alloc_bias_t;
+
+
+/*
+ * On-disk indirect vdev state.
+ *
+ * An indirect vdev is described exclusively in the MOS config of a pool.
+ * The config for an indirect vdev includes several fields, which are
+ * accessed in memory by a vdev_indirect_config_t.
+ */
+typedef struct vdev_indirect_config {
+ /*
+ * Object (in MOS) which contains the indirect mapping. This object
+ * contains an array of vdev_indirect_mapping_entry_phys_t ordered by
+ * vimep_src. The bonus buffer for this object is a
+ * vdev_indirect_mapping_phys_t. This object is allocated when a vdev
+ * removal is initiated.
+ *
+ * Note that this object can be empty if none of the data on the vdev
+ * has been copied yet.
+ */
+ uint64_t vic_mapping_object;
+
+ /*
+ * Object (in MOS) which contains the birth times for the mapping
+ * entries. This object contains an array of
+ * vdev_indirect_birth_entry_phys_t sorted by vibe_offset. The bonus
+ * buffer for this object is a vdev_indirect_birth_phys_t. This object
+ * is allocated when a vdev removal is initiated.
+ *
+	 * Note that this object can be empty if none of the vdev's data
+	 * has yet been copied.
+ */
+ uint64_t vic_births_object;
+
+ /*
+ * This is the vdev ID which was removed previous to this vdev, or
+ * UINT64_MAX if there are no previously removed vdevs.
+ */
+ uint64_t vic_prev_indirect_vdev;
+} vdev_indirect_config_t;
+
+/*
+ * Virtual device descriptor
+ */
+struct vdev {
+ /*
+ * Common to all vdev types.
+ */
+ uint64_t vdev_id; /* child number in vdev parent */
+ uint64_t vdev_guid; /* unique ID for this vdev */
+ uint64_t vdev_guid_sum; /* self guid + all child guids */
+ uint64_t vdev_orig_guid; /* orig. guid prior to remove */
+ uint64_t vdev_asize; /* allocatable device capacity */
+ uint64_t vdev_min_asize; /* min acceptable asize */
+ uint64_t vdev_max_asize; /* max acceptable asize */
+ uint64_t vdev_ashift; /* block alignment shift */
+ /*
+ * Logical block alignment shift
+ *
+ * The smallest sized/aligned I/O supported by the device.
+ */
+ uint64_t vdev_logical_ashift;
+ /*
+ * Physical block alignment shift
+ *
+ * The device supports logical I/Os with vdev_logical_ashift
+ * size/alignment, but optimum performance will be achieved by
+ * aligning/sizing requests to vdev_physical_ashift. Smaller
+ * requests may be inflated or incur device level read-modify-write
+ * operations.
+ *
+ * May be 0 to indicate no preference (i.e. use vdev_logical_ashift).
+ */
+ uint64_t vdev_physical_ashift;
+ uint64_t vdev_state; /* see VDEV_STATE_* #defines */
+ uint64_t vdev_prevstate; /* used when reopening a vdev */
+ vdev_ops_t *vdev_ops; /* vdev operations */
+ spa_t *vdev_spa; /* spa for this vdev */
+ void *vdev_tsd; /* type-specific data */
+ vnode_t *vdev_name_vp; /* vnode for pathname */
+ vnode_t *vdev_devid_vp; /* vnode for devid */
+ vdev_t *vdev_top; /* top-level vdev */
+ vdev_t *vdev_parent; /* parent vdev */
+ vdev_t **vdev_child; /* array of children */
+ uint64_t vdev_children; /* number of children */
+ vdev_stat_t vdev_stat; /* virtual device statistics */
+ boolean_t vdev_expanding; /* expand the vdev? */
+ boolean_t vdev_reopening; /* reopen in progress? */
+ boolean_t vdev_nonrot; /* true if solid state */
+ int vdev_open_error; /* error on last open */
+ kthread_t *vdev_open_thread; /* thread opening children */
+ uint64_t vdev_crtxg; /* txg when top-level was added */
+
+ /*
+ * Top-level vdev state.
+ */
+ uint64_t vdev_ms_array; /* metaslab array object */
+ uint64_t vdev_ms_shift; /* metaslab size shift */
+ uint64_t vdev_ms_count; /* number of metaslabs */
+ metaslab_group_t *vdev_mg; /* metaslab group */
+ metaslab_t **vdev_ms; /* metaslab array */
+ txg_list_t vdev_ms_list; /* per-txg dirty metaslab lists */
+ txg_list_t vdev_dtl_list; /* per-txg dirty DTL lists */
+ txg_node_t vdev_txg_node; /* per-txg dirty vdev linkage */
+ boolean_t vdev_remove_wanted; /* async remove wanted? */
+ boolean_t vdev_probe_wanted; /* async probe wanted? */
+ list_node_t vdev_config_dirty_node; /* config dirty list */
+ list_node_t vdev_state_dirty_node; /* state dirty list */
+ uint64_t vdev_deflate_ratio; /* deflation ratio (x512) */
+ uint64_t vdev_islog; /* is an intent log device */
+ uint64_t vdev_removing; /* device is being removed? */
+ boolean_t vdev_ishole; /* is a hole in the namespace */
+ uint64_t vdev_top_zap;
+ vdev_alloc_bias_t vdev_alloc_bias; /* metaslab allocation bias */
+
+ /* pool checkpoint related */
+ space_map_t *vdev_checkpoint_sm; /* contains reserved blocks */
+
+ boolean_t vdev_initialize_exit_wanted;
+ vdev_initializing_state_t vdev_initialize_state;
+ kthread_t *vdev_initialize_thread;
+ /* Protects vdev_initialize_thread and vdev_initialize_state. */
+ kmutex_t vdev_initialize_lock;
+ kcondvar_t vdev_initialize_cv;
+ uint64_t vdev_initialize_offset[TXG_SIZE];
+ uint64_t vdev_initialize_last_offset;
+ range_tree_t *vdev_initialize_tree; /* valid while initializing */
+ uint64_t vdev_initialize_bytes_est;
+ uint64_t vdev_initialize_bytes_done;
+ time_t vdev_initialize_action_time; /* start and end time */
+
+ /* for limiting outstanding I/Os */
+ kmutex_t vdev_initialize_io_lock;
+ kcondvar_t vdev_initialize_io_cv;
+ uint64_t vdev_initialize_inflight;
+
+ /*
+ * Values stored in the config for an indirect or removing vdev.
+ */
+ vdev_indirect_config_t vdev_indirect_config;
+
+ /*
+ * The vdev_indirect_rwlock protects the vdev_indirect_mapping
+ * pointer from changing on indirect vdevs (when it is condensed).
+ * Note that removing (not yet indirect) vdevs have different
+ * access patterns (the mapping is not accessed from open context,
+ * e.g. from zio_read) and locking strategy (e.g. svr_lock).
+ */
+ krwlock_t vdev_indirect_rwlock;
+ vdev_indirect_mapping_t *vdev_indirect_mapping;
+ vdev_indirect_births_t *vdev_indirect_births;
+
+ /*
+ * In memory data structures used to manage the obsolete sm, for
+ * indirect or removing vdevs.
+ *
+ * The vdev_obsolete_segments is the in-core record of the segments
+ * that are no longer referenced anywhere in the pool (due to
+ * being freed or remapped and not referenced by any snapshots).
+ * During a sync, segments are added to vdev_obsolete_segments
+ * via vdev_indirect_mark_obsolete(); at the end of each sync
+ * pass, this is appended to vdev_obsolete_sm via
+ * vdev_indirect_sync_obsolete(). The vdev_obsolete_lock
+ * protects against concurrent modifications of vdev_obsolete_segments
+ * from multiple zio threads.
+ */
+ kmutex_t vdev_obsolete_lock;
+ range_tree_t *vdev_obsolete_segments;
+ space_map_t *vdev_obsolete_sm;
+
+ /*
+ * Protects the vdev_scan_io_queue field itself as well as the
+ * structure's contents (when present).
+ */
+ kmutex_t vdev_scan_io_queue_lock;
+ struct dsl_scan_io_queue *vdev_scan_io_queue;
+
+ /*
+ * Leaf vdev state.
+ */
+ range_tree_t *vdev_dtl[DTL_TYPES]; /* dirty time logs */
+ space_map_t *vdev_dtl_sm; /* dirty time log space map */
+ txg_node_t vdev_dtl_node; /* per-txg dirty DTL linkage */
+ uint64_t vdev_dtl_object; /* DTL object */
+ uint64_t vdev_psize; /* physical device capacity */
+ uint64_t vdev_wholedisk; /* true if this is a whole disk */
+ uint64_t vdev_offline; /* persistent offline state */
+ uint64_t vdev_faulted; /* persistent faulted state */
+ uint64_t vdev_degraded; /* persistent degraded state */
+ uint64_t vdev_removed; /* persistent removed state */
+ uint64_t vdev_resilver_txg; /* persistent resilvering state */
+ uint64_t vdev_nparity; /* number of parity devices for raidz */
+ char *vdev_path; /* vdev path (if any) */
+ char *vdev_devid; /* vdev devid (if any) */
+ char *vdev_physpath; /* vdev device path (if any) */
+ char *vdev_fru; /* physical FRU location */
+ uint64_t vdev_not_present; /* not present during import */
+ uint64_t vdev_unspare; /* unspare when resilvering done */
+ boolean_t vdev_nowritecache; /* true if flushwritecache failed */
+ boolean_t vdev_notrim; /* true if trim failed */
+ boolean_t vdev_checkremove; /* temporary online test */
+ boolean_t vdev_forcefault; /* force online fault */
+ boolean_t vdev_splitting; /* split or repair in progress */
+ boolean_t vdev_delayed_close; /* delayed device close? */
+ boolean_t vdev_tmpoffline; /* device taken offline temporarily? */
+ boolean_t vdev_detached; /* device detached? */
+ boolean_t vdev_cant_read; /* vdev is failing all reads */
+ boolean_t vdev_cant_write; /* vdev is failing all writes */
+ boolean_t vdev_isspare; /* was a hot spare */
+ boolean_t vdev_isl2cache; /* was a l2cache device */
+ vdev_queue_t vdev_queue; /* I/O deadline schedule queue */
+ vdev_cache_t vdev_cache; /* physical block cache */
+ spa_aux_vdev_t *vdev_aux; /* for l2cache and spares vdevs */
+ zio_t *vdev_probe_zio; /* root of current probe */
+ vdev_aux_t vdev_label_aux; /* on-disk aux state */
+ struct trim_map *vdev_trimmap; /* map on outstanding trims */
+ uint64_t vdev_leaf_zap;
+ hrtime_t vdev_mmp_pending; /* 0 if write finished */
+ uint64_t vdev_mmp_kstat_id; /* to find kstat entry */
+ list_node_t vdev_leaf_node; /* leaf vdev list */
+
+ /*
+ * For DTrace to work in userland (libzpool) context, these fields must
+ * remain at the end of the structure. DTrace will use the kernel's
+ * CTF definition for 'struct vdev', and since the size of a kmutex_t is
+ * larger in userland, the offsets for the rest of the fields would be
+ * incorrect.
+ */
+ kmutex_t vdev_dtl_lock; /* vdev_dtl_{map,resilver} */
+ kmutex_t vdev_stat_lock; /* vdev_stat */
+ kmutex_t vdev_probe_lock; /* protects vdev_probe_zio */
+};
+
+#define VDEV_RAIDZ_MAXPARITY 3
+
+#define VDEV_PAD_SIZE (8 << 10)
+/* 2 padding areas (vl_pad1 and vl_be) to skip */
+#define	VDEV_SKIP_SIZE		(VDEV_PAD_SIZE * 2)
+#define VDEV_PHYS_SIZE (112 << 10)
+#define VDEV_UBERBLOCK_RING (128 << 10)
+
+/*
+ * MMP blocks occupy the last MMP_BLOCKS_PER_LABEL slots in the uberblock
+ * ring when MMP is enabled.
+ */
+#define MMP_BLOCKS_PER_LABEL 1
+
+/* The largest uberblock we support is 8k. */
+#define MAX_UBERBLOCK_SHIFT (13)
+#define VDEV_UBERBLOCK_SHIFT(vd) \
+ MIN(MAX((vd)->vdev_top->vdev_ashift, UBERBLOCK_SHIFT), \
+ MAX_UBERBLOCK_SHIFT)
+#define VDEV_UBERBLOCK_COUNT(vd) \
+ (VDEV_UBERBLOCK_RING >> VDEV_UBERBLOCK_SHIFT(vd))
+#define VDEV_UBERBLOCK_OFFSET(vd, n) \
+ offsetof(vdev_label_t, vl_uberblock[(n) << VDEV_UBERBLOCK_SHIFT(vd)])
+#define VDEV_UBERBLOCK_SIZE(vd) (1ULL << VDEV_UBERBLOCK_SHIFT(vd))
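+
+/*
+ * Editorial worked example: for a top-level vdev with vdev_ashift = 12
+ * (4K sectors), VDEV_UBERBLOCK_SHIFT is MIN(MAX(12, UBERBLOCK_SHIFT),
+ * 13) = 12, so each slot is 4K and the 128K ring holds 32 uberblocks.
+ * With 512-byte sectors (ashift 9), the shift falls back to
+ * UBERBLOCK_SHIFT and the ring holds 128 smaller slots.
+ */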
+
+typedef struct vdev_phys {
+ char vp_nvlist[VDEV_PHYS_SIZE - sizeof (zio_eck_t)];
+ zio_eck_t vp_zbt;
+} vdev_phys_t;
+
+typedef enum vbe_vers {
+ /* The bootenv file is stored as ascii text in the envblock */
+ VB_RAW = 0,
+
+ /*
+ * The bootenv file is converted to an nvlist and then packed into the
+ * envblock.
+ */
+ VB_NVLIST = 1
+} vbe_vers_t;
+
+typedef struct vdev_boot_envblock {
+ uint64_t vbe_version;
+ char vbe_bootenv[VDEV_PAD_SIZE - sizeof (uint64_t) -
+ sizeof (zio_eck_t)];
+ zio_eck_t vbe_zbt;
+} vdev_boot_envblock_t;
+
+CTASSERT(sizeof (vdev_boot_envblock_t) == VDEV_PAD_SIZE);
+
+typedef struct vdev_label {
+ char vl_pad1[VDEV_PAD_SIZE]; /* 8K */
+ vdev_boot_envblock_t vl_be; /* 8K */
+ vdev_phys_t vl_vdev_phys; /* 112K */
+ char vl_uberblock[VDEV_UBERBLOCK_RING]; /* 128K */
+} vdev_label_t; /* 256K total */
+
+/*
+ * vdev_dirty() flags
+ */
+#define VDD_METASLAB 0x01
+#define VDD_DTL 0x02
+
+/* Offset of embedded boot loader region on each label */
+#define VDEV_BOOT_OFFSET (2 * sizeof (vdev_label_t))
+/*
+ * Size of embedded boot loader region on each label.
+ * The total size of the first two labels plus the boot area is 4MB.
+ */
+#define VDEV_BOOT_SIZE (7ULL << 19) /* 3.5M */
+
+/*
+ * Size of label regions at the start and end of each leaf device.
+ */
+#define VDEV_LABEL_START_SIZE (2 * sizeof (vdev_label_t) + VDEV_BOOT_SIZE)
+#define VDEV_LABEL_END_SIZE (2 * sizeof (vdev_label_t))
+#define VDEV_LABELS 4
+#define VDEV_BEST_LABEL VDEV_LABELS
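+
+/*
+ * Editorial consistency sketch of the geometry above: each label is
+ * 8K + 8K + 112K + 128K = 256K, and the front label region is two
+ * labels plus the 3.5M boot area, i.e. 4M. (The original header only
+ * asserts the envblock size.)
+ */
+CTASSERT(sizeof (vdev_label_t) == (256 << 10));
+CTASSERT(VDEV_LABEL_START_SIZE == (4ULL << 20));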
+
+#define VDEV_ALLOC_LOAD 0
+#define VDEV_ALLOC_ADD 1
+#define VDEV_ALLOC_SPARE 2
+#define VDEV_ALLOC_L2CACHE 3
+#define VDEV_ALLOC_ROOTPOOL 4
+#define VDEV_ALLOC_SPLIT 5
+#define VDEV_ALLOC_ATTACH 6
+
+/*
+ * Allocate or free a vdev
+ */
+extern vdev_t *vdev_alloc_common(spa_t *spa, uint_t id, uint64_t guid,
+ vdev_ops_t *ops);
+extern int vdev_alloc(spa_t *spa, vdev_t **vdp, nvlist_t *config,
+ vdev_t *parent, uint_t id, int alloctype);
+extern void vdev_free(vdev_t *vd);
+
+/*
+ * Add or remove children and parents
+ */
+extern void vdev_add_child(vdev_t *pvd, vdev_t *cvd);
+extern void vdev_remove_child(vdev_t *pvd, vdev_t *cvd);
+extern void vdev_compact_children(vdev_t *pvd);
+extern vdev_t *vdev_add_parent(vdev_t *cvd, vdev_ops_t *ops);
+extern void vdev_remove_parent(vdev_t *cvd);
+
+/*
+ * vdev sync load and sync
+ */
+extern boolean_t vdev_log_state_valid(vdev_t *vd);
+extern int vdev_load(vdev_t *vd);
+extern int vdev_dtl_load(vdev_t *vd);
+extern void vdev_sync(vdev_t *vd, uint64_t txg);
+extern void vdev_sync_done(vdev_t *vd, uint64_t txg);
+extern void vdev_dirty(vdev_t *vd, int flags, void *arg, uint64_t txg);
+extern void vdev_dirty_leaves(vdev_t *vd, int flags, uint64_t txg);
+
+/*
+ * Available vdev types.
+ */
+extern vdev_ops_t vdev_root_ops;
+extern vdev_ops_t vdev_mirror_ops;
+extern vdev_ops_t vdev_replacing_ops;
+extern vdev_ops_t vdev_raidz_ops;
+#ifdef _KERNEL
+extern vdev_ops_t vdev_geom_ops;
+#else
+extern vdev_ops_t vdev_disk_ops;
+#endif
+extern vdev_ops_t vdev_file_ops;
+extern vdev_ops_t vdev_missing_ops;
+extern vdev_ops_t vdev_hole_ops;
+extern vdev_ops_t vdev_spare_ops;
+extern vdev_ops_t vdev_indirect_ops;
+
+/*
+ * Common size functions
+ */
+extern void vdev_default_xlate(vdev_t *vd, const range_seg_t *in,
+ range_seg_t *out);
+extern uint64_t vdev_default_asize(vdev_t *vd, uint64_t psize);
+extern uint64_t vdev_get_min_asize(vdev_t *vd);
+extern void vdev_set_min_asize(vdev_t *vd);
+
+/*
+ * Global variables
+ */
+extern int vdev_standard_sm_blksz;
+/* zdb uses this tunable, so it must be declared here to make lint happy. */
+extern int zfs_vdev_cache_size;
+extern uint_t zfs_geom_probe_vdev_key;
+
+/*
+ * Functions from vdev_indirect.c
+ */
+extern void vdev_indirect_sync_obsolete(vdev_t *vd, dmu_tx_t *tx);
+extern boolean_t vdev_indirect_should_condense(vdev_t *vd);
+extern void spa_condense_indirect_start_sync(vdev_t *vd, dmu_tx_t *tx);
+extern int vdev_obsolete_sm_object(vdev_t *vd);
+extern boolean_t vdev_obsolete_counts_are_precise(vdev_t *vd);
+
+#ifdef illumos
+/*
+ * Other miscellaneous functions
+ */
+int vdev_checkpoint_sm_object(vdev_t *vd);
+
+/*
+ * The vdev_buf_t is used to translate between zio_t and buf_t, and back again.
+ */
+typedef struct vdev_buf {
+ buf_t vb_buf; /* buffer that describes the io */
+ zio_t *vb_io; /* pointer back to the original zio_t */
+} vdev_buf_t;
+#endif
+
+#ifdef __cplusplus
+}
+#endif
+
+#endif /* _SYS_VDEV_IMPL_H */
diff --git a/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/vdev_indirect_births.h b/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/vdev_indirect_births.h
new file mode 100644
index 000000000000..987b14485d2b
--- /dev/null
+++ b/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/vdev_indirect_births.h
@@ -0,0 +1,80 @@
+/*
+ * CDDL HEADER START
+ *
+ * This file and its contents are supplied under the terms of the
+ * Common Development and Distribution License ("CDDL"), version 1.0.
+ * You may only use this file in accordance with the terms of version
+ * 1.0 of the CDDL.
+ *
+ * A full copy of the text of the CDDL should have accompanied this
+ * source. A copy of the CDDL is also available via the Internet at
+ * http://www.illumos.org/license/CDDL.
+ *
+ * CDDL HEADER END
+ */
+
+/*
+ * Copyright (c) 2015 by Delphix. All rights reserved.
+ */
+
+#ifndef _SYS_VDEV_INDIRECT_BIRTHS_H
+#define _SYS_VDEV_INDIRECT_BIRTHS_H
+
+#include <sys/dmu.h>
+#include <sys/spa.h>
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+typedef struct vdev_indirect_birth_entry_phys {
+ uint64_t vibe_offset;
+ uint64_t vibe_phys_birth_txg;
+} vdev_indirect_birth_entry_phys_t;
+
+typedef struct vdev_indirect_birth_phys {
+ uint64_t vib_count; /* count of v_i_b_entry_phys_t's */
+} vdev_indirect_birth_phys_t;
+
+typedef struct vdev_indirect_births {
+ uint64_t vib_object;
+
+ /*
+ * Each entry indicates that everything up to but not including
+ * vibe_offset was copied in vibe_phys_birth_txg. Entries are sorted
+ * by increasing phys_birth, and also by increasing offset. See
+ * vdev_indirect_births_physbirth for usage.
+ */
+ vdev_indirect_birth_entry_phys_t *vib_entries;
+
+ objset_t *vib_objset;
+
+ dmu_buf_t *vib_dbuf;
+ vdev_indirect_birth_phys_t *vib_phys;
+} vdev_indirect_births_t;
+
+extern vdev_indirect_births_t *vdev_indirect_births_open(objset_t *os,
+ uint64_t object);
+extern void vdev_indirect_births_close(vdev_indirect_births_t *vib);
+extern boolean_t vdev_indirect_births_is_open(vdev_indirect_births_t *vib);
+extern uint64_t vdev_indirect_births_alloc(objset_t *os, dmu_tx_t *tx);
+extern void vdev_indirect_births_free(objset_t *os, uint64_t object,
+ dmu_tx_t *tx);
+
+extern uint64_t vdev_indirect_births_count(vdev_indirect_births_t *vib);
+extern uint64_t vdev_indirect_births_object(vdev_indirect_births_t *vib);
+
+extern void vdev_indirect_births_add_entry(vdev_indirect_births_t *vib,
+ uint64_t offset, uint64_t txg, dmu_tx_t *tx);
+
+extern uint64_t vdev_indirect_births_physbirth(vdev_indirect_births_t *vib,
+ uint64_t offset, uint64_t asize);
+
+extern uint64_t vdev_indirect_births_last_entry_txg(
+ vdev_indirect_births_t *vib);
+
+#ifdef __cplusplus
+}
+#endif
+
+#endif /* _SYS_VDEV_INDIRECT_BIRTHS_H */
diff --git a/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/vdev_indirect_mapping.h b/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/vdev_indirect_mapping.h
new file mode 100644
index 000000000000..7e42c1019504
--- /dev/null
+++ b/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/vdev_indirect_mapping.h
@@ -0,0 +1,141 @@
+/*
+ * CDDL HEADER START
+ *
+ * This file and its contents are supplied under the terms of the
+ * Common Development and Distribution License ("CDDL"), version 1.0.
+ * You may only use this file in accordance with the terms of version
+ * 1.0 of the CDDL.
+ *
+ * A full copy of the text of the CDDL should have accompanied this
+ * source. A copy of the CDDL is also available via the Internet at
+ * http://www.illumos.org/license/CDDL.
+ *
+ * CDDL HEADER END
+ */
+
+/*
+ * Copyright (c) 2015 by Delphix. All rights reserved.
+ */
+
+#ifndef _SYS_VDEV_INDIRECT_MAPPING_H
+#define _SYS_VDEV_INDIRECT_MAPPING_H
+
+#include <sys/dmu.h>
+#include <sys/list.h>
+#include <sys/spa.h>
+#include <sys/space_map.h>
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+typedef struct vdev_indirect_mapping_entry_phys {
+ /*
+ * Decode with DVA_MAPPING_* macros.
+ * Contains:
+ * the source offset (low 63 bits)
+ * the one-bit "mark", used for garbage collection (by zdb)
+ */
+ uint64_t vimep_src;
+
+ /*
+ * Note: the DVA's asize is 24 bits, and can thus store ranges
+ * up to 8GB.
+ */
+ dva_t vimep_dst;
+} vdev_indirect_mapping_entry_phys_t;
+
+#define DVA_MAPPING_GET_SRC_OFFSET(vimep) \
+ BF64_GET_SB((vimep)->vimep_src, 0, 63, SPA_MINBLOCKSHIFT, 0)
+#define DVA_MAPPING_SET_SRC_OFFSET(vimep, x) \
+ BF64_SET_SB((vimep)->vimep_src, 0, 63, SPA_MINBLOCKSHIFT, 0, x)
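+
+/*
+ * Editorial sketch: reading an entry's source offset. The on-disk
+ * field is stored in units of SPA_MINBLOCKSIZE (512 bytes); the
+ * macro's SPA_MINBLOCKSHIFT scaling converts it back to bytes.
+ */
+static inline uint64_t
+vimep_example_src_offset(vdev_indirect_mapping_entry_phys_t *vimep)
+{
+	return (DVA_MAPPING_GET_SRC_OFFSET(vimep));
+}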
+
+typedef struct vdev_indirect_mapping_entry {
+ vdev_indirect_mapping_entry_phys_t vime_mapping;
+ uint32_t vime_obsolete_count;
+ list_node_t vime_node;
+} vdev_indirect_mapping_entry_t;
+
+/*
+ * This is stored in the bonus buffer of the mapping object, see comment of
+ * vdev_indirect_config for more details.
+ */
+typedef struct vdev_indirect_mapping_phys {
+ uint64_t vimp_max_offset;
+ uint64_t vimp_bytes_mapped;
+ uint64_t vimp_num_entries; /* number of v_i_m_entry_phys_t's */
+
+ /*
+ * For each entry in the mapping object, this object contains an
+ * entry representing the number of bytes of that mapping entry
+ * that were no longer in use by the pool at the time this indirect
+ * vdev was last condensed.
+ */
+ uint64_t vimp_counts_object;
+} vdev_indirect_mapping_phys_t;
+
+#define VDEV_INDIRECT_MAPPING_SIZE_V0 (3 * sizeof (uint64_t))
+
+typedef struct vdev_indirect_mapping {
+ uint64_t vim_object;
+ boolean_t vim_havecounts;
+
+ /*
+ * An ordered array of all mapping entries, sorted by source offset.
+ * Note that vim_entries is needed during a removal (and contains
+ * mappings that have been synced to disk so far) to handle frees
+ * from the removing device.
+ */
+ vdev_indirect_mapping_entry_phys_t *vim_entries;
+
+ objset_t *vim_objset;
+
+ dmu_buf_t *vim_dbuf;
+ vdev_indirect_mapping_phys_t *vim_phys;
+} vdev_indirect_mapping_t;
+
+extern vdev_indirect_mapping_t *vdev_indirect_mapping_open(objset_t *os,
+ uint64_t object);
+extern void vdev_indirect_mapping_close(vdev_indirect_mapping_t *vim);
+extern uint64_t vdev_indirect_mapping_alloc(objset_t *os, dmu_tx_t *tx);
+extern void vdev_indirect_mapping_free(objset_t *os, uint64_t obj,
+ dmu_tx_t *tx);
+
+extern uint64_t vdev_indirect_mapping_num_entries(vdev_indirect_mapping_t *vim);
+extern uint64_t vdev_indirect_mapping_max_offset(vdev_indirect_mapping_t *vim);
+extern uint64_t vdev_indirect_mapping_object(vdev_indirect_mapping_t *vim);
+extern uint64_t vdev_indirect_mapping_bytes_mapped(
+ vdev_indirect_mapping_t *vim);
+extern uint64_t vdev_indirect_mapping_size(vdev_indirect_mapping_t *vim);
+
+/*
+ * Writes the given list of vdev_indirect_mapping_entry_t to the mapping
+ * then updates internal state.
+ */
+extern void vdev_indirect_mapping_add_entries(vdev_indirect_mapping_t *vim,
+ list_t *vime_list, dmu_tx_t *tx);
+
+extern vdev_indirect_mapping_entry_phys_t *
+ vdev_indirect_mapping_entry_for_offset(vdev_indirect_mapping_t *vim,
+ uint64_t offset);
+
+extern vdev_indirect_mapping_entry_phys_t *
+ vdev_indirect_mapping_entry_for_offset_or_next(vdev_indirect_mapping_t *vim,
+ uint64_t offset);
+
+extern uint32_t *vdev_indirect_mapping_load_obsolete_counts(
+ vdev_indirect_mapping_t *vim);
+extern void vdev_indirect_mapping_load_obsolete_spacemap(
+ vdev_indirect_mapping_t *vim,
+ uint32_t *counts, space_map_t *obsolete_space_sm);
+extern void vdev_indirect_mapping_increment_obsolete_count(
+ vdev_indirect_mapping_t *vim,
+ uint64_t offset, uint64_t asize, uint32_t *counts);
+extern void vdev_indirect_mapping_free_obsolete_counts(
+ vdev_indirect_mapping_t *vim, uint32_t *counts);
+
+#ifdef __cplusplus
+}
+#endif
+
+#endif /* _SYS_VDEV_INDIRECT_MAPPING_H */
diff --git a/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/vdev_initialize.h b/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/vdev_initialize.h
new file mode 100644
index 000000000000..db4b0572cd60
--- /dev/null
+++ b/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/vdev_initialize.h
@@ -0,0 +1,46 @@
+/*
+ * CDDL HEADER START
+ *
+ * The contents of this file are subject to the terms of the
+ * Common Development and Distribution License (the "License").
+ * You may not use this file except in compliance with the License.
+ *
+ * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
+ * or http://www.opensolaris.org/os/licensing.
+ * See the License for the specific language governing permissions
+ * and limitations under the License.
+ *
+ * When distributing Covered Code, include this CDDL HEADER in each
+ * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
+ * If applicable, add the following below this CDDL HEADER, with the
+ * fields enclosed by brackets "[]" replaced with your own identifying
+ * information: Portions Copyright [yyyy] [name of copyright owner]
+ *
+ * CDDL HEADER END
+ */
+
+/*
+ * Copyright (c) 2016 by Delphix. All rights reserved.
+ */
+
+#ifndef _SYS_VDEV_INITIALIZE_H
+#define _SYS_VDEV_INITIALIZE_H
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+extern void vdev_initialize(vdev_t *vd);
+extern void vdev_initialize_stop(vdev_t *vd,
+ vdev_initializing_state_t tgt_state);
+extern void vdev_initialize_stop_all(vdev_t *vd,
+ vdev_initializing_state_t tgt_state);
+extern void vdev_initialize_restart(vdev_t *vd);
+extern void vdev_xlate(vdev_t *vd, const range_seg_t *logical_rs,
+ range_seg_t *physical_rs);
+
+#ifdef __cplusplus
+}
+#endif
+
+#endif /* _SYS_VDEV_INITIALIZE_H */
diff --git a/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/vdev_raidz.h b/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/vdev_raidz.h
new file mode 100644
index 000000000000..e771e668fda6
--- /dev/null
+++ b/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/vdev_raidz.h
@@ -0,0 +1,50 @@
+/*
+ * CDDL HEADER START
+ *
+ * The contents of this file are subject to the terms of the
+ * Common Development and Distribution License (the "License").
+ * You may not use this file except in compliance with the License.
+ *
+ * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
+ * or http://www.opensolaris.org/os/licensing.
+ * See the License for the specific language governing permissions
+ * and limitations under the License.
+ *
+ * When distributing Covered Code, include this CDDL HEADER in each
+ * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
+ * If applicable, add the following below this CDDL HEADER, with the
+ * fields enclosed by brackets "[]" replaced with your own identifying
+ * information: Portions Copyright [yyyy] [name of copyright owner]
+ *
+ * CDDL HEADER END
+ */
+/*
+ * Copyright (c) 2013, Joyent, Inc. All rights reserved.
+ */
+
+#ifndef _SYS_VDEV_RAIDZ_H
+#define _SYS_VDEV_RAIDZ_H
+
+#include <sys/vdev.h>
+#ifdef illumos
+#include <sys/semaphore.h>
+#ifdef _KERNEL
+#include <sys/ddi.h>
+#include <sys/sunldi.h>
+#include <sys/sunddi.h>
+#endif
+#endif
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+#ifdef _KERNEL
+extern int vdev_raidz_physio(vdev_t *,
+ caddr_t, size_t, uint64_t, uint64_t, boolean_t, boolean_t);
+#endif
+#ifdef __cplusplus
+}
+#endif
+
+#endif /* _SYS_VDEV_RAIDZ_H */
diff --git a/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/vdev_removal.h b/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/vdev_removal.h
new file mode 100644
index 000000000000..3962237afdab
--- /dev/null
+++ b/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/vdev_removal.h
@@ -0,0 +1,96 @@
+/*
+ * CDDL HEADER START
+ *
+ * This file and its contents are supplied under the terms of the
+ * Common Development and Distribution License ("CDDL"), version 1.0.
+ * You may only use this file in accordance with the terms of version
+ * 1.0 of the CDDL.
+ *
+ * A full copy of the text of the CDDL should have accompanied this
+ * source. A copy of the CDDL is also available via the Internet at
+ * http://www.illumos.org/license/CDDL.
+ *
+ * CDDL HEADER END
+ */
+
+/*
+ * Copyright (c) 2014, 2017 by Delphix. All rights reserved.
+ */
+
+#ifndef _SYS_VDEV_REMOVAL_H
+#define _SYS_VDEV_REMOVAL_H
+
+#include <sys/spa.h>
+#include <sys/bpobj.h>
+#include <sys/vdev_indirect_mapping.h>
+#include <sys/vdev_indirect_births.h>
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+typedef struct spa_vdev_removal {
+ uint64_t svr_vdev_id;
+ uint64_t svr_max_offset_to_sync[TXG_SIZE];
+ /* Thread performing a vdev removal. */
+ kthread_t *svr_thread;
+ /* Segments left to copy from the current metaslab. */
+ range_tree_t *svr_allocd_segs;
+ kmutex_t svr_lock;
+ kcondvar_t svr_cv;
+ boolean_t svr_thread_exit;
+
+ /*
+ * New mappings to write out each txg.
+ */
+ list_t svr_new_segments[TXG_SIZE];
+
+ /*
+ * Ranges that were freed while a mapping was in flight. This is
+	 * a subset of the ranges covered by svr_new_segments.
+ */
+ range_tree_t *svr_frees[TXG_SIZE];
+
+ /*
+	 * Number of bytes for which we have finished our work in each
+	 * txg. This could be data copied (which will be part of the
+	 * mappings in svr_new_segments), or data freed before we got
+	 * around to copying it.
+ */
+ uint64_t svr_bytes_done[TXG_SIZE];
+
+ /* List of leaf zap objects to be unlinked */
+ nvlist_t *svr_zaplist;
+} spa_vdev_removal_t;
+
+typedef struct spa_condensing_indirect {
+ /*
+ * New mappings to write out each txg.
+ */
+ list_t sci_new_mapping_entries[TXG_SIZE];
+
+ vdev_indirect_mapping_t *sci_new_mapping;
+} spa_condensing_indirect_t;
+
+extern int spa_remove_init(spa_t *);
+extern void spa_restart_removal(spa_t *);
+extern int spa_condense_init(spa_t *);
+extern void spa_condense_fini(spa_t *);
+extern void spa_start_indirect_condensing_thread(spa_t *);
+extern void spa_vdev_condense_suspend(spa_t *);
+extern int spa_vdev_remove(spa_t *, uint64_t, boolean_t);
+extern void free_from_removing_vdev(vdev_t *, uint64_t, uint64_t);
+extern int spa_removal_get_stats(spa_t *, pool_removal_stat_t *);
+extern void svr_sync(spa_t *spa, dmu_tx_t *tx);
+extern void spa_vdev_remove_suspend(spa_t *);
+extern int spa_vdev_remove_cancel(spa_t *);
+extern void spa_vdev_removal_destroy(spa_vdev_removal_t *svr);
+
+extern int vdev_removal_max_span;
+extern int zfs_remove_max_segment;
+
+#ifdef __cplusplus
+}
+#endif
+
+#endif /* _SYS_VDEV_REMOVAL_H */
diff --git a/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/zap.h b/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/zap.h
new file mode 100644
index 000000000000..e60233b4b103
--- /dev/null
+++ b/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/zap.h
@@ -0,0 +1,514 @@
+/*
+ * CDDL HEADER START
+ *
+ * The contents of this file are subject to the terms of the
+ * Common Development and Distribution License (the "License").
+ * You may not use this file except in compliance with the License.
+ *
+ * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
+ * or http://www.opensolaris.org/os/licensing.
+ * See the License for the specific language governing permissions
+ * and limitations under the License.
+ *
+ * When distributing Covered Code, include this CDDL HEADER in each
+ * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
+ * If applicable, add the following below this CDDL HEADER, with the
+ * fields enclosed by brackets "[]" replaced with your own identifying
+ * information: Portions Copyright [yyyy] [name of copyright owner]
+ *
+ * CDDL HEADER END
+ */
+
+/*
+ * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved.
+ * Copyright (c) 2012, 2018 by Delphix. All rights reserved.
+ * Copyright 2017 Nexenta Systems, Inc.
+ */
+
+#ifndef _SYS_ZAP_H
+#define _SYS_ZAP_H
+
+/*
+ * ZAP - ZFS Attribute Processor
+ *
+ * The ZAP is a module which sits on top of the DMU (Data Management
+ * Unit) and implements a higher-level storage primitive using DMU
+ * objects. Its primary consumer is the ZPL (ZFS Posix Layer).
+ *
+ * A "zapobj" is a DMU object which the ZAP uses to stores attributes.
+ * Users should use only zap routines to access a zapobj - they should
+ * not access the DMU object directly using DMU routines.
+ *
+ * The attributes stored in a zapobj are name-value pairs. The name is
+ * a zero-terminated string of up to ZAP_MAXNAMELEN bytes (including
+ * terminating NULL). The value is an array of integers, which may be
+ * 1, 2, 4, or 8 bytes long. The total space used by the array (number
+ * of integers * integer length) can be up to ZAP_MAXVALUELEN bytes.
+ * Note that an 8-byte integer value can be used to store the location
+ * (object number) of another dmu object (which may be itself a zapobj).
+ * Note that you can use a zero-length attribute to store a single bit
+ * of information - the attribute is present or not.
+ *
+ * The ZAP routines are thread-safe. However, you must observe the
+ * DMU's restriction that a transaction may not be operated on
+ * concurrently.
+ *
+ * Any of the routines that return an int may return an I/O error (EIO
+ * or ECHECKSUM).
+ *
+ *
+ * Implementation / Performance Notes:
+ *
+ * The ZAP is intended to operate most efficiently on attributes with
+ * short (49 bytes or less) names and single 8-byte values, for which
+ * the microzap will be used. The ZAP should be efficient enough so
+ * that the user does not need to cache these attributes.
+ *
+ * The ZAP's locking scheme makes its routines thread-safe. Operations
+ * on different zapobjs will be processed concurrently. Operations on
+ * the same zapobj which only read data will be processed concurrently.
+ * Operations on the same zapobj which modify data will be processed
+ * concurrently when there are many attributes in the zapobj (because
+ * the ZAP uses per-block locking - more than 128 * (number of cpus)
+ * small attributes will suffice).
+ */
+
+/*
+ * We're using zero-terminated byte strings (ie. ASCII or UTF-8 C
+ * strings) for the names of attributes, rather than a byte string
+ * bounded by an explicit length. If some day we want to support names
+ * in character sets which have embedded zeros (eg. UTF-16, UTF-32),
+ * we'll have to add routines for using length-bounded strings.
+ */
+
+#include <sys/dmu.h>
+#include <sys/refcount.h>
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+/*
+ * Specifies matching criteria for ZAP lookups.
+ * MT_NORMALIZE Use ZAP normalization flags, which can include both
+ * unicode normalization and case-insensitivity.
+ * MT_MATCH_CASE Do case-sensitive lookups even if MT_NORMALIZE is
+ * specified and ZAP normalization flags include
+ * U8_TEXTPREP_TOUPPER.
+ */
+typedef enum matchtype {
+ MT_NORMALIZE = 1 << 0,
+ MT_MATCH_CASE = 1 << 1,
+} matchtype_t;
+
+typedef enum zap_flags {
+ /* Use 64-bit hash value (serialized cursors will always use 64-bits) */
+ ZAP_FLAG_HASH64 = 1 << 0,
+ /* Key is binary, not string (zap_add_uint64() can be used) */
+ ZAP_FLAG_UINT64_KEY = 1 << 1,
+ /*
+ * First word of key (which must be an array of uint64) is
+ * already randomly distributed.
+ */
+ ZAP_FLAG_PRE_HASHED_KEY = 1 << 2,
+} zap_flags_t;
+
+/*
+ * Create a new zapobj with no attributes and return its object number.
+ *
+ * dnodesize specifies the on-disk size of the dnode for the new zapobj.
+ * Valid values are multiples of 512 up to DNODE_MAX_SIZE.
+ */
+uint64_t zap_create(objset_t *ds, dmu_object_type_t ot,
+ dmu_object_type_t bonustype, int bonuslen, dmu_tx_t *tx);
+uint64_t zap_create_dnsize(objset_t *ds, dmu_object_type_t ot,
+ dmu_object_type_t bonustype, int bonuslen, int dnodesize, dmu_tx_t *tx);
+uint64_t zap_create_norm(objset_t *ds, int normflags, dmu_object_type_t ot,
+ dmu_object_type_t bonustype, int bonuslen, dmu_tx_t *tx);
+uint64_t zap_create_norm_dnsize(objset_t *ds, int normflags,
+ dmu_object_type_t ot, dmu_object_type_t bonustype, int bonuslen,
+ int dnodesize, dmu_tx_t *tx);
+uint64_t zap_create_flags(objset_t *os, int normflags, zap_flags_t flags,
+ dmu_object_type_t ot, int leaf_blockshift, int indirect_blockshift,
+ dmu_object_type_t bonustype, int bonuslen, dmu_tx_t *tx);
+uint64_t zap_create_flags_dnsize(objset_t *os, int normflags,
+ zap_flags_t flags, dmu_object_type_t ot, int leaf_blockshift,
+ int indirect_blockshift, dmu_object_type_t bonustype, int bonuslen,
+ int dnodesize, dmu_tx_t *tx);
+uint64_t zap_create_link(objset_t *os, dmu_object_type_t ot,
+ uint64_t parent_obj, const char *name, dmu_tx_t *tx);
+uint64_t zap_create_link_dnsize(objset_t *os, dmu_object_type_t ot,
+ uint64_t parent_obj, const char *name, int dnodesize, dmu_tx_t *tx);
+
+/*
+ * Initialize an already-allocated object.
+ */
+void mzap_create_impl(objset_t *os, uint64_t obj, int normflags,
+ zap_flags_t flags, dmu_tx_t *tx);
+
+/*
+ * Create a new zapobj with no attributes from the given (unallocated)
+ * object number.
+ */
+int zap_create_claim(objset_t *ds, uint64_t obj, dmu_object_type_t ot,
+ dmu_object_type_t bonustype, int bonuslen, dmu_tx_t *tx);
+int zap_create_claim_dnsize(objset_t *ds, uint64_t obj, dmu_object_type_t ot,
+ dmu_object_type_t bonustype, int bonuslen, int dnodesize, dmu_tx_t *tx);
+int zap_create_claim_norm(objset_t *ds, uint64_t obj,
+ int normflags, dmu_object_type_t ot,
+ dmu_object_type_t bonustype, int bonuslen, dmu_tx_t *tx);
+int zap_create_claim_norm_dnsize(objset_t *ds, uint64_t obj,
+ int normflags, dmu_object_type_t ot,
+ dmu_object_type_t bonustype, int bonuslen, int dnodesize, dmu_tx_t *tx);
+
+/*
+ * The zapobj passed in must be a valid ZAP object for all of the
+ * following routines.
+ */
+
+/*
+ * Destroy this zapobj and all its attributes.
+ *
+ * Frees the object number using dmu_object_free.
+ */
+int zap_destroy(objset_t *ds, uint64_t zapobj, dmu_tx_t *tx);
+
+/*
+ * Manipulate attributes.
+ *
+ * 'integer_size' is in bytes, and must be 1, 2, 4, or 8.
+ */
+
+/*
+ * Retrieve the contents of the attribute with the given name.
+ *
+ * If the requested attribute does not exist, the call will fail and
+ * return ENOENT.
+ *
+ * If 'integer_size' is smaller than the attribute's integer size, the
+ * call will fail and return EINVAL.
+ *
+ * If 'integer_size' is equal to or larger than the attribute's integer
+ * size, the call will succeed and return 0.
+ *
+ * When converting to a larger integer size, the integers will be treated as
+ * unsigned (ie. no sign-extension will be performed).
+ *
+ * 'num_integers' is the length (in integers) of 'buf'.
+ *
+ * If the attribute is longer than the buffer, as many integers as will
+ * fit will be transferred to 'buf'. If the entire attribute was not
+ * transferred, the call will return EOVERFLOW.
+ */
+int zap_lookup(objset_t *ds, uint64_t zapobj, const char *name,
+ uint64_t integer_size, uint64_t num_integers, void *buf);
+
+/*
+ * If rn_len is nonzero, realname will be set to the name of the found
+ * entry (which may be different from the requested name if
+ * MT_NORMALIZE was specified).
+ *
+ * If normalization_conflictp is not NULL, it will be set if there is
+ * another name with the same case/unicode normalized form.
+ */
+int zap_lookup_norm(objset_t *ds, uint64_t zapobj, const char *name,
+ uint64_t integer_size, uint64_t num_integers, void *buf,
+ matchtype_t mt, char *realname, int rn_len,
+ boolean_t *normalization_conflictp);
+int zap_lookup_uint64(objset_t *os, uint64_t zapobj, const uint64_t *key,
+ int key_numints, uint64_t integer_size, uint64_t num_integers, void *buf);
+int zap_contains(objset_t *ds, uint64_t zapobj, const char *name);
+int zap_prefetch_uint64(objset_t *os, uint64_t zapobj, const uint64_t *key,
+ int key_numints);
+int zap_lookup_by_dnode(dnode_t *dn, const char *name,
+ uint64_t integer_size, uint64_t num_integers, void *buf);
+int zap_lookup_norm_by_dnode(dnode_t *dn, const char *name,
+ uint64_t integer_size, uint64_t num_integers, void *buf,
+ matchtype_t mt, char *realname, int rn_len,
+ boolean_t *ncp);
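+
+/*
+ * Editorial sketch (guarded out; 'os', 'zapobj' and 'val' assumed from
+ * the caller): a normalized lookup that also reports the stored
+ * spelling of the matched name. This only behaves case-insensitively
+ * if the zapobj was created with the matching normflags, e.g. via
+ * zap_create_norm().
+ */
+#if 0
+char realname[ZAP_MAXNAMELEN];
+boolean_t conflict;
+int err = zap_lookup_norm(os, zapobj, "ReadMe.TXT",
+    sizeof (uint64_t), 1, &val, MT_NORMALIZE,
+    realname, sizeof (realname), &conflict);
+#endif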
+
+int zap_count_write_by_dnode(dnode_t *dn, const char *name,
+ int add, zfs_refcount_t *towrite, zfs_refcount_t *tooverwrite);
+
+/*
+ * Create an attribute with the given name and value.
+ *
+ * If an attribute with the given name already exists, the call will
+ * fail and return EEXIST.
+ */
+int zap_add(objset_t *ds, uint64_t zapobj, const char *key,
+ int integer_size, uint64_t num_integers,
+ const void *val, dmu_tx_t *tx);
+int zap_add_by_dnode(dnode_t *dn, const char *key,
+ int integer_size, uint64_t num_integers,
+ const void *val, dmu_tx_t *tx);
+int zap_add_uint64(objset_t *ds, uint64_t zapobj, const uint64_t *key,
+ int key_numints, int integer_size, uint64_t num_integers,
+ const void *val, dmu_tx_t *tx);
+
+/*
+ * Set the attribute with the given name to the given value. If an
+ * attribute with the given name does not exist, it will be created. If
+ * an attribute with the given name already exists, the previous value
+ * will be overwritten. The integer_size may be different from the
+ * existing attribute's integer size, in which case the attribute's
+ * integer size will be updated to the new value.
+ */
+int zap_update(objset_t *ds, uint64_t zapobj, const char *name,
+ int integer_size, uint64_t num_integers, const void *val, dmu_tx_t *tx);
+int zap_update_uint64(objset_t *os, uint64_t zapobj, const uint64_t *key,
+ int key_numints,
+ int integer_size, uint64_t num_integers, const void *val, dmu_tx_t *tx);
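+
+/*
+ * Editorial usage sketch for the calls above (guarded out; 'os',
+ * 'zapobj' and 'tx' assumed to come from the caller, error handling
+ * elided).
+ */
+#if 0
+uint64_t guid = 0x1234;
+/* Create-or-overwrite a single 8-byte integer attribute. */
+VERIFY0(zap_update(os, zapobj, "example:guid",
+    sizeof (uint64_t), 1, &guid, tx));
+/* Read it back; zap_lookup() returns ENOENT if it is absent. */
+uint64_t val;
+VERIFY0(zap_lookup(os, zapobj, "example:guid",
+    sizeof (uint64_t), 1, &val));
+#endif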
+
+/*
+ * Get the length (in integers) and the integer size of the specified
+ * attribute.
+ *
+ * If the requested attribute does not exist, the call will fail and
+ * return ENOENT.
+ */
+int zap_length(objset_t *ds, uint64_t zapobj, const char *name,
+ uint64_t *integer_size, uint64_t *num_integers);
+int zap_length_uint64(objset_t *os, uint64_t zapobj, const uint64_t *key,
+ int key_numints, uint64_t *integer_size, uint64_t *num_integers);
+
+/*
+ * Remove the specified attribute.
+ *
+ * If the specified attribute does not exist, the call will fail and
+ * return ENOENT.
+ */
+int zap_remove(objset_t *ds, uint64_t zapobj, const char *name, dmu_tx_t *tx);
+int zap_remove_norm(objset_t *ds, uint64_t zapobj, const char *name,
+ matchtype_t mt, dmu_tx_t *tx);
+int zap_remove_by_dnode(dnode_t *dn, const char *name, dmu_tx_t *tx);
+int zap_remove_uint64(objset_t *os, uint64_t zapobj, const uint64_t *key,
+ int key_numints, dmu_tx_t *tx);
+
+/*
+ * Returns (in *count) the number of attributes in the specified zap
+ * object.
+ */
+int zap_count(objset_t *ds, uint64_t zapobj, uint64_t *count);
+
+/*
+ * Returns (in name) the name of the entry whose first integer
+ * (za_first_integer), masked with 'mask', equals 'value', or ENOENT
+ * if no such entry exists. The string pointed to by name must be at
+ * least 256 bytes long. If mask == 0, the match must be exact (i.e.,
+ * the same as mask = -1ULL).
+ */
+int zap_value_search(objset_t *os, uint64_t zapobj,
+ uint64_t value, uint64_t mask, char *name);
+
+/*
+ * Transfer all the entries from fromobj into intoobj. Only works on
+ * int_size=8 num_integers=1 values. Fails if there are any duplicated
+ * entries.
+ */
+int zap_join(objset_t *os, uint64_t fromobj, uint64_t intoobj, dmu_tx_t *tx);
+
+/* Same as zap_join, but set the values to 'value'. */
+int zap_join_key(objset_t *os, uint64_t fromobj, uint64_t intoobj,
+ uint64_t value, dmu_tx_t *tx);
+
+/* Same as zap_join, but add together any duplicated entries. */
+int zap_join_increment(objset_t *os, uint64_t fromobj, uint64_t intoobj,
+ dmu_tx_t *tx);
+
+/*
+ * Manipulate entries where the name + value are the "same" (the name is
+ * a stringified version of the value).
+ */
+int zap_add_int(objset_t *os, uint64_t obj, uint64_t value, dmu_tx_t *tx);
+int zap_remove_int(objset_t *os, uint64_t obj, uint64_t value, dmu_tx_t *tx);
+int zap_lookup_int(objset_t *os, uint64_t obj, uint64_t value);
+int zap_increment_int(objset_t *os, uint64_t obj, uint64_t key, int64_t delta,
+ dmu_tx_t *tx);
+
+/* Here the key is an int and the value is a different int. */
+int zap_add_int_key(objset_t *os, uint64_t obj,
+ uint64_t key, uint64_t value, dmu_tx_t *tx);
+int zap_update_int_key(objset_t *os, uint64_t obj,
+ uint64_t key, uint64_t value, dmu_tx_t *tx);
+int zap_lookup_int_key(objset_t *os, uint64_t obj,
+ uint64_t key, uint64_t *valuep);
+
+int zap_increment(objset_t *os, uint64_t obj, const char *name, int64_t delta,
+ dmu_tx_t *tx);
+
+struct zap;
+struct zap_leaf;
+typedef struct zap_cursor {
+ /* This structure is opaque! */
+ objset_t *zc_objset;
+ struct zap *zc_zap;
+ struct zap_leaf *zc_leaf;
+ uint64_t zc_zapobj;
+ uint64_t zc_serialized;
+ uint64_t zc_hash;
+ uint32_t zc_cd;
+ boolean_t zc_prefetch;
+} zap_cursor_t;
+
+typedef struct {
+ int za_integer_length;
+ /*
+ * za_normalization_conflict will be set if there are additional
+ * entries with this normalized form (eg, "foo" and "Foo").
+ */
+ boolean_t za_normalization_conflict;
+ uint64_t za_num_integers;
+ uint64_t za_first_integer; /* no sign extension for <8byte ints */
+ char za_name[ZAP_MAXNAMELEN];
+} zap_attribute_t;
+
+/*
+ * The interface for listing all the attributes of a zapobj can be
+ * thought of as cursor moving down a list of the attributes one by
+ * one. The cookie returned by the zap_cursor_serialize routine is
+ * persistent across system calls (and across reboot, even).
+ */
+
+/*
+ * Initialize a zap cursor, pointing to the "first" attribute of the
+ * zapobj. You must _fini the cursor when you are done with it.
+ */
+void zap_cursor_init(zap_cursor_t *zc, objset_t *ds, uint64_t zapobj);
+void zap_cursor_init_noprefetch(zap_cursor_t *zc, objset_t *os,
+ uint64_t zapobj);
+void zap_cursor_fini(zap_cursor_t *zc);
+
+/*
+ * Get the attribute currently pointed to by the cursor. Returns
+ * ENOENT if at the end of the attributes.
+ */
+int zap_cursor_retrieve(zap_cursor_t *zc, zap_attribute_t *za);
+
+/*
+ * Advance the cursor to the next attribute.
+ */
+void zap_cursor_advance(zap_cursor_t *zc);
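+
+/*
+ * Typical iteration over all entries of a zap object (an illustrative
+ * sketch, not part of the original interface); "os" and "zapobj" are
+ * assumed to be a held objset and a valid zap object number:
+ *
+ *	zap_cursor_t zc;
+ *	zap_attribute_t za;
+ *
+ *	for (zap_cursor_init(&zc, os, zapobj);
+ *	    zap_cursor_retrieve(&zc, &za) == 0;
+ *	    zap_cursor_advance(&zc)) {
+ *		(use za.za_name, za.za_first_integer, ...)
+ *	}
+ *	zap_cursor_fini(&zc);
+ */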
+
+/*
+ * Get a persistent cookie pointing to the current position of the zap
+ * cursor.  The low 4 bits in the cookie are always zero, and thus can
+ * be used to differentiate a serialized cookie from a different type
+ * of value.  The cookie will be less than 2^32 as long as there are
+ * fewer than 2^22 (4.2 million) entries in the zap object.
+ */
+uint64_t zap_cursor_serialize(zap_cursor_t *zc);
+
+/*
+ * Advance the cursor to the attribute having the given key.
+ */
+int zap_cursor_move_to_key(zap_cursor_t *zc, const char *name, matchtype_t mt);
+
+/*
+ * Initialize a zap cursor pointing to the position recorded by
+ * zap_cursor_serialize (in the "serialized" argument). You can also
+ * use a "serialized" argument of 0 to start at the beginning of the
+ * zapobj (ie. zap_cursor_init_serialized(..., 0) is equivalent to
+ * zap_cursor_init(...).)
+ */
+void zap_cursor_init_serialized(zap_cursor_t *zc, objset_t *ds,
+ uint64_t zapobj, uint64_t serialized);
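+
+/*
+ * Illustrative sketch (not part of the original interface): to suspend
+ * an iteration and resume it later, e.g. across system calls:
+ *
+ *	uint64_t cookie = zap_cursor_serialize(&zc);
+ *	zap_cursor_fini(&zc);
+ *	...
+ *	zap_cursor_init_serialized(&zc, os, zapobj, cookie);
+ */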
+
+
+#define ZAP_HISTOGRAM_SIZE 10
+
+typedef struct zap_stats {
+ /*
+ * Size of the pointer table (in number of entries).
+ * This is always a power of 2, or zero if it's a microzap.
+ * In general, it should be considerably greater than zs_num_leafs.
+ */
+ uint64_t zs_ptrtbl_len;
+
+ uint64_t zs_blocksize; /* size of zap blocks */
+
+ /*
+ * The number of blocks used. Note that some blocks may be
+ * wasted because old ptrtbl's and large name/value blocks are
+ * not reused. (Although their space is reclaimed, we don't
+ * reuse those offsets in the object.)
+ */
+ uint64_t zs_num_blocks;
+
+ /*
+ * Pointer table values from zap_ptrtbl in the zap_phys_t
+ */
+ uint64_t zs_ptrtbl_nextblk; /* next (larger) copy start block */
+	uint64_t zs_ptrtbl_blks_copied;	/* number of source blocks copied */
+ uint64_t zs_ptrtbl_zt_blk; /* starting block number */
+ uint64_t zs_ptrtbl_zt_numblks; /* number of blocks */
+ uint64_t zs_ptrtbl_zt_shift; /* bits to index it */
+
+ /*
+ * Values of the other members of the zap_phys_t
+ */
+ uint64_t zs_block_type; /* ZBT_HEADER */
+ uint64_t zs_magic; /* ZAP_MAGIC */
+ uint64_t zs_num_leafs; /* The number of leaf blocks */
+ uint64_t zs_num_entries; /* The number of zap entries */
+ uint64_t zs_salt; /* salt to stir into hash function */
+
+ /*
+	 * Histograms.  For all histograms, the last index
+	 * (ZAP_HISTOGRAM_SIZE-1) includes any values which are greater
+	 * than what can be represented.  For example,
+	 * zs_blocks_with_n5_entries[ZAP_HISTOGRAM_SIZE-1] is the number
+	 * of leafs with more than 45 entries.
+ */
+
+ /*
+	 * zs_leafs_with_2n_pointers[n] is the number of leafs with
+	 * 2^n pointers to them.
+ */
+ uint64_t zs_leafs_with_2n_pointers[ZAP_HISTOGRAM_SIZE];
+
+ /*
+	 * zs_blocks_with_n5_entries[n] is the number of leafs with
+	 * [n*5, (n+1)*5) entries.  In the current implementation, there
+ * can be at most 55 entries in any block, but there may be
+ * fewer if the name or value is large, or the block is not
+ * completely full.
+ */
+ uint64_t zs_blocks_with_n5_entries[ZAP_HISTOGRAM_SIZE];
+
+ /*
+	 * zs_blocks_n_tenths_full[n] is the number of leafs whose
+ * fullness is in the range [n/10, (n+1)/10).
+ */
+ uint64_t zs_blocks_n_tenths_full[ZAP_HISTOGRAM_SIZE];
+
+ /*
+ * zs_entries_using_n_chunks[n] is the number of entries which
+ * consume n 24-byte chunks. (Note, large names/values only use
+ * one chunk, but contribute to zs_num_blocks_large.)
+ */
+ uint64_t zs_entries_using_n_chunks[ZAP_HISTOGRAM_SIZE];
+
+ /*
+ * zs_buckets_with_n_entries[n] is the number of buckets (each
+ * leaf has 64 buckets) with n entries.
+ * zs_buckets_with_n_entries[1] should be very close to
+ * zs_num_entries.
+ */
+ uint64_t zs_buckets_with_n_entries[ZAP_HISTOGRAM_SIZE];
+} zap_stats_t;
+
+/*
+ * Get statistics about a ZAP object. Note: you need to be aware of the
+ * internal implementation of the ZAP to correctly interpret some of the
+ * statistics. This interface shouldn't be relied on unless you really
+ * know what you're doing.
+ */
+int zap_get_stats(objset_t *ds, uint64_t zapobj, zap_stats_t *zs);
+
+#ifdef __cplusplus
+}
+#endif
+
+#endif /* _SYS_ZAP_H */
diff --git a/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/zap_impl.h b/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/zap_impl.h
new file mode 100644
index 000000000000..912b8b219c4c
--- /dev/null
+++ b/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/zap_impl.h
@@ -0,0 +1,242 @@
+/*
+ * CDDL HEADER START
+ *
+ * The contents of this file are subject to the terms of the
+ * Common Development and Distribution License (the "License").
+ * You may not use this file except in compliance with the License.
+ *
+ * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
+ * or http://www.opensolaris.org/os/licensing.
+ * See the License for the specific language governing permissions
+ * and limitations under the License.
+ *
+ * When distributing Covered Code, include this CDDL HEADER in each
+ * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
+ * If applicable, add the following below this CDDL HEADER, with the
+ * fields enclosed by brackets "[]" replaced with your own identifying
+ * information: Portions Copyright [yyyy] [name of copyright owner]
+ *
+ * CDDL HEADER END
+ */
+
+/*
+ * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved.
+ * Copyright (c) 2013, 2016 by Delphix. All rights reserved.
+ * Copyright (c) 2014 Spectra Logic Corporation, All rights reserved.
+ * Copyright (c) 2014 Integros [integros.com]
+ * Copyright 2017 Nexenta Systems, Inc.
+ */
+
+#ifndef _SYS_ZAP_IMPL_H
+#define _SYS_ZAP_IMPL_H
+
+#include <sys/zap.h>
+#include <sys/zfs_context.h>
+#include <sys/avl.h>
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+extern int fzap_default_block_shift;
+
+#define ZAP_MAGIC 0x2F52AB2ABULL
+
+#define FZAP_BLOCK_SHIFT(zap) ((zap)->zap_f.zap_block_shift)
+
+#define MZAP_ENT_LEN 64
+#define MZAP_NAME_LEN (MZAP_ENT_LEN - 8 - 4 - 2)
+#define MZAP_MAX_BLKSZ SPA_OLD_MAXBLOCKSIZE
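+/*
+ * MZAP_NAME_LEN is the space left in a 64-byte microzap entry after
+ * the fixed fields of mzap_ent_phys_t below (8-byte mze_value,
+ * 4-byte mze_cd, 2-byte mze_pad): 64 - 8 - 4 - 2 = 50 bytes.
+ */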
+
+#define ZAP_NEED_CD (-1U)
+
+typedef struct mzap_ent_phys {
+ uint64_t mze_value;
+ uint32_t mze_cd;
+ uint16_t mze_pad; /* in case we want to chain them someday */
+ char mze_name[MZAP_NAME_LEN];
+} mzap_ent_phys_t;
+
+typedef struct mzap_phys {
+ uint64_t mz_block_type; /* ZBT_MICRO */
+ uint64_t mz_salt;
+ uint64_t mz_normflags;
+ uint64_t mz_pad[5];
+ mzap_ent_phys_t mz_chunk[1];
+ /* actually variable size depending on block size */
+} mzap_phys_t;
+
+typedef struct mzap_ent {
+ avl_node_t mze_node;
+ int mze_chunkid;
+ uint64_t mze_hash;
+ uint32_t mze_cd; /* copy from mze_phys->mze_cd */
+} mzap_ent_t;
+
+#define MZE_PHYS(zap, mze) \
+ (&zap_m_phys(zap)->mz_chunk[(mze)->mze_chunkid])
+
+/*
+ * The (fat) zap is stored in one object. It is an array of
+ * 1<<FZAP_BLOCK_SHIFT byte blocks. The layout looks like one of:
+ *
+ * ptrtbl fits in first block:
+ * [zap_phys_t zap_ptrtbl_shift < 6] [zap_leaf_t] ...
+ *
+ * ptrtbl too big for first block:
+ * [zap_phys_t zap_ptrtbl_shift >= 6] [zap_leaf_t] [ptrtbl] ...
+ *
+ */
+
+struct dmu_buf;
+struct zap_leaf;
+
+#define ZBT_LEAF ((1ULL << 63) + 0)
+#define ZBT_HEADER ((1ULL << 63) + 1)
+#define ZBT_MICRO ((1ULL << 63) + 3)
+/* any other values are ptrtbl blocks */
+
+/*
+ * the embedded pointer table takes up half a block:
+ * block size / entry size (2^3) / 2
+ */
+#define ZAP_EMBEDDED_PTRTBL_SHIFT(zap) (FZAP_BLOCK_SHIFT(zap) - 3 - 1)
+
+/*
+ * The embedded pointer table starts half-way through the block. Since
+ * the pointer table itself is half the block, it starts at (64-bit)
+ * word number (1<<ZAP_EMBEDDED_PTRTBL_SHIFT(zap)).
+ */
+#define ZAP_EMBEDDED_PTRTBL_ENT(zap, idx) \
+ ((uint64_t *)zap_f_phys(zap)) \
+ [(idx) + (1<<ZAP_EMBEDDED_PTRTBL_SHIFT(zap))]
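+
+/*
+ * Worked example (illustrative): for a 16K fat-zap block,
+ * FZAP_BLOCK_SHIFT(zap) == 14, so ZAP_EMBEDDED_PTRTBL_SHIFT(zap) ==
+ * 14 - 3 - 1 == 10.  The embedded table thus holds 2^10 == 1024
+ * 64-bit entries, occupying words 1024..2047 of the 2048-word block,
+ * i.e. exactly its second half.
+ */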
+
+/*
+ * TAKE NOTE:
+ * If zap_phys_t is modified, zap_byteswap() must be modified.
+ */
+typedef struct zap_phys {
+ uint64_t zap_block_type; /* ZBT_HEADER */
+ uint64_t zap_magic; /* ZAP_MAGIC */
+
+ struct zap_table_phys {
+ uint64_t zt_blk; /* starting block number */
+ uint64_t zt_numblks; /* number of blocks */
+ uint64_t zt_shift; /* bits to index it */
+ uint64_t zt_nextblk; /* next (larger) copy start block */
+		uint64_t zt_blks_copied; /* number of source blocks copied */
+ } zap_ptrtbl;
+
+ uint64_t zap_freeblk; /* the next free block */
+ uint64_t zap_num_leafs; /* number of leafs */
+ uint64_t zap_num_entries; /* number of entries */
+ uint64_t zap_salt; /* salt to stir into hash function */
+ uint64_t zap_normflags; /* flags for u8_textprep_str() */
+ uint64_t zap_flags; /* zap_flags_t */
+ /*
+ * This structure is followed by padding, and then the embedded
+ * pointer table. The embedded pointer table takes up second
+ * half of the block. It is accessed using the
+ * ZAP_EMBEDDED_PTRTBL_ENT() macro.
+ */
+} zap_phys_t;
+
+typedef struct zap_table_phys zap_table_phys_t;
+
+typedef struct zap {
+ dmu_buf_user_t zap_dbu;
+ objset_t *zap_objset;
+ uint64_t zap_object;
+ struct dmu_buf *zap_dbuf;
+ krwlock_t zap_rwlock;
+ boolean_t zap_ismicro;
+ int zap_normflags;
+ uint64_t zap_salt;
+ union {
+ struct {
+ /*
+ * zap_num_entries_mtx protects
+ * zap_num_entries
+ */
+ kmutex_t zap_num_entries_mtx;
+ int zap_block_shift;
+ } zap_fat;
+ struct {
+ int16_t zap_num_entries;
+ int16_t zap_num_chunks;
+ int16_t zap_alloc_next;
+ avl_tree_t zap_avl;
+ } zap_micro;
+ } zap_u;
+} zap_t;
+
+inline zap_phys_t *
+zap_f_phys(zap_t *zap)
+{
+ return (zap->zap_dbuf->db_data);
+}
+
+inline mzap_phys_t *
+zap_m_phys(zap_t *zap)
+{
+ return (zap->zap_dbuf->db_data);
+}
+
+typedef struct zap_name {
+ zap_t *zn_zap;
+ int zn_key_intlen;
+ const void *zn_key_orig;
+ int zn_key_orig_numints;
+ const void *zn_key_norm;
+ int zn_key_norm_numints;
+ uint64_t zn_hash;
+ matchtype_t zn_matchtype;
+ int zn_normflags;
+ char zn_normbuf[ZAP_MAXNAMELEN];
+} zap_name_t;
+
+#define zap_f zap_u.zap_fat
+#define zap_m zap_u.zap_micro
+
+boolean_t zap_match(zap_name_t *zn, const char *matchname);
+int zap_lockdir(objset_t *os, uint64_t obj, dmu_tx_t *tx,
+ krw_t lti, boolean_t fatreader, boolean_t adding, void *tag, zap_t **zapp);
+void zap_unlockdir(zap_t *zap, void *tag);
+void zap_evict_sync(void *dbu);
+zap_name_t *zap_name_alloc(zap_t *zap, const char *key, matchtype_t mt);
+void zap_name_free(zap_name_t *zn);
+int zap_hashbits(zap_t *zap);
+uint32_t zap_maxcd(zap_t *zap);
+uint64_t zap_getflags(zap_t *zap);
+
+#define ZAP_HASH_IDX(hash, n) (((n) == 0) ? 0 : ((hash) >> (64 - (n))))
+
+void fzap_byteswap(void *buf, size_t size);
+int fzap_count(zap_t *zap, uint64_t *count);
+int fzap_lookup(zap_name_t *zn,
+ uint64_t integer_size, uint64_t num_integers, void *buf,
+ char *realname, int rn_len, boolean_t *normalization_conflictp);
+void fzap_prefetch(zap_name_t *zn);
+int fzap_add(zap_name_t *zn, uint64_t integer_size, uint64_t num_integers,
+ const void *val, void *tag, dmu_tx_t *tx);
+int fzap_update(zap_name_t *zn,
+ int integer_size, uint64_t num_integers, const void *val,
+ void *tag, dmu_tx_t *tx);
+int fzap_length(zap_name_t *zn,
+ uint64_t *integer_size, uint64_t *num_integers);
+int fzap_remove(zap_name_t *zn, dmu_tx_t *tx);
+int fzap_cursor_retrieve(zap_t *zap, zap_cursor_t *zc, zap_attribute_t *za);
+void fzap_get_stats(zap_t *zap, zap_stats_t *zs);
+void zap_put_leaf(struct zap_leaf *l);
+
+int fzap_add_cd(zap_name_t *zn,
+ uint64_t integer_size, uint64_t num_integers,
+ const void *val, uint32_t cd, void *tag, dmu_tx_t *tx);
+void fzap_upgrade(zap_t *zap, dmu_tx_t *tx, zap_flags_t flags);
+int fzap_cursor_move_to_key(zap_cursor_t *zc, zap_name_t *zn);
+
+#ifdef __cplusplus
+}
+#endif
+
+#endif /* _SYS_ZAP_IMPL_H */
diff --git a/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/zap_leaf.h b/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/zap_leaf.h
new file mode 100644
index 000000000000..76b3ecc72557
--- /dev/null
+++ b/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/zap_leaf.h
@@ -0,0 +1,248 @@
+/*
+ * CDDL HEADER START
+ *
+ * The contents of this file are subject to the terms of the
+ * Common Development and Distribution License (the "License").
+ * You may not use this file except in compliance with the License.
+ *
+ * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
+ * or http://www.opensolaris.org/os/licensing.
+ * See the License for the specific language governing permissions
+ * and limitations under the License.
+ *
+ * When distributing Covered Code, include this CDDL HEADER in each
+ * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
+ * If applicable, add the following below this CDDL HEADER, with the
+ * fields enclosed by brackets "[]" replaced with your own identifying
+ * information: Portions Copyright [yyyy] [name of copyright owner]
+ *
+ * CDDL HEADER END
+ */
+/*
+ * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved.
+ * Copyright (c) 2014 Spectra Logic Corporation, All rights reserved.
+ */
+
+#ifndef _SYS_ZAP_LEAF_H
+#define _SYS_ZAP_LEAF_H
+
+#include <sys/zap.h>
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+struct zap;
+struct zap_name;
+struct zap_stats;
+
+#define ZAP_LEAF_MAGIC 0x2AB1EAF
+
+/* chunk size = 24 bytes */
+#define ZAP_LEAF_CHUNKSIZE 24
+
+/*
+ * The amount of space available for chunks is:
+ * block size (1<<l->l_bs) - hash entry size (2) * number of hash
+ * entries - header space (2*chunksize)
+ */
+#define ZAP_LEAF_NUMCHUNKS(l) \
+ (((1<<(l)->l_bs) - 2*ZAP_LEAF_HASH_NUMENTRIES(l)) / \
+ ZAP_LEAF_CHUNKSIZE - 2)
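+
+/*
+ * Worked example (illustrative): for a 16K leaf block (l_bs == 14),
+ * ZAP_LEAF_HASH_NUMENTRIES(l) == 1 << (14 - 5) == 512, so
+ * ZAP_LEAF_NUMCHUNKS(l) == (16384 - 2*512) / 24 - 2 == 638 chunks.
+ */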
+
+/*
+ * The amount of space within the chunk available for the array is:
+ * chunk size - space for type (1) - space for next pointer (2)
+ */
+#define ZAP_LEAF_ARRAY_BYTES (ZAP_LEAF_CHUNKSIZE - 3)
+
+#define ZAP_LEAF_ARRAY_NCHUNKS(bytes) \
+ (((bytes)+ZAP_LEAF_ARRAY_BYTES-1)/ZAP_LEAF_ARRAY_BYTES)
+
+/*
+ * Low water mark: when there are only this many chunks free, start
+ * growing the ptrtbl. Ideally, this should be larger than a
+ * "reasonably-sized" entry. 20 chunks is more than enough for the
+ * largest directory entry (MAXNAMELEN (256) byte name, 8-byte value),
+ * while still being only around 3% for 16k blocks.
+ */
+#define ZAP_LEAF_LOW_WATER (20)
+
+/*
+ * The leaf hash table has block size / 2^5 (32) entries, which should
+ * be more than enough for the maximum number of entries, which is
+ * less than block size / CHUNKSIZE (24) / minimum number of chunks
+ * per entry (3).
+ */
+#define ZAP_LEAF_HASH_SHIFT(l) ((l)->l_bs - 5)
+#define ZAP_LEAF_HASH_NUMENTRIES(l) (1 << ZAP_LEAF_HASH_SHIFT(l))
+
+/*
+ * The chunks start immediately after the hash table. The end of the
+ * hash table is at l_hash + HASH_NUMENTRIES, which we simply cast to a
+ * chunk_t.
+ */
+#define ZAP_LEAF_CHUNK(l, idx) \
+ ((zap_leaf_chunk_t *) \
+ (zap_leaf_phys(l)->l_hash + ZAP_LEAF_HASH_NUMENTRIES(l)))[idx]
+#define ZAP_LEAF_ENTRY(l, idx) (&ZAP_LEAF_CHUNK(l, idx).l_entry)
+
+typedef enum zap_chunk_type {
+ ZAP_CHUNK_FREE = 253,
+ ZAP_CHUNK_ENTRY = 252,
+ ZAP_CHUNK_ARRAY = 251,
+ ZAP_CHUNK_TYPE_MAX = 250
+} zap_chunk_type_t;
+
+#define ZLF_ENTRIES_CDSORTED (1<<0)
+
+/*
+ * TAKE NOTE:
+ * If zap_leaf_phys_t is modified, zap_leaf_byteswap() must be modified.
+ */
+typedef struct zap_leaf_phys {
+ struct zap_leaf_header {
+ /* Public to ZAP */
+ uint64_t lh_block_type; /* ZBT_LEAF */
+ uint64_t lh_pad1;
+ uint64_t lh_prefix; /* hash prefix of this leaf */
+ uint32_t lh_magic; /* ZAP_LEAF_MAGIC */
+		uint16_t lh_nfree;		/* number of free chunks */
+ uint16_t lh_nentries; /* number of entries */
+ uint16_t lh_prefix_len; /* num bits used to id this */
+
+ /* Private to zap_leaf */
+ uint16_t lh_freelist; /* chunk head of free list */
+ uint8_t lh_flags; /* ZLF_* flags */
+ uint8_t lh_pad2[11];
+ } l_hdr; /* 2 24-byte chunks */
+
+ /*
+ * The header is followed by a hash table with
+ * ZAP_LEAF_HASH_NUMENTRIES(zap) entries. The hash table is
+ * followed by an array of ZAP_LEAF_NUMCHUNKS(zap)
+ * zap_leaf_chunk structures. These structures are accessed
+ * with the ZAP_LEAF_CHUNK() macro.
+ */
+
+ uint16_t l_hash[1];
+} zap_leaf_phys_t;
+
+typedef union zap_leaf_chunk {
+ struct zap_leaf_entry {
+ uint8_t le_type; /* always ZAP_CHUNK_ENTRY */
+ uint8_t le_value_intlen; /* size of value's ints */
+ uint16_t le_next; /* next entry in hash chain */
+ uint16_t le_name_chunk; /* first chunk of the name */
+ uint16_t le_name_numints; /* ints in name (incl null) */
+ uint16_t le_value_chunk; /* first chunk of the value */
+ uint16_t le_value_numints; /* value length in ints */
+ uint32_t le_cd; /* collision differentiator */
+ uint64_t le_hash; /* hash value of the name */
+ } l_entry;
+ struct zap_leaf_array {
+ uint8_t la_type; /* always ZAP_CHUNK_ARRAY */
+ uint8_t la_array[ZAP_LEAF_ARRAY_BYTES];
+ uint16_t la_next; /* next blk or CHAIN_END */
+ } l_array;
+ struct zap_leaf_free {
+ uint8_t lf_type; /* always ZAP_CHUNK_FREE */
+ uint8_t lf_pad[ZAP_LEAF_ARRAY_BYTES];
+ uint16_t lf_next; /* next in free list, or CHAIN_END */
+ } l_free;
+} zap_leaf_chunk_t;
+
+typedef struct zap_leaf {
+ dmu_buf_user_t l_dbu;
+ krwlock_t l_rwlock;
+ uint64_t l_blkid; /* 1<<ZAP_BLOCK_SHIFT byte block off */
+ int l_bs; /* block size shift */
+ dmu_buf_t *l_dbuf;
+} zap_leaf_t;
+
+inline zap_leaf_phys_t *
+zap_leaf_phys(zap_leaf_t *l)
+{
+ return (l->l_dbuf->db_data);
+}
+
+typedef struct zap_entry_handle {
+ /* Set by zap_leaf and public to ZAP */
+ uint64_t zeh_num_integers;
+ uint64_t zeh_hash;
+ uint32_t zeh_cd;
+ uint8_t zeh_integer_size;
+
+ /* Private to zap_leaf */
+ uint16_t zeh_fakechunk;
+ uint16_t *zeh_chunkp;
+ zap_leaf_t *zeh_leaf;
+} zap_entry_handle_t;
+
+/*
+ * Return a handle to the named entry, or ENOENT if not found. The hash
+ * value must equal zap_hash(name).
+ */
+extern int zap_leaf_lookup(zap_leaf_t *l,
+ struct zap_name *zn, zap_entry_handle_t *zeh);
+
+/*
+ * Return a handle to the entry with this hash+cd, or the entry with the
+ * next closest hash+cd.
+ */
+extern int zap_leaf_lookup_closest(zap_leaf_t *l,
+ uint64_t hash, uint32_t cd, zap_entry_handle_t *zeh);
+
+/*
+ * Read the first num_integers in the attribute. Integer size
+ * conversion will be done without sign extension. Return EINVAL if
+ * integer_size is too small. Return EOVERFLOW if there are more than
+ * num_integers in the attribute.
+ */
+extern int zap_entry_read(const zap_entry_handle_t *zeh,
+ uint8_t integer_size, uint64_t num_integers, void *buf);
+
+extern int zap_entry_read_name(struct zap *zap, const zap_entry_handle_t *zeh,
+ uint16_t buflen, char *buf);
+
+/*
+ * Replace the value of an existing entry.
+ *
+ * May fail if it runs out of space (ENOSPC).
+ */
+extern int zap_entry_update(zap_entry_handle_t *zeh,
+ uint8_t integer_size, uint64_t num_integers, const void *buf);
+
+/*
+ * Remove an entry.
+ */
+extern void zap_entry_remove(zap_entry_handle_t *zeh);
+
+/*
+ * Create an entry. An equal entry must not exist, and this entry must
+ * belong in this leaf (according to its hash value). Fills in the
+ * entry handle on success. Returns 0 on success or ENOSPC on failure.
+ */
+extern int zap_entry_create(zap_leaf_t *l, struct zap_name *zn, uint32_t cd,
+ uint8_t integer_size, uint64_t num_integers, const void *buf,
+ zap_entry_handle_t *zeh);
+
+/* Determine whether there is another entry with the same normalized form. */
+extern boolean_t zap_entry_normalization_conflict(zap_entry_handle_t *zeh,
+ struct zap_name *zn, const char *name, struct zap *zap);
+
+/*
+ * Other stuff.
+ */
+
+extern void zap_leaf_init(zap_leaf_t *l, boolean_t sort);
+extern void zap_leaf_byteswap(zap_leaf_phys_t *buf, int len);
+extern void zap_leaf_split(zap_leaf_t *l, zap_leaf_t *nl, boolean_t sort);
+extern void zap_leaf_stats(struct zap *zap, zap_leaf_t *l,
+ struct zap_stats *zs);
+
+#ifdef __cplusplus
+}
+#endif
+
+#endif /* _SYS_ZAP_LEAF_H */
diff --git a/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/zcp.h b/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/zcp.h
new file mode 100644
index 000000000000..a6cb575b5b62
--- /dev/null
+++ b/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/zcp.h
@@ -0,0 +1,185 @@
+/*
+ * CDDL HEADER START
+ *
+ * This file and its contents are supplied under the terms of the
+ * Common Development and Distribution License ("CDDL"), version 1.0.
+ * You may only use this file in accordance with the terms of version
+ * 1.0 of the CDDL.
+ *
+ * A full copy of the text of the CDDL should have accompanied this
+ * source. A copy of the CDDL is also available via the Internet at
+ * http://www.illumos.org/license/CDDL.
+ *
+ * CDDL HEADER END
+ */
+
+/*
+ * Copyright (c) 2016, 2017 by Delphix. All rights reserved.
+ */
+
+#ifndef _SYS_ZCP_H
+#define _SYS_ZCP_H
+
+#include <sys/dmu_tx.h>
+#include <sys/dsl_pool.h>
+
+#include "lua.h"
+#include "lualib.h"
+#include "lauxlib.h"
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+#define ZCP_RUN_INFO_KEY "runinfo"
+
+extern uint64_t zfs_lua_max_instrlimit;
+extern uint64_t zfs_lua_max_memlimit;
+
+int zcp_argerror(lua_State *, int, const char *, ...);
+
+int zcp_eval(const char *, const char *, boolean_t, uint64_t, uint64_t,
+ nvpair_t *, nvlist_t *);
+
+int zcp_load_list_lib(lua_State *);
+
+int zcp_load_synctask_lib(lua_State *, boolean_t);
+
+typedef void (zcp_cleanup_t)(void *);
+typedef struct zcp_cleanup_handler {
+ zcp_cleanup_t *zch_cleanup_func;
+ void *zch_cleanup_arg;
+ list_node_t zch_node;
+} zcp_cleanup_handler_t;
+
+typedef struct zcp_alloc_arg {
+ boolean_t aa_must_succeed;
+ int64_t aa_alloc_remaining;
+ int64_t aa_alloc_limit;
+} zcp_alloc_arg_t;
+
+typedef struct zcp_run_info {
+ dsl_pool_t *zri_pool;
+
+ /*
+ * An estimate of the total amount of space consumed by all
+ * synctasks we have successfully performed so far in this
+ * channel program. Used to generate ENOSPC errors for syncfuncs.
+ */
+ int zri_space_used;
+
+ /*
+ * The credentials of the thread which originally invoked the channel
+ * program. Since channel programs are always invoked from the synctask
+ * thread they should always do permissions checks against this cred
+ * rather than the 'current' thread's.
+ */
+ cred_t *zri_cred;
+
+ /*
+ * The tx in which this channel program is running.
+ */
+ dmu_tx_t *zri_tx;
+
+ /*
+ * The maximum number of Lua instructions the channel program is allowed
+ * to execute. If it takes longer than this it will time out. A value
+ * of 0 indicates no instruction limit.
+ */
+ uint64_t zri_maxinstrs;
+
+ /*
+ * The number of Lua instructions the channel program has executed.
+ */
+ uint64_t zri_curinstrs;
+
+ /*
+ * Boolean indicating whether or not the channel program exited
+ * because it timed out.
+ */
+ boolean_t zri_timed_out;
+
+ /*
+	 * Whether the channel program was canceled by the user.
+ */
+ boolean_t zri_canceled;
+
+ /*
+ * Boolean indicating whether or not we are running in syncing
+ * context.
+ */
+ boolean_t zri_sync;
+
+ /*
+ * List of currently registered cleanup handlers, which will be
+ * triggered in the event of a fatal error.
+ */
+ list_t zri_cleanup_handlers;
+
+ /*
+ * The Lua state context of our channel program.
+ */
+ lua_State *zri_state;
+
+ /*
+ * Lua memory allocator arguments.
+ */
+ zcp_alloc_arg_t *zri_allocargs;
+
+ /*
+ * Contains output values from zcp script or error string.
+ */
+ nvlist_t *zri_outnvl;
+
+ /*
+ * The errno number returned to caller of zcp_eval().
+ */
+ int zri_result;
+} zcp_run_info_t;
+
+zcp_run_info_t *zcp_run_info(lua_State *);
+zcp_cleanup_handler_t *zcp_register_cleanup(lua_State *, zcp_cleanup_t, void *);
+void zcp_deregister_cleanup(lua_State *, zcp_cleanup_handler_t *);
+void zcp_cleanup(lua_State *);
+
+/*
+ * Argument parsing routines for channel program callback functions.
+ */
+typedef struct zcp_arg {
+ /*
+ * The name of this argument. For keyword arguments this is the name
+ * functions will use to set the argument. For positional arguments
+	 * the name has no programmatic meaning, but will appear in error
+ * messages and help output.
+ */
+ const char *za_name;
+
+ /*
+ * The Lua type this argument should have (e.g. LUA_TSTRING,
+	 * LUA_TBOOLEAN); see the lua_type() function documentation for a
+	 * complete list.  Calling a function with an argument that does
+ * not match the expected type will result in the program terminating.
+ */
+ const int za_lua_type;
+} zcp_arg_t;
+
+void zcp_parse_args(lua_State *, const char *, const zcp_arg_t *,
+ const zcp_arg_t *);
+int zcp_nvlist_to_lua(lua_State *, nvlist_t *, char *, int);
+int zcp_dataset_hold_error(lua_State *, dsl_pool_t *, const char *, int);
+struct dsl_dataset *zcp_dataset_hold(lua_State *, dsl_pool_t *,
+ const char *, void *);
+
+typedef int (zcp_lib_func_t)(lua_State *);
+typedef struct zcp_lib_info {
+ const char *name;
+ zcp_lib_func_t *func;
+ const zcp_arg_t pargs[4];
+ const zcp_arg_t kwargs[2];
+} zcp_lib_info_t;
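+
+/*
+ * A minimal sketch (illustrative; the names below are hypothetical and
+ * not part of this header) of how a channel program library function
+ * can be described:
+ *
+ *	static int zcp_example(lua_State *);
+ *	static zcp_lib_info_t zcp_example_info = {
+ *		.name = "example",
+ *		.func = zcp_example,
+ *		.pargs = {
+ *		    { .za_name = "dataset", .za_lua_type = LUA_TSTRING },
+ *		    { NULL, 0 }
+ *		},
+ *		.kwargs = {
+ *		    { NULL, 0 }
+ *		}
+ *	};
+ *
+ * zcp_parse_args() can then validate a caller's Lua arguments against
+ * the pargs/kwargs arrays of such a description.
+ */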
+
+#ifdef __cplusplus
+}
+#endif
+
+#endif /* _SYS_ZCP_H */
diff --git a/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/zcp_global.h b/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/zcp_global.h
new file mode 100644
index 000000000000..e227f2f4b7f5
--- /dev/null
+++ b/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/zcp_global.h
@@ -0,0 +1,35 @@
+/*
+ * CDDL HEADER START
+ *
+ * This file and its contents are supplied under the terms of the
+ * Common Development and Distribution License ("CDDL"), version 1.0.
+ * You may only use this file in accordance with the terms of version
+ * 1.0 of the CDDL.
+ *
+ * A full copy of the text of the CDDL should have accompanied this
+ * source. A copy of the CDDL is also available via the Internet at
+ * http://www.illumos.org/license/CDDL.
+ *
+ * CDDL HEADER END
+ */
+
+/*
+ * Copyright (c) 2016 by Delphix. All rights reserved.
+ */
+
+#ifndef _SYS_ZCP_GLOBALS_H
+#define _SYS_ZCP_GLOBALS_H
+
+#include "lua.h"
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+void zcp_load_globals(lua_State *);
+
+#ifdef __cplusplus
+}
+#endif
+
+#endif /* _SYS_ZCP_GLOBALS_H */
diff --git a/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/zcp_iter.h b/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/zcp_iter.h
new file mode 100644
index 000000000000..a021e1ce8917
--- /dev/null
+++ b/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/zcp_iter.h
@@ -0,0 +1,41 @@
+/*
+ * CDDL HEADER START
+ *
+ * The contents of this file are subject to the terms of the
+ * Common Development and Distribution License (the "License").
+ * You may not use this file except in compliance with the License.
+ *
+ * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
+ * or http://www.opensolaris.org/os/licensing.
+ * See the License for the specific language governing permissions
+ * and limitations under the License.
+ *
+ * When distributing Covered Code, include this CDDL HEADER in each
+ * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
+ * If applicable, add the following below this CDDL HEADER, with the
+ * fields enclosed by brackets "[]" replaced with your own identifying
+ * information: Portions Copyright [yyyy] [name of copyright owner]
+ *
+ * CDDL HEADER END
+ */
+
+/*
+ * Copyright (c) 2016 by Delphix. All rights reserved.
+ */
+
+#ifndef _SYS_ZCP_LIST_H
+#define _SYS_ZCP_LIST_H
+
+#include "lua.h"
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+void zcp_load_list_funcs(lua_State *);
+
+#ifdef __cplusplus
+}
+#endif
+
+#endif /* _SYS_ZCP_LIST_H */
diff --git a/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/zcp_prop.h b/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/zcp_prop.h
new file mode 100644
index 000000000000..97b17619565c
--- /dev/null
+++ b/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/zcp_prop.h
@@ -0,0 +1,34 @@
+/*
+ * CDDL HEADER START
+ *
+ * This file and its contents are supplied under the terms of the
+ * Common Development and Distribution License ("CDDL"), version 1.0.
+ * You may only use this file in accordance with the terms of version
+ * 1.0 of the CDDL.
+ *
+ * A full copy of the text of the CDDL should have accompanied this
+ * source. A copy of the CDDL is also available via the Internet at
+ * http://www.illumos.org/license/CDDL.
+ *
+ * CDDL HEADER END
+ */
+
+/*
+ * Copyright (c) 2016 by Delphix. All rights reserved.
+ */
+
+#ifndef _SYS_ZCP_PROP_H
+#define _SYS_ZCP_PROP_H
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+int zcp_load_get_lib(lua_State *state);
+boolean_t prop_valid_for_ds(dsl_dataset_t *ds, zfs_prop_t zfs_prop);
+
+#ifdef __cplusplus
+}
+#endif
+
+#endif /* _SYS_ZCP_PROP_H */
diff --git a/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/zfeature.h b/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/zfeature.h
new file mode 100644
index 000000000000..5abde149a615
--- /dev/null
+++ b/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/zfeature.h
@@ -0,0 +1,73 @@
+/*
+ * CDDL HEADER START
+ *
+ * The contents of this file are subject to the terms of the
+ * Common Development and Distribution License (the "License").
+ * You may not use this file except in compliance with the License.
+ *
+ * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
+ * or http://www.opensolaris.org/os/licensing.
+ * See the License for the specific language governing permissions
+ * and limitations under the License.
+ *
+ * When distributing Covered Code, include this CDDL HEADER in each
+ * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
+ * If applicable, add the following below this CDDL HEADER, with the
+ * fields enclosed by brackets "[]" replaced with your own identifying
+ * information: Portions Copyright [yyyy] [name of copyright owner]
+ *
+ * CDDL HEADER END
+ */
+
+/*
+ * Copyright (c) 2013 by Delphix. All rights reserved.
+ */
+
+#ifndef _SYS_ZFEATURE_H
+#define _SYS_ZFEATURE_H
+
+#include <sys/nvpair.h>
+#include <sys/txg.h>
+#include "zfeature_common.h"
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+#define VALID_FEATURE_FID(fid) ((fid) >= 0 && (fid) < SPA_FEATURES)
+#define VALID_FEATURE_OR_NONE(fid) ((fid) == SPA_FEATURE_NONE || \
+ VALID_FEATURE_FID(fid))
+
+struct spa;
+struct dmu_tx;
+struct objset;
+
+extern void spa_feature_create_zap_objects(struct spa *, struct dmu_tx *);
+extern void spa_feature_enable(struct spa *, spa_feature_t,
+ struct dmu_tx *);
+extern void spa_feature_incr(struct spa *, spa_feature_t, struct dmu_tx *);
+extern void spa_feature_decr(struct spa *, spa_feature_t, struct dmu_tx *);
+extern boolean_t spa_feature_is_enabled(struct spa *, spa_feature_t);
+extern boolean_t spa_feature_is_active(struct spa *, spa_feature_t);
+extern boolean_t spa_feature_enabled_txg(spa_t *spa, spa_feature_t fid,
+ uint64_t *txg);
+extern uint64_t spa_feature_refcount(spa_t *, spa_feature_t, uint64_t);
+extern boolean_t spa_features_check(spa_t *, boolean_t, nvlist_t *, nvlist_t *);
+
+/*
+ * These functions are only exported for zhack and zdb; normal callers should
+ * use the above interfaces.
+ */
+extern int feature_get_refcount(struct spa *, zfeature_info_t *, uint64_t *);
+extern int feature_get_refcount_from_disk(spa_t *spa, zfeature_info_t *feature,
+ uint64_t *res);
+extern void feature_enable_sync(struct spa *, zfeature_info_t *,
+ struct dmu_tx *);
+extern void feature_sync(struct spa *, zfeature_info_t *, uint64_t,
+ struct dmu_tx *);
+
+#ifdef __cplusplus
+}
+#endif
+
+#endif /* _SYS_ZFEATURE_H */
diff --git a/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/zfs_acl.h b/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/zfs_acl.h
new file mode 100644
index 000000000000..b34360a3c821
--- /dev/null
+++ b/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/zfs_acl.h
@@ -0,0 +1,248 @@
+/*
+ * CDDL HEADER START
+ *
+ * The contents of this file are subject to the terms of the
+ * Common Development and Distribution License (the "License").
+ * You may not use this file except in compliance with the License.
+ *
+ * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
+ * or http://www.opensolaris.org/os/licensing.
+ * See the License for the specific language governing permissions
+ * and limitations under the License.
+ *
+ * When distributing Covered Code, include this CDDL HEADER in each
+ * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
+ * If applicable, add the following below this CDDL HEADER, with the
+ * fields enclosed by brackets "[]" replaced with your own identifying
+ * information: Portions Copyright [yyyy] [name of copyright owner]
+ *
+ * CDDL HEADER END
+ */
+/*
+ * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved.
+ */
+
+#ifndef _SYS_FS_ZFS_ACL_H
+#define _SYS_FS_ZFS_ACL_H
+
+#ifdef _KERNEL
+#include <sys/cred.h>
+#endif
+#include <sys/acl.h>
+#include <sys/dmu.h>
+#include <sys/zfs_fuid.h>
+#include <sys/sa.h>
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+struct znode_phys;
+
+#define ACE_SLOT_CNT 6
+#define ZFS_ACL_VERSION_INITIAL 0ULL
+#define ZFS_ACL_VERSION_FUID 1ULL
+#define ZFS_ACL_VERSION ZFS_ACL_VERSION_FUID
+
+/*
+ * ZFS ACLs (Access Control Lists) are stored in various forms.
+ *
+ * Files created with ACL version ZFS_ACL_VERSION_INITIAL
+ * will all be created with fixed length ACEs of type
+ * zfs_oldace_t.
+ *
+ * Files with ACL version ZFS_ACL_VERSION_FUID will be created
+ * with various sized ACEs. The abstraction entries will utilize
+ * zfs_ace_hdr_t, normal user/group entries will use zfs_ace_t
+ * and some specialized CIFS ACEs will use zfs_object_ace_t.
+ */
+
+/*
+ * All ACEs have a common hdr. For
+ * owner@, group@, and everyone@ this is all
+ * thats needed.
+ */
+typedef struct zfs_ace_hdr {
+ uint16_t z_type;
+ uint16_t z_flags;
+ uint32_t z_access_mask;
+} zfs_ace_hdr_t;
+
+typedef zfs_ace_hdr_t zfs_ace_abstract_t;
+
+/*
+ * Standard ACE
+ */
+typedef struct zfs_ace {
+ zfs_ace_hdr_t z_hdr;
+ uint64_t z_fuid;
+} zfs_ace_t;
+
+/*
+ * The following type only applies to ACE_ACCESS_ALLOWED|DENIED_OBJECT_ACE_TYPE
+ * and will only be set/retrieved in a CIFS context.
+ */
+
+typedef struct zfs_object_ace {
+ zfs_ace_t z_ace;
+ uint8_t z_object_type[16]; /* object type */
+ uint8_t z_inherit_type[16]; /* inherited object type */
+} zfs_object_ace_t;
+
+typedef struct zfs_oldace {
+ uint32_t z_fuid; /* "who" */
+ uint32_t z_access_mask; /* access mask */
+	uint16_t	z_flags;	/* flags, e.g. inheritance */
+ uint16_t z_type; /* type of entry allow/deny */
+} zfs_oldace_t;
+
+typedef struct zfs_acl_phys_v0 {
+ uint64_t z_acl_extern_obj; /* ext acl pieces */
+ uint32_t z_acl_count; /* Number of ACEs */
+ uint16_t z_acl_version; /* acl version */
+ uint16_t z_acl_pad; /* pad */
+ zfs_oldace_t z_ace_data[ACE_SLOT_CNT]; /* 6 standard ACEs */
+} zfs_acl_phys_v0_t;
+
+#define ZFS_ACE_SPACE (sizeof (zfs_oldace_t) * ACE_SLOT_CNT)
+
+/*
+ * The size of the ACL count is always 2 bytes; this is necessary for
+ * dealing with both the V0 and V1 ACL layouts.
+ */
+#define ZFS_ACL_COUNT_SIZE (sizeof (uint16_t))
+
+typedef struct zfs_acl_phys {
+ uint64_t z_acl_extern_obj; /* ext acl pieces */
+ uint32_t z_acl_size; /* Number of bytes in ACL */
+ uint16_t z_acl_version; /* acl version */
+ uint16_t z_acl_count; /* ace count */
+ uint8_t z_ace_data[ZFS_ACE_SPACE]; /* space for embedded ACEs */
+} zfs_acl_phys_t;
+
+typedef struct acl_ops {
+ uint32_t (*ace_mask_get) (void *acep); /* get access mask */
+ void (*ace_mask_set) (void *acep,
+ uint32_t mask); /* set access mask */
+ uint16_t (*ace_flags_get) (void *acep); /* get flags */
+ void (*ace_flags_set) (void *acep,
+ uint16_t flags); /* set flags */
+ uint16_t (*ace_type_get)(void *acep); /* get type */
+ void (*ace_type_set)(void *acep,
+ uint16_t type); /* set type */
+ uint64_t (*ace_who_get)(void *acep); /* get who/fuid */
+ void (*ace_who_set)(void *acep,
+ uint64_t who); /* set who/fuid */
+ size_t (*ace_size)(void *acep); /* how big is this ace */
+ size_t (*ace_abstract_size)(void); /* sizeof abstract entry */
+ int (*ace_mask_off)(void); /* off of access mask in ace */
+ /* ptr to data if any */
+ int (*ace_data)(void *acep, void **datap);
+} acl_ops_t;
+
+/*
+ * A zfs_acl_t structure is composed of a list of zfs_acl_node_t's.
+ * Each node will have one or more ACEs associated with it. You will
+ * only have multiple nodes during a chmod operation. Normally only
+ * one node is required.
+ */
+typedef struct zfs_acl_node {
+ list_node_t z_next; /* Next chunk of ACEs */
+ void *z_acldata; /* pointer into actual ACE(s) */
+ void *z_allocdata; /* pointer to kmem allocated memory */
+ size_t z_allocsize; /* Size of blob in bytes */
+ size_t z_size; /* length of ACL data */
+ uint64_t z_ace_count; /* number of ACEs in this acl node */
+ int z_ace_idx; /* ace iterator positioned on */
+} zfs_acl_node_t;
+
+typedef struct zfs_acl {
+ uint64_t z_acl_count; /* Number of ACEs */
+ size_t z_acl_bytes; /* Number of bytes in ACL */
+ uint_t z_version; /* version of ACL */
+ void *z_next_ace; /* pointer to next ACE */
+ uint64_t z_hints; /* ACL hints (ZFS_INHERIT_ACE ...) */
+ zfs_acl_node_t *z_curr_node; /* current node iterator is handling */
+ list_t z_acl; /* chunks of ACE data */
+ acl_ops_t z_ops; /* ACL operations */
+} zfs_acl_t;
+
+typedef struct acl_locator_cb {
+ zfs_acl_t *cb_aclp;
+ zfs_acl_node_t *cb_acl_node;
+} zfs_acl_locator_cb_t;
+
+#define ACL_DATA_ALLOCED 0x1
+#define ZFS_ACL_SIZE(aclcnt) (sizeof (ace_t) * (aclcnt))
+
+struct zfs_fuid_info;
+
+typedef struct zfs_acl_ids {
+ uint64_t z_fuid; /* file owner fuid */
+ uint64_t z_fgid; /* file group owner fuid */
+ uint64_t z_mode; /* mode to set on create */
+ zfs_acl_t *z_aclp; /* ACL to create with file */
+ struct zfs_fuid_info *z_fuidp; /* for tracking fuids for log */
+} zfs_acl_ids_t;
+
+/*
+ * Property values for acl_mode and acl_inherit.
+ *
+ * acl_mode can take discard, noallow, groupmask and passthrough,
+ * whereas acl_inherit has secure instead of groupmask.
+ */
+
+#define ZFS_ACL_DISCARD 0
+#define ZFS_ACL_NOALLOW 1
+#define ZFS_ACL_GROUPMASK 2
+#define ZFS_ACL_PASSTHROUGH 3
+#define ZFS_ACL_RESTRICTED 4
+#define ZFS_ACL_PASSTHROUGH_X 5
+
+struct znode;
+struct zfsvfs;
+
+#ifdef _KERNEL
+int zfs_acl_ids_create(struct znode *, int, vattr_t *,
+ cred_t *, vsecattr_t *, zfs_acl_ids_t *);
+void zfs_acl_ids_free(zfs_acl_ids_t *);
+boolean_t zfs_acl_ids_overquota(struct zfsvfs *, zfs_acl_ids_t *);
+int zfs_getacl(struct znode *, vsecattr_t *, boolean_t, cred_t *);
+int zfs_setacl(struct znode *, vsecattr_t *, boolean_t, cred_t *);
+void zfs_acl_rele(void *);
+void zfs_oldace_byteswap(ace_t *, int);
+void zfs_ace_byteswap(void *, size_t, boolean_t);
+extern boolean_t zfs_has_access(struct znode *zp, cred_t *cr);
+extern int zfs_zaccess(struct znode *, int, int, boolean_t, cred_t *);
+#ifdef illumos
+int zfs_fastaccesschk_execute(struct znode *, cred_t *);
+#endif
+int zfs_freebsd_fastaccesschk_execute(struct vnode *, cred_t *);
+extern int zfs_zaccess_rwx(struct znode *, mode_t, int, cred_t *);
+extern int zfs_zaccess_unix(struct znode *, mode_t, cred_t *);
+extern int zfs_acl_access(struct znode *, int, cred_t *);
+int zfs_acl_chmod_setattr(struct znode *, zfs_acl_t **, uint64_t);
+int zfs_zaccess_delete(struct znode *, struct znode *, cred_t *);
+int zfs_zaccess_rename(struct znode *, struct znode *,
+ struct znode *, struct znode *, cred_t *cr);
+void zfs_acl_free(zfs_acl_t *);
+int zfs_vsec_2_aclp(struct zfsvfs *, vtype_t, vsecattr_t *, cred_t *,
+ struct zfs_fuid_info **, zfs_acl_t **);
+int zfs_aclset_common(struct znode *, zfs_acl_t *, cred_t *, dmu_tx_t *);
+uint64_t zfs_external_acl(struct znode *);
+int zfs_znode_acl_version(struct znode *);
+int zfs_acl_size(struct znode *, int *);
+zfs_acl_t *zfs_acl_alloc(int);
+zfs_acl_node_t *zfs_acl_node_alloc(size_t);
+void zfs_acl_xform(struct znode *, zfs_acl_t *, cred_t *);
+void zfs_acl_data_locator(void **, uint32_t *, uint32_t, boolean_t, void *);
+uint64_t zfs_mode_compute(uint64_t, zfs_acl_t *,
+ uint64_t *, uint64_t, uint64_t);
+int zfs_acl_chown_setattr(struct znode *);
+
+#endif
+
+#ifdef __cplusplus
+}
+#endif
+#endif /* _SYS_FS_ZFS_ACL_H */
diff --git a/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/zfs_context.h b/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/zfs_context.h
new file mode 100644
index 000000000000..38fda1d40585
--- /dev/null
+++ b/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/zfs_context.h
@@ -0,0 +1,146 @@
+/*
+ * CDDL HEADER START
+ *
+ * The contents of this file are subject to the terms of the
+ * Common Development and Distribution License (the "License").
+ * You may not use this file except in compliance with the License.
+ *
+ * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
+ * or http://www.opensolaris.org/os/licensing.
+ * See the License for the specific language governing permissions
+ * and limitations under the License.
+ *
+ * When distributing Covered Code, include this CDDL HEADER in each
+ * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
+ * If applicable, add the following below this CDDL HEADER, with the
+ * fields enclosed by brackets "[]" replaced with your own identifying
+ * information: Portions Copyright [yyyy] [name of copyright owner]
+ *
+ * CDDL HEADER END
+ */
+/*
+ * Copyright 2009 Sun Microsystems, Inc. All rights reserved.
+ * Use is subject to license terms.
+ */
+/*
+ * Copyright 2011 Nexenta Systems, Inc. All rights reserved.
+ * Copyright (c) 2013 by Delphix. All rights reserved.
+ */
+
+#ifndef _SYS_ZFS_CONTEXT_H
+#define _SYS_ZFS_CONTEXT_H
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+#include <sys/param.h>
+#include <sys/stdint.h>
+#include <sys/note.h>
+#include <sys/kernel.h>
+#include <sys/debug.h>
+#include <sys/systm.h>
+#include <sys/proc.h>
+#include <sys/sysmacros.h>
+#include <sys/bitmap.h>
+#include <sys/cmn_err.h>
+#include <sys/kmem.h>
+#include <sys/taskq.h>
+#include <sys/taskqueue.h>
+#include <sys/conf.h>
+#include <sys/mutex.h>
+#include <sys/rwlock.h>
+#include <sys/kcondvar.h>
+#include <sys/random.h>
+#include <sys/byteorder.h>
+#include <sys/list.h>
+#include <sys/zfs_debug.h>
+#include <sys/sysevent.h>
+#include <sys/uio.h>
+#include <sys/dirent.h>
+#include <sys/time.h>
+#include <sys/fcntl.h>
+#include <sys/limits.h>
+#include <sys/string.h>
+#include <sys/bio.h>
+#include <sys/buf.h>
+#include <sys/cred.h>
+#include <sys/sdt.h>
+#include <sys/file.h>
+#include <sys/vfs.h>
+#include <sys/sysctl.h>
+#include <sys/sbuf.h>
+#include <sys/priv.h>
+#include <sys/kdb.h>
+#include <sys/ktr.h>
+#include <sys/stack.h>
+#include <sys/lockf.h>
+#include <sys/pathname.h>
+#include <sys/policy.h>
+#include <sys/refstr.h>
+#include <sys/zone.h>
+#include <sys/eventhandler.h>
+#include <sys/extattr.h>
+#include <sys/misc.h>
+#include <sys/sig.h>
+#include <sys/osd.h>
+#include <sys/sysevent/dev.h>
+#include <sys/sysevent/eventdefs.h>
+#include <sys/u8_textprep.h>
+#include <sys/fm/util.h>
+#include <sys/sunddi.h>
+#ifdef illumos
+#include <sys/cyclic.h>
+#endif
+#include <sys/callo.h>
+#include <sys/disp.h>
+#include <machine/_inttypes.h>
+#include <machine/stdarg.h>
+
+#include <vm/vm.h>
+#include <vm/vm_page.h>
+#include <vm/vm_object.h>
+#include <vm/vm_kern.h>
+#include <vm/vm_map.h>
+#include <vm/vm_extern.h>
+#include <vm/vnode_pager.h>
+
+#define boot_ncpus (mp_ncpus)
+
+#define CPU_SEQID (curcpu)
+
+#define tsd_create(keyp, destructor) do { \
+ *(keyp) = osd_thread_register((destructor)); \
+ KASSERT(*(keyp) > 0, ("cannot register OSD")); \
+} while (0)
+#define tsd_destroy(keyp) osd_thread_deregister(*(keyp))
+#define tsd_get(key) osd_thread_get(curthread, (key))
+#define tsd_set(key, value) osd_thread_set(curthread, (key), (value))
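+
+/*
+ * Illustrative sketch (assumed usage, not from the original source):
+ * the tsd_* wrappers above map Solaris-style thread-specific data onto
+ * FreeBSD's osd(9) facility.
+ *
+ *	static uint_t my_key;			(hypothetical key)
+ *
+ *	tsd_create(&my_key, my_dtor);		once, at initialization
+ *	tsd_set(my_key, ptr);			in a given thread
+ *	ptr = tsd_get(my_key);			same thread, later
+ *	tsd_destroy(&my_key);			at teardown
+ */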
+
+#ifdef __cplusplus
+}
+#endif
+
+extern int zfs_debug_level;
+extern struct mtx zfs_debug_mtx;
+#define ZFS_LOG(lvl, ...) do { \
+ if (((lvl) & 0xff) <= zfs_debug_level) { \
+ mtx_lock(&zfs_debug_mtx); \
+ printf("%s:%u[%d]: ", __func__, __LINE__, (lvl)); \
+ printf(__VA_ARGS__); \
+ printf("\n"); \
+ if ((lvl) & 0x100) \
+ kdb_backtrace(); \
+ mtx_unlock(&zfs_debug_mtx); \
+ } \
+} while (0)
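+
+/*
+ * Example (illustrative; "osname" is a hypothetical variable):
+ *
+ *	ZFS_LOG(1, "mounting %s", osname);
+ *
+ * prints when zfs_debug_level >= 1; OR'ing 0x100 into the level, as in
+ * ZFS_LOG(0x100 | 1, ...), additionally dumps a kernel backtrace.
+ */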
+
+#define sys_shutdown rebooting
+
+#define noinline __attribute__((noinline))
+#define likely(x) __builtin_expect((x), 1)
+
+#endif /* _SYS_ZFS_CONTEXT_H */
diff --git a/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/zfs_ctldir.h b/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/zfs_ctldir.h
new file mode 100644
index 000000000000..de770c52add0
--- /dev/null
+++ b/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/zfs_ctldir.h
@@ -0,0 +1,65 @@
+/*
+ * CDDL HEADER START
+ *
+ * The contents of this file are subject to the terms of the
+ * Common Development and Distribution License (the "License").
+ * You may not use this file except in compliance with the License.
+ *
+ * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
+ * or http://www.opensolaris.org/os/licensing.
+ * See the License for the specific language governing permissions
+ * and limitations under the License.
+ *
+ * When distributing Covered Code, include this CDDL HEADER in each
+ * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
+ * If applicable, add the following below this CDDL HEADER, with the
+ * fields enclosed by brackets "[]" replaced with your own identifying
+ * information: Portions Copyright [yyyy] [name of copyright owner]
+ *
+ * CDDL HEADER END
+ */
+/*
+ * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved.
+ */
+
+#ifndef _ZFS_CTLDIR_H
+#define _ZFS_CTLDIR_H
+
+#include <sys/vnode.h>
+#include <sys/zfs_vfsops.h>
+#include <sys/zfs_znode.h>
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+#define ZFS_CTLDIR_NAME ".zfs"
+
+#define zfs_has_ctldir(zdp) \
+ ((zdp)->z_id == (zdp)->z_zfsvfs->z_root && \
+ ((zdp)->z_zfsvfs->z_ctldir != NULL))
+#define zfs_show_ctldir(zdp) \
+ (zfs_has_ctldir(zdp) && \
+ ((zdp)->z_zfsvfs->z_show_ctldir))
+
+void zfsctl_create(zfsvfs_t *);
+void zfsctl_destroy(zfsvfs_t *);
+int zfsctl_root(zfsvfs_t *, int, vnode_t **);
+void zfsctl_init(void);
+void zfsctl_fini(void);
+boolean_t zfsctl_is_node(vnode_t *);
+
+int zfsctl_rename_snapshot(const char *from, const char *to);
+int zfsctl_destroy_snapshot(const char *snapname, int force);
+int zfsctl_umount_snapshots(vfs_t *, int, cred_t *);
+
+int zfsctl_lookup_objset(vfs_t *vfsp, uint64_t objsetid, zfsvfs_t **zfsvfsp);
+
+#define ZFSCTL_INO_ROOT 0x1
+#define ZFSCTL_INO_SNAPDIR 0x2
+
+#ifdef __cplusplus
+}
+#endif
+
+#endif /* _ZFS_CTLDIR_H */
diff --git a/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/zfs_debug.h b/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/zfs_debug.h
new file mode 100644
index 000000000000..9cbfc26b64e2
--- /dev/null
+++ b/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/zfs_debug.h
@@ -0,0 +1,99 @@
+/*
+ * CDDL HEADER START
+ *
+ * The contents of this file are subject to the terms of the
+ * Common Development and Distribution License (the "License").
+ * You may not use this file except in compliance with the License.
+ *
+ * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
+ * or http://www.opensolaris.org/os/licensing.
+ * See the License for the specific language governing permissions
+ * and limitations under the License.
+ *
+ * When distributing Covered Code, include this CDDL HEADER in each
+ * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
+ * If applicable, add the following below this CDDL HEADER, with the
+ * fields enclosed by brackets "[]" replaced with your own identifying
+ * information: Portions Copyright [yyyy] [name of copyright owner]
+ *
+ * CDDL HEADER END
+ */
+/*
+ * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved.
+ * Copyright (c) 2012, 2017 by Delphix. All rights reserved.
+ */
+
+#ifndef _SYS_ZFS_DEBUG_H
+#define _SYS_ZFS_DEBUG_H
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+#ifndef TRUE
+#define TRUE 1
+#endif
+
+#ifndef FALSE
+#define FALSE 0
+#endif
+
+/*
+ * ZFS debugging
+ */
+
+#if defined(DEBUG) || !defined(_KERNEL)
+#if !defined(ZFS_DEBUG)
+#define ZFS_DEBUG
+#endif
+#endif
+
+extern int zfs_flags;
+extern boolean_t zfs_recover;
+extern boolean_t zfs_free_leak_on_eio;
+
+#define ZFS_DEBUG_DPRINTF (1 << 0)
+#define ZFS_DEBUG_DBUF_VERIFY (1 << 1)
+#define ZFS_DEBUG_DNODE_VERIFY (1 << 2)
+#define ZFS_DEBUG_SNAPNAMES (1 << 3)
+#define ZFS_DEBUG_MODIFY (1 << 4)
+/* 1<<5 was previously used, try not to reuse */
+#define ZFS_DEBUG_ZIO_FREE (1 << 6)
+#define ZFS_DEBUG_HISTOGRAM_VERIFY (1 << 7)
+#define ZFS_DEBUG_METASLAB_VERIFY (1 << 8)
+#define ZFS_DEBUG_INDIRECT_REMAP (1 << 9)
+
+#ifdef ZFS_DEBUG
+extern void __dprintf(const char *file, const char *func,
+ int line, const char *fmt, ...);
+#define dprintf(...) \
+ if (zfs_flags & ZFS_DEBUG_DPRINTF) \
+ __dprintf(__FILE__, __func__, __LINE__, __VA_ARGS__)
+#else
+#define dprintf(...) ((void)0)
+#endif /* ZFS_DEBUG */
+
+extern void zfs_panic_recover(const char *fmt, ...);
+
+typedef struct zfs_dbgmsg {
+ list_node_t zdm_node;
+ time_t zdm_timestamp;
+ char zdm_msg[1]; /* variable length allocation */
+} zfs_dbgmsg_t;
+
+extern void zfs_dbgmsg_init(void);
+extern void zfs_dbgmsg_fini(void);
+extern void zfs_dbgmsg(const char *fmt, ...);
+extern void zfs_dbgmsg_print(const char *tag);
+
+#ifdef illumos
+#ifndef _KERNEL
+extern int dprintf_find_string(const char *string);
+#endif
+#endif /* illumos */
+
+#ifdef __cplusplus
+}
+#endif
+
+#endif /* _SYS_ZFS_DEBUG_H */
diff --git a/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/zfs_dir.h b/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/zfs_dir.h
new file mode 100644
index 000000000000..22d8e603c433
--- /dev/null
+++ b/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/zfs_dir.h
@@ -0,0 +1,74 @@
+/*
+ * CDDL HEADER START
+ *
+ * The contents of this file are subject to the terms of the
+ * Common Development and Distribution License (the "License").
+ * You may not use this file except in compliance with the License.
+ *
+ * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
+ * or http://www.opensolaris.org/os/licensing.
+ * See the License for the specific language governing permissions
+ * and limitations under the License.
+ *
+ * When distributing Covered Code, include this CDDL HEADER in each
+ * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
+ * If applicable, add the following below this CDDL HEADER, with the
+ * fields enclosed by brackets "[]" replaced with your own identifying
+ * information: Portions Copyright [yyyy] [name of copyright owner]
+ *
+ * CDDL HEADER END
+ */
+/*
+ * Copyright 2010 Sun Microsystems, Inc. All rights reserved.
+ * Use is subject to license terms.
+ */
+
+#ifndef _SYS_FS_ZFS_DIR_H
+#define _SYS_FS_ZFS_DIR_H
+
+#include <sys/pathname.h>
+#include <sys/dmu.h>
+#include <sys/zfs_znode.h>
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+/* zfs_dirent_lock() flags */
+#define ZNEW 0x0001 /* entry should not exist */
+#define ZEXISTS 0x0002 /* entry should exist */
+#define ZSHARED 0x0004 /* shared access (zfs_dirlook()) */
+#define ZXATTR 0x0008 /* we want the xattr dir */
+#define ZRENAMING 0x0010 /* znode is being renamed */
+#define ZCILOOK 0x0020 /* case-insensitive lookup requested */
+#define ZCIEXACT 0x0040 /* c-i requires c-s match (rename) */
+#define ZHAVELOCK 0x0080 /* z_name_lock is already held */
+
+/* mknode flags */
+#define IS_ROOT_NODE 0x01 /* create a root node */
+#define IS_XATTR 0x02 /* create an extended attribute node */
+
+extern int zfs_dirent_lookup(znode_t *, const char *, znode_t **, int);
+extern int zfs_link_create(znode_t *, const char *, znode_t *, dmu_tx_t *, int);
+extern int zfs_link_destroy(znode_t *, const char *, znode_t *, dmu_tx_t *, int,
+ boolean_t *);
+#if 0
+extern int zfs_dirlook(vnode_t *, const char *, vnode_t **, int);
+#else
+extern int zfs_dirlook(znode_t *, const char *name, znode_t **);
+#endif
+extern void zfs_mknode(znode_t *, vattr_t *, dmu_tx_t *, cred_t *,
+ uint_t, znode_t **, zfs_acl_ids_t *);
+extern void zfs_rmnode(znode_t *);
+extern boolean_t zfs_dirempty(znode_t *);
+extern void zfs_unlinked_add(znode_t *, dmu_tx_t *);
+extern void zfs_unlinked_drain(zfsvfs_t *zfsvfs);
+extern int zfs_sticky_remove_access(znode_t *, znode_t *, cred_t *cr);
+extern int zfs_get_xattrdir(znode_t *, vnode_t **, cred_t *, int);
+extern int zfs_make_xattrdir(znode_t *, vattr_t *, vnode_t **, cred_t *);
+
+#ifdef __cplusplus
+}
+#endif
+
+#endif /* _SYS_FS_ZFS_DIR_H */
diff --git a/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/zfs_fuid.h b/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/zfs_fuid.h
new file mode 100644
index 000000000000..b381bb98e734
--- /dev/null
+++ b/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/zfs_fuid.h
@@ -0,0 +1,132 @@
+/*
+ * CDDL HEADER START
+ *
+ * The contents of this file are subject to the terms of the
+ * Common Development and Distribution License (the "License").
+ * You may not use this file except in compliance with the License.
+ *
+ * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
+ * or http://www.opensolaris.org/os/licensing.
+ * See the License for the specific language governing permissions
+ * and limitations under the License.
+ *
+ * When distributing Covered Code, include this CDDL HEADER in each
+ * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
+ * If applicable, add the following below this CDDL HEADER, with the
+ * fields enclosed by brackets "[]" replaced with your own identifying
+ * information: Portions Copyright [yyyy] [name of copyright owner]
+ *
+ * CDDL HEADER END
+ */
+/*
+ * Copyright 2010 Sun Microsystems, Inc. All rights reserved.
+ * Use is subject to license terms.
+ */
+
+#ifndef _SYS_FS_ZFS_FUID_H
+#define _SYS_FS_ZFS_FUID_H
+
+#include <sys/types.h>
+#ifdef _KERNEL
+#include <sys/kidmap.h>
+#include <sys/dmu.h>
+#include <sys/zfs_vfsops.h>
+#endif
+#include <sys/avl.h>
+#include <sys/list.h>
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+typedef enum {
+ ZFS_OWNER,
+ ZFS_GROUP,
+ ZFS_ACE_USER,
+ ZFS_ACE_GROUP
+} zfs_fuid_type_t;
+
+/*
+ * Estimate the space needed for one more fuid table entry;
+ * for now, assume the table's current size plus 1K.
+ */
+#define FUID_SIZE_ESTIMATE(z) ((z)->z_fuid_size + (SPA_MINBLOCKSIZE << 1))
+
+#define FUID_INDEX(x) ((x) >> 32)
+#define FUID_RID(x) ((x) & 0xffffffff)
+#define FUID_ENCODE(idx, rid) (((uint64_t)(idx) << 32) | (rid))
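+
+/*
+ * Worked example (illustrative): FUID_ENCODE(1, 2001) yields
+ * (1ULL << 32) | 2001 == 0x1000007d1; FUID_INDEX() then recovers the
+ * domain-table index 1, and FUID_RID() the rid 2001.
+ */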
+/*
+ * FUIDs cause problems for the intent log: we need to replay the
+ * creation of the FUID, but we can't count on the idmapper to be
+ * around, and during replay the FUID index may differ from what it
+ * was before.  Also, if an ACL has 100 ACEs and 12 distinct domains,
+ * we don't want to log 100 domain strings, but rather just the
+ * unique 12.
+ */
+
+/*
+ * The top half of a FUID in the log indexes into the domain string
+ * table, and the bottom half is the rid.  Used for mapping ephemeral
+ * uid/gid to FUIDs during ACL setting.
+ */
+typedef struct zfs_fuid {
+ list_node_t z_next;
+ uint64_t z_id; /* uid/gid being converted to fuid */
+ uint64_t z_domidx; /* index in AVL domain table */
+ uint64_t z_logfuid; /* index for domain in log */
+} zfs_fuid_t;
+
+/* list of unique domains */
+typedef struct zfs_fuid_domain {
+ list_node_t z_next;
+ uint64_t z_domidx; /* AVL tree idx */
+ const char *z_domain; /* domain string */
+} zfs_fuid_domain_t;
+
+/*
+ * FUID information necessary for logging create, setattr, and setacl.
+ */
+typedef struct zfs_fuid_info {
+ list_t z_fuids;
+ list_t z_domains;
+ uint64_t z_fuid_owner;
+ uint64_t z_fuid_group;
+ char **z_domain_table; /* Used during replay */
+ uint32_t z_fuid_cnt; /* How many fuids in z_fuids */
+ uint32_t z_domain_cnt; /* How many domains */
+ size_t z_domain_str_sz; /* len of domain strings z_domain list */
+} zfs_fuid_info_t;
+
+#ifdef _KERNEL
+struct znode;
+extern uid_t zfs_fuid_map_id(zfsvfs_t *, uint64_t, cred_t *, zfs_fuid_type_t);
+extern void zfs_fuid_node_add(zfs_fuid_info_t **, const char *, uint32_t,
+ uint64_t, uint64_t, zfs_fuid_type_t);
+extern void zfs_fuid_destroy(zfsvfs_t *);
+extern uint64_t zfs_fuid_create_cred(zfsvfs_t *, zfs_fuid_type_t,
+ cred_t *, zfs_fuid_info_t **);
+extern uint64_t zfs_fuid_create(zfsvfs_t *, uint64_t, cred_t *, zfs_fuid_type_t,
+ zfs_fuid_info_t **);
+extern void zfs_fuid_map_ids(struct znode *zp, cred_t *cr,
+ uid_t *uid, uid_t *gid);
+extern zfs_fuid_info_t *zfs_fuid_info_alloc(void);
+extern void zfs_fuid_info_free(zfs_fuid_info_t *);
+extern boolean_t zfs_groupmember(zfsvfs_t *, uint64_t, cred_t *);
+void zfs_fuid_sync(zfsvfs_t *, dmu_tx_t *);
+extern int zfs_fuid_find_by_domain(zfsvfs_t *, const char *domain,
+ char **retdomain, boolean_t addok);
+extern const char *zfs_fuid_find_by_idx(zfsvfs_t *zfsvfs, uint32_t idx);
+extern void zfs_fuid_txhold(zfsvfs_t *zfsvfs, dmu_tx_t *tx);
+#endif
+
+char *zfs_fuid_idx_domain(avl_tree_t *, uint32_t);
+void zfs_fuid_avl_tree_create(avl_tree_t *, avl_tree_t *);
+uint64_t zfs_fuid_table_load(objset_t *, uint64_t, avl_tree_t *, avl_tree_t *);
+void zfs_fuid_table_destroy(avl_tree_t *, avl_tree_t *);
+
+#ifdef __cplusplus
+}
+#endif
+
+#endif /* _SYS_FS_ZFS_FUID_H */
diff --git a/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/zfs_ioctl.h b/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/zfs_ioctl.h
new file mode 100644
index 000000000000..756800f8afde
--- /dev/null
+++ b/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/zfs_ioctl.h
@@ -0,0 +1,466 @@
+/*
+ * CDDL HEADER START
+ *
+ * The contents of this file are subject to the terms of the
+ * Common Development and Distribution License (the "License").
+ * You may not use this file except in compliance with the License.
+ *
+ * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
+ * or http://www.opensolaris.org/os/licensing.
+ * See the License for the specific language governing permissions
+ * and limitations under the License.
+ *
+ * When distributing Covered Code, include this CDDL HEADER in each
+ * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
+ * If applicable, add the following below this CDDL HEADER, with the
+ * fields enclosed by brackets "[]" replaced with your own identifying
+ * information: Portions Copyright [yyyy] [name of copyright owner]
+ *
+ * CDDL HEADER END
+ */
+/*
+ * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved.
+ * Copyright (c) 2011-2012 Pawel Jakub Dawidek. All rights reserved.
+ * Copyright (c) 2012, 2017 by Delphix. All rights reserved.
+ * Copyright 2016 RackTop Systems.
+ * Copyright (c) 2014 Integros [integros.com]
+ */
+
+#ifndef _SYS_ZFS_IOCTL_H
+#define _SYS_ZFS_IOCTL_H
+
+#include <sys/cred.h>
+#include <sys/dmu.h>
+#include <sys/zio.h>
+#include <sys/dsl_deleg.h>
+#include <sys/spa.h>
+#include <sys/zfs_stat.h>
+
+#ifdef _KERNEL
+#include <sys/nvpair.h>
+#endif /* _KERNEL */
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+/*
+ * The structures in this file are passed between userland and the
+ * kernel. Userland may be running a 32-bit process, while the kernel
+ * is 64-bit. Therefore, these structures need to compile the same in
+ * 32-bit and 64-bit. This means not using type "long", and adding
+ * explicit padding so that the 32-bit structure will not be packed more
+ * tightly than the 64-bit structure (which requires 64-bit alignment).
+ */
+
+/*
+ * Property values for snapdir
+ */
+#define ZFS_SNAPDIR_HIDDEN 0
+#define ZFS_SNAPDIR_VISIBLE 1
+
+/*
+ * Field manipulation macros for the drr_versioninfo field of the
+ * send stream header.
+ */
+
+/*
+ * Header types for zfs send streams.
+ */
+typedef enum drr_headertype {
+ DMU_SUBSTREAM = 0x1,
+ DMU_COMPOUNDSTREAM = 0x2
+} drr_headertype_t;
+
+#define DMU_GET_STREAM_HDRTYPE(vi) BF64_GET((vi), 0, 2)
+#define DMU_SET_STREAM_HDRTYPE(vi, x) BF64_SET((vi), 0, 2, x)
+
+#define DMU_GET_FEATUREFLAGS(vi) BF64_GET((vi), 2, 30)
+#define DMU_SET_FEATUREFLAGS(vi, x) BF64_SET((vi), 2, 30, x)
+
+/*
+ * Feature flags for zfs send streams (flags in drr_versioninfo)
+ */
+
+#define DMU_BACKUP_FEATURE_DEDUP (1 << 0)
+#define DMU_BACKUP_FEATURE_DEDUPPROPS (1 << 1)
+#define DMU_BACKUP_FEATURE_SA_SPILL (1 << 2)
+/* flags #3 - #15 are reserved for incompatible closed-source implementations */
+#define DMU_BACKUP_FEATURE_EMBED_DATA (1 << 16)
+#define DMU_BACKUP_FEATURE_LZ4 (1 << 17)
+/* flag #18 is reserved for a Delphix feature */
+#define DMU_BACKUP_FEATURE_LARGE_BLOCKS (1 << 19)
+#define DMU_BACKUP_FEATURE_RESUMING (1 << 20)
+/* flag #21 is reserved for a Delphix feature */
+#define DMU_BACKUP_FEATURE_COMPRESSED (1 << 22)
+#define DMU_BACKUP_FEATURE_LARGE_DNODE (1 << 23)
+/* flag #24 is reserved for the raw send feature */
+/* flag #25 is reserved for the ZSTD compression feature */
+
+/*
+ * Mask of all supported backup features
+ */
+#define DMU_BACKUP_FEATURE_MASK (DMU_BACKUP_FEATURE_DEDUP | \
+ DMU_BACKUP_FEATURE_DEDUPPROPS | DMU_BACKUP_FEATURE_SA_SPILL | \
+ DMU_BACKUP_FEATURE_EMBED_DATA | DMU_BACKUP_FEATURE_LZ4 | \
+ DMU_BACKUP_FEATURE_RESUMING | \
+ DMU_BACKUP_FEATURE_LARGE_BLOCKS | DMU_BACKUP_FEATURE_LARGE_DNODE | \
+ DMU_BACKUP_FEATURE_COMPRESSED)
+
+/* Are all features in the given flag word currently supported? */
+#define DMU_STREAM_SUPPORTED(x) (!((x) & ~DMU_BACKUP_FEATURE_MASK))
+
+typedef enum dmu_send_resume_token_version {
+ ZFS_SEND_RESUME_TOKEN_VERSION = 1
+} dmu_send_resume_token_version_t;
+
+/*
+ * The drr_versioninfo field of the dmu_replay_record has the
+ * following layout:
+ *
+ * 64 56 48 40 32 24 16 8 0
+ * +-------+-------+-------+-------+-------+-------+-------+-------+
+ * | reserved | feature-flags |C|S|
+ * +-------+-------+-------+-------+-------+-------+-------+-------+
+ *
+ * The low order two bits indicate the header type: SUBSTREAM (0x1)
+ * or COMPOUNDSTREAM (0x2). Using two bits for this is historical:
+ * this field used to be a version number, where the two version types
+ * were 1 and 2. Using two bits for this allows earlier versions of
+ * the code to recognize send streams that don't use any of the
+ * features indicated by feature flags.
+ */
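As a hedged sketch, a sender might compose drr_versioninfo and a receiver might validate it roughly as follows (BF64_GET/BF64_SET come from the SPA headers; the helper names are illustrative):

	/* Sender side: substream header with LZ4 and embedded-data features. */
	static uint64_t
	make_versioninfo(void)
	{
		uint64_t vi = 0;

		DMU_SET_STREAM_HDRTYPE(vi, DMU_SUBSTREAM);
		DMU_SET_FEATUREFLAGS(vi,
		    DMU_BACKUP_FEATURE_LZ4 | DMU_BACKUP_FEATURE_EMBED_DATA);
		return (vi);
	}

	/* Receiver side: reject streams using features we don't implement. */
	static int
	check_versioninfo(uint64_t vi)
	{
		if (!DMU_STREAM_SUPPORTED(DMU_GET_FEATUREFLAGS(vi)))
			return (ENOTSUP);
		return (0);
	}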
+
+#define DMU_BACKUP_MAGIC 0x2F5bacbacULL
+
+/*
+ * Send stream flags. Bits 24-31 are reserved for vendor-specific
+ * implementations and should not be used.
+ */
+#define DRR_FLAG_CLONE (1<<0)
+#define DRR_FLAG_CI_DATA (1<<1)
+/*
+ * This send stream, if it is a full send, includes the FREE and FREEOBJECT
+ * records that are created by the sending process. This means that the send
+ * stream can be received as a clone, even though it is not an incremental.
+ * This is not implemented as a feature flag, because the receiving side does
+ * not need to have implemented it to receive this stream; it is fully backwards
+ * compatible. We need a flag, though, because full send streams without it
+ * cannot necessarily be received as a clone correctly.
+ */
+#define DRR_FLAG_FREERECORDS (1<<2)
+
+/*
+ * flags in the drr_checksumflags field in the DRR_WRITE and
+ * DRR_WRITE_BYREF blocks
+ */
+#define DRR_CHECKSUM_DEDUP (1<<0)
+
+#define DRR_IS_DEDUP_CAPABLE(flags) ((flags) & DRR_CHECKSUM_DEDUP)
+
+/* deal with compressed drr_write replay records */
+#define DRR_WRITE_COMPRESSED(drrw) ((drrw)->drr_compressiontype != 0)
+#define DRR_WRITE_PAYLOAD_SIZE(drrw) \
+ (DRR_WRITE_COMPRESSED(drrw) ? (drrw)->drr_compressed_size : \
+ (drrw)->drr_logical_size)
+
+typedef struct dmu_replay_record {
+ enum {
+ DRR_BEGIN, DRR_OBJECT, DRR_FREEOBJECTS,
+ DRR_WRITE, DRR_FREE, DRR_END, DRR_WRITE_BYREF,
+ DRR_SPILL, DRR_WRITE_EMBEDDED, DRR_NUMTYPES
+ } drr_type;
+ uint32_t drr_payloadlen;
+ union {
+ struct drr_begin {
+ uint64_t drr_magic;
+ uint64_t drr_versioninfo; /* was drr_version */
+ uint64_t drr_creation_time;
+ dmu_objset_type_t drr_type;
+ uint32_t drr_flags;
+ uint64_t drr_toguid;
+ uint64_t drr_fromguid;
+ char drr_toname[MAXNAMELEN];
+ } drr_begin;
+ struct drr_end {
+ zio_cksum_t drr_checksum;
+ uint64_t drr_toguid;
+ } drr_end;
+ struct drr_object {
+ uint64_t drr_object;
+ dmu_object_type_t drr_type;
+ dmu_object_type_t drr_bonustype;
+ uint32_t drr_blksz;
+ uint32_t drr_bonuslen;
+ uint8_t drr_checksumtype;
+ uint8_t drr_compress;
+ uint8_t drr_dn_slots;
+ uint8_t drr_pad[5];
+ uint64_t drr_toguid;
+ /* bonus content follows */
+ } drr_object;
+ struct drr_freeobjects {
+ uint64_t drr_firstobj;
+ uint64_t drr_numobjs;
+ uint64_t drr_toguid;
+ } drr_freeobjects;
+ struct drr_write {
+ uint64_t drr_object;
+ dmu_object_type_t drr_type;
+ uint32_t drr_pad;
+ uint64_t drr_offset;
+ uint64_t drr_logical_size;
+ uint64_t drr_toguid;
+ uint8_t drr_checksumtype;
+ uint8_t drr_checksumflags;
+ uint8_t drr_compressiontype;
+ uint8_t drr_pad2[5];
+ /* deduplication key */
+ ddt_key_t drr_key;
+ /* only nonzero if drr_compressiontype is not 0 */
+ uint64_t drr_compressed_size;
+ /* content follows */
+ } drr_write;
+ struct drr_free {
+ uint64_t drr_object;
+ uint64_t drr_offset;
+ uint64_t drr_length;
+ uint64_t drr_toguid;
+ } drr_free;
+ struct drr_write_byref {
+ /* where to put the data */
+ uint64_t drr_object;
+ uint64_t drr_offset;
+ uint64_t drr_length;
+ uint64_t drr_toguid;
+ /* where to find the prior copy of the data */
+ uint64_t drr_refguid;
+ uint64_t drr_refobject;
+ uint64_t drr_refoffset;
+ /* properties of the data */
+ uint8_t drr_checksumtype;
+ uint8_t drr_checksumflags;
+ uint8_t drr_pad2[6];
+ ddt_key_t drr_key; /* deduplication key */
+ } drr_write_byref;
+ struct drr_spill {
+ uint64_t drr_object;
+ uint64_t drr_length;
+ uint64_t drr_toguid;
+ uint64_t drr_pad[4]; /* needed for crypto */
+ /* spill data follows */
+ } drr_spill;
+ struct drr_write_embedded {
+ uint64_t drr_object;
+ uint64_t drr_offset;
+ /* logical length, should equal blocksize */
+ uint64_t drr_length;
+ uint64_t drr_toguid;
+ uint8_t drr_compression;
+ uint8_t drr_etype;
+ uint8_t drr_pad[6];
+ uint32_t drr_lsize; /* uncompressed size of payload */
+ uint32_t drr_psize; /* compr. (real) size of payload */
+ /* (possibly compressed) content follows */
+ } drr_write_embedded;
+
+ /*
+	 * Note: drr_checksum is overlaid with all record types
+ * except DRR_BEGIN. Therefore its (non-pad) members
+ * must not overlap with members from the other structs.
+ * We accomplish this by putting its members at the very
+ * end of the struct.
+ */
+ struct drr_checksum {
+ uint64_t drr_pad[34];
+ /*
+ * fletcher-4 checksum of everything preceding the
+ * checksum.
+ */
+ zio_cksum_t drr_checksum;
+ } drr_checksum;
+ } drr_u;
+} dmu_replay_record_t;
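A receiver typically reads one dmu_replay_record_t at a time and dispatches on drr_type; a simplified sketch, where read_exact() is a hypothetical helper that fills a buffer or fails:

	static int
	process_stream(int fd)
	{
		dmu_replay_record_t drr;

		while (read_exact(fd, &drr, sizeof (drr)) == 0) {
			switch (drr.drr_type) {
			case DRR_WRITE: {
				struct drr_write *drrw = &drr.drr_u.drr_write;
				/* Payload size depends on compression. */
				uint64_t len = DRR_WRITE_PAYLOAD_SIZE(drrw);
				/* ... read and apply 'len' bytes of data ... */
				break;
			}
			case DRR_END:
				/* drr_checksum would be verified here. */
				return (0);
			default:
				/* Other record types elided in this sketch. */
				break;
			}
		}
		return (EINVAL);
	}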
+
+/* diff record range types */
+typedef enum diff_type {
+ DDR_NONE = 0x1,
+ DDR_INUSE = 0x2,
+ DDR_FREE = 0x4
+} diff_type_t;
+
+/*
+ * The diff reports back ranges of free or in-use objects.
+ */
+typedef struct dmu_diff_record {
+ uint64_t ddr_type;
+ uint64_t ddr_first;
+ uint64_t ddr_last;
+} dmu_diff_record_t;
+
+typedef struct zinject_record {
+ uint64_t zi_objset;
+ uint64_t zi_object;
+ uint64_t zi_start;
+ uint64_t zi_end;
+ uint64_t zi_guid;
+ uint32_t zi_level;
+ uint32_t zi_error;
+ uint64_t zi_type;
+ uint32_t zi_freq;
+ uint32_t zi_failfast;
+ char zi_func[MAXNAMELEN];
+ uint32_t zi_iotype;
+ int32_t zi_duration;
+ uint64_t zi_timer;
+ uint64_t zi_nlanes;
+ uint32_t zi_cmd;
+ uint32_t zi_pad;
+} zinject_record_t;
+
+#define ZINJECT_NULL 0x1
+#define ZINJECT_FLUSH_ARC 0x2
+#define ZINJECT_UNLOAD_SPA 0x4
+
+typedef enum zinject_type {
+ ZINJECT_UNINITIALIZED,
+ ZINJECT_DATA_FAULT,
+ ZINJECT_DEVICE_FAULT,
+ ZINJECT_LABEL_FAULT,
+ ZINJECT_IGNORED_WRITES,
+ ZINJECT_PANIC,
+ ZINJECT_DELAY_IO,
+} zinject_type_t;
+
+typedef struct zfs_share {
+ uint64_t z_exportdata;
+ uint64_t z_sharedata;
+ uint64_t z_sharetype; /* 0 = share, 1 = unshare */
+ uint64_t z_sharemax; /* max length of share string */
+} zfs_share_t;
+
+/*
+ * ZFS file systems may behave the usual, POSIX-compliant way, where
+ * name lookups are case-sensitive. They may also be set up so that
+ * all the name lookups are case-insensitive, or so that only some
+ * lookups, the ones that set an FIGNORECASE flag, are case-insensitive.
+ */
+typedef enum zfs_case {
+ ZFS_CASE_SENSITIVE,
+ ZFS_CASE_INSENSITIVE,
+ ZFS_CASE_MIXED
+} zfs_case_t;
+
+/*
+ * Note: this struct must have the same layout in 32-bit and 64-bit, so
+ * that 32-bit processes (like /sbin/zfs) can pass it to the 64-bit
+ * kernel. Therefore, we add padding to it so that no "hidden" padding
+ * is automatically added on 64-bit (but not on 32-bit).
+ */
+typedef struct zfs_cmd {
+ char zc_name[MAXPATHLEN]; /* name of pool or dataset */
+ uint64_t zc_nvlist_src; /* really (char *) */
+ uint64_t zc_nvlist_src_size;
+ uint64_t zc_nvlist_dst; /* really (char *) */
+ uint64_t zc_nvlist_dst_size;
+ boolean_t zc_nvlist_dst_filled; /* put an nvlist in dst? */
+ int zc_pad2;
+
+ /*
+ * The following members are for legacy ioctls which haven't been
+ * converted to the new method.
+ */
+ uint64_t zc_history; /* really (char *) */
+ char zc_value[MAXPATHLEN * 2];
+ char zc_string[MAXNAMELEN];
+ uint64_t zc_guid;
+ uint64_t zc_nvlist_conf; /* really (char *) */
+ uint64_t zc_nvlist_conf_size;
+ uint64_t zc_cookie;
+ uint64_t zc_objset_type;
+ uint64_t zc_perm_action;
+ uint64_t zc_history_len;
+ uint64_t zc_history_offset;
+ uint64_t zc_obj;
+ uint64_t zc_iflags; /* internal to zfs(7fs) */
+ zfs_share_t zc_share;
+ uint64_t zc_jailid;
+ dmu_objset_stats_t zc_objset_stats;
+ dmu_replay_record_t zc_begin_record;
+ zinject_record_t zc_inject_record;
+ uint32_t zc_defer_destroy;
+ uint32_t zc_flags;
+ uint64_t zc_action_handle;
+ int zc_cleanup_fd;
+ uint8_t zc_simple;
+ uint8_t zc_pad3[3];
+ boolean_t zc_resumable;
+ uint32_t zc_pad4;
+ uint64_t zc_sendobj;
+ uint64_t zc_fromobj;
+ uint64_t zc_createtxg;
+ zfs_stat_t zc_stat;
+} zfs_cmd_t;
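For legacy ioctls, userland packs an nvlist and passes its address and size through zc_nvlist_src; a hedged userland sketch (the dataset name, property, and the elided ZFS_IOC_* request number are purely illustrative):

	zfs_cmd_t zc = { 0 };
	nvlist_t *nvl;
	char *packed;
	size_t size;

	(void) nvlist_alloc(&nvl, NV_UNIQUE_NAME, 0);
	(void) nvlist_add_uint64(nvl, "volsize", 1ULL << 30);
	(void) nvlist_pack(nvl, &packed, &size, NV_ENCODE_NATIVE, 0);

	(void) strlcpy(zc.zc_name, "tank/vol", sizeof (zc.zc_name));
	zc.zc_nvlist_src = (uintptr_t)packed;	/* "really (char *)" */
	zc.zc_nvlist_src_size = size;
	/* ioctl(fd, ZFS_IOC_..., &zc) would follow; error checks elided. */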
+
+typedef struct zfs_useracct {
+ char zu_domain[256];
+ uid_t zu_rid;
+ uint32_t zu_pad;
+ uint64_t zu_space;
+} zfs_useracct_t;
+
+#define ZFSDEV_MAX_MINOR (1 << 16)
+#define ZFS_MIN_MINOR (ZFSDEV_MAX_MINOR + 1)
+
+#define ZPOOL_EXPORT_AFTER_SPLIT 0x1
+
+#ifdef _KERNEL
+struct objset;
+struct zfsvfs;
+
+typedef struct zfs_creat {
+ nvlist_t *zct_zplprops;
+ nvlist_t *zct_props;
+} zfs_creat_t;
+
+extern int zfs_secpolicy_snapshot_perms(const char *, cred_t *);
+extern int zfs_secpolicy_rename_perms(const char *, const char *, cred_t *);
+extern int zfs_secpolicy_destroy_perms(const char *, cred_t *);
+extern int zfs_busy(void);
+extern void zfs_unmount_snap(const char *);
+extern void zfs_destroy_unmount_origin(const char *);
+#ifdef illumos
+extern int getzfsvfs_impl(struct objset *, struct zfsvfs **);
+#else
+extern int getzfsvfs_impl(struct objset *, vfs_t **);
+#endif
+extern int getzfsvfs(const char *, struct zfsvfs **);
+
+/*
+ * ZFS minor numbers can refer to either a control device instance or
+ * a zvol. Depending on the value of zss_type, zss_data points to either
+ * a zvol_state_t or a zfs_onexit_t.
+ */
+enum zfs_soft_state_type {
+ ZSST_ZVOL,
+ ZSST_CTLDEV
+};
+
+typedef struct zfs_soft_state {
+ enum zfs_soft_state_type zss_type;
+ void *zss_data;
+} zfs_soft_state_t;
+
+extern void *zfsdev_get_soft_state(minor_t minor,
+ enum zfs_soft_state_type which);
+extern minor_t zfsdev_minor_alloc(void);
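Per the comment above, a minor resolves to either a zvol or a control-device state; a sketch of the zvol case (zvol_state_t is defined in the zvol code):

	static zvol_state_t *
	minor_to_zvol(minor_t minor)
	{
		/* Returns NULL if the minor is a control device or unused. */
		return (zfsdev_get_soft_state(minor, ZSST_ZVOL));
	}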
+
+extern void *zfsdev_state;
+
+#endif /* _KERNEL */
+
+#ifdef __cplusplus
+}
+#endif
+
+#endif /* _SYS_ZFS_IOCTL_H */
diff --git a/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/zfs_onexit.h b/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/zfs_onexit.h
new file mode 100644
index 000000000000..4982bd4d0afc
--- /dev/null
+++ b/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/zfs_onexit.h
@@ -0,0 +1,66 @@
+/*
+ * CDDL HEADER START
+ *
+ * The contents of this file are subject to the terms of the
+ * Common Development and Distribution License (the "License").
+ * You may not use this file except in compliance with the License.
+ *
+ * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
+ * or http://www.opensolaris.org/os/licensing.
+ * See the License for the specific language governing permissions
+ * and limitations under the License.
+ *
+ * When distributing Covered Code, include this CDDL HEADER in each
+ * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
+ * If applicable, add the following below this CDDL HEADER, with the
+ * fields enclosed by brackets "[]" replaced with your own identifying
+ * information: Portions Copyright [yyyy] [name of copyright owner]
+ *
+ * CDDL HEADER END
+ */
+
+/*
+ * Copyright (c) 2010, Oracle and/or its affiliates. All rights reserved.
+ */
+
+#ifndef _SYS_ZFS_ONEXIT_H
+#define _SYS_ZFS_ONEXIT_H
+
+#include <sys/zfs_context.h>
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+#ifdef _KERNEL
+
+typedef struct zfs_onexit {
+ kmutex_t zo_lock;
+ list_t zo_actions;
+} zfs_onexit_t;
+
+typedef struct zfs_onexit_action_node {
+ list_node_t za_link;
+ void (*za_func)(void *);
+ void *za_data;
+} zfs_onexit_action_node_t;
+
+extern void zfs_onexit_init(zfs_onexit_t **zo);
+extern void zfs_onexit_destroy(zfs_onexit_t *zo);
+
+#endif
+
+extern int zfs_onexit_fd_hold(int fd, minor_t *minorp);
+extern void zfs_onexit_fd_rele(int fd);
+extern int zfs_onexit_add_cb(minor_t minor, void (*func)(void *), void *data,
+ uint64_t *action_handle);
+extern int zfs_onexit_del_cb(minor_t minor, uint64_t action_handle,
+ boolean_t fire);
+extern int zfs_onexit_cb_data(minor_t minor, uint64_t action_handle,
+ void **data);
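A kernel consumer registers a cleanup action against an open /dev/zfs descriptor so the action fires when that fd is closed; a hedged sketch using the declarations above:

	static void
	my_cleanup(void *arg)
	{
		/* Release whatever 'arg' refers to. */
	}

	static int
	register_cleanup(int fd, void *arg, uint64_t *handlep)
	{
		minor_t minor;
		int error;

		if ((error = zfs_onexit_fd_hold(fd, &minor)) != 0)
			return (error);
		error = zfs_onexit_add_cb(minor, my_cleanup, arg, handlep);
		zfs_onexit_fd_rele(fd);
		return (error);
	}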
+
+#ifdef __cplusplus
+}
+#endif
+
+#endif /* _SYS_ZFS_ONEXIT_H */
diff --git a/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/zfs_rlock.h b/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/zfs_rlock.h
new file mode 100644
index 000000000000..ffae1130fd88
--- /dev/null
+++ b/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/zfs_rlock.h
@@ -0,0 +1,90 @@
+/*
+ * CDDL HEADER START
+ *
+ * The contents of this file are subject to the terms of the
+ * Common Development and Distribution License (the "License").
+ * You may not use this file except in compliance with the License.
+ *
+ * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
+ * or http://www.opensolaris.org/os/licensing.
+ * See the License for the specific language governing permissions
+ * and limitations under the License.
+ *
+ * When distributing Covered Code, include this CDDL HEADER in each
+ * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
+ * If applicable, add the following below this CDDL HEADER, with the
+ * fields enclosed by brackets "[]" replaced with your own identifying
+ * information: Portions Copyright [yyyy] [name of copyright owner]
+ *
+ * CDDL HEADER END
+ */
+/*
+ * Copyright 2006 Sun Microsystems, Inc. All rights reserved.
+ * Use is subject to license terms.
+ */
+/*
+ * Copyright (c) 2018 by Delphix. All rights reserved.
+ */
+
+#ifndef _SYS_FS_ZFS_RLOCK_H
+#define _SYS_FS_ZFS_RLOCK_H
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+#ifdef __FreeBSD__
+#define rangelock_init zfs_rangelock_init
+#define rangelock_fini zfs_rangelock_fini
+#endif
+
+typedef enum {
+ RL_READER,
+ RL_WRITER,
+ RL_APPEND
+} rangelock_type_t;
+
+struct locked_range;
+
+typedef void (rangelock_cb_t)(struct locked_range *, void *);
+
+#ifdef __FreeBSD__
+typedef struct zfs_rangelock {
+#else
+typedef struct rangelock {
+#endif
+ avl_tree_t rl_tree; /* contains locked_range_t */
+ kmutex_t rl_lock;
+ rangelock_cb_t *rl_cb;
+ void *rl_arg;
+} rangelock_t;
+
+typedef struct locked_range {
+ rangelock_t *lr_rangelock; /* rangelock that this lock applies to */
+ avl_node_t lr_node; /* avl node link */
+ uint64_t lr_offset; /* file range offset */
+ uint64_t lr_length; /* file range length */
+ uint_t lr_count; /* range reference count in tree */
+ rangelock_type_t lr_type; /* range type */
+ kcondvar_t lr_write_cv; /* cv for waiting writers */
+ kcondvar_t lr_read_cv; /* cv for waiting readers */
+ uint8_t lr_proxy; /* acting for original range */
+ uint8_t lr_write_wanted; /* writer wants to lock this range */
+ uint8_t lr_read_wanted; /* reader wants to lock this range */
+} locked_range_t;
+
+void rangelock_init(rangelock_t *, rangelock_cb_t *, void *);
+void rangelock_fini(rangelock_t *);
+
+locked_range_t *rangelock_enter(rangelock_t *,
+ uint64_t, uint64_t, rangelock_type_t);
+locked_range_t *rangelock_tryenter(rangelock_t *,
+ uint64_t, uint64_t, rangelock_type_t);
+void rangelock_exit(locked_range_t *);
+void rangelock_reduce(locked_range_t *, uint64_t, uint64_t);
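Callers bracket file I/O with enter/exit; a minimal write-path sketch (the z_rangelock member lives in the znode, as declared in zfs_znode.h below):

	static void
	write_locked(znode_t *zp, uint64_t off, uint64_t len)
	{
		locked_range_t *lr;

		lr = rangelock_enter(&zp->z_rangelock, off, len, RL_WRITER);
		/* ... modify the data; extend zp->z_size under the lock ... */
		rangelock_exit(lr);
	}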
+
+#ifdef __cplusplus
+}
+#endif
+
+#endif /* _SYS_FS_ZFS_RLOCK_H */
diff --git a/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/zfs_sa.h b/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/zfs_sa.h
new file mode 100644
index 000000000000..fc40b0e9517c
--- /dev/null
+++ b/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/zfs_sa.h
@@ -0,0 +1,142 @@
+/*
+ * CDDL HEADER START
+ *
+ * The contents of this file are subject to the terms of the
+ * Common Development and Distribution License (the "License").
+ * You may not use this file except in compliance with the License.
+ *
+ * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
+ * or http://www.opensolaris.org/os/licensing.
+ * See the License for the specific language governing permissions
+ * and limitations under the License.
+ *
+ * When distributing Covered Code, include this CDDL HEADER in each
+ * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
+ * If applicable, add the following below this CDDL HEADER, with the
+ * fields enclosed by brackets "[]" replaced with your own identifying
+ * information: Portions Copyright [yyyy] [name of copyright owner]
+ *
+ * CDDL HEADER END
+ */
+/*
+ * Copyright 2010 Sun Microsystems, Inc. All rights reserved.
+ * Use is subject to license terms.
+ */
+
+#ifndef _SYS_ZFS_SA_H
+#define _SYS_ZFS_SA_H
+
+#ifdef _KERNEL
+#include <sys/list.h>
+#include <sys/dmu.h>
+#include <sys/zfs_acl.h>
+#include <sys/zfs_znode.h>
+#include <sys/sa.h>
+#include <sys/zil.h>
+
+
+#endif
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+/*
+ * This is the list of attributes known to the ZPL.
+ * The values of the actual attributes are not defined
+ * by the order of the enums; they are controlled by the
+ * attribute registration mechanism. Two different file
+ * systems could have different numeric values for the
+ * same attributes. This list is only used for indexing
+ * into the table that holds the actual numeric values.
+ */
+typedef enum zpl_attr {
+ ZPL_ATIME,
+ ZPL_MTIME,
+ ZPL_CTIME,
+ ZPL_CRTIME,
+ ZPL_GEN,
+ ZPL_MODE,
+ ZPL_SIZE,
+ ZPL_PARENT,
+ ZPL_LINKS,
+ ZPL_XATTR,
+ ZPL_RDEV,
+ ZPL_FLAGS,
+ ZPL_UID,
+ ZPL_GID,
+ ZPL_PAD,
+ ZPL_ZNODE_ACL,
+ ZPL_DACL_COUNT,
+ ZPL_SYMLINK,
+ ZPL_SCANSTAMP,
+ ZPL_DACL_ACES,
+ ZPL_END
+} zpl_attr_t;
+
+#define ZFS_OLD_ZNODE_PHYS_SIZE 0x108
+#define ZFS_SA_BASE_ATTR_SIZE (ZFS_OLD_ZNODE_PHYS_SIZE - \
+ sizeof (zfs_acl_phys_t))
+
+#define SA_MODE_OFFSET 0
+#define SA_SIZE_OFFSET 8
+#define SA_GEN_OFFSET 16
+#define SA_UID_OFFSET 24
+#define SA_GID_OFFSET 32
+#define SA_PARENT_OFFSET 40
+
+extern sa_attr_reg_t zfs_attr_table[ZPL_END + 1];
+extern sa_attr_reg_t zfs_legacy_attr_table[ZPL_END + 1];
+
+/*
+ * This is a deprecated data structure that only exists for
+ * dealing with file systems created prior to ZPL version 5.
+ */
+typedef struct znode_phys {
+ uint64_t zp_atime[2]; /* 0 - last file access time */
+ uint64_t zp_mtime[2]; /* 16 - last file modification time */
+ uint64_t zp_ctime[2]; /* 32 - last file change time */
+ uint64_t zp_crtime[2]; /* 48 - creation time */
+ uint64_t zp_gen; /* 64 - generation (txg of creation) */
+ uint64_t zp_mode; /* 72 - file mode bits */
+ uint64_t zp_size; /* 80 - size of file */
+ uint64_t zp_parent; /* 88 - directory parent (`..') */
+ uint64_t zp_links; /* 96 - number of links to file */
+ uint64_t zp_xattr; /* 104 - DMU object for xattrs */
+ uint64_t zp_rdev; /* 112 - dev_t for VBLK & VCHR files */
+ uint64_t zp_flags; /* 120 - persistent flags */
+ uint64_t zp_uid; /* 128 - file owner */
+ uint64_t zp_gid; /* 136 - owning group */
+ uint64_t zp_zap; /* 144 - extra attributes */
+ uint64_t zp_pad[3]; /* 152 - future */
+ zfs_acl_phys_t zp_acl; /* 176 - 263 ACL */
+ /*
+ * Data may pad out any remaining bytes in the znode buffer, eg:
+ *
+ * |<---------------------- dnode_phys (512) ------------------------>|
+ * |<-- dnode (192) --->|<----------- "bonus" buffer (320) ---------->|
+ * |<---- znode (264) ---->|<---- data (56) ---->|
+ *
+ * At present, we use this space for the following:
+ * - symbolic links
+ * - 32-byte anti-virus scanstamp (regular files only)
+ */
+} znode_phys_t;
+
+#ifdef _KERNEL
+int zfs_sa_readlink(struct znode *, uio_t *);
+void zfs_sa_symlink(struct znode *, char *link, int len, dmu_tx_t *);
+void zfs_sa_upgrade(struct sa_handle *, dmu_tx_t *);
+void zfs_sa_get_scanstamp(struct znode *, xvattr_t *);
+void zfs_sa_set_scanstamp(struct znode *, xvattr_t *, dmu_tx_t *);
+void zfs_sa_uprade_pre(struct sa_handle *, void *, dmu_tx_t *);
+void zfs_sa_upgrade_post(struct sa_handle *, void *, dmu_tx_t *);
+void zfs_sa_upgrade_txholds(dmu_tx_t *, struct znode *);
+#endif
+
+#ifdef __cplusplus
+}
+#endif
+
+#endif /* _SYS_ZFS_SA_H */
diff --git a/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/zfs_stat.h b/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/zfs_stat.h
new file mode 100644
index 000000000000..a8af7ec61ba9
--- /dev/null
+++ b/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/zfs_stat.h
@@ -0,0 +1,55 @@
+/*
+ * CDDL HEADER START
+ *
+ * The contents of this file are subject to the terms of the
+ * Common Development and Distribution License (the "License").
+ * You may not use this file except in compliance with the License.
+ *
+ * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
+ * or http://www.opensolaris.org/os/licensing.
+ * See the License for the specific language governing permissions
+ * and limitations under the License.
+ *
+ * When distributing Covered Code, include this CDDL HEADER in each
+ * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
+ * If applicable, add the following below this CDDL HEADER, with the
+ * fields enclosed by brackets "[]" replaced with your own identifying
+ * information: Portions Copyright [yyyy] [name of copyright owner]
+ *
+ * CDDL HEADER END
+ */
+/*
+ * Copyright (c) 2010, Oracle and/or its affiliates. All rights reserved.
+ */
+
+#ifndef _SYS_FS_ZFS_STAT_H
+#define _SYS_FS_ZFS_STAT_H
+
+#ifdef _KERNEL
+#include <sys/isa_defs.h>
+#include <sys/dmu.h>
+#endif
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+/*
+ * A limited number of ZPL-level stats are retrievable
+ * with an ioctl. zfs diff is the current consumer.
+ */
+typedef struct zfs_stat {
+ uint64_t zs_gen;
+ uint64_t zs_mode;
+ uint64_t zs_links;
+ uint64_t zs_ctime[2];
+} zfs_stat_t;
+
+extern int zfs_obj_to_stats(objset_t *osp, uint64_t obj, zfs_stat_t *sb,
+ char *buf, int len);
+
+#ifdef __cplusplus
+}
+#endif
+
+#endif /* _SYS_FS_ZFS_STAT_H */
diff --git a/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/zfs_vfsops.h b/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/zfs_vfsops.h
new file mode 100644
index 000000000000..8fba5e735da6
--- /dev/null
+++ b/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/zfs_vfsops.h
@@ -0,0 +1,192 @@
+/*
+ * CDDL HEADER START
+ *
+ * The contents of this file are subject to the terms of the
+ * Common Development and Distribution License (the "License").
+ * You may not use this file except in compliance with the License.
+ *
+ * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
+ * or http://www.opensolaris.org/os/licensing.
+ * See the License for the specific language governing permissions
+ * and limitations under the License.
+ *
+ * When distributing Covered Code, include this CDDL HEADER in each
+ * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
+ * If applicable, add the following below this CDDL HEADER, with the
+ * fields enclosed by brackets "[]" replaced with your own identifying
+ * information: Portions Copyright [yyyy] [name of copyright owner]
+ *
+ * CDDL HEADER END
+ */
+/*
+ * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved.
+ * Copyright (c) 2011 Pawel Jakub Dawidek <pawel@dawidek.net>.
+ * All rights reserved.
+ */
+
+#ifndef _SYS_FS_ZFS_VFSOPS_H
+#define _SYS_FS_ZFS_VFSOPS_H
+
+#include <sys/list.h>
+#include <sys/vfs.h>
+#include <sys/zil.h>
+#include <sys/sa.h>
+#include <sys/rrwlock.h>
+#include <sys/zfs_ioctl.h>
+#include <sys/rmlock.h>
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+typedef struct zfsvfs zfsvfs_t;
+struct znode;
+
+struct zfsvfs {
+ vfs_t *z_vfs; /* generic fs struct */
+ zfsvfs_t *z_parent; /* parent fs */
+ objset_t *z_os; /* objset reference */
+ uint64_t z_root; /* id of root znode */
+ uint64_t z_unlinkedobj; /* id of unlinked zapobj */
+ uint64_t z_max_blksz; /* maximum block size for files */
+ uint64_t z_fuid_obj; /* fuid table object number */
+ uint64_t z_fuid_size; /* fuid table size */
+ avl_tree_t z_fuid_idx; /* fuid tree keyed by index */
+ avl_tree_t z_fuid_domain; /* fuid tree keyed by domain */
+ krwlock_t z_fuid_lock; /* fuid lock */
+ boolean_t z_fuid_loaded; /* fuid tables are loaded */
+ boolean_t z_fuid_dirty; /* need to sync fuid table ? */
+ struct zfs_fuid_info *z_fuid_replay; /* fuid info for replay */
+ zilog_t *z_log; /* intent log pointer */
+ uint_t z_acl_mode; /* acl chmod/mode behavior */
+ uint_t z_acl_inherit; /* acl inheritance behavior */
+ zfs_case_t z_case; /* case-sense */
+ boolean_t z_utf8; /* utf8-only */
+ int z_norm; /* normalization flags */
+ boolean_t z_atime; /* enable atimes mount option */
+ boolean_t z_unmounted; /* unmounted */
+ rrmlock_t z_teardown_lock;
+ struct rmslock z_teardown_inactive_lock;
+ list_t z_all_znodes; /* all vnodes in the fs */
+ kmutex_t z_znodes_lock; /* lock for z_all_znodes */
+ struct zfsctl_root *z_ctldir; /* .zfs directory pointer */
+ boolean_t z_show_ctldir; /* expose .zfs in the root dir */
+ boolean_t z_issnap; /* true if this is a snapshot */
+ boolean_t z_vscan; /* virus scan on/off */
+ boolean_t z_use_fuids; /* version allows fuids */
+ boolean_t z_replay; /* set during ZIL replay */
+	boolean_t	z_use_sa;	/* version allows system attributes */
+ boolean_t z_use_namecache;/* make use of FreeBSD name cache */
+ uint64_t z_version; /* ZPL version */
+ uint64_t z_shares_dir; /* hidden shares dir */
+ kmutex_t z_lock;
+ uint64_t z_userquota_obj;
+ uint64_t z_groupquota_obj;
+ uint64_t z_replay_eof; /* New end of file - replay only */
+ sa_attr_type_t *z_attr_table; /* SA attr mapping->id */
+#define ZFS_OBJ_MTX_SZ 64
+ kmutex_t z_hold_mtx[ZFS_OBJ_MTX_SZ]; /* znode hold locks */
+#if defined(__FreeBSD__)
+ struct task z_unlinked_drain_task;
+#endif
+};
+
+#define ZFS_TRYRLOCK_TEARDOWN_INACTIVE(zfsvfs) \
+ rms_try_rlock(&(zfsvfs)->z_teardown_inactive_lock)
+
+#define ZFS_RLOCK_TEARDOWN_INACTIVE(zfsvfs) \
+ rms_rlock(&(zfsvfs)->z_teardown_inactive_lock)
+
+#define ZFS_RUNLOCK_TEARDOWN_INACTIVE(zfsvfs) \
+ rms_runlock(&(zfsvfs)->z_teardown_inactive_lock)
+
+#define ZFS_WLOCK_TEARDOWN_INACTIVE(zfsvfs) \
+ rms_wlock(&(zfsvfs)->z_teardown_inactive_lock)
+
+#define ZFS_WUNLOCK_TEARDOWN_INACTIVE(zfsvfs) \
+ rms_wunlock(&(zfsvfs)->z_teardown_inactive_lock)
+
+#define ZFS_TEARDOWN_INACTIVE_WLOCKED(zfsvfs) \
+ rms_wowned(&(zfsvfs)->z_teardown_inactive_lock)
+
+/*
+ * Normal filesystems (those not under .zfs/snapshot) have a total
+ * file ID size limited to 12 bytes (including the length field) due to
+ * NFSv2 protocol's limitation of 32 bytes for a filehandle. For historical
+ * reasons, this same limit is being imposed by the Solaris NFSv3 implementation
+ * (although the NFSv3 protocol actually permits a maximum of 64 bytes). It
+ * is not possible to expand beyond 12 bytes without abandoning support
+ * of NFSv2.
+ *
+ * For normal filesystems, we partition up the available space as follows:
+ * 2 bytes fid length (required)
+ * 6 bytes object number (48 bits)
+ * 4 bytes generation number (32 bits)
+ *
+ * We reserve only 48 bits for the object number, as this is the limit
+ * currently defined and imposed by the DMU.
+ */
+typedef struct zfid_short {
+ uint16_t zf_len;
+ uint8_t zf_object[6]; /* obj[i] = obj >> (8 * i) */
+ uint8_t zf_gen[4]; /* gen[i] = gen >> (8 * i) */
+} zfid_short_t;
+
+/*
+ * Filesystems under .zfs/snapshot have a total file ID size of 22[*] bytes
+ * (including the length field). This makes files under .zfs/snapshot
+ * accessible by NFSv3 and NFSv4, but not NFSv2.
+ *
+ * For files under .zfs/snapshot, we partition up the available space
+ * as follows:
+ * 2 bytes fid length (required)
+ * 6 bytes object number (48 bits)
+ * 4 bytes generation number (32 bits)
+ * 6 bytes objset id (48 bits)
+ * 4 bytes[**] currently just zero (32 bits)
+ *
+ * We reserve only 48 bits for the object number and objset id, as these are
+ * the limits currently defined and imposed by the DMU.
+ *
+ * [*] 20 bytes on FreeBSD to fit into the size of struct fid.
+ * [**] 2 bytes on FreeBSD for the above reason.
+ */
+typedef struct zfid_long {
+ zfid_short_t z_fid;
+ uint8_t zf_setid[6]; /* obj[i] = obj >> (8 * i) */
+ uint8_t zf_setgen[2]; /* gen[i] = gen >> (8 * i) */
+} zfid_long_t;
+
+#define SHORT_FID_LEN (sizeof (zfid_short_t) - sizeof (uint16_t))
+#define LONG_FID_LEN (sizeof (zfid_long_t) - sizeof (uint16_t))
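The object and generation numbers are stored one byte at a time, as the obj[i] = obj >> (8 * i) comments indicate; a sketch of filling a short fid:

	static void
	fill_short_fid(zfid_short_t *zfid, uint64_t object, uint32_t gen)
	{
		int i;

		zfid->zf_len = SHORT_FID_LEN;
		for (i = 0; i < sizeof (zfid->zf_object); i++)
			zfid->zf_object[i] = (uint8_t)(object >> (8 * i));
		for (i = 0; i < sizeof (zfid->zf_gen); i++)
			zfid->zf_gen[i] = (uint8_t)(gen >> (8 * i));
	}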
+
+extern uint_t zfs_fsyncer_key;
+extern int zfs_super_owner;
+
+extern int zfs_suspend_fs(zfsvfs_t *zfsvfs);
+extern int zfs_resume_fs(zfsvfs_t *zfsvfs, struct dsl_dataset *ds);
+extern int zfs_userspace_one(zfsvfs_t *zfsvfs, zfs_userquota_prop_t type,
+ const char *domain, uint64_t rid, uint64_t *valuep);
+extern int zfs_userspace_many(zfsvfs_t *zfsvfs, zfs_userquota_prop_t type,
+ uint64_t *cookiep, void *vbuf, uint64_t *bufsizep);
+extern int zfs_set_userquota(zfsvfs_t *zfsvfs, zfs_userquota_prop_t type,
+ const char *domain, uint64_t rid, uint64_t quota);
+extern boolean_t zfs_owner_overquota(zfsvfs_t *zfsvfs, struct znode *,
+ boolean_t isgroup);
+extern boolean_t zfs_fuid_overquota(zfsvfs_t *zfsvfs, boolean_t isgroup,
+ uint64_t fuid);
+extern int zfs_set_version(zfsvfs_t *zfsvfs, uint64_t newvers);
+extern int zfsvfs_create(const char *name, zfsvfs_t **zfvp);
+extern int zfsvfs_create_impl(zfsvfs_t **zfvp, zfsvfs_t *zfsvfs, objset_t *os);
+extern void zfsvfs_free(zfsvfs_t *zfsvfs);
+extern int zfs_check_global_label(const char *dsname, const char *hexsl);
+
+#ifdef _KERNEL
+extern void zfsvfs_update_fromname(const char *oldname, const char *newname);
+#endif
+
+#ifdef __cplusplus
+}
+#endif
+
+#endif /* _SYS_FS_ZFS_VFSOPS_H */
diff --git a/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/zfs_znode.h b/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/zfs_znode.h
new file mode 100644
index 000000000000..a95545bda4e1
--- /dev/null
+++ b/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/zfs_znode.h
@@ -0,0 +1,374 @@
+/*
+ * CDDL HEADER START
+ *
+ * The contents of this file are subject to the terms of the
+ * Common Development and Distribution License (the "License").
+ * You may not use this file except in compliance with the License.
+ *
+ * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
+ * or http://www.opensolaris.org/os/licensing.
+ * See the License for the specific language governing permissions
+ * and limitations under the License.
+ *
+ * When distributing Covered Code, include this CDDL HEADER in each
+ * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
+ * If applicable, add the following below this CDDL HEADER, with the
+ * fields enclosed by brackets "[]" replaced with your own identifying
+ * information: Portions Copyright [yyyy] [name of copyright owner]
+ *
+ * CDDL HEADER END
+ */
+/*
+ * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved.
+ * Copyright (c) 2012, 2018 by Delphix. All rights reserved.
+ * Copyright (c) 2014 Integros [integros.com]
+ * Copyright 2016 Nexenta Systems, Inc. All rights reserved.
+ */
+
+#ifndef _SYS_FS_ZFS_ZNODE_H
+#define _SYS_FS_ZFS_ZNODE_H
+
+#ifdef _KERNEL
+#include <sys/list.h>
+#include <sys/dmu.h>
+#include <sys/sa.h>
+#include <sys/zfs_vfsops.h>
+#include <sys/rrwlock.h>
+#include <sys/zfs_sa.h>
+#include <sys/zfs_stat.h>
+#include <sys/zfs_rlock.h>
+#endif
+#include <sys/zfs_acl.h>
+#include <sys/zil.h>
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+/*
+ * Additional file level attributes, that are stored
+ * in the upper half of zp_flags
+ */
+#define ZFS_READONLY 0x0000000100000000
+#define ZFS_HIDDEN 0x0000000200000000
+#define ZFS_SYSTEM 0x0000000400000000
+#define ZFS_ARCHIVE 0x0000000800000000
+#define ZFS_IMMUTABLE 0x0000001000000000
+#define ZFS_NOUNLINK 0x0000002000000000
+#define ZFS_APPENDONLY 0x0000004000000000
+#define ZFS_NODUMP 0x0000008000000000
+#define ZFS_OPAQUE 0x0000010000000000
+#define ZFS_AV_QUARANTINED 0x0000020000000000
+#define ZFS_AV_MODIFIED 0x0000040000000000
+#define ZFS_REPARSE 0x0000080000000000
+#define ZFS_OFFLINE 0x0000100000000000
+#define ZFS_SPARSE 0x0000200000000000
+
+#define ZFS_ATTR_SET(zp, attr, value, pflags, tx) \
+{ \
+ if (value) \
+ pflags |= attr; \
+ else \
+ pflags &= ~attr; \
+ VERIFY(0 == sa_update(zp->z_sa_hdl, SA_ZPL_FLAGS(zp->z_zfsvfs), \
+ &pflags, sizeof (pflags), tx)); \
+}
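For illustration, flipping one of the upper-half attributes on a znode inside an assigned transaction might look like this (a sketch; the caller is assumed to hold the needed SA/transaction state):

	static void
	set_immutable(znode_t *zp, dmu_tx_t *tx)
	{
		uint64_t pflags = zp->z_pflags;

		ZFS_ATTR_SET(zp, ZFS_IMMUTABLE, 1, pflags, tx);
		zp->z_pflags = pflags;	/* keep the cached copy in sync */
	}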
+
+/*
+ * Define special zfs pflags
+ */
+#define ZFS_XATTR 0x1 /* is an extended attribute */
+#define ZFS_INHERIT_ACE 0x2 /* ace has inheritable ACEs */
+#define ZFS_ACL_TRIVIAL 0x4 /* files ACL is trivial */
+#define ZFS_ACL_OBJ_ACE 0x8 /* ACL has CMPLX Object ACE */
+#define ZFS_ACL_PROTECTED 0x10 /* ACL protected */
+#define ZFS_ACL_DEFAULTED 0x20 /* ACL should be defaulted */
+#define ZFS_ACL_AUTO_INHERIT 0x40 /* ACL should be inherited */
+#define ZFS_BONUS_SCANSTAMP 0x80 /* Scanstamp in bonus area */
+#define ZFS_NO_EXECS_DENIED 0x100 /* exec was given to everyone */
+
+#define SA_ZPL_ATIME(z) z->z_attr_table[ZPL_ATIME]
+#define SA_ZPL_MTIME(z) z->z_attr_table[ZPL_MTIME]
+#define SA_ZPL_CTIME(z) z->z_attr_table[ZPL_CTIME]
+#define SA_ZPL_CRTIME(z) z->z_attr_table[ZPL_CRTIME]
+#define SA_ZPL_GEN(z) z->z_attr_table[ZPL_GEN]
+#define SA_ZPL_DACL_ACES(z) z->z_attr_table[ZPL_DACL_ACES]
+#define SA_ZPL_XATTR(z) z->z_attr_table[ZPL_XATTR]
+#define SA_ZPL_SYMLINK(z) z->z_attr_table[ZPL_SYMLINK]
+#define SA_ZPL_RDEV(z) z->z_attr_table[ZPL_RDEV]
+#define SA_ZPL_SCANSTAMP(z) z->z_attr_table[ZPL_SCANSTAMP]
+#define SA_ZPL_UID(z) z->z_attr_table[ZPL_UID]
+#define SA_ZPL_GID(z) z->z_attr_table[ZPL_GID]
+#define SA_ZPL_PARENT(z) z->z_attr_table[ZPL_PARENT]
+#define SA_ZPL_LINKS(z) z->z_attr_table[ZPL_LINKS]
+#define SA_ZPL_MODE(z) z->z_attr_table[ZPL_MODE]
+#define SA_ZPL_DACL_COUNT(z) z->z_attr_table[ZPL_DACL_COUNT]
+#define SA_ZPL_FLAGS(z) z->z_attr_table[ZPL_FLAGS]
+#define SA_ZPL_SIZE(z) z->z_attr_table[ZPL_SIZE]
+#define SA_ZPL_ZNODE_ACL(z) z->z_attr_table[ZPL_ZNODE_ACL]
+#define SA_ZPL_PAD(z) z->z_attr_table[ZPL_PAD]
+
+/*
+ * Is ID ephemeral?
+ */
+#define IS_EPHEMERAL(x) (x > MAXUID)
+
+/*
+ * Should we use FUIDs?
+ */
+#define USE_FUIDS(version, os) (version >= ZPL_VERSION_FUID && \
+ spa_version(dmu_objset_spa(os)) >= SPA_VERSION_FUID)
+#define USE_SA(version, os) (version >= ZPL_VERSION_SA && \
+ spa_version(dmu_objset_spa(os)) >= SPA_VERSION_SA)
+
+#define MASTER_NODE_OBJ 1
+
+/*
+ * Special attributes for master node.
+ * "userquota@" and "groupquota@" are also valid (from
+ * zfs_userquota_prop_prefixes[]).
+ */
+#define ZFS_FSID "FSID"
+#define ZFS_UNLINKED_SET "DELETE_QUEUE"
+#define ZFS_ROOT_OBJ "ROOT"
+#define ZPL_VERSION_STR "VERSION"
+#define ZFS_FUID_TABLES "FUID"
+#define ZFS_SHARES_DIR "SHARES"
+#define ZFS_SA_ATTRS "SA_ATTRS"
+
+/*
+ * Convert mode bits (zp_mode) to BSD-style DT_* values for storing in
+ * the directory entries.
+ */
+#ifndef IFTODT
+#define IFTODT(mode) (((mode) & S_IFMT) >> 12)
+#endif
+
+/*
+ * The directory entry has the type (currently unused on Solaris) in the
+ * top 4 bits, and the object number in the low 48 bits. The "middle"
+ * 12 bits are unused.
+ */
+#define ZFS_DIRENT_TYPE(de) BF64_GET(de, 60, 4)
+#define ZFS_DIRENT_OBJ(de) BF64_GET(de, 0, 48)
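A directory ZAP entry value thus carries a DT_* type plus the object number; an illustrative pack (the decode macros above reverse it):

	static uint64_t
	make_dirent(uint64_t obj, mode_t mode)
	{
		/* Type in the top 4 bits, object number in the low 48. */
		return (obj | ((uint64_t)IFTODT(mode) << 60));
	}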
+
+/*
+ * Directory entry locks control access to directory entries.
+ * They are used to protect creates, deletes, and renames.
+ * Each directory znode has a mutex and a list of locked names.
+ */
+#ifdef _KERNEL
+typedef struct zfs_dirlock {
+ char *dl_name; /* directory entry being locked */
+ uint32_t dl_sharecnt; /* 0 if exclusive, > 0 if shared */
+ uint8_t dl_namelock; /* 1 if z_name_lock is NOT held */
+ uint16_t dl_namesize; /* set if dl_name was allocated */
+ kcondvar_t dl_cv; /* wait for entry to be unlocked */
+ struct znode *dl_dzp; /* directory znode */
+ struct zfs_dirlock *dl_next; /* next in z_dirlocks list */
+} zfs_dirlock_t;
+
+typedef struct znode {
+ struct zfsvfs *z_zfsvfs;
+ vnode_t *z_vnode;
+ uint64_t z_id; /* object ID for this znode */
+#ifdef illumos
+ kmutex_t z_lock; /* znode modification lock */
+ krwlock_t z_parent_lock; /* parent lock for directories */
+ krwlock_t z_name_lock; /* "master" lock for dirent locks */
+ zfs_dirlock_t *z_dirlocks; /* directory entry lock list */
+#endif
+ rangelock_t z_rangelock; /* file range locks */
+ uint8_t z_unlinked; /* file has been unlinked */
+ uint8_t z_atime_dirty; /* atime needs to be synced */
+ uint8_t z_zn_prefetch; /* Prefetch znodes? */
+ uint8_t z_moved; /* Has this znode been moved? */
+ uint_t z_blksz; /* block size in bytes */
+ uint_t z_seq; /* modification sequence number */
+ uint64_t z_mapcnt; /* number of pages mapped to file */
+ uint64_t z_dnodesize; /* dnode size */
+ uint64_t z_gen; /* generation (cached) */
+ uint64_t z_size; /* file size (cached) */
+ uint64_t z_atime[2]; /* atime (cached) */
+ uint64_t z_links; /* file links (cached) */
+ uint64_t z_pflags; /* pflags (cached) */
+ uint64_t z_uid; /* uid fuid (cached) */
+ uint64_t z_gid; /* gid fuid (cached) */
+ mode_t z_mode; /* mode (cached) */
+ uint32_t z_sync_cnt; /* synchronous open count */
+ kmutex_t z_acl_lock; /* acl data lock */
+ zfs_acl_t *z_acl_cached; /* cached acl */
+ list_node_t z_link_node; /* all znodes in fs link */
+ sa_handle_t *z_sa_hdl; /* handle to sa data */
+ boolean_t z_is_sa; /* are we native sa? */
+} znode_t;
+
+#define ZFS_LINK_MAX UINT64_MAX
+
+/*
+ * Range locking rules
+ * --------------------
+ * 1. When truncating a file (zfs_create, zfs_setattr, zfs_space) the whole
+ * file range needs to be locked as RL_WRITER. Only then can the pages be
+ * freed etc and zp_size reset. zp_size must be set within range lock.
+ * 2. For writes and punching holes (zfs_write & zfs_space) just the range
+ * being written or freed needs to be locked as RL_WRITER.
+ * Multiple writes at the end of the file must coordinate zp_size updates
+ * to ensure data isn't lost. A compare and swap loop is currently used
+ * to ensure the file size is at least the offset last written.
+ * 3. For reads (zfs_read, zfs_get_data & zfs_putapage) just the range being
+ * read needs to be locked as RL_READER. A check against zp_size can then
+ * be made for reading beyond end of file.
+ */
+
+/*
+ * Convert between znode pointers and vnode pointers
+ */
+#ifdef DEBUG
+static __inline vnode_t *
+ZTOV(znode_t *zp)
+{
+ vnode_t *vp = zp->z_vnode;
+
+ ASSERT(vp != NULL && vp->v_data == zp);
+ return (vp);
+}
+static __inline znode_t *
+VTOZ(vnode_t *vp)
+{
+ znode_t *zp = (znode_t *)vp->v_data;
+
+ ASSERT(zp != NULL && zp->z_vnode == vp);
+ return (zp);
+}
+#else
+#define ZTOV(ZP) ((ZP)->z_vnode)
+#define VTOZ(VP) ((znode_t *)(VP)->v_data)
+#endif
+
+#define VTOZ_SMR(VP) ((znode_t *)vn_load_v_data_smr(VP))
+
+/* Called on entry to each ZFS vnode and vfs operation */
+#define ZFS_ENTER(zfsvfs) \
+ { \
+ rrm_enter_read(&(zfsvfs)->z_teardown_lock, FTAG); \
+ if ((zfsvfs)->z_unmounted) { \
+ ZFS_EXIT(zfsvfs); \
+ return (EIO); \
+ } \
+ }
+
+/* Must be called before exiting the vop */
+#define ZFS_EXIT(zfsvfs) rrm_exit(&(zfsvfs)->z_teardown_lock, FTAG)
+
+/* Verifies the znode is valid */
+#define ZFS_VERIFY_ZP(zp) \
+ if ((zp)->z_sa_hdl == NULL) { \
+ ZFS_EXIT((zp)->z_zfsvfs); \
+ return (EIO); \
+ } \
+
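A typical vnode operation brackets its body with these macros; a skeleton sketch:

	static int
	zfs_example_vop(vnode_t *vp)
	{
		znode_t *zp = VTOZ(vp);
		zfsvfs_t *zfsvfs = zp->z_zfsvfs;

		ZFS_ENTER(zfsvfs);	/* returns EIO if the fs is unmounted */
		ZFS_VERIFY_ZP(zp);	/* returns EIO if the znode is stale */

		/* ... the real work goes here ... */

		ZFS_EXIT(zfsvfs);
		return (0);
	}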
+/*
+ * Macros for dealing with dmu_buf_hold
+ */
+#define ZFS_OBJ_HASH(obj_num) ((obj_num) & (ZFS_OBJ_MTX_SZ - 1))
+#define ZFS_OBJ_MUTEX(zfsvfs, obj_num) \
+ (&(zfsvfs)->z_hold_mtx[ZFS_OBJ_HASH(obj_num)])
+#define ZFS_OBJ_HOLD_ENTER(zfsvfs, obj_num) \
+ mutex_enter(ZFS_OBJ_MUTEX((zfsvfs), (obj_num)))
+#define ZFS_OBJ_HOLD_TRYENTER(zfsvfs, obj_num) \
+ mutex_tryenter(ZFS_OBJ_MUTEX((zfsvfs), (obj_num)))
+#define ZFS_OBJ_HOLD_EXIT(zfsvfs, obj_num) \
+ mutex_exit(ZFS_OBJ_MUTEX((zfsvfs), (obj_num)))
+
+/* Encode ZFS stored time values from a struct timespec */
+#define ZFS_TIME_ENCODE(tp, stmp) \
+{ \
+ (stmp)[0] = (uint64_t)(tp)->tv_sec; \
+ (stmp)[1] = (uint64_t)(tp)->tv_nsec; \
+}
+
+/* Decode ZFS stored time values to a struct timespec */
+#define ZFS_TIME_DECODE(tp, stmp) \
+{ \
+ (tp)->tv_sec = (time_t)(stmp)[0]; \
+ (tp)->tv_nsec = (long)(stmp)[1]; \
+}
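For illustration, encoding the current time into the on-disk pair before an sa_update() of a time attribute (gethrestime() is the kernel wall-clock helper):

	timespec_t now;
	uint64_t stmp[2];

	gethrestime(&now);
	ZFS_TIME_ENCODE(&now, stmp);	/* [0] = seconds, [1] = nanoseconds */
	/* ZFS_TIME_DECODE(&now, stmp) performs the reverse conversion. */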
+
+/*
+ * Timestamp defines
+ */
+#define ACCESSED (AT_ATIME)
+#define STATE_CHANGED (AT_CTIME)
+#define CONTENT_MODIFIED (AT_MTIME | AT_CTIME)
+
+#define ZFS_ACCESSTIME_STAMP(zfsvfs, zp) \
+ if ((zfsvfs)->z_atime && !((zfsvfs)->z_vfs->vfs_flag & VFS_RDONLY)) \
+ zfs_tstamp_update_setup(zp, ACCESSED, NULL, NULL, B_FALSE);
+
+extern int zfs_init_fs(zfsvfs_t *, znode_t **);
+extern void zfs_set_dataprop(objset_t *);
+extern void zfs_create_fs(objset_t *os, cred_t *cr, nvlist_t *,
+ dmu_tx_t *tx);
+extern void zfs_tstamp_update_setup(znode_t *, uint_t, uint64_t [2],
+ uint64_t [2], boolean_t);
+extern void zfs_grow_blocksize(znode_t *, uint64_t, dmu_tx_t *);
+extern int zfs_freesp(znode_t *, uint64_t, uint64_t, int, boolean_t);
+extern void zfs_znode_init(void);
+extern void zfs_znode_fini(void);
+extern int zfs_zget(zfsvfs_t *, uint64_t, znode_t **);
+extern int zfs_rezget(znode_t *);
+extern void zfs_zinactive(znode_t *);
+extern void zfs_znode_delete(znode_t *, dmu_tx_t *);
+extern void zfs_znode_free(znode_t *);
+extern void zfs_remove_op_tables();
+extern int zfs_create_op_tables();
+extern dev_t zfs_cmpldev(uint64_t);
+extern int zfs_get_zplprop(objset_t *os, zfs_prop_t prop, uint64_t *value);
+extern int zfs_get_stats(objset_t *os, nvlist_t *nv);
+extern boolean_t zfs_get_vfs_flag_unmounted(objset_t *os);
+extern void zfs_znode_dmu_fini(znode_t *);
+
+extern void zfs_log_create(zilog_t *zilog, dmu_tx_t *tx, uint64_t txtype,
+ znode_t *dzp, znode_t *zp, char *name, vsecattr_t *, zfs_fuid_info_t *,
+ vattr_t *vap);
+extern int zfs_log_create_txtype(zil_create_t, vsecattr_t *vsecp,
+ vattr_t *vap);
+extern void zfs_log_remove(zilog_t *zilog, dmu_tx_t *tx, uint64_t txtype,
+ znode_t *dzp, char *name, uint64_t foid);
+#define ZFS_NO_OBJECT 0 /* no object id */
+extern void zfs_log_link(zilog_t *zilog, dmu_tx_t *tx, uint64_t txtype,
+ znode_t *dzp, znode_t *zp, char *name);
+extern void zfs_log_symlink(zilog_t *zilog, dmu_tx_t *tx, uint64_t txtype,
+ znode_t *dzp, znode_t *zp, char *name, char *link);
+extern void zfs_log_rename(zilog_t *zilog, dmu_tx_t *tx, uint64_t txtype,
+ znode_t *sdzp, char *sname, znode_t *tdzp, char *dname, znode_t *szp);
+extern void zfs_log_write(zilog_t *zilog, dmu_tx_t *tx, int txtype,
+ znode_t *zp, offset_t off, ssize_t len, int ioflag);
+extern void zfs_log_truncate(zilog_t *zilog, dmu_tx_t *tx, int txtype,
+ znode_t *zp, uint64_t off, uint64_t len);
+extern void zfs_log_setattr(zilog_t *zilog, dmu_tx_t *tx, int txtype,
+ znode_t *zp, vattr_t *vap, uint_t mask_applied, zfs_fuid_info_t *fuidp);
+#ifndef ZFS_NO_ACL
+extern void zfs_log_acl(zilog_t *zilog, dmu_tx_t *tx, znode_t *zp,
+ vsecattr_t *vsecp, zfs_fuid_info_t *fuidp);
+#endif
+extern void zfs_xvattr_set(znode_t *zp, xvattr_t *xvap, dmu_tx_t *tx);
+extern void zfs_upgrade(zfsvfs_t *zfsvfs, dmu_tx_t *tx);
+extern int zfs_create_share_dir(zfsvfs_t *zfsvfs, dmu_tx_t *tx);
+
+extern zil_get_data_t zfs_get_data;
+extern zil_replay_func_t *zfs_replay_vector[TX_MAX_TYPE];
+extern int zfsfstype;
+
+extern int zfs_znode_parent_and_name(znode_t *zp, znode_t **dzpp, char *buf);
+
+#endif /* _KERNEL */
+
+extern int zfs_obj_to_path(objset_t *osp, uint64_t obj, char *buf, int len);
+
+#ifdef __cplusplus
+}
+#endif
+
+#endif /* _SYS_FS_ZFS_ZNODE_H */
diff --git a/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/zil.h b/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/zil.h
new file mode 100644
index 000000000000..a27a9547ac43
--- /dev/null
+++ b/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/zil.h
@@ -0,0 +1,464 @@
+/*
+ * CDDL HEADER START
+ *
+ * The contents of this file are subject to the terms of the
+ * Common Development and Distribution License (the "License").
+ * You may not use this file except in compliance with the License.
+ *
+ * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
+ * or http://www.opensolaris.org/os/licensing.
+ * See the License for the specific language governing permissions
+ * and limitations under the License.
+ *
+ * When distributing Covered Code, include this CDDL HEADER in each
+ * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
+ * If applicable, add the following below this CDDL HEADER, with the
+ * fields enclosed by brackets "[]" replaced with your own identifying
+ * information: Portions Copyright [yyyy] [name of copyright owner]
+ *
+ * CDDL HEADER END
+ */
+/*
+ * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved.
+ * Copyright (c) 2012, 2018 by Delphix. All rights reserved.
+ * Copyright (c) 2014 Integros [integros.com]
+ */
+
+/* Portions Copyright 2010 Robert Milkowski */
+
+#ifndef _SYS_ZIL_H
+#define _SYS_ZIL_H
+
+#include <sys/types.h>
+#include <sys/spa.h>
+#include <sys/zio.h>
+#include <sys/dmu.h>
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+struct dsl_pool;
+struct dsl_dataset;
+struct lwb;
+
+/*
+ * Intent log format:
+ *
+ * Each objset has its own intent log. The log header (zil_header_t)
+ * for objset N's intent log is kept in the Nth object of the SPA's
+ * intent_log objset. The log header points to a chain of log blocks,
+ * each of which contains log records (i.e., transactions) followed by
+ * a log block trailer (zil_trailer_t). The format of a log record
+ * depends on the record (or transaction) type, but all records begin
+ * with a common structure that defines the type, length, and txg.
+ */
+
+/*
+ * Intent log header - this on-disk structure holds fields to manage
+ * the log. All fields are 64 bit so the layout is identical across
+ * architectures.
+ */
+typedef struct zil_header {
+ uint64_t zh_claim_txg; /* txg in which log blocks were claimed */
+ uint64_t zh_replay_seq; /* highest replayed sequence number */
+ blkptr_t zh_log; /* log chain */
+ uint64_t zh_claim_blk_seq; /* highest claimed block sequence number */
+ uint64_t zh_flags; /* header flags */
+ uint64_t zh_claim_lr_seq; /* highest claimed lr sequence number */
+ uint64_t zh_pad[3];
+} zil_header_t;
+
+/*
+ * zh_flags bit settings
+ */
+#define ZIL_REPLAY_NEEDED 0x1 /* replay needed - internal only */
+#define ZIL_CLAIM_LR_SEQ_VALID 0x2 /* zh_claim_lr_seq field is valid */
+
+/*
+ * Log block chaining.
+ *
+ * Log blocks are chained together. Originally they were chained at the
+ * end of the block. For performance reasons the chain was moved to the
+ * beginning of the block, which allows writing only the data being used.
+ * The older position is supported for backward compatibility.
+ *
+ * The zio_eck_t contains a zec_cksum which for the intent log is
+ * the sequence number of this log block. A seq of 0 is invalid.
+ * The zec_cksum is checked by the SPA against the sequence
+ * number passed in the blk_cksum field of the blkptr_t.
+ */
+typedef struct zil_chain {
+ uint64_t zc_pad;
+ blkptr_t zc_next_blk; /* next block in chain */
+ uint64_t zc_nused; /* bytes in log block used */
+ zio_eck_t zc_eck; /* block trailer */
+} zil_chain_t;
+
+#define ZIL_MIN_BLKSZ 4096ULL
+
+/*
+ * ziltest is by and large an ugly hack, but very useful in
+ * checking replay without tedious work.
+ * When running ziltest we want to keep all itx's and so maintain
+ * a single list in the zl_itxg[] that uses a high txg: ZILTEST_TXG.
+ * We subtract TXG_CONCURRENT_STATES to allow for common code.
+ */
+#define ZILTEST_TXG (UINT64_MAX - TXG_CONCURRENT_STATES)
+
+/*
+ * The words of a log block checksum.
+ */
+#define ZIL_ZC_GUID_0 0
+#define ZIL_ZC_GUID_1 1
+#define ZIL_ZC_OBJSET 2
+#define ZIL_ZC_SEQ 3
+
+typedef enum zil_create {
+ Z_FILE,
+ Z_DIR,
+ Z_XATTRDIR,
+} zil_create_t;
+
+/*
+ * Size of the xvattr log section.
+ * It is composed of an lr_attr_t + the xvattr bitmap + two 64-bit
+ * timestamps for the creation time, a single 64-bit integer for all
+ * of the attributes, and four 64-bit integers (32 bytes) for the
+ * scanstamp.
+ */
+
+#define ZIL_XVAT_SIZE(mapsize) \
+ sizeof (lr_attr_t) + (sizeof (uint32_t) * (mapsize - 1)) + \
+ (sizeof (uint64_t) * 7)
+
+/*
+ * Size of ACL in log. The ACE data is padded out to properly align
+ * on 8 byte boundary.
+ */
+
+#define ZIL_ACE_LENGTH(x) (roundup(x, sizeof (uint64_t)))
+
+/*
+ * Intent log transaction types and record structures
+ */
+#define TX_COMMIT 0 /* Commit marker (no on-disk state) */
+#define TX_CREATE 1 /* Create file */
+#define TX_MKDIR 2 /* Make directory */
+#define TX_MKXATTR 3 /* Make XATTR directory */
+#define TX_SYMLINK 4 /* Create symbolic link to a file */
+#define TX_REMOVE 5 /* Remove file */
+#define TX_RMDIR 6 /* Remove directory */
+#define TX_LINK 7 /* Create hard link to a file */
+#define TX_RENAME 8 /* Rename a file */
+#define TX_WRITE 9 /* File write */
+#define TX_TRUNCATE 10 /* Truncate a file */
+#define TX_SETATTR 11 /* Set file attributes */
+#define TX_ACL_V0 12 /* Set old formatted ACL */
+#define TX_ACL 13 /* Set ACL */
+#define TX_CREATE_ACL 14 /* create with ACL */
+#define TX_CREATE_ATTR 15 /* create + attrs */
+#define TX_CREATE_ACL_ATTR 16 /* create with ACL + attrs */
+#define TX_MKDIR_ACL 17 /* mkdir with ACL */
+#define TX_MKDIR_ATTR 18 /* mkdir with attr */
+#define TX_MKDIR_ACL_ATTR 19 /* mkdir with ACL + attrs */
+#define TX_WRITE2 20 /* dmu_sync EALREADY write */
+#define TX_MAX_TYPE 21 /* Max transaction type */
+
+/*
+ * The transactions for mkdir, symlink, remove, rmdir, link, and rename
+ * may have the following bit set, indicating the original request
+ * specified case-insensitive handling of names.
+ */
+#define TX_CI ((uint64_t)0x1 << 63) /* case-insensitive behavior requested */
+
+/*
+ * Transactions for write, truncate, setattr, acl_v0, and acl can be logged
+ * out of order. For convenience in the code, all such records must have
+ * lr_foid at the same offset.
+ */
+#define TX_OOO(txtype) \
+ ((txtype) == TX_WRITE || \
+ (txtype) == TX_TRUNCATE || \
+ (txtype) == TX_SETATTR || \
+ (txtype) == TX_ACL_V0 || \
+ (txtype) == TX_ACL || \
+ (txtype) == TX_WRITE2)
+
+/*
+ * The number of dnode slots consumed by the object is stored in the 8
+ * unused upper bits of the object ID. We subtract 1 from the value
+ * stored on disk for compatibility with implementations that don't
+ * support large dnodes. The slot count for a single-slot dnode will
+ * contain 0 for those bits to preserve the log record format for
+ * "small" dnodes.
+ */
+#define LR_FOID_GET_SLOTS(oid) (BF64_GET((oid), 56, 8) + 1)
+#define LR_FOID_SET_SLOTS(oid, x) BF64_SET((oid), 56, 8, (x) - 1)
+#define LR_FOID_GET_OBJ(oid) BF64_GET((oid), 0, DN_MAX_OBJECT_SHIFT)
+#define LR_FOID_SET_OBJ(oid, x) BF64_SET((oid), 0, DN_MAX_OBJECT_SHIFT, (x))
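A sketch of encoding a multi-slot dnode's object id into lr_foid (per the comment above, a 2-slot dnode is stored as 1 in the upper bits; the GET macros reverse the encoding):

	static uint64_t
	encode_foid(uint64_t object)
	{
		uint64_t foid = 0;

		LR_FOID_SET_OBJ(foid, object);	/* low DN_MAX_OBJECT_SHIFT bits */
		LR_FOID_SET_SLOTS(foid, 2);	/* a 2-slot dnode, stored as 1 */
		return (foid);
	}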
+
+/*
+ * Format of log records.
+ * The fields are carefully defined to allow them to be aligned
+ * and sized the same on sparc & intel architectures.
+ * Each log record has a common structure at the beginning.
+ *
+ * Each log record on disk carries a sequence number (lrc_seq),
+ * which is used to ensure we don't replay the same record twice.
+ */
+typedef struct { /* common log record header */
+ uint64_t lrc_txtype; /* intent log transaction type */
+ uint64_t lrc_reclen; /* transaction record length */
+ uint64_t lrc_txg; /* dmu transaction group number */
+ uint64_t lrc_seq; /* see comment above */
+} lr_t;
+
+/*
+ * Common start of all out-of-order record types (TX_OOO() above).
+ */
+typedef struct {
+ lr_t lr_common; /* common portion of log record */
+ uint64_t lr_foid; /* object id */
+} lr_ooo_t;
+
+/*
+ * Handle optional extended vattr attributes.
+ *
+ * Whenever new attributes are added, the version number
+ * will need to be updated, as will the code in
+ * zfs_log.c and zfs_replay.c.
+ */
+typedef struct {
+ uint32_t lr_attr_masksize; /* number of elements in array */
+ uint32_t lr_attr_bitmap; /* First entry of array */
+ /* remainder of array and any additional fields */
+} lr_attr_t;
+
+/*
+ * Log record for creates without optional ACL.
+ * This log record does support optional xvattr_t attributes.
+ */
+typedef struct {
+ lr_t lr_common; /* common portion of log record */
+ uint64_t lr_doid; /* object id of directory */
+ uint64_t lr_foid; /* object id of created file object */
+ uint64_t lr_mode; /* mode of object */
+ uint64_t lr_uid; /* uid of object */
+ uint64_t lr_gid; /* gid of object */
+ uint64_t lr_gen; /* generation (txg of creation) */
+ uint64_t lr_crtime[2]; /* creation time */
+ uint64_t lr_rdev; /* rdev of object to create */
+ /* name of object to create follows this */
+ /* for symlinks, link content follows name */
+ /* for creates with xvattr data, the name follows the xvattr info */
+} lr_create_t;
+
+/*
+ * A FUID ACL record is an array of ACEs from the original ACL.
+ * If this array includes ephemeral IDs, the record will also include
+ * an array of log-specific FUIDs to replace the ephemeral IDs.
+ * Only one copy of each unique domain will be present, so the log-specific
+ * FUIDs will use an index into a compressed domain table. On replay this
+ * information will be used to construct real FUIDs (and bypass idmap,
+ * since it may not be available).
+ */
+
+/*
+ * Log record for creates with optional ACL
+ * This log record is also used for recording any FUID
+ * information needed for replaying the create. If the
+ * file doesn't have any actual ACEs, then lr_aclcnt will be zero.
+ *
+ * After lr_acl_flags come lr_acl_bytes bytes of variable-sized ACEs.
+ * If the create is also setting xvattrs, then the ACL data follows the
+ * xvattr data. If ACE FUIDs are needed, they follow the xvattr_t, and
+ * the domain table information follows the FUIDs. The FUIDs for the
+ * owner and group are in lr_create. The name follows the ACL data.
+ */
+typedef struct {
+ lr_create_t lr_create; /* common create portion */
+ uint64_t lr_aclcnt; /* number of ACEs in ACL */
+ uint64_t lr_domcnt; /* number of unique domains */
+ uint64_t lr_fuidcnt; /* number of real fuids */
+ uint64_t lr_acl_bytes; /* number of bytes in ACL */
+ uint64_t lr_acl_flags; /* ACL flags */
+} lr_acl_create_t;
+
+typedef struct {
+ lr_t lr_common; /* common portion of log record */
+ uint64_t lr_doid; /* obj id of directory */
+ /* name of object to remove follows this */
+} lr_remove_t;
+
+typedef struct {
+ lr_t lr_common; /* common portion of log record */
+ uint64_t lr_doid; /* obj id of directory */
+ uint64_t lr_link_obj; /* obj id of link */
+ /* name of object to link follows this */
+} lr_link_t;
+
+typedef struct {
+ lr_t lr_common; /* common portion of log record */
+ uint64_t lr_sdoid; /* obj id of source directory */
+ uint64_t lr_tdoid; /* obj id of target directory */
+ /* 2 strings: names of source and destination follow this */
+} lr_rename_t;
+
+typedef struct {
+ lr_t lr_common; /* common portion of log record */
+ uint64_t lr_foid; /* file object to write */
+ uint64_t lr_offset; /* offset to write to */
+ uint64_t lr_length; /* user data length to write */
+ uint64_t lr_blkoff; /* no longer used */
+ blkptr_t lr_blkptr; /* spa block pointer for replay */
+ /* write data will follow for small writes */
+} lr_write_t;
+
+typedef struct {
+ lr_t lr_common; /* common portion of log record */
+ uint64_t lr_foid; /* object id of file to truncate */
+ uint64_t lr_offset; /* offset to truncate from */
+ uint64_t lr_length; /* length to truncate */
+} lr_truncate_t;
+
+typedef struct {
+ lr_t lr_common; /* common portion of log record */
+ uint64_t lr_foid; /* file object to change attributes */
+ uint64_t lr_mask; /* mask of attributes to set */
+ uint64_t lr_mode; /* mode to set */
+ uint64_t lr_uid; /* uid to set */
+ uint64_t lr_gid; /* gid to set */
+ uint64_t lr_size; /* size to set */
+ uint64_t lr_atime[2]; /* access time */
+ uint64_t lr_mtime[2]; /* modification time */
+ /* optional attribute lr_attr_t may be here */
+} lr_setattr_t;
+
+typedef struct {
+ lr_t lr_common; /* common portion of log record */
+ uint64_t lr_foid; /* obj id of file */
+ uint64_t lr_aclcnt; /* number of acl entries */
+ /* lr_aclcnt number of ace_t entries follow this */
+} lr_acl_v0_t;
+
+typedef struct {
+ lr_t lr_common; /* common portion of log record */
+ uint64_t lr_foid; /* obj id of file */
+ uint64_t lr_aclcnt; /* number of ACEs in ACL */
+ uint64_t lr_domcnt; /* number of unique domains */
+ uint64_t lr_fuidcnt; /* number of real fuids */
+ uint64_t lr_acl_bytes; /* number of bytes in ACL */
+ uint64_t lr_acl_flags; /* ACL flags */
+ /* lr_acl_bytes number of variable sized ace's follows */
+} lr_acl_t;
+
+/*
+ * ZIL structure definitions, interface function prototypes and globals.
+ */
+
+/*
+ * Writes are handled in three different ways:
+ *
+ * WR_INDIRECT:
+ * In this mode, if we need to commit the write later, then the block
+ * is immediately written into the file system (using dmu_sync),
+ * and a pointer to the block is put into the log record.
+ * When the txg commits the block is linked in.
+ * This saves additionally writing the data into the log record.
+ * There are a few requirements for this to occur:
+ * - write is greater than zfs/zvol_immediate_write_sz
+ * - not using slogs (as slogs are assumed to always be faster
+ * than writing into the main pool)
+ * - the write occupies only one block
+ * WR_COPIED:
+ * If we know we'll immediately be committing the
+ * transaction (FSYNC or FDSYNC), then we allocate a larger
+ * log record here for the data and copy the data in.
+ * WR_NEED_COPY:
+ * Otherwise we don't allocate a buffer, and *if* we need to
+ * flush the write later then a buffer is allocated and
+ * we retrieve the data using the dmu.
+ */
+typedef enum {
+ WR_INDIRECT, /* indirect - a large write (dmu_sync() data */
+ /* and put blkptr in log, rather than actual data) */
+ WR_COPIED, /* immediate - data is copied into lr_write_t */
+ WR_NEED_COPY, /* immediate - data needs to be copied if pushed */
+ WR_NUM_STATES /* number of states */
+} itx_wr_state_t;
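+
+/*
+ * A minimal sketch of the selection policy described above, as a log
+ * caller might apply it (the threshold and flag names are illustrative
+ * assumptions; the real policy lives in the ZPL/zvol logging code):
+ */
+#if 0
+	itx_wr_state_t wr_state;
+
+	if (!spa_has_slogs(zilog->zl_spa) &&
+	    resid >= zfs_immediate_write_sz)
+		wr_state = WR_INDIRECT;	/* dmu_sync() data, log the blkptr */
+	else if (ioflag & (FSYNC | FDSYNC))
+		wr_state = WR_COPIED;	/* copy data into the record now */
+	else
+		wr_state = WR_NEED_COPY;	/* fetch via the dmu if needed */
+#endif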
+
+typedef struct itx {
+ list_node_t itx_node; /* linkage on zl_itx_list */
+ void *itx_private; /* type-specific opaque data */
+ itx_wr_state_t itx_wr_state; /* write state */
+ uint8_t itx_sync; /* synchronous transaction */
+ uint64_t itx_oid; /* object id */
+ lr_t itx_lr; /* common part of log record */
+ /* followed by type-specific part of lr_xx_t and its immediate data */
+} itx_t;
+
+typedef int zil_parse_blk_func_t(zilog_t *zilog, blkptr_t *bp, void *arg,
+ uint64_t txg);
+typedef int zil_parse_lr_func_t(zilog_t *zilog, lr_t *lr, void *arg,
+ uint64_t txg);
+typedef int zil_replay_func_t(void *arg1, void *arg2, boolean_t byteswap);
+typedef int zil_get_data_t(void *arg, lr_write_t *lr, char *dbuf,
+ struct lwb *lwb, zio_t *zio);
+
+extern int zil_parse(zilog_t *zilog, zil_parse_blk_func_t *parse_blk_func,
+ zil_parse_lr_func_t *parse_lr_func, void *arg, uint64_t txg);
+
+extern void zil_init(void);
+extern void zil_fini(void);
+
+extern zilog_t *zil_alloc(objset_t *os, zil_header_t *zh_phys);
+extern void zil_free(zilog_t *zilog);
+
+extern zilog_t *zil_open(objset_t *os, zil_get_data_t *get_data);
+extern void zil_close(zilog_t *zilog);
+
+extern void zil_replay(objset_t *os, void *arg,
+ zil_replay_func_t *replay_func[TX_MAX_TYPE]);
+extern boolean_t zil_replaying(zilog_t *zilog, dmu_tx_t *tx);
+extern void zil_destroy(zilog_t *zilog, boolean_t keep_first);
+extern void zil_destroy_sync(zilog_t *zilog, dmu_tx_t *tx);
+extern void zil_rollback_destroy(zilog_t *zilog, dmu_tx_t *tx);
+
+extern itx_t *zil_itx_create(uint64_t txtype, size_t lrsize);
+extern void zil_itx_destroy(itx_t *itx);
+extern void zil_itx_assign(zilog_t *zilog, itx_t *itx, dmu_tx_t *tx);
+
+extern void zil_async_to_sync(zilog_t *zilog, uint64_t oid);
+extern void zil_commit(zilog_t *zilog, uint64_t oid);
+extern void zil_commit_impl(zilog_t *zilog, uint64_t oid);
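+
+/*
+ * A usage sketch of the itx interfaces above: log a truncate and later
+ * force it to stable storage. foid, off, len, zilog, and tx are assumed
+ * to be supplied by the caller (e.g. the ZPL):
+ */
+#if 0
+	itx_t *itx = zil_itx_create(TX_TRUNCATE, sizeof (lr_truncate_t));
+	lr_truncate_t *lr = (lr_truncate_t *)&itx->itx_lr;
+
+	lr->lr_foid = foid;
+	lr->lr_offset = off;
+	lr->lr_length = len;
+	zil_itx_assign(zilog, itx, tx);
+
+	zil_commit(zilog, foid);	/* later, e.g. on fsync() */
+#endif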
+
+extern int zil_reset(const char *osname, void *txarg);
+extern int zil_claim(struct dsl_pool *dp,
+ struct dsl_dataset *ds, void *txarg);
+extern int zil_check_log_chain(struct dsl_pool *dp,
+ struct dsl_dataset *ds, void *tx);
+extern void zil_sync(zilog_t *zilog, dmu_tx_t *tx);
+extern void zil_clean(zilog_t *zilog, uint64_t synced_txg);
+
+extern int zil_suspend(const char *osname, void **cookiep);
+extern void zil_resume(void *cookie);
+
+extern void zil_lwb_add_block(struct lwb *lwb, const blkptr_t *bp);
+extern void zil_lwb_add_txg(struct lwb *lwb, uint64_t txg);
+extern int zil_bp_tree_add(zilog_t *zilog, const blkptr_t *bp);
+
+extern void zil_set_sync(zilog_t *zilog, uint64_t syncval);
+
+extern void zil_set_logbias(zilog_t *zilog, uint64_t slogval);
+
+extern uint64_t zil_max_copied_data(zilog_t *zilog);
+extern uint64_t zil_max_log_data(zilog_t *zilog);
+
+extern int zil_replay_disable;
+
+#ifdef __cplusplus
+}
+#endif
+
+#endif /* _SYS_ZIL_H */
diff --git a/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/zil_impl.h b/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/zil_impl.h
new file mode 100644
index 000000000000..a19ba970574f
--- /dev/null
+++ b/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/zil_impl.h
@@ -0,0 +1,229 @@
+/*
+ * CDDL HEADER START
+ *
+ * The contents of this file are subject to the terms of the
+ * Common Development and Distribution License (the "License").
+ * You may not use this file except in compliance with the License.
+ *
+ * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
+ * or http://www.opensolaris.org/os/licensing.
+ * See the License for the specific language governing permissions
+ * and limitations under the License.
+ *
+ * When distributing Covered Code, include this CDDL HEADER in each
+ * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
+ * If applicable, add the following below this CDDL HEADER, with the
+ * fields enclosed by brackets "[]" replaced with your own identifying
+ * information: Portions Copyright [yyyy] [name of copyright owner]
+ *
+ * CDDL HEADER END
+ */
+/*
+ * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved.
+ * Copyright (c) 2012, 2018 by Delphix. All rights reserved.
+ * Copyright (c) 2014 Integros [integros.com]
+ */
+
+/* Portions Copyright 2010 Robert Milkowski */
+
+#ifndef _SYS_ZIL_IMPL_H
+#define _SYS_ZIL_IMPL_H
+
+#include <sys/zil.h>
+#include <sys/dmu_objset.h>
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+/*
+ * Possible states for a given lwb structure.
+ *
+ * An lwb will start out in the "closed" state, and then transition to
+ * the "opened" state via a call to zil_lwb_write_open(). When
+ * transitioning from "closed" to "opened" the zilog's "zl_issuer_lock"
+ * must be held.
+ *
+ * After the lwb is "opened", it can transition into the "issued" state
+ * via zil_lwb_write_issue(). Again, the zilog's "zl_issuer_lock" must
+ * be held when making this transition.
+ *
+ * After the lwb's write zio completes, it transitions into the "write
+ * done" state via zil_lwb_write_done(); and then into the "flush done"
+ * state via zil_lwb_flush_vdevs_done(). When transitioning from
+ * "issued" to "write done", and then from "write done" to "flush done",
+ * the zilog's "zl_lock" must be held, *not* the "zl_issuer_lock".
+ *
+ * The zilog's "zl_issuer_lock" can become heavily contended in certain
+ * workloads, so we specifically avoid acquiring that lock when
+ * transitioning an lwb from "issued" to "write done" and "flush done".
+ * This lets us avoid acquiring the "zl_issuer_lock" for each lwb ZIO
+ * completion, which would only add contention on an already heavily
+ * contended lock.
+ *
+ * Additionally, correctness when reading an lwb's state is often
+ * achieved by exploiting the fact that these state transitions occur in
+ * this specific order; i.e. "closed" to "opened" to "issued" to
+ * "write done" to "flush done".
+ *
+ * Thus, if an lwb is in the "closed" or "opened" state, holding the
+ * "zl_issuer_lock" will prevent a concurrent thread from transitioning
+ * that lwb to the "issued" state. Likewise, if an lwb is already in the
+ * "issued" state, holding the "zl_lock" will prevent a concurrent
+ * thread from transitioning that lwb to the "write done" state.
+ */
+typedef enum {
+ LWB_STATE_CLOSED,
+ LWB_STATE_OPENED,
+ LWB_STATE_ISSUED,
+ LWB_STATE_WRITE_DONE,
+ LWB_STATE_FLUSH_DONE,
+ LWB_NUM_STATES
+} lwb_state_t;
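+
+/*
+ * Because the states above are declared in transition order, a state
+ * change can be sanity-checked with a simple comparison; a sketch of a
+ * hypothetical helper (not part of this header):
+ */
+#if 0
+	ASSERT3S(new_state, >, lwb->lwb_state);	/* transitions move forward */
+	lwb->lwb_state = new_state;
+#endif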
+
+/*
+ * Log write block (lwb)
+ *
+ * Prior to an lwb being issued to disk via zil_lwb_write_issue(), it
+ * will be protected by the zilog's "zl_issuer_lock". Basically, prior
+ * to it being issued, it will only be accessed by the thread that's
+ * holding the "zl_issuer_lock". After the lwb is issued, the zilog's
+ * "zl_lock" is used to protect the lwb against concurrent access.
+ */
+typedef struct lwb {
+ zilog_t *lwb_zilog; /* back pointer to log struct */
+ blkptr_t lwb_blk; /* on disk address of this log blk */
+ boolean_t lwb_slog; /* lwb_blk is on SLOG device */
+ int lwb_nused; /* # used bytes in buffer */
+ int lwb_sz; /* size of block and buffer */
+ lwb_state_t lwb_state; /* the state of this lwb */
+ char *lwb_buf; /* log write buffer */
+ zio_t *lwb_write_zio; /* zio for the lwb buffer */
+ zio_t *lwb_root_zio; /* root zio for lwb write and flushes */
+ dmu_tx_t *lwb_tx; /* tx for log block allocation */
+ uint64_t lwb_max_txg; /* highest txg in this lwb */
+ list_node_t lwb_node; /* zilog->zl_lwb_list linkage */
+ list_t lwb_waiters; /* list of zil_commit_waiter's */
+ avl_tree_t lwb_vdev_tree; /* vdevs to flush after lwb write */
+ kmutex_t lwb_vdev_lock; /* protects lwb_vdev_tree */
+ hrtime_t lwb_issued_timestamp; /* when was the lwb issued? */
+} lwb_t;
+
+/*
+ * ZIL commit waiter.
+ *
+ * This structure is allocated each time zil_commit() is called, and is
+ * used by zil_commit() to communicate with other parts of the ZIL, so
+ * that zil_commit() knows when it is safe to return. For more
+ * details, see the comment above zil_commit().
+ *
+ * The "zcw_lock" field is used to protect the commit waiter against
+ * concurrent access. This lock is often acquired while already holding
+ * the zilog's "zl_issuer_lock" or "zl_lock"; see the functions
+ * zil_process_commit_list() and zil_lwb_flush_vdevs_done() as examples
+ * of this. Thus, one must be careful not to acquire the
+ * "zl_issuer_lock" or "zl_lock" when already holding the "zcw_lock";
+ * e.g. see the zil_commit_waiter_timeout() function.
+ */
+typedef struct zil_commit_waiter {
+ kcondvar_t zcw_cv; /* signalled when "done" */
+ kmutex_t zcw_lock; /* protects fields of this struct */
+ list_node_t zcw_node; /* linkage in lwb_t:lwb_waiter list */
+ lwb_t *zcw_lwb; /* back pointer to lwb when linked */
+ boolean_t zcw_done; /* B_TRUE when "done", else B_FALSE */
+ int zcw_zio_error; /* contains the zio io_error value */
+} zil_commit_waiter_t;
+
+/*
+ * Intent log transaction lists
+ */
+typedef struct itxs {
+ list_t i_sync_list; /* list of synchronous itxs */
+ avl_tree_t i_async_tree; /* tree of foids for async itxs */
+} itxs_t;
+
+typedef struct itxg {
+ kmutex_t itxg_lock; /* lock for this structure */
+ uint64_t itxg_txg; /* txg for this chain */
+ itxs_t *itxg_itxs; /* sync and async itxs */
+} itxg_t;
+
+/* for async nodes we build up an AVL tree of lists of async itxs per file */
+typedef struct itx_async_node {
+ uint64_t ia_foid; /* file object id */
+ list_t ia_list; /* list of async itxs for this foid */
+ avl_node_t ia_node; /* AVL tree linkage */
+} itx_async_node_t;
+
+/*
+ * Vdev flushing: during a zil_commit(), we build up an AVL tree of the vdevs
+ * we've touched so we know which ones need a write cache flush at the end.
+ */
+typedef struct zil_vdev_node {
+ uint64_t zv_vdev; /* vdev to be flushed */
+ avl_node_t zv_node; /* AVL tree linkage */
+} zil_vdev_node_t;
+
+#define ZIL_PREV_BLKS 16
+
+/*
+ * Stable storage intent log management structure. One per dataset.
+ */
+struct zilog {
+ kmutex_t zl_lock; /* protects most zilog_t fields */
+ struct dsl_pool *zl_dmu_pool; /* DSL pool */
+ spa_t *zl_spa; /* handle for read/write log */
+ const zil_header_t *zl_header; /* log header buffer */
+ objset_t *zl_os; /* object set we're logging */
+ zil_get_data_t *zl_get_data; /* callback to get object content */
+ lwb_t *zl_last_lwb_opened; /* most recent lwb opened */
+ hrtime_t zl_last_lwb_latency; /* zio latency of last lwb done */
+ uint64_t zl_lr_seq; /* on-disk log record sequence number */
+ uint64_t zl_commit_lr_seq; /* last committed on-disk lr seq */
+ uint64_t zl_destroy_txg; /* txg of last zil_destroy() */
+ uint64_t zl_replayed_seq[TXG_SIZE]; /* last replayed rec seq */
+ uint64_t zl_replaying_seq; /* current replay seq number */
+ uint32_t zl_suspend; /* log suspend count */
+ kcondvar_t zl_cv_suspend; /* log suspend completion */
+ uint8_t zl_suspending; /* log is currently suspending */
+ uint8_t zl_keep_first; /* keep first log block in destroy */
+ uint8_t zl_replay; /* replaying records while set */
+ uint8_t zl_stop_sync; /* for debugging */
+ kmutex_t zl_issuer_lock; /* single writer, per ZIL, at a time */
+ uint8_t zl_logbias; /* latency or throughput */
+ uint8_t zl_sync; /* synchronous or asynchronous */
+ int zl_parse_error; /* last zil_parse() error */
+ uint64_t zl_parse_blk_seq; /* highest blk seq on last parse */
+ uint64_t zl_parse_lr_seq; /* highest lr seq on last parse */
+ uint64_t zl_parse_blk_count; /* number of blocks parsed */
+ uint64_t zl_parse_lr_count; /* number of log records parsed */
+ itxg_t zl_itxg[TXG_SIZE]; /* intent log txg chains */
+ list_t zl_itx_commit_list; /* itx list to be committed */
+ uint64_t zl_cur_used; /* current commit log size used */
+ list_t zl_lwb_list; /* in-flight log write list */
+ avl_tree_t zl_bp_tree; /* track bps during log parse */
+ clock_t zl_replay_time; /* lbolt of when replay started */
+ uint64_t zl_replay_blks; /* number of log blocks replayed */
+ zil_header_t zl_old_header; /* debugging aid */
+ uint_t zl_prev_blks[ZIL_PREV_BLKS]; /* size - sector rounded */
+ uint_t zl_prev_rotor; /* rotor for zl_prev[] */
+ txg_node_t zl_dirty_link; /* protected by dp_dirty_zilogs list */
+ uint64_t zl_dirty_max_txg; /* highest txg used to dirty zilog */
+ /*
+ * Max block size for this ZIL. Note that this can not be changed
+ * while the ZIL is in use because consumers (ZPL/zvol) need to take
+ * this into account when deciding between WR_COPIED and WR_NEED_COPY
+ * (see zil_max_copied_data()).
+ */
+ uint64_t zl_max_block_size;
+};
+
+typedef struct zil_bp_node {
+ dva_t zn_dva;
+ avl_node_t zn_node;
+} zil_bp_node_t;
+
+#ifdef __cplusplus
+}
+#endif
+
+#endif /* _SYS_ZIL_IMPL_H */
diff --git a/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/zio.h b/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/zio.h
new file mode 100644
index 000000000000..99aecb67069b
--- /dev/null
+++ b/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/zio.h
@@ -0,0 +1,675 @@
+/*
+ * CDDL HEADER START
+ *
+ * The contents of this file are subject to the terms of the
+ * Common Development and Distribution License (the "License").
+ * You may not use this file except in compliance with the License.
+ *
+ * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
+ * or http://www.opensolaris.org/os/licensing.
+ * See the License for the specific language governing permissions
+ * and limitations under the License.
+ *
+ * When distributing Covered Code, include this CDDL HEADER in each
+ * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
+ * If applicable, add the following below this CDDL HEADER, with the
+ * fields enclosed by brackets "[]" replaced with your own identifying
+ * information: Portions Copyright [yyyy] [name of copyright owner]
+ *
+ * CDDL HEADER END
+ */
+
+/*
+ * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved.
+ * Copyright 2011 Nexenta Systems, Inc. All rights reserved.
+ * Copyright (c) 2012, 2018 by Delphix. All rights reserved.
+ * Copyright (c) 2013 by Saso Kiselkov. All rights reserved.
+ * Copyright 2016 Toomas Soome <tsoome@me.com>
+ */
+
+#ifndef _ZIO_H
+#define _ZIO_H
+
+#include <sys/zio_priority.h>
+#include <sys/zfs_context.h>
+#include <sys/spa.h>
+#include <sys/txg.h>
+#include <sys/avl.h>
+#include <sys/kstat.h>
+#include <sys/fs/zfs.h>
+#include <sys/zio_impl.h>
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+/*
+ * Embedded checksum
+ */
+#define ZEC_MAGIC 0x210da7ab10c7a11ULL
+
+typedef struct zio_eck {
+ uint64_t zec_magic; /* for validation, endianness */
+ zio_cksum_t zec_cksum; /* 256-bit checksum */
+} zio_eck_t;
+
+/*
+ * Gang block headers are self-checksumming and contain an array
+ * of block pointers.
+ */
+#define SPA_GANGBLOCKSIZE SPA_MINBLOCKSIZE
+#define SPA_GBH_NBLKPTRS ((SPA_GANGBLOCKSIZE - \
+ sizeof (zio_eck_t)) / sizeof (blkptr_t))
+#define SPA_GBH_FILLER ((SPA_GANGBLOCKSIZE - \
+ sizeof (zio_eck_t) - \
+ (SPA_GBH_NBLKPTRS * sizeof (blkptr_t))) /\
+ sizeof (uint64_t))
+
+typedef struct zio_gbh {
+ blkptr_t zg_blkptr[SPA_GBH_NBLKPTRS];
+ uint64_t zg_filler[SPA_GBH_FILLER];
+ zio_eck_t zg_tail;
+} zio_gbh_phys_t;
+
+enum zio_checksum {
+ ZIO_CHECKSUM_INHERIT = 0,
+ ZIO_CHECKSUM_ON,
+ ZIO_CHECKSUM_OFF,
+ ZIO_CHECKSUM_LABEL,
+ ZIO_CHECKSUM_GANG_HEADER,
+ ZIO_CHECKSUM_ZILOG,
+ ZIO_CHECKSUM_FLETCHER_2,
+ ZIO_CHECKSUM_FLETCHER_4,
+ ZIO_CHECKSUM_SHA256,
+ ZIO_CHECKSUM_ZILOG2,
+ ZIO_CHECKSUM_NOPARITY,
+ ZIO_CHECKSUM_SHA512,
+ ZIO_CHECKSUM_SKEIN,
+#ifdef illumos
+ ZIO_CHECKSUM_EDONR,
+#endif
+ ZIO_CHECKSUM_FUNCTIONS
+};
+
+/*
+ * The number of "legacy" checksum functions which can be set on
+ * individual objects.
+ */
+#define ZIO_CHECKSUM_LEGACY_FUNCTIONS ZIO_CHECKSUM_ZILOG2
+
+#define ZIO_CHECKSUM_ON_VALUE ZIO_CHECKSUM_FLETCHER_4
+#define ZIO_CHECKSUM_DEFAULT ZIO_CHECKSUM_ON
+
+#define ZIO_CHECKSUM_MASK 0xffULL
+#define ZIO_CHECKSUM_VERIFY (1 << 8)
+
+#define ZIO_DEDUPCHECKSUM ZIO_CHECKSUM_SHA256
+#define ZIO_DEDUPDITTO_MIN 100
+
+/*
+ * The number of "legacy" compression functions which can be set on individual
+ * objects.
+ */
+#define ZIO_COMPRESS_LEGACY_FUNCTIONS ZIO_COMPRESS_LZ4
+
+/*
+ * The meaning of "compress = on" selected by the compression features enabled
+ * on a given pool.
+ */
+#define ZIO_COMPRESS_LEGACY_ON_VALUE ZIO_COMPRESS_LZJB
+#define ZIO_COMPRESS_LZ4_ON_VALUE ZIO_COMPRESS_LZ4
+
+#define ZIO_COMPRESS_DEFAULT ZIO_COMPRESS_OFF
+
+#define BOOTFS_COMPRESS_VALID(compress) \
+ ((compress) == ZIO_COMPRESS_LZJB || \
+ (compress) == ZIO_COMPRESS_LZ4 || \
+ (compress) == ZIO_COMPRESS_ON || \
+ (compress) == ZIO_COMPRESS_OFF)
+
+#define ZIO_FAILURE_MODE_WAIT 0
+#define ZIO_FAILURE_MODE_CONTINUE 1
+#define ZIO_FAILURE_MODE_PANIC 2
+
+typedef enum zio_suspend_reason {
+ ZIO_SUSPEND_NONE = 0,
+ ZIO_SUSPEND_IOERR,
+ ZIO_SUSPEND_MMP,
+} zio_suspend_reason_t;
+
+enum zio_flag {
+ /*
+ * Flags inherited by gang, ddt, and vdev children,
+ * and that must be equal for two zios to aggregate
+ */
+ ZIO_FLAG_DONT_AGGREGATE = 1 << 0,
+ ZIO_FLAG_IO_REPAIR = 1 << 1,
+ ZIO_FLAG_SELF_HEAL = 1 << 2,
+ ZIO_FLAG_RESILVER = 1 << 3,
+ ZIO_FLAG_SCRUB = 1 << 4,
+ ZIO_FLAG_SCAN_THREAD = 1 << 5,
+ ZIO_FLAG_PHYSICAL = 1 << 6,
+
+#define ZIO_FLAG_AGG_INHERIT (ZIO_FLAG_CANFAIL - 1)
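+
+/*
+ * Note: ZIO_FLAG_CANFAIL is the first flag of the next group, so
+ * subtracting 1 yields a mask covering exactly the flags above it;
+ * the other *_INHERIT masks below use the same trick.
+ */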
+
+ /*
+ * Flags inherited by ddt, gang, and vdev children.
+ */
+ ZIO_FLAG_CANFAIL = 1 << 7, /* must be first for INHERIT */
+ ZIO_FLAG_SPECULATIVE = 1 << 8,
+ ZIO_FLAG_CONFIG_WRITER = 1 << 9,
+ ZIO_FLAG_DONT_RETRY = 1 << 10,
+ ZIO_FLAG_DONT_CACHE = 1 << 11,
+ ZIO_FLAG_NODATA = 1 << 12,
+ ZIO_FLAG_INDUCE_DAMAGE = 1 << 13,
+ ZIO_FLAG_IO_ALLOCATING = 1 << 14,
+
+#define ZIO_FLAG_DDT_INHERIT (ZIO_FLAG_IO_RETRY - 1)
+#define ZIO_FLAG_GANG_INHERIT (ZIO_FLAG_IO_RETRY - 1)
+
+ /*
+ * Flags inherited by vdev children.
+ */
+ ZIO_FLAG_IO_RETRY = 1 << 15, /* must be first for INHERIT */
+ ZIO_FLAG_PROBE = 1 << 16,
+ ZIO_FLAG_TRYHARD = 1 << 17,
+ ZIO_FLAG_OPTIONAL = 1 << 18,
+
+#define ZIO_FLAG_VDEV_INHERIT (ZIO_FLAG_DONT_QUEUE - 1)
+
+ /*
+ * Flags not inherited by any children.
+ */
+ ZIO_FLAG_DONT_QUEUE = 1 << 19, /* must be first for INHERIT */
+ ZIO_FLAG_DONT_PROPAGATE = 1 << 20,
+ ZIO_FLAG_IO_BYPASS = 1 << 21,
+ ZIO_FLAG_IO_REWRITE = 1 << 22,
+ ZIO_FLAG_RAW = 1 << 23,
+ ZIO_FLAG_GANG_CHILD = 1 << 24,
+ ZIO_FLAG_DDT_CHILD = 1 << 25,
+ ZIO_FLAG_GODFATHER = 1 << 26,
+ ZIO_FLAG_NOPWRITE = 1 << 27,
+ ZIO_FLAG_REEXECUTED = 1 << 28,
+ ZIO_FLAG_DELEGATED = 1 << 29,
+};
+
+#define ZIO_FLAG_MUSTSUCCEED 0
+
+#define ZIO_DDT_CHILD_FLAGS(zio) \
+ (((zio)->io_flags & ZIO_FLAG_DDT_INHERIT) | \
+ ZIO_FLAG_DDT_CHILD | ZIO_FLAG_CANFAIL)
+
+#define ZIO_GANG_CHILD_FLAGS(zio) \
+ (((zio)->io_flags & ZIO_FLAG_GANG_INHERIT) | \
+ ZIO_FLAG_GANG_CHILD | ZIO_FLAG_CANFAIL)
+
+#define ZIO_VDEV_CHILD_FLAGS(zio) \
+ (((zio)->io_flags & ZIO_FLAG_VDEV_INHERIT) | \
+ ZIO_FLAG_DONT_PROPAGATE | ZIO_FLAG_CANFAIL)
+
+#define ZIO_CHILD_BIT(x) (1 << (x))
+#define ZIO_CHILD_BIT_IS_SET(val, x) ((val) & (1 << (x)))
+
+enum zio_child {
+ ZIO_CHILD_VDEV = 0,
+ ZIO_CHILD_GANG,
+ ZIO_CHILD_DDT,
+ ZIO_CHILD_LOGICAL,
+ ZIO_CHILD_TYPES
+};
+
+#define ZIO_CHILD_VDEV_BIT ZIO_CHILD_BIT(ZIO_CHILD_VDEV)
+#define ZIO_CHILD_GANG_BIT ZIO_CHILD_BIT(ZIO_CHILD_GANG)
+#define ZIO_CHILD_DDT_BIT ZIO_CHILD_BIT(ZIO_CHILD_DDT)
+#define ZIO_CHILD_LOGICAL_BIT ZIO_CHILD_BIT(ZIO_CHILD_LOGICAL)
+#define ZIO_CHILD_ALL_BITS \
+ (ZIO_CHILD_VDEV_BIT | ZIO_CHILD_GANG_BIT | \
+ ZIO_CHILD_DDT_BIT | ZIO_CHILD_LOGICAL_BIT)
+
+enum zio_wait_type {
+ ZIO_WAIT_READY = 0,
+ ZIO_WAIT_DONE,
+ ZIO_WAIT_TYPES
+};
+
+/*
+ * These are bespoke errnos used in ZFS. We map them to their closest FreeBSD
+ * equivalents. This gives us more useful error messages from strerror(3).
+ */
+#define ECKSUM EINTEGRITY
+#define EFRAGS ENOSPC
+
+typedef void zio_done_func_t(zio_t *zio);
+
+extern boolean_t zio_dva_throttle_enabled;
+extern const char *zio_type_name[ZIO_TYPES];
+
+/*
+ * A bookmark is a four-tuple <objset, object, level, blkid> that uniquely
+ * identifies any block in the pool. By convention, the meta-objset (MOS)
+ * is objset 0, and the meta-dnode is object 0. This covers all blocks
+ * except root blocks and ZIL blocks, which are defined as follows:
+ *
+ * Root blocks (objset_phys_t) are object 0, level -1: <objset, 0, -1, 0>.
+ * ZIL blocks are bookmarked <objset, 0, -2, blkid == ZIL sequence number>.
+ * dmu_sync()ed ZIL data blocks are bookmarked <objset, object, -2, blkid>.
+ * dnode visit bookmarks are <objset, object id of dnode, -3, 0>.
+ *
+ * Note: this structure is called a bookmark because its original purpose
+ * was to remember where to resume a pool-wide traverse.
+ *
+ * Note: this structure is passed between userland and the kernel, and is
+ * stored on disk (by virtue of being incorporated into other on-disk
+ * structures, e.g. dsl_scan_phys_t).
+ */
+typedef struct zbookmark_phys {
+ uint64_t zb_objset;
+ uint64_t zb_object;
+ int64_t zb_level;
+ uint64_t zb_blkid;
+} zbookmark_phys_t;
+
+#define SET_BOOKMARK(zb, objset, object, level, blkid) \
+{ \
+ (zb)->zb_objset = objset; \
+ (zb)->zb_object = object; \
+ (zb)->zb_level = level; \
+ (zb)->zb_blkid = blkid; \
+}
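+
+/*
+ * A usage sketch of SET_BOOKMARK() (the objset, object, level, and
+ * blkid values are illustrative):
+ */
+#if 0
+	zbookmark_phys_t zb;
+
+	/* level-0 block 7 of object 42 in objset 5 */
+	SET_BOOKMARK(&zb, 5ULL, 42ULL, 0, 7ULL);
+#endif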
+
+#define ZB_DESTROYED_OBJSET (-1ULL)
+
+#define ZB_ROOT_OBJECT (0ULL)
+#define ZB_ROOT_LEVEL (-1LL)
+#define ZB_ROOT_BLKID (0ULL)
+
+#define ZB_ZIL_OBJECT (0ULL)
+#define ZB_ZIL_LEVEL (-2LL)
+
+#define ZB_DNODE_LEVEL (-3LL)
+#define ZB_DNODE_BLKID (0ULL)
+
+#define ZB_IS_ZERO(zb) \
+ ((zb)->zb_objset == 0 && (zb)->zb_object == 0 && \
+ (zb)->zb_level == 0 && (zb)->zb_blkid == 0)
+#define ZB_IS_ROOT(zb) \
+ ((zb)->zb_object == ZB_ROOT_OBJECT && \
+ (zb)->zb_level == ZB_ROOT_LEVEL && \
+ (zb)->zb_blkid == ZB_ROOT_BLKID)
+
+typedef struct zio_prop {
+ enum zio_checksum zp_checksum;
+ enum zio_compress zp_compress;
+ dmu_object_type_t zp_type;
+ uint8_t zp_level;
+ uint8_t zp_copies;
+ boolean_t zp_dedup;
+ boolean_t zp_dedup_verify;
+ boolean_t zp_nopwrite;
+ uint32_t zp_zpl_smallblk;
+} zio_prop_t;
+
+typedef struct zio_cksum_report zio_cksum_report_t;
+
+typedef void zio_cksum_finish_f(zio_cksum_report_t *rep,
+ const void *good_data);
+typedef void zio_cksum_free_f(void *cbdata, size_t size);
+
+struct zio_bad_cksum; /* defined in zio_checksum.h */
+struct dnode_phys;
+struct abd;
+
+struct zio_cksum_report {
+ struct zio_cksum_report *zcr_next;
+ nvlist_t *zcr_ereport;
+ nvlist_t *zcr_detector;
+ void *zcr_cbdata;
+ size_t zcr_cbinfo; /* passed to zcr_free() */
+ uint64_t zcr_align;
+ uint64_t zcr_length;
+ zio_cksum_finish_f *zcr_finish;
+ zio_cksum_free_f *zcr_free;
+
+ /* internal use only */
+ struct zio_bad_cksum *zcr_ckinfo; /* information from failure */
+};
+
+typedef void zio_vsd_cksum_report_f(zio_t *zio, zio_cksum_report_t *zcr,
+ void *arg);
+
+zio_vsd_cksum_report_f zio_vsd_default_cksum_report;
+
+typedef struct zio_vsd_ops {
+ zio_done_func_t *vsd_free;
+ zio_vsd_cksum_report_f *vsd_cksum_report;
+} zio_vsd_ops_t;
+
+typedef struct zio_gang_node {
+ zio_gbh_phys_t *gn_gbh;
+ struct zio_gang_node *gn_child[SPA_GBH_NBLKPTRS];
+} zio_gang_node_t;
+
+typedef zio_t *zio_gang_issue_func_t(zio_t *zio, blkptr_t *bp,
+ zio_gang_node_t *gn, struct abd *data, uint64_t offset);
+
+typedef void zio_transform_func_t(zio_t *zio, struct abd *data, uint64_t size);
+
+typedef struct zio_transform {
+ struct abd *zt_orig_abd;
+ uint64_t zt_orig_size;
+ uint64_t zt_bufsize;
+ zio_transform_func_t *zt_transform;
+ struct zio_transform *zt_next;
+} zio_transform_t;
+
+typedef zio_t *zio_pipe_stage_t(zio_t *zio);
+
+/*
+ * The io_reexecute flags are distinct from io_flags because the child must
+ * be able to propagate them to the parent. The normal io_flags are local
+ * to the zio, not protected by any lock, and not modifiable by children;
+ * the reexecute flags are protected by io_lock, modifiable by children,
+ * and always propagated -- even when ZIO_FLAG_DONT_PROPAGATE is set.
+ */
+#define ZIO_REEXECUTE_NOW 0x01
+#define ZIO_REEXECUTE_SUSPEND 0x02
+
+typedef struct zio_alloc_list {
+ list_t zal_list;
+ uint64_t zal_size;
+} zio_alloc_list_t;
+
+typedef struct zio_link {
+ zio_t *zl_parent;
+ zio_t *zl_child;
+ list_node_t zl_parent_node;
+ list_node_t zl_child_node;
+} zio_link_t;
+
+/*
+ * Used for TRIM kstat.
+ */
+typedef struct zio_trim_stats {
+ /*
+ * Number of bytes successfully TRIMmed.
+ */
+ kstat_named_t bytes;
+
+ /*
+ * Number of successful TRIM requests.
+ */
+ kstat_named_t success;
+
+ /*
+ * Number of TRIM requests that failed because TRIM is not
+ * supported.
+ */
+ kstat_named_t unsupported;
+
+ /*
+ * Number of TRIM requests that failed for other reasons.
+ */
+ kstat_named_t failed;
+} zio_trim_stats_t;
+
+extern zio_trim_stats_t zio_trim_stats;
+
+#define ZIO_TRIM_STAT_INCR(stat, val) \
+ atomic_add_64(&zio_trim_stats.stat.value.ui64, (val));
+#define ZIO_TRIM_STAT_BUMP(stat) \
+ ZIO_TRIM_STAT_INCR(stat, 1);
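+
+/*
+ * Usage sketch: account a successful 64 KB TRIM request (the values
+ * are illustrative):
+ */
+#if 0
+	ZIO_TRIM_STAT_INCR(bytes, 65536);
+	ZIO_TRIM_STAT_BUMP(success);
+#endif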
+
+struct zio {
+ /* Core information about this I/O */
+ zbookmark_phys_t io_bookmark;
+ zio_prop_t io_prop;
+ zio_type_t io_type;
+ enum zio_child io_child_type;
+ int io_cmd;
+ zio_priority_t io_priority;
+ uint8_t io_reexecute;
+ uint8_t io_state[ZIO_WAIT_TYPES];
+ uint64_t io_txg;
+ spa_t *io_spa;
+ blkptr_t *io_bp;
+ blkptr_t *io_bp_override;
+ blkptr_t io_bp_copy;
+ list_t io_parent_list;
+ list_t io_child_list;
+ zio_t *io_logical;
+ zio_transform_t *io_transform_stack;
+
+ /* Callback info */
+ zio_done_func_t *io_ready;
+ zio_done_func_t *io_children_ready;
+ zio_done_func_t *io_physdone;
+ zio_done_func_t *io_done;
+ void *io_private;
+ int64_t io_prev_space_delta; /* DMU private */
+ blkptr_t io_bp_orig;
+
+ /* Data represented by this I/O */
+ struct abd *io_abd;
+ struct abd *io_orig_abd;
+ uint64_t io_size;
+ uint64_t io_orig_size;
+ /* io_lsize != io_orig_size iff this is a raw write */
+ uint64_t io_lsize;
+
+ /* Stuff for the vdev stack */
+ vdev_t *io_vd;
+ void *io_vsd;
+ const zio_vsd_ops_t *io_vsd_ops;
+ metaslab_class_t *io_metaslab_class; /* dva throttle class */
+
+ uint64_t io_offset;
+ hrtime_t io_timestamp;
+ hrtime_t io_queued_timestamp;
+ hrtime_t io_target_timestamp;
+ avl_node_t io_queue_node;
+ avl_node_t io_offset_node;
+ avl_node_t io_alloc_node;
+ zio_alloc_list_t io_alloc_list;
+
+#ifdef __FreeBSD__
+ struct bio *io_bio;
+#ifdef _KERNEL
+ struct callout io_timer;
+#endif
+#endif
+
+ /* Internal pipeline state */
+ enum zio_flag io_flags;
+ enum zio_stage io_stage;
+ enum zio_stage io_pipeline;
+ enum zio_flag io_orig_flags;
+ enum zio_stage io_orig_stage;
+ enum zio_stage io_orig_pipeline;
+ enum zio_stage io_pipeline_trace;
+ int io_error;
+ int io_child_error[ZIO_CHILD_TYPES];
+ uint64_t io_children[ZIO_CHILD_TYPES][ZIO_WAIT_TYPES];
+ uint64_t io_child_count;
+ uint64_t io_phys_children;
+ uint64_t io_parent_count;
+ uint64_t *io_stall;
+ zio_t *io_gang_leader;
+ zio_gang_node_t *io_gang_tree;
+ void *io_executor;
+ void *io_waiter;
+ kmutex_t io_lock;
+ kcondvar_t io_cv;
+ int io_allocator;
+
+ /* FMA state */
+ zio_cksum_report_t *io_cksum_report;
+ uint64_t io_ena;
+
+ /* Taskq dispatching state */
+ taskq_ent_t io_tqent;
+
+ avl_node_t io_trim_node;
+ list_node_t io_trim_link;
+};
+
+extern int zio_bookmark_compare(const void *, const void *);
+
+extern zio_t *zio_null(zio_t *pio, spa_t *spa, vdev_t *vd,
+ zio_done_func_t *done, void *priv, enum zio_flag flags);
+
+extern zio_t *zio_root(spa_t *spa,
+ zio_done_func_t *done, void *priv, enum zio_flag flags);
+
+extern zio_t *zio_read(zio_t *pio, spa_t *spa, const blkptr_t *bp,
+ struct abd *data, uint64_t lsize, zio_done_func_t *done, void *priv,
+ zio_priority_t priority, enum zio_flag flags, const zbookmark_phys_t *zb);
+
+extern zio_t *zio_write(zio_t *pio, spa_t *spa, uint64_t txg, blkptr_t *bp,
+ struct abd *data, uint64_t size, uint64_t psize, const zio_prop_t *zp,
+ zio_done_func_t *ready, zio_done_func_t *children_ready,
+ zio_done_func_t *physdone, zio_done_func_t *done,
+ void *priv, zio_priority_t priority, enum zio_flag flags,
+ const zbookmark_phys_t *zb);
+
+extern zio_t *zio_rewrite(zio_t *pio, spa_t *spa, uint64_t txg, blkptr_t *bp,
+ struct abd *data, uint64_t size, zio_done_func_t *done, void *priv,
+ zio_priority_t priority, enum zio_flag flags, zbookmark_phys_t *zb);
+
+extern void zio_write_override(zio_t *zio, blkptr_t *bp, int copies,
+ boolean_t nopwrite);
+
+extern void zio_free(spa_t *spa, uint64_t txg, const blkptr_t *bp);
+
+extern zio_t *zio_claim(zio_t *pio, spa_t *spa, uint64_t txg,
+ const blkptr_t *bp,
+ zio_done_func_t *done, void *priv, enum zio_flag flags);
+
+extern zio_t *zio_ioctl(zio_t *pio, spa_t *spa, vdev_t *vd, int cmd,
+ uint64_t offset, uint64_t size, zio_done_func_t *done, void *priv,
+ zio_priority_t priority, enum zio_flag flags);
+
+extern zio_t *zio_read_phys(zio_t *pio, vdev_t *vd, uint64_t offset,
+ uint64_t size, struct abd *data, int checksum,
+ zio_done_func_t *done, void *priv, zio_priority_t priority,
+ enum zio_flag flags, boolean_t labels);
+
+extern zio_t *zio_write_phys(zio_t *pio, vdev_t *vd, uint64_t offset,
+ uint64_t size, struct abd *data, int checksum,
+ zio_done_func_t *done, void *priv, zio_priority_t priority,
+ enum zio_flag flags, boolean_t labels);
+
+extern zio_t *zio_free_sync(zio_t *pio, spa_t *spa, uint64_t txg,
+ const blkptr_t *bp, uint64_t size, enum zio_flag flags);
+
+extern int zio_alloc_zil(spa_t *spa, uint64_t objset, uint64_t txg,
+ blkptr_t *new_bp, blkptr_t *old_bp, uint64_t size, boolean_t *slog);
+extern void zio_flush(zio_t *zio, vdev_t *vd);
+extern zio_t *zio_trim(zio_t *zio, spa_t *spa, vdev_t *vd, uint64_t offset,
+ uint64_t size);
+extern void zio_shrink(zio_t *zio, uint64_t size);
+
+extern int zio_wait(zio_t *zio);
+extern void zio_nowait(zio_t *zio);
+extern void zio_execute(zio_t *zio);
+extern void zio_interrupt(zio_t *zio);
+extern void zio_delay_init(zio_t *zio);
+extern void zio_delay_interrupt(zio_t *zio);
+
+extern zio_t *zio_walk_parents(zio_t *cio, zio_link_t **);
+extern zio_t *zio_walk_children(zio_t *pio, zio_link_t **);
+extern zio_t *zio_unique_parent(zio_t *cio);
+extern void zio_add_child(zio_t *pio, zio_t *cio);
+
+extern void *zio_buf_alloc(size_t size);
+extern void zio_buf_free(void *buf, size_t size);
+extern void *zio_data_buf_alloc(size_t size);
+extern void zio_data_buf_free(void *buf, size_t size);
+
+extern void zio_push_transform(zio_t *zio, struct abd *abd, uint64_t size,
+ uint64_t bufsize, zio_transform_func_t *transform);
+extern void zio_pop_transforms(zio_t *zio);
+
+extern void zio_resubmit_stage_async(void *);
+
+extern zio_t *zio_vdev_child_io(zio_t *zio, blkptr_t *bp, vdev_t *vd,
+ uint64_t offset, struct abd *data, uint64_t size, int type,
+ zio_priority_t priority, enum zio_flag flags,
+ zio_done_func_t *done, void *priv);
+
+extern zio_t *zio_vdev_delegated_io(vdev_t *vd, uint64_t offset,
+ struct abd *data, uint64_t size, zio_type_t type, zio_priority_t priority,
+ enum zio_flag flags, zio_done_func_t *done, void *priv);
+
+extern void zio_vdev_io_bypass(zio_t *zio);
+extern void zio_vdev_io_reissue(zio_t *zio);
+extern void zio_vdev_io_redone(zio_t *zio);
+
+extern void zio_change_priority(zio_t *pio, zio_priority_t priority);
+
+extern void zio_checksum_verified(zio_t *zio);
+extern int zio_worst_error(int e1, int e2);
+
+extern enum zio_checksum zio_checksum_select(enum zio_checksum child,
+ enum zio_checksum parent);
+extern enum zio_checksum zio_checksum_dedup_select(spa_t *spa,
+ enum zio_checksum child, enum zio_checksum parent);
+extern enum zio_compress zio_compress_select(spa_t *spa,
+ enum zio_compress child, enum zio_compress parent);
+
+extern void zio_suspend(spa_t *spa, zio_t *zio, zio_suspend_reason_t);
+extern int zio_resume(spa_t *spa);
+extern void zio_resume_wait(spa_t *spa);
+
+/*
+ * Initial setup and teardown.
+ */
+extern void zio_init(void);
+extern void zio_fini(void);
+
+/*
+ * Fault injection
+ */
+struct zinject_record;
+extern uint32_t zio_injection_enabled;
+extern int zio_inject_fault(char *name, int flags, int *id,
+ struct zinject_record *record);
+extern int zio_inject_list_next(int *id, char *name, size_t buflen,
+ struct zinject_record *record);
+extern int zio_clear_fault(int id);
+extern void zio_handle_panic_injection(spa_t *spa, char *tag, uint64_t type);
+extern int zio_handle_fault_injection(zio_t *zio, int error);
+extern int zio_handle_device_injection(vdev_t *vd, zio_t *zio, int error);
+extern int zio_handle_label_injection(zio_t *zio, int error);
+extern void zio_handle_ignored_writes(zio_t *zio);
+extern hrtime_t zio_handle_io_delay(zio_t *zio);
+
+/*
+ * Checksum ereport functions
+ */
+extern void zfs_ereport_start_checksum(spa_t *spa, vdev_t *vd, struct zio *zio,
+ uint64_t offset, uint64_t length, void *arg, struct zio_bad_cksum *info);
+extern void zfs_ereport_finish_checksum(zio_cksum_report_t *report,
+ const void *good_data, const void *bad_data, boolean_t drop_if_identical);
+
+extern void zfs_ereport_send_interim_checksum(zio_cksum_report_t *report);
+extern void zfs_ereport_free_checksum(zio_cksum_report_t *report);
+
+/* If we have the good data in hand, this function can be used */
+extern void zfs_ereport_post_checksum(spa_t *spa, vdev_t *vd,
+ struct zio *zio, uint64_t offset, uint64_t length,
+ const void *good_data, const void *bad_data, struct zio_bad_cksum *info);
+
+/* Called from spa_sync(), but primarily an injection handler */
+extern void spa_handle_ignored_writes(spa_t *spa);
+
+/* zbookmark_phys functions */
+boolean_t zbookmark_subtree_completed(const struct dnode_phys *dnp,
+ const zbookmark_phys_t *subtree_root, const zbookmark_phys_t *last_block);
+int zbookmark_compare(uint16_t dbss1, uint8_t ibs1, uint16_t dbss2,
+ uint8_t ibs2, const zbookmark_phys_t *zb1, const zbookmark_phys_t *zb2);
+
+#ifdef __cplusplus
+}
+#endif
+
+#endif /* _ZIO_H */
diff --git a/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/zio_checksum.h b/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/zio_checksum.h
new file mode 100644
index 000000000000..782df534c9a0
--- /dev/null
+++ b/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/zio_checksum.h
@@ -0,0 +1,119 @@
+/*
+ * CDDL HEADER START
+ *
+ * The contents of this file are subject to the terms of the
+ * Common Development and Distribution License (the "License").
+ * You may not use this file except in compliance with the License.
+ *
+ * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
+ * or http://www.opensolaris.org/os/licensing.
+ * See the License for the specific language governing permissions
+ * and limitations under the License.
+ *
+ * When distributing Covered Code, include this CDDL HEADER in each
+ * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
+ * If applicable, add the following below this CDDL HEADER, with the
+ * fields enclosed by brackets "[]" replaced with your own identifying
+ * information: Portions Copyright [yyyy] [name of copyright owner]
+ *
+ * CDDL HEADER END
+ */
+/*
+ * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved.
+ * Copyright (c) 2014, 2016 by Delphix. All rights reserved.
+ * Copyright Saso Kiselkov 2013, All rights reserved.
+ */
+
+#ifndef _SYS_ZIO_CHECKSUM_H
+#define _SYS_ZIO_CHECKSUM_H
+
+#include <sys/zio.h>
+#include <zfeature_common.h>
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+struct abd;
+
+/*
+ * Signature for checksum functions.
+ */
+typedef void zio_checksum_t(struct abd *, uint64_t size,
+ const void *ctx_template, zio_cksum_t *zcp);
+typedef void *zio_checksum_tmpl_init_t(const zio_cksum_salt_t *salt);
+typedef void zio_checksum_tmpl_free_t(void *ctx_template);
+
+typedef enum zio_checksum_flags {
+ /* Strong enough for metadata? */
+ ZCHECKSUM_FLAG_METADATA = (1 << 1),
+ /* ZIO embedded checksum */
+ ZCHECKSUM_FLAG_EMBEDDED = (1 << 2),
+ /* Strong enough for dedup (without verification)? */
+ ZCHECKSUM_FLAG_DEDUP = (1 << 3),
+ /* Uses salt value */
+ ZCHECKSUM_FLAG_SALTED = (1 << 4),
+ /* Strong enough for nopwrite? */
+ ZCHECKSUM_FLAG_NOPWRITE = (1 << 5)
+} zio_checksum_flags_t;
+
+/*
+ * Information about each checksum function.
+ */
+typedef struct zio_checksum_info {
+ /* checksum function for each byteorder */
+ zio_checksum_t *ci_func[2];
+ zio_checksum_tmpl_init_t *ci_tmpl_init;
+ zio_checksum_tmpl_free_t *ci_tmpl_free;
+ zio_checksum_flags_t ci_flags;
+ char *ci_name; /* descriptive name */
+} zio_checksum_info_t;
+
+typedef struct zio_bad_cksum {
+ zio_cksum_t zbc_expected;
+ zio_cksum_t zbc_actual;
+ const char *zbc_checksum_name;
+ uint8_t zbc_byteswapped;
+ uint8_t zbc_injected;
+ uint8_t zbc_has_cksum; /* expected/actual valid */
+} zio_bad_cksum_t;
+
+extern zio_checksum_info_t zio_checksum_table[ZIO_CHECKSUM_FUNCTIONS];
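+
+/*
+ * A sketch of consulting the table above, e.g. to ask whether a given
+ * checksum is strong enough for dedup without verification:
+ */
+#if 0
+	zio_checksum_info_t *ci = &zio_checksum_table[ZIO_CHECKSUM_SHA256];
+	boolean_t dedup_ok = (ci->ci_flags & ZCHECKSUM_FLAG_DEDUP) != 0;
+#endif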
+
+/*
+ * Checksum routines.
+ */
+extern zio_checksum_t abd_checksum_SHA256;
+extern zio_checksum_t abd_checksum_SHA512_native;
+extern zio_checksum_t abd_checksum_SHA512_byteswap;
+
+/* Skein */
+extern zio_checksum_t abd_checksum_skein_native;
+extern zio_checksum_t abd_checksum_skein_byteswap;
+extern zio_checksum_tmpl_init_t abd_checksum_skein_tmpl_init;
+extern zio_checksum_tmpl_free_t abd_checksum_skein_tmpl_free;
+
+#ifdef illumos
+/* Edon-R */
+extern zio_checksum_t abd_checksum_edonr_native;
+extern zio_checksum_t abd_checksum_edonr_byteswap;
+extern zio_checksum_tmpl_init_t abd_checksum_edonr_tmpl_init;
+extern zio_checksum_tmpl_free_t abd_checksum_edonr_tmpl_free;
+#endif
+
+extern int zio_checksum_equal(spa_t *, blkptr_t *, enum zio_checksum,
+ void *, uint64_t, uint64_t, zio_bad_cksum_t *);
+extern void zio_checksum_compute(zio_t *, enum zio_checksum,
+ struct abd *, uint64_t);
+extern int zio_checksum_error_impl(spa_t *, blkptr_t *, enum zio_checksum,
+ struct abd *, uint64_t, uint64_t, zio_bad_cksum_t *);
+extern int zio_checksum_error(zio_t *zio, zio_bad_cksum_t *out);
+extern enum zio_checksum spa_dedup_checksum(spa_t *spa);
+extern void zio_checksum_templates_free(spa_t *spa);
+extern spa_feature_t zio_checksum_to_feature(enum zio_checksum cksum);
+
+#ifdef __cplusplus
+}
+#endif
+
+#endif /* _SYS_ZIO_CHECKSUM_H */
diff --git a/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/zio_compress.h b/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/zio_compress.h
new file mode 100644
index 000000000000..aab0282c45be
--- /dev/null
+++ b/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/zio_compress.h
@@ -0,0 +1,128 @@
+/*
+ * CDDL HEADER START
+ *
+ * The contents of this file are subject to the terms of the
+ * Common Development and Distribution License (the "License").
+ * You may not use this file except in compliance with the License.
+ *
+ * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
+ * or http://www.opensolaris.org/os/licensing.
+ * See the License for the specific language governing permissions
+ * and limitations under the License.
+ *
+ * When distributing Covered Code, include this CDDL HEADER in each
+ * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
+ * If applicable, add the following below this CDDL HEADER, with the
+ * fields enclosed by brackets "[]" replaced with your own identifying
+ * information: Portions Copyright [yyyy] [name of copyright owner]
+ *
+ * CDDL HEADER END
+ */
+
+/*
+ * Copyright 2009 Sun Microsystems, Inc. All rights reserved.
+ * Use is subject to license terms.
+ */
+/*
+ * Copyright (c) 2013 by Saso Kiselkov. All rights reserved.
+ * Copyright (c) 2015, 2016 by Delphix. All rights reserved.
+ */
+
+#ifndef _SYS_ZIO_COMPRESS_H
+#define _SYS_ZIO_COMPRESS_H
+
+#include <sys/abd.h>
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+enum zio_compress {
+ ZIO_COMPRESS_INHERIT = 0,
+ ZIO_COMPRESS_ON,
+ ZIO_COMPRESS_OFF,
+ ZIO_COMPRESS_LZJB,
+ ZIO_COMPRESS_EMPTY,
+ ZIO_COMPRESS_GZIP_1,
+ ZIO_COMPRESS_GZIP_2,
+ ZIO_COMPRESS_GZIP_3,
+ ZIO_COMPRESS_GZIP_4,
+ ZIO_COMPRESS_GZIP_5,
+ ZIO_COMPRESS_GZIP_6,
+ ZIO_COMPRESS_GZIP_7,
+ ZIO_COMPRESS_GZIP_8,
+ ZIO_COMPRESS_GZIP_9,
+ ZIO_COMPRESS_ZLE,
+ ZIO_COMPRESS_LZ4,
+ ZIO_COMPRESS_FUNCTIONS
+};
+
+/* Common signature for all zio compress functions. */
+typedef size_t zio_compress_func_t(void *src, void *dst,
+ size_t s_len, size_t d_len, int);
+/* Common signature for all zio decompress functions. */
+typedef int zio_decompress_func_t(void *src, void *dst,
+ size_t s_len, size_t d_len, int);
+/*
+ * Common signature for all zio decompress functions using an ABD as input.
+ * This is helpful if you have both compressed ARC and scatter ABDs enabled,
+ * but is not a requirement for all compression algorithms.
+ */
+typedef int zio_decompress_abd_func_t(abd_t *src, void *dst,
+ size_t s_len, size_t d_len, int);
+
+/*
+ * Information about each compression function.
+ */
+typedef struct zio_compress_info {
+ char *ci_name;
+ int ci_level;
+ zio_compress_func_t *ci_compress;
+ zio_decompress_func_t *ci_decompress;
+} zio_compress_info_t;
+
+extern zio_compress_info_t zio_compress_table[ZIO_COMPRESS_FUNCTIONS];
+
+/*
+ * Compression routines.
+ */
+extern size_t lzjb_compress(void *src, void *dst, size_t s_len, size_t d_len,
+ int level);
+extern int lzjb_decompress(void *src, void *dst, size_t s_len, size_t d_len,
+ int level);
+extern size_t gzip_compress(void *src, void *dst, size_t s_len, size_t d_len,
+ int level);
+extern int gzip_decompress(void *src, void *dst, size_t s_len, size_t d_len,
+ int level);
+extern size_t zle_compress(void *src, void *dst, size_t s_len, size_t d_len,
+ int level);
+extern int zle_decompress(void *src, void *dst, size_t s_len, size_t d_len,
+ int level);
+extern void lz4_init(void);
+extern void lz4_fini(void);
+extern size_t lz4_compress(void *src, void *dst, size_t s_len, size_t d_len,
+ int level);
+extern int lz4_decompress(void *src, void *dst, size_t s_len, size_t d_len,
+ int level);
+
+/*
+ * Compress and decompress data if necessary.
+ */
+extern size_t zio_compress_data(enum zio_compress c, abd_t *src, void *dst,
+ size_t s_len);
+extern int zio_decompress_data(enum zio_compress c, abd_t *src, void *dst,
+ size_t s_len, size_t d_len);
+extern int zio_decompress_data_buf(enum zio_compress c, void *src, void *dst,
+ size_t s_len, size_t d_len);
+
+/*
+ * Module lifetime management.
+ */
+extern void zio_compress_init(void);
+extern void zio_compress_fini(void);
+
+#ifdef __cplusplus
+}
+#endif
+
+#endif /* _SYS_ZIO_COMPRESS_H */
diff --git a/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/zio_impl.h b/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/zio_impl.h
new file mode 100644
index 000000000000..96b3b0135813
--- /dev/null
+++ b/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/zio_impl.h
@@ -0,0 +1,256 @@
+/*
+ * CDDL HEADER START
+ *
+ * The contents of this file are subject to the terms of the
+ * Common Development and Distribution License (the "License").
+ * You may not use this file except in compliance with the License.
+ *
+ * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
+ * or http://www.opensolaris.org/os/licensing.
+ * See the License for the specific language governing permissions
+ * and limitations under the License.
+ *
+ * When distributing Covered Code, include this CDDL HEADER in each
+ * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
+ * If applicable, add the following below this CDDL HEADER, with the
+ * fields enclosed by brackets "[]" replaced with your own identifying
+ * information: Portions Copyright [yyyy] [name of copyright owner]
+ *
+ * CDDL HEADER END
+ */
+/*
+ * Copyright 2009 Sun Microsystems, Inc. All rights reserved.
+ * Use is subject to license terms.
+ */
+
+/*
+ * Copyright (c) 2012, 2015 by Delphix. All rights reserved.
+ */
+
+#ifndef _ZIO_IMPL_H
+#define _ZIO_IMPL_H
+
+#include <sys/zfs_context.h>
+#include <sys/zio.h>
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+/*
+ * The ZFS I/O pipeline.
+ *
+ * The ZFS I/O pipeline is composed of various stages, which are defined
+ * in the zio_stage enum below. The individual stages are used to construct
+ * these basic I/O operations: Read, Write, Free, Claim, and Ioctl.
+ *
+ * I/O operations (in brief):
+ *
+ * Read: read a block from disk and verify its checksum.
+ * Write: compress, checksum, allocate a DVA, and issue to the vdevs.
+ * Free: return a block's space to the allocator.
+ * Claim: claim a block during intent log replay so it isn't reallocated.
+ * Ioctl: issue a device command, e.g. a write cache flush.
+ *
+ * Although the most common pipelines are used by the basic I/O operations
+ * above, there are some helper pipelines (one could consider them
+ * sub-pipelines) which are used internally by the ZIO module and are
+ * explained below:
+ *
+ * Interlock Pipeline:
+ * The interlock pipeline is the most basic pipeline and is used by all
+ * of the I/O operations. The interlock pipeline does not perform any I/O
+ * and is used to coordinate the dependencies between I/Os that are being
+ * issued (i.e. the parent/child relationship).
+ *
+ * Vdev child Pipeline:
+ * The vdev child pipeline is responsible for performing the physical I/O.
+ * It is in this pipeline that I/Os are queued and possibly cached.
+ *
+ * In addition to performing I/O, the pipeline is also responsible for
+ * data transformations. The transformations performed are based on the
+ * specific properties that the user may have selected, and they modify
+ * the behavior of the pipeline. Examples of supported transformations
+ * are compression, dedup, and nop writes. Transformations will either
+ * modify the data or the pipeline. The list below further describes each of
+ * the supported transformations:
+ *
+ * Compression:
+ * ZFS supports four different flavors of compression -- gzip, lzjb,
+ * zle, and lz4. Compression occurs as part of the write pipeline and is
+ * performed in the ZIO_STAGE_WRITE_COMPRESS stage.
+ *
+ * Dedup:
+ * Dedup reads are handled by the ZIO_STAGE_DDT_READ_START and
+ * ZIO_STAGE_DDT_READ_DONE stages. These stages are added to an existing
+ * read pipeline if the dedup bit is set on the block pointer.
+ * Writing a dedup block is performed by the ZIO_STAGE_DDT_WRITE stage
+ * and added to a write pipeline if a user has enabled dedup on that
+ * particular dataset.
+ *
+ * NOP Write:
+ * The NOP write feature is performed by the ZIO_STAGE_NOP_WRITE stage
+ * and is added to an existing write pipeline if a cryptographically
+ * secure checksum (e.g. SHA256) is enabled and compression is turned on.
+ * The NOP write stage will compare the checksums of the current data
+ * on-disk (level-0 blocks only) and the data that is currently being written.
+ * If the checksum values are identical then the pipeline is converted to
+ * an interlock pipeline skipping block allocation and bypassing the
+ * physical I/O. The nop write feature can handle writes in either
+ * syncing or open context (i.e. zil writes) and as a result is mutually
+ * exclusive with dedup.
+ */
+
+/*
+ * zio pipeline stage definitions
+ */
+enum zio_stage {
+ ZIO_STAGE_OPEN = 1 << 0, /* RWFCI */
+
+ ZIO_STAGE_READ_BP_INIT = 1 << 1, /* R---- */
+ ZIO_STAGE_WRITE_BP_INIT = 1 << 2, /* -W--- */
+ ZIO_STAGE_FREE_BP_INIT = 1 << 3, /* --F-- */
+ ZIO_STAGE_ISSUE_ASYNC = 1 << 4, /* RWF-- */
+ ZIO_STAGE_WRITE_COMPRESS = 1 << 5, /* -W--- */
+
+ ZIO_STAGE_CHECKSUM_GENERATE = 1 << 6, /* -W--- */
+
+ ZIO_STAGE_NOP_WRITE = 1 << 7, /* -W--- */
+
+ ZIO_STAGE_DDT_READ_START = 1 << 8, /* R---- */
+ ZIO_STAGE_DDT_READ_DONE = 1 << 9, /* R---- */
+ ZIO_STAGE_DDT_WRITE = 1 << 10, /* -W--- */
+ ZIO_STAGE_DDT_FREE = 1 << 11, /* --F-- */
+
+ ZIO_STAGE_GANG_ASSEMBLE = 1 << 12, /* RWFC- */
+ ZIO_STAGE_GANG_ISSUE = 1 << 13, /* RWFC- */
+
+ ZIO_STAGE_DVA_THROTTLE = 1 << 14, /* -W--- */
+ ZIO_STAGE_DVA_ALLOCATE = 1 << 15, /* -W--- */
+ ZIO_STAGE_DVA_FREE = 1 << 16, /* --F-- */
+ ZIO_STAGE_DVA_CLAIM = 1 << 17, /* ---C- */
+
+ ZIO_STAGE_READY = 1 << 18, /* RWFCI */
+
+ ZIO_STAGE_VDEV_IO_START = 1 << 19, /* RWF-I */
+ ZIO_STAGE_VDEV_IO_DONE = 1 << 20, /* RWF-I */
+ ZIO_STAGE_VDEV_IO_ASSESS = 1 << 21, /* RWF-I */
+
+ ZIO_STAGE_CHECKSUM_VERIFY = 1 << 22, /* R---- */
+
+ ZIO_STAGE_DONE = 1 << 23 /* RWFCI */
+};
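+
+/*
+ * Each pipeline below is a bitmask of the stages above. A simplified
+ * sketch of how the executor advances a zio to its next stage (the
+ * real loop lives in zio_execute()):
+ */
+#if 0
+	enum zio_stage stage = zio->io_stage;
+
+	do {
+		stage <<= 1;	/* skip stages absent from this pipeline */
+	} while ((stage & zio->io_pipeline) == 0);
+#endif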
+
+#define ZIO_INTERLOCK_STAGES \
+ (ZIO_STAGE_READY | \
+ ZIO_STAGE_DONE)
+
+#define ZIO_INTERLOCK_PIPELINE \
+ ZIO_INTERLOCK_STAGES
+
+#define ZIO_VDEV_IO_STAGES \
+ (ZIO_STAGE_VDEV_IO_START | \
+ ZIO_STAGE_VDEV_IO_DONE | \
+ ZIO_STAGE_VDEV_IO_ASSESS)
+
+#define ZIO_VDEV_CHILD_PIPELINE \
+ (ZIO_VDEV_IO_STAGES | \
+ ZIO_STAGE_DONE)
+
+#define ZIO_READ_COMMON_STAGES \
+ (ZIO_INTERLOCK_STAGES | \
+ ZIO_VDEV_IO_STAGES | \
+ ZIO_STAGE_CHECKSUM_VERIFY)
+
+#define ZIO_READ_PHYS_PIPELINE \
+ ZIO_READ_COMMON_STAGES
+
+#define ZIO_READ_PIPELINE \
+ (ZIO_READ_COMMON_STAGES | \
+ ZIO_STAGE_READ_BP_INIT)
+
+#define ZIO_DDT_CHILD_READ_PIPELINE \
+ ZIO_READ_COMMON_STAGES
+
+#define ZIO_DDT_READ_PIPELINE \
+ (ZIO_INTERLOCK_STAGES | \
+ ZIO_STAGE_READ_BP_INIT | \
+ ZIO_STAGE_DDT_READ_START | \
+ ZIO_STAGE_DDT_READ_DONE)
+
+#define ZIO_WRITE_COMMON_STAGES \
+ (ZIO_INTERLOCK_STAGES | \
+ ZIO_VDEV_IO_STAGES | \
+ ZIO_STAGE_ISSUE_ASYNC | \
+ ZIO_STAGE_CHECKSUM_GENERATE)
+
+#define ZIO_WRITE_PHYS_PIPELINE \
+ ZIO_WRITE_COMMON_STAGES
+
+#define ZIO_REWRITE_PIPELINE \
+ (ZIO_WRITE_COMMON_STAGES | \
+ ZIO_STAGE_WRITE_COMPRESS | \
+ ZIO_STAGE_WRITE_BP_INIT)
+
+#define ZIO_WRITE_PIPELINE \
+ (ZIO_WRITE_COMMON_STAGES | \
+ ZIO_STAGE_WRITE_BP_INIT | \
+ ZIO_STAGE_WRITE_COMPRESS | \
+ ZIO_STAGE_DVA_THROTTLE | \
+ ZIO_STAGE_DVA_ALLOCATE)
+
+#define ZIO_DDT_CHILD_WRITE_PIPELINE \
+ (ZIO_INTERLOCK_STAGES | \
+ ZIO_VDEV_IO_STAGES | \
+ ZIO_STAGE_DVA_THROTTLE | \
+ ZIO_STAGE_DVA_ALLOCATE)
+
+#define ZIO_DDT_WRITE_PIPELINE \
+ (ZIO_INTERLOCK_STAGES | \
+ ZIO_STAGE_WRITE_BP_INIT | \
+ ZIO_STAGE_ISSUE_ASYNC | \
+ ZIO_STAGE_WRITE_COMPRESS | \
+ ZIO_STAGE_CHECKSUM_GENERATE | \
+ ZIO_STAGE_DDT_WRITE)
+
+#define ZIO_GANG_STAGES \
+ (ZIO_STAGE_GANG_ASSEMBLE | \
+ ZIO_STAGE_GANG_ISSUE)
+
+#define ZIO_FREE_PIPELINE \
+ (ZIO_INTERLOCK_STAGES | \
+ ZIO_STAGE_FREE_BP_INIT | \
+ ZIO_STAGE_DVA_FREE)
+
+#define ZIO_FREE_PHYS_PIPELINE \
+ (ZIO_INTERLOCK_STAGES | \
+ ZIO_VDEV_IO_STAGES)
+
+#define ZIO_DDT_FREE_PIPELINE \
+ (ZIO_INTERLOCK_STAGES | \
+ ZIO_STAGE_FREE_BP_INIT | \
+ ZIO_STAGE_ISSUE_ASYNC | \
+ ZIO_STAGE_DDT_FREE)
+
+#define ZIO_CLAIM_PIPELINE \
+ (ZIO_INTERLOCK_STAGES | \
+ ZIO_STAGE_DVA_CLAIM)
+
+#define ZIO_IOCTL_PIPELINE \
+ (ZIO_INTERLOCK_STAGES | \
+ ZIO_STAGE_VDEV_IO_START | \
+ ZIO_STAGE_VDEV_IO_ASSESS)
+
+#define ZIO_BLOCKING_STAGES \
+ (ZIO_STAGE_DVA_ALLOCATE | \
+ ZIO_STAGE_DVA_CLAIM | \
+ ZIO_STAGE_VDEV_IO_START)
+
+extern void zio_inject_init(void);
+extern void zio_inject_fini(void);
+
+#ifdef __cplusplus
+}
+#endif
+
+#endif /* _ZIO_IMPL_H */
diff --git a/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/zio_priority.h b/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/zio_priority.h
new file mode 100644
index 000000000000..ebe05a09dc4e
--- /dev/null
+++ b/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/zio_priority.h
@@ -0,0 +1,43 @@
+/*
+ * CDDL HEADER START
+ *
+ * This file and its contents are supplied under the terms of the
+ * Common Development and Distribution License ("CDDL"), version 1.0.
+ * You may only use this file in accordance with the terms of version
+ * 1.0 of the CDDL.
+ *
+ * A full copy of the text of the CDDL should have accompanied this
+ * source. A copy of the CDDL is also available via the Internet at
+ * http://www.illumos.org/license/CDDL.
+ *
+ * CDDL HEADER END
+ */
+/*
+ * Copyright (c) 2014, 2016 by Delphix. All rights reserved.
+ */
+#ifndef _ZIO_PRIORITY_H
+#define _ZIO_PRIORITY_H
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+typedef enum zio_priority {
+ ZIO_PRIORITY_SYNC_READ,
+ ZIO_PRIORITY_SYNC_WRITE, /* ZIL */
+ ZIO_PRIORITY_ASYNC_READ, /* prefetch */
+ ZIO_PRIORITY_ASYNC_WRITE, /* spa_sync() */
+ ZIO_PRIORITY_SCRUB, /* asynchronous scrub/resilver reads */
+ ZIO_PRIORITY_TRIM, /* free requests used for TRIM */
+ ZIO_PRIORITY_REMOVAL, /* reads/writes for vdev removal */
+ ZIO_PRIORITY_INITIALIZING, /* initializing I/O */
+ ZIO_PRIORITY_NUM_QUEUEABLE,
+
+ ZIO_PRIORITY_NOW /* non-queued i/os (e.g. free) */
+} zio_priority_t;
+
+#ifdef __cplusplus
+}
+#endif
+
+#endif /* _ZIO_PRIORITY_H */
diff --git a/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/zrlock.h b/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/zrlock.h
new file mode 100644
index 000000000000..b6eba1a18ff4
--- /dev/null
+++ b/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/zrlock.h
@@ -0,0 +1,63 @@
+/*
+ * CDDL HEADER START
+ *
+ * The contents of this file are subject to the terms of the
+ * Common Development and Distribution License (the "License").
+ * You may not use this file except in compliance with the License.
+ *
+ * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
+ * or http://www.opensolaris.org/os/licensing.
+ * See the License for the specific language governing permissions
+ * and limitations under the License.
+ *
+ * When distributing Covered Code, include this CDDL HEADER in each
+ * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
+ * If applicable, add the following below this CDDL HEADER, with the
+ * fields enclosed by brackets "[]" replaced with your own identifying
+ * information: Portions Copyright [yyyy] [name of copyright owner]
+ *
+ * CDDL HEADER END
+ */
+/*
+ * Copyright (c) 2010, Oracle and/or its affiliates. All rights reserved.
+ * Copyright (c) 2015 by Delphix. All rights reserved.
+ */
+
+#ifndef _SYS_ZRLOCK_H
+#define _SYS_ZRLOCK_H
+
+#include <sys/zfs_context.h>
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+typedef struct zrlock {
+ kmutex_t zr_mtx;
+ volatile int32_t zr_refcount;
+ kcondvar_t zr_cv;
+ uint16_t zr_pad;
+#ifdef ZFS_DEBUG
+ kthread_t *zr_owner;
+ const char *zr_caller;
+#endif
+} zrlock_t;
+
+extern void zrl_init(zrlock_t *);
+extern void zrl_destroy(zrlock_t *);
+#define zrl_add(_z) zrl_add_impl((_z), __func__)
+extern void zrl_add_impl(zrlock_t *, const char *);
+extern void zrl_remove(zrlock_t *);
+extern int zrl_tryenter(zrlock_t *);
+extern void zrl_exit(zrlock_t *);
+extern int zrl_is_zero(zrlock_t *);
+extern int zrl_is_locked(zrlock_t *);
+#ifdef ZFS_DEBUG
+extern kthread_t *zrl_owner(zrlock_t *);
+#endif
+
+#ifdef __cplusplus
+}
+#endif
+
+#endif /* _SYS_ZRLOCK_H */
diff --git a/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/zthr.h b/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/zthr.h
new file mode 100644
index 000000000000..33c218ec4c7d
--- /dev/null
+++ b/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/zthr.h
@@ -0,0 +1,39 @@
+/*
+ * CDDL HEADER START
+ *
+ * This file and its contents are supplied under the terms of the
+ * Common Development and Distribution License ("CDDL"), version 1.0.
+ * You may only use this file in accordance with the terms of version
+ * 1.0 of the CDDL.
+ *
+ * A full copy of the text of the CDDL should have accompanied this
+ * source. A copy of the CDDL is also available via the Internet at
+ * http://www.illumos.org/license/CDDL.
+ *
+ * CDDL HEADER END
+ */
+
+/*
+ * Copyright (c) 2017, 2018 by Delphix. All rights reserved.
+ */
+
+#ifndef _SYS_ZTHR_H
+#define _SYS_ZTHR_H
+
+typedef struct zthr zthr_t;
+typedef void (zthr_func_t)(void *, zthr_t *);
+typedef boolean_t (zthr_checkfunc_t)(void *, zthr_t *);
+
+extern zthr_t *zthr_create(zthr_checkfunc_t checkfunc,
+ zthr_func_t *func, void *arg);
+extern zthr_t *zthr_create_timer(zthr_checkfunc_t *checkfunc,
+ zthr_func_t *func, void *arg, hrtime_t nano_wait);
+extern void zthr_destroy(zthr_t *t);
+
+extern void zthr_wakeup(zthr_t *t);
+extern void zthr_cancel(zthr_t *t);
+extern void zthr_resume(zthr_t *t);
+
+extern boolean_t zthr_iscancelled(zthr_t *t);
+
+#endif /* _SYS_ZTHR_H */
diff --git a/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/zvol.h b/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/zvol.h
new file mode 100644
index 000000000000..6bd4d42b8c3f
--- /dev/null
+++ b/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/zvol.h
@@ -0,0 +1,85 @@
+/*
+ * CDDL HEADER START
+ *
+ * The contents of this file are subject to the terms of the
+ * Common Development and Distribution License (the "License").
+ * You may not use this file except in compliance with the License.
+ *
+ * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
+ * or http://www.opensolaris.org/os/licensing.
+ * See the License for the specific language governing permissions
+ * and limitations under the License.
+ *
+ * When distributing Covered Code, include this CDDL HEADER in each
+ * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
+ * If applicable, add the following below this CDDL HEADER, with the
+ * fields enclosed by brackets "[]" replaced with your own identifying
+ * information: Portions Copyright [yyyy] [name of copyright owner]
+ *
+ * CDDL HEADER END
+ */
+
+/*
+ * Copyright (c) 2006, 2010, Oracle and/or its affiliates. All rights reserved.
+ * Copyright (c) 2016 Actifio, Inc. All rights reserved.
+ */
+
+#ifndef _SYS_ZVOL_H
+#define _SYS_ZVOL_H
+
+#include <sys/zfs_context.h>
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+#define ZVOL_OBJ 1ULL
+#define ZVOL_ZAP_OBJ 2ULL
+
+#ifdef _KERNEL
+extern int zvol_check_volsize(uint64_t volsize, uint64_t blocksize);
+extern int zvol_check_volblocksize(uint64_t volblocksize);
+extern int zvol_get_stats(objset_t *os, nvlist_t *nv);
+extern void zvol_create_cb(objset_t *os, void *arg, cred_t *cr, dmu_tx_t *tx);
+extern int zvol_set_volsize(const char *, uint64_t);
+
+#ifdef illumos
+extern int zvol_open(dev_t *devp, int flag, int otyp, cred_t *cr);
+extern int zvol_dump(dev_t dev, caddr_t addr, daddr_t offset, int nblocks);
+extern int zvol_close(dev_t dev, int flag, int otyp, cred_t *cr);
+extern int zvol_strategy(buf_t *bp);
+extern int zvol_read(dev_t dev, uio_t *uiop, cred_t *cr);
+extern int zvol_write(dev_t dev, uio_t *uiop, cred_t *cr);
+extern int zvol_aread(dev_t dev, struct aio_req *aio, cred_t *cr);
+extern int zvol_awrite(dev_t dev, struct aio_req *aio, cred_t *cr);
+#endif /* illumos */
+extern int zvol_ioctl(dev_t dev, int cmd, intptr_t arg, int flag, cred_t *cr,
+ int *rvalp);
+extern int zvol_busy(void);
+extern void zvol_init(void);
+extern void zvol_fini(void);
+
+#ifdef illumos
+extern int zvol_get_volume_params(minor_t minor, uint64_t *blksize,
+ uint64_t *max_xfer_len, void **minor_hdl, void **objset_hdl, void **zil_hdl,
+ void **rl_hdl, void **bonus_hdl);
+extern uint64_t zvol_get_volume_size(void *minor_hdl);
+extern int zvol_get_volume_wce(void *minor_hdl);
+extern void zvol_log_write_minor(void *minor_hdl, dmu_tx_t *tx, offset_t off,
+ ssize_t resid, boolean_t sync);
+#endif /* illumos */
+
+#if defined(__FreeBSD__) || defined(__FreeBSD_kernel__)
+extern void zvol_create_minors(spa_t *spa, const char *name);
+extern void zvol_remove_minors(spa_t *spa, const char *name);
+extern void zvol_rename_minors(spa_t *spa, const char *oldname,
+ const char *newname);
+#endif
+
+#endif /* _KERNEL */
+
+#ifdef __cplusplus
+}
+#endif
+
+#endif /* _SYS_ZVOL_H */
diff --git a/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/trim_map.c b/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/trim_map.c
new file mode 100644
index 000000000000..e837320ce538
--- /dev/null
+++ b/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/trim_map.c
@@ -0,0 +1,634 @@
+/*
+ * CDDL HEADER START
+ *
+ * The contents of this file are subject to the terms of the
+ * Common Development and Distribution License (the "License").
+ * You may not use this file except in compliance with the License.
+ *
+ * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
+ * or http://www.opensolaris.org/os/licensing.
+ * See the License for the specific language governing permissions
+ * and limitations under the License.
+ *
+ * When distributing Covered Code, include this CDDL HEADER in each
+ * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
+ * If applicable, add the following below this CDDL HEADER, with the
+ * fields enclosed by brackets "[]" replaced with your own identifying
+ * information: Portions Copyright [yyyy] [name of copyright owner]
+ *
+ * CDDL HEADER END
+ */
+/*
+ * Copyright (c) 2012 Pawel Jakub Dawidek <pawel@dawidek.net>.
+ * All rights reserved.
+ */
+
+#include <sys/zfs_context.h>
+#include <sys/spa_impl.h>
+#include <sys/vdev_impl.h>
+#include <sys/trim_map.h>
+#include <sys/time.h>
+
+/*
+ * Calculate the zio end, rounding the size up to the vdev's ashift, as
+ * zio_vdev_io_start would do.
+ *
+ * This makes free range consolidation much more effective than it would
+ * otherwise be, and ensures that entire blocks are invalidated by writes.
+ */
+#define TRIM_ZIO_END(vd, offset, size) ((offset) + \
+ P2ROUNDUP((size), 1ULL << (vd)->vdev_top->vdev_ashift))
+
+/* Maximum segment size for ATA TRIM. */
+#define TRIM_MAP_SIZE_FACTOR (512 << 16)
+
+#define TRIM_MAP_SEGS(size) (1 + (size) / TRIM_MAP_SIZE_FACTOR)
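+
+/*
+ * Worked example (illustrative): with a top-level vdev ashift of 12, a
+ * free of size 0x1800 at offset 0x10000 is extended by TRIM_ZIO_END to
+ * 0x10000 + P2ROUNDUP(0x1800, 0x1000) = 0x12000, covering two whole 4KB
+ * blocks.  TRIM_MAP_SEGS then counts the resulting 0x2000-byte segment
+ * as 1 + 0x2000 / (512 << 16) = 1 pending segment, well under the 32MB
+ * ATA TRIM segment limit.
+ */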
+
+#define TRIM_MAP_ADD(tm, ts) do { \
+ list_insert_tail(&(tm)->tm_head, (ts)); \
+ (tm)->tm_pending += TRIM_MAP_SEGS((ts)->ts_end - (ts)->ts_start); \
+} while (0)
+
+#define TRIM_MAP_REM(tm, ts) do { \
+ list_remove(&(tm)->tm_head, (ts)); \
+ (tm)->tm_pending -= TRIM_MAP_SEGS((ts)->ts_end - (ts)->ts_start); \
+} while (0)
+
+typedef struct trim_map {
+ list_t tm_head; /* List of segments sorted by txg. */
+ avl_tree_t tm_queued_frees; /* AVL tree of segments waiting for TRIM. */
+ avl_tree_t tm_inflight_frees; /* AVL tree of in-flight TRIMs. */
+ avl_tree_t tm_inflight_writes; /* AVL tree of in-flight writes. */
+ list_t tm_pending_writes; /* Writes blocked on in-flight frees. */
+ kmutex_t tm_lock;
+ uint64_t tm_pending; /* Count of pending TRIMs. */
+} trim_map_t;
+
+typedef struct trim_seg {
+ avl_node_t ts_node; /* AVL node. */
+ list_node_t ts_next; /* List element. */
+ uint64_t ts_start; /* Starting offset of this segment. */
+ uint64_t ts_end; /* Ending offset (non-inclusive). */
+ uint64_t ts_txg; /* Segment creation txg. */
+ hrtime_t ts_time; /* Segment creation time. */
+} trim_seg_t;
+
+extern boolean_t zfs_trim_enabled;
+
+static u_int trim_txg_delay = 32; /* Keep deleted data up to 32 TXGs */
+static u_int trim_timeout = 30; /* Keep deleted data up to 30s */
+static u_int trim_max_interval = 1; /* 1s delays between TRIMs */
+static u_int trim_vdev_max_pending = 10000; /* Keep up to 10K segments */
+
+SYSCTL_DECL(_vfs_zfs);
+SYSCTL_NODE(_vfs_zfs, OID_AUTO, trim, CTLFLAG_RD | CTLFLAG_MPSAFE, 0,
+ "ZFS TRIM");
+
+SYSCTL_UINT(_vfs_zfs_trim, OID_AUTO, txg_delay, CTLFLAG_RWTUN, &trim_txg_delay,
+ 0, "Delay TRIMs by up to this many TXGs");
+SYSCTL_UINT(_vfs_zfs_trim, OID_AUTO, timeout, CTLFLAG_RWTUN, &trim_timeout, 0,
+ "Delay TRIMs by up to this many seconds");
+SYSCTL_UINT(_vfs_zfs_trim, OID_AUTO, max_interval, CTLFLAG_RWTUN,
+ &trim_max_interval, 0,
+ "Maximum interval between TRIM queue processing (seconds)");
+
+SYSCTL_DECL(_vfs_zfs_vdev);
+SYSCTL_UINT(_vfs_zfs_vdev, OID_AUTO, trim_max_pending, CTLFLAG_RWTUN,
+ &trim_vdev_max_pending, 0,
+ "Maximum pending TRIM segments for a vdev");
+
+static void trim_map_vdev_commit_done(spa_t *spa, vdev_t *vd);
+
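+/*
+ * Both comparators below intentionally treat overlapping ranges as
+ * equal (returning 0), so that avl_find() locates any entry that
+ * intersects the search range, not just exact matches.
+ */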
+static int
+trim_map_seg_compare(const void *x1, const void *x2)
+{
+ const trim_seg_t *s1 = x1;
+ const trim_seg_t *s2 = x2;
+
+ if (s1->ts_start < s2->ts_start) {
+ if (s1->ts_end > s2->ts_start)
+ return (0);
+ return (-1);
+ }
+ if (s1->ts_start > s2->ts_start) {
+ if (s1->ts_start < s2->ts_end)
+ return (0);
+ return (1);
+ }
+ return (0);
+}
+
+static int
+trim_map_zio_compare(const void *x1, const void *x2)
+{
+ const zio_t *z1 = x1;
+ const zio_t *z2 = x2;
+
+ if (z1->io_offset < z2->io_offset) {
+ if (z1->io_offset + z1->io_size > z2->io_offset)
+ return (0);
+ return (-1);
+ }
+ if (z1->io_offset > z2->io_offset) {
+ if (z1->io_offset < z2->io_offset + z2->io_size)
+ return (0);
+ return (1);
+ }
+ return (0);
+}
+
+void
+trim_map_create(vdev_t *vd)
+{
+ trim_map_t *tm;
+
+ ASSERT(zfs_trim_enabled && !vd->vdev_notrim &&
+ vd->vdev_ops->vdev_op_leaf);
+
+ tm = kmem_zalloc(sizeof (*tm), KM_SLEEP);
+ mutex_init(&tm->tm_lock, NULL, MUTEX_DEFAULT, NULL);
+ list_create(&tm->tm_head, sizeof (trim_seg_t),
+ offsetof(trim_seg_t, ts_next));
+ list_create(&tm->tm_pending_writes, sizeof (zio_t),
+ offsetof(zio_t, io_trim_link));
+ avl_create(&tm->tm_queued_frees, trim_map_seg_compare,
+ sizeof (trim_seg_t), offsetof(trim_seg_t, ts_node));
+ avl_create(&tm->tm_inflight_frees, trim_map_seg_compare,
+ sizeof (trim_seg_t), offsetof(trim_seg_t, ts_node));
+ avl_create(&tm->tm_inflight_writes, trim_map_zio_compare,
+ sizeof (zio_t), offsetof(zio_t, io_trim_node));
+ vd->vdev_trimmap = tm;
+}
+
+void
+trim_map_destroy(vdev_t *vd)
+{
+ trim_map_t *tm;
+ trim_seg_t *ts;
+
+ ASSERT(vd->vdev_ops->vdev_op_leaf);
+
+ if (!zfs_trim_enabled)
+ return;
+
+ tm = vd->vdev_trimmap;
+ if (tm == NULL)
+ return;
+
+ /*
+ * We may have been called before trim_map_vdev_commit_done()
+ * had a chance to run, so do it now to prune the remaining
+ * in-flight frees.
+ */
+ trim_map_vdev_commit_done(vd->vdev_spa, vd);
+
+ mutex_enter(&tm->tm_lock);
+ while ((ts = list_head(&tm->tm_head)) != NULL) {
+ avl_remove(&tm->tm_queued_frees, ts);
+ TRIM_MAP_REM(tm, ts);
+ kmem_free(ts, sizeof (*ts));
+ }
+ mutex_exit(&tm->tm_lock);
+
+ avl_destroy(&tm->tm_queued_frees);
+ avl_destroy(&tm->tm_inflight_frees);
+ avl_destroy(&tm->tm_inflight_writes);
+ list_destroy(&tm->tm_pending_writes);
+ list_destroy(&tm->tm_head);
+ mutex_destroy(&tm->tm_lock);
+ kmem_free(tm, sizeof (*tm));
+ vd->vdev_trimmap = NULL;
+}
+
+static void
+trim_map_segment_add(trim_map_t *tm, uint64_t start, uint64_t end, uint64_t txg)
+{
+ avl_index_t where;
+ trim_seg_t tsearch, *ts_before, *ts_after, *ts;
+ boolean_t merge_before, merge_after;
+ hrtime_t time;
+
+ ASSERT(MUTEX_HELD(&tm->tm_lock));
+ VERIFY(start < end);
+
+ time = gethrtime();
+ tsearch.ts_start = start;
+ tsearch.ts_end = end;
+
+ ts = avl_find(&tm->tm_queued_frees, &tsearch, &where);
+ if (ts != NULL) {
+ if (start < ts->ts_start)
+ trim_map_segment_add(tm, start, ts->ts_start, txg);
+ if (end > ts->ts_end)
+ trim_map_segment_add(tm, ts->ts_end, end, txg);
+ return;
+ }
+
+ ts_before = avl_nearest(&tm->tm_queued_frees, where, AVL_BEFORE);
+ ts_after = avl_nearest(&tm->tm_queued_frees, where, AVL_AFTER);
+
+ merge_before = (ts_before != NULL && ts_before->ts_end == start);
+ merge_after = (ts_after != NULL && ts_after->ts_start == end);
+
+ if (merge_before && merge_after) {
+ avl_remove(&tm->tm_queued_frees, ts_before);
+ TRIM_MAP_REM(tm, ts_before);
+ TRIM_MAP_REM(tm, ts_after);
+ ts_after->ts_start = ts_before->ts_start;
+ ts_after->ts_txg = txg;
+ ts_after->ts_time = time;
+ TRIM_MAP_ADD(tm, ts_after);
+ kmem_free(ts_before, sizeof (*ts_before));
+ } else if (merge_before) {
+ TRIM_MAP_REM(tm, ts_before);
+ ts_before->ts_end = end;
+ ts_before->ts_txg = txg;
+ ts_before->ts_time = time;
+ TRIM_MAP_ADD(tm, ts_before);
+ } else if (merge_after) {
+ TRIM_MAP_REM(tm, ts_after);
+ ts_after->ts_start = start;
+ ts_after->ts_txg = txg;
+ ts_after->ts_time = time;
+ TRIM_MAP_ADD(tm, ts_after);
+ } else {
+ ts = kmem_alloc(sizeof (*ts), KM_SLEEP);
+ ts->ts_start = start;
+ ts->ts_end = end;
+ ts->ts_txg = txg;
+ ts->ts_time = time;
+ avl_insert(&tm->tm_queued_frees, ts, where);
+ TRIM_MAP_ADD(tm, ts);
+ }
+}
+
+static void
+trim_map_segment_remove(trim_map_t *tm, trim_seg_t *ts, uint64_t start,
+ uint64_t end)
+{
+ trim_seg_t *nts;
+ boolean_t left_over, right_over;
+
+ ASSERT(MUTEX_HELD(&tm->tm_lock));
+
+ left_over = (ts->ts_start < start);
+ right_over = (ts->ts_end > end);
+
+ TRIM_MAP_REM(tm, ts);
+ if (left_over && right_over) {
+ nts = kmem_alloc(sizeof (*nts), KM_SLEEP);
+ nts->ts_start = end;
+ nts->ts_end = ts->ts_end;
+ nts->ts_txg = ts->ts_txg;
+ nts->ts_time = ts->ts_time;
+ ts->ts_end = start;
+ avl_insert_here(&tm->tm_queued_frees, nts, ts, AVL_AFTER);
+ TRIM_MAP_ADD(tm, ts);
+ TRIM_MAP_ADD(tm, nts);
+ } else if (left_over) {
+ ts->ts_end = start;
+ TRIM_MAP_ADD(tm, ts);
+ } else if (right_over) {
+ ts->ts_start = end;
+ TRIM_MAP_ADD(tm, ts);
+ } else {
+ avl_remove(&tm->tm_queued_frees, ts);
+ kmem_free(ts, sizeof (*ts));
+ }
+}
+
+static void
+trim_map_free_locked(trim_map_t *tm, uint64_t start, uint64_t end, uint64_t txg)
+{
+ zio_t zsearch, *zs;
+
+ ASSERT(MUTEX_HELD(&tm->tm_lock));
+
+ zsearch.io_offset = start;
+ zsearch.io_size = end - start;
+
+ zs = avl_find(&tm->tm_inflight_writes, &zsearch, NULL);
+ if (zs == NULL) {
+ trim_map_segment_add(tm, start, end, txg);
+ return;
+ }
+ if (start < zs->io_offset)
+ trim_map_free_locked(tm, start, zs->io_offset, txg);
+ if (zs->io_offset + zs->io_size < end)
+ trim_map_free_locked(tm, zs->io_offset + zs->io_size, end, txg);
+}
+
+void
+trim_map_free(vdev_t *vd, uint64_t offset, uint64_t size, uint64_t txg)
+{
+ trim_map_t *tm = vd->vdev_trimmap;
+
+ if (!zfs_trim_enabled || vd->vdev_notrim || tm == NULL)
+ return;
+
+ mutex_enter(&tm->tm_lock);
+ trim_map_free_locked(tm, offset, TRIM_ZIO_END(vd, offset, size), txg);
+ mutex_exit(&tm->tm_lock);
+}
+
+boolean_t
+trim_map_write_start(zio_t *zio)
+{
+ vdev_t *vd = zio->io_vd;
+ trim_map_t *tm = vd->vdev_trimmap;
+ trim_seg_t tsearch, *ts;
+ boolean_t left_over, right_over;
+ uint64_t start, end;
+
+ if (!zfs_trim_enabled || vd->vdev_notrim || tm == NULL)
+ return (B_TRUE);
+
+ start = zio->io_offset;
+ end = TRIM_ZIO_END(zio->io_vd, start, zio->io_size);
+ tsearch.ts_start = start;
+ tsearch.ts_end = end;
+
+ mutex_enter(&tm->tm_lock);
+
+ /*
+ * Check for colliding in-flight frees; if any exist, this write
+ * must wait until they complete.
+ */
+ ts = avl_find(&tm->tm_inflight_frees, &tsearch, NULL);
+ if (ts != NULL) {
+ list_insert_tail(&tm->tm_pending_writes, zio);
+ mutex_exit(&tm->tm_lock);
+ return (B_FALSE);
+ }
+
+ /*
+ * Loop until all overlapping segments are removed.
+ */
+ while ((ts = avl_find(&tm->tm_queued_frees, &tsearch, NULL)) != NULL) {
+ trim_map_segment_remove(tm, ts, start, end);
+ }
+
+ avl_add(&tm->tm_inflight_writes, zio);
+
+ mutex_exit(&tm->tm_lock);
+
+ return (B_TRUE);
+}
+
+void
+trim_map_write_done(zio_t *zio)
+{
+ vdev_t *vd = zio->io_vd;
+ trim_map_t *tm = vd->vdev_trimmap;
+
+ /*
+ * Don't check for vdev_notrim, since the write could have
+ * started before vdev_notrim was set.
+ */
+ if (!zfs_trim_enabled || tm == NULL)
+ return;
+
+ mutex_enter(&tm->tm_lock);
+ /*
+ * Don't fail if the write isn't in the tree, since the write
+ * could have started after vdev_notrim was set.
+ */
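+ /*
+ * The test below checks raw AVL linkage: the node is in the tree
+ * iff it has a child, has a parent, or is the tree's root.
+ */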
+ if (zio->io_trim_node.avl_child[0] ||
+ zio->io_trim_node.avl_child[1] ||
+ AVL_XPARENT(&zio->io_trim_node) ||
+ tm->tm_inflight_writes.avl_root == &zio->io_trim_node)
+ avl_remove(&tm->tm_inflight_writes, zio);
+ mutex_exit(&tm->tm_lock);
+}
+
+/*
+ * Return the oldest segment (the one with the lowest txg / time) or NULL if:
+ * 1. The list is empty
+ * 2. The first element's txg is greater than txgsafe
+ * 3. The first element's txg is greater than the txg argument, its
+ * time is greater than the time argument, and force is not set
+ */
+static trim_seg_t *
+trim_map_first(trim_map_t *tm, uint64_t txg, uint64_t txgsafe, hrtime_t time,
+ boolean_t force)
+{
+ trim_seg_t *ts;
+
+ ASSERT(MUTEX_HELD(&tm->tm_lock));
+ VERIFY(txgsafe >= txg);
+
+ ts = list_head(&tm->tm_head);
+ if (ts != NULL && ts->ts_txg <= txgsafe &&
+ (ts->ts_txg <= txg || ts->ts_time <= time || force))
+ return (ts);
+ return (NULL);
+}
+
+static void
+trim_map_vdev_commit(spa_t *spa, zio_t *zio, vdev_t *vd)
+{
+ trim_map_t *tm = vd->vdev_trimmap;
+ trim_seg_t *ts;
+ uint64_t size, offset, txgtarget, txgsafe;
+ int64_t hard, soft;
+ hrtime_t timelimit;
+
+ ASSERT(vd->vdev_ops->vdev_op_leaf);
+
+ if (tm == NULL)
+ return;
+
+ timelimit = gethrtime() - (hrtime_t)trim_timeout * NANOSEC;
+ if (vd->vdev_isl2cache) {
+ txgsafe = UINT64_MAX;
+ txgtarget = UINT64_MAX;
+ } else {
+ txgsafe = MIN(spa_last_synced_txg(spa), spa_freeze_txg(spa));
+ if (txgsafe > trim_txg_delay)
+ txgtarget = txgsafe - trim_txg_delay;
+ else
+ txgtarget = 0;
+ }
+
+ mutex_enter(&tm->tm_lock);
+ hard = 0;
+ if (tm->tm_pending > trim_vdev_max_pending)
+ hard = (tm->tm_pending - trim_vdev_max_pending) / 4;
+ soft = P2ROUNDUP(hard + tm->tm_pending / trim_timeout + 1, 64);
+ /* Loop until we have issued all outstanding frees. */
+ while (soft > 0 &&
+ (ts = trim_map_first(tm, txgtarget, txgsafe, timelimit, hard > 0))
+ != NULL) {
+ TRIM_MAP_REM(tm, ts);
+ avl_remove(&tm->tm_queued_frees, ts);
+ avl_add(&tm->tm_inflight_frees, ts);
+ size = ts->ts_end - ts->ts_start;
+ offset = ts->ts_start;
+ /*
+ * We drop the lock while we call zio_nowait(), as the I/O
+ * scheduler can cause a different I/O (e.g. a write) to run,
+ * which would otherwise take this lock recursively.
+ */
+ mutex_exit(&tm->tm_lock);
+
+ zio_nowait(zio_trim(zio, spa, vd, offset, size));
+
+ soft -= TRIM_MAP_SEGS(size);
+ hard -= TRIM_MAP_SEGS(size);
+ mutex_enter(&tm->tm_lock);
+ }
+ mutex_exit(&tm->tm_lock);
+}
+
+static void
+trim_map_vdev_commit_done(spa_t *spa, vdev_t *vd)
+{
+ trim_map_t *tm = vd->vdev_trimmap;
+ trim_seg_t *ts;
+ list_t pending_writes;
+ zio_t *zio;
+ uint64_t start, size;
+ void *cookie;
+
+ ASSERT(vd->vdev_ops->vdev_op_leaf);
+
+ if (tm == NULL)
+ return;
+
+ mutex_enter(&tm->tm_lock);
+ if (!avl_is_empty(&tm->tm_inflight_frees)) {
+ cookie = NULL;
+ while ((ts = avl_destroy_nodes(&tm->tm_inflight_frees,
+ &cookie)) != NULL) {
+ kmem_free(ts, sizeof (*ts));
+ }
+ }
+ list_create(&pending_writes, sizeof (zio_t), offsetof(zio_t,
+ io_trim_link));
+ list_move_tail(&pending_writes, &tm->tm_pending_writes);
+ mutex_exit(&tm->tm_lock);
+
+ while ((zio = list_remove_head(&pending_writes)) != NULL) {
+ zio_vdev_io_reissue(zio);
+ zio_execute(zio);
+ }
+ list_destroy(&pending_writes);
+}
+
+static void
+trim_map_commit(spa_t *spa, zio_t *zio, vdev_t *vd)
+{
+ int c;
+
+ if (vd == NULL)
+ return;
+
+ if (vd->vdev_ops->vdev_op_leaf) {
+ trim_map_vdev_commit(spa, zio, vd);
+ } else {
+ for (c = 0; c < vd->vdev_children; c++)
+ trim_map_commit(spa, zio, vd->vdev_child[c]);
+ }
+}
+
+static void
+trim_map_commit_done(spa_t *spa, vdev_t *vd)
+{
+ int c;
+
+ if (vd == NULL)
+ return;
+
+ if (vd->vdev_ops->vdev_op_leaf) {
+ trim_map_vdev_commit_done(spa, vd);
+ } else {
+ for (c = 0; c < vd->vdev_children; c++)
+ trim_map_commit_done(spa, vd->vdev_child[c]);
+ }
+}
+
+static void
+trim_thread(void *arg)
+{
+ spa_t *spa = arg;
+ zio_t *zio;
+
+#ifdef _KERNEL
+ (void) snprintf(curthread->td_name, sizeof(curthread->td_name),
+ "trim %s", spa_name(spa));
+#endif
+
+ for (;;) {
+ mutex_enter(&spa->spa_trim_lock);
+ if (spa->spa_trim_thread == NULL) {
+ spa->spa_trim_thread = curthread;
+ cv_signal(&spa->spa_trim_cv);
+ mutex_exit(&spa->spa_trim_lock);
+ thread_exit();
+ }
+
+ (void) cv_timedwait(&spa->spa_trim_cv, &spa->spa_trim_lock,
+ hz * trim_max_interval);
+ mutex_exit(&spa->spa_trim_lock);
+
+ zio = zio_root(spa, NULL, NULL, ZIO_FLAG_CANFAIL);
+
+ spa_config_enter(spa, SCL_STATE, FTAG, RW_READER);
+ trim_map_commit(spa, zio, spa->spa_root_vdev);
+ (void) zio_wait(zio);
+ trim_map_commit_done(spa, spa->spa_root_vdev);
+ spa_config_exit(spa, SCL_STATE, FTAG);
+ }
+}
+
+void
+trim_thread_create(spa_t *spa)
+{
+
+ if (!zfs_trim_enabled)
+ return;
+
+ mutex_init(&spa->spa_trim_lock, NULL, MUTEX_DEFAULT, NULL);
+ cv_init(&spa->spa_trim_cv, NULL, CV_DEFAULT, NULL);
+ mutex_enter(&spa->spa_trim_lock);
+ spa->spa_trim_thread = thread_create(NULL, 0, trim_thread, spa, 0, &p0,
+ TS_RUN, minclsyspri);
+ mutex_exit(&spa->spa_trim_lock);
+}
+
+void
+trim_thread_destroy(spa_t *spa)
+{
+
+ if (!zfs_trim_enabled)
+ return;
+ if (spa->spa_trim_thread == NULL)
+ return;
+
+ mutex_enter(&spa->spa_trim_lock);
+ /* Setting spa_trim_thread to NULL tells the thread to stop. */
+ spa->spa_trim_thread = NULL;
+ cv_signal(&spa->spa_trim_cv);
+ /* The thread will set it back to != NULL on exit. */
+ while (spa->spa_trim_thread == NULL)
+ cv_wait(&spa->spa_trim_cv, &spa->spa_trim_lock);
+ spa->spa_trim_thread = NULL;
+ mutex_exit(&spa->spa_trim_lock);
+
+ cv_destroy(&spa->spa_trim_cv);
+ mutex_destroy(&spa->spa_trim_lock);
+}
+
+void
+trim_thread_wakeup(spa_t *spa)
+{
+
+ if (!zfs_trim_enabled)
+ return;
+ if (spa->spa_trim_thread == NULL)
+ return;
+
+ mutex_enter(&spa->spa_trim_lock);
+ cv_signal(&spa->spa_trim_cv);
+ mutex_exit(&spa->spa_trim_lock);
+}
diff --git a/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/txg.c b/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/txg.c
new file mode 100644
index 000000000000..64a5d0972a74
--- /dev/null
+++ b/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/txg.c
@@ -0,0 +1,977 @@
+/*
+ * CDDL HEADER START
+ *
+ * The contents of this file are subject to the terms of the
+ * Common Development and Distribution License (the "License").
+ * You may not use this file except in compliance with the License.
+ *
+ * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
+ * or http://www.opensolaris.org/os/licensing.
+ * See the License for the specific language governing permissions
+ * and limitations under the License.
+ *
+ * When distributing Covered Code, include this CDDL HEADER in each
+ * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
+ * If applicable, add the following below this CDDL HEADER, with the
+ * fields enclosed by brackets "[]" replaced with your own identifying
+ * information: Portions Copyright [yyyy] [name of copyright owner]
+ *
+ * CDDL HEADER END
+ */
+/*
+ * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved.
+ * Portions Copyright 2011 Martin Matuska <mm@FreeBSD.org>
+ * Copyright (c) 2012, 2017 by Delphix. All rights reserved.
+ */
+
+#include <sys/zfs_context.h>
+#include <sys/txg_impl.h>
+#include <sys/dmu_impl.h>
+#include <sys/dmu_tx.h>
+#include <sys/dsl_pool.h>
+#include <sys/dsl_scan.h>
+#include <sys/zil.h>
+#include <sys/callb.h>
+
+/*
+ * ZFS Transaction Groups
+ * ----------------------
+ *
+ * ZFS transaction groups are, as the name implies, groups of transactions
+ * that act on persistent state. ZFS asserts consistency at the granularity of
+ * these transaction groups. Each successive transaction group (txg) is
+ * assigned a 64-bit consecutive identifier. There are three active
+ * transaction group states: open, quiescing, or syncing. At any given time,
+ * there may be an active txg associated with each state; each active txg may
+ * either be processing or blocked waiting to enter the next state. There may
+ * be up to three active txgs, and there is always a txg in the open state
+ * (though it may be blocked waiting to enter the quiescing state). In broad
+ * strokes, transactions -- operations that change in-memory structures -- are
+ * accepted into the txg in the open state, and are completed while the txg is
+ * in the open or quiescing states. The accumulated changes are written to
+ * disk in the syncing state.
+ *
+ * Open
+ *
+ * When a new txg becomes active, it first enters the open state. New
+ * transactions -- updates to in-memory structures -- are assigned to the
+ * currently open txg. There is always a txg in the open state so that ZFS can
+ * accept new changes (though the txg may refuse new changes if it has hit
+ * some limit). ZFS advances the open txg to the next state for a variety of
+ * reasons such as it hitting a time or size threshold, or the execution of an
+ * administrative action that must be completed in the syncing state.
+ *
+ * Quiescing
+ *
+ * After a txg exits the open state, it enters the quiescing state. The
+ * quiescing state is intended to provide a buffer between accepting new
+ * transactions in the open state and writing them out to stable storage in
+ * the syncing state. While quiescing, transactions can continue their
+ * operation without delaying either of the other states. Typically, a txg is
+ * in the quiescing state very briefly since the operations are bounded by
+ * software latencies rather than, say, slower I/O latencies. After all
+ * transactions complete, the txg is ready to enter the next state.
+ *
+ * Syncing
+ *
+ * In the syncing state, the in-memory state built up during the open and (to
+ * a lesser degree) the quiescing states is written to stable storage. The
+ * process of writing out modified data can, in turn, modify more data. For
+ * example, when we write new blocks, we need to allocate space for them; those
+ * allocations modify metadata (space maps)... which themselves must be
+ * written to stable storage. During the sync state, ZFS iterates, writing out
+ * data until it converges and all in-memory changes have been written out.
+ * The first such pass is the largest as it encompasses all the modified user
+ * data (as opposed to filesystem metadata). Subsequent passes typically have
+ * far less data to write as they consist exclusively of filesystem metadata.
+ *
+ * To ensure convergence, after a certain number of passes ZFS begins
+ * overwriting locations on stable storage that had been allocated earlier in
+ * the syncing state (and subsequently freed). ZFS usually allocates new
+ * blocks to optimize for large, continuous writes. For the syncing state to
+ * converge, however, it must complete a pass where no new blocks are allocated,
+ * since each allocation requires a modification of persistent metadata.
+ * Further, to hasten convergence, after a prescribed number of passes, ZFS
+ * also defers frees, and stops compressing.
+ *
+ * In addition to writing out user data, we must also execute synctasks during
+ * the syncing context. A synctask is the mechanism by which some
+ * administrative activities work such as creating and destroying snapshots or
+ * datasets. Note that when a synctask is initiated it enters the open txg,
+ * and ZFS then pushes that txg as quickly as possible to completion of the
+ * syncing state in order to reduce the latency of the administrative
+ * activity. To complete the syncing state, ZFS writes out a new uberblock,
+ * the root of the tree of blocks that comprise all state stored on the ZFS
+ * pool. Finally, if there is a quiesced txg waiting, we signal that it can
+ * now transition to the syncing state.
+ */
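+
+/*
+ * Illustrative sketch (not part of the build): how a writer typically
+ * carries a transaction through the states above using the interfaces
+ * defined in this file.  The helper name is hypothetical; the real call
+ * sites live in dmu_tx.c.
+ */
+#if 0
+static void
+example_txg_lifecycle(dsl_pool_t *dp)
+{
+ txg_handle_t th;
+ uint64_t txg;
+
+ /* Join the currently open txg; returns with the open lock held. */
+ txg = txg_hold_open(dp, &th);
+
+ /* ... dirty in-memory structures on behalf of this txg ... */
+
+ /* Drop the open lock so the txg can advance toward quiescing. */
+ txg_rele_to_quiesce(&th);
+
+ /* ... finish the in-memory updates ... */
+
+ /* Release the hold; once all holds drop, the txg can quiesce. */
+ txg_rele_to_sync(&th);
+
+ /* Optionally block until the txg has reached stable storage. */
+ txg_wait_synced(dp, txg);
+}
+#endif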
+
+static void txg_sync_thread(void *arg);
+static void txg_quiesce_thread(void *arg);
+
+int zfs_txg_timeout = 5; /* max seconds worth of delta per txg */
+
+SYSCTL_DECL(_vfs_zfs);
+SYSCTL_NODE(_vfs_zfs, OID_AUTO, txg, CTLFLAG_RW | CTLFLAG_MPSAFE, 0,
+ "ZFS TXG");
+SYSCTL_INT(_vfs_zfs_txg, OID_AUTO, timeout, CTLFLAG_RWTUN, &zfs_txg_timeout, 0,
+ "Maximum seconds worth of delta per txg");
+
+/*
+ * Prepare the txg subsystem.
+ */
+void
+txg_init(dsl_pool_t *dp, uint64_t txg)
+{
+ tx_state_t *tx = &dp->dp_tx;
+ int c;
+ bzero(tx, sizeof (tx_state_t));
+
+ tx->tx_cpu = kmem_zalloc(max_ncpus * sizeof (tx_cpu_t), KM_SLEEP);
+
+ for (c = 0; c < max_ncpus; c++) {
+ int i;
+
+ mutex_init(&tx->tx_cpu[c].tc_lock, NULL, MUTEX_DEFAULT, NULL);
+ mutex_init(&tx->tx_cpu[c].tc_open_lock, NULL, MUTEX_DEFAULT,
+ NULL);
+ for (i = 0; i < TXG_SIZE; i++) {
+ cv_init(&tx->tx_cpu[c].tc_cv[i], NULL, CV_DEFAULT,
+ NULL);
+ list_create(&tx->tx_cpu[c].tc_callbacks[i],
+ sizeof (dmu_tx_callback_t),
+ offsetof(dmu_tx_callback_t, dcb_node));
+ }
+ }
+
+ mutex_init(&tx->tx_sync_lock, NULL, MUTEX_DEFAULT, NULL);
+
+ cv_init(&tx->tx_sync_more_cv, NULL, CV_DEFAULT, NULL);
+ cv_init(&tx->tx_sync_done_cv, NULL, CV_DEFAULT, NULL);
+ cv_init(&tx->tx_quiesce_more_cv, NULL, CV_DEFAULT, NULL);
+ cv_init(&tx->tx_quiesce_done_cv, NULL, CV_DEFAULT, NULL);
+ cv_init(&tx->tx_exit_cv, NULL, CV_DEFAULT, NULL);
+
+ tx->tx_open_txg = txg;
+}
+
+/*
+ * Close down the txg subsystem.
+ */
+void
+txg_fini(dsl_pool_t *dp)
+{
+ tx_state_t *tx = &dp->dp_tx;
+ int c;
+
+ ASSERT0(tx->tx_threads);
+
+ mutex_destroy(&tx->tx_sync_lock);
+
+ cv_destroy(&tx->tx_sync_more_cv);
+ cv_destroy(&tx->tx_sync_done_cv);
+ cv_destroy(&tx->tx_quiesce_more_cv);
+ cv_destroy(&tx->tx_quiesce_done_cv);
+ cv_destroy(&tx->tx_exit_cv);
+
+ for (c = 0; c < max_ncpus; c++) {
+ int i;
+
+ mutex_destroy(&tx->tx_cpu[c].tc_open_lock);
+ mutex_destroy(&tx->tx_cpu[c].tc_lock);
+ for (i = 0; i < TXG_SIZE; i++) {
+ cv_destroy(&tx->tx_cpu[c].tc_cv[i]);
+ list_destroy(&tx->tx_cpu[c].tc_callbacks[i]);
+ }
+ }
+
+ if (tx->tx_commit_cb_taskq != NULL)
+ taskq_destroy(tx->tx_commit_cb_taskq);
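+/*
+ * A zrlock ("zero reference lock") admits many concurrent references via
+ * zrl_add()/zrl_remove(), while zrl_tryenter() succeeds only when the
+ * reference count is zero, granting exclusive access until zrl_exit().
+ */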
+
+ kmem_free(tx->tx_cpu, max_ncpus * sizeof (tx_cpu_t));
+
+ bzero(tx, sizeof (tx_state_t));
+}
+
+/*
+ * Start syncing transaction groups.
+ */
+void
+txg_sync_start(dsl_pool_t *dp)
+{
+ tx_state_t *tx = &dp->dp_tx;
+
+ mutex_enter(&tx->tx_sync_lock);
+
+ dprintf("pool %p\n", dp);
+
+ ASSERT0(tx->tx_threads);
+
+ tx->tx_threads = 2;
+
+ tx->tx_quiesce_thread = thread_create(NULL, 0, txg_quiesce_thread,
+ dp, 0, spa_proc(dp->dp_spa), TS_RUN, minclsyspri);
+
+ /*
+ * The sync thread can need a larger-than-default stack size on
+ * 32-bit x86. This is due in part to nested pools and
+ * scrub_visitbp() recursion.
+ */
+ tx->tx_sync_thread = thread_create(NULL, 32<<10, txg_sync_thread,
+ dp, 0, spa_proc(dp->dp_spa), TS_RUN, minclsyspri);
+
+ mutex_exit(&tx->tx_sync_lock);
+}
+
+static void
+txg_thread_enter(tx_state_t *tx, callb_cpr_t *cpr)
+{
+ CALLB_CPR_INIT(cpr, &tx->tx_sync_lock, callb_generic_cpr, FTAG);
+ mutex_enter(&tx->tx_sync_lock);
+}
+
+static void
+txg_thread_exit(tx_state_t *tx, callb_cpr_t *cpr, kthread_t **tpp)
+{
+ ASSERT(*tpp != NULL);
+ *tpp = NULL;
+ tx->tx_threads--;
+ cv_broadcast(&tx->tx_exit_cv);
+ CALLB_CPR_EXIT(cpr); /* drops &tx->tx_sync_lock */
+ thread_exit();
+}
+
+static void
+txg_thread_wait(tx_state_t *tx, callb_cpr_t *cpr, kcondvar_t *cv, clock_t time)
+{
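+/*
+ * A zthr is a kernel thread wrapper: 'checkfunc' reports whether there
+ * is work to do and 'func' performs it.  zthr_wakeup() prompts a
+ * re-check, zthr_cancel() stops the thread until zthr_resume(), and the
+ * timer variant re-checks at least every 'nano_wait' nanoseconds.
+ */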
+ CALLB_CPR_SAFE_BEGIN(cpr);
+
+ if (time)
+ (void) cv_timedwait(cv, &tx->tx_sync_lock, time);
+ else
+ cv_wait(cv, &tx->tx_sync_lock);
+
+ CALLB_CPR_SAFE_END(cpr, &tx->tx_sync_lock);
+}
+
+/*
+ * Stop syncing transaction groups.
+ */
+void
+txg_sync_stop(dsl_pool_t *dp)
+{
+ tx_state_t *tx = &dp->dp_tx;
+
+ dprintf("pool %p\n", dp);
+ /*
+ * Finish off any work in progress.
+ */
+ ASSERT3U(tx->tx_threads, ==, 2);
+
+ /*
+ * We need to ensure that we've vacated the deferred space_maps.
+ */
+ txg_wait_synced(dp, tx->tx_open_txg + TXG_DEFER_SIZE);
+
+ /*
+ * Wake all sync threads and wait for them to die.
+ */
+ mutex_enter(&tx->tx_sync_lock);
+
+ ASSERT3U(tx->tx_threads, ==, 2);
+
+ tx->tx_exiting = 1;
+
+ cv_broadcast(&tx->tx_quiesce_more_cv);
+ cv_broadcast(&tx->tx_quiesce_done_cv);
+ cv_broadcast(&tx->tx_sync_more_cv);
+
+ while (tx->tx_threads != 0)
+ cv_wait(&tx->tx_exit_cv, &tx->tx_sync_lock);
+
+ tx->tx_exiting = 0;
+
+ mutex_exit(&tx->tx_sync_lock);
+}
+
+uint64_t
+txg_hold_open(dsl_pool_t *dp, txg_handle_t *th)
+{
+ tx_state_t *tx = &dp->dp_tx;
+ tx_cpu_t *tc = &tx->tx_cpu[CPU_SEQID];
+ uint64_t txg;
+
+ mutex_enter(&tc->tc_open_lock);
+ txg = tx->tx_open_txg;
+
+ mutex_enter(&tc->tc_lock);
+ tc->tc_count[txg & TXG_MASK]++;
+ mutex_exit(&tc->tc_lock);
+
+ th->th_cpu = tc;
+ th->th_txg = txg;
+
+ return (txg);
+}
+
+void
+txg_rele_to_quiesce(txg_handle_t *th)
+{
+ tx_cpu_t *tc = th->th_cpu;
+
+ ASSERT(!MUTEX_HELD(&tc->tc_lock));
+ mutex_exit(&tc->tc_open_lock);
+}
+
+void
+txg_register_callbacks(txg_handle_t *th, list_t *tx_callbacks)
+{
+ tx_cpu_t *tc = th->th_cpu;
+ int g = th->th_txg & TXG_MASK;
+
+ mutex_enter(&tc->tc_lock);
+ list_move_tail(&tc->tc_callbacks[g], tx_callbacks);
+ mutex_exit(&tc->tc_lock);
+}
+
+void
+txg_rele_to_sync(txg_handle_t *th)
+{
+ tx_cpu_t *tc = th->th_cpu;
+ int g = th->th_txg & TXG_MASK;
+
+ mutex_enter(&tc->tc_lock);
+ ASSERT(tc->tc_count[g] != 0);
+ if (--tc->tc_count[g] == 0)
+ cv_broadcast(&tc->tc_cv[g]);
+ mutex_exit(&tc->tc_lock);
+
+ th->th_cpu = NULL; /* defensive */
+}
+
+/*
+ * Blocks until all transactions in the group are committed.
+ *
+ * On return, the transaction group has reached a stable state in which it can
+ * then be passed off to the syncing context.
+ */
+static __noinline void
+txg_quiesce(dsl_pool_t *dp, uint64_t txg)
+{
+ tx_state_t *tx = &dp->dp_tx;
+ int g = txg & TXG_MASK;
+ int c;
+
+ /*
+ * Grab all tc_open_locks so nobody else can get into this txg.
+ */
+ for (c = 0; c < max_ncpus; c++)
+ mutex_enter(&tx->tx_cpu[c].tc_open_lock);
+
+ ASSERT(txg == tx->tx_open_txg);
+ tx->tx_open_txg++;
+ tx->tx_open_time = gethrtime();
+
+ DTRACE_PROBE2(txg__quiescing, dsl_pool_t *, dp, uint64_t, txg);
+ DTRACE_PROBE2(txg__opened, dsl_pool_t *, dp, uint64_t, tx->tx_open_txg);
+
+ /*
+ * Now that we've incremented tx_open_txg, we can let threads
+ * enter the next transaction group.
+ */
+ for (c = 0; c < max_ncpus; c++)
+ mutex_exit(&tx->tx_cpu[c].tc_open_lock);
+
+ /*
+ * Quiesce the transaction group by waiting for everyone to txg_exit().
+ */
+ for (c = 0; c < max_ncpus; c++) {
+ tx_cpu_t *tc = &tx->tx_cpu[c];
+ mutex_enter(&tc->tc_lock);
+ while (tc->tc_count[g] != 0)
+ cv_wait(&tc->tc_cv[g], &tc->tc_lock);
+ mutex_exit(&tc->tc_lock);
+ }
+}
+
+static void
+txg_do_callbacks(void *arg)
+{
+ list_t *cb_list = arg;
+
+ dmu_tx_do_callbacks(cb_list, 0);
+
+ list_destroy(cb_list);
+
+ kmem_free(cb_list, sizeof (list_t));
+}
+
+/*
+ * Dispatch the commit callbacks registered on this txg to worker threads.
+ *
+ * If no callbacks are registered for a given TXG, nothing happens.
+ * This function creates a taskq for the associated pool, if needed.
+ */
+static void
+txg_dispatch_callbacks(dsl_pool_t *dp, uint64_t txg)
+{
+ int c;
+ tx_state_t *tx = &dp->dp_tx;
+ list_t *cb_list;
+
+ for (c = 0; c < max_ncpus; c++) {
+ tx_cpu_t *tc = &tx->tx_cpu[c];
+ /*
+ * No need to lock tx_cpu_t at this point, since this can
+ * only be called once a txg has been synced.
+ */
+
+ int g = txg & TXG_MASK;
+
+ if (list_is_empty(&tc->tc_callbacks[g]))
+ continue;
+
+ if (tx->tx_commit_cb_taskq == NULL) {
+ /*
+ * Commit callback taskq hasn't been created yet.
+ */
+ tx->tx_commit_cb_taskq = taskq_create("tx_commit_cb",
+ max_ncpus, minclsyspri, max_ncpus, max_ncpus * 2,
+ TASKQ_PREPOPULATE);
+ }
+
+ cb_list = kmem_alloc(sizeof (list_t), KM_SLEEP);
+ list_create(cb_list, sizeof (dmu_tx_callback_t),
+ offsetof(dmu_tx_callback_t, dcb_node));
+
+ list_move_tail(cb_list, &tc->tc_callbacks[g]);
+
+ (void) taskq_dispatch(tx->tx_commit_cb_taskq, (task_func_t *)
+ txg_do_callbacks, cb_list, TQ_SLEEP);
+ }
+}
+
+static boolean_t
+txg_is_syncing(dsl_pool_t *dp)
+{
+ tx_state_t *tx = &dp->dp_tx;
+ ASSERT(MUTEX_HELD(&tx->tx_sync_lock));
+ return (tx->tx_syncing_txg != 0);
+}
+
+static boolean_t
+txg_is_quiescing(dsl_pool_t *dp)
+{
+ tx_state_t *tx = &dp->dp_tx;
+ ASSERT(MUTEX_HELD(&tx->tx_sync_lock));
+ return (tx->tx_quiescing_txg != 0);
+}
+
+static boolean_t
+txg_has_quiesced_to_sync(dsl_pool_t *dp)
+{
+ tx_state_t *tx = &dp->dp_tx;
+ ASSERT(MUTEX_HELD(&tx->tx_sync_lock));
+ return (tx->tx_quiesced_txg != 0);
+}
+
+static void
+txg_sync_thread(void *arg)
+{
+ dsl_pool_t *dp = arg;
+ spa_t *spa = dp->dp_spa;
+ tx_state_t *tx = &dp->dp_tx;
+ callb_cpr_t cpr;
+ uint64_t start, delta;
+
+ txg_thread_enter(tx, &cpr);
+
+ start = delta = 0;
+ for (;;) {
+ uint64_t timeout = zfs_txg_timeout * hz;
+ uint64_t timer;
+ uint64_t txg;
+ uint64_t dirty_min_bytes =
+ zfs_dirty_data_max * zfs_dirty_data_sync_pct / 100;
+
+ /*
+ * We sync when we're scanning, when there's someone waiting
+ * on us, when the quiesce thread has handed off a txg to us,
+ * or when we have reached our timeout.
+ */
+ timer = (delta >= timeout ? 0 : timeout - delta);
+ while (!dsl_scan_active(dp->dp_scan) &&
+ !tx->tx_exiting && timer > 0 &&
+ tx->tx_synced_txg >= tx->tx_sync_txg_waiting &&
+ !txg_has_quiesced_to_sync(dp) &&
+ dp->dp_dirty_total < dirty_min_bytes) {
+ dprintf("waiting; tx_synced=%llu waiting=%llu dp=%p\n",
+ tx->tx_synced_txg, tx->tx_sync_txg_waiting, dp);
+ txg_thread_wait(tx, &cpr, &tx->tx_sync_more_cv, timer);
+ delta = ddi_get_lbolt() - start;
+ timer = (delta > timeout ? 0 : timeout - delta);
+ }
+
+ /*
+ * Wait until the quiesce thread hands off a txg to us,
+ * prompting it to do so if necessary.
+ */
+ while (!tx->tx_exiting && !txg_has_quiesced_to_sync(dp)) {
+ if (tx->tx_quiesce_txg_waiting < tx->tx_open_txg+1)
+ tx->tx_quiesce_txg_waiting = tx->tx_open_txg+1;
+ cv_broadcast(&tx->tx_quiesce_more_cv);
+ txg_thread_wait(tx, &cpr, &tx->tx_quiesce_done_cv, 0);
+ }
+
+ if (tx->tx_exiting)
+ txg_thread_exit(tx, &cpr, &tx->tx_sync_thread);
+
+ /*
+ * Consume the quiesced txg which has been handed off to
+ * us. This may cause the quiescing thread to now be
+ * able to quiesce another txg, so we must signal it.
+ */
+ ASSERT(tx->tx_quiesced_txg != 0);
+ txg = tx->tx_quiesced_txg;
+ tx->tx_quiesced_txg = 0;
+ tx->tx_syncing_txg = txg;
+ DTRACE_PROBE2(txg__syncing, dsl_pool_t *, dp, uint64_t, txg);
+ cv_broadcast(&tx->tx_quiesce_more_cv);
+
+ dprintf("txg=%llu quiesce_txg=%llu sync_txg=%llu\n",
+ txg, tx->tx_quiesce_txg_waiting, tx->tx_sync_txg_waiting);
+ mutex_exit(&tx->tx_sync_lock);
+
+ start = ddi_get_lbolt();
+ spa_sync(spa, txg);
+ delta = ddi_get_lbolt() - start;
+
+ mutex_enter(&tx->tx_sync_lock);
+ tx->tx_synced_txg = txg;
+ tx->tx_syncing_txg = 0;
+ DTRACE_PROBE2(txg__synced, dsl_pool_t *, dp, uint64_t, txg);
+ cv_broadcast(&tx->tx_sync_done_cv);
+
+ /*
+ * Dispatch commit callbacks to worker threads.
+ */
+ txg_dispatch_callbacks(dp, txg);
+ }
+}
+
+static void
+txg_quiesce_thread(void *arg)
+{
+ dsl_pool_t *dp = arg;
+ tx_state_t *tx = &dp->dp_tx;
+ callb_cpr_t cpr;
+
+ txg_thread_enter(tx, &cpr);
+
+ for (;;) {
+ uint64_t txg;
+
+ /*
+ * We quiesce when there's someone waiting on us.
+ * However, we can only have one txg in "quiescing" or
+ * "quiesced, waiting to sync" state. So we wait until
+ * the "quiesced, waiting to sync" txg has been consumed
+ * by the sync thread.
+ */
+ while (!tx->tx_exiting &&
+ (tx->tx_open_txg >= tx->tx_quiesce_txg_waiting ||
+ txg_has_quiesced_to_sync(dp)))
+ txg_thread_wait(tx, &cpr, &tx->tx_quiesce_more_cv, 0);
+
+ if (tx->tx_exiting)
+ txg_thread_exit(tx, &cpr, &tx->tx_quiesce_thread);
+
+ txg = tx->tx_open_txg;
+ dprintf("txg=%llu quiesce_txg=%llu sync_txg=%llu\n",
+ txg, tx->tx_quiesce_txg_waiting,
+ tx->tx_sync_txg_waiting);
+ tx->tx_quiescing_txg = txg;
+
+ mutex_exit(&tx->tx_sync_lock);
+ txg_quiesce(dp, txg);
+ mutex_enter(&tx->tx_sync_lock);
+
+ /*
+ * Hand this txg off to the sync thread.
+ */
+ dprintf("quiesce done, handing off txg %llu\n", txg);
+ tx->tx_quiescing_txg = 0;
+ tx->tx_quiesced_txg = txg;
+ DTRACE_PROBE2(txg__quiesced, dsl_pool_t *, dp, uint64_t, txg);
+ cv_broadcast(&tx->tx_sync_more_cv);
+ cv_broadcast(&tx->tx_quiesce_done_cv);
+ }
+}
+
+/*
+ * Delay this thread by 'delay' nanoseconds if we are still in the open
+ * transaction group and there is already a waiting txg quiescing or quiesced.
+ * Abort the delay if this txg stalls or enters the quiescing state.
+ */
+void
+txg_delay(dsl_pool_t *dp, uint64_t txg, hrtime_t delay, hrtime_t resolution)
+{
+ tx_state_t *tx = &dp->dp_tx;
+ hrtime_t start = gethrtime();
+
+ /* don't delay if this txg could transition to quiescing immediately */
+ if (tx->tx_open_txg > txg ||
+ tx->tx_syncing_txg == txg-1 || tx->tx_synced_txg == txg-1)
+ return;
+
+ mutex_enter(&tx->tx_sync_lock);
+ if (tx->tx_open_txg > txg || tx->tx_synced_txg == txg-1) {
+ mutex_exit(&tx->tx_sync_lock);
+ return;
+ }
+
+ while (gethrtime() - start < delay &&
+ tx->tx_syncing_txg < txg-1 && !txg_stalled(dp)) {
+ (void) cv_timedwait_hires(&tx->tx_quiesce_more_cv,
+ &tx->tx_sync_lock, delay, resolution, 0);
+ }
+
+ mutex_exit(&tx->tx_sync_lock);
+}
+
+static boolean_t
+txg_wait_synced_impl(dsl_pool_t *dp, uint64_t txg, boolean_t wait_sig)
+{
+ tx_state_t *tx = &dp->dp_tx;
+
+ ASSERT(!dsl_pool_config_held(dp));
+
+ mutex_enter(&tx->tx_sync_lock);
+ ASSERT3U(tx->tx_threads, ==, 2);
+ if (txg == 0)
+ txg = tx->tx_open_txg + TXG_DEFER_SIZE;
+ if (tx->tx_sync_txg_waiting < txg)
+ tx->tx_sync_txg_waiting = txg;
+ dprintf("txg=%llu quiesce_txg=%llu sync_txg=%llu\n",
+ txg, tx->tx_quiesce_txg_waiting, tx->tx_sync_txg_waiting);
+ while (tx->tx_synced_txg < txg) {
+ dprintf("broadcasting sync more "
+ "tx_synced=%llu waiting=%llu dp=%p\n",
+ tx->tx_synced_txg, tx->tx_sync_txg_waiting, dp);
+ cv_broadcast(&tx->tx_sync_more_cv);
+ if (wait_sig) {
+ /*
+ * Condition wait here but stop if the thread receives a
+ * signal. The caller may call txg_wait_synced*() again
+ * to resume waiting for this txg.
+ */
+#ifdef __FreeBSD__
+ /*
+ * FreeBSD returns EINTR or ERESTART if there is
+ * a pending signal and zero if the condition variable
+ * is signaled. illumos returns zero in the former case
+ * and >0 in the latter.
+ */
+ if (cv_wait_sig(&tx->tx_sync_done_cv,
+ &tx->tx_sync_lock) != 0) {
+#else
+ if (cv_wait_sig(&tx->tx_sync_done_cv,
+ &tx->tx_sync_lock) == 0) {
+#endif
+
+ mutex_exit(&tx->tx_sync_lock);
+ return (B_TRUE);
+ }
+ } else {
+ cv_wait(&tx->tx_sync_done_cv, &tx->tx_sync_lock);
+ }
+ }
+ mutex_exit(&tx->tx_sync_lock);
+ return (B_FALSE);
+}
+
+void
+txg_wait_synced(dsl_pool_t *dp, uint64_t txg)
+{
+ VERIFY0(txg_wait_synced_impl(dp, txg, B_FALSE));
+}
+
+/*
+ * Similar to txg_wait_synced(), but it can be interrupted by a signal.
+ * Returns B_TRUE if the thread was signaled while waiting.
+ */
+boolean_t
+txg_wait_synced_sig(dsl_pool_t *dp, uint64_t txg)
+{
+ return (txg_wait_synced_impl(dp, txg, B_TRUE));
+}
+
+void
+txg_wait_open(dsl_pool_t *dp, uint64_t txg)
+{
+ tx_state_t *tx = &dp->dp_tx;
+
+ ASSERT(!dsl_pool_config_held(dp));
+
+ mutex_enter(&tx->tx_sync_lock);
+ ASSERT3U(tx->tx_threads, ==, 2);
+ if (txg == 0)
+ txg = tx->tx_open_txg + 1;
+ if (tx->tx_quiesce_txg_waiting < txg)
+ tx->tx_quiesce_txg_waiting = txg;
+ dprintf("txg=%llu quiesce_txg=%llu sync_txg=%llu\n",
+ txg, tx->tx_quiesce_txg_waiting, tx->tx_sync_txg_waiting);
+ while (tx->tx_open_txg < txg) {
+ cv_broadcast(&tx->tx_quiesce_more_cv);
+ cv_wait(&tx->tx_quiesce_done_cv, &tx->tx_sync_lock);
+ }
+ mutex_exit(&tx->tx_sync_lock);
+}
+
+/*
+ * If there isn't a txg syncing or in the pipeline, push another txg through
+ * the pipeline by quiescing the open txg.
+ */
+void
+txg_kick(dsl_pool_t *dp)
+{
+ tx_state_t *tx = &dp->dp_tx;
+
+ ASSERT(!dsl_pool_config_held(dp));
+
+ mutex_enter(&tx->tx_sync_lock);
+ if (!txg_is_syncing(dp) &&
+ !txg_is_quiescing(dp) &&
+ tx->tx_quiesce_txg_waiting <= tx->tx_open_txg &&
+ tx->tx_sync_txg_waiting <= tx->tx_synced_txg &&
+ tx->tx_quiesced_txg <= tx->tx_synced_txg) {
+ tx->tx_quiesce_txg_waiting = tx->tx_open_txg + 1;
+ cv_broadcast(&tx->tx_quiesce_more_cv);
+ }
+ mutex_exit(&tx->tx_sync_lock);
+}
+
+boolean_t
+txg_stalled(dsl_pool_t *dp)
+{
+ tx_state_t *tx = &dp->dp_tx;
+ return (tx->tx_quiesce_txg_waiting > tx->tx_open_txg);
+}
+
+boolean_t
+txg_sync_waiting(dsl_pool_t *dp)
+{
+ tx_state_t *tx = &dp->dp_tx;
+
+ return (tx->tx_syncing_txg <= tx->tx_sync_txg_waiting ||
+ tx->tx_quiesced_txg != 0);
+}
+
+/*
+ * Verify that this txg is active (open, quiescing, syncing). Non-active
+ * txgs should not be manipulated.
+ */
+void
+txg_verify(spa_t *spa, uint64_t txg)
+{
+ dsl_pool_t *dp = spa_get_dsl(spa);
+ if (txg <= TXG_INITIAL || txg == ZILTEST_TXG)
+ return;
+ ASSERT3U(txg, <=, dp->dp_tx.tx_open_txg);
+ ASSERT3U(txg, >=, dp->dp_tx.tx_synced_txg);
+ ASSERT3U(txg, >=, dp->dp_tx.tx_open_txg - TXG_CONCURRENT_STATES);
+}
+
+/*
+ * Per-txg object lists.
+ */
+void
+txg_list_create(txg_list_t *tl, spa_t *spa, size_t offset)
+{
+ int t;
+
+ mutex_init(&tl->tl_lock, NULL, MUTEX_DEFAULT, NULL);
+
+ tl->tl_offset = offset;
+ tl->tl_spa = spa;
+
+ for (t = 0; t < TXG_SIZE; t++)
+ tl->tl_head[t] = NULL;
+}
+
+void
+txg_list_destroy(txg_list_t *tl)
+{
+ int t;
+
+ for (t = 0; t < TXG_SIZE; t++)
+ ASSERT(txg_list_empty(tl, t));
+
+ mutex_destroy(&tl->tl_lock);
+}
+
+boolean_t
+txg_list_empty(txg_list_t *tl, uint64_t txg)
+{
+ txg_verify(tl->tl_spa, txg);
+ return (tl->tl_head[txg & TXG_MASK] == NULL);
+}
+
+/*
+ * Returns true if all txg lists are empty.
+ *
+ * Warning: this is inherently racy (an item could be added immediately
+ * after this function returns). We don't bother with the lock because
+ * it wouldn't change the semantics.
+ */
+boolean_t
+txg_all_lists_empty(txg_list_t *tl)
+{
+ for (int i = 0; i < TXG_SIZE; i++) {
+ if (!txg_list_empty(tl, i)) {
+ return (B_FALSE);
+ }
+ }
+ return (B_TRUE);
+}
+
+/*
+ * Add an entry to the list (unless it's already on the list).
+ * Returns B_TRUE if it was actually added.
+ */
+boolean_t
+txg_list_add(txg_list_t *tl, void *p, uint64_t txg)
+{
+ int t = txg & TXG_MASK;
+ txg_node_t *tn = (txg_node_t *)((char *)p + tl->tl_offset);
+ boolean_t add;
+
+ txg_verify(tl->tl_spa, txg);
+ mutex_enter(&tl->tl_lock);
+ add = (tn->tn_member[t] == 0);
+ if (add) {
+ tn->tn_member[t] = 1;
+ tn->tn_next[t] = tl->tl_head[t];
+ tl->tl_head[t] = tn;
+ }
+ mutex_exit(&tl->tl_lock);
+
+ return (add);
+}
+
+/*
+ * Add an entry to the end of the list, unless it's already on the list.
+ * (walks list to find end)
+ * Returns B_TRUE if it was actually added.
+ */
+boolean_t
+txg_list_add_tail(txg_list_t *tl, void *p, uint64_t txg)
+{
+ int t = txg & TXG_MASK;
+ txg_node_t *tn = (txg_node_t *)((char *)p + tl->tl_offset);
+ boolean_t add;
+
+ txg_verify(tl->tl_spa, txg);
+ mutex_enter(&tl->tl_lock);
+ add = (tn->tn_member[t] == 0);
+ if (add) {
+ txg_node_t **tp;
+
+ for (tp = &tl->tl_head[t]; *tp != NULL; tp = &(*tp)->tn_next[t])
+ continue;
+
+ tn->tn_member[t] = 1;
+ tn->tn_next[t] = NULL;
+ *tp = tn;
+ }
+ mutex_exit(&tl->tl_lock);
+
+ return (add);
+}
+
+/*
+ * Remove the head of the list and return it.
+ */
+void *
+txg_list_remove(txg_list_t *tl, uint64_t txg)
+{
+ int t = txg & TXG_MASK;
+ txg_node_t *tn;
+ void *p = NULL;
+
+ txg_verify(tl->tl_spa, txg);
+ mutex_enter(&tl->tl_lock);
+ if ((tn = tl->tl_head[t]) != NULL) {
+ ASSERT(tn->tn_member[t]);
+ ASSERT(tn->tn_next[t] == NULL || tn->tn_next[t]->tn_member[t]);
+ p = (char *)tn - tl->tl_offset;
+ tl->tl_head[t] = tn->tn_next[t];
+ tn->tn_next[t] = NULL;
+ tn->tn_member[t] = 0;
+ }
+ mutex_exit(&tl->tl_lock);
+
+ return (p);
+}
+
+/*
+ * Remove a specific item from the list and return it.
+ */
+void *
+txg_list_remove_this(txg_list_t *tl, void *p, uint64_t txg)
+{
+ int t = txg & TXG_MASK;
+ txg_node_t *tn, **tp;
+
+ txg_verify(tl->tl_spa, txg);
+ mutex_enter(&tl->tl_lock);
+
+ for (tp = &tl->tl_head[t]; (tn = *tp) != NULL; tp = &tn->tn_next[t]) {
+ if ((char *)tn - tl->tl_offset == p) {
+ *tp = tn->tn_next[t];
+ tn->tn_next[t] = NULL;
+ tn->tn_member[t] = 0;
+ mutex_exit(&tl->tl_lock);
+ return (p);
+ }
+ }
+
+ mutex_exit(&tl->tl_lock);
+
+ return (NULL);
+}
+
+boolean_t
+txg_list_member(txg_list_t *tl, void *p, uint64_t txg)
+{
+ int t = txg & TXG_MASK;
+ txg_node_t *tn = (txg_node_t *)((char *)p + tl->tl_offset);
+
+ txg_verify(tl->tl_spa, txg);
+ return (tn->tn_member[t] != 0);
+}
+
+/*
+ * Walk a txg list -- only safe if you know it's not changing.
+ */
+void *
+txg_list_head(txg_list_t *tl, uint64_t txg)
+{
+ int t = txg & TXG_MASK;
+ txg_node_t *tn = tl->tl_head[t];
+
+ txg_verify(tl->tl_spa, txg);
+ return (tn == NULL ? NULL : (char *)tn - tl->tl_offset);
+}
+
+void *
+txg_list_next(txg_list_t *tl, void *p, uint64_t txg)
+{
+ int t = txg & TXG_MASK;
+ txg_node_t *tn = (txg_node_t *)((char *)p + tl->tl_offset);
+
+ txg_verify(tl->tl_spa, txg);
+ tn = tn->tn_next[t];
+
+ return (tn == NULL ? NULL : (char *)tn - tl->tl_offset);
+}
diff --git a/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/uberblock.c b/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/uberblock.c
new file mode 100644
index 000000000000..b8857d74d810
--- /dev/null
+++ b/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/uberblock.c
@@ -0,0 +1,74 @@
+/*
+ * CDDL HEADER START
+ *
+ * The contents of this file are subject to the terms of the
+ * Common Development and Distribution License (the "License").
+ * You may not use this file except in compliance with the License.
+ *
+ * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
+ * or http://www.opensolaris.org/os/licensing.
+ * See the License for the specific language governing permissions
+ * and limitations under the License.
+ *
+ * When distributing Covered Code, include this CDDL HEADER in each
+ * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
+ * If applicable, add the following below this CDDL HEADER, with the
+ * fields enclosed by brackets "[]" replaced with your own identifying
+ * information: Portions Copyright [yyyy] [name of copyright owner]
+ *
+ * CDDL HEADER END
+ */
+/*
+ * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved.
+ * Copyright (c) 2013, 2017 by Delphix. All rights reserved.
+ */
+
+#include <sys/zfs_context.h>
+#include <sys/uberblock_impl.h>
+#include <sys/vdev_impl.h>
+#include <sys/mmp.h>
+
+int
+uberblock_verify(uberblock_t *ub)
+{
+ if (ub->ub_magic == BSWAP_64((uint64_t)UBERBLOCK_MAGIC))
+ byteswap_uint64_array(ub, sizeof (uberblock_t));
+
+ if (ub->ub_magic != UBERBLOCK_MAGIC)
+ return (SET_ERROR(EINVAL));
+
+ return (0);
+}
+
+/*
+ * Update the uberblock and return TRUE if anything changed in this
+ * transaction group.
+ */
+boolean_t
+uberblock_update(uberblock_t *ub, vdev_t *rvd, uint64_t txg, uint64_t mmp_delay)
+{
+ ASSERT(ub->ub_txg < txg);
+
+ /*
+ * We explicitly do not set ub_version here, so that older versions
+ * continue to be written with the previous uberblock version.
+ */
+ ub->ub_magic = UBERBLOCK_MAGIC;
+ ub->ub_txg = txg;
+ ub->ub_guid_sum = rvd->vdev_guid_sum;
+ ub->ub_timestamp = gethrestime_sec();
+ ub->ub_software_version = SPA_VERSION;
+ ub->ub_mmp_magic = MMP_MAGIC;
+ if (spa_multihost(rvd->vdev_spa)) {
+ ub->ub_mmp_delay = mmp_delay;
+ ub->ub_mmp_config = MMP_SEQ_SET(0) |
+ MMP_INTERVAL_SET(zfs_multihost_interval) |
+ MMP_FAIL_INT_SET(zfs_multihost_fail_intervals);
+ } else {
+ ub->ub_mmp_delay = 0;
+ ub->ub_mmp_config = 0;
+ }
+ ub->ub_checkpoint_txg = 0;
+
+ return (ub->ub_rootbp.blk_birth == txg);
+}
diff --git a/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/unique.c b/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/unique.c
new file mode 100644
index 000000000000..d33f451938b8
--- /dev/null
+++ b/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/unique.c
@@ -0,0 +1,112 @@
+/*
+ * CDDL HEADER START
+ *
+ * The contents of this file are subject to the terms of the
+ * Common Development and Distribution License (the "License").
+ * You may not use this file except in compliance with the License.
+ *
+ * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
+ * or http://www.opensolaris.org/os/licensing.
+ * See the License for the specific language governing permissions
+ * and limitations under the License.
+ *
+ * When distributing Covered Code, include this CDDL HEADER in each
+ * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
+ * If applicable, add the following below this CDDL HEADER, with the
+ * fields enclosed by brackets "[]" replaced with your own identifying
+ * information: Portions Copyright [yyyy] [name of copyright owner]
+ *
+ * CDDL HEADER END
+ */
+/*
+ * Copyright 2007 Sun Microsystems, Inc. All rights reserved.
+ * Use is subject to license terms.
+ */
+
+#pragma ident "%Z%%M% %I% %E% SMI"
+
+#include <sys/zfs_context.h>
+#include <sys/avl.h>
+#include <sys/unique.h>
+
+static avl_tree_t unique_avl;
+static kmutex_t unique_mtx;
+
+typedef struct unique {
+ avl_node_t un_link;
+ uint64_t un_value;
+} unique_t;
+
+#define UNIQUE_MASK ((1ULL << UNIQUE_BITS) - 1)
+
+static int
+unique_compare(const void *a, const void *b)
+{
+ const unique_t *una = (const unique_t *)a;
+ const unique_t *unb = (const unique_t *)b;
+
+ return (AVL_CMP(una->un_value, unb->un_value));
+}
+
+void
+unique_init(void)
+{
+ avl_create(&unique_avl, unique_compare,
+ sizeof (unique_t), offsetof(unique_t, un_link));
+ mutex_init(&unique_mtx, NULL, MUTEX_DEFAULT, NULL);
+}
+
+void
+unique_fini(void)
+{
+ avl_destroy(&unique_avl);
+ mutex_destroy(&unique_mtx);
+}
+
+uint64_t
+unique_create(void)
+{
+ uint64_t value = unique_insert(0);
+ unique_remove(value);
+ return (value);
+}
+
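+/*
+ * Insert 'value' into the tree and return it.  If it is zero, out of
+ * range, or already present, insert (and return) a freshly generated
+ * unique value instead.
+ */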
+uint64_t
+unique_insert(uint64_t value)
+{
+ avl_index_t idx;
+ unique_t *un = kmem_alloc(sizeof (unique_t), KM_SLEEP);
+
+ un->un_value = value;
+
+ mutex_enter(&unique_mtx);
+ while (un->un_value == 0 || un->un_value & ~UNIQUE_MASK ||
+ avl_find(&unique_avl, un, &idx)) {
+ mutex_exit(&unique_mtx);
+ (void) random_get_pseudo_bytes((void*)&un->un_value,
+ sizeof (un->un_value));
+ un->un_value &= UNIQUE_MASK;
+ mutex_enter(&unique_mtx);
+ }
+
+ avl_insert(&unique_avl, un, idx);
+ mutex_exit(&unique_mtx);
+
+ return (un->un_value);
+}
+
+void
+unique_remove(uint64_t value)
+{
+ unique_t un_tofind;
+ unique_t *un;
+
+ un_tofind.un_value = value;
+ mutex_enter(&unique_mtx);
+ un = avl_find(&unique_avl, &un_tofind, NULL);
+ if (un != NULL) {
+ avl_remove(&unique_avl, un);
+ kmem_free(un, sizeof (unique_t));
+ }
+ mutex_exit(&unique_mtx);
+}
diff --git a/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/vdev.c b/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/vdev.c
new file mode 100644
index 000000000000..6043adee0241
--- /dev/null
+++ b/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/vdev.c
@@ -0,0 +1,4520 @@
+/*
+ * CDDL HEADER START
+ *
+ * The contents of this file are subject to the terms of the
+ * Common Development and Distribution License (the "License").
+ * You may not use this file except in compliance with the License.
+ *
+ * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
+ * or http://www.opensolaris.org/os/licensing.
+ * See the License for the specific language governing permissions
+ * and limitations under the License.
+ *
+ * When distributing Covered Code, include this CDDL HEADER in each
+ * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
+ * If applicable, add the following below this CDDL HEADER, with the
+ * fields enclosed by brackets "[]" replaced with your own identifying
+ * information: Portions Copyright [yyyy] [name of copyright owner]
+ *
+ * CDDL HEADER END
+ */
+
+/*
+ * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved.
+ * Copyright (c) 2011, 2018 by Delphix. All rights reserved.
+ * Copyright 2017 Nexenta Systems, Inc.
+ * Copyright 2013 Martin Matuska <mm@FreeBSD.org>. All rights reserved.
+ * Copyright (c) 2014 Integros [integros.com]
+ * Copyright 2016 Toomas Soome <tsoome@me.com>
+ * Copyright 2019 Joyent, Inc.
+ * Copyright (c) 2017, Intel Corporation.
+ */
+
+#include <sys/zfs_context.h>
+#include <sys/fm/fs/zfs.h>
+#include <sys/spa.h>
+#include <sys/spa_impl.h>
+#include <sys/bpobj.h>
+#include <sys/dmu.h>
+#include <sys/dmu_tx.h>
+#include <sys/dsl_dir.h>
+#include <sys/vdev_impl.h>
+#include <sys/uberblock_impl.h>
+#include <sys/metaslab.h>
+#include <sys/metaslab_impl.h>
+#include <sys/space_map.h>
+#include <sys/space_reftree.h>
+#include <sys/zio.h>
+#include <sys/zap.h>
+#include <sys/fs/zfs.h>
+#include <sys/arc.h>
+#include <sys/zil.h>
+#include <sys/dsl_scan.h>
+#include <sys/abd.h>
+#include <sys/trim_map.h>
+#include <sys/vdev_initialize.h>
+
+SYSCTL_DECL(_vfs_zfs);
+SYSCTL_NODE(_vfs_zfs, OID_AUTO, vdev, CTLFLAG_RW | CTLFLAG_MPSAFE, 0,
+ "ZFS VDEV");
+
+/*
+ * Virtual device management.
+ */
+
+/*
+ * The limit for ZFS to automatically increase a top-level vdev's ashift
+ * from logical ashift to physical ashift.
+ *
+ * Example: one or more 512B emulation child vdevs
+ * child->vdev_ashift = 9 (512 bytes)
+ * child->vdev_physical_ashift = 12 (4096 bytes)
+ * zfs_max_auto_ashift = 11 (2048 bytes)
+ * zfs_min_auto_ashift = 9 (512 bytes)
+ *
+ * On pool creation or the addition of a new top-level vdev, ZFS will
+ * increase the ashift of the top-level vdev to 2048 as limited by
+ * zfs_max_auto_ashift.
+ *
+ * Example: one or more 512B emulation child vdevs
+ * child->vdev_ashift = 9 (512 bytes)
+ * child->vdev_physical_ashift = 12 (4096 bytes)
+ * zfs_max_auto_ashift = 13 (8192 bytes)
+ * zfs_min_auto_ashift = 9 (512 bytes)
+ *
+ * On pool creation or the addition of a new top-level vdev, ZFS will
+ * increase the ashift of the top-level vdev to 4096 to match the
+ * max vdev_physical_ashift.
+ *
+ * Example: one or more 512B emulation child vdevs
+ * child->vdev_ashift = 9 (512 bytes)
+ * child->vdev_physical_ashift = 9 (512 bytes)
+ * zfs_max_auto_ashift = 13 (8192 bytes)
+ * zfs_min_auto_ashift = 12 (4096 bytes)
+ *
+ * On pool creation or the addition of a new top-level vdev, ZFS will
+ * increase the ashift of the top-level vdev to 4096 to match the
+ * zfs_min_auto_ashift.
+ */
+static uint64_t zfs_max_auto_ashift = SPA_MAXASHIFT;
+static uint64_t zfs_min_auto_ashift = SPA_MINASHIFT;
+
+static int
+sysctl_vfs_zfs_max_auto_ashift(SYSCTL_HANDLER_ARGS)
+{
+ uint64_t val;
+ int err;
+
+ val = zfs_max_auto_ashift;
+ err = sysctl_handle_64(oidp, &val, 0, req);
+ if (err != 0 || req->newptr == NULL)
+ return (err);
+
+ if (val > SPA_MAXASHIFT || val < zfs_min_auto_ashift)
+ return (EINVAL);
+
+ zfs_max_auto_ashift = val;
+
+ return (0);
+}
+SYSCTL_PROC(_vfs_zfs, OID_AUTO, max_auto_ashift,
+ CTLTYPE_U64 | CTLFLAG_MPSAFE | CTLFLAG_RW, 0, sizeof(uint64_t),
+ sysctl_vfs_zfs_max_auto_ashift, "QU",
+ "Max ashift used when optimising for logical -> physical sectors size on "
+ "new top-level vdevs.");
+
+static int
+sysctl_vfs_zfs_min_auto_ashift(SYSCTL_HANDLER_ARGS)
+{
+ uint64_t val;
+ int err;
+
+ val = zfs_min_auto_ashift;
+ err = sysctl_handle_64(oidp, &val, 0, req);
+ if (err != 0 || req->newptr == NULL)
+ return (err);
+
+ if (val < SPA_MINASHIFT || val > zfs_max_auto_ashift)
+ return (EINVAL);
+
+ zfs_min_auto_ashift = val;
+
+ return (0);
+}
+SYSCTL_PROC(_vfs_zfs, OID_AUTO, min_auto_ashift,
+ CTLTYPE_U64 | CTLFLAG_MPSAFE | CTLFLAG_RW, 0, sizeof(uint64_t),
+ sysctl_vfs_zfs_min_auto_ashift, "QU",
+ "Min ashift used when creating new top-level vdevs.");
+
+static vdev_ops_t *vdev_ops_table[] = {
+ &vdev_root_ops,
+ &vdev_raidz_ops,
+ &vdev_mirror_ops,
+ &vdev_replacing_ops,
+ &vdev_spare_ops,
+#ifdef _KERNEL
+ &vdev_geom_ops,
+#else
+ &vdev_disk_ops,
+#endif
+ &vdev_file_ops,
+ &vdev_missing_ops,
+ &vdev_hole_ops,
+ &vdev_indirect_ops,
+ NULL
+};
+
+
+/* default target for number of metaslabs per top-level vdev */
+int zfs_vdev_default_ms_count = 200;
+SYSCTL_INT(_vfs_zfs_vdev, OID_AUTO, default_ms_count, CTLFLAG_RWTUN,
+ &zfs_vdev_default_ms_count, 0,
+ "Target number of metaslabs per top-level vdev");
+
+/* minimum number of metaslabs per top-level vdev */
+int zfs_vdev_min_ms_count = 16;
+SYSCTL_INT(_vfs_zfs_vdev, OID_AUTO, min_ms_count, CTLFLAG_RWTUN,
+ &zfs_vdev_min_ms_count, 0,
+ "Minimum number of metaslabs per top-level vdev");
+
+/* practical upper limit of total metaslabs per top-level vdev */
+int zfs_vdev_ms_count_limit = 1ULL << 17;
+SYSCTL_INT(_vfs_zfs_vdev, OID_AUTO, max_ms_count_limit, CTLFLAG_RWTUN,
+ &zfs_vdev_ms_count_limit, 0,
+ "Maximum number of metaslabs per top-level vdev");
+
+/* lower limit for metaslab size (512M) */
+int zfs_vdev_default_ms_shift = 29;
+SYSCTL_INT(_vfs_zfs_vdev, OID_AUTO, default_ms_shift, CTLFLAG_RWTUN,
+ &zfs_vdev_default_ms_shift, 0,
+ "Default shift between vdev size and number of metaslabs");
+
+/* upper limit for metaslab size (16G) */
+int zfs_vdev_max_ms_shift = 34;
+SYSCTL_INT(_vfs_zfs_vdev, OID_AUTO, max_ms_shift, CTLFLAG_RWTUN,
+ &zfs_vdev_max_ms_shift, 0,
+ "Maximum shift between vdev size and number of metaslabs");
+
+boolean_t vdev_validate_skip = B_FALSE;
+SYSCTL_INT(_vfs_zfs_vdev, OID_AUTO, validate_skip, CTLFLAG_RWTUN,
+ &vdev_validate_skip, 0,
+ "Bypass vdev validation");
+
+/*
+ * Since the DTL space map of a vdev is not expected to have a lot of
+ * entries, we default its block size to 4K.
+ */
+int vdev_dtl_sm_blksz = (1 << 12);
+SYSCTL_INT(_vfs_zfs, OID_AUTO, dtl_sm_blksz, CTLFLAG_RDTUN,
+ &vdev_dtl_sm_blksz, 0,
+ "Block size for DTL space map. Power of 2 and greater than 4096.");
+
+/*
+ * vdev-wide space maps that have lots of entries written to them at
+ * the end of each transaction can benefit from a higher I/O bandwidth
+ * (e.g. vdev_obsolete_sm), thus we default their block size to 128K.
+ */
+int vdev_standard_sm_blksz = (1 << 17);
+SYSCTL_INT(_vfs_zfs, OID_AUTO, standard_sm_blksz, CTLFLAG_RDTUN,
+ &vdev_standard_sm_blksz, 0,
+ "Block size for standard space map. Power of 2 and greater than 4096.");
+
+/*
+ * Tunable parameter for debugging or performance analysis. Setting this
+ * will cause pool corruption on power loss if a volatile out-of-order
+ * write cache is enabled.
+ */
+boolean_t zfs_nocacheflush = B_FALSE;
+SYSCTL_INT(_vfs_zfs, OID_AUTO, cache_flush_disable, CTLFLAG_RWTUN,
+ &zfs_nocacheflush, 0, "Disable cache flush");
+
+/*PRINTFLIKE2*/
+void
+vdev_dbgmsg(vdev_t *vd, const char *fmt, ...)
+{
+ va_list adx;
+ char buf[256];
+
+ va_start(adx, fmt);
+ (void) vsnprintf(buf, sizeof (buf), fmt, adx);
+ va_end(adx);
+
+ if (vd->vdev_path != NULL) {
+ zfs_dbgmsg("%s vdev '%s': %s", vd->vdev_ops->vdev_op_type,
+ vd->vdev_path, buf);
+ } else {
+ zfs_dbgmsg("%s-%llu vdev (guid %llu): %s",
+ vd->vdev_ops->vdev_op_type,
+ (u_longlong_t)vd->vdev_id,
+ (u_longlong_t)vd->vdev_guid, buf);
+ }
+}
+
+void
+vdev_dbgmsg_print_tree(vdev_t *vd, int indent)
+{
+ char state[20];
+
+ if (vd->vdev_ishole || vd->vdev_ops == &vdev_missing_ops) {
+ zfs_dbgmsg("%*svdev %u: %s", indent, "", vd->vdev_id,
+ vd->vdev_ops->vdev_op_type);
+ return;
+ }
+
+ switch (vd->vdev_state) {
+ case VDEV_STATE_UNKNOWN:
+ (void) snprintf(state, sizeof (state), "unknown");
+ break;
+ case VDEV_STATE_CLOSED:
+ (void) snprintf(state, sizeof (state), "closed");
+ break;
+ case VDEV_STATE_OFFLINE:
+ (void) snprintf(state, sizeof (state), "offline");
+ break;
+ case VDEV_STATE_REMOVED:
+ (void) snprintf(state, sizeof (state), "removed");
+ break;
+ case VDEV_STATE_CANT_OPEN:
+ (void) snprintf(state, sizeof (state), "can't open");
+ break;
+ case VDEV_STATE_FAULTED:
+ (void) snprintf(state, sizeof (state), "faulted");
+ break;
+ case VDEV_STATE_DEGRADED:
+ (void) snprintf(state, sizeof (state), "degraded");
+ break;
+ case VDEV_STATE_HEALTHY:
+ (void) snprintf(state, sizeof (state), "healthy");
+ break;
+ default:
+ (void) snprintf(state, sizeof (state), "<state %u>",
+ (uint_t)vd->vdev_state);
+ }
+
+ zfs_dbgmsg("%*svdev %u: %s%s, guid: %llu, path: %s, %s", indent,
+ "", (int)vd->vdev_id, vd->vdev_ops->vdev_op_type,
+ vd->vdev_islog ? " (log)" : "",
+ (u_longlong_t)vd->vdev_guid,
+ vd->vdev_path ? vd->vdev_path : "N/A", state);
+
+ for (uint64_t i = 0; i < vd->vdev_children; i++)
+ vdev_dbgmsg_print_tree(vd->vdev_child[i], indent + 2);
+}
+
+/*
+ * Given a vdev type, return the appropriate ops vector.
+ */
+static vdev_ops_t *
+vdev_getops(const char *type)
+{
+ vdev_ops_t *ops, **opspp;
+
+ for (opspp = vdev_ops_table; (ops = *opspp) != NULL; opspp++)
+ if (strcmp(ops->vdev_op_type, type) == 0)
+ break;
+
+ return (ops);
+}
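A usage sketch of the lookup, assuming the vdev_op_type strings match the type names stored under ZPOOL_CONFIG_TYPE in the pool config (e.g. "mirror" for vdev_mirror_ops):

    vdev_ops_t *ops;

    ops = vdev_getops("mirror");        /* &vdev_mirror_ops */
    ops = vdev_getops("not-a-type");    /* NULL: caller must check */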
+
+/*
+ * Derive the enumerated allocation bias from string input.
+ * String origin is either the per-vdev zap or zpool(1M).
+ */
+static vdev_alloc_bias_t
+vdev_derive_alloc_bias(const char *bias)
+{
+ vdev_alloc_bias_t alloc_bias = VDEV_BIAS_NONE;
+
+ if (strcmp(bias, VDEV_ALLOC_BIAS_LOG) == 0)
+ alloc_bias = VDEV_BIAS_LOG;
+ else if (strcmp(bias, VDEV_ALLOC_BIAS_SPECIAL) == 0)
+ alloc_bias = VDEV_BIAS_SPECIAL;
+ else if (strcmp(bias, VDEV_ALLOC_BIAS_DEDUP) == 0)
+ alloc_bias = VDEV_BIAS_DEDUP;
+
+ return (alloc_bias);
+}
+
+/* ARGSUSED */
+void
+vdev_default_xlate(vdev_t *vd, const range_seg_t *in, range_seg_t *res)
+{
+ res->rs_start = in->rs_start;
+ res->rs_end = in->rs_end;
+}
+
+/*
+ * Default asize function: return the MAX of psize with the asize of
+ * all children. This is what's used by anything other than RAID-Z.
+ */
+uint64_t
+vdev_default_asize(vdev_t *vd, uint64_t psize)
+{
+ uint64_t asize = P2ROUNDUP(psize, 1ULL << vd->vdev_top->vdev_ashift);
+ uint64_t csize;
+
+ for (int c = 0; c < vd->vdev_children; c++) {
+ csize = vdev_psize_to_asize(vd->vdev_child[c], psize);
+ asize = MAX(asize, csize);
+ }
+
+ return (asize);
+}
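A worked example of the rounding, assuming a top-level ashift of 12 (4 KiB sectors): P2ROUNDUP(5000, 1 << 12) rounds up to the next multiple of 4096,

    uint64_t psize = 5000;
    uint64_t align = 1ULL << 12;                            /* ashift 12 */
    uint64_t asize = (psize + align - 1) & ~(align - 1);    /* == 8192 */

and a mirror then takes the MAX over its children, so the largest child's rounded asize wins.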
+
+/*
+ * Get the minimum allocatable size. We define the allocatable size as
+ * the vdev's asize rounded to the nearest metaslab. This allows us to
+ * replace or attach devices which don't have the same physical size but
+ * can still satisfy the same number of allocations.
+ */
+uint64_t
+vdev_get_min_asize(vdev_t *vd)
+{
+ vdev_t *pvd = vd->vdev_parent;
+
+ /*
+ * If our parent is NULL (inactive spare or cache) or is the root,
+ * just return our own asize.
+ */
+ if (pvd == NULL)
+ return (vd->vdev_asize);
+
+ /*
+ * The top-level vdev just returns the allocatable size rounded
+ * to the nearest metaslab.
+ */
+ if (vd == vd->vdev_top)
+ return (P2ALIGN(vd->vdev_asize, 1ULL << vd->vdev_ms_shift));
+
+ /*
+ * The allocatable space for a raidz vdev is N * sizeof(smallest child),
+ * so each child must provide at least 1/Nth of its asize.
+ */
+ if (pvd->vdev_ops == &vdev_raidz_ops)
+ return ((pvd->vdev_min_asize + pvd->vdev_children - 1) /
+ pvd->vdev_children);
+
+ return (pvd->vdev_min_asize);
+}
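A worked example of the raidz branch: with pvd->vdev_min_asize = 301 GiB and 3 children, the expression computes (301 + 3 - 1) / 3 = 101 GiB per child; the + (children - 1) term turns the integer division into a ceiling, so no child is permitted to fall short of its 1/Nth share.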
+
+void
+vdev_set_min_asize(vdev_t *vd)
+{
+ vd->vdev_min_asize = vdev_get_min_asize(vd);
+
+ for (int c = 0; c < vd->vdev_children; c++)
+ vdev_set_min_asize(vd->vdev_child[c]);
+}
+
+vdev_t *
+vdev_lookup_top(spa_t *spa, uint64_t vdev)
+{
+ vdev_t *rvd = spa->spa_root_vdev;
+
+ ASSERT(spa_config_held(spa, SCL_ALL, RW_READER) != 0);
+
+ if (vdev < rvd->vdev_children) {
+ ASSERT(rvd->vdev_child[vdev] != NULL);
+ return (rvd->vdev_child[vdev]);
+ }
+
+ return (NULL);
+}
+
+vdev_t *
+vdev_lookup_by_guid(vdev_t *vd, uint64_t guid)
+{
+ vdev_t *mvd;
+
+ if (vd->vdev_guid == guid)
+ return (vd);
+
+ for (int c = 0; c < vd->vdev_children; c++)
+ if ((mvd = vdev_lookup_by_guid(vd->vdev_child[c], guid)) !=
+ NULL)
+ return (mvd);
+
+ return (NULL);
+}
+
+static int
+vdev_count_leaves_impl(vdev_t *vd)
+{
+ int n = 0;
+
+ if (vd->vdev_ops->vdev_op_leaf)
+ return (1);
+
+ for (int c = 0; c < vd->vdev_children; c++)
+ n += vdev_count_leaves_impl(vd->vdev_child[c]);
+
+ return (n);
+}
+
+int
+vdev_count_leaves(spa_t *spa)
+{
+ return (vdev_count_leaves_impl(spa->spa_root_vdev));
+}
+
+void
+vdev_add_child(vdev_t *pvd, vdev_t *cvd)
+{
+ size_t oldsize, newsize;
+ uint64_t id = cvd->vdev_id;
+ vdev_t **newchild;
+ spa_t *spa = cvd->vdev_spa;
+
+ ASSERT(spa_config_held(spa, SCL_ALL, RW_WRITER) == SCL_ALL);
+ ASSERT(cvd->vdev_parent == NULL);
+
+ cvd->vdev_parent = pvd;
+
+ if (pvd == NULL)
+ return;
+
+ ASSERT(id >= pvd->vdev_children || pvd->vdev_child[id] == NULL);
+
+ oldsize = pvd->vdev_children * sizeof (vdev_t *);
+ pvd->vdev_children = MAX(pvd->vdev_children, id + 1);
+ newsize = pvd->vdev_children * sizeof (vdev_t *);
+
+ newchild = kmem_zalloc(newsize, KM_SLEEP);
+ if (pvd->vdev_child != NULL) {
+ bcopy(pvd->vdev_child, newchild, oldsize);
+ kmem_free(pvd->vdev_child, oldsize);
+ }
+
+ pvd->vdev_child = newchild;
+ pvd->vdev_child[id] = cvd;
+
+	cvd->vdev_top = (pvd->vdev_top ? pvd->vdev_top : cvd);
+ ASSERT(cvd->vdev_top->vdev_parent->vdev_parent == NULL);
+
+ /*
+ * Walk up all ancestors to update guid sum.
+ */
+ for (; pvd != NULL; pvd = pvd->vdev_parent)
+ pvd->vdev_guid_sum += cvd->vdev_guid_sum;
+
+ if (cvd->vdev_ops->vdev_op_leaf) {
+ list_insert_head(&cvd->vdev_spa->spa_leaf_list, cvd);
+ cvd->vdev_spa->spa_leaf_list_gen++;
+ }
+}
+
+void
+vdev_remove_child(vdev_t *pvd, vdev_t *cvd)
+{
+ int c;
+ uint_t id = cvd->vdev_id;
+
+ ASSERT(cvd->vdev_parent == pvd);
+
+ if (pvd == NULL)
+ return;
+
+ ASSERT(id < pvd->vdev_children);
+ ASSERT(pvd->vdev_child[id] == cvd);
+
+ pvd->vdev_child[id] = NULL;
+ cvd->vdev_parent = NULL;
+
+ for (c = 0; c < pvd->vdev_children; c++)
+ if (pvd->vdev_child[c])
+ break;
+
+ if (c == pvd->vdev_children) {
+ kmem_free(pvd->vdev_child, c * sizeof (vdev_t *));
+ pvd->vdev_child = NULL;
+ pvd->vdev_children = 0;
+ }
+
+ if (cvd->vdev_ops->vdev_op_leaf) {
+ spa_t *spa = cvd->vdev_spa;
+ list_remove(&spa->spa_leaf_list, cvd);
+ spa->spa_leaf_list_gen++;
+ }
+
+ /*
+ * Walk up all ancestors to update guid sum.
+ */
+ for (; pvd != NULL; pvd = pvd->vdev_parent)
+ pvd->vdev_guid_sum -= cvd->vdev_guid_sum;
+}
+
+/*
+ * Remove any holes in the child array.
+ */
+void
+vdev_compact_children(vdev_t *pvd)
+{
+ vdev_t **newchild, *cvd;
+ int oldc = pvd->vdev_children;
+ int newc;
+
+ ASSERT(spa_config_held(pvd->vdev_spa, SCL_ALL, RW_WRITER) == SCL_ALL);
+
+ if (oldc == 0)
+ return;
+
+ for (int c = newc = 0; c < oldc; c++)
+ if (pvd->vdev_child[c])
+ newc++;
+
+ if (newc > 0) {
+ newchild = kmem_alloc(newc * sizeof (vdev_t *), KM_SLEEP);
+
+ for (int c = newc = 0; c < oldc; c++) {
+ if ((cvd = pvd->vdev_child[c]) != NULL) {
+ newchild[newc] = cvd;
+ cvd->vdev_id = newc++;
+ }
+ }
+ } else {
+ newchild = NULL;
+ }
+
+ kmem_free(pvd->vdev_child, oldc * sizeof (vdev_t *));
+ pvd->vdev_child = newchild;
+ pvd->vdev_children = newc;
+}
+
+/*
+ * Allocate and minimally initialize a vdev_t.
+ */
+vdev_t *
+vdev_alloc_common(spa_t *spa, uint_t id, uint64_t guid, vdev_ops_t *ops)
+{
+ vdev_t *vd;
+ vdev_indirect_config_t *vic;
+
+ vd = kmem_zalloc(sizeof (vdev_t), KM_SLEEP);
+ vic = &vd->vdev_indirect_config;
+
+ if (spa->spa_root_vdev == NULL) {
+ ASSERT(ops == &vdev_root_ops);
+ spa->spa_root_vdev = vd;
+ spa->spa_load_guid = spa_generate_guid(NULL);
+ }
+
+ if (guid == 0 && ops != &vdev_hole_ops) {
+ if (spa->spa_root_vdev == vd) {
+ /*
+ * The root vdev's guid will also be the pool guid,
+ * which must be unique among all pools.
+ */
+ guid = spa_generate_guid(NULL);
+ } else {
+ /*
+ * Any other vdev's guid must be unique within the pool.
+ */
+ guid = spa_generate_guid(spa);
+ }
+ ASSERT(!spa_guid_exists(spa_guid(spa), guid));
+ }
+
+ vd->vdev_spa = spa;
+ vd->vdev_id = id;
+ vd->vdev_guid = guid;
+ vd->vdev_guid_sum = guid;
+ vd->vdev_ops = ops;
+ vd->vdev_state = VDEV_STATE_CLOSED;
+ vd->vdev_ishole = (ops == &vdev_hole_ops);
+ vic->vic_prev_indirect_vdev = UINT64_MAX;
+
+ rw_init(&vd->vdev_indirect_rwlock, NULL, RW_DEFAULT, NULL);
+ mutex_init(&vd->vdev_obsolete_lock, NULL, MUTEX_DEFAULT, NULL);
+ vd->vdev_obsolete_segments = range_tree_create(NULL, NULL);
+
+ list_link_init(&vd->vdev_leaf_node);
+ mutex_init(&vd->vdev_dtl_lock, NULL, MUTEX_DEFAULT, NULL);
+ mutex_init(&vd->vdev_stat_lock, NULL, MUTEX_DEFAULT, NULL);
+ mutex_init(&vd->vdev_probe_lock, NULL, MUTEX_DEFAULT, NULL);
+ mutex_init(&vd->vdev_scan_io_queue_lock, NULL, MUTEX_DEFAULT, NULL);
+ mutex_init(&vd->vdev_initialize_lock, NULL, MUTEX_DEFAULT, NULL);
+ mutex_init(&vd->vdev_initialize_io_lock, NULL, MUTEX_DEFAULT, NULL);
+ cv_init(&vd->vdev_initialize_cv, NULL, CV_DEFAULT, NULL);
+ cv_init(&vd->vdev_initialize_io_cv, NULL, CV_DEFAULT, NULL);
+
+ for (int t = 0; t < DTL_TYPES; t++) {
+ vd->vdev_dtl[t] = range_tree_create(NULL, NULL);
+ }
+ txg_list_create(&vd->vdev_ms_list, spa,
+ offsetof(struct metaslab, ms_txg_node));
+ txg_list_create(&vd->vdev_dtl_list, spa,
+ offsetof(struct vdev, vdev_dtl_node));
+ vd->vdev_stat.vs_timestamp = gethrtime();
+ vdev_queue_init(vd);
+ vdev_cache_init(vd);
+
+ return (vd);
+}
+
+/*
+ * Allocate a new vdev. The 'alloctype' is used to control whether we are
+ * creating a new vdev or loading an existing one - the behavior is slightly
+ * different for each case.
+ */
+int
+vdev_alloc(spa_t *spa, vdev_t **vdp, nvlist_t *nv, vdev_t *parent, uint_t id,
+ int alloctype)
+{
+ vdev_ops_t *ops;
+ char *type;
+ uint64_t guid = 0, islog, nparity;
+ vdev_t *vd;
+ vdev_indirect_config_t *vic;
+ vdev_alloc_bias_t alloc_bias = VDEV_BIAS_NONE;
+ boolean_t top_level = (parent && !parent->vdev_parent);
+
+ ASSERT(spa_config_held(spa, SCL_ALL, RW_WRITER) == SCL_ALL);
+
+ if (nvlist_lookup_string(nv, ZPOOL_CONFIG_TYPE, &type) != 0)
+ return (SET_ERROR(EINVAL));
+
+ if ((ops = vdev_getops(type)) == NULL)
+ return (SET_ERROR(EINVAL));
+
+ /*
+ * If this is a load, get the vdev guid from the nvlist.
+ * Otherwise, vdev_alloc_common() will generate one for us.
+ */
+ if (alloctype == VDEV_ALLOC_LOAD) {
+ uint64_t label_id;
+
+ if (nvlist_lookup_uint64(nv, ZPOOL_CONFIG_ID, &label_id) ||
+ label_id != id)
+ return (SET_ERROR(EINVAL));
+
+ if (nvlist_lookup_uint64(nv, ZPOOL_CONFIG_GUID, &guid) != 0)
+ return (SET_ERROR(EINVAL));
+ } else if (alloctype == VDEV_ALLOC_SPARE) {
+ if (nvlist_lookup_uint64(nv, ZPOOL_CONFIG_GUID, &guid) != 0)
+ return (SET_ERROR(EINVAL));
+ } else if (alloctype == VDEV_ALLOC_L2CACHE) {
+ if (nvlist_lookup_uint64(nv, ZPOOL_CONFIG_GUID, &guid) != 0)
+ return (SET_ERROR(EINVAL));
+ } else if (alloctype == VDEV_ALLOC_ROOTPOOL) {
+ if (nvlist_lookup_uint64(nv, ZPOOL_CONFIG_GUID, &guid) != 0)
+ return (SET_ERROR(EINVAL));
+ }
+
+ /*
+ * The first allocated vdev must be of type 'root'.
+ */
+ if (ops != &vdev_root_ops && spa->spa_root_vdev == NULL)
+ return (SET_ERROR(EINVAL));
+
+ /*
+ * Determine whether we're a log vdev.
+ */
+ islog = 0;
+ (void) nvlist_lookup_uint64(nv, ZPOOL_CONFIG_IS_LOG, &islog);
+ if (islog && spa_version(spa) < SPA_VERSION_SLOGS)
+ return (SET_ERROR(ENOTSUP));
+
+ if (ops == &vdev_hole_ops && spa_version(spa) < SPA_VERSION_HOLES)
+ return (SET_ERROR(ENOTSUP));
+
+ /*
+ * Set the nparity property for RAID-Z vdevs.
+ */
+ nparity = -1ULL;
+ if (ops == &vdev_raidz_ops) {
+ if (nvlist_lookup_uint64(nv, ZPOOL_CONFIG_NPARITY,
+ &nparity) == 0) {
+ if (nparity == 0 || nparity > VDEV_RAIDZ_MAXPARITY)
+ return (SET_ERROR(EINVAL));
+ /*
+			 * Previous versions could only support 1 or 2 parity
+			 * devices.
+ */
+ if (nparity > 1 &&
+ spa_version(spa) < SPA_VERSION_RAIDZ2)
+ return (SET_ERROR(ENOTSUP));
+ if (nparity > 2 &&
+ spa_version(spa) < SPA_VERSION_RAIDZ3)
+ return (SET_ERROR(ENOTSUP));
+ } else {
+ /*
+ * We require the parity to be specified for SPAs that
+ * support multiple parity levels.
+ */
+ if (spa_version(spa) >= SPA_VERSION_RAIDZ2)
+ return (SET_ERROR(EINVAL));
+ /*
+ * Otherwise, we default to 1 parity device for RAID-Z.
+ */
+ nparity = 1;
+ }
+ } else {
+ nparity = 0;
+ }
+ ASSERT(nparity != -1ULL);
+
+ /*
+ * If creating a top-level vdev, check for allocation classes input
+ */
+ if (top_level && alloctype == VDEV_ALLOC_ADD) {
+ char *bias;
+
+ if (nvlist_lookup_string(nv, ZPOOL_CONFIG_ALLOCATION_BIAS,
+ &bias) == 0) {
+ alloc_bias = vdev_derive_alloc_bias(bias);
+
+ /* spa_vdev_add() expects feature to be enabled */
+ if (alloc_bias != VDEV_BIAS_LOG &&
+ spa->spa_load_state != SPA_LOAD_CREATE &&
+ !spa_feature_is_enabled(spa,
+ SPA_FEATURE_ALLOCATION_CLASSES)) {
+ return (SET_ERROR(ENOTSUP));
+ }
+ }
+ }
+
+ vd = vdev_alloc_common(spa, id, guid, ops);
+ vic = &vd->vdev_indirect_config;
+
+ vd->vdev_islog = islog;
+ vd->vdev_nparity = nparity;
+ if (top_level && alloc_bias != VDEV_BIAS_NONE)
+ vd->vdev_alloc_bias = alloc_bias;
+
+ if (nvlist_lookup_string(nv, ZPOOL_CONFIG_PATH, &vd->vdev_path) == 0)
+ vd->vdev_path = spa_strdup(vd->vdev_path);
+ if (nvlist_lookup_string(nv, ZPOOL_CONFIG_DEVID, &vd->vdev_devid) == 0)
+ vd->vdev_devid = spa_strdup(vd->vdev_devid);
+ if (nvlist_lookup_string(nv, ZPOOL_CONFIG_PHYS_PATH,
+ &vd->vdev_physpath) == 0)
+ vd->vdev_physpath = spa_strdup(vd->vdev_physpath);
+ if (nvlist_lookup_string(nv, ZPOOL_CONFIG_FRU, &vd->vdev_fru) == 0)
+ vd->vdev_fru = spa_strdup(vd->vdev_fru);
+
+ /*
+ * Set the whole_disk property. If it's not specified, leave the value
+ * as -1.
+ */
+ if (nvlist_lookup_uint64(nv, ZPOOL_CONFIG_WHOLE_DISK,
+ &vd->vdev_wholedisk) != 0)
+ vd->vdev_wholedisk = -1ULL;
+
+ ASSERT0(vic->vic_mapping_object);
+ (void) nvlist_lookup_uint64(nv, ZPOOL_CONFIG_INDIRECT_OBJECT,
+ &vic->vic_mapping_object);
+ ASSERT0(vic->vic_births_object);
+ (void) nvlist_lookup_uint64(nv, ZPOOL_CONFIG_INDIRECT_BIRTHS,
+ &vic->vic_births_object);
+ ASSERT3U(vic->vic_prev_indirect_vdev, ==, UINT64_MAX);
+ (void) nvlist_lookup_uint64(nv, ZPOOL_CONFIG_PREV_INDIRECT_VDEV,
+ &vic->vic_prev_indirect_vdev);
+
+ /*
+ * Look for the 'not present' flag. This will only be set if the device
+ * was not present at the time of import.
+ */
+ (void) nvlist_lookup_uint64(nv, ZPOOL_CONFIG_NOT_PRESENT,
+ &vd->vdev_not_present);
+
+ /*
+ * Get the alignment requirement.
+ */
+ (void) nvlist_lookup_uint64(nv, ZPOOL_CONFIG_ASHIFT, &vd->vdev_ashift);
+
+ /*
+ * Retrieve the vdev creation time.
+ */
+ (void) nvlist_lookup_uint64(nv, ZPOOL_CONFIG_CREATE_TXG,
+ &vd->vdev_crtxg);
+
+ /*
+ * If we're a top-level vdev, try to load the allocation parameters.
+ */
+ if (top_level &&
+ (alloctype == VDEV_ALLOC_LOAD || alloctype == VDEV_ALLOC_SPLIT)) {
+ (void) nvlist_lookup_uint64(nv, ZPOOL_CONFIG_METASLAB_ARRAY,
+ &vd->vdev_ms_array);
+ (void) nvlist_lookup_uint64(nv, ZPOOL_CONFIG_METASLAB_SHIFT,
+ &vd->vdev_ms_shift);
+ (void) nvlist_lookup_uint64(nv, ZPOOL_CONFIG_ASIZE,
+ &vd->vdev_asize);
+ (void) nvlist_lookup_uint64(nv, ZPOOL_CONFIG_REMOVING,
+ &vd->vdev_removing);
+ (void) nvlist_lookup_uint64(nv, ZPOOL_CONFIG_VDEV_TOP_ZAP,
+ &vd->vdev_top_zap);
+ } else {
+ ASSERT0(vd->vdev_top_zap);
+ }
+
+ if (top_level && alloctype != VDEV_ALLOC_ATTACH) {
+ ASSERT(alloctype == VDEV_ALLOC_LOAD ||
+ alloctype == VDEV_ALLOC_ADD ||
+ alloctype == VDEV_ALLOC_SPLIT ||
+ alloctype == VDEV_ALLOC_ROOTPOOL);
+ /* Note: metaslab_group_create() is now deferred */
+ }
+
+ if (vd->vdev_ops->vdev_op_leaf &&
+ (alloctype == VDEV_ALLOC_LOAD || alloctype == VDEV_ALLOC_SPLIT)) {
+ (void) nvlist_lookup_uint64(nv,
+ ZPOOL_CONFIG_VDEV_LEAF_ZAP, &vd->vdev_leaf_zap);
+ } else {
+ ASSERT0(vd->vdev_leaf_zap);
+ }
+
+ /*
+ * If we're a leaf vdev, try to load the DTL object and other state.
+ */
+
+ if (vd->vdev_ops->vdev_op_leaf &&
+ (alloctype == VDEV_ALLOC_LOAD || alloctype == VDEV_ALLOC_L2CACHE ||
+ alloctype == VDEV_ALLOC_ROOTPOOL)) {
+ if (alloctype == VDEV_ALLOC_LOAD) {
+ (void) nvlist_lookup_uint64(nv, ZPOOL_CONFIG_DTL,
+ &vd->vdev_dtl_object);
+ (void) nvlist_lookup_uint64(nv, ZPOOL_CONFIG_UNSPARE,
+ &vd->vdev_unspare);
+ }
+
+ if (alloctype == VDEV_ALLOC_ROOTPOOL) {
+ uint64_t spare = 0;
+
+ if (nvlist_lookup_uint64(nv, ZPOOL_CONFIG_IS_SPARE,
+ &spare) == 0 && spare)
+ spa_spare_add(vd);
+ }
+
+ (void) nvlist_lookup_uint64(nv, ZPOOL_CONFIG_OFFLINE,
+ &vd->vdev_offline);
+
+ (void) nvlist_lookup_uint64(nv, ZPOOL_CONFIG_RESILVER_TXG,
+ &vd->vdev_resilver_txg);
+
+ /*
+ * When importing a pool, we want to ignore the persistent fault
+ * state, as the diagnosis made on another system may not be
+ * valid in the current context. Local vdevs will
+ * remain in the faulted state.
+ */
+ if (spa_load_state(spa) == SPA_LOAD_OPEN) {
+ (void) nvlist_lookup_uint64(nv, ZPOOL_CONFIG_FAULTED,
+ &vd->vdev_faulted);
+ (void) nvlist_lookup_uint64(nv, ZPOOL_CONFIG_DEGRADED,
+ &vd->vdev_degraded);
+ (void) nvlist_lookup_uint64(nv, ZPOOL_CONFIG_REMOVED,
+ &vd->vdev_removed);
+
+ if (vd->vdev_faulted || vd->vdev_degraded) {
+ char *aux;
+
+ vd->vdev_label_aux =
+ VDEV_AUX_ERR_EXCEEDED;
+ if (nvlist_lookup_string(nv,
+ ZPOOL_CONFIG_AUX_STATE, &aux) == 0 &&
+ strcmp(aux, "external") == 0)
+ vd->vdev_label_aux = VDEV_AUX_EXTERNAL;
+ }
+ }
+ }
+
+ /*
+ * Add ourselves to the parent's list of children.
+ */
+ vdev_add_child(parent, vd);
+
+ *vdp = vd;
+
+ return (0);
+}
+
+void
+vdev_free(vdev_t *vd)
+{
+ spa_t *spa = vd->vdev_spa;
+ ASSERT3P(vd->vdev_initialize_thread, ==, NULL);
+
+ /*
+ * Scan queues are normally destroyed at the end of a scan. If the
+ * queue exists here, that implies the vdev is being removed while
+ * the scan is still running.
+ */
+ if (vd->vdev_scan_io_queue != NULL) {
+ mutex_enter(&vd->vdev_scan_io_queue_lock);
+ dsl_scan_io_queue_destroy(vd->vdev_scan_io_queue);
+ vd->vdev_scan_io_queue = NULL;
+ mutex_exit(&vd->vdev_scan_io_queue_lock);
+ }
+
+ /*
+ * vdev_free() implies closing the vdev first. This is simpler than
+ * trying to ensure complicated semantics for all callers.
+ */
+ vdev_close(vd);
+
+ ASSERT(!list_link_active(&vd->vdev_config_dirty_node));
+ ASSERT(!list_link_active(&vd->vdev_state_dirty_node));
+
+ /*
+ * Free all children.
+ */
+ for (int c = 0; c < vd->vdev_children; c++)
+ vdev_free(vd->vdev_child[c]);
+
+ ASSERT(vd->vdev_child == NULL);
+ ASSERT(vd->vdev_guid_sum == vd->vdev_guid);
+ ASSERT(vd->vdev_initialize_thread == NULL);
+
+ /*
+ * Discard allocation state.
+ */
+ if (vd->vdev_mg != NULL) {
+ vdev_metaslab_fini(vd);
+ metaslab_group_destroy(vd->vdev_mg);
+ }
+
+ ASSERT0(vd->vdev_stat.vs_space);
+ ASSERT0(vd->vdev_stat.vs_dspace);
+ ASSERT0(vd->vdev_stat.vs_alloc);
+
+ /*
+ * Remove this vdev from its parent's child list.
+ */
+ vdev_remove_child(vd->vdev_parent, vd);
+
+ ASSERT(vd->vdev_parent == NULL);
+ ASSERT(!list_link_active(&vd->vdev_leaf_node));
+
+ /*
+ * Clean up vdev structure.
+ */
+ vdev_queue_fini(vd);
+ vdev_cache_fini(vd);
+
+ if (vd->vdev_path)
+ spa_strfree(vd->vdev_path);
+ if (vd->vdev_devid)
+ spa_strfree(vd->vdev_devid);
+ if (vd->vdev_physpath)
+ spa_strfree(vd->vdev_physpath);
+ if (vd->vdev_fru)
+ spa_strfree(vd->vdev_fru);
+
+ if (vd->vdev_isspare)
+ spa_spare_remove(vd);
+ if (vd->vdev_isl2cache)
+ spa_l2cache_remove(vd);
+
+ txg_list_destroy(&vd->vdev_ms_list);
+ txg_list_destroy(&vd->vdev_dtl_list);
+
+ mutex_enter(&vd->vdev_dtl_lock);
+ space_map_close(vd->vdev_dtl_sm);
+ for (int t = 0; t < DTL_TYPES; t++) {
+ range_tree_vacate(vd->vdev_dtl[t], NULL, NULL);
+ range_tree_destroy(vd->vdev_dtl[t]);
+ }
+ mutex_exit(&vd->vdev_dtl_lock);
+
+ EQUIV(vd->vdev_indirect_births != NULL,
+ vd->vdev_indirect_mapping != NULL);
+ if (vd->vdev_indirect_births != NULL) {
+ vdev_indirect_mapping_close(vd->vdev_indirect_mapping);
+ vdev_indirect_births_close(vd->vdev_indirect_births);
+ }
+
+ if (vd->vdev_obsolete_sm != NULL) {
+ ASSERT(vd->vdev_removing ||
+ vd->vdev_ops == &vdev_indirect_ops);
+ space_map_close(vd->vdev_obsolete_sm);
+ vd->vdev_obsolete_sm = NULL;
+ }
+ range_tree_destroy(vd->vdev_obsolete_segments);
+ rw_destroy(&vd->vdev_indirect_rwlock);
+ mutex_destroy(&vd->vdev_obsolete_lock);
+
+ mutex_destroy(&vd->vdev_dtl_lock);
+ mutex_destroy(&vd->vdev_stat_lock);
+ mutex_destroy(&vd->vdev_probe_lock);
+ mutex_destroy(&vd->vdev_scan_io_queue_lock);
+ mutex_destroy(&vd->vdev_initialize_lock);
+ mutex_destroy(&vd->vdev_initialize_io_lock);
+ cv_destroy(&vd->vdev_initialize_io_cv);
+ cv_destroy(&vd->vdev_initialize_cv);
+
+ if (vd == spa->spa_root_vdev)
+ spa->spa_root_vdev = NULL;
+
+ kmem_free(vd, sizeof (vdev_t));
+}
+
+/*
+ * Transfer top-level vdev state from svd to tvd.
+ */
+static void
+vdev_top_transfer(vdev_t *svd, vdev_t *tvd)
+{
+ spa_t *spa = svd->vdev_spa;
+ metaslab_t *msp;
+ vdev_t *vd;
+ int t;
+
+ ASSERT(tvd == tvd->vdev_top);
+
+ tvd->vdev_ms_array = svd->vdev_ms_array;
+ tvd->vdev_ms_shift = svd->vdev_ms_shift;
+ tvd->vdev_ms_count = svd->vdev_ms_count;
+ tvd->vdev_top_zap = svd->vdev_top_zap;
+
+ svd->vdev_ms_array = 0;
+ svd->vdev_ms_shift = 0;
+ svd->vdev_ms_count = 0;
+ svd->vdev_top_zap = 0;
+
+ if (tvd->vdev_mg)
+ ASSERT3P(tvd->vdev_mg, ==, svd->vdev_mg);
+ tvd->vdev_mg = svd->vdev_mg;
+ tvd->vdev_ms = svd->vdev_ms;
+
+ svd->vdev_mg = NULL;
+ svd->vdev_ms = NULL;
+
+ if (tvd->vdev_mg != NULL)
+ tvd->vdev_mg->mg_vd = tvd;
+
+ tvd->vdev_checkpoint_sm = svd->vdev_checkpoint_sm;
+ svd->vdev_checkpoint_sm = NULL;
+
+ tvd->vdev_alloc_bias = svd->vdev_alloc_bias;
+ svd->vdev_alloc_bias = VDEV_BIAS_NONE;
+
+ tvd->vdev_stat.vs_alloc = svd->vdev_stat.vs_alloc;
+ tvd->vdev_stat.vs_space = svd->vdev_stat.vs_space;
+ tvd->vdev_stat.vs_dspace = svd->vdev_stat.vs_dspace;
+
+ svd->vdev_stat.vs_alloc = 0;
+ svd->vdev_stat.vs_space = 0;
+ svd->vdev_stat.vs_dspace = 0;
+
+ /*
+ * State which may be set on a top-level vdev that's in the
+ * process of being removed.
+ */
+ ASSERT0(tvd->vdev_indirect_config.vic_births_object);
+ ASSERT0(tvd->vdev_indirect_config.vic_mapping_object);
+ ASSERT3U(tvd->vdev_indirect_config.vic_prev_indirect_vdev, ==, -1ULL);
+ ASSERT3P(tvd->vdev_indirect_mapping, ==, NULL);
+ ASSERT3P(tvd->vdev_indirect_births, ==, NULL);
+ ASSERT3P(tvd->vdev_obsolete_sm, ==, NULL);
+ ASSERT0(tvd->vdev_removing);
+ tvd->vdev_removing = svd->vdev_removing;
+ tvd->vdev_indirect_config = svd->vdev_indirect_config;
+ tvd->vdev_indirect_mapping = svd->vdev_indirect_mapping;
+ tvd->vdev_indirect_births = svd->vdev_indirect_births;
+ range_tree_swap(&svd->vdev_obsolete_segments,
+ &tvd->vdev_obsolete_segments);
+ tvd->vdev_obsolete_sm = svd->vdev_obsolete_sm;
+ svd->vdev_indirect_config.vic_mapping_object = 0;
+ svd->vdev_indirect_config.vic_births_object = 0;
+ svd->vdev_indirect_config.vic_prev_indirect_vdev = -1ULL;
+ svd->vdev_indirect_mapping = NULL;
+ svd->vdev_indirect_births = NULL;
+ svd->vdev_obsolete_sm = NULL;
+ svd->vdev_removing = 0;
+
+ for (t = 0; t < TXG_SIZE; t++) {
+ while ((msp = txg_list_remove(&svd->vdev_ms_list, t)) != NULL)
+ (void) txg_list_add(&tvd->vdev_ms_list, msp, t);
+ while ((vd = txg_list_remove(&svd->vdev_dtl_list, t)) != NULL)
+ (void) txg_list_add(&tvd->vdev_dtl_list, vd, t);
+ if (txg_list_remove_this(&spa->spa_vdev_txg_list, svd, t))
+ (void) txg_list_add(&spa->spa_vdev_txg_list, tvd, t);
+ }
+
+ if (list_link_active(&svd->vdev_config_dirty_node)) {
+ vdev_config_clean(svd);
+ vdev_config_dirty(tvd);
+ }
+
+ if (list_link_active(&svd->vdev_state_dirty_node)) {
+ vdev_state_clean(svd);
+ vdev_state_dirty(tvd);
+ }
+
+ tvd->vdev_deflate_ratio = svd->vdev_deflate_ratio;
+ svd->vdev_deflate_ratio = 0;
+
+ tvd->vdev_islog = svd->vdev_islog;
+ svd->vdev_islog = 0;
+
+ dsl_scan_io_queue_vdev_xfer(svd, tvd);
+}
+
+static void
+vdev_top_update(vdev_t *tvd, vdev_t *vd)
+{
+ if (vd == NULL)
+ return;
+
+ vd->vdev_top = tvd;
+
+ for (int c = 0; c < vd->vdev_children; c++)
+ vdev_top_update(tvd, vd->vdev_child[c]);
+}
+
+/*
+ * Add a mirror/replacing vdev above an existing vdev.
+ */
+vdev_t *
+vdev_add_parent(vdev_t *cvd, vdev_ops_t *ops)
+{
+ spa_t *spa = cvd->vdev_spa;
+ vdev_t *pvd = cvd->vdev_parent;
+ vdev_t *mvd;
+
+ ASSERT(spa_config_held(spa, SCL_ALL, RW_WRITER) == SCL_ALL);
+
+ mvd = vdev_alloc_common(spa, cvd->vdev_id, 0, ops);
+
+ mvd->vdev_asize = cvd->vdev_asize;
+ mvd->vdev_min_asize = cvd->vdev_min_asize;
+ mvd->vdev_max_asize = cvd->vdev_max_asize;
+ mvd->vdev_psize = cvd->vdev_psize;
+ mvd->vdev_ashift = cvd->vdev_ashift;
+ mvd->vdev_logical_ashift = cvd->vdev_logical_ashift;
+ mvd->vdev_physical_ashift = cvd->vdev_physical_ashift;
+ mvd->vdev_state = cvd->vdev_state;
+ mvd->vdev_crtxg = cvd->vdev_crtxg;
+
+ vdev_remove_child(pvd, cvd);
+ vdev_add_child(pvd, mvd);
+ cvd->vdev_id = mvd->vdev_children;
+ vdev_add_child(mvd, cvd);
+ vdev_top_update(cvd->vdev_top, cvd->vdev_top);
+
+ if (mvd == mvd->vdev_top)
+ vdev_top_transfer(cvd, mvd);
+
+ return (mvd);
+}
+
+/*
+ * Remove a 1-way mirror/replacing vdev from the tree.
+ */
+void
+vdev_remove_parent(vdev_t *cvd)
+{
+ vdev_t *mvd = cvd->vdev_parent;
+ vdev_t *pvd = mvd->vdev_parent;
+
+ ASSERT(spa_config_held(cvd->vdev_spa, SCL_ALL, RW_WRITER) == SCL_ALL);
+
+ ASSERT(mvd->vdev_children == 1);
+ ASSERT(mvd->vdev_ops == &vdev_mirror_ops ||
+ mvd->vdev_ops == &vdev_replacing_ops ||
+ mvd->vdev_ops == &vdev_spare_ops);
+ cvd->vdev_ashift = mvd->vdev_ashift;
+ cvd->vdev_logical_ashift = mvd->vdev_logical_ashift;
+ cvd->vdev_physical_ashift = mvd->vdev_physical_ashift;
+
+ vdev_remove_child(mvd, cvd);
+ vdev_remove_child(pvd, mvd);
+
+ /*
+ * If cvd will replace mvd as a top-level vdev, preserve mvd's guid.
+ * Otherwise, we could have detached an offline device, and when we
+ * go to import the pool we'll think we have two top-level vdevs,
+ * instead of a different version of the same top-level vdev.
+ */
+ if (mvd->vdev_top == mvd) {
+ uint64_t guid_delta = mvd->vdev_guid - cvd->vdev_guid;
+ cvd->vdev_orig_guid = cvd->vdev_guid;
+ cvd->vdev_guid += guid_delta;
+ cvd->vdev_guid_sum += guid_delta;
+ }
+ cvd->vdev_id = mvd->vdev_id;
+ vdev_add_child(pvd, cvd);
+ vdev_top_update(cvd->vdev_top, cvd->vdev_top);
+
+ if (cvd == cvd->vdev_top)
+ vdev_top_transfer(mvd, cvd);
+
+ ASSERT(mvd->vdev_children == 0);
+ vdev_free(mvd);
+}
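The guid arithmetic above works because the same delta is applied to both fields: after cvd->vdev_guid += (mvd->vdev_guid - cvd->vdev_guid), cvd carries exactly mvd's guid (the original is preserved in vdev_orig_guid), and adjusting vdev_guid_sum by the same delta keeps the tree's guid sum consistent with the relabeled child.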
+
+static void
+vdev_metaslab_group_create(vdev_t *vd)
+{
+ spa_t *spa = vd->vdev_spa;
+
+ /*
+ * metaslab_group_create was delayed until allocation bias was available
+ */
+ if (vd->vdev_mg == NULL) {
+ metaslab_class_t *mc;
+
+ if (vd->vdev_islog && vd->vdev_alloc_bias == VDEV_BIAS_NONE)
+ vd->vdev_alloc_bias = VDEV_BIAS_LOG;
+
+ ASSERT3U(vd->vdev_islog, ==,
+ (vd->vdev_alloc_bias == VDEV_BIAS_LOG));
+
+ switch (vd->vdev_alloc_bias) {
+ case VDEV_BIAS_LOG:
+ mc = spa_log_class(spa);
+ break;
+ case VDEV_BIAS_SPECIAL:
+ mc = spa_special_class(spa);
+ break;
+ case VDEV_BIAS_DEDUP:
+ mc = spa_dedup_class(spa);
+ break;
+ default:
+ mc = spa_normal_class(spa);
+ }
+
+ vd->vdev_mg = metaslab_group_create(mc, vd,
+ spa->spa_alloc_count);
+
+ /*
+ * The spa ashift values currently only reflect the
+ * general vdev classes. Class destination is late
+		 * binding, so ashift checking had to wait until now.
+ */
+ if (vd->vdev_top == vd && vd->vdev_ashift != 0 &&
+ mc == spa_normal_class(spa) && vd->vdev_aux == NULL) {
+ if (vd->vdev_ashift > spa->spa_max_ashift)
+ spa->spa_max_ashift = vd->vdev_ashift;
+ if (vd->vdev_ashift < spa->spa_min_ashift)
+ spa->spa_min_ashift = vd->vdev_ashift;
+ }
+ }
+}
+
+int
+vdev_metaslab_init(vdev_t *vd, uint64_t txg)
+{
+ spa_t *spa = vd->vdev_spa;
+ objset_t *mos = spa->spa_meta_objset;
+ uint64_t m;
+ uint64_t oldc = vd->vdev_ms_count;
+ uint64_t newc = vd->vdev_asize >> vd->vdev_ms_shift;
+ metaslab_t **mspp;
+ int error;
+ boolean_t expanding = (oldc != 0);
+
+ ASSERT(txg == 0 || spa_config_held(spa, SCL_ALLOC, RW_WRITER));
+
+ /*
+ * This vdev is not being allocated from yet or is a hole.
+ */
+ if (vd->vdev_ms_shift == 0)
+ return (0);
+
+ ASSERT(!vd->vdev_ishole);
+
+ ASSERT(oldc <= newc);
+
+ mspp = kmem_zalloc(newc * sizeof (*mspp), KM_SLEEP);
+
+ if (expanding) {
+ bcopy(vd->vdev_ms, mspp, oldc * sizeof (*mspp));
+ kmem_free(vd->vdev_ms, oldc * sizeof (*mspp));
+ }
+
+ vd->vdev_ms = mspp;
+ vd->vdev_ms_count = newc;
+ for (m = oldc; m < newc; m++) {
+ uint64_t object = 0;
+
+ /*
+ * vdev_ms_array may be 0 if we are creating the "fake"
+ * metaslabs for an indirect vdev for zdb's leak detection.
+ * See zdb_leak_init().
+ */
+ if (txg == 0 && vd->vdev_ms_array != 0) {
+ error = dmu_read(mos, vd->vdev_ms_array,
+ m * sizeof (uint64_t), sizeof (uint64_t), &object,
+ DMU_READ_PREFETCH);
+ if (error != 0) {
+ vdev_dbgmsg(vd, "unable to read the metaslab "
+ "array [error=%d]", error);
+ return (error);
+ }
+ }
+
+#ifndef _KERNEL
+ /*
+		 * To accommodate zdb_leak_init() fake indirect
+ * metaslabs, we allocate a metaslab group for
+ * indirect vdevs which normally don't have one.
+ */
+ if (vd->vdev_mg == NULL) {
+ ASSERT0(vdev_is_concrete(vd));
+ vdev_metaslab_group_create(vd);
+ }
+#endif
+ error = metaslab_init(vd->vdev_mg, m, object, txg,
+ &(vd->vdev_ms[m]));
+ if (error != 0) {
+ vdev_dbgmsg(vd, "metaslab_init failed [error=%d]",
+ error);
+ return (error);
+ }
+ }
+
+ if (txg == 0)
+ spa_config_enter(spa, SCL_ALLOC, FTAG, RW_WRITER);
+
+ /*
+ * If the vdev is being removed we don't activate
+ * the metaslabs since we want to ensure that no new
+ * allocations are performed on this device.
+ */
+ if (!expanding && !vd->vdev_removing) {
+ metaslab_group_activate(vd->vdev_mg);
+ }
+
+ if (txg == 0)
+ spa_config_exit(spa, SCL_ALLOC, FTAG);
+
+ return (0);
+}
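A worked example of the sizing above: newc = vdev_asize >> vdev_ms_shift, so a vdev with vdev_asize = 1 TiB (2^40) and vdev_ms_shift = 33 (8 GiB metaslabs) gets 2^40 >> 33 = 128 metaslabs. On expansion, the oldc existing pointers are copied over and only the slots in [oldc, newc) are initialized.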
+
+void
+vdev_metaslab_fini(vdev_t *vd)
+{
+ if (vd->vdev_checkpoint_sm != NULL) {
+ ASSERT(spa_feature_is_active(vd->vdev_spa,
+ SPA_FEATURE_POOL_CHECKPOINT));
+ space_map_close(vd->vdev_checkpoint_sm);
+ /*
+ * Even though we close the space map, we need to set its
+ * pointer to NULL. The reason is that vdev_metaslab_fini()
+ * may be called multiple times for certain operations
+ * (i.e. when destroying a pool) so we need to ensure that
+ * this clause never executes twice. This logic is similar
+ * to the one used for the vdev_ms clause below.
+ */
+ vd->vdev_checkpoint_sm = NULL;
+ }
+
+ if (vd->vdev_ms != NULL) {
+ metaslab_group_t *mg = vd->vdev_mg;
+ metaslab_group_passivate(mg);
+
+ uint64_t count = vd->vdev_ms_count;
+ for (uint64_t m = 0; m < count; m++) {
+ metaslab_t *msp = vd->vdev_ms[m];
+ if (msp != NULL)
+ metaslab_fini(msp);
+ }
+ kmem_free(vd->vdev_ms, count * sizeof (metaslab_t *));
+ vd->vdev_ms = NULL;
+
+ vd->vdev_ms_count = 0;
+
+ for (int i = 0; i < RANGE_TREE_HISTOGRAM_SIZE; i++)
+ ASSERT0(mg->mg_histogram[i]);
+ }
+ ASSERT0(vd->vdev_ms_count);
+}
+
+typedef struct vdev_probe_stats {
+ boolean_t vps_readable;
+ boolean_t vps_writeable;
+ int vps_flags;
+} vdev_probe_stats_t;
+
+static void
+vdev_probe_done(zio_t *zio)
+{
+ spa_t *spa = zio->io_spa;
+ vdev_t *vd = zio->io_vd;
+ vdev_probe_stats_t *vps = zio->io_private;
+
+ ASSERT(vd->vdev_probe_zio != NULL);
+
+ if (zio->io_type == ZIO_TYPE_READ) {
+ if (zio->io_error == 0)
+ vps->vps_readable = 1;
+ if (zio->io_error == 0 && spa_writeable(spa)) {
+ zio_nowait(zio_write_phys(vd->vdev_probe_zio, vd,
+ zio->io_offset, zio->io_size, zio->io_abd,
+ ZIO_CHECKSUM_OFF, vdev_probe_done, vps,
+ ZIO_PRIORITY_SYNC_WRITE, vps->vps_flags, B_TRUE));
+ } else {
+ abd_free(zio->io_abd);
+ }
+ } else if (zio->io_type == ZIO_TYPE_WRITE) {
+ if (zio->io_error == 0)
+ vps->vps_writeable = 1;
+ abd_free(zio->io_abd);
+ } else if (zio->io_type == ZIO_TYPE_NULL) {
+ zio_t *pio;
+
+ vd->vdev_cant_read |= !vps->vps_readable;
+ vd->vdev_cant_write |= !vps->vps_writeable;
+
+ if (vdev_readable(vd) &&
+ (vdev_writeable(vd) || !spa_writeable(spa))) {
+ zio->io_error = 0;
+ } else {
+ ASSERT(zio->io_error != 0);
+ vdev_dbgmsg(vd, "failed probe");
+ zfs_ereport_post(FM_EREPORT_ZFS_PROBE_FAILURE,
+ spa, vd, NULL, 0, 0);
+ zio->io_error = SET_ERROR(ENXIO);
+ }
+
+ mutex_enter(&vd->vdev_probe_lock);
+ ASSERT(vd->vdev_probe_zio == zio);
+ vd->vdev_probe_zio = NULL;
+ mutex_exit(&vd->vdev_probe_lock);
+
+ zio_link_t *zl = NULL;
+ while ((pio = zio_walk_parents(zio, &zl)) != NULL)
+ if (!vdev_accessible(vd, pio))
+ pio->io_error = SET_ERROR(ENXIO);
+
+ kmem_free(vps, sizeof (*vps));
+ }
+}
+
+/*
+ * Determine whether this device is accessible.
+ *
+ * Read and write to several known locations: the pad regions of every
+ * vdev label except the first, which we leave alone in case it contains
+ * a VTOC.
+ */
+zio_t *
+vdev_probe(vdev_t *vd, zio_t *zio)
+{
+ spa_t *spa = vd->vdev_spa;
+ vdev_probe_stats_t *vps = NULL;
+ zio_t *pio;
+
+ ASSERT(vd->vdev_ops->vdev_op_leaf);
+
+ /*
+ * Don't probe the probe.
+ */
+ if (zio && (zio->io_flags & ZIO_FLAG_PROBE))
+ return (NULL);
+
+ /*
+ * To prevent 'probe storms' when a device fails, we create
+ * just one probe i/o at a time. All zios that want to probe
+ * this vdev will become parents of the probe io.
+ */
+ mutex_enter(&vd->vdev_probe_lock);
+
+ if ((pio = vd->vdev_probe_zio) == NULL) {
+ vps = kmem_zalloc(sizeof (*vps), KM_SLEEP);
+
+ vps->vps_flags = ZIO_FLAG_CANFAIL | ZIO_FLAG_PROBE |
+ ZIO_FLAG_DONT_CACHE | ZIO_FLAG_DONT_AGGREGATE |
+ ZIO_FLAG_TRYHARD;
+
+ if (spa_config_held(spa, SCL_ZIO, RW_WRITER)) {
+ /*
+ * vdev_cant_read and vdev_cant_write can only
+ * transition from TRUE to FALSE when we have the
+ * SCL_ZIO lock as writer; otherwise they can only
+ * transition from FALSE to TRUE. This ensures that
+ * any zio looking at these values can assume that
+ * failures persist for the life of the I/O. That's
+ * important because when a device has intermittent
+ * connectivity problems, we want to ensure that
+ * they're ascribed to the device (ENXIO) and not
+ * the zio (EIO).
+ *
+ * Since we hold SCL_ZIO as writer here, clear both
+ * values so the probe can reevaluate from first
+ * principles.
+ */
+ vps->vps_flags |= ZIO_FLAG_CONFIG_WRITER;
+ vd->vdev_cant_read = B_FALSE;
+ vd->vdev_cant_write = B_FALSE;
+ }
+
+ vd->vdev_probe_zio = pio = zio_null(NULL, spa, vd,
+ vdev_probe_done, vps,
+ vps->vps_flags | ZIO_FLAG_DONT_PROPAGATE);
+
+ /*
+ * We can't change the vdev state in this context, so we
+ * kick off an async task to do it on our behalf.
+ */
+ if (zio != NULL) {
+ vd->vdev_probe_wanted = B_TRUE;
+ spa_async_request(spa, SPA_ASYNC_PROBE);
+ }
+ }
+
+ if (zio != NULL)
+ zio_add_child(zio, pio);
+
+ mutex_exit(&vd->vdev_probe_lock);
+
+ if (vps == NULL) {
+ ASSERT(zio != NULL);
+ return (NULL);
+ }
+
+ for (int l = 1; l < VDEV_LABELS; l++) {
+ zio_nowait(zio_read_phys(pio, vd,
+ vdev_label_offset(vd->vdev_psize, l,
+ offsetof(vdev_label_t, vl_be)), VDEV_PAD_SIZE,
+ abd_alloc_for_io(VDEV_PAD_SIZE, B_TRUE),
+ ZIO_CHECKSUM_OFF, vdev_probe_done, vps,
+ ZIO_PRIORITY_SYNC_READ, vps->vps_flags, B_TRUE));
+ }
+
+ if (zio == NULL)
+ return (pio);
+
+ zio_nowait(pio);
+ return (NULL);
+}
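Synchronous callers simply wait on the returned probe zio; vdev_open() below does exactly that for leaf vdevs:

    /* Probe a leaf vdev and wait for the verdict. */
    int error = zio_wait(vdev_probe(vd, NULL));
    if (error != 0) {
        /* The device failed the probe; callers treat this as ENXIO. */
    }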
+
+static void
+vdev_open_child(void *arg)
+{
+ vdev_t *vd = arg;
+
+ vd->vdev_open_thread = curthread;
+ vd->vdev_open_error = vdev_open(vd);
+ vd->vdev_open_thread = NULL;
+}
+
+boolean_t
+vdev_uses_zvols(vdev_t *vd)
+{
+ if (vd->vdev_path && strncmp(vd->vdev_path, ZVOL_DIR,
+ strlen(ZVOL_DIR)) == 0)
+ return (B_TRUE);
+ for (int c = 0; c < vd->vdev_children; c++)
+ if (vdev_uses_zvols(vd->vdev_child[c]))
+ return (B_TRUE);
+ return (B_FALSE);
+}
+
+void
+vdev_open_children(vdev_t *vd)
+{
+ taskq_t *tq;
+ int children = vd->vdev_children;
+
+ vd->vdev_nonrot = B_TRUE;
+
+ /*
+	 * In order to handle pools on top of zvols, do the opens
+	 * in a single thread so that the same thread holds the
+	 * spa_namespace_lock.
+ */
+ if (B_TRUE || vdev_uses_zvols(vd)) {
+ for (int c = 0; c < children; c++) {
+ vd->vdev_child[c]->vdev_open_error =
+ vdev_open(vd->vdev_child[c]);
+ vd->vdev_nonrot &= vd->vdev_child[c]->vdev_nonrot;
+ }
+ return;
+ }
+ tq = taskq_create("vdev_open", children, minclsyspri,
+ children, children, TASKQ_PREPOPULATE);
+
+ for (int c = 0; c < children; c++)
+ VERIFY(taskq_dispatch(tq, vdev_open_child, vd->vdev_child[c],
+ TQ_SLEEP) != 0);
+
+ taskq_destroy(tq);
+
+ for (int c = 0; c < children; c++)
+ vd->vdev_nonrot &= vd->vdev_child[c]->vdev_nonrot;
+}
+
+/*
+ * Compute the raidz-deflation ratio. Note that we hard-code
+ * 128k (1 << 17) because it is the "typical" blocksize.
+ * Even though SPA_MAXBLOCKSIZE has changed, this algorithm cannot change;
+ * otherwise it would inconsistently account for existing bp's.
+ */
+static void
+vdev_set_deflate_ratio(vdev_t *vd)
+{
+ if (vd == vd->vdev_top && !vd->vdev_ishole && vd->vdev_ashift != 0) {
+ vd->vdev_deflate_ratio = (1 << 17) /
+ (vdev_psize_to_asize(vd, 1 << 17) >> SPA_MINBLOCKSHIFT);
+ }
+}
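A worked example, with the ratio expressed in 512-byte sectors (SPA_MINBLOCKSHIFT): for a plain disk where asize equals psize, vdev_psize_to_asize(vd, 1 << 17) >> 9 = 256, giving vdev_deflate_ratio = 131072 / 256 = 512. For a hypothetical raidz vdev whose 128k block inflates to 192k of allocated space, the ratio becomes 131072 / 384 = 341, and deflated-space accounting scales raw capacity down by the same factor.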
+
+/*
+ * Prepare a virtual device for access.
+ */
+int
+vdev_open(vdev_t *vd)
+{
+ spa_t *spa = vd->vdev_spa;
+ int error;
+ uint64_t osize = 0;
+ uint64_t max_osize = 0;
+ uint64_t asize, max_asize, psize;
+ uint64_t logical_ashift = 0;
+ uint64_t physical_ashift = 0;
+
+ ASSERT(vd->vdev_open_thread == curthread ||
+ spa_config_held(spa, SCL_STATE_ALL, RW_WRITER) == SCL_STATE_ALL);
+ ASSERT(vd->vdev_state == VDEV_STATE_CLOSED ||
+ vd->vdev_state == VDEV_STATE_CANT_OPEN ||
+ vd->vdev_state == VDEV_STATE_OFFLINE);
+
+ vd->vdev_stat.vs_aux = VDEV_AUX_NONE;
+ vd->vdev_cant_read = B_FALSE;
+ vd->vdev_cant_write = B_FALSE;
+ vd->vdev_notrim = B_FALSE;
+ vd->vdev_min_asize = vdev_get_min_asize(vd);
+
+ /*
+ * If this vdev is not removed, check its fault status. If it's
+ * faulted, bail out of the open.
+ */
+ if (!vd->vdev_removed && vd->vdev_faulted) {
+ ASSERT(vd->vdev_children == 0);
+ ASSERT(vd->vdev_label_aux == VDEV_AUX_ERR_EXCEEDED ||
+ vd->vdev_label_aux == VDEV_AUX_EXTERNAL);
+ vdev_set_state(vd, B_TRUE, VDEV_STATE_FAULTED,
+ vd->vdev_label_aux);
+ return (SET_ERROR(ENXIO));
+ } else if (vd->vdev_offline) {
+ ASSERT(vd->vdev_children == 0);
+ vdev_set_state(vd, B_TRUE, VDEV_STATE_OFFLINE, VDEV_AUX_NONE);
+ return (SET_ERROR(ENXIO));
+ }
+
+ error = vd->vdev_ops->vdev_op_open(vd, &osize, &max_osize,
+ &logical_ashift, &physical_ashift);
+
+ /*
+ * Reset the vdev_reopening flag so that we actually close
+ * the vdev on error.
+ */
+ vd->vdev_reopening = B_FALSE;
+ if (zio_injection_enabled && error == 0)
+ error = zio_handle_device_injection(vd, NULL, ENXIO);
+
+ if (error) {
+ if (vd->vdev_removed &&
+ vd->vdev_stat.vs_aux != VDEV_AUX_OPEN_FAILED)
+ vd->vdev_removed = B_FALSE;
+
+ if (vd->vdev_stat.vs_aux == VDEV_AUX_CHILDREN_OFFLINE) {
+ vdev_set_state(vd, B_TRUE, VDEV_STATE_OFFLINE,
+ vd->vdev_stat.vs_aux);
+ } else {
+ vdev_set_state(vd, B_TRUE, VDEV_STATE_CANT_OPEN,
+ vd->vdev_stat.vs_aux);
+ }
+ return (error);
+ }
+
+ vd->vdev_removed = B_FALSE;
+
+ /*
+ * Recheck the faulted flag now that we have confirmed that
+ * the vdev is accessible. If we're faulted, bail.
+ */
+ if (vd->vdev_faulted) {
+ ASSERT(vd->vdev_children == 0);
+ ASSERT(vd->vdev_label_aux == VDEV_AUX_ERR_EXCEEDED ||
+ vd->vdev_label_aux == VDEV_AUX_EXTERNAL);
+ vdev_set_state(vd, B_TRUE, VDEV_STATE_FAULTED,
+ vd->vdev_label_aux);
+ return (SET_ERROR(ENXIO));
+ }
+
+ if (vd->vdev_degraded) {
+ ASSERT(vd->vdev_children == 0);
+ vdev_set_state(vd, B_TRUE, VDEV_STATE_DEGRADED,
+ VDEV_AUX_ERR_EXCEEDED);
+ } else {
+ vdev_set_state(vd, B_TRUE, VDEV_STATE_HEALTHY, 0);
+ }
+
+ /*
+ * For hole or missing vdevs we just return success.
+ */
+ if (vd->vdev_ishole || vd->vdev_ops == &vdev_missing_ops)
+ return (0);
+
+ if (zfs_trim_enabled && !vd->vdev_notrim && vd->vdev_ops->vdev_op_leaf)
+ trim_map_create(vd);
+
+ for (int c = 0; c < vd->vdev_children; c++) {
+ if (vd->vdev_child[c]->vdev_state != VDEV_STATE_HEALTHY) {
+ vdev_set_state(vd, B_TRUE, VDEV_STATE_DEGRADED,
+ VDEV_AUX_NONE);
+ break;
+ }
+ }
+
+ osize = P2ALIGN(osize, (uint64_t)sizeof (vdev_label_t));
+ max_osize = P2ALIGN(max_osize, (uint64_t)sizeof (vdev_label_t));
+
+ if (vd->vdev_children == 0) {
+ if (osize < SPA_MINDEVSIZE) {
+ vdev_set_state(vd, B_TRUE, VDEV_STATE_CANT_OPEN,
+ VDEV_AUX_TOO_SMALL);
+ return (SET_ERROR(EOVERFLOW));
+ }
+ psize = osize;
+ asize = osize - (VDEV_LABEL_START_SIZE + VDEV_LABEL_END_SIZE);
+ max_asize = max_osize - (VDEV_LABEL_START_SIZE +
+ VDEV_LABEL_END_SIZE);
+ } else {
+ if (vd->vdev_parent != NULL && osize < SPA_MINDEVSIZE -
+ (VDEV_LABEL_START_SIZE + VDEV_LABEL_END_SIZE)) {
+ vdev_set_state(vd, B_TRUE, VDEV_STATE_CANT_OPEN,
+ VDEV_AUX_TOO_SMALL);
+ return (SET_ERROR(EOVERFLOW));
+ }
+ psize = 0;
+ asize = osize;
+ max_asize = max_osize;
+ }
+
+ vd->vdev_psize = psize;
+
+ /*
+ * Make sure the allocatable size hasn't shrunk too much.
+ */
+ if (asize < vd->vdev_min_asize) {
+ vdev_set_state(vd, B_TRUE, VDEV_STATE_CANT_OPEN,
+ VDEV_AUX_BAD_LABEL);
+ return (SET_ERROR(EINVAL));
+ }
+
+ vd->vdev_physical_ashift =
+ MAX(physical_ashift, vd->vdev_physical_ashift);
+ vd->vdev_logical_ashift = MAX(logical_ashift, vd->vdev_logical_ashift);
+ vd->vdev_ashift = MAX(vd->vdev_logical_ashift, vd->vdev_ashift);
+
+ if (vd->vdev_logical_ashift > SPA_MAXASHIFT) {
+ vdev_set_state(vd, B_TRUE, VDEV_STATE_CANT_OPEN,
+ VDEV_AUX_ASHIFT_TOO_BIG);
+ return (EINVAL);
+ }
+
+ if (vd->vdev_asize == 0) {
+ /*
+ * This is the first-ever open, so use the computed values.
+ * For testing purposes, a higher ashift can be requested.
+ */
+ vd->vdev_asize = asize;
+ vd->vdev_max_asize = max_asize;
+ } else {
+ /*
+ * Make sure the alignment requirement hasn't increased.
+ */
+ if (vd->vdev_ashift > vd->vdev_top->vdev_ashift &&
+ vd->vdev_ops->vdev_op_leaf) {
+ vdev_set_state(vd, B_TRUE, VDEV_STATE_CANT_OPEN,
+ VDEV_AUX_BAD_LABEL);
+ return (EINVAL);
+ }
+ vd->vdev_max_asize = max_asize;
+ }
+
+ /*
+	 * If all children are healthy, we update asize if either:
+	 *
+	 * The asize has increased, due to a device expansion caused by
+	 * dynamic LUN growth or vdev replacement, and automatic expansion
+	 * is enabled, making the additional space available; or
+	 *
+	 * The asize has decreased, due to a device shrink usually caused by
+	 * a vdev replace with a smaller device. This ensures that
+	 * calculations based on max_asize and asize (e.g. esize) are always
+	 * valid. It's safe to do this as we've already validated that asize
+	 * is greater than vdev_min_asize.
+ */
+ if (vd->vdev_state == VDEV_STATE_HEALTHY &&
+ ((asize > vd->vdev_asize &&
+ (vd->vdev_expanding || spa->spa_autoexpand)) ||
+ (asize < vd->vdev_asize)))
+ vd->vdev_asize = asize;
+
+ vdev_set_min_asize(vd);
+
+ /*
+ * Ensure we can issue some IO before declaring the
+ * vdev open for business.
+ */
+ if (vd->vdev_ops->vdev_op_leaf &&
+ (error = zio_wait(vdev_probe(vd, NULL))) != 0) {
+ vdev_set_state(vd, B_TRUE, VDEV_STATE_FAULTED,
+ VDEV_AUX_ERR_EXCEEDED);
+ return (error);
+ }
+
+ /*
+ * Track the min and max ashift values for normal data devices.
+ *
+ * DJB - TBD these should perhaps be tracked per allocation class
+ * (e.g. spa_min_ashift is used to round up post compression buffers)
+ */
+ if (vd->vdev_top == vd && vd->vdev_ashift != 0 &&
+ vd->vdev_alloc_bias == VDEV_BIAS_NONE &&
+ vd->vdev_aux == NULL) {
+ if (vd->vdev_ashift > spa->spa_max_ashift)
+ spa->spa_max_ashift = vd->vdev_ashift;
+ if (vd->vdev_ashift < spa->spa_min_ashift)
+ spa->spa_min_ashift = vd->vdev_ashift;
+ }
+
+ /*
+ * If a leaf vdev has a DTL, and seems healthy, then kick off a
+ * resilver. But don't do this if we are doing a reopen for a scrub,
+ * since this would just restart the scrub we are already doing.
+ */
+ if (vd->vdev_ops->vdev_op_leaf && !spa->spa_scrub_reopen &&
+ vdev_resilver_needed(vd, NULL, NULL))
+ spa_async_request(spa, SPA_ASYNC_RESILVER);
+
+ return (0);
+}
+
+/*
+ * Called once the vdevs are all opened, this routine validates the label
+ * contents. This needs to be done before vdev_load() so that we don't
+ * inadvertently do repair I/Os to the wrong device.
+ *
+ * This function will only return failure if one of the vdevs indicates that it
+ * has since been destroyed or exported. This is only possible if
+ * /etc/zfs/zpool.cache was readonly at the time. Otherwise, the vdev state
+ * will be updated but the function will return 0.
+ */
+int
+vdev_validate(vdev_t *vd)
+{
+ spa_t *spa = vd->vdev_spa;
+ nvlist_t *label;
+ uint64_t guid = 0, aux_guid = 0, top_guid;
+ uint64_t state;
+ nvlist_t *nvl;
+ uint64_t txg;
+
+ if (vdev_validate_skip)
+ return (0);
+
+ for (uint64_t c = 0; c < vd->vdev_children; c++)
+ if (vdev_validate(vd->vdev_child[c]) != 0)
+ return (SET_ERROR(EBADF));
+
+ /*
+ * If the device has already failed, or was marked offline, don't do
+ * any further validation. Otherwise, label I/O will fail and we will
+ * overwrite the previous state.
+ */
+ if (!vd->vdev_ops->vdev_op_leaf || !vdev_readable(vd))
+ return (0);
+
+ /*
+ * If we are performing an extreme rewind, we allow for a label that
+ * was modified at a point after the current txg.
+	 * If the config lock is not held, do not check the txg: spa_sync
+	 * could be updating the vdev's label before updating
+	 * spa_last_synced_txg.
+ */
+ if (spa->spa_extreme_rewind || spa_last_synced_txg(spa) == 0 ||
+ spa_config_held(spa, SCL_CONFIG, RW_WRITER) != SCL_CONFIG)
+ txg = UINT64_MAX;
+ else
+ txg = spa_last_synced_txg(spa);
+
+ if ((label = vdev_label_read_config(vd, txg)) == NULL) {
+ vdev_set_state(vd, B_TRUE, VDEV_STATE_CANT_OPEN,
+ VDEV_AUX_BAD_LABEL);
+ vdev_dbgmsg(vd, "vdev_validate: failed reading config for "
+ "txg %llu", (u_longlong_t)txg);
+ return (0);
+ }
+
+ /*
+ * Determine if this vdev has been split off into another
+ * pool. If so, then refuse to open it.
+ */
+ if (nvlist_lookup_uint64(label, ZPOOL_CONFIG_SPLIT_GUID,
+ &aux_guid) == 0 && aux_guid == spa_guid(spa)) {
+ vdev_set_state(vd, B_FALSE, VDEV_STATE_CANT_OPEN,
+ VDEV_AUX_SPLIT_POOL);
+ nvlist_free(label);
+ vdev_dbgmsg(vd, "vdev_validate: vdev split into other pool");
+ return (0);
+ }
+
+ if (nvlist_lookup_uint64(label, ZPOOL_CONFIG_POOL_GUID, &guid) != 0) {
+ vdev_set_state(vd, B_FALSE, VDEV_STATE_CANT_OPEN,
+ VDEV_AUX_CORRUPT_DATA);
+ nvlist_free(label);
+ vdev_dbgmsg(vd, "vdev_validate: '%s' missing from label",
+ ZPOOL_CONFIG_POOL_GUID);
+ return (0);
+ }
+
+ /*
+ * If config is not trusted then ignore the spa guid check. This is
+ * necessary because if the machine crashed during a re-guid the new
+ * guid might have been written to all of the vdev labels, but not the
+ * cached config. The check will be performed again once we have the
+ * trusted config from the MOS.
+ */
+ if (spa->spa_trust_config && guid != spa_guid(spa)) {
+ vdev_set_state(vd, B_FALSE, VDEV_STATE_CANT_OPEN,
+ VDEV_AUX_CORRUPT_DATA);
+ nvlist_free(label);
+ vdev_dbgmsg(vd, "vdev_validate: vdev label pool_guid doesn't "
+ "match config (%llu != %llu)", (u_longlong_t)guid,
+ (u_longlong_t)spa_guid(spa));
+ return (0);
+ }
+
+ if (nvlist_lookup_nvlist(label, ZPOOL_CONFIG_VDEV_TREE, &nvl)
+ != 0 || nvlist_lookup_uint64(nvl, ZPOOL_CONFIG_ORIG_GUID,
+ &aux_guid) != 0)
+ aux_guid = 0;
+
+ if (nvlist_lookup_uint64(label, ZPOOL_CONFIG_GUID, &guid) != 0) {
+ vdev_set_state(vd, B_FALSE, VDEV_STATE_CANT_OPEN,
+ VDEV_AUX_CORRUPT_DATA);
+ nvlist_free(label);
+ vdev_dbgmsg(vd, "vdev_validate: '%s' missing from label",
+ ZPOOL_CONFIG_GUID);
+ return (0);
+ }
+
+ if (nvlist_lookup_uint64(label, ZPOOL_CONFIG_TOP_GUID, &top_guid)
+ != 0) {
+ vdev_set_state(vd, B_FALSE, VDEV_STATE_CANT_OPEN,
+ VDEV_AUX_CORRUPT_DATA);
+ nvlist_free(label);
+ vdev_dbgmsg(vd, "vdev_validate: '%s' missing from label",
+ ZPOOL_CONFIG_TOP_GUID);
+ return (0);
+ }
+
+ /*
+ * If this vdev just became a top-level vdev because its sibling was
+ * detached, it will have adopted the parent's vdev guid -- but the
+ * label may or may not be on disk yet. Fortunately, either version
+ * of the label will have the same top guid, so if we're a top-level
+ * vdev, we can safely compare to that instead.
+ * However, if the config comes from a cachefile that failed to update
+ * after the detach, a top-level vdev will appear as a non top-level
+ * vdev in the config. Also relax the constraints if we perform an
+ * extreme rewind.
+ *
+ * If we split this vdev off instead, then we also check the
+ * original pool's guid. We don't want to consider the vdev
+ * corrupt if it is partway through a split operation.
+ */
+ if (vd->vdev_guid != guid && vd->vdev_guid != aux_guid) {
+ boolean_t mismatch = B_FALSE;
+ if (spa->spa_trust_config && !spa->spa_extreme_rewind) {
+ if (vd != vd->vdev_top || vd->vdev_guid != top_guid)
+ mismatch = B_TRUE;
+ } else {
+ if (vd->vdev_guid != top_guid &&
+ vd->vdev_top->vdev_guid != guid)
+ mismatch = B_TRUE;
+ }
+
+ if (mismatch) {
+ vdev_set_state(vd, B_FALSE, VDEV_STATE_CANT_OPEN,
+ VDEV_AUX_CORRUPT_DATA);
+ nvlist_free(label);
+ vdev_dbgmsg(vd, "vdev_validate: config guid "
+ "doesn't match label guid");
+ vdev_dbgmsg(vd, "CONFIG: guid %llu, top_guid %llu",
+ (u_longlong_t)vd->vdev_guid,
+ (u_longlong_t)vd->vdev_top->vdev_guid);
+ vdev_dbgmsg(vd, "LABEL: guid %llu, top_guid %llu, "
+ "aux_guid %llu", (u_longlong_t)guid,
+ (u_longlong_t)top_guid, (u_longlong_t)aux_guid);
+ return (0);
+ }
+ }
+
+ if (nvlist_lookup_uint64(label, ZPOOL_CONFIG_POOL_STATE,
+ &state) != 0) {
+ vdev_set_state(vd, B_FALSE, VDEV_STATE_CANT_OPEN,
+ VDEV_AUX_CORRUPT_DATA);
+ nvlist_free(label);
+ vdev_dbgmsg(vd, "vdev_validate: '%s' missing from label",
+ ZPOOL_CONFIG_POOL_STATE);
+ return (0);
+ }
+
+ nvlist_free(label);
+
+ /*
+ * If this is a verbatim import, no need to check the
+ * state of the pool.
+ */
+ if (!(spa->spa_import_flags & ZFS_IMPORT_VERBATIM) &&
+ spa_load_state(spa) == SPA_LOAD_OPEN &&
+ state != POOL_STATE_ACTIVE) {
+ vdev_dbgmsg(vd, "vdev_validate: invalid pool state (%llu) "
+ "for spa %s", (u_longlong_t)state, spa->spa_name);
+ return (SET_ERROR(EBADF));
+ }
+
+ /*
+ * If we were able to open and validate a vdev that was
+ * previously marked permanently unavailable, clear that state
+ * now.
+ */
+ if (vd->vdev_not_present)
+ vd->vdev_not_present = 0;
+
+ return (0);
+}
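+
+/*
+ * Note the asymmetric error handling above: label problems merely mark
+ * the vdev VDEV_STATE_CANT_OPEN and still return 0; the only hard
+ * failure is the EBADF for an unexpected pool state. A hypothetical
+ * caller therefore has to inspect both, along the lines of:
+ *
+ *	if ((error = vdev_validate(vd)) != 0)
+ *		return (error);
+ *	if (vd->vdev_state <= VDEV_STATE_CANT_OPEN)
+ *		(handle an unreadable or mismatched label)
+ */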
+
+static void
+vdev_copy_path_impl(vdev_t *svd, vdev_t *dvd)
+{
+ if (svd->vdev_path != NULL && dvd->vdev_path != NULL) {
+ if (strcmp(svd->vdev_path, dvd->vdev_path) != 0) {
+ zfs_dbgmsg("vdev_copy_path: vdev %llu: path changed "
+ "from '%s' to '%s'", (u_longlong_t)dvd->vdev_guid,
+ dvd->vdev_path, svd->vdev_path);
+ spa_strfree(dvd->vdev_path);
+ dvd->vdev_path = spa_strdup(svd->vdev_path);
+ }
+ } else if (svd->vdev_path != NULL) {
+ dvd->vdev_path = spa_strdup(svd->vdev_path);
+ zfs_dbgmsg("vdev_copy_path: vdev %llu: path set to '%s'",
+ (u_longlong_t)dvd->vdev_guid, dvd->vdev_path);
+ }
+}
+
+/*
+ * Recursively copy vdev paths from one vdev to another. The source and
+ * destination vdev trees must have the same geometry; otherwise, an
+ * error is returned. Intended to copy
+ * paths from userland config into MOS config.
+ */
+int
+vdev_copy_path_strict(vdev_t *svd, vdev_t *dvd)
+{
+ if ((svd->vdev_ops == &vdev_missing_ops) ||
+ (svd->vdev_ishole && dvd->vdev_ishole) ||
+ (dvd->vdev_ops == &vdev_indirect_ops))
+ return (0);
+
+ if (svd->vdev_ops != dvd->vdev_ops) {
+ vdev_dbgmsg(svd, "vdev_copy_path: vdev type mismatch: %s != %s",
+ svd->vdev_ops->vdev_op_type, dvd->vdev_ops->vdev_op_type);
+ return (SET_ERROR(EINVAL));
+ }
+
+ if (svd->vdev_guid != dvd->vdev_guid) {
+ vdev_dbgmsg(svd, "vdev_copy_path: guids mismatch (%llu != "
+ "%llu)", (u_longlong_t)svd->vdev_guid,
+ (u_longlong_t)dvd->vdev_guid);
+ return (SET_ERROR(EINVAL));
+ }
+
+ if (svd->vdev_children != dvd->vdev_children) {
+ vdev_dbgmsg(svd, "vdev_copy_path: children count mismatch: "
+ "%llu != %llu", (u_longlong_t)svd->vdev_children,
+ (u_longlong_t)dvd->vdev_children);
+ return (SET_ERROR(EINVAL));
+ }
+
+ for (uint64_t i = 0; i < svd->vdev_children; i++) {
+ int error = vdev_copy_path_strict(svd->vdev_child[i],
+ dvd->vdev_child[i]);
+ if (error != 0)
+ return (error);
+ }
+
+ if (svd->vdev_ops->vdev_op_leaf)
+ vdev_copy_path_impl(svd, dvd);
+
+ return (0);
+}
+
+static void
+vdev_copy_path_search(vdev_t *stvd, vdev_t *dvd)
+{
+ ASSERT(stvd->vdev_top == stvd);
+ ASSERT3U(stvd->vdev_id, ==, dvd->vdev_top->vdev_id);
+
+ for (uint64_t i = 0; i < dvd->vdev_children; i++) {
+ vdev_copy_path_search(stvd, dvd->vdev_child[i]);
+ }
+
+ if (!dvd->vdev_ops->vdev_op_leaf || !vdev_is_concrete(dvd))
+ return;
+
+ /*
+ * The idea here is that while a vdev can shift positions within
+ * a top vdev (when replacing, attaching a mirror, etc.), it cannot
+ * step outside of it.
+ */
+ vdev_t *vd = vdev_lookup_by_guid(stvd, dvd->vdev_guid);
+
+ if (vd == NULL || vd->vdev_ops != dvd->vdev_ops)
+ return;
+
+ ASSERT(vd->vdev_ops->vdev_op_leaf);
+
+ vdev_copy_path_impl(vd, dvd);
+}
+
+/*
+ * Recursively copy vdev paths from one root vdev to another. Source and
+ * destination vdev trees may differ in geometry. For each destination leaf
+ * vdev, search a vdev with the same guid and top vdev id in the source.
+ * Intended to copy paths from userland config into MOS config.
+ */
+void
+vdev_copy_path_relaxed(vdev_t *srvd, vdev_t *drvd)
+{
+ uint64_t children = MIN(srvd->vdev_children, drvd->vdev_children);
+ ASSERT(srvd->vdev_ops == &vdev_root_ops);
+ ASSERT(drvd->vdev_ops == &vdev_root_ops);
+
+ for (uint64_t i = 0; i < children; i++) {
+ vdev_copy_path_search(srvd->vdev_child[i],
+ drvd->vdev_child[i]);
+ }
+}
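+
+/*
+ * A plausible way to combine the two variants above (a sketch; the rvd
+ * and mrvd names are illustrative): try the strict copy against the
+ * trusted config first, and fall back to the relaxed search if the
+ * geometries turn out to differ:
+ *
+ *	if (vdev_copy_path_strict(rvd, mrvd) != 0)
+ *		vdev_copy_path_relaxed(rvd, mrvd);
+ */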
+
+/*
+ * Close a virtual device.
+ */
+void
+vdev_close(vdev_t *vd)
+{
+ spa_t *spa = vd->vdev_spa;
+ vdev_t *pvd = vd->vdev_parent;
+
+ ASSERT(spa_config_held(spa, SCL_STATE_ALL, RW_WRITER) == SCL_STATE_ALL);
+
+ /*
+ * If our parent is reopening, then we are as well, unless we are
+ * going offline.
+ */
+ if (pvd != NULL && pvd->vdev_reopening)
+ vd->vdev_reopening = (pvd->vdev_reopening && !vd->vdev_offline);
+
+ vd->vdev_ops->vdev_op_close(vd);
+
+ vdev_cache_purge(vd);
+
+ if (vd->vdev_ops->vdev_op_leaf)
+ trim_map_destroy(vd);
+
+ /*
+ * We record the previous state before we close it, so that if we are
+ * doing a reopen(), we don't generate FMA ereports if we notice that
+ * it's still faulted.
+ */
+ vd->vdev_prevstate = vd->vdev_state;
+
+ if (vd->vdev_offline)
+ vd->vdev_state = VDEV_STATE_OFFLINE;
+ else
+ vd->vdev_state = VDEV_STATE_CLOSED;
+ vd->vdev_stat.vs_aux = VDEV_AUX_NONE;
+}
+
+void
+vdev_hold(vdev_t *vd)
+{
+ spa_t *spa = vd->vdev_spa;
+
+ ASSERT(spa_is_root(spa));
+ if (spa->spa_state == POOL_STATE_UNINITIALIZED)
+ return;
+
+ for (int c = 0; c < vd->vdev_children; c++)
+ vdev_hold(vd->vdev_child[c]);
+
+ if (vd->vdev_ops->vdev_op_leaf)
+ vd->vdev_ops->vdev_op_hold(vd);
+}
+
+void
+vdev_rele(vdev_t *vd)
+{
+ spa_t *spa = vd->vdev_spa;
+
+ ASSERT(spa_is_root(spa));
+ for (int c = 0; c < vd->vdev_children; c++)
+ vdev_rele(vd->vdev_child[c]);
+
+ if (vd->vdev_ops->vdev_op_leaf)
+ vd->vdev_ops->vdev_op_rele(vd);
+}
+
+/*
+ * Reopen all interior vdevs and any unopened leaves. We don't actually
+ * reopen leaf vdevs which had previously been opened, as they might deadlock
+ * on the spa_config_lock. Instead we only obtain the leaf's physical size.
+ * If the leaf has never been opened then open it, as usual.
+ */
+void
+vdev_reopen(vdev_t *vd)
+{
+ spa_t *spa = vd->vdev_spa;
+
+ ASSERT(spa_config_held(spa, SCL_STATE_ALL, RW_WRITER) == SCL_STATE_ALL);
+
+ /* set the reopening flag unless we're taking the vdev offline */
+ vd->vdev_reopening = !vd->vdev_offline;
+ vdev_close(vd);
+ (void) vdev_open(vd);
+
+ /*
+ * Call vdev_validate() here to make sure we have the same device.
+ * Otherwise, a device with an invalid label could be successfully
+ * opened in response to vdev_reopen().
+ */
+ if (vd->vdev_aux) {
+ (void) vdev_validate_aux(vd);
+ if (vdev_readable(vd) && vdev_writeable(vd) &&
+ vd->vdev_aux == &spa->spa_l2cache &&
+ !l2arc_vdev_present(vd))
+ l2arc_add_vdev(spa, vd);
+ } else {
+ (void) vdev_validate(vd);
+ }
+
+ /*
+ * Reassess parent vdev's health.
+ */
+ vdev_propagate_state(vd);
+}
+
+int
+vdev_create(vdev_t *vd, uint64_t txg, boolean_t isreplacing)
+{
+ int error;
+
+ /*
+ * Normally, partial opens (e.g. of a mirror) are allowed.
+ * For a create, however, we want to fail the request if
+ * there are any components we can't open.
+ */
+ error = vdev_open(vd);
+
+ if (error || vd->vdev_state != VDEV_STATE_HEALTHY) {
+ vdev_close(vd);
+ return (error ? error : ENXIO);
+ }
+
+ /*
+ * Recursively load DTLs and initialize all labels.
+ */
+ if ((error = vdev_dtl_load(vd)) != 0 ||
+ (error = vdev_label_init(vd, txg, isreplacing ?
+ VDEV_LABEL_REPLACE : VDEV_LABEL_CREATE)) != 0) {
+ vdev_close(vd);
+ return (error);
+ }
+
+ return (0);
+}
+
+void
+vdev_metaslab_set_size(vdev_t *vd)
+{
+ uint64_t asize = vd->vdev_asize;
+ uint64_t ms_count = asize >> zfs_vdev_default_ms_shift;
+ uint64_t ms_shift;
+
+ /*
+ * There are two dimensions to the metaslab sizing calculation:
+ * the size of the metaslab and the count of metaslabs per vdev.
+ *
+ * The default values used below are a good balance between memory
+ * usage (larger metaslab size means more memory needed for loaded
+ * metaslabs; more metaslabs means more memory needed for the
+ * metaslab_t structs), metaslab load time (larger metaslabs take
+ * longer to load), and metaslab sync time (more metaslabs means
+ * more time spent syncing all of them).
+ *
+ * In general, we aim for zfs_vdev_default_ms_count (200) metaslabs.
+ * The range of the dimensions are as follows:
+ *
+ * 2^29 <= ms_size <= 2^34
+ * 16 <= ms_count <= 131,072
+ *
+ * On the lower end of vdev sizes, we aim for metaslabs sizes of
+ * at least 512MB (2^29) to minimize fragmentation effects when
+ * testing with smaller devices. However, the count constraint
+ * of at least 16 metaslabs will override this minimum size goal.
+ *
+ * On the upper end of vdev sizes, we aim for a maximum metaslab
+ * size of 16GB. However, we will cap the total count to 2^17
+ * metaslabs to keep our memory footprint in check and let the
+ * metaslab size grow from there if that limit is hit.
+ *
+ * The net effect of applying the above constraints is summarized below.
+ *
+ * vdev size metaslab count
+ * --------------|-----------------
+ * < 8GB ~16
+ * 8GB - 100GB one per 512MB
+ * 100GB - 3TB ~200
+ * 3TB - 2PB one per 16GB
+ * > 2PB ~131,072
+ * --------------------------------
+ *
+ * Finally, note that the above calculations determine the initial
+ * number of metaslabs. Expanding a top-level vdev will result
+ * in additional metaslabs being allocated, making it possible
+ * to exceed zfs_vdev_ms_count_limit.
+ */
+
+ if (ms_count < zfs_vdev_min_ms_count)
+ ms_shift = highbit64(asize / zfs_vdev_min_ms_count);
+ else if (ms_count > zfs_vdev_default_ms_count)
+ ms_shift = highbit64(asize / zfs_vdev_default_ms_count);
+ else
+ ms_shift = zfs_vdev_default_ms_shift;
+
+ if (ms_shift < SPA_MAXBLOCKSHIFT) {
+ ms_shift = SPA_MAXBLOCKSHIFT;
+ } else if (ms_shift > zfs_vdev_max_ms_shift) {
+ ms_shift = zfs_vdev_max_ms_shift;
+ /* cap the total count to constrain memory footprint */
+ if ((asize >> ms_shift) > zfs_vdev_ms_count_limit)
+ ms_shift = highbit64(asize / zfs_vdev_ms_count_limit);
+ }
+
+ vd->vdev_ms_shift = ms_shift;
+ ASSERT3U(vd->vdev_ms_shift, >=, SPA_MAXBLOCKSHIFT);
+}
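+
+/*
+ * Worked example (assuming the defaults described above: a target of
+ * zfs_vdev_default_ms_count == 200 metaslabs and a default metaslab
+ * size of 2^29 == 512MB): for a hypothetical 1TB (2^40 byte) vdev,
+ * ms_count = 2^40 >> 29 = 2048, which exceeds the 200 target, so
+ * ms_shift = highbit64(2^40 / 200) = 33. That yields 2^33 == 8GB
+ * metaslabs and an initial count of 2^40 >> 33 = 128, on the order
+ * of the ~200 shown in the table above.
+ */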
+
+/*
+ * Maximize performance by inflating the configured ashift for top level
+ * vdevs to be as close to the physical ashift as possible while maintaining
+ * administrator defined limits and ensuring it doesn't go below the
+ * logical ashift.
+ */
+void
+vdev_ashift_optimize(vdev_t *vd)
+{
+ if (vd == vd->vdev_top) {
+ if (vd->vdev_ashift < vd->vdev_physical_ashift) {
+ vd->vdev_ashift = MIN(
+ MAX(zfs_max_auto_ashift, vd->vdev_ashift),
+ MAX(zfs_min_auto_ashift, vd->vdev_physical_ashift));
+ } else {
+ /*
+ * Unusual case where logical ashift > physical ashift
+ * so we can't cap the calculated ashift based on max
+ * ashift as that would cause failures.
+ * We still check if we need to increase it to match
+ * the min ashift.
+ */
+ vd->vdev_ashift = MAX(zfs_min_auto_ashift,
+ vd->vdev_ashift);
+ }
+ }
+}
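+
+/*
+ * For example, a top-level vdev with a configured ashift of 9 (512-byte
+ * logical sectors) on a device reporting a physical ashift of 12 (4KB
+ * sectors), assuming illustrative tunable settings of
+ * zfs_min_auto_ashift == 9 and zfs_max_auto_ashift == 13, is inflated to
+ * MIN(MAX(13, 9), MAX(9, 12)) == MIN(13, 12) == 12, aligning allocations
+ * to the physical sector size without exceeding the administrative cap.
+ */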
+
+void
+vdev_dirty(vdev_t *vd, int flags, void *arg, uint64_t txg)
+{
+ ASSERT(vd == vd->vdev_top);
+ /* indirect vdevs don't have metaslabs or dtls */
+ ASSERT(vdev_is_concrete(vd) || flags == 0);
+ ASSERT(ISP2(flags));
+ ASSERT(spa_writeable(vd->vdev_spa));
+
+ if (flags & VDD_METASLAB)
+ (void) txg_list_add(&vd->vdev_ms_list, arg, txg);
+
+ if (flags & VDD_DTL)
+ (void) txg_list_add(&vd->vdev_dtl_list, arg, txg);
+
+ (void) txg_list_add(&vd->vdev_spa->spa_vdev_txg_list, vd, txg);
+}
+
+void
+vdev_dirty_leaves(vdev_t *vd, int flags, uint64_t txg)
+{
+ for (int c = 0; c < vd->vdev_children; c++)
+ vdev_dirty_leaves(vd->vdev_child[c], flags, txg);
+
+ if (vd->vdev_ops->vdev_op_leaf)
+ vdev_dirty(vd->vdev_top, flags, vd, txg);
+}
+
+/*
+ * DTLs.
+ *
+ * A vdev's DTL (dirty time log) is the set of transaction groups for which
+ * the vdev has less than perfect replication. There are four kinds of DTL:
+ *
+ * DTL_MISSING: txgs for which the vdev has no valid copies of the data
+ *
+ * DTL_PARTIAL: txgs for which data is available, but not fully replicated
+ *
+ * DTL_SCRUB: the txgs that could not be repaired by the last scrub; upon
+ * scrub completion, DTL_SCRUB replaces DTL_MISSING in the range of
+ * txgs that was scrubbed.
+ *
+ * DTL_OUTAGE: txgs which cannot currently be read, whether due to
+ * persistent errors or just some device being offline.
+ * Unlike the other three, the DTL_OUTAGE map is not generally
+ * maintained; it's only computed when needed, typically to
+ * determine whether a device can be detached.
+ *
+ * For leaf vdevs, DTL_MISSING and DTL_PARTIAL are identical: the device
+ * either has the data or it doesn't.
+ *
+ * For interior vdevs such as mirror and RAID-Z the picture is more complex.
+ * A vdev's DTL_PARTIAL is the union of its children's DTL_PARTIALs, because
+ * if any child is less than fully replicated, then so is its parent.
+ * A vdev's DTL_MISSING is a modified union of its children's DTL_MISSINGs,
+ * comprising only those txgs which appear in 'maxfaults + 1' or more children;
+ * those are the txgs we don't have enough replication to read. For example,
+ * double-parity RAID-Z can tolerate up to two missing devices (maxfaults == 2);
+ * thus, its DTL_MISSING consists of the set of txgs that appear in more than
+ * two child DTL_MISSING maps.
+ *
+ * It should be clear from the above that to compute the DTLs and outage maps
+ * for all vdevs, it suffices to know just the leaf vdevs' DTL_MISSING maps.
+ * Therefore, that is all we keep on disk. When loading the pool, or after
+ * a configuration change, we generate all other DTLs from first principles.
+ */
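+
+/*
+ * As a concrete illustration of the rules above: consider a raidz2
+ * top-level vdev (maxfaults == 2), where a txg must appear in at least
+ * maxfaults + 1 == 3 children to be missing. If txg 100 is missing on
+ * two children and txg 101 is missing on three, only txg 101 lands in
+ * the parent's DTL_MISSING, while DTL_PARTIAL picks up both txgs since
+ * any single under-replicated child suffices there.
+ */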
+void
+vdev_dtl_dirty(vdev_t *vd, vdev_dtl_type_t t, uint64_t txg, uint64_t size)
+{
+ range_tree_t *rt = vd->vdev_dtl[t];
+
+ ASSERT(t < DTL_TYPES);
+ ASSERT(vd != vd->vdev_spa->spa_root_vdev);
+ ASSERT(spa_writeable(vd->vdev_spa));
+
+ mutex_enter(&vd->vdev_dtl_lock);
+ if (!range_tree_contains(rt, txg, size))
+ range_tree_add(rt, txg, size);
+ mutex_exit(&vd->vdev_dtl_lock);
+}
+
+boolean_t
+vdev_dtl_contains(vdev_t *vd, vdev_dtl_type_t t, uint64_t txg, uint64_t size)
+{
+ range_tree_t *rt = vd->vdev_dtl[t];
+ boolean_t dirty = B_FALSE;
+
+ ASSERT(t < DTL_TYPES);
+ ASSERT(vd != vd->vdev_spa->spa_root_vdev);
+
+ /*
+ * While we are loading the pool, the DTLs have not been loaded yet.
+ * Ignore the DTLs and try all devices. This avoids a recursive
+ * mutex enter on the vdev_dtl_lock, and also makes us try hard
+ * when loading the pool (relying on the checksum to ensure that
+ * we get the right data -- note that while loading, we are
+ * only reading the MOS, which is always checksummed).
+ */
+ if (vd->vdev_spa->spa_load_state != SPA_LOAD_NONE)
+ return (B_FALSE);
+
+ mutex_enter(&vd->vdev_dtl_lock);
+ if (!range_tree_is_empty(rt))
+ dirty = range_tree_contains(rt, txg, size);
+ mutex_exit(&vd->vdev_dtl_lock);
+
+ return (dirty);
+}
+
+boolean_t
+vdev_dtl_empty(vdev_t *vd, vdev_dtl_type_t t)
+{
+ range_tree_t *rt = vd->vdev_dtl[t];
+ boolean_t empty;
+
+ mutex_enter(&vd->vdev_dtl_lock);
+ empty = range_tree_is_empty(rt);
+ mutex_exit(&vd->vdev_dtl_lock);
+
+ return (empty);
+}
+
+/*
+ * Returns B_TRUE if vdev determines offset needs to be resilvered.
+ */
+boolean_t
+vdev_dtl_need_resilver(vdev_t *vd, uint64_t offset, size_t psize)
+{
+ ASSERT(vd != vd->vdev_spa->spa_root_vdev);
+
+ if (vd->vdev_ops->vdev_op_need_resilver == NULL ||
+ vd->vdev_ops->vdev_op_leaf)
+ return (B_TRUE);
+
+ return (vd->vdev_ops->vdev_op_need_resilver(vd, offset, psize));
+}
+
+/*
+ * Returns the lowest txg in the DTL range.
+ */
+static uint64_t
+vdev_dtl_min(vdev_t *vd)
+{
+ range_seg_t *rs;
+
+ ASSERT(MUTEX_HELD(&vd->vdev_dtl_lock));
+ ASSERT3U(range_tree_space(vd->vdev_dtl[DTL_MISSING]), !=, 0);
+ ASSERT0(vd->vdev_children);
+
+ rs = avl_first(&vd->vdev_dtl[DTL_MISSING]->rt_root);
+ return (rs->rs_start - 1);
+}
+
+/*
+ * Returns the highest txg in the DTL.
+ */
+static uint64_t
+vdev_dtl_max(vdev_t *vd)
+{
+ range_seg_t *rs;
+
+ ASSERT(MUTEX_HELD(&vd->vdev_dtl_lock));
+ ASSERT3U(range_tree_space(vd->vdev_dtl[DTL_MISSING]), !=, 0);
+ ASSERT0(vd->vdev_children);
+
+ rs = avl_last(&vd->vdev_dtl[DTL_MISSING]->rt_root);
+ return (rs->rs_end);
+}
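+
+/*
+ * A note on the two helpers above: DTL entries are added as (txg, size)
+ * segments (typically size 1), so the range tree holds half-open
+ * [start, end) intervals and rs_end already points one past the last
+ * missing txg. The rs_start - 1 in vdev_dtl_min() mirrors that
+ * convention, so callers such as vdev_resilver_needed() obtain a txg
+ * window that safely brackets every missing txg.
+ */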
+
+/*
+ * Determine if a resilvering vdev should remove any DTL entries from
+ * its range. If the vdev was resilvering for the entire duration of the
+ * scan then it should excise that range from its DTLs. Otherwise, this
+ * vdev is considered partially resilvered and should leave its DTL
+ * entries intact. The comment in vdev_dtl_reassess() describes how we
+ * excise the DTLs.
+ */
+static boolean_t
+vdev_dtl_should_excise(vdev_t *vd)
+{
+ spa_t *spa = vd->vdev_spa;
+ dsl_scan_t *scn = spa->spa_dsl_pool->dp_scan;
+
+ ASSERT0(scn->scn_phys.scn_errors);
+ ASSERT0(vd->vdev_children);
+
+ if (vd->vdev_state < VDEV_STATE_DEGRADED)
+ return (B_FALSE);
+
+ if (vd->vdev_resilver_txg == 0 ||
+ range_tree_is_empty(vd->vdev_dtl[DTL_MISSING]))
+ return (B_TRUE);
+
+ /*
+ * When a resilver is initiated the scan will assign the scn_max_txg
+ * value to the highest txg value that exists in all DTLs. If this
+ * device's max DTL is not part of this scan (i.e. it is not in
+ * the range (scn_min_txg, scn_max_txg]), then it is not eligible
+ * for excision.
+ */
+ if (vdev_dtl_max(vd) <= scn->scn_phys.scn_max_txg) {
+ ASSERT3U(scn->scn_phys.scn_min_txg, <=, vdev_dtl_min(vd));
+ ASSERT3U(scn->scn_phys.scn_min_txg, <, vd->vdev_resilver_txg);
+ ASSERT3U(vd->vdev_resilver_txg, <=, scn->scn_phys.scn_max_txg);
+ return (B_TRUE);
+ }
+ return (B_FALSE);
+}
+
+/*
+ * Reassess DTLs after a config change or scrub completion.
+ */
+void
+vdev_dtl_reassess(vdev_t *vd, uint64_t txg, uint64_t scrub_txg, int scrub_done)
+{
+ spa_t *spa = vd->vdev_spa;
+ avl_tree_t reftree;
+ int minref;
+
+ ASSERT(spa_config_held(spa, SCL_ALL, RW_READER) != 0);
+
+ for (int c = 0; c < vd->vdev_children; c++)
+ vdev_dtl_reassess(vd->vdev_child[c], txg,
+ scrub_txg, scrub_done);
+
+ if (vd == spa->spa_root_vdev || !vdev_is_concrete(vd) || vd->vdev_aux)
+ return;
+
+ if (vd->vdev_ops->vdev_op_leaf) {
+ dsl_scan_t *scn = spa->spa_dsl_pool->dp_scan;
+
+ mutex_enter(&vd->vdev_dtl_lock);
+
+ /*
+ * If we've completed a scan cleanly then determine
+ * if this vdev should remove any DTLs. We only want to
+ * excise regions on vdevs that were available during
+ * the entire duration of this scan.
+ */
+ if (scrub_txg != 0 &&
+ (spa->spa_scrub_started ||
+ (scn != NULL && scn->scn_phys.scn_errors == 0)) &&
+ vdev_dtl_should_excise(vd)) {
+ /*
+ * We completed a scrub up to scrub_txg. If we
+ * did it without rebooting, then the scrub dtl
+ * will be valid, so excise the old region and
+ * fold in the scrub dtl. Otherwise, leave the
+ * dtl as-is if there was an error.
+ *
+ * There's a little trick here: to excise the beginning
+ * of the DTL_MISSING map, we put it into a reference
+ * tree and then add a segment with refcnt -1 that
+ * covers the range [0, scrub_txg). This means
+ * that each txg in that range has refcnt -1 or 0.
+ * We then add DTL_SCRUB with a refcnt of 2, so that
+ * entries in the range [0, scrub_txg) will have a
+ * positive refcnt -- either 1 or 2. We then convert
+ * the reference tree into the new DTL_MISSING map.
+ */
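+ /*
+ * For instance, if DTL_MISSING covers txgs [90, 110),
+ * scrub_txg is 100 and DTL_SCRUB still holds [95, 97),
+ * the resulting reference counts are:
+ * [90, 95) -> 1 - 1 = 0 (excised),
+ * [95, 97) -> 1 - 1 + 2 = 2 (kept),
+ * [97, 100) -> 1 - 1 = 0 (excised),
+ * [100, 110) -> 1 (kept),
+ * so the regenerated DTL_MISSING becomes
+ * [95, 97) plus [100, 110).
+ */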
+ space_reftree_create(&reftree);
+ space_reftree_add_map(&reftree,
+ vd->vdev_dtl[DTL_MISSING], 1);
+ space_reftree_add_seg(&reftree, 0, scrub_txg, -1);
+ space_reftree_add_map(&reftree,
+ vd->vdev_dtl[DTL_SCRUB], 2);
+ space_reftree_generate_map(&reftree,
+ vd->vdev_dtl[DTL_MISSING], 1);
+ space_reftree_destroy(&reftree);
+ }
+ range_tree_vacate(vd->vdev_dtl[DTL_PARTIAL], NULL, NULL);
+ range_tree_walk(vd->vdev_dtl[DTL_MISSING],
+ range_tree_add, vd->vdev_dtl[DTL_PARTIAL]);
+ if (scrub_done)
+ range_tree_vacate(vd->vdev_dtl[DTL_SCRUB], NULL, NULL);
+ range_tree_vacate(vd->vdev_dtl[DTL_OUTAGE], NULL, NULL);
+ if (!vdev_readable(vd))
+ range_tree_add(vd->vdev_dtl[DTL_OUTAGE], 0, -1ULL);
+ else
+ range_tree_walk(vd->vdev_dtl[DTL_MISSING],
+ range_tree_add, vd->vdev_dtl[DTL_OUTAGE]);
+
+ /*
+ * If the vdev was resilvering and no longer has any
+ * DTLs then reset its resilvering flag and dirty
+ * the top level so that we persist the change.
+ */
+ if (vd->vdev_resilver_txg != 0 &&
+ range_tree_is_empty(vd->vdev_dtl[DTL_MISSING]) &&
+ range_tree_is_empty(vd->vdev_dtl[DTL_OUTAGE])) {
+ vd->vdev_resilver_txg = 0;
+ vdev_config_dirty(vd->vdev_top);
+ }
+
+ mutex_exit(&vd->vdev_dtl_lock);
+
+ if (txg != 0)
+ vdev_dirty(vd->vdev_top, VDD_DTL, vd, txg);
+ return;
+ }
+
+ mutex_enter(&vd->vdev_dtl_lock);
+ for (int t = 0; t < DTL_TYPES; t++) {
+ /* account for child's outage in parent's missing map */
+ int s = (t == DTL_MISSING) ? DTL_OUTAGE : t;
+ if (t == DTL_SCRUB)
+ continue; /* leaf vdevs only */
+ if (t == DTL_PARTIAL)
+ minref = 1; /* i.e. non-zero */
+ else if (vd->vdev_nparity != 0)
+ minref = vd->vdev_nparity + 1; /* RAID-Z */
+ else
+ minref = vd->vdev_children; /* any kind of mirror */
+ space_reftree_create(&reftree);
+ for (int c = 0; c < vd->vdev_children; c++) {
+ vdev_t *cvd = vd->vdev_child[c];
+ mutex_enter(&cvd->vdev_dtl_lock);
+ space_reftree_add_map(&reftree, cvd->vdev_dtl[s], 1);
+ mutex_exit(&cvd->vdev_dtl_lock);
+ }
+ space_reftree_generate_map(&reftree, vd->vdev_dtl[t], minref);
+ space_reftree_destroy(&reftree);
+ }
+ mutex_exit(&vd->vdev_dtl_lock);
+}
+
+int
+vdev_dtl_load(vdev_t *vd)
+{
+ spa_t *spa = vd->vdev_spa;
+ objset_t *mos = spa->spa_meta_objset;
+ int error = 0;
+
+ if (vd->vdev_ops->vdev_op_leaf && vd->vdev_dtl_object != 0) {
+ ASSERT(vdev_is_concrete(vd));
+
+ error = space_map_open(&vd->vdev_dtl_sm, mos,
+ vd->vdev_dtl_object, 0, -1ULL, 0);
+ if (error)
+ return (error);
+ ASSERT(vd->vdev_dtl_sm != NULL);
+
+ mutex_enter(&vd->vdev_dtl_lock);
+ error = space_map_load(vd->vdev_dtl_sm,
+ vd->vdev_dtl[DTL_MISSING], SM_ALLOC);
+ mutex_exit(&vd->vdev_dtl_lock);
+
+ return (error);
+ }
+
+ for (int c = 0; c < vd->vdev_children; c++) {
+ error = vdev_dtl_load(vd->vdev_child[c]);
+ if (error != 0)
+ break;
+ }
+
+ return (error);
+}
+
+static void
+vdev_zap_allocation_data(vdev_t *vd, dmu_tx_t *tx)
+{
+ spa_t *spa = vd->vdev_spa;
+ objset_t *mos = spa->spa_meta_objset;
+ vdev_alloc_bias_t alloc_bias = vd->vdev_alloc_bias;
+ const char *string;
+
+ ASSERT(alloc_bias != VDEV_BIAS_NONE);
+
+ string =
+ (alloc_bias == VDEV_BIAS_LOG) ? VDEV_ALLOC_BIAS_LOG :
+ (alloc_bias == VDEV_BIAS_SPECIAL) ? VDEV_ALLOC_BIAS_SPECIAL :
+ (alloc_bias == VDEV_BIAS_DEDUP) ? VDEV_ALLOC_BIAS_DEDUP : NULL;
+
+ ASSERT(string != NULL);
+ VERIFY0(zap_add(mos, vd->vdev_top_zap, VDEV_TOP_ZAP_ALLOCATION_BIAS,
+ 1, strlen(string) + 1, string, tx));
+
+ if (alloc_bias == VDEV_BIAS_SPECIAL || alloc_bias == VDEV_BIAS_DEDUP) {
+ spa_activate_allocation_classes(spa, tx);
+ }
+}
+
+void
+vdev_destroy_unlink_zap(vdev_t *vd, uint64_t zapobj, dmu_tx_t *tx)
+{
+ spa_t *spa = vd->vdev_spa;
+
+ VERIFY0(zap_destroy(spa->spa_meta_objset, zapobj, tx));
+ VERIFY0(zap_remove_int(spa->spa_meta_objset, spa->spa_all_vdev_zaps,
+ zapobj, tx));
+}
+
+uint64_t
+vdev_create_link_zap(vdev_t *vd, dmu_tx_t *tx)
+{
+ spa_t *spa = vd->vdev_spa;
+ uint64_t zap = zap_create(spa->spa_meta_objset, DMU_OTN_ZAP_METADATA,
+ DMU_OT_NONE, 0, tx);
+
+ ASSERT(zap != 0);
+ VERIFY0(zap_add_int(spa->spa_meta_objset, spa->spa_all_vdev_zaps,
+ zap, tx));
+
+ return (zap);
+}
+
+void
+vdev_construct_zaps(vdev_t *vd, dmu_tx_t *tx)
+{
+ if (vd->vdev_ops != &vdev_hole_ops &&
+ vd->vdev_ops != &vdev_missing_ops &&
+ vd->vdev_ops != &vdev_root_ops &&
+ !vd->vdev_top->vdev_removing) {
+ if (vd->vdev_ops->vdev_op_leaf && vd->vdev_leaf_zap == 0) {
+ vd->vdev_leaf_zap = vdev_create_link_zap(vd, tx);
+ }
+ if (vd == vd->vdev_top && vd->vdev_top_zap == 0) {
+ vd->vdev_top_zap = vdev_create_link_zap(vd, tx);
+ if (vd->vdev_alloc_bias != VDEV_BIAS_NONE)
+ vdev_zap_allocation_data(vd, tx);
+ }
+ }
+
+ for (uint64_t i = 0; i < vd->vdev_children; i++) {
+ vdev_construct_zaps(vd->vdev_child[i], tx);
+ }
+}
+
+void
+vdev_dtl_sync(vdev_t *vd, uint64_t txg)
+{
+ spa_t *spa = vd->vdev_spa;
+ range_tree_t *rt = vd->vdev_dtl[DTL_MISSING];
+ objset_t *mos = spa->spa_meta_objset;
+ range_tree_t *rtsync;
+ dmu_tx_t *tx;
+ uint64_t object = space_map_object(vd->vdev_dtl_sm);
+
+ ASSERT(vdev_is_concrete(vd));
+ ASSERT(vd->vdev_ops->vdev_op_leaf);
+
+ tx = dmu_tx_create_assigned(spa->spa_dsl_pool, txg);
+
+ if (vd->vdev_detached || vd->vdev_top->vdev_removing) {
+ mutex_enter(&vd->vdev_dtl_lock);
+ space_map_free(vd->vdev_dtl_sm, tx);
+ space_map_close(vd->vdev_dtl_sm);
+ vd->vdev_dtl_sm = NULL;
+ mutex_exit(&vd->vdev_dtl_lock);
+
+ /*
+ * We only destroy the leaf ZAP for detached leaves or for
+ * removed log devices. Removed data devices handle leaf ZAP
+ * cleanup later, once cancellation is no longer possible.
+ */
+ if (vd->vdev_leaf_zap != 0 && (vd->vdev_detached ||
+ vd->vdev_top->vdev_islog)) {
+ vdev_destroy_unlink_zap(vd, vd->vdev_leaf_zap, tx);
+ vd->vdev_leaf_zap = 0;
+ }
+
+ dmu_tx_commit(tx);
+ return;
+ }
+
+ if (vd->vdev_dtl_sm == NULL) {
+ uint64_t new_object;
+
+ new_object = space_map_alloc(mos, vdev_dtl_sm_blksz, tx);
+ VERIFY3U(new_object, !=, 0);
+
+ VERIFY0(space_map_open(&vd->vdev_dtl_sm, mos, new_object,
+ 0, -1ULL, 0));
+ ASSERT(vd->vdev_dtl_sm != NULL);
+ }
+
+ rtsync = range_tree_create(NULL, NULL);
+
+ mutex_enter(&vd->vdev_dtl_lock);
+ range_tree_walk(rt, range_tree_add, rtsync);
+ mutex_exit(&vd->vdev_dtl_lock);
+
+ space_map_truncate(vd->vdev_dtl_sm, vdev_dtl_sm_blksz, tx);
+ space_map_write(vd->vdev_dtl_sm, rtsync, SM_ALLOC, SM_NO_VDEVID, tx);
+ range_tree_vacate(rtsync, NULL, NULL);
+
+ range_tree_destroy(rtsync);
+
+ /*
+ * If the object for the space map has changed then dirty
+ * the top level so that we update the config.
+ */
+ if (object != space_map_object(vd->vdev_dtl_sm)) {
+ vdev_dbgmsg(vd, "txg %llu, spa %s, DTL old object %llu, "
+ "new object %llu", (u_longlong_t)txg, spa_name(spa),
+ (u_longlong_t)object,
+ (u_longlong_t)space_map_object(vd->vdev_dtl_sm));
+ vdev_config_dirty(vd->vdev_top);
+ }
+
+ dmu_tx_commit(tx);
+}
+
+/*
+ * Determine whether the specified vdev can be offlined/detached/removed
+ * without losing data.
+ */
+boolean_t
+vdev_dtl_required(vdev_t *vd)
+{
+ spa_t *spa = vd->vdev_spa;
+ vdev_t *tvd = vd->vdev_top;
+ uint8_t cant_read = vd->vdev_cant_read;
+ boolean_t required;
+
+ ASSERT(spa_config_held(spa, SCL_STATE_ALL, RW_WRITER) == SCL_STATE_ALL);
+
+ if (vd == spa->spa_root_vdev || vd == tvd)
+ return (B_TRUE);
+
+ /*
+ * Temporarily mark the device as unreadable, and then determine
+ * whether this results in any DTL outages in the top-level vdev.
+ * If not, we can safely offline/detach/remove the device.
+ */
+ vd->vdev_cant_read = B_TRUE;
+ vdev_dtl_reassess(tvd, 0, 0, B_FALSE);
+ required = !vdev_dtl_empty(tvd, DTL_OUTAGE);
+ vd->vdev_cant_read = cant_read;
+ vdev_dtl_reassess(tvd, 0, 0, B_FALSE);
+
+ if (!required && zio_injection_enabled)
+ required = !!zio_handle_device_injection(vd, NULL, ECHILD);
+
+ return (required);
+}
+
+/*
+ * Determine if resilver is needed, and if so the txg range.
+ */
+boolean_t
+vdev_resilver_needed(vdev_t *vd, uint64_t *minp, uint64_t *maxp)
+{
+ boolean_t needed = B_FALSE;
+ uint64_t thismin = UINT64_MAX;
+ uint64_t thismax = 0;
+
+ if (vd->vdev_children == 0) {
+ mutex_enter(&vd->vdev_dtl_lock);
+ if (!range_tree_is_empty(vd->vdev_dtl[DTL_MISSING]) &&
+ vdev_writeable(vd)) {
+
+ thismin = vdev_dtl_min(vd);
+ thismax = vdev_dtl_max(vd);
+ needed = B_TRUE;
+ }
+ mutex_exit(&vd->vdev_dtl_lock);
+ } else {
+ for (int c = 0; c < vd->vdev_children; c++) {
+ vdev_t *cvd = vd->vdev_child[c];
+ uint64_t cmin, cmax;
+
+ if (vdev_resilver_needed(cvd, &cmin, &cmax)) {
+ thismin = MIN(thismin, cmin);
+ thismax = MAX(thismax, cmax);
+ needed = B_TRUE;
+ }
+ }
+ }
+
+ if (needed && minp) {
+ *minp = thismin;
+ *maxp = thismax;
+ }
+ return (needed);
+}
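+
+/*
+ * A typical caller probes from the root vdev and, when B_TRUE is
+ * returned, uses the reported window to bound the resilver scan
+ * (illustrative sketch):
+ *
+ *	uint64_t min_txg, max_txg;
+ *	if (vdev_resilver_needed(spa->spa_root_vdev, &min_txg, &max_txg))
+ *		(start a scan limited to blocks born in that txg window)
+ */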
+
+/*
+ * Gets the checkpoint space map object from the vdev's ZAP.
+ * Returns the spacemap object, or 0 if it wasn't in the ZAP
+ * or the ZAP doesn't exist yet.
+ */
+uint64_t
+vdev_checkpoint_sm_object(vdev_t *vd)
+{
+ ASSERT0(spa_config_held(vd->vdev_spa, SCL_ALL, RW_WRITER));
+ if (vd->vdev_top_zap == 0) {
+ return (0);
+ }
+
+ uint64_t sm_obj = 0;
+ int err = zap_lookup(spa_meta_objset(vd->vdev_spa), vd->vdev_top_zap,
+ VDEV_TOP_ZAP_POOL_CHECKPOINT_SM, sizeof (uint64_t), 1, &sm_obj);
+
+ ASSERT(err == 0 || err == ENOENT);
+
+ return (sm_obj);
+}
+
+int
+vdev_load(vdev_t *vd)
+{
+ int error = 0;
+ /*
+ * Recursively load all children.
+ */
+ for (int c = 0; c < vd->vdev_children; c++) {
+ error = vdev_load(vd->vdev_child[c]);
+ if (error != 0) {
+ return (error);
+ }
+ }
+
+ vdev_set_deflate_ratio(vd);
+
+ /*
+ * On spa_load path, grab the allocation bias from our zap
+ */
+ if (vd == vd->vdev_top && vd->vdev_top_zap != 0) {
+ spa_t *spa = vd->vdev_spa;
+ char bias_str[64];
+
+ if (zap_lookup(spa->spa_meta_objset, vd->vdev_top_zap,
+ VDEV_TOP_ZAP_ALLOCATION_BIAS, 1, sizeof (bias_str),
+ bias_str) == 0) {
+ ASSERT(vd->vdev_alloc_bias == VDEV_BIAS_NONE);
+ vd->vdev_alloc_bias = vdev_derive_alloc_bias(bias_str);
+ }
+ }
+
+ /*
+ * If this is a top-level vdev, initialize its metaslabs.
+ */
+ if (vd == vd->vdev_top && vdev_is_concrete(vd)) {
+ vdev_metaslab_group_create(vd);
+
+ if (vd->vdev_ashift == 0 || vd->vdev_asize == 0) {
+ vdev_set_state(vd, B_FALSE, VDEV_STATE_CANT_OPEN,
+ VDEV_AUX_CORRUPT_DATA);
+ vdev_dbgmsg(vd, "vdev_load: invalid size. ashift=%llu, "
+ "asize=%llu", (u_longlong_t)vd->vdev_ashift,
+ (u_longlong_t)vd->vdev_asize);
+ return (SET_ERROR(ENXIO));
+ }
+
+ error = vdev_metaslab_init(vd, 0);
+ if (error != 0) {
+ vdev_dbgmsg(vd, "vdev_load: metaslab_init failed "
+ "[error=%d]", error);
+ vdev_set_state(vd, B_FALSE, VDEV_STATE_CANT_OPEN,
+ VDEV_AUX_CORRUPT_DATA);
+ return (error);
+ }
+
+ uint64_t checkpoint_sm_obj = vdev_checkpoint_sm_object(vd);
+ if (checkpoint_sm_obj != 0) {
+ objset_t *mos = spa_meta_objset(vd->vdev_spa);
+ ASSERT(vd->vdev_asize != 0);
+ ASSERT3P(vd->vdev_checkpoint_sm, ==, NULL);
+
+ error = space_map_open(&vd->vdev_checkpoint_sm,
+ mos, checkpoint_sm_obj, 0, vd->vdev_asize,
+ vd->vdev_ashift);
+ if (error != 0) {
+ vdev_dbgmsg(vd, "vdev_load: space_map_open "
+ "failed for checkpoint spacemap (obj %llu) "
+ "[error=%d]",
+ (u_longlong_t)checkpoint_sm_obj, error);
+ return (error);
+ }
+ ASSERT3P(vd->vdev_checkpoint_sm, !=, NULL);
+
+ /*
+ * Since the checkpoint_sm contains free entries
+ * exclusively we can use space_map_allocated() to
+ * indicate the cumulative checkpointed space that
+ * has been freed.
+ */
+ vd->vdev_stat.vs_checkpoint_space =
+ -space_map_allocated(vd->vdev_checkpoint_sm);
+ vd->vdev_spa->spa_checkpoint_info.sci_dspace +=
+ vd->vdev_stat.vs_checkpoint_space;
+ }
+ }
+
+ /*
+ * If this is a leaf vdev, load its DTL.
+ */
+ if (vd->vdev_ops->vdev_op_leaf && (error = vdev_dtl_load(vd)) != 0) {
+ vdev_set_state(vd, B_FALSE, VDEV_STATE_CANT_OPEN,
+ VDEV_AUX_CORRUPT_DATA);
+ vdev_dbgmsg(vd, "vdev_load: vdev_dtl_load failed "
+ "[error=%d]", error);
+ return (error);
+ }
+
+ uint64_t obsolete_sm_object = vdev_obsolete_sm_object(vd);
+ if (obsolete_sm_object != 0) {
+ objset_t *mos = vd->vdev_spa->spa_meta_objset;
+ ASSERT(vd->vdev_asize != 0);
+ ASSERT3P(vd->vdev_obsolete_sm, ==, NULL);
+
+ if ((error = space_map_open(&vd->vdev_obsolete_sm, mos,
+ obsolete_sm_object, 0, vd->vdev_asize, 0))) {
+ vdev_set_state(vd, B_FALSE, VDEV_STATE_CANT_OPEN,
+ VDEV_AUX_CORRUPT_DATA);
+ vdev_dbgmsg(vd, "vdev_load: space_map_open failed for "
+ "obsolete spacemap (obj %llu) [error=%d]",
+ (u_longlong_t)obsolete_sm_object, error);
+ return (error);
+ }
+ }
+
+ return (0);
+}
+
+/*
+ * The special vdev case is used for hot spares and l2cache devices. Its
+ * sole purpose is to set the vdev state for the associated vdev. To do this,
+ * we make sure that we can open the underlying device, then try to read the
+ * label, and make sure that the label is sane and that it hasn't been
+ * repurposed to another pool.
+ */
+int
+vdev_validate_aux(vdev_t *vd)
+{
+ nvlist_t *label;
+ uint64_t guid, version;
+ uint64_t state;
+
+ if (!vdev_readable(vd))
+ return (0);
+
+ if ((label = vdev_label_read_config(vd, -1ULL)) == NULL) {
+ vdev_set_state(vd, B_TRUE, VDEV_STATE_CANT_OPEN,
+ VDEV_AUX_CORRUPT_DATA);
+ return (-1);
+ }
+
+ if (nvlist_lookup_uint64(label, ZPOOL_CONFIG_VERSION, &version) != 0 ||
+ !SPA_VERSION_IS_SUPPORTED(version) ||
+ nvlist_lookup_uint64(label, ZPOOL_CONFIG_GUID, &guid) != 0 ||
+ guid != vd->vdev_guid ||
+ nvlist_lookup_uint64(label, ZPOOL_CONFIG_POOL_STATE, &state) != 0) {
+ vdev_set_state(vd, B_TRUE, VDEV_STATE_CANT_OPEN,
+ VDEV_AUX_CORRUPT_DATA);
+ nvlist_free(label);
+ return (-1);
+ }
+
+ /*
+ * We don't actually check the pool state here. If it's in fact in
+ * use by another pool, we update this fact on the fly when requested.
+ */
+ nvlist_free(label);
+ return (0);
+}
+
+/*
+ * Free the objects used to store this vdev's spacemaps, and the array
+ * that points to them.
+ */
+void
+vdev_destroy_spacemaps(vdev_t *vd, dmu_tx_t *tx)
+{
+ if (vd->vdev_ms_array == 0)
+ return;
+
+ objset_t *mos = vd->vdev_spa->spa_meta_objset;
+ uint64_t array_count = vd->vdev_asize >> vd->vdev_ms_shift;
+ size_t array_bytes = array_count * sizeof (uint64_t);
+ uint64_t *smobj_array = kmem_alloc(array_bytes, KM_SLEEP);
+ VERIFY0(dmu_read(mos, vd->vdev_ms_array, 0,
+ array_bytes, smobj_array, 0));
+
+ for (uint64_t i = 0; i < array_count; i++) {
+ uint64_t smobj = smobj_array[i];
+ if (smobj == 0)
+ continue;
+
+ space_map_free_obj(mos, smobj, tx);
+ }
+
+ kmem_free(smobj_array, array_bytes);
+ VERIFY0(dmu_object_free(mos, vd->vdev_ms_array, tx));
+ vd->vdev_ms_array = 0;
+}
+
+static void
+vdev_remove_empty_log(vdev_t *vd, uint64_t txg)
+{
+ spa_t *spa = vd->vdev_spa;
+
+ ASSERT(vd->vdev_islog);
+ ASSERT(vd == vd->vdev_top);
+ ASSERT3U(txg, ==, spa_syncing_txg(spa));
+
+ dmu_tx_t *tx = dmu_tx_create_assigned(spa_get_dsl(spa), txg);
+
+ vdev_destroy_spacemaps(vd, tx);
+ if (vd->vdev_top_zap != 0) {
+ vdev_destroy_unlink_zap(vd, vd->vdev_top_zap, tx);
+ vd->vdev_top_zap = 0;
+ }
+
+ dmu_tx_commit(tx);
+}
+
+void
+vdev_sync_done(vdev_t *vd, uint64_t txg)
+{
+ metaslab_t *msp;
+ boolean_t reassess = !txg_list_empty(&vd->vdev_ms_list, TXG_CLEAN(txg));
+
+ ASSERT(vdev_is_concrete(vd));
+
+ while ((msp = txg_list_remove(&vd->vdev_ms_list, TXG_CLEAN(txg)))
+ != NULL)
+ metaslab_sync_done(msp, txg);
+
+ if (reassess)
+ metaslab_sync_reassess(vd->vdev_mg);
+}
+
+void
+vdev_sync(vdev_t *vd, uint64_t txg)
+{
+ spa_t *spa = vd->vdev_spa;
+ vdev_t *lvd;
+ metaslab_t *msp;
+
+ ASSERT3U(txg, ==, spa->spa_syncing_txg);
+ dmu_tx_t *tx = dmu_tx_create_assigned(spa->spa_dsl_pool, txg);
+ if (range_tree_space(vd->vdev_obsolete_segments) > 0) {
+ ASSERT(vd->vdev_removing ||
+ vd->vdev_ops == &vdev_indirect_ops);
+
+ vdev_indirect_sync_obsolete(vd, tx);
+
+ /*
+ * If the vdev is indirect, it can't have dirty
+ * metaslabs or DTLs.
+ */
+ if (vd->vdev_ops == &vdev_indirect_ops) {
+ ASSERT(txg_list_empty(&vd->vdev_ms_list, txg));
+ ASSERT(txg_list_empty(&vd->vdev_dtl_list, txg));
+ dmu_tx_commit(tx);
+ return;
+ }
+ }
+
+ ASSERT(vdev_is_concrete(vd));
+
+ if (vd->vdev_ms_array == 0 && vd->vdev_ms_shift != 0 &&
+ !vd->vdev_removing) {
+ ASSERT(vd == vd->vdev_top);
+ ASSERT0(vd->vdev_indirect_config.vic_mapping_object);
+ vd->vdev_ms_array = dmu_object_alloc(spa->spa_meta_objset,
+ DMU_OT_OBJECT_ARRAY, 0, DMU_OT_NONE, 0, tx);
+ ASSERT(vd->vdev_ms_array != 0);
+ vdev_config_dirty(vd);
+ }
+
+ while ((msp = txg_list_remove(&vd->vdev_ms_list, txg)) != NULL) {
+ metaslab_sync(msp, txg);
+ (void) txg_list_add(&vd->vdev_ms_list, msp, TXG_CLEAN(txg));
+ }
+
+ while ((lvd = txg_list_remove(&vd->vdev_dtl_list, txg)) != NULL)
+ vdev_dtl_sync(lvd, txg);
+
+ /*
+ * If this is an empty log device being removed, destroy the
+ * metadata associated with it.
+ */
+ if (vd->vdev_islog && vd->vdev_stat.vs_alloc == 0 && vd->vdev_removing)
+ vdev_remove_empty_log(vd, txg);
+
+ (void) txg_list_add(&spa->spa_vdev_txg_list, vd, TXG_CLEAN(txg));
+ dmu_tx_commit(tx);
+}
+
+uint64_t
+vdev_psize_to_asize(vdev_t *vd, uint64_t psize)
+{
+ return (vd->vdev_ops->vdev_op_asize(vd, psize));
+}
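+
+/*
+ * The result is type-dependent: the default op (vdev_default_asize())
+ * rounds psize up to the top-level vdev's ashift boundary, so a
+ * hypothetical 1KB psize on an ashift-12 vdev maps to a 4KB asize,
+ * while the raidz op additionally accounts for parity and padding
+ * sectors.
+ */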
+
+/*
+ * Mark the given vdev faulted. A faulted vdev behaves as if the device could
+ * not be opened, and no I/O is attempted.
+ */
+int
+vdev_fault(spa_t *spa, uint64_t guid, vdev_aux_t aux)
+{
+ vdev_t *vd, *tvd;
+
+ spa_vdev_state_enter(spa, SCL_NONE);
+
+ if ((vd = spa_lookup_by_guid(spa, guid, B_TRUE)) == NULL)
+ return (spa_vdev_state_exit(spa, NULL, ENODEV));
+
+ if (!vd->vdev_ops->vdev_op_leaf)
+ return (spa_vdev_state_exit(spa, NULL, ENOTSUP));
+
+ tvd = vd->vdev_top;
+
+ /*
+ * We don't directly use the aux state here, but if we do a
+ * vdev_reopen(), we need this value to be present to remember why we
+ * were faulted.
+ */
+ vd->vdev_label_aux = aux;
+
+ /*
+ * Faulted state takes precedence over degraded.
+ */
+ vd->vdev_delayed_close = B_FALSE;
+ vd->vdev_faulted = 1ULL;
+ vd->vdev_degraded = 0ULL;
+ vdev_set_state(vd, B_FALSE, VDEV_STATE_FAULTED, aux);
+
+ /*
+ * If this device has the only valid copy of the data, then
+ * back off and simply mark the vdev as degraded instead.
+ */
+ if (!tvd->vdev_islog && vd->vdev_aux == NULL && vdev_dtl_required(vd)) {
+ vd->vdev_degraded = 1ULL;
+ vd->vdev_faulted = 0ULL;
+
+ /*
+ * If we reopen the device and it's not dead, only then do we
+ * mark it degraded.
+ */
+ vdev_reopen(tvd);
+
+ if (vdev_readable(vd))
+ vdev_set_state(vd, B_FALSE, VDEV_STATE_DEGRADED, aux);
+ }
+
+ return (spa_vdev_state_exit(spa, vd, 0));
+}
+
+/*
+ * Mark the given vdev degraded. A degraded vdev is purely an indication to the
+ * user that something is wrong. The vdev continues to operate as normal as far
+ * as I/O is concerned.
+ */
+int
+vdev_degrade(spa_t *spa, uint64_t guid, vdev_aux_t aux)
+{
+ vdev_t *vd;
+
+ spa_vdev_state_enter(spa, SCL_NONE);
+
+ if ((vd = spa_lookup_by_guid(spa, guid, B_TRUE)) == NULL)
+ return (spa_vdev_state_exit(spa, NULL, ENODEV));
+
+ if (!vd->vdev_ops->vdev_op_leaf)
+ return (spa_vdev_state_exit(spa, NULL, ENOTSUP));
+
+ /*
+ * If the vdev is already faulted, then don't do anything.
+ */
+ if (vd->vdev_faulted || vd->vdev_degraded)
+ return (spa_vdev_state_exit(spa, NULL, 0));
+
+ vd->vdev_degraded = 1ULL;
+ if (!vdev_is_dead(vd))
+ vdev_set_state(vd, B_FALSE, VDEV_STATE_DEGRADED,
+ aux);
+
+ return (spa_vdev_state_exit(spa, vd, 0));
+}
+
+/*
+ * Online the given vdev.
+ *
+ * If 'ZFS_ONLINE_UNSPARE' is set, it implies two things. First, any attached
+ * spare device should be detached when the device finishes resilvering.
+ * Second, the online should be treated like a 'test' online case, so no FMA
+ * events are generated if the device fails to open.
+ */
+int
+vdev_online(spa_t *spa, uint64_t guid, uint64_t flags, vdev_state_t *newstate)
+{
+ vdev_t *vd, *tvd, *pvd, *rvd = spa->spa_root_vdev;
+ boolean_t wasoffline;
+ vdev_state_t oldstate;
+
+ spa_vdev_state_enter(spa, SCL_NONE);
+
+ if ((vd = spa_lookup_by_guid(spa, guid, B_TRUE)) == NULL)
+ return (spa_vdev_state_exit(spa, NULL, ENODEV));
+
+ if (!vd->vdev_ops->vdev_op_leaf)
+ return (spa_vdev_state_exit(spa, NULL, ENOTSUP));
+
+ wasoffline = (vd->vdev_offline || vd->vdev_tmpoffline);
+ oldstate = vd->vdev_state;
+
+ tvd = vd->vdev_top;
+ vd->vdev_offline = B_FALSE;
+ vd->vdev_tmpoffline = B_FALSE;
+ vd->vdev_checkremove = !!(flags & ZFS_ONLINE_CHECKREMOVE);
+ vd->vdev_forcefault = !!(flags & ZFS_ONLINE_FORCEFAULT);
+
+ /* XXX - L2ARC 1.0 does not support expansion */
+ if (!vd->vdev_aux) {
+ for (pvd = vd; pvd != rvd; pvd = pvd->vdev_parent)
+ pvd->vdev_expanding = !!(flags & ZFS_ONLINE_EXPAND);
+ }
+
+ vdev_reopen(tvd);
+ vd->vdev_checkremove = vd->vdev_forcefault = B_FALSE;
+
+ if (!vd->vdev_aux) {
+ for (pvd = vd; pvd != rvd; pvd = pvd->vdev_parent)
+ pvd->vdev_expanding = B_FALSE;
+ }
+
+ if (newstate)
+ *newstate = vd->vdev_state;
+ if ((flags & ZFS_ONLINE_UNSPARE) &&
+ !vdev_is_dead(vd) && vd->vdev_parent &&
+ vd->vdev_parent->vdev_ops == &vdev_spare_ops &&
+ vd->vdev_parent->vdev_child[0] == vd)
+ vd->vdev_unspare = B_TRUE;
+
+ if ((flags & ZFS_ONLINE_EXPAND) || spa->spa_autoexpand) {
+
+ /* XXX - L2ARC 1.0 does not support expansion */
+ if (vd->vdev_aux)
+ return (spa_vdev_state_exit(spa, vd, ENOTSUP));
+ spa_async_request(spa, SPA_ASYNC_CONFIG_UPDATE);
+ }
+
+ /* Restart initializing if necessary */
+ mutex_enter(&vd->vdev_initialize_lock);
+ if (vdev_writeable(vd) &&
+ vd->vdev_initialize_thread == NULL &&
+ vd->vdev_initialize_state == VDEV_INITIALIZE_ACTIVE) {
+ (void) vdev_initialize(vd);
+ }
+ mutex_exit(&vd->vdev_initialize_lock);
+
+ if (wasoffline ||
+ (oldstate < VDEV_STATE_DEGRADED &&
+ vd->vdev_state >= VDEV_STATE_DEGRADED))
+ spa_event_notify(spa, vd, NULL, ESC_ZFS_VDEV_ONLINE);
+
+ return (spa_vdev_state_exit(spa, vd, 0));
+}
+
+static int
+vdev_offline_locked(spa_t *spa, uint64_t guid, uint64_t flags)
+{
+ vdev_t *vd, *tvd;
+ int error = 0;
+ uint64_t generation;
+ metaslab_group_t *mg;
+
+top:
+ spa_vdev_state_enter(spa, SCL_ALLOC);
+
+ if ((vd = spa_lookup_by_guid(spa, guid, B_TRUE)) == NULL)
+ return (spa_vdev_state_exit(spa, NULL, ENODEV));
+
+ if (!vd->vdev_ops->vdev_op_leaf)
+ return (spa_vdev_state_exit(spa, NULL, ENOTSUP));
+
+ tvd = vd->vdev_top;
+ mg = tvd->vdev_mg;
+ generation = spa->spa_config_generation + 1;
+
+ /*
+ * If the device isn't already offline, try to offline it.
+ */
+ if (!vd->vdev_offline) {
+ /*
+ * If this device has the only valid copy of some data,
+ * don't allow it to be offlined. Log devices are always
+ * expendable.
+ */
+ if (!tvd->vdev_islog && vd->vdev_aux == NULL &&
+ vdev_dtl_required(vd))
+ return (spa_vdev_state_exit(spa, NULL, EBUSY));
+
+ /*
+ * If the top-level is a slog and it has had allocations
+ * then proceed. We check that the vdev's metaslab group
+ * is not NULL since it's possible that we may have just
+ * added this vdev but not yet initialized its metaslabs.
+ */
+ if (tvd->vdev_islog && mg != NULL) {
+ /*
+ * Prevent any future allocations.
+ */
+ metaslab_group_passivate(mg);
+ (void) spa_vdev_state_exit(spa, vd, 0);
+
+ error = spa_reset_logs(spa);
+
+ /*
+ * If the log device was successfully reset but has
+ * checkpointed data, do not offline it.
+ */
+ if (error == 0 &&
+ tvd->vdev_checkpoint_sm != NULL) {
+ error = ZFS_ERR_CHECKPOINT_EXISTS;
+ }
+
+ spa_vdev_state_enter(spa, SCL_ALLOC);
+
+ /*
+ * Check to see if the config has changed.
+ */
+ if (error || generation != spa->spa_config_generation) {
+ metaslab_group_activate(mg);
+ if (error)
+ return (spa_vdev_state_exit(spa,
+ vd, error));
+ (void) spa_vdev_state_exit(spa, vd, 0);
+ goto top;
+ }
+ ASSERT0(tvd->vdev_stat.vs_alloc);
+ }
+
+ /*
+ * Offline this device and reopen its top-level vdev.
+ * If the top-level vdev is a log device then just offline
+ * it. Otherwise, if this action results in the top-level
+ * vdev becoming unusable, undo it and fail the request.
+ */
+ vd->vdev_offline = B_TRUE;
+ vdev_reopen(tvd);
+
+ if (!tvd->vdev_islog && vd->vdev_aux == NULL &&
+ vdev_is_dead(tvd)) {
+ vd->vdev_offline = B_FALSE;
+ vdev_reopen(tvd);
+ return (spa_vdev_state_exit(spa, NULL, EBUSY));
+ }
+
+ /*
+ * Add the device back into the metaslab rotor so that
+ * once we online the device it's open for business.
+ */
+ if (tvd->vdev_islog && mg != NULL)
+ metaslab_group_activate(mg);
+ }
+
+ vd->vdev_tmpoffline = !!(flags & ZFS_OFFLINE_TEMPORARY);
+
+ return (spa_vdev_state_exit(spa, vd, 0));
+}
+
+int
+vdev_offline(spa_t *spa, uint64_t guid, uint64_t flags)
+{
+ int error;
+
+ mutex_enter(&spa->spa_vdev_top_lock);
+ error = vdev_offline_locked(spa, guid, flags);
+ mutex_exit(&spa->spa_vdev_top_lock);
+
+ return (error);
+}
+
+/*
+ * Clear the error counts associated with this vdev. Unlike vdev_online() and
+ * vdev_offline(), we assume the spa config is locked. We also clear all
+ * children. If 'vd' is NULL, then the user wants to clear all vdevs.
+ */
+void
+vdev_clear(spa_t *spa, vdev_t *vd)
+{
+ vdev_t *rvd = spa->spa_root_vdev;
+
+ ASSERT(spa_config_held(spa, SCL_STATE_ALL, RW_WRITER) == SCL_STATE_ALL);
+
+ if (vd == NULL)
+ vd = rvd;
+
+ vd->vdev_stat.vs_read_errors = 0;
+ vd->vdev_stat.vs_write_errors = 0;
+ vd->vdev_stat.vs_checksum_errors = 0;
+
+ for (int c = 0; c < vd->vdev_children; c++)
+ vdev_clear(spa, vd->vdev_child[c]);
+
+ if (vd == rvd) {
+ for (int c = 0; c < spa->spa_l2cache.sav_count; c++)
+ vdev_clear(spa, spa->spa_l2cache.sav_vdevs[c]);
+
+ for (int c = 0; c < spa->spa_spares.sav_count; c++)
+ vdev_clear(spa, spa->spa_spares.sav_vdevs[c]);
+ }
+
+ /*
+ * It makes no sense to "clear" an indirect vdev.
+ */
+ if (!vdev_is_concrete(vd))
+ return;
+
+ /*
+ * If we're in the FAULTED state or have experienced failed I/O, then
+ * clear the persistent state and attempt to reopen the device. We
+ * also mark the vdev config dirty, so that the new faulted state is
+ * written out to disk.
+ */
+ if (vd->vdev_faulted || vd->vdev_degraded ||
+ !vdev_readable(vd) || !vdev_writeable(vd)) {
+
+ /*
+ * When reopening in response to a clear event, it may be due to
+ * a fmadm repair request. In this case, if the device is
+ * still broken, we still want to post the ereport again.
+ */
+ vd->vdev_forcefault = B_TRUE;
+
+ vd->vdev_faulted = vd->vdev_degraded = 0ULL;
+ vd->vdev_cant_read = B_FALSE;
+ vd->vdev_cant_write = B_FALSE;
+
+ vdev_reopen(vd == rvd ? rvd : vd->vdev_top);
+
+ vd->vdev_forcefault = B_FALSE;
+
+ if (vd != rvd && vdev_writeable(vd->vdev_top))
+ vdev_state_dirty(vd->vdev_top);
+
+ if (vd->vdev_aux == NULL && !vdev_is_dead(vd))
+ spa_async_request(spa, SPA_ASYNC_RESILVER);
+
+ spa_event_notify(spa, vd, NULL, ESC_ZFS_VDEV_CLEAR);
+ }
+
+ /*
+ * When clearing a FMA-diagnosed fault, we always want to
+ * unspare the device, as we assume that the original spare was
+ * done in response to the FMA fault.
+ */
+ if (!vdev_is_dead(vd) && vd->vdev_parent != NULL &&
+ vd->vdev_parent->vdev_ops == &vdev_spare_ops &&
+ vd->vdev_parent->vdev_child[0] == vd)
+ vd->vdev_unspare = B_TRUE;
+}
+
+boolean_t
+vdev_is_dead(vdev_t *vd)
+{
+ /*
+ * Holes and missing devices are always considered "dead".
+ * This simplifies the code since we don't have to check for
+ * these types of devices in the various code paths.
+ * Instead we rely on the fact that we skip over dead devices
+ * before issuing I/O to them.
+ */
+ return (vd->vdev_state < VDEV_STATE_DEGRADED ||
+ vd->vdev_ops == &vdev_hole_ops ||
+ vd->vdev_ops == &vdev_missing_ops);
+}
+
+boolean_t
+vdev_readable(vdev_t *vd)
+{
+ return (!vdev_is_dead(vd) && !vd->vdev_cant_read);
+}
+
+boolean_t
+vdev_writeable(vdev_t *vd)
+{
+ return (!vdev_is_dead(vd) && !vd->vdev_cant_write &&
+ vdev_is_concrete(vd));
+}
+
+boolean_t
+vdev_allocatable(vdev_t *vd)
+{
+ uint64_t state = vd->vdev_state;
+
+ /*
+ * We currently allow allocations from vdevs which may be in the
+ * process of reopening (i.e. VDEV_STATE_CLOSED). If the device
+ * fails to reopen then we'll catch it later when we're holding
+ * the proper locks. Note that we have to get the vdev state
+ * in a local variable because although it changes atomically,
+ * we're asking two separate questions about it.
+ */
+ return (!(state < VDEV_STATE_DEGRADED && state != VDEV_STATE_CLOSED) &&
+ !vd->vdev_cant_write && vdev_is_concrete(vd) &&
+ vd->vdev_mg->mg_initialized);
+}
+
+boolean_t
+vdev_accessible(vdev_t *vd, zio_t *zio)
+{
+ ASSERT(zio->io_vd == vd);
+
+ if (vdev_is_dead(vd) || vd->vdev_remove_wanted)
+ return (B_FALSE);
+
+ if (zio->io_type == ZIO_TYPE_READ)
+ return (!vd->vdev_cant_read);
+
+ if (zio->io_type == ZIO_TYPE_WRITE)
+ return (!vd->vdev_cant_write);
+
+ return (B_TRUE);
+}
+
+boolean_t
+vdev_is_spacemap_addressable(vdev_t *vd)
+{
+ if (spa_feature_is_active(vd->vdev_spa, SPA_FEATURE_SPACEMAP_V2))
+ return (B_TRUE);
+
+ /*
+ * If double-word space map entries are not enabled we assume
+ * 47 bits of the space map entry are dedicated to the entry's
+ * offset (see SM_OFFSET_BITS in space_map.h). We then use that
+ * to calculate the maximum address that can be described by a
+ * space map entry for the given device.
+ */
+ uint64_t shift = vd->vdev_ashift + SM_OFFSET_BITS;
+
+ if (shift >= 63) /* detect potential overflow */
+ return (B_TRUE);
+
+ return (vd->vdev_asize < (1ULL << shift));
+}
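+
+/*
+ * For example, with an ashift of 12 the single-word limit works out to
+ * 2^(12 + 47) = 2^59 bytes (512 PiB); any vdev smaller than that is
+ * addressable without the SPACEMAP_V2 feature. The shift >= 63 guard
+ * simply keeps a 1ULL << shift comparison from overflowing for
+ * unusually large ashifts.
+ */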
+
+/*
+ * Get statistics for the given vdev.
+ */
+void
+vdev_get_stats(vdev_t *vd, vdev_stat_t *vs)
+{
+ spa_t *spa = vd->vdev_spa;
+ vdev_t *rvd = spa->spa_root_vdev;
+ vdev_t *tvd = vd->vdev_top;
+
+ ASSERT(spa_config_held(spa, SCL_ALL, RW_READER) != 0);
+
+ mutex_enter(&vd->vdev_stat_lock);
+ bcopy(&vd->vdev_stat, vs, sizeof (*vs));
+ vs->vs_timestamp = gethrtime() - vs->vs_timestamp;
+ vs->vs_state = vd->vdev_state;
+ vs->vs_rsize = vdev_get_min_asize(vd);
+ if (vd->vdev_ops->vdev_op_leaf) {
+ vs->vs_rsize += VDEV_LABEL_START_SIZE + VDEV_LABEL_END_SIZE;
+ /*
+ * Report initializing progress. Since we don't have the
+ * initializing locks held, this is only an estimate (although a
+ * fairly accurate one).
+ */
+ vs->vs_initialize_bytes_done = vd->vdev_initialize_bytes_done;
+ vs->vs_initialize_bytes_est = vd->vdev_initialize_bytes_est;
+ vs->vs_initialize_state = vd->vdev_initialize_state;
+ vs->vs_initialize_action_time = vd->vdev_initialize_action_time;
+ }
+ /*
+ * Report expandable space on top-level, non-auxiliary devices only.
+ * The expandable space is reported in terms of metaslab sized units
+ * since that determines how much space the pool can expand.
+ */
+ if (vd->vdev_aux == NULL && tvd != NULL && vd->vdev_max_asize != 0) {
+ vs->vs_esize = P2ALIGN(vd->vdev_max_asize - vd->vdev_asize -
+ spa->spa_bootsize, 1ULL << tvd->vdev_ms_shift);
+ }
+ vs->vs_configured_ashift = vd->vdev_top != NULL
+ ? vd->vdev_top->vdev_ashift : vd->vdev_ashift;
+ vs->vs_logical_ashift = vd->vdev_logical_ashift;
+ vs->vs_physical_ashift = vd->vdev_physical_ashift;
+ if (vd->vdev_aux == NULL && vd == vd->vdev_top &&
+ vdev_is_concrete(vd)) {
+ vs->vs_fragmentation = (vd->vdev_mg != NULL) ?
+ vd->vdev_mg->mg_fragmentation : 0;
+ }
+
+ /*
+ * If we're getting stats on the root vdev, aggregate the I/O counts
+ * over all top-level vdevs (i.e. the direct children of the root).
+ */
+ if (vd == rvd) {
+ for (int c = 0; c < rvd->vdev_children; c++) {
+ vdev_t *cvd = rvd->vdev_child[c];
+ vdev_stat_t *cvs = &cvd->vdev_stat;
+
+ for (int t = 0; t < ZIO_TYPES; t++) {
+ vs->vs_ops[t] += cvs->vs_ops[t];
+ vs->vs_bytes[t] += cvs->vs_bytes[t];
+ }
+ cvs->vs_scan_removing = cvd->vdev_removing;
+ }
+ }
+ mutex_exit(&vd->vdev_stat_lock);
+}
+
+void
+vdev_clear_stats(vdev_t *vd)
+{
+ mutex_enter(&vd->vdev_stat_lock);
+ vd->vdev_stat.vs_space = 0;
+ vd->vdev_stat.vs_dspace = 0;
+ vd->vdev_stat.vs_alloc = 0;
+ mutex_exit(&vd->vdev_stat_lock);
+}
+
+void
+vdev_scan_stat_init(vdev_t *vd)
+{
+ vdev_stat_t *vs = &vd->vdev_stat;
+
+ for (int c = 0; c < vd->vdev_children; c++)
+ vdev_scan_stat_init(vd->vdev_child[c]);
+
+ mutex_enter(&vd->vdev_stat_lock);
+ vs->vs_scan_processed = 0;
+ mutex_exit(&vd->vdev_stat_lock);
+}
+
+void
+vdev_stat_update(zio_t *zio, uint64_t psize)
+{
+ spa_t *spa = zio->io_spa;
+ vdev_t *rvd = spa->spa_root_vdev;
+ vdev_t *vd = zio->io_vd ? zio->io_vd : rvd;
+ vdev_t *pvd;
+ uint64_t txg = zio->io_txg;
+ vdev_stat_t *vs = &vd->vdev_stat;
+ zio_type_t type = zio->io_type;
+ int flags = zio->io_flags;
+
+ /*
+ * If this i/o is a gang leader, it didn't do any actual work.
+ */
+ if (zio->io_gang_tree)
+ return;
+
+ if (zio->io_error == 0) {
+ /*
+ * If this is a root i/o, don't count it -- we've already
+ * counted the top-level vdevs, and vdev_get_stats() will
+ * aggregate them when asked. This reduces contention on
+ * the root vdev_stat_lock and implicitly handles blocks
+ * that compress away to holes, for which there is no i/o.
+ * (Holes never create vdev children, so all the counters
+ * remain zero, which is what we want.)
+ *
+ * Note: this only applies to successful i/o (io_error == 0)
+ * because unlike i/o counts, errors are not additive.
+ * When reading a ditto block, for example, failure of
+ * one top-level vdev does not imply a root-level error.
+ */
+ if (vd == rvd)
+ return;
+
+ ASSERT(vd == zio->io_vd);
+
+ if (flags & ZIO_FLAG_IO_BYPASS)
+ return;
+
+ mutex_enter(&vd->vdev_stat_lock);
+
+ if (flags & ZIO_FLAG_IO_REPAIR) {
+ if (flags & ZIO_FLAG_SCAN_THREAD) {
+ dsl_scan_phys_t *scn_phys =
+ &spa->spa_dsl_pool->dp_scan->scn_phys;
+ uint64_t *processed = &scn_phys->scn_processed;
+
+ /* XXX cleanup? */
+ if (vd->vdev_ops->vdev_op_leaf)
+ atomic_add_64(processed, psize);
+ vs->vs_scan_processed += psize;
+ }
+
+ if (flags & ZIO_FLAG_SELF_HEAL)
+ vs->vs_self_healed += psize;
+ }
+
+ vs->vs_ops[type]++;
+ vs->vs_bytes[type] += psize;
+
+ mutex_exit(&vd->vdev_stat_lock);
+ return;
+ }
+
+ if (flags & ZIO_FLAG_SPECULATIVE)
+ return;
+
+ /*
+ * If this is an I/O error that is going to be retried, then ignore the
+ * error. Otherwise, the user may interpret B_FAILFAST I/O errors as
+ * hard errors, when in reality they can happen for any number of
+ * innocuous reasons (bus resets, MPxIO link failure, etc).
+ */
+ if (zio->io_error == EIO &&
+ !(zio->io_flags & ZIO_FLAG_IO_RETRY))
+ return;
+
+ /*
+ * Intent log writes won't propagate their error to the root
+ * I/O so don't mark these types of failures as pool-level
+ * errors.
+ */
+ if (zio->io_vd == NULL && (zio->io_flags & ZIO_FLAG_DONT_PROPAGATE))
+ return;
+
+ mutex_enter(&vd->vdev_stat_lock);
+ if (type == ZIO_TYPE_READ && !vdev_is_dead(vd)) {
+ if (zio->io_error == ECKSUM)
+ vs->vs_checksum_errors++;
+ else
+ vs->vs_read_errors++;
+ }
+ if (type == ZIO_TYPE_WRITE && !vdev_is_dead(vd))
+ vs->vs_write_errors++;
+ mutex_exit(&vd->vdev_stat_lock);
+
+ if (spa->spa_load_state == SPA_LOAD_NONE &&
+ type == ZIO_TYPE_WRITE && txg != 0 &&
+ (!(flags & ZIO_FLAG_IO_REPAIR) ||
+ (flags & ZIO_FLAG_SCAN_THREAD) ||
+ spa->spa_claiming)) {
+ /*
+ * This is either a normal write (not a repair), or it's
+ * a repair induced by the scrub thread, or it's a repair
+ * made by zil_claim() during spa_load() in the first txg.
+ * In the normal case, we commit the DTL change in the same
+ * txg as the block was born. In the scrub-induced repair
+ * case, we know that scrubs run in first-pass syncing context,
+ * so we commit the DTL change in spa_syncing_txg(spa).
+ * In the zil_claim() case, we commit in spa_first_txg(spa).
+ *
+ * We currently do not make DTL entries for failed spontaneous
+ * self-healing writes triggered by normal (non-scrubbing)
+ * reads, because we have no transactional context in which to
+ * do so -- and it's not clear that it'd be desirable anyway.
+ */
+ if (vd->vdev_ops->vdev_op_leaf) {
+ uint64_t commit_txg = txg;
+ if (flags & ZIO_FLAG_SCAN_THREAD) {
+ ASSERT(flags & ZIO_FLAG_IO_REPAIR);
+ ASSERT(spa_sync_pass(spa) == 1);
+ vdev_dtl_dirty(vd, DTL_SCRUB, txg, 1);
+ commit_txg = spa_syncing_txg(spa);
+ } else if (spa->spa_claiming) {
+ ASSERT(flags & ZIO_FLAG_IO_REPAIR);
+ commit_txg = spa_first_txg(spa);
+ }
+ ASSERT(commit_txg >= spa_syncing_txg(spa));
+ if (vdev_dtl_contains(vd, DTL_MISSING, txg, 1))
+ return;
+ for (pvd = vd; pvd != rvd; pvd = pvd->vdev_parent)
+ vdev_dtl_dirty(pvd, DTL_PARTIAL, txg, 1);
+ vdev_dirty(vd->vdev_top, VDD_DTL, vd, commit_txg);
+ }
+ if (vd != rvd)
+ vdev_dtl_dirty(vd, DTL_MISSING, txg, 1);
+ }
+}
+
+int64_t
+vdev_deflated_space(vdev_t *vd, int64_t space)
+{
+ ASSERT((space & (SPA_MINBLOCKSIZE-1)) == 0);
+ ASSERT(vd->vdev_deflate_ratio != 0 || vd->vdev_isl2cache);
+
+ return ((space >> SPA_MINBLOCKSHIFT) * vd->vdev_deflate_ratio);
+}
+
+/*
+ * Update the in-core space usage stats for this vdev and the root vdev.
+ */
+void
+vdev_space_update(vdev_t *vd, int64_t alloc_delta, int64_t defer_delta,
+ int64_t space_delta)
+{
+ int64_t dspace_delta;
+ spa_t *spa = vd->vdev_spa;
+ vdev_t *rvd = spa->spa_root_vdev;
+
+ ASSERT(vd == vd->vdev_top);
+
+ /*
+ * Apply the inverse of the psize-to-asize (ie. RAID-Z) space-expansion
+ * factor. We must calculate this here and not at the root vdev
+ * because the root vdev's psize-to-asize is simply the max of its
+ * children's, thus not accurate enough for us.
+ */
+ dspace_delta = vdev_deflated_space(vd, space_delta);
+
+ mutex_enter(&vd->vdev_stat_lock);
+ vd->vdev_stat.vs_alloc += alloc_delta;
+ vd->vdev_stat.vs_space += space_delta;
+ vd->vdev_stat.vs_dspace += dspace_delta;
+ mutex_exit(&vd->vdev_stat_lock);
+
+ /* every class but log contributes to root space stats */
+ if (vd->vdev_mg != NULL && !vd->vdev_islog) {
+ mutex_enter(&rvd->vdev_stat_lock);
+ rvd->vdev_stat.vs_alloc += alloc_delta;
+ rvd->vdev_stat.vs_space += space_delta;
+ rvd->vdev_stat.vs_dspace += dspace_delta;
+ mutex_exit(&rvd->vdev_stat_lock);
+ }
+ /* Note: metaslab_class_space_update moved to metaslab_space_update */
+}
+
+/*
+ * Mark a top-level vdev's config as dirty, placing it on the dirty list
+ * so that it will be written out next time the vdev configuration is synced.
+ * If the root vdev is specified (vdev_top == NULL), dirty all top-level vdevs.
+ */
+void
+vdev_config_dirty(vdev_t *vd)
+{
+ spa_t *spa = vd->vdev_spa;
+ vdev_t *rvd = spa->spa_root_vdev;
+ int c;
+
+ ASSERT(spa_writeable(spa));
+
+ /*
+ * If this is an aux vdev (as with l2cache and spare devices), then we
+ * update the vdev config manually and set the sync flag.
+ */
+ if (vd->vdev_aux != NULL) {
+ spa_aux_vdev_t *sav = vd->vdev_aux;
+ nvlist_t **aux;
+ uint_t naux;
+
+ for (c = 0; c < sav->sav_count; c++) {
+ if (sav->sav_vdevs[c] == vd)
+ break;
+ }
+
+ if (c == sav->sav_count) {
+ /*
+ * We're being removed. There's nothing more to do.
+ */
+ ASSERT(sav->sav_sync == B_TRUE);
+ return;
+ }
+
+ sav->sav_sync = B_TRUE;
+
+ if (nvlist_lookup_nvlist_array(sav->sav_config,
+ ZPOOL_CONFIG_L2CACHE, &aux, &naux) != 0) {
+ VERIFY(nvlist_lookup_nvlist_array(sav->sav_config,
+ ZPOOL_CONFIG_SPARES, &aux, &naux) == 0);
+ }
+
+ ASSERT(c < naux);
+
+ /*
+ * Setting the nvlist in the middle of the array is a little
+ * sketchy, but it will work.
+ */
+ nvlist_free(aux[c]);
+ aux[c] = vdev_config_generate(spa, vd, B_TRUE, 0);
+
+ return;
+ }
+
+ /*
+ * The dirty list is protected by the SCL_CONFIG lock. The caller
+ * must either hold SCL_CONFIG as writer, or must be the sync thread
+ * (which holds SCL_CONFIG as reader). There's only one sync thread,
+ * so this is sufficient to ensure mutual exclusion.
+ */
+ ASSERT(spa_config_held(spa, SCL_CONFIG, RW_WRITER) ||
+ (dsl_pool_sync_context(spa_get_dsl(spa)) &&
+ spa_config_held(spa, SCL_CONFIG, RW_READER)));
+
+ if (vd == rvd) {
+ for (c = 0; c < rvd->vdev_children; c++)
+ vdev_config_dirty(rvd->vdev_child[c]);
+ } else {
+ ASSERT(vd == vd->vdev_top);
+
+ if (!list_link_active(&vd->vdev_config_dirty_node) &&
+ vdev_is_concrete(vd)) {
+ list_insert_head(&spa->spa_config_dirty_list, vd);
+ }
+ }
+}
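+
+/*
+ * Illustrative sketch of the expected calling pattern (not part of the
+ * original source):
+ *
+ * spa_config_enter(spa, SCL_CONFIG, FTAG, RW_WRITER);
+ * vdev_config_dirty(vd->vdev_top);
+ * spa_config_exit(spa, SCL_CONFIG, FTAG);
+ *
+ * The dirty list is then drained the next time the configuration is synced.
+ */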
+
+void
+vdev_config_clean(vdev_t *vd)
+{
+ spa_t *spa = vd->vdev_spa;
+
+ ASSERT(spa_config_held(spa, SCL_CONFIG, RW_WRITER) ||
+ (dsl_pool_sync_context(spa_get_dsl(spa)) &&
+ spa_config_held(spa, SCL_CONFIG, RW_READER)));
+
+ ASSERT(list_link_active(&vd->vdev_config_dirty_node));
+ list_remove(&spa->spa_config_dirty_list, vd);
+}
+
+/*
+ * Mark a top-level vdev's state as dirty, so that the next pass of
+ * spa_sync() can convert this into vdev_config_dirty(). We distinguish
+ * the state changes from larger config changes because they require
+ * much less locking, and are often needed for administrative actions.
+ */
+void
+vdev_state_dirty(vdev_t *vd)
+{
+ spa_t *spa = vd->vdev_spa;
+
+ ASSERT(spa_writeable(spa));
+ ASSERT(vd == vd->vdev_top);
+
+ /*
+ * The state list is protected by the SCL_STATE lock. The caller
+ * must either hold SCL_STATE as writer, or must be the sync thread
+ * (which holds SCL_STATE as reader). There's only one sync thread,
+ * so this is sufficient to ensure mutual exclusion.
+ */
+ ASSERT(spa_config_held(spa, SCL_STATE, RW_WRITER) ||
+ (dsl_pool_sync_context(spa_get_dsl(spa)) &&
+ spa_config_held(spa, SCL_STATE, RW_READER)));
+
+ if (!list_link_active(&vd->vdev_state_dirty_node) &&
+ vdev_is_concrete(vd))
+ list_insert_head(&spa->spa_state_dirty_list, vd);
+}
+
+void
+vdev_state_clean(vdev_t *vd)
+{
+ spa_t *spa = vd->vdev_spa;
+
+ ASSERT(spa_config_held(spa, SCL_STATE, RW_WRITER) ||
+ (dsl_pool_sync_context(spa_get_dsl(spa)) &&
+ spa_config_held(spa, SCL_STATE, RW_READER)));
+
+ ASSERT(list_link_active(&vd->vdev_state_dirty_node));
+ list_remove(&spa->spa_state_dirty_list, vd);
+}
+
+/*
+ * Propagate vdev state up from children to parent.
+ */
+void
+vdev_propagate_state(vdev_t *vd)
+{
+ spa_t *spa = vd->vdev_spa;
+ vdev_t *rvd = spa->spa_root_vdev;
+ int degraded = 0, faulted = 0;
+ int corrupted = 0;
+ vdev_t *child;
+
+ if (vd->vdev_children > 0) {
+ for (int c = 0; c < vd->vdev_children; c++) {
+ child = vd->vdev_child[c];
+
+ /*
+ * Don't factor holes or indirect vdevs into the
+ * decision.
+ */
+ if (!vdev_is_concrete(child))
+ continue;
+
+ if (!vdev_readable(child) ||
+ (!vdev_writeable(child) && spa_writeable(spa))) {
+ /*
+ * Root special: if there is a top-level log
+ * device, treat the root vdev as if it were
+ * degraded.
+ */
+ if (child->vdev_islog && vd == rvd)
+ degraded++;
+ else
+ faulted++;
+ } else if (child->vdev_state <= VDEV_STATE_DEGRADED) {
+ degraded++;
+ }
+
+ if (child->vdev_stat.vs_aux == VDEV_AUX_CORRUPT_DATA)
+ corrupted++;
+ }
+
+ vd->vdev_ops->vdev_op_state_change(vd, faulted, degraded);
+
+ /*
+ * Root special: if there is a top-level vdev that cannot be
+ * opened due to corrupted metadata, then propagate the root
+ * vdev's aux state as 'corrupt' rather than 'insufficient
+ * replicas'.
+ */
+ if (corrupted && vd == rvd &&
+ rvd->vdev_state == VDEV_STATE_CANT_OPEN)
+ vdev_set_state(rvd, B_FALSE, VDEV_STATE_CANT_OPEN,
+ VDEV_AUX_CORRUPT_DATA);
+ }
+
+ if (vd->vdev_parent)
+ vdev_propagate_state(vd->vdev_parent);
+}
+
+/*
+ * Set a vdev's state. If this is during an open, we don't update the parent
+ * state, because we're in the process of opening children depth-first.
+ * Otherwise, we propagate the change to the parent.
+ *
+ * If this routine places a device in a faulted state, an appropriate ereport is
+ * generated.
+ */
+void
+vdev_set_state(vdev_t *vd, boolean_t isopen, vdev_state_t state, vdev_aux_t aux)
+{
+ uint64_t save_state;
+ spa_t *spa = vd->vdev_spa;
+
+ if (state == vd->vdev_state) {
+ vd->vdev_stat.vs_aux = aux;
+ return;
+ }
+
+ save_state = vd->vdev_state;
+
+ vd->vdev_state = state;
+ vd->vdev_stat.vs_aux = aux;
+
+ /*
+ * If we are setting the vdev state to anything but an open state, then
+ * always close the underlying device unless the device has requested
+ * a delayed close (i.e. we're about to remove or fault the device).
+ * Otherwise, we keep accessible but invalid devices open forever.
+ * We don't call vdev_close() itself, because that implies some extra
+ * checks (offline, etc) that we don't want here. This is limited to
+ * leaf devices, because otherwise closing the device will affect other
+ * children.
+ */
+ if (!vd->vdev_delayed_close && vdev_is_dead(vd) &&
+ vd->vdev_ops->vdev_op_leaf)
+ vd->vdev_ops->vdev_op_close(vd);
+
+ if (vd->vdev_removed &&
+ state == VDEV_STATE_CANT_OPEN &&
+ (aux == VDEV_AUX_OPEN_FAILED || vd->vdev_checkremove)) {
+ /*
+ * If the previous state is set to VDEV_STATE_REMOVED, then this
+ * device was previously marked removed and someone attempted to
+ * reopen it. If this failed due to a nonexistent device, then
+ * keep the device in the REMOVED state. We also allow this if
+ * it is one of our special test online cases, which are only
+ * attempting to online the device and shouldn't generate an FMA
+ * fault.
+ */
+ vd->vdev_state = VDEV_STATE_REMOVED;
+ vd->vdev_stat.vs_aux = VDEV_AUX_NONE;
+ } else if (state == VDEV_STATE_REMOVED) {
+ vd->vdev_removed = B_TRUE;
+ } else if (state == VDEV_STATE_CANT_OPEN) {
+ /*
+ * If we fail to open a vdev during an import or recovery, we
+ * mark it as "not available", which signifies that it was
+ * never there to begin with. Failure to open such a device
+ * is not considered an error.
+ */
+ if ((spa_load_state(spa) == SPA_LOAD_IMPORT ||
+ spa_load_state(spa) == SPA_LOAD_RECOVER) &&
+ vd->vdev_ops->vdev_op_leaf)
+ vd->vdev_not_present = 1;
+
+ /*
+ * Post the appropriate ereport. If the 'prevstate' field is
+ * set to something other than VDEV_STATE_UNKNOWN, it indicates
+ * that this is part of a vdev_reopen(). In this case, we don't
+ * want to post the ereport if the device was already in the
+ * CANT_OPEN state beforehand.
+ *
+ * If the 'checkremove' flag is set, then this is an attempt to
+ * online the device in response to an insertion event. If we
+ * hit this case, then we have detected an insertion event for a
+ * faulted or offline device that wasn't in the removed state.
+ * In this scenario, we don't post an ereport because we are
+ * about to replace the device, or attempt an online with
+ * vdev_forcefault, which will generate the fault for us.
+ */
+ if ((vd->vdev_prevstate != state || vd->vdev_forcefault) &&
+ !vd->vdev_not_present && !vd->vdev_checkremove &&
+ vd != spa->spa_root_vdev) {
+ const char *class;
+
+ switch (aux) {
+ case VDEV_AUX_OPEN_FAILED:
+ class = FM_EREPORT_ZFS_DEVICE_OPEN_FAILED;
+ break;
+ case VDEV_AUX_CORRUPT_DATA:
+ class = FM_EREPORT_ZFS_DEVICE_CORRUPT_DATA;
+ break;
+ case VDEV_AUX_NO_REPLICAS:
+ class = FM_EREPORT_ZFS_DEVICE_NO_REPLICAS;
+ break;
+ case VDEV_AUX_BAD_GUID_SUM:
+ class = FM_EREPORT_ZFS_DEVICE_BAD_GUID_SUM;
+ break;
+ case VDEV_AUX_TOO_SMALL:
+ class = FM_EREPORT_ZFS_DEVICE_TOO_SMALL;
+ break;
+ case VDEV_AUX_BAD_LABEL:
+ class = FM_EREPORT_ZFS_DEVICE_BAD_LABEL;
+ break;
+ default:
+ class = FM_EREPORT_ZFS_DEVICE_UNKNOWN;
+ }
+
+ zfs_ereport_post(class, spa, vd, NULL, save_state, 0);
+ }
+
+ /* Erase any notion of persistent removed state */
+ vd->vdev_removed = B_FALSE;
+ } else {
+ vd->vdev_removed = B_FALSE;
+ }
+
+ /*
+ * Notify the fmd of the state change. Be verbose and post
+ * notifications even for stuff that's not important; the fmd agent can
+ * sort it out. Don't emit state change events for non-leaf vdevs since
+ * they can't change state on their own. The FMD can check their state
+ * if it wants to when it sees that a leaf vdev had a state change.
+ */
+ if (vd->vdev_ops->vdev_op_leaf)
+ zfs_post_state_change(spa, vd);
+
+ if (!isopen && vd->vdev_parent)
+ vdev_propagate_state(vd->vdev_parent);
+}
+
+boolean_t
+vdev_children_are_offline(vdev_t *vd)
+{
+ ASSERT(!vd->vdev_ops->vdev_op_leaf);
+
+ for (uint64_t i = 0; i < vd->vdev_children; i++) {
+ if (vd->vdev_child[i]->vdev_state != VDEV_STATE_OFFLINE)
+ return (B_FALSE);
+ }
+
+ return (B_TRUE);
+}
+
+/*
+ * Check the vdev configuration to ensure that it's capable of supporting
+ * a root pool. We do not support partial configuration.
+ * In addition, only a single top-level vdev is allowed.
+ *
+ * FreeBSD does not have the above limitations.
+ */
+boolean_t
+vdev_is_bootable(vdev_t *vd)
+{
+#ifdef illumos
+ if (!vd->vdev_ops->vdev_op_leaf) {
+ char *vdev_type = vd->vdev_ops->vdev_op_type;
+
+ if (strcmp(vdev_type, VDEV_TYPE_ROOT) == 0 &&
+ vd->vdev_children > 1) {
+ return (B_FALSE);
+ } else if (strcmp(vdev_type, VDEV_TYPE_MISSING) == 0 ||
+ strcmp(vdev_type, VDEV_TYPE_INDIRECT) == 0) {
+ return (B_FALSE);
+ }
+ }
+
+ for (int c = 0; c < vd->vdev_children; c++) {
+ if (!vdev_is_bootable(vd->vdev_child[c]))
+ return (B_FALSE);
+ }
+#endif /* illumos */
+ return (B_TRUE);
+}
+
+boolean_t
+vdev_is_concrete(vdev_t *vd)
+{
+ vdev_ops_t *ops = vd->vdev_ops;
+ if (ops == &vdev_indirect_ops || ops == &vdev_hole_ops ||
+ ops == &vdev_missing_ops || ops == &vdev_root_ops) {
+ return (B_FALSE);
+ } else {
+ return (B_TRUE);
+ }
+}
+
+/*
+ * Determine if a log device has valid content. If the vdev was
+ * removed or faulted in the MOS config then we know that
+ * the content on the log device has already been written to the pool.
+ */
+boolean_t
+vdev_log_state_valid(vdev_t *vd)
+{
+ if (vd->vdev_ops->vdev_op_leaf && !vd->vdev_faulted &&
+ !vd->vdev_removed)
+ return (B_TRUE);
+
+ for (int c = 0; c < vd->vdev_children; c++)
+ if (vdev_log_state_valid(vd->vdev_child[c]))
+ return (B_TRUE);
+
+ return (B_FALSE);
+}
+
+/*
+ * Expand a vdev if possible.
+ */
+void
+vdev_expand(vdev_t *vd, uint64_t txg)
+{
+ ASSERT(vd->vdev_top == vd);
+ ASSERT(spa_config_held(vd->vdev_spa, SCL_ALL, RW_WRITER) == SCL_ALL);
+ ASSERT(vdev_is_concrete(vd));
+
+ vdev_set_deflate_ratio(vd);
+
+ if ((vd->vdev_asize >> vd->vdev_ms_shift) > vd->vdev_ms_count &&
+ vdev_is_concrete(vd)) {
+ vdev_metaslab_group_create(vd);
+ VERIFY(vdev_metaslab_init(vd, txg) == 0);
+ vdev_config_dirty(vd);
+ }
+}
+
+/*
+ * Split a vdev.
+ */
+void
+vdev_split(vdev_t *vd)
+{
+ vdev_t *cvd, *pvd = vd->vdev_parent;
+
+ vdev_remove_child(pvd, vd);
+ vdev_compact_children(pvd);
+
+ cvd = pvd->vdev_child[0];
+ if (pvd->vdev_children == 1) {
+ vdev_remove_parent(cvd);
+ cvd->vdev_splitting = B_TRUE;
+ }
+ vdev_propagate_state(cvd);
+}
+
+void
+vdev_deadman(vdev_t *vd)
+{
+ for (int c = 0; c < vd->vdev_children; c++) {
+ vdev_t *cvd = vd->vdev_child[c];
+
+ vdev_deadman(cvd);
+ }
+
+ if (vd->vdev_ops->vdev_op_leaf) {
+ vdev_queue_t *vq = &vd->vdev_queue;
+
+ mutex_enter(&vq->vq_lock);
+ if (avl_numnodes(&vq->vq_active_tree) > 0) {
+ spa_t *spa = vd->vdev_spa;
+ zio_t *fio;
+ uint64_t delta;
+
+ /*
+ * Look at the head of all the pending queues,
+ * if any I/O has been outstanding for longer than
+ * the spa_deadman_synctime we panic the system.
+ */
+ fio = avl_first(&vq->vq_active_tree);
+ delta = gethrtime() - fio->io_timestamp;
+ if (delta > spa_deadman_synctime(spa)) {
+ vdev_dbgmsg(vd, "SLOW IO: zio timestamp "
+ "%lluns, delta %lluns, last io %lluns",
+ (u_longlong_t)fio->io_timestamp, (u_longlong_t)delta,
+ (u_longlong_t)vq->vq_io_complete_ts);
+ fm_panic("I/O to pool '%s' appears to be "
+ "hung on vdev guid %llu at '%s'.",
+ spa_name(spa),
+ (u_longlong_t)vd->vdev_guid,
+ vd->vdev_path);
+ }
+ }
+ mutex_exit(&vq->vq_lock);
+ }
+}
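+
+/*
+ * Illustrative note (not part of the original source): gethrtime() and
+ * io_timestamp are both in nanoseconds, so an I/O is declared hung once
+ * gethrtime() - io_timestamp > spa_deadman_synctime(spa). With a deadman
+ * threshold of, say, 1000 seconds, that is a delta of 1000 * NANOSEC.
+ */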
diff --git a/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/vdev_cache.c b/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/vdev_cache.c
new file mode 100644
index 000000000000..69421bb61897
--- /dev/null
+++ b/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/vdev_cache.c
@@ -0,0 +1,434 @@
+/*
+ * CDDL HEADER START
+ *
+ * The contents of this file are subject to the terms of the
+ * Common Development and Distribution License (the "License").
+ * You may not use this file except in compliance with the License.
+ *
+ * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
+ * or http://www.opensolaris.org/os/licensing.
+ * See the License for the specific language governing permissions
+ * and limitations under the License.
+ *
+ * When distributing Covered Code, include this CDDL HEADER in each
+ * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
+ * If applicable, add the following below this CDDL HEADER, with the
+ * fields enclosed by brackets "[]" replaced with your own identifying
+ * information: Portions Copyright [yyyy] [name of copyright owner]
+ *
+ * CDDL HEADER END
+ */
+/*
+ * Copyright 2009 Sun Microsystems, Inc. All rights reserved.
+ * Use is subject to license terms.
+ */
+/*
+ * Copyright (c) 2013, 2017 by Delphix. All rights reserved.
+ */
+
+#include <sys/zfs_context.h>
+#include <sys/spa.h>
+#include <sys/vdev_impl.h>
+#include <sys/zio.h>
+#include <sys/kstat.h>
+#include <sys/abd.h>
+
+/*
+ * Virtual device read-ahead caching.
+ *
+ * This file implements a simple LRU read-ahead cache. When the DMU reads
+ * a given block, it will often want other, nearby blocks soon thereafter.
+ * We take advantage of this by reading a larger disk region and caching
+ * the result. In the best case, this can turn 128 back-to-back 512-byte
+ * reads into a single 64k read followed by 127 cache hits; this reduces
+ * latency dramatically. In the worst case, it can turn an isolated 512-byte
+ * read into a 64k read, which doesn't affect latency all that much but is
+ * terribly wasteful of bandwidth. A more intelligent version of the cache
+ * could keep track of access patterns and not do read-ahead unless it sees
+ * at least two temporally close I/Os to the same region. Currently, only
+ * metadata I/O is inflated. A further enhancement could take advantage of
+ * more semantic information about the I/O. And it could use something
+ * faster than an AVL tree; that was chosen solely for convenience.
+ *
+ * There are five cache operations: allocate, fill, read, write, evict.
+ *
+ * (1) Allocate. This reserves a cache entry for the specified region.
+ * We separate the allocate and fill operations so that multiple threads
+ * don't generate I/O for the same cache miss.
+ *
+ * (2) Fill. When the I/O for a cache miss completes, the fill routine
+ * places the data in the previously allocated cache entry.
+ *
+ * (3) Read. Read data from the cache.
+ *
+ * (4) Write. Update cache contents after write completion.
+ *
+ * (5) Evict. When allocating a new entry, we evict the oldest (LRU) entry
+ * if the total cache size exceeds zfs_vdev_cache_size.
+ */
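+
+/*
+ * Illustrative sketch of the address arithmetic used throughout this file
+ * (not part of the original source; VCBS is defined below as
+ * 1 << zfs_vdev_cache_bshift, 64K by default). For a read at offset
+ * 0x12345 with VCBS == 0x10000:
+ *
+ * P2ALIGN(0x12345, VCBS) == 0x10000 (start of the cache line)
+ * P2PHASE(0x12345, VCBS) == 0x02345 (offset within the line)
+ *
+ * A hit copies io_size bytes starting at that phase within ve_abd into
+ * the caller's abd; a miss reads the whole VCBS-sized line into ve_abd
+ * first.
+ */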
+
+/*
+ * These tunables are for performance analysis.
+ */
+/*
+ * All i/os smaller than zfs_vdev_cache_max will be turned into
+ * 1<<zfs_vdev_cache_bshift byte reads by the vdev_cache (aka software
+ * track buffer). At most zfs_vdev_cache_size bytes will be kept in each
+ * vdev's vdev_cache.
+ *
+ * TODO: Note that with the current ZFS code, it turns out that the
+ * vdev cache is not helpful, and in some cases actively harmful. It
+ * is better to leave it disabled. Once some time has passed, we should
+ * remove this code entirely to simplify things. For now we just disable
+ * it by setting zfs_vdev_cache_size to zero. Note that Solaris 11
+ * has made these same changes.
+ */
+int zfs_vdev_cache_max = 1<<14; /* 16KB */
+int zfs_vdev_cache_size = 0;
+int zfs_vdev_cache_bshift = 16;
+
+#define VCBS (1 << zfs_vdev_cache_bshift) /* 64KB */
+
+SYSCTL_DECL(_vfs_zfs_vdev);
+SYSCTL_NODE(_vfs_zfs_vdev, OID_AUTO, cache, CTLFLAG_RW | CTLFLAG_MPSAFE, 0,
+ "ZFS VDEV Cache");
+SYSCTL_INT(_vfs_zfs_vdev_cache, OID_AUTO, max, CTLFLAG_RDTUN,
+ &zfs_vdev_cache_max, 0, "Maximum I/O request size that increases read size");
+SYSCTL_INT(_vfs_zfs_vdev_cache, OID_AUTO, size, CTLFLAG_RDTUN,
+ &zfs_vdev_cache_size, 0, "Size of VDEV cache");
+SYSCTL_INT(_vfs_zfs_vdev_cache, OID_AUTO, bshift, CTLFLAG_RDTUN,
+ &zfs_vdev_cache_bshift, 0, "Turn too-small requests into reads of 1 << this value");
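+
+/*
+ * Illustrative note (not part of the original source): all three knobs are
+ * CTLFLAG_RDTUN, so they can only be set as boot-time tunables, e.g. in
+ * /boot/loader.conf (values below are examples only):
+ *
+ * vfs.zfs.vdev.cache.size="10485760"
+ * vfs.zfs.vdev.cache.max="16384"
+ */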
+
+kstat_t *vdc_ksp = NULL;
+
+typedef struct vdc_stats {
+ kstat_named_t vdc_stat_delegations;
+ kstat_named_t vdc_stat_hits;
+ kstat_named_t vdc_stat_misses;
+} vdc_stats_t;
+
+static vdc_stats_t vdc_stats = {
+ { "delegations", KSTAT_DATA_UINT64 },
+ { "hits", KSTAT_DATA_UINT64 },
+ { "misses", KSTAT_DATA_UINT64 }
+};
+
+#define VDCSTAT_BUMP(stat) atomic_inc_64(&vdc_stats.stat.value.ui64)
+
+static inline int
+vdev_cache_offset_compare(const void *a1, const void *a2)
+{
+ const vdev_cache_entry_t *ve1 = (const vdev_cache_entry_t *)a1;
+ const vdev_cache_entry_t *ve2 = (const vdev_cache_entry_t *)a2;
+
+ return (AVL_CMP(ve1->ve_offset, ve2->ve_offset));
+}
+
+static int
+vdev_cache_lastused_compare(const void *a1, const void *a2)
+{
+ const vdev_cache_entry_t *ve1 = (const vdev_cache_entry_t *)a1;
+ const vdev_cache_entry_t *ve2 = (const vdev_cache_entry_t *)a2;
+
+ int cmp = AVL_CMP(ve1->ve_lastused, ve2->ve_lastused);
+ if (likely(cmp))
+ return (cmp);
+
+ /*
+ * Among equally old entries, sort by offset to ensure uniqueness.
+ */
+ return (vdev_cache_offset_compare(a1, a2));
+}
+
+/*
+ * Evict the specified entry from the cache.
+ */
+static void
+vdev_cache_evict(vdev_cache_t *vc, vdev_cache_entry_t *ve)
+{
+ ASSERT(MUTEX_HELD(&vc->vc_lock));
+ ASSERT3P(ve->ve_fill_io, ==, NULL);
+ ASSERT3P(ve->ve_abd, !=, NULL);
+
+ avl_remove(&vc->vc_lastused_tree, ve);
+ avl_remove(&vc->vc_offset_tree, ve);
+ abd_free(ve->ve_abd);
+ kmem_free(ve, sizeof (vdev_cache_entry_t));
+}
+
+/*
+ * Allocate an entry in the cache. At this point we don't have the data;
+ * we're just creating a placeholder so that multiple threads don't all
+ * go off and read the same blocks.
+ */
+static vdev_cache_entry_t *
+vdev_cache_allocate(zio_t *zio)
+{
+ vdev_cache_t *vc = &zio->io_vd->vdev_cache;
+ uint64_t offset = P2ALIGN(zio->io_offset, VCBS);
+ vdev_cache_entry_t *ve;
+
+ ASSERT(MUTEX_HELD(&vc->vc_lock));
+
+ if (zfs_vdev_cache_size == 0)
+ return (NULL);
+
+ /*
+ * If adding a new entry would exceed the cache size,
+ * evict the oldest entry (LRU).
+ */
+ if ((avl_numnodes(&vc->vc_lastused_tree) << zfs_vdev_cache_bshift) >
+ zfs_vdev_cache_size) {
+ ve = avl_first(&vc->vc_lastused_tree);
+ if (ve->ve_fill_io != NULL)
+ return (NULL);
+ ASSERT3U(ve->ve_hits, !=, 0);
+ vdev_cache_evict(vc, ve);
+ }
+
+ ve = kmem_zalloc(sizeof (vdev_cache_entry_t), KM_SLEEP);
+ ve->ve_offset = offset;
+ ve->ve_lastused = ddi_get_lbolt();
+ ve->ve_abd = abd_alloc_for_io(VCBS, B_TRUE);
+
+ avl_add(&vc->vc_offset_tree, ve);
+ avl_add(&vc->vc_lastused_tree, ve);
+
+ return (ve);
+}
+
+static void
+vdev_cache_hit(vdev_cache_t *vc, vdev_cache_entry_t *ve, zio_t *zio)
+{
+ uint64_t cache_phase = P2PHASE(zio->io_offset, VCBS);
+
+ ASSERT(MUTEX_HELD(&vc->vc_lock));
+ ASSERT3P(ve->ve_fill_io, ==, NULL);
+
+ if (ve->ve_lastused != ddi_get_lbolt()) {
+ avl_remove(&vc->vc_lastused_tree, ve);
+ ve->ve_lastused = ddi_get_lbolt();
+ avl_add(&vc->vc_lastused_tree, ve);
+ }
+
+ ve->ve_hits++;
+ abd_copy_off(zio->io_abd, ve->ve_abd, 0, cache_phase, zio->io_size);
+}
+
+/*
+ * Fill a previously allocated cache entry with data.
+ */
+static void
+vdev_cache_fill(zio_t *fio)
+{
+ vdev_t *vd = fio->io_vd;
+ vdev_cache_t *vc = &vd->vdev_cache;
+ vdev_cache_entry_t *ve = fio->io_private;
+ zio_t *pio;
+
+ ASSERT3U(fio->io_size, ==, VCBS);
+
+ /*
+ * Add data to the cache.
+ */
+ mutex_enter(&vc->vc_lock);
+
+ ASSERT3P(ve->ve_fill_io, ==, fio);
+ ASSERT3U(ve->ve_offset, ==, fio->io_offset);
+ ASSERT3P(ve->ve_abd, ==, fio->io_abd);
+
+ ve->ve_fill_io = NULL;
+
+ /*
+ * Even if this cache line was invalidated by a missed write update,
+ * any reads that were queued up before the missed update are still
+ * valid, so we can satisfy them from this line before we evict it.
+ */
+ zio_link_t *zl = NULL;
+ while ((pio = zio_walk_parents(fio, &zl)) != NULL)
+ vdev_cache_hit(vc, ve, pio);
+
+ if (fio->io_error || ve->ve_missed_update)
+ vdev_cache_evict(vc, ve);
+
+ mutex_exit(&vc->vc_lock);
+}
+
+/*
+ * Read data from the cache. Returns B_TRUE on a cache hit, B_FALSE on a miss.
+ */
+boolean_t
+vdev_cache_read(zio_t *zio)
+{
+ vdev_cache_t *vc = &zio->io_vd->vdev_cache;
+ vdev_cache_entry_t *ve, ve_search;
+ uint64_t cache_offset = P2ALIGN(zio->io_offset, VCBS);
+ uint64_t cache_phase = P2PHASE(zio->io_offset, VCBS);
+ zio_t *fio;
+
+ ASSERT3U(zio->io_type, ==, ZIO_TYPE_READ);
+
+ if (zio->io_flags & ZIO_FLAG_DONT_CACHE)
+ return (B_FALSE);
+
+ if (zio->io_size > zfs_vdev_cache_max)
+ return (B_FALSE);
+
+ /*
+ * If the I/O straddles two or more cache blocks, don't cache it.
+ */
+ if (P2BOUNDARY(zio->io_offset, zio->io_size, VCBS))
+ return (B_FALSE);
+
+ ASSERT3U(cache_phase + zio->io_size, <=, VCBS);
+
+ mutex_enter(&vc->vc_lock);
+
+ ve_search.ve_offset = cache_offset;
+ ve = avl_find(&vc->vc_offset_tree, &ve_search, NULL);
+
+ if (ve != NULL) {
+ if (ve->ve_missed_update) {
+ mutex_exit(&vc->vc_lock);
+ return (B_FALSE);
+ }
+
+ if ((fio = ve->ve_fill_io) != NULL) {
+ zio_vdev_io_bypass(zio);
+ zio_add_child(zio, fio);
+ mutex_exit(&vc->vc_lock);
+ VDCSTAT_BUMP(vdc_stat_delegations);
+ return (B_TRUE);
+ }
+
+ vdev_cache_hit(vc, ve, zio);
+ zio_vdev_io_bypass(zio);
+
+ mutex_exit(&vc->vc_lock);
+ VDCSTAT_BUMP(vdc_stat_hits);
+ return (B_TRUE);
+ }
+
+ ve = vdev_cache_allocate(zio);
+
+ if (ve == NULL) {
+ mutex_exit(&vc->vc_lock);
+ return (B_FALSE);
+ }
+
+ fio = zio_vdev_delegated_io(zio->io_vd, cache_offset,
+ ve->ve_abd, VCBS, ZIO_TYPE_READ, ZIO_PRIORITY_NOW,
+ ZIO_FLAG_DONT_CACHE, vdev_cache_fill, ve);
+
+ ve->ve_fill_io = fio;
+ zio_vdev_io_bypass(zio);
+ zio_add_child(zio, fio);
+
+ mutex_exit(&vc->vc_lock);
+ zio_nowait(fio);
+ VDCSTAT_BUMP(vdc_stat_misses);
+
+ return (B_TRUE);
+}
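+
+/*
+ * Illustrative worked example for the P2BOUNDARY straddle check above (not
+ * part of the original source): with VCBS == 64K, an 8K read at offset 60K
+ * crosses the 64K line boundary and therefore bypasses the cache entirely;
+ * the same 8K read at offset 48K fits within a single line and is eligible.
+ */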
+
+/*
+ * Update cache contents upon write completion.
+ */
+void
+vdev_cache_write(zio_t *zio)
+{
+ vdev_cache_t *vc = &zio->io_vd->vdev_cache;
+ vdev_cache_entry_t *ve, ve_search;
+ uint64_t io_start = zio->io_offset;
+ uint64_t io_end = io_start + zio->io_size;
+ uint64_t min_offset = P2ALIGN(io_start, VCBS);
+ uint64_t max_offset = P2ROUNDUP(io_end, VCBS);
+ avl_index_t where;
+
+ ASSERT3U(zio->io_type, ==, ZIO_TYPE_WRITE);
+
+ mutex_enter(&vc->vc_lock);
+
+ ve_search.ve_offset = min_offset;
+ ve = avl_find(&vc->vc_offset_tree, &ve_search, &where);
+
+ if (ve == NULL)
+ ve = avl_nearest(&vc->vc_offset_tree, where, AVL_AFTER);
+
+ while (ve != NULL && ve->ve_offset < max_offset) {
+ uint64_t start = MAX(ve->ve_offset, io_start);
+ uint64_t end = MIN(ve->ve_offset + VCBS, io_end);
+
+ if (ve->ve_fill_io != NULL) {
+ ve->ve_missed_update = 1;
+ } else {
+ abd_copy_off(ve->ve_abd, zio->io_abd,
+ start - ve->ve_offset, start - io_start,
+ end - start);
+ }
+ ve = AVL_NEXT(&vc->vc_offset_tree, ve);
+ }
+ mutex_exit(&vc->vc_lock);
+}
+
+void
+vdev_cache_purge(vdev_t *vd)
+{
+ vdev_cache_t *vc = &vd->vdev_cache;
+ vdev_cache_entry_t *ve;
+
+ mutex_enter(&vc->vc_lock);
+ while ((ve = avl_first(&vc->vc_offset_tree)) != NULL)
+ vdev_cache_evict(vc, ve);
+ mutex_exit(&vc->vc_lock);
+}
+
+void
+vdev_cache_init(vdev_t *vd)
+{
+ vdev_cache_t *vc = &vd->vdev_cache;
+
+ mutex_init(&vc->vc_lock, NULL, MUTEX_DEFAULT, NULL);
+
+ avl_create(&vc->vc_offset_tree, vdev_cache_offset_compare,
+ sizeof (vdev_cache_entry_t),
+ offsetof(struct vdev_cache_entry, ve_offset_node));
+
+ avl_create(&vc->vc_lastused_tree, vdev_cache_lastused_compare,
+ sizeof (vdev_cache_entry_t),
+ offsetof(struct vdev_cache_entry, ve_lastused_node));
+}
+
+void
+vdev_cache_fini(vdev_t *vd)
+{
+ vdev_cache_t *vc = &vd->vdev_cache;
+
+ vdev_cache_purge(vd);
+
+ avl_destroy(&vc->vc_offset_tree);
+ avl_destroy(&vc->vc_lastused_tree);
+
+ mutex_destroy(&vc->vc_lock);
+}
+
+void
+vdev_cache_stat_init(void)
+{
+ vdc_ksp = kstat_create("zfs", 0, "vdev_cache_stats", "misc",
+ KSTAT_TYPE_NAMED, sizeof (vdc_stats) / sizeof (kstat_named_t),
+ KSTAT_FLAG_VIRTUAL);
+ if (vdc_ksp != NULL) {
+ vdc_ksp->ks_data = &vdc_stats;
+ kstat_install(vdc_ksp);
+ }
+}
+
+void
+vdev_cache_stat_fini(void)
+{
+ if (vdc_ksp != NULL) {
+ kstat_delete(vdc_ksp);
+ vdc_ksp = NULL;
+ }
+}
diff --git a/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/vdev_disk.c b/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/vdev_disk.c
new file mode 100644
index 000000000000..2fe7b35f4fa0
--- /dev/null
+++ b/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/vdev_disk.c
@@ -0,0 +1,971 @@
+/*
+ * CDDL HEADER START
+ *
+ * The contents of this file are subject to the terms of the
+ * Common Development and Distribution License (the "License").
+ * You may not use this file except in compliance with the License.
+ *
+ * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
+ * or http://www.opensolaris.org/os/licensing.
+ * See the License for the specific language governing permissions
+ * and limitations under the License.
+ *
+ * When distributing Covered Code, include this CDDL HEADER in each
+ * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
+ * If applicable, add the following below this CDDL HEADER, with the
+ * fields enclosed by brackets "[]" replaced with your own identifying
+ * information: Portions Copyright [yyyy] [name of copyright owner]
+ *
+ * CDDL HEADER END
+ */
+/*
+ * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved.
+ * Copyright (c) 2012, 2018 by Delphix. All rights reserved.
+ * Copyright 2016 Nexenta Systems, Inc. All rights reserved.
+ * Copyright (c) 2013 Joyent, Inc. All rights reserved.
+ */
+
+#include <sys/zfs_context.h>
+#include <sys/spa_impl.h>
+#include <sys/refcount.h>
+#include <sys/vdev_disk.h>
+#include <sys/vdev_impl.h>
+#include <sys/abd.h>
+#include <sys/fs/zfs.h>
+#include <sys/zio.h>
+#include <sys/sunldi.h>
+#include <sys/efi_partition.h>
+#include <sys/fm/fs/zfs.h>
+
+/*
+ * Virtual device vector for disks.
+ */
+
+extern ldi_ident_t zfs_li;
+
+static void vdev_disk_close(vdev_t *);
+
+typedef struct vdev_disk_ldi_cb {
+ list_node_t lcb_next;
+ ldi_callback_id_t lcb_id;
+} vdev_disk_ldi_cb_t;
+
+/*
+ * Bypass the devid when opening a disk vdev.
+ * There have been issues where the devids of several devices were shuffled,
+ * causing pool open failures. Note, that this flag is intended to be used
+ * for pool recovery only.
+ *
+ * Note that if a pool is imported with the devids bypassed, all its vdevs
+ * will permanently stop storing devid information. In practice, the devid is
+ * rarely useful, as vdev paths do not tend to change unless the hardware is
+ * reconfigured. That said, if the paths do change and a pool fails to open
+ * automatically at boot, a simple zpool import should re-scan the paths and fix
+ * the issue.
+ */
+boolean_t vdev_disk_bypass_devid = B_FALSE;
+
+static void
+vdev_disk_alloc(vdev_t *vd)
+{
+ vdev_disk_t *dvd;
+
+ dvd = vd->vdev_tsd = kmem_zalloc(sizeof (vdev_disk_t), KM_SLEEP);
+ /*
+ * Create the LDI event callback list.
+ */
+ list_create(&dvd->vd_ldi_cbs, sizeof (vdev_disk_ldi_cb_t),
+ offsetof(vdev_disk_ldi_cb_t, lcb_next));
+}
+
+static void
+vdev_disk_free(vdev_t *vd)
+{
+ vdev_disk_t *dvd = vd->vdev_tsd;
+ vdev_disk_ldi_cb_t *lcb;
+
+ if (dvd == NULL)
+ return;
+
+ /*
+ * We have already closed the LDI handle. Clean up the LDI event
+ * callbacks and free vd->vdev_tsd.
+ */
+ while ((lcb = list_head(&dvd->vd_ldi_cbs)) != NULL) {
+ list_remove(&dvd->vd_ldi_cbs, lcb);
+ (void) ldi_ev_remove_callbacks(lcb->lcb_id);
+ kmem_free(lcb, sizeof (vdev_disk_ldi_cb_t));
+ }
+ list_destroy(&dvd->vd_ldi_cbs);
+ kmem_free(dvd, sizeof (vdev_disk_t));
+ vd->vdev_tsd = NULL;
+}
+
+/* ARGSUSED */
+static int
+vdev_disk_off_notify(ldi_handle_t lh, ldi_ev_cookie_t ecookie, void *arg,
+ void *ev_data)
+{
+ vdev_t *vd = (vdev_t *)arg;
+ vdev_disk_t *dvd = vd->vdev_tsd;
+
+ /*
+ * Ignore events other than offline.
+ */
+ if (strcmp(ldi_ev_get_type(ecookie), LDI_EV_OFFLINE) != 0)
+ return (LDI_EV_SUCCESS);
+
+ /*
+ * All LDI handles must be closed for the state change to succeed, so
+ * we call vdev_disk_close() to do this.
+ *
+ * We inform vdev_disk_close that it is being called from offline
+ * notify context so it will defer cleanup of LDI event callbacks and
+ * freeing of vd->vdev_tsd to the offline finalize or a reopen.
+ */
+ dvd->vd_ldi_offline = B_TRUE;
+ vdev_disk_close(vd);
+
+ /*
+ * Now that the device is closed, request that the spa_async_thread
+ * mark the device as REMOVED and notify FMA of the removal.
+ */
+ zfs_post_remove(vd->vdev_spa, vd);
+ vd->vdev_remove_wanted = B_TRUE;
+ spa_async_request(vd->vdev_spa, SPA_ASYNC_REMOVE);
+
+ return (LDI_EV_SUCCESS);
+}
+
+/* ARGSUSED */
+static void
+vdev_disk_off_finalize(ldi_handle_t lh, ldi_ev_cookie_t ecookie,
+ int ldi_result, void *arg, void *ev_data)
+{
+ vdev_t *vd = (vdev_t *)arg;
+
+ /*
+ * Ignore events other than offline.
+ */
+ if (strcmp(ldi_ev_get_type(ecookie), LDI_EV_OFFLINE) != 0)
+ return;
+
+ /*
+ * We have already closed the LDI handle in notify.
+ * Clean up the LDI event callbacks and free vd->vdev_tsd.
+ */
+ vdev_disk_free(vd);
+
+ /*
+ * Request that the vdev be reopened if the offline state change was
+ * unsuccessful.
+ */
+ if (ldi_result != LDI_EV_SUCCESS) {
+ vd->vdev_probe_wanted = B_TRUE;
+ spa_async_request(vd->vdev_spa, SPA_ASYNC_PROBE);
+ }
+}
+
+static ldi_ev_callback_t vdev_disk_off_callb = {
+ .cb_vers = LDI_EV_CB_VERS,
+ .cb_notify = vdev_disk_off_notify,
+ .cb_finalize = vdev_disk_off_finalize
+};
+
+/* ARGSUSED */
+static void
+vdev_disk_dgrd_finalize(ldi_handle_t lh, ldi_ev_cookie_t ecookie,
+ int ldi_result, void *arg, void *ev_data)
+{
+ vdev_t *vd = (vdev_t *)arg;
+
+ /*
+ * Ignore events other than degrade.
+ */
+ if (strcmp(ldi_ev_get_type(ecookie), LDI_EV_DEGRADE) != 0)
+ return;
+
+ /*
+ * Degrade events always succeed. Mark the vdev as degraded.
+ * This status is purely informative for the user.
+ */
+ (void) vdev_degrade(vd->vdev_spa, vd->vdev_guid, 0);
+}
+
+static ldi_ev_callback_t vdev_disk_dgrd_callb = {
+ .cb_vers = LDI_EV_CB_VERS,
+ .cb_notify = NULL,
+ .cb_finalize = vdev_disk_dgrd_finalize
+};
+
+static void
+vdev_disk_hold(vdev_t *vd)
+{
+ ddi_devid_t devid;
+ char *minor;
+
+ ASSERT(spa_config_held(vd->vdev_spa, SCL_STATE, RW_WRITER));
+
+ /*
+ * We must have a pathname, and it must be absolute.
+ */
+ if (vd->vdev_path == NULL || vd->vdev_path[0] != '/')
+ return;
+
+ /*
+ * Only prefetch path and devid info if the device has
+ * never been opened.
+ */
+ if (vd->vdev_tsd != NULL)
+ return;
+
+ if (vd->vdev_wholedisk == -1ULL) {
+ size_t len = strlen(vd->vdev_path) + 3;
+ char *buf = kmem_alloc(len, KM_SLEEP);
+
+ (void) snprintf(buf, len, "%ss0", vd->vdev_path);
+
+ (void) ldi_vp_from_name(buf, &vd->vdev_name_vp);
+ kmem_free(buf, len);
+ }
+
+ if (vd->vdev_name_vp == NULL)
+ (void) ldi_vp_from_name(vd->vdev_path, &vd->vdev_name_vp);
+
+ if (vd->vdev_devid != NULL &&
+ ddi_devid_str_decode(vd->vdev_devid, &devid, &minor) == 0) {
+ (void) ldi_vp_from_devid(devid, minor, &vd->vdev_devid_vp);
+ ddi_devid_str_free(minor);
+ ddi_devid_free(devid);
+ }
+}
+
+static void
+vdev_disk_rele(vdev_t *vd)
+{
+ ASSERT(spa_config_held(vd->vdev_spa, SCL_STATE, RW_WRITER));
+
+ if (vd->vdev_name_vp) {
+ VN_RELE_ASYNC(vd->vdev_name_vp,
+ dsl_pool_vnrele_taskq(vd->vdev_spa->spa_dsl_pool));
+ vd->vdev_name_vp = NULL;
+ }
+ if (vd->vdev_devid_vp) {
+ VN_RELE_ASYNC(vd->vdev_devid_vp,
+ dsl_pool_vnrele_taskq(vd->vdev_spa->spa_dsl_pool));
+ vd->vdev_devid_vp = NULL;
+ }
+}
+
+/*
+ * We want to be loud in DEBUG kernels when DKIOCGMEDIAINFOEXT fails, or when
+ * even a fallback to DKIOCGMEDIAINFO fails.
+ */
+#ifdef DEBUG
+#define VDEV_DEBUG(...) cmn_err(CE_NOTE, __VA_ARGS__)
+#else
+#define VDEV_DEBUG(...) /* Nothing... */
+#endif
+
+static int
+vdev_disk_open(vdev_t *vd, uint64_t *psize, uint64_t *max_psize,
+ uint64_t *ashift)
+{
+ spa_t *spa = vd->vdev_spa;
+ vdev_disk_t *dvd = vd->vdev_tsd;
+ ldi_ev_cookie_t ecookie;
+ vdev_disk_ldi_cb_t *lcb;
+ union {
+ struct dk_minfo_ext ude;
+ struct dk_minfo ud;
+ } dks;
+ struct dk_minfo_ext *dkmext = &dks.ude;
+ struct dk_minfo *dkm = &dks.ud;
+ int error;
+ dev_t dev;
+ int otyp;
+ boolean_t validate_devid = B_FALSE;
+ ddi_devid_t devid;
+ uint64_t capacity = 0, blksz = 0, pbsize;
+
+ /*
+ * We must have a pathname, and it must be absolute.
+ */
+ if (vd->vdev_path == NULL || vd->vdev_path[0] != '/') {
+ vd->vdev_stat.vs_aux = VDEV_AUX_BAD_LABEL;
+ return (SET_ERROR(EINVAL));
+ }
+
+ /*
+ * Reopen the device if it's not currently open. Otherwise,
+ * just update the physical size of the device.
+ */
+ if (dvd != NULL) {
+ if (dvd->vd_ldi_offline && dvd->vd_lh == NULL) {
+ /*
+ * If we are opening a device in its offline notify
+ * context, the LDI handle was just closed. Clean
+ * up the LDI event callbacks and free vd->vdev_tsd.
+ */
+ vdev_disk_free(vd);
+ } else {
+ ASSERT(vd->vdev_reopening);
+ goto skip_open;
+ }
+ }
+
+ /*
+ * Create vd->vdev_tsd.
+ */
+ vdev_disk_alloc(vd);
+ dvd = vd->vdev_tsd;
+
+ /*
+ * Allow bypassing the devid.
+ */
+ if (vd->vdev_devid != NULL && vdev_disk_bypass_devid) {
+ vdev_dbgmsg(vd, "vdev_disk_open, devid %s bypassed",
+ vd->vdev_devid);
+ spa_strfree(vd->vdev_devid);
+ vd->vdev_devid = NULL;
+ }
+
+ /*
+ * When opening a disk device, we want to preserve the user's original
+ * intent. We always want to open the device by the path the user gave
+ * us, even if it is one of multiple paths to the same device. But we
+ * also want to be able to survive disks being removed/recabled.
+ * Therefore the sequence of opening devices is:
+ *
+ * 1. Try opening the device by path. For legacy pools without the
+ * 'whole_disk' property, attempt to fix the path by appending 's0'.
+ *
+ * 2. If the devid of the device matches the stored value, return
+ * success.
+ *
+ * 3. Otherwise, the device may have moved. Try opening the device
+ * by the devid instead.
+ */
+ if (vd->vdev_devid != NULL) {
+ if (ddi_devid_str_decode(vd->vdev_devid, &dvd->vd_devid,
+ &dvd->vd_minor) != 0) {
+ vd->vdev_stat.vs_aux = VDEV_AUX_BAD_LABEL;
+ vdev_dbgmsg(vd, "vdev_disk_open: invalid "
+ "vdev_devid '%s'", vd->vdev_devid);
+ return (SET_ERROR(EINVAL));
+ }
+ }
+
+ error = EINVAL; /* presume failure */
+
+ if (vd->vdev_path != NULL) {
+
+ if (vd->vdev_wholedisk == -1ULL) {
+ size_t len = strlen(vd->vdev_path) + 3;
+ char *buf = kmem_alloc(len, KM_SLEEP);
+
+ (void) snprintf(buf, len, "%ss0", vd->vdev_path);
+
+ error = ldi_open_by_name(buf, spa_mode(spa), kcred,
+ &dvd->vd_lh, zfs_li);
+ if (error == 0) {
+ spa_strfree(vd->vdev_path);
+ vd->vdev_path = buf;
+ vd->vdev_wholedisk = 1ULL;
+ } else {
+ kmem_free(buf, len);
+ }
+ }
+
+ /*
+ * If we have not yet opened the device, try to open it by the
+ * specified path.
+ */
+ if (error != 0) {
+ error = ldi_open_by_name(vd->vdev_path, spa_mode(spa),
+ kcred, &dvd->vd_lh, zfs_li);
+ }
+
+ /*
+ * Compare the devid to the stored value.
+ */
+ if (error == 0 && vd->vdev_devid != NULL &&
+ ldi_get_devid(dvd->vd_lh, &devid) == 0) {
+ if (ddi_devid_compare(devid, dvd->vd_devid) != 0) {
+ /*
+ * A mismatch here is unexpected, log it.
+ */
+ char *devid_str = ddi_devid_str_encode(devid,
+ dvd->vd_minor);
+ vdev_dbgmsg(vd, "vdev_disk_open: devid "
+ "mismatch: %s != %s", vd->vdev_devid,
+ devid_str);
+ cmn_err(CE_NOTE, "vdev_disk_open %s: devid "
+ "mismatch: %s != %s", vd->vdev_path,
+ vd->vdev_devid, devid_str);
+ ddi_devid_str_free(devid_str);
+
+ error = SET_ERROR(EINVAL);
+ (void) ldi_close(dvd->vd_lh, spa_mode(spa),
+ kcred);
+ dvd->vd_lh = NULL;
+ }
+ ddi_devid_free(devid);
+ }
+
+ /*
+ * If we succeeded in opening the device, but 'vdev_wholedisk'
+ * is not yet set, then this must be a slice.
+ */
+ if (error == 0 && vd->vdev_wholedisk == -1ULL)
+ vd->vdev_wholedisk = 0;
+ }
+
+ /*
+ * If we were unable to open by path, or the devid check fails, open by
+ * devid instead.
+ */
+ if (error != 0 && vd->vdev_devid != NULL) {
+ error = ldi_open_by_devid(dvd->vd_devid, dvd->vd_minor,
+ spa_mode(spa), kcred, &dvd->vd_lh, zfs_li);
+ if (error != 0) {
+ vdev_dbgmsg(vd, "Failed to open by devid (%s)",
+ vd->vdev_devid);
+ }
+ }
+
+ /*
+ * If all else fails, then try opening by physical path (if available)
+ * or the logical path (if we failed due to the devid check). While not
+ * as reliable as the devid, this will give us something, and the higher
+ * level vdev validation will prevent us from opening the wrong device.
+ */
+ if (error) {
+ if (vd->vdev_devid != NULL)
+ validate_devid = B_TRUE;
+
+ if (vd->vdev_physpath != NULL &&
+ (dev = ddi_pathname_to_dev_t(vd->vdev_physpath)) != NODEV)
+ error = ldi_open_by_dev(&dev, OTYP_BLK, spa_mode(spa),
+ kcred, &dvd->vd_lh, zfs_li);
+
+ /*
+ * Note that we don't support the legacy auto-wholedisk behavior
+ * as above. This hasn't been used in a very long time and we
+ * don't need to propagate its oddities to this edge condition.
+ */
+ if (error && vd->vdev_path != NULL)
+ error = ldi_open_by_name(vd->vdev_path, spa_mode(spa),
+ kcred, &dvd->vd_lh, zfs_li);
+ }
+
+ if (error) {
+ vd->vdev_stat.vs_aux = VDEV_AUX_OPEN_FAILED;
+ vdev_dbgmsg(vd, "vdev_disk_open: failed to open [error=%d]",
+ error);
+ return (error);
+ }
+
+ /*
+ * Now that the device has been successfully opened, update the devid
+ * if necessary.
+ */
+ if (validate_devid && spa_writeable(spa) &&
+ ldi_get_devid(dvd->vd_lh, &devid) == 0) {
+ if (ddi_devid_compare(devid, dvd->vd_devid) != 0) {
+ char *vd_devid;
+
+ vd_devid = ddi_devid_str_encode(devid, dvd->vd_minor);
+ vdev_dbgmsg(vd, "vdev_disk_open: update devid from "
+ "'%s' to '%s'", vd->vdev_devid, vd_devid);
+ cmn_err(CE_NOTE, "vdev_disk_open %s: update devid "
+ "from '%s' to '%s'", vd->vdev_path != NULL ?
+ vd->vdev_path : "?", vd->vdev_devid, vd_devid);
+ spa_strfree(vd->vdev_devid);
+ vd->vdev_devid = spa_strdup(vd_devid);
+ ddi_devid_str_free(vd_devid);
+ }
+ ddi_devid_free(devid);
+ }
+
+ /*
+ * Once a device is opened, verify that the physical device path (if
+ * available) is up to date.
+ */
+ if (ldi_get_dev(dvd->vd_lh, &dev) == 0 &&
+ ldi_get_otyp(dvd->vd_lh, &otyp) == 0) {
+ char *physpath, *minorname;
+
+ physpath = kmem_alloc(MAXPATHLEN, KM_SLEEP);
+ minorname = NULL;
+ if (ddi_dev_pathname(dev, otyp, physpath) == 0 &&
+ ldi_get_minor_name(dvd->vd_lh, &minorname) == 0 &&
+ (vd->vdev_physpath == NULL ||
+ strcmp(vd->vdev_physpath, physpath) != 0)) {
+ if (vd->vdev_physpath)
+ spa_strfree(vd->vdev_physpath);
+ (void) strlcat(physpath, ":", MAXPATHLEN);
+ (void) strlcat(physpath, minorname, MAXPATHLEN);
+ vd->vdev_physpath = spa_strdup(physpath);
+ }
+ if (minorname)
+ kmem_free(minorname, strlen(minorname) + 1);
+ kmem_free(physpath, MAXPATHLEN);
+ }
+
+ /*
+ * Register callbacks for the LDI offline event.
+ */
+ if (ldi_ev_get_cookie(dvd->vd_lh, LDI_EV_OFFLINE, &ecookie) ==
+ LDI_EV_SUCCESS) {
+ lcb = kmem_zalloc(sizeof (vdev_disk_ldi_cb_t), KM_SLEEP);
+ list_insert_tail(&dvd->vd_ldi_cbs, lcb);
+ (void) ldi_ev_register_callbacks(dvd->vd_lh, ecookie,
+ &vdev_disk_off_callb, (void *) vd, &lcb->lcb_id);
+ }
+
+ /*
+ * Register callbacks for the LDI degrade event.
+ */
+ if (ldi_ev_get_cookie(dvd->vd_lh, LDI_EV_DEGRADE, &ecookie) ==
+ LDI_EV_SUCCESS) {
+ lcb = kmem_zalloc(sizeof (vdev_disk_ldi_cb_t), KM_SLEEP);
+ list_insert_tail(&dvd->vd_ldi_cbs, lcb);
+ (void) ldi_ev_register_callbacks(dvd->vd_lh, ecookie,
+ &vdev_disk_dgrd_callb, (void *) vd, &lcb->lcb_id);
+ }
+skip_open:
+ /*
+ * Determine the actual size of the device.
+ */
+ if (ldi_get_size(dvd->vd_lh, psize) != 0) {
+ vd->vdev_stat.vs_aux = VDEV_AUX_OPEN_FAILED;
+ vdev_dbgmsg(vd, "vdev_disk_open: failed to get size");
+ return (SET_ERROR(EINVAL));
+ }
+
+ *max_psize = *psize;
+
+ /*
+ * Determine the device's minimum transfer size.
+ * If the ioctl isn't supported, assume DEV_BSIZE.
+ */
+ if ((error = ldi_ioctl(dvd->vd_lh, DKIOCGMEDIAINFOEXT,
+ (intptr_t)dkmext, FKIOCTL, kcred, NULL)) == 0) {
+ capacity = dkmext->dki_capacity - 1;
+ blksz = dkmext->dki_lbsize;
+ pbsize = dkmext->dki_pbsize;
+ } else if ((error = ldi_ioctl(dvd->vd_lh, DKIOCGMEDIAINFO,
+ (intptr_t)dkm, FKIOCTL, kcred, NULL)) == 0) {
+ VDEV_DEBUG(
+ "vdev_disk_open(\"%s\"): fallback to DKIOCGMEDIAINFO\n",
+ vd->vdev_path);
+ capacity = dkm->dki_capacity - 1;
+ blksz = dkm->dki_lbsize;
+ pbsize = blksz;
+ } else {
+ VDEV_DEBUG("vdev_disk_open(\"%s\"): "
+ "both DKIOCGMEDIAINFO{,EXT} calls failed, %d\n",
+ vd->vdev_path, error);
+ pbsize = DEV_BSIZE;
+ }
+
+ *ashift = highbit64(MAX(pbsize, SPA_MINBLOCKSIZE)) - 1;
+
+ if (vd->vdev_wholedisk == 1) {
+ int wce = 1;
+
+ if (error == 0) {
+ /*
+ * If we have the capability to expand, we'd have
+ * found out via success from DKIOCGMEDIAINFO{,EXT}.
+ * Adjust max_psize upward accordingly since we know
+ * we own the whole disk now.
+ */
+ *max_psize = capacity * blksz;
+ }
+
+ /*
+ * Since we own the whole disk, try to enable disk write
+ * caching. We ignore errors because it's OK if we can't do it.
+ */
+ (void) ldi_ioctl(dvd->vd_lh, DKIOCSETWCE, (intptr_t)&wce,
+ FKIOCTL, kcred, NULL);
+ }
+
+ /*
+ * Clear the nowritecache bit, so that on a vdev_reopen() we will
+ * try again.
+ */
+ vd->vdev_nowritecache = B_FALSE;
+
+ return (0);
+}
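+
+/*
+ * Illustrative worked example for the ashift computation above (not part
+ * of the original source): a 4Kn disk reports dki_pbsize == 4096, so
+ * ashift = highbit64(MAX(4096, SPA_MINBLOCKSIZE)) - 1 == 13 - 1 == 12,
+ * i.e. a 2^12 == 4096-byte minimum allocation unit. A 512-byte-sector
+ * disk yields ashift 9.
+ */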
+
+static void
+vdev_disk_close(vdev_t *vd)
+{
+ vdev_disk_t *dvd = vd->vdev_tsd;
+
+ if (vd->vdev_reopening || dvd == NULL)
+ return;
+
+ if (dvd->vd_minor != NULL) {
+ ddi_devid_str_free(dvd->vd_minor);
+ dvd->vd_minor = NULL;
+ }
+
+ if (dvd->vd_devid != NULL) {
+ ddi_devid_free(dvd->vd_devid);
+ dvd->vd_devid = NULL;
+ }
+
+ if (dvd->vd_lh != NULL) {
+ (void) ldi_close(dvd->vd_lh, spa_mode(vd->vdev_spa), kcred);
+ dvd->vd_lh = NULL;
+ }
+
+ vd->vdev_delayed_close = B_FALSE;
+ /*
+ * If we closed the LDI handle due to an offline notify from LDI,
+ * don't free vd->vdev_tsd or unregister the callbacks here;
+ * the offline finalize callback or a reopen will take care of it.
+ */
+ if (dvd->vd_ldi_offline)
+ return;
+
+ vdev_disk_free(vd);
+}
+
+int
+vdev_disk_physio(vdev_t *vd, caddr_t data,
+ size_t size, uint64_t offset, int flags, boolean_t isdump)
+{
+ vdev_disk_t *dvd = vd->vdev_tsd;
+
+ /*
+ * If the vdev is closed, it's likely in the REMOVED or FAULTED state.
+ * Nothing to be done here but return failure.
+ */
+ if (dvd == NULL || (dvd->vd_ldi_offline && dvd->vd_lh == NULL))
+ return (EIO);
+
+ ASSERT(vd->vdev_ops == &vdev_disk_ops);
+
+ /*
+ * If in the context of an active crash dump, use the ldi_dump(9F)
+ * call instead of ldi_strategy(9F) as usual.
+ */
+ if (isdump) {
+ ASSERT3P(dvd, !=, NULL);
+ return (ldi_dump(dvd->vd_lh, data, lbtodb(offset),
+ lbtodb(size)));
+ }
+
+ return (vdev_disk_ldi_physio(dvd->vd_lh, data, size, offset, flags));
+}
+
+int
+vdev_disk_ldi_physio(ldi_handle_t vd_lh, caddr_t data,
+ size_t size, uint64_t offset, int flags)
+{
+ buf_t *bp;
+ int error = 0;
+
+ if (vd_lh == NULL)
+ return (SET_ERROR(EINVAL));
+
+ ASSERT(flags & B_READ || flags & B_WRITE);
+
+ bp = getrbuf(KM_SLEEP);
+ bp->b_flags = flags | B_BUSY | B_NOCACHE | B_FAILFAST;
+ bp->b_bcount = size;
+ bp->b_un.b_addr = (void *)data;
+ bp->b_lblkno = lbtodb(offset);
+ bp->b_bufsize = size;
+
+ error = ldi_strategy(vd_lh, bp);
+ ASSERT(error == 0);
+ if ((error = biowait(bp)) == 0 && bp->b_resid != 0)
+ error = SET_ERROR(EIO);
+ freerbuf(bp);
+
+ return (error);
+}
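+
+/*
+ * Illustrative sketch of a raw synchronous read using the helper above
+ * (not part of the original source; label_offset is a hypothetical
+ * caller-chosen offset -- vdev_disk_read_rootlabel() below uses the same
+ * pattern for each label):
+ *
+ * char *buf = kmem_alloc(VDEV_PHYS_SIZE, KM_SLEEP);
+ * int err = vdev_disk_ldi_physio(vd_lh, buf, VDEV_PHYS_SIZE,
+ *     label_offset, B_READ);
+ */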
+
+static void
+vdev_disk_io_intr(buf_t *bp)
+{
+ vdev_buf_t *vb = (vdev_buf_t *)bp;
+ zio_t *zio = vb->vb_io;
+
+ /*
+ * The rest of the zio stack only deals with EIO, ECKSUM, and ENXIO.
+ * Rather than teach the rest of the stack about other error
+ * possibilities (EFAULT, etc), we normalize the error value here.
+ */
+ zio->io_error = (geterror(bp) != 0 ? SET_ERROR(EIO) : 0);
+
+ if (zio->io_error == 0 && bp->b_resid != 0)
+ zio->io_error = SET_ERROR(EIO);
+
+ if (zio->io_type == ZIO_TYPE_READ) {
+ abd_return_buf_copy(zio->io_abd, bp->b_un.b_addr, zio->io_size);
+ } else {
+ abd_return_buf(zio->io_abd, bp->b_un.b_addr, zio->io_size);
+ }
+
+ kmem_free(vb, sizeof (vdev_buf_t));
+
+ zio_delay_interrupt(zio);
+}
+
+static void
+vdev_disk_ioctl_free(zio_t *zio)
+{
+ kmem_free(zio->io_vsd, sizeof (struct dk_callback));
+}
+
+static const zio_vsd_ops_t vdev_disk_vsd_ops = {
+ vdev_disk_ioctl_free,
+ zio_vsd_default_cksum_report
+};
+
+static void
+vdev_disk_ioctl_done(void *zio_arg, int error)
+{
+ zio_t *zio = zio_arg;
+
+ zio->io_error = error;
+
+ zio_interrupt(zio);
+}
+
+static void
+vdev_disk_io_start(zio_t *zio)
+{
+ vdev_t *vd = zio->io_vd;
+ vdev_disk_t *dvd = vd->vdev_tsd;
+ vdev_buf_t *vb;
+ struct dk_callback *dkc;
+ buf_t *bp;
+ int error;
+
+ /*
+ * If the vdev is closed, it's likely in the REMOVED or FAULTED state.
+ * Nothing to be done here but return failure.
+ */
+ if (dvd == NULL || (dvd->vd_ldi_offline && dvd->vd_lh == NULL)) {
+ zio->io_error = SET_ERROR(ENXIO);
+ zio_interrupt(zio);
+ return;
+ }
+
+ if (zio->io_type == ZIO_TYPE_IOCTL) {
+ /* XXPOLICY */
+ if (!vdev_readable(vd)) {
+ zio->io_error = SET_ERROR(ENXIO);
+ zio_interrupt(zio);
+ return;
+ }
+
+ switch (zio->io_cmd) {
+
+ case DKIOCFLUSHWRITECACHE:
+
+ if (zfs_nocacheflush)
+ break;
+
+ if (vd->vdev_nowritecache) {
+ zio->io_error = SET_ERROR(ENOTSUP);
+ break;
+ }
+
+ zio->io_vsd = dkc = kmem_alloc(sizeof (*dkc), KM_SLEEP);
+ zio->io_vsd_ops = &vdev_disk_vsd_ops;
+
+ dkc->dkc_callback = vdev_disk_ioctl_done;
+ dkc->dkc_flag = FLUSH_VOLATILE;
+ dkc->dkc_cookie = zio;
+
+ error = ldi_ioctl(dvd->vd_lh, zio->io_cmd,
+ (uintptr_t)dkc, FKIOCTL, kcred, NULL);
+
+ if (error == 0) {
+ /*
+ * The ioctl will be done asynchronously,
+ * and will call vdev_disk_ioctl_done()
+ * upon completion.
+ */
+ return;
+ }
+
+ zio->io_error = error;
+
+ break;
+
+ default:
+ zio->io_error = SET_ERROR(ENOTSUP);
+ }
+
+ zio_execute(zio);
+ return;
+ }
+
+ ASSERT(zio->io_type == ZIO_TYPE_READ || zio->io_type == ZIO_TYPE_WRITE);
+ zio->io_target_timestamp = zio_handle_io_delay(zio);
+
+ vb = kmem_alloc(sizeof (vdev_buf_t), KM_SLEEP);
+
+ vb->vb_io = zio;
+ bp = &vb->vb_buf;
+
+ bioinit(bp);
+ bp->b_flags = B_BUSY | B_NOCACHE |
+ (zio->io_type == ZIO_TYPE_READ ? B_READ : B_WRITE);
+ if (!(zio->io_flags & (ZIO_FLAG_IO_RETRY | ZIO_FLAG_TRYHARD)))
+ bp->b_flags |= B_FAILFAST;
+ bp->b_bcount = zio->io_size;
+
+ if (zio->io_type == ZIO_TYPE_READ) {
+ bp->b_un.b_addr =
+ abd_borrow_buf(zio->io_abd, zio->io_size);
+ } else {
+ bp->b_un.b_addr =
+ abd_borrow_buf_copy(zio->io_abd, zio->io_size);
+ }
+
+ bp->b_lblkno = lbtodb(zio->io_offset);
+ bp->b_bufsize = zio->io_size;
+ bp->b_iodone = (int (*)())vdev_disk_io_intr;
+
+ /* ldi_strategy() will return non-zero only on programming errors */
+ VERIFY(ldi_strategy(dvd->vd_lh, bp) == 0);
+}
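+
+/*
+ * Illustrative note (not part of the original source): B_FAILFAST is
+ * deliberately withheld from retried and try-hard I/Os above; this pairs
+ * with vdev_stat_update(), which ignores EIO on I/Os lacking
+ * ZIO_FLAG_IO_RETRY so that fail-fast errors are not misread as hard
+ * errors.
+ */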
+
+static void
+vdev_disk_io_done(zio_t *zio)
+{
+ vdev_t *vd = zio->io_vd;
+
+ /*
+ * If the device returned EIO, then attempt a DKIOCSTATE ioctl to see if
+ * the device has been removed. If this is the case, then we trigger an
+ * asynchronous removal of the device. Otherwise, probe the device and
+ * make sure it's still accessible.
+ */
+ if (zio->io_error == EIO && !vd->vdev_remove_wanted) {
+ vdev_disk_t *dvd = vd->vdev_tsd;
+ int state = DKIO_NONE;
+
+ if (ldi_ioctl(dvd->vd_lh, DKIOCSTATE, (intptr_t)&state,
+ FKIOCTL, kcred, NULL) == 0 && state != DKIO_INSERTED) {
+ /*
+ * We post the resource as soon as possible, instead of
+ * when the async removal actually happens, because the
+ * DE is using this information to discard previous I/O
+ * errors.
+ */
+ zfs_post_remove(zio->io_spa, vd);
+ vd->vdev_remove_wanted = B_TRUE;
+ spa_async_request(zio->io_spa, SPA_ASYNC_REMOVE);
+ } else if (!vd->vdev_delayed_close) {
+ vd->vdev_delayed_close = B_TRUE;
+ }
+ }
+}
+
+vdev_ops_t vdev_disk_ops = {
+ vdev_disk_open,
+ vdev_disk_close,
+ vdev_default_asize,
+ vdev_disk_io_start,
+ vdev_disk_io_done,
+ NULL,
+ NULL,
+ vdev_disk_hold,
+ vdev_disk_rele,
+ NULL,
+ vdev_default_xlate,
+ VDEV_TYPE_DISK, /* name of this vdev type */
+ B_TRUE /* leaf vdev */
+};
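+
+/*
+ * Illustrative note (not part of the original source; assumes the
+ * vdev_ops_t field order declared in vdev_impl.h): the three NULL slots
+ * above would be vdev_op_state_change, vdev_op_need_resilver and
+ * vdev_op_remap, none of which disk vdevs implement.
+ */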
+
+/*
+ * Given the root disk device devid or pathname, read the label from
+ * the device, and construct a configuration nvlist.
+ */
+int
+vdev_disk_read_rootlabel(char *devpath, char *devid, nvlist_t **config)
+{
+ ldi_handle_t vd_lh;
+ vdev_label_t *label;
+ uint64_t s, size;
+ int l;
+ ddi_devid_t tmpdevid;
+ int error = -1;
+ char *minor_name;
+
+ /*
+ * Read the device label and build the nvlist.
+ */
+ if (devid != NULL && ddi_devid_str_decode(devid, &tmpdevid,
+ &minor_name) == 0) {
+ error = ldi_open_by_devid(tmpdevid, minor_name,
+ FREAD, kcred, &vd_lh, zfs_li);
+ ddi_devid_free(tmpdevid);
+ ddi_devid_str_free(minor_name);
+ }
+
+ if (error && (error = ldi_open_by_name(devpath, FREAD, kcred, &vd_lh,
+ zfs_li)))
+ return (error);
+
+ if (ldi_get_size(vd_lh, &s)) {
+ (void) ldi_close(vd_lh, FREAD, kcred);
+ return (SET_ERROR(EIO));
+ }
+
+ size = P2ALIGN_TYPED(s, sizeof (vdev_label_t), uint64_t);
+ label = kmem_alloc(sizeof (vdev_label_t), KM_SLEEP);
+
+ *config = NULL;
+ for (l = 0; l < VDEV_LABELS; l++) {
+ uint64_t offset, state, txg = 0;
+
+ /* read vdev label */
+ offset = vdev_label_offset(size, l, 0);
+ if (vdev_disk_ldi_physio(vd_lh, (caddr_t)label,
+ VDEV_SKIP_SIZE + VDEV_PHYS_SIZE, offset, B_READ) != 0)
+ continue;
+
+ if (nvlist_unpack(label->vl_vdev_phys.vp_nvlist,
+ sizeof (label->vl_vdev_phys.vp_nvlist), config, 0) != 0) {
+ *config = NULL;
+ continue;
+ }
+
+ if (nvlist_lookup_uint64(*config, ZPOOL_CONFIG_POOL_STATE,
+ &state) != 0 || state >= POOL_STATE_DESTROYED) {
+ nvlist_free(*config);
+ *config = NULL;
+ continue;
+ }
+
+ if (nvlist_lookup_uint64(*config, ZPOOL_CONFIG_POOL_TXG,
+ &txg) != 0 || txg == 0) {
+ nvlist_free(*config);
+ *config = NULL;
+ continue;
+ }
+
+ break;
+ }
+
+ kmem_free(label, sizeof (vdev_label_t));
+ (void) ldi_close(vd_lh, FREAD, kcred);
+ if (*config == NULL)
+ error = SET_ERROR(EIDRM);
+
+ return (error);
+}
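+
+/*
+ * Illustrative sketch of a typical caller (not part of the original
+ * source; the device path is hypothetical). The caller owns the returned
+ * nvlist:
+ *
+ * nvlist_t *config;
+ * if (vdev_disk_read_rootlabel("/dev/dsk/c0t0d0s0", NULL, &config) == 0) {
+ *     ...use config...
+ *     nvlist_free(config);
+ * }
+ */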
diff --git a/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/vdev_file.c b/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/vdev_file.c
new file mode 100644
index 000000000000..aa80e028f7df
--- /dev/null
+++ b/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/vdev_file.c
@@ -0,0 +1,307 @@
+/*
+ * CDDL HEADER START
+ *
+ * The contents of this file are subject to the terms of the
+ * Common Development and Distribution License (the "License").
+ * You may not use this file except in compliance with the License.
+ *
+ * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
+ * or http://www.opensolaris.org/os/licensing.
+ * See the License for the specific language governing permissions
+ * and limitations under the License.
+ *
+ * When distributing Covered Code, include this CDDL HEADER in each
+ * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
+ * If applicable, add the following below this CDDL HEADER, with the
+ * fields enclosed by brackets "[]" replaced with your own identifying
+ * information: Portions Copyright [yyyy] [name of copyright owner]
+ *
+ * CDDL HEADER END
+ */
+/*
+ * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved.
+ * Copyright (c) 2011, 2016 by Delphix. All rights reserved.
+ */
+
+#include <sys/zfs_context.h>
+#include <sys/spa.h>
+#include <sys/vdev_file.h>
+#include <sys/vdev_impl.h>
+#include <sys/zio.h>
+#include <sys/fs/zfs.h>
+#include <sys/fm/fs/zfs.h>
+#include <sys/abd.h>
+
+/*
+ * Virtual device vector for files.
+ */
+
+static taskq_t *vdev_file_taskq;
+
+void
+vdev_file_init(void)
+{
+ vdev_file_taskq = taskq_create("z_vdev_file", MAX(max_ncpus, 16),
+ minclsyspri, max_ncpus, INT_MAX, 0);
+}
+
+void
+vdev_file_fini(void)
+{
+ taskq_destroy(vdev_file_taskq);
+}
+
+static void
+vdev_file_hold(vdev_t *vd)
+{
+ ASSERT(vd->vdev_path != NULL);
+}
+
+static void
+vdev_file_rele(vdev_t *vd)
+{
+ ASSERT(vd->vdev_path != NULL);
+}
+
+static int
+vdev_file_open(vdev_t *vd, uint64_t *psize, uint64_t *max_psize,
+ uint64_t *logical_ashift, uint64_t *physical_ashift)
+{
+ vdev_file_t *vf;
+ vnode_t *vp;
+ vattr_t vattr;
+ int error;
+
+ /* Rotational optimizations only make sense on block devices */
+ vd->vdev_nonrot = B_TRUE;
+
+ /*
+ * We must have a pathname, and it must be absolute.
+ */
+ if (vd->vdev_path == NULL || vd->vdev_path[0] != '/') {
+ vd->vdev_stat.vs_aux = VDEV_AUX_BAD_LABEL;
+ return (SET_ERROR(EINVAL));
+ }
+
+ /*
+ * Reopen the device if it's not currently open. Otherwise,
+ * just update the physical size of the device.
+ */
+ if (vd->vdev_tsd != NULL) {
+ ASSERT(vd->vdev_reopening);
+ vf = vd->vdev_tsd;
+ vp = vf->vf_vnode;
+ goto skip_open;
+ }
+
+ vf = vd->vdev_tsd = kmem_zalloc(sizeof (vdev_file_t), KM_SLEEP);
+
+ /*
+ * We always open the files from the root of the global zone, even if
+ * we're in a local zone. If the user has gotten to this point, the
+ * administrator has already decided that the pool should be available
+ * to local zone users, so the underlying devices should be as well.
+ */
+ ASSERT(vd->vdev_path != NULL && vd->vdev_path[0] == '/');
+ error = vn_openat(vd->vdev_path + 1, UIO_SYSSPACE,
+ spa_mode(vd->vdev_spa) | FOFFMAX, 0, &vp, 0, 0, rootdir, -1);
+
+ if (error) {
+ vd->vdev_stat.vs_aux = VDEV_AUX_OPEN_FAILED;
+ kmem_free(vd->vdev_tsd, sizeof (vdev_file_t));
+ vd->vdev_tsd = NULL;
+ return (error);
+ }
+
+ vf->vf_vnode = vp;
+
+#ifdef _KERNEL
+ /*
+ * Make sure it's a regular file.
+ */
+ if (vp->v_type != VREG) {
+#ifdef __FreeBSD__
+ (void) VOP_CLOSE(vp, spa_mode(vd->vdev_spa), 1, 0, kcred, NULL);
+#endif
+ vd->vdev_stat.vs_aux = VDEV_AUX_OPEN_FAILED;
+#ifdef __FreeBSD__
+ kmem_free(vd->vdev_tsd, sizeof (vdev_file_t));
+ vd->vdev_tsd = NULL;
+#endif
+ return (SET_ERROR(ENODEV));
+ }
+#endif /* _KERNEL */
+
+skip_open:
+ /*
+ * Determine the physical size of the file.
+ */
+ vattr.va_mask = AT_SIZE;
+ vn_lock(vp, LK_SHARED | LK_RETRY);
+ error = VOP_GETATTR(vp, &vattr, kcred);
+ VOP_UNLOCK(vp);
+ if (error) {
+ (void) VOP_CLOSE(vp, spa_mode(vd->vdev_spa), 1, 0, kcred, NULL);
+ vd->vdev_stat.vs_aux = VDEV_AUX_OPEN_FAILED;
+ kmem_free(vd->vdev_tsd, sizeof (vdev_file_t));
+ vd->vdev_tsd = NULL;
+ return (error);
+ }
+
+ vd->vdev_notrim = B_TRUE;
+
+ *max_psize = *psize = vattr.va_size;
+ *logical_ashift = SPA_MINBLOCKSHIFT;
+ *physical_ashift = SPA_MINBLOCKSHIFT;
+
+ return (0);
+}
+
+static void
+vdev_file_close(vdev_t *vd)
+{
+ vdev_file_t *vf = vd->vdev_tsd;
+
+ if (vd->vdev_reopening || vf == NULL)
+ return;
+
+ if (vf->vf_vnode != NULL) {
+ (void) VOP_CLOSE(vf->vf_vnode, spa_mode(vd->vdev_spa), 1, 0,
+ kcred, NULL);
+ }
+
+ vd->vdev_delayed_close = B_FALSE;
+ kmem_free(vf, sizeof (vdev_file_t));
+ vd->vdev_tsd = NULL;
+}
+
+/*
+ * Implements the interrupt side for file vdev types. This routine will be
+ * called when the I/O completes, allowing us to transfer the I/O to the
+ * interrupt taskqs. For consistency, the code structure mimics disk vdev
+ * types.
+ */
+static void
+vdev_file_io_intr(zio_t *zio)
+{
+ zio_delay_interrupt(zio);
+}
+
+static void
+vdev_file_io_strategy(void *arg)
+{
+ zio_t *zio = arg;
+ vdev_t *vd = zio->io_vd;
+ vdev_file_t *vf;
+ vnode_t *vp;
+ void *addr;
+ ssize_t resid;
+
+ vf = vd->vdev_tsd;
+ vp = vf->vf_vnode;
+
+ ASSERT(zio->io_type == ZIO_TYPE_READ || zio->io_type == ZIO_TYPE_WRITE);
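+ /*
+ * Borrow a linear buffer from the ABD: reads borrow an empty buffer
+ * and copy the result back into the ABD on return, while writes
+ * borrow a copy of the ABD's contents and discard it on return.
+ */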
+ if (zio->io_type == ZIO_TYPE_READ) {
+ addr = abd_borrow_buf(zio->io_abd, zio->io_size);
+ } else {
+ addr = abd_borrow_buf_copy(zio->io_abd, zio->io_size);
+ }
+
+ zio->io_error = vn_rdwr(zio->io_type == ZIO_TYPE_READ ?
+ UIO_READ : UIO_WRITE, vp, addr, zio->io_size,
+ zio->io_offset, UIO_SYSSPACE, 0, RLIM64_INFINITY, kcred, &resid);
+
+ if (zio->io_type == ZIO_TYPE_READ) {
+ abd_return_buf_copy(zio->io_abd, addr, zio->io_size);
+ } else {
+ abd_return_buf(zio->io_abd, addr, zio->io_size);
+ }
+
+ if (resid != 0 && zio->io_error == 0)
+ zio->io_error = SET_ERROR(ENOSPC);
+
+ vdev_file_io_intr(zio);
+}
+
+static void
+vdev_file_io_start(zio_t *zio)
+{
+ vdev_t *vd = zio->io_vd;
+ vdev_file_t *vf = vd->vdev_tsd;
+
+ if (zio->io_type == ZIO_TYPE_IOCTL) {
+ /* XXPOLICY */
+ if (!vdev_readable(vd)) {
+ zio->io_error = SET_ERROR(ENXIO);
+ zio_interrupt(zio);
+ return;
+ }
+
+ switch (zio->io_cmd) {
+ case DKIOCFLUSHWRITECACHE:
+
+ if (zfs_nocacheflush)
+ break;
+
+ zio->io_error = VOP_FSYNC(vf->vf_vnode, FSYNC | FDSYNC,
+ kcred, NULL);
+ break;
+ default:
+ zio->io_error = SET_ERROR(ENOTSUP);
+ }
+
+ zio_execute(zio);
+ return;
+ }
+
+ ASSERT(zio->io_type == ZIO_TYPE_READ || zio->io_type == ZIO_TYPE_WRITE);
+ zio->io_target_timestamp = zio_handle_io_delay(zio);
+
+ VERIFY3U(taskq_dispatch(vdev_file_taskq, vdev_file_io_strategy, zio,
+ TQ_SLEEP), !=, 0);
+}
+
+/* ARGSUSED */
+static void
+vdev_file_io_done(zio_t *zio)
+{
+}
+
+vdev_ops_t vdev_file_ops = {
+ vdev_file_open,
+ vdev_file_close,
+ vdev_default_asize,
+ vdev_file_io_start,
+ vdev_file_io_done,
+ NULL,
+ NULL,
+ vdev_file_hold,
+ vdev_file_rele,
+ NULL,
+ vdev_default_xlate,
+ VDEV_TYPE_FILE, /* name of this vdev type */
+ B_TRUE /* leaf vdev */
+};
+
+/*
+ * From userland we access disks just like files.
+ */
+#ifndef _KERNEL
+
+vdev_ops_t vdev_disk_ops = {
+ vdev_file_open,
+ vdev_file_close,
+ vdev_default_asize,
+ vdev_file_io_start,
+ vdev_file_io_done,
+ NULL,
+ NULL,
+ vdev_file_hold,
+ vdev_file_rele,
+ NULL,
+ vdev_default_xlate,
+ VDEV_TYPE_DISK, /* name of this vdev type */
+ B_TRUE /* leaf vdev */
+};
+
+#endif
diff --git a/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/vdev_geom.c b/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/vdev_geom.c
new file mode 100644
index 000000000000..5ff895ce472c
--- /dev/null
+++ b/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/vdev_geom.c
@@ -0,0 +1,1193 @@
+/*
+ * CDDL HEADER START
+ *
+ * The contents of this file are subject to the terms of the
+ * Common Development and Distribution License (the "License").
+ * You may not use this file except in compliance with the License.
+ *
+ * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
+ * or http://www.opensolaris.org/os/licensing.
+ * See the License for the specific language governing permissions
+ * and limitations under the License.
+ *
+ * When distributing Covered Code, include this CDDL HEADER in each
+ * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
+ * If applicable, add the following below this CDDL HEADER, with the
+ * fields enclosed by brackets "[]" replaced with your own identifying
+ * information: Portions Copyright [yyyy] [name of copyright owner]
+ *
+ * CDDL HEADER END
+ */
+/*
+ * Copyright (c) 2006 Pawel Jakub Dawidek <pjd@FreeBSD.org>
+ * All rights reserved.
+ *
+ * Portions Copyright (c) 2012 Martin Matuska <mm@FreeBSD.org>
+ */
+
+#include <sys/zfs_context.h>
+#include <sys/param.h>
+#include <sys/kernel.h>
+#include <sys/bio.h>
+#include <sys/disk.h>
+#include <sys/spa.h>
+#include <sys/spa_impl.h>
+#include <sys/vdev_impl.h>
+#include <sys/fs/zfs.h>
+#include <sys/zio.h>
+#include <geom/geom.h>
+#include <geom/geom_int.h>
+
+/*
+ * Virtual device vector for GEOM.
+ */
+
+static g_attrchanged_t vdev_geom_attrchanged;
+struct g_class zfs_vdev_class = {
+ .name = "ZFS::VDEV",
+ .version = G_VERSION,
+ .attrchanged = vdev_geom_attrchanged,
+};
+
+struct consumer_vdev_elem {
+ SLIST_ENTRY(consumer_vdev_elem) elems;
+ vdev_t *vd;
+};
+
+SLIST_HEAD(consumer_priv_t, consumer_vdev_elem);
+_Static_assert(sizeof(((struct g_consumer*)NULL)->private)
+ == sizeof(struct consumer_priv_t*),
+ "consumer_priv_t* can't be stored in g_consumer.private");
+
+DECLARE_GEOM_CLASS(zfs_vdev_class, zfs_vdev);
+
+SYSCTL_DECL(_vfs_zfs_vdev);
+/* Don't send BIO_FLUSH. */
+static int vdev_geom_bio_flush_disable;
+SYSCTL_INT(_vfs_zfs_vdev, OID_AUTO, bio_flush_disable, CTLFLAG_RWTUN,
+ &vdev_geom_bio_flush_disable, 0, "Disable BIO_FLUSH");
+/* Don't send BIO_DELETE. */
+static int vdev_geom_bio_delete_disable;
+SYSCTL_INT(_vfs_zfs_vdev, OID_AUTO, bio_delete_disable, CTLFLAG_RWTUN,
+ &vdev_geom_bio_delete_disable, 0, "Disable BIO_DELETE");
+
+/* Declare local functions */
+static void vdev_geom_detach(struct g_consumer *cp, boolean_t open_for_read);
+
+/*
+ * Thread local storage used to indicate when a thread is probing geoms
+ * for their guids. If NULL, this thread is not tasting geoms. If non-NULL,
+ * it is looking for a replacement for the vdev_t* that is its value.
+ */
+uint_t zfs_geom_probe_vdev_key;
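+
+/*
+ * Illustrative sketch (an assumption, not code from this file): a zvol
+ * open path can use this key to detect that it is being called from a
+ * vdev taste and refuse to recurse, e.g.:
+ *
+ *	if (tsd_get(zfs_geom_probe_vdev_key) != NULL)
+ *		return (SET_ERROR(EBUSY));
+ */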
+
+static void
+vdev_geom_set_rotation_rate(vdev_t *vd, struct g_consumer *cp)
+{
+ int error;
+ uint16_t rate;
+
+ error = g_getattr("GEOM::rotation_rate", cp, &rate);
+ if (error == 0 && rate == 1)
+ vd->vdev_nonrot = B_TRUE;
+ else
+ vd->vdev_nonrot = B_FALSE;
+}
+
+static void
+vdev_geom_set_physpath(vdev_t *vd, struct g_consumer *cp,
+ boolean_t do_null_update)
+{
+ boolean_t needs_update = B_FALSE;
+ char *physpath;
+ int error, physpath_len;
+
+ physpath_len = MAXPATHLEN;
+ physpath = g_malloc(physpath_len, M_WAITOK|M_ZERO);
+ error = g_io_getattr("GEOM::physpath", cp, &physpath_len, physpath);
+ if (error == 0) {
+ char *old_physpath;
+
+ /* g_topology lock ensures that vdev has not been closed */
+ g_topology_assert();
+ old_physpath = vd->vdev_physpath;
+ vd->vdev_physpath = spa_strdup(physpath);
+
+ if (old_physpath != NULL) {
+ needs_update = (strcmp(old_physpath,
+ vd->vdev_physpath) != 0);
+ spa_strfree(old_physpath);
+ } else
+ needs_update = do_null_update;
+ }
+ g_free(physpath);
+
+ /*
+ * If the physical path changed, update the config.
+ * Only request an update for previously unset physpaths if
+ * requested by the caller.
+ */
+ if (needs_update)
+ spa_async_request(vd->vdev_spa, SPA_ASYNC_CONFIG_UPDATE);
+}
+
+static void
+vdev_geom_attrchanged(struct g_consumer *cp, const char *attr)
+{
+ struct consumer_priv_t *priv;
+ struct consumer_vdev_elem *elem;
+
+ priv = (struct consumer_priv_t*)&cp->private;
+ if (SLIST_EMPTY(priv))
+ return;
+
+ SLIST_FOREACH(elem, priv, elems) {
+ vdev_t *vd = elem->vd;
+ if (strcmp(attr, "GEOM::rotation_rate") == 0) {
+ vdev_geom_set_rotation_rate(vd, cp);
+ return;
+ }
+ if (strcmp(attr, "GEOM::physpath") == 0) {
+ vdev_geom_set_physpath(vd, cp, /*null_update*/B_TRUE);
+ return;
+ }
+ }
+}
+
+static void
+vdev_geom_resize(struct g_consumer *cp)
+{
+ struct consumer_priv_t *priv;
+ struct consumer_vdev_elem *elem;
+ spa_t *spa;
+ vdev_t *vd;
+
+ priv = (struct consumer_priv_t *)&cp->private;
+ if (SLIST_EMPTY(priv))
+ return;
+
+ SLIST_FOREACH(elem, priv, elems) {
+ vd = elem->vd;
+ if (vd->vdev_state != VDEV_STATE_HEALTHY)
+ continue;
+ spa = vd->vdev_spa;
+ if (!spa->spa_autoexpand)
+ continue;
+ vdev_online(spa, vd->vdev_guid, ZFS_ONLINE_EXPAND, NULL);
+ }
+}
+
+static void
+vdev_geom_orphan(struct g_consumer *cp)
+{
+ struct consumer_priv_t *priv;
+ struct consumer_vdev_elem *elem;
+
+ g_topology_assert();
+
+ priv = (struct consumer_priv_t*)&cp->private;
+ if (SLIST_EMPTY(priv))
+ /* Vdev close in progress. Ignore the event. */
+ return;
+
+ /*
+ * Orphan callbacks occur from the GEOM event thread.
+ * Concurrent with this call, new I/O requests may be
+ * working their way through GEOM about to find out
+ * (only once executed by the g_down thread) that we've
+ * been orphaned from our disk provider. These I/Os
+ * must be retired before we can detach our consumer.
+ * This is most easily achieved by acquiring the
+ * SPA ZIO configuration lock as a writer, but doing
+ * so with the GEOM topology lock held would cause
+ * a lock order reversal. Instead, rely on the SPA's
+ * async removal support to invoke a close on this
+ * vdev once it is safe to do so.
+ */
+ SLIST_FOREACH(elem, priv, elems) {
+ vdev_t *vd = elem->vd;
+
+ vd->vdev_remove_wanted = B_TRUE;
+ spa_async_request(vd->vdev_spa, SPA_ASYNC_REMOVE);
+ }
+}
+
+static struct g_consumer *
+vdev_geom_attach(struct g_provider *pp, vdev_t *vd, boolean_t sanity)
+{
+ struct g_geom *gp;
+ struct g_consumer *cp;
+ int error;
+
+ g_topology_assert();
+
+ ZFS_LOG(1, "Attaching to %s.", pp->name);
+
+ if (sanity) {
+ if (pp->sectorsize > VDEV_PAD_SIZE || !ISP2(pp->sectorsize)) {
+ ZFS_LOG(1, "Failing attach of %s. "
+ "Incompatible sectorsize %d\n",
+ pp->name, pp->sectorsize);
+ return (NULL);
+ } else if (pp->mediasize < SPA_MINDEVSIZE) {
+ ZFS_LOG(1, "Failing attach of %s. "
+ "Incompatible mediasize %ju\n",
+ pp->name, pp->mediasize);
+ return (NULL);
+ }
+ }
+
+ /* Do we have geom already? No? Create one. */
+ LIST_FOREACH(gp, &zfs_vdev_class.geom, geom) {
+ if (gp->flags & G_GEOM_WITHER)
+ continue;
+ if (strcmp(gp->name, "zfs::vdev") != 0)
+ continue;
+ break;
+ }
+ if (gp == NULL) {
+ gp = g_new_geomf(&zfs_vdev_class, "zfs::vdev");
+ gp->orphan = vdev_geom_orphan;
+ gp->attrchanged = vdev_geom_attrchanged;
+ gp->resize = vdev_geom_resize;
+ cp = g_new_consumer(gp);
+ error = g_attach(cp, pp);
+ if (error != 0) {
+ ZFS_LOG(1, "%s(%d): g_attach failed: %d\n", __func__,
+ __LINE__, error);
+ vdev_geom_detach(cp, B_FALSE);
+ return (NULL);
+ }
+ error = g_access(cp, 1, 0, 1);
+ if (error != 0) {
+ ZFS_LOG(1, "%s(%d): g_access failed: %d\n", __func__,
+ __LINE__, error);
+ vdev_geom_detach(cp, B_FALSE);
+ return (NULL);
+ }
+ ZFS_LOG(1, "Created geom and consumer for %s.", pp->name);
+ } else {
+ /* Check if we are already connected to this provider. */
+ LIST_FOREACH(cp, &gp->consumer, consumer) {
+ if (cp->provider == pp) {
+ ZFS_LOG(1, "Found consumer for %s.", pp->name);
+ break;
+ }
+ }
+ if (cp == NULL) {
+ cp = g_new_consumer(gp);
+ error = g_attach(cp, pp);
+ if (error != 0) {
+ ZFS_LOG(1, "%s(%d): g_attach failed: %d\n",
+ __func__, __LINE__, error);
+ vdev_geom_detach(cp, B_FALSE);
+ return (NULL);
+ }
+ error = g_access(cp, 1, 0, 1);
+ if (error != 0) {
+ ZFS_LOG(1, "%s(%d): g_access failed: %d\n",
+ __func__, __LINE__, error);
+ vdev_geom_detach(cp, B_FALSE);
+ return (NULL);
+ }
+ ZFS_LOG(1, "Created consumer for %s.", pp->name);
+ } else {
+ error = g_access(cp, 1, 0, 1);
+ if (error != 0) {
+ ZFS_LOG(1, "%s(%d): g_access failed: %d\n",
+ __func__, __LINE__, error);
+ return (NULL);
+ }
+ ZFS_LOG(1, "Used existing consumer for %s.", pp->name);
+ }
+ }
+
+ if (vd != NULL)
+ vd->vdev_tsd = cp;
+
+ cp->flags |= G_CF_DIRECT_SEND | G_CF_DIRECT_RECEIVE;
+ return (cp);
+}
+
+static void
+vdev_geom_detach(struct g_consumer *cp, boolean_t open_for_read)
+{
+ struct g_geom *gp;
+
+ g_topology_assert();
+
+ ZFS_LOG(1, "Detaching from %s.",
+ cp->provider && cp->provider->name ? cp->provider->name : "NULL");
+
+ gp = cp->geom;
+ if (open_for_read)
+ g_access(cp, -1, 0, -1);
+ /* Destroy consumer on last close. */
+ if (cp->acr == 0 && cp->ace == 0) {
+ if (cp->acw > 0)
+ g_access(cp, 0, -cp->acw, 0);
+ if (cp->provider != NULL) {
+ ZFS_LOG(1, "Destroying consumer for %s.",
+ cp->provider->name ? cp->provider->name : "NULL");
+ g_detach(cp);
+ }
+ g_destroy_consumer(cp);
+ }
+ /* Destroy geom if there are no consumers left. */
+ if (LIST_EMPTY(&gp->consumer)) {
+ ZFS_LOG(1, "Destroyed geom %s.", gp->name);
+ g_wither_geom(gp, ENXIO);
+ }
+}
+
+static void
+vdev_geom_close_locked(vdev_t *vd)
+{
+ struct g_consumer *cp;
+ struct consumer_priv_t *priv;
+ struct consumer_vdev_elem *elem, *elem_temp;
+
+ g_topology_assert();
+
+ cp = vd->vdev_tsd;
+ vd->vdev_delayed_close = B_FALSE;
+ if (cp == NULL)
+ return;
+
+ ZFS_LOG(1, "Closing access to %s.", cp->provider->name);
+ KASSERT(cp->private != NULL, ("%s: cp->private is NULL", __func__));
+ priv = (struct consumer_priv_t*)&cp->private;
+ vd->vdev_tsd = NULL;
+ SLIST_FOREACH_SAFE(elem, priv, elems, elem_temp) {
+ if (elem->vd == vd) {
+ SLIST_REMOVE(priv, elem, consumer_vdev_elem, elems);
+ g_free(elem);
+ }
+ }
+
+ vdev_geom_detach(cp, B_TRUE);
+}
+
+/*
+ * Issue one or more bios to the vdev in parallel.
+ * cmds, datas, offsets, errors, and sizes are arrays of length ncmds. Each I/O
+ * operation is described by parallel entries from each array. There may be
+ * more bios actually issued than entries in the array.
+ */
+static void
+vdev_geom_io(struct g_consumer *cp, int *cmds, void **datas, off_t *offsets,
+ off_t *sizes, int *errors, int ncmds)
+{
+ struct bio **bios;
+ u_char *p;
+ off_t off, maxio, s, end;
+ int i, n_bios, j;
+ size_t bios_size;
+
+ maxio = MAXPHYS - (MAXPHYS % cp->provider->sectorsize);
+ n_bios = 0;
+
+ /* How many bios are required for all commands ? */
+ for (i = 0; i < ncmds; i++)
+ n_bios += (sizes[i] + maxio - 1) / maxio;
+
+ /* Allocate memory for the bios */
+ bios_size = n_bios * sizeof(struct bio*);
+ bios = kmem_zalloc(bios_size, KM_SLEEP);
+
+ /* Prepare and issue all of the bios */
+ for (i = j = 0; i < ncmds; i++) {
+ off = offsets[i];
+ p = datas[i];
+ s = sizes[i];
+ end = off + s;
+ ASSERT((off % cp->provider->sectorsize) == 0);
+ ASSERT((s % cp->provider->sectorsize) == 0);
+
+ for (; off < end; off += maxio, p += maxio, s -= maxio, j++) {
+ bios[j] = g_alloc_bio();
+ bios[j]->bio_cmd = cmds[i];
+ bios[j]->bio_done = NULL;
+ bios[j]->bio_offset = off;
+ bios[j]->bio_length = MIN(s, maxio);
+ bios[j]->bio_data = p;
+ g_io_request(bios[j], cp);
+ }
+ }
+ ASSERT(j == n_bios);
+
+ /* Wait for all of the bios to complete, and clean them up */
+ for (i = j = 0; i < ncmds; i++) {
+ off = offsets[i];
+ s = sizes[i];
+ end = off + s;
+
+ for (; off < end; off += maxio, s -= maxio, j++) {
+ errors[i] = biowait(bios[j], "vdev_geom_io") ||
+ errors[i];
+ g_destroy_bio(bios[j]);
+ }
+ }
+ kmem_free(bios, bios_size);
+}
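+
+/*
+ * Hedged usage sketch (illustrative only, not part of this file): a
+ * single synchronous sector-sized read through vdev_geom_io() would
+ * look roughly like this, where buf is a caller-supplied buffer:
+ *
+ *	int cmd = BIO_READ, error = 0;
+ *	void *data = buf;
+ *	off_t offset = 0, size = cp->provider->sectorsize;
+ *
+ *	vdev_geom_io(cp, &cmd, &data, &offset, &size, &error, 1);
+ *
+ * vdev_geom_read_config() below is a real caller; it issues VDEV_LABELS
+ * reads in parallel through the array form.
+ */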
+
+/*
+ * Read the vdev config from a device. Return the number of valid labels that
+ * were found. The vdev config will be returned in *configp if and only if at
+ * least one valid label was found.
+ */
+static int
+vdev_geom_read_config(struct g_consumer *cp, nvlist_t **configp)
+{
+ struct g_provider *pp;
+ nvlist_t *config;
+ vdev_phys_t *vdev_lists[VDEV_LABELS];
+ char *buf;
+ size_t buflen;
+ uint64_t psize, state, txg;
+ off_t offsets[VDEV_LABELS];
+ off_t size;
+ off_t sizes[VDEV_LABELS];
+ int cmds[VDEV_LABELS];
+ int errors[VDEV_LABELS];
+ int l, nlabels;
+
+ g_topology_assert_not();
+
+ pp = cp->provider;
+ ZFS_LOG(1, "Reading config from %s...", pp->name);
+
+ psize = pp->mediasize;
+ psize = P2ALIGN(psize, (uint64_t)sizeof(vdev_label_t));
+
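+ /*
+ * Round the vdev_phys_t read size up to a multiple of the provider's
+ * sector size; GEOM I/O must be issued in sector-aligned units.
+ */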
+ size = sizeof(*vdev_lists[0]) + pp->sectorsize -
+ ((sizeof(*vdev_lists[0]) - 1) % pp->sectorsize) - 1;
+
+ buflen = sizeof(vdev_lists[0]->vp_nvlist);
+
+ /* Create all of the IO requests */
+ for (l = 0; l < VDEV_LABELS; l++) {
+ cmds[l] = BIO_READ;
+ vdev_lists[l] = kmem_alloc(size, KM_SLEEP);
+ offsets[l] = vdev_label_offset(psize, l, 0) + VDEV_SKIP_SIZE;
+ sizes[l] = size;
+ errors[l] = 0;
+ ASSERT(offsets[l] % pp->sectorsize == 0);
+ }
+
+ /* Issue the IO requests */
+ vdev_geom_io(cp, cmds, (void**)vdev_lists, offsets, sizes, errors,
+ VDEV_LABELS);
+
+ /* Parse the labels */
+ config = *configp = NULL;
+ nlabels = 0;
+ for (l = 0; l < VDEV_LABELS; l++) {
+ if (errors[l] != 0)
+ continue;
+
+ buf = vdev_lists[l]->vp_nvlist;
+
+ if (nvlist_unpack(buf, buflen, &config, 0) != 0)
+ continue;
+
+ if (nvlist_lookup_uint64(config, ZPOOL_CONFIG_POOL_STATE,
+ &state) != 0 || state > POOL_STATE_L2CACHE) {
+ nvlist_free(config);
+ continue;
+ }
+
+ if (state != POOL_STATE_SPARE &&
+ state != POOL_STATE_L2CACHE &&
+ (nvlist_lookup_uint64(config, ZPOOL_CONFIG_POOL_TXG,
+ &txg) != 0 || txg == 0)) {
+ nvlist_free(config);
+ continue;
+ }
+
+ if (*configp != NULL)
+ nvlist_free(*configp);
+ *configp = config;
+
+ nlabels++;
+ }
+
+ /* Free the label storage */
+ for (l = 0; l < VDEV_LABELS; l++)
+ kmem_free(vdev_lists[l], size);
+
+ return (nlabels);
+}
+
+static void
+resize_configs(nvlist_t ***configs, uint64_t *count, uint64_t id)
+{
+ nvlist_t **new_configs;
+ uint64_t i;
+
+ if (id < *count)
+ return;
+ new_configs = kmem_zalloc((id + 1) * sizeof(nvlist_t *),
+ KM_SLEEP);
+ for (i = 0; i < *count; i++)
+ new_configs[i] = (*configs)[i];
+ if (*configs != NULL)
+ kmem_free(*configs, *count * sizeof(void *));
+ *configs = new_configs;
+ *count = id + 1;
+}
+
+static void
+process_vdev_config(nvlist_t ***configs, uint64_t *count, nvlist_t *cfg,
+ const char *name, uint64_t* known_pool_guid)
+{
+ nvlist_t *vdev_tree;
+ uint64_t pool_guid;
+ uint64_t vdev_guid, known_guid;
+ uint64_t id, txg, known_txg;
+ char *pname;
+
+ if (nvlist_lookup_string(cfg, ZPOOL_CONFIG_POOL_NAME, &pname) != 0 ||
+ strcmp(pname, name) != 0)
+ goto ignore;
+
+ if (nvlist_lookup_uint64(cfg, ZPOOL_CONFIG_POOL_GUID, &pool_guid) != 0)
+ goto ignore;
+
+ if (nvlist_lookup_uint64(cfg, ZPOOL_CONFIG_TOP_GUID, &vdev_guid) != 0)
+ goto ignore;
+
+ if (nvlist_lookup_nvlist(cfg, ZPOOL_CONFIG_VDEV_TREE, &vdev_tree) != 0)
+ goto ignore;
+
+ if (nvlist_lookup_uint64(vdev_tree, ZPOOL_CONFIG_ID, &id) != 0)
+ goto ignore;
+
+ VERIFY(nvlist_lookup_uint64(cfg, ZPOOL_CONFIG_POOL_TXG, &txg) == 0);
+
+ if (*known_pool_guid != 0) {
+ if (pool_guid != *known_pool_guid)
+ goto ignore;
+ } else
+ *known_pool_guid = pool_guid;
+
+ resize_configs(configs, count, id);
+
+ if ((*configs)[id] != NULL) {
+ VERIFY(nvlist_lookup_uint64((*configs)[id],
+ ZPOOL_CONFIG_POOL_TXG, &known_txg) == 0);
+ if (txg <= known_txg)
+ goto ignore;
+ nvlist_free((*configs)[id]);
+ }
+
+ (*configs)[id] = cfg;
+ return;
+
+ignore:
+ nvlist_free(cfg);
+}
+
+int
+vdev_geom_read_pool_label(const char *name,
+ nvlist_t ***configs, uint64_t *count)
+{
+ struct g_class *mp;
+ struct g_geom *gp;
+ struct g_provider *pp;
+ struct g_consumer *zcp;
+ nvlist_t *vdev_cfg;
+ uint64_t pool_guid;
+ int error, nlabels;
+
+ DROP_GIANT();
+ g_topology_lock();
+
+ *configs = NULL;
+ *count = 0;
+ pool_guid = 0;
+ LIST_FOREACH(mp, &g_classes, class) {
+ if (mp == &zfs_vdev_class)
+ continue;
+ LIST_FOREACH(gp, &mp->geom, geom) {
+ if (gp->flags & G_GEOM_WITHER)
+ continue;
+ LIST_FOREACH(pp, &gp->provider, provider) {
+ if (pp->flags & G_PF_WITHER)
+ continue;
+ zcp = vdev_geom_attach(pp, NULL, B_TRUE);
+ if (zcp == NULL)
+ continue;
+ g_topology_unlock();
+ nlabels = vdev_geom_read_config(zcp, &vdev_cfg);
+ g_topology_lock();
+ vdev_geom_detach(zcp, B_TRUE);
+ if (nlabels == 0)
+ continue;
+ ZFS_LOG(1, "successfully read vdev config");
+
+ process_vdev_config(configs, count,
+ vdev_cfg, name, &pool_guid);
+ }
+ }
+ }
+ g_topology_unlock();
+ PICKUP_GIANT();
+
+ return (*count > 0 ? 0 : ENOENT);
+}
+
+enum match {
+ NO_MATCH = 0, /* No matching labels found */
+ TOPGUID_MATCH = 1, /* Labels match top guid, not vdev guid */
+ ZERO_MATCH = 1, /* Should never be returned */
+ ONE_MATCH = 2, /* 1 label matching the vdev_guid */
+ TWO_MATCH = 3, /* 2 labels matching the vdev_guid */
+ THREE_MATCH = 4, /* 3 labels matching the vdev_guid */
+ FULL_MATCH = 5 /* all labels match the vdev_guid */
+};
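+
+/*
+ * vdev_attach_ok() maps a matching-label count directly onto this enum:
+ * with all VDEV_LABELS (4) labels matching the vdev guid it returns
+ * ZERO_MATCH + 4 == FULL_MATCH, while a single matching label yields
+ * ZERO_MATCH + 1 == ONE_MATCH.
+ */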
+
+static enum match
+vdev_attach_ok(vdev_t *vd, struct g_provider *pp)
+{
+ nvlist_t *config;
+ uint64_t pool_guid, top_guid, vdev_guid;
+ struct g_consumer *cp;
+ int nlabels;
+
+ cp = vdev_geom_attach(pp, NULL, B_TRUE);
+ if (cp == NULL) {
+ ZFS_LOG(1, "Unable to attach tasting instance to %s.",
+ pp->name);
+ return (NO_MATCH);
+ }
+ g_topology_unlock();
+ nlabels = vdev_geom_read_config(cp, &config);
+ g_topology_lock();
+ vdev_geom_detach(cp, B_TRUE);
+ if (nlabels == 0) {
+ ZFS_LOG(1, "Unable to read config from %s.", pp->name);
+ return (NO_MATCH);
+ }
+
+ pool_guid = 0;
+ (void) nvlist_lookup_uint64(config, ZPOOL_CONFIG_POOL_GUID, &pool_guid);
+ top_guid = 0;
+ (void) nvlist_lookup_uint64(config, ZPOOL_CONFIG_TOP_GUID, &top_guid);
+ vdev_guid = 0;
+ (void) nvlist_lookup_uint64(config, ZPOOL_CONFIG_GUID, &vdev_guid);
+ nvlist_free(config);
+
+ /*
+ * Check that the label's pool guid matches the desired guid.
+ * Inactive spares and L2ARCs do not have any pool guid in the label.
+ */
+ if (pool_guid != 0 && pool_guid != spa_guid(vd->vdev_spa)) {
+ ZFS_LOG(1, "pool guid mismatch for provider %s: %ju != %ju.",
+ pp->name,
+ (uintmax_t)spa_guid(vd->vdev_spa), (uintmax_t)pool_guid);
+ return (NO_MATCH);
+ }
+
+ /*
+ * Check that the label's vdev guid matches the desired guid.
+ * The second condition handles possible race on vdev detach, when
+ * remaining vdev receives GUID of destroyed top level mirror vdev.
+ */
+ if (vdev_guid == vd->vdev_guid) {
+ ZFS_LOG(1, "guids match for provider %s.", pp->name);
+ return (ZERO_MATCH + nlabels);
+ } else if (top_guid == vd->vdev_guid && vd == vd->vdev_top) {
+ ZFS_LOG(1, "top vdev guid match for provider %s.", pp->name);
+ return (TOPGUID_MATCH);
+ }
+ ZFS_LOG(1, "vdev guid mismatch for provider %s: %ju != %ju.",
+ pp->name, (uintmax_t)vd->vdev_guid, (uintmax_t)vdev_guid);
+ return (NO_MATCH);
+}
+
+static struct g_consumer *
+vdev_geom_attach_by_guids(vdev_t *vd)
+{
+ struct g_class *mp;
+ struct g_geom *gp;
+ struct g_provider *pp, *best_pp;
+ struct g_consumer *cp;
+ const char *vdpath;
+ enum match match, best_match;
+
+ g_topology_assert();
+
+ vdpath = vd->vdev_path + sizeof("/dev/") - 1;
+ cp = NULL;
+ best_pp = NULL;
+ best_match = NO_MATCH;
+ LIST_FOREACH(mp, &g_classes, class) {
+ if (mp == &zfs_vdev_class)
+ continue;
+ LIST_FOREACH(gp, &mp->geom, geom) {
+ if (gp->flags & G_GEOM_WITHER)
+ continue;
+ LIST_FOREACH(pp, &gp->provider, provider) {
+ match = vdev_attach_ok(vd, pp);
+ if (match > best_match) {
+ best_match = match;
+ best_pp = pp;
+ } else if (match == best_match) {
+ if (strcmp(pp->name, vdpath) == 0) {
+ best_pp = pp;
+ }
+ }
+ if (match == FULL_MATCH)
+ goto out;
+ }
+ }
+ }
+
+out:
+ if (best_pp) {
+ cp = vdev_geom_attach(best_pp, vd, B_TRUE);
+ if (cp == NULL) {
+ printf("ZFS WARNING: Unable to attach to %s.\n",
+ best_pp->name);
+ }
+ }
+ return (cp);
+}
+
+static struct g_consumer *
+vdev_geom_open_by_guids(vdev_t *vd)
+{
+ struct g_consumer *cp;
+ char *buf;
+ size_t len;
+
+ g_topology_assert();
+
+ ZFS_LOG(1, "Searching by guids [%ju:%ju].",
+ (uintmax_t)spa_guid(vd->vdev_spa), (uintmax_t)vd->vdev_guid);
+ cp = vdev_geom_attach_by_guids(vd);
+ if (cp != NULL) {
+ len = strlen(cp->provider->name) + strlen("/dev/") + 1;
+ buf = kmem_alloc(len, KM_SLEEP);
+
+ snprintf(buf, len, "/dev/%s", cp->provider->name);
+ spa_strfree(vd->vdev_path);
+ vd->vdev_path = buf;
+
+ ZFS_LOG(1, "Attach by guid [%ju:%ju] succeeded, provider %s.",
+ (uintmax_t)spa_guid(vd->vdev_spa),
+ (uintmax_t)vd->vdev_guid, cp->provider->name);
+ } else {
+ ZFS_LOG(1, "Search by guid [%ju:%ju] failed.",
+ (uintmax_t)spa_guid(vd->vdev_spa),
+ (uintmax_t)vd->vdev_guid);
+ }
+
+ return (cp);
+}
+
+static struct g_consumer *
+vdev_geom_open_by_path(vdev_t *vd, int check_guid)
+{
+ struct g_provider *pp;
+ struct g_consumer *cp;
+
+ g_topology_assert();
+
+ cp = NULL;
+ pp = g_provider_by_name(vd->vdev_path + sizeof("/dev/") - 1);
+ if (pp != NULL) {
+ ZFS_LOG(1, "Found provider by name %s.", vd->vdev_path);
+ if (!check_guid || vdev_attach_ok(vd, pp) == FULL_MATCH)
+ cp = vdev_geom_attach(pp, vd, B_FALSE);
+ }
+
+ return (cp);
+}
+
+static int
+vdev_geom_open(vdev_t *vd, uint64_t *psize, uint64_t *max_psize,
+ uint64_t *logical_ashift, uint64_t *physical_ashift)
+{
+ struct g_provider *pp;
+ struct g_consumer *cp;
+ size_t bufsize;
+ int error;
+
+ /* Set the TLS to indicate downstack that we should not access zvols */
+ VERIFY(tsd_set(zfs_geom_probe_vdev_key, vd) == 0);
+
+ /*
+ * We must have a pathname, and it must be absolute.
+ */
+ if (vd->vdev_path == NULL || strncmp(vd->vdev_path, "/dev/", 5) != 0) {
+ vd->vdev_stat.vs_aux = VDEV_AUX_BAD_LABEL;
+ return (EINVAL);
+ }
+
+ /*
+ * Reopen the device if it's not currently open. Otherwise,
+ * just update the physical size of the device.
+ */
+ if ((cp = vd->vdev_tsd) != NULL) {
+ ASSERT(vd->vdev_reopening);
+ goto skip_open;
+ }
+
+ DROP_GIANT();
+ g_topology_lock();
+ error = 0;
+
+ if (vd->vdev_spa->spa_splitting_newspa ||
+ (vd->vdev_prevstate == VDEV_STATE_UNKNOWN &&
+ vd->vdev_spa->spa_load_state == SPA_LOAD_NONE ||
+ vd->vdev_spa->spa_load_state == SPA_LOAD_CREATE)) {
+ /*
+ * We are dealing with a vdev that hasn't been previously
+ * opened (since boot), and we are not loading an
+ * existing pool configuration. This looks like a
+ * vdev add operation to a new or existing pool.
+ * Assume the user knows what he/she is doing and find the
+ * GEOM provider by its name, ignoring GUID mismatches.
+ *
+ * XXPOLICY: It would be safer to only allow a device
+ * that is unlabeled or labeled but missing
+ * GUID information to be opened in this fashion,
+ * unless we are doing a split, in which case we
+ * should allow any guid.
+ */
+ cp = vdev_geom_open_by_path(vd, 0);
+ } else {
+ /*
+ * Try using the recorded path for this device, but only
+ * accept it if its label data contains the expected GUIDs.
+ */
+ cp = vdev_geom_open_by_path(vd, 1);
+ if (cp == NULL) {
+ /*
+ * The device at vd->vdev_path doesn't have the
+ * expected GUIDs. The disks might have merely
+ * moved around so try all other GEOM providers
+ * to find one with the right GUIDs.
+ */
+ cp = vdev_geom_open_by_guids(vd);
+ }
+ }
+
+ /* Clear the TLS now that tasting is done */
+ VERIFY(tsd_set(zfs_geom_probe_vdev_key, NULL) == 0);
+
+ if (cp == NULL) {
+ ZFS_LOG(1, "Vdev %s not found.", vd->vdev_path);
+ error = ENOENT;
+ } else {
+ struct consumer_priv_t *priv;
+ struct consumer_vdev_elem *elem;
+ int spamode;
+
+ priv = (struct consumer_priv_t*)&cp->private;
+ if (cp->private == NULL)
+ SLIST_INIT(priv);
+ elem = g_malloc(sizeof(*elem), M_WAITOK|M_ZERO);
+ elem->vd = vd;
+ SLIST_INSERT_HEAD(priv, elem, elems);
+
+ spamode = spa_mode(vd->vdev_spa);
+ if (cp->provider->sectorsize > VDEV_PAD_SIZE ||
+ !ISP2(cp->provider->sectorsize)) {
+ ZFS_LOG(1, "Provider %s has unsupported sectorsize.",
+ cp->provider->name);
+
+ vdev_geom_close_locked(vd);
+ error = EINVAL;
+ cp = NULL;
+ } else if (cp->acw == 0 && (spamode & FWRITE) != 0) {
+ int i;
+
+ for (i = 0; i < 5; i++) {
+ error = g_access(cp, 0, 1, 0);
+ if (error == 0)
+ break;
+ g_topology_unlock();
+ tsleep(vd, 0, "vdev", hz / 2);
+ g_topology_lock();
+ }
+ if (error != 0) {
+ printf("ZFS WARNING: Unable to open %s for writing (error=%d).\n",
+ cp->provider->name, error);
+ vdev_geom_close_locked(vd);
+ cp = NULL;
+ }
+ }
+ }
+
+ /* Fetch initial physical path information for this device. */
+ if (cp != NULL) {
+ vdev_geom_attrchanged(cp, "GEOM::physpath");
+
+ /* Set other GEOM characteristics */
+ vdev_geom_set_physpath(vd, cp, /*do_null_update*/B_FALSE);
+ vdev_geom_set_rotation_rate(vd, cp);
+ }
+
+ g_topology_unlock();
+ PICKUP_GIANT();
+ if (cp == NULL) {
+ vd->vdev_stat.vs_aux = VDEV_AUX_OPEN_FAILED;
+ vdev_dbgmsg(vd, "vdev_geom_open: failed to open [error=%d]",
+ error);
+ return (error);
+ }
+skip_open:
+ pp = cp->provider;
+
+ /*
+ * Determine the actual size of the device.
+ */
+ *max_psize = *psize = pp->mediasize;
+
+ /*
+ * Determine the device's minimum transfer size and preferred
+ * transfer size.
+ */
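+ /*
+ * For example, a provider with 512-byte sectors yields a logical
+ * ashift of 9; if it also reports a 4KB stripesize with stripeoffset
+ * 0, the physical ashift becomes 12.
+ */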
+ *logical_ashift = highbit(MAX(pp->sectorsize, SPA_MINBLOCKSIZE)) - 1;
+ *physical_ashift = 0;
+ if (pp->stripesize > (1 << *logical_ashift) && ISP2(pp->stripesize) &&
+ pp->stripesize <= (1 << SPA_MAXASHIFT) && pp->stripeoffset == 0)
+ *physical_ashift = highbit(pp->stripesize) - 1;
+
+ /*
+ * Clear the nowritecache settings, so that on a vdev_reopen()
+ * we will try again.
+ */
+ vd->vdev_nowritecache = B_FALSE;
+
+ return (0);
+}
+
+static void
+vdev_geom_close(vdev_t *vd)
+{
+ struct g_consumer *cp;
+ int locked;
+
+ cp = vd->vdev_tsd;
+
+ DROP_GIANT();
+ locked = g_topology_locked();
+ if (!locked)
+ g_topology_lock();
+
+ if (!vd->vdev_reopening ||
+ (cp != NULL && ((cp->flags & G_CF_ORPHAN) != 0 ||
+ (cp->provider != NULL && cp->provider->error != 0))))
+ vdev_geom_close_locked(vd);
+
+ if (!locked)
+ g_topology_unlock();
+ PICKUP_GIANT();
+}
+
+static void
+vdev_geom_io_intr(struct bio *bp)
+{
+ vdev_t *vd;
+ zio_t *zio;
+
+ zio = bp->bio_caller1;
+ vd = zio->io_vd;
+ zio->io_error = bp->bio_error;
+ if (zio->io_error == 0 && bp->bio_resid != 0)
+ zio->io_error = SET_ERROR(EIO);
+
+ switch (zio->io_error) {
+ case ENOTSUP:
+ /*
+ * If we get ENOTSUP for BIO_FLUSH or BIO_DELETE we know
+ * that future attempts will never succeed. In this case
+ * we set a persistent flag so that we don't bother with
+ * requests in the future.
+ */
+ switch (bp->bio_cmd) {
+ case BIO_FLUSH:
+ vd->vdev_nowritecache = B_TRUE;
+ break;
+ case BIO_DELETE:
+ vd->vdev_notrim = B_TRUE;
+ break;
+ }
+ break;
+ case ENXIO:
+ if (!vd->vdev_remove_wanted) {
+ /*
+ * If provider's error is set we assume it is being
+ * removed.
+ */
+ if (bp->bio_to->error != 0) {
+ vd->vdev_remove_wanted = B_TRUE;
+ spa_async_request(zio->io_spa,
+ SPA_ASYNC_REMOVE);
+ } else if (!vd->vdev_delayed_close) {
+ vd->vdev_delayed_close = B_TRUE;
+ }
+ }
+ break;
+ }
+
+ /*
+ * We have to split bio freeing into two parts, because the ABD code
+ * cannot be called in this context and vdev_op_io_done is not called
+ * for ZIO_TYPE_IOCTL zio-s.
+ */
+ if (zio->io_type != ZIO_TYPE_READ && zio->io_type != ZIO_TYPE_WRITE) {
+ g_destroy_bio(bp);
+ zio->io_bio = NULL;
+ }
+ zio_delay_interrupt(zio);
+}
+
+static void
+vdev_geom_io_start(zio_t *zio)
+{
+ vdev_t *vd;
+ struct g_consumer *cp;
+ struct bio *bp;
+ int error;
+
+ vd = zio->io_vd;
+
+ switch (zio->io_type) {
+ case ZIO_TYPE_IOCTL:
+ /* XXPOLICY */
+ if (!vdev_readable(vd)) {
+ zio->io_error = SET_ERROR(ENXIO);
+ zio_interrupt(zio);
+ return;
+ } else {
+ switch (zio->io_cmd) {
+ case DKIOCFLUSHWRITECACHE:
+ if (zfs_nocacheflush || vdev_geom_bio_flush_disable)
+ break;
+ if (vd->vdev_nowritecache) {
+ zio->io_error = SET_ERROR(ENOTSUP);
+ break;
+ }
+ goto sendreq;
+ default:
+ zio->io_error = SET_ERROR(ENOTSUP);
+ }
+ }
+
+ zio_execute(zio);
+ return;
+ case ZIO_TYPE_FREE:
+ if (vd->vdev_notrim) {
+ zio->io_error = SET_ERROR(ENOTSUP);
+ } else if (!vdev_geom_bio_delete_disable) {
+ goto sendreq;
+ }
+ zio_execute(zio);
+ return;
+ }
+sendreq:
+ ASSERT(zio->io_type == ZIO_TYPE_READ ||
+ zio->io_type == ZIO_TYPE_WRITE ||
+ zio->io_type == ZIO_TYPE_FREE ||
+ zio->io_type == ZIO_TYPE_IOCTL);
+
+ cp = vd->vdev_tsd;
+ if (cp == NULL) {
+ zio->io_error = SET_ERROR(ENXIO);
+ zio_interrupt(zio);
+ return;
+ }
+ bp = g_alloc_bio();
+ bp->bio_caller1 = zio;
+ switch (zio->io_type) {
+ case ZIO_TYPE_READ:
+ case ZIO_TYPE_WRITE:
+ zio->io_target_timestamp = zio_handle_io_delay(zio);
+ bp->bio_offset = zio->io_offset;
+ bp->bio_length = zio->io_size;
+ if (zio->io_type == ZIO_TYPE_READ) {
+ bp->bio_cmd = BIO_READ;
+ bp->bio_data =
+ abd_borrow_buf(zio->io_abd, zio->io_size);
+ } else {
+ bp->bio_cmd = BIO_WRITE;
+ bp->bio_data =
+ abd_borrow_buf_copy(zio->io_abd, zio->io_size);
+ }
+ break;
+ case ZIO_TYPE_FREE:
+ bp->bio_cmd = BIO_DELETE;
+ bp->bio_data = NULL;
+ bp->bio_offset = zio->io_offset;
+ bp->bio_length = zio->io_size;
+ break;
+ case ZIO_TYPE_IOCTL:
+ bp->bio_cmd = BIO_FLUSH;
+ bp->bio_data = NULL;
+ bp->bio_offset = cp->provider->mediasize;
+ bp->bio_length = 0;
+ break;
+ }
+ bp->bio_done = vdev_geom_io_intr;
+ zio->io_bio = bp;
+
+ g_io_request(bp, cp);
+}
+
+static void
+vdev_geom_io_done(zio_t *zio)
+{
+ struct bio *bp = zio->io_bio;
+
+ if (zio->io_type != ZIO_TYPE_READ && zio->io_type != ZIO_TYPE_WRITE) {
+ ASSERT(bp == NULL);
+ return;
+ }
+
+ if (bp == NULL) {
+ ASSERT3S(zio->io_error, ==, ENXIO);
+ return;
+ }
+
+ if (zio->io_type == ZIO_TYPE_READ)
+ abd_return_buf_copy(zio->io_abd, bp->bio_data, zio->io_size);
+ else
+ abd_return_buf(zio->io_abd, bp->bio_data, zio->io_size);
+
+ g_destroy_bio(bp);
+ zio->io_bio = NULL;
+}
+
+static void
+vdev_geom_hold(vdev_t *vd)
+{
+}
+
+static void
+vdev_geom_rele(vdev_t *vd)
+{
+}
+
+vdev_ops_t vdev_geom_ops = {
+ vdev_geom_open,
+ vdev_geom_close,
+ vdev_default_asize,
+ vdev_geom_io_start,
+ vdev_geom_io_done,
+ NULL,
+ NULL,
+ vdev_geom_hold,
+ vdev_geom_rele,
+ NULL,
+ vdev_default_xlate,
+ VDEV_TYPE_DISK, /* name of this vdev type */
+ B_TRUE /* leaf vdev */
+};
diff --git a/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/vdev_indirect.c b/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/vdev_indirect.c
new file mode 100644
index 000000000000..469150a4b72f
--- /dev/null
+++ b/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/vdev_indirect.c
@@ -0,0 +1,1849 @@
+/*
+ * CDDL HEADER START
+ *
+ * This file and its contents are supplied under the terms of the
+ * Common Development and Distribution License ("CDDL"), version 1.0.
+ * You may only use this file in accordance with the terms of version
+ * 1.0 of the CDDL.
+ *
+ * A full copy of the text of the CDDL should have accompanied this
+ * source. A copy of the CDDL is also available via the Internet at
+ * http://www.illumos.org/license/CDDL.
+ *
+ * CDDL HEADER END
+ */
+
+/*
+ * Copyright (c) 2014, 2017 by Delphix. All rights reserved.
+ */
+
+#include <sys/zfs_context.h>
+#include <sys/spa.h>
+#include <sys/spa_impl.h>
+#include <sys/vdev_impl.h>
+#include <sys/fs/zfs.h>
+#include <sys/zio.h>
+#include <sys/zio_checksum.h>
+#include <sys/metaslab.h>
+#include <sys/refcount.h>
+#include <sys/dmu.h>
+#include <sys/vdev_indirect_mapping.h>
+#include <sys/dmu_tx.h>
+#include <sys/dsl_synctask.h>
+#include <sys/zap.h>
+#include <sys/abd.h>
+#include <sys/zthr.h>
+
+/*
+ * An indirect vdev corresponds to a vdev that has been removed. Since
+ * we cannot rewrite block pointers of snapshots, etc., we keep a
+ * mapping from old location on the removed device to the new location
+ * on another device in the pool and use this mapping whenever we need
+ * to access the DVA. Unfortunately, this mapping did not respect
+ * logical block boundaries when it was first created, and so a DVA on
+ * this indirect vdev may be "split" into multiple sections that each
+ * map to a different location. As a consequence, not all DVAs can be
+ * translated to an equivalent new DVA. Instead we must provide a
+ * "vdev_remap" operation that executes a callback on each contiguous
+ * segment of the new location. This function is used in multiple ways:
+ *
+ * - i/os to this vdev use the callback to determine where the
+ * data is now located, and issue child i/os for each segment's new
+ * location.
+ *
+ * - frees and claims to this vdev use the callback to free or claim
+ * each mapped segment. (Note that we don't actually need to claim
+ * log blocks on indirect vdevs, because we don't allocate to
+ * removing vdevs. However, zdb uses zio_claim() for its leak
+ * detection.)
+ */
+
+/*
+ * "Big theory statement" for how we mark blocks obsolete.
+ *
+ * When a block on an indirect vdev is freed or remapped, a section of
+ * that vdev's mapping may no longer be referenced (aka "obsolete"). We
+ * keep track of how much of each mapping entry is obsolete. When
+ * an entry becomes completely obsolete, we can remove it, thus reducing
+ * the memory used by the mapping. The complete picture of obsolescence
+ * is given by the following data structures, described below:
+ * - the entry-specific obsolete count
+ * - the vdev-specific obsolete spacemap
+ * - the pool-specific obsolete bpobj
+ *
+ * == On disk data structures used ==
+ *
+ * We track the obsolete space for the pool using several objects. Each
+ * of these objects is created on demand and freed when no longer
+ * needed, and is assumed to be empty if it does not exist.
+ * SPA_FEATURE_OBSOLETE_COUNTS includes the count of these objects.
+ *
+ * - Each vic_mapping_object (associated with an indirect vdev) can
+ * have a vimp_counts_object. This is an array of uint32_t's
+ * with the same number of entries as the vic_mapping_object. When
+ * the mapping is condensed, entries from the vic_obsolete_sm_object
+ * (see below) are folded into the counts. Therefore, each
+ * obsolete_counts entry tells us the number of bytes in the
+ * corresponding mapping entry that were not referenced when the
+ * mapping was last condensed.
+ *
+ * - Each indirect or removing vdev can have a vic_obsolete_sm_object.
+ * This is a space map containing an alloc entry for every DVA that
+ * has been obsoleted since the last time this indirect vdev was
+ * condensed. We use this object in order to improve performance
+ * when marking a DVA as obsolete. Instead of modifying an arbitrary
+ * offset of the vimp_counts_object, we only need to append an entry
+ * to the end of this object. When a DVA becomes obsolete, it is
+ * added to the obsolete space map. This happens when the DVA is
+ * freed, remapped and not referenced by a snapshot, or the last
+ * snapshot referencing it is destroyed.
+ *
+ * - Each dataset can have a ds_remap_deadlist object. This is a
+ * deadlist object containing all blocks that were remapped in this
+ * dataset but referenced in a previous snapshot. Blocks can *only*
+ * appear on this list if they were remapped (dsl_dataset_block_remapped);
+ * blocks that were killed in a head dataset are put on the normal
+ * ds_deadlist and marked obsolete when they are freed.
+ *
+ * - The pool can have a dp_obsolete_bpobj. This is a list of blocks
+ * in the pool that need to be marked obsolete. When a snapshot is
+ * destroyed, we move some of the ds_remap_deadlist to the obsolete
+ * bpobj (see dsl_destroy_snapshot_handle_remaps()). We then
+ * asynchronously process the obsolete bpobj, moving its entries to
+ * the specific vdevs' obsolete space maps.
+ *
+ * == Summary of how we mark blocks as obsolete ==
+ *
+ * - When freeing a block: if any DVA is on an indirect vdev, append to
+ * vic_obsolete_sm_object.
+ * - When remapping a block, add dva to ds_remap_deadlist (if prev snap
+ * references; otherwise append to vic_obsolete_sm_object).
+ * - When freeing a snapshot: move parts of ds_remap_deadlist to
+ * dp_obsolete_bpobj (same algorithm as ds_deadlist).
+ * - When syncing the spa: process dp_obsolete_bpobj, moving ranges to
+ * individual vdev's vic_obsolete_sm_object.
+ */
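+
+/*
+ * As a concrete sketch (illustrative only; vdev_indirect_mark_obsolete()
+ * below is the real entry point), freeing a block whose DVA lives on an
+ * indirect vdev reduces to appending a range to the vdev's obsolete
+ * tracking:
+ *
+ *	if (vd->vdev_ops == &vdev_indirect_ops)
+ *		vdev_indirect_mark_obsolete(vd, DVA_GET_OFFSET(dva),
+ *		    DVA_GET_ASIZE(dva));
+ *
+ * The range accumulates in vd->vdev_obsolete_segments and is written out
+ * to the vic_obsolete_sm_object space map in syncing context.
+ */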
+
+/*
+ * "Big theory statement" for how we condense indirect vdevs.
+ *
+ * Condensing an indirect vdev's mapping is the process of determining
+ * the precise counts of obsolete space for each mapping entry (by
+ * integrating the obsolete spacemap into the obsolete counts) and
+ * writing out a new mapping that contains only referenced entries.
+ *
+ * We condense a vdev when we expect the mapping to shrink (see
+ * vdev_indirect_should_condense()), but only perform one condense at a
+ * time to limit the memory usage. In addition, we use a separate
+ * open-context thread (spa_condense_indirect_thread) to incrementally
+ * create the new mapping object in a way that minimizes the impact on
+ * the rest of the system.
+ *
+ * == Generating a new mapping ==
+ *
+ * To generate a new mapping, we follow these steps:
+ *
+ * 1. Save the old obsolete space map and create a new mapping object
+ * (see spa_condense_indirect_start_sync()). This initializes the
+ * spa_condensing_indirect_phys with the "previous obsolete space map",
+ * which is now read only. Newly obsolete DVAs will be added to a
+ * new (initially empty) obsolete space map, and will not be
+ * considered as part of this condense operation.
+ *
+ * 2. Construct in memory the precise counts of obsolete space for each
+ * mapping entry, by incorporating the obsolete space map into the
+ * counts. (See vdev_indirect_mapping_load_obsolete_{counts,spacemap}().)
+ *
+ * 3. Iterate through each mapping entry, writing to the new mapping any
+ * entries that are not completely obsolete (i.e. which don't have
+ * obsolete count == mapping length). (See
+ * spa_condense_indirect_generate_new_mapping().)
+ *
+ * 4. Destroy the old mapping object and switch over to the new one
+ * (spa_condense_indirect_complete_sync).
+ *
+ * == Restarting from failure ==
+ *
+ * To restart the condense when we import/open the pool, we must start
+ * at the 2nd step above: reconstruct the precise counts in memory,
+ * based on the space map + counts. Then in the 3rd step, we start
+ * iterating where we left off: at vimp_max_offset of the new mapping
+ * object.
+ */
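+
+/*
+ * A compressed view of the steps above (sketch only; the real work is
+ * done by spa_condense_indirect_start_sync(),
+ * spa_condense_indirect_thread(), and
+ * spa_condense_indirect_complete_sync() below):
+ *
+ *	spa_condense_indirect_start_sync(vd, tx);	step 1, syncing ctx
+ *	load obsolete counts + prev spacemap;		step 2, open ctx
+ *	spa_condense_indirect_generate_new_mapping();	step 3, open ctx
+ *	spa_condense_indirect_complete_sync(sci, tx);	step 4, syncing ctx
+ */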
+
+boolean_t zfs_condense_indirect_vdevs_enable = B_TRUE;
+
+/*
+ * Condense if at least this percent of the bytes in the mapping is
+ * obsolete. With the default of 25%, the amount of space mapped
+ * will be reduced to 1% of its original size after at most 16
+ * condenses. Higher values will condense less often (causing less
+ * i/o); lower values will reduce the mapping size more quickly.
+ */
+int zfs_indirect_condense_obsolete_pct = 25;
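+/*
+ * (Each condense removes at least 25% of the remaining mapped bytes, and
+ * 0.75^16 is roughly 0.01, which is where the 1% figure above comes
+ * from.)
+ */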
+
+/*
+ * Condense if the obsolete space map takes up more than this amount of
+ * space on disk (logically). This limits the amount of disk space
+ * consumed by the obsolete space map; the default of 1GB is small enough
+ * that we typically don't mind "wasting" it.
+ */
+uint64_t zfs_condense_max_obsolete_bytes = 1024 * 1024 * 1024;
+
+/*
+ * Don't bother condensing if the mapping uses less than this amount of
+ * memory. The default of 128KB is considered a "trivial" amount of
+ * memory and not worth reducing.
+ */
+uint64_t zfs_condense_min_mapping_bytes = 128 * 1024;
+
+/*
+ * This is used by the test suite so that it can ensure that certain
+ * actions happen while in the middle of a condense (which might otherwise
+ * complete too quickly). If used to reduce the performance impact of
+ * condensing in production, a maximum value of 1 should be sufficient.
+ */
+int zfs_condense_indirect_commit_entry_delay_ticks = 0;
+
+/*
+ * If an indirect split block contains more than this many possible unique
+ * combinations when being reconstructed, consider it too computationally
+ * expensive to check them all. Instead, try at most this many
+ * randomly-selected combinations each time the block is accessed. This
+ * allows all segment copies to participate fairly in the reconstruction
+ * when all combinations cannot be checked and prevents repeated use of
+ * one bad copy.
+ */
+int zfs_reconstruct_indirect_combinations_max = 256;
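+
+/*
+ * For example, a block split into three segments, each with two unique
+ * copies, has 2 * 2 * 2 = 8 candidate combinations; reconstruction tries
+ * candidates until one passes the checksum, capping the attempts at the
+ * limit above.
+ */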
+
+/*
+ * Enable to simulate damaged segments and validate reconstruction.
+ * Used by ztest
+ */
+unsigned long zfs_reconstruct_indirect_damage_fraction = 0;
+
+/*
+ * The indirect_child_t represents the vdev that we will read from, when we
+ * need to read all copies of the data (e.g. for scrub or reconstruction).
+ * For plain (non-mirror) top-level vdevs (i.e. is_vdev is not a mirror),
+ * ic_vdev is the same as is_vdev. However, for mirror top-level vdevs,
+ * ic_vdev is a child of the mirror.
+ */
+typedef struct indirect_child {
+ abd_t *ic_data;
+ vdev_t *ic_vdev;
+
+ /*
+ * ic_duplicate is NULL when the ic_data contents are unique, when it
+ * is determined to be a duplicate it references the primary child.
+ */
+ struct indirect_child *ic_duplicate;
+ list_node_t ic_node; /* node on is_unique_child */
+} indirect_child_t;
+
+/*
+ * The indirect_split_t represents one mapped segment of an i/o to the
+ * indirect vdev. For non-split (contiguously-mapped) blocks, there will be
+ * only one indirect_split_t, with is_split_offset==0 and is_size==io_size.
+ * For split blocks, there will be several of these.
+ */
+typedef struct indirect_split {
+ list_node_t is_node; /* link on iv_splits */
+
+ /*
+ * is_split_offset is the offset into the i/o.
+ * This is the sum of the previous splits' is_size's.
+ */
+ uint64_t is_split_offset;
+
+ vdev_t *is_vdev; /* top-level vdev */
+ uint64_t is_target_offset; /* offset on is_vdev */
+ uint64_t is_size;
+ int is_children; /* number of entries in is_child[] */
+ int is_unique_children; /* number of entries in is_unique_child */
+ list_t is_unique_child;
+
+ /*
+ * is_good_child is the child that we are currently using to
+ * attempt reconstruction.
+ */
+ indirect_child_t *is_good_child;
+
+ indirect_child_t is_child[1]; /* variable-length */
+} indirect_split_t;
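+
+/*
+ * For instance, a 64KB read whose first 16KB was remapped to one vdev
+ * and whose remaining 48KB was remapped to another produces two
+ * indirect_split_t's: {is_split_offset = 0, is_size = 16KB} and
+ * {is_split_offset = 16KB, is_size = 48KB}.
+ */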
+
+/*
+ * The indirect_vsd_t is associated with each i/o to the indirect vdev.
+ * It is the "Vdev-Specific Data" in the zio_t's io_vsd.
+ */
+typedef struct indirect_vsd {
+ boolean_t iv_split_block;
+ boolean_t iv_reconstruct;
+ uint64_t iv_unique_combinations;
+ uint64_t iv_attempts;
+ uint64_t iv_attempts_max;
+
+ list_t iv_splits; /* list of indirect_split_t's */
+} indirect_vsd_t;
+
+static void
+vdev_indirect_map_free(zio_t *zio)
+{
+ indirect_vsd_t *iv = zio->io_vsd;
+
+ indirect_split_t *is;
+ while ((is = list_head(&iv->iv_splits)) != NULL) {
+ for (int c = 0; c < is->is_children; c++) {
+ indirect_child_t *ic = &is->is_child[c];
+ if (ic->ic_data != NULL)
+ abd_free(ic->ic_data);
+ }
+ list_remove(&iv->iv_splits, is);
+
+ indirect_child_t *ic;
+ while ((ic = list_head(&is->is_unique_child)) != NULL)
+ list_remove(&is->is_unique_child, ic);
+
+ list_destroy(&is->is_unique_child);
+
+ kmem_free(is,
+ offsetof(indirect_split_t, is_child[is->is_children]));
+ }
+ kmem_free(iv, sizeof (*iv));
+}
+
+static const zio_vsd_ops_t vdev_indirect_vsd_ops = {
+ vdev_indirect_map_free,
+ zio_vsd_default_cksum_report
+};
+
+/*
+ * Mark the given offset and size as being obsolete.
+ */
+void
+vdev_indirect_mark_obsolete(vdev_t *vd, uint64_t offset, uint64_t size)
+{
+ spa_t *spa = vd->vdev_spa;
+
+ ASSERT3U(vd->vdev_indirect_config.vic_mapping_object, !=, 0);
+ ASSERT(vd->vdev_removing || vd->vdev_ops == &vdev_indirect_ops);
+ ASSERT(size > 0);
+ VERIFY(vdev_indirect_mapping_entry_for_offset(
+ vd->vdev_indirect_mapping, offset) != NULL);
+
+ if (spa_feature_is_enabled(spa, SPA_FEATURE_OBSOLETE_COUNTS)) {
+ mutex_enter(&vd->vdev_obsolete_lock);
+ range_tree_add(vd->vdev_obsolete_segments, offset, size);
+ mutex_exit(&vd->vdev_obsolete_lock);
+ vdev_dirty(vd, 0, NULL, spa_syncing_txg(spa));
+ }
+}
+
+/*
+ * Mark the DVA vdev_id:offset:size as being obsolete in the given tx. This
+ * wrapper is provided because the DMU does not know about vdev_t's and
+ * cannot directly call vdev_indirect_mark_obsolete.
+ */
+void
+spa_vdev_indirect_mark_obsolete(spa_t *spa, uint64_t vdev_id, uint64_t offset,
+ uint64_t size, dmu_tx_t *tx)
+{
+ vdev_t *vd = vdev_lookup_top(spa, vdev_id);
+ ASSERT(dmu_tx_is_syncing(tx));
+
+ /* The DMU can only remap indirect vdevs. */
+ ASSERT3P(vd->vdev_ops, ==, &vdev_indirect_ops);
+ vdev_indirect_mark_obsolete(vd, offset, size);
+}
+
+static spa_condensing_indirect_t *
+spa_condensing_indirect_create(spa_t *spa)
+{
+ spa_condensing_indirect_phys_t *scip =
+ &spa->spa_condensing_indirect_phys;
+ spa_condensing_indirect_t *sci = kmem_zalloc(sizeof (*sci), KM_SLEEP);
+ objset_t *mos = spa->spa_meta_objset;
+
+ for (int i = 0; i < TXG_SIZE; i++) {
+ list_create(&sci->sci_new_mapping_entries[i],
+ sizeof (vdev_indirect_mapping_entry_t),
+ offsetof(vdev_indirect_mapping_entry_t, vime_node));
+ }
+
+ sci->sci_new_mapping =
+ vdev_indirect_mapping_open(mos, scip->scip_next_mapping_object);
+
+ return (sci);
+}
+
+static void
+spa_condensing_indirect_destroy(spa_condensing_indirect_t *sci)
+{
+ for (int i = 0; i < TXG_SIZE; i++)
+ list_destroy(&sci->sci_new_mapping_entries[i]);
+
+ if (sci->sci_new_mapping != NULL)
+ vdev_indirect_mapping_close(sci->sci_new_mapping);
+
+ kmem_free(sci, sizeof (*sci));
+}
+
+boolean_t
+vdev_indirect_should_condense(vdev_t *vd)
+{
+ vdev_indirect_mapping_t *vim = vd->vdev_indirect_mapping;
+ spa_t *spa = vd->vdev_spa;
+
+ ASSERT(dsl_pool_sync_context(spa->spa_dsl_pool));
+
+ if (!zfs_condense_indirect_vdevs_enable)
+ return (B_FALSE);
+
+ /*
+ * We can only condense one indirect vdev at a time.
+ */
+ if (spa->spa_condensing_indirect != NULL)
+ return (B_FALSE);
+
+ if (spa_shutting_down(spa))
+ return (B_FALSE);
+
+ /*
+ * The mapping object size must not change while we are
+ * condensing, so we can only condense indirect vdevs
+ * (not vdevs that are still in the middle of being removed).
+ */
+ if (vd->vdev_ops != &vdev_indirect_ops)
+ return (B_FALSE);
+
+ /*
+ * If nothing new has been marked obsolete, there is no
+ * point in condensing.
+ */
+ if (vd->vdev_obsolete_sm == NULL) {
+ ASSERT0(vdev_obsolete_sm_object(vd));
+ return (B_FALSE);
+ }
+
+ ASSERT(vd->vdev_obsolete_sm != NULL);
+
+ ASSERT3U(vdev_obsolete_sm_object(vd), ==,
+ space_map_object(vd->vdev_obsolete_sm));
+
+ uint64_t bytes_mapped = vdev_indirect_mapping_bytes_mapped(vim);
+ uint64_t bytes_obsolete = space_map_allocated(vd->vdev_obsolete_sm);
+ uint64_t mapping_size = vdev_indirect_mapping_size(vim);
+ uint64_t obsolete_sm_size = space_map_length(vd->vdev_obsolete_sm);
+
+ ASSERT3U(bytes_obsolete, <=, bytes_mapped);
+
+ /*
+ * If a high percentage of the bytes that are mapped have become
+ * obsolete, condense (unless the mapping is already small enough).
+ * This has a good chance of reducing the amount of memory used
+ * by the mapping.
+ */
+ if (bytes_obsolete * 100 / bytes_mapped >=
+ zfs_indirect_condense_obsolete_pct &&
+ mapping_size > zfs_condense_min_mapping_bytes) {
+ zfs_dbgmsg("should condense vdev %llu because obsolete "
+ "spacemap covers %d%% of %lluMB mapping",
+ (u_longlong_t)vd->vdev_id,
+ (int)(bytes_obsolete * 100 / bytes_mapped),
+ (u_longlong_t)bytes_mapped / 1024 / 1024);
+ return (B_TRUE);
+ }
+
+ /*
+ * If the obsolete space map takes up too much space on disk,
+ * condense in order to free up this disk space.
+ */
+ if (obsolete_sm_size >= zfs_condense_max_obsolete_bytes) {
+ zfs_dbgmsg("should condense vdev %llu because obsolete sm "
+ "length %lluMB >= max size %lluMB",
+ (u_longlong_t)vd->vdev_id,
+ (u_longlong_t)obsolete_sm_size / 1024 / 1024,
+ (u_longlong_t)zfs_condense_max_obsolete_bytes /
+ 1024 / 1024);
+ return (B_TRUE);
+ }
+
+ return (B_FALSE);
+}
+
+/*
+ * This sync task completes (finishes) a condense, deleting the old
+ * mapping and replacing it with the new one.
+ */
+static void
+spa_condense_indirect_complete_sync(void *arg, dmu_tx_t *tx)
+{
+ spa_condensing_indirect_t *sci = arg;
+ spa_t *spa = dmu_tx_pool(tx)->dp_spa;
+ spa_condensing_indirect_phys_t *scip =
+ &spa->spa_condensing_indirect_phys;
+ vdev_t *vd = vdev_lookup_top(spa, scip->scip_vdev);
+ vdev_indirect_config_t *vic = &vd->vdev_indirect_config;
+ objset_t *mos = spa->spa_meta_objset;
+ vdev_indirect_mapping_t *old_mapping = vd->vdev_indirect_mapping;
+ uint64_t old_count = vdev_indirect_mapping_num_entries(old_mapping);
+ uint64_t new_count =
+ vdev_indirect_mapping_num_entries(sci->sci_new_mapping);
+
+ ASSERT(dmu_tx_is_syncing(tx));
+ ASSERT3P(vd->vdev_ops, ==, &vdev_indirect_ops);
+ ASSERT3P(sci, ==, spa->spa_condensing_indirect);
+ for (int i = 0; i < TXG_SIZE; i++) {
+ ASSERT(list_is_empty(&sci->sci_new_mapping_entries[i]));
+ }
+ ASSERT(vic->vic_mapping_object != 0);
+ ASSERT3U(vd->vdev_id, ==, scip->scip_vdev);
+ ASSERT(scip->scip_next_mapping_object != 0);
+ ASSERT(scip->scip_prev_obsolete_sm_object != 0);
+
+ /*
+ * Reset vdev_indirect_mapping to refer to the new object.
+ */
+ rw_enter(&vd->vdev_indirect_rwlock, RW_WRITER);
+ vdev_indirect_mapping_close(vd->vdev_indirect_mapping);
+ vd->vdev_indirect_mapping = sci->sci_new_mapping;
+ rw_exit(&vd->vdev_indirect_rwlock);
+
+ sci->sci_new_mapping = NULL;
+ vdev_indirect_mapping_free(mos, vic->vic_mapping_object, tx);
+ vic->vic_mapping_object = scip->scip_next_mapping_object;
+ scip->scip_next_mapping_object = 0;
+
+ space_map_free_obj(mos, scip->scip_prev_obsolete_sm_object, tx);
+ spa_feature_decr(spa, SPA_FEATURE_OBSOLETE_COUNTS, tx);
+ scip->scip_prev_obsolete_sm_object = 0;
+
+ scip->scip_vdev = 0;
+
+ VERIFY0(zap_remove(mos, DMU_POOL_DIRECTORY_OBJECT,
+ DMU_POOL_CONDENSING_INDIRECT, tx));
+ spa_condensing_indirect_destroy(spa->spa_condensing_indirect);
+ spa->spa_condensing_indirect = NULL;
+
+ zfs_dbgmsg("finished condense of vdev %llu in txg %llu: "
+ "new mapping object %llu has %llu entries "
+ "(was %llu entries)",
+ vd->vdev_id, dmu_tx_get_txg(tx), vic->vic_mapping_object,
+ new_count, old_count);
+
+ vdev_config_dirty(spa->spa_root_vdev);
+}
+
+/*
+ * This sync task appends entries to the new mapping object.
+ */
+static void
+spa_condense_indirect_commit_sync(void *arg, dmu_tx_t *tx)
+{
+ spa_condensing_indirect_t *sci = arg;
+ uint64_t txg = dmu_tx_get_txg(tx);
+ spa_t *spa = dmu_tx_pool(tx)->dp_spa;
+
+ ASSERT(dmu_tx_is_syncing(tx));
+ ASSERT3P(sci, ==, spa->spa_condensing_indirect);
+
+ vdev_indirect_mapping_add_entries(sci->sci_new_mapping,
+ &sci->sci_new_mapping_entries[txg & TXG_MASK], tx);
+ ASSERT(list_is_empty(&sci->sci_new_mapping_entries[txg & TXG_MASK]));
+}
+
+/*
+ * Open-context function to add one entry to the new mapping. The new
+ * entry will be remembered and written from syncing context.
+ */
+static void
+spa_condense_indirect_commit_entry(spa_t *spa,
+ vdev_indirect_mapping_entry_phys_t *vimep, uint32_t count)
+{
+ spa_condensing_indirect_t *sci = spa->spa_condensing_indirect;
+
+ ASSERT3U(count, <, DVA_GET_ASIZE(&vimep->vimep_dst));
+
+ dmu_tx_t *tx = dmu_tx_create_dd(spa_get_dsl(spa)->dp_mos_dir);
+ dmu_tx_hold_space(tx, sizeof (*vimep) + sizeof (count));
+ VERIFY0(dmu_tx_assign(tx, TXG_WAIT));
+ int txgoff = dmu_tx_get_txg(tx) & TXG_MASK;
+
+ /*
+ * If we are the first entry committed this txg, kick off the sync
+ * task to write to the MOS on our behalf.
+ */
+ if (list_is_empty(&sci->sci_new_mapping_entries[txgoff])) {
+ dsl_sync_task_nowait(dmu_tx_pool(tx),
+ spa_condense_indirect_commit_sync, sci,
+ 0, ZFS_SPACE_CHECK_NONE, tx);
+ }
+
+ vdev_indirect_mapping_entry_t *vime =
+ kmem_alloc(sizeof (*vime), KM_SLEEP);
+ vime->vime_mapping = *vimep;
+ vime->vime_obsolete_count = count;
+ list_insert_tail(&sci->sci_new_mapping_entries[txgoff], vime);
+
+ dmu_tx_commit(tx);
+}
+
+static void
+spa_condense_indirect_generate_new_mapping(vdev_t *vd,
+ uint32_t *obsolete_counts, uint64_t start_index, zthr_t *zthr)
+{
+ spa_t *spa = vd->vdev_spa;
+ uint64_t mapi = start_index;
+ vdev_indirect_mapping_t *old_mapping = vd->vdev_indirect_mapping;
+ uint64_t old_num_entries =
+ vdev_indirect_mapping_num_entries(old_mapping);
+
+ ASSERT3P(vd->vdev_ops, ==, &vdev_indirect_ops);
+ ASSERT3U(vd->vdev_id, ==, spa->spa_condensing_indirect_phys.scip_vdev);
+
+ zfs_dbgmsg("starting condense of vdev %llu from index %llu",
+ (u_longlong_t)vd->vdev_id,
+ (u_longlong_t)mapi);
+
+ while (mapi < old_num_entries) {
+
+ if (zthr_iscancelled(zthr)) {
+ zfs_dbgmsg("pausing condense of vdev %llu "
+ "at index %llu", (u_longlong_t)vd->vdev_id,
+ (u_longlong_t)mapi);
+ break;
+ }
+
+ vdev_indirect_mapping_entry_phys_t *entry =
+ &old_mapping->vim_entries[mapi];
+ uint64_t entry_size = DVA_GET_ASIZE(&entry->vimep_dst);
+ ASSERT3U(obsolete_counts[mapi], <=, entry_size);
+ if (obsolete_counts[mapi] < entry_size) {
+ spa_condense_indirect_commit_entry(spa, entry,
+ obsolete_counts[mapi]);
+
+ /*
+ * This delay may be requested for testing, debugging,
+ * or performance reasons.
+ */
+ delay(zfs_condense_indirect_commit_entry_delay_ticks);
+ }
+
+ mapi++;
+ }
+}
+
+/* ARGSUSED */
+static boolean_t
+spa_condense_indirect_thread_check(void *arg, zthr_t *zthr)
+{
+ spa_t *spa = arg;
+
+ return (spa->spa_condensing_indirect != NULL);
+}
+
+/* ARGSUSED */
+static void
+spa_condense_indirect_thread(void *arg, zthr_t *zthr)
+{
+ spa_t *spa = arg;
+ vdev_t *vd;
+
+ ASSERT3P(spa->spa_condensing_indirect, !=, NULL);
+ spa_config_enter(spa, SCL_VDEV, FTAG, RW_READER);
+ vd = vdev_lookup_top(spa, spa->spa_condensing_indirect_phys.scip_vdev);
+ ASSERT3P(vd, !=, NULL);
+ spa_config_exit(spa, SCL_VDEV, FTAG);
+
+ spa_condensing_indirect_t *sci = spa->spa_condensing_indirect;
+ spa_condensing_indirect_phys_t *scip =
+ &spa->spa_condensing_indirect_phys;
+ uint32_t *counts;
+ uint64_t start_index;
+ vdev_indirect_mapping_t *old_mapping = vd->vdev_indirect_mapping;
+ space_map_t *prev_obsolete_sm = NULL;
+
+ ASSERT3U(vd->vdev_id, ==, scip->scip_vdev);
+ ASSERT(scip->scip_next_mapping_object != 0);
+ ASSERT(scip->scip_prev_obsolete_sm_object != 0);
+ ASSERT3P(vd->vdev_ops, ==, &vdev_indirect_ops);
+
+ for (int i = 0; i < TXG_SIZE; i++) {
+ /*
+ * The list must start out empty in order for the
+ * _commit_sync() sync task to be properly registered
+ * on the first call to _commit_entry(); so it's wise
+ * to double check and ensure we actually are starting
+ * with empty lists.
+ */
+ ASSERT(list_is_empty(&sci->sci_new_mapping_entries[i]));
+ }
+
+ VERIFY0(space_map_open(&prev_obsolete_sm, spa->spa_meta_objset,
+ scip->scip_prev_obsolete_sm_object, 0, vd->vdev_asize, 0));
+ counts = vdev_indirect_mapping_load_obsolete_counts(old_mapping);
+ if (prev_obsolete_sm != NULL) {
+ vdev_indirect_mapping_load_obsolete_spacemap(old_mapping,
+ counts, prev_obsolete_sm);
+ }
+ space_map_close(prev_obsolete_sm);
+
+ /*
+ * Generate new mapping. Determine what index to continue from
+ * based on the max offset that we've already written in the
+ * new mapping.
+ */
+ uint64_t max_offset =
+ vdev_indirect_mapping_max_offset(sci->sci_new_mapping);
+ if (max_offset == 0) {
+ /* We haven't written anything to the new mapping yet. */
+ start_index = 0;
+ } else {
+ /*
+ * Pick up from where we left off. _entry_for_offset()
+ * returns a pointer into the vim_entries array. If
+ * max_offset is greater than any of the mappings
+ * contained in the table NULL will be returned and
+ * that indicates we've exhausted our iteration of the
+ * old_mapping.
+ */
+
+ vdev_indirect_mapping_entry_phys_t *entry =
+ vdev_indirect_mapping_entry_for_offset_or_next(old_mapping,
+ max_offset);
+
+ if (entry == NULL) {
+ /*
+ * We've already written the whole new mapping.
+ * This special value will cause us to skip the
+ * generate_new_mapping step and just do the sync
+ * task to complete the condense.
+ */
+ start_index = UINT64_MAX;
+ } else {
+ start_index = entry - old_mapping->vim_entries;
+ ASSERT3U(start_index, <,
+ vdev_indirect_mapping_num_entries(old_mapping));
+ }
+ }
+
+ spa_condense_indirect_generate_new_mapping(vd, counts,
+ start_index, zthr);
+
+ vdev_indirect_mapping_free_obsolete_counts(old_mapping, counts);
+
+ /*
+ * If the zthr has received a cancellation signal while running
+ * in generate_new_mapping() or at any point after that, then bail
+ * early. We don't want to complete the condense if the spa is
+ * shutting down.
+ */
+ if (zthr_iscancelled(zthr))
+ return;
+
+ VERIFY0(dsl_sync_task(spa_name(spa), NULL,
+ spa_condense_indirect_complete_sync, sci, 0,
+ ZFS_SPACE_CHECK_EXTRA_RESERVED));
+}
+
+/*
+ * Sync task to begin the condensing process.
+ */
+void
+spa_condense_indirect_start_sync(vdev_t *vd, dmu_tx_t *tx)
+{
+ spa_t *spa = vd->vdev_spa;
+ spa_condensing_indirect_phys_t *scip =
+ &spa->spa_condensing_indirect_phys;
+
+ ASSERT0(scip->scip_next_mapping_object);
+ ASSERT0(scip->scip_prev_obsolete_sm_object);
+ ASSERT0(scip->scip_vdev);
+ ASSERT(dmu_tx_is_syncing(tx));
+ ASSERT3P(vd->vdev_ops, ==, &vdev_indirect_ops);
+ ASSERT(spa_feature_is_active(spa, SPA_FEATURE_OBSOLETE_COUNTS));
+ ASSERT(vdev_indirect_mapping_num_entries(vd->vdev_indirect_mapping));
+
+ uint64_t obsolete_sm_obj = vdev_obsolete_sm_object(vd);
+ ASSERT(obsolete_sm_obj != 0);
+
+ scip->scip_vdev = vd->vdev_id;
+ scip->scip_next_mapping_object =
+ vdev_indirect_mapping_alloc(spa->spa_meta_objset, tx);
+
+ scip->scip_prev_obsolete_sm_object = obsolete_sm_obj;
+
+ /*
+ * We don't need to allocate a new space map object, since
+ * vdev_indirect_sync_obsolete will allocate one when needed.
+ */
+ space_map_close(vd->vdev_obsolete_sm);
+ vd->vdev_obsolete_sm = NULL;
+ VERIFY0(zap_remove(spa->spa_meta_objset, vd->vdev_top_zap,
+ VDEV_TOP_ZAP_INDIRECT_OBSOLETE_SM, tx));
+
+ VERIFY0(zap_add(spa->spa_dsl_pool->dp_meta_objset,
+ DMU_POOL_DIRECTORY_OBJECT,
+ DMU_POOL_CONDENSING_INDIRECT, sizeof (uint64_t),
+ sizeof (*scip) / sizeof (uint64_t), scip, tx));
+
+ ASSERT3P(spa->spa_condensing_indirect, ==, NULL);
+ spa->spa_condensing_indirect = spa_condensing_indirect_create(spa);
+
+ zfs_dbgmsg("starting condense of vdev %llu in txg %llu: "
+ "posm=%llu nm=%llu",
+	    (u_longlong_t)vd->vdev_id, (u_longlong_t)dmu_tx_get_txg(tx),
+ (u_longlong_t)scip->scip_prev_obsolete_sm_object,
+ (u_longlong_t)scip->scip_next_mapping_object);
+
+ zthr_wakeup(spa->spa_condense_zthr);
+}
+
+/*
+ * Sync to the given vdev's obsolete space map any segments that are no longer
+ * referenced as of the given txg.
+ *
+ * If the obsolete space map doesn't exist yet, create and open it.
+ */
+void
+vdev_indirect_sync_obsolete(vdev_t *vd, dmu_tx_t *tx)
+{
+ spa_t *spa = vd->vdev_spa;
+ vdev_indirect_config_t *vic = &vd->vdev_indirect_config;
+
+ ASSERT3U(vic->vic_mapping_object, !=, 0);
+ ASSERT(range_tree_space(vd->vdev_obsolete_segments) > 0);
+ ASSERT(vd->vdev_removing || vd->vdev_ops == &vdev_indirect_ops);
+ ASSERT(spa_feature_is_enabled(spa, SPA_FEATURE_OBSOLETE_COUNTS));
+
+ if (vdev_obsolete_sm_object(vd) == 0) {
+ uint64_t obsolete_sm_object =
+ space_map_alloc(spa->spa_meta_objset,
+ vdev_standard_sm_blksz, tx);
+
+ ASSERT(vd->vdev_top_zap != 0);
+ VERIFY0(zap_add(vd->vdev_spa->spa_meta_objset, vd->vdev_top_zap,
+ VDEV_TOP_ZAP_INDIRECT_OBSOLETE_SM,
+ sizeof (obsolete_sm_object), 1, &obsolete_sm_object, tx));
+ ASSERT3U(vdev_obsolete_sm_object(vd), !=, 0);
+
+ spa_feature_incr(spa, SPA_FEATURE_OBSOLETE_COUNTS, tx);
+ VERIFY0(space_map_open(&vd->vdev_obsolete_sm,
+ spa->spa_meta_objset, obsolete_sm_object,
+ 0, vd->vdev_asize, 0));
+ }
+
+ ASSERT(vd->vdev_obsolete_sm != NULL);
+ ASSERT3U(vdev_obsolete_sm_object(vd), ==,
+ space_map_object(vd->vdev_obsolete_sm));
+
+ space_map_write(vd->vdev_obsolete_sm,
+ vd->vdev_obsolete_segments, SM_ALLOC, SM_NO_VDEVID, tx);
+ range_tree_vacate(vd->vdev_obsolete_segments, NULL, NULL);
+}
+
+int
+spa_condense_init(spa_t *spa)
+{
+ int error = zap_lookup(spa->spa_meta_objset,
+ DMU_POOL_DIRECTORY_OBJECT,
+ DMU_POOL_CONDENSING_INDIRECT, sizeof (uint64_t),
+ sizeof (spa->spa_condensing_indirect_phys) / sizeof (uint64_t),
+ &spa->spa_condensing_indirect_phys);
+ if (error == 0) {
+ if (spa_writeable(spa)) {
+ spa->spa_condensing_indirect =
+ spa_condensing_indirect_create(spa);
+ }
+ return (0);
+ } else if (error == ENOENT) {
+ return (0);
+ } else {
+ return (error);
+ }
+}
+
+void
+spa_condense_fini(spa_t *spa)
+{
+ if (spa->spa_condensing_indirect != NULL) {
+ spa_condensing_indirect_destroy(spa->spa_condensing_indirect);
+ spa->spa_condensing_indirect = NULL;
+ }
+}
+
+void
+spa_start_indirect_condensing_thread(spa_t *spa)
+{
+ ASSERT3P(spa->spa_condense_zthr, ==, NULL);
+ spa->spa_condense_zthr = zthr_create(spa_condense_indirect_thread_check,
+ spa_condense_indirect_thread, spa);
+}
+
+/*
+ * Gets the obsolete spacemap object from the vdev's ZAP.
+ * Returns the spacemap object, or 0 if it wasn't in the ZAP or the ZAP doesn't
+ * exist yet.
+ */
+uint64_t
+vdev_obsolete_sm_object(vdev_t *vd)
+{
+ ASSERT0(spa_config_held(vd->vdev_spa, SCL_ALL, RW_WRITER));
+ if (vd->vdev_top_zap == 0) {
+ return (0);
+ }
+
+ uint64_t sm_obj = 0;
+ int err = zap_lookup(vd->vdev_spa->spa_meta_objset, vd->vdev_top_zap,
+ VDEV_TOP_ZAP_INDIRECT_OBSOLETE_SM, sizeof (sm_obj), 1, &sm_obj);
+
+ ASSERT(err == 0 || err == ENOENT);
+
+ return (sm_obj);
+}
+
+boolean_t
+vdev_obsolete_counts_are_precise(vdev_t *vd)
+{
+ ASSERT0(spa_config_held(vd->vdev_spa, SCL_ALL, RW_WRITER));
+ if (vd->vdev_top_zap == 0) {
+ return (B_FALSE);
+ }
+
+ uint64_t val = 0;
+ int err = zap_lookup(vd->vdev_spa->spa_meta_objset, vd->vdev_top_zap,
+ VDEV_TOP_ZAP_OBSOLETE_COUNTS_ARE_PRECISE, sizeof (val), 1, &val);
+
+ ASSERT(err == 0 || err == ENOENT);
+
+ return (val != 0);
+}
+
+/* ARGSUSED */
+static void
+vdev_indirect_close(vdev_t *vd)
+{
+}
+
+/* ARGSUSED */
+static int
+vdev_indirect_open(vdev_t *vd, uint64_t *psize, uint64_t *max_psize,
+ uint64_t *logical_ashift, uint64_t *physical_ashift)
+{
+ *psize = *max_psize = vd->vdev_asize +
+ VDEV_LABEL_START_SIZE + VDEV_LABEL_END_SIZE;
+ *logical_ashift = vd->vdev_ashift;
+ *physical_ashift = vd->vdev_physical_ashift;
+ return (0);
+}
+
+typedef struct remap_segment {
+ vdev_t *rs_vd;
+ uint64_t rs_offset;
+ uint64_t rs_asize;
+ uint64_t rs_split_offset;
+ list_node_t rs_node;
+} remap_segment_t;
+
+remap_segment_t *
+rs_alloc(vdev_t *vd, uint64_t offset, uint64_t asize, uint64_t split_offset)
+{
+ remap_segment_t *rs = kmem_alloc(sizeof (remap_segment_t), KM_SLEEP);
+ rs->rs_vd = vd;
+ rs->rs_offset = offset;
+ rs->rs_asize = asize;
+ rs->rs_split_offset = split_offset;
+ return (rs);
+}
+
+/*
+ * Given an indirect vdev and an extent on that vdev, this function
+ * duplicates the physical entries of the indirect mapping that correspond
+ * to the extent into a new array and returns a pointer to it. In addition,
+ * copied_entries is populated with the number of mapping entries that were
+ * duplicated.
+ *
+ * Note that the function assumes that the caller holds vdev_indirect_rwlock.
+ * This ensures that the mapping won't change due to condensing as we
+ * copy over its contents.
+ *
+ * Finally, since we are doing an allocation, it is up to the caller to
+ * free the array allocated in this function.
+ */
+vdev_indirect_mapping_entry_phys_t *
+vdev_indirect_mapping_duplicate_adjacent_entries(vdev_t *vd, uint64_t offset,
+ uint64_t asize, uint64_t *copied_entries)
+{
+ vdev_indirect_mapping_entry_phys_t *duplicate_mappings = NULL;
+ vdev_indirect_mapping_t *vim = vd->vdev_indirect_mapping;
+ uint64_t entries = 0;
+
+ ASSERT(RW_READ_HELD(&vd->vdev_indirect_rwlock));
+
+ vdev_indirect_mapping_entry_phys_t *first_mapping =
+ vdev_indirect_mapping_entry_for_offset(vim, offset);
+ ASSERT3P(first_mapping, !=, NULL);
+
+ vdev_indirect_mapping_entry_phys_t *m = first_mapping;
+ while (asize > 0) {
+ uint64_t size = DVA_GET_ASIZE(&m->vimep_dst);
+
+ ASSERT3U(offset, >=, DVA_MAPPING_GET_SRC_OFFSET(m));
+ ASSERT3U(offset, <, DVA_MAPPING_GET_SRC_OFFSET(m) + size);
+
+ uint64_t inner_offset = offset - DVA_MAPPING_GET_SRC_OFFSET(m);
+ uint64_t inner_size = MIN(asize, size - inner_offset);
+
+ offset += inner_size;
+ asize -= inner_size;
+ entries++;
+ m++;
+ }
+
+ size_t copy_length = entries * sizeof (*first_mapping);
+ duplicate_mappings = kmem_alloc(copy_length, KM_SLEEP);
+ bcopy(first_mapping, duplicate_mappings, copy_length);
+ *copied_entries = entries;
+
+ return (duplicate_mappings);
+}
+
+/*
+ * Goes through the relevant indirect mappings until it hits a concrete vdev
+ * and issues the callback. On the way to the concrete vdev, if any other
+ * indirect vdevs are encountered, then the callback will also be called on
+ * each of those indirect vdevs. For example, if the segment is mapped to
+ * segment A on indirect vdev 1, and then segment A on indirect vdev 1 is
+ * mapped to segment B on concrete vdev 2, then the callback will be called on
+ * both vdev 1 and vdev 2.
+ *
+ * While the callback passed to vdev_indirect_remap() is called on every vdev
+ * the function encounters, certain callbacks only care about concrete vdevs.
+ * These types of callbacks should return immediately and explicitly when they
+ * are called on an indirect vdev.
+ *
+ * Because there is a possibility that a DVA section in the indirect device
+ * has been split into multiple sections in our mapping, we keep track
+ * of the relevant contiguous segments of the new location (remap_segment_t)
+ * in a stack. This way we can call the callback for each of the new sections
+ * created by a single section of the indirect device. Note, though, that in
+ * this scenario the callbacks in each split block won't occur in order in
+ * terms of offset, so callers should not make any assumptions about that.
+ *
+ * For callbacks that don't handle split blocks and immediately return when
+ * they encounter them (as is the case for remap_blkptr_cb), the caller can
+ * assume that its callback will be applied from the first indirect vdev
+ * encountered to the last one and then the concrete vdev, in that order.
+ */
+static void
+vdev_indirect_remap(vdev_t *vd, uint64_t offset, uint64_t asize,
+ void (*func)(uint64_t, vdev_t *, uint64_t, uint64_t, void *), void *arg)
+{
+ list_t stack;
+ spa_t *spa = vd->vdev_spa;
+
+ list_create(&stack, sizeof (remap_segment_t),
+ offsetof(remap_segment_t, rs_node));
+
+ for (remap_segment_t *rs = rs_alloc(vd, offset, asize, 0);
+ rs != NULL; rs = list_remove_head(&stack)) {
+ vdev_t *v = rs->rs_vd;
+ uint64_t num_entries = 0;
+
+ ASSERT(spa_config_held(spa, SCL_ALL, RW_READER) != 0);
+ ASSERT(rs->rs_asize > 0);
+
+ /*
+ * Note: As this function can be called from open context
+ * (e.g. zio_read()), we need the following rwlock to
+ * prevent the mapping from being changed by condensing.
+ *
+ * So we grab the lock and we make a copy of the entries
+ * that are relevant to the extent that we are working on.
+ * Once that is done, we drop the lock and iterate over
+		 * our copy of the mapping. Once we are done with the
+		 * remap segment and we free it, we also free our copy
+ * of the indirect mapping entries that are relevant to it.
+ *
+ * This way we don't need to wait until the function is
+ * finished with a segment, to condense it. In addition, we
+ * don't need a recursive rwlock for the case that a call to
+ * vdev_indirect_remap() needs to call itself (through the
+ * codepath of its callback) for the same vdev in the middle
+ * of its execution.
+ */
+ rw_enter(&v->vdev_indirect_rwlock, RW_READER);
+ vdev_indirect_mapping_t *vim = v->vdev_indirect_mapping;
+ ASSERT3P(vim, !=, NULL);
+
+ vdev_indirect_mapping_entry_phys_t *mapping =
+ vdev_indirect_mapping_duplicate_adjacent_entries(v,
+ rs->rs_offset, rs->rs_asize, &num_entries);
+ ASSERT3P(mapping, !=, NULL);
+ ASSERT3U(num_entries, >, 0);
+ rw_exit(&v->vdev_indirect_rwlock);
+
+ for (uint64_t i = 0; i < num_entries; i++) {
+ /*
+ * Note: the vdev_indirect_mapping can not change
+ * while we are running. It only changes while the
+ * removal is in progress, and then only from syncing
+ * context. While a removal is in progress, this
+ * function is only called for frees, which also only
+ * happen from syncing context.
+ */
+ vdev_indirect_mapping_entry_phys_t *m = &mapping[i];
+
+ ASSERT3P(m, !=, NULL);
+ ASSERT3U(rs->rs_asize, >, 0);
+
+ uint64_t size = DVA_GET_ASIZE(&m->vimep_dst);
+ uint64_t dst_offset = DVA_GET_OFFSET(&m->vimep_dst);
+ uint64_t dst_vdev = DVA_GET_VDEV(&m->vimep_dst);
+
+ ASSERT3U(rs->rs_offset, >=,
+ DVA_MAPPING_GET_SRC_OFFSET(m));
+ ASSERT3U(rs->rs_offset, <,
+ DVA_MAPPING_GET_SRC_OFFSET(m) + size);
+ ASSERT3U(dst_vdev, !=, v->vdev_id);
+
+ uint64_t inner_offset = rs->rs_offset -
+ DVA_MAPPING_GET_SRC_OFFSET(m);
+ uint64_t inner_size =
+ MIN(rs->rs_asize, size - inner_offset);
+
+ vdev_t *dst_v = vdev_lookup_top(spa, dst_vdev);
+ ASSERT3P(dst_v, !=, NULL);
+
+ if (dst_v->vdev_ops == &vdev_indirect_ops) {
+ list_insert_head(&stack,
+ rs_alloc(dst_v, dst_offset + inner_offset,
+ inner_size, rs->rs_split_offset));
+			}
+
+ if ((zfs_flags & ZFS_DEBUG_INDIRECT_REMAP) &&
+ IS_P2ALIGNED(inner_size, 2 * SPA_MINBLOCKSIZE)) {
+ /*
+				 * Note: This clause exists solely for
+ * testing purposes. We use it to ensure that
+ * split blocks work and that the callbacks
+ * using them yield the same result if issued
+ * in reverse order.
+ */
+ uint64_t inner_half = inner_size / 2;
+
+ func(rs->rs_split_offset + inner_half, dst_v,
+ dst_offset + inner_offset + inner_half,
+ inner_half, arg);
+
+ func(rs->rs_split_offset, dst_v,
+ dst_offset + inner_offset,
+ inner_half, arg);
+ } else {
+ func(rs->rs_split_offset, dst_v,
+ dst_offset + inner_offset,
+ inner_size, arg);
+ }
+
+ rs->rs_offset += inner_size;
+ rs->rs_asize -= inner_size;
+ rs->rs_split_offset += inner_size;
+ }
+ VERIFY0(rs->rs_asize);
+
+ kmem_free(mapping, num_entries * sizeof (*mapping));
+ kmem_free(rs, sizeof (remap_segment_t));
+ }
+ list_destroy(&stack);
+}
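+
+/*
+ * Illustrative sketch (hypothetical, not part of the original change): a
+ * minimal vdev_indirect_remap() callback that totals the asize mapped onto
+ * concrete vdevs. As described above, the callback also fires for any
+ * intermediate indirect vdevs, so a concrete-only consumer must return
+ * early for those. The ZFS_EXAMPLE_SKETCHES guard is likewise hypothetical.
+ */
+#ifdef ZFS_EXAMPLE_SKETCHES
+static void
+example_count_concrete_cb(uint64_t split_offset, vdev_t *vd, uint64_t offset,
+    uint64_t size, void *arg)
+{
+	uint64_t *total = arg;
+
+	/* Skip the indirect vdevs that the traversal also visits. */
+	if (vd->vdev_ops == &vdev_indirect_ops)
+		return;
+
+	*total += size;
+}
+#endif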
+
+static void
+vdev_indirect_child_io_done(zio_t *zio)
+{
+ zio_t *pio = zio->io_private;
+
+ mutex_enter(&pio->io_lock);
+ pio->io_error = zio_worst_error(pio->io_error, zio->io_error);
+ mutex_exit(&pio->io_lock);
+
+#ifdef __FreeBSD__
+ if (zio->io_abd != NULL)
+#endif
+ abd_put(zio->io_abd);
+}
+
+/*
+ * This is a callback for vdev_indirect_remap() which allocates an
+ * indirect_split_t for each split segment and adds it to iv_splits.
+ */
+static void
+vdev_indirect_gather_splits(uint64_t split_offset, vdev_t *vd, uint64_t offset,
+ uint64_t size, void *arg)
+{
+ zio_t *zio = arg;
+ indirect_vsd_t *iv = zio->io_vsd;
+
+ ASSERT3P(vd, !=, NULL);
+
+ if (vd->vdev_ops == &vdev_indirect_ops)
+ return;
+
+ int n = 1;
+ if (vd->vdev_ops == &vdev_mirror_ops)
+ n = vd->vdev_children;
+
+ indirect_split_t *is =
+ kmem_zalloc(offsetof(indirect_split_t, is_child[n]), KM_SLEEP);
+
+ is->is_children = n;
+ is->is_size = size;
+ is->is_split_offset = split_offset;
+ is->is_target_offset = offset;
+ is->is_vdev = vd;
+ list_create(&is->is_unique_child, sizeof (indirect_child_t),
+ offsetof(indirect_child_t, ic_node));
+
+ /*
+ * Note that we only consider multiple copies of the data for
+ * *mirror* vdevs. We don't for "replacing" or "spare" vdevs, even
+ * though they use the same ops as mirror, because there's only one
+ * "good" copy under the replacing/spare.
+ */
+ if (vd->vdev_ops == &vdev_mirror_ops) {
+ for (int i = 0; i < n; i++) {
+ is->is_child[i].ic_vdev = vd->vdev_child[i];
+ list_link_init(&is->is_child[i].ic_node);
+ }
+ } else {
+ is->is_child[0].ic_vdev = vd;
+ }
+
+ list_insert_tail(&iv->iv_splits, is);
+}
+
+static void
+vdev_indirect_read_split_done(zio_t *zio)
+{
+ indirect_child_t *ic = zio->io_private;
+
+ if (zio->io_error != 0) {
+ /*
+ * Clear ic_data to indicate that we do not have data for this
+ * child.
+ */
+ abd_free(ic->ic_data);
+ ic->ic_data = NULL;
+ }
+}
+
+/*
+ * Issue reads for all copies (mirror children) of all splits.
+ */
+static void
+vdev_indirect_read_all(zio_t *zio)
+{
+ indirect_vsd_t *iv = zio->io_vsd;
+
+ ASSERT3U(zio->io_type, ==, ZIO_TYPE_READ);
+
+ for (indirect_split_t *is = list_head(&iv->iv_splits);
+ is != NULL; is = list_next(&iv->iv_splits, is)) {
+ for (int i = 0; i < is->is_children; i++) {
+ indirect_child_t *ic = &is->is_child[i];
+
+ if (!vdev_readable(ic->ic_vdev))
+ continue;
+
+ /*
+ * Note, we may read from a child whose DTL
+ * indicates that the data may not be present here.
+ * While this might result in a few i/os that will
+ * likely return incorrect data, it simplifies the
+ * code since we can treat scrub and resilver
+ * identically. (The incorrect data will be
+ * detected and ignored when we verify the
+ * checksum.)
+ */
+
+ ic->ic_data = abd_alloc_sametype(zio->io_abd,
+ is->is_size);
+ ic->ic_duplicate = NULL;
+
+ zio_nowait(zio_vdev_child_io(zio, NULL,
+ ic->ic_vdev, is->is_target_offset, ic->ic_data,
+ is->is_size, zio->io_type, zio->io_priority, 0,
+ vdev_indirect_read_split_done, ic));
+ }
+ }
+ iv->iv_reconstruct = B_TRUE;
+}
+
+static void
+vdev_indirect_io_start(zio_t *zio)
+{
+ spa_t *spa = zio->io_spa;
+ indirect_vsd_t *iv = kmem_zalloc(sizeof (*iv), KM_SLEEP);
+ list_create(&iv->iv_splits,
+ sizeof (indirect_split_t), offsetof(indirect_split_t, is_node));
+
+ zio->io_vsd = iv;
+ zio->io_vsd_ops = &vdev_indirect_vsd_ops;
+
+ ASSERT(spa_config_held(spa, SCL_ALL, RW_READER) != 0);
+#ifdef __FreeBSD__
+ if (zio->io_type == ZIO_TYPE_WRITE) {
+#else
+ if (zio->io_type != ZIO_TYPE_READ) {
+ ASSERT3U(zio->io_type, ==, ZIO_TYPE_WRITE);
+#endif
+ /*
+ * Note: this code can handle other kinds of writes,
+ * but we don't expect them.
+ */
+ ASSERT((zio->io_flags & (ZIO_FLAG_SELF_HEAL |
+ ZIO_FLAG_RESILVER | ZIO_FLAG_INDUCE_DAMAGE)) != 0);
+ }
+
+ vdev_indirect_remap(zio->io_vd, zio->io_offset, zio->io_size,
+ vdev_indirect_gather_splits, zio);
+
+ indirect_split_t *first = list_head(&iv->iv_splits);
+ if (first->is_size == zio->io_size) {
+ /*
+ * This is not a split block; we are pointing to the entire
+ * data, which will checksum the same as the original data.
+ * Pass the BP down so that the child i/o can verify the
+ * checksum, and try a different location if available
+ * (e.g. on a mirror).
+ *
+ * While this special case could be handled the same as the
+ * general (split block) case, doing it this way ensures
+ * that the vast majority of blocks on indirect vdevs
+ * (which are not split) are handled identically to blocks
+ * on non-indirect vdevs. This allows us to be less strict
+ * about performance in the general (but rare) case.
+ */
+ ASSERT0(first->is_split_offset);
+ ASSERT3P(list_next(&iv->iv_splits, first), ==, NULL);
+ zio_nowait(zio_vdev_child_io(zio, zio->io_bp,
+ first->is_vdev, first->is_target_offset,
+#ifdef __FreeBSD__
+ zio->io_abd == NULL ? NULL :
+#endif
+ abd_get_offset(zio->io_abd, 0),
+ zio->io_size, zio->io_type, zio->io_priority, 0,
+ vdev_indirect_child_io_done, zio));
+ } else {
+ iv->iv_split_block = B_TRUE;
+ if (zio->io_type == ZIO_TYPE_READ &&
+ zio->io_flags & (ZIO_FLAG_SCRUB | ZIO_FLAG_RESILVER)) {
+ /*
+ * Read all copies. Note that for simplicity,
+ * we don't bother consulting the DTL in the
+ * resilver case.
+ */
+ vdev_indirect_read_all(zio);
+ } else {
+ /*
+ * If this is a read zio, we read one copy of each
+ * split segment, from the top-level vdev. Since
+ * we don't know the checksum of each split
+ * individually, the child zio can't ensure that
+ * we get the right data. E.g. if it's a mirror,
+ * it will just read from a random (healthy) leaf
+ * vdev. We have to verify the checksum in
+ * vdev_indirect_io_done().
+ *
+ * For write zios, the vdev code will ensure we write
+ * to all children.
+ */
+ for (indirect_split_t *is = list_head(&iv->iv_splits);
+ is != NULL; is = list_next(&iv->iv_splits, is)) {
+ zio_nowait(zio_vdev_child_io(zio, NULL,
+ is->is_vdev, is->is_target_offset,
+#ifdef __FreeBSD__
+ zio->io_abd == NULL ? NULL :
+#endif
+ abd_get_offset(zio->io_abd,
+ is->is_split_offset),
+ is->is_size, zio->io_type,
+ zio->io_priority, 0,
+ vdev_indirect_child_io_done, zio));
+ }
+ }
+ }
+
+ zio_execute(zio);
+}
+
+/*
+ * Report a checksum error for a child.
+ */
+static void
+vdev_indirect_checksum_error(zio_t *zio,
+ indirect_split_t *is, indirect_child_t *ic)
+{
+ vdev_t *vd = ic->ic_vdev;
+
+ if (zio->io_flags & ZIO_FLAG_SPECULATIVE)
+ return;
+
+ mutex_enter(&vd->vdev_stat_lock);
+ vd->vdev_stat.vs_checksum_errors++;
+ mutex_exit(&vd->vdev_stat_lock);
+
+ zio_bad_cksum_t zbc = { 0 };
+ void *bad_buf = abd_borrow_buf_copy(ic->ic_data, is->is_size);
+ abd_t *good_abd = is->is_good_child->ic_data;
+ void *good_buf = abd_borrow_buf_copy(good_abd, is->is_size);
+ zfs_ereport_post_checksum(zio->io_spa, vd, zio,
+ is->is_target_offset, is->is_size, good_buf, bad_buf, &zbc);
+ abd_return_buf(ic->ic_data, bad_buf, is->is_size);
+ abd_return_buf(good_abd, good_buf, is->is_size);
+}
+
+/*
+ * Issue repair i/os for any incorrect copies. We do this by comparing
+ * each split segment's correct data (is_good_child's ic_data) with each
+ * other copy of the data. If they differ, then we overwrite the bad data
+ * with the good copy. Note that we do this without regard for the DTL's,
+ * which simplifies this code and also issues the optimal number of writes
+ * (based on which copies actually read bad data, as opposed to which we
+ * think might be wrong). For the same reason, we always use
+ * ZIO_FLAG_SELF_HEAL, to bypass the DTL check in zio_vdev_io_start().
+ */
+static void
+vdev_indirect_repair(zio_t *zio)
+{
+ indirect_vsd_t *iv = zio->io_vsd;
+
+ if (!spa_writeable(zio->io_spa))
+ return;
+
+ for (indirect_split_t *is = list_head(&iv->iv_splits);
+ is != NULL; is = list_next(&iv->iv_splits, is)) {
+ for (int c = 0; c < is->is_children; c++) {
+ indirect_child_t *ic = &is->is_child[c];
+ if (ic == is->is_good_child)
+ continue;
+ if (ic->ic_data == NULL)
+ continue;
+ if (ic->ic_duplicate == is->is_good_child)
+ continue;
+
+ zio_nowait(zio_vdev_child_io(zio, NULL,
+ ic->ic_vdev, is->is_target_offset,
+ is->is_good_child->ic_data, is->is_size,
+ ZIO_TYPE_WRITE, ZIO_PRIORITY_ASYNC_WRITE,
+ ZIO_FLAG_IO_REPAIR | ZIO_FLAG_SELF_HEAL,
+ NULL, NULL));
+
+ vdev_indirect_checksum_error(zio, is, ic);
+ }
+ }
+}
+
+/*
+ * Report checksum errors on all children that we read from.
+ */
+static void
+vdev_indirect_all_checksum_errors(zio_t *zio)
+{
+ indirect_vsd_t *iv = zio->io_vsd;
+
+ if (zio->io_flags & ZIO_FLAG_SPECULATIVE)
+ return;
+
+ for (indirect_split_t *is = list_head(&iv->iv_splits);
+ is != NULL; is = list_next(&iv->iv_splits, is)) {
+ for (int c = 0; c < is->is_children; c++) {
+ indirect_child_t *ic = &is->is_child[c];
+
+ if (ic->ic_data == NULL)
+ continue;
+
+ vdev_t *vd = ic->ic_vdev;
+
+ mutex_enter(&vd->vdev_stat_lock);
+ vd->vdev_stat.vs_checksum_errors++;
+ mutex_exit(&vd->vdev_stat_lock);
+
+ zfs_ereport_post_checksum(zio->io_spa, vd, zio,
+ is->is_target_offset, is->is_size,
+ NULL, NULL, NULL);
+ }
+ }
+}
+
+/*
+ * Copy data from all the splits to the main zio, then validate the checksum.
+ * If the checksum is successfully validated, return success.
+ */
+static int
+vdev_indirect_splits_checksum_validate(indirect_vsd_t *iv, zio_t *zio)
+{
+ zio_bad_cksum_t zbc;
+
+ for (indirect_split_t *is = list_head(&iv->iv_splits);
+ is != NULL; is = list_next(&iv->iv_splits, is)) {
+
+ ASSERT3P(is->is_good_child->ic_data, !=, NULL);
+ ASSERT3P(is->is_good_child->ic_duplicate, ==, NULL);
+
+ abd_copy_off(zio->io_abd, is->is_good_child->ic_data,
+ is->is_split_offset, 0, is->is_size);
+ }
+
+ return (zio_checksum_error(zio, &zbc));
+}
+
+/*
+ * There are relatively few possible combinations, making it feasible to
+ * deterministically check them all. We do this by advancing a split's
+ * good_child to its next unique version. If we reach the end of that
+ * split's list, we wrap around and "carry over" into the next split
+ * (like counting in base is_unique_children, but each digit can have a
+ * different base).
+ */
+static int
+vdev_indirect_splits_enumerate_all(indirect_vsd_t *iv, zio_t *zio)
+{
+ boolean_t more = B_TRUE;
+
+ iv->iv_attempts = 0;
+
+ for (indirect_split_t *is = list_head(&iv->iv_splits);
+ is != NULL; is = list_next(&iv->iv_splits, is))
+ is->is_good_child = list_head(&is->is_unique_child);
+
+ while (more == B_TRUE) {
+ iv->iv_attempts++;
+ more = B_FALSE;
+
+ if (vdev_indirect_splits_checksum_validate(iv, zio) == 0)
+ return (0);
+
+ for (indirect_split_t *is = list_head(&iv->iv_splits);
+ is != NULL; is = list_next(&iv->iv_splits, is)) {
+ is->is_good_child = list_next(&is->is_unique_child,
+ is->is_good_child);
+ if (is->is_good_child != NULL) {
+ more = B_TRUE;
+ break;
+ }
+
+ is->is_good_child = list_head(&is->is_unique_child);
+ }
+ }
+
+ ASSERT3S(iv->iv_attempts, <=, iv->iv_unique_combinations);
+
+ return (SET_ERROR(ECKSUM));
+}
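+
+/*
+ * Editorial sketch (hypothetical helper, not part of the original change):
+ * the loop above behaves like an odometer in which digit i counts in base
+ * is_unique_children of split i. With bases {2, 3} it would visit
+ * (0,0) (1,0) (0,1) (1,1) (0,2) (1,2) -- 2 * 3 = 6 attempts, matching the
+ * ASSERT3S() against iv_unique_combinations above. The guard macro is
+ * hypothetical as well.
+ */
+#ifdef ZFS_EXAMPLE_SKETCHES
+static boolean_t
+example_odometer_advance(uint64_t *digits, const uint64_t *bases, int n)
+{
+	for (int i = 0; i < n; i++) {
+		if (++digits[i] < bases[i])
+			return (B_TRUE);	/* advanced with no carry */
+		digits[i] = 0;			/* wrap this digit, carry on */
+	}
+	return (B_FALSE);	/* wrapped past the last digit: all seen */
+}
+#endif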
+
+/*
+ * There are too many combinations to try all of them in a reasonable amount
+ * of time. So try a fixed number of random combinations from the unique
+ * split versions, after which we'll consider the block unrecoverable.
+ */
+static int
+vdev_indirect_splits_enumerate_randomly(indirect_vsd_t *iv, zio_t *zio)
+{
+ iv->iv_attempts = 0;
+
+ while (iv->iv_attempts < iv->iv_attempts_max) {
+ iv->iv_attempts++;
+
+ for (indirect_split_t *is = list_head(&iv->iv_splits);
+ is != NULL; is = list_next(&iv->iv_splits, is)) {
+ indirect_child_t *ic = list_head(&is->is_unique_child);
+ int children = is->is_unique_children;
+
+ for (int i = spa_get_random(children); i > 0; i--)
+ ic = list_next(&is->is_unique_child, ic);
+
+ ASSERT3P(ic, !=, NULL);
+ is->is_good_child = ic;
+ }
+
+ if (vdev_indirect_splits_checksum_validate(iv, zio) == 0)
+ return (0);
+ }
+
+ return (SET_ERROR(ECKSUM));
+}
+
+/*
+ * This is a validation function for reconstruction. It randomly selects
+ * a good combination, if one can be found, and then it intentionally
+ * damages all other segment copies by zeroing them. This forces the
+ * reconstruction algorithm to locate the one remaining known good copy.
+ */
+static int
+vdev_indirect_splits_damage(indirect_vsd_t *iv, zio_t *zio)
+{
+ /* Presume all the copies are unique for initial selection. */
+ for (indirect_split_t *is = list_head(&iv->iv_splits);
+ is != NULL; is = list_next(&iv->iv_splits, is)) {
+ is->is_unique_children = 0;
+
+ for (int i = 0; i < is->is_children; i++) {
+ indirect_child_t *ic = &is->is_child[i];
+ if (ic->ic_data != NULL) {
+ is->is_unique_children++;
+ list_insert_tail(&is->is_unique_child, ic);
+ }
+ }
+ }
+
+ /*
+ * Set each is_good_child to a randomly-selected child which
+ * is known to contain validated data.
+ */
+ int error = vdev_indirect_splits_enumerate_randomly(iv, zio);
+ if (error)
+ goto out;
+
+ /*
+	 * Damage all but the known good copy by zeroing them. This will
+	 * result in two or fewer unique copies per indirect_split_t.
+	 * Both may need to be checked in order to reconstruct the block.
+	 * Set iv->iv_attempts_max such that all unique combinations will
+	 * be enumerated, but limit the damage to at most 16 indirect splits.
+ */
+ iv->iv_attempts_max = 1;
+
+ for (indirect_split_t *is = list_head(&iv->iv_splits);
+ is != NULL; is = list_next(&iv->iv_splits, is)) {
+ for (int c = 0; c < is->is_children; c++) {
+ indirect_child_t *ic = &is->is_child[c];
+
+ if (ic == is->is_good_child)
+ continue;
+ if (ic->ic_data == NULL)
+ continue;
+
+ abd_zero(ic->ic_data, ic->ic_data->abd_size);
+ }
+
+ iv->iv_attempts_max *= 2;
+ if (iv->iv_attempts_max > (1ULL << 16)) {
+ iv->iv_attempts_max = UINT64_MAX;
+ break;
+ }
+ }
+
+out:
+ /* Empty the unique children lists so they can be reconstructed. */
+ for (indirect_split_t *is = list_head(&iv->iv_splits);
+ is != NULL; is = list_next(&iv->iv_splits, is)) {
+ indirect_child_t *ic;
+ while ((ic = list_head(&is->is_unique_child)) != NULL)
+ list_remove(&is->is_unique_child, ic);
+
+ is->is_unique_children = 0;
+ }
+
+ return (error);
+}
+
+/*
+ * This function is called when we have read all copies of the data and need
+ * to try to find a combination of copies that gives us the right checksum.
+ *
+ * If we pointed to any mirror vdevs, this effectively does the job of the
+ * mirror. The mirror vdev code can't do its own job because we don't know
+ * the checksum of each split segment individually.
+ *
+ * We have to try every unique combination of copies of split segments, until
+ * we find one that checksums correctly. Duplicate segment copies are first
+ * identified and later skipped during reconstruction. This optimization
+ * reduces the search space and ensures that of the remaining combinations
+ * at most one is correct.
+ *
+ * When the total number of combinations is small they can all be checked.
+ * For example, if we have 3 segments in the split, and each points to a
+ * 2-way mirror with unique copies, we will have the following pieces of data:
+ *
+ * | mirror child
+ * split | [0] [1]
+ * ======|=====================
+ * A | data_A_0 data_A_1
+ * B | data_B_0 data_B_1
+ * C | data_C_0 data_C_1
+ *
+ * We will try the following (mirror children)^(number of splits) (2^3=8)
+ * combinations, which is similar to bitwise-little-endian counting in
+ * binary. In general each "digit" corresponds to a split segment, and the
+ * base of each digit is is_children, which can be different for each
+ * digit.
+ *
+ * "low bit" "high bit"
+ * v v
+ * data_A_0 data_B_0 data_C_0
+ * data_A_1 data_B_0 data_C_0
+ * data_A_0 data_B_1 data_C_0
+ * data_A_1 data_B_1 data_C_0
+ * data_A_0 data_B_0 data_C_1
+ * data_A_1 data_B_0 data_C_1
+ * data_A_0 data_B_1 data_C_1
+ * data_A_1 data_B_1 data_C_1
+ *
+ * Note that the split segments may be on the same or different top-level
+ * vdevs. In either case, we may need to try lots of combinations (see
+ * zfs_reconstruct_indirect_combinations_max). This ensures that if a mirror
+ * has small silent errors on all of its children, we can still reconstruct
+ * the correct data, as long as those errors are at sufficiently-separated
+ * offsets (specifically, separated by the largest block size - default of
+ * 128KB, but up to 16MB).
+ */
+static void
+vdev_indirect_reconstruct_io_done(zio_t *zio)
+{
+ indirect_vsd_t *iv = zio->io_vsd;
+ boolean_t known_good = B_FALSE;
+ int error;
+
+ iv->iv_unique_combinations = 1;
+ iv->iv_attempts_max = UINT64_MAX;
+
+ if (zfs_reconstruct_indirect_combinations_max > 0)
+ iv->iv_attempts_max = zfs_reconstruct_indirect_combinations_max;
+
+ /*
+ * If nonzero, every 1/x blocks will be damaged, in order to validate
+ * reconstruction when there are split segments with damaged copies.
+	 * known_good will be TRUE when reconstruction is known to be possible.
+ */
+ if (zfs_reconstruct_indirect_damage_fraction != 0 &&
+ spa_get_random(zfs_reconstruct_indirect_damage_fraction) == 0)
+ known_good = (vdev_indirect_splits_damage(iv, zio) == 0);
+
+ /*
+ * Determine the unique children for a split segment and add them
+ * to the is_unique_child list. By restricting reconstruction
+ * to these children, only unique combinations will be considered.
+ * This can vastly reduce the search space when there are a large
+ * number of indirect splits.
+ */
+ for (indirect_split_t *is = list_head(&iv->iv_splits);
+ is != NULL; is = list_next(&iv->iv_splits, is)) {
+ is->is_unique_children = 0;
+
+ for (int i = 0; i < is->is_children; i++) {
+ indirect_child_t *ic_i = &is->is_child[i];
+
+ if (ic_i->ic_data == NULL ||
+ ic_i->ic_duplicate != NULL)
+ continue;
+
+ for (int j = i + 1; j < is->is_children; j++) {
+ indirect_child_t *ic_j = &is->is_child[j];
+
+ if (ic_j->ic_data == NULL ||
+ ic_j->ic_duplicate != NULL)
+ continue;
+
+ if (abd_cmp(ic_i->ic_data, ic_j->ic_data,
+ is->is_size) == 0) {
+ ic_j->ic_duplicate = ic_i;
+ }
+ }
+
+ is->is_unique_children++;
+ list_insert_tail(&is->is_unique_child, ic_i);
+ }
+
+ /* Reconstruction is impossible, no valid children */
+ EQUIV(list_is_empty(&is->is_unique_child),
+ is->is_unique_children == 0);
+ if (list_is_empty(&is->is_unique_child)) {
+ zio->io_error = EIO;
+ vdev_indirect_all_checksum_errors(zio);
+ zio_checksum_verified(zio);
+ return;
+ }
+
+ iv->iv_unique_combinations *= is->is_unique_children;
+ }
+
+ if (iv->iv_unique_combinations <= iv->iv_attempts_max)
+ error = vdev_indirect_splits_enumerate_all(iv, zio);
+ else
+ error = vdev_indirect_splits_enumerate_randomly(iv, zio);
+
+ if (error != 0) {
+ /* All attempted combinations failed. */
+ ASSERT3B(known_good, ==, B_FALSE);
+ zio->io_error = error;
+ vdev_indirect_all_checksum_errors(zio);
+ } else {
+ /*
+ * The checksum has been successfully validated. Issue
+ * repair I/Os to any copies of splits which don't match
+ * the validated version.
+ */
+ ASSERT0(vdev_indirect_splits_checksum_validate(iv, zio));
+ vdev_indirect_repair(zio);
+ zio_checksum_verified(zio);
+ }
+}
+
+static void
+vdev_indirect_io_done(zio_t *zio)
+{
+ indirect_vsd_t *iv = zio->io_vsd;
+
+ if (iv->iv_reconstruct) {
+ /*
+ * We have read all copies of the data (e.g. from mirrors),
+ * either because this was a scrub/resilver, or because the
+ * one-copy read didn't checksum correctly.
+ */
+ vdev_indirect_reconstruct_io_done(zio);
+ return;
+ }
+
+ if (!iv->iv_split_block) {
+ /*
+ * This was not a split block, so we passed the BP down,
+ * and the checksum was handled by the (one) child zio.
+ */
+ return;
+ }
+
+ zio_bad_cksum_t zbc;
+ int ret = zio_checksum_error(zio, &zbc);
+ if (ret == 0) {
+ zio_checksum_verified(zio);
+ return;
+ }
+
+ /*
+ * The checksum didn't match. Read all copies of all splits, and
+ * then we will try to reconstruct. The next time
+ * vdev_indirect_io_done() is called, iv_reconstruct will be set.
+ */
+ vdev_indirect_read_all(zio);
+
+ zio_vdev_io_redone(zio);
+}
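+
+/*
+ * Summary of the read path above: the first pass reads one copy per split;
+ * if the assembled data fails its checksum, vdev_indirect_read_all() sets
+ * iv_reconstruct and zio_vdev_io_redone() re-enters this function, which
+ * then hands off to vdev_indirect_reconstruct_io_done().
+ */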
+
+vdev_ops_t vdev_indirect_ops = {
+ vdev_indirect_open,
+ vdev_indirect_close,
+ vdev_default_asize,
+ vdev_indirect_io_start,
+ vdev_indirect_io_done,
+ NULL,
+ NULL,
+ NULL,
+ NULL,
+ vdev_indirect_remap,
+ NULL,
+ VDEV_TYPE_INDIRECT, /* name of this vdev type */
+ B_FALSE /* leaf vdev */
+};
diff --git a/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/vdev_indirect_births.c b/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/vdev_indirect_births.c
new file mode 100644
index 000000000000..fbecbe830929
--- /dev/null
+++ b/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/vdev_indirect_births.c
@@ -0,0 +1,212 @@
+/*
+ * CDDL HEADER START
+ *
+ * This file and its contents are supplied under the terms of the
+ * Common Development and Distribution License ("CDDL"), version 1.0.
+ * You may only use this file in accordance with the terms of version
+ * 1.0 of the CDDL.
+ *
+ * A full copy of the text of the CDDL should have accompanied this
+ * source. A copy of the CDDL is also available via the Internet at
+ * http://www.illumos.org/license/CDDL.
+ *
+ * CDDL HEADER END
+ */
+
+/*
+ * Copyright (c) 2015 by Delphix. All rights reserved.
+ */
+
+#include <sys/dmu_tx.h>
+#include <sys/spa.h>
+#include <sys/dmu.h>
+#include <sys/dsl_pool.h>
+#include <sys/vdev_indirect_births.h>
+
+static boolean_t
+vdev_indirect_births_verify(vdev_indirect_births_t *vib)
+{
+ ASSERT(vib != NULL);
+
+ ASSERT(vib->vib_object != 0);
+ ASSERT(vib->vib_objset != NULL);
+ ASSERT(vib->vib_phys != NULL);
+ ASSERT(vib->vib_dbuf != NULL);
+
+ EQUIV(vib->vib_phys->vib_count > 0, vib->vib_entries != NULL);
+
+ return (B_TRUE);
+}
+
+uint64_t
+vdev_indirect_births_count(vdev_indirect_births_t *vib)
+{
+ ASSERT(vdev_indirect_births_verify(vib));
+
+ return (vib->vib_phys->vib_count);
+}
+
+uint64_t
+vdev_indirect_births_object(vdev_indirect_births_t *vib)
+{
+ ASSERT(vdev_indirect_births_verify(vib));
+
+ return (vib->vib_object);
+}
+
+static uint64_t
+vdev_indirect_births_size_impl(vdev_indirect_births_t *vib)
+{
+ return (vib->vib_phys->vib_count * sizeof (*vib->vib_entries));
+}
+
+void
+vdev_indirect_births_close(vdev_indirect_births_t *vib)
+{
+ ASSERT(vdev_indirect_births_verify(vib));
+
+ if (vib->vib_phys->vib_count > 0) {
+ uint64_t births_size = vdev_indirect_births_size_impl(vib);
+
+ kmem_free(vib->vib_entries, births_size);
+ vib->vib_entries = NULL;
+ }
+
+ dmu_buf_rele(vib->vib_dbuf, vib);
+
+ vib->vib_objset = NULL;
+ vib->vib_object = 0;
+ vib->vib_dbuf = NULL;
+ vib->vib_phys = NULL;
+
+ kmem_free(vib, sizeof (*vib));
+}
+
+uint64_t
+vdev_indirect_births_alloc(objset_t *os, dmu_tx_t *tx)
+{
+ ASSERT(dmu_tx_is_syncing(tx));
+
+ return (dmu_object_alloc(os,
+ DMU_OTN_UINT64_METADATA, SPA_OLD_MAXBLOCKSIZE,
+ DMU_OTN_UINT64_METADATA, sizeof (vdev_indirect_birth_phys_t),
+ tx));
+}
+
+vdev_indirect_births_t *
+vdev_indirect_births_open(objset_t *os, uint64_t births_object)
+{
+ vdev_indirect_births_t *vib = kmem_zalloc(sizeof (*vib), KM_SLEEP);
+
+ vib->vib_objset = os;
+ vib->vib_object = births_object;
+
+ VERIFY0(dmu_bonus_hold(os, vib->vib_object, vib, &vib->vib_dbuf));
+ vib->vib_phys = vib->vib_dbuf->db_data;
+
+ if (vib->vib_phys->vib_count > 0) {
+ uint64_t births_size = vdev_indirect_births_size_impl(vib);
+ vib->vib_entries = kmem_alloc(births_size, KM_SLEEP);
+ VERIFY0(dmu_read(vib->vib_objset, vib->vib_object, 0,
+ births_size, vib->vib_entries, DMU_READ_PREFETCH));
+ }
+
+ ASSERT(vdev_indirect_births_verify(vib));
+
+ return (vib);
+}
+
+void
+vdev_indirect_births_free(objset_t *os, uint64_t object, dmu_tx_t *tx)
+{
+ VERIFY0(dmu_object_free(os, object, tx));
+}
+
+void
+vdev_indirect_births_add_entry(vdev_indirect_births_t *vib,
+ uint64_t max_offset, uint64_t txg, dmu_tx_t *tx)
+{
+ vdev_indirect_birth_entry_phys_t vibe;
+ uint64_t old_size;
+ uint64_t new_size;
+ vdev_indirect_birth_entry_phys_t *new_entries;
+
+ ASSERT(dmu_tx_is_syncing(tx));
+ ASSERT(dsl_pool_sync_context(dmu_tx_pool(tx)));
+ ASSERT(vdev_indirect_births_verify(vib));
+
+ dmu_buf_will_dirty(vib->vib_dbuf, tx);
+
+ vibe.vibe_offset = max_offset;
+ vibe.vibe_phys_birth_txg = txg;
+
+ old_size = vdev_indirect_births_size_impl(vib);
+ dmu_write(vib->vib_objset, vib->vib_object, old_size, sizeof (vibe),
+ &vibe, tx);
+ vib->vib_phys->vib_count++;
+ new_size = vdev_indirect_births_size_impl(vib);
+
+ new_entries = kmem_alloc(new_size, KM_SLEEP);
+ if (old_size > 0) {
+ bcopy(vib->vib_entries, new_entries, old_size);
+ kmem_free(vib->vib_entries, old_size);
+ }
+ new_entries[vib->vib_phys->vib_count - 1] = vibe;
+ vib->vib_entries = new_entries;
+}
+
+uint64_t
+vdev_indirect_births_last_entry_txg(vdev_indirect_births_t *vib)
+{
+ ASSERT(vdev_indirect_births_verify(vib));
+ ASSERT(vib->vib_phys->vib_count > 0);
+
+ vdev_indirect_birth_entry_phys_t *last =
+ &vib->vib_entries[vib->vib_phys->vib_count - 1];
+ return (last->vibe_phys_birth_txg);
+}
+
+/*
+ * Return the txg in which the given range was copied (i.e. its physical
+ * birth txg). The specified offset+asize must be contiguously mapped
+ * (i.e. not a split block).
+ *
+ * The entries are sorted by increasing phys_birth, and also by increasing
+ * offset. We find the specified offset by binary search. Note that we
+ * cannot use bsearch() because looking at each entry independently is
+ * insufficient to find the correct entry. Each entry implicitly relies
+ * on the previous entry: an entry indicates that the offsets from the
+ * end of the previous entry to the end of this entry were written in the
+ * specified txg.
+ */
+uint64_t
+vdev_indirect_births_physbirth(vdev_indirect_births_t *vib, uint64_t offset,
+ uint64_t asize)
+{
+ vdev_indirect_birth_entry_phys_t *base;
+ vdev_indirect_birth_entry_phys_t *last;
+
+ ASSERT(vdev_indirect_births_verify(vib));
+ ASSERT(vib->vib_phys->vib_count > 0);
+
+ base = vib->vib_entries;
+ last = base + vib->vib_phys->vib_count - 1;
+
+ ASSERT3U(offset, <, last->vibe_offset);
+
+ while (last >= base) {
+ vdev_indirect_birth_entry_phys_t *p =
+ base + ((last - base) / 2);
+ if (offset >= p->vibe_offset) {
+ base = p + 1;
+ } else if (p == vib->vib_entries ||
+ offset >= (p - 1)->vibe_offset) {
+ ASSERT3U(offset + asize, <=, p->vibe_offset);
+ return (p->vibe_phys_birth_txg);
+ } else {
+ last = p - 1;
+ }
+ }
+ ASSERT(!"offset not found");
+ return (-1);
+}
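+
+/*
+ * Worked example of the lookup above (hypothetical values): with entries
+ * { .vibe_offset = 100, .vibe_phys_birth_txg = 10 } and
+ * { .vibe_offset = 200, .vibe_phys_birth_txg = 15 }, offsets [0, 100)
+ * were copied in txg 10 and offsets [100, 200) in txg 15. Thus
+ * vdev_indirect_births_physbirth(vib, 50, 10) returns 10, and
+ * vdev_indirect_births_physbirth(vib, 150, 50) returns 15.
+ */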
diff --git a/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/vdev_indirect_mapping.c b/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/vdev_indirect_mapping.c
new file mode 100644
index 000000000000..3d0f1344dd88
--- /dev/null
+++ b/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/vdev_indirect_mapping.c
@@ -0,0 +1,593 @@
+/*
+ * CDDL HEADER START
+ *
+ * This file and its contents are supplied under the terms of the
+ * Common Development and Distribution License ("CDDL"), version 1.0.
+ * You may only use this file in accordance with the terms of version
+ * 1.0 of the CDDL.
+ *
+ * A full copy of the text of the CDDL should have accompanied this
+ * source. A copy of the CDDL is also available via the Internet at
+ * http://www.illumos.org/license/CDDL.
+ *
+ * CDDL HEADER END
+ */
+
+/*
+ * Copyright (c) 2015, 2017 by Delphix. All rights reserved.
+ */
+
+#include <sys/dmu_tx.h>
+#include <sys/dsl_pool.h>
+#include <sys/spa.h>
+#include <sys/vdev_impl.h>
+#include <sys/vdev_indirect_mapping.h>
+#include <sys/zfeature.h>
+#include <sys/dmu_objset.h>
+
+static boolean_t
+vdev_indirect_mapping_verify(vdev_indirect_mapping_t *vim)
+{
+ ASSERT(vim != NULL);
+
+ ASSERT(vim->vim_object != 0);
+ ASSERT(vim->vim_objset != NULL);
+ ASSERT(vim->vim_phys != NULL);
+ ASSERT(vim->vim_dbuf != NULL);
+
+ EQUIV(vim->vim_phys->vimp_num_entries > 0,
+ vim->vim_entries != NULL);
+ if (vim->vim_phys->vimp_num_entries > 0) {
+ vdev_indirect_mapping_entry_phys_t *last_entry =
+ &vim->vim_entries[vim->vim_phys->vimp_num_entries - 1];
+ uint64_t offset = DVA_MAPPING_GET_SRC_OFFSET(last_entry);
+ uint64_t size = DVA_GET_ASIZE(&last_entry->vimep_dst);
+
+ ASSERT3U(vim->vim_phys->vimp_max_offset, >=, offset + size);
+ }
+ if (vim->vim_havecounts) {
+ ASSERT(vim->vim_phys->vimp_counts_object != 0);
+ }
+
+ return (B_TRUE);
+}
+
+uint64_t
+vdev_indirect_mapping_num_entries(vdev_indirect_mapping_t *vim)
+{
+ ASSERT(vdev_indirect_mapping_verify(vim));
+
+ return (vim->vim_phys->vimp_num_entries);
+}
+
+uint64_t
+vdev_indirect_mapping_max_offset(vdev_indirect_mapping_t *vim)
+{
+ ASSERT(vdev_indirect_mapping_verify(vim));
+
+ return (vim->vim_phys->vimp_max_offset);
+}
+
+uint64_t
+vdev_indirect_mapping_object(vdev_indirect_mapping_t *vim)
+{
+ ASSERT(vdev_indirect_mapping_verify(vim));
+
+ return (vim->vim_object);
+}
+
+uint64_t
+vdev_indirect_mapping_bytes_mapped(vdev_indirect_mapping_t *vim)
+{
+ ASSERT(vdev_indirect_mapping_verify(vim));
+
+ return (vim->vim_phys->vimp_bytes_mapped);
+}
+
+/*
+ * The length (in bytes) of the mapping object array in memory and
+ * (logically) on disk.
+ *
+ * Note that unlike most of our accessor functions,
+ * we don't assert that the struct is consistent; therefore it can be
+ * called while there may be concurrent changes, if we don't care about
+ * the value being immediately stale (e.g. from spa_removal_get_stats()).
+ */
+uint64_t
+vdev_indirect_mapping_size(vdev_indirect_mapping_t *vim)
+{
+ return (vim->vim_phys->vimp_num_entries * sizeof (*vim->vim_entries));
+}
+
+/*
+ * Compare an offset with an indirect mapping entry; there are three
+ * possible scenarios:
+ *
+ * 1. The offset is "less than" the mapping entry; meaning the
+ * offset is less than the source offset of the mapping entry. In
+ * this case, there is no overlap between the offset and the
+ * mapping entry and -1 will be returned.
+ *
+ * 2. The offset is "greater than" the mapping entry; meaning the
+ * offset is greater than the mapping entry's source offset plus
+ * the entry's size. In this case, there is no overlap between
+ * the offset and the mapping entry and 1 will be returned.
+ *
+ * NOTE: If the offset is actually equal to the entry's offset
+ * plus size, this is considered to be "greater" than the entry,
+ * and this case applies (i.e. 1 will be returned). Thus, the
+ * entry's "range" can be considered to be inclusive at its
+ * start, but exclusive at its end: e.g. [src, src + size).
+ *
+ * 3. The last case to consider is if the offset actually falls
+ * within the mapping entry's range. If this is the case, the
+ * offset is considered to be "equal to" the mapping entry and
+ * 0 will be returned.
+ *
+ * NOTE: If the offset is equal to the entry's source offset,
+ * this case applies and 0 will be returned. If the offset is
+ * equal to the entry's source plus its size, this case does
+ * *not* apply (see "NOTE" above for scenario 2), and 1 will be
+ * returned.
+ */
+static int
+dva_mapping_overlap_compare(const void *v_key, const void *v_array_elem)
+{
+ const uint64_t *key = v_key;
+ const vdev_indirect_mapping_entry_phys_t *array_elem =
+ v_array_elem;
+ uint64_t src_offset = DVA_MAPPING_GET_SRC_OFFSET(array_elem);
+
+ if (*key < src_offset) {
+ return (-1);
+ } else if (*key < src_offset + DVA_GET_ASIZE(&array_elem->vimep_dst)) {
+ return (0);
+ } else {
+ return (1);
+ }
+}
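+
+/*
+ * Worked example (hypothetical entry): for a mapping entry with source
+ * offset 0x1000 and asize 0x2000, i.e. the half-open range
+ * [0x1000, 0x3000), the comparator returns -1 for key 0xfff, 0 for keys
+ * 0x1000 and 0x2fff, and 1 for key 0x3000.
+ */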
+
+/*
+ * Returns the mapping entry for the given offset.
+ *
+ * It's possible that the given offset will not be in the mapping table
+ * (i.e. no mapping entries contain this offset), in which case the
+ * return value depends on the "next_if_missing" parameter.
+ *
+ * If the offset is not found in the table and "next_if_missing" is
+ * B_FALSE, then NULL will always be returned. The behavior is intended
+ * to allow consumers to get the entry corresponding to the offset
+ * parameter, iff the offset overlaps with an entry in the table.
+ *
+ * If the offset is not found in the table and "next_if_missing" is
+ * B_TRUE, then the entry nearest to the given offset will be returned,
+ * such that the entry's source offset is greater than the offset
+ * passed in (i.e. the "next" mapping entry in the table is returned, if
+ * the offset is missing from the table). If there are no entries whose
+ * source offset is greater than the passed in offset, NULL is returned.
+ */
+static vdev_indirect_mapping_entry_phys_t *
+vdev_indirect_mapping_entry_for_offset_impl(vdev_indirect_mapping_t *vim,
+ uint64_t offset, boolean_t next_if_missing)
+{
+ ASSERT(vdev_indirect_mapping_verify(vim));
+ ASSERT(vim->vim_phys->vimp_num_entries > 0);
+
+ vdev_indirect_mapping_entry_phys_t *entry = NULL;
+
+ uint64_t last = vim->vim_phys->vimp_num_entries - 1;
+ uint64_t base = 0;
+
+ /*
+ * We don't define these inside of the while loop because we use
+ * their value in the case that offset isn't in the mapping.
+ */
+ uint64_t mid;
+ int result;
+
+ while (last >= base) {
+ mid = base + ((last - base) >> 1);
+
+ result = dva_mapping_overlap_compare(&offset,
+ &vim->vim_entries[mid]);
+
+ if (result == 0) {
+ entry = &vim->vim_entries[mid];
+ break;
+ } else if (result < 0) {
+ last = mid - 1;
+ } else {
+ base = mid + 1;
+ }
+ }
+
+ if (entry == NULL && next_if_missing) {
+ ASSERT3U(base, ==, last + 1);
+ ASSERT(mid == base || mid == last);
+ ASSERT3S(result, !=, 0);
+
+ /*
+ * The offset we're looking for isn't actually contained
+ * in the mapping table, thus we need to return the
+ * closest mapping entry that is greater than the
+ * offset. We reuse the result of the last comparison,
+ * comparing the mapping entry at index "mid" and the
+ * offset. The offset is guaranteed to lie between
+ * indices one less than "mid", and one greater than
+ * "mid"; we just need to determine if offset is greater
+ * than, or less than the mapping entry contained at
+ * index "mid".
+ */
+
+ uint64_t index;
+ if (result < 0)
+ index = mid;
+ else
+ index = mid + 1;
+
+ ASSERT3U(index, <=, vim->vim_phys->vimp_num_entries);
+
+ if (index == vim->vim_phys->vimp_num_entries) {
+ /*
+ * If "index" is past the end of the entries
+ * array, then not only is the offset not in the
+ * mapping table, but it's actually greater than
+ * all entries in the table. In this case, we
+ * can't return a mapping entry greater than the
+ * offset (since none exist), so we return NULL.
+ */
+
+ ASSERT3S(dva_mapping_overlap_compare(&offset,
+ &vim->vim_entries[index - 1]), >, 0);
+
+ return (NULL);
+ } else {
+ /*
+ * Just to be safe, we verify the offset falls
+ * in between the mapping entries at index and
+ * one less than index. Since we know the offset
+ * doesn't overlap an entry, and we're supposed
+ * to return the entry just greater than the
+ * offset, both of the following tests must be
+ * true.
+ */
+ ASSERT3S(dva_mapping_overlap_compare(&offset,
+ &vim->vim_entries[index]), <, 0);
+ IMPLY(index >= 1, dva_mapping_overlap_compare(&offset,
+ &vim->vim_entries[index - 1]) > 0);
+
+ return (&vim->vim_entries[index]);
+ }
+ } else {
+ return (entry);
+ }
+}
+
+vdev_indirect_mapping_entry_phys_t *
+vdev_indirect_mapping_entry_for_offset(vdev_indirect_mapping_t *vim,
+ uint64_t offset)
+{
+ return (vdev_indirect_mapping_entry_for_offset_impl(vim, offset,
+ B_FALSE));
+}
+
+vdev_indirect_mapping_entry_phys_t *
+vdev_indirect_mapping_entry_for_offset_or_next(vdev_indirect_mapping_t *vim,
+ uint64_t offset)
+{
+ return (vdev_indirect_mapping_entry_for_offset_impl(vim, offset,
+ B_TRUE));
+}
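+
+/*
+ * Usage sketch (hypothetical mapping): if the only entry covers
+ * [0x1000, 0x3000), then vdev_indirect_mapping_entry_for_offset(vim,
+ * 0x500) returns NULL, while
+ * vdev_indirect_mapping_entry_for_offset_or_next(vim, 0x500) returns that
+ * entry, since it is the first one whose source offset exceeds 0x500.
+ */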
+
+void
+vdev_indirect_mapping_close(vdev_indirect_mapping_t *vim)
+{
+ ASSERT(vdev_indirect_mapping_verify(vim));
+
+ if (vim->vim_phys->vimp_num_entries > 0) {
+ uint64_t map_size = vdev_indirect_mapping_size(vim);
+ kmem_free(vim->vim_entries, map_size);
+ vim->vim_entries = NULL;
+ }
+
+ dmu_buf_rele(vim->vim_dbuf, vim);
+
+ vim->vim_objset = NULL;
+ vim->vim_object = 0;
+ vim->vim_dbuf = NULL;
+ vim->vim_phys = NULL;
+
+ kmem_free(vim, sizeof (*vim));
+}
+
+uint64_t
+vdev_indirect_mapping_alloc(objset_t *os, dmu_tx_t *tx)
+{
+ uint64_t object;
+ ASSERT(dmu_tx_is_syncing(tx));
+ uint64_t bonus_size = VDEV_INDIRECT_MAPPING_SIZE_V0;
+
+ if (spa_feature_is_enabled(os->os_spa, SPA_FEATURE_OBSOLETE_COUNTS)) {
+ bonus_size = sizeof (vdev_indirect_mapping_phys_t);
+ }
+
+ object = dmu_object_alloc(os,
+ DMU_OTN_UINT64_METADATA, SPA_OLD_MAXBLOCKSIZE,
+ DMU_OTN_UINT64_METADATA, bonus_size,
+ tx);
+
+ if (spa_feature_is_enabled(os->os_spa, SPA_FEATURE_OBSOLETE_COUNTS)) {
+ dmu_buf_t *dbuf;
+ vdev_indirect_mapping_phys_t *vimp;
+
+ VERIFY0(dmu_bonus_hold(os, object, FTAG, &dbuf));
+ dmu_buf_will_dirty(dbuf, tx);
+ vimp = dbuf->db_data;
+ vimp->vimp_counts_object = dmu_object_alloc(os,
+ DMU_OTN_UINT32_METADATA, SPA_OLD_MAXBLOCKSIZE,
+ DMU_OT_NONE, 0, tx);
+ spa_feature_incr(os->os_spa, SPA_FEATURE_OBSOLETE_COUNTS, tx);
+ dmu_buf_rele(dbuf, FTAG);
+ }
+
+ return (object);
+}
+
+
+vdev_indirect_mapping_open(objset_t *os, uint64_t mapping_object)
+{
+ vdev_indirect_mapping_t *vim = kmem_zalloc(sizeof (*vim), KM_SLEEP);
+ dmu_object_info_t doi;
+ VERIFY0(dmu_object_info(os, mapping_object, &doi));
+
+ vim->vim_objset = os;
+ vim->vim_object = mapping_object;
+
+ VERIFY0(dmu_bonus_hold(os, vim->vim_object, vim,
+ &vim->vim_dbuf));
+ vim->vim_phys = vim->vim_dbuf->db_data;
+
+ vim->vim_havecounts =
+ (doi.doi_bonus_size > VDEV_INDIRECT_MAPPING_SIZE_V0);
+
+ if (vim->vim_phys->vimp_num_entries > 0) {
+ uint64_t map_size = vdev_indirect_mapping_size(vim);
+ vim->vim_entries = kmem_alloc(map_size, KM_SLEEP);
+ VERIFY0(dmu_read(os, vim->vim_object, 0, map_size,
+ vim->vim_entries, DMU_READ_PREFETCH));
+ }
+
+ ASSERT(vdev_indirect_mapping_verify(vim));
+
+ return (vim);
+}
+
+void
+vdev_indirect_mapping_free(objset_t *os, uint64_t object, dmu_tx_t *tx)
+{
+ vdev_indirect_mapping_t *vim = vdev_indirect_mapping_open(os, object);
+ if (vim->vim_havecounts) {
+ VERIFY0(dmu_object_free(os, vim->vim_phys->vimp_counts_object,
+ tx));
+ spa_feature_decr(os->os_spa, SPA_FEATURE_OBSOLETE_COUNTS, tx);
+ }
+ vdev_indirect_mapping_close(vim);
+
+ VERIFY0(dmu_object_free(os, object, tx));
+}
+
+/*
+ * Append the list of vdev_indirect_mapping_entry_t's to the on-disk
+ * mapping object. Also remove the entries from the list and free them.
+ * This also implicitly extends the max_offset of the mapping (to the end
+ * of the last entry).
+ */
+void
+vdev_indirect_mapping_add_entries(vdev_indirect_mapping_t *vim,
+ list_t *list, dmu_tx_t *tx)
+{
+ vdev_indirect_mapping_entry_phys_t *mapbuf;
+ uint64_t old_size;
+ uint32_t *countbuf = NULL;
+ vdev_indirect_mapping_entry_phys_t *old_entries;
+ uint64_t old_count;
+ uint64_t entries_written = 0;
+
+ ASSERT(vdev_indirect_mapping_verify(vim));
+ ASSERT(dmu_tx_is_syncing(tx));
+ ASSERT(dsl_pool_sync_context(dmu_tx_pool(tx)));
+ ASSERT(!list_is_empty(list));
+
+ old_size = vdev_indirect_mapping_size(vim);
+ old_entries = vim->vim_entries;
+ old_count = vim->vim_phys->vimp_num_entries;
+
+ dmu_buf_will_dirty(vim->vim_dbuf, tx);
+
+ mapbuf = zio_buf_alloc(SPA_OLD_MAXBLOCKSIZE);
+ if (vim->vim_havecounts) {
+ countbuf = zio_buf_alloc(SPA_OLD_MAXBLOCKSIZE);
+ ASSERT(spa_feature_is_active(vim->vim_objset->os_spa,
+ SPA_FEATURE_OBSOLETE_COUNTS));
+ }
+ while (!list_is_empty(list)) {
+ uint64_t i;
+ /*
+ * Write entries from the list to the
+ * vdev_im_object in batches of size SPA_OLD_MAXBLOCKSIZE.
+ */
+ for (i = 0; i < SPA_OLD_MAXBLOCKSIZE / sizeof (*mapbuf); i++) {
+ vdev_indirect_mapping_entry_t *entry =
+ list_remove_head(list);
+ if (entry == NULL)
+ break;
+
+ uint64_t size =
+ DVA_GET_ASIZE(&entry->vime_mapping.vimep_dst);
+ uint64_t src_offset =
+ DVA_MAPPING_GET_SRC_OFFSET(&entry->vime_mapping);
+
+ /*
+ * We shouldn't be adding an entry which is fully
+ * obsolete.
+ */
+ ASSERT3U(entry->vime_obsolete_count, <, size);
+ IMPLY(entry->vime_obsolete_count != 0,
+ vim->vim_havecounts);
+
+ mapbuf[i] = entry->vime_mapping;
+ if (vim->vim_havecounts)
+ countbuf[i] = entry->vime_obsolete_count;
+
+ vim->vim_phys->vimp_bytes_mapped += size;
+ ASSERT3U(src_offset, >=,
+ vim->vim_phys->vimp_max_offset);
+ vim->vim_phys->vimp_max_offset = src_offset + size;
+
+ entries_written++;
+
+ kmem_free(entry, sizeof (*entry));
+ }
+ dmu_write(vim->vim_objset, vim->vim_object,
+ vim->vim_phys->vimp_num_entries * sizeof (*mapbuf),
+ i * sizeof (*mapbuf),
+ mapbuf, tx);
+ if (vim->vim_havecounts) {
+ dmu_write(vim->vim_objset,
+ vim->vim_phys->vimp_counts_object,
+ vim->vim_phys->vimp_num_entries *
+ sizeof (*countbuf),
+ i * sizeof (*countbuf), countbuf, tx);
+ }
+ vim->vim_phys->vimp_num_entries += i;
+ }
+ zio_buf_free(mapbuf, SPA_OLD_MAXBLOCKSIZE);
+ if (vim->vim_havecounts)
+ zio_buf_free(countbuf, SPA_OLD_MAXBLOCKSIZE);
+
+ /*
+ * Update the entry array to reflect the new entries. First, copy
+ * over any old entries then read back the new entries we just wrote.
+ */
+ uint64_t new_size = vdev_indirect_mapping_size(vim);
+ ASSERT3U(new_size, >, old_size);
+ ASSERT3U(new_size - old_size, ==,
+ entries_written * sizeof (vdev_indirect_mapping_entry_phys_t));
+ vim->vim_entries = kmem_alloc(new_size, KM_SLEEP);
+ if (old_size > 0) {
+ bcopy(old_entries, vim->vim_entries, old_size);
+ kmem_free(old_entries, old_size);
+ }
+ VERIFY0(dmu_read(vim->vim_objset, vim->vim_object, old_size,
+ new_size - old_size, &vim->vim_entries[old_count],
+ DMU_READ_PREFETCH));
+
+ zfs_dbgmsg("txg %llu: wrote %llu entries to "
+ "indirect mapping obj %llu; max offset=0x%llx",
+ (u_longlong_t)dmu_tx_get_txg(tx),
+ (u_longlong_t)entries_written,
+ (u_longlong_t)vim->vim_object,
+ (u_longlong_t)vim->vim_phys->vimp_max_offset);
+}
+
+/*
+ * Increment the relevant counts for the specified offset and length.
+ * The counts array must be obtained from
+ * vdev_indirect_mapping_load_obsolete_counts().
+ */
+void
+vdev_indirect_mapping_increment_obsolete_count(vdev_indirect_mapping_t *vim,
+ uint64_t offset, uint64_t length, uint32_t *counts)
+{
+ vdev_indirect_mapping_entry_phys_t *mapping;
+ uint64_t index;
+
+ mapping = vdev_indirect_mapping_entry_for_offset(vim, offset);
+
+ ASSERT(length > 0);
+ ASSERT3P(mapping, !=, NULL);
+
+ index = mapping - vim->vim_entries;
+
+ while (length > 0) {
+ ASSERT3U(index, <, vdev_indirect_mapping_num_entries(vim));
+
+ uint64_t size = DVA_GET_ASIZE(&mapping->vimep_dst);
+ uint64_t inner_offset = offset -
+ DVA_MAPPING_GET_SRC_OFFSET(mapping);
+ VERIFY3U(inner_offset, <, size);
+ uint64_t inner_size = MIN(length, size - inner_offset);
+
+ VERIFY3U(counts[index] + inner_size, <=, size);
+ counts[index] += inner_size;
+
+ offset += inner_size;
+ length -= inner_size;
+ mapping++;
+ index++;
+ }
+}
+
+typedef struct load_obsolete_space_map_arg {
+ vdev_indirect_mapping_t *losma_vim;
+ uint32_t *losma_counts;
+} load_obsolete_space_map_arg_t;
+
+static int
+load_obsolete_sm_callback(space_map_entry_t *sme, void *arg)
+{
+ load_obsolete_space_map_arg_t *losma = arg;
+ ASSERT3S(sme->sme_type, ==, SM_ALLOC);
+
+ vdev_indirect_mapping_increment_obsolete_count(losma->losma_vim,
+ sme->sme_offset, sme->sme_run, losma->losma_counts);
+
+ return (0);
+}
+
+/*
+ * Modify the counts (increment them) based on the spacemap.
+ */
+void
+vdev_indirect_mapping_load_obsolete_spacemap(vdev_indirect_mapping_t *vim,
+ uint32_t *counts, space_map_t *obsolete_space_sm)
+{
+ load_obsolete_space_map_arg_t losma;
+ losma.losma_counts = counts;
+ losma.losma_vim = vim;
+ VERIFY0(space_map_iterate(obsolete_space_sm,
+ space_map_length(obsolete_space_sm),
+ load_obsolete_sm_callback, &losma));
+}
+
+/*
+ * Read the obsolete counts from disk, returning them in an array.
+ */
+uint32_t *
+vdev_indirect_mapping_load_obsolete_counts(vdev_indirect_mapping_t *vim)
+{
+ ASSERT(vdev_indirect_mapping_verify(vim));
+
+ uint64_t counts_size =
+ vim->vim_phys->vimp_num_entries * sizeof (uint32_t);
+ uint32_t *counts = kmem_alloc(counts_size, KM_SLEEP);
+ if (vim->vim_havecounts) {
+ VERIFY0(dmu_read(vim->vim_objset,
+ vim->vim_phys->vimp_counts_object,
+ 0, counts_size,
+ counts, DMU_READ_PREFETCH));
+ } else {
+ bzero(counts, counts_size);
+ }
+ return (counts);
+}
+
+void
+vdev_indirect_mapping_free_obsolete_counts(vdev_indirect_mapping_t *vim,
+ uint32_t *counts)
+{
+ ASSERT(vdev_indirect_mapping_verify(vim));
+
+ kmem_free(counts, vim->vim_phys->vimp_num_entries * sizeof (uint32_t));
+}
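+
+/*
+ * Editor's sketch, not part of the original change: the expected
+ * lifecycle of the obsolete-count array built by the functions above.
+ * Callers load the counts, optionally fold in a pending obsolete
+ * spacemap, and must free the array through the matching helper:
+ *
+ *	uint32_t *counts = vdev_indirect_mapping_load_obsolete_counts(vim);
+ *	vdev_indirect_mapping_load_obsolete_spacemap(vim, counts,
+ *	    obsolete_space_sm);
+ *	...
+ *	vdev_indirect_mapping_free_obsolete_counts(vim, counts);
+ */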
diff --git a/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/vdev_initialize.c b/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/vdev_initialize.c
new file mode 100644
index 000000000000..34d959008bd5
--- /dev/null
+++ b/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/vdev_initialize.c
@@ -0,0 +1,782 @@
+/*
+ * CDDL HEADER START
+ *
+ * The contents of this file are subject to the terms of the
+ * Common Development and Distribution License (the "License").
+ * You may not use this file except in compliance with the License.
+ *
+ * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
+ * or http://www.opensolaris.org/os/licensing.
+ * See the License for the specific language governing permissions
+ * and limitations under the License.
+ *
+ * When distributing Covered Code, include this CDDL HEADER in each
+ * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
+ * If applicable, add the following below this CDDL HEADER, with the
+ * fields enclosed by brackets "[]" replaced with your own identifying
+ * information: Portions Copyright [yyyy] [name of copyright owner]
+ *
+ * CDDL HEADER END
+ */
+
+/*
+ * Copyright (c) 2016 by Delphix. All rights reserved.
+ */
+
+#include <sys/spa.h>
+#include <sys/spa_impl.h>
+#include <sys/txg.h>
+#include <sys/vdev_impl.h>
+#include <sys/refcount.h>
+#include <sys/metaslab_impl.h>
+#include <sys/dsl_synctask.h>
+#include <sys/zap.h>
+#include <sys/dmu_tx.h>
+
+/*
+ * Maximum number of metaslabs per group that can be initialized
+ * simultaneously.
+ */
+int max_initialize_ms = 3;
+
+/*
+ * Value that is written to disk during initialization.
+ */
+uint64_t zfs_initialize_value = 0xdeadbeefdeadbeefULL;
+
+/* maximum number of I/Os outstanding per leaf vdev */
+int zfs_initialize_limit = 1;
+
+/* size of initializing writes; default 1MiB, see zfs_remove_max_segment */
+uint64_t zfs_initialize_chunk_size = 1024 * 1024;
+
+static boolean_t
+vdev_initialize_should_stop(vdev_t *vd)
+{
+ return (vd->vdev_initialize_exit_wanted || !vdev_writeable(vd) ||
+ vd->vdev_detached || vd->vdev_top->vdev_removing);
+}
+
+static void
+vdev_initialize_zap_update_sync(void *arg, dmu_tx_t *tx)
+{
+ /*
+ * We pass in the guid instead of the vdev_t since the vdev may
+ * have been freed prior to the sync task being processed. This
+ * happens when a vdev is detached as we call spa_config_vdev_exit(),
+	 * stop the initializing thread, schedule the sync task, and free
+ * the vdev. Later when the scheduled sync task is invoked, it would
+ * find that the vdev has been freed.
+ */
+ uint64_t guid = *(uint64_t *)arg;
+ uint64_t txg = dmu_tx_get_txg(tx);
+ kmem_free(arg, sizeof (uint64_t));
+
+ vdev_t *vd = spa_lookup_by_guid(tx->tx_pool->dp_spa, guid, B_FALSE);
+ if (vd == NULL || vd->vdev_top->vdev_removing || !vdev_is_concrete(vd))
+ return;
+
+ uint64_t last_offset = vd->vdev_initialize_offset[txg & TXG_MASK];
+ vd->vdev_initialize_offset[txg & TXG_MASK] = 0;
+
+ VERIFY(vd->vdev_leaf_zap != 0);
+
+ objset_t *mos = vd->vdev_spa->spa_meta_objset;
+
+ if (last_offset > 0) {
+ vd->vdev_initialize_last_offset = last_offset;
+ VERIFY0(zap_update(mos, vd->vdev_leaf_zap,
+ VDEV_LEAF_ZAP_INITIALIZE_LAST_OFFSET,
+ sizeof (last_offset), 1, &last_offset, tx));
+ }
+ if (vd->vdev_initialize_action_time > 0) {
+ uint64_t val = (uint64_t)vd->vdev_initialize_action_time;
+ VERIFY0(zap_update(mos, vd->vdev_leaf_zap,
+ VDEV_LEAF_ZAP_INITIALIZE_ACTION_TIME, sizeof (val),
+ 1, &val, tx));
+ }
+
+ uint64_t initialize_state = vd->vdev_initialize_state;
+ VERIFY0(zap_update(mos, vd->vdev_leaf_zap,
+ VDEV_LEAF_ZAP_INITIALIZE_STATE, sizeof (initialize_state), 1,
+ &initialize_state, tx));
+}
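+
+/*
+ * Editor's sketch, not part of the original change: callers hand this
+ * sync task a heap-allocated copy of the guid, which the task frees,
+ * exactly as vdev_initialize_change_state() does below:
+ *
+ *	uint64_t *guid = kmem_zalloc(sizeof (uint64_t), KM_SLEEP);
+ *	*guid = vd->vdev_guid;
+ *	dsl_sync_task_nowait(spa_get_dsl(spa),
+ *	    vdev_initialize_zap_update_sync, guid, 2,
+ *	    ZFS_SPACE_CHECK_RESERVED, tx);
+ */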
+
+static void
+vdev_initialize_change_state(vdev_t *vd, vdev_initializing_state_t new_state)
+{
+ ASSERT(MUTEX_HELD(&vd->vdev_initialize_lock));
+ spa_t *spa = vd->vdev_spa;
+
+ if (new_state == vd->vdev_initialize_state)
+ return;
+
+ /*
+	 * Copy the vd's guid; it will be freed by the sync task.
+ */
+ uint64_t *guid = kmem_zalloc(sizeof (uint64_t), KM_SLEEP);
+ *guid = vd->vdev_guid;
+
+ /*
+	 * If we're suspending, then preserve the original start time.
+ */
+ if (vd->vdev_initialize_state != VDEV_INITIALIZE_SUSPENDED) {
+ vd->vdev_initialize_action_time = gethrestime_sec();
+ }
+ vd->vdev_initialize_state = new_state;
+
+ dmu_tx_t *tx = dmu_tx_create_dd(spa_get_dsl(spa)->dp_mos_dir);
+ VERIFY0(dmu_tx_assign(tx, TXG_WAIT));
+ dsl_sync_task_nowait(spa_get_dsl(spa), vdev_initialize_zap_update_sync,
+ guid, 2, ZFS_SPACE_CHECK_RESERVED, tx);
+
+ switch (new_state) {
+ case VDEV_INITIALIZE_ACTIVE:
+ spa_history_log_internal(spa, "initialize", tx,
+ "vdev=%s activated", vd->vdev_path);
+ break;
+ case VDEV_INITIALIZE_SUSPENDED:
+ spa_history_log_internal(spa, "initialize", tx,
+ "vdev=%s suspended", vd->vdev_path);
+ break;
+ case VDEV_INITIALIZE_CANCELED:
+ spa_history_log_internal(spa, "initialize", tx,
+ "vdev=%s canceled", vd->vdev_path);
+ break;
+ case VDEV_INITIALIZE_COMPLETE:
+ spa_history_log_internal(spa, "initialize", tx,
+ "vdev=%s complete", vd->vdev_path);
+ break;
+ default:
+ panic("invalid state %llu", (unsigned long long)new_state);
+ }
+
+ dmu_tx_commit(tx);
+}
+
+static void
+vdev_initialize_cb(zio_t *zio)
+{
+ vdev_t *vd = zio->io_vd;
+ mutex_enter(&vd->vdev_initialize_io_lock);
+ if (zio->io_error == ENXIO && !vdev_writeable(vd)) {
+ /*
+ * The I/O failed because the vdev was unavailable; roll the
+ * last offset back. (This works because spa_sync waits on
+ * spa_txg_zio before it runs sync tasks.)
+ */
+ uint64_t *off =
+ &vd->vdev_initialize_offset[zio->io_txg & TXG_MASK];
+ *off = MIN(*off, zio->io_offset);
+ } else {
+ /*
+ * Since initializing is best-effort, we ignore I/O errors and
+ * rely on vdev_probe to determine if the errors are more
+ * critical.
+ */
+ if (zio->io_error != 0)
+ vd->vdev_stat.vs_initialize_errors++;
+
+ vd->vdev_initialize_bytes_done += zio->io_orig_size;
+ }
+ ASSERT3U(vd->vdev_initialize_inflight, >, 0);
+ vd->vdev_initialize_inflight--;
+ cv_broadcast(&vd->vdev_initialize_io_cv);
+ mutex_exit(&vd->vdev_initialize_io_lock);
+
+ spa_config_exit(vd->vdev_spa, SCL_STATE_ALL, vd);
+}
+
+/* Takes care of physical writing and limiting # of concurrent ZIOs. */
+static int
+vdev_initialize_write(vdev_t *vd, uint64_t start, uint64_t size, abd_t *data)
+{
+ spa_t *spa = vd->vdev_spa;
+
+ /* Limit inflight initializing I/Os */
+ mutex_enter(&vd->vdev_initialize_io_lock);
+ while (vd->vdev_initialize_inflight >= zfs_initialize_limit) {
+ cv_wait(&vd->vdev_initialize_io_cv,
+ &vd->vdev_initialize_io_lock);
+ }
+ vd->vdev_initialize_inflight++;
+ mutex_exit(&vd->vdev_initialize_io_lock);
+
+ dmu_tx_t *tx = dmu_tx_create_dd(spa_get_dsl(spa)->dp_mos_dir);
+ VERIFY0(dmu_tx_assign(tx, TXG_WAIT));
+ uint64_t txg = dmu_tx_get_txg(tx);
+
+ spa_config_enter(spa, SCL_STATE_ALL, vd, RW_READER);
+ mutex_enter(&vd->vdev_initialize_lock);
+
+ if (vd->vdev_initialize_offset[txg & TXG_MASK] == 0) {
+ uint64_t *guid = kmem_zalloc(sizeof (uint64_t), KM_SLEEP);
+ *guid = vd->vdev_guid;
+
+ /* This is the first write of this txg. */
+ dsl_sync_task_nowait(spa_get_dsl(spa),
+ vdev_initialize_zap_update_sync, guid, 2,
+ ZFS_SPACE_CHECK_RESERVED, tx);
+ }
+
+ /*
+ * We know the vdev struct will still be around since all
+ * consumers of vdev_free must stop the initialization first.
+ */
+ if (vdev_initialize_should_stop(vd)) {
+ mutex_enter(&vd->vdev_initialize_io_lock);
+ ASSERT3U(vd->vdev_initialize_inflight, >, 0);
+ vd->vdev_initialize_inflight--;
+ mutex_exit(&vd->vdev_initialize_io_lock);
+ spa_config_exit(vd->vdev_spa, SCL_STATE_ALL, vd);
+ mutex_exit(&vd->vdev_initialize_lock);
+ dmu_tx_commit(tx);
+ return (SET_ERROR(EINTR));
+ }
+ mutex_exit(&vd->vdev_initialize_lock);
+
+ vd->vdev_initialize_offset[txg & TXG_MASK] = start + size;
+ zio_nowait(zio_write_phys(spa->spa_txg_zio[txg & TXG_MASK], vd, start,
+ size, data, ZIO_CHECKSUM_OFF, vdev_initialize_cb, NULL,
+ ZIO_PRIORITY_INITIALIZING, ZIO_FLAG_CANFAIL, B_FALSE));
+ /* vdev_initialize_cb releases SCL_STATE_ALL */
+
+ dmu_tx_commit(tx);
+
+ return (0);
+}
+
+/*
+ * Translate a logical range to the physical range for the specified vdev_t.
+ * This function is initially called with a leaf vdev and will walk each
+ * parent vdev until it reaches a top-level vdev. Once the top-level vdev is
+ * reached, the physical range is initialized and the recursive function
+ * begins to unwind. As it unwinds, it calls each parent vdev's specific
+ * translation function to do the real conversion.
+ */
+void
+vdev_xlate(vdev_t *vd, const range_seg_t *logical_rs, range_seg_t *physical_rs)
+{
+ /*
+ * Walk up the vdev tree
+ */
+ if (vd != vd->vdev_top) {
+ vdev_xlate(vd->vdev_parent, logical_rs, physical_rs);
+ } else {
+ /*
+ * We've reached the top-level vdev, initialize the
+ * physical range to the logical range and start to
+ * unwind.
+ */
+ physical_rs->rs_start = logical_rs->rs_start;
+ physical_rs->rs_end = logical_rs->rs_end;
+ return;
+ }
+
+ vdev_t *pvd = vd->vdev_parent;
+ ASSERT3P(pvd, !=, NULL);
+ ASSERT3P(pvd->vdev_ops->vdev_op_xlate, !=, NULL);
+
+ /*
+ * As this recursive function unwinds, translate the logical
+ * range into its physical components by calling the
+ * vdev specific translate function.
+ */
+ range_seg_t intermediate = { 0 };
+ pvd->vdev_ops->vdev_op_xlate(vd, physical_rs, &intermediate);
+
+ physical_rs->rs_start = intermediate.rs_start;
+ physical_rs->rs_end = intermediate.rs_end;
+}
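+
+/*
+ * Editor's sketch, not part of the original change: a minimal use of
+ * vdev_xlate() from a leaf vdev `vd`. For a plain top-level disk the
+ * physical range equals the logical range; under mirror or raidz
+ * parents the op-specific translation applies as the recursion unwinds:
+ *
+ *	range_seg_t logical = { .rs_start = start, .rs_end = start + size };
+ *	range_seg_t physical = { 0 };
+ *	vdev_xlate(vd, &logical, &physical);
+ *	ASSERT3U(physical.rs_end, >=, physical.rs_start);
+ */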
+
+/*
+ * Callback to fill each ABD chunk with zfs_initialize_value. len must be
+ * divisible by sizeof (uint64_t), and buf must be 8-byte aligned. The ABD
+ * allocation will guarantee these for us.
+ */
+/* ARGSUSED */
+static int
+vdev_initialize_block_fill(void *buf, size_t len, void *unused)
+{
+ ASSERT0(len % sizeof (uint64_t));
+ for (uint64_t i = 0; i < len; i += sizeof (uint64_t)) {
+ *(uint64_t *)((char *)(buf) + i) = zfs_initialize_value;
+ }
+ return (0);
+}
+
+static abd_t *
+vdev_initialize_block_alloc(void)
+{
+ /* Allocate ABD for filler data */
+ abd_t *data = abd_alloc_for_io(zfs_initialize_chunk_size, B_FALSE);
+
+ ASSERT0(zfs_initialize_chunk_size % sizeof (uint64_t));
+ (void) abd_iterate_func(data, 0, zfs_initialize_chunk_size,
+ vdev_initialize_block_fill, NULL);
+
+ return (data);
+}
+
+static void
+vdev_initialize_block_free(abd_t *data)
+{
+ abd_free(data);
+}
+
+static int
+vdev_initialize_ranges(vdev_t *vd, abd_t *data)
+{
+ avl_tree_t *rt = &vd->vdev_initialize_tree->rt_root;
+
+ for (range_seg_t *rs = avl_first(rt); rs != NULL;
+ rs = AVL_NEXT(rt, rs)) {
+ uint64_t size = rs->rs_end - rs->rs_start;
+
+ /* Split range into legally-sized physical chunks */
+ uint64_t writes_required =
+ ((size - 1) / zfs_initialize_chunk_size) + 1;
+
+ for (uint64_t w = 0; w < writes_required; w++) {
+ int error;
+
+ error = vdev_initialize_write(vd,
+ VDEV_LABEL_START_SIZE + rs->rs_start +
+ (w * zfs_initialize_chunk_size),
+ MIN(size - (w * zfs_initialize_chunk_size),
+ zfs_initialize_chunk_size), data);
+ if (error != 0)
+ return (error);
+ }
+ }
+ return (0);
+}
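+
+/*
+ * Editor's note, not part of the original change: the chunk count above
+ * is a ceiling division. For example, with the default 1 MiB
+ * zfs_initialize_chunk_size, a 2.5 MiB segment needs
+ * ((2621440 - 1) / 1048576) + 1 = 3 writes: two full chunks plus a
+ * 512 KiB remainder, which the MIN() in the call above trims.
+ */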
+
+static void
+vdev_initialize_mg_wait(metaslab_group_t *mg)
+{
+ ASSERT(MUTEX_HELD(&mg->mg_ms_initialize_lock));
+ while (mg->mg_initialize_updating) {
+ cv_wait(&mg->mg_ms_initialize_cv, &mg->mg_ms_initialize_lock);
+ }
+}
+
+static void
+vdev_initialize_mg_mark(metaslab_group_t *mg)
+{
+ ASSERT(MUTEX_HELD(&mg->mg_ms_initialize_lock));
+ ASSERT(mg->mg_initialize_updating);
+
+ while (mg->mg_ms_initializing >= max_initialize_ms) {
+ cv_wait(&mg->mg_ms_initialize_cv, &mg->mg_ms_initialize_lock);
+ }
+ mg->mg_ms_initializing++;
+ ASSERT3U(mg->mg_ms_initializing, <=, max_initialize_ms);
+}
+
+/*
+ * Mark the metaslab as being initialized to prevent any allocations
+ * on this metaslab. We must also track how many metaslabs are currently
+ * being initialized within a metaslab group and limit them to prevent
+ * allocation failures from occurring because all metaslabs are being
+ * initialized.
+ */
+static void
+vdev_initialize_ms_mark(metaslab_t *msp)
+{
+ ASSERT(!MUTEX_HELD(&msp->ms_lock));
+ metaslab_group_t *mg = msp->ms_group;
+
+ mutex_enter(&mg->mg_ms_initialize_lock);
+
+ /*
+ * To keep an accurate count of how many threads are initializing
+ * a specific metaslab group, we only allow one thread to mark
+ * the metaslab group at a time. This ensures that the value of
+ * ms_initializing will be accurate when we decide to mark a metaslab
+	 * group as being initialized. To do this we force all other threads
+	 * to wait until the metaslab group's mg_initialize_updating flag is no
+	 * longer set.
+ */
+ vdev_initialize_mg_wait(mg);
+ mg->mg_initialize_updating = B_TRUE;
+ if (msp->ms_initializing == 0) {
+ vdev_initialize_mg_mark(mg);
+ }
+ mutex_enter(&msp->ms_lock);
+ msp->ms_initializing++;
+ mutex_exit(&msp->ms_lock);
+
+ mg->mg_initialize_updating = B_FALSE;
+ cv_broadcast(&mg->mg_ms_initialize_cv);
+ mutex_exit(&mg->mg_ms_initialize_lock);
+}
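+
+/*
+ * Editor's sketch, not part of the original change: mark/unmark are
+ * expected to bracket the work done on a metaslab, as in
+ * vdev_initialize_thread() below:
+ *
+ *	vdev_initialize_ms_mark(msp);
+ *	...collect and write out the metaslab's free ranges...
+ *	vdev_initialize_ms_unmark(msp);
+ */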
+
+static void
+vdev_initialize_ms_unmark(metaslab_t *msp)
+{
+ ASSERT(!MUTEX_HELD(&msp->ms_lock));
+ metaslab_group_t *mg = msp->ms_group;
+ mutex_enter(&mg->mg_ms_initialize_lock);
+ mutex_enter(&msp->ms_lock);
+ if (--msp->ms_initializing == 0) {
+ mg->mg_ms_initializing--;
+ cv_broadcast(&mg->mg_ms_initialize_cv);
+ }
+ mutex_exit(&msp->ms_lock);
+ mutex_exit(&mg->mg_ms_initialize_lock);
+}
+
+static void
+vdev_initialize_calculate_progress(vdev_t *vd)
+{
+ ASSERT(spa_config_held(vd->vdev_spa, SCL_CONFIG, RW_READER) ||
+ spa_config_held(vd->vdev_spa, SCL_CONFIG, RW_WRITER));
+ ASSERT(vd->vdev_leaf_zap != 0);
+
+ vd->vdev_initialize_bytes_est = 0;
+ vd->vdev_initialize_bytes_done = 0;
+
+ for (uint64_t i = 0; i < vd->vdev_top->vdev_ms_count; i++) {
+ metaslab_t *msp = vd->vdev_top->vdev_ms[i];
+ mutex_enter(&msp->ms_lock);
+
+ uint64_t ms_free = msp->ms_size -
+ metaslab_allocated_space(msp);
+
+ if (vd->vdev_top->vdev_ops == &vdev_raidz_ops)
+ ms_free /= vd->vdev_top->vdev_children;
+
+ /*
+ * Convert the metaslab range to a physical range
+ * on our vdev. We use this to determine if we are
+ * in the middle of this metaslab range.
+ */
+ range_seg_t logical_rs, physical_rs;
+ logical_rs.rs_start = msp->ms_start;
+ logical_rs.rs_end = msp->ms_start + msp->ms_size;
+ vdev_xlate(vd, &logical_rs, &physical_rs);
+
+ if (vd->vdev_initialize_last_offset <= physical_rs.rs_start) {
+ vd->vdev_initialize_bytes_est += ms_free;
+ mutex_exit(&msp->ms_lock);
+ continue;
+ } else if (vd->vdev_initialize_last_offset >
+ physical_rs.rs_end) {
+ vd->vdev_initialize_bytes_done += ms_free;
+ vd->vdev_initialize_bytes_est += ms_free;
+ mutex_exit(&msp->ms_lock);
+ continue;
+ }
+
+ /*
+ * If we get here, we're in the middle of initializing this
+ * metaslab. Load it and walk the free tree for more accurate
+ * progress estimation.
+ */
+ VERIFY0(metaslab_load(msp));
+
+ for (range_seg_t *rs = avl_first(&msp->ms_allocatable->rt_root);
+ rs; rs = AVL_NEXT(&msp->ms_allocatable->rt_root, rs)) {
+ logical_rs.rs_start = rs->rs_start;
+ logical_rs.rs_end = rs->rs_end;
+ vdev_xlate(vd, &logical_rs, &physical_rs);
+
+ uint64_t size = physical_rs.rs_end -
+ physical_rs.rs_start;
+ vd->vdev_initialize_bytes_est += size;
+ if (vd->vdev_initialize_last_offset >
+ physical_rs.rs_end) {
+ vd->vdev_initialize_bytes_done += size;
+ } else if (vd->vdev_initialize_last_offset >
+ physical_rs.rs_start &&
+ vd->vdev_initialize_last_offset <
+ physical_rs.rs_end) {
+ vd->vdev_initialize_bytes_done +=
+ vd->vdev_initialize_last_offset -
+ physical_rs.rs_start;
+ }
+ }
+ mutex_exit(&msp->ms_lock);
+ }
+}
+
+static void
+vdev_initialize_load(vdev_t *vd)
+{
+ ASSERT(spa_config_held(vd->vdev_spa, SCL_CONFIG, RW_READER) ||
+ spa_config_held(vd->vdev_spa, SCL_CONFIG, RW_WRITER));
+ ASSERT(vd->vdev_leaf_zap != 0);
+
+ if (vd->vdev_initialize_state == VDEV_INITIALIZE_ACTIVE ||
+ vd->vdev_initialize_state == VDEV_INITIALIZE_SUSPENDED) {
+ int err = zap_lookup(vd->vdev_spa->spa_meta_objset,
+ vd->vdev_leaf_zap, VDEV_LEAF_ZAP_INITIALIZE_LAST_OFFSET,
+ sizeof (vd->vdev_initialize_last_offset), 1,
+ &vd->vdev_initialize_last_offset);
+ ASSERT(err == 0 || err == ENOENT);
+ }
+
+ vdev_initialize_calculate_progress(vd);
+}
+
+/*
+ * Convert the logical range into a physical range and add it to our
+ * AVL tree.
+ */
+void
+vdev_initialize_range_add(void *arg, uint64_t start, uint64_t size)
+{
+ vdev_t *vd = arg;
+ range_seg_t logical_rs, physical_rs;
+ logical_rs.rs_start = start;
+ logical_rs.rs_end = start + size;
+
+ ASSERT(vd->vdev_ops->vdev_op_leaf);
+ vdev_xlate(vd, &logical_rs, &physical_rs);
+
+ IMPLY(vd->vdev_top == vd,
+ logical_rs.rs_start == physical_rs.rs_start);
+ IMPLY(vd->vdev_top == vd,
+ logical_rs.rs_end == physical_rs.rs_end);
+
+ /* Only add segments that we have not visited yet */
+ if (physical_rs.rs_end <= vd->vdev_initialize_last_offset)
+ return;
+
+ /* Pick up where we left off mid-range. */
+ if (vd->vdev_initialize_last_offset > physical_rs.rs_start) {
+ zfs_dbgmsg("range write: vd %s changed (%llu, %llu) to "
+ "(%llu, %llu)", vd->vdev_path,
+ (u_longlong_t)physical_rs.rs_start,
+ (u_longlong_t)physical_rs.rs_end,
+ (u_longlong_t)vd->vdev_initialize_last_offset,
+ (u_longlong_t)physical_rs.rs_end);
+ ASSERT3U(physical_rs.rs_end, >,
+ vd->vdev_initialize_last_offset);
+ physical_rs.rs_start = vd->vdev_initialize_last_offset;
+ }
+ ASSERT3U(physical_rs.rs_end, >=, physical_rs.rs_start);
+
+ /*
+ * With raidz, it's possible that the logical range does not live on
+	 * this leaf vdev. We only add the physical range to this vdev's tree
+	 * if it has a length greater than 0.
+ * has a length greater than 0.
+ */
+ if (physical_rs.rs_end > physical_rs.rs_start) {
+ range_tree_add(vd->vdev_initialize_tree, physical_rs.rs_start,
+ physical_rs.rs_end - physical_rs.rs_start);
+ } else {
+ ASSERT3U(physical_rs.rs_end, ==, physical_rs.rs_start);
+ }
+}
+
+static void
+vdev_initialize_thread(void *arg)
+{
+ vdev_t *vd = arg;
+ spa_t *spa = vd->vdev_spa;
+ int error = 0;
+ uint64_t ms_count = 0;
+
+ ASSERT(vdev_is_concrete(vd));
+ spa_config_enter(spa, SCL_CONFIG, FTAG, RW_READER);
+
+ vd->vdev_initialize_last_offset = 0;
+ vdev_initialize_load(vd);
+
+ abd_t *deadbeef = vdev_initialize_block_alloc();
+
+ vd->vdev_initialize_tree = range_tree_create(NULL, NULL);
+
+ for (uint64_t i = 0; !vd->vdev_detached &&
+ i < vd->vdev_top->vdev_ms_count; i++) {
+ metaslab_t *msp = vd->vdev_top->vdev_ms[i];
+
+ /*
+ * If we've expanded the top-level vdev or it's our
+ * first pass, calculate our progress.
+ */
+ if (vd->vdev_top->vdev_ms_count != ms_count) {
+ vdev_initialize_calculate_progress(vd);
+ ms_count = vd->vdev_top->vdev_ms_count;
+ }
+
+ vdev_initialize_ms_mark(msp);
+ mutex_enter(&msp->ms_lock);
+ VERIFY0(metaslab_load(msp));
+
+ range_tree_walk(msp->ms_allocatable, vdev_initialize_range_add,
+ vd);
+ mutex_exit(&msp->ms_lock);
+
+ spa_config_exit(spa, SCL_CONFIG, FTAG);
+ error = vdev_initialize_ranges(vd, deadbeef);
+ vdev_initialize_ms_unmark(msp);
+ spa_config_enter(spa, SCL_CONFIG, FTAG, RW_READER);
+
+ range_tree_vacate(vd->vdev_initialize_tree, NULL, NULL);
+ if (error != 0)
+ break;
+ }
+
+ spa_config_exit(spa, SCL_CONFIG, FTAG);
+ mutex_enter(&vd->vdev_initialize_io_lock);
+ while (vd->vdev_initialize_inflight > 0) {
+ cv_wait(&vd->vdev_initialize_io_cv,
+ &vd->vdev_initialize_io_lock);
+ }
+ mutex_exit(&vd->vdev_initialize_io_lock);
+
+ range_tree_destroy(vd->vdev_initialize_tree);
+ vdev_initialize_block_free(deadbeef);
+ vd->vdev_initialize_tree = NULL;
+
+ mutex_enter(&vd->vdev_initialize_lock);
+ if (!vd->vdev_initialize_exit_wanted && vdev_writeable(vd)) {
+ vdev_initialize_change_state(vd, VDEV_INITIALIZE_COMPLETE);
+ }
+ ASSERT(vd->vdev_initialize_thread != NULL ||
+ vd->vdev_initialize_inflight == 0);
+
+ /*
+ * Drop the vdev_initialize_lock while we sync out the
+ * txg since it's possible that a device might be trying to
+ * come online and must check to see if it needs to restart an
+ * initialization. That thread will be holding the spa_config_lock
+ * which would prevent the txg_wait_synced from completing.
+ */
+ mutex_exit(&vd->vdev_initialize_lock);
+ txg_wait_synced(spa_get_dsl(spa), 0);
+ mutex_enter(&vd->vdev_initialize_lock);
+
+ vd->vdev_initialize_thread = NULL;
+ cv_broadcast(&vd->vdev_initialize_cv);
+ mutex_exit(&vd->vdev_initialize_lock);
+ thread_exit();
+}
+
+/*
+ * Initiates initialization of a device. Caller must hold vdev_initialize_lock.
+ * Device must be a leaf and not already be initializing.
+ */
+void
+vdev_initialize(vdev_t *vd)
+{
+ ASSERT(MUTEX_HELD(&vd->vdev_initialize_lock));
+ ASSERT(vd->vdev_ops->vdev_op_leaf);
+ ASSERT(vdev_is_concrete(vd));
+ ASSERT3P(vd->vdev_initialize_thread, ==, NULL);
+ ASSERT(!vd->vdev_detached);
+ ASSERT(!vd->vdev_initialize_exit_wanted);
+ ASSERT(!vd->vdev_top->vdev_removing);
+
+ vdev_initialize_change_state(vd, VDEV_INITIALIZE_ACTIVE);
+ vd->vdev_initialize_thread = thread_create(NULL, 0,
+ vdev_initialize_thread, vd, 0, &p0, TS_RUN, maxclsyspri);
+}
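+
+/*
+ * Editor's sketch, not part of the original change: per the locking
+ * contract above, a caller starts initialization like this:
+ *
+ *	mutex_enter(&vd->vdev_initialize_lock);
+ *	vdev_initialize(vd);
+ *	mutex_exit(&vd->vdev_initialize_lock);
+ */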
+
+/*
+ * Stop initializing a device, with the resulting initializing state being
+ * tgt_state. Blocks until the initializing thread has exited.
+ * Caller must hold vdev_initialize_lock and must not be writing to the spa
+ * config, as the initializing thread may try to enter the config as a reader
+ * before exiting.
+ */
+void
+vdev_initialize_stop(vdev_t *vd, vdev_initializing_state_t tgt_state)
+{
+ spa_t *spa = vd->vdev_spa;
+ ASSERT(!spa_config_held(spa, SCL_CONFIG | SCL_STATE, RW_WRITER));
+
+ ASSERT(MUTEX_HELD(&vd->vdev_initialize_lock));
+ ASSERT(vd->vdev_ops->vdev_op_leaf);
+ ASSERT(vdev_is_concrete(vd));
+
+ /*
+ * Allow cancel requests to proceed even if the initialize thread
+ * has stopped.
+ */
+ if (vd->vdev_initialize_thread == NULL &&
+ tgt_state != VDEV_INITIALIZE_CANCELED) {
+ return;
+ }
+
+ vdev_initialize_change_state(vd, tgt_state);
+ vd->vdev_initialize_exit_wanted = B_TRUE;
+ while (vd->vdev_initialize_thread != NULL)
+ cv_wait(&vd->vdev_initialize_cv, &vd->vdev_initialize_lock);
+
+ ASSERT3P(vd->vdev_initialize_thread, ==, NULL);
+ vd->vdev_initialize_exit_wanted = B_FALSE;
+}
+
+static void
+vdev_initialize_stop_all_impl(vdev_t *vd, vdev_initializing_state_t tgt_state)
+{
+ if (vd->vdev_ops->vdev_op_leaf && vdev_is_concrete(vd)) {
+ mutex_enter(&vd->vdev_initialize_lock);
+ vdev_initialize_stop(vd, tgt_state);
+ mutex_exit(&vd->vdev_initialize_lock);
+ return;
+ }
+
+ for (uint64_t i = 0; i < vd->vdev_children; i++) {
+ vdev_initialize_stop_all_impl(vd->vdev_child[i], tgt_state);
+ }
+}
+
+/*
+ * Convenience function to stop initializing a vdev tree and set all
+ * initialize thread pointers to NULL.
+ */
+void
+vdev_initialize_stop_all(vdev_t *vd, vdev_initializing_state_t tgt_state)
+{
+ vdev_initialize_stop_all_impl(vd, tgt_state);
+
+ if (vd->vdev_spa->spa_sync_on) {
+ /* Make sure that our state has been synced to disk */
+ txg_wait_synced(spa_get_dsl(vd->vdev_spa), 0);
+ }
+}
+
+void
+vdev_initialize_restart(vdev_t *vd)
+{
+ ASSERT(MUTEX_HELD(&spa_namespace_lock));
+ ASSERT(!spa_config_held(vd->vdev_spa, SCL_ALL, RW_WRITER));
+
+ if (vd->vdev_leaf_zap != 0) {
+ mutex_enter(&vd->vdev_initialize_lock);
+ uint64_t initialize_state = VDEV_INITIALIZE_NONE;
+ int err = zap_lookup(vd->vdev_spa->spa_meta_objset,
+ vd->vdev_leaf_zap, VDEV_LEAF_ZAP_INITIALIZE_STATE,
+ sizeof (initialize_state), 1, &initialize_state);
+ ASSERT(err == 0 || err == ENOENT);
+ vd->vdev_initialize_state = initialize_state;
+
+ uint64_t timestamp = 0;
+ err = zap_lookup(vd->vdev_spa->spa_meta_objset,
+ vd->vdev_leaf_zap, VDEV_LEAF_ZAP_INITIALIZE_ACTION_TIME,
+ sizeof (timestamp), 1, &timestamp);
+ ASSERT(err == 0 || err == ENOENT);
+ vd->vdev_initialize_action_time = (time_t)timestamp;
+
+ if (vd->vdev_initialize_state == VDEV_INITIALIZE_SUSPENDED ||
+ vd->vdev_offline) {
+ /* load progress for reporting, but don't resume */
+ vdev_initialize_load(vd);
+ } else if (vd->vdev_initialize_state ==
+ VDEV_INITIALIZE_ACTIVE && vdev_writeable(vd)) {
+ vdev_initialize(vd);
+ }
+
+ mutex_exit(&vd->vdev_initialize_lock);
+ }
+
+ for (uint64_t i = 0; i < vd->vdev_children; i++) {
+ vdev_initialize_restart(vd->vdev_child[i]);
+ }
+}
diff --git a/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/vdev_label.c b/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/vdev_label.c
new file mode 100644
index 000000000000..0b777c8870c5
--- /dev/null
+++ b/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/vdev_label.c
@@ -0,0 +1,1701 @@
+/*
+ * CDDL HEADER START
+ *
+ * The contents of this file are subject to the terms of the
+ * Common Development and Distribution License (the "License").
+ * You may not use this file except in compliance with the License.
+ *
+ * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
+ * or http://www.opensolaris.org/os/licensing.
+ * See the License for the specific language governing permissions
+ * and limitations under the License.
+ *
+ * When distributing Covered Code, include this CDDL HEADER in each
+ * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
+ * If applicable, add the following below this CDDL HEADER, with the
+ * fields enclosed by brackets "[]" replaced with your own identifying
+ * information: Portions Copyright [yyyy] [name of copyright owner]
+ *
+ * CDDL HEADER END
+ */
+
+/*
+ * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved.
+ * Copyright (c) 2012, 2020 by Delphix. All rights reserved.
+ * Copyright (c) 2017, Intel Corporation.
+ * Copyright 2019 Joyent, Inc.
+ */
+
+/*
+ * Virtual Device Labels
+ * ---------------------
+ *
+ * The vdev label serves several distinct purposes:
+ *
+ * 1. Uniquely identify this device as part of a ZFS pool and confirm its
+ * identity within the pool.
+ *
+ * 2. Verify that all the devices given in a configuration are present
+ * within the pool.
+ *
+ * 3. Determine the uberblock for the pool.
+ *
+ * 4. In case of an import operation, determine the configuration of the
+ * toplevel vdev of which it is a part.
+ *
+ * 5. If an import operation cannot find all the devices in the pool,
+ * provide enough information to the administrator to determine which
+ * devices are missing.
+ *
+ * It is important to note that while the kernel is responsible for writing the
+ * label, it only consumes the information in the first three cases. The
+ * latter information is only consumed in userland when determining the
+ * configuration to import a pool.
+ *
+ *
+ * Label Organization
+ * ------------------
+ *
+ * Before describing the contents of the label, it's important to understand how
+ * the labels are written and updated with respect to the uberblock.
+ *
+ * When the pool configuration is altered, either because it was newly created
+ * or a device was added, we want to update all the labels such that we can deal
+ * with fatal failure at any point. To this end, each disk has two labels which
+ * are updated before and after the uberblock is synced. Assuming we have
+ * labels and an uberblock with the following transaction groups:
+ *
+ * L1 UB L2
+ * +------+ +------+ +------+
+ * | | | | | |
+ * | t10 | | t10 | | t10 |
+ * | | | | | |
+ * +------+ +------+ +------+
+ *
+ * In this stable state, the labels and the uberblock were all updated within
+ * the same transaction group (10). Each label is mirrored and checksummed, so
+ * that we can detect when we fail partway through writing the label.
+ *
+ * In order to identify which labels are valid, the labels are written in the
+ * following manner:
+ *
+ * 1. For each vdev, update 'L1' to the new label
+ * 2. Update the uberblock
+ * 3. For each vdev, update 'L2' to the new label
+ *
+ * Given arbitrary failure, we can determine the correct label to use based on
+ * the transaction group. If we fail after updating L1 but before updating the
+ * UB, we will notice that L1's transaction group is greater than the uberblock,
+ * so L2 must be valid. If we fail after writing the uberblock but before
+ * writing L2, we will notice that L2's transaction group is less than L1, and
+ * therefore L1 is valid.
+ *
+ * Another added complexity is that not every label is updated when the config
+ * is synced. If we add a single device, we do not want to have to re-write
+ * every label for every device in the pool. This means that both L1 and L2 may
+ * be older than the pool uberblock, because the necessary information is stored
+ * on another vdev.
+ *
+ *
+ * On-disk Format
+ * --------------
+ *
+ * The vdev label consists of two distinct parts, and is wrapped within the
+ * vdev_label_t structure. The label includes 8k of padding to permit legacy
+ * VTOC disk labels; the padding is otherwise ignored.
+ *
+ * The first half of the label is a packed nvlist which contains pool wide
+ * properties, per-vdev properties, and configuration information. It is
+ * described in more detail below.
+ *
+ * The latter half of the label consists of a redundant array of uberblocks.
+ * These uberblocks are updated whenever a transaction group is committed,
+ * or when the configuration is updated. When a pool is loaded, we scan each
+ * vdev for the 'best' uberblock.
+ *
+ *
+ * Configuration Information
+ * -------------------------
+ *
+ * The nvlist describing the pool and vdev contains the following elements:
+ *
+ * version ZFS on-disk version
+ * name Pool name
+ * state Pool state
+ * txg Transaction group in which this label was written
+ * pool_guid Unique identifier for this pool
+ * 	vdev_tree	An nvlist describing the vdev tree.
+ * features_for_read
+ * An nvlist of the features necessary for reading the MOS.
+ *
+ * Each leaf device label also contains the following:
+ *
+ * top_guid Unique ID for top-level vdev in which this is contained
+ * guid Unique ID for the leaf vdev
+ *
+ * The 'vs' configuration follows the format described in 'spa_config.c'.
+ */
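+
+/*
+ * Editor's sketch, not part of the original change: the two-label update
+ * scheme described above lets a reader select a valid label by comparing
+ * transaction groups, roughly:
+ *
+ *	best = NULL;
+ *	for each label l with a valid checksum:
+ *		if (l->txg <= requested_txg &&
+ *		    (best == NULL || l->txg > best->txg))
+ *			best = l;
+ *
+ * vdev_label_read_config() below implements this selection.
+ */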
+
+#include <sys/zfs_context.h>
+#include <sys/spa.h>
+#include <sys/spa_impl.h>
+#include <sys/dmu.h>
+#include <sys/zap.h>
+#include <sys/vdev.h>
+#include <sys/vdev_impl.h>
+#include <sys/uberblock_impl.h>
+#include <sys/metaslab.h>
+#include <sys/metaslab_impl.h>
+#include <sys/zio.h>
+#include <sys/dsl_scan.h>
+#include <sys/abd.h>
+#include <sys/fs/zfs.h>
+#include <sys/trim_map.h>
+
+static boolean_t vdev_trim_on_init = B_TRUE;
+SYSCTL_DECL(_vfs_zfs_vdev);
+SYSCTL_INT(_vfs_zfs_vdev, OID_AUTO, trim_on_init, CTLFLAG_RWTUN,
+ &vdev_trim_on_init, 0, "Enable/disable full vdev trim on initialisation");
+
+/*
+ * Basic routines to read and write from a vdev label.
+ * Used throughout the rest of this file.
+ */
+uint64_t
+vdev_label_offset(uint64_t psize, int l, uint64_t offset)
+{
+ ASSERT(offset < sizeof (vdev_label_t));
+ ASSERT(P2PHASE_TYPED(psize, sizeof (vdev_label_t), uint64_t) == 0);
+
+ return (offset + l * sizeof (vdev_label_t) + (l < VDEV_LABELS / 2 ?
+ 0 : psize - VDEV_LABELS * sizeof (vdev_label_t)));
+}
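+
+/*
+ * Editor's note, not part of the original change: assuming the usual
+ * four 256 KB labels (VDEV_LABELS == 4, sizeof (vdev_label_t) == 256 KB),
+ * the formula above places labels 0 and 1 at device offsets 0 and 256 KB,
+ * and labels 2 and 3 at psize - 512 KB and psize - 256 KB, i.e. two
+ * labels at each end of the device.
+ */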
+
+/*
+ * Returns the vdev label number associated with the given offset.
+ */
+int
+vdev_label_number(uint64_t psize, uint64_t offset)
+{
+ int l;
+
+ if (offset >= psize - VDEV_LABEL_END_SIZE) {
+ offset -= psize - VDEV_LABEL_END_SIZE;
+ offset += (VDEV_LABELS / 2) * sizeof (vdev_label_t);
+ }
+ l = offset / sizeof (vdev_label_t);
+ return (l < VDEV_LABELS ? l : -1);
+}
+
+static void
+vdev_label_read(zio_t *zio, vdev_t *vd, int l, abd_t *buf, uint64_t offset,
+ uint64_t size, zio_done_func_t *done, void *private, int flags)
+{
+ ASSERT(
+ spa_config_held(zio->io_spa, SCL_STATE, RW_READER) == SCL_STATE ||
+ spa_config_held(zio->io_spa, SCL_STATE, RW_WRITER) == SCL_STATE);
+ ASSERT(flags & ZIO_FLAG_CONFIG_WRITER);
+
+ zio_nowait(zio_read_phys(zio, vd,
+ vdev_label_offset(vd->vdev_psize, l, offset),
+ size, buf, ZIO_CHECKSUM_LABEL, done, private,
+ ZIO_PRIORITY_SYNC_READ, flags, B_TRUE));
+}
+
+void
+vdev_label_write(zio_t *zio, vdev_t *vd, int l, abd_t *buf, uint64_t offset,
+ uint64_t size, zio_done_func_t *done, void *private, int flags)
+{
+ ASSERT(
+ spa_config_held(zio->io_spa, SCL_STATE, RW_READER) == SCL_STATE ||
+ spa_config_held(zio->io_spa, SCL_STATE, RW_WRITER) == SCL_STATE);
+ ASSERT(flags & ZIO_FLAG_CONFIG_WRITER);
+
+ zio_nowait(zio_write_phys(zio, vd,
+ vdev_label_offset(vd->vdev_psize, l, offset),
+ size, buf, ZIO_CHECKSUM_LABEL, done, private,
+ ZIO_PRIORITY_SYNC_WRITE, flags, B_TRUE));
+}
+
+static void
+root_vdev_actions_getprogress(vdev_t *vd, nvlist_t *nvl)
+{
+ spa_t *spa = vd->vdev_spa;
+
+ if (vd != spa->spa_root_vdev)
+ return;
+
+ /* provide either current or previous scan information */
+ pool_scan_stat_t ps;
+ if (spa_scan_get_stats(spa, &ps) == 0) {
+ fnvlist_add_uint64_array(nvl,
+ ZPOOL_CONFIG_SCAN_STATS, (uint64_t *)&ps,
+ sizeof (pool_scan_stat_t) / sizeof (uint64_t));
+ }
+
+ pool_removal_stat_t prs;
+ if (spa_removal_get_stats(spa, &prs) == 0) {
+ fnvlist_add_uint64_array(nvl,
+ ZPOOL_CONFIG_REMOVAL_STATS, (uint64_t *)&prs,
+ sizeof (prs) / sizeof (uint64_t));
+ }
+
+ pool_checkpoint_stat_t pcs;
+ if (spa_checkpoint_get_stats(spa, &pcs) == 0) {
+ fnvlist_add_uint64_array(nvl,
+ ZPOOL_CONFIG_CHECKPOINT_STATS, (uint64_t *)&pcs,
+ sizeof (pcs) / sizeof (uint64_t));
+ }
+}
+
+/*
+ * Generate the nvlist representing this vdev's config.
+ */
+nvlist_t *
+vdev_config_generate(spa_t *spa, vdev_t *vd, boolean_t getstats,
+ vdev_config_flag_t flags)
+{
+ nvlist_t *nv = NULL;
+ vdev_indirect_config_t *vic = &vd->vdev_indirect_config;
+
+ nv = fnvlist_alloc();
+
+ fnvlist_add_string(nv, ZPOOL_CONFIG_TYPE, vd->vdev_ops->vdev_op_type);
+ if (!(flags & (VDEV_CONFIG_SPARE | VDEV_CONFIG_L2CACHE)))
+ fnvlist_add_uint64(nv, ZPOOL_CONFIG_ID, vd->vdev_id);
+ fnvlist_add_uint64(nv, ZPOOL_CONFIG_GUID, vd->vdev_guid);
+
+ if (vd->vdev_path != NULL)
+ fnvlist_add_string(nv, ZPOOL_CONFIG_PATH, vd->vdev_path);
+
+ if (vd->vdev_devid != NULL)
+ fnvlist_add_string(nv, ZPOOL_CONFIG_DEVID, vd->vdev_devid);
+
+ if (vd->vdev_physpath != NULL)
+ fnvlist_add_string(nv, ZPOOL_CONFIG_PHYS_PATH,
+ vd->vdev_physpath);
+
+ if (vd->vdev_fru != NULL)
+ fnvlist_add_string(nv, ZPOOL_CONFIG_FRU, vd->vdev_fru);
+
+ if (vd->vdev_nparity != 0) {
+ ASSERT(strcmp(vd->vdev_ops->vdev_op_type,
+ VDEV_TYPE_RAIDZ) == 0);
+
+ /*
+ * Make sure someone hasn't managed to sneak a fancy new vdev
+ * into a crufty old storage pool.
+ */
+ ASSERT(vd->vdev_nparity == 1 ||
+ (vd->vdev_nparity <= 2 &&
+ spa_version(spa) >= SPA_VERSION_RAIDZ2) ||
+ (vd->vdev_nparity <= 3 &&
+ spa_version(spa) >= SPA_VERSION_RAIDZ3));
+
+ /*
+ * Note that we'll add the nparity tag even on storage pools
+ * that only support a single parity device -- older software
+ * will just ignore it.
+ */
+ fnvlist_add_uint64(nv, ZPOOL_CONFIG_NPARITY, vd->vdev_nparity);
+ }
+
+ if (vd->vdev_wholedisk != -1ULL)
+ fnvlist_add_uint64(nv, ZPOOL_CONFIG_WHOLE_DISK,
+ vd->vdev_wholedisk);
+
+ if (vd->vdev_not_present && !(flags & VDEV_CONFIG_MISSING))
+ fnvlist_add_uint64(nv, ZPOOL_CONFIG_NOT_PRESENT, 1);
+
+ if (vd->vdev_isspare)
+ fnvlist_add_uint64(nv, ZPOOL_CONFIG_IS_SPARE, 1);
+
+ if (!(flags & (VDEV_CONFIG_SPARE | VDEV_CONFIG_L2CACHE)) &&
+ vd == vd->vdev_top) {
+ fnvlist_add_uint64(nv, ZPOOL_CONFIG_METASLAB_ARRAY,
+ vd->vdev_ms_array);
+ fnvlist_add_uint64(nv, ZPOOL_CONFIG_METASLAB_SHIFT,
+ vd->vdev_ms_shift);
+ fnvlist_add_uint64(nv, ZPOOL_CONFIG_ASHIFT, vd->vdev_ashift);
+ fnvlist_add_uint64(nv, ZPOOL_CONFIG_ASIZE,
+ vd->vdev_asize);
+ fnvlist_add_uint64(nv, ZPOOL_CONFIG_IS_LOG, vd->vdev_islog);
+ if (vd->vdev_removing) {
+ fnvlist_add_uint64(nv, ZPOOL_CONFIG_REMOVING,
+ vd->vdev_removing);
+ }
+
+ /* zpool command expects alloc class data */
+ if (getstats && vd->vdev_alloc_bias != VDEV_BIAS_NONE) {
+ const char *bias = NULL;
+
+ switch (vd->vdev_alloc_bias) {
+ case VDEV_BIAS_LOG:
+ bias = VDEV_ALLOC_BIAS_LOG;
+ break;
+ case VDEV_BIAS_SPECIAL:
+ bias = VDEV_ALLOC_BIAS_SPECIAL;
+ break;
+ case VDEV_BIAS_DEDUP:
+ bias = VDEV_ALLOC_BIAS_DEDUP;
+ break;
+ default:
+ ASSERT3U(vd->vdev_alloc_bias, ==,
+ VDEV_BIAS_NONE);
+ }
+ fnvlist_add_string(nv, ZPOOL_CONFIG_ALLOCATION_BIAS,
+ bias);
+ }
+ }
+
+ if (vd->vdev_dtl_sm != NULL) {
+ fnvlist_add_uint64(nv, ZPOOL_CONFIG_DTL,
+ space_map_object(vd->vdev_dtl_sm));
+ }
+
+ if (vic->vic_mapping_object != 0) {
+ fnvlist_add_uint64(nv, ZPOOL_CONFIG_INDIRECT_OBJECT,
+ vic->vic_mapping_object);
+ }
+
+ if (vic->vic_births_object != 0) {
+ fnvlist_add_uint64(nv, ZPOOL_CONFIG_INDIRECT_BIRTHS,
+ vic->vic_births_object);
+ }
+
+ if (vic->vic_prev_indirect_vdev != UINT64_MAX) {
+ fnvlist_add_uint64(nv, ZPOOL_CONFIG_PREV_INDIRECT_VDEV,
+ vic->vic_prev_indirect_vdev);
+ }
+
+ if (vd->vdev_crtxg)
+ fnvlist_add_uint64(nv, ZPOOL_CONFIG_CREATE_TXG, vd->vdev_crtxg);
+
+ if (flags & VDEV_CONFIG_MOS) {
+ if (vd->vdev_leaf_zap != 0) {
+ ASSERT(vd->vdev_ops->vdev_op_leaf);
+ fnvlist_add_uint64(nv, ZPOOL_CONFIG_VDEV_LEAF_ZAP,
+ vd->vdev_leaf_zap);
+ }
+
+ if (vd->vdev_top_zap != 0) {
+ ASSERT(vd == vd->vdev_top);
+ fnvlist_add_uint64(nv, ZPOOL_CONFIG_VDEV_TOP_ZAP,
+ vd->vdev_top_zap);
+ }
+ }
+
+ if (getstats) {
+ vdev_stat_t vs;
+
+ vdev_get_stats(vd, &vs);
+ fnvlist_add_uint64_array(nv, ZPOOL_CONFIG_VDEV_STATS,
+ (uint64_t *)&vs, sizeof (vs) / sizeof (uint64_t));
+
+ root_vdev_actions_getprogress(vd, nv);
+
+ /*
+ * Note: this can be called from open context
+ * (spa_get_stats()), so we need the rwlock to prevent
+ * the mapping from being changed by condensing.
+ */
+ rw_enter(&vd->vdev_indirect_rwlock, RW_READER);
+ if (vd->vdev_indirect_mapping != NULL) {
+ ASSERT(vd->vdev_indirect_births != NULL);
+ vdev_indirect_mapping_t *vim =
+ vd->vdev_indirect_mapping;
+ fnvlist_add_uint64(nv, ZPOOL_CONFIG_INDIRECT_SIZE,
+ vdev_indirect_mapping_size(vim));
+ }
+ rw_exit(&vd->vdev_indirect_rwlock);
+ if (vd->vdev_mg != NULL &&
+ vd->vdev_mg->mg_fragmentation != ZFS_FRAG_INVALID) {
+ /*
+ * Compute approximately how much memory would be used
+ * for the indirect mapping if this device were to
+ * be removed.
+ *
+ * Note: If the frag metric is invalid, then not
+ * enough metaslabs have been converted to have
+ * histograms.
+ */
+ uint64_t seg_count = 0;
+ uint64_t to_alloc = vd->vdev_stat.vs_alloc;
+
+ /*
+ * There are the same number of allocated segments
+ * as free segments, so we will have at least one
+ * entry per free segment. However, small free
+ * segments (smaller than vdev_removal_max_span)
+ * will be combined with adjacent allocated segments
+ * as a single mapping.
+ */
+ for (int i = 0; i < RANGE_TREE_HISTOGRAM_SIZE; i++) {
+ if (1ULL << (i + 1) < vdev_removal_max_span) {
+ to_alloc +=
+ vd->vdev_mg->mg_histogram[i] <<
+				    (i + 1);
+ } else {
+ seg_count +=
+ vd->vdev_mg->mg_histogram[i];
+ }
+ }
+
+ /*
+ * The maximum length of a mapping is
+ * zfs_remove_max_segment, so we need at least one entry
+ * per zfs_remove_max_segment of allocated data.
+ */
+ seg_count += to_alloc / zfs_remove_max_segment;
+
+ fnvlist_add_uint64(nv, ZPOOL_CONFIG_INDIRECT_SIZE,
+ seg_count *
+ sizeof (vdev_indirect_mapping_entry_phys_t));
+ }
+ }
+
+ if (!vd->vdev_ops->vdev_op_leaf) {
+ nvlist_t **child;
+ int c, idx;
+
+ ASSERT(!vd->vdev_ishole);
+
+ child = kmem_alloc(vd->vdev_children * sizeof (nvlist_t *),
+ KM_SLEEP);
+
+ for (c = 0, idx = 0; c < vd->vdev_children; c++) {
+ vdev_t *cvd = vd->vdev_child[c];
+
+ /*
+ * If we're generating an nvlist of removing
+ * vdevs then skip over any device which is
+ * not being removed.
+ */
+ if ((flags & VDEV_CONFIG_REMOVING) &&
+ !cvd->vdev_removing)
+ continue;
+
+ child[idx++] = vdev_config_generate(spa, cvd,
+ getstats, flags);
+ }
+
+ if (idx) {
+ fnvlist_add_nvlist_array(nv, ZPOOL_CONFIG_CHILDREN,
+ child, idx);
+ }
+
+ for (c = 0; c < idx; c++)
+ nvlist_free(child[c]);
+
+ kmem_free(child, vd->vdev_children * sizeof (nvlist_t *));
+
+ } else {
+ const char *aux = NULL;
+
+ if (vd->vdev_offline && !vd->vdev_tmpoffline)
+ fnvlist_add_uint64(nv, ZPOOL_CONFIG_OFFLINE, B_TRUE);
+ if (vd->vdev_resilver_txg != 0)
+ fnvlist_add_uint64(nv, ZPOOL_CONFIG_RESILVER_TXG,
+ vd->vdev_resilver_txg);
+ if (vd->vdev_faulted)
+ fnvlist_add_uint64(nv, ZPOOL_CONFIG_FAULTED, B_TRUE);
+ if (vd->vdev_degraded)
+ fnvlist_add_uint64(nv, ZPOOL_CONFIG_DEGRADED, B_TRUE);
+ if (vd->vdev_removed)
+ fnvlist_add_uint64(nv, ZPOOL_CONFIG_REMOVED, B_TRUE);
+ if (vd->vdev_unspare)
+ fnvlist_add_uint64(nv, ZPOOL_CONFIG_UNSPARE, B_TRUE);
+ if (vd->vdev_ishole)
+ fnvlist_add_uint64(nv, ZPOOL_CONFIG_IS_HOLE, B_TRUE);
+
+ switch (vd->vdev_stat.vs_aux) {
+ case VDEV_AUX_ERR_EXCEEDED:
+ aux = "err_exceeded";
+ break;
+
+ case VDEV_AUX_EXTERNAL:
+ aux = "external";
+ break;
+ }
+
+ if (aux != NULL)
+ fnvlist_add_string(nv, ZPOOL_CONFIG_AUX_STATE, aux);
+
+ if (vd->vdev_splitting && vd->vdev_orig_guid != 0LL) {
+ fnvlist_add_uint64(nv, ZPOOL_CONFIG_ORIG_GUID,
+ vd->vdev_orig_guid);
+ }
+ }
+
+ return (nv);
+}
+
+/*
+ * Generate a view of the top-level vdevs. If we currently have holes
+ * in the namespace, then generate an array which contains a list of holey
+ * vdevs. Additionally, add the number of top-level children that currently
+ * exist.
+ */
+void
+vdev_top_config_generate(spa_t *spa, nvlist_t *config)
+{
+ vdev_t *rvd = spa->spa_root_vdev;
+ uint64_t *array;
+ uint_t c, idx;
+
+ array = kmem_alloc(rvd->vdev_children * sizeof (uint64_t), KM_SLEEP);
+
+ for (c = 0, idx = 0; c < rvd->vdev_children; c++) {
+ vdev_t *tvd = rvd->vdev_child[c];
+
+ if (tvd->vdev_ishole) {
+ array[idx++] = c;
+ }
+ }
+
+ if (idx) {
+ VERIFY(nvlist_add_uint64_array(config, ZPOOL_CONFIG_HOLE_ARRAY,
+ array, idx) == 0);
+ }
+
+ VERIFY(nvlist_add_uint64(config, ZPOOL_CONFIG_VDEV_CHILDREN,
+ rvd->vdev_children) == 0);
+
+ kmem_free(array, rvd->vdev_children * sizeof (uint64_t));
+}
+
+/*
+ * Returns the configuration from the label of the given vdev. For vdevs
+ * which don't have a txg value stored on their label (i.e. spares/cache)
+ * or have not been completely initialized (txg = 0) just return
+ * the configuration from the first valid label we find. Otherwise,
+ * find the most up-to-date label that does not exceed the specified
+ * 'txg' value.
+ */
+nvlist_t *
+vdev_label_read_config(vdev_t *vd, uint64_t txg)
+{
+ spa_t *spa = vd->vdev_spa;
+ nvlist_t *config = NULL;
+ vdev_phys_t *vp;
+ abd_t *vp_abd;
+ zio_t *zio;
+ uint64_t best_txg = 0;
+ uint64_t label_txg = 0;
+ int error = 0;
+ int flags = ZIO_FLAG_CONFIG_WRITER | ZIO_FLAG_CANFAIL |
+ ZIO_FLAG_SPECULATIVE;
+
+ ASSERT(spa_config_held(spa, SCL_STATE_ALL, RW_WRITER) == SCL_STATE_ALL);
+
+ if (!vdev_readable(vd))
+ return (NULL);
+
+ vp_abd = abd_alloc_linear(sizeof (vdev_phys_t), B_TRUE);
+ vp = abd_to_buf(vp_abd);
+
+retry:
+ for (int l = 0; l < VDEV_LABELS; l++) {
+ nvlist_t *label = NULL;
+
+ zio = zio_root(spa, NULL, NULL, flags);
+
+ vdev_label_read(zio, vd, l, vp_abd,
+ offsetof(vdev_label_t, vl_vdev_phys),
+ sizeof (vdev_phys_t), NULL, NULL, flags);
+
+ if (zio_wait(zio) == 0 &&
+ nvlist_unpack(vp->vp_nvlist, sizeof (vp->vp_nvlist),
+ &label, 0) == 0) {
+ /*
+ * Auxiliary vdevs won't have txg values in their
+ * labels and newly added vdevs may not have been
+ * completely initialized so just return the
+ * configuration from the first valid label we
+ * encounter.
+ */
+ error = nvlist_lookup_uint64(label,
+ ZPOOL_CONFIG_POOL_TXG, &label_txg);
+ if ((error || label_txg == 0) && !config) {
+ config = label;
+ break;
+ } else if (label_txg <= txg && label_txg > best_txg) {
+ best_txg = label_txg;
+ nvlist_free(config);
+ config = fnvlist_dup(label);
+ }
+ }
+
+ if (label != NULL) {
+ nvlist_free(label);
+ label = NULL;
+ }
+ }
+
+ if (config == NULL && !(flags & ZIO_FLAG_TRYHARD)) {
+ flags |= ZIO_FLAG_TRYHARD;
+ goto retry;
+ }
+
+ /*
+ * We found a valid label but it didn't pass txg restrictions.
+ */
+ if (config == NULL && label_txg != 0) {
+ vdev_dbgmsg(vd, "label discarded as txg is too large "
+ "(%llu > %llu)", (u_longlong_t)label_txg,
+ (u_longlong_t)txg);
+ }
+
+ abd_free(vp_abd);
+
+ return (config);
+}
+
+/*
+ * Determine if a device is in use. The 'spare_guid' parameter will be filled
+ * in with the device guid if this spare is active elsewhere on the system.
+ */
+static boolean_t
+vdev_inuse(vdev_t *vd, uint64_t crtxg, vdev_labeltype_t reason,
+ uint64_t *spare_guid, uint64_t *l2cache_guid)
+{
+ spa_t *spa = vd->vdev_spa;
+ uint64_t state, pool_guid, device_guid, txg, spare_pool;
+ uint64_t vdtxg = 0;
+ nvlist_t *label;
+
+ if (spare_guid)
+ *spare_guid = 0ULL;
+ if (l2cache_guid)
+ *l2cache_guid = 0ULL;
+
+ /*
+ * Read the label, if any, and perform some basic sanity checks.
+ */
+ if ((label = vdev_label_read_config(vd, -1ULL)) == NULL)
+ return (B_FALSE);
+
+ (void) nvlist_lookup_uint64(label, ZPOOL_CONFIG_CREATE_TXG,
+ &vdtxg);
+
+ if (nvlist_lookup_uint64(label, ZPOOL_CONFIG_POOL_STATE,
+ &state) != 0 ||
+ nvlist_lookup_uint64(label, ZPOOL_CONFIG_GUID,
+ &device_guid) != 0) {
+ nvlist_free(label);
+ return (B_FALSE);
+ }
+
+ if (state != POOL_STATE_SPARE && state != POOL_STATE_L2CACHE &&
+ (nvlist_lookup_uint64(label, ZPOOL_CONFIG_POOL_GUID,
+ &pool_guid) != 0 ||
+ nvlist_lookup_uint64(label, ZPOOL_CONFIG_POOL_TXG,
+ &txg) != 0)) {
+ nvlist_free(label);
+ return (B_FALSE);
+ }
+
+ nvlist_free(label);
+
+ /*
+ * Check to see if this device indeed belongs to the pool it claims to
+ * be a part of. The only way this is allowed is if the device is a hot
+ * spare (which we check for later on).
+ */
+ if (state != POOL_STATE_SPARE && state != POOL_STATE_L2CACHE &&
+ !spa_guid_exists(pool_guid, device_guid) &&
+ !spa_spare_exists(device_guid, NULL, NULL) &&
+ !spa_l2cache_exists(device_guid, NULL))
+ return (B_FALSE);
+
+ /*
+	 * If the transaction group is zero, then this is an initialized (but
+ * unused) label. This is only an error if the create transaction
+ * on-disk is the same as the one we're using now, in which case the
+ * user has attempted to add the same vdev multiple times in the same
+ * transaction.
+ */
+ if (state != POOL_STATE_SPARE && state != POOL_STATE_L2CACHE &&
+ txg == 0 && vdtxg == crtxg)
+ return (B_TRUE);
+
+ /*
+ * Check to see if this is a spare device. We do an explicit check for
+ * spa_has_spare() here because it may be on our pending list of spares
+ * to add. We also check if it is an l2cache device.
+ */
+ if (spa_spare_exists(device_guid, &spare_pool, NULL) ||
+ spa_has_spare(spa, device_guid)) {
+ if (spare_guid)
+ *spare_guid = device_guid;
+
+ switch (reason) {
+ case VDEV_LABEL_CREATE:
+ case VDEV_LABEL_L2CACHE:
+ return (B_TRUE);
+
+ case VDEV_LABEL_REPLACE:
+ return (!spa_has_spare(spa, device_guid) ||
+ spare_pool != 0ULL);
+
+ case VDEV_LABEL_SPARE:
+ return (spa_has_spare(spa, device_guid));
+ }
+ }
+
+ /*
+ * Check to see if this is an l2cache device.
+ */
+ if (spa_l2cache_exists(device_guid, NULL))
+ return (B_TRUE);
+
+ /*
+ * We can't rely on a pool's state if it's been imported
+	 * read-only. Instead we look to see if the pool is marked
+ * read-only in the namespace and set the state to active.
+ */
+ if (state != POOL_STATE_SPARE && state != POOL_STATE_L2CACHE &&
+ (spa = spa_by_guid(pool_guid, device_guid)) != NULL &&
+ spa_mode(spa) == FREAD)
+ state = POOL_STATE_ACTIVE;
+
+ /*
+ * If the device is marked ACTIVE, then this device is in use by another
+ * pool on the system.
+ */
+ return (state == POOL_STATE_ACTIVE);
+}
+
+/*
+ * Initialize a vdev label. We check to make sure each leaf device is not in
+ * use and is writable. We put down an initial label which we will later
+ * overwrite with a complete label. Note that it's important to do this
+ * sequentially, not in parallel, so that we catch cases of multiple use of the
+ * same leaf vdev in the vdev we're creating -- e.g. mirroring a disk with
+ * itself.
+ */
+int
+vdev_label_init(vdev_t *vd, uint64_t crtxg, vdev_labeltype_t reason)
+{
+ spa_t *spa = vd->vdev_spa;
+ nvlist_t *label;
+ vdev_phys_t *vp;
+ abd_t *vp_abd;
+ abd_t *bootenv;
+ uberblock_t *ub;
+ abd_t *ub_abd;
+ zio_t *zio;
+ char *buf;
+ size_t buflen;
+ int error;
+ uint64_t spare_guid, l2cache_guid;
+ int flags = ZIO_FLAG_CONFIG_WRITER | ZIO_FLAG_CANFAIL;
+
+ ASSERT(spa_config_held(spa, SCL_ALL, RW_WRITER) == SCL_ALL);
+
+ for (int c = 0; c < vd->vdev_children; c++)
+ if ((error = vdev_label_init(vd->vdev_child[c],
+ crtxg, reason)) != 0)
+ return (error);
+
+ /* Track the creation time for this vdev */
+ vd->vdev_crtxg = crtxg;
+
+ if (!vd->vdev_ops->vdev_op_leaf || !spa_writeable(spa))
+ return (0);
+
+ /*
+ * Dead vdevs cannot be initialized.
+ */
+ if (vdev_is_dead(vd))
+ return (SET_ERROR(EIO));
+
+ /*
+ * Determine if the vdev is in use.
+ */
+ if (reason != VDEV_LABEL_REMOVE && reason != VDEV_LABEL_SPLIT &&
+ vdev_inuse(vd, crtxg, reason, &spare_guid, &l2cache_guid))
+ return (SET_ERROR(EBUSY));
+
+ /*
+ * If this is a request to add or replace a spare or l2cache device
+ * that is in use elsewhere on the system, then we must update the
+ * guid (which was initialized to a random value) to reflect the
+ * actual GUID (which is shared between multiple pools).
+ */
+ if (reason != VDEV_LABEL_REMOVE && reason != VDEV_LABEL_L2CACHE &&
+ spare_guid != 0ULL) {
+ uint64_t guid_delta = spare_guid - vd->vdev_guid;
+
+ vd->vdev_guid += guid_delta;
+
+ for (vdev_t *pvd = vd; pvd != NULL; pvd = pvd->vdev_parent)
+ pvd->vdev_guid_sum += guid_delta;
+
+ /*
+		 * If this is a replacement, then we want to fall through to the
+ * rest of the code. If we're adding a spare, then it's already
+ * labeled appropriately and we can just return.
+ */
+ if (reason == VDEV_LABEL_SPARE)
+ return (0);
+ ASSERT(reason == VDEV_LABEL_REPLACE ||
+ reason == VDEV_LABEL_SPLIT);
+ }
+
+ if (reason != VDEV_LABEL_REMOVE && reason != VDEV_LABEL_SPARE &&
+ l2cache_guid != 0ULL) {
+ uint64_t guid_delta = l2cache_guid - vd->vdev_guid;
+
+ vd->vdev_guid += guid_delta;
+
+ for (vdev_t *pvd = vd; pvd != NULL; pvd = pvd->vdev_parent)
+ pvd->vdev_guid_sum += guid_delta;
+
+ /*
+		 * If this is a replacement, then we want to fall through to the
+ * rest of the code. If we're adding an l2cache, then it's
+ * already labeled appropriately and we can just return.
+ */
+ if (reason == VDEV_LABEL_L2CACHE)
+ return (0);
+ ASSERT(reason == VDEV_LABEL_REPLACE);
+ }
+
+ /*
+ * TRIM the whole thing, excluding the blank space and boot header
+	 * as specified by the ZFS On-Disk Specification (section 1.3), so that
+ * we start with a clean slate.
+ * It's just an optimization, so we don't care if it fails.
+ * Don't TRIM if removing so that we don't interfere with zpool
+ * disaster recovery.
+ */
+ if (zfs_trim_enabled && vdev_trim_on_init && !vd->vdev_notrim &&
+ (reason == VDEV_LABEL_CREATE || reason == VDEV_LABEL_SPARE ||
+ reason == VDEV_LABEL_L2CACHE))
+ zio_wait(zio_trim(NULL, spa, vd, VDEV_SKIP_SIZE,
+ vd->vdev_psize - VDEV_SKIP_SIZE));
+
+ /*
+ * Initialize its label.
+ */
+ vp_abd = abd_alloc_linear(sizeof (vdev_phys_t), B_TRUE);
+ abd_zero(vp_abd, sizeof (vdev_phys_t));
+ vp = abd_to_buf(vp_abd);
+
+ /*
+ * Generate a label describing the pool and our top-level vdev.
+ * We mark it as being from txg 0 to indicate that it's not
+ * really part of an active pool just yet. The labels will
+ * be written again with a meaningful txg by spa_sync().
+ */
+ if (reason == VDEV_LABEL_SPARE ||
+ (reason == VDEV_LABEL_REMOVE && vd->vdev_isspare)) {
+ /*
+ * For inactive hot spares, we generate a special label that
+		 * identifies it as a mutually shared hot spare. We write the
+ * label if we are adding a hot spare, or if we are removing an
+ * active hot spare (in which case we want to revert the
+ * labels).
+ */
+ VERIFY(nvlist_alloc(&label, NV_UNIQUE_NAME, KM_SLEEP) == 0);
+
+ VERIFY(nvlist_add_uint64(label, ZPOOL_CONFIG_VERSION,
+ spa_version(spa)) == 0);
+ VERIFY(nvlist_add_uint64(label, ZPOOL_CONFIG_POOL_STATE,
+ POOL_STATE_SPARE) == 0);
+ VERIFY(nvlist_add_uint64(label, ZPOOL_CONFIG_GUID,
+ vd->vdev_guid) == 0);
+ } else if (reason == VDEV_LABEL_L2CACHE ||
+ (reason == VDEV_LABEL_REMOVE && vd->vdev_isl2cache)) {
+ /*
+ * For level 2 ARC devices, add a special label.
+ */
+ VERIFY(nvlist_alloc(&label, NV_UNIQUE_NAME, KM_SLEEP) == 0);
+
+ VERIFY(nvlist_add_uint64(label, ZPOOL_CONFIG_VERSION,
+ spa_version(spa)) == 0);
+ VERIFY(nvlist_add_uint64(label, ZPOOL_CONFIG_POOL_STATE,
+ POOL_STATE_L2CACHE) == 0);
+ VERIFY(nvlist_add_uint64(label, ZPOOL_CONFIG_GUID,
+ vd->vdev_guid) == 0);
+ } else {
+ uint64_t txg = 0ULL;
+
+ if (reason == VDEV_LABEL_SPLIT)
+ txg = spa->spa_uberblock.ub_txg;
+ label = spa_config_generate(spa, vd, txg, B_FALSE);
+
+ /*
+ * Add our creation time. This allows us to detect multiple
+ * vdev uses as described above, and automatically expires if we
+ * fail.
+ */
+ VERIFY(nvlist_add_uint64(label, ZPOOL_CONFIG_CREATE_TXG,
+ crtxg) == 0);
+ }
+
+ buf = vp->vp_nvlist;
+ buflen = sizeof (vp->vp_nvlist);
+
+ error = nvlist_pack(label, &buf, &buflen, NV_ENCODE_XDR, KM_SLEEP);
+ if (error != 0) {
+ nvlist_free(label);
+ abd_free(vp_abd);
+ /* EFAULT means nvlist_pack ran out of room */
+ return (error == EFAULT ? ENAMETOOLONG : EINVAL);
+ }
+
+ /*
+ * Initialize uberblock template.
+ */
+ ub_abd = abd_alloc_linear(VDEV_UBERBLOCK_RING, B_TRUE);
+ abd_zero(ub_abd, VDEV_UBERBLOCK_RING);
+ abd_copy_from_buf(ub_abd, &spa->spa_uberblock, sizeof (uberblock_t));
+ ub = abd_to_buf(ub_abd);
+ ub->ub_txg = 0;
+
+ /* Initialize the 2nd padding area. */
+ bootenv = abd_alloc_for_io(VDEV_PAD_SIZE, B_TRUE);
+ abd_zero(bootenv, VDEV_PAD_SIZE);
+
+ /*
+ * Write everything in parallel.
+ */
+retry:
+ zio = zio_root(spa, NULL, NULL, flags);
+
+ for (int l = 0; l < VDEV_LABELS; l++) {
+
+ vdev_label_write(zio, vd, l, vp_abd,
+ offsetof(vdev_label_t, vl_vdev_phys),
+ sizeof (vdev_phys_t), NULL, NULL, flags);
+
+ /*
+ * Skip the 1st padding area.
+		 * Zero out the 2nd padding area where it might have
+		 * left-over data from a previous filesystem format.
+ */
+ vdev_label_write(zio, vd, l, bootenv,
+ offsetof(vdev_label_t, vl_be),
+ VDEV_PAD_SIZE, NULL, NULL, flags);
+
+ vdev_label_write(zio, vd, l, ub_abd,
+ offsetof(vdev_label_t, vl_uberblock),
+ VDEV_UBERBLOCK_RING, NULL, NULL, flags);
+ }
+
+ error = zio_wait(zio);
+
+ if (error != 0 && !(flags & ZIO_FLAG_TRYHARD)) {
+ flags |= ZIO_FLAG_TRYHARD;
+ goto retry;
+ }
+
+ nvlist_free(label);
+ abd_free(bootenv);
+ abd_free(ub_abd);
+ abd_free(vp_abd);
+
+ /*
+ * If this vdev hasn't been previously identified as a spare, then we
+ * mark it as such only if a) we are labeling it as a spare, or b) it
+ * exists as a spare elsewhere in the system. Do the same for
+ * level 2 ARC devices.
+ */
+ if (error == 0 && !vd->vdev_isspare &&
+ (reason == VDEV_LABEL_SPARE ||
+ spa_spare_exists(vd->vdev_guid, NULL, NULL)))
+ spa_spare_add(vd);
+
+ if (error == 0 && !vd->vdev_isl2cache &&
+ (reason == VDEV_LABEL_L2CACHE ||
+ spa_l2cache_exists(vd->vdev_guid, NULL)))
+ spa_l2cache_add(vd);
+
+ return (error);
+}
+
+/*
+ * Done callback for vdev_label_read_bootenv_impl. If this is the first
+ * callback to finish, store our abd in the callback pointer. Otherwise, we
+ * just free our abd and return.
+ */
+static void
+vdev_label_read_bootenv_done(zio_t *zio)
+{
+ zio_t *rio = zio->io_private;
+ abd_t **cbp = rio->io_private;
+
+ ASSERT3U(zio->io_size, ==, VDEV_PAD_SIZE);
+
+ if (zio->io_error == 0) {
+ mutex_enter(&rio->io_lock);
+ if (*cbp == NULL) {
+ /* Will free this buffer in vdev_label_read_bootenv. */
+ *cbp = zio->io_abd;
+ } else {
+ abd_free(zio->io_abd);
+ }
+ mutex_exit(&rio->io_lock);
+ } else {
+ abd_free(zio->io_abd);
+ }
+}
+
+static void
+vdev_label_read_bootenv_impl(zio_t *zio, vdev_t *vd, int flags)
+{
+ for (int c = 0; c < vd->vdev_children; c++)
+ vdev_label_read_bootenv_impl(zio, vd->vdev_child[c], flags);
+
+ /*
+ * We just use the first label that has a correct checksum; the
+ * bootloader should have rewritten them all to be the same on boot,
+ * and any changes we made since boot have been the same across all
+ * labels.
+ *
+	 * While GRUB supports writing to all four labels, other bootloaders
+ * don't, so we only use the first two labels to store boot
+ * information.
+ */
+ if (vd->vdev_ops->vdev_op_leaf && vdev_readable(vd)) {
+ for (int l = 0; l < VDEV_LABELS / 2; l++) {
+ vdev_label_read(zio, vd, l,
+ abd_alloc_linear(VDEV_PAD_SIZE, B_FALSE),
+ offsetof(vdev_label_t, vl_be), VDEV_PAD_SIZE,
+ vdev_label_read_bootenv_done, zio, flags);
+ }
+ }
+}
+
+int
+vdev_label_read_bootenv(vdev_t *rvd, nvlist_t *command)
+{
+ spa_t *spa = rvd->vdev_spa;
+ abd_t *abd = NULL;
+ int flags = ZIO_FLAG_CONFIG_WRITER | ZIO_FLAG_CANFAIL |
+ ZIO_FLAG_SPECULATIVE | ZIO_FLAG_TRYHARD;
+
+ ASSERT(command);
+ ASSERT(spa_config_held(spa, SCL_ALL, RW_WRITER) == SCL_ALL);
+
+ zio_t *zio = zio_root(spa, NULL, &abd, flags);
+ vdev_label_read_bootenv_impl(zio, rvd, flags);
+ int err = zio_wait(zio);
+
+ if (abd != NULL) {
+ vdev_boot_envblock_t *vbe = abd_to_buf(abd);
+ if (vbe->vbe_version != VB_RAW) {
+ abd_free(abd);
+ return (SET_ERROR(ENOTSUP));
+ }
+ vbe->vbe_bootenv[sizeof (vbe->vbe_bootenv) - 1] = '\0';
+ fnvlist_add_string(command, "envmap", vbe->vbe_bootenv);
+ /* abd was allocated in vdev_label_read_bootenv_impl() */
+ abd_free(abd);
+ /* If we managed to read any successfully, return success. */
+ return (0);
+ }
+ return (err);
+}
+
+int
+vdev_label_write_bootenv(vdev_t *vd, char *envmap)
+{
+ zio_t *zio;
+ spa_t *spa = vd->vdev_spa;
+ vdev_boot_envblock_t *bootenv;
+ int flags = ZIO_FLAG_CONFIG_WRITER | ZIO_FLAG_CANFAIL;
+ int error = ENXIO;
+
+ if (strlen(envmap) >= sizeof (bootenv->vbe_bootenv)) {
+ return (SET_ERROR(E2BIG));
+ }
+
+ ASSERT(spa_config_held(spa, SCL_ALL, RW_WRITER) == SCL_ALL);
+
+ for (int c = 0; c < vd->vdev_children; c++) {
+ int child_err = vdev_label_write_bootenv(vd->vdev_child[c],
+ envmap);
+ /*
+ * As long as any of the disks managed to write all of their
+ * labels successfully, return success.
+ */
+ if (child_err == 0)
+ error = child_err;
+ }
+
+ if (!vd->vdev_ops->vdev_op_leaf || vdev_is_dead(vd) ||
+ !vdev_writeable(vd)) {
+ return (error);
+ }
+ ASSERT3U(sizeof (*bootenv), ==, VDEV_PAD_SIZE);
+ abd_t *abd = abd_alloc_for_io(VDEV_PAD_SIZE, B_TRUE);
+ abd_zero(abd, VDEV_PAD_SIZE);
+ bootenv = abd_borrow_buf_copy(abd, VDEV_PAD_SIZE);
+
+ char *buf = bootenv->vbe_bootenv;
+ (void) strlcpy(buf, envmap, sizeof (bootenv->vbe_bootenv));
+ bootenv->vbe_version = VB_RAW;
+ abd_return_buf_copy(abd, bootenv, VDEV_PAD_SIZE);
+
+retry:
+ zio = zio_root(spa, NULL, NULL, flags);
+ for (int l = 0; l < VDEV_LABELS / 2; l++) {
+ vdev_label_write(zio, vd, l, abd,
+ offsetof(vdev_label_t, vl_be),
+ VDEV_PAD_SIZE, NULL, NULL, flags);
+ }
+
+ error = zio_wait(zio);
+ if (error != 0 && !(flags & ZIO_FLAG_TRYHARD)) {
+ flags |= ZIO_FLAG_TRYHARD;
+ goto retry;
+ }
+
+ abd_free(abd);
+ return (error);
+}
+
+int
+vdev_label_write_pad2(vdev_t *vd, const char *buf, size_t size)
+{
+ spa_t *spa = vd->vdev_spa;
+ zio_t *zio;
+ abd_t *pad2;
+ int flags = ZIO_FLAG_CONFIG_WRITER | ZIO_FLAG_CANFAIL;
+ int error;
+
+ if (size > VDEV_PAD_SIZE)
+ return (EINVAL);
+
+ if (!vd->vdev_ops->vdev_op_leaf)
+ return (ENODEV);
+ if (vdev_is_dead(vd))
+ return (ENXIO);
+
+ ASSERT(spa_config_held(spa, SCL_ALL, RW_WRITER) == SCL_ALL);
+
+ pad2 = abd_alloc_for_io(VDEV_PAD_SIZE, B_TRUE);
+ abd_zero(pad2, VDEV_PAD_SIZE);
+ abd_copy_from_buf(pad2, buf, size);
+
+retry:
+ zio = zio_root(spa, NULL, NULL, flags);
+ vdev_label_write(zio, vd, 0, pad2,
+ offsetof(vdev_label_t, vl_be),
+ VDEV_PAD_SIZE, NULL, NULL, flags);
+ error = zio_wait(zio);
+ if (error != 0 && !(flags & ZIO_FLAG_TRYHARD)) {
+ flags |= ZIO_FLAG_TRYHARD;
+ goto retry;
+ }
+
+ abd_free(pad2);
+ return (error);
+}
+
+/*
+ * ==========================================================================
+ * uberblock load/sync
+ * ==========================================================================
+ */
+
+/*
+ * Consider the following situation: txg is safely synced to disk. We've
+ * written the first uberblock for txg + 1, and then we lose power. When we
+ * come back up, we fail to see the uberblock for txg + 1 because, say,
+ * it was on a mirrored device and the replica to which we wrote txg + 1
+ * is now offline. If we then make some changes and sync txg + 1, and then
+ * the missing replica comes back, then for a few seconds we'll have two
+ * conflicting uberblocks on disk with the same txg. The solution is simple:
+ * among uberblocks with equal txg, choose the one with the latest timestamp.
+ */
+static int
+vdev_uberblock_compare(const uberblock_t *ub1, const uberblock_t *ub2)
+{
+ int cmp = AVL_CMP(ub1->ub_txg, ub2->ub_txg);
+
+ if (likely(cmp))
+ return (cmp);
+
+ cmp = AVL_CMP(ub1->ub_timestamp, ub2->ub_timestamp);
+ if (likely(cmp))
+ return (cmp);
+
+ /*
+ * If MMP_VALID(ub) && MMP_SEQ_VALID(ub) then the host has an MMP-aware
+ * ZFS, e.g. zfsonlinux >= 0.7.
+ *
+ * If one ub has MMP and the other does not, they were written by
+	 * different hosts, which matters for MMP. So we treat a missing
+	 * MMP/SEQ as a value of 0.
+ *
+ * Since timestamp and txg are the same if we get this far, either is
+ * acceptable for importing the pool.
+ */
+ unsigned int seq1 = 0;
+ unsigned int seq2 = 0;
+
+ if (MMP_VALID(ub1) && MMP_SEQ_VALID(ub1))
+ seq1 = MMP_SEQ(ub1);
+
+ if (MMP_VALID(ub2) && MMP_SEQ_VALID(ub2))
+ seq2 = MMP_SEQ(ub2);
+
+ return (AVL_CMP(seq1, seq2));
+}
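+
+/*
+ * For illustration (values assumed, not from any real pool): if
+ * ub1 = { ub_txg = 100, ub_timestamp = 5000 } and
+ * ub2 = { ub_txg = 100, ub_timestamp = 5003 }, the txg comparison ties
+ * and ub2 wins on its later timestamp. Only when txg and timestamp both
+ * tie do the MMP sequence numbers (0 when absent) break the tie.
+ */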
+
+struct ubl_cbdata {
+ uberblock_t *ubl_ubbest; /* Best uberblock */
+ vdev_t *ubl_vd; /* vdev associated with the above */
+};
+
+static void
+vdev_uberblock_load_done(zio_t *zio)
+{
+ vdev_t *vd = zio->io_vd;
+ spa_t *spa = zio->io_spa;
+ zio_t *rio = zio->io_private;
+ uberblock_t *ub = abd_to_buf(zio->io_abd);
+ struct ubl_cbdata *cbp = rio->io_private;
+
+ ASSERT3U(zio->io_size, ==, VDEV_UBERBLOCK_SIZE(vd));
+
+ if (zio->io_error == 0 && uberblock_verify(ub) == 0) {
+ mutex_enter(&rio->io_lock);
+ if (ub->ub_txg <= spa->spa_load_max_txg &&
+ vdev_uberblock_compare(ub, cbp->ubl_ubbest) > 0) {
+ /*
+ * Keep track of the vdev in which this uberblock
+ * was found. We will use this information later
+ * to obtain the config nvlist associated with
+ * this uberblock.
+ */
+ *cbp->ubl_ubbest = *ub;
+ cbp->ubl_vd = vd;
+ }
+ mutex_exit(&rio->io_lock);
+ }
+
+ abd_free(zio->io_abd);
+}
+
+static void
+vdev_uberblock_load_impl(zio_t *zio, vdev_t *vd, int flags,
+ struct ubl_cbdata *cbp)
+{
+ for (int c = 0; c < vd->vdev_children; c++)
+ vdev_uberblock_load_impl(zio, vd->vdev_child[c], flags, cbp);
+
+ if (vd->vdev_ops->vdev_op_leaf && vdev_readable(vd)) {
+ for (int l = 0; l < VDEV_LABELS; l++) {
+ for (int n = 0; n < VDEV_UBERBLOCK_COUNT(vd); n++) {
+ vdev_label_read(zio, vd, l,
+ abd_alloc_linear(VDEV_UBERBLOCK_SIZE(vd),
+ B_TRUE), VDEV_UBERBLOCK_OFFSET(vd, n),
+ VDEV_UBERBLOCK_SIZE(vd),
+ vdev_uberblock_load_done, zio, flags);
+ }
+ }
+ }
+}
+
+/*
+ * Reads the 'best' uberblock from disk along with its associated
+ * configuration. First, we read the uberblock array of each label of each
+ * vdev, keeping track of the uberblock with the highest txg in each array.
+ * Then, we read the configuration from the same vdev as the best uberblock.
+ */
+void
+vdev_uberblock_load(vdev_t *rvd, uberblock_t *ub, nvlist_t **config)
+{
+ zio_t *zio;
+ spa_t *spa = rvd->vdev_spa;
+ struct ubl_cbdata cb;
+ int flags = ZIO_FLAG_CONFIG_WRITER | ZIO_FLAG_CANFAIL |
+ ZIO_FLAG_SPECULATIVE | ZIO_FLAG_TRYHARD;
+
+ ASSERT(ub);
+ ASSERT(config);
+
+ bzero(ub, sizeof (uberblock_t));
+ *config = NULL;
+
+ cb.ubl_ubbest = ub;
+ cb.ubl_vd = NULL;
+
+ spa_config_enter(spa, SCL_ALL, FTAG, RW_WRITER);
+ zio = zio_root(spa, NULL, &cb, flags);
+ vdev_uberblock_load_impl(zio, rvd, flags, &cb);
+ (void) zio_wait(zio);
+
+ /*
+ * It's possible that the best uberblock was discovered on a label
+ * that has a configuration which was written in a future txg.
+ * Search all labels on this vdev to find the configuration that
+ * matches the txg for our uberblock.
+ */
+ if (cb.ubl_vd != NULL) {
+ vdev_dbgmsg(cb.ubl_vd, "best uberblock found for spa %s. "
+ "txg %llu", spa->spa_name, (u_longlong_t)ub->ub_txg);
+
+ *config = vdev_label_read_config(cb.ubl_vd, ub->ub_txg);
+ if (*config == NULL && spa->spa_extreme_rewind) {
+ vdev_dbgmsg(cb.ubl_vd, "failed to read label config. "
+ "Trying again without txg restrictions.");
+ *config = vdev_label_read_config(cb.ubl_vd, UINT64_MAX);
+ }
+ if (*config == NULL) {
+ vdev_dbgmsg(cb.ubl_vd, "failed to read label config");
+ }
+ }
+ spa_config_exit(spa, SCL_ALL, FTAG);
+}
+
+/*
+ * On success, increment root zio's count of good writes.
+ * We only get credit for writes to known-visible vdevs; see spa_vdev_add().
+ */
+static void
+vdev_uberblock_sync_done(zio_t *zio)
+{
+ uint64_t *good_writes = zio->io_private;
+
+ if (zio->io_error == 0 && zio->io_vd->vdev_top->vdev_ms_array != 0)
+ atomic_inc_64(good_writes);
+}
+
+/*
+ * Write the uberblock to all labels of all leaves of the specified vdev.
+ */
+static void
+vdev_uberblock_sync(zio_t *zio, uint64_t *good_writes,
+ uberblock_t *ub, vdev_t *vd, int flags)
+{
+ for (uint64_t c = 0; c < vd->vdev_children; c++) {
+ vdev_uberblock_sync(zio, good_writes,
+ ub, vd->vdev_child[c], flags);
+ }
+
+ if (!vd->vdev_ops->vdev_op_leaf)
+ return;
+
+ if (!vdev_writeable(vd))
+ return;
+
+ int m = spa_multihost(vd->vdev_spa) ? MMP_BLOCKS_PER_LABEL : 0;
+ int n = ub->ub_txg % (VDEV_UBERBLOCK_COUNT(vd) - m);
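+	/*
+	 * For illustration (values assumed): with VDEV_UBERBLOCK_COUNT(vd)
+	 * of 128, multihost disabled (m == 0), and ub_txg == 300, this
+	 * uberblock lands in slot 300 % 128 == 44, so successive txgs
+	 * rotate through the ring instead of overwriting a single slot.
+	 */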
+
+ /* Copy the uberblock_t into the ABD */
+ abd_t *ub_abd = abd_alloc_for_io(VDEV_UBERBLOCK_SIZE(vd), B_TRUE);
+ abd_zero(ub_abd, VDEV_UBERBLOCK_SIZE(vd));
+ abd_copy_from_buf(ub_abd, ub, sizeof (uberblock_t));
+
+ for (int l = 0; l < VDEV_LABELS; l++)
+ vdev_label_write(zio, vd, l, ub_abd,
+ VDEV_UBERBLOCK_OFFSET(vd, n), VDEV_UBERBLOCK_SIZE(vd),
+ vdev_uberblock_sync_done, good_writes,
+ flags | ZIO_FLAG_DONT_PROPAGATE);
+
+ abd_free(ub_abd);
+}
+
+/* Sync the uberblocks to all vdevs in svd[] */
+int
+vdev_uberblock_sync_list(vdev_t **svd, int svdcount, uberblock_t *ub, int flags)
+{
+ spa_t *spa = svd[0]->vdev_spa;
+ zio_t *zio;
+ uint64_t good_writes = 0;
+
+ zio = zio_root(spa, NULL, NULL, flags);
+
+ for (int v = 0; v < svdcount; v++)
+ vdev_uberblock_sync(zio, &good_writes, ub, svd[v], flags);
+
+ (void) zio_wait(zio);
+
+ /*
+ * Flush the uberblocks to disk. This ensures that the odd labels
+ * are no longer needed (because the new uberblocks and the even
+ * labels are safely on disk), so it is safe to overwrite them.
+ */
+ zio = zio_root(spa, NULL, NULL, flags);
+
+ for (int v = 0; v < svdcount; v++) {
+ if (vdev_writeable(svd[v])) {
+ zio_flush(zio, svd[v]);
+ }
+ }
+
+ (void) zio_wait(zio);
+
+ return (good_writes >= 1 ? 0 : EIO);
+}
+
+/*
+ * On success, increment the count of good writes for our top-level vdev.
+ */
+static void
+vdev_label_sync_done(zio_t *zio)
+{
+ uint64_t *good_writes = zio->io_private;
+
+ if (zio->io_error == 0)
+ atomic_inc_64(good_writes);
+}
+
+/*
+ * If there weren't enough good writes, indicate failure to the parent.
+ */
+static void
+vdev_label_sync_top_done(zio_t *zio)
+{
+ uint64_t *good_writes = zio->io_private;
+
+ if (*good_writes == 0)
+ zio->io_error = SET_ERROR(EIO);
+
+ kmem_free(good_writes, sizeof (uint64_t));
+}
+
+/*
+ * We ignore errors for log and cache devices; we simply free the private data.
+ */
+static void
+vdev_label_sync_ignore_done(zio_t *zio)
+{
+ kmem_free(zio->io_private, sizeof (uint64_t));
+}
+
+/*
+ * Write all even or odd labels to all leaves of the specified vdev.
+ */
+static void
+vdev_label_sync(zio_t *zio, uint64_t *good_writes,
+ vdev_t *vd, int l, uint64_t txg, int flags)
+{
+ nvlist_t *label;
+ vdev_phys_t *vp;
+ abd_t *vp_abd;
+ char *buf;
+ size_t buflen;
+
+ for (int c = 0; c < vd->vdev_children; c++) {
+ vdev_label_sync(zio, good_writes,
+ vd->vdev_child[c], l, txg, flags);
+ }
+
+ if (!vd->vdev_ops->vdev_op_leaf)
+ return;
+
+ if (!vdev_writeable(vd))
+ return;
+
+ /*
+ * Generate a label describing the top-level config to which we belong.
+ */
+ label = spa_config_generate(vd->vdev_spa, vd, txg, B_FALSE);
+
+ vp_abd = abd_alloc_linear(sizeof (vdev_phys_t), B_TRUE);
+ abd_zero(vp_abd, sizeof (vdev_phys_t));
+ vp = abd_to_buf(vp_abd);
+
+ buf = vp->vp_nvlist;
+ buflen = sizeof (vp->vp_nvlist);
+
+ if (nvlist_pack(label, &buf, &buflen, NV_ENCODE_XDR, KM_SLEEP) == 0) {
+ for (; l < VDEV_LABELS; l += 2) {
+ vdev_label_write(zio, vd, l, vp_abd,
+ offsetof(vdev_label_t, vl_vdev_phys),
+ sizeof (vdev_phys_t),
+ vdev_label_sync_done, good_writes,
+ flags | ZIO_FLAG_DONT_PROPAGATE);
+ }
+ }
+
+ abd_free(vp_abd);
+ nvlist_free(label);
+}
+
+int
+vdev_label_sync_list(spa_t *spa, int l, uint64_t txg, int flags)
+{
+ list_t *dl = &spa->spa_config_dirty_list;
+ vdev_t *vd;
+ zio_t *zio;
+ int error;
+
+ /*
+ * Write the new labels to disk.
+ */
+ zio = zio_root(spa, NULL, NULL, flags);
+
+ for (vd = list_head(dl); vd != NULL; vd = list_next(dl, vd)) {
+ uint64_t *good_writes = kmem_zalloc(sizeof (uint64_t),
+ KM_SLEEP);
+
+ ASSERT(!vd->vdev_ishole);
+
+ zio_t *vio = zio_null(zio, spa, NULL,
+ (vd->vdev_islog || vd->vdev_aux != NULL) ?
+ vdev_label_sync_ignore_done : vdev_label_sync_top_done,
+ good_writes, flags);
+ vdev_label_sync(vio, good_writes, vd, l, txg, flags);
+ zio_nowait(vio);
+ }
+
+ error = zio_wait(zio);
+
+ /*
+ * Flush the new labels to disk.
+ */
+ zio = zio_root(spa, NULL, NULL, flags);
+
+ for (vd = list_head(dl); vd != NULL; vd = list_next(dl, vd))
+ zio_flush(zio, vd);
+
+ (void) zio_wait(zio);
+
+ return (error);
+}
+
+/*
+ * Sync the uberblock and any changes to the vdev configuration.
+ *
+ * The order of operations is carefully crafted to ensure that
+ * if the system panics or loses power at any time, the state on disk
+ * is still transactionally consistent. The in-line comments below
+ * describe the failure semantics at each stage.
+ *
+ * Moreover, vdev_config_sync() is designed to be idempotent: if it fails
+ * at any time, you can just call it again, and it will resume its work.
+ */
+int
+vdev_config_sync(vdev_t **svd, int svdcount, uint64_t txg)
+{
+ spa_t *spa = svd[0]->vdev_spa;
+ uberblock_t *ub = &spa->spa_uberblock;
+ int error = 0;
+ int flags = ZIO_FLAG_CONFIG_WRITER | ZIO_FLAG_CANFAIL;
+
+ ASSERT(svdcount != 0);
+retry:
+ /*
+ * Normally, we don't want to try too hard to write every label and
+ * uberblock. If there is a flaky disk, we don't want the rest of the
+ * sync process to block while we retry. But if we can't write a
+ * single label out, we should retry with ZIO_FLAG_TRYHARD before
+ * bailing out and declaring the pool faulted.
+ */
+ if (error != 0) {
+ if ((flags & ZIO_FLAG_TRYHARD) != 0)
+ return (error);
+ flags |= ZIO_FLAG_TRYHARD;
+ }
+
+ ASSERT(ub->ub_txg <= txg);
+
+ /*
+ * If this isn't a resync due to I/O errors,
+ * and nothing changed in this transaction group,
+ * and the vdev configuration hasn't changed,
+ * then there's nothing to do.
+ */
+ if (ub->ub_txg < txg) {
+ boolean_t changed = uberblock_update(ub, spa->spa_root_vdev,
+ txg, spa->spa_mmp.mmp_delay);
+
+ if (!changed && list_is_empty(&spa->spa_config_dirty_list))
+ return (0);
+ }
+
+ if (txg > spa_freeze_txg(spa))
+ return (0);
+
+ ASSERT(txg <= spa->spa_final_txg);
+
+ /*
+ * Flush the write cache of every disk that's been written to
+ * in this transaction group. This ensures that all blocks
+ * written in this txg will be committed to stable storage
+ * before any uberblock that references them.
+ */
+ zio_t *zio = zio_root(spa, NULL, NULL, flags);
+
+ for (vdev_t *vd =
+ txg_list_head(&spa->spa_vdev_txg_list, TXG_CLEAN(txg)); vd != NULL;
+ vd = txg_list_next(&spa->spa_vdev_txg_list, vd, TXG_CLEAN(txg)))
+ zio_flush(zio, vd);
+
+ (void) zio_wait(zio);
+
+ /*
+ * Sync out the even labels (L0, L2) for every dirty vdev. If the
+ * system dies in the middle of this process, that's OK: all of the
+ * even labels that made it to disk will be newer than any uberblock,
+ * and will therefore be considered invalid. The odd labels (L1, L3),
+ * which have not yet been touched, will still be valid. We flush
+ * the new labels to disk to ensure that all even-label updates
+ * are committed to stable storage before the uberblock update.
+ */
+ if ((error = vdev_label_sync_list(spa, 0, txg, flags)) != 0) {
+ if ((flags & ZIO_FLAG_TRYHARD) != 0) {
+ zfs_dbgmsg("vdev_label_sync_list() returned error %d "
+ "for pool '%s' when syncing out the even labels "
+ "of dirty vdevs", error, spa_name(spa));
+ }
+ goto retry;
+ }
+
+ /*
+ * Sync the uberblocks to all vdevs in svd[].
+ * If the system dies in the middle of this step, there are two cases
+ * to consider, and the on-disk state is consistent either way:
+ *
+ * (1) If none of the new uberblocks made it to disk, then the
+ * previous uberblock will be the newest, and the odd labels
+ * (which had not yet been touched) will be valid with respect
+ * to that uberblock.
+ *
+ * (2) If one or more new uberblocks made it to disk, then they
+ * will be the newest, and the even labels (which had all
+ * been successfully committed) will be valid with respect
+ * to the new uberblocks.
+ */
+ if ((error = vdev_uberblock_sync_list(svd, svdcount, ub, flags)) != 0) {
+ if ((flags & ZIO_FLAG_TRYHARD) != 0) {
+ zfs_dbgmsg("vdev_uberblock_sync_list() returned error "
+ "%d for pool '%s'", error, spa_name(spa));
+ }
+ goto retry;
+ }
+
+ if (spa_multihost(spa))
+ mmp_update_uberblock(spa, ub);
+
+ /*
+ * Sync out odd labels for every dirty vdev. If the system dies
+ * in the middle of this process, the even labels and the new
+ * uberblocks will suffice to open the pool. The next time
+ * the pool is opened, the first thing we'll do -- before any
+ * user data is modified -- is mark every vdev dirty so that
+ * all labels will be brought up to date. We flush the new labels
+ * to disk to ensure that all odd-label updates are committed to
+ * stable storage before the next transaction group begins.
+ */
+ if ((error = vdev_label_sync_list(spa, 1, txg, flags)) != 0) {
+ if ((flags & ZIO_FLAG_TRYHARD) != 0) {
+ zfs_dbgmsg("vdev_label_sync_list() returned error %d "
+ "for pool '%s' when syncing out the odd labels of "
+ "dirty vdevs", error, spa_name(spa));
+ }
+		goto retry;
+ }
+
+ trim_thread_wakeup(spa);
+
+ return (0);
+}
diff --git a/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/vdev_mirror.c b/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/vdev_mirror.c
new file mode 100644
index 000000000000..391cee87af08
--- /dev/null
+++ b/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/vdev_mirror.c
@@ -0,0 +1,779 @@
+/*
+ * CDDL HEADER START
+ *
+ * The contents of this file are subject to the terms of the
+ * Common Development and Distribution License (the "License").
+ * You may not use this file except in compliance with the License.
+ *
+ * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
+ * or http://www.opensolaris.org/os/licensing.
+ * See the License for the specific language governing permissions
+ * and limitations under the License.
+ *
+ * When distributing Covered Code, include this CDDL HEADER in each
+ * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
+ * If applicable, add the following below this CDDL HEADER, with the
+ * fields enclosed by brackets "[]" replaced with your own identifying
+ * information: Portions Copyright [yyyy] [name of copyright owner]
+ *
+ * CDDL HEADER END
+ */
+/*
+ * Copyright 2010 Sun Microsystems, Inc. All rights reserved.
+ * Use is subject to license terms.
+ */
+
+/*
+ * Copyright (c) 2012, 2018 by Delphix. All rights reserved.
+ */
+
+#include <sys/zfs_context.h>
+#include <sys/spa.h>
+#include <sys/spa_impl.h>
+#include <sys/dsl_pool.h>
+#include <sys/dsl_scan.h>
+#include <sys/vdev_impl.h>
+#include <sys/zio.h>
+#include <sys/abd.h>
+#include <sys/fs/zfs.h>
+
+/*
+ * Virtual device vector for mirroring.
+ */
+
+typedef struct mirror_child {
+ vdev_t *mc_vd;
+ uint64_t mc_offset;
+ int mc_error;
+ int mc_load;
+ uint8_t mc_tried;
+ uint8_t mc_skipped;
+ uint8_t mc_speculative;
+} mirror_child_t;
+
+typedef struct mirror_map {
+ int *mm_preferred;
+ int mm_preferred_cnt;
+ int mm_children;
+ boolean_t mm_resilvering;
+ boolean_t mm_root;
+ mirror_child_t mm_child[];
+} mirror_map_t;
+
+static int vdev_mirror_shift = 21;
+
+#ifdef _KERNEL
+SYSCTL_DECL(_vfs_zfs_vdev);
+static SYSCTL_NODE(_vfs_zfs_vdev, OID_AUTO, mirror,
+ CTLFLAG_RD | CTLFLAG_MPSAFE, 0,
+ "ZFS VDEV Mirror");
+#endif
+
+/*
+ * The load configuration settings below are tuned by default for
+ * the case where all devices are of the same rotational type.
+ *
+ * If there is a mixture of rotating and non-rotating media, setting
+ * non_rotating_seek_inc to 0 may well provide better results as it
+ * will direct more reads to the non-rotating vdevs, which are
+ * likely to offer higher performance.
+ */
+
+/* Rotating media load calculation configuration. */
+static int rotating_inc = 0;
+#ifdef _KERNEL
+SYSCTL_INT(_vfs_zfs_vdev_mirror, OID_AUTO, rotating_inc, CTLFLAG_RWTUN,
+ &rotating_inc, 0, "Rotating media load increment for non-seeking I/O's");
+#endif
+
+static int rotating_seek_inc = 5;
+#ifdef _KERNEL
+SYSCTL_INT(_vfs_zfs_vdev_mirror, OID_AUTO, rotating_seek_inc, CTLFLAG_RWTUN,
+ &rotating_seek_inc, 0, "Rotating media load increment for seeking I/O's");
+#endif
+
+static int rotating_seek_offset = 1 * 1024 * 1024;
+#ifdef _KERNEL
+SYSCTL_INT(_vfs_zfs_vdev_mirror, OID_AUTO, rotating_seek_offset, CTLFLAG_RWTUN,
+ &rotating_seek_offset, 0, "Offset in bytes from the last I/O which "
+ "triggers a reduced rotating media seek increment");
+#endif
+
+/* Non-rotating media load calculation configuration. */
+static int non_rotating_inc = 0;
+#ifdef _KERNEL
+SYSCTL_INT(_vfs_zfs_vdev_mirror, OID_AUTO, non_rotating_inc, CTLFLAG_RWTUN,
+ &non_rotating_inc, 0,
+ "Non-rotating media load increment for non-seeking I/O's");
+#endif
+
+static int non_rotating_seek_inc = 1;
+#ifdef _KERNEL
+SYSCTL_INT(_vfs_zfs_vdev_mirror, OID_AUTO, non_rotating_seek_inc, CTLFLAG_RWTUN,
+ &non_rotating_seek_inc, 0,
+ "Non-rotating media load increment for seeking I/O's");
+#endif
+
+
+static inline size_t
+vdev_mirror_map_size(int children)
+{
+ return (offsetof(mirror_map_t, mm_child[children]) +
+	    sizeof (int) * children);
+}
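+
+/*
+ * The map is a single allocation: the flexible mm_child[] array is
+ * followed in memory by the mm_preferred int array. For example, with
+ * children == 2 the layout is
+ *
+ *   [mirror_map_t][mm_child[0]][mm_child[1]][mm_preferred[0..1]]
+ *
+ * which is why vdev_mirror_map_alloc() points mm_preferred just past
+ * mm_child[children].
+ */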
+
+static inline mirror_map_t *
+vdev_mirror_map_alloc(int children, boolean_t resilvering, boolean_t root)
+{
+ mirror_map_t *mm;
+
+ mm = kmem_zalloc(vdev_mirror_map_size(children), KM_SLEEP);
+ mm->mm_children = children;
+ mm->mm_resilvering = resilvering;
+ mm->mm_root = root;
+ mm->mm_preferred = (int *)((uintptr_t)mm +
+ offsetof(mirror_map_t, mm_child[children]));
+
+	return (mm);
+}
+
+static void
+vdev_mirror_map_free(zio_t *zio)
+{
+ mirror_map_t *mm = zio->io_vsd;
+
+ kmem_free(mm, vdev_mirror_map_size(mm->mm_children));
+}
+
+static const zio_vsd_ops_t vdev_mirror_vsd_ops = {
+ vdev_mirror_map_free,
+ zio_vsd_default_cksum_report
+};
+
+static int
+vdev_mirror_load(mirror_map_t *mm, vdev_t *vd, uint64_t zio_offset)
+{
+ uint64_t lastoffset;
+ int load;
+
+ /* All DVAs have equal weight at the root. */
+ if (mm->mm_root)
+ return (INT_MAX);
+
+ /*
+	 * We don't return INT_MAX if the device is resilvering (i.e.
+	 * vdev_resilver_txg != 0), because in testing overall performance
+	 * was slightly worse when we did than when we didn't.
+ */
+
+ /* Standard load based on pending queue length. */
+ load = vdev_queue_length(vd);
+ lastoffset = vdev_queue_lastoffset(vd);
+
+ if (vd->vdev_nonrot) {
+ /* Non-rotating media. */
+ if (lastoffset == zio_offset)
+ return (load + non_rotating_inc);
+
+ /*
+ * Apply a seek penalty even for non-rotating devices as
+	 * sequential I/Os can be aggregated into fewer operations
+ * on the device, thus avoiding unnecessary per-command
+ * overhead and boosting performance.
+ */
+ return (load + non_rotating_seek_inc);
+ }
+
+ /* Rotating media I/O's which directly follow the last I/O. */
+ if (lastoffset == zio_offset)
+ return (load + rotating_inc);
+
+ /*
+ * Apply half the seek increment to I/O's within seek offset
+	 * of the last I/O queued to this vdev, as they should incur less
+	 * of a seek penalty.
+ */
+ if (ABS(lastoffset - zio_offset) < rotating_seek_offset)
+ return (load + (rotating_seek_inc / 2));
+
+ /* Apply the full seek increment to all other I/O's. */
+ return (load + rotating_seek_inc);
+}
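+
+/*
+ * Worked example with the default tunables (queue length assumed): a
+ * rotating vdev with 4 pending I/Os receives a read 512 KB past the
+ * last queued offset. That falls within rotating_seek_offset (1 MB),
+ * so the load is 4 + rotating_seek_inc / 2 == 4 + 5 / 2 == 6 (integer
+ * division); a read 2 MB away would instead score 4 + 5 == 9.
+ */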
+
+
+static mirror_map_t *
+vdev_mirror_map_init(zio_t *zio)
+{
+ mirror_map_t *mm = NULL;
+ mirror_child_t *mc;
+ vdev_t *vd = zio->io_vd;
+ int c;
+
+ if (vd == NULL) {
+ dva_t *dva = zio->io_bp->blk_dva;
+ spa_t *spa = zio->io_spa;
+ dva_t dva_copy[SPA_DVAS_PER_BP];
+
+ c = BP_GET_NDVAS(zio->io_bp);
+
+ /*
+ * If we do not trust the pool config, some DVAs might be
+ * invalid or point to vdevs that do not exist. We skip them.
+ */
+ if (!spa_trust_config(spa)) {
+ ASSERT3U(zio->io_type, ==, ZIO_TYPE_READ);
+ int j = 0;
+ for (int i = 0; i < c; i++) {
+ if (zfs_dva_valid(spa, &dva[i], zio->io_bp))
+ dva_copy[j++] = dva[i];
+ }
+ if (j == 0) {
+ zio->io_vsd = NULL;
+ zio->io_error = ENXIO;
+ return (NULL);
+ }
+ if (j < c) {
+ dva = dva_copy;
+ c = j;
+ }
+ }
+
+ mm = vdev_mirror_map_alloc(c, B_FALSE, B_TRUE);
+
+ for (c = 0; c < mm->mm_children; c++) {
+ mc = &mm->mm_child[c];
+ mc->mc_vd = vdev_lookup_top(spa, DVA_GET_VDEV(&dva[c]));
+ mc->mc_offset = DVA_GET_OFFSET(&dva[c]);
+ }
+ } else {
+ /*
+ * If we are resilvering, then we should handle scrub reads
+ * differently; we shouldn't issue them to the resilvering
+ * device because it might not have those blocks.
+ *
+ * We are resilvering iff:
+	 * 1) We are a replacing vdev (i.e. our name is "replacing-1" or
+ * "spare-1" or something like that), and
+ * 2) The pool is currently being resilvered.
+ *
+ * We cannot simply check vd->vdev_resilver_txg, because it's
+ * not set in this path.
+ *
+ * Nor can we just check our vdev_ops; there are cases (such as
+ * when a user types "zpool replace pool odev spare_dev" and
+ * spare_dev is in the spare list, or when a spare device is
+ * automatically used to replace a DEGRADED device) when
+ * resilvering is complete but both the original vdev and the
+ * spare vdev remain in the pool. That behavior is intentional.
+ * It helps implement the policy that a spare should be
+ * automatically removed from the pool after the user replaces
+ * the device that originally failed.
+ *
+ * If a spa load is in progress, then spa_dsl_pool may be
+ * uninitialized. But we shouldn't be resilvering during a spa
+ * load anyway.
+ */
+ boolean_t replacing = (vd->vdev_ops == &vdev_replacing_ops ||
+ vd->vdev_ops == &vdev_spare_ops) &&
+ spa_load_state(vd->vdev_spa) == SPA_LOAD_NONE &&
+ dsl_scan_resilvering(vd->vdev_spa->spa_dsl_pool);
+ mm = vdev_mirror_map_alloc(vd->vdev_children, replacing,
+ B_FALSE);
+ for (c = 0; c < mm->mm_children; c++) {
+ mc = &mm->mm_child[c];
+ mc->mc_vd = vd->vdev_child[c];
+ mc->mc_offset = zio->io_offset;
+ }
+ }
+
+ zio->io_vsd = mm;
+ zio->io_vsd_ops = &vdev_mirror_vsd_ops;
+ return (mm);
+}
+
+static int
+vdev_mirror_open(vdev_t *vd, uint64_t *asize, uint64_t *max_asize,
+ uint64_t *logical_ashift, uint64_t *physical_ashift)
+{
+ int numerrors = 0;
+ int lasterror = 0;
+
+ if (vd->vdev_children == 0) {
+ vd->vdev_stat.vs_aux = VDEV_AUX_BAD_LABEL;
+ return (SET_ERROR(EINVAL));
+ }
+
+ vdev_open_children(vd);
+
+ for (int c = 0; c < vd->vdev_children; c++) {
+ vdev_t *cvd = vd->vdev_child[c];
+
+ if (cvd->vdev_open_error) {
+ lasterror = cvd->vdev_open_error;
+ numerrors++;
+ continue;
+ }
+
+ *asize = MIN(*asize - 1, cvd->vdev_asize - 1) + 1;
+ *max_asize = MIN(*max_asize - 1, cvd->vdev_max_asize - 1) + 1;
+ *logical_ashift = MAX(*logical_ashift, cvd->vdev_ashift);
+ *physical_ashift = MAX(*physical_ashift,
+ cvd->vdev_physical_ashift);
+ }
+
+ if (numerrors == vd->vdev_children) {
+ if (vdev_children_are_offline(vd))
+ vd->vdev_stat.vs_aux = VDEV_AUX_CHILDREN_OFFLINE;
+ else
+ vd->vdev_stat.vs_aux = VDEV_AUX_NO_REPLICAS;
+ return (lasterror);
+ }
+
+ return (0);
+}
+
+static void
+vdev_mirror_close(vdev_t *vd)
+{
+ for (int c = 0; c < vd->vdev_children; c++)
+ vdev_close(vd->vdev_child[c]);
+}
+
+static void
+vdev_mirror_child_done(zio_t *zio)
+{
+ mirror_child_t *mc = zio->io_private;
+
+ mc->mc_error = zio->io_error;
+ mc->mc_tried = 1;
+ mc->mc_skipped = 0;
+}
+
+static void
+vdev_mirror_scrub_done(zio_t *zio)
+{
+ mirror_child_t *mc = zio->io_private;
+
+ if (zio->io_error == 0) {
+ zio_t *pio;
+ zio_link_t *zl = NULL;
+
+ mutex_enter(&zio->io_lock);
+ while ((pio = zio_walk_parents(zio, &zl)) != NULL) {
+ mutex_enter(&pio->io_lock);
+ ASSERT3U(zio->io_size, >=, pio->io_size);
+ abd_copy(pio->io_abd, zio->io_abd, pio->io_size);
+ mutex_exit(&pio->io_lock);
+ }
+ mutex_exit(&zio->io_lock);
+ }
+ abd_free(zio->io_abd);
+
+ mc->mc_error = zio->io_error;
+ mc->mc_tried = 1;
+ mc->mc_skipped = 0;
+}
+
+/*
+ * Check the other, lower-index DVAs to see if they're on the same
+ * vdev as the child we picked. If they are, use them since they
+ * are likely to have been allocated from the primary metaslab in
+ * use at the time, and hence are more likely to have locality with
+ * single-copy data.
+ */
+static int
+vdev_mirror_dva_select(zio_t *zio, int p)
+{
+ dva_t *dva = zio->io_bp->blk_dva;
+ mirror_map_t *mm = zio->io_vsd;
+ int preferred;
+ int c;
+
+ preferred = mm->mm_preferred[p];
+ for (p-- ; p >= 0; p--) {
+ c = mm->mm_preferred[p];
+ if (DVA_GET_VDEV(&dva[c]) == DVA_GET_VDEV(&dva[preferred]))
+ preferred = c;
+ }
+ return (preferred);
+}
+
+static int
+vdev_mirror_preferred_child_randomize(zio_t *zio)
+{
+ mirror_map_t *mm = zio->io_vsd;
+ int p;
+
+ if (mm->mm_root) {
+ p = spa_get_random(mm->mm_preferred_cnt);
+ return (vdev_mirror_dva_select(zio, p));
+ }
+
+ /*
+ * To ensure we don't always favour the first matching vdev,
+	 * which could lead to wear leveling issues on SSDs, we
+ * use the I/O offset as a pseudo random seed into the vdevs
+ * which have the lowest load.
+ */
+ p = (zio->io_offset >> vdev_mirror_shift) % mm->mm_preferred_cnt;
+ return (mm->mm_preferred[p]);
+}
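+
+/*
+ * For example (assumed offsets): with vdev_mirror_shift == 21 (2 MB
+ * regions) and three equally loaded children, a read at offset 6 MB
+ * picks mm_preferred[(6 MB >> 21) % 3] == mm_preferred[0], while a read
+ * at 8 MB picks mm_preferred[(8 MB >> 21) % 3] == mm_preferred[1].
+ */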
+
+/*
+ * Try to find a vdev whose DTL doesn't contain the block we want to read,
+ * preferring vdevs based on their calculated load.
+ *
+ * If we can't, try the read on any vdev we haven't already tried.
+ */
+static int
+vdev_mirror_child_select(zio_t *zio)
+{
+ mirror_map_t *mm = zio->io_vsd;
+ uint64_t txg = zio->io_txg;
+ int c, lowest_load;
+
+ ASSERT(zio->io_bp == NULL || BP_PHYSICAL_BIRTH(zio->io_bp) == txg);
+
+ lowest_load = INT_MAX;
+ mm->mm_preferred_cnt = 0;
+ for (c = 0; c < mm->mm_children; c++) {
+ mirror_child_t *mc;
+
+ mc = &mm->mm_child[c];
+ if (mc->mc_tried || mc->mc_skipped)
+ continue;
+
+ if (!vdev_readable(mc->mc_vd)) {
+ mc->mc_error = SET_ERROR(ENXIO);
+ mc->mc_tried = 1; /* don't even try */
+ mc->mc_skipped = 1;
+ continue;
+ }
+
+ if (vdev_dtl_contains(mc->mc_vd, DTL_MISSING, txg, 1)) {
+ mc->mc_error = SET_ERROR(ESTALE);
+ mc->mc_skipped = 1;
+ mc->mc_speculative = 1;
+ continue;
+ }
+
+ mc->mc_load = vdev_mirror_load(mm, mc->mc_vd, mc->mc_offset);
+ if (mc->mc_load > lowest_load)
+ continue;
+
+ if (mc->mc_load < lowest_load) {
+ lowest_load = mc->mc_load;
+ mm->mm_preferred_cnt = 0;
+ }
+ mm->mm_preferred[mm->mm_preferred_cnt] = c;
+ mm->mm_preferred_cnt++;
+ }
+
+ if (mm->mm_preferred_cnt == 1) {
+ vdev_queue_register_lastoffset(
+ mm->mm_child[mm->mm_preferred[0]].mc_vd, zio);
+ return (mm->mm_preferred[0]);
+ }
+
+ if (mm->mm_preferred_cnt > 1) {
+ int c = vdev_mirror_preferred_child_randomize(zio);
+
+ vdev_queue_register_lastoffset(mm->mm_child[c].mc_vd, zio);
+ return (c);
+ }
+
+ /*
+ * Every device is either missing or has this txg in its DTL.
+ * Look for any child we haven't already tried before giving up.
+ */
+ for (c = 0; c < mm->mm_children; c++) {
+ if (!mm->mm_child[c].mc_tried) {
+ vdev_queue_register_lastoffset(mm->mm_child[c].mc_vd,
+ zio);
+ return (c);
+ }
+ }
+
+ /*
+ * Every child failed. There's no place left to look.
+ */
+ return (-1);
+}
+
+static void
+vdev_mirror_io_start(zio_t *zio)
+{
+ mirror_map_t *mm;
+ mirror_child_t *mc;
+ int c, children;
+
+ mm = vdev_mirror_map_init(zio);
+
+ if (mm == NULL) {
+ ASSERT(!spa_trust_config(zio->io_spa));
+ ASSERT(zio->io_type == ZIO_TYPE_READ);
+ zio_execute(zio);
+ return;
+ }
+
+ if (zio->io_type == ZIO_TYPE_READ) {
+ if (zio->io_bp != NULL &&
+ (zio->io_flags & ZIO_FLAG_SCRUB) && !mm->mm_resilvering &&
+ mm->mm_children > 1) {
+ /*
+ * For scrubbing reads (if we can verify the
+ * checksum here, as indicated by io_bp being
+ * non-NULL) we need to allocate a read buffer for
+ * each child and issue reads to all children. If
+ * any child succeeds, it will copy its data into
+			 * zio->io_abd in vdev_mirror_scrub_done().
+ */
+ for (c = 0; c < mm->mm_children; c++) {
+ mc = &mm->mm_child[c];
+ zio_nowait(zio_vdev_child_io(zio, zio->io_bp,
+ mc->mc_vd, mc->mc_offset,
+ abd_alloc_sametype(zio->io_abd,
+ zio->io_size), zio->io_size,
+ zio->io_type, zio->io_priority, 0,
+ vdev_mirror_scrub_done, mc));
+ }
+ zio_execute(zio);
+ return;
+ }
+ /*
+ * For normal reads just pick one child.
+ */
+ c = vdev_mirror_child_select(zio);
+ children = (c >= 0);
+ } else {
+ ASSERT(zio->io_type == ZIO_TYPE_WRITE ||
+ zio->io_type == ZIO_TYPE_FREE);
+
+ /*
+ * Writes and frees go to all children.
+ */
+ c = 0;
+ children = mm->mm_children;
+ }
+
+ while (children--) {
+ mc = &mm->mm_child[c];
+ zio_nowait(zio_vdev_child_io(zio, zio->io_bp,
+ mc->mc_vd, mc->mc_offset, zio->io_abd, zio->io_size,
+ zio->io_type, zio->io_priority, 0,
+ vdev_mirror_child_done, mc));
+ c++;
+ }
+
+ zio_execute(zio);
+}
+
+static int
+vdev_mirror_worst_error(mirror_map_t *mm)
+{
+ int error[2] = { 0, 0 };
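+	/*
+	 * error[0] accumulates the worst non-speculative error and error[1]
+	 * the worst speculative one (e.g. ESTALE from a DTL miss); the
+	 * non-speculative error takes precedence below when both exist.
+	 */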
+
+ for (int c = 0; c < mm->mm_children; c++) {
+ mirror_child_t *mc = &mm->mm_child[c];
+ int s = mc->mc_speculative;
+ error[s] = zio_worst_error(error[s], mc->mc_error);
+ }
+
+ return (error[0] ? error[0] : error[1]);
+}
+
+static void
+vdev_mirror_io_done(zio_t *zio)
+{
+ mirror_map_t *mm = zio->io_vsd;
+ mirror_child_t *mc;
+ int c;
+ int good_copies = 0;
+ int unexpected_errors = 0;
+
+ if (mm == NULL)
+ return;
+
+ for (c = 0; c < mm->mm_children; c++) {
+ mc = &mm->mm_child[c];
+
+ if (mc->mc_error) {
+ if (!mc->mc_skipped)
+ unexpected_errors++;
+ } else if (mc->mc_tried) {
+ good_copies++;
+ }
+ }
+
+ if (zio->io_type == ZIO_TYPE_WRITE) {
+ /*
+ * XXX -- for now, treat partial writes as success.
+ *
+ * Now that we support write reallocation, it would be better
+ * to treat partial failure as real failure unless there are
+ * no non-degraded top-level vdevs left, and not update DTLs
+ * if we intend to reallocate.
+ */
+ /* XXPOLICY */
+ if (good_copies != mm->mm_children) {
+ /*
+ * Always require at least one good copy.
+ *
+ * For ditto blocks (io_vd == NULL), require
+ * all copies to be good.
+ *
+ * XXX -- for replacing vdevs, there's no great answer.
+ * If the old device is really dead, we may not even
+ * be able to access it -- so we only want to
+ * require good writes to the new device. But if
+ * the new device turns out to be flaky, we want
+ * to be able to detach it -- which requires all
+ * writes to the old device to have succeeded.
+ */
+ if (good_copies == 0 || zio->io_vd == NULL)
+ zio->io_error = vdev_mirror_worst_error(mm);
+ }
+ return;
+ } else if (zio->io_type == ZIO_TYPE_FREE) {
+ return;
+ }
+
+ ASSERT(zio->io_type == ZIO_TYPE_READ);
+
+ /*
+ * If we don't have a good copy yet, keep trying other children.
+ */
+ /* XXPOLICY */
+ if (good_copies == 0 && (c = vdev_mirror_child_select(zio)) != -1) {
+ ASSERT(c >= 0 && c < mm->mm_children);
+ mc = &mm->mm_child[c];
+ zio_vdev_io_redone(zio);
+ zio_nowait(zio_vdev_child_io(zio, zio->io_bp,
+ mc->mc_vd, mc->mc_offset, zio->io_abd, zio->io_size,
+ ZIO_TYPE_READ, zio->io_priority, 0,
+ vdev_mirror_child_done, mc));
+ return;
+ }
+
+ /* XXPOLICY */
+ if (good_copies == 0) {
+ zio->io_error = vdev_mirror_worst_error(mm);
+ ASSERT(zio->io_error != 0);
+ }
+
+ if (good_copies && spa_writeable(zio->io_spa) &&
+ (unexpected_errors ||
+ (zio->io_flags & ZIO_FLAG_RESILVER) ||
+ ((zio->io_flags & ZIO_FLAG_SCRUB) && mm->mm_resilvering))) {
+ /*
+ * Use the good data we have in hand to repair damaged children.
+ */
+ for (c = 0; c < mm->mm_children; c++) {
+ /*
+ * Don't rewrite known good children.
+ * Not only is it unnecessary, it could
+ * actually be harmful: if the system lost
+ * power while rewriting the only good copy,
+ * there would be no good copies left!
+ */
+ mc = &mm->mm_child[c];
+
+ if (mc->mc_error == 0) {
+ if (mc->mc_tried)
+ continue;
+ /*
+ * We didn't try this child. We need to
+ * repair it if:
+ * 1. it's a scrub (in which case we have
+ * tried everything that was healthy)
+ * - or -
+ * 2. it's an indirect vdev (in which case
+ * it could point to any other vdev, which
+ * might have a bad DTL)
+ * - or -
+ * 3. the DTL indicates that this data is
+ * missing from this vdev
+ */
+ if (!(zio->io_flags & ZIO_FLAG_SCRUB) &&
+ mc->mc_vd->vdev_ops != &vdev_indirect_ops &&
+ !vdev_dtl_contains(mc->mc_vd, DTL_PARTIAL,
+ zio->io_txg, 1))
+ continue;
+ mc->mc_error = SET_ERROR(ESTALE);
+ }
+
+ zio_nowait(zio_vdev_child_io(zio, zio->io_bp,
+ mc->mc_vd, mc->mc_offset,
+ zio->io_abd, zio->io_size,
+ ZIO_TYPE_WRITE, ZIO_PRIORITY_ASYNC_WRITE,
+ ZIO_FLAG_IO_REPAIR | (unexpected_errors ?
+ ZIO_FLAG_SELF_HEAL : 0), NULL, NULL));
+ }
+ }
+}
+
+static void
+vdev_mirror_state_change(vdev_t *vd, int faulted, int degraded)
+{
+ if (faulted == vd->vdev_children) {
+ if (vdev_children_are_offline(vd)) {
+ vdev_set_state(vd, B_FALSE, VDEV_STATE_OFFLINE,
+ VDEV_AUX_CHILDREN_OFFLINE);
+ } else {
+ vdev_set_state(vd, B_FALSE, VDEV_STATE_CANT_OPEN,
+ VDEV_AUX_NO_REPLICAS);
+ }
+ } else if (degraded + faulted != 0) {
+ vdev_set_state(vd, B_FALSE, VDEV_STATE_DEGRADED, VDEV_AUX_NONE);
+ } else {
+ vdev_set_state(vd, B_FALSE, VDEV_STATE_HEALTHY, VDEV_AUX_NONE);
+ }
+}
+
+vdev_ops_t vdev_mirror_ops = {
+ vdev_mirror_open,
+ vdev_mirror_close,
+ vdev_default_asize,
+ vdev_mirror_io_start,
+ vdev_mirror_io_done,
+ vdev_mirror_state_change,
+ NULL,
+ NULL,
+ NULL,
+ NULL,
+ vdev_default_xlate,
+ VDEV_TYPE_MIRROR, /* name of this vdev type */
+ B_FALSE /* not a leaf vdev */
+};
+
+vdev_ops_t vdev_replacing_ops = {
+ vdev_mirror_open,
+ vdev_mirror_close,
+ vdev_default_asize,
+ vdev_mirror_io_start,
+ vdev_mirror_io_done,
+ vdev_mirror_state_change,
+ NULL,
+ NULL,
+ NULL,
+ NULL,
+ vdev_default_xlate,
+ VDEV_TYPE_REPLACING, /* name of this vdev type */
+ B_FALSE /* not a leaf vdev */
+};
+
+vdev_ops_t vdev_spare_ops = {
+ vdev_mirror_open,
+ vdev_mirror_close,
+ vdev_default_asize,
+ vdev_mirror_io_start,
+ vdev_mirror_io_done,
+ vdev_mirror_state_change,
+ NULL,
+ NULL,
+ NULL,
+ NULL,
+ vdev_default_xlate,
+ VDEV_TYPE_SPARE, /* name of this vdev type */
+ B_FALSE /* not a leaf vdev */
+};
diff --git a/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/vdev_missing.c b/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/vdev_missing.c
new file mode 100644
index 000000000000..6852de445049
--- /dev/null
+++ b/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/vdev_missing.c
@@ -0,0 +1,113 @@
+/*
+ * CDDL HEADER START
+ *
+ * The contents of this file are subject to the terms of the
+ * Common Development and Distribution License (the "License").
+ * You may not use this file except in compliance with the License.
+ *
+ * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
+ * or http://www.opensolaris.org/os/licensing.
+ * See the License for the specific language governing permissions
+ * and limitations under the License.
+ *
+ * When distributing Covered Code, include this CDDL HEADER in each
+ * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
+ * If applicable, add the following below this CDDL HEADER, with the
+ * fields enclosed by brackets "[]" replaced with your own identifying
+ * information: Portions Copyright [yyyy] [name of copyright owner]
+ *
+ * CDDL HEADER END
+ */
+/*
+ * Copyright 2010 Sun Microsystems, Inc. All rights reserved.
+ * Use is subject to license terms.
+ */
+
+/*
+ * Copyright (c) 2012, 2016 by Delphix. All rights reserved.
+ */
+
+/*
+ * The 'missing' vdev is a special vdev type used only during import. It
+ * signifies a placeholder in the root vdev for some vdev that we know is
+ * missing. We pass it down to the kernel to allow the rest of the
+ * configuration to be parsed and an attempt made to open all available devices.
+ * Because its GUID is always 0, we know that the guid sum will mismatch and we
+ * won't be able to open the pool anyway.
+ */
+
+#include <sys/zfs_context.h>
+#include <sys/spa.h>
+#include <sys/vdev_impl.h>
+#include <sys/fs/zfs.h>
+#include <sys/zio.h>
+
+/* ARGSUSED */
+static int
+vdev_missing_open(vdev_t *vd, uint64_t *psize, uint64_t *max_psize,
+ uint64_t *logical_ashift, uint64_t *physical_ashift)
+{
+ /*
+ * Really this should just fail. But then the root vdev will be in the
+ * faulted state with VDEV_AUX_NO_REPLICAS, when what we really want is
+ * VDEV_AUX_BAD_GUID_SUM. So we pretend to succeed, knowing that we
+ * will fail the GUID sum check before ever trying to open the pool.
+ */
+ *psize = 0;
+ *max_psize = 0;
+ *logical_ashift = 0;
+ *physical_ashift = 0;
+ return (0);
+}
+
+/* ARGSUSED */
+static void
+vdev_missing_close(vdev_t *vd)
+{
+}
+
+/* ARGSUSED */
+static void
+vdev_missing_io_start(zio_t *zio)
+{
+ zio->io_error = SET_ERROR(ENOTSUP);
+ zio_execute(zio);
+}
+
+/* ARGSUSED */
+static void
+vdev_missing_io_done(zio_t *zio)
+{
+}
+
+vdev_ops_t vdev_missing_ops = {
+ vdev_missing_open,
+ vdev_missing_close,
+ vdev_default_asize,
+ vdev_missing_io_start,
+ vdev_missing_io_done,
+ NULL,
+ NULL,
+ NULL,
+ NULL,
+ NULL,
+ NULL,
+ VDEV_TYPE_MISSING, /* name of this vdev type */
+ B_TRUE /* leaf vdev */
+};
+
+vdev_ops_t vdev_hole_ops = {
+ vdev_missing_open,
+ vdev_missing_close,
+ vdev_default_asize,
+ vdev_missing_io_start,
+ vdev_missing_io_done,
+ NULL,
+ NULL,
+ NULL,
+ NULL,
+ NULL,
+ NULL,
+ VDEV_TYPE_HOLE, /* name of this vdev type */
+ B_TRUE /* leaf vdev */
+};
diff --git a/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/vdev_queue.c b/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/vdev_queue.c
new file mode 100644
index 000000000000..71e3a1fd54bc
--- /dev/null
+++ b/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/vdev_queue.c
@@ -0,0 +1,1047 @@
+/*
+ * CDDL HEADER START
+ *
+ * The contents of this file are subject to the terms of the
+ * Common Development and Distribution License (the "License").
+ * You may not use this file except in compliance with the License.
+ *
+ * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
+ * or http://www.opensolaris.org/os/licensing.
+ * See the License for the specific language governing permissions
+ * and limitations under the License.
+ *
+ * When distributing Covered Code, include this CDDL HEADER in each
+ * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
+ * If applicable, add the following below this CDDL HEADER, with the
+ * fields enclosed by brackets "[]" replaced with your own identifying
+ * information: Portions Copyright [yyyy] [name of copyright owner]
+ *
+ * CDDL HEADER END
+ */
+/*
+ * Copyright 2009 Sun Microsystems, Inc. All rights reserved.
+ * Use is subject to license terms.
+ */
+
+/*
+ * Copyright (c) 2012, 2018 by Delphix. All rights reserved.
+ * Copyright (c) 2014 Integros [integros.com]
+ */
+
+#include <sys/zfs_context.h>
+#include <sys/vdev_impl.h>
+#include <sys/spa_impl.h>
+#include <sys/zio.h>
+#include <sys/avl.h>
+#include <sys/dsl_pool.h>
+#include <sys/metaslab_impl.h>
+#include <sys/abd.h>
+
+/*
+ * ZFS I/O Scheduler
+ * -----------------
+ *
+ * ZFS issues I/O operations to leaf vdevs to satisfy and complete zios. The
+ * I/O scheduler determines when and in what order those operations are
+ * issued. The I/O scheduler divides operations into six I/O classes
+ * prioritized in the following order: sync read, sync write, async read,
+ * async write, scrub/resilver and trim. Each queue defines the minimum and
+ * maximum number of concurrent operations that may be issued to the device.
+ * In addition, the device has an aggregate maximum. Note that the sum of the
+ * per-queue minimums must not exceed the aggregate maximum, and if the
+ * aggregate maximum is equal to or greater than the sum of the per-queue
+ * maximums, the per-queue minimum has no effect.
+ *
+ * For many physical devices, throughput increases with the number of
+ * concurrent operations, but latency typically suffers. Further, physical
+ * devices typically have a limit at which more concurrent operations have no
+ * effect on throughput or can actually cause it to decrease.
+ *
+ * The scheduler selects the next operation to issue by first looking for an
+ * I/O class whose minimum has not been satisfied. Once all are satisfied and
+ * the aggregate maximum has not been hit, the scheduler looks for classes
+ * whose maximum has not been satisfied. Iteration through the I/O classes is
+ * done in the order specified above. No further operations are issued if the
+ * aggregate maximum number of concurrent operations has been hit or if there
+ * are no operations queued for an I/O class that has not hit its maximum.
+ * Every time an I/O is queued or an operation completes, the I/O scheduler
+ * looks for new operations to issue.
+ *
+ * All I/O classes have a fixed maximum number of outstanding operations
+ * except for the async write class. Asynchronous writes represent the data
+ * that is committed to stable storage during the syncing stage for
+ * transaction groups (see txg.c). Transaction groups enter the syncing state
+ * periodically so the number of queued async writes will quickly burst up and
+ * then bleed down to zero. Rather than servicing them as quickly as possible,
+ * the I/O scheduler changes the maximum number of active async write I/Os
+ * according to the amount of dirty data in the pool (see dsl_pool.c). Since
+ * both throughput and latency typically increase with the number of
+ * concurrent operations issued to physical devices, reducing the burstiness
+ * in the number of concurrent operations also stabilizes the response time of
+ * operations from other -- and in particular synchronous -- queues. In broad
+ * strokes, the I/O scheduler will issue more concurrent operations from the
+ * async write queue as there's more dirty data in the pool.
+ *
+ * Async Writes
+ *
+ * The number of concurrent operations issued for the async write I/O class
+ * follows a piece-wise linear function defined by a few adjustable points.
+ *
+ * | o---------| <-- zfs_vdev_async_write_max_active
+ * ^ | /^ |
+ * | | / | |
+ * active | / | |
+ * I/O | / | |
+ * count | / | |
+ * | / | |
+ * |------------o | | <-- zfs_vdev_async_write_min_active
+ * 0|____________^______|_________|
+ * 0% | | 100% of zfs_dirty_data_max
+ * | |
+ * | `-- zfs_vdev_async_write_active_max_dirty_percent
+ * `--------- zfs_vdev_async_write_active_min_dirty_percent
+ *
+ * Until the amount of dirty data exceeds a minimum percentage of the dirty
+ * data allowed in the pool, the I/O scheduler will limit the number of
+ * concurrent operations to the minimum. As that threshold is crossed, the
+ * number of concurrent operations issued increases linearly to the maximum at
+ * the specified maximum percentage of the dirty data allowed in the pool.
+ *
+ * Ideally, the amount of dirty data on a busy pool will stay in the sloped
+ * part of the function between zfs_vdev_async_write_active_min_dirty_percent
+ * and zfs_vdev_async_write_active_max_dirty_percent. If it exceeds the
+ * maximum percentage, this indicates that the rate of incoming data is
+ * greater than the rate that the backend storage can handle. In this case, we
+ * must further throttle incoming writes (see dmu_tx_delay() for details).
+ */
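+
+/*
+ * Worked example of the interpolation above (default tunables assumed):
+ * with zfs_vdev_async_write_min_active = 1, max_active = 10, and the
+ * slope spanning 30%-60% of zfs_dirty_data_max, a pool at 45% dirty sits
+ * halfway up the slope, so roughly
+ * 1 + (45 - 30) * (10 - 1) / (60 - 30) = 5 (integer math) concurrent
+ * async writes are issued.
+ */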
+
+/*
+ * The maximum number of I/Os active to each device. Ideally, this will be >=
+ * the sum of each queue's max_active. It must be at least the sum of each
+ * queue's min_active.
+ */
+uint32_t zfs_vdev_max_active = 1000;
+
+/*
+ * Per-queue limits on the number of I/Os active to each device. If the
+ * sum of the queue's max_active is < zfs_vdev_max_active, then the
+ * min_active comes into play. We will send min_active from each queue,
+ * and then select from queues in the order defined by zio_priority_t.
+ *
+ * In general, smaller max_active's will lead to lower latency of synchronous
+ * operations. Larger max_active's may lead to higher overall throughput,
+ * depending on underlying storage.
+ *
+ * The ratio of the queues' max_actives determines the balance of performance
+ * between reads, writes, and scrubs. E.g., increasing
+ * zfs_vdev_scrub_max_active will cause the scrub or resilver to complete
+ * more quickly, but reads and writes to have higher latency and lower
+ * throughput.
+ */
+uint32_t zfs_vdev_sync_read_min_active = 10;
+uint32_t zfs_vdev_sync_read_max_active = 10;
+uint32_t zfs_vdev_sync_write_min_active = 10;
+uint32_t zfs_vdev_sync_write_max_active = 10;
+uint32_t zfs_vdev_async_read_min_active = 1;
+uint32_t zfs_vdev_async_read_max_active = 3;
+uint32_t zfs_vdev_async_write_min_active = 1;
+uint32_t zfs_vdev_async_write_max_active = 10;
+uint32_t zfs_vdev_scrub_min_active = 1;
+uint32_t zfs_vdev_scrub_max_active = 2;
+uint32_t zfs_vdev_trim_min_active = 1;
+/*
+ * TRIM max active is large in comparison to the other values because
+ * TRIM I/Os are coalesced at the device layer. This value is set such
+ * that a typical SSD can process the queued I/Os in a single request.
+ */
+uint32_t zfs_vdev_trim_max_active = 64;
+uint32_t zfs_vdev_removal_min_active = 1;
+uint32_t zfs_vdev_removal_max_active = 2;
+uint32_t zfs_vdev_initializing_min_active = 1;
+uint32_t zfs_vdev_initializing_max_active = 1;
+
+
+/*
+ * When the pool has less than zfs_vdev_async_write_active_min_dirty_percent
+ * dirty data, use zfs_vdev_async_write_min_active. When it has more than
+ * zfs_vdev_async_write_active_max_dirty_percent, use
+ * zfs_vdev_async_write_max_active. The value is linearly interpolated
+ * between min and max.
+ */
+int zfs_vdev_async_write_active_min_dirty_percent = 30;
+int zfs_vdev_async_write_active_max_dirty_percent = 60;
+
+/*
+ * To reduce IOPs, we aggregate small adjacent I/Os into one large I/O.
+ * For read I/Os, we also aggregate across small adjacency gaps; for writes
+ * we include spans of optional I/Os to aid aggregation at the disk even when
+ * they aren't able to help us aggregate at this level.
+ */
+int zfs_vdev_aggregation_limit = 1 << 20;
+int zfs_vdev_aggregation_limit_non_rotating = SPA_OLD_MAXBLOCKSIZE;
+int zfs_vdev_read_gap_limit = 32 << 10;
+int zfs_vdev_write_gap_limit = 4 << 10;
+
+/*
+ * Define the queue depth percentage for each top-level vdev. This percentage is
+ * used in conjunction with zfs_vdev_async_max_active to determine how many
+ * allocations a specific top-level vdev should handle. Once the queue depth
+ * reaches zfs_vdev_queue_depth_pct * zfs_vdev_async_write_max_active / 100
+ * then the allocator will stop allocating blocks on that top-level device.
+ * The default kernel setting is 1000% which will yield 100 allocations per
+ * device. For userland testing, the default setting is 300% which equates
+ * to 30 allocations per device.
+ */
+#ifdef _KERNEL
+int zfs_vdev_queue_depth_pct = 1000;
+#else
+int zfs_vdev_queue_depth_pct = 300;
+#endif
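+
+/*
+ * Illustrative arithmetic for the defaults above: the allocator cutoff
+ * works out to zfs_vdev_queue_depth_pct * zfs_vdev_async_write_max_active
+ * / 100, i.e. 1000 * 10 / 100 = 100 queued allocations per top-level
+ * vdev in the kernel.
+ */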
+
+/*
+ * When performing allocations for a given metaslab, we want to make sure that
+ * there are enough IOs to aggregate together to improve throughput. We want to
+ * ensure that there are at least 128k worth of IOs that can be aggregated, and
+ * we assume that the average allocation size is 4k, so we need the queue depth
+ * to be 32 per allocator to get good aggregation of sequential writes.
+ */
+int zfs_vdev_def_queue_depth = 32;
+
+#ifdef __FreeBSD__
+#ifdef _KERNEL
+SYSCTL_DECL(_vfs_zfs_vdev);
+
+static int sysctl_zfs_async_write_active_min_dirty_percent(SYSCTL_HANDLER_ARGS);
+SYSCTL_PROC(_vfs_zfs_vdev, OID_AUTO, async_write_active_min_dirty_percent,
+ CTLTYPE_UINT | CTLFLAG_MPSAFE | CTLFLAG_RWTUN, 0, sizeof(int),
+ sysctl_zfs_async_write_active_min_dirty_percent, "I",
+ "Percentage of async write dirty data below which "
+ "async_write_min_active is used.");
+
+static int sysctl_zfs_async_write_active_max_dirty_percent(SYSCTL_HANDLER_ARGS);
+SYSCTL_PROC(_vfs_zfs_vdev, OID_AUTO, async_write_active_max_dirty_percent,
+ CTLTYPE_UINT | CTLFLAG_MPSAFE | CTLFLAG_RWTUN, 0, sizeof(int),
+ sysctl_zfs_async_write_active_max_dirty_percent, "I",
+ "Percentage of async write dirty data above which "
+ "async_write_max_active is used.");
+
+SYSCTL_UINT(_vfs_zfs_vdev, OID_AUTO, max_active, CTLFLAG_RWTUN,
+ &zfs_vdev_max_active, 0,
+ "The maximum number of I/Os of all types active for each device.");
+
+#define ZFS_VDEV_QUEUE_KNOB_MIN(name) \
+SYSCTL_UINT(_vfs_zfs_vdev, OID_AUTO, name ## _min_active, CTLFLAG_RWTUN,\
+ &zfs_vdev_ ## name ## _min_active, 0, \
+ "Initial number of I/O requests of type " #name \
+ " active for each device");
+
+#define ZFS_VDEV_QUEUE_KNOB_MAX(name) \
+SYSCTL_UINT(_vfs_zfs_vdev, OID_AUTO, name ## _max_active, CTLFLAG_RWTUN,\
+ &zfs_vdev_ ## name ## _max_active, 0, \
+ "Maximum number of I/O requests of type " #name \
+ " active for each device");
+
+ZFS_VDEV_QUEUE_KNOB_MIN(sync_read);
+ZFS_VDEV_QUEUE_KNOB_MAX(sync_read);
+ZFS_VDEV_QUEUE_KNOB_MIN(sync_write);
+ZFS_VDEV_QUEUE_KNOB_MAX(sync_write);
+ZFS_VDEV_QUEUE_KNOB_MIN(async_read);
+ZFS_VDEV_QUEUE_KNOB_MAX(async_read);
+ZFS_VDEV_QUEUE_KNOB_MIN(async_write);
+ZFS_VDEV_QUEUE_KNOB_MAX(async_write);
+ZFS_VDEV_QUEUE_KNOB_MIN(scrub);
+ZFS_VDEV_QUEUE_KNOB_MAX(scrub);
+ZFS_VDEV_QUEUE_KNOB_MIN(trim);
+ZFS_VDEV_QUEUE_KNOB_MAX(trim);
+ZFS_VDEV_QUEUE_KNOB_MIN(removal);
+ZFS_VDEV_QUEUE_KNOB_MAX(removal);
+ZFS_VDEV_QUEUE_KNOB_MIN(initializing);
+ZFS_VDEV_QUEUE_KNOB_MAX(initializing);
+
+#undef ZFS_VDEV_QUEUE_KNOB_MIN
+#undef ZFS_VDEV_QUEUE_KNOB_MAX
+
+SYSCTL_INT(_vfs_zfs_vdev, OID_AUTO, aggregation_limit, CTLFLAG_RWTUN,
+ &zfs_vdev_aggregation_limit, 0,
+ "I/O requests are aggregated up to this size");
+SYSCTL_INT(_vfs_zfs_vdev, OID_AUTO, aggregation_limit_non_rotating, CTLFLAG_RWTUN,
+ &zfs_vdev_aggregation_limit_non_rotating, 0,
+ "I/O requests are aggregated up to this size for non-rotating media");
+SYSCTL_INT(_vfs_zfs_vdev, OID_AUTO, read_gap_limit, CTLFLAG_RWTUN,
+ &zfs_vdev_read_gap_limit, 0,
+ "Acceptable gap between two reads being aggregated");
+SYSCTL_INT(_vfs_zfs_vdev, OID_AUTO, write_gap_limit, CTLFLAG_RWTUN,
+ &zfs_vdev_write_gap_limit, 0,
+ "Acceptable gap between two writes being aggregated");
+SYSCTL_INT(_vfs_zfs_vdev, OID_AUTO, queue_depth_pct, CTLFLAG_RWTUN,
+ &zfs_vdev_queue_depth_pct, 0,
+ "Queue depth percentage for each top-level");
+SYSCTL_INT(_vfs_zfs_vdev, OID_AUTO, def_queue_depth, CTLFLAG_RWTUN,
+ &zfs_vdev_def_queue_depth, 0,
+ "Default queue depth for each allocator");
+
+static int
+sysctl_zfs_async_write_active_min_dirty_percent(SYSCTL_HANDLER_ARGS)
+{
+ int val, err;
+
+ val = zfs_vdev_async_write_active_min_dirty_percent;
+ err = sysctl_handle_int(oidp, &val, 0, req);
+ if (err != 0 || req->newptr == NULL)
+ return (err);
+
+ if (val < 0 || val > 100 ||
+ val >= zfs_vdev_async_write_active_max_dirty_percent)
+ return (EINVAL);
+
+ zfs_vdev_async_write_active_min_dirty_percent = val;
+
+ return (0);
+}
+
+static int
+sysctl_zfs_async_write_active_max_dirty_percent(SYSCTL_HANDLER_ARGS)
+{
+ int val, err;
+
+ val = zfs_vdev_async_write_active_max_dirty_percent;
+ err = sysctl_handle_int(oidp, &val, 0, req);
+ if (err != 0 || req->newptr == NULL)
+ return (err);
+
+ if (val < 0 || val > 100 ||
+ val <= zfs_vdev_async_write_active_min_dirty_percent)
+ return (EINVAL);
+
+ zfs_vdev_async_write_active_max_dirty_percent = val;
+
+ return (0);
+}
+#endif
+#endif
+
+int
+vdev_queue_offset_compare(const void *x1, const void *x2)
+{
+ const zio_t *z1 = (const zio_t *)x1;
+ const zio_t *z2 = (const zio_t *)x2;
+
+ int cmp = AVL_CMP(z1->io_offset, z2->io_offset);
+
+ if (likely(cmp))
+ return (cmp);
+
+ return (AVL_PCMP(z1, z2));
+}
+
+static inline avl_tree_t *
+vdev_queue_class_tree(vdev_queue_t *vq, zio_priority_t p)
+{
+ return (&vq->vq_class[p].vqc_queued_tree);
+}
+
+static inline avl_tree_t *
+vdev_queue_type_tree(vdev_queue_t *vq, zio_type_t t)
+{
+ if (t == ZIO_TYPE_READ)
+ return (&vq->vq_read_offset_tree);
+ else if (t == ZIO_TYPE_WRITE)
+ return (&vq->vq_write_offset_tree);
+ else
+ return (NULL);
+}
+
+int
+vdev_queue_timestamp_compare(const void *x1, const void *x2)
+{
+ const zio_t *z1 = (const zio_t *)x1;
+ const zio_t *z2 = (const zio_t *)x2;
+
+ int cmp = AVL_CMP(z1->io_timestamp, z2->io_timestamp);
+
+ if (likely(cmp))
+ return (cmp);
+
+ return (AVL_PCMP(z1, z2));
+}
+
+void
+vdev_queue_init(vdev_t *vd)
+{
+ vdev_queue_t *vq = &vd->vdev_queue;
+
+ mutex_init(&vq->vq_lock, NULL, MUTEX_DEFAULT, NULL);
+ vq->vq_vdev = vd;
+
+ avl_create(&vq->vq_active_tree, vdev_queue_offset_compare,
+ sizeof (zio_t), offsetof(struct zio, io_queue_node));
+ avl_create(vdev_queue_type_tree(vq, ZIO_TYPE_READ),
+ vdev_queue_offset_compare, sizeof (zio_t),
+ offsetof(struct zio, io_offset_node));
+ avl_create(vdev_queue_type_tree(vq, ZIO_TYPE_WRITE),
+ vdev_queue_offset_compare, sizeof (zio_t),
+ offsetof(struct zio, io_offset_node));
+
+ for (zio_priority_t p = 0; p < ZIO_PRIORITY_NUM_QUEUEABLE; p++) {
+ int (*compfn) (const void *, const void *);
+
+ /*
+ * The synchronous i/o queues are dispatched in FIFO rather
+ * than LBA order. This provides more consistent latency for
+ * these i/os.
+ */
+ if (p == ZIO_PRIORITY_SYNC_READ || p == ZIO_PRIORITY_SYNC_WRITE)
+ compfn = vdev_queue_timestamp_compare;
+ else
+ compfn = vdev_queue_offset_compare;
+
+ avl_create(vdev_queue_class_tree(vq, p), compfn,
+ sizeof (zio_t), offsetof(struct zio, io_queue_node));
+ }
+
+ vq->vq_lastoffset = 0;
+}
+
+void
+vdev_queue_fini(vdev_t *vd)
+{
+ vdev_queue_t *vq = &vd->vdev_queue;
+
+ for (zio_priority_t p = 0; p < ZIO_PRIORITY_NUM_QUEUEABLE; p++)
+ avl_destroy(vdev_queue_class_tree(vq, p));
+ avl_destroy(&vq->vq_active_tree);
+ avl_destroy(vdev_queue_type_tree(vq, ZIO_TYPE_READ));
+ avl_destroy(vdev_queue_type_tree(vq, ZIO_TYPE_WRITE));
+
+ mutex_destroy(&vq->vq_lock);
+}
+
+static void
+vdev_queue_io_add(vdev_queue_t *vq, zio_t *zio)
+{
+ spa_t *spa = zio->io_spa;
+ avl_tree_t *qtt;
+
+ ASSERT(MUTEX_HELD(&vq->vq_lock));
+ ASSERT3U(zio->io_priority, <, ZIO_PRIORITY_NUM_QUEUEABLE);
+ avl_add(vdev_queue_class_tree(vq, zio->io_priority), zio);
+ qtt = vdev_queue_type_tree(vq, zio->io_type);
+ if (qtt)
+ avl_add(qtt, zio);
+
+#ifdef illumos
+ mutex_enter(&spa->spa_iokstat_lock);
+ spa->spa_queue_stats[zio->io_priority].spa_queued++;
+ if (spa->spa_iokstat != NULL)
+ kstat_waitq_enter(spa->spa_iokstat->ks_data);
+ mutex_exit(&spa->spa_iokstat_lock);
+#endif
+}
+
+static void
+vdev_queue_io_remove(vdev_queue_t *vq, zio_t *zio)
+{
+ spa_t *spa = zio->io_spa;
+ avl_tree_t *qtt;
+
+ ASSERT(MUTEX_HELD(&vq->vq_lock));
+ ASSERT3U(zio->io_priority, <, ZIO_PRIORITY_NUM_QUEUEABLE);
+ avl_remove(vdev_queue_class_tree(vq, zio->io_priority), zio);
+ qtt = vdev_queue_type_tree(vq, zio->io_type);
+ if (qtt)
+ avl_remove(qtt, zio);
+
+#ifdef illumos
+ mutex_enter(&spa->spa_iokstat_lock);
+ ASSERT3U(spa->spa_queue_stats[zio->io_priority].spa_queued, >, 0);
+ spa->spa_queue_stats[zio->io_priority].spa_queued--;
+ if (spa->spa_iokstat != NULL)
+ kstat_waitq_exit(spa->spa_iokstat->ks_data);
+ mutex_exit(&spa->spa_iokstat_lock);
+#endif
+}
+
+static void
+vdev_queue_pending_add(vdev_queue_t *vq, zio_t *zio)
+{
+ spa_t *spa = zio->io_spa;
+ ASSERT(MUTEX_HELD(&vq->vq_lock));
+ ASSERT3U(zio->io_priority, <, ZIO_PRIORITY_NUM_QUEUEABLE);
+ vq->vq_class[zio->io_priority].vqc_active++;
+ avl_add(&vq->vq_active_tree, zio);
+
+#ifdef illumos
+ mutex_enter(&spa->spa_iokstat_lock);
+ spa->spa_queue_stats[zio->io_priority].spa_active++;
+ if (spa->spa_iokstat != NULL)
+ kstat_runq_enter(spa->spa_iokstat->ks_data);
+ mutex_exit(&spa->spa_iokstat_lock);
+#endif
+}
+
+static void
+vdev_queue_pending_remove(vdev_queue_t *vq, zio_t *zio)
+{
+ spa_t *spa = zio->io_spa;
+ ASSERT(MUTEX_HELD(&vq->vq_lock));
+ ASSERT3U(zio->io_priority, <, ZIO_PRIORITY_NUM_QUEUEABLE);
+ vq->vq_class[zio->io_priority].vqc_active--;
+ avl_remove(&vq->vq_active_tree, zio);
+
+#ifdef illumos
+ mutex_enter(&spa->spa_iokstat_lock);
+ ASSERT3U(spa->spa_queue_stats[zio->io_priority].spa_active, >, 0);
+ spa->spa_queue_stats[zio->io_priority].spa_active--;
+ if (spa->spa_iokstat != NULL) {
+ kstat_io_t *ksio = spa->spa_iokstat->ks_data;
+
+ kstat_runq_exit(spa->spa_iokstat->ks_data);
+ if (zio->io_type == ZIO_TYPE_READ) {
+ ksio->reads++;
+ ksio->nread += zio->io_size;
+ } else if (zio->io_type == ZIO_TYPE_WRITE) {
+ ksio->writes++;
+ ksio->nwritten += zio->io_size;
+ }
+ }
+ mutex_exit(&spa->spa_iokstat_lock);
+#endif
+}
+
+static void
+vdev_queue_agg_io_done(zio_t *aio)
+{
+ if (aio->io_type == ZIO_TYPE_READ) {
+ zio_t *pio;
+ zio_link_t *zl = NULL;
+ while ((pio = zio_walk_parents(aio, &zl)) != NULL) {
+ abd_copy_off(pio->io_abd, aio->io_abd,
+ 0, pio->io_offset - aio->io_offset, pio->io_size);
+ }
+ }
+
+ abd_free(aio->io_abd);
+}
+
+static int
+vdev_queue_class_min_active(zio_priority_t p)
+{
+ switch (p) {
+ case ZIO_PRIORITY_SYNC_READ:
+ return (zfs_vdev_sync_read_min_active);
+ case ZIO_PRIORITY_SYNC_WRITE:
+ return (zfs_vdev_sync_write_min_active);
+ case ZIO_PRIORITY_ASYNC_READ:
+ return (zfs_vdev_async_read_min_active);
+ case ZIO_PRIORITY_ASYNC_WRITE:
+ return (zfs_vdev_async_write_min_active);
+ case ZIO_PRIORITY_SCRUB:
+ return (zfs_vdev_scrub_min_active);
+ case ZIO_PRIORITY_TRIM:
+ return (zfs_vdev_trim_min_active);
+ case ZIO_PRIORITY_REMOVAL:
+ return (zfs_vdev_removal_min_active);
+ case ZIO_PRIORITY_INITIALIZING:
+ return (zfs_vdev_initializing_min_active);
+ default:
+ panic("invalid priority %u", p);
+ return (0);
+ }
+}
+
+static __noinline int
+vdev_queue_max_async_writes(spa_t *spa)
+{
+ int writes;
+ uint64_t dirty = spa->spa_dsl_pool->dp_dirty_total;
+ uint64_t min_bytes = zfs_dirty_data_max *
+ zfs_vdev_async_write_active_min_dirty_percent / 100;
+ uint64_t max_bytes = zfs_dirty_data_max *
+ zfs_vdev_async_write_active_max_dirty_percent / 100;
+
+ /*
+ * Sync tasks correspond to interactive user actions. To reduce the
+ * execution time of those actions we push data out as fast as possible.
+ */
+ if (spa_has_pending_synctask(spa)) {
+ return (zfs_vdev_async_write_max_active);
+ }
+
+ if (dirty < min_bytes)
+ return (zfs_vdev_async_write_min_active);
+ if (dirty > max_bytes)
+ return (zfs_vdev_async_write_max_active);
+
+ /*
+ * linear interpolation:
+ * slope = (max_writes - min_writes) / (max_bytes - min_bytes)
+ * move right by min_bytes
+ * move up by min_writes
+ */
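+	/*
+	 * Illustrative example (hypothetical tunable values): with
+	 * min_active = 1, max_active = 10, min_bytes = 3G and
+	 * max_bytes = 6G, a pool holding 5G of dirty data is allowed
+	 * (5G - 3G) * (10 - 1) / (6G - 3G) + 1 = 7 concurrent
+	 * async writes.
+	 */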
+ writes = (dirty - min_bytes) *
+ (zfs_vdev_async_write_max_active -
+ zfs_vdev_async_write_min_active) /
+ (max_bytes - min_bytes) +
+ zfs_vdev_async_write_min_active;
+ ASSERT3U(writes, >=, zfs_vdev_async_write_min_active);
+ ASSERT3U(writes, <=, zfs_vdev_async_write_max_active);
+ return (writes);
+}
+
+static int
+vdev_queue_class_max_active(spa_t *spa, zio_priority_t p)
+{
+ switch (p) {
+ case ZIO_PRIORITY_SYNC_READ:
+ return (zfs_vdev_sync_read_max_active);
+ case ZIO_PRIORITY_SYNC_WRITE:
+ return (zfs_vdev_sync_write_max_active);
+ case ZIO_PRIORITY_ASYNC_READ:
+ return (zfs_vdev_async_read_max_active);
+ case ZIO_PRIORITY_ASYNC_WRITE:
+ return (vdev_queue_max_async_writes(spa));
+ case ZIO_PRIORITY_SCRUB:
+ return (zfs_vdev_scrub_max_active);
+ case ZIO_PRIORITY_TRIM:
+ return (zfs_vdev_trim_max_active);
+ case ZIO_PRIORITY_REMOVAL:
+ return (zfs_vdev_removal_max_active);
+ case ZIO_PRIORITY_INITIALIZING:
+ return (zfs_vdev_initializing_max_active);
+ default:
+ panic("invalid priority %u", p);
+ return (0);
+ }
+}
+
+/*
+ * Return the i/o class to issue from, or ZIO_PRIORITY_NUM_QUEUEABLE if
+ * there is no eligible class.
+ */
+static zio_priority_t
+vdev_queue_class_to_issue(vdev_queue_t *vq)
+{
+ spa_t *spa = vq->vq_vdev->vdev_spa;
+ zio_priority_t p;
+
+ ASSERT(MUTEX_HELD(&vq->vq_lock));
+
+ if (avl_numnodes(&vq->vq_active_tree) >= zfs_vdev_max_active)
+ return (ZIO_PRIORITY_NUM_QUEUEABLE);
+
+ /* find a queue that has not reached its minimum # outstanding i/os */
+ for (p = 0; p < ZIO_PRIORITY_NUM_QUEUEABLE; p++) {
+ if (avl_numnodes(vdev_queue_class_tree(vq, p)) > 0 &&
+ vq->vq_class[p].vqc_active <
+ vdev_queue_class_min_active(p))
+ return (p);
+ }
+
+ /*
+ * If we haven't found a queue, look for one that hasn't reached its
+ * maximum # outstanding i/os.
+ */
+ for (p = 0; p < ZIO_PRIORITY_NUM_QUEUEABLE; p++) {
+ if (avl_numnodes(vdev_queue_class_tree(vq, p)) > 0 &&
+ vq->vq_class[p].vqc_active <
+ vdev_queue_class_max_active(spa, p))
+ return (p);
+ }
+
+ /* No eligible queued i/os */
+ return (ZIO_PRIORITY_NUM_QUEUEABLE);
+}
+
+/*
+ * Compute the range spanned by two i/os, which is the endpoint of the last
+ * (lio->io_offset + lio->io_size) minus start of the first (fio->io_offset).
+ * Conveniently, the gap between fio and lio is given by -IO_SPAN(lio, fio);
+ * thus fio and lio are adjacent if and only if IO_SPAN(lio, fio) == 0.
+ */
+#define IO_SPAN(fio, lio) ((lio)->io_offset + (lio)->io_size - (fio)->io_offset)
+#define IO_GAP(fio, lio) (-IO_SPAN(lio, fio))
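+/*
+ * For example, with fio at offset 0 of size 4K and lio at offset 8K of
+ * size 4K, IO_SPAN(fio, lio) = 8K + 4K - 0 = 12K and
+ * IO_GAP(fio, lio) = 8K - (0 + 4K) = 4K.
+ */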
+
+static zio_t *
+vdev_queue_aggregate(vdev_queue_t *vq, zio_t *zio)
+{
+ zio_t *first, *last, *aio, *dio, *mandatory, *nio;
+ zio_link_t *zl = NULL;
+ uint64_t maxgap = 0;
+ uint64_t size;
+ uint64_t limit;
+ int maxblocksize;
+ boolean_t stretch;
+ avl_tree_t *t;
+ enum zio_flag flags;
+
+ ASSERT(MUTEX_HELD(&vq->vq_lock));
+
+ maxblocksize = spa_maxblocksize(vq->vq_vdev->vdev_spa);
+ if (vq->vq_vdev->vdev_nonrot)
+ limit = zfs_vdev_aggregation_limit_non_rotating;
+ else
+ limit = zfs_vdev_aggregation_limit;
+ limit = MAX(MIN(limit, maxblocksize), 0);
+
+ if (zio->io_flags & ZIO_FLAG_DONT_AGGREGATE || limit == 0)
+ return (NULL);
+
+ first = last = zio;
+
+ if (zio->io_type == ZIO_TYPE_READ)
+ maxgap = zfs_vdev_read_gap_limit;
+
+ /*
+ * We can aggregate I/Os that are sufficiently adjacent and of
+ * the same flavor, as expressed by the AGG_INHERIT flags.
+ * The latter requirement is necessary so that certain
+ * attributes of the I/O, such as whether it's a normal I/O
+ * or a scrub/resilver, can be preserved in the aggregate.
+ * We can include optional I/Os, but don't allow them
+ * to begin a range as they add no benefit in that situation.
+ */
+
+ /*
+ * We keep track of the last non-optional I/O.
+ */
+ mandatory = (first->io_flags & ZIO_FLAG_OPTIONAL) ? NULL : first;
+
+ /*
+ * Walk backwards through sufficiently contiguous I/Os
+ * recording the last non-optional I/O.
+ */
+ flags = zio->io_flags & ZIO_FLAG_AGG_INHERIT;
+ t = vdev_queue_type_tree(vq, zio->io_type);
+ while (t != NULL && (dio = AVL_PREV(t, first)) != NULL &&
+ (dio->io_flags & ZIO_FLAG_AGG_INHERIT) == flags &&
+ IO_SPAN(dio, last) <= limit &&
+ IO_GAP(dio, first) <= maxgap &&
+ dio->io_type == zio->io_type) {
+ first = dio;
+ if (mandatory == NULL && !(first->io_flags & ZIO_FLAG_OPTIONAL))
+ mandatory = first;
+ }
+
+ /*
+ * Skip any initial optional I/Os.
+ */
+ while ((first->io_flags & ZIO_FLAG_OPTIONAL) && first != last) {
+ first = AVL_NEXT(t, first);
+ ASSERT(first != NULL);
+ }
+
+ /*
+ * Walk forward through sufficiently contiguous I/Os.
+ * The aggregation limit does not apply to optional i/os, so that
+ * we can issue contiguous writes even if they are larger than the
+ * aggregation limit.
+ */
+ while ((dio = AVL_NEXT(t, last)) != NULL &&
+ (dio->io_flags & ZIO_FLAG_AGG_INHERIT) == flags &&
+ (IO_SPAN(first, dio) <= limit ||
+ (dio->io_flags & ZIO_FLAG_OPTIONAL)) &&
+ IO_SPAN(first, dio) <= maxblocksize &&
+ IO_GAP(last, dio) <= maxgap &&
+ dio->io_type == zio->io_type) {
+ last = dio;
+ if (!(last->io_flags & ZIO_FLAG_OPTIONAL))
+ mandatory = last;
+ }
+
+ /*
+ * Now that we've established the range of the I/O aggregation
+ * we must decide what to do with trailing optional I/Os.
+ * For reads, there's nothing to do. While we are unable to
+ * aggregate further, it's possible that a trailing optional
+ * I/O would allow the underlying device to aggregate with
+ * subsequent I/Os. We must therefore determine if the next
+ * non-optional I/O is close enough to make aggregation
+ * worthwhile.
+ */
+ stretch = B_FALSE;
+ if (zio->io_type == ZIO_TYPE_WRITE && mandatory != NULL) {
+ zio_t *nio = last;
+ while ((dio = AVL_NEXT(t, nio)) != NULL &&
+ IO_GAP(nio, dio) == 0 &&
+ IO_GAP(mandatory, dio) <= zfs_vdev_write_gap_limit) {
+ nio = dio;
+ if (!(nio->io_flags & ZIO_FLAG_OPTIONAL)) {
+ stretch = B_TRUE;
+ break;
+ }
+ }
+ }
+
+ if (stretch) {
+ /*
+ * We are going to include an optional io in our aggregated
+ * span, thus closing the write gap. Only mandatory i/os can
+ * start aggregated spans, so make sure that the next i/o
+ * after our span is mandatory.
+ */
+ dio = AVL_NEXT(t, last);
+ dio->io_flags &= ~ZIO_FLAG_OPTIONAL;
+ } else {
+ /* do not include the optional i/o */
+ while (last != mandatory && last != first) {
+ ASSERT(last->io_flags & ZIO_FLAG_OPTIONAL);
+ last = AVL_PREV(t, last);
+ ASSERT(last != NULL);
+ }
+ }
+
+ if (first == last)
+ return (NULL);
+
+ size = IO_SPAN(first, last);
+ ASSERT3U(size, <=, maxblocksize);
+
+ aio = zio_vdev_delegated_io(first->io_vd, first->io_offset,
+ abd_alloc_for_io(size, B_TRUE), size, first->io_type,
+ zio->io_priority, flags | ZIO_FLAG_DONT_CACHE | ZIO_FLAG_DONT_QUEUE,
+ vdev_queue_agg_io_done, NULL);
+ aio->io_timestamp = first->io_timestamp;
+
+ nio = first;
+ do {
+ dio = nio;
+ nio = AVL_NEXT(t, dio);
+ zio_add_child(dio, aio);
+ vdev_queue_io_remove(vq, dio);
+ } while (dio != last);
+
+ /*
+ * We need to drop the vdev queue's lock during zio_execute() to
+ * avoid a deadlock that we could encounter due to lock order
+ * reversal between vq_lock and io_lock in zio_change_priority().
+	 * While the lock is dropped, we also perform the memory copies,
+	 * avoiding contention on vq_lock.
+ */
+ mutex_exit(&vq->vq_lock);
+ while ((dio = zio_walk_parents(aio, &zl)) != NULL) {
+ ASSERT3U(dio->io_type, ==, aio->io_type);
+
+ if (dio->io_flags & ZIO_FLAG_NODATA) {
+ ASSERT3U(dio->io_type, ==, ZIO_TYPE_WRITE);
+ abd_zero_off(aio->io_abd,
+ dio->io_offset - aio->io_offset, dio->io_size);
+ } else if (dio->io_type == ZIO_TYPE_WRITE) {
+ abd_copy_off(aio->io_abd, dio->io_abd,
+ dio->io_offset - aio->io_offset, 0, dio->io_size);
+ }
+
+ zio_vdev_io_bypass(dio);
+ zio_execute(dio);
+ }
+ mutex_enter(&vq->vq_lock);
+
+ return (aio);
+}
+
+static zio_t *
+vdev_queue_io_to_issue(vdev_queue_t *vq)
+{
+ zio_t *zio, *aio;
+ zio_priority_t p;
+ avl_index_t idx;
+ avl_tree_t *tree;
+ zio_t search;
+
+again:
+ ASSERT(MUTEX_HELD(&vq->vq_lock));
+
+ p = vdev_queue_class_to_issue(vq);
+
+ if (p == ZIO_PRIORITY_NUM_QUEUEABLE) {
+ /* No eligible queued i/os */
+ return (NULL);
+ }
+
+ /*
+ * For LBA-ordered queues (async / scrub / initializing), issue the
+ * i/o which follows the most recently issued i/o in LBA (offset) order.
+ *
+ * For FIFO queues (sync), issue the i/o with the lowest timestamp.
+ */
+ tree = vdev_queue_class_tree(vq, p);
+ search.io_timestamp = 0;
+ search.io_offset = vq->vq_last_offset + 1;
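+	/*
+	 * A zero timestamp sorts before every queued i/o in a FIFO tree,
+	 * so AVL_AFTER yields the oldest i/o; in an LBA tree the key
+	 * sorts just past vq_last_offset, so AVL_AFTER yields the next
+	 * i/o by offset (wrapping below to the first i/o if none follows).
+	 */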
+ VERIFY3P(avl_find(tree, &search, &idx), ==, NULL);
+ zio = avl_nearest(tree, idx, AVL_AFTER);
+ if (zio == NULL)
+ zio = avl_first(tree);
+ ASSERT3U(zio->io_priority, ==, p);
+
+ aio = vdev_queue_aggregate(vq, zio);
+ if (aio != NULL)
+ zio = aio;
+ else
+ vdev_queue_io_remove(vq, zio);
+
+ /*
+ * If the I/O is or was optional and therefore has no data, we need to
+ * simply discard it. We need to drop the vdev queue's lock to avoid a
+ * deadlock that we could encounter since this I/O will complete
+ * immediately.
+ */
+ if (zio->io_flags & ZIO_FLAG_NODATA) {
+ mutex_exit(&vq->vq_lock);
+ zio_vdev_io_bypass(zio);
+ zio_execute(zio);
+ mutex_enter(&vq->vq_lock);
+ goto again;
+ }
+
+ vdev_queue_pending_add(vq, zio);
+ vq->vq_last_offset = zio->io_offset;
+
+ return (zio);
+}
+
+zio_t *
+vdev_queue_io(zio_t *zio)
+{
+ vdev_queue_t *vq = &zio->io_vd->vdev_queue;
+ zio_t *nio;
+
+ if (zio->io_flags & ZIO_FLAG_DONT_QUEUE)
+ return (zio);
+
+ /*
+	 * Child i/os inherit their parent's priority, which might
+ * not match the child's i/o type. Fix it up here.
+ */
+ if (zio->io_type == ZIO_TYPE_READ) {
+ if (zio->io_priority != ZIO_PRIORITY_SYNC_READ &&
+ zio->io_priority != ZIO_PRIORITY_ASYNC_READ &&
+ zio->io_priority != ZIO_PRIORITY_SCRUB &&
+ zio->io_priority != ZIO_PRIORITY_REMOVAL &&
+ zio->io_priority != ZIO_PRIORITY_INITIALIZING)
+ zio->io_priority = ZIO_PRIORITY_ASYNC_READ;
+ } else if (zio->io_type == ZIO_TYPE_WRITE) {
+ if (zio->io_priority != ZIO_PRIORITY_SYNC_WRITE &&
+ zio->io_priority != ZIO_PRIORITY_ASYNC_WRITE &&
+ zio->io_priority != ZIO_PRIORITY_REMOVAL &&
+ zio->io_priority != ZIO_PRIORITY_INITIALIZING)
+ zio->io_priority = ZIO_PRIORITY_ASYNC_WRITE;
+ } else {
+ ASSERT(zio->io_type == ZIO_TYPE_FREE);
+ zio->io_priority = ZIO_PRIORITY_TRIM;
+ }
+
+ zio->io_flags |= ZIO_FLAG_DONT_CACHE | ZIO_FLAG_DONT_QUEUE;
+
+ mutex_enter(&vq->vq_lock);
+ zio->io_timestamp = gethrtime();
+ vdev_queue_io_add(vq, zio);
+ nio = vdev_queue_io_to_issue(vq);
+ mutex_exit(&vq->vq_lock);
+
+ if (nio == NULL)
+ return (NULL);
+
+ if (nio->io_done == vdev_queue_agg_io_done) {
+ zio_nowait(nio);
+ return (NULL);
+ }
+
+ return (nio);
+}
+
+void
+vdev_queue_io_done(zio_t *zio)
+{
+ vdev_queue_t *vq = &zio->io_vd->vdev_queue;
+ zio_t *nio;
+
+ mutex_enter(&vq->vq_lock);
+
+ vdev_queue_pending_remove(vq, zio);
+
+ vq->vq_io_complete_ts = gethrtime();
+
+ while ((nio = vdev_queue_io_to_issue(vq)) != NULL) {
+ mutex_exit(&vq->vq_lock);
+ if (nio->io_done == vdev_queue_agg_io_done) {
+ zio_nowait(nio);
+ } else {
+ zio_vdev_io_reissue(nio);
+ zio_execute(nio);
+ }
+ mutex_enter(&vq->vq_lock);
+ }
+
+ mutex_exit(&vq->vq_lock);
+}
+
+void
+vdev_queue_change_io_priority(zio_t *zio, zio_priority_t priority)
+{
+ vdev_queue_t *vq = &zio->io_vd->vdev_queue;
+ avl_tree_t *tree;
+
+ /*
+ * ZIO_PRIORITY_NOW is used by the vdev cache code and the aggregate zio
+ * code to issue IOs without adding them to the vdev queue. In this
+ * case, the zio is already going to be issued as quickly as possible
+	 * and so it doesn't need any reprioritization to help.
+ */
+ if (zio->io_priority == ZIO_PRIORITY_NOW)
+ return;
+
+ ASSERT3U(zio->io_priority, <, ZIO_PRIORITY_NUM_QUEUEABLE);
+ ASSERT3U(priority, <, ZIO_PRIORITY_NUM_QUEUEABLE);
+
+ if (zio->io_type == ZIO_TYPE_READ) {
+ if (priority != ZIO_PRIORITY_SYNC_READ &&
+ priority != ZIO_PRIORITY_ASYNC_READ &&
+ priority != ZIO_PRIORITY_SCRUB)
+ priority = ZIO_PRIORITY_ASYNC_READ;
+ } else {
+ ASSERT(zio->io_type == ZIO_TYPE_WRITE);
+ if (priority != ZIO_PRIORITY_SYNC_WRITE &&
+ priority != ZIO_PRIORITY_ASYNC_WRITE)
+ priority = ZIO_PRIORITY_ASYNC_WRITE;
+ }
+
+ mutex_enter(&vq->vq_lock);
+
+ /*
+ * If the zio is in none of the queues we can simply change
+ * the priority. If the zio is waiting to be submitted we must
+ * remove it from the queue and re-insert it with the new priority.
+ * Otherwise, the zio is currently active and we cannot change its
+ * priority.
+ */
+ tree = vdev_queue_class_tree(vq, zio->io_priority);
+ if (avl_find(tree, zio, NULL) == zio) {
+ avl_remove(vdev_queue_class_tree(vq, zio->io_priority), zio);
+ zio->io_priority = priority;
+ avl_add(vdev_queue_class_tree(vq, zio->io_priority), zio);
+ } else if (avl_find(&vq->vq_active_tree, zio, NULL) != zio) {
+ zio->io_priority = priority;
+ }
+
+ mutex_exit(&vq->vq_lock);
+}
+
+/*
+ * These three methods are used only for load calculations, so we are not
+ * concerned about reading a transiently incorrect value on 32-bit platforms
+ * due to the lack of vq_lock protection here; we prefer to keep them
+ * lock-free for performance.
+ */
+int
+vdev_queue_length(vdev_t *vd)
+{
+ return (avl_numnodes(&vd->vdev_queue.vq_active_tree));
+}
+
+uint64_t
+vdev_queue_lastoffset(vdev_t *vd)
+{
+ return (vd->vdev_queue.vq_lastoffset);
+}
+
+void
+vdev_queue_register_lastoffset(vdev_t *vd, zio_t *zio)
+{
+ vd->vdev_queue.vq_lastoffset = zio->io_offset + zio->io_size;
+}
diff --git a/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/vdev_raidz.c b/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/vdev_raidz.c
new file mode 100644
index 000000000000..29878ea6eaf6
--- /dev/null
+++ b/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/vdev_raidz.c
@@ -0,0 +1,2707 @@
+/*
+ * CDDL HEADER START
+ *
+ * The contents of this file are subject to the terms of the
+ * Common Development and Distribution License (the "License").
+ * You may not use this file except in compliance with the License.
+ *
+ * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
+ * or http://www.opensolaris.org/os/licensing.
+ * See the License for the specific language governing permissions
+ * and limitations under the License.
+ *
+ * When distributing Covered Code, include this CDDL HEADER in each
+ * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
+ * If applicable, add the following below this CDDL HEADER, with the
+ * fields enclosed by brackets "[]" replaced with your own identifying
+ * information: Portions Copyright [yyyy] [name of copyright owner]
+ *
+ * CDDL HEADER END
+ */
+
+/*
+ * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved.
+ * Copyright (c) 2012, 2017 by Delphix. All rights reserved.
+ * Copyright (c) 2013, Joyent, Inc. All rights reserved.
+ * Copyright (c) 2014 Integros [integros.com]
+ */
+
+#include <sys/zfs_context.h>
+#include <sys/spa.h>
+#include <sys/vdev_impl.h>
+#ifdef illumos
+#include <sys/vdev_disk.h>
+#endif
+#include <sys/vdev_file.h>
+#include <sys/vdev_raidz.h>
+#include <sys/zio.h>
+#include <sys/zio_checksum.h>
+#include <sys/abd.h>
+#include <sys/fs/zfs.h>
+#include <sys/fm/fs/zfs.h>
+#include <sys/bio.h>
+
+#ifdef ZFS_DEBUG
+#include <sys/vdev_initialize.h> /* vdev_xlate testing */
+#endif
+
+/*
+ * Virtual device vector for RAID-Z.
+ *
+ * This vdev supports single, double, and triple parity. For single parity,
+ * we use a simple XOR of all the data columns. For double or triple parity,
+ * we use a special case of Reed-Solomon coding. This extends the
+ * technique described in "The mathematics of RAID-6" by H. Peter Anvin by
+ * drawing on the system described in "A Tutorial on Reed-Solomon Coding for
+ * Fault-Tolerance in RAID-like Systems" by James S. Plank on which the
+ * former is also based. The latter is designed to provide higher performance
+ * for writes.
+ *
+ * Note that the Plank paper claimed to support arbitrary N+M, but was then
+ * amended six years later identifying a critical flaw that invalidates its
+ * claims. Nevertheless, the technique can be adapted to work for up to
+ * triple parity. For additional parity, the amendment "Note: Correction to
+ * the 1997 Tutorial on Reed-Solomon Coding" by James S. Plank and Ying Ding
+ * is viable, but the additional complexity means that write performance will
+ * suffer.
+ *
+ * All of the methods above operate on a Galois field, defined over the
+ * integers mod 2^N. In our case we choose N=8 for GF(2^8) so that all elements
+ * can be expressed with a single byte. Briefly, the operations on the
+ * field are defined as follows:
+ *
+ * o addition (+) is represented by a bitwise XOR
+ * o subtraction (-) is therefore identical to addition: A + B = A - B
+ * o multiplication of A by 2 is defined by the following bitwise expression:
+ *
+ * (A * 2)_7 = A_6
+ * (A * 2)_6 = A_5
+ * (A * 2)_5 = A_4
+ * (A * 2)_4 = A_3 + A_7
+ * (A * 2)_3 = A_2 + A_7
+ * (A * 2)_2 = A_1 + A_7
+ * (A * 2)_1 = A_0
+ * (A * 2)_0 = A_7
+ *
+ * In C, multiplying by 2 is therefore ((a << 1) ^ ((a & 0x80) ? 0x1d : 0)).
+ * As an aside, this multiplication is derived from the error correcting
+ * primitive polynomial x^8 + x^4 + x^3 + x^2 + 1.
+ *
+ * Observe that any number in the field (except for 0) can be expressed as a
+ * power of 2 -- a generator for the field. We store a table of the powers of
+ * 2 and logs base 2 for quick look ups, and exploit the fact that A * B can
+ * be rewritten as 2^(log_2(A) + log_2(B)) (where '+' is normal addition rather
+ * than field addition). The inverse of a field element A (A^-1) is therefore
+ * A ^ (255 - 1) = A^254.
+ *
+ * The up-to-three parity columns, P, Q, R over several data columns,
+ * D_0, ... D_n-1, can be expressed by field operations:
+ *
+ * P = D_0 + D_1 + ... + D_n-2 + D_n-1
+ * Q = 2^n-1 * D_0 + 2^n-2 * D_1 + ... + 2^1 * D_n-2 + 2^0 * D_n-1
+ * = ((...((D_0) * 2 + D_1) * 2 + ...) * 2 + D_n-2) * 2 + D_n-1
+ * R = 4^n-1 * D_0 + 4^n-2 * D_1 + ... + 4^1 * D_n-2 + 4^0 * D_n-1
+ * = ((...((D_0) * 4 + D_1) * 4 + ...) * 4 + D_n-2) * 4 + D_n-1
+ *
+ * We chose 1, 2, and 4 as our generators because 1 corresponds to the trivial
+ * XOR operation, and 2 and 4 can be computed quickly and generate linearly-
+ * independent coefficients. (There are no additional coefficients that have
+ * this property which is why the uncorrected Plank method breaks down.)
+ *
+ * See the reconstruction code below for how P, Q and R can be used individually
+ * or in concert to recover missing data columns.
+ */
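+
+/*
+ * A worked multiplication in this field: 3 * 5 =
+ * 2^(log_2(3) + log_2(5)) = 2^(25 + 50) = 2^75 = 0x0f, which agrees
+ * with polynomial multiplication, (x + 1)(x^2 + 1) = x^3 + x^2 + x + 1.
+ * The tables consulted are vdev_raidz_log2[] and vdev_raidz_pow2[]
+ * below.
+ */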
+
+typedef struct raidz_col {
+ uint64_t rc_devidx; /* child device index for I/O */
+ uint64_t rc_offset; /* device offset */
+ uint64_t rc_size; /* I/O size */
+ abd_t *rc_abd; /* I/O data */
+ void *rc_gdata; /* used to store the "good" version */
+ int rc_error; /* I/O error for this device */
+ uint8_t rc_tried; /* Did we attempt this I/O column? */
+ uint8_t rc_skipped; /* Did we skip this I/O column? */
+} raidz_col_t;
+
+typedef struct raidz_map {
+ uint64_t rm_cols; /* Regular column count */
+ uint64_t rm_scols; /* Count including skipped columns */
+ uint64_t rm_bigcols; /* Number of oversized columns */
+ uint64_t rm_asize; /* Actual total I/O size */
+ uint64_t rm_missingdata; /* Count of missing data devices */
+ uint64_t rm_missingparity; /* Count of missing parity devices */
+ uint64_t rm_firstdatacol; /* First data column/parity count */
+ uint64_t rm_nskip; /* Skipped sectors for padding */
+ uint64_t rm_skipstart; /* Column index of padding start */
+ abd_t *rm_abd_copy; /* rm_asize-buffer of copied data */
+ uintptr_t rm_reports; /* # of referencing checksum reports */
+ uint8_t rm_freed; /* map no longer has referencing ZIO */
+ uint8_t rm_ecksuminjected; /* checksum error was injected */
+ raidz_col_t rm_col[1]; /* Flexible array of I/O columns */
+} raidz_map_t;
+
+#define VDEV_RAIDZ_P 0
+#define VDEV_RAIDZ_Q 1
+#define VDEV_RAIDZ_R 2
+
+#define VDEV_RAIDZ_MUL_2(x) (((x) << 1) ^ (((x) & 0x80) ? 0x1d : 0))
+#define VDEV_RAIDZ_MUL_4(x) (VDEV_RAIDZ_MUL_2(VDEV_RAIDZ_MUL_2(x)))
+
+/*
+ * We provide a mechanism to perform the field multiplication operation on a
+ * 64-bit value all at once rather than a byte at a time. This works by
+ * creating a mask from the top bit in each byte and using that to
+ * conditionally apply the XOR of 0x1d.
+ */
+#define VDEV_RAIDZ_64MUL_2(x, mask) \
+{ \
+ (mask) = (x) & 0x8080808080808080ULL; \
+ (mask) = ((mask) << 1) - ((mask) >> 7); \
+ (x) = (((x) << 1) & 0xfefefefefefefefeULL) ^ \
+ ((mask) & 0x1d1d1d1d1d1d1d1d); \
+}
+
+#define VDEV_RAIDZ_64MUL_4(x, mask) \
+{ \
+ VDEV_RAIDZ_64MUL_2((x), mask); \
+ VDEV_RAIDZ_64MUL_2((x), mask); \
+}
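+
+/*
+ * For example, VDEV_RAIDZ_64MUL_2 maps x = 0x8000000000000001ULL to
+ * 0x1d00000000000002ULL: the high byte 0x80 doubles to 0x1d (its top
+ * bit overflows, triggering the conditional XOR with 0x1d) while the
+ * low byte 0x01 simply doubles to 0x02.
+ */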
+
+#define VDEV_LABEL_OFFSET(x) (x + VDEV_LABEL_START_SIZE)
+
+/*
+ * Force reconstruction to use the general purpose method.
+ */
+int vdev_raidz_default_to_general;
+
+/* Powers of 2 in the Galois field defined above. */
+static const uint8_t vdev_raidz_pow2[256] = {
+ 0x01, 0x02, 0x04, 0x08, 0x10, 0x20, 0x40, 0x80,
+ 0x1d, 0x3a, 0x74, 0xe8, 0xcd, 0x87, 0x13, 0x26,
+ 0x4c, 0x98, 0x2d, 0x5a, 0xb4, 0x75, 0xea, 0xc9,
+ 0x8f, 0x03, 0x06, 0x0c, 0x18, 0x30, 0x60, 0xc0,
+ 0x9d, 0x27, 0x4e, 0x9c, 0x25, 0x4a, 0x94, 0x35,
+ 0x6a, 0xd4, 0xb5, 0x77, 0xee, 0xc1, 0x9f, 0x23,
+ 0x46, 0x8c, 0x05, 0x0a, 0x14, 0x28, 0x50, 0xa0,
+ 0x5d, 0xba, 0x69, 0xd2, 0xb9, 0x6f, 0xde, 0xa1,
+ 0x5f, 0xbe, 0x61, 0xc2, 0x99, 0x2f, 0x5e, 0xbc,
+ 0x65, 0xca, 0x89, 0x0f, 0x1e, 0x3c, 0x78, 0xf0,
+ 0xfd, 0xe7, 0xd3, 0xbb, 0x6b, 0xd6, 0xb1, 0x7f,
+ 0xfe, 0xe1, 0xdf, 0xa3, 0x5b, 0xb6, 0x71, 0xe2,
+ 0xd9, 0xaf, 0x43, 0x86, 0x11, 0x22, 0x44, 0x88,
+ 0x0d, 0x1a, 0x34, 0x68, 0xd0, 0xbd, 0x67, 0xce,
+ 0x81, 0x1f, 0x3e, 0x7c, 0xf8, 0xed, 0xc7, 0x93,
+ 0x3b, 0x76, 0xec, 0xc5, 0x97, 0x33, 0x66, 0xcc,
+ 0x85, 0x17, 0x2e, 0x5c, 0xb8, 0x6d, 0xda, 0xa9,
+ 0x4f, 0x9e, 0x21, 0x42, 0x84, 0x15, 0x2a, 0x54,
+ 0xa8, 0x4d, 0x9a, 0x29, 0x52, 0xa4, 0x55, 0xaa,
+ 0x49, 0x92, 0x39, 0x72, 0xe4, 0xd5, 0xb7, 0x73,
+ 0xe6, 0xd1, 0xbf, 0x63, 0xc6, 0x91, 0x3f, 0x7e,
+ 0xfc, 0xe5, 0xd7, 0xb3, 0x7b, 0xf6, 0xf1, 0xff,
+ 0xe3, 0xdb, 0xab, 0x4b, 0x96, 0x31, 0x62, 0xc4,
+ 0x95, 0x37, 0x6e, 0xdc, 0xa5, 0x57, 0xae, 0x41,
+ 0x82, 0x19, 0x32, 0x64, 0xc8, 0x8d, 0x07, 0x0e,
+ 0x1c, 0x38, 0x70, 0xe0, 0xdd, 0xa7, 0x53, 0xa6,
+ 0x51, 0xa2, 0x59, 0xb2, 0x79, 0xf2, 0xf9, 0xef,
+ 0xc3, 0x9b, 0x2b, 0x56, 0xac, 0x45, 0x8a, 0x09,
+ 0x12, 0x24, 0x48, 0x90, 0x3d, 0x7a, 0xf4, 0xf5,
+ 0xf7, 0xf3, 0xfb, 0xeb, 0xcb, 0x8b, 0x0b, 0x16,
+ 0x2c, 0x58, 0xb0, 0x7d, 0xfa, 0xe9, 0xcf, 0x83,
+ 0x1b, 0x36, 0x6c, 0xd8, 0xad, 0x47, 0x8e, 0x01
+};
+/* Logs of 2 in the Galois field defined above. */
+static const uint8_t vdev_raidz_log2[256] = {
+ 0x00, 0x00, 0x01, 0x19, 0x02, 0x32, 0x1a, 0xc6,
+ 0x03, 0xdf, 0x33, 0xee, 0x1b, 0x68, 0xc7, 0x4b,
+ 0x04, 0x64, 0xe0, 0x0e, 0x34, 0x8d, 0xef, 0x81,
+ 0x1c, 0xc1, 0x69, 0xf8, 0xc8, 0x08, 0x4c, 0x71,
+ 0x05, 0x8a, 0x65, 0x2f, 0xe1, 0x24, 0x0f, 0x21,
+ 0x35, 0x93, 0x8e, 0xda, 0xf0, 0x12, 0x82, 0x45,
+ 0x1d, 0xb5, 0xc2, 0x7d, 0x6a, 0x27, 0xf9, 0xb9,
+ 0xc9, 0x9a, 0x09, 0x78, 0x4d, 0xe4, 0x72, 0xa6,
+ 0x06, 0xbf, 0x8b, 0x62, 0x66, 0xdd, 0x30, 0xfd,
+ 0xe2, 0x98, 0x25, 0xb3, 0x10, 0x91, 0x22, 0x88,
+ 0x36, 0xd0, 0x94, 0xce, 0x8f, 0x96, 0xdb, 0xbd,
+ 0xf1, 0xd2, 0x13, 0x5c, 0x83, 0x38, 0x46, 0x40,
+ 0x1e, 0x42, 0xb6, 0xa3, 0xc3, 0x48, 0x7e, 0x6e,
+ 0x6b, 0x3a, 0x28, 0x54, 0xfa, 0x85, 0xba, 0x3d,
+ 0xca, 0x5e, 0x9b, 0x9f, 0x0a, 0x15, 0x79, 0x2b,
+ 0x4e, 0xd4, 0xe5, 0xac, 0x73, 0xf3, 0xa7, 0x57,
+ 0x07, 0x70, 0xc0, 0xf7, 0x8c, 0x80, 0x63, 0x0d,
+ 0x67, 0x4a, 0xde, 0xed, 0x31, 0xc5, 0xfe, 0x18,
+ 0xe3, 0xa5, 0x99, 0x77, 0x26, 0xb8, 0xb4, 0x7c,
+ 0x11, 0x44, 0x92, 0xd9, 0x23, 0x20, 0x89, 0x2e,
+ 0x37, 0x3f, 0xd1, 0x5b, 0x95, 0xbc, 0xcf, 0xcd,
+ 0x90, 0x87, 0x97, 0xb2, 0xdc, 0xfc, 0xbe, 0x61,
+ 0xf2, 0x56, 0xd3, 0xab, 0x14, 0x2a, 0x5d, 0x9e,
+ 0x84, 0x3c, 0x39, 0x53, 0x47, 0x6d, 0x41, 0xa2,
+ 0x1f, 0x2d, 0x43, 0xd8, 0xb7, 0x7b, 0xa4, 0x76,
+ 0xc4, 0x17, 0x49, 0xec, 0x7f, 0x0c, 0x6f, 0xf6,
+ 0x6c, 0xa1, 0x3b, 0x52, 0x29, 0x9d, 0x55, 0xaa,
+ 0xfb, 0x60, 0x86, 0xb1, 0xbb, 0xcc, 0x3e, 0x5a,
+ 0xcb, 0x59, 0x5f, 0xb0, 0x9c, 0xa9, 0xa0, 0x51,
+ 0x0b, 0xf5, 0x16, 0xeb, 0x7a, 0x75, 0x2c, 0xd7,
+ 0x4f, 0xae, 0xd5, 0xe9, 0xe6, 0xe7, 0xad, 0xe8,
+ 0x74, 0xd6, 0xf4, 0xea, 0xa8, 0x50, 0x58, 0xaf,
+};
+
+static void vdev_raidz_generate_parity(raidz_map_t *rm);
+
+/*
+ * Multiply a given number by 2 raised to the given power.
+ */
+static uint8_t
+vdev_raidz_exp2(uint_t a, int exp)
+{
+ if (a == 0)
+ return (0);
+
+ ASSERT(exp >= 0);
+ ASSERT(vdev_raidz_log2[a] > 0 || a == 1);
+
+ exp += vdev_raidz_log2[a];
+ if (exp > 255)
+ exp -= 255;
+
+ return (vdev_raidz_pow2[exp]);
+}
+
+static void
+vdev_raidz_map_free(raidz_map_t *rm)
+{
+ int c;
+
+ for (c = 0; c < rm->rm_firstdatacol; c++) {
+ if (rm->rm_col[c].rc_abd != NULL)
+ abd_free(rm->rm_col[c].rc_abd);
+
+ if (rm->rm_col[c].rc_gdata != NULL)
+ zio_buf_free(rm->rm_col[c].rc_gdata,
+ rm->rm_col[c].rc_size);
+ }
+
+ for (c = rm->rm_firstdatacol; c < rm->rm_cols; c++) {
+ if (rm->rm_col[c].rc_abd != NULL)
+ abd_put(rm->rm_col[c].rc_abd);
+ }
+
+ if (rm->rm_abd_copy != NULL)
+ abd_free(rm->rm_abd_copy);
+
+ kmem_free(rm, offsetof(raidz_map_t, rm_col[rm->rm_scols]));
+}
+
+static void
+vdev_raidz_map_free_vsd(zio_t *zio)
+{
+ raidz_map_t *rm = zio->io_vsd;
+
+ ASSERT0(rm->rm_freed);
+ rm->rm_freed = 1;
+
+ if (rm->rm_reports == 0)
+ vdev_raidz_map_free(rm);
+}
+
+/*ARGSUSED*/
+static void
+vdev_raidz_cksum_free(void *arg, size_t ignored)
+{
+ raidz_map_t *rm = arg;
+
+ ASSERT3U(rm->rm_reports, >, 0);
+
+ if (--rm->rm_reports == 0 && rm->rm_freed != 0)
+ vdev_raidz_map_free(rm);
+}
+
+static void
+vdev_raidz_cksum_finish(zio_cksum_report_t *zcr, const void *good_data)
+{
+ raidz_map_t *rm = zcr->zcr_cbdata;
+ size_t c = zcr->zcr_cbinfo;
+ size_t x;
+
+ const char *good = NULL;
+ char *bad;
+
+ if (good_data == NULL) {
+ zfs_ereport_finish_checksum(zcr, NULL, NULL, B_FALSE);
+ return;
+ }
+
+ if (c < rm->rm_firstdatacol) {
+ /*
+ * The first time through, calculate the parity blocks for
+ * the good data (this relies on the fact that the good
+ * data never changes for a given logical ZIO)
+ */
+ if (rm->rm_col[0].rc_gdata == NULL) {
+ abd_t *bad_parity[VDEV_RAIDZ_MAXPARITY];
+ char *buf;
+ int offset;
+
+ /*
+ * Set up the rm_col[]s to generate the parity for
+ * good_data, first saving the parity bufs and
+ * replacing them with buffers to hold the result.
+ */
+ for (x = 0; x < rm->rm_firstdatacol; x++) {
+ bad_parity[x] = rm->rm_col[x].rc_abd;
+ rm->rm_col[x].rc_gdata =
+ zio_buf_alloc(rm->rm_col[x].rc_size);
+ rm->rm_col[x].rc_abd =
+ abd_get_from_buf(rm->rm_col[x].rc_gdata,
+ rm->rm_col[x].rc_size);
+ }
+
+ /* fill in the data columns from good_data */
+ buf = (char *)good_data;
+ for (; x < rm->rm_cols; x++) {
+ abd_put(rm->rm_col[x].rc_abd);
+ rm->rm_col[x].rc_abd = abd_get_from_buf(buf,
+ rm->rm_col[x].rc_size);
+ buf += rm->rm_col[x].rc_size;
+ }
+
+ /*
+ * Construct the parity from the good data.
+ */
+ vdev_raidz_generate_parity(rm);
+
+ /* restore everything back to its original state */
+ for (x = 0; x < rm->rm_firstdatacol; x++) {
+ abd_put(rm->rm_col[x].rc_abd);
+ rm->rm_col[x].rc_abd = bad_parity[x];
+ }
+
+ offset = 0;
+ for (x = rm->rm_firstdatacol; x < rm->rm_cols; x++) {
+ abd_put(rm->rm_col[x].rc_abd);
+ rm->rm_col[x].rc_abd = abd_get_offset(
+ rm->rm_abd_copy, offset);
+ offset += rm->rm_col[x].rc_size;
+ }
+ }
+
+ ASSERT3P(rm->rm_col[c].rc_gdata, !=, NULL);
+ good = rm->rm_col[c].rc_gdata;
+ } else {
+ /* adjust good_data to point at the start of our column */
+ good = good_data;
+
+ for (x = rm->rm_firstdatacol; x < c; x++)
+ good += rm->rm_col[x].rc_size;
+ }
+
+ bad = abd_borrow_buf_copy(rm->rm_col[c].rc_abd, rm->rm_col[c].rc_size);
+ /* we drop the ereport if it ends up that the data was good */
+ zfs_ereport_finish_checksum(zcr, good, bad, B_TRUE);
+ abd_return_buf(rm->rm_col[c].rc_abd, bad, rm->rm_col[c].rc_size);
+}
+
+/*
+ * Invoked indirectly by zfs_ereport_start_checksum(), called
+ * below when our read operation fails completely. The main point
+ * is to keep a copy of everything we read from disk, so that at
+ * vdev_raidz_cksum_finish() time we can compare it with the good data.
+ */
+static void
+vdev_raidz_cksum_report(zio_t *zio, zio_cksum_report_t *zcr, void *arg)
+{
+ size_t c = (size_t)(uintptr_t)arg;
+ size_t offset;
+
+ raidz_map_t *rm = zio->io_vsd;
+ size_t size;
+
+ /* set up the report and bump the refcount */
+ zcr->zcr_cbdata = rm;
+ zcr->zcr_cbinfo = c;
+ zcr->zcr_finish = vdev_raidz_cksum_finish;
+ zcr->zcr_free = vdev_raidz_cksum_free;
+
+ rm->rm_reports++;
+ ASSERT3U(rm->rm_reports, >, 0);
+
+ if (rm->rm_abd_copy != NULL)
+ return;
+
+ /*
+ * It's the first time we're called for this raidz_map_t, so we need
+ * to copy the data aside; there's no guarantee that our zio's buffer
+ * won't be re-used for something else.
+ *
+ * Our parity data is already in separate buffers, so there's no need
+ * to copy them.
+ */
+
+ size = 0;
+ for (c = rm->rm_firstdatacol; c < rm->rm_cols; c++)
+ size += rm->rm_col[c].rc_size;
+
+ rm->rm_abd_copy =
+ abd_alloc_sametype(rm->rm_col[rm->rm_firstdatacol].rc_abd, size);
+
+ for (offset = 0, c = rm->rm_firstdatacol; c < rm->rm_cols; c++) {
+ raidz_col_t *col = &rm->rm_col[c];
+ abd_t *tmp = abd_get_offset(rm->rm_abd_copy, offset);
+
+ abd_copy(tmp, col->rc_abd, col->rc_size);
+ abd_put(col->rc_abd);
+ col->rc_abd = tmp;
+
+ offset += col->rc_size;
+ }
+ ASSERT3U(offset, ==, size);
+}
+
+static const zio_vsd_ops_t vdev_raidz_vsd_ops = {
+ vdev_raidz_map_free_vsd,
+ vdev_raidz_cksum_report
+};
+
+/*
+ * Divides the IO evenly across all child vdevs; usually, dcols is
+ * the number of children in the target vdev.
+ */
+static raidz_map_t *
+vdev_raidz_map_alloc(abd_t *abd, uint64_t size, uint64_t offset, boolean_t dofree,
+ uint64_t unit_shift, uint64_t dcols, uint64_t nparity)
+{
+ raidz_map_t *rm;
+ /* The starting RAIDZ (parent) vdev sector of the block. */
+ uint64_t b = offset >> unit_shift;
+ /* The zio's size in units of the vdev's minimum sector size. */
+ uint64_t s = size >> unit_shift;
+ /* The first column for this stripe. */
+ uint64_t f = b % dcols;
+ /* The starting byte offset on each child vdev. */
+ uint64_t o = (b / dcols) << unit_shift;
+ uint64_t q, r, c, bc, col, acols, scols, coff, devidx, asize, tot;
+ uint64_t off = 0;
+
+ /*
+ * "Quotient": The number of data sectors for this stripe on all but
+ * the "big column" child vdevs that also contain "remainder" data.
+ */
+ q = s / (dcols - nparity);
+
+ /*
+ * "Remainder": The number of partial stripe data sectors in this I/O.
+ * This will add a sector to some, but not all, child vdevs.
+ */
+ r = s - q * (dcols - nparity);
+
+ /* The number of "big columns" - those which contain remainder data. */
+ bc = (r == 0 ? 0 : r + nparity);
+
+ /*
+ * The total number of data and parity sectors associated with
+ * this I/O.
+ */
+ tot = s + nparity * (q + (r == 0 ? 0 : 1));
+
+ /* acols: The columns that will be accessed. */
+ /* scols: The columns that will be accessed or skipped. */
+ if (q == 0) {
+ /* Our I/O request doesn't span all child vdevs. */
+ acols = bc;
+ scols = MIN(dcols, roundup(bc, nparity + 1));
+ } else {
+ acols = dcols;
+ scols = dcols;
+ }
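+
+	/*
+	 * Illustrative example: a 16K write on a 5-wide RAIDZ1 with
+	 * 512-byte sectors (unit_shift = 9, dcols = 5, nparity = 1)
+	 * has s = 32, giving q = 32 / 4 = 8, r = 0, bc = 0,
+	 * tot = 32 + 1 * 8 = 40 and acols = scols = 5.
+	 */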
+
+ ASSERT3U(acols, <=, scols);
+
+ rm = kmem_alloc(offsetof(raidz_map_t, rm_col[scols]), KM_SLEEP);
+
+ rm->rm_cols = acols;
+ rm->rm_scols = scols;
+ rm->rm_bigcols = bc;
+ rm->rm_skipstart = bc;
+ rm->rm_missingdata = 0;
+ rm->rm_missingparity = 0;
+ rm->rm_firstdatacol = nparity;
+ rm->rm_abd_copy = NULL;
+ rm->rm_reports = 0;
+ rm->rm_freed = 0;
+ rm->rm_ecksuminjected = 0;
+
+ asize = 0;
+
+ for (c = 0; c < scols; c++) {
+ col = f + c;
+ coff = o;
+ if (col >= dcols) {
+ col -= dcols;
+ coff += 1ULL << unit_shift;
+ }
+ rm->rm_col[c].rc_devidx = col;
+ rm->rm_col[c].rc_offset = coff;
+ rm->rm_col[c].rc_abd = NULL;
+ rm->rm_col[c].rc_gdata = NULL;
+ rm->rm_col[c].rc_error = 0;
+ rm->rm_col[c].rc_tried = 0;
+ rm->rm_col[c].rc_skipped = 0;
+
+ if (c >= acols)
+ rm->rm_col[c].rc_size = 0;
+ else if (c < bc)
+ rm->rm_col[c].rc_size = (q + 1) << unit_shift;
+ else
+ rm->rm_col[c].rc_size = q << unit_shift;
+
+ asize += rm->rm_col[c].rc_size;
+ }
+
+ ASSERT3U(asize, ==, tot << unit_shift);
+ rm->rm_asize = roundup(asize, (nparity + 1) << unit_shift);
+ rm->rm_nskip = roundup(tot, nparity + 1) - tot;
+ ASSERT3U(rm->rm_asize - asize, ==, rm->rm_nskip << unit_shift);
+ ASSERT3U(rm->rm_nskip, <=, nparity);
+
+ if (!dofree) {
+ for (c = 0; c < rm->rm_firstdatacol; c++) {
+ rm->rm_col[c].rc_abd =
+ abd_alloc_linear(rm->rm_col[c].rc_size, B_TRUE);
+ }
+
+ for (off = 0, c = rm->rm_firstdatacol; c < acols; c++) {
+ rm->rm_col[c].rc_abd = abd_get_offset(abd, off);
+ off += rm->rm_col[c].rc_size;
+ }
+ }
+
+ /*
+ * If all data stored spans all columns, there's a danger that parity
+ * will always be on the same device and, since parity isn't read
+ * during normal operation, that that device's I/O bandwidth won't be
+ * used effectively. We therefore switch the parity every 1MB.
+ *
+ * ... at least that was, ostensibly, the theory. As a practical
+ * matter unless we juggle the parity between all devices evenly, we
+ * won't see any benefit. Further, occasional writes that aren't a
+ * multiple of the LCM of the number of children and the minimum
+ * stripe width are sufficient to avoid pessimal behavior.
+ * Unfortunately, this decision created an implicit on-disk format
+ * requirement that we need to support for all eternity, but only
+ * for single-parity RAID-Z.
+ *
+ * If we intend to skip a sector in the zeroth column for padding
+ * we must make sure to note this swap. We will never intend to
+ * skip the first column since at least one data and one parity
+ * column must appear in each row.
+ */
+ ASSERT(rm->rm_cols >= 2);
+ ASSERT(rm->rm_col[0].rc_size == rm->rm_col[1].rc_size);
+
+ if (rm->rm_firstdatacol == 1 && (offset & (1ULL << 20))) {
+ devidx = rm->rm_col[0].rc_devidx;
+ o = rm->rm_col[0].rc_offset;
+ rm->rm_col[0].rc_devidx = rm->rm_col[1].rc_devidx;
+ rm->rm_col[0].rc_offset = rm->rm_col[1].rc_offset;
+ rm->rm_col[1].rc_devidx = devidx;
+ rm->rm_col[1].rc_offset = o;
+
+ if (rm->rm_skipstart == 0)
+ rm->rm_skipstart = 1;
+ }
+
+ return (rm);
+}
+
+struct pqr_struct {
+ uint64_t *p;
+ uint64_t *q;
+ uint64_t *r;
+};
+
+static int
+vdev_raidz_p_func(void *buf, size_t size, void *private)
+{
+ struct pqr_struct *pqr = private;
+ const uint64_t *src = buf;
+ int i, cnt = size / sizeof (src[0]);
+
+ ASSERT(pqr->p && !pqr->q && !pqr->r);
+
+ for (i = 0; i < cnt; i++, src++, pqr->p++)
+ *pqr->p ^= *src;
+
+ return (0);
+}
+
+static int
+vdev_raidz_pq_func(void *buf, size_t size, void *private)
+{
+ struct pqr_struct *pqr = private;
+ const uint64_t *src = buf;
+ uint64_t mask;
+ int i, cnt = size / sizeof (src[0]);
+
+ ASSERT(pqr->p && pqr->q && !pqr->r);
+
+ for (i = 0; i < cnt; i++, src++, pqr->p++, pqr->q++) {
+ *pqr->p ^= *src;
+ VDEV_RAIDZ_64MUL_2(*pqr->q, mask);
+ *pqr->q ^= *src;
+ }
+
+ return (0);
+}
+
+static int
+vdev_raidz_pqr_func(void *buf, size_t size, void *private)
+{
+ struct pqr_struct *pqr = private;
+ const uint64_t *src = buf;
+ uint64_t mask;
+ int i, cnt = size / sizeof (src[0]);
+
+ ASSERT(pqr->p && pqr->q && pqr->r);
+
+ for (i = 0; i < cnt; i++, src++, pqr->p++, pqr->q++, pqr->r++) {
+ *pqr->p ^= *src;
+ VDEV_RAIDZ_64MUL_2(*pqr->q, mask);
+ *pqr->q ^= *src;
+ VDEV_RAIDZ_64MUL_4(*pqr->r, mask);
+ *pqr->r ^= *src;
+ }
+
+ return (0);
+}
+
+static void
+vdev_raidz_generate_parity_p(raidz_map_t *rm)
+{
+ uint64_t *p;
+ int c;
+ abd_t *src;
+
+ for (c = rm->rm_firstdatacol; c < rm->rm_cols; c++) {
+ src = rm->rm_col[c].rc_abd;
+ p = abd_to_buf(rm->rm_col[VDEV_RAIDZ_P].rc_abd);
+
+ if (c == rm->rm_firstdatacol) {
+ abd_copy_to_buf(p, src, rm->rm_col[c].rc_size);
+ } else {
+ struct pqr_struct pqr = { p, NULL, NULL };
+ (void) abd_iterate_func(src, 0, rm->rm_col[c].rc_size,
+ vdev_raidz_p_func, &pqr);
+ }
+ }
+}
+
+static void
+vdev_raidz_generate_parity_pq(raidz_map_t *rm)
+{
+ uint64_t *p, *q, pcnt, ccnt, mask, i;
+ int c;
+ abd_t *src;
+
+ pcnt = rm->rm_col[VDEV_RAIDZ_P].rc_size / sizeof (p[0]);
+ ASSERT(rm->rm_col[VDEV_RAIDZ_P].rc_size ==
+ rm->rm_col[VDEV_RAIDZ_Q].rc_size);
+
+ for (c = rm->rm_firstdatacol; c < rm->rm_cols; c++) {
+ src = rm->rm_col[c].rc_abd;
+ p = abd_to_buf(rm->rm_col[VDEV_RAIDZ_P].rc_abd);
+ q = abd_to_buf(rm->rm_col[VDEV_RAIDZ_Q].rc_abd);
+
+ ccnt = rm->rm_col[c].rc_size / sizeof (p[0]);
+
+ if (c == rm->rm_firstdatacol) {
+ abd_copy_to_buf(p, src, rm->rm_col[c].rc_size);
+ (void) memcpy(q, p, rm->rm_col[c].rc_size);
+ } else {
+ struct pqr_struct pqr = { p, q, NULL };
+ (void) abd_iterate_func(src, 0, rm->rm_col[c].rc_size,
+ vdev_raidz_pq_func, &pqr);
+ }
+
+ if (c == rm->rm_firstdatacol) {
+ for (i = ccnt; i < pcnt; i++) {
+ p[i] = 0;
+ q[i] = 0;
+ }
+ } else {
+ /*
+ * Treat short columns as though they are full of 0s.
+ * Note that there's therefore nothing needed for P.
+ */
+ for (i = ccnt; i < pcnt; i++) {
+ VDEV_RAIDZ_64MUL_2(q[i], mask);
+ }
+ }
+ }
+}
+
+static void
+vdev_raidz_generate_parity_pqr(raidz_map_t *rm)
+{
+ uint64_t *p, *q, *r, pcnt, ccnt, mask, i;
+ int c;
+ abd_t *src;
+
+ pcnt = rm->rm_col[VDEV_RAIDZ_P].rc_size / sizeof (p[0]);
+ ASSERT(rm->rm_col[VDEV_RAIDZ_P].rc_size ==
+ rm->rm_col[VDEV_RAIDZ_Q].rc_size);
+ ASSERT(rm->rm_col[VDEV_RAIDZ_P].rc_size ==
+ rm->rm_col[VDEV_RAIDZ_R].rc_size);
+
+ for (c = rm->rm_firstdatacol; c < rm->rm_cols; c++) {
+ src = rm->rm_col[c].rc_abd;
+ p = abd_to_buf(rm->rm_col[VDEV_RAIDZ_P].rc_abd);
+ q = abd_to_buf(rm->rm_col[VDEV_RAIDZ_Q].rc_abd);
+ r = abd_to_buf(rm->rm_col[VDEV_RAIDZ_R].rc_abd);
+
+ ccnt = rm->rm_col[c].rc_size / sizeof (p[0]);
+
+ if (c == rm->rm_firstdatacol) {
+ abd_copy_to_buf(p, src, rm->rm_col[c].rc_size);
+ (void) memcpy(q, p, rm->rm_col[c].rc_size);
+ (void) memcpy(r, p, rm->rm_col[c].rc_size);
+ } else {
+ struct pqr_struct pqr = { p, q, r };
+ (void) abd_iterate_func(src, 0, rm->rm_col[c].rc_size,
+ vdev_raidz_pqr_func, &pqr);
+ }
+
+ if (c == rm->rm_firstdatacol) {
+ for (i = ccnt; i < pcnt; i++) {
+ p[i] = 0;
+ q[i] = 0;
+ r[i] = 0;
+ }
+ } else {
+ /*
+ * Treat short columns as though they are full of 0s.
+ * Note that there's therefore nothing needed for P.
+ */
+ for (i = ccnt; i < pcnt; i++) {
+ VDEV_RAIDZ_64MUL_2(q[i], mask);
+ VDEV_RAIDZ_64MUL_4(r[i], mask);
+ }
+ }
+ }
+}
+
+/*
+ * Generate RAID parity in the first virtual columns according to the number of
+ * parity columns available.
+ */
+static void
+vdev_raidz_generate_parity(raidz_map_t *rm)
+{
+ switch (rm->rm_firstdatacol) {
+ case 1:
+ vdev_raidz_generate_parity_p(rm);
+ break;
+ case 2:
+ vdev_raidz_generate_parity_pq(rm);
+ break;
+ case 3:
+ vdev_raidz_generate_parity_pqr(rm);
+ break;
+ default:
+ cmn_err(CE_PANIC, "invalid RAID-Z configuration");
+ }
+}
+
+/* ARGSUSED */
+static int
+vdev_raidz_reconst_p_func(void *dbuf, void *sbuf, size_t size, void *private)
+{
+ uint64_t *dst = dbuf;
+ uint64_t *src = sbuf;
+ int cnt = size / sizeof (src[0]);
+
+ for (int i = 0; i < cnt; i++) {
+ dst[i] ^= src[i];
+ }
+
+ return (0);
+}
+
+/* ARGSUSED */
+static int
+vdev_raidz_reconst_q_pre_func(void *dbuf, void *sbuf, size_t size,
+ void *private)
+{
+ uint64_t *dst = dbuf;
+ uint64_t *src = sbuf;
+ uint64_t mask;
+ int cnt = size / sizeof (dst[0]);
+
+ for (int i = 0; i < cnt; i++, dst++, src++) {
+ VDEV_RAIDZ_64MUL_2(*dst, mask);
+ *dst ^= *src;
+ }
+
+ return (0);
+}
+
+/* ARGSUSED */
+static int
+vdev_raidz_reconst_q_pre_tail_func(void *buf, size_t size, void *private)
+{
+ uint64_t *dst = buf;
+ uint64_t mask;
+ int cnt = size / sizeof (dst[0]);
+
+ for (int i = 0; i < cnt; i++, dst++) {
+ /* same operation as vdev_raidz_reconst_q_pre_func() on dst */
+ VDEV_RAIDZ_64MUL_2(*dst, mask);
+ }
+
+ return (0);
+}
+
+struct reconst_q_struct {
+ uint64_t *q;
+ int exp;
+};
+
+static int
+vdev_raidz_reconst_q_post_func(void *buf, size_t size, void *private)
+{
+ struct reconst_q_struct *rq = private;
+ uint64_t *dst = buf;
+ int cnt = size / sizeof (dst[0]);
+
+ for (int i = 0; i < cnt; i++, dst++, rq->q++) {
+ *dst ^= *rq->q;
+
+ int j;
+ uint8_t *b;
+ for (j = 0, b = (uint8_t *)dst; j < 8; j++, b++) {
+ *b = vdev_raidz_exp2(*b, rq->exp);
+ }
+ }
+
+ return (0);
+}
+
+struct reconst_pq_struct {
+ uint8_t *p;
+ uint8_t *q;
+ uint8_t *pxy;
+ uint8_t *qxy;
+ int aexp;
+ int bexp;
+};
+
+static int
+vdev_raidz_reconst_pq_func(void *xbuf, void *ybuf, size_t size, void *private)
+{
+ struct reconst_pq_struct *rpq = private;
+ uint8_t *xd = xbuf;
+ uint8_t *yd = ybuf;
+
+ for (int i = 0; i < size;
+ i++, rpq->p++, rpq->q++, rpq->pxy++, rpq->qxy++, xd++, yd++) {
+ *xd = vdev_raidz_exp2(*rpq->p ^ *rpq->pxy, rpq->aexp) ^
+ vdev_raidz_exp2(*rpq->q ^ *rpq->qxy, rpq->bexp);
+ *yd = *rpq->p ^ *rpq->pxy ^ *xd;
+ }
+
+ return (0);
+}
+
+static int
+vdev_raidz_reconst_pq_tail_func(void *xbuf, size_t size, void *private)
+{
+ struct reconst_pq_struct *rpq = private;
+ uint8_t *xd = xbuf;
+
+ for (int i = 0; i < size;
+ i++, rpq->p++, rpq->q++, rpq->pxy++, rpq->qxy++, xd++) {
+ /* same operation as vdev_raidz_reconst_pq_func() on xd */
+ *xd = vdev_raidz_exp2(*rpq->p ^ *rpq->pxy, rpq->aexp) ^
+ vdev_raidz_exp2(*rpq->q ^ *rpq->qxy, rpq->bexp);
+ }
+
+ return (0);
+}
+
+static int
+vdev_raidz_reconstruct_p(raidz_map_t *rm, int *tgts, int ntgts)
+{
+ int x = tgts[0];
+ int c;
+ abd_t *dst, *src;
+
+ ASSERT(ntgts == 1);
+ ASSERT(x >= rm->rm_firstdatacol);
+ ASSERT(x < rm->rm_cols);
+
+ ASSERT(rm->rm_col[x].rc_size <= rm->rm_col[VDEV_RAIDZ_P].rc_size);
+ ASSERT(rm->rm_col[x].rc_size > 0);
+
+ src = rm->rm_col[VDEV_RAIDZ_P].rc_abd;
+ dst = rm->rm_col[x].rc_abd;
+
+ abd_copy(dst, src, rm->rm_col[x].rc_size);
+
+ for (c = rm->rm_firstdatacol; c < rm->rm_cols; c++) {
+ uint64_t size = MIN(rm->rm_col[x].rc_size,
+ rm->rm_col[c].rc_size);
+
+ src = rm->rm_col[c].rc_abd;
+ dst = rm->rm_col[x].rc_abd;
+
+ if (c == x)
+ continue;
+
+ (void) abd_iterate_func2(dst, src, 0, 0, size,
+ vdev_raidz_reconst_p_func, NULL);
+ }
+
+ return (1 << VDEV_RAIDZ_P);
+}
+
+static int
+vdev_raidz_reconstruct_q(raidz_map_t *rm, int *tgts, int ntgts)
+{
+ int x = tgts[0];
+ int c, exp;
+ abd_t *dst, *src;
+
+ ASSERT(ntgts == 1);
+
+ ASSERT(rm->rm_col[x].rc_size <= rm->rm_col[VDEV_RAIDZ_Q].rc_size);
+
+ for (c = rm->rm_firstdatacol; c < rm->rm_cols; c++) {
+ uint64_t size = (c == x) ? 0 : MIN(rm->rm_col[x].rc_size,
+ rm->rm_col[c].rc_size);
+
+ src = rm->rm_col[c].rc_abd;
+ dst = rm->rm_col[x].rc_abd;
+
+ if (c == rm->rm_firstdatacol) {
+ abd_copy(dst, src, size);
+ if (rm->rm_col[x].rc_size > size)
+ abd_zero_off(dst, size,
+ rm->rm_col[x].rc_size - size);
+ } else {
+ ASSERT3U(size, <=, rm->rm_col[x].rc_size);
+ (void) abd_iterate_func2(dst, src, 0, 0, size,
+ vdev_raidz_reconst_q_pre_func, NULL);
+ (void) abd_iterate_func(dst,
+ size, rm->rm_col[x].rc_size - size,
+ vdev_raidz_reconst_q_pre_tail_func, NULL);
+ }
+ }
+
+ src = rm->rm_col[VDEV_RAIDZ_Q].rc_abd;
+ dst = rm->rm_col[x].rc_abd;
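+	/*
+	 * 2^exp below is the multiplicative inverse of D_x's coefficient
+	 * 2^(rm_cols - 1 - x) in Q: the exponents sum to 255, and
+	 * 2^255 = 1 in this field.
+	 */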
+ exp = 255 - (rm->rm_cols - 1 - x);
+
+ struct reconst_q_struct rq = { abd_to_buf(src), exp };
+ (void) abd_iterate_func(dst, 0, rm->rm_col[x].rc_size,
+ vdev_raidz_reconst_q_post_func, &rq);
+
+ return (1 << VDEV_RAIDZ_Q);
+}
+
+static int
+vdev_raidz_reconstruct_pq(raidz_map_t *rm, int *tgts, int ntgts)
+{
+ uint8_t *p, *q, *pxy, *qxy, tmp, a, b, aexp, bexp;
+ abd_t *pdata, *qdata;
+ uint64_t xsize, ysize;
+ int x = tgts[0];
+ int y = tgts[1];
+ abd_t *xd, *yd;
+
+ ASSERT(ntgts == 2);
+ ASSERT(x < y);
+ ASSERT(x >= rm->rm_firstdatacol);
+ ASSERT(y < rm->rm_cols);
+
+ ASSERT(rm->rm_col[x].rc_size >= rm->rm_col[y].rc_size);
+
+ /*
+ * Move the parity data aside -- we're going to compute parity as
+ * though columns x and y were full of zeros -- Pxy and Qxy. We want to
+ * reuse the parity generation mechanism without trashing the actual
+ * parity so we make those columns appear to be full of zeros by
+ * setting their lengths to zero.
+ */
+ pdata = rm->rm_col[VDEV_RAIDZ_P].rc_abd;
+ qdata = rm->rm_col[VDEV_RAIDZ_Q].rc_abd;
+ xsize = rm->rm_col[x].rc_size;
+ ysize = rm->rm_col[y].rc_size;
+
+ rm->rm_col[VDEV_RAIDZ_P].rc_abd =
+ abd_alloc_linear(rm->rm_col[VDEV_RAIDZ_P].rc_size, B_TRUE);
+ rm->rm_col[VDEV_RAIDZ_Q].rc_abd =
+ abd_alloc_linear(rm->rm_col[VDEV_RAIDZ_Q].rc_size, B_TRUE);
+ rm->rm_col[x].rc_size = 0;
+ rm->rm_col[y].rc_size = 0;
+
+ vdev_raidz_generate_parity_pq(rm);
+
+ rm->rm_col[x].rc_size = xsize;
+ rm->rm_col[y].rc_size = ysize;
+
+ p = abd_to_buf(pdata);
+ q = abd_to_buf(qdata);
+ pxy = abd_to_buf(rm->rm_col[VDEV_RAIDZ_P].rc_abd);
+ qxy = abd_to_buf(rm->rm_col[VDEV_RAIDZ_Q].rc_abd);
+ xd = rm->rm_col[x].rc_abd;
+ yd = rm->rm_col[y].rc_abd;
+
+ /*
+ * We now have:
+ * Pxy = P + D_x + D_y
+ * Qxy = Q + 2^(ndevs - 1 - x) * D_x + 2^(ndevs - 1 - y) * D_y
+ *
+ * We can then solve for D_x:
+ * D_x = A * (P + Pxy) + B * (Q + Qxy)
+ * where
+ * A = 2^(x - y) * (2^(x - y) + 1)^-1
+ * B = 2^(ndevs - 1 - x) * (2^(x - y) + 1)^-1
+ *
+ * With D_x in hand, we can easily solve for D_y:
+ * D_y = P + Pxy + D_x
+ */
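+
+	/*
+	 * A and B above follow from eliminating D_y: substitute
+	 * D_y = (P + Pxy) + D_x into the Qxy equation and solve for D_x,
+	 * which divides by (2^(x - y) + 1); that factor is nonzero
+	 * because x != y.
+	 */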
+
+ a = vdev_raidz_pow2[255 + x - y];
+ b = vdev_raidz_pow2[255 - (rm->rm_cols - 1 - x)];
+ tmp = 255 - vdev_raidz_log2[a ^ 1];
+
+ aexp = vdev_raidz_log2[vdev_raidz_exp2(a, tmp)];
+ bexp = vdev_raidz_log2[vdev_raidz_exp2(b, tmp)];
+
+ ASSERT3U(xsize, >=, ysize);
+ struct reconst_pq_struct rpq = { p, q, pxy, qxy, aexp, bexp };
+ (void) abd_iterate_func2(xd, yd, 0, 0, ysize,
+ vdev_raidz_reconst_pq_func, &rpq);
+ (void) abd_iterate_func(xd, ysize, xsize - ysize,
+ vdev_raidz_reconst_pq_tail_func, &rpq);
+
+ abd_free(rm->rm_col[VDEV_RAIDZ_P].rc_abd);
+ abd_free(rm->rm_col[VDEV_RAIDZ_Q].rc_abd);
+
+ /*
+ * Restore the saved parity data.
+ */
+ rm->rm_col[VDEV_RAIDZ_P].rc_abd = pdata;
+ rm->rm_col[VDEV_RAIDZ_Q].rc_abd = qdata;
+
+ return ((1 << VDEV_RAIDZ_P) | (1 << VDEV_RAIDZ_Q));
+}
+
+/* BEGIN CSTYLED */
+/*
+ * In the general case of reconstruction, we must solve the system of linear
+ * equations defined by the coefficients used to generate parity as well as
+ * the contents of the data and parity disks. This can be expressed with
+ * vectors for the original data (D) and the actual data (d) and parity (p)
+ * and a matrix composed of the identity matrix (I) and a dispersal matrix (V):
+ *
+ * __ __ __ __
+ * | | __ __ | p_0 |
+ * | V | | D_0 | | p_m-1 |
+ * | | x | : | = | d_0 |
+ * | I | | D_n-1 | | : |
+ * | | ~~ ~~ | d_n-1 |
+ * ~~ ~~ ~~ ~~
+ *
+ * I is simply a square identity matrix of size n, and V is a Vandermonde
+ * matrix defined by the coefficients we chose for the various parity columns
+ * (1, 2, 4). Note that these values were chosen for simplicity, for speedy
+ * computation, and for linear separability.
+ *
+ * __ __ __ __
+ * | 1 .. 1 1 1 | | p_0 |
+ * | 2^n-1 .. 4 2 1 | __ __ | : |
+ * | 4^n-1 .. 16 4 1 | | D_0 | | p_m-1 |
+ * | 1 .. 0 0 0 | | D_1 | | d_0 |
+ * | 0 .. 0 0 0 | x | D_2 | = | d_1 |
+ * | : : : : | | : | | d_2 |
+ * | 0 .. 1 0 0 | | D_n-1 | | : |
+ * | 0 .. 0 1 0 | ~~ ~~ | : |
+ * | 0 .. 0 0 1 | | d_n-1 |
+ * ~~ ~~ ~~ ~~
+ *
+ * Note that I, V, d, and p are known. To compute D, we must invert the
+ * matrix and use the known data and parity values to reconstruct the unknown
+ * data values. We begin by removing the rows in V|I and d|p that correspond
+ * to failed or missing columns; we then make V|I square (n x n) and d|p
+ * sized n by removing rows corresponding to unused parity from the bottom up
+ * to generate (V|I)' and (d|p)'. We can then generate the inverse of (V|I)'
+ * using Gauss-Jordan elimination. In the example below we use m=3 parity
+ * columns, n=8 data columns, with errors in d_1, d_2, and p_1:
+ * __ __
+ * | 1 1 1 1 1 1 1 1 |
+ * | 128 64 32 16 8 4 2 1 | <-----+-+-- missing disks
+ * | 19 205 116 29 64 16 4 1 | / /
+ * | 1 0 0 0 0 0 0 0 | / /
+ * | 0 1 0 0 0 0 0 0 | <--' /
+ * (V|I) = | 0 0 1 0 0 0 0 0 | <---'
+ * | 0 0 0 1 0 0 0 0 |
+ * | 0 0 0 0 1 0 0 0 |
+ * | 0 0 0 0 0 1 0 0 |
+ * | 0 0 0 0 0 0 1 0 |
+ * | 0 0 0 0 0 0 0 1 |
+ * ~~ ~~
+ * __ __
+ * | 1 1 1 1 1 1 1 1 |
+ * | 19 205 116 29 64 16 4 1 |
+ * | 1 0 0 0 0 0 0 0 |
+ * (V|I)' = | 0 0 0 1 0 0 0 0 |
+ * | 0 0 0 0 1 0 0 0 |
+ * | 0 0 0 0 0 1 0 0 |
+ * | 0 0 0 0 0 0 1 0 |
+ * | 0 0 0 0 0 0 0 1 |
+ * ~~ ~~
+ *
+ * Here we employ Gauss-Jordan elimination to find the inverse of (V|I)'. We
+ * have carefully chosen the seed values 1, 2, and 4 to ensure that this
+ * matrix is not singular.
+ * __ __
+ * | 1 1 1 1 1 1 1 1 1 0 0 0 0 0 0 0 |
+ * | 19 205 116 29 64 16 4 1 0 1 0 0 0 0 0 0 |
+ * | 1 0 0 0 0 0 0 0 0 0 1 0 0 0 0 0 |
+ * | 0 0 0 1 0 0 0 0 0 0 0 1 0 0 0 0 |
+ * | 0 0 0 0 1 0 0 0 0 0 0 0 1 0 0 0 |
+ * | 0 0 0 0 0 1 0 0 0 0 0 0 0 1 0 0 |
+ * | 0 0 0 0 0 0 1 0 0 0 0 0 0 0 1 0 |
+ * | 0 0 0 0 0 0 0 1 0 0 0 0 0 0 0 1 |
+ * ~~ ~~
+ * __ __
+ * | 1 0 0 0 0 0 0 0 0 0 1 0 0 0 0 0 |
+ * | 1 1 1 1 1 1 1 1 1 0 0 0 0 0 0 0 |
+ * | 19 205 116 29 64 16 4 1 0 1 0 0 0 0 0 0 |
+ * | 0 0 0 1 0 0 0 0 0 0 0 1 0 0 0 0 |
+ * | 0 0 0 0 1 0 0 0 0 0 0 0 1 0 0 0 |
+ * | 0 0 0 0 0 1 0 0 0 0 0 0 0 1 0 0 |
+ * | 0 0 0 0 0 0 1 0 0 0 0 0 0 0 1 0 |
+ * | 0 0 0 0 0 0 0 1 0 0 0 0 0 0 0 1 |
+ * ~~ ~~
+ * __ __
+ * | 1 0 0 0 0 0 0 0 0 0 1 0 0 0 0 0 |
+ * | 0 1 1 0 0 0 0 0 1 0 1 1 1 1 1 1 |
+ * | 0 205 116 0 0 0 0 0 0 1 19 29 64 16 4 1 |
+ * | 0 0 0 1 0 0 0 0 0 0 0 1 0 0 0 0 |
+ * | 0 0 0 0 1 0 0 0 0 0 0 0 1 0 0 0 |
+ * | 0 0 0 0 0 1 0 0 0 0 0 0 0 1 0 0 |
+ * | 0 0 0 0 0 0 1 0 0 0 0 0 0 0 1 0 |
+ * | 0 0 0 0 0 0 0 1 0 0 0 0 0 0 0 1 |
+ * ~~ ~~
+ * __ __
+ * | 1 0 0 0 0 0 0 0 0 0 1 0 0 0 0 0 |
+ * | 0 1 1 0 0 0 0 0 1 0 1 1 1 1 1 1 |
+ * | 0 0 185 0 0 0 0 0 205 1 222 208 141 221 201 204 |
+ * | 0 0 0 1 0 0 0 0 0 0 0 1 0 0 0 0 |
+ * | 0 0 0 0 1 0 0 0 0 0 0 0 1 0 0 0 |
+ * | 0 0 0 0 0 1 0 0 0 0 0 0 0 1 0 0 |
+ * | 0 0 0 0 0 0 1 0 0 0 0 0 0 0 1 0 |
+ * | 0 0 0 0 0 0 0 1 0 0 0 0 0 0 0 1 |
+ * ~~ ~~
+ * __ __
+ * | 1 0 0 0 0 0 0 0 0 0 1 0 0 0 0 0 |
+ * | 0 1 1 0 0 0 0 0 1 0 1 1 1 1 1 1 |
+ * | 0 0 1 0 0 0 0 0 166 100 4 40 158 168 216 209 |
+ * | 0 0 0 1 0 0 0 0 0 0 0 1 0 0 0 0 |
+ * | 0 0 0 0 1 0 0 0 0 0 0 0 1 0 0 0 |
+ * | 0 0 0 0 0 1 0 0 0 0 0 0 0 1 0 0 |
+ * | 0 0 0 0 0 0 1 0 0 0 0 0 0 0 1 0 |
+ * | 0 0 0 0 0 0 0 1 0 0 0 0 0 0 0 1 |
+ * ~~ ~~
+ * __ __
+ * | 1 0 0 0 0 0 0 0 0 0 1 0 0 0 0 0 |
+ * | 0 1 0 0 0 0 0 0 167 100 5 41 159 169 217 208 |
+ * | 0 0 1 0 0 0 0 0 166 100 4 40 158 168 216 209 |
+ * | 0 0 0 1 0 0 0 0 0 0 0 1 0 0 0 0 |
+ * | 0 0 0 0 1 0 0 0 0 0 0 0 1 0 0 0 |
+ * | 0 0 0 0 0 1 0 0 0 0 0 0 0 1 0 0 |
+ * | 0 0 0 0 0 0 1 0 0 0 0 0 0 0 1 0 |
+ * | 0 0 0 0 0 0 0 1 0 0 0 0 0 0 0 1 |
+ * ~~ ~~
+ * __ __
+ * | 0 0 1 0 0 0 0 0 |
+ * | 167 100 5 41 159 169 217 208 |
+ * | 166 100 4 40 158 168 216 209 |
+ * (V|I)'^-1 = | 0 0 0 1 0 0 0 0 |
+ * | 0 0 0 0 1 0 0 0 |
+ * | 0 0 0 0 0 1 0 0 |
+ * | 0 0 0 0 0 0 1 0 |
+ * | 0 0 0 0 0 0 0 1 |
+ * ~~ ~~
+ *
+ * We can then simply compute D = (V|I)'^-1 x (d|p)' to discover the values
+ * of the missing data.
+ *
+ * As is apparent from the example above, the only non-trivial rows in the
+ * inverse matrix correspond to the data disks that we're trying to
+ * reconstruct. Indeed, those are the only rows we need as the others would
+ * only be useful for reconstructing data known or assumed to be valid. For
+ * that reason, we only build the coefficients in the rows that correspond to
+ * targeted columns.
+ */
+/* END CSTYLED */
+
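+/*
+ * All of the matrix arithmetic that follows is performed in GF(2^8) using
+ * the vdev_raidz_log2[] and vdev_raidz_pow2[] tables. As a rough sketch
+ * (a hypothetical helper, shown for illustration only), multiplication in
+ * that field reduces to adding discrete logarithms modulo 255:
+ *
+ *	static uint8_t
+ *	gf256_mul(uint8_t a, uint8_t b)
+ *	{
+ *		int log;
+ *
+ *		if (a == 0 || b == 0)
+ *			return (0);
+ *		log = vdev_raidz_log2[a] + vdev_raidz_log2[b];
+ *		if (log >= 255)
+ *			log -= 255;
+ *		return (vdev_raidz_pow2[log]);
+ *	}
+ *
+ * vdev_raidz_exp2() below applies the same idea with the second operand
+ * already in log form; addition and subtraction are both XOR.
+ */
+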
+static void
+vdev_raidz_matrix_init(raidz_map_t *rm, int n, int nmap, int *map,
+ uint8_t **rows)
+{
+ int i, j;
+ int pow;
+
+ ASSERT(n == rm->rm_cols - rm->rm_firstdatacol);
+
+ /*
+ * Fill in the missing rows of interest.
+ */
+ for (i = 0; i < nmap; i++) {
+ ASSERT3S(0, <=, map[i]);
+ ASSERT3S(map[i], <=, 2);
+
+ pow = map[i] * n;
+ if (pow > 255)
+ pow -= 255;
+ ASSERT(pow <= 255);
+
+ for (j = 0; j < n; j++) {
+ pow -= map[i];
+ if (pow < 0)
+ pow += 255;
+ rows[i][j] = vdev_raidz_pow2[pow];
+ }
+ }
+}
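+
+/*
+ * For example (illustrative, with n = 4 data columns): for the row of V
+ * with coefficient 2, the loop above starts at pow = 1 * 4 and steps the
+ * exponent down once per data column, yielding
+ * { 2^3, 2^2, 2^1, 2^0 } = { 8, 4, 2, 1 }; for coefficient 4 it yields
+ * { 4^3, 4^2, 4^1, 4^0 } = { 64, 16, 4, 1 }, matching the rows of V in
+ * the comment above.
+ */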
+
+static void
+vdev_raidz_matrix_invert(raidz_map_t *rm, int n, int nmissing, int *missing,
+ uint8_t **rows, uint8_t **invrows, const uint8_t *used)
+{
+ int i, j, ii, jj;
+ uint8_t log;
+
+ /*
+ * Assert that the first nmissing entries from the array of used
+ * columns correspond to parity columns and that subsequent entries
+ * correspond to data columns.
+ */
+ for (i = 0; i < nmissing; i++) {
+ ASSERT3S(used[i], <, rm->rm_firstdatacol);
+ }
+ for (; i < n; i++) {
+ ASSERT3S(used[i], >=, rm->rm_firstdatacol);
+ }
+
+ /*
+ * First initialize the storage where we'll compute the inverse rows.
+ */
+ for (i = 0; i < nmissing; i++) {
+ for (j = 0; j < n; j++) {
+ invrows[i][j] = (i == j) ? 1 : 0;
+ }
+ }
+
+ /*
+ * Subtract all trivial rows from the rows of consequence.
+ */
+ for (i = 0; i < nmissing; i++) {
+ for (j = nmissing; j < n; j++) {
+ ASSERT3U(used[j], >=, rm->rm_firstdatacol);
+ jj = used[j] - rm->rm_firstdatacol;
+ ASSERT3S(jj, <, n);
+ invrows[i][j] = rows[i][jj];
+ rows[i][jj] = 0;
+ }
+ }
+
+ /*
+ * For each of the rows of interest, we must normalize it and subtract
+ * a multiple of it from the other rows.
+ */
+ for (i = 0; i < nmissing; i++) {
+ for (j = 0; j < missing[i]; j++) {
+ ASSERT0(rows[i][j]);
+ }
+ ASSERT3U(rows[i][missing[i]], !=, 0);
+
+ /*
+ * Compute the inverse of the first element and multiply each
+ * element in the row by that value.
+ */
+ log = 255 - vdev_raidz_log2[rows[i][missing[i]]];
+
+ for (j = 0; j < n; j++) {
+ rows[i][j] = vdev_raidz_exp2(rows[i][j], log);
+ invrows[i][j] = vdev_raidz_exp2(invrows[i][j], log);
+ }
+
+ for (ii = 0; ii < nmissing; ii++) {
+ if (i == ii)
+ continue;
+
+ ASSERT3U(rows[ii][missing[i]], !=, 0);
+
+ log = vdev_raidz_log2[rows[ii][missing[i]]];
+
+ for (j = 0; j < n; j++) {
+ rows[ii][j] ^=
+ vdev_raidz_exp2(rows[i][j], log);
+ invrows[ii][j] ^=
+ vdev_raidz_exp2(invrows[i][j], log);
+ }
+ }
+ }
+
+ /*
+	 * Verify that the data left in the rows forms, as expected, part of
+	 * an identity matrix.
+ */
+ for (i = 0; i < nmissing; i++) {
+ for (j = 0; j < n; j++) {
+ if (j == missing[i]) {
+ ASSERT3U(rows[i][j], ==, 1);
+ } else {
+ ASSERT0(rows[i][j]);
+ }
+ }
+ }
+}
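+
+/*
+ * Two field-specific details of the elimination above are worth noting.
+ * Since every nonzero x in GF(2^8) satisfies x^255 = 1, the multiplicative
+ * inverse of a pivot is obtained purely with exponent arithmetic:
+ * multiplying by vdev_raidz_exp2(.., 255 - vdev_raidz_log2[pivot]) scales
+ * a row by 1/pivot. And since the field has characteristic 2,
+ * "subtracting" a multiple of one row from another is an XOR, which is
+ * why the row updates use ^=.
+ */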
+
+static void
+vdev_raidz_matrix_reconstruct(raidz_map_t *rm, int n, int nmissing,
+ int *missing, uint8_t **invrows, const uint8_t *used)
+{
+ int i, j, x, cc, c;
+ uint8_t *src;
+ uint64_t ccount;
+ uint8_t *dst[VDEV_RAIDZ_MAXPARITY];
+ uint64_t dcount[VDEV_RAIDZ_MAXPARITY];
+ uint8_t log = 0;
+ uint8_t val;
+ int ll;
+ uint8_t *invlog[VDEV_RAIDZ_MAXPARITY];
+ uint8_t *p, *pp;
+ size_t psize;
+
+ psize = sizeof (invlog[0][0]) * n * nmissing;
+ p = kmem_alloc(psize, KM_SLEEP);
+
+ for (pp = p, i = 0; i < nmissing; i++) {
+ invlog[i] = pp;
+ pp += n;
+ }
+
+ for (i = 0; i < nmissing; i++) {
+ for (j = 0; j < n; j++) {
+ ASSERT3U(invrows[i][j], !=, 0);
+ invlog[i][j] = vdev_raidz_log2[invrows[i][j]];
+ }
+ }
+
+ for (i = 0; i < n; i++) {
+ c = used[i];
+ ASSERT3U(c, <, rm->rm_cols);
+
+ src = abd_to_buf(rm->rm_col[c].rc_abd);
+ ccount = rm->rm_col[c].rc_size;
+ for (j = 0; j < nmissing; j++) {
+ cc = missing[j] + rm->rm_firstdatacol;
+ ASSERT3U(cc, >=, rm->rm_firstdatacol);
+ ASSERT3U(cc, <, rm->rm_cols);
+ ASSERT3U(cc, !=, c);
+
+ dst[j] = abd_to_buf(rm->rm_col[cc].rc_abd);
+ dcount[j] = rm->rm_col[cc].rc_size;
+ }
+
+ ASSERT(ccount >= rm->rm_col[missing[0]].rc_size || i > 0);
+
+ for (x = 0; x < ccount; x++, src++) {
+ if (*src != 0)
+ log = vdev_raidz_log2[*src];
+
+ for (cc = 0; cc < nmissing; cc++) {
+ if (x >= dcount[cc])
+ continue;
+
+ if (*src == 0) {
+ val = 0;
+ } else {
+ if ((ll = log + invlog[cc][i]) >= 255)
+ ll -= 255;
+ val = vdev_raidz_pow2[ll];
+ }
+
+ if (i == 0)
+ dst[cc][x] = val;
+ else
+ dst[cc][x] ^= val;
+ }
+ }
+ }
+
+ kmem_free(p, psize);
+}
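+
+/*
+ * In effect, the function above computes D = (V|I)'^-1 x (d|p)' one source
+ * column at a time: for source column i and missing column cc, each byte
+ * contributes the GF(2^8) product invrows[cc][i] * src[x], accumulated
+ * into dst[cc][x] with XOR. Keeping the inverse-row entries in log form
+ * (invlog) saves one table lookup per byte processed.
+ */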
+
+static int
+vdev_raidz_reconstruct_general(raidz_map_t *rm, int *tgts, int ntgts)
+{
+ int n, i, c, t, tt;
+ int nmissing_rows;
+ int missing_rows[VDEV_RAIDZ_MAXPARITY];
+ int parity_map[VDEV_RAIDZ_MAXPARITY];
+
+ uint8_t *p, *pp;
+ size_t psize;
+
+ uint8_t *rows[VDEV_RAIDZ_MAXPARITY];
+ uint8_t *invrows[VDEV_RAIDZ_MAXPARITY];
+ uint8_t *used;
+
+ abd_t **bufs = NULL;
+
+ int code = 0;
+
+ /*
+ * Matrix reconstruction can't use scatter ABDs yet, so we allocate
+ * temporary linear ABDs.
+ */
+ if (!abd_is_linear(rm->rm_col[rm->rm_firstdatacol].rc_abd)) {
+ bufs = kmem_alloc(rm->rm_cols * sizeof (abd_t *), KM_PUSHPAGE);
+
+ for (c = rm->rm_firstdatacol; c < rm->rm_cols; c++) {
+ raidz_col_t *col = &rm->rm_col[c];
+
+ bufs[c] = col->rc_abd;
+ col->rc_abd = abd_alloc_linear(col->rc_size, B_TRUE);
+ abd_copy(col->rc_abd, bufs[c], col->rc_size);
+ }
+ }
+
+ n = rm->rm_cols - rm->rm_firstdatacol;
+
+ /*
+ * Figure out which data columns are missing.
+ */
+ nmissing_rows = 0;
+ for (t = 0; t < ntgts; t++) {
+ if (tgts[t] >= rm->rm_firstdatacol) {
+ missing_rows[nmissing_rows++] =
+ tgts[t] - rm->rm_firstdatacol;
+ }
+ }
+
+ /*
+ * Figure out which parity columns to use to help generate the missing
+ * data columns.
+ */
+ for (tt = 0, c = 0, i = 0; i < nmissing_rows; c++) {
+ ASSERT(tt < ntgts);
+ ASSERT(c < rm->rm_firstdatacol);
+
+ /*
+ * Skip any targeted parity columns.
+ */
+ if (c == tgts[tt]) {
+ tt++;
+ continue;
+ }
+
+ code |= 1 << c;
+
+ parity_map[i] = c;
+ i++;
+ }
+
+ ASSERT(code != 0);
+ ASSERT3U(code, <, 1 << VDEV_RAIDZ_MAXPARITY);
+
+ psize = (sizeof (rows[0][0]) + sizeof (invrows[0][0])) *
+ nmissing_rows * n + sizeof (used[0]) * n;
+ p = kmem_alloc(psize, KM_SLEEP);
+
+ for (pp = p, i = 0; i < nmissing_rows; i++) {
+ rows[i] = pp;
+ pp += n;
+ invrows[i] = pp;
+ pp += n;
+ }
+ used = pp;
+
+ for (i = 0; i < nmissing_rows; i++) {
+ used[i] = parity_map[i];
+ }
+
+ for (tt = 0, c = rm->rm_firstdatacol; c < rm->rm_cols; c++) {
+ if (tt < nmissing_rows &&
+ c == missing_rows[tt] + rm->rm_firstdatacol) {
+ tt++;
+ continue;
+ }
+
+ ASSERT3S(i, <, n);
+ used[i] = c;
+ i++;
+ }
+
+ /*
+ * Initialize the interesting rows of the matrix.
+ */
+ vdev_raidz_matrix_init(rm, n, nmissing_rows, parity_map, rows);
+
+ /*
+ * Invert the matrix.
+ */
+ vdev_raidz_matrix_invert(rm, n, nmissing_rows, missing_rows, rows,
+ invrows, used);
+
+ /*
+ * Reconstruct the missing data using the generated matrix.
+ */
+ vdev_raidz_matrix_reconstruct(rm, n, nmissing_rows, missing_rows,
+ invrows, used);
+
+ kmem_free(p, psize);
+
+ /*
+	 * Copy back from the temporary linear ABDs and free them.
+ */
+ if (bufs) {
+ for (c = rm->rm_firstdatacol; c < rm->rm_cols; c++) {
+ raidz_col_t *col = &rm->rm_col[c];
+
+ abd_copy(bufs[c], col->rc_abd, col->rc_size);
+ abd_free(col->rc_abd);
+ col->rc_abd = bufs[c];
+ }
+ kmem_free(bufs, rm->rm_cols * sizeof (abd_t *));
+ }
+
+ return (code);
+}
+
+static int
+vdev_raidz_reconstruct(raidz_map_t *rm, int *t, int nt)
+{
+ int tgts[VDEV_RAIDZ_MAXPARITY], *dt;
+ int ntgts;
+ int i, c;
+ int code;
+ int nbadparity, nbaddata;
+ int parity_valid[VDEV_RAIDZ_MAXPARITY];
+
+ /*
+ * The tgts list must already be sorted.
+ */
+ for (i = 1; i < nt; i++) {
+ ASSERT(t[i] > t[i - 1]);
+ }
+
+ nbadparity = rm->rm_firstdatacol;
+ nbaddata = rm->rm_cols - nbadparity;
+ ntgts = 0;
+ for (i = 0, c = 0; c < rm->rm_cols; c++) {
+ if (c < rm->rm_firstdatacol)
+ parity_valid[c] = B_FALSE;
+
+ if (i < nt && c == t[i]) {
+ tgts[ntgts++] = c;
+ i++;
+ } else if (rm->rm_col[c].rc_error != 0) {
+ tgts[ntgts++] = c;
+ } else if (c >= rm->rm_firstdatacol) {
+ nbaddata--;
+ } else {
+ parity_valid[c] = B_TRUE;
+ nbadparity--;
+ }
+ }
+
+ ASSERT(ntgts >= nt);
+ ASSERT(nbaddata >= 0);
+ ASSERT(nbaddata + nbadparity == ntgts);
+
+ dt = &tgts[nbadparity];
+
+ /*
+ * See if we can use any of our optimized reconstruction routines.
+ */
+ if (!vdev_raidz_default_to_general) {
+ switch (nbaddata) {
+ case 1:
+ if (parity_valid[VDEV_RAIDZ_P])
+ return (vdev_raidz_reconstruct_p(rm, dt, 1));
+
+ ASSERT(rm->rm_firstdatacol > 1);
+
+ if (parity_valid[VDEV_RAIDZ_Q])
+ return (vdev_raidz_reconstruct_q(rm, dt, 1));
+
+ ASSERT(rm->rm_firstdatacol > 2);
+ break;
+
+ case 2:
+ ASSERT(rm->rm_firstdatacol > 1);
+
+ if (parity_valid[VDEV_RAIDZ_P] &&
+ parity_valid[VDEV_RAIDZ_Q])
+ return (vdev_raidz_reconstruct_pq(rm, dt, 2));
+
+ ASSERT(rm->rm_firstdatacol > 2);
+
+ break;
+ }
+ }
+
+ code = vdev_raidz_reconstruct_general(rm, tgts, ntgts);
+ ASSERT(code < (1 << VDEV_RAIDZ_MAXPARITY));
+ ASSERT(code > 0);
+ return (code);
+}
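+
+/*
+ * The value returned above is a bitmask of the parity columns used for the
+ * reconstruction: bit 0 for P, bit 1 for Q, bit 2 for R. For example,
+ * repairing two data columns on raidz2 requires both P and Q, so
+ * vdev_raidz_reconstruct_general() returns (1 << 0) | (1 << 1) = 3.
+ */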
+
+static int
+vdev_raidz_open(vdev_t *vd, uint64_t *asize, uint64_t *max_asize,
+ uint64_t *logical_ashift, uint64_t *physical_ashift)
+{
+ vdev_t *cvd;
+ uint64_t nparity = vd->vdev_nparity;
+ int c;
+ int lasterror = 0;
+ int numerrors = 0;
+
+ ASSERT(nparity > 0);
+
+ if (nparity > VDEV_RAIDZ_MAXPARITY ||
+ vd->vdev_children < nparity + 1) {
+ vd->vdev_stat.vs_aux = VDEV_AUX_BAD_LABEL;
+ return (SET_ERROR(EINVAL));
+ }
+
+ vdev_open_children(vd);
+
+ for (c = 0; c < vd->vdev_children; c++) {
+ cvd = vd->vdev_child[c];
+
+ if (cvd->vdev_open_error != 0) {
+ lasterror = cvd->vdev_open_error;
+ numerrors++;
+ continue;
+ }
+
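+		/*
+		 * Both *asize and *max_asize start out as zero (as passed in
+		 * by vdev_open()), so the -1/+1 arithmetic below deliberately
+		 * relies on unsigned underflow: the first healthy child seeds
+		 * the running minimum and later children can only shrink it.
+		 */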
+ *asize = MIN(*asize - 1, cvd->vdev_asize - 1) + 1;
+ *max_asize = MIN(*max_asize - 1, cvd->vdev_max_asize - 1) + 1;
+ *logical_ashift = MAX(*logical_ashift, cvd->vdev_ashift);
+ *physical_ashift = MAX(*physical_ashift,
+ cvd->vdev_physical_ashift);
+ }
+
+ *asize *= vd->vdev_children;
+ *max_asize *= vd->vdev_children;
+
+ if (numerrors > nparity) {
+ vd->vdev_stat.vs_aux = VDEV_AUX_NO_REPLICAS;
+ return (lasterror);
+ }
+
+ return (0);
+}
+
+static void
+vdev_raidz_close(vdev_t *vd)
+{
+ int c;
+
+ for (c = 0; c < vd->vdev_children; c++)
+ vdev_close(vd->vdev_child[c]);
+}
+
+#ifdef illumos
+/*
+ * Handle a read or write I/O to a RAID-Z dump device.
+ *
+ * The dump device is in a unique situation compared to other ZFS datasets:
+ * writing to this device should be as simple and fast as possible. In
+ * addition, durability matters much less since the dump will be extracted
+ * once the machine reboots. For that reason, this function eschews parity for
+ * performance and simplicity. The dump device uses the checksum setting
+ * ZIO_CHECKSUM_NOPARITY to indicate that parity is not maintained for this
+ * dataset.
+ *
+ * Blocks of size 128 KB have been preallocated for this volume. I/Os less than
+ * 128 KB will not fill an entire block; in addition, they may not be properly
+ * aligned. In that case, this function uses the preallocated 128 KB block and
+ * omits reading or writing any "empty" portions of that block, as opposed to
+ * allocating a fresh appropriately-sized block.
+ *
+ * Looking at an example of a 32 KB I/O to a RAID-Z vdev with 5 child vdevs:
+ *
+ * vdev_raidz_io_start(data, size: 32 KB, offset: 64 KB)
+ *
+ * If this were a standard RAID-Z dataset, a block of at least 40 KB would be
+ * allocated which spans all five child vdevs. 8 KB of data would be written to
+ * each of four vdevs, with the fifth containing the parity bits.
+ *
+ * parity data data data data
+ * | PP | XX | XX | XX | XX |
+ * ^ ^ ^ ^ ^
+ * | | | | |
+ * 8 KB parity ------8 KB data blocks------
+ *
+ * However, when writing to the dump device, the behavior is different:
+ *
+ * vdev_raidz_physio(data, size: 32 KB, offset: 64 KB)
+ *
+ * Unlike the normal RAID-Z case in which the block is allocated based on the
+ * I/O size, reads and writes here always use a 128 KB logical I/O size. If the
+ * I/O size is less than 128 KB, only the actual portions of data are written.
+ * In this example the data is written to the third data vdev since that vdev
+ * contains the offset [64 KB, 96 KB).
+ *
+ * parity data data data data
+ * | | | | XX | |
+ * ^
+ * |
+ * 32 KB data block
+ *
+ * As a result, an individual I/O may not span all child vdevs; moreover, a
+ * small I/O may only operate on a single child vdev.
+ *
+ * Note that since there are no parity bits calculated or written, this format
+ * remains the same no matter how many parity bits are used in a normal RAID-Z
+ * stripe. On a RAID-Z3 configuration with seven child vdevs, the example above
+ * would look like:
+ *
+ * parity parity parity data data data data
+ * | | | | | | XX | |
+ * ^
+ * |
+ * 32 KB data block
+ */
+int
+vdev_raidz_physio(vdev_t *vd, caddr_t data, size_t size,
+ uint64_t offset, uint64_t origoffset, boolean_t doread, boolean_t isdump)
+{
+ vdev_t *tvd = vd->vdev_top;
+ vdev_t *cvd;
+ raidz_map_t *rm;
+ raidz_col_t *rc;
+ int c, err = 0;
+
+ uint64_t start, end, colstart, colend;
+ uint64_t coloffset, colsize, colskip;
+
+ int flags = doread ? BIO_READ : BIO_WRITE;
+
+#ifdef _KERNEL
+
+ /*
+ * Don't write past the end of the block
+ */
+ VERIFY3U(offset + size, <=, origoffset + SPA_OLD_MAXBLOCKSIZE);
+
+ start = offset;
+ end = start + size;
+
+ /*
+ * Allocate a RAID-Z map for this block. Note that this block starts
+ * from the "original" offset, this is, the offset of the extent which
+ * contains the requisite offset of the data being read or written.
+ *
+	 * Even if this I/O operation doesn't span the full block size, treat
+	 * the on-disk format as if it consisted solely of complete 128 KB
+	 * blocks.
+ */
+ abd_t *abd = abd_get_from_buf(data - (offset - origoffset),
+ SPA_OLD_MAXBLOCKSIZE);
+ rm = vdev_raidz_map_alloc(abd,
+ SPA_OLD_MAXBLOCKSIZE, origoffset, B_FALSE, tvd->vdev_ashift,
+ vd->vdev_children, vd->vdev_nparity);
+
+ coloffset = origoffset;
+
+ for (c = rm->rm_firstdatacol; c < rm->rm_cols;
+ c++, coloffset += rc->rc_size) {
+ rc = &rm->rm_col[c];
+ cvd = vd->vdev_child[rc->rc_devidx];
+
+ /*
+ * Find the start and end of this column in the RAID-Z map,
+ * keeping in mind that the stated size and offset of the
+ * operation may not fill the entire column for this vdev.
+ *
+ * If any portion of the data spans this column, issue the
+ * appropriate operation to the vdev.
+ */
+ if (coloffset + rc->rc_size <= start)
+ continue;
+ if (coloffset >= end)
+ continue;
+
+ colstart = MAX(coloffset, start);
+ colend = MIN(end, coloffset + rc->rc_size);
+ colsize = colend - colstart;
+ colskip = colstart - coloffset;
+
+ VERIFY3U(colsize, <=, rc->rc_size);
+ VERIFY3U(colskip, <=, rc->rc_size);
+
+ /*
+ * Note that the child vdev will have a vdev label at the start
+ * of its range of offsets, hence the need for
+ * VDEV_LABEL_OFFSET(). See zio_vdev_child_io() for another
+ * example of why this calculation is needed.
+ */
+ if ((err = vdev_disk_physio(cvd,
+ ((char *)abd_to_buf(rc->rc_abd)) + colskip, colsize,
+ VDEV_LABEL_OFFSET(rc->rc_offset) + colskip,
+ flags, isdump)) != 0)
+ break;
+ }
+
+ vdev_raidz_map_free(rm);
+ abd_put(abd);
+#endif /* _KERNEL */
+
+ return (err);
+}
+#endif
+
+static uint64_t
+vdev_raidz_asize(vdev_t *vd, uint64_t psize)
+{
+ uint64_t asize;
+ uint64_t ashift = vd->vdev_top->vdev_ashift;
+ uint64_t cols = vd->vdev_children;
+ uint64_t nparity = vd->vdev_nparity;
+
+ asize = ((psize - 1) >> ashift) + 1;
+ asize += nparity * ((asize + cols - nparity - 1) / (cols - nparity));
+ asize = roundup(asize, nparity + 1) << ashift;
+
+ return (asize);
+}
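+
+/*
+ * Worked example (illustrative): for the 32 KB write on a 5-wide raidz1
+ * described in the dump-device comment above, with ashift = 9 (512-byte
+ * sectors), psize yields 64 data sectors; parity adds
+ * 1 * ceil(64 / 4) = 16 sectors; and rounding up to a multiple of
+ * (nparity + 1) = 2 leaves 80 sectors, i.e. the 40 KB allocation noted
+ * there.
+ */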
+
+static void
+vdev_raidz_child_done(zio_t *zio)
+{
+ raidz_col_t *rc = zio->io_private;
+
+ rc->rc_error = zio->io_error;
+ rc->rc_tried = 1;
+ rc->rc_skipped = 0;
+}
+
+static void
+vdev_raidz_io_verify(zio_t *zio, raidz_map_t *rm, int col)
+{
+#ifdef ZFS_DEBUG
+ vdev_t *vd = zio->io_vd;
+ vdev_t *tvd = vd->vdev_top;
+
+ range_seg_t logical_rs, physical_rs;
+ logical_rs.rs_start = zio->io_offset;
+ logical_rs.rs_end = logical_rs.rs_start +
+ vdev_raidz_asize(zio->io_vd, zio->io_size);
+
+ raidz_col_t *rc = &rm->rm_col[col];
+ vdev_t *cvd = vd->vdev_child[rc->rc_devidx];
+
+ vdev_xlate(cvd, &logical_rs, &physical_rs);
+ ASSERT3U(rc->rc_offset, ==, physical_rs.rs_start);
+ ASSERT3U(rc->rc_offset, <, physical_rs.rs_end);
+ /*
+ * It would be nice to assert that rs_end is equal
+ * to rc_offset + rc_size but there might be an
+ * optional I/O at the end that is not accounted in
+ * rc_size.
+ */
+ if (physical_rs.rs_end > rc->rc_offset + rc->rc_size) {
+ ASSERT3U(physical_rs.rs_end, ==, rc->rc_offset +
+ rc->rc_size + (1 << tvd->vdev_ashift));
+ } else {
+ ASSERT3U(physical_rs.rs_end, ==, rc->rc_offset + rc->rc_size);
+ }
+#endif
+}
+
+/*
+ * Start an IO operation on a RAIDZ VDev
+ *
+ * Outline:
+ * - For write operations:
+ * 1. Generate the parity data
+ * 2. Create child zio write operations to each column's vdev, for both
+ * data and parity.
+ * 3. If the column skips any sectors for padding, create optional dummy
+ * write zio children for those areas to improve aggregation contiguity.
+ * - For read operations:
+ * 1. Create child zio read operations to each data column's vdev to read
+ * the range of data required for zio.
+ * 2. If this is a scrub or resilver operation, or if any of the data
+ * vdevs have had errors, then create zio read operations to the parity
+ * columns' VDevs as well.
+ */
+static void
+vdev_raidz_io_start(zio_t *zio)
+{
+ vdev_t *vd = zio->io_vd;
+ vdev_t *tvd = vd->vdev_top;
+ vdev_t *cvd;
+ raidz_map_t *rm;
+ raidz_col_t *rc;
+ int c, i;
+
+ rm = vdev_raidz_map_alloc(zio->io_abd, zio->io_size, zio->io_offset,
+ zio->io_type == ZIO_TYPE_FREE,
+ tvd->vdev_ashift, vd->vdev_children,
+ vd->vdev_nparity);
+
+ zio->io_vsd = rm;
+ zio->io_vsd_ops = &vdev_raidz_vsd_ops;
+
+ ASSERT3U(rm->rm_asize, ==, vdev_psize_to_asize(vd, zio->io_size));
+
+ if (zio->io_type == ZIO_TYPE_FREE) {
+ for (c = 0; c < rm->rm_cols; c++) {
+ rc = &rm->rm_col[c];
+ cvd = vd->vdev_child[rc->rc_devidx];
+ zio_nowait(zio_vdev_child_io(zio, NULL, cvd,
+ rc->rc_offset, rc->rc_abd, rc->rc_size,
+ zio->io_type, zio->io_priority, 0,
+ vdev_raidz_child_done, rc));
+ }
+
+ zio_execute(zio);
+ return;
+ }
+
+ if (zio->io_type == ZIO_TYPE_WRITE) {
+ vdev_raidz_generate_parity(rm);
+
+ for (c = 0; c < rm->rm_cols; c++) {
+ rc = &rm->rm_col[c];
+ cvd = vd->vdev_child[rc->rc_devidx];
+
+ /*
+ * Verify physical to logical translation.
+ */
+ vdev_raidz_io_verify(zio, rm, c);
+
+ zio_nowait(zio_vdev_child_io(zio, NULL, cvd,
+ rc->rc_offset, rc->rc_abd, rc->rc_size,
+ zio->io_type, zio->io_priority, 0,
+ vdev_raidz_child_done, rc));
+ }
+
+ /*
+ * Generate optional I/Os for any skipped sectors to improve
+ * aggregation contiguity.
+ */
+ for (c = rm->rm_skipstart, i = 0; i < rm->rm_nskip; c++, i++) {
+ ASSERT(c <= rm->rm_scols);
+ if (c == rm->rm_scols)
+ c = 0;
+ rc = &rm->rm_col[c];
+ cvd = vd->vdev_child[rc->rc_devidx];
+ zio_nowait(zio_vdev_child_io(zio, NULL, cvd,
+ rc->rc_offset + rc->rc_size, NULL,
+ 1 << tvd->vdev_ashift,
+ zio->io_type, zio->io_priority,
+ ZIO_FLAG_NODATA | ZIO_FLAG_OPTIONAL, NULL, NULL));
+ }
+
+ zio_execute(zio);
+ return;
+ }
+
+ ASSERT3U(zio->io_type, ==, ZIO_TYPE_READ);
+
+ /*
+ * Iterate over the columns in reverse order so that we hit the parity
+ * last -- any errors along the way will force us to read the parity.
+ */
+ for (c = rm->rm_cols - 1; c >= 0; c--) {
+ rc = &rm->rm_col[c];
+ cvd = vd->vdev_child[rc->rc_devidx];
+ if (!vdev_readable(cvd)) {
+ if (c >= rm->rm_firstdatacol)
+ rm->rm_missingdata++;
+ else
+ rm->rm_missingparity++;
+ rc->rc_error = SET_ERROR(ENXIO);
+ rc->rc_tried = 1; /* don't even try */
+ rc->rc_skipped = 1;
+ continue;
+ }
+ if (vdev_dtl_contains(cvd, DTL_MISSING, zio->io_txg, 1)) {
+ if (c >= rm->rm_firstdatacol)
+ rm->rm_missingdata++;
+ else
+ rm->rm_missingparity++;
+ rc->rc_error = SET_ERROR(ESTALE);
+ rc->rc_skipped = 1;
+ continue;
+ }
+ if (c >= rm->rm_firstdatacol || rm->rm_missingdata > 0 ||
+ (zio->io_flags & (ZIO_FLAG_SCRUB | ZIO_FLAG_RESILVER))) {
+ zio_nowait(zio_vdev_child_io(zio, NULL, cvd,
+ rc->rc_offset, rc->rc_abd, rc->rc_size,
+ zio->io_type, zio->io_priority, 0,
+ vdev_raidz_child_done, rc));
+ }
+ }
+
+ zio_execute(zio);
+}
+
+
+/*
+ * Report a checksum error for a child of a RAID-Z device.
+ */
+static void
+raidz_checksum_error(zio_t *zio, raidz_col_t *rc, void *bad_data)
+{
+ void *buf;
+ vdev_t *vd = zio->io_vd->vdev_child[rc->rc_devidx];
+
+ if (!(zio->io_flags & ZIO_FLAG_SPECULATIVE)) {
+ zio_bad_cksum_t zbc;
+ raidz_map_t *rm = zio->io_vsd;
+
+ mutex_enter(&vd->vdev_stat_lock);
+ vd->vdev_stat.vs_checksum_errors++;
+ mutex_exit(&vd->vdev_stat_lock);
+
+ zbc.zbc_has_cksum = 0;
+ zbc.zbc_injected = rm->rm_ecksuminjected;
+
+ buf = abd_borrow_buf_copy(rc->rc_abd, rc->rc_size);
+ zfs_ereport_post_checksum(zio->io_spa, vd, zio,
+ rc->rc_offset, rc->rc_size, buf, bad_data,
+ &zbc);
+ abd_return_buf(rc->rc_abd, buf, rc->rc_size);
+ }
+}
+
+/*
+ * We keep track of whether or not there were any injected errors, so that
+ * any ereports we generate can note it.
+ */
+static int
+raidz_checksum_verify(zio_t *zio)
+{
+ zio_bad_cksum_t zbc;
+ raidz_map_t *rm = zio->io_vsd;
+
+ int ret = zio_checksum_error(zio, &zbc);
+ if (ret != 0 && zbc.zbc_injected != 0)
+ rm->rm_ecksuminjected = 1;
+
+ return (ret);
+}
+
+/*
+ * Generate the parity from the data columns. If we tried and were able to
+ * read the parity without error, verify that the generated parity matches the
+ * data we read. If it doesn't, we fire off a checksum error. Return the
+ * number of such failures.
+ */
+static int
+raidz_parity_verify(zio_t *zio, raidz_map_t *rm)
+{
+ void *orig[VDEV_RAIDZ_MAXPARITY];
+ int c, ret = 0;
+ raidz_col_t *rc;
+
+ blkptr_t *bp = zio->io_bp;
+ enum zio_checksum checksum = (bp == NULL ? zio->io_prop.zp_checksum :
+ (BP_IS_GANG(bp) ? ZIO_CHECKSUM_GANG_HEADER : BP_GET_CHECKSUM(bp)));
+
+ if (checksum == ZIO_CHECKSUM_NOPARITY)
+ return (ret);
+
+ for (c = 0; c < rm->rm_firstdatacol; c++) {
+ rc = &rm->rm_col[c];
+ if (!rc->rc_tried || rc->rc_error != 0)
+ continue;
+ orig[c] = zio_buf_alloc(rc->rc_size);
+ abd_copy_to_buf(orig[c], rc->rc_abd, rc->rc_size);
+ }
+
+ vdev_raidz_generate_parity(rm);
+
+ for (c = 0; c < rm->rm_firstdatacol; c++) {
+ rc = &rm->rm_col[c];
+ if (!rc->rc_tried || rc->rc_error != 0)
+ continue;
+ if (abd_cmp_buf(rc->rc_abd, orig[c], rc->rc_size) != 0) {
+ raidz_checksum_error(zio, rc, orig[c]);
+ rc->rc_error = SET_ERROR(ECKSUM);
+ ret++;
+ }
+ zio_buf_free(orig[c], rc->rc_size);
+ }
+
+ return (ret);
+}
+
+/*
+ * Keep statistics on all the ways that we used parity to correct data.
+ */
+static uint64_t raidz_corrected[1 << VDEV_RAIDZ_MAXPARITY];
+
+static int
+vdev_raidz_worst_error(raidz_map_t *rm)
+{
+ int error = 0;
+
+ for (int c = 0; c < rm->rm_cols; c++)
+ error = zio_worst_error(error, rm->rm_col[c].rc_error);
+
+ return (error);
+}
+
+/*
+ * Iterate over all combinations of bad data and attempt a reconstruction.
+ * Note that the algorithm below is non-optimal because it doesn't take into
+ * account how reconstruction is actually performed. For example, with
+ * triple-parity RAID-Z the reconstruction procedure is the same if column 4
+ * is targeted as invalid as if columns 1 and 4 are targeted since in both
+ * cases we'd only use parity information in column 0.
+ */
+static int
+vdev_raidz_combrec(zio_t *zio, int total_errors, int data_errors)
+{
+ raidz_map_t *rm = zio->io_vsd;
+ raidz_col_t *rc;
+ void *orig[VDEV_RAIDZ_MAXPARITY];
+ int tstore[VDEV_RAIDZ_MAXPARITY + 2];
+ int *tgts = &tstore[1];
+ int current, next, i, c, n;
+ int code, ret = 0;
+
+ ASSERT(total_errors < rm->rm_firstdatacol);
+
+ /*
+ * This simplifies one edge condition.
+ */
+ tgts[-1] = -1;
+
+ for (n = 1; n <= rm->rm_firstdatacol - total_errors; n++) {
+ /*
+ * Initialize the targets array by finding the first n columns
+ * that contain no error.
+ *
+ * If there were no data errors, we need to ensure that we're
+ * always explicitly attempting to reconstruct at least one
+ * data column. To do this, we simply push the highest target
+ * up into the data columns.
+ */
+ for (c = 0, i = 0; i < n; i++) {
+ if (i == n - 1 && data_errors == 0 &&
+ c < rm->rm_firstdatacol) {
+ c = rm->rm_firstdatacol;
+ }
+
+ while (rm->rm_col[c].rc_error != 0) {
+ c++;
+ ASSERT3S(c, <, rm->rm_cols);
+ }
+
+ tgts[i] = c++;
+ }
+
+ /*
+ * Setting tgts[n] simplifies the other edge condition.
+ */
+ tgts[n] = rm->rm_cols;
+
+ /*
+ * These buffers were allocated in previous iterations.
+ */
+ for (i = 0; i < n - 1; i++) {
+ ASSERT(orig[i] != NULL);
+ }
+
+ orig[n - 1] = zio_buf_alloc(rm->rm_col[0].rc_size);
+
+ current = 0;
+ next = tgts[current];
+
+ while (current != n) {
+ tgts[current] = next;
+ current = 0;
+
+ /*
+ * Save off the original data that we're going to
+ * attempt to reconstruct.
+ */
+ for (i = 0; i < n; i++) {
+ ASSERT(orig[i] != NULL);
+ c = tgts[i];
+ ASSERT3S(c, >=, 0);
+ ASSERT3S(c, <, rm->rm_cols);
+ rc = &rm->rm_col[c];
+ abd_copy_to_buf(orig[i], rc->rc_abd,
+ rc->rc_size);
+ }
+
+ /*
+ * Attempt a reconstruction and exit the outer loop on
+ * success.
+ */
+ code = vdev_raidz_reconstruct(rm, tgts, n);
+ if (raidz_checksum_verify(zio) == 0) {
+ atomic_inc_64(&raidz_corrected[code]);
+
+ for (i = 0; i < n; i++) {
+ c = tgts[i];
+ rc = &rm->rm_col[c];
+ ASSERT(rc->rc_error == 0);
+ if (rc->rc_tried)
+ raidz_checksum_error(zio, rc,
+ orig[i]);
+ rc->rc_error = SET_ERROR(ECKSUM);
+ }
+
+ ret = code;
+ goto done;
+ }
+
+ /*
+ * Restore the original data.
+ */
+ for (i = 0; i < n; i++) {
+ c = tgts[i];
+ rc = &rm->rm_col[c];
+ abd_copy_from_buf(rc->rc_abd, orig[i],
+ rc->rc_size);
+ }
+
+ do {
+ /*
+ * Find the next valid column after the current
+				 * position.
+ */
+ for (next = tgts[current] + 1;
+ next < rm->rm_cols &&
+ rm->rm_col[next].rc_error != 0; next++)
+ continue;
+
+ ASSERT(next <= tgts[current + 1]);
+
+ /*
+ * If that spot is available, we're done here.
+ */
+ if (next != tgts[current + 1])
+ break;
+
+ /*
+ * Otherwise, find the next valid column after
+ * the previous position.
+ */
+ for (c = tgts[current - 1] + 1;
+ rm->rm_col[c].rc_error != 0; c++)
+ continue;
+
+ tgts[current] = c;
+ current++;
+
+ } while (current != n);
+ }
+ }
+ n--;
+done:
+ for (i = 0; i < n; i++) {
+ zio_buf_free(orig[i], rm->rm_col[0].rc_size);
+ }
+
+ return (ret);
+}
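+
+/*
+ * The do/while loop above advances tgts[] like an odometer with the
+ * lowest position moving fastest, skipping columns that already have
+ * errors. Ignoring that skip, a stand-alone sketch of the step
+ * (hypothetical code, using the same tgts[n] = ncols sentinel) is:
+ *
+ *	int i = 0;
+ *	while (i < n && tgts[i] + 1 == tgts[i + 1])
+ *		i++;
+ *	if (i == n)
+ *		break;
+ *	tgts[i]++;
+ *	for (int j = 0; j < i; j++)
+ *		tgts[j] = j;
+ *
+ * (the break fires once every combination has been visited, and the final
+ * loop resets the lower positions). For n = 2 over four columns this
+ * visits (0,1), (0,2), (1,2), (0,3), (1,3), (2,3); the real loop differs
+ * only in that it skips, and resets past, columns that already have
+ * errors.
+ */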
+
+/*
+ * Complete an IO operation on a RAIDZ VDev
+ *
+ * Outline:
+ * - For write operations:
+ * 1. Check for errors on the child IOs.
+ * 2. Return, setting an error code if too few child VDevs were written
+ * to reconstruct the data later. Note that partial writes are
+ * considered successful if they can be reconstructed at all.
+ * - For read operations:
+ * 1. Check for errors on the child IOs.
+ * 2. If data errors occurred:
+ * a. Try to reassemble the data from the parity available.
+ * b. If we haven't yet read the parity drives, read them now.
+ * c. If all parity drives have been read but the data still doesn't
+ * reassemble with a correct checksum, then try combinatorial
+ * reconstruction.
+ * d. If that doesn't work, return an error.
+ * 3. If there were unexpected errors or this is a resilver operation,
+ * rewrite the vdevs that had errors.
+ */
+static void
+vdev_raidz_io_done(zio_t *zio)
+{
+ vdev_t *vd = zio->io_vd;
+ vdev_t *cvd;
+ raidz_map_t *rm = zio->io_vsd;
+ raidz_col_t *rc;
+ int unexpected_errors = 0;
+ int parity_errors = 0;
+ int parity_untried = 0;
+ int data_errors = 0;
+ int total_errors = 0;
+ int n, c;
+ int tgts[VDEV_RAIDZ_MAXPARITY];
+ int code;
+
+ ASSERT(zio->io_bp != NULL); /* XXX need to add code to enforce this */
+
+ ASSERT(rm->rm_missingparity <= rm->rm_firstdatacol);
+ ASSERT(rm->rm_missingdata <= rm->rm_cols - rm->rm_firstdatacol);
+
+ for (c = 0; c < rm->rm_cols; c++) {
+ rc = &rm->rm_col[c];
+
+ if (rc->rc_error) {
+ ASSERT(rc->rc_error != ECKSUM); /* child has no bp */
+
+ if (c < rm->rm_firstdatacol)
+ parity_errors++;
+ else
+ data_errors++;
+
+ if (!rc->rc_skipped)
+ unexpected_errors++;
+
+ total_errors++;
+ } else if (c < rm->rm_firstdatacol && !rc->rc_tried) {
+ parity_untried++;
+ }
+ }
+
+ if (zio->io_type == ZIO_TYPE_WRITE) {
+ /*
+ * XXX -- for now, treat partial writes as a success.
+ * (If we couldn't write enough columns to reconstruct
+ * the data, the I/O failed. Otherwise, good enough.)
+ *
+ * Now that we support write reallocation, it would be better
+ * to treat partial failure as real failure unless there are
+ * no non-degraded top-level vdevs left, and not update DTLs
+ * if we intend to reallocate.
+ */
+ /* XXPOLICY */
+ if (total_errors > rm->rm_firstdatacol)
+ zio->io_error = vdev_raidz_worst_error(rm);
+
+ return;
+ } else if (zio->io_type == ZIO_TYPE_FREE) {
+ return;
+ }
+
+ ASSERT(zio->io_type == ZIO_TYPE_READ);
+ /*
+ * There are three potential phases for a read:
+ * 1. produce valid data from the columns read
+ * 2. read all disks and try again
+ * 3. perform combinatorial reconstruction
+ *
+ * Each phase is progressively both more expensive and less likely to
+ * occur. If we encounter more errors than we can repair or all phases
+ * fail, we have no choice but to return an error.
+ */
+
+ /*
+ * If the number of errors we saw was correctable -- less than or equal
+ * to the number of parity disks read -- attempt to produce data that
+ * has a valid checksum. Naturally, this case applies in the absence of
+ * any errors.
+ */
+ if (total_errors <= rm->rm_firstdatacol - parity_untried) {
+ if (data_errors == 0) {
+ if (raidz_checksum_verify(zio) == 0) {
+ /*
+ * If we read parity information (unnecessarily
+ * as it happens since no reconstruction was
+ * needed) regenerate and verify the parity.
+ * We also regenerate parity when resilvering
+ * so we can write it out to the failed device
+ * later.
+ */
+ if (parity_errors + parity_untried <
+ rm->rm_firstdatacol ||
+ (zio->io_flags & ZIO_FLAG_RESILVER)) {
+ n = raidz_parity_verify(zio, rm);
+ unexpected_errors += n;
+ ASSERT(parity_errors + n <=
+ rm->rm_firstdatacol);
+ }
+ goto done;
+ }
+ } else {
+ /*
+ * We either attempt to read all the parity columns or
+ * none of them. If we didn't try to read parity, we
+ * wouldn't be here in the correctable case. There must
+ * also have been fewer parity errors than parity
+ * columns or, again, we wouldn't be in this code path.
+ */
+ ASSERT(parity_untried == 0);
+ ASSERT(parity_errors < rm->rm_firstdatacol);
+
+ /*
+ * Identify the data columns that reported an error.
+ */
+ n = 0;
+ for (c = rm->rm_firstdatacol; c < rm->rm_cols; c++) {
+ rc = &rm->rm_col[c];
+ if (rc->rc_error != 0) {
+ ASSERT(n < VDEV_RAIDZ_MAXPARITY);
+ tgts[n++] = c;
+ }
+ }
+
+ ASSERT(rm->rm_firstdatacol >= n);
+
+ code = vdev_raidz_reconstruct(rm, tgts, n);
+
+ if (raidz_checksum_verify(zio) == 0) {
+ atomic_inc_64(&raidz_corrected[code]);
+
+ /*
+ * If we read more parity disks than were used
+ * for reconstruction, confirm that the other
+ * parity disks produced correct data. This
+ * routine is suboptimal in that it regenerates
+ * the parity that we already used in addition
+ * to the parity that we're attempting to
+ * verify, but this should be a relatively
+ * uncommon case, and can be optimized if it
+ * becomes a problem. Note that we regenerate
+ * parity when resilvering so we can write it
+ * out to failed devices later.
+ */
+ if (parity_errors < rm->rm_firstdatacol - n ||
+ (zio->io_flags & ZIO_FLAG_RESILVER)) {
+ n = raidz_parity_verify(zio, rm);
+ unexpected_errors += n;
+ ASSERT(parity_errors + n <=
+ rm->rm_firstdatacol);
+ }
+
+ goto done;
+ }
+ }
+ }
+
+ /*
+ * This isn't a typical situation -- either we got a read error or
+ * a child silently returned bad data. Read every block so we can
+ * try again with as much data and parity as we can track down. If
+ * we've already been through once before, all children will be marked
+ * as tried so we'll proceed to combinatorial reconstruction.
+ */
+ unexpected_errors = 1;
+ rm->rm_missingdata = 0;
+ rm->rm_missingparity = 0;
+
+ for (c = 0; c < rm->rm_cols; c++) {
+ if (rm->rm_col[c].rc_tried)
+ continue;
+
+ zio_vdev_io_redone(zio);
+ do {
+ rc = &rm->rm_col[c];
+ if (rc->rc_tried)
+ continue;
+ zio_nowait(zio_vdev_child_io(zio, NULL,
+ vd->vdev_child[rc->rc_devidx],
+ rc->rc_offset, rc->rc_abd, rc->rc_size,
+ zio->io_type, zio->io_priority, 0,
+ vdev_raidz_child_done, rc));
+ } while (++c < rm->rm_cols);
+
+ return;
+ }
+
+ /*
+ * At this point we've attempted to reconstruct the data given the
+ * errors we detected, and we've attempted to read all columns. There
+ * must, therefore, be one or more additional problems -- silent errors
+ * resulting in invalid data rather than explicit I/O errors resulting
+ * in absent data. We check if there is enough additional data to
+ * possibly reconstruct the data and then perform combinatorial
+ * reconstruction over all possible combinations. If that fails,
+ * we're cooked.
+ */
+ if (total_errors > rm->rm_firstdatacol) {
+ zio->io_error = vdev_raidz_worst_error(rm);
+
+ } else if (total_errors < rm->rm_firstdatacol &&
+ (code = vdev_raidz_combrec(zio, total_errors, data_errors)) != 0) {
+ /*
+ * If we didn't use all the available parity for the
+ * combinatorial reconstruction, verify that the remaining
+ * parity is correct.
+ */
+ if (code != (1 << rm->rm_firstdatacol) - 1)
+ (void) raidz_parity_verify(zio, rm);
+ } else {
+ /*
+ * We're here because either:
+ *
+ * total_errors == rm_firstdatacol, or
+ * vdev_raidz_combrec() failed
+ *
+ * In either case, there is enough bad data to prevent
+ * reconstruction.
+ *
+ * Start checksum ereports for all children which haven't
+ * failed, and the IO wasn't speculative.
+ */
+ zio->io_error = SET_ERROR(ECKSUM);
+
+ if (!(zio->io_flags & ZIO_FLAG_SPECULATIVE)) {
+ for (c = 0; c < rm->rm_cols; c++) {
+ rc = &rm->rm_col[c];
+ if (rc->rc_error == 0) {
+ zio_bad_cksum_t zbc;
+ zbc.zbc_has_cksum = 0;
+ zbc.zbc_injected =
+ rm->rm_ecksuminjected;
+
+ zfs_ereport_start_checksum(
+ zio->io_spa,
+ vd->vdev_child[rc->rc_devidx],
+ zio, rc->rc_offset, rc->rc_size,
+ (void *)(uintptr_t)c, &zbc);
+ }
+ }
+ }
+ }
+
+done:
+ zio_checksum_verified(zio);
+
+ if (zio->io_error == 0 && spa_writeable(zio->io_spa) &&
+ (unexpected_errors || (zio->io_flags & ZIO_FLAG_RESILVER))) {
+ /*
+ * Use the good data we have in hand to repair damaged children.
+ */
+ for (c = 0; c < rm->rm_cols; c++) {
+ rc = &rm->rm_col[c];
+ cvd = vd->vdev_child[rc->rc_devidx];
+
+ if (rc->rc_error == 0)
+ continue;
+
+ zio_nowait(zio_vdev_child_io(zio, NULL, cvd,
+ rc->rc_offset, rc->rc_abd, rc->rc_size,
+ ZIO_TYPE_WRITE, ZIO_PRIORITY_ASYNC_WRITE,
+ ZIO_FLAG_IO_REPAIR | (unexpected_errors ?
+ ZIO_FLAG_SELF_HEAL : 0), NULL, NULL));
+ }
+ }
+}
+
+static void
+vdev_raidz_state_change(vdev_t *vd, int faulted, int degraded)
+{
+ if (faulted > vd->vdev_nparity)
+ vdev_set_state(vd, B_FALSE, VDEV_STATE_CANT_OPEN,
+ VDEV_AUX_NO_REPLICAS);
+ else if (degraded + faulted != 0)
+ vdev_set_state(vd, B_FALSE, VDEV_STATE_DEGRADED, VDEV_AUX_NONE);
+ else
+ vdev_set_state(vd, B_FALSE, VDEV_STATE_HEALTHY, VDEV_AUX_NONE);
+}
+
+/*
+ * Determine if any portion of the provided block resides on a child vdev
+ * with a dirty DTL and therefore needs to be resilvered. The function
+ * assumes that at least one DTL is dirty, which implies that
+ * full-stripe-width blocks must be resilvered.
+ */
+static boolean_t
+vdev_raidz_need_resilver(vdev_t *vd, uint64_t offset, size_t psize)
+{
+ uint64_t dcols = vd->vdev_children;
+ uint64_t nparity = vd->vdev_nparity;
+ uint64_t ashift = vd->vdev_top->vdev_ashift;
+ /* The starting RAIDZ (parent) vdev sector of the block. */
+ uint64_t b = offset >> ashift;
+ /* The zio's size in units of the vdev's minimum sector size. */
+ uint64_t s = ((psize - 1) >> ashift) + 1;
+ /* The first column for this stripe. */
+ uint64_t f = b % dcols;
+
+ if (s + nparity >= dcols)
+ return (B_TRUE);
+
+ for (uint64_t c = 0; c < s + nparity; c++) {
+ uint64_t devidx = (f + c) % dcols;
+ vdev_t *cvd = vd->vdev_child[devidx];
+
+ /*
+ * dsl_scan_need_resilver() already checked vd with
+ * vdev_dtl_contains(). So here just check cvd with
+ * vdev_dtl_empty(), cheaper and a good approximation.
+ */
+ if (!vdev_dtl_empty(cvd, DTL_PARTIAL))
+ return (B_TRUE);
+ }
+
+ return (B_FALSE);
+}
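+
+/*
+ * For example (illustrative): with dcols = 5, nparity = 1 and ashift = 9,
+ * a 1 KB block at offset 0x600 gives b = 3, f = 3 and s = 2, so the
+ * s + nparity = 3 columns checked are children (3 + c) % 5 for c = 0..2,
+ * i.e. children 3, 4 and 0; the block needs resilvering if any of those
+ * has a non-empty partial DTL.
+ */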
+
+static void
+vdev_raidz_xlate(vdev_t *cvd, const range_seg_t *in, range_seg_t *res)
+{
+ vdev_t *raidvd = cvd->vdev_parent;
+ ASSERT(raidvd->vdev_ops == &vdev_raidz_ops);
+
+ uint64_t width = raidvd->vdev_children;
+ uint64_t tgt_col = cvd->vdev_id;
+ uint64_t ashift = raidvd->vdev_top->vdev_ashift;
+
+ /* make sure the offsets are block-aligned */
+ ASSERT0(in->rs_start % (1 << ashift));
+ ASSERT0(in->rs_end % (1 << ashift));
+ uint64_t b_start = in->rs_start >> ashift;
+ uint64_t b_end = in->rs_end >> ashift;
+
+ uint64_t start_row = 0;
+ if (b_start > tgt_col) /* avoid underflow */
+ start_row = ((b_start - tgt_col - 1) / width) + 1;
+
+ uint64_t end_row = 0;
+ if (b_end > tgt_col)
+ end_row = ((b_end - tgt_col - 1) / width) + 1;
+
+ res->rs_start = start_row << ashift;
+ res->rs_end = end_row << ashift;
+
+ ASSERT3U(res->rs_start, <=, in->rs_start);
+ ASSERT3U(res->rs_end - res->rs_start, <=, in->rs_end - in->rs_start);
+}
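+
+/*
+ * Worked example (illustrative): with width = 5, ashift = 9 and
+ * tgt_col = 2, the logical range [0, 5120) spans block rows 0 and 1, and
+ * logical sectors 2 and 7 land on this child; b_start = 0 gives
+ * start_row = 0, b_end = 10 gives end_row = ((10 - 2 - 1) / 5) + 1 = 2,
+ * so the result is the physical range [0, 1024).
+ */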
+
+vdev_ops_t vdev_raidz_ops = {
+ vdev_raidz_open,
+ vdev_raidz_close,
+ vdev_raidz_asize,
+ vdev_raidz_io_start,
+ vdev_raidz_io_done,
+ vdev_raidz_state_change,
+ vdev_raidz_need_resilver,
+	NULL,			/* vdev_op_hold */
+	NULL,			/* vdev_op_rele */
+	NULL,			/* vdev_op_remap */
+ vdev_raidz_xlate,
+ VDEV_TYPE_RAIDZ, /* name of this vdev type */
+ B_FALSE /* not a leaf vdev */
+};
diff --git a/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/vdev_removal.c b/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/vdev_removal.c
new file mode 100644
index 000000000000..ab51c8c79055
--- /dev/null
+++ b/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/vdev_removal.c
@@ -0,0 +1,2156 @@
+/*
+ * CDDL HEADER START
+ *
+ * The contents of this file are subject to the terms of the
+ * Common Development and Distribution License (the "License").
+ * You may not use this file except in compliance with the License.
+ *
+ * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
+ * or http://www.opensolaris.org/os/licensing.
+ * See the License for the specific language governing permissions
+ * and limitations under the License.
+ *
+ * When distributing Covered Code, include this CDDL HEADER in each
+ * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
+ * If applicable, add the following below this CDDL HEADER, with the
+ * fields enclosed by brackets "[]" replaced with your own identifying
+ * information: Portions Copyright [yyyy] [name of copyright owner]
+ *
+ * CDDL HEADER END
+ */
+
+/*
+ * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved.
+ * Copyright (c) 2011, 2018 by Delphix. All rights reserved.
+ */
+
+#include <sys/zfs_context.h>
+#include <sys/spa_impl.h>
+#include <sys/dmu.h>
+#include <sys/dmu_tx.h>
+#include <sys/zap.h>
+#include <sys/vdev_impl.h>
+#include <sys/metaslab.h>
+#include <sys/metaslab_impl.h>
+#include <sys/uberblock_impl.h>
+#include <sys/txg.h>
+#include <sys/avl.h>
+#include <sys/bpobj.h>
+#include <sys/dsl_pool.h>
+#include <sys/dsl_synctask.h>
+#include <sys/dsl_dir.h>
+#include <sys/arc.h>
+#include <sys/zfeature.h>
+#include <sys/vdev_indirect_births.h>
+#include <sys/vdev_indirect_mapping.h>
+#include <sys/abd.h>
+#include <sys/vdev_initialize.h>
+
+/*
+ * This file contains the necessary logic to remove vdevs from a
+ * storage pool. Currently, the only devices that can be removed
+ * are log, cache, and spare devices; and top level vdevs from a pool
+ * without raidz. (Note that members of a mirror can also be removed
+ * by the detach operation.)
+ *
+ * Log vdevs are removed by evacuating them and then turning the vdev
+ * into a hole vdev while holding spa config locks.
+ *
+ * Top level vdevs are removed and converted into an indirect vdev via
+ * a multi-step process:
+ *
+ * - Disable allocations from this device (spa_vdev_remove_top).
+ *
+ * - From a new thread (spa_vdev_remove_thread), copy data from
+ * the removing vdev to a different vdev. The copy happens in open
+ * context (spa_vdev_copy_impl) and issues a sync task
+ * (vdev_mapping_sync) so the sync thread can update the partial
+ * indirect mappings in core and on disk.
+ *
+ * - If a free happens during a removal, it is freed from the
+ * removing vdev, and if it has already been copied, from the new
+ * location as well (free_from_removing_vdev).
+ *
+ * - After the removal is completed, the copy thread converts the vdev
+ * into an indirect vdev (vdev_remove_complete) before instructing
+ * the sync thread to destroy the space maps and finish the removal
+ * (spa_finish_removal).
+ */
+
+typedef struct vdev_copy_arg {
+ metaslab_t *vca_msp;
+ uint64_t vca_outstanding_bytes;
+ kcondvar_t vca_cv;
+ kmutex_t vca_lock;
+} vdev_copy_arg_t;
+
+/*
+ * The maximum amount of memory we can use for outstanding i/o while
+ * doing a device removal. This determines how much i/o we can have
+ * in flight concurrently.
+ */
+int zfs_remove_max_copy_bytes = 64 * 1024 * 1024;
+
+/*
+ * The largest contiguous segment that we will attempt to allocate when
+ * removing a device. This can be no larger than SPA_MAXBLOCKSIZE. If
+ * there is a performance problem with attempting to allocate large blocks,
+ * consider decreasing this.
+ *
+ * Note: we will issue I/Os of up to this size. The mpt driver does not
+ * respond well to I/Os larger than 1MB, so we set this to 1MB. (When
+ * mpt processes an I/O larger than 1MB, it needs to do an allocation of
+ * 2 physically contiguous pages; if this allocation fails, mpt will drop
+ * the I/O and hang the device.)
+ */
+int zfs_remove_max_segment = 1024 * 1024;
+
+/*
+ * Allow a remap segment to span free chunks of at most this size. The main
+ * impact of a larger span is that we will read and write larger, more
+ * contiguous chunks, with more "unnecessary" data -- trading off bandwidth
+ * for iops. The value here was chosen to align with
+ * zfs_vdev_read_gap_limit, which is a similar concept when doing regular
+ * reads (but there's no reason it has to be the same).
+ *
+ * Additionally, a higher span will have the following relatively minor
+ * effects:
+ * - the mapping will be smaller, since one entry can cover more allocated
+ * segments
+ * - more of the fragmentation in the removing device will be preserved
+ * - we'll do larger allocations, which may fail and fall back on smaller
+ * allocations
+ */
+int vdev_removal_max_span = 32 * 1024;
+
+/*
+ * This is used by the test suite so that it can ensure that certain
+ * actions happen while in the middle of a removal.
+ */
+uint64_t zfs_remove_max_bytes_pause = UINT64_MAX;
+
+#define VDEV_REMOVAL_ZAP_OBJS "lzap"
+
+static void spa_vdev_remove_thread(void *arg);
+
+static void
+spa_sync_removing_state(spa_t *spa, dmu_tx_t *tx)
+{
+ VERIFY0(zap_update(spa->spa_dsl_pool->dp_meta_objset,
+ DMU_POOL_DIRECTORY_OBJECT,
+ DMU_POOL_REMOVING, sizeof (uint64_t),
+ sizeof (spa->spa_removing_phys) / sizeof (uint64_t),
+ &spa->spa_removing_phys, tx));
+}
+
+static nvlist_t *
+spa_nvlist_lookup_by_guid(nvlist_t **nvpp, int count, uint64_t target_guid)
+{
+ for (int i = 0; i < count; i++) {
+ uint64_t guid =
+ fnvlist_lookup_uint64(nvpp[i], ZPOOL_CONFIG_GUID);
+
+ if (guid == target_guid)
+ return (nvpp[i]);
+ }
+
+ return (NULL);
+}
+
+static void
+spa_vdev_remove_aux(nvlist_t *config, char *name, nvlist_t **dev, int count,
+ nvlist_t *dev_to_remove)
+{
+ nvlist_t **newdev = NULL;
+
+ if (count > 1)
+ newdev = kmem_alloc((count - 1) * sizeof (void *), KM_SLEEP);
+
+ for (int i = 0, j = 0; i < count; i++) {
+ if (dev[i] == dev_to_remove)
+ continue;
+ VERIFY(nvlist_dup(dev[i], &newdev[j++], KM_SLEEP) == 0);
+ }
+
+ VERIFY(nvlist_remove(config, name, DATA_TYPE_NVLIST_ARRAY) == 0);
+ VERIFY(nvlist_add_nvlist_array(config, name, newdev, count - 1) == 0);
+
+ for (int i = 0; i < count - 1; i++)
+ nvlist_free(newdev[i]);
+
+ if (count > 1)
+ kmem_free(newdev, (count - 1) * sizeof (void *));
+}
+
+static spa_vdev_removal_t *
+spa_vdev_removal_create(vdev_t *vd)
+{
+ spa_vdev_removal_t *svr = kmem_zalloc(sizeof (*svr), KM_SLEEP);
+ mutex_init(&svr->svr_lock, NULL, MUTEX_DEFAULT, NULL);
+ cv_init(&svr->svr_cv, NULL, CV_DEFAULT, NULL);
+ svr->svr_allocd_segs = range_tree_create(NULL, NULL);
+ svr->svr_vdev_id = vd->vdev_id;
+
+ for (int i = 0; i < TXG_SIZE; i++) {
+ svr->svr_frees[i] = range_tree_create(NULL, NULL);
+ list_create(&svr->svr_new_segments[i],
+ sizeof (vdev_indirect_mapping_entry_t),
+ offsetof(vdev_indirect_mapping_entry_t, vime_node));
+ }
+
+ return (svr);
+}
+
+void
+spa_vdev_removal_destroy(spa_vdev_removal_t *svr)
+{
+ for (int i = 0; i < TXG_SIZE; i++) {
+ ASSERT0(svr->svr_bytes_done[i]);
+ ASSERT0(svr->svr_max_offset_to_sync[i]);
+ range_tree_destroy(svr->svr_frees[i]);
+ list_destroy(&svr->svr_new_segments[i]);
+ }
+
+ range_tree_destroy(svr->svr_allocd_segs);
+ mutex_destroy(&svr->svr_lock);
+ cv_destroy(&svr->svr_cv);
+ kmem_free(svr, sizeof (*svr));
+}
+
+/*
+ * This is called as a synctask in the txg in which we will mark this vdev
+ * as removing (in the config stored in the MOS).
+ *
+ * It begins the evacuation of a toplevel vdev by:
+ * - initializing the spa_removing_phys which tracks this removal
+ * - computing the amount of space to remove for accounting purposes
+ * - dirtying all dbufs in the spa_config_object
+ * - creating the spa_vdev_removal
+ * - starting the spa_vdev_remove_thread
+ */
+static void
+vdev_remove_initiate_sync(void *arg, dmu_tx_t *tx)
+{
+ int vdev_id = (uintptr_t)arg;
+ spa_t *spa = dmu_tx_pool(tx)->dp_spa;
+ vdev_t *vd = vdev_lookup_top(spa, vdev_id);
+ vdev_indirect_config_t *vic = &vd->vdev_indirect_config;
+ objset_t *mos = spa->spa_dsl_pool->dp_meta_objset;
+ spa_vdev_removal_t *svr = NULL;
+ uint64_t txg = dmu_tx_get_txg(tx);
+
+ ASSERT3P(vd->vdev_ops, !=, &vdev_raidz_ops);
+ svr = spa_vdev_removal_create(vd);
+
+ ASSERT(vd->vdev_removing);
+ ASSERT3P(vd->vdev_indirect_mapping, ==, NULL);
+
+ spa_feature_incr(spa, SPA_FEATURE_DEVICE_REMOVAL, tx);
+ if (spa_feature_is_enabled(spa, SPA_FEATURE_OBSOLETE_COUNTS)) {
+ /*
+ * By activating the OBSOLETE_COUNTS feature, we prevent
+ * the pool from being downgraded and ensure that the
+ * refcounts are precise.
+ */
+ spa_feature_incr(spa, SPA_FEATURE_OBSOLETE_COUNTS, tx);
+ uint64_t one = 1;
+ VERIFY0(zap_add(spa->spa_meta_objset, vd->vdev_top_zap,
+ VDEV_TOP_ZAP_OBSOLETE_COUNTS_ARE_PRECISE, sizeof (one), 1,
+ &one, tx));
+ ASSERT3U(vdev_obsolete_counts_are_precise(vd), !=, 0);
+ }
+
+ vic->vic_mapping_object = vdev_indirect_mapping_alloc(mos, tx);
+ vd->vdev_indirect_mapping =
+ vdev_indirect_mapping_open(mos, vic->vic_mapping_object);
+ vic->vic_births_object = vdev_indirect_births_alloc(mos, tx);
+ vd->vdev_indirect_births =
+ vdev_indirect_births_open(mos, vic->vic_births_object);
+ spa->spa_removing_phys.sr_removing_vdev = vd->vdev_id;
+ spa->spa_removing_phys.sr_start_time = gethrestime_sec();
+ spa->spa_removing_phys.sr_end_time = 0;
+ spa->spa_removing_phys.sr_state = DSS_SCANNING;
+ spa->spa_removing_phys.sr_to_copy = 0;
+ spa->spa_removing_phys.sr_copied = 0;
+
+ /*
+ * Note: We can't use vdev_stat's vs_alloc for sr_to_copy, because
+ * there may be space in the defer tree, which is free, but still
+ * counted in vs_alloc.
+ */
+ for (uint64_t i = 0; i < vd->vdev_ms_count; i++) {
+ metaslab_t *ms = vd->vdev_ms[i];
+ if (ms->ms_sm == NULL)
+ continue;
+
+ spa->spa_removing_phys.sr_to_copy +=
+ metaslab_allocated_space(ms);
+
+ /*
+ * Space which we are freeing this txg does not need to
+ * be copied.
+ */
+ spa->spa_removing_phys.sr_to_copy -=
+ range_tree_space(ms->ms_freeing);
+
+ ASSERT0(range_tree_space(ms->ms_freed));
+ for (int t = 0; t < TXG_SIZE; t++)
+ ASSERT0(range_tree_space(ms->ms_allocating[t]));
+ }
+
+ /*
+ * Sync tasks are called before metaslab_sync(), so there should
+ * be no already-synced metaslabs in the TXG_CLEAN list.
+ */
+ ASSERT3P(txg_list_head(&vd->vdev_ms_list, TXG_CLEAN(txg)), ==, NULL);
+
+ spa_sync_removing_state(spa, tx);
+
+ /*
+ * All blocks that we need to read the most recent mapping must be
+ * stored on concrete vdevs. Therefore, we must dirty anything that
+ * is read before spa_remove_init(). Specifically, the
+ * spa_config_object. (Note that although we already modified the
+ * spa_config_object in spa_sync_removing_state, that may not have
+ * modified all blocks of the object.)
+ */
+ dmu_object_info_t doi;
+ VERIFY0(dmu_object_info(mos, DMU_POOL_DIRECTORY_OBJECT, &doi));
+ for (uint64_t offset = 0; offset < doi.doi_max_offset; ) {
+ dmu_buf_t *dbuf;
+ VERIFY0(dmu_buf_hold(mos, DMU_POOL_DIRECTORY_OBJECT,
+ offset, FTAG, &dbuf, 0));
+ dmu_buf_will_dirty(dbuf, tx);
+ offset += dbuf->db_size;
+ dmu_buf_rele(dbuf, FTAG);
+ }
+
+ /*
+ * Now that we've allocated the im_object, dirty the vdev to ensure
+ * that the object gets written to the config on disk.
+ */
+ vdev_config_dirty(vd);
+
+ zfs_dbgmsg("starting removal thread for vdev %llu (%p) in txg %llu "
+ "im_obj=%llu", vd->vdev_id, vd, dmu_tx_get_txg(tx),
+ vic->vic_mapping_object);
+
+ spa_history_log_internal(spa, "vdev remove started", tx,
+ "%s vdev %llu %s", spa_name(spa), vd->vdev_id,
+ (vd->vdev_path != NULL) ? vd->vdev_path : "-");
+ /*
+ * Setting spa_vdev_removal causes subsequent frees to call
+ * free_from_removing_vdev(). Note that we don't need any locking
+ * because we are the sync thread, and metaslab_free_impl() is only
+ * called from syncing context (potentially from a zio taskq thread,
+ * but in any case only when there are outstanding free i/os, which
+ * there are not).
+ */
+ ASSERT3P(spa->spa_vdev_removal, ==, NULL);
+ spa->spa_vdev_removal = svr;
+ svr->svr_thread = thread_create(NULL, 0,
+ spa_vdev_remove_thread, spa, 0, &p0, TS_RUN, minclsyspri);
+}
+
+/*
+ * When we are opening a pool, we must read the mapping for each
+ * indirect vdev in order from most recently removed to least
+ * recently removed. We do this because the blocks for the mapping
+ * of older indirect vdevs may be stored on more recently removed vdevs.
+ * In order to read each indirect mapping object, we must have
+ * initialized all more recently removed vdevs.
+ */
+int
+spa_remove_init(spa_t *spa)
+{
+ int error;
+
+ error = zap_lookup(spa->spa_dsl_pool->dp_meta_objset,
+ DMU_POOL_DIRECTORY_OBJECT,
+ DMU_POOL_REMOVING, sizeof (uint64_t),
+ sizeof (spa->spa_removing_phys) / sizeof (uint64_t),
+ &spa->spa_removing_phys);
+
+ if (error == ENOENT) {
+ spa->spa_removing_phys.sr_state = DSS_NONE;
+ spa->spa_removing_phys.sr_removing_vdev = -1;
+ spa->spa_removing_phys.sr_prev_indirect_vdev = -1;
+ spa->spa_indirect_vdevs_loaded = B_TRUE;
+ return (0);
+ } else if (error != 0) {
+ return (error);
+ }
+
+ if (spa->spa_removing_phys.sr_state == DSS_SCANNING) {
+ /*
+ * We are currently removing a vdev. Create and
+ * initialize a spa_vdev_removal_t from the bonus
+		 * buffer of the removing vdev's vdev_im_object, and
+ * initialize its partial mapping.
+ */
+ spa_config_enter(spa, SCL_STATE, FTAG, RW_READER);
+ vdev_t *vd = vdev_lookup_top(spa,
+ spa->spa_removing_phys.sr_removing_vdev);
+
+ if (vd == NULL) {
+ spa_config_exit(spa, SCL_STATE, FTAG);
+ return (EINVAL);
+ }
+
+ vdev_indirect_config_t *vic = &vd->vdev_indirect_config;
+
+ ASSERT(vdev_is_concrete(vd));
+ spa_vdev_removal_t *svr = spa_vdev_removal_create(vd);
+ ASSERT3U(svr->svr_vdev_id, ==, vd->vdev_id);
+ ASSERT(vd->vdev_removing);
+
+ vd->vdev_indirect_mapping = vdev_indirect_mapping_open(
+ spa->spa_meta_objset, vic->vic_mapping_object);
+ vd->vdev_indirect_births = vdev_indirect_births_open(
+ spa->spa_meta_objset, vic->vic_births_object);
+ spa_config_exit(spa, SCL_STATE, FTAG);
+
+ spa->spa_vdev_removal = svr;
+ }
+
+ spa_config_enter(spa, SCL_STATE, FTAG, RW_READER);
+ uint64_t indirect_vdev_id =
+ spa->spa_removing_phys.sr_prev_indirect_vdev;
+ while (indirect_vdev_id != UINT64_MAX) {
+ vdev_t *vd = vdev_lookup_top(spa, indirect_vdev_id);
+ vdev_indirect_config_t *vic = &vd->vdev_indirect_config;
+
+ ASSERT3P(vd->vdev_ops, ==, &vdev_indirect_ops);
+ vd->vdev_indirect_mapping = vdev_indirect_mapping_open(
+ spa->spa_meta_objset, vic->vic_mapping_object);
+ vd->vdev_indirect_births = vdev_indirect_births_open(
+ spa->spa_meta_objset, vic->vic_births_object);
+
+ indirect_vdev_id = vic->vic_prev_indirect_vdev;
+ }
+ spa_config_exit(spa, SCL_STATE, FTAG);
+
+ /*
+ * Now that we've loaded all the indirect mappings, we can allow
+ * reads from other blocks (e.g. via predictive prefetch).
+ */
+ spa->spa_indirect_vdevs_loaded = B_TRUE;
+ return (0);
+}
+
+void
+spa_restart_removal(spa_t *spa)
+{
+ spa_vdev_removal_t *svr = spa->spa_vdev_removal;
+
+ if (svr == NULL)
+ return;
+
+ /*
+ * In general when this function is called there is no
+ * removal thread running. The only scenario where this
+ * is not true is during spa_import() where this function
+ * is called twice [once from spa_import_impl() and
+ * spa_async_resume()]. Thus, in the scenario where we
+ * import a pool that has an ongoing removal we don't
+ * want to spawn a second thread.
+ */
+ if (svr->svr_thread != NULL)
+ return;
+
+ if (!spa_writeable(spa))
+ return;
+
+ zfs_dbgmsg("restarting removal of %llu", svr->svr_vdev_id);
+ svr->svr_thread = thread_create(NULL, 0, spa_vdev_remove_thread, spa,
+ 0, &p0, TS_RUN, minclsyspri);
+}
+
+/*
+ * Process freeing from a device which is in the middle of being removed.
+ * We must handle this carefully so that we attempt to copy freed data,
+ * and we correctly free already-copied data.
+ */
+void
+free_from_removing_vdev(vdev_t *vd, uint64_t offset, uint64_t size)
+{
+ spa_t *spa = vd->vdev_spa;
+ spa_vdev_removal_t *svr = spa->spa_vdev_removal;
+ vdev_indirect_mapping_t *vim = vd->vdev_indirect_mapping;
+ uint64_t txg = spa_syncing_txg(spa);
+ uint64_t max_offset_yet = 0;
+
+ ASSERT(vd->vdev_indirect_config.vic_mapping_object != 0);
+ ASSERT3U(vd->vdev_indirect_config.vic_mapping_object, ==,
+ vdev_indirect_mapping_object(vim));
+ ASSERT3U(vd->vdev_id, ==, svr->svr_vdev_id);
+
+ mutex_enter(&svr->svr_lock);
+
+ /*
+ * Remove the segment from the removing vdev's spacemap. This
+ * ensures that we will not attempt to copy this space (if the
+ * removal thread has not yet visited it), and also ensures
+ * that we know what is actually allocated on the new vdevs
+ * (needed if we cancel the removal).
+ *
+ * Note: we must do the metaslab_free_concrete() with the svr_lock
+ * held, so that the remove_thread can not load this metaslab and then
+ * visit this offset between the time that we metaslab_free_concrete()
+ * and when we check to see if it has been visited.
+ *
+ * Note: The checkpoint flag is set to false as having/taking
+ * a checkpoint and removing a device can't happen at the same
+ * time.
+ */
+ ASSERT(!spa_has_checkpoint(spa));
+ metaslab_free_concrete(vd, offset, size, B_FALSE);
+
+ uint64_t synced_size = 0;
+ uint64_t synced_offset = 0;
+ uint64_t max_offset_synced = vdev_indirect_mapping_max_offset(vim);
+ if (offset < max_offset_synced) {
+ /*
+ * The mapping for this offset is already on disk.
+ * Free from the new location.
+ *
+ * Note that we use svr_max_synced_offset because it is
+ * updated atomically with respect to the in-core mapping.
+ * By contrast, vim_max_offset is not.
+ *
+ * This block may be split between a synced entry and an
+ * in-flight or unvisited entry. Only process the synced
+ * portion of it here.
+ */
+ synced_size = MIN(size, max_offset_synced - offset);
+ synced_offset = offset;
+
+ ASSERT3U(max_offset_yet, <=, max_offset_synced);
+ max_offset_yet = max_offset_synced;
+
+ DTRACE_PROBE3(remove__free__synced,
+ spa_t *, spa,
+ uint64_t, offset,
+ uint64_t, synced_size);
+
+ size -= synced_size;
+ offset += synced_size;
+ }
+
+ /*
+ * Look at all in-flight txgs starting from the currently syncing one
+ * and see if a section of this free is being copied. By starting from
+ * this txg and iterating forward, we might find that this region
+ * was copied in two different txgs and handle it appropriately.
+ */
+ for (int i = 0; i < TXG_CONCURRENT_STATES; i++) {
+ int txgoff = (txg + i) & TXG_MASK;
+ if (size > 0 && offset < svr->svr_max_offset_to_sync[txgoff]) {
+ /*
+ * The mapping for this offset is in flight, and
+ * will be synced in txg+i.
+ */
+ uint64_t inflight_size = MIN(size,
+ svr->svr_max_offset_to_sync[txgoff] - offset);
+
+ DTRACE_PROBE4(remove__free__inflight,
+ spa_t *, spa,
+ uint64_t, offset,
+ uint64_t, inflight_size,
+ uint64_t, txg + i);
+
+ /*
+ * We copy data in order of increasing offset.
+ * Therefore the max_offset_to_sync[] must increase
+ * (or be zero, indicating that nothing is being
+ * copied in that txg).
+ */
+ if (svr->svr_max_offset_to_sync[txgoff] != 0) {
+ ASSERT3U(svr->svr_max_offset_to_sync[txgoff],
+ >=, max_offset_yet);
+ max_offset_yet =
+ svr->svr_max_offset_to_sync[txgoff];
+ }
+
+ /*
+ * We've already committed to copying this segment:
+ * we have allocated space elsewhere in the pool for
+ * it and have an IO outstanding to copy the data. We
+ * cannot free the space before the copy has
+ * completed, or else the copy IO might overwrite any
+ * new data. To free that space, we record the
+ * segment in the appropriate svr_frees tree and free
+ * the mapped space later, in the txg where we have
+ * completed the copy and synced the mapping (see
+ * vdev_mapping_sync).
+ */
+ range_tree_add(svr->svr_frees[txgoff],
+ offset, inflight_size);
+ size -= inflight_size;
+ offset += inflight_size;
+
+ /*
+ * This space is already accounted for as being
+ * done, because it is being copied in txg+i.
+ * However, if i!=0, then it is being copied in
+ * a future txg. If we crash after this txg
+ * syncs but before txg+i syncs, then the space
+ * will be free. Therefore we must account
+ * for the space being done in *this* txg
+ * (when it is freed) rather than the future txg
+ * (when it will be copied).
+ */
+ ASSERT3U(svr->svr_bytes_done[txgoff], >=,
+ inflight_size);
+ svr->svr_bytes_done[txgoff] -= inflight_size;
+ svr->svr_bytes_done[txg & TXG_MASK] += inflight_size;
+ }
+ }
+ ASSERT0(svr->svr_max_offset_to_sync[TXG_CLEAN(txg) & TXG_MASK]);
+
+ if (size > 0) {
+ /*
+ * The copy thread has not yet visited this offset. Ensure
+ * that it doesn't.
+ */
+
+ DTRACE_PROBE3(remove__free__unvisited,
+ spa_t *, spa,
+ uint64_t, offset,
+ uint64_t, size);
+
+ if (svr->svr_allocd_segs != NULL)
+ range_tree_clear(svr->svr_allocd_segs, offset, size);
+
+ /*
+ * Since we now do not need to copy this data, for
+ * accounting purposes we have done our job and can count
+ * it as completed.
+ */
+ svr->svr_bytes_done[txg & TXG_MASK] += size;
+ }
+ mutex_exit(&svr->svr_lock);
+
+ /*
+ * Now that we have dropped svr_lock, process the synced portion
+ * of this free.
+ */
+ if (synced_size > 0) {
+ vdev_indirect_mark_obsolete(vd, synced_offset, synced_size);
+
+ /*
+ * Note: this can only be called from syncing context,
+ * and the vdev_indirect_mapping is only changed from the
+ * sync thread, so we don't need svr_lock while doing
+ * metaslab_free_impl_cb.
+ */
+ boolean_t checkpoint = B_FALSE;
+ vdev_indirect_ops.vdev_op_remap(vd, synced_offset, synced_size,
+ metaslab_free_impl_cb, &checkpoint);
+ }
+}
+
+/*
+ * Stop an active removal and update the spa_removing phys.
+ */
+static void
+spa_finish_removal(spa_t *spa, dsl_scan_state_t state, dmu_tx_t *tx)
+{
+ spa_vdev_removal_t *svr = spa->spa_vdev_removal;
+ ASSERT3U(dmu_tx_get_txg(tx), ==, spa_syncing_txg(spa));
+
+ /* Ensure the removal thread has completed before we free the svr. */
+ spa_vdev_remove_suspend(spa);
+
+ ASSERT(state == DSS_FINISHED || state == DSS_CANCELED);
+
+ if (state == DSS_FINISHED) {
+ spa_removing_phys_t *srp = &spa->spa_removing_phys;
+ vdev_t *vd = vdev_lookup_top(spa, svr->svr_vdev_id);
+ vdev_indirect_config_t *vic = &vd->vdev_indirect_config;
+
+ if (srp->sr_prev_indirect_vdev != UINT64_MAX) {
+ vdev_t *pvd = vdev_lookup_top(spa,
+ srp->sr_prev_indirect_vdev);
+ ASSERT3P(pvd->vdev_ops, ==, &vdev_indirect_ops);
+ }
+
+ vic->vic_prev_indirect_vdev = srp->sr_prev_indirect_vdev;
+ srp->sr_prev_indirect_vdev = vd->vdev_id;
+ }
+ spa->spa_removing_phys.sr_state = state;
+ spa->spa_removing_phys.sr_end_time = gethrestime_sec();
+
+ spa->spa_vdev_removal = NULL;
+ spa_vdev_removal_destroy(svr);
+
+ spa_sync_removing_state(spa, tx);
+
+ vdev_config_dirty(spa->spa_root_vdev);
+}
+
+static void
+free_mapped_segment_cb(void *arg, uint64_t offset, uint64_t size)
+{
+ vdev_t *vd = arg;
+ vdev_indirect_mark_obsolete(vd, offset, size);
+ boolean_t checkpoint = B_FALSE;
+ vdev_indirect_ops.vdev_op_remap(vd, offset, size,
+ metaslab_free_impl_cb, &checkpoint);
+}
+
+/*
+ * On behalf of the removal thread, syncs an incremental bit more of
+ * the indirect mapping to disk and updates the in-memory mapping.
+ * Called as a sync task in every txg that the removal thread makes progress.
+ */
+static void
+vdev_mapping_sync(void *arg, dmu_tx_t *tx)
+{
+ spa_vdev_removal_t *svr = arg;
+ spa_t *spa = dmu_tx_pool(tx)->dp_spa;
+ vdev_t *vd = vdev_lookup_top(spa, svr->svr_vdev_id);
+ vdev_indirect_config_t *vic = &vd->vdev_indirect_config;
+ uint64_t txg = dmu_tx_get_txg(tx);
+ vdev_indirect_mapping_t *vim = vd->vdev_indirect_mapping;
+
+ ASSERT(vic->vic_mapping_object != 0);
+ ASSERT3U(txg, ==, spa_syncing_txg(spa));
+
+ vdev_indirect_mapping_add_entries(vim,
+ &svr->svr_new_segments[txg & TXG_MASK], tx);
+ vdev_indirect_births_add_entry(vd->vdev_indirect_births,
+ vdev_indirect_mapping_max_offset(vim), dmu_tx_get_txg(tx), tx);
+
+ /*
+ * Free the copied data for anything that was freed while the
+ * mapping entries were in flight.
+ */
+ mutex_enter(&svr->svr_lock);
+ range_tree_vacate(svr->svr_frees[txg & TXG_MASK],
+ free_mapped_segment_cb, vd);
+ ASSERT3U(svr->svr_max_offset_to_sync[txg & TXG_MASK], >=,
+ vdev_indirect_mapping_max_offset(vim));
+ svr->svr_max_offset_to_sync[txg & TXG_MASK] = 0;
+ mutex_exit(&svr->svr_lock);
+
+ spa_sync_removing_state(spa, tx);
+}
+
+typedef struct vdev_copy_segment_arg {
+ spa_t *vcsa_spa;
+ dva_t *vcsa_dest_dva;
+ uint64_t vcsa_txg;
+ range_tree_t *vcsa_obsolete_segs;
+} vdev_copy_segment_arg_t;
+
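+/*
+ * Free a segment of the copy destination: synthesize a minimal blkptr
+ * whose single DVA covers the segment on the destination vdev and pass
+ * it to zio_free(). This is used to unallocate the obsolete
+ * (not-actually-needed) portions of a copied range once the copy zios
+ * have completed.
+ */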
+static void
+unalloc_seg(void *arg, uint64_t start, uint64_t size)
+{
+ vdev_copy_segment_arg_t *vcsa = arg;
+ spa_t *spa = vcsa->vcsa_spa;
+ blkptr_t bp = { 0 };
+
+ BP_SET_BIRTH(&bp, TXG_INITIAL, TXG_INITIAL);
+ BP_SET_LSIZE(&bp, size);
+ BP_SET_PSIZE(&bp, size);
+ BP_SET_COMPRESS(&bp, ZIO_COMPRESS_OFF);
+ BP_SET_CHECKSUM(&bp, ZIO_CHECKSUM_OFF);
+ BP_SET_TYPE(&bp, DMU_OT_NONE);
+ BP_SET_LEVEL(&bp, 0);
+ BP_SET_DEDUP(&bp, 0);
+ BP_SET_BYTEORDER(&bp, ZFS_HOST_BYTEORDER);
+
+ DVA_SET_VDEV(&bp.blk_dva[0], DVA_GET_VDEV(vcsa->vcsa_dest_dva));
+ DVA_SET_OFFSET(&bp.blk_dva[0],
+ DVA_GET_OFFSET(vcsa->vcsa_dest_dva) + start);
+ DVA_SET_ASIZE(&bp.blk_dva[0], size);
+
+ zio_free(spa, vcsa->vcsa_txg, &bp);
+}
+
+/*
+ * All reads and writes associated with a call to spa_vdev_copy_segment()
+ * are done.
+ */
+static void
+spa_vdev_copy_segment_done(zio_t *zio)
+{
+ vdev_copy_segment_arg_t *vcsa = zio->io_private;
+
+ range_tree_vacate(vcsa->vcsa_obsolete_segs,
+ unalloc_seg, vcsa);
+ range_tree_destroy(vcsa->vcsa_obsolete_segs);
+ kmem_free(vcsa, sizeof (*vcsa));
+
+ spa_config_exit(zio->io_spa, SCL_STATE, zio->io_spa);
+}
+
+/*
+ * The write of the new location is done.
+ */
+static void
+spa_vdev_copy_segment_write_done(zio_t *zio)
+{
+ vdev_copy_arg_t *vca = zio->io_private;
+
+ abd_free(zio->io_abd);
+
+ mutex_enter(&vca->vca_lock);
+ vca->vca_outstanding_bytes -= zio->io_size;
+ cv_signal(&vca->vca_cv);
+ mutex_exit(&vca->vca_lock);
+}
+
+/*
+ * The read of the old location is done. The parent zio is the write to
+ * the new location. Allow it to start.
+ */
+static void
+spa_vdev_copy_segment_read_done(zio_t *zio)
+{
+ zio_nowait(zio_unique_parent(zio));
+}
+
+/*
+ * If the old and new vdevs are mirrors, we will read both sides of the old
+ * mirror, and write each copy to the corresponding side of the new mirror.
+ * If the old and new vdevs have a different number of children, we will do
+ * this as best as possible. Since we aren't verifying checksums, this
+ * ensures that as long as there's a good copy of the data, we'll have a
+ * good copy after the removal, even if there's silent damage to one side
+ * of the mirror. If we're removing a mirror that has some silent damage,
+ * we'll have exactly the same damage in the new location (assuming that
+ * the new location is also a mirror).
+ *
+ * We accomplish this by creating a tree of zio_t's, with as many writes as
+ * there are "children" of the new vdev (a non-redundant vdev counts as one
+ * child, a 2-way mirror has 2 children, etc). Each write has an associated
+ * read from a child of the old vdev. Typically there will be the same
+ * number of children of the old and new vdevs. However, if there are more
+ * children of the new vdev, some child(ren) of the old vdev will be issued
+ * multiple reads. If there are more children of the old vdev, some copies
+ * will be dropped.
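+ *
+ * For example (hypothetical), when copying from a 2-way mirror to a
+ * 3-way mirror, dest children 0 and 1 read from source children 0
+ * and 1, and dest child 2 wraps around to read from source child 0
+ * again (the dest_id % source-children computation in
+ * spa_vdev_copy_one_child()).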
+ *
+ * For example, the tree of zio_t's for a 2-way mirror is:
+ *
+ * null
+ * / \
+ * write(new vdev, child 0) write(new vdev, child 1)
+ * | |
+ * read(old vdev, child 0) read(old vdev, child 1)
+ *
+ * Child zio's complete before their parents complete. However, zio's
+ * created with zio_vdev_child_io() may be issued before their children
+ * complete. In this case we need to make sure that the children (reads)
+ * complete before the parents (writes) are *issued*. We do this by not
+ * calling zio_nowait() on each write until its corresponding read has
+ * completed.
+ *
+ * The spa_config_lock must be held while zio's created by
+ * zio_vdev_child_io() are in progress, to ensure that the vdev tree does
+ * not change (e.g. due to a concurrent "zpool attach/detach"). The "null"
+ * zio is needed to release the spa_config_lock after all the reads and
+ * writes complete. (Note that we can't grab the config lock for each read,
+ * because it is not reentrant - we could deadlock with a thread waiting
+ * for a write lock.)
+ */
+static void
+spa_vdev_copy_one_child(vdev_copy_arg_t *vca, zio_t *nzio,
+ vdev_t *source_vd, uint64_t source_offset,
+ vdev_t *dest_child_vd, uint64_t dest_offset, int dest_id, uint64_t size)
+{
+ ASSERT3U(spa_config_held(nzio->io_spa, SCL_ALL, RW_READER), !=, 0);
+
+ mutex_enter(&vca->vca_lock);
+ vca->vca_outstanding_bytes += size;
+ mutex_exit(&vca->vca_lock);
+
+ abd_t *abd = abd_alloc_for_io(size, B_FALSE);
+
+ vdev_t *source_child_vd;
+ if (source_vd->vdev_ops == &vdev_mirror_ops && dest_id != -1) {
+ /*
+ * Source and dest are both mirrors. Copy from the same
+ * child id as we are copying to (wrapping around if there
+ * are more dest children than source children).
+ */
+ source_child_vd =
+ source_vd->vdev_child[dest_id % source_vd->vdev_children];
+ } else {
+ source_child_vd = source_vd;
+ }
+
+ zio_t *write_zio = zio_vdev_child_io(nzio, NULL,
+ dest_child_vd, dest_offset, abd, size,
+ ZIO_TYPE_WRITE, ZIO_PRIORITY_REMOVAL,
+ ZIO_FLAG_CANFAIL,
+ spa_vdev_copy_segment_write_done, vca);
+
+ zio_nowait(zio_vdev_child_io(write_zio, NULL,
+ source_child_vd, source_offset, abd, size,
+ ZIO_TYPE_READ, ZIO_PRIORITY_REMOVAL,
+ ZIO_FLAG_CANFAIL,
+ spa_vdev_copy_segment_read_done, vca));
+}
+
+/*
+ * Allocate a new location for this segment, and create the zio_t's to
+ * read from the old location and write to the new location.
+ */
+static int
+spa_vdev_copy_segment(vdev_t *vd, range_tree_t *segs,
+ uint64_t maxalloc, uint64_t txg,
+ vdev_copy_arg_t *vca, zio_alloc_list_t *zal)
+{
+ metaslab_group_t *mg = vd->vdev_mg;
+ spa_t *spa = vd->vdev_spa;
+ spa_vdev_removal_t *svr = spa->spa_vdev_removal;
+ vdev_indirect_mapping_entry_t *entry;
+ dva_t dst = { 0 };
+ uint64_t start = range_tree_min(segs);
+
+ ASSERT3U(maxalloc, <=, SPA_MAXBLOCKSIZE);
+
+ uint64_t size = range_tree_span(segs);
+ if (range_tree_span(segs) > maxalloc) {
+ /*
+ * We can't allocate all the segments. Prefer to end
+ * the allocation at the end of a segment, thus avoiding
+ * additional split blocks.
+ */
+ range_seg_t search;
+ avl_index_t where;
+ search.rs_start = start + maxalloc;
+ search.rs_end = search.rs_start;
+ range_seg_t *rs = avl_find(&segs->rt_root, &search, &where);
+ if (rs == NULL) {
+ rs = avl_nearest(&segs->rt_root, where, AVL_BEFORE);
+ } else {
+ rs = AVL_PREV(&segs->rt_root, rs);
+ }
+ if (rs != NULL) {
+ size = rs->rs_end - start;
+ } else {
+ /*
+ * There are no segments that end before maxalloc.
+ * I.e. the first segment is larger than maxalloc,
+ * so we must split it.
+ */
+ size = maxalloc;
+ }
+ }
+ ASSERT3U(size, <=, maxalloc);
+
+ /*
+	 * An allocation class might not have any remaining vdevs or space.
+ */
+ metaslab_class_t *mc = mg->mg_class;
+ if (mc != spa_normal_class(spa) && mc->mc_groups <= 1)
+ mc = spa_normal_class(spa);
+ int error = metaslab_alloc_dva(spa, mc, size, &dst, 0, NULL, txg, 0,
+ zal, 0);
+ if (error == ENOSPC && mc != spa_normal_class(spa)) {
+ error = metaslab_alloc_dva(spa, spa_normal_class(spa), size,
+ &dst, 0, NULL, txg, 0, zal, 0);
+ }
+ if (error != 0)
+ return (error);
+
+ /*
+ * Determine the ranges that are not actually needed. Offsets are
+ * relative to the start of the range to be copied (i.e. relative to the
+ * local variable "start").
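+	 *
+	 * For example (hypothetical): if segs holds [start, start+0x4000)
+	 * and [start+0x6000, start+0x8000) and we allocated size 0x8000,
+	 * the gap is recorded in obsolete_segs as [0x4000, 0x6000).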
+ */
+ range_tree_t *obsolete_segs = range_tree_create(NULL, NULL);
+
+ range_seg_t *rs = avl_first(&segs->rt_root);
+ ASSERT3U(rs->rs_start, ==, start);
+ uint64_t prev_seg_end = rs->rs_end;
+ while ((rs = AVL_NEXT(&segs->rt_root, rs)) != NULL) {
+ if (rs->rs_start >= start + size) {
+ break;
+ } else {
+ range_tree_add(obsolete_segs,
+ prev_seg_end - start,
+ rs->rs_start - prev_seg_end);
+ }
+ prev_seg_end = rs->rs_end;
+ }
+ /* We don't end in the middle of an obsolete range */
+ ASSERT3U(start + size, <=, prev_seg_end);
+
+ range_tree_clear(segs, start, size);
+
+ /*
+ * We can't have any padding of the allocated size, otherwise we will
+ * misunderstand what's allocated, and the size of the mapping.
+ * The caller ensures this will be true by passing in a size that is
+ * aligned to the worst (highest) ashift in the pool.
+ */
+ ASSERT3U(DVA_GET_ASIZE(&dst), ==, size);
+
+ entry = kmem_zalloc(sizeof (vdev_indirect_mapping_entry_t), KM_SLEEP);
+ DVA_MAPPING_SET_SRC_OFFSET(&entry->vime_mapping, start);
+ entry->vime_mapping.vimep_dst = dst;
+ if (spa_feature_is_enabled(spa, SPA_FEATURE_OBSOLETE_COUNTS)) {
+ entry->vime_obsolete_count = range_tree_space(obsolete_segs);
+ }
+
+ vdev_copy_segment_arg_t *vcsa = kmem_zalloc(sizeof (*vcsa), KM_SLEEP);
+ vcsa->vcsa_dest_dva = &entry->vime_mapping.vimep_dst;
+ vcsa->vcsa_obsolete_segs = obsolete_segs;
+ vcsa->vcsa_spa = spa;
+ vcsa->vcsa_txg = txg;
+
+ /*
+ * See comment before spa_vdev_copy_one_child().
+ */
+ spa_config_enter(spa, SCL_STATE, spa, RW_READER);
+ zio_t *nzio = zio_null(spa->spa_txg_zio[txg & TXG_MASK], spa, NULL,
+ spa_vdev_copy_segment_done, vcsa, 0);
+ vdev_t *dest_vd = vdev_lookup_top(spa, DVA_GET_VDEV(&dst));
+ if (dest_vd->vdev_ops == &vdev_mirror_ops) {
+ for (int i = 0; i < dest_vd->vdev_children; i++) {
+ vdev_t *child = dest_vd->vdev_child[i];
+ spa_vdev_copy_one_child(vca, nzio, vd, start,
+ child, DVA_GET_OFFSET(&dst), i, size);
+ }
+ } else {
+ spa_vdev_copy_one_child(vca, nzio, vd, start,
+ dest_vd, DVA_GET_OFFSET(&dst), -1, size);
+ }
+ zio_nowait(nzio);
+
+ list_insert_tail(&svr->svr_new_segments[txg & TXG_MASK], entry);
+ ASSERT3U(start + size, <=, vd->vdev_ms_count << vd->vdev_ms_shift);
+ vdev_dirty(vd, 0, NULL, txg);
+
+ return (0);
+}
+
+/*
+ * Complete the removal of a toplevel vdev. This is called as a
+ * synctask in the same txg that we will sync out the new config (to the
+ * MOS object) which indicates that this vdev is indirect.
+ */
+static void
+vdev_remove_complete_sync(void *arg, dmu_tx_t *tx)
+{
+ spa_vdev_removal_t *svr = arg;
+ spa_t *spa = dmu_tx_pool(tx)->dp_spa;
+ vdev_t *vd = vdev_lookup_top(spa, svr->svr_vdev_id);
+
+ ASSERT3P(vd->vdev_ops, ==, &vdev_indirect_ops);
+
+ for (int i = 0; i < TXG_SIZE; i++) {
+ ASSERT0(svr->svr_bytes_done[i]);
+ }
+
+ ASSERT3U(spa->spa_removing_phys.sr_copied, ==,
+ spa->spa_removing_phys.sr_to_copy);
+
+ vdev_destroy_spacemaps(vd, tx);
+
+ /* destroy leaf zaps, if any */
+ ASSERT3P(svr->svr_zaplist, !=, NULL);
+ for (nvpair_t *pair = nvlist_next_nvpair(svr->svr_zaplist, NULL);
+ pair != NULL;
+ pair = nvlist_next_nvpair(svr->svr_zaplist, pair)) {
+ vdev_destroy_unlink_zap(vd, fnvpair_value_uint64(pair), tx);
+ }
+ fnvlist_free(svr->svr_zaplist);
+
+ spa_finish_removal(dmu_tx_pool(tx)->dp_spa, DSS_FINISHED, tx);
+ /* vd->vdev_path is not available here */
+ spa_history_log_internal(spa, "vdev remove completed", tx,
+ "%s vdev %llu", spa_name(spa), vd->vdev_id);
+}
+
+static void
+vdev_remove_enlist_zaps(vdev_t *vd, nvlist_t *zlist)
+{
+ ASSERT3P(zlist, !=, NULL);
+ ASSERT3P(vd->vdev_ops, !=, &vdev_raidz_ops);
+
+ if (vd->vdev_leaf_zap != 0) {
+ char zkey[32];
+ (void) snprintf(zkey, sizeof (zkey), "%s-%ju",
+ VDEV_REMOVAL_ZAP_OBJS, (uintmax_t)vd->vdev_leaf_zap);
+ fnvlist_add_uint64(zlist, zkey, vd->vdev_leaf_zap);
+ }
+
+ for (uint64_t id = 0; id < vd->vdev_children; id++) {
+ vdev_remove_enlist_zaps(vd->vdev_child[id], zlist);
+ }
+}
+
+static void
+vdev_remove_replace_with_indirect(vdev_t *vd, uint64_t txg)
+{
+ vdev_t *ivd;
+ dmu_tx_t *tx;
+ spa_t *spa = vd->vdev_spa;
+ spa_vdev_removal_t *svr = spa->spa_vdev_removal;
+
+ /*
+ * First, build a list of leaf zaps to be destroyed.
+ * This is passed to the sync context thread,
+ * which does the actual unlinking.
+ */
+ svr->svr_zaplist = fnvlist_alloc();
+ vdev_remove_enlist_zaps(vd, svr->svr_zaplist);
+
+ ivd = vdev_add_parent(vd, &vdev_indirect_ops);
+ ivd->vdev_removing = 0;
+
+ vd->vdev_leaf_zap = 0;
+
+ vdev_remove_child(ivd, vd);
+ vdev_compact_children(ivd);
+
+ ASSERT(!list_link_active(&vd->vdev_state_dirty_node));
+
+ tx = dmu_tx_create_assigned(spa->spa_dsl_pool, txg);
+ dsl_sync_task_nowait(spa->spa_dsl_pool, vdev_remove_complete_sync, svr,
+ 0, ZFS_SPACE_CHECK_NONE, tx);
+ dmu_tx_commit(tx);
+
+ /*
+ * Indicate that this thread has exited.
+ * After this, we can not use svr.
+ */
+ mutex_enter(&svr->svr_lock);
+ svr->svr_thread = NULL;
+ cv_broadcast(&svr->svr_cv);
+ mutex_exit(&svr->svr_lock);
+}
+
+/*
+ * Complete the removal of a toplevel vdev. This is called in open
+ * context by the removal thread after all of the vdev's data has been
+ * copied.
+ */
+static void
+vdev_remove_complete(spa_t *spa)
+{
+ uint64_t txg;
+
+ /*
+ * Wait for any deferred frees to be synced before we call
+ * vdev_metaslab_fini()
+ */
+ txg_wait_synced(spa->spa_dsl_pool, 0);
+ txg = spa_vdev_enter(spa);
+ vdev_t *vd = vdev_lookup_top(spa, spa->spa_vdev_removal->svr_vdev_id);
+ ASSERT3P(vd->vdev_initialize_thread, ==, NULL);
+
+ sysevent_t *ev = spa_event_create(spa, vd, NULL,
+ ESC_ZFS_VDEV_REMOVE_DEV);
+
+ zfs_dbgmsg("finishing device removal for vdev %llu in txg %llu",
+ vd->vdev_id, txg);
+
+ /*
+ * Discard allocation state.
+ */
+ if (vd->vdev_mg != NULL) {
+ vdev_metaslab_fini(vd);
+ metaslab_group_destroy(vd->vdev_mg);
+ vd->vdev_mg = NULL;
+ }
+ ASSERT0(vd->vdev_stat.vs_space);
+ ASSERT0(vd->vdev_stat.vs_dspace);
+
+ vdev_remove_replace_with_indirect(vd, txg);
+
+ /*
+ * We now release the locks, allowing spa_sync to run and finish the
+ * removal via vdev_remove_complete_sync in syncing context.
+ *
+ * Note that we hold on to the vdev_t that has been replaced. Since
+ * it isn't part of the vdev tree any longer, it can't be concurrently
+ * manipulated, even while we don't have the config lock.
+ */
+ (void) spa_vdev_exit(spa, NULL, txg, 0);
+
+ /*
+ * Top ZAP should have been transferred to the indirect vdev in
+ * vdev_remove_replace_with_indirect.
+ */
+ ASSERT0(vd->vdev_top_zap);
+
+ /*
+ * Leaf ZAP should have been moved in vdev_remove_replace_with_indirect.
+ */
+ ASSERT0(vd->vdev_leaf_zap);
+
+ txg = spa_vdev_enter(spa);
+ (void) vdev_label_init(vd, 0, VDEV_LABEL_REMOVE);
+ /*
+ * Request to update the config and the config cachefile.
+ */
+ vdev_config_dirty(spa->spa_root_vdev);
+ (void) spa_vdev_exit(spa, vd, txg, 0);
+
+ spa_event_post(ev);
+}
+
+/*
+ * Evacuates a segment of size at most max_alloc from the vdev
+ * via repeated calls to spa_vdev_copy_segment. If an allocation
+ * fails, the pool is probably too fragmented to handle such a
+ * large size, so decrease max_alloc so that the caller will not try
+ * this size again this txg.
+ */
+static void
+spa_vdev_copy_impl(vdev_t *vd, spa_vdev_removal_t *svr, vdev_copy_arg_t *vca,
+ uint64_t *max_alloc, dmu_tx_t *tx)
+{
+ uint64_t txg = dmu_tx_get_txg(tx);
+ spa_t *spa = dmu_tx_pool(tx)->dp_spa;
+
+ mutex_enter(&svr->svr_lock);
+
+ /*
+ * Determine how big of a chunk to copy. We can allocate up
+ * to max_alloc bytes, and we can span up to vdev_removal_max_span
+ * bytes of unallocated space at a time. "segs" will track the
+ * allocated segments that we are copying. We may also be copying
+ * free segments (of up to vdev_removal_max_span bytes).
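+	 *
+	 * For example (hypothetical numbers): with *max_alloc = 16M and
+	 * vdev_removal_max_span = 32K, allocated segments [0, 1M) and
+	 * [1M+16K, 2M) are coalesced into one chunk, because the 16K
+	 * hole between them is within the max span; a third segment
+	 * starting at 40M would begin the next chunk.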
+ */
+ range_tree_t *segs = range_tree_create(NULL, NULL);
+ for (;;) {
+ range_seg_t *rs = avl_first(&svr->svr_allocd_segs->rt_root);
+ if (rs == NULL)
+ break;
+
+ uint64_t seg_length;
+
+ if (range_tree_is_empty(segs)) {
+ /* need to truncate the first seg based on max_alloc */
+ seg_length =
+ MIN(rs->rs_end - rs->rs_start, *max_alloc);
+ } else {
+ if (rs->rs_start - range_tree_max(segs) >
+ vdev_removal_max_span) {
+ /*
+ * Including this segment would cause us to
+ * copy a larger unneeded chunk than is allowed.
+ */
+ break;
+ } else if (rs->rs_end - range_tree_min(segs) >
+ *max_alloc) {
+ /*
+ * This additional segment would extend past
+ * max_alloc. Rather than splitting this
+ * segment, leave it for the next mapping.
+ */
+ break;
+ } else {
+ seg_length = rs->rs_end - rs->rs_start;
+ }
+ }
+
+ range_tree_add(segs, rs->rs_start, seg_length);
+ range_tree_remove(svr->svr_allocd_segs,
+ rs->rs_start, seg_length);
+ }
+
+ if (range_tree_is_empty(segs)) {
+ mutex_exit(&svr->svr_lock);
+ range_tree_destroy(segs);
+ return;
+ }
+
+ if (svr->svr_max_offset_to_sync[txg & TXG_MASK] == 0) {
+ dsl_sync_task_nowait(dmu_tx_pool(tx), vdev_mapping_sync,
+ svr, 0, ZFS_SPACE_CHECK_NONE, tx);
+ }
+
+ svr->svr_max_offset_to_sync[txg & TXG_MASK] = range_tree_max(segs);
+
+ /*
+ * Note: this is the amount of *allocated* space
+ * that we are taking care of each txg.
+ */
+ svr->svr_bytes_done[txg & TXG_MASK] += range_tree_space(segs);
+
+ mutex_exit(&svr->svr_lock);
+
+ zio_alloc_list_t zal;
+ metaslab_trace_init(&zal);
+ uint64_t thismax = SPA_MAXBLOCKSIZE;
+ while (!range_tree_is_empty(segs)) {
+ int error = spa_vdev_copy_segment(vd,
+ segs, thismax, txg, vca, &zal);
+
+ if (error == ENOSPC) {
+ /*
+ * Cut our segment in half, and don't try this
+ * segment size again this txg. Note that the
+ * allocation size must be aligned to the highest
+ * ashift in the pool, so that the allocation will
+ * not be padded out to a multiple of the ashift,
+ * which could cause us to think that this mapping
+ * is larger than we intended.
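+			 *
+			 * For example (hypothetical): with ashift 12 (4K)
+			 * and attempted = 1M, we retry with thismax =
+			 * P2ROUNDUP(512K, 4K) = 512K, and allow at most
+			 * 1M - 4K per allocation for the rest of this txg.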
+ */
+ ASSERT3U(spa->spa_max_ashift, >=, SPA_MINBLOCKSHIFT);
+ ASSERT3U(spa->spa_max_ashift, ==, spa->spa_min_ashift);
+ uint64_t attempted =
+ MIN(range_tree_span(segs), thismax);
+ thismax = P2ROUNDUP(attempted / 2,
+ 1 << spa->spa_max_ashift);
+ /*
+ * The minimum-size allocation can not fail.
+ */
+ ASSERT3U(attempted, >, 1 << spa->spa_max_ashift);
+ *max_alloc = attempted - (1 << spa->spa_max_ashift);
+ } else {
+ ASSERT0(error);
+
+ /*
+ * We've performed an allocation, so reset the
+ * alloc trace list.
+ */
+ metaslab_trace_fini(&zal);
+ metaslab_trace_init(&zal);
+ }
+ }
+ metaslab_trace_fini(&zal);
+ range_tree_destroy(segs);
+}
+
+/*
+ * The removal thread operates in open context. It iterates over all
+ * allocated space in the vdev, by loading each metaslab's spacemap.
+ * For each contiguous segment of allocated space (capping the segment
+ * size at SPA_MAXBLOCKSIZE), we:
+ * - Allocate space for it on another vdev.
+ * - Create a new mapping from the old location to the new location
+ * (as a record in svr_new_segments).
+ * - Initiate a logical read zio to get the data off the removing disk.
+ * - In the read zio's done callback, initiate a logical write zio to
+ * write it to the new vdev.
+ * Note that all of this will take effect when a particular TXG syncs.
+ * The sync thread ensures that all the phys reads and writes for the syncing
+ * TXG have completed (see spa_txg_zio) and writes the new mappings to disk
+ * (see vdev_mapping_sync()).
+ */
+static void
+spa_vdev_remove_thread(void *arg)
+{
+ spa_t *spa = arg;
+ spa_vdev_removal_t *svr = spa->spa_vdev_removal;
+ vdev_copy_arg_t vca;
+ uint64_t max_alloc = zfs_remove_max_segment;
+ uint64_t last_txg = 0;
+
+ spa_config_enter(spa, SCL_CONFIG, FTAG, RW_READER);
+ vdev_t *vd = vdev_lookup_top(spa, svr->svr_vdev_id);
+ vdev_indirect_mapping_t *vim = vd->vdev_indirect_mapping;
+ uint64_t start_offset = vdev_indirect_mapping_max_offset(vim);
+
+ ASSERT3P(vd->vdev_ops, !=, &vdev_indirect_ops);
+ ASSERT(vdev_is_concrete(vd));
+ ASSERT(vd->vdev_removing);
+ ASSERT(vd->vdev_indirect_config.vic_mapping_object != 0);
+ ASSERT(vim != NULL);
+
+ mutex_init(&vca.vca_lock, NULL, MUTEX_DEFAULT, NULL);
+ cv_init(&vca.vca_cv, NULL, CV_DEFAULT, NULL);
+ vca.vca_outstanding_bytes = 0;
+
+ mutex_enter(&svr->svr_lock);
+
+ /*
+ * Start from vim_max_offset so we pick up where we left off
+ * if we are restarting the removal after opening the pool.
+ */
+ uint64_t msi;
+ for (msi = start_offset >> vd->vdev_ms_shift;
+ msi < vd->vdev_ms_count && !svr->svr_thread_exit; msi++) {
+ metaslab_t *msp = vd->vdev_ms[msi];
+ ASSERT3U(msi, <=, vd->vdev_ms_count);
+
+ ASSERT0(range_tree_space(svr->svr_allocd_segs));
+
+ mutex_enter(&msp->ms_sync_lock);
+ mutex_enter(&msp->ms_lock);
+
+ /*
+ * Assert nothing in flight -- ms_*tree is empty.
+ */
+ for (int i = 0; i < TXG_SIZE; i++) {
+ ASSERT0(range_tree_space(msp->ms_allocating[i]));
+ }
+
+ /*
+ * If the metaslab has ever been allocated from (ms_sm!=NULL),
+ * read the allocated segments from the space map object
+ * into svr_allocd_segs. Since we do this while holding
+ * svr_lock and ms_sync_lock, concurrent frees (which
+ * would have modified the space map) will wait for us
+ * to finish loading the spacemap, and then take the
+ * appropriate action (see free_from_removing_vdev()).
+ */
+ if (msp->ms_sm != NULL) {
+ VERIFY0(space_map_load(msp->ms_sm,
+ svr->svr_allocd_segs, SM_ALLOC));
+
+ range_tree_walk(msp->ms_freeing,
+ range_tree_remove, svr->svr_allocd_segs);
+
+ /*
+ * When we are resuming from a paused removal (i.e.
+ * when importing a pool with a removal in progress),
+ * discard any state that we have already processed.
+ */
+ range_tree_clear(svr->svr_allocd_segs, 0, start_offset);
+ }
+ mutex_exit(&msp->ms_lock);
+ mutex_exit(&msp->ms_sync_lock);
+
+ vca.vca_msp = msp;
+ zfs_dbgmsg("copying %llu segments for metaslab %llu",
+ avl_numnodes(&svr->svr_allocd_segs->rt_root),
+ msp->ms_id);
+
+ while (!svr->svr_thread_exit &&
+ !range_tree_is_empty(svr->svr_allocd_segs)) {
+
+ mutex_exit(&svr->svr_lock);
+
+ /*
+ * We need to periodically drop the config lock so that
+ * writers can get in. Additionally, we can't wait
+ * for a txg to sync while holding a config lock
+ * (since a waiting writer could cause a 3-way deadlock
+ * with the sync thread, which also gets a config
+ * lock for reader). So we can't hold the config lock
+ * while calling dmu_tx_assign().
+ */
+ spa_config_exit(spa, SCL_CONFIG, FTAG);
+
+ /*
+ * This delay will pause the removal around the point
+			 * specified by zfs_remove_max_bytes_pause. We use
+			 * this solely for the test suite and during debugging.
+ */
+ uint64_t bytes_copied =
+ spa->spa_removing_phys.sr_copied;
+ for (int i = 0; i < TXG_SIZE; i++)
+ bytes_copied += svr->svr_bytes_done[i];
+ while (zfs_remove_max_bytes_pause <= bytes_copied &&
+ !svr->svr_thread_exit)
+ delay(hz);
+
+ mutex_enter(&vca.vca_lock);
+ while (vca.vca_outstanding_bytes >
+ zfs_remove_max_copy_bytes) {
+ cv_wait(&vca.vca_cv, &vca.vca_lock);
+ }
+ mutex_exit(&vca.vca_lock);
+
+ dmu_tx_t *tx =
+ dmu_tx_create_dd(spa_get_dsl(spa)->dp_mos_dir);
+
+ VERIFY0(dmu_tx_assign(tx, TXG_WAIT));
+ uint64_t txg = dmu_tx_get_txg(tx);
+
+ /*
+ * Reacquire the vdev_config lock. The vdev_t
+ * that we're removing may have changed, e.g. due
+ * to a vdev_attach or vdev_detach.
+ */
+ spa_config_enter(spa, SCL_CONFIG, FTAG, RW_READER);
+ vd = vdev_lookup_top(spa, svr->svr_vdev_id);
+
+ if (txg != last_txg)
+ max_alloc = zfs_remove_max_segment;
+ last_txg = txg;
+
+ spa_vdev_copy_impl(vd, svr, &vca, &max_alloc, tx);
+
+ dmu_tx_commit(tx);
+ mutex_enter(&svr->svr_lock);
+ }
+ }
+
+ mutex_exit(&svr->svr_lock);
+
+ spa_config_exit(spa, SCL_CONFIG, FTAG);
+
+ /*
+ * Wait for all copies to finish before cleaning up the vca.
+ */
+ txg_wait_synced(spa->spa_dsl_pool, 0);
+ ASSERT0(vca.vca_outstanding_bytes);
+
+ mutex_destroy(&vca.vca_lock);
+ cv_destroy(&vca.vca_cv);
+
+ if (svr->svr_thread_exit) {
+ mutex_enter(&svr->svr_lock);
+ range_tree_vacate(svr->svr_allocd_segs, NULL, NULL);
+ svr->svr_thread = NULL;
+ cv_broadcast(&svr->svr_cv);
+ mutex_exit(&svr->svr_lock);
+ } else {
+ ASSERT0(range_tree_space(svr->svr_allocd_segs));
+ vdev_remove_complete(spa);
+ }
+ thread_exit();
+}
+
+void
+spa_vdev_remove_suspend(spa_t *spa)
+{
+ spa_vdev_removal_t *svr = spa->spa_vdev_removal;
+
+ if (svr == NULL)
+ return;
+
+ mutex_enter(&svr->svr_lock);
+ svr->svr_thread_exit = B_TRUE;
+ while (svr->svr_thread != NULL)
+ cv_wait(&svr->svr_cv, &svr->svr_lock);
+ svr->svr_thread_exit = B_FALSE;
+ mutex_exit(&svr->svr_lock);
+}
+
+/* ARGSUSED */
+static int
+spa_vdev_remove_cancel_check(void *arg, dmu_tx_t *tx)
+{
+ spa_t *spa = dmu_tx_pool(tx)->dp_spa;
+
+ if (spa->spa_vdev_removal == NULL)
+ return (ESRCH);
+ return (0);
+}
+
+/*
+ * Cancel a removal by freeing all entries from the partial mapping
+ * and marking the vdev as no longer being removed.
+ */
+/* ARGSUSED */
+static void
+spa_vdev_remove_cancel_sync(void *arg, dmu_tx_t *tx)
+{
+ spa_t *spa = dmu_tx_pool(tx)->dp_spa;
+ spa_vdev_removal_t *svr = spa->spa_vdev_removal;
+ vdev_t *vd = vdev_lookup_top(spa, svr->svr_vdev_id);
+ vdev_indirect_config_t *vic = &vd->vdev_indirect_config;
+ vdev_indirect_mapping_t *vim = vd->vdev_indirect_mapping;
+ objset_t *mos = spa->spa_meta_objset;
+
+ ASSERT3P(svr->svr_thread, ==, NULL);
+
+ spa_feature_decr(spa, SPA_FEATURE_DEVICE_REMOVAL, tx);
+ if (vdev_obsolete_counts_are_precise(vd)) {
+ spa_feature_decr(spa, SPA_FEATURE_OBSOLETE_COUNTS, tx);
+ VERIFY0(zap_remove(spa->spa_meta_objset, vd->vdev_top_zap,
+ VDEV_TOP_ZAP_OBSOLETE_COUNTS_ARE_PRECISE, tx));
+ }
+
+ if (vdev_obsolete_sm_object(vd) != 0) {
+ ASSERT(vd->vdev_obsolete_sm != NULL);
+ ASSERT3U(vdev_obsolete_sm_object(vd), ==,
+ space_map_object(vd->vdev_obsolete_sm));
+
+ space_map_free(vd->vdev_obsolete_sm, tx);
+ VERIFY0(zap_remove(spa->spa_meta_objset, vd->vdev_top_zap,
+ VDEV_TOP_ZAP_INDIRECT_OBSOLETE_SM, tx));
+ space_map_close(vd->vdev_obsolete_sm);
+ vd->vdev_obsolete_sm = NULL;
+ spa_feature_decr(spa, SPA_FEATURE_OBSOLETE_COUNTS, tx);
+ }
+ for (int i = 0; i < TXG_SIZE; i++) {
+ ASSERT(list_is_empty(&svr->svr_new_segments[i]));
+ ASSERT3U(svr->svr_max_offset_to_sync[i], <=,
+ vdev_indirect_mapping_max_offset(vim));
+ }
+
+ for (uint64_t msi = 0; msi < vd->vdev_ms_count; msi++) {
+ metaslab_t *msp = vd->vdev_ms[msi];
+
+ if (msp->ms_start >= vdev_indirect_mapping_max_offset(vim))
+ break;
+
+ ASSERT0(range_tree_space(svr->svr_allocd_segs));
+
+ mutex_enter(&msp->ms_lock);
+
+ /*
+ * Assert nothing in flight -- ms_*tree is empty.
+ */
+ for (int i = 0; i < TXG_SIZE; i++)
+ ASSERT0(range_tree_space(msp->ms_allocating[i]));
+ for (int i = 0; i < TXG_DEFER_SIZE; i++)
+ ASSERT0(range_tree_space(msp->ms_defer[i]));
+ ASSERT0(range_tree_space(msp->ms_freed));
+
+ if (msp->ms_sm != NULL) {
+ mutex_enter(&svr->svr_lock);
+ VERIFY0(space_map_load(msp->ms_sm,
+ svr->svr_allocd_segs, SM_ALLOC));
+ range_tree_walk(msp->ms_freeing,
+ range_tree_remove, svr->svr_allocd_segs);
+
+ /*
+ * Clear everything past what has been synced,
+ * because we have not allocated mappings for it yet.
+ */
+ uint64_t syncd = vdev_indirect_mapping_max_offset(vim);
+ uint64_t sm_end = msp->ms_sm->sm_start +
+ msp->ms_sm->sm_size;
+ if (sm_end > syncd)
+ range_tree_clear(svr->svr_allocd_segs,
+ syncd, sm_end - syncd);
+
+ mutex_exit(&svr->svr_lock);
+ }
+ mutex_exit(&msp->ms_lock);
+
+ mutex_enter(&svr->svr_lock);
+ range_tree_vacate(svr->svr_allocd_segs,
+ free_mapped_segment_cb, vd);
+ mutex_exit(&svr->svr_lock);
+ }
+
+ /*
+ * Note: this must happen after we invoke free_mapped_segment_cb,
+ * because it adds to the obsolete_segments.
+ */
+ range_tree_vacate(vd->vdev_obsolete_segments, NULL, NULL);
+
+ ASSERT3U(vic->vic_mapping_object, ==,
+ vdev_indirect_mapping_object(vd->vdev_indirect_mapping));
+ vdev_indirect_mapping_close(vd->vdev_indirect_mapping);
+ vd->vdev_indirect_mapping = NULL;
+ vdev_indirect_mapping_free(mos, vic->vic_mapping_object, tx);
+ vic->vic_mapping_object = 0;
+
+ ASSERT3U(vic->vic_births_object, ==,
+ vdev_indirect_births_object(vd->vdev_indirect_births));
+ vdev_indirect_births_close(vd->vdev_indirect_births);
+ vd->vdev_indirect_births = NULL;
+ vdev_indirect_births_free(mos, vic->vic_births_object, tx);
+ vic->vic_births_object = 0;
+
+ /*
+ * We may have processed some frees from the removing vdev in this
+ * txg, thus increasing svr_bytes_done; discard that here to
+ * satisfy the assertions in spa_vdev_removal_destroy().
+ * Note that future txg's can not have any bytes_done, because
+ * future TXG's are only modified from open context, and we have
+ * already shut down the copying thread.
+ */
+ svr->svr_bytes_done[dmu_tx_get_txg(tx) & TXG_MASK] = 0;
+ spa_finish_removal(spa, DSS_CANCELED, tx);
+
+ vd->vdev_removing = B_FALSE;
+ vdev_config_dirty(vd);
+
+	zfs_dbgmsg("canceled device removal for vdev %llu in txg %llu",
+ vd->vdev_id, dmu_tx_get_txg(tx));
+ spa_history_log_internal(spa, "vdev remove canceled", tx,
+ "%s vdev %llu %s", spa_name(spa),
+ vd->vdev_id, (vd->vdev_path != NULL) ? vd->vdev_path : "-");
+}
+
+int
+spa_vdev_remove_cancel(spa_t *spa)
+{
+ spa_vdev_remove_suspend(spa);
+
+ if (spa->spa_vdev_removal == NULL)
+ return (ESRCH);
+
+ uint64_t vdid = spa->spa_vdev_removal->svr_vdev_id;
+
+ int error = dsl_sync_task(spa->spa_name, spa_vdev_remove_cancel_check,
+ spa_vdev_remove_cancel_sync, NULL, 0,
+ ZFS_SPACE_CHECK_EXTRA_RESERVED);
+
+ if (error == 0) {
+ spa_config_enter(spa, SCL_ALLOC | SCL_VDEV, FTAG, RW_WRITER);
+ vdev_t *vd = vdev_lookup_top(spa, vdid);
+ metaslab_group_activate(vd->vdev_mg);
+ spa_config_exit(spa, SCL_ALLOC | SCL_VDEV, FTAG);
+ }
+
+ return (error);
+}
+
+void
+svr_sync(spa_t *spa, dmu_tx_t *tx)
+{
+ spa_vdev_removal_t *svr = spa->spa_vdev_removal;
+ int txgoff = dmu_tx_get_txg(tx) & TXG_MASK;
+
+ /*
+ * This check is necessary so that we do not dirty the
+	 * DMU_POOL_DIRECTORY_OBJECT via spa_sync_removing_state() when there
+ * is nothing to do. Dirtying it every time would prevent us
+ * from syncing-to-convergence.
+ */
+ if (svr->svr_bytes_done[txgoff] == 0)
+ return;
+
+ /*
+ * Update progress accounting.
+ */
+ spa->spa_removing_phys.sr_copied += svr->svr_bytes_done[txgoff];
+ svr->svr_bytes_done[txgoff] = 0;
+
+ spa_sync_removing_state(spa, tx);
+}
+
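+/*
+ * Free the given top-level vdev while keeping the vdev namespace
+ * stable: if it was the last child of the root, compact the root's
+ * children; otherwise put a hole vdev in its place so that the
+ * remaining top-level vdevs keep their vdev ids.
+ */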
+static void
+vdev_remove_make_hole_and_free(vdev_t *vd)
+{
+ uint64_t id = vd->vdev_id;
+ spa_t *spa = vd->vdev_spa;
+ vdev_t *rvd = spa->spa_root_vdev;
+ boolean_t last_vdev = (id == (rvd->vdev_children - 1));
+
+ ASSERT(MUTEX_HELD(&spa_namespace_lock));
+ ASSERT(spa_config_held(spa, SCL_ALL, RW_WRITER) == SCL_ALL);
+
+ vdev_free(vd);
+
+ if (last_vdev) {
+ vdev_compact_children(rvd);
+ } else {
+ vd = vdev_alloc_common(spa, id, 0, &vdev_hole_ops);
+ vdev_add_child(rvd, vd);
+ }
+ vdev_config_dirty(rvd);
+
+ /*
+ * Reassess the health of our root vdev.
+ */
+ vdev_reopen(rvd);
+}
+
+/*
+ * Remove a log device. The config lock is held for the specified TXG.
+ */
+static int
+spa_vdev_remove_log(vdev_t *vd, uint64_t *txg)
+{
+ metaslab_group_t *mg = vd->vdev_mg;
+ spa_t *spa = vd->vdev_spa;
+ int error = 0;
+
+ ASSERT(vd->vdev_islog);
+ ASSERT(vd == vd->vdev_top);
+ ASSERT(MUTEX_HELD(&spa_namespace_lock));
+
+ /*
+ * Stop allocating from this vdev.
+ */
+ metaslab_group_passivate(mg);
+
+ /*
+ * Wait for the youngest allocations and frees to sync,
+ * and then wait for the deferral of those frees to finish.
+ */
+ spa_vdev_config_exit(spa, NULL,
+ *txg + TXG_CONCURRENT_STATES + TXG_DEFER_SIZE, 0, FTAG);
+
+ /*
+ * Evacuate the device. We don't hold the config lock as
+ * writer since we need to do I/O but we do keep the
+ * spa_namespace_lock held. Once this completes the device
+ * should no longer have any blocks allocated on it.
+ */
+ ASSERT(MUTEX_HELD(&spa_namespace_lock));
+ if (vd->vdev_stat.vs_alloc != 0)
+ error = spa_reset_logs(spa);
+
+ *txg = spa_vdev_config_enter(spa);
+
+ if (error != 0) {
+ metaslab_group_activate(mg);
+ return (error);
+ }
+ ASSERT0(vd->vdev_stat.vs_alloc);
+
+ /*
+ * The evacuation succeeded. Remove any remaining MOS metadata
+ * associated with this vdev, and wait for these changes to sync.
+ */
+ vd->vdev_removing = B_TRUE;
+
+ vdev_dirty_leaves(vd, VDD_DTL, *txg);
+ vdev_config_dirty(vd);
+
+ vdev_metaslab_fini(vd);
+
+ spa_history_log_internal(spa, "vdev remove", NULL,
+ "%s vdev %llu (log) %s", spa_name(spa), vd->vdev_id,
+ (vd->vdev_path != NULL) ? vd->vdev_path : "-");
+
+ /* Make sure these changes are sync'ed */
+ spa_vdev_config_exit(spa, NULL, *txg, 0, FTAG);
+
+ /* Stop initializing */
+ (void) vdev_initialize_stop_all(vd, VDEV_INITIALIZE_CANCELED);
+
+ *txg = spa_vdev_config_enter(spa);
+
+ sysevent_t *ev = spa_event_create(spa, vd, NULL,
+ ESC_ZFS_VDEV_REMOVE_DEV);
+ ASSERT(MUTEX_HELD(&spa_namespace_lock));
+ ASSERT(spa_config_held(spa, SCL_ALL, RW_WRITER) == SCL_ALL);
+
+ /* The top ZAP should have been destroyed by vdev_remove_empty. */
+ ASSERT0(vd->vdev_top_zap);
+ /* The leaf ZAP should have been destroyed by vdev_dtl_sync. */
+ ASSERT0(vd->vdev_leaf_zap);
+
+ (void) vdev_label_init(vd, 0, VDEV_LABEL_REMOVE);
+
+ if (list_link_active(&vd->vdev_state_dirty_node))
+ vdev_state_clean(vd);
+ if (list_link_active(&vd->vdev_config_dirty_node))
+ vdev_config_clean(vd);
+
+ ASSERT0(vd->vdev_stat.vs_alloc);
+
+ /*
+ * Clean up the vdev namespace.
+ */
+ vdev_remove_make_hole_and_free(vd);
+
+ if (ev != NULL)
+ spa_event_post(ev);
+
+ return (0);
+}
+
+static int
+spa_vdev_remove_top_check(vdev_t *vd)
+{
+ spa_t *spa = vd->vdev_spa;
+
+ if (vd != vd->vdev_top)
+ return (SET_ERROR(ENOTSUP));
+
+ if (!spa_feature_is_enabled(spa, SPA_FEATURE_DEVICE_REMOVAL))
+ return (SET_ERROR(ENOTSUP));
+
+ /* available space in the pool's normal class */
+ uint64_t available = dsl_dir_space_available(
+ spa->spa_dsl_pool->dp_root_dir, NULL, 0, B_TRUE);
+
+ metaslab_class_t *mc = vd->vdev_mg->mg_class;
+
+ /*
+ * When removing a vdev from an allocation class that has
+ * remaining vdevs, include available space from the class.
+ */
+ if (mc != spa_normal_class(spa) && mc->mc_groups > 1) {
+ uint64_t class_avail = metaslab_class_get_space(mc) -
+ metaslab_class_get_alloc(mc);
+
+ /* add class space, adjusted for overhead */
+ available += (class_avail * 94) / 100;
+ }
+
+ /*
+ * There has to be enough free space to remove the
+ * device and leave double the "slop" space (i.e. we
+ * must leave at least 3% of the pool free, in addition to
+ * the normal slop space).
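+	 *
+	 * For example (hypothetical): removing a vdev that contributes
+	 * 100G of capacity requires at least 100G + spa_get_slop_space()
+	 * of available space; otherwise we return ENOSPC.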
+ */
+ if (available < vd->vdev_stat.vs_dspace + spa_get_slop_space(spa)) {
+ return (SET_ERROR(ENOSPC));
+ }
+
+ /*
+ * There can not be a removal in progress.
+ */
+ if (spa->spa_removing_phys.sr_state == DSS_SCANNING)
+ return (SET_ERROR(EBUSY));
+
+ /*
+ * The device must have all its data.
+ */
+ if (!vdev_dtl_empty(vd, DTL_MISSING) ||
+ !vdev_dtl_empty(vd, DTL_OUTAGE))
+ return (SET_ERROR(EBUSY));
+
+ /*
+ * The device must be healthy.
+ */
+ if (!vdev_readable(vd))
+ return (SET_ERROR(EIO));
+
+ /*
+ * All vdevs in normal class must have the same ashift.
+ */
+ if (spa->spa_max_ashift != spa->spa_min_ashift) {
+ return (SET_ERROR(EINVAL));
+ }
+
+ /*
+	 * Also check that no concrete vdev in the normal class is
+	 * raidz, and that any mirrors consist only of leaf vdevs.
+ */
+ vdev_t *rvd = spa->spa_root_vdev;
+ int num_indirect = 0;
+ for (uint64_t id = 0; id < rvd->vdev_children; id++) {
+ vdev_t *cvd = rvd->vdev_child[id];
+ if (cvd->vdev_ashift != 0 && !cvd->vdev_islog)
+ ASSERT3U(cvd->vdev_ashift, ==, spa->spa_max_ashift);
+ if (cvd->vdev_ops == &vdev_indirect_ops)
+ num_indirect++;
+ if (!vdev_is_concrete(cvd))
+ continue;
+ if (cvd->vdev_ops == &vdev_raidz_ops)
+ return (SET_ERROR(EINVAL));
+ /*
+		 * Any mirror must consist of leaf vdevs only.
+ */
+ if (cvd->vdev_ops == &vdev_mirror_ops) {
+ for (uint64_t cid = 0;
+ cid < cvd->vdev_children; cid++) {
+ vdev_t *tmp = cvd->vdev_child[cid];
+ if (!tmp->vdev_ops->vdev_op_leaf)
+ return (SET_ERROR(EINVAL));
+ }
+ }
+ }
+
+ return (0);
+}
+
+/*
+ * Initiate removal of a top-level vdev, reducing the total space in the pool.
+ * The config lock is held for the specified TXG. Once initiated,
+ * evacuation of all allocated space (copying it to other vdevs) happens
+ * in the background (see spa_vdev_remove_thread()), and can be canceled
+ * (see spa_vdev_remove_cancel()). If successful, the vdev will
+ * be transformed to an indirect vdev (see vdev_remove_complete()).
+ */
+static int
+spa_vdev_remove_top(vdev_t *vd, uint64_t *txg)
+{
+ spa_t *spa = vd->vdev_spa;
+ int error;
+
+ /*
+ * Check for errors up-front, so that we don't waste time
+ * passivating the metaslab group and clearing the ZIL if there
+ * are errors.
+ */
+ error = spa_vdev_remove_top_check(vd);
+ if (error != 0)
+ return (error);
+
+ /*
+ * Stop allocating from this vdev. Note that we must check
+ * that this is not the only device in the pool before
+ * passivating, otherwise we will not be able to make
+ * progress because we can't allocate from any vdevs.
+ * The above check for sufficient free space serves this
+ * purpose.
+ */
+ metaslab_group_t *mg = vd->vdev_mg;
+ metaslab_group_passivate(mg);
+
+ /*
+ * Wait for the youngest allocations and frees to sync,
+ * and then wait for the deferral of those frees to finish.
+ */
+ spa_vdev_config_exit(spa, NULL,
+ *txg + TXG_CONCURRENT_STATES + TXG_DEFER_SIZE, 0, FTAG);
+
+ /*
+ * We must ensure that no "stubby" log blocks are allocated
+ * on the device to be removed. These blocks could be
+ * written at any time, including while we are in the middle
+ * of copying them.
+ */
+ error = spa_reset_logs(spa);
+
+ /*
+ * We stop any initializing that is currently in progress but leave
+ * the state as "active". This will allow the initializing to resume
+ * if the removal is canceled sometime later.
+ */
+ vdev_initialize_stop_all(vd, VDEV_INITIALIZE_ACTIVE);
+
+ *txg = spa_vdev_config_enter(spa);
+
+ /*
+ * Things might have changed while the config lock was dropped
+ * (e.g. space usage). Check for errors again.
+ */
+ if (error == 0)
+ error = spa_vdev_remove_top_check(vd);
+
+ if (error != 0) {
+ metaslab_group_activate(mg);
+ spa_async_request(spa, SPA_ASYNC_INITIALIZE_RESTART);
+ return (error);
+ }
+
+ vd->vdev_removing = B_TRUE;
+
+ vdev_dirty_leaves(vd, VDD_DTL, *txg);
+ vdev_config_dirty(vd);
+ dmu_tx_t *tx = dmu_tx_create_assigned(spa->spa_dsl_pool, *txg);
+ dsl_sync_task_nowait(spa->spa_dsl_pool,
+ vdev_remove_initiate_sync,
+ (void *)(uintptr_t)vd->vdev_id, 0, ZFS_SPACE_CHECK_NONE, tx);
+ dmu_tx_commit(tx);
+
+ return (0);
+}
+
+/*
+ * Remove a device from the pool.
+ *
+ * Removing a device from the vdev namespace requires several steps
+ * and can take a significant amount of time. As a result we use
+ * the spa_vdev_config_[enter/exit] functions which allow us to
+ * grab and release the spa_config_lock while still holding the namespace
+ * lock. During each step the configuration is synced out.
+ */
+int
+spa_vdev_remove(spa_t *spa, uint64_t guid, boolean_t unspare)
+{
+ vdev_t *vd;
+ nvlist_t **spares, **l2cache, *nv;
+ uint64_t txg = 0;
+ uint_t nspares, nl2cache;
+ int error = 0;
+ boolean_t locked = MUTEX_HELD(&spa_namespace_lock);
+ sysevent_t *ev = NULL;
+
+ ASSERT(spa_writeable(spa));
+
+ if (!locked)
+ txg = spa_vdev_enter(spa);
+
+ ASSERT(MUTEX_HELD(&spa_namespace_lock));
+ if (spa_feature_is_active(spa, SPA_FEATURE_POOL_CHECKPOINT)) {
+ error = (spa_has_checkpoint(spa)) ?
+ ZFS_ERR_CHECKPOINT_EXISTS : ZFS_ERR_DISCARDING_CHECKPOINT;
+
+ if (!locked)
+ return (spa_vdev_exit(spa, NULL, txg, error));
+
+ return (error);
+ }
+
+ vd = spa_lookup_by_guid(spa, guid, B_FALSE);
+
+ if (spa->spa_spares.sav_vdevs != NULL &&
+ nvlist_lookup_nvlist_array(spa->spa_spares.sav_config,
+ ZPOOL_CONFIG_SPARES, &spares, &nspares) == 0 &&
+ (nv = spa_nvlist_lookup_by_guid(spares, nspares, guid)) != NULL) {
+ /*
+ * Only remove the hot spare if it's not currently in use
+ * in this pool.
+ */
+ if (vd == NULL || unspare) {
+ char *nvstr = fnvlist_lookup_string(nv,
+ ZPOOL_CONFIG_PATH);
+ spa_history_log_internal(spa, "vdev remove", NULL,
+ "%s vdev (%s) %s", spa_name(spa),
+ VDEV_TYPE_SPARE, nvstr);
+ if (vd == NULL)
+ vd = spa_lookup_by_guid(spa, guid, B_TRUE);
+ ev = spa_event_create(spa, vd, NULL,
+ ESC_ZFS_VDEV_REMOVE_AUX);
+ spa_vdev_remove_aux(spa->spa_spares.sav_config,
+ ZPOOL_CONFIG_SPARES, spares, nspares, nv);
+ spa_load_spares(spa);
+ spa->spa_spares.sav_sync = B_TRUE;
+ } else {
+ error = SET_ERROR(EBUSY);
+ }
+ } else if (spa->spa_l2cache.sav_vdevs != NULL &&
+ nvlist_lookup_nvlist_array(spa->spa_l2cache.sav_config,
+ ZPOOL_CONFIG_L2CACHE, &l2cache, &nl2cache) == 0 &&
+ (nv = spa_nvlist_lookup_by_guid(l2cache, nl2cache, guid)) != NULL) {
+ char *nvstr = fnvlist_lookup_string(nv, ZPOOL_CONFIG_PATH);
+ spa_history_log_internal(spa, "vdev remove", NULL,
+ "%s vdev (%s) %s", spa_name(spa), VDEV_TYPE_L2CACHE, nvstr);
+ /*
+ * Cache devices can always be removed.
+ */
+ vd = spa_lookup_by_guid(spa, guid, B_TRUE);
+ ev = spa_event_create(spa, vd, NULL, ESC_ZFS_VDEV_REMOVE_AUX);
+ spa_vdev_remove_aux(spa->spa_l2cache.sav_config,
+ ZPOOL_CONFIG_L2CACHE, l2cache, nl2cache, nv);
+ spa_load_l2cache(spa);
+ spa->spa_l2cache.sav_sync = B_TRUE;
+ } else if (vd != NULL && vd->vdev_islog) {
+ ASSERT(!locked);
+ error = spa_vdev_remove_log(vd, &txg);
+ } else if (vd != NULL) {
+ ASSERT(!locked);
+ error = spa_vdev_remove_top(vd, &txg);
+ } else {
+ /*
+ * There is no vdev of any kind with the specified guid.
+ */
+ error = SET_ERROR(ENOENT);
+ }
+
+ if (!locked)
+ error = spa_vdev_exit(spa, NULL, txg, error);
+
+ if (ev != NULL) {
+ if (error != 0) {
+ spa_event_discard(ev);
+ } else {
+ spa_event_post(ev);
+ }
+ }
+
+ return (error);
+}
+
+int
+spa_removal_get_stats(spa_t *spa, pool_removal_stat_t *prs)
+{
+ prs->prs_state = spa->spa_removing_phys.sr_state;
+
+ if (prs->prs_state == DSS_NONE)
+ return (SET_ERROR(ENOENT));
+
+ prs->prs_removing_vdev = spa->spa_removing_phys.sr_removing_vdev;
+ prs->prs_start_time = spa->spa_removing_phys.sr_start_time;
+ prs->prs_end_time = spa->spa_removing_phys.sr_end_time;
+ prs->prs_to_copy = spa->spa_removing_phys.sr_to_copy;
+ prs->prs_copied = spa->spa_removing_phys.sr_copied;
+
+ if (spa->spa_vdev_removal != NULL) {
+ for (int i = 0; i < TXG_SIZE; i++) {
+ prs->prs_copied +=
+ spa->spa_vdev_removal->svr_bytes_done[i];
+ }
+ }
+
+ prs->prs_mapping_memory = 0;
+ uint64_t indirect_vdev_id =
+ spa->spa_removing_phys.sr_prev_indirect_vdev;
+ while (indirect_vdev_id != -1) {
+ vdev_t *vd = spa->spa_root_vdev->vdev_child[indirect_vdev_id];
+ vdev_indirect_config_t *vic = &vd->vdev_indirect_config;
+ vdev_indirect_mapping_t *vim = vd->vdev_indirect_mapping;
+
+ ASSERT3P(vd->vdev_ops, ==, &vdev_indirect_ops);
+ prs->prs_mapping_memory += vdev_indirect_mapping_size(vim);
+ indirect_vdev_id = vic->vic_prev_indirect_vdev;
+ }
+
+ return (0);
+}
diff --git a/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/vdev_root.c b/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/vdev_root.c
new file mode 100644
index 000000000000..a03d18704dfc
--- /dev/null
+++ b/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/vdev_root.c
@@ -0,0 +1,157 @@
+/*
+ * CDDL HEADER START
+ *
+ * The contents of this file are subject to the terms of the
+ * Common Development and Distribution License (the "License").
+ * You may not use this file except in compliance with the License.
+ *
+ * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
+ * or http://www.opensolaris.org/os/licensing.
+ * See the License for the specific language governing permissions
+ * and limitations under the License.
+ *
+ * When distributing Covered Code, include this CDDL HEADER in each
+ * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
+ * If applicable, add the following below this CDDL HEADER, with the
+ * fields enclosed by brackets "[]" replaced with your own identifying
+ * information: Portions Copyright [yyyy] [name of copyright owner]
+ *
+ * CDDL HEADER END
+ */
+/*
+ * Copyright 2010 Sun Microsystems, Inc. All rights reserved.
+ * Use is subject to license terms.
+ */
+
+/*
+ * Copyright (c) 2012, 2016 by Delphix. All rights reserved.
+ */
+
+#include <sys/zfs_context.h>
+#include <sys/spa.h>
+#include <sys/vdev_impl.h>
+#include <sys/zio.h>
+#include <sys/fs/zfs.h>
+
+/*
+ * Virtual device vector for the pool's root vdev.
+ */
+
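+/*
+ * Count the "core" top-level vdevs: children of the root that are not
+ * holes, logs, or indirect vdevs. These are the vdevs whose failures
+ * too_many_errors() considers.
+ */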
+static uint64_t
+vdev_root_core_tvds(vdev_t *vd)
+{
+ uint64_t tvds = 0;
+
+ for (uint64_t c = 0; c < vd->vdev_children; c++) {
+ vdev_t *cvd = vd->vdev_child[c];
+
+ if (!cvd->vdev_ishole && !cvd->vdev_islog &&
+ cvd->vdev_ops != &vdev_indirect_ops) {
+ tvds++;
+ }
+ }
+
+ return (tvds);
+}
+
+/*
+ * We should be able to tolerate one failure with absolutely no damage
+ * to our metadata. Two failures will take out space maps, a bunch of
+ * indirect block trees, meta dnodes, dnodes, etc. Probably not a happy
+ * place to live. When we get smarter, we can liberalize this policy.
+ * E.g., if we haven't lost two consecutive top-level vdevs, then we are
+ * probably fine. Adding bean counters during alloc/free can make this
+ * future guesswork more accurate.
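+ *
+ * For example (hypothetical): with 10 core top-level vdevs, losing
+ * all 10 always fails, and otherwise we fail only if the number of
+ * unopenable vdevs exceeds spa_missing_tvds_allowed().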
+ */
+static boolean_t
+too_many_errors(vdev_t *vd, uint64_t numerrors)
+{
+ uint64_t tvds;
+
+ if (numerrors == 0)
+ return (B_FALSE);
+
+ tvds = vdev_root_core_tvds(vd);
+ ASSERT3U(numerrors, <=, tvds);
+
+ if (numerrors == tvds)
+ return (B_TRUE);
+
+ return (numerrors > spa_missing_tvds_allowed(vd->vdev_spa));
+}
+
+static int
+vdev_root_open(vdev_t *vd, uint64_t *asize, uint64_t *max_asize,
+ uint64_t *logical_ashift, uint64_t *physical_ashift)
+{
+ spa_t *spa = vd->vdev_spa;
+ int lasterror = 0;
+ int numerrors = 0;
+
+ if (vd->vdev_children == 0) {
+ vd->vdev_stat.vs_aux = VDEV_AUX_BAD_LABEL;
+ return (SET_ERROR(EINVAL));
+ }
+
+ vdev_open_children(vd);
+
+ for (int c = 0; c < vd->vdev_children; c++) {
+ vdev_t *cvd = vd->vdev_child[c];
+
+ if (cvd->vdev_open_error && !cvd->vdev_islog) {
+ lasterror = cvd->vdev_open_error;
+ numerrors++;
+ }
+ }
+
+ if (spa_load_state(spa) != SPA_LOAD_NONE)
+ spa_set_missing_tvds(spa, numerrors);
+
+ if (too_many_errors(vd, numerrors)) {
+ vd->vdev_stat.vs_aux = VDEV_AUX_NO_REPLICAS;
+ return (lasterror);
+ }
+
+ *asize = 0;
+ *max_asize = 0;
+ *logical_ashift = 0;
+ *physical_ashift = 0;
+
+ return (0);
+}
+
+static void
+vdev_root_close(vdev_t *vd)
+{
+ for (int c = 0; c < vd->vdev_children; c++)
+ vdev_close(vd->vdev_child[c]);
+}
+
+static void
+vdev_root_state_change(vdev_t *vd, int faulted, int degraded)
+{
+ if (too_many_errors(vd, faulted)) {
+ vdev_set_state(vd, B_FALSE, VDEV_STATE_CANT_OPEN,
+ VDEV_AUX_NO_REPLICAS);
+ } else if (degraded || faulted) {
+ vdev_set_state(vd, B_FALSE, VDEV_STATE_DEGRADED, VDEV_AUX_NONE);
+ } else {
+ vdev_set_state(vd, B_FALSE, VDEV_STATE_HEALTHY, VDEV_AUX_NONE);
+ }
+}
+
+vdev_ops_t vdev_root_ops = {
+ vdev_root_open,
+ vdev_root_close,
+ vdev_default_asize,
+ NULL, /* io_start - not applicable to the root */
+ NULL, /* io_done - not applicable to the root */
+ vdev_root_state_change,
+ NULL,
+ NULL,
+ NULL,
+ NULL,
+ NULL,
+ VDEV_TYPE_ROOT, /* name of this vdev type */
+ B_FALSE /* not a leaf vdev */
+};
diff --git a/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/zap.c b/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/zap.c
new file mode 100644
index 000000000000..a0cadaae949d
--- /dev/null
+++ b/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/zap.c
@@ -0,0 +1,1378 @@
+/*
+ * CDDL HEADER START
+ *
+ * The contents of this file are subject to the terms of the
+ * Common Development and Distribution License (the "License").
+ * You may not use this file except in compliance with the License.
+ *
+ * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
+ * or http://www.opensolaris.org/os/licensing.
+ * See the License for the specific language governing permissions
+ * and limitations under the License.
+ *
+ * When distributing Covered Code, include this CDDL HEADER in each
+ * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
+ * If applicable, add the following below this CDDL HEADER, with the
+ * fields enclosed by brackets "[]" replaced with your own identifying
+ * information: Portions Copyright [yyyy] [name of copyright owner]
+ *
+ * CDDL HEADER END
+ */
+/*
+ * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved.
+ * Copyright (c) 2012, 2018 by Delphix. All rights reserved.
+ * Copyright (c) 2014 Spectra Logic Corporation, All rights reserved.
+ */
+
+/*
+ * This file contains the top half of the zfs directory structure
+ * implementation. The bottom half is in zap_leaf.c.
+ *
+ * The zdir is an extendable hash data structure. There is a table of
+ * pointers to buckets (zap_t->zd_data->zd_leafs). The buckets are
+ * each a constant size and hold a variable number of directory entries.
+ * The buckets (aka "leaf nodes") are implemented in zap_leaf.c.
+ *
+ * The pointer table holds a power-of-2 number of pointers
+ * (1<<zap_t->zd_data->zd_phys->zd_prefix_len). The bucket pointed to
+ * by the pointer at index i in the table holds entries whose hash
+ * value has i as its zd_prefix_len-bit prefix.
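+ *
+ * For example (hypothetical): with zd_prefix_len = 3 the table holds
+ * 8 pointers, and an entry whose hash begins with bits 101 is found
+ * through table index 5. Several consecutive indices may point at the
+ * same leaf until that leaf fills up and is split.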
+ */
+
+#include <sys/spa.h>
+#include <sys/dmu.h>
+#include <sys/zfs_context.h>
+#include <sys/zfs_znode.h>
+#include <sys/fs/zfs.h>
+#include <sys/zap.h>
+#include <sys/refcount.h>
+#include <sys/zap_impl.h>
+#include <sys/zap_leaf.h>
+
+/*
+ * If zap_iterate_prefetch is set, we will prefetch the entire ZAP object
+ * (all leaf blocks) when we start iterating over it.
+ *
+ * For zap_cursor_init(), the callers all intend to iterate through all the
+ * entries. There are a few cases where an error (typically i/o error) could
+ * cause it to bail out early.
+ *
+ * For zap_cursor_init_serialized(), there are callers that do the iteration
+ * outside of ZFS. Typically they would iterate over everything, but we
+ * don't have control of that. E.g. zfs_ioc_snapshot_list_next(),
+ * zcp_snapshots_iter(), and other iterators over things in the MOS - these
+ * are called by /sbin/zfs and channel programs. The other example is
+ * zfs_readdir() which iterates over directory entries for the getdents()
+ * syscall. /sbin/ls iterates to the end (unless it receives a signal), but
+ * userland doesn't have to.
+ *
+ * Given that the ZAP entries aren't returned in a specific order, the only
+ * legitimate use cases for partial iteration would be:
+ *
+ * 1. Pagination: e.g. you only want to display 100 entries at a time, so you
+ *    get the first 100 and then wait for the user to hit "next page"
+ *    (which they may never do).
+ *
+ * 2. You want to know if there are more than X entries, without relying on
+ * the zfs-specific implementation of the directory's st_size (which is
+ * the number of entries).
+ */
+boolean_t zap_iterate_prefetch = B_TRUE;
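+
+/*
+ * A full iteration over a ZAP object typically looks like this
+ * (a sketch; error handling omitted):
+ *
+ *	zap_cursor_t zc;
+ *	zap_attribute_t za;
+ *
+ *	for (zap_cursor_init(&zc, os, zapobj);
+ *	    zap_cursor_retrieve(&zc, &za) == 0;
+ *	    zap_cursor_advance(&zc))
+ *		... use za.za_name and za.za_first_integer ...
+ *	zap_cursor_fini(&zc);
+ */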
+
+int fzap_default_block_shift = 14; /* 16k blocksize */
+
+extern inline zap_phys_t *zap_f_phys(zap_t *zap);
+
+static uint64_t zap_allocate_blocks(zap_t *zap, int nblocks);
+
+void
+fzap_byteswap(void *vbuf, size_t size)
+{
+ uint64_t block_type = *(uint64_t *)vbuf;
+
+ if (block_type == ZBT_LEAF || block_type == BSWAP_64(ZBT_LEAF))
+ zap_leaf_byteswap(vbuf, size);
+ else {
+ /* it's a ptrtbl block */
+ byteswap_uint64_array(vbuf, size);
+ }
+}
+
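+/*
+ * Convert a (possibly microzap-formatted) header block into a fat zap
+ * in place: zero the block, reinitialize it as a ZBT_HEADER, point
+ * every embedded pointer-table entry at block 1, and set up block 1
+ * as the first (empty) leaf.
+ */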
+void
+fzap_upgrade(zap_t *zap, dmu_tx_t *tx, zap_flags_t flags)
+{
+ ASSERT(RW_WRITE_HELD(&zap->zap_rwlock));
+ zap->zap_ismicro = FALSE;
+
+ zap->zap_dbu.dbu_evict_func_sync = zap_evict_sync;
+ zap->zap_dbu.dbu_evict_func_async = NULL;
+
+ mutex_init(&zap->zap_f.zap_num_entries_mtx, 0, 0, 0);
+ zap->zap_f.zap_block_shift = highbit64(zap->zap_dbuf->db_size) - 1;
+
+ zap_phys_t *zp = zap_f_phys(zap);
+ /*
+ * explicitly zero it since it might be coming from an
+ * initialized microzap
+ */
+ bzero(zap->zap_dbuf->db_data, zap->zap_dbuf->db_size);
+ zp->zap_block_type = ZBT_HEADER;
+ zp->zap_magic = ZAP_MAGIC;
+
+ zp->zap_ptrtbl.zt_shift = ZAP_EMBEDDED_PTRTBL_SHIFT(zap);
+
+ zp->zap_freeblk = 2; /* block 1 will be the first leaf */
+ zp->zap_num_leafs = 1;
+ zp->zap_num_entries = 0;
+ zp->zap_salt = zap->zap_salt;
+ zp->zap_normflags = zap->zap_normflags;
+ zp->zap_flags = flags;
+
+ /* block 1 will be the first leaf */
+ for (int i = 0; i < (1<<zp->zap_ptrtbl.zt_shift); i++)
+ ZAP_EMBEDDED_PTRTBL_ENT(zap, i) = 1;
+
+ /*
+ * set up block 1 - the first leaf
+ */
+ dmu_buf_t *db;
+ VERIFY0(dmu_buf_hold(zap->zap_objset, zap->zap_object,
+ 1<<FZAP_BLOCK_SHIFT(zap), FTAG, &db, DMU_READ_NO_PREFETCH));
+ dmu_buf_will_dirty(db, tx);
+
+ zap_leaf_t *l = kmem_zalloc(sizeof (zap_leaf_t), KM_SLEEP);
+ l->l_dbuf = db;
+
+ zap_leaf_init(l, zp->zap_normflags != 0);
+
+ kmem_free(l, sizeof (zap_leaf_t));
+ dmu_buf_rele(db, FTAG);
+}
+
+static int
+zap_tryupgradedir(zap_t *zap, dmu_tx_t *tx)
+{
+ if (RW_WRITE_HELD(&zap->zap_rwlock))
+ return (1);
+ if (rw_tryupgrade(&zap->zap_rwlock)) {
+ dmu_buf_will_dirty(zap->zap_dbuf, tx);
+ return (1);
+ }
+ return (0);
+}
+
+/*
+ * Generic routines for dealing with the pointer & cookie tables.
+ */
+
+static int
+zap_table_grow(zap_t *zap, zap_table_phys_t *tbl,
+ void (*transfer_func)(const uint64_t *src, uint64_t *dst, int n),
+ dmu_tx_t *tx)
+{
+ uint64_t newblk;
+ int bs = FZAP_BLOCK_SHIFT(zap);
+ int hepb = 1<<(bs-4);
+ /* hepb = half the number of entries in a block */
+
+ ASSERT(RW_WRITE_HELD(&zap->zap_rwlock));
+ ASSERT(tbl->zt_blk != 0);
+ ASSERT(tbl->zt_numblks > 0);
+
+ if (tbl->zt_nextblk != 0) {
+ newblk = tbl->zt_nextblk;
+ } else {
+ newblk = zap_allocate_blocks(zap, tbl->zt_numblks * 2);
+ tbl->zt_nextblk = newblk;
+ ASSERT0(tbl->zt_blks_copied);
+ dmu_prefetch(zap->zap_objset, zap->zap_object, 0,
+ tbl->zt_blk << bs, tbl->zt_numblks << bs,
+ ZIO_PRIORITY_SYNC_READ);
+ }
+
+ /*
+ * Copy the ptrtbl from the old to new location.
+ */
+
+ uint64_t b = tbl->zt_blks_copied;
+ dmu_buf_t *db_old;
+ int err = dmu_buf_hold(zap->zap_objset, zap->zap_object,
+ (tbl->zt_blk + b) << bs, FTAG, &db_old, DMU_READ_NO_PREFETCH);
+ if (err != 0)
+ return (err);
+
+ /* first half of entries in old[b] go to new[2*b+0] */
+ dmu_buf_t *db_new;
+ VERIFY0(dmu_buf_hold(zap->zap_objset, zap->zap_object,
+ (newblk + 2*b+0) << bs, FTAG, &db_new, DMU_READ_NO_PREFETCH));
+ dmu_buf_will_dirty(db_new, tx);
+ transfer_func(db_old->db_data, db_new->db_data, hepb);
+ dmu_buf_rele(db_new, FTAG);
+
+ /* second half of entries in old[b] go to new[2*b+1] */
+ VERIFY0(dmu_buf_hold(zap->zap_objset, zap->zap_object,
+ (newblk + 2*b+1) << bs, FTAG, &db_new, DMU_READ_NO_PREFETCH));
+ dmu_buf_will_dirty(db_new, tx);
+ transfer_func((uint64_t *)db_old->db_data + hepb,
+ db_new->db_data, hepb);
+ dmu_buf_rele(db_new, FTAG);
+
+ dmu_buf_rele(db_old, FTAG);
+
+ tbl->zt_blks_copied++;
+
+ dprintf("copied block %llu of %llu\n",
+ tbl->zt_blks_copied, tbl->zt_numblks);
+
+ if (tbl->zt_blks_copied == tbl->zt_numblks) {
+ (void) dmu_free_range(zap->zap_objset, zap->zap_object,
+ tbl->zt_blk << bs, tbl->zt_numblks << bs, tx);
+
+ tbl->zt_blk = newblk;
+ tbl->zt_numblks *= 2;
+ tbl->zt_shift++;
+ tbl->zt_nextblk = 0;
+ tbl->zt_blks_copied = 0;
+
+ dprintf("finished; numblocks now %llu (%lluk entries)\n",
+ tbl->zt_numblks, 1<<(tbl->zt_shift-10));
+ }
+
+ return (0);
+}
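+
+/*
+ * To illustrate the incremental copy above (a sketch, not extra
+ * logic): growing a 2-block table to 4 blocks takes two calls.  The
+ * first call copies old[0] into new[0] and new[1]; the second copies
+ * old[1] into new[2] and new[3], then frees the old range and doubles
+ * zt_numblks.  Stores and loads issued in between consult both copies
+ * (see zap_table_store() and zap_table_load() below).
+ */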
+
+static int
+zap_table_store(zap_t *zap, zap_table_phys_t *tbl, uint64_t idx, uint64_t val,
+ dmu_tx_t *tx)
+{
+ int bs = FZAP_BLOCK_SHIFT(zap);
+
+ ASSERT(RW_LOCK_HELD(&zap->zap_rwlock));
+ ASSERT(tbl->zt_blk != 0);
+
+ dprintf("storing %llx at index %llx\n", val, idx);
+
+ uint64_t blk = idx >> (bs-3);
+ uint64_t off = idx & ((1<<(bs-3))-1);
+
+ dmu_buf_t *db;
+ int err = dmu_buf_hold(zap->zap_objset, zap->zap_object,
+ (tbl->zt_blk + blk) << bs, FTAG, &db, DMU_READ_NO_PREFETCH);
+ if (err != 0)
+ return (err);
+ dmu_buf_will_dirty(db, tx);
+
+ if (tbl->zt_nextblk != 0) {
+ uint64_t idx2 = idx * 2;
+ uint64_t blk2 = idx2 >> (bs-3);
+ uint64_t off2 = idx2 & ((1<<(bs-3))-1);
+ dmu_buf_t *db2;
+
+ err = dmu_buf_hold(zap->zap_objset, zap->zap_object,
+ (tbl->zt_nextblk + blk2) << bs, FTAG, &db2,
+ DMU_READ_NO_PREFETCH);
+ if (err != 0) {
+ dmu_buf_rele(db, FTAG);
+ return (err);
+ }
+ dmu_buf_will_dirty(db2, tx);
+ ((uint64_t *)db2->db_data)[off2] = val;
+ ((uint64_t *)db2->db_data)[off2+1] = val;
+ dmu_buf_rele(db2, FTAG);
+ }
+
+ ((uint64_t *)db->db_data)[off] = val;
+ dmu_buf_rele(db, FTAG);
+
+ return (0);
+}
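+
+/*
+ * A worked example of the index arithmetic above, assuming the
+ * default 16K blocks (bs == 14, so each block holds 1<<11 == 2048
+ * uint64_t entries):
+ *
+ *	idx = 5000:  blk = 5000 >> 11 == 2
+ *	             off = 5000 & 2047 == 904
+ *
+ * i.e. entry 5000 lives at word 904 of the table's third block.
+ */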
+
+static int
+zap_table_load(zap_t *zap, zap_table_phys_t *tbl, uint64_t idx, uint64_t *valp)
+{
+ int bs = FZAP_BLOCK_SHIFT(zap);
+
+ ASSERT(RW_LOCK_HELD(&zap->zap_rwlock));
+
+ uint64_t blk = idx >> (bs-3);
+ uint64_t off = idx & ((1<<(bs-3))-1);
+
+ /*
+ * Note: this is equivalent to dmu_buf_hold(), but we use
+ * _dnode_enter / _by_dnode because it's faster, since we don't
+ * have to separately hold the dnode.
+ */
+ dnode_t *dn = dmu_buf_dnode_enter(zap->zap_dbuf);
+ dmu_buf_t *db;
+ int err = dmu_buf_hold_by_dnode(dn,
+ (tbl->zt_blk + blk) << bs, FTAG, &db, DMU_READ_NO_PREFETCH);
+ dmu_buf_dnode_exit(zap->zap_dbuf);
+ if (err != 0)
+ return (err);
+ *valp = ((uint64_t *)db->db_data)[off];
+ dmu_buf_rele(db, FTAG);
+
+ if (tbl->zt_nextblk != 0) {
+ /*
+ * Read the nextblk for the sake of i/o error checking, so
+ * that zap_table_load() will catch errors for
+ * zap_table_store().
+ */
+ blk = (idx*2) >> (bs-3);
+
+ dn = dmu_buf_dnode_enter(zap->zap_dbuf);
+ err = dmu_buf_hold_by_dnode(dn,
+ (tbl->zt_nextblk + blk) << bs, FTAG, &db,
+ DMU_READ_NO_PREFETCH);
+ dmu_buf_dnode_exit(zap->zap_dbuf);
+ if (err == 0)
+ dmu_buf_rele(db, FTAG);
+ }
+ return (err);
+}
+
+/*
+ * Routines for growing the ptrtbl.
+ */
+
+static void
+zap_ptrtbl_transfer(const uint64_t *src, uint64_t *dst, int n)
+{
+ for (int i = 0; i < n; i++) {
+ uint64_t lb = src[i];
+ dst[2 * i + 0] = lb;
+ dst[2 * i + 1] = lb;
+ }
+}
+
+static int
+zap_grow_ptrtbl(zap_t *zap, dmu_tx_t *tx)
+{
+ /*
+ * The pointer table should never use more hash bits than we
+ * have (otherwise we'd be using useless zero bits to index it).
+ * If we are within 2 bits of running out, stop growing, since
+ * this is already an aberrant condition.
+ */
+ if (zap_f_phys(zap)->zap_ptrtbl.zt_shift >= zap_hashbits(zap) - 2)
+ return (SET_ERROR(ENOSPC));
+
+ if (zap_f_phys(zap)->zap_ptrtbl.zt_numblks == 0) {
+ /*
+ * We are outgrowing the "embedded" ptrtbl (the one
+ * stored in the header block). Give it its own entire
+ * block, which will double the size of the ptrtbl.
+ */
+ ASSERT3U(zap_f_phys(zap)->zap_ptrtbl.zt_shift, ==,
+ ZAP_EMBEDDED_PTRTBL_SHIFT(zap));
+ ASSERT0(zap_f_phys(zap)->zap_ptrtbl.zt_blk);
+
+ uint64_t newblk = zap_allocate_blocks(zap, 1);
+ dmu_buf_t *db_new;
+ int err = dmu_buf_hold(zap->zap_objset, zap->zap_object,
+ newblk << FZAP_BLOCK_SHIFT(zap), FTAG, &db_new,
+ DMU_READ_NO_PREFETCH);
+ if (err != 0)
+ return (err);
+ dmu_buf_will_dirty(db_new, tx);
+ zap_ptrtbl_transfer(&ZAP_EMBEDDED_PTRTBL_ENT(zap, 0),
+ db_new->db_data, 1 << ZAP_EMBEDDED_PTRTBL_SHIFT(zap));
+ dmu_buf_rele(db_new, FTAG);
+
+ zap_f_phys(zap)->zap_ptrtbl.zt_blk = newblk;
+ zap_f_phys(zap)->zap_ptrtbl.zt_numblks = 1;
+ zap_f_phys(zap)->zap_ptrtbl.zt_shift++;
+
+ ASSERT3U(1ULL << zap_f_phys(zap)->zap_ptrtbl.zt_shift, ==,
+ zap_f_phys(zap)->zap_ptrtbl.zt_numblks <<
+ (FZAP_BLOCK_SHIFT(zap)-3));
+
+ return (0);
+ } else {
+ return (zap_table_grow(zap, &zap_f_phys(zap)->zap_ptrtbl,
+ zap_ptrtbl_transfer, tx));
+ }
+}
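+
+/*
+ * Size sketch, assuming the default 16K blocks: the embedded ptrtbl
+ * lives in the second half of the header block and holds
+ * 1 << ZAP_EMBEDDED_PTRTBL_SHIFT(zap) == 1 << 10 pointers.  The first
+ * call above moves it into its own block of 1 << 11 pointers; each
+ * later call defers to zap_table_grow(), doubling zt_numblks and
+ * bumping zt_shift by one.
+ */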
+
+static void
+zap_increment_num_entries(zap_t *zap, int delta, dmu_tx_t *tx)
+{
+ dmu_buf_will_dirty(zap->zap_dbuf, tx);
+ mutex_enter(&zap->zap_f.zap_num_entries_mtx);
+ ASSERT(delta > 0 || zap_f_phys(zap)->zap_num_entries >= -delta);
+ zap_f_phys(zap)->zap_num_entries += delta;
+ mutex_exit(&zap->zap_f.zap_num_entries_mtx);
+}
+
+static uint64_t
+zap_allocate_blocks(zap_t *zap, int nblocks)
+{
+ ASSERT(RW_WRITE_HELD(&zap->zap_rwlock));
+ uint64_t newblk = zap_f_phys(zap)->zap_freeblk;
+ zap_f_phys(zap)->zap_freeblk += nblocks;
+ return (newblk);
+}
+
+static void
+zap_leaf_evict_sync(void *dbu)
+{
+ zap_leaf_t *l = dbu;
+
+ rw_destroy(&l->l_rwlock);
+ kmem_free(l, sizeof (zap_leaf_t));
+}
+
+static zap_leaf_t *
+zap_create_leaf(zap_t *zap, dmu_tx_t *tx)
+{
+ zap_leaf_t *l = kmem_zalloc(sizeof (zap_leaf_t), KM_SLEEP);
+
+ ASSERT(RW_WRITE_HELD(&zap->zap_rwlock));
+
+ rw_init(&l->l_rwlock, 0, 0, 0);
+ rw_enter(&l->l_rwlock, RW_WRITER);
+ l->l_blkid = zap_allocate_blocks(zap, 1);
+ l->l_dbuf = NULL;
+
+ VERIFY0(dmu_buf_hold(zap->zap_objset, zap->zap_object,
+ l->l_blkid << FZAP_BLOCK_SHIFT(zap), NULL, &l->l_dbuf,
+ DMU_READ_NO_PREFETCH));
+ dmu_buf_init_user(&l->l_dbu, zap_leaf_evict_sync, NULL, &l->l_dbuf);
+ VERIFY3P(NULL, ==, dmu_buf_set_user(l->l_dbuf, &l->l_dbu));
+ dmu_buf_will_dirty(l->l_dbuf, tx);
+
+ zap_leaf_init(l, zap->zap_normflags != 0);
+
+ zap_f_phys(zap)->zap_num_leafs++;
+
+ return (l);
+}
+
+int
+fzap_count(zap_t *zap, uint64_t *count)
+{
+ ASSERT(!zap->zap_ismicro);
+ mutex_enter(&zap->zap_f.zap_num_entries_mtx); /* unnecessary */
+ *count = zap_f_phys(zap)->zap_num_entries;
+ mutex_exit(&zap->zap_f.zap_num_entries_mtx);
+ return (0);
+}
+
+/*
+ * Routines for obtaining zap_leaf_t's
+ */
+
+void
+zap_put_leaf(zap_leaf_t *l)
+{
+ rw_exit(&l->l_rwlock);
+ dmu_buf_rele(l->l_dbuf, NULL);
+}
+
+static zap_leaf_t *
+zap_open_leaf(uint64_t blkid, dmu_buf_t *db)
+{
+ ASSERT(blkid != 0);
+
+ zap_leaf_t *l = kmem_zalloc(sizeof (zap_leaf_t), KM_SLEEP);
+ rw_init(&l->l_rwlock, 0, 0, 0);
+ rw_enter(&l->l_rwlock, RW_WRITER);
+ l->l_blkid = blkid;
+ l->l_bs = highbit64(db->db_size) - 1;
+ l->l_dbuf = db;
+
+ dmu_buf_init_user(&l->l_dbu, zap_leaf_evict_sync, NULL, &l->l_dbuf);
+ zap_leaf_t *winner = dmu_buf_set_user(db, &l->l_dbu);
+
+ rw_exit(&l->l_rwlock);
+ if (winner != NULL) {
+ /* someone else set it first */
+ zap_leaf_evict_sync(&l->l_dbu);
+ l = winner;
+ }
+
+ /*
+ * lh_pad1 was previously used for the next leaf in the leaf
+ * chain. There should be no chained leafs (as we have removed
+ * support for them).
+ */
+ ASSERT0(zap_leaf_phys(l)->l_hdr.lh_pad1);
+
+ /*
+ * There should be more hash-table entries than there can be
+ * ZAP entries in the leaf (each entry consumes at least three
+ * chunks: entry, name, and value).
+ */
+ ASSERT3U(ZAP_LEAF_HASH_NUMENTRIES(l), >, ZAP_LEAF_NUMCHUNKS(l) / 3);
+
+ /* The chunks should begin at the end of the hash table */
+ ASSERT3P(&ZAP_LEAF_CHUNK(l, 0), ==,
+ &zap_leaf_phys(l)->l_hash[ZAP_LEAF_HASH_NUMENTRIES(l)]);
+
+ /* The chunks should end at the end of the block */
+ ASSERT3U((uintptr_t)&ZAP_LEAF_CHUNK(l, ZAP_LEAF_NUMCHUNKS(l)) -
+ (uintptr_t)zap_leaf_phys(l), ==, l->l_dbuf->db_size);
+
+ return (l);
+}
+
+static int
+zap_get_leaf_byblk(zap_t *zap, uint64_t blkid, dmu_tx_t *tx, krw_t lt,
+ zap_leaf_t **lp)
+{
+ dmu_buf_t *db;
+
+ ASSERT(RW_LOCK_HELD(&zap->zap_rwlock));
+
+ int bs = FZAP_BLOCK_SHIFT(zap);
+ dnode_t *dn = dmu_buf_dnode_enter(zap->zap_dbuf);
+ int err = dmu_buf_hold_by_dnode(dn,
+ blkid << bs, NULL, &db, DMU_READ_NO_PREFETCH);
+ dmu_buf_dnode_exit(zap->zap_dbuf);
+ if (err != 0)
+ return (err);
+
+ ASSERT3U(db->db_object, ==, zap->zap_object);
+ ASSERT3U(db->db_offset, ==, blkid << bs);
+ ASSERT3U(db->db_size, ==, 1 << bs);
+ ASSERT(blkid != 0);
+
+ zap_leaf_t *l = dmu_buf_get_user(db);
+
+ if (l == NULL)
+ l = zap_open_leaf(blkid, db);
+
+ rw_enter(&l->l_rwlock, lt);
+ /*
+ * Must lock before dirtying, otherwise zap_leaf_phys(l) could change,
+ * causing ASSERT below to fail.
+ */
+ if (lt == RW_WRITER)
+ dmu_buf_will_dirty(db, tx);
+ ASSERT3U(l->l_blkid, ==, blkid);
+ ASSERT3P(l->l_dbuf, ==, db);
+ ASSERT3U(zap_leaf_phys(l)->l_hdr.lh_block_type, ==, ZBT_LEAF);
+ ASSERT3U(zap_leaf_phys(l)->l_hdr.lh_magic, ==, ZAP_LEAF_MAGIC);
+
+ *lp = l;
+ return (0);
+}
+
+static int
+zap_idx_to_blk(zap_t *zap, uint64_t idx, uint64_t *valp)
+{
+ ASSERT(RW_LOCK_HELD(&zap->zap_rwlock));
+
+ if (zap_f_phys(zap)->zap_ptrtbl.zt_numblks == 0) {
+ ASSERT3U(idx, <,
+ (1ULL << zap_f_phys(zap)->zap_ptrtbl.zt_shift));
+ *valp = ZAP_EMBEDDED_PTRTBL_ENT(zap, idx);
+ return (0);
+ } else {
+ return (zap_table_load(zap, &zap_f_phys(zap)->zap_ptrtbl,
+ idx, valp));
+ }
+}
+
+static int
+zap_set_idx_to_blk(zap_t *zap, uint64_t idx, uint64_t blk, dmu_tx_t *tx)
+{
+ ASSERT(tx != NULL);
+ ASSERT(RW_WRITE_HELD(&zap->zap_rwlock));
+
+ if (zap_f_phys(zap)->zap_ptrtbl.zt_blk == 0) {
+ ZAP_EMBEDDED_PTRTBL_ENT(zap, idx) = blk;
+ return (0);
+ } else {
+ return (zap_table_store(zap, &zap_f_phys(zap)->zap_ptrtbl,
+ idx, blk, tx));
+ }
+}
+
+static int
+zap_deref_leaf(zap_t *zap, uint64_t h, dmu_tx_t *tx, krw_t lt, zap_leaf_t **lp)
+{
+ uint64_t blk;
+
+ ASSERT(zap->zap_dbuf == NULL ||
+ zap_f_phys(zap) == zap->zap_dbuf->db_data);
+
+ /* Reality check for corrupt zap objects (leaf or header). */
+ if ((zap_f_phys(zap)->zap_block_type != ZBT_LEAF &&
+ zap_f_phys(zap)->zap_block_type != ZBT_HEADER) ||
+ zap_f_phys(zap)->zap_magic != ZAP_MAGIC) {
+ return (SET_ERROR(EIO));
+ }
+
+ uint64_t idx = ZAP_HASH_IDX(h, zap_f_phys(zap)->zap_ptrtbl.zt_shift);
+ int err = zap_idx_to_blk(zap, idx, &blk);
+ if (err != 0)
+ return (err);
+ err = zap_get_leaf_byblk(zap, blk, tx, lt, lp);
+
+ ASSERT(err ||
+ ZAP_HASH_IDX(h, zap_leaf_phys(*lp)->l_hdr.lh_prefix_len) ==
+ zap_leaf_phys(*lp)->l_hdr.lh_prefix);
+ return (err);
+}
+
+static int
+zap_expand_leaf(zap_name_t *zn, zap_leaf_t *l,
+ void *tag, dmu_tx_t *tx, zap_leaf_t **lp)
+{
+ zap_t *zap = zn->zn_zap;
+ uint64_t hash = zn->zn_hash;
+ int err;
+ int old_prefix_len = zap_leaf_phys(l)->l_hdr.lh_prefix_len;
+
+ ASSERT3U(old_prefix_len, <=, zap_f_phys(zap)->zap_ptrtbl.zt_shift);
+ ASSERT(RW_LOCK_HELD(&zap->zap_rwlock));
+
+ ASSERT3U(ZAP_HASH_IDX(hash, old_prefix_len), ==,
+ zap_leaf_phys(l)->l_hdr.lh_prefix);
+
+ if (zap_tryupgradedir(zap, tx) == 0 ||
+ old_prefix_len == zap_f_phys(zap)->zap_ptrtbl.zt_shift) {
+ /* We failed to upgrade, or need to grow the pointer table */
+ objset_t *os = zap->zap_objset;
+ uint64_t object = zap->zap_object;
+
+ zap_put_leaf(l);
+ zap_unlockdir(zap, tag);
+ err = zap_lockdir(os, object, tx, RW_WRITER,
+ FALSE, FALSE, tag, &zn->zn_zap);
+ zap = zn->zn_zap;
+ if (err != 0)
+ return (err);
+ ASSERT(!zap->zap_ismicro);
+
+ while (old_prefix_len ==
+ zap_f_phys(zap)->zap_ptrtbl.zt_shift) {
+ err = zap_grow_ptrtbl(zap, tx);
+ if (err != 0)
+ return (err);
+ }
+
+ err = zap_deref_leaf(zap, hash, tx, RW_WRITER, &l);
+ if (err != 0)
+ return (err);
+
+ if (zap_leaf_phys(l)->l_hdr.lh_prefix_len != old_prefix_len) {
+ /* it split while our locks were down */
+ *lp = l;
+ return (0);
+ }
+ }
+ ASSERT(RW_WRITE_HELD(&zap->zap_rwlock));
+ ASSERT3U(old_prefix_len, <, zap_f_phys(zap)->zap_ptrtbl.zt_shift);
+ ASSERT3U(ZAP_HASH_IDX(hash, old_prefix_len), ==,
+ zap_leaf_phys(l)->l_hdr.lh_prefix);
+
+ int prefix_diff = zap_f_phys(zap)->zap_ptrtbl.zt_shift -
+ (old_prefix_len + 1);
+ uint64_t sibling =
+ (ZAP_HASH_IDX(hash, old_prefix_len + 1) | 1) << prefix_diff;
+
+ /* check for i/o errors before doing zap_leaf_split */
+ for (int i = 0; i < (1ULL << prefix_diff); i++) {
+ uint64_t blk;
+ err = zap_idx_to_blk(zap, sibling + i, &blk);
+ if (err != 0)
+ return (err);
+ ASSERT3U(blk, ==, l->l_blkid);
+ }
+
+ zap_leaf_t *nl = zap_create_leaf(zap, tx);
+ zap_leaf_split(l, nl, zap->zap_normflags != 0);
+
+ /* set sibling pointers */
+ for (int i = 0; i < (1ULL << prefix_diff); i++) {
+ err = zap_set_idx_to_blk(zap, sibling + i, nl->l_blkid, tx);
+ ASSERT0(err); /* we checked for i/o errors above */
+ }
+
+ if (hash & (1ULL << (64 - zap_leaf_phys(l)->l_hdr.lh_prefix_len))) {
+ /* we want the sibling */
+ zap_put_leaf(l);
+ *lp = nl;
+ } else {
+ zap_put_leaf(nl);
+ *lp = l;
+ }
+
+ return (0);
+}
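+
+/*
+ * A worked example of the sibling arithmetic above (illustrative
+ * values): suppose the leaf's prefix was 101 (old_prefix_len == 3)
+ * and the pointer table's zt_shift is 6.  After the split the new
+ * leaf covers prefix 1011, so prefix_diff == 6 - 4 == 2 and
+ * sibling == (ZAP_HASH_IDX(hash, 4) | 1) << 2 == binary 101100; the
+ * four table slots 101100..101111 are repointed at the new leaf.
+ */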
+
+static void
+zap_put_leaf_maybe_grow_ptrtbl(zap_name_t *zn, zap_leaf_t *l,
+ void *tag, dmu_tx_t *tx)
+{
+ zap_t *zap = zn->zn_zap;
+ int shift = zap_f_phys(zap)->zap_ptrtbl.zt_shift;
+ int leaffull = (zap_leaf_phys(l)->l_hdr.lh_prefix_len == shift &&
+ zap_leaf_phys(l)->l_hdr.lh_nfree < ZAP_LEAF_LOW_WATER);
+
+ zap_put_leaf(l);
+
+ if (leaffull || zap_f_phys(zap)->zap_ptrtbl.zt_nextblk) {
+ /*
+ * We are in the middle of growing the pointer table, or
+ * this leaf will soon make us grow it.
+ */
+ if (zap_tryupgradedir(zap, tx) == 0) {
+ objset_t *os = zap->zap_objset;
+ uint64_t zapobj = zap->zap_object;
+
+ zap_unlockdir(zap, tag);
+ int err = zap_lockdir(os, zapobj, tx,
+ RW_WRITER, FALSE, FALSE, tag, &zn->zn_zap);
+ zap = zn->zn_zap;
+ if (err != 0)
+ return;
+ }
+
+ /* could have finished growing while our locks were down */
+ if (zap_f_phys(zap)->zap_ptrtbl.zt_shift == shift)
+ (void) zap_grow_ptrtbl(zap, tx);
+ }
+}
+
+static int
+fzap_checkname(zap_name_t *zn)
+{
+ if (zn->zn_key_orig_numints * zn->zn_key_intlen > ZAP_MAXNAMELEN)
+ return (SET_ERROR(ENAMETOOLONG));
+ return (0);
+}
+
+static int
+fzap_checksize(uint64_t integer_size, uint64_t num_integers)
+{
+ /* Only integer sizes supported by C */
+ switch (integer_size) {
+ case 1:
+ case 2:
+ case 4:
+ case 8:
+ break;
+ default:
+ return (SET_ERROR(EINVAL));
+ }
+
+ if (integer_size * num_integers > ZAP_MAXVALUELEN)
+ return (SET_ERROR(E2BIG));
+
+ return (0);
+}
+
+static int
+fzap_check(zap_name_t *zn, uint64_t integer_size, uint64_t num_integers)
+{
+ int err = fzap_checkname(zn);
+ if (err != 0)
+ return (err);
+ return (fzap_checksize(integer_size, num_integers));
+}
+
+/*
+ * Routines for manipulating attributes.
+ */
+int
+fzap_lookup(zap_name_t *zn,
+ uint64_t integer_size, uint64_t num_integers, void *buf,
+ char *realname, int rn_len, boolean_t *ncp)
+{
+ zap_leaf_t *l;
+ zap_entry_handle_t zeh;
+
+ int err = fzap_checkname(zn);
+ if (err != 0)
+ return (err);
+
+ err = zap_deref_leaf(zn->zn_zap, zn->zn_hash, NULL, RW_READER, &l);
+ if (err != 0)
+ return (err);
+ err = zap_leaf_lookup(l, zn, &zeh);
+ if (err == 0) {
+ if ((err = fzap_checksize(integer_size, num_integers)) != 0) {
+ zap_put_leaf(l);
+ return (err);
+ }
+
+ err = zap_entry_read(&zeh, integer_size, num_integers, buf);
+ (void) zap_entry_read_name(zn->zn_zap, &zeh, rn_len, realname);
+ if (ncp) {
+ *ncp = zap_entry_normalization_conflict(&zeh,
+ zn, NULL, zn->zn_zap);
+ }
+ }
+
+ zap_put_leaf(l);
+ return (err);
+}
+
+int
+fzap_add_cd(zap_name_t *zn,
+ uint64_t integer_size, uint64_t num_integers,
+ const void *val, uint32_t cd, void *tag, dmu_tx_t *tx)
+{
+ zap_leaf_t *l;
+ int err;
+ zap_entry_handle_t zeh;
+ zap_t *zap = zn->zn_zap;
+
+ ASSERT(RW_LOCK_HELD(&zap->zap_rwlock));
+ ASSERT(!zap->zap_ismicro);
+ ASSERT(fzap_check(zn, integer_size, num_integers) == 0);
+
+ err = zap_deref_leaf(zap, zn->zn_hash, tx, RW_WRITER, &l);
+ if (err != 0)
+ return (err);
+retry:
+ err = zap_leaf_lookup(l, zn, &zeh);
+ if (err == 0) {
+ err = SET_ERROR(EEXIST);
+ goto out;
+ }
+ if (err != ENOENT)
+ goto out;
+
+ err = zap_entry_create(l, zn, cd,
+ integer_size, num_integers, val, &zeh);
+
+ if (err == 0) {
+ zap_increment_num_entries(zap, 1, tx);
+ } else if (err == EAGAIN) {
+ err = zap_expand_leaf(zn, l, tag, tx, &l);
+ zap = zn->zn_zap; /* zap_expand_leaf() may change zap */
+ if (err == 0)
+ goto retry;
+ }
+
+out:
+ if (zap != NULL)
+ zap_put_leaf_maybe_grow_ptrtbl(zn, l, tag, tx);
+ return (err);
+}
+
+int
+fzap_add(zap_name_t *zn,
+ uint64_t integer_size, uint64_t num_integers,
+ const void *val, void *tag, dmu_tx_t *tx)
+{
+ int err = fzap_check(zn, integer_size, num_integers);
+ if (err != 0)
+ return (err);
+
+ return (fzap_add_cd(zn, integer_size, num_integers,
+ val, ZAP_NEED_CD, tag, tx));
+}
+
+int
+fzap_update(zap_name_t *zn,
+ int integer_size, uint64_t num_integers, const void *val,
+ void *tag, dmu_tx_t *tx)
+{
+ zap_leaf_t *l;
+ int err;
+ boolean_t create;
+ zap_entry_handle_t zeh;
+ zap_t *zap = zn->zn_zap;
+
+ ASSERT(RW_LOCK_HELD(&zap->zap_rwlock));
+ err = fzap_check(zn, integer_size, num_integers);
+ if (err != 0)
+ return (err);
+
+ err = zap_deref_leaf(zap, zn->zn_hash, tx, RW_WRITER, &l);
+ if (err != 0)
+ return (err);
+retry:
+ err = zap_leaf_lookup(l, zn, &zeh);
+ create = (err == ENOENT);
+ ASSERT(err == 0 || err == ENOENT);
+
+ if (create) {
+ err = zap_entry_create(l, zn, ZAP_NEED_CD,
+ integer_size, num_integers, val, &zeh);
+ if (err == 0)
+ zap_increment_num_entries(zap, 1, tx);
+ } else {
+ err = zap_entry_update(&zeh, integer_size, num_integers, val);
+ }
+
+ if (err == EAGAIN) {
+ err = zap_expand_leaf(zn, l, tag, tx, &l);
+ zap = zn->zn_zap; /* zap_expand_leaf() may change zap */
+ if (err == 0)
+ goto retry;
+ }
+
+ if (zap != NULL)
+ zap_put_leaf_maybe_grow_ptrtbl(zn, l, tag, tx);
+ return (err);
+}
+
+int
+fzap_length(zap_name_t *zn,
+ uint64_t *integer_size, uint64_t *num_integers)
+{
+ zap_leaf_t *l;
+ int err;
+ zap_entry_handle_t zeh;
+
+ err = zap_deref_leaf(zn->zn_zap, zn->zn_hash, NULL, RW_READER, &l);
+ if (err != 0)
+ return (err);
+ err = zap_leaf_lookup(l, zn, &zeh);
+ if (err != 0)
+ goto out;
+
+ if (integer_size != NULL)
+ *integer_size = zeh.zeh_integer_size;
+ if (num_integers != NULL)
+ *num_integers = zeh.zeh_num_integers;
+out:
+ zap_put_leaf(l);
+ return (err);
+}
+
+int
+fzap_remove(zap_name_t *zn, dmu_tx_t *tx)
+{
+ zap_leaf_t *l;
+ int err;
+ zap_entry_handle_t zeh;
+
+ err = zap_deref_leaf(zn->zn_zap, zn->zn_hash, tx, RW_WRITER, &l);
+ if (err != 0)
+ return (err);
+ err = zap_leaf_lookup(l, zn, &zeh);
+ if (err == 0) {
+ zap_entry_remove(&zeh);
+ zap_increment_num_entries(zn->zn_zap, -1, tx);
+ }
+ zap_put_leaf(l);
+ return (err);
+}
+
+void
+fzap_prefetch(zap_name_t *zn)
+{
+ uint64_t blk;
+ zap_t *zap = zn->zn_zap;
+
+ uint64_t idx = ZAP_HASH_IDX(zn->zn_hash,
+ zap_f_phys(zap)->zap_ptrtbl.zt_shift);
+ if (zap_idx_to_blk(zap, idx, &blk) != 0)
+ return;
+ int bs = FZAP_BLOCK_SHIFT(zap);
+ dmu_prefetch(zap->zap_objset, zap->zap_object, 0, blk << bs, 1 << bs,
+ ZIO_PRIORITY_SYNC_READ);
+}
+
+/*
+ * Helper functions for consumers.
+ */
+
+uint64_t
+zap_create_link(objset_t *os, dmu_object_type_t ot, uint64_t parent_obj,
+ const char *name, dmu_tx_t *tx)
+{
+ return (zap_create_link_dnsize(os, ot, parent_obj, name, 0, tx));
+}
+
+uint64_t
+zap_create_link_dnsize(objset_t *os, dmu_object_type_t ot, uint64_t parent_obj,
+ const char *name, int dnodesize, dmu_tx_t *tx)
+{
+ uint64_t new_obj;
+
+ VERIFY((new_obj = zap_create_dnsize(os, ot, DMU_OT_NONE, 0,
+ dnodesize, tx)) > 0);
+ VERIFY0(zap_add(os, parent_obj, name, sizeof (uint64_t), 1, &new_obj,
+ tx));
+
+ return (new_obj);
+}
+
+int
+zap_value_search(objset_t *os, uint64_t zapobj, uint64_t value, uint64_t mask,
+ char *name)
+{
+ zap_cursor_t zc;
+ int err;
+
+ if (mask == 0)
+ mask = -1ULL;
+
+ zap_attribute_t *za = kmem_alloc(sizeof (*za), KM_SLEEP);
+ for (zap_cursor_init(&zc, os, zapobj);
+ (err = zap_cursor_retrieve(&zc, za)) == 0;
+ zap_cursor_advance(&zc)) {
+ if ((za->za_first_integer & mask) == (value & mask)) {
+ (void) strcpy(name, za->za_name);
+ break;
+ }
+ }
+ zap_cursor_fini(&zc);
+ kmem_free(za, sizeof (*za));
+ return (err);
+}
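+
+/*
+ * Usage sketch (illustrative): looking up a directory entry's name by
+ * its object number, masking off the non-object bits of the dirent
+ * value:
+ *
+ *	err = zap_value_search(os, dirobj, child_obj,
+ *	    ZFS_DIRENT_OBJ(-1ULL), namebuf);
+ *
+ * Note this is a linear scan of the whole ZAP, so it is O(n) in the
+ * number of entries.
+ */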
+
+int
+zap_join(objset_t *os, uint64_t fromobj, uint64_t intoobj, dmu_tx_t *tx)
+{
+ zap_cursor_t zc;
+ int err = 0;
+
+ zap_attribute_t *za = kmem_alloc(sizeof (*za), KM_SLEEP);
+ for (zap_cursor_init(&zc, os, fromobj);
+ zap_cursor_retrieve(&zc, za) == 0;
+ (void) zap_cursor_advance(&zc)) {
+ if (za->za_integer_length != 8 || za->za_num_integers != 1) {
+ err = SET_ERROR(EINVAL);
+ break;
+ }
+ err = zap_add(os, intoobj, za->za_name,
+ 8, 1, &za->za_first_integer, tx);
+ if (err != 0)
+ break;
+ }
+ zap_cursor_fini(&zc);
+ kmem_free(za, sizeof (*za));
+ return (err);
+}
+
+int
+zap_join_key(objset_t *os, uint64_t fromobj, uint64_t intoobj,
+ uint64_t value, dmu_tx_t *tx)
+{
+ zap_cursor_t zc;
+ int err = 0;
+
+ zap_attribute_t *za = kmem_alloc(sizeof (*za), KM_SLEEP);
+ for (zap_cursor_init(&zc, os, fromobj);
+ zap_cursor_retrieve(&zc, za) == 0;
+ (void) zap_cursor_advance(&zc)) {
+ if (za->za_integer_length != 8 || za->za_num_integers != 1) {
+ err = SET_ERROR(EINVAL);
+ break;
+ }
+ err = zap_add(os, intoobj, za->za_name,
+ 8, 1, &value, tx);
+ if (err != 0)
+ break;
+ }
+ zap_cursor_fini(&zc);
+ kmem_free(za, sizeof (*za));
+ return (err);
+}
+
+int
+zap_join_increment(objset_t *os, uint64_t fromobj, uint64_t intoobj,
+ dmu_tx_t *tx)
+{
+ zap_cursor_t zc;
+ int err = 0;
+
+ zap_attribute_t *za = kmem_alloc(sizeof (*za), KM_SLEEP);
+ for (zap_cursor_init(&zc, os, fromobj);
+ zap_cursor_retrieve(&zc, za) == 0;
+ (void) zap_cursor_advance(&zc)) {
+ uint64_t delta = 0;
+
+ if (za->za_integer_length != 8 || za->za_num_integers != 1) {
+ err = SET_ERROR(EINVAL);
+ break;
+ }
+
+ err = zap_lookup(os, intoobj, za->za_name, 8, 1, &delta);
+ if (err != 0 && err != ENOENT)
+ break;
+ delta += za->za_first_integer;
+ err = zap_update(os, intoobj, za->za_name, 8, 1, &delta, tx);
+ if (err != 0)
+ break;
+ }
+ zap_cursor_fini(&zc);
+ kmem_free(za, sizeof (*za));
+ return (err);
+}
+
+int
+zap_add_int(objset_t *os, uint64_t obj, uint64_t value, dmu_tx_t *tx)
+{
+ char name[20];
+
+ (void) snprintf(name, sizeof (name), "%llx", (longlong_t)value);
+ return (zap_add(os, obj, name, 8, 1, &value, tx));
+}
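+
+/*
+ * E.g. zap_add_int(os, obj, 255, tx) creates an entry whose name is
+ * the string "ff" and whose single uint64_t value is 255; the
+ * _remove/_lookup variants below address entries the same way.
+ */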
+
+int
+zap_remove_int(objset_t *os, uint64_t obj, uint64_t value, dmu_tx_t *tx)
+{
+ char name[20];
+
+ (void) snprintf(name, sizeof (name), "%llx", (longlong_t)value);
+ return (zap_remove(os, obj, name, tx));
+}
+
+int
+zap_lookup_int(objset_t *os, uint64_t obj, uint64_t value)
+{
+ char name[20];
+
+ (void) snprintf(name, sizeof (name), "%llx", (longlong_t)value);
+ return (zap_lookup(os, obj, name, 8, 1, &value));
+}
+
+int
+zap_add_int_key(objset_t *os, uint64_t obj,
+ uint64_t key, uint64_t value, dmu_tx_t *tx)
+{
+ char name[20];
+
+ (void) snprintf(name, sizeof (name), "%llx", (longlong_t)key);
+ return (zap_add(os, obj, name, 8, 1, &value, tx));
+}
+
+int
+zap_update_int_key(objset_t *os, uint64_t obj,
+ uint64_t key, uint64_t value, dmu_tx_t *tx)
+{
+ char name[20];
+
+ (void) snprintf(name, sizeof (name), "%llx", (longlong_t)key);
+ return (zap_update(os, obj, name, 8, 1, &value, tx));
+}
+
+int
+zap_lookup_int_key(objset_t *os, uint64_t obj, uint64_t key, uint64_t *valuep)
+{
+ char name[20];
+
+ (void) snprintf(name, sizeof (name), "%llx", (longlong_t)key);
+ return (zap_lookup(os, obj, name, 8, 1, valuep));
+}
+
+int
+zap_increment(objset_t *os, uint64_t obj, const char *name, int64_t delta,
+ dmu_tx_t *tx)
+{
+ uint64_t value = 0;
+
+ if (delta == 0)
+ return (0);
+
+ int err = zap_lookup(os, obj, name, 8, 1, &value);
+ if (err != 0 && err != ENOENT)
+ return (err);
+ value += delta;
+ if (value == 0)
+ err = zap_remove(os, obj, name, tx);
+ else
+ err = zap_update(os, obj, name, 8, 1, &value, tx);
+ return (err);
+}
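+
+/*
+ * Usage sketch (illustrative): maintaining a refcount-style entry.
+ * Starting from a missing entry, zap_increment(os, obj, "cnt", 2, tx)
+ * creates it with value 2 (ENOENT from zap_lookup() is treated as
+ * value 0); a later zap_increment(os, obj, "cnt", -2, tx) brings the
+ * value back to 0, which removes the entry entirely.
+ */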
+
+int
+zap_increment_int(objset_t *os, uint64_t obj, uint64_t key, int64_t delta,
+ dmu_tx_t *tx)
+{
+ char name[20];
+
+ (void) snprintf(name, sizeof (name), "%llx", (longlong_t)key);
+ return (zap_increment(os, obj, name, delta, tx));
+}
+
+/*
+ * Routines for iterating over the attributes.
+ */
+
+int
+fzap_cursor_retrieve(zap_t *zap, zap_cursor_t *zc, zap_attribute_t *za)
+{
+ int err = ENOENT;
+ zap_entry_handle_t zeh;
+ zap_leaf_t *l;
+
+ /* retrieve the next entry at or after zc_hash/zc_cd */
+ /* if no entry, return ENOENT */
+
+ /*
+ * If we are reading from the beginning, we're almost
+ * certain to iterate over the entire ZAP object. If there are
+ * multiple leaf blocks (freeblk > 2), prefetch the whole
+ * object, so that we read the leaf blocks concurrently.
+ * (Unless noprefetch was requested via zap_cursor_init_noprefetch()).
+ */
+ if (zc->zc_hash == 0 && zap_iterate_prefetch &&
+ zc->zc_prefetch && zap_f_phys(zap)->zap_freeblk > 2) {
+ dmu_prefetch(zc->zc_objset, zc->zc_zapobj, 0, 0,
+ zap_f_phys(zap)->zap_freeblk << FZAP_BLOCK_SHIFT(zap),
+ ZIO_PRIORITY_ASYNC_READ);
+ }
+
+ if (zc->zc_leaf &&
+ (ZAP_HASH_IDX(zc->zc_hash,
+ zap_leaf_phys(zc->zc_leaf)->l_hdr.lh_prefix_len) !=
+ zap_leaf_phys(zc->zc_leaf)->l_hdr.lh_prefix)) {
+ rw_enter(&zc->zc_leaf->l_rwlock, RW_READER);
+ zap_put_leaf(zc->zc_leaf);
+ zc->zc_leaf = NULL;
+ }
+
+again:
+ if (zc->zc_leaf == NULL) {
+ err = zap_deref_leaf(zap, zc->zc_hash, NULL, RW_READER,
+ &zc->zc_leaf);
+ if (err != 0)
+ return (err);
+ } else {
+ rw_enter(&zc->zc_leaf->l_rwlock, RW_READER);
+ }
+ l = zc->zc_leaf;
+
+ err = zap_leaf_lookup_closest(l, zc->zc_hash, zc->zc_cd, &zeh);
+
+ if (err == ENOENT) {
+ uint64_t nocare =
+ (1ULL << (64 - zap_leaf_phys(l)->l_hdr.lh_prefix_len)) - 1;
+ zc->zc_hash = (zc->zc_hash & ~nocare) + nocare + 1;
+ zc->zc_cd = 0;
+ if (zap_leaf_phys(l)->l_hdr.lh_prefix_len == 0 ||
+ zc->zc_hash == 0) {
+ zc->zc_hash = -1ULL;
+ } else {
+ zap_put_leaf(zc->zc_leaf);
+ zc->zc_leaf = NULL;
+ goto again;
+ }
+ }
+
+ if (err == 0) {
+ zc->zc_hash = zeh.zeh_hash;
+ zc->zc_cd = zeh.zeh_cd;
+ za->za_integer_length = zeh.zeh_integer_size;
+ za->za_num_integers = zeh.zeh_num_integers;
+ if (zeh.zeh_num_integers == 0) {
+ za->za_first_integer = 0;
+ } else {
+ err = zap_entry_read(&zeh, 8, 1, &za->za_first_integer);
+ ASSERT(err == 0 || err == EOVERFLOW);
+ }
+ err = zap_entry_read_name(zap, &zeh,
+ sizeof (za->za_name), za->za_name);
+ ASSERT(err == 0);
+
+ za->za_normalization_conflict =
+ zap_entry_normalization_conflict(&zeh,
+ NULL, za->za_name, zap);
+ }
+ rw_exit(&zc->zc_leaf->l_rwlock);
+ return (err);
+}
+
+static void
+zap_stats_ptrtbl(zap_t *zap, uint64_t *tbl, int len, zap_stats_t *zs)
+{
+ uint64_t lastblk = 0;
+
+ /*
+ * NB: if a leaf has more pointers than an entire ptrtbl block
+ * can hold, then it'll be accounted for more than once, since
+ * lastblk is reset for each ptrtbl block.
+ */
+ for (int i = 0; i < len; i++) {
+ zap_leaf_t *l;
+
+ if (tbl[i] == lastblk)
+ continue;
+ lastblk = tbl[i];
+
+ int err = zap_get_leaf_byblk(zap, tbl[i], NULL, RW_READER, &l);
+ if (err == 0) {
+ zap_leaf_stats(zap, l, zs);
+ zap_put_leaf(l);
+ }
+ }
+}
+
+int
+fzap_cursor_move_to_key(zap_cursor_t *zc, zap_name_t *zn)
+{
+ int err;
+ zap_leaf_t *l;
+ zap_entry_handle_t zeh;
+
+ if (zn->zn_key_orig_numints * zn->zn_key_intlen > ZAP_MAXNAMELEN)
+ return (SET_ERROR(ENAMETOOLONG));
+
+ err = zap_deref_leaf(zc->zc_zap, zn->zn_hash, NULL, RW_READER, &l);
+ if (err != 0)
+ return (err);
+
+ err = zap_leaf_lookup(l, zn, &zeh);
+ if (err != 0) {
+ zap_put_leaf(l);
+ return (err);
+ }
+
+ zc->zc_leaf = l;
+ zc->zc_hash = zeh.zeh_hash;
+ zc->zc_cd = zeh.zeh_cd;
+
+ return (err);
+}
+
+void
+fzap_get_stats(zap_t *zap, zap_stats_t *zs)
+{
+ int bs = FZAP_BLOCK_SHIFT(zap);
+ zs->zs_blocksize = 1ULL << bs;
+
+ /*
+ * Set zap_phys_t fields
+ */
+ zs->zs_num_leafs = zap_f_phys(zap)->zap_num_leafs;
+ zs->zs_num_entries = zap_f_phys(zap)->zap_num_entries;
+ zs->zs_num_blocks = zap_f_phys(zap)->zap_freeblk;
+ zs->zs_block_type = zap_f_phys(zap)->zap_block_type;
+ zs->zs_magic = zap_f_phys(zap)->zap_magic;
+ zs->zs_salt = zap_f_phys(zap)->zap_salt;
+
+ /*
+ * Set zap_ptrtbl fields
+ */
+ zs->zs_ptrtbl_len = 1ULL << zap_f_phys(zap)->zap_ptrtbl.zt_shift;
+ zs->zs_ptrtbl_nextblk = zap_f_phys(zap)->zap_ptrtbl.zt_nextblk;
+ zs->zs_ptrtbl_blks_copied =
+ zap_f_phys(zap)->zap_ptrtbl.zt_blks_copied;
+ zs->zs_ptrtbl_zt_blk = zap_f_phys(zap)->zap_ptrtbl.zt_blk;
+ zs->zs_ptrtbl_zt_numblks = zap_f_phys(zap)->zap_ptrtbl.zt_numblks;
+ zs->zs_ptrtbl_zt_shift = zap_f_phys(zap)->zap_ptrtbl.zt_shift;
+
+ if (zap_f_phys(zap)->zap_ptrtbl.zt_numblks == 0) {
+ /* the ptrtbl is entirely in the header block. */
+ zap_stats_ptrtbl(zap, &ZAP_EMBEDDED_PTRTBL_ENT(zap, 0),
+ 1 << ZAP_EMBEDDED_PTRTBL_SHIFT(zap), zs);
+ } else {
+ dmu_prefetch(zap->zap_objset, zap->zap_object, 0,
+ zap_f_phys(zap)->zap_ptrtbl.zt_blk << bs,
+ zap_f_phys(zap)->zap_ptrtbl.zt_numblks << bs,
+ ZIO_PRIORITY_SYNC_READ);
+
+ for (int b = 0; b < zap_f_phys(zap)->zap_ptrtbl.zt_numblks;
+ b++) {
+ dmu_buf_t *db;
+ int err;
+
+ err = dmu_buf_hold(zap->zap_objset, zap->zap_object,
+ (zap_f_phys(zap)->zap_ptrtbl.zt_blk + b) << bs,
+ FTAG, &db, DMU_READ_NO_PREFETCH);
+ if (err == 0) {
+ zap_stats_ptrtbl(zap, db->db_data,
+ 1<<(bs-3), zs);
+ dmu_buf_rele(db, FTAG);
+ }
+ }
+ }
+}
diff --git a/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/zap_leaf.c b/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/zap_leaf.c
new file mode 100644
index 000000000000..1c7c736d8e97
--- /dev/null
+++ b/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/zap_leaf.c
@@ -0,0 +1,849 @@
+/*
+ * CDDL HEADER START
+ *
+ * The contents of this file are subject to the terms of the
+ * Common Development and Distribution License (the "License").
+ * You may not use this file except in compliance with the License.
+ *
+ * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
+ * or http://www.opensolaris.org/os/licensing.
+ * See the License for the specific language governing permissions
+ * and limitations under the License.
+ *
+ * When distributing Covered Code, include this CDDL HEADER in each
+ * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
+ * If applicable, add the following below this CDDL HEADER, with the
+ * fields enclosed by brackets "[]" replaced with your own identifying
+ * information: Portions Copyright [yyyy] [name of copyright owner]
+ *
+ * CDDL HEADER END
+ */
+
+/*
+ * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved.
+ * Copyright (c) 2013, 2016 by Delphix. All rights reserved.
+ * Copyright 2017 Nexenta Systems, Inc.
+ */
+
+/*
+ * Historically, the 512-byte leaf was broken into 32 16-byte chunks;
+ * leaf blocks are larger now (see fzap_default_block_shift), and the
+ * geometry is given by ZAP_LEAF_NUMCHUNKS() and ZAP_LEAF_CHUNKSIZE.
+ * Chunk number n means l_chunk[n], even though the header precedes it.
+ * The names are stored null-terminated.
+ */
+
+#include <sys/zio.h>
+#include <sys/spa.h>
+#include <sys/dmu.h>
+#include <sys/zfs_context.h>
+#include <sys/fs/zfs.h>
+#include <sys/zap.h>
+#include <sys/zap_impl.h>
+#include <sys/zap_leaf.h>
+#include <sys/arc.h>
+
+static uint16_t *zap_leaf_rehash_entry(zap_leaf_t *l, uint16_t entry);
+
+#define CHAIN_END 0xffff /* end of the chunk chain */
+
+/* half the (current) minimum block size */
+#define MAX_ARRAY_BYTES (8<<10)
+
+#define LEAF_HASH(l, h) \
+ ((ZAP_LEAF_HASH_NUMENTRIES(l)-1) & \
+ ((h) >> \
+ (64 - ZAP_LEAF_HASH_SHIFT(l) - zap_leaf_phys(l)->l_hdr.lh_prefix_len)))
+
+#define LEAF_HASH_ENTPTR(l, h) (&zap_leaf_phys(l)->l_hash[LEAF_HASH(l, h)])
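+
+/*
+ * A worked example of LEAF_HASH (assuming the default 16K leaf, where
+ * ZAP_LEAF_HASH_SHIFT(l) == 9, i.e. 512 buckets): the leaf-local
+ * bucket is taken from the hash bits just below the leaf's prefix, so
+ * with lh_prefix_len == 4 the bucket index is
+ *
+ *	(h >> (64 - 9 - 4)) & 511
+ *
+ * i.e. bits 51..59 of the hash.
+ */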
+
+extern inline zap_leaf_phys_t *zap_leaf_phys(zap_leaf_t *l);
+
+static void
+zap_memset(void *a, int c, size_t n)
+{
+ char *cp = a;
+ char *cpend = cp + n;
+
+ while (cp < cpend)
+ *cp++ = c;
+}
+
+static void
+stv(int len, void *addr, uint64_t value)
+{
+ switch (len) {
+ case 1:
+ *(uint8_t *)addr = value;
+ return;
+ case 2:
+ *(uint16_t *)addr = value;
+ return;
+ case 4:
+ *(uint32_t *)addr = value;
+ return;
+ case 8:
+ *(uint64_t *)addr = value;
+ return;
+ }
+ ASSERT(!"bad int len");
+}
+
+static uint64_t
+ldv(int len, const void *addr)
+{
+ switch (len) {
+ case 1:
+ return (*(uint8_t *)addr);
+ case 2:
+ return (*(uint16_t *)addr);
+ case 4:
+ return (*(uint32_t *)addr);
+ case 8:
+ return (*(uint64_t *)addr);
+ }
+ ASSERT(!"bad int len");
+ return (0xFEEDFACEDEADBEEFULL);
+}
+
+void
+zap_leaf_byteswap(zap_leaf_phys_t *buf, int size)
+{
+ zap_leaf_t l;
+ dmu_buf_t l_dbuf;
+
+ l_dbuf.db_data = buf;
+ l.l_bs = highbit64(size) - 1;
+ l.l_dbuf = &l_dbuf;
+
+ buf->l_hdr.lh_block_type = BSWAP_64(buf->l_hdr.lh_block_type);
+ buf->l_hdr.lh_prefix = BSWAP_64(buf->l_hdr.lh_prefix);
+ buf->l_hdr.lh_magic = BSWAP_32(buf->l_hdr.lh_magic);
+ buf->l_hdr.lh_nfree = BSWAP_16(buf->l_hdr.lh_nfree);
+ buf->l_hdr.lh_nentries = BSWAP_16(buf->l_hdr.lh_nentries);
+ buf->l_hdr.lh_prefix_len = BSWAP_16(buf->l_hdr.lh_prefix_len);
+ buf->l_hdr.lh_freelist = BSWAP_16(buf->l_hdr.lh_freelist);
+
+ for (int i = 0; i < ZAP_LEAF_HASH_NUMENTRIES(&l); i++)
+ buf->l_hash[i] = BSWAP_16(buf->l_hash[i]);
+
+ for (int i = 0; i < ZAP_LEAF_NUMCHUNKS(&l); i++) {
+ zap_leaf_chunk_t *lc = &ZAP_LEAF_CHUNK(&l, i);
+ struct zap_leaf_entry *le;
+
+ switch (lc->l_free.lf_type) {
+ case ZAP_CHUNK_ENTRY:
+ le = &lc->l_entry;
+
+ le->le_type = BSWAP_8(le->le_type);
+ le->le_value_intlen = BSWAP_8(le->le_value_intlen);
+ le->le_next = BSWAP_16(le->le_next);
+ le->le_name_chunk = BSWAP_16(le->le_name_chunk);
+ le->le_name_numints = BSWAP_16(le->le_name_numints);
+ le->le_value_chunk = BSWAP_16(le->le_value_chunk);
+ le->le_value_numints = BSWAP_16(le->le_value_numints);
+ le->le_cd = BSWAP_32(le->le_cd);
+ le->le_hash = BSWAP_64(le->le_hash);
+ break;
+ case ZAP_CHUNK_FREE:
+ lc->l_free.lf_type = BSWAP_8(lc->l_free.lf_type);
+ lc->l_free.lf_next = BSWAP_16(lc->l_free.lf_next);
+ break;
+ case ZAP_CHUNK_ARRAY:
+ lc->l_array.la_type = BSWAP_8(lc->l_array.la_type);
+ lc->l_array.la_next = BSWAP_16(lc->l_array.la_next);
+ /* la_array doesn't need swapping */
+ break;
+ default:
+ ASSERT(!"bad leaf type");
+ }
+ }
+}
+
+void
+zap_leaf_init(zap_leaf_t *l, boolean_t sort)
+{
+ l->l_bs = highbit64(l->l_dbuf->db_size) - 1;
+ zap_memset(&zap_leaf_phys(l)->l_hdr, 0,
+ sizeof (struct zap_leaf_header));
+ zap_memset(zap_leaf_phys(l)->l_hash, CHAIN_END,
+ 2*ZAP_LEAF_HASH_NUMENTRIES(l));
+ for (int i = 0; i < ZAP_LEAF_NUMCHUNKS(l); i++) {
+ ZAP_LEAF_CHUNK(l, i).l_free.lf_type = ZAP_CHUNK_FREE;
+ ZAP_LEAF_CHUNK(l, i).l_free.lf_next = i+1;
+ }
+ ZAP_LEAF_CHUNK(l, ZAP_LEAF_NUMCHUNKS(l)-1).l_free.lf_next = CHAIN_END;
+ zap_leaf_phys(l)->l_hdr.lh_block_type = ZBT_LEAF;
+ zap_leaf_phys(l)->l_hdr.lh_magic = ZAP_LEAF_MAGIC;
+ zap_leaf_phys(l)->l_hdr.lh_nfree = ZAP_LEAF_NUMCHUNKS(l);
+ if (sort)
+ zap_leaf_phys(l)->l_hdr.lh_flags |= ZLF_ENTRIES_CDSORTED;
+}
+
+/*
+ * Routines which manipulate leaf chunks (l_chunk[]).
+ */
+
+static uint16_t
+zap_leaf_chunk_alloc(zap_leaf_t *l)
+{
+ ASSERT(zap_leaf_phys(l)->l_hdr.lh_nfree > 0);
+
+ int chunk = zap_leaf_phys(l)->l_hdr.lh_freelist;
+ ASSERT3U(chunk, <, ZAP_LEAF_NUMCHUNKS(l));
+ ASSERT3U(ZAP_LEAF_CHUNK(l, chunk).l_free.lf_type, ==, ZAP_CHUNK_FREE);
+
+ zap_leaf_phys(l)->l_hdr.lh_freelist =
+ ZAP_LEAF_CHUNK(l, chunk).l_free.lf_next;
+
+ zap_leaf_phys(l)->l_hdr.lh_nfree--;
+
+ return (chunk);
+}
+
+static void
+zap_leaf_chunk_free(zap_leaf_t *l, uint16_t chunk)
+{
+ struct zap_leaf_free *zlf = &ZAP_LEAF_CHUNK(l, chunk).l_free;
+ ASSERT3U(zap_leaf_phys(l)->l_hdr.lh_nfree, <, ZAP_LEAF_NUMCHUNKS(l));
+ ASSERT3U(chunk, <, ZAP_LEAF_NUMCHUNKS(l));
+ ASSERT(zlf->lf_type != ZAP_CHUNK_FREE);
+
+ zlf->lf_type = ZAP_CHUNK_FREE;
+ zlf->lf_next = zap_leaf_phys(l)->l_hdr.lh_freelist;
+ bzero(zlf->lf_pad, sizeof (zlf->lf_pad)); /* help it to compress */
+ zap_leaf_phys(l)->l_hdr.lh_freelist = chunk;
+
+ zap_leaf_phys(l)->l_hdr.lh_nfree++;
+}
+
+/*
+ * Routines which manipulate leaf arrays (zap_leaf_array type chunks).
+ */
+
+static uint16_t
+zap_leaf_array_create(zap_leaf_t *l, const char *buf,
+ int integer_size, int num_integers)
+{
+ uint16_t chunk_head;
+ uint16_t *chunkp = &chunk_head;
+ int byten = 0;
+ uint64_t value = 0;
+ int shift = (integer_size - 1) * 8;
+ int len = num_integers;
+
+ ASSERT3U(num_integers * integer_size, <, MAX_ARRAY_BYTES);
+
+ while (len > 0) {
+ uint16_t chunk = zap_leaf_chunk_alloc(l);
+ struct zap_leaf_array *la = &ZAP_LEAF_CHUNK(l, chunk).l_array;
+
+ la->la_type = ZAP_CHUNK_ARRAY;
+ for (int i = 0; i < ZAP_LEAF_ARRAY_BYTES; i++) {
+ if (byten == 0)
+ value = ldv(integer_size, buf);
+ la->la_array[i] = value >> shift;
+ value <<= 8;
+ if (++byten == integer_size) {
+ byten = 0;
+ buf += integer_size;
+ if (--len == 0)
+ break;
+ }
+ }
+
+ *chunkp = chunk;
+ chunkp = &la->la_next;
+ }
+ *chunkp = CHAIN_END;
+
+ return (chunk_head);
+}
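+
+/*
+ * Packing sketch for the loop above (illustrative): integers are laid
+ * out big-endian, most-significant byte first.  Storing the single
+ * 8-byte integer 0x0102030405060708 yields
+ *
+ *	la_array[0..7] = 01 02 03 04 05 06 07 08
+ *
+ * which is exactly what the 8-byte fast path in zap_leaf_array_read()
+ * reassembles.
+ */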
+
+static void
+zap_leaf_array_free(zap_leaf_t *l, uint16_t *chunkp)
+{
+ uint16_t chunk = *chunkp;
+
+ *chunkp = CHAIN_END;
+
+ while (chunk != CHAIN_END) {
+ int nextchunk = ZAP_LEAF_CHUNK(l, chunk).l_array.la_next;
+ ASSERT3U(ZAP_LEAF_CHUNK(l, chunk).l_array.la_type, ==,
+ ZAP_CHUNK_ARRAY);
+ zap_leaf_chunk_free(l, chunk);
+ chunk = nextchunk;
+ }
+}
+
+/* array_len and buf_len are in integers, not bytes */
+static void
+zap_leaf_array_read(zap_leaf_t *l, uint16_t chunk,
+ int array_int_len, int array_len, int buf_int_len, uint64_t buf_len,
+ void *buf)
+{
+ int len = MIN(array_len, buf_len);
+ int byten = 0;
+ uint64_t value = 0;
+ char *p = buf;
+
+ ASSERT3U(array_int_len, <=, buf_int_len);
+
+ /* Fast path for one 8-byte integer */
+ if (array_int_len == 8 && buf_int_len == 8 && len == 1) {
+ struct zap_leaf_array *la = &ZAP_LEAF_CHUNK(l, chunk).l_array;
+ uint8_t *ip = la->la_array;
+ uint64_t *buf64 = buf;
+
+ *buf64 = (uint64_t)ip[0] << 56 | (uint64_t)ip[1] << 48 |
+ (uint64_t)ip[2] << 40 | (uint64_t)ip[3] << 32 |
+ (uint64_t)ip[4] << 24 | (uint64_t)ip[5] << 16 |
+ (uint64_t)ip[6] << 8 | (uint64_t)ip[7];
+ return;
+ }
+
+ /* Fast path for an array of 1-byte integers (eg. the entry name) */
+ if (array_int_len == 1 && buf_int_len == 1 &&
+ buf_len > array_len + ZAP_LEAF_ARRAY_BYTES) {
+ while (chunk != CHAIN_END) {
+ struct zap_leaf_array *la =
+ &ZAP_LEAF_CHUNK(l, chunk).l_array;
+ bcopy(la->la_array, p, ZAP_LEAF_ARRAY_BYTES);
+ p += ZAP_LEAF_ARRAY_BYTES;
+ chunk = la->la_next;
+ }
+ return;
+ }
+
+ while (len > 0) {
+ struct zap_leaf_array *la = &ZAP_LEAF_CHUNK(l, chunk).l_array;
+
+ ASSERT3U(chunk, <, ZAP_LEAF_NUMCHUNKS(l));
+ for (int i = 0; i < ZAP_LEAF_ARRAY_BYTES && len > 0; i++) {
+ value = (value << 8) | la->la_array[i];
+ byten++;
+ if (byten == array_int_len) {
+ stv(buf_int_len, p, value);
+ byten = 0;
+ len--;
+ if (len == 0)
+ return;
+ p += buf_int_len;
+ }
+ }
+ chunk = la->la_next;
+ }
+}
+
+static boolean_t
+zap_leaf_array_match(zap_leaf_t *l, zap_name_t *zn,
+ int chunk, int array_numints)
+{
+ int bseen = 0;
+
+ if (zap_getflags(zn->zn_zap) & ZAP_FLAG_UINT64_KEY) {
+ uint64_t *thiskey =
+ kmem_alloc(array_numints * sizeof (*thiskey), KM_SLEEP);
+ ASSERT(zn->zn_key_intlen == sizeof (*thiskey));
+
+ zap_leaf_array_read(l, chunk, sizeof (*thiskey), array_numints,
+ sizeof (*thiskey), array_numints, thiskey);
+ boolean_t match = bcmp(thiskey, zn->zn_key_orig,
+ array_numints * sizeof (*thiskey)) == 0;
+ kmem_free(thiskey, array_numints * sizeof (*thiskey));
+ return (match);
+ }
+
+ ASSERT(zn->zn_key_intlen == 1);
+ if (zn->zn_matchtype & MT_NORMALIZE) {
+ char *thisname = kmem_alloc(array_numints, KM_SLEEP);
+
+ zap_leaf_array_read(l, chunk, sizeof (char), array_numints,
+ sizeof (char), array_numints, thisname);
+ boolean_t match = zap_match(zn, thisname);
+ kmem_free(thisname, array_numints);
+ return (match);
+ }
+
+ /*
+ * Fast path for exact matching.
+ * First check that the lengths match, so that we don't read
+ * past the end of the zn_key_orig array.
+ */
+ if (array_numints != zn->zn_key_orig_numints)
+ return (B_FALSE);
+ while (bseen < array_numints) {
+ struct zap_leaf_array *la = &ZAP_LEAF_CHUNK(l, chunk).l_array;
+ int toread = MIN(array_numints - bseen, ZAP_LEAF_ARRAY_BYTES);
+ ASSERT3U(chunk, <, ZAP_LEAF_NUMCHUNKS(l));
+ if (bcmp(la->la_array, (char *)zn->zn_key_orig + bseen, toread))
+ break;
+ chunk = la->la_next;
+ bseen += toread;
+ }
+ return (bseen == array_numints);
+}
+
+/*
+ * Routines which manipulate leaf entries.
+ */
+
+int
+zap_leaf_lookup(zap_leaf_t *l, zap_name_t *zn, zap_entry_handle_t *zeh)
+{
+ struct zap_leaf_entry *le;
+
+ ASSERT3U(zap_leaf_phys(l)->l_hdr.lh_magic, ==, ZAP_LEAF_MAGIC);
+
+ for (uint16_t *chunkp = LEAF_HASH_ENTPTR(l, zn->zn_hash);
+ *chunkp != CHAIN_END; chunkp = &le->le_next) {
+ uint16_t chunk = *chunkp;
+ le = ZAP_LEAF_ENTRY(l, chunk);
+
+ ASSERT3U(chunk, <, ZAP_LEAF_NUMCHUNKS(l));
+ ASSERT3U(le->le_type, ==, ZAP_CHUNK_ENTRY);
+
+ if (le->le_hash != zn->zn_hash)
+ continue;
+
+ /*
+ * NB: the entry chain is always sorted by cd on
+ * normalized zap objects, so this will find the
+ * lowest-cd match for MT_NORMALIZE.
+ */
+ ASSERT((zn->zn_matchtype == 0) ||
+ (zap_leaf_phys(l)->l_hdr.lh_flags & ZLF_ENTRIES_CDSORTED));
+ if (zap_leaf_array_match(l, zn, le->le_name_chunk,
+ le->le_name_numints)) {
+ zeh->zeh_num_integers = le->le_value_numints;
+ zeh->zeh_integer_size = le->le_value_intlen;
+ zeh->zeh_cd = le->le_cd;
+ zeh->zeh_hash = le->le_hash;
+ zeh->zeh_chunkp = chunkp;
+ zeh->zeh_leaf = l;
+ return (0);
+ }
+ }
+
+ return (SET_ERROR(ENOENT));
+}
+
+/* Return (h1,cd1 >= h2,cd2) */
+#define HCD_GTEQ(h1, cd1, h2, cd2) \
+ ((h1 > h2) ? TRUE : ((h1 == h2 && cd1 >= cd2) ? TRUE : FALSE))
+
+int
+zap_leaf_lookup_closest(zap_leaf_t *l,
+ uint64_t h, uint32_t cd, zap_entry_handle_t *zeh)
+{
+ uint64_t besth = -1ULL;
+ uint32_t bestcd = -1U;
+ uint16_t bestlh = ZAP_LEAF_HASH_NUMENTRIES(l)-1;
+ struct zap_leaf_entry *le;
+
+ ASSERT3U(zap_leaf_phys(l)->l_hdr.lh_magic, ==, ZAP_LEAF_MAGIC);
+
+ for (uint16_t lh = LEAF_HASH(l, h); lh <= bestlh; lh++) {
+ for (uint16_t chunk = zap_leaf_phys(l)->l_hash[lh];
+ chunk != CHAIN_END; chunk = le->le_next) {
+ le = ZAP_LEAF_ENTRY(l, chunk);
+
+ ASSERT3U(chunk, <, ZAP_LEAF_NUMCHUNKS(l));
+ ASSERT3U(le->le_type, ==, ZAP_CHUNK_ENTRY);
+
+ if (HCD_GTEQ(le->le_hash, le->le_cd, h, cd) &&
+ HCD_GTEQ(besth, bestcd, le->le_hash, le->le_cd)) {
+ ASSERT3U(bestlh, >=, lh);
+ bestlh = lh;
+ besth = le->le_hash;
+ bestcd = le->le_cd;
+
+ zeh->zeh_num_integers = le->le_value_numints;
+ zeh->zeh_integer_size = le->le_value_intlen;
+ zeh->zeh_cd = le->le_cd;
+ zeh->zeh_hash = le->le_hash;
+ zeh->zeh_fakechunk = chunk;
+ zeh->zeh_chunkp = &zeh->zeh_fakechunk;
+ zeh->zeh_leaf = l;
+ }
+ }
+ }
+
+ return (bestcd == -1U ? ENOENT : 0);
+}
+
+int
+zap_entry_read(const zap_entry_handle_t *zeh,
+ uint8_t integer_size, uint64_t num_integers, void *buf)
+{
+ struct zap_leaf_entry *le =
+ ZAP_LEAF_ENTRY(zeh->zeh_leaf, *zeh->zeh_chunkp);
+ ASSERT3U(le->le_type, ==, ZAP_CHUNK_ENTRY);
+
+ if (le->le_value_intlen > integer_size)
+ return (SET_ERROR(EINVAL));
+
+ zap_leaf_array_read(zeh->zeh_leaf, le->le_value_chunk,
+ le->le_value_intlen, le->le_value_numints,
+ integer_size, num_integers, buf);
+
+ if (zeh->zeh_num_integers > num_integers)
+ return (SET_ERROR(EOVERFLOW));
+ return (0);
+}
+
+int
+zap_entry_read_name(zap_t *zap, const zap_entry_handle_t *zeh, uint16_t buflen,
+ char *buf)
+{
+ struct zap_leaf_entry *le =
+ ZAP_LEAF_ENTRY(zeh->zeh_leaf, *zeh->zeh_chunkp);
+ ASSERT3U(le->le_type, ==, ZAP_CHUNK_ENTRY);
+
+ if (zap_getflags(zap) & ZAP_FLAG_UINT64_KEY) {
+ zap_leaf_array_read(zeh->zeh_leaf, le->le_name_chunk, 8,
+ le->le_name_numints, 8, buflen / 8, buf);
+ } else {
+ zap_leaf_array_read(zeh->zeh_leaf, le->le_name_chunk, 1,
+ le->le_name_numints, 1, buflen, buf);
+ }
+ if (le->le_name_numints > buflen)
+ return (SET_ERROR(EOVERFLOW));
+ return (0);
+}
+
+int
+zap_entry_update(zap_entry_handle_t *zeh,
+ uint8_t integer_size, uint64_t num_integers, const void *buf)
+{
+ zap_leaf_t *l = zeh->zeh_leaf;
+ struct zap_leaf_entry *le = ZAP_LEAF_ENTRY(l, *zeh->zeh_chunkp);
+
+ int delta_chunks = ZAP_LEAF_ARRAY_NCHUNKS(num_integers * integer_size) -
+ ZAP_LEAF_ARRAY_NCHUNKS(le->le_value_numints * le->le_value_intlen);
+
+ if ((int)zap_leaf_phys(l)->l_hdr.lh_nfree < delta_chunks)
+ return (SET_ERROR(EAGAIN));
+
+ zap_leaf_array_free(l, &le->le_value_chunk);
+ le->le_value_chunk =
+ zap_leaf_array_create(l, buf, integer_size, num_integers);
+ le->le_value_numints = num_integers;
+ le->le_value_intlen = integer_size;
+ return (0);
+}
+
+void
+zap_entry_remove(zap_entry_handle_t *zeh)
+{
+ zap_leaf_t *l = zeh->zeh_leaf;
+
+ ASSERT3P(zeh->zeh_chunkp, !=, &zeh->zeh_fakechunk);
+
+ uint16_t entry_chunk = *zeh->zeh_chunkp;
+ struct zap_leaf_entry *le = ZAP_LEAF_ENTRY(l, entry_chunk);
+ ASSERT3U(le->le_type, ==, ZAP_CHUNK_ENTRY);
+
+ zap_leaf_array_free(l, &le->le_name_chunk);
+ zap_leaf_array_free(l, &le->le_value_chunk);
+
+ *zeh->zeh_chunkp = le->le_next;
+ zap_leaf_chunk_free(l, entry_chunk);
+
+ zap_leaf_phys(l)->l_hdr.lh_nentries--;
+}
+
+int
+zap_entry_create(zap_leaf_t *l, zap_name_t *zn, uint32_t cd,
+ uint8_t integer_size, uint64_t num_integers, const void *buf,
+ zap_entry_handle_t *zeh)
+{
+ uint16_t chunk;
+ struct zap_leaf_entry *le;
+ uint64_t h = zn->zn_hash;
+
+ uint64_t valuelen = integer_size * num_integers;
+
+ int numchunks = 1 + ZAP_LEAF_ARRAY_NCHUNKS(zn->zn_key_orig_numints *
+ zn->zn_key_intlen) + ZAP_LEAF_ARRAY_NCHUNKS(valuelen);
+ if (numchunks > ZAP_LEAF_NUMCHUNKS(l))
+ return (SET_ERROR(E2BIG));
+
+ if (cd == ZAP_NEED_CD) {
+ /* find the lowest unused cd */
+ if (zap_leaf_phys(l)->l_hdr.lh_flags & ZLF_ENTRIES_CDSORTED) {
+ cd = 0;
+
+ for (chunk = *LEAF_HASH_ENTPTR(l, h);
+ chunk != CHAIN_END; chunk = le->le_next) {
+ le = ZAP_LEAF_ENTRY(l, chunk);
+ if (le->le_cd > cd)
+ break;
+ if (le->le_hash == h) {
+ ASSERT3U(cd, ==, le->le_cd);
+ cd++;
+ }
+ }
+ } else {
+ /* old unsorted format; do it the O(n^2) way */
+ for (cd = 0; ; cd++) {
+ for (chunk = *LEAF_HASH_ENTPTR(l, h);
+ chunk != CHAIN_END; chunk = le->le_next) {
+ le = ZAP_LEAF_ENTRY(l, chunk);
+ if (le->le_hash == h &&
+ le->le_cd == cd) {
+ break;
+ }
+ }
+ /* If this cd is not in use, we are good. */
+ if (chunk == CHAIN_END)
+ break;
+ }
+ }
+ /*
+ * We would run out of space in a block before we could
+ * store enough entries to run out of CD values.
+ */
+ ASSERT3U(cd, <, zap_maxcd(zn->zn_zap));
+ }
+
+ if (zap_leaf_phys(l)->l_hdr.lh_nfree < numchunks)
+ return (SET_ERROR(EAGAIN));
+
+ /* make the entry */
+ chunk = zap_leaf_chunk_alloc(l);
+ le = ZAP_LEAF_ENTRY(l, chunk);
+ le->le_type = ZAP_CHUNK_ENTRY;
+ le->le_name_chunk = zap_leaf_array_create(l, zn->zn_key_orig,
+ zn->zn_key_intlen, zn->zn_key_orig_numints);
+ le->le_name_numints = zn->zn_key_orig_numints;
+ le->le_value_chunk =
+ zap_leaf_array_create(l, buf, integer_size, num_integers);
+ le->le_value_numints = num_integers;
+ le->le_value_intlen = integer_size;
+ le->le_hash = h;
+ le->le_cd = cd;
+
+ /* link it into the hash chain */
+ /* XXX if we did the search above, we could just use that */
+ uint16_t *chunkp = zap_leaf_rehash_entry(l, chunk);
+
+ zap_leaf_phys(l)->l_hdr.lh_nentries++;
+
+ zeh->zeh_leaf = l;
+ zeh->zeh_num_integers = num_integers;
+ zeh->zeh_integer_size = le->le_value_intlen;
+ zeh->zeh_cd = le->le_cd;
+ zeh->zeh_hash = le->le_hash;
+ zeh->zeh_chunkp = chunkp;
+
+ return (0);
+}
+
+/*
+ * Determine if there is another entry with the same normalized form.
+ * For performance purposes, either zn or name must be provided (the
+ * other can be NULL). Note, there usually won't be any hash
+ * conflicts, in which case we don't need the concatenated/normalized
+ * form of the name. But all callers have one of these on hand anyway,
+ * so might as well take advantage. A cleaner but slower interface
+ * would accept neither argument, and compute the normalized name as
+ * needed (using zap_name_alloc(zap_entry_read_name(zeh))).
+ */
+boolean_t
+zap_entry_normalization_conflict(zap_entry_handle_t *zeh, zap_name_t *zn,
+ const char *name, zap_t *zap)
+{
+ struct zap_leaf_entry *le;
+ boolean_t allocdzn = B_FALSE;
+
+ if (zap->zap_normflags == 0)
+ return (B_FALSE);
+
+ for (uint16_t chunk = *LEAF_HASH_ENTPTR(zeh->zeh_leaf, zeh->zeh_hash);
+ chunk != CHAIN_END; chunk = le->le_next) {
+ le = ZAP_LEAF_ENTRY(zeh->zeh_leaf, chunk);
+ if (le->le_hash != zeh->zeh_hash)
+ continue;
+ if (le->le_cd == zeh->zeh_cd)
+ continue;
+
+ if (zn == NULL) {
+ zn = zap_name_alloc(zap, name, MT_NORMALIZE);
+ allocdzn = B_TRUE;
+ }
+ if (zap_leaf_array_match(zeh->zeh_leaf, zn,
+ le->le_name_chunk, le->le_name_numints)) {
+ if (allocdzn)
+ zap_name_free(zn);
+ return (B_TRUE);
+ }
+ }
+ if (allocdzn)
+ zap_name_free(zn);
+ return (B_FALSE);
+}
+
+/*
+ * Routines for transferring entries between leafs.
+ */
+
+static uint16_t *
+zap_leaf_rehash_entry(zap_leaf_t *l, uint16_t entry)
+{
+ struct zap_leaf_entry *le = ZAP_LEAF_ENTRY(l, entry);
+ struct zap_leaf_entry *le2;
+ uint16_t *chunkp;
+
+ /*
+ * keep the entry chain sorted by cd
+ * NB: this will not cause problems for unsorted leafs, though
+ * it is unnecessary there.
+ */
+ for (chunkp = LEAF_HASH_ENTPTR(l, le->le_hash);
+ *chunkp != CHAIN_END; chunkp = &le2->le_next) {
+ le2 = ZAP_LEAF_ENTRY(l, *chunkp);
+ if (le2->le_cd > le->le_cd)
+ break;
+ }
+
+ le->le_next = *chunkp;
+ *chunkp = entry;
+ return (chunkp);
+}
+
+static uint16_t
+zap_leaf_transfer_array(zap_leaf_t *l, uint16_t chunk, zap_leaf_t *nl)
+{
+ uint16_t new_chunk;
+ uint16_t *nchunkp = &new_chunk;
+
+ while (chunk != CHAIN_END) {
+ uint16_t nchunk = zap_leaf_chunk_alloc(nl);
+ struct zap_leaf_array *nla =
+ &ZAP_LEAF_CHUNK(nl, nchunk).l_array;
+ struct zap_leaf_array *la =
+ &ZAP_LEAF_CHUNK(l, chunk).l_array;
+ int nextchunk = la->la_next;
+
+ ASSERT3U(chunk, <, ZAP_LEAF_NUMCHUNKS(l));
+ ASSERT3U(nchunk, <, ZAP_LEAF_NUMCHUNKS(l));
+
+ *nla = *la; /* structure assignment */
+
+ zap_leaf_chunk_free(l, chunk);
+ chunk = nextchunk;
+ *nchunkp = nchunk;
+ nchunkp = &nla->la_next;
+ }
+ *nchunkp = CHAIN_END;
+ return (new_chunk);
+}
+
+static void
+zap_leaf_transfer_entry(zap_leaf_t *l, int entry, zap_leaf_t *nl)
+{
+ struct zap_leaf_entry *le = ZAP_LEAF_ENTRY(l, entry);
+ ASSERT3U(le->le_type, ==, ZAP_CHUNK_ENTRY);
+
+ uint16_t chunk = zap_leaf_chunk_alloc(nl);
+ struct zap_leaf_entry *nle = ZAP_LEAF_ENTRY(nl, chunk);
+ *nle = *le; /* structure assignment */
+
+ (void) zap_leaf_rehash_entry(nl, chunk);
+
+ nle->le_name_chunk = zap_leaf_transfer_array(l, le->le_name_chunk, nl);
+ nle->le_value_chunk =
+ zap_leaf_transfer_array(l, le->le_value_chunk, nl);
+
+ zap_leaf_chunk_free(l, entry);
+
+ zap_leaf_phys(l)->l_hdr.lh_nentries--;
+ zap_leaf_phys(nl)->l_hdr.lh_nentries++;
+}
+
+/*
+ * Transfer the entries whose hash prefix ends in 1 to the new leaf.
+ */
+void
+zap_leaf_split(zap_leaf_t *l, zap_leaf_t *nl, boolean_t sort)
+{
+ int bit = 64 - 1 - zap_leaf_phys(l)->l_hdr.lh_prefix_len;
+
+ /* set new prefix and prefix_len */
+ zap_leaf_phys(l)->l_hdr.lh_prefix <<= 1;
+ zap_leaf_phys(l)->l_hdr.lh_prefix_len++;
+ zap_leaf_phys(nl)->l_hdr.lh_prefix =
+ zap_leaf_phys(l)->l_hdr.lh_prefix | 1;
+ zap_leaf_phys(nl)->l_hdr.lh_prefix_len =
+ zap_leaf_phys(l)->l_hdr.lh_prefix_len;
+
+ /* break existing hash chains */
+ zap_memset(zap_leaf_phys(l)->l_hash, CHAIN_END,
+ 2*ZAP_LEAF_HASH_NUMENTRIES(l));
+
+ if (sort)
+ zap_leaf_phys(l)->l_hdr.lh_flags |= ZLF_ENTRIES_CDSORTED;
+
+ /*
+ * Transfer entries whose hash bit 'bit' is set to nl; rehash
+ * the remaining entries
+ *
+ * NB: We could find entries via the hashtable instead. That
+ * would be O(hashents+numents) rather than O(numblks+numents),
+ * but this accesses memory more sequentially, and when we're
+ * called, the block is usually pretty full.
+ */
+ for (int i = 0; i < ZAP_LEAF_NUMCHUNKS(l); i++) {
+ struct zap_leaf_entry *le = ZAP_LEAF_ENTRY(l, i);
+ if (le->le_type != ZAP_CHUNK_ENTRY)
+ continue;
+
+ if (le->le_hash & (1ULL << bit))
+ zap_leaf_transfer_entry(l, i, nl);
+ else
+ (void) zap_leaf_rehash_entry(l, i);
+ }
+}
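+
+/*
+ * For example (illustrative values): a leaf with prefix 101
+ * (lh_prefix_len == 3) splits on hash bit 60 (64 - 1 - 3).  Entries
+ * with that bit clear stay in l (new prefix 1010); entries with it
+ * set move to nl (prefix 1011).
+ */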
+
+void
+zap_leaf_stats(zap_t *zap, zap_leaf_t *l, zap_stats_t *zs)
+{
+ int n = zap_f_phys(zap)->zap_ptrtbl.zt_shift -
+ zap_leaf_phys(l)->l_hdr.lh_prefix_len;
+ n = MIN(n, ZAP_HISTOGRAM_SIZE-1);
+ zs->zs_leafs_with_2n_pointers[n]++;
+
+ n = zap_leaf_phys(l)->l_hdr.lh_nentries/5;
+ n = MIN(n, ZAP_HISTOGRAM_SIZE-1);
+ zs->zs_blocks_with_n5_entries[n]++;
+
+ n = ((1<<FZAP_BLOCK_SHIFT(zap)) -
+ zap_leaf_phys(l)->l_hdr.lh_nfree * (ZAP_LEAF_ARRAY_BYTES+1))*10 /
+ (1<<FZAP_BLOCK_SHIFT(zap));
+ n = MIN(n, ZAP_HISTOGRAM_SIZE-1);
+ zs->zs_blocks_n_tenths_full[n]++;
+
+ for (int i = 0; i < ZAP_LEAF_HASH_NUMENTRIES(l); i++) {
+ int nentries = 0;
+ int chunk = zap_leaf_phys(l)->l_hash[i];
+
+ while (chunk != CHAIN_END) {
+ struct zap_leaf_entry *le =
+ ZAP_LEAF_ENTRY(l, chunk);
+
+ n = 1 + ZAP_LEAF_ARRAY_NCHUNKS(le->le_name_numints) +
+ ZAP_LEAF_ARRAY_NCHUNKS(le->le_value_numints *
+ le->le_value_intlen);
+ n = MIN(n, ZAP_HISTOGRAM_SIZE-1);
+ zs->zs_entries_using_n_chunks[n]++;
+
+ chunk = le->le_next;
+ nentries++;
+ }
+
+ n = nentries;
+ n = MIN(n, ZAP_HISTOGRAM_SIZE-1);
+ zs->zs_buckets_with_n_entries[n]++;
+ }
+}
diff --git a/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/zap_micro.c b/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/zap_micro.c
new file mode 100644
index 000000000000..133989eca324
--- /dev/null
+++ b/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/zap_micro.c
@@ -0,0 +1,1609 @@
+/*
+ * CDDL HEADER START
+ *
+ * The contents of this file are subject to the terms of the
+ * Common Development and Distribution License (the "License").
+ * You may not use this file except in compliance with the License.
+ *
+ * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
+ * or http://www.opensolaris.org/os/licensing.
+ * See the License for the specific language governing permissions
+ * and limitations under the License.
+ *
+ * When distributing Covered Code, include this CDDL HEADER in each
+ * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
+ * If applicable, add the following below this CDDL HEADER, with the
+ * fields enclosed by brackets "[]" replaced with your own identifying
+ * information: Portions Copyright [yyyy] [name of copyright owner]
+ *
+ * CDDL HEADER END
+ */
+
+/*
+ * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved.
+ * Copyright (c) 2011, 2018 by Delphix. All rights reserved.
+ * Copyright (c) 2014 Spectra Logic Corporation, All rights reserved.
+ * Copyright (c) 2014 Integros [integros.com]
+ * Copyright 2017 Nexenta Systems, Inc.
+ */
+
+#include <sys/zio.h>
+#include <sys/spa.h>
+#include <sys/dmu.h>
+#include <sys/zfs_context.h>
+#include <sys/zap.h>
+#include <sys/refcount.h>
+#include <sys/zap_impl.h>
+#include <sys/zap_leaf.h>
+#include <sys/avl.h>
+#include <sys/arc.h>
+#include <sys/dmu_objset.h>
+
+#ifdef _KERNEL
+#include <sys/sunddi.h>
+#endif
+
+extern inline mzap_phys_t *zap_m_phys(zap_t *zap);
+
+static int mzap_upgrade(zap_t **zapp,
+ void *tag, dmu_tx_t *tx, zap_flags_t flags);
+
+uint64_t
+zap_getflags(zap_t *zap)
+{
+ if (zap->zap_ismicro)
+ return (0);
+ return (zap_f_phys(zap)->zap_flags);
+}
+
+int
+zap_hashbits(zap_t *zap)
+{
+ if (zap_getflags(zap) & ZAP_FLAG_HASH64)
+ return (48);
+ else
+ return (28);
+}
+
+uint32_t
+zap_maxcd(zap_t *zap)
+{
+ if (zap_getflags(zap) & ZAP_FLAG_HASH64)
+ return ((1<<16)-1);
+ else
+ return (-1U);
+}
+
+static uint64_t
+zap_hash(zap_name_t *zn)
+{
+ zap_t *zap = zn->zn_zap;
+ uint64_t h = 0;
+
+ if (zap_getflags(zap) & ZAP_FLAG_PRE_HASHED_KEY) {
+ ASSERT(zap_getflags(zap) & ZAP_FLAG_UINT64_KEY);
+ h = *(uint64_t *)zn->zn_key_orig;
+ } else {
+ h = zap->zap_salt;
+ ASSERT(h != 0);
+ ASSERT(zfs_crc64_table[128] == ZFS_CRC64_POLY);
+
+ if (zap_getflags(zap) & ZAP_FLAG_UINT64_KEY) {
+ const uint64_t *wp = zn->zn_key_norm;
+
+ ASSERT(zn->zn_key_intlen == 8);
+ for (int i = 0; i < zn->zn_key_norm_numints;
+ wp++, i++) {
+ uint64_t word = *wp;
+
+ for (int j = 0; j < zn->zn_key_intlen; j++) {
+ h = (h >> 8) ^
+ zfs_crc64_table[(h ^ word) & 0xFF];
+ word >>= NBBY;
+ }
+ }
+ } else {
+ const uint8_t *cp = zn->zn_key_norm;
+
+ /*
+ * We previously stored the terminating null on
+ * disk, but didn't hash it, so we need to
+ * continue to not hash it. (The
+ * zn_key_*_numints includes the terminating
+ * null for non-binary keys.)
+ */
+ int len = zn->zn_key_norm_numints - 1;
+
+ ASSERT(zn->zn_key_intlen == 1);
+ for (int i = 0; i < len; cp++, i++) {
+ h = (h >> 8) ^
+ zfs_crc64_table[(h ^ *cp) & 0xFF];
+ }
+ }
+ }
+ /*
+ * Don't use all 64 bits, since we need some in the cookie for
+ * the collision differentiator. We MUST use the high bits,
+ * since those are the ones that we first pay attention to when
+ * choosing the bucket.
+ */
+ h &= ~((1ULL << (64 - zap_hashbits(zap))) - 1);
+
+ return (h);
+}
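+
+/*
+ * For example, with the default 28-bit hash (no ZAP_FLAG_HASH64), the mask
+ * above keeps the top 28 bits of the CRC and zeroes the low 36, so the
+ * result looks like [ 28-bit hash | 36 zero bits ]. The cleared low bits
+ * leave room for the collision differentiator (cd) when the (hash, cd)
+ * pair is later packed into a 64-bit cookie by zap_cursor_serialize().
+ */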
+
+static int
+zap_normalize(zap_t *zap, const char *name, char *namenorm, int normflags)
+{
+ ASSERT(!(zap_getflags(zap) & ZAP_FLAG_UINT64_KEY));
+
+ size_t inlen = strlen(name) + 1;
+ size_t outlen = ZAP_MAXNAMELEN;
+
+ int err = 0;
+ (void) u8_textprep_str((char *)name, &inlen, namenorm, &outlen,
+ normflags | U8_TEXTPREP_IGNORE_NULL | U8_TEXTPREP_IGNORE_INVALID,
+ U8_UNICODE_LATEST, &err);
+
+ return (err);
+}
+
+boolean_t
+zap_match(zap_name_t *zn, const char *matchname)
+{
+ ASSERT(!(zap_getflags(zn->zn_zap) & ZAP_FLAG_UINT64_KEY));
+
+ if (zn->zn_matchtype & MT_NORMALIZE) {
+ char norm[ZAP_MAXNAMELEN];
+
+ if (zap_normalize(zn->zn_zap, matchname, norm,
+ zn->zn_normflags) != 0)
+ return (B_FALSE);
+
+ return (strcmp(zn->zn_key_norm, norm) == 0);
+ } else {
+ return (strcmp(zn->zn_key_orig, matchname) == 0);
+ }
+}
+
+void
+zap_name_free(zap_name_t *zn)
+{
+ kmem_free(zn, sizeof (zap_name_t));
+}
+
+zap_name_t *
+zap_name_alloc(zap_t *zap, const char *key, matchtype_t mt)
+{
+ zap_name_t *zn = kmem_alloc(sizeof (zap_name_t), KM_SLEEP);
+
+ zn->zn_zap = zap;
+ zn->zn_key_intlen = sizeof (*key);
+ zn->zn_key_orig = key;
+ zn->zn_key_orig_numints = strlen(zn->zn_key_orig) + 1;
+ zn->zn_matchtype = mt;
+ zn->zn_normflags = zap->zap_normflags;
+
+ /*
+ * If we're dealing with a case sensitive lookup on a mixed or
+ * insensitive fs, remove U8_TEXTPREP_TOUPPER or the lookup
+ * will fold case to all caps overriding the lookup request.
+ */
+ if (mt & MT_MATCH_CASE)
+ zn->zn_normflags &= ~U8_TEXTPREP_TOUPPER;
+
+ if (zap->zap_normflags) {
+ /*
+ * We *must* use zap_normflags because this normalization is
+ * what the hash is computed from.
+ */
+ if (zap_normalize(zap, key, zn->zn_normbuf,
+ zap->zap_normflags) != 0) {
+ zap_name_free(zn);
+ return (NULL);
+ }
+ zn->zn_key_norm = zn->zn_normbuf;
+ zn->zn_key_norm_numints = strlen(zn->zn_key_norm) + 1;
+ } else {
+ if (mt != 0) {
+ zap_name_free(zn);
+ return (NULL);
+ }
+ zn->zn_key_norm = zn->zn_key_orig;
+ zn->zn_key_norm_numints = zn->zn_key_orig_numints;
+ }
+
+ zn->zn_hash = zap_hash(zn);
+
+ if (zap->zap_normflags != zn->zn_normflags) {
+ /*
+ * We *must* use zn_normflags because this normalization is
+ * what the matching is based on. (Not the hash!)
+ */
+ if (zap_normalize(zap, key, zn->zn_normbuf,
+ zn->zn_normflags) != 0) {
+ zap_name_free(zn);
+ return (NULL);
+ }
+ zn->zn_key_norm_numints = strlen(zn->zn_key_norm) + 1;
+ }
+
+ return (zn);
+}
+
+zap_name_t *
+zap_name_alloc_uint64(zap_t *zap, const uint64_t *key, int numints)
+{
+ zap_name_t *zn = kmem_alloc(sizeof (zap_name_t), KM_SLEEP);
+
+ ASSERT(zap->zap_normflags == 0);
+ zn->zn_zap = zap;
+ zn->zn_key_intlen = sizeof (*key);
+ zn->zn_key_orig = zn->zn_key_norm = key;
+ zn->zn_key_orig_numints = zn->zn_key_norm_numints = numints;
+ zn->zn_matchtype = 0;
+
+ zn->zn_hash = zap_hash(zn);
+ return (zn);
+}
+
+static void
+mzap_byteswap(mzap_phys_t *buf, size_t size)
+{
+ buf->mz_block_type = BSWAP_64(buf->mz_block_type);
+ buf->mz_salt = BSWAP_64(buf->mz_salt);
+ buf->mz_normflags = BSWAP_64(buf->mz_normflags);
+ int max = (size / MZAP_ENT_LEN) - 1;
+ for (int i = 0; i < max; i++) {
+ buf->mz_chunk[i].mze_value =
+ BSWAP_64(buf->mz_chunk[i].mze_value);
+ buf->mz_chunk[i].mze_cd =
+ BSWAP_32(buf->mz_chunk[i].mze_cd);
+ }
+}
+
+void
+zap_byteswap(void *buf, size_t size)
+{
+ uint64_t block_type = *(uint64_t *)buf;
+
+ if (block_type == ZBT_MICRO || block_type == BSWAP_64(ZBT_MICRO)) {
+ /* ASSERT(magic == ZAP_LEAF_MAGIC); */
+ mzap_byteswap(buf, size);
+ } else {
+ fzap_byteswap(buf, size);
+ }
+}
+
+static int
+mze_compare(const void *arg1, const void *arg2)
+{
+ const mzap_ent_t *mze1 = arg1;
+ const mzap_ent_t *mze2 = arg2;
+
+ int cmp = AVL_CMP(mze1->mze_hash, mze2->mze_hash);
+ if (likely(cmp))
+ return (cmp);
+
+ return (AVL_CMP(mze1->mze_cd, mze2->mze_cd));
+}
+
+static int
+mze_insert(zap_t *zap, int chunkid, uint64_t hash)
+{
+ avl_index_t idx;
+
+ ASSERT(zap->zap_ismicro);
+ ASSERT(RW_WRITE_HELD(&zap->zap_rwlock));
+
+ mzap_ent_t *mze = kmem_alloc(sizeof (mzap_ent_t), KM_SLEEP);
+ mze->mze_chunkid = chunkid;
+ mze->mze_hash = hash;
+ mze->mze_cd = MZE_PHYS(zap, mze)->mze_cd;
+ ASSERT(MZE_PHYS(zap, mze)->mze_name[0] != 0);
+ if (avl_find(&zap->zap_m.zap_avl, mze, &idx) != NULL) {
+ kmem_free(mze, sizeof (mzap_ent_t));
+ return (EEXIST);
+ }
+ avl_insert(&zap->zap_m.zap_avl, mze, idx);
+ return (0);
+}
+
+static mzap_ent_t *
+mze_find(zap_name_t *zn)
+{
+ mzap_ent_t mze_tofind;
+ mzap_ent_t *mze;
+ avl_index_t idx;
+ avl_tree_t *avl = &zn->zn_zap->zap_m.zap_avl;
+
+ ASSERT(zn->zn_zap->zap_ismicro);
+ ASSERT(RW_LOCK_HELD(&zn->zn_zap->zap_rwlock));
+
+ mze_tofind.mze_hash = zn->zn_hash;
+ mze_tofind.mze_cd = 0;
+
+ mze = avl_find(avl, &mze_tofind, &idx);
+ if (mze == NULL)
+ mze = avl_nearest(avl, idx, AVL_AFTER);
+ for (; mze && mze->mze_hash == zn->zn_hash; mze = AVL_NEXT(avl, mze)) {
+ ASSERT3U(mze->mze_cd, ==, MZE_PHYS(zn->zn_zap, mze)->mze_cd);
+ if (zap_match(zn, MZE_PHYS(zn->zn_zap, mze)->mze_name))
+ return (mze);
+ }
+
+ return (NULL);
+}
+
+static uint32_t
+mze_find_unused_cd(zap_t *zap, uint64_t hash)
+{
+ mzap_ent_t mze_tofind;
+ avl_index_t idx;
+ avl_tree_t *avl = &zap->zap_m.zap_avl;
+
+ ASSERT(zap->zap_ismicro);
+ ASSERT(RW_LOCK_HELD(&zap->zap_rwlock));
+
+ mze_tofind.mze_hash = hash;
+ mze_tofind.mze_cd = 0;
+
+ uint32_t cd = 0;
+ for (mzap_ent_t *mze = avl_find(avl, &mze_tofind, &idx);
+ mze && mze->mze_hash == hash; mze = AVL_NEXT(avl, mze)) {
+ if (mze->mze_cd != cd)
+ break;
+ cd++;
+ }
+
+ return (cd);
+}
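+
+/*
+ * For example, if the entries sharing 'hash' currently hold cds {0, 1, 3},
+ * the scan above advances cd past 0 and 1, then stops at the gap when it
+ * sees cd 3 != 2, returning 2 as the first unused collision differentiator.
+ */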
+
+static void
+mze_remove(zap_t *zap, mzap_ent_t *mze)
+{
+ ASSERT(zap->zap_ismicro);
+ ASSERT(RW_WRITE_HELD(&zap->zap_rwlock));
+
+ avl_remove(&zap->zap_m.zap_avl, mze);
+ kmem_free(mze, sizeof (mzap_ent_t));
+}
+
+static void
+mze_destroy(zap_t *zap)
+{
+ mzap_ent_t *mze;
+ void *avlcookie = NULL;
+
+ while ((mze = avl_destroy_nodes(&zap->zap_m.zap_avl, &avlcookie)) != NULL)
+ kmem_free(mze, sizeof (mzap_ent_t));
+ avl_destroy(&zap->zap_m.zap_avl);
+}
+
+static zap_t *
+mzap_open(objset_t *os, uint64_t obj, dmu_buf_t *db)
+{
+ zap_t *winner;
+ uint64_t *zap_hdr = (uint64_t *)db->db_data;
+ uint64_t zap_block_type = zap_hdr[0];
+ uint64_t zap_magic = zap_hdr[1];
+
+ ASSERT3U(MZAP_ENT_LEN, ==, sizeof (mzap_ent_phys_t));
+
+ zap_t *zap = kmem_zalloc(sizeof (zap_t), KM_SLEEP);
+ rw_init(&zap->zap_rwlock, 0, 0, 0);
+ rw_enter(&zap->zap_rwlock, RW_WRITER);
+ zap->zap_objset = os;
+ zap->zap_object = obj;
+ zap->zap_dbuf = db;
+
+ if (zap_block_type != ZBT_MICRO) {
+ mutex_init(&zap->zap_f.zap_num_entries_mtx, 0, 0, 0);
+ zap->zap_f.zap_block_shift = highbit64(db->db_size) - 1;
+ if (zap_block_type != ZBT_HEADER || zap_magic != ZAP_MAGIC) {
+ winner = NULL; /* No actual winner here... */
+ goto handle_winner;
+ }
+ } else {
+ zap->zap_ismicro = TRUE;
+ }
+
+ /*
+ * Make sure that zap_ismicro is set before we let others see
+ * it, because zap_lockdir() checks zap_ismicro without the lock
+ * held.
+ */
+ dmu_buf_init_user(&zap->zap_dbu, zap_evict_sync, NULL, &zap->zap_dbuf);
+ winner = dmu_buf_set_user(db, &zap->zap_dbu);
+
+ if (winner != NULL)
+ goto handle_winner;
+
+ if (zap->zap_ismicro) {
+ zap->zap_salt = zap_m_phys(zap)->mz_salt;
+ zap->zap_normflags = zap_m_phys(zap)->mz_normflags;
+ zap->zap_m.zap_num_chunks = db->db_size / MZAP_ENT_LEN - 1;
+ avl_create(&zap->zap_m.zap_avl, mze_compare,
+ sizeof (mzap_ent_t), offsetof(mzap_ent_t, mze_node));
+
+ for (int i = 0; i < zap->zap_m.zap_num_chunks; i++) {
+ mzap_ent_phys_t *mze =
+ &zap_m_phys(zap)->mz_chunk[i];
+ if (mze->mze_name[0]) {
+ zap_name_t *zn;
+
+ zn = zap_name_alloc(zap, mze->mze_name, 0);
+ if (mze_insert(zap, i, zn->zn_hash) == 0)
+ zap->zap_m.zap_num_entries++;
+ else {
+ printf("ZFS WARNING: Duplicated ZAP "
+ "entry detected (%s).\n",
+ mze->mze_name);
+ }
+ zap_name_free(zn);
+ }
+ }
+ } else {
+ zap->zap_salt = zap_f_phys(zap)->zap_salt;
+ zap->zap_normflags = zap_f_phys(zap)->zap_normflags;
+
+ ASSERT3U(sizeof (struct zap_leaf_header), ==,
+ 2*ZAP_LEAF_CHUNKSIZE);
+
+ /*
+ * The embedded pointer table should not overlap the
+ * other members.
+ */
+ ASSERT3P(&ZAP_EMBEDDED_PTRTBL_ENT(zap, 0), >,
+ &zap_f_phys(zap)->zap_salt);
+
+ /*
+ * The embedded pointer table should end at the end of
+ * the block
+ */
+ ASSERT3U((uintptr_t)&ZAP_EMBEDDED_PTRTBL_ENT(zap,
+ 1<<ZAP_EMBEDDED_PTRTBL_SHIFT(zap)) -
+ (uintptr_t)zap_f_phys(zap), ==,
+ zap->zap_dbuf->db_size);
+ }
+ rw_exit(&zap->zap_rwlock);
+ return (zap);
+
+handle_winner:
+ rw_exit(&zap->zap_rwlock);
+ rw_destroy(&zap->zap_rwlock);
+ if (!zap->zap_ismicro)
+ mutex_destroy(&zap->zap_f.zap_num_entries_mtx);
+ kmem_free(zap, sizeof (zap_t));
+ return (winner);
+}
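+
+/*
+ * The dmu_buf_set_user() call above is what resolves concurrent opens: if
+ * two threads race through mzap_open() for the same dbuf, only one installs
+ * its zap_t; the loser receives the winner's zap_t back, jumps to
+ * handle_winner, tears down its own partially initialized copy, and returns
+ * the winner, so both callers end up sharing one in-core zap_t.
+ */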
+
+/*
+ * This routine "consumes" the caller's hold on the dbuf, which must
+ * have the specified tag.
+ */
+static int
+zap_lockdir_impl(dmu_buf_t *db, void *tag, dmu_tx_t *tx,
+ krw_t lti, boolean_t fatreader, boolean_t adding, zap_t **zapp)
+{
+ ASSERT0(db->db_offset);
+ objset_t *os = dmu_buf_get_objset(db);
+ uint64_t obj = db->db_object;
+
+ *zapp = NULL;
+
+ zap_t *zap = dmu_buf_get_user(db);
+ if (zap == NULL) {
+ zap = mzap_open(os, obj, db);
+ if (zap == NULL) {
+ /*
+ * mzap_open() didn't like what it saw on-disk.
+ * Check for corruption!
+ */
+ return (SET_ERROR(EIO));
+ }
+ }
+
+ /*
+ * We're checking zap_ismicro without the lock held, in order to
+ * tell what type of lock we want. Once we have some sort of
+ * lock, see if it really is the right type. In practice this
+ * can only be different if it was upgraded from micro to fat,
+ * and micro wanted WRITER but fat only needs READER.
+ */
+ krw_t lt = (!zap->zap_ismicro && fatreader) ? RW_READER : lti;
+ rw_enter(&zap->zap_rwlock, lt);
+ if (lt != ((!zap->zap_ismicro && fatreader) ? RW_READER : lti)) {
+ /* it was upgraded, now we only need reader */
+ ASSERT(lt == RW_WRITER);
+ ASSERT(RW_READER ==
+ ((!zap->zap_ismicro && fatreader) ? RW_READER : lti));
+ rw_downgrade(&zap->zap_rwlock);
+ lt = RW_READER;
+ }
+
+ zap->zap_objset = os;
+
+ if (lt == RW_WRITER)
+ dmu_buf_will_dirty(db, tx);
+
+ ASSERT3P(zap->zap_dbuf, ==, db);
+
+ ASSERT(!zap->zap_ismicro ||
+ zap->zap_m.zap_num_entries <= zap->zap_m.zap_num_chunks);
+ if (zap->zap_ismicro && tx && adding &&
+ zap->zap_m.zap_num_entries == zap->zap_m.zap_num_chunks) {
+ uint64_t newsz = db->db_size + SPA_MINBLOCKSIZE;
+ if (newsz > MZAP_MAX_BLKSZ) {
+ dprintf("upgrading obj %llu: num_entries=%u\n",
+ obj, zap->zap_m.zap_num_entries);
+ *zapp = zap;
+ int err = mzap_upgrade(zapp, tag, tx, 0);
+ if (err != 0)
+ rw_exit(&zap->zap_rwlock);
+ return (err);
+ }
+ VERIFY0(dmu_object_set_blocksize(os, obj, newsz, 0, tx));
+ zap->zap_m.zap_num_chunks =
+ db->db_size / MZAP_ENT_LEN - 1;
+ }
+
+ *zapp = zap;
+ return (0);
+}
+
+static int
+zap_lockdir_by_dnode(dnode_t *dn, dmu_tx_t *tx,
+ krw_t lti, boolean_t fatreader, boolean_t adding, void *tag, zap_t **zapp)
+{
+ dmu_buf_t *db;
+
+ int err = dmu_buf_hold_by_dnode(dn, 0, tag, &db, DMU_READ_NO_PREFETCH);
+ if (err != 0) {
+ return (err);
+ }
+#ifdef ZFS_DEBUG
+ {
+ dmu_object_info_t doi;
+ dmu_object_info_from_db(db, &doi);
+ ASSERT3U(DMU_OT_BYTESWAP(doi.doi_type), ==, DMU_BSWAP_ZAP);
+ }
+#endif
+
+ err = zap_lockdir_impl(db, tag, tx, lti, fatreader, adding, zapp);
+ if (err != 0) {
+ dmu_buf_rele(db, tag);
+ }
+ return (err);
+}
+
+int
+zap_lockdir(objset_t *os, uint64_t obj, dmu_tx_t *tx,
+ krw_t lti, boolean_t fatreader, boolean_t adding, void *tag, zap_t **zapp)
+{
+ dmu_buf_t *db;
+
+ int err = dmu_buf_hold(os, obj, 0, tag, &db, DMU_READ_NO_PREFETCH);
+ if (err != 0)
+ return (err);
+#ifdef ZFS_DEBUG
+ {
+ dmu_object_info_t doi;
+ dmu_object_info_from_db(db, &doi);
+ ASSERT3U(DMU_OT_BYTESWAP(doi.doi_type), ==, DMU_BSWAP_ZAP);
+ }
+#endif
+ err = zap_lockdir_impl(db, tag, tx, lti, fatreader, adding, zapp);
+ if (err != 0)
+ dmu_buf_rele(db, tag);
+ return (err);
+}
+
+void
+zap_unlockdir(zap_t *zap, void *tag)
+{
+ rw_exit(&zap->zap_rwlock);
+ dmu_buf_rele(zap->zap_dbuf, tag);
+}
+
+static int
+mzap_upgrade(zap_t **zapp, void *tag, dmu_tx_t *tx, zap_flags_t flags)
+{
+ int err = 0;
+ zap_t *zap = *zapp;
+
+ ASSERT(RW_WRITE_HELD(&zap->zap_rwlock));
+
+ int sz = zap->zap_dbuf->db_size;
+ mzap_phys_t *mzp = zio_buf_alloc(sz);
+ bcopy(zap->zap_dbuf->db_data, mzp, sz);
+ int nchunks = zap->zap_m.zap_num_chunks;
+
+ if (!flags) {
+ err = dmu_object_set_blocksize(zap->zap_objset, zap->zap_object,
+ 1ULL << fzap_default_block_shift, 0, tx);
+ if (err != 0) {
+ zio_buf_free(mzp, sz);
+ return (err);
+ }
+ }
+
+ dprintf("upgrading obj=%llu with %u chunks\n",
+ zap->zap_object, nchunks);
+ /* XXX destroy the avl later, so we can use the stored hash value */
+ mze_destroy(zap);
+
+ fzap_upgrade(zap, tx, flags);
+
+ for (int i = 0; i < nchunks; i++) {
+ mzap_ent_phys_t *mze = &mzp->mz_chunk[i];
+ if (mze->mze_name[0] == 0)
+ continue;
+ dprintf("adding %s=%llu\n",
+ mze->mze_name, mze->mze_value);
+ zap_name_t *zn = zap_name_alloc(zap, mze->mze_name, 0);
+ err = fzap_add_cd(zn, 8, 1, &mze->mze_value, mze->mze_cd,
+ tag, tx);
+ zap = zn->zn_zap; /* fzap_add_cd() may change zap */
+ zap_name_free(zn);
+ if (err != 0)
+ break;
+ }
+ zio_buf_free(mzp, sz);
+ *zapp = zap;
+ return (err);
+}
+
+/*
+ * The "normflags" determine the behavior of the matchtype_t which is
+ * passed to zap_lookup_norm(). Names which have the same normalized
+ * version will be stored with the same hash value, and therefore we can
+ * perform normalization-insensitive lookups. We can be Unicode form-
+ * insensitive and/or case-insensitive. The following flags are valid for
+ * "normflags":
+ *
+ * U8_TEXTPREP_NFC
+ * U8_TEXTPREP_NFD
+ * U8_TEXTPREP_NFKC
+ * U8_TEXTPREP_NFKD
+ * U8_TEXTPREP_TOUPPER
+ *
+ * The *_NF* (Normalization Form) flags are mutually exclusive; at most one
+ * of them may be supplied.
+ */
+void
+mzap_create_impl(objset_t *os, uint64_t obj, int normflags, zap_flags_t flags,
+ dmu_tx_t *tx)
+{
+ dmu_buf_t *db;
+
+ VERIFY0(dmu_buf_hold(os, obj, 0, FTAG, &db, DMU_READ_NO_PREFETCH));
+
+ dmu_buf_will_dirty(db, tx);
+ mzap_phys_t *zp = db->db_data;
+ zp->mz_block_type = ZBT_MICRO;
+ zp->mz_salt = ((uintptr_t)db ^ (uintptr_t)tx ^ (obj << 1)) | 1ULL;
+ zp->mz_normflags = normflags;
+
+ if (flags != 0) {
+ zap_t *zap;
+ /* Only fat zap supports flags; upgrade immediately. */
+ VERIFY0(zap_lockdir_impl(db, FTAG, tx, RW_WRITER,
+ B_FALSE, B_FALSE, &zap));
+ VERIFY0(mzap_upgrade(&zap, FTAG, tx, flags));
+ zap_unlockdir(zap, FTAG);
+ } else {
+ dmu_buf_rele(db, FTAG);
+ }
+}
+
+int
+zap_create_claim(objset_t *os, uint64_t obj, dmu_object_type_t ot,
+ dmu_object_type_t bonustype, int bonuslen, dmu_tx_t *tx)
+{
+ return (zap_create_claim_dnsize(os, obj, ot, bonustype, bonuslen,
+ 0, tx));
+}
+
+int
+zap_create_claim_dnsize(objset_t *os, uint64_t obj, dmu_object_type_t ot,
+ dmu_object_type_t bonustype, int bonuslen, int dnodesize, dmu_tx_t *tx)
+{
+ return (zap_create_claim_norm_dnsize(os, obj,
+ 0, ot, bonustype, bonuslen, dnodesize, tx));
+}
+
+int
+zap_create_claim_norm(objset_t *os, uint64_t obj, int normflags,
+ dmu_object_type_t ot,
+ dmu_object_type_t bonustype, int bonuslen, dmu_tx_t *tx)
+{
+ return (zap_create_claim_norm_dnsize(os, obj, normflags, ot, bonustype,
+ bonuslen, 0, tx));
+}
+
+int
+zap_create_claim_norm_dnsize(objset_t *os, uint64_t obj, int normflags,
+ dmu_object_type_t ot, dmu_object_type_t bonustype, int bonuslen,
+ int dnodesize, dmu_tx_t *tx)
+{
+ int err;
+
+ err = dmu_object_claim_dnsize(os, obj, ot, 0, bonustype, bonuslen,
+ dnodesize, tx);
+ if (err != 0)
+ return (err);
+ mzap_create_impl(os, obj, normflags, 0, tx);
+ return (0);
+}
+
+uint64_t
+zap_create(objset_t *os, dmu_object_type_t ot,
+ dmu_object_type_t bonustype, int bonuslen, dmu_tx_t *tx)
+{
+ return (zap_create_norm(os, 0, ot, bonustype, bonuslen, tx));
+}
+
+uint64_t
+zap_create_dnsize(objset_t *os, dmu_object_type_t ot,
+ dmu_object_type_t bonustype, int bonuslen, int dnodesize, dmu_tx_t *tx)
+{
+ return (zap_create_norm_dnsize(os, 0, ot, bonustype, bonuslen,
+ dnodesize, tx));
+}
+
+uint64_t
+zap_create_norm(objset_t *os, int normflags, dmu_object_type_t ot,
+ dmu_object_type_t bonustype, int bonuslen, dmu_tx_t *tx)
+{
+ ASSERT3U(DMU_OT_BYTESWAP(ot), ==, DMU_BSWAP_ZAP);
+ return (zap_create_norm_dnsize(os, normflags, ot, bonustype, bonuslen,
+ 0, tx));
+}
+
+uint64_t
+zap_create_norm_dnsize(objset_t *os, int normflags, dmu_object_type_t ot,
+ dmu_object_type_t bonustype, int bonuslen, int dnodesize, dmu_tx_t *tx)
+{
+ uint64_t obj = dmu_object_alloc_dnsize(os, ot, 0, bonustype, bonuslen,
+ dnodesize, tx);
+
+ mzap_create_impl(os, obj, normflags, 0, tx);
+ return (obj);
+}
+
+uint64_t
+zap_create_flags(objset_t *os, int normflags, zap_flags_t flags,
+ dmu_object_type_t ot, int leaf_blockshift, int indirect_blockshift,
+ dmu_object_type_t bonustype, int bonuslen, dmu_tx_t *tx)
+{
+ ASSERT3U(DMU_OT_BYTESWAP(ot), ==, DMU_BSWAP_ZAP);
+ return (zap_create_flags_dnsize(os, normflags, flags, ot,
+ leaf_blockshift, indirect_blockshift, bonustype, bonuslen, 0, tx));
+}
+
+uint64_t
+zap_create_flags_dnsize(objset_t *os, int normflags, zap_flags_t flags,
+ dmu_object_type_t ot, int leaf_blockshift, int indirect_blockshift,
+ dmu_object_type_t bonustype, int bonuslen, int dnodesize, dmu_tx_t *tx)
+{
+ uint64_t obj = dmu_object_alloc_dnsize(os, ot, 0, bonustype, bonuslen,
+ dnodesize, tx);
+
+ ASSERT(leaf_blockshift >= SPA_MINBLOCKSHIFT &&
+ leaf_blockshift <= SPA_OLD_MAXBLOCKSHIFT &&
+ indirect_blockshift >= SPA_MINBLOCKSHIFT &&
+ indirect_blockshift <= SPA_OLD_MAXBLOCKSHIFT);
+
+ VERIFY0(dmu_object_set_blocksize(os, obj,
+ 1ULL << leaf_blockshift, indirect_blockshift, tx));
+
+ mzap_create_impl(os, obj, normflags, flags, tx);
+ return (obj);
+}
+
+int
+zap_destroy(objset_t *os, uint64_t zapobj, dmu_tx_t *tx)
+{
+ /*
+ * dmu_object_free() will free the object number and its data.
+ * Freeing the data causes our eviction callback, zap_evict_sync(),
+ * to be called, which destroys our in-core state (zap_leaf_t's and
+ * the zap_t).
+ */
+
+ return (dmu_object_free(os, zapobj, tx));
+}
+
+void
+zap_evict_sync(void *dbu)
+{
+ zap_t *zap = dbu;
+
+ rw_destroy(&zap->zap_rwlock);
+
+ if (zap->zap_ismicro)
+ mze_destroy(zap);
+ else
+ mutex_destroy(&zap->zap_f.zap_num_entries_mtx);
+
+ kmem_free(zap, sizeof (zap_t));
+}
+
+int
+zap_count(objset_t *os, uint64_t zapobj, uint64_t *count)
+{
+ zap_t *zap;
+
+ int err =
+ zap_lockdir(os, zapobj, NULL, RW_READER, TRUE, FALSE, FTAG, &zap);
+ if (err != 0)
+ return (err);
+ if (!zap->zap_ismicro) {
+ err = fzap_count(zap, count);
+ } else {
+ *count = zap->zap_m.zap_num_entries;
+ }
+ zap_unlockdir(zap, FTAG);
+ return (err);
+}
+
+/*
+ * zn may be NULL; if not specified, it will be computed if needed.
+ * See also the comment above zap_entry_normalization_conflict().
+ */
+static boolean_t
+mzap_normalization_conflict(zap_t *zap, zap_name_t *zn, mzap_ent_t *mze)
+{
+ int direction = AVL_BEFORE;
+ boolean_t allocdzn = B_FALSE;
+
+ if (zap->zap_normflags == 0)
+ return (B_FALSE);
+
+again:
+ for (mzap_ent_t *other = avl_walk(&zap->zap_m.zap_avl, mze, direction);
+ other && other->mze_hash == mze->mze_hash;
+ other = avl_walk(&zap->zap_m.zap_avl, other, direction)) {
+
+ if (zn == NULL) {
+ zn = zap_name_alloc(zap, MZE_PHYS(zap, mze)->mze_name,
+ MT_NORMALIZE);
+ allocdzn = B_TRUE;
+ }
+ if (zap_match(zn, MZE_PHYS(zap, other)->mze_name)) {
+ if (allocdzn)
+ zap_name_free(zn);
+ return (B_TRUE);
+ }
+ }
+
+ if (direction == AVL_BEFORE) {
+ direction = AVL_AFTER;
+ goto again;
+ }
+
+ if (allocdzn)
+ zap_name_free(zn);
+ return (B_FALSE);
+}
+
+/*
+ * Routines for manipulating attributes.
+ */
+
+int
+zap_lookup(objset_t *os, uint64_t zapobj, const char *name,
+ uint64_t integer_size, uint64_t num_integers, void *buf)
+{
+ return (zap_lookup_norm(os, zapobj, name, integer_size,
+ num_integers, buf, 0, NULL, 0, NULL));
+}
+
+static int
+zap_lookup_impl(zap_t *zap, const char *name,
+ uint64_t integer_size, uint64_t num_integers, void *buf,
+ matchtype_t mt, char *realname, int rn_len,
+ boolean_t *ncp)
+{
+ int err = 0;
+
+ zap_name_t *zn = zap_name_alloc(zap, name, mt);
+ if (zn == NULL)
+ return (SET_ERROR(ENOTSUP));
+
+ if (!zap->zap_ismicro) {
+ err = fzap_lookup(zn, integer_size, num_integers, buf,
+ realname, rn_len, ncp);
+ } else {
+ mzap_ent_t *mze = mze_find(zn);
+ if (mze == NULL) {
+ err = SET_ERROR(ENOENT);
+ } else {
+ if (num_integers < 1) {
+ err = SET_ERROR(EOVERFLOW);
+ } else if (integer_size != 8) {
+ err = SET_ERROR(EINVAL);
+ } else {
+ *(uint64_t *)buf =
+ MZE_PHYS(zap, mze)->mze_value;
+ (void) strlcpy(realname,
+ MZE_PHYS(zap, mze)->mze_name, rn_len);
+ if (ncp) {
+ *ncp = mzap_normalization_conflict(zap,
+ zn, mze);
+ }
+ }
+ }
+ }
+ zap_name_free(zn);
+ return (err);
+}
+
+int
+zap_lookup_norm(objset_t *os, uint64_t zapobj, const char *name,
+ uint64_t integer_size, uint64_t num_integers, void *buf,
+ matchtype_t mt, char *realname, int rn_len,
+ boolean_t *ncp)
+{
+ zap_t *zap;
+
+ int err =
+ zap_lockdir(os, zapobj, NULL, RW_READER, TRUE, FALSE, FTAG, &zap);
+ if (err != 0)
+ return (err);
+ err = zap_lookup_impl(zap, name, integer_size,
+ num_integers, buf, mt, realname, rn_len, ncp);
+ zap_unlockdir(zap, FTAG);
+ return (err);
+}
+
+int
+zap_lookup_by_dnode(dnode_t *dn, const char *name,
+ uint64_t integer_size, uint64_t num_integers, void *buf)
+{
+ return (zap_lookup_norm_by_dnode(dn, name, integer_size,
+ num_integers, buf, 0, NULL, 0, NULL));
+}
+
+int
+zap_lookup_norm_by_dnode(dnode_t *dn, const char *name,
+ uint64_t integer_size, uint64_t num_integers, void *buf,
+ matchtype_t mt, char *realname, int rn_len,
+ boolean_t *ncp)
+{
+ zap_t *zap;
+
+ int err = zap_lockdir_by_dnode(dn, NULL, RW_READER, TRUE, FALSE,
+ FTAG, &zap);
+ if (err != 0)
+ return (err);
+ err = zap_lookup_impl(zap, name, integer_size,
+ num_integers, buf, mt, realname, rn_len, ncp);
+ zap_unlockdir(zap, FTAG);
+ return (err);
+}
+
+int
+zap_prefetch_uint64(objset_t *os, uint64_t zapobj, const uint64_t *key,
+ int key_numints)
+{
+ zap_t *zap;
+
+ int err =
+ zap_lockdir(os, zapobj, NULL, RW_READER, TRUE, FALSE, FTAG, &zap);
+ if (err != 0)
+ return (err);
+ zap_name_t *zn = zap_name_alloc_uint64(zap, key, key_numints);
+ if (zn == NULL) {
+ zap_unlockdir(zap, FTAG);
+ return (SET_ERROR(ENOTSUP));
+ }
+
+ fzap_prefetch(zn);
+ zap_name_free(zn);
+ zap_unlockdir(zap, FTAG);
+ return (err);
+}
+
+int
+zap_lookup_uint64(objset_t *os, uint64_t zapobj, const uint64_t *key,
+ int key_numints, uint64_t integer_size, uint64_t num_integers, void *buf)
+{
+ zap_t *zap;
+
+ int err =
+ zap_lockdir(os, zapobj, NULL, RW_READER, TRUE, FALSE, FTAG, &zap);
+ if (err != 0)
+ return (err);
+ zap_name_t *zn = zap_name_alloc_uint64(zap, key, key_numints);
+ if (zn == NULL) {
+ zap_unlockdir(zap, FTAG);
+ return (SET_ERROR(ENOTSUP));
+ }
+
+ err = fzap_lookup(zn, integer_size, num_integers, buf,
+ NULL, 0, NULL);
+ zap_name_free(zn);
+ zap_unlockdir(zap, FTAG);
+ return (err);
+}
+
+int
+zap_contains(objset_t *os, uint64_t zapobj, const char *name)
+{
+ int err = zap_lookup_norm(os, zapobj, name, 0,
+ 0, NULL, 0, NULL, 0, NULL);
+ if (err == EOVERFLOW || err == EINVAL)
+ err = 0; /* found, but skipped reading the value */
+ return (err);
+}
+
+int
+zap_length(objset_t *os, uint64_t zapobj, const char *name,
+ uint64_t *integer_size, uint64_t *num_integers)
+{
+ zap_t *zap;
+
+ int err =
+ zap_lockdir(os, zapobj, NULL, RW_READER, TRUE, FALSE, FTAG, &zap);
+ if (err != 0)
+ return (err);
+ zap_name_t *zn = zap_name_alloc(zap, name, 0);
+ if (zn == NULL) {
+ zap_unlockdir(zap, FTAG);
+ return (SET_ERROR(ENOTSUP));
+ }
+ if (!zap->zap_ismicro) {
+ err = fzap_length(zn, integer_size, num_integers);
+ } else {
+ mzap_ent_t *mze = mze_find(zn);
+ if (mze == NULL) {
+ err = SET_ERROR(ENOENT);
+ } else {
+ if (integer_size)
+ *integer_size = 8;
+ if (num_integers)
+ *num_integers = 1;
+ }
+ }
+ zap_name_free(zn);
+ zap_unlockdir(zap, FTAG);
+ return (err);
+}
+
+int
+zap_length_uint64(objset_t *os, uint64_t zapobj, const uint64_t *key,
+ int key_numints, uint64_t *integer_size, uint64_t *num_integers)
+{
+ zap_t *zap;
+
+ int err =
+ zap_lockdir(os, zapobj, NULL, RW_READER, TRUE, FALSE, FTAG, &zap);
+ if (err != 0)
+ return (err);
+ zap_name_t *zn = zap_name_alloc_uint64(zap, key, key_numints);
+ if (zn == NULL) {
+ zap_unlockdir(zap, FTAG);
+ return (SET_ERROR(ENOTSUP));
+ }
+ err = fzap_length(zn, integer_size, num_integers);
+ zap_name_free(zn);
+ zap_unlockdir(zap, FTAG);
+ return (err);
+}
+
+static void
+mzap_addent(zap_name_t *zn, uint64_t value)
+{
+ zap_t *zap = zn->zn_zap;
+ int start = zap->zap_m.zap_alloc_next;
+
+ ASSERT(RW_WRITE_HELD(&zap->zap_rwlock));
+
+#ifdef ZFS_DEBUG
+ for (int i = 0; i < zap->zap_m.zap_num_chunks; i++) {
+ mzap_ent_phys_t *mze = &zap_m_phys(zap)->mz_chunk[i];
+ ASSERT(strcmp(zn->zn_key_orig, mze->mze_name) != 0);
+ }
+#endif
+
+ uint32_t cd = mze_find_unused_cd(zap, zn->zn_hash);
+ /* given the limited size of the microzap, this can't happen */
+ ASSERT(cd < zap_maxcd(zap));
+
+again:
+ for (int i = start; i < zap->zap_m.zap_num_chunks; i++) {
+ mzap_ent_phys_t *mze = &zap_m_phys(zap)->mz_chunk[i];
+ if (mze->mze_name[0] == 0) {
+ mze->mze_value = value;
+ mze->mze_cd = cd;
+ (void) strcpy(mze->mze_name, zn->zn_key_orig);
+ zap->zap_m.zap_num_entries++;
+ zap->zap_m.zap_alloc_next = i+1;
+ if (zap->zap_m.zap_alloc_next ==
+ zap->zap_m.zap_num_chunks)
+ zap->zap_m.zap_alloc_next = 0;
+ VERIFY0(mze_insert(zap, i, zn->zn_hash));
+ return;
+ }
+ }
+ if (start != 0) {
+ start = 0;
+ goto again;
+ }
+ ASSERT(!"out of entries!");
+}
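+
+/*
+ * Note that the chunk scan above is a simple allocation rotor:
+ * zap_m.zap_alloc_next remembers where the last insert landed, the scan
+ * runs from there to the end of the block, and if no free chunk is found
+ * it wraps once to index 0. This spreads inserts across the block instead
+ * of rescanning the (usually full) front chunks on every add.
+ */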
+
+static int
+zap_add_impl(zap_t *zap, const char *key,
+ int integer_size, uint64_t num_integers,
+ const void *val, dmu_tx_t *tx, void *tag)
+{
+ const uint64_t *intval = val;
+ int err = 0;
+
+ zap_name_t *zn = zap_name_alloc(zap, key, 0);
+ if (zn == NULL) {
+ zap_unlockdir(zap, tag);
+ return (SET_ERROR(ENOTSUP));
+ }
+ if (!zap->zap_ismicro) {
+ err = fzap_add(zn, integer_size, num_integers, val, tag, tx);
+ zap = zn->zn_zap; /* fzap_add() may change zap */
+ } else if (integer_size != 8 || num_integers != 1 ||
+ strlen(key) >= MZAP_NAME_LEN) {
+ err = mzap_upgrade(&zn->zn_zap, tag, tx, 0);
+ if (err == 0) {
+ err = fzap_add(zn, integer_size, num_integers, val,
+ tag, tx);
+ }
+ zap = zn->zn_zap; /* fzap_add() may change zap */
+ } else {
+ if (mze_find(zn) != NULL) {
+ err = SET_ERROR(EEXIST);
+ } else {
+ mzap_addent(zn, *intval);
+ }
+ }
+ ASSERT(zap == zn->zn_zap);
+ zap_name_free(zn);
+ if (zap != NULL) /* may be NULL if fzap_add() failed */
+ zap_unlockdir(zap, tag);
+ return (err);
+}
+
+int
+zap_add(objset_t *os, uint64_t zapobj, const char *key,
+ int integer_size, uint64_t num_integers,
+ const void *val, dmu_tx_t *tx)
+{
+ zap_t *zap;
+ int err;
+
+ err = zap_lockdir(os, zapobj, tx, RW_WRITER, TRUE, TRUE, FTAG, &zap);
+ if (err != 0)
+ return (err);
+ err = zap_add_impl(zap, key, integer_size, num_integers, val, tx, FTAG);
+ /* zap_add_impl() calls zap_unlockdir() */
+ return (err);
+}
+
+int
+zap_add_by_dnode(dnode_t *dn, const char *key,
+ int integer_size, uint64_t num_integers,
+ const void *val, dmu_tx_t *tx)
+{
+ zap_t *zap;
+ int err;
+
+ err = zap_lockdir_by_dnode(dn, tx, RW_WRITER, TRUE, TRUE, FTAG, &zap);
+ if (err != 0)
+ return (err);
+ err = zap_add_impl(zap, key, integer_size, num_integers, val, tx, FTAG);
+ /* zap_add_impl() calls zap_unlockdir() */
+ return (err);
+}
+
+int
+zap_add_uint64(objset_t *os, uint64_t zapobj, const uint64_t *key,
+ int key_numints, int integer_size, uint64_t num_integers,
+ const void *val, dmu_tx_t *tx)
+{
+ zap_t *zap;
+
+ int err =
+ zap_lockdir(os, zapobj, tx, RW_WRITER, TRUE, TRUE, FTAG, &zap);
+ if (err != 0)
+ return (err);
+ zap_name_t *zn = zap_name_alloc_uint64(zap, key, key_numints);
+ if (zn == NULL) {
+ zap_unlockdir(zap, FTAG);
+ return (SET_ERROR(ENOTSUP));
+ }
+ err = fzap_add(zn, integer_size, num_integers, val, FTAG, tx);
+ zap = zn->zn_zap; /* fzap_add() may change zap */
+ zap_name_free(zn);
+ if (zap != NULL) /* may be NULL if fzap_add() failed */
+ zap_unlockdir(zap, FTAG);
+ return (err);
+}
+
+int
+zap_update(objset_t *os, uint64_t zapobj, const char *name,
+ int integer_size, uint64_t num_integers, const void *val, dmu_tx_t *tx)
+{
+ zap_t *zap;
+ uint64_t oldval;
+ const uint64_t *intval = val;
+
+#ifdef ZFS_DEBUG
+ /*
+ * If there is an old value, it shouldn't change across the
+ * lockdir (e.g., due to block-pointer rewrite translation).
+ */
+ if (integer_size == 8 && num_integers == 1)
+ (void) zap_lookup(os, zapobj, name, 8, 1, &oldval);
+#endif
+
+ int err =
+ zap_lockdir(os, zapobj, tx, RW_WRITER, TRUE, TRUE, FTAG, &zap);
+ if (err != 0)
+ return (err);
+ zap_name_t *zn = zap_name_alloc(zap, name, 0);
+ if (zn == NULL) {
+ zap_unlockdir(zap, FTAG);
+ return (SET_ERROR(ENOTSUP));
+ }
+ if (!zap->zap_ismicro) {
+ err = fzap_update(zn, integer_size, num_integers, val,
+ FTAG, tx);
+ zap = zn->zn_zap; /* fzap_update() may change zap */
+ } else if (integer_size != 8 || num_integers != 1 ||
+ strlen(name) >= MZAP_NAME_LEN) {
+ dprintf("upgrading obj %llu: intsz=%u numint=%llu name=%s\n",
+ zapobj, integer_size, num_integers, name);
+ err = mzap_upgrade(&zn->zn_zap, FTAG, tx, 0);
+ if (err == 0) {
+ err = fzap_update(zn, integer_size, num_integers,
+ val, FTAG, tx);
+ }
+ zap = zn->zn_zap; /* fzap_update() may change zap */
+ } else {
+ mzap_ent_t *mze = mze_find(zn);
+ if (mze != NULL) {
+ ASSERT3U(MZE_PHYS(zap, mze)->mze_value, ==, oldval);
+ MZE_PHYS(zap, mze)->mze_value = *intval;
+ } else {
+ mzap_addent(zn, *intval);
+ }
+ }
+ ASSERT(zap == zn->zn_zap);
+ zap_name_free(zn);
+ if (zap != NULL) /* may be NULL if mzap_upgrade() or fzap_update() failed */
+ zap_unlockdir(zap, FTAG);
+ return (err);
+}
+
+int
+zap_update_uint64(objset_t *os, uint64_t zapobj, const uint64_t *key,
+ int key_numints,
+ int integer_size, uint64_t num_integers, const void *val, dmu_tx_t *tx)
+{
+ zap_t *zap;
+
+ int err =
+ zap_lockdir(os, zapobj, tx, RW_WRITER, TRUE, TRUE, FTAG, &zap);
+ if (err != 0)
+ return (err);
+ zap_name_t *zn = zap_name_alloc_uint64(zap, key, key_numints);
+ if (zn == NULL) {
+ zap_unlockdir(zap, FTAG);
+ return (SET_ERROR(ENOTSUP));
+ }
+ err = fzap_update(zn, integer_size, num_integers, val, FTAG, tx);
+ zap = zn->zn_zap; /* fzap_update() may change zap */
+ zap_name_free(zn);
+ if (zap != NULL) /* may be NULL if fzap_update() failed */
+ zap_unlockdir(zap, FTAG);
+ return (err);
+}
+
+int
+zap_remove(objset_t *os, uint64_t zapobj, const char *name, dmu_tx_t *tx)
+{
+ return (zap_remove_norm(os, zapobj, name, 0, tx));
+}
+
+static int
+zap_remove_impl(zap_t *zap, const char *name,
+ matchtype_t mt, dmu_tx_t *tx)
+{
+ int err = 0;
+
+ zap_name_t *zn = zap_name_alloc(zap, name, mt);
+ if (zn == NULL)
+ return (SET_ERROR(ENOTSUP));
+ if (!zap->zap_ismicro) {
+ err = fzap_remove(zn, tx);
+ } else {
+ mzap_ent_t *mze = mze_find(zn);
+ if (mze == NULL) {
+ err = SET_ERROR(ENOENT);
+ } else {
+ zap->zap_m.zap_num_entries--;
+ bzero(&zap_m_phys(zap)->mz_chunk[mze->mze_chunkid],
+ sizeof (mzap_ent_phys_t));
+ mze_remove(zap, mze);
+ }
+ }
+ zap_name_free(zn);
+ return (err);
+}
+
+int
+zap_remove_norm(objset_t *os, uint64_t zapobj, const char *name,
+ matchtype_t mt, dmu_tx_t *tx)
+{
+ zap_t *zap;
+ int err;
+
+ err = zap_lockdir(os, zapobj, tx, RW_WRITER, TRUE, FALSE, FTAG, &zap);
+ if (err)
+ return (err);
+ err = zap_remove_impl(zap, name, mt, tx);
+ zap_unlockdir(zap, FTAG);
+ return (err);
+}
+
+int
+zap_remove_by_dnode(dnode_t *dn, const char *name, dmu_tx_t *tx)
+{
+ zap_t *zap;
+ int err;
+
+ err = zap_lockdir_by_dnode(dn, tx, RW_WRITER, TRUE, FALSE, FTAG, &zap);
+ if (err)
+ return (err);
+ err = zap_remove_impl(zap, name, 0, tx);
+ zap_unlockdir(zap, FTAG);
+ return (err);
+}
+
+int
+zap_remove_uint64(objset_t *os, uint64_t zapobj, const uint64_t *key,
+ int key_numints, dmu_tx_t *tx)
+{
+ zap_t *zap;
+
+ int err =
+ zap_lockdir(os, zapobj, tx, RW_WRITER, TRUE, FALSE, FTAG, &zap);
+ if (err != 0)
+ return (err);
+ zap_name_t *zn = zap_name_alloc_uint64(zap, key, key_numints);
+ if (zn == NULL) {
+ zap_unlockdir(zap, FTAG);
+ return (SET_ERROR(ENOTSUP));
+ }
+ err = fzap_remove(zn, tx);
+ zap_name_free(zn);
+ zap_unlockdir(zap, FTAG);
+ return (err);
+}
+
+/*
+ * Routines for iterating over the attributes.
+ */
+
+static void
+zap_cursor_init_impl(zap_cursor_t *zc, objset_t *os, uint64_t zapobj,
+ uint64_t serialized, boolean_t prefetch)
+{
+ zc->zc_objset = os;
+ zc->zc_zap = NULL;
+ zc->zc_leaf = NULL;
+ zc->zc_zapobj = zapobj;
+ zc->zc_serialized = serialized;
+ zc->zc_hash = 0;
+ zc->zc_cd = 0;
+ zc->zc_prefetch = prefetch;
+}
+
+void
+zap_cursor_init_serialized(zap_cursor_t *zc, objset_t *os, uint64_t zapobj,
+ uint64_t serialized)
+{
+ zap_cursor_init_impl(zc, os, zapobj, serialized, B_TRUE);
+}
+
+/*
+ * Initialize a cursor at the beginning of the ZAP object. The entire
+ * ZAP object will be prefetched.
+ */
+void
+zap_cursor_init(zap_cursor_t *zc, objset_t *os, uint64_t zapobj)
+{
+ zap_cursor_init_impl(zc, os, zapobj, 0, B_TRUE);
+}
+
+/*
+ * Initialize a cursor at the beginning, but request that we not prefetch
+ * the entire ZAP object.
+ */
+void
+zap_cursor_init_noprefetch(zap_cursor_t *zc, objset_t *os, uint64_t zapobj)
+{
+ zap_cursor_init_impl(zc, os, zapobj, 0, B_FALSE);
+}
+
+void
+zap_cursor_fini(zap_cursor_t *zc)
+{
+ if (zc->zc_zap) {
+ rw_enter(&zc->zc_zap->zap_rwlock, RW_READER);
+ zap_unlockdir(zc->zc_zap, NULL);
+ zc->zc_zap = NULL;
+ }
+ if (zc->zc_leaf) {
+ rw_enter(&zc->zc_leaf->l_rwlock, RW_READER);
+ zap_put_leaf(zc->zc_leaf);
+ zc->zc_leaf = NULL;
+ }
+ zc->zc_objset = NULL;
+}
+
+uint64_t
+zap_cursor_serialize(zap_cursor_t *zc)
+{
+ if (zc->zc_hash == -1ULL)
+ return (-1ULL);
+ if (zc->zc_zap == NULL)
+ return (zc->zc_serialized);
+ ASSERT((zc->zc_hash & zap_maxcd(zc->zc_zap)) == 0);
+ ASSERT(zc->zc_cd < zap_maxcd(zc->zc_zap));
+
+ /*
+ * We want to keep the high 32 bits of the cursor zero if we can, so
+ * that 32-bit programs can access this. So we usually use a small
+ * (28-bit) hash value, which lets us fit 4 bits of cd into the low
+ * 32 bits of the cursor.
+ *
+ * [ collision differentiator | zap_hashbits()-bit hash value ]
+ */
+ return ((zc->zc_hash >> (64 - zap_hashbits(zc->zc_zap))) |
+ ((uint64_t)zc->zc_cd << zap_hashbits(zc->zc_zap)));
+}
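+
+/*
+ * A concrete example of the cookie layout: with 28 hash bits, a cursor
+ * whose zc_hash is 0xABCDEF1ULL << 36 and whose zc_cd is 3 serializes to
+ * (0xABCDEF1) | (3 << 28) == 0x3ABCDEF1, which still fits in the low
+ * 32 bits of the cookie as long as the cd stays within 4 bits.
+ */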
+
+int
+zap_cursor_retrieve(zap_cursor_t *zc, zap_attribute_t *za)
+{
+ int err;
+
+ if (zc->zc_hash == -1ULL)
+ return (SET_ERROR(ENOENT));
+
+ if (zc->zc_zap == NULL) {
+ int hb;
+ err = zap_lockdir(zc->zc_objset, zc->zc_zapobj, NULL,
+ RW_READER, TRUE, FALSE, NULL, &zc->zc_zap);
+ if (err != 0)
+ return (err);
+
+ /*
+ * To support the sequence zap_cursor_init_serialized(),
+ * zap_cursor_advance(), zap_cursor_retrieve(), we must add to the
+ * existing zc_cd, which may already be 1 due to the preceding
+ * zap_cursor_advance() call.
+ */
+ ASSERT(zc->zc_hash == 0);
+ hb = zap_hashbits(zc->zc_zap);
+ zc->zc_hash = zc->zc_serialized << (64 - hb);
+ zc->zc_cd += zc->zc_serialized >> hb;
+ if (zc->zc_cd >= zap_maxcd(zc->zc_zap)) /* corrupt serialized */
+ zc->zc_cd = 0;
+ } else {
+ rw_enter(&zc->zc_zap->zap_rwlock, RW_READER);
+ }
+ if (!zc->zc_zap->zap_ismicro) {
+ err = fzap_cursor_retrieve(zc->zc_zap, zc, za);
+ } else {
+ avl_index_t idx;
+ mzap_ent_t mze_tofind;
+
+ mze_tofind.mze_hash = zc->zc_hash;
+ mze_tofind.mze_cd = zc->zc_cd;
+
+ mzap_ent_t *mze =
+ avl_find(&zc->zc_zap->zap_m.zap_avl, &mze_tofind, &idx);
+ if (mze == NULL) {
+ mze = avl_nearest(&zc->zc_zap->zap_m.zap_avl,
+ idx, AVL_AFTER);
+ }
+ if (mze) {
+ mzap_ent_phys_t *mzep = MZE_PHYS(zc->zc_zap, mze);
+ ASSERT3U(mze->mze_cd, ==, mzep->mze_cd);
+ za->za_normalization_conflict =
+ mzap_normalization_conflict(zc->zc_zap, NULL, mze);
+ za->za_integer_length = 8;
+ za->za_num_integers = 1;
+ za->za_first_integer = mzep->mze_value;
+ (void) strcpy(za->za_name, mzep->mze_name);
+ zc->zc_hash = mze->mze_hash;
+ zc->zc_cd = mze->mze_cd;
+ err = 0;
+ } else {
+ zc->zc_hash = -1ULL;
+ err = SET_ERROR(ENOENT);
+ }
+ }
+ rw_exit(&zc->zc_zap->zap_rwlock);
+ return (err);
+}
+
+void
+zap_cursor_advance(zap_cursor_t *zc)
+{
+ if (zc->zc_hash == -1ULL)
+ return;
+ zc->zc_cd++;
+}
+
+int
+zap_cursor_move_to_key(zap_cursor_t *zc, const char *name, matchtype_t mt)
+{
+ int err = 0;
+ mzap_ent_t *mze;
+ zap_name_t *zn;
+
+ if (zc->zc_zap == NULL) {
+ err = zap_lockdir(zc->zc_objset, zc->zc_zapobj, NULL,
+ RW_READER, TRUE, FALSE, FTAG, &zc->zc_zap);
+ if (err)
+ return (err);
+ } else {
+ rw_enter(&zc->zc_zap->zap_rwlock, RW_READER);
+ }
+
+ zn = zap_name_alloc(zc->zc_zap, name, mt);
+ if (zn == NULL) {
+ rw_exit(&zc->zc_zap->zap_rwlock);
+ return (SET_ERROR(ENOTSUP));
+ }
+
+ if (!zc->zc_zap->zap_ismicro) {
+ err = fzap_cursor_move_to_key(zc, zn);
+ } else {
+ mze = mze_find(zn);
+ if (mze == NULL) {
+ err = SET_ERROR(ENOENT);
+ goto out;
+ }
+ zc->zc_hash = mze->mze_hash;
+ zc->zc_cd = mze->mze_cd;
+ }
+
+out:
+ zap_name_free(zn);
+ rw_exit(&zc->zc_zap->zap_rwlock);
+ return (err);
+}
+
+int
+zap_get_stats(objset_t *os, uint64_t zapobj, zap_stats_t *zs)
+{
+ zap_t *zap;
+
+ int err =
+ zap_lockdir(os, zapobj, NULL, RW_READER, TRUE, FALSE, FTAG, &zap);
+ if (err != 0)
+ return (err);
+
+ bzero(zs, sizeof (zap_stats_t));
+
+ if (zap->zap_ismicro) {
+ zs->zs_blocksize = zap->zap_dbuf->db_size;
+ zs->zs_num_entries = zap->zap_m.zap_num_entries;
+ zs->zs_num_blocks = 1;
+ } else {
+ fzap_get_stats(zap, zs);
+ }
+ zap_unlockdir(zap, FTAG);
+ return (0);
+}
diff --git a/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/zcp.c b/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/zcp.c
new file mode 100644
index 000000000000..cf8b0a58d3ae
--- /dev/null
+++ b/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/zcp.c
@@ -0,0 +1,1432 @@
+/*
+ * CDDL HEADER START
+ *
+ * This file and its contents are supplied under the terms of the
+ * Common Development and Distribution License ("CDDL"), version 1.0.
+ * You may only use this file in accordance with the terms of version
+ * 1.0 of the CDDL.
+ *
+ * A full copy of the text of the CDDL should have accompanied this
+ * source. A copy of the CDDL is also available via the Internet at
+ * http://www.illumos.org/license/CDDL.
+ *
+ * CDDL HEADER END
+ */
+
+/*
+ * Copyright (c) 2016, 2017 by Delphix. All rights reserved.
+ */
+
+/*
+ * ZFS Channel Programs (ZCP)
+ *
+ * The ZCP interface allows various ZFS administrative operations (e.g.
+ * creating and destroying snapshots, typically performed via an ioctl to
+ * /dev/zfs by the zfs(1M) command and libzfs/libzfs_core) to be run
+ * programmatically as a Lua script. A ZCP
+ * script is run as a dsl_sync_task and fully executed during one transaction
+ * group sync. This ensures that no other changes can be written concurrently
+ * with a running Lua script. Combining multiple calls to the exposed ZFS
+ * functions into one script gives a number of benefits:
+ *
+ * 1. Atomicity. For some compound or iterative operations, it's useful to be
+ * able to guarantee that the state of a pool has not changed between calls to
+ * ZFS.
+ *
+ * 2. Performance. If a large number of changes need to be made (e.g. deleting
+ * many filesystems), there can be a significant performance penalty as a
+ * result of the need to wait for a transaction group sync to pass for every
+ * single operation. When expressed as a single ZCP script, all these changes
+ * can be performed at once in one txg sync.
+ *
+ * A modified version of the Lua 5.2 interpreter is used to run channel program
+ * scripts. The Lua 5.2 manual can be found at:
+ *
+ * http://www.lua.org/manual/5.2/
+ *
+ * If being run by a user (via an ioctl syscall), executing a ZCP script
+ * requires root privileges in the global zone.
+ *
+ * Scripts are passed to zcp_eval() as a string, then run in a synctask by
+ * zcp_eval_sync(). Arguments can be passed into the Lua script as an nvlist,
+ * which will be converted to a Lua table. Similarly, values returned from
+ * a ZCP script will be converted to an nvlist. See zcp_lua_to_nvlist_impl()
+ * for details on exact allowed types and conversion.
+ *
+ * ZFS functionality is exposed to a ZCP script as a library of function calls.
+ * These calls are sorted into submodules, such as zfs.list and zfs.sync, for
+ * iterators and synctasks, respectively. Each of these submodules resides in
+ * its own source file, with a zcp_*_info structure describing each library
+ * call in the submodule.
+ *
+ * Error handling in ZCP scripts is handled by a number of different methods
+ * based on severity:
+ *
+ * 1. Memory and time limits are in place to prevent a channel program from
+ * consuming excessive system resources or running forever. If one of these limits is
+ * hit, the channel program will be stopped immediately and return from
+ * zcp_eval() with an error code. No attempt will be made to roll back or undo
+ * any changes made by the channel program before the error occurred.
+ * Consumers invoking zcp_eval() from elsewhere in the kernel may pass a time
+ * limit of 0, disabling the time limit.
+ *
+ * 2. Internal Lua errors can occur as a result of a syntax error, calling a
+ * library function with incorrect arguments, invoking the error() function,
+ * failing an assert(), or other runtime errors. In these cases the channel
+ * program will stop executing and return from zcp_eval() with an error code.
+ * In place of a return value, an error message will also be returned in the
+ * 'result' nvlist containing information about the error. No attempt will be
+ * made to roll back or undo any changes made by the channel program before the
+ * error occurred.
+ *
+ * 3. If an error occurs inside a ZFS library call which returns an error code,
+ * the error is returned to the Lua script to be handled as desired.
+ *
+ * In the first two cases, Lua's error-throwing mechanism is used, which
+ * longjumps out of the script execution with luaL_error() and returns with the
+ * error.
+ *
+ * See zfs-program(1M) for more information on high level usage.
+ */
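+
+/*
+ * As a minimal sketch of what such a script looks like (see zfs-program(1M)
+ * for authoritative examples), a channel program that destroys every
+ * snapshot of the filesystem named in its first argument might read:
+ *
+ *	args = ...
+ *	fs = args["argv"][1]
+ *	for snap in zfs.list.snapshots(fs) do
+ *		zfs.sync.destroy(snap)
+ *	end
+ *
+ * The whole loop executes within one txg sync, so no other ZFS changes can
+ * interleave between the individual destroys.
+ */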
+
+#include "lua.h"
+#include "lualib.h"
+#include "lauxlib.h"
+
+#include <sys/dsl_prop.h>
+#include <sys/dsl_synctask.h>
+#include <sys/dsl_dataset.h>
+#include <sys/zcp.h>
+#include <sys/zcp_iter.h>
+#include <sys/zcp_prop.h>
+#include <sys/zcp_global.h>
+#ifdef illumos
+#include <util/sscanf.h>
+#endif
+
+#ifdef __FreeBSD__
+#define ECHRNG EDOM
+#define ETIME ETIMEDOUT
+#endif
+
+#define ZCP_NVLIST_MAX_DEPTH 20
+
+uint64_t zfs_lua_check_instrlimit_interval = 100;
+uint64_t zfs_lua_max_instrlimit = ZCP_MAX_INSTRLIMIT;
+uint64_t zfs_lua_max_memlimit = ZCP_MAX_MEMLIMIT;
+
+/*
+ * Forward declarations for mutually recursive functions
+ */
+static int zcp_nvpair_value_to_lua(lua_State *, nvpair_t *, char *, int);
+static int zcp_lua_to_nvlist_impl(lua_State *, int, nvlist_t *, const char *,
+ int);
+
+/*
+ * The outer-most error callback handler for use with lua_pcall(). On
+ * error Lua will call this callback with a single argument that
+ * represents the error value. In most cases this will be a string
+ * containing an error message, but channel programs can use Lua's
+ * error() function to return arbitrary objects as errors. This callback
+ * returns (on the Lua stack) the original error object along with a traceback.
+ *
+ * Fatal Lua errors can occur while resources are held, so we also call any
+ * registered cleanup function here.
+ */
+static int
+zcp_error_handler(lua_State *state)
+{
+ const char *msg;
+
+ zcp_cleanup(state);
+
+ VERIFY3U(1, ==, lua_gettop(state));
+ msg = lua_tostring(state, 1);
+ luaL_traceback(state, state, msg, 1);
+ return (1);
+}
+
+int
+zcp_argerror(lua_State *state, int narg, const char *msg, ...)
+{
+ va_list alist;
+
+ va_start(alist, msg);
+ const char *buf = lua_pushvfstring(state, msg, alist);
+ va_end(alist);
+
+ return (luaL_argerror(state, narg, buf));
+}
+
+/*
+ * Install a new cleanup function, which will be invoked with the given
+ * opaque argument if a fatal error causes the Lua interpreter to longjump out
+ * of a function call.
+ *
+ * If an error occurs, the cleanup function will be invoked exactly once and
+ * then unregistered.
+ *
+ * Returns the registered cleanup handler so the caller can deregister it
+ * if no error occurs.
+ */
+zcp_cleanup_handler_t *
+zcp_register_cleanup(lua_State *state, zcp_cleanup_t cleanfunc, void *cleanarg)
+{
+ zcp_run_info_t *ri = zcp_run_info(state);
+
+ zcp_cleanup_handler_t *zch = kmem_alloc(sizeof (*zch), KM_SLEEP);
+ zch->zch_cleanup_func = cleanfunc;
+ zch->zch_cleanup_arg = cleanarg;
+ list_insert_head(&ri->zri_cleanup_handlers, zch);
+
+ return (zch);
+}
+
+void
+zcp_deregister_cleanup(lua_State *state, zcp_cleanup_handler_t *zch)
+{
+ zcp_run_info_t *ri = zcp_run_info(state);
+ list_remove(&ri->zri_cleanup_handlers, zch);
+ kmem_free(zch, sizeof (*zch));
+}
+
+/*
+ * Execute the currently registered cleanup handlers then free them and
+ * destroy the handler list.
+ */
+void
+zcp_cleanup(lua_State *state)
+{
+ zcp_run_info_t *ri = zcp_run_info(state);
+
+ for (zcp_cleanup_handler_t *zch =
+ list_remove_head(&ri->zri_cleanup_handlers); zch != NULL;
+ zch = list_remove_head(&ri->zri_cleanup_handlers)) {
+ zch->zch_cleanup_func(zch->zch_cleanup_arg);
+ kmem_free(zch, sizeof (*zch));
+ }
+}
+
+/*
+ * Convert the lua table at the given index on the Lua stack to an nvlist
+ * and return it.
+ *
+ * If the table can not be converted for any reason, NULL is returned and
+ * an error message is pushed onto the Lua stack.
+ */
+static nvlist_t *
+zcp_table_to_nvlist(lua_State *state, int index, int depth)
+{
+ nvlist_t *nvl;
+ /*
+ * Converting a Lua table to an nvlist with key uniqueness checking is
+ * O(n^2) in the number of keys in the nvlist, which can take a long
+ * time when we return a large table from a channel program.
+ * Furthermore, Lua's table interface *almost* guarantees unique keys
+ * on its own (details below). Therefore, we don't use fnvlist_alloc()
+ * here to avoid the built-in uniqueness checking.
+ *
+ * The *almost* is because it's possible to have key collisions between
+ * e.g. the string "1" and the number 1, or the string "true" and the
+ * boolean true, so we explicitly check that when we're looking at a
+ * key which is an integer / boolean or a string that can be parsed as
+ * one of those types. In the worst case this could still devolve into
+ * O(n^2), so we only start doing these checks on boolean/integer keys
+ * once we've seen a string key which fits this weird usage pattern.
+ *
+ * Ultimately, we still want callers to know that the keys in this
+ * nvlist are unique, so before we return this we set the nvlist's
+ * flags to reflect that.
+ */
+ VERIFY0(nvlist_alloc(&nvl, 0, KM_SLEEP));
+
+ /*
+ * Push an empty stack slot where lua_next() will store each
+ * table key.
+ */
+ lua_pushnil(state);
+ boolean_t saw_str_could_collide = B_FALSE;
+ while (lua_next(state, index) != 0) {
+ /*
+ * The next key-value pair from the table at index is
+ * now on the stack, with the key at stack slot -2 and
+ * the value at slot -1.
+ */
+ int err = 0;
+ char buf[32];
+ const char *key = NULL;
+ boolean_t key_could_collide = B_FALSE;
+
+ switch (lua_type(state, -2)) {
+ case LUA_TSTRING:
+ key = lua_tostring(state, -2);
+
+ /* check if this could collide with a number or bool */
+ long long tmp;
+ int parselen;
+ if ((sscanf(key, "%lld%n", &tmp, &parselen) > 0 &&
+ parselen == strlen(key)) ||
+ strcmp(key, "true") == 0 ||
+ strcmp(key, "false") == 0) {
+ key_could_collide = B_TRUE;
+ saw_str_could_collide = B_TRUE;
+ }
+ break;
+ case LUA_TBOOLEAN:
+ key = (lua_toboolean(state, -2) == B_TRUE ?
+ "true" : "false");
+ if (saw_str_could_collide) {
+ key_could_collide = B_TRUE;
+ }
+ break;
+ case LUA_TNUMBER:
+ VERIFY3U(sizeof (buf), >,
+ snprintf(buf, sizeof (buf), "%lld",
+ (longlong_t)lua_tonumber(state, -2)));
+ key = buf;
+ if (saw_str_could_collide) {
+ key_could_collide = B_TRUE;
+ }
+ break;
+ default:
+ fnvlist_free(nvl);
+ (void) lua_pushfstring(state, "Invalid key "
+ "type '%s' in table",
+ lua_typename(state, lua_type(state, -2)));
+ return (NULL);
+ }
+ /*
+ * Check for type-mismatched key collisions, and throw an error.
+ */
+ if (key_could_collide && nvlist_exists(nvl, key)) {
+ fnvlist_free(nvl);
+ (void) lua_pushfstring(state, "Collision of "
+ "key '%s' in table", key);
+ return (NULL);
+ }
+ /*
+ * Recursively convert the table value and insert into
+ * the new nvlist with the parsed key. To prevent
+ * stack overflow on circular or heavily nested tables,
+ * we track the current nvlist depth.
+ */
+ if (depth >= ZCP_NVLIST_MAX_DEPTH) {
+ fnvlist_free(nvl);
+ (void) lua_pushfstring(state, "Maximum table "
+ "depth (%d) exceeded for table",
+ ZCP_NVLIST_MAX_DEPTH);
+ return (NULL);
+ }
+ err = zcp_lua_to_nvlist_impl(state, -1, nvl, key,
+ depth + 1);
+ if (err != 0) {
+ fnvlist_free(nvl);
+ /*
+ * Error message has been pushed to the lua
+ * stack by the recursive call.
+ */
+ return (NULL);
+ }
+ /*
+ * Pop the value pushed by lua_next().
+ */
+ lua_pop(state, 1);
+ }
+
+ /*
+ * Mark the nvlist as having unique keys. This is a little ugly, but we
+ * ensured above that there are no duplicate keys in the nvlist.
+ */
+ nvl->nvl_nvflag |= NV_UNIQUE_NAME;
+
+ return (nvl);
+}
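+
+/*
+ * For example, the Lua table { [1] = "a", ["1"] = "b" } is rejected by the
+ * collision check above: the number key 1 and the string key "1" both
+ * render as the nvlist key "1", so whichever one is seen second triggers
+ * the "Collision of key" error rather than silently shadowing the first.
+ */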
+
+/*
+ * Convert a value from the given index into the lua stack to an nvpair, adding
+ * it to an nvlist with the given key.
+ *
+ * Values are converted as follows:
+ *
+ * string -> string
+ * number -> int64
+ * boolean -> boolean
+ * nil -> boolean (no value)
+ *
+ * Lua tables are converted to nvlists and then inserted. The table's keys
+ * are converted to strings then used as keys in the nvlist to store each table
+ * element. Keys are converted as follows:
+ *
+ * string -> no change
+ * number -> "%lld"
+ * boolean -> "true" | "false"
+ * nil -> error
+ *
+ * In the case of a key collision, an error is thrown.
+ *
+ * If an error is encountered, a nonzero error code is returned, and an error
+ * string will be pushed onto the Lua stack.
+ */
+static int
+zcp_lua_to_nvlist_impl(lua_State *state, int index, nvlist_t *nvl,
+ const char *key, int depth)
+{
+ /*
+ * Verify that we have enough remaining space in the lua stack to parse
+ * a key-value pair and push an error.
+ */
+ if (!lua_checkstack(state, 3)) {
+ (void) lua_pushstring(state, "Lua stack overflow");
+ return (1);
+ }
+
+ index = lua_absindex(state, index);
+
+ switch (lua_type(state, index)) {
+ case LUA_TNIL:
+ fnvlist_add_boolean(nvl, key);
+ break;
+ case LUA_TBOOLEAN:
+ fnvlist_add_boolean_value(nvl, key,
+ lua_toboolean(state, index));
+ break;
+ case LUA_TNUMBER:
+ fnvlist_add_int64(nvl, key, lua_tonumber(state, index));
+ break;
+ case LUA_TSTRING:
+ fnvlist_add_string(nvl, key, lua_tostring(state, index));
+ break;
+ case LUA_TTABLE: {
+ nvlist_t *value_nvl = zcp_table_to_nvlist(state, index, depth);
+ if (value_nvl == NULL)
+ return (EINVAL);
+
+ fnvlist_add_nvlist(nvl, key, value_nvl);
+ fnvlist_free(value_nvl);
+ break;
+ }
+ default:
+ (void) lua_pushfstring(state,
+ "Invalid value type '%s' for key '%s'",
+ lua_typename(state, lua_type(state, index)), key);
+ return (EINVAL);
+ }
+
+ return (0);
+}
+
+/*
+ * Convert a lua value to an nvpair, adding it to an nvlist with the given key.
+ */
+static void
+zcp_lua_to_nvlist(lua_State *state, int index, nvlist_t *nvl, const char *key)
+{
+ /*
+ * On error, zcp_lua_to_nvlist_impl pushes an error string onto the Lua
+ * stack before returning with a nonzero error code. If an error is
+ * returned, throw a fatal lua error with the given string.
+ */
+ if (zcp_lua_to_nvlist_impl(state, index, nvl, key, 0) != 0)
+ (void) lua_error(state);
+}
+
+static int
+zcp_lua_to_nvlist_helper(lua_State *state)
+{
+ nvlist_t *nv = (nvlist_t *)lua_touserdata(state, 2);
+ const char *key = (const char *)lua_touserdata(state, 1);
+ zcp_lua_to_nvlist(state, 3, nv, key);
+ return (0);
+}
+
+static void
+zcp_convert_return_values(lua_State *state, nvlist_t *nvl,
+ const char *key, int *result)
+{
+ int err;
+ VERIFY3U(1, ==, lua_gettop(state));
+ lua_pushcfunction(state, zcp_lua_to_nvlist_helper);
+ lua_pushlightuserdata(state, (char *)key);
+ lua_pushlightuserdata(state, nvl);
+ lua_pushvalue(state, 1);
+ lua_remove(state, 1);
+ err = lua_pcall(state, 3, 0, 0); /* zcp_lua_to_nvlist_helper */
+ if (err != 0) {
+ zcp_lua_to_nvlist(state, 1, nvl, ZCP_RET_ERROR);
+ *result = SET_ERROR(ECHRNG);
+ }
+}
+
+/*
+ * Push a Lua table representing nvl onto the stack. If it can't be
+ * converted, return EINVAL, fill in errbuf, and push nothing. errbuf may
+ * be specified as NULL, in which case no error string will be output.
+ *
+ * Most nvlists are converted as simple key->value Lua tables, but we make
+ * an exception for the case where all nvlist entries are BOOLEANs (a string
+ * key without a value). In Lua, a table key pointing to a value of Nil
+ * (no value) is equivalent to the key not existing, so a BOOLEAN nvlist
+ * entry can't be directly converted to a Lua table entry. Nvlists of entirely
+ * BOOLEAN entries are frequently used to pass around lists of datasets, so for
+ * convenience we check for this case, and convert it to a simple Lua array of
+ * strings.
+ */
+int
+zcp_nvlist_to_lua(lua_State *state, nvlist_t *nvl,
+ char *errbuf, int errbuf_len)
+{
+ nvpair_t *pair;
+ lua_newtable(state);
+ boolean_t has_values = B_FALSE;
+ /*
+ * If the list doesn't have any values, just convert it to a string
+ * array.
+ */
+ for (pair = nvlist_next_nvpair(nvl, NULL);
+ pair != NULL; pair = nvlist_next_nvpair(nvl, pair)) {
+ if (nvpair_type(pair) != DATA_TYPE_BOOLEAN) {
+ has_values = B_TRUE;
+ break;
+ }
+ }
+ if (!has_values) {
+ int i = 1;
+ for (pair = nvlist_next_nvpair(nvl, NULL);
+ pair != NULL; pair = nvlist_next_nvpair(nvl, pair)) {
+ (void) lua_pushinteger(state, i);
+ (void) lua_pushstring(state, nvpair_name(pair));
+ (void) lua_settable(state, -3);
+ i++;
+ }
+ } else {
+ for (pair = nvlist_next_nvpair(nvl, NULL);
+ pair != NULL; pair = nvlist_next_nvpair(nvl, pair)) {
+ int err = zcp_nvpair_value_to_lua(state, pair,
+ errbuf, errbuf_len);
+ if (err != 0) {
+ lua_pop(state, 1);
+ return (err);
+ }
+ (void) lua_setfield(state, -2, nvpair_name(pair));
+ }
+ }
+ return (0);
+}
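+
+/*
+ * For example, an nvlist holding only the valueless BOOLEAN entries "foo"
+ * and "bar" (as produced for dataset lists) converts to the Lua array
+ * { "foo", "bar" } (i.e. { [1] = "foo", [2] = "bar" }) rather than to a
+ * table of keys mapping to nil, which Lua would treat as empty.
+ */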
+
+/*
+ * Push a Lua object representing the value of "pair" onto the stack.
+ *
+ * Only understands boolean_value, string, int64, nvlist,
+ * string_array, uint64_array, and int64_array type values. For other
+ * types, returns EINVAL, fills in errbuf, and pushes nothing.
+ */
+static int
+zcp_nvpair_value_to_lua(lua_State *state, nvpair_t *pair,
+ char *errbuf, int errbuf_len)
+{
+ int err = 0;
+
+ if (pair == NULL) {
+ lua_pushnil(state);
+ return (0);
+ }
+
+ switch (nvpair_type(pair)) {
+ case DATA_TYPE_BOOLEAN_VALUE:
+ (void) lua_pushboolean(state,
+ fnvpair_value_boolean_value(pair));
+ break;
+ case DATA_TYPE_STRING:
+ (void) lua_pushstring(state, fnvpair_value_string(pair));
+ break;
+ case DATA_TYPE_INT64:
+ (void) lua_pushinteger(state, fnvpair_value_int64(pair));
+ break;
+ case DATA_TYPE_NVLIST:
+ err = zcp_nvlist_to_lua(state,
+ fnvpair_value_nvlist(pair), errbuf, errbuf_len);
+ break;
+ case DATA_TYPE_STRING_ARRAY: {
+ char **strarr;
+ uint_t nelem;
+ (void) nvpair_value_string_array(pair, &strarr, &nelem);
+ lua_newtable(state);
+ for (int i = 0; i < nelem; i++) {
+ (void) lua_pushinteger(state, i + 1);
+ (void) lua_pushstring(state, strarr[i]);
+ (void) lua_settable(state, -3);
+ }
+ break;
+ }
+ case DATA_TYPE_UINT64_ARRAY: {
+ uint64_t *intarr;
+ uint_t nelem;
+ (void) nvpair_value_uint64_array(pair, &intarr, &nelem);
+ lua_newtable(state);
+ for (int i = 0; i < nelem; i++) {
+ (void) lua_pushinteger(state, i + 1);
+ (void) lua_pushinteger(state, intarr[i]);
+ (void) lua_settable(state, -3);
+ }
+ break;
+ }
+ case DATA_TYPE_INT64_ARRAY: {
+ int64_t *intarr;
+ uint_t nelem;
+ (void) nvpair_value_int64_array(pair, &intarr, &nelem);
+ lua_newtable(state);
+ for (int i = 0; i < nelem; i++) {
+ (void) lua_pushinteger(state, i + 1);
+ (void) lua_pushinteger(state, intarr[i]);
+ (void) lua_settable(state, -3);
+ }
+ break;
+ }
+ default: {
+ if (errbuf != NULL) {
+ (void) snprintf(errbuf, errbuf_len,
+ "Unhandled nvpair type %d for key '%s'",
+ nvpair_type(pair), nvpair_name(pair));
+ }
+ return (EINVAL);
+ }
+ }
+ return (err);
+}
+
+int
+zcp_dataset_hold_error(lua_State *state, dsl_pool_t *dp, const char *dsname,
+ int error)
+{
+ if (error == ENOENT) {
+ (void) zcp_argerror(state, 1, "no such dataset '%s'", dsname);
+ return (0); /* not reached; zcp_argerror will longjmp */
+ } else if (error == EXDEV) {
+ (void) zcp_argerror(state, 1,
+ "dataset '%s' is not in the target pool '%s'",
+ dsname, spa_name(dp->dp_spa));
+ return (0); /* not reached; zcp_argerror will longjmp */
+ } else if (error == EIO) {
+ (void) luaL_error(state,
+ "I/O error while accessing dataset '%s'", dsname);
+ return (0); /* not reached; luaL_error will longjmp */
+ } else if (error != 0) {
+ (void) luaL_error(state,
+ "unexpected error %d while accessing dataset '%s'",
+ error, dsname);
+ return (0); /* not reached; luaL_error will longjmp */
+ }
+ return (0);
+}
+
+/*
+ * Note: will longjmp (via lua_error()) on error.
+ * Assumes that the dsname is argument #1 (for error reporting purposes).
+ */
+dsl_dataset_t *
+zcp_dataset_hold(lua_State *state, dsl_pool_t *dp, const char *dsname,
+ void *tag)
+{
+ dsl_dataset_t *ds;
+ int error = dsl_dataset_hold(dp, dsname, tag, &ds);
+ (void) zcp_dataset_hold_error(state, dp, dsname, error);
+ return (ds);
+}
+
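+/*
+ * zfs.debug(string)
+ *
+ * Illustrative channel program usage (a sketch, not part of the original
+ * submission):
+ *
+ *     zfs.debug("running cleanup for " .. snap)
+ */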
+static int zcp_debug(lua_State *);
+static zcp_lib_info_t zcp_debug_info = {
+ .name = "debug",
+ .func = zcp_debug,
+ .pargs = {
+ { .za_name = "debug string", .za_lua_type = LUA_TSTRING},
+ {NULL, 0}
+ },
+ .kwargs = {
+ {NULL, 0}
+ }
+};
+
+static int
+zcp_debug(lua_State *state)
+{
+ const char *dbgstring;
+ zcp_run_info_t *ri = zcp_run_info(state);
+ zcp_lib_info_t *libinfo = &zcp_debug_info;
+
+ zcp_parse_args(state, libinfo->name, libinfo->pargs, libinfo->kwargs);
+
+ dbgstring = lua_tostring(state, 1);
+
+ zfs_dbgmsg("txg %lld ZCP: %s", ri->zri_tx->tx_txg, dbgstring);
+
+ return (0);
+}
+
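+/*
+ * zfs.exists(dataset)
+ *
+ * Illustrative usage (a sketch, not part of the original submission):
+ *
+ *     if zfs.exists("rpool/data") then
+ *         zfs.debug("rpool/data exists")
+ *     end
+ */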
+static int zcp_exists(lua_State *);
+static zcp_lib_info_t zcp_exists_info = {
+ .name = "exists",
+ .func = zcp_exists,
+ .pargs = {
+ { .za_name = "dataset", .za_lua_type = LUA_TSTRING},
+ {NULL, 0}
+ },
+ .kwargs = {
+ {NULL, 0}
+ }
+};
+
+static int
+zcp_exists(lua_State *state)
+{
+ zcp_run_info_t *ri = zcp_run_info(state);
+ dsl_pool_t *dp = ri->zri_pool;
+ zcp_lib_info_t *libinfo = &zcp_exists_info;
+
+ zcp_parse_args(state, libinfo->name, libinfo->pargs, libinfo->kwargs);
+
+ const char *dsname = lua_tostring(state, 1);
+
+ dsl_dataset_t *ds;
+ int error = dsl_dataset_hold(dp, dsname, FTAG, &ds);
+ if (error == 0) {
+ dsl_dataset_rele(ds, FTAG);
+ lua_pushboolean(state, B_TRUE);
+ } else if (error == ENOENT) {
+ lua_pushboolean(state, B_FALSE);
+ } else if (error == EXDEV) {
+ return (luaL_error(state, "dataset '%s' is not in the "
+ "target pool", dsname));
+ } else if (error == EIO) {
+ return (luaL_error(state, "I/O error opening dataset '%s'",
+ dsname));
+ } else if (error != 0) {
+ return (luaL_error(state, "unexpected error %d", error));
+ }
+
+ return (1);
+}
+
+/*
+ * Allocate/realloc/free a buffer for the lua interpreter.
+ *
+ * When nsize is 0, behaves as free() and returns NULL.
+ *
+ * If ptr is NULL, behaves as malloc() and returns an allocated buffer of size
+ * at least nsize.
+ *
+ * Otherwise, behaves as realloc(), changing the allocation from osize to nsize.
+ * Shrinking the buffer size never fails.
+ *
+ * The original allocated buffer size is stored as an int64_t at the
+ * beginning of the buffer to avoid actually reallocating when shrinking a
+ * buffer, since Lua requires that this operation never fail.
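+ *
+ * Illustrative layout of an allocation (a sketch derived from the
+ * description above):
+ *
+ *   allocbuf               pointer returned to Lua
+ *      |                      |
+ *      v                      v
+ *      +--------------------+---------------------------+
+ *      | int64_t: allocsize | nsize bytes usable by Lua |
+ *      +--------------------+---------------------------+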
+ */
+static void *
+zcp_lua_alloc(void *ud, void *ptr, size_t osize, size_t nsize)
+{
+ zcp_alloc_arg_t *allocargs = ud;
+ int flags = (allocargs->aa_must_succeed) ?
+ KM_SLEEP : (KM_NOSLEEP | KM_NORMALPRI);
+
+ if (nsize == 0) {
+ if (ptr != NULL) {
+ int64_t *allocbuf = (int64_t *)ptr - 1;
+ int64_t allocsize = *allocbuf;
+ ASSERT3S(allocsize, >, 0);
+ ASSERT3S(allocargs->aa_alloc_remaining + allocsize, <=,
+ allocargs->aa_alloc_limit);
+ allocargs->aa_alloc_remaining += allocsize;
+ kmem_free(allocbuf, allocsize);
+ }
+ return (NULL);
+ } else if (ptr == NULL) {
+ int64_t *allocbuf;
+ int64_t allocsize = nsize + sizeof (int64_t);
+
+ if (!allocargs->aa_must_succeed &&
+ (allocsize <= 0 ||
+ allocsize > allocargs->aa_alloc_remaining)) {
+ return (NULL);
+ }
+
+ allocbuf = kmem_alloc(allocsize, flags);
+ if (allocbuf == NULL) {
+ return (NULL);
+ }
+ allocargs->aa_alloc_remaining -= allocsize;
+
+ *allocbuf = allocsize;
+ return (allocbuf + 1);
+ } else if (nsize <= osize) {
+ /*
+ * If shrinking the buffer, lua requires that the reallocation
+ * never fail.
+ */
+ return (ptr);
+ } else {
+ ASSERT3U(nsize, >, osize);
+
+ uint64_t *luabuf = zcp_lua_alloc(ud, NULL, 0, nsize);
+ if (luabuf == NULL) {
+ return (NULL);
+ }
+ (void) memcpy(luabuf, ptr, osize);
+ VERIFY3P(zcp_lua_alloc(ud, ptr, osize, 0), ==, NULL);
+ return (luabuf);
+ }
+}
+
+/* ARGSUSED */
+static void
+zcp_lua_counthook(lua_State *state, lua_Debug *ar)
+{
+ lua_getfield(state, LUA_REGISTRYINDEX, ZCP_RUN_INFO_KEY);
+ zcp_run_info_t *ri = lua_touserdata(state, -1);
+
+ /*
+	 * Check if we were canceled while waiting for the
+	 * txg to sync or from our open-context thread.
+ */
+ if (ri->zri_canceled ||
+ (!ri->zri_sync && issig(JUSTLOOKING) && issig(FORREAL))) {
+ ri->zri_canceled = B_TRUE;
+ (void) lua_pushstring(state, "Channel program was canceled.");
+ (void) lua_error(state);
+ }
+
+ /*
+ * Check how many instructions the channel program has
+ * executed so far, and compare against the limit.
+ */
+ ri->zri_curinstrs += zfs_lua_check_instrlimit_interval;
+ if (ri->zri_maxinstrs != 0 && ri->zri_curinstrs > ri->zri_maxinstrs) {
+ ri->zri_timed_out = B_TRUE;
+ (void) lua_pushstring(state,
+ "Channel program timed out.");
+ (void) lua_error(state);
+ }
+}
+
+static int
+zcp_panic_cb(lua_State *state)
+{
+ panic("unprotected error in call to Lua API (%s)\n",
+ lua_tostring(state, -1));
+ return (0);
+}
+
+static void
+zcp_eval_impl(dmu_tx_t *tx, zcp_run_info_t *ri)
+{
+ int err;
+ lua_State *state = ri->zri_state;
+
+ VERIFY3U(3, ==, lua_gettop(state));
+
+ /* finish initializing our runtime state */
+ ri->zri_pool = dmu_tx_pool(tx);
+ ri->zri_tx = tx;
+ list_create(&ri->zri_cleanup_handlers, sizeof (zcp_cleanup_handler_t),
+ offsetof(zcp_cleanup_handler_t, zch_node));
+
+ /*
+ * Store the zcp_run_info_t struct for this run in the Lua registry.
+ * Registry entries are not directly accessible by the Lua scripts but
+ * can be accessed by our callbacks.
+ */
+ lua_pushlightuserdata(state, ri);
+ lua_setfield(state, LUA_REGISTRYINDEX, ZCP_RUN_INFO_KEY);
+ VERIFY3U(3, ==, lua_gettop(state));
+
+ /*
+	 * Tell the Lua interpreter to call our handler every
+	 * zfs_lua_check_instrlimit_interval instructions. Channel programs
+	 * that execute too many instructions should die with ETIME.
+ */
+ (void) lua_sethook(state, zcp_lua_counthook, LUA_MASKCOUNT,
+ zfs_lua_check_instrlimit_interval);
+
+ /*
+ * Tell the Lua memory allocator to stop using KM_SLEEP before handing
+ * off control to the channel program. Channel programs that use too
+ * much memory should die with ENOSPC.
+ */
+ ri->zri_allocargs->aa_must_succeed = B_FALSE;
+
+ /*
+ * Call the Lua function that open-context passed us. This pops the
+ * function and its input from the stack and pushes any return
+ * or error values.
+ */
+ err = lua_pcall(state, 1, LUA_MULTRET, 1);
+
+ /*
+ * Let Lua use KM_SLEEP while we interpret the return values.
+ */
+ ri->zri_allocargs->aa_must_succeed = B_TRUE;
+
+ /*
+ * Remove the error handler callback from the stack. At this point,
+ * there shouldn't be any cleanup handler registered in the handler
+	 * list (zri_cleanup_handlers), regardless of whether the channel
+	 * program completed successfully.
+ */
+ list_destroy(&ri->zri_cleanup_handlers);
+ lua_remove(state, 1);
+
+ switch (err) {
+ case LUA_OK: {
+ /*
+ * Lua supports returning multiple values in a single return
+ * statement. Return values will have been pushed onto the
+ * stack:
+ * 1: Return value 1
+ * 2: Return value 2
+ * 3: etc...
+ * To simplify the process of retrieving a return value from a
+ * channel program, we disallow returning more than one value
+ * to ZFS from the Lua script, yielding a singleton return
+ * nvlist of the form { "return": Return value 1 }.
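+		 *
+		 * For example (illustrative), a script ending in
+		 * "return 7" yields the nvlist { "return" -> 7 }.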
+ */
+ int return_count = lua_gettop(state);
+
+ if (return_count == 1) {
+ ri->zri_result = 0;
+ zcp_convert_return_values(state, ri->zri_outnvl,
+ ZCP_RET_RETURN, &ri->zri_result);
+ } else if (return_count > 1) {
+ ri->zri_result = SET_ERROR(ECHRNG);
+ lua_settop(state, 0);
+ (void) lua_pushfstring(state, "Multiple return "
+ "values not supported");
+ zcp_convert_return_values(state, ri->zri_outnvl,
+ ZCP_RET_ERROR, &ri->zri_result);
+ }
+ break;
+ }
+ case LUA_ERRRUN:
+ case LUA_ERRGCMM: {
+ /*
+ * The channel program encountered a fatal error within the
+ * script, such as failing an assertion, or calling a function
+ * with incompatible arguments. The error value and the
+ * traceback generated by zcp_error_handler() should be on the
+ * stack.
+ */
+ VERIFY3U(1, ==, lua_gettop(state));
+ if (ri->zri_timed_out) {
+ ri->zri_result = SET_ERROR(ETIME);
+ } else if (ri->zri_canceled) {
+ ri->zri_result = SET_ERROR(EINTR);
+ } else {
+ ri->zri_result = SET_ERROR(ECHRNG);
+ }
+
+ zcp_convert_return_values(state, ri->zri_outnvl,
+ ZCP_RET_ERROR, &ri->zri_result);
+ break;
+ }
+ case LUA_ERRERR: {
+ /*
+ * The channel program encountered a fatal error within the
+ * script, and we encountered another error while trying to
+ * compute the traceback in zcp_error_handler(). We can only
+ * return the error message.
+ */
+ VERIFY3U(1, ==, lua_gettop(state));
+ if (ri->zri_timed_out) {
+ ri->zri_result = SET_ERROR(ETIME);
+ } else if (ri->zri_canceled) {
+ ri->zri_result = SET_ERROR(EINTR);
+ } else {
+ ri->zri_result = SET_ERROR(ECHRNG);
+ }
+
+ zcp_convert_return_values(state, ri->zri_outnvl,
+ ZCP_RET_ERROR, &ri->zri_result);
+ break;
+ }
+ case LUA_ERRMEM:
+ /*
+ * Lua ran out of memory while running the channel program.
+ * There's not much we can do.
+ */
+ ri->zri_result = SET_ERROR(ENOSPC);
+ break;
+ default:
+ VERIFY0(err);
+ }
+}
+
+static void
+zcp_pool_error(zcp_run_info_t *ri, const char *poolname)
+{
+ ri->zri_result = SET_ERROR(ECHRNG);
+ lua_settop(ri->zri_state, 0);
+ (void) lua_pushfstring(ri->zri_state, "Could not open pool: %s",
+ poolname);
+ zcp_convert_return_values(ri->zri_state, ri->zri_outnvl,
+ ZCP_RET_ERROR, &ri->zri_result);
+}
+
+/*
+ * This callback is called when txg_wait_synced_sig encounters a signal.
+ * txg_wait_synced_sig will continue to wait for the txg to complete
+ * after calling this callback.
+ */
+/* ARGSUSED */
+static void
+zcp_eval_sig(void *arg, dmu_tx_t *tx)
+{
+ zcp_run_info_t *ri = arg;
+
+ ri->zri_canceled = B_TRUE;
+}
+
+static void
+zcp_eval_sync(void *arg, dmu_tx_t *tx)
+{
+ zcp_run_info_t *ri = arg;
+
+ /*
+ * Open context should have setup the stack to contain:
+ * 1: Error handler callback
+ * 2: Script to run (converted to a Lua function)
+ * 3: nvlist input to function (converted to Lua table or nil)
+ */
+ VERIFY3U(3, ==, lua_gettop(ri->zri_state));
+
+ zcp_eval_impl(tx, ri);
+}
+
+static void
+zcp_eval_open(zcp_run_info_t *ri, const char *poolname)
+{
+ int error;
+ dsl_pool_t *dp;
+ dmu_tx_t *tx;
+
+ /*
+ * See comment from the same assertion in zcp_eval_sync().
+ */
+ VERIFY3U(3, ==, lua_gettop(ri->zri_state));
+
+ error = dsl_pool_hold(poolname, FTAG, &dp);
+ if (error != 0) {
+ zcp_pool_error(ri, poolname);
+ return;
+ }
+
+ /*
+ * As we are running in open-context, we have no transaction associated
+ * with the channel program. At the same time, functions from the
+ * zfs.check submodule need to be associated with a transaction as
+ * they are basically dry-runs of their counterparts in the zfs.sync
+ * submodule. These functions should be able to run in open-context.
+ * Therefore we create a new transaction that we later abort once
+ * the channel program has been evaluated.
+ */
+ tx = dmu_tx_create_dd(dp->dp_mos_dir);
+
+ zcp_eval_impl(tx, ri);
+
+ dmu_tx_abort(tx);
+
+ dsl_pool_rele(dp, FTAG);
+}
+
+int
+zcp_eval(const char *poolname, const char *program, boolean_t sync,
+ uint64_t instrlimit, uint64_t memlimit, nvpair_t *nvarg, nvlist_t *outnvl)
+{
+ int err;
+ lua_State *state;
+ zcp_run_info_t runinfo;
+
+ if (instrlimit > zfs_lua_max_instrlimit)
+ return (SET_ERROR(EINVAL));
+ if (memlimit == 0 || memlimit > zfs_lua_max_memlimit)
+ return (SET_ERROR(EINVAL));
+
+ zcp_alloc_arg_t allocargs = {
+ .aa_must_succeed = B_TRUE,
+ .aa_alloc_remaining = (int64_t)memlimit,
+ .aa_alloc_limit = (int64_t)memlimit,
+ };
+
+ /*
+ * Creates a Lua state with a memory allocator that uses KM_SLEEP.
+ * This should never fail.
+ */
+ state = lua_newstate(zcp_lua_alloc, &allocargs);
+ VERIFY(state != NULL);
+ (void) lua_atpanic(state, zcp_panic_cb);
+
+ /*
+ * Load core Lua libraries we want access to.
+ */
+ VERIFY3U(1, ==, luaopen_base(state));
+ lua_pop(state, 1);
+ VERIFY3U(1, ==, luaopen_coroutine(state));
+ lua_setglobal(state, LUA_COLIBNAME);
+ VERIFY0(lua_gettop(state));
+ VERIFY3U(1, ==, luaopen_string(state));
+ lua_setglobal(state, LUA_STRLIBNAME);
+ VERIFY0(lua_gettop(state));
+ VERIFY3U(1, ==, luaopen_table(state));
+ lua_setglobal(state, LUA_TABLIBNAME);
+ VERIFY0(lua_gettop(state));
+
+ /*
+ * Load globally visible variables such as errno aliases.
+ */
+ zcp_load_globals(state);
+ VERIFY0(lua_gettop(state));
+
+ /*
+ * Load ZFS-specific modules.
+ */
+ lua_newtable(state);
+ VERIFY3U(1, ==, zcp_load_list_lib(state));
+ lua_setfield(state, -2, "list");
+ VERIFY3U(1, ==, zcp_load_synctask_lib(state, B_FALSE));
+ lua_setfield(state, -2, "check");
+ VERIFY3U(1, ==, zcp_load_synctask_lib(state, B_TRUE));
+ lua_setfield(state, -2, "sync");
+ VERIFY3U(1, ==, zcp_load_get_lib(state));
+ lua_pushcclosure(state, zcp_debug_info.func, 0);
+ lua_setfield(state, -2, zcp_debug_info.name);
+ lua_pushcclosure(state, zcp_exists_info.func, 0);
+ lua_setfield(state, -2, zcp_exists_info.name);
+ lua_setglobal(state, "zfs");
+ VERIFY0(lua_gettop(state));
+
+ /*
+ * Push the error-callback that calculates Lua stack traces on
+ * unexpected failures.
+ */
+ lua_pushcfunction(state, zcp_error_handler);
+ VERIFY3U(1, ==, lua_gettop(state));
+
+ /*
+	 * Load the actual script onto the stack as a function, as text ("t").
+ * The only valid error condition is a syntax error in the script.
+ * ERRMEM should not be possible because our allocator is using
+ * KM_SLEEP. ERRGCMM should not be possible because we have not added
+ * any objects with __gc metamethods to the interpreter that could
+ * fail.
+ */
+ err = luaL_loadbufferx(state, program, strlen(program),
+ "channel program", "t");
+ if (err == LUA_ERRSYNTAX) {
+ fnvlist_add_string(outnvl, ZCP_RET_ERROR,
+ lua_tostring(state, -1));
+ lua_close(state);
+ return (SET_ERROR(EINVAL));
+ }
+ VERIFY0(err);
+ VERIFY3U(2, ==, lua_gettop(state));
+
+ /*
+ * Convert the input nvlist to a Lua object and put it on top of the
+ * stack.
+ */
+ char errmsg[128];
+ err = zcp_nvpair_value_to_lua(state, nvarg,
+ errmsg, sizeof (errmsg));
+ if (err != 0) {
+ fnvlist_add_string(outnvl, ZCP_RET_ERROR, errmsg);
+ lua_close(state);
+ return (SET_ERROR(EINVAL));
+ }
+ VERIFY3U(3, ==, lua_gettop(state));
+
+ runinfo.zri_state = state;
+ runinfo.zri_allocargs = &allocargs;
+ runinfo.zri_outnvl = outnvl;
+ runinfo.zri_result = 0;
+ runinfo.zri_cred = CRED();
+ runinfo.zri_timed_out = B_FALSE;
+ runinfo.zri_canceled = B_FALSE;
+ runinfo.zri_sync = sync;
+ runinfo.zri_space_used = 0;
+ runinfo.zri_curinstrs = 0;
+ runinfo.zri_maxinstrs = instrlimit;
+
+ if (sync) {
+ err = dsl_sync_task_sig(poolname, NULL, zcp_eval_sync,
+ zcp_eval_sig, &runinfo, 0, ZFS_SPACE_CHECK_ZCP_EVAL);
+ if (err != 0)
+ zcp_pool_error(&runinfo, poolname);
+ } else {
+ zcp_eval_open(&runinfo, poolname);
+ }
+ lua_close(state);
+
+ return (runinfo.zri_result);
+}
+
+/*
+ * Retrieve metadata about the currently running channel program.
+ */
+zcp_run_info_t *
+zcp_run_info(lua_State *state)
+{
+ zcp_run_info_t *ri;
+
+ lua_getfield(state, LUA_REGISTRYINDEX, ZCP_RUN_INFO_KEY);
+ ri = lua_touserdata(state, -1);
+ lua_pop(state, 1);
+ return (ri);
+}
+
+/*
+ * Argument Parsing
+ * ================
+ *
+ * The Lua language allows methods to be called with any number
+ * of arguments of any type. When calling back into ZFS we need to sanitize
+ * arguments from channel programs to make sure unexpected arguments or
+ * arguments of the wrong type result in clear error messages. To do this
+ * in a uniform way all callbacks from channel programs should use the
+ * zcp_parse_args() function to interpret inputs.
+ *
+ * Positional vs Keyword Arguments
+ * ===============================
+ *
+ * Every callback function takes a fixed set of required positional arguments
+ * and optional keyword arguments. For example, the destroy function takes
+ * a single positional string argument (the name of the dataset to destroy)
+ * and an optional "defer" keyword boolean argument. When calling lua functions
+ * with parentheses, only positional arguments can be used:
+ *
+ * zfs.sync.snapshot("rpool@snap")
+ *
+ * To use keyword arguments, functions should be called with a single argument
+ * that is a Lua table containing mappings of integer -> positional arguments
+ * and string -> keyword arguments:
+ *
+ *     zfs.sync.snapshot({[1]="rpool@snap", defer=true})
+ *
+ * The Lua language allows curly braces to be used in place of parentheses as
+ * syntactic sugar for this calling convention:
+ *
+ * zfs.sync.snapshot{"rpool@snap", defer=true}
+ */
+
+/*
+ * Throw an error and print the given arguments. If there are too many
+ * arguments to fit in the output buffer, only the error format string is
+ * output.
+ */
+static void
+zcp_args_error(lua_State *state, const char *fname, const zcp_arg_t *pargs,
+ const zcp_arg_t *kwargs, const char *fmt, ...)
+{
+ int i;
+ char errmsg[512];
+ size_t len = sizeof (errmsg);
+ size_t msglen = 0;
+ va_list argp;
+
+ va_start(argp, fmt);
+ VERIFY3U(len, >, vsnprintf(errmsg, len, fmt, argp));
+ va_end(argp);
+
+ /*
+ * Calculate the total length of the final string, including extra
+ * formatting characters. If the argument dump would be too large,
+ * only print the error string.
+ */
+ msglen = strlen(errmsg);
+ msglen += strlen(fname) + 4; /* : + {} + null terminator */
+ for (i = 0; pargs[i].za_name != NULL; i++) {
+ msglen += strlen(pargs[i].za_name);
+ msglen += strlen(lua_typename(state, pargs[i].za_lua_type));
+ if (pargs[i + 1].za_name != NULL || kwargs[0].za_name != NULL)
+ msglen += 5; /* < + ( + )> + , */
+ else
+ msglen += 4; /* < + ( + )> */
+ }
+ for (i = 0; kwargs[i].za_name != NULL; i++) {
+ msglen += strlen(kwargs[i].za_name);
+ msglen += strlen(lua_typename(state, kwargs[i].za_lua_type));
+ if (kwargs[i + 1].za_name != NULL)
+ msglen += 4; /* =( + ) + , */
+ else
+ msglen += 3; /* =( + ) */
+ }
+
+ if (msglen >= len)
+		(void) luaL_error(state, "%s", errmsg);
+
+ VERIFY3U(len, >, strlcat(errmsg, ": ", len));
+ VERIFY3U(len, >, strlcat(errmsg, fname, len));
+ VERIFY3U(len, >, strlcat(errmsg, "{", len));
+ for (i = 0; pargs[i].za_name != NULL; i++) {
+ VERIFY3U(len, >, strlcat(errmsg, "<", len));
+ VERIFY3U(len, >, strlcat(errmsg, pargs[i].za_name, len));
+ VERIFY3U(len, >, strlcat(errmsg, "(", len));
+ VERIFY3U(len, >, strlcat(errmsg,
+ lua_typename(state, pargs[i].za_lua_type), len));
+ VERIFY3U(len, >, strlcat(errmsg, ")>", len));
+ if (pargs[i + 1].za_name != NULL || kwargs[0].za_name != NULL) {
+ VERIFY3U(len, >, strlcat(errmsg, ", ", len));
+ }
+ }
+ for (i = 0; kwargs[i].za_name != NULL; i++) {
+ VERIFY3U(len, >, strlcat(errmsg, kwargs[i].za_name, len));
+ VERIFY3U(len, >, strlcat(errmsg, "=(", len));
+ VERIFY3U(len, >, strlcat(errmsg,
+ lua_typename(state, kwargs[i].za_lua_type), len));
+ VERIFY3U(len, >, strlcat(errmsg, ")", len));
+ if (kwargs[i + 1].za_name != NULL) {
+ VERIFY3U(len, >, strlcat(errmsg, ", ", len));
+ }
+ }
+ VERIFY3U(len, >, strlcat(errmsg, "}", len));
+
+	(void) luaL_error(state, "%s", errmsg);
+ panic("unreachable code");
+}
+
+static void
+zcp_parse_table_args(lua_State *state, const char *fname,
+ const zcp_arg_t *pargs, const zcp_arg_t *kwargs)
+{
+ int i;
+ int type;
+
+ for (i = 0; pargs[i].za_name != NULL; i++) {
+ /*
+ * Check the table for this positional argument, leaving it
+ * on the top of the stack once we finish validating it.
+ */
+ lua_pushinteger(state, i + 1);
+ lua_gettable(state, 1);
+
+ type = lua_type(state, -1);
+ if (type == LUA_TNIL) {
+ zcp_args_error(state, fname, pargs, kwargs,
+ "too few arguments");
+ panic("unreachable code");
+ } else if (type != pargs[i].za_lua_type) {
+ zcp_args_error(state, fname, pargs, kwargs,
+ "arg %d wrong type (is '%s', expected '%s')",
+ i + 1, lua_typename(state, type),
+ lua_typename(state, pargs[i].za_lua_type));
+ panic("unreachable code");
+ }
+
+ /*
+ * Remove the positional argument from the table.
+ */
+ lua_pushinteger(state, i + 1);
+ lua_pushnil(state);
+ lua_settable(state, 1);
+ }
+
+ for (i = 0; kwargs[i].za_name != NULL; i++) {
+ /*
+ * Check the table for this keyword argument, which may be
+ * nil if it was omitted. Leave the value on the top of
+ * the stack after validating it.
+ */
+ lua_getfield(state, 1, kwargs[i].za_name);
+
+ type = lua_type(state, -1);
+ if (type != LUA_TNIL && type != kwargs[i].za_lua_type) {
+ zcp_args_error(state, fname, pargs, kwargs,
+ "kwarg '%s' wrong type (is '%s', expected '%s')",
+ kwargs[i].za_name, lua_typename(state, type),
+ lua_typename(state, kwargs[i].za_lua_type));
+ panic("unreachable code");
+ }
+
+ /*
+ * Remove the keyword argument from the table.
+ */
+ lua_pushnil(state);
+ lua_setfield(state, 1, kwargs[i].za_name);
+ }
+
+ /*
+	 * Any entries remaining in the table are invalid inputs; print
+	 * an error message based on what the entry is.
+ */
+ lua_pushnil(state);
+ if (lua_next(state, 1)) {
+ if (lua_isnumber(state, -2) && lua_tointeger(state, -2) > 0) {
+ zcp_args_error(state, fname, pargs, kwargs,
+ "too many positional arguments");
+ } else if (lua_isstring(state, -2)) {
+ zcp_args_error(state, fname, pargs, kwargs,
+ "invalid kwarg '%s'", lua_tostring(state, -2));
+ } else {
+ zcp_args_error(state, fname, pargs, kwargs,
+ "kwarg keys must be strings");
+ }
+ panic("unreachable code");
+ }
+
+ lua_remove(state, 1);
+}
+
+static void
+zcp_parse_pos_args(lua_State *state, const char *fname, const zcp_arg_t *pargs,
+ const zcp_arg_t *kwargs)
+{
+ int i;
+ int type;
+
+ for (i = 0; pargs[i].za_name != NULL; i++) {
+ type = lua_type(state, i + 1);
+ if (type == LUA_TNONE) {
+ zcp_args_error(state, fname, pargs, kwargs,
+ "too few arguments");
+ panic("unreachable code");
+ } else if (type != pargs[i].za_lua_type) {
+ zcp_args_error(state, fname, pargs, kwargs,
+ "arg %d wrong type (is '%s', expected '%s')",
+ i + 1, lua_typename(state, type),
+ lua_typename(state, pargs[i].za_lua_type));
+ panic("unreachable code");
+ }
+ }
+ if (lua_gettop(state) != i) {
+ zcp_args_error(state, fname, pargs, kwargs,
+ "too many positional arguments");
+ panic("unreachable code");
+ }
+
+ for (i = 0; kwargs[i].za_name != NULL; i++) {
+ lua_pushnil(state);
+ }
+}
+
+/*
+ * Checks the current Lua stack against an expected set of positional and
+ * keyword arguments. If the stack does not match the expected arguments,
+ * it aborts the current channel program with a useful error message;
+ * otherwise it rearranges the stack so that it contains the positional
+ * arguments
+ * followed by the keyword argument values in declaration order. Any missing
+ * keyword argument will be represented by a nil value on the stack.
+ *
+ * If the stack contains exactly one argument of type LUA_TTABLE the curly
+ * braces calling convention is assumed, otherwise the stack is parsed for
+ * positional arguments only.
+ *
+ * This function should be used by every function callback. It should be called
+ * before the callback manipulates the Lua stack as it assumes the stack
+ * represents the function arguments.
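+ *
+ * For example (illustrative), a callback declared with one positional
+ * string argument and one keyword boolean argument, invoked as
+ * func{"rpool@snap"}, leaves the stack as:
+ *
+ *   1: "rpool@snap"
+ *   2: nil (the omitted keyword argument)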
+ */
+void
+zcp_parse_args(lua_State *state, const char *fname, const zcp_arg_t *pargs,
+ const zcp_arg_t *kwargs)
+{
+ if (lua_gettop(state) == 1 && lua_istable(state, 1)) {
+ zcp_parse_table_args(state, fname, pargs, kwargs);
+ } else {
+ zcp_parse_pos_args(state, fname, pargs, kwargs);
+ }
+}
diff --git a/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/zcp_get.c b/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/zcp_get.c
new file mode 100644
index 000000000000..dcba02c508b0
--- /dev/null
+++ b/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/zcp_get.c
@@ -0,0 +1,865 @@
+/*
+ * CDDL HEADER START
+ *
+ * This file and its contents are supplied under the terms of the
+ * Common Development and Distribution License ("CDDL"), version 1.0.
+ * You may only use this file in accordance with the terms of version
+ * 1.0 of the CDDL.
+ *
+ * A full copy of the text of the CDDL should have accompanied this
+ * source. A copy of the CDDL is also available via the Internet at
+ * http://www.illumos.org/license/CDDL.
+ *
+ * CDDL HEADER END
+ */
+
+/*
+ * Copyright (c) 2016, 2017 by Delphix. All rights reserved.
+ */
+
+#include "lua.h"
+#include "lualib.h"
+#include "lauxlib.h"
+
+#include <zfs_prop.h>
+
+#include <sys/dsl_prop.h>
+#include <sys/dsl_synctask.h>
+#include <sys/dsl_dataset.h>
+#include <sys/dsl_dir.h>
+#include <sys/dmu_objset.h>
+#include <sys/mntent.h>
+#include <sys/sunddi.h>
+#include <sys/zap.h>
+#include <sys/zcp.h>
+#include <sys/zcp_iter.h>
+#include <sys/zcp_global.h>
+#include <sys/zfs_ioctl.h>
+#include <sys/zfs_znode.h>
+#include <sys/zvol.h>
+
+#ifdef _KERNEL
+#include <sys/zfs_vfsops.h>
+#endif
+
+static int
+get_objset_type(dsl_dataset_t *ds, zfs_type_t *type)
+{
+ int error;
+ objset_t *os;
+ error = dmu_objset_from_ds(ds, &os);
+ if (error != 0)
+ return (error);
+ if (ds->ds_is_snapshot) {
+ *type = ZFS_TYPE_SNAPSHOT;
+ } else {
+ switch (os->os_phys->os_type) {
+ case DMU_OST_ZFS:
+ *type = ZFS_TYPE_FILESYSTEM;
+ break;
+ case DMU_OST_ZVOL:
+ *type = ZFS_TYPE_VOLUME;
+ break;
+ default:
+ return (EINVAL);
+ }
+ }
+ return (0);
+}
+
+/*
+ * Returns the string name of ds's type in str (a buffer which should be
+ * at least 12 bytes long).
+ */
+static int
+get_objset_type_name(dsl_dataset_t *ds, char *str)
+{
+ int error;
+ zfs_type_t type;
+ error = get_objset_type(ds, &type);
+ if (error != 0)
+ return (error);
+ switch (type) {
+ case ZFS_TYPE_SNAPSHOT:
+ (void) strcpy(str, "snapshot");
+ break;
+ case ZFS_TYPE_FILESYSTEM:
+ (void) strcpy(str, "filesystem");
+ break;
+ case ZFS_TYPE_VOLUME:
+ (void) strcpy(str, "volume");
+ break;
+ default:
+ return (EINVAL);
+ }
+ return (0);
+}
+
+/*
+ * Determines the source of a property given its setpoint and
+ * property type, and pushes that source onto the Lua stack.
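+ *
+ * For example (illustrative), a setpoint of "rpool/parent" is pushed
+ * verbatim, an empty setpoint is pushed as "default", and read-only
+ * properties have no source, so nil is pushed.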
+ */
+static void
+get_prop_src(lua_State *state, const char *setpoint, zfs_prop_t prop)
+{
+ if (zfs_prop_readonly(prop) || (prop == ZFS_PROP_VERSION)) {
+ lua_pushnil(state);
+ } else {
+ const char *src;
+ if (strcmp("", setpoint) == 0) {
+ src = "default";
+ } else {
+ src = setpoint;
+ }
+ (void) lua_pushstring(state, src);
+ }
+}
+
+/*
+ * Given an error encountered while getting properties, either longjmps for
+ * a fatal error or pushes nothing to the stack for a non-fatal one.
+ */
+static int
+zcp_handle_error(lua_State *state, const char *dataset_name,
+ const char *property_name, int error)
+{
+ ASSERT3S(error, !=, 0);
+ if (error == ENOENT) {
+ return (0);
+ } else if (error == EINVAL) {
+ return (luaL_error(state,
+ "property '%s' is not a valid property on dataset '%s'",
+ property_name, dataset_name));
+ } else if (error == EIO) {
+ return (luaL_error(state,
+ "I/O error while retrieving property '%s' on dataset '%s'",
+ property_name, dataset_name));
+ } else {
+ return (luaL_error(state, "unexpected error %d while "
+ "retrieving property '%s' on dataset '%s'",
+ error, property_name, dataset_name));
+ }
+}
+
+/*
+ * Look up a user defined property in the zap object. If it exists, push it
+ * and the setpoint onto the stack, otherwise don't push anything.
+ */
+static int
+zcp_get_user_prop(lua_State *state, dsl_pool_t *dp, const char *dataset_name,
+ const char *property_name)
+{
+ int error;
+ char *buf;
+ char setpoint[ZFS_MAX_DATASET_NAME_LEN];
+ /*
+ * zcp_dataset_hold will either successfully return the requested
+ * dataset or throw a lua error and longjmp out of the zfs.get_prop call
+ * without returning.
+ */
+ dsl_dataset_t *ds = zcp_dataset_hold(state, dp, dataset_name, FTAG);
+ if (ds == NULL)
+ return (1); /* not reached; zcp_dataset_hold() longjmp'd */
+
+ buf = kmem_alloc(ZAP_MAXVALUELEN, KM_SLEEP);
+ error = dsl_prop_get_ds(ds, property_name, 1, ZAP_MAXVALUELEN,
+ buf, setpoint);
+ dsl_dataset_rele(ds, FTAG);
+
+ if (error != 0) {
+ kmem_free(buf, ZAP_MAXVALUELEN);
+ return (zcp_handle_error(state, dataset_name, property_name,
+ error));
+ }
+ (void) lua_pushstring(state, buf);
+ (void) lua_pushstring(state, setpoint);
+ kmem_free(buf, ZAP_MAXVALUELEN);
+ return (2);
+}
+
+/*
+ * Check if the property we're looking for is stored in the ds_dir. If so,
+ * return it in the 'val' argument. Return 0 on success, or ENOENT if
+ * the property is not present.
+ */
+static int
+get_dsl_dir_prop(dsl_dataset_t *ds, zfs_prop_t zfs_prop,
+ uint64_t *val)
+{
+ dsl_dir_t *dd = ds->ds_dir;
+ mutex_enter(&dd->dd_lock);
+ switch (zfs_prop) {
+ case ZFS_PROP_USEDSNAP:
+ *val = dsl_dir_get_usedsnap(dd);
+ break;
+ case ZFS_PROP_USEDCHILD:
+ *val = dsl_dir_get_usedchild(dd);
+ break;
+ case ZFS_PROP_USEDDS:
+ *val = dsl_dir_get_usedds(dd);
+ break;
+ case ZFS_PROP_USEDREFRESERV:
+ *val = dsl_dir_get_usedrefreserv(dd);
+ break;
+ case ZFS_PROP_LOGICALUSED:
+ *val = dsl_dir_get_logicalused(dd);
+ break;
+ default:
+ mutex_exit(&dd->dd_lock);
+ return (ENOENT);
+ }
+ mutex_exit(&dd->dd_lock);
+ return (0);
+}
+
+/*
+ * Takes a dataset, a property, a value, and that value's setpoint as
+ * found in the ZAP. Checks if the property has been changed in the vfs.
+ * If so, val and setpoint will be overwritten with updated content.
+ * Otherwise, they are left unchanged.
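+ *
+ * For example (illustrative), if atime=on is stored in the ZAP but the
+ * filesystem is currently mounted with the "noatime" option, *val is
+ * rewritten to 0 and setpoint becomes "temporary".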
+ */
+static int
+get_temporary_prop(dsl_dataset_t *ds, zfs_prop_t zfs_prop, uint64_t *val,
+ char *setpoint)
+{
+#ifndef _KERNEL
+ return (0);
+#else
+ int error;
+#ifdef illumos
+ zfsvfs_t *zfvp;
+#endif
+ vfs_t *vfsp;
+ objset_t *os;
+ uint64_t tmp = *val;
+
+ error = dmu_objset_from_ds(ds, &os);
+ if (error != 0)
+ return (error);
+
+ error = getzfsvfs_impl(os, &vfsp);
+ if (error != 0)
+ return (error);
+#ifdef illumos
+ vfsp = zfvp->z_vfs;
+#endif
+ switch (zfs_prop) {
+ case ZFS_PROP_ATIME:
+ if (vfs_optionisset(vfsp, MNTOPT_NOATIME, NULL))
+ tmp = 0;
+ if (vfs_optionisset(vfsp, MNTOPT_ATIME, NULL))
+ tmp = 1;
+ break;
+ case ZFS_PROP_DEVICES:
+ if (vfs_optionisset(vfsp, MNTOPT_NODEVICES, NULL))
+ tmp = 0;
+ if (vfs_optionisset(vfsp, MNTOPT_DEVICES, NULL))
+ tmp = 1;
+ break;
+ case ZFS_PROP_EXEC:
+ if (vfs_optionisset(vfsp, MNTOPT_NOEXEC, NULL))
+ tmp = 0;
+ if (vfs_optionisset(vfsp, MNTOPT_EXEC, NULL))
+ tmp = 1;
+ break;
+ case ZFS_PROP_SETUID:
+ if (vfs_optionisset(vfsp, MNTOPT_NOSETUID, NULL))
+ tmp = 0;
+ if (vfs_optionisset(vfsp, MNTOPT_SETUID, NULL))
+ tmp = 1;
+ break;
+ case ZFS_PROP_READONLY:
+ if (vfs_optionisset(vfsp, MNTOPT_RW, NULL))
+ tmp = 0;
+ if (vfs_optionisset(vfsp, MNTOPT_RO, NULL))
+ tmp = 1;
+ break;
+ case ZFS_PROP_XATTR:
+ if (vfs_optionisset(vfsp, MNTOPT_NOXATTR, NULL))
+ tmp = 0;
+ if (vfs_optionisset(vfsp, MNTOPT_XATTR, NULL))
+ tmp = 1;
+ break;
+ case ZFS_PROP_NBMAND:
+ if (vfs_optionisset(vfsp, MNTOPT_NONBMAND, NULL))
+ tmp = 0;
+ if (vfs_optionisset(vfsp, MNTOPT_NBMAND, NULL))
+ tmp = 1;
+ break;
+ default:
+#ifdef illumos
+ VFS_RELE(vfsp);
+#else
+ vfs_rel(vfsp);
+#endif
+ return (ENOENT);
+ }
+
+#ifdef illumos
+ VFS_RELE(vfsp);
+#else
+ vfs_rel(vfsp);
+#endif
+ if (tmp != *val) {
+ (void) strcpy(setpoint, "temporary");
+ *val = tmp;
+ }
+ return (0);
+#endif
+}
+
+/*
+ * Check if the property we're looking for is stored at the dsl_dataset or
+ * dsl_dir level. If so, push the property value and source onto the lua stack
+ * and return 0. If it is not present or a failure occurs in lookup, return a
+ * non-zero error value.
+ */
+static int
+get_special_prop(lua_State *state, dsl_dataset_t *ds, const char *dsname,
+ zfs_prop_t zfs_prop)
+{
+ int error = 0;
+ objset_t *os;
+ uint64_t numval;
+ char *strval = kmem_alloc(ZAP_MAXVALUELEN, KM_SLEEP);
+ char setpoint[ZFS_MAX_DATASET_NAME_LEN] =
+ "Internal error - setpoint not determined";
+ zfs_type_t ds_type;
+ zprop_type_t prop_type = zfs_prop_get_type(zfs_prop);
+ (void) get_objset_type(ds, &ds_type);
+
+ switch (zfs_prop) {
+ case ZFS_PROP_REFRATIO:
+ numval = dsl_get_refratio(ds);
+ break;
+ case ZFS_PROP_USED:
+ numval = dsl_get_used(ds);
+ break;
+ case ZFS_PROP_CLONES: {
+ nvlist_t *clones = fnvlist_alloc();
+ error = get_clones_stat_impl(ds, clones);
+ if (error == 0) {
+ /* push list to lua stack */
+ VERIFY0(zcp_nvlist_to_lua(state, clones, NULL, 0));
+ /* source */
+ (void) lua_pushnil(state);
+ }
+ nvlist_free(clones);
+ kmem_free(strval, ZAP_MAXVALUELEN);
+ return (error);
+ }
+ case ZFS_PROP_COMPRESSRATIO:
+ numval = dsl_get_compressratio(ds);
+ break;
+ case ZFS_PROP_CREATION:
+ numval = dsl_get_creation(ds);
+ break;
+ case ZFS_PROP_REFERENCED:
+ numval = dsl_get_referenced(ds);
+ break;
+ case ZFS_PROP_AVAILABLE:
+ numval = dsl_get_available(ds);
+ break;
+ case ZFS_PROP_LOGICALREFERENCED:
+ numval = dsl_get_logicalreferenced(ds);
+ break;
+ case ZFS_PROP_CREATETXG:
+ numval = dsl_get_creationtxg(ds);
+ break;
+ case ZFS_PROP_GUID:
+ numval = dsl_get_guid(ds);
+ break;
+ case ZFS_PROP_UNIQUE:
+ numval = dsl_get_unique(ds);
+ break;
+ case ZFS_PROP_OBJSETID:
+ numval = dsl_get_objsetid(ds);
+ break;
+ case ZFS_PROP_ORIGIN:
+ dsl_dir_get_origin(ds->ds_dir, strval);
+ break;
+ case ZFS_PROP_USERACCOUNTING:
+ error = dmu_objset_from_ds(ds, &os);
+ if (error == 0)
+ numval = dmu_objset_userspace_present(os);
+ break;
+ case ZFS_PROP_WRITTEN:
+ error = dsl_get_written(ds, &numval);
+ break;
+ case ZFS_PROP_TYPE:
+ error = get_objset_type_name(ds, strval);
+ break;
+ case ZFS_PROP_PREV_SNAP:
+ error = dsl_get_prev_snap(ds, strval);
+ break;
+ case ZFS_PROP_NAME:
+ dsl_dataset_name(ds, strval);
+ break;
+ case ZFS_PROP_MOUNTPOINT:
+ error = dsl_get_mountpoint(ds, dsname, strval, setpoint);
+ break;
+ case ZFS_PROP_VERSION:
+ /* should be a snapshot or filesystem */
+ ASSERT(ds_type != ZFS_TYPE_VOLUME);
+ error = dmu_objset_from_ds(ds, &os);
+ /* look in the master node for the version */
+ if (error == 0) {
+ error = zap_lookup(os, MASTER_NODE_OBJ, ZPL_VERSION_STR,
+ sizeof (numval), 1, &numval);
+ }
+ break;
+ case ZFS_PROP_DEFER_DESTROY:
+ numval = dsl_get_defer_destroy(ds);
+ break;
+ case ZFS_PROP_USERREFS:
+ numval = dsl_get_userrefs(ds);
+ break;
+ case ZFS_PROP_FILESYSTEM_COUNT:
+ error = dsl_dir_get_filesystem_count(ds->ds_dir, &numval);
+ (void) strcpy(setpoint, "");
+ break;
+ case ZFS_PROP_SNAPSHOT_COUNT:
+ error = dsl_dir_get_snapshot_count(ds->ds_dir, &numval);
+ (void) strcpy(setpoint, "");
+ break;
+ case ZFS_PROP_REMAPTXG:
+ error = dsl_dir_get_remaptxg(ds->ds_dir, &numval);
+ break;
+ case ZFS_PROP_NUMCLONES:
+ numval = dsl_get_numclones(ds);
+ break;
+ case ZFS_PROP_INCONSISTENT:
+ numval = dsl_get_inconsistent(ds);
+ break;
+ case ZFS_PROP_RECEIVE_RESUME_TOKEN: {
+ char *token = get_receive_resume_stats_impl(ds);
+ VERIFY3U(strlcpy(strval, token, ZAP_MAXVALUELEN), <,
+ ZAP_MAXVALUELEN);
+ strfree(token);
+ if (strcmp(strval, "") == 0) {
+ token = get_child_receive_stats(ds);
+ VERIFY3U(strlcpy(strval, token, ZAP_MAXVALUELEN), <,
+ ZAP_MAXVALUELEN);
+ strfree(token);
+ if (strcmp(strval, "") == 0)
+ error = ENOENT;
+ }
+ break;
+ }
+ case ZFS_PROP_VOLSIZE:
+ ASSERT(ds_type == ZFS_TYPE_VOLUME);
+ error = dmu_objset_from_ds(ds, &os);
+ if (error == 0) {
+ error = zap_lookup(os, ZVOL_ZAP_OBJ, "size",
+ sizeof (numval), 1, &numval);
+ }
+ if (error == 0)
+ (void) strcpy(setpoint, dsname);
+
+ break;
+ case ZFS_PROP_VOLBLOCKSIZE: {
+ ASSERT(ds_type == ZFS_TYPE_VOLUME);
+ dmu_object_info_t doi;
+ error = dmu_objset_from_ds(ds, &os);
+ if (error == 0) {
+ error = dmu_object_info(os, ZVOL_OBJ, &doi);
+ if (error == 0)
+ numval = doi.doi_data_block_size;
+ }
+ break;
+ }
+ default:
+ /* Did not match these props, check in the dsl_dir */
+ error = get_dsl_dir_prop(ds, zfs_prop, &numval);
+ }
+ if (error != 0) {
+ kmem_free(strval, ZAP_MAXVALUELEN);
+ return (error);
+ }
+
+ switch (prop_type) {
+ case PROP_TYPE_NUMBER: {
+ (void) lua_pushnumber(state, numval);
+ break;
+ }
+ case PROP_TYPE_STRING: {
+ (void) lua_pushstring(state, strval);
+ break;
+ }
+ case PROP_TYPE_INDEX: {
+ const char *propval;
+ error = zfs_prop_index_to_string(zfs_prop, numval, &propval);
+ if (error != 0) {
+ kmem_free(strval, ZAP_MAXVALUELEN);
+ return (error);
+ }
+ (void) lua_pushstring(state, propval);
+ break;
+ }
+ }
+ kmem_free(strval, ZAP_MAXVALUELEN);
+
+ /* Push the source to the stack */
+ get_prop_src(state, setpoint, zfs_prop);
+ return (0);
+}
+
+/*
+ * Look up a property and its source in the zap object. If the value is
+ * present and successfully retrieved, push the value and source on the
+ * lua stack and return 0. On failure, return a non-zero error value.
+ */
+static int
+get_zap_prop(lua_State *state, dsl_dataset_t *ds, zfs_prop_t zfs_prop)
+{
+ int error = 0;
+ char setpoint[ZFS_MAX_DATASET_NAME_LEN];
+ char *strval = kmem_alloc(ZAP_MAXVALUELEN, KM_SLEEP);
+ uint64_t numval;
+ const char *prop_name = zfs_prop_to_name(zfs_prop);
+ zprop_type_t prop_type = zfs_prop_get_type(zfs_prop);
+
+ if (prop_type == PROP_TYPE_STRING) {
+ /* Push value to lua stack */
+ error = dsl_prop_get_ds(ds, prop_name, 1,
+ ZAP_MAXVALUELEN, strval, setpoint);
+ if (error == 0)
+ (void) lua_pushstring(state, strval);
+ } else {
+ error = dsl_prop_get_ds(ds, prop_name, sizeof (numval),
+ 1, &numval, setpoint);
+
+		/* Fill in temporary value for prop, if applicable */
+ (void) get_temporary_prop(ds, zfs_prop, &numval, setpoint);
+
+ /* Push value to lua stack */
+ if (prop_type == PROP_TYPE_INDEX) {
+ const char *propval;
+ error = zfs_prop_index_to_string(zfs_prop, numval,
+ &propval);
+ if (error == 0)
+ (void) lua_pushstring(state, propval);
+ } else {
+ if (error == 0)
+ (void) lua_pushnumber(state, numval);
+ }
+ }
+ kmem_free(strval, ZAP_MAXVALUELEN);
+ if (error == 0)
+ get_prop_src(state, setpoint, zfs_prop);
+ return (error);
+}
+
+/*
+ * Determine whether a property is valid for a given dataset.
+ */
+boolean_t
+prop_valid_for_ds(dsl_dataset_t *ds, zfs_prop_t zfs_prop)
+{
+ int error;
+ zfs_type_t zfs_type;
+
+ /* properties not supported */
+ if ((zfs_prop == ZFS_PROP_ISCSIOPTIONS) ||
+ (zfs_prop == ZFS_PROP_MOUNTED))
+ return (B_FALSE);
+
+ /* if we want the origin prop, ds must be a clone */
+ if ((zfs_prop == ZFS_PROP_ORIGIN) && (!dsl_dir_is_clone(ds->ds_dir)))
+ return (B_FALSE);
+
+ error = get_objset_type(ds, &zfs_type);
+ if (error != 0)
+ return (B_FALSE);
+ return (zfs_prop_valid_for_type(zfs_prop, zfs_type));
+}
+
+/*
+ * Look up a given dataset property. On success return 2, the number of
+ * values pushed to the lua stack (property value and source). On a fatal
+ * error, longjmp. On a non-fatal error, push nothing.
+ */
+static int
+zcp_get_system_prop(lua_State *state, dsl_pool_t *dp, const char *dataset_name,
+ zfs_prop_t zfs_prop)
+{
+ int error;
+ /*
+ * zcp_dataset_hold will either successfully return the requested
+ * dataset or throw a lua error and longjmp out of the zfs.get_prop call
+ * without returning.
+ */
+ dsl_dataset_t *ds = zcp_dataset_hold(state, dp, dataset_name, FTAG);
+ if (ds == NULL)
+ return (1); /* not reached; zcp_dataset_hold() longjmp'd */
+
+ /* Check that the property is valid for the given dataset */
+ const char *prop_name = zfs_prop_to_name(zfs_prop);
+ if (!prop_valid_for_ds(ds, zfs_prop)) {
+ dsl_dataset_rele(ds, FTAG);
+ return (0);
+ }
+
+ /* Check if the property can be accessed directly */
+ error = get_special_prop(state, ds, dataset_name, zfs_prop);
+ if (error == 0) {
+ dsl_dataset_rele(ds, FTAG);
+ /* The value and source have been pushed by get_special_prop */
+ return (2);
+ }
+ if (error != ENOENT) {
+ dsl_dataset_rele(ds, FTAG);
+ return (zcp_handle_error(state, dataset_name,
+ prop_name, error));
+ }
+
+ /* If we were unable to find it, look in the zap object */
+ error = get_zap_prop(state, ds, zfs_prop);
+ dsl_dataset_rele(ds, FTAG);
+ if (error != 0) {
+ return (zcp_handle_error(state, dataset_name,
+ prop_name, error));
+ }
+ /* The value and source have been pushed by get_zap_prop */
+ return (2);
+}
+
+static zfs_userquota_prop_t
+get_userquota_prop(const char *prop_name)
+{
+ zfs_userquota_prop_t type;
+ /* Figure out the property type ({user|group}{quota|used}) */
+ for (type = 0; type < ZFS_NUM_USERQUOTA_PROPS; type++) {
+ if (strncmp(prop_name, zfs_userquota_prop_prefixes[type],
+ strlen(zfs_userquota_prop_prefixes[type])) == 0)
+ break;
+ }
+ return (type);
+}
+
+#ifdef _KERNEL
+/*
+ * Given the name of a zfs_userquota_prop, this function determines the
+ * prop type as well as the numeric group/user ids based on the string
+ * following the '@' in the property name. On success, returns 0. On failure,
+ * returns a non-zero error.
+ * 'domain' must be freed by the caller using strfree().
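+ *
+ * For example (illustrative), "userquota@S-1-234-567-89" parses to
+ * type ZFS_PROP_USERQUOTA, domain "S-1-234-567", and rid 89, while
+ * "userused@12345" parses to a NULL domain and rid 12345.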
+ */
+static int
+parse_userquota_prop(const char *prop_name, zfs_userquota_prop_t *type,
+ char **domain, uint64_t *rid)
+{
+ char *cp, *end, *domain_val;
+
+ *type = get_userquota_prop(prop_name);
+ if (*type >= ZFS_NUM_USERQUOTA_PROPS)
+ return (EINVAL);
+
+ *rid = 0;
+ cp = strchr(prop_name, '@') + 1;
+ if (strncmp(cp, "S-1-", 4) == 0) {
+ /*
+		 * It's a numeric SID (e.g. "S-1-234-567-89") and we want to
+		 * separate the domain id and the rid.
+ */
+ int domain_len = strrchr(cp, '-') - cp;
+ domain_val = kmem_alloc(domain_len + 1, KM_SLEEP);
+ (void) strncpy(domain_val, cp, domain_len);
+ domain_val[domain_len] = '\0';
+ cp += domain_len + 1;
+
+ (void) ddi_strtoll(cp, &end, 10, (longlong_t *)rid);
+ if (*end != '\0') {
+ strfree(domain_val);
+ return (EINVAL);
+ }
+ } else {
+		/* It's only a user/group ID (e.g. "12345"); just get the rid */
+ domain_val = NULL;
+ (void) ddi_strtoll(cp, &end, 10, (longlong_t *)rid);
+ if (*end != '\0')
+ return (EINVAL);
+ }
+ *domain = domain_val;
+ return (0);
+}
+
+/*
+ * Look up a {user|group}{quota|used} property for the given dataset. On
+ * success, push the value (quota or used amount) and the setpoint. On
+ * failure, throw a Lua error.
+ */
+static int
+zcp_get_userquota_prop(lua_State *state, dsl_pool_t *dp,
+ const char *dataset_name, const char *prop_name)
+{
+ zfsvfs_t *zfvp;
+ zfsvfs_t *zfsvfs;
+ int error;
+ zfs_userquota_prop_t type;
+ char *domain;
+ uint64_t rid, value;
+ objset_t *os;
+
+ dsl_dataset_t *ds = zcp_dataset_hold(state, dp, dataset_name, FTAG);
+ if (ds == NULL)
+ return (1); /* not reached; zcp_dataset_hold() longjmp'd */
+
+ error = parse_userquota_prop(prop_name, &type, &domain, &rid);
+ if (error == 0) {
+ error = dmu_objset_from_ds(ds, &os);
+ if (error == 0) {
+ zfsvfs = kmem_zalloc(sizeof (zfsvfs_t), KM_SLEEP);
+ error = zfsvfs_create_impl(&zfvp, zfsvfs, os);
+ if (error == 0) {
+ error = zfs_userspace_one(zfvp, type, domain,
+ rid, &value);
+ zfsvfs_free(zfvp);
+ }
+ }
+ if (domain != NULL)
+ strfree(domain);
+ }
+ dsl_dataset_rele(ds, FTAG);
+
+	/* A quota of 0 means the quota has not been set */
+	if ((error == 0) && (value == 0) &&
+	    ((type == ZFS_PROP_USERQUOTA) || (type == ZFS_PROP_GROUPQUOTA)))
+		error = ENOENT;
+ if (error != 0) {
+ return (zcp_handle_error(state, dataset_name,
+ prop_name, error));
+ }
+
+ (void) lua_pushnumber(state, value);
+ (void) lua_pushstring(state, dataset_name);
+ return (2);
+}
+#endif
+
+/*
+ * Determines the name of the snapshot referenced in the written property
+ * name. Returns snapshot name in snap_name, a buffer that must be at least
+ * as large as ZFS_MAX_DATASET_NAME_LEN.
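+ *
+ * For example (illustrative), prop_name "written@snap1" on dataset
+ * "rpool/fs" yields "rpool/fs@snap1", while a fully qualified name such
+ * as "written@rpool/other@snap1" is used as-is.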
+ */
+static void
+parse_written_prop(const char *dataset_name, const char *prop_name,
+ char *snap_name)
+{
+ ASSERT(zfs_prop_written(prop_name));
+ const char *name = prop_name + ZFS_WRITTEN_PROP_PREFIX_LEN;
+ if (strchr(name, '@') == NULL) {
+ (void) sprintf(snap_name, "%s@%s", dataset_name, name);
+ } else {
+ (void) strcpy(snap_name, name);
+ }
+}
+
+/*
+ * Look up a written@ property for the given dataset. On success,
+ * push the value and the setpoint. If the error is fatal, we will
+ * longjmp; otherwise push nothing.
+ */
+static int
+zcp_get_written_prop(lua_State *state, dsl_pool_t *dp,
+ const char *dataset_name, const char *prop_name)
+{
+ char snap_name[ZFS_MAX_DATASET_NAME_LEN];
+ uint64_t used, comp, uncomp;
+ dsl_dataset_t *old;
+ int error = 0;
+
+ parse_written_prop(dataset_name, prop_name, snap_name);
+ dsl_dataset_t *new = zcp_dataset_hold(state, dp, dataset_name, FTAG);
+ if (new == NULL)
+ return (1); /* not reached; zcp_dataset_hold() longjmp'd */
+
+ error = dsl_dataset_hold(dp, snap_name, FTAG, &old);
+ if (error != 0) {
+ dsl_dataset_rele(new, FTAG);
+ return (zcp_dataset_hold_error(state, dp, snap_name,
+ error));
+ }
+ error = dsl_dataset_space_written(old, new,
+ &used, &comp, &uncomp);
+
+ dsl_dataset_rele(old, FTAG);
+ dsl_dataset_rele(new, FTAG);
+
+ if (error != 0) {
+ return (zcp_handle_error(state, dataset_name,
+ snap_name, error));
+ }
+ (void) lua_pushnumber(state, used);
+ (void) lua_pushstring(state, dataset_name);
+ return (2);
+}
+
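+/*
+ * zfs.get_prop(dataset, property)
+ *
+ * Illustrative usage (a sketch, not part of the original submission):
+ *
+ *     val, src = zfs.get_prop("rpool/data", "compression")
+ */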
+static int zcp_get_prop(lua_State *state);
+static zcp_lib_info_t zcp_get_prop_info = {
+ .name = "get_prop",
+ .func = zcp_get_prop,
+ .pargs = {
+ { .za_name = "dataset", .za_lua_type = LUA_TSTRING},
+ { .za_name = "property", .za_lua_type = LUA_TSTRING},
+ {NULL, 0}
+ },
+ .kwargs = {
+ {NULL, 0}
+ }
+};
+
+static int
+zcp_get_prop(lua_State *state)
+{
+ const char *dataset_name;
+ const char *property_name;
+ dsl_pool_t *dp = zcp_run_info(state)->zri_pool;
+ zcp_lib_info_t *libinfo = &zcp_get_prop_info;
+
+ zcp_parse_args(state, libinfo->name, libinfo->pargs, libinfo->kwargs);
+
+ dataset_name = lua_tostring(state, 1);
+ property_name = lua_tostring(state, 2);
+
+ /* User defined property */
+ if (zfs_prop_user(property_name)) {
+ return (zcp_get_user_prop(state, dp,
+ dataset_name, property_name));
+ }
+ /* userspace property */
+ if (zfs_prop_userquota(property_name)) {
+#ifdef _KERNEL
+ return (zcp_get_userquota_prop(state, dp,
+ dataset_name, property_name));
+#else
+		return (luaL_error(state,
+		    "user quota properties only supported in kernel mode"));
+#endif
+ }
+ /* written@ property */
+ if (zfs_prop_written(property_name)) {
+ return (zcp_get_written_prop(state, dp,
+ dataset_name, property_name));
+ }
+
+ zfs_prop_t zfs_prop = zfs_name_to_prop(property_name);
+ /* Valid system property */
+ if (zfs_prop != ZPROP_INVAL) {
+ return (zcp_get_system_prop(state, dp, dataset_name,
+ zfs_prop));
+ }
+
+ /* Invalid property name */
+ return (luaL_error(state,
+ "'%s' is not a valid property", property_name));
+}
+
+int
+zcp_load_get_lib(lua_State *state)
+{
+ lua_pushcclosure(state, zcp_get_prop_info.func, 0);
+ lua_setfield(state, -2, zcp_get_prop_info.name);
+
+ return (1);
+}
diff --git a/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/zcp_global.c b/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/zcp_global.c
new file mode 100644
index 000000000000..c25431fd6703
--- /dev/null
+++ b/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/zcp_global.c
@@ -0,0 +1,89 @@
+/*
+ * CDDL HEADER START
+ *
+ * This file and its contents are supplied under the terms of the
+ * Common Development and Distribution License ("CDDL"), version 1.0.
+ * You may only use this file in accordance with the terms of version
+ * 1.0 of the CDDL.
+ *
+ * A full copy of the text of the CDDL should have accompanied this
+ * source. A copy of the CDDL is also available via the Internet at
+ * http://www.illumos.org/license/CDDL.
+ *
+ * CDDL HEADER END
+ */
+
+/*
+ * Copyright (c) 2016, 2017 by Delphix. All rights reserved.
+ */
+
+#include <sys/zcp_global.h>
+
+#include "lua.h"
+#include "lauxlib.h"
+
+typedef struct zcp_errno_global {
+ const char *zeg_name;
+ int zeg_errno;
+} zcp_errno_global_t;
+
+static const zcp_errno_global_t errno_globals[] = {
+ {"EPERM", EPERM},
+ {"ENOENT", ENOENT},
+ {"ESRCH", ESRCH},
+ {"EINTR", EINTR},
+ {"EIO", EIO},
+ {"ENXIO", ENXIO},
+ {"E2BIG", E2BIG},
+ {"ENOEXEC", ENOEXEC},
+ {"EBADF", EBADF},
+ {"ECHILD", ECHILD},
+ {"EAGAIN", EAGAIN},
+ {"ENOMEM", ENOMEM},
+ {"EACCES", EACCES},
+ {"EFAULT", EFAULT},
+ {"ENOTBLK", ENOTBLK},
+ {"EBUSY", EBUSY},
+ {"EEXIST", EEXIST},
+ {"EXDEV", EXDEV},
+ {"ENODEV", ENODEV},
+ {"ENOTDIR", ENOTDIR},
+ {"EISDIR", EISDIR},
+ {"EINVAL", EINVAL},
+ {"ENFILE", ENFILE},
+ {"EMFILE", EMFILE},
+ {"ENOTTY", ENOTTY},
+ {"ETXTBSY", ETXTBSY},
+ {"EFBIG", EFBIG},
+ {"ENOSPC", ENOSPC},
+ {"ESPIPE", ESPIPE},
+ {"EROFS", EROFS},
+ {"EMLINK", EMLINK},
+ {"EPIPE", EPIPE},
+ {"EDOM", EDOM},
+ {"ERANGE", ERANGE},
+ {"EDEADLK", EDEADLK},
+ {"ENOLCK", ENOLCK},
+ {"ECANCELED", ECANCELED},
+ {"ENOTSUP", ENOTSUP},
+ {"EDQUOT", EDQUOT},
+ {"ENAMETOOLONG", ENAMETOOLONG},
+ {NULL, 0}
+};
+
+static void
+zcp_load_errno_globals(lua_State *state)
+{
+ const zcp_errno_global_t *global = errno_globals;
+ while (global->zeg_name != NULL) {
+ lua_pushnumber(state, (lua_Number)global->zeg_errno);
+ lua_setglobal(state, global->zeg_name);
+ global++;
+ }
+}
+
+void
+zcp_load_globals(lua_State *state)
+{
+ zcp_load_errno_globals(state);
+}
diff --git a/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/zcp_iter.c b/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/zcp_iter.c
new file mode 100644
index 000000000000..0236c6474ef6
--- /dev/null
+++ b/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/zcp_iter.c
@@ -0,0 +1,531 @@
+/*
+ * CDDL HEADER START
+ *
+ * This file and its contents are supplied under the terms of the
+ * Common Development and Distribution License ("CDDL"), version 1.0.
+ * You may only use this file in accordance with the terms of version
+ * 1.0 of the CDDL.
+ *
+ * A full copy of the text of the CDDL should have accompanied this
+ * source. A copy of the CDDL is also available via the Internet at
+ * http://www.illumos.org/license/CDDL.
+ *
+ * CDDL HEADER END
+ */
+
+/*
+ * Copyright (c) 2016 by Delphix. All rights reserved.
+ */
+
+#include "lua.h"
+#include "lauxlib.h"
+
+#include <sys/dmu.h>
+#include <sys/dsl_prop.h>
+#include <sys/dsl_synctask.h>
+#include <sys/dsl_dataset.h>
+#include <sys/dsl_pool.h>
+#include <sys/dmu_tx.h>
+#include <sys/dmu_objset.h>
+#include <sys/zap.h>
+#include <sys/dsl_dir.h>
+#include <sys/zcp_prop.h>
+
+#include <sys/zcp.h>
+
+typedef int (zcp_list_func_t)(lua_State *);
+typedef struct zcp_list_info {
+ const char *name;
+ zcp_list_func_t *func;
+ zcp_list_func_t *gc;
+ const zcp_arg_t pargs[4];
+ const zcp_arg_t kwargs[2];
+} zcp_list_info_t;
+
+static int
+zcp_clones_iter(lua_State *state)
+{
+ int err;
+ char clonename[ZFS_MAX_DATASET_NAME_LEN];
+ uint64_t dsobj = lua_tonumber(state, lua_upvalueindex(1));
+ uint64_t cursor = lua_tonumber(state, lua_upvalueindex(2));
+ dsl_pool_t *dp = zcp_run_info(state)->zri_pool;
+ dsl_dataset_t *ds, *clone;
+ zap_attribute_t za;
+ zap_cursor_t zc;
+
+ err = dsl_dataset_hold_obj(dp, dsobj, FTAG, &ds);
+ if (err == ENOENT) {
+ return (0);
+ } else if (err != 0) {
+ return (luaL_error(state,
+ "unexpected error %d from dsl_dataset_hold_obj(dsobj)",
+ err));
+ }
+
+ if (dsl_dataset_phys(ds)->ds_next_clones_obj == 0) {
+ dsl_dataset_rele(ds, FTAG);
+ return (0);
+ }
+
+ zap_cursor_init_serialized(&zc, dp->dp_meta_objset,
+ dsl_dataset_phys(ds)->ds_next_clones_obj, cursor);
+ dsl_dataset_rele(ds, FTAG);
+
+ err = zap_cursor_retrieve(&zc, &za);
+ if (err != 0) {
+ zap_cursor_fini(&zc);
+ if (err != ENOENT) {
+ return (luaL_error(state,
+ "unexpected error %d from zap_cursor_retrieve()",
+ err));
+ }
+ return (0);
+ }
+ zap_cursor_advance(&zc);
+ cursor = zap_cursor_serialize(&zc);
+ zap_cursor_fini(&zc);
+
+ err = dsl_dataset_hold_obj(dp, za.za_first_integer, FTAG, &clone);
+ if (err != 0) {
+ return (luaL_error(state,
+ "unexpected error %d from "
+ "dsl_dataset_hold_obj(za_first_integer)", err));
+ }
+
+ dsl_dir_name(clone->ds_dir, clonename);
+ dsl_dataset_rele(clone, FTAG);
+
+ lua_pushnumber(state, cursor);
+ lua_replace(state, lua_upvalueindex(2));
+
+ (void) lua_pushstring(state, clonename);
+ return (1);
+}
+
+static int zcp_clones_list(lua_State *);
+static zcp_list_info_t zcp_clones_list_info = {
+ .name = "clones",
+ .func = zcp_clones_list,
+ .gc = NULL,
+ .pargs = {
+ { .za_name = "snapshot", .za_lua_type = LUA_TSTRING},
+ {NULL, 0}
+ },
+ .kwargs = {
+ {NULL, 0}
+ }
+};
+
+static int
+zcp_clones_list(lua_State *state)
+{
+ const char *snapname = lua_tostring(state, 1);
+ dsl_pool_t *dp = zcp_run_info(state)->zri_pool;
+ boolean_t issnap;
+ uint64_t dsobj, cursor;
+
+ /*
+ * zcp_dataset_hold will either successfully return the requested
+ * dataset or throw a lua error and longjmp out of the zfs.list.clones
+ * call without returning.
+ */
+ dsl_dataset_t *ds = zcp_dataset_hold(state, dp, snapname, FTAG);
+ if (ds == NULL)
+ return (1); /* not reached; zcp_dataset_hold() longjmp'd */
+ cursor = 0;
+ issnap = ds->ds_is_snapshot;
+ dsobj = ds->ds_object;
+ dsl_dataset_rele(ds, FTAG);
+
+ if (!issnap) {
+ return (zcp_argerror(state, 1, "%s is not a snapshot",
+ snapname));
+ }
+
+ lua_pushnumber(state, dsobj);
+ lua_pushnumber(state, cursor);
+ lua_pushcclosure(state, &zcp_clones_iter, 2);
+ return (1);
+}
+
+static int
+zcp_snapshots_iter(lua_State *state)
+{
+ int err;
+ char snapname[ZFS_MAX_DATASET_NAME_LEN];
+ uint64_t dsobj = lua_tonumber(state, lua_upvalueindex(1));
+ uint64_t cursor = lua_tonumber(state, lua_upvalueindex(2));
+ dsl_pool_t *dp = zcp_run_info(state)->zri_pool;
+ dsl_dataset_t *ds;
+ objset_t *os;
+ char *p;
+
+ err = dsl_dataset_hold_obj(dp, dsobj, FTAG, &ds);
+ if (err != 0) {
+ return (luaL_error(state,
+ "unexpected error %d from dsl_dataset_hold_obj(dsobj)",
+ err));
+ }
+
+ dsl_dataset_name(ds, snapname);
+ VERIFY3U(sizeof (snapname), >,
+ strlcat(snapname, "@", sizeof (snapname)));
+
+ p = strchr(snapname, '\0');
+ VERIFY0(dmu_objset_from_ds(ds, &os));
+ err = dmu_snapshot_list_next(os,
+ sizeof (snapname) - (p - snapname), p, NULL, &cursor, NULL);
+ dsl_dataset_rele(ds, FTAG);
+
+ if (err == ENOENT) {
+ return (0);
+ } else if (err != 0) {
+ return (luaL_error(state,
+ "unexpected error %d from dmu_snapshot_list_next()", err));
+ }
+
+ lua_pushnumber(state, cursor);
+ lua_replace(state, lua_upvalueindex(2));
+
+ (void) lua_pushstring(state, snapname);
+ return (1);
+}
+
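+/*
+ * zfs.list.snapshots(dataset)
+ *
+ * Illustrative usage (a sketch, not part of the original submission):
+ *
+ *     for snap in zfs.list.snapshots("rpool/fs") do
+ *         zfs.debug(snap)
+ *     end
+ */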
+static int zcp_snapshots_list(lua_State *);
+static zcp_list_info_t zcp_snapshots_list_info = {
+ .name = "snapshots",
+ .func = zcp_snapshots_list,
+ .gc = NULL,
+ .pargs = {
+ { .za_name = "filesystem | volume", .za_lua_type = LUA_TSTRING},
+ {NULL, 0}
+ },
+ .kwargs = {
+ {NULL, 0}
+ }
+};
+
+static int
+zcp_snapshots_list(lua_State *state)
+{
+ const char *fsname = lua_tostring(state, 1);
+ dsl_pool_t *dp = zcp_run_info(state)->zri_pool;
+ boolean_t issnap;
+ uint64_t dsobj;
+
+ dsl_dataset_t *ds = zcp_dataset_hold(state, dp, fsname, FTAG);
+ if (ds == NULL)
+ return (1); /* not reached; zcp_dataset_hold() longjmp'd */
+ issnap = ds->ds_is_snapshot;
+ dsobj = ds->ds_object;
+ dsl_dataset_rele(ds, FTAG);
+
+ if (issnap) {
+ return (zcp_argerror(state, 1,
+ "argument %s cannot be a snapshot", fsname));
+ }
+
+ lua_pushnumber(state, dsobj);
+ lua_pushnumber(state, 0);
+ lua_pushcclosure(state, &zcp_snapshots_iter, 2);
+ return (1);
+}
+
+/*
+ * Note: channel programs only run in the global zone, so all datasets
+ * are visible to this zone.
+ */
+static boolean_t
+dataset_name_hidden(const char *name)
+{
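+	/* Names containing '$' are private datasets, e.g. $MOS and $ORIGIN. */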
+ if (strchr(name, '$') != NULL)
+ return (B_TRUE);
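+	/* Names containing '%' are temporary clones, e.g. from zfs receive. */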
+ if (strchr(name, '%') != NULL)
+ return (B_TRUE);
+ return (B_FALSE);
+}
+
+static int
+zcp_children_iter(lua_State *state)
+{
+ int err;
+ char childname[ZFS_MAX_DATASET_NAME_LEN];
+ uint64_t dsobj = lua_tonumber(state, lua_upvalueindex(1));
+ uint64_t cursor = lua_tonumber(state, lua_upvalueindex(2));
+ zcp_run_info_t *ri = zcp_run_info(state);
+ dsl_pool_t *dp = ri->zri_pool;
+ dsl_dataset_t *ds;
+ objset_t *os;
+ char *p;
+
+ err = dsl_dataset_hold_obj(dp, dsobj, FTAG, &ds);
+ if (err != 0) {
+ return (luaL_error(state,
+ "unexpected error %d from dsl_dataset_hold_obj(dsobj)",
+ err));
+ }
+
+ dsl_dataset_name(ds, childname);
+ VERIFY3U(sizeof (childname), >,
+ strlcat(childname, "/", sizeof (childname)));
+ p = strchr(childname, '\0');
+
+ VERIFY0(dmu_objset_from_ds(ds, &os));
+ do {
+ err = dmu_dir_list_next(os,
+ sizeof (childname) - (p - childname), p, NULL, &cursor);
+ } while (err == 0 && dataset_name_hidden(childname));
+ dsl_dataset_rele(ds, FTAG);
+
+ if (err == ENOENT) {
+ return (0);
+ } else if (err != 0) {
+ return (luaL_error(state,
+ "unexpected error %d from dmu_dir_list_next()",
+ err));
+ }
+
+ lua_pushnumber(state, cursor);
+ lua_replace(state, lua_upvalueindex(2));
+
+ (void) lua_pushstring(state, childname);
+ return (1);
+}
+
+static int zcp_children_list(lua_State *);
+static zcp_list_info_t zcp_children_list_info = {
+ .name = "children",
+ .func = zcp_children_list,
+ .gc = NULL,
+ .pargs = {
+ { .za_name = "filesystem | volume", .za_lua_type = LUA_TSTRING},
+ {NULL, 0}
+ },
+ .kwargs = {
+ {NULL, 0}
+ }
+};
+
+static int
+zcp_children_list(lua_State *state)
+{
+ const char *fsname = lua_tostring(state, 1);
+ dsl_pool_t *dp = zcp_run_info(state)->zri_pool;
+ boolean_t issnap;
+ uint64_t dsobj;
+
+ dsl_dataset_t *ds = zcp_dataset_hold(state, dp, fsname, FTAG);
+ if (ds == NULL)
+ return (1); /* not reached; zcp_dataset_hold() longjmp'd */
+
+ issnap = ds->ds_is_snapshot;
+ dsobj = ds->ds_object;
+ dsl_dataset_rele(ds, FTAG);
+
+ if (issnap) {
+ return (zcp_argerror(state, 1,
+ "argument %s cannot be a snapshot", fsname));
+ }
+
+ lua_pushnumber(state, dsobj);
+ lua_pushnumber(state, 0);
+ lua_pushcclosure(state, &zcp_children_iter, 2);
+ return (1);
+}
+
+static int
+zcp_props_list_gc(lua_State *state)
+{
+ nvlist_t **props = lua_touserdata(state, 1);
+ if (*props != NULL)
+ fnvlist_free(*props);
+ return (0);
+}
+
+static int
+zcp_props_iter(lua_State *state)
+{
+ char *source, *val;
+ nvlist_t *nvprop;
+ nvlist_t **props = lua_touserdata(state, lua_upvalueindex(1));
+ nvpair_t *pair = lua_touserdata(state, lua_upvalueindex(2));
+
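+	/* Only user properties are yielded; see the system_properties lister. */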
+ do {
+ pair = nvlist_next_nvpair(*props, pair);
+ if (pair == NULL) {
+ fnvlist_free(*props);
+ *props = NULL;
+ return (0);
+ }
+ } while (!zfs_prop_user(nvpair_name(pair)));
+
+ lua_pushlightuserdata(state, pair);
+ lua_replace(state, lua_upvalueindex(2));
+
+ nvprop = fnvpair_value_nvlist(pair);
+ val = fnvlist_lookup_string(nvprop, ZPROP_VALUE);
+ source = fnvlist_lookup_string(nvprop, ZPROP_SOURCE);
+
+ (void) lua_pushstring(state, nvpair_name(pair));
+ (void) lua_pushstring(state, val);
+ (void) lua_pushstring(state, source);
+ return (3);
+}
+
+static int zcp_props_list(lua_State *);
+static zcp_list_info_t zcp_props_list_info = {
+ .name = "properties",
+ .func = zcp_props_list,
+ .gc = zcp_props_list_gc,
+ .pargs = {
+ { .za_name = "filesystem | snapshot | volume",
+ .za_lua_type = LUA_TSTRING},
+ {NULL, 0}
+ },
+ .kwargs = {
+ {NULL, 0}
+ }
+};
+
+static int
+zcp_props_list(lua_State *state)
+{
+ const char *dsname = lua_tostring(state, 1);
+ dsl_pool_t *dp = zcp_run_info(state)->zri_pool;
+ objset_t *os;
+ nvlist_t **props = lua_newuserdata(state, sizeof (nvlist_t *));
+
+ dsl_dataset_t *ds = zcp_dataset_hold(state, dp, dsname, FTAG);
+ if (ds == NULL)
+ return (1); /* not reached; zcp_dataset_hold() longjmp'd */
+ VERIFY0(dmu_objset_from_ds(ds, &os));
+ VERIFY0(dsl_prop_get_all(os, props));
+ dsl_dataset_rele(ds, FTAG);
+
+ /*
+ * Set the metatable for the properties list to free it on completion.
+ */
+ luaL_getmetatable(state, zcp_props_list_info.name);
+ (void) lua_setmetatable(state, -2);
+
+ lua_pushlightuserdata(state, NULL);
+ lua_pushcclosure(state, &zcp_props_iter, 2);
+ return (1);
+}
+
+/*
+ * Populate nv with all valid properties and their values for the given
+ * dataset.
+ */
+static void
+zcp_dataset_props(dsl_dataset_t *ds, nvlist_t *nv)
+{
+ for (int prop = ZFS_PROP_TYPE; prop < ZFS_NUM_PROPS; prop++) {
+ /* Do not display hidden props */
+ if (!zfs_prop_visible(prop))
+ continue;
+ /* Do not display props not valid for this dataset */
+ if (!prop_valid_for_ds(ds, prop))
+ continue;
+ fnvlist_add_boolean(nv, zfs_prop_to_name(prop));
+ }
+}
+
+static int zcp_system_props_list(lua_State *);
+static zcp_list_info_t zcp_system_props_list_info = {
+ .name = "system_properties",
+ .func = zcp_system_props_list,
+ .pargs = {
+ { .za_name = "dataset", .za_lua_type = LUA_TSTRING},
+ {NULL, 0}
+ },
+ .kwargs = {
+ {NULL, 0}
+ }
+};
+
+/*
+ * Get a list of all visible properties and their values for a given dataset.
+ * Returned on the stack as a Lua table.
+ */
+static int
+zcp_system_props_list(lua_State *state)
+{
+ int error;
+ char errbuf[128];
+ const char *dataset_name;
+ dsl_pool_t *dp = zcp_run_info(state)->zri_pool;
+ zcp_list_info_t *libinfo = &zcp_system_props_list_info;
+ zcp_parse_args(state, libinfo->name, libinfo->pargs, libinfo->kwargs);
+ dataset_name = lua_tostring(state, 1);
+ nvlist_t *nv = fnvlist_alloc();
+
+ dsl_dataset_t *ds = zcp_dataset_hold(state, dp, dataset_name, FTAG);
+ if (ds == NULL)
+ return (1); /* not reached; zcp_dataset_hold() longjmp'd */
+
+ /* Get the names of all valid properties for this dataset */
+ zcp_dataset_props(ds, nv);
+ dsl_dataset_rele(ds, FTAG);
+
+ /* push list as lua table */
+ error = zcp_nvlist_to_lua(state, nv, errbuf, sizeof (errbuf));
+ nvlist_free(nv);
+ if (error != 0) {
+ return (luaL_error(state,
+ "Error returning nvlist: %s", errbuf));
+ }
+ return (1);
+}
+
+static int
+zcp_list_func(lua_State *state)
+{
+ zcp_list_info_t *info = lua_touserdata(state, lua_upvalueindex(1));
+
+ zcp_parse_args(state, info->name, info->pargs, info->kwargs);
+
+ return (info->func(state));
+}
+
+int
+zcp_load_list_lib(lua_State *state)
+{
+ int i;
+ zcp_list_info_t *zcp_list_funcs[] = {
+ &zcp_children_list_info,
+ &zcp_snapshots_list_info,
+ &zcp_props_list_info,
+ &zcp_clones_list_info,
+ &zcp_system_props_list_info,
+ NULL
+ };
+
+ lua_newtable(state);
+
+ for (i = 0; zcp_list_funcs[i] != NULL; i++) {
+ zcp_list_info_t *info = zcp_list_funcs[i];
+
+ if (info->gc != NULL) {
+ /*
+ * If the function requires garbage collection, create
+ * a metatable with its name and register the __gc
+ * function.
+ */
+ (void) luaL_newmetatable(state, info->name);
+ (void) lua_pushstring(state, "__gc");
+ lua_pushcfunction(state, info->gc);
+ lua_settable(state, -3);
+ lua_pop(state, 1);
+ }
+
+ lua_pushlightuserdata(state, info);
+ lua_pushcclosure(state, &zcp_list_func, 1);
+ lua_setfield(state, -2, info->name);
+ }
+
+ return (1);
+}
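+
+/*
+ * Illustrative sketch (not part of the original change): assuming the zcp
+ * core registers the table returned above as "zfs.list", a channel program
+ * consumes these iterators with ordinary Lua generic-for loops, e.g.:
+ *
+ *	for snap in zfs.list.snapshots("pool/fs") do
+ *		for clone in zfs.list.clones(snap) do
+ *			-- each clone is a dataset name string
+ *		end
+ *	end
+ */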
diff --git a/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/zcp_synctask.c b/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/zcp_synctask.c
new file mode 100644
index 000000000000..25d970ec0888
--- /dev/null
+++ b/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/zcp_synctask.c
@@ -0,0 +1,360 @@
+/*
+ * CDDL HEADER START
+ *
+ * This file and its contents are supplied under the terms of the
+ * Common Development and Distribution License ("CDDL"), version 1.0.
+ * You may only use this file in accordance with the terms of version
+ * 1.0 of the CDDL.
+ *
+ * A full copy of the text of the CDDL should have accompanied this
+ * source. A copy of the CDDL is also available via the Internet at
+ * http://www.illumos.org/license/CDDL.
+ *
+ * CDDL HEADER END
+ */
+
+/*
+ * Copyright (c) 2016, 2017 by Delphix. All rights reserved.
+ */
+
+#include "lua.h"
+#include "lauxlib.h"
+
+#include <sys/zcp.h>
+#include <sys/dsl_dir.h>
+#include <sys/dsl_pool.h>
+#include <sys/dsl_prop.h>
+#include <sys/dsl_synctask.h>
+#include <sys/dsl_dataset.h>
+#include <sys/dsl_bookmark.h>
+#include <sys/dsl_destroy.h>
+#include <sys/dmu_objset.h>
+#include <sys/zfs_znode.h>
+#include <sys/zfeature.h>
+#include <sys/metaslab.h>
+
+#define DST_AVG_BLKSHIFT 14
+
+typedef int (zcp_synctask_func_t)(lua_State *, boolean_t, nvlist_t *);
+typedef struct zcp_synctask_info {
+ const char *name;
+ zcp_synctask_func_t *func;
+ const zcp_arg_t pargs[4];
+ const zcp_arg_t kwargs[2];
+ zfs_space_check_t space_check;
+ int blocks_modified;
+} zcp_synctask_info_t;
+
+/*
+ * Generic synctask interface for channel program syncfuncs.
+ *
+ * To perform some action in syncing context, we'd generally call
+ * dsl_sync_task(), but since the Lua script is already running inside a
+ * synctask we need to leave out some actions (such as acquiring the config
+ * rwlock and performing space checks).
+ *
+ * If 'sync' is false, we execute a dry run and return the error code.
+ *
+ * If we are not running in syncing context and we are not doing a dry run
+ * (meaning we are running a zfs.sync function in open-context) then we
+ * return a Lua error.
+ *
+ * This function also handles common fatal error cases for channel program
+ * library functions. If a fatal error occurs, err_dsname (if supplied) is
+ * the dataset name reported in error messages.
+ */
+static int
+zcp_sync_task(lua_State *state, dsl_checkfunc_t *checkfunc,
+ dsl_syncfunc_t *syncfunc, void *arg, boolean_t sync, const char *err_dsname)
+{
+ int err;
+ zcp_run_info_t *ri = zcp_run_info(state);
+
+ err = checkfunc(arg, ri->zri_tx);
+ if (!sync)
+ return (err);
+
+ if (!ri->zri_sync) {
+ return (luaL_error(state, "running functions from the zfs.sync "
+ "submodule requires passing sync=TRUE to "
+ "lzc_channel_program() (i.e. do not specify the \"-n\" "
+ "command line argument)"));
+ }
+
+ if (err == 0) {
+ syncfunc(arg, ri->zri_tx);
+ } else if (err == EIO) {
+ if (err_dsname != NULL) {
+ return (luaL_error(state,
+ "I/O error while accessing dataset '%s'",
+ err_dsname));
+ } else {
+ return (luaL_error(state,
+ "I/O error while accessing dataset."));
+ }
+ }
+
+ return (err);
+}
+
+static int zcp_synctask_destroy(lua_State *, boolean_t, nvlist_t *);
+static zcp_synctask_info_t zcp_synctask_destroy_info = {
+ .name = "destroy",
+ .func = zcp_synctask_destroy,
+ .pargs = {
+ {.za_name = "filesystem | snapshot", .za_lua_type = LUA_TSTRING},
+ {NULL, 0}
+ },
+ .kwargs = {
+ {.za_name = "defer", .za_lua_type = LUA_TBOOLEAN},
+ {NULL, 0}
+ },
+ .space_check = ZFS_SPACE_CHECK_DESTROY,
+ .blocks_modified = 0
+};
+
+/* ARGSUSED */
+static int
+zcp_synctask_destroy(lua_State *state, boolean_t sync, nvlist_t *err_details)
+{
+ int err;
+ const char *dsname = lua_tostring(state, 1);
+
+ boolean_t issnap = (strchr(dsname, '@') != NULL);
+
+ if (!issnap && !lua_isnil(state, 2)) {
+ return (luaL_error(state,
+		    "'defer' kwarg only supported for snapshots: %s",
+ dsname));
+ }
+
+ if (issnap) {
+ dsl_destroy_snapshot_arg_t ddsa = { 0 };
+ ddsa.ddsa_name = dsname;
+ if (!lua_isnil(state, 2)) {
+ ddsa.ddsa_defer = lua_toboolean(state, 2);
+ } else {
+ ddsa.ddsa_defer = B_FALSE;
+ }
+
+ err = zcp_sync_task(state, dsl_destroy_snapshot_check,
+ dsl_destroy_snapshot_sync, &ddsa, sync, dsname);
+ } else {
+ dsl_destroy_head_arg_t ddha = { 0 };
+ ddha.ddha_name = dsname;
+
+ err = zcp_sync_task(state, dsl_destroy_head_check,
+ dsl_destroy_head_sync, &ddha, sync, dsname);
+ }
+
+ return (err);
+}
+
+static int zcp_synctask_promote(lua_State *, boolean_t, nvlist_t *);
+static zcp_synctask_info_t zcp_synctask_promote_info = {
+ .name = "promote",
+ .func = zcp_synctask_promote,
+ .pargs = {
+ {.za_name = "clone", .za_lua_type = LUA_TSTRING},
+ {NULL, 0}
+ },
+ .kwargs = {
+ {NULL, 0}
+ },
+ .space_check = ZFS_SPACE_CHECK_RESERVED,
+ .blocks_modified = 3
+};
+
+static int
+zcp_synctask_promote(lua_State *state, boolean_t sync, nvlist_t *err_details)
+{
+ int err;
+ dsl_dataset_promote_arg_t ddpa = { 0 };
+ const char *dsname = lua_tostring(state, 1);
+ zcp_run_info_t *ri = zcp_run_info(state);
+
+ ddpa.ddpa_clonename = dsname;
+ ddpa.err_ds = err_details;
+ ddpa.cr = ri->zri_cred;
+
+ /*
+ * If there was a snapshot name conflict, then err_ds will be filled
+ * with a list of conflicting snapshot names.
+ */
+ err = zcp_sync_task(state, dsl_dataset_promote_check,
+ dsl_dataset_promote_sync, &ddpa, sync, dsname);
+
+ return (err);
+}
+
+static int zcp_synctask_rollback(lua_State *, boolean_t, nvlist_t *err_details);
+static zcp_synctask_info_t zcp_synctask_rollback_info = {
+ .name = "rollback",
+ .func = zcp_synctask_rollback,
+ .space_check = ZFS_SPACE_CHECK_RESERVED,
+ .blocks_modified = 1,
+ .pargs = {
+ {.za_name = "filesystem", .za_lua_type = LUA_TSTRING},
+ {NULL, 0}
+ },
+ .kwargs = {
+ {NULL, 0}
+ }
+};
+
+static int
+zcp_synctask_rollback(lua_State *state, boolean_t sync, nvlist_t *err_details)
+{
+ int err;
+ const char *dsname = lua_tostring(state, 1);
+ dsl_dataset_rollback_arg_t ddra = { 0 };
+
+ ddra.ddra_fsname = dsname;
+ ddra.ddra_result = err_details;
+
+ err = zcp_sync_task(state, dsl_dataset_rollback_check,
+ dsl_dataset_rollback_sync, &ddra, sync, dsname);
+
+ return (err);
+}
+
+static int zcp_synctask_snapshot(lua_State *, boolean_t, nvlist_t *);
+static zcp_synctask_info_t zcp_synctask_snapshot_info = {
+ .name = "snapshot",
+ .func = zcp_synctask_snapshot,
+ .pargs = {
+ {.za_name = "filesystem@snapname | volume@snapname",
+ .za_lua_type = LUA_TSTRING},
+ {NULL, 0}
+ },
+ .kwargs = {
+ {NULL, 0}
+ },
+ .space_check = ZFS_SPACE_CHECK_NORMAL,
+ .blocks_modified = 3
+};
+
+/* ARGSUSED */
+static int
+zcp_synctask_snapshot(lua_State *state, boolean_t sync, nvlist_t *err_details)
+{
+ int err;
+ dsl_dataset_snapshot_arg_t ddsa = { 0 };
+ const char *dsname = lua_tostring(state, 1);
+ zcp_run_info_t *ri = zcp_run_info(state);
+
+ /*
+ * On old pools, the ZIL must not be active when a snapshot is created,
+ * but we can't suspend the ZIL because we're already in syncing
+ * context.
+ */
+ if (spa_version(ri->zri_pool->dp_spa) < SPA_VERSION_FAST_SNAP) {
+ return (ENOTSUP);
+ }
+
+ /*
+	 * We only allow a single snapshot rather than a list, so the
+ * error list output is unnecessary.
+ */
+ ddsa.ddsa_errors = NULL;
+ ddsa.ddsa_props = NULL;
+ ddsa.ddsa_cr = ri->zri_cred;
+ ddsa.ddsa_snaps = fnvlist_alloc();
+ fnvlist_add_boolean(ddsa.ddsa_snaps, dsname);
+
+ zcp_cleanup_handler_t *zch = zcp_register_cleanup(state,
+ (zcp_cleanup_t *)&fnvlist_free, ddsa.ddsa_snaps);
+
+ err = zcp_sync_task(state, dsl_dataset_snapshot_check,
+ dsl_dataset_snapshot_sync, &ddsa, sync, dsname);
+
+ zcp_deregister_cleanup(state, zch);
+ fnvlist_free(ddsa.ddsa_snaps);
+
+ return (err);
+}
+
+static int
+zcp_synctask_wrapper(lua_State *state)
+{
+ int err;
+ zcp_cleanup_handler_t *zch;
+ int num_ret = 1;
+ nvlist_t *err_details = fnvlist_alloc();
+
+ /*
+ * Make sure err_details is properly freed, even if a fatal error is
+ * thrown during the synctask.
+ */
+ zch = zcp_register_cleanup(state,
+ (zcp_cleanup_t *)&fnvlist_free, err_details);
+
+ zcp_synctask_info_t *info = lua_touserdata(state, lua_upvalueindex(1));
+ boolean_t sync = lua_toboolean(state, lua_upvalueindex(2));
+
+ zcp_run_info_t *ri = zcp_run_info(state);
+ dsl_pool_t *dp = ri->zri_pool;
+
+ /* MOS space is triple-dittoed, so we multiply by 3. */
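+	/* E.g. blocks_modified == 3 charges (3 << 14) * 3 = 147456 bytes. */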
+ uint64_t funcspace = (info->blocks_modified << DST_AVG_BLKSHIFT) * 3;
+
+ zcp_parse_args(state, info->name, info->pargs, info->kwargs);
+
+ err = 0;
+ if (info->space_check != ZFS_SPACE_CHECK_NONE) {
+ uint64_t quota = dsl_pool_unreserved_space(dp,
+ info->space_check);
+ uint64_t used = dsl_dir_phys(dp->dp_root_dir)->dd_used_bytes +
+ ri->zri_space_used;
+
+ if (used + funcspace > quota) {
+ err = SET_ERROR(ENOSPC);
+ }
+ }
+
+ if (err == 0) {
+ err = info->func(state, sync, err_details);
+ }
+
+ if (err == 0) {
+ ri->zri_space_used += funcspace;
+ }
+
+ lua_pushnumber(state, (lua_Number)err);
+ if (fnvlist_num_pairs(err_details) > 0) {
+ (void) zcp_nvlist_to_lua(state, err_details, NULL, 0);
+ num_ret++;
+ }
+
+ zcp_deregister_cleanup(state, zch);
+ fnvlist_free(err_details);
+
+ return (num_ret);
+}
+
+int
+zcp_load_synctask_lib(lua_State *state, boolean_t sync)
+{
+ int i;
+ zcp_synctask_info_t *zcp_synctask_funcs[] = {
+ &zcp_synctask_destroy_info,
+ &zcp_synctask_promote_info,
+ &zcp_synctask_rollback_info,
+ &zcp_synctask_snapshot_info,
+ NULL
+ };
+
+ lua_newtable(state);
+
+ for (i = 0; zcp_synctask_funcs[i] != NULL; i++) {
+ zcp_synctask_info_t *info = zcp_synctask_funcs[i];
+ lua_pushlightuserdata(state, info);
+ lua_pushboolean(state, sync);
+ lua_pushcclosure(state, &zcp_synctask_wrapper, 2);
+ lua_setfield(state, -2, info->name);
+ }
+
+ return (1);
+}
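+
+/*
+ * Illustrative sketch (an assumption about the caller): the zcp core loads
+ * this table twice, once with sync=B_TRUE as "zfs.sync" and once with
+ * sync=B_FALSE as "zfs.check", so a program can dry-run an operation before
+ * committing to it:
+ *
+ *	local err = zfs.check.destroy("pool/fs@snap")
+ *	if err == 0 then
+ *		err = zfs.sync.destroy("pool/fs@snap")
+ *	end
+ */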
diff --git a/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/zfeature.c b/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/zfeature.c
new file mode 100644
index 000000000000..f78414ae10ec
--- /dev/null
+++ b/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/zfeature.c
@@ -0,0 +1,505 @@
+/*
+ * CDDL HEADER START
+ *
+ * The contents of this file are subject to the terms of the
+ * Common Development and Distribution License (the "License").
+ * You may not use this file except in compliance with the License.
+ *
+ * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
+ * or http://www.opensolaris.org/os/licensing.
+ * See the License for the specific language governing permissions
+ * and limitations under the License.
+ *
+ * When distributing Covered Code, include this CDDL HEADER in each
+ * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
+ * If applicable, add the following below this CDDL HEADER, with the
+ * fields enclosed by brackets "[]" replaced with your own identifying
+ * information: Portions Copyright [yyyy] [name of copyright owner]
+ *
+ * CDDL HEADER END
+ */
+
+/*
+ * Copyright (c) 2011, 2015 by Delphix. All rights reserved.
+ */
+
+#include <sys/zfs_context.h>
+#include <sys/zfeature.h>
+#include <sys/dmu.h>
+#include <sys/nvpair.h>
+#include <sys/zap.h>
+#include <sys/dmu_tx.h>
+#include "zfeature_common.h"
+#include <sys/spa_impl.h>
+
+/*
+ * ZFS Feature Flags
+ * -----------------
+ *
+ * ZFS feature flags are used to provide fine-grained versioning to the ZFS
+ * on-disk format. Once enabled on a pool feature flags replace the old
+ * spa_version() number.
+ *
+ * Each new on-disk format change will be given a uniquely identifying string
+ * guid rather than a version number. This avoids the problem of different
+ * organizations creating new on-disk formats with the same version number. To
+ * keep feature guids unique, they should consist of the reverse DNS name of
+ * the organization which implemented the feature and a short name for the
+ * feature, separated by a colon (e.g. com.delphix:async_destroy).
+ *
+ * Reference Counts
+ * ----------------
+ *
+ * Within each pool, features can be in one of three states: disabled, enabled,
+ * or active. These states are differentiated by a reference count stored on
+ * disk for each feature:
+ *
+ * 1) If there is no reference count stored on disk the feature is disabled.
+ * 2) If the reference count is 0 a system administrator has enabled the
+ * feature, but the feature has not been used yet, so no on-disk
+ * format changes have been made.
+ * 3) If the reference count is greater than 0 the feature is active.
+ * The format changes required by the feature are currently on disk.
+ * Note that if the feature's format changes are reversed the feature
+ * may choose to set its reference count back to 0.
+ *
+ * Feature flags make no distinction between non-zero reference counts
+ * for an active feature (e.g. a reference count of 1 means the same thing as a
+ * reference count of 27834721), but feature implementations may choose to use
+ * the reference count to store meaningful information. For example, a new RAID
+ * implementation might set the reference count to the number of vdevs using
+ * it. If all those disks are removed from the pool the feature goes back to
+ * having a reference count of 0.
+ *
+ * It is the responsibility of the individual features to maintain a non-zero
+ * reference count as long as the feature's format changes are present on disk.
+ *
+ * Dependencies
+ * ------------
+ *
+ * Each feature may depend on other features. The only effect of this
+ * relationship is that when a feature is enabled all of its dependencies are
+ * automatically enabled as well. Any future work to support disabling of
+ * features would need to ensure that features cannot be disabled if other
+ * enabled features depend on them.
+ *
+ * On-disk Format
+ * --------------
+ *
+ * When feature flags are enabled, spa_version() is set to
+ * SPA_VERSION_FEATURES (5000). In order for this to work, the pool is
+ * automatically upgraded to SPA_VERSION_BEFORE_FEATURES (28) first, so that
+ * all pre-feature-flag on-disk format changes will be in use.
+ *
+ * Information about features is stored in 3 ZAP objects in the pool's MOS.
+ * These objects are linked to by the following names in the pool directory
+ * object:
+ *
+ * 1) features_for_read: feature guid -> reference count
+ * Features needed to open the pool for reading.
+ * 2) features_for_write: feature guid -> reference count
+ * Features needed to open the pool for writing.
+ * 3) feature_descriptions: feature guid -> descriptive string
+ * A human readable string.
+ *
+ * All enabled features appear in either features_for_read or
+ * features_for_write, but not both.
+ *
+ * To open a pool in read-only mode, only the features listed in
+ * features_for_read need to be supported.
+ *
+ * To open the pool in read-write mode, features in both features_for_read and
+ * features_for_write need to be supported.
+ *
+ * Some features may be required to read the ZAP objects containing feature
+ * information. To allow software to check for compatibility with these
+ * features before the pool is opened, their names must be stored in the label
+ * in a new "features_for_read" entry (note that features that are only
+ * required to write to a pool never need to be stored in the label, since
+ * the features_for_write ZAP object can be read before the pool is written
+ * to).
+ * To save space in the label, features must be explicitly marked as needing to
+ * be written to the label. Also, reference counts are not stored in the label,
+ * instead any feature whose reference count drops to 0 is removed from the
+ * label.
+ *
+ * Adding New Features
+ * -------------------
+ *
+ * Features must be registered in the zpool_feature_init() function in
+ * zfeature_common.c using the zfeature_register() function. This function
+ * has arguments to specify if the feature should be stored in the
+ * features_for_read or features_for_write ZAP object and if it needs to be
+ * written to the label when active.
+ *
+ * Once a feature is registered, it will appear as a "feature@<feature name>"
+ * property which can be set by an administrator. Feature implementors should
+ * use the spa_feature_is_enabled() and spa_feature_is_active() functions to
+ * query the state of a feature and the spa_feature_incr() and
+ * spa_feature_decr() functions to change an enabled feature's reference count.
+ * Reference counts may only be updated in the syncing context.
+ *
+ * Features may not perform enable-time initialization. Instead, any such
+ * initialization should occur when the feature is first used. This design
+ * enforces that on-disk changes be made only when features are used. Code
+ * should only check if a feature is enabled using spa_feature_is_enabled(),
+ * not by relying on any feature-specific metadata existing. If a feature is
+ * enabled, but the feature's metadata is not on disk yet then it should be
+ * created as needed.
+ *
+ * As an example, consider the com.delphix:async_destroy feature. This feature
+ * relies on the existence of a bptree in the MOS that stores blocks for
+ * asynchronous freeing. This bptree is not created when async_destroy is
+ * enabled. Instead, when a dataset is destroyed spa_feature_is_enabled() is
+ * called to check if async_destroy is enabled. If it is and the bptree object
+ * does not exist yet, the bptree object is created as part of the dataset
+ * destroy and async_destroy's reference count is incremented to indicate it
+ * has made an on-disk format change. Later, after the destroyed dataset's
+ * blocks have all been asynchronously freed there is no longer any use for the
+ * bptree object, so it is destroyed and async_destroy's reference count is
+ * decremented back to 0 to indicate that it has undone its on-disk format
+ * changes.
+ */
+
+typedef enum {
+ FEATURE_ACTION_INCR,
+ FEATURE_ACTION_DECR,
+} feature_action_t;
+
+/*
+ * Checks that the active features in the pool are supported by
+ * this software. Adds each unsupported feature (name -> description) to
+ * the supplied nvlist.
+ */
+boolean_t
+spa_features_check(spa_t *spa, boolean_t for_write,
+ nvlist_t *unsup_feat, nvlist_t *enabled_feat)
+{
+ objset_t *os = spa->spa_meta_objset;
+ boolean_t supported;
+ zap_cursor_t zc;
+ zap_attribute_t za;
+ uint64_t obj = for_write ?
+ spa->spa_feat_for_write_obj : spa->spa_feat_for_read_obj;
+
+ supported = B_TRUE;
+ for (zap_cursor_init(&zc, os, obj);
+ zap_cursor_retrieve(&zc, &za) == 0;
+ zap_cursor_advance(&zc)) {
+ ASSERT(za.za_integer_length == sizeof (uint64_t) &&
+ za.za_num_integers == 1);
+
+ if (NULL != enabled_feat) {
+ fnvlist_add_uint64(enabled_feat, za.za_name,
+ za.za_first_integer);
+ }
+
+ if (za.za_first_integer != 0 &&
+ !zfeature_is_supported(za.za_name)) {
+ supported = B_FALSE;
+
+ if (NULL != unsup_feat) {
+ char *desc = "";
+ char buf[MAXPATHLEN];
+
+ if (zap_lookup(os, spa->spa_feat_desc_obj,
+ za.za_name, 1, sizeof (buf), buf) == 0)
+ desc = buf;
+
+ VERIFY(nvlist_add_string(unsup_feat, za.za_name,
+ desc) == 0);
+ }
+ }
+ }
+ zap_cursor_fini(&zc);
+
+ return (supported);
+}
+
+/*
+ * Use an in-memory cache of feature refcounts for quick retrieval.
+ *
+ * Note: well-designed features will not need to use this; they should
+ * use spa_feature_is_enabled() and spa_feature_is_active() instead.
+ * However, this is non-static for zdb, zhack, and spa_add_feature_stats().
+ */
+int
+feature_get_refcount(spa_t *spa, zfeature_info_t *feature, uint64_t *res)
+{
+ ASSERT(VALID_FEATURE_FID(feature->fi_feature));
+ if (spa->spa_feat_refcount_cache[feature->fi_feature] ==
+ SPA_FEATURE_DISABLED) {
+ return (SET_ERROR(ENOTSUP));
+ }
+ *res = spa->spa_feat_refcount_cache[feature->fi_feature];
+ return (0);
+}
+
+/*
+ * Note: well-designed features will not need to use this; they should
+ * use spa_feature_is_enabled() and spa_feature_is_active() instead.
+ * However, this is non-static for zdb and zhack.
+ */
+int
+feature_get_refcount_from_disk(spa_t *spa, zfeature_info_t *feature,
+ uint64_t *res)
+{
+ int err;
+ uint64_t refcount;
+ uint64_t zapobj = (feature->fi_flags & ZFEATURE_FLAG_READONLY_COMPAT) ?
+ spa->spa_feat_for_write_obj : spa->spa_feat_for_read_obj;
+
+ /*
+ * If the pool is currently being created, the feature objects may not
+ * have been allocated yet. Act as though all features are disabled.
+ */
+ if (zapobj == 0)
+ return (SET_ERROR(ENOTSUP));
+
+ err = zap_lookup(spa->spa_meta_objset, zapobj,
+ feature->fi_guid, sizeof (uint64_t), 1, &refcount);
+ if (err != 0) {
+ if (err == ENOENT)
+ return (SET_ERROR(ENOTSUP));
+ else
+ return (err);
+ }
+ *res = refcount;
+ return (0);
+}
+
+static int
+feature_get_enabled_txg(spa_t *spa, zfeature_info_t *feature, uint64_t *res)
+{
+ uint64_t enabled_txg_obj = spa->spa_feat_enabled_txg_obj;
+
+ ASSERT(zfeature_depends_on(feature->fi_feature,
+ SPA_FEATURE_ENABLED_TXG));
+
+ if (!spa_feature_is_enabled(spa, feature->fi_feature)) {
+ return (SET_ERROR(ENOTSUP));
+ }
+
+ ASSERT(enabled_txg_obj != 0);
+
+ VERIFY0(zap_lookup(spa->spa_meta_objset, spa->spa_feat_enabled_txg_obj,
+ feature->fi_guid, sizeof (uint64_t), 1, res));
+
+ return (0);
+}
+
+/*
+ * This function is non-static for zhack; it should otherwise not be used
+ * outside this file.
+ */
+void
+feature_sync(spa_t *spa, zfeature_info_t *feature, uint64_t refcount,
+ dmu_tx_t *tx)
+{
+ ASSERT(VALID_FEATURE_OR_NONE(feature->fi_feature));
+ uint64_t zapobj = (feature->fi_flags & ZFEATURE_FLAG_READONLY_COMPAT) ?
+ spa->spa_feat_for_write_obj : spa->spa_feat_for_read_obj;
+
+ VERIFY0(zap_update(spa->spa_meta_objset, zapobj, feature->fi_guid,
+ sizeof (uint64_t), 1, &refcount, tx));
+
+ /*
+ * feature_sync is called directly from zhack, allowing the
+ * creation of arbitrary features whose fi_feature field may
+ * be greater than SPA_FEATURES. When called from zhack, the
+ * zfeature_info_t object's fi_feature field will be set to
+ * SPA_FEATURE_NONE.
+ */
+ if (feature->fi_feature != SPA_FEATURE_NONE) {
+ uint64_t *refcount_cache =
+ &spa->spa_feat_refcount_cache[feature->fi_feature];
+ VERIFY3U(*refcount_cache, ==,
+ atomic_swap_64(refcount_cache, refcount));
+ }
+
+ if (refcount == 0)
+ spa_deactivate_mos_feature(spa, feature->fi_guid);
+ else if (feature->fi_flags & ZFEATURE_FLAG_MOS)
+ spa_activate_mos_feature(spa, feature->fi_guid, tx);
+}
+
+/*
+ * This function is non-static for zhack; it should otherwise not be used
+ * outside this file.
+ */
+void
+feature_enable_sync(spa_t *spa, zfeature_info_t *feature, dmu_tx_t *tx)
+{
+ uint64_t initial_refcount =
+ (feature->fi_flags & ZFEATURE_FLAG_ACTIVATE_ON_ENABLE) ? 1 : 0;
+ uint64_t zapobj = (feature->fi_flags & ZFEATURE_FLAG_READONLY_COMPAT) ?
+ spa->spa_feat_for_write_obj : spa->spa_feat_for_read_obj;
+
+ ASSERT(0 != zapobj);
+ ASSERT(zfeature_is_valid_guid(feature->fi_guid));
+ ASSERT3U(spa_version(spa), >=, SPA_VERSION_FEATURES);
+
+ /*
+ * If the feature is already enabled, ignore the request.
+ */
+ if (zap_contains(spa->spa_meta_objset, zapobj, feature->fi_guid) == 0)
+ return;
+
+ for (int i = 0; feature->fi_depends[i] != SPA_FEATURE_NONE; i++)
+ spa_feature_enable(spa, feature->fi_depends[i], tx);
+
+ VERIFY0(zap_update(spa->spa_meta_objset, spa->spa_feat_desc_obj,
+ feature->fi_guid, 1, strlen(feature->fi_desc) + 1,
+ feature->fi_desc, tx));
+
+ feature_sync(spa, feature, initial_refcount, tx);
+
+ if (spa_feature_is_enabled(spa, SPA_FEATURE_ENABLED_TXG)) {
+ uint64_t enabling_txg = dmu_tx_get_txg(tx);
+
+ if (spa->spa_feat_enabled_txg_obj == 0ULL) {
+ spa->spa_feat_enabled_txg_obj =
+ zap_create_link(spa->spa_meta_objset,
+ DMU_OTN_ZAP_METADATA, DMU_POOL_DIRECTORY_OBJECT,
+ DMU_POOL_FEATURE_ENABLED_TXG, tx);
+ }
+ spa_feature_incr(spa, SPA_FEATURE_ENABLED_TXG, tx);
+
+ VERIFY0(zap_add(spa->spa_meta_objset,
+ spa->spa_feat_enabled_txg_obj, feature->fi_guid,
+ sizeof (uint64_t), 1, &enabling_txg, tx));
+ }
+}
+
+static void
+feature_do_action(spa_t *spa, spa_feature_t fid, feature_action_t action,
+ dmu_tx_t *tx)
+{
+ uint64_t refcount;
+ zfeature_info_t *feature = &spa_feature_table[fid];
+ uint64_t zapobj = (feature->fi_flags & ZFEATURE_FLAG_READONLY_COMPAT) ?
+ spa->spa_feat_for_write_obj : spa->spa_feat_for_read_obj;
+
+ ASSERT(VALID_FEATURE_FID(fid));
+ ASSERT(0 != zapobj);
+ ASSERT(zfeature_is_valid_guid(feature->fi_guid));
+
+ ASSERT(dmu_tx_is_syncing(tx));
+ ASSERT3U(spa_version(spa), >=, SPA_VERSION_FEATURES);
+
+ VERIFY3U(feature_get_refcount(spa, feature, &refcount), !=, ENOTSUP);
+
+ switch (action) {
+ case FEATURE_ACTION_INCR:
+ VERIFY3U(refcount, !=, UINT64_MAX);
+ refcount++;
+ break;
+ case FEATURE_ACTION_DECR:
+ VERIFY3U(refcount, !=, 0);
+ refcount--;
+ break;
+ default:
+ ASSERT(0);
+ break;
+ }
+
+ feature_sync(spa, feature, refcount, tx);
+}
+
+void
+spa_feature_create_zap_objects(spa_t *spa, dmu_tx_t *tx)
+{
+ /*
+ * We create feature flags ZAP objects in two instances: during pool
+ * creation and during pool upgrade.
+ */
+ ASSERT(dsl_pool_sync_context(spa_get_dsl(spa)) || (!spa->spa_sync_on &&
+ tx->tx_txg == TXG_INITIAL));
+
+ spa->spa_feat_for_read_obj = zap_create_link(spa->spa_meta_objset,
+ DMU_OTN_ZAP_METADATA, DMU_POOL_DIRECTORY_OBJECT,
+ DMU_POOL_FEATURES_FOR_READ, tx);
+ spa->spa_feat_for_write_obj = zap_create_link(spa->spa_meta_objset,
+ DMU_OTN_ZAP_METADATA, DMU_POOL_DIRECTORY_OBJECT,
+ DMU_POOL_FEATURES_FOR_WRITE, tx);
+ spa->spa_feat_desc_obj = zap_create_link(spa->spa_meta_objset,
+ DMU_OTN_ZAP_METADATA, DMU_POOL_DIRECTORY_OBJECT,
+ DMU_POOL_FEATURE_DESCRIPTIONS, tx);
+}
+
+/*
+ * Enable any required dependencies, then enable the requested feature.
+ */
+void
+spa_feature_enable(spa_t *spa, spa_feature_t fid, dmu_tx_t *tx)
+{
+ ASSERT3U(spa_version(spa), >=, SPA_VERSION_FEATURES);
+ ASSERT(VALID_FEATURE_FID(fid));
+ feature_enable_sync(spa, &spa_feature_table[fid], tx);
+}
+
+void
+spa_feature_incr(spa_t *spa, spa_feature_t fid, dmu_tx_t *tx)
+{
+ feature_do_action(spa, fid, FEATURE_ACTION_INCR, tx);
+}
+
+void
+spa_feature_decr(spa_t *spa, spa_feature_t fid, dmu_tx_t *tx)
+{
+ feature_do_action(spa, fid, FEATURE_ACTION_DECR, tx);
+}
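+
+/*
+ * Illustrative sketch (not part of the original change): a feature
+ * implementation typically brackets its on-disk format change with these
+ * helpers in syncing context, for some fid in spa_feature_table:
+ *
+ *	if (spa_feature_is_enabled(spa, fid) &&
+ *	    !spa_feature_is_active(spa, fid))
+ *		spa_feature_incr(spa, fid, tx);	(first on-disk use)
+ *	...
+ *	spa_feature_decr(spa, fid, tx);	(format change fully undone)
+ */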
+
+boolean_t
+spa_feature_is_enabled(spa_t *spa, spa_feature_t fid)
+{
+ int err;
+ uint64_t refcount;
+
+ ASSERT(VALID_FEATURE_FID(fid));
+ if (spa_version(spa) < SPA_VERSION_FEATURES)
+ return (B_FALSE);
+
+ err = feature_get_refcount(spa, &spa_feature_table[fid], &refcount);
+ ASSERT(err == 0 || err == ENOTSUP);
+ return (err == 0);
+}
+
+boolean_t
+spa_feature_is_active(spa_t *spa, spa_feature_t fid)
+{
+ int err;
+ uint64_t refcount;
+
+ ASSERT(VALID_FEATURE_FID(fid));
+ if (spa_version(spa) < SPA_VERSION_FEATURES)
+ return (B_FALSE);
+
+ err = feature_get_refcount(spa, &spa_feature_table[fid], &refcount);
+ ASSERT(err == 0 || err == ENOTSUP);
+ return (err == 0 && refcount > 0);
+}
+
+/*
+ * For the feature specified by fid (which must depend on
+ * SPA_FEATURE_ENABLED_TXG), return via the txg OUT argument the TXG at
+ * which it was enabled.
+ *
+ * Returns B_TRUE if the feature is enabled, in which case txg is filled
+ * with the transaction group in which it was enabled; returns B_FALSE
+ * otherwise.
+ */
+boolean_t
+spa_feature_enabled_txg(spa_t *spa, spa_feature_t fid, uint64_t *txg)
+{
+ int err;
+
+ ASSERT(VALID_FEATURE_FID(fid));
+ if (spa_version(spa) < SPA_VERSION_FEATURES)
+ return (B_FALSE);
+
+ err = feature_get_enabled_txg(spa, &spa_feature_table[fid], txg);
+ ASSERT(err == 0 || err == ENOTSUP);
+
+ return (err == 0);
+}
diff --git a/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/zfs.conf b/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/zfs.conf
new file mode 100644
index 000000000000..09881909b804
--- /dev/null
+++ b/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/zfs.conf
@@ -0,0 +1,28 @@
+#
+# CDDL HEADER START
+#
+# The contents of this file are subject to the terms of the
+# Common Development and Distribution License, Version 1.0 only
+# (the "License"). You may not use this file except in compliance
+# with the License.
+#
+# You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
+# or http://www.opensolaris.org/os/licensing.
+# See the License for the specific language governing permissions
+# and limitations under the License.
+#
+# When distributing Covered Code, include this CDDL HEADER in each
+# file and include the License file at usr/src/OPENSOLARIS.LICENSE.
+# If applicable, add the following below this CDDL HEADER, with the
+# fields enclosed by brackets "[]" replaced with your own identifying
+# information: Portions Copyright [yyyy] [name of copyright owner]
+#
+# CDDL HEADER END
+#
+#
+# Copyright 2005 Sun Microsystems, Inc. All rights reserved.
+# Use is subject to license terms.
+#
+# ident "%Z%%M% %I% %E% SMI"
+#
+name="zfs" parent="pseudo";
diff --git a/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/zfs_acl.c b/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/zfs_acl.c
new file mode 100644
index 000000000000..a588c59b491c
--- /dev/null
+++ b/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/zfs_acl.c
@@ -0,0 +1,2778 @@
+/*
+ * CDDL HEADER START
+ *
+ * The contents of this file are subject to the terms of the
+ * Common Development and Distribution License (the "License").
+ * You may not use this file except in compliance with the License.
+ *
+ * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
+ * or http://www.opensolaris.org/os/licensing.
+ * See the License for the specific language governing permissions
+ * and limitations under the License.
+ *
+ * When distributing Covered Code, include this CDDL HEADER in each
+ * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
+ * If applicable, add the following below this CDDL HEADER, with the
+ * fields enclosed by brackets "[]" replaced with your own identifying
+ * information: Portions Copyright [yyyy] [name of copyright owner]
+ *
+ * CDDL HEADER END
+ */
+/*
+ * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved.
+ * Copyright (c) 2013 by Delphix. All rights reserved.
+ * Copyright 2017 Nexenta Systems, Inc. All rights reserved.
+ */
+
+#include <sys/types.h>
+#include <sys/param.h>
+#include <sys/time.h>
+#include <sys/systm.h>
+#include <sys/sysmacros.h>
+#include <sys/resource.h>
+#include <sys/vfs.h>
+#include <sys/vnode.h>
+#include <sys/file.h>
+#include <sys/stat.h>
+#include <sys/kmem.h>
+#include <sys/cmn_err.h>
+#include <sys/errno.h>
+#include <sys/unistd.h>
+#include <sys/sdt.h>
+#include <sys/fs/zfs.h>
+#include <sys/policy.h>
+#include <sys/zfs_znode.h>
+#include <sys/zfs_fuid.h>
+#include <sys/zfs_acl.h>
+#include <sys/zfs_dir.h>
+#include <sys/zfs_vfsops.h>
+#include <sys/dmu.h>
+#include <sys/dnode.h>
+#include <sys/zap.h>
+#include <sys/sa.h>
+#include <acl/acl_common.h>
+
+#define ALLOW ACE_ACCESS_ALLOWED_ACE_TYPE
+#define DENY ACE_ACCESS_DENIED_ACE_TYPE
+#define MAX_ACE_TYPE ACE_SYSTEM_ALARM_CALLBACK_OBJECT_ACE_TYPE
+#define MIN_ACE_TYPE ALLOW
+
+#define OWNING_GROUP (ACE_GROUP|ACE_IDENTIFIER_GROUP)
+#define EVERYONE_ALLOW_MASK (ACE_READ_ACL|ACE_READ_ATTRIBUTES | \
+ ACE_READ_NAMED_ATTRS|ACE_SYNCHRONIZE)
+#define EVERYONE_DENY_MASK (ACE_WRITE_ACL|ACE_WRITE_OWNER | \
+ ACE_WRITE_ATTRIBUTES|ACE_WRITE_NAMED_ATTRS)
+#define OWNER_ALLOW_MASK (ACE_WRITE_ACL | ACE_WRITE_OWNER | \
+ ACE_WRITE_ATTRIBUTES|ACE_WRITE_NAMED_ATTRS)
+
+#define ZFS_CHECKED_MASKS (ACE_READ_ACL|ACE_READ_ATTRIBUTES|ACE_READ_DATA| \
+ ACE_READ_NAMED_ATTRS|ACE_WRITE_DATA|ACE_WRITE_ATTRIBUTES| \
+ ACE_WRITE_NAMED_ATTRS|ACE_APPEND_DATA|ACE_EXECUTE|ACE_WRITE_OWNER| \
+ ACE_WRITE_ACL|ACE_DELETE|ACE_DELETE_CHILD|ACE_SYNCHRONIZE)
+
+#define WRITE_MASK_DATA (ACE_WRITE_DATA|ACE_APPEND_DATA|ACE_WRITE_NAMED_ATTRS)
+#define WRITE_MASK_ATTRS (ACE_WRITE_ACL|ACE_WRITE_OWNER|ACE_WRITE_ATTRIBUTES| \
+ ACE_DELETE|ACE_DELETE_CHILD)
+#define WRITE_MASK (WRITE_MASK_DATA|WRITE_MASK_ATTRS)
+
+#define OGE_CLEAR (ACE_READ_DATA|ACE_LIST_DIRECTORY|ACE_WRITE_DATA| \
+ ACE_ADD_FILE|ACE_APPEND_DATA|ACE_ADD_SUBDIRECTORY|ACE_EXECUTE)
+
+#define OKAY_MASK_BITS (ACE_READ_DATA|ACE_LIST_DIRECTORY|ACE_WRITE_DATA| \
+ ACE_ADD_FILE|ACE_APPEND_DATA|ACE_ADD_SUBDIRECTORY|ACE_EXECUTE)
+
+#define ALL_INHERIT (ACE_FILE_INHERIT_ACE|ACE_DIRECTORY_INHERIT_ACE | \
+ ACE_NO_PROPAGATE_INHERIT_ACE|ACE_INHERIT_ONLY_ACE|ACE_INHERITED_ACE)
+
+#define RESTRICTED_CLEAR (ACE_WRITE_ACL|ACE_WRITE_OWNER)
+
+#define V4_ACL_WIDE_FLAGS (ZFS_ACL_AUTO_INHERIT|ZFS_ACL_DEFAULTED|\
+ ZFS_ACL_PROTECTED)
+
+#define ZFS_ACL_WIDE_FLAGS (V4_ACL_WIDE_FLAGS|ZFS_ACL_TRIVIAL|ZFS_INHERIT_ACE|\
+ ZFS_ACL_OBJ_ACE)
+
+#define ALL_MODE_EXECS (S_IXUSR | S_IXGRP | S_IXOTH)
+
+static uint16_t
+zfs_ace_v0_get_type(void *acep)
+{
+ return (((zfs_oldace_t *)acep)->z_type);
+}
+
+static uint16_t
+zfs_ace_v0_get_flags(void *acep)
+{
+ return (((zfs_oldace_t *)acep)->z_flags);
+}
+
+static uint32_t
+zfs_ace_v0_get_mask(void *acep)
+{
+ return (((zfs_oldace_t *)acep)->z_access_mask);
+}
+
+static uint64_t
+zfs_ace_v0_get_who(void *acep)
+{
+ return (((zfs_oldace_t *)acep)->z_fuid);
+}
+
+static void
+zfs_ace_v0_set_type(void *acep, uint16_t type)
+{
+ ((zfs_oldace_t *)acep)->z_type = type;
+}
+
+static void
+zfs_ace_v0_set_flags(void *acep, uint16_t flags)
+{
+ ((zfs_oldace_t *)acep)->z_flags = flags;
+}
+
+static void
+zfs_ace_v0_set_mask(void *acep, uint32_t mask)
+{
+ ((zfs_oldace_t *)acep)->z_access_mask = mask;
+}
+
+static void
+zfs_ace_v0_set_who(void *acep, uint64_t who)
+{
+ ((zfs_oldace_t *)acep)->z_fuid = who;
+}
+
+/*ARGSUSED*/
+static size_t
+zfs_ace_v0_size(void *acep)
+{
+ return (sizeof (zfs_oldace_t));
+}
+
+static size_t
+zfs_ace_v0_abstract_size(void)
+{
+ return (sizeof (zfs_oldace_t));
+}
+
+static int
+zfs_ace_v0_mask_off(void)
+{
+ return (offsetof(zfs_oldace_t, z_access_mask));
+}
+
+/*ARGSUSED*/
+static int
+zfs_ace_v0_data(void *acep, void **datap)
+{
+ *datap = NULL;
+ return (0);
+}
+
+static acl_ops_t zfs_acl_v0_ops = {
+ zfs_ace_v0_get_mask,
+ zfs_ace_v0_set_mask,
+ zfs_ace_v0_get_flags,
+ zfs_ace_v0_set_flags,
+ zfs_ace_v0_get_type,
+ zfs_ace_v0_set_type,
+ zfs_ace_v0_get_who,
+ zfs_ace_v0_set_who,
+ zfs_ace_v0_size,
+ zfs_ace_v0_abstract_size,
+ zfs_ace_v0_mask_off,
+ zfs_ace_v0_data
+};
+
+static uint16_t
+zfs_ace_fuid_get_type(void *acep)
+{
+ return (((zfs_ace_hdr_t *)acep)->z_type);
+}
+
+static uint16_t
+zfs_ace_fuid_get_flags(void *acep)
+{
+ return (((zfs_ace_hdr_t *)acep)->z_flags);
+}
+
+static uint32_t
+zfs_ace_fuid_get_mask(void *acep)
+{
+ return (((zfs_ace_hdr_t *)acep)->z_access_mask);
+}
+
+static uint64_t
+zfs_ace_fuid_get_who(void *args)
+{
+ uint16_t entry_type;
+ zfs_ace_t *acep = args;
+
+ entry_type = acep->z_hdr.z_flags & ACE_TYPE_FLAGS;
+
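+	/* Abstract entries (owner@, group@, everyone@) have no explicit who. */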
+ if (entry_type == ACE_OWNER || entry_type == OWNING_GROUP ||
+ entry_type == ACE_EVERYONE)
+ return (-1);
+ return (((zfs_ace_t *)acep)->z_fuid);
+}
+
+static void
+zfs_ace_fuid_set_type(void *acep, uint16_t type)
+{
+ ((zfs_ace_hdr_t *)acep)->z_type = type;
+}
+
+static void
+zfs_ace_fuid_set_flags(void *acep, uint16_t flags)
+{
+ ((zfs_ace_hdr_t *)acep)->z_flags = flags;
+}
+
+static void
+zfs_ace_fuid_set_mask(void *acep, uint32_t mask)
+{
+ ((zfs_ace_hdr_t *)acep)->z_access_mask = mask;
+}
+
+static void
+zfs_ace_fuid_set_who(void *arg, uint64_t who)
+{
+ zfs_ace_t *acep = arg;
+
+ uint16_t entry_type = acep->z_hdr.z_flags & ACE_TYPE_FLAGS;
+
+ if (entry_type == ACE_OWNER || entry_type == OWNING_GROUP ||
+ entry_type == ACE_EVERYONE)
+ return;
+ acep->z_fuid = who;
+}
+
+static size_t
+zfs_ace_fuid_size(void *acep)
+{
+ zfs_ace_hdr_t *zacep = acep;
+ uint16_t entry_type;
+
+ switch (zacep->z_type) {
+ case ACE_ACCESS_ALLOWED_OBJECT_ACE_TYPE:
+ case ACE_ACCESS_DENIED_OBJECT_ACE_TYPE:
+ case ACE_SYSTEM_AUDIT_OBJECT_ACE_TYPE:
+ case ACE_SYSTEM_ALARM_OBJECT_ACE_TYPE:
+ return (sizeof (zfs_object_ace_t));
+ case ALLOW:
+ case DENY:
+ entry_type =
+ (((zfs_ace_hdr_t *)acep)->z_flags & ACE_TYPE_FLAGS);
+ if (entry_type == ACE_OWNER ||
+ entry_type == OWNING_GROUP ||
+ entry_type == ACE_EVERYONE)
+ return (sizeof (zfs_ace_hdr_t));
+ /*FALLTHROUGH*/
+ default:
+ return (sizeof (zfs_ace_t));
+ }
+}
+
+static size_t
+zfs_ace_fuid_abstract_size(void)
+{
+ return (sizeof (zfs_ace_hdr_t));
+}
+
+static int
+zfs_ace_fuid_mask_off(void)
+{
+ return (offsetof(zfs_ace_hdr_t, z_access_mask));
+}
+
+static int
+zfs_ace_fuid_data(void *acep, void **datap)
+{
+ zfs_ace_t *zacep = acep;
+ zfs_object_ace_t *zobjp;
+
+ switch (zacep->z_hdr.z_type) {
+ case ACE_ACCESS_ALLOWED_OBJECT_ACE_TYPE:
+ case ACE_ACCESS_DENIED_OBJECT_ACE_TYPE:
+ case ACE_SYSTEM_AUDIT_OBJECT_ACE_TYPE:
+ case ACE_SYSTEM_ALARM_OBJECT_ACE_TYPE:
+ zobjp = acep;
+ *datap = (caddr_t)zobjp + sizeof (zfs_ace_t);
+ return (sizeof (zfs_object_ace_t) - sizeof (zfs_ace_t));
+ default:
+ *datap = NULL;
+ return (0);
+ }
+}
+
+static acl_ops_t zfs_acl_fuid_ops = {
+ zfs_ace_fuid_get_mask,
+ zfs_ace_fuid_set_mask,
+ zfs_ace_fuid_get_flags,
+ zfs_ace_fuid_set_flags,
+ zfs_ace_fuid_get_type,
+ zfs_ace_fuid_set_type,
+ zfs_ace_fuid_get_who,
+ zfs_ace_fuid_set_who,
+ zfs_ace_fuid_size,
+ zfs_ace_fuid_abstract_size,
+ zfs_ace_fuid_mask_off,
+ zfs_ace_fuid_data
+};
+
+/*
+ * The following three functions are provided for compatibility with
+ * older ZPL versions in order to determine whether the file used to have
+ * an external ACL and what version of ACL previously existed on the
+ * file. Would really be nice to not need this, sigh.
+ */
+uint64_t
+zfs_external_acl(znode_t *zp)
+{
+ zfs_acl_phys_t acl_phys;
+ int error;
+
+ if (zp->z_is_sa)
+ return (0);
+
+ /*
+	 * Need to deal with a potential race where zfs_sa_upgrade
+	 * could cause z_is_sa to change.
+ *
+ * If the lookup fails then the state of z_is_sa should have
+ * changed.
+ */
+
+ if ((error = sa_lookup(zp->z_sa_hdl, SA_ZPL_ZNODE_ACL(zp->z_zfsvfs),
+ &acl_phys, sizeof (acl_phys))) == 0)
+ return (acl_phys.z_acl_extern_obj);
+ else {
+ /*
+		 * After upgrade, the SA_ZPL_ZNODE_ACL should have been
+		 * removed.
+ */
+ VERIFY(zp->z_is_sa && error == ENOENT);
+ return (0);
+ }
+}
+
+/*
+ * Determine size of ACL in bytes
+ *
+ * This is more complicated than it should be since we have to deal
+ * with old external ACLs.
+ */
+static int
+zfs_acl_znode_info(znode_t *zp, int *aclsize, int *aclcount,
+ zfs_acl_phys_t *aclphys)
+{
+ zfsvfs_t *zfsvfs = zp->z_zfsvfs;
+ uint64_t acl_count;
+ int size;
+ int error;
+
+ ASSERT(MUTEX_HELD(&zp->z_acl_lock));
+ if (zp->z_is_sa) {
+ if ((error = sa_size(zp->z_sa_hdl, SA_ZPL_DACL_ACES(zfsvfs),
+ &size)) != 0)
+ return (error);
+ *aclsize = size;
+ if ((error = sa_lookup(zp->z_sa_hdl, SA_ZPL_DACL_COUNT(zfsvfs),
+ &acl_count, sizeof (acl_count))) != 0)
+ return (error);
+ *aclcount = acl_count;
+ } else {
+ if ((error = sa_lookup(zp->z_sa_hdl, SA_ZPL_ZNODE_ACL(zfsvfs),
+ aclphys, sizeof (*aclphys))) != 0)
+ return (error);
+
+ if (aclphys->z_acl_version == ZFS_ACL_VERSION_INITIAL) {
+ *aclsize = ZFS_ACL_SIZE(aclphys->z_acl_size);
+ *aclcount = aclphys->z_acl_size;
+ } else {
+ *aclsize = aclphys->z_acl_size;
+ *aclcount = aclphys->z_acl_count;
+ }
+ }
+ return (0);
+}
+
+int
+zfs_znode_acl_version(znode_t *zp)
+{
+ zfs_acl_phys_t acl_phys;
+
+ if (zp->z_is_sa)
+ return (ZFS_ACL_VERSION_FUID);
+ else {
+ int error;
+
+ /*
+		 * Need to deal with a potential race where zfs_sa_upgrade
+		 * could cause z_is_sa to change.
+ *
+ * If the lookup fails then the state of z_is_sa should have
+ * changed.
+ */
+ if ((error = sa_lookup(zp->z_sa_hdl,
+ SA_ZPL_ZNODE_ACL(zp->z_zfsvfs),
+ &acl_phys, sizeof (acl_phys))) == 0)
+ return (acl_phys.z_acl_version);
+ else {
+ /*
+			 * After upgrade, SA_ZPL_ZNODE_ACL should have
+ * been removed.
+ */
+ VERIFY(zp->z_is_sa && error == ENOENT);
+ return (ZFS_ACL_VERSION_FUID);
+ }
+ }
+}
+
+static int
+zfs_acl_version(int version)
+{
+ if (version < ZPL_VERSION_FUID)
+ return (ZFS_ACL_VERSION_INITIAL);
+ else
+ return (ZFS_ACL_VERSION_FUID);
+}
+
+static int
+zfs_acl_version_zp(znode_t *zp)
+{
+ return (zfs_acl_version(zp->z_zfsvfs->z_version));
+}
+
+zfs_acl_t *
+zfs_acl_alloc(int vers)
+{
+ zfs_acl_t *aclp;
+
+ aclp = kmem_zalloc(sizeof (zfs_acl_t), KM_SLEEP);
+ list_create(&aclp->z_acl, sizeof (zfs_acl_node_t),
+ offsetof(zfs_acl_node_t, z_next));
+ aclp->z_version = vers;
+ if (vers == ZFS_ACL_VERSION_FUID)
+ aclp->z_ops = zfs_acl_fuid_ops;
+ else
+ aclp->z_ops = zfs_acl_v0_ops;
+ return (aclp);
+}
+
+zfs_acl_node_t *
+zfs_acl_node_alloc(size_t bytes)
+{
+ zfs_acl_node_t *aclnode;
+
+ aclnode = kmem_zalloc(sizeof (zfs_acl_node_t), KM_SLEEP);
+ if (bytes) {
+ aclnode->z_acldata = kmem_alloc(bytes, KM_SLEEP);
+ aclnode->z_allocdata = aclnode->z_acldata;
+ aclnode->z_allocsize = bytes;
+ aclnode->z_size = bytes;
+ }
+
+ return (aclnode);
+}
+
+static void
+zfs_acl_node_free(zfs_acl_node_t *aclnode)
+{
+ if (aclnode->z_allocsize)
+ kmem_free(aclnode->z_allocdata, aclnode->z_allocsize);
+ kmem_free(aclnode, sizeof (zfs_acl_node_t));
+}
+
+static void
+zfs_acl_release_nodes(zfs_acl_t *aclp)
+{
+ zfs_acl_node_t *aclnode;
+
+ while (aclnode = list_head(&aclp->z_acl)) {
+ list_remove(&aclp->z_acl, aclnode);
+ zfs_acl_node_free(aclnode);
+ }
+ aclp->z_acl_count = 0;
+ aclp->z_acl_bytes = 0;
+}
+
+void
+zfs_acl_free(zfs_acl_t *aclp)
+{
+ zfs_acl_release_nodes(aclp);
+ list_destroy(&aclp->z_acl);
+ kmem_free(aclp, sizeof (zfs_acl_t));
+}
+
+static boolean_t
+zfs_acl_valid_ace_type(uint_t type, uint_t flags)
+{
+ uint16_t entry_type;
+
+ switch (type) {
+ case ALLOW:
+ case DENY:
+ case ACE_SYSTEM_AUDIT_ACE_TYPE:
+ case ACE_SYSTEM_ALARM_ACE_TYPE:
+ entry_type = flags & ACE_TYPE_FLAGS;
+ return (entry_type == ACE_OWNER ||
+ entry_type == OWNING_GROUP ||
+ entry_type == ACE_EVERYONE || entry_type == 0 ||
+ entry_type == ACE_IDENTIFIER_GROUP);
+ default:
+ if (type >= MIN_ACE_TYPE && type <= MAX_ACE_TYPE)
+ return (B_TRUE);
+ }
+ return (B_FALSE);
+}
+
+static boolean_t
+zfs_ace_valid(vtype_t obj_type, zfs_acl_t *aclp, uint16_t type, uint16_t iflags)
+{
+ /*
+ * first check type of entry
+ */
+
+ if (!zfs_acl_valid_ace_type(type, iflags))
+ return (B_FALSE);
+
+ switch (type) {
+ case ACE_ACCESS_ALLOWED_OBJECT_ACE_TYPE:
+ case ACE_ACCESS_DENIED_OBJECT_ACE_TYPE:
+ case ACE_SYSTEM_AUDIT_OBJECT_ACE_TYPE:
+ case ACE_SYSTEM_ALARM_OBJECT_ACE_TYPE:
+ if (aclp->z_version < ZFS_ACL_VERSION_FUID)
+ return (B_FALSE);
+ aclp->z_hints |= ZFS_ACL_OBJ_ACE;
+ }
+
+ /*
+ * next check inheritance level flags
+ */
+
+ if (obj_type == VDIR &&
+ (iflags & (ACE_FILE_INHERIT_ACE|ACE_DIRECTORY_INHERIT_ACE)))
+ aclp->z_hints |= ZFS_INHERIT_ACE;
+
+ if (iflags & (ACE_INHERIT_ONLY_ACE|ACE_NO_PROPAGATE_INHERIT_ACE)) {
+ if ((iflags & (ACE_FILE_INHERIT_ACE|
+ ACE_DIRECTORY_INHERIT_ACE)) == 0) {
+ return (B_FALSE);
+ }
+ }
+
+ return (B_TRUE);
+}
+
+static void *
+zfs_acl_next_ace(zfs_acl_t *aclp, void *start, uint64_t *who,
+ uint32_t *access_mask, uint16_t *iflags, uint16_t *type)
+{
+ zfs_acl_node_t *aclnode;
+
+ ASSERT(aclp);
+
+ if (start == NULL) {
+ aclnode = list_head(&aclp->z_acl);
+ if (aclnode == NULL)
+ return (NULL);
+
+ aclp->z_next_ace = aclnode->z_acldata;
+ aclp->z_curr_node = aclnode;
+ aclnode->z_ace_idx = 0;
+ }
+
+ aclnode = aclp->z_curr_node;
+
+ if (aclnode == NULL)
+ return (NULL);
+
+ if (aclnode->z_ace_idx >= aclnode->z_ace_count) {
+ aclnode = list_next(&aclp->z_acl, aclnode);
+ if (aclnode == NULL)
+ return (NULL);
+ else {
+ aclp->z_curr_node = aclnode;
+ aclnode->z_ace_idx = 0;
+ aclp->z_next_ace = aclnode->z_acldata;
+ }
+ }
+
+ if (aclnode->z_ace_idx < aclnode->z_ace_count) {
+ void *acep = aclp->z_next_ace;
+ size_t ace_size;
+
+ /*
+ * Make sure we don't overstep our bounds
+ */
+ ace_size = aclp->z_ops.ace_size(acep);
+
+ if (((caddr_t)acep + ace_size) >
+ ((caddr_t)aclnode->z_acldata + aclnode->z_size)) {
+ return (NULL);
+ }
+
+ *iflags = aclp->z_ops.ace_flags_get(acep);
+ *type = aclp->z_ops.ace_type_get(acep);
+ *access_mask = aclp->z_ops.ace_mask_get(acep);
+ *who = aclp->z_ops.ace_who_get(acep);
+ aclp->z_next_ace = (caddr_t)aclp->z_next_ace + ace_size;
+ aclnode->z_ace_idx++;
+
+ return ((void *)acep);
+ }
+ return (NULL);
+}
+
+/*ARGSUSED*/
+static uint64_t
+zfs_ace_walk(void *datap, uint64_t cookie, int aclcnt,
+ uint16_t *flags, uint16_t *type, uint32_t *mask)
+{
+ zfs_acl_t *aclp = datap;
+ zfs_ace_hdr_t *acep = (zfs_ace_hdr_t *)(uintptr_t)cookie;
+ uint64_t who;
+
+ acep = zfs_acl_next_ace(aclp, acep, &who, mask,
+ flags, type);
+ return ((uint64_t)(uintptr_t)acep);
+}
+
+static zfs_acl_node_t *
+zfs_acl_curr_node(zfs_acl_t *aclp)
+{
+ ASSERT(aclp->z_curr_node);
+ return (aclp->z_curr_node);
+}
+
+/*
+ * Copy ACE to internal ZFS format.
+ * While processing the ACL, each ACE will be validated for correctness.
+ * ACE FUIDs will be created later.
+ */
+int
+zfs_copy_ace_2_fuid(zfsvfs_t *zfsvfs, vtype_t obj_type, zfs_acl_t *aclp,
+ void *datap, zfs_ace_t *z_acl, uint64_t aclcnt, size_t *size,
+ zfs_fuid_info_t **fuidp, cred_t *cr)
+{
+ int i;
+ uint16_t entry_type;
+ zfs_ace_t *aceptr = z_acl;
+ ace_t *acep = datap;
+ zfs_object_ace_t *zobjacep;
+ ace_object_t *aceobjp;
+
+ for (i = 0; i != aclcnt; i++) {
+ aceptr->z_hdr.z_access_mask = acep->a_access_mask;
+ aceptr->z_hdr.z_flags = acep->a_flags;
+ aceptr->z_hdr.z_type = acep->a_type;
+ entry_type = aceptr->z_hdr.z_flags & ACE_TYPE_FLAGS;
+ if (entry_type != ACE_OWNER && entry_type != OWNING_GROUP &&
+ entry_type != ACE_EVERYONE) {
+ aceptr->z_fuid = zfs_fuid_create(zfsvfs, acep->a_who,
+ cr, (entry_type == 0) ?
+ ZFS_ACE_USER : ZFS_ACE_GROUP, fuidp);
+ }
+
+ /*
+ * Make sure ACE is valid
+ */
+ if (zfs_ace_valid(obj_type, aclp, aceptr->z_hdr.z_type,
+ aceptr->z_hdr.z_flags) != B_TRUE)
+ return (SET_ERROR(EINVAL));
+
+ switch (acep->a_type) {
+ case ACE_ACCESS_ALLOWED_OBJECT_ACE_TYPE:
+ case ACE_ACCESS_DENIED_OBJECT_ACE_TYPE:
+ case ACE_SYSTEM_AUDIT_OBJECT_ACE_TYPE:
+ case ACE_SYSTEM_ALARM_OBJECT_ACE_TYPE:
+ zobjacep = (zfs_object_ace_t *)aceptr;
+ aceobjp = (ace_object_t *)acep;
+
+ bcopy(aceobjp->a_obj_type, zobjacep->z_object_type,
+ sizeof (aceobjp->a_obj_type));
+ bcopy(aceobjp->a_inherit_obj_type,
+ zobjacep->z_inherit_type,
+ sizeof (aceobjp->a_inherit_obj_type));
+ acep = (ace_t *)((caddr_t)acep + sizeof (ace_object_t));
+ break;
+ default:
+ acep = (ace_t *)((caddr_t)acep + sizeof (ace_t));
+ }
+
+ aceptr = (zfs_ace_t *)((caddr_t)aceptr +
+ aclp->z_ops.ace_size(aceptr));
+ }
+
+ *size = (caddr_t)aceptr - (caddr_t)z_acl;
+
+ return (0);
+}
+
+/*
+ * Copy ZFS ACEs to fixed size ace_t layout
+ */
+static void
+zfs_copy_fuid_2_ace(zfsvfs_t *zfsvfs, zfs_acl_t *aclp, cred_t *cr,
+ void *datap, int filter)
+{
+ uint64_t who;
+ uint32_t access_mask;
+ uint16_t iflags, type;
+ zfs_ace_hdr_t *zacep = NULL;
+ ace_t *acep = datap;
+ ace_object_t *objacep;
+ zfs_object_ace_t *zobjacep;
+ size_t ace_size;
+ uint16_t entry_type;
+
+ while (zacep = zfs_acl_next_ace(aclp, zacep,
+ &who, &access_mask, &iflags, &type)) {
+
+ switch (type) {
+ case ACE_ACCESS_ALLOWED_OBJECT_ACE_TYPE:
+ case ACE_ACCESS_DENIED_OBJECT_ACE_TYPE:
+ case ACE_SYSTEM_AUDIT_OBJECT_ACE_TYPE:
+ case ACE_SYSTEM_ALARM_OBJECT_ACE_TYPE:
+ if (filter) {
+ continue;
+ }
+ zobjacep = (zfs_object_ace_t *)zacep;
+ objacep = (ace_object_t *)acep;
+ bcopy(zobjacep->z_object_type,
+ objacep->a_obj_type,
+ sizeof (zobjacep->z_object_type));
+ bcopy(zobjacep->z_inherit_type,
+ objacep->a_inherit_obj_type,
+ sizeof (zobjacep->z_inherit_type));
+ ace_size = sizeof (ace_object_t);
+ break;
+ default:
+ ace_size = sizeof (ace_t);
+ break;
+ }
+
+ entry_type = (iflags & ACE_TYPE_FLAGS);
+ if ((entry_type != ACE_OWNER &&
+ entry_type != OWNING_GROUP &&
+ entry_type != ACE_EVERYONE)) {
+ acep->a_who = zfs_fuid_map_id(zfsvfs, who,
+ cr, (entry_type & ACE_IDENTIFIER_GROUP) ?
+ ZFS_ACE_GROUP : ZFS_ACE_USER);
+ } else {
+ acep->a_who = (uid_t)(int64_t)who;
+ }
+ acep->a_access_mask = access_mask;
+ acep->a_flags = iflags;
+ acep->a_type = type;
+ acep = (ace_t *)((caddr_t)acep + ace_size);
+ }
+}
+
+static int
+zfs_copy_ace_2_oldace(vtype_t obj_type, zfs_acl_t *aclp, ace_t *acep,
+ zfs_oldace_t *z_acl, int aclcnt, size_t *size)
+{
+ int i;
+ zfs_oldace_t *aceptr = z_acl;
+
+ for (i = 0; i != aclcnt; i++, aceptr++) {
+ aceptr->z_access_mask = acep[i].a_access_mask;
+ aceptr->z_type = acep[i].a_type;
+ aceptr->z_flags = acep[i].a_flags;
+ aceptr->z_fuid = acep[i].a_who;
+ /*
+ * Make sure ACE is valid
+ */
+ if (zfs_ace_valid(obj_type, aclp, aceptr->z_type,
+ aceptr->z_flags) != B_TRUE)
+ return (SET_ERROR(EINVAL));
+ }
+ *size = (caddr_t)aceptr - (caddr_t)z_acl;
+ return (0);
+}
+
+/*
+ * Convert old ACL format to new.
+ */
+void
+zfs_acl_xform(znode_t *zp, zfs_acl_t *aclp, cred_t *cr)
+{
+ zfs_oldace_t *oldaclp;
+ int i;
+ uint16_t type, iflags;
+ uint32_t access_mask;
+ uint64_t who;
+ void *cookie = NULL;
+ zfs_acl_node_t *newaclnode;
+
+ ASSERT(aclp->z_version == ZFS_ACL_VERSION_INITIAL);
+ /*
+	 * First create the ACEs in a contiguous piece of memory
+ * for zfs_copy_ace_2_fuid().
+ *
+ * We only convert an ACL once, so this won't happen
+	 * every time.
+ */
+ oldaclp = kmem_alloc(sizeof (zfs_oldace_t) * aclp->z_acl_count,
+ KM_SLEEP);
+ i = 0;
+ while (cookie = zfs_acl_next_ace(aclp, cookie, &who,
+ &access_mask, &iflags, &type)) {
+ oldaclp[i].z_flags = iflags;
+ oldaclp[i].z_type = type;
+ oldaclp[i].z_fuid = who;
+ oldaclp[i++].z_access_mask = access_mask;
+ }
+
+ newaclnode = zfs_acl_node_alloc(aclp->z_acl_count *
+ sizeof (zfs_object_ace_t));
+ aclp->z_ops = zfs_acl_fuid_ops;
+ VERIFY(zfs_copy_ace_2_fuid(zp->z_zfsvfs, ZTOV(zp)->v_type, aclp,
+ oldaclp, newaclnode->z_acldata, aclp->z_acl_count,
+ &newaclnode->z_size, NULL, cr) == 0);
+ newaclnode->z_ace_count = aclp->z_acl_count;
+ aclp->z_version = ZFS_ACL_VERSION;
+ kmem_free(oldaclp, aclp->z_acl_count * sizeof (zfs_oldace_t));
+
+ /*
+ * Release all previous ACL nodes
+ */
+
+ zfs_acl_release_nodes(aclp);
+
+ list_insert_head(&aclp->z_acl, newaclnode);
+
+ aclp->z_acl_bytes = newaclnode->z_size;
+ aclp->z_acl_count = newaclnode->z_ace_count;
+
+}
+
+/*
+ * Convert unix access mask to v4 access mask
+ */
+static uint32_t
+zfs_unix_to_v4(uint32_t access_mask)
+{
+ uint32_t new_mask = 0;
+
+ if (access_mask & S_IXOTH)
+ new_mask |= ACE_EXECUTE;
+ if (access_mask & S_IWOTH)
+ new_mask |= ACE_WRITE_DATA;
+ if (access_mask & S_IROTH)
+ new_mask |= ACE_READ_DATA;
+ return (new_mask);
+}
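+
+/*
+ * A minimal userland sketch of the mapping above, for illustration;
+ * the ACE_* constants are assumed to come from sys/acl.h. Callers
+ * shift the mode so the rwx triplet of interest lands in the "other"
+ * bits, e.g. zfs_zaccess_rwx() below passes (mode >> 6) to map the
+ * owner bits.
+ */
+#if 0
+#include <assert.h>
+#include <stdint.h>
+#include <sys/stat.h>
+
+static uint32_t
+unix_to_v4_model(uint32_t mode_bits)
+{
+	uint32_t new_mask = 0;
+
+	if (mode_bits & S_IXOTH)
+		new_mask |= ACE_EXECUTE;
+	if (mode_bits & S_IWOTH)
+		new_mask |= ACE_WRITE_DATA;
+	if (mode_bits & S_IROTH)
+		new_mask |= ACE_READ_DATA;
+	return (new_mask);
+}
+
+static void
+unix_to_v4_example(void)
+{
+	/* 0750: the owner triplet (rwx) maps to read|write|execute. */
+	assert(unix_to_v4_model(0750 >> 6) ==
+	    (ACE_READ_DATA | ACE_WRITE_DATA | ACE_EXECUTE));
+}
+#endif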
+
+static void
+zfs_set_ace(zfs_acl_t *aclp, void *acep, uint32_t access_mask,
+ uint16_t access_type, uint64_t fuid, uint16_t entry_type)
+{
+ uint16_t type = entry_type & ACE_TYPE_FLAGS;
+
+ aclp->z_ops.ace_mask_set(acep, access_mask);
+ aclp->z_ops.ace_type_set(acep, access_type);
+ aclp->z_ops.ace_flags_set(acep, entry_type);
+ if ((type != ACE_OWNER && type != OWNING_GROUP &&
+ type != ACE_EVERYONE))
+ aclp->z_ops.ace_who_set(acep, fuid);
+}
+
+/*
+ * Determine mode of file based on ACL.
+ */
+uint64_t
+zfs_mode_compute(uint64_t fmode, zfs_acl_t *aclp,
+ uint64_t *pflags, uint64_t fuid, uint64_t fgid)
+{
+ int entry_type;
+ mode_t mode;
+ mode_t seen = 0;
+ zfs_ace_hdr_t *acep = NULL;
+ uint64_t who;
+ uint16_t iflags, type;
+ uint32_t access_mask;
+ boolean_t an_exec_denied = B_FALSE;
+
+ mode = (fmode & (S_IFMT | S_ISUID | S_ISGID | S_ISVTX));
+
+ while (acep = zfs_acl_next_ace(aclp, acep, &who,
+ &access_mask, &iflags, &type)) {
+
+ if (!zfs_acl_valid_ace_type(type, iflags))
+ continue;
+
+ entry_type = (iflags & ACE_TYPE_FLAGS);
+
+ /*
+ * Skip over any inherit_only ACEs
+ */
+ if (iflags & ACE_INHERIT_ONLY_ACE)
+ continue;
+
+ if (entry_type == ACE_OWNER || (entry_type == 0 &&
+ who == fuid)) {
+ if ((access_mask & ACE_READ_DATA) &&
+ (!(seen & S_IRUSR))) {
+ seen |= S_IRUSR;
+ if (type == ALLOW) {
+ mode |= S_IRUSR;
+ }
+ }
+ if ((access_mask & ACE_WRITE_DATA) &&
+ (!(seen & S_IWUSR))) {
+ seen |= S_IWUSR;
+ if (type == ALLOW) {
+ mode |= S_IWUSR;
+ }
+ }
+ if ((access_mask & ACE_EXECUTE) &&
+ (!(seen & S_IXUSR))) {
+ seen |= S_IXUSR;
+ if (type == ALLOW) {
+ mode |= S_IXUSR;
+ }
+ }
+ } else if (entry_type == OWNING_GROUP ||
+ (entry_type == ACE_IDENTIFIER_GROUP && who == fgid)) {
+ if ((access_mask & ACE_READ_DATA) &&
+ (!(seen & S_IRGRP))) {
+ seen |= S_IRGRP;
+ if (type == ALLOW) {
+ mode |= S_IRGRP;
+ }
+ }
+ if ((access_mask & ACE_WRITE_DATA) &&
+ (!(seen & S_IWGRP))) {
+ seen |= S_IWGRP;
+ if (type == ALLOW) {
+ mode |= S_IWGRP;
+ }
+ }
+ if ((access_mask & ACE_EXECUTE) &&
+ (!(seen & S_IXGRP))) {
+ seen |= S_IXGRP;
+ if (type == ALLOW) {
+ mode |= S_IXGRP;
+ }
+ }
+ } else if (entry_type == ACE_EVERYONE) {
+ if ((access_mask & ACE_READ_DATA)) {
+ if (!(seen & S_IRUSR)) {
+ seen |= S_IRUSR;
+ if (type == ALLOW) {
+ mode |= S_IRUSR;
+ }
+ }
+ if (!(seen & S_IRGRP)) {
+ seen |= S_IRGRP;
+ if (type == ALLOW) {
+ mode |= S_IRGRP;
+ }
+ }
+ if (!(seen & S_IROTH)) {
+ seen |= S_IROTH;
+ if (type == ALLOW) {
+ mode |= S_IROTH;
+ }
+ }
+ }
+ if ((access_mask & ACE_WRITE_DATA)) {
+ if (!(seen & S_IWUSR)) {
+ seen |= S_IWUSR;
+ if (type == ALLOW) {
+ mode |= S_IWUSR;
+ }
+ }
+ if (!(seen & S_IWGRP)) {
+ seen |= S_IWGRP;
+ if (type == ALLOW) {
+ mode |= S_IWGRP;
+ }
+ }
+ if (!(seen & S_IWOTH)) {
+ seen |= S_IWOTH;
+ if (type == ALLOW) {
+ mode |= S_IWOTH;
+ }
+ }
+ }
+ if ((access_mask & ACE_EXECUTE)) {
+ if (!(seen & S_IXUSR)) {
+ seen |= S_IXUSR;
+ if (type == ALLOW) {
+ mode |= S_IXUSR;
+ }
+ }
+ if (!(seen & S_IXGRP)) {
+ seen |= S_IXGRP;
+ if (type == ALLOW) {
+ mode |= S_IXGRP;
+ }
+ }
+ if (!(seen & S_IXOTH)) {
+ seen |= S_IXOTH;
+ if (type == ALLOW) {
+ mode |= S_IXOTH;
+ }
+ }
+ }
+ } else {
+ /*
+ * Only care if this IDENTIFIER_GROUP or
+ * USER ACE denies execute access to someone,
+ * mode is not affected
+ */
+ if ((access_mask & ACE_EXECUTE) && type == DENY)
+ an_exec_denied = B_TRUE;
+ }
+ }
+
+ /*
+ * Failure to allow is effectively a deny, so execute permission
+ * is denied if it was never mentioned or if we explicitly
+ * weren't allowed it.
+ */
+ if (!an_exec_denied &&
+ ((seen & ALL_MODE_EXECS) != ALL_MODE_EXECS ||
+ (mode & ALL_MODE_EXECS) != ALL_MODE_EXECS))
+ an_exec_denied = B_TRUE;
+
+ if (an_exec_denied)
+ *pflags &= ~ZFS_NO_EXECS_DENIED;
+ else
+ *pflags |= ZFS_NO_EXECS_DENIED;
+
+ return (mode);
+}
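+
+/*
+ * Userland model of the "seen" bookkeeping above, for illustration:
+ * the first ACE that mentions a permission decides the corresponding
+ * mode bit, and later ACEs cannot overturn it. ALLOW sets the bit;
+ * DENY merely marks it decided.
+ */
+#if 0
+#include <assert.h>
+#include <stdbool.h>
+
+struct mini_ace {
+	bool allow;		/* true = ALLOW, false = DENY */
+	bool mentions_read;	/* mask includes ACE_READ_DATA */
+};
+
+static bool
+read_bit_model(const struct mini_ace *aces, int n)
+{
+	bool seen = false, granted = false;
+
+	for (int i = 0; i < n; i++) {
+		if (!aces[i].mentions_read || seen)
+			continue;
+		seen = true;
+		granted = aces[i].allow;
+	}
+	return (granted);
+}
+
+static void
+read_bit_example(void)
+{
+	/* A DENY followed by an ALLOW: the earlier DENY wins. */
+	struct mini_ace aces[2] = {
+		{ .allow = false, .mentions_read = true },
+		{ .allow = true, .mentions_read = true },
+	};
+	assert(read_bit_model(aces, 2) == false);
+}
+#endif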
+
+/*
+ * Read an external acl object. If the intent is to modify, always
+ * create a new acl and leave any cached acl in place.
+ */
+static int
+zfs_acl_node_read(znode_t *zp, zfs_acl_t **aclpp, boolean_t will_modify)
+{
+ zfs_acl_t *aclp;
+ int aclsize;
+ int acl_count;
+ zfs_acl_node_t *aclnode;
+ zfs_acl_phys_t znode_acl;
+ int version;
+ int error;
+
+ ASSERT(MUTEX_HELD(&zp->z_acl_lock));
+ ASSERT_VOP_LOCKED(ZTOV(zp), __func__);
+
+ if (zp->z_acl_cached && !will_modify) {
+ *aclpp = zp->z_acl_cached;
+ return (0);
+ }
+
+ version = zfs_znode_acl_version(zp);
+
+ if ((error = zfs_acl_znode_info(zp, &aclsize,
+ &acl_count, &znode_acl)) != 0) {
+ goto done;
+ }
+
+ aclp = zfs_acl_alloc(version);
+
+ aclp->z_acl_count = acl_count;
+ aclp->z_acl_bytes = aclsize;
+
+ aclnode = zfs_acl_node_alloc(aclsize);
+ aclnode->z_ace_count = aclp->z_acl_count;
+ aclnode->z_size = aclsize;
+
+ if (!zp->z_is_sa) {
+ if (znode_acl.z_acl_extern_obj) {
+ error = dmu_read(zp->z_zfsvfs->z_os,
+ znode_acl.z_acl_extern_obj, 0, aclnode->z_size,
+ aclnode->z_acldata, DMU_READ_PREFETCH);
+ } else {
+ bcopy(znode_acl.z_ace_data, aclnode->z_acldata,
+ aclnode->z_size);
+ }
+ } else {
+ error = sa_lookup(zp->z_sa_hdl, SA_ZPL_DACL_ACES(zp->z_zfsvfs),
+ aclnode->z_acldata, aclnode->z_size);
+ }
+
+ if (error != 0) {
+ zfs_acl_free(aclp);
+ zfs_acl_node_free(aclnode);
+ /* convert checksum errors into IO errors */
+ if (error == ECKSUM)
+ error = SET_ERROR(EIO);
+ goto done;
+ }
+
+ list_insert_head(&aclp->z_acl, aclnode);
+
+ *aclpp = aclp;
+ if (!will_modify)
+ zp->z_acl_cached = aclp;
+done:
+ return (error);
+}
+
+/*ARGSUSED*/
+void
+zfs_acl_data_locator(void **dataptr, uint32_t *length, uint32_t buflen,
+ boolean_t start, void *userdata)
+{
+ zfs_acl_locator_cb_t *cb = (zfs_acl_locator_cb_t *)userdata;
+
+ if (start) {
+ cb->cb_acl_node = list_head(&cb->cb_aclp->z_acl);
+ } else {
+ cb->cb_acl_node = list_next(&cb->cb_aclp->z_acl,
+ cb->cb_acl_node);
+ }
+ *dataptr = cb->cb_acl_node->z_acldata;
+ *length = cb->cb_acl_node->z_size;
+}
+
+int
+zfs_acl_chown_setattr(znode_t *zp)
+{
+ int error;
+ zfs_acl_t *aclp;
+
+ ASSERT_VOP_ELOCKED(ZTOV(zp), __func__);
+ ASSERT(MUTEX_HELD(&zp->z_acl_lock));
+ ASSERT_VOP_IN_SEQC(ZTOV(zp));
+
+ if ((error = zfs_acl_node_read(zp, &aclp, B_FALSE)) == 0)
+ zp->z_mode = zfs_mode_compute(zp->z_mode, aclp,
+ &zp->z_pflags, zp->z_uid, zp->z_gid);
+ return (error);
+}
+
+/*
+ * Common code for setting ACLs.
+ *
+ * This function is shared by the create, chmod, and setacl paths. By
+ * the time it is called, any ACL inheritance has already been
+ * resolved and the caller supplies an assigned transaction.
+ */
+int
+zfs_aclset_common(znode_t *zp, zfs_acl_t *aclp, cred_t *cr, dmu_tx_t *tx)
+{
+ int error;
+ zfsvfs_t *zfsvfs = zp->z_zfsvfs;
+ dmu_object_type_t otype;
+ zfs_acl_locator_cb_t locate = { 0 };
+ uint64_t mode;
+ sa_bulk_attr_t bulk[5];
+ uint64_t ctime[2];
+ int count = 0;
+ zfs_acl_phys_t acl_phys;
+
+ ASSERT_VOP_IN_SEQC(ZTOV(zp));
+
+ mode = zp->z_mode;
+
+ mode = zfs_mode_compute(mode, aclp, &zp->z_pflags,
+ zp->z_uid, zp->z_gid);
+
+ zp->z_mode = mode;
+ SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_MODE(zfsvfs), NULL,
+ &mode, sizeof (mode));
+ SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_FLAGS(zfsvfs), NULL,
+ &zp->z_pflags, sizeof (zp->z_pflags));
+ SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_CTIME(zfsvfs), NULL,
+ &ctime, sizeof (ctime));
+
+ if (zp->z_acl_cached) {
+ zfs_acl_free(zp->z_acl_cached);
+ zp->z_acl_cached = NULL;
+ }
+
+ /*
+ * Upgrade needed?
+ */
+ if (!zfsvfs->z_use_fuids) {
+ otype = DMU_OT_OLDACL;
+ } else {
+ if ((aclp->z_version == ZFS_ACL_VERSION_INITIAL) &&
+ (zfsvfs->z_version >= ZPL_VERSION_FUID))
+ zfs_acl_xform(zp, aclp, cr);
+ ASSERT(aclp->z_version >= ZFS_ACL_VERSION_FUID);
+ otype = DMU_OT_ACL;
+ }
+
+ /*
+ * Arrgh, we have to handle the old on-disk format
+ * as well as the newer (preferred) SA format.
+ */
+
+ if (zp->z_is_sa) { /* the easy case, just update the ACL attribute */
+ locate.cb_aclp = aclp;
+ SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_DACL_ACES(zfsvfs),
+ zfs_acl_data_locator, &locate, aclp->z_acl_bytes);
+ SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_DACL_COUNT(zfsvfs),
+ NULL, &aclp->z_acl_count, sizeof (uint64_t));
+ } else { /* Painful legacy way */
+ zfs_acl_node_t *aclnode;
+ uint64_t off = 0;
+ uint64_t aoid;
+
+ if ((error = sa_lookup(zp->z_sa_hdl, SA_ZPL_ZNODE_ACL(zfsvfs),
+ &acl_phys, sizeof (acl_phys))) != 0)
+ return (error);
+
+ aoid = acl_phys.z_acl_extern_obj;
+
+ if (aclp->z_acl_bytes > ZFS_ACE_SPACE) {
+ /*
+ * If ACL was previously external and we are now
+ * converting to new ACL format then release old
+ * ACL object and create a new one.
+ */
+ if (aoid &&
+ aclp->z_version != acl_phys.z_acl_version) {
+ error = dmu_object_free(zfsvfs->z_os, aoid, tx);
+ if (error)
+ return (error);
+ aoid = 0;
+ }
+ if (aoid == 0) {
+ aoid = dmu_object_alloc(zfsvfs->z_os,
+ otype, aclp->z_acl_bytes,
+ otype == DMU_OT_ACL ?
+ DMU_OT_SYSACL : DMU_OT_NONE,
+ otype == DMU_OT_ACL ?
+ DN_OLD_MAX_BONUSLEN : 0, tx);
+ } else {
+ (void) dmu_object_set_blocksize(zfsvfs->z_os,
+ aoid, aclp->z_acl_bytes, 0, tx);
+ }
+ acl_phys.z_acl_extern_obj = aoid;
+ for (aclnode = list_head(&aclp->z_acl); aclnode;
+ aclnode = list_next(&aclp->z_acl, aclnode)) {
+ if (aclnode->z_ace_count == 0)
+ continue;
+ dmu_write(zfsvfs->z_os, aoid, off,
+ aclnode->z_size, aclnode->z_acldata, tx);
+ off += aclnode->z_size;
+ }
+ } else {
+ void *start = acl_phys.z_ace_data;
+ /*
+ * Migrating back to an embedded ACL?
+ */
+ if (acl_phys.z_acl_extern_obj) {
+ error = dmu_object_free(zfsvfs->z_os,
+ acl_phys.z_acl_extern_obj, tx);
+ if (error)
+ return (error);
+ acl_phys.z_acl_extern_obj = 0;
+ }
+
+ for (aclnode = list_head(&aclp->z_acl); aclnode;
+ aclnode = list_next(&aclp->z_acl, aclnode)) {
+ if (aclnode->z_ace_count == 0)
+ continue;
+ bcopy(aclnode->z_acldata, start,
+ aclnode->z_size);
+ start = (caddr_t)start + aclnode->z_size;
+ }
+ }
+ /*
+ * If the old version is in use, swap count/bytes to match
+ * the old layout of znode_acl_phys_t.
+ */
+ if (aclp->z_version == ZFS_ACL_VERSION_INITIAL) {
+ acl_phys.z_acl_size = aclp->z_acl_count;
+ acl_phys.z_acl_count = aclp->z_acl_bytes;
+ } else {
+ acl_phys.z_acl_size = aclp->z_acl_bytes;
+ acl_phys.z_acl_count = aclp->z_acl_count;
+ }
+ acl_phys.z_acl_version = aclp->z_version;
+
+ SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_ZNODE_ACL(zfsvfs), NULL,
+ &acl_phys, sizeof (acl_phys));
+ }
+
+ /*
+ * Replace the ACL-wide bits, but first clear them.
+ */
+ zp->z_pflags &= ~ZFS_ACL_WIDE_FLAGS;
+
+ zp->z_pflags |= aclp->z_hints;
+
+ if (ace_trivial_common(aclp, 0, zfs_ace_walk) == 0)
+ zp->z_pflags |= ZFS_ACL_TRIVIAL;
+
+ zfs_tstamp_update_setup(zp, STATE_CHANGED, NULL, ctime, B_TRUE);
+ return (sa_bulk_update(zp->z_sa_hdl, bulk, count, tx));
+}
+
+static void
+zfs_acl_chmod(vtype_t vtype, uint64_t mode, boolean_t split, boolean_t trim,
+ zfs_acl_t *aclp)
+{
+ void *acep = NULL;
+ uint64_t who;
+ int new_count, new_bytes;
+ int ace_size;
+ int entry_type;
+ uint16_t iflags, type;
+ uint32_t access_mask;
+ zfs_acl_node_t *newnode;
+ size_t abstract_size = aclp->z_ops.ace_abstract_size();
+ void *zacep;
+ boolean_t isdir;
+ trivial_acl_t masks;
+
+ new_count = new_bytes = 0;
+
+ isdir = (vtype == VDIR);
+
+ acl_trivial_access_masks((mode_t)mode, isdir, &masks);
+
+ newnode = zfs_acl_node_alloc((abstract_size * 6) + aclp->z_acl_bytes);
+
+ zacep = newnode->z_acldata;
+ if (masks.allow0) {
+ zfs_set_ace(aclp, zacep, masks.allow0, ALLOW, -1, ACE_OWNER);
+ zacep = (void *)((uintptr_t)zacep + abstract_size);
+ new_count++;
+ new_bytes += abstract_size;
+ }
+ if (masks.deny1) {
+ zfs_set_ace(aclp, zacep, masks.deny1, DENY, -1, ACE_OWNER);
+ zacep = (void *)((uintptr_t)zacep + abstract_size);
+ new_count++;
+ new_bytes += abstract_size;
+ }
+ if (masks.deny2) {
+ zfs_set_ace(aclp, zacep, masks.deny2, DENY, -1, OWNING_GROUP);
+ zacep = (void *)((uintptr_t)zacep + abstract_size);
+ new_count++;
+ new_bytes += abstract_size;
+ }
+
+ while (acep = zfs_acl_next_ace(aclp, acep, &who, &access_mask,
+ &iflags, &type)) {
+ entry_type = (iflags & ACE_TYPE_FLAGS);
+ /*
+ * ACEs used to represent the file mode may be divided
+ * into an equivalent pair of inherit-only and regular
+ * ACEs, if they are inheritable.
+ * Skip regular ACEs, which are replaced by the new mode.
+ */
+ if (split && (entry_type == ACE_OWNER ||
+ entry_type == OWNING_GROUP ||
+ entry_type == ACE_EVERYONE)) {
+ if (!isdir || !(iflags &
+ (ACE_FILE_INHERIT_ACE|ACE_DIRECTORY_INHERIT_ACE)))
+ continue;
+ /*
+ * We preserve owner@, group@, or everyone@
+ * permissions, if they are inheritable, by
+ * copying them to inherit_only ACEs. This
+ * prevents inheritable permissions from being
+ * altered along with the file mode.
+ */
+ iflags |= ACE_INHERIT_ONLY_ACE;
+ }
+
+ /*
+ * If this ACL has any inheritable ACEs, mark that in
+ * the hints (which are later masked into the pflags)
+ * so create knows to do inheritance.
+ */
+ if (isdir && (iflags &
+ (ACE_FILE_INHERIT_ACE|ACE_DIRECTORY_INHERIT_ACE)))
+ aclp->z_hints |= ZFS_INHERIT_ACE;
+
+ if ((type != ALLOW && type != DENY) ||
+ (iflags & ACE_INHERIT_ONLY_ACE)) {
+ switch (type) {
+ case ACE_ACCESS_ALLOWED_OBJECT_ACE_TYPE:
+ case ACE_ACCESS_DENIED_OBJECT_ACE_TYPE:
+ case ACE_SYSTEM_AUDIT_OBJECT_ACE_TYPE:
+ case ACE_SYSTEM_ALARM_OBJECT_ACE_TYPE:
+ aclp->z_hints |= ZFS_ACL_OBJ_ACE;
+ break;
+ }
+ } else {
+ /*
+ * Limit permissions granted by ACEs to be no greater
+ * than permissions of the requested group mode.
+ * Applies when the "aclmode" property is set to
+ * "groupmask".
+ */
+ if ((type == ALLOW) && trim)
+ access_mask &= masks.group;
+ }
+ zfs_set_ace(aclp, zacep, access_mask, type, who, iflags);
+ ace_size = aclp->z_ops.ace_size(acep);
+ zacep = (void *)((uintptr_t)zacep + ace_size);
+ new_count++;
+ new_bytes += ace_size;
+ }
+ zfs_set_ace(aclp, zacep, masks.owner, ALLOW, -1, ACE_OWNER);
+ zacep = (void *)((uintptr_t)zacep + abstract_size);
+ zfs_set_ace(aclp, zacep, masks.group, ALLOW, -1, OWNING_GROUP);
+ zacep = (void *)((uintptr_t)zacep + abstract_size);
+ zfs_set_ace(aclp, zacep, masks.everyone, ALLOW, -1, ACE_EVERYONE);
+
+ new_count += 3;
+ new_bytes += abstract_size * 3;
+ zfs_acl_release_nodes(aclp);
+ aclp->z_acl_count = new_count;
+ aclp->z_acl_bytes = new_bytes;
+ newnode->z_ace_count = new_count;
+ newnode->z_size = new_bytes;
+ list_insert_tail(&aclp->z_acl, newnode);
+}
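+
+/*
+ * Sketch of the entry layout produced above, for illustration: up to
+ * three leading abstract entries (emitted only when their masks are
+ * non-zero), then any preserved entries from the old ACL, then the
+ * three trailing owner@/group@/everyone@ allows, which are always
+ * emitted. The masks structure below is a stand-in for trivial_acl_t.
+ */
+#if 0
+#include <assert.h>
+#include <stdint.h>
+
+struct trivial_masks {
+	uint32_t allow0, deny1, deny2;		/* leading, may be zero */
+	uint32_t owner, group, everyone;	/* trailing, always present */
+};
+
+static int
+chmod_entry_count(const struct trivial_masks *m, int preserved)
+{
+	int n = preserved + 3;	/* trailing owner@, group@, everyone@ */
+
+	if (m->allow0 != 0)
+		n++;
+	if (m->deny1 != 0)
+		n++;
+	if (m->deny2 != 0)
+		n++;
+	return (n);
+}
+
+static void
+chmod_entry_count_example(void)
+{
+	/* Only deny1 applies, nothing preserved: 4 entries total. */
+	struct trivial_masks m = { .deny1 = 1, .owner = 1, .group = 1,
+	    .everyone = 1 };
+	assert(chmod_entry_count(&m, 0) == 4);
+}
+#endif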
+
+int
+zfs_acl_chmod_setattr(znode_t *zp, zfs_acl_t **aclp, uint64_t mode)
+{
+ int error = 0;
+
+ mutex_enter(&zp->z_acl_lock);
+ ASSERT_VOP_ELOCKED(ZTOV(zp), __func__);
+ if (zp->z_zfsvfs->z_acl_mode == ZFS_ACL_DISCARD)
+ *aclp = zfs_acl_alloc(zfs_acl_version_zp(zp));
+ else
+ error = zfs_acl_node_read(zp, aclp, B_TRUE);
+
+ if (error == 0) {
+ (*aclp)->z_hints = zp->z_pflags & V4_ACL_WIDE_FLAGS;
+ zfs_acl_chmod(ZTOV(zp)->v_type, mode, B_TRUE,
+ (zp->z_zfsvfs->z_acl_mode == ZFS_ACL_GROUPMASK), *aclp);
+ }
+ mutex_exit(&zp->z_acl_lock);
+
+ return (error);
+}
+
+/*
+ * Should ACE be inherited?
+ */
+static int
+zfs_ace_can_use(vtype_t vtype, uint16_t acep_flags)
+{
+ int iflags = (acep_flags & 0xf);
+
+ if ((vtype == VDIR) && (iflags & ACE_DIRECTORY_INHERIT_ACE))
+ return (1);
+ else if (iflags & ACE_FILE_INHERIT_ACE)
+ return (!((vtype == VDIR) &&
+ (iflags & ACE_NO_PROPAGATE_INHERIT_ACE)));
+ return (0);
+}
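+
+/*
+ * The decision above, spelled out as a userland predicate for
+ * illustration: a directory takes an ACE with DIRECTORY_INHERIT; any
+ * vnode takes one with FILE_INHERIT, except that a directory refuses
+ * a FILE_INHERIT ACE marked NO_PROPAGATE, since it could not pass it
+ * on to any file.
+ */
+#if 0
+#include <assert.h>
+#include <stdbool.h>
+
+static bool
+ace_can_use_model(bool isdir, bool dir_inherit, bool file_inherit,
+    bool no_propagate)
+{
+	if (isdir && dir_inherit)
+		return (true);
+	if (file_inherit)
+		return (!(isdir && no_propagate));
+	return (false);
+}
+
+static void
+ace_can_use_example(void)
+{
+	/* Directory, FILE_INHERIT only, NO_PROPAGATE: not inherited. */
+	assert(!ace_can_use_model(true, false, true, true));
+	/* Plain file, FILE_INHERIT: inherited. */
+	assert(ace_can_use_model(false, false, true, false));
+}
+#endif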
+
+/*
+ * inherit inheritable ACEs from parent
+ */
+static zfs_acl_t *
+zfs_acl_inherit(zfsvfs_t *zfsvfs, vtype_t vtype, zfs_acl_t *paclp,
+ uint64_t mode, boolean_t *need_chmod)
+{
+ void *pacep = NULL;
+ void *acep;
+ zfs_acl_node_t *aclnode;
+ zfs_acl_t *aclp = NULL;
+ uint64_t who;
+ uint32_t access_mask;
+ uint16_t iflags, newflags, type;
+ size_t ace_size;
+ void *data1, *data2;
+ size_t data1sz, data2sz;
+ uint_t aclinherit;
+ boolean_t isdir = (vtype == VDIR);
+ boolean_t isreg = (vtype == VREG);
+
+ *need_chmod = B_TRUE;
+
+ aclp = zfs_acl_alloc(paclp->z_version);
+ aclinherit = zfsvfs->z_acl_inherit;
+ if (aclinherit == ZFS_ACL_DISCARD || vtype == VLNK)
+ return (aclp);
+
+ while (pacep = zfs_acl_next_ace(paclp, pacep, &who,
+ &access_mask, &iflags, &type)) {
+
+ /*
+ * don't inherit bogus ACEs
+ */
+ if (!zfs_acl_valid_ace_type(type, iflags))
+ continue;
+
+ /*
+ * Check if ACE is inheritable by this vnode
+ */
+ if ((aclinherit == ZFS_ACL_NOALLOW && type == ALLOW) ||
+ !zfs_ace_can_use(vtype, iflags))
+ continue;
+
+ /*
+ * If an owner@, group@, or everyone@ ACE is inheritable,
+ * then zfs_acl_chmod() isn't needed.
+ */
+ if ((aclinherit == ZFS_ACL_PASSTHROUGH ||
+ aclinherit == ZFS_ACL_PASSTHROUGH_X) &&
+ ((iflags & (ACE_OWNER|ACE_EVERYONE)) ||
+ ((iflags & OWNING_GROUP) == OWNING_GROUP)) &&
+ (isreg || (isdir && (iflags & ACE_DIRECTORY_INHERIT_ACE))))
+ *need_chmod = B_FALSE;
+
+ /*
+ * Strip inherited execute permission from file if
+ * not in mode
+ */
+ if (aclinherit == ZFS_ACL_PASSTHROUGH_X && type == ALLOW &&
+ !isdir && ((mode & (S_IXUSR|S_IXGRP|S_IXOTH)) == 0)) {
+ access_mask &= ~ACE_EXECUTE;
+ }
+
+ /*
+ * Strip write_acl and write_owner from permissions
+ * when inheriting an ACE
+ */
+ if (aclinherit == ZFS_ACL_RESTRICTED && type == ALLOW) {
+ access_mask &= ~RESTRICTED_CLEAR;
+ }
+
+ ace_size = aclp->z_ops.ace_size(pacep);
+ aclnode = zfs_acl_node_alloc(ace_size);
+ list_insert_tail(&aclp->z_acl, aclnode);
+ acep = aclnode->z_acldata;
+
+ zfs_set_ace(aclp, acep, access_mask, type,
+ who, iflags|ACE_INHERITED_ACE);
+
+ /*
+ * Copy special opaque data if any
+ */
+ if ((data1sz = paclp->z_ops.ace_data(pacep, &data1)) != 0) {
+ VERIFY((data2sz = aclp->z_ops.ace_data(acep,
+ &data2)) == data1sz);
+ bcopy(data1, data2, data2sz);
+ }
+
+ aclp->z_acl_count++;
+ aclnode->z_ace_count++;
+ aclp->z_acl_bytes += aclnode->z_size;
+ newflags = aclp->z_ops.ace_flags_get(acep);
+
+ /*
+ * If ACE is not to be inherited further, or if the vnode is
+ * not a directory, remove all inheritance flags
+ */
+ if (!isdir || (iflags & ACE_NO_PROPAGATE_INHERIT_ACE)) {
+ newflags &= ~ALL_INHERIT;
+ aclp->z_ops.ace_flags_set(acep,
+ newflags|ACE_INHERITED_ACE);
+ continue;
+ }
+
+ /*
+ * This directory has an inheritable ACE
+ */
+ aclp->z_hints |= ZFS_INHERIT_ACE;
+
+ /*
+ * If only FILE_INHERIT is set then turn on
+ * inherit_only
+ */
+ if ((iflags & (ACE_FILE_INHERIT_ACE |
+ ACE_DIRECTORY_INHERIT_ACE)) == ACE_FILE_INHERIT_ACE) {
+ newflags |= ACE_INHERIT_ONLY_ACE;
+ aclp->z_ops.ace_flags_set(acep,
+ newflags|ACE_INHERITED_ACE);
+ } else {
+ newflags &= ~ACE_INHERIT_ONLY_ACE;
+ aclp->z_ops.ace_flags_set(acep,
+ newflags|ACE_INHERITED_ACE);
+ }
+ }
+
+ return (aclp);
+}
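+
+/*
+ * Sketch of the inherit-flag rewriting above, for illustration; the
+ * ACE_*_INHERIT_ACE, ALL_INHERIT, and ACE_INHERITED_ACE values are
+ * assumed to come from sys/acl.h and zfs_acl.h. Non-directories (and
+ * NO_PROPAGATE ACEs) lose all inheritance flags; a directory that
+ * inherits a FILE_INHERIT-only ACE marks it inherit-only so it
+ * applies to future children but not to the directory itself.
+ */
+#if 0
+#include <stdbool.h>
+#include <stdint.h>
+
+static uint16_t
+inherit_flags_model(bool isdir, uint16_t flags)
+{
+	if (!isdir || (flags & ACE_NO_PROPAGATE_INHERIT_ACE)) {
+		flags &= ~ALL_INHERIT;
+	} else if ((flags & (ACE_FILE_INHERIT_ACE |
+	    ACE_DIRECTORY_INHERIT_ACE)) == ACE_FILE_INHERIT_ACE) {
+		flags |= ACE_INHERIT_ONLY_ACE;
+	} else {
+		flags &= ~ACE_INHERIT_ONLY_ACE;
+	}
+	return (flags | ACE_INHERITED_ACE);
+}
+#endif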
+
+/*
+ * Create file system object initial permissions
+ * including inheritable ACEs.
+ * Also, create FUIDs for owner and group.
+ */
+int
+zfs_acl_ids_create(znode_t *dzp, int flag, vattr_t *vap, cred_t *cr,
+ vsecattr_t *vsecp, zfs_acl_ids_t *acl_ids)
+{
+ int error;
+ zfsvfs_t *zfsvfs = dzp->z_zfsvfs;
+ zfs_acl_t *paclp;
+ gid_t gid;
+ boolean_t need_chmod = B_TRUE;
+ boolean_t trim = B_FALSE;
+ boolean_t inherited = B_FALSE;
+
+ if ((flag & IS_ROOT_NODE) == 0)
+ ASSERT_VOP_ELOCKED(ZTOV(dzp), __func__);
+ else
+ ASSERT(dzp->z_vnode == NULL);
+ bzero(acl_ids, sizeof (zfs_acl_ids_t));
+ acl_ids->z_mode = MAKEIMODE(vap->va_type, vap->va_mode);
+
+ if (vsecp)
+ if ((error = zfs_vsec_2_aclp(zfsvfs, vap->va_type, vsecp, cr,
+ &acl_ids->z_fuidp, &acl_ids->z_aclp)) != 0)
+ return (error);
+ /*
+ * Determine uid and gid.
+ */
+ if ((flag & IS_ROOT_NODE) || zfsvfs->z_replay ||
+ ((flag & IS_XATTR) && (vap->va_type == VDIR))) {
+ acl_ids->z_fuid = zfs_fuid_create(zfsvfs,
+ (uint64_t)vap->va_uid, cr,
+ ZFS_OWNER, &acl_ids->z_fuidp);
+ acl_ids->z_fgid = zfs_fuid_create(zfsvfs,
+ (uint64_t)vap->va_gid, cr,
+ ZFS_GROUP, &acl_ids->z_fuidp);
+ gid = vap->va_gid;
+ } else {
+ acl_ids->z_fuid = zfs_fuid_create_cred(zfsvfs, ZFS_OWNER,
+ cr, &acl_ids->z_fuidp);
+ acl_ids->z_fgid = 0;
+ if (vap->va_mask & AT_GID) {
+ acl_ids->z_fgid = zfs_fuid_create(zfsvfs,
+ (uint64_t)vap->va_gid,
+ cr, ZFS_GROUP, &acl_ids->z_fuidp);
+ gid = vap->va_gid;
+ if (acl_ids->z_fgid != dzp->z_gid &&
+ !groupmember(vap->va_gid, cr) &&
+ secpolicy_vnode_create_gid(cr) != 0)
+ acl_ids->z_fgid = 0;
+ }
+ if (acl_ids->z_fgid == 0) {
+#ifndef __FreeBSD_kernel__
+ if (dzp->z_mode & S_ISGID) {
+#endif
+ char *domain;
+ uint32_t rid;
+
+ acl_ids->z_fgid = dzp->z_gid;
+ gid = zfs_fuid_map_id(zfsvfs, acl_ids->z_fgid,
+ cr, ZFS_GROUP);
+
+ if (zfsvfs->z_use_fuids &&
+ IS_EPHEMERAL(acl_ids->z_fgid)) {
+ domain = zfs_fuid_idx_domain(
+ &zfsvfs->z_fuid_idx,
+ FUID_INDEX(acl_ids->z_fgid));
+ rid = FUID_RID(acl_ids->z_fgid);
+ zfs_fuid_node_add(&acl_ids->z_fuidp,
+ domain, rid,
+ FUID_INDEX(acl_ids->z_fgid),
+ acl_ids->z_fgid, ZFS_GROUP);
+ }
+#ifndef __FreeBSD_kernel__
+ } else {
+ acl_ids->z_fgid = zfs_fuid_create_cred(zfsvfs,
+ ZFS_GROUP, cr, &acl_ids->z_fuidp);
+ gid = crgetgid(cr);
+ }
+#endif
+ }
+ }
+
+ /*
+ * If we're creating a directory, and the parent directory has the
+ * set-GID bit set, set it on the new directory.
+ * Otherwise, if the user is neither privileged nor a member of the
+ * file's new group, clear the file's set-GID bit.
+ */
+
+ if (!(flag & IS_ROOT_NODE) && (dzp->z_mode & S_ISGID) &&
+ (vap->va_type == VDIR)) {
+ acl_ids->z_mode |= S_ISGID;
+ } else {
+ if ((acl_ids->z_mode & S_ISGID) &&
+ secpolicy_vnode_setids_setgids(ZTOV(dzp), cr, gid) != 0)
+ acl_ids->z_mode &= ~S_ISGID;
+ }
+
+ if (acl_ids->z_aclp == NULL) {
+ mutex_enter(&dzp->z_acl_lock);
+ if (!(flag & IS_ROOT_NODE) &&
+ (dzp->z_pflags & ZFS_INHERIT_ACE) &&
+ !(dzp->z_pflags & ZFS_XATTR)) {
+ VERIFY(0 == zfs_acl_node_read(dzp, &paclp, B_FALSE));
+ acl_ids->z_aclp = zfs_acl_inherit(zfsvfs,
+ vap->va_type, paclp, acl_ids->z_mode, &need_chmod);
+ inherited = B_TRUE;
+ } else {
+ acl_ids->z_aclp =
+ zfs_acl_alloc(zfs_acl_version_zp(dzp));
+ acl_ids->z_aclp->z_hints |= ZFS_ACL_TRIVIAL;
+ }
+ mutex_exit(&dzp->z_acl_lock);
+
+ if (need_chmod) {
+ if (vap->va_type == VDIR)
+ acl_ids->z_aclp->z_hints |=
+ ZFS_ACL_AUTO_INHERIT;
+
+ if (zfsvfs->z_acl_mode == ZFS_ACL_GROUPMASK &&
+ zfsvfs->z_acl_inherit != ZFS_ACL_PASSTHROUGH &&
+ zfsvfs->z_acl_inherit != ZFS_ACL_PASSTHROUGH_X)
+ trim = B_TRUE;
+ zfs_acl_chmod(vap->va_type, acl_ids->z_mode, B_FALSE,
+ trim, acl_ids->z_aclp);
+ }
+ }
+
+ if (inherited || vsecp) {
+ acl_ids->z_mode = zfs_mode_compute(acl_ids->z_mode,
+ acl_ids->z_aclp, &acl_ids->z_aclp->z_hints,
+ acl_ids->z_fuid, acl_ids->z_fgid);
+ if (ace_trivial_common(acl_ids->z_aclp, 0, zfs_ace_walk) == 0)
+ acl_ids->z_aclp->z_hints |= ZFS_ACL_TRIVIAL;
+ }
+
+ return (0);
+}
+
+/*
+ * Free ACL and fuid_infop, but not the acl_ids structure
+ */
+void
+zfs_acl_ids_free(zfs_acl_ids_t *acl_ids)
+{
+ if (acl_ids->z_aclp)
+ zfs_acl_free(acl_ids->z_aclp);
+ if (acl_ids->z_fuidp)
+ zfs_fuid_info_free(acl_ids->z_fuidp);
+ acl_ids->z_aclp = NULL;
+ acl_ids->z_fuidp = NULL;
+}
+
+boolean_t
+zfs_acl_ids_overquota(zfsvfs_t *zfsvfs, zfs_acl_ids_t *acl_ids)
+{
+ return (zfs_fuid_overquota(zfsvfs, B_FALSE, acl_ids->z_fuid) ||
+ zfs_fuid_overquota(zfsvfs, B_TRUE, acl_ids->z_fgid));
+}
+
+/*
+ * Retrieve a file's ACL
+ */
+int
+zfs_getacl(znode_t *zp, vsecattr_t *vsecp, boolean_t skipaclchk, cred_t *cr)
+{
+ zfs_acl_t *aclp;
+ ulong_t mask;
+ int error;
+ int count = 0;
+ int largeace = 0;
+
+ mask = vsecp->vsa_mask & (VSA_ACE | VSA_ACECNT |
+ VSA_ACE_ACLFLAGS | VSA_ACE_ALLTYPES);
+
+ if (mask == 0)
+ return (SET_ERROR(ENOSYS));
+
+ if (error = zfs_zaccess(zp, ACE_READ_ACL, 0, skipaclchk, cr))
+ return (error);
+
+ mutex_enter(&zp->z_acl_lock);
+
+ ASSERT_VOP_LOCKED(ZTOV(zp), __func__);
+ error = zfs_acl_node_read(zp, &aclp, B_FALSE);
+ if (error != 0) {
+ mutex_exit(&zp->z_acl_lock);
+ return (error);
+ }
+
+ /*
+ * Scan ACL to determine number of ACEs
+ */
+ if ((zp->z_pflags & ZFS_ACL_OBJ_ACE) && !(mask & VSA_ACE_ALLTYPES)) {
+ void *zacep = NULL;
+ uint64_t who;
+ uint32_t access_mask;
+ uint16_t type, iflags;
+
+ while (zacep = zfs_acl_next_ace(aclp, zacep,
+ &who, &access_mask, &iflags, &type)) {
+ switch (type) {
+ case ACE_ACCESS_ALLOWED_OBJECT_ACE_TYPE:
+ case ACE_ACCESS_DENIED_OBJECT_ACE_TYPE:
+ case ACE_SYSTEM_AUDIT_OBJECT_ACE_TYPE:
+ case ACE_SYSTEM_ALARM_OBJECT_ACE_TYPE:
+ largeace++;
+ continue;
+ default:
+ count++;
+ }
+ }
+ vsecp->vsa_aclcnt = count;
+ } else
+ count = (int)aclp->z_acl_count;
+
+ if (mask & VSA_ACECNT) {
+ vsecp->vsa_aclcnt = count;
+ }
+
+ if (mask & VSA_ACE) {
+ size_t aclsz;
+
+ aclsz = count * sizeof (ace_t) +
+ sizeof (ace_object_t) * largeace;
+
+ vsecp->vsa_aclentp = kmem_alloc(aclsz, KM_SLEEP);
+ vsecp->vsa_aclentsz = aclsz;
+
+ if (aclp->z_version == ZFS_ACL_VERSION_FUID)
+ zfs_copy_fuid_2_ace(zp->z_zfsvfs, aclp, cr,
+ vsecp->vsa_aclentp, !(mask & VSA_ACE_ALLTYPES));
+ else {
+ zfs_acl_node_t *aclnode;
+ void *start = vsecp->vsa_aclentp;
+
+ for (aclnode = list_head(&aclp->z_acl); aclnode;
+ aclnode = list_next(&aclp->z_acl, aclnode)) {
+ bcopy(aclnode->z_acldata, start,
+ aclnode->z_size);
+ start = (caddr_t)start + aclnode->z_size;
+ }
+ ASSERT((caddr_t)start - (caddr_t)vsecp->vsa_aclentp ==
+ aclp->z_acl_bytes);
+ }
+ }
+ if (mask & VSA_ACE_ACLFLAGS) {
+ vsecp->vsa_aclflags = 0;
+ if (zp->z_pflags & ZFS_ACL_DEFAULTED)
+ vsecp->vsa_aclflags |= ACL_DEFAULTED;
+ if (zp->z_pflags & ZFS_ACL_PROTECTED)
+ vsecp->vsa_aclflags |= ACL_PROTECTED;
+ if (zp->z_pflags & ZFS_ACL_AUTO_INHERIT)
+ vsecp->vsa_aclflags |= ACL_AUTO_INHERIT;
+ }
+
+ mutex_exit(&zp->z_acl_lock);
+
+ return (0);
+}
+
+int
+zfs_vsec_2_aclp(zfsvfs_t *zfsvfs, vtype_t obj_type,
+ vsecattr_t *vsecp, cred_t *cr, zfs_fuid_info_t **fuidp, zfs_acl_t **zaclp)
+{
+ zfs_acl_t *aclp;
+ zfs_acl_node_t *aclnode;
+ int aclcnt = vsecp->vsa_aclcnt;
+ int error;
+
+ if (vsecp->vsa_aclcnt > MAX_ACL_ENTRIES || vsecp->vsa_aclcnt <= 0)
+ return (SET_ERROR(EINVAL));
+
+ aclp = zfs_acl_alloc(zfs_acl_version(zfsvfs->z_version));
+
+ aclp->z_hints = 0;
+ aclnode = zfs_acl_node_alloc(aclcnt * sizeof (zfs_object_ace_t));
+ if (aclp->z_version == ZFS_ACL_VERSION_INITIAL) {
+ if ((error = zfs_copy_ace_2_oldace(obj_type, aclp,
+ (ace_t *)vsecp->vsa_aclentp, aclnode->z_acldata,
+ aclcnt, &aclnode->z_size)) != 0) {
+ zfs_acl_free(aclp);
+ zfs_acl_node_free(aclnode);
+ return (error);
+ }
+ } else {
+ if ((error = zfs_copy_ace_2_fuid(zfsvfs, obj_type, aclp,
+ vsecp->vsa_aclentp, aclnode->z_acldata, aclcnt,
+ &aclnode->z_size, fuidp, cr)) != 0) {
+ zfs_acl_free(aclp);
+ zfs_acl_node_free(aclnode);
+ return (error);
+ }
+ }
+ aclp->z_acl_bytes = aclnode->z_size;
+ aclnode->z_ace_count = aclcnt;
+ aclp->z_acl_count = aclcnt;
+ list_insert_head(&aclp->z_acl, aclnode);
+
+ /*
+ * If flags are being set then add them to z_hints
+ */
+ if (vsecp->vsa_mask & VSA_ACE_ACLFLAGS) {
+ if (vsecp->vsa_aclflags & ACL_PROTECTED)
+ aclp->z_hints |= ZFS_ACL_PROTECTED;
+ if (vsecp->vsa_aclflags & ACL_DEFAULTED)
+ aclp->z_hints |= ZFS_ACL_DEFAULTED;
+ if (vsecp->vsa_aclflags & ACL_AUTO_INHERIT)
+ aclp->z_hints |= ZFS_ACL_AUTO_INHERIT;
+ }
+
+ *zaclp = aclp;
+
+ return (0);
+}
+
+/*
+ * Set a file's ACL
+ */
+int
+zfs_setacl(znode_t *zp, vsecattr_t *vsecp, boolean_t skipaclchk, cred_t *cr)
+{
+ zfsvfs_t *zfsvfs = zp->z_zfsvfs;
+ zilog_t *zilog = zfsvfs->z_log;
+ ulong_t mask = vsecp->vsa_mask & (VSA_ACE | VSA_ACECNT);
+ dmu_tx_t *tx;
+ int error;
+ zfs_acl_t *aclp;
+ zfs_fuid_info_t *fuidp = NULL;
+ boolean_t fuid_dirtied;
+ uint64_t acl_obj;
+
+ ASSERT_VOP_ELOCKED(ZTOV(zp), __func__);
+ if (mask == 0)
+ return (SET_ERROR(ENOSYS));
+
+ if (zp->z_pflags & ZFS_IMMUTABLE)
+ return (SET_ERROR(EPERM));
+
+ if (error = zfs_zaccess(zp, ACE_WRITE_ACL, 0, skipaclchk, cr))
+ return (error);
+
+ error = zfs_vsec_2_aclp(zfsvfs, ZTOV(zp)->v_type, vsecp, cr, &fuidp,
+ &aclp);
+ if (error)
+ return (error);
+
+ /*
+ * If ACL-wide flags aren't being set, then preserve any
+ * existing flags.
+ */
+ if (!(vsecp->vsa_mask & VSA_ACE_ACLFLAGS)) {
+ aclp->z_hints |=
+ (zp->z_pflags & V4_ACL_WIDE_FLAGS);
+ }
+top:
+ mutex_enter(&zp->z_acl_lock);
+
+ tx = dmu_tx_create(zfsvfs->z_os);
+
+ dmu_tx_hold_sa(tx, zp->z_sa_hdl, B_TRUE);
+
+ fuid_dirtied = zfsvfs->z_fuid_dirty;
+ if (fuid_dirtied)
+ zfs_fuid_txhold(zfsvfs, tx);
+
+ /*
+ * If this is an old-version ACL that won't fit in the bonus buffer
+ * and we aren't upgrading it, take out the necessary DMU holds
+ */
+
+ if ((acl_obj = zfs_external_acl(zp)) != 0) {
+ if (zfsvfs->z_version >= ZPL_VERSION_FUID &&
+ zfs_znode_acl_version(zp) <= ZFS_ACL_VERSION_INITIAL) {
+ dmu_tx_hold_free(tx, acl_obj, 0,
+ DMU_OBJECT_END);
+ dmu_tx_hold_write(tx, DMU_NEW_OBJECT, 0,
+ aclp->z_acl_bytes);
+ } else {
+ dmu_tx_hold_write(tx, acl_obj, 0, aclp->z_acl_bytes);
+ }
+ } else if (!zp->z_is_sa && aclp->z_acl_bytes > ZFS_ACE_SPACE) {
+ dmu_tx_hold_write(tx, DMU_NEW_OBJECT, 0, aclp->z_acl_bytes);
+ }
+
+ zfs_sa_upgrade_txholds(tx, zp);
+ error = dmu_tx_assign(tx, TXG_NOWAIT);
+ if (error) {
+ mutex_exit(&zp->z_acl_lock);
+
+ if (error == ERESTART) {
+ dmu_tx_wait(tx);
+ dmu_tx_abort(tx);
+ goto top;
+ }
+ dmu_tx_abort(tx);
+ zfs_acl_free(aclp);
+ return (error);
+ }
+
+ error = zfs_aclset_common(zp, aclp, cr, tx);
+ ASSERT(error == 0);
+ ASSERT(zp->z_acl_cached == NULL);
+ zp->z_acl_cached = aclp;
+
+ if (fuid_dirtied)
+ zfs_fuid_sync(zfsvfs, tx);
+
+ zfs_log_acl(zilog, tx, zp, vsecp, fuidp);
+
+ if (fuidp)
+ zfs_fuid_info_free(fuidp);
+ dmu_tx_commit(tx);
+ mutex_exit(&zp->z_acl_lock);
+
+ return (error);
+}
+
+/*
+ * Check accesses of interest (AoI) against attributes of the dataset
+ * such as read-only. Returns zero if no AoI conflict with dataset
+ * attributes, otherwise an appropriate errno is returned.
+ */
+static int
+zfs_zaccess_dataset_check(znode_t *zp, uint32_t v4_mode)
+{
+ if ((v4_mode & WRITE_MASK) &&
+ (zp->z_zfsvfs->z_vfs->vfs_flag & VFS_RDONLY) &&
+ (!IS_DEVVP(ZTOV(zp)) ||
+ (IS_DEVVP(ZTOV(zp)) && (v4_mode & WRITE_MASK_ATTRS)))) {
+ return (SET_ERROR(EROFS));
+ }
+
+ /*
+ * Intentionally allow ZFS_READONLY through here.
+ * See zfs_zaccess_common().
+ */
+ if ((v4_mode & WRITE_MASK_DATA) &&
+ (zp->z_pflags & ZFS_IMMUTABLE)) {
+ return (SET_ERROR(EPERM));
+ }
+
+#ifdef illumos
+ if ((v4_mode & (ACE_DELETE | ACE_DELETE_CHILD)) &&
+ (zp->z_pflags & ZFS_NOUNLINK)) {
+ return (SET_ERROR(EPERM));
+ }
+#else
+ /*
+ * On FreeBSD we allow a directory's contents to be modified even
+ * if ZFS_NOUNLINK (sunlnk) is set. We just don't allow directory
+ * removal, which is
+ * handled in zfs_zaccess_delete().
+ */
+ if ((v4_mode & ACE_DELETE) &&
+ (zp->z_pflags & ZFS_NOUNLINK)) {
+		return (SET_ERROR(EPERM));
+ }
+#endif
+
+ if (((v4_mode & (ACE_READ_DATA|ACE_EXECUTE)) &&
+ (zp->z_pflags & ZFS_AV_QUARANTINED))) {
+ return (SET_ERROR(EACCES));
+ }
+
+ return (0);
+}
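+
+/*
+ * A compact restatement of the checks above, for illustration. The
+ * devvp special case is omitted and the FreeBSD ZFS_NOUNLINK
+ * behavior is used; the WRITE_MASK*, ACE_*, and ZFS_* values are
+ * assumed to come from the zfs_acl.h/zfs_znode.h headers.
+ */
+#if 0
+#include <errno.h>
+#include <stdbool.h>
+#include <stdint.h>
+
+static int
+dataset_check_model(bool rdonly_fs, uint64_t pflags, uint32_t v4_mode)
+{
+	if (rdonly_fs && (v4_mode & WRITE_MASK))
+		return (EROFS);
+	if ((pflags & ZFS_IMMUTABLE) && (v4_mode & WRITE_MASK_DATA))
+		return (EPERM);
+	if ((pflags & ZFS_NOUNLINK) && (v4_mode & ACE_DELETE))
+		return (EPERM);
+	if ((pflags & ZFS_AV_QUARANTINED) &&
+	    (v4_mode & (ACE_READ_DATA | ACE_EXECUTE)))
+		return (EACCES);
+	return (0);
+}
+#endif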
+
+/*
+ * The primary usage of this function is to loop through all of the
+ * ACEs in the znode, determining what accesses of interest (AoI) to
+ * the caller are allowed or denied. The AoI are expressed as bits in
+ * the working_mode parameter. As each ACE is processed, bits covered
+ * by that ACE are removed from the working_mode. This removal
+ * facilitates two things. The first is that when the working mode is
+ * empty (= 0), we know we've looked at all the AoI. The second is
+ * that the ACE interpretation rules don't allow a later ACE to undo
+ * something granted or denied by an earlier ACE. Removing the
+ * discovered access or denial enforces this rule. At the end of
+ * processing the ACEs, all AoI that were found to be denied are
+ * placed into the working_mode, giving the caller a mask of denied
+ * accesses. Returns:
+ * 0 if all AoI granted
+ * EACCES if the denied mask is non-zero
+ * other error if abnormal failure (e.g., IO error)
+ *
+ * A secondary usage of the function is to determine if any of the
+ * AoI are granted. If an ACE grants any access in
+ * the working_mode, we immediately short circuit out of the function.
+ * This mode is chosen by setting anyaccess to B_TRUE. The
+ * working_mode is not a denied access mask upon exit if the function
+ * is used in this manner.
+ */
+static int
+zfs_zaccess_aces_check(znode_t *zp, uint32_t *working_mode,
+ boolean_t anyaccess, cred_t *cr)
+{
+ zfsvfs_t *zfsvfs = zp->z_zfsvfs;
+ zfs_acl_t *aclp;
+ int error;
+ uid_t uid = crgetuid(cr);
+ uint64_t who;
+ uint16_t type, iflags;
+ uint16_t entry_type;
+ uint32_t access_mask;
+ uint32_t deny_mask = 0;
+ zfs_ace_hdr_t *acep = NULL;
+ boolean_t checkit;
+ uid_t gowner;
+ uid_t fowner;
+
+ zfs_fuid_map_ids(zp, cr, &fowner, &gowner);
+
+ mutex_enter(&zp->z_acl_lock);
+
+ ASSERT_VOP_LOCKED(ZTOV(zp), __func__);
+ error = zfs_acl_node_read(zp, &aclp, B_FALSE);
+ if (error != 0) {
+ mutex_exit(&zp->z_acl_lock);
+ return (error);
+ }
+
+ ASSERT(zp->z_acl_cached);
+
+ while (acep = zfs_acl_next_ace(aclp, acep, &who, &access_mask,
+ &iflags, &type)) {
+ uint32_t mask_matched;
+
+ if (!zfs_acl_valid_ace_type(type, iflags))
+ continue;
+
+ if (ZTOV(zp)->v_type == VDIR && (iflags & ACE_INHERIT_ONLY_ACE))
+ continue;
+
+ /* Skip ACE if it does not affect any AoI */
+ mask_matched = (access_mask & *working_mode);
+ if (!mask_matched)
+ continue;
+
+ entry_type = (iflags & ACE_TYPE_FLAGS);
+
+ checkit = B_FALSE;
+
+ switch (entry_type) {
+ case ACE_OWNER:
+ if (uid == fowner)
+ checkit = B_TRUE;
+ break;
+ case OWNING_GROUP:
+ who = gowner;
+ /*FALLTHROUGH*/
+ case ACE_IDENTIFIER_GROUP:
+ checkit = zfs_groupmember(zfsvfs, who, cr);
+ break;
+ case ACE_EVERYONE:
+ checkit = B_TRUE;
+ break;
+
+ /* USER Entry */
+ default:
+ if (entry_type == 0) {
+ uid_t newid;
+
+ newid = zfs_fuid_map_id(zfsvfs, who, cr,
+ ZFS_ACE_USER);
+ if (newid != IDMAP_WK_CREATOR_OWNER_UID &&
+ uid == newid)
+ checkit = B_TRUE;
+ break;
+ } else {
+ mutex_exit(&zp->z_acl_lock);
+ return (SET_ERROR(EIO));
+ }
+ }
+
+ if (checkit) {
+ if (type == DENY) {
+ DTRACE_PROBE3(zfs__ace__denies,
+ znode_t *, zp,
+ zfs_ace_hdr_t *, acep,
+ uint32_t, mask_matched);
+ deny_mask |= mask_matched;
+ } else {
+ DTRACE_PROBE3(zfs__ace__allows,
+ znode_t *, zp,
+ zfs_ace_hdr_t *, acep,
+ uint32_t, mask_matched);
+ if (anyaccess) {
+ mutex_exit(&zp->z_acl_lock);
+ return (0);
+ }
+ }
+ *working_mode &= ~mask_matched;
+ }
+
+ /* Are we done? */
+ if (*working_mode == 0)
+ break;
+ }
+
+ mutex_exit(&zp->z_acl_lock);
+
+ /* Put the found 'denies' back on the working mode */
+ if (deny_mask) {
+ *working_mode |= deny_mask;
+ return (SET_ERROR(EACCES));
+ } else if (*working_mode) {
+ return (-1);
+ }
+
+ return (0);
+}
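+
+/*
+ * Userland model of the working_mode bookkeeping above, for
+ * illustration. Each applicable ACE (the checkit logic is folded
+ * into the mask here) removes the bits it decides; denied bits are
+ * put back at the end so the caller receives a denial mask. EACCES
+ * and the -1 "undecided" return mirror the function above.
+ */
+#if 0
+#include <assert.h>
+#include <errno.h>
+#include <stdint.h>
+
+struct mini_ace {
+	uint32_t mask;	/* bits this ACE decides for this cred */
+	int deny;	/* 1 = DENY, 0 = ALLOW */
+};
+
+static int
+aces_check_model(uint32_t *working_mode, const struct mini_ace *aces, int n)
+{
+	uint32_t deny_mask = 0;
+
+	for (int i = 0; i < n && *working_mode != 0; i++) {
+		uint32_t matched = aces[i].mask & *working_mode;
+
+		if (matched == 0)
+			continue;
+		if (aces[i].deny)
+			deny_mask |= matched;
+		*working_mode &= ~matched;
+	}
+	if (deny_mask != 0) {
+		*working_mode |= deny_mask;
+		return (EACCES);
+	}
+	return (*working_mode != 0 ? -1 : 0);
+}
+
+static void
+aces_check_example(void)
+{
+	uint32_t wm = 0x1;	/* one access of interest */
+	struct mini_ace aces[1] = { { .mask = 0x1, .deny = 1 } };
+
+	assert(aces_check_model(&wm, aces, 1) == EACCES);
+	assert(wm == 0x1);	/* the denied bit is handed back */
+}
+#endif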
+
+/*
+ * Return true if any access whatsoever is granted; we don't
+ * actually care which access it is.
+ */
+boolean_t
+zfs_has_access(znode_t *zp, cred_t *cr)
+{
+ uint32_t have = ACE_ALL_PERMS;
+
+ if (zfs_zaccess_aces_check(zp, &have, B_TRUE, cr) != 0) {
+ uid_t owner;
+
+ owner = zfs_fuid_map_id(zp->z_zfsvfs, zp->z_uid, cr, ZFS_OWNER);
+ return (secpolicy_vnode_any_access(cr, ZTOV(zp), owner) == 0);
+ }
+ return (B_TRUE);
+}
+
+static int
+zfs_zaccess_common(znode_t *zp, uint32_t v4_mode, uint32_t *working_mode,
+ boolean_t *check_privs, boolean_t skipaclchk, cred_t *cr)
+{
+ zfsvfs_t *zfsvfs = zp->z_zfsvfs;
+ int err;
+
+ *working_mode = v4_mode;
+ *check_privs = B_TRUE;
+
+ /*
+ * Short circuit empty requests
+ */
+ if (v4_mode == 0 || zfsvfs->z_replay) {
+ *working_mode = 0;
+ return (0);
+ }
+
+ if ((err = zfs_zaccess_dataset_check(zp, v4_mode)) != 0) {
+ *check_privs = B_FALSE;
+ return (err);
+ }
+
+ /*
+ * The caller requested that the ACL check be skipped. This
+ * would only happen if the caller checked VOP_ACCESS() with a
+ * 32 bit ACE mask and already had the appropriate permissions.
+ */
+ if (skipaclchk) {
+ *working_mode = 0;
+ return (0);
+ }
+
+ /*
+ * Note: ZFS_READONLY represents the "DOS R/O" attribute.
+ * When that flag is set, we should behave as if write access
+ * were not granted by anything in the ACL. In particular:
+ * We _must_ allow writes after opening the file r/w, then
+ * setting the DOS R/O attribute, and writing some more.
+ * (Similar to how you can write after fchmod(fd, 0444).)
+ *
+ * Therefore ZFS_READONLY is ignored in the dataset check
+ * above, and checked here as if part of the ACL check.
+ * Also note: DOS R/O is ignored for directories.
+ */
+ if ((v4_mode & WRITE_MASK_DATA) &&
+ (ZTOV(zp)->v_type != VDIR) &&
+ (zp->z_pflags & ZFS_READONLY)) {
+ return (SET_ERROR(EPERM));
+ }
+
+ return (zfs_zaccess_aces_check(zp, working_mode, B_FALSE, cr));
+}
+
+static int
+zfs_zaccess_append(znode_t *zp, uint32_t *working_mode, boolean_t *check_privs,
+ cred_t *cr)
+{
+ if (*working_mode != ACE_WRITE_DATA)
+ return (SET_ERROR(EACCES));
+
+ return (zfs_zaccess_common(zp, ACE_APPEND_DATA, working_mode,
+ check_privs, B_FALSE, cr));
+}
+
+/*
+ * Check if VEXEC is allowed.
+ *
+ * This routine is based on zfs_fastaccesschk_execute, whose slow path
+ * calls zfs_zaccess. That would be incorrect on FreeBSD (see
+ * zfs_freebsd_access for the difference). Thus this variant lets the
+ * caller handle the slow path (if necessary).
+ *
+ * We only check for ZFS_NO_EXECS_DENIED and fail early. This routine can
+ * be extended to cover more cases, but the flag covers the majority.
+ */
+int
+zfs_freebsd_fastaccesschk_execute(struct vnode *vp, cred_t *cr)
+{
+ boolean_t is_attr;
+ znode_t *zdp = VTOZ(vp);
+
+ ASSERT_VOP_LOCKED(vp, __func__);
+
+ if (zdp->z_pflags & ZFS_AV_QUARANTINED)
+ return (1);
+
+ is_attr = ((zdp->z_pflags & ZFS_XATTR) &&
+ (ZTOV(zdp)->v_type == VDIR));
+ if (is_attr)
+ return (1);
+
+ if (zdp->z_pflags & ZFS_NO_EXECS_DENIED)
+ return (0);
+
+ return (1);
+}
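+
+/*
+ * Hypothetical caller pattern, for illustration (lookup_exec_check()
+ * is not a real function): try the fast path first and fall back to
+ * the caller's normal slow path, e.g. VOP_ACCESS(), only when the
+ * fast path cannot prove access.
+ */
+#if 0
+static int
+lookup_exec_check(struct vnode *dvp, cred_t *cr)
+{
+	if (zfs_freebsd_fastaccesschk_execute(dvp, cr) == 0)
+		return (0);	/* ZFS_NO_EXECS_DENIED: execute is allowed */
+	return (VOP_ACCESS(dvp, VEXEC, cr, curthread));
+}
+#endif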
+
+#ifdef illumos
+int
+zfs_fastaccesschk_execute(znode_t *zdp, cred_t *cr)
+{
+ boolean_t owner = B_FALSE;
+ boolean_t groupmbr = B_FALSE;
+ boolean_t is_attr;
+ uid_t uid = crgetuid(cr);
+ int error;
+
+ if (zdp->z_pflags & ZFS_AV_QUARANTINED)
+ return (SET_ERROR(EACCES));
+
+ is_attr = ((zdp->z_pflags & ZFS_XATTR) &&
+ (ZTOV(zdp)->v_type == VDIR));
+ if (is_attr)
+ goto slow;
+
+
+ mutex_enter(&zdp->z_acl_lock);
+
+ if (zdp->z_pflags & ZFS_NO_EXECS_DENIED) {
+ mutex_exit(&zdp->z_acl_lock);
+ return (0);
+ }
+
+ if (FUID_INDEX(zdp->z_uid) != 0 || FUID_INDEX(zdp->z_gid) != 0) {
+ mutex_exit(&zdp->z_acl_lock);
+ goto slow;
+ }
+
+ if (uid == zdp->z_uid) {
+ owner = B_TRUE;
+ if (zdp->z_mode & S_IXUSR) {
+ mutex_exit(&zdp->z_acl_lock);
+ return (0);
+ } else {
+ mutex_exit(&zdp->z_acl_lock);
+ goto slow;
+ }
+ }
+ if (groupmember(zdp->z_gid, cr)) {
+ groupmbr = B_TRUE;
+ if (zdp->z_mode & S_IXGRP) {
+ mutex_exit(&zdp->z_acl_lock);
+ return (0);
+ } else {
+ mutex_exit(&zdp->z_acl_lock);
+ goto slow;
+ }
+ }
+ if (!owner && !groupmbr) {
+ if (zdp->z_mode & S_IXOTH) {
+ mutex_exit(&zdp->z_acl_lock);
+ return (0);
+ }
+ }
+
+ mutex_exit(&zdp->z_acl_lock);
+
+slow:
+ DTRACE_PROBE(zfs__fastpath__execute__access__miss);
+ ZFS_ENTER(zdp->z_zfsvfs);
+ error = zfs_zaccess(zdp, ACE_EXECUTE, 0, B_FALSE, cr);
+ ZFS_EXIT(zdp->z_zfsvfs);
+ return (error);
+}
+#endif
+
+/*
+ * Determine whether access should be granted or denied.
+ *
+ * The least-privilege subsystem is always consulted, as a basic
+ * privilege can define any form of access.
+ */
+int
+zfs_zaccess(znode_t *zp, int mode, int flags, boolean_t skipaclchk, cred_t *cr)
+{
+ uint32_t working_mode;
+ int error;
+ int is_attr;
+ boolean_t check_privs;
+ znode_t *xzp;
+ znode_t *check_zp = zp;
+ mode_t needed_bits;
+ uid_t owner;
+
+ is_attr = ((zp->z_pflags & ZFS_XATTR) && (ZTOV(zp)->v_type == VDIR));
+
+#ifdef __FreeBSD_kernel__
+ /*
+ * In FreeBSD, we don't care about permissions of individual ADS.
+ * Note that not checking them is not just an optimization - without
+ * this shortcut, EA operations may bogusly fail with EACCES.
+ */
+ if (zp->z_pflags & ZFS_XATTR)
+ return (0);
+#else
+ /*
+ * If attribute then validate against base file
+ */
+ if (is_attr) {
+ uint64_t parent;
+
+ if ((error = sa_lookup(zp->z_sa_hdl,
+ SA_ZPL_PARENT(zp->z_zfsvfs), &parent,
+ sizeof (parent))) != 0)
+ return (error);
+
+ if ((error = zfs_zget(zp->z_zfsvfs,
+ parent, &xzp)) != 0) {
+ return (error);
+ }
+
+ check_zp = xzp;
+
+ /*
+ * fixup mode to map to xattr perms
+ */
+
+ if (mode & (ACE_WRITE_DATA|ACE_APPEND_DATA)) {
+ mode &= ~(ACE_WRITE_DATA|ACE_APPEND_DATA);
+ mode |= ACE_WRITE_NAMED_ATTRS;
+ }
+
+ if (mode & (ACE_READ_DATA|ACE_EXECUTE)) {
+ mode &= ~(ACE_READ_DATA|ACE_EXECUTE);
+ mode |= ACE_READ_NAMED_ATTRS;
+ }
+ }
+#endif
+
+ owner = zfs_fuid_map_id(zp->z_zfsvfs, zp->z_uid, cr, ZFS_OWNER);
+ /*
+ * Map the bits required to the standard vnode flags VREAD|VWRITE|VEXEC
+ * in needed_bits. Any bits the ACL check leaves in working_mode are
+ * mapped to checkmode below, and secpolicy_vnode_access2() is called
+ * with (needed_bits & ~checkmode), needed_bits.
+ */
+ needed_bits = 0;
+
+ working_mode = mode;
+ if ((working_mode & (ACE_READ_ACL|ACE_READ_ATTRIBUTES)) &&
+ owner == crgetuid(cr))
+ working_mode &= ~(ACE_READ_ACL|ACE_READ_ATTRIBUTES);
+
+ if (working_mode & (ACE_READ_DATA|ACE_READ_NAMED_ATTRS|
+ ACE_READ_ACL|ACE_READ_ATTRIBUTES|ACE_SYNCHRONIZE))
+ needed_bits |= VREAD;
+ if (working_mode & (ACE_WRITE_DATA|ACE_WRITE_NAMED_ATTRS|
+ ACE_APPEND_DATA|ACE_WRITE_ATTRIBUTES|ACE_SYNCHRONIZE))
+ needed_bits |= VWRITE;
+ if (working_mode & ACE_EXECUTE)
+ needed_bits |= VEXEC;
+
+ if ((error = zfs_zaccess_common(check_zp, mode, &working_mode,
+ &check_privs, skipaclchk, cr)) == 0) {
+ if (is_attr)
+ VN_RELE(ZTOV(xzp));
+ return (secpolicy_vnode_access2(cr, ZTOV(zp), owner,
+ needed_bits, needed_bits));
+ }
+
+ if (error && !check_privs) {
+ if (is_attr)
+ VN_RELE(ZTOV(xzp));
+ return (error);
+ }
+
+ if (error && (flags & V_APPEND)) {
+ error = zfs_zaccess_append(zp, &working_mode, &check_privs, cr);
+ }
+
+ if (error && check_privs) {
+ mode_t checkmode = 0;
+
+ /*
+ * First check for implicit owner permission on
+ * read_acl/read_attributes
+ */
+
+ error = 0;
+ ASSERT(working_mode != 0);
+
+ if ((working_mode & (ACE_READ_ACL|ACE_READ_ATTRIBUTES) &&
+ owner == crgetuid(cr)))
+ working_mode &= ~(ACE_READ_ACL|ACE_READ_ATTRIBUTES);
+
+ if (working_mode & (ACE_READ_DATA|ACE_READ_NAMED_ATTRS|
+ ACE_READ_ACL|ACE_READ_ATTRIBUTES|ACE_SYNCHRONIZE))
+ checkmode |= VREAD;
+ if (working_mode & (ACE_WRITE_DATA|ACE_WRITE_NAMED_ATTRS|
+ ACE_APPEND_DATA|ACE_WRITE_ATTRIBUTES|ACE_SYNCHRONIZE))
+ checkmode |= VWRITE;
+ if (working_mode & ACE_EXECUTE)
+ checkmode |= VEXEC;
+
+ error = secpolicy_vnode_access2(cr, ZTOV(check_zp), owner,
+ needed_bits & ~checkmode, needed_bits);
+
+ if (error == 0 && (working_mode & ACE_WRITE_OWNER))
+ error = secpolicy_vnode_chown(ZTOV(check_zp), cr, owner);
+ if (error == 0 && (working_mode & ACE_WRITE_ACL))
+ error = secpolicy_vnode_setdac(ZTOV(check_zp), cr, owner);
+
+ if (error == 0 && (working_mode &
+ (ACE_DELETE|ACE_DELETE_CHILD)))
+ error = secpolicy_vnode_remove(ZTOV(check_zp), cr);
+
+ if (error == 0 && (working_mode & ACE_SYNCHRONIZE)) {
+ error = secpolicy_vnode_chown(ZTOV(check_zp), cr, owner);
+ }
+ if (error == 0) {
+ /*
+ * See if any bits other than those already checked
+ * for are still present. If so then return EACCES
+ */
+ if (working_mode & ~(ZFS_CHECKED_MASKS)) {
+ error = SET_ERROR(EACCES);
+ }
+ }
+ } else if (error == 0) {
+ error = secpolicy_vnode_access2(cr, ZTOV(zp), owner,
+ needed_bits, needed_bits);
+ }
+
+
+ if (is_attr)
+ VN_RELE(ZTOV(xzp));
+
+ return (error);
+}
+
+/*
+ * Translate traditional unix VREAD/VWRITE/VEXEC mode into
+ * native ACL format and call zfs_zaccess()
+ */
+int
+zfs_zaccess_rwx(znode_t *zp, mode_t mode, int flags, cred_t *cr)
+{
+ return (zfs_zaccess(zp, zfs_unix_to_v4(mode >> 6), flags, B_FALSE, cr));
+}
+
+/*
+ * Access function for secpolicy_vnode_setattr
+ */
+int
+zfs_zaccess_unix(znode_t *zp, mode_t mode, cred_t *cr)
+{
+ int v4_mode = zfs_unix_to_v4(mode >> 6);
+
+ return (zfs_zaccess(zp, v4_mode, 0, B_FALSE, cr));
+}
+
+static int
+zfs_delete_final_check(znode_t *zp, znode_t *dzp,
+ mode_t available_perms, cred_t *cr)
+{
+ int error;
+ uid_t downer;
+
+ downer = zfs_fuid_map_id(dzp->z_zfsvfs, dzp->z_uid, cr, ZFS_OWNER);
+
+ error = secpolicy_vnode_access2(cr, ZTOV(dzp),
+ downer, available_perms, VWRITE|VEXEC);
+
+ if (error == 0)
+ error = zfs_sticky_remove_access(dzp, zp, cr);
+
+ return (error);
+}
+
+/*
+ * Determine whether access should be granted or denied, without
+ * consulting the least-privilege subsystem.
+ *
+ * The following chart is the recommended NFSv4 enforcement for
+ * ability to delete an object.
+ *
+ * -------------------------------------------------------
+ * | Parent Dir | Target Object Permissions |
+ * | permissions | |
+ * -------------------------------------------------------
+ * | | ACL Allows | ACL Denies| Delete |
+ * | | Delete | Delete | unspecified|
+ * -------------------------------------------------------
+ * | ACL Allows | Permit | Permit | Permit |
+ * | DELETE_CHILD | |
+ * -------------------------------------------------------
+ * | ACL Denies | Permit | Deny | Deny |
+ * | DELETE_CHILD | | | |
+ * -------------------------------------------------------
+ * | ACL specifies | | | |
+ * | only allow | Permit | Permit | Permit |
+ * | write and | | | |
+ * | execute | | | |
+ * -------------------------------------------------------
+ * | ACL denies | | | |
+ * | write and | Permit | Deny | Deny |
+ * | execute | | | |
+ * -------------------------------------------------------
+ * ^
+ * |
+ * No search privilege, can't even look up file?
+ *
+ */
+int
+zfs_zaccess_delete(znode_t *dzp, znode_t *zp, cred_t *cr)
+{
+ uint32_t dzp_working_mode = 0;
+ uint32_t zp_working_mode = 0;
+ int dzp_error, zp_error;
+ mode_t available_perms;
+ boolean_t dzpcheck_privs = B_TRUE;
+ boolean_t zpcheck_privs = B_TRUE;
+
+ /*
+ * We want specific DELETE permissions to
+ * take precedence over WRITE/EXECUTE. We don't
+ * want an ACL such as this to mess us up.
+ * user:joe:write_data:deny,user:joe:delete:allow
+ *
+ * However, deny permissions may ultimately be overridden
+ * by secpolicy_vnode_access().
+ *
+ * We will ask for all of the necessary permissions and then
+ * look at the working modes from the directory and target object
+ * to determine what was found.
+ */
+
+ if (zp->z_pflags & (ZFS_IMMUTABLE | ZFS_NOUNLINK))
+ return (SET_ERROR(EPERM));
+
+ /*
+ * First row
+ * If the directory permissions allow the delete, we are done.
+ */
+ if ((dzp_error = zfs_zaccess_common(dzp, ACE_DELETE_CHILD,
+ &dzp_working_mode, &dzpcheck_privs, B_FALSE, cr)) == 0)
+ return (0);
+
+ /*
+ * If target object has delete permission then we are done
+ */
+ if ((zp_error = zfs_zaccess_common(zp, ACE_DELETE, &zp_working_mode,
+ &zpcheck_privs, B_FALSE, cr)) == 0)
+ return (0);
+
+ ASSERT(dzp_error && zp_error);
+
+ if (!dzpcheck_privs)
+ return (dzp_error);
+ if (!zpcheck_privs)
+ return (zp_error);
+
+ /*
+ * Second row
+ *
+ * If directory returns EACCES then delete_child was denied
+ * due to deny delete_child. In this case send the request through
+ * secpolicy_vnode_remove(). We don't use zfs_delete_final_check()
+ * since that *could* allow the delete based on write/execute permission
+ * and we want delete permissions to override write/execute.
+ */
+
+ if (dzp_error == EACCES)
+ return (secpolicy_vnode_remove(ZTOV(dzp), cr)); /* XXXPJD: s/dzp/zp/ ? */
+
+ /*
+ * Third Row
+ * only need to see if we have write/execute on directory.
+ */
+
+ dzp_error = zfs_zaccess_common(dzp, ACE_EXECUTE|ACE_WRITE_DATA,
+ &dzp_working_mode, &dzpcheck_privs, B_FALSE, cr);
+
+ if (dzp_error != 0 && !dzpcheck_privs)
+ return (dzp_error);
+
+ /*
+ * Fourth row
+ */
+
+ available_perms = (dzp_working_mode & ACE_WRITE_DATA) ? 0 : VWRITE;
+ available_perms |= (dzp_working_mode & ACE_EXECUTE) ? 0 : VEXEC;
+
+ return (zfs_delete_final_check(zp, dzp, available_perms, cr));
+
+}
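+
+/*
+ * The chart and code above reduced to a predicate, for illustration.
+ * The privilege fallback (secpolicy_vnode_remove) and the sticky-bit
+ * check (zfs_sticky_remove_access) are folded into the two boolean
+ * inputs.
+ */
+#if 0
+#include <stdbool.h>
+
+enum tristate { T_ALLOW, T_DENY, T_UNSPEC };
+
+static bool
+may_delete_model(enum tristate dir_delete_child, enum tristate obj_delete,
+    bool dir_write_exec, bool priv_removes)
+{
+	/* Rows 1 and 2: an explicit allow on either side wins. */
+	if (dir_delete_child == T_ALLOW || obj_delete == T_ALLOW)
+		return (true);
+	/* Row 2: an explicit DELETE_CHILD deny defers to privilege. */
+	if (dir_delete_child == T_DENY)
+		return (priv_removes);
+	/* Rows 3 and 4: write+execute on the directory decides. */
+	return (dir_write_exec);
+}
+#endif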
+
+int
+zfs_zaccess_rename(znode_t *sdzp, znode_t *szp, znode_t *tdzp,
+ znode_t *tzp, cred_t *cr)
+{
+ int add_perm;
+ int error;
+
+ if (szp->z_pflags & ZFS_AV_QUARANTINED)
+ return (SET_ERROR(EACCES));
+
+ add_perm = (ZTOV(szp)->v_type == VDIR) ?
+ ACE_ADD_SUBDIRECTORY : ACE_ADD_FILE;
+
+ /*
+ * Rename permissions are a combination of delete permission +
+ * add file/subdir permission.
+ *
+ * BSD operating systems also require write permission
+ * on the directory being moved from one parent directory
+ * to another.
+ */
+ if (ZTOV(szp)->v_type == VDIR && ZTOV(sdzp) != ZTOV(tdzp)) {
+ if (error = zfs_zaccess(szp, ACE_WRITE_DATA, 0, B_FALSE, cr))
+ return (error);
+ }
+
+ /*
+ * first make sure we do the delete portion.
+ *
+ * If that succeeds then check for add_file/add_subdir permissions
+ */
+
+ if (error = zfs_zaccess_delete(sdzp, szp, cr))
+ return (error);
+
+ /*
+ * If we have a tzp, see if we can delete it.
+ */
+ if (tzp) {
+ if (error = zfs_zaccess_delete(tdzp, tzp, cr))
+ return (error);
+ }
+
+ /*
+ * Now check for add permissions
+ */
+ error = zfs_zaccess(tdzp, add_perm, 0, B_FALSE, cr);
+
+ return (error);
+}
diff --git a/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/zfs_byteswap.c b/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/zfs_byteswap.c
new file mode 100644
index 000000000000..6048eb124525
--- /dev/null
+++ b/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/zfs_byteswap.c
@@ -0,0 +1,199 @@
+/*
+ * CDDL HEADER START
+ *
+ * The contents of this file are subject to the terms of the
+ * Common Development and Distribution License (the "License").
+ * You may not use this file except in compliance with the License.
+ *
+ * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
+ * or http://www.opensolaris.org/os/licensing.
+ * See the License for the specific language governing permissions
+ * and limitations under the License.
+ *
+ * When distributing Covered Code, include this CDDL HEADER in each
+ * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
+ * If applicable, add the following below this CDDL HEADER, with the
+ * fields enclosed by brackets "[]" replaced with your own identifying
+ * information: Portions Copyright [yyyy] [name of copyright owner]
+ *
+ * CDDL HEADER END
+ */
+/*
+ * Copyright 2010 Sun Microsystems, Inc. All rights reserved.
+ * Use is subject to license terms.
+ */
+
+#include <sys/zfs_context.h>
+#include <sys/vfs.h>
+#include <sys/fs/zfs.h>
+#include <sys/zfs_znode.h>
+#include <sys/zfs_sa.h>
+#include <sys/zfs_acl.h>
+
+void
+zfs_oldace_byteswap(ace_t *ace, int ace_cnt)
+{
+ int i;
+
+ for (i = 0; i != ace_cnt; i++, ace++) {
+ ace->a_who = BSWAP_32(ace->a_who);
+ ace->a_access_mask = BSWAP_32(ace->a_access_mask);
+ ace->a_flags = BSWAP_16(ace->a_flags);
+ ace->a_type = BSWAP_16(ace->a_type);
+ }
+}
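+
+/*
+ * Userland illustration of the byte reversal performed above;
+ * BSWAP_32/BSWAP_16 are assumed to behave like the explicit shift
+ * form below. Swapping twice restores the original value, which is
+ * why the same routine serves both byte orders.
+ */
+#if 0
+#include <assert.h>
+#include <stdint.h>
+
+static uint32_t
+bswap32_model(uint32_t x)
+{
+	return (((x & 0xff) << 24) | ((x & 0xff00) << 8) |
+	    ((x >> 8) & 0xff00) | (x >> 24));
+}
+
+static void
+bswap32_example(void)
+{
+	assert(bswap32_model(0x12345678) == 0x78563412);
+	assert(bswap32_model(bswap32_model(0x12345678)) == 0x12345678);
+}
+#endif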
+
+/*
+ * Swap ace_t and ace_object_t entries.
+ */
+void
+zfs_ace_byteswap(void *buf, size_t size, boolean_t zfs_layout)
+{
+ caddr_t end;
+ caddr_t ptr;
+ zfs_ace_t *zacep = NULL;
+ ace_t *acep;
+ uint16_t entry_type;
+ size_t entry_size;
+ int ace_type;
+
+ end = (caddr_t)buf + size;
+ ptr = buf;
+
+ while (ptr < end) {
+ if (zfs_layout) {
+ /*
+ * Avoid overrun. Embedded ACEs can have one
+ * of several sizes. We don't know exactly
+ * how many are present, only the size of the
+ * buffer containing them. That size may be
+ * larger than needed to hold the ACEs
+ * present. As long as we do not do any
+ * swapping beyond the end of our block we are
+ * okay. It is safe to swap any non-ACE data
+ * within the block since it is just zeros.
+ */
+ if (ptr + sizeof (zfs_ace_hdr_t) > end) {
+ break;
+ }
+ zacep = (zfs_ace_t *)ptr;
+ zacep->z_hdr.z_access_mask =
+ BSWAP_32(zacep->z_hdr.z_access_mask);
+ zacep->z_hdr.z_flags = BSWAP_16(zacep->z_hdr.z_flags);
+ ace_type = zacep->z_hdr.z_type =
+ BSWAP_16(zacep->z_hdr.z_type);
+ entry_type = zacep->z_hdr.z_flags & ACE_TYPE_FLAGS;
+ } else {
+ /* Overrun avoidance */
+ if (ptr + sizeof (ace_t) > end) {
+ break;
+ }
+ acep = (ace_t *)ptr;
+ acep->a_access_mask = BSWAP_32(acep->a_access_mask);
+ acep->a_flags = BSWAP_16(acep->a_flags);
+ ace_type = acep->a_type = BSWAP_16(acep->a_type);
+ acep->a_who = BSWAP_32(acep->a_who);
+ entry_type = acep->a_flags & ACE_TYPE_FLAGS;
+ }
+ switch (entry_type) {
+ case ACE_OWNER:
+ case ACE_EVERYONE:
+ case (ACE_IDENTIFIER_GROUP | ACE_GROUP):
+ entry_size = zfs_layout ?
+ sizeof (zfs_ace_hdr_t) : sizeof (ace_t);
+ break;
+ case ACE_IDENTIFIER_GROUP:
+ default:
+ /* Overrun avoidance */
+ if (zfs_layout) {
+ if (ptr + sizeof (zfs_ace_t) <= end) {
+ zacep->z_fuid = BSWAP_64(zacep->z_fuid);
+ } else {
+ entry_size = sizeof (zfs_ace_t);
+ break;
+ }
+ }
+ switch (ace_type) {
+ case ACE_ACCESS_ALLOWED_OBJECT_ACE_TYPE:
+ case ACE_ACCESS_DENIED_OBJECT_ACE_TYPE:
+ case ACE_SYSTEM_AUDIT_OBJECT_ACE_TYPE:
+ case ACE_SYSTEM_ALARM_OBJECT_ACE_TYPE:
+ entry_size = zfs_layout ?
+ sizeof (zfs_object_ace_t) :
+ sizeof (ace_object_t);
+ break;
+ default:
+ entry_size = zfs_layout ? sizeof (zfs_ace_t) :
+ sizeof (ace_t);
+ break;
+ }
+ }
+ ptr = ptr + entry_size;
+ }
+}
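+
+/*
+ * Skeleton of the bounds-checked walk above, for illustration. The
+ * real code derives each entry's size from its type; the sketch
+ * stores an explicit size in the header for brevity. The invariant
+ * is the same: never read a header or an entry tail that extends
+ * past the end of the buffer.
+ */
+#if 0
+#include <stddef.h>
+#include <stdint.h>
+
+struct hdr {
+	uint16_t type;
+	uint16_t size;	/* total entry size, including this header */
+};
+
+static void
+walk_entries(void *buf, size_t buflen, void (*fn)(struct hdr *))
+{
+	char *ptr = buf;
+	char *end = ptr + buflen;
+
+	while (ptr + sizeof (struct hdr) <= end) {
+		struct hdr *h = (struct hdr *)(void *)ptr;
+
+		if (h->size < sizeof (struct hdr) || ptr + h->size > end)
+			break;	/* truncated or corrupt entry */
+		fn(h);
+		ptr += h->size;
+	}
+}
+#endif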
+
+/* ARGSUSED */
+void
+zfs_oldacl_byteswap(void *buf, size_t size)
+{
+ int cnt;
+
+ /*
+ * Arggh, since we don't know how many ACEs are in
+ * the array, we have to swap the entire block
+ */
+
+ cnt = size / sizeof (ace_t);
+
+ zfs_oldace_byteswap((ace_t *)buf, cnt);
+}
+
+/* ARGSUSED */
+void
+zfs_acl_byteswap(void *buf, size_t size)
+{
+ zfs_ace_byteswap(buf, size, B_TRUE);
+}
+
+void
+zfs_znode_byteswap(void *buf, size_t size)
+{
+ znode_phys_t *zp = buf;
+
+ ASSERT(size >= sizeof (znode_phys_t));
+
+ zp->zp_crtime[0] = BSWAP_64(zp->zp_crtime[0]);
+ zp->zp_crtime[1] = BSWAP_64(zp->zp_crtime[1]);
+ zp->zp_atime[0] = BSWAP_64(zp->zp_atime[0]);
+ zp->zp_atime[1] = BSWAP_64(zp->zp_atime[1]);
+ zp->zp_mtime[0] = BSWAP_64(zp->zp_mtime[0]);
+ zp->zp_mtime[1] = BSWAP_64(zp->zp_mtime[1]);
+ zp->zp_ctime[0] = BSWAP_64(zp->zp_ctime[0]);
+ zp->zp_ctime[1] = BSWAP_64(zp->zp_ctime[1]);
+ zp->zp_gen = BSWAP_64(zp->zp_gen);
+ zp->zp_mode = BSWAP_64(zp->zp_mode);
+ zp->zp_size = BSWAP_64(zp->zp_size);
+ zp->zp_parent = BSWAP_64(zp->zp_parent);
+ zp->zp_links = BSWAP_64(zp->zp_links);
+ zp->zp_xattr = BSWAP_64(zp->zp_xattr);
+ zp->zp_rdev = BSWAP_64(zp->zp_rdev);
+ zp->zp_flags = BSWAP_64(zp->zp_flags);
+ zp->zp_uid = BSWAP_64(zp->zp_uid);
+ zp->zp_gid = BSWAP_64(zp->zp_gid);
+ zp->zp_zap = BSWAP_64(zp->zp_zap);
+ zp->zp_pad[0] = BSWAP_64(zp->zp_pad[0]);
+ zp->zp_pad[1] = BSWAP_64(zp->zp_pad[1]);
+ zp->zp_pad[2] = BSWAP_64(zp->zp_pad[2]);
+
+ zp->zp_acl.z_acl_extern_obj = BSWAP_64(zp->zp_acl.z_acl_extern_obj);
+ zp->zp_acl.z_acl_size = BSWAP_32(zp->zp_acl.z_acl_size);
+ zp->zp_acl.z_acl_version = BSWAP_16(zp->zp_acl.z_acl_version);
+ zp->zp_acl.z_acl_count = BSWAP_16(zp->zp_acl.z_acl_count);
+ if (zp->zp_acl.z_acl_version == ZFS_ACL_VERSION) {
+ zfs_acl_byteswap((void *)&zp->zp_acl.z_ace_data[0],
+ ZFS_ACE_SPACE);
+ } else {
+ zfs_oldace_byteswap((ace_t *)&zp->zp_acl.z_ace_data[0],
+ ACE_SLOT_CNT);
+ }
+}
diff --git a/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/zfs_ctldir.c b/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/zfs_ctldir.c
new file mode 100644
index 000000000000..9775d842f55e
--- /dev/null
+++ b/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/zfs_ctldir.c
@@ -0,0 +1,1364 @@
+/*
+ * CDDL HEADER START
+ *
+ * The contents of this file are subject to the terms of the
+ * Common Development and Distribution License (the "License").
+ * You may not use this file except in compliance with the License.
+ *
+ * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
+ * or http://www.opensolaris.org/os/licensing.
+ * See the License for the specific language governing permissions
+ * and limitations under the License.
+ *
+ * When distributing Covered Code, include this CDDL HEADER in each
+ * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
+ * If applicable, add the following below this CDDL HEADER, with the
+ * fields enclosed by brackets "[]" replaced with your own identifying
+ * information: Portions Copyright [yyyy] [name of copyright owner]
+ *
+ * CDDL HEADER END
+ */
+/*
+ * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved.
+ * Copyright (c) 2012, 2015 by Delphix. All rights reserved.
+ * Copyright 2015, OmniTI Computer Consulting, Inc. All rights reserved.
+ */
+
+/*
+ * ZFS control directory (a.k.a. ".zfs")
+ *
+ * This directory provides a common location for all ZFS meta-objects.
+ * Currently, this is only the 'snapshot' directory, but this may expand in the
+ * future. The elements are built using the GFS primitives, as the hierarchy
+ * does not actually exist on disk.
+ *
+ * For 'snapshot', we don't want to have all snapshots always mounted, because
+ * this would take up a huge amount of space in /etc/mnttab. We have three
+ * types of objects:
+ *
+ * ctldir ------> snapshotdir -------> snapshot
+ * |
+ * |
+ * V
+ * mounted fs
+ *
+ * The 'snapshot' node contains just enough information to look up '..' and act
+ * as a mountpoint for the snapshot. Whenever we look up a specific snapshot, we
+ * perform an automount of the underlying filesystem and return the
+ * corresponding vnode.
+ *
+ * All mounts are handled automatically by the kernel, but unmounts are
+ * (currently) handled from user land. The main reason is that there is no
+ * reliable way to auto-unmount the filesystem when it's "no longer in use".
+ * When the user unmounts a filesystem, we call zfsctl_unmount(), which
+ * unmounts any snapshots within the snapshot directory.
+ *
+ * The '.zfs', '.zfs/snapshot', and all directories created under
+ * '.zfs/snapshot' (ie: '.zfs/snapshot/<snapname>') are all GFS nodes and
+ * share the same vfs_t as the head filesystem (what '.zfs' lives under).
+ *
+ * File systems mounted on top of the GFS nodes '.zfs/snapshot/<snapname>'
+ * (ie: snapshots) are ZFS nodes and have their own unique vfs_t.
+ * However, vnodes within these mounted on file systems have their v_vfsp
+ * fields set to the head filesystem to make NFS happy (see
+ * zfsctl_snapdir_lookup()). We VFS_HOLD the head filesystem's vfs_t
+ * so that it cannot be freed until all snapshots have been unmounted.
+ */
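+
+/*
+ * A minimal userland sketch of the behavior described above (illustrative
+ * only; the mountpoint "/tank" is hypothetical).  Listing '.zfs/snapshot'
+ * shows the snapshot names, and looking up a specific name is enough to
+ * trigger the automount:
+ *
+ *	#include <sys/stat.h>
+ *	#include <dirent.h>
+ *	#include <stdio.h>
+ *
+ *	int
+ *	main(void)
+ *	{
+ *		DIR *d = opendir("/tank/.zfs/snapshot");
+ *		struct dirent *de;
+ *		struct stat sb;
+ *		char path[1024];
+ *
+ *		if (d == NULL)
+ *			return (1);
+ *		while ((de = readdir(d)) != NULL) {
+ *			snprintf(path, sizeof(path),
+ *			    "/tank/.zfs/snapshot/%s", de->d_name);
+ *			(void) stat(path, &sb);	// lookup => automount
+ *			printf("%s\n", de->d_name);
+ *		}
+ *		closedir(d);
+ *		return (0);
+ *	}
+ */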
+
+#include <sys/zfs_context.h>
+#include <sys/zfs_ctldir.h>
+#include <sys/zfs_ioctl.h>
+#include <sys/zfs_vfsops.h>
+#include <sys/namei.h>
+#include <sys/stat.h>
+#include <sys/dmu.h>
+#include <sys/dsl_dataset.h>
+#include <sys/dsl_destroy.h>
+#include <sys/dsl_deleg.h>
+#include <sys/mount.h>
+#include <sys/zap.h>
+
+#include "zfs_namecheck.h"
+
+/* Common access mode for all virtual directories under the ctldir */
+const u_short zfsctl_ctldir_mode = S_IRUSR | S_IXUSR | S_IRGRP | S_IXGRP |
+ S_IROTH | S_IXOTH;
+
+/*
+ * "Synthetic" filesystem implementation.
+ */
+
+/*
+ * Assert that A implies B.
+ */
+#define KASSERT_IMPLY(A, B, msg) KASSERT(!(A) || (B), (msg))
+
+static MALLOC_DEFINE(M_SFSNODES, "sfs_nodes", "synthetic-fs nodes");
+
+typedef struct sfs_node {
+ char sn_name[ZFS_MAX_DATASET_NAME_LEN];
+ uint64_t sn_parent_id;
+ uint64_t sn_id;
+} sfs_node_t;
+
+/*
+ * Check the parent's ID as well as the node's to account for a chance
+ * that IDs originating from different domains (snapshot IDs, artificial
+ * IDs, znode IDs) may clash.
+ */
+static int
+sfs_compare_ids(struct vnode *vp, void *arg)
+{
+ sfs_node_t *n1 = vp->v_data;
+ sfs_node_t *n2 = arg;
+ bool equal;
+
+ equal = n1->sn_id == n2->sn_id &&
+ n1->sn_parent_id == n2->sn_parent_id;
+
+ /* Zero means equality. */
+ return (!equal);
+}
+
+static int
+sfs_vnode_get(const struct mount *mp, int flags, uint64_t parent_id,
+ uint64_t id, struct vnode **vpp)
+{
+ sfs_node_t search;
+ int err;
+
+ search.sn_id = id;
+ search.sn_parent_id = parent_id;
+ err = vfs_hash_get(mp, (u_int)id, flags, curthread, vpp,
+ sfs_compare_ids, &search);
+ return (err);
+}
+
+static int
+sfs_vnode_insert(struct vnode *vp, int flags, uint64_t parent_id,
+ uint64_t id, struct vnode **vpp)
+{
+ int err;
+
+ KASSERT(vp->v_data != NULL, ("sfs_vnode_insert with NULL v_data"));
+ err = vfs_hash_insert(vp, (u_int)id, flags, curthread, vpp,
+ sfs_compare_ids, vp->v_data);
+ return (err);
+}
+
+static void
+sfs_vnode_remove(struct vnode *vp)
+{
+ vfs_hash_remove(vp);
+}
+
+typedef void sfs_vnode_setup_fn(vnode_t *vp, void *arg);
+
+static int
+sfs_vgetx(struct mount *mp, int flags, uint64_t parent_id, uint64_t id,
+ const char *tag, struct vop_vector *vops,
+ sfs_vnode_setup_fn setup, void *arg,
+ struct vnode **vpp)
+{
+ struct vnode *vp;
+ int error;
+
+ error = sfs_vnode_get(mp, flags, parent_id, id, vpp);
+ if (error != 0 || *vpp != NULL) {
+ KASSERT_IMPLY(error == 0, (*vpp)->v_data != NULL,
+ "sfs vnode with no data");
+ return (error);
+ }
+
+ /* Allocate a new vnode/inode. */
+ error = getnewvnode(tag, mp, vops, &vp);
+ if (error != 0) {
+ *vpp = NULL;
+ return (error);
+ }
+
+ /*
+ * Exclusively lock the vnode while it's being constructed.
+ */
+ lockmgr(vp->v_vnlock, LK_EXCLUSIVE, NULL);
+ error = insmntque(vp, mp);
+ if (error != 0) {
+ *vpp = NULL;
+ return (error);
+ }
+
+ setup(vp, arg);
+
+ error = sfs_vnode_insert(vp, flags, parent_id, id, vpp);
+ if (error != 0 || *vpp != NULL) {
+ KASSERT_IMPLY(error == 0, (*vpp)->v_data != NULL,
+ "sfs vnode with no data");
+ return (error);
+ }
+
+ *vpp = vp;
+ return (0);
+}
+
+static void
+sfs_print_node(sfs_node_t *node)
+{
+ printf("\tname = %s\n", node->sn_name);
+ printf("\tparent_id = %ju\n", (uintmax_t)node->sn_parent_id);
+ printf("\tid = %ju\n", (uintmax_t)node->sn_id);
+}
+
+static sfs_node_t *
+sfs_alloc_node(size_t size, const char *name, uint64_t parent_id, uint64_t id)
+{
+ struct sfs_node *node;
+
+ KASSERT(strlen(name) < sizeof(node->sn_name),
+ ("sfs node name is too long"));
+ KASSERT(size >= sizeof(*node), ("sfs node size is too small"));
+ node = malloc(size, M_SFSNODES, M_WAITOK | M_ZERO);
+ strlcpy(node->sn_name, name, sizeof(node->sn_name));
+ node->sn_parent_id = parent_id;
+ node->sn_id = id;
+
+ return (node);
+}
+
+static void
+sfs_destroy_node(sfs_node_t *node)
+{
+ free(node, M_SFSNODES);
+}
+
+static void *
+sfs_reclaim_vnode(vnode_t *vp)
+{
+ sfs_node_t *node;
+ void *data;
+
+ sfs_vnode_remove(vp);
+ data = vp->v_data;
+ vp->v_data = NULL;
+ return (data);
+}
+
+static int
+sfs_readdir_common(uint64_t parent_id, uint64_t id, struct vop_readdir_args *ap,
+ uio_t *uio, off_t *offp)
+{
+ struct dirent entry;
+ int error;
+
+ /* Reset ncookies for subsequent use of vfs_read_dirent. */
+ if (ap->a_ncookies != NULL)
+ *ap->a_ncookies = 0;
+
+ if (uio->uio_resid < sizeof(entry))
+ return (SET_ERROR(EINVAL));
+
+ if (uio->uio_offset < 0)
+ return (SET_ERROR(EINVAL));
+ if (uio->uio_offset == 0) {
+ entry.d_fileno = id;
+ entry.d_type = DT_DIR;
+ entry.d_name[0] = '.';
+ entry.d_namlen = 1;
+ entry.d_reclen = sizeof(entry);
+ dirent_terminate(&entry);
+ error = vfs_read_dirent(ap, &entry, uio->uio_offset);
+ if (error != 0)
+ return (SET_ERROR(error));
+ }
+
+ if (uio->uio_offset < sizeof(entry))
+ return (SET_ERROR(EINVAL));
+ if (uio->uio_offset == sizeof(entry)) {
+ entry.d_fileno = parent_id;
+ entry.d_type = DT_DIR;
+ entry.d_name[0] = '.';
+ entry.d_name[1] = '.';
+ entry.d_namlen = 2;
+ entry.d_reclen = sizeof(entry);
+ dirent_terminate(&entry);
+ error = vfs_read_dirent(ap, &entry, uio->uio_offset);
+ if (error != 0)
+ return (SET_ERROR(error));
+ }
+
+ if (offp != NULL)
+ *offp = 2 * sizeof(entry);
+ return (0);
+}
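+
+/*
+ * Worked example of the offset convention above (illustrative): if
+ * sizeof(struct dirent) were 512, then offset 0 would hold ".", offset
+ * 512 would hold "..", and *offp would be set to 1024, the offset at
+ * which a caller's own entries begin.  The concrete size is hypothetical;
+ * only the fixed-record layout matters here.
+ */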
+
+/*
+ * .zfs inode namespace
+ *
+ * We need to generate unique inode numbers for all files and directories
+ * within the .zfs pseudo-filesystem. We use the following scheme:
+ *
+ * ENTRY ZFSCTL_INODE
+ * .zfs 1
+ * .zfs/snapshot 2
+ * .zfs/snapshot/<snap> objectid(snap)
+ */
+#define ZFSCTL_INO_SNAP(id) (id)
+
+static struct vop_vector zfsctl_ops_root;
+static struct vop_vector zfsctl_ops_snapdir;
+static struct vop_vector zfsctl_ops_snapshot;
+static struct vop_vector zfsctl_ops_shares_dir;
+
+void
+zfsctl_init(void)
+{
+}
+
+void
+zfsctl_fini(void)
+{
+}
+
+boolean_t
+zfsctl_is_node(vnode_t *vp)
+{
+ return (vn_matchops(vp, zfsctl_ops_root) ||
+ vn_matchops(vp, zfsctl_ops_snapdir) ||
+ vn_matchops(vp, zfsctl_ops_snapshot) ||
+ vn_matchops(vp, zfsctl_ops_shares_dir));
+}
+
+typedef struct zfsctl_root {
+ sfs_node_t node;
+ sfs_node_t *snapdir;
+ timestruc_t cmtime;
+} zfsctl_root_t;
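+
+/*
+ * sfs_node_t is deliberately the first member of zfsctl_root_t, so a
+ * pointer to one may be treated as a pointer to the other (illustrative):
+ *
+ *	zfsctl_root_t *dot_zfs;
+ *	sfs_node_t *node = (sfs_node_t *)dot_zfs;	// same address
+ *
+ * zfsctl_create() and zfsctl_destroy() below rely on exactly this cast.
+ */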
+
+/*
+ * Create the '.zfs' directory.
+ */
+void
+zfsctl_create(zfsvfs_t *zfsvfs)
+{
+ zfsctl_root_t *dot_zfs;
+ sfs_node_t *snapdir;
+ vnode_t *rvp;
+ uint64_t crtime[2];
+
+ ASSERT(zfsvfs->z_ctldir == NULL);
+
+ snapdir = sfs_alloc_node(sizeof(*snapdir), "snapshot", ZFSCTL_INO_ROOT,
+ ZFSCTL_INO_SNAPDIR);
+ dot_zfs = (zfsctl_root_t *)sfs_alloc_node(sizeof(*dot_zfs), ".zfs", 0,
+ ZFSCTL_INO_ROOT);
+ dot_zfs->snapdir = snapdir;
+
+ VERIFY(VFS_ROOT(zfsvfs->z_vfs, LK_EXCLUSIVE, &rvp) == 0);
+ VERIFY(0 == sa_lookup(VTOZ(rvp)->z_sa_hdl, SA_ZPL_CRTIME(zfsvfs),
+ &crtime, sizeof(crtime)));
+ ZFS_TIME_DECODE(&dot_zfs->cmtime, crtime);
+ vput(rvp);
+
+ zfsvfs->z_ctldir = dot_zfs;
+}
+
+/*
+ * Destroy the '.zfs' directory. Only called when the filesystem is unmounted.
+ * The nodes must not have any associated vnodes by now as they should be
+ * vflush-ed.
+ */
+void
+zfsctl_destroy(zfsvfs_t *zfsvfs)
+{
+ sfs_destroy_node(zfsvfs->z_ctldir->snapdir);
+ sfs_destroy_node((sfs_node_t *)zfsvfs->z_ctldir);
+ zfsvfs->z_ctldir = NULL;
+}
+
+static int
+zfsctl_fs_root_vnode(struct mount *mp, void *arg __unused, int flags,
+ struct vnode **vpp)
+{
+ return (VFS_ROOT(mp, flags, vpp));
+}
+
+static void
+zfsctl_common_vnode_setup(vnode_t *vp, void *arg)
+{
+ ASSERT_VOP_ELOCKED(vp, __func__);
+
+ /* We support shared locking. */
+ VN_LOCK_ASHARE(vp);
+ vp->v_type = VDIR;
+ vp->v_data = arg;
+}
+
+static int
+zfsctl_root_vnode(struct mount *mp, void *arg __unused, int flags,
+ struct vnode **vpp)
+{
+ void *node;
+ int err;
+
+ node = ((zfsvfs_t*)mp->mnt_data)->z_ctldir;
+ err = sfs_vgetx(mp, flags, 0, ZFSCTL_INO_ROOT, "zfs", &zfsctl_ops_root,
+ zfsctl_common_vnode_setup, node, vpp);
+ return (err);
+}
+
+static int
+zfsctl_snapdir_vnode(struct mount *mp, void *arg __unused, int flags,
+ struct vnode **vpp)
+{
+ void *node;
+ int err;
+
+ node = ((zfsvfs_t*)mp->mnt_data)->z_ctldir->snapdir;
+ err = sfs_vgetx(mp, flags, ZFSCTL_INO_ROOT, ZFSCTL_INO_SNAPDIR, "zfs",
+ &zfsctl_ops_snapdir, zfsctl_common_vnode_setup, node, vpp);
+ return (err);
+}
+
+/*
+ * Given a root znode, retrieve the associated .zfs directory.
+ * Add a hold to the vnode and return it.
+ */
+int
+zfsctl_root(zfsvfs_t *zfsvfs, int flags, vnode_t **vpp)
+{
+ int error;
+
+ error = zfsctl_root_vnode(zfsvfs->z_vfs, NULL, flags, vpp);
+ return (error);
+}
+
+/*
+ * Common open routine. Disallow any write access.
+ */
+static int
+zfsctl_common_open(struct vop_open_args *ap)
+{
+ int flags = ap->a_mode;
+
+ if (flags & FWRITE)
+ return (SET_ERROR(EACCES));
+
+ return (0);
+}
+
+/*
+ * Common close routine. Nothing to do here.
+ */
+/* ARGSUSED */
+static int
+zfsctl_common_close(struct vop_close_args *ap)
+{
+ return (0);
+}
+
+/*
+ * Common access routine. Disallow writes.
+ */
+static int
+zfsctl_common_access(ap)
+ struct vop_access_args /* {
+ struct vnode *a_vp;
+ accmode_t a_accmode;
+ struct ucred *a_cred;
+ struct thread *a_td;
+ } */ *ap;
+{
+ accmode_t accmode = ap->a_accmode;
+
+ if (accmode & VWRITE)
+ return (SET_ERROR(EACCES));
+ return (0);
+}
+
+/*
+ * Common getattr function. Fill in basic information.
+ */
+static void
+zfsctl_common_getattr(vnode_t *vp, vattr_t *vap)
+{
+ timestruc_t now;
+ sfs_node_t *node;
+
+ node = vp->v_data;
+
+ vap->va_uid = 0;
+ vap->va_gid = 0;
+ vap->va_rdev = 0;
+ /*
+ * We are a purely virtual object, so we have no
+ * blocksize or allocated blocks.
+ */
+ vap->va_blksize = 0;
+ vap->va_nblocks = 0;
+ vap->va_seq = 0;
+ vn_fsid(vp, vap);
+ vap->va_mode = zfsctl_ctldir_mode;
+ vap->va_type = VDIR;
+ /*
+ * We live in the now (for atime).
+ */
+ gethrestime(&now);
+ vap->va_atime = now;
+ /* FreeBSD: Reset chflags(2) flags. */
+ vap->va_flags = 0;
+
+ vap->va_nodeid = node->sn_id;
+
+ /* At least '.' and '..'. */
+ vap->va_nlink = 2;
+}
+
+static int
+zfsctl_common_fid(ap)
+ struct vop_fid_args /* {
+ struct vnode *a_vp;
+ struct fid *a_fid;
+ } */ *ap;
+{
+ vnode_t *vp = ap->a_vp;
+ fid_t *fidp = (void *)ap->a_fid;
+ sfs_node_t *node = vp->v_data;
+ uint64_t object = node->sn_id;
+ zfid_short_t *zfid;
+ int i;
+
+ zfid = (zfid_short_t *)fidp;
+ zfid->zf_len = SHORT_FID_LEN;
+
+ for (i = 0; i < sizeof(zfid->zf_object); i++)
+ zfid->zf_object[i] = (uint8_t)(object >> (8 * i));
+
+ /* .zfs nodes always have a generation number of 0 */
+ for (i = 0; i < sizeof(zfid->zf_gen); i++)
+ zfid->zf_gen[i] = 0;
+
+ return (0);
+}
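+
+/*
+ * Worked example (illustrative): for node id 0x1234 the loop above stores
+ * the low-order byte first, so the zf_object bytes become
+ *
+ *	{ 0x34, 0x12, 0x00, 0x00, 0x00, 0x00 }
+ *
+ * i.e. the id is encoded little-endian regardless of host byte order.
+ */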
+
+static int
+zfsctl_common_reclaim(ap)
+ struct vop_reclaim_args /* {
+ struct vnode *a_vp;
+ struct thread *a_td;
+ } */ *ap;
+{
+ vnode_t *vp = ap->a_vp;
+
+ (void) sfs_reclaim_vnode(vp);
+ return (0);
+}
+
+static int
+zfsctl_common_print(ap)
+ struct vop_print_args /* {
+ struct vnode *a_vp;
+ } */ *ap;
+{
+ sfs_print_node(ap->a_vp->v_data);
+ return (0);
+}
+
+/*
+ * Get root directory attributes.
+ */
+static int
+zfsctl_root_getattr(ap)
+ struct vop_getattr_args /* {
+ struct vnode *a_vp;
+ struct vattr *a_vap;
+ struct ucred *a_cred;
+ } */ *ap;
+{
+ struct vnode *vp = ap->a_vp;
+ struct vattr *vap = ap->a_vap;
+ zfsctl_root_t *node = vp->v_data;
+
+ zfsctl_common_getattr(vp, vap);
+ vap->va_ctime = node->cmtime;
+ vap->va_mtime = vap->va_ctime;
+ vap->va_birthtime = vap->va_ctime;
+ vap->va_nlink += 1; /* snapdir */
+ vap->va_size = vap->va_nlink;
+ return (0);
+}
+
+/*
+ * When we look up ".", we can still be asked to lock it
+ * differently, can't we?
+ */
+int
+zfsctl_relock_dot(vnode_t *dvp, int ltype)
+{
+ vref(dvp);
+ if (ltype != VOP_ISLOCKED(dvp)) {
+ if (ltype == LK_EXCLUSIVE)
+ vn_lock(dvp, LK_UPGRADE | LK_RETRY);
+ else /* if (ltype == LK_SHARED) */
+ vn_lock(dvp, LK_DOWNGRADE | LK_RETRY);
+
+ /* Relocking "." may have left us with a reclaimed vnode. */
+ if (VN_IS_DOOMED(dvp)) {
+ vrele(dvp);
+ return (SET_ERROR(ENOENT));
+ }
+ }
+ return (0);
+}
+
+/*
+ * Special case the handling of "..".
+ */
+int
+zfsctl_root_lookup(ap)
+ struct vop_lookup_args /* {
+ struct vnode *a_dvp;
+ struct vnode **a_vpp;
+ struct componentname *a_cnp;
+ } */ *ap;
+{
+ struct componentname *cnp = ap->a_cnp;
+ vnode_t *dvp = ap->a_dvp;
+ vnode_t **vpp = ap->a_vpp;
+ cred_t *cr = ap->a_cnp->cn_cred;
+ int flags = ap->a_cnp->cn_flags;
+ int lkflags = ap->a_cnp->cn_lkflags;
+ int nameiop = ap->a_cnp->cn_nameiop;
+ int err;
+ int ltype;
+
+ ASSERT(dvp->v_type == VDIR);
+
+ if ((flags & ISLASTCN) != 0 && nameiop != LOOKUP)
+ return (SET_ERROR(ENOTSUP));
+
+ if (cnp->cn_namelen == 1 && *cnp->cn_nameptr == '.') {
+ err = zfsctl_relock_dot(dvp, lkflags & LK_TYPE_MASK);
+ if (err == 0)
+ *vpp = dvp;
+ } else if ((flags & ISDOTDOT) != 0) {
+ err = vn_vget_ino_gen(dvp, zfsctl_fs_root_vnode, NULL,
+ lkflags, vpp);
+ } else if (strncmp(cnp->cn_nameptr, "snapshot", cnp->cn_namelen) == 0) {
+ err = zfsctl_snapdir_vnode(dvp->v_mount, NULL, lkflags, vpp);
+ } else {
+ err = SET_ERROR(ENOENT);
+ }
+ if (err != 0)
+ *vpp = NULL;
+ return (err);
+}
+
+static int
+zfsctl_root_readdir(ap)
+ struct vop_readdir_args /* {
+ struct vnode *a_vp;
+ struct uio *a_uio;
+ struct ucred *a_cred;
+ int *a_eofflag;
+ int *ncookies;
+ u_long **a_cookies;
+ } */ *ap;
+{
+ struct dirent entry;
+ vnode_t *vp = ap->a_vp;
+ zfsvfs_t *zfsvfs = vp->v_vfsp->vfs_data;
+ zfsctl_root_t *node = vp->v_data;
+ uio_t *uio = ap->a_uio;
+ int *eofp = ap->a_eofflag;
+ off_t dots_offset;
+ int error;
+
+ ASSERT(vp->v_type == VDIR);
+
+ error = sfs_readdir_common(zfsvfs->z_root, ZFSCTL_INO_ROOT, ap, uio,
+ &dots_offset);
+ if (error != 0) {
+ if (error == ENAMETOOLONG) /* ran out of destination space */
+ error = 0;
+ return (error);
+ }
+ if (uio->uio_offset != dots_offset)
+ return (SET_ERROR(EINVAL));
+
+ CTASSERT(sizeof(node->snapdir->sn_name) <= sizeof(entry.d_name));
+ entry.d_fileno = node->snapdir->sn_id;
+ entry.d_type = DT_DIR;
+ strcpy(entry.d_name, node->snapdir->sn_name);
+ entry.d_namlen = strlen(entry.d_name);
+ entry.d_reclen = sizeof(entry);
+ dirent_terminate(&entry);
+ error = vfs_read_dirent(ap, &entry, uio->uio_offset);
+ if (error != 0) {
+ if (error == ENAMETOOLONG)
+ error = 0;
+ return (SET_ERROR(error));
+ }
+ if (eofp != NULL)
+ *eofp = 1;
+ return (0);
+}
+
+static int
+zfsctl_root_vptocnp(struct vop_vptocnp_args *ap)
+{
+ static const char dotzfs_name[4] = ".zfs";
+ vnode_t *dvp;
+ int error;
+
+ if (*ap->a_buflen < sizeof (dotzfs_name))
+ return (SET_ERROR(ENOMEM));
+
+ error = vn_vget_ino_gen(ap->a_vp, zfsctl_fs_root_vnode, NULL,
+ LK_SHARED, &dvp);
+ if (error != 0)
+ return (SET_ERROR(error));
+
+ VOP_UNLOCK(dvp);
+ *ap->a_vpp = dvp;
+ *ap->a_buflen -= sizeof (dotzfs_name);
+ bcopy(dotzfs_name, ap->a_buf + *ap->a_buflen, sizeof (dotzfs_name));
+ return (0);
+}
+
+static int
+zfsctl_common_pathconf(ap)
+ struct vop_pathconf_args /* {
+ struct vnode *a_vp;
+ int a_name;
+ int *a_retval;
+ } */ *ap;
+{
+ /*
+ * We care about ACL variables so that user land utilities like ls
+ * can display them correctly. Since the ctldir's st_dev is set to be
+ * the same as the parent dataset, we must support all variables that
+ * it supports.
+ */
+ switch (ap->a_name) {
+ case _PC_LINK_MAX:
+ *ap->a_retval = MIN(LONG_MAX, ZFS_LINK_MAX);
+ return (0);
+
+ case _PC_FILESIZEBITS:
+ *ap->a_retval = 64;
+ return (0);
+
+ case _PC_MIN_HOLE_SIZE:
+ *ap->a_retval = (int)SPA_MINBLOCKSIZE;
+ return (0);
+
+ case _PC_ACL_NFS4:
+ *ap->a_retval = 1;
+ return (0);
+
+ case _PC_ACL_PATH_MAX:
+ *ap->a_retval = ACL_MAX_ENTRIES;
+ return (0);
+
+ case _PC_NAME_MAX:
+ *ap->a_retval = NAME_MAX;
+ return (0);
+
+ default:
+ return (vop_stdpathconf(ap));
+ }
+}
+
+/*
+ * Returns a trivial ACL.
+ */
+int
+zfsctl_common_getacl(ap)
+ struct vop_getacl_args /* {
+ struct vnode *vp;
+ acl_type_t a_type;
+ struct acl *a_aclp;
+ struct ucred *cred;
+ struct thread *td;
+ } */ *ap;
+{
+ int i;
+
+ if (ap->a_type != ACL_TYPE_NFS4)
+ return (EINVAL);
+
+ acl_nfs4_sync_acl_from_mode(ap->a_aclp, zfsctl_ctldir_mode, 0);
+ /*
+ * acl_nfs4_sync_acl_from_mode assumes that the owner can always modify
+ * attributes. That is not the case for the ctldir, so we must clear
+ * those bits. We also must clear ACL_READ_NAMED_ATTRS, because xattrs
+ * aren't supported by the ctldir.
+ */
+ for (i = 0; i < ap->a_aclp->acl_cnt; i++) {
+ struct acl_entry *entry;
+
+ entry = &(ap->a_aclp->acl_entry[i]);
+ entry->ae_perm &= ~(ACL_WRITE_ACL | ACL_WRITE_OWNER |
+ ACL_WRITE_ATTRIBUTES | ACL_WRITE_NAMED_ATTRS |
+ ACL_READ_NAMED_ATTRS);
+ }
+
+ return (0);
+}
+
+static struct vop_vector zfsctl_ops_root = {
+ .vop_default = &default_vnodeops,
+ .vop_open = zfsctl_common_open,
+ .vop_close = zfsctl_common_close,
+ .vop_ioctl = VOP_EINVAL,
+ .vop_getattr = zfsctl_root_getattr,
+ .vop_access = zfsctl_common_access,
+ .vop_readdir = zfsctl_root_readdir,
+ .vop_lookup = zfsctl_root_lookup,
+ .vop_inactive = VOP_NULL,
+ .vop_reclaim = zfsctl_common_reclaim,
+ .vop_fid = zfsctl_common_fid,
+ .vop_print = zfsctl_common_print,
+ .vop_vptocnp = zfsctl_root_vptocnp,
+ .vop_pathconf = zfsctl_common_pathconf,
+ .vop_getacl = zfsctl_common_getacl,
+};
+VFS_VOP_VECTOR_REGISTER(zfsctl_ops_root);
+
+static int
+zfsctl_snapshot_zname(vnode_t *vp, const char *name, int len, char *zname)
+{
+ objset_t *os = ((zfsvfs_t *)((vp)->v_vfsp->vfs_data))->z_os;
+
+ dmu_objset_name(os, zname);
+ if (strlen(zname) + 1 + strlen(name) >= len)
+ return (SET_ERROR(ENAMETOOLONG));
+ (void) strcat(zname, "@");
+ (void) strcat(zname, name);
+ return (0);
+}
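+
+/*
+ * Example (illustrative): for a dataset named "tank/home" (hypothetical)
+ * and name "monday", zfsctl_snapshot_zname() yields "tank/home@monday",
+ * or ENAMETOOLONG if the result does not fit in the len-byte buffer.
+ */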
+
+static int
+zfsctl_snapshot_lookup(vnode_t *vp, const char *name, uint64_t *id)
+{
+ objset_t *os = ((zfsvfs_t *)((vp)->v_vfsp->vfs_data))->z_os;
+ int err;
+
+ err = dsl_dataset_snap_lookup(dmu_objset_ds(os), name, id);
+ return (err);
+}
+
+/*
+ * Given a vnode get a root vnode of a filesystem mounted on top of
+ * the vnode, if any. The root vnode is referenced and locked.
+ * If no filesystem is mounted, then the original vnode remains referenced
+ * and locked. If any error happens, the original vnode is unlocked and
+ * released.
+ */
+static int
+zfsctl_mounted_here(vnode_t **vpp, int flags)
+{
+ struct mount *mp;
+ int err;
+
+ ASSERT_VOP_LOCKED(*vpp, __func__);
+ ASSERT3S((*vpp)->v_type, ==, VDIR);
+
+ if ((mp = (*vpp)->v_mountedhere) != NULL) {
+ err = vfs_busy(mp, 0);
+ KASSERT(err == 0, ("vfs_busy(mp, 0) failed with %d", err));
+ KASSERT(vrefcnt(*vpp) > 1, ("unreferenced mountpoint"));
+ vput(*vpp);
+ err = VFS_ROOT(mp, flags, vpp);
+ vfs_unbusy(mp);
+ return (err);
+ }
+ return (EJUSTRETURN);
+}
+
+typedef struct {
+ const char *snap_name;
+ uint64_t snap_id;
+} snapshot_setup_arg_t;
+
+static void
+zfsctl_snapshot_vnode_setup(vnode_t *vp, void *arg)
+{
+ snapshot_setup_arg_t *ssa = arg;
+ sfs_node_t *node;
+
+ ASSERT_VOP_ELOCKED(vp, __func__);
+
+ node = sfs_alloc_node(sizeof(sfs_node_t),
+ ssa->snap_name, ZFSCTL_INO_SNAPDIR, ssa->snap_id);
+ zfsctl_common_vnode_setup(vp, node);
+
+ /* We have to support recursive locking. */
+ VN_LOCK_AREC(vp);
+}
+
+/*
+ * Lookup entry point for the 'snapshot' directory. Try to open the
+ * snapshot if it exists, creating the pseudo filesystem vnode as necessary.
+ * Perform a mount of the associated dataset on top of the vnode.
+ * There are four possibilities:
+ * - the snapshot node and vnode do not exist
+ * - the snapshot vnode is covered by the mounted snapshot
+ * - the snapshot vnode is not covered yet, the mount operation is in progress
+ * - the snapshot vnode is not covered, because the snapshot has been unmounted
+ * The last two states are transient and should be relatively short-lived.
+ */
+int
+zfsctl_snapdir_lookup(ap)
+ struct vop_lookup_args /* {
+ struct vnode *a_dvp;
+ struct vnode **a_vpp;
+ struct componentname *a_cnp;
+ } */ *ap;
+{
+ vnode_t *dvp = ap->a_dvp;
+ vnode_t **vpp = ap->a_vpp;
+ struct componentname *cnp = ap->a_cnp;
+ char name[NAME_MAX + 1];
+ char fullname[ZFS_MAX_DATASET_NAME_LEN];
+ char *mountpoint;
+ size_t mountpoint_len;
+ zfsvfs_t *zfsvfs = dvp->v_vfsp->vfs_data;
+ uint64_t snap_id;
+ int nameiop = cnp->cn_nameiop;
+ int lkflags = cnp->cn_lkflags;
+ int flags = cnp->cn_flags;
+ int err;
+
+ ASSERT(dvp->v_type == VDIR);
+
+ if ((flags & ISLASTCN) != 0 && nameiop != LOOKUP)
+ return (SET_ERROR(ENOTSUP));
+
+ if (cnp->cn_namelen == 1 && *cnp->cn_nameptr == '.') {
+ err = zfsctl_relock_dot(dvp, lkflags & LK_TYPE_MASK);
+ if (err == 0)
+ *vpp = dvp;
+ return (err);
+ }
+ if (flags & ISDOTDOT) {
+ err = vn_vget_ino_gen(dvp, zfsctl_root_vnode, NULL, lkflags,
+ vpp);
+ return (err);
+ }
+
+ if (cnp->cn_namelen >= sizeof(name))
+ return (SET_ERROR(ENAMETOOLONG));
+
+ strlcpy(name, cnp->cn_nameptr, cnp->cn_namelen + 1);
+ err = zfsctl_snapshot_lookup(dvp, name, &snap_id);
+ if (err != 0)
+ return (SET_ERROR(ENOENT));
+
+ for (;;) {
+ snapshot_setup_arg_t ssa;
+
+ ssa.snap_name = name;
+ ssa.snap_id = snap_id;
+ err = sfs_vgetx(dvp->v_mount, LK_SHARED, ZFSCTL_INO_SNAPDIR,
+ snap_id, "zfs", &zfsctl_ops_snapshot,
+ zfsctl_snapshot_vnode_setup, &ssa, vpp);
+ if (err != 0)
+ return (err);
+
+ /* Check if a new vnode has just been created. */
+ if (VOP_ISLOCKED(*vpp) == LK_EXCLUSIVE)
+ break;
+
+ /*
+ * Check if a snapshot is already mounted on top of the vnode.
+ */
+ err = zfsctl_mounted_here(vpp, lkflags);
+ if (err != EJUSTRETURN)
+ return (err);
+
+ /*
+ * If the vnode is not covered, then either the mount operation
+ * is in progress or the snapshot has already been unmounted
+ * but the vnode hasn't been inactivated and reclaimed yet.
+ * We can try to re-use the vnode in the latter case.
+ */
+ VI_LOCK(*vpp);
+ if (((*vpp)->v_iflag & VI_MOUNT) == 0) {
+ /*
+ * Upgrade to exclusive lock in order to:
+ * - avoid race conditions
+ * - satisfy the contract of mount_snapshot()
+ */
+ err = VOP_LOCK(*vpp, LK_TRYUPGRADE | LK_INTERLOCK);
+ if (err == 0)
+ break;
+ } else {
+ VI_UNLOCK(*vpp);
+ }
+
+ /*
+ * In this state we can loop on uncontested locks and starve
+ * the thread doing the lengthy, non-trivial mount operation.
+ * So, yield to prevent that from happening.
+ */
+ vput(*vpp);
+ kern_yield(PRI_USER);
+ }
+
+ VERIFY0(zfsctl_snapshot_zname(dvp, name, sizeof(fullname), fullname));
+
+ mountpoint_len = strlen(dvp->v_vfsp->mnt_stat.f_mntonname) +
+ strlen("/" ZFS_CTLDIR_NAME "/snapshot/") + strlen(name) + 1;
+ mountpoint = kmem_alloc(mountpoint_len, KM_SLEEP);
+ (void) snprintf(mountpoint, mountpoint_len,
+ "%s/" ZFS_CTLDIR_NAME "/snapshot/%s",
+ dvp->v_vfsp->mnt_stat.f_mntonname, name);
+
+ err = mount_snapshot(curthread, vpp, "zfs", mountpoint, fullname, 0);
+ kmem_free(mountpoint, mountpoint_len);
+ if (err == 0) {
+ /*
+ * Fix up the root vnode mounted on .zfs/snapshot/<snapname>.
+ *
+ * This is where we lie about our v_vfsp in order to
+ * make .zfs/snapshot/<snapname> accessible over NFS
+ * without requiring manual mounts of <snapname>.
+ */
+ ASSERT(VTOZ(*vpp)->z_zfsvfs != zfsvfs);
+ VTOZ(*vpp)->z_zfsvfs->z_parent = zfsvfs;
+
+ /* Clear the root flag (set via VFS_ROOT) as well. */
+ (*vpp)->v_vflag &= ~VV_ROOT;
+ }
+
+ if (err != 0)
+ *vpp = NULL;
+ return (err);
+}
+
+static int
+zfsctl_snapdir_readdir(ap)
+ struct vop_readdir_args /* {
+ struct vnode *a_vp;
+ struct uio *a_uio;
+ struct ucred *a_cred;
+ int *a_eofflag;
+ int *ncookies;
+ u_long **a_cookies;
+ } */ *ap;
+{
+ char snapname[ZFS_MAX_DATASET_NAME_LEN];
+ struct dirent entry;
+ vnode_t *vp = ap->a_vp;
+ zfsvfs_t *zfsvfs = vp->v_vfsp->vfs_data;
+ uio_t *uio = ap->a_uio;
+ int *eofp = ap->a_eofflag;
+ off_t dots_offset;
+ int error;
+
+ ASSERT(vp->v_type == VDIR);
+
+ error = sfs_readdir_common(ZFSCTL_INO_ROOT, ZFSCTL_INO_SNAPDIR, ap, uio,
+ &dots_offset);
+ if (error != 0) {
+ if (error == ENAMETOOLONG) /* ran out of destination space */
+ error = 0;
+ return (error);
+ }
+
+ ZFS_ENTER(zfsvfs);
+ for (;;) {
+ uint64_t cookie;
+ uint64_t id;
+
+ cookie = uio->uio_offset - dots_offset;
+
+ dsl_pool_config_enter(dmu_objset_pool(zfsvfs->z_os), FTAG);
+ error = dmu_snapshot_list_next(zfsvfs->z_os, sizeof(snapname),
+ snapname, &id, &cookie, NULL);
+ dsl_pool_config_exit(dmu_objset_pool(zfsvfs->z_os), FTAG);
+ if (error != 0) {
+ if (error == ENOENT) {
+ if (eofp != NULL)
+ *eofp = 1;
+ error = 0;
+ }
+ ZFS_EXIT(zfsvfs);
+ return (error);
+ }
+
+ entry.d_fileno = id;
+ entry.d_type = DT_DIR;
+ strcpy(entry.d_name, snapname);
+ entry.d_namlen = strlen(entry.d_name);
+ entry.d_reclen = sizeof(entry);
+ /* NOTE: d_off is the offset for the *next* entry. */
+ entry.d_off = cookie + dots_offset;
+ dirent_terminate(&entry);
+ error = vfs_read_dirent(ap, &entry, uio->uio_offset);
+ if (error != 0) {
+ if (error == ENAMETOOLONG)
+ error = 0;
+ ZFS_EXIT(zfsvfs);
+ return (SET_ERROR(error));
+ }
+ uio->uio_offset = cookie + dots_offset;
+ }
+ /* NOTREACHED */
+}
+
+static int
+zfsctl_snapdir_getattr(ap)
+ struct vop_getattr_args /* {
+ struct vnode *a_vp;
+ struct vattr *a_vap;
+ struct ucred *a_cred;
+ } */ *ap;
+{
+ vnode_t *vp = ap->a_vp;
+ vattr_t *vap = ap->a_vap;
+ zfsvfs_t *zfsvfs = vp->v_vfsp->vfs_data;
+ dsl_dataset_t *ds;
+ sfs_node_t *node = vp->v_data;
+ uint64_t snap_count;
+ int err;
+
+ ZFS_ENTER(zfsvfs);
+ ds = dmu_objset_ds(zfsvfs->z_os);
+ zfsctl_common_getattr(vp, vap);
+ vap->va_ctime = dmu_objset_snap_cmtime(zfsvfs->z_os);
+ vap->va_mtime = vap->va_ctime;
+ vap->va_birthtime = vap->va_ctime;
+ if (dsl_dataset_phys(ds)->ds_snapnames_zapobj != 0) {
+ err = zap_count(dmu_objset_pool(ds->ds_objset)->dp_meta_objset,
+ dsl_dataset_phys(ds)->ds_snapnames_zapobj, &snap_count);
+ if (err != 0) {
+ ZFS_EXIT(zfsvfs);
+ return (err);
+ }
+ vap->va_nlink += snap_count;
+ }
+ vap->va_size = vap->va_nlink;
+
+ ZFS_EXIT(zfsvfs);
+ return (0);
+}
+
+static struct vop_vector zfsctl_ops_snapdir = {
+ .vop_default = &default_vnodeops,
+ .vop_open = zfsctl_common_open,
+ .vop_close = zfsctl_common_close,
+ .vop_getattr = zfsctl_snapdir_getattr,
+ .vop_access = zfsctl_common_access,
+ .vop_readdir = zfsctl_snapdir_readdir,
+ .vop_lookup = zfsctl_snapdir_lookup,
+ .vop_reclaim = zfsctl_common_reclaim,
+ .vop_fid = zfsctl_common_fid,
+ .vop_print = zfsctl_common_print,
+ .vop_pathconf = zfsctl_common_pathconf,
+ .vop_getacl = zfsctl_common_getacl,
+};
+VFS_VOP_VECTOR_REGISTER(zfsctl_ops_snapdir);
+
+static int
+zfsctl_snapshot_inactive(ap)
+ struct vop_inactive_args /* {
+ struct vnode *a_vp;
+ struct thread *a_td;
+ } */ *ap;
+{
+ vnode_t *vp = ap->a_vp;
+
+ VERIFY(vrecycle(vp) == 1);
+ return (0);
+}
+
+static int
+zfsctl_snapshot_reclaim(ap)
+ struct vop_reclaim_args /* {
+ struct vnode *a_vp;
+ struct thread *a_td;
+ } */ *ap;
+{
+ vnode_t *vp = ap->a_vp;
+ void *data = vp->v_data;
+
+ sfs_reclaim_vnode(vp);
+ sfs_destroy_node(data);
+ return (0);
+}
+
+static int
+zfsctl_snapshot_vptocnp(struct vop_vptocnp_args *ap)
+{
+ struct mount *mp;
+ vnode_t *dvp;
+ vnode_t *vp;
+ sfs_node_t *node;
+ size_t len;
+ enum vgetstate vs;
+ int locked;
+ int error;
+
+ vp = ap->a_vp;
+ node = vp->v_data;
+ len = strlen(node->sn_name);
+ if (*ap->a_buflen < len)
+ return (SET_ERROR(ENOMEM));
+
+ /*
+ * Prevent unmounting of the snapshot while the vnode lock
+ * is not held. That is not strictly required, but allows
+ * us to assert that an uncovered snapshot vnode is never
+ * "leaked".
+ */
+ mp = vp->v_mountedhere;
+ if (mp == NULL)
+ return (SET_ERROR(ENOENT));
+ error = vfs_busy(mp, 0);
+ KASSERT(error == 0, ("vfs_busy(mp, 0) failed with %d", error));
+
+ /*
+ * We can vput the vnode as we can now depend on the reference owned
+ * by the busied mp. But we also need to hold the vnode, because
+ * the reference may go away after vfs_unbusy(), which has to be called
+ * before we can lock the vnode again.
+ */
+ locked = VOP_ISLOCKED(vp);
+ vs = vget_prep(vp);
+ vput(vp);
+
+ /* Look up .zfs/snapshot, our parent. */
+ error = zfsctl_snapdir_vnode(vp->v_mount, NULL, LK_SHARED, &dvp);
+ if (error == 0) {
+ VOP_UNLOCK(dvp);
+ *ap->a_vpp = dvp;
+ *ap->a_buflen -= len;
+ bcopy(node->sn_name, ap->a_buf + *ap->a_buflen, len);
+ }
+ vfs_unbusy(mp);
+ vget_finish(vp, locked | LK_RETRY, vs);
+ return (error);
+}
+
+/*
+ * These VP's should never see the light of day. They should always
+ * be covered.
+ */
+static struct vop_vector zfsctl_ops_snapshot = {
+ .vop_default = NULL, /* ensure very restricted access */
+ .vop_inactive = zfsctl_snapshot_inactive,
+ .vop_need_inactive = vop_stdneed_inactive,
+ .vop_reclaim = zfsctl_snapshot_reclaim,
+ .vop_vptocnp = zfsctl_snapshot_vptocnp,
+ .vop_lock1 = vop_stdlock,
+ .vop_unlock = vop_stdunlock,
+ .vop_islocked = vop_stdislocked,
+ .vop_advlockpurge = vop_stdadvlockpurge, /* called by vgone */
+ .vop_print = zfsctl_common_print,
+};
+VFS_VOP_VECTOR_REGISTER(zfsctl_ops_snapshot);
+
+int
+zfsctl_lookup_objset(vfs_t *vfsp, uint64_t objsetid, zfsvfs_t **zfsvfsp)
+{
+ struct mount *mp;
+ zfsvfs_t *zfsvfs = vfsp->vfs_data;
+ vnode_t *vp;
+ int error;
+
+ ASSERT(zfsvfs->z_ctldir != NULL);
+ *zfsvfsp = NULL;
+ error = sfs_vnode_get(vfsp, LK_EXCLUSIVE,
+ ZFSCTL_INO_SNAPDIR, objsetid, &vp);
+ if (error == 0 && vp != NULL) {
+ /*
+ * XXX Probably need to at least reference, if not busy, the mp.
+ */
+ if (vp->v_mountedhere != NULL)
+ *zfsvfsp = vp->v_mountedhere->mnt_data;
+ vput(vp);
+ }
+ if (*zfsvfsp == NULL)
+ return (SET_ERROR(EINVAL));
+ return (0);
+}
+
+/*
+ * Unmount any snapshots for the given filesystem. This is called from
+ * zfs_umount() - if we have a ctldir, then go through and unmount all the
+ * snapshots.
+ */
+int
+zfsctl_umount_snapshots(vfs_t *vfsp, int fflags, cred_t *cr)
+{
+ char snapname[ZFS_MAX_DATASET_NAME_LEN];
+ zfsvfs_t *zfsvfs = vfsp->vfs_data;
+ struct mount *mp;
+ vnode_t *vp;
+ uint64_t cookie;
+ int error;
+
+ ASSERT(zfsvfs->z_ctldir != NULL);
+
+ cookie = 0;
+ for (;;) {
+ uint64_t id;
+
+ dsl_pool_config_enter(dmu_objset_pool(zfsvfs->z_os), FTAG);
+ error = dmu_snapshot_list_next(zfsvfs->z_os, sizeof(snapname),
+ snapname, &id, &cookie, NULL);
+ dsl_pool_config_exit(dmu_objset_pool(zfsvfs->z_os), FTAG);
+ if (error != 0) {
+ if (error == ENOENT)
+ error = 0;
+ break;
+ }
+
+ for (;;) {
+ error = sfs_vnode_get(vfsp, LK_EXCLUSIVE,
+ ZFSCTL_INO_SNAPDIR, id, &vp);
+ if (error != 0 || vp == NULL)
+ break;
+
+ mp = vp->v_mountedhere;
+
+ /*
+ * v_mountedhere being NULL means that the
+ * (uncovered) vnode is in a transient state
+ * (mounting or unmounting), so loop until it
+ * settles down.
+ */
+ if (mp != NULL)
+ break;
+ vput(vp);
+ }
+ if (error != 0)
+ break;
+ if (vp == NULL)
+ continue; /* no mountpoint, nothing to do */
+
+ /*
+ * The mount-point vnode is kept locked to avoid spurious EBUSY
+ * from a concurrent umount.
+ * The vnode lock must have recursive locking enabled.
+ */
+ vfs_ref(mp);
+ error = dounmount(mp, fflags, curthread);
+ KASSERT_IMPLY(error == 0, vrefcnt(vp) == 1,
+ ("extra references after unmount"));
+ vput(vp);
+ if (error != 0)
+ break;
+ }
+ KASSERT_IMPLY((fflags & MS_FORCE) != 0, error == 0,
+ ("force unmounting failed"));
+ return (error);
+}
+
diff --git a/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/zfs_debug.c b/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/zfs_debug.c
new file mode 100644
index 000000000000..a9cbe4dfe392
--- /dev/null
+++ b/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/zfs_debug.c
@@ -0,0 +1,112 @@
+/*
+ * CDDL HEADER START
+ *
+ * The contents of this file are subject to the terms of the
+ * Common Development and Distribution License (the "License").
+ * You may not use this file except in compliance with the License.
+ *
+ * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
+ * or http://www.opensolaris.org/os/licensing.
+ * See the License for the specific language governing permissions
+ * and limitations under the License.
+ *
+ * When distributing Covered Code, include this CDDL HEADER in each
+ * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
+ * If applicable, add the following below this CDDL HEADER, with the
+ * fields enclosed by brackets "[]" replaced with your own identifying
+ * information: Portions Copyright [yyyy] [name of copyright owner]
+ *
+ * CDDL HEADER END
+ */
+/*
+ * Copyright (c) 2010, Oracle and/or its affiliates. All rights reserved.
+ * Copyright (c) 2012, 2014 by Delphix. All rights reserved.
+ */
+
+#include <sys/zfs_context.h>
+
+list_t zfs_dbgmsgs;
+int zfs_dbgmsg_size;
+kmutex_t zfs_dbgmsgs_lock;
+int zfs_dbgmsg_maxsize = 4<<20; /* 4MB */
+
+void
+zfs_dbgmsg_init(void)
+{
+ list_create(&zfs_dbgmsgs, sizeof (zfs_dbgmsg_t),
+ offsetof(zfs_dbgmsg_t, zdm_node));
+ mutex_init(&zfs_dbgmsgs_lock, NULL, MUTEX_DEFAULT, NULL);
+}
+
+void
+zfs_dbgmsg_fini(void)
+{
+ zfs_dbgmsg_t *zdm;
+
+ while ((zdm = list_remove_head(&zfs_dbgmsgs)) != NULL) {
+ int size = sizeof (zfs_dbgmsg_t) + strlen(zdm->zdm_msg);
+ kmem_free(zdm, size);
+ zfs_dbgmsg_size -= size;
+ }
+ mutex_destroy(&zfs_dbgmsgs_lock);
+ ASSERT0(zfs_dbgmsg_size);
+}
+
+/*
+ * Print these messages by running:
+ * echo ::zfs_dbgmsg | mdb -k
+ *
+ * Monitor these messages by running:
+ * dtrace -qn 'zfs-dbgmsg{printf("%s\n", stringof(arg0))}'
+ *
+ * When used with libzpool, monitor with:
+ * dtrace -qn 'zfs$pid::zfs_dbgmsg:probe1{printf("%s\n", copyinstr(arg1))}'
+ */
+void
+zfs_dbgmsg(const char *fmt, ...)
+{
+ int size;
+ va_list adx;
+ zfs_dbgmsg_t *zdm;
+
+ va_start(adx, fmt);
+ size = vsnprintf(NULL, 0, fmt, adx);
+ va_end(adx);
+
+ /*
+ * There is one byte of string in sizeof (zfs_dbgmsg_t), used
+ * for the terminating null.
+ */
+ zdm = kmem_alloc(sizeof (zfs_dbgmsg_t) + size, KM_SLEEP);
+ zdm->zdm_timestamp = gethrestime_sec();
+
+ va_start(adx, fmt);
+ (void) vsnprintf(zdm->zdm_msg, size + 1, fmt, adx);
+ va_end(adx);
+
+ DTRACE_PROBE1(zfs__dbgmsg, char *, zdm->zdm_msg);
+
+ mutex_enter(&zfs_dbgmsgs_lock);
+ list_insert_tail(&zfs_dbgmsgs, zdm);
+ zfs_dbgmsg_size += sizeof (zfs_dbgmsg_t) + size;
+ while (zfs_dbgmsg_size > zfs_dbgmsg_maxsize) {
+ zdm = list_remove_head(&zfs_dbgmsgs);
+ size = sizeof (zfs_dbgmsg_t) + strlen(zdm->zdm_msg);
+ kmem_free(zdm, size);
+ zfs_dbgmsg_size -= size;
+ }
+ mutex_exit(&zfs_dbgmsgs_lock);
+}
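+
+/*
+ * A typical call site looks like this sketch (the names and values are
+ * illustrative, not taken from real callers):
+ *
+ *	zfs_dbgmsg("txg %llu synced in %llu ms",
+ *	    (u_longlong_t)txg, (u_longlong_t)ms);
+ *
+ * The message is sized by a first vsnprintf() pass, formatted by a second,
+ * and old messages are trimmed FIFO once zfs_dbgmsg_size exceeds
+ * zfs_dbgmsg_maxsize.
+ */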
+
+void
+zfs_dbgmsg_print(const char *tag)
+{
+ zfs_dbgmsg_t *zdm;
+
+ (void) printf("ZFS_DBGMSG(%s):\n", tag);
+ mutex_enter(&zfs_dbgmsgs_lock);
+ for (zdm = list_head(&zfs_dbgmsgs); zdm;
+ zdm = list_next(&zfs_dbgmsgs, zdm))
+ (void) printf("%s\n", zdm->zdm_msg);
+ mutex_exit(&zfs_dbgmsgs_lock);
+}
diff --git a/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/zfs_dir.c b/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/zfs_dir.c
new file mode 100644
index 000000000000..c3621a24d137
--- /dev/null
+++ b/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/zfs_dir.c
@@ -0,0 +1,968 @@
+/*
+ * CDDL HEADER START
+ *
+ * The contents of this file are subject to the terms of the
+ * Common Development and Distribution License (the "License").
+ * You may not use this file except in compliance with the License.
+ *
+ * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
+ * or http://www.opensolaris.org/os/licensing.
+ * See the License for the specific language governing permissions
+ * and limitations under the License.
+ *
+ * When distributing Covered Code, include this CDDL HEADER in each
+ * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
+ * If applicable, add the following below this CDDL HEADER, with the
+ * fields enclosed by brackets "[]" replaced with your own identifying
+ * information: Portions Copyright [yyyy] [name of copyright owner]
+ *
+ * CDDL HEADER END
+ */
+
+/*
+ * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved.
+ * Copyright (c) 2013, 2016 by Delphix. All rights reserved.
+ * Copyright 2017 Nexenta Systems, Inc.
+ */
+
+#include <sys/types.h>
+#include <sys/param.h>
+#include <sys/time.h>
+#include <sys/systm.h>
+#include <sys/sysmacros.h>
+#include <sys/resource.h>
+#include <sys/vfs.h>
+#include <sys/vnode.h>
+#include <sys/file.h>
+#include <sys/kmem.h>
+#include <sys/uio.h>
+#include <sys/cmn_err.h>
+#include <sys/errno.h>
+#include <sys/stat.h>
+#include <sys/unistd.h>
+#include <sys/sunddi.h>
+#include <sys/random.h>
+#include <sys/policy.h>
+#include <sys/kcondvar.h>
+#include <sys/callb.h>
+#include <sys/smp.h>
+#include <sys/zfs_dir.h>
+#include <sys/zfs_acl.h>
+#include <sys/fs/zfs.h>
+#include <sys/zap.h>
+#include <sys/dmu.h>
+#include <sys/atomic.h>
+#include <sys/zfs_ctldir.h>
+#include <sys/zfs_fuid.h>
+#include <sys/sa.h>
+#include <sys/zfs_sa.h>
+#include <sys/dnlc.h>
+#include <sys/extdirent.h>
+
+/*
+ * zfs_match_find() is used by zfs_dirent_lookup() to perform zap lookups
+ * of names after deciding which is the appropriate lookup interface.
+ */
+static int
+zfs_match_find(zfsvfs_t *zfsvfs, znode_t *dzp, const char *name,
+ matchtype_t mt, uint64_t *zoid)
+{
+ int error;
+
+ if (zfsvfs->z_norm) {
+ /*
+ * In the non-mixed case we only expect there would ever
+ * be one match, but we need to use the normalizing lookup.
+ */
+ error = zap_lookup_norm(zfsvfs->z_os, dzp->z_id, name, 8, 1,
+ zoid, mt, NULL, 0, NULL);
+ } else {
+ error = zap_lookup(zfsvfs->z_os, dzp->z_id, name, 8, 1, zoid);
+ }
+ *zoid = ZFS_DIRENT_OBJ(*zoid);
+
+ return (error);
+}
+
+/*
+ * Look up a directory entry under a locked vnode.
+ * dvp being locked gives us a guarantee that there are no concurrent
+ * modification of the directory and, thus, if a node can be found in
+ * the directory, then it must not be unlinked.
+ *
+ * Input arguments:
+ * dzp - znode for directory
+ * name - name of entry to lock
+ * flag - ZNEW: if the entry already exists, fail with EEXIST.
+ * ZEXISTS: if the entry does not exist, fail with ENOENT.
+ * ZXATTR: we want dzp's xattr directory
+ *
+ * Output arguments:
+ * zpp - pointer to the znode for the entry (NULL if there isn't one)
+ *
+ * Return value: 0 on success or errno on failure.
+ *
+ * NOTE: Always checks for, and rejects, '.' and '..'.
+ */
+int
+zfs_dirent_lookup(znode_t *dzp, const char *name, znode_t **zpp, int flag)
+{
+ zfsvfs_t *zfsvfs = dzp->z_zfsvfs;
+ matchtype_t mt = 0;
+ uint64_t zoid;
+ vnode_t *vp = NULL;
+ int error = 0;
+
+ ASSERT_VOP_LOCKED(ZTOV(dzp), __func__);
+
+ *zpp = NULL;
+
+ /*
+ * Verify that we are not trying to lock '.', '..', or '.zfs'
+ */
+ if ((name[0] == '.' &&
+ (name[1] == '\0' || (name[1] == '.' && name[2] == '\0'))) ||
+ (zfs_has_ctldir(dzp) && strcmp(name, ZFS_CTLDIR_NAME) == 0))
+ return (SET_ERROR(EEXIST));
+
+ /*
+ * Case sensitivity and normalization preferences are set when
+ * the file system is created. These are stored in the
+ * zfsvfs->z_case and zfsvfs->z_norm fields. These choices
+ * affect how we perform zap lookups.
+ *
+ * When matching we may need to normalize & change case according to
+ * FS settings.
+ *
+ * Note that a normalized match is necessary for a case insensitive
+ * filesystem when the lookup request is not exact because normalization
+ * can fold case independent of normalizing code point sequences.
+ *
+ * See the table above zfs_dropname().
+ */
+ if (zfsvfs->z_norm != 0) {
+ mt = MT_NORMALIZE;
+
+ /*
+ * Determine if the match needs to honor the case specified in
+ * lookup, and if so keep track of that so that during
+ * normalization we don't fold case.
+ */
+ if (zfsvfs->z_case == ZFS_CASE_MIXED) {
+ mt |= MT_MATCH_CASE;
+ }
+ }
+
+ /*
+ * Only look in or update the DNLC if we are looking for the
+ * name on a file system that does not require normalization
+ * or case folding. We can also look there if we happen to be
+ * on a non-normalizing, mixed sensitivity file system IF we
+ * are looking for the exact name.
+ *
+ * NB: we do not need to worry about this flag for ZFS_CASE_SENSITIVE
+ * because in that case MT_EXACT and MT_FIRST should produce exactly
+ * the same result.
+ */
+
+ if (dzp->z_unlinked && !(flag & ZXATTR))
+ return (ENOENT);
+ if (flag & ZXATTR) {
+ error = sa_lookup(dzp->z_sa_hdl, SA_ZPL_XATTR(zfsvfs), &zoid,
+ sizeof (zoid));
+ if (error == 0)
+ error = (zoid == 0 ? ENOENT : 0);
+ } else {
+ error = zfs_match_find(zfsvfs, dzp, name, mt, &zoid);
+ }
+ if (error) {
+ if (error != ENOENT || (flag & ZEXISTS)) {
+ return (error);
+ }
+ } else {
+ if (flag & ZNEW) {
+ return (SET_ERROR(EEXIST));
+ }
+ error = zfs_zget(zfsvfs, zoid, zpp);
+ if (error)
+ return (error);
+ ASSERT(!(*zpp)->z_unlinked);
+ }
+
+ return (0);
+}
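+
+/*
+ * Example (illustrative; the name "foo" is hypothetical): a caller that
+ * requires an existing entry does
+ *
+ *	error = zfs_dirent_lookup(dzp, "foo", &zp, ZEXISTS);
+ *
+ * and gets ENOENT if "foo" is absent, while passing ZNEW instead inverts
+ * the test and fails with EEXIST if "foo" is already present.
+ */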
+
+static int
+zfs_dd_lookup(znode_t *dzp, znode_t **zpp)
+{
+ zfsvfs_t *zfsvfs = dzp->z_zfsvfs;
+ znode_t *zp;
+ uint64_t parent;
+ int error;
+
+ ASSERT_VOP_LOCKED(ZTOV(dzp), __func__);
+ ASSERT(RRM_READ_HELD(&zfsvfs->z_teardown_lock));
+
+ if (dzp->z_unlinked)
+ return (ENOENT);
+
+ if ((error = sa_lookup(dzp->z_sa_hdl,
+ SA_ZPL_PARENT(zfsvfs), &parent, sizeof (parent))) != 0)
+ return (error);
+
+ error = zfs_zget(zfsvfs, parent, &zp);
+ if (error == 0)
+ *zpp = zp;
+ return (error);
+}
+
+int
+zfs_dirlook(znode_t *dzp, const char *name, znode_t **zpp)
+{
+ zfsvfs_t *zfsvfs = dzp->z_zfsvfs;
+ znode_t *zp;
+ int error = 0;
+
+ ASSERT_VOP_LOCKED(ZTOV(dzp), __func__);
+ ASSERT(RRM_READ_HELD(&zfsvfs->z_teardown_lock));
+
+ if (dzp->z_unlinked)
+ return (SET_ERROR(ENOENT));
+
+ if (name[0] == 0 || (name[0] == '.' && name[1] == 0)) {
+ *zpp = dzp;
+ } else if (name[0] == '.' && name[1] == '.' && name[2] == 0) {
+ error = zfs_dd_lookup(dzp, zpp);
+ } else {
+ error = zfs_dirent_lookup(dzp, name, &zp, ZEXISTS);
+ if (error == 0) {
+ dzp->z_zn_prefetch = B_TRUE; /* enable prefetching */
+ *zpp = zp;
+ }
+ }
+ return (error);
+}
+
+/*
+ * unlinked Set (formerly known as the "delete queue") Error Handling
+ *
+ * When dealing with the unlinked set, we dmu_tx_hold_zap(), but we
+ * don't specify the name of the entry that we will be manipulating. We
+ * also fib and say that we won't be adding any new entries to the
+ * unlinked set, even though we might (this is to lower the minimum file
+ * size that can be deleted in a full filesystem). So on the small
+ * chance that the nlink list is using a fat zap (ie. has more than
+ * 2000 entries), we *may* not pre-read a block that's needed.
+ * Therefore it is remotely possible for some of the assertions
+ * regarding the unlinked set below to fail due to i/o error. On a
+ * nondebug system, this will result in the space being leaked.
+ */
+void
+zfs_unlinked_add(znode_t *zp, dmu_tx_t *tx)
+{
+ zfsvfs_t *zfsvfs = zp->z_zfsvfs;
+
+ ASSERT(zp->z_unlinked);
+ ASSERT(zp->z_links == 0);
+
+ VERIFY3U(0, ==,
+ zap_add_int(zfsvfs->z_os, zfsvfs->z_unlinkedobj, zp->z_id, tx));
+}
+
+/*
+ * Clean up any znodes that had no links when we either crashed or
+ * (force) umounted the file system.
+ */
+void
+zfs_unlinked_drain(zfsvfs_t *zfsvfs)
+{
+ zap_cursor_t zc;
+ zap_attribute_t zap;
+ dmu_object_info_t doi;
+ znode_t *zp;
+ dmu_tx_t *tx;
+ int error;
+
+ /*
+ * Iterate over the contents of the unlinked set.
+ */
+ for (zap_cursor_init(&zc, zfsvfs->z_os, zfsvfs->z_unlinkedobj);
+ zap_cursor_retrieve(&zc, &zap) == 0;
+ zap_cursor_advance(&zc)) {
+
+ /*
+ * See what kind of object we have in list
+ */
+
+ error = dmu_object_info(zfsvfs->z_os,
+ zap.za_first_integer, &doi);
+ if (error != 0)
+ continue;
+
+ ASSERT((doi.doi_type == DMU_OT_PLAIN_FILE_CONTENTS) ||
+ (doi.doi_type == DMU_OT_DIRECTORY_CONTENTS));
+ /*
+ * We need to re-mark these list entries for deletion,
+ * so we pull them back into core and set zp->z_unlinked.
+ */
+ error = zfs_zget(zfsvfs, zap.za_first_integer, &zp);
+
+ /*
+ * We may pick up znodes that are already marked for deletion.
+ * This could happen during the purge of an extended attribute
+ * directory. All we need to do is skip over them, since they
+ * are already in the system marked z_unlinked.
+ */
+ if (error != 0)
+ continue;
+
+ vn_lock(ZTOV(zp), LK_EXCLUSIVE | LK_RETRY);
+#if defined(__FreeBSD__)
+ /*
+ * Due to changes in zfs_rmnode we need to make sure the
+ * link count is set to zero here.
+ */
+ if (zp->z_links != 0) {
+ tx = dmu_tx_create(zfsvfs->z_os);
+ dmu_tx_hold_sa(tx, zp->z_sa_hdl, B_FALSE);
+ error = dmu_tx_assign(tx, TXG_WAIT);
+ if (error != 0) {
+ dmu_tx_abort(tx);
+ vput(ZTOV(zp));
+ continue;
+ }
+ zp->z_links = 0;
+ VERIFY0(sa_update(zp->z_sa_hdl, SA_ZPL_LINKS(zfsvfs),
+ &zp->z_links, sizeof (zp->z_links), tx));
+ dmu_tx_commit(tx);
+ }
+#endif
+ zp->z_unlinked = B_TRUE;
+ vput(ZTOV(zp));
+ }
+ zap_cursor_fini(&zc);
+}
+
+/*
+ * Delete the entire contents of a directory. Return a count
+ * of the number of entries that could not be deleted. If we encounter
+ * an error, return a count of at least one so that the directory stays
+ * in the unlinked set.
+ *
+ * NOTE: this function assumes that the directory is inactive,
+ * so there is no need to lock its entries before deletion.
+ * Also, it assumes the directory contents are *only* regular
+ * files.
+ */
+static int
+zfs_purgedir(znode_t *dzp)
+{
+ zap_cursor_t zc;
+ zap_attribute_t zap;
+ znode_t *xzp;
+ dmu_tx_t *tx;
+ zfsvfs_t *zfsvfs = dzp->z_zfsvfs;
+ int skipped = 0;
+ int error;
+
+ for (zap_cursor_init(&zc, zfsvfs->z_os, dzp->z_id);
+ (error = zap_cursor_retrieve(&zc, &zap)) == 0;
+ zap_cursor_advance(&zc)) {
+ error = zfs_zget(zfsvfs,
+ ZFS_DIRENT_OBJ(zap.za_first_integer), &xzp);
+ if (error) {
+ skipped += 1;
+ continue;
+ }
+
+ vn_lock(ZTOV(xzp), LK_EXCLUSIVE | LK_RETRY);
+ ASSERT((ZTOV(xzp)->v_type == VREG) ||
+ (ZTOV(xzp)->v_type == VLNK));
+
+ tx = dmu_tx_create(zfsvfs->z_os);
+ dmu_tx_hold_sa(tx, dzp->z_sa_hdl, B_FALSE);
+ dmu_tx_hold_zap(tx, dzp->z_id, FALSE, zap.za_name);
+ dmu_tx_hold_sa(tx, xzp->z_sa_hdl, B_FALSE);
+ dmu_tx_hold_zap(tx, zfsvfs->z_unlinkedobj, FALSE, NULL);
+ /* Is this really needed? */
+ zfs_sa_upgrade_txholds(tx, xzp);
+ dmu_tx_mark_netfree(tx);
+ error = dmu_tx_assign(tx, TXG_WAIT);
+ if (error) {
+ dmu_tx_abort(tx);
+ vput(ZTOV(xzp));
+ skipped += 1;
+ continue;
+ }
+
+ error = zfs_link_destroy(dzp, zap.za_name, xzp, tx, 0, NULL);
+ if (error)
+ skipped += 1;
+ dmu_tx_commit(tx);
+
+ vput(ZTOV(xzp));
+ }
+ zap_cursor_fini(&zc);
+ if (error != ENOENT)
+ skipped += 1;
+ return (skipped);
+}
+
+#if defined(__FreeBSD__)
+extern taskq_t *zfsvfs_taskq;
+#endif
+
+void
+zfs_rmnode(znode_t *zp)
+{
+ zfsvfs_t *zfsvfs = zp->z_zfsvfs;
+ objset_t *os = zfsvfs->z_os;
+ dmu_tx_t *tx;
+ uint64_t acl_obj;
+ uint64_t xattr_obj;
+ int error;
+
+ ASSERT(zp->z_links == 0);
+ ASSERT_VOP_ELOCKED(ZTOV(zp), __func__);
+
+ /*
+ * If this is an attribute directory, purge its contents.
+ */
+ if (ZTOV(zp) != NULL && ZTOV(zp)->v_type == VDIR &&
+ (zp->z_pflags & ZFS_XATTR)) {
+ if (zfs_purgedir(zp) != 0) {
+ /*
+ * Not enough space to delete some xattrs.
+ * Leave it in the unlinked set.
+ */
+ zfs_znode_dmu_fini(zp);
+ zfs_znode_free(zp);
+ return;
+ }
+ } else {
+ /*
+ * Free up all the data in the file. We don't do this for
+ * XATTR directories because we need truncate and remove to be
+ * in the same tx, like in zfs_znode_delete(). Otherwise, if
+ * we crash here we'll end up with an inconsistent truncated
+ * zap object in the delete queue. Note a truncated file is
+ * harmless since it only contains user data.
+ */
+ error = dmu_free_long_range(os, zp->z_id, 0, DMU_OBJECT_END);
+ if (error) {
+ /*
+ * Not enough space or we were interrupted by unmount.
+ * Leave the file in the unlinked set.
+ */
+ zfs_znode_dmu_fini(zp);
+ zfs_znode_free(zp);
+ return;
+ }
+ }
+
+ /*
+ * If the file has extended attributes, we're going to unlink
+ * the xattr dir.
+ */
+ error = sa_lookup(zp->z_sa_hdl, SA_ZPL_XATTR(zfsvfs),
+ &xattr_obj, sizeof (xattr_obj));
+ if (error)
+ xattr_obj = 0;
+
+ acl_obj = zfs_external_acl(zp);
+
+ /*
+ * Set up the final transaction.
+ */
+ tx = dmu_tx_create(os);
+ dmu_tx_hold_free(tx, zp->z_id, 0, DMU_OBJECT_END);
+ dmu_tx_hold_zap(tx, zfsvfs->z_unlinkedobj, FALSE, NULL);
+ if (xattr_obj)
+ dmu_tx_hold_zap(tx, zfsvfs->z_unlinkedobj, TRUE, NULL);
+ if (acl_obj)
+ dmu_tx_hold_free(tx, acl_obj, 0, DMU_OBJECT_END);
+
+ zfs_sa_upgrade_txholds(tx, zp);
+ error = dmu_tx_assign(tx, TXG_WAIT);
+ if (error) {
+ /*
+ * Not enough space to delete the file. Leave it in the
+ * unlinked set, leaking it until the fs is remounted (at
+ * which point we'll call zfs_unlinked_drain() to process it).
+ */
+ dmu_tx_abort(tx);
+ zfs_znode_dmu_fini(zp);
+ zfs_znode_free(zp);
+ return;
+ }
+
+#if defined(__FreeBSD__)
+ /*
+ * FreeBSD's implementation of zfs_zget requires a vnode to back it.
+ * This means that we could end up calling into getnewvnode while
+ * calling zfs_rmnode as a result of a prior call to getnewvnode
+ * trying to clear vnodes out of the cache. If this repeats we can
+ * recurse enough that we overflow our stack. To avoid this, we
+ * avoid calling zfs_zget on the xattr znode and instead simply add
+ * it to the unlinked set and schedule a call to zfs_unlinked_drain.
+ */
+ if (xattr_obj) {
+ /* Add extended attribute directory to the unlinked set. */
+ VERIFY3U(0, ==,
+ zap_add_int(os, zfsvfs->z_unlinkedobj, xattr_obj, tx));
+ }
+#else
+ if (xzp) {
+ ASSERT(error == 0);
+ xzp->z_unlinked = B_TRUE; /* mark xzp for deletion */
+ xzp->z_links = 0; /* no more links to it */
+ VERIFY(0 == sa_update(xzp->z_sa_hdl, SA_ZPL_LINKS(zfsvfs),
+ &xzp->z_links, sizeof (xzp->z_links), tx));
+ zfs_unlinked_add(xzp, tx);
+ }
+#endif
+
+ /* Remove this znode from the unlinked set */
+ VERIFY3U(0, ==,
+ zap_remove_int(zfsvfs->z_os, zfsvfs->z_unlinkedobj, zp->z_id, tx));
+
+ zfs_znode_delete(zp, tx);
+
+ dmu_tx_commit(tx);
+
+#if defined(__FreeBSD__)
+ if (xattr_obj) {
+ /*
+ * We're using the FreeBSD taskqueue API here instead of
+ * the Solaris taskq API since the FreeBSD API allows for a
+ * task to be enqueued multiple times but executed once.
+ */
+ taskqueue_enqueue(zfsvfs_taskq->tq_queue,
+ &zfsvfs->z_unlinked_drain_task);
+ }
+#endif
+}
+
+static uint64_t
+zfs_dirent(znode_t *zp, uint64_t mode)
+{
+ uint64_t de = zp->z_id;
+
+ if (zp->z_zfsvfs->z_version >= ZPL_VERSION_DIRENT_TYPE)
+ de |= IFTODT(mode) << 60;
+ return (de);
+}
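+
+/*
+ * Worked example (illustrative): for a directory znode with object id
+ * 0x1234 on a ZPL version that records dirent types, IFTODT(mode) is
+ * DT_DIR (4), so
+ *
+ *	de = 0x1234 | ((uint64_t)4 << 60) = 0x4000000000001234
+ *
+ * and the pieces are recovered with ZFS_DIRENT_OBJ() and ZFS_DIRENT_TYPE().
+ */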
+
+/*
+ * Link zp into dzp. Can only fail if zp has been unlinked.
+ */
+int
+zfs_link_create(znode_t *dzp, const char *name, znode_t *zp, dmu_tx_t *tx,
+ int flag)
+{
+ zfsvfs_t *zfsvfs = zp->z_zfsvfs;
+ vnode_t *vp = ZTOV(zp);
+ uint64_t value;
+ int zp_is_dir = (vp->v_type == VDIR);
+ sa_bulk_attr_t bulk[5];
+ uint64_t mtime[2], ctime[2];
+ int count = 0;
+ int error;
+
+ ASSERT_VOP_ELOCKED(ZTOV(dzp), __func__);
+ ASSERT_VOP_ELOCKED(ZTOV(zp), __func__);
+#ifdef __FreeBSD__
+ if (zp_is_dir) {
+ if (dzp->z_links >= ZFS_LINK_MAX)
+ return (SET_ERROR(EMLINK));
+ }
+#endif
+ if (!(flag & ZRENAMING)) {
+ if (zp->z_unlinked) { /* no new links to unlinked zp */
+ ASSERT(!(flag & (ZNEW | ZEXISTS)));
+ return (SET_ERROR(ENOENT));
+ }
+#ifdef __FreeBSD__
+ if (zp->z_links >= ZFS_LINK_MAX - zp_is_dir) {
+ return (SET_ERROR(EMLINK));
+ }
+#endif
+ zp->z_links++;
+ SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_LINKS(zfsvfs), NULL,
+ &zp->z_links, sizeof (zp->z_links));
+
+ } else {
+ ASSERT(zp->z_unlinked == 0);
+ }
+ SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_PARENT(zfsvfs), NULL,
+ &dzp->z_id, sizeof (dzp->z_id));
+ SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_FLAGS(zfsvfs), NULL,
+ &zp->z_pflags, sizeof (zp->z_pflags));
+
+ if (!(flag & ZNEW)) {
+ SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_CTIME(zfsvfs), NULL,
+ ctime, sizeof (ctime));
+ zfs_tstamp_update_setup(zp, STATE_CHANGED, mtime,
+ ctime, B_TRUE);
+ }
+ error = sa_bulk_update(zp->z_sa_hdl, bulk, count, tx);
+ ASSERT0(error);
+
+ dzp->z_size++;
+ dzp->z_links += zp_is_dir;
+ count = 0;
+ SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_SIZE(zfsvfs), NULL,
+ &dzp->z_size, sizeof (dzp->z_size));
+ SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_LINKS(zfsvfs), NULL,
+ &dzp->z_links, sizeof (dzp->z_links));
+ SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_MTIME(zfsvfs), NULL,
+ mtime, sizeof (mtime));
+ SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_CTIME(zfsvfs), NULL,
+ ctime, sizeof (ctime));
+ SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_FLAGS(zfsvfs), NULL,
+ &dzp->z_pflags, sizeof (dzp->z_pflags));
+ zfs_tstamp_update_setup(dzp, CONTENT_MODIFIED, mtime, ctime, B_TRUE);
+ error = sa_bulk_update(dzp->z_sa_hdl, bulk, count, tx);
+ ASSERT0(error);
+
+ value = zfs_dirent(zp, zp->z_mode);
+ error = zap_add(zp->z_zfsvfs->z_os, dzp->z_id, name,
+ 8, 1, &value, tx);
+ VERIFY0(error);
+
+ return (0);
+}
+
+/*
+ * The match type in the code for this function should conform to:
+ *
+ * ------------------------------------------------------------------------
+ * fs type | z_norm | lookup type | match type
+ * ---------|-------------|-------------|----------------------------------
+ * CS !norm | 0 | 0 | 0 (exact)
+ * CS norm | formX | 0 | MT_NORMALIZE
+ * CI !norm | upper | !ZCIEXACT | MT_NORMALIZE
+ * CI !norm | upper | ZCIEXACT | MT_NORMALIZE | MT_MATCH_CASE
+ * CI norm | upper|formX | !ZCIEXACT | MT_NORMALIZE
+ * CI norm | upper|formX | ZCIEXACT | MT_NORMALIZE | MT_MATCH_CASE
+ * CM !norm | upper | !ZCILOOK | MT_NORMALIZE | MT_MATCH_CASE
+ * CM !norm | upper | ZCILOOK | MT_NORMALIZE
+ * CM norm | upper|formX | !ZCILOOK | MT_NORMALIZE | MT_MATCH_CASE
+ * CM norm | upper|formX | ZCILOOK | MT_NORMALIZE
+ *
+ * Abbreviations:
+ * CS = Case Sensitive, CI = Case Insensitive, CM = Case Mixed
+ * upper = case folding set by fs type on creation (U8_TEXTPREP_TOUPPER)
+ * formX = unicode normalization form set on fs creation
+ */
+static int
+zfs_dropname(znode_t *dzp, const char *name, znode_t *zp, dmu_tx_t *tx,
+ int flag)
+{
+ int error;
+
+ if (zp->z_zfsvfs->z_norm) {
+ matchtype_t mt = MT_NORMALIZE;
+
+ if (zp->z_zfsvfs->z_case == ZFS_CASE_MIXED) {
+ mt |= MT_MATCH_CASE;
+ }
+
+ error = zap_remove_norm(zp->z_zfsvfs->z_os, dzp->z_id,
+ name, mt, tx);
+ } else {
+ error = zap_remove(zp->z_zfsvfs->z_os, dzp->z_id, name, tx);
+ }
+
+ return (error);
+}
+
+/*
+ * Unlink zp from dzp, and mark zp for deletion if this was the last link.
+ * Can fail if zp is a mount point (EBUSY) or a non-empty directory (EEXIST).
+ * If 'unlinkedp' is NULL, we put unlinked znodes on the unlinked list.
+ * If it's non-NULL, we use it to indicate whether the znode needs deletion,
+ * and it's the caller's job to do it.
+ */
+int
+zfs_link_destroy(znode_t *dzp, const char *name, znode_t *zp, dmu_tx_t *tx,
+ int flag, boolean_t *unlinkedp)
+{
+ zfsvfs_t *zfsvfs = dzp->z_zfsvfs;
+ vnode_t *vp = ZTOV(zp);
+ int zp_is_dir = (vp->v_type == VDIR);
+ boolean_t unlinked = B_FALSE;
+ sa_bulk_attr_t bulk[5];
+ uint64_t mtime[2], ctime[2];
+ int count = 0;
+ int error;
+
+ ASSERT_VOP_ELOCKED(ZTOV(dzp), __func__);
+ ASSERT_VOP_ELOCKED(ZTOV(zp), __func__);
+
+ if (!(flag & ZRENAMING)) {
+
+ if (zp_is_dir && !zfs_dirempty(zp)) {
+#ifdef illumos
+ return (SET_ERROR(EEXIST));
+#else
+ return (SET_ERROR(ENOTEMPTY));
+#endif
+ }
+
+ /*
+ * If we get here, we are going to try to remove the object.
+ * First try removing the name from the directory; if that
+ * fails, return the error.
+ */
+ error = zfs_dropname(dzp, name, zp, tx, flag);
+ if (error != 0) {
+ return (error);
+ }
+
+ if (zp->z_links <= zp_is_dir) {
+ zfs_panic_recover("zfs: link count on vnode %p is %u, "
+ "should be at least %u", zp->z_vnode,
+ (int)zp->z_links,
+ zp_is_dir + 1);
+ zp->z_links = zp_is_dir + 1;
+ }
+ if (--zp->z_links == zp_is_dir) {
+ zp->z_unlinked = B_TRUE;
+ zp->z_links = 0;
+ unlinked = B_TRUE;
+ } else {
+ SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_CTIME(zfsvfs),
+ NULL, &ctime, sizeof (ctime));
+ SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_FLAGS(zfsvfs),
+ NULL, &zp->z_pflags, sizeof (zp->z_pflags));
+ zfs_tstamp_update_setup(zp, STATE_CHANGED, mtime, ctime,
+ B_TRUE);
+ }
+ SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_LINKS(zfsvfs),
+ NULL, &zp->z_links, sizeof (zp->z_links));
+ error = sa_bulk_update(zp->z_sa_hdl, bulk, count, tx);
+ count = 0;
+ ASSERT0(error);
+ } else {
+ ASSERT(zp->z_unlinked == 0);
+ error = zfs_dropname(dzp, name, zp, tx, flag);
+ if (error != 0)
+ return (error);
+ }
+
+ dzp->z_size--; /* one dirent removed */
+ dzp->z_links -= zp_is_dir; /* ".." link from zp */
+ SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_LINKS(zfsvfs),
+ NULL, &dzp->z_links, sizeof (dzp->z_links));
+ SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_SIZE(zfsvfs),
+ NULL, &dzp->z_size, sizeof (dzp->z_size));
+ SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_CTIME(zfsvfs),
+ NULL, ctime, sizeof (ctime));
+ SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_MTIME(zfsvfs),
+ NULL, mtime, sizeof (mtime));
+ SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_FLAGS(zfsvfs),
+ NULL, &dzp->z_pflags, sizeof (dzp->z_pflags));
+ zfs_tstamp_update_setup(dzp, CONTENT_MODIFIED, mtime, ctime, B_TRUE);
+ error = sa_bulk_update(dzp->z_sa_hdl, bulk, count, tx);
+ ASSERT0(error);
+
+ if (unlinkedp != NULL)
+ *unlinkedp = unlinked;
+ else if (unlinked)
+ zfs_unlinked_add(zp, tx);
+
+ return (0);
+}
+
+/*
+ * Indicate whether the directory is empty.
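+ * A directory's z_size tracks its entry count, starting at 2 for the
+ * implicit "." and ".." entries, so a size of 2 means no real entries.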
+ */
+boolean_t
+zfs_dirempty(znode_t *dzp)
+{
+ return (dzp->z_size == 2);
+}
+
+int
+zfs_make_xattrdir(znode_t *zp, vattr_t *vap, vnode_t **xvpp, cred_t *cr)
+{
+ zfsvfs_t *zfsvfs = zp->z_zfsvfs;
+ znode_t *xzp;
+ dmu_tx_t *tx;
+ int error;
+ zfs_acl_ids_t acl_ids;
+ boolean_t fuid_dirtied;
+ uint64_t parent;
+
+ *xvpp = NULL;
+
+ /*
+	 * In FreeBSD, access checking for creating an EA is done
+	 * in zfs_setextattr().
+ */
+#ifndef __FreeBSD_kernel__
+	if ((error = zfs_zaccess(zp, ACE_WRITE_NAMED_ATTRS, 0, B_FALSE, cr)) != 0)
+ return (error);
+#endif
+
+ if ((error = zfs_acl_ids_create(zp, IS_XATTR, vap, cr, NULL,
+ &acl_ids)) != 0)
+ return (error);
+ if (zfs_acl_ids_overquota(zfsvfs, &acl_ids)) {
+ zfs_acl_ids_free(&acl_ids);
+ return (SET_ERROR(EDQUOT));
+ }
+
+ getnewvnode_reserve();
+
+ tx = dmu_tx_create(zfsvfs->z_os);
+ dmu_tx_hold_sa_create(tx, acl_ids.z_aclp->z_acl_bytes +
+ ZFS_SA_BASE_ATTR_SIZE);
+ dmu_tx_hold_sa(tx, zp->z_sa_hdl, B_TRUE);
+ dmu_tx_hold_zap(tx, DMU_NEW_OBJECT, FALSE, NULL);
+ fuid_dirtied = zfsvfs->z_fuid_dirty;
+ if (fuid_dirtied)
+ zfs_fuid_txhold(zfsvfs, tx);
+ error = dmu_tx_assign(tx, TXG_WAIT);
+ if (error) {
+ zfs_acl_ids_free(&acl_ids);
+ dmu_tx_abort(tx);
+ getnewvnode_drop_reserve();
+ return (error);
+ }
+ zfs_mknode(zp, vap, tx, cr, IS_XATTR, &xzp, &acl_ids);
+
+ if (fuid_dirtied)
+ zfs_fuid_sync(zfsvfs, tx);
+
+#ifdef DEBUG
+ error = sa_lookup(xzp->z_sa_hdl, SA_ZPL_PARENT(zfsvfs),
+ &parent, sizeof (parent));
+ ASSERT(error == 0 && parent == zp->z_id);
+#endif
+
+ VERIFY(0 == sa_update(zp->z_sa_hdl, SA_ZPL_XATTR(zfsvfs), &xzp->z_id,
+ sizeof (xzp->z_id), tx));
+
+ (void) zfs_log_create(zfsvfs->z_log, tx, TX_MKXATTR, zp,
+ xzp, "", NULL, acl_ids.z_fuidp, vap);
+
+ zfs_acl_ids_free(&acl_ids);
+ dmu_tx_commit(tx);
+
+ getnewvnode_drop_reserve();
+
+ *xvpp = ZTOV(xzp);
+
+ return (0);
+}
+
+/*
+ * Return a znode for the extended attribute directory for zp.
+ * ** If the directory does not already exist, it is created **
+ *
+ * IN: zp - znode to obtain attribute directory from
+ * cr - credentials of caller
+ * flags - flags from the VOP_LOOKUP call
+ *
+ * OUT: xzpp - pointer to extended attribute znode
+ *
+ * RETURN: 0 on success
+ * error number on failure
+ */
+int
+zfs_get_xattrdir(znode_t *zp, vnode_t **xvpp, cred_t *cr, int flags)
+{
+ zfsvfs_t *zfsvfs = zp->z_zfsvfs;
+ znode_t *xzp;
+ vattr_t va;
+ int error;
+top:
+ error = zfs_dirent_lookup(zp, "", &xzp, ZXATTR);
+ if (error)
+ return (error);
+
+ if (xzp != NULL) {
+ *xvpp = ZTOV(xzp);
+ return (0);
+ }
+
+ if (!(flags & CREATE_XATTR_DIR)) {
+#ifdef illumos
+ return (SET_ERROR(ENOENT));
+#else
+ return (SET_ERROR(ENOATTR));
+#endif
+ }
+
+ if (zfsvfs->z_vfs->vfs_flag & VFS_RDONLY) {
+ return (SET_ERROR(EROFS));
+ }
+
+ /*
+ * The ability to 'create' files in an attribute
+ * directory comes from the write_xattr permission on the base file.
+ *
+ * The ability to 'search' an attribute directory requires
+ * read_xattr permission on the base file.
+ *
+ * Once in a directory the ability to read/write attributes
+ * is controlled by the permissions on the attribute file.
+ */
+ va.va_mask = AT_TYPE | AT_MODE | AT_UID | AT_GID;
+ va.va_type = VDIR;
+ va.va_mode = S_IFDIR | S_ISVTX | 0777;
+ zfs_fuid_map_ids(zp, cr, &va.va_uid, &va.va_gid);
+
+ error = zfs_make_xattrdir(zp, &va, xvpp, cr);
+
+ if (error == ERESTART) {
+ /* NB: we already did dmu_tx_wait() if necessary */
+ goto top;
+ }
+ if (error == 0)
+ VOP_UNLOCK(*xvpp);
+
+ return (error);
+}
+
+/*
+ * Decide whether it is okay to remove within a sticky directory.
+ *
+ * In sticky directories, write access is not sufficient;
+ * you can remove entries from a directory only if:
+ *
+ * you own the directory,
+ * you own the entry,
+ * the entry is a plain file and you have write access,
+ * or you are privileged (checked in secpolicy...).
+ *
+ * The function returns 0 if remove access is granted.
+ */
+int
+zfs_sticky_remove_access(znode_t *zdp, znode_t *zp, cred_t *cr)
+{
+ uid_t uid;
+ uid_t downer;
+ uid_t fowner;
+ zfsvfs_t *zfsvfs = zdp->z_zfsvfs;
+
+ if (zdp->z_zfsvfs->z_replay)
+ return (0);
+
+ if ((zdp->z_mode & S_ISVTX) == 0)
+ return (0);
+
+ downer = zfs_fuid_map_id(zfsvfs, zdp->z_uid, cr, ZFS_OWNER);
+ fowner = zfs_fuid_map_id(zfsvfs, zp->z_uid, cr, ZFS_OWNER);
+
+ if ((uid = crgetuid(cr)) == downer || uid == fowner ||
+ (ZTOV(zp)->v_type == VREG &&
+ zfs_zaccess(zp, ACE_WRITE_DATA, 0, B_FALSE, cr) == 0))
+ return (0);
+ else
+ return (secpolicy_vnode_remove(ZTOV(zp), cr));
+}
diff --git a/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/zfs_fm.c b/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/zfs_fm.c
new file mode 100644
index 000000000000..398a3d04aa6e
--- /dev/null
+++ b/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/zfs_fm.c
@@ -0,0 +1,871 @@
+/*
+ * CDDL HEADER START
+ *
+ * The contents of this file are subject to the terms of the
+ * Common Development and Distribution License (the "License").
+ * You may not use this file except in compliance with the License.
+ *
+ * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
+ * or http://www.opensolaris.org/os/licensing.
+ * See the License for the specific language governing permissions
+ * and limitations under the License.
+ *
+ * When distributing Covered Code, include this CDDL HEADER in each
+ * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
+ * If applicable, add the following below this CDDL HEADER, with the
+ * fields enclosed by brackets "[]" replaced with your own identifying
+ * information: Portions Copyright [yyyy] [name of copyright owner]
+ *
+ * CDDL HEADER END
+ */
+/*
+ * Copyright 2009 Sun Microsystems, Inc. All rights reserved.
+ * Use is subject to license terms.
+ */
+
+/*
+ * Copyright (c) 2012 by Delphix. All rights reserved.
+ */
+
+#include <sys/spa.h>
+#include <sys/spa_impl.h>
+#include <sys/vdev.h>
+#include <sys/vdev_impl.h>
+#include <sys/zio.h>
+#include <sys/zio_checksum.h>
+
+#include <sys/fm/fs/zfs.h>
+#include <sys/fm/protocol.h>
+#include <sys/fm/util.h>
+#include <sys/sysevent.h>
+
+/*
+ * This general routine is responsible for generating all the different ZFS
+ * ereports. The payload is dependent on the class, and which arguments are
+ * supplied to the function:
+ *
+ * EREPORT POOL VDEV IO
+ * block X X X
+ * data X X
+ * device X X
+ * pool X
+ *
+ * If we are in a loading state, all errors are chained together by the same
+ * SPA-wide ENA (Error Numeric Association).
+ *
+ * For isolated I/O requests, we get the ENA from the zio_t. The propagation
+ * gets very complicated due to RAID-Z, gang blocks, and vdev caching. We want
+ * to chain together all ereports associated with a logical piece of data. For
+ * read I/Os, there are basically three 'types' of I/O, which form a roughly
+ * layered diagram:
+ *
+ * +---------------+
+ * | Aggregate I/O | No associated logical data or device
+ * +---------------+
+ * |
+ * V
+ * +---------------+ Reads associated with a piece of logical data.
+ * | Read I/O | This includes reads on behalf of RAID-Z,
+ * +---------------+ mirrors, gang blocks, retries, etc.
+ * |
+ * V
+ * +---------------+ Reads associated with a particular device, but
+ * | Physical I/O | no logical data. Issued as part of vdev caching
+ * +---------------+ and I/O aggregation.
+ *
+ * Note that 'physical I/O' here is not the same terminology as used in the rest
+ * of ZIO. Typically, 'physical I/O' simply means that there is no attached
+ * block pointer. But I/O with no associated block pointer can still be related
+ * to a logical piece of data (e.g. RAID-Z requests).
+ *
+ * Purely physical I/Os always have unique ENAs. They are not related to a
+ * particular piece of logical data, and therefore cannot be chained together.
+ * We still generate an ereport, but the DE doesn't correlate it with any
+ * logical piece of data. When such an I/O fails, the delegated I/O requests
+ * will issue a retry, which will trigger the 'real' ereport with the correct
+ * ENA.
+ *
+ * We keep track of the ENA for a ZIO chain through the 'io_logical' member.
+ * When a new logical I/O is issued, we set this to point to itself. Child I/Os
+ * then inherit this pointer, so that once it is first set, subsequent failures
+ * will use the same ENA. For vdev cache fill and queue aggregation I/O,
+ * this pointer is set to NULL, and no ereport will be generated (since it
+ * doesn't actually correspond to any particular device or piece of data,
+ * and the caller will always retry without caching or queueing anyway).
+ *
+ * For checksum errors, we want to include more information about the actual
+ * error which occurs. Accordingly, we build an ereport when the error is
+ * noticed, but instead of sending it in immediately, we hang it off of the
+ * io_cksum_report field of the logical IO. When the logical IO completes
+ * (successfully or not), zfs_ereport_finish_checksum() is called with the
+ * good and bad versions of the buffer (if available), and we annotate the
+ * ereport with information about the differences.
+ */
+#ifdef _KERNEL
+static void
+zfs_ereport_start(nvlist_t **ereport_out, nvlist_t **detector_out,
+ const char *subclass, spa_t *spa, vdev_t *vd, zio_t *zio,
+ uint64_t stateoroffset, uint64_t size)
+{
+ nvlist_t *ereport, *detector;
+
+ uint64_t ena;
+ char class[64];
+
+ /*
+ * If we are doing a spa_tryimport() or in recovery mode,
+ * ignore errors.
+ */
+ if (spa_load_state(spa) == SPA_LOAD_TRYIMPORT ||
+ spa_load_state(spa) == SPA_LOAD_RECOVER)
+ return;
+
+ /*
+ * If we are in the middle of opening a pool, and the previous attempt
+ * failed, don't bother logging any new ereports - we're just going to
+ * get the same diagnosis anyway.
+ */
+ if (spa_load_state(spa) != SPA_LOAD_NONE &&
+ spa->spa_last_open_failed)
+ return;
+
+ if (zio != NULL) {
+ /*
+ * If this is not a read or write zio, ignore the error. This
+ * can occur if the DKIOCFLUSHWRITECACHE ioctl fails.
+ */
+ if (zio->io_type != ZIO_TYPE_READ &&
+ zio->io_type != ZIO_TYPE_WRITE)
+ return;
+
+ /*
+ * Ignore any errors from speculative I/Os, as failure is an
+ * expected result.
+ */
+ if (zio->io_flags & ZIO_FLAG_SPECULATIVE)
+ return;
+
+ /*
+		 * If this is an EIO error from an I/O that is not a retry,
+		 * don't post an ereport. Otherwise, we risk making bad
+		 * diagnoses based on B_FAILFAST I/Os.
+ */
+ if (zio->io_error == EIO &&
+ !(zio->io_flags & ZIO_FLAG_IO_RETRY))
+ return;
+
+ if (vd != NULL) {
+ /*
+ * If the vdev has already been marked as failing due
+ * to a failed probe, then ignore any subsequent I/O
+ * errors, as the DE will automatically fault the vdev
+ * on the first such failure. This also catches cases
+ * where vdev_remove_wanted is set and the device has
+ * not yet been asynchronously placed into the REMOVED
+ * state.
+ */
+ if (zio->io_vd == vd && !vdev_accessible(vd, zio))
+ return;
+
+ /*
+ * Ignore checksum errors for reads from DTL regions of
+ * leaf vdevs.
+ */
+ if (zio->io_type == ZIO_TYPE_READ &&
+ zio->io_error == ECKSUM &&
+ vd->vdev_ops->vdev_op_leaf &&
+ vdev_dtl_contains(vd, DTL_MISSING, zio->io_txg, 1))
+ return;
+ }
+ }
+
+ /*
+ * For probe failure, we want to avoid posting ereports if we've
+ * already removed the device in the meantime.
+ */
+ if (vd != NULL &&
+ strcmp(subclass, FM_EREPORT_ZFS_PROBE_FAILURE) == 0 &&
+ (vd->vdev_remove_wanted || vd->vdev_state == VDEV_STATE_REMOVED))
+ return;
+
+ if ((ereport = fm_nvlist_create(NULL)) == NULL)
+ return;
+
+ if ((detector = fm_nvlist_create(NULL)) == NULL) {
+ fm_nvlist_destroy(ereport, FM_NVA_FREE);
+ return;
+ }
+
+ /*
+ * Serialize ereport generation
+ */
+ mutex_enter(&spa->spa_errlist_lock);
+
+ /*
+ * Determine the ENA to use for this event. If we are in a loading
+ * state, use a SPA-wide ENA. Otherwise, if we are in an I/O state, use
+ * a root zio-wide ENA. Otherwise, simply use a unique ENA.
+ */
+ if (spa_load_state(spa) != SPA_LOAD_NONE) {
+ if (spa->spa_ena == 0)
+ spa->spa_ena = fm_ena_generate(0, FM_ENA_FMT1);
+ ena = spa->spa_ena;
+ } else if (zio != NULL && zio->io_logical != NULL) {
+ if (zio->io_logical->io_ena == 0)
+ zio->io_logical->io_ena =
+ fm_ena_generate(0, FM_ENA_FMT1);
+ ena = zio->io_logical->io_ena;
+ } else {
+ ena = fm_ena_generate(0, FM_ENA_FMT1);
+ }
+
+ /*
+ * Construct the full class, detector, and other standard FMA fields.
+ */
+ (void) snprintf(class, sizeof (class), "%s.%s",
+ ZFS_ERROR_CLASS, subclass);
+
+ fm_fmri_zfs_set(detector, FM_ZFS_SCHEME_VERSION, spa_guid(spa),
+ vd != NULL ? vd->vdev_guid : 0);
+
+ fm_ereport_set(ereport, FM_EREPORT_VERSION, class, ena, detector, NULL);
+
+ /*
+ * Construct the per-ereport payload, depending on which parameters are
+ * passed in.
+ */
+
+ /*
+ * Generic payload members common to all ereports.
+ */
+ fm_payload_set(ereport, FM_EREPORT_PAYLOAD_ZFS_POOL,
+ DATA_TYPE_STRING, spa_name(spa), FM_EREPORT_PAYLOAD_ZFS_POOL_GUID,
+ DATA_TYPE_UINT64, spa_guid(spa),
+ FM_EREPORT_PAYLOAD_ZFS_POOL_CONTEXT, DATA_TYPE_INT32,
+ spa_load_state(spa), NULL);
+
+ if (spa != NULL) {
+ fm_payload_set(ereport, FM_EREPORT_PAYLOAD_ZFS_POOL_FAILMODE,
+ DATA_TYPE_STRING,
+ spa_get_failmode(spa) == ZIO_FAILURE_MODE_WAIT ?
+ FM_EREPORT_FAILMODE_WAIT :
+ spa_get_failmode(spa) == ZIO_FAILURE_MODE_CONTINUE ?
+ FM_EREPORT_FAILMODE_CONTINUE : FM_EREPORT_FAILMODE_PANIC,
+ NULL);
+ }
+
+ if (vd != NULL) {
+ vdev_t *pvd = vd->vdev_parent;
+
+ fm_payload_set(ereport, FM_EREPORT_PAYLOAD_ZFS_VDEV_GUID,
+ DATA_TYPE_UINT64, vd->vdev_guid,
+ FM_EREPORT_PAYLOAD_ZFS_VDEV_TYPE,
+ DATA_TYPE_STRING, vd->vdev_ops->vdev_op_type, NULL);
+ if (vd->vdev_path != NULL)
+ fm_payload_set(ereport,
+ FM_EREPORT_PAYLOAD_ZFS_VDEV_PATH,
+ DATA_TYPE_STRING, vd->vdev_path, NULL);
+ if (vd->vdev_devid != NULL)
+ fm_payload_set(ereport,
+ FM_EREPORT_PAYLOAD_ZFS_VDEV_DEVID,
+ DATA_TYPE_STRING, vd->vdev_devid, NULL);
+ if (vd->vdev_fru != NULL)
+ fm_payload_set(ereport,
+ FM_EREPORT_PAYLOAD_ZFS_VDEV_FRU,
+ DATA_TYPE_STRING, vd->vdev_fru, NULL);
+
+ if (pvd != NULL) {
+ fm_payload_set(ereport,
+ FM_EREPORT_PAYLOAD_ZFS_PARENT_GUID,
+ DATA_TYPE_UINT64, pvd->vdev_guid,
+ FM_EREPORT_PAYLOAD_ZFS_PARENT_TYPE,
+ DATA_TYPE_STRING, pvd->vdev_ops->vdev_op_type,
+ NULL);
+ if (pvd->vdev_path)
+ fm_payload_set(ereport,
+ FM_EREPORT_PAYLOAD_ZFS_PARENT_PATH,
+ DATA_TYPE_STRING, pvd->vdev_path, NULL);
+ if (pvd->vdev_devid)
+ fm_payload_set(ereport,
+ FM_EREPORT_PAYLOAD_ZFS_PARENT_DEVID,
+ DATA_TYPE_STRING, pvd->vdev_devid, NULL);
+ }
+ }
+
+ if (zio != NULL) {
+ /*
+ * Payload common to all I/Os.
+ */
+ fm_payload_set(ereport, FM_EREPORT_PAYLOAD_ZFS_ZIO_ERR,
+ DATA_TYPE_INT32, zio->io_error, NULL);
+
+ /*
+ * If the 'size' parameter is non-zero, it indicates this is a
+ * RAID-Z or other I/O where the physical offset and length are
+ * provided for us, instead of within the zio_t.
+ */
+ if (vd != NULL) {
+ if (size)
+ fm_payload_set(ereport,
+ FM_EREPORT_PAYLOAD_ZFS_ZIO_OFFSET,
+ DATA_TYPE_UINT64, stateoroffset,
+ FM_EREPORT_PAYLOAD_ZFS_ZIO_SIZE,
+ DATA_TYPE_UINT64, size, NULL);
+ else
+ fm_payload_set(ereport,
+ FM_EREPORT_PAYLOAD_ZFS_ZIO_OFFSET,
+ DATA_TYPE_UINT64, zio->io_offset,
+ FM_EREPORT_PAYLOAD_ZFS_ZIO_SIZE,
+ DATA_TYPE_UINT64, zio->io_size, NULL);
+ }
+
+ /*
+ * Payload for I/Os with corresponding logical information.
+ */
+ if (zio->io_logical != NULL)
+ fm_payload_set(ereport,
+ FM_EREPORT_PAYLOAD_ZFS_ZIO_OBJSET,
+ DATA_TYPE_UINT64,
+ zio->io_logical->io_bookmark.zb_objset,
+ FM_EREPORT_PAYLOAD_ZFS_ZIO_OBJECT,
+ DATA_TYPE_UINT64,
+ zio->io_logical->io_bookmark.zb_object,
+ FM_EREPORT_PAYLOAD_ZFS_ZIO_LEVEL,
+ DATA_TYPE_INT64,
+ zio->io_logical->io_bookmark.zb_level,
+ FM_EREPORT_PAYLOAD_ZFS_ZIO_BLKID,
+ DATA_TYPE_UINT64,
+ zio->io_logical->io_bookmark.zb_blkid, NULL);
+ } else if (vd != NULL) {
+ /*
+ * If we have a vdev but no zio, this is a device fault, and the
+ * 'stateoroffset' parameter indicates the previous state of the
+ * vdev.
+ */
+ fm_payload_set(ereport,
+ FM_EREPORT_PAYLOAD_ZFS_PREV_STATE,
+ DATA_TYPE_UINT64, stateoroffset, NULL);
+ }
+
+ mutex_exit(&spa->spa_errlist_lock);
+
+ *ereport_out = ereport;
+ *detector_out = detector;
+}
+
+/* if it's <= 128 bytes, save the corruption directly */
+#define ZFM_MAX_INLINE (128 / sizeof (uint64_t))
+
+#define MAX_RANGES 16
+
+typedef struct zfs_ecksum_info {
+ /* histograms of set and cleared bits by bit number in a 64-bit word */
+ uint32_t zei_histogram_set[sizeof (uint64_t) * NBBY];
+ uint32_t zei_histogram_cleared[sizeof (uint64_t) * NBBY];
+
+ /* inline arrays of bits set and cleared. */
+ uint64_t zei_bits_set[ZFM_MAX_INLINE];
+ uint64_t zei_bits_cleared[ZFM_MAX_INLINE];
+
+ /*
+ * for each range, the number of bits set and cleared. The Hamming
+ * distance between the good and bad buffers is the sum of them all.
+ */
+ uint32_t zei_range_sets[MAX_RANGES];
+ uint32_t zei_range_clears[MAX_RANGES];
+
+ struct zei_ranges {
+ uint32_t zr_start;
+ uint32_t zr_end;
+ } zei_ranges[MAX_RANGES];
+
+ size_t zei_range_count;
+ uint32_t zei_mingap;
+ uint32_t zei_allowed_mingap;
+
+} zfs_ecksum_info_t;
+
+static void
+update_histogram(uint64_t value_arg, uint32_t *hist, uint32_t *count)
+{
+ size_t i;
+ size_t bits = 0;
+ uint64_t value = BE_64(value_arg);
+
+ /* We store the bits in big-endian (largest-first) order */
+ for (i = 0; i < 64; i++) {
+ if (value & (1ull << i)) {
+ hist[63 - i]++;
+ ++bits;
+ }
+ }
+ /* update the count of bits changed */
+ *count += bits;
+}
+
+/*
+ * We've now filled up the range array, and need to increase "mingap" and
+ * shrink the range list accordingly. zei_mingap is always the smallest
+ * distance between array entries, so we set the new_allowed_gap to be
+ * one greater than that. We then go through the list, joining together
+ * any ranges which are closer than the new_allowed_gap.
+ *
+ * By construction, at least one pair of ranges will be joined. We also
+ * update zei_mingap to the new smallest gap, to prepare for our next
+ * invocation.
+ */
+static void
+shrink_ranges(zfs_ecksum_info_t *eip)
+{
+ uint32_t mingap = UINT32_MAX;
+ uint32_t new_allowed_gap = eip->zei_mingap + 1;
+
+ size_t idx, output;
+ size_t max = eip->zei_range_count;
+
+ struct zei_ranges *r = eip->zei_ranges;
+
+ ASSERT3U(eip->zei_range_count, >, 0);
+ ASSERT3U(eip->zei_range_count, <=, MAX_RANGES);
+
+ output = idx = 0;
+ while (idx < max - 1) {
+ uint32_t start = r[idx].zr_start;
+ uint32_t end = r[idx].zr_end;
+
+ while (idx < max - 1) {
+ idx++;
+
+ uint32_t nstart = r[idx].zr_start;
+ uint32_t nend = r[idx].zr_end;
+
+ uint32_t gap = nstart - end;
+ if (gap < new_allowed_gap) {
+ end = nend;
+ continue;
+ }
+ if (gap < mingap)
+ mingap = gap;
+ break;
+ }
+ r[output].zr_start = start;
+ r[output].zr_end = end;
+ output++;
+ }
+ ASSERT3U(output, <, eip->zei_range_count);
+ eip->zei_range_count = output;
+ eip->zei_mingap = mingap;
+ eip->zei_allowed_mingap = new_allowed_gap;
+}
+
+static void
+add_range(zfs_ecksum_info_t *eip, int start, int end)
+{
+ struct zei_ranges *r = eip->zei_ranges;
+ size_t count = eip->zei_range_count;
+
+ if (count >= MAX_RANGES) {
+ shrink_ranges(eip);
+ count = eip->zei_range_count;
+ }
+ if (count == 0) {
+ eip->zei_mingap = UINT32_MAX;
+ eip->zei_allowed_mingap = 1;
+ } else {
+ int gap = start - r[count - 1].zr_end;
+
+ if (gap < eip->zei_allowed_mingap) {
+ r[count - 1].zr_end = end;
+ return;
+ }
+ if (gap < eip->zei_mingap)
+ eip->zei_mingap = gap;
+ }
+ r[count].zr_start = start;
+ r[count].zr_end = end;
+ eip->zei_range_count++;
+}
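+
+/*
+ * Illustrative example (not from the original source): with
+ * zei_allowed_mingap == 2, successive add_range() calls for (0,1), (2,3),
+ * and (8,9) coalesce to [(0,3), (8,9)]: the gap of 1 between the first two
+ * ranges is below the allowed minimum, while the gap of 5 to the third is
+ * not and is recorded as the new zei_mingap.
+ */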
+
+static size_t
+range_total_size(zfs_ecksum_info_t *eip)
+{
+ struct zei_ranges *r = eip->zei_ranges;
+ size_t count = eip->zei_range_count;
+ size_t result = 0;
+ size_t idx;
+
+ for (idx = 0; idx < count; idx++)
+ result += (r[idx].zr_end - r[idx].zr_start);
+
+ return (result);
+}
+
+static zfs_ecksum_info_t *
+annotate_ecksum(nvlist_t *ereport, zio_bad_cksum_t *info,
+ const uint8_t *goodbuf, const uint8_t *badbuf, size_t size,
+ boolean_t drop_if_identical)
+{
+ const uint64_t *good = (const uint64_t *)goodbuf;
+ const uint64_t *bad = (const uint64_t *)badbuf;
+
+ uint64_t allset = 0;
+ uint64_t allcleared = 0;
+
+ size_t nui64s = size / sizeof (uint64_t);
+
+ size_t inline_size;
+ int no_inline = 0;
+ size_t idx;
+ size_t range;
+
+ size_t offset = 0;
+ ssize_t start = -1;
+
+ zfs_ecksum_info_t *eip = kmem_zalloc(sizeof (*eip), KM_SLEEP);
+
+ /* don't do any annotation for injected checksum errors */
+ if (info != NULL && info->zbc_injected)
+ return (eip);
+
+ if (info != NULL && info->zbc_has_cksum) {
+ fm_payload_set(ereport,
+ FM_EREPORT_PAYLOAD_ZFS_CKSUM_EXPECTED,
+ DATA_TYPE_UINT64_ARRAY,
+ sizeof (info->zbc_expected) / sizeof (uint64_t),
+ (uint64_t *)&info->zbc_expected,
+ FM_EREPORT_PAYLOAD_ZFS_CKSUM_ACTUAL,
+ DATA_TYPE_UINT64_ARRAY,
+ sizeof (info->zbc_actual) / sizeof (uint64_t),
+ (uint64_t *)&info->zbc_actual,
+ FM_EREPORT_PAYLOAD_ZFS_CKSUM_ALGO,
+ DATA_TYPE_STRING,
+ info->zbc_checksum_name,
+ NULL);
+
+ if (info->zbc_byteswapped) {
+ fm_payload_set(ereport,
+ FM_EREPORT_PAYLOAD_ZFS_CKSUM_BYTESWAP,
+ DATA_TYPE_BOOLEAN, 1,
+ NULL);
+ }
+ }
+
+ if (badbuf == NULL || goodbuf == NULL)
+ return (eip);
+
+ ASSERT3U(nui64s, <=, UINT32_MAX);
+ ASSERT3U(size, ==, nui64s * sizeof (uint64_t));
+ ASSERT3U(size, <=, SPA_MAXBLOCKSIZE);
+ ASSERT3U(size, <=, UINT32_MAX);
+
+ /* build up the range list by comparing the two buffers. */
+ for (idx = 0; idx < nui64s; idx++) {
+ if (good[idx] == bad[idx]) {
+ if (start == -1)
+ continue;
+
+ add_range(eip, start, idx);
+ start = -1;
+ } else {
+ if (start != -1)
+ continue;
+
+ start = idx;
+ }
+ }
+ if (start != -1)
+ add_range(eip, start, idx);
+
+ /* See if it will fit in our inline buffers */
+ inline_size = range_total_size(eip);
+ if (inline_size > ZFM_MAX_INLINE)
+ no_inline = 1;
+
+ /*
+ * If there is no change and we want to drop if the buffers are
+ * identical, do so.
+ */
+ if (inline_size == 0 && drop_if_identical) {
+ kmem_free(eip, sizeof (*eip));
+ return (NULL);
+ }
+
+ /*
+ * Now walk through the ranges, filling in the details of the
+ * differences. Also convert our uint64_t-array offsets to byte
+ * offsets.
+ */
+ for (range = 0; range < eip->zei_range_count; range++) {
+ size_t start = eip->zei_ranges[range].zr_start;
+ size_t end = eip->zei_ranges[range].zr_end;
+
+ for (idx = start; idx < end; idx++) {
+ uint64_t set, cleared;
+
+			/* bits set in bad, but not in good */
+ set = ((~good[idx]) & bad[idx]);
+			/* bits set in good, but not in bad */
+ cleared = (good[idx] & (~bad[idx]));
+
+ allset |= set;
+ allcleared |= cleared;
+
+ if (!no_inline) {
+ ASSERT3U(offset, <, inline_size);
+ eip->zei_bits_set[offset] = set;
+ eip->zei_bits_cleared[offset] = cleared;
+ offset++;
+ }
+
+ update_histogram(set, eip->zei_histogram_set,
+ &eip->zei_range_sets[range]);
+ update_histogram(cleared, eip->zei_histogram_cleared,
+ &eip->zei_range_clears[range]);
+ }
+
+ /* convert to byte offsets */
+ eip->zei_ranges[range].zr_start *= sizeof (uint64_t);
+ eip->zei_ranges[range].zr_end *= sizeof (uint64_t);
+ }
+ eip->zei_allowed_mingap *= sizeof (uint64_t);
+ inline_size *= sizeof (uint64_t);
+
+ /* fill in ereport */
+ fm_payload_set(ereport,
+ FM_EREPORT_PAYLOAD_ZFS_BAD_OFFSET_RANGES,
+ DATA_TYPE_UINT32_ARRAY, 2 * eip->zei_range_count,
+ (uint32_t *)eip->zei_ranges,
+ FM_EREPORT_PAYLOAD_ZFS_BAD_RANGE_MIN_GAP,
+ DATA_TYPE_UINT32, eip->zei_allowed_mingap,
+ FM_EREPORT_PAYLOAD_ZFS_BAD_RANGE_SETS,
+ DATA_TYPE_UINT32_ARRAY, eip->zei_range_count, eip->zei_range_sets,
+ FM_EREPORT_PAYLOAD_ZFS_BAD_RANGE_CLEARS,
+ DATA_TYPE_UINT32_ARRAY, eip->zei_range_count, eip->zei_range_clears,
+ NULL);
+
+ if (!no_inline) {
+ fm_payload_set(ereport,
+ FM_EREPORT_PAYLOAD_ZFS_BAD_SET_BITS,
+ DATA_TYPE_UINT8_ARRAY,
+ inline_size, (uint8_t *)eip->zei_bits_set,
+ FM_EREPORT_PAYLOAD_ZFS_BAD_CLEARED_BITS,
+ DATA_TYPE_UINT8_ARRAY,
+ inline_size, (uint8_t *)eip->zei_bits_cleared,
+ NULL);
+ } else {
+ fm_payload_set(ereport,
+ FM_EREPORT_PAYLOAD_ZFS_BAD_SET_HISTOGRAM,
+ DATA_TYPE_UINT32_ARRAY,
+ NBBY * sizeof (uint64_t), eip->zei_histogram_set,
+ FM_EREPORT_PAYLOAD_ZFS_BAD_CLEARED_HISTOGRAM,
+ DATA_TYPE_UINT32_ARRAY,
+ NBBY * sizeof (uint64_t), eip->zei_histogram_cleared,
+ NULL);
+ }
+ return (eip);
+}
+#endif
+
+void
+zfs_ereport_post(const char *subclass, spa_t *spa, vdev_t *vd, zio_t *zio,
+ uint64_t stateoroffset, uint64_t size)
+{
+#ifdef _KERNEL
+ nvlist_t *ereport = NULL;
+ nvlist_t *detector = NULL;
+
+ zfs_ereport_start(&ereport, &detector,
+ subclass, spa, vd, zio, stateoroffset, size);
+
+ if (ereport == NULL)
+ return;
+
+ fm_ereport_post(ereport, EVCH_SLEEP);
+
+ fm_nvlist_destroy(ereport, FM_NVA_FREE);
+ fm_nvlist_destroy(detector, FM_NVA_FREE);
+#endif
+}
+
+void
+zfs_ereport_start_checksum(spa_t *spa, vdev_t *vd,
+ struct zio *zio, uint64_t offset, uint64_t length, void *arg,
+ zio_bad_cksum_t *info)
+{
+ zio_cksum_report_t *report = kmem_zalloc(sizeof (*report), KM_SLEEP);
+
+ if (zio->io_vsd != NULL)
+ zio->io_vsd_ops->vsd_cksum_report(zio, report, arg);
+ else
+ zio_vsd_default_cksum_report(zio, report, arg);
+
+ /* copy the checksum failure information if it was provided */
+ if (info != NULL) {
+ report->zcr_ckinfo = kmem_zalloc(sizeof (*info), KM_SLEEP);
+ bcopy(info, report->zcr_ckinfo, sizeof (*info));
+ }
+
+ report->zcr_align = 1ULL << vd->vdev_top->vdev_ashift;
+ report->zcr_length = length;
+
+#ifdef _KERNEL
+ zfs_ereport_start(&report->zcr_ereport, &report->zcr_detector,
+ FM_EREPORT_ZFS_CHECKSUM, spa, vd, zio, offset, length);
+
+ if (report->zcr_ereport == NULL) {
+ report->zcr_free(report->zcr_cbdata, report->zcr_cbinfo);
+ if (report->zcr_ckinfo != NULL) {
+ kmem_free(report->zcr_ckinfo,
+ sizeof (*report->zcr_ckinfo));
+ }
+ kmem_free(report, sizeof (*report));
+ return;
+ }
+#endif
+
+ mutex_enter(&spa->spa_errlist_lock);
+ report->zcr_next = zio->io_logical->io_cksum_report;
+ zio->io_logical->io_cksum_report = report;
+ mutex_exit(&spa->spa_errlist_lock);
+}
+
+void
+zfs_ereport_finish_checksum(zio_cksum_report_t *report,
+ const void *good_data, const void *bad_data, boolean_t drop_if_identical)
+{
+#ifdef _KERNEL
+ zfs_ecksum_info_t *info = NULL;
+ info = annotate_ecksum(report->zcr_ereport, report->zcr_ckinfo,
+ good_data, bad_data, report->zcr_length, drop_if_identical);
+
+ if (info != NULL)
+ fm_ereport_post(report->zcr_ereport, EVCH_SLEEP);
+
+ fm_nvlist_destroy(report->zcr_ereport, FM_NVA_FREE);
+ fm_nvlist_destroy(report->zcr_detector, FM_NVA_FREE);
+ report->zcr_ereport = report->zcr_detector = NULL;
+
+ if (info != NULL)
+ kmem_free(info, sizeof (*info));
+#endif
+}
+
+void
+zfs_ereport_free_checksum(zio_cksum_report_t *rpt)
+{
+#ifdef _KERNEL
+ if (rpt->zcr_ereport != NULL) {
+ fm_nvlist_destroy(rpt->zcr_ereport,
+ FM_NVA_FREE);
+ fm_nvlist_destroy(rpt->zcr_detector,
+ FM_NVA_FREE);
+ }
+#endif
+ rpt->zcr_free(rpt->zcr_cbdata, rpt->zcr_cbinfo);
+
+ if (rpt->zcr_ckinfo != NULL)
+ kmem_free(rpt->zcr_ckinfo, sizeof (*rpt->zcr_ckinfo));
+
+ kmem_free(rpt, sizeof (*rpt));
+}
+
+void
+zfs_ereport_send_interim_checksum(zio_cksum_report_t *report)
+{
+#ifdef _KERNEL
+ fm_ereport_post(report->zcr_ereport, EVCH_SLEEP);
+#endif
+}
+
+void
+zfs_ereport_post_checksum(spa_t *spa, vdev_t *vd,
+ struct zio *zio, uint64_t offset, uint64_t length,
+ const void *good_data, const void *bad_data, zio_bad_cksum_t *zbc)
+{
+#ifdef _KERNEL
+ nvlist_t *ereport = NULL;
+ nvlist_t *detector = NULL;
+ zfs_ecksum_info_t *info;
+
+ zfs_ereport_start(&ereport, &detector,
+ FM_EREPORT_ZFS_CHECKSUM, spa, vd, zio, offset, length);
+
+ if (ereport == NULL)
+ return;
+
+ info = annotate_ecksum(ereport, zbc, good_data, bad_data, length,
+ B_FALSE);
+
+ if (info != NULL)
+ fm_ereport_post(ereport, EVCH_SLEEP);
+
+ fm_nvlist_destroy(ereport, FM_NVA_FREE);
+ fm_nvlist_destroy(detector, FM_NVA_FREE);
+
+ if (info != NULL)
+ kmem_free(info, sizeof (*info));
+#endif
+}
+
+static void
+zfs_post_common(spa_t *spa, vdev_t *vd, const char *name)
+{
+#ifdef _KERNEL
+ nvlist_t *resource;
+ char class[64];
+
+ if (spa_load_state(spa) == SPA_LOAD_TRYIMPORT)
+ return;
+
+ if ((resource = fm_nvlist_create(NULL)) == NULL)
+ return;
+
+ (void) snprintf(class, sizeof (class), "%s.%s.%s", FM_RSRC_RESOURCE,
+ ZFS_ERROR_CLASS, name);
+ VERIFY(nvlist_add_uint8(resource, FM_VERSION, FM_RSRC_VERSION) == 0);
+ VERIFY(nvlist_add_string(resource, FM_CLASS, class) == 0);
+ VERIFY(nvlist_add_uint64(resource,
+ FM_EREPORT_PAYLOAD_ZFS_POOL_GUID, spa_guid(spa)) == 0);
+ if (vd)
+ VERIFY(nvlist_add_uint64(resource,
+ FM_EREPORT_PAYLOAD_ZFS_VDEV_GUID, vd->vdev_guid) == 0);
+
+ fm_ereport_post(resource, EVCH_SLEEP);
+
+ fm_nvlist_destroy(resource, FM_NVA_FREE);
+#endif
+}
+
+/*
+ * The 'resource.fs.zfs.removed' event is an internal signal that the given vdev
+ * has been removed from the system. This will cause the DE to ignore any
+ * recent I/O errors, inferring that they are due to the asynchronous device
+ * removal.
+ */
+void
+zfs_post_remove(spa_t *spa, vdev_t *vd)
+{
+ zfs_post_common(spa, vd, FM_RESOURCE_REMOVED);
+}
+
+/*
+ * The 'resource.fs.zfs.autoreplace' event is an internal signal that the pool
+ * has the 'autoreplace' property set, and therefore any broken vdevs will be
+ * handled by higher level logic, and no vdev fault should be generated.
+ */
+void
+zfs_post_autoreplace(spa_t *spa, vdev_t *vd)
+{
+ zfs_post_common(spa, vd, FM_RESOURCE_AUTOREPLACE);
+}
+
+/*
+ * The 'resource.fs.zfs.statechange' event is an internal signal that the
+ * given vdev has transitioned its state to DEGRADED or HEALTHY. This will
+ * cause the retire agent to repair any outstanding fault management cases
+ * open because the device was not found (fault.fs.zfs.device).
+ */
+void
+zfs_post_state_change(spa_t *spa, vdev_t *vd)
+{
+ zfs_post_common(spa, vd, FM_RESOURCE_STATECHANGE);
+}
diff --git a/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/zfs_fuid.c b/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/zfs_fuid.c
new file mode 100644
index 000000000000..581b6b1bfb64
--- /dev/null
+++ b/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/zfs_fuid.c
@@ -0,0 +1,762 @@
+/*
+ * CDDL HEADER START
+ *
+ * The contents of this file are subject to the terms of the
+ * Common Development and Distribution License (the "License").
+ * You may not use this file except in compliance with the License.
+ *
+ * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
+ * or http://www.opensolaris.org/os/licensing.
+ * See the License for the specific language governing permissions
+ * and limitations under the License.
+ *
+ * When distributing Covered Code, include this CDDL HEADER in each
+ * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
+ * If applicable, add the following below this CDDL HEADER, with the
+ * fields enclosed by brackets "[]" replaced with your own identifying
+ * information: Portions Copyright [yyyy] [name of copyright owner]
+ *
+ * CDDL HEADER END
+ */
+/*
+ * Copyright (c) 2007, 2010, Oracle and/or its affiliates. All rights reserved.
+ */
+
+#include <sys/zfs_context.h>
+#include <sys/dmu.h>
+#include <sys/avl.h>
+#include <sys/zap.h>
+#include <sys/refcount.h>
+#include <sys/nvpair.h>
+#ifdef _KERNEL
+#include <sys/kidmap.h>
+#include <sys/sid.h>
+#include <sys/zfs_vfsops.h>
+#include <sys/zfs_znode.h>
+#endif
+#include <sys/zfs_fuid.h>
+
+/*
+ * FUID Domain table(s).
+ *
+ * The FUID table is stored as a packed nvlist of an array
+ * of nvlists, each of which contains an index, a domain string, and an
+ * offset.
+ *
+ * During file system initialization the nvlist(s) are read and
+ * two AVL trees are created. One tree is keyed by the index number
+ * and the other by the domain string. Nodes are never removed from the
+ * trees, but new entries may be added. If a new entry is added, the
+ * zfsvfs->z_fuid_dirty flag is set to true and the caller will then
+ * be responsible for calling zfs_fuid_sync() to sync the changes to disk.
+ */
+
+#define FUID_IDX "fuid_idx"
+#define FUID_DOMAIN "fuid_domain"
+#define FUID_OFFSET "fuid_offset"
+#define FUID_NVP_ARRAY "fuid_nvlist"
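+
+/*
+ * Illustrative sketch (hypothetical values, not from the original source):
+ * the packed nvlist described above roughly corresponds to the following
+ * layout, using the keys defined above:
+ *
+ *	fuid_nvlist = [
+ *		{ fuid_idx = 1, fuid_domain = "S-1-5-21-...", fuid_offset = 0 },
+ *		{ fuid_idx = 2, fuid_domain = "S-1-5-32", fuid_offset = 0 },
+ *	]
+ */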
+
+typedef struct fuid_domain {
+ avl_node_t f_domnode;
+ avl_node_t f_idxnode;
+ ksiddomain_t *f_ksid;
+ uint64_t f_idx;
+} fuid_domain_t;
+
+static char *nulldomain = "";
+
+/*
+ * Compare two indexes.
+ */
+static int
+idx_compare(const void *arg1, const void *arg2)
+{
+ const fuid_domain_t *node1 = (const fuid_domain_t *)arg1;
+ const fuid_domain_t *node2 = (const fuid_domain_t *)arg2;
+
+ return (AVL_CMP(node1->f_idx, node2->f_idx));
+}
+
+/*
+ * Compare two domain strings.
+ */
+static int
+domain_compare(const void *arg1, const void *arg2)
+{
+ const fuid_domain_t *node1 = (const fuid_domain_t *)arg1;
+ const fuid_domain_t *node2 = (const fuid_domain_t *)arg2;
+ int val;
+
+ val = strcmp(node1->f_ksid->kd_name, node2->f_ksid->kd_name);
+
+ return (AVL_ISIGN(val));
+}
+
+void
+zfs_fuid_avl_tree_create(avl_tree_t *idx_tree, avl_tree_t *domain_tree)
+{
+ avl_create(idx_tree, idx_compare,
+ sizeof (fuid_domain_t), offsetof(fuid_domain_t, f_idxnode));
+ avl_create(domain_tree, domain_compare,
+ sizeof (fuid_domain_t), offsetof(fuid_domain_t, f_domnode));
+}
+
+/*
+ * Load the initial FUID domain and index trees. This function is used by
+ * both the kernel and zdb.
+ */
+uint64_t
+zfs_fuid_table_load(objset_t *os, uint64_t fuid_obj, avl_tree_t *idx_tree,
+ avl_tree_t *domain_tree)
+{
+ dmu_buf_t *db;
+ uint64_t fuid_size;
+
+ ASSERT(fuid_obj != 0);
+ VERIFY(0 == dmu_bonus_hold(os, fuid_obj,
+ FTAG, &db));
+ fuid_size = *(uint64_t *)db->db_data;
+ dmu_buf_rele(db, FTAG);
+
+ if (fuid_size) {
+ nvlist_t **fuidnvp;
+ nvlist_t *nvp = NULL;
+ uint_t count;
+ char *packed;
+ int i;
+
+ packed = kmem_alloc(fuid_size, KM_SLEEP);
+ VERIFY(dmu_read(os, fuid_obj, 0,
+ fuid_size, packed, DMU_READ_PREFETCH) == 0);
+ VERIFY(nvlist_unpack(packed, fuid_size,
+ &nvp, 0) == 0);
+ VERIFY(nvlist_lookup_nvlist_array(nvp, FUID_NVP_ARRAY,
+ &fuidnvp, &count) == 0);
+
+ for (i = 0; i != count; i++) {
+ fuid_domain_t *domnode;
+ char *domain;
+ uint64_t idx;
+
+ VERIFY(nvlist_lookup_string(fuidnvp[i], FUID_DOMAIN,
+ &domain) == 0);
+ VERIFY(nvlist_lookup_uint64(fuidnvp[i], FUID_IDX,
+ &idx) == 0);
+
+ domnode = kmem_alloc(sizeof (fuid_domain_t), KM_SLEEP);
+
+ domnode->f_idx = idx;
+ domnode->f_ksid = ksid_lookupdomain(domain);
+ avl_add(idx_tree, domnode);
+ avl_add(domain_tree, domnode);
+ }
+ nvlist_free(nvp);
+ kmem_free(packed, fuid_size);
+ }
+ return (fuid_size);
+}
+
+void
+zfs_fuid_table_destroy(avl_tree_t *idx_tree, avl_tree_t *domain_tree)
+{
+ fuid_domain_t *domnode;
+ void *cookie;
+
+ cookie = NULL;
+	while ((domnode = avl_destroy_nodes(domain_tree, &cookie)) != NULL)
+ ksiddomain_rele(domnode->f_ksid);
+
+ avl_destroy(domain_tree);
+ cookie = NULL;
+	while ((domnode = avl_destroy_nodes(idx_tree, &cookie)) != NULL)
+ kmem_free(domnode, sizeof (fuid_domain_t));
+ avl_destroy(idx_tree);
+}
+
+char *
+zfs_fuid_idx_domain(avl_tree_t *idx_tree, uint32_t idx)
+{
+ fuid_domain_t searchnode, *findnode;
+ avl_index_t loc;
+
+ searchnode.f_idx = idx;
+
+ findnode = avl_find(idx_tree, &searchnode, &loc);
+
+ return (findnode ? findnode->f_ksid->kd_name : nulldomain);
+}
+
+#ifdef _KERNEL
+/*
+ * Load the fuid table(s) into memory.
+ */
+static void
+zfs_fuid_init(zfsvfs_t *zfsvfs)
+{
+ rw_enter(&zfsvfs->z_fuid_lock, RW_WRITER);
+
+ if (zfsvfs->z_fuid_loaded) {
+ rw_exit(&zfsvfs->z_fuid_lock);
+ return;
+ }
+
+ zfs_fuid_avl_tree_create(&zfsvfs->z_fuid_idx, &zfsvfs->z_fuid_domain);
+
+ (void) zap_lookup(zfsvfs->z_os, MASTER_NODE_OBJ,
+ ZFS_FUID_TABLES, 8, 1, &zfsvfs->z_fuid_obj);
+ if (zfsvfs->z_fuid_obj != 0) {
+ zfsvfs->z_fuid_size = zfs_fuid_table_load(zfsvfs->z_os,
+ zfsvfs->z_fuid_obj, &zfsvfs->z_fuid_idx,
+ &zfsvfs->z_fuid_domain);
+ }
+
+ zfsvfs->z_fuid_loaded = B_TRUE;
+ rw_exit(&zfsvfs->z_fuid_lock);
+}
+
+/*
+ * Sync out the AVL trees to persistent storage.
+ */
+void
+zfs_fuid_sync(zfsvfs_t *zfsvfs, dmu_tx_t *tx)
+{
+ nvlist_t *nvp;
+ nvlist_t **fuids;
+ size_t nvsize = 0;
+ char *packed;
+ dmu_buf_t *db;
+ fuid_domain_t *domnode;
+ int numnodes;
+ int i;
+
+ if (!zfsvfs->z_fuid_dirty) {
+ return;
+ }
+
+ rw_enter(&zfsvfs->z_fuid_lock, RW_WRITER);
+
+ /*
+	 * First, see if the table needs to be created.
+ */
+ if (zfsvfs->z_fuid_obj == 0) {
+ zfsvfs->z_fuid_obj = dmu_object_alloc(zfsvfs->z_os,
+ DMU_OT_FUID, 1 << 14, DMU_OT_FUID_SIZE,
+ sizeof (uint64_t), tx);
+ VERIFY(zap_add(zfsvfs->z_os, MASTER_NODE_OBJ,
+ ZFS_FUID_TABLES, sizeof (uint64_t), 1,
+ &zfsvfs->z_fuid_obj, tx) == 0);
+ }
+
+ VERIFY(nvlist_alloc(&nvp, NV_UNIQUE_NAME, KM_SLEEP) == 0);
+
+ numnodes = avl_numnodes(&zfsvfs->z_fuid_idx);
+ fuids = kmem_alloc(numnodes * sizeof (void *), KM_SLEEP);
+ for (i = 0, domnode = avl_first(&zfsvfs->z_fuid_domain); domnode; i++,
+ domnode = AVL_NEXT(&zfsvfs->z_fuid_domain, domnode)) {
+ VERIFY(nvlist_alloc(&fuids[i], NV_UNIQUE_NAME, KM_SLEEP) == 0);
+ VERIFY(nvlist_add_uint64(fuids[i], FUID_IDX,
+ domnode->f_idx) == 0);
+ VERIFY(nvlist_add_uint64(fuids[i], FUID_OFFSET, 0) == 0);
+ VERIFY(nvlist_add_string(fuids[i], FUID_DOMAIN,
+ domnode->f_ksid->kd_name) == 0);
+ }
+ VERIFY(nvlist_add_nvlist_array(nvp, FUID_NVP_ARRAY,
+ fuids, numnodes) == 0);
+ for (i = 0; i != numnodes; i++)
+ nvlist_free(fuids[i]);
+ kmem_free(fuids, numnodes * sizeof (void *));
+ VERIFY(nvlist_size(nvp, &nvsize, NV_ENCODE_XDR) == 0);
+ packed = kmem_alloc(nvsize, KM_SLEEP);
+ VERIFY(nvlist_pack(nvp, &packed, &nvsize,
+ NV_ENCODE_XDR, KM_SLEEP) == 0);
+ nvlist_free(nvp);
+ zfsvfs->z_fuid_size = nvsize;
+ dmu_write(zfsvfs->z_os, zfsvfs->z_fuid_obj, 0,
+ zfsvfs->z_fuid_size, packed, tx);
+ kmem_free(packed, zfsvfs->z_fuid_size);
+ VERIFY(0 == dmu_bonus_hold(zfsvfs->z_os, zfsvfs->z_fuid_obj,
+ FTAG, &db));
+ dmu_buf_will_dirty(db, tx);
+ *(uint64_t *)db->db_data = zfsvfs->z_fuid_size;
+ dmu_buf_rele(db, FTAG);
+
+ zfsvfs->z_fuid_dirty = B_FALSE;
+ rw_exit(&zfsvfs->z_fuid_lock);
+}
+
+/*
+ * Query domain table for a given domain.
+ *
+ * If the domain isn't found and addok is set, it is added to the AVL trees and
+ * the zfsvfs->z_fuid_dirty flag will be set to TRUE. It will then be
+ * necessary for the caller or another thread to detect the dirty table
+ * and sync out the changes.
+ */
+int
+zfs_fuid_find_by_domain(zfsvfs_t *zfsvfs, const char *domain,
+ char **retdomain, boolean_t addok)
+{
+ fuid_domain_t searchnode, *findnode;
+ avl_index_t loc;
+ krw_t rw = RW_READER;
+
+ /*
+	 * If this is the dummy "nobody" domain, return an index of 0
+ * to cause the created FUID to be a standard POSIX id
+ * for the user nobody.
+ */
+ if (domain[0] == '\0') {
+ if (retdomain)
+ *retdomain = nulldomain;
+ return (0);
+ }
+
+ searchnode.f_ksid = ksid_lookupdomain(domain);
+ if (retdomain)
+ *retdomain = searchnode.f_ksid->kd_name;
+ if (!zfsvfs->z_fuid_loaded)
+ zfs_fuid_init(zfsvfs);
+
+retry:
+ rw_enter(&zfsvfs->z_fuid_lock, rw);
+ findnode = avl_find(&zfsvfs->z_fuid_domain, &searchnode, &loc);
+
+ if (findnode) {
+ rw_exit(&zfsvfs->z_fuid_lock);
+ ksiddomain_rele(searchnode.f_ksid);
+ return (findnode->f_idx);
+ } else if (addok) {
+ fuid_domain_t *domnode;
+ uint64_t retidx;
+
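+		/*
+		 * Adding a domain requires the writer lock. If the read
+		 * lock cannot be upgraded in place, drop it, retake it as
+		 * a writer, and redo the lookup, since another thread may
+		 * have added this domain in the meantime.
+		 */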
+ if (rw == RW_READER && !rw_tryupgrade(&zfsvfs->z_fuid_lock)) {
+ rw_exit(&zfsvfs->z_fuid_lock);
+ rw = RW_WRITER;
+ goto retry;
+ }
+
+ domnode = kmem_alloc(sizeof (fuid_domain_t), KM_SLEEP);
+ domnode->f_ksid = searchnode.f_ksid;
+
+ retidx = domnode->f_idx = avl_numnodes(&zfsvfs->z_fuid_idx) + 1;
+
+ avl_add(&zfsvfs->z_fuid_domain, domnode);
+ avl_add(&zfsvfs->z_fuid_idx, domnode);
+ zfsvfs->z_fuid_dirty = B_TRUE;
+ rw_exit(&zfsvfs->z_fuid_lock);
+ return (retidx);
+ } else {
+ rw_exit(&zfsvfs->z_fuid_lock);
+ return (-1);
+ }
+}
+
+/*
+ * Query the domain table by index, returning the domain string.
+ *
+ * Returns a pointer to the domain string held in an AVL node.
+ */
+const char *
+zfs_fuid_find_by_idx(zfsvfs_t *zfsvfs, uint32_t idx)
+{
+ char *domain;
+
+ if (idx == 0 || !zfsvfs->z_use_fuids)
+ return (NULL);
+
+ if (!zfsvfs->z_fuid_loaded)
+ zfs_fuid_init(zfsvfs);
+
+ rw_enter(&zfsvfs->z_fuid_lock, RW_READER);
+
+ if (zfsvfs->z_fuid_obj || zfsvfs->z_fuid_dirty)
+ domain = zfs_fuid_idx_domain(&zfsvfs->z_fuid_idx, idx);
+ else
+ domain = nulldomain;
+ rw_exit(&zfsvfs->z_fuid_lock);
+
+ ASSERT(domain);
+ return (domain);
+}
+
+void
+zfs_fuid_map_ids(znode_t *zp, cred_t *cr, uid_t *uidp, uid_t *gidp)
+{
+ *uidp = zfs_fuid_map_id(zp->z_zfsvfs, zp->z_uid, cr, ZFS_OWNER);
+ *gidp = zfs_fuid_map_id(zp->z_zfsvfs, zp->z_gid, cr, ZFS_GROUP);
+}
+
+uid_t
+zfs_fuid_map_id(zfsvfs_t *zfsvfs, uint64_t fuid,
+ cred_t *cr, zfs_fuid_type_t type)
+{
+ uint32_t index = FUID_INDEX(fuid);
+ const char *domain;
+ uid_t id;
+
+ if (index == 0)
+ return (fuid);
+
+ domain = zfs_fuid_find_by_idx(zfsvfs, index);
+ ASSERT(domain != NULL);
+
+#ifdef illumos
+ if (type == ZFS_OWNER || type == ZFS_ACE_USER) {
+ (void) kidmap_getuidbysid(crgetzone(cr), domain,
+ FUID_RID(fuid), &id);
+ } else {
+ (void) kidmap_getgidbysid(crgetzone(cr), domain,
+ FUID_RID(fuid), &id);
+ }
+#else
+ id = UID_NOBODY;
+#endif
+ return (id);
+}
+
+/*
+ * Add a FUID node to the list of FUIDs being created for this
+ * ACL.
+ *
+ * If the ACL has multiple domains, keep only one copy of each unique
+ * domain.
+ */
+void
+zfs_fuid_node_add(zfs_fuid_info_t **fuidpp, const char *domain, uint32_t rid,
+ uint64_t idx, uint64_t id, zfs_fuid_type_t type)
+{
+ zfs_fuid_t *fuid;
+ zfs_fuid_domain_t *fuid_domain;
+ zfs_fuid_info_t *fuidp;
+ uint64_t fuididx;
+ boolean_t found = B_FALSE;
+
+ if (*fuidpp == NULL)
+ *fuidpp = zfs_fuid_info_alloc();
+
+ fuidp = *fuidpp;
+ /*
+	 * First find the fuid domain index in the linked list.
+	 *
+	 * If one isn't found, create an entry.
+ */
+
+ for (fuididx = 1, fuid_domain = list_head(&fuidp->z_domains);
+ fuid_domain; fuid_domain = list_next(&fuidp->z_domains,
+ fuid_domain), fuididx++) {
+ if (idx == fuid_domain->z_domidx) {
+ found = B_TRUE;
+ break;
+ }
+ }
+
+ if (!found) {
+ fuid_domain = kmem_alloc(sizeof (zfs_fuid_domain_t), KM_SLEEP);
+ fuid_domain->z_domain = domain;
+ fuid_domain->z_domidx = idx;
+ list_insert_tail(&fuidp->z_domains, fuid_domain);
+ fuidp->z_domain_str_sz += strlen(domain) + 1;
+ fuidp->z_domain_cnt++;
+ }
+
+ if (type == ZFS_ACE_USER || type == ZFS_ACE_GROUP) {
+
+ /*
+		 * Now allocate a fuid entry and add it to the end of the list.
+ */
+
+ fuid = kmem_alloc(sizeof (zfs_fuid_t), KM_SLEEP);
+ fuid->z_id = id;
+ fuid->z_domidx = idx;
+ fuid->z_logfuid = FUID_ENCODE(fuididx, rid);
+
+ list_insert_tail(&fuidp->z_fuids, fuid);
+ fuidp->z_fuid_cnt++;
+ } else {
+ if (type == ZFS_OWNER)
+ fuidp->z_fuid_owner = FUID_ENCODE(fuididx, rid);
+ else
+ fuidp->z_fuid_group = FUID_ENCODE(fuididx, rid);
+ }
+}
+
+/*
+ * Create a file system FUID, based on information in the user's cred.
+ *
+ * If the cred contains KSID_OWNER, it is used to determine
+ * the uid; otherwise the cred's uid is used. By default the cred's gid
+ * is used, unless it's an ephemeral ID, in which case KSID_GROUP will
+ * be used if it exists.
+ */
+uint64_t
+zfs_fuid_create_cred(zfsvfs_t *zfsvfs, zfs_fuid_type_t type,
+ cred_t *cr, zfs_fuid_info_t **fuidp)
+{
+ uint64_t idx;
+ ksid_t *ksid;
+ uint32_t rid;
+ char *kdomain;
+ const char *domain;
+ uid_t id;
+
+ VERIFY(type == ZFS_OWNER || type == ZFS_GROUP);
+
+ ksid = crgetsid(cr, (type == ZFS_OWNER) ? KSID_OWNER : KSID_GROUP);
+
+ if (!zfsvfs->z_use_fuids || (ksid == NULL)) {
+ id = (type == ZFS_OWNER) ? crgetuid(cr) : crgetgid(cr);
+
+ if (IS_EPHEMERAL(id))
+ return ((type == ZFS_OWNER) ? UID_NOBODY : GID_NOBODY);
+
+ return ((uint64_t)id);
+ }
+
+ /*
+ * ksid is present and FUID is supported
+ */
+ id = (type == ZFS_OWNER) ? ksid_getid(ksid) : crgetgid(cr);
+
+ if (!IS_EPHEMERAL(id))
+ return ((uint64_t)id);
+
+ if (type == ZFS_GROUP)
+ id = ksid_getid(ksid);
+
+ rid = ksid_getrid(ksid);
+ domain = ksid_getdomain(ksid);
+
+ idx = zfs_fuid_find_by_domain(zfsvfs, domain, &kdomain, B_TRUE);
+
+ zfs_fuid_node_add(fuidp, kdomain, rid, idx, id, type);
+
+ return (FUID_ENCODE(idx, rid));
+}
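+
+/*
+ * For reference (a sketch; the exact macro definitions live in zfs_fuid.h):
+ * a FUID packs the domain table index into the upper 32 bits and the
+ * Windows RID into the lower 32 bits, roughly:
+ *
+ *	FUID_ENCODE(idx, rid)	(((uint64_t)(idx) << 32) | (rid))
+ *	FUID_INDEX(fuid)	((fuid) >> 32)
+ *	FUID_RID(fuid)		((fuid) & UINT32_MAX)
+ */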
+
+/*
+ * Create a file system FUID for an ACL ace
+ * or a chown/chgrp of the file.
+ * This is similar to zfs_fuid_create_cred, except that
+ * we can't find the domain + rid information in the
+ * cred. Instead we have to query Winchester for the
+ * domain and rid.
+ *
+ * During replay operations the domain+rid information is
+ * found in the zfs_fuid_info_t that the replay code has
+ * attached to the zfsvfs of the file system.
+ */
+uint64_t
+zfs_fuid_create(zfsvfs_t *zfsvfs, uint64_t id, cred_t *cr,
+ zfs_fuid_type_t type, zfs_fuid_info_t **fuidpp)
+{
+ const char *domain;
+ char *kdomain;
+ uint32_t fuid_idx = FUID_INDEX(id);
+ uint32_t rid;
+ idmap_stat status;
+ uint64_t idx = 0;
+ zfs_fuid_t *zfuid = NULL;
+ zfs_fuid_info_t *fuidp = NULL;
+
+ /*
+	 * If this is a POSIX ID, or the entry is already a FUID, then
+	 * just return the id.
+ *
+ * We may also be handed an already FUID'ized id via
+ * chmod.
+ */
+
+ if (!zfsvfs->z_use_fuids || !IS_EPHEMERAL(id) || fuid_idx != 0)
+ return (id);
+
+ if (zfsvfs->z_replay) {
+ fuidp = zfsvfs->z_fuid_replay;
+
+ /*
+ * If we are passed an ephemeral id, but no
+		 * fuid_info was logged, return NOBODY.
+		 * This is most likely a result of the idmap
+		 * service not being available.
+ */
+ if (fuidp == NULL)
+ return (UID_NOBODY);
+
+ VERIFY3U(type, >=, ZFS_OWNER);
+ VERIFY3U(type, <=, ZFS_ACE_GROUP);
+
+ switch (type) {
+ case ZFS_ACE_USER:
+ case ZFS_ACE_GROUP:
+ zfuid = list_head(&fuidp->z_fuids);
+ rid = FUID_RID(zfuid->z_logfuid);
+ idx = FUID_INDEX(zfuid->z_logfuid);
+ break;
+ case ZFS_OWNER:
+ rid = FUID_RID(fuidp->z_fuid_owner);
+ idx = FUID_INDEX(fuidp->z_fuid_owner);
+ break;
+ case ZFS_GROUP:
+ rid = FUID_RID(fuidp->z_fuid_group);
+ idx = FUID_INDEX(fuidp->z_fuid_group);
+ break;
+		}
+ domain = fuidp->z_domain_table[idx - 1];
+ } else {
+ if (type == ZFS_OWNER || type == ZFS_ACE_USER)
+ status = kidmap_getsidbyuid(crgetzone(cr), id,
+ &domain, &rid);
+ else
+ status = kidmap_getsidbygid(crgetzone(cr), id,
+ &domain, &rid);
+
+ if (status != 0) {
+ /*
+ * When returning nobody we will need to
+ * make a dummy fuid table entry for logging
+ * purposes.
+ */
+ rid = UID_NOBODY;
+ domain = nulldomain;
+ }
+ }
+
+ idx = zfs_fuid_find_by_domain(zfsvfs, domain, &kdomain, B_TRUE);
+
+ if (!zfsvfs->z_replay)
+ zfs_fuid_node_add(fuidpp, kdomain,
+ rid, idx, id, type);
+ else if (zfuid != NULL) {
+ list_remove(&fuidp->z_fuids, zfuid);
+ kmem_free(zfuid, sizeof (zfs_fuid_t));
+ }
+ return (FUID_ENCODE(idx, rid));
+}
+
+void
+zfs_fuid_destroy(zfsvfs_t *zfsvfs)
+{
+ rw_enter(&zfsvfs->z_fuid_lock, RW_WRITER);
+ if (!zfsvfs->z_fuid_loaded) {
+ rw_exit(&zfsvfs->z_fuid_lock);
+ return;
+ }
+ zfs_fuid_table_destroy(&zfsvfs->z_fuid_idx, &zfsvfs->z_fuid_domain);
+ rw_exit(&zfsvfs->z_fuid_lock);
+}
+
+/*
+ * Allocate zfs_fuid_info for tracking FUIDs created during
+ * zfs_mknode, VOP_SETATTR() or VOP_SETSECATTR().
+ */
+zfs_fuid_info_t *
+zfs_fuid_info_alloc(void)
+{
+ zfs_fuid_info_t *fuidp;
+
+ fuidp = kmem_zalloc(sizeof (zfs_fuid_info_t), KM_SLEEP);
+ list_create(&fuidp->z_domains, sizeof (zfs_fuid_domain_t),
+ offsetof(zfs_fuid_domain_t, z_next));
+ list_create(&fuidp->z_fuids, sizeof (zfs_fuid_t),
+ offsetof(zfs_fuid_t, z_next));
+ return (fuidp);
+}
+
+/*
+ * Release all memory associated with zfs_fuid_info_t
+ */
+void
+zfs_fuid_info_free(zfs_fuid_info_t *fuidp)
+{
+ zfs_fuid_t *zfuid;
+ zfs_fuid_domain_t *zdomain;
+
+ while ((zfuid = list_head(&fuidp->z_fuids)) != NULL) {
+ list_remove(&fuidp->z_fuids, zfuid);
+ kmem_free(zfuid, sizeof (zfs_fuid_t));
+ }
+
+ if (fuidp->z_domain_table != NULL)
+ kmem_free(fuidp->z_domain_table,
+ (sizeof (char **)) * fuidp->z_domain_cnt);
+
+ while ((zdomain = list_head(&fuidp->z_domains)) != NULL) {
+ list_remove(&fuidp->z_domains, zdomain);
+ kmem_free(zdomain, sizeof (zfs_fuid_domain_t));
+ }
+
+ kmem_free(fuidp, sizeof (zfs_fuid_info_t));
+}
+
+/*
+ * Check to see if the id is a group member. If the cred
+ * has ksid info, the sidlist is checked first;
+ * if the id is still not found there, POSIX groups are checked.
+ *
+ * Will use a straight FUID compare when possible.
+ */
+boolean_t
+zfs_groupmember(zfsvfs_t *zfsvfs, uint64_t id, cred_t *cr)
+{
+#ifdef illumos
+ ksid_t *ksid = crgetsid(cr, KSID_GROUP);
+ ksidlist_t *ksidlist = crgetsidlist(cr);
+#endif
+ uid_t gid;
+
+#ifdef illumos
+ if (ksid && ksidlist) {
+ int i;
+ ksid_t *ksid_groups;
+ uint32_t idx = FUID_INDEX(id);
+ uint32_t rid = FUID_RID(id);
+
+ ksid_groups = ksidlist->ksl_sids;
+
+ for (i = 0; i != ksidlist->ksl_nsid; i++) {
+ if (idx == 0) {
+ if (id != IDMAP_WK_CREATOR_GROUP_GID &&
+ id == ksid_groups[i].ks_id) {
+ return (B_TRUE);
+ }
+ } else {
+ const char *domain;
+
+ domain = zfs_fuid_find_by_idx(zfsvfs, idx);
+ ASSERT(domain != NULL);
+
+ if (strcmp(domain,
+ IDMAP_WK_CREATOR_SID_AUTHORITY) == 0)
+ return (B_FALSE);
+
+ if ((strcmp(domain,
+ ksid_groups[i].ks_domain->kd_name) == 0) &&
+ rid == ksid_groups[i].ks_rid)
+ return (B_TRUE);
+ }
+ }
+ }
+#endif /* illumos */
+
+ /*
+ * Not found in ksidlist, check posix groups
+ */
+ gid = zfs_fuid_map_id(zfsvfs, id, cr, ZFS_GROUP);
+ return (groupmember(gid, cr));
+}
+
+void
+zfs_fuid_txhold(zfsvfs_t *zfsvfs, dmu_tx_t *tx)
+{
+ if (zfsvfs->z_fuid_obj == 0) {
+ dmu_tx_hold_bonus(tx, DMU_NEW_OBJECT);
+ dmu_tx_hold_write(tx, DMU_NEW_OBJECT, 0,
+ FUID_SIZE_ESTIMATE(zfsvfs));
+ dmu_tx_hold_zap(tx, MASTER_NODE_OBJ, FALSE, NULL);
+ } else {
+ dmu_tx_hold_bonus(tx, zfsvfs->z_fuid_obj);
+ dmu_tx_hold_write(tx, zfsvfs->z_fuid_obj, 0,
+ FUID_SIZE_ESTIMATE(zfsvfs));
+ }
+}
+#endif
diff --git a/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/zfs_ioctl.c b/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/zfs_ioctl.c
new file mode 100644
index 000000000000..a7e2aff6e683
--- /dev/null
+++ b/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/zfs_ioctl.c
@@ -0,0 +1,7692 @@
+/*
+ * CDDL HEADER START
+ *
+ * The contents of this file are subject to the terms of the
+ * Common Development and Distribution License (the "License").
+ * You may not use this file except in compliance with the License.
+ *
+ * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
+ * or http://www.opensolaris.org/os/licensing.
+ * See the License for the specific language governing permissions
+ * and limitations under the License.
+ *
+ * When distributing Covered Code, include this CDDL HEADER in each
+ * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
+ * If applicable, add the following below this CDDL HEADER, with the
+ * fields enclosed by brackets "[]" replaced with your own identifying
+ * information: Portions Copyright [yyyy] [name of copyright owner]
+ *
+ * CDDL HEADER END
+ */
+
+/*
+ * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved.
+ * Copyright (c) 2011-2012 Pawel Jakub Dawidek. All rights reserved.
+ * Copyright 2013 Martin Matuska <mm@FreeBSD.org>. All rights reserved.
+ * Copyright 2014 Xin Li <delphij@FreeBSD.org>. All rights reserved.
+ * Copyright 2015, OmniTI Computer Consulting, Inc. All rights reserved.
+ * Copyright 2015 Nexenta Systems, Inc. All rights reserved.
+ * Copyright (c) 2014, 2016 Joyent, Inc. All rights reserved.
+ * Copyright (c) 2011, 2018 by Delphix. All rights reserved.
+ * Copyright (c) 2013 by Saso Kiselkov. All rights reserved.
+ * Copyright (c) 2013 Steven Hartland. All rights reserved.
+ * Copyright (c) 2014 Integros [integros.com]
+ * Copyright 2016 Toomas Soome <tsoome@me.com>
+ * Copyright 2017 RackTop Systems.
+ * Copyright (c) 2018, loli10K <ezomori.nozomu@gmail.com>. All rights reserved.
+ * Copyright (c) 2019 Datto Inc.
+ */
+
+/*
+ * ZFS ioctls.
+ *
+ * This file handles the ioctls to /dev/zfs, used for configuring ZFS storage
+ * pools and filesystems, e.g. with /sbin/zfs and /sbin/zpool.
+ *
+ * There are two ways that we handle ioctls: the legacy way where almost
+ * all of the logic is in the ioctl callback, and the new way where most
+ * of the marshalling is handled in the common entry point, zfsdev_ioctl().
+ *
+ * Non-legacy ioctls should be registered by calling
+ * zfs_ioctl_register() from zfs_ioctl_init(). The ioctl is invoked
+ * from userland by lzc_ioctl().
+ *
+ * The registration arguments are as follows:
+ *
+ * const char *name
+ * The name of the ioctl. This is used for history logging. If the
+ * ioctl returns successfully (the callback returns 0), and allow_log
+ * is true, then a history log entry will be recorded with the input &
+ * output nvlists. The log entry can be printed with "zpool history -i".
+ *
+ * zfs_ioc_t ioc
+ * The ioctl request number, which userland will pass to ioctl(2).
+ * We want newer versions of libzfs and libzfs_core to run against
+ * existing zfs kernel modules (i.e. a deferred reboot after an update).
+ * Therefore the ioctl numbers cannot change from release to release.
+ *
+ * zfs_secpolicy_func_t *secpolicy
+ * This function will be called before the zfs_ioc_func_t, to
+ * determine if this operation is permitted. It should return EPERM
+ * on failure, and 0 on success. Checks include determining if the
+ * dataset is visible in this zone, and if the user has either all
+ * zfs privileges in the zone (SYS_MOUNT), or has been granted permission
+ * to do this operation on this dataset with "zfs allow".
+ *
+ * zfs_ioc_namecheck_t namecheck
+ * This specifies what to expect in the zfs_cmd_t:zc_name -- a pool
+ * name, a dataset name, or nothing. If the name is not well-formed,
+ * the ioctl will fail and the callback will not be called.
+ * Therefore, the callback can assume that the name is well-formed
+ * (e.g. is null-terminated, doesn't have more than one '@' character,
+ * doesn't have invalid characters).
+ *
+ * zfs_ioc_poolcheck_t pool_check
+ * This specifies requirements on the pool state. If the pool does
+ * not meet them (is suspended or is readonly), the ioctl will fail
+ * and the callback will not be called. If any checks are specified
+ * (i.e. it is not POOL_CHECK_NONE), namecheck must not be NO_NAME.
+ * Multiple checks can be or-ed together (e.g. POOL_CHECK_SUSPENDED |
+ * POOL_CHECK_READONLY).
+ *
+ * zfs_ioc_key_t *nvl_keys
+ * The list of expected/allowable innvl input keys. This list is used
+ * to validate the nvlist input to the ioctl.
+ *
+ * boolean_t smush_outnvlist
+ * If smush_outnvlist is true, then the output is presumed to be a
+ * list of errors, and it will be "smushed" down to fit into the
+ * caller's buffer, by removing some entries and replacing them with a
+ * single "N_MORE_ERRORS" entry indicating how many were removed. See
+ * nvlist_smush() for details. If smush_outnvlist is false, and the
+ * outnvlist does not fit into the userland-provided buffer, then the
+ * ioctl will fail with ENOMEM.
+ *
+ * zfs_ioc_func_t *func
+ * The callback function that will perform the operation.
+ *
+ * The callback should return 0 on success, or an error number on
+ * failure. If the function fails, the userland ioctl will return -1,
+ * and errno will be set to the callback's return value. The callback
+ * will be called with the following arguments:
+ *
+ * const char *name
+ * The name of the pool or dataset to operate on, from
+ * zfs_cmd_t:zc_name. The 'namecheck' argument specifies the
+ * expected type (pool, dataset, or none).
+ *
+ * nvlist_t *innvl
+ * The input nvlist, deserialized from zfs_cmd_t:zc_nvlist_src. Or
+ * NULL if no input nvlist was provided. Changes to this nvlist are
+ * ignored. If the input nvlist could not be deserialized, the
+ * ioctl will fail and the callback will not be called.
+ *
+ * nvlist_t *outnvl
+ * The output nvlist, initially empty. The callback can fill it in,
+ * and it will be returned to userland by serializing it into
+ * zfs_cmd_t:zc_nvlist_dst. If it is non-empty, and serialization
+ * fails (e.g. because the caller didn't supply a large enough
+ * buffer), then the overall ioctl will fail. See the
+ * 'smush_outnvlist' argument above for additional behaviors.
+ *
+ * There are two typical uses of the output nvlist:
+ * - To return state, e.g. property values. In this case,
+ * smush_outnvlist should be false. If the buffer was not large
+ * enough, the caller will reallocate a larger buffer and try
+ * the ioctl again.
+ *
+ * - To return multiple errors from an ioctl which makes on-disk
+ * changes. In this case, smush_outnvlist should be true.
+ * Ioctls which make on-disk modifications should generally not
+ * use the outnvl if they succeed, because the caller cannot
+ * distinguish between the operation failing and
+ * deserialization failing.
+ *
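+ * As an illustrative sketch only (the ioctl, callback and key list
+ * named here are hypothetical, and the parameter order is indicative;
+ * the zfs_ioctl_register() prototype later in this file is
+ * authoritative), a registration might look like:
+ *
+ *     zfs_ioctl_register("example", ZFS_IOC_EXAMPLE,
+ *         zfs_ioc_example, zfs_secpolicy_none, DATASET_NAME,
+ *         POOL_CHECK_SUSPENDED | POOL_CHECK_READONLY,
+ *         B_FALSE, B_TRUE, zfs_keys_example,
+ *         ARRAY_SIZE(zfs_keys_example));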
+ *
+ * IOCTL Interface Errors
+ *
+ * The following ioctl input errors can be returned:
+ * ZFS_ERR_IOC_CMD_UNAVAIL the ioctl number is not supported by kernel
+ * ZFS_ERR_IOC_ARG_UNAVAIL an input argument is not supported by kernel
+ * ZFS_ERR_IOC_ARG_REQUIRED a required input argument is missing
+ * ZFS_ERR_IOC_ARG_BADTYPE an input argument has an invalid type
+ */
+
+#ifdef __FreeBSD__
+#include "opt_kstack_pages.h"
+#endif
+
+#include <sys/types.h>
+#include <sys/param.h>
+#include <sys/systm.h>
+#include <sys/conf.h>
+#include <sys/kernel.h>
+#include <sys/lock.h>
+#include <sys/malloc.h>
+#include <sys/mutex.h>
+#include <sys/proc.h>
+#include <sys/errno.h>
+#include <sys/uio.h>
+#include <sys/buf.h>
+#include <sys/file.h>
+#include <sys/kmem.h>
+#include <sys/cmn_err.h>
+#include <sys/stat.h>
+#include <sys/zfs_ioctl.h>
+#include <sys/zfs_vfsops.h>
+#include <sys/zfs_znode.h>
+#include <sys/zap.h>
+#include <sys/spa.h>
+#include <sys/spa_impl.h>
+#include <sys/vdev.h>
+#include <sys/dmu.h>
+#include <sys/dsl_dir.h>
+#include <sys/dsl_dataset.h>
+#include <sys/dsl_prop.h>
+#include <sys/dsl_deleg.h>
+#include <sys/dmu_objset.h>
+#include <sys/dmu_impl.h>
+#include <sys/dmu_tx.h>
+#include <sys/sunddi.h>
+#include <sys/policy.h>
+#include <sys/zone.h>
+#include <sys/nvpair.h>
+#include <sys/mount.h>
+#include <sys/taskqueue.h>
+#include <sys/sdt.h>
+#include <sys/varargs.h>
+#include <sys/fs/zfs.h>
+#include <sys/zfs_ctldir.h>
+#include <sys/zfs_dir.h>
+#include <sys/zfs_onexit.h>
+#include <sys/zvol.h>
+#include <sys/dsl_scan.h>
+#include <sys/dmu_send.h>
+#include <sys/dsl_destroy.h>
+#include <sys/dsl_bookmark.h>
+#include <sys/dsl_userhold.h>
+#include <sys/zfeature.h>
+#include <sys/zcp.h>
+#include <sys/zio_checksum.h>
+#include <sys/vdev_removal.h>
+#include <sys/vdev_impl.h>
+#include <sys/vdev_initialize.h>
+
+#include "zfs_namecheck.h"
+#include "zfs_prop.h"
+#include "zfs_deleg.h"
+#include "zfs_comutil.h"
+#include "zfs_ioctl_compat.h"
+
+#include "lua.h"
+#include "lauxlib.h"
+
+#ifndef ARRAY_SIZE
+#define ARRAY_SIZE(x) nitems(x)
+#endif
+
+static struct cdev *zfsdev;
+
+extern void zfs_init(void);
+extern void zfs_fini(void);
+
+uint_t zfs_fsyncer_key;
+extern uint_t rrw_tsd_key;
+static uint_t zfs_allow_log_key;
+extern uint_t zfs_geom_probe_vdev_key;
+
+typedef int zfs_ioc_legacy_func_t(zfs_cmd_t *);
+typedef int zfs_ioc_func_t(const char *, nvlist_t *, nvlist_t *);
+typedef int zfs_secpolicy_func_t(zfs_cmd_t *, nvlist_t *, cred_t *);
+
+/*
+ * IOC Keys are used to document and validate user->kernel interface inputs.
+ * See zfs_keys_recv_new for an example declaration. Any key name that is not
+ * listed will be rejected as input.
+ *
+ * The keyname 'optional' is always allowed, and must be an nvlist if present.
+ * Arguments which older kernels can safely ignore can be placed under the
+ * "optional" key.
+ *
+ * When adding new keys to an existing ioc for new functionality, consider:
+ * - adding an entry into zfs_sysfs.c zfs_features[] list
+ * - updating the libzfs_input_check.c test utility
+ *
+ * Note: in the ZK_WILDCARDLIST case, the name serves as documentation
+ * for the expected name (bookmark, snapshot, property, etc.) but there
+ * is no validation in the preflight zfs_check_input_nvpairs() check.
+ */
+typedef enum {
+ ZK_OPTIONAL = 1 << 0, /* pair is optional */
+ ZK_WILDCARDLIST = 1 << 1, /* one or more unspecified key names */
+} ioc_key_flag_t;
+
+/* DATA_TYPE_ANY is used when zkey_type can vary. */
+#define DATA_TYPE_ANY DATA_TYPE_UNKNOWN
+
+typedef struct zfs_ioc_key {
+ const char *zkey_name;
+ data_type_t zkey_type;
+ ioc_key_flag_t zkey_flags;
+} zfs_ioc_key_t;
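+
+/*
+ * An illustrative key list (a sketch; the key names are hypothetical --
+ * see zfs_keys_recv_new, referenced above, for a real declaration):
+ *
+ *     static const zfs_ioc_key_t zfs_keys_example[] = {
+ *         {"snapname",   DATA_TYPE_STRING,  0},
+ *         {"props",      DATA_TYPE_NVLIST,  ZK_OPTIONAL},
+ *         {"<property>", DATA_TYPE_ANY,     ZK_WILDCARDLIST},
+ *     };
+ */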
+
+typedef enum {
+ NO_NAME,
+ POOL_NAME,
+ DATASET_NAME,
+ ENTITY_NAME
+} zfs_ioc_namecheck_t;
+
+typedef enum {
+ POOL_CHECK_NONE = 1 << 0,
+ POOL_CHECK_SUSPENDED = 1 << 1,
+ POOL_CHECK_READONLY = 1 << 2,
+} zfs_ioc_poolcheck_t;
+
+typedef struct zfs_ioc_vec {
+ zfs_ioc_legacy_func_t *zvec_legacy_func;
+ zfs_ioc_func_t *zvec_func;
+ zfs_secpolicy_func_t *zvec_secpolicy;
+ zfs_ioc_namecheck_t zvec_namecheck;
+ boolean_t zvec_allow_log;
+ zfs_ioc_poolcheck_t zvec_pool_check;
+ boolean_t zvec_smush_outnvlist;
+ const char *zvec_name;
+ const zfs_ioc_key_t *zvec_nvl_keys;
+ size_t zvec_nvl_key_count;
+} zfs_ioc_vec_t;
+
+/* This array is indexed by zfs_userquota_prop_t */
+static const char *userquota_perms[] = {
+ ZFS_DELEG_PERM_USERUSED,
+ ZFS_DELEG_PERM_USERQUOTA,
+ ZFS_DELEG_PERM_GROUPUSED,
+ ZFS_DELEG_PERM_GROUPQUOTA,
+};
+
+static int zfs_ioc_userspace_upgrade(zfs_cmd_t *zc);
+static int zfs_check_settable(const char *name, nvpair_t *property,
+ cred_t *cr);
+static int zfs_check_clearable(char *dataset, nvlist_t *props,
+ nvlist_t **errors);
+static int zfs_fill_zplprops_root(uint64_t, nvlist_t *, nvlist_t *,
+ boolean_t *);
+int zfs_set_prop_nvlist(const char *, zprop_source_t, nvlist_t *, nvlist_t *);
+static int get_nvlist(uint64_t nvl, uint64_t size, int iflag, nvlist_t **nvp);
+
+static void zfsdev_close(void *data);
+
+static int zfs_prop_activate_feature(spa_t *spa, spa_feature_t feature);
+
+/* _NOTE(PRINTFLIKE(4)) - this is printf-like, but lint is too whiny */
+void
+__dprintf(const char *file, const char *func, int line, const char *fmt, ...)
+{
+ const char *newfile;
+ char buf[512];
+ va_list adx;
+
+ /*
+ * Get rid of annoying "../common/" prefix to filename.
+ */
+ newfile = strrchr(file, '/');
+ if (newfile != NULL) {
+ newfile = newfile + 1; /* Get rid of leading / */
+ } else {
+ newfile = file;
+ }
+
+ va_start(adx, fmt);
+ (void) vsnprintf(buf, sizeof (buf), fmt, adx);
+ va_end(adx);
+
+ /*
+ * To get this data, use the zfs-dprintf probe as so:
+ * dtrace -q -n 'zfs-dprintf \
+ * /stringof(arg0) == "dbuf.c"/ \
+ * {printf("%s: %s", stringof(arg1), stringof(arg3))}'
+ * arg0 = file name
+ * arg1 = function name
+ * arg2 = line number
+ * arg3 = message
+ */
+ DTRACE_PROBE4(zfs__dprintf,
+ char *, newfile, char *, func, int, line, char *, buf);
+}
+
+static void
+history_str_free(char *buf)
+{
+ kmem_free(buf, HIS_MAX_RECORD_LEN);
+}
+
+static char *
+history_str_get(zfs_cmd_t *zc)
+{
+ char *buf;
+
+ if (zc->zc_history == 0)
+ return (NULL);
+
+ buf = kmem_alloc(HIS_MAX_RECORD_LEN, KM_SLEEP);
+ if (copyinstr((void *)(uintptr_t)zc->zc_history,
+ buf, HIS_MAX_RECORD_LEN, NULL) != 0) {
+ history_str_free(buf);
+ return (NULL);
+ }
+
+ buf[HIS_MAX_RECORD_LEN - 1] = '\0';
+
+ return (buf);
+}
+
+/*
+ * Check to see if the named dataset is currently defined as bootable
+ */
+static boolean_t
+zfs_is_bootfs(const char *name)
+{
+ objset_t *os;
+
+ if (dmu_objset_hold(name, FTAG, &os) == 0) {
+ boolean_t ret;
+ ret = (dmu_objset_id(os) == spa_bootfs(dmu_objset_spa(os)));
+ dmu_objset_rele(os, FTAG);
+ return (ret);
+ }
+ return (B_FALSE);
+}
+
+/*
+ * Return non-zero if the spa version is less than requested version.
+ */
+static int
+zfs_earlier_version(const char *name, int version)
+{
+ spa_t *spa;
+
+ if (spa_open(name, &spa, FTAG) == 0) {
+ if (spa_version(spa) < version) {
+ spa_close(spa, FTAG);
+ return (1);
+ }
+ spa_close(spa, FTAG);
+ }
+ return (0);
+}
+
+/*
+ * Return TRUE if the ZPL version is less than requested version.
+ */
+static boolean_t
+zpl_earlier_version(const char *name, int version)
+{
+ objset_t *os;
+ boolean_t rc = B_TRUE;
+
+ if (dmu_objset_hold(name, FTAG, &os) == 0) {
+ uint64_t zplversion;
+
+ if (dmu_objset_type(os) != DMU_OST_ZFS) {
+ dmu_objset_rele(os, FTAG);
+ return (B_TRUE);
+ }
+ /* XXX reading from non-owned objset */
+ if (zfs_get_zplprop(os, ZFS_PROP_VERSION, &zplversion) == 0)
+ rc = zplversion < version;
+ dmu_objset_rele(os, FTAG);
+ }
+ return (rc);
+}
+
+static void
+zfs_log_history(zfs_cmd_t *zc)
+{
+ spa_t *spa;
+ char *buf;
+
+ if ((buf = history_str_get(zc)) == NULL)
+ return;
+
+ if (spa_open(zc->zc_name, &spa, FTAG) == 0) {
+ if (spa_version(spa) >= SPA_VERSION_ZPOOL_HISTORY)
+ (void) spa_history_log(spa, buf);
+ spa_close(spa, FTAG);
+ }
+ history_str_free(buf);
+}
+
+/*
+ * Policy for top-level read operations (list pools). Requires no privileges,
+ * and can be used in the local zone, as there is no associated dataset.
+ */
+/* ARGSUSED */
+static int
+zfs_secpolicy_none(zfs_cmd_t *zc, nvlist_t *innvl, cred_t *cr)
+{
+ return (0);
+}
+
+/*
+ * Policy for dataset read operations (list children, get statistics). Requires
+ * no privileges, but must be visible in the local zone.
+ */
+/* ARGSUSED */
+static int
+zfs_secpolicy_read(zfs_cmd_t *zc, nvlist_t *innvl, cred_t *cr)
+{
+ if (INGLOBALZONE(curthread) ||
+ zone_dataset_visible(zc->zc_name, NULL))
+ return (0);
+
+ return (SET_ERROR(ENOENT));
+}
+
+static int
+zfs_dozonecheck_impl(const char *dataset, uint64_t zoned, cred_t *cr)
+{
+ int writable = 1;
+
+ /*
+ * The dataset must be visible by this zone -- check this first
+ * so they don't see EPERM on something they shouldn't know about.
+ */
+ if (!INGLOBALZONE(curthread) &&
+ !zone_dataset_visible(dataset, &writable))
+ return (SET_ERROR(ENOENT));
+
+ if (INGLOBALZONE(curthread)) {
+ /*
+ * If the fs is zoned, only root can access it from the
+ * global zone.
+ */
+ if (secpolicy_zfs(cr) && zoned)
+ return (SET_ERROR(EPERM));
+ } else {
+ /*
+ * If we are in a local zone, the 'zoned' property must be set.
+ */
+ if (!zoned)
+ return (SET_ERROR(EPERM));
+
+ /* must be writable by this zone */
+ if (!writable)
+ return (SET_ERROR(EPERM));
+ }
+ return (0);
+}
+
+static int
+zfs_dozonecheck(const char *dataset, cred_t *cr)
+{
+ uint64_t zoned;
+
+ if (dsl_prop_get_integer(dataset, "jailed", &zoned, NULL))
+ return (SET_ERROR(ENOENT));
+
+ return (zfs_dozonecheck_impl(dataset, zoned, cr));
+}
+
+static int
+zfs_dozonecheck_ds(const char *dataset, dsl_dataset_t *ds, cred_t *cr)
+{
+ uint64_t zoned;
+
+ if (dsl_prop_get_int_ds(ds, "jailed", &zoned))
+ return (SET_ERROR(ENOENT));
+
+ return (zfs_dozonecheck_impl(dataset, zoned, cr));
+}
+
+static int
+zfs_secpolicy_write_perms_ds(const char *name, dsl_dataset_t *ds,
+ const char *perm, cred_t *cr)
+{
+ int error;
+
+ error = zfs_dozonecheck_ds(name, ds, cr);
+ if (error == 0) {
+ error = secpolicy_zfs(cr);
+ if (error != 0)
+ error = dsl_deleg_access_impl(ds, perm, cr);
+ }
+ return (error);
+}
+
+static int
+zfs_secpolicy_write_perms(const char *name, const char *perm, cred_t *cr)
+{
+ int error;
+ dsl_dataset_t *ds;
+ dsl_pool_t *dp;
+
+ /*
+ * First do a quick check for root in the global zone, which
+ * is allowed to do all write_perms. This ensures that zfs_ioc_*
+ * will get to handle nonexistent datasets.
+ */
+ if (INGLOBALZONE(curthread) && secpolicy_zfs(cr) == 0)
+ return (0);
+
+ error = dsl_pool_hold(name, FTAG, &dp);
+ if (error != 0)
+ return (error);
+
+ error = dsl_dataset_hold(dp, name, FTAG, &ds);
+ if (error != 0) {
+ dsl_pool_rele(dp, FTAG);
+ return (error);
+ }
+
+ error = zfs_secpolicy_write_perms_ds(name, ds, perm, cr);
+
+ dsl_dataset_rele(ds, FTAG);
+ dsl_pool_rele(dp, FTAG);
+ return (error);
+}
+
+#ifdef SECLABEL
+/*
+ * Policy for setting the security label property.
+ *
+ * Returns 0 for success, non-zero for access and other errors.
+ */
+static int
+zfs_set_slabel_policy(const char *name, char *strval, cred_t *cr)
+{
+ char ds_hexsl[MAXNAMELEN];
+ bslabel_t ds_sl, new_sl;
+ boolean_t new_default = FALSE;
+ uint64_t zoned;
+ int needed_priv = -1;
+ int error;
+
+ /* First get the existing dataset label. */
+ error = dsl_prop_get(name, zfs_prop_to_name(ZFS_PROP_MLSLABEL),
+ 1, sizeof (ds_hexsl), &ds_hexsl, NULL);
+ if (error != 0)
+ return (SET_ERROR(EPERM));
+
+ if (strcasecmp(strval, ZFS_MLSLABEL_DEFAULT) == 0)
+ new_default = TRUE;
+
+ /* The label must be translatable */
+ if (!new_default && (hexstr_to_label(strval, &new_sl) != 0))
+ return (SET_ERROR(EINVAL));
+
+ /*
+ * In a non-global zone, disallow attempts to set a label that
+ * doesn't match that of the zone; otherwise no other checks
+ * are needed.
+ */
+ if (!INGLOBALZONE(curproc)) {
+ if (new_default || !blequal(&new_sl, CR_SL(CRED())))
+ return (SET_ERROR(EPERM));
+ return (0);
+ }
+
+ /*
+ * For global-zone datasets (i.e., those whose zoned property is
+ * "off", verify that the specified new label is valid for the
+ * global zone.
+ */
+ if (dsl_prop_get_integer(name,
+ zfs_prop_to_name(ZFS_PROP_ZONED), &zoned, NULL))
+ return (SET_ERROR(EPERM));
+ if (!zoned) {
+ if (zfs_check_global_label(name, strval) != 0)
+ return (SET_ERROR(EPERM));
+ }
+
+ /*
+ * If the existing dataset label is nondefault, check if the
+ * dataset is mounted (label cannot be changed while mounted).
+ * Get the zfsvfs; if there isn't one, then the dataset isn't
+ * mounted (or isn't a dataset, doesn't exist, ...).
+ */
+ if (strcasecmp(ds_hexsl, ZFS_MLSLABEL_DEFAULT) != 0) {
+ objset_t *os;
+ static char *setsl_tag = "setsl_tag";
+
+ /*
+ * Try to own the dataset; abort if there is any error,
+ * (e.g., already mounted, in use, or other error).
+ */
+ error = dmu_objset_own(name, DMU_OST_ZFS, B_TRUE,
+ setsl_tag, &os);
+ if (error != 0)
+ return (SET_ERROR(EPERM));
+
+ dmu_objset_disown(os, setsl_tag);
+
+ if (new_default) {
+ needed_priv = PRIV_FILE_DOWNGRADE_SL;
+ goto out_check;
+ }
+
+ if (hexstr_to_label(strval, &new_sl) != 0)
+ return (SET_ERROR(EPERM));
+
+ if (blstrictdom(&ds_sl, &new_sl))
+ needed_priv = PRIV_FILE_DOWNGRADE_SL;
+ else if (blstrictdom(&new_sl, &ds_sl))
+ needed_priv = PRIV_FILE_UPGRADE_SL;
+ } else {
+ /* dataset currently has a default label */
+ if (!new_default)
+ needed_priv = PRIV_FILE_UPGRADE_SL;
+ }
+
+out_check:
+ if (needed_priv != -1)
+ return (PRIV_POLICY(cr, needed_priv, B_FALSE, EPERM, NULL));
+ return (0);
+}
+#endif /* SECLABEL */
+
+static int
+zfs_secpolicy_setprop(const char *dsname, zfs_prop_t prop, nvpair_t *propval,
+ cred_t *cr)
+{
+ char *strval;
+
+ /*
+ * Check permissions for special properties.
+ */
+ switch (prop) {
+ case ZFS_PROP_ZONED:
+ /*
+ * Disallow setting of 'zoned' from within a local zone.
+ */
+ if (!INGLOBALZONE(curthread))
+ return (SET_ERROR(EPERM));
+ break;
+
+ case ZFS_PROP_QUOTA:
+ case ZFS_PROP_FILESYSTEM_LIMIT:
+ case ZFS_PROP_SNAPSHOT_LIMIT:
+ if (!INGLOBALZONE(curthread)) {
+ uint64_t zoned;
+ char setpoint[ZFS_MAX_DATASET_NAME_LEN];
+ /*
+ * Unprivileged users are allowed to modify the
+ * limit on things *under* (i.e. contained by)
+ * the thing they own.
+ */
+ if (dsl_prop_get_integer(dsname, "jailed", &zoned,
+ setpoint))
+ return (SET_ERROR(EPERM));
+ if (!zoned || strlen(dsname) <= strlen(setpoint))
+ return (SET_ERROR(EPERM));
+ }
+ break;
+
+ case ZFS_PROP_MLSLABEL:
+#ifdef SECLABEL
+ if (!is_system_labeled())
+ return (SET_ERROR(EPERM));
+
+ if (nvpair_value_string(propval, &strval) == 0) {
+ int err;
+
+ err = zfs_set_slabel_policy(dsname, strval, CRED());
+ if (err != 0)
+ return (err);
+ }
+#else
+ return (EOPNOTSUPP);
+#endif
+ break;
+ }
+
+ return (zfs_secpolicy_write_perms(dsname, zfs_prop_to_name(prop), cr));
+}
+
+/* ARGSUSED */
+static int
+zfs_secpolicy_set_fsacl(zfs_cmd_t *zc, nvlist_t *innvl, cred_t *cr)
+{
+ int error;
+
+ error = zfs_dozonecheck(zc->zc_name, cr);
+ if (error != 0)
+ return (error);
+
+ /*
+ * permission to set permissions will be evaluated later in
+ * dsl_deleg_can_allow()
+ */
+ return (0);
+}
+
+/* ARGSUSED */
+static int
+zfs_secpolicy_rollback(zfs_cmd_t *zc, nvlist_t *innvl, cred_t *cr)
+{
+ return (zfs_secpolicy_write_perms(zc->zc_name,
+ ZFS_DELEG_PERM_ROLLBACK, cr));
+}
+
+/* ARGSUSED */
+static int
+zfs_secpolicy_send(zfs_cmd_t *zc, nvlist_t *innvl, cred_t *cr)
+{
+ dsl_pool_t *dp;
+ dsl_dataset_t *ds;
+ char *cp;
+ int error;
+
+ /*
+ * Generate the current snapshot name from the given objsetid, then
+ * use that name for the secpolicy/zone checks.
+ */
+ cp = strchr(zc->zc_name, '@');
+ if (cp == NULL)
+ return (SET_ERROR(EINVAL));
+ error = dsl_pool_hold(zc->zc_name, FTAG, &dp);
+ if (error != 0)
+ return (error);
+
+ error = dsl_dataset_hold_obj(dp, zc->zc_sendobj, FTAG, &ds);
+ if (error != 0) {
+ dsl_pool_rele(dp, FTAG);
+ return (error);
+ }
+
+ dsl_dataset_name(ds, zc->zc_name);
+
+ error = zfs_secpolicy_write_perms_ds(zc->zc_name, ds,
+ ZFS_DELEG_PERM_SEND, cr);
+ dsl_dataset_rele(ds, FTAG);
+ dsl_pool_rele(dp, FTAG);
+
+ return (error);
+}
+
+/* ARGSUSED */
+static int
+zfs_secpolicy_send_new(zfs_cmd_t *zc, nvlist_t *innvl, cred_t *cr)
+{
+ return (zfs_secpolicy_write_perms(zc->zc_name,
+ ZFS_DELEG_PERM_SEND, cr));
+}
+
+/* ARGSUSED */
+static int
+zfs_secpolicy_deleg_share(zfs_cmd_t *zc, nvlist_t *innvl, cred_t *cr)
+{
+ vnode_t *vp;
+ int error;
+
+ if ((error = lookupname(zc->zc_value, UIO_SYSSPACE,
+ NO_FOLLOW, NULL, &vp)) != 0)
+ return (error);
+
+ /* Now make sure mntpnt and dataset are ZFS */
+
+ if (strcmp(vp->v_vfsp->mnt_stat.f_fstypename, "zfs") != 0 ||
+ (strcmp((char *)refstr_value(vp->v_vfsp->vfs_resource),
+ zc->zc_name) != 0)) {
+ VN_RELE(vp);
+ return (SET_ERROR(EPERM));
+ }
+
+ VN_RELE(vp);
+ return (dsl_deleg_access(zc->zc_name,
+ ZFS_DELEG_PERM_SHARE, cr));
+}
+
+int
+zfs_secpolicy_share(zfs_cmd_t *zc, nvlist_t *innvl, cred_t *cr)
+{
+ if (!INGLOBALZONE(curthread))
+ return (SET_ERROR(EPERM));
+
+ if (secpolicy_nfs(cr) == 0) {
+ return (0);
+ } else {
+ return (zfs_secpolicy_deleg_share(zc, innvl, cr));
+ }
+}
+
+int
+zfs_secpolicy_smb_acl(zfs_cmd_t *zc, nvlist_t *innvl, cred_t *cr)
+{
+ if (!INGLOBALZONE(curthread))
+ return (SET_ERROR(EPERM));
+
+ if (secpolicy_smb(cr) == 0) {
+ return (0);
+ } else {
+ return (zfs_secpolicy_deleg_share(zc, innvl, cr));
+ }
+}
+
+static int
+zfs_get_parent(const char *datasetname, char *parent, int parentsize)
+{
+ char *cp;
+
+ /*
+ * Remove the @bla or /bla from the end of the name to get the parent.
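+ * (For example, "tank/fs@snap" yields "tank/fs" and "tank/fs/child"
+ * yields "tank/fs"; a name with neither separator returns ENOENT.)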
+ */
+ (void) strlcpy(parent, datasetname, parentsize);
+ cp = strrchr(parent, '@');
+ if (cp != NULL) {
+ cp[0] = '\0';
+ } else {
+ cp = strrchr(parent, '/');
+ if (cp == NULL)
+ return (SET_ERROR(ENOENT));
+ cp[0] = '\0';
+ }
+
+ return (0);
+}
+
+int
+zfs_secpolicy_destroy_perms(const char *name, cred_t *cr)
+{
+ int error;
+
+ if ((error = zfs_secpolicy_write_perms(name,
+ ZFS_DELEG_PERM_MOUNT, cr)) != 0)
+ return (error);
+
+ return (zfs_secpolicy_write_perms(name, ZFS_DELEG_PERM_DESTROY, cr));
+}
+
+/* ARGSUSED */
+static int
+zfs_secpolicy_destroy(zfs_cmd_t *zc, nvlist_t *innvl, cred_t *cr)
+{
+ return (zfs_secpolicy_destroy_perms(zc->zc_name, cr));
+}
+
+/*
+ * Destroying snapshots with delegated permissions requires
+ * descendant mount and destroy permissions.
+ */
+/* ARGSUSED */
+static int
+zfs_secpolicy_destroy_snaps(zfs_cmd_t *zc, nvlist_t *innvl, cred_t *cr)
+{
+ nvlist_t *snaps;
+ nvpair_t *pair, *nextpair;
+ int error = 0;
+
+ snaps = fnvlist_lookup_nvlist(innvl, "snaps");
+
+ for (pair = nvlist_next_nvpair(snaps, NULL); pair != NULL;
+ pair = nextpair) {
+ nextpair = nvlist_next_nvpair(snaps, pair);
+ error = zfs_secpolicy_destroy_perms(nvpair_name(pair), cr);
+ if (error == ENOENT) {
+ /*
+ * Ignore any snapshots that don't exist (we consider
+ * them "already destroyed"). Remove the name from the
+ * nvl here in case the snapshot is created between
+ * now and when we try to destroy it (in which case
+ * we don't want to destroy it since we haven't
+ * checked for permission).
+ */
+ fnvlist_remove_nvpair(snaps, pair);
+ error = 0;
+ }
+ if (error != 0)
+ break;
+ }
+
+ return (error);
+}
+
+int
+zfs_secpolicy_rename_perms(const char *from, const char *to, cred_t *cr)
+{
+ char parentname[ZFS_MAX_DATASET_NAME_LEN];
+ int error;
+
+ if ((error = zfs_secpolicy_write_perms(from,
+ ZFS_DELEG_PERM_RENAME, cr)) != 0)
+ return (error);
+
+ if ((error = zfs_secpolicy_write_perms(from,
+ ZFS_DELEG_PERM_MOUNT, cr)) != 0)
+ return (error);
+
+ if ((error = zfs_get_parent(to, parentname,
+ sizeof (parentname))) != 0)
+ return (error);
+
+ if ((error = zfs_secpolicy_write_perms(parentname,
+ ZFS_DELEG_PERM_CREATE, cr)) != 0)
+ return (error);
+
+ if ((error = zfs_secpolicy_write_perms(parentname,
+ ZFS_DELEG_PERM_MOUNT, cr)) != 0)
+ return (error);
+
+ return (error);
+}
+
+/* ARGSUSED */
+static int
+zfs_secpolicy_rename(zfs_cmd_t *zc, nvlist_t *innvl, cred_t *cr)
+{
+ char *at = NULL;
+ char *pound;
+ int error;
+
+ if ((pound = strchr(zc->zc_name, '#')) != NULL) {
+ *pound = '\0';
+ error = zfs_secpolicy_write_perms(zc->zc_name,
+ ZFS_DELEG_PERM_RENAME, cr);
+ if (error == 0) {
+ error = zfs_secpolicy_write_perms(zc->zc_name,
+ ZFS_DELEG_PERM_BOOKMARK, cr);
+ }
+ *pound = '#';
+ return (error);
+ }
+
+ if ((zc->zc_cookie & 1) != 0) {
+ /*
+ * This is recursive rename, so the starting snapshot might
+ * not exist. Check file system or volume permission instead.
+ */
+ at = strchr(zc->zc_name, '@');
+ if (at == NULL)
+ return (SET_ERROR(EINVAL));
+ *at = '\0';
+ }
+
+ error = zfs_secpolicy_rename_perms(zc->zc_name, zc->zc_value, cr);
+
+ if (at != NULL)
+ *at = '@';
+
+ return (error);
+}
+
+/* ARGSUSED */
+static int
+zfs_secpolicy_promote(zfs_cmd_t *zc, nvlist_t *innvl, cred_t *cr)
+{
+ dsl_pool_t *dp;
+ dsl_dataset_t *clone;
+ int error;
+
+ error = zfs_secpolicy_write_perms(zc->zc_name,
+ ZFS_DELEG_PERM_PROMOTE, cr);
+ if (error != 0)
+ return (error);
+
+ error = dsl_pool_hold(zc->zc_name, FTAG, &dp);
+ if (error != 0)
+ return (error);
+
+ error = dsl_dataset_hold(dp, zc->zc_name, FTAG, &clone);
+
+ if (error == 0) {
+ char parentname[ZFS_MAX_DATASET_NAME_LEN];
+ dsl_dataset_t *origin = NULL;
+ dsl_dir_t *dd;
+ dd = clone->ds_dir;
+
+ error = dsl_dataset_hold_obj(dd->dd_pool,
+ dsl_dir_phys(dd)->dd_origin_obj, FTAG, &origin);
+ if (error != 0) {
+ dsl_dataset_rele(clone, FTAG);
+ dsl_pool_rele(dp, FTAG);
+ return (error);
+ }
+
+ error = zfs_secpolicy_write_perms_ds(zc->zc_name, clone,
+ ZFS_DELEG_PERM_MOUNT, cr);
+
+ dsl_dataset_name(origin, parentname);
+ if (error == 0) {
+ error = zfs_secpolicy_write_perms_ds(parentname, origin,
+ ZFS_DELEG_PERM_PROMOTE, cr);
+ }
+ dsl_dataset_rele(clone, FTAG);
+ dsl_dataset_rele(origin, FTAG);
+ }
+ dsl_pool_rele(dp, FTAG);
+ return (error);
+}
+
+/* ARGSUSED */
+static int
+zfs_secpolicy_recv(zfs_cmd_t *zc, nvlist_t *innvl, cred_t *cr)
+{
+ int error;
+
+ if ((error = zfs_secpolicy_write_perms(zc->zc_name,
+ ZFS_DELEG_PERM_RECEIVE, cr)) != 0)
+ return (error);
+
+ if ((error = zfs_secpolicy_write_perms(zc->zc_name,
+ ZFS_DELEG_PERM_MOUNT, cr)) != 0)
+ return (error);
+
+ return (zfs_secpolicy_write_perms(zc->zc_name,
+ ZFS_DELEG_PERM_CREATE, cr));
+}
+
+int
+zfs_secpolicy_snapshot_perms(const char *name, cred_t *cr)
+{
+ return (zfs_secpolicy_write_perms(name,
+ ZFS_DELEG_PERM_SNAPSHOT, cr));
+}
+
+/*
+ * Check for permission to create each snapshot in the nvlist.
+ */
+/* ARGSUSED */
+static int
+zfs_secpolicy_snapshot(zfs_cmd_t *zc, nvlist_t *innvl, cred_t *cr)
+{
+ nvlist_t *snaps;
+ int error = 0;
+ nvpair_t *pair;
+
+ snaps = fnvlist_lookup_nvlist(innvl, "snaps");
+
+ for (pair = nvlist_next_nvpair(snaps, NULL); pair != NULL;
+ pair = nvlist_next_nvpair(snaps, pair)) {
+ char *name = nvpair_name(pair);
+ char *atp = strchr(name, '@');
+
+ if (atp == NULL) {
+ error = SET_ERROR(EINVAL);
+ break;
+ }
+ *atp = '\0';
+ error = zfs_secpolicy_snapshot_perms(name, cr);
+ *atp = '@';
+ if (error != 0)
+ break;
+ }
+ return (error);
+}
+
+/*
+ * Check for permission to create each bookmark in the nvlist.
+ */
+/* ARGSUSED */
+static int
+zfs_secpolicy_bookmark(zfs_cmd_t *zc, nvlist_t *innvl, cred_t *cr)
+{
+ int error = 0;
+
+ for (nvpair_t *pair = nvlist_next_nvpair(innvl, NULL);
+ pair != NULL; pair = nvlist_next_nvpair(innvl, pair)) {
+ char *name = nvpair_name(pair);
+ char *hashp = strchr(name, '#');
+
+ if (hashp == NULL) {
+ error = SET_ERROR(EINVAL);
+ break;
+ }
+ *hashp = '\0';
+ error = zfs_secpolicy_write_perms(name,
+ ZFS_DELEG_PERM_BOOKMARK, cr);
+ *hashp = '#';
+ if (error != 0)
+ break;
+ }
+ return (error);
+}
+
+/* ARGSUSED */
+static int
+zfs_secpolicy_remap(zfs_cmd_t *zc, nvlist_t *innvl, cred_t *cr)
+{
+ return (zfs_secpolicy_write_perms(zc->zc_name,
+ ZFS_DELEG_PERM_REMAP, cr));
+}
+
+/* ARGSUSED */
+static int
+zfs_secpolicy_destroy_bookmarks(zfs_cmd_t *zc, nvlist_t *innvl, cred_t *cr)
+{
+ nvpair_t *pair, *nextpair;
+ int error = 0;
+
+ for (pair = nvlist_next_nvpair(innvl, NULL); pair != NULL;
+ pair = nextpair) {
+ char *name = nvpair_name(pair);
+ char *hashp = strchr(name, '#');
+ nextpair = nvlist_next_nvpair(innvl, pair);
+
+ if (hashp == NULL) {
+ error = SET_ERROR(EINVAL);
+ break;
+ }
+
+ *hashp = '\0';
+ error = zfs_secpolicy_write_perms(name,
+ ZFS_DELEG_PERM_DESTROY, cr);
+ *hashp = '#';
+ if (error == ENOENT) {
+ /*
+ * Ignore any filesystems that don't exist (we consider
+ * their bookmarks "already destroyed"). Remove
+ * the name from the nvl here in case the filesystem
+ * is created between now and when we try to destroy
+ * the bookmark (in which case we don't want to
+ * destroy it since we haven't checked for permission).
+ */
+ fnvlist_remove_nvpair(innvl, pair);
+ error = 0;
+ }
+ if (error != 0)
+ break;
+ }
+
+ return (error);
+}
+
+/* ARGSUSED */
+static int
+zfs_secpolicy_log_history(zfs_cmd_t *zc, nvlist_t *innvl, cred_t *cr)
+{
+ /*
+ * Even root must have a proper TSD so that we know what pool
+ * to log to.
+ */
+ if (tsd_get(zfs_allow_log_key) == NULL)
+ return (SET_ERROR(EPERM));
+ return (0);
+}
+
+static int
+zfs_secpolicy_create_clone(zfs_cmd_t *zc, nvlist_t *innvl, cred_t *cr)
+{
+ char parentname[ZFS_MAX_DATASET_NAME_LEN];
+ int error;
+ char *origin;
+
+ if ((error = zfs_get_parent(zc->zc_name, parentname,
+ sizeof (parentname))) != 0)
+ return (error);
+
+ if (nvlist_lookup_string(innvl, "origin", &origin) == 0 &&
+ (error = zfs_secpolicy_write_perms(origin,
+ ZFS_DELEG_PERM_CLONE, cr)) != 0)
+ return (error);
+
+ if ((error = zfs_secpolicy_write_perms(parentname,
+ ZFS_DELEG_PERM_CREATE, cr)) != 0)
+ return (error);
+
+ return (zfs_secpolicy_write_perms(parentname,
+ ZFS_DELEG_PERM_MOUNT, cr));
+}
+
+/*
+ * Policy for pool operations - create/destroy pools, add vdevs, etc. Requires
+ * SYS_CONFIG privilege, which is not available in a local zone.
+ */
+/* ARGSUSED */
+static int
+zfs_secpolicy_config(zfs_cmd_t *zc, nvlist_t *innvl, cred_t *cr)
+{
+ if (secpolicy_sys_config(cr, B_FALSE) != 0)
+ return (SET_ERROR(EPERM));
+
+ return (0);
+}
+
+/*
+ * Policy for object to name lookups.
+ */
+/* ARGSUSED */
+static int
+zfs_secpolicy_diff(zfs_cmd_t *zc, nvlist_t *innvl, cred_t *cr)
+{
+ int error;
+
+ if ((error = secpolicy_sys_config(cr, B_FALSE)) == 0)
+ return (0);
+
+ error = zfs_secpolicy_write_perms(zc->zc_name, ZFS_DELEG_PERM_DIFF, cr);
+ return (error);
+}
+
+/*
+ * Policy for fault injection. Requires all privileges.
+ */
+/* ARGSUSED */
+static int
+zfs_secpolicy_inject(zfs_cmd_t *zc, nvlist_t *innvl, cred_t *cr)
+{
+ return (secpolicy_zinject(cr));
+}
+
+/* ARGSUSED */
+static int
+zfs_secpolicy_inherit_prop(zfs_cmd_t *zc, nvlist_t *innvl, cred_t *cr)
+{
+ zfs_prop_t prop = zfs_name_to_prop(zc->zc_value);
+
+ if (prop == ZPROP_INVAL) {
+ if (!zfs_prop_user(zc->zc_value))
+ return (SET_ERROR(EINVAL));
+ return (zfs_secpolicy_write_perms(zc->zc_name,
+ ZFS_DELEG_PERM_USERPROP, cr));
+ } else {
+ return (zfs_secpolicy_setprop(zc->zc_name, prop,
+ NULL, cr));
+ }
+}
+
+static int
+zfs_secpolicy_userspace_one(zfs_cmd_t *zc, nvlist_t *innvl, cred_t *cr)
+{
+ int err = zfs_secpolicy_read(zc, innvl, cr);
+ if (err)
+ return (err);
+
+ if (zc->zc_objset_type >= ZFS_NUM_USERQUOTA_PROPS)
+ return (SET_ERROR(EINVAL));
+
+ if (zc->zc_value[0] == 0) {
+ /*
+ * They are asking about a POSIX uid/gid. If it is
+ * their own, allow it.
+ */
+ if (zc->zc_objset_type == ZFS_PROP_USERUSED ||
+ zc->zc_objset_type == ZFS_PROP_USERQUOTA) {
+ if (zc->zc_guid == crgetuid(cr))
+ return (0);
+ } else {
+ if (groupmember(zc->zc_guid, cr))
+ return (0);
+ }
+ }
+
+ return (zfs_secpolicy_write_perms(zc->zc_name,
+ userquota_perms[zc->zc_objset_type], cr));
+}
+
+static int
+zfs_secpolicy_userspace_many(zfs_cmd_t *zc, nvlist_t *innvl, cred_t *cr)
+{
+ int err = zfs_secpolicy_read(zc, innvl, cr);
+ if (err)
+ return (err);
+
+ if (zc->zc_objset_type >= ZFS_NUM_USERQUOTA_PROPS)
+ return (SET_ERROR(EINVAL));
+
+ return (zfs_secpolicy_write_perms(zc->zc_name,
+ userquota_perms[zc->zc_objset_type], cr));
+}
+
+/* ARGSUSED */
+static int
+zfs_secpolicy_userspace_upgrade(zfs_cmd_t *zc, nvlist_t *innvl, cred_t *cr)
+{
+ return (zfs_secpolicy_setprop(zc->zc_name, ZFS_PROP_VERSION,
+ NULL, cr));
+}
+
+/* ARGSUSED */
+static int
+zfs_secpolicy_hold(zfs_cmd_t *zc, nvlist_t *innvl, cred_t *cr)
+{
+ nvpair_t *pair;
+ nvlist_t *holds;
+ int error;
+
+ holds = fnvlist_lookup_nvlist(innvl, "holds");
+
+ for (pair = nvlist_next_nvpair(holds, NULL); pair != NULL;
+ pair = nvlist_next_nvpair(holds, pair)) {
+ char fsname[ZFS_MAX_DATASET_NAME_LEN];
+ error = dmu_fsname(nvpair_name(pair), fsname);
+ if (error != 0)
+ return (error);
+ error = zfs_secpolicy_write_perms(fsname,
+ ZFS_DELEG_PERM_HOLD, cr);
+ if (error != 0)
+ return (error);
+ }
+ return (0);
+}
+
+/* ARGSUSED */
+static int
+zfs_secpolicy_release(zfs_cmd_t *zc, nvlist_t *innvl, cred_t *cr)
+{
+ nvpair_t *pair;
+ int error;
+
+ for (pair = nvlist_next_nvpair(innvl, NULL); pair != NULL;
+ pair = nvlist_next_nvpair(innvl, pair)) {
+ char fsname[ZFS_MAX_DATASET_NAME_LEN];
+ error = dmu_fsname(nvpair_name(pair), fsname);
+ if (error != 0)
+ return (error);
+ error = zfs_secpolicy_write_perms(fsname,
+ ZFS_DELEG_PERM_RELEASE, cr);
+ if (error != 0)
+ return (error);
+ }
+ return (0);
+}
+
+/*
+ * Policy for allowing temporary snapshots to be taken or released
+ */
+static int
+zfs_secpolicy_tmp_snapshot(zfs_cmd_t *zc, nvlist_t *innvl, cred_t *cr)
+{
+ /*
+ * A temporary snapshot is the same as a snapshot,
+ * hold, destroy and release all rolled into one.
+ * Delegated diff permission alone is sufficient to allow this.
+ */
+ int error;
+
+ if ((error = zfs_secpolicy_write_perms(zc->zc_name,
+ ZFS_DELEG_PERM_DIFF, cr)) == 0)
+ return (0);
+
+ error = zfs_secpolicy_snapshot_perms(zc->zc_name, cr);
+ if (innvl != NULL) {
+ if (error == 0)
+ error = zfs_secpolicy_hold(zc, innvl, cr);
+ if (error == 0)
+ error = zfs_secpolicy_release(zc, innvl, cr);
+ if (error == 0)
+ error = zfs_secpolicy_destroy(zc, innvl, cr);
+ }
+ return (error);
+}
+
+/*
+ * Returns the nvlist as specified by the user in the zfs_cmd_t.
+ */
+static int
+get_nvlist(uint64_t nvl, uint64_t size, int iflag, nvlist_t **nvp)
+{
+ char *packed;
+ int error;
+ nvlist_t *list = NULL;
+
+ /*
+ * Read in and unpack the user-supplied nvlist.
+ */
+ if (size == 0)
+ return (SET_ERROR(EINVAL));
+
+ packed = kmem_alloc(size, KM_SLEEP);
+
+ if ((error = ddi_copyin((void *)(uintptr_t)nvl, packed, size,
+ iflag)) != 0) {
+ kmem_free(packed, size);
+ return (SET_ERROR(EFAULT));
+ }
+
+ if ((error = nvlist_unpack(packed, size, &list, 0)) != 0) {
+ kmem_free(packed, size);
+ return (error);
+ }
+
+ kmem_free(packed, size);
+
+ *nvp = list;
+ return (0);
+}
+
+/*
+ * Reduce the size of this nvlist until it can be serialized in 'max' bytes.
+ * Entries will be removed from the end of the nvlist, and one int32 entry
+ * named "N_MORE_ERRORS" will be added indicating how many entries were
+ * removed.
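+ *
+ * For example (a sketch; the exact cut-off depends on entry sizes), an
+ * errors nvlist of 100 entries that serializes larger than 'max' might
+ * come back as its first 60 entries plus N_MORE_ERRORS=40.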
+ */
+static int
+nvlist_smush(nvlist_t *errors, size_t max)
+{
+ size_t size;
+
+ size = fnvlist_size(errors);
+
+ if (size > max) {
+ nvpair_t *more_errors;
+ int n = 0;
+
+ if (max < 1024)
+ return (SET_ERROR(ENOMEM));
+
+ fnvlist_add_int32(errors, ZPROP_N_MORE_ERRORS, 0);
+ more_errors = nvlist_prev_nvpair(errors, NULL);
+
+ do {
+ nvpair_t *pair = nvlist_prev_nvpair(errors,
+ more_errors);
+ fnvlist_remove_nvpair(errors, pair);
+ n++;
+ size = fnvlist_size(errors);
+ } while (size > max);
+
+ fnvlist_remove_nvpair(errors, more_errors);
+ fnvlist_add_int32(errors, ZPROP_N_MORE_ERRORS, n);
+ ASSERT3U(fnvlist_size(errors), <=, max);
+ }
+
+ return (0);
+}
+
+static int
+put_nvlist(zfs_cmd_t *zc, nvlist_t *nvl)
+{
+ char *packed = NULL;
+ int error = 0;
+ size_t size;
+
+ size = fnvlist_size(nvl);
+
+ if (size > zc->zc_nvlist_dst_size) {
+ /*
+ * Solaris returns ENOMEM here, because even if an error is
+ * returned from an ioctl(2), new zc_nvlist_dst_size will be
+ * passed to the userland. This is not the case for FreeBSD.
+ * We need to return 0, so the kernel will copy the
+ * zc_nvlist_dst_size back and the userland can discover that a
+ * bigger buffer is needed.
+ */
+ error = 0;
+ } else {
+ packed = fnvlist_pack(nvl, &size);
+ if (ddi_copyout(packed, (void *)(uintptr_t)zc->zc_nvlist_dst,
+ size, zc->zc_iflags) != 0)
+ error = SET_ERROR(EFAULT);
+ fnvlist_pack_free(packed, size);
+ }
+
+ zc->zc_nvlist_dst_size = size;
+ zc->zc_nvlist_dst_filled = B_TRUE;
+ return (error);
+}
+
+int
+getzfsvfs_impl(objset_t *os, vfs_t **vfsp)
+{
+ zfsvfs_t *zfvp;
+ int error = 0;
+
+ if (dmu_objset_type(os) != DMU_OST_ZFS) {
+ return (SET_ERROR(EINVAL));
+ }
+
+ mutex_enter(&os->os_user_ptr_lock);
+ zfvp = dmu_objset_get_user(os);
+ if (zfvp) {
+ *vfsp = zfvp->z_vfs;
+ vfs_ref(zfvp->z_vfs);
+ } else {
+ error = SET_ERROR(ESRCH);
+ }
+ mutex_exit(&os->os_user_ptr_lock);
+ return (error);
+}
+
+int
+getzfsvfs(const char *dsname, zfsvfs_t **zfvp)
+{
+ objset_t *os;
+ vfs_t *vfsp;
+ int error;
+
+ error = dmu_objset_hold(dsname, FTAG, &os);
+ if (error != 0)
+ return (error);
+ error = getzfsvfs_impl(os, &vfsp);
+ dmu_objset_rele(os, FTAG);
+ if (error != 0)
+ return (error);
+
+ error = vfs_busy(vfsp, 0);
+ vfs_rel(vfsp);
+ if (error != 0) {
+ *zfvp = NULL;
+ error = SET_ERROR(ESRCH);
+ } else {
+ *zfvp = vfsp->vfs_data;
+ }
+ return (error);
+}
+
+/*
+ * Find a zfsvfs_t for a mounted filesystem, or create our own, in which
+ * case its z_vfs will be NULL, and it will be opened as the owner.
+ * If 'writer' is set, the z_teardown_lock will be held for RW_WRITER,
+ * which prevents all vnode ops from running.
+ */
+static int
+zfsvfs_hold(const char *name, void *tag, zfsvfs_t **zfvp, boolean_t writer)
+{
+ int error = 0;
+
+ if (getzfsvfs(name, zfvp) != 0)
+ error = zfsvfs_create(name, zfvp);
+ if (error == 0) {
+ rrm_enter(&(*zfvp)->z_teardown_lock, (writer) ? RW_WRITER :
+ RW_READER, tag);
+#ifdef illumos
+ if ((*zfvp)->z_unmounted) {
+ /*
+ * XXX we could probably try again, since the unmounting
+ * thread should be just about to disassociate the
+ * objset from the zfsvfs.
+ */
+ rrm_exit(&(*zfvp)->z_teardown_lock, tag);
+ return (SET_ERROR(EBUSY));
+ }
+#else
+ /*
+ * vfs_busy() ensures that the filesystem is not and
+ * can not be unmounted.
+ */
+ ASSERT(!(*zfvp)->z_unmounted);
+#endif
+ }
+ return (error);
+}
+
+static void
+zfsvfs_rele(zfsvfs_t *zfsvfs, void *tag)
+{
+ rrm_exit(&zfsvfs->z_teardown_lock, tag);
+
+ if (zfsvfs->z_vfs) {
+#ifdef illumos
+ VFS_RELE(zfsvfs->z_vfs);
+#else
+ vfs_unbusy(zfsvfs->z_vfs);
+#endif
+ } else {
+ dmu_objset_disown(zfsvfs->z_os, zfsvfs);
+ zfsvfs_free(zfsvfs);
+ }
+}
+
+static int
+zfs_ioc_pool_create(zfs_cmd_t *zc)
+{
+ int error;
+ nvlist_t *config, *props = NULL;
+ nvlist_t *rootprops = NULL;
+ nvlist_t *zplprops = NULL;
+ char *spa_name = zc->zc_name;
+
+ if (error = get_nvlist(zc->zc_nvlist_conf, zc->zc_nvlist_conf_size,
+ zc->zc_iflags, &config))
+ return (error);
+
+ if (zc->zc_nvlist_src_size != 0 && (error =
+ get_nvlist(zc->zc_nvlist_src, zc->zc_nvlist_src_size,
+ zc->zc_iflags, &props))) {
+ nvlist_free(config);
+ return (error);
+ }
+
+ if (props) {
+ nvlist_t *nvl = NULL;
+ uint64_t version = SPA_VERSION;
+ char *tname;
+
+ (void) nvlist_lookup_uint64(props,
+ zpool_prop_to_name(ZPOOL_PROP_VERSION), &version);
+ if (!SPA_VERSION_IS_SUPPORTED(version)) {
+ error = SET_ERROR(EINVAL);
+ goto pool_props_bad;
+ }
+ (void) nvlist_lookup_nvlist(props, ZPOOL_ROOTFS_PROPS, &nvl);
+ if (nvl) {
+ error = nvlist_dup(nvl, &rootprops, KM_SLEEP);
+ if (error != 0) {
+ nvlist_free(config);
+ nvlist_free(props);
+ return (error);
+ }
+ (void) nvlist_remove_all(props, ZPOOL_ROOTFS_PROPS);
+ }
+ VERIFY(nvlist_alloc(&zplprops, NV_UNIQUE_NAME, KM_SLEEP) == 0);
+ error = zfs_fill_zplprops_root(version, rootprops,
+ zplprops, NULL);
+ if (error != 0)
+ goto pool_props_bad;
+
+ if (nvlist_lookup_string(props,
+ zpool_prop_to_name(ZPOOL_PROP_TNAME), &tname) == 0)
+ spa_name = tname;
+ }
+
+ error = spa_create(zc->zc_name, config, props, zplprops);
+
+ /*
+ * Set the remaining root properties
+ */
+ if (!error && (error = zfs_set_prop_nvlist(spa_name,
+ ZPROP_SRC_LOCAL, rootprops, NULL)) != 0)
+ (void) spa_destroy(spa_name);
+
+pool_props_bad:
+ nvlist_free(rootprops);
+ nvlist_free(zplprops);
+ nvlist_free(config);
+ nvlist_free(props);
+
+ return (error);
+}
+
+static int
+zfs_ioc_pool_destroy(zfs_cmd_t *zc)
+{
+ int error;
+ zfs_log_history(zc);
+ error = spa_destroy(zc->zc_name);
+#ifndef __FreeBSD__
+ if (error == 0)
+ zvol_remove_minors(zc->zc_name);
+#endif
+ return (error);
+}
+
+static int
+zfs_ioc_pool_import(zfs_cmd_t *zc)
+{
+ nvlist_t *config, *props = NULL;
+ uint64_t guid;
+ int error;
+
+ if ((error = get_nvlist(zc->zc_nvlist_conf, zc->zc_nvlist_conf_size,
+ zc->zc_iflags, &config)) != 0)
+ return (error);
+
+ if (zc->zc_nvlist_src_size != 0 && (error =
+ get_nvlist(zc->zc_nvlist_src, zc->zc_nvlist_src_size,
+ zc->zc_iflags, &props))) {
+ nvlist_free(config);
+ return (error);
+ }
+
+ if (nvlist_lookup_uint64(config, ZPOOL_CONFIG_POOL_GUID, &guid) != 0 ||
+ guid != zc->zc_guid)
+ error = SET_ERROR(EINVAL);
+ else
+ error = spa_import(zc->zc_name, config, props, zc->zc_cookie);
+
+ if (zc->zc_nvlist_dst != 0) {
+ int err;
+
+ if ((err = put_nvlist(zc, config)) != 0)
+ error = err;
+ }
+
+ nvlist_free(config);
+
+ nvlist_free(props);
+
+ return (error);
+}
+
+static int
+zfs_ioc_pool_export(zfs_cmd_t *zc)
+{
+ int error;
+ boolean_t force = (boolean_t)zc->zc_cookie;
+ boolean_t hardforce = (boolean_t)zc->zc_guid;
+
+ zfs_log_history(zc);
+ error = spa_export(zc->zc_name, NULL, force, hardforce);
+#ifndef __FreeBSD__
+ if (error == 0)
+ zvol_remove_minors(zc->zc_name);
+#endif
+ return (error);
+}
+
+static int
+zfs_ioc_pool_configs(zfs_cmd_t *zc)
+{
+ nvlist_t *configs;
+ int error;
+
+ if ((configs = spa_all_configs(&zc->zc_cookie)) == NULL)
+ return (SET_ERROR(EEXIST));
+
+ error = put_nvlist(zc, configs);
+
+ nvlist_free(configs);
+
+ return (error);
+}
+
+/*
+ * inputs:
+ * zc_name name of the pool
+ *
+ * outputs:
+ * zc_cookie real errno
+ * zc_nvlist_dst config nvlist
+ * zc_nvlist_dst_size size of config nvlist
+ */
+static int
+zfs_ioc_pool_stats(zfs_cmd_t *zc)
+{
+ nvlist_t *config;
+ int error;
+ int ret = 0;
+
+ error = spa_get_stats(zc->zc_name, &config, zc->zc_value,
+ sizeof (zc->zc_value));
+
+ if (config != NULL) {
+ ret = put_nvlist(zc, config);
+ nvlist_free(config);
+
+ /*
+ * The config may be present even if 'error' is non-zero.
+ * In this case we return success, and preserve the real errno
+ * in 'zc_cookie'.
+ */
+ zc->zc_cookie = error;
+ } else {
+ ret = error;
+ }
+
+ return (ret);
+}
+
+/*
+ * Try to import the given pool, returning pool stats as appropriate so that
+ * userland knows which devices are available and overall pool health.
+ */
+static int
+zfs_ioc_pool_tryimport(zfs_cmd_t *zc)
+{
+ nvlist_t *tryconfig, *config;
+ int error;
+
+ if ((error = get_nvlist(zc->zc_nvlist_conf, zc->zc_nvlist_conf_size,
+ zc->zc_iflags, &tryconfig)) != 0)
+ return (error);
+
+ config = spa_tryimport(tryconfig);
+
+ nvlist_free(tryconfig);
+
+ if (config == NULL)
+ return (SET_ERROR(EINVAL));
+
+ error = put_nvlist(zc, config);
+ nvlist_free(config);
+
+ return (error);
+}
+
+/*
+ * inputs:
+ * zc_name name of the pool
+ * zc_cookie scan func (pool_scan_func_t)
+ * zc_flags scrub pause/resume flag (pool_scrub_cmd_t)
+ */
+static int
+zfs_ioc_pool_scan(zfs_cmd_t *zc)
+{
+ spa_t *spa;
+ int error;
+
+ if (zc->zc_flags >= POOL_SCRUB_FLAGS_END)
+ return (SET_ERROR(EINVAL));
+
+ if ((error = spa_open(zc->zc_name, &spa, FTAG)) != 0)
+ return (error);
+
+ if (zc->zc_flags == POOL_SCRUB_PAUSE)
+ error = spa_scrub_pause_resume(spa, POOL_SCRUB_PAUSE);
+ else if (zc->zc_cookie == POOL_SCAN_NONE)
+ error = spa_scan_stop(spa);
+ else
+ error = spa_scan(spa, zc->zc_cookie);
+
+ spa_close(spa, FTAG);
+
+ return (error);
+}
+
+static int
+zfs_ioc_pool_freeze(zfs_cmd_t *zc)
+{
+ spa_t *spa;
+ int error;
+
+ error = spa_open(zc->zc_name, &spa, FTAG);
+ if (error == 0) {
+ spa_freeze(spa);
+ spa_close(spa, FTAG);
+ }
+ return (error);
+}
+
+static int
+zfs_ioc_pool_upgrade(zfs_cmd_t *zc)
+{
+ spa_t *spa;
+ int error;
+
+ if ((error = spa_open(zc->zc_name, &spa, FTAG)) != 0)
+ return (error);
+
+ if (zc->zc_cookie < spa_version(spa) ||
+ !SPA_VERSION_IS_SUPPORTED(zc->zc_cookie)) {
+ spa_close(spa, FTAG);
+ return (SET_ERROR(EINVAL));
+ }
+
+ spa_upgrade(spa, zc->zc_cookie);
+ spa_close(spa, FTAG);
+
+ return (error);
+}
+
+static int
+zfs_ioc_pool_get_history(zfs_cmd_t *zc)
+{
+ spa_t *spa;
+ char *hist_buf;
+ uint64_t size;
+ int error;
+
+ if ((size = zc->zc_history_len) == 0)
+ return (SET_ERROR(EINVAL));
+
+ if ((error = spa_open(zc->zc_name, &spa, FTAG)) != 0)
+ return (error);
+
+ if (spa_version(spa) < SPA_VERSION_ZPOOL_HISTORY) {
+ spa_close(spa, FTAG);
+ return (SET_ERROR(ENOTSUP));
+ }
+
+ hist_buf = kmem_alloc(size, KM_SLEEP);
+ if ((error = spa_history_get(spa, &zc->zc_history_offset,
+ &zc->zc_history_len, hist_buf)) == 0) {
+ error = ddi_copyout(hist_buf,
+ (void *)(uintptr_t)zc->zc_history,
+ zc->zc_history_len, zc->zc_iflags);
+ }
+
+ spa_close(spa, FTAG);
+ kmem_free(hist_buf, size);
+ return (error);
+}
+
+static int
+zfs_ioc_pool_reguid(zfs_cmd_t *zc)
+{
+ spa_t *spa;
+ int error;
+
+ error = spa_open(zc->zc_name, &spa, FTAG);
+ if (error == 0) {
+ error = spa_change_guid(spa);
+ spa_close(spa, FTAG);
+ }
+ return (error);
+}
+
+static int
+zfs_ioc_dsobj_to_dsname(zfs_cmd_t *zc)
+{
+ return (dsl_dsobj_to_dsname(zc->zc_name, zc->zc_obj, zc->zc_value));
+}
+
+/*
+ * inputs:
+ * zc_name name of filesystem
+ * zc_obj object to find
+ *
+ * outputs:
+ * zc_value name of object
+ */
+static int
+zfs_ioc_obj_to_path(zfs_cmd_t *zc)
+{
+ objset_t *os;
+ int error;
+
+ /* XXX reading from objset not owned */
+ if ((error = dmu_objset_hold(zc->zc_name, FTAG, &os)) != 0)
+ return (error);
+ if (dmu_objset_type(os) != DMU_OST_ZFS) {
+ dmu_objset_rele(os, FTAG);
+ return (SET_ERROR(EINVAL));
+ }
+ error = zfs_obj_to_path(os, zc->zc_obj, zc->zc_value,
+ sizeof (zc->zc_value));
+ dmu_objset_rele(os, FTAG);
+
+ return (error);
+}
+
+/*
+ * inputs:
+ * zc_name name of filesystem
+ * zc_obj object to find
+ *
+ * outputs:
+ * zc_stat stats on object
+ * zc_value path to object
+ */
+static int
+zfs_ioc_obj_to_stats(zfs_cmd_t *zc)
+{
+ objset_t *os;
+ int error;
+
+ /* XXX reading from objset not owned */
+ if ((error = dmu_objset_hold(zc->zc_name, FTAG, &os)) != 0)
+ return (error);
+ if (dmu_objset_type(os) != DMU_OST_ZFS) {
+ dmu_objset_rele(os, FTAG);
+ return (SET_ERROR(EINVAL));
+ }
+ error = zfs_obj_to_stats(os, zc->zc_obj, &zc->zc_stat, zc->zc_value,
+ sizeof (zc->zc_value));
+ dmu_objset_rele(os, FTAG);
+
+ return (error);
+}
+
+static int
+zfs_ioc_vdev_add(zfs_cmd_t *zc)
+{
+ spa_t *spa;
+ int error;
+ nvlist_t *config, **l2cache, **spares;
+ uint_t nl2cache = 0, nspares = 0;
+
+ error = spa_open(zc->zc_name, &spa, FTAG);
+ if (error != 0)
+ return (error);
+
+ error = get_nvlist(zc->zc_nvlist_conf, zc->zc_nvlist_conf_size,
+ zc->zc_iflags, &config);
+ if (error != 0) {
+ spa_close(spa, FTAG);
+ return (error);
+ }
+
+ (void) nvlist_lookup_nvlist_array(config, ZPOOL_CONFIG_L2CACHE,
+ &l2cache, &nl2cache);
+
+ (void) nvlist_lookup_nvlist_array(config, ZPOOL_CONFIG_SPARES,
+ &spares, &nspares);
+
+#ifdef illumos
+ /*
+ * A root pool with concatenated devices is not supported.
+ * Thus, a device cannot be added to a root pool.
+ *
+ * An intent log device cannot be added to a root pool because
+ * the ZIL is replayed during mountroot, and a separate log
+ * device cannot be accessed at that time.
+ *
+ * l2cache and spare devices are ok to be added to a root pool.
+ */
+ if (spa_bootfs(spa) != 0 && nl2cache == 0 && nspares == 0) {
+ nvlist_free(config);
+ spa_close(spa, FTAG);
+ return (SET_ERROR(EDOM));
+ }
+#endif /* illumos */
+
+ error = spa_vdev_add(spa, config);
+ nvlist_free(config);
+ spa_close(spa, FTAG);
+ return (error);
+}
+
+/*
+ * inputs:
+ * zc_name name of the pool
+ * zc_guid guid of vdev to remove
+ * zc_cookie cancel removal
+ */
+static int
+zfs_ioc_vdev_remove(zfs_cmd_t *zc)
+{
+ spa_t *spa;
+ int error;
+
+ error = spa_open(zc->zc_name, &spa, FTAG);
+ if (error != 0)
+ return (error);
+ if (zc->zc_cookie != 0) {
+ error = spa_vdev_remove_cancel(spa);
+ } else {
+ error = spa_vdev_remove(spa, zc->zc_guid, B_FALSE);
+ }
+ spa_close(spa, FTAG);
+ return (error);
+}
+
+static int
+zfs_ioc_vdev_set_state(zfs_cmd_t *zc)
+{
+ spa_t *spa;
+ int error;
+ vdev_state_t newstate = VDEV_STATE_UNKNOWN;
+
+ if ((error = spa_open(zc->zc_name, &spa, FTAG)) != 0)
+ return (error);
+ switch (zc->zc_cookie) {
+ case VDEV_STATE_ONLINE:
+ error = vdev_online(spa, zc->zc_guid, zc->zc_obj, &newstate);
+ break;
+
+ case VDEV_STATE_OFFLINE:
+ error = vdev_offline(spa, zc->zc_guid, zc->zc_obj);
+ break;
+
+ case VDEV_STATE_FAULTED:
+ if (zc->zc_obj != VDEV_AUX_ERR_EXCEEDED &&
+ zc->zc_obj != VDEV_AUX_EXTERNAL)
+ zc->zc_obj = VDEV_AUX_ERR_EXCEEDED;
+
+ error = vdev_fault(spa, zc->zc_guid, zc->zc_obj);
+ break;
+
+ case VDEV_STATE_DEGRADED:
+ if (zc->zc_obj != VDEV_AUX_ERR_EXCEEDED &&
+ zc->zc_obj != VDEV_AUX_EXTERNAL)
+ zc->zc_obj = VDEV_AUX_ERR_EXCEEDED;
+
+ error = vdev_degrade(spa, zc->zc_guid, zc->zc_obj);
+ break;
+
+ default:
+ error = SET_ERROR(EINVAL);
+ }
+ zc->zc_cookie = newstate;
+ spa_close(spa, FTAG);
+ return (error);
+}
+
+static int
+zfs_ioc_vdev_attach(zfs_cmd_t *zc)
+{
+ spa_t *spa;
+ int replacing = zc->zc_cookie;
+ nvlist_t *config;
+ int error;
+
+ if ((error = spa_open(zc->zc_name, &spa, FTAG)) != 0)
+ return (error);
+
+ if ((error = get_nvlist(zc->zc_nvlist_conf, zc->zc_nvlist_conf_size,
+ zc->zc_iflags, &config)) == 0) {
+ error = spa_vdev_attach(spa, zc->zc_guid, config, replacing);
+ nvlist_free(config);
+ }
+
+ spa_close(spa, FTAG);
+ return (error);
+}
+
+static int
+zfs_ioc_vdev_detach(zfs_cmd_t *zc)
+{
+ spa_t *spa;
+ int error;
+
+ if ((error = spa_open(zc->zc_name, &spa, FTAG)) != 0)
+ return (error);
+
+ error = spa_vdev_detach(spa, zc->zc_guid, 0, B_FALSE);
+
+ spa_close(spa, FTAG);
+ return (error);
+}
+
+static int
+zfs_ioc_vdev_split(zfs_cmd_t *zc)
+{
+ spa_t *spa;
+ nvlist_t *config, *props = NULL;
+ int error;
+ boolean_t exp = !!(zc->zc_cookie & ZPOOL_EXPORT_AFTER_SPLIT);
+
+ if ((error = spa_open(zc->zc_name, &spa, FTAG)) != 0)
+ return (error);
+
+ if (error = get_nvlist(zc->zc_nvlist_conf, zc->zc_nvlist_conf_size,
+ zc->zc_iflags, &config)) {
+ spa_close(spa, FTAG);
+ return (error);
+ }
+
+ if (zc->zc_nvlist_src_size != 0 && (error =
+ get_nvlist(zc->zc_nvlist_src, zc->zc_nvlist_src_size,
+ zc->zc_iflags, &props))) {
+ spa_close(spa, FTAG);
+ nvlist_free(config);
+ return (error);
+ }
+
+ error = spa_vdev_split_mirror(spa, zc->zc_string, config, props, exp);
+
+ spa_close(spa, FTAG);
+
+ nvlist_free(config);
+ nvlist_free(props);
+
+ return (error);
+}
+
+static int
+zfs_ioc_vdev_setpath(zfs_cmd_t *zc)
+{
+ spa_t *spa;
+ char *path = zc->zc_value;
+ uint64_t guid = zc->zc_guid;
+ int error;
+
+ error = spa_open(zc->zc_name, &spa, FTAG);
+ if (error != 0)
+ return (error);
+
+ error = spa_vdev_setpath(spa, guid, path);
+ spa_close(spa, FTAG);
+ return (error);
+}
+
+static int
+zfs_ioc_vdev_setfru(zfs_cmd_t *zc)
+{
+ spa_t *spa;
+ char *fru = zc->zc_value;
+ uint64_t guid = zc->zc_guid;
+ int error;
+
+ error = spa_open(zc->zc_name, &spa, FTAG);
+ if (error != 0)
+ return (error);
+
+ error = spa_vdev_setfru(spa, guid, fru);
+ spa_close(spa, FTAG);
+ return (error);
+}
+
+static int
+zfs_ioc_objset_stats_impl(zfs_cmd_t *zc, objset_t *os)
+{
+ int error = 0;
+ nvlist_t *nv;
+
+ dmu_objset_fast_stat(os, &zc->zc_objset_stats);
+
+ if (zc->zc_nvlist_dst != 0 &&
+ (error = dsl_prop_get_all(os, &nv)) == 0) {
+ dmu_objset_stats(os, nv);
+ /*
+ * NB: zvol_get_stats() will read the objset contents,
+ * which we aren't supposed to do with a
+ * DS_MODE_USER hold, because it could be
+ * inconsistent. So this is a bit of a workaround...
+ * XXX reading without owning
+ */
+ if (!zc->zc_objset_stats.dds_inconsistent &&
+ dmu_objset_type(os) == DMU_OST_ZVOL) {
+ error = zvol_get_stats(os, nv);
+ if (error == EIO)
+ return (error);
+ VERIFY0(error);
+ }
+ error = put_nvlist(zc, nv);
+ nvlist_free(nv);
+ }
+
+ return (error);
+}
+
+/*
+ * inputs:
+ * zc_name name of filesystem
+ * zc_nvlist_dst_size size of buffer for property nvlist
+ *
+ * outputs:
+ * zc_objset_stats stats
+ * zc_nvlist_dst property nvlist
+ * zc_nvlist_dst_size size of property nvlist
+ */
+static int
+zfs_ioc_objset_stats(zfs_cmd_t *zc)
+{
+ objset_t *os;
+ int error;
+
+ error = dmu_objset_hold(zc->zc_name, FTAG, &os);
+ if (error == 0) {
+ error = zfs_ioc_objset_stats_impl(zc, os);
+ dmu_objset_rele(os, FTAG);
+ }
+
+ if (error == ENOMEM)
+ error = 0;
+ return (error);
+}
+
+/*
+ * inputs:
+ * zc_name name of filesystem
+ * zc_nvlist_dst_size size of buffer for property nvlist
+ *
+ * outputs:
+ * zc_nvlist_dst received property nvlist
+ * zc_nvlist_dst_size size of received property nvlist
+ *
+ * Gets received properties (distinct from local properties on or after
+ * SPA_VERSION_RECVD_PROPS) for callers who want to differentiate received from
+ * local property values.
+ */
+static int
+zfs_ioc_objset_recvd_props(zfs_cmd_t *zc)
+{
+ int error = 0;
+ nvlist_t *nv;
+
+ /*
+ * Without this check, we would return local property values if the
+ * caller has not already received properties on or after
+ * SPA_VERSION_RECVD_PROPS.
+ */
+ if (!dsl_prop_get_hasrecvd(zc->zc_name))
+ return (SET_ERROR(ENOTSUP));
+
+ if (zc->zc_nvlist_dst != 0 &&
+ (error = dsl_prop_get_received(zc->zc_name, &nv)) == 0) {
+ error = put_nvlist(zc, nv);
+ nvlist_free(nv);
+ }
+
+ return (error);
+}
+
+static int
+nvl_add_zplprop(objset_t *os, nvlist_t *props, zfs_prop_t prop)
+{
+ uint64_t value;
+ int error;
+
+ /*
+ * zfs_get_zplprop() will either find a value or give us
+ * the default value (if there is one).
+ */
+ if ((error = zfs_get_zplprop(os, prop, &value)) != 0)
+ return (error);
+ VERIFY(nvlist_add_uint64(props, zfs_prop_to_name(prop), value) == 0);
+ return (0);
+}
+
+/*
+ * inputs:
+ * zc_name name of filesystem
+ * zc_nvlist_dst_size size of buffer for zpl property nvlist
+ *
+ * outputs:
+ * zc_nvlist_dst zpl property nvlist
+ * zc_nvlist_dst_size size of zpl property nvlist
+ */
+static int
+zfs_ioc_objset_zplprops(zfs_cmd_t *zc)
+{
+ objset_t *os;
+ int err;
+
+ /* XXX reading without owning */
+ if (err = dmu_objset_hold(zc->zc_name, FTAG, &os))
+ return (err);
+
+ dmu_objset_fast_stat(os, &zc->zc_objset_stats);
+
+ /*
+ * NB: nvl_add_zplprop() will read the objset contents,
+ * which we aren't supposed to do with a DS_MODE_USER
+ * hold, because it could be inconsistent.
+ */
+ if (zc->zc_nvlist_dst != 0 &&
+ !zc->zc_objset_stats.dds_inconsistent &&
+ dmu_objset_type(os) == DMU_OST_ZFS) {
+ nvlist_t *nv;
+
+ VERIFY(nvlist_alloc(&nv, NV_UNIQUE_NAME, KM_SLEEP) == 0);
+ if ((err = nvl_add_zplprop(os, nv, ZFS_PROP_VERSION)) == 0 &&
+ (err = nvl_add_zplprop(os, nv, ZFS_PROP_NORMALIZE)) == 0 &&
+ (err = nvl_add_zplprop(os, nv, ZFS_PROP_UTF8ONLY)) == 0 &&
+ (err = nvl_add_zplprop(os, nv, ZFS_PROP_CASE)) == 0)
+ err = put_nvlist(zc, nv);
+ nvlist_free(nv);
+ } else {
+ err = SET_ERROR(ENOENT);
+ }
+ dmu_objset_rele(os, FTAG);
+ return (err);
+}
+
+boolean_t
+dataset_name_hidden(const char *name)
+{
+ /*
+ * Skip over datasets that are not visible in this zone,
+ * internal datasets (which have a $ in their name), and
+ * temporary datasets (which have a % in their name).
+ */
+ if (strchr(name, '$') != NULL)
+ return (B_TRUE);
+ if (strchr(name, '%') != NULL)
+ return (B_TRUE);
+ if (!INGLOBALZONE(curthread) && !zone_dataset_visible(name, NULL))
+ return (B_TRUE);
+ return (B_FALSE);
+}
+
+/*
+ * inputs:
+ * zc_name name of filesystem
+ * zc_cookie zap cursor
+ * zc_nvlist_src iteration range nvlist
+ * zc_nvlist_src_size size of iteration range nvlist
+ *
+ * outputs:
+ * zc_name name of next filesystem
+ * zc_cookie zap cursor
+ * zc_objset_stats stats
+ * zc_nvlist_dst property nvlist
+ * zc_nvlist_dst_size size of property nvlist
+ */
+static int
+zfs_ioc_dataset_list_next(zfs_cmd_t *zc)
+{
+ objset_t *os;
+ int error;
+ char *p;
+ size_t orig_len = strlen(zc->zc_name);
+
+top:
+ if (error = dmu_objset_hold(zc->zc_name, FTAG, &os)) {
+ if (error == ENOENT)
+ error = SET_ERROR(ESRCH);
+ return (error);
+ }
+
+ p = strrchr(zc->zc_name, '/');
+ if (p == NULL || p[1] != '\0')
+ (void) strlcat(zc->zc_name, "/", sizeof (zc->zc_name));
+ p = zc->zc_name + strlen(zc->zc_name);
+
+ do {
+ error = dmu_dir_list_next(os,
+ sizeof (zc->zc_name) - (p - zc->zc_name), p,
+ NULL, &zc->zc_cookie);
+ if (error == ENOENT)
+ error = SET_ERROR(ESRCH);
+ } while (error == 0 && dataset_name_hidden(zc->zc_name));
+ dmu_objset_rele(os, FTAG);
+
+ /*
+ * If it's an internal dataset (ie. with a '$' in its name),
+ * don't try to get stats for it, otherwise we'll return ENOENT.
+ */
+ if (error == 0 && strchr(zc->zc_name, '$') == NULL) {
+ error = zfs_ioc_objset_stats(zc); /* fill in the stats */
+ if (error == ENOENT) {
+ /* We lost a race with destroy, get the next one. */
+ zc->zc_name[orig_len] = '\0';
+ goto top;
+ }
+ }
+ return (error);
+}
+
+/*
+ * inputs:
+ * zc_name name of filesystem
+ * zc_cookie zap cursor
+ * zc_nvlist_dst_size size of buffer for property nvlist
+ * zc_simple when set, only name is requested
+ *
+ * outputs:
+ * zc_name name of next snapshot
+ * zc_objset_stats stats
+ * zc_nvlist_dst property nvlist
+ * zc_nvlist_dst_size size of property nvlist
+ */
+static int
+zfs_ioc_snapshot_list_next(zfs_cmd_t *zc)
+{
+ int error;
+ objset_t *os, *ossnap;
+ dsl_dataset_t *ds;
+ uint64_t min_txg = 0, max_txg = 0;
+
+ if (zc->zc_nvlist_src_size != 0) {
+ nvlist_t *props = NULL;
+ error = get_nvlist(zc->zc_nvlist_src, zc->zc_nvlist_src_size,
+ zc->zc_iflags, &props);
+ if (error != 0)
+ return (error);
+ (void) nvlist_lookup_uint64(props, SNAP_ITER_MIN_TXG,
+ &min_txg);
+ (void) nvlist_lookup_uint64(props, SNAP_ITER_MAX_TXG,
+ &max_txg);
+ nvlist_free(props);
+ }
+
+ error = dmu_objset_hold(zc->zc_name, FTAG, &os);
+ if (error != 0) {
+ return (error == ENOENT ? SET_ERROR(ESRCH) : error);
+ }
+
+ /*
+ * A dataset name of maximum length cannot have any snapshots,
+ * so exit immediately.
+ */
+ if (strlcat(zc->zc_name, "@", sizeof (zc->zc_name)) >=
+ ZFS_MAX_DATASET_NAME_LEN) {
+ dmu_objset_rele(os, FTAG);
+ return (SET_ERROR(ESRCH));
+ }
+
+ while (error == 0) {
+ if (issig(JUSTLOOKING) && issig(FORREAL)) {
+ error = SET_ERROR(EINTR);
+ break;
+ }
+
+ error = dmu_snapshot_list_next(os,
+ sizeof (zc->zc_name) - strlen(zc->zc_name),
+ zc->zc_name + strlen(zc->zc_name), &zc->zc_obj,
+ &zc->zc_cookie, NULL);
+ if (error == ENOENT) {
+ error = SET_ERROR(ESRCH);
+ break;
+ } else if (error != 0) {
+ break;
+ }
+
+ error = dsl_dataset_hold_obj(dmu_objset_pool(os), zc->zc_obj,
+ FTAG, &ds);
+ if (error != 0)
+ break;
+
+ if ((min_txg != 0 && dsl_get_creationtxg(ds) < min_txg) ||
+ (max_txg != 0 && dsl_get_creationtxg(ds) > max_txg)) {
+ dsl_dataset_rele(ds, FTAG);
+ /* undo snapshot name append */
+ *(strchr(zc->zc_name, '@') + 1) = '\0';
+ /* skip snapshot */
+ continue;
+ }
+
+ if (zc->zc_simple) {
+ dsl_dataset_rele(ds, FTAG);
+ break;
+ }
+
+ if ((error = dmu_objset_from_ds(ds, &ossnap)) != 0) {
+ dsl_dataset_rele(ds, FTAG);
+ break;
+ }
+ if ((error = zfs_ioc_objset_stats_impl(zc, ossnap)) != 0) {
+ dsl_dataset_rele(ds, FTAG);
+ break;
+ }
+ dsl_dataset_rele(ds, FTAG);
+ break;
+ }
+
+ dmu_objset_rele(os, FTAG);
+ /* if we failed, undo the @ that we tacked on to zc_name */
+ if (error != 0)
+ *strchr(zc->zc_name, '@') = '\0';
+ return (error);
+}
+
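+/*
+ * Apply a single {user,group}quota@... property.  The nvpair value is a
+ * uint64 array of { type, rid, quota }, and the portion of the property
+ * name after the '-' names the domain.
+ */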
+static int
+zfs_prop_set_userquota(const char *dsname, nvpair_t *pair)
+{
+ const char *propname = nvpair_name(pair);
+ uint64_t *valary;
+ unsigned int vallen;
+ const char *domain;
+ char *dash;
+ zfs_userquota_prop_t type;
+ uint64_t rid;
+ uint64_t quota;
+ zfsvfs_t *zfsvfs;
+ int err;
+
+ if (nvpair_type(pair) == DATA_TYPE_NVLIST) {
+ nvlist_t *attrs;
+ VERIFY(nvpair_value_nvlist(pair, &attrs) == 0);
+ if (nvlist_lookup_nvpair(attrs, ZPROP_VALUE,
+ &pair) != 0)
+ return (SET_ERROR(EINVAL));
+ }
+
+ /*
+ * A correctly constructed propname is encoded as
+ * userquota@<rid>-<domain>.
+ */
+ if ((dash = strchr(propname, '-')) == NULL ||
+ nvpair_value_uint64_array(pair, &valary, &vallen) != 0 ||
+ vallen != 3)
+ return (SET_ERROR(EINVAL));
+
+ domain = dash + 1;
+ type = valary[0];
+ rid = valary[1];
+ quota = valary[2];
+
+ err = zfsvfs_hold(dsname, FTAG, &zfsvfs, B_FALSE);
+ if (err == 0) {
+ err = zfs_set_userquota(zfsvfs, type, domain, rid, quota);
+ zfsvfs_rele(zfsvfs, FTAG);
+ }
+
+ return (err);
+}
+
+/*
+ * If the named property is one that has a special function to set its value,
+ * return 0 on success and a positive error code on failure; if it is not one
+ * of the special properties handled by this function, return -1.
+ *
+ * XXX: It would be better for callers of the property interface if we handled
+ * these special cases in dsl_prop.c (in the dsl layer).
+ */
+static int
+zfs_prop_set_special(const char *dsname, zprop_source_t source,
+ nvpair_t *pair)
+{
+ const char *propname = nvpair_name(pair);
+ zfs_prop_t prop = zfs_name_to_prop(propname);
+ uint64_t intval;
+ int err = -1;
+
+ if (prop == ZPROP_INVAL) {
+ if (zfs_prop_userquota(propname))
+ return (zfs_prop_set_userquota(dsname, pair));
+ return (-1);
+ }
+
+ if (nvpair_type(pair) == DATA_TYPE_NVLIST) {
+ nvlist_t *attrs;
+ VERIFY(nvpair_value_nvlist(pair, &attrs) == 0);
+ VERIFY(nvlist_lookup_nvpair(attrs, ZPROP_VALUE,
+ &pair) == 0);
+ }
+
+ if (zfs_prop_get_type(prop) == PROP_TYPE_STRING)
+ return (-1);
+
+	VERIFY(nvpair_value_uint64(pair, &intval) == 0);
+
+ switch (prop) {
+ case ZFS_PROP_QUOTA:
+ err = dsl_dir_set_quota(dsname, source, intval);
+ break;
+ case ZFS_PROP_REFQUOTA:
+ err = dsl_dataset_set_refquota(dsname, source, intval);
+ break;
+ case ZFS_PROP_FILESYSTEM_LIMIT:
+ case ZFS_PROP_SNAPSHOT_LIMIT:
+ if (intval == UINT64_MAX) {
+ /* clearing the limit, just do it */
+ err = 0;
+ } else {
+ err = dsl_dir_activate_fs_ss_limit(dsname);
+ }
+ /*
+ * Set err to -1 to force the zfs_set_prop_nvlist code down the
+ * default path to set the value in the nvlist.
+ */
+ if (err == 0)
+ err = -1;
+ break;
+ case ZFS_PROP_RESERVATION:
+ err = dsl_dir_set_reservation(dsname, source, intval);
+ break;
+ case ZFS_PROP_REFRESERVATION:
+ err = dsl_dataset_set_refreservation(dsname, source, intval);
+ break;
+ case ZFS_PROP_VOLSIZE:
+ err = zvol_set_volsize(dsname, intval);
+ break;
+ case ZFS_PROP_VERSION:
+ {
+ zfsvfs_t *zfsvfs;
+
+ if ((err = zfsvfs_hold(dsname, FTAG, &zfsvfs, B_TRUE)) != 0)
+ break;
+
+ err = zfs_set_version(zfsvfs, intval);
+ zfsvfs_rele(zfsvfs, FTAG);
+
+ if (err == 0 && intval >= ZPL_VERSION_USERSPACE) {
+ zfs_cmd_t *zc;
+
+ zc = kmem_zalloc(sizeof (zfs_cmd_t), KM_SLEEP);
+ (void) strcpy(zc->zc_name, dsname);
+ (void) zfs_ioc_userspace_upgrade(zc);
+ kmem_free(zc, sizeof (zfs_cmd_t));
+ }
+ break;
+ }
+ default:
+ err = -1;
+ }
+
+ return (err);
+}
+
+/*
+ * This function is best effort. If it fails to set any of the given properties,
+ * it continues to set as many as it can and returns the last error
+ * encountered. If the caller provides a non-NULL errlist, it will be filled in
+ * with the list of names of all the properties that failed along with the
+ * corresponding error numbers.
+ *
+ * If every property is set successfully, zero is returned and errlist is not
+ * modified.
+ */
+int
+zfs_set_prop_nvlist(const char *dsname, zprop_source_t source, nvlist_t *nvl,
+ nvlist_t *errlist)
+{
+ nvpair_t *pair;
+ nvpair_t *propval;
+ int rv = 0;
+ uint64_t intval;
+ char *strval;
+ nvlist_t *genericnvl = fnvlist_alloc();
+ nvlist_t *retrynvl = fnvlist_alloc();
+
+retry:
+ pair = NULL;
+ while ((pair = nvlist_next_nvpair(nvl, pair)) != NULL) {
+ const char *propname = nvpair_name(pair);
+ zfs_prop_t prop = zfs_name_to_prop(propname);
+ int err = 0;
+
+ /* decode the property value */
+ propval = pair;
+ if (nvpair_type(pair) == DATA_TYPE_NVLIST) {
+ nvlist_t *attrs;
+ attrs = fnvpair_value_nvlist(pair);
+ if (nvlist_lookup_nvpair(attrs, ZPROP_VALUE,
+ &propval) != 0)
+ err = SET_ERROR(EINVAL);
+ }
+
+ /* Validate value type */
+ if (err == 0 && prop == ZPROP_INVAL) {
+ if (zfs_prop_user(propname)) {
+ if (nvpair_type(propval) != DATA_TYPE_STRING)
+ err = SET_ERROR(EINVAL);
+ } else if (zfs_prop_userquota(propname)) {
+ if (nvpair_type(propval) !=
+ DATA_TYPE_UINT64_ARRAY)
+ err = SET_ERROR(EINVAL);
+ } else {
+ err = SET_ERROR(EINVAL);
+ }
+ } else if (err == 0) {
+ if (nvpair_type(propval) == DATA_TYPE_STRING) {
+ if (zfs_prop_get_type(prop) != PROP_TYPE_STRING)
+ err = SET_ERROR(EINVAL);
+ } else if (nvpair_type(propval) == DATA_TYPE_UINT64) {
+ const char *unused;
+
+ intval = fnvpair_value_uint64(propval);
+
+ switch (zfs_prop_get_type(prop)) {
+ case PROP_TYPE_NUMBER:
+ break;
+ case PROP_TYPE_STRING:
+ err = SET_ERROR(EINVAL);
+ break;
+ case PROP_TYPE_INDEX:
+ if (zfs_prop_index_to_string(prop,
+ intval, &unused) != 0)
+ err = SET_ERROR(EINVAL);
+ break;
+ default:
+ cmn_err(CE_PANIC,
+ "unknown property type");
+ }
+ } else {
+ err = SET_ERROR(EINVAL);
+ }
+ }
+
+ /* Validate permissions */
+ if (err == 0)
+ err = zfs_check_settable(dsname, pair, CRED());
+
+ if (err == 0) {
+ err = zfs_prop_set_special(dsname, source, pair);
+ if (err == -1) {
+ /*
+ * For better performance we build up a list of
+ * properties to set in a single transaction.
+ */
+ err = nvlist_add_nvpair(genericnvl, pair);
+ } else if (err != 0 && nvl != retrynvl) {
+ /*
+ * This may be a spurious error caused by
+ * receiving quota and reservation out of order.
+ * Try again in a second pass.
+ */
+ err = nvlist_add_nvpair(retrynvl, pair);
+ }
+ }
+
+ if (err != 0) {
+ if (errlist != NULL)
+ fnvlist_add_int32(errlist, propname, err);
+ rv = err;
+ }
+ }
+
+ if (nvl != retrynvl && !nvlist_empty(retrynvl)) {
+ nvl = retrynvl;
+ goto retry;
+ }
+
+ if (!nvlist_empty(genericnvl) &&
+ dsl_props_set(dsname, source, genericnvl) != 0) {
+ /*
+ * If this fails, we still want to set as many properties as we
+ * can, so try setting them individually.
+ */
+ pair = NULL;
+ while ((pair = nvlist_next_nvpair(genericnvl, pair)) != NULL) {
+ const char *propname = nvpair_name(pair);
+ int err = 0;
+
+ propval = pair;
+ if (nvpair_type(pair) == DATA_TYPE_NVLIST) {
+ nvlist_t *attrs;
+ attrs = fnvpair_value_nvlist(pair);
+ propval = fnvlist_lookup_nvpair(attrs,
+ ZPROP_VALUE);
+ }
+
+ if (nvpair_type(propval) == DATA_TYPE_STRING) {
+ strval = fnvpair_value_string(propval);
+ err = dsl_prop_set_string(dsname, propname,
+ source, strval);
+ } else {
+ intval = fnvpair_value_uint64(propval);
+ err = dsl_prop_set_int(dsname, propname, source,
+ intval);
+ }
+
+ if (err != 0) {
+ if (errlist != NULL) {
+ fnvlist_add_int32(errlist, propname,
+ err);
+ }
+ rv = err;
+ }
+ }
+ }
+ nvlist_free(genericnvl);
+ nvlist_free(retrynvl);
+
+ return (rv);
+}
+
+/*
+ * Check that all the properties are valid user properties.
+ */
+static int
+zfs_check_userprops(nvlist_t *nvl)
+{
+ nvpair_t *pair = NULL;
+
+ while ((pair = nvlist_next_nvpair(nvl, pair)) != NULL) {
+ const char *propname = nvpair_name(pair);
+
+ if (!zfs_prop_user(propname) ||
+ nvpair_type(pair) != DATA_TYPE_STRING)
+ return (SET_ERROR(EINVAL));
+
+ if (strlen(propname) >= ZAP_MAXNAMELEN)
+ return (SET_ERROR(ENAMETOOLONG));
+
+ if (strlen(fnvpair_value_string(pair)) >= ZAP_MAXVALUELEN)
+			return (SET_ERROR(E2BIG));
+ }
+ return (0);
+}
+
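+/*
+ * Copy the nvpairs from props into a newly allocated *newprops, omitting
+ * any property whose name also appears in skipped.
+ */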
+static void
+props_skip(nvlist_t *props, nvlist_t *skipped, nvlist_t **newprops)
+{
+ nvpair_t *pair;
+
+ VERIFY(nvlist_alloc(newprops, NV_UNIQUE_NAME, KM_SLEEP) == 0);
+
+ pair = NULL;
+ while ((pair = nvlist_next_nvpair(props, pair)) != NULL) {
+ if (nvlist_exists(skipped, nvpair_name(pair)))
+ continue;
+
+ VERIFY(nvlist_add_nvpair(*newprops, pair) == 0);
+ }
+}
+
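+/*
+ * Clear the given properties on a dataset, except for those named in
+ * skipped.  Depending on whether the dataset has ever received properties,
+ * this reverts either the received or the local values.
+ */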
+static int
+clear_received_props(const char *dsname, nvlist_t *props,
+ nvlist_t *skipped)
+{
+ int err = 0;
+ nvlist_t *cleared_props = NULL;
+ props_skip(props, skipped, &cleared_props);
+ if (!nvlist_empty(cleared_props)) {
+ /*
+ * Acts on local properties until the dataset has received
+ * properties at least once on or after SPA_VERSION_RECVD_PROPS.
+ */
+ zprop_source_t flags = (ZPROP_SRC_NONE |
+ (dsl_prop_get_hasrecvd(dsname) ? ZPROP_SRC_RECEIVED : 0));
+ err = zfs_set_prop_nvlist(dsname, flags, cleared_props, NULL);
+ }
+ nvlist_free(cleared_props);
+ return (err);
+}
+
+/*
+ * inputs:
+ * zc_name name of filesystem
+ * zc_value name of property to set
+ * zc_nvlist_src{_size} nvlist of properties to apply
+ * zc_cookie received properties flag
+ *
+ * outputs:
+ * zc_nvlist_dst{_size} error for each unapplied received property
+ */
+static int
+zfs_ioc_set_prop(zfs_cmd_t *zc)
+{
+ nvlist_t *nvl;
+ boolean_t received = zc->zc_cookie;
+ zprop_source_t source = (received ? ZPROP_SRC_RECEIVED :
+ ZPROP_SRC_LOCAL);
+ nvlist_t *errors;
+ int error;
+
+ if ((error = get_nvlist(zc->zc_nvlist_src, zc->zc_nvlist_src_size,
+ zc->zc_iflags, &nvl)) != 0)
+ return (error);
+
+ if (received) {
+ nvlist_t *origprops;
+
+ if (dsl_prop_get_received(zc->zc_name, &origprops) == 0) {
+ (void) clear_received_props(zc->zc_name,
+ origprops, nvl);
+ nvlist_free(origprops);
+ }
+
+ error = dsl_prop_set_hasrecvd(zc->zc_name);
+ }
+
+ errors = fnvlist_alloc();
+ if (error == 0)
+ error = zfs_set_prop_nvlist(zc->zc_name, source, nvl, errors);
+
+ if (zc->zc_nvlist_dst != 0 && errors != NULL) {
+ (void) put_nvlist(zc, errors);
+ }
+
+ nvlist_free(errors);
+ nvlist_free(nvl);
+ return (error);
+}
+
+/*
+ * inputs:
+ * zc_name name of filesystem
+ * zc_value name of property to inherit
+ * zc_cookie revert to received value if TRUE
+ *
+ * outputs: none
+ */
+static int
+zfs_ioc_inherit_prop(zfs_cmd_t *zc)
+{
+ const char *propname = zc->zc_value;
+ zfs_prop_t prop = zfs_name_to_prop(propname);
+ boolean_t received = zc->zc_cookie;
+ zprop_source_t source = (received
+ ? ZPROP_SRC_NONE /* revert to received value, if any */
+ : ZPROP_SRC_INHERITED); /* explicitly inherit */
+
+ if (received) {
+ nvlist_t *dummy;
+ nvpair_t *pair;
+ zprop_type_t type;
+ int err;
+
+ /*
+ * zfs_prop_set_special() expects properties in the form of an
+ * nvpair with type info.
+ */
+ if (prop == ZPROP_INVAL) {
+ if (!zfs_prop_user(propname))
+ return (SET_ERROR(EINVAL));
+
+ type = PROP_TYPE_STRING;
+ } else if (prop == ZFS_PROP_VOLSIZE ||
+ prop == ZFS_PROP_VERSION) {
+ return (SET_ERROR(EINVAL));
+ } else {
+ type = zfs_prop_get_type(prop);
+ }
+
+ VERIFY(nvlist_alloc(&dummy, NV_UNIQUE_NAME, KM_SLEEP) == 0);
+
+ switch (type) {
+ case PROP_TYPE_STRING:
+			VERIFY(nvlist_add_string(dummy, propname, "") == 0);
+ break;
+ case PROP_TYPE_NUMBER:
+ case PROP_TYPE_INDEX:
+			VERIFY(nvlist_add_uint64(dummy, propname, 0) == 0);
+ break;
+ default:
+ nvlist_free(dummy);
+ return (SET_ERROR(EINVAL));
+ }
+
+ pair = nvlist_next_nvpair(dummy, NULL);
+ err = zfs_prop_set_special(zc->zc_name, source, pair);
+ nvlist_free(dummy);
+ if (err != -1)
+ return (err); /* special property already handled */
+ } else {
+ /*
+ * Only check this in the non-received case. We want to allow
+ * 'inherit -S' to revert non-inheritable properties like quota
+ * and reservation to the received or default values even though
+ * they are not considered inheritable.
+ */
+ if (prop != ZPROP_INVAL && !zfs_prop_inheritable(prop))
+ return (SET_ERROR(EINVAL));
+ }
+
+ /* property name has been validated by zfs_secpolicy_inherit_prop() */
+ return (dsl_prop_inherit(zc->zc_name, zc->zc_value, source));
+}
+
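+/*
+ * inputs:
+ * zc_name name of pool
+ * zc_nvlist_src{_size} nvlist of pool properties to set
+ *
+ * outputs: none
+ */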
+static int
+zfs_ioc_pool_set_props(zfs_cmd_t *zc)
+{
+ nvlist_t *props;
+ spa_t *spa;
+ int error;
+ nvpair_t *pair;
+
+	if ((error = get_nvlist(zc->zc_nvlist_src, zc->zc_nvlist_src_size,
+	    zc->zc_iflags, &props)) != 0)
+ return (error);
+
+ /*
+ * If the only property is the configfile, then just do a spa_lookup()
+ * to handle the faulted case.
+ */
+ pair = nvlist_next_nvpair(props, NULL);
+ if (pair != NULL && strcmp(nvpair_name(pair),
+ zpool_prop_to_name(ZPOOL_PROP_CACHEFILE)) == 0 &&
+ nvlist_next_nvpair(props, pair) == NULL) {
+ mutex_enter(&spa_namespace_lock);
+ if ((spa = spa_lookup(zc->zc_name)) != NULL) {
+ spa_configfile_set(spa, props, B_FALSE);
+ spa_write_cachefile(spa, B_FALSE, B_TRUE);
+ }
+ mutex_exit(&spa_namespace_lock);
+ if (spa != NULL) {
+ nvlist_free(props);
+ return (0);
+ }
+ }
+
+ if ((error = spa_open(zc->zc_name, &spa, FTAG)) != 0) {
+ nvlist_free(props);
+ return (error);
+ }
+
+ error = spa_prop_set(spa, props);
+
+ nvlist_free(props);
+ spa_close(spa, FTAG);
+
+ return (error);
+}
+
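+/*
+ * inputs:
+ * zc_name name of pool
+ *
+ * outputs:
+ * zc_nvlist_dst{_size} nvlist of pool properties
+ */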
+static int
+zfs_ioc_pool_get_props(zfs_cmd_t *zc)
+{
+ spa_t *spa;
+ int error;
+ nvlist_t *nvp = NULL;
+
+ if ((error = spa_open(zc->zc_name, &spa, FTAG)) != 0) {
+ /*
+ * If the pool is faulted, there may be properties we can still
+ * get (such as altroot and cachefile), so attempt to get them
+ * anyway.
+ */
+ mutex_enter(&spa_namespace_lock);
+ if ((spa = spa_lookup(zc->zc_name)) != NULL)
+ error = spa_prop_get(spa, &nvp);
+ mutex_exit(&spa_namespace_lock);
+ } else {
+ error = spa_prop_get(spa, &nvp);
+ spa_close(spa, FTAG);
+ }
+
+ if (error == 0 && zc->zc_nvlist_dst != 0)
+ error = put_nvlist(zc, nvp);
+ else
+ error = SET_ERROR(EFAULT);
+
+ nvlist_free(nvp);
+ return (error);
+}
+
+/*
+ * inputs:
+ * zc_name name of filesystem
+ * zc_nvlist_src{_size} nvlist of delegated permissions
+ * zc_perm_action allow/unallow flag
+ *
+ * outputs: none
+ */
+static int
+zfs_ioc_set_fsacl(zfs_cmd_t *zc)
+{
+ int error;
+ nvlist_t *fsaclnv = NULL;
+
+ if ((error = get_nvlist(zc->zc_nvlist_src, zc->zc_nvlist_src_size,
+ zc->zc_iflags, &fsaclnv)) != 0)
+ return (error);
+
+ /*
+ * Verify nvlist is constructed correctly
+ */
+ if ((error = zfs_deleg_verify_nvlist(fsaclnv)) != 0) {
+ nvlist_free(fsaclnv);
+ return (SET_ERROR(EINVAL));
+ }
+
+ /*
+	 * If we don't have PRIV_SYS_MOUNT, then validate that the
+	 * user is allowed to hand out each permission in the
+	 * nvlist(s).
+ */
+
+ error = secpolicy_zfs(CRED());
+ if (error != 0) {
+ if (zc->zc_perm_action == B_FALSE) {
+ error = dsl_deleg_can_allow(zc->zc_name,
+ fsaclnv, CRED());
+ } else {
+ error = dsl_deleg_can_unallow(zc->zc_name,
+ fsaclnv, CRED());
+ }
+ }
+
+ if (error == 0)
+ error = dsl_deleg_set(zc->zc_name, fsaclnv, zc->zc_perm_action);
+
+ nvlist_free(fsaclnv);
+ return (error);
+}
+
+/*
+ * inputs:
+ * zc_name name of filesystem
+ *
+ * outputs:
+ * zc_nvlist_src{_size} nvlist of delegated permissions
+ */
+static int
+zfs_ioc_get_fsacl(zfs_cmd_t *zc)
+{
+ nvlist_t *nvp;
+ int error;
+
+ if ((error = dsl_deleg_get(zc->zc_name, &nvp)) == 0) {
+ error = put_nvlist(zc, nvp);
+ nvlist_free(nvp);
+ }
+
+ return (error);
+}
+
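+/*
+ * Objset creation callback for ZPL filesystems: populates the new objset
+ * with its initial filesystem structure using the zplprops assembled by
+ * the caller.
+ */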
+/* ARGSUSED */
+static void
+zfs_create_cb(objset_t *os, void *arg, cred_t *cr, dmu_tx_t *tx)
+{
+ zfs_creat_t *zct = arg;
+
+ zfs_create_fs(os, cr, zct->zct_zplprops, tx);
+}
+
+#define ZFS_PROP_UNDEFINED ((uint64_t)-1)
+
+/*
+ * inputs:
+ * os parent objset pointer (NULL if root fs)
+ * fuids_ok fuids allowed in this version of the spa?
+ * sa_ok SAs allowed in this version of the spa?
+ * createprops list of properties requested by creator
+ *
+ * outputs:
+ * zplprops values for the zplprops we attach to the master node object
+ * is_ci true if requested file system will be purely case-insensitive
+ *
+ * Determine the settings for utf8only, normalization and
+ * casesensitivity. Specific values may have been requested by the
+ * creator and/or we can inherit values from the parent dataset. If
+ * the file system is of too early a vintage, a creator cannot
+ * request settings for these properties, even if the requested
+ * setting is the default value. We don't actually want to create dsl
+ * properties for these, so remove them from the source nvlist after
+ * processing.
+ */
+static int
+zfs_fill_zplprops_impl(objset_t *os, uint64_t zplver,
+ boolean_t fuids_ok, boolean_t sa_ok, nvlist_t *createprops,
+ nvlist_t *zplprops, boolean_t *is_ci)
+{
+ uint64_t sense = ZFS_PROP_UNDEFINED;
+ uint64_t norm = ZFS_PROP_UNDEFINED;
+ uint64_t u8 = ZFS_PROP_UNDEFINED;
+
+ ASSERT(zplprops != NULL);
+
+ /* parent dataset must be a filesystem */
+ if (os != NULL && os->os_phys->os_type != DMU_OST_ZFS)
+ return (SET_ERROR(ZFS_ERR_WRONG_PARENT));
+
+ /*
+ * Pull out creator prop choices, if any.
+ */
+ if (createprops) {
+ (void) nvlist_lookup_uint64(createprops,
+ zfs_prop_to_name(ZFS_PROP_VERSION), &zplver);
+ (void) nvlist_lookup_uint64(createprops,
+ zfs_prop_to_name(ZFS_PROP_NORMALIZE), &norm);
+ (void) nvlist_remove_all(createprops,
+ zfs_prop_to_name(ZFS_PROP_NORMALIZE));
+ (void) nvlist_lookup_uint64(createprops,
+ zfs_prop_to_name(ZFS_PROP_UTF8ONLY), &u8);
+ (void) nvlist_remove_all(createprops,
+ zfs_prop_to_name(ZFS_PROP_UTF8ONLY));
+ (void) nvlist_lookup_uint64(createprops,
+ zfs_prop_to_name(ZFS_PROP_CASE), &sense);
+ (void) nvlist_remove_all(createprops,
+ zfs_prop_to_name(ZFS_PROP_CASE));
+ }
+
+ /*
+ * If the zpl version requested is whacky or the file system
+	 * or pool version is too "young" to support normalization
+ * and the creator tried to set a value for one of the props,
+ * error out.
+ */
+ if ((zplver < ZPL_VERSION_INITIAL || zplver > ZPL_VERSION) ||
+ (zplver >= ZPL_VERSION_FUID && !fuids_ok) ||
+ (zplver >= ZPL_VERSION_SA && !sa_ok) ||
+ (zplver < ZPL_VERSION_NORMALIZATION &&
+ (norm != ZFS_PROP_UNDEFINED || u8 != ZFS_PROP_UNDEFINED ||
+ sense != ZFS_PROP_UNDEFINED)))
+ return (SET_ERROR(ENOTSUP));
+
+ /*
+ * Put the version in the zplprops
+ */
+ VERIFY(nvlist_add_uint64(zplprops,
+ zfs_prop_to_name(ZFS_PROP_VERSION), zplver) == 0);
+
+ if (norm == ZFS_PROP_UNDEFINED)
+ VERIFY(zfs_get_zplprop(os, ZFS_PROP_NORMALIZE, &norm) == 0);
+ VERIFY(nvlist_add_uint64(zplprops,
+ zfs_prop_to_name(ZFS_PROP_NORMALIZE), norm) == 0);
+
+ /*
+ * If we're normalizing, names must always be valid UTF-8 strings.
+ */
+ if (norm)
+ u8 = 1;
+ if (u8 == ZFS_PROP_UNDEFINED)
+ VERIFY(zfs_get_zplprop(os, ZFS_PROP_UTF8ONLY, &u8) == 0);
+ VERIFY(nvlist_add_uint64(zplprops,
+ zfs_prop_to_name(ZFS_PROP_UTF8ONLY), u8) == 0);
+
+ if (sense == ZFS_PROP_UNDEFINED)
+ VERIFY(zfs_get_zplprop(os, ZFS_PROP_CASE, &sense) == 0);
+ VERIFY(nvlist_add_uint64(zplprops,
+ zfs_prop_to_name(ZFS_PROP_CASE), sense) == 0);
+
+ if (is_ci)
+ *is_ci = (sense == ZFS_CASE_INSENSITIVE);
+
+ return (0);
+}
+
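+/*
+ * Determine zplprops for a new non-root filesystem: map the pool version
+ * to a zpl version and open the parent objset so values not given by the
+ * creator can be inherited from it.
+ */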
+static int
+zfs_fill_zplprops(const char *dataset, nvlist_t *createprops,
+ nvlist_t *zplprops, boolean_t *is_ci)
+{
+ boolean_t fuids_ok, sa_ok;
+ uint64_t zplver = ZPL_VERSION;
+ objset_t *os = NULL;
+ char parentname[ZFS_MAX_DATASET_NAME_LEN];
+ spa_t *spa;
+ uint64_t spa_vers;
+ int error;
+
+ zfs_get_parent(dataset, parentname, sizeof (parentname));
+
+ if ((error = spa_open(dataset, &spa, FTAG)) != 0)
+ return (error);
+
+ spa_vers = spa_version(spa);
+ spa_close(spa, FTAG);
+
+ zplver = zfs_zpl_version_map(spa_vers);
+ fuids_ok = (zplver >= ZPL_VERSION_FUID);
+ sa_ok = (zplver >= ZPL_VERSION_SA);
+
+ /*
+ * Open parent object set so we can inherit zplprop values.
+ */
+ if ((error = dmu_objset_hold(parentname, FTAG, &os)) != 0)
+ return (error);
+
+ error = zfs_fill_zplprops_impl(os, zplver, fuids_ok, sa_ok, createprops,
+ zplprops, is_ci);
+ dmu_objset_rele(os, FTAG);
+ return (error);
+}
+
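+/*
+ * As zfs_fill_zplprops(), but for a pool's root filesystem, which has no
+ * parent objset to inherit from; only the given spa version is consulted.
+ */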
+static int
+zfs_fill_zplprops_root(uint64_t spa_vers, nvlist_t *createprops,
+ nvlist_t *zplprops, boolean_t *is_ci)
+{
+ boolean_t fuids_ok;
+ boolean_t sa_ok;
+ uint64_t zplver = ZPL_VERSION;
+ int error;
+
+ zplver = zfs_zpl_version_map(spa_vers);
+ fuids_ok = (zplver >= ZPL_VERSION_FUID);
+ sa_ok = (zplver >= ZPL_VERSION_SA);
+
+ error = zfs_fill_zplprops_impl(NULL, zplver, fuids_ok, sa_ok,
+ createprops, zplprops, is_ci);
+ return (error);
+}
+
+/*
+ * innvl: {
+ * "type" -> dmu_objset_type_t (int32)
+ * (optional) "props" -> { prop -> value }
+ * }
+ *
+ * outnvl: propname -> error code (int32)
+ */
+
+static const zfs_ioc_key_t zfs_keys_create[] = {
+ {"type", DATA_TYPE_INT32, 0},
+ {"props", DATA_TYPE_NVLIST, ZK_OPTIONAL},
+ {"hidden_args", DATA_TYPE_NVLIST, ZK_OPTIONAL},
+};
+
+static int
+zfs_ioc_create(const char *fsname, nvlist_t *innvl, nvlist_t *outnvl)
+{
+ int error = 0;
+ zfs_creat_t zct = { 0 };
+ nvlist_t *nvprops = NULL;
+ void (*cbfunc)(objset_t *os, void *arg, cred_t *cr, dmu_tx_t *tx);
+ dmu_objset_type_t type;
+ boolean_t is_insensitive = B_FALSE;
+
+ type = (dmu_objset_type_t)fnvlist_lookup_int32(innvl, "type");
+ (void) nvlist_lookup_nvlist(innvl, "props", &nvprops);
+
+ switch (type) {
+ case DMU_OST_ZFS:
+ cbfunc = zfs_create_cb;
+ break;
+
+ case DMU_OST_ZVOL:
+ cbfunc = zvol_create_cb;
+ break;
+
+ default:
+ cbfunc = NULL;
+ break;
+ }
+ if (strchr(fsname, '@') ||
+ strchr(fsname, '%'))
+ return (SET_ERROR(EINVAL));
+
+ zct.zct_props = nvprops;
+
+ if (cbfunc == NULL)
+ return (SET_ERROR(EINVAL));
+
+ if (type == DMU_OST_ZVOL) {
+ uint64_t volsize, volblocksize;
+
+ if (nvprops == NULL)
+ return (SET_ERROR(EINVAL));
+ if (nvlist_lookup_uint64(nvprops,
+ zfs_prop_to_name(ZFS_PROP_VOLSIZE), &volsize) != 0)
+ return (SET_ERROR(EINVAL));
+
+ if ((error = nvlist_lookup_uint64(nvprops,
+ zfs_prop_to_name(ZFS_PROP_VOLBLOCKSIZE),
+ &volblocksize)) != 0 && error != ENOENT)
+ return (SET_ERROR(EINVAL));
+
+ if (error != 0)
+ volblocksize = zfs_prop_default_numeric(
+ ZFS_PROP_VOLBLOCKSIZE);
+
+ if ((error = zvol_check_volblocksize(
+ volblocksize)) != 0 ||
+ (error = zvol_check_volsize(volsize,
+ volblocksize)) != 0)
+ return (error);
+ } else if (type == DMU_OST_ZFS) {
+ /*
+ * We have to have normalization and
+ * case-folding flags correct when we do the
+ * file system creation, so go figure them out
+ * now.
+ */
+ VERIFY(nvlist_alloc(&zct.zct_zplprops,
+ NV_UNIQUE_NAME, KM_SLEEP) == 0);
+ error = zfs_fill_zplprops(fsname, nvprops,
+ zct.zct_zplprops, &is_insensitive);
+ if (error != 0) {
+ nvlist_free(zct.zct_zplprops);
+ return (error);
+ }
+ }
+
+ error = dmu_objset_create(fsname, type,
+ is_insensitive ? DS_FLAG_CI_DATASET : 0, cbfunc, &zct);
+ nvlist_free(zct.zct_zplprops);
+
+ /*
+ * It would be nice to do this atomically.
+ */
+ if (error == 0) {
+ error = zfs_set_prop_nvlist(fsname, ZPROP_SRC_LOCAL,
+ nvprops, outnvl);
+#if defined(__FreeBSD__) && defined(_KERNEL)
+ /*
+ * Wait for ZVOL operations to settle down before destroying.
+ */
+ if (error != 0) {
+ spa_t *spa;
+
+ if (spa_open(fsname, &spa, FTAG) == 0) {
+ taskqueue_drain_all(
+ spa->spa_zvol_taskq->tq_queue);
+ spa_close(spa, FTAG);
+ }
+ }
+#endif
+ if (error != 0)
+ (void) dsl_destroy_head(fsname);
+ }
+ return (error);
+}
+
+/*
+ * innvl: {
+ * "origin" -> name of origin snapshot
+ * (optional) "props" -> { prop -> value }
+ * }
+ *
+ * outnvl: propname -> error code (int32)
+ */
+static const zfs_ioc_key_t zfs_keys_clone[] = {
+ {"origin", DATA_TYPE_STRING, 0},
+ {"props", DATA_TYPE_NVLIST, ZK_OPTIONAL},
+ {"hidden_args", DATA_TYPE_NVLIST, ZK_OPTIONAL},
+};
+
+static int
+zfs_ioc_clone(const char *fsname, nvlist_t *innvl, nvlist_t *outnvl)
+{
+ int error = 0;
+ nvlist_t *nvprops = NULL;
+ char *origin_name;
+
+ origin_name = fnvlist_lookup_string(innvl, "origin");
+ (void) nvlist_lookup_nvlist(innvl, "props", &nvprops);
+
+ if (strchr(fsname, '@') ||
+ strchr(fsname, '%'))
+ return (SET_ERROR(EINVAL));
+
+ if (dataset_namecheck(origin_name, NULL, NULL) != 0)
+ return (SET_ERROR(EINVAL));
+ error = dmu_objset_clone(fsname, origin_name);
+ if (error != 0)
+ return (error);
+
+ /*
+ * It would be nice to do this atomically.
+ */
+	error = zfs_set_prop_nvlist(fsname, ZPROP_SRC_LOCAL,
+	    nvprops, outnvl);
+	if (error != 0)
+		(void) dsl_destroy_head(fsname);
+ return (error);
+}
+
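+/*
+ * innvl: unused
+ * outnvl: empty
+ */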
+static const zfs_ioc_key_t zfs_keys_remap[] = {
+ /* no nvl keys */
+};
+
+/* ARGSUSED */
+static int
+zfs_ioc_remap(const char *fsname, nvlist_t *innvl, nvlist_t *outnvl)
+{
+ if (strchr(fsname, '@') ||
+ strchr(fsname, '%'))
+ return (SET_ERROR(EINVAL));
+
+ return (dmu_objset_remap_indirects(fsname));
+}
+
+/*
+ * innvl: {
+ * "snaps" -> { snapshot1, snapshot2 }
+ * (optional) "props" -> { prop -> value (string) }
+ * }
+ *
+ * outnvl: snapshot -> error code (int32)
+ */
+static const zfs_ioc_key_t zfs_keys_snapshot[] = {
+ {"snaps", DATA_TYPE_NVLIST, 0},
+ {"props", DATA_TYPE_NVLIST, ZK_OPTIONAL},
+};
+
+static int
+zfs_ioc_snapshot(const char *poolname, nvlist_t *innvl, nvlist_t *outnvl)
+{
+ nvlist_t *snaps;
+ nvlist_t *props = NULL;
+ int error, poollen;
+ nvpair_t *pair;
+
+ (void) nvlist_lookup_nvlist(innvl, "props", &props);
+ if (!nvlist_empty(props) &&
+ zfs_earlier_version(poolname, SPA_VERSION_SNAP_PROPS))
+ return (SET_ERROR(ENOTSUP));
+ if ((error = zfs_check_userprops(props)) != 0)
+ return (error);
+
+ snaps = fnvlist_lookup_nvlist(innvl, "snaps");
+ poollen = strlen(poolname);
+ for (pair = nvlist_next_nvpair(snaps, NULL); pair != NULL;
+ pair = nvlist_next_nvpair(snaps, pair)) {
+ const char *name = nvpair_name(pair);
+ char *cp = strchr(name, '@');
+
+ /*
+ * The snap name must contain an @, and the part after it must
+ * contain only valid characters.
+ */
+ if (cp == NULL ||
+ zfs_component_namecheck(cp + 1, NULL, NULL) != 0)
+ return (SET_ERROR(EINVAL));
+
+ /*
+ * The snap must be in the specified pool.
+ */
+ if (strncmp(name, poolname, poollen) != 0 ||
+ (name[poollen] != '/' && name[poollen] != '@'))
+ return (SET_ERROR(EXDEV));
+
+ /*
+ * Check for permission to set the properties on the fs.
+ */
+ if (!nvlist_empty(props)) {
+ *cp = '\0';
+ error = zfs_secpolicy_write_perms(name,
+ ZFS_DELEG_PERM_USERPROP, CRED());
+ *cp = '@';
+ if (error != 0)
+ return (error);
+ }
+
+ /* This must be the only snap of this fs. */
+ for (nvpair_t *pair2 = nvlist_next_nvpair(snaps, pair);
+ pair2 != NULL; pair2 = nvlist_next_nvpair(snaps, pair2)) {
+ if (strncmp(name, nvpair_name(pair2), cp - name + 1)
+ == 0) {
+ return (SET_ERROR(EXDEV));
+ }
+ }
+ }
+
+ error = dsl_dataset_snapshot(snaps, props, outnvl);
+ return (error);
+}
+
+/*
+ * innvl: "message" -> string
+ */
+static const zfs_ioc_key_t zfs_keys_log_history[] = {
+ {"message", DATA_TYPE_STRING, 0},
+};
+
+/* ARGSUSED */
+static int
+zfs_ioc_log_history(const char *unused, nvlist_t *innvl, nvlist_t *outnvl)
+{
+ char *message;
+ spa_t *spa;
+ int error;
+ char *poolname;
+
+ /*
+	 * The poolname in the ioctl is not set; we get it from the TSD,
+ * which was set at the end of the last successful ioctl that allows
+ * logging. The secpolicy func already checked that it is set.
+ * Only one log ioctl is allowed after each successful ioctl, so
+ * we clear the TSD here.
+ */
+ poolname = tsd_get(zfs_allow_log_key);
+ (void) tsd_set(zfs_allow_log_key, NULL);
+ error = spa_open(poolname, &spa, FTAG);
+ strfree(poolname);
+ if (error != 0)
+ return (error);
+
+ message = fnvlist_lookup_string(innvl, "message");
+
+ if (spa_version(spa) < SPA_VERSION_ZPOOL_HISTORY) {
+ spa_close(spa, FTAG);
+ return (SET_ERROR(ENOTSUP));
+ }
+
+ error = spa_history_log(spa, message);
+ spa_close(spa, FTAG);
+ return (error);
+}
+
+/*
+ * This ioctl is used to set the bootenv configuration on the current
+ * pool. This configuration is stored in the second padding area of the label,
+ * and it is used by the GRUB bootloader used on Linux to store the contents
+ * of the grubenv file. The file is stored as raw ASCII, and is protected by
+ * an embedded checksum. By default, GRUB will check if the boot filesystem
+ * supports storing the environment data in a special location, and if so,
+ * will invoke filesystem-specific logic to retrieve it. This can be overridden
+ * by a variable, should the user so desire.
+ */
+static const zfs_ioc_key_t zfs_keys_set_bootenv[] = {
+ {"envmap", DATA_TYPE_STRING, 0},
+};
+
+/* ARGSUSED */
+static int
+zfs_ioc_set_bootenv(const char *name, nvlist_t *innvl, nvlist_t *outnvl)
+{
+ char *envmap;
+ int error;
+ spa_t *spa;
+
+ envmap = fnvlist_lookup_string(innvl, "envmap");
+ if ((error = spa_open(name, &spa, FTAG)) != 0)
+ return (error);
+ spa_vdev_state_enter(spa, SCL_ALL);
+ error = vdev_label_write_bootenv(spa->spa_root_vdev, envmap);
+ (void) spa_vdev_state_exit(spa, NULL, 0);
+ spa_close(spa, FTAG);
+ return (error);
+}
+
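+/*
+ * innvl: unused
+ * outnvl: the bootenv data read back from the pool's vdev labels
+ */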
+static const zfs_ioc_key_t zfs_keys_get_bootenv[] = {
+ /* no nvl keys */
+};
+
+/* ARGSUSED */
+static int
+zfs_ioc_get_bootenv(const char *name, nvlist_t *innvl, nvlist_t *outnvl)
+{
+ spa_t *spa;
+ int error;
+
+ if ((error = spa_open(name, &spa, FTAG)) != 0)
+ return (error);
+ spa_vdev_state_enter(spa, SCL_ALL);
+ error = vdev_label_read_bootenv(spa->spa_root_vdev, outnvl);
+ (void) spa_vdev_state_exit(spa, NULL, 0);
+ spa_close(spa, FTAG);
+ return (error);
+}
+
+#ifdef __FreeBSD__
+static const zfs_ioc_key_t zfs_keys_nextboot[] = {
+ {"command", DATA_TYPE_STRING, 0},
+ {ZPOOL_CONFIG_POOL_GUID, DATA_TYPE_UINT64, 0},
+ {ZPOOL_CONFIG_GUID, DATA_TYPE_UINT64, 0}
+};
+
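+/*
+ * Writes a boot command into the pad2 area of the labels of the vdev
+ * identified by the given pool and vdev GUIDs, for the boot loader to
+ * consume on the next boot.
+ */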
+static int
+zfs_ioc_nextboot(const char *unused, nvlist_t *innvl, nvlist_t *outnvl)
+{
+ char name[MAXNAMELEN];
+ spa_t *spa;
+ vdev_t *vd;
+ char *command;
+ uint64_t pool_guid;
+ uint64_t vdev_guid;
+ int error;
+
+ if (nvlist_lookup_uint64(innvl,
+ ZPOOL_CONFIG_POOL_GUID, &pool_guid) != 0)
+		return (SET_ERROR(EINVAL));
+ if (nvlist_lookup_uint64(innvl,
+ ZPOOL_CONFIG_GUID, &vdev_guid) != 0)
+		return (SET_ERROR(EINVAL));
+ command = fnvlist_lookup_string(innvl, "command");
+
+ mutex_enter(&spa_namespace_lock);
+ spa = spa_by_guid(pool_guid, vdev_guid);
+ if (spa != NULL)
+ strcpy(name, spa_name(spa));
+ mutex_exit(&spa_namespace_lock);
+ if (spa == NULL)
+		return (SET_ERROR(ENOENT));
+
+ if ((error = spa_open(name, &spa, FTAG)) != 0)
+ return (error);
+ spa_vdev_state_enter(spa, SCL_ALL);
+ vd = spa_lookup_by_guid(spa, vdev_guid, B_TRUE);
+ if (vd == NULL) {
+ (void) spa_vdev_state_exit(spa, NULL, ENXIO);
+ spa_close(spa, FTAG);
+		return (SET_ERROR(ENODEV));
+ }
+ error = vdev_label_write_pad2(vd, command, strlen(command));
+ (void) spa_vdev_state_exit(spa, NULL, 0);
+ txg_wait_synced(spa->spa_dsl_pool, 0);
+ spa_close(spa, FTAG);
+ return (error);
+}
+#endif
+
+/*
+ * The dp_config_rwlock must not be held when calling this, because the
+ * unmount may need to write out data.
+ *
+ * This function is best-effort; callers must deal gracefully if the
+ * snapshot remains mounted (or is remounted after this call).
+ *
+ * It returns nothing: if the argument is not a snapshot, or no filesystem
+ * is currently mounted there, the call is simply a no-op.
+ */
+void
+zfs_unmount_snap(const char *snapname)
+{
+ vfs_t *vfsp = NULL;
+ zfsvfs_t *zfsvfs = NULL;
+
+ if (strchr(snapname, '@') == NULL)
+ return;
+
+ int err = getzfsvfs(snapname, &zfsvfs);
+ if (err != 0) {
+ ASSERT3P(zfsvfs, ==, NULL);
+ return;
+ }
+ vfsp = zfsvfs->z_vfs;
+
+ ASSERT(!dsl_pool_config_held(dmu_objset_pool(zfsvfs->z_os)));
+
+#ifdef illumos
+ err = vn_vfswlock(vfsp->vfs_vnodecovered);
+ VFS_RELE(vfsp);
+ if (err != 0)
+ return;
+#endif
+
+ /*
+ * Always force the unmount for snapshots.
+ */
+#ifdef illumos
+ (void) dounmount(vfsp, MS_FORCE, kcred);
+#else
+ vfs_ref(vfsp);
+ vfs_unbusy(vfsp);
+ (void) dounmount(vfsp, MS_FORCE, curthread);
+#endif
+}
+
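+/*
+ * Callback form of zfs_unmount_snap() for use with snapshot iterators;
+ * always returns 0 so that iteration continues.
+ */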
+/* ARGSUSED */
+static int
+zfs_unmount_snap_cb(const char *snapname, void *arg)
+{
+ zfs_unmount_snap(snapname);
+ return (0);
+}
+
+/*
+ * When a clone is destroyed, its origin may also need to be destroyed,
+ * in which case it must be unmounted. This routine will do that unmount
+ * if necessary.
+ */
+void
+zfs_destroy_unmount_origin(const char *fsname)
+{
+ int error;
+ objset_t *os;
+ dsl_dataset_t *ds;
+
+ error = dmu_objset_hold(fsname, FTAG, &os);
+ if (error != 0)
+ return;
+ ds = dmu_objset_ds(os);
+ if (dsl_dir_is_clone(ds->ds_dir) && DS_IS_DEFER_DESTROY(ds->ds_prev)) {
+ char originname[ZFS_MAX_DATASET_NAME_LEN];
+ dsl_dataset_name(ds->ds_prev, originname);
+ dmu_objset_rele(os, FTAG);
+ zfs_unmount_snap(originname);
+ } else {
+ dmu_objset_rele(os, FTAG);
+ }
+}
+
+/*
+ * innvl: {
+ * "snaps" -> { snapshot1, snapshot2 }
+ * (optional boolean) "defer"
+ * }
+ *
+ * outnvl: snapshot -> error code (int32)
+ *
+ */
+static const zfs_ioc_key_t zfs_keys_destroy_snaps[] = {
+ {"snaps", DATA_TYPE_NVLIST, 0},
+ {"defer", DATA_TYPE_BOOLEAN, ZK_OPTIONAL},
+};
+
+/* ARGSUSED */
+static int
+zfs_ioc_destroy_snaps(const char *poolname, nvlist_t *innvl, nvlist_t *outnvl)
+{
+ int error, poollen;
+ nvlist_t *snaps;
+ nvpair_t *pair;
+ boolean_t defer;
+
+ snaps = fnvlist_lookup_nvlist(innvl, "snaps");
+ defer = nvlist_exists(innvl, "defer");
+
+ poollen = strlen(poolname);
+ for (pair = nvlist_next_nvpair(snaps, NULL); pair != NULL;
+ pair = nvlist_next_nvpair(snaps, pair)) {
+ const char *name = nvpair_name(pair);
+
+ /*
+ * The snap must be in the specified pool to prevent the
+ * invalid removal of zvol minors below.
+ */
+ if (strncmp(name, poolname, poollen) != 0 ||
+ (name[poollen] != '/' && name[poollen] != '@'))
+ return (SET_ERROR(EXDEV));
+
+ zfs_unmount_snap(nvpair_name(pair));
+ }
+
+ return (dsl_destroy_snapshots_nvl(snaps, defer, outnvl));
+}
+
+/*
+ * Create bookmarks. Bookmark names are of the form <fs>#<bmark>.
+ * All bookmarks must be in the same pool.
+ *
+ * innvl: {
+ * bookmark1 -> snapshot1, bookmark2 -> snapshot2
+ * }
+ *
+ * outnvl: bookmark -> error code (int32)
+ *
+ */
+static const zfs_ioc_key_t zfs_keys_bookmark[] = {
+ {"<bookmark>...", DATA_TYPE_STRING, ZK_WILDCARDLIST},
+};
+
+/* ARGSUSED */
+static int
+zfs_ioc_bookmark(const char *poolname, nvlist_t *innvl, nvlist_t *outnvl)
+{
+ for (nvpair_t *pair = nvlist_next_nvpair(innvl, NULL);
+ pair != NULL; pair = nvlist_next_nvpair(innvl, pair)) {
+ char *snap_name;
+
+ /*
+ * Verify the snapshot argument.
+ */
+ if (nvpair_value_string(pair, &snap_name) != 0)
+ return (SET_ERROR(EINVAL));
+
+
+ /* Verify that the keys (bookmarks) are unique */
+ for (nvpair_t *pair2 = nvlist_next_nvpair(innvl, pair);
+ pair2 != NULL; pair2 = nvlist_next_nvpair(innvl, pair2)) {
+ if (strcmp(nvpair_name(pair), nvpair_name(pair2)) == 0)
+ return (SET_ERROR(EINVAL));
+ }
+ }
+
+ return (dsl_bookmark_create(innvl, outnvl));
+}
+
+/*
+ * innvl: {
+ * property 1, property 2, ...
+ * }
+ *
+ * outnvl: {
+ * bookmark name 1 -> { property 1, property 2, ... },
+ * bookmark name 2 -> { property 1, property 2, ... }
+ * }
+ *
+ */
+static const zfs_ioc_key_t zfs_keys_get_bookmarks[] = {
+ {"<property>...", DATA_TYPE_BOOLEAN, ZK_WILDCARDLIST | ZK_OPTIONAL},
+};
+
+static int
+zfs_ioc_get_bookmarks(const char *fsname, nvlist_t *innvl, nvlist_t *outnvl)
+{
+ return (dsl_get_bookmarks(fsname, innvl, outnvl));
+}
+
+/*
+ * innvl: {
+ * bookmark name 1, bookmark name 2
+ * }
+ *
+ * outnvl: bookmark -> error code (int32)
+ *
+ */
+static const zfs_ioc_key_t zfs_keys_destroy_bookmarks[] = {
+ {"<bookmark>...", DATA_TYPE_BOOLEAN, ZK_WILDCARDLIST},
+};
+
+static int
+zfs_ioc_destroy_bookmarks(const char *poolname, nvlist_t *innvl,
+ nvlist_t *outnvl)
+{
+ int error, poollen;
+
+ poollen = strlen(poolname);
+ for (nvpair_t *pair = nvlist_next_nvpair(innvl, NULL);
+ pair != NULL; pair = nvlist_next_nvpair(innvl, pair)) {
+ const char *name = nvpair_name(pair);
+ const char *cp = strchr(name, '#');
+
+ /*
+		 * The bookmark name must contain a '#', and the part after it
+ * must contain only valid characters.
+ */
+ if (cp == NULL ||
+ zfs_component_namecheck(cp + 1, NULL, NULL) != 0)
+ return (SET_ERROR(EINVAL));
+
+ /*
+ * The bookmark must be in the specified pool.
+ */
+ if (strncmp(name, poolname, poollen) != 0 ||
+ (name[poollen] != '/' && name[poollen] != '#'))
+ return (SET_ERROR(EXDEV));
+ }
+
+ error = dsl_bookmark_destroy(innvl, outnvl);
+ return (error);
+}
+
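+/*
+ * innvl: {
+ * "program" -> lua program text (string)
+ * "arg" -> argument to pass to the program
+ * (optional) "sync" -> run in syncing context (defaults to B_TRUE)
+ * (optional) "instrlimit" -> limit on lua instructions executed
+ * (optional) "memlimit" -> limit on lua memory used, in bytes
+ * }
+ *
+ * outnvl: the return value of the program, or its error
+ */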
+static const zfs_ioc_key_t zfs_keys_channel_program[] = {
+ {"program", DATA_TYPE_STRING, 0},
+ {"arg", DATA_TYPE_ANY, 0},
+ {"sync", DATA_TYPE_BOOLEAN_VALUE, ZK_OPTIONAL},
+ {"instrlimit", DATA_TYPE_UINT64, ZK_OPTIONAL},
+ {"memlimit", DATA_TYPE_UINT64, ZK_OPTIONAL},
+};
+
+static int
+zfs_ioc_channel_program(const char *poolname, nvlist_t *innvl,
+ nvlist_t *outnvl)
+{
+ char *program;
+ uint64_t instrlimit, memlimit;
+ boolean_t sync_flag;
+ nvpair_t *nvarg = NULL;
+
+ program = fnvlist_lookup_string(innvl, ZCP_ARG_PROGRAM);
+	if (nvlist_lookup_boolean_value(innvl, ZCP_ARG_SYNC, &sync_flag) != 0)
+		sync_flag = B_TRUE;
+	if (nvlist_lookup_uint64(innvl, ZCP_ARG_INSTRLIMIT, &instrlimit) != 0)
+		instrlimit = ZCP_DEFAULT_INSTRLIMIT;
+	if (nvlist_lookup_uint64(innvl, ZCP_ARG_MEMLIMIT, &memlimit) != 0)
+		memlimit = ZCP_DEFAULT_MEMLIMIT;
+ nvarg = fnvlist_lookup_nvpair(innvl, ZCP_ARG_ARGLIST);
+
+	if (instrlimit == 0 || instrlimit > zfs_lua_max_instrlimit)
+		return (SET_ERROR(EINVAL));
+	if (memlimit == 0 || memlimit > zfs_lua_max_memlimit)
+		return (SET_ERROR(EINVAL));
+
+ return (zcp_eval(poolname, program, sync_flag, instrlimit, memlimit,
+ nvarg, outnvl));
+}
+
+/*
+ * innvl: unused
+ * outnvl: empty
+ */
+static const zfs_ioc_key_t zfs_keys_pool_checkpoint[] = {
+ /* no nvl keys */
+};
+
+/* ARGSUSED */
+static int
+zfs_ioc_pool_checkpoint(const char *poolname, nvlist_t *innvl, nvlist_t *outnvl)
+{
+ return (spa_checkpoint(poolname));
+}
+
+/*
+ * innvl: unused
+ * outnvl: empty
+ */
+static const zfs_ioc_key_t zfs_keys_pool_discard_checkpoint[] = {
+ /* no nvl keys */
+};
+
+/* ARGSUSED */
+static int
+zfs_ioc_pool_discard_checkpoint(const char *poolname, nvlist_t *innvl,
+ nvlist_t *outnvl)
+{
+ return (spa_checkpoint_discard(poolname));
+}
+
+/*
+ * inputs:
+ * zc_name name of dataset to destroy
+ * zc_defer_destroy mark for deferred destroy
+ *
+ * outputs: none
+ */
+static int
+zfs_ioc_destroy(zfs_cmd_t *zc)
+{
+ objset_t *os;
+ dmu_objset_type_t ost;
+ int err;
+
+ err = dmu_objset_hold(zc->zc_name, FTAG, &os);
+ if (err != 0)
+ return (err);
+ ost = dmu_objset_type(os);
+ dmu_objset_rele(os, FTAG);
+
+ if (ost == DMU_OST_ZFS)
+ zfs_unmount_snap(zc->zc_name);
+
+ if (strchr(zc->zc_name, '@'))
+ err = dsl_destroy_snapshot(zc->zc_name, zc->zc_defer_destroy);
+ else
+ err = dsl_destroy_head(zc->zc_name);
+#ifndef __FreeBSD__
+ if (ost == DMU_OST_ZVOL && err == 0)
+ (void) zvol_remove_minor(zc->zc_name);
+#endif
+ return (err);
+}
+
+/*
+ * innvl: {
+ * vdevs: {
+ * guid 1, guid 2, ...
+ * },
+ * func: POOL_INITIALIZE_{CANCEL|DO|SUSPEND}
+ * }
+ *
+ * outnvl: {
+ * [func: EINVAL (if provided command type didn't make sense)],
+ * [vdevs: {
+ * guid1: errno, (see function body for possible errnos)
+ * ...
+ * }]
+ * }
+ *
+ */
+static const zfs_ioc_key_t zfs_keys_pool_initialize[] = {
+ {ZPOOL_INITIALIZE_COMMAND, DATA_TYPE_UINT64, 0},
+ {ZPOOL_INITIALIZE_VDEVS, DATA_TYPE_NVLIST, 0}
+};
+
+static int
+zfs_ioc_pool_initialize(const char *poolname, nvlist_t *innvl, nvlist_t *outnvl)
+{
+ spa_t *spa;
+ int error;
+
+ error = spa_open(poolname, &spa, FTAG);
+ if (error != 0)
+ return (error);
+
+ uint64_t cmd_type;
+ if (nvlist_lookup_uint64(innvl, ZPOOL_INITIALIZE_COMMAND,
+ &cmd_type) != 0) {
+ spa_close(spa, FTAG);
+ return (SET_ERROR(EINVAL));
+ }
+ if (!(cmd_type == POOL_INITIALIZE_CANCEL ||
+ cmd_type == POOL_INITIALIZE_DO ||
+ cmd_type == POOL_INITIALIZE_SUSPEND)) {
+ spa_close(spa, FTAG);
+ return (SET_ERROR(EINVAL));
+ }
+
+ nvlist_t *vdev_guids;
+ if (nvlist_lookup_nvlist(innvl, ZPOOL_INITIALIZE_VDEVS,
+ &vdev_guids) != 0) {
+ spa_close(spa, FTAG);
+ return (SET_ERROR(EINVAL));
+ }
+
+ nvlist_t *vdev_errlist = fnvlist_alloc();
+ int total_errors = 0;
+
+ for (nvpair_t *pair = nvlist_next_nvpair(vdev_guids, NULL);
+ pair != NULL; pair = nvlist_next_nvpair(vdev_guids, pair)) {
+ uint64_t vdev_guid = fnvpair_value_uint64(pair);
+
+ error = spa_vdev_initialize(spa, vdev_guid, cmd_type);
+ if (error != 0) {
+ char guid_as_str[MAXNAMELEN];
+
+ (void) snprintf(guid_as_str, sizeof (guid_as_str),
+ "%llu", (unsigned long long)vdev_guid);
+ fnvlist_add_int64(vdev_errlist, guid_as_str, error);
+ total_errors++;
+ }
+ }
+ if (fnvlist_size(vdev_errlist) > 0) {
+ fnvlist_add_nvlist(outnvl, ZPOOL_INITIALIZE_VDEVS,
+ vdev_errlist);
+ }
+ fnvlist_free(vdev_errlist);
+
+ spa_close(spa, FTAG);
+	return (total_errors > 0 ? SET_ERROR(EINVAL) : 0);
+}
+
+/*
+ * fsname is name of dataset to rollback (to most recent snapshot)
+ *
+ * innvl may contain name of expected target snapshot
+ *
+ * outnvl: "target" -> name of most recent snapshot
+ */
+static const zfs_ioc_key_t zfs_keys_rollback[] = {
+ {"target", DATA_TYPE_STRING, ZK_OPTIONAL},
+};
+
+/* ARGSUSED */
+static int
+zfs_ioc_rollback(const char *fsname, nvlist_t *innvl, nvlist_t *outnvl)
+{
+ zfsvfs_t *zfsvfs;
+ char *target = NULL;
+ int error;
+
+ (void) nvlist_lookup_string(innvl, "target", &target);
+ if (target != NULL) {
+ const char *cp = strchr(target, '@');
+
+ /*
+ * The snap name must contain an @, and the part after it must
+ * contain only valid characters.
+ */
+ if (cp == NULL ||
+ zfs_component_namecheck(cp + 1, NULL, NULL) != 0)
+ return (SET_ERROR(EINVAL));
+ }
+
+ if (getzfsvfs(fsname, &zfsvfs) == 0) {
+ dsl_dataset_t *ds;
+
+ ds = dmu_objset_ds(zfsvfs->z_os);
+ error = zfs_suspend_fs(zfsvfs);
+ if (error == 0) {
+ int resume_err;
+
+ error = dsl_dataset_rollback(fsname, target, zfsvfs,
+ outnvl);
+ resume_err = zfs_resume_fs(zfsvfs, ds);
+ error = error ? error : resume_err;
+ }
+#ifdef illumos
+ VFS_RELE(zfsvfs->z_vfs);
+#else
+ vfs_unbusy(zfsvfs->z_vfs);
+#endif
+ } else {
+ error = dsl_dataset_rollback(fsname, target, NULL, outnvl);
+ }
+ return (error);
+}
+
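+/*
+ * dmu_objset_find() callback that unmounts the snapshot named by arg
+ * under each filesystem visited.
+ */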
+static int
+recursive_unmount(const char *fsname, void *arg)
+{
+ const char *snapname = arg;
+ char fullname[ZFS_MAX_DATASET_NAME_LEN];
+
+ (void) snprintf(fullname, sizeof (fullname), "%s@%s", fsname, snapname);
+ zfs_unmount_snap(fullname);
+
+ return (0);
+}
+
+/*
+ * inputs:
+ * zc_name old name of dataset or bookmark
+ * zc_value new name of dataset or bookmark
+ * zc_cookie recursive flag (only valid for snapshots)
+ *
+ * outputs: none
+ */
+static int
+zfs_ioc_rename(zfs_cmd_t *zc)
+{
+ objset_t *os;
+ dmu_objset_type_t ost;
+ boolean_t recursive = zc->zc_cookie & 1;
+ char *pos, *pos2;
+ boolean_t allow_mounted = B_TRUE;
+ int err;
+
+#ifdef __FreeBSD__
+ allow_mounted = (zc->zc_cookie & 2) != 0;
+#endif
+
+ zc->zc_name[sizeof (zc->zc_name) - 1] = '\0';
+ zc->zc_value[sizeof (zc->zc_value) - 1] = '\0';
+
+ pos = strchr(zc->zc_name, '#');
+ if (pos != NULL) {
+ /* Bookmarks must be in same fs. */
+ pos2 = strchr(zc->zc_value, '#');
+ if (pos2 == NULL)
+ return (SET_ERROR(EINVAL));
+
+ /* Recursive flag is not supported yet. */
+ if (recursive)
+ return (SET_ERROR(ENOTSUP));
+
+ *pos = '\0';
+ *pos2 = '\0';
+ if (strcmp(zc->zc_name, zc->zc_value) == 0) {
+ err = dsl_bookmark_rename(zc->zc_name,
+ pos + 1, pos2 + 1);
+ } else {
+ err = SET_ERROR(EXDEV);
+ }
+ *pos = '#';
+ *pos2 = '#';
+ return (err);
+ }
+
+ /* "zfs rename" from and to ...%recv datasets should both fail */
+ if (dataset_namecheck(zc->zc_name, NULL, NULL) != 0 ||
+ dataset_namecheck(zc->zc_value, NULL, NULL) != 0 ||
+ strchr(zc->zc_name, '%') || strchr(zc->zc_value, '%'))
+ return (SET_ERROR(EINVAL));
+
+ err = dmu_objset_hold(zc->zc_name, FTAG, &os);
+ if (err != 0)
+ return (err);
+ ost = dmu_objset_type(os);
+ dmu_objset_rele(os, FTAG);
+
+ pos = strchr(zc->zc_name, '@');
+ if (pos != NULL) {
+ /* Snapshots must be in same fs. */
+ pos2 = strchr(zc->zc_value, '@');
+ if (pos2 == NULL)
+ return (SET_ERROR(EINVAL));
+ *pos = '\0';
+ *pos2 = '\0';
+ if (strcmp(zc->zc_name, zc->zc_value) != 0) {
+ err = SET_ERROR(EXDEV);
+ } else {
+ if (ost == DMU_OST_ZFS && !allow_mounted) {
+ err = dmu_objset_find(zc->zc_name,
+ recursive_unmount, pos + 1,
+ recursive ? DS_FIND_CHILDREN : 0);
+ }
+ if (err == 0) {
+ err = dsl_dataset_rename_snapshot(zc->zc_name,
+ pos + 1, pos2 + 1, recursive);
+ }
+ }
+ *pos = '@';
+ *pos2 = '@';
+ return (err);
+ } else {
+#ifdef illumos
+ if (ost == DMU_OST_ZVOL)
+ (void) zvol_remove_minor(zc->zc_name);
+#endif
+ return (dsl_dir_rename(zc->zc_name, zc->zc_value));
+ }
+}
+
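+/*
+ * Check whether the given property may be set on the given dataset:
+ * verify the caller's delegated permissions and make sure the value is
+ * supported by the pool version and enabled feature flags.
+ */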
+static int
+zfs_check_settable(const char *dsname, nvpair_t *pair, cred_t *cr)
+{
+ const char *propname = nvpair_name(pair);
+ boolean_t issnap = (strchr(dsname, '@') != NULL);
+ zfs_prop_t prop = zfs_name_to_prop(propname);
+ uint64_t intval;
+ int err;
+
+ if (prop == ZPROP_INVAL) {
+ if (zfs_prop_user(propname)) {
+			if ((err = zfs_secpolicy_write_perms(dsname,
+			    ZFS_DELEG_PERM_USERPROP, cr)) != 0)
+ return (err);
+ return (0);
+ }
+
+ if (!issnap && zfs_prop_userquota(propname)) {
+ const char *perm = NULL;
+ const char *uq_prefix =
+ zfs_userquota_prop_prefixes[ZFS_PROP_USERQUOTA];
+ const char *gq_prefix =
+ zfs_userquota_prop_prefixes[ZFS_PROP_GROUPQUOTA];
+
+ if (strncmp(propname, uq_prefix,
+ strlen(uq_prefix)) == 0) {
+ perm = ZFS_DELEG_PERM_USERQUOTA;
+ } else if (strncmp(propname, gq_prefix,
+ strlen(gq_prefix)) == 0) {
+ perm = ZFS_DELEG_PERM_GROUPQUOTA;
+ } else {
+ /* USERUSED and GROUPUSED are read-only */
+ return (SET_ERROR(EINVAL));
+ }
+
+			if ((err = zfs_secpolicy_write_perms(dsname, perm, cr)) != 0)
+ return (err);
+ return (0);
+ }
+
+ return (SET_ERROR(EINVAL));
+ }
+
+ if (issnap)
+ return (SET_ERROR(EINVAL));
+
+ if (nvpair_type(pair) == DATA_TYPE_NVLIST) {
+ /*
+ * dsl_prop_get_all_impl() returns properties in this
+ * format.
+ */
+ nvlist_t *attrs;
+ VERIFY(nvpair_value_nvlist(pair, &attrs) == 0);
+ VERIFY(nvlist_lookup_nvpair(attrs, ZPROP_VALUE,
+ &pair) == 0);
+ }
+
+ /*
+ * Check that this value is valid for this pool version
+ */
+ switch (prop) {
+ case ZFS_PROP_COMPRESSION:
+ /*
+ * If the user specified gzip compression, make sure
+ * the SPA supports it. We ignore any errors here since
+ * we'll catch them later.
+ */
+ if (nvpair_value_uint64(pair, &intval) == 0) {
+ if (intval >= ZIO_COMPRESS_GZIP_1 &&
+ intval <= ZIO_COMPRESS_GZIP_9 &&
+ zfs_earlier_version(dsname,
+ SPA_VERSION_GZIP_COMPRESSION)) {
+ return (SET_ERROR(ENOTSUP));
+ }
+
+ if (intval == ZIO_COMPRESS_ZLE &&
+ zfs_earlier_version(dsname,
+ SPA_VERSION_ZLE_COMPRESSION))
+ return (SET_ERROR(ENOTSUP));
+
+ if (intval == ZIO_COMPRESS_LZ4) {
+ spa_t *spa;
+
+ if ((err = spa_open(dsname, &spa, FTAG)) != 0)
+ return (err);
+
+ if (!spa_feature_is_enabled(spa,
+ SPA_FEATURE_LZ4_COMPRESS)) {
+ spa_close(spa, FTAG);
+ return (SET_ERROR(ENOTSUP));
+ }
+ spa_close(spa, FTAG);
+ }
+
+ /*
+ * If this is a bootable dataset then
+ * verify that the compression algorithm
+ * is supported for booting. We must return
+ * something other than ENOTSUP since it
+ * implies a downrev pool version.
+ */
+ if (zfs_is_bootfs(dsname) &&
+ !BOOTFS_COMPRESS_VALID(intval)) {
+ return (SET_ERROR(ERANGE));
+ }
+ }
+ break;
+
+ case ZFS_PROP_COPIES:
+ if (zfs_earlier_version(dsname, SPA_VERSION_DITTO_BLOCKS))
+ return (SET_ERROR(ENOTSUP));
+ break;
+
+ case ZFS_PROP_RECORDSIZE:
+ /* Record sizes above 128k need the feature to be enabled */
+ if (nvpair_value_uint64(pair, &intval) == 0 &&
+ intval > SPA_OLD_MAXBLOCKSIZE) {
+ spa_t *spa;
+
+ /*
+ * We don't allow setting the property above 1MB,
+ * unless the tunable has been changed.
+ */
+ if (intval > zfs_max_recordsize ||
+ intval > SPA_MAXBLOCKSIZE)
+ return (SET_ERROR(ERANGE));
+
+ if ((err = spa_open(dsname, &spa, FTAG)) != 0)
+ return (err);
+
+ if (!spa_feature_is_enabled(spa,
+ SPA_FEATURE_LARGE_BLOCKS)) {
+ spa_close(spa, FTAG);
+ return (SET_ERROR(ENOTSUP));
+ }
+ spa_close(spa, FTAG);
+ }
+ break;
+
+ case ZFS_PROP_DNODESIZE:
+ /* Dnode sizes above 512 need the feature to be enabled */
+ if (nvpair_value_uint64(pair, &intval) == 0 &&
+ intval != ZFS_DNSIZE_LEGACY) {
+ spa_t *spa;
+
+ if ((err = spa_open(dsname, &spa, FTAG)) != 0)
+ return (err);
+
+ if (!spa_feature_is_enabled(spa,
+ SPA_FEATURE_LARGE_DNODE)) {
+ spa_close(spa, FTAG);
+ return (SET_ERROR(ENOTSUP));
+ }
+ spa_close(spa, FTAG);
+ }
+ break;
+
+ case ZFS_PROP_SPECIAL_SMALL_BLOCKS:
+ /*
+ * This property could require the allocation classes
+ * feature to be active for setting, however we allow
+ * it so that tests of settable properties succeed.
+ * The CLI will issue a warning in this case.
+ */
+ break;
+
+ case ZFS_PROP_SHARESMB:
+ if (zpl_earlier_version(dsname, ZPL_VERSION_FUID))
+ return (SET_ERROR(ENOTSUP));
+ break;
+
+ case ZFS_PROP_ACLINHERIT:
+ if (nvpair_type(pair) == DATA_TYPE_UINT64 &&
+ nvpair_value_uint64(pair, &intval) == 0) {
+ if (intval == ZFS_ACL_PASSTHROUGH_X &&
+ zfs_earlier_version(dsname,
+ SPA_VERSION_PASSTHROUGH_X))
+ return (SET_ERROR(ENOTSUP));
+ }
+ break;
+
+ case ZFS_PROP_CHECKSUM:
+ case ZFS_PROP_DEDUP:
+ {
+ spa_feature_t feature;
+ spa_t *spa;
+
+ /* dedup feature version checks */
+ if (prop == ZFS_PROP_DEDUP &&
+ zfs_earlier_version(dsname, SPA_VERSION_DEDUP))
+ return (SET_ERROR(ENOTSUP));
+
+ if (nvpair_value_uint64(pair, &intval) != 0)
+ return (SET_ERROR(EINVAL));
+
+ /* check prop value is enabled in features */
+ feature = zio_checksum_to_feature(intval & ZIO_CHECKSUM_MASK);
+ if (feature == SPA_FEATURE_NONE)
+ break;
+
+ if ((err = spa_open(dsname, &spa, FTAG)) != 0)
+ return (err);
+
+ if (!spa_feature_is_enabled(spa, feature)) {
+ spa_close(spa, FTAG);
+ return (SET_ERROR(ENOTSUP));
+ }
+ spa_close(spa, FTAG);
+ break;
+ }
+ }
+
+ return (zfs_secpolicy_setprop(dsname, prop, pair, CRED()));
+}
+
+/*
+ * Checks for a race condition to make sure we don't increment a feature flag
+ * multiple times.
+ */
+static int
+zfs_prop_activate_feature_check(void *arg, dmu_tx_t *tx)
+{
+ spa_t *spa = dmu_tx_pool(tx)->dp_spa;
+ spa_feature_t *featurep = arg;
+
+ if (!spa_feature_is_active(spa, *featurep))
+ return (0);
+ else
+ return (SET_ERROR(EBUSY));
+}
+
+/*
+ * The callback invoked on feature activation in the sync task caused by
+ * zfs_prop_activate_feature.
+ */
+static void
+zfs_prop_activate_feature_sync(void *arg, dmu_tx_t *tx)
+{
+ spa_t *spa = dmu_tx_pool(tx)->dp_spa;
+ spa_feature_t *featurep = arg;
+
+ spa_feature_incr(spa, *featurep, tx);
+}
+
+/*
+ * Activates a feature on a pool in response to a property setting. This
+ * creates a new sync task which modifies the pool to reflect the feature
+ * as being active.
+ */
+static int
+zfs_prop_activate_feature(spa_t *spa, spa_feature_t feature)
+{
+ int err;
+
+ /* EBUSY here indicates that the feature is already active */
+ err = dsl_sync_task(spa_name(spa),
+ zfs_prop_activate_feature_check, zfs_prop_activate_feature_sync,
+ &feature, 2, ZFS_SPACE_CHECK_RESERVED);
+
+ if (err != 0 && err != EBUSY)
+ return (err);
+ else
+ return (0);
+}
+
+/*
+ * Removes properties from the given props list that fail permission checks
+ * needed to clear them and to restore them in case of a receive error. For each
+ * property, make sure we have both set and inherit permissions.
+ *
+ * Returns the first error encountered if any permission checks fail. If the
+ * caller provides a non-NULL errlist, it also gives the complete list of names
+ * of all the properties that failed a permission check along with the
+ * corresponding error numbers. The caller is responsible for freeing the
+ * returned errlist.
+ *
+ * If every property checks out successfully, zero is returned and the list
+ * pointed at by errlist is NULL.
+ */
+static int
+zfs_check_clearable(char *dataset, nvlist_t *props, nvlist_t **errlist)
+{
+ zfs_cmd_t *zc;
+ nvpair_t *pair, *next_pair;
+ nvlist_t *errors;
+ int err, rv = 0;
+
+ if (props == NULL)
+ return (0);
+
+ VERIFY(nvlist_alloc(&errors, NV_UNIQUE_NAME, KM_SLEEP) == 0);
+
+ zc = kmem_alloc(sizeof (zfs_cmd_t), KM_SLEEP);
+ (void) strcpy(zc->zc_name, dataset);
+ pair = nvlist_next_nvpair(props, NULL);
+ while (pair != NULL) {
+ next_pair = nvlist_next_nvpair(props, pair);
+
+ (void) strcpy(zc->zc_value, nvpair_name(pair));
+ if ((err = zfs_check_settable(dataset, pair, CRED())) != 0 ||
+ (err = zfs_secpolicy_inherit_prop(zc, NULL, CRED())) != 0) {
+ VERIFY(nvlist_remove_nvpair(props, pair) == 0);
+ VERIFY(nvlist_add_int32(errors,
+ zc->zc_value, err) == 0);
+ }
+ pair = next_pair;
+ }
+ kmem_free(zc, sizeof (zfs_cmd_t));
+
+ if ((pair = nvlist_next_nvpair(errors, NULL)) == NULL) {
+ nvlist_free(errors);
+ errors = NULL;
+ } else {
+ VERIFY(nvpair_value_int32(pair, &rv) == 0);
+ }
+
+ if (errlist == NULL)
+ nvlist_free(errors);
+ else
+ *errlist = errors;
+
+ return (rv);
+}
+
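+/*
+ * Compare two property values for equality, first unwrapping any
+ * dsl_prop_get_all_impl()-style nvlist encoding.
+ */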
+static boolean_t
+propval_equals(nvpair_t *p1, nvpair_t *p2)
+{
+ if (nvpair_type(p1) == DATA_TYPE_NVLIST) {
+ /* dsl_prop_get_all_impl() format */
+ nvlist_t *attrs;
+ VERIFY(nvpair_value_nvlist(p1, &attrs) == 0);
+ VERIFY(nvlist_lookup_nvpair(attrs, ZPROP_VALUE,
+ &p1) == 0);
+ }
+
+ if (nvpair_type(p2) == DATA_TYPE_NVLIST) {
+ nvlist_t *attrs;
+ VERIFY(nvpair_value_nvlist(p2, &attrs) == 0);
+ VERIFY(nvlist_lookup_nvpair(attrs, ZPROP_VALUE,
+ &p2) == 0);
+ }
+
+ if (nvpair_type(p1) != nvpair_type(p2))
+ return (B_FALSE);
+
+ if (nvpair_type(p1) == DATA_TYPE_STRING) {
+ char *valstr1, *valstr2;
+
+ VERIFY(nvpair_value_string(p1, (char **)&valstr1) == 0);
+ VERIFY(nvpair_value_string(p2, (char **)&valstr2) == 0);
+ return (strcmp(valstr1, valstr2) == 0);
+ } else {
+ uint64_t intval1, intval2;
+
+ VERIFY(nvpair_value_uint64(p1, &intval1) == 0);
+ VERIFY(nvpair_value_uint64(p2, &intval2) == 0);
+ return (intval1 == intval2);
+ }
+}
+
+/*
+ * Remove properties from props if they are not going to change (as determined
+ * by comparison with origprops). Remove them from origprops as well, since we
+ * do not need to clear or restore properties that won't change.
+ */
+static void
+props_reduce(nvlist_t *props, nvlist_t *origprops)
+{
+ nvpair_t *pair, *next_pair;
+
+ if (origprops == NULL)
+ return; /* all props need to be received */
+
+ pair = nvlist_next_nvpair(props, NULL);
+ while (pair != NULL) {
+ const char *propname = nvpair_name(pair);
+ nvpair_t *match;
+
+ next_pair = nvlist_next_nvpair(props, pair);
+
+ if ((nvlist_lookup_nvpair(origprops, propname,
+ &match) != 0) || !propval_equals(pair, match))
+ goto next; /* need to set received value */
+
+ /* don't clear the existing received value */
+ (void) nvlist_remove_nvpair(origprops, match);
+ /* don't bother receiving the property */
+ (void) nvlist_remove_nvpair(props, pair);
+next:
+ pair = next_pair;
+ }
+}
+
+/*
+ * Extract properties that cannot be set PRIOR to the receipt of a dataset.
+ * For example, refquota cannot be set until after the receipt of a dataset,
+ * because in replication streams, an older/earlier snapshot may exceed the
+ * refquota. We want to receive the older/earlier snapshot, but setting
+ * refquota pre-receipt will set the dsl's ACTUAL quota, which will prevent
+ * the older/earlier snapshot from being received (with EDQUOT).
+ *
+ * The ZFS test "zfs_receive_011_pos" demonstrates such a scenario.
+ *
+ * libzfs will need to be judicious in handling errors encountered by props
+ * extracted by this function.
+ */
+static nvlist_t *
+extract_delay_props(nvlist_t *props)
+{
+ nvlist_t *delayprops;
+ nvpair_t *nvp, *tmp;
+ static const zfs_prop_t delayable[] = { ZFS_PROP_REFQUOTA, 0 };
+ int i;
+
+ VERIFY(nvlist_alloc(&delayprops, NV_UNIQUE_NAME, KM_SLEEP) == 0);
+
+ for (nvp = nvlist_next_nvpair(props, NULL); nvp != NULL;
+ nvp = nvlist_next_nvpair(props, nvp)) {
+ /*
+ * strcmp() is safe because zfs_prop_to_name() always returns
+ * a bounded string.
+ */
+ for (i = 0; delayable[i] != 0; i++) {
+ if (strcmp(zfs_prop_to_name(delayable[i]),
+ nvpair_name(nvp)) == 0) {
+ break;
+ }
+ }
+ if (delayable[i] != 0) {
+ tmp = nvlist_prev_nvpair(props, nvp);
+ VERIFY(nvlist_add_nvpair(delayprops, nvp) == 0);
+ VERIFY(nvlist_remove_nvpair(props, nvp) == 0);
+ nvp = tmp;
+ }
+ }
+
+ if (nvlist_empty(delayprops)) {
+ nvlist_free(delayprops);
+ delayprops = NULL;
+ }
+ return (delayprops);
+}
+
+#ifdef DEBUG
+static boolean_t zfs_ioc_recv_inject_err;
+#endif
+
+/*
+ * inputs:
+ * zc_name name of containing filesystem
+ * zc_nvlist_src{_size} nvlist of properties to apply
+ * zc_value name of snapshot to create
+ * zc_string name of clone origin (if DRR_FLAG_CLONE)
+ * zc_cookie file descriptor to recv from
+ * zc_begin_record the BEGIN record of the stream (not byteswapped)
+ * zc_guid force flag
+ * zc_cleanup_fd cleanup-on-exit file descriptor
+ * zc_action_handle handle for this guid/ds mapping (or zero on first call)
+ * zc_resumable if data is incomplete assume sender will resume
+ *
+ * outputs:
+ * zc_cookie number of bytes read
+ * zc_nvlist_dst{_size} error for each unapplied received property
+ * zc_obj zprop_errflags_t
+ * zc_action_handle handle for this guid/ds mapping
+ */
+static int
+zfs_ioc_recv(zfs_cmd_t *zc)
+{
+ file_t *fp;
+ dmu_recv_cookie_t drc;
+ boolean_t force = (boolean_t)zc->zc_guid;
+ int fd;
+ int error = 0;
+ int props_error = 0;
+ nvlist_t *errors;
+ offset_t off;
+ nvlist_t *props = NULL; /* sent properties */
+ nvlist_t *origprops = NULL; /* existing properties */
+ nvlist_t *delayprops = NULL; /* sent properties applied post-receive */
+ char *origin = NULL;
+ char *tosnap;
+ char tofs[ZFS_MAX_DATASET_NAME_LEN];
+ boolean_t first_recvd_props = B_FALSE;
+
+ if (dataset_namecheck(zc->zc_value, NULL, NULL) != 0 ||
+ strchr(zc->zc_value, '@') == NULL ||
+ strchr(zc->zc_value, '%'))
+ return (SET_ERROR(EINVAL));
+
+ (void) strcpy(tofs, zc->zc_value);
+ tosnap = strchr(tofs, '@');
+ *tosnap++ = '\0';
+
+ if (zc->zc_nvlist_src != 0 &&
+ (error = get_nvlist(zc->zc_nvlist_src, zc->zc_nvlist_src_size,
+ zc->zc_iflags, &props)) != 0)
+ return (error);
+
+ fd = zc->zc_cookie;
+#ifdef illumos
+ fp = getf(fd);
+#else
+ fget_read(curthread, fd, &cap_pread_rights, &fp);
+#endif
+ if (fp == NULL) {
+ nvlist_free(props);
+ return (SET_ERROR(EBADF));
+ }
+
+ errors = fnvlist_alloc();
+
+ if (zc->zc_string[0])
+ origin = zc->zc_string;
+
+ error = dmu_recv_begin(tofs, tosnap,
+ &zc->zc_begin_record, force, zc->zc_resumable, origin, &drc);
+ if (error != 0)
+ goto out;
+
+ /*
+ * Set properties before we receive the stream so that they are applied
+ * to the new data. Note that we must call dmu_recv_stream() if
+ * dmu_recv_begin() succeeds.
+ */
+ if (props != NULL && !drc.drc_newfs) {
+ if (spa_version(dsl_dataset_get_spa(drc.drc_ds)) >=
+ SPA_VERSION_RECVD_PROPS &&
+ !dsl_prop_get_hasrecvd(tofs))
+ first_recvd_props = B_TRUE;
+
+ /*
+ * If new received properties are supplied, they are to
+ * completely replace the existing received properties, so stash
+ * away the existing ones.
+ */
+ if (dsl_prop_get_received(tofs, &origprops) == 0) {
+ nvlist_t *errlist = NULL;
+ /*
+ * Don't bother writing a property if its value won't
+ * change (and avoid the unnecessary security checks).
+ *
+ * The first receive after SPA_VERSION_RECVD_PROPS is a
+ * special case where we blow away all local properties
+ * regardless.
+ */
+ if (!first_recvd_props)
+ props_reduce(props, origprops);
+ if (zfs_check_clearable(tofs, origprops, &errlist) != 0)
+ (void) nvlist_merge(errors, errlist, 0);
+ nvlist_free(errlist);
+
+ if (clear_received_props(tofs, origprops,
+ first_recvd_props ? NULL : props) != 0)
+ zc->zc_obj |= ZPROP_ERR_NOCLEAR;
+ } else {
+ zc->zc_obj |= ZPROP_ERR_NOCLEAR;
+ }
+ }
+
+ if (props != NULL) {
+ props_error = dsl_prop_set_hasrecvd(tofs);
+
+ if (props_error == 0) {
+ delayprops = extract_delay_props(props);
+ (void) zfs_set_prop_nvlist(tofs, ZPROP_SRC_RECEIVED,
+ props, errors);
+ }
+ }
+
+ off = fp->f_offset;
+ error = dmu_recv_stream(&drc, fp, &off, zc->zc_cleanup_fd,
+ &zc->zc_action_handle);
+
+ if (error == 0) {
+ zfsvfs_t *zfsvfs = NULL;
+
+ if (getzfsvfs(tofs, &zfsvfs) == 0) {
+ /* online recv */
+ dsl_dataset_t *ds;
+ int end_err;
+
+ ds = dmu_objset_ds(zfsvfs->z_os);
+ error = zfs_suspend_fs(zfsvfs);
+ /*
+ * If the suspend fails, then the recv_end will
+ * likely also fail, and clean up after itself.
+ */
+ end_err = dmu_recv_end(&drc, zfsvfs);
+ if (error == 0)
+ error = zfs_resume_fs(zfsvfs, ds);
+ error = error ? error : end_err;
+#ifdef illumos
+ VFS_RELE(zfsvfs->z_vfs);
+#else
+ vfs_unbusy(zfsvfs->z_vfs);
+#endif
+ } else {
+ error = dmu_recv_end(&drc, NULL);
+ }
+
+ /* Set delayed properties now, after we're done receiving. */
+ if (delayprops != NULL && error == 0) {
+ (void) zfs_set_prop_nvlist(tofs, ZPROP_SRC_RECEIVED,
+ delayprops, errors);
+ }
+ }
+
+ if (delayprops != NULL) {
+ /*
+ * Merge delayed props back in with initial props, in case
+ * we're DEBUG and zfs_ioc_recv_inject_err is set (which means
+ * we have to make sure clear_received_props() includes
+ * the delayed properties).
+ *
+ * Since zfs_ioc_recv_inject_err is only set in DEBUG kernels,
+ * where ASSERT() is active, using ASSERT() here is effectively
+ * a VERIFY.
+ */
+ ASSERT(nvlist_merge(props, delayprops, 0) == 0);
+ nvlist_free(delayprops);
+ }
+
+ /*
+ * Now that all props, initial and delayed, are set, report the prop
+ * errors to the caller.
+ */
+ if (zc->zc_nvlist_dst_size != 0 &&
+ (nvlist_smush(errors, zc->zc_nvlist_dst_size) != 0 ||
+ put_nvlist(zc, errors) != 0)) {
+ /*
+ * Caller made zc->zc_nvlist_dst less than the minimum expected
+ * size or supplied an invalid address.
+ */
+ props_error = SET_ERROR(EINVAL);
+ }
+
+ zc->zc_cookie = off - fp->f_offset;
+ if (off >= 0 && off <= MAXOFFSET_T)
+ fp->f_offset = off;
+
+#ifdef DEBUG
+ if (zfs_ioc_recv_inject_err) {
+ zfs_ioc_recv_inject_err = B_FALSE;
+ error = 1;
+ }
+#endif
+
+ /*
+ * On error, restore the original props.
+ */
+ if (error != 0 && props != NULL && !drc.drc_newfs) {
+ if (clear_received_props(tofs, props, NULL) != 0) {
+ /*
+ * We failed to clear the received properties.
+ * Since we may have left a $recvd value on the
+ * system, we can't clear the $hasrecvd flag.
+ */
+ zc->zc_obj |= ZPROP_ERR_NORESTORE;
+ } else if (first_recvd_props) {
+ dsl_prop_unset_hasrecvd(tofs);
+ }
+
+ if (origprops == NULL && !drc.drc_newfs) {
+ /* We failed to stash the original properties. */
+ zc->zc_obj |= ZPROP_ERR_NORESTORE;
+ }
+
+ /*
+ * dsl_props_set() will not convert RECEIVED to LOCAL on or
+ * after SPA_VERSION_RECVD_PROPS, so we need to specify LOCAL
+ * explicitly if we're restoring local properties cleared in the
+ * first new-style receive.
+ */
+ if (origprops != NULL &&
+ zfs_set_prop_nvlist(tofs, (first_recvd_props ?
+ ZPROP_SRC_LOCAL : ZPROP_SRC_RECEIVED),
+ origprops, NULL) != 0) {
+ /*
+ * We stashed the original properties but failed to
+ * restore them.
+ */
+ zc->zc_obj |= ZPROP_ERR_NORESTORE;
+ }
+ }
+out:
+ nvlist_free(props);
+ nvlist_free(origprops);
+ nvlist_free(errors);
+ releasef(fd);
+
+ if (error == 0)
+ error = props_error;
+
+ return (error);
+}
+
+/*
+ * inputs:
+ * zc_name name of snapshot to send
+ * zc_cookie file descriptor to send stream to
+ * zc_obj fromorigin flag (mutually exclusive with zc_fromobj)
+ * zc_sendobj objsetid of snapshot to send
+ * zc_fromobj objsetid of incremental fromsnap (may be zero)
+ * zc_guid if set, estimate size of stream only. zc_cookie is ignored.
+ * output size in zc_objset_type.
+ * zc_flags lzc_send_flags
+ *
+ * outputs:
+ * zc_objset_type estimated size, if zc_guid is set
+ *
+ * NOTE: This is no longer the preferred interface; any new functionality
+ * should be added to zfs_ioc_send_new() instead.
+ */
+static int
+zfs_ioc_send(zfs_cmd_t *zc)
+{
+ int error;
+ offset_t off;
+ boolean_t estimate = (zc->zc_guid != 0);
+ boolean_t embedok = (zc->zc_flags & 0x1); /* LZC_SEND_FLAG_EMBED_DATA */
+ boolean_t large_block_ok = (zc->zc_flags & 0x2); /* LZC_SEND_FLAG_LARGE_BLOCK */
+ boolean_t compressok = (zc->zc_flags & 0x4); /* LZC_SEND_FLAG_COMPRESS */
+
+ if (zc->zc_obj != 0) {
+ dsl_pool_t *dp;
+ dsl_dataset_t *tosnap;
+
+ error = dsl_pool_hold(zc->zc_name, FTAG, &dp);
+ if (error != 0)
+ return (error);
+
+ error = dsl_dataset_hold_obj(dp, zc->zc_sendobj, FTAG, &tosnap);
+ if (error != 0) {
+ dsl_pool_rele(dp, FTAG);
+ return (error);
+ }
+
+ if (dsl_dir_is_clone(tosnap->ds_dir))
+ zc->zc_fromobj =
+ dsl_dir_phys(tosnap->ds_dir)->dd_origin_obj;
+ dsl_dataset_rele(tosnap, FTAG);
+ dsl_pool_rele(dp, FTAG);
+ }
+
+ if (estimate) {
+ dsl_pool_t *dp;
+ dsl_dataset_t *tosnap;
+ dsl_dataset_t *fromsnap = NULL;
+
+ error = dsl_pool_hold(zc->zc_name, FTAG, &dp);
+ if (error != 0)
+ return (error);
+
+ error = dsl_dataset_hold_obj(dp, zc->zc_sendobj, FTAG, &tosnap);
+ if (error != 0) {
+ dsl_pool_rele(dp, FTAG);
+ return (error);
+ }
+
+ if (zc->zc_fromobj != 0) {
+ error = dsl_dataset_hold_obj(dp, zc->zc_fromobj,
+ FTAG, &fromsnap);
+ if (error != 0) {
+ dsl_dataset_rele(tosnap, FTAG);
+ dsl_pool_rele(dp, FTAG);
+ return (error);
+ }
+ }
+
+ error = dmu_send_estimate(tosnap, fromsnap, compressok,
+ &zc->zc_objset_type);
+
+ if (fromsnap != NULL)
+ dsl_dataset_rele(fromsnap, FTAG);
+ dsl_dataset_rele(tosnap, FTAG);
+ dsl_pool_rele(dp, FTAG);
+ } else {
+ file_t *fp;
+
+#ifdef illumos
+ fp = getf(zc->zc_cookie);
+#else
+ fget_write(curthread, zc->zc_cookie, &cap_write_rights, &fp);
+#endif
+ if (fp == NULL)
+ return (SET_ERROR(EBADF));
+
+ off = fp->f_offset;
+ error = dmu_send_obj(zc->zc_name, zc->zc_sendobj,
+ zc->zc_fromobj, embedok, large_block_ok, compressok,
+#ifdef illumos
+ zc->zc_cookie, fp->f_vnode, &off);
+#else
+ zc->zc_cookie, fp, &off);
+#endif
+
+ if (off >= 0 && off <= MAXOFFSET_T)
+ fp->f_offset = off;
+ releasef(zc->zc_cookie);
+ }
+ return (error);
+}
+
+/*
+ * inputs:
+ * zc_name name of snapshot on which to report progress
+ * zc_cookie file descriptor of send stream
+ *
+ * outputs:
+ * zc_cookie number of bytes written in send stream thus far
+ */
+static int
+zfs_ioc_send_progress(zfs_cmd_t *zc)
+{
+ dsl_pool_t *dp;
+ dsl_dataset_t *ds;
+ dmu_sendarg_t *dsp = NULL;
+ int error;
+
+ error = dsl_pool_hold(zc->zc_name, FTAG, &dp);
+ if (error != 0)
+ return (error);
+
+ error = dsl_dataset_hold(dp, zc->zc_name, FTAG, &ds);
+ if (error != 0) {
+ dsl_pool_rele(dp, FTAG);
+ return (error);
+ }
+
+ mutex_enter(&ds->ds_sendstream_lock);
+
+ /*
+ * Iterate over all the send streams currently active on this dataset.
+ * If there's one which matches the specified file descriptor _and_ the
+ * stream was started by the current process, return the progress of
+ * that stream.
+ */
+ for (dsp = list_head(&ds->ds_sendstreams); dsp != NULL;
+ dsp = list_next(&ds->ds_sendstreams, dsp)) {
+ if (dsp->dsa_outfd == zc->zc_cookie &&
+ dsp->dsa_proc == curproc)
+ break;
+ }
+
+ if (dsp != NULL)
+ zc->zc_cookie = *(dsp->dsa_off);
+ else
+ error = SET_ERROR(ENOENT);
+
+ mutex_exit(&ds->ds_sendstream_lock);
+ dsl_dataset_rele(ds, FTAG);
+ dsl_pool_rele(dp, FTAG);
+ return (error);
+}
+
+static int
+zfs_ioc_inject_fault(zfs_cmd_t *zc)
+{
+ int id, error;
+
+ error = zio_inject_fault(zc->zc_name, (int)zc->zc_guid, &id,
+ &zc->zc_inject_record);
+
+ if (error == 0)
+ zc->zc_guid = (uint64_t)id;
+
+ return (error);
+}
+
+static int
+zfs_ioc_clear_fault(zfs_cmd_t *zc)
+{
+ return (zio_clear_fault((int)zc->zc_guid));
+}
+
+static int
+zfs_ioc_inject_list_next(zfs_cmd_t *zc)
+{
+ int id = (int)zc->zc_guid;
+ int error;
+
+ error = zio_inject_list_next(&id, zc->zc_name, sizeof (zc->zc_name),
+ &zc->zc_inject_record);
+
+ zc->zc_guid = id;
+
+ return (error);
+}
+
+static int
+zfs_ioc_error_log(zfs_cmd_t *zc)
+{
+ spa_t *spa;
+ int error;
+ size_t count = (size_t)zc->zc_nvlist_dst_size;
+
+ if ((error = spa_open(zc->zc_name, &spa, FTAG)) != 0)
+ return (error);
+
+ error = spa_get_errlog(spa, (void *)(uintptr_t)zc->zc_nvlist_dst,
+ &count);
+ if (error == 0)
+ zc->zc_nvlist_dst_size = count;
+ else
+ zc->zc_nvlist_dst_size = spa_get_errlog_size(spa);
+
+ spa_close(spa, FTAG);
+
+ return (error);
+}
+
+static int
+zfs_ioc_clear(zfs_cmd_t *zc)
+{
+ spa_t *spa;
+ vdev_t *vd;
+ int error;
+
+ /*
+ * On zpool clear we also fix up missing slogs
+ */
+ mutex_enter(&spa_namespace_lock);
+ spa = spa_lookup(zc->zc_name);
+ if (spa == NULL) {
+ mutex_exit(&spa_namespace_lock);
+ return (SET_ERROR(EIO));
+ }
+ if (spa_get_log_state(spa) == SPA_LOG_MISSING) {
+ /* we need to let spa_open/spa_load clear the chains */
+ spa_set_log_state(spa, SPA_LOG_CLEAR);
+ }
+ spa->spa_last_open_failed = 0;
+ mutex_exit(&spa_namespace_lock);
+
+ if (zc->zc_cookie & ZPOOL_NO_REWIND) {
+ error = spa_open(zc->zc_name, &spa, FTAG);
+ } else {
+ nvlist_t *policy;
+ nvlist_t *config = NULL;
+
+ if (zc->zc_nvlist_src == 0)
+ return (SET_ERROR(EINVAL));
+
+ if ((error = get_nvlist(zc->zc_nvlist_src,
+ zc->zc_nvlist_src_size, zc->zc_iflags, &policy)) == 0) {
+ error = spa_open_rewind(zc->zc_name, &spa, FTAG,
+ policy, &config);
+ if (config != NULL) {
+ int err;
+
+ if ((err = put_nvlist(zc, config)) != 0)
+ error = err;
+ nvlist_free(config);
+ }
+ nvlist_free(policy);
+ }
+ }
+
+ if (error != 0)
+ return (error);
+
+ /*
+ * If multihost is enabled, resuming I/O is unsafe as another
+ * host may have imported the pool.
+ */
+ if (spa_multihost(spa) && spa_suspended(spa)) {
+ spa_close(spa, FTAG);
+ return (SET_ERROR(EINVAL));
+ }
+
+ spa_vdev_state_enter(spa, SCL_NONE);
+
+ if (zc->zc_guid == 0) {
+ vd = NULL;
+ } else {
+ vd = spa_lookup_by_guid(spa, zc->zc_guid, B_TRUE);
+ if (vd == NULL) {
+ (void) spa_vdev_state_exit(spa, NULL, ENODEV);
+ spa_close(spa, FTAG);
+ return (SET_ERROR(ENODEV));
+ }
+ }
+
+ vdev_clear(spa, vd);
+
+ (void) spa_vdev_state_exit(spa, NULL, 0);
+
+ /*
+ * Resume any suspended I/Os.
+ */
+ if (zio_resume(spa) != 0)
+ error = SET_ERROR(EIO);
+
+ spa_close(spa, FTAG);
+
+ return (error);
+}
+
+/*
+ * Reopen all the vdevs associated with the pool.
+ *
+ * innvl: {
+ * "scrub_restart" -> when true and scrub is running, allow to restart
+ * scrub as the side effect of the reopen (boolean).
+ * }
+ *
+ * outnvl is unused
+ */
+static const zfs_ioc_key_t zfs_keys_pool_reopen[] = {
+ {"scrub_restart", DATA_TYPE_BOOLEAN_VALUE, ZK_OPTIONAL},
+};
+
+static int
+zfs_ioc_pool_reopen(const char *pool, nvlist_t *innvl, nvlist_t *outnvl)
+{
+ spa_t *spa;
+ int error;
+ boolean_t scrub_restart = B_TRUE;
+
+ if (innvl) {
+ scrub_restart = fnvlist_lookup_boolean_value(innvl,
+ "scrub_restart");
+ }
+
+ error = spa_open(pool, &spa, FTAG);
+ if (error != 0)
+ return (error);
+
+ spa_vdev_state_enter(spa, SCL_NONE);
+
+ /*
+ * If a resilver is already in progress then set the
+ * spa_scrub_reopen flag to B_TRUE so that we don't restart
+ * the scan as a side effect of the reopen. Otherwise, let
+ * vdev_open() decide whether a resilver is required.
+ */
+ spa->spa_scrub_reopen = (!scrub_restart &&
+ dsl_scan_resilvering(spa->spa_dsl_pool));
+ vdev_reopen(spa->spa_root_vdev);
+ spa->spa_scrub_reopen = B_FALSE;
+
+ (void) spa_vdev_state_exit(spa, NULL, 0);
+ spa_close(spa, FTAG);
+ return (0);
+}
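+
+/*
+ * Illustrative caller sketch (the pool name "tank" is hypothetical):
+ * to reopen the vdevs without restarting an active scrub, a caller
+ * would build the innvl documented above as:
+ *
+ *     nvlist_t *innvl = fnvlist_alloc();
+ *     fnvlist_add_boolean_value(innvl, "scrub_restart", B_FALSE);
+ *     ... dispatch ZFS_IOC_POOL_REOPEN against "tank" ...
+ *     fnvlist_free(innvl);
+ */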
+
+/*
+ * inputs:
+ * zc_name name of filesystem
+ *
+ * outputs:
+ * zc_string name of conflicting snapshot, if there is one
+ */
+static int
+zfs_ioc_promote(zfs_cmd_t *zc)
+{
+ dsl_pool_t *dp;
+ dsl_dataset_t *ds, *ods;
+ char origin[ZFS_MAX_DATASET_NAME_LEN];
+ char *cp;
+ int error;
+
+ zc->zc_name[sizeof (zc->zc_name) - 1] = '\0';
+ if (dataset_namecheck(zc->zc_name, NULL, NULL) != 0 ||
+ strchr(zc->zc_name, '%'))
+ return (SET_ERROR(EINVAL));
+
+ error = dsl_pool_hold(zc->zc_name, FTAG, &dp);
+ if (error != 0)
+ return (error);
+
+ error = dsl_dataset_hold(dp, zc->zc_name, FTAG, &ds);
+ if (error != 0) {
+ dsl_pool_rele(dp, FTAG);
+ return (error);
+ }
+
+ if (!dsl_dir_is_clone(ds->ds_dir)) {
+ dsl_dataset_rele(ds, FTAG);
+ dsl_pool_rele(dp, FTAG);
+ return (SET_ERROR(EINVAL));
+ }
+
+ error = dsl_dataset_hold_obj(dp,
+ dsl_dir_phys(ds->ds_dir)->dd_origin_obj, FTAG, &ods);
+ if (error != 0) {
+ dsl_dataset_rele(ds, FTAG);
+ dsl_pool_rele(dp, FTAG);
+ return (error);
+ }
+
+ dsl_dataset_name(ods, origin);
+ dsl_dataset_rele(ods, FTAG);
+ dsl_dataset_rele(ds, FTAG);
+ dsl_pool_rele(dp, FTAG);
+
+ /*
+ * We don't need to unmount *all* the origin fs's snapshots, but
+ * it's easier.
+ */
+ cp = strchr(origin, '@');
+ if (cp)
+ *cp = '\0';
+ (void) dmu_objset_find(origin,
+ zfs_unmount_snap_cb, NULL, DS_FIND_SNAPSHOTS);
+ return (dsl_dataset_promote(zc->zc_name, zc->zc_string));
+}
+
+/*
+ * Retrieve a single {user|group}{used|quota}@... property.
+ *
+ * inputs:
+ * zc_name name of filesystem
+ * zc_objset_type zfs_userquota_prop_t
+ * zc_value domain name (e.g. "S-1-234-567-89")
+ * zc_guid RID/UID/GID
+ *
+ * outputs:
+ * zc_cookie property value
+ */
+static int
+zfs_ioc_userspace_one(zfs_cmd_t *zc)
+{
+ zfsvfs_t *zfsvfs;
+ int error;
+
+ if (zc->zc_objset_type >= ZFS_NUM_USERQUOTA_PROPS)
+ return (SET_ERROR(EINVAL));
+
+ error = zfsvfs_hold(zc->zc_name, FTAG, &zfsvfs, B_FALSE);
+ if (error != 0)
+ return (error);
+
+ error = zfs_userspace_one(zfsvfs,
+ zc->zc_objset_type, zc->zc_value, zc->zc_guid, &zc->zc_cookie);
+ zfsvfs_rele(zfsvfs, FTAG);
+
+ return (error);
+}
+
+/*
+ * inputs:
+ * zc_name name of filesystem
+ * zc_cookie zap cursor
+ * zc_objset_type zfs_userquota_prop_t
+ * zc_nvlist_dst[_size] buffer to fill (not really an nvlist)
+ *
+ * outputs:
+ * zc_nvlist_dst[_size] data buffer (array of zfs_useracct_t)
+ * zc_cookie zap cursor
+ */
+static int
+zfs_ioc_userspace_many(zfs_cmd_t *zc)
+{
+ zfsvfs_t *zfsvfs;
+ int bufsize = zc->zc_nvlist_dst_size;
+
+ if (bufsize <= 0)
+ return (SET_ERROR(ENOMEM));
+
+ int error = zfsvfs_hold(zc->zc_name, FTAG, &zfsvfs, B_FALSE);
+ if (error != 0)
+ return (error);
+
+ void *buf = kmem_alloc(bufsize, KM_SLEEP);
+
+ error = zfs_userspace_many(zfsvfs, zc->zc_objset_type, &zc->zc_cookie,
+ buf, &zc->zc_nvlist_dst_size);
+
+ if (error == 0) {
+ error = ddi_copyout(buf,
+ (void *)(uintptr_t)zc->zc_nvlist_dst,
+ zc->zc_nvlist_dst_size, zc->zc_iflags);
+ }
+ kmem_free(buf, bufsize);
+ zfsvfs_rele(zfsvfs, FTAG);
+
+ return (error);
+}
+
+/*
+ * inputs:
+ * zc_name name of filesystem
+ *
+ * outputs:
+ * none
+ */
+static int
+zfs_ioc_userspace_upgrade(zfs_cmd_t *zc)
+{
+ objset_t *os;
+ int error = 0;
+ zfsvfs_t *zfsvfs;
+
+ if (getzfsvfs(zc->zc_name, &zfsvfs) == 0) {
+ if (!dmu_objset_userused_enabled(zfsvfs->z_os)) {
+ /*
+ * If userused is not enabled, it may be because the
+ * objset needs to be closed & reopened (to grow the
+ * objset_phys_t). Suspending and resuming the fs will do that.
+ */
+ dsl_dataset_t *ds, *newds;
+
+ ds = dmu_objset_ds(zfsvfs->z_os);
+ error = zfs_suspend_fs(zfsvfs);
+ if (error == 0) {
+ dmu_objset_refresh_ownership(ds, &newds,
+ zfsvfs);
+ error = zfs_resume_fs(zfsvfs, newds);
+ }
+ }
+ if (error == 0)
+ error = dmu_objset_userspace_upgrade(zfsvfs->z_os);
+#ifdef illumos
+ VFS_RELE(zfsvfs->z_vfs);
+#else
+ vfs_unbusy(zfsvfs->z_vfs);
+#endif
+ } else {
+ /* XXX kind of reading contents without owning */
+ error = dmu_objset_hold(zc->zc_name, FTAG, &os);
+ if (error != 0)
+ return (error);
+
+ error = dmu_objset_userspace_upgrade(os);
+ dmu_objset_rele(os, FTAG);
+ }
+
+ return (error);
+}
+
+#ifdef illumos
+/*
+ * We don't want a hard dependency on special symbols in
+ * sharefs, nfs, and smbsrv. Resolve them if needed when
+ * the first file system is shared.
+ * Neither sharefs, nfs, nor smbsrv is an unloadable module.
+ */
+int (*znfsexport_fs)(void *arg);
+int (*zshare_fs)(enum sharefs_sys_op, share_t *, uint32_t);
+int (*zsmbexport_fs)(void *arg, boolean_t add_share);
+
+int zfs_nfsshare_inited;
+int zfs_smbshare_inited;
+
+ddi_modhandle_t nfs_mod;
+ddi_modhandle_t sharefs_mod;
+ddi_modhandle_t smbsrv_mod;
+#endif /* illumos */
+kmutex_t zfs_share_lock;
+
+#ifdef illumos
+static int
+zfs_init_sharefs()
+{
+ int error;
+
+ ASSERT(MUTEX_HELD(&zfs_share_lock));
+ /* Both NFS and SMB shares also require sharetab support. */
+ if (sharefs_mod == NULL && ((sharefs_mod =
+ ddi_modopen("fs/sharefs",
+ KRTLD_MODE_FIRST, &error)) == NULL)) {
+ return (SET_ERROR(ENOSYS));
+ }
+ if (zshare_fs == NULL && ((zshare_fs =
+ (int (*)(enum sharefs_sys_op, share_t *, uint32_t))
+ ddi_modsym(sharefs_mod, "sharefs_impl", &error)) == NULL)) {
+ return (SET_ERROR(ENOSYS));
+ }
+ return (0);
+}
+#endif /* illumos */
+
+static int
+zfs_ioc_share(zfs_cmd_t *zc)
+{
+#ifdef illumos
+ int error;
+ int opcode;
+
+ switch (zc->zc_share.z_sharetype) {
+ case ZFS_SHARE_NFS:
+ case ZFS_UNSHARE_NFS:
+ if (zfs_nfsshare_inited == 0) {
+ mutex_enter(&zfs_share_lock);
+ if (nfs_mod == NULL && ((nfs_mod = ddi_modopen("fs/nfs",
+ KRTLD_MODE_FIRST, &error)) == NULL)) {
+ mutex_exit(&zfs_share_lock);
+ return (SET_ERROR(ENOSYS));
+ }
+ if (znfsexport_fs == NULL &&
+ ((znfsexport_fs = (int (*)(void *))
+ ddi_modsym(nfs_mod,
+ "nfs_export", &error)) == NULL)) {
+ mutex_exit(&zfs_share_lock);
+ return (SET_ERROR(ENOSYS));
+ }
+ error = zfs_init_sharefs();
+ if (error != 0) {
+ mutex_exit(&zfs_share_lock);
+ return (SET_ERROR(ENOSYS));
+ }
+ zfs_nfsshare_inited = 1;
+ mutex_exit(&zfs_share_lock);
+ }
+ break;
+ case ZFS_SHARE_SMB:
+ case ZFS_UNSHARE_SMB:
+ if (zfs_smbshare_inited == 0) {
+ mutex_enter(&zfs_share_lock);
+ if (smbsrv_mod == NULL && ((smbsrv_mod =
+ ddi_modopen("drv/smbsrv",
+ KRTLD_MODE_FIRST, &error)) == NULL)) {
+ mutex_exit(&zfs_share_lock);
+ return (SET_ERROR(ENOSYS));
+ }
+ if (zsmbexport_fs == NULL && ((zsmbexport_fs =
+ (int (*)(void *, boolean_t))ddi_modsym(smbsrv_mod,
+ "smb_server_share", &error)) == NULL)) {
+ mutex_exit(&zfs_share_lock);
+ return (SET_ERROR(ENOSYS));
+ }
+ error = zfs_init_sharefs();
+ if (error != 0) {
+ mutex_exit(&zfs_share_lock);
+ return (SET_ERROR(ENOSYS));
+ }
+ zfs_smbshare_inited = 1;
+ mutex_exit(&zfs_share_lock);
+ }
+ break;
+ default:
+ return (SET_ERROR(EINVAL));
+ }
+
+ switch (zc->zc_share.z_sharetype) {
+ case ZFS_SHARE_NFS:
+ case ZFS_UNSHARE_NFS:
+ if (error =
+ znfsexport_fs((void *)
+ (uintptr_t)zc->zc_share.z_exportdata))
+ return (error);
+ break;
+ case ZFS_SHARE_SMB:
+ case ZFS_UNSHARE_SMB:
+ if (error = zsmbexport_fs((void *)
+ (uintptr_t)zc->zc_share.z_exportdata,
+ zc->zc_share.z_sharetype == ZFS_SHARE_SMB ?
+ B_TRUE: B_FALSE)) {
+ return (error);
+ }
+ break;
+ }
+
+ opcode = (zc->zc_share.z_sharetype == ZFS_SHARE_NFS ||
+ zc->zc_share.z_sharetype == ZFS_SHARE_SMB) ?
+ SHAREFS_ADD : SHAREFS_REMOVE;
+
+ /*
+ * Add or remove share from sharetab
+ */
+ error = zshare_fs(opcode,
+ (void *)(uintptr_t)zc->zc_share.z_sharedata,
+ zc->zc_share.z_sharemax);
+
+ return (error);
+
+#else /* !illumos */
+ return (ENOSYS);
+#endif /* illumos */
+}
+
+ace_t full_access[] = {
+ {(uid_t)-1, ACE_ALL_PERMS, ACE_EVERYONE, 0}
+};
+
+/*
+ * inputs:
+ * zc_name name of containing filesystem
+ * zc_obj object # beyond which we want next in-use object #
+ *
+ * outputs:
+ * zc_obj next in-use object #
+ */
+static int
+zfs_ioc_next_obj(zfs_cmd_t *zc)
+{
+ objset_t *os = NULL;
+ int error;
+
+ error = dmu_objset_hold(zc->zc_name, FTAG, &os);
+ if (error != 0)
+ return (error);
+
+ error = dmu_object_next(os, &zc->zc_obj, B_FALSE, 0);
+
+ dmu_objset_rele(os, FTAG);
+ return (error);
+}
+
+/*
+ * inputs:
+ * zc_name name of filesystem
+ * zc_value prefix name for snapshot
+ * zc_cleanup_fd cleanup-on-exit file descriptor for calling process
+ *
+ * outputs:
+ * zc_value short name of new snapshot
+ */
+static int
+zfs_ioc_tmp_snapshot(zfs_cmd_t *zc)
+{
+ char *snap_name;
+ char *hold_name;
+ int error;
+ minor_t minor;
+
+ error = zfs_onexit_fd_hold(zc->zc_cleanup_fd, &minor);
+ if (error != 0)
+ return (error);
+
+ snap_name = kmem_asprintf("%s-%016llx", zc->zc_value,
+ (u_longlong_t)ddi_get_lbolt64());
+ hold_name = kmem_asprintf("%%%s", zc->zc_value);
+
+ error = dsl_dataset_snapshot_tmp(zc->zc_name, snap_name, minor,
+ hold_name);
+ if (error == 0)
+ (void) strcpy(zc->zc_value, snap_name);
+ strfree(snap_name);
+ strfree(hold_name);
+ zfs_onexit_fd_rele(zc->zc_cleanup_fd);
+ return (error);
+}
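+
+/*
+ * Worked example of the generated names (the timestamp is
+ * hypothetical): with zc_value "recv", the snapshot becomes
+ * "recv-00000000deadbeef" and the cleanup hold tag becomes "%recv",
+ * per the kmem_asprintf() formats above.
+ */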
+
+/*
+ * inputs:
+ * zc_name name of "to" snapshot
+ * zc_value name of "from" snapshot
+ * zc_cookie file descriptor to write diff data on
+ *
+ * outputs:
+ * dmu_diff_record_t's to the file descriptor
+ */
+static int
+zfs_ioc_diff(zfs_cmd_t *zc)
+{
+ file_t *fp;
+ offset_t off;
+ int error;
+
+#ifdef illumos
+ fp = getf(zc->zc_cookie);
+#else
+ fget_write(curthread, zc->zc_cookie, &cap_write_rights, &fp);
+#endif
+ if (fp == NULL)
+ return (SET_ERROR(EBADF));
+
+ off = fp->f_offset;
+
+#ifdef illumos
+ error = dmu_diff(zc->zc_name, zc->zc_value, fp->f_vnode, &off);
+#else
+ error = dmu_diff(zc->zc_name, zc->zc_value, fp, &off);
+#endif
+
+ if (off >= 0 && off <= MAXOFFSET_T)
+ fp->f_offset = off;
+ releasef(zc->zc_cookie);
+
+ return (error);
+}
+
+#ifdef illumos
+/*
+ * Remove all ACL files in the shares dir
+ */
+static int
+zfs_smb_acl_purge(znode_t *dzp)
+{
+ zap_cursor_t zc;
+ zap_attribute_t zap;
+ zfsvfs_t *zfsvfs = dzp->z_zfsvfs;
+ int error;
+
+ for (zap_cursor_init(&zc, zfsvfs->z_os, dzp->z_id);
+ (error = zap_cursor_retrieve(&zc, &zap)) == 0;
+ zap_cursor_advance(&zc)) {
+ if ((error = VOP_REMOVE(ZTOV(dzp), zap.za_name, kcred,
+ NULL, 0)) != 0)
+ break;
+ }
+ zap_cursor_fini(&zc);
+ return (error);
+}
+#endif /* illumos */
+
+static int
+zfs_ioc_smb_acl(zfs_cmd_t *zc)
+{
+#ifdef illumos
+ vnode_t *vp;
+ znode_t *dzp;
+ vnode_t *resourcevp = NULL;
+ znode_t *sharedir;
+ zfsvfs_t *zfsvfs;
+ nvlist_t *nvlist;
+ char *src, *target;
+ vattr_t vattr;
+ vsecattr_t vsec;
+ int error = 0;
+
+ if ((error = lookupname(zc->zc_value, UIO_SYSSPACE,
+ NO_FOLLOW, NULL, &vp)) != 0)
+ return (error);
+
+ /* Now make sure mntpnt and dataset are ZFS */
+
+ if (strcmp(vp->v_vfsp->mnt_stat.f_fstypename, "zfs") != 0 ||
+ (strcmp((char *)refstr_value(vp->v_vfsp->vfs_resource),
+ zc->zc_name) != 0)) {
+ VN_RELE(vp);
+ return (SET_ERROR(EINVAL));
+ }
+
+ dzp = VTOZ(vp);
+ zfsvfs = dzp->z_zfsvfs;
+ ZFS_ENTER(zfsvfs);
+
+ /*
+ * Create the share dir if it's missing.
+ */
+ mutex_enter(&zfsvfs->z_lock);
+ if (zfsvfs->z_shares_dir == 0) {
+ dmu_tx_t *tx;
+
+ tx = dmu_tx_create(zfsvfs->z_os);
+ dmu_tx_hold_zap(tx, MASTER_NODE_OBJ, TRUE,
+ ZFS_SHARES_DIR);
+ dmu_tx_hold_zap(tx, DMU_NEW_OBJECT, FALSE, NULL);
+ error = dmu_tx_assign(tx, TXG_WAIT);
+ if (error != 0) {
+ dmu_tx_abort(tx);
+ } else {
+ error = zfs_create_share_dir(zfsvfs, tx);
+ dmu_tx_commit(tx);
+ }
+ if (error != 0) {
+ mutex_exit(&zfsvfs->z_lock);
+ VN_RELE(vp);
+ ZFS_EXIT(zfsvfs);
+ return (error);
+ }
+ }
+ mutex_exit(&zfsvfs->z_lock);
+
+ ASSERT(zfsvfs->z_shares_dir);
+ if ((error = zfs_zget(zfsvfs, zfsvfs->z_shares_dir, &sharedir)) != 0) {
+ VN_RELE(vp);
+ ZFS_EXIT(zfsvfs);
+ return (error);
+ }
+
+ switch (zc->zc_cookie) {
+ case ZFS_SMB_ACL_ADD:
+ vattr.va_mask = AT_MODE|AT_UID|AT_GID|AT_TYPE;
+ vattr.va_type = VREG;
+ vattr.va_mode = S_IFREG|0777;
+ vattr.va_uid = 0;
+ vattr.va_gid = 0;
+
+ vsec.vsa_mask = VSA_ACE;
+ vsec.vsa_aclentp = &full_access;
+ vsec.vsa_aclentsz = sizeof (full_access);
+ vsec.vsa_aclcnt = 1;
+
+ error = VOP_CREATE(ZTOV(sharedir), zc->zc_string,
+ &vattr, EXCL, 0, &resourcevp, kcred, 0, NULL, &vsec);
+ if (resourcevp)
+ VN_RELE(resourcevp);
+ break;
+
+ case ZFS_SMB_ACL_REMOVE:
+ error = VOP_REMOVE(ZTOV(sharedir), zc->zc_string, kcred,
+ NULL, 0);
+ break;
+
+ case ZFS_SMB_ACL_RENAME:
+ if ((error = get_nvlist(zc->zc_nvlist_src,
+ zc->zc_nvlist_src_size, zc->zc_iflags, &nvlist)) != 0) {
+ VN_RELE(vp);
+ VN_RELE(ZTOV(sharedir));
+ ZFS_EXIT(zfsvfs);
+ return (error);
+ }
+ if (nvlist_lookup_string(nvlist, ZFS_SMB_ACL_SRC, &src) ||
+ nvlist_lookup_string(nvlist, ZFS_SMB_ACL_TARGET,
+ &target)) {
+ VN_RELE(vp);
+ VN_RELE(ZTOV(sharedir));
+ ZFS_EXIT(zfsvfs);
+ nvlist_free(nvlist);
+ /* error is still zero here; don't return success */
+ return (SET_ERROR(EINVAL));
+ }
+ error = VOP_RENAME(ZTOV(sharedir), src, ZTOV(sharedir), target,
+ kcred, NULL, 0);
+ nvlist_free(nvlist);
+ break;
+
+ case ZFS_SMB_ACL_PURGE:
+ error = zfs_smb_acl_purge(sharedir);
+ break;
+
+ default:
+ error = SET_ERROR(EINVAL);
+ break;
+ }
+
+ VN_RELE(vp);
+ VN_RELE(ZTOV(sharedir));
+
+ ZFS_EXIT(zfsvfs);
+
+ return (error);
+#else /* !illumos */
+ return (EOPNOTSUPP);
+#endif /* illumos */
+}
+
+/*
+ * innvl: {
+ * "holds" -> { snapname -> holdname (string), ... }
+ * (optional) "cleanup_fd" -> fd (int32)
+ * }
+ *
+ * outnvl: {
+ * snapname -> error value (int32)
+ * ...
+ * }
+ */
+static const zfs_ioc_key_t zfs_keys_hold[] = {
+ {"holds", DATA_TYPE_NVLIST, 0},
+ {"cleanup_fd", DATA_TYPE_INT32, ZK_OPTIONAL},
+};
+
+/* ARGSUSED */
+static int
+zfs_ioc_hold(const char *pool, nvlist_t *args, nvlist_t *errlist)
+{
+ nvpair_t *pair;
+ nvlist_t *holds;
+ int cleanup_fd = -1;
+ int error;
+ minor_t minor = 0;
+
+ holds = fnvlist_lookup_nvlist(args, "holds");
+
+ /* make sure the user didn't pass us any invalid (empty) tags */
+ for (pair = nvlist_next_nvpair(holds, NULL); pair != NULL;
+ pair = nvlist_next_nvpair(holds, pair)) {
+ char *htag;
+
+ error = nvpair_value_string(pair, &htag);
+ if (error != 0)
+ return (SET_ERROR(error));
+
+ if (strlen(htag) == 0)
+ return (SET_ERROR(EINVAL));
+ }
+
+ if (nvlist_lookup_int32(args, "cleanup_fd", &cleanup_fd) == 0) {
+ error = zfs_onexit_fd_hold(cleanup_fd, &minor);
+ if (error != 0)
+ return (error);
+ }
+
+ error = dsl_dataset_user_hold(holds, minor, errlist);
+ if (minor != 0)
+ zfs_onexit_fd_rele(cleanup_fd);
+ return (error);
+}
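+
+/*
+ * Userland sketch, assuming libzfs_core (snapshot and tag names are
+ * hypothetical), showing the innvl shape consumed above:
+ *
+ *     nvlist_t *holds = fnvlist_alloc();
+ *     nvlist_t *errlist = NULL;
+ *     fnvlist_add_string(holds, "tank/fs@snap", "mytag");
+ *     int err = lzc_hold(holds, -1, &errlist);
+ *     fnvlist_free(holds);
+ */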
+
+/*
+ * innvl is not used.
+ *
+ * outnvl: {
+ * holdname -> time added (uint64 seconds since epoch)
+ * ...
+ * }
+ */
+static const zfs_ioc_key_t zfs_keys_get_holds[] = {
+ /* no nvl keys */
+};
+
+/* ARGSUSED */
+static int
+zfs_ioc_get_holds(const char *snapname, nvlist_t *args, nvlist_t *outnvl)
+{
+ return (dsl_dataset_get_holds(snapname, outnvl));
+}
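+
+/*
+ * Userland sketch, assuming libzfs_core (the snapshot name is
+ * hypothetical):
+ *
+ *     nvlist_t *holds = NULL;
+ *     if (lzc_get_holds("tank/fs@snap", &holds) == 0) {
+ *             ... each pair is holdname -> time added (uint64) ...
+ *             fnvlist_free(holds);
+ *     }
+ */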
+
+/*
+ * innvl: {
+ * snapname -> { holdname, ... }
+ * ...
+ * }
+ *
+ * outnvl: {
+ * snapname -> error value (int32)
+ * ...
+ * }
+ */
+static const zfs_ioc_key_t zfs_keys_release[] = {
+ {"<snapname>...", DATA_TYPE_NVLIST, ZK_WILDCARDLIST},
+};
+
+/* ARGSUSED */
+static int
+zfs_ioc_release(const char *pool, nvlist_t *holds, nvlist_t *errlist)
+{
+ return (dsl_dataset_user_release(holds, errlist));
+}
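+
+/*
+ * Userland sketch, assuming libzfs_core (names hypothetical), matching
+ * the nested innvl documented above:
+ *
+ *     nvlist_t *holds = fnvlist_alloc();
+ *     nvlist_t *tags = fnvlist_alloc();
+ *     nvlist_t *errlist = NULL;
+ *     fnvlist_add_boolean(tags, "mytag");
+ *     fnvlist_add_nvlist(holds, "tank/fs@snap", tags);
+ *     int err = lzc_release(holds, &errlist);
+ *     fnvlist_free(tags);
+ *     fnvlist_free(holds);
+ */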
+
+/*
+ * inputs:
+ * zc_name name of new filesystem or snapshot
+ * zc_value full name of old snapshot
+ *
+ * outputs:
+ * zc_cookie space in bytes
+ * zc_objset_type compressed space in bytes
+ * zc_perm_action uncompressed space in bytes
+ */
+static int
+zfs_ioc_space_written(zfs_cmd_t *zc)
+{
+ int error;
+ dsl_pool_t *dp;
+ dsl_dataset_t *new, *old;
+
+ error = dsl_pool_hold(zc->zc_name, FTAG, &dp);
+ if (error != 0)
+ return (error);
+ error = dsl_dataset_hold(dp, zc->zc_name, FTAG, &new);
+ if (error != 0) {
+ dsl_pool_rele(dp, FTAG);
+ return (error);
+ }
+ error = dsl_dataset_hold(dp, zc->zc_value, FTAG, &old);
+ if (error != 0) {
+ dsl_dataset_rele(new, FTAG);
+ dsl_pool_rele(dp, FTAG);
+ return (error);
+ }
+
+ error = dsl_dataset_space_written(old, new, &zc->zc_cookie,
+ &zc->zc_objset_type, &zc->zc_perm_action);
+ dsl_dataset_rele(old, FTAG);
+ dsl_dataset_rele(new, FTAG);
+ dsl_pool_rele(dp, FTAG);
+ return (error);
+}
+
+/*
+ * innvl: {
+ * "firstsnap" -> snapshot name
+ * }
+ *
+ * outnvl: {
+ * "used" -> space in bytes
+ * "compressed" -> compressed space in bytes
+ * "uncompressed" -> uncompressed space in bytes
+ * }
+ */
+static const zfs_ioc_key_t zfs_keys_space_snaps[] = {
+ {"firstsnap", DATA_TYPE_STRING, 0},
+};
+
+static int
+zfs_ioc_space_snaps(const char *lastsnap, nvlist_t *innvl, nvlist_t *outnvl)
+{
+ int error;
+ dsl_pool_t *dp;
+ dsl_dataset_t *new, *old;
+ char *firstsnap;
+ uint64_t used, comp, uncomp;
+
+ firstsnap = fnvlist_lookup_string(innvl, "firstsnap");
+
+ error = dsl_pool_hold(lastsnap, FTAG, &dp);
+ if (error != 0)
+ return (error);
+
+ error = dsl_dataset_hold(dp, lastsnap, FTAG, &new);
+ if (error == 0 && !new->ds_is_snapshot) {
+ dsl_dataset_rele(new, FTAG);
+ error = SET_ERROR(EINVAL);
+ }
+ if (error != 0) {
+ dsl_pool_rele(dp, FTAG);
+ return (error);
+ }
+ error = dsl_dataset_hold(dp, firstsnap, FTAG, &old);
+ if (error == 0 && !old->ds_is_snapshot) {
+ dsl_dataset_rele(old, FTAG);
+ error = SET_ERROR(EINVAL);
+ }
+ if (error != 0) {
+ dsl_dataset_rele(new, FTAG);
+ dsl_pool_rele(dp, FTAG);
+ return (error);
+ }
+
+ error = dsl_dataset_space_wouldfree(old, new, &used, &comp, &uncomp);
+ dsl_dataset_rele(old, FTAG);
+ dsl_dataset_rele(new, FTAG);
+ dsl_pool_rele(dp, FTAG);
+ fnvlist_add_uint64(outnvl, "used", used);
+ fnvlist_add_uint64(outnvl, "compressed", comp);
+ fnvlist_add_uint64(outnvl, "uncompressed", uncomp);
+ return (error);
+}
+
+static int
+zfs_ioc_jail(zfs_cmd_t *zc)
+{
+
+ return (zone_dataset_attach(curthread->td_ucred, zc->zc_name,
+ (int)zc->zc_jailid));
+}
+
+static int
+zfs_ioc_unjail(zfs_cmd_t *zc)
+{
+
+ return (zone_dataset_detach(curthread->td_ucred, zc->zc_name,
+ (int)zc->zc_jailid));
+}
+
+/*
+ * innvl: {
+ * "fd" -> file descriptor to write stream to (int32)
+ * (optional) "fromsnap" -> full snap name to send an incremental from
+ * (optional) "largeblockok" -> (value ignored)
+ * indicates that blocks > 128KB are permitted
+ * (optional) "embedok" -> (value ignored)
+ * presence indicates DRR_WRITE_EMBEDDED records are permitted
+ * (optional) "compressok" -> (value ignored)
+ * presence indicates compressed DRR_WRITE records are permitted
+ * (optional) "resume_object" and "resume_offset" -> (uint64)
+ * if present, resume send stream from specified object and offset.
+ * }
+ *
+ * outnvl is unused
+ */
+static const zfs_ioc_key_t zfs_keys_send_new[] = {
+ {"fd", DATA_TYPE_INT32, 0},
+ {"fromsnap", DATA_TYPE_STRING, ZK_OPTIONAL},
+ {"largeblockok", DATA_TYPE_BOOLEAN, ZK_OPTIONAL},
+ {"embedok", DATA_TYPE_BOOLEAN, ZK_OPTIONAL},
+ {"compressok", DATA_TYPE_BOOLEAN, ZK_OPTIONAL},
+ {"rawok", DATA_TYPE_BOOLEAN, ZK_OPTIONAL},
+ {"resume_object", DATA_TYPE_UINT64, ZK_OPTIONAL},
+ {"resume_offset", DATA_TYPE_UINT64, ZK_OPTIONAL},
+};
+
+/* ARGSUSED */
+static int
+zfs_ioc_send_new(const char *snapname, nvlist_t *innvl, nvlist_t *outnvl)
+{
+ file_t *fp;
+ int error;
+ offset_t off;
+ char *fromname = NULL;
+ int fd;
+ boolean_t largeblockok;
+ boolean_t embedok;
+ boolean_t compressok;
+ uint64_t resumeobj = 0;
+ uint64_t resumeoff = 0;
+
+ fd = fnvlist_lookup_int32(innvl, "fd");
+
+ (void) nvlist_lookup_string(innvl, "fromsnap", &fromname);
+
+ largeblockok = nvlist_exists(innvl, "largeblockok");
+ embedok = nvlist_exists(innvl, "embedok");
+ compressok = nvlist_exists(innvl, "compressok");
+
+ (void) nvlist_lookup_uint64(innvl, "resume_object", &resumeobj);
+ (void) nvlist_lookup_uint64(innvl, "resume_offset", &resumeoff);
+
+#ifdef illumos
+ fp = getf(fd);
+#else
+ fget_write(curthread, fd, &cap_write_rights, &fp);
+#endif
+ if (fp == NULL)
+ return (SET_ERROR(EBADF));
+
+ off = fp->f_offset;
+ error = dmu_send(snapname, fromname, embedok, largeblockok, compressok,
+#ifdef illumos
+ fd, resumeobj, resumeoff, fp->f_vnode, &off);
+#else
+ fd, resumeobj, resumeoff, fp, &off);
+#endif
+
+#ifdef illumos
+ if (VOP_SEEK(fp->f_vnode, fp->f_offset, &off, NULL) == 0)
+ fp->f_offset = off;
+#else
+ fp->f_offset = off;
+#endif
+
+ releasef(fd);
+ return (error);
+}
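+
+/*
+ * Illustrative innvl construction for this ioctl (fd and snapshot
+ * names are hypothetical; the keys mirror zfs_keys_send_new above):
+ *
+ *     nvlist_t *innvl = fnvlist_alloc();
+ *     fnvlist_add_int32(innvl, "fd", fd);
+ *     fnvlist_add_string(innvl, "fromsnap", "tank/fs@snap1");
+ *     fnvlist_add_boolean(innvl, "compressok");
+ *     ... dispatch ZFS_IOC_SEND_NEW for "tank/fs@snap2" ...
+ *     fnvlist_free(innvl);
+ */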
+
+/*
+ * Determine approximately how large a zfs send stream will be -- the number
+ * of bytes that will be written to the fd supplied to zfs_ioc_send_new().
+ *
+ * innvl: {
+ * (optional) "from" -> full snap or bookmark name to send an incremental
+ * from
+ * (optional) "largeblockok" -> (value ignored)
+ * indicates that blocks > 128KB are permitted
+ * (optional) "embedok" -> (value ignored)
+ * presence indicates DRR_WRITE_EMBEDDED records are permitted
+ * (optional) "compressok" -> (value ignored)
+ * presence indicates compressed DRR_WRITE records are permitted
+ * }
+ *
+ * outnvl: {
+ * "space" -> bytes of space (uint64)
+ * }
+ */
+static const zfs_ioc_key_t zfs_keys_send_space[] = {
+ {"from", DATA_TYPE_STRING, ZK_OPTIONAL},
+ {"fromsnap", DATA_TYPE_STRING, ZK_OPTIONAL},
+ {"largeblockok", DATA_TYPE_BOOLEAN, ZK_OPTIONAL},
+ {"embedok", DATA_TYPE_BOOLEAN, ZK_OPTIONAL},
+ {"compressok", DATA_TYPE_BOOLEAN, ZK_OPTIONAL},
+ {"rawok", DATA_TYPE_BOOLEAN, ZK_OPTIONAL},
+};
+
+static int
+zfs_ioc_send_space(const char *snapname, nvlist_t *innvl, nvlist_t *outnvl)
+{
+ dsl_pool_t *dp;
+ dsl_dataset_t *tosnap;
+ int error;
+ char *fromname;
+ boolean_t compressok;
+ uint64_t space;
+
+ error = dsl_pool_hold(snapname, FTAG, &dp);
+ if (error != 0)
+ return (error);
+
+ error = dsl_dataset_hold(dp, snapname, FTAG, &tosnap);
+ if (error != 0) {
+ dsl_pool_rele(dp, FTAG);
+ return (error);
+ }
+
+ compressok = nvlist_exists(innvl, "compressok");
+
+ error = nvlist_lookup_string(innvl, "from", &fromname);
+ if (error == 0) {
+ if (strchr(fromname, '@') != NULL) {
+ /*
+ * If from is a snapshot, hold it and use the more
+ * efficient dmu_send_estimate to estimate send space
+ * size using deadlists.
+ */
+ dsl_dataset_t *fromsnap;
+ error = dsl_dataset_hold(dp, fromname, FTAG, &fromsnap);
+ if (error != 0)
+ goto out;
+ error = dmu_send_estimate(tosnap, fromsnap, compressok,
+ &space);
+ dsl_dataset_rele(fromsnap, FTAG);
+ } else if (strchr(fromname, '#') != NULL) {
+ /*
+ * If from is a bookmark, fetch the creation TXG of the
+ * snapshot it was created from and use that to find
+ * blocks that were born after it.
+ */
+ zfs_bookmark_phys_t frombm;
+
+ error = dsl_bookmark_lookup(dp, fromname, tosnap,
+ &frombm);
+ if (error != 0)
+ goto out;
+ error = dmu_send_estimate_from_txg(tosnap,
+ frombm.zbm_creation_txg, compressok, &space);
+ } else {
+ /*
+ * from is not properly formatted as a snapshot or
+ * bookmark
+ */
+ error = SET_ERROR(EINVAL);
+ goto out;
+ }
+ } else {
+ /*
+ * If estimating the size of a full send, use dmu_send_estimate.
+ */
+ error = dmu_send_estimate(tosnap, NULL, compressok, &space);
+ }
+
+ fnvlist_add_uint64(outnvl, "space", space);
+
+out:
+ dsl_dataset_rele(tosnap, FTAG);
+ dsl_pool_rele(dp, FTAG);
+ return (error);
+}
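+
+/*
+ * Userland sketch, assuming libzfs_core's lzc_send_space() wrapper
+ * (snapshot names hypothetical; the exact signature varies by version):
+ *
+ *     uint64_t space;
+ *     int err = lzc_send_space("tank/fs@snap2", "tank/fs@snap1",
+ *         LZC_SEND_FLAG_COMPRESS, &space);
+ */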
+
+/*
+ * Sync the currently open TXG to disk for the specified pool.
+ * This is somewhat similar to 'zfs_sync()'.
+ * For cases that do not result in an error, this ioctl will wait for
+ * the currently open TXG to commit before returning to the caller.
+ *
+ * innvl: {
+ * "force" -> when true, force uberblock update even if there is no dirty data.
+ * In addition this will cause the vdev configuration to be written
+ * out including updating the zpool cache file. (boolean_t)
+ * }
+ *
+ * onvl is unused
+ */
+static const zfs_ioc_key_t zfs_keys_pool_sync[] = {
+ {"force", DATA_TYPE_BOOLEAN_VALUE, 0},
+};
+
+/* ARGSUSED */
+static int
+zfs_ioc_pool_sync(const char *pool, nvlist_t *innvl, nvlist_t *onvl)
+{
+ int err;
+ boolean_t force;
+ spa_t *spa;
+
+ if ((err = spa_open(pool, &spa, FTAG)) != 0)
+ return (err);
+
+ force = fnvlist_lookup_boolean_value(innvl, "force");
+ if (force) {
+ spa_config_enter(spa, SCL_CONFIG, FTAG, RW_WRITER);
+ vdev_config_dirty(spa->spa_root_vdev);
+ spa_config_exit(spa, SCL_CONFIG, FTAG);
+ }
+ txg_wait_synced(spa_get_dsl(spa), 0);
+
+ spa_close(spa, FTAG);
+
+ return (err);
+}
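+
+/*
+ * Illustrative innvl for this ioctl (the pool name "tank" is
+ * hypothetical): a forced sync that also rewrites the vdev config:
+ *
+ *     nvlist_t *innvl = fnvlist_alloc();
+ *     fnvlist_add_boolean_value(innvl, "force", B_TRUE);
+ *     ... dispatch ZFS_IOC_POOL_SYNC against "tank" ...
+ *     fnvlist_free(innvl);
+ */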
+
+static zfs_ioc_vec_t zfs_ioc_vec[ZFS_IOC_LAST - ZFS_IOC_FIRST];
+
+static void
+zfs_ioctl_register_legacy(zfs_ioc_t ioc, zfs_ioc_legacy_func_t *func,
+ zfs_secpolicy_func_t *secpolicy, zfs_ioc_namecheck_t namecheck,
+ boolean_t log_history, zfs_ioc_poolcheck_t pool_check)
+{
+ zfs_ioc_vec_t *vec = &zfs_ioc_vec[ioc - ZFS_IOC_FIRST];
+
+ ASSERT3U(ioc, >=, ZFS_IOC_FIRST);
+ ASSERT3U(ioc, <, ZFS_IOC_LAST);
+ ASSERT3P(vec->zvec_legacy_func, ==, NULL);
+ ASSERT3P(vec->zvec_func, ==, NULL);
+
+ vec->zvec_legacy_func = func;
+ vec->zvec_secpolicy = secpolicy;
+ vec->zvec_namecheck = namecheck;
+ vec->zvec_allow_log = log_history;
+ vec->zvec_pool_check = pool_check;
+}
+
+/*
+ * See the block comment at the beginning of this file for details on
+ * each argument to this function.
+ */
+static void
+zfs_ioctl_register(const char *name, zfs_ioc_t ioc, zfs_ioc_func_t *func,
+ zfs_secpolicy_func_t *secpolicy, zfs_ioc_namecheck_t namecheck,
+ zfs_ioc_poolcheck_t pool_check, boolean_t smush_outnvlist,
+ boolean_t allow_log, const zfs_ioc_key_t *nvl_keys, size_t num_keys)
+{
+ zfs_ioc_vec_t *vec = &zfs_ioc_vec[ioc - ZFS_IOC_FIRST];
+
+ ASSERT3U(ioc, >=, ZFS_IOC_FIRST);
+ ASSERT3U(ioc, <, ZFS_IOC_LAST);
+ ASSERT3P(vec->zvec_legacy_func, ==, NULL);
+ ASSERT3P(vec->zvec_func, ==, NULL);
+
+ /* if we are logging, the name must be valid */
+ ASSERT(!allow_log || namecheck != NO_NAME);
+
+ vec->zvec_name = name;
+ vec->zvec_func = func;
+ vec->zvec_secpolicy = secpolicy;
+ vec->zvec_namecheck = namecheck;
+ vec->zvec_pool_check = pool_check;
+ vec->zvec_smush_outnvlist = smush_outnvlist;
+ vec->zvec_allow_log = allow_log;
+ vec->zvec_nvl_keys = nvl_keys;
+ vec->zvec_nvl_key_count = num_keys;
+}
+
+static void
+zfs_ioctl_register_pool(zfs_ioc_t ioc, zfs_ioc_legacy_func_t *func,
+ zfs_secpolicy_func_t *secpolicy, boolean_t log_history,
+ zfs_ioc_poolcheck_t pool_check)
+{
+ zfs_ioctl_register_legacy(ioc, func, secpolicy,
+ POOL_NAME, log_history, pool_check);
+}
+
+static void
+zfs_ioctl_register_dataset_nolog(zfs_ioc_t ioc, zfs_ioc_legacy_func_t *func,
+ zfs_secpolicy_func_t *secpolicy, zfs_ioc_poolcheck_t pool_check)
+{
+ zfs_ioctl_register_legacy(ioc, func, secpolicy,
+ DATASET_NAME, B_FALSE, pool_check);
+}
+
+static void
+zfs_ioctl_register_pool_modify(zfs_ioc_t ioc, zfs_ioc_legacy_func_t *func)
+{
+ zfs_ioctl_register_legacy(ioc, func, zfs_secpolicy_config,
+ POOL_NAME, B_TRUE, POOL_CHECK_SUSPENDED | POOL_CHECK_READONLY);
+}
+
+static void
+zfs_ioctl_register_pool_meta(zfs_ioc_t ioc, zfs_ioc_legacy_func_t *func,
+ zfs_secpolicy_func_t *secpolicy)
+{
+ zfs_ioctl_register_legacy(ioc, func, secpolicy,
+ NO_NAME, B_FALSE, POOL_CHECK_NONE);
+}
+
+static void
+zfs_ioctl_register_dataset_read_secpolicy(zfs_ioc_t ioc,
+ zfs_ioc_legacy_func_t *func, zfs_secpolicy_func_t *secpolicy)
+{
+ zfs_ioctl_register_legacy(ioc, func, secpolicy,
+ DATASET_NAME, B_FALSE, POOL_CHECK_SUSPENDED);
+}
+
+static void
+zfs_ioctl_register_dataset_read(zfs_ioc_t ioc, zfs_ioc_legacy_func_t *func)
+{
+ zfs_ioctl_register_dataset_read_secpolicy(ioc, func,
+ zfs_secpolicy_read);
+}
+
+static void
+zfs_ioctl_register_dataset_modify(zfs_ioc_t ioc, zfs_ioc_legacy_func_t *func,
+ zfs_secpolicy_func_t *secpolicy)
+{
+ zfs_ioctl_register_legacy(ioc, func, secpolicy,
+ DATASET_NAME, B_TRUE, POOL_CHECK_SUSPENDED | POOL_CHECK_READONLY);
+}
+
+static void
+zfs_ioctl_init(void)
+{
+ zfs_ioctl_register("snapshot", ZFS_IOC_SNAPSHOT,
+ zfs_ioc_snapshot, zfs_secpolicy_snapshot, POOL_NAME,
+ POOL_CHECK_SUSPENDED | POOL_CHECK_READONLY, B_TRUE, B_TRUE,
+ zfs_keys_snapshot, ARRAY_SIZE(zfs_keys_snapshot));
+
+ zfs_ioctl_register("log_history", ZFS_IOC_LOG_HISTORY,
+ zfs_ioc_log_history, zfs_secpolicy_log_history, NO_NAME,
+ POOL_CHECK_SUSPENDED | POOL_CHECK_READONLY, B_FALSE, B_FALSE,
+ zfs_keys_log_history, ARRAY_SIZE(zfs_keys_log_history));
+
+ zfs_ioctl_register("space_snaps", ZFS_IOC_SPACE_SNAPS,
+ zfs_ioc_space_snaps, zfs_secpolicy_read, DATASET_NAME,
+ POOL_CHECK_SUSPENDED, B_FALSE, B_FALSE,
+ zfs_keys_space_snaps, ARRAY_SIZE(zfs_keys_space_snaps));
+
+ zfs_ioctl_register("send", ZFS_IOC_SEND_NEW,
+ zfs_ioc_send_new, zfs_secpolicy_send_new, DATASET_NAME,
+ POOL_CHECK_SUSPENDED, B_FALSE, B_FALSE,
+ zfs_keys_send_new, ARRAY_SIZE(zfs_keys_send_new));
+
+ zfs_ioctl_register("send_space", ZFS_IOC_SEND_SPACE,
+ zfs_ioc_send_space, zfs_secpolicy_read, DATASET_NAME,
+ POOL_CHECK_SUSPENDED, B_FALSE, B_FALSE,
+ zfs_keys_send_space, ARRAY_SIZE(zfs_keys_send_space));
+
+ zfs_ioctl_register("create", ZFS_IOC_CREATE,
+ zfs_ioc_create, zfs_secpolicy_create_clone, DATASET_NAME,
+ POOL_CHECK_SUSPENDED | POOL_CHECK_READONLY, B_TRUE, B_TRUE,
+ zfs_keys_create, ARRAY_SIZE(zfs_keys_create));
+
+ zfs_ioctl_register("clone", ZFS_IOC_CLONE,
+ zfs_ioc_clone, zfs_secpolicy_create_clone, DATASET_NAME,
+ POOL_CHECK_SUSPENDED | POOL_CHECK_READONLY, B_TRUE, B_TRUE,
+ zfs_keys_clone, ARRAY_SIZE(zfs_keys_clone));
+
+ zfs_ioctl_register("remap", ZFS_IOC_REMAP,
+ zfs_ioc_remap, zfs_secpolicy_remap, DATASET_NAME,
+ POOL_CHECK_SUSPENDED | POOL_CHECK_READONLY, B_FALSE, B_TRUE,
+ zfs_keys_remap, ARRAY_SIZE(zfs_keys_remap));
+
+ zfs_ioctl_register("destroy_snaps", ZFS_IOC_DESTROY_SNAPS,
+ zfs_ioc_destroy_snaps, zfs_secpolicy_destroy_snaps, POOL_NAME,
+ POOL_CHECK_SUSPENDED | POOL_CHECK_READONLY, B_TRUE, B_TRUE,
+ zfs_keys_destroy_snaps, ARRAY_SIZE(zfs_keys_destroy_snaps));
+
+ zfs_ioctl_register("hold", ZFS_IOC_HOLD,
+ zfs_ioc_hold, zfs_secpolicy_hold, POOL_NAME,
+ POOL_CHECK_SUSPENDED | POOL_CHECK_READONLY, B_TRUE, B_TRUE,
+ zfs_keys_hold, ARRAY_SIZE(zfs_keys_hold));
+ zfs_ioctl_register("release", ZFS_IOC_RELEASE,
+ zfs_ioc_release, zfs_secpolicy_release, POOL_NAME,
+ POOL_CHECK_SUSPENDED | POOL_CHECK_READONLY, B_TRUE, B_TRUE,
+ zfs_keys_release, ARRAY_SIZE(zfs_keys_release));
+
+ zfs_ioctl_register("get_holds", ZFS_IOC_GET_HOLDS,
+ zfs_ioc_get_holds, zfs_secpolicy_read, DATASET_NAME,
+ POOL_CHECK_SUSPENDED, B_FALSE, B_FALSE,
+ zfs_keys_get_holds, ARRAY_SIZE(zfs_keys_get_holds));
+
+ zfs_ioctl_register("rollback", ZFS_IOC_ROLLBACK,
+ zfs_ioc_rollback, zfs_secpolicy_rollback, DATASET_NAME,
+ POOL_CHECK_SUSPENDED | POOL_CHECK_READONLY, B_FALSE, B_TRUE,
+ zfs_keys_rollback, ARRAY_SIZE(zfs_keys_rollback));
+
+ zfs_ioctl_register("bookmark", ZFS_IOC_BOOKMARK,
+ zfs_ioc_bookmark, zfs_secpolicy_bookmark, POOL_NAME,
+ POOL_CHECK_SUSPENDED | POOL_CHECK_READONLY, B_TRUE, B_TRUE,
+ zfs_keys_bookmark, ARRAY_SIZE(zfs_keys_bookmark));
+
+ zfs_ioctl_register("get_bookmarks", ZFS_IOC_GET_BOOKMARKS,
+ zfs_ioc_get_bookmarks, zfs_secpolicy_read, DATASET_NAME,
+ POOL_CHECK_SUSPENDED, B_FALSE, B_FALSE,
+ zfs_keys_get_bookmarks, ARRAY_SIZE(zfs_keys_get_bookmarks));
+
+ zfs_ioctl_register("destroy_bookmarks", ZFS_IOC_DESTROY_BOOKMARKS,
+ zfs_ioc_destroy_bookmarks, zfs_secpolicy_destroy_bookmarks,
+ POOL_NAME,
+ POOL_CHECK_SUSPENDED | POOL_CHECK_READONLY, B_TRUE, B_TRUE,
+ zfs_keys_destroy_bookmarks,
+ ARRAY_SIZE(zfs_keys_destroy_bookmarks));
+
+ zfs_ioctl_register("channel_program", ZFS_IOC_CHANNEL_PROGRAM,
+ zfs_ioc_channel_program, zfs_secpolicy_config,
+ POOL_NAME, POOL_CHECK_SUSPENDED | POOL_CHECK_READONLY, B_TRUE,
+ B_TRUE, zfs_keys_channel_program,
+ ARRAY_SIZE(zfs_keys_channel_program));
+
+ zfs_ioctl_register("zpool_checkpoint", ZFS_IOC_POOL_CHECKPOINT,
+ zfs_ioc_pool_checkpoint, zfs_secpolicy_config, POOL_NAME,
+ POOL_CHECK_SUSPENDED | POOL_CHECK_READONLY, B_TRUE, B_TRUE,
+ zfs_keys_pool_checkpoint, ARRAY_SIZE(zfs_keys_pool_checkpoint));
+
+ zfs_ioctl_register("zpool_discard_checkpoint",
+ ZFS_IOC_POOL_DISCARD_CHECKPOINT, zfs_ioc_pool_discard_checkpoint,
+ zfs_secpolicy_config, POOL_NAME,
+ POOL_CHECK_SUSPENDED | POOL_CHECK_READONLY, B_TRUE, B_TRUE,
+ zfs_keys_pool_discard_checkpoint,
+ ARRAY_SIZE(zfs_keys_pool_discard_checkpoint));
+
+ zfs_ioctl_register("initialize", ZFS_IOC_POOL_INITIALIZE,
+ zfs_ioc_pool_initialize, zfs_secpolicy_config, POOL_NAME,
+ POOL_CHECK_SUSPENDED | POOL_CHECK_READONLY, B_TRUE, B_TRUE,
+ zfs_keys_pool_initialize, ARRAY_SIZE(zfs_keys_pool_initialize));
+
+ zfs_ioctl_register("sync", ZFS_IOC_POOL_SYNC,
+ zfs_ioc_pool_sync, zfs_secpolicy_none, POOL_NAME,
+ POOL_CHECK_SUSPENDED | POOL_CHECK_READONLY, B_FALSE, B_FALSE,
+ zfs_keys_pool_sync, ARRAY_SIZE(zfs_keys_pool_sync));
+ zfs_ioctl_register("reopen", ZFS_IOC_POOL_REOPEN, zfs_ioc_pool_reopen,
+ zfs_secpolicy_config, POOL_NAME, POOL_CHECK_SUSPENDED, B_TRUE,
+ B_TRUE, zfs_keys_pool_reopen, ARRAY_SIZE(zfs_keys_pool_reopen));
+
+ zfs_ioctl_register("set_bootenv", ZFS_IOC_SET_BOOTENV,
+ zfs_ioc_set_bootenv, zfs_secpolicy_config, POOL_NAME,
+ POOL_CHECK_SUSPENDED | POOL_CHECK_READONLY, B_FALSE, B_TRUE,
+ zfs_keys_set_bootenv, ARRAY_SIZE(zfs_keys_set_bootenv));
+
+ zfs_ioctl_register("get_bootenv", ZFS_IOC_GET_BOOTENV,
+ zfs_ioc_get_bootenv, zfs_secpolicy_none, POOL_NAME,
+ POOL_CHECK_SUSPENDED, B_FALSE, B_TRUE,
+ zfs_keys_get_bootenv, ARRAY_SIZE(zfs_keys_get_bootenv));
+
+ /* IOCTLS that use the legacy function signature */
+
+ zfs_ioctl_register_legacy(ZFS_IOC_POOL_FREEZE, zfs_ioc_pool_freeze,
+ zfs_secpolicy_config, NO_NAME, B_FALSE, POOL_CHECK_READONLY);
+
+ zfs_ioctl_register_pool(ZFS_IOC_POOL_CREATE, zfs_ioc_pool_create,
+ zfs_secpolicy_config, B_TRUE, POOL_CHECK_NONE);
+ zfs_ioctl_register_pool_modify(ZFS_IOC_POOL_SCAN,
+ zfs_ioc_pool_scan);
+ zfs_ioctl_register_pool_modify(ZFS_IOC_POOL_UPGRADE,
+ zfs_ioc_pool_upgrade);
+ zfs_ioctl_register_pool_modify(ZFS_IOC_VDEV_ADD,
+ zfs_ioc_vdev_add);
+ zfs_ioctl_register_pool_modify(ZFS_IOC_VDEV_REMOVE,
+ zfs_ioc_vdev_remove);
+ zfs_ioctl_register_pool_modify(ZFS_IOC_VDEV_SET_STATE,
+ zfs_ioc_vdev_set_state);
+ zfs_ioctl_register_pool_modify(ZFS_IOC_VDEV_ATTACH,
+ zfs_ioc_vdev_attach);
+ zfs_ioctl_register_pool_modify(ZFS_IOC_VDEV_DETACH,
+ zfs_ioc_vdev_detach);
+ zfs_ioctl_register_pool_modify(ZFS_IOC_VDEV_SETPATH,
+ zfs_ioc_vdev_setpath);
+ zfs_ioctl_register_pool_modify(ZFS_IOC_VDEV_SETFRU,
+ zfs_ioc_vdev_setfru);
+ zfs_ioctl_register_pool_modify(ZFS_IOC_POOL_SET_PROPS,
+ zfs_ioc_pool_set_props);
+ zfs_ioctl_register_pool_modify(ZFS_IOC_VDEV_SPLIT,
+ zfs_ioc_vdev_split);
+ zfs_ioctl_register_pool_modify(ZFS_IOC_POOL_REGUID,
+ zfs_ioc_pool_reguid);
+
+ zfs_ioctl_register_pool_meta(ZFS_IOC_POOL_CONFIGS,
+ zfs_ioc_pool_configs, zfs_secpolicy_none);
+ zfs_ioctl_register_pool_meta(ZFS_IOC_POOL_TRYIMPORT,
+ zfs_ioc_pool_tryimport, zfs_secpolicy_config);
+ zfs_ioctl_register_pool_meta(ZFS_IOC_INJECT_FAULT,
+ zfs_ioc_inject_fault, zfs_secpolicy_inject);
+ zfs_ioctl_register_pool_meta(ZFS_IOC_CLEAR_FAULT,
+ zfs_ioc_clear_fault, zfs_secpolicy_inject);
+ zfs_ioctl_register_pool_meta(ZFS_IOC_INJECT_LIST_NEXT,
+ zfs_ioc_inject_list_next, zfs_secpolicy_inject);
+
+ /*
+ * Pool destroy and export don't log history as part of
+ * zfsdev_ioctl; rather, zfs_ioc_pool_export does the
+ * logging of those commands.
+ */
+ zfs_ioctl_register_pool(ZFS_IOC_POOL_DESTROY, zfs_ioc_pool_destroy,
+ zfs_secpolicy_config, B_FALSE, POOL_CHECK_NONE);
+ zfs_ioctl_register_pool(ZFS_IOC_POOL_EXPORT, zfs_ioc_pool_export,
+ zfs_secpolicy_config, B_FALSE, POOL_CHECK_NONE);
+
+ zfs_ioctl_register_pool(ZFS_IOC_POOL_STATS, zfs_ioc_pool_stats,
+ zfs_secpolicy_read, B_FALSE, POOL_CHECK_NONE);
+ zfs_ioctl_register_pool(ZFS_IOC_POOL_GET_PROPS, zfs_ioc_pool_get_props,
+ zfs_secpolicy_read, B_FALSE, POOL_CHECK_NONE);
+
+ zfs_ioctl_register_pool(ZFS_IOC_ERROR_LOG, zfs_ioc_error_log,
+ zfs_secpolicy_inject, B_FALSE, POOL_CHECK_NONE);
+ zfs_ioctl_register_pool(ZFS_IOC_DSOBJ_TO_DSNAME,
+ zfs_ioc_dsobj_to_dsname,
+ zfs_secpolicy_diff, B_FALSE, POOL_CHECK_NONE);
+ zfs_ioctl_register_pool(ZFS_IOC_POOL_GET_HISTORY,
+ zfs_ioc_pool_get_history,
+ zfs_secpolicy_config, B_FALSE, POOL_CHECK_SUSPENDED);
+
+ zfs_ioctl_register_pool(ZFS_IOC_POOL_IMPORT, zfs_ioc_pool_import,
+ zfs_secpolicy_config, B_TRUE, POOL_CHECK_NONE);
+
+ zfs_ioctl_register_pool(ZFS_IOC_CLEAR, zfs_ioc_clear,
+ zfs_secpolicy_config, B_TRUE, POOL_CHECK_READONLY);
+
+ zfs_ioctl_register_dataset_read(ZFS_IOC_SPACE_WRITTEN,
+ zfs_ioc_space_written);
+ zfs_ioctl_register_dataset_read(ZFS_IOC_OBJSET_RECVD_PROPS,
+ zfs_ioc_objset_recvd_props);
+ zfs_ioctl_register_dataset_read(ZFS_IOC_NEXT_OBJ,
+ zfs_ioc_next_obj);
+ zfs_ioctl_register_dataset_read(ZFS_IOC_GET_FSACL,
+ zfs_ioc_get_fsacl);
+ zfs_ioctl_register_dataset_read(ZFS_IOC_OBJSET_STATS,
+ zfs_ioc_objset_stats);
+ zfs_ioctl_register_dataset_read(ZFS_IOC_OBJSET_ZPLPROPS,
+ zfs_ioc_objset_zplprops);
+ zfs_ioctl_register_dataset_read(ZFS_IOC_DATASET_LIST_NEXT,
+ zfs_ioc_dataset_list_next);
+ zfs_ioctl_register_dataset_read(ZFS_IOC_SNAPSHOT_LIST_NEXT,
+ zfs_ioc_snapshot_list_next);
+ zfs_ioctl_register_dataset_read(ZFS_IOC_SEND_PROGRESS,
+ zfs_ioc_send_progress);
+
+ zfs_ioctl_register_dataset_read_secpolicy(ZFS_IOC_DIFF,
+ zfs_ioc_diff, zfs_secpolicy_diff);
+ zfs_ioctl_register_dataset_read_secpolicy(ZFS_IOC_OBJ_TO_STATS,
+ zfs_ioc_obj_to_stats, zfs_secpolicy_diff);
+ zfs_ioctl_register_dataset_read_secpolicy(ZFS_IOC_OBJ_TO_PATH,
+ zfs_ioc_obj_to_path, zfs_secpolicy_diff);
+ zfs_ioctl_register_dataset_read_secpolicy(ZFS_IOC_USERSPACE_ONE,
+ zfs_ioc_userspace_one, zfs_secpolicy_userspace_one);
+ zfs_ioctl_register_dataset_read_secpolicy(ZFS_IOC_USERSPACE_MANY,
+ zfs_ioc_userspace_many, zfs_secpolicy_userspace_many);
+ zfs_ioctl_register_dataset_read_secpolicy(ZFS_IOC_SEND,
+ zfs_ioc_send, zfs_secpolicy_send);
+
+ zfs_ioctl_register_dataset_modify(ZFS_IOC_SET_PROP, zfs_ioc_set_prop,
+ zfs_secpolicy_none);
+ zfs_ioctl_register_dataset_modify(ZFS_IOC_DESTROY, zfs_ioc_destroy,
+ zfs_secpolicy_destroy);
+ zfs_ioctl_register_dataset_modify(ZFS_IOC_RECV, zfs_ioc_recv,
+ zfs_secpolicy_recv);
+ zfs_ioctl_register_dataset_modify(ZFS_IOC_PROMOTE, zfs_ioc_promote,
+ zfs_secpolicy_promote);
+ zfs_ioctl_register_dataset_modify(ZFS_IOC_INHERIT_PROP,
+ zfs_ioc_inherit_prop, zfs_secpolicy_inherit_prop);
+ zfs_ioctl_register_dataset_modify(ZFS_IOC_SET_FSACL, zfs_ioc_set_fsacl,
+ zfs_secpolicy_set_fsacl);
+
+ /*
+ * Not using zfs_ioctl_register_dataset_modify as DATASET_NAME check
+ * won't allow a bookmark name.
+ */
+ zfs_ioctl_register_legacy(ZFS_IOC_RENAME, zfs_ioc_rename,
+ zfs_secpolicy_rename, ENTITY_NAME, B_TRUE,
+ POOL_CHECK_SUSPENDED | POOL_CHECK_READONLY);
+
+ zfs_ioctl_register_dataset_nolog(ZFS_IOC_SHARE, zfs_ioc_share,
+ zfs_secpolicy_share, POOL_CHECK_NONE);
+ zfs_ioctl_register_dataset_nolog(ZFS_IOC_SMB_ACL, zfs_ioc_smb_acl,
+ zfs_secpolicy_smb_acl, POOL_CHECK_NONE);
+ zfs_ioctl_register_dataset_nolog(ZFS_IOC_USERSPACE_UPGRADE,
+ zfs_ioc_userspace_upgrade, zfs_secpolicy_userspace_upgrade,
+ POOL_CHECK_SUSPENDED | POOL_CHECK_READONLY);
+ zfs_ioctl_register_dataset_nolog(ZFS_IOC_TMP_SNAPSHOT,
+ zfs_ioc_tmp_snapshot, zfs_secpolicy_tmp_snapshot,
+ POOL_CHECK_SUSPENDED | POOL_CHECK_READONLY);
+
+#ifdef __FreeBSD__
+ zfs_ioctl_register_dataset_nolog(ZFS_IOC_JAIL, zfs_ioc_jail,
+ zfs_secpolicy_config, POOL_CHECK_NONE);
+ zfs_ioctl_register_dataset_nolog(ZFS_IOC_UNJAIL, zfs_ioc_unjail,
+ zfs_secpolicy_config, POOL_CHECK_NONE);
+ zfs_ioctl_register("fbsd_nextboot", ZFS_IOC_NEXTBOOT,
+ zfs_ioc_nextboot, zfs_secpolicy_config, NO_NAME,
+ POOL_CHECK_NONE, B_FALSE, B_FALSE,
+ zfs_keys_nextboot, ARRAY_SIZE(zfs_keys_nextboot));
+#endif
+}
+
+/*
+ * Verify that for non-legacy ioctls the input nvlist
+ * pairs match against the expected input.
+ *
+ * Possible errors are:
+ * ZFS_ERR_IOC_ARG_UNAVAIL An unrecognized nvpair was encountered
+ * ZFS_ERR_IOC_ARG_REQUIRED A required nvpair is missing
+ * ZFS_ERR_IOC_ARG_BADTYPE Invalid type for nvpair
+ */
+static int
+zfs_check_input_nvpairs(nvlist_t *innvl, const zfs_ioc_vec_t *vec)
+{
+ const zfs_ioc_key_t *nvl_keys = vec->zvec_nvl_keys;
+ boolean_t required_keys_found = B_FALSE;
+
+ /*
+ * examine each input pair
+ */
+ for (nvpair_t *pair = nvlist_next_nvpair(innvl, NULL);
+ pair != NULL; pair = nvlist_next_nvpair(innvl, pair)) {
+ char *name = nvpair_name(pair);
+ data_type_t type = nvpair_type(pair);
+ boolean_t identified = B_FALSE;
+
+ /*
+ * check pair against the documented names and type
+ */
+ for (int k = 0; k < vec->zvec_nvl_key_count; k++) {
+ /* if not a wild card name, check for an exact match */
+ if ((nvl_keys[k].zkey_flags & ZK_WILDCARDLIST) == 0 &&
+ strcmp(nvl_keys[k].zkey_name, name) != 0)
+ continue;
+
+ identified = B_TRUE;
+
+ if (nvl_keys[k].zkey_type != DATA_TYPE_ANY &&
+ nvl_keys[k].zkey_type != type) {
+ return (SET_ERROR(ZFS_ERR_IOC_ARG_BADTYPE));
+ }
+
+ if (nvl_keys[k].zkey_flags & ZK_OPTIONAL)
+ continue;
+
+ required_keys_found = B_TRUE;
+ break;
+ }
+
+ /* allow an 'optional' key, everything else is invalid */
+ if (!identified &&
+ (strcmp(name, "optional") != 0 ||
+ type != DATA_TYPE_NVLIST)) {
+ return (SET_ERROR(ZFS_ERR_IOC_ARG_UNAVAIL));
+ }
+ }
+
+ /* verify that all required keys were found */
+ for (int k = 0; k < vec->zvec_nvl_key_count; k++) {
+ if (nvl_keys[k].zkey_flags & ZK_OPTIONAL)
+ continue;
+
+ if (nvl_keys[k].zkey_flags & ZK_WILDCARDLIST) {
+ /* at least one non-optional key is expected here */
+ if (!required_keys_found)
+ return (SET_ERROR(ZFS_ERR_IOC_ARG_REQUIRED));
+ continue;
+ }
+
+ if (!nvlist_exists(innvl, nvl_keys[k].zkey_name))
+ return (SET_ERROR(ZFS_ERR_IOC_ARG_REQUIRED));
+ }
+
+ return (0);
+}
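+
+/*
+ * Worked example (hypothetical innvls, checked against zfs_keys_hold
+ * above): { "holds" -> nvlist } passes; { "holds" -> string } fails
+ * with ZFS_ERR_IOC_ARG_BADTYPE; omitting "holds" fails with
+ * ZFS_ERR_IOC_ARG_REQUIRED; any unrecognized pair other than an
+ * "optional" nvlist fails with ZFS_ERR_IOC_ARG_UNAVAIL.
+ */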
+
+int
+pool_status_check(const char *name, zfs_ioc_namecheck_t type,
+ zfs_ioc_poolcheck_t check)
+{
+ spa_t *spa;
+ int error;
+
+ ASSERT(type == POOL_NAME || type == DATASET_NAME ||
+ type == ENTITY_NAME);
+
+ if (check & POOL_CHECK_NONE)
+ return (0);
+
+ error = spa_open(name, &spa, FTAG);
+ if (error == 0) {
+ if ((check & POOL_CHECK_SUSPENDED) && spa_suspended(spa))
+ error = SET_ERROR(EAGAIN);
+ else if ((check & POOL_CHECK_READONLY) && !spa_writeable(spa))
+ error = SET_ERROR(EROFS);
+ spa_close(spa, FTAG);
+ }
+ return (error);
+}
+
+/*
+ * Find a free minor number.
+ */
+minor_t
+zfsdev_minor_alloc(void)
+{
+ static minor_t last_minor;
+ minor_t m;
+
+ ASSERT(MUTEX_HELD(&spa_namespace_lock));
+
+ for (m = last_minor + 1; m != last_minor; m++) {
+ if (m > ZFSDEV_MAX_MINOR)
+ m = 1;
+ if (ddi_get_soft_state(zfsdev_state, m) == NULL) {
+ last_minor = m;
+ return (m);
+ }
+ }
+
+ return (0);
+}
+
+static int
+zfs_ctldev_init(struct cdev *devp)
+{
+ minor_t minor;
+ zfs_soft_state_t *zs;
+
+ ASSERT(MUTEX_HELD(&spa_namespace_lock));
+
+ minor = zfsdev_minor_alloc();
+ if (minor == 0)
+ return (SET_ERROR(ENXIO));
+
+ if (ddi_soft_state_zalloc(zfsdev_state, minor) != DDI_SUCCESS)
+ return (SET_ERROR(EAGAIN));
+
+ devfs_set_cdevpriv((void *)(uintptr_t)minor, zfsdev_close);
+
+ zs = ddi_get_soft_state(zfsdev_state, minor);
+ zs->zss_type = ZSST_CTLDEV;
+ zfs_onexit_init((zfs_onexit_t **)&zs->zss_data);
+
+ return (0);
+}
+
+static void
+zfs_ctldev_destroy(zfs_onexit_t *zo, minor_t minor)
+{
+ ASSERT(MUTEX_HELD(&spa_namespace_lock));
+
+ zfs_onexit_destroy(zo);
+ ddi_soft_state_free(zfsdev_state, minor);
+}
+
+void *
+zfsdev_get_soft_state(minor_t minor, enum zfs_soft_state_type which)
+{
+ zfs_soft_state_t *zp;
+
+ zp = ddi_get_soft_state(zfsdev_state, minor);
+ if (zp == NULL || zp->zss_type != which)
+ return (NULL);
+
+ return (zp->zss_data);
+}
+
+static int
+zfsdev_open(struct cdev *devp, int flag, int mode, struct thread *td)
+{
+ int error = 0;
+
+#ifdef illumos
+ if (getminor(*devp) != 0)
+ return (zvol_open(devp, flag, otyp, cr));
+#endif
+
+ /* This is the control device. Allocate a new minor if requested. */
+ if (flag & FEXCL) {
+ mutex_enter(&spa_namespace_lock);
+ error = zfs_ctldev_init(devp);
+ mutex_exit(&spa_namespace_lock);
+ }
+
+ return (error);
+}
+
+static void
+zfsdev_close(void *data)
+{
+ zfs_onexit_t *zo;
+ minor_t minor = (minor_t)(uintptr_t)data;
+
+ if (minor == 0)
+ return;
+
+ mutex_enter(&spa_namespace_lock);
+ zo = zfsdev_get_soft_state(minor, ZSST_CTLDEV);
+ if (zo == NULL) {
+ mutex_exit(&spa_namespace_lock);
+ return;
+ }
+ zfs_ctldev_destroy(zo, minor);
+ mutex_exit(&spa_namespace_lock);
+}
+
+static int
+zfsdev_ioctl(struct cdev *dev, u_long zcmd, caddr_t arg, int flag,
+ struct thread *td)
+{
+ zfs_cmd_t *zc;
+ uint_t vecnum;
+ int error, rc, len;
+#ifdef illumos
+ minor_t minor = getminor(dev);
+#else
+ zfs_iocparm_t *zc_iocparm;
+ int cflag, cmd, oldvecnum;
+ boolean_t newioc, compat;
+ void *compat_zc = NULL;
+ cred_t *cr = td->td_ucred;
+#endif
+ const zfs_ioc_vec_t *vec;
+ char *saved_poolname = NULL;
+ nvlist_t *innvl = NULL;
+
+ cflag = ZFS_CMD_COMPAT_NONE;
+ compat = B_FALSE;
+ newioc = B_TRUE; /* "new" style (zfs_iocparm_t) ioctl */
+
+ len = IOCPARM_LEN(zcmd);
+ vecnum = cmd = zcmd & 0xff;
+
+ /*
+ * Check if we are talking to supported older binaries
+ * and translate zfs_cmd if necessary
+ */
+ if (len != sizeof(zfs_iocparm_t)) {
+ newioc = B_FALSE;
+ compat = B_TRUE;
+
+ vecnum = cmd;
+
+ switch (len) {
+ case sizeof(zfs_cmd_zcmd_t):
+ cflag = ZFS_CMD_COMPAT_LZC;
+ break;
+ case sizeof(zfs_cmd_deadman_t):
+ cflag = ZFS_CMD_COMPAT_DEADMAN;
+ break;
+ case sizeof(zfs_cmd_v28_t):
+ cflag = ZFS_CMD_COMPAT_V28;
+ break;
+ case sizeof(zfs_cmd_v15_t):
+ if (cmd >= sizeof(zfs_ioctl_v15_to_v28) /
+ sizeof(zfs_ioctl_v15_to_v28[0]))
+ return (EINVAL);
+
+ cflag = ZFS_CMD_COMPAT_V15;
+ vecnum = zfs_ioctl_v15_to_v28[cmd];
+
+ /*
+ * Return without further handling
+ * if the command is blacklisted.
+ */
+ if (vecnum == ZFS_IOC_COMPAT_PASS)
+ return (0);
+ else if (vecnum == ZFS_IOC_COMPAT_FAIL)
+ return (ENOTSUP);
+ break;
+ default:
+ return (EINVAL);
+ }
+ }
+
+#ifdef illumos
+ vecnum = cmd - ZFS_IOC_FIRST;
+ ASSERT3U(getmajor(dev), ==, ddi_driver_major(zfs_dip));
+#endif
+
+ if (vecnum >= sizeof (zfs_ioc_vec) / sizeof (zfs_ioc_vec[0]))
+ return (SET_ERROR(ZFS_ERR_IOC_CMD_UNAVAIL));
+ vec = &zfs_ioc_vec[vecnum];
+
+ zc = kmem_zalloc(sizeof(zfs_cmd_t), KM_SLEEP);
+
+#ifdef illumos
+ error = ddi_copyin((void *)arg, zc, sizeof (zfs_cmd_t), flag);
+ if (error != 0) {
+ error = SET_ERROR(EFAULT);
+ goto out;
+ }
+#else /* !illumos */
+ bzero(zc, sizeof(zfs_cmd_t));
+
+ if (newioc) {
+ zc_iocparm = (void *)arg;
+
+ switch (zc_iocparm->zfs_ioctl_version) {
+ case ZFS_IOCVER_CURRENT:
+ if (zc_iocparm->zfs_cmd_size != sizeof(zfs_cmd_t)) {
+ error = SET_ERROR(EINVAL);
+ goto out;
+ }
+ break;
+ case ZFS_IOCVER_INLANES:
+ if (zc_iocparm->zfs_cmd_size != sizeof(zfs_cmd_inlanes_t)) {
+ error = SET_ERROR(EFAULT);
+ goto out;
+ }
+ compat = B_TRUE;
+ cflag = ZFS_CMD_COMPAT_INLANES;
+ break;
+ case ZFS_IOCVER_RESUME:
+ if (zc_iocparm->zfs_cmd_size != sizeof(zfs_cmd_resume_t)) {
+ error = SET_ERROR(EFAULT);
+ goto out;
+ }
+ compat = B_TRUE;
+ cflag = ZFS_CMD_COMPAT_RESUME;
+ break;
+ case ZFS_IOCVER_EDBP:
+ if (zc_iocparm->zfs_cmd_size != sizeof(zfs_cmd_edbp_t)) {
+ error = SET_ERROR(EFAULT);
+ goto out;
+ }
+ compat = B_TRUE;
+ cflag = ZFS_CMD_COMPAT_EDBP;
+ break;
+ case ZFS_IOCVER_ZCMD:
+ if (zc_iocparm->zfs_cmd_size > sizeof(zfs_cmd_t) ||
+ zc_iocparm->zfs_cmd_size < sizeof(zfs_cmd_zcmd_t)) {
+ error = SET_ERROR(EFAULT);
+ goto out;
+ }
+ compat = B_TRUE;
+ cflag = ZFS_CMD_COMPAT_ZCMD;
+ break;
+ default:
+ error = SET_ERROR(EINVAL);
+ goto out;
+ /* NOTREACHED */
+ }
+
+ if (compat) {
+ ASSERT(sizeof(zfs_cmd_t) >= zc_iocparm->zfs_cmd_size);
+ compat_zc = kmem_zalloc(sizeof(zfs_cmd_t), KM_SLEEP);
+ bzero(compat_zc, sizeof(zfs_cmd_t));
+
+ error = ddi_copyin((void *)(uintptr_t)zc_iocparm->zfs_cmd,
+ compat_zc, zc_iocparm->zfs_cmd_size, flag);
+ if (error != 0) {
+ error = SET_ERROR(EFAULT);
+ goto out;
+ }
+ } else {
+ error = ddi_copyin((void *)(uintptr_t)zc_iocparm->zfs_cmd,
+ zc, zc_iocparm->zfs_cmd_size, flag);
+ if (error != 0) {
+ error = SET_ERROR(EFAULT);
+ goto out;
+ }
+ }
+ }
+
+ if (compat) {
+ if (newioc) {
+ ASSERT(compat_zc != NULL);
+ zfs_cmd_compat_get(zc, compat_zc, cflag);
+ } else {
+ ASSERT(compat_zc == NULL);
+ zfs_cmd_compat_get(zc, arg, cflag);
+ }
+ oldvecnum = vecnum;
+ error = zfs_ioctl_compat_pre(zc, &vecnum, cflag);
+ if (error != 0)
+ goto out;
+ if (oldvecnum != vecnum)
+ vec = &zfs_ioc_vec[vecnum];
+ }
+#endif /* !illumos */
+
+ zc->zc_iflags = flag & FKIOCTL;
+ if (zc->zc_nvlist_src_size != 0) {
+ error = get_nvlist(zc->zc_nvlist_src, zc->zc_nvlist_src_size,
+ zc->zc_iflags, &innvl);
+ if (error != 0)
+ goto out;
+ }
+
+ /* rewrite innvl for backwards compatibility */
+ if (compat)
+ innvl = zfs_ioctl_compat_innvl(zc, innvl, vecnum, cflag);
+
+ /*
+ * Ensure that all pool/dataset names are valid before we pass down to
+ * the lower layers.
+ */
+ zc->zc_name[sizeof (zc->zc_name) - 1] = '\0';
+ switch (vec->zvec_namecheck) {
+ case POOL_NAME:
+ if (pool_namecheck(zc->zc_name, NULL, NULL) != 0)
+ error = SET_ERROR(EINVAL);
+ else
+ error = pool_status_check(zc->zc_name,
+ vec->zvec_namecheck, vec->zvec_pool_check);
+ break;
+
+ case DATASET_NAME:
+ if (dataset_namecheck(zc->zc_name, NULL, NULL) != 0)
+ error = SET_ERROR(EINVAL);
+ else
+ error = pool_status_check(zc->zc_name,
+ vec->zvec_namecheck, vec->zvec_pool_check);
+ break;
+
+ case ENTITY_NAME:
+ if (entity_namecheck(zc->zc_name, NULL, NULL) != 0) {
+ error = SET_ERROR(EINVAL);
+ } else {
+ error = pool_status_check(zc->zc_name,
+ vec->zvec_namecheck, vec->zvec_pool_check);
+ }
+ break;
+
+ case NO_NAME:
+ break;
+ }
+
+ /*
+ * Ensure that all input pairs are valid before we pass them down
+ * to the lower layers.
+ *
+ * The vectored functions can use fnvlist_lookup_{type} for any
+ * required pairs since zfs_check_input_nvpairs() confirmed that
+ * they exist and are of the correct type.
+ */
+ if (error == 0 && vec->zvec_func != NULL) {
+ error = zfs_check_input_nvpairs(innvl, vec);
+ if (error != 0)
+ goto out;
+ }
+
+ if (error == 0)
+ error = vec->zvec_secpolicy(zc, innvl, cr);
+
+ if (error != 0)
+ goto out;
+
+ /* legacy ioctls can modify zc_name */
+ len = strcspn(zc->zc_name, "/@#") + 1;
+ saved_poolname = kmem_alloc(len, KM_SLEEP);
+ (void) strlcpy(saved_poolname, zc->zc_name, len);
+
+ if (vec->zvec_func != NULL) {
+ nvlist_t *outnvl;
+ int puterror = 0;
+ spa_t *spa;
+ nvlist_t *lognv = NULL;
+
+ ASSERT(vec->zvec_legacy_func == NULL);
+
+ /*
+ * Add the innvl to the lognv before calling the func,
+ * in case the func changes the innvl.
+ */
+ if (vec->zvec_allow_log) {
+ lognv = fnvlist_alloc();
+ fnvlist_add_string(lognv, ZPOOL_HIST_IOCTL,
+ vec->zvec_name);
+ if (!nvlist_empty(innvl)) {
+ fnvlist_add_nvlist(lognv, ZPOOL_HIST_INPUT_NVL,
+ innvl);
+ }
+ }
+
+ outnvl = fnvlist_alloc();
+ error = vec->zvec_func(zc->zc_name, innvl, outnvl);
+
+ /*
+ * Some commands can partially execute, modify state, and still
+ * return an error. In these cases, attempt to record what
+ * was modified.
+ */
+ if ((error == 0 ||
+ (cmd == ZFS_IOC_CHANNEL_PROGRAM && error != EINVAL)) &&
+ vec->zvec_allow_log &&
+ spa_open(zc->zc_name, &spa, FTAG) == 0) {
+ if (!nvlist_empty(outnvl)) {
+ fnvlist_add_nvlist(lognv, ZPOOL_HIST_OUTPUT_NVL,
+ outnvl);
+ }
+ if (error != 0) {
+ fnvlist_add_int64(lognv, ZPOOL_HIST_ERRNO,
+ error);
+ }
+ (void) spa_history_log_nvl(spa, lognv);
+ spa_close(spa, FTAG);
+ }
+ fnvlist_free(lognv);
+
+ /* rewrite outnvl for backwards compatibility */
+ if (compat)
+ outnvl = zfs_ioctl_compat_outnvl(zc, outnvl, vecnum,
+ cflag);
+
+ if (!nvlist_empty(outnvl) || zc->zc_nvlist_dst_size != 0) {
+ int smusherror = 0;
+ if (vec->zvec_smush_outnvlist) {
+ smusherror = nvlist_smush(outnvl,
+ zc->zc_nvlist_dst_size);
+ }
+ if (smusherror == 0)
+ puterror = put_nvlist(zc, outnvl);
+ }
+
+ if (puterror != 0)
+ error = puterror;
+
+ nvlist_free(outnvl);
+ } else {
+ error = vec->zvec_legacy_func(zc);
+ }
+
+out:
+ nvlist_free(innvl);
+
+#if defined(__FreeBSD__) && defined(_KERNEL)
+ /*
+ * Wait for ZVOL changes to get applied.
+ * NB: taskqueue_drain_all() does less than taskq_wait(),
+ * but enough for what we want.
+ * And there is no equivalent illumos API.
+ */
+ if (error == 0) {
+ spa_t *spa;
+
+ if (spa_open(saved_poolname, &spa, FTAG) == 0) {
+ taskqueue_drain_all(
+ spa->spa_zvol_taskq->tq_queue);
+ spa_close(spa, FTAG);
+ }
+ }
+#endif
+
+#ifdef illumos
+ rc = ddi_copyout(zc, (void *)arg, sizeof (zfs_cmd_t), flag);
+ if (error == 0 && rc != 0)
+ error = SET_ERROR(EFAULT);
+#else
+ if (compat) {
+ zfs_ioctl_compat_post(zc, cmd, cflag);
+ if (newioc) {
+ ASSERT(compat_zc != NULL);
+ ASSERT(sizeof(zfs_cmd_t) >= zc_iocparm->zfs_cmd_size);
+
+ zfs_cmd_compat_put(zc, compat_zc, vecnum, cflag);
+ rc = ddi_copyout(compat_zc,
+ (void *)(uintptr_t)zc_iocparm->zfs_cmd,
+ zc_iocparm->zfs_cmd_size, flag);
+ if (error == 0 && rc != 0)
+ error = SET_ERROR(EFAULT);
+ kmem_free(compat_zc, sizeof (zfs_cmd_t));
+ } else {
+ zfs_cmd_compat_put(zc, arg, vecnum, cflag);
+ }
+ } else {
+ ASSERT(newioc);
+
+ rc = ddi_copyout(zc, (void *)(uintptr_t)zc_iocparm->zfs_cmd,
+ sizeof (zfs_cmd_t), flag);
+ if (error == 0 && rc != 0)
+ error = SET_ERROR(EFAULT);
+ }
+#endif
+ if (error == 0 && vec->zvec_allow_log) {
+ char *s = tsd_get(zfs_allow_log_key);
+ if (s != NULL)
+ strfree(s);
+ (void) tsd_set(zfs_allow_log_key, saved_poolname);
+ } else {
+ if (saved_poolname != NULL)
+ strfree(saved_poolname);
+ }
+
+ kmem_free(zc, sizeof (zfs_cmd_t));
+ return (error);
+}
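+
+/*
+ * A minimal userland sketch of the "new" style (zfs_iocparm_t) call path
+ * handled above, with error handling elided; the specific ioctl request
+ * used here is illustrative:
+ *
+ *	zfs_cmd_t zc = { 0 };
+ *	zfs_iocparm_t zp;
+ *
+ *	zp.zfs_ioctl_version = ZFS_IOCVER_CURRENT;
+ *	zp.zfs_cmd = (uint64_t)(uintptr_t)&zc;
+ *	zp.zfs_cmd_size = sizeof (zfs_cmd_t);
+ *	(void) ioctl(fd, ZFS_IOC_POOL_STATS, &zp);
+ *
+ * The kernel copies the zfs_cmd_t in from zp.zfs_cmd, dispatches on the
+ * low byte of the request, and copies the (possibly updated) zfs_cmd_t
+ * back out on completion.
+ */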
+
+#ifdef illumos
+static int
+zfs_attach(dev_info_t *dip, ddi_attach_cmd_t cmd)
+{
+ if (cmd != DDI_ATTACH)
+ return (DDI_FAILURE);
+
+ if (ddi_create_minor_node(dip, "zfs", S_IFCHR, 0,
+ DDI_PSEUDO, 0) == DDI_FAILURE)
+ return (DDI_FAILURE);
+
+ zfs_dip = dip;
+
+ ddi_report_dev(dip);
+
+ return (DDI_SUCCESS);
+}
+
+static int
+zfs_detach(dev_info_t *dip, ddi_detach_cmd_t cmd)
+{
+ if (spa_busy() || zfs_busy() || zvol_busy())
+ return (DDI_FAILURE);
+
+ if (cmd != DDI_DETACH)
+ return (DDI_FAILURE);
+
+ zfs_dip = NULL;
+
+ ddi_prop_remove_all(dip);
+ ddi_remove_minor_node(dip, NULL);
+
+ return (DDI_SUCCESS);
+}
+
+/*ARGSUSED*/
+static int
+zfs_info(dev_info_t *dip, ddi_info_cmd_t infocmd, void *arg, void **result)
+{
+ switch (infocmd) {
+ case DDI_INFO_DEVT2DEVINFO:
+ *result = zfs_dip;
+ return (DDI_SUCCESS);
+
+ case DDI_INFO_DEVT2INSTANCE:
+ *result = (void *)0;
+ return (DDI_SUCCESS);
+ }
+
+ return (DDI_FAILURE);
+}
+#endif /* illumos */
+
+/*
+ * OK, so this is a little weird.
+ *
+ * /dev/zfs is the control node, i.e. minor 0.
+ * /dev/zvol/[r]dsk/pool/dataset are the zvols, minor > 0.
+ *
+ * /dev/zfs has basically nothing to do except serve up ioctls,
+ * so most of the standard driver entry points are in zvol.c.
+ */
+#ifdef illumos
+static struct cb_ops zfs_cb_ops = {
+ zfsdev_open, /* open */
+ zfsdev_close, /* close */
+ zvol_strategy, /* strategy */
+ nodev, /* print */
+ zvol_dump, /* dump */
+ zvol_read, /* read */
+ zvol_write, /* write */
+ zfsdev_ioctl, /* ioctl */
+ nodev, /* devmap */
+ nodev, /* mmap */
+ nodev, /* segmap */
+ nochpoll, /* poll */
+ ddi_prop_op, /* prop_op */
+ NULL, /* streamtab */
+ D_NEW | D_MP | D_64BIT, /* Driver compatibility flag */
+ CB_REV, /* version */
+ nodev, /* async read */
+ nodev, /* async write */
+};
+
+static struct dev_ops zfs_dev_ops = {
+ DEVO_REV, /* version */
+ 0, /* refcnt */
+ zfs_info, /* info */
+ nulldev, /* identify */
+ nulldev, /* probe */
+ zfs_attach, /* attach */
+ zfs_detach, /* detach */
+ nodev, /* reset */
+ &zfs_cb_ops, /* driver operations */
+ NULL, /* no bus operations */
+ NULL, /* power */
+ ddi_quiesce_not_needed, /* quiesce */
+};
+
+static struct modldrv zfs_modldrv = {
+ &mod_driverops,
+ "ZFS storage pool",
+ &zfs_dev_ops
+};
+
+static struct modlinkage modlinkage = {
+ MODREV_1,
+ (void *)&zfs_modlfs,
+ (void *)&zfs_modldrv,
+ NULL
+};
+#endif /* illumos */
+
+static struct cdevsw zfs_cdevsw = {
+ .d_version = D_VERSION,
+ .d_open = zfsdev_open,
+ .d_ioctl = zfsdev_ioctl,
+ .d_name = ZFS_DEV_NAME
+};
+
+static void
+zfs_allow_log_destroy(void *arg)
+{
+ char *poolname = arg;
+ strfree(poolname);
+}
+
+static void
+zfsdev_init(void)
+{
+ zfsdev = make_dev(&zfs_cdevsw, 0x0, UID_ROOT, GID_OPERATOR, 0666,
+ ZFS_DEV_NAME);
+}
+
+static void
+zfsdev_fini(void)
+{
+ if (zfsdev != NULL)
+ destroy_dev(zfsdev);
+}
+
+static struct root_hold_token *zfs_root_token;
+
+#ifdef illumos
+int
+_init(void)
+{
+ int error;
+
+ spa_init(FREAD | FWRITE);
+ zfs_init();
+ zvol_init();
+ zfs_ioctl_init();
+
+ if ((error = mod_install(&modlinkage)) != 0) {
+ zvol_fini();
+ zfs_fini();
+ spa_fini();
+ return (error);
+ }
+
+ tsd_create(&zfs_fsyncer_key, NULL);
+ tsd_create(&rrw_tsd_key, rrw_tsd_destroy);
+ tsd_create(&zfs_allow_log_key, zfs_allow_log_destroy);
+
+ error = ldi_ident_from_mod(&modlinkage, &zfs_li);
+ ASSERT(error == 0);
+ mutex_init(&zfs_share_lock, NULL, MUTEX_DEFAULT, NULL);
+
+ return (0);
+}
+
+int
+_fini(void)
+{
+ int error;
+
+ if (spa_busy() || zfs_busy() || zvol_busy() || zio_injection_enabled)
+ return (SET_ERROR(EBUSY));
+
+ if ((error = mod_remove(&modlinkage)) != 0)
+ return (error);
+
+ zvol_fini();
+ zfs_fini();
+ spa_fini();
+ if (zfs_nfsshare_inited)
+ (void) ddi_modclose(nfs_mod);
+ if (zfs_smbshare_inited)
+ (void) ddi_modclose(smbsrv_mod);
+ if (zfs_nfsshare_inited || zfs_smbshare_inited)
+ (void) ddi_modclose(sharefs_mod);
+
+ tsd_destroy(&zfs_fsyncer_key);
+ ldi_ident_release(zfs_li);
+ zfs_li = NULL;
+ mutex_destroy(&zfs_share_lock);
+
+ return (error);
+}
+
+int
+_info(struct modinfo *modinfop)
+{
+ return (mod_info(&modlinkage, modinfop));
+}
+#endif /* illumos */
+
+static int zfs__init(void);
+static int zfs__fini(void);
+static void zfs_shutdown(void *, int);
+
+static eventhandler_tag zfs_shutdown_event_tag;
+
+#ifdef __FreeBSD__
+#define ZFS_MIN_KSTACK_PAGES 4
+#endif
+
+int
+zfs__init(void)
+{
+
+#ifdef __FreeBSD__
+#if KSTACK_PAGES < ZFS_MIN_KSTACK_PAGES
+ printf("ZFS NOTICE: KSTACK_PAGES is %d which could result in stack "
+ "overflow panic!\nPlease consider adding "
+ "'options KSTACK_PAGES=%d' to your kernel config\n", KSTACK_PAGES,
+ ZFS_MIN_KSTACK_PAGES);
+#endif
+#endif
+ zfs_root_token = root_mount_hold("ZFS");
+
+ mutex_init(&zfs_share_lock, NULL, MUTEX_DEFAULT, NULL);
+
+ spa_init(FREAD | FWRITE);
+ zfs_init();
+ zvol_init();
+ zfs_ioctl_init();
+
+ tsd_create(&zfs_fsyncer_key, NULL);
+ tsd_create(&rrw_tsd_key, rrw_tsd_destroy);
+ tsd_create(&zfs_allow_log_key, zfs_allow_log_destroy);
+ tsd_create(&zfs_geom_probe_vdev_key, NULL);
+
+ printf("ZFS storage pool version: features support (" SPA_VERSION_STRING ")\n");
+ root_mount_rel(zfs_root_token);
+
+ zfsdev_init();
+
+ return (0);
+}
+
+int
+zfs__fini(void)
+{
+ if (spa_busy() || zfs_busy() || zvol_busy() ||
+ zio_injection_enabled) {
+ return (EBUSY);
+ }
+
+ zfsdev_fini();
+ zvol_fini();
+ zfs_fini();
+ spa_fini();
+
+ tsd_destroy(&zfs_fsyncer_key);
+ tsd_destroy(&rrw_tsd_key);
+ tsd_destroy(&zfs_allow_log_key);
+
+ mutex_destroy(&zfs_share_lock);
+
+ return (0);
+}
+
+static void
+zfs_shutdown(void *arg __unused, int howto __unused)
+{
+
+ /*
+ * ZFS fini routines cannot work properly in a panicked system.
+ */
+ if (!KERNEL_PANICKED())
+ (void)zfs__fini();
+}
+
+
+static int
+zfs_modevent(module_t mod, int type, void *unused __unused)
+{
+ int err;
+
+ switch (type) {
+ case MOD_LOAD:
+ err = zfs__init();
+ if (err == 0)
+ zfs_shutdown_event_tag = EVENTHANDLER_REGISTER(
+ shutdown_post_sync, zfs_shutdown, NULL,
+ SHUTDOWN_PRI_FIRST);
+ return (err);
+ case MOD_UNLOAD:
+ err = zfs__fini();
+ if (err == 0 && zfs_shutdown_event_tag != NULL)
+ EVENTHANDLER_DEREGISTER(shutdown_post_sync,
+ zfs_shutdown_event_tag);
+ return (err);
+ case MOD_SHUTDOWN:
+ return (0);
+ default:
+ break;
+ }
+ return (EOPNOTSUPP);
+}
+
+static moduledata_t zfs_mod = {
+ "zfsctrl",
+ zfs_modevent,
+ 0
+};
+DECLARE_MODULE(zfsctrl, zfs_mod, SI_SUB_VFS, SI_ORDER_ANY);
+MODULE_VERSION(zfsctrl, 1);
+MODULE_DEPEND(zfsctrl, opensolaris, 1, 1, 1);
+MODULE_DEPEND(zfsctrl, xdr, 1, 1, 1);
+MODULE_DEPEND(zfsctrl, acl_nfs4, 1, 1, 1);
+MODULE_DEPEND(zfsctrl, zlib, 1, 1, 1);
diff --git a/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/zfs_log.c b/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/zfs_log.c
new file mode 100644
index 000000000000..c00c60a25ebb
--- /dev/null
+++ b/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/zfs_log.c
@@ -0,0 +1,688 @@
+/*
+ * CDDL HEADER START
+ *
+ * The contents of this file are subject to the terms of the
+ * Common Development and Distribution License (the "License").
+ * You may not use this file except in compliance with the License.
+ *
+ * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
+ * or http://www.opensolaris.org/os/licensing.
+ * See the License for the specific language governing permissions
+ * and limitations under the License.
+ *
+ * When distributing Covered Code, include this CDDL HEADER in each
+ * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
+ * If applicable, add the following below this CDDL HEADER, with the
+ * fields enclosed by brackets "[]" replaced with your own identifying
+ * information: Portions Copyright [yyyy] [name of copyright owner]
+ *
+ * CDDL HEADER END
+ */
+/*
+ * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved.
+ * Copyright (c) 2015, 2018 by Delphix. All rights reserved.
+ * Copyright (c) 2014 Integros [integros.com]
+ */
+
+#include <sys/types.h>
+#include <sys/param.h>
+#include <sys/systm.h>
+#include <sys/sysmacros.h>
+#include <sys/cmn_err.h>
+#include <sys/kmem.h>
+#include <sys/file.h>
+#include <sys/vfs.h>
+#include <sys/zfs_znode.h>
+#include <sys/zfs_dir.h>
+#include <sys/zil.h>
+#include <sys/zil_impl.h>
+#include <sys/byteorder.h>
+#include <sys/policy.h>
+#include <sys/stat.h>
+#include <sys/acl.h>
+#include <sys/dmu.h>
+#include <sys/spa.h>
+#include <sys/zfs_fuid.h>
+#include <sys/dsl_dataset.h>
+
+/*
+ * These zfs_log_* functions must be called within a dmu tx, in one
+ * of 2 contexts depending on zilog->z_replay:
+ *
+ * Non replay mode
+ * ---------------
+ * We need to record the transaction so that if it is committed to
+ * the Intent Log then it can be replayed. An intent log transaction
+ * structure (itx_t) is allocated and all the information necessary to
+ * possibly replay the transaction is saved in it. The itx is then assigned
+ * a sequence number and inserted in the in-memory list anchored in the zilog.
+ *
+ * Replay mode
+ * -----------
+ * We need to mark the intent log record as replayed in the log header.
+ * This is done in the same transaction as the replay so that they
+ * commit atomically.
+ */
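+
+/*
+ * A condensed sketch of the calling convention, assuming an assigned
+ * dmu tx (error handling and the actual object update are elided):
+ *
+ *	tx = dmu_tx_create(os);
+ *	... declare the planned modifications ...
+ *	error = dmu_tx_assign(tx, TXG_WAIT);
+ *	... apply the modifications ...
+ *	zfs_log_write(zilog, tx, TX_WRITE, zp, off, resid, ioflag);
+ *	dmu_tx_commit(tx);
+ */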
+
+int
+zfs_log_create_txtype(zil_create_t type, vsecattr_t *vsecp, vattr_t *vap)
+{
+ int isxvattr = (vap->va_mask & AT_XVATTR);
+ switch (type) {
+ case Z_FILE:
+ if (vsecp == NULL && !isxvattr)
+ return (TX_CREATE);
+ if (vsecp && isxvattr)
+#ifdef TODO
+ return (TX_CREATE_ACL_ATTR);
+#else
+ panic("%s:%u: unsupported condition", __func__, __LINE__);
+#endif
+ if (vsecp)
+ return (TX_CREATE_ACL);
+ else
+ return (TX_CREATE_ATTR);
+ /*NOTREACHED*/
+ case Z_DIR:
+ if (vsecp == NULL && !isxvattr)
+ return (TX_MKDIR);
+ if (vsecp && isxvattr)
+#ifdef TODO
+ return (TX_MKDIR_ACL_ATTR);
+#else
+ panic("%s:%u: unsupported condition", __func__, __LINE__);
+#endif
+ if (vsecp)
+ return (TX_MKDIR_ACL);
+ else
+ return (TX_MKDIR_ATTR);
+ case Z_XATTRDIR:
+ return (TX_MKXATTR);
+ }
+ ASSERT(0);
+ return (TX_MAX_TYPE);
+}
+
+/*
+ * Build up the log data necessary for logging an xvattr_t.
+ * The lr_attr_t is initialized first; following it are the mapsize
+ * and the attribute bitmap copied from the xvattr_t. Following the
+ * bitmap, two 64-bit words are reserved for the create time, which
+ * may be set. Following the create time is a single 64-bit integer
+ * holding the bits to set on replay for the xvattr.
+ */
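+/*
+ * The resulting record layout, as built below:
+ *
+ *	lr_attr_t   header: lr_attr_masksize and the start of the bitmap
+ *	bitmap      xva_mapsize uint32_t words copied from the xvattr
+ *	attrs       one uint64_t of XAT0_* bits to set on replay
+ *	crtime      two uint64_t words for the create time, if set
+ *	scanstamp   AV_SCANSTAMP_SZ bytes, if set
+ */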
+static void
+zfs_log_xvattr(lr_attr_t *lrattr, xvattr_t *xvap)
+{
+ uint32_t *bitmap;
+ uint64_t *attrs;
+ uint64_t *crtime;
+ xoptattr_t *xoap;
+ void *scanstamp;
+ int i;
+
+ xoap = xva_getxoptattr(xvap);
+ ASSERT(xoap);
+
+ lrattr->lr_attr_masksize = xvap->xva_mapsize;
+ bitmap = &lrattr->lr_attr_bitmap;
+ for (i = 0; i != xvap->xva_mapsize; i++, bitmap++) {
+ *bitmap = xvap->xva_reqattrmap[i];
+ }
+
+ /* Now pack the attributes up in a single uint64_t */
+ attrs = (uint64_t *)bitmap;
+ crtime = attrs + 1;
+ scanstamp = (caddr_t)(crtime + 2);
+ *attrs = 0;
+ if (XVA_ISSET_REQ(xvap, XAT_READONLY))
+ *attrs |= (xoap->xoa_readonly == 0) ? 0 :
+ XAT0_READONLY;
+ if (XVA_ISSET_REQ(xvap, XAT_HIDDEN))
+ *attrs |= (xoap->xoa_hidden == 0) ? 0 :
+ XAT0_HIDDEN;
+ if (XVA_ISSET_REQ(xvap, XAT_SYSTEM))
+ *attrs |= (xoap->xoa_system == 0) ? 0 :
+ XAT0_SYSTEM;
+ if (XVA_ISSET_REQ(xvap, XAT_ARCHIVE))
+ *attrs |= (xoap->xoa_archive == 0) ? 0 :
+ XAT0_ARCHIVE;
+ if (XVA_ISSET_REQ(xvap, XAT_IMMUTABLE))
+ *attrs |= (xoap->xoa_immutable == 0) ? 0 :
+ XAT0_IMMUTABLE;
+ if (XVA_ISSET_REQ(xvap, XAT_NOUNLINK))
+ *attrs |= (xoap->xoa_nounlink == 0) ? 0 :
+ XAT0_NOUNLINK;
+ if (XVA_ISSET_REQ(xvap, XAT_APPENDONLY))
+ *attrs |= (xoap->xoa_appendonly == 0) ? 0 :
+ XAT0_APPENDONLY;
+ if (XVA_ISSET_REQ(xvap, XAT_OPAQUE))
+ *attrs |= (xoap->xoa_opaque == 0) ? 0 :
+ XAT0_OPAQUE;
+ if (XVA_ISSET_REQ(xvap, XAT_NODUMP))
+ *attrs |= (xoap->xoa_nodump == 0) ? 0 :
+ XAT0_NODUMP;
+ if (XVA_ISSET_REQ(xvap, XAT_AV_QUARANTINED))
+ *attrs |= (xoap->xoa_av_quarantined == 0) ? 0 :
+ XAT0_AV_QUARANTINED;
+ if (XVA_ISSET_REQ(xvap, XAT_AV_MODIFIED))
+ *attrs |= (xoap->xoa_av_modified == 0) ? 0 :
+ XAT0_AV_MODIFIED;
+ if (XVA_ISSET_REQ(xvap, XAT_CREATETIME))
+ ZFS_TIME_ENCODE(&xoap->xoa_createtime, crtime);
+ if (XVA_ISSET_REQ(xvap, XAT_AV_SCANSTAMP))
+ bcopy(xoap->xoa_av_scanstamp, scanstamp, AV_SCANSTAMP_SZ);
+ if (XVA_ISSET_REQ(xvap, XAT_REPARSE))
+ *attrs |= (xoap->xoa_reparse == 0) ? 0 :
+ XAT0_REPARSE;
+ if (XVA_ISSET_REQ(xvap, XAT_OFFLINE))
+ *attrs |= (xoap->xoa_offline == 0) ? 0 :
+ XAT0_OFFLINE;
+ if (XVA_ISSET_REQ(xvap, XAT_SPARSE))
+ *attrs |= (xoap->xoa_sparse == 0) ? 0 :
+ XAT0_SPARSE;
+}
+
+static void *
+zfs_log_fuid_ids(zfs_fuid_info_t *fuidp, void *start)
+{
+ zfs_fuid_t *zfuid;
+ uint64_t *fuidloc = start;
+
+ /* First copy in the ACE FUIDs */
+ for (zfuid = list_head(&fuidp->z_fuids); zfuid;
+ zfuid = list_next(&fuidp->z_fuids, zfuid)) {
+ *fuidloc++ = zfuid->z_logfuid;
+ }
+ return (fuidloc);
+}
+
+
+static void *
+zfs_log_fuid_domains(zfs_fuid_info_t *fuidp, void *start)
+{
+ zfs_fuid_domain_t *zdomain;
+
+ /* now copy in the domain info, if any */
+ if (fuidp->z_domain_str_sz != 0) {
+ for (zdomain = list_head(&fuidp->z_domains); zdomain;
+ zdomain = list_next(&fuidp->z_domains, zdomain)) {
+ bcopy((void *)zdomain->z_domain, start,
+ strlen(zdomain->z_domain) + 1);
+ start = (caddr_t)start +
+ strlen(zdomain->z_domain) + 1;
+ }
+ }
+ return (start);
+}
+
+/*
+ * Handles TX_CREATE, TX_CREATE_ATTR, TX_MKDIR, TX_MKDIR_ATTR and
+ * TX_MKXATTR transactions.
+ *
+ * TX_CREATE and TX_MKDIR are standard creates, but they may have FUID
+ * domain information appended prior to the name.  In this case the
+ * uid/gid in the log record will be a log-centric FUID.
+ *
+ * TX_CREATE_ACL_ATTR and TX_MKDIR_ACL_ATTR handle special creates that
+ * may contain attributes, ACL and optional FUID information.
+ *
+ * TX_CREATE_ACL and TX_MKDIR_ACL handle special creates that specify
+ * an ACL and normal users/groups in the ACEs.
+ *
+ * There may also be optional xvattr attribute information, similar
+ * to zfs_log_setattr.
+ *
+ * Also, "domain" strings may be appended after the file name.
+ */
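+/*
+ * Sketch of the record as assembled below; bracketed pieces are present
+ * only when the corresponding data was supplied:
+ *
+ *	lr_create_t / lr_acl_create_t
+ *	[ xvattr data, if AT_XVATTR is set ]
+ *	[ ACL entries, padded to ZIL_ACE_LENGTH, if vsecp ]
+ *	[ ACE FUIDs, then FUID domain strings, if fuidp ]
+ *	file name (NUL-terminated)
+ */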
+void
+zfs_log_create(zilog_t *zilog, dmu_tx_t *tx, uint64_t txtype,
+ znode_t *dzp, znode_t *zp, char *name, vsecattr_t *vsecp,
+ zfs_fuid_info_t *fuidp, vattr_t *vap)
+{
+ itx_t *itx;
+ lr_create_t *lr;
+ lr_acl_create_t *lracl;
+ size_t aclsize = (vsecp != NULL) ? vsecp->vsa_aclentsz : 0;
+ size_t xvatsize = 0;
+ size_t txsize;
+ xvattr_t *xvap = (xvattr_t *)vap;
+ void *end;
+ size_t lrsize;
+ size_t namesize = strlen(name) + 1;
+ size_t fuidsz = 0;
+
+ if (zil_replaying(zilog, tx))
+ return;
+
+ /*
+ * If we have FUIDs present then add in space for the
+ * domain strings and ACE FUIDs, if any.
+ */
+ if (fuidp) {
+ fuidsz += fuidp->z_domain_str_sz;
+ fuidsz += fuidp->z_fuid_cnt * sizeof (uint64_t);
+ }
+
+ if (vap->va_mask & AT_XVATTR)
+ xvatsize = ZIL_XVAT_SIZE(xvap->xva_mapsize);
+
+ if ((int)txtype == TX_CREATE_ATTR || (int)txtype == TX_MKDIR_ATTR ||
+ (int)txtype == TX_CREATE || (int)txtype == TX_MKDIR ||
+ (int)txtype == TX_MKXATTR) {
+ txsize = sizeof (*lr) + namesize + fuidsz + xvatsize;
+ lrsize = sizeof (*lr);
+ } else {
+ txsize =
+ sizeof (lr_acl_create_t) + namesize + fuidsz +
+ ZIL_ACE_LENGTH(aclsize) + xvatsize;
+ lrsize = sizeof (lr_acl_create_t);
+ }
+
+ itx = zil_itx_create(txtype, txsize);
+
+ lr = (lr_create_t *)&itx->itx_lr;
+ lr->lr_doid = dzp->z_id;
+ lr->lr_foid = zp->z_id;
+ /* Store dnode slot count in 8 bits above object id. */
+ LR_FOID_SET_SLOTS(lr->lr_foid, zp->z_dnodesize >> DNODE_SHIFT);
+ lr->lr_mode = zp->z_mode;
+ if (!IS_EPHEMERAL(zp->z_uid)) {
+ lr->lr_uid = (uint64_t)zp->z_uid;
+ } else {
+ lr->lr_uid = fuidp->z_fuid_owner;
+ }
+ if (!IS_EPHEMERAL(zp->z_gid)) {
+ lr->lr_gid = (uint64_t)zp->z_gid;
+ } else {
+ lr->lr_gid = fuidp->z_fuid_group;
+ }
+ (void) sa_lookup(zp->z_sa_hdl, SA_ZPL_GEN(zp->z_zfsvfs), &lr->lr_gen,
+ sizeof (uint64_t));
+ (void) sa_lookup(zp->z_sa_hdl, SA_ZPL_CRTIME(zp->z_zfsvfs),
+ lr->lr_crtime, sizeof (uint64_t) * 2);
+
+ if (sa_lookup(zp->z_sa_hdl, SA_ZPL_RDEV(zp->z_zfsvfs), &lr->lr_rdev,
+ sizeof (lr->lr_rdev)) != 0)
+ lr->lr_rdev = 0;
+
+ /*
+ * Fill in xvattr info if any
+ */
+ if (vap->va_mask & AT_XVATTR) {
+ zfs_log_xvattr((lr_attr_t *)((caddr_t)lr + lrsize), xvap);
+ end = (caddr_t)lr + lrsize + xvatsize;
+ } else {
+ end = (caddr_t)lr + lrsize;
+ }
+
+ /* Now fill in any ACL info */
+
+ if (vsecp) {
+ lracl = (lr_acl_create_t *)&itx->itx_lr;
+ lracl->lr_aclcnt = vsecp->vsa_aclcnt;
+ lracl->lr_acl_bytes = aclsize;
+ lracl->lr_domcnt = fuidp ? fuidp->z_domain_cnt : 0;
+ lracl->lr_fuidcnt = fuidp ? fuidp->z_fuid_cnt : 0;
+ if (vsecp->vsa_aclflags & VSA_ACE_ACLFLAGS)
+ lracl->lr_acl_flags = (uint64_t)vsecp->vsa_aclflags;
+ else
+ lracl->lr_acl_flags = 0;
+
+ bcopy(vsecp->vsa_aclentp, end, aclsize);
+ end = (caddr_t)end + ZIL_ACE_LENGTH(aclsize);
+ }
+
+ /* drop in FUID info */
+ if (fuidp) {
+ end = zfs_log_fuid_ids(fuidp, end);
+ end = zfs_log_fuid_domains(fuidp, end);
+ }
+ /*
+ * Now place file name in log record
+ */
+ bcopy(name, end, namesize);
+
+ zil_itx_assign(zilog, itx, tx);
+}
+
+/*
+ * Handles both TX_REMOVE and TX_RMDIR transactions.
+ */
+void
+zfs_log_remove(zilog_t *zilog, dmu_tx_t *tx, uint64_t txtype,
+ znode_t *dzp, char *name, uint64_t foid)
+{
+ itx_t *itx;
+ lr_remove_t *lr;
+ size_t namesize = strlen(name) + 1;
+
+ if (zil_replaying(zilog, tx))
+ return;
+
+ itx = zil_itx_create(txtype, sizeof (*lr) + namesize);
+ lr = (lr_remove_t *)&itx->itx_lr;
+ lr->lr_doid = dzp->z_id;
+ bcopy(name, (char *)(lr + 1), namesize);
+
+ itx->itx_oid = foid;
+
+ zil_itx_assign(zilog, itx, tx);
+}
+
+/*
+ * Handles TX_LINK transactions.
+ */
+void
+zfs_log_link(zilog_t *zilog, dmu_tx_t *tx, uint64_t txtype,
+ znode_t *dzp, znode_t *zp, char *name)
+{
+ itx_t *itx;
+ lr_link_t *lr;
+ size_t namesize = strlen(name) + 1;
+
+ if (zil_replaying(zilog, tx))
+ return;
+
+ itx = zil_itx_create(txtype, sizeof (*lr) + namesize);
+ lr = (lr_link_t *)&itx->itx_lr;
+ lr->lr_doid = dzp->z_id;
+ lr->lr_link_obj = zp->z_id;
+ bcopy(name, (char *)(lr + 1), namesize);
+
+ zil_itx_assign(zilog, itx, tx);
+}
+
+/*
+ * Handles TX_SYMLINK transactions.
+ */
+void
+zfs_log_symlink(zilog_t *zilog, dmu_tx_t *tx, uint64_t txtype,
+ znode_t *dzp, znode_t *zp, char *name, char *link)
+{
+ itx_t *itx;
+ lr_create_t *lr;
+ size_t namesize = strlen(name) + 1;
+ size_t linksize = strlen(link) + 1;
+
+ if (zil_replaying(zilog, tx))
+ return;
+
+ itx = zil_itx_create(txtype, sizeof (*lr) + namesize + linksize);
+ lr = (lr_create_t *)&itx->itx_lr;
+ lr->lr_doid = dzp->z_id;
+ lr->lr_foid = zp->z_id;
+ lr->lr_uid = zp->z_uid;
+ lr->lr_gid = zp->z_gid;
+ lr->lr_mode = zp->z_mode;
+ (void) sa_lookup(zp->z_sa_hdl, SA_ZPL_GEN(zp->z_zfsvfs), &lr->lr_gen,
+ sizeof (uint64_t));
+ (void) sa_lookup(zp->z_sa_hdl, SA_ZPL_CRTIME(zp->z_zfsvfs),
+ lr->lr_crtime, sizeof (uint64_t) * 2);
+ bcopy(name, (char *)(lr + 1), namesize);
+ bcopy(link, (char *)(lr + 1) + namesize, linksize);
+
+ zil_itx_assign(zilog, itx, tx);
+}
+
+/*
+ * Handles TX_RENAME transactions.
+ */
+void
+zfs_log_rename(zilog_t *zilog, dmu_tx_t *tx, uint64_t txtype,
+ znode_t *sdzp, char *sname, znode_t *tdzp, char *dname, znode_t *szp)
+{
+ itx_t *itx;
+ lr_rename_t *lr;
+ size_t snamesize = strlen(sname) + 1;
+ size_t dnamesize = strlen(dname) + 1;
+
+ if (zil_replaying(zilog, tx))
+ return;
+
+ itx = zil_itx_create(txtype, sizeof (*lr) + snamesize + dnamesize);
+ lr = (lr_rename_t *)&itx->itx_lr;
+ lr->lr_sdoid = sdzp->z_id;
+ lr->lr_tdoid = tdzp->z_id;
+ bcopy(sname, (char *)(lr + 1), snamesize);
+ bcopy(dname, (char *)(lr + 1) + snamesize, dnamesize);
+ itx->itx_oid = szp->z_id;
+
+ zil_itx_assign(zilog, itx, tx);
+}
+
+/*
+ * Handles TX_WRITE transactions.
+ */
+ssize_t zfs_immediate_write_sz = 32768;
+#ifdef _KERNEL
+SYSCTL_DECL(_vfs_zfs);
+SYSCTL_LONG(_vfs_zfs, OID_AUTO, immediate_write_sz, CTLFLAG_RWTUN,
+ &zfs_immediate_write_sz, 0, "Minimal size for indirect log write");
+#endif
+
+void
+zfs_log_write(zilog_t *zilog, dmu_tx_t *tx, int txtype,
+ znode_t *zp, offset_t off, ssize_t resid, int ioflag)
+{
+ uint32_t blocksize = zp->z_blksz;
+ itx_wr_state_t write_state;
+ uintptr_t fsync_cnt;
+
+ if (zil_replaying(zilog, tx) || zp->z_unlinked)
+ return;
+
+ if (zilog->zl_logbias == ZFS_LOGBIAS_THROUGHPUT)
+ write_state = WR_INDIRECT;
+ else if (!spa_has_slogs(zilog->zl_spa) &&
+ resid >= zfs_immediate_write_sz)
+ write_state = WR_INDIRECT;
+ else if (ioflag & (FSYNC | FDSYNC))
+ write_state = WR_COPIED;
+ else
+ write_state = WR_NEED_COPY;
+
+ if ((fsync_cnt = (uintptr_t)tsd_get(zfs_fsyncer_key)) != 0) {
+ (void) tsd_set(zfs_fsyncer_key, (void *)(fsync_cnt - 1));
+ }
+
+ while (resid) {
+ itx_t *itx;
+ lr_write_t *lr;
+ itx_wr_state_t wr_state = write_state;
+ ssize_t len = resid;
+
+ /*
+ * A WR_COPIED record must fit entirely in one log block.
+ * Large writes can use WR_NEED_COPY, which the ZIL will
+ * split into multiple records across several log blocks
+ * if necessary.
+ */
+ if (wr_state == WR_COPIED &&
+ resid > zil_max_copied_data(zilog))
+ wr_state = WR_NEED_COPY;
+ else if (wr_state == WR_INDIRECT)
+ len = MIN(blocksize - P2PHASE(off, blocksize), resid);
+
+ itx = zil_itx_create(txtype, sizeof (*lr) +
+ (wr_state == WR_COPIED ? len : 0));
+ lr = (lr_write_t *)&itx->itx_lr;
+ if (wr_state == WR_COPIED && dmu_read(zp->z_zfsvfs->z_os,
+ zp->z_id, off, len, lr + 1, DMU_READ_NO_PREFETCH) != 0) {
+ zil_itx_destroy(itx);
+ itx = zil_itx_create(txtype, sizeof (*lr));
+ lr = (lr_write_t *)&itx->itx_lr;
+ wr_state = WR_NEED_COPY;
+ }
+
+ itx->itx_wr_state = wr_state;
+ lr->lr_foid = zp->z_id;
+ lr->lr_offset = off;
+ lr->lr_length = len;
+ lr->lr_blkoff = 0;
+ BP_ZERO(&lr->lr_blkptr);
+
+ itx->itx_private = zp->z_zfsvfs;
+
+ if (!(ioflag & (FSYNC | FDSYNC)) && (zp->z_sync_cnt == 0) &&
+ (fsync_cnt == 0))
+ itx->itx_sync = B_FALSE;
+
+ zil_itx_assign(zilog, itx, tx);
+
+ off += len;
+ resid -= len;
+ }
+}
+
+/*
+ * Handles TX_TRUNCATE transactions.
+ */
+void
+zfs_log_truncate(zilog_t *zilog, dmu_tx_t *tx, int txtype,
+ znode_t *zp, uint64_t off, uint64_t len)
+{
+ itx_t *itx;
+ lr_truncate_t *lr;
+
+ if (zil_replaying(zilog, tx) || zp->z_unlinked)
+ return;
+
+ itx = zil_itx_create(txtype, sizeof (*lr));
+ lr = (lr_truncate_t *)&itx->itx_lr;
+ lr->lr_foid = zp->z_id;
+ lr->lr_offset = off;
+ lr->lr_length = len;
+
+ itx->itx_sync = (zp->z_sync_cnt != 0);
+ zil_itx_assign(zilog, itx, tx);
+}
+
+/*
+ * Handles TX_SETATTR transactions.
+ */
+void
+zfs_log_setattr(zilog_t *zilog, dmu_tx_t *tx, int txtype,
+ znode_t *zp, vattr_t *vap, uint_t mask_applied, zfs_fuid_info_t *fuidp)
+{
+ itx_t *itx;
+ lr_setattr_t *lr;
+ xvattr_t *xvap = (xvattr_t *)vap;
+ size_t recsize = sizeof (lr_setattr_t);
+ void *start;
+
+ if (zil_replaying(zilog, tx) || zp->z_unlinked)
+ return;
+
+ /*
+ * If XVATTR is set, then the log record size needs to allow
+ * for an lr_attr_t plus the xvattr mask, mapsize and create time,
+ * plus the actual attribute values.
+ */
+ if (vap->va_mask & AT_XVATTR)
+ recsize = sizeof (*lr) + ZIL_XVAT_SIZE(xvap->xva_mapsize);
+
+ if (fuidp)
+ recsize += fuidp->z_domain_str_sz;
+
+ itx = zil_itx_create(txtype, recsize);
+ lr = (lr_setattr_t *)&itx->itx_lr;
+ lr->lr_foid = zp->z_id;
+ lr->lr_mask = (uint64_t)mask_applied;
+ lr->lr_mode = (uint64_t)vap->va_mode;
+ if ((mask_applied & AT_UID) && IS_EPHEMERAL(vap->va_uid))
+ lr->lr_uid = fuidp->z_fuid_owner;
+ else
+ lr->lr_uid = (uint64_t)vap->va_uid;
+
+ if ((mask_applied & AT_GID) && IS_EPHEMERAL(vap->va_gid))
+ lr->lr_gid = fuidp->z_fuid_group;
+ else
+ lr->lr_gid = (uint64_t)vap->va_gid;
+
+ lr->lr_size = (uint64_t)vap->va_size;
+ ZFS_TIME_ENCODE(&vap->va_atime, lr->lr_atime);
+ ZFS_TIME_ENCODE(&vap->va_mtime, lr->lr_mtime);
+ start = (lr_setattr_t *)(lr + 1);
+ if (vap->va_mask & AT_XVATTR) {
+ zfs_log_xvattr((lr_attr_t *)start, xvap);
+ start = (caddr_t)start + ZIL_XVAT_SIZE(xvap->xva_mapsize);
+ }
+
+ /*
+ * Now stick any domain information on the end.
+ */
+
+ if (fuidp)
+ (void) zfs_log_fuid_domains(fuidp, start);
+
+ itx->itx_sync = (zp->z_sync_cnt != 0);
+ zil_itx_assign(zilog, itx, tx);
+}
+
+/*
+ * Handles TX_ACL transactions.
+ */
+void
+zfs_log_acl(zilog_t *zilog, dmu_tx_t *tx, znode_t *zp,
+ vsecattr_t *vsecp, zfs_fuid_info_t *fuidp)
+{
+ itx_t *itx;
+ lr_acl_v0_t *lrv0;
+ lr_acl_t *lr;
+ int txtype;
+ int lrsize;
+ size_t txsize;
+ size_t aclbytes = vsecp->vsa_aclentsz;
+
+ if (zil_replaying(zilog, tx) || zp->z_unlinked)
+ return;
+
+ txtype = (zp->z_zfsvfs->z_version < ZPL_VERSION_FUID) ?
+ TX_ACL_V0 : TX_ACL;
+
+ if (txtype == TX_ACL)
+ lrsize = sizeof (*lr);
+ else
+ lrsize = sizeof (*lrv0);
+
+ txsize = lrsize +
+ ((txtype == TX_ACL) ? ZIL_ACE_LENGTH(aclbytes) : aclbytes) +
+ (fuidp ? fuidp->z_domain_str_sz : 0) +
+ sizeof (uint64_t) * (fuidp ? fuidp->z_fuid_cnt : 0);
+
+ itx = zil_itx_create(txtype, txsize);
+
+ lr = (lr_acl_t *)&itx->itx_lr;
+ lr->lr_foid = zp->z_id;
+ if (txtype == TX_ACL) {
+ lr->lr_acl_bytes = aclbytes;
+ lr->lr_domcnt = fuidp ? fuidp->z_domain_cnt : 0;
+ lr->lr_fuidcnt = fuidp ? fuidp->z_fuid_cnt : 0;
+ if (vsecp->vsa_mask & VSA_ACE_ACLFLAGS)
+ lr->lr_acl_flags = (uint64_t)vsecp->vsa_aclflags;
+ else
+ lr->lr_acl_flags = 0;
+ }
+ lr->lr_aclcnt = (uint64_t)vsecp->vsa_aclcnt;
+
+ if (txtype == TX_ACL_V0) {
+ lrv0 = (lr_acl_v0_t *)lr;
+ bcopy(vsecp->vsa_aclentp, (ace_t *)(lrv0 + 1), aclbytes);
+ } else {
+ void *start = (ace_t *)(lr + 1);
+
+ bcopy(vsecp->vsa_aclentp, start, aclbytes);
+
+ start = (caddr_t)start + ZIL_ACE_LENGTH(aclbytes);
+
+ if (fuidp) {
+ start = zfs_log_fuid_ids(fuidp, start);
+ (void) zfs_log_fuid_domains(fuidp, start);
+ }
+ }
+
+ itx->itx_sync = (zp->z_sync_cnt != 0);
+ zil_itx_assign(zilog, itx, tx);
+}
diff --git a/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/zfs_onexit.c b/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/zfs_onexit.c
new file mode 100644
index 000000000000..edb9ca86caa8
--- /dev/null
+++ b/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/zfs_onexit.c
@@ -0,0 +1,254 @@
+/*
+ * CDDL HEADER START
+ *
+ * The contents of this file are subject to the terms of the
+ * Common Development and Distribution License (the "License").
+ * You may not use this file except in compliance with the License.
+ *
+ * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
+ * or http://www.opensolaris.org/os/licensing.
+ * See the License for the specific language governing permissions
+ * and limitations under the License.
+ *
+ * When distributing Covered Code, include this CDDL HEADER in each
+ * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
+ * If applicable, add the following below this CDDL HEADER, with the
+ * fields enclosed by brackets "[]" replaced with your own identifying
+ * information: Portions Copyright [yyyy] [name of copyright owner]
+ *
+ * CDDL HEADER END
+ */
+/*
+ * Copyright (c) 2010, Oracle and/or its affiliates. All rights reserved.
+ * Copyright (c) 2013 by Delphix. All rights reserved.
+ */
+
+#include <sys/types.h>
+#include <sys/param.h>
+#include <sys/errno.h>
+#include <sys/kmem.h>
+#include <sys/conf.h>
+#include <sys/sunddi.h>
+#include <sys/zfs_ioctl.h>
+#include <sys/zfs_onexit.h>
+#include <sys/zvol.h>
+
+/*
+ * ZFS kernel routines may add/delete callback routines to be invoked
+ * upon process exit (triggered via the close operation from the /dev/zfs
+ * driver).
+ *
+ * These cleanup callbacks are intended to allow for the accumulation
+ * of kernel state across multiple ioctls. User processes participate
+ * by opening ZFS_DEV with O_EXCL. This causes the ZFS driver to do a
+ * clone-open, generating a unique minor number. The process then passes
+ * along that file descriptor to each ioctl that might have a cleanup operation.
+ *
+ * Consumers of the onexit routines should call zfs_onexit_fd_hold() early
+ * on to validate the given fd and add a reference to its file table entry.
+ * This allows the consumer to do its work and then add a callback, knowing
+ * that zfs_onexit_add_cb() won't fail with EBADF. When finished, consumers
+ * should call zfs_onexit_fd_rele().
+ *
+ * A simple example is zfs_ioc_recv(), where we might create an AVL tree
+ * with dataset/GUID mappings and then reuse that tree on subsequent
+ * zfs_ioc_recv() calls.
+ *
+ * On the first zfs_ioc_recv() call, dmu_recv_stream() will kmem_alloc()
+ * the AVL tree and pass it along with a callback function to
+ * zfs_onexit_add_cb(). The zfs_onexit_add_cb() routine will register the
+ * callback and return an action handle.
+ *
+ * The action handle is then passed from user space to subsequent
+ * zfs_ioc_recv() calls, so that dmu_recv_stream() can fetch its AVL tree
+ * by calling zfs_onexit_cb_data() with the device minor number and
+ * action handle.
+ *
+ * If the user process exits abnormally, the callback is invoked implicitly
+ * as part of the driver close operation. Once the user space process is
+ * finished with the accumulated kernel state, it can also just call close(2)
+ * on the cleanup fd to trigger the cleanup callback.
+ */
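+
+/*
+ * A minimal consumer sketch, assuming 'cleanup_fd' was passed in from
+ * user space; 'my_cleanup_func' and 'my_data' are placeholders for the
+ * consumer's own teardown routine and state:
+ *
+ *	minor_t minor;
+ *	uint64_t handle;
+ *
+ *	error = zfs_onexit_fd_hold(cleanup_fd, &minor);
+ *	if (error != 0)
+ *		return (error);
+ *	error = zfs_onexit_add_cb(minor, my_cleanup_func, my_data, &handle);
+ *	zfs_onexit_fd_rele(cleanup_fd);
+ */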
+
+void
+zfs_onexit_init(zfs_onexit_t **zop)
+{
+ zfs_onexit_t *zo;
+
+ zo = *zop = kmem_zalloc(sizeof (zfs_onexit_t), KM_SLEEP);
+ mutex_init(&zo->zo_lock, NULL, MUTEX_DEFAULT, NULL);
+ list_create(&zo->zo_actions, sizeof (zfs_onexit_action_node_t),
+ offsetof(zfs_onexit_action_node_t, za_link));
+}
+
+void
+zfs_onexit_destroy(zfs_onexit_t *zo)
+{
+ zfs_onexit_action_node_t *ap;
+
+ mutex_enter(&zo->zo_lock);
+ while ((ap = list_head(&zo->zo_actions)) != NULL) {
+ list_remove(&zo->zo_actions, ap);
+ mutex_exit(&zo->zo_lock);
+ ap->za_func(ap->za_data);
+ kmem_free(ap, sizeof (zfs_onexit_action_node_t));
+ mutex_enter(&zo->zo_lock);
+ }
+ mutex_exit(&zo->zo_lock);
+
+ list_destroy(&zo->zo_actions);
+ mutex_destroy(&zo->zo_lock);
+ kmem_free(zo, sizeof (zfs_onexit_t));
+}
+
+static int
+zfs_onexit_minor_to_state(minor_t minor, zfs_onexit_t **zo)
+{
+ *zo = zfsdev_get_soft_state(minor, ZSST_CTLDEV);
+ if (*zo == NULL)
+ return (SET_ERROR(EBADF));
+
+ return (0);
+}
+
+/*
+ * Consumers might need to operate by minor number instead of fd, since
+ * they might be running in another thread (e.g. txg_sync_thread). Callers
+ * of this function must call zfs_onexit_fd_rele() when they're finished
+ * using the minor number.
+ */
+int
+zfs_onexit_fd_hold(int fd, minor_t *minorp)
+{
+ file_t *fp, *tmpfp;
+ zfs_onexit_t *zo;
+ cap_rights_t rights;
+ void *data;
+ int error;
+
+ fp = getf(fd, &cap_no_rights);
+ if (fp == NULL)
+ return (SET_ERROR(EBADF));
+
+ tmpfp = curthread->td_fpop;
+ curthread->td_fpop = fp;
+ error = devfs_get_cdevpriv(&data);
+ if (error == 0)
+ *minorp = (minor_t)(uintptr_t)data;
+ curthread->td_fpop = tmpfp;
+ if (error != 0)
+ return (SET_ERROR(EBADF));
+ return (zfs_onexit_minor_to_state(*minorp, &zo));
+}
+
+void
+zfs_onexit_fd_rele(int fd)
+{
+ releasef(fd);
+}
+
+/*
+ * Add a callback to be invoked when the calling process exits.
+ */
+int
+zfs_onexit_add_cb(minor_t minor, void (*func)(void *), void *data,
+ uint64_t *action_handle)
+{
+ zfs_onexit_t *zo;
+ zfs_onexit_action_node_t *ap;
+ int error;
+
+ error = zfs_onexit_minor_to_state(minor, &zo);
+ if (error)
+ return (error);
+
+ ap = kmem_alloc(sizeof (zfs_onexit_action_node_t), KM_SLEEP);
+ list_link_init(&ap->za_link);
+ ap->za_func = func;
+ ap->za_data = data;
+
+ mutex_enter(&zo->zo_lock);
+ list_insert_tail(&zo->zo_actions, ap);
+ mutex_exit(&zo->zo_lock);
+ if (action_handle)
+ *action_handle = (uint64_t)(uintptr_t)ap;
+
+ return (0);
+}
+
+static zfs_onexit_action_node_t *
+zfs_onexit_find_cb(zfs_onexit_t *zo, uint64_t action_handle)
+{
+ zfs_onexit_action_node_t *match;
+ zfs_onexit_action_node_t *ap;
+ list_t *l;
+
+ ASSERT(MUTEX_HELD(&zo->zo_lock));
+
+ match = (zfs_onexit_action_node_t *)(uintptr_t)action_handle;
+ l = &zo->zo_actions;
+ for (ap = list_head(l); ap != NULL; ap = list_next(l, ap)) {
+ if (match == ap)
+ break;
+ }
+ return (ap);
+}
+
+/*
+ * Delete the callback, triggering it first if 'fire' is set.
+ */
+int
+zfs_onexit_del_cb(minor_t minor, uint64_t action_handle, boolean_t fire)
+{
+ zfs_onexit_t *zo;
+ zfs_onexit_action_node_t *ap;
+ int error;
+
+ error = zfs_onexit_minor_to_state(minor, &zo);
+ if (error)
+ return (error);
+
+ mutex_enter(&zo->zo_lock);
+ ap = zfs_onexit_find_cb(zo, action_handle);
+ if (ap != NULL) {
+ list_remove(&zo->zo_actions, ap);
+ mutex_exit(&zo->zo_lock);
+ if (fire)
+ ap->za_func(ap->za_data);
+ kmem_free(ap, sizeof (zfs_onexit_action_node_t));
+ } else {
+ mutex_exit(&zo->zo_lock);
+ error = SET_ERROR(ENOENT);
+ }
+
+ return (error);
+}
+
+/*
+ * Return the data associated with this callback. This allows consumers
+ * of the cleanup-on-exit interfaces to stash kernel data across system
+ * calls, knowing that it will be cleaned up if the calling process exits.
+ */
+int
+zfs_onexit_cb_data(minor_t minor, uint64_t action_handle, void **data)
+{
+ zfs_onexit_t *zo;
+ zfs_onexit_action_node_t *ap;
+ int error;
+
+ *data = NULL;
+
+ error = zfs_onexit_minor_to_state(minor, &zo);
+ if (error)
+ return (error);
+
+ mutex_enter(&zo->zo_lock);
+ ap = zfs_onexit_find_cb(zo, action_handle);
+ if (ap != NULL)
+ *data = ap->za_data;
+ else
+ error = SET_ERROR(ENOENT);
+ mutex_exit(&zo->zo_lock);
+
+ return (error);
+}
diff --git a/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/zfs_replay.c b/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/zfs_replay.c
new file mode 100644
index 000000000000..c913e287e2ad
--- /dev/null
+++ b/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/zfs_replay.c
@@ -0,0 +1,1069 @@
+/*
+ * CDDL HEADER START
+ *
+ * The contents of this file are subject to the terms of the
+ * Common Development and Distribution License (the "License").
+ * You may not use this file except in compliance with the License.
+ *
+ * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
+ * or http://www.opensolaris.org/os/licensing.
+ * See the License for the specific language governing permissions
+ * and limitations under the License.
+ *
+ * When distributing Covered Code, include this CDDL HEADER in each
+ * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
+ * If applicable, add the following below this CDDL HEADER, with the
+ * fields enclosed by brackets "[]" replaced with your own identifying
+ * information: Portions Copyright [yyyy] [name of copyright owner]
+ *
+ * CDDL HEADER END
+ */
+/*
+ * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved.
+ * Copyright (c) 2013, 2015 by Delphix. All rights reserved.
+ */
+
+#include <sys/types.h>
+#include <sys/param.h>
+#include <sys/systm.h>
+#include <sys/sysmacros.h>
+#include <sys/cmn_err.h>
+#include <sys/kmem.h>
+#include <sys/file.h>
+#include <sys/fcntl.h>
+#include <sys/vfs.h>
+#include <sys/fs/zfs.h>
+#include <sys/zfs_znode.h>
+#include <sys/zfs_dir.h>
+#include <sys/zfs_acl.h>
+#include <sys/zfs_fuid.h>
+#include <sys/spa.h>
+#include <sys/zil.h>
+#include <sys/byteorder.h>
+#include <sys/stat.h>
+#include <sys/acl.h>
+#include <sys/atomic.h>
+#include <sys/cred.h>
+#include <sys/namei.h>
+
+/*
+ * Functions to replay ZFS intent log (ZIL) records.
+ * The functions are called through a function vector (zfs_replay_vector),
+ * which is indexed by the transaction type.
+ */
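+
+/*
+ * A condensed sketch of the dispatch table, assuming the conventional
+ * zil_replay_func_t vector indexed by TX_* type (most entries elided):
+ *
+ *	zil_replay_func_t *zfs_replay_vector[TX_MAX_TYPE] = {
+ *		zfs_replay_error,	(slot 0: no such transaction type)
+ *		zfs_replay_create,	(TX_CREATE)
+ *		zfs_replay_create,	(TX_MKDIR)
+ *		...
+ *	};
+ *
+ * The ZIL replay code looks up lrc_txtype in this vector and invokes the
+ * handler with the zfsvfs and the log record, plus a byteswap flag.
+ */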
+
+static void
+zfs_init_vattr(vattr_t *vap, uint64_t mask, uint64_t mode,
+ uint64_t uid, uint64_t gid, uint64_t rdev, uint64_t nodeid)
+{
+ VATTR_NULL(vap);
+ vap->va_mask = (uint_t)mask;
+ if (mask & AT_TYPE)
+ vap->va_type = IFTOVT(mode);
+ if (mask & AT_MODE)
+ vap->va_mode = mode & MODEMASK;
+ if (mask & AT_UID)
+ vap->va_uid = (uid_t)(IS_EPHEMERAL(uid)) ? -1 : uid;
+ if (mask & AT_GID)
+ vap->va_gid = (gid_t)(IS_EPHEMERAL(gid)) ? -1 : gid;
+ vap->va_rdev = zfs_cmpldev(rdev);
+ vap->va_nodeid = nodeid;
+}
+
+/* ARGSUSED */
+static int
+zfs_replay_error(void *arg1, void *arg2, boolean_t byteswap)
+{
+ return (SET_ERROR(ENOTSUP));
+}
+
+static void
+zfs_replay_xvattr(lr_attr_t *lrattr, xvattr_t *xvap)
+{
+ xoptattr_t *xoap = NULL;
+ uint64_t *attrs;
+ uint64_t *crtime;
+ uint32_t *bitmap;
+ void *scanstamp;
+ int i;
+
+ xvap->xva_vattr.va_mask |= AT_XVATTR;
+ if ((xoap = xva_getxoptattr(xvap)) == NULL) {
+ xvap->xva_vattr.va_mask &= ~AT_XVATTR; /* shouldn't happen */
+ return;
+ }
+
+ ASSERT(lrattr->lr_attr_masksize == xvap->xva_mapsize);
+
+ bitmap = &lrattr->lr_attr_bitmap;
+ for (i = 0; i != lrattr->lr_attr_masksize; i++, bitmap++)
+ xvap->xva_reqattrmap[i] = *bitmap;
+
+ attrs = (uint64_t *)(lrattr + lrattr->lr_attr_masksize - 1);
+ crtime = attrs + 1;
+ scanstamp = (caddr_t)(crtime + 2);
+
+ if (XVA_ISSET_REQ(xvap, XAT_HIDDEN))
+ xoap->xoa_hidden = ((*attrs & XAT0_HIDDEN) != 0);
+ if (XVA_ISSET_REQ(xvap, XAT_SYSTEM))
+ xoap->xoa_system = ((*attrs & XAT0_SYSTEM) != 0);
+ if (XVA_ISSET_REQ(xvap, XAT_ARCHIVE))
+ xoap->xoa_archive = ((*attrs & XAT0_ARCHIVE) != 0);
+ if (XVA_ISSET_REQ(xvap, XAT_READONLY))
+ xoap->xoa_readonly = ((*attrs & XAT0_READONLY) != 0);
+ if (XVA_ISSET_REQ(xvap, XAT_IMMUTABLE))
+ xoap->xoa_immutable = ((*attrs & XAT0_IMMUTABLE) != 0);
+ if (XVA_ISSET_REQ(xvap, XAT_NOUNLINK))
+ xoap->xoa_nounlink = ((*attrs & XAT0_NOUNLINK) != 0);
+ if (XVA_ISSET_REQ(xvap, XAT_APPENDONLY))
+ xoap->xoa_appendonly = ((*attrs & XAT0_APPENDONLY) != 0);
+ if (XVA_ISSET_REQ(xvap, XAT_NODUMP))
+ xoap->xoa_nodump = ((*attrs & XAT0_NODUMP) != 0);
+ if (XVA_ISSET_REQ(xvap, XAT_OPAQUE))
+ xoap->xoa_opaque = ((*attrs & XAT0_OPAQUE) != 0);
+ if (XVA_ISSET_REQ(xvap, XAT_AV_MODIFIED))
+ xoap->xoa_av_modified = ((*attrs & XAT0_AV_MODIFIED) != 0);
+ if (XVA_ISSET_REQ(xvap, XAT_AV_QUARANTINED))
+ xoap->xoa_av_quarantined =
+ ((*attrs & XAT0_AV_QUARANTINED) != 0);
+ if (XVA_ISSET_REQ(xvap, XAT_CREATETIME))
+ ZFS_TIME_DECODE(&xoap->xoa_createtime, crtime);
+ if (XVA_ISSET_REQ(xvap, XAT_AV_SCANSTAMP))
+ bcopy(scanstamp, xoap->xoa_av_scanstamp, AV_SCANSTAMP_SZ);
+ if (XVA_ISSET_REQ(xvap, XAT_REPARSE))
+ xoap->xoa_reparse = ((*attrs & XAT0_REPARSE) != 0);
+ if (XVA_ISSET_REQ(xvap, XAT_OFFLINE))
+ xoap->xoa_offline = ((*attrs & XAT0_OFFLINE) != 0);
+ if (XVA_ISSET_REQ(xvap, XAT_SPARSE))
+ xoap->xoa_sparse = ((*attrs & XAT0_SPARSE) != 0);
+}
+
+static int
+zfs_replay_domain_cnt(uint64_t uid, uint64_t gid)
+{
+ uint64_t uid_idx;
+ uint64_t gid_idx;
+ int domcnt = 0;
+
+ uid_idx = FUID_INDEX(uid);
+ gid_idx = FUID_INDEX(gid);
+ if (uid_idx)
+ domcnt++;
+ if (gid_idx > 0 && gid_idx != uid_idx)
+ domcnt++;
+
+ return (domcnt);
+}
+
+static void *
+zfs_replay_fuid_domain_common(zfs_fuid_info_t *fuid_infop, void *start,
+ int domcnt)
+{
+ int i;
+
+ for (i = 0; i != domcnt; i++) {
+ fuid_infop->z_domain_table[i] = start;
+ start = (caddr_t)start + strlen(start) + 1;
+ }
+
+ return (start);
+}
+
+/*
+ * Set the uid/gid in the fuid_info structure.
+ */
+static void
+zfs_replay_fuid_ugid(zfs_fuid_info_t *fuid_infop, uint64_t uid, uint64_t gid)
+{
+ /*
+ * If the owner or group are log-specific FUIDs then slurp up
+ * the domain information and build the zfs_fuid_info_t.
+ */
+ if (IS_EPHEMERAL(uid))
+ fuid_infop->z_fuid_owner = uid;
+
+ if (IS_EPHEMERAL(gid))
+ fuid_infop->z_fuid_group = gid;
+}
+
+/*
+ * Load fuid domains into fuid_info_t
+ */
+static zfs_fuid_info_t *
+zfs_replay_fuid_domain(void *buf, void **end, uint64_t uid, uint64_t gid)
+{
+ int domcnt;
+
+ zfs_fuid_info_t *fuid_infop;
+
+ fuid_infop = zfs_fuid_info_alloc();
+
+ domcnt = zfs_replay_domain_cnt(uid, gid);
+
+ if (domcnt == 0)
+ return (fuid_infop);
+
+ fuid_infop->z_domain_table =
+ kmem_zalloc(domcnt * sizeof (char **), KM_SLEEP);
+
+ zfs_replay_fuid_ugid(fuid_infop, uid, gid);
+
+ fuid_infop->z_domain_cnt = domcnt;
+ *end = zfs_replay_fuid_domain_common(fuid_infop, buf, domcnt);
+ return (fuid_infop);
+}
+
+/*
+ * load zfs_fuid_t's and fuid_domains into fuid_info_t
+ */
+static zfs_fuid_info_t *
+zfs_replay_fuids(void *start, void **end, int idcnt, int domcnt, uint64_t uid,
+ uint64_t gid)
+{
+ uint64_t *log_fuid = (uint64_t *)start;
+ zfs_fuid_info_t *fuid_infop;
+ int i;
+
+ fuid_infop = zfs_fuid_info_alloc();
+ fuid_infop->z_domain_cnt = domcnt;
+
+ fuid_infop->z_domain_table =
+ kmem_zalloc(domcnt * sizeof (char **), KM_SLEEP);
+
+ for (i = 0; i != idcnt; i++) {
+ zfs_fuid_t *zfuid;
+
+ zfuid = kmem_alloc(sizeof (zfs_fuid_t), KM_SLEEP);
+ zfuid->z_logfuid = *log_fuid;
+ zfuid->z_id = -1;
+ zfuid->z_domidx = 0;
+ list_insert_tail(&fuid_infop->z_fuids, zfuid);
+ log_fuid++;
+ }
+
+ zfs_replay_fuid_ugid(fuid_infop, uid, gid);
+
+ *end = zfs_replay_fuid_domain_common(fuid_infop, log_fuid, domcnt);
+ return (fuid_infop);
+}
+
+static void
+zfs_replay_swap_attrs(lr_attr_t *lrattr)
+{
+ /* swap the lr_attr structure */
+ byteswap_uint32_array(lrattr, sizeof (*lrattr));
+ /* swap the bitmap */
+ byteswap_uint32_array(lrattr + 1, (lrattr->lr_attr_masksize - 1) *
+ sizeof (uint32_t));
+ /* swap the attributes, create time + 64 bit word for attributes */
+ byteswap_uint64_array((caddr_t)(lrattr + 1) + (sizeof (uint32_t) *
+ (lrattr->lr_attr_masksize - 1)), 3 * sizeof (uint64_t));
+}
+
+/*
+ * Replay a file create with optional ACL and xvattr information, as
+ * well as optional FUID information.
+ */
+static int
+zfs_replay_create_acl(void *arg1, void *arg2, boolean_t byteswap)
+{
+ zfsvfs_t *zfsvfs = arg1;
+ lr_acl_create_t *lracl = arg2;
+ char *name = NULL; /* location determined later */
+ lr_create_t *lr = (lr_create_t *)lracl;
+ znode_t *dzp;
+ vnode_t *vp = NULL;
+ xvattr_t xva;
+ int vflg = 0;
+ vsecattr_t vsec = { 0 };
+ lr_attr_t *lrattr;
+ void *aclstart;
+ void *fuidstart;
+ size_t xvatlen = 0;
+ uint64_t txtype;
+ uint64_t objid;
+ uint64_t dnodesize;
+ int error;
+
+ txtype = (lr->lr_common.lrc_txtype & ~TX_CI);
+ if (byteswap) {
+ byteswap_uint64_array(lracl, sizeof (*lracl));
+ if (txtype == TX_CREATE_ACL_ATTR ||
+ txtype == TX_MKDIR_ACL_ATTR) {
+ lrattr = (lr_attr_t *)(caddr_t)(lracl + 1);
+ zfs_replay_swap_attrs(lrattr);
+ xvatlen = ZIL_XVAT_SIZE(lrattr->lr_attr_masksize);
+ }
+
+ aclstart = (caddr_t)(lracl + 1) + xvatlen;
+ zfs_ace_byteswap(aclstart, lracl->lr_acl_bytes, B_FALSE);
+ /* swap fuids */
+ if (lracl->lr_fuidcnt) {
+ byteswap_uint64_array((caddr_t)aclstart +
+ ZIL_ACE_LENGTH(lracl->lr_acl_bytes),
+ lracl->lr_fuidcnt * sizeof (uint64_t));
+ }
+ }
+
+ if ((error = zfs_zget(zfsvfs, lr->lr_doid, &dzp)) != 0)
+ return (error);
+
+ objid = LR_FOID_GET_OBJ(lr->lr_foid);
+ dnodesize = LR_FOID_GET_SLOTS(lr->lr_foid) << DNODE_SHIFT;
+
+ xva_init(&xva);
+ zfs_init_vattr(&xva.xva_vattr, AT_TYPE | AT_MODE | AT_UID | AT_GID,
+ lr->lr_mode, lr->lr_uid, lr->lr_gid, lr->lr_rdev, objid);
+
+ /*
+ * All forms of zfs create (create, mkdir, mkxattrdir, symlink)
+ * eventually end up in zfs_mknode(), which assigns the object's
+ * creation time, generation number, and dnode size. The generic
+ * zfs_create() has no concept of these attributes, so we smuggle
+ * the values inside the vattr's otherwise unused va_ctime,
+ * va_nblocks, and va_fsid fields.
+ */
+ ZFS_TIME_DECODE(&xva.xva_vattr.va_ctime, lr->lr_crtime);
+ xva.xva_vattr.va_nblocks = lr->lr_gen;
+ xva.xva_vattr.va_fsid = dnodesize;
+
+ error = dmu_object_info(zfsvfs->z_os, lr->lr_foid, NULL);
+ if (error != ENOENT)
+ goto bail;
+
+ if (lr->lr_common.lrc_txtype & TX_CI)
+ vflg |= FIGNORECASE;
+ switch (txtype) {
+ case TX_CREATE_ACL:
+ aclstart = (caddr_t)(lracl + 1);
+ fuidstart = (caddr_t)aclstart +
+ ZIL_ACE_LENGTH(lracl->lr_acl_bytes);
+ zfsvfs->z_fuid_replay = zfs_replay_fuids(fuidstart,
+ (void *)&name, lracl->lr_fuidcnt, lracl->lr_domcnt,
+ lr->lr_uid, lr->lr_gid);
+ /*FALLTHROUGH*/
+ case TX_CREATE_ACL_ATTR:
+ if (name == NULL) {
+ lrattr = (lr_attr_t *)(caddr_t)(lracl + 1);
+ xvatlen = ZIL_XVAT_SIZE(lrattr->lr_attr_masksize);
+ xva.xva_vattr.va_mask |= AT_XVATTR;
+ zfs_replay_xvattr(lrattr, &xva);
+ }
+ vsec.vsa_mask = VSA_ACE | VSA_ACE_ACLFLAGS;
+ vsec.vsa_aclentp = (caddr_t)(lracl + 1) + xvatlen;
+ vsec.vsa_aclcnt = lracl->lr_aclcnt;
+ vsec.vsa_aclentsz = lracl->lr_acl_bytes;
+ vsec.vsa_aclflags = lracl->lr_acl_flags;
+ if (zfsvfs->z_fuid_replay == NULL) {
+ fuidstart = (caddr_t)(lracl + 1) + xvatlen +
+ ZIL_ACE_LENGTH(lracl->lr_acl_bytes);
+ zfsvfs->z_fuid_replay =
+ zfs_replay_fuids(fuidstart,
+ (void *)&name, lracl->lr_fuidcnt, lracl->lr_domcnt,
+ lr->lr_uid, lr->lr_gid);
+ }
+
+#ifdef TODO
+ error = VOP_CREATE(ZTOV(dzp), name, &xva.xva_vattr,
+ 0, 0, &vp, kcred, vflg, NULL, &vsec);
+#else
+ panic("%s:%u: unsupported condition", __func__, __LINE__);
+#endif
+ break;
+ case TX_MKDIR_ACL:
+ aclstart = (caddr_t)(lracl + 1);
+ fuidstart = (caddr_t)aclstart +
+ ZIL_ACE_LENGTH(lracl->lr_acl_bytes);
+ zfsvfs->z_fuid_replay = zfs_replay_fuids(fuidstart,
+ (void *)&name, lracl->lr_fuidcnt, lracl->lr_domcnt,
+ lr->lr_uid, lr->lr_gid);
+ /*FALLTHROUGH*/
+ case TX_MKDIR_ACL_ATTR:
+ if (name == NULL) {
+ lrattr = (lr_attr_t *)(caddr_t)(lracl + 1);
+ xvatlen = ZIL_XVAT_SIZE(lrattr->lr_attr_masksize);
+ zfs_replay_xvattr(lrattr, &xva);
+ }
+ vsec.vsa_mask = VSA_ACE | VSA_ACE_ACLFLAGS;
+ vsec.vsa_aclentp = (caddr_t)(lracl + 1) + xvatlen;
+ vsec.vsa_aclcnt = lracl->lr_aclcnt;
+ vsec.vsa_aclentsz = lracl->lr_acl_bytes;
+ vsec.vsa_aclflags = lracl->lr_acl_flags;
+ if (zfsvfs->z_fuid_replay == NULL) {
+ fuidstart = (caddr_t)(lracl + 1) + xvatlen +
+ ZIL_ACE_LENGTH(lracl->lr_acl_bytes);
+ zfsvfs->z_fuid_replay =
+ zfs_replay_fuids(fuidstart,
+ (void *)&name, lracl->lr_fuidcnt, lracl->lr_domcnt,
+ lr->lr_uid, lr->lr_gid);
+ }
+#ifdef TODO
+ error = VOP_MKDIR(ZTOV(dzp), name, &xva.xva_vattr,
+ &vp, kcred, NULL, vflg, &vsec);
+#else
+ panic("%s:%u: unsupported condition", __func__, __LINE__);
+#endif
+ break;
+ default:
+ error = SET_ERROR(ENOTSUP);
+ }
+
+bail:
+ if (error == 0 && vp != NULL)
+ VN_RELE(vp);
+
+ VN_RELE(ZTOV(dzp));
+
+ if (zfsvfs->z_fuid_replay)
+ zfs_fuid_info_free(zfsvfs->z_fuid_replay);
+ zfsvfs->z_fuid_replay = NULL;
+
+ return (error);
+}
+
+static int
+zfs_replay_create(void *arg1, void *arg2, boolean_t byteswap)
+{
+ zfsvfs_t *zfsvfs = arg1;
+ lr_create_t *lr = arg2;
+ char *name = NULL; /* location determined later */
+ char *link; /* symlink content follows name */
+ znode_t *dzp;
+ vnode_t *vp = NULL;
+ xvattr_t xva;
+ int vflg = 0;
+ size_t lrsize = sizeof (lr_create_t);
+ lr_attr_t *lrattr;
+ void *start;
+ size_t xvatlen;
+ uint64_t txtype;
+ struct componentname cn;
+ int error;
+
+ txtype = (lr->lr_common.lrc_txtype & ~TX_CI);
+ if (byteswap) {
+ byteswap_uint64_array(lr, sizeof (*lr));
+ if (txtype == TX_CREATE_ATTR || txtype == TX_MKDIR_ATTR)
+ zfs_replay_swap_attrs((lr_attr_t *)(lr + 1));
+ }
+
+
+ if ((error = zfs_zget(zfsvfs, lr->lr_doid, &dzp)) != 0)
+ return (error);
+
+ uint64_t objid = LR_FOID_GET_OBJ(lr->lr_foid);
+ int dnodesize = LR_FOID_GET_SLOTS(lr->lr_foid) << DNODE_SHIFT;
+
+ xva_init(&xva);
+ zfs_init_vattr(&xva.xva_vattr, AT_TYPE | AT_MODE | AT_UID | AT_GID,
+ lr->lr_mode, lr->lr_uid, lr->lr_gid, lr->lr_rdev, objid);
+
+ /*
+ * All forms of zfs create (create, mkdir, mkxattrdir, symlink)
+ * eventually end up in zfs_mknode(), which assigns the object's
+ * creation time, generation number, and dnode slot count. The
+ * generic zfs_create() has no concept of these attributes, so
+ * we smuggle the values inside the vattr's otherwise unused
+ * va_ctime, va_nblocks and va_fsid fields.
+ */
+ ZFS_TIME_DECODE(&xva.xva_vattr.va_ctime, lr->lr_crtime);
+ xva.xva_vattr.va_nblocks = lr->lr_gen;
+ xva.xva_vattr.va_fsid = dnodesize;
+
+ error = dmu_object_info(zfsvfs->z_os, objid, NULL);
+ if (error != ENOENT)
+ goto out;
+
+ if (lr->lr_common.lrc_txtype & TX_CI)
+ vflg |= FIGNORECASE;
+
+ /*
+ * Symlinks don't have fuid info, and CIFS never creates
+ * symlinks.
+ *
+ * The _ATTR versions will grab the fuid info in their subcases.
+ */
+ if ((int)lr->lr_common.lrc_txtype != TX_SYMLINK &&
+ (int)lr->lr_common.lrc_txtype != TX_MKDIR_ATTR &&
+ (int)lr->lr_common.lrc_txtype != TX_CREATE_ATTR) {
+ start = (lr + 1);
+ zfsvfs->z_fuid_replay =
+ zfs_replay_fuid_domain(start, &start,
+ lr->lr_uid, lr->lr_gid);
+ }
+
+ cn.cn_cred = kcred;
+ cn.cn_thread = curthread;
+ cn.cn_flags = SAVENAME;
+
+ vn_lock(ZTOV(dzp), LK_EXCLUSIVE | LK_RETRY);
+ switch (txtype) {
+ case TX_CREATE_ATTR:
+ lrattr = (lr_attr_t *)(caddr_t)(lr + 1);
+ xvatlen = ZIL_XVAT_SIZE(lrattr->lr_attr_masksize);
+ zfs_replay_xvattr((lr_attr_t *)((caddr_t)lr + lrsize), &xva);
+ start = (caddr_t)(lr + 1) + xvatlen;
+ zfsvfs->z_fuid_replay =
+ zfs_replay_fuid_domain(start, &start,
+ lr->lr_uid, lr->lr_gid);
+ name = (char *)start;
+
+ /*FALLTHROUGH*/
+ case TX_CREATE:
+ if (name == NULL)
+ name = (char *)start;
+
+ cn.cn_nameptr = name;
+ error = VOP_CREATE(ZTOV(dzp), &vp, &cn, &xva.xva_vattr /*,vflg*/);
+ break;
+ case TX_MKDIR_ATTR:
+ lrattr = (lr_attr_t *)(caddr_t)(lr + 1);
+ xvatlen = ZIL_XVAT_SIZE(lrattr->lr_attr_masksize);
+ zfs_replay_xvattr((lr_attr_t *)((caddr_t)lr + lrsize), &xva);
+ start = (caddr_t)(lr + 1) + xvatlen;
+ zfsvfs->z_fuid_replay =
+ zfs_replay_fuid_domain(start, &start,
+ lr->lr_uid, lr->lr_gid);
+ name = (char *)start;
+
+ /*FALLTHROUGH*/
+ case TX_MKDIR:
+ if (name == NULL)
+ name = (char *)(lr + 1);
+
+ cn.cn_nameptr = name;
+ error = VOP_MKDIR(ZTOV(dzp), &vp, &cn, &xva.xva_vattr /*,vflg*/);
+ break;
+ case TX_MKXATTR:
+ error = zfs_make_xattrdir(dzp, &xva.xva_vattr, &vp, kcred);
+ break;
+ case TX_SYMLINK:
+ name = (char *)(lr + 1);
+ link = name + strlen(name) + 1;
+ cn.cn_nameptr = name;
+ error = VOP_SYMLINK(ZTOV(dzp), &vp, &cn, &xva.xva_vattr, link /*,vflg*/);
+ break;
+ default:
+ error = SET_ERROR(ENOTSUP);
+ }
+ VOP_UNLOCK(ZTOV(dzp));
+
+out:
+ if (error == 0 && vp != NULL)
+ VN_URELE(vp);
+
+ VN_RELE(ZTOV(dzp));
+
+ if (zfsvfs->z_fuid_replay)
+ zfs_fuid_info_free(zfsvfs->z_fuid_replay);
+ zfsvfs->z_fuid_replay = NULL;
+ return (error);
+}
+
+static int
+zfs_replay_remove(void *arg1, void *arg2, boolean_t byteswap)
+{
+ zfsvfs_t *zfsvfs = arg1;
+ lr_remove_t *lr = arg2;
+ char *name = (char *)(lr + 1); /* name follows lr_remove_t */
+ znode_t *dzp;
+ struct componentname cn;
+ vnode_t *vp;
+ int error;
+ int vflg = 0;
+
+ if (byteswap)
+ byteswap_uint64_array(lr, sizeof (*lr));
+
+ if ((error = zfs_zget(zfsvfs, lr->lr_doid, &dzp)) != 0)
+ return (error);
+
+ if (lr->lr_common.lrc_txtype & TX_CI)
+ vflg |= FIGNORECASE;
+ cn.cn_nameptr = name;
+ cn.cn_namelen = strlen(name);
+ cn.cn_nameiop = DELETE;
+ cn.cn_flags = ISLASTCN | SAVENAME;
+ cn.cn_lkflags = LK_EXCLUSIVE | LK_RETRY;
+ cn.cn_cred = kcred;
+ cn.cn_thread = curthread;
+ vn_lock(ZTOV(dzp), LK_EXCLUSIVE | LK_RETRY);
+ error = VOP_LOOKUP(ZTOV(dzp), &vp, &cn);
+ if (error != 0) {
+ VOP_UNLOCK(ZTOV(dzp));
+ goto fail;
+ }
+
+ switch ((int)lr->lr_common.lrc_txtype) {
+ case TX_REMOVE:
+ error = VOP_REMOVE(ZTOV(dzp), vp, &cn /*,vflg*/);
+ break;
+ case TX_RMDIR:
+ error = VOP_RMDIR(ZTOV(dzp), vp, &cn /*,vflg*/);
+ break;
+ default:
+ error = SET_ERROR(ENOTSUP);
+ }
+ vput(vp);
+ VOP_UNLOCK(ZTOV(dzp));
+
+fail:
+ VN_RELE(ZTOV(dzp));
+
+ return (error);
+}
+
+static int
+zfs_replay_link(void *arg1, void *arg2, boolean_t byteswap)
+{
+ zfsvfs_t *zfsvfs = arg1;
+ lr_link_t *lr = arg2;
+ char *name = (char *)(lr + 1); /* name follows lr_link_t */
+ znode_t *dzp, *zp;
+ struct componentname cn;
+ int error;
+ int vflg = 0;
+
+ if (byteswap)
+ byteswap_uint64_array(lr, sizeof (*lr));
+
+ if ((error = zfs_zget(zfsvfs, lr->lr_doid, &dzp)) != 0)
+ return (error);
+
+ if ((error = zfs_zget(zfsvfs, lr->lr_link_obj, &zp)) != 0) {
+ VN_RELE(ZTOV(dzp));
+ return (error);
+ }
+
+ if (lr->lr_common.lrc_txtype & TX_CI)
+ vflg |= FIGNORECASE;
+
+ cn.cn_nameptr = name;
+ cn.cn_cred = kcred;
+ cn.cn_thread = curthread;
+ cn.cn_flags = SAVENAME;
+
+ vn_lock(ZTOV(dzp), LK_EXCLUSIVE | LK_RETRY);
+ vn_lock(ZTOV(zp), LK_EXCLUSIVE | LK_RETRY);
+ error = VOP_LINK(ZTOV(dzp), ZTOV(zp), &cn /*,vflg*/);
+ VOP_UNLOCK(ZTOV(zp));
+ VOP_UNLOCK(ZTOV(dzp));
+
+ VN_RELE(ZTOV(zp));
+ VN_RELE(ZTOV(dzp));
+
+ return (error);
+}
+
+static int
+zfs_replay_rename(void *arg1, void *arg2, boolean_t byteswap)
+{
+ zfsvfs_t *zfsvfs = arg1;
+ lr_rename_t *lr = arg2;
+ char *sname = (char *)(lr + 1); /* sname and tname follow lr_rename_t */
+ char *tname = sname + strlen(sname) + 1;
+ znode_t *sdzp, *tdzp;
+ struct componentname scn, tcn;
+ vnode_t *svp, *tvp;
+ kthread_t *td = curthread;
+ int error;
+ int vflg = 0;
+
+ if (byteswap)
+ byteswap_uint64_array(lr, sizeof (*lr));
+
+ if ((error = zfs_zget(zfsvfs, lr->lr_sdoid, &sdzp)) != 0)
+ return (error);
+
+ if ((error = zfs_zget(zfsvfs, lr->lr_tdoid, &tdzp)) != 0) {
+ VN_RELE(ZTOV(sdzp));
+ return (error);
+ }
+
+ if (lr->lr_common.lrc_txtype & TX_CI)
+ vflg |= FIGNORECASE;
+ svp = tvp = NULL;
+
+ scn.cn_nameptr = sname;
+ scn.cn_namelen = strlen(sname);
+ scn.cn_nameiop = DELETE;
+ scn.cn_flags = ISLASTCN | SAVENAME;
+ scn.cn_lkflags = LK_EXCLUSIVE | LK_RETRY;
+ scn.cn_cred = kcred;
+ scn.cn_thread = td;
+ vn_lock(ZTOV(sdzp), LK_EXCLUSIVE | LK_RETRY);
+ error = VOP_LOOKUP(ZTOV(sdzp), &svp, &scn);
+ VOP_UNLOCK(ZTOV(sdzp));
+ if (error != 0)
+ goto fail;
+ VOP_UNLOCK(svp);
+
+ tcn.cn_nameptr = tname;
+ tcn.cn_namelen = strlen(tname);
+ tcn.cn_nameiop = RENAME;
+ tcn.cn_flags = ISLASTCN | SAVENAME;
+ tcn.cn_lkflags = LK_EXCLUSIVE | LK_RETRY;
+ tcn.cn_cred = kcred;
+ tcn.cn_thread = td;
+ vn_lock(ZTOV(tdzp), LK_EXCLUSIVE | LK_RETRY);
+ error = VOP_LOOKUP(ZTOV(tdzp), &tvp, &tcn);
+ if (error == EJUSTRETURN)
+ tvp = NULL;
+ else if (error != 0) {
+ VOP_UNLOCK(ZTOV(tdzp));
+ goto fail;
+ }
+
+ error = VOP_RENAME(ZTOV(sdzp), svp, &scn, ZTOV(tdzp), tvp, &tcn /*,vflg*/);
+ return (error);
+fail:
+ if (svp != NULL)
+ vrele(svp);
+ if (tvp != NULL)
+ vrele(tvp);
+ VN_RELE(ZTOV(tdzp));
+ VN_RELE(ZTOV(sdzp));
+
+ return (error);
+}
+
+static int
+zfs_replay_write(void *arg1, void *arg2, boolean_t byteswap)
+{
+ zfsvfs_t *zfsvfs = arg1;
+ lr_write_t *lr = arg2;
+ char *data = (char *)(lr + 1); /* data follows lr_write_t */
+ znode_t *zp;
+ int error;
+ ssize_t resid;
+ uint64_t eod, offset, length;
+
+ if (byteswap)
+ byteswap_uint64_array(lr, sizeof (*lr));
+
+ if ((error = zfs_zget(zfsvfs, lr->lr_foid, &zp)) != 0) {
+ /*
+ * As we can log writes out of order, it's possible the
+ * file has been removed. In this case just drop the write
+ * and return success.
+ */
+ if (error == ENOENT)
+ error = 0;
+ return (error);
+ }
+
+ offset = lr->lr_offset;
+ length = lr->lr_length;
+ eod = offset + length; /* end of data for this write */
+
+ /*
+ * This may be a write from a dmu_sync() for a whole block,
+ * and may extend beyond the current end of the file.
+ * We can't just replay what was written for this TX_WRITE as
+ * a future TX_WRITE2 may extend the eof and the data for that
+ * write needs to be there. So we write the whole block and
+ * reduce the eof. This needs to be done within the single dmu
+ * transaction created within vn_rdwr -> zfs_write. So a possible
+ * new end of file is passed through in zfsvfs->z_replay_eof
+ */
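+
+	/*
+	 * A worked example with hypothetical numbers: with a 128K block
+	 * size, a 4K dmu_sync() TX_WRITE at offset 132K is rounded down
+	 * to offset 128K and widened to the full 128K block below; if
+	 * the file was shorter than 136K (the original end of this
+	 * write's data), z_replay_eof is set to 136K so zfs_write() can
+	 * trim the eof back after the whole block is written.
+	 */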
+
+ zfsvfs->z_replay_eof = 0; /* 0 means don't change end of file */
+
+ /* If it's a dmu_sync() block, write the whole block */
+ if (lr->lr_common.lrc_reclen == sizeof (lr_write_t)) {
+ uint64_t blocksize = BP_GET_LSIZE(&lr->lr_blkptr);
+ if (length < blocksize) {
+ offset -= offset % blocksize;
+ length = blocksize;
+ }
+ if (zp->z_size < eod)
+ zfsvfs->z_replay_eof = eod;
+ }
+
+ error = vn_rdwr(UIO_WRITE, ZTOV(zp), data, length, offset,
+ UIO_SYSSPACE, 0, RLIM64_INFINITY, kcred, &resid);
+
+ VN_RELE(ZTOV(zp));
+ zfsvfs->z_replay_eof = 0; /* safety */
+
+ return (error);
+}
+
+/*
+ * TX_WRITE2 records are only generated when dmu_sync() returns EALREADY,
+ * meaning the pool block is already being synced. So now that we always
+ * write out full blocks, all we have to do is expand the eof if
+ * the file is grown.
+ */
+static int
+zfs_replay_write2(void *arg1, void *arg2, boolean_t byteswap)
+{
+ zfsvfs_t *zfsvfs = arg1;
+ lr_write_t *lr = arg2;
+ znode_t *zp;
+ int error;
+ uint64_t end;
+
+ if (byteswap)
+ byteswap_uint64_array(lr, sizeof (*lr));
+
+ if ((error = zfs_zget(zfsvfs, lr->lr_foid, &zp)) != 0)
+ return (error);
+
+top:
+ end = lr->lr_offset + lr->lr_length;
+ if (end > zp->z_size) {
+ dmu_tx_t *tx = dmu_tx_create(zfsvfs->z_os);
+
+ zp->z_size = end;
+ dmu_tx_hold_sa(tx, zp->z_sa_hdl, B_FALSE);
+ error = dmu_tx_assign(tx, TXG_WAIT);
+ if (error) {
+ VN_RELE(ZTOV(zp));
+ if (error == ERESTART) {
+ dmu_tx_wait(tx);
+ dmu_tx_abort(tx);
+ goto top;
+ }
+ dmu_tx_abort(tx);
+ return (error);
+ }
+ (void) sa_update(zp->z_sa_hdl, SA_ZPL_SIZE(zfsvfs),
+ (void *)&zp->z_size, sizeof (uint64_t), tx);
+
+ /* Ensure the replayed seq is updated */
+ (void) zil_replaying(zfsvfs->z_log, tx);
+
+ dmu_tx_commit(tx);
+ }
+
+ VN_RELE(ZTOV(zp));
+
+ return (error);
+}
+
+static int
+zfs_replay_truncate(void *arg1, void *arg2, boolean_t byteswap)
+{
+#ifdef illumos
+ zfsvfs_t *zfsvfs = arg1;
+ lr_truncate_t *lr = arg2;
+ znode_t *zp;
+ flock64_t fl;
+ int error;
+
+ if (byteswap)
+ byteswap_uint64_array(lr, sizeof (*lr));
+
+ if ((error = zfs_zget(zfsvfs, lr->lr_foid, &zp)) != 0)
+ return (error);
+
+ bzero(&fl, sizeof (fl));
+ fl.l_type = F_WRLCK;
+ fl.l_whence = 0;
+ fl.l_start = lr->lr_offset;
+ fl.l_len = lr->lr_length;
+
+ error = VOP_SPACE(ZTOV(zp), F_FREESP, &fl, FWRITE | FOFFMAX,
+ lr->lr_offset, kcred, NULL);
+
+ VN_RELE(ZTOV(zp));
+
+ return (error);
+#else
+ ZFS_LOG(0, "Unexpected code path, report to pjd@FreeBSD.org");
+ return (EOPNOTSUPP);
+#endif
+}
+
+static int
+zfs_replay_setattr(void *arg1, void *arg2, boolean_t byteswap)
+{
+ zfsvfs_t *zfsvfs = arg1;
+ lr_setattr_t *lr = arg2;
+ znode_t *zp;
+ xvattr_t xva;
+ vattr_t *vap = &xva.xva_vattr;
+ vnode_t *vp;
+ int error;
+ void *start;
+
+ xva_init(&xva);
+ if (byteswap) {
+ byteswap_uint64_array(lr, sizeof (*lr));
+
+ if ((lr->lr_mask & AT_XVATTR) &&
+ zfsvfs->z_version >= ZPL_VERSION_INITIAL)
+ zfs_replay_swap_attrs((lr_attr_t *)(lr + 1));
+ }
+
+ if ((error = zfs_zget(zfsvfs, lr->lr_foid, &zp)) != 0)
+ return (error);
+
+ zfs_init_vattr(vap, lr->lr_mask, lr->lr_mode,
+ lr->lr_uid, lr->lr_gid, 0, lr->lr_foid);
+
+ vap->va_size = lr->lr_size;
+ ZFS_TIME_DECODE(&vap->va_atime, lr->lr_atime);
+ ZFS_TIME_DECODE(&vap->va_mtime, lr->lr_mtime);
+
+ /*
+ * Fill in xvattr_t portions if necessary.
+ */
+
+ start = (lr_setattr_t *)(lr + 1);
+ if (vap->va_mask & AT_XVATTR) {
+ zfs_replay_xvattr((lr_attr_t *)start, &xva);
+ start = (caddr_t)start +
+ ZIL_XVAT_SIZE(((lr_attr_t *)start)->lr_attr_masksize);
+ } else
+ xva.xva_vattr.va_mask &= ~AT_XVATTR;
+
+ zfsvfs->z_fuid_replay = zfs_replay_fuid_domain(start, &start,
+ lr->lr_uid, lr->lr_gid);
+
+ vp = ZTOV(zp);
+ vn_lock(vp, LK_EXCLUSIVE | LK_RETRY);
+ error = VOP_SETATTR(vp, vap, kcred);
+ VOP_UNLOCK(vp);
+
+ zfs_fuid_info_free(zfsvfs->z_fuid_replay);
+ zfsvfs->z_fuid_replay = NULL;
+ VN_RELE(vp);
+
+ return (error);
+}
+
+extern int zfs_setsecattr(vnode_t *vp, vsecattr_t *vsecp, int flag, cred_t *cr,
+ caller_context_t *ct);
+
+static int
+zfs_replay_acl_v0(void *arg1, void *arg2, boolean_t byteswap)
+{
+ zfsvfs_t *zfsvfs = arg1;
+ lr_acl_v0_t *lr = arg2;
+ ace_t *ace = (ace_t *)(lr + 1); /* ace array follows lr_acl_t */
+ vsecattr_t vsa;
+ vnode_t *vp;
+ znode_t *zp;
+ int error;
+
+ if (byteswap) {
+ byteswap_uint64_array(lr, sizeof (*lr));
+ zfs_oldace_byteswap(ace, lr->lr_aclcnt);
+ }
+
+ if ((error = zfs_zget(zfsvfs, lr->lr_foid, &zp)) != 0)
+ return (error);
+
+ bzero(&vsa, sizeof (vsa));
+ vsa.vsa_mask = VSA_ACE | VSA_ACECNT;
+ vsa.vsa_aclcnt = lr->lr_aclcnt;
+ vsa.vsa_aclentsz = sizeof (ace_t) * vsa.vsa_aclcnt;
+ vsa.vsa_aclflags = 0;
+ vsa.vsa_aclentp = ace;
+
+ vp = ZTOV(zp);
+ vn_lock(vp, LK_EXCLUSIVE | LK_RETRY);
+ error = zfs_setsecattr(vp, &vsa, 0, kcred, NULL);
+ VOP_UNLOCK(vp);
+
+ VN_RELE(vp);
+
+ return (error);
+}
+
+/*
+ * Replaying ACLs is complicated by FUID support.
+ * The log record may contain some optional data
+ * to be used for replaying FUIDs. These pieces
+ * are the actual FUIDs that were created initially.
+ * The FUID table index may no longer be valid and
+ * during zfs_create() a new index may be assigned.
+ * Because of this the log will contain the original
+ * domain+rid in order to create a new FUID.
+ *
+ * The individual ACEs may contain an ephemeral uid/gid which is no
+ * longer valid and will need to be replaced with an actual FUID.
+ */
+static int
+zfs_replay_acl(void *arg1, void *arg2, boolean_t byteswap)
+{
+ zfsvfs_t *zfsvfs = arg1;
+ lr_acl_t *lr = arg2;
+ ace_t *ace = (ace_t *)(lr + 1);
+ vsecattr_t vsa;
+ znode_t *zp;
+ vnode_t *vp;
+ int error;
+
+ if (byteswap) {
+ byteswap_uint64_array(lr, sizeof (*lr));
+ zfs_ace_byteswap(ace, lr->lr_acl_bytes, B_FALSE);
+ if (lr->lr_fuidcnt) {
+ byteswap_uint64_array((caddr_t)ace +
+ ZIL_ACE_LENGTH(lr->lr_acl_bytes),
+ lr->lr_fuidcnt * sizeof (uint64_t));
+ }
+ }
+
+ if ((error = zfs_zget(zfsvfs, lr->lr_foid, &zp)) != 0)
+ return (error);
+
+ bzero(&vsa, sizeof (vsa));
+ vsa.vsa_mask = VSA_ACE | VSA_ACECNT | VSA_ACE_ACLFLAGS;
+ vsa.vsa_aclcnt = lr->lr_aclcnt;
+ vsa.vsa_aclentp = ace;
+ vsa.vsa_aclentsz = lr->lr_acl_bytes;
+ vsa.vsa_aclflags = lr->lr_acl_flags;
+
+ if (lr->lr_fuidcnt) {
+ void *fuidstart = (caddr_t)ace +
+ ZIL_ACE_LENGTH(lr->lr_acl_bytes);
+
+ zfsvfs->z_fuid_replay =
+ zfs_replay_fuids(fuidstart, &fuidstart,
+ lr->lr_fuidcnt, lr->lr_domcnt, 0, 0);
+ }
+
+ vp = ZTOV(zp);
+ vn_lock(vp, LK_EXCLUSIVE | LK_RETRY);
+ error = zfs_setsecattr(vp, &vsa, 0, kcred, NULL);
+ VOP_UNLOCK(vp);
+
+ if (zfsvfs->z_fuid_replay)
+ zfs_fuid_info_free(zfsvfs->z_fuid_replay);
+
+ zfsvfs->z_fuid_replay = NULL;
+ VN_RELE(vp);
+
+ return (error);
+}
+
+/*
+ * Callback vectors for replaying records
+ */
+zil_replay_func_t *zfs_replay_vector[TX_MAX_TYPE] = {
+ zfs_replay_error, /* 0 no such transaction type */
+ zfs_replay_create, /* TX_CREATE */
+ zfs_replay_create, /* TX_MKDIR */
+ zfs_replay_create, /* TX_MKXATTR */
+ zfs_replay_create, /* TX_SYMLINK */
+ zfs_replay_remove, /* TX_REMOVE */
+ zfs_replay_remove, /* TX_RMDIR */
+ zfs_replay_link, /* TX_LINK */
+ zfs_replay_rename, /* TX_RENAME */
+ zfs_replay_write, /* TX_WRITE */
+ zfs_replay_truncate, /* TX_TRUNCATE */
+ zfs_replay_setattr, /* TX_SETATTR */
+ zfs_replay_acl_v0, /* TX_ACL_V0 */
+ zfs_replay_acl, /* TX_ACL */
+ zfs_replay_create_acl, /* TX_CREATE_ACL */
+ zfs_replay_create, /* TX_CREATE_ATTR */
+ zfs_replay_create_acl, /* TX_CREATE_ACL_ATTR */
+ zfs_replay_create_acl, /* TX_MKDIR_ACL */
+ zfs_replay_create, /* TX_MKDIR_ATTR */
+ zfs_replay_create_acl, /* TX_MKDIR_ACL_ATTR */
+ zfs_replay_write2, /* TX_WRITE2 */
+};
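+
+/*
+ * A minimal dispatch sketch (hypothetical caller; in practice the generic
+ * zil_replay() machinery drives this table): each decoded log record is
+ * handed, along with the byteswap flag, to the handler registered for its
+ * transaction type:
+ *
+ *	uint64_t txtype = lr->lrc_txtype & ~TX_CI;
+ *	error = zfs_replay_vector[txtype](zfsvfs, lr, byteswap);
+ */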
diff --git a/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/zfs_rlock.c b/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/zfs_rlock.c
new file mode 100644
index 000000000000..434be78ffce2
--- /dev/null
+++ b/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/zfs_rlock.c
@@ -0,0 +1,641 @@
+/*
+ * CDDL HEADER START
+ *
+ * The contents of this file are subject to the terms of the
+ * Common Development and Distribution License (the "License").
+ * You may not use this file except in compliance with the License.
+ *
+ * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
+ * or http://www.opensolaris.org/os/licensing.
+ * See the License for the specific language governing permissions
+ * and limitations under the License.
+ *
+ * When distributing Covered Code, include this CDDL HEADER in each
+ * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
+ * If applicable, add the following below this CDDL HEADER, with the
+ * fields enclosed by brackets "[]" replaced with your own identifying
+ * information: Portions Copyright [yyyy] [name of copyright owner]
+ *
+ * CDDL HEADER END
+ */
+/*
+ * Copyright 2010 Sun Microsystems, Inc. All rights reserved.
+ * Use is subject to license terms.
+ */
+/*
+ * Copyright (c) 2012, 2018 by Delphix. All rights reserved.
+ */
+
+/*
+ * This file contains the code to implement file range locking in
+ * ZFS, although there isn't much specific to ZFS (all that comes to mind is
+ * support for growing the blocksize).
+ *
+ * Interface
+ * ---------
+ * Defined in zfs_rlock.h but essentially:
+ * lr = rangelock_enter(zp, off, len, lock_type);
+ * rangelock_reduce(lr, off, len); // optional
+ * rangelock_exit(lr);
+ *
+ * AVL tree
+ * --------
+ * An AVL tree is used to maintain the state of the existing ranges
+ * that are locked for exclusive (writer) or shared (reader) use.
+ * The starting range offset is used for searching and sorting the tree.
+ *
+ * Common case
+ * -----------
+ * The (hopefully) usual case is no overlaps or contention for locks. On
+ * entry to rangelock_enter(), a locked_range_t is allocated; the tree is
+ * searched and no overlap is found, so this locked_range_t is placed in
+ * the tree.
+ *
+ * Overlaps/Reference counting/Proxy locks
+ * ---------------------------------------
+ * The avl code only allows one node at a particular offset. Also it's very
+ * inefficient to search through all previous entries looking for overlaps
+ * (because the very 1st in the ordered list might be at offset 0 but
+ * cover the whole file).
+ * So this implementation uses reference counts and proxy range locks.
+ * Firstly, only reader locks use reference counts and proxy locks,
+ * because writer locks are exclusive.
+ * When a reader lock overlaps with another then a proxy lock is created
+ * for that range and replaces the original lock. If the overlap
+ * is exact then the reference count of the proxy is simply incremented.
+ * Otherwise, the proxy lock is split into smaller lock ranges and
+ * new proxy locks created for non overlapping ranges.
+ * The reference counts are adjusted accordingly.
+ * Meanwhile, the original lock is kept around (this is the caller's handle)
+ * and its offset and length are used when releasing the lock.
+ *
+ * Thread coordination
+ * -------------------
+ * In order to make wakeups efficient and to ensure multiple continuous
+ * readers on a range don't starve a writer for the same range lock,
+ * two condition variables are allocated in each locked_range_t.
+ * If a writer (or reader) can't get a range it initialises the writer
+ * (or reader) cv; sets a flag saying there's a writer (or reader) waiting;
+ * and waits on that cv. When a thread unlocks that range it wakes up all
+ * writers then all readers before destroying the lock.
+ *
+ * Append mode writes
+ * ------------------
+ * Append mode writes need to lock a range at the end of a file.
+ * The offset of the end of the file is determined under the
+ * range locking mutex, and the lock type converted from RL_APPEND to
+ * RL_WRITER and the range locked.
+ *
+ * Grow block handling
+ * -------------------
+ * ZFS supports multiple block sizes, up to 16MB. The smallest
+ * block size is used for the file which is grown as needed. During this
+ * growth all other writers and readers must be excluded.
+ * So if the block size needs to be grown then the whole file is
+ * exclusively locked, then later the caller will reduce the lock
+ * range to just the range to be written using rangelock_reduce().
+ */
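+
+/*
+ * A minimal usage sketch (hypothetical caller; the zp->z_rangelock field
+ * name is an assumption): take a shared lock over a region, or take an
+ * exclusive whole-file lock and narrow it with rangelock_reduce() once
+ * the new blocksize is known:
+ *
+ *	locked_range_t *lr;
+ *
+ *	lr = rangelock_enter(&zp->z_rangelock, off, len, RL_READER);
+ *	(read the region)
+ *	rangelock_exit(lr);
+ *
+ *	lr = rangelock_enter(&zp->z_rangelock, 0, UINT64_MAX, RL_WRITER);
+ *	(grow the blocksize, then narrow the lock)
+ *	rangelock_reduce(lr, off, len);
+ *	(write the region)
+ *	rangelock_exit(lr);
+ */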
+
+#include <sys/zfs_context.h>
+#include <sys/avl.h>
+#include <sys/zfs_rlock.h>
+
+/*
+ * AVL comparison function used to order range locks.
+ * Locks are ordered on the start offset of the range.
+ */
+static int
+rangelock_compare(const void *arg1, const void *arg2)
+{
+ const locked_range_t *rl1 = (const locked_range_t *)arg1;
+ const locked_range_t *rl2 = (const locked_range_t *)arg2;
+
+ return (AVL_CMP(rl1->lr_offset, rl2->lr_offset));
+}
+
+/*
+ * The callback is invoked when acquiring a RL_WRITER or RL_APPEND lock.
+ * It must convert RL_APPEND to RL_WRITER (starting at the end of the file),
+ * and may increase the range that's locked for RL_WRITER.
+ */
+void
+rangelock_init(rangelock_t *rl, rangelock_cb_t *cb, void *arg)
+{
+ mutex_init(&rl->rl_lock, NULL, MUTEX_DEFAULT, NULL);
+ avl_create(&rl->rl_tree, rangelock_compare,
+ sizeof (locked_range_t), offsetof(locked_range_t, lr_node));
+ rl->rl_cb = cb;
+ rl->rl_arg = arg;
+}
+
+void
+rangelock_fini(rangelock_t *rl)
+{
+ mutex_destroy(&rl->rl_lock);
+ avl_destroy(&rl->rl_tree);
+}
+
+/*
+ * Check if a write lock can be grabbed. If not, fail immediately or sleep and
+ * recheck until available, depending on the value of the "nonblock" parameter.
+ */
+static boolean_t
+rangelock_enter_writer(rangelock_t *rl, locked_range_t *new, boolean_t nonblock)
+{
+ avl_tree_t *tree = &rl->rl_tree;
+ locked_range_t *lr;
+ avl_index_t where;
+ uint64_t orig_off = new->lr_offset;
+ uint64_t orig_len = new->lr_length;
+ rangelock_type_t orig_type = new->lr_type;
+
+ for (;;) {
+ /*
+		 * Call the callback, which can modify new->lr_offset,
+		 * lr_length and lr_type. Note, the callback is used by the
+		 * ZPL to handle appending and changing blocksizes. It isn't
+		 * needed for zvols.
+ */
+ if (rl->rl_cb != NULL) {
+ rl->rl_cb(new, rl->rl_arg);
+ }
+
+ /*
+ * If the type was APPEND, the callback must convert it to
+ * WRITER.
+ */
+ ASSERT3U(new->lr_type, ==, RL_WRITER);
+
+ /*
+ * First check for the usual case of no locks
+ */
+ if (avl_numnodes(tree) == 0) {
+ avl_add(tree, new);
+ return (B_TRUE);
+ }
+
+ /*
+ * Look for any locks in the range.
+ */
+ lr = avl_find(tree, new, &where);
+ if (lr != NULL)
+ goto wait; /* already locked at same offset */
+
+ lr = (locked_range_t *)avl_nearest(tree, where, AVL_AFTER);
+ if (lr != NULL &&
+ lr->lr_offset < new->lr_offset + new->lr_length)
+ goto wait;
+
+ lr = (locked_range_t *)avl_nearest(tree, where, AVL_BEFORE);
+ if (lr != NULL &&
+ lr->lr_offset + lr->lr_length > new->lr_offset)
+ goto wait;
+
+ avl_insert(tree, new, where);
+ return (B_TRUE);
+wait:
+ if (nonblock)
+ return (B_FALSE);
+ if (!lr->lr_write_wanted) {
+ cv_init(&lr->lr_write_cv, NULL, CV_DEFAULT, NULL);
+ lr->lr_write_wanted = B_TRUE;
+ }
+ cv_wait(&lr->lr_write_cv, &rl->rl_lock);
+
+ /* reset to original */
+ new->lr_offset = orig_off;
+ new->lr_length = orig_len;
+ new->lr_type = orig_type;
+ }
+}
+
+/*
+ * If this is an original (non-proxy) lock then replace it with
+ * a proxy and return the proxy.
+ */
+static locked_range_t *
+rangelock_proxify(avl_tree_t *tree, locked_range_t *lr)
+{
+ locked_range_t *proxy;
+
+ if (lr->lr_proxy)
+ return (lr); /* already a proxy */
+
+ ASSERT3U(lr->lr_count, ==, 1);
+ ASSERT(lr->lr_write_wanted == B_FALSE);
+ ASSERT(lr->lr_read_wanted == B_FALSE);
+ avl_remove(tree, lr);
+ lr->lr_count = 0;
+
+ /* create a proxy range lock */
+ proxy = kmem_alloc(sizeof (locked_range_t), KM_SLEEP);
+ proxy->lr_offset = lr->lr_offset;
+ proxy->lr_length = lr->lr_length;
+ proxy->lr_count = 1;
+ proxy->lr_type = RL_READER;
+ proxy->lr_proxy = B_TRUE;
+ proxy->lr_write_wanted = B_FALSE;
+ proxy->lr_read_wanted = B_FALSE;
+ avl_add(tree, proxy);
+
+ return (proxy);
+}
+
+/*
+ * Split the range lock at the supplied offset
+ * returning the *front* proxy.
+ */
+static locked_range_t *
+rangelock_split(avl_tree_t *tree, locked_range_t *lr, uint64_t off)
+{
+ ASSERT3U(lr->lr_length, >, 1);
+ ASSERT3U(off, >, lr->lr_offset);
+ ASSERT3U(off, <, lr->lr_offset + lr->lr_length);
+ ASSERT(lr->lr_write_wanted == B_FALSE);
+ ASSERT(lr->lr_read_wanted == B_FALSE);
+
+ /* create the rear proxy range lock */
+ locked_range_t *rear = kmem_alloc(sizeof (locked_range_t), KM_SLEEP);
+ rear->lr_offset = off;
+ rear->lr_length = lr->lr_offset + lr->lr_length - off;
+ rear->lr_count = lr->lr_count;
+ rear->lr_type = RL_READER;
+ rear->lr_proxy = B_TRUE;
+ rear->lr_write_wanted = B_FALSE;
+ rear->lr_read_wanted = B_FALSE;
+
+ locked_range_t *front = rangelock_proxify(tree, lr);
+ front->lr_length = off - lr->lr_offset;
+
+ avl_insert_here(tree, rear, front, AVL_AFTER);
+ return (front);
+}
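+
+/*
+ * A worked example with hypothetical numbers: splitting a reader range
+ * covering [0, 100) at off == 40 leaves a front proxy [0, 40) and inserts
+ * a rear proxy [40, 100); both halves carry the original range's
+ * reference count.
+ */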
+
+/*
+ * Create and add a new proxy range lock for the supplied range.
+ */
+static void
+rangelock_new_proxy(avl_tree_t *tree, uint64_t off, uint64_t len)
+{
+ ASSERT(len != 0);
+ locked_range_t *lr = kmem_alloc(sizeof (locked_range_t), KM_SLEEP);
+ lr->lr_offset = off;
+ lr->lr_length = len;
+ lr->lr_count = 1;
+ lr->lr_type = RL_READER;
+ lr->lr_proxy = B_TRUE;
+ lr->lr_write_wanted = B_FALSE;
+ lr->lr_read_wanted = B_FALSE;
+ avl_add(tree, lr);
+}
+
+static void
+rangelock_add_reader(avl_tree_t *tree, locked_range_t *new,
+ locked_range_t *prev, avl_index_t where)
+{
+ locked_range_t *next;
+ uint64_t off = new->lr_offset;
+ uint64_t len = new->lr_length;
+
+ /*
+ * prev arrives either:
+ * - pointing to an entry at the same offset
+ * - pointing to the entry with the closest previous offset whose
+ * range may overlap with the new range
+ * - null, if there were no ranges starting before the new one
+ */
+ if (prev != NULL) {
+ if (prev->lr_offset + prev->lr_length <= off) {
+ prev = NULL;
+ } else if (prev->lr_offset != off) {
+ /*
+ * convert to proxy if needed then
+ * split this entry and bump ref count
+ */
+ prev = rangelock_split(tree, prev, off);
+ prev = AVL_NEXT(tree, prev); /* move to rear range */
+ }
+ }
+ ASSERT((prev == NULL) || (prev->lr_offset == off));
+
+ if (prev != NULL)
+ next = prev;
+ else
+ next = avl_nearest(tree, where, AVL_AFTER);
+
+ if (next == NULL || off + len <= next->lr_offset) {
+		/* no overlaps, use the original new locked_range_t in the tree */
+ avl_insert(tree, new, where);
+ return;
+ }
+
+ if (off < next->lr_offset) {
+ /* Add a proxy for initial range before the overlap */
+ rangelock_new_proxy(tree, off, next->lr_offset - off);
+ }
+
+ new->lr_count = 0; /* will use proxies in tree */
+ /*
+ * We now search forward through the ranges, until we go past the end
+ * of the new range. For each entry we make it a proxy if it
+ * isn't already, then bump its reference count. If there's any
+ * gaps between the ranges then we create a new proxy range.
+ */
+ for (prev = NULL; next; prev = next, next = AVL_NEXT(tree, next)) {
+ if (off + len <= next->lr_offset)
+ break;
+ if (prev != NULL && prev->lr_offset + prev->lr_length <
+ next->lr_offset) {
+ /* there's a gap */
+ ASSERT3U(next->lr_offset, >,
+ prev->lr_offset + prev->lr_length);
+ rangelock_new_proxy(tree,
+ prev->lr_offset + prev->lr_length,
+ next->lr_offset -
+ (prev->lr_offset + prev->lr_length));
+ }
+ if (off + len == next->lr_offset + next->lr_length) {
+ /* exact overlap with end */
+ next = rangelock_proxify(tree, next);
+ next->lr_count++;
+ return;
+ }
+ if (off + len < next->lr_offset + next->lr_length) {
+ /* new range ends in the middle of this block */
+ next = rangelock_split(tree, next, off + len);
+ next->lr_count++;
+ return;
+ }
+ ASSERT3U(off + len, >, next->lr_offset + next->lr_length);
+ next = rangelock_proxify(tree, next);
+ next->lr_count++;
+ }
+
+ /* Add the remaining end range. */
+ rangelock_new_proxy(tree, prev->lr_offset + prev->lr_length,
+ (off + len) - (prev->lr_offset + prev->lr_length));
+}
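+
+/*
+ * A worked example with hypothetical numbers: if one reader already holds
+ * [0, 100) and a second reader asks for [50, 150), the existing lock is
+ * split into proxies [0, 50) and [50, 100), the overlapping proxy
+ * [50, 100) gets its reference count bumped to 2, and a new proxy
+ * [100, 150) with count 1 covers the remainder.
+ */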
+
+/*
+ * Check if a reader lock can be grabbed. If not, fail immediately or sleep and
+ * recheck until available, depending on the value of the "nonblock" parameter.
+ */
+static boolean_t
+rangelock_enter_reader(rangelock_t *rl, locked_range_t *new, boolean_t nonblock)
+{
+ avl_tree_t *tree = &rl->rl_tree;
+ locked_range_t *prev, *next;
+ avl_index_t where;
+ uint64_t off = new->lr_offset;
+ uint64_t len = new->lr_length;
+
+ /*
+ * Look for any writer locks in the range.
+ */
+retry:
+ prev = avl_find(tree, new, &where);
+ if (prev == NULL)
+ prev = (locked_range_t *)avl_nearest(tree, where, AVL_BEFORE);
+
+ /*
+ * Check the previous range for a writer lock overlap.
+ */
+ if (prev && (off < prev->lr_offset + prev->lr_length)) {
+ if ((prev->lr_type == RL_WRITER) || (prev->lr_write_wanted)) {
+ if (nonblock)
+ return (B_FALSE);
+ if (!prev->lr_read_wanted) {
+ cv_init(&prev->lr_read_cv,
+ NULL, CV_DEFAULT, NULL);
+ prev->lr_read_wanted = B_TRUE;
+ }
+ cv_wait(&prev->lr_read_cv, &rl->rl_lock);
+ goto retry;
+ }
+ if (off + len < prev->lr_offset + prev->lr_length)
+ goto got_lock;
+ }
+
+ /*
+	 * Search through the following ranges to see if there's
+	 * any write lock overlap.
+ */
+ if (prev != NULL)
+ next = AVL_NEXT(tree, prev);
+ else
+ next = (locked_range_t *)avl_nearest(tree, where, AVL_AFTER);
+ for (; next != NULL; next = AVL_NEXT(tree, next)) {
+ if (off + len <= next->lr_offset)
+ goto got_lock;
+ if ((next->lr_type == RL_WRITER) || (next->lr_write_wanted)) {
+ if (nonblock)
+ return (B_FALSE);
+ if (!next->lr_read_wanted) {
+ cv_init(&next->lr_read_cv,
+ NULL, CV_DEFAULT, NULL);
+ next->lr_read_wanted = B_TRUE;
+ }
+ cv_wait(&next->lr_read_cv, &rl->rl_lock);
+ goto retry;
+ }
+ if (off + len <= next->lr_offset + next->lr_length)
+ goto got_lock;
+ }
+
+got_lock:
+ /*
+ * Add the read lock, which may involve splitting existing
+	 * locks and bumping ref counts (lr_count).
+ */
+ rangelock_add_reader(tree, new, prev, where);
+ return (B_TRUE);
+}
+
+/*
+ * Lock a range (offset, length) as either shared (RL_READER) or exclusive
+ * (RL_WRITER or RL_APPEND). If RL_APPEND is specified, rl_cb() will convert
+ * it to a RL_WRITER lock (with the offset at the end of the file). Returns
+ * the range lock structure for later unlocking (or reduce range if the
+ * entire file is locked as RL_WRITER).
+ */
+static locked_range_t *
+_rangelock_enter(rangelock_t *rl, uint64_t off, uint64_t len,
+ rangelock_type_t type, boolean_t nonblock)
+{
+ ASSERT(type == RL_READER || type == RL_WRITER || type == RL_APPEND);
+
+ locked_range_t *new = kmem_alloc(sizeof (*new), KM_SLEEP);
+ new->lr_rangelock = rl;
+ new->lr_offset = off;
+ if (len + off < off) /* overflow */
+ len = UINT64_MAX - off;
+ new->lr_length = len;
+ new->lr_count = 1; /* assume it's going to be in the tree */
+ new->lr_type = type;
+ new->lr_proxy = B_FALSE;
+ new->lr_write_wanted = B_FALSE;
+ new->lr_read_wanted = B_FALSE;
+
+ mutex_enter(&rl->rl_lock);
+ if (type == RL_READER) {
+ /*
+ * First check for the usual case of no locks
+ */
+ if (avl_numnodes(&rl->rl_tree) == 0) {
+ avl_add(&rl->rl_tree, new);
+ } else if (!rangelock_enter_reader(rl, new, nonblock)) {
+ kmem_free(new, sizeof (*new));
+ new = NULL;
+ }
+ } else if (!rangelock_enter_writer(rl, new, nonblock)) {
+ kmem_free(new, sizeof (*new));
+ new = NULL;
+ }
+ mutex_exit(&rl->rl_lock);
+ return (new);
+}
+
+locked_range_t *
+rangelock_enter(rangelock_t *rl, uint64_t off, uint64_t len,
+ rangelock_type_t type)
+{
+ return (_rangelock_enter(rl, off, len, type, B_FALSE));
+}
+
+locked_range_t *
+rangelock_tryenter(rangelock_t *rl, uint64_t off, uint64_t len,
+ rangelock_type_t type)
+{
+ return (_rangelock_enter(rl, off, len, type, B_TRUE));
+}
+
+/*
+ * Unlock a reader lock
+ */
+static void
+rangelock_exit_reader(rangelock_t *rl, locked_range_t *remove)
+{
+ avl_tree_t *tree = &rl->rl_tree;
+ uint64_t len;
+
+ /*
+	 * The common case is when the remove entry is in the tree
+	 * (lr_count == 1), meaning no other reader locks have overlapped
+	 * with this one. Otherwise the remove entry will have been
+	 * removed from the tree and replaced by proxies (one or
+	 * more ranges mapping to the entire range).
+ */
+ if (remove->lr_count == 1) {
+ avl_remove(tree, remove);
+ if (remove->lr_write_wanted) {
+ cv_broadcast(&remove->lr_write_cv);
+ cv_destroy(&remove->lr_write_cv);
+ }
+ if (remove->lr_read_wanted) {
+ cv_broadcast(&remove->lr_read_cv);
+ cv_destroy(&remove->lr_read_cv);
+ }
+ } else {
+ ASSERT0(remove->lr_count);
+ ASSERT0(remove->lr_write_wanted);
+ ASSERT0(remove->lr_read_wanted);
+ /*
+ * Find start proxy representing this reader lock,
+ * then decrement ref count on all proxies
+ * that make up this range, freeing them as needed.
+ */
+ locked_range_t *lr = avl_find(tree, remove, NULL);
+ ASSERT3P(lr, !=, NULL);
+ ASSERT3U(lr->lr_count, !=, 0);
+ ASSERT3U(lr->lr_type, ==, RL_READER);
+ locked_range_t *next = NULL;
+ for (len = remove->lr_length; len != 0; lr = next) {
+ len -= lr->lr_length;
+ if (len != 0) {
+ next = AVL_NEXT(tree, lr);
+ ASSERT3P(next, !=, NULL);
+ ASSERT3U(lr->lr_offset + lr->lr_length, ==,
+ next->lr_offset);
+ ASSERT3U(next->lr_count, !=, 0);
+ ASSERT3U(next->lr_type, ==, RL_READER);
+ }
+ lr->lr_count--;
+ if (lr->lr_count == 0) {
+ avl_remove(tree, lr);
+ if (lr->lr_write_wanted) {
+ cv_broadcast(&lr->lr_write_cv);
+ cv_destroy(&lr->lr_write_cv);
+ }
+ if (lr->lr_read_wanted) {
+ cv_broadcast(&lr->lr_read_cv);
+ cv_destroy(&lr->lr_read_cv);
+ }
+ kmem_free(lr, sizeof (locked_range_t));
+ }
+ }
+ }
+ kmem_free(remove, sizeof (locked_range_t));
+}
+
+/*
+ * Unlock range and destroy range lock structure.
+ */
+void
+rangelock_exit(locked_range_t *lr)
+{
+ rangelock_t *rl = lr->lr_rangelock;
+
+ ASSERT(lr->lr_type == RL_WRITER || lr->lr_type == RL_READER);
+ ASSERT(lr->lr_count == 1 || lr->lr_count == 0);
+ ASSERT(!lr->lr_proxy);
+
+ mutex_enter(&rl->rl_lock);
+ if (lr->lr_type == RL_WRITER) {
+ /* writer locks can't be shared or split */
+ avl_remove(&rl->rl_tree, lr);
+ mutex_exit(&rl->rl_lock);
+ if (lr->lr_write_wanted) {
+ cv_broadcast(&lr->lr_write_cv);
+ cv_destroy(&lr->lr_write_cv);
+ }
+ if (lr->lr_read_wanted) {
+ cv_broadcast(&lr->lr_read_cv);
+ cv_destroy(&lr->lr_read_cv);
+ }
+ kmem_free(lr, sizeof (locked_range_t));
+ } else {
+ /*
+ * lock may be shared, let rangelock_exit_reader()
+		 * release the lock and free the locked_range_t.
+ */
+ rangelock_exit_reader(rl, lr);
+ mutex_exit(&rl->rl_lock);
+ }
+}
+
+/*
+ * Reduce range locked as RL_WRITER from whole file to specified range.
+ * Asserts the whole file is exclusively locked and so there's only one
+ * entry in the tree.
+ */
+void
+rangelock_reduce(locked_range_t *lr, uint64_t off, uint64_t len)
+{
+ rangelock_t *rl = lr->lr_rangelock;
+
+ /* Ensure there are no other locks */
+ ASSERT3U(avl_numnodes(&rl->rl_tree), ==, 1);
+ ASSERT3U(lr->lr_offset, ==, 0);
+ ASSERT3U(lr->lr_type, ==, RL_WRITER);
+ ASSERT(!lr->lr_proxy);
+ ASSERT3U(lr->lr_length, ==, UINT64_MAX);
+ ASSERT3U(lr->lr_count, ==, 1);
+
+ mutex_enter(&rl->rl_lock);
+ lr->lr_offset = off;
+ lr->lr_length = len;
+ mutex_exit(&rl->rl_lock);
+ if (lr->lr_write_wanted)
+ cv_broadcast(&lr->lr_write_cv);
+ if (lr->lr_read_wanted)
+ cv_broadcast(&lr->lr_read_cv);
+}
diff --git a/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/zfs_sa.c b/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/zfs_sa.c
new file mode 100644
index 000000000000..d12a70d74338
--- /dev/null
+++ b/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/zfs_sa.c
@@ -0,0 +1,326 @@
+/*
+ * CDDL HEADER START
+ *
+ * The contents of this file are subject to the terms of the
+ * Common Development and Distribution License (the "License").
+ * You may not use this file except in compliance with the License.
+ *
+ * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
+ * or http://www.opensolaris.org/os/licensing.
+ * See the License for the specific language governing permissions
+ * and limitations under the License.
+ *
+ * When distributing Covered Code, include this CDDL HEADER in each
+ * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
+ * If applicable, add the following below this CDDL HEADER, with the
+ * fields enclosed by brackets "[]" replaced with your own identifying
+ * information: Portions Copyright [yyyy] [name of copyright owner]
+ *
+ * CDDL HEADER END
+ */
+/*
+ * Copyright (c) 2010, Oracle and/or its affiliates. All rights reserved.
+ */
+
+#include <sys/zfs_context.h>
+#include <sys/vnode.h>
+#include <sys/sa.h>
+#include <sys/zfs_acl.h>
+#include <sys/zfs_sa.h>
+
+/*
+ * ZPL attribute registration table.
+ * Order of attributes doesn't matter; a unique value will be
+ * assigned for each attribute that is file system specific.
+ *
+ * This is just the set of ZPL attributes that this
+ * version of ZFS deals with natively. The file system
+ * could have other attributes stored in files, but they will be
+ * ignored. The SA framework will preserve them; this version of
+ * ZFS just won't change or delete them.
+ */
+
+sa_attr_reg_t zfs_attr_table[ZPL_END+1] = {
+ {"ZPL_ATIME", sizeof (uint64_t) * 2, SA_UINT64_ARRAY, 0},
+ {"ZPL_MTIME", sizeof (uint64_t) * 2, SA_UINT64_ARRAY, 1},
+ {"ZPL_CTIME", sizeof (uint64_t) * 2, SA_UINT64_ARRAY, 2},
+ {"ZPL_CRTIME", sizeof (uint64_t) * 2, SA_UINT64_ARRAY, 3},
+ {"ZPL_GEN", sizeof (uint64_t), SA_UINT64_ARRAY, 4},
+ {"ZPL_MODE", sizeof (uint64_t), SA_UINT64_ARRAY, 5},
+ {"ZPL_SIZE", sizeof (uint64_t), SA_UINT64_ARRAY, 6},
+ {"ZPL_PARENT", sizeof (uint64_t), SA_UINT64_ARRAY, 7},
+ {"ZPL_LINKS", sizeof (uint64_t), SA_UINT64_ARRAY, 8},
+ {"ZPL_XATTR", sizeof (uint64_t), SA_UINT64_ARRAY, 9},
+ {"ZPL_RDEV", sizeof (uint64_t), SA_UINT64_ARRAY, 10},
+ {"ZPL_FLAGS", sizeof (uint64_t), SA_UINT64_ARRAY, 11},
+ {"ZPL_UID", sizeof (uint64_t), SA_UINT64_ARRAY, 12},
+ {"ZPL_GID", sizeof (uint64_t), SA_UINT64_ARRAY, 13},
+ {"ZPL_PAD", sizeof (uint64_t) * 4, SA_UINT64_ARRAY, 14},
+ {"ZPL_ZNODE_ACL", 88, SA_UINT8_ARRAY, 15},
+ {"ZPL_DACL_COUNT", sizeof (uint64_t), SA_UINT64_ARRAY, 0},
+ {"ZPL_SYMLINK", 0, SA_UINT8_ARRAY, 0},
+ {"ZPL_SCANSTAMP", 32, SA_UINT8_ARRAY, 0},
+ {"ZPL_DACL_ACES", 0, SA_ACL, 0},
+ {NULL, 0, 0, 0}
+};
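+
+/*
+ * A minimal lookup sketch (hypothetical caller; SA_ZPL_SIZE() maps the
+ * registration above to this file system's attribute number):
+ *
+ *	uint64_t size;
+ *
+ *	error = sa_lookup(zp->z_sa_hdl, SA_ZPL_SIZE(zfsvfs),
+ *	    &size, sizeof (size));
+ */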
+
+#ifdef _KERNEL
+
+int
+zfs_sa_readlink(znode_t *zp, uio_t *uio)
+{
+ dmu_buf_t *db = sa_get_db(zp->z_sa_hdl);
+ size_t bufsz;
+ int error;
+
+ bufsz = zp->z_size;
+ if (bufsz + ZFS_OLD_ZNODE_PHYS_SIZE <= db->db_size) {
+ error = uiomove((caddr_t)db->db_data +
+ ZFS_OLD_ZNODE_PHYS_SIZE,
+ MIN((size_t)bufsz, uio->uio_resid), UIO_READ, uio);
+ } else {
+ dmu_buf_t *dbp;
+ if ((error = dmu_buf_hold(zp->z_zfsvfs->z_os, zp->z_id,
+ 0, FTAG, &dbp, DMU_READ_NO_PREFETCH)) == 0) {
+ error = uiomove(dbp->db_data,
+ MIN((size_t)bufsz, uio->uio_resid), UIO_READ, uio);
+ dmu_buf_rele(dbp, FTAG);
+ }
+ }
+ return (error);
+}
+
+void
+zfs_sa_symlink(znode_t *zp, char *link, int len, dmu_tx_t *tx)
+{
+ dmu_buf_t *db = sa_get_db(zp->z_sa_hdl);
+
+ if (ZFS_OLD_ZNODE_PHYS_SIZE + len <= dmu_bonus_max()) {
+ VERIFY0(dmu_set_bonus(db, len + ZFS_OLD_ZNODE_PHYS_SIZE, tx));
+ if (len) {
+ bcopy(link, (caddr_t)db->db_data +
+ ZFS_OLD_ZNODE_PHYS_SIZE, len);
+ }
+ } else {
+ dmu_buf_t *dbp;
+
+ zfs_grow_blocksize(zp, len, tx);
+ VERIFY(0 == dmu_buf_hold(zp->z_zfsvfs->z_os,
+ zp->z_id, 0, FTAG, &dbp, DMU_READ_NO_PREFETCH));
+
+ dmu_buf_will_dirty(dbp, tx);
+
+ ASSERT3U(len, <=, dbp->db_size);
+ bcopy(link, dbp->db_data, len);
+ dmu_buf_rele(dbp, FTAG);
+ }
+}
+
+void
+zfs_sa_get_scanstamp(znode_t *zp, xvattr_t *xvap)
+{
+ zfsvfs_t *zfsvfs = zp->z_zfsvfs;
+ xoptattr_t *xoap;
+
+ ASSERT_VOP_LOCKED(ZTOV(zp), __func__);
+ VERIFY((xoap = xva_getxoptattr(xvap)) != NULL);
+ if (zp->z_is_sa) {
+ if (sa_lookup(zp->z_sa_hdl, SA_ZPL_SCANSTAMP(zfsvfs),
+ &xoap->xoa_av_scanstamp,
+ sizeof (xoap->xoa_av_scanstamp)) != 0)
+ return;
+ } else {
+ dmu_object_info_t doi;
+ dmu_buf_t *db = sa_get_db(zp->z_sa_hdl);
+ int len;
+
+ if (!(zp->z_pflags & ZFS_BONUS_SCANSTAMP))
+ return;
+
+ sa_object_info(zp->z_sa_hdl, &doi);
+ len = sizeof (xoap->xoa_av_scanstamp) +
+ ZFS_OLD_ZNODE_PHYS_SIZE;
+
+ if (len <= doi.doi_bonus_size) {
+ (void) memcpy(xoap->xoa_av_scanstamp,
+ (caddr_t)db->db_data + ZFS_OLD_ZNODE_PHYS_SIZE,
+ sizeof (xoap->xoa_av_scanstamp));
+ }
+ }
+ XVA_SET_RTN(xvap, XAT_AV_SCANSTAMP);
+}
+
+void
+zfs_sa_set_scanstamp(znode_t *zp, xvattr_t *xvap, dmu_tx_t *tx)
+{
+ zfsvfs_t *zfsvfs = zp->z_zfsvfs;
+ xoptattr_t *xoap;
+
+ ASSERT_VOP_ELOCKED(ZTOV(zp), __func__);
+ VERIFY((xoap = xva_getxoptattr(xvap)) != NULL);
+ if (zp->z_is_sa)
+ VERIFY(0 == sa_update(zp->z_sa_hdl, SA_ZPL_SCANSTAMP(zfsvfs),
+ &xoap->xoa_av_scanstamp,
+ sizeof (xoap->xoa_av_scanstamp), tx));
+ else {
+ dmu_object_info_t doi;
+ dmu_buf_t *db = sa_get_db(zp->z_sa_hdl);
+ int len;
+
+ sa_object_info(zp->z_sa_hdl, &doi);
+ len = sizeof (xoap->xoa_av_scanstamp) +
+ ZFS_OLD_ZNODE_PHYS_SIZE;
+ if (len > doi.doi_bonus_size)
+ VERIFY(dmu_set_bonus(db, len, tx) == 0);
+ (void) memcpy((caddr_t)db->db_data + ZFS_OLD_ZNODE_PHYS_SIZE,
+ xoap->xoa_av_scanstamp, sizeof (xoap->xoa_av_scanstamp));
+
+ zp->z_pflags |= ZFS_BONUS_SCANSTAMP;
+ VERIFY(0 == sa_update(zp->z_sa_hdl, SA_ZPL_FLAGS(zfsvfs),
+ &zp->z_pflags, sizeof (uint64_t), tx));
+ }
+}
+
+/*
+ * I'm not convinced we should do any of this upgrade, since the
+ * SA code can read both old and new znode formats with probably
+ * little to no performance difference.
+ *
+ * All new files will be created with the new format.
+ */
+
+void
+zfs_sa_upgrade(sa_handle_t *hdl, dmu_tx_t *tx)
+{
+ dmu_buf_t *db = sa_get_db(hdl);
+ znode_t *zp = sa_get_userdata(hdl);
+ zfsvfs_t *zfsvfs = zp->z_zfsvfs;
+ sa_bulk_attr_t bulk[20];
+ int count = 0;
+ sa_bulk_attr_t sa_attrs[20] = { 0 };
+ zfs_acl_locator_cb_t locate = { 0 };
+ uint64_t uid, gid, mode, rdev, xattr, parent;
+ uint64_t crtime[2], mtime[2], ctime[2];
+ zfs_acl_phys_t znode_acl;
+ char scanstamp[AV_SCANSTAMP_SZ];
+
+ /*
+	 * No upgrade if the ACL isn't cached, since we won't know
+	 * which locks are held, and reading the ACL would require
+	 * special "locked" interfaces that would be messy.
+ */
+ if (zp->z_acl_cached == NULL || ZTOV(zp)->v_type == VLNK)
+ return;
+
+ /*
+ * If the vnode lock is held and we aren't the owner
+ * then just return since we don't want to deadlock
+ * trying to update the status of z_is_sa. This
+ * file can then be upgraded at a later time.
+ *
+ * Otherwise, we know we are doing the
+ * sa_update() that caused us to enter this function.
+ */
+ if (vn_lock(ZTOV(zp), LK_EXCLUSIVE | LK_NOWAIT) != 0)
+ return;
+
+ /* First do a bulk query of the attributes that aren't cached */
+ SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_MTIME(zfsvfs), NULL, &mtime, 16);
+ SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_CTIME(zfsvfs), NULL, &ctime, 16);
+ SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_CRTIME(zfsvfs), NULL, &crtime, 16);
+ SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_MODE(zfsvfs), NULL, &mode, 8);
+ SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_PARENT(zfsvfs), NULL, &parent, 8);
+ SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_XATTR(zfsvfs), NULL, &xattr, 8);
+ SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_RDEV(zfsvfs), NULL, &rdev, 8);
+ SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_UID(zfsvfs), NULL, &uid, 8);
+ SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_GID(zfsvfs), NULL, &gid, 8);
+ SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_ZNODE_ACL(zfsvfs), NULL,
+ &znode_acl, 88);
+
+ if (sa_bulk_lookup_locked(hdl, bulk, count) != 0)
+ goto done;
+
+ /*
+	 * While the order here doesn't matter, it's best to try and
+	 * organize it in such a way as to pick up an already existing
+	 * layout number.
+ */
+ count = 0;
+ SA_ADD_BULK_ATTR(sa_attrs, count, SA_ZPL_MODE(zfsvfs), NULL, &mode, 8);
+ SA_ADD_BULK_ATTR(sa_attrs, count, SA_ZPL_SIZE(zfsvfs), NULL,
+ &zp->z_size, 8);
+ SA_ADD_BULK_ATTR(sa_attrs, count, SA_ZPL_GEN(zfsvfs),
+ NULL, &zp->z_gen, 8);
+ SA_ADD_BULK_ATTR(sa_attrs, count, SA_ZPL_UID(zfsvfs), NULL, &uid, 8);
+ SA_ADD_BULK_ATTR(sa_attrs, count, SA_ZPL_GID(zfsvfs), NULL, &gid, 8);
+ SA_ADD_BULK_ATTR(sa_attrs, count, SA_ZPL_PARENT(zfsvfs),
+ NULL, &parent, 8);
+ SA_ADD_BULK_ATTR(sa_attrs, count, SA_ZPL_FLAGS(zfsvfs), NULL,
+ &zp->z_pflags, 8);
+ SA_ADD_BULK_ATTR(sa_attrs, count, SA_ZPL_ATIME(zfsvfs), NULL,
+ zp->z_atime, 16);
+ SA_ADD_BULK_ATTR(sa_attrs, count, SA_ZPL_MTIME(zfsvfs), NULL,
+ &mtime, 16);
+ SA_ADD_BULK_ATTR(sa_attrs, count, SA_ZPL_CTIME(zfsvfs), NULL,
+ &ctime, 16);
+ SA_ADD_BULK_ATTR(sa_attrs, count, SA_ZPL_CRTIME(zfsvfs), NULL,
+ &crtime, 16);
+ SA_ADD_BULK_ATTR(sa_attrs, count, SA_ZPL_LINKS(zfsvfs), NULL,
+ &zp->z_links, 8);
+ if (zp->z_vnode->v_type == VBLK || zp->z_vnode->v_type == VCHR)
+ SA_ADD_BULK_ATTR(sa_attrs, count, SA_ZPL_RDEV(zfsvfs), NULL,
+ &rdev, 8);
+ SA_ADD_BULK_ATTR(sa_attrs, count, SA_ZPL_DACL_COUNT(zfsvfs), NULL,
+ &zp->z_acl_cached->z_acl_count, 8);
+
+ if (zp->z_acl_cached->z_version < ZFS_ACL_VERSION_FUID)
+ zfs_acl_xform(zp, zp->z_acl_cached, CRED());
+
+ locate.cb_aclp = zp->z_acl_cached;
+ SA_ADD_BULK_ATTR(sa_attrs, count, SA_ZPL_DACL_ACES(zfsvfs),
+ zfs_acl_data_locator, &locate, zp->z_acl_cached->z_acl_bytes);
+
+ if (xattr)
+ SA_ADD_BULK_ATTR(sa_attrs, count, SA_ZPL_XATTR(zfsvfs),
+ NULL, &xattr, 8);
+
+ /* if scanstamp then add scanstamp */
+
+ if (zp->z_pflags & ZFS_BONUS_SCANSTAMP) {
+ bcopy((caddr_t)db->db_data + ZFS_OLD_ZNODE_PHYS_SIZE,
+ scanstamp, AV_SCANSTAMP_SZ);
+ SA_ADD_BULK_ATTR(sa_attrs, count, SA_ZPL_SCANSTAMP(zfsvfs),
+ NULL, scanstamp, AV_SCANSTAMP_SZ);
+ zp->z_pflags &= ~ZFS_BONUS_SCANSTAMP;
+ }
+
+ VERIFY(dmu_set_bonustype(db, DMU_OT_SA, tx) == 0);
+ VERIFY(sa_replace_all_by_template_locked(hdl, sa_attrs,
+ count, tx) == 0);
+ if (znode_acl.z_acl_extern_obj)
+ VERIFY(0 == dmu_object_free(zfsvfs->z_os,
+ znode_acl.z_acl_extern_obj, tx));
+
+ zp->z_is_sa = B_TRUE;
+done:
+ VOP_UNLOCK(ZTOV(zp));
+}
+
+void
+zfs_sa_upgrade_txholds(dmu_tx_t *tx, znode_t *zp)
+{
+ if (!zp->z_zfsvfs->z_use_sa || zp->z_is_sa)
+ return;
+
+ dmu_tx_hold_sa(tx, zp->z_sa_hdl, B_TRUE);
+
+ if (zfs_external_acl(zp)) {
+ dmu_tx_hold_free(tx, zfs_external_acl(zp), 0,
+ DMU_OBJECT_END);
+ }
+}
+
+#endif
diff --git a/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/zfs_vfsops.c b/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/zfs_vfsops.c
new file mode 100644
index 000000000000..1ce186d0862e
--- /dev/null
+++ b/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/zfs_vfsops.c
@@ -0,0 +1,2813 @@
+/*
+ * CDDL HEADER START
+ *
+ * The contents of this file are subject to the terms of the
+ * Common Development and Distribution License (the "License").
+ * You may not use this file except in compliance with the License.
+ *
+ * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
+ * or http://www.opensolaris.org/os/licensing.
+ * See the License for the specific language governing permissions
+ * and limitations under the License.
+ *
+ * When distributing Covered Code, include this CDDL HEADER in each
+ * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
+ * If applicable, add the following below this CDDL HEADER, with the
+ * fields enclosed by brackets "[]" replaced with your own identifying
+ * information: Portions Copyright [yyyy] [name of copyright owner]
+ *
+ * CDDL HEADER END
+ */
+/*
+ * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved.
+ * Copyright (c) 2011 Pawel Jakub Dawidek <pawel@dawidek.net>.
+ * All rights reserved.
+ * Copyright (c) 2012, 2015 by Delphix. All rights reserved.
+ * Copyright (c) 2014 Integros [integros.com]
+ * Copyright 2016 Nexenta Systems, Inc. All rights reserved.
+ */
+
+/* Portions Copyright 2010 Robert Milkowski */
+
+#include <sys/types.h>
+#include <sys/param.h>
+#include <sys/systm.h>
+#include <sys/kernel.h>
+#include <sys/sysmacros.h>
+#include <sys/kmem.h>
+#include <sys/acl.h>
+#include <sys/vnode.h>
+#include <sys/vfs.h>
+#include <sys/mntent.h>
+#include <sys/mount.h>
+#include <sys/cmn_err.h>
+#include <sys/zfs_znode.h>
+#include <sys/zfs_dir.h>
+#include <sys/zil.h>
+#include <sys/fs/zfs.h>
+#include <sys/dmu.h>
+#include <sys/dsl_prop.h>
+#include <sys/dsl_dataset.h>
+#include <sys/dsl_deleg.h>
+#include <sys/spa.h>
+#include <sys/zap.h>
+#include <sys/sa.h>
+#include <sys/sa_impl.h>
+#include <sys/varargs.h>
+#include <sys/policy.h>
+#include <sys/atomic.h>
+#include <sys/zfs_ioctl.h>
+#include <sys/zfs_ctldir.h>
+#include <sys/zfs_fuid.h>
+#include <sys/sunddi.h>
+#include <sys/dnlc.h>
+#include <sys/dmu_objset.h>
+#include <sys/spa_boot.h>
+#include <sys/jail.h>
+#include <ufs/ufs/quota.h>
+
+#include "zfs_comutil.h"
+
+struct mtx zfs_debug_mtx;
+MTX_SYSINIT(zfs_debug_mtx, &zfs_debug_mtx, "zfs_debug", MTX_DEF);
+
+SYSCTL_NODE(_vfs, OID_AUTO, zfs, CTLFLAG_RW | CTLFLAG_MPSAFE, 0,
+ "ZFS file system");
+
+int zfs_super_owner;
+SYSCTL_INT(_vfs_zfs, OID_AUTO, super_owner, CTLFLAG_RW, &zfs_super_owner, 0,
+    "File system owner can perform privileged operations on their file systems");
+
+int zfs_debug_level;
+SYSCTL_INT(_vfs_zfs, OID_AUTO, debug, CTLFLAG_RWTUN, &zfs_debug_level, 0,
+ "Debug level");
+
+SYSCTL_NODE(_vfs_zfs, OID_AUTO, version, CTLFLAG_RD | CTLFLAG_MPSAFE, 0,
+ "ZFS versions");
+static int zfs_version_acl = ZFS_ACL_VERSION;
+SYSCTL_INT(_vfs_zfs_version, OID_AUTO, acl, CTLFLAG_RD, &zfs_version_acl, 0,
+ "ZFS_ACL_VERSION");
+static int zfs_version_spa = SPA_VERSION;
+SYSCTL_INT(_vfs_zfs_version, OID_AUTO, spa, CTLFLAG_RD, &zfs_version_spa, 0,
+ "SPA_VERSION");
+static int zfs_version_zpl = ZPL_VERSION;
+SYSCTL_INT(_vfs_zfs_version, OID_AUTO, zpl, CTLFLAG_RD, &zfs_version_zpl, 0,
+ "ZPL_VERSION");
+
+static int zfs_quotactl(vfs_t *vfsp, int cmds, uid_t id, void *arg);
+static int zfs_mount(vfs_t *vfsp);
+static int zfs_umount(vfs_t *vfsp, int fflag);
+static int zfs_root(vfs_t *vfsp, int flags, vnode_t **vpp);
+static int zfs_statfs(vfs_t *vfsp, struct statfs *statp);
+static int zfs_vget(vfs_t *vfsp, ino_t ino, int flags, vnode_t **vpp);
+static int zfs_sync(vfs_t *vfsp, int waitfor);
+static int zfs_checkexp(vfs_t *vfsp, struct sockaddr *nam, uint64_t *extflagsp,
+ struct ucred **credanonp, int *numsecflavors, int *secflavors);
+static int zfs_fhtovp(vfs_t *vfsp, fid_t *fidp, int flags, vnode_t **vpp);
+static void zfs_objset_close(zfsvfs_t *zfsvfs);
+static void zfs_freevfs(vfs_t *vfsp);
+
+struct vfsops zfs_vfsops = {
+ .vfs_mount = zfs_mount,
+ .vfs_unmount = zfs_umount,
+ .vfs_root = vfs_cache_root,
+ .vfs_cachedroot = zfs_root,
+ .vfs_statfs = zfs_statfs,
+ .vfs_vget = zfs_vget,
+ .vfs_sync = zfs_sync,
+ .vfs_checkexp = zfs_checkexp,
+ .vfs_fhtovp = zfs_fhtovp,
+ .vfs_quotactl = zfs_quotactl,
+};
+
+VFS_SET(zfs_vfsops, zfs, VFCF_JAIL | VFCF_DELEGADMIN);
+
+/*
+ * We need to keep a count of active fs's.
+ * This is necessary to prevent our module
+ * from being unloaded after a umount -f.
+ */
+static uint32_t zfs_active_fs_count = 0;
+
+static int
+zfs_getquota(zfsvfs_t *zfsvfs, uid_t id, int isgroup, struct dqblk64 *dqp)
+{
+ int error = 0;
+ char buf[32];
+ int err;
+ uint64_t usedobj, quotaobj;
+ uint64_t quota, used = 0;
+ timespec_t now;
+
+ usedobj = isgroup ? DMU_GROUPUSED_OBJECT : DMU_USERUSED_OBJECT;
+ quotaobj = isgroup ? zfsvfs->z_groupquota_obj : zfsvfs->z_userquota_obj;
+
+ if (quotaobj == 0 || zfsvfs->z_replay) {
+ error = EINVAL;
+ goto done;
+ }
+ (void)sprintf(buf, "%llx", (longlong_t)id);
+ if ((error = zap_lookup(zfsvfs->z_os, quotaobj,
+ buf, sizeof(quota), 1, &quota)) != 0) {
+ dprintf("%s(%d): quotaobj lookup failed\n", __FUNCTION__, __LINE__);
+ goto done;
+ }
+ /*
+	 * quota(8) uses bsoftlimit as "quota", and hardlimit as "limit".
+ * So we set them to be the same.
+ */
+ dqp->dqb_bsoftlimit = dqp->dqb_bhardlimit = btodb(quota);
+ error = zap_lookup(zfsvfs->z_os, usedobj, buf, sizeof(used), 1, &used);
+ if (error && error != ENOENT) {
+ dprintf("%s(%d): usedobj failed; %d\n", __FUNCTION__, __LINE__, error);
+ goto done;
+ }
+ dqp->dqb_curblocks = btodb(used);
+ dqp->dqb_ihardlimit = dqp->dqb_isoftlimit = 0;
+ vfs_timestamp(&now);
+ /*
+ * Setting this to 0 causes FreeBSD quota(8) to print
+ * the number of days since the epoch, which isn't
+ * particularly useful.
+ */
+ dqp->dqb_btime = dqp->dqb_itime = now.tv_sec;
+done:
+ return (error);
+}
+
+static int
+zfs_quotactl(vfs_t *vfsp, int cmds, uid_t id, void *arg)
+{
+ zfsvfs_t *zfsvfs = vfsp->vfs_data;
+ struct thread *td;
+ int cmd, type, error = 0;
+ int bitsize;
+ uint64_t fuid;
+ zfs_userquota_prop_t quota_type;
+ struct dqblk64 dqblk = { 0 };
+
+ td = curthread;
+ cmd = cmds >> SUBCMDSHIFT;
+ type = cmds & SUBCMDMASK;
+
+ ZFS_ENTER(zfsvfs);
+ if (id == -1) {
+ switch (type) {
+ case USRQUOTA:
+ id = td->td_ucred->cr_ruid;
+ break;
+ case GRPQUOTA:
+ id = td->td_ucred->cr_rgid;
+ break;
+ default:
+ error = EINVAL;
+ if (cmd == Q_QUOTAON || cmd == Q_QUOTAOFF)
+ vfs_unbusy(vfsp);
+ goto done;
+ }
+ }
+ /*
+ * Map BSD type to:
+ * ZFS_PROP_USERUSED,
+ * ZFS_PROP_USERQUOTA,
+ * ZFS_PROP_GROUPUSED,
+ * ZFS_PROP_GROUPQUOTA
+ */
+ switch (cmd) {
+ case Q_SETQUOTA:
+ case Q_SETQUOTA32:
+ if (type == USRQUOTA)
+ quota_type = ZFS_PROP_USERQUOTA;
+ else if (type == GRPQUOTA)
+ quota_type = ZFS_PROP_GROUPQUOTA;
+ else
+ error = EINVAL;
+ break;
+ case Q_GETQUOTA:
+ case Q_GETQUOTA32:
+ if (type == USRQUOTA)
+ quota_type = ZFS_PROP_USERUSED;
+ else if (type == GRPQUOTA)
+ quota_type = ZFS_PROP_GROUPUSED;
+ else
+ error = EINVAL;
+ break;
+ }
+
+ /*
+ * Depending on the cmd, we may need to get
+ * the ruid and domain (see fuidstr_to_sid?),
+ * the fuid (how?), or other information.
+ * Create fuid using zfs_fuid_create(zfsvfs, id,
+ * ZFS_OWNER or ZFS_GROUP, cr, &fuidp)?
+ * I think I can use just the id?
+ *
+ * Look at zfs_fuid_overquota() to look up a quota.
+ * zap_lookup(something, quotaobj, fuidstring, sizeof(long long), 1, &quota)
+ *
+ * See zfs_set_userquota() to set a quota.
+ */
+ if ((u_int)type >= MAXQUOTAS) {
+ error = EINVAL;
+ goto done;
+ }
+
+ switch (cmd) {
+ case Q_GETQUOTASIZE:
+ bitsize = 64;
+ error = copyout(&bitsize, arg, sizeof(int));
+ break;
+ case Q_QUOTAON:
+		/* As far as I can tell, you can't turn quotas on or off on ZFS. */
+ error = 0;
+ vfs_unbusy(vfsp);
+ break;
+ case Q_QUOTAOFF:
+ error = ENOTSUP;
+ vfs_unbusy(vfsp);
+ break;
+ case Q_SETQUOTA:
+ error = copyin(arg, &dqblk, sizeof(dqblk));
+ if (error == 0)
+ error = zfs_set_userquota(zfsvfs, quota_type,
+ "", id, dbtob(dqblk.dqb_bhardlimit));
+ break;
+ case Q_GETQUOTA:
+ error = zfs_getquota(zfsvfs, id, type == GRPQUOTA, &dqblk);
+ if (error == 0)
+ error = copyout(&dqblk, arg, sizeof(dqblk));
+ break;
+ default:
+ error = EINVAL;
+ break;
+ }
+done:
+ ZFS_EXIT(zfsvfs);
+ return (error);
+}
+
+/*ARGSUSED*/
+static int
+zfs_sync(vfs_t *vfsp, int waitfor)
+{
+
+ /*
+ * Data integrity is job one. We don't want a compromised kernel
+ * writing to the storage pool, so we never sync during panic.
+ */
+ if (KERNEL_PANICKED())
+ return (0);
+
+ /*
+ * Ignore the system syncher. ZFS already commits async data
+ * at zfs_txg_timeout intervals.
+ */
+ if (waitfor == MNT_LAZY)
+ return (0);
+
+ if (vfsp != NULL) {
+ /*
+ * Sync a specific filesystem.
+ */
+ zfsvfs_t *zfsvfs = vfsp->vfs_data;
+ dsl_pool_t *dp;
+ int error;
+
+ error = vfs_stdsync(vfsp, waitfor);
+ if (error != 0)
+ return (error);
+
+ ZFS_ENTER(zfsvfs);
+ dp = dmu_objset_pool(zfsvfs->z_os);
+
+ /*
+ * If the system is shutting down, then skip any
+ * filesystems which may exist on a suspended pool.
+ */
+ if (sys_shutdown && spa_suspended(dp->dp_spa)) {
+ ZFS_EXIT(zfsvfs);
+ return (0);
+ }
+
+ if (zfsvfs->z_log != NULL)
+ zil_commit(zfsvfs->z_log, 0);
+
+ ZFS_EXIT(zfsvfs);
+ } else {
+ /*
+ * Sync all ZFS filesystems. This is what happens when you
+ * run sync(1M). Unlike other filesystems, ZFS honors the
+ * request by waiting for all pools to commit all dirty data.
+ */
+ spa_sync_allpools();
+ }
+
+ return (0);
+}
+
+#ifndef __FreeBSD_kernel__
+static int
+zfs_create_unique_device(dev_t *dev)
+{
+ major_t new_major;
+
+ do {
+ ASSERT3U(zfs_minor, <=, MAXMIN32);
+ minor_t start = zfs_minor;
+ do {
+ mutex_enter(&zfs_dev_mtx);
+ if (zfs_minor >= MAXMIN32) {
+ /*
+ * If we're still using the real major
+ * keep out of /dev/zfs and /dev/zvol minor
+ * number space. If we're using a getudev()'ed
+ * major number, we can use all of its minors.
+ */
+ if (zfs_major == ddi_name_to_major(ZFS_DRIVER))
+ zfs_minor = ZFS_MIN_MINOR;
+ else
+ zfs_minor = 0;
+ } else {
+ zfs_minor++;
+ }
+ *dev = makedevice(zfs_major, zfs_minor);
+ mutex_exit(&zfs_dev_mtx);
+ } while (vfs_devismounted(*dev) && zfs_minor != start);
+ if (zfs_minor == start) {
+ /*
+ * We are using all ~262,000 minor numbers for the
+ * current major number. Create a new major number.
+ */
+ if ((new_major = getudev()) == (major_t)-1) {
+ cmn_err(CE_WARN,
+ "zfs_mount: Can't get unique major "
+ "device number.");
+ return (-1);
+ }
+ mutex_enter(&zfs_dev_mtx);
+ zfs_major = new_major;
+ zfs_minor = 0;
+
+ mutex_exit(&zfs_dev_mtx);
+ } else {
+ break;
+ }
+ /* CONSTANTCONDITION */
+ } while (1);
+
+ return (0);
+}
+#endif /* !__FreeBSD_kernel__ */
+
+static void
+atime_changed_cb(void *arg, uint64_t newval)
+{
+ zfsvfs_t *zfsvfs = arg;
+
+ if (newval == TRUE) {
+ zfsvfs->z_atime = TRUE;
+ zfsvfs->z_vfs->vfs_flag &= ~MNT_NOATIME;
+ vfs_clearmntopt(zfsvfs->z_vfs, MNTOPT_NOATIME);
+ vfs_setmntopt(zfsvfs->z_vfs, MNTOPT_ATIME, NULL, 0);
+ } else {
+ zfsvfs->z_atime = FALSE;
+ zfsvfs->z_vfs->vfs_flag |= MNT_NOATIME;
+ vfs_clearmntopt(zfsvfs->z_vfs, MNTOPT_ATIME);
+ vfs_setmntopt(zfsvfs->z_vfs, MNTOPT_NOATIME, NULL, 0);
+ }
+}
+
+static void
+xattr_changed_cb(void *arg, uint64_t newval)
+{
+ zfsvfs_t *zfsvfs = arg;
+
+ if (newval == TRUE) {
+ /* XXX locking on vfs_flag? */
+#ifdef TODO
+ zfsvfs->z_vfs->vfs_flag |= VFS_XATTR;
+#endif
+ vfs_clearmntopt(zfsvfs->z_vfs, MNTOPT_NOXATTR);
+ vfs_setmntopt(zfsvfs->z_vfs, MNTOPT_XATTR, NULL, 0);
+ } else {
+ /* XXX locking on vfs_flag? */
+#ifdef TODO
+ zfsvfs->z_vfs->vfs_flag &= ~VFS_XATTR;
+#endif
+ vfs_clearmntopt(zfsvfs->z_vfs, MNTOPT_XATTR);
+ vfs_setmntopt(zfsvfs->z_vfs, MNTOPT_NOXATTR, NULL, 0);
+ }
+}
+
+static void
+blksz_changed_cb(void *arg, uint64_t newval)
+{
+ zfsvfs_t *zfsvfs = arg;
+ ASSERT3U(newval, <=, spa_maxblocksize(dmu_objset_spa(zfsvfs->z_os)));
+ ASSERT3U(newval, >=, SPA_MINBLOCKSIZE);
+ ASSERT(ISP2(newval));
+
+ zfsvfs->z_max_blksz = newval;
+ zfsvfs->z_vfs->mnt_stat.f_iosize = newval;
+}
+
+static void
+readonly_changed_cb(void *arg, uint64_t newval)
+{
+ zfsvfs_t *zfsvfs = arg;
+
+ if (newval) {
+ /* XXX locking on vfs_flag? */
+ zfsvfs->z_vfs->vfs_flag |= VFS_RDONLY;
+ vfs_clearmntopt(zfsvfs->z_vfs, MNTOPT_RW);
+ vfs_setmntopt(zfsvfs->z_vfs, MNTOPT_RO, NULL, 0);
+ } else {
+ /* XXX locking on vfs_flag? */
+ zfsvfs->z_vfs->vfs_flag &= ~VFS_RDONLY;
+ vfs_clearmntopt(zfsvfs->z_vfs, MNTOPT_RO);
+ vfs_setmntopt(zfsvfs->z_vfs, MNTOPT_RW, NULL, 0);
+ }
+}
+
+static void
+setuid_changed_cb(void *arg, uint64_t newval)
+{
+ zfsvfs_t *zfsvfs = arg;
+
+ if (newval == FALSE) {
+ zfsvfs->z_vfs->vfs_flag |= VFS_NOSETUID;
+ vfs_clearmntopt(zfsvfs->z_vfs, MNTOPT_SETUID);
+ vfs_setmntopt(zfsvfs->z_vfs, MNTOPT_NOSETUID, NULL, 0);
+ } else {
+ zfsvfs->z_vfs->vfs_flag &= ~VFS_NOSETUID;
+ vfs_clearmntopt(zfsvfs->z_vfs, MNTOPT_NOSETUID);
+ vfs_setmntopt(zfsvfs->z_vfs, MNTOPT_SETUID, NULL, 0);
+ }
+}
+
+static void
+exec_changed_cb(void *arg, uint64_t newval)
+{
+ zfsvfs_t *zfsvfs = arg;
+
+ if (newval == FALSE) {
+ zfsvfs->z_vfs->vfs_flag |= VFS_NOEXEC;
+ vfs_clearmntopt(zfsvfs->z_vfs, MNTOPT_EXEC);
+ vfs_setmntopt(zfsvfs->z_vfs, MNTOPT_NOEXEC, NULL, 0);
+ } else {
+ zfsvfs->z_vfs->vfs_flag &= ~VFS_NOEXEC;
+ vfs_clearmntopt(zfsvfs->z_vfs, MNTOPT_NOEXEC);
+ vfs_setmntopt(zfsvfs->z_vfs, MNTOPT_EXEC, NULL, 0);
+ }
+}
+
+/*
+ * The nbmand mount option can only be set at mount time.
+ * We can't allow it to be toggled on a live file system, or CIFS
+ * clients may see incorrect behavior.
+ *
+ * This property isn't registered via dsl_prop_register(), but this
+ * callback will be called when a file system is first mounted.
+ */
+static void
+nbmand_changed_cb(void *arg, uint64_t newval)
+{
+ zfsvfs_t *zfsvfs = arg;
+ if (newval == FALSE) {
+ vfs_clearmntopt(zfsvfs->z_vfs, MNTOPT_NBMAND);
+ vfs_setmntopt(zfsvfs->z_vfs, MNTOPT_NONBMAND, NULL, 0);
+ } else {
+ vfs_clearmntopt(zfsvfs->z_vfs, MNTOPT_NONBMAND);
+ vfs_setmntopt(zfsvfs->z_vfs, MNTOPT_NBMAND, NULL, 0);
+ }
+}
+
+static void
+snapdir_changed_cb(void *arg, uint64_t newval)
+{
+ zfsvfs_t *zfsvfs = arg;
+
+ zfsvfs->z_show_ctldir = newval;
+}
+
+static void
+vscan_changed_cb(void *arg, uint64_t newval)
+{
+ zfsvfs_t *zfsvfs = arg;
+
+ zfsvfs->z_vscan = newval;
+}
+
+static void
+acl_mode_changed_cb(void *arg, uint64_t newval)
+{
+ zfsvfs_t *zfsvfs = arg;
+
+ zfsvfs->z_acl_mode = newval;
+}
+
+static void
+acl_inherit_changed_cb(void *arg, uint64_t newval)
+{
+ zfsvfs_t *zfsvfs = arg;
+
+ zfsvfs->z_acl_inherit = newval;
+}
+
+static int
+zfs_register_callbacks(vfs_t *vfsp)
+{
+ struct dsl_dataset *ds = NULL;
+ objset_t *os = NULL;
+ zfsvfs_t *zfsvfs = NULL;
+ uint64_t nbmand;
+ boolean_t readonly = B_FALSE;
+ boolean_t do_readonly = B_FALSE;
+ boolean_t setuid = B_FALSE;
+ boolean_t do_setuid = B_FALSE;
+ boolean_t exec = B_FALSE;
+ boolean_t do_exec = B_FALSE;
+#ifdef illumos
+ boolean_t devices = B_FALSE;
+ boolean_t do_devices = B_FALSE;
+#endif
+ boolean_t xattr = B_FALSE;
+ boolean_t do_xattr = B_FALSE;
+ boolean_t atime = B_FALSE;
+ boolean_t do_atime = B_FALSE;
+ int error = 0;
+
+ ASSERT(vfsp);
+ zfsvfs = vfsp->vfs_data;
+ ASSERT(zfsvfs);
+ os = zfsvfs->z_os;
+
+ /*
+	 * This function can be called for a snapshot when we update the
+	 * snapshot's mount point, which isn't really supported.
+ */
+ if (dmu_objset_is_snapshot(os))
+ return (EOPNOTSUPP);
+
+ /*
+ * The act of registering our callbacks will destroy any mount
+ * options we may have. In order to enable temporary overrides
+ * of mount options, we stash away the current values and
+ * restore them after we register the callbacks.
+ */
+ if (vfs_optionisset(vfsp, MNTOPT_RO, NULL) ||
+ !spa_writeable(dmu_objset_spa(os))) {
+ readonly = B_TRUE;
+ do_readonly = B_TRUE;
+ } else if (vfs_optionisset(vfsp, MNTOPT_RW, NULL)) {
+ readonly = B_FALSE;
+ do_readonly = B_TRUE;
+ }
+ if (vfs_optionisset(vfsp, MNTOPT_NOSUID, NULL)) {
+ setuid = B_FALSE;
+ do_setuid = B_TRUE;
+ } else {
+ if (vfs_optionisset(vfsp, MNTOPT_NOSETUID, NULL)) {
+ setuid = B_FALSE;
+ do_setuid = B_TRUE;
+ } else if (vfs_optionisset(vfsp, MNTOPT_SETUID, NULL)) {
+ setuid = B_TRUE;
+ do_setuid = B_TRUE;
+ }
+ }
+ if (vfs_optionisset(vfsp, MNTOPT_NOEXEC, NULL)) {
+ exec = B_FALSE;
+ do_exec = B_TRUE;
+ } else if (vfs_optionisset(vfsp, MNTOPT_EXEC, NULL)) {
+ exec = B_TRUE;
+ do_exec = B_TRUE;
+ }
+ if (vfs_optionisset(vfsp, MNTOPT_NOXATTR, NULL)) {
+ xattr = B_FALSE;
+ do_xattr = B_TRUE;
+ } else if (vfs_optionisset(vfsp, MNTOPT_XATTR, NULL)) {
+ xattr = B_TRUE;
+ do_xattr = B_TRUE;
+ }
+ if (vfs_optionisset(vfsp, MNTOPT_NOATIME, NULL)) {
+ atime = B_FALSE;
+ do_atime = B_TRUE;
+ } else if (vfs_optionisset(vfsp, MNTOPT_ATIME, NULL)) {
+ atime = B_TRUE;
+ do_atime = B_TRUE;
+ }
+
+ /*
+ * We need to enter pool configuration here, so that we can use
+ * dsl_prop_get_int_ds() to handle the special nbmand property below.
+ * dsl_prop_get_integer() can not be used, because it has to acquire
+ * spa_namespace_lock and we can not do that because we already hold
+ * z_teardown_lock. The problem is that spa_write_cachefile() is called
+ * with spa_namespace_lock held and the function calls ZFS vnode
+ * operations to write the cache file and thus z_teardown_lock is
+ * acquired after spa_namespace_lock.
+ */
+ ds = dmu_objset_ds(os);
+ dsl_pool_config_enter(dmu_objset_pool(os), FTAG);
+
+ /*
+	 * nbmand is a special property: weird as it is, it is documented
+	 * to be changeable only at mount time.
+ */
+ if (vfs_optionisset(vfsp, MNTOPT_NONBMAND, NULL)) {
+ nbmand = B_FALSE;
+ } else if (vfs_optionisset(vfsp, MNTOPT_NBMAND, NULL)) {
+ nbmand = B_TRUE;
+	} else if ((error = dsl_prop_get_int_ds(ds, "nbmand", &nbmand)) != 0) {
+ dsl_pool_config_exit(dmu_objset_pool(os), FTAG);
+ return (error);
+ }
+
+ /*
+ * Register property callbacks.
+ *
+	 * Checking only the first dsl_prop_register() call for an I/O
+	 * error would probably suffice, but we check every call to be
+	 * safe.
+ */
+ error = dsl_prop_register(ds,
+ zfs_prop_to_name(ZFS_PROP_ATIME), atime_changed_cb, zfsvfs);
+ error = error ? error : dsl_prop_register(ds,
+ zfs_prop_to_name(ZFS_PROP_XATTR), xattr_changed_cb, zfsvfs);
+ error = error ? error : dsl_prop_register(ds,
+ zfs_prop_to_name(ZFS_PROP_RECORDSIZE), blksz_changed_cb, zfsvfs);
+ error = error ? error : dsl_prop_register(ds,
+ zfs_prop_to_name(ZFS_PROP_READONLY), readonly_changed_cb, zfsvfs);
+#ifdef illumos
+ error = error ? error : dsl_prop_register(ds,
+ zfs_prop_to_name(ZFS_PROP_DEVICES), devices_changed_cb, zfsvfs);
+#endif
+ error = error ? error : dsl_prop_register(ds,
+ zfs_prop_to_name(ZFS_PROP_SETUID), setuid_changed_cb, zfsvfs);
+ error = error ? error : dsl_prop_register(ds,
+ zfs_prop_to_name(ZFS_PROP_EXEC), exec_changed_cb, zfsvfs);
+ error = error ? error : dsl_prop_register(ds,
+ zfs_prop_to_name(ZFS_PROP_SNAPDIR), snapdir_changed_cb, zfsvfs);
+ error = error ? error : dsl_prop_register(ds,
+ zfs_prop_to_name(ZFS_PROP_ACLMODE), acl_mode_changed_cb, zfsvfs);
+ error = error ? error : dsl_prop_register(ds,
+ zfs_prop_to_name(ZFS_PROP_ACLINHERIT), acl_inherit_changed_cb,
+ zfsvfs);
+ error = error ? error : dsl_prop_register(ds,
+ zfs_prop_to_name(ZFS_PROP_VSCAN), vscan_changed_cb, zfsvfs);
+ dsl_pool_config_exit(dmu_objset_pool(os), FTAG);
+ if (error)
+ goto unregister;
+
+ /*
+ * Invoke our callbacks to restore temporary mount options.
+ */
+ if (do_readonly)
+ readonly_changed_cb(zfsvfs, readonly);
+ if (do_setuid)
+ setuid_changed_cb(zfsvfs, setuid);
+ if (do_exec)
+ exec_changed_cb(zfsvfs, exec);
+ if (do_xattr)
+ xattr_changed_cb(zfsvfs, xattr);
+ if (do_atime)
+ atime_changed_cb(zfsvfs, atime);
+
+ nbmand_changed_cb(zfsvfs, nbmand);
+
+ return (0);
+
+unregister:
+ dsl_prop_unregister_all(ds, zfsvfs);
+ return (error);
+}
+
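+/*
+ * Callback registered with the DMU (via dmu_objset_register_type() in
+ * zfs_init() below) to extract the owning user and group from a file's
+ * bonus buffer, so that space deltas can be charged to the right
+ * accounts.
+ */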
+static int
+zfs_space_delta_cb(dmu_object_type_t bonustype, void *data,
+ uint64_t *userp, uint64_t *groupp)
+{
+ /*
+ * Is it a valid type of object to track?
+ */
+ if (bonustype != DMU_OT_ZNODE && bonustype != DMU_OT_SA)
+ return (SET_ERROR(ENOENT));
+
+ /*
+	 * If we have a NULL data pointer, then assume the IDs aren't
+	 * changing and return EEXIST to the DMU to let it know to use
+	 * the same IDs.
+ */
+ if (data == NULL)
+ return (SET_ERROR(EEXIST));
+
+ if (bonustype == DMU_OT_ZNODE) {
+ znode_phys_t *znp = data;
+ *userp = znp->zp_uid;
+ *groupp = znp->zp_gid;
+ } else {
+ int hdrsize;
+ sa_hdr_phys_t *sap = data;
+ sa_hdr_phys_t sa = *sap;
+ boolean_t swap = B_FALSE;
+
+ ASSERT(bonustype == DMU_OT_SA);
+
+ if (sa.sa_magic == 0) {
+ /*
+ * This should only happen for newly created
+ * files that haven't had the znode data filled
+ * in yet.
+ */
+ *userp = 0;
+ *groupp = 0;
+ return (0);
+ }
+ if (sa.sa_magic == BSWAP_32(SA_MAGIC)) {
+ sa.sa_magic = SA_MAGIC;
+ sa.sa_layout_info = BSWAP_16(sa.sa_layout_info);
+ swap = B_TRUE;
+ } else {
+ VERIFY3U(sa.sa_magic, ==, SA_MAGIC);
+ }
+
+ hdrsize = sa_hdrsize(&sa);
+ VERIFY3U(hdrsize, >=, sizeof (sa_hdr_phys_t));
+ *userp = *((uint64_t *)((uintptr_t)data + hdrsize +
+ SA_UID_OFFSET));
+ *groupp = *((uint64_t *)((uintptr_t)data + hdrsize +
+ SA_GID_OFFSET));
+ if (swap) {
+ *userp = BSWAP_64(*userp);
+ *groupp = BSWAP_64(*groupp);
+ }
+ }
+ return (0);
+}
+
+static void
+fuidstr_to_sid(zfsvfs_t *zfsvfs, const char *fuidstr,
+ char *domainbuf, int buflen, uid_t *ridp)
+{
+ uint64_t fuid;
+ const char *domain;
+
+ fuid = zfs_strtonum(fuidstr, NULL);
+
+ domain = zfs_fuid_find_by_idx(zfsvfs, FUID_INDEX(fuid));
+ if (domain)
+ (void) strlcpy(domainbuf, domain, buflen);
+ else
+ domainbuf[0] = '\0';
+ *ridp = FUID_RID(fuid);
+}
+
+static uint64_t
+zfs_userquota_prop_to_obj(zfsvfs_t *zfsvfs, zfs_userquota_prop_t type)
+{
+ switch (type) {
+ case ZFS_PROP_USERUSED:
+ return (DMU_USERUSED_OBJECT);
+ case ZFS_PROP_GROUPUSED:
+ return (DMU_GROUPUSED_OBJECT);
+ case ZFS_PROP_USERQUOTA:
+ return (zfsvfs->z_userquota_obj);
+ case ZFS_PROP_GROUPQUOTA:
+ return (zfsvfs->z_groupquota_obj);
+ }
+ return (0);
+}
+
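+/*
+ * Walk the {fuid -> value} ZAP for the given property type, filling
+ * vbuf with zfs_useracct_t records. The cursor position is serialized
+ * into *cookiep so the walk can be resumed. A caller would look
+ * roughly like this (illustrative sketch only; buf and type are
+ * assumed to be supplied by the caller):
+ *
+ *	uint64_t cookie = 0, bufsize;
+ *	do {
+ *		bufsize = sizeof (buf);
+ *		error = zfs_userspace_many(zfsvfs, type, &cookie,
+ *		    buf, &bufsize);
+ *		(consume bufsize / sizeof (zfs_useracct_t) entries)
+ *	} while (error == 0 && bufsize != 0);
+ */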
+int
+zfs_userspace_many(zfsvfs_t *zfsvfs, zfs_userquota_prop_t type,
+ uint64_t *cookiep, void *vbuf, uint64_t *bufsizep)
+{
+ int error;
+ zap_cursor_t zc;
+ zap_attribute_t za;
+ zfs_useracct_t *buf = vbuf;
+ uint64_t obj;
+
+ if (!dmu_objset_userspace_present(zfsvfs->z_os))
+ return (SET_ERROR(ENOTSUP));
+
+ obj = zfs_userquota_prop_to_obj(zfsvfs, type);
+ if (obj == 0) {
+ *bufsizep = 0;
+ return (0);
+ }
+
+ for (zap_cursor_init_serialized(&zc, zfsvfs->z_os, obj, *cookiep);
+ (error = zap_cursor_retrieve(&zc, &za)) == 0;
+ zap_cursor_advance(&zc)) {
+ if ((uintptr_t)buf - (uintptr_t)vbuf + sizeof (zfs_useracct_t) >
+ *bufsizep)
+ break;
+
+ fuidstr_to_sid(zfsvfs, za.za_name,
+ buf->zu_domain, sizeof (buf->zu_domain), &buf->zu_rid);
+
+ buf->zu_space = za.za_first_integer;
+ buf++;
+ }
+ if (error == ENOENT)
+ error = 0;
+
+ ASSERT3U((uintptr_t)buf - (uintptr_t)vbuf, <=, *bufsizep);
+ *bufsizep = (uintptr_t)buf - (uintptr_t)vbuf;
+ *cookiep = zap_cursor_serialize(&zc);
+ zap_cursor_fini(&zc);
+ return (error);
+}
+
+/*
+ * buf must be big enough (e.g., 32 bytes) to hold the fuid string.
+ */
+static int
+id_to_fuidstr(zfsvfs_t *zfsvfs, const char *domain, uid_t rid,
+ char *buf, boolean_t addok)
+{
+ uint64_t fuid;
+ int domainid = 0;
+
+ if (domain && domain[0]) {
+ domainid = zfs_fuid_find_by_domain(zfsvfs, domain, NULL, addok);
+ if (domainid == -1)
+ return (SET_ERROR(ENOENT));
+ }
+ fuid = FUID_ENCODE(domainid, rid);
+ (void) sprintf(buf, "%llx", (longlong_t)fuid);
+ return (0);
+}
+
+int
+zfs_userspace_one(zfsvfs_t *zfsvfs, zfs_userquota_prop_t type,
+ const char *domain, uint64_t rid, uint64_t *valp)
+{
+ char buf[32];
+ int err;
+ uint64_t obj;
+
+ *valp = 0;
+
+ if (!dmu_objset_userspace_present(zfsvfs->z_os))
+ return (SET_ERROR(ENOTSUP));
+
+ obj = zfs_userquota_prop_to_obj(zfsvfs, type);
+ if (obj == 0)
+ return (0);
+
+ err = id_to_fuidstr(zfsvfs, domain, rid, buf, B_FALSE);
+ if (err)
+ return (err);
+
+ err = zap_lookup(zfsvfs->z_os, obj, buf, 8, 1, valp);
+ if (err == ENOENT)
+ err = 0;
+ return (err);
+}
+
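+/*
+ * Set (or, when quota == 0, clear) a user or group quota. The quota
+ * ZAP object is created lazily, under z_lock, the first time a quota
+ * of that type is set.
+ */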
+int
+zfs_set_userquota(zfsvfs_t *zfsvfs, zfs_userquota_prop_t type,
+ const char *domain, uint64_t rid, uint64_t quota)
+{
+ char buf[32];
+ int err;
+ dmu_tx_t *tx;
+ uint64_t *objp;
+ boolean_t fuid_dirtied;
+
+ if (type != ZFS_PROP_USERQUOTA && type != ZFS_PROP_GROUPQUOTA)
+ return (SET_ERROR(EINVAL));
+
+ if (zfsvfs->z_version < ZPL_VERSION_USERSPACE)
+ return (SET_ERROR(ENOTSUP));
+
+ objp = (type == ZFS_PROP_USERQUOTA) ? &zfsvfs->z_userquota_obj :
+ &zfsvfs->z_groupquota_obj;
+
+ err = id_to_fuidstr(zfsvfs, domain, rid, buf, B_TRUE);
+ if (err)
+ return (err);
+ fuid_dirtied = zfsvfs->z_fuid_dirty;
+
+ tx = dmu_tx_create(zfsvfs->z_os);
+ dmu_tx_hold_zap(tx, *objp ? *objp : DMU_NEW_OBJECT, B_TRUE, NULL);
+ if (*objp == 0) {
+ dmu_tx_hold_zap(tx, MASTER_NODE_OBJ, B_TRUE,
+ zfs_userquota_prop_prefixes[type]);
+ }
+ if (fuid_dirtied)
+ zfs_fuid_txhold(zfsvfs, tx);
+ err = dmu_tx_assign(tx, TXG_WAIT);
+ if (err) {
+ dmu_tx_abort(tx);
+ return (err);
+ }
+
+ mutex_enter(&zfsvfs->z_lock);
+ if (*objp == 0) {
+ *objp = zap_create(zfsvfs->z_os, DMU_OT_USERGROUP_QUOTA,
+ DMU_OT_NONE, 0, tx);
+ VERIFY(0 == zap_add(zfsvfs->z_os, MASTER_NODE_OBJ,
+ zfs_userquota_prop_prefixes[type], 8, 1, objp, tx));
+ }
+ mutex_exit(&zfsvfs->z_lock);
+
+ if (quota == 0) {
+ err = zap_remove(zfsvfs->z_os, *objp, buf, tx);
+ if (err == ENOENT)
+ err = 0;
+ } else {
+ err = zap_update(zfsvfs->z_os, *objp, buf, 8, 1, &quota, tx);
+ }
+ ASSERT(err == 0);
+ if (fuid_dirtied)
+ zfs_fuid_sync(zfsvfs, tx);
+ dmu_tx_commit(tx);
+ return (err);
+}
+
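+/*
+ * Report whether the given fuid is at or over its quota by comparing
+ * the "used" and "quota" ZAP entries. Returns B_FALSE during ZIL
+ * replay or when no quota object exists.
+ */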
+boolean_t
+zfs_fuid_overquota(zfsvfs_t *zfsvfs, boolean_t isgroup, uint64_t fuid)
+{
+ char buf[32];
+ uint64_t used, quota, usedobj, quotaobj;
+ int err;
+
+ usedobj = isgroup ? DMU_GROUPUSED_OBJECT : DMU_USERUSED_OBJECT;
+ quotaobj = isgroup ? zfsvfs->z_groupquota_obj : zfsvfs->z_userquota_obj;
+
+ if (quotaobj == 0 || zfsvfs->z_replay)
+ return (B_FALSE);
+
+ (void) sprintf(buf, "%llx", (longlong_t)fuid);
+ err = zap_lookup(zfsvfs->z_os, quotaobj, buf, 8, 1, &quota);
+ if (err != 0)
+ return (B_FALSE);
+
+ err = zap_lookup(zfsvfs->z_os, usedobj, buf, 8, 1, &used);
+ if (err != 0)
+ return (B_FALSE);
+ return (used >= quota);
+}
+
+boolean_t
+zfs_owner_overquota(zfsvfs_t *zfsvfs, znode_t *zp, boolean_t isgroup)
+{
+ uint64_t fuid;
+ uint64_t quotaobj;
+
+ quotaobj = isgroup ? zfsvfs->z_groupquota_obj : zfsvfs->z_userquota_obj;
+
+ fuid = isgroup ? zp->z_gid : zp->z_uid;
+
+ if (quotaobj == 0 || zfsvfs->z_replay)
+ return (B_FALSE);
+
+ return (zfs_fuid_overquota(zfsvfs, isgroup, fuid));
+}
+
+/*
+ * Associate this zfsvfs with the given objset, which must be owned.
+ * This will cache a bunch of on-disk state from the objset in the
+ * zfsvfs.
+ */
+static int
+zfsvfs_init(zfsvfs_t *zfsvfs, objset_t *os)
+{
+ int error;
+ uint64_t val;
+
+ zfsvfs->z_max_blksz = SPA_OLD_MAXBLOCKSIZE;
+ zfsvfs->z_show_ctldir = ZFS_SNAPDIR_VISIBLE;
+ zfsvfs->z_os = os;
+
+ error = zfs_get_zplprop(os, ZFS_PROP_VERSION, &zfsvfs->z_version);
+ if (error != 0)
+ return (error);
+ if (zfsvfs->z_version >
+ zfs_zpl_version_map(spa_version(dmu_objset_spa(os)))) {
+ (void) printf("Can't mount a version %lld file system "
+		    "on a version %lld pool. Pool must be upgraded to mount "
+		    "this file system.\n", (u_longlong_t)zfsvfs->z_version,
+ (u_longlong_t)spa_version(dmu_objset_spa(os)));
+ return (SET_ERROR(ENOTSUP));
+ }
+ error = zfs_get_zplprop(os, ZFS_PROP_NORMALIZE, &val);
+ if (error != 0)
+ return (error);
+ zfsvfs->z_norm = (int)val;
+
+ error = zfs_get_zplprop(os, ZFS_PROP_UTF8ONLY, &val);
+ if (error != 0)
+ return (error);
+ zfsvfs->z_utf8 = (val != 0);
+
+ error = zfs_get_zplprop(os, ZFS_PROP_CASE, &val);
+ if (error != 0)
+ return (error);
+ zfsvfs->z_case = (uint_t)val;
+
+ /*
+ * Fold case on file systems that are always or sometimes case
+ * insensitive.
+ */
+ if (zfsvfs->z_case == ZFS_CASE_INSENSITIVE ||
+ zfsvfs->z_case == ZFS_CASE_MIXED)
+ zfsvfs->z_norm |= U8_TEXTPREP_TOUPPER;
+
+ zfsvfs->z_use_fuids = USE_FUIDS(zfsvfs->z_version, zfsvfs->z_os);
+ zfsvfs->z_use_sa = USE_SA(zfsvfs->z_version, zfsvfs->z_os);
+
+ uint64_t sa_obj = 0;
+ if (zfsvfs->z_use_sa) {
+ /* should either have both of these objects or none */
+ error = zap_lookup(os, MASTER_NODE_OBJ, ZFS_SA_ATTRS, 8, 1,
+ &sa_obj);
+ if (error != 0)
+ return (error);
+ }
+
+ error = sa_setup(os, sa_obj, zfs_attr_table, ZPL_END,
+ &zfsvfs->z_attr_table);
+ if (error != 0)
+ return (error);
+
+ if (zfsvfs->z_version >= ZPL_VERSION_SA)
+ sa_register_update_callback(os, zfs_sa_upgrade);
+
+ error = zap_lookup(os, MASTER_NODE_OBJ, ZFS_ROOT_OBJ, 8, 1,
+ &zfsvfs->z_root);
+ if (error != 0)
+ return (error);
+ ASSERT(zfsvfs->z_root != 0);
+
+ error = zap_lookup(os, MASTER_NODE_OBJ, ZFS_UNLINKED_SET, 8, 1,
+ &zfsvfs->z_unlinkedobj);
+ if (error != 0)
+ return (error);
+
+ error = zap_lookup(os, MASTER_NODE_OBJ,
+ zfs_userquota_prop_prefixes[ZFS_PROP_USERQUOTA],
+ 8, 1, &zfsvfs->z_userquota_obj);
+ if (error == ENOENT)
+ zfsvfs->z_userquota_obj = 0;
+ else if (error != 0)
+ return (error);
+
+ error = zap_lookup(os, MASTER_NODE_OBJ,
+ zfs_userquota_prop_prefixes[ZFS_PROP_GROUPQUOTA],
+ 8, 1, &zfsvfs->z_groupquota_obj);
+ if (error == ENOENT)
+ zfsvfs->z_groupquota_obj = 0;
+ else if (error != 0)
+ return (error);
+
+ error = zap_lookup(os, MASTER_NODE_OBJ, ZFS_FUID_TABLES, 8, 1,
+ &zfsvfs->z_fuid_obj);
+ if (error == ENOENT)
+ zfsvfs->z_fuid_obj = 0;
+ else if (error != 0)
+ return (error);
+
+ error = zap_lookup(os, MASTER_NODE_OBJ, ZFS_SHARES_DIR, 8, 1,
+ &zfsvfs->z_shares_dir);
+ if (error == ENOENT)
+ zfsvfs->z_shares_dir = 0;
+ else if (error != 0)
+ return (error);
+
+ /*
+ * Only use the name cache if we are looking for a
+ * name on a file system that does not require normalization
+ * or case folding. We can also look there if we happen to be
+ * on a non-normalizing, mixed sensitivity file system IF we
+ * are looking for the exact name (which is always the case on
+ * FreeBSD).
+ */
+ zfsvfs->z_use_namecache = !zfsvfs->z_norm ||
+ ((zfsvfs->z_case == ZFS_CASE_MIXED) &&
+ !(zfsvfs->z_norm & ~U8_TEXTPREP_TOUPPER));
+
+ return (0);
+}
+
+#if defined(__FreeBSD__)
+taskq_t *zfsvfs_taskq;
+
+static void
+zfsvfs_task_unlinked_drain(void *context, int pending __unused)
+{
+
+ zfs_unlinked_drain((zfsvfs_t *)context);
+}
+#endif
+
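+/*
+ * Allocate a zfsvfs_t, take ownership of the named objset, and
+ * initialize the in-core state from it via zfsvfs_create_impl().
+ */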
+int
+zfsvfs_create(const char *osname, zfsvfs_t **zfvp)
+{
+ objset_t *os;
+ zfsvfs_t *zfsvfs;
+ int error;
+
+ /*
+ * XXX: Fix struct statfs so this isn't necessary!
+ *
+ * The 'osname' is used as the filesystem's special node, which means
+ * it must fit in statfs.f_mntfromname, or else it can't be
+ * enumerated, so libzfs_mnttab_find() returns NULL, which causes
+ * 'zfs unmount' to think it's not mounted when it is.
+ */
+ if (strlen(osname) >= MNAMELEN)
+ return (SET_ERROR(ENAMETOOLONG));
+
+ zfsvfs = kmem_zalloc(sizeof (zfsvfs_t), KM_SLEEP);
+
+ /*
+ * We claim to always be readonly so we can open snapshots;
+ * other ZPL code will prevent us from writing to snapshots.
+ */
+
+ error = dmu_objset_own(osname, DMU_OST_ZFS, B_TRUE, zfsvfs, &os);
+ if (error != 0) {
+ kmem_free(zfsvfs, sizeof (zfsvfs_t));
+ return (error);
+ }
+
+ error = zfsvfs_create_impl(zfvp, zfsvfs, os);
+ if (error != 0) {
+ dmu_objset_disown(os, zfsvfs);
+ }
+ return (error);
+}
+
+int
+zfsvfs_create_impl(zfsvfs_t **zfvp, zfsvfs_t *zfsvfs, objset_t *os)
+{
+ int error;
+
+ zfsvfs->z_vfs = NULL;
+ zfsvfs->z_parent = zfsvfs;
+
+ mutex_init(&zfsvfs->z_znodes_lock, NULL, MUTEX_DEFAULT, NULL);
+ mutex_init(&zfsvfs->z_lock, NULL, MUTEX_DEFAULT, NULL);
+ list_create(&zfsvfs->z_all_znodes, sizeof (znode_t),
+ offsetof(znode_t, z_link_node));
+#if defined(__FreeBSD__)
+ TASK_INIT(&zfsvfs->z_unlinked_drain_task, 0,
+ zfsvfs_task_unlinked_drain, zfsvfs);
+#endif
+#ifdef DIAGNOSTIC
+ rrm_init(&zfsvfs->z_teardown_lock, B_TRUE);
+#else
+ rrm_init(&zfsvfs->z_teardown_lock, B_FALSE);
+#endif
+ rms_init(&zfsvfs->z_teardown_inactive_lock, "zfs teardown inactive");
+ rw_init(&zfsvfs->z_fuid_lock, NULL, RW_DEFAULT, NULL);
+ for (int i = 0; i != ZFS_OBJ_MTX_SZ; i++)
+ mutex_init(&zfsvfs->z_hold_mtx[i], NULL, MUTEX_DEFAULT, NULL);
+
+ error = zfsvfs_init(zfsvfs, os);
+ if (error != 0) {
+ *zfvp = NULL;
+ kmem_free(zfsvfs, sizeof (zfsvfs_t));
+ return (error);
+ }
+
+ *zfvp = zfsvfs;
+ return (0);
+}
+
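+/*
+ * Second-stage setup: register the property callbacks, open the ZIL,
+ * and, when actually mounting, drain the unlinked set and replay the
+ * intent log before publishing the zfsvfs as the objset user.
+ */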
+static int
+zfsvfs_setup(zfsvfs_t *zfsvfs, boolean_t mounting)
+{
+ int error;
+
+ error = zfs_register_callbacks(zfsvfs->z_vfs);
+ if (error)
+ return (error);
+
+ zfsvfs->z_log = zil_open(zfsvfs->z_os, zfs_get_data);
+
+ /*
+	 * If we are not mounting (i.e., online recv), then we don't
+ * have to worry about replaying the log as we blocked all
+ * operations out since we closed the ZIL.
+ */
+ if (mounting) {
+ boolean_t readonly;
+
+ /*
+ * During replay we remove the read only flag to
+ * allow replays to succeed.
+ */
+ readonly = zfsvfs->z_vfs->vfs_flag & VFS_RDONLY;
+ if (readonly != 0)
+ zfsvfs->z_vfs->vfs_flag &= ~VFS_RDONLY;
+ else
+ zfs_unlinked_drain(zfsvfs);
+
+ /*
+ * Parse and replay the intent log.
+ *
+ * Because of ziltest, this must be done after
+ * zfs_unlinked_drain(). (Further note: ziltest
+ * doesn't use readonly mounts, where
+ * zfs_unlinked_drain() isn't called.) This is because
+ * ziltest causes spa_sync() to think it's committed,
+ * but actually it is not, so the intent log contains
+ * many txg's worth of changes.
+ *
+ * In particular, if object N is in the unlinked set in
+ * the last txg to actually sync, then it could be
+ * actually freed in a later txg and then reallocated
+ * in a yet later txg. This would write a "create
+ * object N" record to the intent log. Normally, this
+ * would be fine because the spa_sync() would have
+ * written out the fact that object N is free, before
+ * we could write the "create object N" intent log
+ * record.
+ *
+ * But when we are in ziltest mode, we advance the "open
+ * txg" without actually spa_sync()-ing the changes to
+ * disk. So we would see that object N is still
+ * allocated and in the unlinked set, and there is an
+ * intent log record saying to allocate it.
+ */
+ if (spa_writeable(dmu_objset_spa(zfsvfs->z_os))) {
+ if (zil_replay_disable) {
+ zil_destroy(zfsvfs->z_log, B_FALSE);
+ } else {
+ zfsvfs->z_replay = B_TRUE;
+ zil_replay(zfsvfs->z_os, zfsvfs,
+ zfs_replay_vector);
+ zfsvfs->z_replay = B_FALSE;
+ }
+ }
+ zfsvfs->z_vfs->vfs_flag |= readonly; /* restore readonly bit */
+ }
+
+ /*
+ * Set the objset user_ptr to track its zfsvfs.
+ */
+ mutex_enter(&zfsvfs->z_os->os_user_ptr_lock);
+ dmu_objset_set_user(zfsvfs->z_os, zfsvfs);
+ mutex_exit(&zfsvfs->z_os->os_user_ptr_lock);
+
+ return (0);
+}
+
+extern krwlock_t zfsvfs_lock; /* in zfs_znode.c */
+
+void
+zfsvfs_free(zfsvfs_t *zfsvfs)
+{
+ int i;
+
+ /*
+ * This is a barrier to prevent the filesystem from going away in
+ * zfs_znode_move() until we can safely ensure that the filesystem is
+ * not unmounted. We consider the filesystem valid before the barrier
+ * and invalid after the barrier.
+ */
+ rw_enter(&zfsvfs_lock, RW_READER);
+ rw_exit(&zfsvfs_lock);
+
+ zfs_fuid_destroy(zfsvfs);
+
+ mutex_destroy(&zfsvfs->z_znodes_lock);
+ mutex_destroy(&zfsvfs->z_lock);
+ list_destroy(&zfsvfs->z_all_znodes);
+ rrm_destroy(&zfsvfs->z_teardown_lock);
+ rms_destroy(&zfsvfs->z_teardown_inactive_lock);
+ rw_destroy(&zfsvfs->z_fuid_lock);
+ for (i = 0; i != ZFS_OBJ_MTX_SZ; i++)
+ mutex_destroy(&zfsvfs->z_hold_mtx[i]);
+ kmem_free(zfsvfs, sizeof (zfsvfs_t));
+}
+
+static void
+zfs_set_fuid_feature(zfsvfs_t *zfsvfs)
+{
+ zfsvfs->z_use_fuids = USE_FUIDS(zfsvfs->z_version, zfsvfs->z_os);
+ if (zfsvfs->z_vfs) {
+ if (zfsvfs->z_use_fuids) {
+ vfs_set_feature(zfsvfs->z_vfs, VFSFT_XVATTR);
+ vfs_set_feature(zfsvfs->z_vfs, VFSFT_SYSATTR_VIEWS);
+ vfs_set_feature(zfsvfs->z_vfs, VFSFT_ACEMASKONACCESS);
+ vfs_set_feature(zfsvfs->z_vfs, VFSFT_ACLONCREATE);
+ vfs_set_feature(zfsvfs->z_vfs, VFSFT_ACCESS_FILTER);
+ vfs_set_feature(zfsvfs->z_vfs, VFSFT_REPARSE);
+ } else {
+ vfs_clear_feature(zfsvfs->z_vfs, VFSFT_XVATTR);
+ vfs_clear_feature(zfsvfs->z_vfs, VFSFT_SYSATTR_VIEWS);
+ vfs_clear_feature(zfsvfs->z_vfs, VFSFT_ACEMASKONACCESS);
+ vfs_clear_feature(zfsvfs->z_vfs, VFSFT_ACLONCREATE);
+ vfs_clear_feature(zfsvfs->z_vfs, VFSFT_ACCESS_FILTER);
+ vfs_clear_feature(zfsvfs->z_vfs, VFSFT_REPARSE);
+ }
+ }
+ zfsvfs->z_use_sa = USE_SA(zfsvfs->z_version, zfsvfs->z_os);
+}
+
+static int
+zfs_domount(vfs_t *vfsp, char *osname)
+{
+ uint64_t recordsize, fsid_guid;
+ int error = 0;
+ zfsvfs_t *zfsvfs;
+ vnode_t *vp;
+
+ ASSERT(vfsp);
+ ASSERT(osname);
+
+ error = zfsvfs_create(osname, &zfsvfs);
+ if (error)
+ return (error);
+ zfsvfs->z_vfs = vfsp;
+
+#ifdef illumos
+ /* Initialize the generic filesystem structure. */
+ vfsp->vfs_bcount = 0;
+ vfsp->vfs_data = NULL;
+
+ if (zfs_create_unique_device(&mount_dev) == -1) {
+ error = SET_ERROR(ENODEV);
+ goto out;
+ }
+ ASSERT(vfs_devismounted(mount_dev) == 0);
+#endif
+
+ if (error = dsl_prop_get_integer(osname, "recordsize", &recordsize,
+ NULL))
+ goto out;
+ zfsvfs->z_vfs->vfs_bsize = SPA_MINBLOCKSIZE;
+ zfsvfs->z_vfs->mnt_stat.f_iosize = recordsize;
+
+ vfsp->vfs_data = zfsvfs;
+ vfsp->mnt_flag |= MNT_LOCAL;
+#if defined(_KERNEL) && !defined(KMEM_DEBUG)
+ vfsp->mnt_kern_flag |= MNTK_FPLOOKUP;
+#endif
+ vfsp->mnt_kern_flag |= MNTK_LOOKUP_SHARED;
+ vfsp->mnt_kern_flag |= MNTK_SHARED_WRITES;
+ vfsp->mnt_kern_flag |= MNTK_EXTENDED_SHARED;
+ vfsp->mnt_kern_flag |= MNTK_NO_IOPF; /* vn_io_fault can be used */
+ vfsp->mnt_kern_flag |= MNTK_NOMSYNC;
+ vfsp->mnt_kern_flag |= MNTK_VMSETSIZE_BUG;
+
+ /*
+ * The fsid is 64 bits, composed of an 8-bit fs type, which
+ * separates our fsid from any other filesystem types, and a
+ * 56-bit objset unique ID. The objset unique ID is unique to
+ * all objsets open on this system, provided by unique_create().
+ * The 8-bit fs type must be put in the low bits of fsid[1]
+ * because that's where other Solaris filesystems put it.
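+	 *
+	 * Illustrative layout:
+	 *   val[0] = low 32 bits of the objset unique ID
+	 *   val[1] = (bits 32..55 of the ID << 8) | 8-bit fs type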
+ */
+ fsid_guid = dmu_objset_fsid_guid(zfsvfs->z_os);
+ ASSERT((fsid_guid & ~((1ULL<<56)-1)) == 0);
+ vfsp->vfs_fsid.val[0] = fsid_guid;
+	vfsp->vfs_fsid.val[1] = ((fsid_guid >> 32) << 8) |
+	    (vfsp->mnt_vfc->vfc_typenum & 0xFF);
+
+ /*
+ * Set features for file system.
+ */
+ zfs_set_fuid_feature(zfsvfs);
+ if (zfsvfs->z_case == ZFS_CASE_INSENSITIVE) {
+ vfs_set_feature(vfsp, VFSFT_DIRENTFLAGS);
+ vfs_set_feature(vfsp, VFSFT_CASEINSENSITIVE);
+ vfs_set_feature(vfsp, VFSFT_NOCASESENSITIVE);
+ } else if (zfsvfs->z_case == ZFS_CASE_MIXED) {
+ vfs_set_feature(vfsp, VFSFT_DIRENTFLAGS);
+ vfs_set_feature(vfsp, VFSFT_CASEINSENSITIVE);
+ }
+ vfs_set_feature(vfsp, VFSFT_ZEROCOPY_SUPPORTED);
+
+ if (dmu_objset_is_snapshot(zfsvfs->z_os)) {
+ uint64_t pval;
+
+ atime_changed_cb(zfsvfs, B_FALSE);
+ readonly_changed_cb(zfsvfs, B_TRUE);
+ if (error = dsl_prop_get_integer(osname, "xattr", &pval, NULL))
+ goto out;
+ xattr_changed_cb(zfsvfs, pval);
+ zfsvfs->z_issnap = B_TRUE;
+ zfsvfs->z_os->os_sync = ZFS_SYNC_DISABLED;
+
+ mutex_enter(&zfsvfs->z_os->os_user_ptr_lock);
+ dmu_objset_set_user(zfsvfs->z_os, zfsvfs);
+ mutex_exit(&zfsvfs->z_os->os_user_ptr_lock);
+ } else {
+ error = zfsvfs_setup(zfsvfs, B_TRUE);
+ }
+
+ vfs_mountedfrom(vfsp, osname);
+
+ if (!zfsvfs->z_issnap)
+ zfsctl_create(zfsvfs);
+out:
+ if (error) {
+ dmu_objset_disown(zfsvfs->z_os, zfsvfs);
+ zfsvfs_free(zfsvfs);
+ } else {
+ atomic_inc_32(&zfs_active_fs_count);
+ }
+
+ return (error);
+}
+
+void
+zfs_unregister_callbacks(zfsvfs_t *zfsvfs)
+{
+ objset_t *os = zfsvfs->z_os;
+
+ if (!dmu_objset_is_snapshot(os))
+ dsl_prop_unregister_all(dmu_objset_ds(os), zfsvfs);
+}
+
+#ifdef SECLABEL
+/*
+ * Convert a decimal digit string to a uint64_t integer.
+ */
+static int
+str_to_uint64(char *str, uint64_t *objnum)
+{
+ uint64_t num = 0;
+
+ while (*str) {
+ if (*str < '0' || *str > '9')
+ return (SET_ERROR(EINVAL));
+
+ num = num*10 + *str++ - '0';
+ }
+
+ *objnum = num;
+ return (0);
+}
+
+/*
+ * The boot path passed from the boot loader is in the form of
+ * "rootpool-name/root-filesystem-object-number". Convert this
+ * string to a dataset name: "rootpool-name/root-filesystem-name".
+ */
+static int
+zfs_parse_bootfs(char *bpath, char *outpath)
+{
+ char *slashp;
+ uint64_t objnum;
+ int error;
+
+ if (*bpath == 0 || *bpath == '/')
+ return (SET_ERROR(EINVAL));
+
+ (void) strcpy(outpath, bpath);
+
+ slashp = strchr(bpath, '/');
+
+ /* if no '/', just return the pool name */
+ if (slashp == NULL) {
+ return (0);
+ }
+
+ /* if not a number, just return the root dataset name */
+ if (str_to_uint64(slashp+1, &objnum)) {
+ return (0);
+ }
+
+ *slashp = '\0';
+ error = dsl_dsobj_to_dsname(bpath, objnum, outpath);
+ *slashp = '/';
+
+ return (error);
+}
+
+/*
+ * Check that the hex label string is appropriate for the dataset being
+ * mounted into the global_zone proper.
+ *
+ * Return an error if the hex label string is not default or
+ * admin_low/admin_high. For admin_low labels, the corresponding
+ * dataset must be readonly.
+ */
+int
+zfs_check_global_label(const char *dsname, const char *hexsl)
+{
+ if (strcasecmp(hexsl, ZFS_MLSLABEL_DEFAULT) == 0)
+ return (0);
+ if (strcasecmp(hexsl, ADMIN_HIGH) == 0)
+ return (0);
+ if (strcasecmp(hexsl, ADMIN_LOW) == 0) {
+ /* must be readonly */
+ uint64_t rdonly;
+
+ if (dsl_prop_get_integer(dsname,
+ zfs_prop_to_name(ZFS_PROP_READONLY), &rdonly, NULL))
+ return (SET_ERROR(EACCES));
+ return (rdonly ? 0 : EACCES);
+ }
+ return (SET_ERROR(EACCES));
+}
+
+/*
+ * Determine whether the mount is allowed according to the MAC check,
+ * by comparing (where appropriate) the label of the dataset against
+ * the label of the zone being mounted into. If the dataset has
+ * no label, create one.
+ *
+ * Returns 0 if access is allowed, an error otherwise (e.g. EACCES).
+ */
+static int
+zfs_mount_label_policy(vfs_t *vfsp, char *osname)
+{
+ int error, retv;
+ zone_t *mntzone = NULL;
+ ts_label_t *mnt_tsl;
+ bslabel_t *mnt_sl;
+ bslabel_t ds_sl;
+ char ds_hexsl[MAXNAMELEN];
+
+ retv = EACCES; /* assume the worst */
+
+ /*
+ * Start by getting the dataset label if it exists.
+ */
+ error = dsl_prop_get(osname, zfs_prop_to_name(ZFS_PROP_MLSLABEL),
+ 1, sizeof (ds_hexsl), &ds_hexsl, NULL);
+ if (error)
+ return (SET_ERROR(EACCES));
+
+ /*
+ * If labeling is NOT enabled, then disallow the mount of datasets
+ * which have a non-default label already. No other label checks
+ * are needed.
+ */
+ if (!is_system_labeled()) {
+ if (strcasecmp(ds_hexsl, ZFS_MLSLABEL_DEFAULT) == 0)
+ return (0);
+ return (SET_ERROR(EACCES));
+ }
+
+ /*
+ * Get the label of the mountpoint. If mounting into the global
+ * zone (i.e. mountpoint is not within an active zone and the
+ * zoned property is off), the label must be default or
+ * admin_low/admin_high only; no other checks are needed.
+ */
+ mntzone = zone_find_by_any_path(refstr_value(vfsp->vfs_mntpt), B_FALSE);
+ if (mntzone->zone_id == GLOBAL_ZONEID) {
+ uint64_t zoned;
+
+ zone_rele(mntzone);
+
+ if (dsl_prop_get_integer(osname,
+ zfs_prop_to_name(ZFS_PROP_ZONED), &zoned, NULL))
+ return (SET_ERROR(EACCES));
+ if (!zoned)
+ return (zfs_check_global_label(osname, ds_hexsl));
+ else
+ /*
+ * This is the case of a zone dataset being mounted
+ * initially, before the zone has been fully created;
+ * allow this mount into global zone.
+ */
+ return (0);
+ }
+
+ mnt_tsl = mntzone->zone_slabel;
+ ASSERT(mnt_tsl != NULL);
+ label_hold(mnt_tsl);
+ mnt_sl = label2bslabel(mnt_tsl);
+
+ if (strcasecmp(ds_hexsl, ZFS_MLSLABEL_DEFAULT) == 0) {
+ /*
+ * The dataset doesn't have a real label, so fabricate one.
+ */
+ char *str = NULL;
+
+ if (l_to_str_internal(mnt_sl, &str) == 0 &&
+ dsl_prop_set_string(osname,
+ zfs_prop_to_name(ZFS_PROP_MLSLABEL),
+ ZPROP_SRC_LOCAL, str) == 0)
+ retv = 0;
+ if (str != NULL)
+ kmem_free(str, strlen(str) + 1);
+ } else if (hexstr_to_label(ds_hexsl, &ds_sl) == 0) {
+ /*
+ * Now compare labels to complete the MAC check. If the
+ * labels are equal then allow access. If the mountpoint
+ * label dominates the dataset label, allow readonly access.
+ * Otherwise, access is denied.
+ */
+ if (blequal(mnt_sl, &ds_sl))
+ retv = 0;
+ else if (bldominates(mnt_sl, &ds_sl)) {
+ vfs_setmntopt(vfsp, MNTOPT_RO, NULL, 0);
+ retv = 0;
+ }
+ }
+
+ label_rele(mnt_tsl);
+ zone_rele(mntzone);
+ return (retv);
+}
+#endif /* SECLABEL */
+
+#ifdef OPENSOLARIS_MOUNTROOT
+static int
+zfs_mountroot(vfs_t *vfsp, enum whymountroot why)
+{
+ int error = 0;
+ static int zfsrootdone = 0;
+ zfsvfs_t *zfsvfs = NULL;
+ znode_t *zp = NULL;
+ vnode_t *vp = NULL;
+ char *zfs_bootfs;
+ char *zfs_devid;
+
+ ASSERT(vfsp);
+
+ /*
+ * The filesystem that we mount as root is defined in the
+ * boot property "zfs-bootfs" with a format of
+ * "poolname/root-dataset-objnum".
+ */
+ if (why == ROOT_INIT) {
+ if (zfsrootdone++)
+ return (SET_ERROR(EBUSY));
+ /*
+		 * The process of doing a spa_load requires the clock
+		 * to be set before we could do something better (for
+		 * example, look at the timestamp on an uberblock), so
+		 * just set it to -1.
+ */
+ clkset(-1);
+
+ if ((zfs_bootfs = spa_get_bootprop("zfs-bootfs")) == NULL) {
+ cmn_err(CE_NOTE, "spa_get_bootfs: can not get "
+ "bootfs name");
+ return (SET_ERROR(EINVAL));
+ }
+ zfs_devid = spa_get_bootprop("diskdevid");
+ error = spa_import_rootpool(rootfs.bo_name, zfs_devid);
+ if (zfs_devid)
+ spa_free_bootprop(zfs_devid);
+ if (error) {
+ spa_free_bootprop(zfs_bootfs);
+ cmn_err(CE_NOTE, "spa_import_rootpool: error %d",
+ error);
+ return (error);
+ }
+ if (error = zfs_parse_bootfs(zfs_bootfs, rootfs.bo_name)) {
+ spa_free_bootprop(zfs_bootfs);
+ cmn_err(CE_NOTE, "zfs_parse_bootfs: error %d",
+ error);
+ return (error);
+ }
+
+ spa_free_bootprop(zfs_bootfs);
+
+ if (error = vfs_lock(vfsp))
+ return (error);
+
+ if (error = zfs_domount(vfsp, rootfs.bo_name)) {
+ cmn_err(CE_NOTE, "zfs_domount: error %d", error);
+ goto out;
+ }
+
+ zfsvfs = (zfsvfs_t *)vfsp->vfs_data;
+ ASSERT(zfsvfs);
+ if (error = zfs_zget(zfsvfs, zfsvfs->z_root, &zp)) {
+ cmn_err(CE_NOTE, "zfs_zget: error %d", error);
+ goto out;
+ }
+
+ vp = ZTOV(zp);
+ mutex_enter(&vp->v_lock);
+ vp->v_flag |= VROOT;
+ mutex_exit(&vp->v_lock);
+ rootvp = vp;
+
+ /*
+ * Leave rootvp held. The root file system is never unmounted.
+ */
+
+ vfs_add((struct vnode *)0, vfsp,
+ (vfsp->vfs_flag & VFS_RDONLY) ? MS_RDONLY : 0);
+out:
+ vfs_unlock(vfsp);
+ return (error);
+ } else if (why == ROOT_REMOUNT) {
+ readonly_changed_cb(vfsp->vfs_data, B_FALSE);
+ vfsp->vfs_flag |= VFS_REMOUNT;
+
+ /* refresh mount options */
+ zfs_unregister_callbacks(vfsp->vfs_data);
+ return (zfs_register_callbacks(vfsp));
+
+ } else if (why == ROOT_UNMOUNT) {
+ zfs_unregister_callbacks((zfsvfs_t *)vfsp->vfs_data);
+ (void) zfs_sync(vfsp, 0, 0);
+ return (0);
+ }
+
+ /*
+ * if "why" is equal to anything else other than ROOT_INIT,
+ * ROOT_REMOUNT, or ROOT_UNMOUNT, we do not support it.
+ */
+ return (SET_ERROR(ENOTSUP));
+}
+#endif /* OPENSOLARIS_MOUNTROOT */
+
+static int
+getpoolname(const char *osname, char *poolname)
+{
+ char *p;
+
+ p = strchr(osname, '/');
+ if (p == NULL) {
+ if (strlen(osname) >= MAXNAMELEN)
+ return (ENAMETOOLONG);
+ (void) strcpy(poolname, osname);
+ } else {
+ if (p - osname >= MAXNAMELEN)
+ return (ENAMETOOLONG);
+ (void) strncpy(poolname, osname, p - osname);
+ poolname[p - osname] = '\0';
+ }
+ return (0);
+}
+
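+/*
+ * Parse and strip mount-time modifiers from the objset name: a leading
+ * '!' requests a checkpoint rewind when the pool is imported.
+ */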
+static void
+fetch_osname_options(char *name, bool *checkpointrewind)
+{
+
+ if (name[0] == '!') {
+ *checkpointrewind = true;
+ memmove(name, name + 1, strlen(name));
+ } else {
+ *checkpointrewind = false;
+ }
+}
+
+/*ARGSUSED*/
+static int
+zfs_mount(vfs_t *vfsp)
+{
+ kthread_t *td = curthread;
+ vnode_t *mvp = vfsp->mnt_vnodecovered;
+ cred_t *cr = td->td_ucred;
+ char *osname;
+ int error = 0;
+ int canwrite;
+ bool checkpointrewind;
+
+#ifdef illumos
+ if (mvp->v_type != VDIR)
+ return (SET_ERROR(ENOTDIR));
+
+ mutex_enter(&mvp->v_lock);
+ if ((uap->flags & MS_REMOUNT) == 0 &&
+ (uap->flags & MS_OVERLAY) == 0 &&
+ (mvp->v_count != 1 || (mvp->v_flag & VROOT))) {
+ mutex_exit(&mvp->v_lock);
+ return (SET_ERROR(EBUSY));
+ }
+ mutex_exit(&mvp->v_lock);
+
+ /*
+ * ZFS does not support passing unparsed data in via MS_DATA.
+ * Users should use the MS_OPTIONSTR interface; this means
+ * that all option parsing is already done and the options struct
+ * can be interrogated.
+ */
+ if ((uap->flags & MS_DATA) && uap->datalen > 0)
+ return (SET_ERROR(EINVAL));
+
+ /*
+ * Get the objset name (the "special" mount argument).
+ */
+ if (error = pn_get(uap->spec, fromspace, &spn))
+ return (error);
+
+ osname = spn.pn_path;
+#else /* !illumos */
+ if (vfs_getopt(vfsp->mnt_optnew, "from", (void **)&osname, NULL))
+ return (SET_ERROR(EINVAL));
+
+ /*
+ * If full-owner-access is enabled and delegated administration is
+ * turned on, we must set nosuid.
+ */
+ if (zfs_super_owner &&
+ dsl_deleg_access(osname, ZFS_DELEG_PERM_MOUNT, cr) != ECANCELED) {
+ secpolicy_fs_mount_clearopts(cr, vfsp);
+ }
+#endif /* illumos */
+ fetch_osname_options(osname, &checkpointrewind);
+
+ /*
+	 * Check for mount privilege.
+	 *
+	 * If we don't have privilege, then see if we have local
+	 * (delegated) permission to allow it.
+ */
+ error = secpolicy_fs_mount(cr, mvp, vfsp);
+ if (error) {
+ if (dsl_deleg_access(osname, ZFS_DELEG_PERM_MOUNT, cr) != 0)
+ goto out;
+
+ if (!(vfsp->vfs_flag & MS_REMOUNT)) {
+ vattr_t vattr;
+
+ /*
+ * Make sure user is the owner of the mount point
+ * or has sufficient privileges.
+ */
+
+ vattr.va_mask = AT_UID;
+
+ vn_lock(mvp, LK_SHARED | LK_RETRY);
+ if (VOP_GETATTR(mvp, &vattr, cr)) {
+ VOP_UNLOCK(mvp);
+ goto out;
+ }
+
+ if (secpolicy_vnode_owner(mvp, cr, vattr.va_uid) != 0 &&
+ VOP_ACCESS(mvp, VWRITE, cr, td) != 0) {
+ VOP_UNLOCK(mvp);
+ goto out;
+ }
+ VOP_UNLOCK(mvp);
+ }
+
+ secpolicy_fs_mount_clearopts(cr, vfsp);
+ }
+
+ /*
+ * Refuse to mount a filesystem if we are in a local zone and the
+ * dataset is not visible.
+ */
+ if (!INGLOBALZONE(curthread) &&
+ (!zone_dataset_visible(osname, &canwrite) || !canwrite)) {
+ error = SET_ERROR(EPERM);
+ goto out;
+ }
+
+#ifdef SECLABEL
+ error = zfs_mount_label_policy(vfsp, osname);
+ if (error)
+ goto out;
+#endif
+
+ vfsp->vfs_flag |= MNT_NFS4ACLS;
+
+ /*
+ * When doing a remount, we simply refresh our temporary properties
+ * according to those options set in the current VFS options.
+ */
+ if (vfsp->vfs_flag & MS_REMOUNT) {
+ zfsvfs_t *zfsvfs = vfsp->vfs_data;
+
+ /*
+ * Refresh mount options with z_teardown_lock blocking I/O while
+ * the filesystem is in an inconsistent state.
+ * The lock also serializes this code with filesystem
+ * manipulations between entry to zfs_suspend_fs() and return
+ * from zfs_resume_fs().
+ */
+ rrm_enter(&zfsvfs->z_teardown_lock, RW_WRITER, FTAG);
+ zfs_unregister_callbacks(zfsvfs);
+ error = zfs_register_callbacks(vfsp);
+ rrm_exit(&zfsvfs->z_teardown_lock, FTAG);
+ goto out;
+ }
+
+ /* Initial root mount: try hard to import the requested root pool. */
+ if ((vfsp->vfs_flag & MNT_ROOTFS) != 0 &&
+ (vfsp->vfs_flag & MNT_UPDATE) == 0) {
+ char pname[MAXNAMELEN];
+
+ error = getpoolname(osname, pname);
+ if (error == 0)
+ error = spa_import_rootpool(pname, checkpointrewind);
+ if (error)
+ goto out;
+ }
+ DROP_GIANT();
+ error = zfs_domount(vfsp, osname);
+ PICKUP_GIANT();
+
+#ifdef illumos
+ /*
+ * Add an extra VFS_HOLD on our parent vfs so that it can't
+ * disappear due to a forced unmount.
+ */
+ if (error == 0 && ((zfsvfs_t *)vfsp->vfs_data)->z_issnap)
+ VFS_HOLD(mvp->v_vfsp);
+#endif
+
+out:
+ return (error);
+}
+
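+/*
+ * VFS_STATFS entry point. Space is reported in f_bsize units of
+ * SPA_MINBLOCKSIZE, since the underlying pool uses multiple block
+ * sizes; see the comments below for how the block and file counts
+ * are derived.
+ */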
+static int
+zfs_statfs(vfs_t *vfsp, struct statfs *statp)
+{
+ zfsvfs_t *zfsvfs = vfsp->vfs_data;
+ uint64_t refdbytes, availbytes, usedobjs, availobjs;
+
+ statp->f_version = STATFS_VERSION;
+
+ ZFS_ENTER(zfsvfs);
+
+ dmu_objset_space(zfsvfs->z_os,
+ &refdbytes, &availbytes, &usedobjs, &availobjs);
+
+ /*
+ * The underlying storage pool actually uses multiple block sizes.
+ * We report the fragsize as the smallest block size we support,
+ * and we report our blocksize as the filesystem's maximum blocksize.
+ */
+ statp->f_bsize = SPA_MINBLOCKSIZE;
+ statp->f_iosize = zfsvfs->z_vfs->mnt_stat.f_iosize;
+
+ /*
+ * The following report "total" blocks of various kinds in the
+	 * file system, but reported in terms of f_bsize - the
+ * "fragment" size.
+ */
+
+ statp->f_blocks = (refdbytes + availbytes) >> SPA_MINBLOCKSHIFT;
+ statp->f_bfree = availbytes / statp->f_bsize;
+ statp->f_bavail = statp->f_bfree; /* no root reservation */
+
+ /*
+ * statvfs() should really be called statufs(), because it assumes
+ * static metadata. ZFS doesn't preallocate files, so the best
+ * we can do is report the max that could possibly fit in f_files,
+ * and that minus the number actually used in f_ffree.
+	 * For f_ffree, report the smaller of the number of objects available
+ * and the number of blocks (each object will take at least a block).
+ */
+ statp->f_ffree = MIN(availobjs, statp->f_bfree);
+ statp->f_files = statp->f_ffree + usedobjs;
+
+ /*
+ * We're a zfs filesystem.
+ */
+ (void) strlcpy(statp->f_fstypename, "zfs", sizeof(statp->f_fstypename));
+
+ strlcpy(statp->f_mntfromname, vfsp->mnt_stat.f_mntfromname,
+ sizeof(statp->f_mntfromname));
+ strlcpy(statp->f_mntonname, vfsp->mnt_stat.f_mntonname,
+ sizeof(statp->f_mntonname));
+
+ statp->f_namemax = MAXNAMELEN - 1;
+
+ ZFS_EXIT(zfsvfs);
+ return (0);
+}
+
+static int
+zfs_root(vfs_t *vfsp, int flags, vnode_t **vpp)
+{
+ zfsvfs_t *zfsvfs = vfsp->vfs_data;
+ znode_t *rootzp;
+ int error;
+
+ ZFS_ENTER(zfsvfs);
+
+ error = zfs_zget(zfsvfs, zfsvfs->z_root, &rootzp);
+ if (error == 0)
+ *vpp = ZTOV(rootzp);
+
+ ZFS_EXIT(zfsvfs);
+
+ if (error == 0) {
+ error = vn_lock(*vpp, flags);
+ if (error != 0) {
+ VN_RELE(*vpp);
+ *vpp = NULL;
+ }
+ }
+ return (error);
+}
+
+/*
+ * Tear down zfsvfs::z_os.
+ *
+ * Note, if 'unmounting' is FALSE, we return with the 'z_teardown_lock'
+ * and 'z_teardown_inactive_lock' held.
+ */
+static int
+zfsvfs_teardown(zfsvfs_t *zfsvfs, boolean_t unmounting)
+{
+ znode_t *zp;
+
+ rrm_enter(&zfsvfs->z_teardown_lock, RW_WRITER, FTAG);
+
+ if (!unmounting) {
+ /*
+ * We purge the parent filesystem's vfsp as the parent
+ * filesystem and all of its snapshots have their vnode's
+ * v_vfsp set to the parent's filesystem's vfsp. Note,
+ * 'z_parent' is self referential for non-snapshots.
+ */
+ (void) dnlc_purge_vfsp(zfsvfs->z_parent->z_vfs, 0);
+#ifdef FREEBSD_NAMECACHE
+ cache_purgevfs(zfsvfs->z_parent->z_vfs, true);
+#endif
+ }
+
+ /*
+ * Close the zil. NB: Can't close the zil while zfs_inactive
+ * threads are blocked as zil_close can call zfs_inactive.
+ */
+ if (zfsvfs->z_log) {
+ zil_close(zfsvfs->z_log);
+ zfsvfs->z_log = NULL;
+ }
+
+ ZFS_WLOCK_TEARDOWN_INACTIVE(zfsvfs);
+
+ /*
+	 * If we are not unmounting (i.e., online recv) and someone already
+	 * unmounted this file system while we were doing the switcheroo,
+	 * or a reopen of z_os failed, then just bail out now.
+ */
+ if (!unmounting && (zfsvfs->z_unmounted || zfsvfs->z_os == NULL)) {
+ ZFS_WUNLOCK_TEARDOWN_INACTIVE(zfsvfs);
+ rrm_exit(&zfsvfs->z_teardown_lock, FTAG);
+ return (SET_ERROR(EIO));
+ }
+
+ /*
+ * At this point there are no vops active, and any new vops will
+ * fail with EIO since we have z_teardown_lock for writer (only
+	 * relevant for forced unmount).
+ *
+ * Release all holds on dbufs.
+ */
+ mutex_enter(&zfsvfs->z_znodes_lock);
+ for (zp = list_head(&zfsvfs->z_all_znodes); zp != NULL;
+ zp = list_next(&zfsvfs->z_all_znodes, zp))
+ if (zp->z_sa_hdl) {
+ ASSERT(ZTOV(zp)->v_count >= 0);
+ zfs_znode_dmu_fini(zp);
+ }
+ mutex_exit(&zfsvfs->z_znodes_lock);
+
+ /*
+ * If we are unmounting, set the unmounted flag and let new vops
+ * unblock. zfs_inactive will have the unmounted behavior, and all
+ * other vops will fail with EIO.
+ */
+ if (unmounting) {
+ zfsvfs->z_unmounted = B_TRUE;
+ ZFS_WUNLOCK_TEARDOWN_INACTIVE(zfsvfs);
+ rrm_exit(&zfsvfs->z_teardown_lock, FTAG);
+ }
+
+ /*
+	 * z_os will be NULL if there was an error in attempting to reopen
+	 * zfsvfs, so just return, as the properties have already been
+	 * unregistered and the cached data has already been evicted.
+ */
+ if (zfsvfs->z_os == NULL)
+ return (0);
+
+ /*
+ * Unregister properties.
+ */
+ zfs_unregister_callbacks(zfsvfs);
+
+ /*
+ * Evict cached data
+ */
+ if (dsl_dataset_is_dirty(dmu_objset_ds(zfsvfs->z_os)) &&
+ !(zfsvfs->z_vfs->vfs_flag & VFS_RDONLY))
+ txg_wait_synced(dmu_objset_pool(zfsvfs->z_os), 0);
+ dmu_objset_evict_dbufs(zfsvfs->z_os);
+
+ return (0);
+}
+
+/*ARGSUSED*/
+static int
+zfs_umount(vfs_t *vfsp, int fflag)
+{
+ kthread_t *td = curthread;
+ zfsvfs_t *zfsvfs = vfsp->vfs_data;
+ objset_t *os;
+ cred_t *cr = td->td_ucred;
+ int ret;
+
+ ret = secpolicy_fs_unmount(cr, vfsp);
+ if (ret) {
+ if (dsl_deleg_access((char *)refstr_value(vfsp->vfs_resource),
+ ZFS_DELEG_PERM_MOUNT, cr))
+ return (ret);
+ }
+
+ /*
+ * We purge the parent filesystem's vfsp as the parent filesystem
+ * and all of its snapshots have their vnode's v_vfsp set to the
+ * parent's filesystem's vfsp. Note, 'z_parent' is self
+ * referential for non-snapshots.
+ */
+ (void) dnlc_purge_vfsp(zfsvfs->z_parent->z_vfs, 0);
+
+ /*
+ * Unmount any snapshots mounted under .zfs before unmounting the
+ * dataset itself.
+ */
+ if (zfsvfs->z_ctldir != NULL) {
+ if ((ret = zfsctl_umount_snapshots(vfsp, fflag, cr)) != 0)
+ return (ret);
+ }
+
+ if (fflag & MS_FORCE) {
+ /*
+ * Mark file system as unmounted before calling
+ * vflush(FORCECLOSE). This way we ensure no future vnops
+ * will be called and risk operating on DOOMED vnodes.
+ */
+ rrm_enter(&zfsvfs->z_teardown_lock, RW_WRITER, FTAG);
+ zfsvfs->z_unmounted = B_TRUE;
+ rrm_exit(&zfsvfs->z_teardown_lock, FTAG);
+ }
+
+ /*
+ * Flush all the files.
+ */
+ ret = vflush(vfsp, 0, (fflag & MS_FORCE) ? FORCECLOSE : 0, td);
+ if (ret != 0)
+ return (ret);
+
+#ifdef illumos
+ if (!(fflag & MS_FORCE)) {
+ /*
+ * Check the number of active vnodes in the file system.
+ * Our count is maintained in the vfs structure, but the
+ * number is off by 1 to indicate a hold on the vfs
+ * structure itself.
+ *
+ * The '.zfs' directory maintains a reference of its
+ * own, and any active references underneath are
+ * reflected in the vnode count.
+ */
+ if (zfsvfs->z_ctldir == NULL) {
+ if (vfsp->vfs_count > 1)
+ return (SET_ERROR(EBUSY));
+ } else {
+ if (vfsp->vfs_count > 2 ||
+ zfsvfs->z_ctldir->v_count > 1)
+ return (SET_ERROR(EBUSY));
+ }
+ }
+#endif
+
+ while (taskqueue_cancel(zfsvfs_taskq->tq_queue,
+ &zfsvfs->z_unlinked_drain_task, NULL) != 0)
+ taskqueue_drain(zfsvfs_taskq->tq_queue,
+ &zfsvfs->z_unlinked_drain_task);
+
+ VERIFY(zfsvfs_teardown(zfsvfs, B_TRUE) == 0);
+ os = zfsvfs->z_os;
+
+ /*
+ * z_os will be NULL if there was an error in
+ * attempting to reopen zfsvfs.
+ */
+ if (os != NULL) {
+ /*
+ * Unset the objset user_ptr.
+ */
+ mutex_enter(&os->os_user_ptr_lock);
+ dmu_objset_set_user(os, NULL);
+ mutex_exit(&os->os_user_ptr_lock);
+
+ /*
+ * Finally release the objset
+ */
+ dmu_objset_disown(os, zfsvfs);
+ }
+
+ /*
+ * We can now safely destroy the '.zfs' directory node.
+ */
+ if (zfsvfs->z_ctldir != NULL)
+ zfsctl_destroy(zfsvfs);
+ zfs_freevfs(vfsp);
+
+ return (0);
+}
+
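+/*
+ * VFS_VGET entry point, used by NFS to map an inode number back to a
+ * vnode. Virtual .zfs entries are refused with EOPNOTSUPP so that NFS
+ * falls back to LOOKUP.
+ */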
+static int
+zfs_vget(vfs_t *vfsp, ino_t ino, int flags, vnode_t **vpp)
+{
+ zfsvfs_t *zfsvfs = vfsp->vfs_data;
+ znode_t *zp;
+ int err;
+
+ /*
+ * zfs_zget() can't operate on virtual entries like .zfs/ or
+	 * .zfs/snapshot/ directories, so we return EOPNOTSUPP.
+	 * This makes NFS switch to LOOKUP instead of using VGET.
+ */
+ if (ino == ZFSCTL_INO_ROOT || ino == ZFSCTL_INO_SNAPDIR ||
+ (zfsvfs->z_shares_dir != 0 && ino == zfsvfs->z_shares_dir))
+ return (EOPNOTSUPP);
+
+ ZFS_ENTER(zfsvfs);
+ err = zfs_zget(zfsvfs, ino, &zp);
+ if (err == 0 && zp->z_unlinked) {
+ vrele(ZTOV(zp));
+ err = EINVAL;
+ }
+ if (err == 0)
+ *vpp = ZTOV(zp);
+ ZFS_EXIT(zfsvfs);
+ if (err == 0) {
+ err = vn_lock(*vpp, flags);
+ if (err != 0)
+ vrele(*vpp);
+ }
+ if (err != 0)
+ *vpp = NULL;
+ return (err);
+}
+
+static int
+zfs_checkexp(vfs_t *vfsp, struct sockaddr *nam, uint64_t *extflagsp,
+ struct ucred **credanonp, int *numsecflavors, int *secflavors)
+{
+ zfsvfs_t *zfsvfs = vfsp->vfs_data;
+
+ /*
+ * If this is regular file system vfsp is the same as
+ * zfsvfs->z_parent->z_vfs, but if it is snapshot,
+ * zfsvfs->z_parent->z_vfs represents parent file system
+ * which we have to use here, because only this file system
+ * has mnt_export configured.
+ */
+ return (vfs_stdcheckexp(zfsvfs->z_parent->z_vfs, nam, extflagsp,
+ credanonp, numsecflavors, secflavors));
+}
+
+CTASSERT(SHORT_FID_LEN <= sizeof(struct fid));
+CTASSERT(LONG_FID_LEN <= sizeof(struct fid));
+
+static int
+zfs_fhtovp(vfs_t *vfsp, fid_t *fidp, int flags, vnode_t **vpp)
+{
+ struct componentname cn;
+ zfsvfs_t *zfsvfs = vfsp->vfs_data;
+ znode_t *zp;
+ vnode_t *dvp;
+ uint64_t object = 0;
+ uint64_t fid_gen = 0;
+ uint64_t gen_mask;
+ uint64_t zp_gen;
+ int i, err;
+
+ *vpp = NULL;
+
+ ZFS_ENTER(zfsvfs);
+
+ /*
+	 * On FreeBSD we can get the snapshot's mount point or its parent
+	 * file system's mount point, depending on whether the snapshot is
+	 * already mounted.
+ */
+ if (zfsvfs->z_parent == zfsvfs && fidp->fid_len == LONG_FID_LEN) {
+ zfid_long_t *zlfid = (zfid_long_t *)fidp;
+ uint64_t objsetid = 0;
+ uint64_t setgen = 0;
+
+ for (i = 0; i < sizeof (zlfid->zf_setid); i++)
+ objsetid |= ((uint64_t)zlfid->zf_setid[i]) << (8 * i);
+
+ for (i = 0; i < sizeof (zlfid->zf_setgen); i++)
+ setgen |= ((uint64_t)zlfid->zf_setgen[i]) << (8 * i);
+
+ ZFS_EXIT(zfsvfs);
+
+ err = zfsctl_lookup_objset(vfsp, objsetid, &zfsvfs);
+ if (err)
+ return (SET_ERROR(EINVAL));
+ ZFS_ENTER(zfsvfs);
+ }
+
+ if (fidp->fid_len == SHORT_FID_LEN || fidp->fid_len == LONG_FID_LEN) {
+ zfid_short_t *zfid = (zfid_short_t *)fidp;
+
+ for (i = 0; i < sizeof (zfid->zf_object); i++)
+ object |= ((uint64_t)zfid->zf_object[i]) << (8 * i);
+
+ for (i = 0; i < sizeof (zfid->zf_gen); i++)
+ fid_gen |= ((uint64_t)zfid->zf_gen[i]) << (8 * i);
+ } else {
+ ZFS_EXIT(zfsvfs);
+ return (SET_ERROR(EINVAL));
+ }
+
+ /*
+ * A zero fid_gen means we are in .zfs or the .zfs/snapshot
+ * directory tree. If the object == zfsvfs->z_shares_dir, then
+ * we are in the .zfs/shares directory tree.
+ */
+ if ((fid_gen == 0 &&
+ (object == ZFSCTL_INO_ROOT || object == ZFSCTL_INO_SNAPDIR)) ||
+ (zfsvfs->z_shares_dir != 0 && object == zfsvfs->z_shares_dir)) {
+ ZFS_EXIT(zfsvfs);
+ VERIFY0(zfsctl_root(zfsvfs, LK_SHARED, &dvp));
+ if (object == ZFSCTL_INO_SNAPDIR) {
+ cn.cn_nameptr = "snapshot";
+ cn.cn_namelen = strlen(cn.cn_nameptr);
+ cn.cn_nameiop = LOOKUP;
+ cn.cn_flags = ISLASTCN | LOCKLEAF;
+ cn.cn_lkflags = flags;
+ VERIFY0(VOP_LOOKUP(dvp, vpp, &cn));
+ vput(dvp);
+ } else if (object == zfsvfs->z_shares_dir) {
+ /*
+			 * XXX This branch must not be taken;
+			 * if it is, the lookup below will
+			 * explode.
+ */
+ cn.cn_nameptr = "shares";
+ cn.cn_namelen = strlen(cn.cn_nameptr);
+ cn.cn_nameiop = LOOKUP;
+ cn.cn_flags = ISLASTCN;
+ cn.cn_lkflags = flags;
+ VERIFY0(VOP_LOOKUP(dvp, vpp, &cn));
+ vput(dvp);
+ } else {
+ *vpp = dvp;
+ }
+		/* The lookups above are VERIFYed, so this is success. */
+		return (0);
+ }
+
+ gen_mask = -1ULL >> (64 - 8 * i);
+
+	dprintf("getting %llu [%llu mask %llx]\n", (u_longlong_t)object,
+	    (u_longlong_t)fid_gen, (u_longlong_t)gen_mask);
+ if (err = zfs_zget(zfsvfs, object, &zp)) {
+ ZFS_EXIT(zfsvfs);
+ return (err);
+ }
+ (void) sa_lookup(zp->z_sa_hdl, SA_ZPL_GEN(zfsvfs), &zp_gen,
+ sizeof (uint64_t));
+ zp_gen = zp_gen & gen_mask;
+ if (zp_gen == 0)
+ zp_gen = 1;
+ if (zp->z_unlinked || zp_gen != fid_gen) {
+		dprintf("znode gen (%llu) != fid gen (%llu)\n",
+		    (u_longlong_t)zp_gen, (u_longlong_t)fid_gen);
+ vrele(ZTOV(zp));
+ ZFS_EXIT(zfsvfs);
+ return (SET_ERROR(EINVAL));
+ }
+
+ *vpp = ZTOV(zp);
+ ZFS_EXIT(zfsvfs);
+ err = vn_lock(*vpp, flags);
+ if (err == 0)
+ vnode_create_vobject(*vpp, zp->z_size, curthread);
+ else
+ *vpp = NULL;
+ return (err);
+}
+
+/*
+ * Block out VOPs and close zfsvfs_t::z_os
+ *
+ * Note, if successful, then we return with the 'z_teardown_lock' and
+ * 'z_teardown_inactive_lock' write held. We leave ownership of the underlying
+ * dataset and objset intact so that they can be atomically handed off during
+ * a subsequent rollback or recv operation and the resume thereafter.
+ */
+int
+zfs_suspend_fs(zfsvfs_t *zfsvfs)
+{
+ int error;
+
+ if ((error = zfsvfs_teardown(zfsvfs, B_FALSE)) != 0)
+ return (error);
+
+ return (0);
+}
+
+/*
+ * Rebuild SA and release VOPs. Note that ownership of the underlying dataset
+ * is an invariant across any of the operations that can be performed while the
+ * filesystem was suspended. Whether it succeeded or failed, the preconditions
+ * are the same: the relevant objset and associated dataset are owned by
+ * zfsvfs, held, and long held on entry.
+ */
+int
+zfs_resume_fs(zfsvfs_t *zfsvfs, dsl_dataset_t *ds)
+{
+ int err;
+ znode_t *zp;
+
+ ASSERT(RRM_WRITE_HELD(&zfsvfs->z_teardown_lock));
+ ASSERT(ZFS_TEARDOWN_INACTIVE_WLOCKED(zfsvfs));
+
+ /*
+ * We already own this, so just update the objset_t, as the one we
+ * had before may have been evicted.
+ */
+ objset_t *os;
+ VERIFY3P(ds->ds_owner, ==, zfsvfs);
+ VERIFY(dsl_dataset_long_held(ds));
+ VERIFY0(dmu_objset_from_ds(ds, &os));
+
+ err = zfsvfs_init(zfsvfs, os);
+ if (err != 0)
+ goto bail;
+
+ VERIFY(zfsvfs_setup(zfsvfs, B_FALSE) == 0);
+
+ zfs_set_fuid_feature(zfsvfs);
+
+ /*
+ * Attempt to re-establish all the active znodes with
+ * their dbufs. If a zfs_rezget() fails, then we'll let
+ * any potential callers discover that via ZFS_ENTER_VERIFY_VP
+ * when they try to use their znode.
+ */
+ mutex_enter(&zfsvfs->z_znodes_lock);
+ for (zp = list_head(&zfsvfs->z_all_znodes); zp;
+ zp = list_next(&zfsvfs->z_all_znodes, zp)) {
+ (void) zfs_rezget(zp);
+ }
+ mutex_exit(&zfsvfs->z_znodes_lock);
+
+bail:
+ /* release the VOPs */
+ ZFS_WUNLOCK_TEARDOWN_INACTIVE(zfsvfs);
+ rrm_exit(&zfsvfs->z_teardown_lock, FTAG);
+
+ if (err) {
+ /*
+ * Since we couldn't setup the sa framework, try to force
+ * unmount this file system.
+ */
+ if (vn_vfswlock(zfsvfs->z_vfs->vfs_vnodecovered) == 0) {
+ vfs_ref(zfsvfs->z_vfs);
+ (void) dounmount(zfsvfs->z_vfs, MS_FORCE, curthread);
+ }
+ }
+ return (err);
+}
+
+static void
+zfs_freevfs(vfs_t *vfsp)
+{
+ zfsvfs_t *zfsvfs = vfsp->vfs_data;
+
+#ifdef illumos
+ /*
+ * If this is a snapshot, we have an extra VFS_HOLD on our parent
+ * from zfs_mount(). Release it here. If we came through
+ * zfs_mountroot() instead, we didn't grab an extra hold, so
+ * skip the VFS_RELE for rootvfs.
+ */
+ if (zfsvfs->z_issnap && (vfsp != rootvfs))
+ VFS_RELE(zfsvfs->z_parent->z_vfs);
+#endif
+
+ zfsvfs_free(zfsvfs);
+
+ atomic_dec_32(&zfs_active_fs_count);
+}
+
+#ifdef __i386__
+static int desiredvnodes_backup;
+#endif
+
+static void
+zfs_vnodes_adjust(void)
+{
+#ifdef __i386__
+ int newdesiredvnodes;
+
+ desiredvnodes_backup = desiredvnodes;
+
+ /*
+ * We calculate newdesiredvnodes the same way it is done in
+ * vntblinit(). If it is equal to desiredvnodes, it means that
+ * it wasn't tuned by the administrator and we can tune it down.
+ */
+ newdesiredvnodes = min(maxproc + vm_cnt.v_page_count / 4, 2 *
+ vm_kmem_size / (5 * (sizeof(struct vm_object) +
+ sizeof(struct vnode))));
+ if (newdesiredvnodes == desiredvnodes)
+ desiredvnodes = (3 * newdesiredvnodes) / 4;
+#endif
+}
+
+static void
+zfs_vnodes_adjust_back(void)
+{
+
+#ifdef __i386__
+ desiredvnodes = desiredvnodes_backup;
+#endif
+}
+
+void
+zfs_init(void)
+{
+
+ printf("ZFS filesystem version: " ZPL_VERSION_STRING "\n");
+
+ /*
+ * Initialize .zfs directory structures
+ */
+ zfsctl_init();
+
+ /*
+ * Initialize znode cache, vnode ops, etc...
+ */
+ zfs_znode_init();
+
+ /*
+	 * Reduce the number of vnodes. The default number of vnodes is
+	 * calculated with UFS inodes in mind. We reduce it here because
+	 * it's too big for ZFS/i386.
+ */
+ zfs_vnodes_adjust();
+
+ dmu_objset_register_type(DMU_OST_ZFS, zfs_space_delta_cb);
+#if defined(__FreeBSD__)
+ zfsvfs_taskq = taskq_create("zfsvfs", 1, minclsyspri, 0, 0, 0);
+#endif
+}
+
+void
+zfs_fini(void)
+{
+#if defined(__FreeBSD__)
+ taskq_destroy(zfsvfs_taskq);
+#endif
+ zfsctl_fini();
+ zfs_znode_fini();
+ zfs_vnodes_adjust_back();
+}
+
+int
+zfs_busy(void)
+{
+ return (zfs_active_fs_count != 0);
+}
+
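+/*
+ * Upgrade the on-disk ZPL version. An upgrade to a version with
+ * system attribute (SA) support also creates the SA master node and
+ * registers the SA upgrade callback.
+ */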
+int
+zfs_set_version(zfsvfs_t *zfsvfs, uint64_t newvers)
+{
+ int error;
+ objset_t *os = zfsvfs->z_os;
+ dmu_tx_t *tx;
+
+ if (newvers < ZPL_VERSION_INITIAL || newvers > ZPL_VERSION)
+ return (SET_ERROR(EINVAL));
+
+ if (newvers < zfsvfs->z_version)
+ return (SET_ERROR(EINVAL));
+
+ if (zfs_spa_version_map(newvers) >
+ spa_version(dmu_objset_spa(zfsvfs->z_os)))
+ return (SET_ERROR(ENOTSUP));
+
+ tx = dmu_tx_create(os);
+ dmu_tx_hold_zap(tx, MASTER_NODE_OBJ, B_FALSE, ZPL_VERSION_STR);
+ if (newvers >= ZPL_VERSION_SA && !zfsvfs->z_use_sa) {
+ dmu_tx_hold_zap(tx, MASTER_NODE_OBJ, B_TRUE,
+ ZFS_SA_ATTRS);
+ dmu_tx_hold_zap(tx, DMU_NEW_OBJECT, FALSE, NULL);
+ }
+ error = dmu_tx_assign(tx, TXG_WAIT);
+ if (error) {
+ dmu_tx_abort(tx);
+ return (error);
+ }
+
+ error = zap_update(os, MASTER_NODE_OBJ, ZPL_VERSION_STR,
+ 8, 1, &newvers, tx);
+
+ if (error) {
+ dmu_tx_commit(tx);
+ return (error);
+ }
+
+ if (newvers >= ZPL_VERSION_SA && !zfsvfs->z_use_sa) {
+ uint64_t sa_obj;
+
+ ASSERT3U(spa_version(dmu_objset_spa(zfsvfs->z_os)), >=,
+ SPA_VERSION_SA);
+ sa_obj = zap_create(os, DMU_OT_SA_MASTER_NODE,
+ DMU_OT_NONE, 0, tx);
+
+ error = zap_add(os, MASTER_NODE_OBJ,
+ ZFS_SA_ATTRS, 8, 1, &sa_obj, tx);
+ ASSERT0(error);
+
+ VERIFY(0 == sa_set_sa_object(os, sa_obj));
+ sa_register_update_callback(os, zfs_sa_upgrade);
+ }
+
+ spa_history_log_internal_ds(dmu_objset_ds(os), "upgrade", tx,
+ "from %llu to %llu", zfsvfs->z_version, newvers);
+
+ dmu_tx_commit(tx);
+
+ zfsvfs->z_version = newvers;
+ os->os_version = newvers;
+
+ zfs_set_fuid_feature(zfsvfs);
+
+ return (0);
+}
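+
+/*
+ * A minimal usage sketch (hypothetical caller; error handling elided)
+ * for upgrading a file system far enough to use system attributes:
+ *
+ *	uint64_t cur;
+ *	if (zfs_get_zplprop(zfsvfs->z_os, ZFS_PROP_VERSION, &cur) == 0 &&
+ *	    cur < ZPL_VERSION_SA)
+ *		error = zfs_set_version(zfsvfs, ZPL_VERSION_SA);
+ *
+ * On success z_version and os_version are updated and, when crossing
+ * ZPL_VERSION_SA, the SA master node is created so that attributes can
+ * be upgraded lazily via zfs_sa_upgrade().
+ */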
+
+/*
+ * Read a property stored within the master node.
+ */
+int
+zfs_get_zplprop(objset_t *os, zfs_prop_t prop, uint64_t *value)
+{
+ uint64_t *cached_copy = NULL;
+
+ /*
+ * Figure out where in the objset_t the cached copy would live, if it
+ * is available for the requested property.
+ */
+ if (os != NULL) {
+ switch (prop) {
+ case ZFS_PROP_VERSION:
+ cached_copy = &os->os_version;
+ break;
+ case ZFS_PROP_NORMALIZE:
+ cached_copy = &os->os_normalization;
+ break;
+ case ZFS_PROP_UTF8ONLY:
+ cached_copy = &os->os_utf8only;
+ break;
+ case ZFS_PROP_CASE:
+ cached_copy = &os->os_casesensitivity;
+ break;
+ default:
+ break;
+ }
+ }
+ if (cached_copy != NULL && *cached_copy != OBJSET_PROP_UNINITIALIZED) {
+ *value = *cached_copy;
+ return (0);
+ }
+
+ /*
+ * If the property wasn't cached, look up the file system's value for
+ * the property. For the version property, we look up a slightly
+ * different string.
+ */
+ const char *pname;
+ int error = ENOENT;
+ if (prop == ZFS_PROP_VERSION) {
+ pname = ZPL_VERSION_STR;
+ } else {
+ pname = zfs_prop_to_name(prop);
+ }
+
+ if (os != NULL) {
+ ASSERT3U(os->os_phys->os_type, ==, DMU_OST_ZFS);
+ error = zap_lookup(os, MASTER_NODE_OBJ, pname, 8, 1, value);
+ }
+
+ if (error == ENOENT) {
+ /* No value set, use the default value */
+ switch (prop) {
+ case ZFS_PROP_VERSION:
+ *value = ZPL_VERSION;
+ break;
+ case ZFS_PROP_NORMALIZE:
+ case ZFS_PROP_UTF8ONLY:
+ *value = 0;
+ break;
+ case ZFS_PROP_CASE:
+ *value = ZFS_CASE_SENSITIVE;
+ break;
+ default:
+ return (error);
+ }
+ error = 0;
+ }
+
+ /*
+ * If one of the methods for getting the property value above worked,
+ * copy it into the objset_t's cache.
+ */
+ if (error == 0 && cached_copy != NULL) {
+ *cached_copy = *value;
+ }
+
+ return (error);
+}
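+
+/*
+ * A minimal usage sketch (use_casefold_compare() is hypothetical;
+ * error handling elided) for deciding whether name comparisons must
+ * fold case:
+ *
+ *	uint64_t sense;
+ *	if (zfs_get_zplprop(os, ZFS_PROP_CASE, &sense) == 0 &&
+ *	    sense != ZFS_CASE_SENSITIVE)
+ *		use_casefold_compare();
+ *
+ * Passing a NULL objset is allowed; it skips both the cache and the
+ * ZAP lookup and simply yields the property's default value.
+ */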
+
+/*
+ * Return true if the corresponding vfs's unmounted flag is set,
+ * false otherwise. If this function returns true we know VFS unmount
+ * has been initiated.
+ */
+boolean_t
+zfs_get_vfs_flag_unmounted(objset_t *os)
+{
+ zfsvfs_t *zfvp;
+ boolean_t unmounted = B_FALSE;
+
+ ASSERT(dmu_objset_type(os) == DMU_OST_ZFS);
+
+ mutex_enter(&os->os_user_ptr_lock);
+ zfvp = dmu_objset_get_user(os);
+ if (zfvp != NULL && zfvp->z_vfs != NULL &&
+ (zfvp->z_vfs->mnt_kern_flag & MNTK_UNMOUNT))
+ unmounted = B_TRUE;
+ mutex_exit(&os->os_user_ptr_lock);
+
+ return (unmounted);
+}
+
+#ifdef _KERNEL
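+/*
+ * Update the cached mount-from names after a dataset rename. An exact
+ * match on oldname is replaced with newname outright; a prefix match is
+ * rewritten only when the next character is '/' or '@', so that renaming
+ * "tank/fs" to "tank/fs2" also updates descendants such as
+ * "tank/fs/child" and snapshots such as "tank/fs@snap", while an
+ * unrelated "tank/fsother" is left untouched.
+ */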
+void
+zfsvfs_update_fromname(const char *oldname, const char *newname)
+{
+ char tmpbuf[MAXPATHLEN];
+ struct mount *mp;
+ char *fromname;
+ size_t oldlen;
+
+ oldlen = strlen(oldname);
+
+ mtx_lock(&mountlist_mtx);
+ TAILQ_FOREACH(mp, &mountlist, mnt_list) {
+ fromname = mp->mnt_stat.f_mntfromname;
+ if (strcmp(fromname, oldname) == 0) {
+ (void)strlcpy(fromname, newname,
+ sizeof(mp->mnt_stat.f_mntfromname));
+ continue;
+ }
+ if (strncmp(fromname, oldname, oldlen) == 0 &&
+ (fromname[oldlen] == '/' || fromname[oldlen] == '@')) {
+ (void)snprintf(tmpbuf, sizeof(tmpbuf), "%s%s",
+ newname, fromname + oldlen);
+ (void)strlcpy(fromname, tmpbuf,
+ sizeof(mp->mnt_stat.f_mntfromname));
+ continue;
+ }
+ }
+ mtx_unlock(&mountlist_mtx);
+}
+#endif
diff --git a/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/zfs_vnops.c b/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/zfs_vnops.c
new file mode 100644
index 000000000000..9ac9503d2f77
--- /dev/null
+++ b/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/zfs_vnops.c
@@ -0,0 +1,6124 @@
+/*
+ * CDDL HEADER START
+ *
+ * The contents of this file are subject to the terms of the
+ * Common Development and Distribution License (the "License").
+ * You may not use this file except in compliance with the License.
+ *
+ * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
+ * or http://www.opensolaris.org/os/licensing.
+ * See the License for the specific language governing permissions
+ * and limitations under the License.
+ *
+ * When distributing Covered Code, include this CDDL HEADER in each
+ * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
+ * If applicable, add the following below this CDDL HEADER, with the
+ * fields enclosed by brackets "[]" replaced with your own identifying
+ * information: Portions Copyright [yyyy] [name of copyright owner]
+ *
+ * CDDL HEADER END
+ */
+
+/*
+ * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved.
+ * Copyright (c) 2012, 2015 by Delphix. All rights reserved.
+ * Copyright (c) 2014 Integros [integros.com]
+ * Copyright 2017 Nexenta Systems, Inc.
+ */
+
+/* Portions Copyright 2007 Jeremy Teo */
+/* Portions Copyright 2010 Robert Milkowski */
+
+#include <sys/types.h>
+#include <sys/param.h>
+#include <sys/time.h>
+#include <sys/systm.h>
+#include <sys/sysmacros.h>
+#include <sys/resource.h>
+#include <sys/vfs.h>
+#include <sys/vm.h>
+#include <sys/vnode.h>
+#include <sys/smr.h>
+#include <sys/file.h>
+#include <sys/stat.h>
+#include <sys/kmem.h>
+#include <sys/taskq.h>
+#include <sys/uio.h>
+#include <sys/atomic.h>
+#include <sys/namei.h>
+#include <sys/mman.h>
+#include <sys/cmn_err.h>
+#include <sys/errno.h>
+#include <sys/unistd.h>
+#include <sys/zfs_dir.h>
+#include <sys/zfs_ioctl.h>
+#include <sys/fs/zfs.h>
+#include <sys/dmu.h>
+#include <sys/dmu_objset.h>
+#include <sys/spa.h>
+#include <sys/txg.h>
+#include <sys/dbuf.h>
+#include <sys/zap.h>
+#include <sys/sa.h>
+#include <sys/dirent.h>
+#include <sys/policy.h>
+#include <sys/sunddi.h>
+#include <sys/filio.h>
+#include <sys/sid.h>
+#include <sys/zfs_ctldir.h>
+#include <sys/zfs_fuid.h>
+#include <sys/zfs_sa.h>
+#include <sys/zfs_rlock.h>
+#include <sys/extdirent.h>
+#include <sys/kidmap.h>
+#include <sys/bio.h>
+#include <sys/buf.h>
+#include <sys/sched.h>
+#include <sys/acl.h>
+#include <sys/vmmeter.h>
+#include <vm/vm_param.h>
+#include <sys/zil.h>
+
+VFS_SMR_DECLARE;
+
+/*
+ * Programming rules.
+ *
+ * Each vnode op performs some logical unit of work. To do this, the ZPL must
+ * properly lock its in-core state, create a DMU transaction, do the work,
+ * record this work in the intent log (ZIL), commit the DMU transaction,
+ * and wait for the intent log to commit if it is a synchronous operation.
+ * Moreover, the vnode ops must work in both normal and log replay context.
+ * The ordering of events is important to avoid deadlocks and references
+ * to freed memory. The example below illustrates the following Big Rules:
+ *
+ * (1) A check must be made in each zfs thread for a mounted file system.
+ *	This is done while avoiding races by using ZFS_ENTER(zfsvfs).
+ * A ZFS_EXIT(zfsvfs) is needed before all returns. Any znodes
+ * must be checked with ZFS_VERIFY_ZP(zp). Both of these macros
+ * can return EIO from the calling function.
+ *
+ * (2) VN_RELE() should always be the last thing except for zil_commit()
+ * (if necessary) and ZFS_EXIT(). This is for 3 reasons:
+ * First, if it's the last reference, the vnode/znode
+ * can be freed, so the zp may point to freed memory. Second, the last
+ * reference will call zfs_zinactive(), which may induce a lot of work --
+ * pushing cached pages (which acquires range locks) and syncing out
+ * cached atime changes. Third, zfs_zinactive() may require a new tx,
+ * which could deadlock the system if you were already holding one.
+ * If you must call VN_RELE() within a tx then use VN_RELE_ASYNC().
+ *
+ * (3) All range locks must be grabbed before calling dmu_tx_assign(),
+ * as they can span dmu_tx_assign() calls.
+ *
+ * (4) If ZPL locks are held, pass TXG_NOWAIT as the second argument to
+ * dmu_tx_assign(). This is critical because we don't want to block
+ * while holding locks.
+ *
+ * If no ZPL locks are held (aside from ZFS_ENTER()), use TXG_WAIT. This
+ * reduces lock contention and CPU usage when we must wait (note that if
+ * throughput is constrained by the storage, nearly every transaction
+ * must wait).
+ *
+ * Note, in particular, that if a lock is sometimes acquired before
+ *	the tx is assigned, and sometimes after (e.g. z_lock), then failing
+ * to use a non-blocking assign can deadlock the system. The scenario:
+ *
+ * Thread A has grabbed a lock before calling dmu_tx_assign().
+ * Thread B is in an already-assigned tx, and blocks for this lock.
+ * Thread A calls dmu_tx_assign(TXG_WAIT) and blocks in txg_wait_open()
+ * forever, because the previous txg can't quiesce until B's tx commits.
+ *
+ * If dmu_tx_assign() returns ERESTART and zfsvfs->z_assign is TXG_NOWAIT,
+ * then drop all locks, call dmu_tx_wait(), and try again. On subsequent
+ * calls to dmu_tx_assign(), pass TXG_NOTHROTTLE in addition to TXG_NOWAIT,
+ * to indicate that this operation has already called dmu_tx_wait().
+ * This will ensure that we don't retry forever, waiting a short bit
+ * each time.
+ *
+ * (5) If the operation succeeded, generate the intent log entry for it
+ * before dropping locks. This ensures that the ordering of events
+ * in the intent log matches the order in which they actually occurred.
+ * During ZIL replay the zfs_log_* functions will update the sequence
+ * number to indicate the zil transaction has replayed.
+ *
+ * (6) At the end of each vnode op, the DMU tx must always commit,
+ * regardless of whether there were any errors.
+ *
+ * (7) After dropping all locks, invoke zil_commit(zilog, foid)
+ * to ensure that synchronous semantics are provided when necessary.
+ *
+ * In general, this is how things should be ordered in each vnode op:
+ *
+ * ZFS_ENTER(zfsvfs); // exit if unmounted
+ * top:
+ * zfs_dirent_lookup(&dl, ...) // lock directory entry (may VN_HOLD())
+ * rw_enter(...); // grab any other locks you need
+ * tx = dmu_tx_create(...); // get DMU tx
+ * dmu_tx_hold_*(); // hold each object you might modify
+ * error = dmu_tx_assign(tx, (waited ? TXG_NOTHROTTLE : 0) | TXG_NOWAIT);
+ * if (error) {
+ * rw_exit(...); // drop locks
+ * zfs_dirent_unlock(dl); // unlock directory entry
+ * VN_RELE(...); // release held vnodes
+ * if (error == ERESTART) {
+ * waited = B_TRUE;
+ * dmu_tx_wait(tx);
+ * dmu_tx_abort(tx);
+ * goto top;
+ * }
+ * dmu_tx_abort(tx); // abort DMU tx
+ * ZFS_EXIT(zfsvfs); // finished in zfs
+ * return (error); // really out of space
+ * }
+ * error = do_real_work(); // do whatever this VOP does
+ * if (error == 0)
+ * zfs_log_*(...); // on success, make ZIL entry
+ * dmu_tx_commit(tx); // commit DMU tx -- error or not
+ * rw_exit(...); // drop locks
+ * zfs_dirent_unlock(dl); // unlock directory entry
+ * VN_RELE(...); // release held vnodes
+ * zil_commit(zilog, foid); // synchronous when necessary
+ * ZFS_EXIT(zfsvfs); // finished in zfs
+ * return (error); // done, report error
+ */
+
+/* ARGSUSED */
+static int
+zfs_open(vnode_t **vpp, int flag, cred_t *cr, caller_context_t *ct)
+{
+ znode_t *zp = VTOZ(*vpp);
+ zfsvfs_t *zfsvfs = zp->z_zfsvfs;
+
+ ZFS_ENTER(zfsvfs);
+ ZFS_VERIFY_ZP(zp);
+
+ if ((flag & FWRITE) && (zp->z_pflags & ZFS_APPENDONLY) &&
+ ((flag & FAPPEND) == 0)) {
+ ZFS_EXIT(zfsvfs);
+ return (SET_ERROR(EPERM));
+ }
+
+ if (!zfs_has_ctldir(zp) && zp->z_zfsvfs->z_vscan &&
+ ZTOV(zp)->v_type == VREG &&
+ !(zp->z_pflags & ZFS_AV_QUARANTINED) && zp->z_size > 0) {
+ if (fs_vscan(*vpp, cr, 0) != 0) {
+ ZFS_EXIT(zfsvfs);
+ return (SET_ERROR(EACCES));
+ }
+ }
+
+ /* Keep a count of the synchronous opens in the znode */
+ if (flag & (FSYNC | FDSYNC))
+ atomic_inc_32(&zp->z_sync_cnt);
+
+ ZFS_EXIT(zfsvfs);
+ return (0);
+}
+
+/* ARGSUSED */
+static int
+zfs_close(vnode_t *vp, int flag, int count, offset_t offset, cred_t *cr,
+ caller_context_t *ct)
+{
+ znode_t *zp = VTOZ(vp);
+ zfsvfs_t *zfsvfs = zp->z_zfsvfs;
+
+ /*
+ * Clean up any locks held by this process on the vp.
+ */
+ cleanlocks(vp, ddi_get_pid(), 0);
+ cleanshares(vp, ddi_get_pid());
+
+ ZFS_ENTER(zfsvfs);
+ ZFS_VERIFY_ZP(zp);
+
+ /* Decrement the synchronous opens in the znode */
+ if ((flag & (FSYNC | FDSYNC)) && (count == 1))
+ atomic_dec_32(&zp->z_sync_cnt);
+
+ if (!zfs_has_ctldir(zp) && zp->z_zfsvfs->z_vscan &&
+ ZTOV(zp)->v_type == VREG &&
+ !(zp->z_pflags & ZFS_AV_QUARANTINED) && zp->z_size > 0)
+ VERIFY(fs_vscan(vp, cr, 1) == 0);
+
+ ZFS_EXIT(zfsvfs);
+ return (0);
+}
+
+/*
+ * Lseek support for finding holes (cmd == _FIO_SEEK_HOLE) and
+ * data (cmd == _FIO_SEEK_DATA). "off" is an in/out parameter.
+ */
+static int
+zfs_holey(vnode_t *vp, u_long cmd, offset_t *off)
+{
+ znode_t *zp = VTOZ(vp);
+ uint64_t noff = (uint64_t)*off; /* new offset */
+ uint64_t file_sz;
+ int error;
+ boolean_t hole;
+
+ file_sz = zp->z_size;
+ if (noff >= file_sz) {
+ return (SET_ERROR(ENXIO));
+ }
+
+ if (cmd == _FIO_SEEK_HOLE)
+ hole = B_TRUE;
+ else
+ hole = B_FALSE;
+
+ error = dmu_offset_next(zp->z_zfsvfs->z_os, zp->z_id, hole, &noff);
+
+ if (error == ESRCH)
+ return (SET_ERROR(ENXIO));
+
+ /*
+ * We could find a hole that begins after the logical end-of-file,
+ * because dmu_offset_next() only works on whole blocks. If the
+ * EOF falls mid-block, then indicate that the "virtual hole"
+ * at the end of the file begins at the logical EOF, rather than
+ * at the end of the last block.
+ */
+ if (noff > file_sz) {
+ ASSERT(hole);
+ noff = file_sz;
+ }
+
+ if (noff < *off)
+ return (error);
+ *off = noff;
+ return (error);
+}
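+
+/*
+ * For example (a sketch, assuming a sparse file whose first data block
+ * starts at 1M):
+ *
+ *	offset_t off = 0;
+ *	error = zfs_holey(vp, _FIO_SEEK_DATA, &off);
+ *
+ * leaves off at 1M, while the same call with _FIO_SEEK_HOLE leaves off
+ * at 0, because offset 0 already lies within a hole. Probing at or
+ * beyond z_size fails with ENXIO.
+ */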
+
+/* ARGSUSED */
+static int
+zfs_ioctl(vnode_t *vp, u_long com, intptr_t data, int flag, cred_t *cred,
+ int *rvalp, caller_context_t *ct)
+{
+ offset_t off;
+ offset_t ndata;
+ dmu_object_info_t doi;
+ int error;
+ zfsvfs_t *zfsvfs;
+ znode_t *zp;
+
+ switch (com) {
+	case _FIOFFS:
+	{
+		return (0);
+	}
+
+	/*
+	 * The following two ioctls are used by bfu. We fake them out;
+	 * this is necessary to avoid bfu errors.
+	 */
+	case _FIOGDIO:
+	case _FIOSDIO:
+ {
+ return (0);
+ }
+
+ case _FIO_SEEK_DATA:
+ case _FIO_SEEK_HOLE:
+ {
+#ifdef illumos
+ if (ddi_copyin((void *)data, &off, sizeof (off), flag))
+ return (SET_ERROR(EFAULT));
+#else
+ off = *(offset_t *)data;
+#endif
+ zp = VTOZ(vp);
+ zfsvfs = zp->z_zfsvfs;
+ ZFS_ENTER(zfsvfs);
+ ZFS_VERIFY_ZP(zp);
+
+ /* offset parameter is in/out */
+ error = zfs_holey(vp, com, &off);
+ ZFS_EXIT(zfsvfs);
+ if (error)
+ return (error);
+#ifdef illumos
+ if (ddi_copyout(&off, (void *)data, sizeof (off), flag))
+ return (SET_ERROR(EFAULT));
+#else
+ *(offset_t *)data = off;
+#endif
+ return (0);
+ }
+#ifdef illumos
+ case _FIO_COUNT_FILLED:
+ {
+ /*
+ * _FIO_COUNT_FILLED adds a new ioctl command which
+ * exposes the number of filled blocks in a
+ * ZFS object.
+ */
+ zp = VTOZ(vp);
+ zfsvfs = zp->z_zfsvfs;
+ ZFS_ENTER(zfsvfs);
+ ZFS_VERIFY_ZP(zp);
+
+ /*
+ * Wait for all dirty blocks for this object
+ * to get synced out to disk, and the DMU info
+ * updated.
+ */
+ error = dmu_object_wait_synced(zfsvfs->z_os, zp->z_id);
+ if (error) {
+ ZFS_EXIT(zfsvfs);
+ return (error);
+ }
+
+ /*
+ * Retrieve fill count from DMU object.
+ */
+ error = dmu_object_info(zfsvfs->z_os, zp->z_id, &doi);
+ if (error) {
+ ZFS_EXIT(zfsvfs);
+ return (error);
+ }
+
+ ndata = doi.doi_fill_count;
+
+ ZFS_EXIT(zfsvfs);
+ if (ddi_copyout(&ndata, (void *)data, sizeof (ndata), flag))
+ return (SET_ERROR(EFAULT));
+ return (0);
+ }
+#endif
+ }
+ return (SET_ERROR(ENOTTY));
+}
+
+static vm_page_t
+page_busy(vnode_t *vp, int64_t start, int64_t off, int64_t nbytes)
+{
+ vm_object_t obj;
+ vm_page_t pp;
+ int64_t end;
+
+ /*
+ * At present vm_page_clear_dirty extends the cleared range to DEV_BSIZE
+ * aligned boundaries, if the range is not aligned. As a result a
+ * DEV_BSIZE subrange with partially dirty data may get marked as clean.
+ * It may happen that all DEV_BSIZE subranges are marked clean and thus
+	 * the whole page would be considered clean despite having some
+	 * dirty data.
+ * For this reason we should shrink the range to DEV_BSIZE aligned
+ * boundaries before calling vm_page_clear_dirty.
+ */
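+	/*
+	 * For example, with DEV_BSIZE 512, off 100 and nbytes 1000, the
+	 * range [100, 1100) is shrunk to [512, 1024): end = 1024,
+	 * off = 512, nbytes = 512. The partially written subranges
+	 * [0, 512) and [1024, 1536) keep their dirty bits.
+	 */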
+ end = rounddown2(off + nbytes, DEV_BSIZE);
+ off = roundup2(off, DEV_BSIZE);
+ nbytes = end - off;
+
+ obj = vp->v_object;
+
+ vm_page_grab_valid_unlocked(&pp, obj, OFF_TO_IDX(start),
+ VM_ALLOC_NOCREAT | VM_ALLOC_SBUSY | VM_ALLOC_NORMAL |
+ VM_ALLOC_IGN_SBUSY);
+ if (pp != NULL) {
+ ASSERT3U(pp->valid, ==, VM_PAGE_BITS_ALL);
+ vm_object_pip_add(obj, 1);
+ pmap_remove_write(pp);
+ if (nbytes != 0)
+ vm_page_clear_dirty(pp, off, nbytes);
+ }
+ return (pp);
+}
+
+static void
+page_unbusy(vm_page_t pp)
+{
+
+ vm_page_sunbusy(pp);
+ vm_object_pip_wakeup(pp->object);
+}
+
+static vm_page_t
+page_wire(vnode_t *vp, int64_t start)
+{
+ vm_object_t obj;
+ vm_page_t m;
+
+ obj = vp->v_object;
+ vm_page_grab_valid_unlocked(&m, obj, OFF_TO_IDX(start),
+ VM_ALLOC_NOCREAT | VM_ALLOC_WIRED | VM_ALLOC_IGN_SBUSY |
+ VM_ALLOC_NOBUSY);
+ return (m);
+}
+
+static void
+page_unwire(vm_page_t pp)
+{
+
+ vm_page_unwire(pp, PQ_ACTIVE);
+}
+
+/*
+ * When a file is memory mapped, we must keep the IO data synchronized
+ * between the DMU cache and the memory mapped pages. What this means:
+ *
+ * On Write: If we find a memory mapped page, we write to *both*
+ * the page and the dmu buffer.
+ */
+static void
+update_pages(vnode_t *vp, int64_t start, int len, objset_t *os, uint64_t oid,
+ int segflg, dmu_tx_t *tx)
+{
+ vm_object_t obj;
+ struct sf_buf *sf;
+ caddr_t va;
+ int off;
+
+ ASSERT(segflg != UIO_NOCOPY);
+ ASSERT(vp->v_mount != NULL);
+ obj = vp->v_object;
+ ASSERT(obj != NULL);
+
+ off = start & PAGEOFFSET;
+ vm_object_pip_add(obj, 1);
+ for (start &= PAGEMASK; len > 0; start += PAGESIZE) {
+ vm_page_t pp;
+ int nbytes = imin(PAGESIZE - off, len);
+
+ if ((pp = page_busy(vp, start, off, nbytes)) != NULL) {
+ va = zfs_map_page(pp, &sf);
+ (void) dmu_read(os, oid, start+off, nbytes,
+			    va+off, DMU_READ_PREFETCH);
+ zfs_unmap_page(sf);
+ page_unbusy(pp);
+ }
+ len -= nbytes;
+ off = 0;
+ }
+ vm_object_pip_wakeup(obj);
+}
+
+/*
+ * Read with UIO_NOCOPY flag means that sendfile(2) requests
+ * ZFS to populate a range of page cache pages with data.
+ *
+ * NOTE: this function could be optimized to pre-allocate
+ * all pages in advance, drain exclusive busy on all of them,
+ * map them into contiguous KVA region and populate them
+ * in one single dmu_read() call.
+ */
+static int
+mappedread_sf(vnode_t *vp, int nbytes, uio_t *uio)
+{
+ znode_t *zp = VTOZ(vp);
+ objset_t *os = zp->z_zfsvfs->z_os;
+ struct sf_buf *sf;
+ vm_object_t obj;
+ vm_page_t pp;
+ int64_t start;
+ caddr_t va;
+ int len = nbytes;
+ int off;
+ int error = 0;
+
+ ASSERT(uio->uio_segflg == UIO_NOCOPY);
+ ASSERT(vp->v_mount != NULL);
+ obj = vp->v_object;
+ ASSERT(obj != NULL);
+ ASSERT((uio->uio_loffset & PAGEOFFSET) == 0);
+
+ for (start = uio->uio_loffset; len > 0; start += PAGESIZE) {
+ int bytes = MIN(PAGESIZE, len);
+
+ pp = vm_page_grab_unlocked(obj, OFF_TO_IDX(start),
+ VM_ALLOC_SBUSY | VM_ALLOC_NORMAL | VM_ALLOC_IGN_SBUSY);
+ if (vm_page_none_valid(pp)) {
+ va = zfs_map_page(pp, &sf);
+ error = dmu_read(os, zp->z_id, start, bytes, va,
+ DMU_READ_PREFETCH);
+ if (bytes != PAGESIZE && error == 0)
+ bzero(va + bytes, PAGESIZE - bytes);
+ zfs_unmap_page(sf);
+ if (error == 0) {
+ vm_page_valid(pp);
+ vm_page_activate(pp);
+ vm_page_sunbusy(pp);
+ } else {
+ zfs_vmobject_wlock(obj);
+ if (!vm_page_wired(pp) && pp->valid == 0 &&
+ vm_page_busy_tryupgrade(pp))
+ vm_page_free(pp);
+ else
+ vm_page_sunbusy(pp);
+ zfs_vmobject_wunlock(obj);
+ }
+ } else {
+ ASSERT3U(pp->valid, ==, VM_PAGE_BITS_ALL);
+ vm_page_sunbusy(pp);
+ }
+ if (error)
+ break;
+ uio->uio_resid -= bytes;
+ uio->uio_offset += bytes;
+ len -= bytes;
+ }
+ return (error);
+}
+
+/*
+ * When a file is memory mapped, we must keep the IO data synchronized
+ * between the DMU cache and the memory mapped pages. What this means:
+ *
+ * On Read:	We "read" preferentially from memory mapped pages;
+ *		otherwise we fall back to the dmu buffer.
+ *
+ * NOTE: We will always "break up" the IO into PAGESIZE uiomoves when
+ * the file is memory mapped.
+ */
+static int
+mappedread(vnode_t *vp, int nbytes, uio_t *uio)
+{
+ znode_t *zp = VTOZ(vp);
+ vm_object_t obj;
+ int64_t start;
+ caddr_t va;
+ int len = nbytes;
+ int off;
+ int error = 0;
+
+ ASSERT(vp->v_mount != NULL);
+ obj = vp->v_object;
+ ASSERT(obj != NULL);
+
+ start = uio->uio_loffset;
+ off = start & PAGEOFFSET;
+ for (start &= PAGEMASK; len > 0; start += PAGESIZE) {
+ vm_page_t pp;
+ uint64_t bytes = MIN(PAGESIZE - off, len);
+
+ if (pp = page_wire(vp, start)) {
+ struct sf_buf *sf;
+ caddr_t va;
+
+ va = zfs_map_page(pp, &sf);
+#ifdef illumos
+ error = uiomove(va + off, bytes, UIO_READ, uio);
+#else
+ error = vn_io_fault_uiomove(va + off, bytes, uio);
+#endif
+ zfs_unmap_page(sf);
+ page_unwire(pp);
+ } else {
+ error = dmu_read_uio_dbuf(sa_get_db(zp->z_sa_hdl),
+ uio, bytes);
+ }
+ len -= bytes;
+ off = 0;
+ if (error)
+ break;
+ }
+ return (error);
+}
+
+offset_t zfs_read_chunk_size = 1024 * 1024; /* Tunable */
+
+/*
+ * Read bytes from specified file into supplied buffer.
+ *
+ * IN: vp - vnode of file to be read from.
+ * uio - structure supplying read location, range info,
+ * and return buffer.
+ * ioflag - SYNC flags; used to provide FRSYNC semantics.
+ * cr - credentials of caller.
+ * ct - caller context
+ *
+ * OUT: uio - updated offset and range, buffer filled.
+ *
+ * RETURN: 0 on success, error code on failure.
+ *
+ * Side Effects:
+ * vp - atime updated if byte count > 0
+ */
+/* ARGSUSED */
+static int
+zfs_read(vnode_t *vp, uio_t *uio, int ioflag, cred_t *cr, caller_context_t *ct)
+{
+ znode_t *zp = VTOZ(vp);
+ zfsvfs_t *zfsvfs = zp->z_zfsvfs;
+ ssize_t n, nbytes;
+ int error = 0;
+ xuio_t *xuio = NULL;
+
+ ZFS_ENTER(zfsvfs);
+ ZFS_VERIFY_ZP(zp);
+
+ /* We don't copy out anything useful for directories. */
+ if (vp->v_type == VDIR) {
+ ZFS_EXIT(zfsvfs);
+ return (SET_ERROR(EISDIR));
+ }
+
+ if (zp->z_pflags & ZFS_AV_QUARANTINED) {
+ ZFS_EXIT(zfsvfs);
+ return (SET_ERROR(EACCES));
+ }
+
+ /*
+ * Validate file offset
+ */
+ if (uio->uio_loffset < (offset_t)0) {
+ ZFS_EXIT(zfsvfs);
+ return (SET_ERROR(EINVAL));
+ }
+
+ /*
+ * Fasttrack empty reads
+ */
+ if (uio->uio_resid == 0) {
+ ZFS_EXIT(zfsvfs);
+ return (0);
+ }
+
+ /*
+ * Check for mandatory locks
+ */
+ if (MANDMODE(zp->z_mode)) {
+ if (error = chklock(vp, FREAD,
+ uio->uio_loffset, uio->uio_resid, uio->uio_fmode, ct)) {
+ ZFS_EXIT(zfsvfs);
+ return (error);
+ }
+ }
+
+ /*
+ * If we're in FRSYNC mode, sync out this znode before reading it.
+ */
+ if (zfsvfs->z_log &&
+ (ioflag & FRSYNC || zfsvfs->z_os->os_sync == ZFS_SYNC_ALWAYS))
+ zil_commit(zfsvfs->z_log, zp->z_id);
+
+ /*
+ * Lock the range against changes.
+ */
+ locked_range_t *lr = rangelock_enter(&zp->z_rangelock,
+ uio->uio_loffset, uio->uio_resid, RL_READER);
+
+ /*
+ * If we are reading past end-of-file we can skip
+ * to the end; but we might still need to set atime.
+ */
+ if (uio->uio_loffset >= zp->z_size) {
+ error = 0;
+ goto out;
+ }
+
+ ASSERT(uio->uio_loffset < zp->z_size);
+ n = MIN(uio->uio_resid, zp->z_size - uio->uio_loffset);
+
+#ifdef illumos
+ if ((uio->uio_extflg == UIO_XUIO) &&
+ (((xuio_t *)uio)->xu_type == UIOTYPE_ZEROCOPY)) {
+ int nblk;
+ int blksz = zp->z_blksz;
+ uint64_t offset = uio->uio_loffset;
+
+ xuio = (xuio_t *)uio;
+ if ((ISP2(blksz))) {
+ nblk = (P2ROUNDUP(offset + n, blksz) - P2ALIGN(offset,
+ blksz)) / blksz;
+ } else {
+ ASSERT(offset + n <= blksz);
+ nblk = 1;
+ }
+ (void) dmu_xuio_init(xuio, nblk);
+
+ if (vn_has_cached_data(vp)) {
+ /*
+ * For simplicity, we always allocate a full buffer
+ * even if we only expect to read a portion of a block.
+ */
+ while (--nblk >= 0) {
+ (void) dmu_xuio_add(xuio,
+ dmu_request_arcbuf(sa_get_db(zp->z_sa_hdl),
+ blksz), 0, blksz);
+ }
+ }
+ }
+#endif /* illumos */
+
+ while (n > 0) {
+ nbytes = MIN(n, zfs_read_chunk_size -
+ P2PHASE(uio->uio_loffset, zfs_read_chunk_size));
+
+#ifdef __FreeBSD__
+ if (uio->uio_segflg == UIO_NOCOPY)
+ error = mappedread_sf(vp, nbytes, uio);
+ else
+#endif /* __FreeBSD__ */
+ if (vn_has_cached_data(vp)) {
+ error = mappedread(vp, nbytes, uio);
+ } else {
+ error = dmu_read_uio_dbuf(sa_get_db(zp->z_sa_hdl),
+ uio, nbytes);
+ }
+ if (error) {
+ /* convert checksum errors into IO errors */
+ if (error == ECKSUM)
+ error = SET_ERROR(EIO);
+ break;
+ }
+
+ n -= nbytes;
+ }
+out:
+ rangelock_exit(lr);
+
+ ZFS_ACCESSTIME_STAMP(zfsvfs, zp);
+ ZFS_EXIT(zfsvfs);
+ return (error);
+}
+
+/*
+ * Write the bytes to a file.
+ *
+ * IN: vp - vnode of file to be written to.
+ * uio - structure supplying write location, range info,
+ * and data buffer.
+ * ioflag - FAPPEND, FSYNC, and/or FDSYNC. FAPPEND is
+ * set if in append mode.
+ * cr - credentials of caller.
+ * ct - caller context (NFS/CIFS fem monitor only)
+ *
+ * OUT: uio - updated offset and range.
+ *
+ * RETURN: 0 on success, error code on failure.
+ *
+ * Timestamps:
+ * vp - ctime|mtime updated if byte count > 0
+ */
+
+/* ARGSUSED */
+static int
+zfs_write(vnode_t *vp, uio_t *uio, int ioflag, cred_t *cr, caller_context_t *ct)
+{
+ znode_t *zp = VTOZ(vp);
+ rlim64_t limit = MAXOFFSET_T;
+ ssize_t start_resid = uio->uio_resid;
+ ssize_t tx_bytes;
+ uint64_t end_size;
+ dmu_tx_t *tx;
+ zfsvfs_t *zfsvfs = zp->z_zfsvfs;
+ zilog_t *zilog;
+ offset_t woff;
+ ssize_t n, nbytes;
+ int max_blksz = zfsvfs->z_max_blksz;
+ int error = 0;
+ arc_buf_t *abuf;
+ iovec_t *aiov = NULL;
+ xuio_t *xuio = NULL;
+ int i_iov = 0;
+ int iovcnt = uio->uio_iovcnt;
+ iovec_t *iovp = uio->uio_iov;
+ int write_eof;
+ int count = 0;
+ sa_bulk_attr_t bulk[4];
+ uint64_t mtime[2], ctime[2];
+
+ /*
+ * Fasttrack empty write
+ */
+ n = start_resid;
+ if (n == 0)
+ return (0);
+
+ if (limit == RLIM64_INFINITY || limit > MAXOFFSET_T)
+ limit = MAXOFFSET_T;
+
+ ZFS_ENTER(zfsvfs);
+ ZFS_VERIFY_ZP(zp);
+
+ SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_MTIME(zfsvfs), NULL, &mtime, 16);
+ SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_CTIME(zfsvfs), NULL, &ctime, 16);
+ SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_SIZE(zfsvfs), NULL,
+ &zp->z_size, 8);
+ SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_FLAGS(zfsvfs), NULL,
+ &zp->z_pflags, 8);
+
+	/*
+	 * In the case vp->v_vfsp != zp->z_zfsvfs->z_vfs (e.g. snapshots),
+	 * our callers might not be able to detect properly that we are
+	 * read-only, so check it explicitly here.
+	 */
+ if (zfsvfs->z_vfs->vfs_flag & VFS_RDONLY) {
+ ZFS_EXIT(zfsvfs);
+ return (SET_ERROR(EROFS));
+ }
+
+ /*
+ * If immutable or not appending then return EPERM.
+ * Intentionally allow ZFS_READONLY through here.
+ * See zfs_zaccess_common()
+ */
+ if ((zp->z_pflags & ZFS_IMMUTABLE) ||
+ ((zp->z_pflags & ZFS_APPENDONLY) && !(ioflag & FAPPEND) &&
+ (uio->uio_loffset < zp->z_size))) {
+ ZFS_EXIT(zfsvfs);
+ return (SET_ERROR(EPERM));
+ }
+
+ zilog = zfsvfs->z_log;
+
+ /*
+ * Validate file offset
+ */
+ woff = ioflag & FAPPEND ? zp->z_size : uio->uio_loffset;
+ if (woff < 0) {
+ ZFS_EXIT(zfsvfs);
+ return (SET_ERROR(EINVAL));
+ }
+
+ /*
+ * Check for mandatory locks before calling rangelock_enter()
+ * in order to prevent a deadlock with locks set via fcntl().
+ */
+ if (MANDMODE((mode_t)zp->z_mode) &&
+ (error = chklock(vp, FWRITE, woff, n, uio->uio_fmode, ct)) != 0) {
+ ZFS_EXIT(zfsvfs);
+ return (error);
+ }
+
+#ifdef illumos
+ /*
+	 * Pre-fault the pages to ensure slow (e.g. NFS) pages
+	 * don't hold up the txg.
+ * Skip this if uio contains loaned arc_buf.
+ */
+ if ((uio->uio_extflg == UIO_XUIO) &&
+ (((xuio_t *)uio)->xu_type == UIOTYPE_ZEROCOPY))
+ xuio = (xuio_t *)uio;
+ else
+ uio_prefaultpages(MIN(n, max_blksz), uio);
+#endif
+
+ /*
+ * If in append mode, set the io offset pointer to eof.
+ */
+ locked_range_t *lr;
+ if (ioflag & FAPPEND) {
+ /*
+ * Obtain an appending range lock to guarantee file append
+ * semantics. We reset the write offset once we have the lock.
+ */
+ lr = rangelock_enter(&zp->z_rangelock, 0, n, RL_APPEND);
+ woff = lr->lr_offset;
+ if (lr->lr_length == UINT64_MAX) {
+ /*
+ * We overlocked the file because this write will cause
+ * the file block size to increase.
+ * Note that zp_size cannot change with this lock held.
+ */
+ woff = zp->z_size;
+ }
+ uio->uio_loffset = woff;
+ } else {
+ /*
+ * Note that if the file block size will change as a result of
+ * this write, then this range lock will lock the entire file
+ * so that we can re-write the block safely.
+ */
+ lr = rangelock_enter(&zp->z_rangelock, woff, n, RL_WRITER);
+ }
+
+ if (vn_rlimit_fsize(vp, uio, uio->uio_td)) {
+ rangelock_exit(lr);
+ ZFS_EXIT(zfsvfs);
+ return (EFBIG);
+ }
+
+ if (woff >= limit) {
+ rangelock_exit(lr);
+ ZFS_EXIT(zfsvfs);
+ return (SET_ERROR(EFBIG));
+ }
+
+ if ((woff + n) > limit || woff > (limit - n))
+ n = limit - woff;
+
+ /* Will this write extend the file length? */
+ write_eof = (woff + n > zp->z_size);
+
+ end_size = MAX(zp->z_size, woff + n);
+
+ /*
+ * Write the file in reasonable size chunks. Each chunk is written
+ * in a separate transaction; this keeps the intent log records small
+ * and allows us to do more fine-grained space accounting.
+ */
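+	/*
+	 * For example (a sketch), writing 300000 bytes at offset 100000
+	 * with a 128K max_blksz is split into chunks of 31072 (up to the
+	 * next 128K boundary at 131072), then 131072, then 131072, and
+	 * finally the remaining 6784 bytes, each in its own transaction.
+	 */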
+ while (n > 0) {
+ abuf = NULL;
+ woff = uio->uio_loffset;
+ if (zfs_owner_overquota(zfsvfs, zp, B_FALSE) ||
+ zfs_owner_overquota(zfsvfs, zp, B_TRUE)) {
+ if (abuf != NULL)
+ dmu_return_arcbuf(abuf);
+ error = SET_ERROR(EDQUOT);
+ break;
+ }
+
+ if (xuio && abuf == NULL) {
+ ASSERT(i_iov < iovcnt);
+ aiov = &iovp[i_iov];
+ abuf = dmu_xuio_arcbuf(xuio, i_iov);
+ dmu_xuio_clear(xuio, i_iov);
+ DTRACE_PROBE3(zfs_cp_write, int, i_iov,
+ iovec_t *, aiov, arc_buf_t *, abuf);
+ ASSERT((aiov->iov_base == abuf->b_data) ||
+ ((char *)aiov->iov_base - (char *)abuf->b_data +
+ aiov->iov_len == arc_buf_size(abuf)));
+ i_iov++;
+ } else if (abuf == NULL && n >= max_blksz &&
+ woff >= zp->z_size &&
+ P2PHASE(woff, max_blksz) == 0 &&
+ zp->z_blksz == max_blksz) {
+ /*
+ * This write covers a full block. "Borrow" a buffer
+ * from the dmu so that we can fill it before we enter
+ * a transaction. This avoids the possibility of
+ * holding up the transaction if the data copy hangs
+ * up on a pagefault (e.g., from an NFS server mapping).
+ */
+ size_t cbytes;
+
+ abuf = dmu_request_arcbuf(sa_get_db(zp->z_sa_hdl),
+ max_blksz);
+ ASSERT(abuf != NULL);
+ ASSERT(arc_buf_size(abuf) == max_blksz);
+ if (error = uiocopy(abuf->b_data, max_blksz,
+ UIO_WRITE, uio, &cbytes)) {
+ dmu_return_arcbuf(abuf);
+ break;
+ }
+ ASSERT(cbytes == max_blksz);
+ }
+
+ /*
+ * Start a transaction.
+ */
+ tx = dmu_tx_create(zfsvfs->z_os);
+ dmu_tx_hold_sa(tx, zp->z_sa_hdl, B_FALSE);
+ dmu_tx_hold_write(tx, zp->z_id, woff, MIN(n, max_blksz));
+ zfs_sa_upgrade_txholds(tx, zp);
+ error = dmu_tx_assign(tx, TXG_WAIT);
+ if (error) {
+ dmu_tx_abort(tx);
+ if (abuf != NULL)
+ dmu_return_arcbuf(abuf);
+ break;
+ }
+
+ /*
+ * If rangelock_enter() over-locked we grow the blocksize
+ * and then reduce the lock range. This will only happen
+ * on the first iteration since rangelock_reduce() will
+ * shrink down lr_length to the appropriate size.
+ */
+ if (lr->lr_length == UINT64_MAX) {
+ uint64_t new_blksz;
+
+ if (zp->z_blksz > max_blksz) {
+ /*
+ * File's blocksize is already larger than the
+ * "recordsize" property. Only let it grow to
+ * the next power of 2.
+ */
+ ASSERT(!ISP2(zp->z_blksz));
+ new_blksz = MIN(end_size,
+ 1 << highbit64(zp->z_blksz));
+ } else {
+ new_blksz = MIN(end_size, max_blksz);
+ }
+ zfs_grow_blocksize(zp, new_blksz, tx);
+ rangelock_reduce(lr, woff, n);
+ }
+
+ /*
+ * XXX - should we really limit each write to z_max_blksz?
+ * Perhaps we should use SPA_MAXBLOCKSIZE chunks?
+ */
+ nbytes = MIN(n, max_blksz - P2PHASE(woff, max_blksz));
+
+ if (woff + nbytes > zp->z_size)
+ vnode_pager_setsize(vp, woff + nbytes);
+
+ if (abuf == NULL) {
+ tx_bytes = uio->uio_resid;
+ error = dmu_write_uio_dbuf(sa_get_db(zp->z_sa_hdl),
+ uio, nbytes, tx);
+ tx_bytes -= uio->uio_resid;
+ } else {
+ tx_bytes = nbytes;
+ ASSERT(xuio == NULL || tx_bytes == aiov->iov_len);
+ /*
+ * If this is not a full block write, but we are
+ * extending the file past EOF and this data starts
+ * block-aligned, use assign_arcbuf(). Otherwise,
+ * write via dmu_write().
+ */
+ if (tx_bytes < max_blksz && (!write_eof ||
+ aiov->iov_base != abuf->b_data)) {
+ ASSERT(xuio);
+ dmu_write(zfsvfs->z_os, zp->z_id, woff,
+ aiov->iov_len, aiov->iov_base, tx);
+ dmu_return_arcbuf(abuf);
+ xuio_stat_wbuf_copied();
+ } else {
+ ASSERT(xuio || tx_bytes == max_blksz);
+ dmu_assign_arcbuf(sa_get_db(zp->z_sa_hdl),
+ woff, abuf, tx);
+ }
+ ASSERT(tx_bytes <= uio->uio_resid);
+ uioskip(uio, tx_bytes);
+ }
+ if (tx_bytes && vn_has_cached_data(vp)) {
+ update_pages(vp, woff, tx_bytes, zfsvfs->z_os,
+ zp->z_id, uio->uio_segflg, tx);
+ }
+
+ /*
+ * If we made no progress, we're done. If we made even
+ * partial progress, update the znode and ZIL accordingly.
+ */
+ if (tx_bytes == 0) {
+ (void) sa_update(zp->z_sa_hdl, SA_ZPL_SIZE(zfsvfs),
+ (void *)&zp->z_size, sizeof (uint64_t), tx);
+ dmu_tx_commit(tx);
+ ASSERT(error != 0);
+ break;
+ }
+
+ /*
+ * Clear Set-UID/Set-GID bits on successful write if not
+		 * privileged and at least one of the execute bits is set.
+		 *
+		 * It would be nice to do this after all writes have
+ * been done, but that would still expose the ISUID/ISGID
+ * to another app after the partial write is committed.
+ *
+ * Note: we don't call zfs_fuid_map_id() here because
+ * user 0 is not an ephemeral uid.
+ */
+ mutex_enter(&zp->z_acl_lock);
+ if ((zp->z_mode & (S_IXUSR | (S_IXUSR >> 3) |
+ (S_IXUSR >> 6))) != 0 &&
+ (zp->z_mode & (S_ISUID | S_ISGID)) != 0 &&
+ secpolicy_vnode_setid_retain(vp, cr,
+ (zp->z_mode & S_ISUID) != 0 && zp->z_uid == 0) != 0) {
+ uint64_t newmode;
+ zp->z_mode &= ~(S_ISUID | S_ISGID);
+ newmode = zp->z_mode;
+ (void) sa_update(zp->z_sa_hdl, SA_ZPL_MODE(zfsvfs),
+ (void *)&newmode, sizeof (uint64_t), tx);
+ }
+ mutex_exit(&zp->z_acl_lock);
+
+ zfs_tstamp_update_setup(zp, CONTENT_MODIFIED, mtime, ctime,
+ B_TRUE);
+
+ /*
+ * Update the file size (zp_size) if it has changed;
+ * account for possible concurrent updates.
+ */
+ while ((end_size = zp->z_size) < uio->uio_loffset) {
+ (void) atomic_cas_64(&zp->z_size, end_size,
+ uio->uio_loffset);
+#ifdef illumos
+ ASSERT(error == 0);
+#else
+ ASSERT(error == 0 || error == EFAULT);
+#endif
+ }
+ /*
+		 * If we are replaying and eof is non-zero then force
+ * the file size to the specified eof. Note, there's no
+ * concurrency during replay.
+ */
+ if (zfsvfs->z_replay && zfsvfs->z_replay_eof != 0)
+ zp->z_size = zfsvfs->z_replay_eof;
+
+ if (error == 0)
+ error = sa_bulk_update(zp->z_sa_hdl, bulk, count, tx);
+ else
+ (void) sa_bulk_update(zp->z_sa_hdl, bulk, count, tx);
+
+ zfs_log_write(zilog, tx, TX_WRITE, zp, woff, tx_bytes, ioflag);
+ dmu_tx_commit(tx);
+
+ if (error != 0)
+ break;
+ ASSERT(tx_bytes == nbytes);
+ n -= nbytes;
+
+#ifdef illumos
+ if (!xuio && n > 0)
+ uio_prefaultpages(MIN(n, max_blksz), uio);
+#endif
+ }
+
+ rangelock_exit(lr);
+
+ /*
+ * If we're in replay mode, or we made no progress, return error.
+ * Otherwise, it's at least a partial write, so it's successful.
+ */
+ if (zfsvfs->z_replay || uio->uio_resid == start_resid) {
+ ZFS_EXIT(zfsvfs);
+ return (error);
+ }
+
+#ifdef __FreeBSD__
+ /*
+ * EFAULT means that at least one page of the source buffer was not
+ * available. VFS will re-try remaining I/O upon this error.
+ */
+ if (error == EFAULT) {
+ ZFS_EXIT(zfsvfs);
+ return (error);
+ }
+#endif
+
+ if (ioflag & (FSYNC | FDSYNC) ||
+ zfsvfs->z_os->os_sync == ZFS_SYNC_ALWAYS)
+ zil_commit(zilog, zp->z_id);
+
+ ZFS_EXIT(zfsvfs);
+ return (0);
+}
+
+/* ARGSUSED */
+void
+zfs_get_done(zgd_t *zgd, int error)
+{
+ znode_t *zp = zgd->zgd_private;
+ objset_t *os = zp->z_zfsvfs->z_os;
+
+ if (zgd->zgd_db)
+ dmu_buf_rele(zgd->zgd_db, zgd);
+
+ rangelock_exit(zgd->zgd_lr);
+
+ /*
+ * Release the vnode asynchronously as we currently have the
+ * txg stopped from syncing.
+ */
+ VN_RELE_ASYNC(ZTOV(zp), dsl_pool_vnrele_taskq(dmu_objset_pool(os)));
+
+ kmem_free(zgd, sizeof (zgd_t));
+}
+
+#ifdef DEBUG
+static int zil_fault_io = 0;
+#endif
+
+/*
+ * Get data to generate a TX_WRITE intent log record.
+ */
+int
+zfs_get_data(void *arg, lr_write_t *lr, char *buf, struct lwb *lwb, zio_t *zio)
+{
+ zfsvfs_t *zfsvfs = arg;
+ objset_t *os = zfsvfs->z_os;
+ znode_t *zp;
+ uint64_t object = lr->lr_foid;
+ uint64_t offset = lr->lr_offset;
+ uint64_t size = lr->lr_length;
+ dmu_buf_t *db;
+ zgd_t *zgd;
+ int error = 0;
+
+ ASSERT3P(lwb, !=, NULL);
+ ASSERT3P(zio, !=, NULL);
+ ASSERT3U(size, !=, 0);
+
+ /*
+ * Nothing to do if the file has been removed
+ */
+ if (zfs_zget(zfsvfs, object, &zp) != 0)
+ return (SET_ERROR(ENOENT));
+ if (zp->z_unlinked) {
+ /*
+ * Release the vnode asynchronously as we currently have the
+ * txg stopped from syncing.
+ */
+ VN_RELE_ASYNC(ZTOV(zp),
+ dsl_pool_vnrele_taskq(dmu_objset_pool(os)));
+ return (SET_ERROR(ENOENT));
+ }
+
+ zgd = (zgd_t *)kmem_zalloc(sizeof (zgd_t), KM_SLEEP);
+ zgd->zgd_lwb = lwb;
+ zgd->zgd_private = zp;
+
+ /*
+ * Write records come in two flavors: immediate and indirect.
+ * For small writes it's cheaper to store the data with the
+ * log record (immediate); for large writes it's cheaper to
+ * sync the data and get a pointer to it (indirect) so that
+ * we don't have to write the data twice.
+ */
+ if (buf != NULL) { /* immediate write */
+ zgd->zgd_lr = rangelock_enter(&zp->z_rangelock,
+ offset, size, RL_READER);
+ /* test for truncation needs to be done while range locked */
+ if (offset >= zp->z_size) {
+ error = SET_ERROR(ENOENT);
+ } else {
+ error = dmu_read(os, object, offset, size, buf,
+ DMU_READ_NO_PREFETCH);
+ }
+ ASSERT(error == 0 || error == ENOENT);
+ } else { /* indirect write */
+ /*
+		 * Have to lock the whole block to ensure that no one can
+		 * change the data while it's being written out and its
+		 * checksum is being calculated. We need to re-check the
+		 * blocksize after we get the lock in case it's changed!
+ */
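+		/*
+		 * For example (a sketch): with z_blksz 128K and a log
+		 * record at offset 200000, blkoff = P2PHASE(200000,
+		 * 131072) = 68928, so we lock [131072, 262144). If
+		 * z_blksz grew while we slept on the range lock, we drop
+		 * the lock and retry with the new block size.
+		 */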
+ for (;;) {
+ uint64_t blkoff;
+ size = zp->z_blksz;
+ blkoff = ISP2(size) ? P2PHASE(offset, size) : offset;
+ offset -= blkoff;
+ zgd->zgd_lr = rangelock_enter(&zp->z_rangelock,
+ offset, size, RL_READER);
+ if (zp->z_blksz == size)
+ break;
+ offset += blkoff;
+ rangelock_exit(zgd->zgd_lr);
+ }
+ /* test for truncation needs to be done while range locked */
+ if (lr->lr_offset >= zp->z_size)
+ error = SET_ERROR(ENOENT);
+#ifdef DEBUG
+ if (zil_fault_io) {
+ error = SET_ERROR(EIO);
+ zil_fault_io = 0;
+ }
+#endif
+ if (error == 0)
+ error = dmu_buf_hold(os, object, offset, zgd, &db,
+ DMU_READ_NO_PREFETCH);
+
+ if (error == 0) {
+ blkptr_t *bp = &lr->lr_blkptr;
+
+ zgd->zgd_db = db;
+ zgd->zgd_bp = bp;
+
+ ASSERT(db->db_offset == offset);
+ ASSERT(db->db_size == size);
+
+ error = dmu_sync(zio, lr->lr_common.lrc_txg,
+ zfs_get_done, zgd);
+ ASSERT(error || lr->lr_length <= size);
+
+ /*
+ * On success, we need to wait for the write I/O
+ * initiated by dmu_sync() to complete before we can
+ * release this dbuf. We will finish everything up
+ * in the zfs_get_done() callback.
+ */
+ if (error == 0)
+ return (0);
+
+ if (error == EALREADY) {
+ lr->lr_common.lrc_txtype = TX_WRITE2;
+ /*
+ * TX_WRITE2 relies on the data previously
+ * written by the TX_WRITE that caused
+ * EALREADY. We zero out the BP because
+ * it is the old, currently-on-disk BP.
+ */
+ zgd->zgd_bp = NULL;
+ BP_ZERO(bp);
+ error = 0;
+ }
+ }
+ }
+
+ zfs_get_done(zgd, error);
+
+ return (error);
+}
+
+/*ARGSUSED*/
+static int
+zfs_access(vnode_t *vp, int mode, int flag, cred_t *cr,
+ caller_context_t *ct)
+{
+ znode_t *zp = VTOZ(vp);
+ zfsvfs_t *zfsvfs = zp->z_zfsvfs;
+ int error;
+
+ ZFS_ENTER(zfsvfs);
+ ZFS_VERIFY_ZP(zp);
+
+ if (flag & V_ACE_MASK)
+ error = zfs_zaccess(zp, mode, flag, B_FALSE, cr);
+ else
+ error = zfs_zaccess_rwx(zp, mode, flag, cr);
+
+ ZFS_EXIT(zfsvfs);
+ return (error);
+}
+
+static int
+zfs_dd_callback(struct mount *mp, void *arg, int lkflags, struct vnode **vpp)
+{
+ int error;
+
+ *vpp = arg;
+ error = vn_lock(*vpp, lkflags);
+ if (error != 0)
+ vrele(*vpp);
+ return (error);
+}
+
+static int
+zfs_lookup_lock(vnode_t *dvp, vnode_t *vp, const char *name, int lkflags)
+{
+ znode_t *zdp = VTOZ(dvp);
+ zfsvfs_t *zfsvfs = zdp->z_zfsvfs;
+ int error;
+ int ltype;
+
+ ASSERT_VOP_LOCKED(dvp, __func__);
+#ifdef DIAGNOSTIC
+ if ((zdp->z_pflags & ZFS_XATTR) == 0)
+ VERIFY(!RRM_LOCK_HELD(&zfsvfs->z_teardown_lock));
+#endif
+
+ if (name[0] == 0 || (name[0] == '.' && name[1] == 0)) {
+ ASSERT3P(dvp, ==, vp);
+ vref(dvp);
+ ltype = lkflags & LK_TYPE_MASK;
+ if (ltype != VOP_ISLOCKED(dvp)) {
+ if (ltype == LK_EXCLUSIVE)
+ vn_lock(dvp, LK_UPGRADE | LK_RETRY);
+ else /* if (ltype == LK_SHARED) */
+ vn_lock(dvp, LK_DOWNGRADE | LK_RETRY);
+
+ /*
+			 * The relock for the "." case could leave us with a
+			 * reclaimed vnode.
+ */
+ if (VN_IS_DOOMED(dvp)) {
+ vrele(dvp);
+ return (SET_ERROR(ENOENT));
+ }
+ }
+ return (0);
+ } else if (name[0] == '.' && name[1] == '.' && name[2] == 0) {
+ /*
+ * Note that in this case, dvp is the child vnode, and we
+ * are looking up the parent vnode - exactly reverse from
+		 * normal operation. Unlocking dvp requires a rather
+		 * tricky unlock/relock dance to prevent mp from being freed;
+ * use vn_vget_ino_gen() which takes care of all that.
+ *
+ * XXX Note that there is a time window when both vnodes are
+ * unlocked. It is possible, although highly unlikely, that
+ * during that window the parent-child relationship between
+ * the vnodes may change, for example, get reversed.
+ * In that case we would have a wrong lock order for the vnodes.
+ * All other filesystems seem to ignore this problem, so we
+ * do the same here.
+ * A potential solution could be implemented as follows:
+ * - using LK_NOWAIT when locking the second vnode and retrying
+ * if necessary
+ * - checking that the parent-child relationship still holds
+ * after locking both vnodes and retrying if it doesn't
+ */
+ error = vn_vget_ino_gen(dvp, zfs_dd_callback, vp, lkflags, &vp);
+ return (error);
+ } else {
+ error = vn_lock(vp, lkflags);
+ if (error != 0)
+ vrele(vp);
+ return (error);
+ }
+}
+
+/*
+ * Lookup an entry in a directory, or an extended attribute directory.
+ * If it exists, return a held vnode reference for it.
+ *
+ * IN: dvp - vnode of directory to search.
+ * nm - name of entry to lookup.
+ * pnp - full pathname to lookup [UNUSED].
+ * flags - LOOKUP_XATTR set if looking for an attribute.
+ * rdir - root directory vnode [UNUSED].
+ * cr - credentials of caller.
+ * ct - caller context
+ *
+ * OUT: vpp - vnode of located entry, NULL if not found.
+ *
+ * RETURN: 0 on success, error code on failure.
+ *
+ * Timestamps:
+ * NA
+ */
+/* ARGSUSED */
+static int
+zfs_lookup(vnode_t *dvp, char *nm, vnode_t **vpp, struct componentname *cnp,
+ int nameiop, cred_t *cr, kthread_t *td, int flags, boolean_t cached)
+{
+ znode_t *zdp = VTOZ(dvp);
+ znode_t *zp;
+ zfsvfs_t *zfsvfs = zdp->z_zfsvfs;
+ int error = 0;
+
+	/*
+	 * Fast path lookup. However, we must skip the DNLC lookup for
+	 * case-folding or normalizing lookups, because the DNLC code only
+	 * stores the name that was passed in. This means creating 'a' and
+	 * removing 'A' on a case-insensitive file system would work, but
+	 * the DNLC would still think 'a' exists and won't let you create
+	 * it again on the next pass through the fast path.
+	 */
+ if (!(flags & LOOKUP_XATTR)) {
+ if (dvp->v_type != VDIR) {
+ return (SET_ERROR(ENOTDIR));
+ } else if (zdp->z_sa_hdl == NULL) {
+ return (SET_ERROR(EIO));
+ }
+ }
+
+ DTRACE_PROBE2(zfs__fastpath__lookup__miss, vnode_t *, dvp, char *, nm);
+
+ ZFS_ENTER(zfsvfs);
+ ZFS_VERIFY_ZP(zdp);
+
+ *vpp = NULL;
+
+ if (flags & LOOKUP_XATTR) {
+#ifdef TODO
+ /*
+ * If the xattr property is off, refuse the lookup request.
+ */
+ if (!(zfsvfs->z_vfs->vfs_flag & VFS_XATTR)) {
+ ZFS_EXIT(zfsvfs);
+ return (SET_ERROR(EINVAL));
+ }
+#endif
+
+ /*
+		 * We don't allow recursive attributes.
+ * Maybe someday we will.
+ */
+ if (zdp->z_pflags & ZFS_XATTR) {
+ ZFS_EXIT(zfsvfs);
+ return (SET_ERROR(EINVAL));
+ }
+
+ if (error = zfs_get_xattrdir(VTOZ(dvp), vpp, cr, flags)) {
+ ZFS_EXIT(zfsvfs);
+ return (error);
+ }
+
+ /*
+		 * Do we have permission to get into the attribute directory?
+ */
+ if (error = zfs_zaccess(VTOZ(*vpp), ACE_EXECUTE, 0,
+ B_FALSE, cr)) {
+ vrele(*vpp);
+ *vpp = NULL;
+ }
+
+ ZFS_EXIT(zfsvfs);
+ return (error);
+ }
+
+ /*
+ * Check accessibility of directory.
+ */
+ if (!cached) {
+ if ((cnp->cn_flags & NOEXECCHECK) != 0) {
+ cnp->cn_flags &= ~NOEXECCHECK;
+ } else {
+ error = zfs_zaccess(zdp, ACE_EXECUTE, 0, B_FALSE, cr);
+ if (error != 0) {
+ ZFS_EXIT(zfsvfs);
+ return (error);
+ }
+ }
+ }
+
+ if (zfsvfs->z_utf8 && u8_validate(nm, strlen(nm),
+ NULL, U8_VALIDATE_ENTIRE, &error) < 0) {
+ ZFS_EXIT(zfsvfs);
+ return (SET_ERROR(EILSEQ));
+ }
+
+ /*
+ * First handle the special cases.
+ */
+ if ((cnp->cn_flags & ISDOTDOT) != 0) {
+ /*
+ * If we are a snapshot mounted under .zfs, return
+ * the vp for the snapshot directory.
+ */
+ if (zdp->z_id == zfsvfs->z_root && zfsvfs->z_parent != zfsvfs) {
+ struct componentname cn;
+ vnode_t *zfsctl_vp;
+ int ltype;
+
+ ZFS_EXIT(zfsvfs);
+ ltype = VOP_ISLOCKED(dvp);
+ VOP_UNLOCK(dvp);
+ error = zfsctl_root(zfsvfs->z_parent, LK_SHARED,
+ &zfsctl_vp);
+ if (error == 0) {
+ cn.cn_nameptr = "snapshot";
+ cn.cn_namelen = strlen(cn.cn_nameptr);
+ cn.cn_nameiop = cnp->cn_nameiop;
+ cn.cn_flags = cnp->cn_flags & ~ISDOTDOT;
+ cn.cn_lkflags = cnp->cn_lkflags;
+ error = VOP_LOOKUP(zfsctl_vp, vpp, &cn);
+ vput(zfsctl_vp);
+ }
+ vn_lock(dvp, ltype | LK_RETRY);
+ return (error);
+ }
+ }
+ if (zfs_has_ctldir(zdp) && strcmp(nm, ZFS_CTLDIR_NAME) == 0) {
+ ZFS_EXIT(zfsvfs);
+ if ((cnp->cn_flags & ISLASTCN) != 0 && nameiop != LOOKUP)
+ return (SET_ERROR(ENOTSUP));
+ error = zfsctl_root(zfsvfs, cnp->cn_lkflags, vpp);
+ return (error);
+ }
+
+	/*
+	 * The loop retries the lookup if the parent-child relationship
+	 * changes during the dot-dot locking complexities.
+	 */
+ for (;;) {
+ uint64_t parent;
+
+ error = zfs_dirlook(zdp, nm, &zp);
+ if (error == 0)
+ *vpp = ZTOV(zp);
+
+ ZFS_EXIT(zfsvfs);
+ if (error != 0)
+ break;
+
+ error = zfs_lookup_lock(dvp, *vpp, nm, cnp->cn_lkflags);
+ if (error != 0) {
+ /*
+ * If we've got a locking error, then the vnode
+ * got reclaimed because of a force unmount.
+ * We never enter doomed vnodes into the name cache.
+ */
+ *vpp = NULL;
+ return (error);
+ }
+
+ if ((cnp->cn_flags & ISDOTDOT) == 0)
+ break;
+
+ ZFS_ENTER(zfsvfs);
+ if (zdp->z_sa_hdl == NULL) {
+ error = SET_ERROR(EIO);
+ } else {
+ error = sa_lookup(zdp->z_sa_hdl, SA_ZPL_PARENT(zfsvfs),
+ &parent, sizeof (parent));
+ }
+ if (error != 0) {
+ ZFS_EXIT(zfsvfs);
+ vput(ZTOV(zp));
+ break;
+ }
+ if (zp->z_id == parent) {
+ ZFS_EXIT(zfsvfs);
+ break;
+ }
+ vput(ZTOV(zp));
+ }
+
+ if (error != 0)
+ *vpp = NULL;
+
+ /* Translate errors and add SAVENAME when needed. */
+ if (cnp->cn_flags & ISLASTCN) {
+ switch (nameiop) {
+ case CREATE:
+ case RENAME:
+ if (error == ENOENT) {
+ error = EJUSTRETURN;
+ cnp->cn_flags |= SAVENAME;
+ break;
+ }
+ /* FALLTHROUGH */
+ case DELETE:
+ if (error == 0)
+ cnp->cn_flags |= SAVENAME;
+ break;
+ }
+ }
+
+ /* Insert name into cache (as non-existent) if appropriate. */
+ if (zfsvfs->z_use_namecache &&
+ error == ENOENT && (cnp->cn_flags & MAKEENTRY) != 0)
+ cache_enter(dvp, NULL, cnp);
+
+ /* Insert name into cache if appropriate. */
+ if (zfsvfs->z_use_namecache &&
+ error == 0 && (cnp->cn_flags & MAKEENTRY)) {
+ if (!(cnp->cn_flags & ISLASTCN) ||
+ (nameiop != DELETE && nameiop != RENAME)) {
+ cache_enter(dvp, *vpp, cnp);
+ }
+ }
+
+ return (error);
+}
+
+/*
+ * Attempt to create a new entry in a directory. If the entry
+ * already exists, truncate the file if permissible, else return
+ * an error. Return the vp of the created or trunc'd file.
+ *
+ * IN: dvp - vnode of directory to put new file entry in.
+ * name - name of new file entry.
+ * vap - attributes of new file.
+ * excl - flag indicating exclusive or non-exclusive mode.
+ * mode - mode to open file with.
+ * cr - credentials of caller.
+ * flag - large file flag [UNUSED].
+ * ct - caller context
+ * vsecp - ACL to be set
+ *
+ * OUT: vpp - vnode of created or trunc'd entry.
+ *
+ * RETURN: 0 on success, error code on failure.
+ *
+ * Timestamps:
+ * dvp - ctime|mtime updated if new entry created
+ * vp - ctime|mtime always, atime if new
+ */
+
+/* ARGSUSED */
+static int
+zfs_create(vnode_t *dvp, char *name, vattr_t *vap, int excl, int mode,
+ vnode_t **vpp, cred_t *cr, kthread_t *td)
+{
+ znode_t *zp, *dzp = VTOZ(dvp);
+ zfsvfs_t *zfsvfs = dzp->z_zfsvfs;
+ zilog_t *zilog;
+ objset_t *os;
+ dmu_tx_t *tx;
+ int error;
+ ksid_t *ksid;
+ uid_t uid;
+ gid_t gid = crgetgid(cr);
+ zfs_acl_ids_t acl_ids;
+ boolean_t fuid_dirtied;
+ void *vsecp = NULL;
+ int flag = 0;
+ uint64_t txtype;
+
+ /*
+ * If we have an ephemeral id, ACL, or XVATTR then
+	 * make sure the file system is at the proper version
+ */
+
+ ksid = crgetsid(cr, KSID_OWNER);
+ if (ksid)
+ uid = ksid_getid(ksid);
+ else
+ uid = crgetuid(cr);
+
+ if (zfsvfs->z_use_fuids == B_FALSE &&
+ (vsecp || (vap->va_mask & AT_XVATTR) ||
+ IS_EPHEMERAL(uid) || IS_EPHEMERAL(gid)))
+ return (SET_ERROR(EINVAL));
+
+ ZFS_ENTER(zfsvfs);
+ ZFS_VERIFY_ZP(dzp);
+ os = zfsvfs->z_os;
+ zilog = zfsvfs->z_log;
+
+ if (zfsvfs->z_utf8 && u8_validate(name, strlen(name),
+ NULL, U8_VALIDATE_ENTIRE, &error) < 0) {
+ ZFS_EXIT(zfsvfs);
+ return (SET_ERROR(EILSEQ));
+ }
+
+ if (vap->va_mask & AT_XVATTR) {
+ if ((error = secpolicy_xvattr(dvp, (xvattr_t *)vap,
+ crgetuid(cr), cr, vap->va_type)) != 0) {
+ ZFS_EXIT(zfsvfs);
+ return (error);
+ }
+ }
+
+ *vpp = NULL;
+
+ if ((vap->va_mode & S_ISVTX) && secpolicy_vnode_stky_modify(cr))
+ vap->va_mode &= ~S_ISVTX;
+
+ error = zfs_dirent_lookup(dzp, name, &zp, ZNEW);
+ if (error) {
+ ZFS_EXIT(zfsvfs);
+ return (error);
+ }
+ ASSERT3P(zp, ==, NULL);
+
+ /*
+ * Create a new file object and update the directory
+ * to reference it.
+ */
+ if (error = zfs_zaccess(dzp, ACE_ADD_FILE, 0, B_FALSE, cr)) {
+ goto out;
+ }
+
+ /*
+ * We only support the creation of regular files in
+ * extended attribute directories.
+ */
+
+ if ((dzp->z_pflags & ZFS_XATTR) &&
+ (vap->va_type != VREG)) {
+ error = SET_ERROR(EINVAL);
+ goto out;
+ }
+
+ if ((error = zfs_acl_ids_create(dzp, 0, vap,
+ cr, vsecp, &acl_ids)) != 0)
+ goto out;
+
+ if (zfs_acl_ids_overquota(zfsvfs, &acl_ids)) {
+ zfs_acl_ids_free(&acl_ids);
+ error = SET_ERROR(EDQUOT);
+ goto out;
+ }
+
+ getnewvnode_reserve();
+
+ tx = dmu_tx_create(os);
+
+ dmu_tx_hold_sa_create(tx, acl_ids.z_aclp->z_acl_bytes +
+ ZFS_SA_BASE_ATTR_SIZE);
+
+ fuid_dirtied = zfsvfs->z_fuid_dirty;
+ if (fuid_dirtied)
+ zfs_fuid_txhold(zfsvfs, tx);
+ dmu_tx_hold_zap(tx, dzp->z_id, TRUE, name);
+ dmu_tx_hold_sa(tx, dzp->z_sa_hdl, B_FALSE);
+ if (!zfsvfs->z_use_sa &&
+ acl_ids.z_aclp->z_acl_bytes > ZFS_ACE_SPACE) {
+ dmu_tx_hold_write(tx, DMU_NEW_OBJECT,
+ 0, acl_ids.z_aclp->z_acl_bytes);
+ }
+ error = dmu_tx_assign(tx, TXG_WAIT);
+ if (error) {
+ zfs_acl_ids_free(&acl_ids);
+ dmu_tx_abort(tx);
+ getnewvnode_drop_reserve();
+ ZFS_EXIT(zfsvfs);
+ return (error);
+ }
+ zfs_mknode(dzp, vap, tx, cr, 0, &zp, &acl_ids);
+
+ if (fuid_dirtied)
+ zfs_fuid_sync(zfsvfs, tx);
+
+ (void) zfs_link_create(dzp, name, zp, tx, ZNEW);
+ txtype = zfs_log_create_txtype(Z_FILE, vsecp, vap);
+ zfs_log_create(zilog, tx, txtype, dzp, zp, name,
+ vsecp, acl_ids.z_fuidp, vap);
+ zfs_acl_ids_free(&acl_ids);
+ dmu_tx_commit(tx);
+
+ getnewvnode_drop_reserve();
+
+out:
+ if (error == 0) {
+ *vpp = ZTOV(zp);
+ }
+
+ if (zfsvfs->z_os->os_sync == ZFS_SYNC_ALWAYS)
+ zil_commit(zilog, 0);
+
+ ZFS_EXIT(zfsvfs);
+ return (error);
+}
+
+/*
+ * Remove an entry from a directory.
+ *
+ * IN: dvp - vnode of directory to remove entry from.
+ * name - name of entry to remove.
+ * cr - credentials of caller.
+ * ct - caller context
+ * flags - case flags
+ *
+ * RETURN: 0 on success, error code on failure.
+ *
+ * Timestamps:
+ * dvp - ctime|mtime
+ * vp - ctime (if nlink > 0)
+ */
+
+/*ARGSUSED*/
+static int
+zfs_remove(vnode_t *dvp, vnode_t *vp, char *name, cred_t *cr)
+{
+ znode_t *dzp = VTOZ(dvp);
+ znode_t *zp = VTOZ(vp);
+ znode_t *xzp;
+ zfsvfs_t *zfsvfs = dzp->z_zfsvfs;
+ zilog_t *zilog;
+ uint64_t acl_obj, xattr_obj;
+ uint64_t obj = 0;
+ dmu_tx_t *tx;
+ boolean_t unlinked, toobig = FALSE;
+ uint64_t txtype;
+ int error;
+
+ ZFS_ENTER(zfsvfs);
+ ZFS_VERIFY_ZP(dzp);
+ ZFS_VERIFY_ZP(zp);
+ zilog = zfsvfs->z_log;
+
+ xattr_obj = 0;
+ xzp = NULL;
+
+ if (error = zfs_zaccess_delete(dzp, zp, cr)) {
+ goto out;
+ }
+
+ /*
+ * Need to use rmdir for removing directories.
+ */
+ if (vp->v_type == VDIR) {
+ error = SET_ERROR(EPERM);
+ goto out;
+ }
+
+ vnevent_remove(vp, dvp, name, ct);
+
+ obj = zp->z_id;
+
+ /* are there any extended attributes? */
+ error = sa_lookup(zp->z_sa_hdl, SA_ZPL_XATTR(zfsvfs),
+ &xattr_obj, sizeof (xattr_obj));
+ if (error == 0 && xattr_obj) {
+ error = zfs_zget(zfsvfs, xattr_obj, &xzp);
+ ASSERT0(error);
+ }
+
+ /*
+ * We may delete the znode now, or we may put it in the unlinked set;
+ * it depends on whether we're the last link, and on whether there are
+ * other holds on the vnode. So we dmu_tx_hold() the right things to
+ * allow for either case.
+ */
+ tx = dmu_tx_create(zfsvfs->z_os);
+ dmu_tx_hold_zap(tx, dzp->z_id, FALSE, name);
+ dmu_tx_hold_sa(tx, zp->z_sa_hdl, B_FALSE);
+ zfs_sa_upgrade_txholds(tx, zp);
+ zfs_sa_upgrade_txholds(tx, dzp);
+
+ if (xzp) {
+ dmu_tx_hold_sa(tx, zp->z_sa_hdl, B_TRUE);
+ dmu_tx_hold_sa(tx, xzp->z_sa_hdl, B_FALSE);
+ }
+
+ /* charge as an update -- would be nice not to charge at all */
+ dmu_tx_hold_zap(tx, zfsvfs->z_unlinkedobj, FALSE, NULL);
+
+ /*
+ * Mark this transaction as typically resulting in a net free of space
+ */
+ dmu_tx_mark_netfree(tx);
+
+ error = dmu_tx_assign(tx, TXG_WAIT);
+ if (error) {
+ dmu_tx_abort(tx);
+ ZFS_EXIT(zfsvfs);
+ return (error);
+ }
+
+ /*
+ * Remove the directory entry.
+ */
+ error = zfs_link_destroy(dzp, name, zp, tx, ZEXISTS, &unlinked);
+
+ if (error) {
+ dmu_tx_commit(tx);
+ goto out;
+ }
+
+ if (unlinked) {
+ zfs_unlinked_add(zp, tx);
+ vp->v_vflag |= VV_NOSYNC;
+ }
+
+ txtype = TX_REMOVE;
+ zfs_log_remove(zilog, tx, txtype, dzp, name, obj);
+
+ dmu_tx_commit(tx);
+out:
+
+ if (xzp)
+ vrele(ZTOV(xzp));
+
+ if (zfsvfs->z_os->os_sync == ZFS_SYNC_ALWAYS)
+ zil_commit(zilog, 0);
+
+ ZFS_EXIT(zfsvfs);
+ return (error);
+}
+
+/*
+ * Create a new directory and insert it into dvp using the name
+ * provided. Return a pointer to the inserted directory.
+ *
+ * IN: dvp - vnode of directory to add subdir to.
+ * dirname - name of new directory.
+ * vap - attributes of new directory.
+ * cr - credentials of caller.
+ * ct - caller context
+ * flags - case flags
+ * vsecp - ACL to be set
+ *
+ * OUT: vpp - vnode of created directory.
+ *
+ * RETURN: 0 on success, error code on failure.
+ *
+ * Timestamps:
+ * dvp - ctime|mtime updated
+ * vp - ctime|mtime|atime updated
+ */
+/*ARGSUSED*/
+static int
+zfs_mkdir(vnode_t *dvp, char *dirname, vattr_t *vap, vnode_t **vpp, cred_t *cr)
+{
+ znode_t *zp, *dzp = VTOZ(dvp);
+ zfsvfs_t *zfsvfs = dzp->z_zfsvfs;
+ zilog_t *zilog;
+ uint64_t txtype;
+ dmu_tx_t *tx;
+ int error;
+ ksid_t *ksid;
+ uid_t uid;
+ gid_t gid = crgetgid(cr);
+ zfs_acl_ids_t acl_ids;
+ boolean_t fuid_dirtied;
+
+ ASSERT(vap->va_type == VDIR);
+
+ /*
+ * If we have an ephemeral id, ACL, or XVATTR then
+	 * make sure the file system is at the proper version
+ */
+
+ ksid = crgetsid(cr, KSID_OWNER);
+ if (ksid)
+ uid = ksid_getid(ksid);
+ else
+ uid = crgetuid(cr);
+ if (zfsvfs->z_use_fuids == B_FALSE &&
+ ((vap->va_mask & AT_XVATTR) ||
+ IS_EPHEMERAL(uid) || IS_EPHEMERAL(gid)))
+ return (SET_ERROR(EINVAL));
+
+ ZFS_ENTER(zfsvfs);
+ ZFS_VERIFY_ZP(dzp);
+ zilog = zfsvfs->z_log;
+
+ if (dzp->z_pflags & ZFS_XATTR) {
+ ZFS_EXIT(zfsvfs);
+ return (SET_ERROR(EINVAL));
+ }
+
+ if (zfsvfs->z_utf8 && u8_validate(dirname,
+ strlen(dirname), NULL, U8_VALIDATE_ENTIRE, &error) < 0) {
+ ZFS_EXIT(zfsvfs);
+ return (SET_ERROR(EILSEQ));
+ }
+
+ if (vap->va_mask & AT_XVATTR) {
+ if ((error = secpolicy_xvattr(dvp, (xvattr_t *)vap,
+ crgetuid(cr), cr, vap->va_type)) != 0) {
+ ZFS_EXIT(zfsvfs);
+ return (error);
+ }
+ }
+
+ if ((error = zfs_acl_ids_create(dzp, 0, vap, cr,
+ NULL, &acl_ids)) != 0) {
+ ZFS_EXIT(zfsvfs);
+ return (error);
+ }
+
+ /*
+ * First make sure the new directory doesn't exist.
+ *
+ * Existence is checked first to make sure we don't return
+ * EACCES instead of EEXIST which can cause some applications
+ * to fail.
+ */
+ *vpp = NULL;
+
+ if (error = zfs_dirent_lookup(dzp, dirname, &zp, ZNEW)) {
+ zfs_acl_ids_free(&acl_ids);
+ ZFS_EXIT(zfsvfs);
+ return (error);
+ }
+ ASSERT3P(zp, ==, NULL);
+
+ if (error = zfs_zaccess(dzp, ACE_ADD_SUBDIRECTORY, 0, B_FALSE, cr)) {
+ zfs_acl_ids_free(&acl_ids);
+ ZFS_EXIT(zfsvfs);
+ return (error);
+ }
+
+ if (zfs_acl_ids_overquota(zfsvfs, &acl_ids)) {
+ zfs_acl_ids_free(&acl_ids);
+ ZFS_EXIT(zfsvfs);
+ return (SET_ERROR(EDQUOT));
+ }
+
+ /*
+ * Add a new entry to the directory.
+ */
+ getnewvnode_reserve();
+ tx = dmu_tx_create(zfsvfs->z_os);
+ dmu_tx_hold_zap(tx, dzp->z_id, TRUE, dirname);
+ dmu_tx_hold_zap(tx, DMU_NEW_OBJECT, FALSE, NULL);
+ fuid_dirtied = zfsvfs->z_fuid_dirty;
+ if (fuid_dirtied)
+ zfs_fuid_txhold(zfsvfs, tx);
+ if (!zfsvfs->z_use_sa && acl_ids.z_aclp->z_acl_bytes > ZFS_ACE_SPACE) {
+ dmu_tx_hold_write(tx, DMU_NEW_OBJECT, 0,
+ acl_ids.z_aclp->z_acl_bytes);
+ }
+
+ dmu_tx_hold_sa_create(tx, acl_ids.z_aclp->z_acl_bytes +
+ ZFS_SA_BASE_ATTR_SIZE);
+
+ error = dmu_tx_assign(tx, TXG_WAIT);
+ if (error) {
+ zfs_acl_ids_free(&acl_ids);
+ dmu_tx_abort(tx);
+ getnewvnode_drop_reserve();
+ ZFS_EXIT(zfsvfs);
+ return (error);
+ }
+
+ /*
+ * Create new node.
+ */
+ zfs_mknode(dzp, vap, tx, cr, 0, &zp, &acl_ids);
+
+ if (fuid_dirtied)
+ zfs_fuid_sync(zfsvfs, tx);
+
+ /*
+ * Now put new name in parent dir.
+ */
+ (void) zfs_link_create(dzp, dirname, zp, tx, ZNEW);
+
+ *vpp = ZTOV(zp);
+
+ txtype = zfs_log_create_txtype(Z_DIR, NULL, vap);
+ zfs_log_create(zilog, tx, txtype, dzp, zp, dirname, NULL,
+ acl_ids.z_fuidp, vap);
+
+ zfs_acl_ids_free(&acl_ids);
+
+ dmu_tx_commit(tx);
+
+ getnewvnode_drop_reserve();
+
+ if (zfsvfs->z_os->os_sync == ZFS_SYNC_ALWAYS)
+ zil_commit(zilog, 0);
+
+ ZFS_EXIT(zfsvfs);
+ return (0);
+}
+
+/*
+ * Remove a directory subdir entry. If the current working
+ * directory is the same as the subdir to be removed, the
+ * remove will fail.
+ *
+ * IN: dvp - vnode of directory to remove from.
+ * name - name of directory to be removed.
+ *	vp	- vnode of the directory to be removed.
+ *	cr	- credentials of caller.
+ *
+ * RETURN: 0 on success, error code on failure.
+ *
+ * Timestamps:
+ * dvp - ctime|mtime updated
+ */
+/*ARGSUSED*/
+static int
+zfs_rmdir(vnode_t *dvp, vnode_t *vp, char *name, cred_t *cr)
+{
+ znode_t *dzp = VTOZ(dvp);
+ znode_t *zp = VTOZ(vp);
+ zfsvfs_t *zfsvfs = dzp->z_zfsvfs;
+ zilog_t *zilog;
+ dmu_tx_t *tx;
+ int error;
+
+ ZFS_ENTER(zfsvfs);
+ ZFS_VERIFY_ZP(dzp);
+ ZFS_VERIFY_ZP(zp);
+ zilog = zfsvfs->z_log;
+
+ if (error = zfs_zaccess_delete(dzp, zp, cr)) {
+ goto out;
+ }
+
+ if (vp->v_type != VDIR) {
+ error = SET_ERROR(ENOTDIR);
+ goto out;
+ }
+
+ vnevent_rmdir(vp, dvp, name, ct);
+
+ tx = dmu_tx_create(zfsvfs->z_os);
+ dmu_tx_hold_zap(tx, dzp->z_id, FALSE, name);
+ dmu_tx_hold_sa(tx, zp->z_sa_hdl, B_FALSE);
+ dmu_tx_hold_zap(tx, zfsvfs->z_unlinkedobj, FALSE, NULL);
+ zfs_sa_upgrade_txholds(tx, zp);
+ zfs_sa_upgrade_txholds(tx, dzp);
+ dmu_tx_mark_netfree(tx);
+ error = dmu_tx_assign(tx, TXG_WAIT);
+ if (error) {
+ dmu_tx_abort(tx);
+ ZFS_EXIT(zfsvfs);
+ return (error);
+ }
+
+ cache_purge(dvp);
+
+ error = zfs_link_destroy(dzp, name, zp, tx, ZEXISTS, NULL);
+
+ if (error == 0) {
+ uint64_t txtype = TX_RMDIR;
+ zfs_log_remove(zilog, tx, txtype, dzp, name, ZFS_NO_OBJECT);
+ }
+
+ dmu_tx_commit(tx);
+
+ cache_purge(vp);
+out:
+ if (zfsvfs->z_os->os_sync == ZFS_SYNC_ALWAYS)
+ zil_commit(zilog, 0);
+
+ ZFS_EXIT(zfsvfs);
+ return (error);
+}
+
+/*
+ * Read as many directory entries as will fit into the provided
+ * buffer from the given directory cursor position (specified in
+ * the uio structure).
+ *
+ * IN: vp - vnode of directory to read.
+ * uio - structure supplying read location, range info,
+ * and return buffer.
+ * cr - credentials of caller.
+ *	ncookies - if non-NULL, the caller wants directory offset cookies.
+ *
+ * OUT:	uio	- updated offset and range, buffer filled.
+ *	eofp	- set to true if end-of-file detected.
+ *	ncookies - number of offset cookies returned.
+ *	cookies	- M_TEMP-allocated array of cookies, one per entry returned.
+ *
+ * RETURN: 0 on success, error code on failure.
+ *
+ * Timestamps:
+ * vp - atime updated
+ *
+ * Note that the low 4 bits of the cookie returned by zap are always zero.
+ * This allows us to use the low range for "special" directory entries:
+ * We use 0 for '.', and 1 for '..'. If this is the root of the filesystem,
+ * we use the offset 2 for the '.zfs' directory.
+ */
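+/*
+ * Worked example of the cookie layout described above (entry names are
+ * hypothetical):
+ *
+ *	offset 0 -> "."		(synthesized)
+ *	offset 1 -> ".."	(synthesized)
+ *	offset 2 -> ".zfs"	(root vnode only)
+ *	all other entries  ->	serialized ZAP cursors, i.e. nonzero values
+ *				with the low 4 bits clear, produced by
+ *				zap_cursor_serialize() and resumed via
+ *				zap_cursor_init_serialized().
+ */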
+/* ARGSUSED */
+static int
+zfs_readdir(vnode_t *vp, uio_t *uio, cred_t *cr, int *eofp, int *ncookies,
+    u_long **cookies)
+{
+ znode_t *zp = VTOZ(vp);
+ iovec_t *iovp;
+ edirent_t *eodp;
+ dirent64_t *odp;
+ zfsvfs_t *zfsvfs = zp->z_zfsvfs;
+ objset_t *os;
+ caddr_t outbuf;
+ size_t bufsize;
+ zap_cursor_t zc;
+ zap_attribute_t zap;
+ uint_t bytes_wanted;
+ uint64_t offset; /* must be unsigned; checks for < 1 */
+ uint64_t parent;
+ int local_eof;
+ int outcount;
+ int error;
+ uint8_t prefetch;
+ boolean_t check_sysattrs;
+ uint8_t type;
+ int ncooks;
+ u_long *cooks = NULL;
+ int flags = 0;
+
+ ZFS_ENTER(zfsvfs);
+ ZFS_VERIFY_ZP(zp);
+
+ if ((error = sa_lookup(zp->z_sa_hdl, SA_ZPL_PARENT(zfsvfs),
+ &parent, sizeof (parent))) != 0) {
+ ZFS_EXIT(zfsvfs);
+ return (error);
+ }
+
+ /*
+ * If we are not given an eof variable,
+ * use a local one.
+ */
+ if (eofp == NULL)
+ eofp = &local_eof;
+
+ /*
+ * Check for valid iov_len.
+ */
+ if (uio->uio_iov->iov_len <= 0) {
+ ZFS_EXIT(zfsvfs);
+ return (SET_ERROR(EINVAL));
+ }
+
+ /*
+	 * Quit if directory has been removed (POSIX).
+ */
+ if ((*eofp = zp->z_unlinked) != 0) {
+ ZFS_EXIT(zfsvfs);
+ return (0);
+ }
+
+ error = 0;
+ os = zfsvfs->z_os;
+ offset = uio->uio_loffset;
+ prefetch = zp->z_zn_prefetch;
+
+ /*
+ * Initialize the iterator cursor.
+ */
+ if (offset <= 3) {
+ /*
+ * Start iteration from the beginning of the directory.
+ */
+ zap_cursor_init(&zc, os, zp->z_id);
+ } else {
+ /*
+ * The offset is a serialized cursor.
+ */
+ zap_cursor_init_serialized(&zc, os, zp->z_id, offset);
+ }
+
+ /*
+ * Get space to change directory entries into fs independent format.
+ */
+ iovp = uio->uio_iov;
+ bytes_wanted = iovp->iov_len;
+	bufsize = bytes_wanted;
+	if (uio->uio_segflg != UIO_SYSSPACE || uio->uio_iovcnt != 1) {
+		outbuf = kmem_alloc(bufsize, KM_SLEEP);
+		odp = (struct dirent64 *)outbuf;
+	} else {
+		outbuf = NULL;
+		odp = (struct dirent64 *)iovp->iov_base;
+	}
+ eodp = (struct edirent *)odp;
+
+ if (ncookies != NULL) {
+ /*
+		 * Minimum entry size is the dirent header plus 1 byte of
+		 * file name.
+		 */
+		ncooks = uio->uio_resid / (sizeof(struct dirent) -
+		    sizeof(((struct dirent *)NULL)->d_name) + 1);
+ cooks = malloc(ncooks * sizeof(u_long), M_TEMP, M_WAITOK);
+ *cookies = cooks;
+ *ncookies = ncooks;
+ }
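+	/*
+	 * Worked example (struct sizes are platform-dependent assumptions):
+	 * with a 24-byte dirent header, each entry needs at least 25 bytes,
+	 * so a 4096-byte uio_resid reserves 4096 / 25 = 163 cookie slots.
+	 */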
+ /*
+	 * If this VFS supports the system attribute view interface, and
+	 * we're looking at an extended attribute directory, and we care
+	 * about normalization conflicts on this vfs, then we must check
+	 * for normalization conflicts with the sysattr name space.
+ */
+#ifdef TODO
+ check_sysattrs = vfs_has_feature(vp->v_vfsp, VFSFT_SYSATTR_VIEWS) &&
+ (vp->v_flag & V_XATTRDIR) && zfsvfs->z_norm &&
+ (flags & V_RDDIR_ENTFLAGS);
+#else
+ check_sysattrs = 0;
+#endif
+
+ /*
+ * Transform to file-system independent format
+ */
+ outcount = 0;
+ while (outcount < bytes_wanted) {
+ ino64_t objnum;
+ ushort_t reclen;
+ off64_t *next = NULL;
+
+ /*
+ * Special case `.', `..', and `.zfs'.
+ */
+ if (offset == 0) {
+ (void) strcpy(zap.za_name, ".");
+ zap.za_normalization_conflict = 0;
+ objnum = zp->z_id;
+ type = DT_DIR;
+ } else if (offset == 1) {
+ (void) strcpy(zap.za_name, "..");
+ zap.za_normalization_conflict = 0;
+ objnum = parent;
+ type = DT_DIR;
+ } else if (offset == 2 && zfs_show_ctldir(zp)) {
+ (void) strcpy(zap.za_name, ZFS_CTLDIR_NAME);
+ zap.za_normalization_conflict = 0;
+ objnum = ZFSCTL_INO_ROOT;
+ type = DT_DIR;
+ } else {
+ /*
+ * Grab next entry.
+ */
+ if (error = zap_cursor_retrieve(&zc, &zap)) {
+ if ((*eofp = (error == ENOENT)) != 0)
+ break;
+ else
+ goto update;
+ }
+
+ if (zap.za_integer_length != 8 ||
+ zap.za_num_integers != 1) {
+ cmn_err(CE_WARN, "zap_readdir: bad directory "
+ "entry, obj = %lld, offset = %lld\n",
+ (u_longlong_t)zp->z_id,
+ (u_longlong_t)offset);
+ error = SET_ERROR(ENXIO);
+ goto update;
+ }
+
+ objnum = ZFS_DIRENT_OBJ(zap.za_first_integer);
+			/*
+			 * The object type is encoded in the upper bits of
+			 * the ZAP entry value; extract it for d_type (see
+			 * ZFS_DIRENT_TYPE()).
+			 */
+			type = ZFS_DIRENT_TYPE(zap.za_first_integer);
+
+ if (check_sysattrs && !zap.za_normalization_conflict) {
+#ifdef TODO
+ zap.za_normalization_conflict =
+ xattr_sysattr_casechk(zap.za_name);
+#else
+ panic("%s:%u: TODO", __func__, __LINE__);
+#endif
+ }
+ }
+
+ if (flags & V_RDDIR_ACCFILTER) {
+ /*
+ * If we have no access at all, don't include
+ * this entry in the returned information
+ */
+ znode_t *ezp;
+ if (zfs_zget(zp->z_zfsvfs, objnum, &ezp) != 0)
+ goto skip_entry;
+ if (!zfs_has_access(ezp, cr)) {
+ vrele(ZTOV(ezp));
+ goto skip_entry;
+ }
+ vrele(ZTOV(ezp));
+ }
+
+ if (flags & V_RDDIR_ENTFLAGS)
+ reclen = EDIRENT_RECLEN(strlen(zap.za_name));
+ else
+ reclen = DIRENT64_RECLEN(strlen(zap.za_name));
+
+ /*
+ * Will this entry fit in the buffer?
+ */
+ if (outcount + reclen > bufsize) {
+ /*
+ * Did we manage to fit anything in the buffer?
+ */
+ if (!outcount) {
+ error = SET_ERROR(EINVAL);
+ goto update;
+ }
+ break;
+ }
+ if (flags & V_RDDIR_ENTFLAGS) {
+ /*
+ * Add extended flag entry:
+ */
+ eodp->ed_ino = objnum;
+ eodp->ed_reclen = reclen;
+ /* NOTE: ed_off is the offset for the *next* entry. */
+ next = &eodp->ed_off;
+ eodp->ed_eflags = zap.za_normalization_conflict ?
+ ED_CASE_CONFLICT : 0;
+ (void) strncpy(eodp->ed_name, zap.za_name,
+ EDIRENT_NAMELEN(reclen));
+ eodp = (edirent_t *)((intptr_t)eodp + reclen);
+ } else {
+ /*
+ * Add normal entry:
+ */
+ odp->d_ino = objnum;
+ odp->d_reclen = reclen;
+ odp->d_namlen = strlen(zap.za_name);
+ /* NOTE: d_off is the offset for the *next* entry. */
+ next = &odp->d_off;
+ (void) strlcpy(odp->d_name, zap.za_name, odp->d_namlen + 1);
+ odp->d_type = type;
+ dirent_terminate(odp);
+ odp = (dirent64_t *)((intptr_t)odp + reclen);
+ }
+ outcount += reclen;
+
+ ASSERT(outcount <= bufsize);
+
+ /* Prefetch znode */
+ if (prefetch)
+ dmu_prefetch(os, objnum, 0, 0, 0,
+ ZIO_PRIORITY_SYNC_READ);
+
+ skip_entry:
+ /*
+ * Move to the next entry, fill in the previous offset.
+ */
+ if (offset > 2 || (offset == 2 && !zfs_show_ctldir(zp))) {
+ zap_cursor_advance(&zc);
+ offset = zap_cursor_serialize(&zc);
+ } else {
+ offset += 1;
+ }
+
+ /* Fill the offset right after advancing the cursor. */
+ if (next != NULL)
+ *next = offset;
+ if (cooks != NULL) {
+ *cooks++ = offset;
+ ncooks--;
+ KASSERT(ncooks >= 0, ("ncookies=%d", ncooks));
+ }
+ }
+ zp->z_zn_prefetch = B_FALSE; /* a lookup will re-enable pre-fetching */
+
+ /* Subtract unused cookies */
+ if (ncookies != NULL)
+ *ncookies -= ncooks;
+
+ if (uio->uio_segflg == UIO_SYSSPACE && uio->uio_iovcnt == 1) {
+ iovp->iov_base += outcount;
+ iovp->iov_len -= outcount;
+ uio->uio_resid -= outcount;
+ } else if (error = uiomove(outbuf, (long)outcount, UIO_READ, uio)) {
+ /*
+ * Reset the pointer.
+ */
+ offset = uio->uio_loffset;
+ }
+
+update:
+ zap_cursor_fini(&zc);
+ if (uio->uio_segflg != UIO_SYSSPACE || uio->uio_iovcnt != 1)
+ kmem_free(outbuf, bufsize);
+
+ if (error == ENOENT)
+ error = 0;
+
+ ZFS_ACCESSTIME_STAMP(zfsvfs, zp);
+
+ uio->uio_loffset = offset;
+ ZFS_EXIT(zfsvfs);
+ if (error != 0 && cookies != NULL) {
+ free(*cookies, M_TEMP);
+ *cookies = NULL;
+ *ncookies = 0;
+ }
+ return (error);
+}
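+
+/*
+ * Usage note (summarizing FreeBSD's VOP_READDIR cookie contract as used
+ * above): callers such as the NFS server pass non-NULL ncookies/cookies.
+ * Each cookie stored is the offset of the entry *following* the one just
+ * emitted, so a later zfs_readdir() call can resume from it via
+ * zap_cursor_init_serialized().
+ */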
+
+ulong_t zfs_fsync_sync_cnt = 4;
+
+static int
+zfs_fsync(vnode_t *vp, int syncflag, cred_t *cr, caller_context_t *ct)
+{
+ znode_t *zp = VTOZ(vp);
+ zfsvfs_t *zfsvfs = zp->z_zfsvfs;
+
+ (void) tsd_set(zfs_fsyncer_key, (void *)zfs_fsync_sync_cnt);
+
+ if (zfsvfs->z_os->os_sync != ZFS_SYNC_DISABLED) {
+ ZFS_ENTER(zfsvfs);
+ ZFS_VERIFY_ZP(zp);
+ zil_commit(zfsvfs->z_log, zp->z_id);
+ ZFS_EXIT(zfsvfs);
+ }
+ return (0);
+}
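+
+/*
+ * Note (summarizing zil_commit() semantics as used here): passing zp->z_id
+ * commits the intent-log records for this object, plus anything already
+ * queued for commit, so an fsync(2) of one file does not force out every
+ * unrelated pending write.
+ */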
+
+/*
+ * Get the requested file attributes and place them in the provided
+ * vattr structure.
+ *
+ * IN: vp - vnode of file.
+ * vap - va_mask identifies requested attributes.
+ * If AT_XVATTR set, then optional attrs are requested
+ * flags - ATTR_NOACLCHECK (CIFS server context)
+ * cr - credentials of caller.
+ * ct - caller context
+ *
+ * OUT: vap - attribute values.
+ *
+ * RETURN: 0 (always succeeds).
+ */
+/* ARGSUSED */
+static int
+zfs_getattr(vnode_t *vp, vattr_t *vap, int flags, cred_t *cr,
+ caller_context_t *ct)
+{
+ znode_t *zp = VTOZ(vp);
+ zfsvfs_t *zfsvfs = zp->z_zfsvfs;
+ int error = 0;
+ uint32_t blksize;
+ u_longlong_t nblocks;
+ uint64_t mtime[2], ctime[2], crtime[2], rdev;
+ xvattr_t *xvap = (xvattr_t *)vap; /* vap may be an xvattr_t * */
+ xoptattr_t *xoap = NULL;
+ boolean_t skipaclchk = (flags & ATTR_NOACLCHECK) ? B_TRUE : B_FALSE;
+ sa_bulk_attr_t bulk[4];
+ int count = 0;
+
+ ZFS_ENTER(zfsvfs);
+ ZFS_VERIFY_ZP(zp);
+
+ zfs_fuid_map_ids(zp, cr, &vap->va_uid, &vap->va_gid);
+
+ SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_MTIME(zfsvfs), NULL, &mtime, 16);
+ SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_CTIME(zfsvfs), NULL, &ctime, 16);
+ SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_CRTIME(zfsvfs), NULL, &crtime, 16);
+ if (vp->v_type == VBLK || vp->v_type == VCHR)
+ SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_RDEV(zfsvfs), NULL,
+ &rdev, 8);
+
+ if ((error = sa_bulk_lookup(zp->z_sa_hdl, bulk, count)) != 0) {
+ ZFS_EXIT(zfsvfs);
+ return (error);
+ }
+
+ /*
+ * If ACL is trivial don't bother looking for ACE_READ_ATTRIBUTES.
+ * Also, if we are the owner don't bother, since owner should
+ * always be allowed to read basic attributes of file.
+ */
+ if (!(zp->z_pflags & ZFS_ACL_TRIVIAL) &&
+ (vap->va_uid != crgetuid(cr))) {
+ if (error = zfs_zaccess(zp, ACE_READ_ATTRIBUTES, 0,
+ skipaclchk, cr)) {
+ ZFS_EXIT(zfsvfs);
+ return (error);
+ }
+ }
+
+ /*
+ * Return all attributes. It's cheaper to provide the answer
+ * than to determine whether we were asked the question.
+ */
+
+ vap->va_type = IFTOVT(zp->z_mode);
+ vap->va_mode = zp->z_mode & ~S_IFMT;
+#ifdef illumos
+ vap->va_fsid = zp->z_zfsvfs->z_vfs->vfs_dev;
+#else
+ vn_fsid(vp, vap);
+#endif
+ vap->va_nodeid = zp->z_id;
+ vap->va_nlink = zp->z_links;
+ if ((vp->v_flag & VROOT) && zfs_show_ctldir(zp) &&
+ zp->z_links < ZFS_LINK_MAX)
+ vap->va_nlink++;
+ vap->va_size = zp->z_size;
+#ifdef illumos
+ vap->va_rdev = vp->v_rdev;
+#else
+ if (vp->v_type == VBLK || vp->v_type == VCHR)
+ vap->va_rdev = zfs_cmpldev(rdev);
+#endif
+ vap->va_seq = zp->z_seq;
+ vap->va_flags = 0; /* FreeBSD: Reset chflags(2) flags. */
+ vap->va_filerev = zp->z_seq;
+
+ /*
+ * Add in any requested optional attributes and the create time.
+ * Also set the corresponding bits in the returned attribute bitmap.
+ */
+ if ((xoap = xva_getxoptattr(xvap)) != NULL && zfsvfs->z_use_fuids) {
+ if (XVA_ISSET_REQ(xvap, XAT_ARCHIVE)) {
+ xoap->xoa_archive =
+ ((zp->z_pflags & ZFS_ARCHIVE) != 0);
+ XVA_SET_RTN(xvap, XAT_ARCHIVE);
+ }
+
+ if (XVA_ISSET_REQ(xvap, XAT_READONLY)) {
+ xoap->xoa_readonly =
+ ((zp->z_pflags & ZFS_READONLY) != 0);
+ XVA_SET_RTN(xvap, XAT_READONLY);
+ }
+
+ if (XVA_ISSET_REQ(xvap, XAT_SYSTEM)) {
+ xoap->xoa_system =
+ ((zp->z_pflags & ZFS_SYSTEM) != 0);
+ XVA_SET_RTN(xvap, XAT_SYSTEM);
+ }
+
+ if (XVA_ISSET_REQ(xvap, XAT_HIDDEN)) {
+ xoap->xoa_hidden =
+ ((zp->z_pflags & ZFS_HIDDEN) != 0);
+ XVA_SET_RTN(xvap, XAT_HIDDEN);
+ }
+
+ if (XVA_ISSET_REQ(xvap, XAT_NOUNLINK)) {
+ xoap->xoa_nounlink =
+ ((zp->z_pflags & ZFS_NOUNLINK) != 0);
+ XVA_SET_RTN(xvap, XAT_NOUNLINK);
+ }
+
+ if (XVA_ISSET_REQ(xvap, XAT_IMMUTABLE)) {
+ xoap->xoa_immutable =
+ ((zp->z_pflags & ZFS_IMMUTABLE) != 0);
+ XVA_SET_RTN(xvap, XAT_IMMUTABLE);
+ }
+
+ if (XVA_ISSET_REQ(xvap, XAT_APPENDONLY)) {
+ xoap->xoa_appendonly =
+ ((zp->z_pflags & ZFS_APPENDONLY) != 0);
+ XVA_SET_RTN(xvap, XAT_APPENDONLY);
+ }
+
+ if (XVA_ISSET_REQ(xvap, XAT_NODUMP)) {
+ xoap->xoa_nodump =
+ ((zp->z_pflags & ZFS_NODUMP) != 0);
+ XVA_SET_RTN(xvap, XAT_NODUMP);
+ }
+
+ if (XVA_ISSET_REQ(xvap, XAT_OPAQUE)) {
+ xoap->xoa_opaque =
+ ((zp->z_pflags & ZFS_OPAQUE) != 0);
+ XVA_SET_RTN(xvap, XAT_OPAQUE);
+ }
+
+ if (XVA_ISSET_REQ(xvap, XAT_AV_QUARANTINED)) {
+ xoap->xoa_av_quarantined =
+ ((zp->z_pflags & ZFS_AV_QUARANTINED) != 0);
+ XVA_SET_RTN(xvap, XAT_AV_QUARANTINED);
+ }
+
+ if (XVA_ISSET_REQ(xvap, XAT_AV_MODIFIED)) {
+ xoap->xoa_av_modified =
+ ((zp->z_pflags & ZFS_AV_MODIFIED) != 0);
+ XVA_SET_RTN(xvap, XAT_AV_MODIFIED);
+ }
+
+ if (XVA_ISSET_REQ(xvap, XAT_AV_SCANSTAMP) &&
+ vp->v_type == VREG) {
+ zfs_sa_get_scanstamp(zp, xvap);
+ }
+
+ if (XVA_ISSET_REQ(xvap, XAT_REPARSE)) {
+ xoap->xoa_reparse = ((zp->z_pflags & ZFS_REPARSE) != 0);
+ XVA_SET_RTN(xvap, XAT_REPARSE);
+ }
+ if (XVA_ISSET_REQ(xvap, XAT_GEN)) {
+ xoap->xoa_generation = zp->z_gen;
+ XVA_SET_RTN(xvap, XAT_GEN);
+ }
+
+ if (XVA_ISSET_REQ(xvap, XAT_OFFLINE)) {
+ xoap->xoa_offline =
+ ((zp->z_pflags & ZFS_OFFLINE) != 0);
+ XVA_SET_RTN(xvap, XAT_OFFLINE);
+ }
+
+ if (XVA_ISSET_REQ(xvap, XAT_SPARSE)) {
+ xoap->xoa_sparse =
+ ((zp->z_pflags & ZFS_SPARSE) != 0);
+ XVA_SET_RTN(xvap, XAT_SPARSE);
+ }
+ }
+
+ ZFS_TIME_DECODE(&vap->va_atime, zp->z_atime);
+ ZFS_TIME_DECODE(&vap->va_mtime, mtime);
+ ZFS_TIME_DECODE(&vap->va_ctime, ctime);
+ ZFS_TIME_DECODE(&vap->va_birthtime, crtime);
+
+ sa_object_size(zp->z_sa_hdl, &blksize, &nblocks);
+ vap->va_blksize = blksize;
+ vap->va_bytes = nblocks << 9; /* nblocks * 512 */
+
+ if (zp->z_blksz == 0) {
+ /*
+ * Block size hasn't been set; suggest maximal I/O transfers.
+ */
+ vap->va_blksize = zfsvfs->z_max_blksz;
+ }
+
+ ZFS_EXIT(zfsvfs);
+ return (0);
+}
+
+/*
+ * Set the file attributes to the values contained in the
+ * vattr structure.
+ *
+ * IN: vp - vnode of file to be modified.
+ * vap - new attribute values.
+ * If AT_XVATTR set, then optional attrs are being set
+ * flags - ATTR_UTIME set if non-default time values provided.
+ * - ATTR_NOACLCHECK (CIFS context only).
+ * cr - credentials of caller.
+ * ct - caller context
+ *
+ * RETURN: 0 on success, error code on failure.
+ *
+ * Timestamps:
+ * vp - ctime updated, mtime updated if size changed.
+ */
+/* ARGSUSED */
+static int
+zfs_setattr(vnode_t *vp, vattr_t *vap, int flags, cred_t *cr,
+ caller_context_t *ct)
+{
+ znode_t *zp = VTOZ(vp);
+ zfsvfs_t *zfsvfs = zp->z_zfsvfs;
+ zilog_t *zilog;
+ dmu_tx_t *tx;
+ vattr_t oldva;
+ xvattr_t tmpxvattr;
+ uint_t mask = vap->va_mask;
+ uint_t saved_mask = 0;
+ uint64_t saved_mode;
+ int trim_mask = 0;
+ uint64_t new_mode;
+ uint64_t new_uid, new_gid;
+ uint64_t xattr_obj;
+ uint64_t mtime[2], ctime[2];
+ znode_t *attrzp;
+ int need_policy = FALSE;
+ int err, err2;
+ zfs_fuid_info_t *fuidp = NULL;
+ xvattr_t *xvap = (xvattr_t *)vap; /* vap may be an xvattr_t * */
+ xoptattr_t *xoap;
+ zfs_acl_t *aclp;
+ boolean_t skipaclchk = (flags & ATTR_NOACLCHECK) ? B_TRUE : B_FALSE;
+ boolean_t fuid_dirtied = B_FALSE;
+ sa_bulk_attr_t bulk[7], xattr_bulk[7];
+ int count = 0, xattr_count = 0;
+
+ if (mask == 0)
+ return (0);
+
+ if (mask & AT_NOSET)
+ return (SET_ERROR(EINVAL));
+
+ ZFS_ENTER(zfsvfs);
+ ZFS_VERIFY_ZP(zp);
+
+ zilog = zfsvfs->z_log;
+
+ /*
+ * Make sure that if we have ephemeral uid/gid or xvattr specified
+ * that file system is at proper version level
+ */
+
+ if (zfsvfs->z_use_fuids == B_FALSE &&
+ (((mask & AT_UID) && IS_EPHEMERAL(vap->va_uid)) ||
+ ((mask & AT_GID) && IS_EPHEMERAL(vap->va_gid)) ||
+ (mask & AT_XVATTR))) {
+ ZFS_EXIT(zfsvfs);
+ return (SET_ERROR(EINVAL));
+ }
+
+ if (mask & AT_SIZE && vp->v_type == VDIR) {
+ ZFS_EXIT(zfsvfs);
+ return (SET_ERROR(EISDIR));
+ }
+
+ if (mask & AT_SIZE && vp->v_type != VREG && vp->v_type != VFIFO) {
+ ZFS_EXIT(zfsvfs);
+ return (SET_ERROR(EINVAL));
+ }
+
+ /*
+ * If this is an xvattr_t, then get a pointer to the structure of
+ * optional attributes. If this is NULL, then we have a vattr_t.
+ */
+ xoap = xva_getxoptattr(xvap);
+
+ xva_init(&tmpxvattr);
+
+ /*
+ * Immutable files can only alter immutable bit and atime
+ */
+ if ((zp->z_pflags & ZFS_IMMUTABLE) &&
+ ((mask & (AT_SIZE|AT_UID|AT_GID|AT_MTIME|AT_MODE)) ||
+ ((mask & AT_XVATTR) && XVA_ISSET_REQ(xvap, XAT_CREATETIME)))) {
+ ZFS_EXIT(zfsvfs);
+ return (SET_ERROR(EPERM));
+ }
+
+ /*
+ * Note: ZFS_READONLY is handled in zfs_zaccess_common.
+ */
+
+ /*
+	 * Verify that the timestamps don't overflow 32 bits.
+	 * ZFS can handle large timestamps, but 32-bit syscalls can't
+	 * handle times past January 2038. This check should be removed
+ * once large timestamps are fully supported.
+ */
+ if (mask & (AT_ATIME | AT_MTIME)) {
+ if (((mask & AT_ATIME) && TIMESPEC_OVERFLOW(&vap->va_atime)) ||
+ ((mask & AT_MTIME) && TIMESPEC_OVERFLOW(&vap->va_mtime))) {
+ ZFS_EXIT(zfsvfs);
+ return (SET_ERROR(EOVERFLOW));
+ }
+ }
+ if (xoap && (mask & AT_XVATTR) && XVA_ISSET_REQ(xvap, XAT_CREATETIME) &&
+ TIMESPEC_OVERFLOW(&vap->va_birthtime)) {
+ ZFS_EXIT(zfsvfs);
+ return (SET_ERROR(EOVERFLOW));
+ }
+
+ attrzp = NULL;
+ aclp = NULL;
+
+	/* XXX: this read-only check could be done on entry, before any setup. */
+ if (zfsvfs->z_vfs->vfs_flag & VFS_RDONLY) {
+ ZFS_EXIT(zfsvfs);
+ return (SET_ERROR(EROFS));
+ }
+
+ /*
+ * First validate permissions
+ */
+
+ if (mask & AT_SIZE) {
+ /*
+ * XXX - Note, we are not providing any open
+ * mode flags here (like FNDELAY), so we may
+ * block if there are locks present... this
+ * should be addressed in openat().
+ */
+ /* XXX - would it be OK to generate a log record here? */
+ err = zfs_freesp(zp, vap->va_size, 0, 0, FALSE);
+ if (err) {
+ ZFS_EXIT(zfsvfs);
+ return (err);
+ }
+ }
+
+ if (mask & (AT_ATIME|AT_MTIME) ||
+ ((mask & AT_XVATTR) && (XVA_ISSET_REQ(xvap, XAT_HIDDEN) ||
+ XVA_ISSET_REQ(xvap, XAT_READONLY) ||
+ XVA_ISSET_REQ(xvap, XAT_ARCHIVE) ||
+ XVA_ISSET_REQ(xvap, XAT_OFFLINE) ||
+ XVA_ISSET_REQ(xvap, XAT_SPARSE) ||
+ XVA_ISSET_REQ(xvap, XAT_CREATETIME) ||
+ XVA_ISSET_REQ(xvap, XAT_SYSTEM)))) {
+ need_policy = zfs_zaccess(zp, ACE_WRITE_ATTRIBUTES, 0,
+ skipaclchk, cr);
+ }
+
+ if (mask & (AT_UID|AT_GID)) {
+ int idmask = (mask & (AT_UID|AT_GID));
+ int take_owner;
+ int take_group;
+
+ /*
+ * NOTE: even if a new mode is being set,
+ * we may clear S_ISUID/S_ISGID bits.
+ */
+
+ if (!(mask & AT_MODE))
+ vap->va_mode = zp->z_mode;
+
+ /*
+ * Take ownership or chgrp to group we are a member of
+ */
+
+ take_owner = (mask & AT_UID) && (vap->va_uid == crgetuid(cr));
+ take_group = (mask & AT_GID) &&
+ zfs_groupmember(zfsvfs, vap->va_gid, cr);
+
+ /*
+ * If both AT_UID and AT_GID are set then take_owner and
+ * take_group must both be set in order to allow taking
+ * ownership.
+ *
+ * Otherwise, send the check through secpolicy_vnode_setattr()
+ *
+ */
+
+ if (((idmask == (AT_UID|AT_GID)) && take_owner && take_group) ||
+ ((idmask == AT_UID) && take_owner) ||
+ ((idmask == AT_GID) && take_group)) {
+ if (zfs_zaccess(zp, ACE_WRITE_OWNER, 0,
+ skipaclchk, cr) == 0) {
+ /*
+ * Remove setuid/setgid for non-privileged users
+ */
+ secpolicy_setid_clear(vap, vp, cr);
+ trim_mask = (mask & (AT_UID|AT_GID));
+ } else {
+ need_policy = TRUE;
+ }
+ } else {
+ need_policy = TRUE;
+ }
+ }
+
+ oldva.va_mode = zp->z_mode;
+ zfs_fuid_map_ids(zp, cr, &oldva.va_uid, &oldva.va_gid);
+ if (mask & AT_XVATTR) {
+ /*
+ * Update xvattr mask to include only those attributes
+ * that are actually changing.
+ *
+ * the bits will be restored prior to actually setting
+ * the attributes so the caller thinks they were set.
+ */
+ if (XVA_ISSET_REQ(xvap, XAT_APPENDONLY)) {
+ if (xoap->xoa_appendonly !=
+ ((zp->z_pflags & ZFS_APPENDONLY) != 0)) {
+ need_policy = TRUE;
+ } else {
+ XVA_CLR_REQ(xvap, XAT_APPENDONLY);
+ XVA_SET_REQ(&tmpxvattr, XAT_APPENDONLY);
+ }
+ }
+
+ if (XVA_ISSET_REQ(xvap, XAT_NOUNLINK)) {
+ if (xoap->xoa_nounlink !=
+ ((zp->z_pflags & ZFS_NOUNLINK) != 0)) {
+ need_policy = TRUE;
+ } else {
+ XVA_CLR_REQ(xvap, XAT_NOUNLINK);
+ XVA_SET_REQ(&tmpxvattr, XAT_NOUNLINK);
+ }
+ }
+
+ if (XVA_ISSET_REQ(xvap, XAT_IMMUTABLE)) {
+ if (xoap->xoa_immutable !=
+ ((zp->z_pflags & ZFS_IMMUTABLE) != 0)) {
+ need_policy = TRUE;
+ } else {
+ XVA_CLR_REQ(xvap, XAT_IMMUTABLE);
+ XVA_SET_REQ(&tmpxvattr, XAT_IMMUTABLE);
+ }
+ }
+
+ if (XVA_ISSET_REQ(xvap, XAT_NODUMP)) {
+ if (xoap->xoa_nodump !=
+ ((zp->z_pflags & ZFS_NODUMP) != 0)) {
+ need_policy = TRUE;
+ } else {
+ XVA_CLR_REQ(xvap, XAT_NODUMP);
+ XVA_SET_REQ(&tmpxvattr, XAT_NODUMP);
+ }
+ }
+
+ if (XVA_ISSET_REQ(xvap, XAT_AV_MODIFIED)) {
+ if (xoap->xoa_av_modified !=
+ ((zp->z_pflags & ZFS_AV_MODIFIED) != 0)) {
+ need_policy = TRUE;
+ } else {
+ XVA_CLR_REQ(xvap, XAT_AV_MODIFIED);
+ XVA_SET_REQ(&tmpxvattr, XAT_AV_MODIFIED);
+ }
+ }
+
+ if (XVA_ISSET_REQ(xvap, XAT_AV_QUARANTINED)) {
+ if ((vp->v_type != VREG &&
+ xoap->xoa_av_quarantined) ||
+ xoap->xoa_av_quarantined !=
+ ((zp->z_pflags & ZFS_AV_QUARANTINED) != 0)) {
+ need_policy = TRUE;
+ } else {
+ XVA_CLR_REQ(xvap, XAT_AV_QUARANTINED);
+ XVA_SET_REQ(&tmpxvattr, XAT_AV_QUARANTINED);
+ }
+ }
+
+ if (XVA_ISSET_REQ(xvap, XAT_REPARSE)) {
+ ZFS_EXIT(zfsvfs);
+ return (SET_ERROR(EPERM));
+ }
+
+ if (need_policy == FALSE &&
+ (XVA_ISSET_REQ(xvap, XAT_AV_SCANSTAMP) ||
+ XVA_ISSET_REQ(xvap, XAT_OPAQUE))) {
+ need_policy = TRUE;
+ }
+ }
+
+ if (mask & AT_MODE) {
+ if (zfs_zaccess(zp, ACE_WRITE_ACL, 0, skipaclchk, cr) == 0) {
+ err = secpolicy_setid_setsticky_clear(vp, vap,
+ &oldva, cr);
+ if (err) {
+ ZFS_EXIT(zfsvfs);
+ return (err);
+ }
+ trim_mask |= AT_MODE;
+ } else {
+ need_policy = TRUE;
+ }
+ }
+
+ if (need_policy) {
+ /*
+ * If trim_mask is set then take ownership
+ * has been granted or write_acl is present and user
+ * has the ability to modify mode. In that case remove
+ * UID|GID and or MODE from mask so that
+ * secpolicy_vnode_setattr() doesn't revoke it.
+ */
+
+ if (trim_mask) {
+ saved_mask = vap->va_mask;
+ vap->va_mask &= ~trim_mask;
+ if (trim_mask & AT_MODE) {
+ /*
+ * Save the mode, as secpolicy_vnode_setattr()
+ * will overwrite it with ova.va_mode.
+ */
+ saved_mode = vap->va_mode;
+ }
+ }
+ err = secpolicy_vnode_setattr(cr, vp, vap, &oldva, flags,
+ (int (*)(void *, int, cred_t *))zfs_zaccess_unix, zp);
+ if (err) {
+ ZFS_EXIT(zfsvfs);
+ return (err);
+ }
+
+ if (trim_mask) {
+ vap->va_mask |= saved_mask;
+ if (trim_mask & AT_MODE) {
+ /*
+ * Recover the mode after
+ * secpolicy_vnode_setattr().
+ */
+ vap->va_mode = saved_mode;
+ }
+ }
+ }
+
+ /*
+ * secpolicy_vnode_setattr, or take ownership may have
+ * changed va_mask
+ */
+ mask = vap->va_mask;
+
+ if ((mask & (AT_UID | AT_GID))) {
+ err = sa_lookup(zp->z_sa_hdl, SA_ZPL_XATTR(zfsvfs),
+ &xattr_obj, sizeof (xattr_obj));
+
+ if (err == 0 && xattr_obj) {
+ err = zfs_zget(zp->z_zfsvfs, xattr_obj, &attrzp);
+ if (err == 0) {
+ err = vn_lock(ZTOV(attrzp), LK_EXCLUSIVE);
+ if (err != 0)
+ vrele(ZTOV(attrzp));
+ }
+ if (err)
+ goto out2;
+ }
+ if (mask & AT_UID) {
+ new_uid = zfs_fuid_create(zfsvfs,
+ (uint64_t)vap->va_uid, cr, ZFS_OWNER, &fuidp);
+ if (new_uid != zp->z_uid &&
+ zfs_fuid_overquota(zfsvfs, B_FALSE, new_uid)) {
+ if (attrzp)
+ vput(ZTOV(attrzp));
+ err = SET_ERROR(EDQUOT);
+ goto out2;
+ }
+ }
+
+ if (mask & AT_GID) {
+ new_gid = zfs_fuid_create(zfsvfs, (uint64_t)vap->va_gid,
+ cr, ZFS_GROUP, &fuidp);
+ if (new_gid != zp->z_gid &&
+ zfs_fuid_overquota(zfsvfs, B_TRUE, new_gid)) {
+ if (attrzp)
+ vput(ZTOV(attrzp));
+ err = SET_ERROR(EDQUOT);
+ goto out2;
+ }
+ }
+ }
+ tx = dmu_tx_create(zfsvfs->z_os);
+
+ if (mask & AT_MODE) {
+ uint64_t pmode = zp->z_mode;
+ uint64_t acl_obj;
+ new_mode = (pmode & S_IFMT) | (vap->va_mode & ~S_IFMT);
+
+ if (zp->z_zfsvfs->z_acl_mode == ZFS_ACL_RESTRICTED &&
+ !(zp->z_pflags & ZFS_ACL_TRIVIAL)) {
+ err = SET_ERROR(EPERM);
+ goto out;
+ }
+
+ if (err = zfs_acl_chmod_setattr(zp, &aclp, new_mode))
+ goto out;
+
+ if (!zp->z_is_sa && ((acl_obj = zfs_external_acl(zp)) != 0)) {
+ /*
+ * Are we upgrading ACL from old V0 format
+ * to V1 format?
+ */
+ if (zfsvfs->z_version >= ZPL_VERSION_FUID &&
+ zfs_znode_acl_version(zp) ==
+ ZFS_ACL_VERSION_INITIAL) {
+ dmu_tx_hold_free(tx, acl_obj, 0,
+ DMU_OBJECT_END);
+ dmu_tx_hold_write(tx, DMU_NEW_OBJECT,
+ 0, aclp->z_acl_bytes);
+ } else {
+ dmu_tx_hold_write(tx, acl_obj, 0,
+ aclp->z_acl_bytes);
+ }
+ } else if (!zp->z_is_sa && aclp->z_acl_bytes > ZFS_ACE_SPACE) {
+ dmu_tx_hold_write(tx, DMU_NEW_OBJECT,
+ 0, aclp->z_acl_bytes);
+ }
+ dmu_tx_hold_sa(tx, zp->z_sa_hdl, B_TRUE);
+ } else {
+ if ((mask & AT_XVATTR) &&
+ XVA_ISSET_REQ(xvap, XAT_AV_SCANSTAMP))
+ dmu_tx_hold_sa(tx, zp->z_sa_hdl, B_TRUE);
+ else
+ dmu_tx_hold_sa(tx, zp->z_sa_hdl, B_FALSE);
+ }
+
+ if (attrzp) {
+ dmu_tx_hold_sa(tx, attrzp->z_sa_hdl, B_FALSE);
+ }
+
+ fuid_dirtied = zfsvfs->z_fuid_dirty;
+ if (fuid_dirtied)
+ zfs_fuid_txhold(zfsvfs, tx);
+
+ zfs_sa_upgrade_txholds(tx, zp);
+
+ err = dmu_tx_assign(tx, TXG_WAIT);
+ if (err)
+ goto out;
+
+ count = 0;
+ /*
+ * Set each attribute requested.
+ * We group settings according to the locks they need to acquire.
+ *
+ * Note: you cannot set ctime directly, although it will be
+ * updated as a side-effect of calling this function.
+ */
+
+ if (mask & (AT_UID|AT_GID|AT_MODE))
+ mutex_enter(&zp->z_acl_lock);
+
+ SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_FLAGS(zfsvfs), NULL,
+ &zp->z_pflags, sizeof (zp->z_pflags));
+
+ if (attrzp) {
+ if (mask & (AT_UID|AT_GID|AT_MODE))
+ mutex_enter(&attrzp->z_acl_lock);
+ SA_ADD_BULK_ATTR(xattr_bulk, xattr_count,
+ SA_ZPL_FLAGS(zfsvfs), NULL, &attrzp->z_pflags,
+ sizeof (attrzp->z_pflags));
+ }
+
+ if (mask & (AT_UID|AT_GID)) {
+
+ if (mask & AT_UID) {
+ SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_UID(zfsvfs), NULL,
+ &new_uid, sizeof (new_uid));
+ zp->z_uid = new_uid;
+ if (attrzp) {
+ SA_ADD_BULK_ATTR(xattr_bulk, xattr_count,
+ SA_ZPL_UID(zfsvfs), NULL, &new_uid,
+ sizeof (new_uid));
+ attrzp->z_uid = new_uid;
+ }
+ }
+
+ if (mask & AT_GID) {
+ SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_GID(zfsvfs),
+ NULL, &new_gid, sizeof (new_gid));
+ zp->z_gid = new_gid;
+ if (attrzp) {
+ SA_ADD_BULK_ATTR(xattr_bulk, xattr_count,
+ SA_ZPL_GID(zfsvfs), NULL, &new_gid,
+ sizeof (new_gid));
+ attrzp->z_gid = new_gid;
+ }
+ }
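+		/*
+		 * Note: SA_ADD_BULK_ATTR() only records the address of
+		 * new_mode; the value is read later by sa_bulk_update(),
+		 * so assigning new_mode after registering it is safe.
+		 */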
+ if (!(mask & AT_MODE)) {
+ SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_MODE(zfsvfs),
+ NULL, &new_mode, sizeof (new_mode));
+ new_mode = zp->z_mode;
+ }
+ err = zfs_acl_chown_setattr(zp);
+ ASSERT(err == 0);
+ if (attrzp) {
+ err = zfs_acl_chown_setattr(attrzp);
+ ASSERT(err == 0);
+ }
+ }
+
+ if (mask & AT_MODE) {
+ SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_MODE(zfsvfs), NULL,
+ &new_mode, sizeof (new_mode));
+ zp->z_mode = new_mode;
+		ASSERT3P(aclp, !=, NULL);
+ err = zfs_aclset_common(zp, aclp, cr, tx);
+ ASSERT0(err);
+ if (zp->z_acl_cached)
+ zfs_acl_free(zp->z_acl_cached);
+ zp->z_acl_cached = aclp;
+ aclp = NULL;
+ }
+
+ if (mask & AT_ATIME) {
+ ZFS_TIME_ENCODE(&vap->va_atime, zp->z_atime);
+ SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_ATIME(zfsvfs), NULL,
+ &zp->z_atime, sizeof (zp->z_atime));
+ }
+
+ if (mask & AT_MTIME) {
+ ZFS_TIME_ENCODE(&vap->va_mtime, mtime);
+ SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_MTIME(zfsvfs), NULL,
+ mtime, sizeof (mtime));
+ }
+
+ /* XXX - shouldn't this be done *before* the ATIME/MTIME checks? */
+ if (mask & AT_SIZE && !(mask & AT_MTIME)) {
+ SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_MTIME(zfsvfs),
+ NULL, mtime, sizeof (mtime));
+ SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_CTIME(zfsvfs), NULL,
+ &ctime, sizeof (ctime));
+ zfs_tstamp_update_setup(zp, CONTENT_MODIFIED, mtime, ctime,
+ B_TRUE);
+ } else if (mask != 0) {
+ SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_CTIME(zfsvfs), NULL,
+ &ctime, sizeof (ctime));
+ zfs_tstamp_update_setup(zp, STATE_CHANGED, mtime, ctime,
+ B_TRUE);
+ if (attrzp) {
+ SA_ADD_BULK_ATTR(xattr_bulk, xattr_count,
+ SA_ZPL_CTIME(zfsvfs), NULL,
+ &ctime, sizeof (ctime));
+ zfs_tstamp_update_setup(attrzp, STATE_CHANGED,
+ mtime, ctime, B_TRUE);
+ }
+ }
+ /*
+ * Do this after setting timestamps to prevent timestamp
+ * update from toggling bit
+ */
+
+ if (xoap && (mask & AT_XVATTR)) {
+
+ if (XVA_ISSET_REQ(xvap, XAT_CREATETIME))
+ xoap->xoa_createtime = vap->va_birthtime;
+ /*
+ * restore trimmed off masks
+ * so that return masks can be set for caller.
+ */
+
+ if (XVA_ISSET_REQ(&tmpxvattr, XAT_APPENDONLY)) {
+ XVA_SET_REQ(xvap, XAT_APPENDONLY);
+ }
+ if (XVA_ISSET_REQ(&tmpxvattr, XAT_NOUNLINK)) {
+ XVA_SET_REQ(xvap, XAT_NOUNLINK);
+ }
+ if (XVA_ISSET_REQ(&tmpxvattr, XAT_IMMUTABLE)) {
+ XVA_SET_REQ(xvap, XAT_IMMUTABLE);
+ }
+ if (XVA_ISSET_REQ(&tmpxvattr, XAT_NODUMP)) {
+ XVA_SET_REQ(xvap, XAT_NODUMP);
+ }
+ if (XVA_ISSET_REQ(&tmpxvattr, XAT_AV_MODIFIED)) {
+ XVA_SET_REQ(xvap, XAT_AV_MODIFIED);
+ }
+ if (XVA_ISSET_REQ(&tmpxvattr, XAT_AV_QUARANTINED)) {
+ XVA_SET_REQ(xvap, XAT_AV_QUARANTINED);
+ }
+
+ if (XVA_ISSET_REQ(xvap, XAT_AV_SCANSTAMP))
+ ASSERT(vp->v_type == VREG);
+
+ zfs_xvattr_set(zp, xvap, tx);
+ }
+
+ if (fuid_dirtied)
+ zfs_fuid_sync(zfsvfs, tx);
+
+ if (mask != 0)
+ zfs_log_setattr(zilog, tx, TX_SETATTR, zp, vap, mask, fuidp);
+
+ if (mask & (AT_UID|AT_GID|AT_MODE))
+ mutex_exit(&zp->z_acl_lock);
+
+ if (attrzp) {
+ if (mask & (AT_UID|AT_GID|AT_MODE))
+ mutex_exit(&attrzp->z_acl_lock);
+ }
+out:
+ if (err == 0 && attrzp) {
+ err2 = sa_bulk_update(attrzp->z_sa_hdl, xattr_bulk,
+ xattr_count, tx);
+ ASSERT(err2 == 0);
+ }
+
+ if (attrzp)
+ vput(ZTOV(attrzp));
+
+ if (aclp)
+ zfs_acl_free(aclp);
+
+ if (fuidp) {
+ zfs_fuid_info_free(fuidp);
+ fuidp = NULL;
+ }
+
+ if (err) {
+ dmu_tx_abort(tx);
+ } else {
+ err2 = sa_bulk_update(zp->z_sa_hdl, bulk, count, tx);
+ dmu_tx_commit(tx);
+ }
+
+out2:
+ if (zfsvfs->z_os->os_sync == ZFS_SYNC_ALWAYS)
+ zil_commit(zilog, 0);
+
+ ZFS_EXIT(zfsvfs);
+ return (err);
+}
+
+/*
+ * We acquire all but fdvp locks using non-blocking acquisitions. If we
+ * fail to acquire any lock in the path we will drop all held locks,
+ * acquire the new lock in a blocking fashion, and then release it and
+ * restart the rename. This acquire/release step ensures that we do not
+ * spin on a lock waiting for release. On error, release all vnode locks
+ * and decrement references the way tmpfs_rename() would do.
+ */
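+/*
+ * A minimal sketch of the retry scheme described above, assuming just two
+ * locks for clarity:
+ *
+ *	relock:
+ *		vn_lock(a, LK_EXCLUSIVE);
+ *		if (vn_lock(b, LK_EXCLUSIVE | LK_NOWAIT) == EBUSY) {
+ *			VOP_UNLOCK(a);
+ *			vn_lock(b, LK_EXCLUSIVE);	- wait for the holder
+ *			VOP_UNLOCK(b);			- but do not keep it
+ *			goto relock;			- retry from scratch
+ *		}
+ *
+ * Blocking only while holding no other lock avoids deadlock, and the
+ * blocking acquire/release pair avoids spinning on a contended lock.
+ */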
+static int
+zfs_rename_relock(struct vnode *sdvp, struct vnode **svpp,
+ struct vnode *tdvp, struct vnode **tvpp,
+ const struct componentname *scnp, const struct componentname *tcnp)
+{
+ zfsvfs_t *zfsvfs;
+ struct vnode *nvp, *svp, *tvp;
+ znode_t *sdzp, *tdzp, *szp, *tzp;
+ const char *snm = scnp->cn_nameptr;
+ const char *tnm = tcnp->cn_nameptr;
+ int error;
+
+ VOP_UNLOCK(tdvp);
+ if (*tvpp != NULL && *tvpp != tdvp)
+ VOP_UNLOCK(*tvpp);
+
+relock:
+ error = vn_lock(sdvp, LK_EXCLUSIVE);
+ if (error)
+ goto out;
+ sdzp = VTOZ(sdvp);
+
+ error = vn_lock(tdvp, LK_EXCLUSIVE | LK_NOWAIT);
+ if (error != 0) {
+ VOP_UNLOCK(sdvp);
+ if (error != EBUSY)
+ goto out;
+ error = vn_lock(tdvp, LK_EXCLUSIVE);
+ if (error)
+ goto out;
+ VOP_UNLOCK(tdvp);
+ goto relock;
+ }
+ tdzp = VTOZ(tdvp);
+
+ /*
+ * Before using sdzp and tdzp we must ensure that they are live.
+ * As a porting legacy from illumos we have two things to worry
+ * about. One is typical for FreeBSD and it is that the vnode is
+ * not reclaimed (doomed). The other is that the znode is live.
+ * The current code can invalidate the znode without acquiring the
+ * corresponding vnode lock if the object represented by the znode
+ * and vnode is no longer valid after a rollback or receive operation.
+ * z_teardown_lock hidden behind ZFS_ENTER and ZFS_EXIT is the lock
+ * that protects the znodes from the invalidation.
+ */
+ zfsvfs = sdzp->z_zfsvfs;
+ ASSERT3P(zfsvfs, ==, tdzp->z_zfsvfs);
+ ZFS_ENTER(zfsvfs);
+
+ /*
+	 * We cannot use ZFS_VERIFY_ZP() here because it could return directly,
+	 * bypassing the cleanup code in the case of an error.
+ */
+ if (tdzp->z_sa_hdl == NULL || sdzp->z_sa_hdl == NULL) {
+ ZFS_EXIT(zfsvfs);
+ VOP_UNLOCK(sdvp);
+ VOP_UNLOCK(tdvp);
+ error = SET_ERROR(EIO);
+ goto out;
+ }
+
+ /*
+ * Re-resolve svp to be certain it still exists and fetch the
+ * correct vnode.
+ */
+ error = zfs_dirent_lookup(sdzp, snm, &szp, ZEXISTS);
+ if (error != 0) {
+ /* Source entry invalid or not there. */
+ ZFS_EXIT(zfsvfs);
+ VOP_UNLOCK(sdvp);
+ VOP_UNLOCK(tdvp);
+ if ((scnp->cn_flags & ISDOTDOT) != 0 ||
+ (scnp->cn_namelen == 1 && scnp->cn_nameptr[0] == '.'))
+ error = SET_ERROR(EINVAL);
+ goto out;
+ }
+ svp = ZTOV(szp);
+
+ /*
+ * Re-resolve tvp, if it disappeared we just carry on.
+ */
+ error = zfs_dirent_lookup(tdzp, tnm, &tzp, 0);
+ if (error != 0) {
+ ZFS_EXIT(zfsvfs);
+ VOP_UNLOCK(sdvp);
+ VOP_UNLOCK(tdvp);
+ vrele(svp);
+ if ((tcnp->cn_flags & ISDOTDOT) != 0)
+ error = SET_ERROR(EINVAL);
+ goto out;
+ }
+ if (tzp != NULL)
+ tvp = ZTOV(tzp);
+ else
+ tvp = NULL;
+
+ /*
+ * At present the vnode locks must be acquired before z_teardown_lock,
+ * although it would be more logical to use the opposite order.
+ */
+ ZFS_EXIT(zfsvfs);
+
+ /*
+ * Now try acquire locks on svp and tvp.
+ */
+ nvp = svp;
+ error = vn_lock(nvp, LK_EXCLUSIVE | LK_NOWAIT);
+ if (error != 0) {
+ VOP_UNLOCK(sdvp);
+ VOP_UNLOCK(tdvp);
+ if (tvp != NULL)
+ vrele(tvp);
+ if (error != EBUSY) {
+ vrele(nvp);
+ goto out;
+ }
+ error = vn_lock(nvp, LK_EXCLUSIVE);
+ if (error != 0) {
+ vrele(nvp);
+ goto out;
+ }
+ VOP_UNLOCK(nvp);
+		/*
+		 * Concurrent rename race: the source name now
+		 * resolves to the target directory, so give up.
+		 */
+ if (nvp == tdvp) {
+ vrele(nvp);
+ error = SET_ERROR(EINVAL);
+ goto out;
+ }
+ vrele(*svpp);
+ *svpp = nvp;
+ goto relock;
+ }
+ vrele(*svpp);
+ *svpp = nvp;
+
+ if (*tvpp != NULL)
+ vrele(*tvpp);
+ *tvpp = NULL;
+ if (tvp != NULL) {
+ nvp = tvp;
+ error = vn_lock(nvp, LK_EXCLUSIVE | LK_NOWAIT);
+ if (error != 0) {
+ VOP_UNLOCK(sdvp);
+ VOP_UNLOCK(tdvp);
+ VOP_UNLOCK(*svpp);
+ if (error != EBUSY) {
+ vrele(nvp);
+ goto out;
+ }
+ error = vn_lock(nvp, LK_EXCLUSIVE);
+ if (error != 0) {
+ vrele(nvp);
+ goto out;
+ }
+ vput(nvp);
+ goto relock;
+ }
+ *tvpp = nvp;
+ }
+
+ return (0);
+
+out:
+ return (error);
+}
+
+/*
+ * Note that we must use VRELE_ASYNC in this function as it walks
+ * up the directory tree and vrele may need to acquire an exclusive
+ * lock if a last reference to a vnode is dropped.
+ */
+static int
+zfs_rename_check(znode_t *szp, znode_t *sdzp, znode_t *tdzp)
+{
+ zfsvfs_t *zfsvfs;
+ znode_t *zp, *zp1;
+ uint64_t parent;
+ int error;
+
+ zfsvfs = tdzp->z_zfsvfs;
+ if (tdzp == szp)
+ return (SET_ERROR(EINVAL));
+ if (tdzp == sdzp)
+ return (0);
+ if (tdzp->z_id == zfsvfs->z_root)
+ return (0);
+ zp = tdzp;
+ for (;;) {
+ ASSERT(!zp->z_unlinked);
+ if ((error = sa_lookup(zp->z_sa_hdl,
+ SA_ZPL_PARENT(zfsvfs), &parent, sizeof (parent))) != 0)
+ break;
+
+ if (parent == szp->z_id) {
+ error = SET_ERROR(EINVAL);
+ break;
+ }
+ if (parent == zfsvfs->z_root)
+ break;
+ if (parent == sdzp->z_id)
+ break;
+
+ error = zfs_zget(zfsvfs, parent, &zp1);
+ if (error != 0)
+ break;
+
+ if (zp != tdzp)
+ VN_RELE_ASYNC(ZTOV(zp),
+ dsl_pool_vnrele_taskq(dmu_objset_pool(zfsvfs->z_os)));
+ zp = zp1;
+ }
+
+ if (error == ENOTDIR)
+ panic("checkpath: .. not a directory\n");
+ if (zp != tdzp)
+ VN_RELE_ASYNC(ZTOV(zp),
+ dsl_pool_vnrele_taskq(dmu_objset_pool(zfsvfs->z_os)));
+ return (error);
+}
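+
+/*
+ * Illustrative example (hypothetical paths): for a rename of /usr/a/b onto
+ * /usr/a/b/c/d, the walk above starts at the target directory c, follows
+ * SA_ZPL_PARENT to b, and finds parent == szp->z_id, so EINVAL is returned
+ * before a directory could become its own ancestor.
+ */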
+
+/*
+ * Move an entry from the provided source directory to the target
+ * directory. Change the entry name as indicated.
+ *
+ * IN:	sdvp	- Source directory containing the "old entry".
+ *	svpp	- Source vnode, the "old entry" itself.
+ *	scnp	- componentname for the old entry.
+ *	tdvp	- Target directory to contain the "new entry".
+ *	tvpp	- Target vnode, if the "new entry" already exists.
+ *	tcnp	- componentname for the new entry.
+ *	cr	- credentials of caller.
+ *
+ * RETURN: 0 on success, error code on failure.
+ *
+ * Timestamps:
+ * sdvp,tdvp - ctime|mtime updated
+ */
+/*ARGSUSED*/
+static int
+zfs_rename(vnode_t *sdvp, vnode_t **svpp, struct componentname *scnp,
+ vnode_t *tdvp, vnode_t **tvpp, struct componentname *tcnp,
+ cred_t *cr)
+{
+ zfsvfs_t *zfsvfs;
+ znode_t *sdzp, *tdzp, *szp, *tzp;
+ zilog_t *zilog = NULL;
+ dmu_tx_t *tx;
+ char *snm = scnp->cn_nameptr;
+ char *tnm = tcnp->cn_nameptr;
+ int error = 0;
+ bool want_seqc_end = false;
+
+ /* Reject renames across filesystems. */
+ if ((*svpp)->v_mount != tdvp->v_mount ||
+ ((*tvpp) != NULL && (*svpp)->v_mount != (*tvpp)->v_mount)) {
+ error = SET_ERROR(EXDEV);
+ goto out;
+ }
+
+ if (zfsctl_is_node(tdvp)) {
+ error = SET_ERROR(EXDEV);
+ goto out;
+ }
+
+ /*
+ * Lock all four vnodes to ensure safety and semantics of renaming.
+ */
+ error = zfs_rename_relock(sdvp, svpp, tdvp, tvpp, scnp, tcnp);
+ if (error != 0) {
+ /* no vnodes are locked in the case of error here */
+ return (error);
+ }
+
+ tdzp = VTOZ(tdvp);
+ sdzp = VTOZ(sdvp);
+ zfsvfs = tdzp->z_zfsvfs;
+ zilog = zfsvfs->z_log;
+
+ /*
+	 * After re-entering via ZFS_ENTER() we have to revalidate all
+	 * znodes involved.
+ */
+ ZFS_ENTER(zfsvfs);
+
+ if (zfsvfs->z_utf8 && u8_validate(tnm,
+ strlen(tnm), NULL, U8_VALIDATE_ENTIRE, &error) < 0) {
+ error = SET_ERROR(EILSEQ);
+ goto unlockout;
+ }
+
+ /* If source and target are the same file, there is nothing to do. */
+ if ((*svpp) == (*tvpp)) {
+ error = 0;
+ goto unlockout;
+ }
+
+ if (((*svpp)->v_type == VDIR && (*svpp)->v_mountedhere != NULL) ||
+ ((*tvpp) != NULL && (*tvpp)->v_type == VDIR &&
+ (*tvpp)->v_mountedhere != NULL)) {
+ error = SET_ERROR(EXDEV);
+ goto unlockout;
+ }
+
+ /*
+	 * We cannot use ZFS_VERIFY_ZP() here because it could return directly,
+	 * bypassing the cleanup code in the case of an error.
+ */
+ if (tdzp->z_sa_hdl == NULL || sdzp->z_sa_hdl == NULL) {
+ error = SET_ERROR(EIO);
+ goto unlockout;
+ }
+
+ szp = VTOZ(*svpp);
+ tzp = *tvpp == NULL ? NULL : VTOZ(*tvpp);
+ if (szp->z_sa_hdl == NULL || (tzp != NULL && tzp->z_sa_hdl == NULL)) {
+ error = SET_ERROR(EIO);
+ goto unlockout;
+ }
+
+ /*
+ * This is to prevent the creation of links into attribute space
+	 * by renaming a linked file into or out of an attribute directory.
+ * See the comment in zfs_link() for why this is considered bad.
+ */
+ if ((tdzp->z_pflags & ZFS_XATTR) != (sdzp->z_pflags & ZFS_XATTR)) {
+ error = SET_ERROR(EINVAL);
+ goto unlockout;
+ }
+
+ /*
+ * Must have write access at the source to remove the old entry
+ * and write access at the target to create the new entry.
+ * Note that if target and source are the same, this can be
+ * done in a single check.
+ */
+ if (error = zfs_zaccess_rename(sdzp, szp, tdzp, tzp, cr))
+ goto unlockout;
+
+ if ((*svpp)->v_type == VDIR) {
+ /*
+ * Avoid ".", "..", and aliases of "." for obvious reasons.
+ */
+ if ((scnp->cn_namelen == 1 && scnp->cn_nameptr[0] == '.') ||
+ sdzp == szp ||
+ (scnp->cn_flags | tcnp->cn_flags) & ISDOTDOT) {
+			error = SET_ERROR(EINVAL);
+ goto unlockout;
+ }
+
+ /*
+ * Check to make sure rename is valid.
+ * Can't do a move like this: /usr/a/b to /usr/a/b/c/d
+ */
+ if (error = zfs_rename_check(szp, sdzp, tdzp))
+ goto unlockout;
+ }
+
+ /*
+ * Does target exist?
+ */
+ if (tzp) {
+ /*
+ * Source and target must be the same type.
+ */
+ if ((*svpp)->v_type == VDIR) {
+ if ((*tvpp)->v_type != VDIR) {
+ error = SET_ERROR(ENOTDIR);
+ goto unlockout;
+ } else {
+ cache_purge(tdvp);
+ if (sdvp != tdvp)
+ cache_purge(sdvp);
+ }
+ } else {
+ if ((*tvpp)->v_type == VDIR) {
+ error = SET_ERROR(EISDIR);
+ goto unlockout;
+ }
+ }
+ }
+
+ vn_seqc_write_begin(*svpp);
+ vn_seqc_write_begin(sdvp);
+ if (*tvpp != NULL)
+ vn_seqc_write_begin(*tvpp);
+ if (tdvp != *tvpp)
+ vn_seqc_write_begin(tdvp);
+ want_seqc_end = true;
+
+ vnevent_rename_src(*svpp, sdvp, scnp->cn_nameptr, ct);
+ if (tzp)
+ vnevent_rename_dest(*tvpp, tdvp, tnm, ct);
+
+ /*
+	 * Notify the target directory if it is not the same
+	 * as the source directory.
+ */
+ if (tdvp != sdvp) {
+ vnevent_rename_dest_dir(tdvp, ct);
+ }
+
+ tx = dmu_tx_create(zfsvfs->z_os);
+ dmu_tx_hold_sa(tx, szp->z_sa_hdl, B_FALSE);
+ dmu_tx_hold_sa(tx, sdzp->z_sa_hdl, B_FALSE);
+ dmu_tx_hold_zap(tx, sdzp->z_id, FALSE, snm);
+ dmu_tx_hold_zap(tx, tdzp->z_id, TRUE, tnm);
+ if (sdzp != tdzp) {
+ dmu_tx_hold_sa(tx, tdzp->z_sa_hdl, B_FALSE);
+ zfs_sa_upgrade_txholds(tx, tdzp);
+ }
+ if (tzp) {
+ dmu_tx_hold_sa(tx, tzp->z_sa_hdl, B_FALSE);
+ zfs_sa_upgrade_txholds(tx, tzp);
+ }
+
+ zfs_sa_upgrade_txholds(tx, szp);
+ dmu_tx_hold_zap(tx, zfsvfs->z_unlinkedobj, FALSE, NULL);
+ error = dmu_tx_assign(tx, TXG_WAIT);
+ if (error) {
+ dmu_tx_abort(tx);
+ goto unlockout;
+ }
+
+ if (tzp) /* Attempt to remove the existing target */
+ error = zfs_link_destroy(tdzp, tnm, tzp, tx, 0, NULL);
+
+ if (error == 0) {
+ error = zfs_link_create(tdzp, tnm, szp, tx, ZRENAMING);
+ if (error == 0) {
+ szp->z_pflags |= ZFS_AV_MODIFIED;
+
+ error = sa_update(szp->z_sa_hdl, SA_ZPL_FLAGS(zfsvfs),
+ (void *)&szp->z_pflags, sizeof (uint64_t), tx);
+ ASSERT0(error);
+
+ error = zfs_link_destroy(sdzp, snm, szp, tx, ZRENAMING,
+ NULL);
+ if (error == 0) {
+ zfs_log_rename(zilog, tx, TX_RENAME, sdzp,
+ snm, tdzp, tnm, szp);
+
+ /*
+ * Update path information for the target vnode
+ */
+ vn_renamepath(tdvp, *svpp, tnm, strlen(tnm));
+ } else {
+ /*
+ * At this point, we have successfully created
+ * the target name, but have failed to remove
+ * the source name. Since the create was done
+ * with the ZRENAMING flag, there are
+ * complications; for one, the link count is
+ * wrong. The easiest way to deal with this
+ * is to remove the newly created target, and
+ * return the original error. This must
+ * succeed; fortunately, it is very unlikely to
+ * fail, since we just created it.
+ */
+ VERIFY3U(zfs_link_destroy(tdzp, tnm, szp, tx,
+ ZRENAMING, NULL), ==, 0);
+ }
+ }
+ if (error == 0) {
+ cache_purge(*svpp);
+ if (*tvpp != NULL)
+ cache_purge(*tvpp);
+ cache_purge_negative(tdvp);
+ }
+ }
+
+ dmu_tx_commit(tx);
+
+unlockout: /* all 4 vnodes are locked, ZFS_ENTER called */
+ ZFS_EXIT(zfsvfs);
+ if (want_seqc_end) {
+ vn_seqc_write_end(*svpp);
+ vn_seqc_write_end(sdvp);
+ if (*tvpp != NULL)
+ vn_seqc_write_end(*tvpp);
+ if (tdvp != *tvpp)
+ vn_seqc_write_end(tdvp);
+ want_seqc_end = false;
+ }
+ VOP_UNLOCK(*svpp);
+ VOP_UNLOCK(sdvp);
+
+out: /* original two vnodes are locked */
+ MPASS(!want_seqc_end);
+ if (error == 0 && zfsvfs->z_os->os_sync == ZFS_SYNC_ALWAYS)
+ zil_commit(zilog, 0);
+
+ if (*tvpp != NULL)
+ VOP_UNLOCK(*tvpp);
+ if (tdvp != *tvpp)
+ VOP_UNLOCK(tdvp);
+ return (error);
+}
+
+/*
+ * Insert the indicated symbolic reference entry into the directory.
+ *
+ * IN:	dvp	- Directory to contain new symbolic link.
+ *	name	- Name of the directory entry for the new symlink.
+ *	link	- Path to which the new symlink points.
+ *	vap	- Attributes of new entry.
+ *	cr	- credentials of caller.
+ *
+ * OUT:	vpp	- vnode of the created symlink.
+ *
+ * RETURN: 0 on success, error code on failure.
+ *
+ * Timestamps:
+ * dvp - ctime|mtime updated
+ */
+/*ARGSUSED*/
+static int
+zfs_symlink(vnode_t *dvp, vnode_t **vpp, char *name, vattr_t *vap, char *link,
+ cred_t *cr, kthread_t *td)
+{
+ znode_t *zp, *dzp = VTOZ(dvp);
+ dmu_tx_t *tx;
+ zfsvfs_t *zfsvfs = dzp->z_zfsvfs;
+ zilog_t *zilog;
+ uint64_t len = strlen(link);
+ int error;
+ zfs_acl_ids_t acl_ids;
+ boolean_t fuid_dirtied;
+ uint64_t txtype = TX_SYMLINK;
+ int flags = 0;
+
+ ASSERT(vap->va_type == VLNK);
+
+ ZFS_ENTER(zfsvfs);
+ ZFS_VERIFY_ZP(dzp);
+ zilog = zfsvfs->z_log;
+
+ if (zfsvfs->z_utf8 && u8_validate(name, strlen(name),
+ NULL, U8_VALIDATE_ENTIRE, &error) < 0) {
+ ZFS_EXIT(zfsvfs);
+ return (SET_ERROR(EILSEQ));
+ }
+
+ if (len > MAXPATHLEN) {
+ ZFS_EXIT(zfsvfs);
+ return (SET_ERROR(ENAMETOOLONG));
+ }
+
+ if ((error = zfs_acl_ids_create(dzp, 0,
+ vap, cr, NULL, &acl_ids)) != 0) {
+ ZFS_EXIT(zfsvfs);
+ return (error);
+ }
+
+ /*
+ * Attempt to lock directory; fail if entry already exists.
+ */
+ error = zfs_dirent_lookup(dzp, name, &zp, ZNEW);
+ if (error) {
+ zfs_acl_ids_free(&acl_ids);
+ ZFS_EXIT(zfsvfs);
+ return (error);
+ }
+
+ if (error = zfs_zaccess(dzp, ACE_ADD_FILE, 0, B_FALSE, cr)) {
+ zfs_acl_ids_free(&acl_ids);
+ ZFS_EXIT(zfsvfs);
+ return (error);
+ }
+
+ if (zfs_acl_ids_overquota(zfsvfs, &acl_ids)) {
+ zfs_acl_ids_free(&acl_ids);
+ ZFS_EXIT(zfsvfs);
+ return (SET_ERROR(EDQUOT));
+ }
+
+ getnewvnode_reserve();
+ tx = dmu_tx_create(zfsvfs->z_os);
+ fuid_dirtied = zfsvfs->z_fuid_dirty;
+ dmu_tx_hold_write(tx, DMU_NEW_OBJECT, 0, MAX(1, len));
+ dmu_tx_hold_zap(tx, dzp->z_id, TRUE, name);
+ dmu_tx_hold_sa_create(tx, acl_ids.z_aclp->z_acl_bytes +
+ ZFS_SA_BASE_ATTR_SIZE + len);
+ dmu_tx_hold_sa(tx, dzp->z_sa_hdl, B_FALSE);
+ if (!zfsvfs->z_use_sa && acl_ids.z_aclp->z_acl_bytes > ZFS_ACE_SPACE) {
+ dmu_tx_hold_write(tx, DMU_NEW_OBJECT, 0,
+ acl_ids.z_aclp->z_acl_bytes);
+ }
+ if (fuid_dirtied)
+ zfs_fuid_txhold(zfsvfs, tx);
+ error = dmu_tx_assign(tx, TXG_WAIT);
+ if (error) {
+ zfs_acl_ids_free(&acl_ids);
+ dmu_tx_abort(tx);
+ getnewvnode_drop_reserve();
+ ZFS_EXIT(zfsvfs);
+ return (error);
+ }
+
+ /*
+ * Create a new object for the symlink.
+	 * For version 4 ZPL datasets the symlink will be an SA attribute.
+ */
+ zfs_mknode(dzp, vap, tx, cr, 0, &zp, &acl_ids);
+
+ if (fuid_dirtied)
+ zfs_fuid_sync(zfsvfs, tx);
+
+ if (zp->z_is_sa)
+ error = sa_update(zp->z_sa_hdl, SA_ZPL_SYMLINK(zfsvfs),
+ link, len, tx);
+ else
+ zfs_sa_symlink(zp, link, len, tx);
+
+ zp->z_size = len;
+ (void) sa_update(zp->z_sa_hdl, SA_ZPL_SIZE(zfsvfs),
+ &zp->z_size, sizeof (zp->z_size), tx);
+ /*
+ * Insert the new object into the directory.
+ */
+ (void) zfs_link_create(dzp, name, zp, tx, ZNEW);
+
+ zfs_log_symlink(zilog, tx, txtype, dzp, zp, name, link);
+ *vpp = ZTOV(zp);
+
+ zfs_acl_ids_free(&acl_ids);
+
+ dmu_tx_commit(tx);
+
+ getnewvnode_drop_reserve();
+
+ if (zfsvfs->z_os->os_sync == ZFS_SYNC_ALWAYS)
+ zil_commit(zilog, 0);
+
+ ZFS_EXIT(zfsvfs);
+ return (error);
+}
+
+/*
+ * Return, in the buffer contained in the provided uio structure,
+ * the symbolic path referred to by vp.
+ *
+ * IN: vp - vnode of symbolic link.
+ * uio - structure to contain the link path.
+ * cr - credentials of caller.
+ * ct - caller context
+ *
+ * OUT: uio - structure containing the link path.
+ *
+ * RETURN: 0 on success, error code on failure.
+ *
+ * Timestamps:
+ * vp - atime updated
+ */
+/* ARGSUSED */
+static int
+zfs_readlink(vnode_t *vp, uio_t *uio, cred_t *cr, caller_context_t *ct)
+{
+ znode_t *zp = VTOZ(vp);
+ zfsvfs_t *zfsvfs = zp->z_zfsvfs;
+ int error;
+
+ ZFS_ENTER(zfsvfs);
+ ZFS_VERIFY_ZP(zp);
+
+ if (zp->z_is_sa)
+ error = sa_lookup_uio(zp->z_sa_hdl,
+ SA_ZPL_SYMLINK(zfsvfs), uio);
+ else
+ error = zfs_sa_readlink(zp, uio);
+
+ ZFS_ACCESSTIME_STAMP(zfsvfs, zp);
+
+ ZFS_EXIT(zfsvfs);
+ return (error);
+}
+
+/*
+ * Insert a new entry into directory tdvp referencing svp.
+ *
+ * IN: tdvp - Directory to contain new entry.
+ * svp - vnode of new entry.
+ * name - name of new entry.
+ * cr - credentials of caller.
+ * ct - caller context
+ *
+ * RETURN: 0 on success, error code on failure.
+ *
+ * Timestamps:
+ * tdvp - ctime|mtime updated
+ * svp - ctime updated
+ */
+/* ARGSUSED */
+static int
+zfs_link(vnode_t *tdvp, vnode_t *svp, char *name, cred_t *cr,
+ caller_context_t *ct, int flags)
+{
+ znode_t *dzp = VTOZ(tdvp);
+ znode_t *tzp, *szp;
+ zfsvfs_t *zfsvfs = dzp->z_zfsvfs;
+ zilog_t *zilog;
+ dmu_tx_t *tx;
+ int error;
+ uint64_t parent;
+ uid_t owner;
+
+ ASSERT(tdvp->v_type == VDIR);
+
+ ZFS_ENTER(zfsvfs);
+ ZFS_VERIFY_ZP(dzp);
+ zilog = zfsvfs->z_log;
+
+ /*
+ * POSIX dictates that we return EPERM here.
+ * Better choices include ENOTSUP or EISDIR.
+ */
+ if (svp->v_type == VDIR) {
+ ZFS_EXIT(zfsvfs);
+ return (SET_ERROR(EPERM));
+ }
+
+ szp = VTOZ(svp);
+ ZFS_VERIFY_ZP(szp);
+
+ if (szp->z_pflags & (ZFS_APPENDONLY | ZFS_IMMUTABLE | ZFS_READONLY)) {
+ ZFS_EXIT(zfsvfs);
+ return (SET_ERROR(EPERM));
+ }
+
+ /* Prevent links to .zfs/shares files */
+
+ if ((error = sa_lookup(szp->z_sa_hdl, SA_ZPL_PARENT(zfsvfs),
+ &parent, sizeof (uint64_t))) != 0) {
+ ZFS_EXIT(zfsvfs);
+ return (error);
+ }
+ if (parent == zfsvfs->z_shares_dir) {
+ ZFS_EXIT(zfsvfs);
+ return (SET_ERROR(EPERM));
+ }
+
+ if (zfsvfs->z_utf8 && u8_validate(name,
+ strlen(name), NULL, U8_VALIDATE_ENTIRE, &error) < 0) {
+ ZFS_EXIT(zfsvfs);
+ return (SET_ERROR(EILSEQ));
+ }
+
+ /*
+ * We do not support links between attributes and non-attributes
+ * because of the potential security risk of creating links
+ * into "normal" file space in order to circumvent restrictions
+ * imposed in attribute space.
+ */
+ if ((szp->z_pflags & ZFS_XATTR) != (dzp->z_pflags & ZFS_XATTR)) {
+ ZFS_EXIT(zfsvfs);
+ return (SET_ERROR(EINVAL));
+ }
+
+ owner = zfs_fuid_map_id(zfsvfs, szp->z_uid, cr, ZFS_OWNER);
+ if (owner != crgetuid(cr) && secpolicy_basic_link(svp, cr) != 0) {
+ ZFS_EXIT(zfsvfs);
+ return (SET_ERROR(EPERM));
+ }
+
+ if (error = zfs_zaccess(dzp, ACE_ADD_FILE, 0, B_FALSE, cr)) {
+ ZFS_EXIT(zfsvfs);
+ return (error);
+ }
+
+ /*
+ * Attempt to lock directory; fail if entry already exists.
+ */
+ error = zfs_dirent_lookup(dzp, name, &tzp, ZNEW);
+ if (error) {
+ ZFS_EXIT(zfsvfs);
+ return (error);
+ }
+
+ tx = dmu_tx_create(zfsvfs->z_os);
+ dmu_tx_hold_sa(tx, szp->z_sa_hdl, B_FALSE);
+ dmu_tx_hold_zap(tx, dzp->z_id, TRUE, name);
+ zfs_sa_upgrade_txholds(tx, szp);
+ zfs_sa_upgrade_txholds(tx, dzp);
+ error = dmu_tx_assign(tx, TXG_WAIT);
+ if (error) {
+ dmu_tx_abort(tx);
+ ZFS_EXIT(zfsvfs);
+ return (error);
+ }
+
+ error = zfs_link_create(dzp, name, szp, tx, 0);
+
+ if (error == 0) {
+ uint64_t txtype = TX_LINK;
+ zfs_log_link(zilog, tx, txtype, dzp, szp, name);
+ }
+
+ dmu_tx_commit(tx);
+
+ if (error == 0) {
+ vnevent_link(svp, ct);
+ }
+
+ if (zfsvfs->z_os->os_sync == ZFS_SYNC_ALWAYS)
+ zil_commit(zilog, 0);
+
+ ZFS_EXIT(zfsvfs);
+ return (error);
+}
+
+/*ARGSUSED*/
+void
+zfs_inactive(vnode_t *vp, cred_t *cr, caller_context_t *ct)
+{
+ znode_t *zp = VTOZ(vp);
+ zfsvfs_t *zfsvfs = zp->z_zfsvfs;
+ int error;
+
+ ZFS_RLOCK_TEARDOWN_INACTIVE(zfsvfs);
+ if (zp->z_sa_hdl == NULL) {
+ /*
+ * The fs has been unmounted, or we did a
+ * suspend/resume and this file no longer exists.
+ */
+ ZFS_RUNLOCK_TEARDOWN_INACTIVE(zfsvfs);
+ vrecycle(vp);
+ return;
+ }
+
+ if (zp->z_unlinked) {
+ /*
+ * Fast path to recycle a vnode of a removed file.
+ */
+ ZFS_RUNLOCK_TEARDOWN_INACTIVE(zfsvfs);
+ vrecycle(vp);
+ return;
+ }
+
+ if (zp->z_atime_dirty && zp->z_unlinked == 0) {
+ dmu_tx_t *tx = dmu_tx_create(zfsvfs->z_os);
+
+ dmu_tx_hold_sa(tx, zp->z_sa_hdl, B_FALSE);
+ zfs_sa_upgrade_txholds(tx, zp);
+ error = dmu_tx_assign(tx, TXG_WAIT);
+ if (error) {
+ dmu_tx_abort(tx);
+ } else {
+ (void) sa_update(zp->z_sa_hdl, SA_ZPL_ATIME(zfsvfs),
+ (void *)&zp->z_atime, sizeof (zp->z_atime), tx);
+ zp->z_atime_dirty = 0;
+ dmu_tx_commit(tx);
+ }
+ }
+ ZFS_RUNLOCK_TEARDOWN_INACTIVE(zfsvfs);
+}
+
+CTASSERT(sizeof(struct zfid_short) <= sizeof(struct fid));
+CTASSERT(sizeof(struct zfid_long) <= sizeof(struct fid));
+
+/*ARGSUSED*/
+static int
+zfs_fid(vnode_t *vp, fid_t *fidp, caller_context_t *ct)
+{
+ znode_t *zp = VTOZ(vp);
+ zfsvfs_t *zfsvfs = zp->z_zfsvfs;
+ uint32_t gen;
+ uint64_t gen64;
+ uint64_t object = zp->z_id;
+ zfid_short_t *zfid;
+ int size, i, error;
+
+ ZFS_ENTER(zfsvfs);
+ ZFS_VERIFY_ZP(zp);
+
+ if ((error = sa_lookup(zp->z_sa_hdl, SA_ZPL_GEN(zfsvfs),
+ &gen64, sizeof (uint64_t))) != 0) {
+ ZFS_EXIT(zfsvfs);
+ return (error);
+ }
+
+ gen = (uint32_t)gen64;
+
+ size = (zfsvfs->z_parent != zfsvfs) ? LONG_FID_LEN : SHORT_FID_LEN;
+
+#ifdef illumos
+ if (fidp->fid_len < size) {
+ fidp->fid_len = size;
+ ZFS_EXIT(zfsvfs);
+ return (SET_ERROR(ENOSPC));
+ }
+#else
+ fidp->fid_len = size;
+#endif
+
+ zfid = (zfid_short_t *)fidp;
+
+ zfid->zf_len = size;
+
+ for (i = 0; i < sizeof (zfid->zf_object); i++)
+ zfid->zf_object[i] = (uint8_t)(object >> (8 * i));
+
+ /* Must have a non-zero generation number to distinguish from .zfs */
+ if (gen == 0)
+ gen = 1;
+ for (i = 0; i < sizeof (zfid->zf_gen); i++)
+ zfid->zf_gen[i] = (uint8_t)(gen >> (8 * i));
+
+ if (size == LONG_FID_LEN) {
+ uint64_t objsetid = dmu_objset_id(zfsvfs->z_os);
+ zfid_long_t *zlfid;
+
+ zlfid = (zfid_long_t *)fidp;
+
+ for (i = 0; i < sizeof (zlfid->zf_setid); i++)
+ zlfid->zf_setid[i] = (uint8_t)(objsetid >> (8 * i));
+
+ /* XXX - this should be the generation number for the objset */
+ for (i = 0; i < sizeof (zlfid->zf_setgen); i++)
+ zlfid->zf_setgen[i] = 0;
+ }
+
+ ZFS_EXIT(zfsvfs);
+ return (0);
+}
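+
+/*
+ * Resulting file handle layout (bytes, least significant first, as packed
+ * by the shift loops above):
+ *
+ *	zfid_short_t: zf_len | zf_object[6] | zf_gen[4]
+ *	zfid_long_t:  zfid_short_t | zf_setid[6] | zf_setgen[2] (zeroed)
+ *
+ * The long form is used only when zfsvfs->z_parent != zfsvfs, e.g. for
+ * snapshots mounted under .zfs, where zf_setid carries the objset id.
+ */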
+
+static int
+zfs_pathconf(vnode_t *vp, int cmd, ulong_t *valp, cred_t *cr,
+ caller_context_t *ct)
+{
+ znode_t *zp, *xzp;
+ zfsvfs_t *zfsvfs;
+ int error;
+
+ switch (cmd) {
+ case _PC_LINK_MAX:
+ *valp = MIN(LONG_MAX, ZFS_LINK_MAX);
+ return (0);
+
+ case _PC_FILESIZEBITS:
+ *valp = 64;
+ return (0);
+#ifdef illumos
+ case _PC_XATTR_EXISTS:
+ zp = VTOZ(vp);
+ zfsvfs = zp->z_zfsvfs;
+ ZFS_ENTER(zfsvfs);
+ ZFS_VERIFY_ZP(zp);
+ *valp = 0;
+ error = zfs_dirent_lookup(zp, "", &xzp,
+ ZXATTR | ZEXISTS | ZSHARED);
+ if (error == 0) {
+ if (!zfs_dirempty(xzp))
+ *valp = 1;
+ vrele(ZTOV(xzp));
+ } else if (error == ENOENT) {
+ /*
+ * If there aren't extended attributes, it's the
+ * same as having zero of them.
+ */
+ error = 0;
+ }
+ ZFS_EXIT(zfsvfs);
+ return (error);
+
+ case _PC_SATTR_ENABLED:
+ case _PC_SATTR_EXISTS:
+ *valp = vfs_has_feature(vp->v_vfsp, VFSFT_SYSATTR_VIEWS) &&
+ (vp->v_type == VREG || vp->v_type == VDIR);
+ return (0);
+
+ case _PC_ACCESS_FILTERING:
+ *valp = vfs_has_feature(vp->v_vfsp, VFSFT_ACCESS_FILTER) &&
+ vp->v_type == VDIR;
+ return (0);
+
+ case _PC_ACL_ENABLED:
+ *valp = _ACL_ACE_ENABLED;
+ return (0);
+#endif /* illumos */
+ case _PC_MIN_HOLE_SIZE:
+ *valp = (int)SPA_MINBLOCKSIZE;
+ return (0);
+#ifdef illumos
+ case _PC_TIMESTAMP_RESOLUTION:
+ /* nanosecond timestamp resolution */
+ *valp = 1L;
+ return (0);
+#endif
+ case _PC_ACL_EXTENDED:
+ *valp = 0;
+ return (0);
+
+ case _PC_ACL_NFS4:
+ *valp = 1;
+ return (0);
+
+ case _PC_ACL_PATH_MAX:
+ *valp = ACL_MAX_ENTRIES;
+ return (0);
+
+ default:
+ return (EOPNOTSUPP);
+ }
+}
+
+/*ARGSUSED*/
+static int
+zfs_getsecattr(vnode_t *vp, vsecattr_t *vsecp, int flag, cred_t *cr,
+ caller_context_t *ct)
+{
+ znode_t *zp = VTOZ(vp);
+ zfsvfs_t *zfsvfs = zp->z_zfsvfs;
+ int error;
+ boolean_t skipaclchk = (flag & ATTR_NOACLCHECK) ? B_TRUE : B_FALSE;
+
+ ZFS_ENTER(zfsvfs);
+ ZFS_VERIFY_ZP(zp);
+ error = zfs_getacl(zp, vsecp, skipaclchk, cr);
+ ZFS_EXIT(zfsvfs);
+
+ return (error);
+}
+
+/*ARGSUSED*/
+int
+zfs_setsecattr(vnode_t *vp, vsecattr_t *vsecp, int flag, cred_t *cr,
+ caller_context_t *ct)
+{
+ znode_t *zp = VTOZ(vp);
+ zfsvfs_t *zfsvfs = zp->z_zfsvfs;
+ int error;
+ boolean_t skipaclchk = (flag & ATTR_NOACLCHECK) ? B_TRUE : B_FALSE;
+ zilog_t *zilog = zfsvfs->z_log;
+
+ ZFS_ENTER(zfsvfs);
+ ZFS_VERIFY_ZP(zp);
+
+ error = zfs_setacl(zp, vsecp, skipaclchk, cr);
+
+ if (zfsvfs->z_os->os_sync == ZFS_SYNC_ALWAYS)
+ zil_commit(zilog, 0);
+
+ ZFS_EXIT(zfsvfs);
+ return (error);
+}
+
+static int
+zfs_getpages(struct vnode *vp, vm_page_t *ma, int count, int *rbehind,
+ int *rahead)
+{
+ znode_t *zp = VTOZ(vp);
+ zfsvfs_t *zfsvfs = zp->z_zfsvfs;
+ objset_t *os = zp->z_zfsvfs->z_os;
+ locked_range_t *lr;
+ vm_object_t object;
+ off_t start, end, obj_size;
+ uint_t blksz;
+ int pgsin_b, pgsin_a;
+ int error;
+
+ ZFS_ENTER(zfsvfs);
+ ZFS_VERIFY_ZP(zp);
+
+ start = IDX_TO_OFF(ma[0]->pindex);
+ end = IDX_TO_OFF(ma[count - 1]->pindex + 1);
+
+ /*
+ * Try to lock a range covering all required and optional pages, to
+ * handle the case of the block size growing. It is not safe to block
+ * on the range lock since the owner may be waiting for the fault page
+ * to be unbusied.
+ */
+ for (;;) {
+ blksz = zp->z_blksz;
+ lr = rangelock_tryenter(&zp->z_rangelock,
+ rounddown(start, blksz),
+ roundup(end, blksz) - rounddown(start, blksz), RL_READER);
+ if (lr == NULL) {
+ if (rahead != NULL) {
+ *rahead = 0;
+ rahead = NULL;
+ }
+ if (rbehind != NULL) {
+ *rbehind = 0;
+ rbehind = NULL;
+ }
+ break;
+ }
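+ /*
+ * If the block size changed after we sampled it, drop the range
+ * lock and retry with the new size.
+ */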
+ if (blksz == zp->z_blksz)
+ break;
+ rangelock_exit(lr);
+ }
+
+ object = ma[0]->object;
+ zfs_vmobject_wlock(object);
+ obj_size = object->un_pager.vnp.vnp_size;
+ zfs_vmobject_wunlock(object);
+ if (IDX_TO_OFF(ma[count - 1]->pindex) >= obj_size) {
+ if (lr != NULL)
+ rangelock_exit(lr);
+ ZFS_EXIT(zfsvfs);
+ return (zfs_vm_pagerret_bad);
+ }
+
+ pgsin_b = 0;
+ if (rbehind != NULL) {
+ pgsin_b = OFF_TO_IDX(start - rounddown(start, blksz));
+ pgsin_b = MIN(*rbehind, pgsin_b);
+ }
+
+ pgsin_a = 0;
+ if (rahead != NULL) {
+ pgsin_a = OFF_TO_IDX(roundup(end, blksz) - end);
+ if (end + IDX_TO_OFF(pgsin_a) >= obj_size)
+ pgsin_a = OFF_TO_IDX(round_page(obj_size) - end);
+ pgsin_a = MIN(*rahead, pgsin_a);
+ }
+
+ /*
+ * NB: we need to pass the exact byte size of the data that we expect
+ * to read after accounting for the file size. This is required because
+ * ZFS will panic if we request DMU to read beyond the end of the last
+ * allocated block.
+ */
+ error = dmu_read_pages(os, zp->z_id, ma, count, &pgsin_b, &pgsin_a,
+ MIN(end, obj_size) - (end - PAGE_SIZE));
+
+ if (lr != NULL)
+ rangelock_exit(lr);
+ ZFS_ACCESSTIME_STAMP(zfsvfs, zp);
+ ZFS_EXIT(zfsvfs);
+
+ if (error != 0)
+ return (zfs_vm_pagerret_error);
+
+ VM_CNT_INC(v_vnodein);
+ VM_CNT_ADD(v_vnodepgsin, count + pgsin_b + pgsin_a);
+ if (rbehind != NULL)
+ *rbehind = pgsin_b;
+ if (rahead != NULL)
+ *rahead = pgsin_a;
+ return (zfs_vm_pagerret_ok);
+}
+
+static int
+zfs_freebsd_getpages(ap)
+ struct vop_getpages_args /* {
+ struct vnode *a_vp;
+ vm_page_t *a_m;
+ int a_count;
+ int *a_rbehind;
+ int *a_rahead;
+ } */ *ap;
+{
+
+ return (zfs_getpages(ap->a_vp, ap->a_m, ap->a_count, ap->a_rbehind,
+ ap->a_rahead));
+}
+
+static int
+zfs_putpages(struct vnode *vp, vm_page_t *ma, size_t len, int flags,
+ int *rtvals)
+{
+ znode_t *zp = VTOZ(vp);
+ zfsvfs_t *zfsvfs = zp->z_zfsvfs;
+ locked_range_t *lr;
+ dmu_tx_t *tx;
+ struct sf_buf *sf;
+ vm_object_t object;
+ vm_page_t m;
+ caddr_t va;
+ size_t tocopy;
+ size_t lo_len;
+ vm_ooffset_t lo_off;
+ vm_ooffset_t off;
+ uint_t blksz;
+ int ncount;
+ int pcount;
+ int err;
+ int i;
+
+ ZFS_ENTER(zfsvfs);
+ ZFS_VERIFY_ZP(zp);
+
+ object = vp->v_object;
+ pcount = btoc(len);
+ ncount = pcount;
+
+ KASSERT(ma[0]->object == object, ("mismatching object"));
+ KASSERT(len > 0 && (len & PAGE_MASK) == 0, ("unexpected length"));
+
+ for (i = 0; i < pcount; i++)
+ rtvals[i] = zfs_vm_pagerret_error;
+
+ off = IDX_TO_OFF(ma[0]->pindex);
+ blksz = zp->z_blksz;
+ lo_off = rounddown(off, blksz);
+ lo_len = roundup(len + (off - lo_off), blksz);
+ lr = rangelock_enter(&zp->z_rangelock, lo_off, lo_len, RL_WRITER);
+
+ zfs_vmobject_wlock(object);
+ if (len + off > object->un_pager.vnp.vnp_size) {
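+ /*
+ * The range extends past EOF: clip the length to the file size
+ * and report the pages entirely beyond it as zfs_vm_pagerret_bad.
+ */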
+ if (object->un_pager.vnp.vnp_size > off) {
+ int pgoff;
+
+ len = object->un_pager.vnp.vnp_size - off;
+ ncount = btoc(len);
+ if ((pgoff = (int)len & PAGE_MASK) != 0) {
+ /*
+ * If the object is locked and the following
+ * conditions hold, then the page's dirty
+ * field cannot be concurrently changed by a
+ * pmap operation.
+ */
+ m = ma[ncount - 1];
+ vm_page_assert_sbusied(m);
+ KASSERT(!pmap_page_is_write_mapped(m),
+ ("zfs_putpages: page %p is not read-only", m));
+ vm_page_clear_dirty(m, pgoff, PAGE_SIZE -
+ pgoff);
+ }
+ } else {
+ len = 0;
+ ncount = 0;
+ }
+ if (ncount < pcount) {
+ for (i = ncount; i < pcount; i++) {
+ rtvals[i] = zfs_vm_pagerret_bad;
+ }
+ }
+ }
+ zfs_vmobject_wunlock(object);
+
+ if (ncount == 0)
+ goto out;
+
+ if (zfs_owner_overquota(zfsvfs, zp, B_FALSE) ||
+ zfs_owner_overquota(zfsvfs, zp, B_TRUE)) {
+ goto out;
+ }
+
+ tx = dmu_tx_create(zfsvfs->z_os);
+ dmu_tx_hold_write(tx, zp->z_id, off, len);
+
+ dmu_tx_hold_sa(tx, zp->z_sa_hdl, B_FALSE);
+ zfs_sa_upgrade_txholds(tx, zp);
+ err = dmu_tx_assign(tx, TXG_WAIT);
+ if (err != 0) {
+ dmu_tx_abort(tx);
+ goto out;
+ }
+
+ if (zp->z_blksz < PAGE_SIZE) {
+ for (i = 0; len > 0; off += tocopy, len -= tocopy, i++) {
+ tocopy = len > PAGE_SIZE ? PAGE_SIZE : len;
+ va = zfs_map_page(ma[i], &sf);
+ dmu_write(zfsvfs->z_os, zp->z_id, off, tocopy, va, tx);
+ zfs_unmap_page(sf);
+ }
+ } else {
+ err = dmu_write_pages(zfsvfs->z_os, zp->z_id, off, len, ma, tx);
+ }
+
+ if (err == 0) {
+ uint64_t mtime[2], ctime[2];
+ sa_bulk_attr_t bulk[3];
+ int count = 0;
+
+ SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_MTIME(zfsvfs), NULL,
+ &mtime, 16);
+ SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_CTIME(zfsvfs), NULL,
+ &ctime, 16);
+ SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_FLAGS(zfsvfs), NULL,
+ &zp->z_pflags, 8);
+ zfs_tstamp_update_setup(zp, CONTENT_MODIFIED, mtime, ctime,
+ B_TRUE);
+ err = sa_bulk_update(zp->z_sa_hdl, bulk, count, tx);
+ ASSERT0(err);
+ zfs_log_write(zfsvfs->z_log, tx, TX_WRITE, zp, off, len, 0);
+
+ zfs_vmobject_wlock(object);
+ for (i = 0; i < ncount; i++) {
+ rtvals[i] = zfs_vm_pagerret_ok;
+ vm_page_undirty(ma[i]);
+ }
+ zfs_vmobject_wunlock(object);
+ VM_CNT_INC(v_vnodeout);
+ VM_CNT_ADD(v_vnodepgsout, ncount);
+ }
+ dmu_tx_commit(tx);
+
+out:
+ rangelock_exit(lr);
+ if ((flags & (zfs_vm_pagerput_sync | zfs_vm_pagerput_inval)) != 0 ||
+ zfsvfs->z_os->os_sync == ZFS_SYNC_ALWAYS)
+ zil_commit(zfsvfs->z_log, zp->z_id);
+ ZFS_EXIT(zfsvfs);
+ return (rtvals[0]);
+}
+
+int
+zfs_freebsd_putpages(ap)
+ struct vop_putpages_args /* {
+ struct vnode *a_vp;
+ vm_page_t *a_m;
+ int a_count;
+ int a_sync;
+ int *a_rtvals;
+ } */ *ap;
+{
+
+ return (zfs_putpages(ap->a_vp, ap->a_m, ap->a_count, ap->a_sync,
+ ap->a_rtvals));
+}
+
+static int
+zfs_freebsd_bmap(ap)
+ struct vop_bmap_args /* {
+ struct vnode *a_vp;
+ daddr_t a_bn;
+ struct bufobj **a_bop;
+ daddr_t *a_bnp;
+ int *a_runp;
+ int *a_runb;
+ } */ *ap;
+{
+
+ if (ap->a_bop != NULL)
+ *ap->a_bop = &ap->a_vp->v_bufobj;
+ if (ap->a_bnp != NULL)
+ *ap->a_bnp = ap->a_bn;
+ if (ap->a_runp != NULL)
+ *ap->a_runp = 0;
+ if (ap->a_runb != NULL)
+ *ap->a_runb = 0;
+
+ return (0);
+}
+
+static int
+zfs_freebsd_open(ap)
+ struct vop_open_args /* {
+ struct vnode *a_vp;
+ int a_mode;
+ struct ucred *a_cred;
+ struct thread *a_td;
+ } */ *ap;
+{
+ vnode_t *vp = ap->a_vp;
+ znode_t *zp = VTOZ(vp);
+ int error;
+
+ error = zfs_open(&vp, ap->a_mode, ap->a_cred, NULL);
+ if (error == 0)
+ vnode_create_vobject(vp, zp->z_size, ap->a_td);
+ return (error);
+}
+
+static int
+zfs_freebsd_close(ap)
+ struct vop_close_args /* {
+ struct vnode *a_vp;
+ int a_fflag;
+ struct ucred *a_cred;
+ struct thread *a_td;
+ } */ *ap;
+{
+
+ return (zfs_close(ap->a_vp, ap->a_fflag, 1, 0, ap->a_cred, NULL));
+}
+
+static int
+zfs_freebsd_ioctl(ap)
+ struct vop_ioctl_args /* {
+ struct vnode *a_vp;
+ u_long a_command;
+ caddr_t a_data;
+ int a_fflag;
+ struct ucred *cred;
+ struct thread *td;
+ } */ *ap;
+{
+
+ return (zfs_ioctl(ap->a_vp, ap->a_command, (intptr_t)ap->a_data,
+ ap->a_fflag, ap->a_cred, NULL, NULL));
+}
+
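+/*
+ * Translate FreeBSD ioflag bits (IO_*) into the file flag bits (F*)
+ * expected by the common zfs_read()/zfs_write() code.
+ */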
+static int
+ioflags(int ioflags)
+{
+ int flags = 0;
+
+ if (ioflags & IO_APPEND)
+ flags |= FAPPEND;
+ if (ioflags & IO_NDELAY)
+ flags |= FNONBLOCK;
+ if (ioflags & IO_SYNC)
+ flags |= (FSYNC | FDSYNC | FRSYNC);
+
+ return (flags);
+}
+
+static int
+zfs_freebsd_read(ap)
+ struct vop_read_args /* {
+ struct vnode *a_vp;
+ struct uio *a_uio;
+ int a_ioflag;
+ struct ucred *a_cred;
+ } */ *ap;
+{
+
+ return (zfs_read(ap->a_vp, ap->a_uio, ioflags(ap->a_ioflag),
+ ap->a_cred, NULL));
+}
+
+static int
+zfs_freebsd_write(ap)
+ struct vop_write_args /* {
+ struct vnode *a_vp;
+ struct uio *a_uio;
+ int a_ioflag;
+ struct ucred *a_cred;
+ } */ *ap;
+{
+
+ return (zfs_write(ap->a_vp, ap->a_uio, ioflags(ap->a_ioflag),
+ ap->a_cred, NULL));
+}
+
+/*
+ * VOP_FPLOOKUP_VEXEC routines are subject to special circumstances, see
+ * the comment above cache_fplookup for details.
+ */
+static int
+zfs_freebsd_fplookup_vexec(struct vop_fplookup_vexec_args *v)
+{
+ vnode_t *vp;
+ znode_t *zp;
+ uint64_t pflags;
+
+ vp = v->a_vp;
+ zp = VTOZ_SMR(vp);
+ if (__predict_false(zp == NULL))
+ return (EAGAIN);
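+ /*
+ * Returning EAGAIN falls back to the locked lookup path whenever the
+ * znode needs handling we cannot do lockless: anti-virus quarantine,
+ * extended attribute directories, or ACLs that may deny execute.
+ */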
+ pflags = atomic_load_64(&zp->z_pflags);
+ if (pflags & ZFS_AV_QUARANTINED)
+ return (EAGAIN);
+ if (pflags & ZFS_XATTR)
+ return (EAGAIN);
+ if ((pflags & ZFS_NO_EXECS_DENIED) == 0)
+ return (EAGAIN);
+ return (0);
+}
+
+static int
+zfs_freebsd_access(ap)
+ struct vop_access_args /* {
+ struct vnode *a_vp;
+ accmode_t a_accmode;
+ struct ucred *a_cred;
+ struct thread *a_td;
+ } */ *ap;
+{
+ vnode_t *vp = ap->a_vp;
+ znode_t *zp = VTOZ(vp);
+ accmode_t accmode;
+ int error = 0;
+
+ if (ap->a_accmode == VEXEC) {
+ if (zfs_freebsd_fastaccesschk_execute(ap->a_vp, ap->a_cred) == 0)
+ return (0);
+ }
+
+ /*
+ * ZFS itself only knows about VREAD, VWRITE, VEXEC and VAPPEND.
+ */
+ accmode = ap->a_accmode & (VREAD|VWRITE|VEXEC|VAPPEND);
+ if (accmode != 0)
+ error = zfs_access(ap->a_vp, accmode, 0, ap->a_cred, NULL);
+
+ /*
+ * VADMIN has to be handled by vaccess().
+ */
+ if (error == 0) {
+ accmode = ap->a_accmode & ~(VREAD|VWRITE|VEXEC|VAPPEND);
+ if (accmode != 0) {
+ error = vaccess(vp->v_type, zp->z_mode, zp->z_uid,
+ zp->z_gid, accmode, ap->a_cred);
+ }
+ }
+
+ /*
+ * For VEXEC, ensure that at least one execute bit is set for
+ * non-directories.
+ */
+ if (error == 0 && (ap->a_accmode & VEXEC) != 0 && vp->v_type != VDIR &&
+ (zp->z_mode & (S_IXUSR | S_IXGRP | S_IXOTH)) == 0) {
+ error = EACCES;
+ }
+
+ return (error);
+}
+
+static int
+zfs_freebsd_lookup(struct vop_lookup_args *ap, boolean_t cached)
+{
+ struct componentname *cnp = ap->a_cnp;
+ char nm[NAME_MAX + 1];
+
+ ASSERT(cnp->cn_namelen < sizeof(nm));
+ strlcpy(nm, cnp->cn_nameptr, MIN(cnp->cn_namelen + 1, sizeof(nm)));
+
+ return (zfs_lookup(ap->a_dvp, nm, ap->a_vpp, cnp, cnp->cn_nameiop,
+ cnp->cn_cred, cnp->cn_thread, 0, cached));
+}
+
+static int
+zfs_freebsd_cachedlookup(struct vop_cachedlookup_args *ap)
+{
+
+ return (zfs_freebsd_lookup((struct vop_lookup_args *)ap, B_TRUE));
+}
+
+static int
+zfs_cache_lookup(ap)
+ struct vop_lookup_args /* {
+ struct vnode *a_dvp;
+ struct vnode **a_vpp;
+ struct componentname *a_cnp;
+ } */ *ap;
+{
+ zfsvfs_t *zfsvfs;
+
+ zfsvfs = ap->a_dvp->v_mount->mnt_data;
+ if (zfsvfs->z_use_namecache)
+ return (vfs_cache_lookup(ap));
+ else
+ return (zfs_freebsd_lookup(ap, B_FALSE));
+}
+
+static int
+zfs_freebsd_create(ap)
+ struct vop_create_args /* {
+ struct vnode *a_dvp;
+ struct vnode **a_vpp;
+ struct componentname *a_cnp;
+ struct vattr *a_vap;
+ } */ *ap;
+{
+ zfsvfs_t *zfsvfs;
+ struct componentname *cnp = ap->a_cnp;
+ vattr_t *vap = ap->a_vap;
+ int error, mode;
+
+ ASSERT(cnp->cn_flags & SAVENAME);
+
+ vattr_init_mask(vap);
+ mode = vap->va_mode & ALLPERMS;
+ zfsvfs = ap->a_dvp->v_mount->mnt_data;
+
+ error = zfs_create(ap->a_dvp, cnp->cn_nameptr, vap, !EXCL, mode,
+ ap->a_vpp, cnp->cn_cred, cnp->cn_thread);
+ if (zfsvfs->z_use_namecache &&
+ error == 0 && (cnp->cn_flags & MAKEENTRY) != 0)
+ cache_enter(ap->a_dvp, *ap->a_vpp, cnp);
+ return (error);
+}
+
+static int
+zfs_freebsd_remove(ap)
+ struct vop_remove_args /* {
+ struct vnode *a_dvp;
+ struct vnode *a_vp;
+ struct componentname *a_cnp;
+ } */ *ap;
+{
+
+ ASSERT(ap->a_cnp->cn_flags & SAVENAME);
+
+ return (zfs_remove(ap->a_dvp, ap->a_vp, ap->a_cnp->cn_nameptr,
+ ap->a_cnp->cn_cred));
+}
+
+static int
+zfs_freebsd_mkdir(ap)
+ struct vop_mkdir_args /* {
+ struct vnode *a_dvp;
+ struct vnode **a_vpp;
+ struct componentname *a_cnp;
+ struct vattr *a_vap;
+ } */ *ap;
+{
+ vattr_t *vap = ap->a_vap;
+
+ ASSERT(ap->a_cnp->cn_flags & SAVENAME);
+
+ vattr_init_mask(vap);
+
+ return (zfs_mkdir(ap->a_dvp, ap->a_cnp->cn_nameptr, vap, ap->a_vpp,
+ ap->a_cnp->cn_cred));
+}
+
+static int
+zfs_freebsd_rmdir(ap)
+ struct vop_rmdir_args /* {
+ struct vnode *a_dvp;
+ struct vnode *a_vp;
+ struct componentname *a_cnp;
+ } */ *ap;
+{
+ struct componentname *cnp = ap->a_cnp;
+
+ ASSERT(cnp->cn_flags & SAVENAME);
+
+ return (zfs_rmdir(ap->a_dvp, ap->a_vp, cnp->cn_nameptr, cnp->cn_cred));
+}
+
+static int
+zfs_freebsd_readdir(ap)
+ struct vop_readdir_args /* {
+ struct vnode *a_vp;
+ struct uio *a_uio;
+ struct ucred *a_cred;
+ int *a_eofflag;
+ int *a_ncookies;
+ u_long **a_cookies;
+ } */ *ap;
+{
+
+ return (zfs_readdir(ap->a_vp, ap->a_uio, ap->a_cred, ap->a_eofflag,
+ ap->a_ncookies, ap->a_cookies));
+}
+
+static int
+zfs_freebsd_fsync(ap)
+ struct vop_fsync_args /* {
+ struct vnode *a_vp;
+ int a_waitfor;
+ struct thread *a_td;
+ } */ *ap;
+{
+
+ vop_stdfsync(ap);
+ return (zfs_fsync(ap->a_vp, 0, ap->a_td->td_ucred, NULL));
+}
+
+static int
+zfs_freebsd_getattr(ap)
+ struct vop_getattr_args /* {
+ struct vnode *a_vp;
+ struct vattr *a_vap;
+ struct ucred *a_cred;
+ } */ *ap;
+{
+ vattr_t *vap = ap->a_vap;
+ xvattr_t xvap;
+ u_long fflags = 0;
+ int error;
+
+ xva_init(&xvap);
+ xvap.xva_vattr = *vap;
+ xvap.xva_vattr.va_mask |= AT_XVATTR;
+
+ /* Convert chflags into ZFS-type flags. */
+ /* XXX: what about SF_SETTABLE? */
+ XVA_SET_REQ(&xvap, XAT_IMMUTABLE);
+ XVA_SET_REQ(&xvap, XAT_APPENDONLY);
+ XVA_SET_REQ(&xvap, XAT_NOUNLINK);
+ XVA_SET_REQ(&xvap, XAT_NODUMP);
+ XVA_SET_REQ(&xvap, XAT_READONLY);
+ XVA_SET_REQ(&xvap, XAT_ARCHIVE);
+ XVA_SET_REQ(&xvap, XAT_SYSTEM);
+ XVA_SET_REQ(&xvap, XAT_HIDDEN);
+ XVA_SET_REQ(&xvap, XAT_REPARSE);
+ XVA_SET_REQ(&xvap, XAT_OFFLINE);
+ XVA_SET_REQ(&xvap, XAT_SPARSE);
+
+ error = zfs_getattr(ap->a_vp, (vattr_t *)&xvap, 0, ap->a_cred, NULL);
+ if (error != 0)
+ return (error);
+
+ /* Convert ZFS xattr into chflags. */
+#define FLAG_CHECK(fflag, xflag, xfield) do { \
+ if (XVA_ISSET_RTN(&xvap, (xflag)) && (xfield) != 0) \
+ fflags |= (fflag); \
+} while (0)
+ FLAG_CHECK(SF_IMMUTABLE, XAT_IMMUTABLE,
+ xvap.xva_xoptattrs.xoa_immutable);
+ FLAG_CHECK(SF_APPEND, XAT_APPENDONLY,
+ xvap.xva_xoptattrs.xoa_appendonly);
+ FLAG_CHECK(SF_NOUNLINK, XAT_NOUNLINK,
+ xvap.xva_xoptattrs.xoa_nounlink);
+ FLAG_CHECK(UF_ARCHIVE, XAT_ARCHIVE,
+ xvap.xva_xoptattrs.xoa_archive);
+ FLAG_CHECK(UF_NODUMP, XAT_NODUMP,
+ xvap.xva_xoptattrs.xoa_nodump);
+ FLAG_CHECK(UF_READONLY, XAT_READONLY,
+ xvap.xva_xoptattrs.xoa_readonly);
+ FLAG_CHECK(UF_SYSTEM, XAT_SYSTEM,
+ xvap.xva_xoptattrs.xoa_system);
+ FLAG_CHECK(UF_HIDDEN, XAT_HIDDEN,
+ xvap.xva_xoptattrs.xoa_hidden);
+ FLAG_CHECK(UF_REPARSE, XAT_REPARSE,
+ xvap.xva_xoptattrs.xoa_reparse);
+ FLAG_CHECK(UF_OFFLINE, XAT_OFFLINE,
+ xvap.xva_xoptattrs.xoa_offline);
+ FLAG_CHECK(UF_SPARSE, XAT_SPARSE,
+ xvap.xva_xoptattrs.xoa_sparse);
+
+#undef FLAG_CHECK
+ *vap = xvap.xva_vattr;
+ vap->va_flags = fflags;
+ return (0);
+}
+
+static int
+zfs_freebsd_setattr(ap)
+ struct vop_setattr_args /* {
+ struct vnode *a_vp;
+ struct vattr *a_vap;
+ struct ucred *a_cred;
+ } */ *ap;
+{
+ vnode_t *vp = ap->a_vp;
+ vattr_t *vap = ap->a_vap;
+ cred_t *cred = ap->a_cred;
+ xvattr_t xvap;
+ u_long fflags;
+ uint64_t zflags;
+
+ vattr_init_mask(vap);
+ vap->va_mask &= ~AT_NOSET;
+
+ xva_init(&xvap);
+ xvap.xva_vattr = *vap;
+
+ zflags = VTOZ(vp)->z_pflags;
+
+ if (vap->va_flags != VNOVAL) {
+ zfsvfs_t *zfsvfs = VTOZ(vp)->z_zfsvfs;
+ int error;
+
+ if (zfsvfs->z_use_fuids == B_FALSE)
+ return (EOPNOTSUPP);
+
+ fflags = vap->va_flags;
+ /*
+ * XXX KDM
+ * We need to figure out whether it makes sense to allow
+ * UF_REPARSE through, since we don't really have other
+ * facilities to handle reparse points and zfs_setattr()
+ * doesn't currently allow setting that attribute anyway.
+ */
+ if ((fflags & ~(SF_IMMUTABLE|SF_APPEND|SF_NOUNLINK|UF_ARCHIVE|
+ UF_NODUMP|UF_SYSTEM|UF_HIDDEN|UF_READONLY|UF_REPARSE|
+ UF_OFFLINE|UF_SPARSE)) != 0)
+ return (EOPNOTSUPP);
+ /*
+ * Unprivileged processes are not permitted to unset system
+ * flags, or modify flags if any system flags are set.
+ * Privileged non-jail processes may not modify system flags
+ * if securelevel > 0 and any existing system flags are set.
+ * Privileged jail processes behave like privileged non-jail
+ * processes if the PR_ALLOW_CHFLAGS permission bit is set;
+ * otherwise, they behave like unprivileged processes.
+ */
+ if (secpolicy_fs_owner(vp->v_mount, cred) == 0 ||
+ priv_check_cred(cred, PRIV_VFS_SYSFLAGS) == 0) {
+ if (zflags &
+ (ZFS_IMMUTABLE | ZFS_APPENDONLY | ZFS_NOUNLINK)) {
+ error = securelevel_gt(cred, 0);
+ if (error != 0)
+ return (error);
+ }
+ } else {
+ /*
+ * Callers may only modify the file flags on objects they
+ * have VADMIN rights for.
+ */
+ if ((error = VOP_ACCESS(vp, VADMIN, cred, curthread)) != 0)
+ return (error);
+ if (zflags &
+ (ZFS_IMMUTABLE | ZFS_APPENDONLY | ZFS_NOUNLINK)) {
+ return (EPERM);
+ }
+ if (fflags &
+ (SF_IMMUTABLE | SF_APPEND | SF_NOUNLINK)) {
+ return (EPERM);
+ }
+ }
+
+#define FLAG_CHANGE(fflag, zflag, xflag, xfield) do { \
+ if (((fflags & (fflag)) && !(zflags & (zflag))) || \
+ ((zflags & (zflag)) && !(fflags & (fflag)))) { \
+ XVA_SET_REQ(&xvap, (xflag)); \
+ (xfield) = ((fflags & (fflag)) != 0); \
+ } \
+} while (0)
+ /* Convert chflags into ZFS-type flags. */
+ /* XXX: what about SF_SETTABLE? */
+ FLAG_CHANGE(SF_IMMUTABLE, ZFS_IMMUTABLE, XAT_IMMUTABLE,
+ xvap.xva_xoptattrs.xoa_immutable);
+ FLAG_CHANGE(SF_APPEND, ZFS_APPENDONLY, XAT_APPENDONLY,
+ xvap.xva_xoptattrs.xoa_appendonly);
+ FLAG_CHANGE(SF_NOUNLINK, ZFS_NOUNLINK, XAT_NOUNLINK,
+ xvap.xva_xoptattrs.xoa_nounlink);
+ FLAG_CHANGE(UF_ARCHIVE, ZFS_ARCHIVE, XAT_ARCHIVE,
+ xvap.xva_xoptattrs.xoa_archive);
+ FLAG_CHANGE(UF_NODUMP, ZFS_NODUMP, XAT_NODUMP,
+ xvap.xva_xoptattrs.xoa_nodump);
+ FLAG_CHANGE(UF_READONLY, ZFS_READONLY, XAT_READONLY,
+ xvap.xva_xoptattrs.xoa_readonly);
+ FLAG_CHANGE(UF_SYSTEM, ZFS_SYSTEM, XAT_SYSTEM,
+ xvap.xva_xoptattrs.xoa_system);
+ FLAG_CHANGE(UF_HIDDEN, ZFS_HIDDEN, XAT_HIDDEN,
+ xvap.xva_xoptattrs.xoa_hidden);
+ FLAG_CHANGE(UF_REPARSE, ZFS_REPARSE, XAT_REPARSE,
+ xvap.xva_xoptattrs.xoa_reparse);
+ FLAG_CHANGE(UF_OFFLINE, ZFS_OFFLINE, XAT_OFFLINE,
+ xvap.xva_xoptattrs.xoa_offline);
+ FLAG_CHANGE(UF_SPARSE, ZFS_SPARSE, XAT_SPARSE,
+ xvap.xva_xoptattrs.xoa_sparse);
+#undef FLAG_CHANGE
+ }
+ if (vap->va_birthtime.tv_sec != VNOVAL) {
+ xvap.xva_vattr.va_mask |= AT_XVATTR;
+ XVA_SET_REQ(&xvap, XAT_CREATETIME);
+ }
+ return (zfs_setattr(vp, (vattr_t *)&xvap, 0, cred, NULL));
+}
+
+static int
+zfs_freebsd_rename(ap)
+ struct vop_rename_args /* {
+ struct vnode *a_fdvp;
+ struct vnode *a_fvp;
+ struct componentname *a_fcnp;
+ struct vnode *a_tdvp;
+ struct vnode *a_tvp;
+ struct componentname *a_tcnp;
+ } */ *ap;
+{
+ vnode_t *fdvp = ap->a_fdvp;
+ vnode_t *fvp = ap->a_fvp;
+ vnode_t *tdvp = ap->a_tdvp;
+ vnode_t *tvp = ap->a_tvp;
+ int error;
+
+ ASSERT(ap->a_fcnp->cn_flags & (SAVENAME|SAVESTART));
+ ASSERT(ap->a_tcnp->cn_flags & (SAVENAME|SAVESTART));
+
+ error = zfs_rename(fdvp, &fvp, ap->a_fcnp, tdvp, &tvp,
+ ap->a_tcnp, ap->a_fcnp->cn_cred);
+
+ vrele(fdvp);
+ vrele(fvp);
+ vrele(tdvp);
+ if (tvp != NULL)
+ vrele(tvp);
+
+ return (error);
+}
+
+static int
+zfs_freebsd_symlink(ap)
+ struct vop_symlink_args /* {
+ struct vnode *a_dvp;
+ struct vnode **a_vpp;
+ struct componentname *a_cnp;
+ struct vattr *a_vap;
+ char *a_target;
+ } */ *ap;
+{
+ struct componentname *cnp = ap->a_cnp;
+ vattr_t *vap = ap->a_vap;
+
+ ASSERT(cnp->cn_flags & SAVENAME);
+
+ vap->va_type = VLNK; /* FreeBSD: Syscall only sets va_mode. */
+ vattr_init_mask(vap);
+
+ return (zfs_symlink(ap->a_dvp, ap->a_vpp, cnp->cn_nameptr, vap,
+ __DECONST(char *, ap->a_target), cnp->cn_cred, cnp->cn_thread));
+}
+
+static int
+zfs_freebsd_readlink(ap)
+ struct vop_readlink_args /* {
+ struct vnode *a_vp;
+ struct uio *a_uio;
+ struct ucred *a_cred;
+ } */ *ap;
+{
+
+ return (zfs_readlink(ap->a_vp, ap->a_uio, ap->a_cred, NULL));
+}
+
+static int
+zfs_freebsd_link(ap)
+ struct vop_link_args /* {
+ struct vnode *a_tdvp;
+ struct vnode *a_vp;
+ struct componentname *a_cnp;
+ } */ *ap;
+{
+ struct componentname *cnp = ap->a_cnp;
+ vnode_t *vp = ap->a_vp;
+ vnode_t *tdvp = ap->a_tdvp;
+
+ if (tdvp->v_mount != vp->v_mount)
+ return (EXDEV);
+
+ ASSERT(cnp->cn_flags & SAVENAME);
+
+ return (zfs_link(tdvp, vp, cnp->cn_nameptr, cnp->cn_cred, NULL, 0));
+}
+
+static int
+zfs_freebsd_inactive(ap)
+ struct vop_inactive_args /* {
+ struct vnode *a_vp;
+ struct thread *a_td;
+ } */ *ap;
+{
+ vnode_t *vp = ap->a_vp;
+
+ zfs_inactive(vp, ap->a_td->td_ucred, NULL);
+ return (0);
+}
+
+static int
+zfs_freebsd_need_inactive(ap)
+ struct vop_need_inactive_args /* {
+ struct vnode *a_vp;
+ struct thread *a_td;
+ } */ *ap;
+{
+ vnode_t *vp = ap->a_vp;
+ znode_t *zp = VTOZ(vp);
+ zfsvfs_t *zfsvfs = zp->z_zfsvfs;
+ int need;
+
+ if (vn_need_pageq_flush(vp))
+ return (1);
+
+ if (!ZFS_TRYRLOCK_TEARDOWN_INACTIVE(zfsvfs))
+ return (1);
+ need = (zp->z_sa_hdl == NULL || zp->z_unlinked || zp->z_atime_dirty);
+ ZFS_RUNLOCK_TEARDOWN_INACTIVE(zfsvfs);
+
+ return (need);
+}
+
+static int
+zfs_freebsd_reclaim(ap)
+ struct vop_reclaim_args /* {
+ struct vnode *a_vp;
+ struct thread *a_td;
+ } */ *ap;
+{
+ vnode_t *vp = ap->a_vp;
+ znode_t *zp = VTOZ(vp);
+ zfsvfs_t *zfsvfs = zp->z_zfsvfs;
+
+ ASSERT(zp != NULL);
+
+ /*
+ * z_teardown_inactive_lock protects from a race with
+ * zfs_znode_dmu_fini in zfsvfs_teardown during
+ * force unmount.
+ */
+ ZFS_RLOCK_TEARDOWN_INACTIVE(zfsvfs);
+ if (zp->z_sa_hdl == NULL)
+ zfs_znode_free(zp);
+ else
+ zfs_zinactive(zp);
+ ZFS_RUNLOCK_TEARDOWN_INACTIVE(zfsvfs);
+
+ vp->v_data = NULL;
+ return (0);
+}
+
+static int
+zfs_freebsd_fid(ap)
+ struct vop_fid_args /* {
+ struct vnode *a_vp;
+ struct fid *a_fid;
+ } */ *ap;
+{
+
+ return (zfs_fid(ap->a_vp, (void *)ap->a_fid, NULL));
+}
+
+static int
+zfs_freebsd_pathconf(ap)
+ struct vop_pathconf_args /* {
+ struct vnode *a_vp;
+ int a_name;
+ register_t *a_retval;
+ } */ *ap;
+{
+ ulong_t val;
+ int error;
+
+ error = zfs_pathconf(ap->a_vp, ap->a_name, &val, curthread->td_ucred, NULL);
+ if (error == 0) {
+ *ap->a_retval = val;
+ return (error);
+ }
+ if (error != EOPNOTSUPP)
+ return (error);
+
+ switch (ap->a_name) {
+ case _PC_NAME_MAX:
+ *ap->a_retval = NAME_MAX;
+ return (0);
+ case _PC_PIPE_BUF:
+ if (ap->a_vp->v_type == VDIR || ap->a_vp->v_type == VFIFO) {
+ *ap->a_retval = PIPE_BUF;
+ return (0);
+ }
+ return (EINVAL);
+ default:
+ return (vop_stdpathconf(ap));
+ }
+}
+
+/*
+ * FreeBSD's extended attributes namespace defines file name prefix for ZFS'
+ * extended attribute name:
+ *
+ * NAMESPACE PREFIX
+ * system freebsd:system:
+ * user (none, can be used to access ZFS fsattr(5) attributes
+ * created on Solaris)
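+ *
+ * For example, a user-namespace attribute "foo" maps to the ZFS name
+ * "foo", while a system-namespace attribute "foo" maps to
+ * "freebsd:system:foo".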
+ */
+static int
+zfs_create_attrname(int attrnamespace, const char *name, char *attrname,
+ size_t size)
+{
+ const char *namespace, *prefix, *suffix;
+
+ /* We don't allow '/' character in attribute name. */
+ if (strchr(name, '/') != NULL)
+ return (EINVAL);
+ /* We don't allow attribute names that start with "freebsd:" string. */
+ if (strncmp(name, "freebsd:", 8) == 0)
+ return (EINVAL);
+
+ bzero(attrname, size);
+
+ switch (attrnamespace) {
+ case EXTATTR_NAMESPACE_USER:
+#if 0
+ prefix = "freebsd:";
+ namespace = EXTATTR_NAMESPACE_USER_STRING;
+ suffix = ":";
+#else
+ /*
+ * This is the default namespace by which we can access all
+ * attributes created on Solaris.
+ */
+ prefix = namespace = suffix = "";
+#endif
+ break;
+ case EXTATTR_NAMESPACE_SYSTEM:
+ prefix = "freebsd:";
+ namespace = EXTATTR_NAMESPACE_SYSTEM_STRING;
+ suffix = ":";
+ break;
+ case EXTATTR_NAMESPACE_EMPTY:
+ default:
+ return (EINVAL);
+ }
+ if (snprintf(attrname, size, "%s%s%s%s", prefix, namespace, suffix,
+ name) >= size) {
+ return (ENAMETOOLONG);
+ }
+ return (0);
+}
+
+/*
+ * Vnode operation to retrieve a named extended attribute.
+ */
+static int
+zfs_getextattr(struct vop_getextattr_args *ap)
+/*
+vop_getextattr {
+ IN struct vnode *a_vp;
+ IN int a_attrnamespace;
+ IN const char *a_name;
+ INOUT struct uio *a_uio;
+ OUT size_t *a_size;
+ IN struct ucred *a_cred;
+ IN struct thread *a_td;
+};
+*/
+{
+ zfsvfs_t *zfsvfs = VTOZ(ap->a_vp)->z_zfsvfs;
+ struct thread *td = ap->a_td;
+ struct nameidata nd;
+ char attrname[255];
+ struct vattr va;
+ vnode_t *xvp = NULL, *vp;
+ int error, flags;
+
+ error = extattr_check_cred(ap->a_vp, ap->a_attrnamespace,
+ ap->a_cred, ap->a_td, VREAD);
+ if (error != 0)
+ return (error);
+
+ error = zfs_create_attrname(ap->a_attrnamespace, ap->a_name, attrname,
+ sizeof(attrname));
+ if (error != 0)
+ return (error);
+
+ ZFS_ENTER(zfsvfs);
+
+ error = zfs_lookup(ap->a_vp, NULL, &xvp, NULL, 0, ap->a_cred, td,
+ LOOKUP_XATTR, B_FALSE);
+ if (error != 0) {
+ ZFS_EXIT(zfsvfs);
+ return (error);
+ }
+
+ flags = FREAD;
+ NDINIT_ATVP(&nd, LOOKUP, NOFOLLOW, UIO_SYSSPACE, attrname,
+ xvp, td);
+ error = vn_open_cred(&nd, &flags, 0, VN_OPEN_INVFS, ap->a_cred, NULL);
+ vp = nd.ni_vp;
+ NDFREE(&nd, NDF_ONLY_PNBUF);
+ if (error != 0) {
+ ZFS_EXIT(zfsvfs);
+ if (error == ENOENT)
+ error = ENOATTR;
+ return (error);
+ }
+
+ if (ap->a_size != NULL) {
+ error = VOP_GETATTR(vp, &va, ap->a_cred);
+ if (error == 0)
+ *ap->a_size = (size_t)va.va_size;
+ } else if (ap->a_uio != NULL)
+ error = VOP_READ(vp, ap->a_uio, IO_UNIT, ap->a_cred);
+
+ VOP_UNLOCK(vp);
+ vn_close(vp, flags, ap->a_cred, td);
+ ZFS_EXIT(zfsvfs);
+
+ return (error);
+}
+
+/*
+ * Vnode operation to remove a named attribute.
+ */
+int
+zfs_deleteextattr(struct vop_deleteextattr_args *ap)
+/*
+vop_deleteextattr {
+ IN struct vnode *a_vp;
+ IN int a_attrnamespace;
+ IN const char *a_name;
+ IN struct ucred *a_cred;
+ IN struct thread *a_td;
+};
+*/
+{
+ zfsvfs_t *zfsvfs = VTOZ(ap->a_vp)->z_zfsvfs;
+ struct thread *td = ap->a_td;
+ struct nameidata nd;
+ char attrname[255];
+ vnode_t *xvp = NULL, *vp;
+ int error;
+
+ error = extattr_check_cred(ap->a_vp, ap->a_attrnamespace,
+ ap->a_cred, ap->a_td, VWRITE);
+ if (error != 0)
+ return (error);
+
+ error = zfs_create_attrname(ap->a_attrnamespace, ap->a_name, attrname,
+ sizeof(attrname));
+ if (error != 0)
+ return (error);
+
+ ZFS_ENTER(zfsvfs);
+
+ error = zfs_lookup(ap->a_vp, NULL, &xvp, NULL, 0, ap->a_cred, td,
+ LOOKUP_XATTR, B_FALSE);
+ if (error != 0) {
+ ZFS_EXIT(zfsvfs);
+ return (error);
+ }
+
+ NDINIT_ATVP(&nd, DELETE, NOFOLLOW | LOCKPARENT | LOCKLEAF,
+ UIO_SYSSPACE, attrname, xvp, td);
+ error = namei(&nd);
+ vp = nd.ni_vp;
+ if (error != 0) {
+ ZFS_EXIT(zfsvfs);
+ NDFREE(&nd, NDF_ONLY_PNBUF);
+ if (error == ENOENT)
+ error = ENOATTR;
+ return (error);
+ }
+
+ error = VOP_REMOVE(nd.ni_dvp, vp, &nd.ni_cnd);
+ NDFREE(&nd, NDF_ONLY_PNBUF);
+
+ vput(nd.ni_dvp);
+ if (vp == nd.ni_dvp)
+ vrele(vp);
+ else
+ vput(vp);
+ ZFS_EXIT(zfsvfs);
+
+ return (error);
+}
+
+/*
+ * Vnode operation to set a named attribute.
+ */
+static int
+zfs_setextattr(struct vop_setextattr_args *ap)
+/*
+vop_setextattr {
+ IN struct vnode *a_vp;
+ IN int a_attrnamespace;
+ IN const char *a_name;
+ INOUT struct uio *a_uio;
+ IN struct ucred *a_cred;
+ IN struct thread *a_td;
+};
+*/
+{
+ zfsvfs_t *zfsvfs = VTOZ(ap->a_vp)->z_zfsvfs;
+ struct thread *td = ap->a_td;
+ struct nameidata nd;
+ char attrname[255];
+ struct vattr va;
+ vnode_t *xvp = NULL, *vp;
+ int error, flags;
+
+ error = extattr_check_cred(ap->a_vp, ap->a_attrnamespace,
+ ap->a_cred, ap->a_td, VWRITE);
+ if (error != 0)
+ return (error);
+
+ error = zfs_create_attrname(ap->a_attrnamespace, ap->a_name, attrname,
+ sizeof(attrname));
+ if (error != 0)
+ return (error);
+
+ ZFS_ENTER(zfsvfs);
+
+ error = zfs_lookup(ap->a_vp, NULL, &xvp, NULL, 0, ap->a_cred, td,
+ LOOKUP_XATTR | CREATE_XATTR_DIR, B_FALSE);
+ if (error != 0) {
+ ZFS_EXIT(zfsvfs);
+ return (error);
+ }
+
+ flags = FFLAGS(O_WRONLY | O_CREAT);
+ NDINIT_ATVP(&nd, LOOKUP, NOFOLLOW, UIO_SYSSPACE, attrname,
+ xvp, td);
+ error = vn_open_cred(&nd, &flags, 0600, VN_OPEN_INVFS, ap->a_cred,
+ NULL);
+ vp = nd.ni_vp;
+ NDFREE(&nd, NDF_ONLY_PNBUF);
+ if (error != 0) {
+ ZFS_EXIT(zfsvfs);
+ return (error);
+ }
+
+ VATTR_NULL(&va);
+ va.va_size = 0;
+ error = VOP_SETATTR(vp, &va, ap->a_cred);
+ if (error == 0)
+ error = VOP_WRITE(vp, ap->a_uio, IO_UNIT, ap->a_cred);
+
+ VOP_UNLOCK(vp);
+ vn_close(vp, flags, ap->a_cred, td);
+ ZFS_EXIT(zfsvfs);
+
+ return (error);
+}
+
+/*
+ * Vnode operation to list extended attribute names on a vnode.
+ */
+static int
+zfs_listextattr(struct vop_listextattr_args *ap)
+/*
+vop_listextattr {
+ IN struct vnode *a_vp;
+ IN int a_attrnamespace;
+ INOUT struct uio *a_uio;
+ OUT size_t *a_size;
+ IN struct ucred *a_cred;
+ IN struct thread *a_td;
+};
+*/
+{
+ zfsvfs_t *zfsvfs = VTOZ(ap->a_vp)->z_zfsvfs;
+ struct thread *td = ap->a_td;
+ struct nameidata nd;
+ char attrprefix[16];
+ u_char dirbuf[sizeof(struct dirent)];
+ struct dirent *dp;
+ struct iovec aiov;
+ struct uio auio, *uio = ap->a_uio;
+ size_t *sizep = ap->a_size;
+ size_t plen;
+ vnode_t *xvp = NULL, *vp;
+ int done, error, eof, pos;
+
+ error = extattr_check_cred(ap->a_vp, ap->a_attrnamespace,
+ ap->a_cred, ap->a_td, VREAD);
+ if (error != 0)
+ return (error);
+
+ error = zfs_create_attrname(ap->a_attrnamespace, "", attrprefix,
+ sizeof(attrprefix));
+ if (error != 0)
+ return (error);
+ plen = strlen(attrprefix);
+
+ ZFS_ENTER(zfsvfs);
+
+ if (sizep != NULL)
+ *sizep = 0;
+
+ error = zfs_lookup(ap->a_vp, NULL, &xvp, NULL, 0, ap->a_cred, td,
+ LOOKUP_XATTR, B_FALSE);
+ if (error != 0) {
+ ZFS_EXIT(zfsvfs);
+ /*
+ * ENOATTR means that the EA directory does not yet exist,
+ * i.e. there are no extended attributes there.
+ */
+ if (error == ENOATTR)
+ error = 0;
+ return (error);
+ }
+
+ NDINIT_ATVP(&nd, LOOKUP, NOFOLLOW | LOCKLEAF | LOCKSHARED,
+ UIO_SYSSPACE, ".", xvp, td);
+ error = namei(&nd);
+ vp = nd.ni_vp;
+ NDFREE(&nd, NDF_ONLY_PNBUF);
+ if (error != 0) {
+ ZFS_EXIT(zfsvfs);
+ return (error);
+ }
+
+ auio.uio_iov = &aiov;
+ auio.uio_iovcnt = 1;
+ auio.uio_segflg = UIO_SYSSPACE;
+ auio.uio_td = td;
+ auio.uio_rw = UIO_READ;
+ auio.uio_offset = 0;
+
+ do {
+ u_char nlen;
+
+ aiov.iov_base = (void *)dirbuf;
+ aiov.iov_len = sizeof(dirbuf);
+ auio.uio_resid = sizeof(dirbuf);
+ error = VOP_READDIR(vp, &auio, ap->a_cred, &eof, NULL, NULL);
+ done = sizeof(dirbuf) - auio.uio_resid;
+ if (error != 0)
+ break;
+ for (pos = 0; pos < done;) {
+ dp = (struct dirent *)(dirbuf + pos);
+ pos += dp->d_reclen;
+ /*
+ * XXX: Temporarily we also accept DT_UNKNOWN, as this
+ * is what we get when an attribute was created on Solaris.
+ */
+ if (dp->d_type != DT_REG && dp->d_type != DT_UNKNOWN)
+ continue;
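+ /*
+ * In the user namespace (empty prefix), hide names that belong
+ * to FreeBSD-specific namespaces.
+ */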
+ if (plen == 0 && strncmp(dp->d_name, "freebsd:", 8) == 0)
+ continue;
+ else if (strncmp(dp->d_name, attrprefix, plen) != 0)
+ continue;
+ nlen = dp->d_namlen - plen;
+ if (sizep != NULL)
+ *sizep += 1 + nlen;
+ else if (uio != NULL) {
+ /*
+ * Format of extattr name entry is one byte for
+ * length and the rest for name.
+ */
+ error = uiomove(&nlen, 1, uio->uio_rw, uio);
+ if (error == 0) {
+ error = uiomove(dp->d_name + plen, nlen,
+ uio->uio_rw, uio);
+ }
+ if (error != 0)
+ break;
+ }
+ }
+ } while (!eof && error == 0);
+
+ vput(vp);
+ ZFS_EXIT(zfsvfs);
+
+ return (error);
+}
+
+int
+zfs_freebsd_getacl(ap)
+ struct vop_getacl_args /* {
+ struct vnode *vp;
+ acl_type_t type;
+ struct acl *aclp;
+ struct ucred *cred;
+ struct thread *td;
+ } */ *ap;
+{
+ int error;
+ vsecattr_t vsecattr;
+
+ if (ap->a_type != ACL_TYPE_NFS4)
+ return (EINVAL);
+
+ vsecattr.vsa_mask = VSA_ACE | VSA_ACECNT;
+ error = zfs_getsecattr(ap->a_vp, &vsecattr, 0, ap->a_cred, NULL);
+ if (error != 0)
+ return (error);
+
+ error = acl_from_aces(ap->a_aclp, vsecattr.vsa_aclentp, vsecattr.vsa_aclcnt);
+ if (vsecattr.vsa_aclentp != NULL)
+ kmem_free(vsecattr.vsa_aclentp, vsecattr.vsa_aclentsz);
+
+ return (error);
+}
+
+int
+zfs_freebsd_setacl(ap)
+ struct vop_setacl_args /* {
+ struct vnode *vp;
+ acl_type_t type;
+ struct acl *aclp;
+ struct ucred *cred;
+ struct thread *td;
+ } */ *ap;
+{
+ int error;
+ vsecattr_t vsecattr;
+ int aclbsize; /* size of acl list in bytes */
+ aclent_t *aaclp;
+
+ if (ap->a_type != ACL_TYPE_NFS4)
+ return (EINVAL);
+
+ if (ap->a_aclp == NULL)
+ return (EINVAL);
+
+ if (ap->a_aclp->acl_cnt < 1 || ap->a_aclp->acl_cnt > MAX_ACL_ENTRIES)
+ return (EINVAL);
+
+ /*
+ * With NFSv4 ACLs, chmod(2) may need to add additional entries,
+ * splitting every entry into two and appending "canonical six"
+ * entries at the end. Don't allow for setting an ACL that would
+ * cause chmod(2) to run out of ACL entries.
+ */
+ if (ap->a_aclp->acl_cnt * 2 + 6 > ACL_MAX_ENTRIES)
+ return (ENOSPC);
+
+ error = acl_nfs4_check(ap->a_aclp, ap->a_vp->v_type == VDIR);
+ if (error != 0)
+ return (error);
+
+ vsecattr.vsa_mask = VSA_ACE;
+ aclbsize = ap->a_aclp->acl_cnt * sizeof(ace_t);
+ vsecattr.vsa_aclentp = kmem_alloc(aclbsize, KM_SLEEP);
+ aaclp = vsecattr.vsa_aclentp;
+ vsecattr.vsa_aclentsz = aclbsize;
+
+ aces_from_acl(vsecattr.vsa_aclentp, &vsecattr.vsa_aclcnt, ap->a_aclp);
+ error = zfs_setsecattr(ap->a_vp, &vsecattr, 0, ap->a_cred, NULL);
+ kmem_free(aaclp, aclbsize);
+
+ return (error);
+}
+
+int
+zfs_freebsd_aclcheck(ap)
+ struct vop_aclcheck_args /* {
+ struct vnode *vp;
+ acl_type_t type;
+ struct acl *aclp;
+ struct ucred *cred;
+ struct thread *td;
+ } */ *ap;
+{
+
+ return (EOPNOTSUPP);
+}
+
+static int
+zfs_vptocnp(struct vop_vptocnp_args *ap)
+{
+ vnode_t *covered_vp;
+ vnode_t *vp = ap->a_vp;
+ zfsvfs_t *zfsvfs = vp->v_vfsp->vfs_data;
+ znode_t *zp = VTOZ(vp);
+ enum vgetstate vs;
+ int ltype;
+ int error;
+
+ ZFS_ENTER(zfsvfs);
+ ZFS_VERIFY_ZP(zp);
+
+ /*
+ * If we are a snapshot mounted under .zfs, run the operation
+ * on the covered vnode.
+ */
+ if (zp->z_id != zfsvfs->z_root || zfsvfs->z_parent == zfsvfs) {
+ char name[MAXNAMLEN + 1];
+ znode_t *dzp;
+ size_t len;
+
+ error = zfs_znode_parent_and_name(zp, &dzp, name);
+ if (error == 0) {
+ len = strlen(name);
+ if (*ap->a_buflen < len)
+ error = SET_ERROR(ENOMEM);
+ }
+ if (error == 0) {
+ *ap->a_buflen -= len;
+ bcopy(name, ap->a_buf + *ap->a_buflen, len);
+ *ap->a_vpp = ZTOV(dzp);
+ }
+ ZFS_EXIT(zfsvfs);
+ return (error);
+ }
+ ZFS_EXIT(zfsvfs);
+
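+ /*
+ * We are the root of a snapshot mounted under .zfs: hand the request
+ * off to the covered vnode. Take a reference and drop our own lock
+ * first to preserve the lock order, then restore the lock and fail
+ * with ENOENT if the vnode was doomed in the meantime.
+ */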
+ covered_vp = vp->v_mount->mnt_vnodecovered;
+ vs = vget_prep(covered_vp);
+ ltype = VOP_ISLOCKED(vp);
+ VOP_UNLOCK(vp);
+ error = vget_finish(covered_vp, LK_SHARED, vs);
+ if (error == 0) {
+ error = VOP_VPTOCNP(covered_vp, ap->a_vpp, ap->a_cred,
+ ap->a_buf, ap->a_buflen);
+ vput(covered_vp);
+ }
+ vn_lock(vp, ltype | LK_RETRY);
+ if (VN_IS_DOOMED(vp))
+ error = SET_ERROR(ENOENT);
+ return (error);
+}
+
+#ifdef DIAGNOSTIC
+static int
+zfs_lock(ap)
+ struct vop_lock1_args /* {
+ struct vnode *a_vp;
+ int a_flags;
+ char *file;
+ int line;
+ } */ *ap;
+{
+ vnode_t *vp;
+ znode_t *zp;
+ int err;
+
+ err = vop_lock(ap);
+ if (err == 0 && (ap->a_flags & LK_NOWAIT) == 0) {
+ vp = ap->a_vp;
+ zp = vp->v_data;
+ if (vp->v_mount != NULL && !VN_IS_DOOMED(vp) &&
+ zp != NULL && (zp->z_pflags & ZFS_XATTR) == 0)
+ VERIFY(!RRM_LOCK_HELD(&zp->z_zfsvfs->z_teardown_lock));
+ }
+ return (err);
+}
+#endif
+
+struct vop_vector zfs_vnodeops;
+struct vop_vector zfs_fifoops;
+struct vop_vector zfs_shareops;
+
+struct vop_vector zfs_vnodeops = {
+ .vop_default = &default_vnodeops,
+ .vop_inactive = zfs_freebsd_inactive,
+ .vop_need_inactive = zfs_freebsd_need_inactive,
+ .vop_reclaim = zfs_freebsd_reclaim,
+ .vop_fplookup_vexec = zfs_freebsd_fplookup_vexec,
+ .vop_access = zfs_freebsd_access,
+ .vop_allocate = VOP_EINVAL,
+ .vop_lookup = zfs_cache_lookup,
+ .vop_cachedlookup = zfs_freebsd_cachedlookup,
+ .vop_getattr = zfs_freebsd_getattr,
+ .vop_setattr = zfs_freebsd_setattr,
+ .vop_create = zfs_freebsd_create,
+ .vop_mknod = zfs_freebsd_create,
+ .vop_mkdir = zfs_freebsd_mkdir,
+ .vop_readdir = zfs_freebsd_readdir,
+ .vop_fsync = zfs_freebsd_fsync,
+ .vop_open = zfs_freebsd_open,
+ .vop_close = zfs_freebsd_close,
+ .vop_rmdir = zfs_freebsd_rmdir,
+ .vop_ioctl = zfs_freebsd_ioctl,
+ .vop_link = zfs_freebsd_link,
+ .vop_symlink = zfs_freebsd_symlink,
+ .vop_readlink = zfs_freebsd_readlink,
+ .vop_read = zfs_freebsd_read,
+ .vop_write = zfs_freebsd_write,
+ .vop_remove = zfs_freebsd_remove,
+ .vop_rename = zfs_freebsd_rename,
+ .vop_pathconf = zfs_freebsd_pathconf,
+ .vop_bmap = zfs_freebsd_bmap,
+ .vop_fid = zfs_freebsd_fid,
+ .vop_getextattr = zfs_getextattr,
+ .vop_deleteextattr = zfs_deleteextattr,
+ .vop_setextattr = zfs_setextattr,
+ .vop_listextattr = zfs_listextattr,
+ .vop_getacl = zfs_freebsd_getacl,
+ .vop_setacl = zfs_freebsd_setacl,
+ .vop_aclcheck = zfs_freebsd_aclcheck,
+ .vop_getpages = zfs_freebsd_getpages,
+ .vop_putpages = zfs_freebsd_putpages,
+ .vop_vptocnp = zfs_vptocnp,
+#ifdef DIAGNOSTIC
+ .vop_lock1 = zfs_lock,
+#else
+ .vop_lock1 = vop_lock,
+#endif
+ .vop_unlock = vop_unlock,
+ .vop_islocked = vop_islocked,
+};
+VFS_VOP_VECTOR_REGISTER(zfs_vnodeops);
+
+struct vop_vector zfs_fifoops = {
+ .vop_default = &fifo_specops,
+ .vop_fsync = zfs_freebsd_fsync,
+ .vop_access = zfs_freebsd_access,
+ .vop_getattr = zfs_freebsd_getattr,
+ .vop_inactive = zfs_freebsd_inactive,
+ .vop_read = VOP_PANIC,
+ .vop_reclaim = zfs_freebsd_reclaim,
+ .vop_setattr = zfs_freebsd_setattr,
+ .vop_write = VOP_PANIC,
+ .vop_pathconf = zfs_freebsd_pathconf,
+ .vop_fid = zfs_freebsd_fid,
+ .vop_getacl = zfs_freebsd_getacl,
+ .vop_setacl = zfs_freebsd_setacl,
+ .vop_aclcheck = zfs_freebsd_aclcheck,
+};
+VFS_VOP_VECTOR_REGISTER(zfs_fifoops);
+
+/*
+ * Vnode operations template for the special hidden share files.
+ */
+struct vop_vector zfs_shareops = {
+ .vop_default = &default_vnodeops,
+ .vop_fplookup_vexec = zfs_freebsd_fplookup_vexec,
+ .vop_access = zfs_freebsd_access,
+ .vop_inactive = zfs_freebsd_inactive,
+ .vop_reclaim = zfs_freebsd_reclaim,
+ .vop_fid = zfs_freebsd_fid,
+ .vop_pathconf = zfs_freebsd_pathconf,
+};
+VFS_VOP_VECTOR_REGISTER(zfs_shareops);
diff --git a/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/zfs_znode.c b/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/zfs_znode.c
new file mode 100644
index 000000000000..ecc11d16f42a
--- /dev/null
+++ b/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/zfs_znode.c
@@ -0,0 +1,2388 @@
+/*
+ * CDDL HEADER START
+ *
+ * The contents of this file are subject to the terms of the
+ * Common Development and Distribution License (the "License").
+ * You may not use this file except in compliance with the License.
+ *
+ * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
+ * or http://www.opensolaris.org/os/licensing.
+ * See the License for the specific language governing permissions
+ * and limitations under the License.
+ *
+ * When distributing Covered Code, include this CDDL HEADER in each
+ * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
+ * If applicable, add the following below this CDDL HEADER, with the
+ * fields enclosed by brackets "[]" replaced with your own identifying
+ * information: Portions Copyright [yyyy] [name of copyright owner]
+ *
+ * CDDL HEADER END
+ */
+/*
+ * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved.
+ * Copyright (c) 2012, 2018 by Delphix. All rights reserved.
+ * Copyright (c) 2014 Integros [integros.com]
+ */
+
+/* Portions Copyright 2007 Jeremy Teo */
+/* Portions Copyright 2011 Martin Matuska <mm@FreeBSD.org> */
+
+#ifdef _KERNEL
+#include <sys/types.h>
+#include <sys/param.h>
+#include <sys/time.h>
+#include <sys/systm.h>
+#include <sys/sysmacros.h>
+#include <sys/resource.h>
+#include <sys/mntent.h>
+#include <sys/u8_textprep.h>
+#include <sys/dsl_dataset.h>
+#include <sys/vfs.h>
+#include <sys/vnode.h>
+#include <sys/file.h>
+#include <sys/kmem.h>
+#include <sys/errno.h>
+#include <sys/unistd.h>
+#include <sys/atomic.h>
+#include <sys/zfs_dir.h>
+#include <sys/zfs_acl.h>
+#include <sys/zfs_ioctl.h>
+#include <sys/zfs_rlock.h>
+#include <sys/zfs_fuid.h>
+#include <sys/dnode.h>
+#include <sys/fs/zfs.h>
+#include <sys/kidmap.h>
+#endif /* _KERNEL */
+
+#include <sys/dmu.h>
+#include <sys/dmu_objset.h>
+#include <sys/dmu_tx.h>
+#include <sys/refcount.h>
+#include <sys/stat.h>
+#include <sys/zap.h>
+#include <sys/zfs_znode.h>
+#include <sys/sa.h>
+#include <sys/zfs_sa.h>
+#include <sys/zfs_stat.h>
+#include <sys/refcount.h>
+
+#include "zfs_prop.h"
+#include "zfs_comutil.h"
+
+/* Used by fstat(1). */
+SYSCTL_INT(_debug_sizeof, OID_AUTO, znode, CTLFLAG_RD,
+ SYSCTL_NULL_INT_PTR, sizeof(znode_t), "sizeof(znode_t)");
+
+/*
+ * Define ZNODE_STATS to turn on statistic gathering. By default, it is only
+ * turned on when DEBUG is also defined.
+ */
+#ifdef DEBUG
+#define ZNODE_STATS
+#endif /* DEBUG */
+
+#ifdef ZNODE_STATS
+#define ZNODE_STAT_ADD(stat) ((stat)++)
+#else
+#define ZNODE_STAT_ADD(stat) /* nothing */
+#endif /* ZNODE_STATS */
+
+/*
+ * Functions needed for userland (i.e., libzpool) are not put under
+ * #ifdef _KERNEL; the rest of the functions have dependencies
+ * (such as VFS logic) that will not compile easily in userland.
+ */
+#ifdef _KERNEL
+/*
+ * Needed to close a small window in zfs_znode_move() that allows the zfsvfs to
+ * be freed before it can be safely accessed.
+ */
+krwlock_t zfsvfs_lock;
+
+#if defined(_KERNEL) && !defined(KMEM_DEBUG)
+#define _ZFS_USE_SMR
+static uma_zone_t znode_uma_zone;
+#else
+static kmem_cache_t *znode_cache = NULL;
+#endif
+
+/*ARGSUSED*/
+static void
+znode_evict_error(dmu_buf_t *dbuf, void *user_ptr)
+{
+ /*
+ * We should never drop all dbuf refs without first clearing
+ * the eviction callback.
+ */
+ panic("evicting znode %p\n", user_ptr);
+}
+
+extern struct vop_vector zfs_vnodeops;
+extern struct vop_vector zfs_fifoops;
+extern struct vop_vector zfs_shareops;
+
+/*
+ * This callback is invoked when acquiring a RL_WRITER or RL_APPEND lock on
+ * z_rangelock. It will modify the offset and length of the lock to reflect
+ * znode-specific information, and convert RL_APPEND to RL_WRITER. This is
+ * called with the rangelock_t's rl_lock held, which avoids races.
+ */
+static void
+zfs_rangelock_cb(locked_range_t *new, void *arg)
+{
+ znode_t *zp = arg;
+
+ /*
+ * If in append mode, convert to writer and lock starting at the
+ * current end of file.
+ */
+ if (new->lr_type == RL_APPEND) {
+ new->lr_offset = zp->z_size;
+ new->lr_type = RL_WRITER;
+ }
+
+ /*
+ * If we need to grow the block size then lock the whole file range.
+ */
+ uint64_t end_size = MAX(zp->z_size, new->lr_offset + new->lr_length);
+ if (end_size > zp->z_blksz && (!ISP2(zp->z_blksz) ||
+ zp->z_blksz < zp->z_zfsvfs->z_max_blksz)) {
+ new->lr_offset = 0;
+ new->lr_length = UINT64_MAX;
+ }
+}
+
+/*ARGSUSED*/
+static int
+zfs_znode_cache_constructor(void *buf, void *arg, int kmflags)
+{
+ znode_t *zp = buf;
+
+ POINTER_INVALIDATE(&zp->z_zfsvfs);
+
+ list_link_init(&zp->z_link_node);
+
+ mutex_init(&zp->z_acl_lock, NULL, MUTEX_DEFAULT, NULL);
+
+ rangelock_init(&zp->z_rangelock, zfs_rangelock_cb, zp);
+
+ zp->z_acl_cached = NULL;
+ zp->z_vnode = NULL;
+ zp->z_moved = 0;
+ return (0);
+}
+
+/*ARGSUSED*/
+static void
+zfs_znode_cache_destructor(void *buf, void *arg)
+{
+ znode_t *zp = buf;
+
+ ASSERT(!POINTER_IS_VALID(zp->z_zfsvfs));
+ ASSERT3P(zp->z_vnode, ==, NULL);
+ ASSERT(!list_link_active(&zp->z_link_node));
+ mutex_destroy(&zp->z_acl_lock);
+ rangelock_fini(&zp->z_rangelock);
+
+ ASSERT(zp->z_acl_cached == NULL);
+}
+
+#ifdef ZNODE_STATS
+static struct {
+ uint64_t zms_zfsvfs_invalid;
+ uint64_t zms_zfsvfs_recheck1;
+ uint64_t zms_zfsvfs_unmounted;
+ uint64_t zms_zfsvfs_recheck2;
+ uint64_t zms_obj_held;
+ uint64_t zms_vnode_locked;
+ uint64_t zms_not_only_dnlc;
+} znode_move_stats;
+#endif /* ZNODE_STATS */
+
+#ifdef illumos
+static void
+zfs_znode_move_impl(znode_t *ozp, znode_t *nzp)
+{
+ vnode_t *vp;
+
+ /* Copy fields. */
+ nzp->z_zfsvfs = ozp->z_zfsvfs;
+
+ /* Swap vnodes. */
+ vp = nzp->z_vnode;
+ nzp->z_vnode = ozp->z_vnode;
+ ozp->z_vnode = vp; /* let destructor free the overwritten vnode */
+ ZTOV(ozp)->v_data = ozp;
+ ZTOV(nzp)->v_data = nzp;
+
+ nzp->z_id = ozp->z_id;
+ ASSERT(ozp->z_dirlocks == NULL); /* znode not in use */
+ nzp->z_unlinked = ozp->z_unlinked;
+ nzp->z_atime_dirty = ozp->z_atime_dirty;
+ nzp->z_zn_prefetch = ozp->z_zn_prefetch;
+ nzp->z_blksz = ozp->z_blksz;
+ nzp->z_seq = ozp->z_seq;
+ nzp->z_mapcnt = ozp->z_mapcnt;
+ nzp->z_gen = ozp->z_gen;
+ nzp->z_sync_cnt = ozp->z_sync_cnt;
+ nzp->z_is_sa = ozp->z_is_sa;
+ nzp->z_sa_hdl = ozp->z_sa_hdl;
+ bcopy(ozp->z_atime, nzp->z_atime, sizeof (uint64_t) * 2);
+ nzp->z_links = ozp->z_links;
+ nzp->z_size = ozp->z_size;
+ nzp->z_pflags = ozp->z_pflags;
+ nzp->z_uid = ozp->z_uid;
+ nzp->z_gid = ozp->z_gid;
+ nzp->z_mode = ozp->z_mode;
+
+ /*
+ * Since this is just an idle znode and kmem is already dealing with
+ * memory pressure, release any cached ACL.
+ */
+ if (ozp->z_acl_cached) {
+ zfs_acl_free(ozp->z_acl_cached);
+ ozp->z_acl_cached = NULL;
+ }
+
+ sa_set_userp(nzp->z_sa_hdl, nzp);
+
+ /*
+ * Invalidate the original znode by clearing fields that provide a
+ * pointer back to the znode. Set the low bit of the vfs pointer to
+ * ensure that zfs_znode_move() recognizes the znode as invalid in any
+ * subsequent callback.
+ */
+ ozp->z_sa_hdl = NULL;
+ POINTER_INVALIDATE(&ozp->z_zfsvfs);
+
+ /*
+ * Mark the znode.
+ */
+ nzp->z_moved = 1;
+ ozp->z_moved = (uint8_t)-1;
+}
+
+/*ARGSUSED*/
+static kmem_cbrc_t
+zfs_znode_move(void *buf, void *newbuf, size_t size, void *arg)
+{
+ znode_t *ozp = buf, *nzp = newbuf;
+ zfsvfs_t *zfsvfs;
+ vnode_t *vp;
+
+ /*
+ * The znode is on the file system's list of known znodes if the vfs
+ * pointer is valid. We set the low bit of the vfs pointer when freeing
+ * the znode to invalidate it, and the memory patterns written by kmem
+ * (baddcafe and deadbeef) set at least one of the two low bits. A newly
+ * created znode sets the vfs pointer last of all to indicate that the
+ * znode is known and in a valid state to be moved by this function.
+ */
+ zfsvfs = ozp->z_zfsvfs;
+ if (!POINTER_IS_VALID(zfsvfs)) {
+ ZNODE_STAT_ADD(znode_move_stats.zms_zfsvfs_invalid);
+ return (KMEM_CBRC_DONT_KNOW);
+ }
+
+ /*
+ * Close a small window in which it's possible that the filesystem could
+ * be unmounted and freed, and zfsvfs, though valid in the previous
+ * statement, could point to unrelated memory by the time we try to
+ * prevent the filesystem from being unmounted.
+ */
+ rw_enter(&zfsvfs_lock, RW_WRITER);
+ if (zfsvfs != ozp->z_zfsvfs) {
+ rw_exit(&zfsvfs_lock);
+ ZNODE_STAT_ADD(znode_move_stats.zms_zfsvfs_recheck1);
+ return (KMEM_CBRC_DONT_KNOW);
+ }
+
+ /*
+ * If the znode is still valid, then so is the file system. We know that
+ * no valid file system can be freed while we hold zfsvfs_lock, so we
+ * can safely ensure that the filesystem is not and will not be
+ * unmounted. The next statement is equivalent to ZFS_ENTER().
+ */
+ rrm_enter(&zfsvfs->z_teardown_lock, RW_READER, FTAG);
+ if (zfsvfs->z_unmounted) {
+ ZFS_EXIT(zfsvfs);
+ rw_exit(&zfsvfs_lock);
+ ZNODE_STAT_ADD(znode_move_stats.zms_zfsvfs_unmounted);
+ return (KMEM_CBRC_DONT_KNOW);
+ }
+ rw_exit(&zfsvfs_lock);
+
+ mutex_enter(&zfsvfs->z_znodes_lock);
+ /*
+ * Recheck the vfs pointer in case the znode was removed just before
+ * acquiring the lock.
+ */
+ if (zfsvfs != ozp->z_zfsvfs) {
+ mutex_exit(&zfsvfs->z_znodes_lock);
+ ZFS_EXIT(zfsvfs);
+ ZNODE_STAT_ADD(znode_move_stats.zms_zfsvfs_recheck2);
+ return (KMEM_CBRC_DONT_KNOW);
+ }
+
+ /*
+ * At this point we know that as long as we hold z_znodes_lock, the
+ * znode cannot be freed and fields within the znode can be safely
+ * accessed. Now, prevent a race with zfs_zget().
+ */
+ if (ZFS_OBJ_HOLD_TRYENTER(zfsvfs, ozp->z_id) == 0) {
+ mutex_exit(&zfsvfs->z_znodes_lock);
+ ZFS_EXIT(zfsvfs);
+ ZNODE_STAT_ADD(znode_move_stats.zms_obj_held);
+ return (KMEM_CBRC_LATER);
+ }
+
+ vp = ZTOV(ozp);
+ if (mutex_tryenter(&vp->v_lock) == 0) {
+ ZFS_OBJ_HOLD_EXIT(zfsvfs, ozp->z_id);
+ mutex_exit(&zfsvfs->z_znodes_lock);
+ ZFS_EXIT(zfsvfs);
+ ZNODE_STAT_ADD(znode_move_stats.zms_vnode_locked);
+ return (KMEM_CBRC_LATER);
+ }
+
+ /* Only move znodes that are referenced _only_ by the DNLC. */
+ if (vp->v_count != 1 || !vn_in_dnlc(vp)) {
+ mutex_exit(&vp->v_lock);
+ ZFS_OBJ_HOLD_EXIT(zfsvfs, ozp->z_id);
+ mutex_exit(&zfsvfs->z_znodes_lock);
+ ZFS_EXIT(zfsvfs);
+ ZNODE_STAT_ADD(znode_move_stats.zms_not_only_dnlc);
+ return (KMEM_CBRC_LATER);
+ }
+
+ /*
+ * The znode is known and in a valid state to move. We're holding the
+ * locks needed to execute the critical section.
+ */
+ zfs_znode_move_impl(ozp, nzp);
+ mutex_exit(&vp->v_lock);
+ ZFS_OBJ_HOLD_EXIT(zfsvfs, ozp->z_id);
+
+ list_link_replace(&ozp->z_link_node, &nzp->z_link_node);
+ mutex_exit(&zfsvfs->z_znodes_lock);
+ ZFS_EXIT(zfsvfs);
+
+ return (KMEM_CBRC_YES);
+}
+#endif /* illumos */
+
+#ifdef _ZFS_USE_SMR
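+/*
+ * znodes are allocated from an SMR-enabled UMA zone so that lockless
+ * lookup (see zfs_freebsd_fplookup_vexec() and VTOZ_SMR()) can safely
+ * dereference a znode that may be freed concurrently.
+ */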
+VFS_SMR_DECLARE;
+
+static int
+zfs_znode_cache_constructor_smr(void *mem, int size __unused, void *private, int flags)
+{
+
+ return (zfs_znode_cache_constructor(mem, private, flags));
+}
+
+static void
+zfs_znode_cache_destructor_smr(void *mem, int size __unused, void *private)
+{
+
+ zfs_znode_cache_destructor(mem, private);
+}
+
+void
+zfs_znode_init(void)
+{
+ /*
+ * Initialize zcache
+ */
+ rw_init(&zfsvfs_lock, NULL, RW_DEFAULT, NULL);
+ ASSERT(znode_uma_zone == NULL);
+ znode_uma_zone = uma_zcreate("zfs_znode_cache",
+ sizeof (znode_t), zfs_znode_cache_constructor_smr,
+ zfs_znode_cache_destructor_smr, NULL, NULL, 0, 0);
+ VFS_SMR_ZONE_SET(znode_uma_zone);
+}
+
+static znode_t *
+zfs_znode_alloc_kmem(int flags)
+{
+
+ return (uma_zalloc_smr(znode_uma_zone, flags));
+}
+
+static void
+zfs_znode_free_kmem(znode_t *zp)
+{
+
+ uma_zfree_smr(znode_uma_zone, zp);
+}
+#else
+void
+zfs_znode_init(void)
+{
+ /*
+ * Initialize zcache
+ */
+ rw_init(&zfsvfs_lock, NULL, RW_DEFAULT, NULL);
+ ASSERT(znode_cache == NULL);
+ znode_cache = kmem_cache_create("zfs_znode_cache",
+ sizeof (znode_t), 0, zfs_znode_cache_constructor,
+ zfs_znode_cache_destructor, NULL, NULL, NULL, 0);
+ kmem_cache_set_move(znode_cache, zfs_znode_move);
+}
+
+static znode_t *
+zfs_znode_alloc_kmem(int flags)
+{
+
+ return (kmem_cache_alloc(znode_cache, flags));
+}
+
+static void
+zfs_znode_free_kmem(znode_t *zp)
+{
+
+ kmem_cache_free(znode_cache, zp);
+}
+#endif
+
+void
+zfs_znode_fini(void)
+{
+#ifdef illumos
+ /*
+ * Cleanup vfs & vnode ops
+ */
+ zfs_remove_op_tables();
+#endif
+
+ /*
+ * Cleanup zcache
+ */
+#ifdef _ZFS_USE_SMR
+ if (znode_uma_zone) {
+ uma_zdestroy(znode_uma_zone);
+ znode_uma_zone = NULL;
+ }
+#else
+ if (znode_cache) {
+ kmem_cache_destroy(znode_cache);
+ znode_cache = NULL;
+ }
+#endif
+ rw_destroy(&zfsvfs_lock);
+}
+
+#ifdef illumos
+struct vnodeops *zfs_dvnodeops;
+struct vnodeops *zfs_fvnodeops;
+struct vnodeops *zfs_symvnodeops;
+struct vnodeops *zfs_xdvnodeops;
+struct vnodeops *zfs_evnodeops;
+struct vnodeops *zfs_sharevnodeops;
+
+void
+zfs_remove_op_tables()
+{
+ /*
+ * Remove vfs ops
+ */
+ ASSERT(zfsfstype);
+ (void) vfs_freevfsops_by_type(zfsfstype);
+ zfsfstype = 0;
+
+ /*
+ * Remove vnode ops
+ */
+ if (zfs_dvnodeops)
+ vn_freevnodeops(zfs_dvnodeops);
+ if (zfs_fvnodeops)
+ vn_freevnodeops(zfs_fvnodeops);
+ if (zfs_symvnodeops)
+ vn_freevnodeops(zfs_symvnodeops);
+ if (zfs_xdvnodeops)
+ vn_freevnodeops(zfs_xdvnodeops);
+ if (zfs_evnodeops)
+ vn_freevnodeops(zfs_evnodeops);
+ if (zfs_sharevnodeops)
+ vn_freevnodeops(zfs_sharevnodeops);
+
+ zfs_dvnodeops = NULL;
+ zfs_fvnodeops = NULL;
+ zfs_symvnodeops = NULL;
+ zfs_xdvnodeops = NULL;
+ zfs_evnodeops = NULL;
+ zfs_sharevnodeops = NULL;
+}
+
+extern const fs_operation_def_t zfs_dvnodeops_template[];
+extern const fs_operation_def_t zfs_fvnodeops_template[];
+extern const fs_operation_def_t zfs_xdvnodeops_template[];
+extern const fs_operation_def_t zfs_symvnodeops_template[];
+extern const fs_operation_def_t zfs_evnodeops_template[];
+extern const fs_operation_def_t zfs_sharevnodeops_template[];
+
+int
+zfs_create_op_tables()
+{
+ int error;
+
+ /*
+ * zfs_dvnodeops can be set if mod_remove() calls mod_installfs()
+ * due to a failure to remove the 2nd modlinkage (zfs_modldrv).
+ * In this case we just return as the ops vectors are already set up.
+ */
+ if (zfs_dvnodeops)
+ return (0);
+
+ error = vn_make_ops(MNTTYPE_ZFS, zfs_dvnodeops_template,
+ &zfs_dvnodeops);
+ if (error)
+ return (error);
+
+ error = vn_make_ops(MNTTYPE_ZFS, zfs_fvnodeops_template,
+ &zfs_fvnodeops);
+ if (error)
+ return (error);
+
+ error = vn_make_ops(MNTTYPE_ZFS, zfs_symvnodeops_template,
+ &zfs_symvnodeops);
+ if (error)
+ return (error);
+
+ error = vn_make_ops(MNTTYPE_ZFS, zfs_xdvnodeops_template,
+ &zfs_xdvnodeops);
+ if (error)
+ return (error);
+
+ error = vn_make_ops(MNTTYPE_ZFS, zfs_evnodeops_template,
+ &zfs_evnodeops);
+ if (error)
+ return (error);
+
+ error = vn_make_ops(MNTTYPE_ZFS, zfs_sharevnodeops_template,
+ &zfs_sharevnodeops);
+
+ return (error);
+}
+#endif /* illumos */
+
+int
+zfs_create_share_dir(zfsvfs_t *zfsvfs, dmu_tx_t *tx)
+{
+ zfs_acl_ids_t acl_ids;
+ vattr_t vattr;
+ znode_t *sharezp;
+ znode_t *zp;
+ int error;
+
+ vattr.va_mask = AT_MODE|AT_UID|AT_GID|AT_TYPE;
+ vattr.va_type = VDIR;
+ vattr.va_mode = S_IFDIR|0555;
+ vattr.va_uid = crgetuid(kcred);
+ vattr.va_gid = crgetgid(kcred);
+
+ sharezp = zfs_znode_alloc_kmem(KM_SLEEP);
+ ASSERT(!POINTER_IS_VALID(sharezp->z_zfsvfs));
+ sharezp->z_moved = 0;
+ sharezp->z_unlinked = 0;
+ sharezp->z_atime_dirty = 0;
+ sharezp->z_zfsvfs = zfsvfs;
+ sharezp->z_is_sa = zfsvfs->z_use_sa;
+
+ VERIFY(0 == zfs_acl_ids_create(sharezp, IS_ROOT_NODE, &vattr,
+ kcred, NULL, &acl_ids));
+ zfs_mknode(sharezp, &vattr, tx, kcred, IS_ROOT_NODE, &zp, &acl_ids);
+ ASSERT3P(zp, ==, sharezp);
+ POINTER_INVALIDATE(&sharezp->z_zfsvfs);
+ error = zap_add(zfsvfs->z_os, MASTER_NODE_OBJ,
+ ZFS_SHARES_DIR, 8, 1, &sharezp->z_id, tx);
+ zfsvfs->z_shares_dir = sharezp->z_id;
+
+ zfs_acl_ids_free(&acl_ids);
+ sa_handle_destroy(sharezp->z_sa_hdl);
+ zfs_znode_free_kmem(sharezp);
+
+ return (error);
+}
+
+/*
+ * define a couple of values we need available
+ * for both 64 and 32 bit environments.
+ */
+#ifndef NBITSMINOR64
+#define NBITSMINOR64 32
+#endif
+#ifndef MAXMAJ64
+#define MAXMAJ64 0xffffffffUL
+#endif
+#ifndef MAXMIN64
+#define MAXMIN64 0xffffffffUL
+#endif
+
+/*
+ * Create special expldev for ZFS private use.
+ * Can't use standard expldev since it doesn't do
+ * what we want. The standard expldev() takes a
+ * dev32_t in LP64 and expands it to a long dev_t.
+ * We need an interface that takes a dev32_t in ILP32
+ * and expands it to a long dev_t.
+ */
+static uint64_t
+zfs_expldev(dev_t dev)
+{
+ return (((uint64_t)major(dev) << NBITSMINOR64) | minor(dev));
+}
+/*
+ * Special cmpldev for ZFS private use.
+ * Can't use standard cmpldev since it takes
+ * a long dev_t and compresses it to dev32_t in
+ * LP64. We need to do a compaction of a long dev_t
+ * to a dev32_t in ILP32.
+ */
+dev_t
+zfs_cmpldev(uint64_t dev)
+{
+ return (makedev((dev >> NBITSMINOR64), (dev & MAXMIN64)));
+}
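+
+/*
+ * Illustrative sketch, not authoritative: with NBITSMINOR64 == 32 the
+ * major number lives in the upper 32 bits of the 64-bit encoding and
+ * the minor number in the lower 32 bits, so the two helpers above are
+ * exact inverses of each other (the example device numbers below are
+ * arbitrary):
+ *
+ *	dev_t d = makedev(13, 7);
+ *	uint64_t enc = zfs_expldev(d);	// (13ULL << 32) | 7
+ *	dev_t back = zfs_cmpldev(enc);	// major(back) == 13, minor(back) == 7
+ */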
+
+static void
+zfs_znode_sa_init(zfsvfs_t *zfsvfs, znode_t *zp,
+ dmu_buf_t *db, dmu_object_type_t obj_type, sa_handle_t *sa_hdl)
+{
+ ASSERT(!POINTER_IS_VALID(zp->z_zfsvfs) || (zfsvfs == zp->z_zfsvfs));
+ ASSERT(MUTEX_HELD(ZFS_OBJ_MUTEX(zfsvfs, zp->z_id)));
+
+ ASSERT(zp->z_sa_hdl == NULL);
+ ASSERT(zp->z_acl_cached == NULL);
+ if (sa_hdl == NULL) {
+ VERIFY(0 == sa_handle_get_from_db(zfsvfs->z_os, db, zp,
+ SA_HDL_SHARED, &zp->z_sa_hdl));
+ } else {
+ zp->z_sa_hdl = sa_hdl;
+ sa_set_userp(sa_hdl, zp);
+ }
+
+ zp->z_is_sa = (obj_type == DMU_OT_SA) ? B_TRUE : B_FALSE;
+
+ /*
+ * Slap on VROOT if we are the root znode unless we are the root
+ * node of a snapshot mounted under .zfs.
+ */
+ if (zp->z_id == zfsvfs->z_root && zfsvfs->z_parent == zfsvfs)
+ ZTOV(zp)->v_flag |= VROOT;
+
+ vn_exists(ZTOV(zp));
+}
+
+void
+zfs_znode_dmu_fini(znode_t *zp)
+{
+ ASSERT(MUTEX_HELD(ZFS_OBJ_MUTEX(zp->z_zfsvfs, zp->z_id)) ||
+ zp->z_unlinked ||
+ ZFS_TEARDOWN_INACTIVE_WLOCKED(zp->z_zfsvfs));
+
+ sa_handle_destroy(zp->z_sa_hdl);
+ zp->z_sa_hdl = NULL;
+}
+
+static void
+zfs_vnode_forget(vnode_t *vp)
+{
+
+ /* copied from insmntque_stddtr */
+ vp->v_data = NULL;
+ vp->v_op = &dead_vnodeops;
+ vgone(vp);
+ vput(vp);
+}
+
+/*
+ * Construct a new znode/vnode and initialize.
+ *
+ * This does not do a call to dmu_set_user(); that is
+ * up to the caller to do, in case you don't want to
+ * return the znode.
+ */
+static znode_t *
+zfs_znode_alloc(zfsvfs_t *zfsvfs, dmu_buf_t *db, int blksz,
+ dmu_object_type_t obj_type, sa_handle_t *hdl)
+{
+ znode_t *zp;
+ vnode_t *vp;
+ uint64_t mode;
+ uint64_t parent;
+ sa_bulk_attr_t bulk[9];
+ int count = 0;
+ int error;
+
+ zp = zfs_znode_alloc_kmem(KM_SLEEP);
+
+#ifndef _ZFS_USE_SMR
+ KASSERT((zfsvfs->z_parent->z_vfs->mnt_kern_flag & MNTK_FPLOOKUP) == 0,
+ ("%s: fast path lookup enabled without smr", __func__));
+#endif
+
+ KASSERT(curthread->td_vp_reserved != NULL,
+ ("zfs_znode_alloc: getnewvnode without preallocated vnode"));
+ error = getnewvnode("zfs", zfsvfs->z_parent->z_vfs, &zfs_vnodeops, &vp);
+ if (error != 0) {
+ zfs_znode_free_kmem(zp);
+ return (NULL);
+ }
+ zp->z_vnode = vp;
+ vp->v_data = zp;
+
+ ASSERT(!POINTER_IS_VALID(zp->z_zfsvfs));
+ zp->z_moved = 0;
+
+ /*
+ * Defer setting z_zfsvfs until the znode is ready to be a candidate for
+ * the zfs_znode_move() callback.
+ */
+ zp->z_sa_hdl = NULL;
+ zp->z_unlinked = 0;
+ zp->z_atime_dirty = 0;
+ zp->z_mapcnt = 0;
+ zp->z_id = db->db_object;
+ zp->z_blksz = blksz;
+ zp->z_seq = 0x7A4653;
+ zp->z_sync_cnt = 0;
+
+ vp = ZTOV(zp);
+
+ zfs_znode_sa_init(zfsvfs, zp, db, obj_type, hdl);
+
+ SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_MODE(zfsvfs), NULL, &mode, 8);
+ SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_GEN(zfsvfs), NULL, &zp->z_gen, 8);
+ SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_SIZE(zfsvfs), NULL,
+ &zp->z_size, 8);
+ SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_LINKS(zfsvfs), NULL,
+ &zp->z_links, 8);
+ SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_FLAGS(zfsvfs), NULL,
+ &zp->z_pflags, 8);
+ SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_PARENT(zfsvfs), NULL, &parent, 8);
+ SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_ATIME(zfsvfs), NULL,
+ &zp->z_atime, 16);
+ SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_UID(zfsvfs), NULL,
+ &zp->z_uid, 8);
+ SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_GID(zfsvfs), NULL,
+ &zp->z_gid, 8);
+
+ if (sa_bulk_lookup(zp->z_sa_hdl, bulk, count) != 0 || zp->z_gen == 0) {
+ if (hdl == NULL)
+ sa_handle_destroy(zp->z_sa_hdl);
+ zfs_vnode_forget(vp);
+ zp->z_vnode = NULL;
+ zfs_znode_free_kmem(zp);
+ return (NULL);
+ }
+
+ zp->z_mode = mode;
+
+ vp->v_type = IFTOVT((mode_t)mode);
+
+ switch (vp->v_type) {
+ case VDIR:
+ zp->z_zn_prefetch = B_TRUE; /* z_prefetch default is enabled */
+ break;
+#ifdef illumos
+ case VBLK:
+ case VCHR:
+ {
+ uint64_t rdev;
+ VERIFY(sa_lookup(zp->z_sa_hdl, SA_ZPL_RDEV(zfsvfs),
+ &rdev, sizeof (rdev)) == 0);
+
+ vp->v_rdev = zfs_cmpldev(rdev);
+ }
+ break;
+#endif
+ case VFIFO:
+#ifdef illumos
+ case VSOCK:
+ case VDOOR:
+#endif
+ vp->v_op = &zfs_fifoops;
+ break;
+ case VREG:
+ if (parent == zfsvfs->z_shares_dir) {
+ ASSERT(zp->z_uid == 0 && zp->z_gid == 0);
+ vp->v_op = &zfs_shareops;
+ }
+ break;
+#ifdef illumos
+ case VLNK:
+ vn_setops(vp, zfs_symvnodeops);
+ break;
+ default:
+ vn_setops(vp, zfs_evnodeops);
+ break;
+#endif
+ }
+
+ mutex_enter(&zfsvfs->z_znodes_lock);
+ list_insert_tail(&zfsvfs->z_all_znodes, zp);
+ membar_producer();
+ /*
+ * Everything else must be valid before assigning z_zfsvfs, which
+ * makes the znode eligible for zfs_znode_move().
+ */
+ zp->z_zfsvfs = zfsvfs;
+ mutex_exit(&zfsvfs->z_znodes_lock);
+
+ /*
+ * Acquire vnode lock before making it available to the world.
+ */
+ vn_lock(vp, LK_EXCLUSIVE | LK_RETRY);
+ VN_LOCK_AREC(vp);
+ if (vp->v_type != VFIFO)
+ VN_LOCK_ASHARE(vp);
+
+#ifdef illumos
+ VFS_HOLD(zfsvfs->z_vfs);
+#endif
+ return (zp);
+}
+
+static uint64_t empty_xattr;
+static uint64_t pad[4];
+static zfs_acl_phys_t acl_phys;
+/*
+ * Create a new DMU object to hold a zfs znode.
+ *
+ * IN: dzp - parent directory for new znode
+ * vap - file attributes for new znode
+ * tx - dmu transaction id for zap operations
+ * cr - credentials of caller
+ * flag - flags:
+ * IS_ROOT_NODE - new object will be root
+ * IS_XATTR - new object is an attribute
+ * bonuslen - length of bonus buffer
+ * setaclp - File/Dir initial ACL
+ * fuidp - Tracks fuid allocation.
+ *
+ * OUT: zpp - allocated znode
+ *
+ */
+void
+zfs_mknode(znode_t *dzp, vattr_t *vap, dmu_tx_t *tx, cred_t *cr,
+ uint_t flag, znode_t **zpp, zfs_acl_ids_t *acl_ids)
+{
+ uint64_t crtime[2], atime[2], mtime[2], ctime[2];
+ uint64_t mode, size, links, parent, pflags;
+ uint64_t dzp_pflags = 0;
+ uint64_t rdev = 0;
+ zfsvfs_t *zfsvfs = dzp->z_zfsvfs;
+ dmu_buf_t *db;
+ timestruc_t now;
+ uint64_t gen, obj;
+ int err;
+ int bonuslen;
+ int dnodesize;
+ sa_handle_t *sa_hdl;
+ dmu_object_type_t obj_type;
+ sa_bulk_attr_t *sa_attrs;
+ int cnt = 0;
+ zfs_acl_locator_cb_t locate = { 0 };
+
+ ASSERT(vap && (vap->va_mask & (AT_TYPE|AT_MODE)) == (AT_TYPE|AT_MODE));
+
+ if (zfsvfs->z_replay) {
+ obj = vap->va_nodeid;
+ now = vap->va_ctime; /* see zfs_replay_create() */
+ gen = vap->va_nblocks; /* ditto */
+ dnodesize = vap->va_fsid; /* ditto */
+ } else {
+ obj = 0;
+ vfs_timestamp(&now);
+ gen = dmu_tx_get_txg(tx);
+ dnodesize = dmu_objset_dnodesize(zfsvfs->z_os);
+ }
+
+ if (dnodesize == 0)
+ dnodesize = DNODE_MIN_SIZE;
+
+ obj_type = zfsvfs->z_use_sa ? DMU_OT_SA : DMU_OT_ZNODE;
+ bonuslen = (obj_type == DMU_OT_SA) ?
+ DN_BONUS_SIZE(dnodesize) : ZFS_OLD_ZNODE_PHYS_SIZE;
+
+ /*
+ * Create a new DMU object.
+ */
+ /*
+ * There's currently no mechanism for pre-reading the blocks that will
+ * be needed to allocate a new object, so we accept the small chance
+ * that there will be an i/o error and we will fail one of the
+ * assertions below.
+ */
+ if (vap->va_type == VDIR) {
+ if (zfsvfs->z_replay) {
+ VERIFY0(zap_create_claim_norm_dnsize(zfsvfs->z_os, obj,
+ zfsvfs->z_norm, DMU_OT_DIRECTORY_CONTENTS,
+ obj_type, bonuslen, dnodesize, tx));
+ } else {
+ obj = zap_create_norm_dnsize(zfsvfs->z_os,
+ zfsvfs->z_norm, DMU_OT_DIRECTORY_CONTENTS,
+ obj_type, bonuslen, dnodesize, tx);
+ }
+ } else {
+ if (zfsvfs->z_replay) {
+ VERIFY0(dmu_object_claim_dnsize(zfsvfs->z_os, obj,
+ DMU_OT_PLAIN_FILE_CONTENTS, 0,
+ obj_type, bonuslen, dnodesize, tx));
+ } else {
+ obj = dmu_object_alloc_dnsize(zfsvfs->z_os,
+ DMU_OT_PLAIN_FILE_CONTENTS, 0,
+ obj_type, bonuslen, dnodesize, tx);
+ }
+ }
+
+ ZFS_OBJ_HOLD_ENTER(zfsvfs, obj);
+ VERIFY0(sa_buf_hold(zfsvfs->z_os, obj, NULL, &db));
+
+ /*
+ * If this is the root, fix up the half-initialized parent pointer
+ * to reference the just-allocated physical data area.
+ */
+ if (flag & IS_ROOT_NODE) {
+ dzp->z_id = obj;
+ } else {
+ dzp_pflags = dzp->z_pflags;
+ }
+
+ /*
+ * If parent is an xattr, so am I.
+ */
+ if (dzp_pflags & ZFS_XATTR) {
+ flag |= IS_XATTR;
+ }
+
+ if (zfsvfs->z_use_fuids)
+ pflags = ZFS_ARCHIVE | ZFS_AV_MODIFIED;
+ else
+ pflags = 0;
+
+ if (vap->va_type == VDIR) {
+ size = 2; /* contents ("." and "..") */
+ links = (flag & (IS_ROOT_NODE | IS_XATTR)) ? 2 : 1;
+ } else {
+ size = links = 0;
+ }
+
+ if (vap->va_type == VBLK || vap->va_type == VCHR) {
+ rdev = zfs_expldev(vap->va_rdev);
+ }
+
+ parent = dzp->z_id;
+ mode = acl_ids->z_mode;
+ if (flag & IS_XATTR)
+ pflags |= ZFS_XATTR;
+
+ /*
+ * Whether any execs are denied (ZFS_NO_EXECS_DENIED) will be
+ * determined when zfs_mode_compute() is called.
+ */
+ pflags |= acl_ids->z_aclp->z_hints &
+ (ZFS_ACL_TRIVIAL|ZFS_INHERIT_ACE|ZFS_ACL_AUTO_INHERIT|
+ ZFS_ACL_DEFAULTED|ZFS_ACL_PROTECTED);
+
+ ZFS_TIME_ENCODE(&now, crtime);
+ ZFS_TIME_ENCODE(&now, ctime);
+
+ if (vap->va_mask & AT_ATIME) {
+ ZFS_TIME_ENCODE(&vap->va_atime, atime);
+ } else {
+ ZFS_TIME_ENCODE(&now, atime);
+ }
+
+ if (vap->va_mask & AT_MTIME) {
+ ZFS_TIME_ENCODE(&vap->va_mtime, mtime);
+ } else {
+ ZFS_TIME_ENCODE(&now, mtime);
+ }
+
+ /* Now add in all of the "SA" attributes */
+ VERIFY(0 == sa_handle_get_from_db(zfsvfs->z_os, db, NULL, SA_HDL_SHARED,
+ &sa_hdl));
+
+ /*
+ * Set up the array of attributes to be replaced/set on the new file.
+ *
+ * The order for DMU_OT_ZNODE is critical since it needs to be
+ * constructed in the old znode_phys_t format. Don't change this
+ * ordering.
+ */
+ sa_attrs = kmem_alloc(sizeof (sa_bulk_attr_t) * ZPL_END, KM_SLEEP);
+
+ if (obj_type == DMU_OT_ZNODE) {
+ SA_ADD_BULK_ATTR(sa_attrs, cnt, SA_ZPL_ATIME(zfsvfs),
+ NULL, &atime, 16);
+ SA_ADD_BULK_ATTR(sa_attrs, cnt, SA_ZPL_MTIME(zfsvfs),
+ NULL, &mtime, 16);
+ SA_ADD_BULK_ATTR(sa_attrs, cnt, SA_ZPL_CTIME(zfsvfs),
+ NULL, &ctime, 16);
+ SA_ADD_BULK_ATTR(sa_attrs, cnt, SA_ZPL_CRTIME(zfsvfs),
+ NULL, &crtime, 16);
+ SA_ADD_BULK_ATTR(sa_attrs, cnt, SA_ZPL_GEN(zfsvfs),
+ NULL, &gen, 8);
+ SA_ADD_BULK_ATTR(sa_attrs, cnt, SA_ZPL_MODE(zfsvfs),
+ NULL, &mode, 8);
+ SA_ADD_BULK_ATTR(sa_attrs, cnt, SA_ZPL_SIZE(zfsvfs),
+ NULL, &size, 8);
+ SA_ADD_BULK_ATTR(sa_attrs, cnt, SA_ZPL_PARENT(zfsvfs),
+ NULL, &parent, 8);
+ } else {
+ SA_ADD_BULK_ATTR(sa_attrs, cnt, SA_ZPL_MODE(zfsvfs),
+ NULL, &mode, 8);
+ SA_ADD_BULK_ATTR(sa_attrs, cnt, SA_ZPL_SIZE(zfsvfs),
+ NULL, &size, 8);
+ SA_ADD_BULK_ATTR(sa_attrs, cnt, SA_ZPL_GEN(zfsvfs),
+ NULL, &gen, 8);
+ SA_ADD_BULK_ATTR(sa_attrs, cnt, SA_ZPL_UID(zfsvfs),
+ NULL, &acl_ids->z_fuid, 8);
+ SA_ADD_BULK_ATTR(sa_attrs, cnt, SA_ZPL_GID(zfsvfs),
+ NULL, &acl_ids->z_fgid, 8);
+ SA_ADD_BULK_ATTR(sa_attrs, cnt, SA_ZPL_PARENT(zfsvfs),
+ NULL, &parent, 8);
+ SA_ADD_BULK_ATTR(sa_attrs, cnt, SA_ZPL_FLAGS(zfsvfs),
+ NULL, &pflags, 8);
+ SA_ADD_BULK_ATTR(sa_attrs, cnt, SA_ZPL_ATIME(zfsvfs),
+ NULL, &atime, 16);
+ SA_ADD_BULK_ATTR(sa_attrs, cnt, SA_ZPL_MTIME(zfsvfs),
+ NULL, &mtime, 16);
+ SA_ADD_BULK_ATTR(sa_attrs, cnt, SA_ZPL_CTIME(zfsvfs),
+ NULL, &ctime, 16);
+ SA_ADD_BULK_ATTR(sa_attrs, cnt, SA_ZPL_CRTIME(zfsvfs),
+ NULL, &crtime, 16);
+ }
+
+ SA_ADD_BULK_ATTR(sa_attrs, cnt, SA_ZPL_LINKS(zfsvfs), NULL, &links, 8);
+
+ if (obj_type == DMU_OT_ZNODE) {
+ SA_ADD_BULK_ATTR(sa_attrs, cnt, SA_ZPL_XATTR(zfsvfs), NULL,
+ &empty_xattr, 8);
+ }
+ if (obj_type == DMU_OT_ZNODE ||
+ (vap->va_type == VBLK || vap->va_type == VCHR)) {
+ SA_ADD_BULK_ATTR(sa_attrs, cnt, SA_ZPL_RDEV(zfsvfs),
+ NULL, &rdev, 8);
+
+ }
+ if (obj_type == DMU_OT_ZNODE) {
+ SA_ADD_BULK_ATTR(sa_attrs, cnt, SA_ZPL_FLAGS(zfsvfs),
+ NULL, &pflags, 8);
+ SA_ADD_BULK_ATTR(sa_attrs, cnt, SA_ZPL_UID(zfsvfs), NULL,
+ &acl_ids->z_fuid, 8);
+ SA_ADD_BULK_ATTR(sa_attrs, cnt, SA_ZPL_GID(zfsvfs), NULL,
+ &acl_ids->z_fgid, 8);
+ SA_ADD_BULK_ATTR(sa_attrs, cnt, SA_ZPL_PAD(zfsvfs), NULL, pad,
+ sizeof (uint64_t) * 4);
+ SA_ADD_BULK_ATTR(sa_attrs, cnt, SA_ZPL_ZNODE_ACL(zfsvfs), NULL,
+ &acl_phys, sizeof (zfs_acl_phys_t));
+ } else if (acl_ids->z_aclp->z_version >= ZFS_ACL_VERSION_FUID) {
+ SA_ADD_BULK_ATTR(sa_attrs, cnt, SA_ZPL_DACL_COUNT(zfsvfs), NULL,
+ &acl_ids->z_aclp->z_acl_count, 8);
+ locate.cb_aclp = acl_ids->z_aclp;
+ SA_ADD_BULK_ATTR(sa_attrs, cnt, SA_ZPL_DACL_ACES(zfsvfs),
+ zfs_acl_data_locator, &locate,
+ acl_ids->z_aclp->z_acl_bytes);
+ mode = zfs_mode_compute(mode, acl_ids->z_aclp, &pflags,
+ acl_ids->z_fuid, acl_ids->z_fgid);
+ }
+
+ VERIFY(sa_replace_all_by_template(sa_hdl, sa_attrs, cnt, tx) == 0);
+
+ if (!(flag & IS_ROOT_NODE)) {
+ *zpp = zfs_znode_alloc(zfsvfs, db, 0, obj_type, sa_hdl);
+ ASSERT(*zpp != NULL);
+ } else {
+ /*
+ * If we are creating the root node, the "parent" we
+ * passed in is the znode for the root.
+ */
+ *zpp = dzp;
+
+ (*zpp)->z_sa_hdl = sa_hdl;
+ }
+
+ (*zpp)->z_pflags = pflags;
+ (*zpp)->z_mode = mode;
+ (*zpp)->z_dnodesize = dnodesize;
+
+ if (vap->va_mask & AT_XVATTR)
+ zfs_xvattr_set(*zpp, (xvattr_t *)vap, tx);
+
+ if (obj_type == DMU_OT_ZNODE ||
+ acl_ids->z_aclp->z_version < ZFS_ACL_VERSION_FUID) {
+ VERIFY0(zfs_aclset_common(*zpp, acl_ids->z_aclp, cr, tx));
+ }
+ if (!(flag & IS_ROOT_NODE)) {
+ vnode_t *vp;
+
+ vp = ZTOV(*zpp);
+ vp->v_vflag |= VV_FORCEINSMQ;
+ err = insmntque(vp, zfsvfs->z_vfs);
+ vp->v_vflag &= ~VV_FORCEINSMQ;
+ KASSERT(err == 0, ("insmntque() failed: error %d", err));
+ }
+ kmem_free(sa_attrs, sizeof (sa_bulk_attr_t) * ZPL_END);
+ ZFS_OBJ_HOLD_EXIT(zfsvfs, obj);
+}
+
+/*
+ * Update in-core attributes. It is assumed the caller will be doing an
+ * sa_bulk_update to push the changes out.
+ */
+void
+zfs_xvattr_set(znode_t *zp, xvattr_t *xvap, dmu_tx_t *tx)
+{
+ xoptattr_t *xoap;
+
+ xoap = xva_getxoptattr(xvap);
+ ASSERT(xoap);
+
+ ASSERT_VOP_IN_SEQC(ZTOV(zp));
+
+ if (XVA_ISSET_REQ(xvap, XAT_CREATETIME)) {
+ uint64_t times[2];
+ ZFS_TIME_ENCODE(&xoap->xoa_createtime, times);
+ (void) sa_update(zp->z_sa_hdl, SA_ZPL_CRTIME(zp->z_zfsvfs),
+ &times, sizeof (times), tx);
+ XVA_SET_RTN(xvap, XAT_CREATETIME);
+ }
+ if (XVA_ISSET_REQ(xvap, XAT_READONLY)) {
+ ZFS_ATTR_SET(zp, ZFS_READONLY, xoap->xoa_readonly,
+ zp->z_pflags, tx);
+ XVA_SET_RTN(xvap, XAT_READONLY);
+ }
+ if (XVA_ISSET_REQ(xvap, XAT_HIDDEN)) {
+ ZFS_ATTR_SET(zp, ZFS_HIDDEN, xoap->xoa_hidden,
+ zp->z_pflags, tx);
+ XVA_SET_RTN(xvap, XAT_HIDDEN);
+ }
+ if (XVA_ISSET_REQ(xvap, XAT_SYSTEM)) {
+ ZFS_ATTR_SET(zp, ZFS_SYSTEM, xoap->xoa_system,
+ zp->z_pflags, tx);
+ XVA_SET_RTN(xvap, XAT_SYSTEM);
+ }
+ if (XVA_ISSET_REQ(xvap, XAT_ARCHIVE)) {
+ ZFS_ATTR_SET(zp, ZFS_ARCHIVE, xoap->xoa_archive,
+ zp->z_pflags, tx);
+ XVA_SET_RTN(xvap, XAT_ARCHIVE);
+ }
+ if (XVA_ISSET_REQ(xvap, XAT_IMMUTABLE)) {
+ ZFS_ATTR_SET(zp, ZFS_IMMUTABLE, xoap->xoa_immutable,
+ zp->z_pflags, tx);
+ XVA_SET_RTN(xvap, XAT_IMMUTABLE);
+ }
+ if (XVA_ISSET_REQ(xvap, XAT_NOUNLINK)) {
+ ZFS_ATTR_SET(zp, ZFS_NOUNLINK, xoap->xoa_nounlink,
+ zp->z_pflags, tx);
+ XVA_SET_RTN(xvap, XAT_NOUNLINK);
+ }
+ if (XVA_ISSET_REQ(xvap, XAT_APPENDONLY)) {
+ ZFS_ATTR_SET(zp, ZFS_APPENDONLY, xoap->xoa_appendonly,
+ zp->z_pflags, tx);
+ XVA_SET_RTN(xvap, XAT_APPENDONLY);
+ }
+ if (XVA_ISSET_REQ(xvap, XAT_NODUMP)) {
+ ZFS_ATTR_SET(zp, ZFS_NODUMP, xoap->xoa_nodump,
+ zp->z_pflags, tx);
+ XVA_SET_RTN(xvap, XAT_NODUMP);
+ }
+ if (XVA_ISSET_REQ(xvap, XAT_OPAQUE)) {
+ ZFS_ATTR_SET(zp, ZFS_OPAQUE, xoap->xoa_opaque,
+ zp->z_pflags, tx);
+ XVA_SET_RTN(xvap, XAT_OPAQUE);
+ }
+ if (XVA_ISSET_REQ(xvap, XAT_AV_QUARANTINED)) {
+ ZFS_ATTR_SET(zp, ZFS_AV_QUARANTINED,
+ xoap->xoa_av_quarantined, zp->z_pflags, tx);
+ XVA_SET_RTN(xvap, XAT_AV_QUARANTINED);
+ }
+ if (XVA_ISSET_REQ(xvap, XAT_AV_MODIFIED)) {
+ ZFS_ATTR_SET(zp, ZFS_AV_MODIFIED, xoap->xoa_av_modified,
+ zp->z_pflags, tx);
+ XVA_SET_RTN(xvap, XAT_AV_MODIFIED);
+ }
+ if (XVA_ISSET_REQ(xvap, XAT_AV_SCANSTAMP)) {
+ zfs_sa_set_scanstamp(zp, xvap, tx);
+ XVA_SET_RTN(xvap, XAT_AV_SCANSTAMP);
+ }
+ if (XVA_ISSET_REQ(xvap, XAT_REPARSE)) {
+ ZFS_ATTR_SET(zp, ZFS_REPARSE, xoap->xoa_reparse,
+ zp->z_pflags, tx);
+ XVA_SET_RTN(xvap, XAT_REPARSE);
+ }
+ if (XVA_ISSET_REQ(xvap, XAT_OFFLINE)) {
+ ZFS_ATTR_SET(zp, ZFS_OFFLINE, xoap->xoa_offline,
+ zp->z_pflags, tx);
+ XVA_SET_RTN(xvap, XAT_OFFLINE);
+ }
+ if (XVA_ISSET_REQ(xvap, XAT_SPARSE)) {
+ ZFS_ATTR_SET(zp, ZFS_SPARSE, xoap->xoa_sparse,
+ zp->z_pflags, tx);
+ XVA_SET_RTN(xvap, XAT_SPARSE);
+ }
+}
+
+int
+zfs_zget(zfsvfs_t *zfsvfs, uint64_t obj_num, znode_t **zpp)
+{
+ dmu_object_info_t doi;
+ dmu_buf_t *db;
+ znode_t *zp;
+ vnode_t *vp;
+ sa_handle_t *hdl;
+ int locked;
+ int err;
+
+ getnewvnode_reserve();
+again:
+ *zpp = NULL;
+ ZFS_OBJ_HOLD_ENTER(zfsvfs, obj_num);
+
+ err = sa_buf_hold(zfsvfs->z_os, obj_num, NULL, &db);
+ if (err) {
+ ZFS_OBJ_HOLD_EXIT(zfsvfs, obj_num);
+ getnewvnode_drop_reserve();
+ return (err);
+ }
+
+ dmu_object_info_from_db(db, &doi);
+ if (doi.doi_bonus_type != DMU_OT_SA &&
+ (doi.doi_bonus_type != DMU_OT_ZNODE ||
+ (doi.doi_bonus_type == DMU_OT_ZNODE &&
+ doi.doi_bonus_size < sizeof (znode_phys_t)))) {
+ sa_buf_rele(db, NULL);
+ ZFS_OBJ_HOLD_EXIT(zfsvfs, obj_num);
+#ifdef __FreeBSD__
+ getnewvnode_drop_reserve();
+#endif
+ return (SET_ERROR(EINVAL));
+ }
+
+ hdl = dmu_buf_get_user(db);
+ if (hdl != NULL) {
+ zp = sa_get_userdata(hdl);
+
+ /*
+ * Since "SA" does immediate eviction we
+ * should never find an SA handle that doesn't
+ * know about the znode.
+ */
+ ASSERT3P(zp, !=, NULL);
+ ASSERT3U(zp->z_id, ==, obj_num);
+ if (zp->z_unlinked) {
+ err = SET_ERROR(ENOENT);
+ } else {
+ vp = ZTOV(zp);
+ /*
+ * Don't let the vnode disappear after
+ * ZFS_OBJ_HOLD_EXIT.
+ */
+ VN_HOLD(vp);
+ *zpp = zp;
+ err = 0;
+ }
+
+ sa_buf_rele(db, NULL);
+ ZFS_OBJ_HOLD_EXIT(zfsvfs, obj_num);
+
+ if (err) {
+ getnewvnode_drop_reserve();
+ return (err);
+ }
+
+ locked = VOP_ISLOCKED(vp);
+ VI_LOCK(vp);
+ if (VN_IS_DOOMED(vp) && locked != LK_EXCLUSIVE) {
+ /*
+ * The vnode is doomed and this thread doesn't
+ * hold the exclusive lock on it, so the vnode
+ * must be being reclaimed by another thread.
+ * Otherwise the doomed vnode is being reclaimed
+ * by this thread and zfs_zget is called from
+ * ZIL internals.
+ */
+ VI_UNLOCK(vp);
+
+ /*
+ * XXX vrele() locks the vnode when the last reference
+ * is dropped. Although in this case the vnode is
+ * doomed / dead and so no inactivation is required,
+ * the vnode lock is still acquired. That could result
+ * in a LOR with z_teardown_lock if another thread holds
+ * the vnode's lock and tries to take z_teardown_lock.
+ * But that is only possible if the other thread performs
+ * a ZFS vnode operation on the vnode. That either
+ * should not happen if the vnode is dead or the thread
+ * should also have a reference to the vnode and thus
+ * our reference is not last.
+ */
+ VN_RELE(vp);
+ goto again;
+ }
+ VI_UNLOCK(vp);
+ getnewvnode_drop_reserve();
+ return (err);
+ }
+
+ /*
+ * Not found; create a new znode/vnode,
+ * but only if the file exists.
+ *
+ * There is a small window where zfs_vget() could
+ * find this object while a file create is still in
+ * progress. This is checked for in zfs_znode_alloc().
+ *
+ * If zfs_znode_alloc() fails it will drop the hold on the
+ * bonus buffer.
+ */
+ zp = zfs_znode_alloc(zfsvfs, db, doi.doi_data_block_size,
+ doi.doi_bonus_type, NULL);
+ if (zp == NULL) {
+ err = SET_ERROR(ENOENT);
+ } else {
+ *zpp = zp;
+ }
+ if (err == 0) {
+ vnode_t *vp = ZTOV(zp);
+
+ err = insmntque(vp, zfsvfs->z_vfs);
+ if (err == 0) {
+ vp->v_hash = obj_num;
+ VOP_UNLOCK(vp);
+ } else {
+ zp->z_vnode = NULL;
+ zfs_znode_dmu_fini(zp);
+ zfs_znode_free(zp);
+ *zpp = NULL;
+ }
+ }
+ ZFS_OBJ_HOLD_EXIT(zfsvfs, obj_num);
+ getnewvnode_drop_reserve();
+ return (err);
+}
+
+int
+zfs_rezget(znode_t *zp)
+{
+ zfsvfs_t *zfsvfs = zp->z_zfsvfs;
+ dmu_object_info_t doi;
+ dmu_buf_t *db;
+ vnode_t *vp;
+ uint64_t obj_num = zp->z_id;
+ uint64_t mode, size;
+ sa_bulk_attr_t bulk[8];
+ int err;
+ int count = 0;
+ uint64_t gen;
+
+ /*
+ * Remove cached pages before reloading the znode, so that they are not
+ * lingering after we run into any error. Ideally, we should vgone()
+ * the vnode in case of error, but currently we cannot do that
+ * because of the LOR between the vnode lock and z_teardown_lock.
+ * So, instead, we have to "doom" the znode in the illumos style.
+ */
+ vp = ZTOV(zp);
+ vn_pages_remove(vp, 0, 0);
+
+ ZFS_OBJ_HOLD_ENTER(zfsvfs, obj_num);
+
+ mutex_enter(&zp->z_acl_lock);
+ if (zp->z_acl_cached) {
+ zfs_acl_free(zp->z_acl_cached);
+ zp->z_acl_cached = NULL;
+ }
+
+ mutex_exit(&zp->z_acl_lock);
+ ASSERT(zp->z_sa_hdl == NULL);
+ err = sa_buf_hold(zfsvfs->z_os, obj_num, NULL, &db);
+ if (err) {
+ ZFS_OBJ_HOLD_EXIT(zfsvfs, obj_num);
+ return (err);
+ }
+
+ dmu_object_info_from_db(db, &doi);
+ if (doi.doi_bonus_type != DMU_OT_SA &&
+ (doi.doi_bonus_type != DMU_OT_ZNODE ||
+ (doi.doi_bonus_type == DMU_OT_ZNODE &&
+ doi.doi_bonus_size < sizeof (znode_phys_t)))) {
+ sa_buf_rele(db, NULL);
+ ZFS_OBJ_HOLD_EXIT(zfsvfs, obj_num);
+ return (SET_ERROR(EINVAL));
+ }
+
+ zfs_znode_sa_init(zfsvfs, zp, db, doi.doi_bonus_type, NULL);
+ size = zp->z_size;
+
+ /* reload cached values */
+ SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_GEN(zfsvfs), NULL,
+ &gen, sizeof (gen));
+ SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_SIZE(zfsvfs), NULL,
+ &zp->z_size, sizeof (zp->z_size));
+ SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_LINKS(zfsvfs), NULL,
+ &zp->z_links, sizeof (zp->z_links));
+ SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_FLAGS(zfsvfs), NULL,
+ &zp->z_pflags, sizeof (zp->z_pflags));
+ SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_ATIME(zfsvfs), NULL,
+ &zp->z_atime, sizeof (zp->z_atime));
+ SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_UID(zfsvfs), NULL,
+ &zp->z_uid, sizeof (zp->z_uid));
+ SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_GID(zfsvfs), NULL,
+ &zp->z_gid, sizeof (zp->z_gid));
+ SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_MODE(zfsvfs), NULL,
+ &mode, sizeof (mode));
+
+ if (sa_bulk_lookup(zp->z_sa_hdl, bulk, count)) {
+ zfs_znode_dmu_fini(zp);
+ ZFS_OBJ_HOLD_EXIT(zfsvfs, obj_num);
+ return (SET_ERROR(EIO));
+ }
+
+ zp->z_mode = mode;
+
+ if (gen != zp->z_gen) {
+ zfs_znode_dmu_fini(zp);
+ ZFS_OBJ_HOLD_EXIT(zfsvfs, obj_num);
+ return (SET_ERROR(EIO));
+ }
+
+ /*
+ * It is highly improbable but still quite possible that two
+ * objects in different datasets are created with the same
+ * object numbers and in transaction groups with the same
+ * numbers. znodes corresponding to those objects would
+ * have the same z_id and z_gen, but their other attributes
+ * may be different.
+ * zfs recv -F may replace one such object with the other.
+ * As a result, file properties recorded in the replaced
+ * object's vnode may no longer match the received object's
+ * properties. At present the only cached property is the
+ * file's type recorded in v_type.
+ * So, handle this case by leaving the old vnode and znode
+ * disassociated from the actual object. A new vnode and a
+ * znode will be created if the object is accessed
+ * (e.g. via a look-up). The old vnode and znode will be
+ * recycled when the last vnode reference is dropped.
+ */
+ if (vp->v_type != IFTOVT((mode_t)zp->z_mode)) {
+ zfs_znode_dmu_fini(zp);
+ ZFS_OBJ_HOLD_EXIT(zfsvfs, obj_num);
+ return (SET_ERROR(EIO));
+ }
+
+ /*
+ * If the file has zero links, then it has been unlinked on the send
+ * side and it must be in the received unlinked set.
+ * We call zfs_znode_dmu_fini() now to prevent any accesses to the
+ * stale data and to prevent automatic removal of the file in
+ * zfs_zinactive(). The file will be removed either when it is removed
+ * on the send side and the next incremental stream is received or
+ * when the unlinked set gets processed.
+ */
+ zp->z_unlinked = (zp->z_links == 0);
+ if (zp->z_unlinked) {
+ zfs_znode_dmu_fini(zp);
+ ZFS_OBJ_HOLD_EXIT(zfsvfs, obj_num);
+ return (0);
+ }
+
+ zp->z_blksz = doi.doi_data_block_size;
+ if (zp->z_size != size)
+ vnode_pager_setsize(vp, zp->z_size);
+
+ ZFS_OBJ_HOLD_EXIT(zfsvfs, obj_num);
+
+ return (0);
+}
+
+void
+zfs_znode_delete(znode_t *zp, dmu_tx_t *tx)
+{
+ zfsvfs_t *zfsvfs = zp->z_zfsvfs;
+ objset_t *os = zfsvfs->z_os;
+ uint64_t obj = zp->z_id;
+ uint64_t acl_obj = zfs_external_acl(zp);
+
+ ZFS_OBJ_HOLD_ENTER(zfsvfs, obj);
+ if (acl_obj) {
+ VERIFY(!zp->z_is_sa);
+ VERIFY(0 == dmu_object_free(os, acl_obj, tx));
+ }
+ VERIFY(0 == dmu_object_free(os, obj, tx));
+ zfs_znode_dmu_fini(zp);
+ ZFS_OBJ_HOLD_EXIT(zfsvfs, obj);
+ zfs_znode_free(zp);
+}
+
+void
+zfs_zinactive(znode_t *zp)
+{
+ zfsvfs_t *zfsvfs = zp->z_zfsvfs;
+ uint64_t z_id = zp->z_id;
+
+ ASSERT(zp->z_sa_hdl);
+
+ /*
+ * Don't allow a zfs_zget() while we're trying to release this znode.
+ */
+ ZFS_OBJ_HOLD_ENTER(zfsvfs, z_id);
+
+ /*
+ * If this was the last reference to a file with no links, remove
+ * the file from the file system unless the file system is mounted
+ * read-only. That can happen, for example, if the file system was
+ * originally read-write, the file was opened, then unlinked and
+ * the file system was made read-only before the file was finally
+ * closed. The file will remain in the unlinked set.
+ */
+ if (zp->z_unlinked) {
+ ASSERT(!zfsvfs->z_issnap);
+ if ((zfsvfs->z_vfs->vfs_flag & VFS_RDONLY) == 0) {
+ ZFS_OBJ_HOLD_EXIT(zfsvfs, z_id);
+ zfs_rmnode(zp);
+ return;
+ }
+ }
+
+ zfs_znode_dmu_fini(zp);
+ ZFS_OBJ_HOLD_EXIT(zfsvfs, z_id);
+ zfs_znode_free(zp);
+}
+
+void
+zfs_znode_free(znode_t *zp)
+{
+ zfsvfs_t *zfsvfs = zp->z_zfsvfs;
+
+ ASSERT(zp->z_sa_hdl == NULL);
+ zp->z_vnode = NULL;
+ mutex_enter(&zfsvfs->z_znodes_lock);
+ POINTER_INVALIDATE(&zp->z_zfsvfs);
+ list_remove(&zfsvfs->z_all_znodes, zp);
+ mutex_exit(&zfsvfs->z_znodes_lock);
+
+ if (zp->z_acl_cached) {
+ zfs_acl_free(zp->z_acl_cached);
+ zp->z_acl_cached = NULL;
+ }
+
+ zfs_znode_free_kmem(zp);
+
+#ifdef illumos
+ VFS_RELE(zfsvfs->z_vfs);
+#endif
+}
+
+void
+zfs_tstamp_update_setup(znode_t *zp, uint_t flag, uint64_t mtime[2],
+ uint64_t ctime[2], boolean_t have_tx)
+{
+ timestruc_t now;
+
+ vfs_timestamp(&now);
+
+ if (have_tx) { /* will sa_bulk_update happen really soon? */
+ zp->z_atime_dirty = 0;
+ zp->z_seq++;
+ } else {
+ zp->z_atime_dirty = 1;
+ }
+
+ if (flag & AT_ATIME) {
+ ZFS_TIME_ENCODE(&now, zp->z_atime);
+ }
+
+ if (flag & AT_MTIME) {
+ ZFS_TIME_ENCODE(&now, mtime);
+ if (zp->z_zfsvfs->z_use_fuids) {
+ zp->z_pflags |= (ZFS_ARCHIVE |
+ ZFS_AV_MODIFIED);
+ }
+ }
+
+ if (flag & AT_CTIME) {
+ ZFS_TIME_ENCODE(&now, ctime);
+ if (zp->z_zfsvfs->z_use_fuids)
+ zp->z_pflags |= ZFS_ARCHIVE;
+ }
+}
+
+/*
+ * Grow the block size for a file.
+ *
+ * IN: zp - znode of file to free data in.
+ * size - requested block size
+ * tx - open transaction.
+ *
+ * NOTE: this function assumes that the znode is write locked.
+ */
+void
+zfs_grow_blocksize(znode_t *zp, uint64_t size, dmu_tx_t *tx)
+{
+ int error;
+ u_longlong_t dummy;
+
+ if (size <= zp->z_blksz)
+ return;
+ /*
+ * If the file size is already greater than the current blocksize,
+ * we will not grow. If there is more than one block in a file,
+ * the blocksize cannot change.
+ */
+ if (zp->z_blksz && zp->z_size > zp->z_blksz)
+ return;
+
+ error = dmu_object_set_blocksize(zp->z_zfsvfs->z_os, zp->z_id,
+ size, 0, tx);
+
+ if (error == ENOTSUP)
+ return;
+ ASSERT0(error);
+
+ /* What blocksize did we actually get? */
+ dmu_object_size_from_db(sa_get_db(zp->z_sa_hdl), &zp->z_blksz, &dummy);
+}
+
+#ifdef illumos
+/*
+ * This is a dummy interface used when pvn_vplist_dirty() should *not*
+ * be calling back into the fs for a putpage(). E.g.: when truncating
+ * a file, the pages being "thrown away" don't need to be written out.
+ */
+/* ARGSUSED */
+static int
+zfs_no_putpage(vnode_t *vp, page_t *pp, u_offset_t *offp, size_t *lenp,
+ int flags, cred_t *cr)
+{
+ ASSERT(0);
+ return (0);
+}
+#endif
+
+/*
+ * Increase the file length
+ *
+ * IN: zp - znode of file to free data in.
+ * end - new end-of-file
+ *
+ * RETURN: 0 on success, error code on failure
+ */
+static int
+zfs_extend(znode_t *zp, uint64_t end)
+{
+ zfsvfs_t *zfsvfs = zp->z_zfsvfs;
+ dmu_tx_t *tx;
+ locked_range_t *lr;
+ uint64_t newblksz;
+ int error;
+
+ /*
+ * We will change zp_size, lock the whole file.
+ */
+ lr = rangelock_enter(&zp->z_rangelock, 0, UINT64_MAX, RL_WRITER);
+
+ /*
+ * Nothing to do if file already at desired length.
+ */
+ if (end <= zp->z_size) {
+ rangelock_exit(lr);
+ return (0);
+ }
+ tx = dmu_tx_create(zfsvfs->z_os);
+ dmu_tx_hold_sa(tx, zp->z_sa_hdl, B_FALSE);
+ zfs_sa_upgrade_txholds(tx, zp);
+ if (end > zp->z_blksz &&
+ (!ISP2(zp->z_blksz) || zp->z_blksz < zfsvfs->z_max_blksz)) {
+ /*
+ * We are growing the file past the current block size.
+ */
+ if (zp->z_blksz > zp->z_zfsvfs->z_max_blksz) {
+ /*
+ * File's blocksize is already larger than the
+ * "recordsize" property. Only let it grow to
+ * the next power of 2.
+ */
+ ASSERT(!ISP2(zp->z_blksz));
+ newblksz = MIN(end, 1 << highbit64(zp->z_blksz));
+ } else {
+ newblksz = MIN(end, zp->z_zfsvfs->z_max_blksz);
+ }
+ dmu_tx_hold_write(tx, zp->z_id, 0, newblksz);
+ } else {
+ newblksz = 0;
+ }
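+ /*
+ * Worked example of the newblksz computation above, with
+ * hypothetical numbers: for a file whose single block is 4K and
+ * whose z_max_blksz is 128K, extending to end = 20K picks
+ * newblksz = MIN(20K, 128K) = 20K, while extending to end = 1M
+ * picks newblksz = MIN(1M, 128K) = 128K, the "recordsize" cap.
+ */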
+
+ error = dmu_tx_assign(tx, TXG_WAIT);
+ if (error) {
+ dmu_tx_abort(tx);
+ rangelock_exit(lr);
+ return (error);
+ }
+
+ if (newblksz)
+ zfs_grow_blocksize(zp, newblksz, tx);
+
+ zp->z_size = end;
+
+ VERIFY(0 == sa_update(zp->z_sa_hdl, SA_ZPL_SIZE(zp->z_zfsvfs),
+ &zp->z_size, sizeof (zp->z_size), tx));
+
+ vnode_pager_setsize(ZTOV(zp), end);
+
+ rangelock_exit(lr);
+
+ dmu_tx_commit(tx);
+
+ return (0);
+}
+
+/*
+ * Free space in a file.
+ *
+ * IN: zp - znode of file to free data in.
+ * off - start of section to free.
+ * len - length of section to free.
+ *
+ * RETURN: 0 on success, error code on failure
+ */
+static int
+zfs_free_range(znode_t *zp, uint64_t off, uint64_t len)
+{
+ zfsvfs_t *zfsvfs = zp->z_zfsvfs;
+ locked_range_t *lr;
+ int error;
+
+ /*
+ * Lock the range being freed.
+ */
+ lr = rangelock_enter(&zp->z_rangelock, off, len, RL_WRITER);
+
+ /*
+ * Nothing to do if file already at desired length.
+ */
+ if (off >= zp->z_size) {
+ rangelock_exit(lr);
+ return (0);
+ }
+
+ if (off + len > zp->z_size)
+ len = zp->z_size - off;
+
+ error = dmu_free_long_range(zfsvfs->z_os, zp->z_id, off, len);
+
+ if (error == 0) {
+ /*
+ * In FreeBSD we cannot free a block in the middle of a file,
+ * but only at the end of a file, so this code path should
+ * never happen.
+ */
+ vnode_pager_setsize(ZTOV(zp), off);
+ }
+
+ rangelock_exit(lr);
+
+ return (error);
+}
+
+/*
+ * Truncate a file
+ *
+ * IN: zp - znode of file to free data in.
+ * end - new end-of-file.
+ *
+ * RETURN: 0 on success, error code on failure
+ */
+static int
+zfs_trunc(znode_t *zp, uint64_t end)
+{
+ zfsvfs_t *zfsvfs = zp->z_zfsvfs;
+ vnode_t *vp = ZTOV(zp);
+ dmu_tx_t *tx;
+ locked_range_t *lr;
+ int error;
+ sa_bulk_attr_t bulk[2];
+ int count = 0;
+
+ /*
+ * We will change zp_size, lock the whole file.
+ */
+ lr = rangelock_enter(&zp->z_rangelock, 0, UINT64_MAX, RL_WRITER);
+
+ /*
+ * Nothing to do if file already at desired length.
+ */
+ if (end >= zp->z_size) {
+ rangelock_exit(lr);
+ return (0);
+ }
+
+ error = dmu_free_long_range(zfsvfs->z_os, zp->z_id, end,
+ DMU_OBJECT_END);
+ if (error) {
+ rangelock_exit(lr);
+ return (error);
+ }
+ tx = dmu_tx_create(zfsvfs->z_os);
+ dmu_tx_hold_sa(tx, zp->z_sa_hdl, B_FALSE);
+ zfs_sa_upgrade_txholds(tx, zp);
+ dmu_tx_mark_netfree(tx);
+ error = dmu_tx_assign(tx, TXG_WAIT);
+ if (error) {
+ dmu_tx_abort(tx);
+ rangelock_exit(lr);
+ return (error);
+ }
+
+ zp->z_size = end;
+ SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_SIZE(zfsvfs),
+ NULL, &zp->z_size, sizeof (zp->z_size));
+
+ if (end == 0) {
+ zp->z_pflags &= ~ZFS_SPARSE;
+ SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_FLAGS(zfsvfs),
+ NULL, &zp->z_pflags, 8);
+ }
+ VERIFY(sa_bulk_update(zp->z_sa_hdl, bulk, count, tx) == 0);
+
+ dmu_tx_commit(tx);
+
+ /*
+ * Clear any mapped pages in the truncated region. This has to
+ * happen outside of the transaction to avoid the possibility of
+ * a deadlock with someone trying to push a page that we are
+ * about to invalidate.
+ */
+ vnode_pager_setsize(vp, end);
+
+ rangelock_exit(lr);
+
+ return (0);
+}
+
+/*
+ * Free space in a file
+ *
+ * IN: zp - znode of file to free data in.
+ * off - start of range
+ * len - end of range (0 => EOF)
+ * flag - current file open mode flags.
+ * log - TRUE if this action should be logged
+ *
+ * RETURN: 0 on success, error code on failure
+ */
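+/*
+ * A sketch of the dispatch below: len == 0 requests truncation of the
+ * file to "off" bytes (zfs_trunc); off > z_size requests an extension
+ * to off + len bytes (zfs_extend); anything else frees the byte range
+ * [off, off + len) (zfs_free_range), followed by an extension if the
+ * range runs past EOF.
+ */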
+int
+zfs_freesp(znode_t *zp, uint64_t off, uint64_t len, int flag, boolean_t log)
+{
+ vnode_t *vp = ZTOV(zp);
+ dmu_tx_t *tx;
+ zfsvfs_t *zfsvfs = zp->z_zfsvfs;
+ zilog_t *zilog = zfsvfs->z_log;
+ uint64_t mode;
+ uint64_t mtime[2], ctime[2];
+ sa_bulk_attr_t bulk[3];
+ int count = 0;
+ int error;
+
+ if ((error = sa_lookup(zp->z_sa_hdl, SA_ZPL_MODE(zfsvfs), &mode,
+ sizeof (mode))) != 0)
+ return (error);
+
+ if (off > zp->z_size) {
+ error = zfs_extend(zp, off+len);
+ if (error == 0 && log)
+ goto log;
+ else
+ return (error);
+ }
+
+ /*
+ * Check for any locks in the region to be freed.
+ */
+
+ if (MANDLOCK(vp, (mode_t)mode)) {
+ uint64_t length = (len ? len : zp->z_size - off);
+ if ((error = chklock(vp, FWRITE, off, length, flag, NULL)) != 0)
+ return (error);
+ }
+
+ if (len == 0) {
+ error = zfs_trunc(zp, off);
+ } else {
+ if ((error = zfs_free_range(zp, off, len)) == 0 &&
+ off + len > zp->z_size)
+ error = zfs_extend(zp, off+len);
+ }
+ if (error || !log)
+ return (error);
+log:
+ tx = dmu_tx_create(zfsvfs->z_os);
+ dmu_tx_hold_sa(tx, zp->z_sa_hdl, B_FALSE);
+ zfs_sa_upgrade_txholds(tx, zp);
+ error = dmu_tx_assign(tx, TXG_WAIT);
+ if (error) {
+ dmu_tx_abort(tx);
+ return (error);
+ }
+
+ SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_MTIME(zfsvfs), NULL, mtime, 16);
+ SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_CTIME(zfsvfs), NULL, ctime, 16);
+ SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_FLAGS(zfsvfs),
+ NULL, &zp->z_pflags, 8);
+ zfs_tstamp_update_setup(zp, CONTENT_MODIFIED, mtime, ctime, B_TRUE);
+ error = sa_bulk_update(zp->z_sa_hdl, bulk, count, tx);
+ ASSERT(error == 0);
+
+ zfs_log_truncate(zilog, tx, TX_TRUNCATE, zp, off, len);
+
+ dmu_tx_commit(tx);
+ return (0);
+}
+
+void
+zfs_create_fs(objset_t *os, cred_t *cr, nvlist_t *zplprops, dmu_tx_t *tx)
+{
+ uint64_t moid, obj, sa_obj, version;
+ uint64_t sense = ZFS_CASE_SENSITIVE;
+ uint64_t norm = 0;
+ nvpair_t *elem;
+ int error;
+ int i;
+ znode_t *rootzp = NULL;
+ zfsvfs_t *zfsvfs;
+ vattr_t vattr;
+ znode_t *zp;
+ zfs_acl_ids_t acl_ids;
+
+ /*
+ * First attempt to create master node.
+ */
+ /*
+ * In an empty objset, there are no blocks to read and thus
+ * there can be no i/o errors (which we assert below).
+ */
+ moid = MASTER_NODE_OBJ;
+ error = zap_create_claim(os, moid, DMU_OT_MASTER_NODE,
+ DMU_OT_NONE, 0, tx);
+ ASSERT(error == 0);
+
+ /*
+ * Set starting attributes.
+ */
+ version = zfs_zpl_version_map(spa_version(dmu_objset_spa(os)));
+ elem = NULL;
+ while ((elem = nvlist_next_nvpair(zplprops, elem)) != NULL) {
+ /* For the moment we expect all zpl props to be uint64_ts */
+ uint64_t val;
+ char *name;
+
+ ASSERT(nvpair_type(elem) == DATA_TYPE_UINT64);
+ VERIFY(nvpair_value_uint64(elem, &val) == 0);
+ name = nvpair_name(elem);
+ if (strcmp(name, zfs_prop_to_name(ZFS_PROP_VERSION)) == 0) {
+ if (val < version)
+ version = val;
+ } else {
+ error = zap_update(os, moid, name, 8, 1, &val, tx);
+ }
+ ASSERT(error == 0);
+ if (strcmp(name, zfs_prop_to_name(ZFS_PROP_NORMALIZE)) == 0)
+ norm = val;
+ else if (strcmp(name, zfs_prop_to_name(ZFS_PROP_CASE)) == 0)
+ sense = val;
+ }
+ ASSERT(version != 0);
+ error = zap_update(os, moid, ZPL_VERSION_STR, 8, 1, &version, tx);
+
+ /*
+ * Create zap object used for SA attribute registration
+ */
+
+ if (version >= ZPL_VERSION_SA) {
+ sa_obj = zap_create(os, DMU_OT_SA_MASTER_NODE,
+ DMU_OT_NONE, 0, tx);
+ error = zap_add(os, moid, ZFS_SA_ATTRS, 8, 1, &sa_obj, tx);
+ ASSERT(error == 0);
+ } else {
+ sa_obj = 0;
+ }
+ /*
+ * Create a delete queue.
+ */
+ obj = zap_create(os, DMU_OT_UNLINKED_SET, DMU_OT_NONE, 0, tx);
+
+ error = zap_add(os, moid, ZFS_UNLINKED_SET, 8, 1, &obj, tx);
+ ASSERT(error == 0);
+
+ /*
+ * Create root znode. Create minimal znode/vnode/zfsvfs
+ * to allow zfs_mknode to work.
+ */
+ VATTR_NULL(&vattr);
+ vattr.va_mask = AT_MODE|AT_UID|AT_GID|AT_TYPE;
+ vattr.va_type = VDIR;
+ vattr.va_mode = S_IFDIR|0755;
+ vattr.va_uid = crgetuid(cr);
+ vattr.va_gid = crgetgid(cr);
+
+ zfsvfs = kmem_zalloc(sizeof (zfsvfs_t), KM_SLEEP);
+
+ rootzp = zfs_znode_alloc_kmem(KM_SLEEP);
+ ASSERT(!POINTER_IS_VALID(rootzp->z_zfsvfs));
+ rootzp->z_moved = 0;
+ rootzp->z_unlinked = 0;
+ rootzp->z_atime_dirty = 0;
+ rootzp->z_is_sa = USE_SA(version, os);
+
+ zfsvfs->z_os = os;
+ zfsvfs->z_parent = zfsvfs;
+ zfsvfs->z_version = version;
+ zfsvfs->z_use_fuids = USE_FUIDS(version, os);
+ zfsvfs->z_use_sa = USE_SA(version, os);
+ zfsvfs->z_norm = norm;
+
+ error = sa_setup(os, sa_obj, zfs_attr_table, ZPL_END,
+ &zfsvfs->z_attr_table);
+
+ ASSERT(error == 0);
+
+ /*
+ * Fold case on file systems that are always or sometimes case
+ * insensitive.
+ */
+ if (sense == ZFS_CASE_INSENSITIVE || sense == ZFS_CASE_MIXED)
+ zfsvfs->z_norm |= U8_TEXTPREP_TOUPPER;
+
+ mutex_init(&zfsvfs->z_znodes_lock, NULL, MUTEX_DEFAULT, NULL);
+ list_create(&zfsvfs->z_all_znodes, sizeof (znode_t),
+ offsetof(znode_t, z_link_node));
+
+ for (i = 0; i != ZFS_OBJ_MTX_SZ; i++)
+ mutex_init(&zfsvfs->z_hold_mtx[i], NULL, MUTEX_DEFAULT, NULL);
+
+ rootzp->z_zfsvfs = zfsvfs;
+ VERIFY(0 == zfs_acl_ids_create(rootzp, IS_ROOT_NODE, &vattr,
+ cr, NULL, &acl_ids));
+ zfs_mknode(rootzp, &vattr, tx, cr, IS_ROOT_NODE, &zp, &acl_ids);
+ ASSERT3P(zp, ==, rootzp);
+ error = zap_add(os, moid, ZFS_ROOT_OBJ, 8, 1, &rootzp->z_id, tx);
+ ASSERT(error == 0);
+ zfs_acl_ids_free(&acl_ids);
+ POINTER_INVALIDATE(&rootzp->z_zfsvfs);
+
+ sa_handle_destroy(rootzp->z_sa_hdl);
+ zfs_znode_free_kmem(rootzp);
+
+ /*
+ * Create shares directory
+ */
+
+ error = zfs_create_share_dir(zfsvfs, tx);
+
+ ASSERT(error == 0);
+
+ for (i = 0; i != ZFS_OBJ_MTX_SZ; i++)
+ mutex_destroy(&zfsvfs->z_hold_mtx[i]);
+ kmem_free(zfsvfs, sizeof (zfsvfs_t));
+}
+#endif /* _KERNEL */
+
+static int
+zfs_sa_setup(objset_t *osp, sa_attr_type_t **sa_table)
+{
+ uint64_t sa_obj = 0;
+ int error;
+
+ error = zap_lookup(osp, MASTER_NODE_OBJ, ZFS_SA_ATTRS, 8, 1, &sa_obj);
+ if (error != 0 && error != ENOENT)
+ return (error);
+
+ error = sa_setup(osp, sa_obj, zfs_attr_table, ZPL_END, sa_table);
+ return (error);
+}
+
+static int
+zfs_grab_sa_handle(objset_t *osp, uint64_t obj, sa_handle_t **hdlp,
+ dmu_buf_t **db, void *tag)
+{
+ dmu_object_info_t doi;
+ int error;
+
+ if ((error = sa_buf_hold(osp, obj, tag, db)) != 0)
+ return (error);
+
+ dmu_object_info_from_db(*db, &doi);
+ if ((doi.doi_bonus_type != DMU_OT_SA &&
+ doi.doi_bonus_type != DMU_OT_ZNODE) ||
+ (doi.doi_bonus_type == DMU_OT_ZNODE &&
+ doi.doi_bonus_size < sizeof (znode_phys_t))) {
+ sa_buf_rele(*db, tag);
+ return (SET_ERROR(ENOTSUP));
+ }
+
+ error = sa_handle_get(osp, obj, NULL, SA_HDL_PRIVATE, hdlp);
+ if (error != 0) {
+ sa_buf_rele(*db, tag);
+ return (error);
+ }
+
+ return (0);
+}
+
+void
+zfs_release_sa_handle(sa_handle_t *hdl, dmu_buf_t *db, void *tag)
+{
+ sa_handle_destroy(hdl);
+ sa_buf_rele(db, tag);
+}
+
+/*
+ * Given an object number, return its parent object number and whether
+ * or not the object is an extended attribute directory.
+ */
+static int
+zfs_obj_to_pobj(objset_t *osp, sa_handle_t *hdl, sa_attr_type_t *sa_table,
+ uint64_t *pobjp, int *is_xattrdir)
+{
+ uint64_t parent;
+ uint64_t pflags;
+ uint64_t mode;
+ uint64_t parent_mode;
+ sa_bulk_attr_t bulk[3];
+ sa_handle_t *sa_hdl;
+ dmu_buf_t *sa_db;
+ int count = 0;
+ int error;
+
+ SA_ADD_BULK_ATTR(bulk, count, sa_table[ZPL_PARENT], NULL,
+ &parent, sizeof (parent));
+ SA_ADD_BULK_ATTR(bulk, count, sa_table[ZPL_FLAGS], NULL,
+ &pflags, sizeof (pflags));
+ SA_ADD_BULK_ATTR(bulk, count, sa_table[ZPL_MODE], NULL,
+ &mode, sizeof (mode));
+
+ if ((error = sa_bulk_lookup(hdl, bulk, count)) != 0)
+ return (error);
+
+ /*
+ * When a link is removed, its parent pointer is not changed and will
+ * be invalid. There are two cases where a link is removed but the
+ * file stays around: when it goes to the delete queue and when there
+ * are additional links.
+ */
+ error = zfs_grab_sa_handle(osp, parent, &sa_hdl, &sa_db, FTAG);
+ if (error != 0)
+ return (error);
+
+ error = sa_lookup(sa_hdl, ZPL_MODE, &parent_mode, sizeof (parent_mode));
+ zfs_release_sa_handle(sa_hdl, sa_db, FTAG);
+ if (error != 0)
+ return (error);
+
+ *is_xattrdir = ((pflags & ZFS_XATTR) != 0) && S_ISDIR(mode);
+
+ /*
+ * Extended attributes can be applied to files, directories, etc.
+ * Otherwise the parent must be a directory.
+ */
+ if (!*is_xattrdir && !S_ISDIR(parent_mode))
+ return (SET_ERROR(EINVAL));
+
+ *pobjp = parent;
+
+ return (0);
+}
+
+/*
+ * Given an object number, return some zpl level statistics
+ */
+static int
+zfs_obj_to_stats_impl(sa_handle_t *hdl, sa_attr_type_t *sa_table,
+ zfs_stat_t *sb)
+{
+ sa_bulk_attr_t bulk[4];
+ int count = 0;
+
+ SA_ADD_BULK_ATTR(bulk, count, sa_table[ZPL_MODE], NULL,
+ &sb->zs_mode, sizeof (sb->zs_mode));
+ SA_ADD_BULK_ATTR(bulk, count, sa_table[ZPL_GEN], NULL,
+ &sb->zs_gen, sizeof (sb->zs_gen));
+ SA_ADD_BULK_ATTR(bulk, count, sa_table[ZPL_LINKS], NULL,
+ &sb->zs_links, sizeof (sb->zs_links));
+ SA_ADD_BULK_ATTR(bulk, count, sa_table[ZPL_CTIME], NULL,
+ &sb->zs_ctime, sizeof (sb->zs_ctime));
+
+ return (sa_bulk_lookup(hdl, bulk, count));
+}
+
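+/*
+ * Build the path of an object by walking parent pointers up to the root,
+ * prepending each "/<name>" component at the tail of the buffer. Sketch
+ * with hypothetical names: for an object reachable as /a/b/c, path starts
+ * at the terminating NUL in buf[len - 1] and grows leftwards,
+ *
+ *	"/c" -> "/b/c" -> "/a/b/c"
+ *
+ * after which memmove() shifts the result to the front of buf. Objects
+ * still on the delete queue have stale parent links, so they are rejected
+ * early with ESTALE.
+ */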
+static int
+zfs_obj_to_path_impl(objset_t *osp, uint64_t obj, sa_handle_t *hdl,
+ sa_attr_type_t *sa_table, char *buf, int len)
+{
+ sa_handle_t *sa_hdl;
+ sa_handle_t *prevhdl = NULL;
+ dmu_buf_t *prevdb = NULL;
+ dmu_buf_t *sa_db = NULL;
+ char *path = buf + len - 1;
+ int error;
+
+ *path = '\0';
+ sa_hdl = hdl;
+
+ uint64_t deleteq_obj;
+ VERIFY0(zap_lookup(osp, MASTER_NODE_OBJ,
+ ZFS_UNLINKED_SET, sizeof (uint64_t), 1, &deleteq_obj));
+ error = zap_lookup_int(osp, deleteq_obj, obj);
+ if (error == 0) {
+ return (ESTALE);
+ } else if (error != ENOENT) {
+ return (error);
+ }
+ error = 0;
+
+ for (;;) {
+ uint64_t pobj;
+ char component[MAXNAMELEN + 2];
+ size_t complen;
+ int is_xattrdir;
+
+ if (prevdb)
+ zfs_release_sa_handle(prevhdl, prevdb, FTAG);
+
+ if ((error = zfs_obj_to_pobj(osp, sa_hdl, sa_table, &pobj,
+ &is_xattrdir)) != 0)
+ break;
+
+ if (pobj == obj) {
+ if (path[0] != '/')
+ *--path = '/';
+ break;
+ }
+
+ component[0] = '/';
+ if (is_xattrdir) {
+ (void) sprintf(component + 1, "<xattrdir>");
+ } else {
+ error = zap_value_search(osp, pobj, obj,
+ ZFS_DIRENT_OBJ(-1ULL), component + 1);
+ if (error != 0)
+ break;
+ }
+
+ complen = strlen(component);
+ path -= complen;
+ ASSERT(path >= buf);
+ bcopy(component, path, complen);
+ obj = pobj;
+
+ if (sa_hdl != hdl) {
+ prevhdl = sa_hdl;
+ prevdb = sa_db;
+ }
+ error = zfs_grab_sa_handle(osp, obj, &sa_hdl, &sa_db, FTAG);
+ if (error != 0) {
+ sa_hdl = prevhdl;
+ sa_db = prevdb;
+ break;
+ }
+ }
+
+ if (sa_hdl != NULL && sa_hdl != hdl) {
+ ASSERT(sa_db != NULL);
+ zfs_release_sa_handle(sa_hdl, sa_db, FTAG);
+ }
+
+ if (error == 0)
+ (void) memmove(buf, path, buf + len - path);
+
+ return (error);
+}
+
+int
+zfs_obj_to_path(objset_t *osp, uint64_t obj, char *buf, int len)
+{
+ sa_attr_type_t *sa_table;
+ sa_handle_t *hdl;
+ dmu_buf_t *db;
+ int error;
+
+ error = zfs_sa_setup(osp, &sa_table);
+ if (error != 0)
+ return (error);
+
+ error = zfs_grab_sa_handle(osp, obj, &hdl, &db, FTAG);
+ if (error != 0)
+ return (error);
+
+ error = zfs_obj_to_path_impl(osp, obj, hdl, sa_table, buf, len);
+
+ zfs_release_sa_handle(hdl, db, FTAG);
+ return (error);
+}
+
+int
+zfs_obj_to_stats(objset_t *osp, uint64_t obj, zfs_stat_t *sb,
+ char *buf, int len)
+{
+ char *path = buf + len - 1;
+ sa_attr_type_t *sa_table;
+ sa_handle_t *hdl;
+ dmu_buf_t *db;
+ int error;
+
+ *path = '\0';
+
+ error = zfs_sa_setup(osp, &sa_table);
+ if (error != 0)
+ return (error);
+
+ error = zfs_grab_sa_handle(osp, obj, &hdl, &db, FTAG);
+ if (error != 0)
+ return (error);
+
+ error = zfs_obj_to_stats_impl(hdl, sa_table, sb);
+ if (error != 0) {
+ zfs_release_sa_handle(hdl, db, FTAG);
+ return (error);
+ }
+
+ error = zfs_obj_to_path_impl(osp, obj, hdl, sa_table, buf, len);
+
+ zfs_release_sa_handle(hdl, db, FTAG);
+ return (error);
+}
+
+#ifdef _KERNEL
+int
+zfs_znode_parent_and_name(znode_t *zp, znode_t **dzpp, char *buf)
+{
+ zfsvfs_t *zfsvfs = zp->z_zfsvfs;
+ uint64_t parent;
+ int is_xattrdir;
+ int err;
+
+ /* Extended attributes should not be visible as regular files. */
+ if ((zp->z_pflags & ZFS_XATTR) != 0)
+ return (SET_ERROR(EINVAL));
+
+ err = zfs_obj_to_pobj(zfsvfs->z_os, zp->z_sa_hdl, zfsvfs->z_attr_table,
+ &parent, &is_xattrdir);
+ if (err != 0)
+ return (err);
+ ASSERT0(is_xattrdir);
+
+ /* No name as this is a root object. */
+ if (parent == zp->z_id)
+ return (SET_ERROR(EINVAL));
+
+ err = zap_value_search(zfsvfs->z_os, parent, zp->z_id,
+ ZFS_DIRENT_OBJ(-1ULL), buf);
+ if (err != 0)
+ return (err);
+ err = zfs_zget(zfsvfs, parent, dzpp);
+ return (err);
+}
+#endif /* _KERNEL */
diff --git a/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/zil.c b/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/zil.c
new file mode 100644
index 000000000000..a2b9f9bbeaa0
--- /dev/null
+++ b/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/zil.c
@@ -0,0 +1,3499 @@
+/*
+ * CDDL HEADER START
+ *
+ * The contents of this file are subject to the terms of the
+ * Common Development and Distribution License (the "License").
+ * You may not use this file except in compliance with the License.
+ *
+ * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
+ * or http://www.opensolaris.org/os/licensing.
+ * See the License for the specific language governing permissions
+ * and limitations under the License.
+ *
+ * When distributing Covered Code, include this CDDL HEADER in each
+ * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
+ * If applicable, add the following below this CDDL HEADER, with the
+ * fields enclosed by brackets "[]" replaced with your own identifying
+ * information: Portions Copyright [yyyy] [name of copyright owner]
+ *
+ * CDDL HEADER END
+ */
+/*
+ * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved.
+ * Copyright (c) 2011, 2018 by Delphix. All rights reserved.
+ * Copyright (c) 2014 Integros [integros.com]
+ */
+
+/* Portions Copyright 2010 Robert Milkowski */
+
+#include <sys/zfs_context.h>
+#include <sys/spa.h>
+#include <sys/spa_impl.h>
+#include <sys/dmu.h>
+#include <sys/zap.h>
+#include <sys/arc.h>
+#include <sys/stat.h>
+#include <sys/resource.h>
+#include <sys/zil.h>
+#include <sys/zil_impl.h>
+#include <sys/dsl_dataset.h>
+#include <sys/vdev_impl.h>
+#include <sys/dmu_tx.h>
+#include <sys/dsl_pool.h>
+#include <sys/abd.h>
+
+/*
+ * The ZFS Intent Log (ZIL) saves "transaction records" (itxs) of system
+ * calls that change the file system. Each itx has enough information to
+ * be able to replay them after a system crash, power loss, or
+ * equivalent failure mode. These are stored in memory until either:
+ *
+ * 1. they are committed to the pool by the DMU transaction group
+ * (txg), at which point they can be discarded; or
+ * 2. they are committed to the on-disk ZIL for the dataset being
+ * modified (e.g. due to an fsync, O_DSYNC, or other synchronous
+ * requirement).
+ *
+ * In the event of a crash or power loss, the itxs contained by each
+ * dataset's on-disk ZIL will be replayed when that dataset is first
+ * instantiated (e.g. if the dataset is a normal filesystem, when it is
+ * first mounted).
+ *
+ * As hinted at above, there is one ZIL per dataset (both the in-memory
+ * representation, and the on-disk representation). The on-disk format
+ * consists of 3 parts:
+ *
+ * - a single, per-dataset, ZIL header; which points to a chain of
+ * - zero or more ZIL blocks; each of which contains
+ * - zero or more ZIL records
+ *
+ * A ZIL record holds the information necessary to replay a single
+ * system call transaction. A ZIL block can hold many ZIL records, and
+ * the blocks are chained together, similarly to a singly linked list.
+ *
+ * Each ZIL block contains a block pointer (blkptr_t) to the next ZIL
+ * block in the chain, and the ZIL header points to the first block in
+ * the chain.
+ *
+ * Note, there is not a fixed place in the pool to hold these ZIL
+ * blocks; they are dynamically allocated and freed as needed from the
+ * blocks available on the pool, though they can be preferentially
+ * allocated from a dedicated "log" vdev.
+ */
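+
+/*
+ * A rough picture of the layout described above (sketch only; see
+ * zil_impl.h for the authoritative structures):
+ *
+ *	zil_header_t --> ZIL block --> ZIL block --> ... (invalid/hole: end)
+ *	                 [records]     [records]
+ *
+ * Each block embeds the blkptr_t of its successor, so replay walks the
+ * chain from zh_log until an invalid or missing block is found.
+ */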
+
+/*
+ * This controls the amount of time that a ZIL block (lwb) will remain
+ * "open" when it isn't "full", and it has a thread waiting for it to be
+ * committed to stable storage. Please refer to the zil_commit_waiter()
+ * function (and the comments within it) for more details.
+ */
+int zfs_commit_timeout_pct = 5;
+
+/*
+ * Disable intent logging replay. This global ZIL switch affects all pools.
+ */
+int zil_replay_disable = 0;
+SYSCTL_DECL(_vfs_zfs);
+SYSCTL_INT(_vfs_zfs, OID_AUTO, zil_replay_disable, CTLFLAG_RWTUN,
+ &zil_replay_disable, 0, "Disable intent logging replay");
+
+/*
+ * Disable the DKIOCFLUSHWRITECACHE commands that are normally sent to
+ * the disk(s) by the ZIL after an LWB write has completed. Setting this
+ * will cause ZIL corruption on power loss if a volatile out-of-order
+ * write cache is enabled.
+ */
+boolean_t zil_nocacheflush = B_FALSE;
+SYSCTL_INT(_vfs_zfs, OID_AUTO, zil_nocacheflush, CTLFLAG_RWTUN,
+ &zil_nocacheflush, 0, "Disable ZIL cache flush");
+
+boolean_t zfs_trim_enabled = B_TRUE;
+SYSCTL_DECL(_vfs_zfs_trim);
+SYSCTL_INT(_vfs_zfs_trim, OID_AUTO, enabled, CTLFLAG_RDTUN, &zfs_trim_enabled, 0,
+ "Enable ZFS TRIM");
+
+/*
+ * Limit SLOG write size per commit executed with synchronous priority.
+ * Any writes above that will be executed with lower (asynchronous) priority
+ * to limit potential SLOG device abuse by a single active ZIL writer.
+ */
+uint64_t zil_slog_bulk = 768 * 1024;
+SYSCTL_QUAD(_vfs_zfs, OID_AUTO, zil_slog_bulk, CTLFLAG_RWTUN,
+ &zil_slog_bulk, 0, "Maximal SLOG commit size with sync priority");
+
+static kmem_cache_t *zil_lwb_cache;
+static kmem_cache_t *zil_zcw_cache;
+
+#define LWB_EMPTY(lwb) ((BP_GET_LSIZE(&lwb->lwb_blk) - \
+ sizeof (zil_chain_t)) == (lwb->lwb_sz - lwb->lwb_nused))
+
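+/*
+ * Descriptive note: the zl_bp_tree AVL tree below records the DVAs of ZIL
+ * blocks already visited during claim/free walks; zil_bp_tree_add()
+ * returns EEXIST on a repeat visit, which callers such as
+ * zil_claim_log_block() and zil_free_log_record() use to make sure each
+ * block is claimed or freed at most once.
+ */
+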
+static int
+zil_bp_compare(const void *x1, const void *x2)
+{
+ const dva_t *dva1 = &((zil_bp_node_t *)x1)->zn_dva;
+ const dva_t *dva2 = &((zil_bp_node_t *)x2)->zn_dva;
+
+ int cmp = AVL_CMP(DVA_GET_VDEV(dva1), DVA_GET_VDEV(dva2));
+ if (likely(cmp))
+ return (cmp);
+
+ return (AVL_CMP(DVA_GET_OFFSET(dva1), DVA_GET_OFFSET(dva2)));
+}
+
+static void
+zil_bp_tree_init(zilog_t *zilog)
+{
+ avl_create(&zilog->zl_bp_tree, zil_bp_compare,
+ sizeof (zil_bp_node_t), offsetof(zil_bp_node_t, zn_node));
+}
+
+static void
+zil_bp_tree_fini(zilog_t *zilog)
+{
+ avl_tree_t *t = &zilog->zl_bp_tree;
+ zil_bp_node_t *zn;
+ void *cookie = NULL;
+
+ while ((zn = avl_destroy_nodes(t, &cookie)) != NULL)
+ kmem_free(zn, sizeof (zil_bp_node_t));
+
+ avl_destroy(t);
+}
+
+int
+zil_bp_tree_add(zilog_t *zilog, const blkptr_t *bp)
+{
+ avl_tree_t *t = &zilog->zl_bp_tree;
+ const dva_t *dva;
+ zil_bp_node_t *zn;
+ avl_index_t where;
+
+ if (BP_IS_EMBEDDED(bp))
+ return (0);
+
+ dva = BP_IDENTITY(bp);
+
+ if (avl_find(t, dva, &where) != NULL)
+ return (SET_ERROR(EEXIST));
+
+ zn = kmem_alloc(sizeof (zil_bp_node_t), KM_SLEEP);
+ zn->zn_dva = *dva;
+ avl_insert(t, zn, where);
+
+ return (0);
+}
+
+static zil_header_t *
+zil_header_in_syncing_context(zilog_t *zilog)
+{
+ return ((zil_header_t *)zilog->zl_header);
+}
+
+static void
+zil_init_log_chain(zilog_t *zilog, blkptr_t *bp)
+{
+ zio_cksum_t *zc = &bp->blk_cksum;
+
+ zc->zc_word[ZIL_ZC_GUID_0] = spa_get_random(-1ULL);
+ zc->zc_word[ZIL_ZC_GUID_1] = spa_get_random(-1ULL);
+ zc->zc_word[ZIL_ZC_OBJSET] = dmu_objset_id(zilog->zl_os);
+ zc->zc_word[ZIL_ZC_SEQ] = 1ULL;
+}
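+
+/*
+ * Descriptive note: the checksum words seeded here are what the chain
+ * validation in zil_read_log_block() later compares against -- two random
+ * GUID words tie a block to this particular chain, the objset word ties
+ * it to this dataset, and the sequence word starts at 1 and increments
+ * once per block.
+ */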
+
+/*
+ * Read a log block and make sure it's valid.
+ */
+static int
+zil_read_log_block(zilog_t *zilog, const blkptr_t *bp, blkptr_t *nbp, void *dst,
+ char **end)
+{
+ enum zio_flag zio_flags = ZIO_FLAG_CANFAIL;
+ arc_flags_t aflags = ARC_FLAG_WAIT;
+ arc_buf_t *abuf = NULL;
+ zbookmark_phys_t zb;
+ int error;
+
+ if (zilog->zl_header->zh_claim_txg == 0)
+ zio_flags |= ZIO_FLAG_SPECULATIVE | ZIO_FLAG_SCRUB;
+
+ if (!(zilog->zl_header->zh_flags & ZIL_CLAIM_LR_SEQ_VALID))
+ zio_flags |= ZIO_FLAG_SPECULATIVE;
+
+ SET_BOOKMARK(&zb, bp->blk_cksum.zc_word[ZIL_ZC_OBJSET],
+ ZB_ZIL_OBJECT, ZB_ZIL_LEVEL, bp->blk_cksum.zc_word[ZIL_ZC_SEQ]);
+
+ error = arc_read(NULL, zilog->zl_spa, bp, arc_getbuf_func, &abuf,
+ ZIO_PRIORITY_SYNC_READ, zio_flags, &aflags, &zb);
+
+ if (error == 0) {
+ zio_cksum_t cksum = bp->blk_cksum;
+
+ /*
+ * Validate the checksummed log block.
+ *
+ * Sequence numbers should be... sequential. The checksum
+ * verifier for the next block should be bp's checksum plus 1.
+ *
+ * Also check the log chain linkage and size used.
+ */
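+ /*
+ * For instance, with hypothetical values: if this block's checksum
+ * carries ZIL_ZC_SEQ == 41, then the next-block pointer recorded in
+ * this block must carry the same checksum words with sequence 42;
+ * any mismatch (or a hole) terminates the chain with ECKSUM.
+ */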
+ cksum.zc_word[ZIL_ZC_SEQ]++;
+
+ if (BP_GET_CHECKSUM(bp) == ZIO_CHECKSUM_ZILOG2) {
+ zil_chain_t *zilc = abuf->b_data;
+ char *lr = (char *)(zilc + 1);
+ uint64_t len = zilc->zc_nused - sizeof (zil_chain_t);
+
+ if (bcmp(&cksum, &zilc->zc_next_blk.blk_cksum,
+ sizeof (cksum)) || BP_IS_HOLE(&zilc->zc_next_blk)) {
+ error = SET_ERROR(ECKSUM);
+ } else {
+ ASSERT3U(len, <=, SPA_OLD_MAXBLOCKSIZE);
+ bcopy(lr, dst, len);
+ *end = (char *)dst + len;
+ *nbp = zilc->zc_next_blk;
+ }
+ } else {
+ char *lr = abuf->b_data;
+ uint64_t size = BP_GET_LSIZE(bp);
+ zil_chain_t *zilc = (zil_chain_t *)(lr + size) - 1;
+
+ if (bcmp(&cksum, &zilc->zc_next_blk.blk_cksum,
+ sizeof (cksum)) || BP_IS_HOLE(&zilc->zc_next_blk) ||
+ (zilc->zc_nused > (size - sizeof (*zilc)))) {
+ error = SET_ERROR(ECKSUM);
+ } else {
+ ASSERT3U(zilc->zc_nused, <=,
+ SPA_OLD_MAXBLOCKSIZE);
+ bcopy(lr, dst, zilc->zc_nused);
+ *end = (char *)dst + zilc->zc_nused;
+ *nbp = zilc->zc_next_blk;
+ }
+ }
+
+ arc_buf_destroy(abuf, &abuf);
+ }
+
+ return (error);
+}
+
+/*
+ * Read a TX_WRITE log data block.
+ */
+static int
+zil_read_log_data(zilog_t *zilog, const lr_write_t *lr, void *wbuf)
+{
+ enum zio_flag zio_flags = ZIO_FLAG_CANFAIL;
+ const blkptr_t *bp = &lr->lr_blkptr;
+ arc_flags_t aflags = ARC_FLAG_WAIT;
+ arc_buf_t *abuf = NULL;
+ zbookmark_phys_t zb;
+ int error;
+
+ if (BP_IS_HOLE(bp)) {
+ if (wbuf != NULL)
+ bzero(wbuf, MAX(BP_GET_LSIZE(bp), lr->lr_length));
+ return (0);
+ }
+
+ if (zilog->zl_header->zh_claim_txg == 0)
+ zio_flags |= ZIO_FLAG_SPECULATIVE | ZIO_FLAG_SCRUB;
+
+ SET_BOOKMARK(&zb, dmu_objset_id(zilog->zl_os), lr->lr_foid,
+ ZB_ZIL_LEVEL, lr->lr_offset / BP_GET_LSIZE(bp));
+
+ error = arc_read(NULL, zilog->zl_spa, bp, arc_getbuf_func, &abuf,
+ ZIO_PRIORITY_SYNC_READ, zio_flags, &aflags, &zb);
+
+ if (error == 0) {
+ if (wbuf != NULL)
+ bcopy(abuf->b_data, wbuf, arc_buf_size(abuf));
+ arc_buf_destroy(abuf, &abuf);
+ }
+
+ return (error);
+}
+
+/*
+ * Parse the intent log, and call parse_func for each valid record within.
+ */
+int
+zil_parse(zilog_t *zilog, zil_parse_blk_func_t *parse_blk_func,
+ zil_parse_lr_func_t *parse_lr_func, void *arg, uint64_t txg)
+{
+ const zil_header_t *zh = zilog->zl_header;
+ boolean_t claimed = !!zh->zh_claim_txg;
+ uint64_t claim_blk_seq = claimed ? zh->zh_claim_blk_seq : UINT64_MAX;
+ uint64_t claim_lr_seq = claimed ? zh->zh_claim_lr_seq : UINT64_MAX;
+ uint64_t max_blk_seq = 0;
+ uint64_t max_lr_seq = 0;
+ uint64_t blk_count = 0;
+ uint64_t lr_count = 0;
+ blkptr_t blk, next_blk;
+ char *lrbuf, *lrp;
+ int error = 0;
+
+ /*
+ * Old logs didn't record the maximum zh_claim_lr_seq.
+ */
+ if (!(zh->zh_flags & ZIL_CLAIM_LR_SEQ_VALID))
+ claim_lr_seq = UINT64_MAX;
+
+ /*
+ * Starting at the block pointed to by zh_log we read the log chain.
+ * For each block in the chain we strongly check that block to
+ * ensure its validity. We stop when an invalid block is found.
+ * For each block pointer in the chain we call parse_blk_func().
+ * For each record in each valid block we call parse_lr_func().
+ * If the log has been claimed, stop if we encounter a sequence
+ * number greater than the highest claimed sequence number.
+ */
+ lrbuf = zio_buf_alloc(SPA_OLD_MAXBLOCKSIZE);
+ zil_bp_tree_init(zilog);
+
+ for (blk = zh->zh_log; !BP_IS_HOLE(&blk); blk = next_blk) {
+ uint64_t blk_seq = blk.blk_cksum.zc_word[ZIL_ZC_SEQ];
+ int reclen;
+ char *end;
+
+ if (blk_seq > claim_blk_seq)
+ break;
+ if ((error = parse_blk_func(zilog, &blk, arg, txg)) != 0)
+ break;
+ ASSERT3U(max_blk_seq, <, blk_seq);
+ max_blk_seq = blk_seq;
+ blk_count++;
+
+ if (max_lr_seq == claim_lr_seq && max_blk_seq == claim_blk_seq)
+ break;
+
+ error = zil_read_log_block(zilog, &blk, &next_blk, lrbuf, &end);
+ if (error != 0)
+ break;
+
+ for (lrp = lrbuf; lrp < end; lrp += reclen) {
+ lr_t *lr = (lr_t *)lrp;
+ reclen = lr->lrc_reclen;
+ ASSERT3U(reclen, >=, sizeof (lr_t));
+ if (lr->lrc_seq > claim_lr_seq)
+ goto done;
+ if ((error = parse_lr_func(zilog, lr, arg, txg)) != 0)
+ goto done;
+ ASSERT3U(max_lr_seq, <, lr->lrc_seq);
+ max_lr_seq = lr->lrc_seq;
+ lr_count++;
+ }
+ }
+done:
+ zilog->zl_parse_error = error;
+ zilog->zl_parse_blk_seq = max_blk_seq;
+ zilog->zl_parse_lr_seq = max_lr_seq;
+ zilog->zl_parse_blk_count = blk_count;
+ zilog->zl_parse_lr_count = lr_count;
+
+ ASSERT(!claimed || !(zh->zh_flags & ZIL_CLAIM_LR_SEQ_VALID) ||
+ (max_blk_seq == claim_blk_seq && max_lr_seq == claim_lr_seq));
+
+ zil_bp_tree_fini(zilog);
+ zio_buf_free(lrbuf, SPA_OLD_MAXBLOCKSIZE);
+
+ return (error);
+}
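+
+/*
+ * An illustrative, self-contained sketch of the chain walk above. The
+ * types below are hypothetical stand-ins (not blkptr_t or the embedded
+ * ZIL checksum); only the control flow mirrors zil_parse(): follow each
+ * block's "next" pointer, and treat the first block that fails
+ * verification, or whose sequence number exceeds the claimed maximum,
+ * as the end of the log.
+ */
+struct zil_sketch_blk {
+ struct zil_sketch_blk *zsb_next; /* stand-in for zc_next_blk */
+ uint64_t zsb_seq; /* stand-in for ZIL_ZC_SEQ */
+};
+
+static inline uint64_t
+zil_sketch_walk_chain(struct zil_sketch_blk *blk, uint64_t claim_seq,
+ int (*verify)(struct zil_sketch_blk *))
+{
+ uint64_t max_seq = 0;
+
+ for (; blk != NULL; blk = blk->zsb_next) {
+ if (blk->zsb_seq > claim_seq || verify(blk) != 0)
+ break; /* an invalid block ends the chain */
+ max_seq = blk->zsb_seq;
+ }
+ return (max_seq);
+}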
+
+/* ARGSUSED */
+static int
+zil_clear_log_block(zilog_t *zilog, blkptr_t *bp, void *tx, uint64_t first_txg)
+{
+ ASSERT(!BP_IS_HOLE(bp));
+
+ /*
+ * As we call this function from the context of a rewind to a
+ * checkpoint, each ZIL block whose txg is later than the txg
+ * that we rewind to is invalid. Thus, we return -1 so
+ * zil_parse() doesn't attempt to read it.
+ */
+ if (bp->blk_birth >= first_txg)
+ return (-1);
+
+ if (zil_bp_tree_add(zilog, bp) != 0)
+ return (0);
+
+ zio_free(zilog->zl_spa, first_txg, bp);
+ return (0);
+}
+
+/* ARGSUSED */
+static int
+zil_noop_log_record(zilog_t *zilog, lr_t *lrc, void *tx, uint64_t first_txg)
+{
+ return (0);
+}
+
+static int
+zil_claim_log_block(zilog_t *zilog, blkptr_t *bp, void *tx, uint64_t first_txg)
+{
+ /*
+ * Claim log block if not already committed and not already claimed.
+ * If tx == NULL, just verify that the block is claimable.
+ */
+ if (BP_IS_HOLE(bp) || bp->blk_birth < first_txg ||
+ zil_bp_tree_add(zilog, bp) != 0)
+ return (0);
+
+ return (zio_wait(zio_claim(NULL, zilog->zl_spa,
+ tx == NULL ? 0 : first_txg, bp, spa_claim_notify, NULL,
+ ZIO_FLAG_CANFAIL | ZIO_FLAG_SPECULATIVE | ZIO_FLAG_SCRUB)));
+}
+
+static int
+zil_claim_log_record(zilog_t *zilog, lr_t *lrc, void *tx, uint64_t first_txg)
+{
+ lr_write_t *lr = (lr_write_t *)lrc;
+ int error;
+
+ if (lrc->lrc_txtype != TX_WRITE)
+ return (0);
+
+ /*
+ * If the block is not readable, don't claim it. This can happen
+ * in normal operation when a log block is written to disk before
+ * some of the dmu_sync() blocks it points to. In this case, the
+ * transaction cannot have been committed to anyone (we would have
+ * waited for all writes to be stable first), so it is semantically
+ * correct to declare this the end of the log.
+ */
+ if (lr->lr_blkptr.blk_birth >= first_txg &&
+ (error = zil_read_log_data(zilog, lr, NULL)) != 0)
+ return (error);
+ return (zil_claim_log_block(zilog, &lr->lr_blkptr, tx, first_txg));
+}
+
+/* ARGSUSED */
+static int
+zil_free_log_block(zilog_t *zilog, blkptr_t *bp, void *tx, uint64_t claim_txg)
+{
+ zio_free(zilog->zl_spa, dmu_tx_get_txg(tx), bp);
+
+ return (0);
+}
+
+static int
+zil_free_log_record(zilog_t *zilog, lr_t *lrc, void *tx, uint64_t claim_txg)
+{
+ lr_write_t *lr = (lr_write_t *)lrc;
+ blkptr_t *bp = &lr->lr_blkptr;
+
+ /*
+ * If we previously claimed it, we need to free it.
+ */
+ if (claim_txg != 0 && lrc->lrc_txtype == TX_WRITE &&
+ bp->blk_birth >= claim_txg && zil_bp_tree_add(zilog, bp) == 0 &&
+ !BP_IS_HOLE(bp))
+ zio_free(zilog->zl_spa, dmu_tx_get_txg(tx), bp);
+
+ return (0);
+}
+
+static int
+zil_lwb_vdev_compare(const void *x1, const void *x2)
+{
+ const uint64_t v1 = ((zil_vdev_node_t *)x1)->zv_vdev;
+ const uint64_t v2 = ((zil_vdev_node_t *)x2)->zv_vdev;
+
+ return (AVL_CMP(v1, v2));
+}
+
+static lwb_t *
+zil_alloc_lwb(zilog_t *zilog, blkptr_t *bp, boolean_t slog, uint64_t txg)
+{
+ lwb_t *lwb;
+
+ lwb = kmem_cache_alloc(zil_lwb_cache, KM_SLEEP);
+ lwb->lwb_zilog = zilog;
+ lwb->lwb_blk = *bp;
+ lwb->lwb_slog = slog;
+ lwb->lwb_state = LWB_STATE_CLOSED;
+ lwb->lwb_buf = zio_buf_alloc(BP_GET_LSIZE(bp));
+ lwb->lwb_max_txg = txg;
+ lwb->lwb_write_zio = NULL;
+ lwb->lwb_root_zio = NULL;
+ lwb->lwb_tx = NULL;
+ lwb->lwb_issued_timestamp = 0;
+ if (BP_GET_CHECKSUM(bp) == ZIO_CHECKSUM_ZILOG2) {
+ lwb->lwb_nused = sizeof (zil_chain_t);
+ lwb->lwb_sz = BP_GET_LSIZE(bp);
+ } else {
+ lwb->lwb_nused = 0;
+ lwb->lwb_sz = BP_GET_LSIZE(bp) - sizeof (zil_chain_t);
+ }
+
+ mutex_enter(&zilog->zl_lock);
+ list_insert_tail(&zilog->zl_lwb_list, lwb);
+ mutex_exit(&zilog->zl_lock);
+
+ ASSERT(!MUTEX_HELD(&lwb->lwb_vdev_lock));
+ ASSERT(avl_is_empty(&lwb->lwb_vdev_tree));
+ VERIFY(list_is_empty(&lwb->lwb_waiters));
+
+ return (lwb);
+}
+
+static void
+zil_free_lwb(zilog_t *zilog, lwb_t *lwb)
+{
+ ASSERT(MUTEX_HELD(&zilog->zl_lock));
+ ASSERT(!MUTEX_HELD(&lwb->lwb_vdev_lock));
+ VERIFY(list_is_empty(&lwb->lwb_waiters));
+ ASSERT(avl_is_empty(&lwb->lwb_vdev_tree));
+ ASSERT3P(lwb->lwb_write_zio, ==, NULL);
+ ASSERT3P(lwb->lwb_root_zio, ==, NULL);
+ ASSERT3U(lwb->lwb_max_txg, <=, spa_syncing_txg(zilog->zl_spa));
+ ASSERT(lwb->lwb_state == LWB_STATE_CLOSED ||
+ lwb->lwb_state == LWB_STATE_FLUSH_DONE);
+
+ /*
+ * Clear the zilog's field to indicate this lwb is no longer
+ * valid, and prevent use-after-free errors.
+ */
+ if (zilog->zl_last_lwb_opened == lwb)
+ zilog->zl_last_lwb_opened = NULL;
+
+ kmem_cache_free(zil_lwb_cache, lwb);
+}
+
+/*
+ * Called when we create in-memory log transactions so that we know
+ * to cleanup the itxs at the end of spa_sync().
+ */
+void
+zilog_dirty(zilog_t *zilog, uint64_t txg)
+{
+ dsl_pool_t *dp = zilog->zl_dmu_pool;
+ dsl_dataset_t *ds = dmu_objset_ds(zilog->zl_os);
+
+ ASSERT(spa_writeable(zilog->zl_spa));
+
+ if (ds->ds_is_snapshot)
+ panic("dirtying snapshot!");
+
+ if (txg_list_add(&dp->dp_dirty_zilogs, zilog, txg)) {
+ /* up the hold count until we can be written out */
+ dmu_buf_add_ref(ds->ds_dbuf, zilog);
+
+ zilog->zl_dirty_max_txg = MAX(txg, zilog->zl_dirty_max_txg);
+ }
+}
+
+/*
+ * Determine if the zil is dirty in the specified txg. Callers wanting to
+ * ensure that the dirty state does not change must hold the itxg_lock for
+ * the specified txg. Holding the lock will ensure that the zil cannot be
+ * dirtied (zil_itx_assign) or cleaned (zil_clean) while we check its current
+ * state.
+ */
+boolean_t
+zilog_is_dirty_in_txg(zilog_t *zilog, uint64_t txg)
+{
+ dsl_pool_t *dp = zilog->zl_dmu_pool;
+
+ if (txg_list_member(&dp->dp_dirty_zilogs, zilog, txg & TXG_MASK))
+ return (B_TRUE);
+ return (B_FALSE);
+}
+
+/*
+ * Determine if the zil is dirty. The zil is considered dirty if it has
+ * any pending itx records that have not been cleaned by zil_clean().
+ */
+boolean_t
+zilog_is_dirty(zilog_t *zilog)
+{
+ dsl_pool_t *dp = zilog->zl_dmu_pool;
+
+ for (int t = 0; t < TXG_SIZE; t++) {
+ if (txg_list_member(&dp->dp_dirty_zilogs, zilog, t))
+ return (B_TRUE);
+ }
+ return (B_FALSE);
+}
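+
+/*
+ * A sketch of the ring indexing the two functions above rely on,
+ * assuming only that TXG_SIZE is a power of two and TXG_MASK is
+ * TXG_SIZE - 1: "txg & TXG_MASK" maps any txg onto one of the TXG_SIZE
+ * in-flight slots (e.g. zl_itxg[]). The helper name is hypothetical;
+ * it exists only to make the idiom explicit.
+ */
+static inline uint64_t
+zil_sketch_txg_slot(uint64_t txg)
+{
+ /* txg, txg + TXG_SIZE, txg + 2 * TXG_SIZE, ... share a slot */
+ return (txg & TXG_MASK);
+}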
+
+/*
+ * Create an on-disk intent log.
+ */
+static lwb_t *
+zil_create(zilog_t *zilog)
+{
+ const zil_header_t *zh = zilog->zl_header;
+ lwb_t *lwb = NULL;
+ uint64_t txg = 0;
+ dmu_tx_t *tx = NULL;
+ blkptr_t blk;
+ int error = 0;
+ boolean_t slog = FALSE;
+
+ /*
+ * Wait for any previous destroy to complete.
+ */
+ txg_wait_synced(zilog->zl_dmu_pool, zilog->zl_destroy_txg);
+
+ ASSERT(zh->zh_claim_txg == 0);
+ ASSERT(zh->zh_replay_seq == 0);
+
+ blk = zh->zh_log;
+
+ /*
+ * Allocate an initial log block if:
+ * - there isn't one already
+ * - the existing block is the wrong endianness
+ */
+ if (BP_IS_HOLE(&blk) || BP_SHOULD_BYTESWAP(&blk)) {
+ tx = dmu_tx_create(zilog->zl_os);
+ VERIFY0(dmu_tx_assign(tx, TXG_WAIT));
+ dsl_dataset_dirty(dmu_objset_ds(zilog->zl_os), tx);
+ txg = dmu_tx_get_txg(tx);
+
+ if (!BP_IS_HOLE(&blk)) {
+ zio_free(zilog->zl_spa, txg, &blk);
+ BP_ZERO(&blk);
+ }
+
+ error = zio_alloc_zil(zilog->zl_spa,
+ zilog->zl_os->os_dsl_dataset->ds_object, txg, &blk, NULL,
+ ZIL_MIN_BLKSZ, &slog);
+
+ if (error == 0)
+ zil_init_log_chain(zilog, &blk);
+ }
+
+ /*
+ * Allocate a log write block (lwb) for the first log block.
+ */
+ if (error == 0)
+ lwb = zil_alloc_lwb(zilog, &blk, slog, txg);
+
+ /*
+ * If we just allocated the first log block, commit our transaction
+ * and wait for zil_sync() to stuff the block pointer into zh_log.
+ * (zh is part of the MOS, so we cannot modify it in open context.)
+ */
+ if (tx != NULL) {
+ dmu_tx_commit(tx);
+ txg_wait_synced(zilog->zl_dmu_pool, txg);
+ }
+
+ ASSERT(bcmp(&blk, &zh->zh_log, sizeof (blk)) == 0);
+
+ return (lwb);
+}
+
+/*
+ * In one tx, free all log blocks and clear the log header. If keep_first
+ * is set, then we're replaying a log with no content. We want to keep the
+ * first block, however, so that the first synchronous transaction doesn't
+ * require a txg_wait_synced() in zil_create(). We don't need to
+ * txg_wait_synced() here either when keep_first is set, because both
+ * zil_create() and zil_destroy() will wait for any in-progress destroys
+ * to complete.
+ */
+void
+zil_destroy(zilog_t *zilog, boolean_t keep_first)
+{
+ const zil_header_t *zh = zilog->zl_header;
+ lwb_t *lwb;
+ dmu_tx_t *tx;
+ uint64_t txg;
+
+ /*
+ * Wait for any previous destroy to complete.
+ */
+ txg_wait_synced(zilog->zl_dmu_pool, zilog->zl_destroy_txg);
+
+ zilog->zl_old_header = *zh; /* debugging aid */
+
+ if (BP_IS_HOLE(&zh->zh_log))
+ return;
+
+ tx = dmu_tx_create(zilog->zl_os);
+ VERIFY0(dmu_tx_assign(tx, TXG_WAIT));
+ dsl_dataset_dirty(dmu_objset_ds(zilog->zl_os), tx);
+ txg = dmu_tx_get_txg(tx);
+
+ mutex_enter(&zilog->zl_lock);
+
+ ASSERT3U(zilog->zl_destroy_txg, <, txg);
+ zilog->zl_destroy_txg = txg;
+ zilog->zl_keep_first = keep_first;
+
+ if (!list_is_empty(&zilog->zl_lwb_list)) {
+ ASSERT(zh->zh_claim_txg == 0);
+ VERIFY(!keep_first);
+ while ((lwb = list_head(&zilog->zl_lwb_list)) != NULL) {
+ list_remove(&zilog->zl_lwb_list, lwb);
+ if (lwb->lwb_buf != NULL)
+ zio_buf_free(lwb->lwb_buf, lwb->lwb_sz);
+ zio_free(zilog->zl_spa, txg, &lwb->lwb_blk);
+ zil_free_lwb(zilog, lwb);
+ }
+ } else if (!keep_first) {
+ zil_destroy_sync(zilog, tx);
+ }
+ mutex_exit(&zilog->zl_lock);
+
+ dmu_tx_commit(tx);
+}
+
+void
+zil_destroy_sync(zilog_t *zilog, dmu_tx_t *tx)
+{
+ ASSERT(list_is_empty(&zilog->zl_lwb_list));
+ (void) zil_parse(zilog, zil_free_log_block,
+ zil_free_log_record, tx, zilog->zl_header->zh_claim_txg);
+}
+
+int
+zil_claim(dsl_pool_t *dp, dsl_dataset_t *ds, void *txarg)
+{
+ dmu_tx_t *tx = txarg;
+ zilog_t *zilog;
+ uint64_t first_txg;
+ zil_header_t *zh;
+ objset_t *os;
+ int error;
+
+ error = dmu_objset_own_obj(dp, ds->ds_object,
+ DMU_OST_ANY, B_FALSE, FTAG, &os);
+ if (error != 0) {
+ /*
+ * EBUSY indicates that the objset is inconsistent, in which
+ * case it cannot have a ZIL.
+ */
+ if (error != EBUSY) {
+ cmn_err(CE_WARN, "can't open objset for %llu, error %u",
+ (unsigned long long)ds->ds_object, error);
+ }
+ return (0);
+ }
+
+ zilog = dmu_objset_zil(os);
+ zh = zil_header_in_syncing_context(zilog);
+ ASSERT3U(tx->tx_txg, ==, spa_first_txg(zilog->zl_spa));
+ first_txg = spa_min_claim_txg(zilog->zl_spa);
+
+ /*
+ * If the spa_log_state is not set to be cleared, check whether
+ * the current uberblock is a checkpoint one and if the current
+ * header has been claimed before moving on.
+ *
+ * If the current uberblock is a checkpointed uberblock then
+ * one of the following scenarios took place:
+ *
+ * 1] We are currently rewinding to the checkpoint of the pool.
+ * 2] We crashed in the middle of a checkpoint rewind but we
+ * did manage to write the checkpointed uberblock to the
+ * vdev labels, so when we tried to import the pool again
+ * the checkpointed uberblock was selected from the import
+ * procedure.
+ *
+ * In both cases we want to zero out all the ZIL blocks, except
+ * the ones that have been claimed at the time of the checkpoint
+ * (their zh_claim_txg != 0). The reason is that these blocks
+ * may be corrupted since we may have reused their locations on
+ * disk after we took the checkpoint.
+ *
+ * We could try to set spa_log_state to SPA_LOG_CLEAR earlier
+ * when we first figure out whether the current uberblock is
+ * checkpointed or not. Unfortunately, that would discard all
+ * the logs, including the ones that are claimed, and we would
+ * leak space.
+ */
+ if (spa_get_log_state(zilog->zl_spa) == SPA_LOG_CLEAR ||
+ (zilog->zl_spa->spa_uberblock.ub_checkpoint_txg != 0 &&
+ zh->zh_claim_txg == 0)) {
+ if (!BP_IS_HOLE(&zh->zh_log)) {
+ (void) zil_parse(zilog, zil_clear_log_block,
+ zil_noop_log_record, tx, first_txg);
+ }
+ BP_ZERO(&zh->zh_log);
+ dsl_dataset_dirty(dmu_objset_ds(os), tx);
+ dmu_objset_disown(os, FTAG);
+ return (0);
+ }
+
+ /*
+ * If we are not rewinding and opening the pool normally, then
+ * the min_claim_txg should be equal to the first txg of the pool.
+ */
+ ASSERT3U(first_txg, ==, spa_first_txg(zilog->zl_spa));
+
+ /*
+ * Claim all log blocks if we haven't already done so, and remember
+ * the highest claimed sequence number. This ensures that if we can
+ * read only part of the log now (e.g. due to a missing device),
+ * but we can read the entire log later, we will not try to replay
+ * or destroy beyond the last block we successfully claimed.
+ */
+ ASSERT3U(zh->zh_claim_txg, <=, first_txg);
+ if (zh->zh_claim_txg == 0 && !BP_IS_HOLE(&zh->zh_log)) {
+ (void) zil_parse(zilog, zil_claim_log_block,
+ zil_claim_log_record, tx, first_txg);
+ zh->zh_claim_txg = first_txg;
+ zh->zh_claim_blk_seq = zilog->zl_parse_blk_seq;
+ zh->zh_claim_lr_seq = zilog->zl_parse_lr_seq;
+ if (zilog->zl_parse_lr_count || zilog->zl_parse_blk_count > 1)
+ zh->zh_flags |= ZIL_REPLAY_NEEDED;
+ zh->zh_flags |= ZIL_CLAIM_LR_SEQ_VALID;
+ dsl_dataset_dirty(dmu_objset_ds(os), tx);
+ }
+
+ ASSERT3U(first_txg, ==, (spa_last_synced_txg(zilog->zl_spa) + 1));
+ dmu_objset_disown(os, FTAG);
+ return (0);
+}
+
+/*
+ * Check the log by walking the log chain.
+ * Checksum errors are ok as they indicate the end of the chain.
+ * Any other error (no device or read failure) returns an error.
+ */
+/* ARGSUSED */
+int
+zil_check_log_chain(dsl_pool_t *dp, dsl_dataset_t *ds, void *tx)
+{
+ zilog_t *zilog;
+ objset_t *os;
+ blkptr_t *bp;
+ int error;
+
+ ASSERT(tx == NULL);
+
+ error = dmu_objset_from_ds(ds, &os);
+ if (error != 0) {
+ cmn_err(CE_WARN, "can't open objset %llu, error %d",
+ (unsigned long long)ds->ds_object, error);
+ return (0);
+ }
+
+ zilog = dmu_objset_zil(os);
+ bp = (blkptr_t *)&zilog->zl_header->zh_log;
+
+ if (!BP_IS_HOLE(bp)) {
+ vdev_t *vd;
+ boolean_t valid = B_TRUE;
+
+ /*
+ * Check the first block and determine if it's on a log device
+ * which may have been removed or faulted prior to loading this
+ * pool. If so, there's no point in checking the rest of the
+ * log as its content should have already been synced to the
+ * pool.
+ */
+ spa_config_enter(os->os_spa, SCL_STATE, FTAG, RW_READER);
+ vd = vdev_lookup_top(os->os_spa, DVA_GET_VDEV(&bp->blk_dva[0]));
+ if (vd->vdev_islog && vdev_is_dead(vd))
+ valid = vdev_log_state_valid(vd);
+ spa_config_exit(os->os_spa, SCL_STATE, FTAG);
+
+ if (!valid)
+ return (0);
+
+ /*
+ * Check whether the current uberblock is checkpointed (e.g.
+ * we are rewinding) and whether the current header has been
+ * claimed or not. If it hasn't then skip verifying it. We
+ * do this because its ZIL blocks may be part of the pool's
+ * state before the rewind, which is no longer valid.
+ */
+ zil_header_t *zh = zil_header_in_syncing_context(zilog);
+ if (zilog->zl_spa->spa_uberblock.ub_checkpoint_txg != 0 &&
+ zh->zh_claim_txg == 0)
+ return (0);
+ }
+
+ /*
+ * Because tx == NULL, zil_claim_log_block() will not actually claim
+ * any blocks, but just determine whether it is possible to do so.
+ * In addition to checking the log chain, zil_claim_log_block()
+ * will invoke zio_claim() with a done func of spa_claim_notify(),
+ * which will update spa_max_claim_txg. See spa_load() for details.
+ */
+ error = zil_parse(zilog, zil_claim_log_block, zil_claim_log_record, tx,
+ zilog->zl_header->zh_claim_txg ? -1ULL :
+ spa_min_claim_txg(os->os_spa));
+
+ return ((error == ECKSUM || error == ENOENT) ? 0 : error);
+}
+
+/*
+ * When an itx is "skipped", this function is used to properly mark the
+ * waiter as "done, and signal any thread(s) waiting on it. An itx can
+ * be skipped (and not committed to an lwb) for a variety of reasons,
+ * one of them being that the itx was committed via spa_sync(), prior to
+ * it being committed to an lwb; this can happen if a thread calling
+ * zil_commit() is racing with spa_sync().
+ */
+static void
+zil_commit_waiter_skip(zil_commit_waiter_t *zcw)
+{
+ mutex_enter(&zcw->zcw_lock);
+ ASSERT3B(zcw->zcw_done, ==, B_FALSE);
+ zcw->zcw_done = B_TRUE;
+ cv_broadcast(&zcw->zcw_cv);
+ mutex_exit(&zcw->zcw_lock);
+}
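+
+/*
+ * The other half of this handshake is the standard condition-variable
+ * wait loop; the real version lives in zil_commit_waiter(). A minimal
+ * sketch, assuming only the zcw fields used above:
+ */
+static inline void
+zil_sketch_waiter_wait(zil_commit_waiter_t *zcw)
+{
+ mutex_enter(&zcw->zcw_lock);
+ while (!zcw->zcw_done)
+ cv_wait(&zcw->zcw_cv, &zcw->zcw_lock);
+ mutex_exit(&zcw->zcw_lock);
+}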
+
+/*
+ * This function is used when the given waiter is to be linked into an
+ * lwb's "lwb_waiter" list; i.e. when the itx is committed to the lwb.
+ * At this point, the waiter will no longer be referenced by the itx,
+ * and instead, will be referenced by the lwb.
+ */
+static void
+zil_commit_waiter_link_lwb(zil_commit_waiter_t *zcw, lwb_t *lwb)
+{
+ /*
+ * The lwb_waiters field of the lwb is protected by the zilog's
+ * zl_lock, thus it must be held when calling this function.
+ */
+ ASSERT(MUTEX_HELD(&lwb->lwb_zilog->zl_lock));
+
+ mutex_enter(&zcw->zcw_lock);
+ ASSERT(!list_link_active(&zcw->zcw_node));
+ ASSERT3P(zcw->zcw_lwb, ==, NULL);
+ ASSERT3P(lwb, !=, NULL);
+ ASSERT(lwb->lwb_state == LWB_STATE_OPENED ||
+ lwb->lwb_state == LWB_STATE_ISSUED ||
+ lwb->lwb_state == LWB_STATE_WRITE_DONE);
+
+ list_insert_tail(&lwb->lwb_waiters, zcw);
+ zcw->zcw_lwb = lwb;
+ mutex_exit(&zcw->zcw_lock);
+}
+
+/*
+ * This function is used when zio_alloc_zil() fails to allocate a ZIL
+ * block, and the given waiter must be linked to the "nolwb waiters"
+ * list inside of zil_process_commit_list().
+ */
+static void
+zil_commit_waiter_link_nolwb(zil_commit_waiter_t *zcw, list_t *nolwb)
+{
+ mutex_enter(&zcw->zcw_lock);
+ ASSERT(!list_link_active(&zcw->zcw_node));
+ ASSERT3P(zcw->zcw_lwb, ==, NULL);
+ list_insert_tail(nolwb, zcw);
+ mutex_exit(&zcw->zcw_lock);
+}
+
+void
+zil_lwb_add_block(lwb_t *lwb, const blkptr_t *bp)
+{
+ avl_tree_t *t = &lwb->lwb_vdev_tree;
+ avl_index_t where;
+ zil_vdev_node_t *zv, zvsearch;
+ int ndvas = BP_GET_NDVAS(bp);
+ int i;
+
+ if (zil_nocacheflush)
+ return;
+
+ mutex_enter(&lwb->lwb_vdev_lock);
+ for (i = 0; i < ndvas; i++) {
+ zvsearch.zv_vdev = DVA_GET_VDEV(&bp->blk_dva[i]);
+ if (avl_find(t, &zvsearch, &where) == NULL) {
+ zv = kmem_alloc(sizeof (*zv), KM_SLEEP);
+ zv->zv_vdev = zvsearch.zv_vdev;
+ avl_insert(t, zv, where);
+ }
+ }
+ mutex_exit(&lwb->lwb_vdev_lock);
+}
+
+static void
+zil_lwb_flush_defer(lwb_t *lwb, lwb_t *nlwb)
+{
+ avl_tree_t *src = &lwb->lwb_vdev_tree;
+ avl_tree_t *dst = &nlwb->lwb_vdev_tree;
+ void *cookie = NULL;
+ zil_vdev_node_t *zv;
+
+ ASSERT3S(lwb->lwb_state, ==, LWB_STATE_WRITE_DONE);
+ ASSERT3S(nlwb->lwb_state, !=, LWB_STATE_WRITE_DONE);
+ ASSERT3S(nlwb->lwb_state, !=, LWB_STATE_FLUSH_DONE);
+
+ /*
+ * At this point in its lifetime, 'lwb' no longer needs the protection
+ * of lwb_vdev_lock for its vdev tree (the tree will only be modified
+ * while holding zilog->zl_lock), as its writes and those of its
+ * children have all completed. The younger 'nlwb' may be waiting on
+ * future writes to additional vdevs.
+ */
+ mutex_enter(&nlwb->lwb_vdev_lock);
+ /*
+ * Tear down the 'lwb' vdev tree, ensuring that entries which do not
+ * exist in 'nlwb' are moved to it, freeing any would-be duplicates.
+ */
+ while ((zv = avl_destroy_nodes(src, &cookie)) != NULL) {
+ avl_index_t where;
+
+ if (avl_find(dst, zv, &where) == NULL) {
+ avl_insert(dst, zv, where);
+ } else {
+ kmem_free(zv, sizeof (*zv));
+ }
+ }
+ mutex_exit(&nlwb->lwb_vdev_lock);
+}
+
+void
+zil_lwb_add_txg(lwb_t *lwb, uint64_t txg)
+{
+ lwb->lwb_max_txg = MAX(lwb->lwb_max_txg, txg);
+}
+
+/*
+ * This function is called after all vdevs associated with a given lwb
+ * write have completed their DKIOCFLUSHWRITECACHE command; or as soon
+ * as the lwb write completes, if "zil_nocacheflush" is set. Further,
+ * all "previous" lwb's will have completed before this function is
+ * called; i.e. this function is called for all previous lwbs before
+ * it's called for "this" lwb (enforced via zio the dependencies
+ * configured in zil_lwb_set_zio_dependency()).
+ *
+ * The intention is for this function to be called as soon as the
+ * contents of an lwb are considered "stable" on disk, and will survive
+ * any sudden loss of power. At this point, any threads waiting for the
+ * lwb to reach this state are signalled, and the "waiter" structures
+ * are marked "done".
+ */
+static void
+zil_lwb_flush_vdevs_done(zio_t *zio)
+{
+ lwb_t *lwb = zio->io_private;
+ zilog_t *zilog = lwb->lwb_zilog;
+ dmu_tx_t *tx = lwb->lwb_tx;
+ zil_commit_waiter_t *zcw;
+
+ spa_config_exit(zilog->zl_spa, SCL_STATE, lwb);
+
+ zio_buf_free(lwb->lwb_buf, lwb->lwb_sz);
+
+ mutex_enter(&zilog->zl_lock);
+
+ /*
+ * Ensure the lwb buffer pointer is cleared before releasing the
+ * txg. If we have had an allocation failure and the txg is
+ * waiting to sync then we want zil_sync() to remove the lwb so
+ * that it's not picked up as the next new one in
+ * zil_process_commit_list(). zil_sync() will only remove the
+ * lwb if lwb_buf is null.
+ */
+ lwb->lwb_buf = NULL;
+ lwb->lwb_tx = NULL;
+
+ ASSERT3U(lwb->lwb_issued_timestamp, >, 0);
+ zilog->zl_last_lwb_latency = gethrtime() - lwb->lwb_issued_timestamp;
+
+ lwb->lwb_root_zio = NULL;
+
+ ASSERT3S(lwb->lwb_state, ==, LWB_STATE_WRITE_DONE);
+ lwb->lwb_state = LWB_STATE_FLUSH_DONE;
+
+ if (zilog->zl_last_lwb_opened == lwb) {
+ /*
+ * Remember the highest committed log sequence number
+ * for ztest. We only update this value when all the log
+ * writes succeeded, because ztest wants to ASSERT that
+ * it got the whole log chain.
+ */
+ zilog->zl_commit_lr_seq = zilog->zl_lr_seq;
+ }
+
+ while ((zcw = list_head(&lwb->lwb_waiters)) != NULL) {
+ mutex_enter(&zcw->zcw_lock);
+
+ ASSERT(list_link_active(&zcw->zcw_node));
+ list_remove(&lwb->lwb_waiters, zcw);
+
+ ASSERT3P(zcw->zcw_lwb, ==, lwb);
+ zcw->zcw_lwb = NULL;
+
+ zcw->zcw_zio_error = zio->io_error;
+
+ ASSERT3B(zcw->zcw_done, ==, B_FALSE);
+ zcw->zcw_done = B_TRUE;
+ cv_broadcast(&zcw->zcw_cv);
+
+ mutex_exit(&zcw->zcw_lock);
+ }
+
+ mutex_exit(&zilog->zl_lock);
+
+ /*
+ * Now that we've written this log block, we have a stable pointer
+ * to the next block in the chain, so it's OK to let the txg in
+ * which we allocated the next block sync.
+ */
+ dmu_tx_commit(tx);
+}
+
+/*
+ * This is called when an lwb's write zio completes. The callback's
+ * purpose is to issue the DKIOCFLUSHWRITECACHE commands for the vdevs
+ * in the lwb's lwb_vdev_tree. The tree will contain the vdevs involved
+ * in writing out this specific lwb's data, and in the case that cache
+ * flushes have been deferred, vdevs involved in writing the data for
+ * previous lwbs. The writes corresponding to all the vdevs in the
+ * lwb_vdev_tree will have completed by the time this is called, due to
+ * the zio dependencies configured in zil_lwb_set_zio_dependency(),
+ * which takes deferred flushes into account. The lwb will be "done"
+ * once zil_lwb_flush_vdevs_done() is called, which occurs in the zio
+ * completion callback for the lwb's root zio.
+ */
+static void
+zil_lwb_write_done(zio_t *zio)
+{
+ lwb_t *lwb = zio->io_private;
+ spa_t *spa = zio->io_spa;
+ zilog_t *zilog = lwb->lwb_zilog;
+ avl_tree_t *t = &lwb->lwb_vdev_tree;
+ void *cookie = NULL;
+ zil_vdev_node_t *zv;
+ lwb_t *nlwb;
+
+ ASSERT3S(spa_config_held(spa, SCL_STATE, RW_READER), !=, 0);
+
+ ASSERT(BP_GET_COMPRESS(zio->io_bp) == ZIO_COMPRESS_OFF);
+ ASSERT(BP_GET_TYPE(zio->io_bp) == DMU_OT_INTENT_LOG);
+ ASSERT(BP_GET_LEVEL(zio->io_bp) == 0);
+ ASSERT(BP_GET_BYTEORDER(zio->io_bp) == ZFS_HOST_BYTEORDER);
+ ASSERT(!BP_IS_GANG(zio->io_bp));
+ ASSERT(!BP_IS_HOLE(zio->io_bp));
+ ASSERT(BP_GET_FILL(zio->io_bp) == 0);
+
+ abd_put(zio->io_abd);
+
+ mutex_enter(&zilog->zl_lock);
+ ASSERT3S(lwb->lwb_state, ==, LWB_STATE_ISSUED);
+ lwb->lwb_state = LWB_STATE_WRITE_DONE;
+ lwb->lwb_write_zio = NULL;
+ nlwb = list_next(&zilog->zl_lwb_list, lwb);
+ mutex_exit(&zilog->zl_lock);
+
+ if (avl_numnodes(t) == 0)
+ return;
+
+ /*
+ * If there was an IO error, we're not going to call zio_flush()
+ * on these vdevs, so we simply empty the tree and free the
+ * nodes. We avoid calling zio_flush() since there isn't any
+ * good reason for doing so, after the lwb block failed to be
+ * written out.
+ */
+ if (zio->io_error != 0) {
+ while ((zv = avl_destroy_nodes(t, &cookie)) != NULL)
+ kmem_free(zv, sizeof (*zv));
+ return;
+ }
+
+ /*
+ * If this lwb does not have any threads waiting for it to
+ * complete, we want to defer issuing the DKIOCFLUSHWRITECACHE
+ * command to the vdevs written to by "this" lwb, and instead
+ * rely on the "next" lwb to handle the DKIOCFLUSHWRITECACHE
+ * command for those vdevs. Thus, we merge the vdev tree of
+ * "this" lwb with the vdev tree of the "next" lwb in the list,
+ * and assume the "next" lwb will handle flushing the vdevs (or
+ * deferring the flushes again).
+ *
+ * This is a useful performance optimization, especially for
+ * workloads with lots of async write activity and few sync
+ * write and/or fsync activity, as it has the potential to
+ * coalesce multiple flush commands to a vdev into one.
+ */
+ if (list_head(&lwb->lwb_waiters) == NULL && nlwb != NULL) {
+ zil_lwb_flush_defer(lwb, nlwb);
+ ASSERT(avl_is_empty(&lwb->lwb_vdev_tree));
+ return;
+ }
+
+ while ((zv = avl_destroy_nodes(t, &cookie)) != NULL) {
+ vdev_t *vd = vdev_lookup_top(spa, zv->zv_vdev);
+ if (vd != NULL)
+ zio_flush(lwb->lwb_root_zio, vd);
+ kmem_free(zv, sizeof (*zv));
+ }
+}
+
+static void
+zil_lwb_set_zio_dependency(zilog_t *zilog, lwb_t *lwb)
+{
+ lwb_t *last_lwb_opened = zilog->zl_last_lwb_opened;
+
+ ASSERT(MUTEX_HELD(&zilog->zl_issuer_lock));
+ ASSERT(MUTEX_HELD(&zilog->zl_lock));
+
+ /*
+ * The zilog's "zl_last_lwb_opened" field is used to build the
+ * lwb/zio dependency chain, which is used to preserve the
+ * ordering of lwb completions that is required by the semantics
+ * of the ZIL. Each new lwb zio becomes a parent of the
+ * "previous" lwb zio, such that the new lwb's zio cannot
+ * complete until the "previous" lwb's zio completes.
+ *
+ * This is required by the semantics of zil_commit(); the commit
+ * waiters attached to the lwbs will be woken in the lwb zio's
+ * completion callback, so this zio dependency graph ensures the
+ * waiters are woken in the correct order (the same order the
+ * lwbs were created).
+ */
+ if (last_lwb_opened != NULL &&
+ last_lwb_opened->lwb_state != LWB_STATE_FLUSH_DONE) {
+ ASSERT(last_lwb_opened->lwb_state == LWB_STATE_OPENED ||
+ last_lwb_opened->lwb_state == LWB_STATE_ISSUED ||
+ last_lwb_opened->lwb_state == LWB_STATE_WRITE_DONE);
+
+ ASSERT3P(last_lwb_opened->lwb_root_zio, !=, NULL);
+ zio_add_child(lwb->lwb_root_zio,
+ last_lwb_opened->lwb_root_zio);
+
+ /*
+ * If the previous lwb's write hasn't already completed,
+ * we also want to order the completion of the lwb write
+ * zios (above, we only order the completion of the lwb
+ * root zios). This is required because of how we can
+ * defer the DKIOCFLUSHWRITECACHE commands for each lwb.
+ *
+ * When the DKIOCFLUSHWRITECACHE commands are deferred,
+ * the previous lwb will rely on this lwb to flush the
+ * vdevs written to by that previous lwb. Thus, we need
+ * to ensure this lwb doesn't issue the flush until
+ * after the previous lwb's write completes. We ensure
+ * this ordering by setting the zio parent/child
+ * relationship here.
+ *
+ * Without this relationship on the lwb's write zio,
+ * it's possible for this lwb's write to complete prior
+ * to the previous lwb's write completing; and thus, the
+ * vdevs for the previous lwb would be flushed prior to
+ * that lwb's data being written to those vdevs (the
+ * vdevs are flushed in the lwb write zio's completion
+ * handler, zil_lwb_write_done()).
+ */
+ if (last_lwb_opened->lwb_state != LWB_STATE_WRITE_DONE) {
+ ASSERT(last_lwb_opened->lwb_state == LWB_STATE_OPENED ||
+ last_lwb_opened->lwb_state == LWB_STATE_ISSUED);
+
+ ASSERT3P(last_lwb_opened->lwb_write_zio, !=, NULL);
+ zio_add_child(lwb->lwb_write_zio,
+ last_lwb_opened->lwb_write_zio);
+ }
+ }
+}
+
+/*
+ * This function's purpose is to "open" an lwb such that it is ready to
+ * accept new itxs being committed to it. To do this, the lwb's zio
+ * structures are created, and linked to the lwb. This function is
+ * idempotent; if the passed in lwb has already been opened, this
+ * function is essentially a no-op.
+ */
+static void
+zil_lwb_write_open(zilog_t *zilog, lwb_t *lwb)
+{
+ zbookmark_phys_t zb;
+ zio_priority_t prio;
+
+ ASSERT(MUTEX_HELD(&zilog->zl_issuer_lock));
+ ASSERT3P(lwb, !=, NULL);
+ EQUIV(lwb->lwb_root_zio == NULL, lwb->lwb_state == LWB_STATE_CLOSED);
+ EQUIV(lwb->lwb_root_zio != NULL, lwb->lwb_state == LWB_STATE_OPENED);
+
+ SET_BOOKMARK(&zb, lwb->lwb_blk.blk_cksum.zc_word[ZIL_ZC_OBJSET],
+ ZB_ZIL_OBJECT, ZB_ZIL_LEVEL,
+ lwb->lwb_blk.blk_cksum.zc_word[ZIL_ZC_SEQ]);
+
+ if (lwb->lwb_root_zio == NULL) {
+ abd_t *lwb_abd = abd_get_from_buf(lwb->lwb_buf,
+ BP_GET_LSIZE(&lwb->lwb_blk));
+
+ if (!lwb->lwb_slog || zilog->zl_cur_used <= zil_slog_bulk)
+ prio = ZIO_PRIORITY_SYNC_WRITE;
+ else
+ prio = ZIO_PRIORITY_ASYNC_WRITE;
+
+ lwb->lwb_root_zio = zio_root(zilog->zl_spa,
+ zil_lwb_flush_vdevs_done, lwb, ZIO_FLAG_CANFAIL);
+ ASSERT3P(lwb->lwb_root_zio, !=, NULL);
+
+ lwb->lwb_write_zio = zio_rewrite(lwb->lwb_root_zio,
+ zilog->zl_spa, 0, &lwb->lwb_blk, lwb_abd,
+ BP_GET_LSIZE(&lwb->lwb_blk), zil_lwb_write_done, lwb,
+ prio, ZIO_FLAG_CANFAIL | ZIO_FLAG_DONT_PROPAGATE, &zb);
+ ASSERT3P(lwb->lwb_write_zio, !=, NULL);
+
+ lwb->lwb_state = LWB_STATE_OPENED;
+
+ mutex_enter(&zilog->zl_lock);
+ zil_lwb_set_zio_dependency(zilog, lwb);
+ zilog->zl_last_lwb_opened = lwb;
+ mutex_exit(&zilog->zl_lock);
+ }
+
+ ASSERT3P(lwb->lwb_root_zio, !=, NULL);
+ ASSERT3P(lwb->lwb_write_zio, !=, NULL);
+ ASSERT3S(lwb->lwb_state, ==, LWB_STATE_OPENED);
+}
+
+/*
+ * Define a limited set of intent log block sizes.
+ *
+ * These must be a multiple of 4KB. Note only the amount used (again
+ * aligned to 4KB) actually gets written. However, we can't always just
+ * allocate SPA_OLD_MAXBLOCKSIZE as the slog space could be exhausted.
+ */
+struct {
+ uint64_t limit;
+ uint64_t blksz;
+} zil_block_buckets[] = {
+ { 4096, 4096 }, /* non TX_WRITE */
+ { 8192 + 4096, 8192 + 4096 }, /* database */
+ { 32768 + 4096, 32768 + 4096 }, /* NFS writes */
+ { 65536 + 4096, 65536 + 4096 }, /* 64KB writes */
+ { 131072, 131072 }, /* < 128KB writes */
+ { 131072 + 4096, 65536 + 4096 }, /* 128KB writes */
+ { UINT64_MAX, SPA_OLD_MAXBLOCKSIZE}, /* > 128KB writes */
+};
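+
+/*
+ * A sketch of how the table above is consumed (the real logic lives in
+ * zil_lwb_write_issue()): select the first bucket whose limit covers
+ * the bytes needed, cap it at the pool's maximum, then take the max
+ * over the recent history so a single small block in a stream of large
+ * ones doesn't shrink the next allocation. The helper and its
+ * parameters are hypothetical.
+ */
+static inline uint64_t
+zil_sketch_pick_blksz(uint64_t needed, const uint64_t *prev, int nprev,
+ uint64_t max_blksz)
+{
+ uint64_t blksz;
+ int i;
+
+ for (i = 0; needed > zil_block_buckets[i].limit; i++)
+ continue;
+ blksz = MIN(zil_block_buckets[i].blksz, max_blksz);
+ while (nprev-- > 0) /* smooth against recent block sizes */
+ blksz = MAX(blksz, prev[nprev]);
+ return (blksz);
+}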
+
+/*
+ * Maximum block size used by the ZIL. This is picked up when the ZIL is
+ * initialized; after that, consult zl_max_block_size instead of using
+ * this variable directly.
+ */
+int zil_maxblocksize = SPA_OLD_MAXBLOCKSIZE;
+SYSCTL_INT(_vfs_zfs, OID_AUTO, zil_maxblocksize, CTLFLAG_RWTUN,
+ &zil_maxblocksize, 0, "Limit in bytes of ZIL log block size");
+
+/*
+ * Start a log block write and advance to the next log block.
+ * Calls are serialized.
+ */
+static lwb_t *
+zil_lwb_write_issue(zilog_t *zilog, lwb_t *lwb)
+{
+ lwb_t *nlwb = NULL;
+ zil_chain_t *zilc;
+ spa_t *spa = zilog->zl_spa;
+ blkptr_t *bp;
+ dmu_tx_t *tx;
+ uint64_t txg;
+ uint64_t zil_blksz, wsz;
+ int i, error;
+ boolean_t slog;
+
+ ASSERT(MUTEX_HELD(&zilog->zl_issuer_lock));
+ ASSERT3P(lwb->lwb_root_zio, !=, NULL);
+ ASSERT3P(lwb->lwb_write_zio, !=, NULL);
+ ASSERT3S(lwb->lwb_state, ==, LWB_STATE_OPENED);
+
+ if (BP_GET_CHECKSUM(&lwb->lwb_blk) == ZIO_CHECKSUM_ZILOG2) {
+ zilc = (zil_chain_t *)lwb->lwb_buf;
+ bp = &zilc->zc_next_blk;
+ } else {
+ zilc = (zil_chain_t *)(lwb->lwb_buf + lwb->lwb_sz);
+ bp = &zilc->zc_next_blk;
+ }
+
+ ASSERT(lwb->lwb_nused <= lwb->lwb_sz);
+
+ /*
+ * Allocate the next block and save its address in this block
+ * before writing it in order to establish the log chain.
+ * Note that if the allocation of nlwb synced before we wrote
+ * the block that points at it (lwb), we'd leak it if we crashed.
+ * Therefore, we don't do dmu_tx_commit() until zil_lwb_write_done().
+ * We dirty the dataset to ensure that zil_sync() will be called
+ * to clean up in the event of allocation failure or I/O failure.
+ */
+
+ tx = dmu_tx_create(zilog->zl_os);
+
+ /*
+ * Since we are not going to create any new dirty data, and we
+ * can even help with clearing the existing dirty data, we
+ * should not be subject to the dirty data based delays. We
+ * use TXG_NOTHROTTLE to bypass the delay mechanism.
+ */
+ VERIFY0(dmu_tx_assign(tx, TXG_WAIT | TXG_NOTHROTTLE));
+
+ dsl_dataset_dirty(dmu_objset_ds(zilog->zl_os), tx);
+ txg = dmu_tx_get_txg(tx);
+
+ lwb->lwb_tx = tx;
+
+ /*
+ * Log blocks are pre-allocated. Here we select the size of the next
+ * block, based on size used in the last block.
+ * - first find the smallest bucket that will fit the block from a
+ * limited set of block sizes. This is because it's faster to write
+ * blocks allocated from the same metaslab as they are adjacent or
+ * close.
+ * - next find the maximum from the new suggested size and an array of
+ * previous sizes. This lessens a picket fence effect of wrongly
+ * guessing the size if we have a stream of say 2k, 64k, 2k, 64k
+ * requests.
+ *
+ * Note we only write what is used, but we can't just allocate
+ * the maximum block size because we can exhaust the available
+ * pool log space.
+ */
+ zil_blksz = zilog->zl_cur_used + sizeof (zil_chain_t);
+ for (i = 0; zil_blksz > zil_block_buckets[i].limit; i++)
+ continue;
+ zil_blksz = MIN(zil_block_buckets[i].blksz, zilog->zl_max_block_size);
+ zilog->zl_prev_blks[zilog->zl_prev_rotor] = zil_blksz;
+ for (i = 0; i < ZIL_PREV_BLKS; i++)
+ zil_blksz = MAX(zil_blksz, zilog->zl_prev_blks[i]);
+ zilog->zl_prev_rotor = (zilog->zl_prev_rotor + 1) & (ZIL_PREV_BLKS - 1);
+
+ BP_ZERO(bp);
+
+ /* pass the old blkptr in order to spread log blocks across devs */
+ error = zio_alloc_zil(spa, zilog->zl_os->os_dsl_dataset->ds_object,
+ txg, bp, &lwb->lwb_blk, zil_blksz, &slog);
+ if (error == 0) {
+ ASSERT3U(bp->blk_birth, ==, txg);
+ bp->blk_cksum = lwb->lwb_blk.blk_cksum;
+ bp->blk_cksum.zc_word[ZIL_ZC_SEQ]++;
+
+ /*
+ * Allocate a new log write block (lwb).
+ */
+ nlwb = zil_alloc_lwb(zilog, bp, slog, txg);
+ }
+
+ if (BP_GET_CHECKSUM(&lwb->lwb_blk) == ZIO_CHECKSUM_ZILOG2) {
+ /* For Slim ZIL only write what is used. */
+ wsz = P2ROUNDUP_TYPED(lwb->lwb_nused, ZIL_MIN_BLKSZ, uint64_t);
+ ASSERT3U(wsz, <=, lwb->lwb_sz);
+ zio_shrink(lwb->lwb_write_zio, wsz);
+ } else {
+ wsz = lwb->lwb_sz;
+ }
+
+ zilc->zc_pad = 0;
+ zilc->zc_nused = lwb->lwb_nused;
+ zilc->zc_eck.zec_cksum = lwb->lwb_blk.blk_cksum;
+
+ /*
+ * clear unused data for security
+ */
+ bzero(lwb->lwb_buf + lwb->lwb_nused, wsz - lwb->lwb_nused);
+
+ spa_config_enter(zilog->zl_spa, SCL_STATE, lwb, RW_READER);
+
+ zil_lwb_add_block(lwb, &lwb->lwb_blk);
+ lwb->lwb_issued_timestamp = gethrtime();
+ lwb->lwb_state = LWB_STATE_ISSUED;
+
+ zio_nowait(lwb->lwb_root_zio);
+ zio_nowait(lwb->lwb_write_zio);
+
+ /*
+ * If there was an allocation failure then nlwb will be null which
+ * forces a txg_wait_synced().
+ */
+ return (nlwb);
+}
+
+/*
+ * Maximum amount of write data that can be put into single log block.
+ */
+uint64_t
+zil_max_log_data(zilog_t *zilog)
+{
+ return (zilog->zl_max_block_size -
+ sizeof (zil_chain_t) - sizeof (lr_write_t));
+}
+
+/*
+ * Maximum amount of log space we agree to waste to reduce number of
+ * WR_NEED_COPY chunks to reduce zl_get_data() overhead (~12%).
+ */
+static inline uint64_t
+zil_max_waste_space(zilog_t *zilog)
+{
+ return (zil_max_log_data(zilog) / 8);
+}
+
+/*
+ * Maximum amount of write data for WR_COPIED. For correctness, consumers
+ * must fall back to WR_NEED_COPY if we can't fit the entire record into one
+ * maximum sized log block, because each WR_COPIED record must fit in a
+ * single log block. For space efficiency, we want to fit two records into a
+ * max-sized log block.
+ */
+uint64_t
+zil_max_copied_data(zilog_t *zilog)
+{
+ return ((zilog->zl_max_block_size - sizeof (zil_chain_t)) / 2 -
+ sizeof (lr_write_t));
+}
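+
+/*
+ * A sketch of the space budget the three functions above encode, for a
+ * hypothetical block size "b": a log block holds a zil_chain_t plus
+ * records, so a single record can carry at most
+ * b - sizeof (zil_chain_t) - sizeof (lr_write_t) bytes of data, while
+ * WR_COPIED is capped so that two full records still fit in one block:
+ */
+static inline uint64_t
+zil_sketch_max_copied(uint64_t b)
+{
+ /* two (lr_write_t + data) records must fit in b minus the chain */
+ return ((b - sizeof (zil_chain_t)) / 2 - sizeof (lr_write_t));
+}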
+
+static lwb_t *
+zil_lwb_commit(zilog_t *zilog, itx_t *itx, lwb_t *lwb)
+{
+ lr_t *lrcb, *lrc;
+ lr_write_t *lrwb, *lrw;
+ char *lr_buf;
+ uint64_t dlen, dnow, lwb_sp, reclen, txg, max_log_data;
+
+ ASSERT(MUTEX_HELD(&zilog->zl_issuer_lock));
+ ASSERT3P(lwb, !=, NULL);
+ ASSERT3P(lwb->lwb_buf, !=, NULL);
+
+ zil_lwb_write_open(zilog, lwb);
+
+ lrc = &itx->itx_lr;
+ lrw = (lr_write_t *)lrc;
+
+ /*
+ * A commit itx doesn't represent any on-disk state; instead
+ * it's simply used as a place holder on the commit list, and
+ * provides a mechanism for attaching a "commit waiter" onto the
+ * correct lwb (such that the waiter can be signalled upon
+ * completion of that lwb). Thus, we don't process this itx's
+ * log record if it's a commit itx (these itx's don't have log
+ * records), and instead link the itx's waiter onto the lwb's
+ * list of waiters.
+ *
+ * For more details, see the comment above zil_commit().
+ */
+ if (lrc->lrc_txtype == TX_COMMIT) {
+ mutex_enter(&zilog->zl_lock);
+ zil_commit_waiter_link_lwb(itx->itx_private, lwb);
+ itx->itx_private = NULL;
+ mutex_exit(&zilog->zl_lock);
+ return (lwb);
+ }
+
+ if (lrc->lrc_txtype == TX_WRITE && itx->itx_wr_state == WR_NEED_COPY) {
+ dlen = P2ROUNDUP_TYPED(
+ lrw->lr_length, sizeof (uint64_t), uint64_t);
+ } else {
+ dlen = 0;
+ }
+ reclen = lrc->lrc_reclen;
+ zilog->zl_cur_used += (reclen + dlen);
+ txg = lrc->lrc_txg;
+
+ ASSERT3U(zilog->zl_cur_used, <, UINT64_MAX - (reclen + dlen));
+
+cont:
+ /*
+ * If this record won't fit in the current log block, start a new one.
+ * For WR_NEED_COPY optimize layout for minimal number of chunks.
+ */
+ lwb_sp = lwb->lwb_sz - lwb->lwb_nused;
+ max_log_data = zil_max_log_data(zilog);
+ if (reclen > lwb_sp || (reclen + dlen > lwb_sp &&
+ lwb_sp < zil_max_waste_space(zilog) &&
+ (dlen % max_log_data == 0 ||
+ lwb_sp < reclen + dlen % max_log_data))) {
+ lwb = zil_lwb_write_issue(zilog, lwb);
+ if (lwb == NULL)
+ return (NULL);
+ zil_lwb_write_open(zilog, lwb);
+ ASSERT(LWB_EMPTY(lwb));
+ lwb_sp = lwb->lwb_sz - lwb->lwb_nused;
+
+ /*
+ * There must be enough space in the new, empty log block to
+ * hold reclen. For WR_COPIED, we need to fit the whole
+ * record in one block, and reclen is the header size + the
+ * data size. For WR_NEED_COPY, we can create multiple
+ * records, splitting the data into multiple blocks, so we
+ * only need to fit one word of data per block; in this case
+ * reclen is just the header size (no data).
+ */
+ ASSERT3U(reclen + MIN(dlen, sizeof (uint64_t)), <=, lwb_sp);
+ }
+
+ dnow = MIN(dlen, lwb_sp - reclen);
+ lr_buf = lwb->lwb_buf + lwb->lwb_nused;
+ bcopy(lrc, lr_buf, reclen);
+ lrcb = (lr_t *)lr_buf; /* Like lrc, but inside lwb. */
+ lrwb = (lr_write_t *)lrcb; /* Like lrw, but inside lwb. */
+
+ /*
+ * If it's a write, fetch the data or get its blkptr as appropriate.
+ */
+ if (lrc->lrc_txtype == TX_WRITE) {
+ if (txg > spa_freeze_txg(zilog->zl_spa))
+ txg_wait_synced(zilog->zl_dmu_pool, txg);
+ if (itx->itx_wr_state != WR_COPIED) {
+ char *dbuf;
+ int error;
+
+ if (itx->itx_wr_state == WR_NEED_COPY) {
+ dbuf = lr_buf + reclen;
+ lrcb->lrc_reclen += dnow;
+ if (lrwb->lr_length > dnow)
+ lrwb->lr_length = dnow;
+ lrw->lr_offset += dnow;
+ lrw->lr_length -= dnow;
+ } else {
+ ASSERT(itx->itx_wr_state == WR_INDIRECT);
+ dbuf = NULL;
+ }
+
+ /*
+ * We pass in the "lwb_write_zio" rather than
+ * "lwb_root_zio" so that the "lwb_write_zio"
+ * becomes the parent of any zio's created by
+ * the "zl_get_data" callback. The vdevs are
+ * flushed after the "lwb_write_zio" completes,
+ * so we want to make sure that completion
+ * callback waits for these additional zio's,
+ * such that the vdevs used by those zio's will
+ * be included in the lwb's vdev tree, and those
+ * vdevs will be properly flushed. If we passed
+ * in "lwb_root_zio" here, then these additional
+ * vdevs may not be flushed; e.g. if these zio's
+ * completed after "lwb_write_zio" completed.
+ */
+ error = zilog->zl_get_data(itx->itx_private,
+ lrwb, dbuf, lwb, lwb->lwb_write_zio);
+
+ if (error == EIO) {
+ txg_wait_synced(zilog->zl_dmu_pool, txg);
+ return (lwb);
+ }
+ if (error != 0) {
+ ASSERT(error == ENOENT || error == EEXIST ||
+ error == EALREADY);
+ return (lwb);
+ }
+ }
+ }
+
+ /*
+ * We're actually making an entry, so update lrc_seq to be the
+ * log record sequence number. Note that this is generally not
+ * equal to the itx sequence number because not all transactions
+ * are synchronous, and sometimes spa_sync() gets there first.
+ */
+ lrcb->lrc_seq = ++zilog->zl_lr_seq;
+ lwb->lwb_nused += reclen + dnow;
+
+ zil_lwb_add_txg(lwb, txg);
+
+ ASSERT3U(lwb->lwb_nused, <=, lwb->lwb_sz);
+ ASSERT0(P2PHASE(lwb->lwb_nused, sizeof (uint64_t)));
+
+ dlen -= dnow;
+ if (dlen > 0) {
+ zilog->zl_cur_used += reclen;
+ goto cont;
+ }
+
+ return (lwb);
+}
+
+itx_t *
+zil_itx_create(uint64_t txtype, size_t lrsize)
+{
+ itx_t *itx;
+
+ lrsize = P2ROUNDUP_TYPED(lrsize, sizeof (uint64_t), size_t);
+
+ itx = kmem_alloc(offsetof(itx_t, itx_lr) + lrsize, KM_SLEEP);
+ itx->itx_lr.lrc_txtype = txtype;
+ itx->itx_lr.lrc_reclen = lrsize;
+ itx->itx_lr.lrc_seq = 0; /* defensive */
+ itx->itx_sync = B_TRUE; /* default is synchronous */
+
+ return (itx);
+}
+
+void
+zil_itx_destroy(itx_t *itx)
+{
+ kmem_free(itx, offsetof(itx_t, itx_lr) + itx->itx_lr.lrc_reclen);
+}
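+
+/*
+ * zil_itx_create() above rounds lrsize with P2ROUNDUP_TYPED(). For a
+ * power-of-two alignment "a", that rounding is the classic
+ * (x + a - 1) & ~(a - 1); the ZIL uses it to keep record sizes aligned
+ * to 8-byte multiples. A hypothetical plain-C equivalent:
+ */
+static inline size_t
+zil_sketch_p2roundup(size_t x, size_t a)
+{
+ /* valid only when "a" is a power of two */
+ return ((x + a - 1) & ~(a - 1));
+}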
+
+/*
+ * Free up the sync and async itxs. The itxs_t has already been detached
+ * so no locks are needed.
+ */
+static void
+zil_itxg_clean(itxs_t *itxs)
+{
+ itx_t *itx;
+ list_t *list;
+ avl_tree_t *t;
+ void *cookie;
+ itx_async_node_t *ian;
+
+ list = &itxs->i_sync_list;
+ while ((itx = list_head(list)) != NULL) {
+ /*
+ * In the general case, commit itxs will not be found
+ * here, as they'll be committed to an lwb via
+ * zil_lwb_commit(), and freed in that function. Having
+ * said that, it is still possible for commit itxs to be
+ * found here, due to the following race:
+ *
+ * - a thread calls zil_commit() which assigns the
+ * commit itx to a per-txg i_sync_list
+ * - zil_itxg_clean() is called (e.g. via spa_sync())
+ * while the waiter is still on the i_sync_list
+ *
+ * There's nothing to prevent syncing the txg while the
+ * waiter is on the i_sync_list. This normally doesn't
+ * happen because spa_sync() is slower than zil_commit(),
+ * but if zil_commit() calls txg_wait_synced() (e.g.
+ * because zil_create() or zil_commit_writer_stall() is
+ * called) we will hit this case.
+ */
+ if (itx->itx_lr.lrc_txtype == TX_COMMIT)
+ zil_commit_waiter_skip(itx->itx_private);
+
+ list_remove(list, itx);
+ zil_itx_destroy(itx);
+ }
+
+ cookie = NULL;
+ t = &itxs->i_async_tree;
+ while ((ian = avl_destroy_nodes(t, &cookie)) != NULL) {
+ list = &ian->ia_list;
+ while ((itx = list_head(list)) != NULL) {
+ list_remove(list, itx);
+ /* commit itxs should never be on the async lists. */
+ ASSERT3U(itx->itx_lr.lrc_txtype, !=, TX_COMMIT);
+ zil_itx_destroy(itx);
+ }
+ list_destroy(list);
+ kmem_free(ian, sizeof (itx_async_node_t));
+ }
+ avl_destroy(t);
+
+ kmem_free(itxs, sizeof (itxs_t));
+}
+
+static int
+zil_aitx_compare(const void *x1, const void *x2)
+{
+ const uint64_t o1 = ((itx_async_node_t *)x1)->ia_foid;
+ const uint64_t o2 = ((itx_async_node_t *)x2)->ia_foid;
+
+ return (AVL_CMP(o1, o2));
+}
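+
+/*
+ * Both AVL comparators in this file (this one and
+ * zil_lwb_vdev_compare()) follow the same contract: return -1, 0, or
+ * +1. AVL_CMP() is used rather than plain subtraction because with
+ * uint64_t keys, (int)(o1 - o2) can wrap and report the wrong sign.
+ * A sketch of the expansion:
+ */
+static inline int
+zil_sketch_avl_cmp(uint64_t a, uint64_t b)
+{
+ if (a < b)
+ return (-1);
+ return (a > b ? 1 : 0);
+}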
+
+/*
+ * Remove all async itx with the given oid.
+ */
+static void
+zil_remove_async(zilog_t *zilog, uint64_t oid)
+{
+ uint64_t otxg, txg;
+ itx_async_node_t *ian;
+ avl_tree_t *t;
+ avl_index_t where;
+ list_t clean_list;
+ itx_t *itx;
+
+ ASSERT(oid != 0);
+ list_create(&clean_list, sizeof (itx_t), offsetof(itx_t, itx_node));
+
+ if (spa_freeze_txg(zilog->zl_spa) != UINT64_MAX) /* ziltest support */
+ otxg = ZILTEST_TXG;
+ else
+ otxg = spa_last_synced_txg(zilog->zl_spa) + 1;
+
+ for (txg = otxg; txg < (otxg + TXG_CONCURRENT_STATES); txg++) {
+ itxg_t *itxg = &zilog->zl_itxg[txg & TXG_MASK];
+
+ mutex_enter(&itxg->itxg_lock);
+ if (itxg->itxg_txg != txg) {
+ mutex_exit(&itxg->itxg_lock);
+ continue;
+ }
+
+ /*
+ * Locate the object node and move its itx list to clean_list.
+ */
+ t = &itxg->itxg_itxs->i_async_tree;
+ ian = avl_find(t, &oid, &where);
+ if (ian != NULL)
+ list_move_tail(&clean_list, &ian->ia_list);
+ mutex_exit(&itxg->itxg_lock);
+ }
+ while ((itx = list_head(&clean_list)) != NULL) {
+ list_remove(&clean_list, itx);
+ /* commit itxs should never be on the async lists. */
+ ASSERT3U(itx->itx_lr.lrc_txtype, !=, TX_COMMIT);
+ zil_itx_destroy(itx);
+ }
+ list_destroy(&clean_list);
+}
+
+void
+zil_itx_assign(zilog_t *zilog, itx_t *itx, dmu_tx_t *tx)
+{
+ uint64_t txg;
+ itxg_t *itxg;
+ itxs_t *itxs, *clean = NULL;
+
+ /*
+ * Object ids can be re-instantiated in the next txg so
+ * remove any async transactions to avoid future leaks.
+ * This can happen if a fsync occurs on the re-instantiated
+ * object for a WR_INDIRECT or WR_NEED_COPY write, which gets
+ * the new file data and flushes a write record for the old object.
+ */
+ if ((itx->itx_lr.lrc_txtype & ~TX_CI) == TX_REMOVE)
+ zil_remove_async(zilog, itx->itx_oid);
+
+ /*
+ * Ensure the data of a renamed file is committed before the rename.
+ */
+ if ((itx->itx_lr.lrc_txtype & ~TX_CI) == TX_RENAME)
+ zil_async_to_sync(zilog, itx->itx_oid);
+
+ if (spa_freeze_txg(zilog->zl_spa) != UINT64_MAX)
+ txg = ZILTEST_TXG;
+ else
+ txg = dmu_tx_get_txg(tx);
+
+ itxg = &zilog->zl_itxg[txg & TXG_MASK];
+ mutex_enter(&itxg->itxg_lock);
+ itxs = itxg->itxg_itxs;
+ if (itxg->itxg_txg != txg) {
+ if (itxs != NULL) {
+ /*
+ * The zil_clean callback hasn't got around to cleaning
+ * this itxg. Save the itxs for release below.
+ * This should be rare.
+ */
+ zfs_dbgmsg("zil_itx_assign: missed itx cleanup for "
+ "txg %llu", itxg->itxg_txg);
+ clean = itxg->itxg_itxs;
+ }
+ itxg->itxg_txg = txg;
+ itxs = itxg->itxg_itxs = kmem_zalloc(sizeof (itxs_t), KM_SLEEP);
+
+ list_create(&itxs->i_sync_list, sizeof (itx_t),
+ offsetof(itx_t, itx_node));
+ avl_create(&itxs->i_async_tree, zil_aitx_compare,
+ sizeof (itx_async_node_t),
+ offsetof(itx_async_node_t, ia_node));
+ }
+ if (itx->itx_sync) {
+ list_insert_tail(&itxs->i_sync_list, itx);
+ } else {
+ avl_tree_t *t = &itxs->i_async_tree;
+ uint64_t foid =
+ LR_FOID_GET_OBJ(((lr_ooo_t *)&itx->itx_lr)->lr_foid);
+ itx_async_node_t *ian;
+ avl_index_t where;
+
+ ian = avl_find(t, &foid, &where);
+ if (ian == NULL) {
+ ian = kmem_alloc(sizeof (itx_async_node_t), KM_SLEEP);
+ list_create(&ian->ia_list, sizeof (itx_t),
+ offsetof(itx_t, itx_node));
+ ian->ia_foid = foid;
+ avl_insert(t, ian, where);
+ }
+ list_insert_tail(&ian->ia_list, itx);
+ }
+
+ itx->itx_lr.lrc_txg = dmu_tx_get_txg(tx);
+
+ /*
+ * We don't want to dirty the ZIL using ZILTEST_TXG, because
+ * zil_clean() will never be called using ZILTEST_TXG. Thus, we
+ * need to be careful to always dirty the ZIL using the "real"
+ * TXG (not itxg_txg) even when the SPA is frozen.
+ */
+ zilog_dirty(zilog, dmu_tx_get_txg(tx));
+ mutex_exit(&itxg->itxg_lock);
+
+ /* Release the old itxs now we've dropped the lock */
+ if (clean != NULL)
+ zil_itxg_clean(clean);
+}
+
+/*
+ * If there are any in-memory intent log transactions which have now been
+ * synced then start up a taskq to free them. We should only do this after we
+ * have written out the uberblocks (i.e. the txg has been committed) so
+ * that we don't inadvertently clean out in-memory log records that would
+ * be required by zil_commit().
+ */
+void
+zil_clean(zilog_t *zilog, uint64_t synced_txg)
+{
+ itxg_t *itxg = &zilog->zl_itxg[synced_txg & TXG_MASK];
+ itxs_t *clean_me;
+
+ ASSERT3U(synced_txg, <, ZILTEST_TXG);
+
+ mutex_enter(&itxg->itxg_lock);
+ if (itxg->itxg_itxs == NULL || itxg->itxg_txg == ZILTEST_TXG) {
+ mutex_exit(&itxg->itxg_lock);
+ return;
+ }
+ ASSERT3U(itxg->itxg_txg, <=, synced_txg);
+ ASSERT3U(itxg->itxg_txg, !=, 0);
+ clean_me = itxg->itxg_itxs;
+ itxg->itxg_itxs = NULL;
+ itxg->itxg_txg = 0;
+ mutex_exit(&itxg->itxg_lock);
+ /*
+ * Preferably start a task queue to free up the old itxs, but
+ * if taskq_dispatch can't allocate resources to do that, then
+ * free them in-line. This should be rare. Note that using
+ * TQ_SLEEP here created a bad performance problem.
+ */
+ ASSERT3P(zilog->zl_dmu_pool, !=, NULL);
+ ASSERT3P(zilog->zl_dmu_pool->dp_zil_clean_taskq, !=, NULL);
+ if (taskq_dispatch(zilog->zl_dmu_pool->dp_zil_clean_taskq,
+ (void (*)(void *))zil_itxg_clean, clean_me, TQ_NOSLEEP) == 0)
+ zil_itxg_clean(clean_me);
+}
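+
+/*
+ * A sketch of the dispatch-or-inline pattern zil_clean() uses above:
+ * try to hand the work off without sleeping, and if the dispatch fails
+ * (mirroring taskq_dispatch() returning 0 under TQ_NOSLEEP), run it
+ * synchronously so the cleanup is never lost. The helper and its
+ * callback type are hypothetical.
+ */
+static inline void
+zil_sketch_dispatch_or_inline(int (*try_dispatch)(void (*)(void *), void *),
+ void (*func)(void *), void *arg)
+{
+ if (try_dispatch(func, arg) == 0)
+ func(arg); /* fall back to in-line execution */
+}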
+
+/*
+ * This function will traverse the queue of itxs that need to be
+ * committed, and move them onto the ZIL's zl_itx_commit_list.
+ */
+static void
+zil_get_commit_list(zilog_t *zilog)
+{
+ uint64_t otxg, txg;
+ list_t *commit_list = &zilog->zl_itx_commit_list;
+
+ ASSERT(MUTEX_HELD(&zilog->zl_issuer_lock));
+
+ if (spa_freeze_txg(zilog->zl_spa) != UINT64_MAX) /* ziltest support */
+ otxg = ZILTEST_TXG;
+ else
+ otxg = spa_last_synced_txg(zilog->zl_spa) + 1;
+
+ /*
+ * This is inherently racy, since there is nothing to prevent
+ * the last synced txg from changing. That's okay since we'll
+ * only commit things in the future.
+ */
+ for (txg = otxg; txg < (otxg + TXG_CONCURRENT_STATES); txg++) {
+ itxg_t *itxg = &zilog->zl_itxg[txg & TXG_MASK];
+
+ mutex_enter(&itxg->itxg_lock);
+ if (itxg->itxg_txg != txg) {
+ mutex_exit(&itxg->itxg_lock);
+ continue;
+ }
+
+ /*
+ * If we're adding itx records to the zl_itx_commit_list,
+ * then the zil better be dirty in this "txg". We can assert
+ * that here since we're holding the itxg_lock which will
+ * prevent spa_sync from cleaning it. Once we add the itxs
+ * to the zl_itx_commit_list we must commit it to disk even
+ * if it's unnecessary (i.e. the txg was synced).
+ */
+ ASSERT(zilog_is_dirty_in_txg(zilog, txg) ||
+ spa_freeze_txg(zilog->zl_spa) != UINT64_MAX);
+ list_move_tail(commit_list, &itxg->itxg_itxs->i_sync_list);
+
+ mutex_exit(&itxg->itxg_lock);
+ }
+}
+
+/*
+ * Move the async itxs for a specified object to commit into sync lists.
+ */
+void
+zil_async_to_sync(zilog_t *zilog, uint64_t foid)
+{
+ uint64_t otxg, txg;
+ itx_async_node_t *ian;
+ avl_tree_t *t;
+ avl_index_t where;
+
+ if (spa_freeze_txg(zilog->zl_spa) != UINT64_MAX) /* ziltest support */
+ otxg = ZILTEST_TXG;
+ else
+ otxg = spa_last_synced_txg(zilog->zl_spa) + 1;
+
+ /*
+ * This is inherently racy, since there is nothing to prevent
+ * the last synced txg from changing.
+ */
+ for (txg = otxg; txg < (otxg + TXG_CONCURRENT_STATES); txg++) {
+ itxg_t *itxg = &zilog->zl_itxg[txg & TXG_MASK];
+
+ mutex_enter(&itxg->itxg_lock);
+ if (itxg->itxg_txg != txg) {
+ mutex_exit(&itxg->itxg_lock);
+ continue;
+ }
+
+ /*
+ * If a foid is specified then find that node and append its
+ * list. Otherwise walk the tree appending all the lists
+ * to the sync list. We add to the end rather than the
+ * beginning to ensure the create has happened.
+ */
+ t = &itxg->itxg_itxs->i_async_tree;
+ if (foid != 0) {
+ ian = avl_find(t, &foid, &where);
+ if (ian != NULL) {
+ list_move_tail(&itxg->itxg_itxs->i_sync_list,
+ &ian->ia_list);
+ }
+ } else {
+ void *cookie = NULL;
+
+ while ((ian = avl_destroy_nodes(t, &cookie)) != NULL) {
+ list_move_tail(&itxg->itxg_itxs->i_sync_list,
+ &ian->ia_list);
+ list_destroy(&ian->ia_list);
+ kmem_free(ian, sizeof (itx_async_node_t));
+ }
+ }
+ mutex_exit(&itxg->itxg_lock);
+ }
+}
+
+/*
+ * This function will prune commit itxs that are at the head of the
+ * commit list (it won't prune past the first non-commit itx), and
+ * either: a) attach them to the last lwb that's still pending
+ * completion, or b) skip them altogether.
+ *
+ * This is used as a performance optimization to prevent commit itxs
+ * from generating new lwbs when it's unnecessary to do so.
+ */
+static void
+zil_prune_commit_list(zilog_t *zilog)
+{
+ itx_t *itx;
+
+ ASSERT(MUTEX_HELD(&zilog->zl_issuer_lock));
+
+ while ((itx = list_head(&zilog->zl_itx_commit_list)) != NULL) {
+ lr_t *lrc = &itx->itx_lr;
+ if (lrc->lrc_txtype != TX_COMMIT)
+ break;
+
+ mutex_enter(&zilog->zl_lock);
+
+ lwb_t *last_lwb = zilog->zl_last_lwb_opened;
+ if (last_lwb == NULL ||
+ last_lwb->lwb_state == LWB_STATE_FLUSH_DONE) {
+ /*
+ * All of the itxs this waiter was waiting on
+ * must have already completed (or there were
+ * never any itx's for it to wait on), so it's
+ * safe to skip this waiter and mark it done.
+ */
+ zil_commit_waiter_skip(itx->itx_private);
+ } else {
+ zil_commit_waiter_link_lwb(itx->itx_private, last_lwb);
+ itx->itx_private = NULL;
+ }
+
+ mutex_exit(&zilog->zl_lock);
+
+ list_remove(&zilog->zl_itx_commit_list, itx);
+ zil_itx_destroy(itx);
+ }
+
+ IMPLY(itx != NULL, itx->itx_lr.lrc_txtype != TX_COMMIT);
+}
+
+static void
+zil_commit_writer_stall(zilog_t *zilog)
+{
+ /*
+ * When zio_alloc_zil() fails to allocate the next lwb block on
+ * disk, we must call txg_wait_synced() to ensure all of the
+ * lwbs in the zilog's zl_lwb_list are synced and then freed (in
+ * zil_sync()), such that any subsequent ZIL writer (i.e. a call
+ * to zil_process_commit_list()) will have to call zil_create(),
+ * and start a new ZIL chain.
+ *
+ * Since zio_alloc_zil() failed, the lwb that was previously
+ * issued does not have a pointer to the "next" lwb on disk.
+ * Thus, if another ZIL writer thread was to allocate the "next"
+ * on-disk lwb, that block could be leaked in the event of a
+ * crash (because the previous lwb on-disk would not point to
+ * it).
+ *
+ * We must hold the zilog's zl_issuer_lock while we do this, to
+ * ensure no new threads enter zil_process_commit_list() until
+ * all lwb's in the zl_lwb_list have been synced and freed
+ * (which is achieved via the txg_wait_synced() call).
+ */
+ ASSERT(MUTEX_HELD(&zilog->zl_issuer_lock));
+ txg_wait_synced(zilog->zl_dmu_pool, 0);
+ ASSERT3P(list_tail(&zilog->zl_lwb_list), ==, NULL);
+}
+
+/*
+ * This function will traverse the commit list, creating new lwbs as
+ * needed, and committing the itxs from the commit list to these newly
+ * created lwbs. Additionally, as a new lwb is created, the previous
+ * lwb will be issued to the zio layer to be written to disk.
+ */
+static void
+zil_process_commit_list(zilog_t *zilog)
+{
+ spa_t *spa = zilog->zl_spa;
+ list_t nolwb_waiters;
+ lwb_t *lwb;
+ itx_t *itx;
+
+ ASSERT(MUTEX_HELD(&zilog->zl_issuer_lock));
+
+ /*
+ * Return if there's nothing to commit before we dirty the fs by
+ * calling zil_create().
+ */
+ if (list_head(&zilog->zl_itx_commit_list) == NULL)
+ return;
+
+ list_create(&nolwb_waiters, sizeof (zil_commit_waiter_t),
+ offsetof(zil_commit_waiter_t, zcw_node));
+
+ lwb = list_tail(&zilog->zl_lwb_list);
+ if (lwb == NULL) {
+ lwb = zil_create(zilog);
+ } else {
+ ASSERT3S(lwb->lwb_state, !=, LWB_STATE_ISSUED);
+ ASSERT3S(lwb->lwb_state, !=, LWB_STATE_WRITE_DONE);
+ ASSERT3S(lwb->lwb_state, !=, LWB_STATE_FLUSH_DONE);
+ }
+
+ while ((itx = list_head(&zilog->zl_itx_commit_list)) != NULL) {
+ lr_t *lrc = &itx->itx_lr;
+ uint64_t txg = lrc->lrc_txg;
+
+ ASSERT3U(txg, !=, 0);
+
+ if (lrc->lrc_txtype == TX_COMMIT) {
+ DTRACE_PROBE2(zil__process__commit__itx,
+ zilog_t *, zilog, itx_t *, itx);
+ } else {
+ DTRACE_PROBE2(zil__process__normal__itx,
+ zilog_t *, zilog, itx_t *, itx);
+ }
+
+ boolean_t synced = txg <= spa_last_synced_txg(spa);
+ boolean_t frozen = txg > spa_freeze_txg(spa);
+
+ /*
+ * If the txg of this itx has already been synced out, then
+ * we don't need to commit this itx to an lwb. This is
+ * because the data of this itx will have already been
+ * written to the main pool. This is inherently racy, and
+ * it's still ok to commit an itx whose txg has already
+ * been synced; this will result in a write that's
+ * unnecessary, but will do no harm.
+ *
+ * With that said, we always want to commit TX_COMMIT itxs
+ * to an lwb, regardless of whether or not that itx's txg
+ * has been synced out. We do this to ensure any OPENED lwb
+ * will always have at least one zil_commit_waiter_t linked
+ * to the lwb.
+ *
+ * As a counter-example, if we skipped TX_COMMIT itx's
+ * whose txg had already been synced, the following
+ * situation could occur if we happened to be racing with
+ * spa_sync:
+ *
+ * 1. we commit a non-TX_COMMIT itx to an lwb, where the
+ * itx's txg is 10 and the last synced txg is 9.
+ * 2. spa_sync finishes syncing out txg 10.
+ * 3. we move to the next itx in the list, it's a TX_COMMIT
+ * whose txg is 10, so we skip it rather than committing
+ * it to the lwb used in (1).
+ *
+ * If the itx that is skipped in (3) is the last TX_COMMIT
+ * itx in the commit list, then it's possible for the lwb
+ * used in (1) to remain in the OPENED state indefinitely.
+ *
+ * To prevent the above scenario from occurring, ensuring
+ * that once an lwb is OPENED it will transition to ISSUED
+ * and eventually DONE, we always commit TX_COMMIT itx's to
+ * an lwb here, even if that itx's txg has already been
+ * synced.
+ *
+ * Finally, if the pool is frozen, we _always_ commit the
+ * itx. The point of freezing the pool is to prevent data
+ * from being written to the main pool via spa_sync, and
+ * instead rely solely on the ZIL to persistently store the
+ * data; i.e. when the pool is frozen, the last synced txg
+ * value can't be trusted.
+ */
+ if (frozen || !synced || lrc->lrc_txtype == TX_COMMIT) {
+ if (lwb != NULL) {
+ lwb = zil_lwb_commit(zilog, itx, lwb);
+ } else if (lrc->lrc_txtype == TX_COMMIT) {
+ ASSERT3P(lwb, ==, NULL);
+ zil_commit_waiter_link_nolwb(
+ itx->itx_private, &nolwb_waiters);
+ }
+ }
+
+ list_remove(&zilog->zl_itx_commit_list, itx);
+ zil_itx_destroy(itx);
+ }
+
+ if (lwb == NULL) {
+ /*
+ * This indicates zio_alloc_zil() failed to allocate the
+ * "next" lwb on-disk. When this happens, we must stall
+ * the ZIL write pipeline; see the comment within
+ * zil_commit_writer_stall() for more details.
+ */
+ zil_commit_writer_stall(zilog);
+
+ /*
+ * Additionally, we have to signal and mark the "nolwb"
+ * waiters as "done" here, since without an lwb, we
+ * can't do this via zil_lwb_flush_vdevs_done() like
+ * normal.
+ */
+ zil_commit_waiter_t *zcw;
+		while ((zcw = list_head(&nolwb_waiters)) != NULL) {
+ zil_commit_waiter_skip(zcw);
+ list_remove(&nolwb_waiters, zcw);
+ }
+ } else {
+ ASSERT(list_is_empty(&nolwb_waiters));
+ ASSERT3P(lwb, !=, NULL);
+ ASSERT3S(lwb->lwb_state, !=, LWB_STATE_ISSUED);
+ ASSERT3S(lwb->lwb_state, !=, LWB_STATE_WRITE_DONE);
+ ASSERT3S(lwb->lwb_state, !=, LWB_STATE_FLUSH_DONE);
+
+ /*
+ * At this point, the ZIL block pointed at by the "lwb"
+ * variable is in one of the following states: "closed"
+ * or "open".
+ *
+		 * If it's "closed", then no itxs have been committed to
+		 * it, so there's no point in issuing its zio (i.e.
+		 * it's "empty").
+		 *
+		 * If it's in the "open" state, then it contains one or more
+ * itxs that eventually need to be committed to stable
+ * storage. In this case we intentionally do not issue
+ * the lwb's zio to disk yet, and instead rely on one of
+ * the following two mechanisms for issuing the zio:
+ *
+		 * 1. Ideally, there will be more ZIL activity occurring
+ * on the system, such that this function will be
+ * immediately called again (not necessarily by the same
+ * thread) and this lwb's zio will be issued via
+ * zil_lwb_commit(). This way, the lwb is guaranteed to
+ * be "full" when it is issued to disk, and we'll make
+ * use of the lwb's size the best we can.
+ *
+		 * 2. If there isn't sufficient ZIL activity occurring on
+		 * the system, such that this lwb's zio isn't issued via
+		 * zil_lwb_commit(), zil_commit_waiter() will issue the
+		 * lwb's zio. If this occurs, the lwb is not guaranteed
+		 * to be "full" by the time its zio is issued, which means
+		 * the size of the lwb was "too large" given the amount
+		 * of ZIL activity occurring on the system at that time.
+ *
+ * We do this for a couple of reasons:
+ *
+ * 1. To try and reduce the number of IOPs needed to
+ * write the same number of itxs. If an lwb has space
+		 * available in its buffer for more itxs, and more itxs
+ * will be committed relatively soon (relative to the
+ * latency of performing a write), then it's beneficial
+ * to wait for these "next" itxs. This way, more itxs
+ * can be committed to stable storage with fewer writes.
+ *
+ * 2. To try and use the largest lwb block size that the
+ * incoming rate of itxs can support. Again, this is to
+ * try and pack as many itxs into as few lwbs as
+ * possible, without significantly impacting the latency
+ * of each individual itx.
+ */
+ }
+}
+
+/*
+ * This function is responsible for ensuring the passed in commit waiter
+ * (and associated commit itx) is committed to an lwb. If the waiter is
+ * not already committed to an lwb, all itxs in the zilog's queue of
+ * itxs will be processed. The assumption is that the passed in
+ * waiter's commit itx will be found in the queue just like the other
+ * non-commit itxs, such that when the entire queue is processed, the
+ * waiter will have been committed to an lwb.
+ *
+ * The lwb associated with the passed in waiter is not guaranteed to
+ * have been issued by the time this function completes. If the lwb is
+ * not issued, we rely on future calls to zil_commit_writer() to issue
+ * the lwb, or the timeout mechanism found in zil_commit_waiter().
+ */
+static void
+zil_commit_writer(zilog_t *zilog, zil_commit_waiter_t *zcw)
+{
+ ASSERT(!MUTEX_HELD(&zilog->zl_lock));
+ ASSERT(spa_writeable(zilog->zl_spa));
+
+ mutex_enter(&zilog->zl_issuer_lock);
+
+ if (zcw->zcw_lwb != NULL || zcw->zcw_done) {
+ /*
+ * It's possible that, while we were waiting to acquire
+ * the "zl_issuer_lock", another thread committed this
+ * waiter to an lwb. If that occurs, we bail out early,
+ * without processing any of the zilog's queue of itxs.
+ *
+ * On certain workloads and system configurations, the
+ * "zl_issuer_lock" can become highly contended. In an
+ * attempt to reduce this contention, we immediately drop
+ * the lock if the waiter has already been processed.
+ *
+ * We've measured this optimization to reduce CPU spent
+ * contending on this lock by up to 5%, using a system
+ * with 32 CPUs, low latency storage (~50 usec writes),
+ * and 1024 threads performing sync writes.
+ */
+ goto out;
+ }
+
+ zil_get_commit_list(zilog);
+ zil_prune_commit_list(zilog);
+ zil_process_commit_list(zilog);
+
+out:
+ mutex_exit(&zilog->zl_issuer_lock);
+}
+
+static void
+zil_commit_waiter_timeout(zilog_t *zilog, zil_commit_waiter_t *zcw)
+{
+ ASSERT(!MUTEX_HELD(&zilog->zl_issuer_lock));
+ ASSERT(MUTEX_HELD(&zcw->zcw_lock));
+ ASSERT3B(zcw->zcw_done, ==, B_FALSE);
+
+ lwb_t *lwb = zcw->zcw_lwb;
+ ASSERT3P(lwb, !=, NULL);
+ ASSERT3S(lwb->lwb_state, !=, LWB_STATE_CLOSED);
+
+ /*
+ * If the lwb has already been issued by another thread, we can
+ * immediately return since there's no work to be done (the
+ * point of this function is to issue the lwb). Additionally, we
+ * do this prior to acquiring the zl_issuer_lock, to avoid
+ * acquiring it when it's not necessary to do so.
+ */
+ if (lwb->lwb_state == LWB_STATE_ISSUED ||
+ lwb->lwb_state == LWB_STATE_WRITE_DONE ||
+ lwb->lwb_state == LWB_STATE_FLUSH_DONE)
+ return;
+
+ /*
+ * In order to call zil_lwb_write_issue() we must hold the
+ * zilog's "zl_issuer_lock". We can't simply acquire that lock,
+ * since we're already holding the commit waiter's "zcw_lock",
+	 * and those two locks are acquired in the opposite order
+ * elsewhere.
+ */
+ mutex_exit(&zcw->zcw_lock);
+ mutex_enter(&zilog->zl_issuer_lock);
+ mutex_enter(&zcw->zcw_lock);
+
+ /*
+ * Since we just dropped and re-acquired the commit waiter's
+ * lock, we have to re-check to see if the waiter was marked
+ * "done" during that process. If the waiter was marked "done",
+ * the "lwb" pointer is no longer valid (it can be free'd after
+ * the waiter is marked "done"), so without this check we could
+ * wind up with a use-after-free error below.
+ */
+ if (zcw->zcw_done)
+ goto out;
+
+ ASSERT3P(lwb, ==, zcw->zcw_lwb);
+
+ /*
+ * We've already checked this above, but since we hadn't acquired
+ * the zilog's zl_issuer_lock, we have to perform this check a
+ * second time while holding the lock.
+ *
+ * We don't need to hold the zl_lock since the lwb cannot transition
+ * from OPENED to ISSUED while we hold the zl_issuer_lock. The lwb
+ * _can_ transition from ISSUED to DONE, but it's OK to race with
+ * that transition since we treat the lwb the same, whether it's in
+ * the ISSUED or DONE states.
+ *
+	 * The important thing is that we treat the lwb differently
+	 * depending on whether it's ISSUED or OPENED, and block any other
+	 * threads that might attempt to issue this lwb. For that reason
+	 * we hold the zl_issuer_lock when checking the lwb_state; we must
+	 * not call zil_lwb_write_issue() if the lwb had already been issued.
+ *
+ * See the comment above the lwb_state_t structure definition for
+ * more details on the lwb states, and locking requirements.
+ */
+ if (lwb->lwb_state == LWB_STATE_ISSUED ||
+ lwb->lwb_state == LWB_STATE_WRITE_DONE ||
+ lwb->lwb_state == LWB_STATE_FLUSH_DONE)
+ goto out;
+
+ ASSERT3S(lwb->lwb_state, ==, LWB_STATE_OPENED);
+
+ /*
+ * As described in the comments above zil_commit_waiter() and
+ * zil_process_commit_list(), we need to issue this lwb's zio
+ * since we've reached the commit waiter's timeout and it still
+ * hasn't been issued.
+ */
+ lwb_t *nlwb = zil_lwb_write_issue(zilog, lwb);
+
+ IMPLY(nlwb != NULL, lwb->lwb_state != LWB_STATE_OPENED);
+
+ /*
+ * Since the lwb's zio hadn't been issued by the time this thread
+ * reached its timeout, we reset the zilog's "zl_cur_used" field
+ * to influence the zil block size selection algorithm.
+ *
+ * By having to issue the lwb's zio here, it means the size of the
+ * lwb was too large, given the incoming throughput of itxs. By
+ * setting "zl_cur_used" to zero, we communicate this fact to the
+	 * block size selection algorithm, so it can take this information
+ * into account, and potentially select a smaller size for the
+ * next lwb block that is allocated.
+ */
+ zilog->zl_cur_used = 0;
+
+ if (nlwb == NULL) {
+ /*
+ * When zil_lwb_write_issue() returns NULL, this
+ * indicates zio_alloc_zil() failed to allocate the
+ * "next" lwb on-disk. When this occurs, the ZIL write
+ * pipeline must be stalled; see the comment within the
+ * zil_commit_writer_stall() function for more details.
+ *
+ * We must drop the commit waiter's lock prior to
+ * calling zil_commit_writer_stall() or else we can wind
+ * up with the following deadlock:
+ *
+		 * - This thread is waiting for the txg to sync while
+		 * holding the waiter's lock; txg_wait_synced() is
+		 * used within zil_commit_writer_stall().
+ *
+ * - The txg can't sync because it is waiting for this
+ * lwb's zio callback to call dmu_tx_commit().
+ *
+ * - The lwb's zio callback can't call dmu_tx_commit()
+ * because it's blocked trying to acquire the waiter's
+		 * lock, which occurs prior to calling dmu_tx_commit().
+ */
+ mutex_exit(&zcw->zcw_lock);
+ zil_commit_writer_stall(zilog);
+ mutex_enter(&zcw->zcw_lock);
+ }
+
+out:
+ mutex_exit(&zilog->zl_issuer_lock);
+ ASSERT(MUTEX_HELD(&zcw->zcw_lock));
+}
+
+/*
+ * This function is responsible for performing the following two tasks:
+ *
+ * 1. its primary responsibility is to block until the given "commit
+ * waiter" is considered "done".
+ *
+ * 2. its secondary responsibility is to issue the zio for the lwb that
+ * the given "commit waiter" is waiting on, if this function has
+ * waited "long enough" and the lwb is still in the "open" state.
+ *
+ * Given a sufficient number of itxs being generated and written using
+ * the ZIL, the lwb's zio will be issued via the zil_lwb_commit()
+ * function. If this does not occur, this secondary responsibility will
+ * ensure the lwb is issued even if there is no other synchronous
+ * activity on the system.
+ *
+ * For more details, see zil_process_commit_list(); more specifically,
+ * the comment at the bottom of that function.
+ */
+static void
+zil_commit_waiter(zilog_t *zilog, zil_commit_waiter_t *zcw)
+{
+ ASSERT(!MUTEX_HELD(&zilog->zl_lock));
+ ASSERT(!MUTEX_HELD(&zilog->zl_issuer_lock));
+ ASSERT(spa_writeable(zilog->zl_spa));
+
+ mutex_enter(&zcw->zcw_lock);
+
+ /*
+ * The timeout is scaled based on the lwb latency to avoid
+ * significantly impacting the latency of each individual itx.
+ * For more details, see the comment at the bottom of the
+ * zil_process_commit_list() function.
+ */
+ int pct = MAX(zfs_commit_timeout_pct, 1);
+#if defined(illumos) || !defined(_KERNEL)
+ hrtime_t sleep = (zilog->zl_last_lwb_latency * pct) / 100;
+ hrtime_t wakeup = gethrtime() + sleep;
+#else
+ sbintime_t sleep = nstosbt((zilog->zl_last_lwb_latency * pct) / 100);
+ sbintime_t wakeup = getsbinuptime() + sleep;
+#endif
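+
+	/*
+	 * As a purely illustrative example of the arithmetic above:
+	 * with zfs_commit_timeout_pct at 5 and a last observed lwb
+	 * latency of 1 ms, the computed sleep is (1000000 ns * 5) /
+	 * 100 = 50000 ns, i.e. this thread waits at most ~50 usec for
+	 * other ZIL activity to issue the lwb before issuing it itself.
+	 */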
+ boolean_t timedout = B_FALSE;
+
+ while (!zcw->zcw_done) {
+ ASSERT(MUTEX_HELD(&zcw->zcw_lock));
+
+ lwb_t *lwb = zcw->zcw_lwb;
+
+ /*
+ * Usually, the waiter will have a non-NULL lwb field here,
+ * but it's possible for it to be NULL as a result of
+ * zil_commit() racing with spa_sync().
+ *
+ * When zil_clean() is called, it's possible for the itxg
+ * list (which may be cleaned via a taskq) to contain
+ * commit itxs. When this occurs, the commit waiters linked
+ * off of these commit itxs will not be committed to an
+ * lwb. Additionally, these commit waiters will not be
+ * marked done until zil_commit_waiter_skip() is called via
+ * zil_itxg_clean().
+ *
+		 * Thus, it's possible for this commit waiter (i.e. the
+		 * "zcw" variable) to be found in this "in between" state,
+		 * where its "zcw_lwb" field is NULL and it hasn't yet
+		 * been skipped, so its "zcw_done" field is still B_FALSE.
+ */
+ IMPLY(lwb != NULL, lwb->lwb_state != LWB_STATE_CLOSED);
+
+ if (lwb != NULL && lwb->lwb_state == LWB_STATE_OPENED) {
+ ASSERT3B(timedout, ==, B_FALSE);
+
+ /*
+ * If the lwb hasn't been issued yet, then we
+ * need to wait with a timeout, in case this
+ * function needs to issue the lwb after the
+ * timeout is reached; responsibility (2) from
+ * the comment above this function.
+ */
+#if defined(illumos) || !defined(_KERNEL)
+ clock_t timeleft = cv_timedwait_hires(&zcw->zcw_cv,
+ &zcw->zcw_lock, wakeup, USEC2NSEC(1),
+ CALLOUT_FLAG_ABSOLUTE);
+
+ if (timeleft >= 0 || zcw->zcw_done)
+ continue;
+#else
+ int wait_err = cv_timedwait_sbt(&zcw->zcw_cv,
+ &zcw->zcw_lock, wakeup, SBT_1NS, C_ABSOLUTE);
+ if (wait_err != EWOULDBLOCK || zcw->zcw_done)
+ continue;
+#endif
+
+ timedout = B_TRUE;
+ zil_commit_waiter_timeout(zilog, zcw);
+
+ if (!zcw->zcw_done) {
+ /*
+ * If the commit waiter has already been
+ * marked "done", it's possible for the
+ * waiter's lwb structure to have already
+ * been freed. Thus, we can only reliably
+ * make these assertions if the waiter
+ * isn't done.
+ */
+ ASSERT3P(lwb, ==, zcw->zcw_lwb);
+ ASSERT3S(lwb->lwb_state, !=, LWB_STATE_OPENED);
+ }
+ } else {
+ /*
+ * If the lwb isn't open, then it must have already
+ * been issued. In that case, there's no need to
+ * use a timeout when waiting for the lwb to
+ * complete.
+ *
+ * Additionally, if the lwb is NULL, the waiter
+ * will soon be signalled and marked done via
+ * zil_clean() and zil_itxg_clean(), so no timeout
+ * is required.
+ */
+
+ IMPLY(lwb != NULL,
+ lwb->lwb_state == LWB_STATE_ISSUED ||
+ lwb->lwb_state == LWB_STATE_WRITE_DONE ||
+ lwb->lwb_state == LWB_STATE_FLUSH_DONE);
+ cv_wait(&zcw->zcw_cv, &zcw->zcw_lock);
+ }
+ }
+
+ mutex_exit(&zcw->zcw_lock);
+}
+
+static zil_commit_waiter_t *
+zil_alloc_commit_waiter(void)
+{
+ zil_commit_waiter_t *zcw = kmem_cache_alloc(zil_zcw_cache, KM_SLEEP);
+
+ cv_init(&zcw->zcw_cv, NULL, CV_DEFAULT, NULL);
+ mutex_init(&zcw->zcw_lock, NULL, MUTEX_DEFAULT, NULL);
+ list_link_init(&zcw->zcw_node);
+ zcw->zcw_lwb = NULL;
+ zcw->zcw_done = B_FALSE;
+ zcw->zcw_zio_error = 0;
+
+ return (zcw);
+}
+
+static void
+zil_free_commit_waiter(zil_commit_waiter_t *zcw)
+{
+ ASSERT(!list_link_active(&zcw->zcw_node));
+ ASSERT3P(zcw->zcw_lwb, ==, NULL);
+ ASSERT3B(zcw->zcw_done, ==, B_TRUE);
+ mutex_destroy(&zcw->zcw_lock);
+ cv_destroy(&zcw->zcw_cv);
+ kmem_cache_free(zil_zcw_cache, zcw);
+}
+
+/*
+ * This function is used to create a TX_COMMIT itx, and assign it to an
+ * open txg. This way, it will be linked into the ZIL's list of
+ * synchronous itxs, and then later committed to an lwb (or skipped)
+ * when zil_process_commit_list() is called.
+ */
+static void
+zil_commit_itx_assign(zilog_t *zilog, zil_commit_waiter_t *zcw)
+{
+ dmu_tx_t *tx = dmu_tx_create(zilog->zl_os);
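+
+	/*
+	 * Note that with TXG_WAIT, dmu_tx_assign() blocks until the tx
+	 * can be assigned to an open txg rather than returning an
+	 * error, so a non-zero return would indicate a bug; hence the
+	 * VERIFY0 below.
+	 */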
+ VERIFY0(dmu_tx_assign(tx, TXG_WAIT));
+
+ itx_t *itx = zil_itx_create(TX_COMMIT, sizeof (lr_t));
+ itx->itx_sync = B_TRUE;
+ itx->itx_private = zcw;
+
+ zil_itx_assign(zilog, itx, tx);
+
+ dmu_tx_commit(tx);
+}
+
+/*
+ * Commit ZFS Intent Log transactions (itxs) to stable storage.
+ *
+ * When writing ZIL transactions to the on-disk representation of the
+ * ZIL, the itxs are committed to a Log Write Block (lwb). Multiple
+ * itxs can be committed to a single lwb. Once a lwb is written and
+ * committed to stable storage (i.e. the lwb is written, and vdevs have
+ * been flushed), each itx that was committed to that lwb is also
+ * considered to be committed to stable storage.
+ *
+ * When an itx is committed to an lwb, the log record (lr_t) contained
+ * by the itx is copied into the lwb's zio buffer, and once this buffer
+ * is written to disk, it becomes an on-disk ZIL block.
+ *
+ * As itxs are generated, they're inserted into the ZIL's queue of
+ * uncommitted itxs. The semantics of zil_commit() are such that it will
+ * block until all itxs that were in the queue when it was called are
+ * committed to stable storage.
+ *
+ * If "foid" is zero, this means all "synchronous" and "asynchronous"
+ * itxs, for all objects in the dataset, will be committed to stable
+ * storage prior to zil_commit() returning. If "foid" is non-zero, all
+ * "synchronous" itxs for all objects, but only "asynchronous" itxs
+ * that correspond to the foid passed in, will be committed to stable
+ * storage prior to zil_commit() returning.
+ *
+ * Generally speaking, when zil_commit() is called, the consumer doesn't
+ * actually care about _all_ of the uncommitted itxs. Instead, they're
+ * simply trying to wait for a specific itx to be committed to disk,
+ * but the interface(s) for interacting with the ZIL don't allow such
+ * fine-grained communication. A better interface would allow a consumer
+ * to create and assign an itx, and then pass a reference to this itx to
+ * zil_commit(); such that zil_commit() would return as soon as that
+ * specific itx was committed to disk (instead of waiting for _all_
+ * itxs to be committed).
+ *
+ * When a thread calls zil_commit() a special "commit itx" will be
+ * generated, along with a corresponding "waiter" for this commit itx.
+ * zil_commit() will wait on this waiter's CV, such that when the waiter
+ * is marked done, and signalled, zil_commit() will return.
+ *
+ * This commit itx is inserted into the queue of uncommitted itxs. This
+ * provides an easy mechanism for determining which itxs were in the
+ * queue prior to zil_commit() having been called, and which itxs were
+ * added after zil_commit() was called.
+ *
+ * The commit itx is special; it doesn't have any on-disk representation.
+ * When a commit itx is "committed" to an lwb, the waiter associated
+ * with it is linked onto the lwb's list of waiters. Then, when that lwb
+ * completes, each waiter on the lwb's list is marked done and signalled
+ * -- allowing the thread waiting on the waiter to return from zil_commit().
+ *
+ * It's important to point out a few critical factors that allow us
+ * to make use of the commit itxs, commit waiters, per-lwb lists of
+ * commit waiters, and zio completion callbacks like we're doing:
+ *
+ * 1. The list of waiters for each lwb is traversed, and each commit
+ * waiter is marked "done" and signalled, in the zio completion
+ * callback of the lwb's zio[*].
+ *
+ * * Actually, the waiters are signalled in the zio completion
+ * callback of the root zio for the DKIOCFLUSHWRITECACHE commands
+ * that are sent to the vdevs upon completion of the lwb zio.
+ *
+ * 2. When the itxs are inserted into the ZIL's queue of uncommitted
+ * itxs, the order in which they are inserted is preserved[*]; as
+ * itxs are added to the queue, they are added to the tail of
+ * in-memory linked lists.
+ *
+ * When committing the itxs to lwbs (to be written to disk), they
+ * are committed in the same order in which the itxs were added to
+ * the uncommitted queue's linked list(s); i.e. the linked list of
+ * itxs to commit is traversed from head to tail, and each itx is
+ * committed to an lwb in that order.
+ *
+ * * To clarify:
+ *
+ * - the order of "sync" itxs is preserved w.r.t. other
+ * "sync" itxs, regardless of the corresponding objects.
+ * - the order of "async" itxs is preserved w.r.t. other
+ * "async" itxs corresponding to the same object.
+ * - the order of "async" itxs is *not* preserved w.r.t. other
+ * "async" itxs corresponding to different objects.
+ * - the order of "sync" itxs w.r.t. "async" itxs (or vice
+ * versa) is *not* preserved, even for itxs that correspond
+ * to the same object.
+ *
+ * For more details, see: zil_itx_assign(), zil_async_to_sync(),
+ * zil_get_commit_list(), and zil_process_commit_list().
+ *
+ * 3. The lwbs represent a linked list of blocks on disk. Thus, any
+ * lwb cannot be considered committed to stable storage, until its
+ * "previous" lwb is also committed to stable storage. This fact,
+ * coupled with the fact described above, means that itxs are
+ * committed in (roughly) the order in which they were generated.
+ * This is essential because itxs are dependent on prior itxs.
+ * Thus, we *must not* deem an itx as being committed to stable
+ * storage, until *all* prior itxs have also been committed to
+ * stable storage.
+ *
+ * To enforce this ordering of lwb zio's, while still leveraging as
+ * much of the underlying storage performance as possible, we rely
+ * on two fundamental concepts:
+ *
+ * 1. The creation and issuance of lwb zio's is protected by
+ * the zilog's "zl_issuer_lock", which ensures only a single
+ * thread is creating and/or issuing lwb's at a time
+ * 2. The "previous" lwb is a child of the "current" lwb
+ * (leveraging the zio parent-child dependency graph)
+ *
+ * By relying on this parent-child zio relationship, we can have
+ * many lwb zio's concurrently issued to the underlying storage,
+ * but the order in which they complete will be the same order in
+ * which they were created.
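+ *
+ * For example, if lwb "A" is issued, and then lwb "B" is issued
+ * while "A" is still in flight, "A"'s zio is registered as a child
+ * of "B"'s zio. Even if "B"'s write happens to complete first at
+ * the device, "B"'s completion callback (which signals "B"'s
+ * waiters) will not run until "A" has also completed.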
+ */
+void
+zil_commit(zilog_t *zilog, uint64_t foid)
+{
+ /*
+ * We should never attempt to call zil_commit on a snapshot for
+ * a couple of reasons:
+ *
+ * 1. A snapshot may never be modified, thus it cannot have any
+ * in-flight itxs that would have modified the dataset.
+ *
+ * 2. By design, when zil_commit() is called, a commit itx will
+ * be assigned to this zilog; as a result, the zilog will be
+	 * dirtied. We must not dirty the zilog of a snapshot; there are
+	 * checks in the code that enforce this invariant, and they will
+	 * cause a panic if it's not upheld.
+ */
+ ASSERT3B(dmu_objset_is_snapshot(zilog->zl_os), ==, B_FALSE);
+
+ if (zilog->zl_sync == ZFS_SYNC_DISABLED)
+ return;
+
+ if (!spa_writeable(zilog->zl_spa)) {
+ /*
+ * If the SPA is not writable, there should never be any
+ * pending itxs waiting to be committed to disk. If that
+ * weren't true, we'd skip writing those itxs out, and
+		 * would break the semantics of zil_commit(); thus, we're
+ * verifying that truth before we return to the caller.
+ */
+ ASSERT(list_is_empty(&zilog->zl_lwb_list));
+ ASSERT3P(zilog->zl_last_lwb_opened, ==, NULL);
+ for (int i = 0; i < TXG_SIZE; i++)
+ ASSERT3P(zilog->zl_itxg[i].itxg_itxs, ==, NULL);
+ return;
+ }
+
+ /*
+ * If the ZIL is suspended, we don't want to dirty it by calling
+	 * zil_commit_itx_assign() below, nor can we write out
+	 * lwbs as would be done in zil_commit_writer(). Thus, we
+ * simply rely on txg_wait_synced() to maintain the necessary
+ * semantics, and avoid calling those functions altogether.
+ */
+ if (zilog->zl_suspend > 0) {
+ txg_wait_synced(zilog->zl_dmu_pool, 0);
+ return;
+ }
+
+ zil_commit_impl(zilog, foid);
+}
+
+void
+zil_commit_impl(zilog_t *zilog, uint64_t foid)
+{
+ /*
+ * Move the "async" itxs for the specified foid to the "sync"
+ * queues, such that they will be later committed (or skipped)
+ * to an lwb when zil_process_commit_list() is called.
+ *
+ * Since these "async" itxs must be committed prior to this
+	 * call to zil_commit() returning, we must perform this operation
+ * before we call zil_commit_itx_assign().
+ */
+ zil_async_to_sync(zilog, foid);
+
+ /*
+ * We allocate a new "waiter" structure which will initially be
+ * linked to the commit itx using the itx's "itx_private" field.
+ * Since the commit itx doesn't represent any on-disk state,
+	 * when it's committed to an lwb, rather than copying its lr_t
+	 * into the lwb's buffer, the commit itx's "waiter" will be
+ * added to the lwb's list of waiters. Then, when the lwb is
+ * committed to stable storage, each waiter in the lwb's list of
+ * waiters will be marked "done", and signalled.
+ *
+ * We must create the waiter and assign the commit itx prior to
+ * calling zil_commit_writer(), or else our specific commit itx
+ * is not guaranteed to be committed to an lwb prior to calling
+ * zil_commit_waiter().
+ */
+ zil_commit_waiter_t *zcw = zil_alloc_commit_waiter();
+ zil_commit_itx_assign(zilog, zcw);
+
+ zil_commit_writer(zilog, zcw);
+ zil_commit_waiter(zilog, zcw);
+
+ if (zcw->zcw_zio_error != 0) {
+ /*
+ * If there was an error writing out the ZIL blocks that
+ * this thread is waiting on, then we fallback to
+ * relying on spa_sync() to write out the data this
+ * thread is waiting on. Obviously this has performance
+ * implications, but the expectation is for this to be
+ * an exceptional case, and shouldn't occur often.
+ */
+ DTRACE_PROBE2(zil__commit__io__error,
+ zilog_t *, zilog, zil_commit_waiter_t *, zcw);
+ txg_wait_synced(zilog->zl_dmu_pool, 0);
+ }
+
+ zil_free_commit_waiter(zcw);
+}
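+
+/*
+ * Illustrative usage (a hypothetical consumer, not part of this change):
+ * an fsync-like code path that has already logged itxs for its dirty
+ * state might block on them with something like:
+ *
+ *	zilog_t *zilog = dmu_objset_zil(os);
+ *	zil_commit(zilog, object_id);
+ *
+ * where "object_id" is a placeholder for the object whose "async" itxs
+ * must also be committed, or 0 to commit them all.
+ */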
+
+/*
+ * Called in syncing context to free committed log blocks and update log header.
+ */
+void
+zil_sync(zilog_t *zilog, dmu_tx_t *tx)
+{
+ zil_header_t *zh = zil_header_in_syncing_context(zilog);
+ uint64_t txg = dmu_tx_get_txg(tx);
+ spa_t *spa = zilog->zl_spa;
+ uint64_t *replayed_seq = &zilog->zl_replayed_seq[txg & TXG_MASK];
+ lwb_t *lwb;
+
+ /*
+ * We don't zero out zl_destroy_txg, so make sure we don't try
+ * to destroy it twice.
+ */
+ if (spa_sync_pass(spa) != 1)
+ return;
+
+ mutex_enter(&zilog->zl_lock);
+
+ ASSERT(zilog->zl_stop_sync == 0);
+
+ if (*replayed_seq != 0) {
+ ASSERT(zh->zh_replay_seq < *replayed_seq);
+ zh->zh_replay_seq = *replayed_seq;
+ *replayed_seq = 0;
+ }
+
+ if (zilog->zl_destroy_txg == txg) {
+ blkptr_t blk = zh->zh_log;
+
+ ASSERT(list_head(&zilog->zl_lwb_list) == NULL);
+
+ bzero(zh, sizeof (zil_header_t));
+ bzero(zilog->zl_replayed_seq, sizeof (zilog->zl_replayed_seq));
+
+ if (zilog->zl_keep_first) {
+ /*
+			 * If this block was part of a log chain that couldn't
+			 * be claimed because a device was missing during
+ * zil_claim(), but that device later returns,
+ * then this block could erroneously appear valid.
+ * To guard against this, assign a new GUID to the new
+ * log chain so it doesn't matter what blk points to.
+ */
+ zil_init_log_chain(zilog, &blk);
+ zh->zh_log = blk;
+ }
+ }
+
+ while ((lwb = list_head(&zilog->zl_lwb_list)) != NULL) {
+ zh->zh_log = lwb->lwb_blk;
+ if (lwb->lwb_buf != NULL || lwb->lwb_max_txg > txg)
+ break;
+ list_remove(&zilog->zl_lwb_list, lwb);
+ zio_free(spa, txg, &lwb->lwb_blk);
+ zil_free_lwb(zilog, lwb);
+
+ /*
+ * If we don't have anything left in the lwb list then
+ * we've had an allocation failure and we need to zero
+ * out the zil_header blkptr so that we don't end
+ * up freeing the same block twice.
+ */
+ if (list_head(&zilog->zl_lwb_list) == NULL)
+ BP_ZERO(&zh->zh_log);
+ }
+ mutex_exit(&zilog->zl_lock);
+}
+
+/* ARGSUSED */
+static int
+zil_lwb_cons(void *vbuf, void *unused, int kmflag)
+{
+ lwb_t *lwb = vbuf;
+ list_create(&lwb->lwb_waiters, sizeof (zil_commit_waiter_t),
+ offsetof(zil_commit_waiter_t, zcw_node));
+ avl_create(&lwb->lwb_vdev_tree, zil_lwb_vdev_compare,
+ sizeof (zil_vdev_node_t), offsetof(zil_vdev_node_t, zv_node));
+ mutex_init(&lwb->lwb_vdev_lock, NULL, MUTEX_DEFAULT, NULL);
+ return (0);
+}
+
+/* ARGSUSED */
+static void
+zil_lwb_dest(void *vbuf, void *unused)
+{
+ lwb_t *lwb = vbuf;
+ mutex_destroy(&lwb->lwb_vdev_lock);
+ avl_destroy(&lwb->lwb_vdev_tree);
+ list_destroy(&lwb->lwb_waiters);
+}
+
+void
+zil_init(void)
+{
+ zil_lwb_cache = kmem_cache_create("zil_lwb_cache",
+ sizeof (lwb_t), 0, zil_lwb_cons, zil_lwb_dest, NULL, NULL, NULL, 0);
+
+ zil_zcw_cache = kmem_cache_create("zil_zcw_cache",
+ sizeof (zil_commit_waiter_t), 0, NULL, NULL, NULL, NULL, NULL, 0);
+}
+
+void
+zil_fini(void)
+{
+ kmem_cache_destroy(zil_zcw_cache);
+ kmem_cache_destroy(zil_lwb_cache);
+}
+
+void
+zil_set_sync(zilog_t *zilog, uint64_t sync)
+{
+ zilog->zl_sync = sync;
+}
+
+void
+zil_set_logbias(zilog_t *zilog, uint64_t logbias)
+{
+ zilog->zl_logbias = logbias;
+}
+
+zilog_t *
+zil_alloc(objset_t *os, zil_header_t *zh_phys)
+{
+ zilog_t *zilog;
+
+ zilog = kmem_zalloc(sizeof (zilog_t), KM_SLEEP);
+
+ zilog->zl_header = zh_phys;
+ zilog->zl_os = os;
+ zilog->zl_spa = dmu_objset_spa(os);
+ zilog->zl_dmu_pool = dmu_objset_pool(os);
+ zilog->zl_destroy_txg = TXG_INITIAL - 1;
+ zilog->zl_logbias = dmu_objset_logbias(os);
+ zilog->zl_sync = dmu_objset_syncprop(os);
+ zilog->zl_dirty_max_txg = 0;
+ zilog->zl_last_lwb_opened = NULL;
+ zilog->zl_last_lwb_latency = 0;
+ zilog->zl_max_block_size = zil_maxblocksize;
+
+ mutex_init(&zilog->zl_lock, NULL, MUTEX_DEFAULT, NULL);
+ mutex_init(&zilog->zl_issuer_lock, NULL, MUTEX_DEFAULT, NULL);
+
+ for (int i = 0; i < TXG_SIZE; i++) {
+ mutex_init(&zilog->zl_itxg[i].itxg_lock, NULL,
+ MUTEX_DEFAULT, NULL);
+ }
+
+ list_create(&zilog->zl_lwb_list, sizeof (lwb_t),
+ offsetof(lwb_t, lwb_node));
+
+ list_create(&zilog->zl_itx_commit_list, sizeof (itx_t),
+ offsetof(itx_t, itx_node));
+
+ cv_init(&zilog->zl_cv_suspend, NULL, CV_DEFAULT, NULL);
+
+ return (zilog);
+}
+
+void
+zil_free(zilog_t *zilog)
+{
+ zilog->zl_stop_sync = 1;
+
+ ASSERT0(zilog->zl_suspend);
+ ASSERT0(zilog->zl_suspending);
+
+ ASSERT(list_is_empty(&zilog->zl_lwb_list));
+ list_destroy(&zilog->zl_lwb_list);
+
+ ASSERT(list_is_empty(&zilog->zl_itx_commit_list));
+ list_destroy(&zilog->zl_itx_commit_list);
+
+ for (int i = 0; i < TXG_SIZE; i++) {
+ /*
+ * It's possible for an itx to be generated that doesn't dirty
+ * a txg (e.g. ztest TX_TRUNCATE). So there's no zil_clean()
+ * callback to remove the entry. We remove those here.
+ *
+ * Also free up the ziltest itxs.
+ */
+ if (zilog->zl_itxg[i].itxg_itxs)
+ zil_itxg_clean(zilog->zl_itxg[i].itxg_itxs);
+ mutex_destroy(&zilog->zl_itxg[i].itxg_lock);
+ }
+
+ mutex_destroy(&zilog->zl_issuer_lock);
+ mutex_destroy(&zilog->zl_lock);
+
+ cv_destroy(&zilog->zl_cv_suspend);
+
+ kmem_free(zilog, sizeof (zilog_t));
+}
+
+/*
+ * Open an intent log.
+ */
+zilog_t *
+zil_open(objset_t *os, zil_get_data_t *get_data)
+{
+ zilog_t *zilog = dmu_objset_zil(os);
+
+ ASSERT3P(zilog->zl_get_data, ==, NULL);
+ ASSERT3P(zilog->zl_last_lwb_opened, ==, NULL);
+ ASSERT(list_is_empty(&zilog->zl_lwb_list));
+
+ zilog->zl_get_data = get_data;
+
+ return (zilog);
+}
+
+/*
+ * Close an intent log.
+ */
+void
+zil_close(zilog_t *zilog)
+{
+ lwb_t *lwb;
+ uint64_t txg;
+
+ if (!dmu_objset_is_snapshot(zilog->zl_os)) {
+ zil_commit(zilog, 0);
+ } else {
+ ASSERT3P(list_tail(&zilog->zl_lwb_list), ==, NULL);
+ ASSERT0(zilog->zl_dirty_max_txg);
+ ASSERT3B(zilog_is_dirty(zilog), ==, B_FALSE);
+ }
+
+ mutex_enter(&zilog->zl_lock);
+ lwb = list_tail(&zilog->zl_lwb_list);
+ if (lwb == NULL)
+ txg = zilog->zl_dirty_max_txg;
+ else
+ txg = MAX(zilog->zl_dirty_max_txg, lwb->lwb_max_txg);
+ mutex_exit(&zilog->zl_lock);
+
+ /*
+ * We need to use txg_wait_synced() to wait long enough for the
+ * ZIL to be clean, and to wait for all pending lwbs to be
+ * written out.
+ */
+ if (txg)
+ txg_wait_synced(zilog->zl_dmu_pool, txg);
+
+ if (zilog_is_dirty(zilog))
+ zfs_dbgmsg("zil (%p) is dirty, txg %llu", zilog, txg);
+ if (txg < spa_freeze_txg(zilog->zl_spa))
+ VERIFY(!zilog_is_dirty(zilog));
+
+ zilog->zl_get_data = NULL;
+
+ /*
+ * We should have only one lwb left on the list; remove it now.
+ */
+ mutex_enter(&zilog->zl_lock);
+ lwb = list_head(&zilog->zl_lwb_list);
+ if (lwb != NULL) {
+ ASSERT3P(lwb, ==, list_tail(&zilog->zl_lwb_list));
+ ASSERT3S(lwb->lwb_state, !=, LWB_STATE_ISSUED);
+ list_remove(&zilog->zl_lwb_list, lwb);
+ zio_buf_free(lwb->lwb_buf, lwb->lwb_sz);
+ zil_free_lwb(zilog, lwb);
+ }
+ mutex_exit(&zilog->zl_lock);
+}
+
+static char *suspend_tag = "zil suspending";
+
+/*
+ * Suspend an intent log. While in suspended mode, we still honor
+ * synchronous semantics, but we rely on txg_wait_synced() to do it.
+ * On old version pools, we suspend the log briefly when taking a
+ * snapshot so that it will have an empty intent log.
+ *
+ * Long holds are not really intended to be used the way we do here --
+ * held for such a short time. A concurrent caller of dsl_dataset_long_held()
+ * could fail. Therefore we take pains to only put a long hold if it is
+ * actually necessary. Fortunately, it will only be necessary if the
+ * objset is currently mounted (or the ZVOL equivalent). In that case it
+ * will already have a long hold, so we are not really making things any worse.
+ *
+ * Ideally, we would locate the existing long-holder (i.e. the zfsvfs_t or
+ * zvol_state_t), and use their mechanism to prevent their hold from being
+ * dropped (e.g. VFS_HOLD()). However, that would be even more pain for
+ * very little gain.
+ *
+ * If cookiep == NULL, this does both the suspend & resume.
+ * Otherwise, it returns with the dataset "long held", and the cookie
+ * should be passed into zil_resume().
+ */
+int
+zil_suspend(const char *osname, void **cookiep)
+{
+ objset_t *os;
+ zilog_t *zilog;
+ const zil_header_t *zh;
+ int error;
+
+ error = dmu_objset_hold(osname, suspend_tag, &os);
+ if (error != 0)
+ return (error);
+ zilog = dmu_objset_zil(os);
+
+ mutex_enter(&zilog->zl_lock);
+ zh = zilog->zl_header;
+
+ if (zh->zh_flags & ZIL_REPLAY_NEEDED) { /* unplayed log */
+ mutex_exit(&zilog->zl_lock);
+ dmu_objset_rele(os, suspend_tag);
+ return (SET_ERROR(EBUSY));
+ }
+
+ /*
+ * Don't put a long hold in the cases where we can avoid it. This
+ * is when there is no cookie so we are doing a suspend & resume
+ * (i.e. called from zil_vdev_offline()), and there's nothing to do
+ * for the suspend because it's already suspended, or there's no ZIL.
+ */
+ if (cookiep == NULL && !zilog->zl_suspending &&
+ (zilog->zl_suspend > 0 || BP_IS_HOLE(&zh->zh_log))) {
+ mutex_exit(&zilog->zl_lock);
+ dmu_objset_rele(os, suspend_tag);
+ return (0);
+ }
+
+ dsl_dataset_long_hold(dmu_objset_ds(os), suspend_tag);
+ dsl_pool_rele(dmu_objset_pool(os), suspend_tag);
+
+ zilog->zl_suspend++;
+
+ if (zilog->zl_suspend > 1) {
+ /*
+ * Someone else is already suspending it.
+ * Just wait for them to finish.
+ */
+
+ while (zilog->zl_suspending)
+ cv_wait(&zilog->zl_cv_suspend, &zilog->zl_lock);
+ mutex_exit(&zilog->zl_lock);
+
+ if (cookiep == NULL)
+ zil_resume(os);
+ else
+ *cookiep = os;
+ return (0);
+ }
+
+ /*
+ * If there is no pointer to an on-disk block, this ZIL must not
+ * be active (e.g. filesystem not mounted), so there's nothing
+ * to clean up.
+ */
+ if (BP_IS_HOLE(&zh->zh_log)) {
+ ASSERT(cookiep != NULL); /* fast path already handled */
+
+ *cookiep = os;
+ mutex_exit(&zilog->zl_lock);
+ return (0);
+ }
+
+ zilog->zl_suspending = B_TRUE;
+ mutex_exit(&zilog->zl_lock);
+
+ /*
+ * We need to use zil_commit_impl to ensure we wait for all
+ * LWB_STATE_OPENED and LWB_STATE_ISSUED lwb's to be committed
+ * to disk before proceeding. If we used zil_commit instead, it
+ * would just call txg_wait_synced(), because zl_suspend is set.
+ * txg_wait_synced() doesn't wait for these lwb's to be
+ * LWB_STATE_FLUSH_DONE before returning.
+ */
+ zil_commit_impl(zilog, 0);
+
+ /*
+ * Now that we've ensured all lwb's are LWB_STATE_FLUSH_DONE, we
+ * use txg_wait_synced() to ensure the data from the zilog has
+ * migrated to the main pool before calling zil_destroy().
+ */
+ txg_wait_synced(zilog->zl_dmu_pool, 0);
+
+ zil_destroy(zilog, B_FALSE);
+
+ mutex_enter(&zilog->zl_lock);
+ zilog->zl_suspending = B_FALSE;
+ cv_broadcast(&zilog->zl_cv_suspend);
+ mutex_exit(&zilog->zl_lock);
+
+ if (cookiep == NULL)
+ zil_resume(os);
+ else
+ *cookiep = os;
+ return (0);
+}
+
+void
+zil_resume(void *cookie)
+{
+ objset_t *os = cookie;
+ zilog_t *zilog = dmu_objset_zil(os);
+
+ mutex_enter(&zilog->zl_lock);
+ ASSERT(zilog->zl_suspend != 0);
+ zilog->zl_suspend--;
+ mutex_exit(&zilog->zl_lock);
+ dsl_dataset_long_rele(dmu_objset_ds(os), suspend_tag);
+ dsl_dataset_rele(dmu_objset_ds(os), suspend_tag);
+}
+
+typedef struct zil_replay_arg {
+ zil_replay_func_t **zr_replay;
+ void *zr_arg;
+ boolean_t zr_byteswap;
+ char *zr_lr;
+} zil_replay_arg_t;
+
+static int
+zil_replay_error(zilog_t *zilog, lr_t *lr, int error)
+{
+ char name[ZFS_MAX_DATASET_NAME_LEN];
+
+ zilog->zl_replaying_seq--; /* didn't actually replay this one */
+
+ dmu_objset_name(zilog->zl_os, name);
+
+ cmn_err(CE_WARN, "ZFS replay transaction error %d, "
+ "dataset %s, seq 0x%llx, txtype %llu %s\n", error, name,
+ (u_longlong_t)lr->lrc_seq,
+ (u_longlong_t)(lr->lrc_txtype & ~TX_CI),
+ (lr->lrc_txtype & TX_CI) ? "CI" : "");
+
+ return (error);
+}
+
+static int
+zil_replay_log_record(zilog_t *zilog, lr_t *lr, void *zra, uint64_t claim_txg)
+{
+ zil_replay_arg_t *zr = zra;
+ const zil_header_t *zh = zilog->zl_header;
+ uint64_t reclen = lr->lrc_reclen;
+ uint64_t txtype = lr->lrc_txtype;
+ int error = 0;
+
+ zilog->zl_replaying_seq = lr->lrc_seq;
+
+ if (lr->lrc_seq <= zh->zh_replay_seq) /* already replayed */
+ return (0);
+
+ if (lr->lrc_txg < claim_txg) /* already committed */
+ return (0);
+
+ /* Strip case-insensitive bit, still present in log record */
+ txtype &= ~TX_CI;
+
+ if (txtype == 0 || txtype >= TX_MAX_TYPE)
+ return (zil_replay_error(zilog, lr, EINVAL));
+
+ /*
+ * If this record type can be logged out of order, the object
+ * (lr_foid) may no longer exist. That's legitimate, not an error.
+ */
+ if (TX_OOO(txtype)) {
+ error = dmu_object_info(zilog->zl_os,
+ LR_FOID_GET_OBJ(((lr_ooo_t *)lr)->lr_foid), NULL);
+ if (error == ENOENT || error == EEXIST)
+ return (0);
+ }
+
+ /*
+ * Make a copy of the data so we can revise and extend it.
+ */
+ bcopy(lr, zr->zr_lr, reclen);
+
+ /*
+ * If this is a TX_WRITE with a blkptr, suck in the data.
+ */
+ if (txtype == TX_WRITE && reclen == sizeof (lr_write_t)) {
+ error = zil_read_log_data(zilog, (lr_write_t *)lr,
+ zr->zr_lr + reclen);
+ if (error != 0)
+ return (zil_replay_error(zilog, lr, error));
+ }
+
+ /*
+ * The log block containing this lr may have been byteswapped
+ * so that we can easily examine common fields like lrc_txtype.
+ * However, the log is a mix of different record types, and only the
+ * replay vectors know how to byteswap their records. Therefore, if
+ * the lr was byteswapped, undo it before invoking the replay vector.
+ */
+ if (zr->zr_byteswap)
+ byteswap_uint64_array(zr->zr_lr, reclen);
+
+ /*
+ * We must now do two things atomically: replay this log record,
+ * and update the log header sequence number to reflect the fact that
+ * we did so. At the end of each replay function the sequence number
+ * is updated if we are in replay mode.
+ */
+ error = zr->zr_replay[txtype](zr->zr_arg, zr->zr_lr, zr->zr_byteswap);
+ if (error != 0) {
+ /*
+ * The DMU's dnode layer doesn't see removes until the txg
+ * commits, so a subsequent claim can spuriously fail with
+		 * EEXIST. So if we receive any error, we try syncing out
+		 * any removes and then retry the transaction. Note that we
+ * specify B_FALSE for byteswap now, so we don't do it twice.
+ */
+ txg_wait_synced(spa_get_dsl(zilog->zl_spa), 0);
+ error = zr->zr_replay[txtype](zr->zr_arg, zr->zr_lr, B_FALSE);
+ if (error != 0)
+ return (zil_replay_error(zilog, lr, error));
+ }
+ return (0);
+}
+
+/* ARGSUSED */
+static int
+zil_incr_blks(zilog_t *zilog, blkptr_t *bp, void *arg, uint64_t claim_txg)
+{
+ zilog->zl_replay_blks++;
+
+ return (0);
+}
+
+/*
+ * If this dataset has a non-empty intent log, replay it and destroy it.
+ */
+void
+zil_replay(objset_t *os, void *arg, zil_replay_func_t *replay_func[TX_MAX_TYPE])
+{
+ zilog_t *zilog = dmu_objset_zil(os);
+ const zil_header_t *zh = zilog->zl_header;
+ zil_replay_arg_t zr;
+
+ if ((zh->zh_flags & ZIL_REPLAY_NEEDED) == 0) {
+ zil_destroy(zilog, B_TRUE);
+ return;
+ }
+
+ zr.zr_replay = replay_func;
+ zr.zr_arg = arg;
+ zr.zr_byteswap = BP_SHOULD_BYTESWAP(&zh->zh_log);
+ zr.zr_lr = kmem_alloc(2 * SPA_MAXBLOCKSIZE, KM_SLEEP);
+
+ /*
+ * Wait for in-progress removes to sync before starting replay.
+ */
+ txg_wait_synced(zilog->zl_dmu_pool, 0);
+
+ zilog->zl_replay = B_TRUE;
+ zilog->zl_replay_time = ddi_get_lbolt();
+ ASSERT(zilog->zl_replay_blks == 0);
+ (void) zil_parse(zilog, zil_incr_blks, zil_replay_log_record, &zr,
+ zh->zh_claim_txg);
+ kmem_free(zr.zr_lr, 2 * SPA_MAXBLOCKSIZE);
+
+ zil_destroy(zilog, B_FALSE);
+ txg_wait_synced(zilog->zl_dmu_pool, zilog->zl_destroy_txg);
+ zilog->zl_replay = B_FALSE;
+}
+
+boolean_t
+zil_replaying(zilog_t *zilog, dmu_tx_t *tx)
+{
+ if (zilog->zl_sync == ZFS_SYNC_DISABLED)
+ return (B_TRUE);
+
+ if (zilog->zl_replay) {
+ dsl_dataset_dirty(dmu_objset_ds(zilog->zl_os), tx);
+ zilog->zl_replayed_seq[dmu_tx_get_txg(tx) & TXG_MASK] =
+ zilog->zl_replaying_seq;
+ return (B_TRUE);
+ }
+
+ return (B_FALSE);
+}
+
+/* ARGSUSED */
+int
+zil_reset(const char *osname, void *arg)
+{
+ int error;
+
+ error = zil_suspend(osname, NULL);
+ if (error != 0)
+ return (SET_ERROR(EEXIST));
+ return (0);
+}
diff --git a/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/zio.c b/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/zio.c
new file mode 100644
index 000000000000..a026b3bfe02d
--- /dev/null
+++ b/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/zio.c
@@ -0,0 +1,4386 @@
+/*
+ * CDDL HEADER START
+ *
+ * The contents of this file are subject to the terms of the
+ * Common Development and Distribution License (the "License").
+ * You may not use this file except in compliance with the License.
+ *
+ * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
+ * or http://www.opensolaris.org/os/licensing.
+ * See the License for the specific language governing permissions
+ * and limitations under the License.
+ *
+ * When distributing Covered Code, include this CDDL HEADER in each
+ * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
+ * If applicable, add the following below this CDDL HEADER, with the
+ * fields enclosed by brackets "[]" replaced with your own identifying
+ * information: Portions Copyright [yyyy] [name of copyright owner]
+ *
+ * CDDL HEADER END
+ */
+/*
+ * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved.
+ * Copyright (c) 2011, 2018 by Delphix. All rights reserved.
+ * Copyright (c) 2011 Nexenta Systems, Inc. All rights reserved.
+ * Copyright (c) 2014 Integros [integros.com]
+ * Copyright (c) 2017, Intel Corporation.
+ */
+
+#include <sys/sysmacros.h>
+#include <sys/zfs_context.h>
+#include <sys/fm/fs/zfs.h>
+#include <sys/spa.h>
+#include <sys/txg.h>
+#include <sys/spa_impl.h>
+#include <sys/vdev_impl.h>
+#include <sys/zio_impl.h>
+#include <sys/zio_compress.h>
+#include <sys/zio_checksum.h>
+#include <sys/dmu_objset.h>
+#include <sys/arc.h>
+#include <sys/ddt.h>
+#include <sys/trim_map.h>
+#include <sys/blkptr.h>
+#include <sys/zfeature.h>
+#include <sys/dsl_scan.h>
+#include <sys/metaslab_impl.h>
+#include <sys/abd.h>
+#include <sys/cityhash.h>
+
+SYSCTL_DECL(_vfs_zfs);
+SYSCTL_NODE(_vfs_zfs, OID_AUTO, zio, CTLFLAG_RW | CTLFLAG_MPSAFE, 0,
+ "ZFS ZIO");
+#if defined(__amd64__)
+static int zio_use_uma = 1;
+#else
+static int zio_use_uma = 0;
+#endif
+SYSCTL_INT(_vfs_zfs_zio, OID_AUTO, use_uma, CTLFLAG_RDTUN, &zio_use_uma, 0,
+ "Use uma(9) for ZIO allocations");
+static int zio_exclude_metadata = 0;
+SYSCTL_INT(_vfs_zfs_zio, OID_AUTO, exclude_metadata, CTLFLAG_RDTUN, &zio_exclude_metadata, 0,
+ "Exclude metadata buffers from dumps as well");
+
+zio_trim_stats_t zio_trim_stats = {
+ { "bytes", KSTAT_DATA_UINT64,
+ "Number of bytes successfully TRIMmed" },
+ { "success", KSTAT_DATA_UINT64,
+ "Number of successful TRIM requests" },
+ { "unsupported", KSTAT_DATA_UINT64,
+ "Number of TRIM requests that failed because TRIM is not supported" },
+ { "failed", KSTAT_DATA_UINT64,
+ "Number of TRIM requests that failed for reasons other than not supported" },
+};
+
+static kstat_t *zio_trim_ksp;
+
+/*
+ * ==========================================================================
+ * I/O type descriptions
+ * ==========================================================================
+ */
+const char *zio_type_name[ZIO_TYPES] = {
+ "zio_null", "zio_read", "zio_write", "zio_free", "zio_claim",
+ "zio_ioctl"
+};
+
+boolean_t zio_dva_throttle_enabled = B_TRUE;
+SYSCTL_INT(_vfs_zfs_zio, OID_AUTO, dva_throttle_enabled, CTLFLAG_RWTUN,
+ &zio_dva_throttle_enabled, 0, "Enable allocation throttling");
+
+/*
+ * ==========================================================================
+ * I/O kmem caches
+ * ==========================================================================
+ */
+kmem_cache_t *zio_cache;
+kmem_cache_t *zio_link_cache;
+kmem_cache_t *zio_buf_cache[SPA_MAXBLOCKSIZE >> SPA_MINBLOCKSHIFT];
+kmem_cache_t *zio_data_buf_cache[SPA_MAXBLOCKSIZE >> SPA_MINBLOCKSHIFT];
+
+#ifdef _KERNEL
+extern vmem_t *zio_alloc_arena;
+#endif
+
+#define BP_SPANB(indblkshift, level) \
+ (((uint64_t)1) << ((level) * ((indblkshift) - SPA_BLKPTRSHIFT)))
+#define COMPARE_META_LEVEL 0x80000000ul
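+
+/*
+ * As a purely illustrative example of the BP_SPANB macro above
+ * (assuming SPA_BLKPTRSHIFT is 7, i.e. 128-byte block pointers):
+ * with 128K indirect blocks (indblkshift = 17), each indirect block
+ * holds 1 << (17 - 7) = 1024 block pointers, so a level-1 block
+ * spans 1024 data blocks, and a level-2 block spans
+ * 1 << (2 * 10) = 1048576 data blocks.
+ */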
+/*
+ * The following actions directly affect the spa's sync-to-convergence logic.
+ * The values below define the sync pass when we start performing the action.
+ * Care should be taken when changing these values as they directly impact
+ * spa_sync() performance. Tuning these values may introduce subtle performance
+ * pathologies and should only be done in the context of performance analysis.
+ * These tunables will eventually be removed and replaced with #defines once
+ * enough analysis has been done to determine optimal values.
+ *
+ * The 'zfs_sync_pass_deferred_free' pass must be greater than 1 to ensure that
+ * regular blocks are not deferred.
+ */
+int zfs_sync_pass_deferred_free = 2; /* defer frees starting in this pass */
+SYSCTL_INT(_vfs_zfs, OID_AUTO, sync_pass_deferred_free, CTLFLAG_RDTUN,
+ &zfs_sync_pass_deferred_free, 0, "defer frees starting in this pass");
+int zfs_sync_pass_dont_compress = 5; /* don't compress starting in this pass */
+SYSCTL_INT(_vfs_zfs, OID_AUTO, sync_pass_dont_compress, CTLFLAG_RDTUN,
+ &zfs_sync_pass_dont_compress, 0, "don't compress starting in this pass");
+int zfs_sync_pass_rewrite = 2; /* rewrite new bps starting in this pass */
+SYSCTL_INT(_vfs_zfs, OID_AUTO, sync_pass_rewrite, CTLFLAG_RDTUN,
+ &zfs_sync_pass_rewrite, 0, "rewrite new bps starting in this pass");
+
+/*
+ * An allocating zio is one that either currently has the DVA allocate
+ * stage set or will have it later in its lifetime.
+ */
+#define IO_IS_ALLOCATING(zio) ((zio)->io_orig_pipeline & ZIO_STAGE_DVA_ALLOCATE)
+
+boolean_t zio_requeue_io_start_cut_in_line = B_TRUE;
+
+#ifdef illumos
+#ifdef ZFS_DEBUG
+int zio_buf_debug_limit = 16384;
+#else
+int zio_buf_debug_limit = 0;
+#endif
+#endif
+
+static void zio_taskq_dispatch(zio_t *, zio_taskq_type_t, boolean_t);
+
+void
+zio_init(void)
+{
+ size_t c;
+ zio_cache = kmem_cache_create("zio_cache",
+ sizeof (zio_t), 0, NULL, NULL, NULL, NULL, NULL, 0);
+ zio_link_cache = kmem_cache_create("zio_link_cache",
+ sizeof (zio_link_t), 0, NULL, NULL, NULL, NULL, NULL, 0);
+ if (!zio_use_uma)
+ goto out;
+
+ /*
+ * For small buffers, we want a cache for each multiple of
+ * SPA_MINBLOCKSIZE. For larger buffers, we want a cache
+ * for each quarter-power of 2.
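+	 *
+	 * For example (sizes derived from the logic below): a 5120-byte
+	 * buffer is a multiple of a quarter of its power-of-2 floor
+	 * (4096 / 4 = 1024), so it gets a dedicated cache, whereas a
+	 * 5632-byte buffer is not, and is instead served by the next
+	 * larger cache via the fix-up loop that follows.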
+ */
+ for (c = 0; c < SPA_MAXBLOCKSIZE >> SPA_MINBLOCKSHIFT; c++) {
+ size_t size = (c + 1) << SPA_MINBLOCKSHIFT;
+ size_t p2 = size;
+ size_t align = 0;
+ int cflags = zio_exclude_metadata ? KMC_NODEBUG : 0;
+
+ while (!ISP2(p2))
+ p2 &= p2 - 1;
+
+#ifdef illumos
+#ifndef _KERNEL
+ /*
+ * If we are using watchpoints, put each buffer on its own page,
+ * to eliminate the performance overhead of trapping to the
+ * kernel when modifying a non-watched buffer that shares the
+ * page with a watched buffer.
+ */
+ if (arc_watch && !IS_P2ALIGNED(size, PAGESIZE))
+ continue;
+#endif
+#endif /* illumos */
+ if (size <= 4 * SPA_MINBLOCKSIZE) {
+ align = SPA_MINBLOCKSIZE;
+ } else if (IS_P2ALIGNED(size, p2 >> 2)) {
+ align = MIN(p2 >> 2, PAGESIZE);
+ }
+
+ if (align != 0) {
+ char name[36];
+ (void) sprintf(name, "zio_buf_%lu", (ulong_t)size);
+ zio_buf_cache[c] = kmem_cache_create(name, size,
+ align, NULL, NULL, NULL, NULL, NULL, cflags);
+
+ /*
+ * Since zio_data bufs do not appear in crash dumps, we
+ * pass KMC_NOTOUCH so that no allocator metadata is
+ * stored with the buffers.
+ */
+ (void) sprintf(name, "zio_data_buf_%lu", (ulong_t)size);
+ zio_data_buf_cache[c] = kmem_cache_create(name, size,
+ align, NULL, NULL, NULL, NULL, NULL,
+ cflags | KMC_NOTOUCH | KMC_NODEBUG);
+ }
+ }
+
+ while (--c != 0) {
+ ASSERT(zio_buf_cache[c] != NULL);
+ if (zio_buf_cache[c - 1] == NULL)
+ zio_buf_cache[c - 1] = zio_buf_cache[c];
+
+ ASSERT(zio_data_buf_cache[c] != NULL);
+ if (zio_data_buf_cache[c - 1] == NULL)
+ zio_data_buf_cache[c - 1] = zio_data_buf_cache[c];
+ }
+out:
+
+ zio_inject_init();
+
+ zio_trim_ksp = kstat_create("zfs", 0, "zio_trim", "misc",
+ KSTAT_TYPE_NAMED,
+ sizeof(zio_trim_stats) / sizeof(kstat_named_t),
+ KSTAT_FLAG_VIRTUAL);
+
+ if (zio_trim_ksp != NULL) {
+ zio_trim_ksp->ks_data = &zio_trim_stats;
+ kstat_install(zio_trim_ksp);
+ }
+}
+
+void
+zio_fini(void)
+{
+ size_t c;
+ kmem_cache_t *last_cache = NULL;
+ kmem_cache_t *last_data_cache = NULL;
+
+ for (c = 0; c < SPA_MAXBLOCKSIZE >> SPA_MINBLOCKSHIFT; c++) {
+ if (zio_buf_cache[c] != last_cache) {
+ last_cache = zio_buf_cache[c];
+ kmem_cache_destroy(zio_buf_cache[c]);
+ }
+ zio_buf_cache[c] = NULL;
+
+ if (zio_data_buf_cache[c] != last_data_cache) {
+ last_data_cache = zio_data_buf_cache[c];
+ kmem_cache_destroy(zio_data_buf_cache[c]);
+ }
+ zio_data_buf_cache[c] = NULL;
+ }
+
+ kmem_cache_destroy(zio_link_cache);
+ kmem_cache_destroy(zio_cache);
+
+ zio_inject_fini();
+
+ if (zio_trim_ksp != NULL) {
+ kstat_delete(zio_trim_ksp);
+ zio_trim_ksp = NULL;
+ }
+}
+
+/*
+ * ==========================================================================
+ * Allocate and free I/O buffers
+ * ==========================================================================
+ */
+
+/*
+ * Use zio_buf_alloc to allocate ZFS metadata. This data will appear in a
+ * crashdump if the kernel panics, so use it judiciously. Obviously, it's
+ * useful to inspect ZFS metadata, but if possible, we should avoid keeping
+ * excess / transient data in-core during a crashdump.
+ */
+void *
+zio_buf_alloc(size_t size)
+{
+ size_t c = (size - 1) >> SPA_MINBLOCKSHIFT;
+ int flags = zio_exclude_metadata ? KM_NODEBUG : 0;
+
+ VERIFY3U(c, <, SPA_MAXBLOCKSIZE >> SPA_MINBLOCKSHIFT);
+
+ if (zio_use_uma)
+ return (kmem_cache_alloc(zio_buf_cache[c], KM_PUSHPAGE));
+ else
+		return (kmem_alloc(size, KM_SLEEP | flags));
+}
+
+/*
+ * Use zio_data_buf_alloc to allocate data. The data will not appear in a
+ * crashdump if the kernel panics. This exists so that we will limit the
+ * amount of ZFS data that shows up in a kernel crashdump, thus reducing
+ * the amount of kernel heap dumped to disk when the kernel panics.
+ */
+void *
+zio_data_buf_alloc(size_t size)
+{
+ size_t c = (size - 1) >> SPA_MINBLOCKSHIFT;
+
+ VERIFY3U(c, <, SPA_MAXBLOCKSIZE >> SPA_MINBLOCKSHIFT);
+
+ if (zio_use_uma)
+ return (kmem_cache_alloc(zio_data_buf_cache[c], KM_PUSHPAGE));
+ else
+ return (kmem_alloc(size, KM_SLEEP | KM_NODEBUG));
+}
+
+void
+zio_buf_free(void *buf, size_t size)
+{
+ size_t c = (size - 1) >> SPA_MINBLOCKSHIFT;
+
+ VERIFY3U(c, <, SPA_MAXBLOCKSIZE >> SPA_MINBLOCKSHIFT);
+
+ if (zio_use_uma)
+ kmem_cache_free(zio_buf_cache[c], buf);
+ else
+ kmem_free(buf, size);
+}
+
+void
+zio_data_buf_free(void *buf, size_t size)
+{
+ size_t c = (size - 1) >> SPA_MINBLOCKSHIFT;
+
+ VERIFY3U(c, <, SPA_MAXBLOCKSIZE >> SPA_MINBLOCKSHIFT);
+
+ if (zio_use_uma)
+ kmem_cache_free(zio_data_buf_cache[c], buf);
+ else
+ kmem_free(buf, size);
+}
+
+/*
+ * ==========================================================================
+ * Push and pop I/O transform buffers
+ * ==========================================================================
+ */
+void
+zio_push_transform(zio_t *zio, abd_t *data, uint64_t size, uint64_t bufsize,
+ zio_transform_func_t *transform)
+{
+ zio_transform_t *zt = kmem_alloc(sizeof (zio_transform_t), KM_SLEEP);
+
+ /*
+ * Ensure that anyone expecting this zio to contain a linear ABD isn't
+ * going to get a nasty surprise when they try to access the data.
+ */
+#ifdef illumos
+ IMPLY(abd_is_linear(zio->io_abd), abd_is_linear(data));
+#else
+ IMPLY(zio->io_abd != NULL && abd_is_linear(zio->io_abd),
+ abd_is_linear(data));
+#endif
+
+ zt->zt_orig_abd = zio->io_abd;
+ zt->zt_orig_size = zio->io_size;
+ zt->zt_bufsize = bufsize;
+ zt->zt_transform = transform;
+
+ zt->zt_next = zio->io_transform_stack;
+ zio->io_transform_stack = zt;
+
+ zio->io_abd = data;
+ zio->io_size = size;
+}
+
+void
+zio_pop_transforms(zio_t *zio)
+{
+ zio_transform_t *zt;
+
+ while ((zt = zio->io_transform_stack) != NULL) {
+ if (zt->zt_transform != NULL)
+ zt->zt_transform(zio,
+ zt->zt_orig_abd, zt->zt_orig_size);
+
+ if (zt->zt_bufsize != 0)
+ abd_free(zio->io_abd);
+
+ zio->io_abd = zt->zt_orig_abd;
+ zio->io_size = zt->zt_orig_size;
+ zio->io_transform_stack = zt->zt_next;
+
+ kmem_free(zt, sizeof (zio_transform_t));
+ }
+}
+
+/*
+ * ==========================================================================
+ * I/O transform callbacks for subblocks and decompression
+ * ==========================================================================
+ */
+static void
+zio_subblock(zio_t *zio, abd_t *data, uint64_t size)
+{
+ ASSERT(zio->io_size > size);
+
+ if (zio->io_type == ZIO_TYPE_READ)
+ abd_copy(data, zio->io_abd, size);
+}
+
+static void
+zio_decompress(zio_t *zio, abd_t *data, uint64_t size)
+{
+ if (zio->io_error == 0) {
+ void *tmp = abd_borrow_buf(data, size);
+ int ret = zio_decompress_data(BP_GET_COMPRESS(zio->io_bp),
+ zio->io_abd, tmp, zio->io_size, size);
+ abd_return_buf_copy(data, tmp, size);
+
+ if (ret != 0)
+ zio->io_error = SET_ERROR(EIO);
+ }
+}
+
+/*
+ * ==========================================================================
+ * I/O parent/child relationships and pipeline interlocks
+ * ==========================================================================
+ */
+zio_t *
+zio_walk_parents(zio_t *cio, zio_link_t **zl)
+{
+ list_t *pl = &cio->io_parent_list;
+
+ *zl = (*zl == NULL) ? list_head(pl) : list_next(pl, *zl);
+ if (*zl == NULL)
+ return (NULL);
+
+ ASSERT((*zl)->zl_child == cio);
+ return ((*zl)->zl_parent);
+}
+
+zio_t *
+zio_walk_children(zio_t *pio, zio_link_t **zl)
+{
+ list_t *cl = &pio->io_child_list;
+
+ ASSERT(MUTEX_HELD(&pio->io_lock));
+
+ *zl = (*zl == NULL) ? list_head(cl) : list_next(cl, *zl);
+ if (*zl == NULL)
+ return (NULL);
+
+ ASSERT((*zl)->zl_parent == pio);
+ return ((*zl)->zl_child);
+}
+
+zio_t *
+zio_unique_parent(zio_t *cio)
+{
+ zio_link_t *zl = NULL;
+ zio_t *pio = zio_walk_parents(cio, &zl);
+
+ VERIFY3P(zio_walk_parents(cio, &zl), ==, NULL);
+ return (pio);
+}
+
+void
+zio_add_child(zio_t *pio, zio_t *cio)
+{
+ zio_link_t *zl = kmem_cache_alloc(zio_link_cache, KM_SLEEP);
+
+ /*
+ * Logical I/Os can have logical, gang, or vdev children.
+ * Gang I/Os can have gang or vdev children.
+ * Vdev I/Os can only have vdev children.
+ * The following ASSERT captures all of these constraints.
+ */
+ ASSERT3S(cio->io_child_type, <=, pio->io_child_type);
+
+ zl->zl_parent = pio;
+ zl->zl_child = cio;
+
+ mutex_enter(&pio->io_lock);
+ mutex_enter(&cio->io_lock);
+
+ ASSERT(pio->io_state[ZIO_WAIT_DONE] == 0);
+
+ for (int w = 0; w < ZIO_WAIT_TYPES; w++)
+ pio->io_children[cio->io_child_type][w] += !cio->io_state[w];
+
+ list_insert_head(&pio->io_child_list, zl);
+ list_insert_head(&cio->io_parent_list, zl);
+
+ pio->io_child_count++;
+ cio->io_parent_count++;
+
+ mutex_exit(&cio->io_lock);
+ mutex_exit(&pio->io_lock);
+}
+
+static void
+zio_remove_child(zio_t *pio, zio_t *cio, zio_link_t *zl)
+{
+ ASSERT(zl->zl_parent == pio);
+ ASSERT(zl->zl_child == cio);
+
+ mutex_enter(&pio->io_lock);
+ mutex_enter(&cio->io_lock);
+
+ list_remove(&pio->io_child_list, zl);
+ list_remove(&cio->io_parent_list, zl);
+
+ pio->io_child_count--;
+ cio->io_parent_count--;
+
+ mutex_exit(&cio->io_lock);
+ mutex_exit(&pio->io_lock);
+ kmem_cache_free(zio_link_cache, zl);
+}
+
+static boolean_t
+zio_wait_for_children(zio_t *zio, uint8_t childbits, enum zio_wait_type wait)
+{
+ boolean_t waiting = B_FALSE;
+
+ mutex_enter(&zio->io_lock);
+ ASSERT(zio->io_stall == NULL);
+ for (int c = 0; c < ZIO_CHILD_TYPES; c++) {
+ if (!(ZIO_CHILD_BIT_IS_SET(childbits, c)))
+ continue;
+
+ uint64_t *countp = &zio->io_children[c][wait];
+ if (*countp != 0) {
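+			/*
+			 * Back the pipeline up one stage so that, when the
+			 * last outstanding child completes and this zio is
+			 * re-dispatched, zio_execute() re-runs the stage
+			 * that stalled here.
+			 */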
+ zio->io_stage >>= 1;
+ ASSERT3U(zio->io_stage, !=, ZIO_STAGE_OPEN);
+ zio->io_stall = countp;
+ waiting = B_TRUE;
+ break;
+ }
+ }
+ mutex_exit(&zio->io_lock);
+ return (waiting);
+}
+
+static void
+zio_notify_parent(zio_t *pio, zio_t *zio, enum zio_wait_type wait,
+ zio_t **next_to_executep)
+{
+ uint64_t *countp = &pio->io_children[zio->io_child_type][wait];
+ int *errorp = &pio->io_child_error[zio->io_child_type];
+
+ mutex_enter(&pio->io_lock);
+ if (zio->io_error && !(zio->io_flags & ZIO_FLAG_DONT_PROPAGATE))
+ *errorp = zio_worst_error(*errorp, zio->io_error);
+ pio->io_reexecute |= zio->io_reexecute;
+ ASSERT3U(*countp, >, 0);
+
+ (*countp)--;
+
+ if (*countp == 0 && pio->io_stall == countp) {
+ zio_taskq_type_t type =
+ pio->io_stage < ZIO_STAGE_VDEV_IO_START ? ZIO_TASKQ_ISSUE :
+ ZIO_TASKQ_INTERRUPT;
+ pio->io_stall = NULL;
+ mutex_exit(&pio->io_lock);
+
+ /*
+ * If we can tell the caller to execute this parent next, do
+ * so. Otherwise dispatch the parent zio as its own task.
+ *
+ * Having the caller execute the parent when possible reduces
+ * locking on the zio taskq's, reduces context switch
+ * overhead, and has no recursion penalty. Note that one
+ * read from disk typically causes at least 3 zio's: a
+ * zio_null(), the logical zio_read(), and then a physical
+ * zio. When the physical ZIO completes, we are able to call
+ * zio_done() on all 3 of these zio's from one invocation of
+ * zio_execute() by returning the parent back to
+ * zio_execute(). Since the parent isn't executed until this
+ * thread returns back to zio_execute(), the caller should do
+ * so promptly.
+ *
+ * In other cases, dispatching the parent prevents
+ * overflowing the stack when we have deeply nested
+ * parent-child relationships, as we do with the "mega zio"
+ * of writes for spa_sync(), and the chain of ZIL blocks.
+ */
+ if (next_to_executep != NULL && *next_to_executep == NULL) {
+ *next_to_executep = pio;
+ } else {
+ zio_taskq_dispatch(pio, type, B_FALSE);
+ }
+ } else {
+ mutex_exit(&pio->io_lock);
+ }
+}
+
+static void
+zio_inherit_child_errors(zio_t *zio, enum zio_child c)
+{
+ if (zio->io_child_error[c] != 0 && zio->io_error == 0)
+ zio->io_error = zio->io_child_error[c];
+}
+
+int
+zio_bookmark_compare(const void *x1, const void *x2)
+{
+ const zio_t *z1 = x1;
+ const zio_t *z2 = x2;
+
+ if (z1->io_bookmark.zb_objset < z2->io_bookmark.zb_objset)
+ return (-1);
+ if (z1->io_bookmark.zb_objset > z2->io_bookmark.zb_objset)
+ return (1);
+
+ if (z1->io_bookmark.zb_object < z2->io_bookmark.zb_object)
+ return (-1);
+ if (z1->io_bookmark.zb_object > z2->io_bookmark.zb_object)
+ return (1);
+
+ if (z1->io_bookmark.zb_level < z2->io_bookmark.zb_level)
+ return (-1);
+ if (z1->io_bookmark.zb_level > z2->io_bookmark.zb_level)
+ return (1);
+
+ if (z1->io_bookmark.zb_blkid < z2->io_bookmark.zb_blkid)
+ return (-1);
+ if (z1->io_bookmark.zb_blkid > z2->io_bookmark.zb_blkid)
+ return (1);
+
+ if (z1 < z2)
+ return (-1);
+ if (z1 > z2)
+ return (1);
+
+ return (0);
+}
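+
+ /*
+ * A minimal sketch of how this comparator is meant to be used: it
+ * imposes a total order on zios by bookmark, with the zio pointer
+ * itself as the final tiebreaker, which makes it suitable for an
+ * AVL tree such as a per-allocator issue queue, e.g.:
+ *
+ * avl_create(&queue, zio_bookmark_compare,
+ *     sizeof (zio_t), offsetof(zio_t, io_alloc_node));
+ *
+ * "queue" and the io_alloc_node linkage are assumptions of this
+ * sketch; the comparator itself needs only two zio_t pointers.
+ */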
+
+/*
+ * ==========================================================================
+ * Create the various types of I/O (read, write, free, etc.)
+ * ==========================================================================
+ */
+static zio_t *
+zio_create(zio_t *pio, spa_t *spa, uint64_t txg, const blkptr_t *bp,
+ abd_t *data, uint64_t lsize, uint64_t psize, zio_done_func_t *done,
+ void *private, zio_type_t type, zio_priority_t priority,
+ enum zio_flag flags, vdev_t *vd, uint64_t offset,
+ const zbookmark_phys_t *zb, enum zio_stage stage, enum zio_stage pipeline)
+{
+ zio_t *zio;
+
+ IMPLY(type != ZIO_TYPE_FREE, psize <= SPA_MAXBLOCKSIZE);
+ ASSERT(P2PHASE(psize, SPA_MINBLOCKSIZE) == 0);
+ ASSERT(P2PHASE(offset, SPA_MINBLOCKSIZE) == 0);
+
+ ASSERT(!vd || spa_config_held(spa, SCL_STATE_ALL, RW_READER));
+ ASSERT(!bp || !(flags & ZIO_FLAG_CONFIG_WRITER));
+ ASSERT(vd || stage == ZIO_STAGE_OPEN);
+
+ IMPLY(lsize != psize, (flags & ZIO_FLAG_RAW) != 0);
+
+ zio = kmem_cache_alloc(zio_cache, KM_SLEEP);
+ bzero(zio, sizeof (zio_t));
+
+ mutex_init(&zio->io_lock, NULL, MUTEX_DEFAULT, NULL);
+ cv_init(&zio->io_cv, NULL, CV_DEFAULT, NULL);
+#if defined(__FreeBSD__) && defined(_KERNEL)
+ callout_init(&zio->io_timer, 1);
+#endif
+
+ list_create(&zio->io_parent_list, sizeof (zio_link_t),
+ offsetof(zio_link_t, zl_parent_node));
+ list_create(&zio->io_child_list, sizeof (zio_link_t),
+ offsetof(zio_link_t, zl_child_node));
+ metaslab_trace_init(&zio->io_alloc_list);
+
+ if (vd != NULL)
+ zio->io_child_type = ZIO_CHILD_VDEV;
+ else if (flags & ZIO_FLAG_GANG_CHILD)
+ zio->io_child_type = ZIO_CHILD_GANG;
+ else if (flags & ZIO_FLAG_DDT_CHILD)
+ zio->io_child_type = ZIO_CHILD_DDT;
+ else
+ zio->io_child_type = ZIO_CHILD_LOGICAL;
+
+ if (bp != NULL) {
+ zio->io_bp = (blkptr_t *)bp;
+ zio->io_bp_copy = *bp;
+ zio->io_bp_orig = *bp;
+ if (type != ZIO_TYPE_WRITE ||
+ zio->io_child_type == ZIO_CHILD_DDT)
+ zio->io_bp = &zio->io_bp_copy; /* so caller can free */
+ if (zio->io_child_type == ZIO_CHILD_LOGICAL)
+ zio->io_logical = zio;
+ if (zio->io_child_type > ZIO_CHILD_GANG && BP_IS_GANG(bp))
+ pipeline |= ZIO_GANG_STAGES;
+ }
+
+ zio->io_spa = spa;
+ zio->io_txg = txg;
+ zio->io_done = done;
+ zio->io_private = private;
+ zio->io_type = type;
+ zio->io_priority = priority;
+ zio->io_vd = vd;
+ zio->io_offset = offset;
+ zio->io_orig_abd = zio->io_abd = data;
+ zio->io_orig_size = zio->io_size = psize;
+ zio->io_lsize = lsize;
+ zio->io_orig_flags = zio->io_flags = flags;
+ zio->io_orig_stage = zio->io_stage = stage;
+ zio->io_orig_pipeline = zio->io_pipeline = pipeline;
+ zio->io_pipeline_trace = ZIO_STAGE_OPEN;
+
+ zio->io_state[ZIO_WAIT_READY] = (stage >= ZIO_STAGE_READY);
+ zio->io_state[ZIO_WAIT_DONE] = (stage >= ZIO_STAGE_DONE);
+
+ if (zb != NULL)
+ zio->io_bookmark = *zb;
+
+ if (pio != NULL) {
+ if (zio->io_metaslab_class == NULL)
+ zio->io_metaslab_class = pio->io_metaslab_class;
+ if (zio->io_logical == NULL)
+ zio->io_logical = pio->io_logical;
+ if (zio->io_child_type == ZIO_CHILD_GANG)
+ zio->io_gang_leader = pio->io_gang_leader;
+ zio_add_child(pio, zio);
+ }
+
+ return (zio);
+}
+
+static void
+zio_destroy(zio_t *zio)
+{
+#ifdef __FreeBSD__
+ KASSERT(!(callout_active(&zio->io_timer) ||
+ callout_pending(&zio->io_timer)), ("zio_destroy: timer active"));
+#endif
+ metaslab_trace_fini(&zio->io_alloc_list);
+ list_destroy(&zio->io_parent_list);
+ list_destroy(&zio->io_child_list);
+ mutex_destroy(&zio->io_lock);
+ cv_destroy(&zio->io_cv);
+ kmem_cache_free(zio_cache, zio);
+}
+
+zio_t *
+zio_null(zio_t *pio, spa_t *spa, vdev_t *vd, zio_done_func_t *done,
+ void *private, enum zio_flag flags)
+{
+ zio_t *zio;
+
+ zio = zio_create(pio, spa, 0, NULL, NULL, 0, 0, done, private,
+ ZIO_TYPE_NULL, ZIO_PRIORITY_NOW, flags, vd, 0, NULL,
+ ZIO_STAGE_OPEN, ZIO_INTERLOCK_PIPELINE);
+
+ return (zio);
+}
+
+zio_t *
+zio_root(spa_t *spa, zio_done_func_t *done, void *private, enum zio_flag flags)
+{
+ return (zio_null(NULL, spa, NULL, done, private, flags));
+}
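+
+ /*
+ * A minimal usage sketch, assuming caller-supplied bp/abd/zb state:
+ *
+ * zio_t *rio = zio_root(spa, NULL, NULL, ZIO_FLAG_CANFAIL);
+ * zio_nowait(zio_read(rio, spa, bp1, abd1, BP_GET_PSIZE(bp1),
+ *     NULL, NULL, ZIO_PRIORITY_SYNC_READ,
+ *     ZIO_FLAG_CANFAIL | ZIO_FLAG_RAW, zb));
+ * zio_nowait(zio_read(rio, spa, bp2, abd2, BP_GET_PSIZE(bp2),
+ *     NULL, NULL, ZIO_PRIORITY_SYNC_READ,
+ *     ZIO_FLAG_CANFAIL | ZIO_FLAG_RAW, zb));
+ * error = zio_wait(rio); -- worst error among the children
+ *
+ * The root is a zio_null() interlock: it has no work of its own and
+ * completes only once every child has reached the done stage.
+ */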
+
+void
+zfs_blkptr_verify(spa_t *spa, const blkptr_t *bp)
+{
+ if (!DMU_OT_IS_VALID(BP_GET_TYPE(bp))) {
+ zfs_panic_recover("blkptr at %p has invalid TYPE %llu",
+ bp, (longlong_t)BP_GET_TYPE(bp));
+ }
+ if (BP_GET_CHECKSUM(bp) >= ZIO_CHECKSUM_FUNCTIONS ||
+ BP_GET_CHECKSUM(bp) <= ZIO_CHECKSUM_ON) {
+ zfs_panic_recover("blkptr at %p has invalid CHECKSUM %llu",
+ bp, (longlong_t)BP_GET_CHECKSUM(bp));
+ }
+ if (BP_GET_COMPRESS(bp) >= ZIO_COMPRESS_FUNCTIONS ||
+ BP_GET_COMPRESS(bp) <= ZIO_COMPRESS_ON) {
+ zfs_panic_recover("blkptr at %p has invalid COMPRESS %llu",
+ bp, (longlong_t)BP_GET_COMPRESS(bp));
+ }
+ if (BP_GET_LSIZE(bp) > SPA_MAXBLOCKSIZE) {
+ zfs_panic_recover("blkptr at %p has invalid LSIZE %llu",
+ bp, (longlong_t)BP_GET_LSIZE(bp));
+ }
+ if (BP_GET_PSIZE(bp) > SPA_MAXBLOCKSIZE) {
+ zfs_panic_recover("blkptr at %p has invalid PSIZE %llu",
+ bp, (longlong_t)BP_GET_PSIZE(bp));
+ }
+
+ if (BP_IS_EMBEDDED(bp)) {
+ if (BPE_GET_ETYPE(bp) >= NUM_BP_EMBEDDED_TYPES) {
+ zfs_panic_recover("blkptr at %p has invalid ETYPE %llu",
+ bp, (longlong_t)BPE_GET_ETYPE(bp));
+ }
+ }
+
+ /*
+ * Do not verify individual DVAs if the config is not trusted. This
+ * will be done once the zio is executed in vdev_mirror_map_alloc.
+ */
+ if (!spa->spa_trust_config)
+ return;
+
+ /*
+ * Pool-specific checks.
+ *
+ * Note: it would be nice to verify that the blk_birth and
+ * BP_PHYSICAL_BIRTH() are not too large. However, spa_freeze()
+ * allows the birth time of log blocks (and dmu_sync()-ed blocks
+ * that are in the log) to be arbitrarily large.
+ */
+ for (int i = 0; i < BP_GET_NDVAS(bp); i++) {
+ uint64_t vdevid = DVA_GET_VDEV(&bp->blk_dva[i]);
+ if (vdevid >= spa->spa_root_vdev->vdev_children) {
+ zfs_panic_recover("blkptr at %p DVA %u has invalid "
+ "VDEV %llu",
+ bp, i, (longlong_t)vdevid);
+ continue;
+ }
+ vdev_t *vd = spa->spa_root_vdev->vdev_child[vdevid];
+ if (vd == NULL) {
+ zfs_panic_recover("blkptr at %p DVA %u has invalid "
+ "VDEV %llu",
+ bp, i, (longlong_t)vdevid);
+ continue;
+ }
+ if (vd->vdev_ops == &vdev_hole_ops) {
+ zfs_panic_recover("blkptr at %p DVA %u has hole "
+ "VDEV %llu",
+ bp, i, (longlong_t)vdevid);
+ continue;
+ }
+ if (vd->vdev_ops == &vdev_missing_ops) {
+ /*
+ * "missing" vdevs are valid during import, but we
+ * don't have their detailed info (e.g. asize), so
+ * we can't perform any more checks on them.
+ */
+ continue;
+ }
+ uint64_t offset = DVA_GET_OFFSET(&bp->blk_dva[i]);
+ uint64_t asize = DVA_GET_ASIZE(&bp->blk_dva[i]);
+ if (BP_IS_GANG(bp))
+ asize = vdev_psize_to_asize(vd, SPA_GANGBLOCKSIZE);
+ if (offset + asize > vd->vdev_asize) {
+ zfs_panic_recover("blkptr at %p DVA %u has invalid "
+ "OFFSET %llu",
+ bp, i, (longlong_t)offset);
+ }
+ }
+}
+
+boolean_t
+zfs_dva_valid(spa_t *spa, const dva_t *dva, const blkptr_t *bp)
+{
+ uint64_t vdevid = DVA_GET_VDEV(dva);
+
+ if (vdevid >= spa->spa_root_vdev->vdev_children)
+ return (B_FALSE);
+
+ vdev_t *vd = spa->spa_root_vdev->vdev_child[vdevid];
+ if (vd == NULL)
+ return (B_FALSE);
+
+ if (vd->vdev_ops == &vdev_hole_ops)
+ return (B_FALSE);
+
+ if (vd->vdev_ops == &vdev_missing_ops) {
+ return (B_FALSE);
+ }
+
+ uint64_t offset = DVA_GET_OFFSET(dva);
+ uint64_t asize = DVA_GET_ASIZE(dva);
+
+ if (BP_IS_GANG(bp))
+ asize = vdev_psize_to_asize(vd, SPA_GANGBLOCKSIZE);
+ if (offset + asize > vd->vdev_asize)
+ return (B_FALSE);
+
+ return (B_TRUE);
+}
+
+zio_t *
+zio_read(zio_t *pio, spa_t *spa, const blkptr_t *bp,
+ abd_t *data, uint64_t size, zio_done_func_t *done, void *private,
+ zio_priority_t priority, enum zio_flag flags, const zbookmark_phys_t *zb)
+{
+ zio_t *zio;
+
+ zfs_blkptr_verify(spa, bp);
+
+ zio = zio_create(pio, spa, BP_PHYSICAL_BIRTH(bp), bp,
+ data, size, size, done, private,
+ ZIO_TYPE_READ, priority, flags, NULL, 0, zb,
+ ZIO_STAGE_OPEN, (flags & ZIO_FLAG_DDT_CHILD) ?
+ ZIO_DDT_CHILD_READ_PIPELINE : ZIO_READ_PIPELINE);
+
+ return (zio);
+}
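+
+ /*
+ * A minimal synchronous read sketch, assuming the caller holds a
+ * valid bp and bookmark zb:
+ *
+ * abd_t *abd = abd_alloc_linear(BP_GET_PSIZE(bp), B_FALSE);
+ * int err = zio_wait(zio_read(NULL, spa, bp, abd,
+ *     BP_GET_PSIZE(bp), NULL, NULL, ZIO_PRIORITY_SYNC_READ,
+ *     ZIO_FLAG_CANFAIL | ZIO_FLAG_RAW, zb));
+ *
+ * ZIO_FLAG_RAW makes zio_read_bp_init() skip the decompress
+ * transform, so the caller receives the physical (on-disk) bytes.
+ * With no parent, zio_wait() blocks in cv_wait() until the pipeline
+ * signals io_cv, then returns io_error and destroys the zio.
+ */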
+
+zio_t *
+zio_write(zio_t *pio, spa_t *spa, uint64_t txg, blkptr_t *bp,
+ abd_t *data, uint64_t lsize, uint64_t psize, const zio_prop_t *zp,
+ zio_done_func_t *ready, zio_done_func_t *children_ready,
+ zio_done_func_t *physdone, zio_done_func_t *done,
+ void *private, zio_priority_t priority, enum zio_flag flags,
+ const zbookmark_phys_t *zb)
+{
+ zio_t *zio;
+
+ ASSERT(zp->zp_checksum >= ZIO_CHECKSUM_OFF &&
+ zp->zp_checksum < ZIO_CHECKSUM_FUNCTIONS &&
+ zp->zp_compress >= ZIO_COMPRESS_OFF &&
+ zp->zp_compress < ZIO_COMPRESS_FUNCTIONS &&
+ DMU_OT_IS_VALID(zp->zp_type) &&
+ zp->zp_level < 32 &&
+ zp->zp_copies > 0 &&
+ zp->zp_copies <= spa_max_replication(spa));
+
+ zio = zio_create(pio, spa, txg, bp, data, lsize, psize, done, private,
+ ZIO_TYPE_WRITE, priority, flags, NULL, 0, zb,
+ ZIO_STAGE_OPEN, (flags & ZIO_FLAG_DDT_CHILD) ?
+ ZIO_DDT_CHILD_WRITE_PIPELINE : ZIO_WRITE_PIPELINE);
+
+ zio->io_ready = ready;
+ zio->io_children_ready = children_ready;
+ zio->io_physdone = physdone;
+ zio->io_prop = *zp;
+
+ /*
+ * Data can be NULL if we are going to call zio_write_override() to
+ * provide the already-allocated BP. But we may need the data to
+ * verify a dedup hit (if requested). In this case, don't try to
+ * dedup (just take the already-allocated BP verbatim).
+ */
+ if (data == NULL && zio->io_prop.zp_dedup_verify) {
+ zio->io_prop.zp_dedup = zio->io_prop.zp_dedup_verify = B_FALSE;
+ }
+
+ return (zio);
+}
+
+zio_t *
+zio_rewrite(zio_t *pio, spa_t *spa, uint64_t txg, blkptr_t *bp, abd_t *data,
+ uint64_t size, zio_done_func_t *done, void *private,
+ zio_priority_t priority, enum zio_flag flags, zbookmark_phys_t *zb)
+{
+ zio_t *zio;
+
+ zio = zio_create(pio, spa, txg, bp, data, size, size, done, private,
+ ZIO_TYPE_WRITE, priority, flags | ZIO_FLAG_IO_REWRITE, NULL, 0, zb,
+ ZIO_STAGE_OPEN, ZIO_REWRITE_PIPELINE);
+
+ return (zio);
+}
+
+void
+zio_write_override(zio_t *zio, blkptr_t *bp, int copies, boolean_t nopwrite)
+{
+ ASSERT(zio->io_type == ZIO_TYPE_WRITE);
+ ASSERT(zio->io_child_type == ZIO_CHILD_LOGICAL);
+ ASSERT(zio->io_stage == ZIO_STAGE_OPEN);
+ ASSERT(zio->io_txg == spa_syncing_txg(zio->io_spa));
+
+ /*
+ * We must reset the io_prop to match the values that existed
+ * when the bp was first written by dmu_sync() keeping in mind
+ * that nopwrite and dedup are mutually exclusive.
+ */
+ zio->io_prop.zp_dedup = nopwrite ? B_FALSE : zio->io_prop.zp_dedup;
+ zio->io_prop.zp_nopwrite = nopwrite;
+ zio->io_prop.zp_copies = copies;
+ zio->io_bp_override = bp;
+}
+
+void
+zio_free(spa_t *spa, uint64_t txg, const blkptr_t *bp)
+{
+
+ zfs_blkptr_verify(spa, bp);
+
+ /*
+ * The check for EMBEDDED is a performance optimization. We
+ * process the free here (by ignoring it) rather than
+ * putting it on the list and then processing it in zio_free_sync().
+ */
+ if (BP_IS_EMBEDDED(bp))
+ return;
+ metaslab_check_free(spa, bp);
+
+ /*
+ * Frees that are for the currently-syncing txg, are not going to be
+ * deferred, and will not need to do a read (i.e. not GANG or DEDUP)
+ * can be processed immediately. Otherwise (including whenever TRIM
+ * is enabled), put them on the in-memory list for later processing.
+ */
+ if (zfs_trim_enabled || BP_IS_GANG(bp) || BP_GET_DEDUP(bp) ||
+ txg != spa->spa_syncing_txg ||
+ spa_sync_pass(spa) >= zfs_sync_pass_deferred_free) {
+ bplist_append(&spa->spa_free_bplist[txg & TXG_MASK], bp);
+ } else {
+ VERIFY0(zio_wait(zio_free_sync(NULL, spa, txg, bp,
+ BP_GET_PSIZE(bp), 0)));
+ }
+}
+
+zio_t *
+zio_free_sync(zio_t *pio, spa_t *spa, uint64_t txg, const blkptr_t *bp,
+ uint64_t size, enum zio_flag flags)
+{
+ zio_t *zio;
+ enum zio_stage stage = ZIO_FREE_PIPELINE;
+
+ ASSERT(!BP_IS_HOLE(bp));
+ ASSERT(spa_syncing_txg(spa) == txg);
+ ASSERT(spa_sync_pass(spa) < zfs_sync_pass_deferred_free);
+
+ if (BP_IS_EMBEDDED(bp))
+ return (zio_null(pio, spa, NULL, NULL, NULL, 0));
+
+ metaslab_check_free(spa, bp);
+ arc_freed(spa, bp);
+ dsl_scan_freed(spa, bp);
+
+ if (zfs_trim_enabled)
+ stage |= ZIO_STAGE_ISSUE_ASYNC | ZIO_STAGE_VDEV_IO_START |
+ ZIO_STAGE_VDEV_IO_ASSESS;
+ /*
+ * GANG and DEDUP blocks can induce a read (for the gang block header,
+ * or the DDT), so issue them asynchronously so that this thread is
+ * not tied up.
+ */
+ else if (BP_IS_GANG(bp) || BP_GET_DEDUP(bp))
+ stage |= ZIO_STAGE_ISSUE_ASYNC;
+
+ flags |= ZIO_FLAG_DONT_QUEUE;
+
+ zio = zio_create(pio, spa, txg, bp, NULL, size,
+ size, NULL, NULL, ZIO_TYPE_FREE, ZIO_PRIORITY_NOW,
+ flags, NULL, 0, NULL, ZIO_STAGE_OPEN, stage);
+
+ return (zio);
+}
+
+zio_t *
+zio_claim(zio_t *pio, spa_t *spa, uint64_t txg, const blkptr_t *bp,
+ zio_done_func_t *done, void *private, enum zio_flag flags)
+{
+ zio_t *zio;
+
+ zfs_blkptr_verify(spa, bp);
+
+ if (BP_IS_EMBEDDED(bp))
+ return (zio_null(pio, spa, NULL, NULL, NULL, 0));
+
+ /*
+ * A claim is an allocation of a specific block. Claims are needed
+ * to support immediate writes in the intent log. The issue is that
+ * immediate writes contain committed data, but in a txg that was
+ * *not* committed. Upon opening the pool after an unclean shutdown,
+ * the intent log claims all blocks that contain immediate write data
+ * so that the SPA knows they're in use.
+ *
+ * All claims *must* be resolved in the first txg -- before the SPA
+ * starts allocating blocks -- so that nothing is allocated twice.
+ * If txg == 0 we just verify that the block is claimable.
+ */
+ ASSERT3U(spa->spa_uberblock.ub_rootbp.blk_birth, <,
+ spa_min_claim_txg(spa));
+ ASSERT(txg == spa_min_claim_txg(spa) || txg == 0);
+ ASSERT(!BP_GET_DEDUP(bp) || !spa_writeable(spa)); /* zdb(1M) */
+
+ zio = zio_create(pio, spa, txg, bp, NULL, BP_GET_PSIZE(bp),
+ BP_GET_PSIZE(bp), done, private, ZIO_TYPE_CLAIM, ZIO_PRIORITY_NOW,
+ flags, NULL, 0, NULL, ZIO_STAGE_OPEN, ZIO_CLAIM_PIPELINE);
+ ASSERT0(zio->io_queued_timestamp);
+
+ return (zio);
+}
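+
+ /*
+ * A minimal sketch of the usual caller, assuming ZIL-style claiming
+ * during pool open with first_txg == spa_min_claim_txg(spa):
+ *
+ * error = zio_wait(zio_claim(NULL, spa, first_txg, bp,
+ *     NULL, NULL, ZIO_FLAG_CANFAIL));
+ *
+ * Passing txg == 0 instead merely verifies that the block is
+ * claimable without recording the claim.
+ */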
+
+zio_t *
+zio_ioctl(zio_t *pio, spa_t *spa, vdev_t *vd, int cmd, uint64_t offset,
+ uint64_t size, zio_done_func_t *done, void *private,
+ zio_priority_t priority, enum zio_flag flags)
+{
+ zio_t *zio;
+ int c;
+
+ if (vd->vdev_children == 0) {
+ zio = zio_create(pio, spa, 0, NULL, NULL, 0, 0, done, private,
+ ZIO_TYPE_IOCTL, ZIO_PRIORITY_NOW, flags, vd, 0, NULL,
+ ZIO_STAGE_OPEN, ZIO_IOCTL_PIPELINE);
+
+ zio->io_cmd = cmd;
+ } else {
+ zio = zio_null(pio, spa, NULL, NULL, NULL, flags);
+
+ for (c = 0; c < vd->vdev_children; c++)
+ zio_nowait(zio_ioctl(zio, spa, vd->vdev_child[c], cmd,
+ offset, size, done, private, priority, flags));
+ }
+
+ return (zio);
+}
+
+zio_t *
+zio_read_phys(zio_t *pio, vdev_t *vd, uint64_t offset, uint64_t size,
+ abd_t *data, int checksum, zio_done_func_t *done, void *private,
+ zio_priority_t priority, enum zio_flag flags, boolean_t labels)
+{
+ zio_t *zio;
+
+ ASSERT(vd->vdev_children == 0);
+ ASSERT(!labels || offset + size <= VDEV_LABEL_START_SIZE ||
+ offset >= vd->vdev_psize - VDEV_LABEL_END_SIZE);
+ ASSERT3U(offset + size, <=, vd->vdev_psize);
+
+ zio = zio_create(pio, vd->vdev_spa, 0, NULL, data, size, size, done,
+ private, ZIO_TYPE_READ, priority, flags | ZIO_FLAG_PHYSICAL, vd,
+ offset, NULL, ZIO_STAGE_OPEN, ZIO_READ_PHYS_PIPELINE);
+
+ zio->io_prop.zp_checksum = checksum;
+
+ return (zio);
+}
+
+zio_t *
+zio_write_phys(zio_t *pio, vdev_t *vd, uint64_t offset, uint64_t size,
+ abd_t *data, int checksum, zio_done_func_t *done, void *private,
+ zio_priority_t priority, enum zio_flag flags, boolean_t labels)
+{
+ zio_t *zio;
+
+ ASSERT(vd->vdev_children == 0);
+ ASSERT(!labels || offset + size <= VDEV_LABEL_START_SIZE ||
+ offset >= vd->vdev_psize - VDEV_LABEL_END_SIZE);
+ ASSERT3U(offset + size, <=, vd->vdev_psize);
+
+ zio = zio_create(pio, vd->vdev_spa, 0, NULL, data, size, size, done,
+ private, ZIO_TYPE_WRITE, priority, flags | ZIO_FLAG_PHYSICAL, vd,
+ offset, NULL, ZIO_STAGE_OPEN, ZIO_WRITE_PHYS_PIPELINE);
+
+ zio->io_prop.zp_checksum = checksum;
+
+ if (zio_checksum_table[checksum].ci_flags & ZCHECKSUM_FLAG_EMBEDDED) {
+ /*
+ * zec checksums are necessarily destructive -- they modify
+ * the end of the write buffer to hold the verifier/checksum.
+ * Therefore, we must make a local copy in case the data is
+ * being written to multiple places in parallel.
+ */
+ abd_t *wbuf = abd_alloc_sametype(data, size);
+ abd_copy(wbuf, data, size);
+
+ zio_push_transform(zio, wbuf, size, size, NULL);
+ }
+
+ return (zio);
+}
+
+/*
+ * Create a child I/O to do some work for us.
+ */
+zio_t *
+zio_vdev_child_io(zio_t *pio, blkptr_t *bp, vdev_t *vd, uint64_t offset,
+ abd_t *data, uint64_t size, int type, zio_priority_t priority,
+ enum zio_flag flags, zio_done_func_t *done, void *private)
+{
+ enum zio_stage pipeline = ZIO_VDEV_CHILD_PIPELINE;
+ zio_t *zio;
+
+ /*
+ * vdev child I/Os do not propagate their error to the parent.
+ * Therefore, for correct operation the caller *must* check for
+ * and handle the error in the child I/O's done callback.
+ * The only exceptions are I/Os that we don't care about
+ * (OPTIONAL or REPAIR).
+ */
+ ASSERT((flags & ZIO_FLAG_OPTIONAL) || (flags & ZIO_FLAG_IO_REPAIR) ||
+ done != NULL);
+
+ if (type == ZIO_TYPE_READ && bp != NULL) {
+ /*
+ * If we have the bp, then the child should perform the
+ * checksum and the parent need not. This pushes error
+ * detection as close to the leaves as possible and
+ * eliminates redundant checksums in the interior nodes.
+ */
+ pipeline |= ZIO_STAGE_CHECKSUM_VERIFY;
+ pio->io_pipeline &= ~ZIO_STAGE_CHECKSUM_VERIFY;
+ }
+
+ /* Not all I/O types require the vdev I/O done stage, e.g. free */
+ if (type == ZIO_TYPE_FREE &&
+ !(pio->io_pipeline & ZIO_STAGE_VDEV_IO_DONE))
+ pipeline &= ~ZIO_STAGE_VDEV_IO_DONE;
+
+ if (vd->vdev_ops->vdev_op_leaf) {
+ ASSERT0(vd->vdev_children);
+ offset += VDEV_LABEL_START_SIZE;
+ }
+
+ flags |= ZIO_VDEV_CHILD_FLAGS(pio);
+
+ /*
+ * If we've decided to do a repair, the write is not speculative --
+ * even if the original read was.
+ */
+ if (flags & ZIO_FLAG_IO_REPAIR)
+ flags &= ~ZIO_FLAG_SPECULATIVE;
+
+ /*
+ * If we're creating a child I/O that is not associated with a
+ * top-level vdev, then the child zio is not an allocating I/O.
+ * If this is a retried I/O then we ignore it since we will
+ * have already processed the original allocating I/O.
+ */
+ if (flags & ZIO_FLAG_IO_ALLOCATING &&
+ (vd != vd->vdev_top || (flags & ZIO_FLAG_IO_RETRY))) {
+ ASSERT(pio->io_metaslab_class != NULL);
+ ASSERT(pio->io_metaslab_class->mc_alloc_throttle_enabled);
+ ASSERT(type == ZIO_TYPE_WRITE);
+ ASSERT(priority == ZIO_PRIORITY_ASYNC_WRITE);
+ ASSERT(!(flags & ZIO_FLAG_IO_REPAIR));
+ ASSERT(!(pio->io_flags & ZIO_FLAG_IO_REWRITE) ||
+ pio->io_child_type == ZIO_CHILD_GANG);
+
+ flags &= ~ZIO_FLAG_IO_ALLOCATING;
+ }
+
+ zio = zio_create(pio, pio->io_spa, pio->io_txg, bp, data, size, size,
+ done, private, type, priority, flags, vd, offset, &pio->io_bookmark,
+ ZIO_STAGE_VDEV_IO_START >> 1, pipeline);
+ ASSERT3U(zio->io_child_type, ==, ZIO_CHILD_VDEV);
+
+ zio->io_physdone = pio->io_physdone;
+ if (vd->vdev_ops->vdev_op_leaf && zio->io_logical != NULL)
+ zio->io_logical->io_phys_children++;
+
+ return (zio);
+}
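+
+ /*
+ * A minimal sketch of the typical caller, assuming a mirror-style
+ * vdev fanning a logical read out to one child disk:
+ *
+ * zio_nowait(zio_vdev_child_io(zio, zio->io_bp, mc_vd, mc_offset,
+ *     zio->io_abd, zio->io_size, zio->io_type, zio->io_priority, 0,
+ *     vdev_mirror_child_done, mc));
+ *
+ * vdev_mirror_child_done and mc stand in for the caller's own
+ * callback and state; per the comment above, a done callback is
+ * mandatory unless the child is OPTIONAL or REPAIR.
+ */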
+
+zio_t *
+zio_vdev_delegated_io(vdev_t *vd, uint64_t offset, abd_t *data, uint64_t size,
+ zio_type_t type, zio_priority_t priority, enum zio_flag flags,
+ zio_done_func_t *done, void *private)
+{
+ zio_t *zio;
+
+ ASSERT(vd->vdev_ops->vdev_op_leaf);
+
+ zio = zio_create(NULL, vd->vdev_spa, 0, NULL,
+ data, size, size, done, private, type, priority,
+ flags | ZIO_FLAG_CANFAIL | ZIO_FLAG_DONT_RETRY | ZIO_FLAG_DELEGATED,
+ vd, offset, NULL,
+ ZIO_STAGE_VDEV_IO_START >> 1, ZIO_VDEV_CHILD_PIPELINE);
+
+ return (zio);
+}
+
+void
+zio_flush(zio_t *zio, vdev_t *vd)
+{
+ zio_nowait(zio_ioctl(zio, zio->io_spa, vd, DKIOCFLUSHWRITECACHE, 0, 0,
+ NULL, NULL, ZIO_PRIORITY_NOW,
+ ZIO_FLAG_CANFAIL | ZIO_FLAG_DONT_PROPAGATE | ZIO_FLAG_DONT_RETRY));
+}
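+
+ /*
+ * A minimal sketch, assuming the caller wants to flush the write
+ * caches of every top-level vdev under a root zio, as the sync
+ * path does:
+ *
+ * zio_t *rio = zio_root(spa, NULL, NULL, ZIO_FLAG_CANFAIL);
+ * for (uint64_t c = 0; c < spa->spa_root_vdev->vdev_children; c++)
+ *     zio_flush(rio, spa->spa_root_vdev->vdev_child[c]);
+ * (void) zio_wait(rio);
+ *
+ * zio_ioctl() fans DKIOCFLUSHWRITECACHE out to every leaf below the
+ * given vdev, so flushing the top-level vdevs reaches every disk.
+ */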
+
+zio_t *
+zio_trim(zio_t *zio, spa_t *spa, vdev_t *vd, uint64_t offset, uint64_t size)
+{
+
+ ASSERT(vd->vdev_ops->vdev_op_leaf);
+
+ return (zio_create(zio, spa, 0, NULL, NULL, size, size, NULL, NULL,
+ ZIO_TYPE_FREE, ZIO_PRIORITY_TRIM, ZIO_FLAG_DONT_AGGREGATE |
+ ZIO_FLAG_CANFAIL | ZIO_FLAG_DONT_PROPAGATE | ZIO_FLAG_DONT_RETRY,
+ vd, offset, NULL, ZIO_STAGE_OPEN, ZIO_FREE_PHYS_PIPELINE));
+}
+
+void
+zio_shrink(zio_t *zio, uint64_t size)
+{
+ ASSERT3P(zio->io_executor, ==, NULL);
+ ASSERT3P(zio->io_orig_size, ==, zio->io_size);
+ ASSERT3U(size, <=, zio->io_size);
+
+ /*
+ * We don't shrink for raidz because of problems with the
+ * reconstruction when reading back less than the block size.
+ * Note, BP_IS_RAIDZ() assumes no compression.
+ */
+ ASSERT(BP_GET_COMPRESS(zio->io_bp) == ZIO_COMPRESS_OFF);
+ if (!BP_IS_RAIDZ(zio->io_bp)) {
+ /* we are not doing a raw write */
+ ASSERT3U(zio->io_size, ==, zio->io_lsize);
+ zio->io_orig_size = zio->io_size = zio->io_lsize = size;
+ }
+}
+
+/*
+ * ==========================================================================
+ * Prepare to read and write logical blocks
+ * ==========================================================================
+ */
+
+static zio_t *
+zio_read_bp_init(zio_t *zio)
+{
+ blkptr_t *bp = zio->io_bp;
+
+ ASSERT3P(zio->io_bp, ==, &zio->io_bp_copy);
+
+ if (BP_GET_COMPRESS(bp) != ZIO_COMPRESS_OFF &&
+ zio->io_child_type == ZIO_CHILD_LOGICAL &&
+ !(zio->io_flags & ZIO_FLAG_RAW)) {
+ uint64_t psize =
+ BP_IS_EMBEDDED(bp) ? BPE_GET_PSIZE(bp) : BP_GET_PSIZE(bp);
+ zio_push_transform(zio, abd_alloc_sametype(zio->io_abd, psize),
+ psize, psize, zio_decompress);
+ }
+
+ if (BP_IS_EMBEDDED(bp) && BPE_GET_ETYPE(bp) == BP_EMBEDDED_TYPE_DATA) {
+ zio->io_pipeline = ZIO_INTERLOCK_PIPELINE;
+
+ int psize = BPE_GET_PSIZE(bp);
+ void *data = abd_borrow_buf(zio->io_abd, psize);
+ decode_embedded_bp_compressed(bp, data);
+ abd_return_buf_copy(zio->io_abd, data, psize);
+ } else {
+ ASSERT(!BP_IS_EMBEDDED(bp));
+ ASSERT3P(zio->io_bp, ==, &zio->io_bp_copy);
+ }
+
+ if (!DMU_OT_IS_METADATA(BP_GET_TYPE(bp)) && BP_GET_LEVEL(bp) == 0)
+ zio->io_flags |= ZIO_FLAG_DONT_CACHE;
+
+ if (BP_GET_TYPE(bp) == DMU_OT_DDT_ZAP)
+ zio->io_flags |= ZIO_FLAG_DONT_CACHE;
+
+ if (BP_GET_DEDUP(bp) && zio->io_child_type == ZIO_CHILD_LOGICAL)
+ zio->io_pipeline = ZIO_DDT_READ_PIPELINE;
+
+ return (zio);
+}
+
+static zio_t *
+zio_write_bp_init(zio_t *zio)
+{
+ if (!IO_IS_ALLOCATING(zio))
+ return (zio);
+
+ ASSERT(zio->io_child_type != ZIO_CHILD_DDT);
+
+ if (zio->io_bp_override) {
+ blkptr_t *bp = zio->io_bp;
+ zio_prop_t *zp = &zio->io_prop;
+
+ ASSERT(bp->blk_birth != zio->io_txg);
+ ASSERT(BP_GET_DEDUP(zio->io_bp_override) == 0);
+
+ *bp = *zio->io_bp_override;
+ zio->io_pipeline = ZIO_INTERLOCK_PIPELINE;
+
+ if (BP_IS_EMBEDDED(bp))
+ return (zio);
+
+ /*
+ * If we've been overridden and nopwrite is set then
+ * set the flag accordingly to indicate that a nopwrite
+ * has already occurred.
+ */
+ if (!BP_IS_HOLE(bp) && zp->zp_nopwrite) {
+ ASSERT(!zp->zp_dedup);
+ ASSERT3U(BP_GET_CHECKSUM(bp), ==, zp->zp_checksum);
+ zio->io_flags |= ZIO_FLAG_NOPWRITE;
+ return (zio);
+ }
+
+ ASSERT(!zp->zp_nopwrite);
+
+ if (BP_IS_HOLE(bp) || !zp->zp_dedup)
+ return (zio);
+
+ ASSERT((zio_checksum_table[zp->zp_checksum].ci_flags &
+ ZCHECKSUM_FLAG_DEDUP) || zp->zp_dedup_verify);
+
+ if (BP_GET_CHECKSUM(bp) == zp->zp_checksum) {
+ BP_SET_DEDUP(bp, 1);
+ zio->io_pipeline |= ZIO_STAGE_DDT_WRITE;
+ return (zio);
+ }
+
+ /*
+ * We were unable to handle this as an override bp, treat
+ * it as a regular write I/O.
+ */
+ zio->io_bp_override = NULL;
+ *bp = zio->io_bp_orig;
+ zio->io_pipeline = zio->io_orig_pipeline;
+ }
+
+ return (zio);
+}
+
+static zio_t *
+zio_write_compress(zio_t *zio)
+{
+ spa_t *spa = zio->io_spa;
+ zio_prop_t *zp = &zio->io_prop;
+ enum zio_compress compress = zp->zp_compress;
+ blkptr_t *bp = zio->io_bp;
+ uint64_t lsize = zio->io_lsize;
+ uint64_t psize = zio->io_size;
+ int pass = 1;
+
+ EQUIV(lsize != psize, (zio->io_flags & ZIO_FLAG_RAW) != 0);
+
+ /*
+ * If our children haven't all reached the ready stage,
+ * wait for them and then repeat this pipeline stage.
+ */
+ if (zio_wait_for_children(zio, ZIO_CHILD_LOGICAL_BIT |
+ ZIO_CHILD_GANG_BIT, ZIO_WAIT_READY)) {
+ return (NULL);
+ }
+
+ if (!IO_IS_ALLOCATING(zio))
+ return (zio);
+
+ if (zio->io_children_ready != NULL) {
+ /*
+ * Now that all our children are ready, run the callback
+ * associated with this zio in case it wants to modify the
+ * data to be written.
+ */
+ ASSERT3U(zp->zp_level, >, 0);
+ zio->io_children_ready(zio);
+ }
+
+ ASSERT(zio->io_child_type != ZIO_CHILD_DDT);
+ ASSERT(zio->io_bp_override == NULL);
+
+ if (!BP_IS_HOLE(bp) && bp->blk_birth == zio->io_txg) {
+ /*
+ * We're rewriting an existing block, which means we're
+ * working on behalf of spa_sync(). For spa_sync() to
+ * converge, it must eventually be the case that we don't
+ * have to allocate new blocks. But compression changes
+ * the blocksize, which forces a reallocate, and makes
+ * convergence take longer. Therefore, after the first
+ * few passes, stop compressing to ensure convergence.
+ */
+ pass = spa_sync_pass(spa);
+
+ ASSERT(zio->io_txg == spa_syncing_txg(spa));
+ ASSERT(zio->io_child_type == ZIO_CHILD_LOGICAL);
+ ASSERT(!BP_GET_DEDUP(bp));
+
+ if (pass >= zfs_sync_pass_dont_compress)
+ compress = ZIO_COMPRESS_OFF;
+
+ /* Make sure someone doesn't change their mind on overwrites */
+ ASSERT(BP_IS_EMBEDDED(bp) || MIN(zp->zp_copies + BP_IS_GANG(bp),
+ spa_max_replication(spa)) == BP_GET_NDVAS(bp));
+ }
+
+ /* If it's a compressed write that is not raw, compress the buffer. */
+ if (compress != ZIO_COMPRESS_OFF && psize == lsize) {
+ void *cbuf = zio_buf_alloc(lsize);
+ psize = zio_compress_data(compress, zio->io_abd, cbuf, lsize);
+ if (psize == 0 || psize == lsize) {
+ compress = ZIO_COMPRESS_OFF;
+ zio_buf_free(cbuf, lsize);
+ } else if (!zp->zp_dedup && psize <= BPE_PAYLOAD_SIZE &&
+ zp->zp_level == 0 && !DMU_OT_HAS_FILL(zp->zp_type) &&
+ spa_feature_is_enabled(spa, SPA_FEATURE_EMBEDDED_DATA)) {
+ encode_embedded_bp_compressed(bp,
+ cbuf, compress, lsize, psize);
+ BPE_SET_ETYPE(bp, BP_EMBEDDED_TYPE_DATA);
+ BP_SET_TYPE(bp, zio->io_prop.zp_type);
+ BP_SET_LEVEL(bp, zio->io_prop.zp_level);
+ zio_buf_free(cbuf, lsize);
+ bp->blk_birth = zio->io_txg;
+ zio->io_pipeline = ZIO_INTERLOCK_PIPELINE;
+ ASSERT(spa_feature_is_active(spa,
+ SPA_FEATURE_EMBEDDED_DATA));
+ return (zio);
+ } else {
+ /*
+ * Round the compressed size up to the ashift
+ * of the smallest-ashift device, and zero the tail.
+ * This ensures that the compressed size of the BP
+ * (and thus the compressratio property) is correct,
+ * in that we charge for the padding used to fill out
+ * the last sector.
+ */
+ ASSERT3U(spa->spa_min_ashift, >=, SPA_MINBLOCKSHIFT);
+ size_t rounded = (size_t)P2ROUNDUP(psize,
+ 1ULL << spa->spa_min_ashift);
+ if (rounded >= lsize) {
+ compress = ZIO_COMPRESS_OFF;
+ zio_buf_free(cbuf, lsize);
+ psize = lsize;
+ } else {
+ abd_t *cdata = abd_get_from_buf(cbuf, lsize);
+ abd_take_ownership_of_buf(cdata, B_TRUE);
+ abd_zero_off(cdata, psize, rounded - psize);
+ psize = rounded;
+ zio_push_transform(zio, cdata,
+ psize, lsize, NULL);
+ }
+ }
+
+ /*
+ * We could not embed the data, so clear any leftover
+ * override state and restore the original bp and
+ * pipeline, treating this as a regular write I/O.
+ */
+ zio->io_bp_override = NULL;
+ *bp = zio->io_bp_orig;
+ zio->io_pipeline = zio->io_orig_pipeline;
+ } else {
+ ASSERT3U(psize, !=, 0);
+ }
+
+ /*
+ * The final pass of spa_sync() must be all rewrites, but the first
+ * few passes offer a trade-off: allocating blocks defers convergence,
+ * but newly allocated blocks are sequential, so they can be written
+ * to disk faster. Therefore, we allow the first few passes of
+ * spa_sync() to allocate new blocks, but force rewrites after that.
+ * There should only be a handful of blocks after pass 1 in any case.
+ */
+ if (!BP_IS_HOLE(bp) && bp->blk_birth == zio->io_txg &&
+ BP_GET_PSIZE(bp) == psize &&
+ pass >= zfs_sync_pass_rewrite) {
+ VERIFY3U(psize, !=, 0);
+ enum zio_stage gang_stages = zio->io_pipeline & ZIO_GANG_STAGES;
+
+ zio->io_pipeline = ZIO_REWRITE_PIPELINE | gang_stages;
+ zio->io_flags |= ZIO_FLAG_IO_REWRITE;
+ } else {
+ BP_ZERO(bp);
+ zio->io_pipeline = ZIO_WRITE_PIPELINE;
+ }
+
+ if (psize == 0) {
+ if (zio->io_bp_orig.blk_birth != 0 &&
+ spa_feature_is_active(spa, SPA_FEATURE_HOLE_BIRTH)) {
+ BP_SET_LSIZE(bp, lsize);
+ BP_SET_TYPE(bp, zp->zp_type);
+ BP_SET_LEVEL(bp, zp->zp_level);
+ BP_SET_BIRTH(bp, zio->io_txg, 0);
+ }
+ zio->io_pipeline = ZIO_INTERLOCK_PIPELINE;
+ } else {
+ ASSERT(zp->zp_checksum != ZIO_CHECKSUM_GANG_HEADER);
+ BP_SET_LSIZE(bp, lsize);
+ BP_SET_TYPE(bp, zp->zp_type);
+ BP_SET_LEVEL(bp, zp->zp_level);
+ BP_SET_PSIZE(bp, psize);
+ BP_SET_COMPRESS(bp, compress);
+ BP_SET_CHECKSUM(bp, zp->zp_checksum);
+ BP_SET_DEDUP(bp, zp->zp_dedup);
+ BP_SET_BYTEORDER(bp, ZFS_HOST_BYTEORDER);
+ if (zp->zp_dedup) {
+ ASSERT(zio->io_child_type == ZIO_CHILD_LOGICAL);
+ ASSERT(!(zio->io_flags & ZIO_FLAG_IO_REWRITE));
+ zio->io_pipeline = ZIO_DDT_WRITE_PIPELINE;
+ }
+ if (zp->zp_nopwrite) {
+ ASSERT(zio->io_child_type == ZIO_CHILD_LOGICAL);
+ ASSERT(!(zio->io_flags & ZIO_FLAG_IO_REWRITE));
+ zio->io_pipeline |= ZIO_STAGE_NOP_WRITE;
+ }
+ }
+ return (zio);
+}
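+
+ /*
+ * Worked example of the rounding above (an illustration only): with
+ * lsize = 8192, a 5000-byte compression result, and
+ * spa_min_ashift = 12, rounded = P2ROUNDUP(5000, 4096) = 8192.
+ * Since rounded >= lsize, compression is dropped and the block is
+ * written uncompressed. With spa_min_ashift = 9, rounded would be
+ * 5120 and the 120-byte tail would be zeroed instead.
+ */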
+
+static zio_t *
+zio_free_bp_init(zio_t *zio)
+{
+ blkptr_t *bp = zio->io_bp;
+
+ if (zio->io_child_type == ZIO_CHILD_LOGICAL) {
+ if (BP_GET_DEDUP(bp))
+ zio->io_pipeline = ZIO_DDT_FREE_PIPELINE;
+ }
+
+ ASSERT3P(zio->io_bp, ==, &zio->io_bp_copy);
+
+ return (zio);
+}
+
+/*
+ * ==========================================================================
+ * Execute the I/O pipeline
+ * ==========================================================================
+ */
+
+static void
+zio_taskq_dispatch(zio_t *zio, zio_taskq_type_t q, boolean_t cutinline)
+{
+ spa_t *spa = zio->io_spa;
+ zio_type_t t = zio->io_type;
+ int flags = (cutinline ? TQ_FRONT : 0);
+
+ ASSERT(q == ZIO_TASKQ_ISSUE || q == ZIO_TASKQ_INTERRUPT);
+
+ /*
+ * If we're a config writer or a probe, the normal issue and
+ * interrupt threads may all be blocked waiting for the config lock.
+ * In this case, select the otherwise-unused taskq for ZIO_TYPE_NULL.
+ */
+ if (zio->io_flags & (ZIO_FLAG_CONFIG_WRITER | ZIO_FLAG_PROBE))
+ t = ZIO_TYPE_NULL;
+
+ /*
+ * A similar issue exists for the L2ARC write thread until L2ARC 2.0.
+ */
+ if (t == ZIO_TYPE_WRITE && zio->io_vd && zio->io_vd->vdev_aux)
+ t = ZIO_TYPE_NULL;
+
+ /*
+ * If this is a high priority I/O, then use the high priority taskq if
+ * available.
+ */
+ if ((zio->io_priority == ZIO_PRIORITY_NOW ||
+ zio->io_priority == ZIO_PRIORITY_SYNC_WRITE) &&
+ spa->spa_zio_taskq[t][q + 1].stqs_count != 0)
+ q++;
+
+ ASSERT3U(q, <, ZIO_TASKQ_TYPES);
+
+ /*
+ * NB: We are assuming that the zio can only be dispatched
+ * to a single taskq at a time. It would be a grievous error
+ * to dispatch the zio to another taskq at the same time.
+ */
+#if defined(illumos) || !defined(_KERNEL)
+ ASSERT(zio->io_tqent.tqent_next == NULL);
+#else
+ ASSERT(zio->io_tqent.tqent_task.ta_pending == 0);
+#endif
+ spa_taskq_dispatch_ent(spa, t, q, (task_func_t *)zio_execute, zio,
+ flags, &zio->io_tqent);
+}
+
+static boolean_t
+zio_taskq_member(zio_t *zio, zio_taskq_type_t q)
+{
+ kthread_t *executor = zio->io_executor;
+ spa_t *spa = zio->io_spa;
+
+ for (zio_type_t t = 0; t < ZIO_TYPES; t++) {
+ spa_taskqs_t *tqs = &spa->spa_zio_taskq[t][q];
+ uint_t i;
+ for (i = 0; i < tqs->stqs_count; i++) {
+ if (taskq_member(tqs->stqs_taskq[i], executor))
+ return (B_TRUE);
+ }
+ }
+
+ return (B_FALSE);
+}
+
+static zio_t *
+zio_issue_async(zio_t *zio)
+{
+ zio_taskq_dispatch(zio, ZIO_TASKQ_ISSUE, B_FALSE);
+
+ return (NULL);
+}
+
+void
+zio_interrupt(zio_t *zio)
+{
+ zio_taskq_dispatch(zio, ZIO_TASKQ_INTERRUPT, B_FALSE);
+}
+
+void
+zio_delay_interrupt(zio_t *zio)
+{
+ /*
+ * The timeout_generic() function isn't defined in userspace, so
+ * rather than trying to implement the function, the zio delay
+ * functionality has been disabled for userspace builds.
+ */
+
+#ifdef _KERNEL
+ /*
+ * If io_target_timestamp is zero, then no delay has been registered
+ * for this IO; jump to the end of this function and "skip" the
+ * delay, issuing the zio directly to the zio layer.
+ */
+ if (zio->io_target_timestamp != 0) {
+ hrtime_t now = gethrtime();
+
+ if (now >= zio->io_target_timestamp) {
+ /*
+ * This IO has already taken longer than the target
+ * delay to complete, so we don't want to delay it
+ * any longer; we "miss" the delay and issue it
+ * directly to the zio layer. This is likely due to
+ * the target latency being set to a value less than
+ * the underlying hardware can satisfy (e.g. delay
+ * set to 1ms, but the disks take 10ms to complete an
+ * IO request).
+ */
+
+ DTRACE_PROBE2(zio__delay__miss, zio_t *, zio,
+ hrtime_t, now);
+
+ zio_interrupt(zio);
+ } else {
+ hrtime_t diff = zio->io_target_timestamp - now;
+
+ DTRACE_PROBE3(zio__delay__hit, zio_t *, zio,
+ hrtime_t, now, hrtime_t, diff);
+
+#ifdef __FreeBSD__
+ callout_reset_sbt(&zio->io_timer, nstosbt(diff), 0,
+ (void (*)(void *))zio_interrupt, zio, C_HARDCLOCK);
+#else
+ (void) timeout_generic(CALLOUT_NORMAL,
+ (void (*)(void *))zio_interrupt, zio, diff, 1, 0);
+#endif
+ }
+
+ return;
+ }
+#endif
+
+ DTRACE_PROBE1(zio__delay__skip, zio_t *, zio);
+ zio_interrupt(zio);
+}
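+
+ /*
+ * io_target_timestamp is normally set by the fault-injection
+ * framework (zinject's I/O delay injection, via
+ * zio_handle_io_delay()); in a healthy pool it is zero and this
+ * function reduces to zio_interrupt(). The exact zinject syntax
+ * varies by platform, so treat this as a pointer to the injection
+ * code rather than a recipe.
+ */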
+
+/*
+ * Execute the I/O pipeline until one of the following occurs:
+ *
+ * (1) the I/O completes
+ * (2) the pipeline stalls waiting for dependent child I/Os
+ * (3) the I/O issues, so we're waiting for an I/O completion interrupt
+ * (4) the I/O is delegated by vdev-level caching or aggregation
+ * (5) the I/O is deferred due to vdev-level queueing
+ * (6) the I/O is handed off to another thread.
+ *
+ * In all cases, the pipeline stops whenever there's no CPU work; it never
+ * burns a thread in cv_wait().
+ *
+ * There's no locking on io_stage because there's no legitimate way
+ * for multiple threads to be attempting to process the same I/O.
+ */
+static zio_pipe_stage_t *zio_pipeline[];
+
+void
+zio_execute(zio_t *zio)
+{
+ ASSERT3U(zio->io_queued_timestamp, >, 0);
+
+ while (zio->io_stage < ZIO_STAGE_DONE) {
+ enum zio_stage pipeline = zio->io_pipeline;
+ enum zio_stage stage = zio->io_stage;
+
+ zio->io_executor = curthread;
+
+ ASSERT(!MUTEX_HELD(&zio->io_lock));
+ ASSERT(ISP2(stage));
+ ASSERT(zio->io_stall == NULL);
+
+ do {
+ stage <<= 1;
+ } while ((stage & pipeline) == 0);
+
+ ASSERT(stage <= ZIO_STAGE_DONE);
+
+ /*
+ * If we are in interrupt context and this pipeline stage
+ * will grab a config lock that is held across I/O,
+ * or may wait for an I/O that needs an interrupt thread
+ * to complete, issue async to avoid deadlock.
+ *
+ * For VDEV_IO_START, we cut in line so that the io will
+ * be sent to disk promptly.
+ */
+ if ((stage & ZIO_BLOCKING_STAGES) && zio->io_vd == NULL &&
+ zio_taskq_member(zio, ZIO_TASKQ_INTERRUPT)) {
+ boolean_t cut = (stage == ZIO_STAGE_VDEV_IO_START) ?
+ zio_requeue_io_start_cut_in_line : B_FALSE;
+ zio_taskq_dispatch(zio, ZIO_TASKQ_ISSUE, cut);
+ return;
+ }
+
+ zio->io_stage = stage;
+ zio->io_pipeline_trace |= zio->io_stage;
+
+ /*
+ * The zio pipeline stage returns the next zio to execute
+ * (typically the same as this one), or NULL if we should
+ * stop.
+ */
+ zio = zio_pipeline[highbit64(stage) - 1](zio);
+
+ if (zio == NULL)
+ return;
+ }
+}
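+
+ /*
+ * Worked example of the stage walk above: io_stage and io_pipeline
+ * are one-hot bitmasks. If stage == 0x02 and the pipeline enables
+ * only 0x02 and 0x10, the loop shifts 0x02 -> 0x04 -> 0x08 -> 0x10,
+ * stopping at the next enabled stage. highbit64(0x10) == 5, so
+ * zio_pipeline[4] is the handler invoked for that stage.
+ */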
+
+/*
+ * ==========================================================================
+ * Initiate I/O, either sync or async
+ * ==========================================================================
+ */
+int
+zio_wait(zio_t *zio)
+{
+ int error;
+
+ ASSERT3P(zio->io_stage, ==, ZIO_STAGE_OPEN);
+ ASSERT3P(zio->io_executor, ==, NULL);
+
+ zio->io_waiter = curthread;
+ ASSERT0(zio->io_queued_timestamp);
+ zio->io_queued_timestamp = gethrtime();
+
+ zio_execute(zio);
+
+ mutex_enter(&zio->io_lock);
+ while (zio->io_executor != NULL)
+ cv_wait(&zio->io_cv, &zio->io_lock);
+ mutex_exit(&zio->io_lock);
+
+ error = zio->io_error;
+ zio_destroy(zio);
+
+ return (error);
+}
+
+void
+zio_nowait(zio_t *zio)
+{
+ ASSERT3P(zio->io_executor, ==, NULL);
+
+ if (zio->io_child_type == ZIO_CHILD_LOGICAL &&
+ zio_unique_parent(zio) == NULL) {
+ /*
+ * This is a logical async I/O with no parent to wait for it.
+ * We add it to the spa_async_zio_root "Godfather" I/O, which
+ * will ensure it completes prior to unloading the pool.
+ */
+ spa_t *spa = zio->io_spa;
+
+ zio_add_child(spa->spa_async_zio_root[CPU_SEQID], zio);
+ }
+
+ ASSERT0(zio->io_queued_timestamp);
+ zio->io_queued_timestamp = gethrtime();
+ zio_execute(zio);
+}
+
+/*
+ * ==========================================================================
+ * Reexecute, cancel, or suspend/resume failed I/O
+ * ==========================================================================
+ */
+
+static void
+zio_reexecute(zio_t *pio)
+{
+ zio_t *cio, *cio_next;
+
+ ASSERT(pio->io_child_type == ZIO_CHILD_LOGICAL);
+ ASSERT(pio->io_orig_stage == ZIO_STAGE_OPEN);
+ ASSERT(pio->io_gang_leader == NULL);
+ ASSERT(pio->io_gang_tree == NULL);
+
+ pio->io_flags = pio->io_orig_flags;
+ pio->io_stage = pio->io_orig_stage;
+ pio->io_pipeline = pio->io_orig_pipeline;
+ pio->io_reexecute = 0;
+ pio->io_flags |= ZIO_FLAG_REEXECUTED;
+ pio->io_pipeline_trace = 0;
+ pio->io_error = 0;
+ for (int w = 0; w < ZIO_WAIT_TYPES; w++)
+ pio->io_state[w] = 0;
+ for (int c = 0; c < ZIO_CHILD_TYPES; c++)
+ pio->io_child_error[c] = 0;
+
+ if (IO_IS_ALLOCATING(pio))
+ BP_ZERO(pio->io_bp);
+
+ /*
+ * As we reexecute pio's children, new children could be created.
+ * New children go to the head of pio's io_child_list, however,
+ * so we will (correctly) not reexecute them. The key is that
+ * the remainder of pio's io_child_list, from 'cio_next' onward,
+ * cannot be affected by any side effects of reexecuting 'cio'.
+ */
+ zio_link_t *zl = NULL;
+ mutex_enter(&pio->io_lock);
+ for (cio = zio_walk_children(pio, &zl); cio != NULL; cio = cio_next) {
+ cio_next = zio_walk_children(pio, &zl);
+ for (int w = 0; w < ZIO_WAIT_TYPES; w++)
+ pio->io_children[cio->io_child_type][w]++;
+ mutex_exit(&pio->io_lock);
+ zio_reexecute(cio);
+ mutex_enter(&pio->io_lock);
+ }
+ mutex_exit(&pio->io_lock);
+
+ /*
+ * Now that all children have been reexecuted, execute the parent.
+ * We don't reexecute "The Godfather" I/O here as it's the
+ * responsibility of the caller to wait on it.
+ */
+ if (!(pio->io_flags & ZIO_FLAG_GODFATHER)) {
+ pio->io_queued_timestamp = gethrtime();
+ zio_execute(pio);
+ }
+}
+
+void
+zio_suspend(spa_t *spa, zio_t *zio, zio_suspend_reason_t reason)
+{
+ if (spa_get_failmode(spa) == ZIO_FAILURE_MODE_PANIC)
+ fm_panic("Pool '%s' has encountered an uncorrectable I/O "
+ "failure and the failure mode property for this pool "
+ "is set to panic.", spa_name(spa));
+
+ zfs_ereport_post(FM_EREPORT_ZFS_IO_FAILURE, spa, NULL, NULL, 0, 0);
+
+ mutex_enter(&spa->spa_suspend_lock);
+
+ if (spa->spa_suspend_zio_root == NULL)
+ spa->spa_suspend_zio_root = zio_root(spa, NULL, NULL,
+ ZIO_FLAG_CANFAIL | ZIO_FLAG_SPECULATIVE |
+ ZIO_FLAG_GODFATHER);
+
+ spa->spa_suspended = reason;
+
+ if (zio != NULL) {
+ ASSERT(!(zio->io_flags & ZIO_FLAG_GODFATHER));
+ ASSERT(zio != spa->spa_suspend_zio_root);
+ ASSERT(zio->io_child_type == ZIO_CHILD_LOGICAL);
+ ASSERT(zio_unique_parent(zio) == NULL);
+ ASSERT(zio->io_stage == ZIO_STAGE_DONE);
+ zio_add_child(spa->spa_suspend_zio_root, zio);
+ }
+
+ mutex_exit(&spa->spa_suspend_lock);
+}
+
+int
+zio_resume(spa_t *spa)
+{
+ zio_t *pio;
+
+ /*
+ * Reexecute all previously suspended i/o.
+ */
+ mutex_enter(&spa->spa_suspend_lock);
+ spa->spa_suspended = ZIO_SUSPEND_NONE;
+ cv_broadcast(&spa->spa_suspend_cv);
+ pio = spa->spa_suspend_zio_root;
+ spa->spa_suspend_zio_root = NULL;
+ mutex_exit(&spa->spa_suspend_lock);
+
+ if (pio == NULL)
+ return (0);
+
+ zio_reexecute(pio);
+ return (zio_wait(pio));
+}
+
+void
+zio_resume_wait(spa_t *spa)
+{
+ mutex_enter(&spa->spa_suspend_lock);
+ while (spa_suspended(spa))
+ cv_wait(&spa->spa_suspend_cv, &spa->spa_suspend_lock);
+ mutex_exit(&spa->spa_suspend_lock);
+}
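+
+ /*
+ * Usage note: zio_resume() is driven from the "zpool clear" path
+ * once the underlying fault has been repaired; it clears the
+ * suspended state and reexecutes everything parked under the
+ * suspend root zio. zio_resume_wait() is for threads that merely
+ * need to block until that happens.
+ */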
+
+/*
+ * ==========================================================================
+ * Gang blocks.
+ *
+ * A gang block is a collection of small blocks that looks to the DMU
+ * like one large block. When zio_dva_allocate() cannot find a block
+ * of the requested size, due to either severe fragmentation or the pool
+ * being nearly full, it calls zio_write_gang_block() to construct the
+ * block from smaller fragments.
+ *
+ * A gang block consists of a gang header (zio_gbh_phys_t) and up to
+ * three (SPA_GBH_NBLKPTRS) gang members. The gang header is just like
+ * an indirect block: it's an array of block pointers. It consumes
+ * only one sector and hence is allocatable regardless of fragmentation.
+ * The gang header's bps point to its gang members, which hold the data.
+ *
+ * Gang blocks are self-checksumming, using the bp's <vdev, offset, txg>
+ * as the verifier to ensure uniqueness of the SHA256 checksum.
+ * Critically, the gang block bp's blk_cksum is the checksum of the data,
+ * not the gang header. This ensures that data block signatures (needed for
+ * deduplication) are independent of how the block is physically stored.
+ *
+ * Gang blocks can be nested: a gang member may itself be a gang block.
+ * Thus every gang block is a tree in which root and all interior nodes are
+ * gang headers, and the leaves are normal blocks that contain user data.
+ * The root of the gang tree is called the gang leader.
+ *
+ * To perform any operation (read, rewrite, free, claim) on a gang block,
+ * zio_gang_assemble() first assembles the gang tree (minus data leaves)
+ * in the io_gang_tree field of the original logical i/o by recursively
+ * reading the gang leader and all gang headers below it. This yields
+ * an in-core tree containing the contents of every gang header and the
+ * bps for every constituent of the gang block.
+ *
+ * With the gang tree now assembled, zio_gang_issue() just walks the gang tree
+ * and invokes a callback on each bp. To free a gang block, zio_gang_issue()
+ * calls zio_free_gang() -- a trivial wrapper around zio_free() -- for each bp.
+ * zio_claim_gang() provides a similarly trivial wrapper for zio_claim().
+ * zio_read_gang() is a wrapper around zio_read() that omits reading gang
+ * headers, since we already have those in io_gang_tree. zio_rewrite_gang()
+ * performs a zio_rewrite() of the data or, for gang headers, a zio_rewrite()
+ * of the gang header plus zio_checksum_compute() of the data to update the
+ * gang header's blk_cksum as described above.
+ *
+ * The two-phase assemble/issue model solves the problem of partial failure --
+ * what if you'd freed part of a gang block but then couldn't read the
+ * gang header for another part? Assembling the entire gang tree first
+ * ensures that all the necessary gang header I/O has succeeded before
+ * starting the actual work of free, claim, or write. Once the gang tree
+ * is assembled, free and claim are in-memory operations that cannot fail.
+ *
+ * In the event that a gang write fails, zio_dva_unallocate() walks the
+ * gang tree to immediately free (i.e. insert back into the space map)
+ * everything we've allocated. This ensures that we don't get ENOSPC
+ * errors during repeated suspend/resume cycles due to a flaky device.
+ *
+ * Gang rewrites only happen during sync-to-convergence. If we can't assemble
+ * the gang tree, we won't modify the block, so we can safely defer the free
+ * (knowing that the block is still intact). If we *can* assemble the gang
+ * tree, then even if some of the rewrites fail, zio_dva_unallocate() will free
+ * each constituent bp and we can allocate a new block on the next sync pass.
+ *
+ * In all cases, the gang tree allows complete recovery from partial failure.
+ * ==========================================================================
+ */
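+
+ /*
+ * An illustrative (minimal) gang tree for a block split into four
+ * fragments, assuming one level of nesting:
+ *
+ * leader (gang header)
+ * +- gbp[0]: data leaf
+ * +- gbp[1]: data leaf
+ * +- gbp[2]: gang header (nested)
+ *    +- gbp[0]: data leaf
+ *    +- gbp[1]: data leaf
+ *
+ * io_gang_tree holds exactly the header nodes of this shape; the
+ * data leaves are read or written on demand by zio_gang_issue().
+ */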
+
+static void
+zio_gang_issue_func_done(zio_t *zio)
+{
+ abd_put(zio->io_abd);
+}
+
+static zio_t *
+zio_read_gang(zio_t *pio, blkptr_t *bp, zio_gang_node_t *gn, abd_t *data,
+ uint64_t offset)
+{
+ if (gn != NULL)
+ return (pio);
+
+ return (zio_read(pio, pio->io_spa, bp, abd_get_offset(data, offset),
+ BP_GET_PSIZE(bp), zio_gang_issue_func_done,
+ NULL, pio->io_priority, ZIO_GANG_CHILD_FLAGS(pio),
+ &pio->io_bookmark));
+}
+
+static zio_t *
+zio_rewrite_gang(zio_t *pio, blkptr_t *bp, zio_gang_node_t *gn, abd_t *data,
+ uint64_t offset)
+{
+ zio_t *zio;
+
+ if (gn != NULL) {
+ abd_t *gbh_abd =
+ abd_get_from_buf(gn->gn_gbh, SPA_GANGBLOCKSIZE);
+ zio = zio_rewrite(pio, pio->io_spa, pio->io_txg, bp,
+ gbh_abd, SPA_GANGBLOCKSIZE, zio_gang_issue_func_done, NULL,
+ pio->io_priority, ZIO_GANG_CHILD_FLAGS(pio),
+ &pio->io_bookmark);
+ /*
+ * As we rewrite each gang header, the pipeline will compute
+ * a new gang block header checksum for it; but no one will
+ * compute a new data checksum, so we do that here. The one
+ * exception is the gang leader: the pipeline already computed
+ * its data checksum because that stage precedes gang assembly.
+ * (Presently, nothing actually uses interior data checksums;
+ * this is just good hygiene.)
+ */
+ if (gn != pio->io_gang_leader->io_gang_tree) {
+ abd_t *buf = abd_get_offset(data, offset);
+
+ zio_checksum_compute(zio, BP_GET_CHECKSUM(bp),
+ buf, BP_GET_PSIZE(bp));
+
+ abd_put(buf);
+ }
+ /*
+ * If we are here to damage data for testing purposes,
+ * leave the GBH alone so that we can detect the damage.
+ */
+ if (pio->io_gang_leader->io_flags & ZIO_FLAG_INDUCE_DAMAGE)
+ zio->io_pipeline &= ~ZIO_VDEV_IO_STAGES;
+ } else {
+ zio = zio_rewrite(pio, pio->io_spa, pio->io_txg, bp,
+ abd_get_offset(data, offset), BP_GET_PSIZE(bp),
+ zio_gang_issue_func_done, NULL, pio->io_priority,
+ ZIO_GANG_CHILD_FLAGS(pio), &pio->io_bookmark);
+ }
+
+ return (zio);
+}
+
+/* ARGSUSED */
+static zio_t *
+zio_free_gang(zio_t *pio, blkptr_t *bp, zio_gang_node_t *gn, abd_t *data,
+ uint64_t offset)
+{
+ return (zio_free_sync(pio, pio->io_spa, pio->io_txg, bp,
+ BP_IS_GANG(bp) ? SPA_GANGBLOCKSIZE : BP_GET_PSIZE(bp),
+ ZIO_GANG_CHILD_FLAGS(pio)));
+}
+
+/* ARGSUSED */
+static zio_t *
+zio_claim_gang(zio_t *pio, blkptr_t *bp, zio_gang_node_t *gn, abd_t *data,
+ uint64_t offset)
+{
+ return (zio_claim(pio, pio->io_spa, pio->io_txg, bp,
+ NULL, NULL, ZIO_GANG_CHILD_FLAGS(pio)));
+}
+
+static zio_gang_issue_func_t *zio_gang_issue_func[ZIO_TYPES] = {
+ NULL,
+ zio_read_gang,
+ zio_rewrite_gang,
+ zio_free_gang,
+ zio_claim_gang,
+ NULL
+};
+
+static void zio_gang_tree_assemble_done(zio_t *zio);
+
+static zio_gang_node_t *
+zio_gang_node_alloc(zio_gang_node_t **gnpp)
+{
+ zio_gang_node_t *gn;
+
+ ASSERT(*gnpp == NULL);
+
+ gn = kmem_zalloc(sizeof (*gn), KM_SLEEP);
+ gn->gn_gbh = zio_buf_alloc(SPA_GANGBLOCKSIZE);
+ *gnpp = gn;
+
+ return (gn);
+}
+
+static void
+zio_gang_node_free(zio_gang_node_t **gnpp)
+{
+ zio_gang_node_t *gn = *gnpp;
+
+ for (int g = 0; g < SPA_GBH_NBLKPTRS; g++)
+ ASSERT(gn->gn_child[g] == NULL);
+
+ zio_buf_free(gn->gn_gbh, SPA_GANGBLOCKSIZE);
+ kmem_free(gn, sizeof (*gn));
+ *gnpp = NULL;
+}
+
+static void
+zio_gang_tree_free(zio_gang_node_t **gnpp)
+{
+ zio_gang_node_t *gn = *gnpp;
+
+ if (gn == NULL)
+ return;
+
+ for (int g = 0; g < SPA_GBH_NBLKPTRS; g++)
+ zio_gang_tree_free(&gn->gn_child[g]);
+
+ zio_gang_node_free(gnpp);
+}
+
+static void
+zio_gang_tree_assemble(zio_t *gio, blkptr_t *bp, zio_gang_node_t **gnpp)
+{
+ zio_gang_node_t *gn = zio_gang_node_alloc(gnpp);
+ abd_t *gbh_abd = abd_get_from_buf(gn->gn_gbh, SPA_GANGBLOCKSIZE);
+
+ ASSERT(gio->io_gang_leader == gio);
+ ASSERT(BP_IS_GANG(bp));
+
+ zio_nowait(zio_read(gio, gio->io_spa, bp, gbh_abd, SPA_GANGBLOCKSIZE,
+ zio_gang_tree_assemble_done, gn, gio->io_priority,
+ ZIO_GANG_CHILD_FLAGS(gio), &gio->io_bookmark));
+}
+
+static void
+zio_gang_tree_assemble_done(zio_t *zio)
+{
+ zio_t *gio = zio->io_gang_leader;
+ zio_gang_node_t *gn = zio->io_private;
+ blkptr_t *bp = zio->io_bp;
+
+ ASSERT(gio == zio_unique_parent(zio));
+ ASSERT(zio->io_child_count == 0);
+
+ if (zio->io_error)
+ return;
+
+ /* this ABD was created from a linear buf in zio_gang_tree_assemble */
+ if (BP_SHOULD_BYTESWAP(bp))
+ byteswap_uint64_array(abd_to_buf(zio->io_abd), zio->io_size);
+
+ ASSERT3P(abd_to_buf(zio->io_abd), ==, gn->gn_gbh);
+ ASSERT(zio->io_size == SPA_GANGBLOCKSIZE);
+ ASSERT(gn->gn_gbh->zg_tail.zec_magic == ZEC_MAGIC);
+
+ abd_put(zio->io_abd);
+
+ for (int g = 0; g < SPA_GBH_NBLKPTRS; g++) {
+ blkptr_t *gbp = &gn->gn_gbh->zg_blkptr[g];
+ if (!BP_IS_GANG(gbp))
+ continue;
+ zio_gang_tree_assemble(gio, gbp, &gn->gn_child[g]);
+ }
+}
+
+static void
+zio_gang_tree_issue(zio_t *pio, zio_gang_node_t *gn, blkptr_t *bp, abd_t *data,
+ uint64_t offset)
+{
+ zio_t *gio = pio->io_gang_leader;
+ zio_t *zio;
+
+ ASSERT(BP_IS_GANG(bp) == !!gn);
+ ASSERT(BP_GET_CHECKSUM(bp) == BP_GET_CHECKSUM(gio->io_bp));
+ ASSERT(BP_GET_LSIZE(bp) == BP_GET_PSIZE(bp) || gn == gio->io_gang_tree);
+
+ /*
+ * If you're a gang header, your data is in gn->gn_gbh.
+ * If you're a gang member, your data is in 'data' and gn == NULL.
+ */
+ zio = zio_gang_issue_func[gio->io_type](pio, bp, gn, data, offset);
+
+ if (gn != NULL) {
+ ASSERT(gn->gn_gbh->zg_tail.zec_magic == ZEC_MAGIC);
+
+ for (int g = 0; g < SPA_GBH_NBLKPTRS; g++) {
+ blkptr_t *gbp = &gn->gn_gbh->zg_blkptr[g];
+ if (BP_IS_HOLE(gbp))
+ continue;
+ zio_gang_tree_issue(zio, gn->gn_child[g], gbp, data,
+ offset);
+ offset += BP_GET_PSIZE(gbp);
+ }
+ }
+
+ if (gn == gio->io_gang_tree && gio->io_abd != NULL)
+ ASSERT3U(gio->io_size, ==, offset);
+
+ if (zio != pio)
+ zio_nowait(zio);
+}
+
+static zio_t *
+zio_gang_assemble(zio_t *zio)
+{
+ blkptr_t *bp = zio->io_bp;
+
+ ASSERT(BP_IS_GANG(bp) && zio->io_gang_leader == NULL);
+ ASSERT(zio->io_child_type > ZIO_CHILD_GANG);
+
+ zio->io_gang_leader = zio;
+
+ zio_gang_tree_assemble(zio, bp, &zio->io_gang_tree);
+
+ return (zio);
+}
+
+static zio_t *
+zio_gang_issue(zio_t *zio)
+{
+ blkptr_t *bp = zio->io_bp;
+
+ if (zio_wait_for_children(zio, ZIO_CHILD_GANG_BIT, ZIO_WAIT_DONE)) {
+ return (NULL);
+ }
+
+ ASSERT(BP_IS_GANG(bp) && zio->io_gang_leader == zio);
+ ASSERT(zio->io_child_type > ZIO_CHILD_GANG);
+
+ if (zio->io_child_error[ZIO_CHILD_GANG] == 0)
+ zio_gang_tree_issue(zio, zio->io_gang_tree, bp, zio->io_abd,
+ 0);
+ else
+ zio_gang_tree_free(&zio->io_gang_tree);
+
+ zio->io_pipeline = ZIO_INTERLOCK_PIPELINE;
+
+ return (zio);
+}
+
+static void
+zio_write_gang_member_ready(zio_t *zio)
+{
+ zio_t *pio = zio_unique_parent(zio);
+ zio_t *gio = zio->io_gang_leader;
+ dva_t *cdva = zio->io_bp->blk_dva;
+ dva_t *pdva = pio->io_bp->blk_dva;
+ uint64_t asize;
+
+ if (BP_IS_HOLE(zio->io_bp))
+ return;
+
+ ASSERT(BP_IS_HOLE(&zio->io_bp_orig));
+
+ ASSERT(zio->io_child_type == ZIO_CHILD_GANG);
+ ASSERT3U(zio->io_prop.zp_copies, ==, gio->io_prop.zp_copies);
+ ASSERT3U(zio->io_prop.zp_copies, <=, BP_GET_NDVAS(zio->io_bp));
+ ASSERT3U(pio->io_prop.zp_copies, <=, BP_GET_NDVAS(pio->io_bp));
+ ASSERT3U(BP_GET_NDVAS(zio->io_bp), <=, BP_GET_NDVAS(pio->io_bp));
+
+ mutex_enter(&pio->io_lock);
+ for (int d = 0; d < BP_GET_NDVAS(zio->io_bp); d++) {
+ ASSERT(DVA_GET_GANG(&pdva[d]));
+ asize = DVA_GET_ASIZE(&pdva[d]);
+ asize += DVA_GET_ASIZE(&cdva[d]);
+ DVA_SET_ASIZE(&pdva[d], asize);
+ }
+ mutex_exit(&pio->io_lock);
+}
+
+static void
+zio_write_gang_done(zio_t *zio)
+{
+ /*
+ * The io_abd field will be NULL for a zio with no data. The io_flags
+ * will initially have the ZIO_FLAG_NODATA bit flag set, but we can't
+ * check for it here as it is cleared in zio_ready.
+ */
+ if (zio->io_abd != NULL)
+ abd_put(zio->io_abd);
+}
+
+static zio_t *
+zio_write_gang_block(zio_t *pio)
+{
+ spa_t *spa = pio->io_spa;
+ metaslab_class_t *mc = spa_normal_class(spa);
+ blkptr_t *bp = pio->io_bp;
+ zio_t *gio = pio->io_gang_leader;
+ zio_t *zio;
+ zio_gang_node_t *gn, **gnpp;
+ zio_gbh_phys_t *gbh;
+ abd_t *gbh_abd;
+ uint64_t txg = pio->io_txg;
+ uint64_t resid = pio->io_size;
+ uint64_t lsize;
+ int copies = gio->io_prop.zp_copies;
+ int gbh_copies = MIN(copies + 1, spa_max_replication(spa));
+ zio_prop_t zp;
+ int error;
+ boolean_t has_data = !(pio->io_flags & ZIO_FLAG_NODATA);
+
+ int flags = METASLAB_HINTBP_FAVOR | METASLAB_GANG_HEADER;
+ if (pio->io_flags & ZIO_FLAG_IO_ALLOCATING) {
+ ASSERT(pio->io_priority == ZIO_PRIORITY_ASYNC_WRITE);
+ ASSERT(has_data);
+
+ flags |= METASLAB_ASYNC_ALLOC;
+ VERIFY(zfs_refcount_held(&mc->mc_alloc_slots[pio->io_allocator],
+ pio));
+
+ /*
+ * The logical zio has already placed a reservation for
+ * 'copies' allocation slots but gang blocks may require
+ * additional copies. These additional copies
+ * (i.e. gbh_copies - copies) are guaranteed to succeed
+ * since metaslab_class_throttle_reserve() always allows
+ * additional reservations for gang blocks.
+ */
+ VERIFY(metaslab_class_throttle_reserve(mc, gbh_copies - copies,
+ pio->io_allocator, pio, flags));
+ }
+
+ error = metaslab_alloc(spa, mc, SPA_GANGBLOCKSIZE,
+ bp, gbh_copies, txg, pio == gio ? NULL : gio->io_bp, flags,
+ &pio->io_alloc_list, pio, pio->io_allocator);
+ if (error) {
+ if (pio->io_flags & ZIO_FLAG_IO_ALLOCATING) {
+ ASSERT(pio->io_priority == ZIO_PRIORITY_ASYNC_WRITE);
+ ASSERT(has_data);
+
+ /*
+ * If we failed to allocate the gang block header then
+ * we remove any additional allocation reservations that
+ * we placed here. The original reservation will
+ * be removed when the logical I/O goes to the ready
+ * stage.
+ */
+ metaslab_class_throttle_unreserve(mc,
+ gbh_copies - copies, pio->io_allocator, pio);
+ }
+ pio->io_error = error;
+ return (pio);
+ }
+
+ if (pio == gio) {
+ gnpp = &gio->io_gang_tree;
+ } else {
+ gnpp = pio->io_private;
+ ASSERT(pio->io_ready == zio_write_gang_member_ready);
+ }
+
+ gn = zio_gang_node_alloc(gnpp);
+ gbh = gn->gn_gbh;
+ bzero(gbh, SPA_GANGBLOCKSIZE);
+ gbh_abd = abd_get_from_buf(gbh, SPA_GANGBLOCKSIZE);
+
+ /*
+ * Create the gang header.
+ */
+ zio = zio_rewrite(pio, spa, txg, bp, gbh_abd, SPA_GANGBLOCKSIZE,
+ zio_write_gang_done, NULL, pio->io_priority,
+ ZIO_GANG_CHILD_FLAGS(pio), &pio->io_bookmark);
+
+ /*
+ * Create and nowait the gang children.
+ */
+ for (int g = 0; resid != 0; resid -= lsize, g++) {
+ lsize = P2ROUNDUP(resid / (SPA_GBH_NBLKPTRS - g),
+ SPA_MINBLOCKSIZE);
+ ASSERT(lsize >= SPA_MINBLOCKSIZE && lsize <= resid);
+
+ zp.zp_checksum = gio->io_prop.zp_checksum;
+ zp.zp_compress = ZIO_COMPRESS_OFF;
+ zp.zp_type = DMU_OT_NONE;
+ zp.zp_level = 0;
+ zp.zp_copies = gio->io_prop.zp_copies;
+ zp.zp_dedup = B_FALSE;
+ zp.zp_dedup_verify = B_FALSE;
+ zp.zp_nopwrite = B_FALSE;
+
+ zio_t *cio = zio_write(zio, spa, txg, &gbh->zg_blkptr[g],
+ has_data ? abd_get_offset(pio->io_abd, pio->io_size -
+ resid) : NULL, lsize, lsize, &zp,
+ zio_write_gang_member_ready, NULL, NULL,
+ zio_write_gang_done, &gn->gn_child[g], pio->io_priority,
+ ZIO_GANG_CHILD_FLAGS(pio), &pio->io_bookmark);
+
+ if (pio->io_flags & ZIO_FLAG_IO_ALLOCATING) {
+ ASSERT(pio->io_priority == ZIO_PRIORITY_ASYNC_WRITE);
+ ASSERT(has_data);
+
+ /*
+ * Gang children won't throttle but we should
+ * account for their work, so reserve an allocation
+ * slot for them here.
+ */
+ VERIFY(metaslab_class_throttle_reserve(mc,
+ zp.zp_copies, cio->io_allocator, cio, flags));
+ }
+ zio_nowait(cio);
+ }
+
+ /*
+ * Set pio's pipeline to just wait for zio to finish.
+ */
+ pio->io_pipeline = ZIO_INTERLOCK_PIPELINE;
+
+ zio_nowait(zio);
+
+ return (pio);
+}
+
+/*
+ * The zio_nop_write stage in the pipeline determines if allocating a
+ * new bp is necessary. The nopwrite feature can handle writes in
+ * either syncing or open context (i.e. zil writes) and as a result is
+ * mutually exclusive with dedup.
+ *
+ * By leveraging a cryptographically secure checksum, such as SHA256, we
+ * can compare the checksums of the new data and the old to determine if
+ * allocating a new block is required. Note that our requirements for
+ * cryptographic strength are fairly weak: there can't be any accidental
+ * hash collisions, but we don't need to be secure against intentional
+ * (malicious) collisions. To trigger a nopwrite, you have to be able
+ * to write the file to begin with, and triggering an incorrect (hash
+ * collision) nopwrite is no worse than simply writing to the file.
+ * That said, there are no known attacks against the checksum algorithms
+ * used for nopwrite, assuming that the salt and the checksums
+ * themselves remain secret.
+ */
+static zio_t *
+zio_nop_write(zio_t *zio)
+{
+ blkptr_t *bp = zio->io_bp;
+ blkptr_t *bp_orig = &zio->io_bp_orig;
+ zio_prop_t *zp = &zio->io_prop;
+
+ ASSERT(BP_GET_LEVEL(bp) == 0);
+ ASSERT(!(zio->io_flags & ZIO_FLAG_IO_REWRITE));
+ ASSERT(zp->zp_nopwrite);
+ ASSERT(!zp->zp_dedup);
+ ASSERT(zio->io_bp_override == NULL);
+ ASSERT(IO_IS_ALLOCATING(zio));
+
+ /*
+ * Check to see if the original bp and the new bp have matching
+ * characteristics (i.e. same checksum, compression algorithms, etc).
+ * If they don't then just continue with the pipeline which will
+ * allocate a new bp.
+ */
+ if (BP_IS_HOLE(bp_orig) ||
+ !(zio_checksum_table[BP_GET_CHECKSUM(bp)].ci_flags &
+ ZCHECKSUM_FLAG_NOPWRITE) ||
+ BP_GET_CHECKSUM(bp) != BP_GET_CHECKSUM(bp_orig) ||
+ BP_GET_COMPRESS(bp) != BP_GET_COMPRESS(bp_orig) ||
+ BP_GET_DEDUP(bp) != BP_GET_DEDUP(bp_orig) ||
+ zp->zp_copies != BP_GET_NDVAS(bp_orig))
+ return (zio);
+
+ /*
+ * If the checksums match then reset the pipeline so that we
+ * avoid allocating a new bp and issuing any I/O.
+ */
+ if (ZIO_CHECKSUM_EQUAL(bp->blk_cksum, bp_orig->blk_cksum)) {
+ ASSERT(zio_checksum_table[zp->zp_checksum].ci_flags &
+ ZCHECKSUM_FLAG_NOPWRITE);
+ ASSERT3U(BP_GET_PSIZE(bp), ==, BP_GET_PSIZE(bp_orig));
+ ASSERT3U(BP_GET_LSIZE(bp), ==, BP_GET_LSIZE(bp_orig));
+ ASSERT(zp->zp_compress != ZIO_COMPRESS_OFF);
+ ASSERT(bcmp(&bp->blk_prop, &bp_orig->blk_prop,
+ sizeof (uint64_t)) == 0);
+
+ *bp = *bp_orig;
+ zio->io_pipeline = ZIO_INTERLOCK_PIPELINE;
+ zio->io_flags |= ZIO_FLAG_NOPWRITE;
+ }
+
+ return (zio);
+}
+
+/*
+ * ==========================================================================
+ * Dedup
+ * ==========================================================================
+ */
+static void
+zio_ddt_child_read_done(zio_t *zio)
+{
+ blkptr_t *bp = zio->io_bp;
+ ddt_entry_t *dde = zio->io_private;
+ ddt_phys_t *ddp;
+ zio_t *pio = zio_unique_parent(zio);
+
+ mutex_enter(&pio->io_lock);
+ ddp = ddt_phys_select(dde, bp);
+ if (zio->io_error == 0)
+ ddt_phys_clear(ddp); /* this ddp doesn't need repair */
+
+ if (zio->io_error == 0 && dde->dde_repair_abd == NULL)
+ dde->dde_repair_abd = zio->io_abd;
+ else
+ abd_free(zio->io_abd);
+ mutex_exit(&pio->io_lock);
+}
+
+static zio_t *
+zio_ddt_read_start(zio_t *zio)
+{
+ blkptr_t *bp = zio->io_bp;
+
+ ASSERT(BP_GET_DEDUP(bp));
+ ASSERT(BP_GET_PSIZE(bp) == zio->io_size);
+ ASSERT(zio->io_child_type == ZIO_CHILD_LOGICAL);
+
+ if (zio->io_child_error[ZIO_CHILD_DDT]) {
+ ddt_t *ddt = ddt_select(zio->io_spa, bp);
+ ddt_entry_t *dde = ddt_repair_start(ddt, bp);
+ ddt_phys_t *ddp = dde->dde_phys;
+ ddt_phys_t *ddp_self = ddt_phys_select(dde, bp);
+ blkptr_t blk;
+
+ ASSERT(zio->io_vsd == NULL);
+ zio->io_vsd = dde;
+
+ if (ddp_self == NULL)
+ return (zio);
+
+ for (int p = 0; p < DDT_PHYS_TYPES; p++, ddp++) {
+ if (ddp->ddp_phys_birth == 0 || ddp == ddp_self)
+ continue;
+ ddt_bp_create(ddt->ddt_checksum, &dde->dde_key, ddp,
+ &blk);
+ zio_nowait(zio_read(zio, zio->io_spa, &blk,
+ abd_alloc_for_io(zio->io_size, B_TRUE),
+ zio->io_size, zio_ddt_child_read_done, dde,
+ zio->io_priority, ZIO_DDT_CHILD_FLAGS(zio) |
+ ZIO_FLAG_DONT_PROPAGATE, &zio->io_bookmark));
+ }
+ return (zio);
+ }
+
+ zio_nowait(zio_read(zio, zio->io_spa, bp,
+ zio->io_abd, zio->io_size, NULL, NULL, zio->io_priority,
+ ZIO_DDT_CHILD_FLAGS(zio), &zio->io_bookmark));
+
+ return (zio);
+}
+
+static zio_t *
+zio_ddt_read_done(zio_t *zio)
+{
+ blkptr_t *bp = zio->io_bp;
+
+ if (zio_wait_for_children(zio, ZIO_CHILD_DDT_BIT, ZIO_WAIT_DONE)) {
+ return (NULL);
+ }
+
+ ASSERT(BP_GET_DEDUP(bp));
+ ASSERT(BP_GET_PSIZE(bp) == zio->io_size);
+ ASSERT(zio->io_child_type == ZIO_CHILD_LOGICAL);
+
+ if (zio->io_child_error[ZIO_CHILD_DDT]) {
+ ddt_t *ddt = ddt_select(zio->io_spa, bp);
+ ddt_entry_t *dde = zio->io_vsd;
+ if (ddt == NULL) {
+ ASSERT(spa_load_state(zio->io_spa) != SPA_LOAD_NONE);
+ return (zio);
+ }
+ if (dde == NULL) {
+ zio->io_stage = ZIO_STAGE_DDT_READ_START >> 1;
+ zio_taskq_dispatch(zio, ZIO_TASKQ_ISSUE, B_FALSE);
+ return (NULL);
+ }
+ if (dde->dde_repair_abd != NULL) {
+ abd_copy(zio->io_abd, dde->dde_repair_abd,
+ zio->io_size);
+ zio->io_child_error[ZIO_CHILD_DDT] = 0;
+ }
+ ddt_repair_done(ddt, dde);
+ zio->io_vsd = NULL;
+ }
+
+ ASSERT(zio->io_vsd == NULL);
+
+ return (zio);
+}
+
+static boolean_t
+zio_ddt_collision(zio_t *zio, ddt_t *ddt, ddt_entry_t *dde)
+{
+ spa_t *spa = zio->io_spa;
+ boolean_t do_raw = (zio->io_flags & ZIO_FLAG_RAW);
+
+ /* We should never get a raw, override zio */
+ ASSERT(!(zio->io_bp_override && do_raw));
+
+ /*
+ * Note: we compare the original data, not the transformed data,
+ * because when zio->io_bp is an override bp, we will not have
+ * pushed the I/O transforms. That's an important optimization
+ * because otherwise we'd compress/encrypt all dmu_sync() data twice.
+ */
+ for (int p = DDT_PHYS_SINGLE; p <= DDT_PHYS_TRIPLE; p++) {
+ zio_t *lio = dde->dde_lead_zio[p];
+
+ if (lio != NULL) {
+ return (lio->io_orig_size != zio->io_orig_size ||
+ abd_cmp(zio->io_orig_abd, lio->io_orig_abd,
+ zio->io_orig_size) != 0);
+ }
+ }
+
+ for (int p = DDT_PHYS_SINGLE; p <= DDT_PHYS_TRIPLE; p++) {
+ ddt_phys_t *ddp = &dde->dde_phys[p];
+
+ if (ddp->ddp_phys_birth != 0) {
+ arc_buf_t *abuf = NULL;
+ arc_flags_t aflags = ARC_FLAG_WAIT;
+ int zio_flags = ZIO_FLAG_CANFAIL | ZIO_FLAG_SPECULATIVE;
+ blkptr_t blk = *zio->io_bp;
+ int error;
+
+ ddt_bp_fill(ddp, &blk, ddp->ddp_phys_birth);
+
+ ddt_exit(ddt);
+
+ /*
+ * Intuitively, it would make more sense to compare
+ * io_abd than io_orig_abd in the raw case since you
+ * don't want to look at any transformations that have
+ * happened to the data. However, for raw I/Os the
+ * data will actually be the same in io_abd and
+ * io_orig_abd, so all we have to do is issue this as
+ * a raw ARC read.
+ */
+ if (do_raw) {
+ zio_flags |= ZIO_FLAG_RAW;
+ ASSERT3U(zio->io_size, ==, zio->io_orig_size);
+ ASSERT0(abd_cmp(zio->io_abd, zio->io_orig_abd,
+ zio->io_size));
+ ASSERT3P(zio->io_transform_stack, ==, NULL);
+ }
+
+ error = arc_read(NULL, spa, &blk,
+ arc_getbuf_func, &abuf, ZIO_PRIORITY_SYNC_READ,
+ zio_flags, &aflags, &zio->io_bookmark);
+
+ if (error == 0) {
+ if (arc_buf_size(abuf) != zio->io_orig_size ||
+ abd_cmp_buf(zio->io_orig_abd, abuf->b_data,
+ zio->io_orig_size) != 0)
+ error = SET_ERROR(EEXIST);
+ arc_buf_destroy(abuf, &abuf);
+ }
+
+ ddt_enter(ddt);
+ return (error != 0);
+ }
+ }
+
+ return (B_FALSE);
+}
+
+static void
+zio_ddt_child_write_ready(zio_t *zio)
+{
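+ /*
+ * zp_copies doubles as the ddt_phys index: one copy maps to
+ * DDT_PHYS_SINGLE, two to DDT_PHYS_DOUBLE, and so on.
+ */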
+ int p = zio->io_prop.zp_copies;
+ ddt_t *ddt = ddt_select(zio->io_spa, zio->io_bp);
+ ddt_entry_t *dde = zio->io_private;
+ ddt_phys_t *ddp = &dde->dde_phys[p];
+ zio_t *pio;
+
+ if (zio->io_error)
+ return;
+
+ ddt_enter(ddt);
+
+ ASSERT(dde->dde_lead_zio[p] == zio);
+
+ ddt_phys_fill(ddp, zio->io_bp);
+
+ zio_link_t *zl = NULL;
+ while ((pio = zio_walk_parents(zio, &zl)) != NULL)
+ ddt_bp_fill(ddp, pio->io_bp, zio->io_txg);
+
+ ddt_exit(ddt);
+}
+
+static void
+zio_ddt_child_write_done(zio_t *zio)
+{
+ int p = zio->io_prop.zp_copies;
+ ddt_t *ddt = ddt_select(zio->io_spa, zio->io_bp);
+ ddt_entry_t *dde = zio->io_private;
+ ddt_phys_t *ddp = &dde->dde_phys[p];
+
+ ddt_enter(ddt);
+
+ ASSERT(ddp->ddp_refcnt == 0);
+ ASSERT(dde->dde_lead_zio[p] == zio);
+ dde->dde_lead_zio[p] = NULL;
+
+ if (zio->io_error == 0) {
+ zio_link_t *zl = NULL;
+ while (zio_walk_parents(zio, &zl) != NULL)
+ ddt_phys_addref(ddp);
+ } else {
+ ddt_phys_clear(ddp);
+ }
+
+ ddt_exit(ddt);
+}
+
+static void
+zio_ddt_ditto_write_done(zio_t *zio)
+{
+ int p = DDT_PHYS_DITTO;
+ zio_prop_t *zp = &zio->io_prop;
+ blkptr_t *bp = zio->io_bp;
+ ddt_t *ddt = ddt_select(zio->io_spa, bp);
+ ddt_entry_t *dde = zio->io_private;
+ ddt_phys_t *ddp = &dde->dde_phys[p];
+ ddt_key_t *ddk = &dde->dde_key;
+
+ ddt_enter(ddt);
+
+ ASSERT(ddp->ddp_refcnt == 0);
+ ASSERT(dde->dde_lead_zio[p] == zio);
+ dde->dde_lead_zio[p] = NULL;
+
+ if (zio->io_error == 0) {
+ ASSERT(ZIO_CHECKSUM_EQUAL(bp->blk_cksum, ddk->ddk_cksum));
+ ASSERT(zp->zp_copies < SPA_DVAS_PER_BP);
+ ASSERT(zp->zp_copies == BP_GET_NDVAS(bp) - BP_IS_GANG(bp));
+ if (ddp->ddp_phys_birth != 0)
+ ddt_phys_free(ddt, ddk, ddp, zio->io_txg);
+ ddt_phys_fill(ddp, bp);
+ }
+
+ ddt_exit(ddt);
+}
+
+static zio_t *
+zio_ddt_write(zio_t *zio)
+{
+ spa_t *spa = zio->io_spa;
+ blkptr_t *bp = zio->io_bp;
+ uint64_t txg = zio->io_txg;
+ zio_prop_t *zp = &zio->io_prop;
+ int p = zp->zp_copies;
+ int ditto_copies;
+ zio_t *cio = NULL;
+ zio_t *dio = NULL;
+ ddt_t *ddt = ddt_select(spa, bp);
+ ddt_entry_t *dde;
+ ddt_phys_t *ddp;
+
+ ASSERT(BP_GET_DEDUP(bp));
+ ASSERT(BP_GET_CHECKSUM(bp) == zp->zp_checksum);
+ ASSERT(BP_IS_HOLE(bp) || zio->io_bp_override);
+ ASSERT(!(zio->io_bp_override && (zio->io_flags & ZIO_FLAG_RAW)));
+
+ ddt_enter(ddt);
+ dde = ddt_lookup(ddt, bp, B_TRUE);
+ ddp = &dde->dde_phys[p];
+
+ if (zp->zp_dedup_verify && zio_ddt_collision(zio, ddt, dde)) {
+ /*
+ * If we're using a weak checksum, upgrade to a strong checksum
+ * and try again. If we're already using a strong checksum,
+ * we can't resolve it, so just convert to an ordinary write.
+ * (And automatically e-mail a paper to Nature?)
+ */
+ if (!(zio_checksum_table[zp->zp_checksum].ci_flags &
+ ZCHECKSUM_FLAG_DEDUP)) {
+ zp->zp_checksum = spa_dedup_checksum(spa);
+ zio_pop_transforms(zio);
+ zio->io_stage = ZIO_STAGE_OPEN;
+ BP_ZERO(bp);
+ } else {
+ zp->zp_dedup = B_FALSE;
+ BP_SET_DEDUP(bp, B_FALSE);
+ }
+ ASSERT(!BP_GET_DEDUP(bp));
+ zio->io_pipeline = ZIO_WRITE_PIPELINE;
+ ddt_exit(ddt);
+ return (zio);
+ }
+
+ ditto_copies = ddt_ditto_copies_needed(ddt, dde, ddp);
+ ASSERT(ditto_copies < SPA_DVAS_PER_BP);
+
+ if (ditto_copies > ddt_ditto_copies_present(dde) &&
+ dde->dde_lead_zio[DDT_PHYS_DITTO] == NULL) {
+ zio_prop_t czp = *zp;
+
+ czp.zp_copies = ditto_copies;
+
+ /*
+ * If we arrived here with an override bp, we won't have run
+ * the transform stack, so we won't have the data we need to
+ * generate a child i/o. So, toss the override bp and restart.
+ * This is safe, because using the override bp is just an
+ * optimization; and it's rare, so the cost doesn't matter.
+ */
+ if (zio->io_bp_override) {
+ zio_pop_transforms(zio);
+ zio->io_stage = ZIO_STAGE_OPEN;
+ zio->io_pipeline = ZIO_WRITE_PIPELINE;
+ zio->io_bp_override = NULL;
+ BP_ZERO(bp);
+ ddt_exit(ddt);
+ return (zio);
+ }
+
+ dio = zio_write(zio, spa, txg, bp, zio->io_orig_abd,
+ zio->io_orig_size, zio->io_orig_size, &czp, NULL, NULL,
+ NULL, zio_ddt_ditto_write_done, dde, zio->io_priority,
+ ZIO_DDT_CHILD_FLAGS(zio), &zio->io_bookmark);
+
+ zio_push_transform(dio, zio->io_abd, zio->io_size, 0, NULL);
+ dde->dde_lead_zio[DDT_PHYS_DITTO] = dio;
+ }
+
+ if (ddp->ddp_phys_birth != 0 || dde->dde_lead_zio[p] != NULL) {
+ if (ddp->ddp_phys_birth != 0)
+ ddt_bp_fill(ddp, bp, txg);
+ if (dde->dde_lead_zio[p] != NULL)
+ zio_add_child(zio, dde->dde_lead_zio[p]);
+ else
+ ddt_phys_addref(ddp);
+ } else if (zio->io_bp_override) {
+ ASSERT(bp->blk_birth == txg);
+ ASSERT(BP_EQUAL(bp, zio->io_bp_override));
+ ddt_phys_fill(ddp, bp);
+ ddt_phys_addref(ddp);
+ } else {
+ cio = zio_write(zio, spa, txg, bp, zio->io_orig_abd,
+ zio->io_orig_size, zio->io_orig_size, zp,
+ zio_ddt_child_write_ready, NULL, NULL,
+ zio_ddt_child_write_done, dde, zio->io_priority,
+ ZIO_DDT_CHILD_FLAGS(zio), &zio->io_bookmark);
+
+ zio_push_transform(cio, zio->io_abd, zio->io_size, 0, NULL);
+ dde->dde_lead_zio[p] = cio;
+ }
+
+ ddt_exit(ddt);
+
+ if (cio)
+ zio_nowait(cio);
+ if (dio)
+ zio_nowait(dio);
+
+ return (zio);
+}
+
+ddt_entry_t *freedde; /* for debugging */
+
+static zio_t *
+zio_ddt_free(zio_t *zio)
+{
+ spa_t *spa = zio->io_spa;
+ blkptr_t *bp = zio->io_bp;
+ ddt_t *ddt = ddt_select(spa, bp);
+ ddt_entry_t *dde;
+ ddt_phys_t *ddp;
+
+ ASSERT(BP_GET_DEDUP(bp));
+ ASSERT(zio->io_child_type == ZIO_CHILD_LOGICAL);
+
+ ddt_enter(ddt);
+ freedde = dde = ddt_lookup(ddt, bp, B_TRUE);
+ if (dde) {
+ ddp = ddt_phys_select(dde, bp);
+ if (ddp)
+ ddt_phys_decref(ddp);
+ }
+ ddt_exit(ddt);
+
+ return (zio);
+}
+
+/*
+ * ==========================================================================
+ * Allocate and free blocks
+ * ==========================================================================
+ */
+
+static zio_t *
+zio_io_to_allocate(spa_t *spa, int allocator)
+{
+ zio_t *zio;
+
+ ASSERT(MUTEX_HELD(&spa->spa_alloc_locks[allocator]));
+
+ zio = avl_first(&spa->spa_alloc_trees[allocator]);
+ if (zio == NULL)
+ return (NULL);
+
+ ASSERT(IO_IS_ALLOCATING(zio));
+
+ /*
+ * Try to place a reservation for this zio. If we're unable to
+ * reserve then we throttle.
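+ * The zio stays in the allocation tree in that case; it will be
+ * retried by zio_allocate_dispatch() when a throttled I/O
+ * completes and releases its reservation.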
+ */
+ ASSERT3U(zio->io_allocator, ==, allocator);
+ if (!metaslab_class_throttle_reserve(zio->io_metaslab_class,
+ zio->io_prop.zp_copies, zio->io_allocator, zio, 0)) {
+ return (NULL);
+ }
+
+ avl_remove(&spa->spa_alloc_trees[allocator], zio);
+ ASSERT3U(zio->io_stage, <, ZIO_STAGE_DVA_ALLOCATE);
+
+ return (zio);
+}
+
+static zio_t *
+zio_dva_throttle(zio_t *zio)
+{
+ spa_t *spa = zio->io_spa;
+ zio_t *nio;
+ metaslab_class_t *mc;
+
+ /* locate an appropriate allocation class */
+ mc = spa_preferred_class(spa, zio->io_size, zio->io_prop.zp_type,
+ zio->io_prop.zp_level, zio->io_prop.zp_zpl_smallblk);
+
+ if (zio->io_priority == ZIO_PRIORITY_SYNC_WRITE ||
+ !mc->mc_alloc_throttle_enabled ||
+ zio->io_child_type == ZIO_CHILD_GANG ||
+ zio->io_flags & ZIO_FLAG_NODATA) {
+ return (zio);
+ }
+
+ ASSERT(zio->io_child_type > ZIO_CHILD_GANG);
+
+ ASSERT3U(zio->io_queued_timestamp, >, 0);
+ ASSERT(zio->io_stage == ZIO_STAGE_DVA_THROTTLE);
+
+ zbookmark_phys_t *bm = &zio->io_bookmark;
+ /*
+ * We want to try to use as many allocators as possible to help improve
+ * performance, but we also want logically adjacent IOs to be physically
+ * adjacent to improve sequential read performance. We chunk each object
+ * into 2^20 block regions, and then hash based on the objset, object,
+ * level, and region to accomplish both of these goals.
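+ * For example, every block in the first 2^20 blkids of an object
+ * maps to the same allocator, while a different region of the
+ * same object may hash elsewhere.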
+ */
+ zio->io_allocator = cityhash4(bm->zb_objset, bm->zb_object,
+ bm->zb_level, bm->zb_blkid >> 20) % spa->spa_alloc_count;
+ mutex_enter(&spa->spa_alloc_locks[zio->io_allocator]);
+ ASSERT(zio->io_type == ZIO_TYPE_WRITE);
+ zio->io_metaslab_class = mc;
+ avl_add(&spa->spa_alloc_trees[zio->io_allocator], zio);
+ nio = zio_io_to_allocate(spa, zio->io_allocator);
+ mutex_exit(&spa->spa_alloc_locks[zio->io_allocator]);
+
+ return (nio);
+}
+
+static void
+zio_allocate_dispatch(spa_t *spa, int allocator)
+{
+ zio_t *zio;
+
+ mutex_enter(&spa->spa_alloc_locks[allocator]);
+ zio = zio_io_to_allocate(spa, allocator);
+ mutex_exit(&spa->spa_alloc_locks[allocator]);
+ if (zio == NULL)
+ return;
+
+ ASSERT3U(zio->io_stage, ==, ZIO_STAGE_DVA_THROTTLE);
+ ASSERT0(zio->io_error);
+ zio_taskq_dispatch(zio, ZIO_TASKQ_ISSUE, B_TRUE);
+}
+
+static zio_t *
+zio_dva_allocate(zio_t *zio)
+{
+ spa_t *spa = zio->io_spa;
+ metaslab_class_t *mc;
+ blkptr_t *bp = zio->io_bp;
+ int error;
+ int flags = 0;
+
+ if (zio->io_gang_leader == NULL) {
+ ASSERT(zio->io_child_type > ZIO_CHILD_GANG);
+ zio->io_gang_leader = zio;
+ }
+
+ ASSERT(BP_IS_HOLE(bp));
+ ASSERT0(BP_GET_NDVAS(bp));
+ ASSERT3U(zio->io_prop.zp_copies, >, 0);
+ ASSERT3U(zio->io_prop.zp_copies, <=, spa_max_replication(spa));
+ ASSERT3U(zio->io_size, ==, BP_GET_PSIZE(bp));
+
+ if (zio->io_flags & ZIO_FLAG_NODATA)
+ flags |= METASLAB_DONT_THROTTLE;
+ if (zio->io_flags & ZIO_FLAG_GANG_CHILD)
+ flags |= METASLAB_GANG_CHILD;
+ if (zio->io_priority == ZIO_PRIORITY_ASYNC_WRITE)
+ flags |= METASLAB_ASYNC_ALLOC;
+
+ /*
+ * if not already chosen, locate an appropriate allocation class
+ */
+ mc = zio->io_metaslab_class;
+ if (mc == NULL) {
+ mc = spa_preferred_class(spa, zio->io_size,
+ zio->io_prop.zp_type, zio->io_prop.zp_level,
+ zio->io_prop.zp_zpl_smallblk);
+ zio->io_metaslab_class = mc;
+ }
+
+ error = metaslab_alloc(spa, mc, zio->io_size, bp,
+ zio->io_prop.zp_copies, zio->io_txg, NULL, flags,
+ &zio->io_alloc_list, zio, zio->io_allocator);
+
+	/*
+	 * Fall back to the normal class when an alloc class is full.
+	 */
+ if (error == ENOSPC && mc != spa_normal_class(spa)) {
+ /*
+ * If throttling, transfer reservation over to normal class.
+ * The io_allocator slot can remain the same even though we
+ * are switching classes.
+ */
+ if (mc->mc_alloc_throttle_enabled &&
+ (zio->io_flags & ZIO_FLAG_IO_ALLOCATING)) {
+ metaslab_class_throttle_unreserve(mc,
+ zio->io_prop.zp_copies, zio->io_allocator, zio);
+ zio->io_flags &= ~ZIO_FLAG_IO_ALLOCATING;
+
+ mc = spa_normal_class(spa);
+ VERIFY(metaslab_class_throttle_reserve(mc,
+ zio->io_prop.zp_copies, zio->io_allocator, zio,
+ flags | METASLAB_MUST_RESERVE));
+ } else {
+ mc = spa_normal_class(spa);
+ }
+ zio->io_metaslab_class = mc;
+
+ error = metaslab_alloc(spa, mc, zio->io_size, bp,
+ zio->io_prop.zp_copies, zio->io_txg, NULL, flags,
+ &zio->io_alloc_list, zio, zio->io_allocator);
+ }
+
+ if (error != 0) {
+ zfs_dbgmsg("%s: metaslab allocation failure: zio %p, "
+ "size %llu, error %d", spa_name(spa), zio, zio->io_size,
+ error);
+ if (error == ENOSPC && zio->io_size > SPA_MINBLOCKSIZE)
+ return (zio_write_gang_block(zio));
+ zio->io_error = error;
+ }
+
+ return (zio);
+}
+
+static zio_t *
+zio_dva_free(zio_t *zio)
+{
+ metaslab_free(zio->io_spa, zio->io_bp, zio->io_txg, B_FALSE);
+
+ return (zio);
+}
+
+static zio_t *
+zio_dva_claim(zio_t *zio)
+{
+ int error;
+
+ error = metaslab_claim(zio->io_spa, zio->io_bp, zio->io_txg);
+ if (error)
+ zio->io_error = error;
+
+ return (zio);
+}
+
+/*
+ * Undo an allocation. This is used by zio_done() when an I/O fails
+ * and we want to give back the block we just allocated.
+ * This handles both normal blocks and gang blocks.
+ */
+static void
+zio_dva_unallocate(zio_t *zio, zio_gang_node_t *gn, blkptr_t *bp)
+{
+ ASSERT(bp->blk_birth == zio->io_txg || BP_IS_HOLE(bp));
+ ASSERT(zio->io_bp_override == NULL);
+
+ if (!BP_IS_HOLE(bp))
+ metaslab_free(zio->io_spa, bp, bp->blk_birth, B_TRUE);
+
+ if (gn != NULL) {
+ for (int g = 0; g < SPA_GBH_NBLKPTRS; g++) {
+ zio_dva_unallocate(zio, gn->gn_child[g],
+ &gn->gn_gbh->zg_blkptr[g]);
+ }
+ }
+}
+
+/*
+ * Try to allocate an intent log block. Return 0 on success, errno on failure.
+ */
+int
+zio_alloc_zil(spa_t *spa, uint64_t objset, uint64_t txg, blkptr_t *new_bp,
+ blkptr_t *old_bp, uint64_t size, boolean_t *slog)
+{
+ int error = 1;
+ zio_alloc_list_t io_alloc_list;
+
+ ASSERT(txg > spa_syncing_txg(spa));
+
+ metaslab_trace_init(&io_alloc_list);
+
+ /*
+ * Block pointer fields are useful to metaslabs for stats and debugging.
+ * Fill in the obvious ones before calling into metaslab_alloc().
+ */
+ BP_SET_TYPE(new_bp, DMU_OT_INTENT_LOG);
+ BP_SET_PSIZE(new_bp, size);
+ BP_SET_LEVEL(new_bp, 0);
+
+ /*
+ * When allocating a zil block, we don't have information about
+ * the final destination of the block except the objset it's part
+ * of, so we just hash the objset ID to pick the allocator to get
+ * some parallelism.
+ */
+ error = metaslab_alloc(spa, spa_log_class(spa), size, new_bp, 1,
+ txg, old_bp, METASLAB_HINTBP_AVOID, &io_alloc_list, NULL,
+ cityhash4(0, 0, 0, objset) % spa->spa_alloc_count);
+ if (error == 0) {
+ *slog = TRUE;
+ } else {
+ error = metaslab_alloc(spa, spa_normal_class(spa), size,
+ new_bp, 1, txg, old_bp, METASLAB_HINTBP_AVOID,
+ &io_alloc_list, NULL, cityhash4(0, 0, 0, objset) %
+ spa->spa_alloc_count);
+ if (error == 0)
+ *slog = FALSE;
+ }
+ metaslab_trace_fini(&io_alloc_list);
+
+ if (error == 0) {
+ BP_SET_LSIZE(new_bp, size);
+ BP_SET_PSIZE(new_bp, size);
+ BP_SET_COMPRESS(new_bp, ZIO_COMPRESS_OFF);
+ BP_SET_CHECKSUM(new_bp,
+ spa_version(spa) >= SPA_VERSION_SLIM_ZIL
+ ? ZIO_CHECKSUM_ZILOG2 : ZIO_CHECKSUM_ZILOG);
+ BP_SET_TYPE(new_bp, DMU_OT_INTENT_LOG);
+ BP_SET_LEVEL(new_bp, 0);
+ BP_SET_DEDUP(new_bp, 0);
+ BP_SET_BYTEORDER(new_bp, ZFS_HOST_BYTEORDER);
+ } else {
+ zfs_dbgmsg("%s: zil block allocation failure: "
+ "size %llu, error %d", spa_name(spa), size, error);
+ }
+
+ return (error);
+}
+
+/*
+ * ==========================================================================
+ * Read, write and delete to physical devices
+ * ==========================================================================
+ */
+
+
+/*
+ * Issue an I/O to the underlying vdev. Typically the issue pipeline
+ * stops after this stage and will resume upon I/O completion.
+ * However, there are instances where the vdev layer may need to
+ * continue the pipeline when an I/O was not issued. Since the I/O
+ * that was sent to the vdev layer might be different from the one
+ * currently active in the pipeline (see vdev_queue_io()), we explicitly
+ * force the underlying vdev layers to call either zio_execute() or
+ * zio_interrupt() to ensure that the pipeline continues with the correct I/O.
+ */
+static zio_t *
+zio_vdev_io_start(zio_t *zio)
+{
+ vdev_t *vd = zio->io_vd;
+ uint64_t align;
+ spa_t *spa = zio->io_spa;
+ int ret;
+
+ ASSERT(zio->io_error == 0);
+ ASSERT(zio->io_child_error[ZIO_CHILD_VDEV] == 0);
+
+ if (vd == NULL) {
+ if (!(zio->io_flags & ZIO_FLAG_CONFIG_WRITER))
+ spa_config_enter(spa, SCL_ZIO, zio, RW_READER);
+
+ /*
+ * The mirror_ops handle multiple DVAs in a single BP.
+ */
+ vdev_mirror_ops.vdev_op_io_start(zio);
+ return (NULL);
+ }
+
+ if (vd->vdev_ops->vdev_op_leaf && zio->io_type == ZIO_TYPE_FREE &&
+ zio->io_priority == ZIO_PRIORITY_NOW) {
+ trim_map_free(vd, zio->io_offset, zio->io_size, zio->io_txg);
+ return (zio);
+ }
+
+ ASSERT3P(zio->io_logical, !=, zio);
+ if (zio->io_type == ZIO_TYPE_WRITE) {
+ ASSERT(spa->spa_trust_config);
+
+ if (zio->io_vd->vdev_removing) {
+ /*
+ * Note: the code can handle other kinds of writes,
+ * but we don't expect them.
+ */
+ ASSERT(zio->io_flags &
+ (ZIO_FLAG_PHYSICAL | ZIO_FLAG_SELF_HEAL |
+ ZIO_FLAG_RESILVER | ZIO_FLAG_INDUCE_DAMAGE));
+ }
+ }
+
+ /*
+ * We keep track of time-sensitive I/Os so that the scan thread
+ * can quickly react to certain workloads. In particular, we care
+ * about non-scrubbing, top-level reads and writes with the following
+ * characteristics:
+ * - synchronous writes of user data to non-slog devices
+ * - any reads of user data
+ * When these conditions are met, adjust the timestamp of spa_last_io
+ * which allows the scan thread to adjust its workload accordingly.
+ */
+ if (!(zio->io_flags & ZIO_FLAG_SCAN_THREAD) && zio->io_bp != NULL &&
+ vd == vd->vdev_top && !vd->vdev_islog &&
+ zio->io_bookmark.zb_objset != DMU_META_OBJSET &&
+ zio->io_txg != spa_syncing_txg(spa)) {
+ uint64_t old = spa->spa_last_io;
+ uint64_t new = ddi_get_lbolt64();
+ if (old != new)
+ (void) atomic_cas_64(&spa->spa_last_io, old, new);
+ }
+ align = 1ULL << vd->vdev_top->vdev_ashift;
+
+ if (!(zio->io_flags & ZIO_FLAG_PHYSICAL) &&
+ P2PHASE(zio->io_size, align) != 0) {
+ /* Transform logical writes to be a full physical block size. */
+ uint64_t asize = P2ROUNDUP(zio->io_size, align);
+ abd_t *abuf = NULL;
+ if (zio->io_type == ZIO_TYPE_READ ||
+ zio->io_type == ZIO_TYPE_WRITE)
+ abuf = abd_alloc_sametype(zio->io_abd, asize);
+ ASSERT(vd == vd->vdev_top);
+ if (zio->io_type == ZIO_TYPE_WRITE) {
+ abd_copy(abuf, zio->io_abd, zio->io_size);
+ abd_zero_off(abuf, zio->io_size, asize - zio->io_size);
+ }
+ zio_push_transform(zio, abuf, asize, abuf ? asize : 0,
+ zio_subblock);
+ }
+
+ /*
+ * If this is not a physical io, make sure that it is properly aligned
+ * before proceeding.
+ */
+ if (!(zio->io_flags & ZIO_FLAG_PHYSICAL)) {
+ ASSERT0(P2PHASE(zio->io_offset, align));
+ ASSERT0(P2PHASE(zio->io_size, align));
+ } else {
+ /*
+ * For physical I/Os we allow alignment to the
+ * logical block size.
+ */
+ uint64_t log_align =
+ 1ULL << vd->vdev_top->vdev_logical_ashift;
+ ASSERT0(P2PHASE(zio->io_offset, log_align));
+ ASSERT0(P2PHASE(zio->io_size, log_align));
+ }
+
+ VERIFY(zio->io_type == ZIO_TYPE_READ || spa_writeable(spa));
+
+ /*
+ * If this is a repair I/O, and there's no self-healing involved --
+ * that is, we're just resilvering what we expect to resilver --
+ * then don't do the I/O unless zio's txg is actually in vd's DTL.
+ * This prevents spurious resilvering.
+ *
+ * There are a few ways that we can end up creating these spurious
+ * resilver i/os:
+ *
+ * 1. A resilver i/o will be issued if any DVA in the BP has a
+ * dirty DTL. The mirror code will issue resilver writes to
+ * each DVA, including the one(s) that are not on vdevs with dirty
+ * DTLs.
+ *
+ * 2. With nested replication, which happens when we have a
+ * "replacing" or "spare" vdev that's a child of a mirror or raidz.
+ * For example, given mirror(replacing(A+B), C), it's likely that
+ * only A is out of date (it's the new device). In this case, we'll
+ * read from C, then use the data to resilver A+B -- but we don't
+ * actually want to resilver B, just A. The top-level mirror has no
+ * way to know this, so instead we just discard unnecessary repairs
+ * as we work our way down the vdev tree.
+ *
+ * 3. ZTEST also creates mirrors of mirrors, mirrors of raidz, etc.
+ * The same logic applies to any form of nested replication: ditto
+ * + mirror, RAID-Z + replacing, etc.
+ *
+ * However, indirect vdevs point off to other vdevs which may have
+ * DTL's, so we never bypass them. The child i/os on concrete vdevs
+ * will be properly bypassed instead.
+ */
+ if ((zio->io_flags & ZIO_FLAG_IO_REPAIR) &&
+ !(zio->io_flags & ZIO_FLAG_SELF_HEAL) &&
+ zio->io_txg != 0 && /* not a delegated i/o */
+ vd->vdev_ops != &vdev_indirect_ops &&
+ !vdev_dtl_contains(vd, DTL_PARTIAL, zio->io_txg, 1)) {
+ ASSERT(zio->io_type == ZIO_TYPE_WRITE);
+ zio_vdev_io_bypass(zio);
+ return (zio);
+ }
+
+ if (vd->vdev_ops->vdev_op_leaf) {
+ switch (zio->io_type) {
+ case ZIO_TYPE_READ:
+ if (vdev_cache_read(zio))
+ return (zio);
+ /* FALLTHROUGH */
+ case ZIO_TYPE_WRITE:
+ case ZIO_TYPE_FREE:
+ if ((zio = vdev_queue_io(zio)) == NULL)
+ return (NULL);
+
+ if (!vdev_accessible(vd, zio)) {
+ zio->io_error = SET_ERROR(ENXIO);
+ zio_interrupt(zio);
+ return (NULL);
+ }
+ break;
+ }
+ /*
+ * Note that we ignore repair writes for TRIM because they can
+ * conflict with normal writes. This isn't an issue because, by
+ * definition, we only repair blocks that aren't freed.
+ */
+ if (zio->io_type == ZIO_TYPE_WRITE &&
+ !(zio->io_flags & ZIO_FLAG_IO_REPAIR) &&
+ !trim_map_write_start(zio))
+ return (NULL);
+ }
+
+ vd->vdev_ops->vdev_op_io_start(zio);
+ return (NULL);
+}
+
+static zio_t *
+zio_vdev_io_done(zio_t *zio)
+{
+ vdev_t *vd = zio->io_vd;
+ vdev_ops_t *ops = vd ? vd->vdev_ops : &vdev_mirror_ops;
+ boolean_t unexpected_error = B_FALSE;
+
+ if (zio_wait_for_children(zio, ZIO_CHILD_VDEV_BIT, ZIO_WAIT_DONE)) {
+ return (NULL);
+ }
+
+ ASSERT(zio->io_type == ZIO_TYPE_READ ||
+ zio->io_type == ZIO_TYPE_WRITE || zio->io_type == ZIO_TYPE_FREE);
+
+ if (vd != NULL && vd->vdev_ops->vdev_op_leaf &&
+ (zio->io_type == ZIO_TYPE_READ || zio->io_type == ZIO_TYPE_WRITE ||
+ zio->io_type == ZIO_TYPE_FREE)) {
+
+ if (zio->io_type == ZIO_TYPE_WRITE &&
+ !(zio->io_flags & ZIO_FLAG_IO_REPAIR))
+ trim_map_write_done(zio);
+
+ vdev_queue_io_done(zio);
+
+ if (zio->io_type == ZIO_TYPE_WRITE)
+ vdev_cache_write(zio);
+
+ if (zio_injection_enabled && zio->io_error == 0)
+ zio->io_error = zio_handle_device_injection(vd,
+ zio, EIO);
+
+ if (zio_injection_enabled && zio->io_error == 0)
+ zio->io_error = zio_handle_label_injection(zio, EIO);
+
+ if (zio->io_error) {
+ if (zio->io_error == ENOTSUP &&
+ zio->io_type == ZIO_TYPE_FREE) {
+ /* Not all devices support TRIM. */
+ } else if (!vdev_accessible(vd, zio)) {
+ zio->io_error = SET_ERROR(ENXIO);
+ } else {
+ unexpected_error = B_TRUE;
+ }
+ }
+ }
+
+ ops->vdev_op_io_done(zio);
+
+ if (unexpected_error)
+ VERIFY(vdev_probe(vd, zio) == NULL);
+
+ return (zio);
+}
+
+/*
+ * This function is used to change the priority of an existing zio that is
+ * currently in-flight. This is used by the arc to upgrade priority in the
+ * event that a demand read is made for a block that is currently queued
+ * as a scrub or async read IO. Otherwise, the high priority read request
+ * would end up having to wait for the lower priority IO.
+ */
+void
+zio_change_priority(zio_t *pio, zio_priority_t priority)
+{
+ zio_t *cio, *cio_next;
+ zio_link_t *zl = NULL;
+
+ ASSERT3U(priority, <, ZIO_PRIORITY_NUM_QUEUEABLE);
+
+ if (pio->io_vd != NULL && pio->io_vd->vdev_ops->vdev_op_leaf) {
+ vdev_queue_change_io_priority(pio, priority);
+ } else {
+ pio->io_priority = priority;
+ }
+
+ mutex_enter(&pio->io_lock);
+ for (cio = zio_walk_children(pio, &zl); cio != NULL; cio = cio_next) {
+ cio_next = zio_walk_children(pio, &zl);
+ zio_change_priority(cio, priority);
+ }
+ mutex_exit(&pio->io_lock);
+}
+
+/*
+ * For non-raidz ZIOs, we can just copy aside the bad data read from the
+ * disk, and use that to finish the checksum ereport later.
+ */
+static void
+zio_vsd_default_cksum_finish(zio_cksum_report_t *zcr,
+ const void *good_buf)
+{
+ /* no processing needed */
+ zfs_ereport_finish_checksum(zcr, good_buf, zcr->zcr_cbdata, B_FALSE);
+}
+
+/*ARGSUSED*/
+void
+zio_vsd_default_cksum_report(zio_t *zio, zio_cksum_report_t *zcr, void *ignored)
+{
+ void *buf = zio_buf_alloc(zio->io_size);
+
+ abd_copy_to_buf(buf, zio->io_abd, zio->io_size);
+
+ zcr->zcr_cbinfo = zio->io_size;
+ zcr->zcr_cbdata = buf;
+ zcr->zcr_finish = zio_vsd_default_cksum_finish;
+ zcr->zcr_free = zio_buf_free;
+}
+
+static zio_t *
+zio_vdev_io_assess(zio_t *zio)
+{
+ vdev_t *vd = zio->io_vd;
+
+ if (zio_wait_for_children(zio, ZIO_CHILD_VDEV_BIT, ZIO_WAIT_DONE)) {
+ return (NULL);
+ }
+
+ if (vd == NULL && !(zio->io_flags & ZIO_FLAG_CONFIG_WRITER))
+ spa_config_exit(zio->io_spa, SCL_ZIO, zio);
+
+ if (zio->io_vsd != NULL) {
+ zio->io_vsd_ops->vsd_free(zio);
+ zio->io_vsd = NULL;
+ }
+
+ if (zio_injection_enabled && zio->io_error == 0)
+ zio->io_error = zio_handle_fault_injection(zio, EIO);
+
+ if (zio->io_type == ZIO_TYPE_FREE &&
+ zio->io_priority != ZIO_PRIORITY_NOW) {
+ switch (zio->io_error) {
+ case 0:
+ ZIO_TRIM_STAT_INCR(bytes, zio->io_size);
+ ZIO_TRIM_STAT_BUMP(success);
+ break;
+ case EOPNOTSUPP:
+ ZIO_TRIM_STAT_BUMP(unsupported);
+ break;
+ default:
+ ZIO_TRIM_STAT_BUMP(failed);
+ break;
+ }
+ }
+
+ /*
+ * If the I/O failed, determine whether we should attempt to retry it.
+ *
+ * On retry, we cut in line in the issue queue, since we don't want
+ * compression/checksumming/etc. work to prevent our (cheap) IO reissue.
+ */
+ if (zio->io_error && vd == NULL &&
+ !(zio->io_flags & (ZIO_FLAG_DONT_RETRY | ZIO_FLAG_IO_RETRY))) {
+ ASSERT(!(zio->io_flags & ZIO_FLAG_DONT_QUEUE)); /* not a leaf */
+ ASSERT(!(zio->io_flags & ZIO_FLAG_IO_BYPASS)); /* not a leaf */
+ zio->io_error = 0;
+ zio->io_flags |= ZIO_FLAG_IO_RETRY |
+ ZIO_FLAG_DONT_CACHE | ZIO_FLAG_DONT_AGGREGATE;
+ zio->io_stage = ZIO_STAGE_VDEV_IO_START >> 1;
+ zio_taskq_dispatch(zio, ZIO_TASKQ_ISSUE,
+ zio_requeue_io_start_cut_in_line);
+ return (NULL);
+ }
+
+ /*
+ * If we got an error on a leaf device, convert it to ENXIO
+ * if the device is not accessible at all.
+ */
+ if (zio->io_error && vd != NULL && vd->vdev_ops->vdev_op_leaf &&
+ !vdev_accessible(vd, zio))
+ zio->io_error = SET_ERROR(ENXIO);
+
+ /*
+ * If we can't write to an interior vdev (mirror or RAID-Z),
+ * set vdev_cant_write so that we stop trying to allocate from it.
+ */
+ if (zio->io_error == ENXIO && zio->io_type == ZIO_TYPE_WRITE &&
+ vd != NULL && !vd->vdev_ops->vdev_op_leaf) {
+ vd->vdev_cant_write = B_TRUE;
+ }
+
+ /*
+ * If a cache flush returns ENOTSUP or ENOTTY, we know that no future
+ * attempts will ever succeed. In this case we set a persistent bit so
+ * that we don't bother with it in the future.
+ */
+ if ((zio->io_error == ENOTSUP || zio->io_error == ENOTTY) &&
+ zio->io_type == ZIO_TYPE_IOCTL &&
+ zio->io_cmd == DKIOCFLUSHWRITECACHE && vd != NULL)
+ vd->vdev_nowritecache = B_TRUE;
+
+ if (zio->io_error)
+ zio->io_pipeline = ZIO_INTERLOCK_PIPELINE;
+
+ if (vd != NULL && vd->vdev_ops->vdev_op_leaf &&
+ zio->io_physdone != NULL) {
+ ASSERT(!(zio->io_flags & ZIO_FLAG_DELEGATED));
+ ASSERT(zio->io_child_type == ZIO_CHILD_VDEV);
+ zio->io_physdone(zio->io_logical);
+ }
+
+ return (zio);
+}
+
+void
+zio_vdev_io_reissue(zio_t *zio)
+{
+ ASSERT(zio->io_stage == ZIO_STAGE_VDEV_IO_START);
+ ASSERT(zio->io_error == 0);
+
+ zio->io_stage >>= 1;
+}
+
+void
+zio_vdev_io_redone(zio_t *zio)
+{
+ ASSERT(zio->io_stage == ZIO_STAGE_VDEV_IO_DONE);
+
+ zio->io_stage >>= 1;
+}
+
+void
+zio_vdev_io_bypass(zio_t *zio)
+{
+ ASSERT(zio->io_stage == ZIO_STAGE_VDEV_IO_START);
+ ASSERT(zio->io_error == 0);
+
+ zio->io_flags |= ZIO_FLAG_IO_BYPASS;
+ zio->io_stage = ZIO_STAGE_VDEV_IO_ASSESS >> 1;
+}
+
+/*
+ * ==========================================================================
+ * Generate and verify checksums
+ * ==========================================================================
+ */
+static zio_t *
+zio_checksum_generate(zio_t *zio)
+{
+ blkptr_t *bp = zio->io_bp;
+ enum zio_checksum checksum;
+
+ if (bp == NULL) {
+ /*
+ * This is zio_write_phys().
+ * We're either generating a label checksum, or none at all.
+ */
+ checksum = zio->io_prop.zp_checksum;
+
+ if (checksum == ZIO_CHECKSUM_OFF)
+ return (zio);
+
+ ASSERT(checksum == ZIO_CHECKSUM_LABEL);
+ } else {
+ if (BP_IS_GANG(bp) && zio->io_child_type == ZIO_CHILD_GANG) {
+ ASSERT(!IO_IS_ALLOCATING(zio));
+ checksum = ZIO_CHECKSUM_GANG_HEADER;
+ } else {
+ checksum = BP_GET_CHECKSUM(bp);
+ }
+ }
+
+ zio_checksum_compute(zio, checksum, zio->io_abd, zio->io_size);
+
+ return (zio);
+}
+
+static zio_t *
+zio_checksum_verify(zio_t *zio)
+{
+ zio_bad_cksum_t info;
+ blkptr_t *bp = zio->io_bp;
+ int error;
+
+ ASSERT(zio->io_vd != NULL);
+
+ if (bp == NULL) {
+ /*
+ * This is zio_read_phys().
+ * We're either verifying a label checksum, or nothing at all.
+ */
+ if (zio->io_prop.zp_checksum == ZIO_CHECKSUM_OFF)
+ return (zio);
+
+ ASSERT(zio->io_prop.zp_checksum == ZIO_CHECKSUM_LABEL);
+ }
+
+ if ((error = zio_checksum_error(zio, &info)) != 0) {
+ zio->io_error = error;
+ if (error == ECKSUM &&
+ !(zio->io_flags & ZIO_FLAG_SPECULATIVE)) {
+ zfs_ereport_start_checksum(zio->io_spa,
+ zio->io_vd, zio, zio->io_offset,
+ zio->io_size, NULL, &info);
+ }
+ }
+
+ return (zio);
+}
+
+/*
+ * Called by RAID-Z to ensure we don't compute the checksum twice.
+ */
+void
+zio_checksum_verified(zio_t *zio)
+{
+ zio->io_pipeline &= ~ZIO_STAGE_CHECKSUM_VERIFY;
+}
+
+/*
+ * ==========================================================================
+ * Error rank. Errors are ranked in the order 0, ENXIO, ECKSUM, EIO, other.
+ * An error of 0 indicates success. ENXIO indicates whole-device failure,
+ * which may be transient (e.g. unplugged) or permanent. ECKSUM and EIO
+ * indicate errors that are specific to one I/O, and most likely permanent.
+ * Any other error is presumed to be worse because we weren't expecting it.
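+ * For example, zio_worst_error(ENXIO, ECKSUM) returns ECKSUM, and an
+ * unranked errno such as EINVAL is treated as the worst of all.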
+ * ==========================================================================
+ */
+int
+zio_worst_error(int e1, int e2)
+{
+ static int zio_error_rank[] = { 0, ENXIO, ECKSUM, EIO };
+ int r1, r2;
+
+ for (r1 = 0; r1 < sizeof (zio_error_rank) / sizeof (int); r1++)
+ if (e1 == zio_error_rank[r1])
+ break;
+
+ for (r2 = 0; r2 < sizeof (zio_error_rank) / sizeof (int); r2++)
+ if (e2 == zio_error_rank[r2])
+ break;
+
+ return (r1 > r2 ? e1 : e2);
+}
+
+/*
+ * ==========================================================================
+ * I/O completion
+ * ==========================================================================
+ */
+static zio_t *
+zio_ready(zio_t *zio)
+{
+ blkptr_t *bp = zio->io_bp;
+ zio_t *pio, *pio_next;
+ zio_link_t *zl = NULL;
+
+ if (zio_wait_for_children(zio, ZIO_CHILD_GANG_BIT | ZIO_CHILD_DDT_BIT,
+ ZIO_WAIT_READY)) {
+ return (NULL);
+ }
+
+ if (zio->io_ready) {
+ ASSERT(IO_IS_ALLOCATING(zio));
+ ASSERT(bp->blk_birth == zio->io_txg || BP_IS_HOLE(bp) ||
+ (zio->io_flags & ZIO_FLAG_NOPWRITE));
+ ASSERT(zio->io_children[ZIO_CHILD_GANG][ZIO_WAIT_READY] == 0);
+
+ zio->io_ready(zio);
+ }
+
+ if (bp != NULL && bp != &zio->io_bp_copy)
+ zio->io_bp_copy = *bp;
+
+ if (zio->io_error != 0) {
+ zio->io_pipeline = ZIO_INTERLOCK_PIPELINE;
+
+ if (zio->io_flags & ZIO_FLAG_IO_ALLOCATING) {
+ ASSERT(IO_IS_ALLOCATING(zio));
+ ASSERT(zio->io_priority == ZIO_PRIORITY_ASYNC_WRITE);
+ ASSERT(zio->io_metaslab_class != NULL);
+
+ /*
+ * We were unable to allocate anything, unreserve and
+ * issue the next I/O to allocate.
+ */
+ metaslab_class_throttle_unreserve(
+ zio->io_metaslab_class, zio->io_prop.zp_copies,
+ zio->io_allocator, zio);
+ zio_allocate_dispatch(zio->io_spa, zio->io_allocator);
+ }
+ }
+
+ mutex_enter(&zio->io_lock);
+ zio->io_state[ZIO_WAIT_READY] = 1;
+ pio = zio_walk_parents(zio, &zl);
+ mutex_exit(&zio->io_lock);
+
+ /*
+ * As we notify zio's parents, new parents could be added.
+ * New parents go to the head of zio's io_parent_list, however,
+ * so we will (correctly) not notify them. The remainder of zio's
+ * io_parent_list, from 'pio_next' onward, cannot change because
+ * all parents must wait for us to be done before they can be done.
+ */
+ for (; pio != NULL; pio = pio_next) {
+ pio_next = zio_walk_parents(zio, &zl);
+ zio_notify_parent(pio, zio, ZIO_WAIT_READY, NULL);
+ }
+
+ if (zio->io_flags & ZIO_FLAG_NODATA) {
+ if (BP_IS_GANG(bp)) {
+ zio->io_flags &= ~ZIO_FLAG_NODATA;
+ } else {
+ ASSERT((uintptr_t)zio->io_abd < SPA_MAXBLOCKSIZE);
+ zio->io_pipeline &= ~ZIO_VDEV_IO_STAGES;
+ }
+ }
+
+ if (zio_injection_enabled &&
+ zio->io_spa->spa_syncing_txg == zio->io_txg)
+ zio_handle_ignored_writes(zio);
+
+ return (zio);
+}
+
+/*
+ * Update the allocation throttle accounting.
+ */
+static void
+zio_dva_throttle_done(zio_t *zio)
+{
+ zio_t *lio = zio->io_logical;
+ zio_t *pio = zio_unique_parent(zio);
+ vdev_t *vd = zio->io_vd;
+ int flags = METASLAB_ASYNC_ALLOC;
+
+ ASSERT3P(zio->io_bp, !=, NULL);
+ ASSERT3U(zio->io_type, ==, ZIO_TYPE_WRITE);
+ ASSERT3U(zio->io_priority, ==, ZIO_PRIORITY_ASYNC_WRITE);
+ ASSERT3U(zio->io_child_type, ==, ZIO_CHILD_VDEV);
+ ASSERT(vd != NULL);
+ ASSERT3P(vd, ==, vd->vdev_top);
+ ASSERT(!(zio->io_flags & (ZIO_FLAG_IO_REPAIR | ZIO_FLAG_IO_RETRY)));
+ ASSERT(zio->io_flags & ZIO_FLAG_IO_ALLOCATING);
+ ASSERT(!(lio->io_flags & ZIO_FLAG_IO_REWRITE));
+ ASSERT(!(lio->io_orig_flags & ZIO_FLAG_NODATA));
+
+ /*
+ * Parents of gang children can have two flavors -- ones that
+ * allocated the gang header (will have ZIO_FLAG_IO_REWRITE set)
+ * and ones that allocated the constituent blocks. The allocation
+ * throttle needs to know the allocating parent zio so we must find
+ * it here.
+ */
+ if (pio->io_child_type == ZIO_CHILD_GANG) {
+ /*
+ * If our parent is a rewrite gang child then our grandparent
+ * would have been the one that performed the allocation.
+ */
+ if (pio->io_flags & ZIO_FLAG_IO_REWRITE)
+ pio = zio_unique_parent(pio);
+ flags |= METASLAB_GANG_CHILD;
+ }
+
+ ASSERT(IO_IS_ALLOCATING(pio));
+ ASSERT3P(zio, !=, zio->io_logical);
+ ASSERT(zio->io_logical != NULL);
+ ASSERT(!(zio->io_flags & ZIO_FLAG_IO_REPAIR));
+ ASSERT0(zio->io_flags & ZIO_FLAG_NOPWRITE);
+ ASSERT(zio->io_metaslab_class != NULL);
+
+ mutex_enter(&pio->io_lock);
+ metaslab_group_alloc_decrement(zio->io_spa, vd->vdev_id, pio, flags,
+ pio->io_allocator, B_TRUE);
+ mutex_exit(&pio->io_lock);
+
+ metaslab_class_throttle_unreserve(zio->io_metaslab_class, 1,
+ pio->io_allocator, pio);
+
+ /*
+ * Call into the pipeline to see if there is more work that
+ * needs to be done. If there is work to be done it will be
+ * dispatched to another taskq thread.
+ */
+ zio_allocate_dispatch(zio->io_spa, pio->io_allocator);
+}
+
+static zio_t *
+zio_done(zio_t *zio)
+{
+ spa_t *spa = zio->io_spa;
+ zio_t *lio = zio->io_logical;
+ blkptr_t *bp = zio->io_bp;
+ vdev_t *vd = zio->io_vd;
+ uint64_t psize = zio->io_size;
+ zio_t *pio, *pio_next;
+ zio_link_t *zl = NULL;
+
+ /*
+ * If our children haven't all completed,
+ * wait for them and then repeat this pipeline stage.
+ */
+ if (zio_wait_for_children(zio, ZIO_CHILD_ALL_BITS, ZIO_WAIT_DONE)) {
+ return (NULL);
+ }
+
+ /*
+ * If the allocation throttle is enabled, then update the accounting.
+ * We only track child I/Os that are part of an allocating async
+ * write. We must do this since the allocation is performed
+ * by the logical I/O but the actual write is done by child I/Os.
+ */
+ if (zio->io_flags & ZIO_FLAG_IO_ALLOCATING &&
+ zio->io_child_type == ZIO_CHILD_VDEV) {
+ ASSERT(zio->io_metaslab_class != NULL);
+ ASSERT(zio->io_metaslab_class->mc_alloc_throttle_enabled);
+ zio_dva_throttle_done(zio);
+ }
+
+ /*
+ * If the allocation throttle is enabled, verify that
+ * we have decremented the refcounts for every I/O that was throttled.
+ */
+ if (zio->io_flags & ZIO_FLAG_IO_ALLOCATING) {
+ ASSERT(zio->io_type == ZIO_TYPE_WRITE);
+ ASSERT(zio->io_priority == ZIO_PRIORITY_ASYNC_WRITE);
+ ASSERT(bp != NULL);
+
+ metaslab_group_alloc_verify(spa, zio->io_bp, zio,
+ zio->io_allocator);
+ VERIFY(zfs_refcount_not_held(
+ &zio->io_metaslab_class->mc_alloc_slots[zio->io_allocator],
+ zio));
+ }
+
+ for (int c = 0; c < ZIO_CHILD_TYPES; c++)
+ for (int w = 0; w < ZIO_WAIT_TYPES; w++)
+ ASSERT(zio->io_children[c][w] == 0);
+
+ if (bp != NULL && !BP_IS_EMBEDDED(bp)) {
+ ASSERT(bp->blk_pad[0] == 0);
+ ASSERT(bp->blk_pad[1] == 0);
+ ASSERT(bcmp(bp, &zio->io_bp_copy, sizeof (blkptr_t)) == 0 ||
+ (bp == zio_unique_parent(zio)->io_bp));
+ if (zio->io_type == ZIO_TYPE_WRITE && !BP_IS_HOLE(bp) &&
+ zio->io_bp_override == NULL &&
+ !(zio->io_flags & ZIO_FLAG_IO_REPAIR)) {
+ ASSERT(!BP_SHOULD_BYTESWAP(bp));
+ ASSERT3U(zio->io_prop.zp_copies, <=, BP_GET_NDVAS(bp));
+ ASSERT(BP_COUNT_GANG(bp) == 0 ||
+ (BP_COUNT_GANG(bp) == BP_GET_NDVAS(bp)));
+ }
+ if (zio->io_flags & ZIO_FLAG_NOPWRITE)
+ VERIFY(BP_EQUAL(bp, &zio->io_bp_orig));
+ }
+
+ /*
+ * If there were child vdev/gang/ddt errors, they apply to us now.
+ */
+ zio_inherit_child_errors(zio, ZIO_CHILD_VDEV);
+ zio_inherit_child_errors(zio, ZIO_CHILD_GANG);
+ zio_inherit_child_errors(zio, ZIO_CHILD_DDT);
+
+ /*
+ * If the I/O on the transformed data was successful, generate any
+ * checksum reports now while we still have the transformed data.
+ */
+ if (zio->io_error == 0) {
+ while (zio->io_cksum_report != NULL) {
+ zio_cksum_report_t *zcr = zio->io_cksum_report;
+ uint64_t align = zcr->zcr_align;
+ uint64_t asize = P2ROUNDUP(psize, align);
+ char *abuf = NULL;
+ abd_t *adata = zio->io_abd;
+
+ if (asize != psize) {
+ adata = abd_alloc_linear(asize, B_TRUE);
+ abd_copy(adata, zio->io_abd, psize);
+ abd_zero_off(adata, psize, asize - psize);
+ }
+
+ if (adata != NULL)
+ abuf = abd_borrow_buf_copy(adata, asize);
+
+ zio->io_cksum_report = zcr->zcr_next;
+ zcr->zcr_next = NULL;
+ zcr->zcr_finish(zcr, abuf);
+ zfs_ereport_free_checksum(zcr);
+
+ if (adata != NULL)
+ abd_return_buf(adata, abuf, asize);
+
+ if (asize != psize)
+ abd_free(adata);
+ }
+ }
+
+ zio_pop_transforms(zio); /* note: may set zio->io_error */
+
+ vdev_stat_update(zio, psize);
+
+ if (zio->io_error) {
+ /*
+ * If this I/O is attached to a particular vdev,
+ * generate an error message describing the I/O failure
+ * at the block level. We ignore these errors if the
+ * device is currently unavailable.
+ */
+ if (zio->io_error != ECKSUM && vd != NULL && !vdev_is_dead(vd))
+ zfs_ereport_post(FM_EREPORT_ZFS_IO, spa, vd, zio, 0, 0);
+
+ if ((zio->io_error == EIO || !(zio->io_flags &
+ (ZIO_FLAG_SPECULATIVE | ZIO_FLAG_DONT_PROPAGATE))) &&
+ zio == lio) {
+ /*
+ * For logical I/O requests, tell the SPA to log the
+ * error and generate a logical data ereport.
+ */
+ spa_log_error(spa, zio);
+ zfs_ereport_post(FM_EREPORT_ZFS_DATA, spa, NULL, zio,
+ 0, 0);
+ }
+ }
+
+ if (zio->io_error && zio == lio) {
+ /*
+ * Determine whether zio should be reexecuted. This will
+ * propagate all the way to the root via zio_notify_parent().
+ */
+ ASSERT(vd == NULL && bp != NULL);
+ ASSERT(zio->io_child_type == ZIO_CHILD_LOGICAL);
+
+ if (IO_IS_ALLOCATING(zio) &&
+ !(zio->io_flags & ZIO_FLAG_CANFAIL)) {
+ if (zio->io_error != ENOSPC)
+ zio->io_reexecute |= ZIO_REEXECUTE_NOW;
+ else
+ zio->io_reexecute |= ZIO_REEXECUTE_SUSPEND;
+ }
+
+ if ((zio->io_type == ZIO_TYPE_READ ||
+ zio->io_type == ZIO_TYPE_FREE) &&
+ !(zio->io_flags & ZIO_FLAG_SCAN_THREAD) &&
+ zio->io_error == ENXIO &&
+ spa_load_state(spa) == SPA_LOAD_NONE &&
+ spa_get_failmode(spa) != ZIO_FAILURE_MODE_CONTINUE)
+ zio->io_reexecute |= ZIO_REEXECUTE_SUSPEND;
+
+ if (!(zio->io_flags & ZIO_FLAG_CANFAIL) && !zio->io_reexecute)
+ zio->io_reexecute |= ZIO_REEXECUTE_SUSPEND;
+
+ /*
+ * This might be a good place to attempt either combinatorial
+ * reconstruction or error correction based on checksums. It
+ * also might be a good place to send out preliminary ereports
+ * before we suspend processing.
+ */
+ }
+
+ /*
+ * If there were logical child errors, they apply to us now.
+ * We defer this until now to avoid conflating logical child
+ * errors with errors that happened to the zio itself when
+ * updating vdev stats and reporting FMA events above.
+ */
+ zio_inherit_child_errors(zio, ZIO_CHILD_LOGICAL);
+
+ if ((zio->io_error || zio->io_reexecute) &&
+ IO_IS_ALLOCATING(zio) && zio->io_gang_leader == zio &&
+ !(zio->io_flags & (ZIO_FLAG_IO_REWRITE | ZIO_FLAG_NOPWRITE)))
+ zio_dva_unallocate(zio, zio->io_gang_tree, bp);
+
+ zio_gang_tree_free(&zio->io_gang_tree);
+
+ /*
+ * Godfather I/Os should never suspend.
+ */
+ if ((zio->io_flags & ZIO_FLAG_GODFATHER) &&
+ (zio->io_reexecute & ZIO_REEXECUTE_SUSPEND))
+ zio->io_reexecute = 0;
+
+ if (zio->io_reexecute) {
+ /*
+ * This is a logical I/O that wants to reexecute.
+ *
+ * Reexecute is top-down. When an i/o fails, if it's not
+ * the root, it simply notifies its parent and sticks around.
+ * The parent, seeing that it still has children in zio_done(),
+ * does the same. This percolates all the way up to the root.
+ * The root i/o will reexecute or suspend the entire tree.
+ *
+ * This approach ensures that zio_reexecute() honors
+ * all the original i/o dependency relationships, e.g.
+ * parents not executing until children are ready.
+ */
+ ASSERT(zio->io_child_type == ZIO_CHILD_LOGICAL);
+
+ zio->io_gang_leader = NULL;
+
+ mutex_enter(&zio->io_lock);
+ zio->io_state[ZIO_WAIT_DONE] = 1;
+ mutex_exit(&zio->io_lock);
+
+ /*
+ * "The Godfather" I/O monitors its children but is
+ * not a true parent to them. It will track them through
+ * the pipeline but severs its ties whenever they get into
+ * trouble (e.g. suspended). This allows "The Godfather"
+ * I/O to return status without blocking.
+ */
+ zl = NULL;
+ for (pio = zio_walk_parents(zio, &zl); pio != NULL;
+ pio = pio_next) {
+ zio_link_t *remove_zl = zl;
+ pio_next = zio_walk_parents(zio, &zl);
+
+ if ((pio->io_flags & ZIO_FLAG_GODFATHER) &&
+ (zio->io_reexecute & ZIO_REEXECUTE_SUSPEND)) {
+ zio_remove_child(pio, zio, remove_zl);
+ /*
+ * This is a rare code path, so we don't
+ * bother with "next_to_execute".
+ */
+ zio_notify_parent(pio, zio, ZIO_WAIT_DONE,
+ NULL);
+ }
+ }
+
+ if ((pio = zio_unique_parent(zio)) != NULL) {
+ /*
+ * We're not a root i/o, so there's nothing to do
+ * but notify our parent. Don't propagate errors
+ * upward since we haven't permanently failed yet.
+ */
+ ASSERT(!(zio->io_flags & ZIO_FLAG_GODFATHER));
+ zio->io_flags |= ZIO_FLAG_DONT_PROPAGATE;
+ /*
+ * This is a rare code path, so we don't bother with
+ * "next_to_execute".
+ */
+ zio_notify_parent(pio, zio, ZIO_WAIT_DONE, NULL);
+ } else if (zio->io_reexecute & ZIO_REEXECUTE_SUSPEND) {
+ /*
+ * We'd fail again if we reexecuted now, so suspend
+ * until conditions improve (e.g. device comes online).
+ */
+ zio_suspend(zio->io_spa, zio, ZIO_SUSPEND_IOERR);
+ } else {
+ /*
+ * Reexecution is potentially a huge amount of work.
+ * Hand it off to the otherwise-unused claim taskq.
+ */
+#if defined(illumos) || !defined(_KERNEL)
+ ASSERT(zio->io_tqent.tqent_next == NULL);
+#else
+ ASSERT(zio->io_tqent.tqent_task.ta_pending == 0);
+#endif
+ spa_taskq_dispatch_ent(spa, ZIO_TYPE_CLAIM,
+ ZIO_TASKQ_ISSUE, (task_func_t *)zio_reexecute, zio,
+ 0, &zio->io_tqent);
+ }
+ return (NULL);
+ }
+
+ ASSERT(zio->io_child_count == 0);
+ ASSERT(zio->io_reexecute == 0);
+ ASSERT(zio->io_error == 0 || (zio->io_flags & ZIO_FLAG_CANFAIL));
+
+ /*
+ * Report any checksum errors, since the I/O is complete.
+ */
+ while (zio->io_cksum_report != NULL) {
+ zio_cksum_report_t *zcr = zio->io_cksum_report;
+ zio->io_cksum_report = zcr->zcr_next;
+ zcr->zcr_next = NULL;
+ zcr->zcr_finish(zcr, NULL);
+ zfs_ereport_free_checksum(zcr);
+ }
+
+ /*
+ * It is the responsibility of the done callback to ensure that this
+ * particular zio is no longer discoverable for adoption, and as
+ * such, cannot acquire any new parents.
+ */
+ if (zio->io_done)
+ zio->io_done(zio);
+
+ mutex_enter(&zio->io_lock);
+ zio->io_state[ZIO_WAIT_DONE] = 1;
+ mutex_exit(&zio->io_lock);
+
+ /*
+ * We are done executing this zio. We may want to execute a parent
+ * next. See the comment in zio_notify_parent().
+ */
+ zio_t *next_to_execute = NULL;
+ zl = NULL;
+ for (pio = zio_walk_parents(zio, &zl); pio != NULL; pio = pio_next) {
+ zio_link_t *remove_zl = zl;
+ pio_next = zio_walk_parents(zio, &zl);
+ zio_remove_child(pio, zio, remove_zl);
+ zio_notify_parent(pio, zio, ZIO_WAIT_DONE, &next_to_execute);
+ }
+
+ if (zio->io_waiter != NULL) {
+ mutex_enter(&zio->io_lock);
+ zio->io_executor = NULL;
+ cv_broadcast(&zio->io_cv);
+ mutex_exit(&zio->io_lock);
+ } else {
+ zio_destroy(zio);
+ }
+
+ return (next_to_execute);
+}
+
+/*
+ * ==========================================================================
+ * I/O pipeline definition
+ * ==========================================================================
+ */
+static zio_pipe_stage_t *zio_pipeline[] = {
+ NULL,
+ zio_read_bp_init,
+ zio_write_bp_init,
+ zio_free_bp_init,
+ zio_issue_async,
+ zio_write_compress,
+ zio_checksum_generate,
+ zio_nop_write,
+ zio_ddt_read_start,
+ zio_ddt_read_done,
+ zio_ddt_write,
+ zio_ddt_free,
+ zio_gang_assemble,
+ zio_gang_issue,
+ zio_dva_throttle,
+ zio_dva_allocate,
+ zio_dva_free,
+ zio_dva_claim,
+ zio_ready,
+ zio_vdev_io_start,
+ zio_vdev_io_done,
+ zio_vdev_io_assess,
+ zio_checksum_verify,
+ zio_done
+};
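+
+/*
+ * Note that the order of the stages above must match the order of the
+ * ZIO_STAGE_* bits in zio_impl.h: zio_execute() indexes this table by
+ * the bit position of the current pipeline stage.
+ */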
+
+/*
+ * Compare two zbookmark_phys_t's to see which we would reach first in a
+ * pre-order traversal of the object tree.
+ *
+ * This is simple in every case aside from the meta-dnode object. For all other
+ * objects, we traverse them in order (object 1 before object 2, and so on).
+ * However, all of these objects are traversed while traversing object 0, since
+ * the data it points to is the list of objects. Thus, we need to convert to a
+ * canonical representation so we can compare meta-dnode bookmarks to
+ * non-meta-dnode bookmarks.
+ *
+ * We do this by calculating "equivalents" for each field of the zbookmark.
+ * zbookmarks outside of the meta-dnode use their own object and level, and
+ * calculate the level 0 equivalent (the first L0 blkid that is contained in the
+ * blocks this bookmark refers to) by multiplying their blkid by their span
+ * (the number of L0 blocks contained within one block at their level).
+ * zbookmarks inside the meta-dnode calculate their object equivalent
+ * (which is L0equiv * dnodes per data block), use 0 for their L0equiv, and use
+ * level + 1<<31 (any value larger than a level could ever be) for their level.
+ * This causes them to always compare before a bookmark in their object
+ * equivalent, compare appropriately to bookmarks in other objects, and to
+ * compare appropriately to other bookmarks in the meta-dnode.
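+ *
+ * For example, with a 128K indirect block size (indblkshift of 17),
+ * each indirect block holds 2^(17 - SPA_BLKPTRSHIFT) = 1024 block
+ * pointers, so BP_SPANB(17, 1) is 1024 and a level-1 bookmark with
+ * blkid 3 has an L0 equivalent of 3 * 1024 = 3072.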
+ */
+int
+zbookmark_compare(uint16_t dbss1, uint8_t ibs1, uint16_t dbss2, uint8_t ibs2,
+ const zbookmark_phys_t *zb1, const zbookmark_phys_t *zb2)
+{
+ /*
+ * These variables represent the "equivalent" values for the zbookmark,
+ * after converting zbookmarks inside the meta dnode to their
+ * normal-object equivalents.
+ */
+ uint64_t zb1obj, zb2obj;
+ uint64_t zb1L0, zb2L0;
+ uint64_t zb1level, zb2level;
+
+ if (zb1->zb_object == zb2->zb_object &&
+ zb1->zb_level == zb2->zb_level &&
+ zb1->zb_blkid == zb2->zb_blkid)
+ return (0);
+
+ /*
+ * BP_SPANB calculates the span in blocks.
+ */
+ zb1L0 = (zb1->zb_blkid) * BP_SPANB(ibs1, zb1->zb_level);
+ zb2L0 = (zb2->zb_blkid) * BP_SPANB(ibs2, zb2->zb_level);
+
+ if (zb1->zb_object == DMU_META_DNODE_OBJECT) {
+ zb1obj = zb1L0 * (dbss1 << (SPA_MINBLOCKSHIFT - DNODE_SHIFT));
+ zb1L0 = 0;
+ zb1level = zb1->zb_level + COMPARE_META_LEVEL;
+ } else {
+ zb1obj = zb1->zb_object;
+ zb1level = zb1->zb_level;
+ }
+
+ if (zb2->zb_object == DMU_META_DNODE_OBJECT) {
+ zb2obj = zb2L0 * (dbss2 << (SPA_MINBLOCKSHIFT - DNODE_SHIFT));
+ zb2L0 = 0;
+ zb2level = zb2->zb_level + COMPARE_META_LEVEL;
+ } else {
+ zb2obj = zb2->zb_object;
+ zb2level = zb2->zb_level;
+ }
+
+ /* Now that we have a canonical representation, do the comparison. */
+ if (zb1obj != zb2obj)
+ return (zb1obj < zb2obj ? -1 : 1);
+ else if (zb1L0 != zb2L0)
+ return (zb1L0 < zb2L0 ? -1 : 1);
+ else if (zb1level != zb2level)
+ return (zb1level > zb2level ? -1 : 1);
+ /*
+ * This can (theoretically) happen if the bookmarks have the same
+ * object and level but different blkids, which is only possible when
+ * the indirect block sizes differ. There is presently no way to
+ * change the indirect block sizes.
+ */
+ return (0);
+}
+
+/*
+ * This function checks the following: given that last_block is the place that
+ * our traversal stopped last time, does that guarantee that we've visited
+ * every node under subtree_root? The raw output of zbookmark_compare
+ * can't answer that by itself. Instead, we pass in a modified version of
+ * subtree_root: by incrementing its block id and then checking whether
+ * last_block is before or equal to that, we can tell whether or not having
+ * visited last_block implies that all of subtree_root's children have been
+ * visited.
+ */
+boolean_t
+zbookmark_subtree_completed(const dnode_phys_t *dnp,
+ const zbookmark_phys_t *subtree_root, const zbookmark_phys_t *last_block)
+{
+ zbookmark_phys_t mod_zb = *subtree_root;
+ mod_zb.zb_blkid++;
+ ASSERT(last_block->zb_level == 0);
+
+ /* The objset_phys_t isn't before anything. */
+ if (dnp == NULL)
+ return (B_FALSE);
+
+ /*
+ * We pass in 1ULL << (DNODE_BLOCK_SHIFT - SPA_MINBLOCKSHIFT) for the
+ * data block size in sectors, because that variable is only used if
+ * the bookmark refers to a block in the meta-dnode. Since we don't
+ * know without examining it what object it refers to, and there's no
+ * harm in passing in this value in other cases, we always pass it in.
+ *
+ * We pass in 0 for the indirect block size shift because zb2 must be
+ * level 0. The indirect block size is only used to calculate the span
+ * of the bookmark, but since the bookmark must be level 0, the span is
+ * always 1, so the math works out.
+ *
+ * If you make changes to how the zbookmark_compare code works, be sure
+ * to make sure that this code still works afterwards.
+ */
+ return (zbookmark_compare(dnp->dn_datablkszsec, dnp->dn_indblkshift,
+ 1ULL << (DNODE_BLOCK_SHIFT - SPA_MINBLOCKSHIFT), 0, &mod_zb,
+ last_block) <= 0);
+}
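+
+/*
+ * Illustrative use (editor's sketch; 'resume' is a hypothetical saved
+ * bookmark): a resuming traversal can prune a whole subtree once the
+ * saved bookmark shows it was already covered:
+ *
+ * if (zbookmark_subtree_completed(dnp, zb, resume))
+ * ... skip this subtree, it was already visited ...
+ */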
diff --git a/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/zio_checksum.c b/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/zio_checksum.c
new file mode 100644
index 000000000000..8924804a6fcb
--- /dev/null
+++ b/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/zio_checksum.c
@@ -0,0 +1,475 @@
+/*
+ * CDDL HEADER START
+ *
+ * The contents of this file are subject to the terms of the
+ * Common Development and Distribution License (the "License").
+ * You may not use this file except in compliance with the License.
+ *
+ * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
+ * or http://www.opensolaris.org/os/licensing.
+ * See the License for the specific language governing permissions
+ * and limitations under the License.
+ *
+ * When distributing Covered Code, include this CDDL HEADER in each
+ * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
+ * If applicable, add the following below this CDDL HEADER, with the
+ * fields enclosed by brackets "[]" replaced with your own identifying
+ * information: Portions Copyright [yyyy] [name of copyright owner]
+ *
+ * CDDL HEADER END
+ */
+/*
+ * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved.
+ * Copyright (c) 2013, 2016 by Delphix. All rights reserved.
+ * Copyright (c) 2013, Joyent, Inc. All rights reserved.
+ * Copyright 2013 Saso Kiselkov. All rights reserved.
+ */
+
+#include <sys/zfs_context.h>
+#include <sys/spa.h>
+#include <sys/spa_impl.h>
+#include <sys/zio.h>
+#include <sys/zio_checksum.h>
+#include <sys/zil.h>
+#include <sys/abd.h>
+#include <zfs_fletcher.h>
+
+/*
+ * Checksum vectors.
+ *
+ * In the SPA, everything is checksummed. We support checksum vectors
+ * for three distinct reasons:
+ *
+ * 1. Different kinds of data need different levels of protection.
+ * For SPA metadata, we always want a very strong checksum.
+ * For user data, we let users make the trade-off between speed
+ * and checksum strength.
+ *
+ * 2. Cryptographic hash and MAC algorithms are an area of active research.
+ * It is likely that, in the future, hash functions will be at least as
+ * strong as the current best of breed, and may be substantially faster
+ * as well.
+ * We want the ability to take advantage of these new hashes as soon as
+ * they become available.
+ *
+ * 3. If someone develops hardware that can compute a strong hash quickly,
+ * we want the ability to take advantage of that hardware.
+ *
+ * Of course, we don't want a checksum upgrade to invalidate existing
+ * data, so we store the checksum *function* in eight bits of the bp.
+ * This gives us room for up to 256 different checksum functions.
+ *
+ * When writing a block, we always checksum it with the latest-and-greatest
+ * checksum function of the appropriate strength. When reading a block,
+ * we compare the expected checksum against the actual checksum, which we
+ * compute via the checksum function specified by BP_GET_CHECKSUM(bp).
+ *
+ * SALTED CHECKSUMS
+ *
+ * To enable the use of less secure hash algorithms with dedup, we
+ * introduce the notion of salted checksums (MACs, really). A salted
+ * checksum is fed both a random 256-bit value (the salt) and the data
+ * to be checksummed. This salt is kept secret (stored on the pool, but
+ * never shown to the user). Thus even if an attacker knew of collision
+ * weaknesses in the hash algorithm, they would not be able to mount a
+ * known-plaintext attack on the DDT, since the actual hash value cannot
+ * be known ahead of time. How the salt is used is algorithm-specific
+ * (some might simply prefix it to the data block, others might need to
+ * utilize a full-blown HMAC). On disk the salt is stored in a ZAP
+ * object in the MOS (DMU_POOL_CHECKSUM_SALT).
+ *
+ * CONTEXT TEMPLATES
+ *
+ * Some hashing algorithms need to perform a substantial amount of
+ * initialization work (e.g. salted checksums above may need to pre-hash
+ * the salt) before being able to process data. Performing this
+ * redundant work for each block would be wasteful, so we instead allow
+ * a checksum algorithm to do the work once (the first time it's used)
+ * and then keep this pre-initialized context as a template inside the
+ * spa_t (spa_cksum_tmpls). If the zio_checksum_info_t contains
+ * non-NULL ci_tmpl_init and ci_tmpl_free callbacks, they are used to
+ * construct and destroy the pre-initialized checksum context. The
+ * pre-initialized context is then reused during each checksum
+ * invocation and passed to the checksum function.
+ */
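+
+/*
+ * Illustrative sketch (editor's addition, not part of the original code):
+ * dispatching through the table below the way zio_checksum_compute() does.
+ * ci_func[0] is the native-endian vector and ci_func[1] the byteswapped
+ * one; the third argument is the pre-initialized context template, or
+ * NULL for algorithms that don't use one.
+ *
+ * zio_checksum_info_t *ci = &zio_checksum_table[ZIO_CHECKSUM_FLETCHER_4];
+ * zio_cksum_t zc;
+ * ci->ci_func[0](abd, size, NULL, &zc);
+ */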
+
+/*ARGSUSED*/
+static void
+abd_checksum_off(abd_t *abd, uint64_t size,
+ const void *ctx_template, zio_cksum_t *zcp)
+{
+ ZIO_SET_CHECKSUM(zcp, 0, 0, 0, 0);
+}
+
+/*ARGSUSED*/
+void
+abd_fletcher_2_native(abd_t *abd, uint64_t size,
+ const void *ctx_template, zio_cksum_t *zcp)
+{
+ fletcher_init(zcp);
+ (void) abd_iterate_func(abd, 0, size,
+ fletcher_2_incremental_native, zcp);
+}
+
+/*ARGSUSED*/
+void
+abd_fletcher_2_byteswap(abd_t *abd, uint64_t size,
+ const void *ctx_template, zio_cksum_t *zcp)
+{
+ fletcher_init(zcp);
+ (void) abd_iterate_func(abd, 0, size,
+ fletcher_2_incremental_byteswap, zcp);
+}
+
+/*ARGSUSED*/
+void
+abd_fletcher_4_native(abd_t *abd, uint64_t size,
+ const void *ctx_template, zio_cksum_t *zcp)
+{
+ fletcher_init(zcp);
+ (void) abd_iterate_func(abd, 0, size,
+ fletcher_4_incremental_native, zcp);
+}
+
+/*ARGSUSED*/
+void
+abd_fletcher_4_byteswap(abd_t *abd, uint64_t size,
+ const void *ctx_template, zio_cksum_t *zcp)
+{
+ fletcher_init(zcp);
+ (void) abd_iterate_func(abd, 0, size,
+ fletcher_4_incremental_byteswap, zcp);
+}
+
+zio_checksum_info_t zio_checksum_table[ZIO_CHECKSUM_FUNCTIONS] = {
+ {{NULL, NULL}, NULL, NULL, 0, "inherit"},
+ {{NULL, NULL}, NULL, NULL, 0, "on"},
+ {{abd_checksum_off, abd_checksum_off},
+ NULL, NULL, 0, "off"},
+ {{abd_checksum_SHA256, abd_checksum_SHA256},
+ NULL, NULL, ZCHECKSUM_FLAG_METADATA | ZCHECKSUM_FLAG_EMBEDDED,
+ "label"},
+ {{abd_checksum_SHA256, abd_checksum_SHA256},
+ NULL, NULL, ZCHECKSUM_FLAG_METADATA | ZCHECKSUM_FLAG_EMBEDDED,
+ "gang_header"},
+ {{abd_fletcher_2_native, abd_fletcher_2_byteswap},
+ NULL, NULL, ZCHECKSUM_FLAG_EMBEDDED, "zilog"},
+ {{abd_fletcher_2_native, abd_fletcher_2_byteswap},
+ NULL, NULL, 0, "fletcher2"},
+ {{abd_fletcher_4_native, abd_fletcher_4_byteswap},
+ NULL, NULL, ZCHECKSUM_FLAG_METADATA, "fletcher4"},
+ {{abd_checksum_SHA256, abd_checksum_SHA256},
+ NULL, NULL, ZCHECKSUM_FLAG_METADATA | ZCHECKSUM_FLAG_DEDUP |
+ ZCHECKSUM_FLAG_NOPWRITE, "sha256"},
+ {{abd_fletcher_4_native, abd_fletcher_4_byteswap},
+ NULL, NULL, ZCHECKSUM_FLAG_EMBEDDED, "zilog2"},
+ {{abd_checksum_off, abd_checksum_off},
+ NULL, NULL, 0, "noparity"},
+ {{abd_checksum_SHA512_native, abd_checksum_SHA512_byteswap},
+ NULL, NULL, ZCHECKSUM_FLAG_METADATA | ZCHECKSUM_FLAG_DEDUP |
+ ZCHECKSUM_FLAG_NOPWRITE, "sha512"},
+ {{abd_checksum_skein_native, abd_checksum_skein_byteswap},
+ abd_checksum_skein_tmpl_init, abd_checksum_skein_tmpl_free,
+ ZCHECKSUM_FLAG_METADATA | ZCHECKSUM_FLAG_DEDUP |
+ ZCHECKSUM_FLAG_SALTED | ZCHECKSUM_FLAG_NOPWRITE, "skein"},
+#ifdef illumos
+ {{abd_checksum_edonr_native, abd_checksum_edonr_byteswap},
+ abd_checksum_edonr_tmpl_init, abd_checksum_edonr_tmpl_free,
+ ZCHECKSUM_FLAG_METADATA | ZCHECKSUM_FLAG_SALTED |
+ ZCHECKSUM_FLAG_NOPWRITE, "edonr"},
+#endif
+};
+
+/*
+ * The flag corresponding to the "verify" in dedup=[checksum,]verify
+ * must be cleared first, so callers should use ZIO_CHECKSUM_MASK.
+ */
+spa_feature_t
+zio_checksum_to_feature(enum zio_checksum cksum)
+{
+ VERIFY((cksum & ~ZIO_CHECKSUM_MASK) == 0);
+
+ switch (cksum) {
+ case ZIO_CHECKSUM_SHA512:
+ return (SPA_FEATURE_SHA512);
+ case ZIO_CHECKSUM_SKEIN:
+ return (SPA_FEATURE_SKEIN);
+#ifdef illumos
+ case ZIO_CHECKSUM_EDONR:
+ return (SPA_FEATURE_EDONR);
+#endif
+ }
+ return (SPA_FEATURE_NONE);
+}
+
+enum zio_checksum
+zio_checksum_select(enum zio_checksum child, enum zio_checksum parent)
+{
+ ASSERT(child < ZIO_CHECKSUM_FUNCTIONS);
+ ASSERT(parent < ZIO_CHECKSUM_FUNCTIONS);
+ ASSERT(parent != ZIO_CHECKSUM_INHERIT && parent != ZIO_CHECKSUM_ON);
+
+ if (child == ZIO_CHECKSUM_INHERIT)
+ return (parent);
+
+ if (child == ZIO_CHECKSUM_ON)
+ return (ZIO_CHECKSUM_ON_VALUE);
+
+ return (child);
+}
+
+enum zio_checksum
+zio_checksum_dedup_select(spa_t *spa, enum zio_checksum child,
+ enum zio_checksum parent)
+{
+ ASSERT((child & ZIO_CHECKSUM_MASK) < ZIO_CHECKSUM_FUNCTIONS);
+ ASSERT((parent & ZIO_CHECKSUM_MASK) < ZIO_CHECKSUM_FUNCTIONS);
+ ASSERT(parent != ZIO_CHECKSUM_INHERIT && parent != ZIO_CHECKSUM_ON);
+
+ if (child == ZIO_CHECKSUM_INHERIT)
+ return (parent);
+
+ if (child == ZIO_CHECKSUM_ON)
+ return (spa_dedup_checksum(spa));
+
+ if (child == (ZIO_CHECKSUM_ON | ZIO_CHECKSUM_VERIFY))
+ return (spa_dedup_checksum(spa) | ZIO_CHECKSUM_VERIFY);
+
+ ASSERT((zio_checksum_table[child & ZIO_CHECKSUM_MASK].ci_flags &
+ ZCHECKSUM_FLAG_DEDUP) ||
+ (child & ZIO_CHECKSUM_VERIFY) || child == ZIO_CHECKSUM_OFF);
+
+ return (child);
+}
+
+/*
+ * Set the external verifier for a gang block based on <vdev, offset, txg>,
+ * a tuple which is guaranteed to be unique for the life of the pool.
+ */
+static void
+zio_checksum_gang_verifier(zio_cksum_t *zcp, blkptr_t *bp)
+{
+ dva_t *dva = BP_IDENTITY(bp);
+ uint64_t txg = BP_PHYSICAL_BIRTH(bp);
+
+ ASSERT(BP_IS_GANG(bp));
+
+ ZIO_SET_CHECKSUM(zcp, DVA_GET_VDEV(dva), DVA_GET_OFFSET(dva), txg, 0);
+}
+
+/*
+ * Set the external verifier for a label block based on its offset.
+ * The vdev is implicit, and the txg is unknowable at pool open time --
+ * hence the logic in vdev_uberblock_load() to find the most recent copy.
+ */
+static void
+zio_checksum_label_verifier(zio_cksum_t *zcp, uint64_t offset)
+{
+ ZIO_SET_CHECKSUM(zcp, offset, 0, 0, 0);
+}
+
+/*
+ * Calls the template init function of a checksum which supports context
+ * templates and installs the template into the spa_t.
+ */
+static void
+zio_checksum_template_init(enum zio_checksum checksum, spa_t *spa)
+{
+ zio_checksum_info_t *ci = &zio_checksum_table[checksum];
+
+ if (ci->ci_tmpl_init == NULL)
+ return;
+ if (spa->spa_cksum_tmpls[checksum] != NULL)
+ return;
+
+ VERIFY(ci->ci_tmpl_free != NULL);
+ mutex_enter(&spa->spa_cksum_tmpls_lock);
+ if (spa->spa_cksum_tmpls[checksum] == NULL) {
+ spa->spa_cksum_tmpls[checksum] =
+ ci->ci_tmpl_init(&spa->spa_cksum_salt);
+ VERIFY(spa->spa_cksum_tmpls[checksum] != NULL);
+ }
+ mutex_exit(&spa->spa_cksum_tmpls_lock);
+}
+
+/*
+ * Generate the checksum.
+ */
+void
+zio_checksum_compute(zio_t *zio, enum zio_checksum checksum,
+ abd_t *abd, uint64_t size)
+{
+ blkptr_t *bp = zio->io_bp;
+ uint64_t offset = zio->io_offset;
+ zio_checksum_info_t *ci = &zio_checksum_table[checksum];
+ zio_cksum_t cksum;
+ spa_t *spa = zio->io_spa;
+
+ ASSERT((uint_t)checksum < ZIO_CHECKSUM_FUNCTIONS);
+ ASSERT(ci->ci_func[0] != NULL);
+
+ zio_checksum_template_init(checksum, spa);
+
+ if (ci->ci_flags & ZCHECKSUM_FLAG_EMBEDDED) {
+ zio_eck_t *eck;
+ void *data = abd_to_buf(abd);
+
+ if (checksum == ZIO_CHECKSUM_ZILOG2) {
+ zil_chain_t *zilc = data;
+
+ size = P2ROUNDUP_TYPED(zilc->zc_nused, ZIL_MIN_BLKSZ,
+ uint64_t);
+ eck = &zilc->zc_eck;
+ } else {
+ eck = (zio_eck_t *)((char *)data + size) - 1;
+ }
+ if (checksum == ZIO_CHECKSUM_GANG_HEADER)
+ zio_checksum_gang_verifier(&eck->zec_cksum, bp);
+ else if (checksum == ZIO_CHECKSUM_LABEL)
+ zio_checksum_label_verifier(&eck->zec_cksum, offset);
+ else
+ bp->blk_cksum = eck->zec_cksum;
+ eck->zec_magic = ZEC_MAGIC;
+ ci->ci_func[0](abd, size, spa->spa_cksum_tmpls[checksum],
+ &cksum);
+ eck->zec_cksum = cksum;
+ } else {
+ ci->ci_func[0](abd, size, spa->spa_cksum_tmpls[checksum],
+ &bp->blk_cksum);
+ }
+}
+
+int
+zio_checksum_error_impl(spa_t *spa, blkptr_t *bp, enum zio_checksum checksum,
+ abd_t *abd, uint64_t size, uint64_t offset, zio_bad_cksum_t *info)
+{
+ zio_checksum_info_t *ci = &zio_checksum_table[checksum];
+ zio_cksum_t actual_cksum, expected_cksum;
+ int byteswap;
+
+ if (checksum >= ZIO_CHECKSUM_FUNCTIONS || ci->ci_func[0] == NULL)
+ return (SET_ERROR(EINVAL));
+
+ zio_checksum_template_init(checksum, spa);
+
+ if (ci->ci_flags & ZCHECKSUM_FLAG_EMBEDDED) {
+ zio_eck_t *eck;
+ zio_cksum_t verifier;
+ uint64_t data_size = size;
+ void *data = abd_borrow_buf_copy(abd, data_size);
+
+ if (checksum == ZIO_CHECKSUM_ZILOG2) {
+ zil_chain_t *zilc = data;
+ uint64_t nused;
+
+ eck = &zilc->zc_eck;
+ if (eck->zec_magic == ZEC_MAGIC) {
+ nused = zilc->zc_nused;
+ } else if (eck->zec_magic == BSWAP_64(ZEC_MAGIC)) {
+ nused = BSWAP_64(zilc->zc_nused);
+ } else {
+ abd_return_buf(abd, data, data_size);
+ return (SET_ERROR(ECKSUM));
+ }
+
+ if (nused > data_size) {
+ abd_return_buf(abd, data, data_size);
+ return (SET_ERROR(ECKSUM));
+ }
+
+ size = P2ROUNDUP_TYPED(nused, ZIL_MIN_BLKSZ, uint64_t);
+ } else {
+ eck = (zio_eck_t *)((char *)data + data_size) - 1;
+ }
+
+ if (checksum == ZIO_CHECKSUM_GANG_HEADER)
+ zio_checksum_gang_verifier(&verifier, bp);
+ else if (checksum == ZIO_CHECKSUM_LABEL)
+ zio_checksum_label_verifier(&verifier, offset);
+ else
+ verifier = bp->blk_cksum;
+
+ byteswap = (eck->zec_magic == BSWAP_64(ZEC_MAGIC));
+
+ if (byteswap)
+ byteswap_uint64_array(&verifier, sizeof (zio_cksum_t));
+
+ size_t eck_offset = (size_t)(&eck->zec_cksum) - (size_t)data;
+ expected_cksum = eck->zec_cksum;
+ eck->zec_cksum = verifier;
+ abd_return_buf_copy(abd, data, data_size);
+
+ ci->ci_func[byteswap](abd, size,
+ spa->spa_cksum_tmpls[checksum], &actual_cksum);
+ abd_copy_from_buf_off(abd, &expected_cksum,
+ eck_offset, sizeof (zio_cksum_t));
+
+ if (byteswap) {
+ byteswap_uint64_array(&expected_cksum,
+ sizeof (zio_cksum_t));
+ }
+ } else {
+ byteswap = BP_SHOULD_BYTESWAP(bp);
+ expected_cksum = bp->blk_cksum;
+ ci->ci_func[byteswap](abd, size,
+ spa->spa_cksum_tmpls[checksum], &actual_cksum);
+ }
+
+ if (info != NULL) {
+ info->zbc_expected = expected_cksum;
+ info->zbc_actual = actual_cksum;
+ info->zbc_checksum_name = ci->ci_name;
+ info->zbc_byteswapped = byteswap;
+ info->zbc_injected = 0;
+ info->zbc_has_cksum = 1;
+ }
+
+ if (!ZIO_CHECKSUM_EQUAL(actual_cksum, expected_cksum))
+ return (SET_ERROR(ECKSUM));
+
+ return (0);
+}
+
+int
+zio_checksum_error(zio_t *zio, zio_bad_cksum_t *info)
+{
+ blkptr_t *bp = zio->io_bp;
+ uint_t checksum = (bp == NULL ? zio->io_prop.zp_checksum :
+ (BP_IS_GANG(bp) ? ZIO_CHECKSUM_GANG_HEADER : BP_GET_CHECKSUM(bp)));
+ int error;
+ uint64_t size = (bp == NULL ? zio->io_size :
+ (BP_IS_GANG(bp) ? SPA_GANGBLOCKSIZE : BP_GET_PSIZE(bp)));
+ uint64_t offset = zio->io_offset;
+ abd_t *data = zio->io_abd;
+ spa_t *spa = zio->io_spa;
+
+ error = zio_checksum_error_impl(spa, bp, checksum, data, size,
+ offset, info);
+
+ if (zio_injection_enabled && error == 0 && zio->io_error == 0) {
+ error = zio_handle_fault_injection(zio, ECKSUM);
+ if (error != 0)
+ info->zbc_injected = 1;
+ }
+
+ return (error);
+}
+
+/*
+ * Called by a spa_t that's about to be deallocated. This steps through
+ * all of the checksum context templates and deallocates any that were
+ * initialized using the algorithm-specific template init function.
+ */
+void
+zio_checksum_templates_free(spa_t *spa)
+{
+ for (enum zio_checksum checksum = 0;
+ checksum < ZIO_CHECKSUM_FUNCTIONS; checksum++) {
+ if (spa->spa_cksum_tmpls[checksum] != NULL) {
+ zio_checksum_info_t *ci = &zio_checksum_table[checksum];
+
+ VERIFY(ci->ci_tmpl_free != NULL);
+ ci->ci_tmpl_free(spa->spa_cksum_tmpls[checksum]);
+ spa->spa_cksum_tmpls[checksum] = NULL;
+ }
+ }
+}
diff --git a/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/zio_compress.c b/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/zio_compress.c
new file mode 100644
index 000000000000..b87303889ddb
--- /dev/null
+++ b/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/zio_compress.c
@@ -0,0 +1,215 @@
+/*
+ * CDDL HEADER START
+ *
+ * The contents of this file are subject to the terms of the
+ * Common Development and Distribution License (the "License").
+ * You may not use this file except in compliance with the License.
+ *
+ * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
+ * or http://www.opensolaris.org/os/licensing.
+ * See the License for the specific language governing permissions
+ * and limitations under the License.
+ *
+ * When distributing Covered Code, include this CDDL HEADER in each
+ * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
+ * If applicable, add the following below this CDDL HEADER, with the
+ * fields enclosed by brackets "[]" replaced with your own identifying
+ * information: Portions Copyright [yyyy] [name of copyright owner]
+ *
+ * CDDL HEADER END
+ */
+
+/*
+ * Copyright 2009 Sun Microsystems, Inc. All rights reserved.
+ * Use is subject to license terms.
+ */
+/*
+ * Copyright (c) 2013 by Saso Kiselkov. All rights reserved.
+ * Copyright (c) 2013, 2018 by Delphix. All rights reserved.
+ */
+
+#include <sys/zfs_context.h>
+#include <sys/compress.h>
+#include <sys/kstat.h>
+#include <sys/spa.h>
+#include <sys/zfeature.h>
+#include <sys/zio.h>
+#include <sys/zio_compress.h>
+
+typedef struct zcomp_stats {
+ kstat_named_t zcompstat_attempts;
+ kstat_named_t zcompstat_empty;
+ kstat_named_t zcompstat_skipped_insufficient_gain;
+} zcomp_stats_t;
+
+static zcomp_stats_t zcomp_stats = {
+ { "attempts", KSTAT_DATA_UINT64 },
+ { "empty", KSTAT_DATA_UINT64 },
+ { "skipped_insufficient_gain", KSTAT_DATA_UINT64 }
+};
+
+#define ZCOMPSTAT_INCR(stat, val) \
+ atomic_add_64(&zcomp_stats.stat.value.ui64, (val));
+
+#define ZCOMPSTAT_BUMP(stat) ZCOMPSTAT_INCR(stat, 1);
+
+kstat_t *zcomp_ksp;
+
+/*
+ * If nonzero, on average one in every zio_decompress_fail_fraction
+ * decompression attempts will fail, simulating an undetected memory error.
+ */
+uint64_t zio_decompress_fail_fraction = 0;
+
+/*
+ * Compression vectors.
+ */
+zio_compress_info_t zio_compress_table[ZIO_COMPRESS_FUNCTIONS] = {
+ {"inherit", 0, NULL, NULL},
+ {"on", 0, NULL, NULL},
+ {"uncompressed", 0, NULL, NULL},
+ {"lzjb", 0, lzjb_compress, lzjb_decompress},
+ {"empty", 0, NULL, NULL},
+ {"gzip-1", 1, gzip_compress, gzip_decompress},
+ {"gzip-2", 2, gzip_compress, gzip_decompress},
+ {"gzip-3", 3, gzip_compress, gzip_decompress},
+ {"gzip-4", 4, gzip_compress, gzip_decompress},
+ {"gzip-5", 5, gzip_compress, gzip_decompress},
+ {"gzip-6", 6, gzip_compress, gzip_decompress},
+ {"gzip-7", 7, gzip_compress, gzip_decompress},
+ {"gzip-8", 8, gzip_compress, gzip_decompress},
+ {"gzip-9", 9, gzip_compress, gzip_decompress},
+ {"zle", 64, zle_compress, zle_decompress},
+ {"lz4", 0, lz4_compress, lz4_decompress}
+};
+
+enum zio_compress
+zio_compress_select(spa_t *spa, enum zio_compress child,
+ enum zio_compress parent)
+{
+ enum zio_compress result;
+
+ ASSERT(child < ZIO_COMPRESS_FUNCTIONS);
+ ASSERT(parent < ZIO_COMPRESS_FUNCTIONS);
+ ASSERT(parent != ZIO_COMPRESS_INHERIT);
+
+ result = child;
+ if (result == ZIO_COMPRESS_INHERIT)
+ result = parent;
+
+ if (result == ZIO_COMPRESS_ON) {
+ if (spa_feature_is_active(spa, SPA_FEATURE_LZ4_COMPRESS))
+ result = ZIO_COMPRESS_LZ4_ON_VALUE;
+ else
+ result = ZIO_COMPRESS_LEGACY_ON_VALUE;
+ }
+
+ return (result);
+}
+
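+/*
+ * abd_iterate_func() callback: returning nonzero at the first nonzero
+ * word stops the iteration early, so a zero result from the whole
+ * iteration means the buffer contains only zeroes. The word-at-a-time
+ * scan assumes len is a multiple of sizeof (uint64_t), which holds for
+ * all ZFS block sizes.
+ */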
+/*ARGSUSED*/
+static int
+zio_compress_zeroed_cb(void *data, size_t len, void *private)
+{
+ uint64_t *end = (uint64_t *)((char *)data + len);
+ for (uint64_t *word = (uint64_t *)data; word < end; word++)
+ if (*word != 0)
+ return (1);
+
+ return (0);
+}
+
+size_t
+zio_compress_data(enum zio_compress c, abd_t *src, void *dst, size_t s_len)
+{
+ size_t c_len, d_len;
+ zio_compress_info_t *ci = &zio_compress_table[c];
+
+ ASSERT((uint_t)c < ZIO_COMPRESS_FUNCTIONS);
+ ASSERT((uint_t)c == ZIO_COMPRESS_EMPTY || ci->ci_compress != NULL);
+
+ ZCOMPSTAT_BUMP(zcompstat_attempts);
+
+ /*
+ * If the data is all zeroes, we don't even need to allocate
+ * a block for it. We indicate this by returning zero size.
+ */
+ if (abd_iterate_func(src, 0, s_len, zio_compress_zeroed_cb, NULL) == 0) {
+ ZCOMPSTAT_BUMP(zcompstat_empty);
+ return (0);
+ }
+
+ if (c == ZIO_COMPRESS_EMPTY)
+ return (s_len);
+
+ /* Compress at least 12.5%: cap the output at 7/8 of s_len */
+ d_len = s_len - (s_len >> 3);
+
+ /* No compression algorithm can read from ABDs directly */
+ void *tmp = abd_borrow_buf_copy(src, s_len);
+ c_len = ci->ci_compress(tmp, dst, s_len, d_len, ci->ci_level);
+ abd_return_buf(src, tmp, s_len);
+
+ if (c_len > d_len) {
+ ZCOMPSTAT_BUMP(zcompstat_skipped_insufficient_gain);
+ return (s_len);
+ }
+
+ ASSERT3U(c_len, <=, d_len);
+ return (c_len);
+}
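+
+/*
+ * Return-value summary for zio_compress_data() above (editor's note):
+ * 0 means "all zeroes, no block need be allocated"; s_len means "not
+ * enough gain, store uncompressed"; anything else is the compressed
+ * length, at most 7/8 of s_len. A caller might use it like this
+ * (sketch only):
+ *
+ * size_t c_len = zio_compress_data(ZIO_COMPRESS_LZ4, src, dst, s_len);
+ * if (c_len == 0)
+ * ... write a hole or an embedded zero block ...
+ * else if (c_len == s_len)
+ * ... store the data uncompressed ...
+ */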
+
+int
+zio_decompress_data_buf(enum zio_compress c, void *src, void *dst,
+ size_t s_len, size_t d_len)
+{
+ zio_compress_info_t *ci = &zio_compress_table[c];
+ if ((uint_t)c >= ZIO_COMPRESS_FUNCTIONS || ci->ci_decompress == NULL)
+ return (SET_ERROR(EINVAL));
+
+ return (ci->ci_decompress(src, dst, s_len, d_len, ci->ci_level));
+}
+
+int
+zio_decompress_data(enum zio_compress c, abd_t *src, void *dst,
+ size_t s_len, size_t d_len)
+{
+ void *tmp = abd_borrow_buf_copy(src, s_len);
+ int ret = zio_decompress_data_buf(c, tmp, dst, s_len, d_len);
+ abd_return_buf(src, tmp, s_len);
+
+ /*
+ * Decompression shouldn't fail, because we've already verified
+ * the checksum. However, for extra protection (e.g. against bitflips
+ * in non-ECC RAM), we handle this error (and test it).
+ */
+ ASSERT0(ret);
+ if (zio_decompress_fail_fraction != 0 &&
+ spa_get_random(zio_decompress_fail_fraction) == 0)
+ ret = SET_ERROR(EINVAL);
+
+ return (ret);
+}
+
+void
+zio_compress_init(void)
+{
+ zcomp_ksp = kstat_create("zfs", 0, "zcompstats", "misc",
+ KSTAT_TYPE_NAMED, sizeof (zcomp_stats) / sizeof (kstat_named_t),
+ KSTAT_FLAG_VIRTUAL);
+
+ if (zcomp_ksp != NULL) {
+ zcomp_ksp->ks_data = &zcomp_stats;
+ kstat_install(zcomp_ksp);
+ }
+}
+
+void
+zio_compress_fini(void)
+{
+ if (zcomp_ksp != NULL) {
+ kstat_delete(zcomp_ksp);
+ zcomp_ksp = NULL;
+ }
+}
diff --git a/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/zio_inject.c b/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/zio_inject.c
new file mode 100644
index 000000000000..26f59af9968f
--- /dev/null
+++ b/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/zio_inject.c
@@ -0,0 +1,755 @@
+/*
+ * CDDL HEADER START
+ *
+ * The contents of this file are subject to the terms of the
+ * Common Development and Distribution License (the "License").
+ * You may not use this file except in compliance with the License.
+ *
+ * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
+ * or http://www.opensolaris.org/os/licensing.
+ * See the License for the specific language governing permissions
+ * and limitations under the License.
+ *
+ * When distributing Covered Code, include this CDDL HEADER in each
+ * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
+ * If applicable, add the following below this CDDL HEADER, with the
+ * fields enclosed by brackets "[]" replaced with your own identifying
+ * information: Portions Copyright [yyyy] [name of copyright owner]
+ *
+ * CDDL HEADER END
+ */
+/*
+ * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved.
+ * Copyright (c) 2012, 2015 by Delphix. All rights reserved.
+ */
+
+/*
+ * ZFS fault injection
+ *
+ * To handle fault injection, we keep track of a series of zinject_record_t
+ * structures which describe which logical block(s) should be injected with a
+ * fault. These are kept in a global list. Each record corresponds to a given
+ * spa_t and maintains a special hold on the spa_t so that it cannot be deleted
+ * or exported while the injection record exists.
+ *
+ * Device level injection is done using the 'zi_guid' field. If this is set, it
+ * means that the error is destined for a particular device, not a piece of
+ * data.
+ *
+ * This is a rather poor data structure and algorithm, but we don't expect more
+ * than a few faults at any one time, so it should be sufficient for our needs.
+ */
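+
+/*
+ * Illustrative sketch (editor's addition; the pool name and numbers are
+ * hypothetical): a record that makes level-0 reads of object 5 in
+ * objset 21 fail with EIO, as matched by zio_match_handler() below:
+ *
+ * zinject_record_t zr = { 0 };
+ * int id;
+ *
+ * zr.zi_cmd = ZINJECT_DATA_FAULT;
+ * zr.zi_objset = 21;
+ * zr.zi_object = 5;
+ * zr.zi_start = 0;
+ * zr.zi_end = -1ULL; (match any block id)
+ * zr.zi_error = EIO;
+ * (void) zio_inject_fault("tank", 0, &id, &zr);
+ */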
+
+#include <sys/arc.h>
+#include <sys/zio_impl.h>
+#include <sys/zfs_ioctl.h>
+#include <sys/vdev_impl.h>
+#include <sys/dmu_objset.h>
+#include <sys/fs/zfs.h>
+
+uint32_t zio_injection_enabled;
+
+/*
+ * Data describing each zinject handler registered on the system; it also
+ * contains the list node linking the handler into the global zinject
+ * handler list.
+ */
+typedef struct inject_handler {
+ int zi_id;
+ spa_t *zi_spa;
+ zinject_record_t zi_record;
+ uint64_t *zi_lanes;
+ int zi_next_lane;
+ list_node_t zi_link;
+} inject_handler_t;
+
+/*
+ * List of all zinject handlers registered on the system, protected by
+ * the inject_lock defined below.
+ */
+static list_t inject_handlers;
+
+/*
+ * This protects insertion into, and traversal of, the inject handler
+ * list defined above, as well as the inject_delay_count. Any time a
+ * handler is inserted or removed from the list, this lock should be
+ * taken as a RW_WRITER; and any time traversal is done over the list
+ * (without modification to it) this lock should be taken as a RW_READER.
+ */
+static krwlock_t inject_lock;
+
+/*
+ * This holds the number of zinject delay handlers that have been
+ * registered on the system. It is protected by the inject_lock defined
+ * above. Thus, modifications to this count must be made while holding
+ * the inject_lock as RW_WRITER, and reads of this count while holding
+ * it as (at least) RW_READER.
+ */
+static int inject_delay_count = 0;
+
+/*
+ * This lock is used only in zio_handle_io_delay(), refer to the comment
+ * in that function for more details.
+ */
+static kmutex_t inject_delay_mtx;
+
+/*
+ * Used to assign unique identifying numbers to each new zinject handler.
+ */
+static int inject_next_id = 1;
+
+/*
+ * Returns true if the given record matches the I/O in progress.
+ */
+static boolean_t
+zio_match_handler(zbookmark_phys_t *zb, uint64_t type,
+ zinject_record_t *record, int error)
+{
+ /*
+ * Check for a match against the MOS, which is based on type
+ */
+ if (zb->zb_objset == DMU_META_OBJSET &&
+ record->zi_objset == DMU_META_OBJSET &&
+ record->zi_object == DMU_META_DNODE_OBJECT) {
+ if (record->zi_type == DMU_OT_NONE ||
+ type == record->zi_type)
+ return (record->zi_freq == 0 ||
+ spa_get_random(100) < record->zi_freq);
+ else
+ return (B_FALSE);
+ }
+
+ /*
+ * Check for an exact match.
+ */
+ if (zb->zb_objset == record->zi_objset &&
+ zb->zb_object == record->zi_object &&
+ zb->zb_level == record->zi_level &&
+ zb->zb_blkid >= record->zi_start &&
+ zb->zb_blkid <= record->zi_end &&
+ error == record->zi_error)
+ return (record->zi_freq == 0 ||
+ spa_get_random(100) < record->zi_freq);
+
+ return (B_FALSE);
+}
+
+/*
+ * Panic the system when a config change happens in the function
+ * specified by tag.
+ */
+void
+zio_handle_panic_injection(spa_t *spa, char *tag, uint64_t type)
+{
+ inject_handler_t *handler;
+
+ rw_enter(&inject_lock, RW_READER);
+
+ for (handler = list_head(&inject_handlers); handler != NULL;
+ handler = list_next(&inject_handlers, handler)) {
+
+ if (spa != handler->zi_spa)
+ continue;
+
+ if (handler->zi_record.zi_type == type &&
+ strcmp(tag, handler->zi_record.zi_func) == 0)
+ panic("Panic requested in function %s\n", tag);
+ }
+
+ rw_exit(&inject_lock);
+}
+
+/*
+ * Determine if the I/O in question should return failure. Returns the errno
+ * to be returned to the caller.
+ */
+int
+zio_handle_fault_injection(zio_t *zio, int error)
+{
+ int ret = 0;
+ inject_handler_t *handler;
+
+ /*
+ * Ignore I/O not associated with any logical data.
+ */
+ if (zio->io_logical == NULL)
+ return (0);
+
+ /*
+ * Currently, we only support fault injection on reads.
+ */
+ if (zio->io_type != ZIO_TYPE_READ)
+ return (0);
+
+ rw_enter(&inject_lock, RW_READER);
+
+ for (handler = list_head(&inject_handlers); handler != NULL;
+ handler = list_next(&inject_handlers, handler)) {
+
+ if (zio->io_spa != handler->zi_spa ||
+ handler->zi_record.zi_cmd != ZINJECT_DATA_FAULT)
+ continue;
+
+ /* If this handler matches, return EIO */
+ if (zio_match_handler(&zio->io_logical->io_bookmark,
+ zio->io_bp ? BP_GET_TYPE(zio->io_bp) : DMU_OT_NONE,
+ &handler->zi_record, error)) {
+ ret = error;
+ break;
+ }
+ }
+
+ rw_exit(&inject_lock);
+
+ return (ret);
+}
+
+/*
+ * Determine if the zio is part of a label update and has an injection
+ * handler associated with that portion of the label. Currently, we
+ * allow error injection in either the nvlist or the uberblock region
+ * of the vdev label.
+ */
+int
+zio_handle_label_injection(zio_t *zio, int error)
+{
+ inject_handler_t *handler;
+ vdev_t *vd = zio->io_vd;
+ uint64_t offset = zio->io_offset;
+ int label;
+ int ret = 0;
+
+ if (offset >= VDEV_LABEL_START_SIZE &&
+ offset < vd->vdev_psize - VDEV_LABEL_END_SIZE)
+ return (0);
+
+ rw_enter(&inject_lock, RW_READER);
+
+ for (handler = list_head(&inject_handlers); handler != NULL;
+ handler = list_next(&inject_handlers, handler)) {
+ uint64_t start = handler->zi_record.zi_start;
+ uint64_t end = handler->zi_record.zi_end;
+
+ if (handler->zi_record.zi_cmd != ZINJECT_LABEL_FAULT)
+ continue;
+
+ /*
+ * The injection region is given as relative offsets within a
+ * vdev label. We must determine which label is being
+ * updated and adjust our region accordingly.
+ */
+ label = vdev_label_number(vd->vdev_psize, offset);
+ start = vdev_label_offset(vd->vdev_psize, label, start);
+ end = vdev_label_offset(vd->vdev_psize, label, end);
+
+ if (zio->io_vd->vdev_guid == handler->zi_record.zi_guid &&
+ (offset >= start && offset <= end)) {
+ ret = error;
+ break;
+ }
+ }
+ rw_exit(&inject_lock);
+ return (ret);
+}
+
+int
+zio_handle_device_injection(vdev_t *vd, zio_t *zio, int error)
+{
+ inject_handler_t *handler;
+ int ret = 0;
+
+ /*
+ * We skip over faults in the labels unless it's during
+ * device open (i.e. zio == NULL).
+ */
+ if (zio != NULL) {
+ uint64_t offset = zio->io_offset;
+
+ if (offset < VDEV_LABEL_START_SIZE ||
+ offset >= vd->vdev_psize - VDEV_LABEL_END_SIZE)
+ return (0);
+ }
+
+ rw_enter(&inject_lock, RW_READER);
+
+ for (handler = list_head(&inject_handlers); handler != NULL;
+ handler = list_next(&inject_handlers, handler)) {
+
+ if (handler->zi_record.zi_cmd != ZINJECT_DEVICE_FAULT)
+ continue;
+
+ if (vd->vdev_guid == handler->zi_record.zi_guid) {
+ if (handler->zi_record.zi_failfast &&
+ (zio == NULL || (zio->io_flags &
+ (ZIO_FLAG_IO_RETRY | ZIO_FLAG_TRYHARD)))) {
+ continue;
+ }
+
+ /* Handle type specific I/O failures */
+ if (zio != NULL &&
+ handler->zi_record.zi_iotype != ZIO_TYPES &&
+ handler->zi_record.zi_iotype != zio->io_type)
+ continue;
+
+ if (handler->zi_record.zi_error == error) {
+ /*
+ * For a failed open, pretend like the device
+ * has gone away.
+ */
+ if (error == ENXIO)
+ vd->vdev_stat.vs_aux =
+ VDEV_AUX_OPEN_FAILED;
+
+ /*
+ * Treat these errors as if they had been
+ * retried so that all the appropriate stats
+ * and FMA events are generated.
+ */
+ if (!handler->zi_record.zi_failfast &&
+ zio != NULL)
+ zio->io_flags |= ZIO_FLAG_IO_RETRY;
+
+ ret = error;
+ break;
+ }
+ if (handler->zi_record.zi_error == ENXIO) {
+ ret = SET_ERROR(EIO);
+ break;
+ }
+ }
+ }
+
+ rw_exit(&inject_lock);
+
+ return (ret);
+}
+
+/*
+ * Simulate hardware that ignores cache flushes. For the requested number
+ * of seconds, nix the actual writing to disk.
+ */
+void
+zio_handle_ignored_writes(zio_t *zio)
+{
+ inject_handler_t *handler;
+
+ rw_enter(&inject_lock, RW_READER);
+
+ for (handler = list_head(&inject_handlers); handler != NULL;
+ handler = list_next(&inject_handlers, handler)) {
+
+ /* Ignore errors not destined for this pool */
+ if (zio->io_spa != handler->zi_spa ||
+ handler->zi_record.zi_cmd != ZINJECT_IGNORED_WRITES)
+ continue;
+
+ /*
+ * A positive duration is a number of seconds; a negative
+ * duration is a number of txgs.
+ */
+ if (handler->zi_record.zi_timer == 0) {
+ if (handler->zi_record.zi_duration > 0)
+ handler->zi_record.zi_timer = ddi_get_lbolt64();
+ else
+ handler->zi_record.zi_timer = zio->io_txg;
+ }
+
+ /* Have a "problem" writing 60% of the time */
+ if (spa_get_random(100) < 60)
+ zio->io_pipeline &= ~ZIO_VDEV_IO_STAGES;
+ break;
+ }
+
+ rw_exit(&inject_lock);
+}
+
+void
+spa_handle_ignored_writes(spa_t *spa)
+{
+ inject_handler_t *handler;
+
+ if (zio_injection_enabled == 0)
+ return;
+
+ rw_enter(&inject_lock, RW_READER);
+
+ for (handler = list_head(&inject_handlers); handler != NULL;
+ handler = list_next(&inject_handlers, handler)) {
+
+ if (spa != handler->zi_spa ||
+ handler->zi_record.zi_cmd != ZINJECT_IGNORED_WRITES)
+ continue;
+
+ if (handler->zi_record.zi_duration > 0) {
+ VERIFY(handler->zi_record.zi_timer == 0 ||
+ handler->zi_record.zi_timer +
+ handler->zi_record.zi_duration * hz >
+ ddi_get_lbolt64());
+ } else {
+ /* duration is negative so the subtraction here adds */
+ VERIFY(handler->zi_record.zi_timer == 0 ||
+ handler->zi_record.zi_timer -
+ handler->zi_record.zi_duration >=
+ spa_syncing_txg(spa));
+ }
+ }
+
+ rw_exit(&inject_lock);
+}
+
+hrtime_t
+zio_handle_io_delay(zio_t *zio)
+{
+ vdev_t *vd = zio->io_vd;
+ inject_handler_t *min_handler = NULL;
+ hrtime_t min_target = 0;
+
+ rw_enter(&inject_lock, RW_READER);
+
+ /*
+ * inject_delay_count is a subset of zio_injection_enabled that
+ * is only incremented for delay handlers. These checks are
+ * mainly added to remind the reader why we're not explicitly
+ * checking zio_injection_enabled like the other functions.
+ */
+ IMPLY(inject_delay_count > 0, zio_injection_enabled > 0);
+ IMPLY(zio_injection_enabled == 0, inject_delay_count == 0);
+
+ /*
+ * If there aren't any inject delay handlers registered, then we
+ * can short circuit and simply return 0 here. A value of zero
+ * informs zio_delay_interrupt() that this request should not be
+ * delayed. This short circuit keeps us from acquiring the
+ * inject_delay_mutex unnecessarily.
+ */
+ if (inject_delay_count == 0) {
+ rw_exit(&inject_lock);
+ return (0);
+ }
+
+ /*
+ * Each inject handler has a number of "lanes" associated with
+ * it. Each lane is able to handle requests independently of one
+ * another, and at a latency defined by the inject handler
+ * record's zi_timer field. Thus if a handler is configured with
+ * a single lane with a 10ms latency, it will delay requests
+ * such that only a single request is completed every 10ms. So,
+ * if more than one request is attempted per each 10ms interval,
+ * the average latency of the requests will be greater than
+ * 10ms; but if only a single request is submitted each 10ms
+ * interval the average latency will be 10ms.
+ *
+ * We need to acquire this mutex to prevent multiple concurrent
+ * threads being assigned to the same lane of a given inject
+ * handler. The mutex allows us to perform the following two
+ * operations atomically:
+ *
+ * 1. determine the minimum handler and minimum target
+ * value of all the possible handlers
+ * 2. update that minimum handler's lane array
+ *
+ * Without atomicity, two (or more) threads could pick the same
+ * lane in step (1), and then conflict with each other in step
+ * (2). This could allow a single lane handler to process
+ * multiple requests simultaneously, which shouldn't be possible.
+ */
+ mutex_enter(&inject_delay_mtx);
+
+ for (inject_handler_t *handler = list_head(&inject_handlers);
+ handler != NULL; handler = list_next(&inject_handlers, handler)) {
+ if (handler->zi_record.zi_cmd != ZINJECT_DELAY_IO)
+ continue;
+
+ if (vd->vdev_guid != handler->zi_record.zi_guid)
+ continue;
+
+ /*
+ * Defensive; should never happen as the array allocation
+ * occurs prior to inserting this handler on the list.
+ */
+ ASSERT3P(handler->zi_lanes, !=, NULL);
+
+ /*
+ * This should never happen, the zinject command should
+ * prevent a user from setting an IO delay with zero lanes.
+ */
+ ASSERT3U(handler->zi_record.zi_nlanes, !=, 0);
+
+ ASSERT3U(handler->zi_record.zi_nlanes, >,
+ handler->zi_next_lane);
+
+ /*
+ * We want to issue this IO to the lane that will become
+ * idle the soonest, so we compare the soonest this
+ * specific handler can complete the IO with all other
+ * handlers, to find the lowest value of all possible
+ * lanes. We then use this lane to submit the request.
+ *
+ * Since each handler has a constant value for its
+ * delay, we can just use the "next" lane for that
+ * handler; as it will always be the lane with the
+ * lowest value for that particular handler (i.e. the
+ * lane that will become idle the soonest). This saves a
+ * scan of each handler's lanes array.
+ *
+ * There are two cases to consider when determining when
+ * this specific IO request should complete. If this
+ * lane is idle, we want to "submit" the request now so
+ * it will complete after zi_timer milliseconds. Thus,
+ * we set the target to now + zi_timer.
+ *
+ * If the lane is busy, we want this request to complete
+ * zi_timer milliseconds after the lane becomes idle.
+ * Since the 'zi_lanes' array holds the time at which
+ * each lane will become idle, we use that value to
+ * determine when this request should complete.
+ */
+ hrtime_t idle = handler->zi_record.zi_timer + gethrtime();
+ hrtime_t busy = handler->zi_record.zi_timer +
+ handler->zi_lanes[handler->zi_next_lane];
+ hrtime_t target = MAX(idle, busy);
+
+ if (min_handler == NULL) {
+ min_handler = handler;
+ min_target = target;
+ continue;
+ }
+
+ ASSERT3P(min_handler, !=, NULL);
+ ASSERT3U(min_target, !=, 0);
+
+ /*
+ * We don't yet increment the "next lane" variable since
+ * we still might find a lower value lane in another
+ * handler during any remaining iterations. Once we're
+ * sure we've selected the absolute minimum, we'll claim
+ * the lane and increment the handler's "next lane"
+ * field below.
+ */
+
+ if (target < min_target) {
+ min_handler = handler;
+ min_target = target;
+ }
+ }
+
+ /*
+ * 'min_handler' will be NULL if no IO delays are registered for
+ * this vdev, otherwise it will point to the handler containing
+ * the lane that will become idle the soonest.
+ */
+ if (min_handler != NULL) {
+ ASSERT3U(min_target, !=, 0);
+ min_handler->zi_lanes[min_handler->zi_next_lane] = min_target;
+
+ /*
+ * If we've used all possible lanes for this handler,
+ * loop back and start using the first lane again;
+ * otherwise, just increment the lane index.
+ */
+ min_handler->zi_next_lane = (min_handler->zi_next_lane + 1) %
+ min_handler->zi_record.zi_nlanes;
+ }
+
+ mutex_exit(&inject_delay_mtx);
+ rw_exit(&inject_lock);
+
+ return (min_target);
+}
+
+/*
+ * Create a new handler for the given record. We add it to the list, adding
+ * a reference to the spa_t in the process. We increment zio_injection_enabled,
+ * which is the switch to trigger all fault injection.
+ */
+int
+zio_inject_fault(char *name, int flags, int *id, zinject_record_t *record)
+{
+ inject_handler_t *handler;
+ int error;
+ spa_t *spa;
+
+ /*
+ * If this is pool-wide metadata, make sure we unload the corresponding
+ * spa_t, so that the next attempt to load it will trigger the fault.
+ * We call spa_reset() to unload the pool appropriately.
+ */
+ if (flags & ZINJECT_UNLOAD_SPA)
+ if ((error = spa_reset(name)) != 0)
+ return (error);
+
+ if (record->zi_cmd == ZINJECT_DELAY_IO) {
+ /*
+ * A value of zero for the number of lanes or for the
+ * delay time doesn't make sense.
+ */
+ if (record->zi_timer == 0 || record->zi_nlanes == 0)
+ return (SET_ERROR(EINVAL));
+
+ /*
+ * The number of lanes is directly mapped to the size of
+ * an array used by the handler. Thus, to ensure the
+ * user doesn't trigger an allocation that's "too large"
+ * we cap the number of lanes here.
+ */
+ if (record->zi_nlanes >= UINT16_MAX)
+ return (SET_ERROR(EINVAL));
+ }
+
+ if (!(flags & ZINJECT_NULL)) {
+ /*
+ * spa_inject_addref() will add an injection reference, which will
+ * prevent the pool from being removed from the namespace while
+ * still allowing it to be unloaded.
+ */
+ if ((spa = spa_inject_addref(name)) == NULL)
+ return (SET_ERROR(ENOENT));
+
+ handler = kmem_alloc(sizeof (inject_handler_t), KM_SLEEP);
+
+ handler->zi_spa = spa;
+ handler->zi_record = *record;
+
+ if (handler->zi_record.zi_cmd == ZINJECT_DELAY_IO) {
+ handler->zi_lanes = kmem_zalloc(
+ sizeof (*handler->zi_lanes) *
+ handler->zi_record.zi_nlanes, KM_SLEEP);
+ handler->zi_next_lane = 0;
+ } else {
+ handler->zi_lanes = NULL;
+ handler->zi_next_lane = 0;
+ }
+
+ rw_enter(&inject_lock, RW_WRITER);
+
+ /*
+ * We can't move this increment into the conditional
+ * above because we need to hold the RW_WRITER lock of
+ * inject_lock, and we don't want to hold that while
+ * allocating the handler's zi_lanes array.
+ */
+ if (handler->zi_record.zi_cmd == ZINJECT_DELAY_IO) {
+ ASSERT3S(inject_delay_count, >=, 0);
+ inject_delay_count++;
+ ASSERT3S(inject_delay_count, >, 0);
+ }
+
+ *id = handler->zi_id = inject_next_id++;
+ list_insert_tail(&inject_handlers, handler);
+ atomic_inc_32(&zio_injection_enabled);
+
+ rw_exit(&inject_lock);
+ }
+
+ /*
+ * Flush the ARC, so that any attempts to read this data will end up
+ * going to the ZIO layer. Note that this is a little overkill, but
+ * we don't have the necessary ARC interfaces to do anything else, and
+ * fault injection isn't a performance critical path.
+ */
+ if (flags & ZINJECT_FLUSH_ARC)
+ /*
+ * We must use FALSE to ensure arc_flush returns, since
+ * we're not preventing concurrent ARC insertions.
+ */
+ arc_flush(NULL, FALSE);
+
+ return (0);
+}
+
+/*
+ * Returns the next record with an ID greater than that supplied to the
+ * function. Used to iterate over all handlers in the system.
+ */
+int
+zio_inject_list_next(int *id, char *name, size_t buflen,
+ zinject_record_t *record)
+{
+ inject_handler_t *handler;
+ int ret;
+
+ mutex_enter(&spa_namespace_lock);
+ rw_enter(&inject_lock, RW_READER);
+
+ for (handler = list_head(&inject_handlers); handler != NULL;
+ handler = list_next(&inject_handlers, handler))
+ if (handler->zi_id > *id)
+ break;
+
+ if (handler) {
+ *record = handler->zi_record;
+ *id = handler->zi_id;
+ (void) strncpy(name, spa_name(handler->zi_spa), buflen);
+ ret = 0;
+ } else {
+ ret = SET_ERROR(ENOENT);
+ }
+
+ rw_exit(&inject_lock);
+ mutex_exit(&spa_namespace_lock);
+
+ return (ret);
+}
+
+/*
+ * Clear the fault handler with the given identifier, or return ENOENT if none
+ * exists.
+ */
+int
+zio_clear_fault(int id)
+{
+ inject_handler_t *handler;
+
+ rw_enter(&inject_lock, RW_WRITER);
+
+ for (handler = list_head(&inject_handlers); handler != NULL;
+ handler = list_next(&inject_handlers, handler))
+ if (handler->zi_id == id)
+ break;
+
+ if (handler == NULL) {
+ rw_exit(&inject_lock);
+ return (SET_ERROR(ENOENT));
+ }
+
+ if (handler->zi_record.zi_cmd == ZINJECT_DELAY_IO) {
+ ASSERT3S(inject_delay_count, >, 0);
+ inject_delay_count--;
+ ASSERT3S(inject_delay_count, >=, 0);
+ }
+
+ list_remove(&inject_handlers, handler);
+ rw_exit(&inject_lock);
+
+ if (handler->zi_record.zi_cmd == ZINJECT_DELAY_IO) {
+ ASSERT3P(handler->zi_lanes, !=, NULL);
+ kmem_free(handler->zi_lanes, sizeof (*handler->zi_lanes) *
+ handler->zi_record.zi_nlanes);
+ } else {
+ ASSERT3P(handler->zi_lanes, ==, NULL);
+ }
+
+ spa_inject_delref(handler->zi_spa);
+ kmem_free(handler, sizeof (inject_handler_t));
+ atomic_dec_32(&zio_injection_enabled);
+
+ return (0);
+}
+
+void
+zio_inject_init(void)
+{
+ rw_init(&inject_lock, NULL, RW_DEFAULT, NULL);
+ mutex_init(&inject_delay_mtx, NULL, MUTEX_DEFAULT, NULL);
+ list_create(&inject_handlers, sizeof (inject_handler_t),
+ offsetof(inject_handler_t, zi_link));
+}
+
+void
+zio_inject_fini(void)
+{
+ list_destroy(&inject_handlers);
+ mutex_destroy(&inject_delay_mtx);
+ rw_destroy(&inject_lock);
+}
diff --git a/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/zle.c b/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/zle.c
new file mode 100644
index 000000000000..13c5673fbe26
--- /dev/null
+++ b/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/zle.c
@@ -0,0 +1,86 @@
+/*
+ * CDDL HEADER START
+ *
+ * The contents of this file are subject to the terms of the
+ * Common Development and Distribution License (the "License").
+ * You may not use this file except in compliance with the License.
+ *
+ * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
+ * or http://www.opensolaris.org/os/licensing.
+ * See the License for the specific language governing permissions
+ * and limitations under the License.
+ *
+ * When distributing Covered Code, include this CDDL HEADER in each
+ * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
+ * If applicable, add the following below this CDDL HEADER, with the
+ * fields enclosed by brackets "[]" replaced with your own identifying
+ * information: Portions Copyright [yyyy] [name of copyright owner]
+ *
+ * CDDL HEADER END
+ */
+
+/*
+ * Copyright 2009 Sun Microsystems, Inc. All rights reserved.
+ * Use is subject to license terms.
+ */
+
+/*
+ * Zero-length encoding. This is a fast and simple algorithm to eliminate
+ * runs of zeroes. Each chunk of compressed data begins with a length byte, b.
+ * If b < n (where n is the compression parameter) then the next b + 1 bytes
+ * are literal values. If b >= n then the next b - n + 1 bytes are zero.
+ */
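+/*
+ * Worked example (editor's addition) with n == 64, the setting used for
+ * "zle" in zio_compress_table: a run of ten zeroes is encoded as the
+ * single length byte b == 10 - 1 + 64 == 73, which decodes back to
+ * 73 + 1 - 64 == 10 zeroes; five literal bytes are encoded as the length
+ * byte b == 4 followed by the five bytes themselves.
+ */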
+#include <sys/types.h>
+#include <sys/sysmacros.h>
+
+size_t
+zle_compress(void *s_start, void *d_start, size_t s_len, size_t d_len, int n)
+{
+ uchar_t *src = s_start;
+ uchar_t *dst = d_start;
+ uchar_t *s_end = src + s_len;
+ uchar_t *d_end = dst + d_len;
+
+ while (src < s_end && dst < d_end - 1) {
+ uchar_t *first = src;
+ uchar_t *len = dst++;
+ if (src[0] == 0) {
+ uchar_t *last = src + (256 - n);
+ while (src < MIN(last, s_end) && src[0] == 0)
+ src++;
+ *len = src - first - 1 + n;
+ } else {
+ uchar_t *last = src + n;
+ if (d_end - dst < n)
+ break;
+ while (src < MIN(last, s_end) - 1 && (src[0] | src[1]))
+ *dst++ = *src++;
+ if (src[0])
+ *dst++ = *src++;
+ *len = src - first - 1;
+ }
+ }
+ return (src == s_end ? dst - (uchar_t *)d_start : s_len);
+}
+
+int
+zle_decompress(void *s_start, void *d_start, size_t s_len, size_t d_len, int n)
+{
+ uchar_t *src = s_start;
+ uchar_t *dst = d_start;
+ uchar_t *s_end = src + s_len;
+ uchar_t *d_end = dst + d_len;
+
+ while (src < s_end && dst < d_end) {
+ int len = 1 + *src++;
+ if (len <= n) {
+ while (len-- != 0)
+ *dst++ = *src++;
+ } else {
+ len -= n;
+ while (len-- != 0)
+ *dst++ = 0;
+ }
+ }
+ return (dst == d_end ? 0 : -1);
+}
diff --git a/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/zrlock.c b/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/zrlock.c
new file mode 100644
index 000000000000..ec333e54d2a8
--- /dev/null
+++ b/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/zrlock.c
@@ -0,0 +1,187 @@
+/*
+ * CDDL HEADER START
+ *
+ * The contents of this file are subject to the terms of the
+ * Common Development and Distribution License (the "License").
+ * You may not use this file except in compliance with the License.
+ *
+ * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
+ * or http://www.opensolaris.org/os/licensing.
+ * See the License for the specific language governing permissions
+ * and limitations under the License.
+ *
+ * When distributing Covered Code, include this CDDL HEADER in each
+ * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
+ * If applicable, add the following below this CDDL HEADER, with the
+ * fields enclosed by brackets "[]" replaced with your own identifying
+ * information: Portions Copyright [yyyy] [name of copyright owner]
+ *
+ * CDDL HEADER END
+ */
+/*
+ * Copyright (c) 2010, Oracle and/or its affiliates. All rights reserved.
+ * Copyright (c) 2014, 2015 by Delphix. All rights reserved.
+ * Copyright 2016 The MathWorks, Inc. All rights reserved.
+ */
+
+/*
+ * A Zero Reference Lock (ZRL) is a reference count that can lock out new
+ * references only when the count is zero and only without waiting if the count
+ * is not already zero. It is similar to a read-write lock in that it allows
+ * multiple readers and only a single writer, but it does not allow a writer to
+ * block while waiting for readers to exit, and therefore the question of
+ * reader/writer priority is moot (no WRWANT bit). Since the equivalent of
+ * rw_enter(&lock, RW_WRITER) is disallowed and only tryenter() is allowed, it
+ * is perfectly safe for the same reader to acquire the same lock multiple
+ * times. The fact that a ZRL is reentrant for readers (through multiple calls
+ * to zrl_add()) makes it convenient for determining whether something is
+ * actively referenced without the fuss of flagging lock ownership across
+ * function calls.
+ */
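+/*
+ * Typical usage (editor's sketch; zrl_add() is the public wrapper that
+ * supplies the caller name to zrl_add_impl() below):
+ *
+ * zrl_add(&zrl);
+ * ... use the actively referenced object ...
+ * zrl_remove(&zrl);
+ *
+ * and, for the exclusive side, which succeeds only at zero references:
+ *
+ * if (zrl_tryenter(&zrl)) {
+ * ... do exclusive work ...
+ * zrl_exit(&zrl);
+ * }
+ */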
+#include <sys/zrlock.h>
+
+/*
+ * A ZRL can be locked only while there are zero references, so ZRL_LOCKED is
+ * treated as zero references.
+ */
+#define ZRL_LOCKED -1
+#define ZRL_DESTROYED -2
+
+void
+zrl_init(zrlock_t *zrl)
+{
+ mutex_init(&zrl->zr_mtx, NULL, MUTEX_DEFAULT, NULL);
+ zrl->zr_refcount = 0;
+ cv_init(&zrl->zr_cv, NULL, CV_DEFAULT, NULL);
+#ifdef ZFS_DEBUG
+ zrl->zr_owner = NULL;
+ zrl->zr_caller = NULL;
+#endif
+}
+
+void
+zrl_destroy(zrlock_t *zrl)
+{
+ ASSERT0(zrl->zr_refcount);
+
+ mutex_destroy(&zrl->zr_mtx);
+ zrl->zr_refcount = ZRL_DESTROYED;
+ cv_destroy(&zrl->zr_cv);
+}
+
+void
+zrl_add_impl(zrlock_t *zrl, const char *zc)
+{
+ for (;;) {
+ uint32_t n = (uint32_t)zrl->zr_refcount;
+ while (n != ZRL_LOCKED) {
+ uint32_t cas = atomic_cas_32(
+ (uint32_t *)&zrl->zr_refcount, n, n + 1);
+ if (cas == n) {
+ ASSERT3S((int32_t)n, >=, 0);
+#ifdef ZFS_DEBUG
+ if (zrl->zr_owner == curthread) {
+ DTRACE_PROBE2(zrlock__reentry,
+ zrlock_t *, zrl, uint32_t, n);
+ }
+ zrl->zr_owner = curthread;
+ zrl->zr_caller = zc;
+#endif
+ return;
+ }
+ n = cas;
+ }
+
+ mutex_enter(&zrl->zr_mtx);
+ while (zrl->zr_refcount == ZRL_LOCKED) {
+ cv_wait(&zrl->zr_cv, &zrl->zr_mtx);
+ }
+ mutex_exit(&zrl->zr_mtx);
+ }
+}
+
+void
+zrl_remove(zrlock_t *zrl)
+{
+ uint32_t n;
+
+#ifdef ZFS_DEBUG
+ if (zrl->zr_owner == curthread) {
+ zrl->zr_owner = NULL;
+ zrl->zr_caller = NULL;
+ }
+#endif
+ n = atomic_dec_32_nv((uint32_t *)&zrl->zr_refcount);
+ ASSERT3S((int32_t)n, >=, 0);
+}
+
+int
+zrl_tryenter(zrlock_t *zrl)
+{
+ uint32_t n = (uint32_t)zrl->zr_refcount;
+
+ if (n == 0) {
+ uint32_t cas = atomic_cas_32(
+ (uint32_t *)&zrl->zr_refcount, 0, ZRL_LOCKED);
+ if (cas == 0) {
+#ifdef ZFS_DEBUG
+ ASSERT3P(zrl->zr_owner, ==, NULL);
+ zrl->zr_owner = curthread;
+#endif
+ return (1);
+ }
+ }
+
+ ASSERT3S((int32_t)n, >, ZRL_DESTROYED);
+
+ return (0);
+}
+
+void
+zrl_exit(zrlock_t *zrl)
+{
+ ASSERT3S(zrl->zr_refcount, ==, ZRL_LOCKED);
+
+ mutex_enter(&zrl->zr_mtx);
+#ifdef ZFS_DEBUG
+ ASSERT3P(zrl->zr_owner, ==, curthread);
+ zrl->zr_owner = NULL;
+ membar_producer(); /* make sure the owner store happens first */
+#endif
+ zrl->zr_refcount = 0;
+ cv_broadcast(&zrl->zr_cv);
+ mutex_exit(&zrl->zr_mtx);
+}
+
+int
+zrl_refcount(zrlock_t *zrl)
+{
+ ASSERT3S(zrl->zr_refcount, >, ZRL_DESTROYED);
+
+ int n = (int)zrl->zr_refcount;
+ return (n <= 0 ? 0 : n);
+}
+
+int
+zrl_is_zero(zrlock_t *zrl)
+{
+ ASSERT3S(zrl->zr_refcount, >, ZRL_DESTROYED);
+
+ return (zrl->zr_refcount <= 0);
+}
+
+int
+zrl_is_locked(zrlock_t *zrl)
+{
+ ASSERT3S(zrl->zr_refcount, >, ZRL_DESTROYED);
+
+ return (zrl->zr_refcount == ZRL_LOCKED);
+}
+
+#ifdef ZFS_DEBUG
+kthread_t *
+zrl_owner(zrlock_t *zrl)
+{
+ return (zrl->zr_owner);
+}
+#endif
diff --git a/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/zthr.c b/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/zthr.c
new file mode 100644
index 000000000000..76a9fa122b26
--- /dev/null
+++ b/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/zthr.c
@@ -0,0 +1,431 @@
+/*
+ * CDDL HEADER START
+ *
+ * This file and its contents are supplied under the terms of the
+ * Common Development and Distribution License ("CDDL"), version 1.0.
+ * You may only use this file in accordance with the terms of version
+ * 1.0 of the CDDL.
+ *
+ * A full copy of the text of the CDDL should have accompanied this
+ * source. A copy of the CDDL is also available via the Internet at
+ * http://www.illumos.org/license/CDDL.
+ *
+ * CDDL HEADER END
+ */
+
+/*
+ * Copyright (c) 2017, 2019 by Delphix. All rights reserved.
+ */
+
+/*
+ * ZTHR Infrastructure
+ * ===================
+ *
+ * ZTHR threads are used for isolated operations that span multiple txgs
+ * within a SPA. They generally exist from SPA creation/loading and until
+ * the SPA is exported/destroyed. The ideal requirements for an operation
+ * to be modeled with a zthr are the following:
+ *
+ * 1] The operation needs to run over multiple txgs.
+ * 2] There must be a single point of reference in memory or on disk that
+ * indicates whether the operation should run/is running or has
+ * stopped.
+ *
+ * If the operation satisfies the above then the following rules guarantee
+ * a certain level of correctness:
+ *
+ * 1] Any thread EXCEPT the zthr can change the work indicator from
+ * stopped to running, but not the opposite.
+ * 2] Only the zthr can change the work indicator from running to
+ * stopped (e.g. when it is done), but not the opposite.
+ *
+ * This way a normal zthr cycle should go like this:
+ *
+ * 1] An external thread changes the work indicator from stopped to
+ * running and wakes up the zthr.
+ * 2] The zthr wakes up, checks the indicator and starts working.
+ * 3] When the zthr is done, it changes the indicator to stopped, allowing
+ * a new cycle to start.
+ *
+ * Besides being awakened by other threads, a zthr can be configured
+ * during creation to wake up on its own after a specified interval
+ * [see zthr_create_timer()].
+ *
+ * Note: ZTHR threads are NOT a replacement for generic threads! Please
+ * ensure that they fit your use-case well before using them.
+ *
+ * == ZTHR creation
+ *
+ * Every zthr needs three inputs to start running:
+ *
+ * 1] A user-defined checker function (checkfunc) that decides whether
+ * the zthr should start working or go to sleep. The function should
+ * return TRUE when the zthr needs to work or FALSE to let it sleep,
+ * and should adhere to the following signature:
+ * boolean_t checkfunc_name(void *args, zthr_t *t);
+ *
+ * 2] A user-defined ZTHR function (func) which the zthr executes when
+ * it is not sleeping. The function should adhere to the following
+ * signature type:
+ * void func_name(void *args, zthr_t *t);
+ *
+ * 3] A void args pointer that will be passed to checkfunc and func
+ * implicitly by the infrastructure.
+ *
+ * The reason why the above API needs two different functions,
+ * instead of one that both checks and does the work, has to do with
+ * the zthr's internal state lock (zthr_state_lock) and the allowed
+ * cancellation windows. We want to hold the zthr_state_lock while
+ * running checkfunc but not while running func. This way the zthr
+ * can be cancelled while doing work and not while checking for work.
+ *
+ * To start a zthr:
+ * zthr_t *zthr_pointer = zthr_create(checkfunc, func, args);
+ * or
+ * zthr_t *zthr_pointer = zthr_create_timer(checkfunc, func,
+ * args, max_sleep);
+ *
+ * After that you should be able to wakeup, cancel, and resume the
+ * zthr from another thread using the zthr_pointer.
+ *
+ * NOTE: ZTHR threads could potentially wake up spuriously and the
+ * user should take this into account when writing a checkfunc.
+ * [see ZTHR state transitions]
+ *
+ * == ZTHR cancellation
+ *
+ * ZTHR threads must be cancelled when their SPA is being exported
+ * or when they need to be paused so they don't interfere with other
+ * operations.
+ *
+ * To cancel a zthr:
+ * zthr_cancel(zthr_pointer);
+ *
+ * To resume it:
+ * zthr_resume(zthr_pointer);
+ *
+ * A zthr will implicitly check if it has received a cancellation
+ * signal every time func returns and every time it wakes up [see
+ * ZTHR state transitions below].
+ *
+ * Waiting for the zthr's func to finish its job may take a long time,
+ * which can be a problem for operations that need to cancel the SPA's
+ * zthrs (e.g. spa_export). For this scenario
+ * the user can explicitly make their ZTHR function aware of incoming
+ * cancellation signals using zthr_iscancelled(). A common pattern for
+ * that looks like this:
+ *
+ * void
+ * func_name(void *args, zthr_t *t)
+ * {
+ * ... <unpack args> ...
+ * while (!work_done && !zthr_iscancelled(t)) {
+ * ... <do more work> ...
+ * }
+ * }
+ *
+ * == ZTHR cleanup
+ *
+ * Cancelling a zthr doesn't clean up its metadata (internal locks,
+ * function pointers to func and checkfunc, etc..). This is because
+ * we want to keep them around in case we want to resume the execution
+ * of the zthr later. Similarly for zthrs that exit themselves.
+ *
+ * To completely cleanup a zthr, cancel it first to ensure that it
+ * is not running and then use zthr_destroy().
+ *
+ * == ZTHR state transitions
+ *
+ *           zthr creation
+ *             +
+ *             |
+ *             |      woke up
+ *             |   +--------------+ sleep
+ *             |   |              ^
+ *             |   |              |
+ *             |   |              | FALSE
+ *             |   |              |
+ *             v   v       FALSE  +
+ *           cancelled? +---------> checkfunc?
+ *             +   ^                    +
+ *             |   |                    |
+ *             |   |                    | TRUE
+ *             |   |                    |
+ *             |   |    func returned   v
+ *             |   +------------------+ func
+ *             |
+ *             | TRUE
+ *             |
+ *             v
+ *    zthr stopped running
+ *
+ * == Implementation of ZTHR requests
+ *
+ * ZTHR wakeup, cancel, and resume are requests on a zthr to
+ * change its internal state. Requests on a zthr are serialized
+ * using the zthr_request_lock, while changes in its internal
+ * state are protected by the zthr_state_lock. A request will
+ * first acquire the zthr_request_lock and then immediately
+ * acquire the zthr_state_lock. We do this so that incoming
+ * requests are serialized using the request lock, while still
+ * allowing us to use the state lock for thread communication
+ * via zthr_cv.
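+ *
+ * Schematically, every request therefore follows the same pattern
+ * [cf. zthr_wakeup(), zthr_cancel(), and zthr_resume() below]:
+ *
+ *     mutex_enter(&t->zthr_request_lock);
+ *     mutex_enter(&t->zthr_state_lock);
+ *     ... <examine or update state, signal zthr_cv> ...
+ *     mutex_exit(&t->zthr_state_lock);
+ *     mutex_exit(&t->zthr_request_lock);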
+ */
+
+#include <sys/zfs_context.h>
+#include <sys/zthr.h>
+
+struct zthr {
+ /* running thread doing the work */
+ kthread_t *zthr_thread;
+
+ /* lock protecting internal data & invariants */
+ kmutex_t zthr_state_lock;
+
+ /* mutex that serializes external requests */
+ kmutex_t zthr_request_lock;
+
+ /* notification mechanism for requests */
+ kcondvar_t zthr_cv;
+
+ /* flag set to true if we are canceling the zthr */
+ boolean_t zthr_cancel;
+
+ /*
+ * maximum amount of time that the zthr spends sleeping;
+ * if this is 0, the thread doesn't wake up until it gets
+ * signaled.
+ */
+ hrtime_t zthr_wait_time;
+
+ /* consumer-provided callbacks & data */
+ zthr_checkfunc_t *zthr_checkfunc;
+ zthr_func_t *zthr_func;
+ void *zthr_arg;
+};
+
+static void
+zthr_procedure(void *arg)
+{
+ zthr_t *t = arg;
+
+ mutex_enter(&t->zthr_state_lock);
+ ASSERT3P(t->zthr_thread, ==, curthread);
+
+ while (!t->zthr_cancel) {
+ if (t->zthr_checkfunc(t->zthr_arg, t)) {
+ mutex_exit(&t->zthr_state_lock);
+ t->zthr_func(t->zthr_arg, t);
+ mutex_enter(&t->zthr_state_lock);
+ } else {
+ /* go to sleep */
+ if (t->zthr_wait_time == 0) {
+ cv_wait(&t->zthr_cv, &t->zthr_state_lock);
+ } else {
+ (void) cv_timedwait_hires(&t->zthr_cv,
+ &t->zthr_state_lock, t->zthr_wait_time,
+ MSEC2NSEC(1), 0);
+ }
+ }
+ }
+
+ /*
+ * Clear out the kernel thread metadata and notify the
+ * zthr_cancel() thread that we've stopped running.
+ */
+ t->zthr_thread = NULL;
+ t->zthr_cancel = B_FALSE;
+ cv_broadcast(&t->zthr_cv);
+
+ mutex_exit(&t->zthr_state_lock);
+ thread_exit();
+}
+
+zthr_t *
+zthr_create(zthr_checkfunc_t *checkfunc, zthr_func_t *func, void *arg)
+{
+ return (zthr_create_timer(checkfunc, func, arg, (hrtime_t)0));
+}
+
+/*
+ * Create a zthr with the specified maximum sleep time. If the time
+ * spent sleeping exceeds max_sleep, a wakeup is triggered: the zthr
+ * runs its checkfunc and starts working if required.
+ */
+zthr_t *
+zthr_create_timer(zthr_checkfunc_t *checkfunc, zthr_func_t *func,
+ void *arg, hrtime_t max_sleep)
+{
+ zthr_t *t = kmem_zalloc(sizeof (*t), KM_SLEEP);
+ mutex_init(&t->zthr_state_lock, NULL, MUTEX_DEFAULT, NULL);
+ mutex_init(&t->zthr_request_lock, NULL, MUTEX_DEFAULT, NULL);
+ cv_init(&t->zthr_cv, NULL, CV_DEFAULT, NULL);
+
+ mutex_enter(&t->zthr_state_lock);
+ t->zthr_checkfunc = checkfunc;
+ t->zthr_func = func;
+ t->zthr_arg = arg;
+ t->zthr_wait_time = max_sleep;
+
+ t->zthr_thread = thread_create(NULL, 0, zthr_procedure, t,
+ 0, &p0, TS_RUN, minclsyspri);
+ mutex_exit(&t->zthr_state_lock);
+
+ return (t);
+}
+
+void
+zthr_destroy(zthr_t *t)
+{
+ ASSERT(!MUTEX_HELD(&t->zthr_state_lock));
+ ASSERT(!MUTEX_HELD(&t->zthr_request_lock));
+ VERIFY3P(t->zthr_thread, ==, NULL);
+ mutex_destroy(&t->zthr_request_lock);
+ mutex_destroy(&t->zthr_state_lock);
+ cv_destroy(&t->zthr_cv);
+ kmem_free(t, sizeof (*t));
+}
+
+/*
+ * Wake up the zthr if it is sleeping. If the thread has been
+ * cancelled, this is a no-op.
+ */
+void
+zthr_wakeup(zthr_t *t)
+{
+ mutex_enter(&t->zthr_request_lock);
+ mutex_enter(&t->zthr_state_lock);
+
+ /*
+ * There are 4 states that we can find the zthr in when issuing
+ * this broadcast:
+ *
+ * [1] The common case of the thread being asleep, at which
+ * point the broadcast will wake it up.
+ * [2] The thread has been cancelled. Waking up a cancelled
+ * thread is a no-op. Any work that is still left to be
+ * done should be handled the next time the thread is
+ * resumed.
+ * [3] The thread is doing work and is already up, so this
+ * is basically a no-op.
+ * [4] The thread was just created/resumed, in which case the
+ * behavior is similar to [3].
+ */
+ cv_broadcast(&t->zthr_cv);
+
+ mutex_exit(&t->zthr_state_lock);
+ mutex_exit(&t->zthr_request_lock);
+}
+
+/*
+ * Sends a cancel request to the zthr and blocks until the zthr is
+ * cancelled. If the zthr is not running (e.g. has been cancelled
+ * already), this is a no-op.
+ */
+void
+zthr_cancel(zthr_t *t)
+{
+ mutex_enter(&t->zthr_request_lock);
+ mutex_enter(&t->zthr_state_lock);
+
+ /*
+ * Since we are holding the zthr_state_lock at this point,
+ * we can find the zthr in one of the following 4 states:
+ *
+ * [1] The thread has already been cancelled, therefore
+ * there is nothing for us to do.
+ * [2] The thread is sleeping, so we set the cancel flag,
+ * broadcast the CV to wake it up, and then wait for
+ * it to exit.
+ * [3] The thread is doing work, in which case we just set
+ * the flag and wait for it to finish.
+ * [4] The thread was just created/resumed, in which case
+ * the behavior is similar to [3].
+ *
+ * Since requests are serialized, by the time that we get
+ * control back we expect that the zthr is cancelled and
+ * not running anymore.
+ */
+ if (t->zthr_thread != NULL) {
+ t->zthr_cancel = B_TRUE;
+
+ /* broadcast in case the zthr is sleeping */
+ cv_broadcast(&t->zthr_cv);
+
+ while (t->zthr_thread != NULL)
+ cv_wait(&t->zthr_cv, &t->zthr_state_lock);
+
+ ASSERT(!t->zthr_cancel);
+ }
+
+ mutex_exit(&t->zthr_state_lock);
+ mutex_exit(&t->zthr_request_lock);
+}
+
+/*
+ * Sends a resume request to the supplied zthr. If the zthr is
+ * already running this is a no-op.
+ */
+void
+zthr_resume(zthr_t *t)
+{
+ mutex_enter(&t->zthr_request_lock);
+ mutex_enter(&t->zthr_state_lock);
+
+ ASSERT3P(t->zthr_checkfunc, !=, NULL);
+ ASSERT3P(t->zthr_func, !=, NULL);
+ ASSERT(!t->zthr_cancel);
+
+ /*
+ * There are 4 states that we find the zthr in at this point
+ * given the locks that we hold:
+ *
+ * [1] The zthr was cancelled, so we spawn a new thread for
+ * the zthr (common case).
+ * [2] The zthr is running at which point this is a no-op.
+ * [3] The zthr is sleeping at which point this is a no-op.
+ * [4] The zthr was just spawned at which point this is a
+ * no-op.
+ */
+ if (t->zthr_thread == NULL) {
+ t->zthr_thread = thread_create(NULL, 0, zthr_procedure, t,
+ 0, &p0, TS_RUN, minclsyspri);
+ }
+
+ mutex_exit(&t->zthr_state_lock);
+ mutex_exit(&t->zthr_request_lock);
+}
+
+/*
+ * This function is intended to be used by the zthr itself
+ * (specifically the zthr_func callback provided) to check
+ * if another thread has signaled it to stop running before
+ * doing some expensive operation.
+ *
+ * Returns TRUE if we are in the middle of trying to cancel
+ * this thread, and FALSE otherwise.
+ */
+boolean_t
+zthr_iscancelled(zthr_t *t)
+{
+ ASSERT3P(t->zthr_thread, ==, curthread);
+
+ /*
+ * The majority of the functions here grab zthr_request_lock
+ * first and then zthr_state_lock. This function only grabs
+ * the zthr_state_lock. That is because this function should
+ * only be called from the zthr_func to check if someone has
+ * issued a zthr_cancel() on the thread. If there is a zthr_cancel()
+ * happening concurrently, attempting to grab the request lock
+ * here would result in a deadlock.
+ *
+ * By grabbing only the zthr_state_lock this function is allowed
+ * to run concurrently with a zthr_cancel() request.
+ */
+ mutex_enter(&t->zthr_state_lock);
+ boolean_t cancelled = t->zthr_cancel;
+ mutex_exit(&t->zthr_state_lock);
+ return (cancelled);
+}
diff --git a/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/zvol.c b/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/zvol.c
new file mode 100644
index 000000000000..f68670c956a1
--- /dev/null
+++ b/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/zvol.c
@@ -0,0 +1,3347 @@
+/*
+ * CDDL HEADER START
+ *
+ * The contents of this file are subject to the terms of the
+ * Common Development and Distribution License (the "License").
+ * You may not use this file except in compliance with the License.
+ *
+ * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
+ * or http://www.opensolaris.org/os/licensing.
+ * See the License for the specific language governing permissions
+ * and limitations under the License.
+ *
+ * When distributing Covered Code, include this CDDL HEADER in each
+ * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
+ * If applicable, add the following below this CDDL HEADER, with the
+ * fields enclosed by brackets "[]" replaced with your own identifying
+ * information: Portions Copyright [yyyy] [name of copyright owner]
+ *
+ * CDDL HEADER END
+ */
+/*
+ * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved.
+ *
+ * Copyright (c) 2006-2010 Pawel Jakub Dawidek <pjd@FreeBSD.org>
+ * All rights reserved.
+ *
+ * Portions Copyright 2010 Robert Milkowski
+ *
+ * Copyright 2017 Nexenta Systems, Inc. All rights reserved.
+ * Copyright (c) 2012, 2017 by Delphix. All rights reserved.
+ * Copyright (c) 2013, Joyent, Inc. All rights reserved.
+ * Copyright (c) 2014 Integros [integros.com]
+ * Copyright (c) 2016 Actifio, Inc. All rights reserved.
+ */
+
+/* Portions Copyright 2011 Martin Matuska <mm@FreeBSD.org> */
+
+/*
+ * ZFS volume emulation driver.
+ *
+ * Makes a DMU object look like a volume of arbitrary size, up to 2^64 bytes.
+ * Volumes are accessed through the symbolic links named:
+ *
+ * /dev/zvol/dsk/<pool_name>/<dataset_name>
+ * /dev/zvol/rdsk/<pool_name>/<dataset_name>
+ *
+ * These links are created by the /dev filesystem (sdev_zvolops.c).
+ * Volumes are persistent through reboot. No user command needs to be
+ * run before opening and using a device.
+ *
+ * FreeBSD notes.
+ * On FreeBSD ZVOLs are simply GEOM providers like any other storage device
+ * in the system.
+ */
+
+#include <sys/types.h>
+#include <sys/param.h>
+#include <sys/kernel.h>
+#include <sys/errno.h>
+#include <sys/uio.h>
+#include <sys/bio.h>
+#include <sys/buf.h>
+#include <sys/kmem.h>
+#include <sys/conf.h>
+#include <sys/cmn_err.h>
+#include <sys/stat.h>
+#include <sys/zap.h>
+#include <sys/spa.h>
+#include <sys/spa_impl.h>
+#include <sys/zio.h>
+#include <sys/disk.h>
+#include <sys/dmu_traverse.h>
+#include <sys/dnode.h>
+#include <sys/dsl_dataset.h>
+#include <sys/dsl_prop.h>
+#include <sys/dkio.h>
+#include <sys/byteorder.h>
+#include <sys/sunddi.h>
+#include <sys/dirent.h>
+#include <sys/policy.h>
+#include <sys/queue.h>
+#include <sys/fs/zfs.h>
+#include <sys/zfs_ioctl.h>
+#include <sys/zil.h>
+#include <sys/refcount.h>
+#include <sys/zfs_znode.h>
+#include <sys/zfs_rlock.h>
+#include <sys/vdev_impl.h>
+#include <sys/vdev_raidz.h>
+#include <sys/zvol.h>
+#include <sys/zil_impl.h>
+#include <sys/dbuf.h>
+#include <sys/dmu_tx.h>
+#include <sys/zfeature.h>
+#include <sys/zio_checksum.h>
+#include <sys/filio.h>
+
+#include <geom/geom.h>
+
+#include "zfs_namecheck.h"
+
+#ifndef illumos
+struct g_class zfs_zvol_class = {
+ .name = "ZFS::ZVOL",
+ .version = G_VERSION,
+};
+
+DECLARE_GEOM_CLASS(zfs_zvol_class, zfs_zvol);
+
+#endif
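+/* Soft-state repository for device minors; see zfsdev_get_soft_state(). */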
+void *zfsdev_state;
+static char *zvol_tag = "zvol_tag";
+
+#define ZVOL_DUMPSIZE "dumpsize"
+
+/*
+ * This lock protects the zfsdev_state structure from being modified
+ * while it's being used, e.g. an open that comes in before a create
+ * finishes. It also protects temporary opens of the dataset so that,
+ * e.g., an open doesn't get a spurious EBUSY.
+ */
+#ifdef illumos
+kmutex_t zfsdev_state_lock;
+#else
+/*
+ * In FreeBSD we've replaced the upstream zfsdev_state_lock with the
+ * spa_namespace_lock in the ZVOL code.
+ */
+#define zfsdev_state_lock spa_namespace_lock
+#endif
+static uint32_t zvol_minors;
+
+#ifndef illumos
+SYSCTL_DECL(_vfs_zfs);
+SYSCTL_NODE(_vfs_zfs, OID_AUTO, vol, CTLFLAG_RW | CTLFLAG_MPSAFE, 0,
+ "ZFS VOLUME");
+static int volmode = ZFS_VOLMODE_GEOM;
+SYSCTL_INT(_vfs_zfs_vol, OID_AUTO, mode, CTLFLAG_RWTUN, &volmode, 0,
+ "Expose as GEOM providers (1), device files (2) or neither");
+static boolean_t zpool_on_zvol = B_FALSE;
+SYSCTL_INT(_vfs_zfs_vol, OID_AUTO, recursive, CTLFLAG_RWTUN, &zpool_on_zvol, 0,
+ "Allow zpools to use zvols as vdevs (DANGEROUS)");
+
+#endif
+typedef struct zvol_extent {
+ list_node_t ze_node;
+ dva_t ze_dva; /* dva associated with this extent */
+ uint64_t ze_nblks; /* number of blocks in extent */
+} zvol_extent_t;
+
+/*
+ * The in-core state of each volume.
+ */
+typedef struct zvol_state {
+#ifndef illumos
+ LIST_ENTRY(zvol_state) zv_links;
+#endif
+ char zv_name[MAXPATHLEN]; /* pool/dd name */
+ uint64_t zv_volsize; /* amount of space we advertise */
+ uint64_t zv_volblocksize; /* volume block size */
+#ifdef illumos
+ minor_t zv_minor; /* minor number */
+#else
+ struct cdev *zv_dev; /* non-GEOM device */
+ struct g_provider *zv_provider; /* GEOM provider */
+#endif
+ uint8_t zv_min_bs; /* minimum addressable block shift */
+ uint8_t zv_flags; /* readonly, dumpified, etc. */
+ objset_t *zv_objset; /* objset handle */
+#ifdef illumos
+ uint32_t zv_open_count[OTYPCNT]; /* open counts */
+#endif
+ uint32_t zv_total_opens; /* total open count */
+ uint32_t zv_sync_cnt; /* synchronous open count */
+ zilog_t *zv_zilog; /* ZIL handle */
+ list_t zv_extents; /* List of extents for dump */
+ rangelock_t zv_rangelock;
+ dnode_t *zv_dn; /* dnode hold */
+#ifndef illumos
+ int zv_state;
+ int zv_volmode; /* Provide GEOM or cdev */
+ struct bio_queue_head zv_queue;
+ struct mtx zv_queue_mtx; /* zv_queue mutex */
+#endif
+} zvol_state_t;
+
+typedef enum {
+ ZVOL_ASYNC_CREATE_MINORS,
+ ZVOL_ASYNC_REMOVE_MINORS,
+ ZVOL_ASYNC_RENAME_MINORS,
+ ZVOL_ASYNC_MAX
+} zvol_async_op_t;
+
+typedef struct {
+ zvol_async_op_t op;
+ char pool[ZFS_MAX_DATASET_NAME_LEN];
+ char name1[ZFS_MAX_DATASET_NAME_LEN];
+ char name2[ZFS_MAX_DATASET_NAME_LEN];
+} zvol_task_t;
+
+#ifndef illumos
+static LIST_HEAD(, zvol_state) all_zvols;
+#endif
+/*
+ * zvol specific flags
+ */
+#define ZVOL_RDONLY 0x1
+#define ZVOL_DUMPIFIED 0x2
+#define ZVOL_EXCL 0x4
+#define ZVOL_WCE 0x8
+
+/*
+ * zvol maximum transfer in one DMU tx.
+ */
+int zvol_maxphys = DMU_MAX_ACCESS/2;
+
+/*
+ * Toggle unmap functionality.
+ */
+boolean_t zvol_unmap_enabled = B_TRUE;
+
+/*
+ * If true, unmaps requested as synchronous are executed synchronously,
+ * otherwise all unmaps are asynchronous.
+ */
+boolean_t zvol_unmap_sync_enabled = B_FALSE;
+
+#ifndef illumos
+SYSCTL_INT(_vfs_zfs_vol, OID_AUTO, unmap_enabled, CTLFLAG_RWTUN,
+ &zvol_unmap_enabled, 0,
+ "Enable UNMAP functionality");
+
+SYSCTL_INT(_vfs_zfs_vol, OID_AUTO, unmap_sync_enabled, CTLFLAG_RWTUN,
+ &zvol_unmap_sync_enabled, 0,
+ "UNMAPs requested as sync are executed synchronously");
+
+static d_open_t zvol_d_open;
+static d_close_t zvol_d_close;
+static d_read_t zvol_read;
+static d_write_t zvol_write;
+static d_ioctl_t zvol_d_ioctl;
+static d_strategy_t zvol_strategy;
+
+static struct cdevsw zvol_cdevsw = {
+ .d_version = D_VERSION,
+ .d_open = zvol_d_open,
+ .d_close = zvol_d_close,
+ .d_read = zvol_read,
+ .d_write = zvol_write,
+ .d_ioctl = zvol_d_ioctl,
+ .d_strategy = zvol_strategy,
+ .d_name = "zvol",
+ .d_flags = D_DISK | D_TRACKCLOSE,
+};
+
+static void zvol_geom_run(zvol_state_t *zv);
+static void zvol_geom_destroy(zvol_state_t *zv);
+static int zvol_geom_access(struct g_provider *pp, int acr, int acw, int ace);
+static void zvol_geom_start(struct bio *bp);
+static void zvol_geom_worker(void *arg);
+static void zvol_log_truncate(zvol_state_t *zv, dmu_tx_t *tx, uint64_t off,
+ uint64_t len, boolean_t sync);
+#endif /* !illumos */
+
+extern int zfs_set_prop_nvlist(const char *, zprop_source_t,
+ nvlist_t *, nvlist_t *);
+static int zvol_remove_zv(zvol_state_t *);
+static int zvol_get_data(void *arg, lr_write_t *lr, char *buf,
+ struct lwb *lwb, zio_t *zio);
+static int zvol_dumpify(zvol_state_t *zv);
+static int zvol_dump_fini(zvol_state_t *zv);
+static int zvol_dump_init(zvol_state_t *zv, boolean_t resize);
+
+static void
+zvol_size_changed(zvol_state_t *zv, uint64_t volsize)
+{
+#ifdef illumos
+ dev_t dev = makedevice(ddi_driver_major(zfs_dip), zv->zv_minor);
+
+ zv->zv_volsize = volsize;
+ VERIFY(ddi_prop_update_int64(dev, zfs_dip,
+ "Size", volsize) == DDI_SUCCESS);
+ VERIFY(ddi_prop_update_int64(dev, zfs_dip,
+ "Nblocks", lbtodb(volsize)) == DDI_SUCCESS);
+
+ /* Notify specfs to invalidate the cached size */
+ spec_size_invalidate(dev, VBLK);
+ spec_size_invalidate(dev, VCHR);
+#else /* !illumos */
+ zv->zv_volsize = volsize;
+ if (zv->zv_volmode == ZFS_VOLMODE_GEOM) {
+ struct g_provider *pp;
+
+ pp = zv->zv_provider;
+ if (pp == NULL)
+ return;
+ g_topology_lock();
+
+ /*
+ * Do not invoke a resize event when the initial size was zero;
+ * ZVOL initializes the size on first open, so this is not a
+ * real resize.
+ */
+ if (pp->mediasize == 0)
+ pp->mediasize = zv->zv_volsize;
+ else
+ g_resize_provider(pp, zv->zv_volsize);
+ g_topology_unlock();
+ }
+#endif /* illumos */
+}
+
+int
+zvol_check_volsize(uint64_t volsize, uint64_t blocksize)
+{
+ if (volsize == 0)
+ return (SET_ERROR(EINVAL));
+
+ if (volsize % blocksize != 0)
+ return (SET_ERROR(EINVAL));
+
+#ifdef _ILP32
+ if (volsize - 1 > SPEC_MAXOFFSET_T)
+ return (SET_ERROR(EOVERFLOW));
+#endif
+ return (0);
+}
+
+int
+zvol_check_volblocksize(uint64_t volblocksize)
+{
+ if (volblocksize < SPA_MINBLOCKSIZE ||
+ volblocksize > SPA_OLD_MAXBLOCKSIZE ||
+ !ISP2(volblocksize))
+ return (SET_ERROR(EDOM));
+
+ return (0);
+}
+
+int
+zvol_get_stats(objset_t *os, nvlist_t *nv)
+{
+ int error;
+ dmu_object_info_t doi;
+ uint64_t val;
+
+ error = zap_lookup(os, ZVOL_ZAP_OBJ, "size", 8, 1, &val);
+ if (error)
+ return (error);
+
+ dsl_prop_nvlist_add_uint64(nv, ZFS_PROP_VOLSIZE, val);
+
+ error = dmu_object_info(os, ZVOL_OBJ, &doi);
+
+ if (error == 0) {
+ dsl_prop_nvlist_add_uint64(nv, ZFS_PROP_VOLBLOCKSIZE,
+ doi.doi_data_block_size);
+ }
+
+ return (error);
+}
+
+static zvol_state_t *
+zvol_minor_lookup(const char *name)
+{
+#ifdef illumos
+ minor_t minor;
+#endif
+ zvol_state_t *zv;
+
+ ASSERT(MUTEX_HELD(&zfsdev_state_lock));
+
+#ifdef illumos
+ for (minor = 1; minor <= ZFSDEV_MAX_MINOR; minor++) {
+ zv = zfsdev_get_soft_state(minor, ZSST_ZVOL);
+ if (zv == NULL)
+ continue;
+#else
+ LIST_FOREACH(zv, &all_zvols, zv_links) {
+#endif
+ if (strcmp(zv->zv_name, name) == 0)
+ return (zv);
+ }
+
+ return (NULL);
+}
+
+/* extent mapping arg */
+struct maparg {
+ zvol_state_t *ma_zv;
+ uint64_t ma_blks;
+};
+
+/*ARGSUSED*/
+static int
+zvol_map_block(spa_t *spa, zilog_t *zilog, const blkptr_t *bp,
+ const zbookmark_phys_t *zb, const dnode_phys_t *dnp, void *arg)
+{
+ struct maparg *ma = arg;
+ zvol_extent_t *ze;
+ int bs = ma->ma_zv->zv_volblocksize;
+
+ if (bp == NULL || BP_IS_HOLE(bp) ||
+ zb->zb_object != ZVOL_OBJ || zb->zb_level != 0)
+ return (0);
+
+ VERIFY(!BP_IS_EMBEDDED(bp));
+
+ VERIFY3U(ma->ma_blks, ==, zb->zb_blkid);
+ ma->ma_blks++;
+
+ /* Abort immediately if we have encountered gang blocks */
+ if (BP_IS_GANG(bp))
+ return (SET_ERROR(EFRAGS));
+
+ /*
+ * See if the block is at the end of the previous extent.
+ */
+ ze = list_tail(&ma->ma_zv->zv_extents);
+ if (ze &&
+ DVA_GET_VDEV(BP_IDENTITY(bp)) == DVA_GET_VDEV(&ze->ze_dva) &&
+ DVA_GET_OFFSET(BP_IDENTITY(bp)) ==
+ DVA_GET_OFFSET(&ze->ze_dva) + ze->ze_nblks * bs) {
+ ze->ze_nblks++;
+ return (0);
+ }
+
+ dprintf_bp(bp, "%s", "next blkptr:");
+
+ /* start a new extent */
+ ze = kmem_zalloc(sizeof (zvol_extent_t), KM_SLEEP);
+ ze->ze_dva = bp->blk_dva[0]; /* structure assignment */
+ ze->ze_nblks = 1;
+ list_insert_tail(&ma->ma_zv->zv_extents, ze);
+ return (0);
+}
+
+static void
+zvol_free_extents(zvol_state_t *zv)
+{
+ zvol_extent_t *ze;
+
+ while ((ze = list_head(&zv->zv_extents)) != NULL) {
+ list_remove(&zv->zv_extents, ze);
+ kmem_free(ze, sizeof (zvol_extent_t));
+ }
+}
+
+static int
+zvol_get_lbas(zvol_state_t *zv)
+{
+ objset_t *os = zv->zv_objset;
+ struct maparg ma;
+ int err;
+
+ ma.ma_zv = zv;
+ ma.ma_blks = 0;
+ zvol_free_extents(zv);
+
+ /* commit any in-flight changes before traversing the dataset */
+ txg_wait_synced(dmu_objset_pool(os), 0);
+ err = traverse_dataset(dmu_objset_ds(os), 0,
+ TRAVERSE_PRE | TRAVERSE_PREFETCH_METADATA, zvol_map_block, &ma);
+ if (err || ma.ma_blks != (zv->zv_volsize / zv->zv_volblocksize)) {
+ zvol_free_extents(zv);
+ return (err ? err : EIO);
+ }
+
+ return (0);
+}
+
+/* ARGSUSED */
+void
+zvol_create_cb(objset_t *os, void *arg, cred_t *cr, dmu_tx_t *tx)
+{
+ zfs_creat_t *zct = arg;
+ nvlist_t *nvprops = zct->zct_props;
+ int error;
+ uint64_t volblocksize, volsize;
+
+ VERIFY(nvlist_lookup_uint64(nvprops,
+ zfs_prop_to_name(ZFS_PROP_VOLSIZE), &volsize) == 0);
+ if (nvlist_lookup_uint64(nvprops,
+ zfs_prop_to_name(ZFS_PROP_VOLBLOCKSIZE), &volblocksize) != 0)
+ volblocksize = zfs_prop_default_numeric(ZFS_PROP_VOLBLOCKSIZE);
+
+ /*
+ * These properties must be removed from the list so the generic
+ * property setting step won't apply to them.
+ */
+ VERIFY(nvlist_remove_all(nvprops,
+ zfs_prop_to_name(ZFS_PROP_VOLSIZE)) == 0);
+ (void) nvlist_remove_all(nvprops,
+ zfs_prop_to_name(ZFS_PROP_VOLBLOCKSIZE));
+
+ error = dmu_object_claim(os, ZVOL_OBJ, DMU_OT_ZVOL, volblocksize,
+ DMU_OT_NONE, 0, tx);
+ ASSERT(error == 0);
+
+ error = zap_create_claim(os, ZVOL_ZAP_OBJ, DMU_OT_ZVOL_PROP,
+ DMU_OT_NONE, 0, tx);
+ ASSERT(error == 0);
+
+ error = zap_update(os, ZVOL_ZAP_OBJ, "size", 8, 1, &volsize, tx);
+ ASSERT(error == 0);
+}
+
+/*
+ * Replay a TX_TRUNCATE ZIL transaction if asked. TX_TRUNCATE is how we
+ * implement DKIOCFREE/free-long-range.
+ */
+static int
+zvol_replay_truncate(void *arg1, void *arg2, boolean_t byteswap)
+{
+ zvol_state_t *zv = arg1;
+ lr_truncate_t *lr = arg2;
+ uint64_t offset, length;
+
+ if (byteswap)
+ byteswap_uint64_array(lr, sizeof (*lr));
+
+ offset = lr->lr_offset;
+ length = lr->lr_length;
+
+ return (dmu_free_long_range(zv->zv_objset, ZVOL_OBJ, offset, length));
+}
+
+/*
+ * Replay a TX_WRITE ZIL transaction that didn't get committed
+ * after a system failure
+ */
+static int
+zvol_replay_write(void *arg1, void *arg2, boolean_t byteswap)
+{
+ zvol_state_t *zv = arg1;
+ lr_write_t *lr = arg2;
+ objset_t *os = zv->zv_objset;
+ char *data = (char *)(lr + 1); /* data follows lr_write_t */
+ uint64_t offset, length;
+ dmu_tx_t *tx;
+ int error;
+
+ if (byteswap)
+ byteswap_uint64_array(lr, sizeof (*lr));
+
+ offset = lr->lr_offset;
+ length = lr->lr_length;
+
+ /* If it's a dmu_sync() block, write the whole block */
+ if (lr->lr_common.lrc_reclen == sizeof (lr_write_t)) {
+ uint64_t blocksize = BP_GET_LSIZE(&lr->lr_blkptr);
+ if (length < blocksize) {
+ offset -= offset % blocksize;
+ length = blocksize;
+ }
+ }
+
+ tx = dmu_tx_create(os);
+ dmu_tx_hold_write(tx, ZVOL_OBJ, offset, length);
+ error = dmu_tx_assign(tx, TXG_WAIT);
+ if (error) {
+ dmu_tx_abort(tx);
+ } else {
+ dmu_write(os, ZVOL_OBJ, offset, length, data, tx);
+ dmu_tx_commit(tx);
+ }
+
+ return (error);
+}
+
+/* ARGSUSED */
+static int
+zvol_replay_err(void *arg1, void *arg2, boolean_t byteswap)
+{
+ return (SET_ERROR(ENOTSUP));
+}
+
+/*
+ * Callback vectors for replaying records.
+ * Only TX_WRITE and TX_TRUNCATE are needed for zvol.
+ */
+zil_replay_func_t *zvol_replay_vector[TX_MAX_TYPE] = {
+ zvol_replay_err, /* 0 no such transaction type */
+ zvol_replay_err, /* TX_CREATE */
+ zvol_replay_err, /* TX_MKDIR */
+ zvol_replay_err, /* TX_MKXATTR */
+ zvol_replay_err, /* TX_SYMLINK */
+ zvol_replay_err, /* TX_REMOVE */
+ zvol_replay_err, /* TX_RMDIR */
+ zvol_replay_err, /* TX_LINK */
+ zvol_replay_err, /* TX_RENAME */
+ zvol_replay_write, /* TX_WRITE */
+ zvol_replay_truncate, /* TX_TRUNCATE */
+ zvol_replay_err, /* TX_SETATTR */
+ zvol_replay_err, /* TX_ACL */
+ zvol_replay_err, /* TX_CREATE_ACL */
+ zvol_replay_err, /* TX_CREATE_ATTR */
+ zvol_replay_err, /* TX_CREATE_ACL_ATTR */
+ zvol_replay_err, /* TX_MKDIR_ACL */
+ zvol_replay_err, /* TX_MKDIR_ATTR */
+ zvol_replay_err, /* TX_MKDIR_ACL_ATTR */
+ zvol_replay_err, /* TX_WRITE2 */
+};
+
+#ifdef illumos
+int
+zvol_name2minor(const char *name, minor_t *minor)
+{
+ zvol_state_t *zv;
+
+ mutex_enter(&zfsdev_state_lock);
+ zv = zvol_minor_lookup(name);
+ if (minor && zv)
+ *minor = zv->zv_minor;
+ mutex_exit(&zfsdev_state_lock);
+ return (zv ? 0 : -1);
+}
+#endif /* illumos */
+
+/*
+ * Create a minor node (plus a whole lot more) for the specified volume.
+ */
+static int
+zvol_create_minor(const char *name)
+{
+ zfs_soft_state_t *zs;
+ zvol_state_t *zv;
+ objset_t *os;
+#ifdef illumos
+ dmu_object_info_t doi;
+ minor_t minor = 0;
+ char chrbuf[30], blkbuf[30];
+#else
+ struct g_provider *pp;
+ struct g_geom *gp;
+ uint64_t mode;
+#endif
+ int error;
+
+#ifndef illumos
+ ZFS_LOG(1, "Creating ZVOL %s...", name);
+#endif
+
+ mutex_enter(&zfsdev_state_lock);
+
+ if (zvol_minor_lookup(name) != NULL) {
+ mutex_exit(&zfsdev_state_lock);
+ return (SET_ERROR(EEXIST));
+ }
+
+ /* lie and say we're read-only */
+ error = dmu_objset_own(name, DMU_OST_ZVOL, B_TRUE, FTAG, &os);
+
+ if (error) {
+ mutex_exit(&zfsdev_state_lock);
+ return (error);
+ }
+
+#ifdef illumos
+ if ((minor = zfsdev_minor_alloc()) == 0) {
+ dmu_objset_disown(os, FTAG);
+ mutex_exit(&zfsdev_state_lock);
+ return (SET_ERROR(ENXIO));
+ }
+
+ if (ddi_soft_state_zalloc(zfsdev_state, minor) != DDI_SUCCESS) {
+ dmu_objset_disown(os, FTAG);
+ mutex_exit(&zfsdev_state_lock);
+ return (SET_ERROR(EAGAIN));
+ }
+ (void) ddi_prop_update_string(minor, zfs_dip, ZVOL_PROP_NAME,
+ (char *)name);
+
+ (void) snprintf(chrbuf, sizeof (chrbuf), "%u,raw", minor);
+
+ if (ddi_create_minor_node(zfs_dip, chrbuf, S_IFCHR,
+ minor, DDI_PSEUDO, 0) == DDI_FAILURE) {
+ ddi_soft_state_free(zfsdev_state, minor);
+ dmu_objset_disown(os, FTAG);
+ mutex_exit(&zfsdev_state_lock);
+ return (SET_ERROR(EAGAIN));
+ }
+
+ (void) snprintf(blkbuf, sizeof (blkbuf), "%u", minor);
+
+ if (ddi_create_minor_node(zfs_dip, blkbuf, S_IFBLK,
+ minor, DDI_PSEUDO, 0) == DDI_FAILURE) {
+ ddi_remove_minor_node(zfs_dip, chrbuf);
+ ddi_soft_state_free(zfsdev_state, minor);
+ dmu_objset_disown(os, FTAG);
+ mutex_exit(&zfsdev_state_lock);
+ return (SET_ERROR(EAGAIN));
+ }
+
+ zs = ddi_get_soft_state(zfsdev_state, minor);
+ zs->zss_type = ZSST_ZVOL;
+ zv = zs->zss_data = kmem_zalloc(sizeof (zvol_state_t), KM_SLEEP);
+#else /* !illumos */
+
+ zv = kmem_zalloc(sizeof (*zv), KM_SLEEP);
+ zv->zv_state = 0;
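+ /*
+ * Resolve the effective volmode: use the dataset's volmode
+ * property, falling back to the vfs.zfs.vol.mode default.
+ */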
+ error = dsl_prop_get_integer(name,
+ zfs_prop_to_name(ZFS_PROP_VOLMODE), &mode, NULL);
+ if (error != 0 || mode == ZFS_VOLMODE_DEFAULT)
+ mode = volmode;
+
+ zv->zv_volmode = mode;
+ if (zv->zv_volmode == ZFS_VOLMODE_GEOM) {
+ g_topology_lock();
+ gp = g_new_geomf(&zfs_zvol_class, "zfs::zvol::%s", name);
+ gp->start = zvol_geom_start;
+ gp->access = zvol_geom_access;
+ pp = g_new_providerf(gp, "%s/%s", ZVOL_DRIVER, name);
+ pp->flags |= G_PF_DIRECT_RECEIVE | G_PF_DIRECT_SEND;
+ pp->sectorsize = DEV_BSIZE;
+ pp->mediasize = 0;
+ pp->private = zv;
+
+ zv->zv_provider = pp;
+ bioq_init(&zv->zv_queue);
+ mtx_init(&zv->zv_queue_mtx, "zvol", NULL, MTX_DEF);
+ } else if (zv->zv_volmode == ZFS_VOLMODE_DEV) {
+ struct make_dev_args args;
+
+ make_dev_args_init(&args);
+ args.mda_flags = MAKEDEV_CHECKNAME | MAKEDEV_WAITOK;
+ args.mda_devsw = &zvol_cdevsw;
+ args.mda_cr = NULL;
+ args.mda_uid = UID_ROOT;
+ args.mda_gid = GID_OPERATOR;
+ args.mda_mode = 0640;
+ args.mda_si_drv2 = zv;
+ error = make_dev_s(&args, &zv->zv_dev,
+ "%s/%s", ZVOL_DRIVER, name);
+ if (error != 0) {
+ kmem_free(zv, sizeof (*zv));
+ dmu_objset_disown(os, FTAG);
+ mutex_exit(&zfsdev_state_lock);
+ return (error);
+ }
+ zv->zv_dev->si_iosize_max = MAXPHYS;
+ }
+ LIST_INSERT_HEAD(&all_zvols, zv, zv_links);
+#endif /* illumos */
+
+ (void) strlcpy(zv->zv_name, name, MAXPATHLEN);
+ zv->zv_min_bs = DEV_BSHIFT;
+#ifdef illumos
+ zv->zv_minor = minor;
+#endif
+ zv->zv_objset = os;
+ if (dmu_objset_is_snapshot(os) || !spa_writeable(dmu_objset_spa(os)))
+ zv->zv_flags |= ZVOL_RDONLY;
+ rangelock_init(&zv->zv_rangelock, NULL, NULL);
+ list_create(&zv->zv_extents, sizeof (zvol_extent_t),
+ offsetof(zvol_extent_t, ze_node));
+#ifdef illumos
+ /* get and cache the blocksize */
+ error = dmu_object_info(os, ZVOL_OBJ, &doi);
+ ASSERT(error == 0);
+ zv->zv_volblocksize = doi.doi_data_block_size;
+#endif
+
+ if (spa_writeable(dmu_objset_spa(os))) {
+ if (zil_replay_disable)
+ zil_destroy(dmu_objset_zil(os), B_FALSE);
+ else
+ zil_replay(os, zv, zvol_replay_vector);
+ }
+ dmu_objset_disown(os, FTAG);
+ zv->zv_objset = NULL;
+
+ zvol_minors++;
+
+ mutex_exit(&zfsdev_state_lock);
+#ifndef illumos
+ if (zv->zv_volmode == ZFS_VOLMODE_GEOM) {
+ zvol_geom_run(zv);
+ g_topology_unlock();
+ }
+
+ ZFS_LOG(1, "ZVOL %s created.", name);
+#endif
+
+ return (0);
+}
+
+/*
+ * Remove minor node for the specified volume.
+ */
+static int
+zvol_remove_zv(zvol_state_t *zv)
+{
+#ifdef illumos
+ char nmbuf[20];
+ minor_t minor = zv->zv_minor;
+#endif
+
+ ASSERT(MUTEX_HELD(&zfsdev_state_lock));
+ if (zv->zv_total_opens != 0)
+ return (SET_ERROR(EBUSY));
+
+#ifdef illumos
+ (void) snprintf(nmbuf, sizeof (nmbuf), "%u,raw", minor);
+ ddi_remove_minor_node(zfs_dip, nmbuf);
+
+ (void) snprintf(nmbuf, sizeof (nmbuf), "%u", minor);
+ ddi_remove_minor_node(zfs_dip, nmbuf);
+#else
+ ZFS_LOG(1, "ZVOL %s destroyed.", zv->zv_name);
+
+ LIST_REMOVE(zv, zv_links);
+ if (zv->zv_volmode == ZFS_VOLMODE_GEOM) {
+ g_topology_lock();
+ zvol_geom_destroy(zv);
+ g_topology_unlock();
+ } else if (zv->zv_volmode == ZFS_VOLMODE_DEV) {
+ if (zv->zv_dev != NULL)
+ destroy_dev(zv->zv_dev);
+ }
+#endif
+
+ rangelock_fini(&zv->zv_rangelock);
+
+ kmem_free(zv, sizeof (zvol_state_t));
+#ifdef illumos
+ ddi_soft_state_free(zfsdev_state, minor);
+#endif
+ zvol_minors--;
+ return (0);
+}
+
+int
+zvol_first_open(zvol_state_t *zv)
+{
+ dmu_object_info_t doi;
+ objset_t *os;
+ uint64_t volsize;
+ int error;
+ uint64_t readonly;
+
+ /* lie and say we're read-only */
+ error = dmu_objset_own(zv->zv_name, DMU_OST_ZVOL, B_TRUE,
+ zvol_tag, &os);
+ if (error)
+ return (error);
+
+ zv->zv_objset = os;
+ error = zap_lookup(os, ZVOL_ZAP_OBJ, "size", 8, 1, &volsize);
+ if (error) {
+ ASSERT(error == 0);
+ dmu_objset_disown(os, zvol_tag);
+ return (error);
+ }
+
+ /* get and cache the blocksize */
+ error = dmu_object_info(os, ZVOL_OBJ, &doi);
+ if (error) {
+ ASSERT(error == 0);
+ dmu_objset_disown(os, zvol_tag);
+ return (error);
+ }
+ zv->zv_volblocksize = doi.doi_data_block_size;
+
+ error = dnode_hold(os, ZVOL_OBJ, zvol_tag, &zv->zv_dn);
+ if (error) {
+ dmu_objset_disown(os, zvol_tag);
+ return (error);
+ }
+
+ zvol_size_changed(zv, volsize);
+ zv->zv_zilog = zil_open(os, zvol_get_data);
+
+ VERIFY(dsl_prop_get_integer(zv->zv_name, "readonly", &readonly,
+ NULL) == 0);
+ if (readonly || dmu_objset_is_snapshot(os) ||
+ !spa_writeable(dmu_objset_spa(os)))
+ zv->zv_flags |= ZVOL_RDONLY;
+ else
+ zv->zv_flags &= ~ZVOL_RDONLY;
+ return (error);
+}
+
+void
+zvol_last_close(zvol_state_t *zv)
+{
+ zil_close(zv->zv_zilog);
+ zv->zv_zilog = NULL;
+
+ dnode_rele(zv->zv_dn, zvol_tag);
+ zv->zv_dn = NULL;
+
+ /*
+ * Evict cached data
+ */
+ if (dsl_dataset_is_dirty(dmu_objset_ds(zv->zv_objset)) &&
+ !(zv->zv_flags & ZVOL_RDONLY))
+ txg_wait_synced(dmu_objset_pool(zv->zv_objset), 0);
+ dmu_objset_evict_dbufs(zv->zv_objset);
+
+ dmu_objset_disown(zv->zv_objset, zvol_tag);
+ zv->zv_objset = NULL;
+}
+
+#ifdef illumos
+int
+zvol_prealloc(zvol_state_t *zv)
+{
+ objset_t *os = zv->zv_objset;
+ dmu_tx_t *tx;
+ uint64_t refd, avail, usedobjs, availobjs;
+ uint64_t resid = zv->zv_volsize;
+ uint64_t off = 0;
+
+ /* Check the space usage before attempting to allocate the space */
+ dmu_objset_space(os, &refd, &avail, &usedobjs, &availobjs);
+ if (avail < zv->zv_volsize)
+ return (SET_ERROR(ENOSPC));
+
+ /* Free old extents if they exist */
+ zvol_free_extents(zv);
+
+ while (resid != 0) {
+ int error;
+ uint64_t bytes = MIN(resid, SPA_OLD_MAXBLOCKSIZE);
+
+ tx = dmu_tx_create(os);
+ dmu_tx_hold_write(tx, ZVOL_OBJ, off, bytes);
+ error = dmu_tx_assign(tx, TXG_WAIT);
+ if (error) {
+ dmu_tx_abort(tx);
+ (void) dmu_free_long_range(os, ZVOL_OBJ, 0, off);
+ return (error);
+ }
+ dmu_prealloc(os, ZVOL_OBJ, off, bytes, tx);
+ dmu_tx_commit(tx);
+ off += bytes;
+ resid -= bytes;
+ }
+ txg_wait_synced(dmu_objset_pool(os), 0);
+
+ return (0);
+}
+#endif /* illumos */
+
+static int
+zvol_update_volsize(objset_t *os, uint64_t volsize)
+{
+ dmu_tx_t *tx;
+ int error;
+
+ ASSERT(MUTEX_HELD(&zfsdev_state_lock));
+
+ tx = dmu_tx_create(os);
+ dmu_tx_hold_zap(tx, ZVOL_ZAP_OBJ, TRUE, NULL);
+ dmu_tx_mark_netfree(tx);
+ error = dmu_tx_assign(tx, TXG_WAIT);
+ if (error) {
+ dmu_tx_abort(tx);
+ return (error);
+ }
+
+ error = zap_update(os, ZVOL_ZAP_OBJ, "size", 8, 1,
+ &volsize, tx);
+ dmu_tx_commit(tx);
+
+ if (error == 0)
+ error = dmu_free_long_range(os,
+ ZVOL_OBJ, volsize, DMU_OBJECT_END);
+ return (error);
+}
+
+void
+zvol_remove_minors_impl(const char *name)
+{
+#ifdef illumos
+ zvol_state_t *zv;
+ char *namebuf;
+ minor_t minor;
+
+ namebuf = kmem_zalloc(strlen(name) + 2, KM_SLEEP);
+ (void) strncpy(namebuf, name, strlen(name));
+ (void) strcat(namebuf, "/");
+ mutex_enter(&zfsdev_state_lock);
+ for (minor = 1; minor <= ZFSDEV_MAX_MINOR; minor++) {
+
+ zv = zfsdev_get_soft_state(minor, ZSST_ZVOL);
+ if (zv == NULL)
+ continue;
+ if (strncmp(namebuf, zv->zv_name, strlen(namebuf)) == 0)
+ (void) zvol_remove_zv(zv);
+ }
+ kmem_free(namebuf, strlen(name) + 2);
+
+ mutex_exit(&zfsdev_state_lock);
+#else /* !illumos */
+ zvol_state_t *zv, *tzv;
+ size_t namelen;
+
+ namelen = strlen(name);
+
+ mutex_enter(&zfsdev_state_lock);
+
+ LIST_FOREACH_SAFE(zv, &all_zvols, zv_links, tzv) {
+ if (strcmp(zv->zv_name, name) == 0 ||
+ (strncmp(zv->zv_name, name, namelen) == 0 &&
+ strlen(zv->zv_name) > namelen && (zv->zv_name[namelen] == '/' ||
+ zv->zv_name[namelen] == '@'))) {
+ (void) zvol_remove_zv(zv);
+ }
+ }
+
+ mutex_exit(&zfsdev_state_lock);
+#endif /* illumos */
+}
+
+static int
+zvol_update_live_volsize(zvol_state_t *zv, uint64_t volsize)
+{
+ uint64_t old_volsize = 0ULL;
+ int error = 0;
+
+ ASSERT(MUTEX_HELD(&zfsdev_state_lock));
+
+ /*
+ * Reinitialize the dump area to the new size. If we
+ * failed to resize the dump area then restore it back to
+ * its original size. We must set the new volsize prior
+ * to calling dumpvp_resize() to ensure that the device's
+ * size(9P) is not visible to the dump subsystem.
+ */
+ old_volsize = zv->zv_volsize;
+ zvol_size_changed(zv, volsize);
+
+#ifdef ZVOL_DUMP
+ if (zv->zv_flags & ZVOL_DUMPIFIED) {
+ if ((error = zvol_dumpify(zv)) != 0 ||
+ (error = dumpvp_resize()) != 0) {
+ int dumpify_error;
+
+ (void) zvol_update_volsize(zv->zv_objset, old_volsize);
+ zvol_size_changed(zv, old_volsize);
+ dumpify_error = zvol_dumpify(zv);
+ error = dumpify_error ? dumpify_error : error;
+ }
+ }
+#endif /* ZVOL_DUMP */
+
+#ifdef illumos
+ /*
+ * Generate a LUN expansion event.
+ */
+ if (error == 0) {
+ sysevent_id_t eid;
+ nvlist_t *attr;
+ char *physpath = kmem_zalloc(MAXPATHLEN, KM_SLEEP);
+
+ (void) snprintf(physpath, MAXPATHLEN, "%s%u", ZVOL_PSEUDO_DEV,
+ zv->zv_minor);
+
+ VERIFY(nvlist_alloc(&attr, NV_UNIQUE_NAME, KM_SLEEP) == 0);
+ VERIFY(nvlist_add_string(attr, DEV_PHYS_PATH, physpath) == 0);
+
+ (void) ddi_log_sysevent(zfs_dip, SUNW_VENDOR, EC_DEV_STATUS,
+ ESC_DEV_DLE, attr, &eid, DDI_SLEEP);
+
+ nvlist_free(attr);
+ kmem_free(physpath, MAXPATHLEN);
+ }
+#endif /* illumos */
+ return (error);
+}
+
+int
+zvol_set_volsize(const char *name, uint64_t volsize)
+{
+ zvol_state_t *zv = NULL;
+ objset_t *os;
+ int error;
+ dmu_object_info_t doi;
+ uint64_t readonly;
+ boolean_t owned = B_FALSE;
+
+ error = dsl_prop_get_integer(name,
+ zfs_prop_to_name(ZFS_PROP_READONLY), &readonly, NULL);
+ if (error != 0)
+ return (error);
+ if (readonly)
+ return (SET_ERROR(EROFS));
+
+ mutex_enter(&zfsdev_state_lock);
+ zv = zvol_minor_lookup(name);
+
+ if (zv == NULL || zv->zv_objset == NULL) {
+ if ((error = dmu_objset_own(name, DMU_OST_ZVOL, B_FALSE,
+ FTAG, &os)) != 0) {
+ mutex_exit(&zfsdev_state_lock);
+ return (error);
+ }
+ owned = B_TRUE;
+ if (zv != NULL)
+ zv->zv_objset = os;
+ } else {
+ os = zv->zv_objset;
+ }
+
+ if ((error = dmu_object_info(os, ZVOL_OBJ, &doi)) != 0 ||
+ (error = zvol_check_volsize(volsize, doi.doi_data_block_size)) != 0)
+ goto out;
+
+ error = zvol_update_volsize(os, volsize);
+
+ if (error == 0 && zv != NULL)
+ error = zvol_update_live_volsize(zv, volsize);
+out:
+ if (owned) {
+ dmu_objset_disown(os, FTAG);
+ if (zv != NULL)
+ zv->zv_objset = NULL;
+ }
+ mutex_exit(&zfsdev_state_lock);
+ return (error);
+}
+
+/*ARGSUSED*/
+#ifdef illumos
+int
+zvol_open(dev_t *devp, int flag, int otyp, cred_t *cr)
+#else
+static int
+zvol_open(struct g_provider *pp, int flag, int count)
+#endif
+{
+ zvol_state_t *zv;
+ int err = 0;
+#ifdef illumos
+
+ mutex_enter(&zfsdev_state_lock);
+
+ zv = zfsdev_get_soft_state(getminor(*devp), ZSST_ZVOL);
+ if (zv == NULL) {
+ mutex_exit(&zfsdev_state_lock);
+ return (SET_ERROR(ENXIO));
+ }
+
+ if (zv->zv_total_opens == 0)
+ err = zvol_first_open(zv);
+ if (err) {
+ mutex_exit(&zfsdev_state_lock);
+ return (err);
+ }
+#else /* !illumos */
+ boolean_t locked = B_FALSE;
+
+ if (!zpool_on_zvol && tsd_get(zfs_geom_probe_vdev_key) != NULL) {
+ /*
+ * If zfs_geom_probe_vdev_key is set, zfs is attempting to
+ * probe geom providers while looking for a replacement for
+ * a missing VDEV. In this case, the spa_namespace_lock will
+ * not be held, but it is still illegal to use a zvol as a
+ * vdev. Deadlocks can result if another thread has
+ * spa_namespace_lock.
+ */
+ return (EOPNOTSUPP);
+ }
+ /*
+ * Protect against recursively entering spa_namespace_lock
+ * when spa_open() is used for a pool on top of (local) zvols.
+ * This is needed since we replaced upstream zfsdev_state_lock
+ * with spa_namespace_lock in the ZVOL code.
+ * We are using the same trick as spa_open().
+ * Note that calls in zvol_first_open which need to resolve
+ * pool name to a spa object will enter spa_open()
+ * recursively, but that function already has all the
+ * necessary protection.
+ */
+ if (!MUTEX_HELD(&zfsdev_state_lock)) {
+ mutex_enter(&zfsdev_state_lock);
+ locked = B_TRUE;
+ }
+
+ zv = pp->private;
+ if (zv == NULL) {
+ if (locked)
+ mutex_exit(&zfsdev_state_lock);
+ return (SET_ERROR(ENXIO));
+ }
+
+ if (zv->zv_total_opens == 0) {
+ err = zvol_first_open(zv);
+ if (err) {
+ if (locked)
+ mutex_exit(&zfsdev_state_lock);
+ return (err);
+ }
+ pp->mediasize = zv->zv_volsize;
+ pp->stripeoffset = 0;
+ pp->stripesize = zv->zv_volblocksize;
+ }
+#endif /* illumos */
+ if ((flag & FWRITE) && (zv->zv_flags & ZVOL_RDONLY)) {
+ err = SET_ERROR(EROFS);
+ goto out;
+ }
+ if (zv->zv_flags & ZVOL_EXCL) {
+ err = SET_ERROR(EBUSY);
+ goto out;
+ }
+#ifdef FEXCL
+ if (flag & FEXCL) {
+ if (zv->zv_total_opens != 0) {
+ err = SET_ERROR(EBUSY);
+ goto out;
+ }
+ zv->zv_flags |= ZVOL_EXCL;
+ }
+#endif
+
+#ifdef illumos
+ if (zv->zv_open_count[otyp] == 0 || otyp == OTYP_LYR) {
+ zv->zv_open_count[otyp]++;
+ zv->zv_total_opens++;
+ }
+ mutex_exit(&zfsdev_state_lock);
+#else
+ zv->zv_total_opens += count;
+ if (locked)
+ mutex_exit(&zfsdev_state_lock);
+#endif
+
+ return (err);
+out:
+ if (zv->zv_total_opens == 0)
+ zvol_last_close(zv);
+#ifdef illumos
+ mutex_exit(&zfsdev_state_lock);
+#else
+ if (locked)
+ mutex_exit(&zfsdev_state_lock);
+#endif
+ return (err);
+}
+
+/*ARGSUSED*/
+#ifdef illumos
+int
+zvol_close(dev_t dev, int flag, int otyp, cred_t *cr)
+{
+ minor_t minor = getminor(dev);
+ zvol_state_t *zv;
+ int error = 0;
+
+ mutex_enter(&zfsdev_state_lock);
+
+ zv = zfsdev_get_soft_state(minor, ZSST_ZVOL);
+ if (zv == NULL) {
+ mutex_exit(&zfsdev_state_lock);
+#else /* !illumos */
+static int
+zvol_close(struct g_provider *pp, int flag, int count)
+{
+ zvol_state_t *zv;
+ int error = 0;
+ boolean_t locked = B_FALSE;
+
+ /* See comment in zvol_open(). */
+ if (!MUTEX_HELD(&zfsdev_state_lock)) {
+ mutex_enter(&zfsdev_state_lock);
+ locked = B_TRUE;
+ }
+
+ zv = pp->private;
+ if (zv == NULL) {
+ if (locked)
+ mutex_exit(&zfsdev_state_lock);
+#endif /* illumos */
+ return (SET_ERROR(ENXIO));
+ }
+
+ if (zv->zv_flags & ZVOL_EXCL) {
+ ASSERT(zv->zv_total_opens == 1);
+ zv->zv_flags &= ~ZVOL_EXCL;
+ }
+
+ /*
+ * If the open count is zero, this is a spurious close.
+ * That indicates a bug in the kernel / DDI framework.
+ */
+#ifdef illumos
+ ASSERT(zv->zv_open_count[otyp] != 0);
+#endif
+ ASSERT(zv->zv_total_opens != 0);
+
+ /*
+ * You may get multiple opens, but only one close.
+ */
+#ifdef illumos
+ zv->zv_open_count[otyp]--;
+ zv->zv_total_opens--;
+#else
+ zv->zv_total_opens -= count;
+#endif
+
+ if (zv->zv_total_opens == 0)
+ zvol_last_close(zv);
+
+#ifdef illumos
+ mutex_exit(&zfsdev_state_lock);
+#else
+ if (locked)
+ mutex_exit(&zfsdev_state_lock);
+#endif
+ return (error);
+}
+
+/* ARGSUSED */
+static void
+zvol_get_done(zgd_t *zgd, int error)
+{
+ if (zgd->zgd_db)
+ dmu_buf_rele(zgd->zgd_db, zgd);
+
+ rangelock_exit(zgd->zgd_lr);
+
+ kmem_free(zgd, sizeof (zgd_t));
+}
+
+/*
+ * Get data to generate a TX_WRITE intent log record.
+ */
+static int
+zvol_get_data(void *arg, lr_write_t *lr, char *buf, struct lwb *lwb, zio_t *zio)
+{
+ zvol_state_t *zv = arg;
+ uint64_t offset = lr->lr_offset;
+ uint64_t size = lr->lr_length; /* length of user data */
+ dmu_buf_t *db;
+ zgd_t *zgd;
+ int error;
+
+ ASSERT3P(lwb, !=, NULL);
+ ASSERT3P(zio, !=, NULL);
+ ASSERT3U(size, !=, 0);
+
+ zgd = kmem_zalloc(sizeof (zgd_t), KM_SLEEP);
+ zgd->zgd_lwb = lwb;
+
+ /*
+ * Write records come in two flavors: immediate and indirect.
+ * For small writes it's cheaper to store the data with the
+ * log record (immediate); for large writes it's cheaper to
+ * sync the data and get a pointer to it (indirect) so that
+ * we don't have to write the data twice.
+ */
+ if (buf != NULL) { /* immediate write */
+ zgd->zgd_lr = rangelock_enter(&zv->zv_rangelock, offset, size,
+ RL_READER);
+ error = dmu_read_by_dnode(zv->zv_dn, offset, size, buf,
+ DMU_READ_NO_PREFETCH);
+ } else { /* indirect write */
+ /*
+ * Have to lock the whole block to ensure when it's written out
+ * and its checksum is being calculated that no one can change
+ * the data. Unlike zfs_get_data, we need not re-check the
+ * blocksize after we get the lock, because it cannot be changed.
+ */
+ size = zv->zv_volblocksize;
+ offset = P2ALIGN(offset, size);
+ zgd->zgd_lr = rangelock_enter(&zv->zv_rangelock, offset, size,
+ RL_READER);
+ error = dmu_buf_hold_by_dnode(zv->zv_dn, offset, zgd, &db,
+ DMU_READ_NO_PREFETCH);
+ if (error == 0) {
+ blkptr_t *bp = &lr->lr_blkptr;
+
+ zgd->zgd_db = db;
+ zgd->zgd_bp = bp;
+
+ ASSERT(db->db_offset == offset);
+ ASSERT(db->db_size == size);
+
+ error = dmu_sync(zio, lr->lr_common.lrc_txg,
+ zvol_get_done, zgd);
+
+ if (error == 0)
+ return (0);
+ }
+ }
+
+ zvol_get_done(zgd, error);
+
+ return (error);
+}
+
+/*
+ * zvol_log_write() handles synchronous writes using TX_WRITE ZIL transactions.
+ *
+ * We store data in the log buffers if it's small enough.
+ * Otherwise we will later flush the data out via dmu_sync().
+ */
+ssize_t zvol_immediate_write_sz = 32768;
+#ifdef _KERNEL
+SYSCTL_LONG(_vfs_zfs_vol, OID_AUTO, immediate_write_sz, CTLFLAG_RWTUN,
+ &zvol_immediate_write_sz, 0, "Minimal size for indirect log write");
+#endif
+
+static void
+zvol_log_write(zvol_state_t *zv, dmu_tx_t *tx, offset_t off, ssize_t resid,
+ boolean_t sync)
+{
+ uint32_t blocksize = zv->zv_volblocksize;
+ zilog_t *zilog = zv->zv_zilog;
+ itx_wr_state_t write_state;
+
+ if (zil_replaying(zilog, tx))
+ return;
+
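+ /*
+ * Choose how the data reaches the ZIL: WR_INDIRECT syncs the
+ * data via dmu_sync() and logs a pointer to it; WR_COPIED
+ * embeds the data in the log record; WR_NEED_COPY defers the
+ * copy until the itx is committed.
+ */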
+ if (zilog->zl_logbias == ZFS_LOGBIAS_THROUGHPUT)
+ write_state = WR_INDIRECT;
+ else if (!spa_has_slogs(zilog->zl_spa) &&
+ resid >= blocksize && blocksize > zvol_immediate_write_sz)
+ write_state = WR_INDIRECT;
+ else if (sync)
+ write_state = WR_COPIED;
+ else
+ write_state = WR_NEED_COPY;
+
+ while (resid) {
+ itx_t *itx;
+ lr_write_t *lr;
+ itx_wr_state_t wr_state = write_state;
+ ssize_t len = resid;
+
+ if (wr_state == WR_COPIED && resid > zil_max_copied_data(zilog))
+ wr_state = WR_NEED_COPY;
+ else if (wr_state == WR_INDIRECT)
+ len = MIN(blocksize - P2PHASE(off, blocksize), resid);
+
+ itx = zil_itx_create(TX_WRITE, sizeof (*lr) +
+ (wr_state == WR_COPIED ? len : 0));
+ lr = (lr_write_t *)&itx->itx_lr;
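+ /*
+ * If we cannot copy the data into the itx (e.g. the read
+ * fails), rebuild the itx as a WR_NEED_COPY record, which
+ * defers the copy until commit time.
+ */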
+ if (wr_state == WR_COPIED && dmu_read_by_dnode(zv->zv_dn,
+ off, len, lr + 1, DMU_READ_NO_PREFETCH) != 0) {
+ zil_itx_destroy(itx);
+ itx = zil_itx_create(TX_WRITE, sizeof (*lr));
+ lr = (lr_write_t *)&itx->itx_lr;
+ wr_state = WR_NEED_COPY;
+ }
+
+ itx->itx_wr_state = wr_state;
+ lr->lr_foid = ZVOL_OBJ;
+ lr->lr_offset = off;
+ lr->lr_length = len;
+ lr->lr_blkoff = 0;
+ BP_ZERO(&lr->lr_blkptr);
+
+ itx->itx_private = zv;
+
+ if (!sync && (zv->zv_sync_cnt == 0))
+ itx->itx_sync = B_FALSE;
+
+ zil_itx_assign(zilog, itx, tx);
+
+ off += len;
+ resid -= len;
+ }
+}
+
+#ifdef illumos
+static int
+zvol_dumpio_vdev(vdev_t *vd, void *addr, uint64_t offset, uint64_t origoffset,
+ uint64_t size, boolean_t doread, boolean_t isdump)
+{
+ vdev_disk_t *dvd;
+ int c;
+ int numerrors = 0;
+
+ if (vd->vdev_ops == &vdev_mirror_ops ||
+ vd->vdev_ops == &vdev_replacing_ops ||
+ vd->vdev_ops == &vdev_spare_ops) {
+ for (c = 0; c < vd->vdev_children; c++) {
+ int err = zvol_dumpio_vdev(vd->vdev_child[c],
+ addr, offset, origoffset, size, doread, isdump);
+ if (err != 0) {
+ numerrors++;
+ } else if (doread) {
+ break;
+ }
+ }
+ }
+
+ if (!vd->vdev_ops->vdev_op_leaf && vd->vdev_ops != &vdev_raidz_ops)
+ return (numerrors < vd->vdev_children ? 0 : EIO);
+
+ if (doread && !vdev_readable(vd))
+ return (SET_ERROR(EIO));
+ else if (!doread && !vdev_writeable(vd))
+ return (SET_ERROR(EIO));
+
+ if (vd->vdev_ops == &vdev_raidz_ops) {
+ return (vdev_raidz_physio(vd,
+ addr, size, offset, origoffset, doread, isdump));
+ }
+
+ offset += VDEV_LABEL_START_SIZE;
+
+ if (ddi_in_panic() || isdump) {
+ ASSERT(!doread);
+ if (doread)
+ return (SET_ERROR(EIO));
+ dvd = vd->vdev_tsd;
+ ASSERT3P(dvd, !=, NULL);
+ return (ldi_dump(dvd->vd_lh, addr, lbtodb(offset),
+ lbtodb(size)));
+ } else {
+ dvd = vd->vdev_tsd;
+ ASSERT3P(dvd, !=, NULL);
+ return (vdev_disk_ldi_physio(dvd->vd_lh, addr, size,
+ offset, doread ? B_READ : B_WRITE));
+ }
+}
+
+static int
+zvol_dumpio(zvol_state_t *zv, void *addr, uint64_t offset, uint64_t size,
+ boolean_t doread, boolean_t isdump)
+{
+ vdev_t *vd;
+ int error;
+ zvol_extent_t *ze;
+ spa_t *spa = dmu_objset_spa(zv->zv_objset);
+
+ /* Must be sector-aligned and must not straddle a block boundary. */
+ if (P2PHASE(offset, DEV_BSIZE) || P2PHASE(size, DEV_BSIZE) ||
+ P2BOUNDARY(offset, size, zv->zv_volblocksize)) {
+ return (SET_ERROR(EINVAL));
+ }
+ ASSERT(size <= zv->zv_volblocksize);
+
+ /* Locate the extent this belongs to */
+ ze = list_head(&zv->zv_extents);
+ while (ze != NULL && offset >= ze->ze_nblks * zv->zv_volblocksize) {
+ offset -= ze->ze_nblks * zv->zv_volblocksize;
+ ze = list_next(&zv->zv_extents, ze);
+ }
+
+ if (ze == NULL)
+ return (SET_ERROR(EINVAL));
+
+ if (!ddi_in_panic())
+ spa_config_enter(spa, SCL_STATE, FTAG, RW_READER);
+
+ vd = vdev_lookup_top(spa, DVA_GET_VDEV(&ze->ze_dva));
+ offset += DVA_GET_OFFSET(&ze->ze_dva);
+ error = zvol_dumpio_vdev(vd, addr, offset, DVA_GET_OFFSET(&ze->ze_dva),
+ size, doread, isdump);
+
+ if (!ddi_in_panic())
+ spa_config_exit(spa, SCL_STATE, FTAG);
+
+ return (error);
+}
+
+int
+zvol_strategy(buf_t *bp)
+{
+ zfs_soft_state_t *zs = NULL;
+#else /* !illumos */
+void
+zvol_strategy(struct bio *bp)
+{
+#endif /* illumos */
+ zvol_state_t *zv;
+ uint64_t off, volsize;
+ size_t resid;
+ char *addr;
+ objset_t *os;
+ int error = 0;
+#ifdef illumos
+ boolean_t doread = bp->b_flags & B_READ;
+#else
+ boolean_t doread = 0;
+#endif
+ boolean_t is_dumpified;
+ boolean_t sync;
+
+#ifdef illumos
+ if (getminor(bp->b_edev) == 0) {
+ error = SET_ERROR(EINVAL);
+ } else {
+ zs = ddi_get_soft_state(zfsdev_state, getminor(bp->b_edev));
+ if (zs == NULL)
+ error = SET_ERROR(ENXIO);
+ else if (zs->zss_type != ZSST_ZVOL)
+ error = SET_ERROR(EINVAL);
+ }
+
+ if (error) {
+ bioerror(bp, error);
+ biodone(bp);
+ return (0);
+ }
+
+ zv = zs->zss_data;
+
+ if (!(bp->b_flags & B_READ) && (zv->zv_flags & ZVOL_RDONLY)) {
+ bioerror(bp, EROFS);
+ biodone(bp);
+ return (0);
+ }
+
+ off = ldbtob(bp->b_blkno);
+#else /* !illumos */
+ if (bp->bio_to)
+ zv = bp->bio_to->private;
+ else
+ zv = bp->bio_dev->si_drv2;
+
+ if (zv == NULL) {
+ error = SET_ERROR(ENXIO);
+ goto out;
+ }
+
+ if (bp->bio_cmd != BIO_READ && (zv->zv_flags & ZVOL_RDONLY)) {
+ error = SET_ERROR(EROFS);
+ goto out;
+ }
+
+ switch (bp->bio_cmd) {
+ case BIO_FLUSH:
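+ /* A flush carries no data; jump straight to the ZIL commit. */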
+ goto sync;
+ case BIO_READ:
+ doread = 1;
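+ /* FALLTHROUGH */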
+ case BIO_WRITE:
+ case BIO_DELETE:
+ break;
+ default:
+ error = EOPNOTSUPP;
+ goto out;
+ }
+
+ off = bp->bio_offset;
+#endif /* illumos */
+ volsize = zv->zv_volsize;
+
+ os = zv->zv_objset;
+ ASSERT(os != NULL);
+
+#ifdef illumos
+ bp_mapin(bp);
+ addr = bp->b_un.b_addr;
+ resid = bp->b_bcount;
+
+ if (resid > 0 && (off < 0 || off >= volsize)) {
+ bioerror(bp, EIO);
+ biodone(bp);
+ return (0);
+ }
+
+ is_dumpified = zv->zv_flags & ZVOL_DUMPIFIED;
+ sync = ((!(bp->b_flags & B_ASYNC) &&
+ !(zv->zv_flags & ZVOL_WCE)) ||
+ (zv->zv_objset->os_sync == ZFS_SYNC_ALWAYS)) &&
+ !doread && !is_dumpified;
+#else /* !illumos */
+ addr = bp->bio_data;
+ resid = bp->bio_length;
+
+ if (resid > 0 && (off < 0 || off >= volsize)) {
+ error = SET_ERROR(EIO);
+ goto out;
+ }
+
+ is_dumpified = B_FALSE;
+ sync = !doread && !is_dumpified &&
+ zv->zv_objset->os_sync == ZFS_SYNC_ALWAYS;
+#endif /* illumos */
+
+ /*
+ * There must be no buffer changes when doing a dmu_sync() because
+ * we can't change the data whilst calculating the checksum.
+ */
+ locked_range_t *lr = rangelock_enter(&zv->zv_rangelock, off, resid,
+ doread ? RL_READER : RL_WRITER);
+
+#ifndef illumos
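+ /*
+ * BIO_DELETE (TRIM/UNMAP): log a TX_TRUNCATE record, then
+ * free the backing range.
+ */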
+ if (bp->bio_cmd == BIO_DELETE) {
+ dmu_tx_t *tx = dmu_tx_create(zv->zv_objset);
+ error = dmu_tx_assign(tx, TXG_WAIT);
+ if (error != 0) {
+ dmu_tx_abort(tx);
+ } else {
+ zvol_log_truncate(zv, tx, off, resid, sync);
+ dmu_tx_commit(tx);
+ error = dmu_free_long_range(zv->zv_objset, ZVOL_OBJ,
+ off, resid);
+ resid = 0;
+ }
+ goto unlock;
+ }
+#endif
+ while (resid != 0 && off < volsize) {
+ size_t size = MIN(resid, zvol_maxphys);
+#ifdef illumos
+ if (is_dumpified) {
+ size = MIN(size, P2END(off, zv->zv_volblocksize) - off);
+ error = zvol_dumpio(zv, addr, off, size,
+ doread, B_FALSE);
+ } else if (doread) {
+#else
+ if (doread) {
+#endif
+ error = dmu_read(os, ZVOL_OBJ, off, size, addr,
+ DMU_READ_PREFETCH);
+ } else {
+ dmu_tx_t *tx = dmu_tx_create(os);
+ dmu_tx_hold_write(tx, ZVOL_OBJ, off, size);
+ error = dmu_tx_assign(tx, TXG_WAIT);
+ if (error) {
+ dmu_tx_abort(tx);
+ } else {
+ dmu_write(os, ZVOL_OBJ, off, size, addr, tx);
+ zvol_log_write(zv, tx, off, size, sync);
+ dmu_tx_commit(tx);
+ }
+ }
+ if (error) {
+ /* convert checksum errors into IO errors */
+ if (error == ECKSUM)
+ error = SET_ERROR(EIO);
+ break;
+ }
+ off += size;
+ addr += size;
+ resid -= size;
+ }
+#ifndef illumos
+unlock:
+#endif
+ rangelock_exit(lr);
+
+#ifdef illumos
+ if ((bp->b_resid = resid) == bp->b_bcount)
+ bioerror(bp, off > volsize ? EINVAL : error);
+
+ if (sync)
+ zil_commit(zv->zv_zilog, ZVOL_OBJ);
+ biodone(bp);
+
+ return (0);
+#else /* !illumos */
+ bp->bio_completed = bp->bio_length - resid;
+ if (bp->bio_completed < bp->bio_length && off > volsize)
+ error = EINVAL;
+
+ if (sync) {
+sync:
+ zil_commit(zv->zv_zilog, ZVOL_OBJ);
+ }
+out:
+ if (bp->bio_to)
+ g_io_deliver(bp, error);
+ else
+ biofinish(bp, NULL, error);
+#endif /* illumos */
+}
+
+#ifdef illumos
+/*
+ * Set the buffer count to the zvol maximum transfer.
+ * Using our own routine instead of the default minphys()
+ * means that for larger writes we write bigger buffers on X86
+ * (128K instead of 56K) and flush the disk write cache less often
+ * (every zvol_maxphys - currently 1MB) instead of minphys (currently
+ * 56K on X86 and 128K on sparc).
+ */
+void
+zvol_minphys(struct buf *bp)
+{
+ if (bp->b_bcount > zvol_maxphys)
+ bp->b_bcount = zvol_maxphys;
+}
+
+int
+zvol_dump(dev_t dev, caddr_t addr, daddr_t blkno, int nblocks)
+{
+ minor_t minor = getminor(dev);
+ zvol_state_t *zv;
+ int error = 0;
+ uint64_t size;
+ uint64_t boff;
+ uint64_t resid;
+
+ zv = zfsdev_get_soft_state(minor, ZSST_ZVOL);
+ if (zv == NULL)
+ return (SET_ERROR(ENXIO));
+
+ if ((zv->zv_flags & ZVOL_DUMPIFIED) == 0)
+ return (SET_ERROR(EINVAL));
+
+ boff = ldbtob(blkno);
+ resid = ldbtob(nblocks);
+
+ VERIFY3U(boff + resid, <=, zv->zv_volsize);
+
+ while (resid) {
+ size = MIN(resid, P2END(boff, zv->zv_volblocksize) - boff);
+ error = zvol_dumpio(zv, addr, boff, size, B_FALSE, B_TRUE);
+ if (error)
+ break;
+ boff += size;
+ addr += size;
+ resid -= size;
+ }
+
+ return (error);
+}
+
+/*ARGSUSED*/
+int
+zvol_read(dev_t dev, uio_t *uio, cred_t *cr)
+{
+ minor_t minor = getminor(dev);
+#else /* !illumos */
+int
+zvol_read(struct cdev *dev, struct uio *uio, int ioflag)
+{
+#endif /* illumos */
+ zvol_state_t *zv;
+ uint64_t volsize;
+ int error = 0;
+
+#ifdef illumos
+ zv = zfsdev_get_soft_state(minor, ZSST_ZVOL);
+ if (zv == NULL)
+ return (SET_ERROR(ENXIO));
+#else
+ zv = dev->si_drv2;
+#endif
+
+ volsize = zv->zv_volsize;
+ /* uio_loffset == volsize isn't an error as it's required for EOF processing. */
+ if (uio->uio_resid > 0 &&
+ (uio->uio_loffset < 0 || uio->uio_loffset > volsize))
+ return (SET_ERROR(EIO));
+
+#ifdef illumos
+ if (zv->zv_flags & ZVOL_DUMPIFIED) {
+ error = physio(zvol_strategy, NULL, dev, B_READ,
+ zvol_minphys, uio);
+ return (error);
+ }
+#endif
+
+ locked_range_t *lr = rangelock_enter(&zv->zv_rangelock,
+ uio->uio_loffset, uio->uio_resid, RL_READER);
+ while (uio->uio_resid > 0 && uio->uio_loffset < volsize) {
+ uint64_t bytes = MIN(uio->uio_resid, DMU_MAX_ACCESS >> 1);
+
+ /* don't read past the end */
+ if (bytes > volsize - uio->uio_loffset)
+ bytes = volsize - uio->uio_loffset;
+
+ error = dmu_read_uio_dnode(zv->zv_dn, uio, bytes);
+ if (error) {
+ /* convert checksum errors into IO errors */
+ if (error == ECKSUM)
+ error = SET_ERROR(EIO);
+ break;
+ }
+ }
+ rangelock_exit(lr);
+
+ return (error);
+}
+
+#ifdef illumos
+/*ARGSUSED*/
+int
+zvol_write(dev_t dev, uio_t *uio, cred_t *cr)
+{
+ minor_t minor = getminor(dev);
+#else /* !illumos */
+int
+zvol_write(struct cdev *dev, struct uio *uio, int ioflag)
+{
+#endif /* illumos */
+ zvol_state_t *zv;
+ uint64_t volsize;
+ int error = 0;
+ boolean_t sync;
+
+#ifdef illumos
+ zv = zfsdev_get_soft_state(minor, ZSST_ZVOL);
+ if (zv == NULL)
+ return (SET_ERROR(ENXIO));
+#else
+ zv = dev->si_drv2;
+#endif
+
+ volsize = zv->zv_volsize;
+ /* uio_loffset == volsize isn't an error as it's required for EOF processing. */
+ if (uio->uio_resid > 0 &&
+ (uio->uio_loffset < 0 || uio->uio_loffset > volsize))
+ return (SET_ERROR(EIO));
+
+#ifdef illumos
+ if (zv->zv_flags & ZVOL_DUMPIFIED) {
+ error = physio(zvol_strategy, NULL, dev, B_WRITE,
+ zvol_minphys, uio);
+ return (error);
+ }
+
+ sync = !(zv->zv_flags & ZVOL_WCE) ||
+#else
+ sync = (ioflag & IO_SYNC) ||
+#endif
+ (zv->zv_objset->os_sync == ZFS_SYNC_ALWAYS);
+
+ locked_range_t *lr = rangelock_enter(&zv->zv_rangelock,
+ uio->uio_loffset, uio->uio_resid, RL_WRITER);
+ while (uio->uio_resid > 0 && uio->uio_loffset < volsize) {
+ uint64_t bytes = MIN(uio->uio_resid, DMU_MAX_ACCESS >> 1);
+ uint64_t off = uio->uio_loffset;
+ dmu_tx_t *tx = dmu_tx_create(zv->zv_objset);
+
+ if (bytes > volsize - off) /* don't write past the end */
+ bytes = volsize - off;
+
+ dmu_tx_hold_write(tx, ZVOL_OBJ, off, bytes);
+ error = dmu_tx_assign(tx, TXG_WAIT);
+ if (error) {
+ dmu_tx_abort(tx);
+ break;
+ }
+ error = dmu_write_uio_dnode(zv->zv_dn, uio, bytes, tx);
+ if (error == 0)
+ zvol_log_write(zv, tx, off, bytes, sync);
+ dmu_tx_commit(tx);
+
+ if (error)
+ break;
+ }
+ rangelock_exit(lr);
+
+ if (sync)
+ zil_commit(zv->zv_zilog, ZVOL_OBJ);
+ return (error);
+}
+
+#ifdef illumos
+int
+zvol_getefi(void *arg, int flag, uint64_t vs, uint8_t bs)
+{
+ struct uuid uuid = EFI_RESERVED;
+ efi_gpe_t gpe = { 0 };
+ uint32_t crc;
+ dk_efi_t efi;
+ int length;
+ char *ptr;
+
+ if (ddi_copyin(arg, &efi, sizeof (dk_efi_t), flag))
+ return (SET_ERROR(EFAULT));
+ ptr = (char *)(uintptr_t)efi.dki_data_64;
+ length = efi.dki_length;
+ /*
+ * Some clients may attempt to request a PMBR for the
+ * zvol. Currently this interface will return EINVAL to
+ * such requests. These requests could be supported by
+ * adding a check for lba == 0 and consing up an appropriate
+ * PMBR.
+ */
+ if (efi.dki_lba < 1 || efi.dki_lba > 2 || length <= 0)
+ return (SET_ERROR(EINVAL));
+
+ gpe.efi_gpe_StartingLBA = LE_64(34ULL);
+ gpe.efi_gpe_EndingLBA = LE_64((vs >> bs) - 1);
+ UUID_LE_CONVERT(gpe.efi_gpe_PartitionTypeGUID, uuid);
+
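+	/*
+	 * The emulated label is minimal: LBA 1 holds the GPT header and
+	 * LBA 2 a one-entry partition array describing a single
+	 * EFI_RESERVED partition spanning LBA 34 through the last usable
+	 * LBA.  Only those two blocks can be requested.
+	 */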
+ if (efi.dki_lba == 1) {
+ efi_gpt_t gpt = { 0 };
+
+ gpt.efi_gpt_Signature = LE_64(EFI_SIGNATURE);
+ gpt.efi_gpt_Revision = LE_32(EFI_VERSION_CURRENT);
+ gpt.efi_gpt_HeaderSize = LE_32(sizeof (gpt));
+ gpt.efi_gpt_MyLBA = LE_64(1ULL);
+ gpt.efi_gpt_FirstUsableLBA = LE_64(34ULL);
+ gpt.efi_gpt_LastUsableLBA = LE_64((vs >> bs) - 1);
+ gpt.efi_gpt_PartitionEntryLBA = LE_64(2ULL);
+ gpt.efi_gpt_NumberOfPartitionEntries = LE_32(1);
+ gpt.efi_gpt_SizeOfPartitionEntry =
+ LE_32(sizeof (efi_gpe_t));
+ CRC32(crc, &gpe, sizeof (gpe), -1U, crc32_table);
+ gpt.efi_gpt_PartitionEntryArrayCRC32 = LE_32(~crc);
+ CRC32(crc, &gpt, sizeof (gpt), -1U, crc32_table);
+ gpt.efi_gpt_HeaderCRC32 = LE_32(~crc);
+ if (ddi_copyout(&gpt, ptr, MIN(sizeof (gpt), length),
+ flag))
+ return (SET_ERROR(EFAULT));
+ ptr += sizeof (gpt);
+ length -= sizeof (gpt);
+ }
+ if (length > 0 && ddi_copyout(&gpe, ptr, MIN(sizeof (gpe),
+ length), flag))
+ return (SET_ERROR(EFAULT));
+ return (0);
+}
+
+/*
+ * BEGIN entry points to allow external callers access to the volume.
+ */
+/*
+ * Return the volume parameters needed for access from an external caller.
+ * These values are invariant as long as the volume is held open.
+ */
+int
+zvol_get_volume_params(minor_t minor, uint64_t *blksize,
+ uint64_t *max_xfer_len, void **minor_hdl, void **objset_hdl, void **zil_hdl,
+ void **rl_hdl, void **dnode_hdl)
+{
+ zvol_state_t *zv;
+
+ zv = zfsdev_get_soft_state(minor, ZSST_ZVOL);
+ if (zv == NULL)
+ return (SET_ERROR(ENXIO));
+ if (zv->zv_flags & ZVOL_DUMPIFIED)
+ return (SET_ERROR(ENXIO));
+
+ ASSERT(blksize && max_xfer_len && minor_hdl &&
+ objset_hdl && zil_hdl && rl_hdl && dnode_hdl);
+
+ *blksize = zv->zv_volblocksize;
+ *max_xfer_len = (uint64_t)zvol_maxphys;
+ *minor_hdl = zv;
+ *objset_hdl = zv->zv_objset;
+ *zil_hdl = zv->zv_zilog;
+ *rl_hdl = &zv->zv_rangelock;
+ *dnode_hdl = zv->zv_dn;
+ return (0);
+}
+
+/*
+ * Return the current volume size to an external caller.
+ * The size can change while the volume is open.
+ */
+uint64_t
+zvol_get_volume_size(void *minor_hdl)
+{
+ zvol_state_t *zv = minor_hdl;
+
+ return (zv->zv_volsize);
+}
+
+/*
+ * Return the current WCE setting to an external caller.
+ * The WCE setting can change while the volume is open.
+ */
+int
+zvol_get_volume_wce(void *minor_hdl)
+{
+ zvol_state_t *zv = minor_hdl;
+
+ return ((zv->zv_flags & ZVOL_WCE) ? 1 : 0);
+}
+
+/*
+ * Entry point for external callers to zvol_log_write
+ */
+void
+zvol_log_write_minor(void *minor_hdl, dmu_tx_t *tx, offset_t off, ssize_t resid,
+ boolean_t sync)
+{
+ zvol_state_t *zv = minor_hdl;
+
+ zvol_log_write(zv, tx, off, resid, sync);
+}
+/*
+ * END entry points to allow external callers access to the volume.
+ */
+#endif /* illumos */
+
+/*
+ * Log a DKIOCFREE/free-long-range to the ZIL with TX_TRUNCATE.
+ */
+static void
+zvol_log_truncate(zvol_state_t *zv, dmu_tx_t *tx, uint64_t off, uint64_t len,
+ boolean_t sync)
+{
+ itx_t *itx;
+ lr_truncate_t *lr;
+ zilog_t *zilog = zv->zv_zilog;
+
+ if (zil_replaying(zilog, tx))
+ return;
+
+ itx = zil_itx_create(TX_TRUNCATE, sizeof (*lr));
+ lr = (lr_truncate_t *)&itx->itx_lr;
+ lr->lr_foid = ZVOL_OBJ;
+ lr->lr_offset = off;
+ lr->lr_length = len;
+
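+	/*
+	 * Force a synchronous itx if the caller asked for one or if any
+	 * current opener of this zvol used O_SYNC/O_DSYNC (tracked in
+	 * zv_sync_cnt).
+	 */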
+ itx->itx_sync = (sync || zv->zv_sync_cnt != 0);
+ zil_itx_assign(zilog, itx, tx);
+}
+
+#ifdef illumos
+/*
+ * Dirtbag ioctls to support mkfs(1M) for UFS filesystems. See dkio(7I).
+ * Also a dirtbag dkio ioctl for unmap/free-block functionality.
+ */
+/*ARGSUSED*/
+int
+zvol_ioctl(dev_t dev, int cmd, intptr_t arg, int flag, cred_t *cr, int *rvalp)
+{
+ zvol_state_t *zv;
+ struct dk_callback *dkc;
+ int error = 0;
+ locked_range_t *lr;
+
+ mutex_enter(&zfsdev_state_lock);
+
+ zv = zfsdev_get_soft_state(getminor(dev), ZSST_ZVOL);
+
+ if (zv == NULL) {
+ mutex_exit(&zfsdev_state_lock);
+ return (SET_ERROR(ENXIO));
+ }
+ ASSERT(zv->zv_total_opens > 0);
+
+ switch (cmd) {
+
+ case DKIOCINFO:
+ {
+ struct dk_cinfo dki;
+
+ bzero(&dki, sizeof (dki));
+ (void) strcpy(dki.dki_cname, "zvol");
+ (void) strcpy(dki.dki_dname, "zvol");
+ dki.dki_ctype = DKC_UNKNOWN;
+ dki.dki_unit = getminor(dev);
+ dki.dki_maxtransfer =
+ 1 << (SPA_OLD_MAXBLOCKSHIFT - zv->zv_min_bs);
+ mutex_exit(&zfsdev_state_lock);
+ if (ddi_copyout(&dki, (void *)arg, sizeof (dki), flag))
+ error = SET_ERROR(EFAULT);
+ return (error);
+ }
+
+ case DKIOCGMEDIAINFO:
+ {
+ struct dk_minfo dkm;
+
+ bzero(&dkm, sizeof (dkm));
+ dkm.dki_lbsize = 1U << zv->zv_min_bs;
+ dkm.dki_capacity = zv->zv_volsize >> zv->zv_min_bs;
+ dkm.dki_media_type = DK_UNKNOWN;
+ mutex_exit(&zfsdev_state_lock);
+ if (ddi_copyout(&dkm, (void *)arg, sizeof (dkm), flag))
+ error = SET_ERROR(EFAULT);
+ return (error);
+ }
+
+ case DKIOCGMEDIAINFOEXT:
+ {
+ struct dk_minfo_ext dkmext;
+
+ bzero(&dkmext, sizeof (dkmext));
+ dkmext.dki_lbsize = 1U << zv->zv_min_bs;
+ dkmext.dki_pbsize = zv->zv_volblocksize;
+ dkmext.dki_capacity = zv->zv_volsize >> zv->zv_min_bs;
+ dkmext.dki_media_type = DK_UNKNOWN;
+ mutex_exit(&zfsdev_state_lock);
+ if (ddi_copyout(&dkmext, (void *)arg, sizeof (dkmext), flag))
+ error = SET_ERROR(EFAULT);
+ return (error);
+ }
+
+ case DKIOCGETEFI:
+ {
+ uint64_t vs = zv->zv_volsize;
+ uint8_t bs = zv->zv_min_bs;
+
+ mutex_exit(&zfsdev_state_lock);
+ error = zvol_getefi((void *)arg, flag, vs, bs);
+ return (error);
+ }
+
+ case DKIOCFLUSHWRITECACHE:
+ dkc = (struct dk_callback *)arg;
+ mutex_exit(&zfsdev_state_lock);
+ zil_commit(zv->zv_zilog, ZVOL_OBJ);
+ if ((flag & FKIOCTL) && dkc != NULL && dkc->dkc_callback) {
+ (*dkc->dkc_callback)(dkc->dkc_cookie, error);
+ error = 0;
+ }
+ return (error);
+
+ case DKIOCGETWCE:
+ {
+ int wce = (zv->zv_flags & ZVOL_WCE) ? 1 : 0;
+ if (ddi_copyout(&wce, (void *)arg, sizeof (int),
+ flag))
+ error = SET_ERROR(EFAULT);
+ break;
+ }
+ case DKIOCSETWCE:
+ {
+ int wce;
+ if (ddi_copyin((void *)arg, &wce, sizeof (int),
+ flag)) {
+ error = SET_ERROR(EFAULT);
+ break;
+ }
+ if (wce) {
+ zv->zv_flags |= ZVOL_WCE;
+ mutex_exit(&zfsdev_state_lock);
+ } else {
+ zv->zv_flags &= ~ZVOL_WCE;
+ mutex_exit(&zfsdev_state_lock);
+ zil_commit(zv->zv_zilog, ZVOL_OBJ);
+ }
+ return (0);
+ }
+
+ case DKIOCGGEOM:
+ case DKIOCGVTOC:
+ /*
+ * commands using these (like prtvtoc) expect ENOTSUP
+ * since we're emulating an EFI label
+ */
+ error = SET_ERROR(ENOTSUP);
+ break;
+
+ case DKIOCDUMPINIT:
+ lr = rangelock_enter(&zv->zv_rangelock, 0, zv->zv_volsize,
+ RL_WRITER);
+ error = zvol_dumpify(zv);
+ rangelock_exit(lr);
+ break;
+
+ case DKIOCDUMPFINI:
+ if (!(zv->zv_flags & ZVOL_DUMPIFIED))
+ break;
+ lr = rangelock_enter(&zv->zv_rangelock, 0, zv->zv_volsize,
+ RL_WRITER);
+ error = zvol_dump_fini(zv);
+ rangelock_exit(lr);
+ break;
+
+ case DKIOCFREE:
+ {
+ dkioc_free_list_t *dfl;
+ dmu_tx_t *tx;
+
+ if (!zvol_unmap_enabled)
+ break;
+
+ if (!(flag & FKIOCTL)) {
+ error = dfl_copyin((void *)arg, &dfl, flag, KM_SLEEP);
+ if (error != 0)
+ break;
+ } else {
+ dfl = (dkioc_free_list_t *)arg;
+ ASSERT3U(dfl->dfl_num_exts, <=, DFL_COPYIN_MAX_EXTS);
+ if (dfl->dfl_num_exts > DFL_COPYIN_MAX_EXTS) {
+ error = SET_ERROR(EINVAL);
+ break;
+ }
+ }
+
+ mutex_exit(&zfsdev_state_lock);
+
+ for (int i = 0; i < dfl->dfl_num_exts; i++) {
+ uint64_t start = dfl->dfl_exts[i].dfle_start,
+ length = dfl->dfl_exts[i].dfle_length,
+ end = start + length;
+
+			/*
+			 * Apply Postel's Law to length-checking.  If the
+			 * caller overshoots, just free to the end of the
+			 * volume, if there is anything left to free at all.
+			 */
+ if (start >= zv->zv_volsize)
+ continue; /* No need to do anything... */
+ if (end > zv->zv_volsize) {
+ end = DMU_OBJECT_END;
+ length = end - start;
+ }
+
+ lr = rangelock_enter(&zv->zv_rangelock, start, length,
+ RL_WRITER);
+ tx = dmu_tx_create(zv->zv_objset);
+ error = dmu_tx_assign(tx, TXG_WAIT);
+ if (error != 0) {
+ dmu_tx_abort(tx);
+ } else {
+ zvol_log_truncate(zv, tx, start, length,
+ B_TRUE);
+ dmu_tx_commit(tx);
+ error = dmu_free_long_range(zv->zv_objset,
+ ZVOL_OBJ, start, length);
+ }
+
+ rangelock_exit(lr);
+
+ if (error != 0)
+ break;
+ }
+
+ /*
+ * If the write-cache is disabled, 'sync' property
+ * is set to 'always', or if the caller is asking for
+ * a synchronous free, commit this operation to the zil.
+ * This will sync any previous uncommitted writes to the
+ * zvol object.
+ * Can be overridden by the zvol_unmap_sync_enabled tunable.
+ */
+ if ((error == 0) && zvol_unmap_sync_enabled &&
+ (!(zv->zv_flags & ZVOL_WCE) ||
+ (zv->zv_objset->os_sync == ZFS_SYNC_ALWAYS) ||
+ (dfl->dfl_flags & DF_WAIT_SYNC))) {
+ zil_commit(zv->zv_zilog, ZVOL_OBJ);
+ }
+
+ if (!(flag & FKIOCTL))
+ dfl_free(dfl);
+
+ return (error);
+ }
+
+ default:
+ error = SET_ERROR(ENOTTY);
+ break;
+
+ }
+ mutex_exit(&zfsdev_state_lock);
+ return (error);
+}
+#endif /* illumos */
+
+int
+zvol_busy(void)
+{
+ return (zvol_minors != 0);
+}
+
+void
+zvol_init(void)
+{
+ VERIFY(ddi_soft_state_init(&zfsdev_state, sizeof (zfs_soft_state_t),
+ 1) == 0);
+#ifdef illumos
+ mutex_init(&zfsdev_state_lock, NULL, MUTEX_DEFAULT, NULL);
+#else
+ ZFS_LOG(1, "ZVOL Initialized.");
+#endif
+}
+
+void
+zvol_fini(void)
+{
+#ifdef illumos
+ mutex_destroy(&zfsdev_state_lock);
+#endif
+ ddi_soft_state_fini(&zfsdev_state);
+ ZFS_LOG(1, "ZVOL Deinitialized.");
+}
+
+#ifdef illumos
+/*ARGSUSED*/
+static int
+zfs_mvdev_dump_feature_check(void *arg, dmu_tx_t *tx)
+{
+ spa_t *spa = dmu_tx_pool(tx)->dp_spa;
+
+ if (spa_feature_is_active(spa, SPA_FEATURE_MULTI_VDEV_CRASH_DUMP))
+ return (1);
+ return (0);
+}
+
+/*ARGSUSED*/
+static void
+zfs_mvdev_dump_activate_feature_sync(void *arg, dmu_tx_t *tx)
+{
+ spa_t *spa = dmu_tx_pool(tx)->dp_spa;
+
+ spa_feature_incr(spa, SPA_FEATURE_MULTI_VDEV_CRASH_DUMP, tx);
+}
+
+static int
+zvol_dump_init(zvol_state_t *zv, boolean_t resize)
+{
+ dmu_tx_t *tx;
+ int error;
+ objset_t *os = zv->zv_objset;
+ spa_t *spa = dmu_objset_spa(os);
+ vdev_t *vd = spa->spa_root_vdev;
+ nvlist_t *nv = NULL;
+ uint64_t version = spa_version(spa);
+ uint64_t checksum, compress, refresrv, vbs, dedup;
+
+ ASSERT(MUTEX_HELD(&zfsdev_state_lock));
+ ASSERT(vd->vdev_ops == &vdev_root_ops);
+
+ error = dmu_free_long_range(zv->zv_objset, ZVOL_OBJ, 0,
+ DMU_OBJECT_END);
+ if (error != 0)
+ return (error);
+ /* wait for dmu_free_long_range to actually free the blocks */
+ txg_wait_synced(dmu_objset_pool(zv->zv_objset), 0);
+
+ /*
+ * If the pool on which the dump device is being initialized has more
+ * than one child vdev, check that the MULTI_VDEV_CRASH_DUMP feature is
+ * enabled. If so, bump that feature's counter to indicate that the
+ * feature is active. We also check the vdev type to handle the
+ * following case:
+ * # zpool create test raidz disk1 disk2 disk3
+ * Now have spa_root_vdev->vdev_children == 1 (the raidz vdev),
+ * the raidz vdev itself has 3 children.
+ */
+ if (vd->vdev_children > 1 || vd->vdev_ops == &vdev_raidz_ops) {
+ if (!spa_feature_is_enabled(spa,
+ SPA_FEATURE_MULTI_VDEV_CRASH_DUMP))
+ return (SET_ERROR(ENOTSUP));
+ (void) dsl_sync_task(spa_name(spa),
+ zfs_mvdev_dump_feature_check,
+ zfs_mvdev_dump_activate_feature_sync, NULL,
+ 2, ZFS_SPACE_CHECK_RESERVED);
+ }
+
+ if (!resize) {
+ error = dsl_prop_get_integer(zv->zv_name,
+ zfs_prop_to_name(ZFS_PROP_COMPRESSION), &compress, NULL);
+ if (error == 0) {
+ error = dsl_prop_get_integer(zv->zv_name,
+ zfs_prop_to_name(ZFS_PROP_CHECKSUM), &checksum,
+ NULL);
+ }
+ if (error == 0) {
+ error = dsl_prop_get_integer(zv->zv_name,
+ zfs_prop_to_name(ZFS_PROP_REFRESERVATION),
+ &refresrv, NULL);
+ }
+ if (error == 0) {
+ error = dsl_prop_get_integer(zv->zv_name,
+ zfs_prop_to_name(ZFS_PROP_VOLBLOCKSIZE), &vbs,
+ NULL);
+ }
+ if (version >= SPA_VERSION_DEDUP && error == 0) {
+ error = dsl_prop_get_integer(zv->zv_name,
+ zfs_prop_to_name(ZFS_PROP_DEDUP), &dedup, NULL);
+ }
+ }
+ if (error != 0)
+ return (error);
+
+ tx = dmu_tx_create(os);
+ dmu_tx_hold_zap(tx, ZVOL_ZAP_OBJ, TRUE, NULL);
+ dmu_tx_hold_bonus(tx, ZVOL_OBJ);
+ error = dmu_tx_assign(tx, TXG_WAIT);
+ if (error != 0) {
+ dmu_tx_abort(tx);
+ return (error);
+ }
+
+	/*
+	 * If we are resizing the dump device, then we only need to
+	 * update the refreservation to match the newly updated
+	 * volsize.  Otherwise, we save off the original state of the
+	 * zvol's properties so that we can restore them if the zvol is
+	 * ever undumpified.
+	 */
+ if (resize) {
+ error = zap_update(os, ZVOL_ZAP_OBJ,
+ zfs_prop_to_name(ZFS_PROP_REFRESERVATION), 8, 1,
+ &zv->zv_volsize, tx);
+ } else {
+ error = zap_update(os, ZVOL_ZAP_OBJ,
+ zfs_prop_to_name(ZFS_PROP_COMPRESSION), 8, 1,
+ &compress, tx);
+ if (error == 0) {
+ error = zap_update(os, ZVOL_ZAP_OBJ,
+ zfs_prop_to_name(ZFS_PROP_CHECKSUM), 8, 1,
+ &checksum, tx);
+ }
+ if (error == 0) {
+ error = zap_update(os, ZVOL_ZAP_OBJ,
+ zfs_prop_to_name(ZFS_PROP_REFRESERVATION), 8, 1,
+ &refresrv, tx);
+ }
+ if (error == 0) {
+ error = zap_update(os, ZVOL_ZAP_OBJ,
+ zfs_prop_to_name(ZFS_PROP_VOLBLOCKSIZE), 8, 1,
+ &vbs, tx);
+ }
+ if (error == 0) {
+ error = dmu_object_set_blocksize(
+ os, ZVOL_OBJ, SPA_OLD_MAXBLOCKSIZE, 0, tx);
+ }
+ if (version >= SPA_VERSION_DEDUP && error == 0) {
+ error = zap_update(os, ZVOL_ZAP_OBJ,
+ zfs_prop_to_name(ZFS_PROP_DEDUP), 8, 1,
+ &dedup, tx);
+ }
+ if (error == 0)
+ zv->zv_volblocksize = SPA_OLD_MAXBLOCKSIZE;
+ }
+ dmu_tx_commit(tx);
+
+	/*
+	 * We only need to update the zvol's properties if we are
+	 * initializing the dump area for the first time.
+	 */
+ if (error == 0 && !resize) {
+ /*
+ * If MULTI_VDEV_CRASH_DUMP is active, use the NOPARITY checksum
+ * function. Otherwise, use the old default -- OFF.
+ */
+ checksum = spa_feature_is_active(spa,
+ SPA_FEATURE_MULTI_VDEV_CRASH_DUMP) ? ZIO_CHECKSUM_NOPARITY :
+ ZIO_CHECKSUM_OFF;
+
+ VERIFY(nvlist_alloc(&nv, NV_UNIQUE_NAME, KM_SLEEP) == 0);
+ VERIFY(nvlist_add_uint64(nv,
+ zfs_prop_to_name(ZFS_PROP_REFRESERVATION), 0) == 0);
+ VERIFY(nvlist_add_uint64(nv,
+ zfs_prop_to_name(ZFS_PROP_COMPRESSION),
+ ZIO_COMPRESS_OFF) == 0);
+ VERIFY(nvlist_add_uint64(nv,
+ zfs_prop_to_name(ZFS_PROP_CHECKSUM),
+ checksum) == 0);
+ if (version >= SPA_VERSION_DEDUP) {
+ VERIFY(nvlist_add_uint64(nv,
+ zfs_prop_to_name(ZFS_PROP_DEDUP),
+ ZIO_CHECKSUM_OFF) == 0);
+ }
+
+ error = zfs_set_prop_nvlist(zv->zv_name, ZPROP_SRC_LOCAL,
+ nv, NULL);
+ nvlist_free(nv);
+ }
+
+ /* Allocate the space for the dump */
+ if (error == 0)
+ error = zvol_prealloc(zv);
+ return (error);
+}
+
+static int
+zvol_dumpify(zvol_state_t *zv)
+{
+ int error = 0;
+ uint64_t dumpsize = 0;
+ dmu_tx_t *tx;
+ objset_t *os = zv->zv_objset;
+
+ if (zv->zv_flags & ZVOL_RDONLY)
+ return (SET_ERROR(EROFS));
+
+ if (zap_lookup(zv->zv_objset, ZVOL_ZAP_OBJ, ZVOL_DUMPSIZE,
+ 8, 1, &dumpsize) != 0 || dumpsize != zv->zv_volsize) {
+ boolean_t resize = (dumpsize > 0);
+
+ if ((error = zvol_dump_init(zv, resize)) != 0) {
+ (void) zvol_dump_fini(zv);
+ return (error);
+ }
+ }
+
+ /*
+ * Build up our lba mapping.
+ */
+ error = zvol_get_lbas(zv);
+ if (error) {
+ (void) zvol_dump_fini(zv);
+ return (error);
+ }
+
+ tx = dmu_tx_create(os);
+ dmu_tx_hold_zap(tx, ZVOL_ZAP_OBJ, TRUE, NULL);
+ error = dmu_tx_assign(tx, TXG_WAIT);
+ if (error) {
+ dmu_tx_abort(tx);
+ (void) zvol_dump_fini(zv);
+ return (error);
+ }
+
+ zv->zv_flags |= ZVOL_DUMPIFIED;
+ error = zap_update(os, ZVOL_ZAP_OBJ, ZVOL_DUMPSIZE, 8, 1,
+ &zv->zv_volsize, tx);
+ dmu_tx_commit(tx);
+
+ if (error) {
+ (void) zvol_dump_fini(zv);
+ return (error);
+ }
+
+ txg_wait_synced(dmu_objset_pool(os), 0);
+ return (0);
+}
+
+static int
+zvol_dump_fini(zvol_state_t *zv)
+{
+ dmu_tx_t *tx;
+ objset_t *os = zv->zv_objset;
+ nvlist_t *nv;
+ int error = 0;
+ uint64_t checksum, compress, refresrv, vbs, dedup;
+ uint64_t version = spa_version(dmu_objset_spa(zv->zv_objset));
+
+	/*
+	 * Attempt to restore the zvol back to its pre-dumpified state.
+	 * This is a best-effort attempt, as it's possible that not all
+	 * of these properties were initialized during the dumpify
+	 * process (e.g., due to an error in zvol_dump_init).
+	 */
+
+ tx = dmu_tx_create(os);
+ dmu_tx_hold_zap(tx, ZVOL_ZAP_OBJ, TRUE, NULL);
+ error = dmu_tx_assign(tx, TXG_WAIT);
+ if (error) {
+ dmu_tx_abort(tx);
+ return (error);
+ }
+ (void) zap_remove(os, ZVOL_ZAP_OBJ, ZVOL_DUMPSIZE, tx);
+ dmu_tx_commit(tx);
+
+ (void) zap_lookup(zv->zv_objset, ZVOL_ZAP_OBJ,
+ zfs_prop_to_name(ZFS_PROP_CHECKSUM), 8, 1, &checksum);
+ (void) zap_lookup(zv->zv_objset, ZVOL_ZAP_OBJ,
+ zfs_prop_to_name(ZFS_PROP_COMPRESSION), 8, 1, &compress);
+ (void) zap_lookup(zv->zv_objset, ZVOL_ZAP_OBJ,
+ zfs_prop_to_name(ZFS_PROP_REFRESERVATION), 8, 1, &refresrv);
+ (void) zap_lookup(zv->zv_objset, ZVOL_ZAP_OBJ,
+ zfs_prop_to_name(ZFS_PROP_VOLBLOCKSIZE), 8, 1, &vbs);
+
+ VERIFY(nvlist_alloc(&nv, NV_UNIQUE_NAME, KM_SLEEP) == 0);
+ (void) nvlist_add_uint64(nv,
+ zfs_prop_to_name(ZFS_PROP_CHECKSUM), checksum);
+ (void) nvlist_add_uint64(nv,
+ zfs_prop_to_name(ZFS_PROP_COMPRESSION), compress);
+ (void) nvlist_add_uint64(nv,
+ zfs_prop_to_name(ZFS_PROP_REFRESERVATION), refresrv);
+ if (version >= SPA_VERSION_DEDUP &&
+ zap_lookup(zv->zv_objset, ZVOL_ZAP_OBJ,
+ zfs_prop_to_name(ZFS_PROP_DEDUP), 8, 1, &dedup) == 0) {
+ (void) nvlist_add_uint64(nv,
+ zfs_prop_to_name(ZFS_PROP_DEDUP), dedup);
+ }
+ (void) zfs_set_prop_nvlist(zv->zv_name, ZPROP_SRC_LOCAL,
+ nv, NULL);
+ nvlist_free(nv);
+
+ zvol_free_extents(zv);
+ zv->zv_flags &= ~ZVOL_DUMPIFIED;
+ (void) dmu_free_long_range(os, ZVOL_OBJ, 0, DMU_OBJECT_END);
+ /* wait for dmu_free_long_range to actually free the blocks */
+ txg_wait_synced(dmu_objset_pool(zv->zv_objset), 0);
+ tx = dmu_tx_create(os);
+ dmu_tx_hold_bonus(tx, ZVOL_OBJ);
+ error = dmu_tx_assign(tx, TXG_WAIT);
+ if (error) {
+ dmu_tx_abort(tx);
+ return (error);
+ }
+ if (dmu_object_set_blocksize(os, ZVOL_OBJ, vbs, 0, tx) == 0)
+ zv->zv_volblocksize = vbs;
+ dmu_tx_commit(tx);
+
+ return (0);
+}
+#else /* !illumos */
+
+static void
+zvol_geom_run(zvol_state_t *zv)
+{
+ struct g_provider *pp;
+
+ pp = zv->zv_provider;
+ g_error_provider(pp, 0);
+
+ kproc_kthread_add(zvol_geom_worker, zv, &system_proc, NULL, 0, 0,
+ "zfskern", "zvol %s", pp->name + sizeof(ZVOL_DRIVER));
+}
+
+static void
+zvol_geom_destroy(zvol_state_t *zv)
+{
+ struct g_provider *pp;
+
+ g_topology_assert();
+
+ mtx_lock(&zv->zv_queue_mtx);
+ zv->zv_state = 1;
+ wakeup_one(&zv->zv_queue);
+ while (zv->zv_state != 2)
+ msleep(&zv->zv_state, &zv->zv_queue_mtx, 0, "zvol:w", 0);
+ mtx_destroy(&zv->zv_queue_mtx);
+
+ pp = zv->zv_provider;
+ zv->zv_provider = NULL;
+ pp->private = NULL;
+ g_wither_geom(pp->geom, ENXIO);
+}
+
+static int
+zvol_geom_access(struct g_provider *pp, int acr, int acw, int ace)
+{
+ int count, error, flags;
+
+ g_topology_assert();
+
+	/*
+	 * To simplify matters, we expect either an open or a close, but
+	 * not both at the same time.
+	 */
+ KASSERT((acr >= 0 && acw >= 0 && ace >= 0) ||
+ (acr <= 0 && acw <= 0 && ace <= 0),
+ ("Unsupported access request to %s (acr=%d, acw=%d, ace=%d).",
+ pp->name, acr, acw, ace));
+
+ if (pp->private == NULL) {
+ if (acr <= 0 && acw <= 0 && ace <= 0)
+ return (0);
+ return (pp->error);
+ }
+
+ /*
+ * We don't pass FEXCL flag to zvol_open()/zvol_close() if ace != 0,
+ * because GEOM already handles that and handles it a bit differently.
+ * GEOM allows for multiple read/exclusive consumers and ZFS allows
+ * only one exclusive consumer, no matter if it is reader or writer.
+ * I like better the way GEOM works so I'll leave it for GEOM to
+ * decide what to do.
+ */
+
+ count = acr + acw + ace;
+ if (count == 0)
+ return (0);
+
+ flags = 0;
+ if (acr != 0 || ace != 0)
+ flags |= FREAD;
+ if (acw != 0)
+ flags |= FWRITE;
+
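+	/*
+	 * A positive total means references are being added (open); a
+	 * negative total means they are being dropped (close).  The
+	 * KASSERT above guarantees the two are never mixed in a single
+	 * request.
+	 */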
+ g_topology_unlock();
+ if (count > 0)
+ error = zvol_open(pp, flags, count);
+ else
+ error = zvol_close(pp, flags, -count);
+ g_topology_lock();
+ return (error);
+}
+
+static void
+zvol_geom_start(struct bio *bp)
+{
+ zvol_state_t *zv;
+ boolean_t first;
+
+ zv = bp->bio_to->private;
+ ASSERT(zv != NULL);
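+	/*
+	 * ZFS I/O may sleep, so a request arriving in a context that
+	 * cannot sleep is queued for the per-zvol worker thread instead
+	 * of being served directly.
+	 */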
+ switch (bp->bio_cmd) {
+ case BIO_FLUSH:
+ if (!THREAD_CAN_SLEEP())
+ goto enqueue;
+ zil_commit(zv->zv_zilog, ZVOL_OBJ);
+ g_io_deliver(bp, 0);
+ break;
+ case BIO_READ:
+ case BIO_WRITE:
+ case BIO_DELETE:
+ if (!THREAD_CAN_SLEEP())
+ goto enqueue;
+ zvol_strategy(bp);
+ break;
+ case BIO_GETATTR: {
+ spa_t *spa = dmu_objset_spa(zv->zv_objset);
+ uint64_t refd, avail, usedobjs, availobjs, val;
+
+ if (g_handleattr_int(bp, "GEOM::candelete", 1))
+ return;
+ if (strcmp(bp->bio_attribute, "blocksavail") == 0) {
+ dmu_objset_space(zv->zv_objset, &refd, &avail,
+ &usedobjs, &availobjs);
+ if (g_handleattr_off_t(bp, "blocksavail",
+ avail / DEV_BSIZE))
+ return;
+ } else if (strcmp(bp->bio_attribute, "blocksused") == 0) {
+ dmu_objset_space(zv->zv_objset, &refd, &avail,
+ &usedobjs, &availobjs);
+ if (g_handleattr_off_t(bp, "blocksused",
+ refd / DEV_BSIZE))
+ return;
+ } else if (strcmp(bp->bio_attribute, "poolblocksavail") == 0) {
+ avail = metaslab_class_get_space(spa_normal_class(spa));
+ avail -= metaslab_class_get_alloc(spa_normal_class(spa));
+ if (g_handleattr_off_t(bp, "poolblocksavail",
+ avail / DEV_BSIZE))
+ return;
+ } else if (strcmp(bp->bio_attribute, "poolblocksused") == 0) {
+ refd = metaslab_class_get_alloc(spa_normal_class(spa));
+ if (g_handleattr_off_t(bp, "poolblocksused",
+ refd / DEV_BSIZE))
+ return;
+ }
+ /* FALLTHROUGH */
+ }
+ default:
+ g_io_deliver(bp, EOPNOTSUPP);
+ break;
+ }
+ return;
+
+enqueue:
+ mtx_lock(&zv->zv_queue_mtx);
+ first = (bioq_first(&zv->zv_queue) == NULL);
+ bioq_insert_tail(&zv->zv_queue, bp);
+ mtx_unlock(&zv->zv_queue_mtx);
+ if (first)
+ wakeup_one(&zv->zv_queue);
+}
+
+static void
+zvol_geom_worker(void *arg)
+{
+ zvol_state_t *zv;
+ struct bio *bp;
+
+ thread_lock(curthread);
+ sched_prio(curthread, PRIBIO);
+ thread_unlock(curthread);
+
+ zv = arg;
+ for (;;) {
+ mtx_lock(&zv->zv_queue_mtx);
+ bp = bioq_takefirst(&zv->zv_queue);
+ if (bp == NULL) {
+ if (zv->zv_state == 1) {
+ zv->zv_state = 2;
+ wakeup(&zv->zv_state);
+ mtx_unlock(&zv->zv_queue_mtx);
+ kthread_exit();
+ }
+ msleep(&zv->zv_queue, &zv->zv_queue_mtx, PRIBIO | PDROP,
+ "zvol:io", 0);
+ continue;
+ }
+ mtx_unlock(&zv->zv_queue_mtx);
+ switch (bp->bio_cmd) {
+ case BIO_FLUSH:
+ zil_commit(zv->zv_zilog, ZVOL_OBJ);
+ g_io_deliver(bp, 0);
+ break;
+ case BIO_READ:
+ case BIO_WRITE:
+ case BIO_DELETE:
+ zvol_strategy(bp);
+ break;
+ default:
+ g_io_deliver(bp, EOPNOTSUPP);
+ break;
+ }
+ }
+}
+
+extern boolean_t dataset_name_hidden(const char *name);
+
+static int
+zvol_create_snapshots(objset_t *os, const char *name)
+{
+ uint64_t cookie, obj;
+ char *sname;
+ int error, len;
+
+ cookie = obj = 0;
+ sname = kmem_alloc(MAXPATHLEN, KM_SLEEP);
+
+#if 0
+ (void) dmu_objset_find(name, dmu_objset_prefetch, NULL,
+ DS_FIND_SNAPSHOTS);
+#endif
+
+ for (;;) {
+ len = snprintf(sname, MAXPATHLEN, "%s@", name);
+ if (len >= MAXPATHLEN) {
+ dmu_objset_rele(os, FTAG);
+ error = ENAMETOOLONG;
+ break;
+ }
+
+ dsl_pool_config_enter(dmu_objset_pool(os), FTAG);
+ error = dmu_snapshot_list_next(os, MAXPATHLEN - len,
+ sname + len, &obj, &cookie, NULL);
+ dsl_pool_config_exit(dmu_objset_pool(os), FTAG);
+ if (error != 0) {
+ if (error == ENOENT)
+ error = 0;
+ break;
+ }
+
+ error = zvol_create_minor(sname);
+ if (error != 0 && error != EEXIST) {
+ printf("ZFS WARNING: Unable to create ZVOL %s (error=%d).\n",
+ sname, error);
+ break;
+ }
+ }
+
+ kmem_free(sname, MAXPATHLEN);
+ return (error);
+}
+
+int
+zvol_create_minors_impl(const char *name)
+{
+ uint64_t cookie;
+ objset_t *os;
+ char *osname, *p;
+ int error, len;
+
+ if (dataset_name_hidden(name))
+ return (0);
+
+ if ((error = dmu_objset_hold(name, FTAG, &os)) != 0) {
+ printf("ZFS WARNING: Unable to put hold on %s (error=%d).\n",
+ name, error);
+ return (error);
+ }
+ if (dmu_objset_type(os) == DMU_OST_ZVOL) {
+ dsl_dataset_long_hold(os->os_dsl_dataset, FTAG);
+ dsl_pool_rele(dmu_objset_pool(os), FTAG);
+ error = zvol_create_minor(name);
+ if (error == 0 || error == EEXIST) {
+ error = zvol_create_snapshots(os, name);
+ } else {
+ printf("ZFS WARNING: Unable to create ZVOL %s (error=%d).\n",
+ name, error);
+ }
+ dsl_dataset_long_rele(os->os_dsl_dataset, FTAG);
+ dsl_dataset_rele(os->os_dsl_dataset, FTAG);
+ return (error);
+ }
+ if (dmu_objset_type(os) != DMU_OST_ZFS) {
+ dmu_objset_rele(os, FTAG);
+ return (0);
+ }
+
+ osname = kmem_alloc(MAXPATHLEN, KM_SLEEP);
+ if (snprintf(osname, MAXPATHLEN, "%s/", name) >= MAXPATHLEN) {
+ dmu_objset_rele(os, FTAG);
+ kmem_free(osname, MAXPATHLEN);
+ return (ENOENT);
+ }
+ p = osname + strlen(osname);
+ len = MAXPATHLEN - (p - osname);
+
+#if 0
+ /* Prefetch the datasets. */
+ cookie = 0;
+ while (dmu_dir_list_next(os, len, p, NULL, &cookie) == 0) {
+ if (!dataset_name_hidden(osname))
+ (void) dmu_objset_prefetch(osname, NULL);
+ }
+#endif
+
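+	/*
+	 * Recurse into child datasets.  The hold on the parent objset is
+	 * dropped around each recursive call, because the recursion takes
+	 * holds of its own, and is re-acquired before continuing the walk.
+	 */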
+ cookie = 0;
+ while (dmu_dir_list_next(os, MAXPATHLEN - (p - osname), p, NULL,
+ &cookie) == 0) {
+ dmu_objset_rele(os, FTAG);
+ (void)zvol_create_minors_impl(osname);
+		if ((error = dmu_objset_hold(name, FTAG, &os)) != 0) {
+			printf("ZFS WARNING: Unable to put hold on %s (error=%d).\n",
+			    name, error);
+			kmem_free(osname, MAXPATHLEN);
+			return (error);
+		}
+ }
+
+ dmu_objset_rele(os, FTAG);
+ kmem_free(osname, MAXPATHLEN);
+ return (0);
+}
+
+static void
+zvol_rename_minor(zvol_state_t *zv, const char *newname)
+{
+ struct g_geom *gp;
+ struct g_provider *pp;
+ struct cdev *dev;
+
+ ASSERT(MUTEX_HELD(&zfsdev_state_lock));
+
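+	/*
+	 * A rename destroys the old GEOM provider or character device and
+	 * creates a fresh one under the new name; any existing opens of a
+	 * cdev-mode zvol are forcibly dropped in the process.
+	 */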
+ if (zv->zv_volmode == ZFS_VOLMODE_GEOM) {
+ g_topology_lock();
+ pp = zv->zv_provider;
+ ASSERT(pp != NULL);
+ gp = pp->geom;
+ ASSERT(gp != NULL);
+
+ zv->zv_provider = NULL;
+ g_wither_provider(pp, ENXIO);
+
+ pp = g_new_providerf(gp, "%s/%s", ZVOL_DRIVER, newname);
+ pp->flags |= G_PF_DIRECT_RECEIVE | G_PF_DIRECT_SEND;
+ pp->sectorsize = DEV_BSIZE;
+ pp->mediasize = zv->zv_volsize;
+ pp->private = zv;
+ zv->zv_provider = pp;
+ g_error_provider(pp, 0);
+ g_topology_unlock();
+ } else if (zv->zv_volmode == ZFS_VOLMODE_DEV) {
+ struct make_dev_args args;
+
+ if ((dev = zv->zv_dev) != NULL) {
+ zv->zv_dev = NULL;
+ destroy_dev(dev);
+ if (zv->zv_total_opens > 0) {
+ zv->zv_flags &= ~ZVOL_EXCL;
+ zv->zv_total_opens = 0;
+ zvol_last_close(zv);
+ }
+ }
+
+ make_dev_args_init(&args);
+ args.mda_flags = MAKEDEV_CHECKNAME | MAKEDEV_WAITOK;
+ args.mda_devsw = &zvol_cdevsw;
+ args.mda_cr = NULL;
+ args.mda_uid = UID_ROOT;
+ args.mda_gid = GID_OPERATOR;
+ args.mda_mode = 0640;
+ args.mda_si_drv2 = zv;
+ if (make_dev_s(&args, &zv->zv_dev,
+ "%s/%s", ZVOL_DRIVER, newname) == 0)
+ zv->zv_dev->si_iosize_max = MAXPHYS;
+ }
+ strlcpy(zv->zv_name, newname, sizeof(zv->zv_name));
+}
+
+void
+zvol_rename_minors_impl(const char *oldname, const char *newname)
+{
+ char name[MAXPATHLEN];
+ struct g_provider *pp;
+ struct g_geom *gp;
+ size_t oldnamelen, newnamelen;
+ zvol_state_t *zv;
+ char *namebuf;
+ boolean_t locked = B_FALSE;
+
+ oldnamelen = strlen(oldname);
+ newnamelen = strlen(newname);
+
+ /* See comment in zvol_open(). */
+ if (!MUTEX_HELD(&zfsdev_state_lock)) {
+ mutex_enter(&zfsdev_state_lock);
+ locked = B_TRUE;
+ }
+
+ LIST_FOREACH(zv, &all_zvols, zv_links) {
+ if (strcmp(zv->zv_name, oldname) == 0) {
+ zvol_rename_minor(zv, newname);
+ } else if (strncmp(zv->zv_name, oldname, oldnamelen) == 0 &&
+ (zv->zv_name[oldnamelen] == '/' ||
+ zv->zv_name[oldnamelen] == '@')) {
+ snprintf(name, sizeof(name), "%s%c%s", newname,
+ zv->zv_name[oldnamelen],
+ zv->zv_name + oldnamelen + 1);
+ zvol_rename_minor(zv, name);
+ }
+ }
+
+ if (locked)
+ mutex_exit(&zfsdev_state_lock);
+}
+
+static zvol_task_t *
+zvol_task_alloc(zvol_async_op_t op, const char *name1, const char *name2)
+{
+ zvol_task_t *task;
+ char *delim;
+
+ task = kmem_zalloc(sizeof (zvol_task_t), KM_SLEEP);
+ task->op = op;
+ delim = strchr(name1, '/');
+ strlcpy(task->pool, name1, delim ? (delim - name1 + 1) : MAXNAMELEN);
+
+ strlcpy(task->name1, name1, MAXNAMELEN);
+ if (name2 != NULL)
+ strlcpy(task->name2, name2, MAXNAMELEN);
+
+ return (task);
+}
+
+static void
+zvol_task_free(zvol_task_t *task)
+{
+ kmem_free(task, sizeof (zvol_task_t));
+}
+
+/*
+ * The worker function, executed asynchronously on the pool's zvol taskq.
+ */
+static void
+zvol_task_cb(void *param)
+{
+ zvol_task_t *task = (zvol_task_t *)param;
+
+ switch (task->op) {
+ case ZVOL_ASYNC_CREATE_MINORS:
+ (void) zvol_create_minors_impl(task->name1);
+ break;
+ case ZVOL_ASYNC_REMOVE_MINORS:
+ zvol_remove_minors_impl(task->name1);
+ break;
+ case ZVOL_ASYNC_RENAME_MINORS:
+ zvol_rename_minors_impl(task->name1, task->name2);
+ break;
+ default:
+ VERIFY(0);
+ break;
+ }
+
+ zvol_task_free(task);
+}
+
+static void
+zvol_minors_helper(spa_t *spa, zvol_async_op_t op, const char *name1,
+ const char *name2)
+{
+ zvol_task_t *task;
+
+ if (dataset_name_hidden(name1))
+ return;
+ if (name2 != NULL && dataset_name_hidden(name2))
+ return;
+ task = zvol_task_alloc(op, name1, name2);
+ (void)taskq_dispatch(spa->spa_zvol_taskq, zvol_task_cb, task, TQ_SLEEP);
+}
+
+void
+zvol_create_minors(spa_t *spa, const char *name)
+{
+ zvol_minors_helper(spa, ZVOL_ASYNC_CREATE_MINORS, name, NULL);
+}
+
+void
+zvol_remove_minors(spa_t *spa, const char *name)
+{
+ zvol_minors_helper(spa, ZVOL_ASYNC_REMOVE_MINORS, name, NULL);
+}
+
+void
+zvol_rename_minors(spa_t *spa, const char *oldname, const char *newname)
+{
+ zvol_minors_helper(spa, ZVOL_ASYNC_RENAME_MINORS, oldname, newname);
+}
+
+static int
+zvol_d_open(struct cdev *dev, int flags, int fmt, struct thread *td)
+{
+ zvol_state_t *zv = dev->si_drv2;
+ int err = 0;
+
+ mutex_enter(&zfsdev_state_lock);
+ if (zv->zv_total_opens == 0)
+ err = zvol_first_open(zv);
+ if (err) {
+ mutex_exit(&zfsdev_state_lock);
+ return (err);
+ }
+ if ((flags & FWRITE) && (zv->zv_flags & ZVOL_RDONLY)) {
+ err = SET_ERROR(EROFS);
+ goto out;
+ }
+ if (zv->zv_flags & ZVOL_EXCL) {
+ err = SET_ERROR(EBUSY);
+ goto out;
+ }
+#ifdef FEXCL
+ if (flags & FEXCL) {
+ if (zv->zv_total_opens != 0) {
+ err = SET_ERROR(EBUSY);
+ goto out;
+ }
+ zv->zv_flags |= ZVOL_EXCL;
+ }
+#endif
+
+ zv->zv_total_opens++;
+ if (flags & (FSYNC | FDSYNC)) {
+ zv->zv_sync_cnt++;
+ if (zv->zv_sync_cnt == 1)
+ zil_async_to_sync(zv->zv_zilog, ZVOL_OBJ);
+ }
+ mutex_exit(&zfsdev_state_lock);
+ return (err);
+out:
+ if (zv->zv_total_opens == 0)
+ zvol_last_close(zv);
+ mutex_exit(&zfsdev_state_lock);
+ return (err);
+}
+
+static int
+zvol_d_close(struct cdev *dev, int flags, int fmt, struct thread *td)
+{
+ zvol_state_t *zv = dev->si_drv2;
+
+ mutex_enter(&zfsdev_state_lock);
+ if (zv->zv_flags & ZVOL_EXCL) {
+ ASSERT(zv->zv_total_opens == 1);
+ zv->zv_flags &= ~ZVOL_EXCL;
+ }
+
+ /*
+ * If the open count is zero, this is a spurious close.
+ * That indicates a bug in the kernel / DDI framework.
+ */
+ ASSERT(zv->zv_total_opens != 0);
+
+ /*
+ * You may get multiple opens, but only one close.
+ */
+ zv->zv_total_opens--;
+ if (flags & (FSYNC | FDSYNC))
+ zv->zv_sync_cnt--;
+
+ if (zv->zv_total_opens == 0)
+ zvol_last_close(zv);
+
+ mutex_exit(&zfsdev_state_lock);
+ return (0);
+}
+
+static int
+zvol_d_ioctl(struct cdev *dev, u_long cmd, caddr_t data, int fflag, struct thread *td)
+{
+ zvol_state_t *zv;
+ locked_range_t *lr;
+ off_t offset, length;
+ int i, error;
+ boolean_t sync;
+
+ zv = dev->si_drv2;
+
+ error = 0;
+ KASSERT(zv->zv_total_opens > 0,
+ ("Device with zero access count in zvol_d_ioctl"));
+
+ i = IOCPARM_LEN(cmd);
+ switch (cmd) {
+ case DIOCGSECTORSIZE:
+ *(u_int *)data = DEV_BSIZE;
+ break;
+ case DIOCGMEDIASIZE:
+ *(off_t *)data = zv->zv_volsize;
+ break;
+ case DIOCGFLUSH:
+ zil_commit(zv->zv_zilog, ZVOL_OBJ);
+ break;
+ case DIOCGDELETE:
+ if (!zvol_unmap_enabled)
+ break;
+
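+		/*
+		 * DIOCGDELETE carries two off_t values, { offset, length },
+		 * both in bytes and DEV_BSIZE-aligned.  An illustrative
+		 * userland caller (sketch only):
+		 *
+		 *	off_t args[2] = { off, len };
+		 *	ioctl(fd, DIOCGDELETE, args);
+		 */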
+ offset = ((off_t *)data)[0];
+ length = ((off_t *)data)[1];
+ if ((offset % DEV_BSIZE) != 0 || (length % DEV_BSIZE) != 0 ||
+ offset < 0 || offset >= zv->zv_volsize ||
+ length <= 0) {
+ printf("%s: offset=%jd length=%jd\n", __func__, offset,
+ length);
+ error = EINVAL;
+ break;
+ }
+
+ lr = rangelock_enter(&zv->zv_rangelock, offset, length,
+ RL_WRITER);
+ dmu_tx_t *tx = dmu_tx_create(zv->zv_objset);
+ error = dmu_tx_assign(tx, TXG_WAIT);
+ if (error != 0) {
+ sync = FALSE;
+ dmu_tx_abort(tx);
+ } else {
+ sync = (zv->zv_objset->os_sync == ZFS_SYNC_ALWAYS);
+ zvol_log_truncate(zv, tx, offset, length, sync);
+ dmu_tx_commit(tx);
+ error = dmu_free_long_range(zv->zv_objset, ZVOL_OBJ,
+ offset, length);
+ }
+ rangelock_exit(lr);
+ if (sync)
+ zil_commit(zv->zv_zilog, ZVOL_OBJ);
+ break;
+ case DIOCGSTRIPESIZE:
+ *(off_t *)data = zv->zv_volblocksize;
+ break;
+ case DIOCGSTRIPEOFFSET:
+ *(off_t *)data = 0;
+ break;
+ case DIOCGATTR: {
+ spa_t *spa = dmu_objset_spa(zv->zv_objset);
+ struct diocgattr_arg *arg = (struct diocgattr_arg *)data;
+ uint64_t refd, avail, usedobjs, availobjs;
+
+ if (strcmp(arg->name, "GEOM::candelete") == 0)
+ arg->value.i = 1;
+ else if (strcmp(arg->name, "blocksavail") == 0) {
+ dmu_objset_space(zv->zv_objset, &refd, &avail,
+ &usedobjs, &availobjs);
+ arg->value.off = avail / DEV_BSIZE;
+ } else if (strcmp(arg->name, "blocksused") == 0) {
+ dmu_objset_space(zv->zv_objset, &refd, &avail,
+ &usedobjs, &availobjs);
+ arg->value.off = refd / DEV_BSIZE;
+ } else if (strcmp(arg->name, "poolblocksavail") == 0) {
+ avail = metaslab_class_get_space(spa_normal_class(spa));
+ avail -= metaslab_class_get_alloc(spa_normal_class(spa));
+ arg->value.off = avail / DEV_BSIZE;
+ } else if (strcmp(arg->name, "poolblocksused") == 0) {
+ refd = metaslab_class_get_alloc(spa_normal_class(spa));
+ arg->value.off = refd / DEV_BSIZE;
+ } else
+ error = ENOIOCTL;
+ break;
+ }
+ case FIOSEEKHOLE:
+ case FIOSEEKDATA: {
+ off_t *off = (off_t *)data;
+ uint64_t noff;
+ boolean_t hole;
+
+ hole = (cmd == FIOSEEKHOLE);
+ noff = *off;
+ error = dmu_offset_next(zv->zv_objset, ZVOL_OBJ, hole, &noff);
+ *off = noff;
+ break;
+ }
+ default:
+ error = ENOIOCTL;
+ }
+
+ return (error);
+}
+#endif /* illumos */
diff --git a/sys/cddl/contrib/opensolaris/uts/common/os/callb.c b/sys/cddl/contrib/opensolaris/uts/common/os/callb.c
new file mode 100644
index 000000000000..da479087f869
--- /dev/null
+++ b/sys/cddl/contrib/opensolaris/uts/common/os/callb.c
@@ -0,0 +1,438 @@
+/*
+ * CDDL HEADER START
+ *
+ * The contents of this file are subject to the terms of the
+ * Common Development and Distribution License (the "License").
+ * You may not use this file except in compliance with the License.
+ *
+ * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
+ * or http://www.opensolaris.org/os/licensing.
+ * See the License for the specific language governing permissions
+ * and limitations under the License.
+ *
+ * When distributing Covered Code, include this CDDL HEADER in each
+ * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
+ * If applicable, add the following below this CDDL HEADER, with the
+ * fields enclosed by brackets "[]" replaced with your own identifying
+ * information: Portions Copyright [yyyy] [name of copyright owner]
+ *
+ * CDDL HEADER END
+ */
+/*
+ * Copyright 2009 Sun Microsystems, Inc. All rights reserved.
+ * Use is subject to license terms.
+ */
+
+#include <sys/param.h>
+#include <sys/types.h>
+#include <sys/time.h>
+#include <sys/sysmacros.h>
+#include <sys/systm.h>
+#include <sys/proc.h>
+#include <sys/mutex.h>
+#include <sys/condvar.h>
+#include <sys/callb.h>
+#include <sys/kmem.h>
+#include <sys/cmn_err.h>
+#include <sys/debug.h>
+#include <sys/kobj.h>
+#include <sys/systm.h> /* for delay() */
+#include <sys/taskq.h> /* For TASKQ_NAMELEN */
+#include <sys/kernel.h>
+
+#define CB_MAXNAME TASKQ_NAMELEN
+
+/*
+ * The callb mechanism provides generic event scheduling/echoing.
+ * A callb function is registered and called on behalf of the event.
+ */
+typedef struct callb {
+ struct callb *c_next; /* next in class or on freelist */
+ kthread_id_t c_thread; /* ptr to caller's thread struct */
+ char c_flag; /* info about the callb state */
+ uchar_t c_class; /* this callb's class */
+ kcondvar_t c_done_cv; /* signal callb completion */
+ boolean_t (*c_func)(); /* cb function: returns true if ok */
+ void *c_arg; /* arg to c_func */
+ char c_name[CB_MAXNAME+1]; /* debug:max func name length */
+} callb_t;
+
+/*
+ * callb c_flag bitmap definitions
+ */
+#define CALLB_FREE 0x0
+#define CALLB_TAKEN 0x1
+#define CALLB_EXECUTING 0x2
+
+/*
+ * Basic structure for a callb table.
+ * All callbs are organized into class groups hanging off the
+ * ct_first_cb array.
+ * The callbs within a class are singly linked and normally run
+ * serially.
+ */
+typedef struct callb_table {
+ kmutex_t ct_lock; /* protect all callb states */
+ callb_t *ct_freelist; /* free callb structures */
+ int ct_busy; /* != 0 prevents additions */
+ kcondvar_t ct_busy_cv; /* to wait for not busy */
+ int ct_ncallb; /* num of callbs allocated */
+ callb_t *ct_first_cb[NCBCLASS]; /* ptr to 1st callb in a class */
+} callb_table_t;
+
+int callb_timeout_sec = CPR_KTHREAD_TIMEOUT_SEC;
+
+static callb_id_t callb_add_common(boolean_t (*)(void *, int),
+ void *, int, char *, kthread_id_t);
+
+static callb_table_t callb_table; /* system level callback table */
+static callb_table_t *ct = &callb_table;
+static kmutex_t callb_safe_mutex;
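+/*
+ * Shared callb_cpr_t for kernel threads that are always safe with
+ * respect to CPR; CALLB_CPR_ALWAYS_SAFE means callb_is_stopped() never
+ * has to wait on them.
+ */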
+callb_cpr_t callb_cprinfo_safe = {
+ &callb_safe_mutex, CALLB_CPR_ALWAYS_SAFE, 0, 0, 0 };
+
+/*
+ * Init all callb tables in the system.
+ */
+void
+callb_init(void *dummy __unused)
+{
+ callb_table.ct_busy = 0; /* mark table open for additions */
+ mutex_init(&callb_safe_mutex, NULL, MUTEX_DEFAULT, NULL);
+ mutex_init(&callb_table.ct_lock, NULL, MUTEX_DEFAULT, NULL);
+}
+
+void
+callb_fini(void *dummy __unused)
+{
+ callb_t *cp;
+ int i;
+
+ mutex_enter(&ct->ct_lock);
+ for (i = 0; i < 16; i++) {
+ while ((cp = ct->ct_freelist) != NULL) {
+ ct->ct_freelist = cp->c_next;
+ ct->ct_ncallb--;
+ kmem_free(cp, sizeof (callb_t));
+ }
+ if (ct->ct_ncallb == 0)
+ break;
+ /* Not all callbacks finished, waiting for the rest. */
+ mutex_exit(&ct->ct_lock);
+ tsleep(ct, 0, "callb", hz / 4);
+ mutex_enter(&ct->ct_lock);
+ }
+ if (ct->ct_ncallb > 0)
+ printf("%s: Leaked %d callbacks!\n", __func__, ct->ct_ncallb);
+ mutex_exit(&ct->ct_lock);
+ mutex_destroy(&callb_safe_mutex);
+ mutex_destroy(&callb_table.ct_lock);
+}
+
+/*
+ * callb_add_common() is called to register func() to be called later.
+ */
+static callb_id_t
+callb_add_common(boolean_t (*func)(void *arg, int code),
+ void *arg, int class, char *name, kthread_id_t t)
+{
+ callb_t *cp;
+
+ ASSERT(class < NCBCLASS);
+
+ mutex_enter(&ct->ct_lock);
+ while (ct->ct_busy)
+ cv_wait(&ct->ct_busy_cv, &ct->ct_lock);
+ if ((cp = ct->ct_freelist) == NULL) {
+ ct->ct_ncallb++;
+ cp = (callb_t *)kmem_zalloc(sizeof (callb_t), KM_SLEEP);
+ }
+ ct->ct_freelist = cp->c_next;
+ cp->c_thread = t;
+ cp->c_func = func;
+ cp->c_arg = arg;
+ cp->c_class = (uchar_t)class;
+ cp->c_flag |= CALLB_TAKEN;
+#ifdef DEBUG
+ if (strlen(name) > CB_MAXNAME)
+ cmn_err(CE_WARN, "callb_add: name of callback function '%s' "
+ "too long -- truncated to %d chars",
+ name, CB_MAXNAME);
+#endif
+ (void) strncpy(cp->c_name, name, CB_MAXNAME);
+ cp->c_name[CB_MAXNAME] = '\0';
+
+ /*
+ * Insert the new callb at the head of its class list.
+ */
+ cp->c_next = ct->ct_first_cb[class];
+ ct->ct_first_cb[class] = cp;
+
+ mutex_exit(&ct->ct_lock);
+ return ((callb_id_t)cp);
+}
+
+/*
+ * The default function to add an entry to the callback table. Since
+ * it uses curthread as the thread identifier to store in the table,
+ * it should be used for the normal case of a thread which is calling
+ * to add ITSELF to the table.
+ */
+callb_id_t
+callb_add(boolean_t (*func)(void *arg, int code),
+ void *arg, int class, char *name)
+{
+ return (callb_add_common(func, arg, class, name, curthread));
+}
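+
+/*
+ * Typical usage (an illustrative sketch; the names are examples only):
+ *
+ *	callb_id_t id = callb_add(my_cpr_handler, arg, CB_CL_CPR_DAEMON,
+ *	    "mydaemon");
+ *	...
+ *	(void) callb_delete(id);
+ */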
+
+/*
+ * A special version of callb_add() above for use by threads which
+ * might be adding an entry to the table on behalf of some other
+ * thread (for example, one which is constructed but not yet running).
+ * In this version the thread id is an argument.
+ */
+callb_id_t
+callb_add_thread(boolean_t (*func)(void *arg, int code),
+ void *arg, int class, char *name, kthread_id_t t)
+{
+ return (callb_add_common(func, arg, class, name, t));
+}
+
+/*
+ * callb_delete() is called to remove an entry identified by id that
+ * was originally placed there by a call to callb_add().
+ * Returns -1 if it fails to delete the callb entry, otherwise 0.
+ */
+int
+callb_delete(callb_id_t id)
+{
+ callb_t **pp;
+ callb_t *me = (callb_t *)id;
+
+ mutex_enter(&ct->ct_lock);
+
+ for (;;) {
+ pp = &ct->ct_first_cb[me->c_class];
+ while (*pp != NULL && *pp != me)
+ pp = &(*pp)->c_next;
+
+#ifdef DEBUG
+ if (*pp != me) {
+ cmn_err(CE_WARN, "callb delete bogus entry 0x%p",
+ (void *)me);
+ mutex_exit(&ct->ct_lock);
+ return (-1);
+ }
+#endif /* DEBUG */
+
+		/*
+		 * A callb may not be deleted while it is executing;
+		 * otherwise callb_execute_class() would be confused.
+		 */
+ if (!(me->c_flag & CALLB_EXECUTING))
+ break;
+
+ cv_wait(&me->c_done_cv, &ct->ct_lock);
+ }
+ /* relink the class list */
+ *pp = me->c_next;
+
+ /* clean up myself and return the free callb to the head of freelist */
+ me->c_flag = CALLB_FREE;
+ me->c_next = ct->ct_freelist;
+ ct->ct_freelist = me;
+
+ mutex_exit(&ct->ct_lock);
+ return (0);
+}
+
+/*
+ * class: execute all callbs in this class;
+ * code:  optional argument for the callb functions.
+ * return: == 0: success
+ *         != 0: ptr to the name supplied when the failing callback
+ *               was registered
+ */
+void *
+callb_execute_class(int class, int code)
+{
+ callb_t *cp;
+ void *ret = NULL;
+
+ ASSERT(class < NCBCLASS);
+
+ mutex_enter(&ct->ct_lock);
+
+ for (cp = ct->ct_first_cb[class];
+ cp != NULL && ret == 0; cp = cp->c_next) {
+ while (cp->c_flag & CALLB_EXECUTING)
+ cv_wait(&cp->c_done_cv, &ct->ct_lock);
+		/*
+		 * Continue if the callb was deleted while we were
+		 * sleeping.
+		 */
+ if (cp->c_flag == CALLB_FREE)
+ continue;
+ cp->c_flag |= CALLB_EXECUTING;
+
+#ifdef CALLB_DEBUG
+ printf("callb_execute: name=%s func=%p arg=%p\n",
+ cp->c_name, (void *)cp->c_func, (void *)cp->c_arg);
+#endif /* CALLB_DEBUG */
+
+ mutex_exit(&ct->ct_lock);
+ /* If callback function fails, pass back client's name */
+ if (!(*cp->c_func)(cp->c_arg, code))
+ ret = cp->c_name;
+ mutex_enter(&ct->ct_lock);
+
+ cp->c_flag &= ~CALLB_EXECUTING;
+ cv_broadcast(&cp->c_done_cv);
+ }
+ mutex_exit(&ct->ct_lock);
+ return (ret);
+}
+
+/*
+ * Callers must ensure there are no recursive entries to this function.
+ * dp->cc_lockp is registered by callb_add to protect the callb_cpr_t
+ * structure.
+ *
+ * When calling to stop a kernel thread (code == CB_CODE_CPR_CHKPT) we
+ * use cv_reltimedwait() in case the kernel thread is blocked.
+ *
+ * Note that this is a generic callback handler for daemon CPR and
+ * should NOT be changed to accommodate any specific requirement in a
+ * daemon.  Individual daemons that require changes to the handler
+ * shall write callback routines in their own daemon modules.
+ */
+boolean_t
+callb_generic_cpr(void *arg, int code)
+{
+ callb_cpr_t *cp = (callb_cpr_t *)arg;
+ clock_t ret = 0; /* assume success */
+
+ mutex_enter(cp->cc_lockp);
+
+ switch (code) {
+ case CB_CODE_CPR_CHKPT:
+ cp->cc_events |= CALLB_CPR_START;
+#ifdef CPR_NOT_THREAD_SAFE
+ while (!(cp->cc_events & CALLB_CPR_SAFE))
+			/* cv_reltimedwait() returns -1 if it times out. */
+ if ((ret = cv_reltimedwait(&cp->cc_callb_cv,
+ cp->cc_lockp, (callb_timeout_sec * hz),
+ TR_CLOCK_TICK)) == -1)
+ break;
+#endif
+ break;
+
+ case CB_CODE_CPR_RESUME:
+ cp->cc_events &= ~CALLB_CPR_START;
+ cv_signal(&cp->cc_stop_cv);
+ break;
+ }
+ mutex_exit(cp->cc_lockp);
+ return (ret != -1);
+}
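+
+/*
+ * Daemons normally drive this handler through the CALLB_CPR_* macros
+ * rather than calling it directly (an illustrative sketch):
+ *
+ *	CALLB_CPR_INIT(&cprinfo, &lock, callb_generic_cpr, "mydaemon");
+ *	mutex_enter(&lock);
+ *	for (;;) {
+ *		CALLB_CPR_SAFE_BEGIN(&cprinfo);
+ *		cv_wait(&cv, &lock);
+ *		CALLB_CPR_SAFE_END(&cprinfo, &lock);
+ *		... do work ...
+ *	}
+ *	CALLB_CPR_EXIT(&cprinfo);	(this also drops the lock)
+ */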
+
+/*
+ * The generic callback function associated with kernel threads which
+ * are always considered safe.
+ */
+/* ARGSUSED */
+boolean_t
+callb_generic_cpr_safe(void *arg, int code)
+{
+ return (B_TRUE);
+}
+
+/*
+ * Prevent additions to callback table.
+ */
+void
+callb_lock_table(void)
+{
+ mutex_enter(&ct->ct_lock);
+ ASSERT(ct->ct_busy == 0);
+ ct->ct_busy = 1;
+ mutex_exit(&ct->ct_lock);
+}
+
+/*
+ * Allow additions to callback table.
+ */
+void
+callb_unlock_table(void)
+{
+ mutex_enter(&ct->ct_lock);
+ ASSERT(ct->ct_busy != 0);
+ ct->ct_busy = 0;
+ cv_broadcast(&ct->ct_busy_cv);
+ mutex_exit(&ct->ct_lock);
+}
+
+#ifdef illumos
+/*
+ * Return a boolean value indicating whether a particular kernel thread is
+ * stopped in accordance with the cpr callback protocol. If returning
+ * false, also return a pointer to the thread name via the 2nd argument.
+ */
+boolean_t
+callb_is_stopped(kthread_id_t tp, caddr_t *thread_name)
+{
+ callb_t *cp;
+ boolean_t ret_val;
+
+ mutex_enter(&ct->ct_lock);
+
+ for (cp = ct->ct_first_cb[CB_CL_CPR_DAEMON];
+ cp != NULL && tp != cp->c_thread; cp = cp->c_next)
+ ;
+
+ ret_val = (cp != NULL);
+ if (ret_val) {
+ /*
+ * We found the thread in the callback table and have
+ * provisionally set the return value to true. Now
+ * see if it is marked "safe" and is sleeping or stopped.
+ */
+ callb_cpr_t *ccp = (callb_cpr_t *)cp->c_arg;
+
+ *thread_name = cp->c_name; /* in case not stopped */
+ mutex_enter(ccp->cc_lockp);
+
+ if (ccp->cc_events & CALLB_CPR_SAFE) {
+ int retry;
+
+ mutex_exit(ccp->cc_lockp);
+ for (retry = 0; retry < CALLB_MAX_RETRY; retry++) {
+ thread_lock(tp);
+ if (tp->t_state & (TS_SLEEP | TS_STOPPED)) {
+ thread_unlock(tp);
+ break;
+ }
+ thread_unlock(tp);
+ delay(CALLB_THREAD_DELAY);
+ }
+ ret_val = retry < CALLB_MAX_RETRY;
+ } else {
+ ret_val =
+ (ccp->cc_events & CALLB_CPR_ALWAYS_SAFE) != 0;
+ mutex_exit(ccp->cc_lockp);
+ }
+ } else {
+ /*
+ * Thread not found in callback table. Make the best
+ * attempt to identify the thread in the error message.
+ */
+ ulong_t offset;
+ char *sym = kobj_getsymname((uintptr_t)tp->t_startpc,
+ &offset);
+
+ *thread_name = sym ? sym : "*unknown*";
+ }
+
+ mutex_exit(&ct->ct_lock);
+ return (ret_val);
+}
+#endif /* illumos */
+
+SYSINIT(sol_callb, SI_SUB_DRIVERS, SI_ORDER_FIRST, callb_init, NULL);
+SYSUNINIT(sol_callb, SI_SUB_DRIVERS, SI_ORDER_FIRST, callb_fini, NULL);
diff --git a/sys/cddl/contrib/opensolaris/uts/common/os/fm.c b/sys/cddl/contrib/opensolaris/uts/common/os/fm.c
new file mode 100644
index 000000000000..21aac7a1b49d
--- /dev/null
+++ b/sys/cddl/contrib/opensolaris/uts/common/os/fm.c
@@ -0,0 +1,1399 @@
+/*
+ * CDDL HEADER START
+ *
+ * The contents of this file are subject to the terms of the
+ * Common Development and Distribution License (the "License").
+ * You may not use this file except in compliance with the License.
+ *
+ * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
+ * or http://www.opensolaris.org/os/licensing.
+ * See the License for the specific language governing permissions
+ * and limitations under the License.
+ *
+ * When distributing Covered Code, include this CDDL HEADER in each
+ * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
+ * If applicable, add the following below this CDDL HEADER, with the
+ * fields enclosed by brackets "[]" replaced with your own identifying
+ * information: Portions Copyright [yyyy] [name of copyright owner]
+ *
+ * CDDL HEADER END
+ */
+/*
+ * Copyright (c) 2004, 2010, Oracle and/or its affiliates. All rights reserved.
+ */
+
+/*
+ * Fault Management Architecture (FMA) Resource and Protocol Support
+ *
+ * The routines contained herein provide services to support kernel subsystems
+ * in publishing fault management telemetry (see PSARC 2002/412 and 2003/089).
+ *
+ * Name-Value Pair Lists
+ *
+ * The embodiment of an FMA protocol element (event, fmri or authority) is a
+ * name-value pair list (nvlist_t).  FMA-specific nvlist constructor and
+ * destructor functions, fm_nvlist_create() and fm_nvlist_destroy(), are used
+ * to create an nvpair list using custom allocators. Callers may choose to
+ * allocate either from the kernel memory allocator, or from a preallocated
+ * buffer, useful in constrained contexts like high-level interrupt routines.
+ *
+ * Protocol Event and FMRI Construction
+ *
+ * Convenience routines are provided to construct nvlist events according to
+ * the FMA Event Protocol and Naming Schema specification for ereports and
+ * FMRIs for the dev, cpu, hc, mem, legacy hc and de schemes.
+ *
+ * ENA Manipulation
+ *
+ * Routines to generate ENA formats 0, 1 and 2 are available as well as
+ * routines to increment formats 1 and 2. Individual fields within the
+ * ENA are extractable via fm_ena_time_get(), fm_ena_id_get(),
+ * fm_ena_format_get() and fm_ena_gen_get().
+ */
+
+#include <sys/types.h>
+#include <sys/time.h>
+#include <sys/sysevent.h>
+#include <sys/nvpair.h>
+#include <sys/cmn_err.h>
+#include <sys/cpuvar.h>
+#include <sys/sysmacros.h>
+#include <sys/systm.h>
+#include <sys/compress.h>
+#include <sys/cpuvar.h>
+#include <sys/kobj.h>
+#include <sys/kstat.h>
+#include <sys/processor.h>
+#include <sys/pcpu.h>
+#include <sys/sunddi.h>
+#include <sys/systeminfo.h>
+#include <sys/sysevent/eventdefs.h>
+#include <sys/fm/util.h>
+#include <sys/fm/protocol.h>
+
+/*
+ * URL and SUNW-MSG-ID value to display for fm_panic(), defined below. These
+ * values must be kept in sync with the FMA source code in usr/src/cmd/fm.
+ */
+static const char *fm_url = "http://www.sun.com/msg";
+static const char *fm_msgid = "SUNOS-8000-0G";
+static char *volatile fm_panicstr = NULL;
+
+#ifdef illumos
+errorq_t *ereport_errorq;
+#endif
+void *ereport_dumpbuf;
+size_t ereport_dumplen;
+
+static uint_t ereport_chanlen = ERPT_EVCH_MAX;
+static evchan_t *ereport_chan = NULL;
+static ulong_t ereport_qlen = 0;
+static size_t ereport_size = 0;
+static int ereport_cols = 80;
+
+extern void fastreboot_disable_highpil(void);
+
+/*
+ * Common fault management kstats to record ereport generation
+ * failures
+ */
+
+struct erpt_kstat {
+ kstat_named_t erpt_dropped; /* num erpts dropped on post */
+ kstat_named_t erpt_set_failed; /* num erpt set failures */
+ kstat_named_t fmri_set_failed; /* num fmri set failures */
+ kstat_named_t payload_set_failed; /* num payload set failures */
+};
+
+static struct erpt_kstat erpt_kstat_data = {
+ { "erpt-dropped", KSTAT_DATA_UINT64 },
+ { "erpt-set-failed", KSTAT_DATA_UINT64 },
+ { "fmri-set-failed", KSTAT_DATA_UINT64 },
+ { "payload-set-failed", KSTAT_DATA_UINT64 }
+};
+
+#ifdef illumos
+/*ARGSUSED*/
+static void
+fm_drain(void *private, void *data, errorq_elem_t *eep)
+{
+ nvlist_t *nvl = errorq_elem_nvl(ereport_errorq, eep);
+
+ if (!KERNEL_PANICKED())
+ (void) fm_ereport_post(nvl, EVCH_TRYHARD);
+ else
+ fm_nvprint(nvl);
+}
+#endif
+
+void
+fm_init(void)
+{
+ kstat_t *ksp;
+
+#ifdef illumos
+ (void) sysevent_evc_bind(FM_ERROR_CHAN,
+ &ereport_chan, EVCH_CREAT | EVCH_HOLD_PEND);
+
+ (void) sysevent_evc_control(ereport_chan,
+ EVCH_SET_CHAN_LEN, &ereport_chanlen);
+#endif
+
+ if (ereport_qlen == 0)
+ ereport_qlen = ERPT_MAX_ERRS * MAX(max_ncpus, 4);
+
+ if (ereport_size == 0)
+ ereport_size = ERPT_DATA_SZ;
+
+#ifdef illumos
+ ereport_errorq = errorq_nvcreate("fm_ereport_queue",
+ (errorq_func_t)fm_drain, NULL, ereport_qlen, ereport_size,
+ FM_ERR_PIL, ERRORQ_VITAL);
+ if (ereport_errorq == NULL)
+ panic("failed to create required ereport error queue");
+#endif
+
+ ereport_dumpbuf = kmem_alloc(ereport_size, KM_SLEEP);
+ ereport_dumplen = ereport_size;
+
+ /* Initialize ereport allocation and generation kstats */
+ ksp = kstat_create("unix", 0, "fm", "misc", KSTAT_TYPE_NAMED,
+ sizeof (struct erpt_kstat) / sizeof (kstat_named_t),
+ KSTAT_FLAG_VIRTUAL);
+
+ if (ksp != NULL) {
+ ksp->ks_data = &erpt_kstat_data;
+ kstat_install(ksp);
+	} else {
+		cmn_err(CE_NOTE, "failed to create fm/misc kstat\n");
+	}
+}
+
+#ifdef illumos
+/*
+ * Formatting utility function for fm_nvprintr. We attempt to wrap chunks of
+ * output so they aren't split across console lines, and return the end column.
+ */
+/*PRINTFLIKE4*/
+static int
+fm_printf(int depth, int c, int cols, const char *format, ...)
+{
+ va_list ap;
+ int width;
+ char c1;
+
+ va_start(ap, format);
+ width = vsnprintf(&c1, sizeof (c1), format, ap);
+ va_end(ap);
+
+ if (c + width >= cols) {
+ console_printf("\n\r");
+ c = 0;
+ if (format[0] != ' ' && depth > 0) {
+ console_printf(" ");
+ c++;
+ }
+ }
+
+ va_start(ap, format);
+ console_vprintf(format, ap);
+ va_end(ap);
+
+ return ((c + width) % cols);
+}
+
+/*
+ * Recursively print a nvlist in the specified column width and return the
+ * column we end up in. This function is called recursively by fm_nvprint(),
+ * below. We generically format the entire nvpair using hexadecimal
+ * integers and strings, and elide any integer arrays. Arrays are basically
+ * used for cache dumps right now, so we suppress them so as not to overwhelm
+ * the amount of console output we produce at panic time. This can be further
+ * enhanced as FMA technology grows based upon the needs of consumers. All
+ * FMA telemetry is logged using the dump device transport, so the console
+ * output serves only as a fallback in case this procedure is unsuccessful.
+ */
+static int
+fm_nvprintr(nvlist_t *nvl, int d, int c, int cols)
+{
+ nvpair_t *nvp;
+
+ for (nvp = nvlist_next_nvpair(nvl, NULL);
+ nvp != NULL; nvp = nvlist_next_nvpair(nvl, nvp)) {
+
+ data_type_t type = nvpair_type(nvp);
+ const char *name = nvpair_name(nvp);
+
+ boolean_t b;
+ uint8_t i8;
+ uint16_t i16;
+ uint32_t i32;
+ uint64_t i64;
+ char *str;
+ nvlist_t *cnv;
+
+ if (strcmp(name, FM_CLASS) == 0)
+ continue; /* already printed by caller */
+
+ c = fm_printf(d, c, cols, " %s=", name);
+
+ switch (type) {
+ case DATA_TYPE_BOOLEAN:
+ c = fm_printf(d + 1, c, cols, " 1");
+ break;
+
+ case DATA_TYPE_BOOLEAN_VALUE:
+ (void) nvpair_value_boolean_value(nvp, &b);
+ c = fm_printf(d + 1, c, cols, b ? "1" : "0");
+ break;
+
+ case DATA_TYPE_BYTE:
+ (void) nvpair_value_byte(nvp, &i8);
+ c = fm_printf(d + 1, c, cols, "%x", i8);
+ break;
+
+ case DATA_TYPE_INT8:
+ (void) nvpair_value_int8(nvp, (void *)&i8);
+ c = fm_printf(d + 1, c, cols, "%x", i8);
+ break;
+
+ case DATA_TYPE_UINT8:
+ (void) nvpair_value_uint8(nvp, &i8);
+ c = fm_printf(d + 1, c, cols, "%x", i8);
+ break;
+
+ case DATA_TYPE_INT16:
+ (void) nvpair_value_int16(nvp, (void *)&i16);
+ c = fm_printf(d + 1, c, cols, "%x", i16);
+ break;
+
+ case DATA_TYPE_UINT16:
+ (void) nvpair_value_uint16(nvp, &i16);
+ c = fm_printf(d + 1, c, cols, "%x", i16);
+ break;
+
+ case DATA_TYPE_INT32:
+ (void) nvpair_value_int32(nvp, (void *)&i32);
+ c = fm_printf(d + 1, c, cols, "%x", i32);
+ break;
+
+ case DATA_TYPE_UINT32:
+ (void) nvpair_value_uint32(nvp, &i32);
+ c = fm_printf(d + 1, c, cols, "%x", i32);
+ break;
+
+ case DATA_TYPE_INT64:
+ (void) nvpair_value_int64(nvp, (void *)&i64);
+ c = fm_printf(d + 1, c, cols, "%llx",
+ (u_longlong_t)i64);
+ break;
+
+ case DATA_TYPE_UINT64:
+ (void) nvpair_value_uint64(nvp, &i64);
+ c = fm_printf(d + 1, c, cols, "%llx",
+ (u_longlong_t)i64);
+ break;
+
+ case DATA_TYPE_HRTIME:
+ (void) nvpair_value_hrtime(nvp, (void *)&i64);
+ c = fm_printf(d + 1, c, cols, "%llx",
+ (u_longlong_t)i64);
+ break;
+
+ case DATA_TYPE_STRING:
+ (void) nvpair_value_string(nvp, &str);
+ c = fm_printf(d + 1, c, cols, "\"%s\"",
+ str ? str : "<NULL>");
+ break;
+
+ case DATA_TYPE_NVLIST:
+ c = fm_printf(d + 1, c, cols, "[");
+ (void) nvpair_value_nvlist(nvp, &cnv);
+ c = fm_nvprintr(cnv, d + 1, c, cols);
+ c = fm_printf(d + 1, c, cols, " ]");
+ break;
+
+ case DATA_TYPE_NVLIST_ARRAY: {
+ nvlist_t **val;
+ uint_t i, nelem;
+
+ c = fm_printf(d + 1, c, cols, "[");
+ (void) nvpair_value_nvlist_array(nvp, &val, &nelem);
+ for (i = 0; i < nelem; i++) {
+ c = fm_nvprintr(val[i], d + 1, c, cols);
+ }
+ c = fm_printf(d + 1, c, cols, " ]");
+ }
+ break;
+
+ case DATA_TYPE_BOOLEAN_ARRAY:
+ case DATA_TYPE_BYTE_ARRAY:
+ case DATA_TYPE_INT8_ARRAY:
+ case DATA_TYPE_UINT8_ARRAY:
+ case DATA_TYPE_INT16_ARRAY:
+ case DATA_TYPE_UINT16_ARRAY:
+ case DATA_TYPE_INT32_ARRAY:
+ case DATA_TYPE_UINT32_ARRAY:
+ case DATA_TYPE_INT64_ARRAY:
+ case DATA_TYPE_UINT64_ARRAY:
+ case DATA_TYPE_STRING_ARRAY:
+ c = fm_printf(d + 1, c, cols, "[...]");
+ break;
+ case DATA_TYPE_UNKNOWN:
+ c = fm_printf(d + 1, c, cols, "<unknown>");
+ break;
+ }
+ }
+
+ return (c);
+}
+
+void
+fm_nvprint(nvlist_t *nvl)
+{
+ char *class;
+ int c = 0;
+
+ console_printf("\r");
+
+ if (nvlist_lookup_string(nvl, FM_CLASS, &class) == 0)
+ c = fm_printf(0, c, ereport_cols, "%s", class);
+
+ if (fm_nvprintr(nvl, 0, c, ereport_cols) != 0)
+ console_printf("\n");
+
+ console_printf("\n");
+}
+
+/*
+ * Wrapper for panic() that first produces an FMA-style message for admins.
+ * Normally such messages are generated by fmd(1M)'s syslog-msgs agent: this
+ * is the one exception to that rule and the only error that gets messaged.
+ * This function is intended for use by subsystems that have detected a fatal
+ * error and enqueued appropriate ereports and wish to then force a panic.
+ */
+/*PRINTFLIKE1*/
+void
+fm_panic(const char *format, ...)
+{
+ va_list ap;
+
+ (void) atomic_cas_ptr((void *)&fm_panicstr, NULL, (void *)format);
+#if defined(__i386) || defined(__amd64)
+ fastreboot_disable_highpil();
+#endif /* __i386 || __amd64 */
+ va_start(ap, format);
+ vpanic(format, ap);
+ va_end(ap);
+}
+
+/*
+ * Simply tell the caller if fm_panicstr is set, i.e. an FMA event has
+ * caused the panic. If so, something other than the default panic
+ * diagnosis method will diagnose the cause of the panic.
+ */
+int
+is_fm_panic()
+{
+ if (fm_panicstr)
+ return (1);
+ else
+ return (0);
+}
+
+/*
+ * Print any appropriate FMA banner message before the panic message. This
+ * function is called by panicsys() and prints the message for fm_panic().
+ * We print the message here so that it comes after the system is quiesced.
+ * A one-line summary is recorded in the log only (cmn_err(9F) with "!" prefix).
+ * The rest of the message is for the console only and not needed in the log,
+ * so it is printed using console_printf(). We break it up into multiple
+ * chunks so as to avoid overflowing any small legacy prom_printf() buffers.
+ */
+void
+fm_banner(void)
+{
+ timespec_t tod;
+ hrtime_t now;
+
+ if (!fm_panicstr)
+ return; /* panic was not initiated by fm_panic(); do nothing */
+
+ if (KERNEL_PANICKED()) {
+ tod = panic_hrestime;
+ now = panic_hrtime;
+ } else {
+ gethrestime(&tod);
+ now = gethrtime_waitfree();
+ }
+
+ cmn_err(CE_NOTE, "!SUNW-MSG-ID: %s, "
+ "TYPE: Error, VER: 1, SEVERITY: Major\n", fm_msgid);
+
+ console_printf(
+"\n\rSUNW-MSG-ID: %s, TYPE: Error, VER: 1, SEVERITY: Major\n"
+"EVENT-TIME: 0x%lx.0x%lx (0x%llx)\n",
+ fm_msgid, tod.tv_sec, tod.tv_nsec, (u_longlong_t)now);
+
+ console_printf(
+"PLATFORM: %s, CSN: -, HOSTNAME: %s\n"
+"SOURCE: %s, REV: %s %s\n",
+ platform, utsname.nodename, utsname.sysname,
+ utsname.release, utsname.version);
+
+ console_printf(
+"DESC: Errors have been detected that require a reboot to ensure system\n"
+"integrity. See %s/%s for more information.\n",
+ fm_url, fm_msgid);
+
+ console_printf(
+"AUTO-RESPONSE: Solaris will attempt to save and diagnose the error telemetry\n"
+"IMPACT: The system will sync files, save a crash dump if needed, and reboot\n"
+"REC-ACTION: Save the error summary below in case telemetry cannot be saved\n");
+
+ console_printf("\n");
+}
+
+/*
+ * Utility function to write all of the pending ereports to the dump device.
+ * This function is called at either normal reboot or panic time, and simply
+ * iterates over the in-transit messages in the ereport sysevent channel.
+ */
+void
+fm_ereport_dump(void)
+{
+ evchanq_t *chq;
+ sysevent_t *sep;
+ erpt_dump_t ed;
+
+ timespec_t tod;
+ hrtime_t now;
+ char *buf;
+ size_t len;
+
+ if (KERNEL_PANICKED()) {
+ tod = panic_hrestime;
+ now = panic_hrtime;
+ } else {
+ if (ereport_errorq != NULL)
+ errorq_drain(ereport_errorq);
+ gethrestime(&tod);
+ now = gethrtime_waitfree();
+ }
+
+ /*
+ * In the panic case, sysevent_evc_walk_init() will return NULL.
+ */
+ if ((chq = sysevent_evc_walk_init(ereport_chan, NULL)) == NULL &&
+ !KERNEL_PANICKED())
+ return; /* event channel isn't initialized yet */
+
+ while ((sep = sysevent_evc_walk_step(chq)) != NULL) {
+ if ((buf = sysevent_evc_event_attr(sep, &len)) == NULL)
+ break;
+
+ ed.ed_magic = ERPT_MAGIC;
+ ed.ed_chksum = checksum32(buf, len);
+ ed.ed_size = (uint32_t)len;
+ ed.ed_pad = 0;
+ ed.ed_hrt_nsec = SE_TIME(sep);
+ ed.ed_hrt_base = now;
+ ed.ed_tod_base.sec = tod.tv_sec;
+ ed.ed_tod_base.nsec = tod.tv_nsec;
+
+ dumpvp_write(&ed, sizeof (ed));
+ dumpvp_write(buf, len);
+ }
+
+ sysevent_evc_walk_fini(chq);
+}
+#endif
+
+/*
+ * Post an error report (ereport) to the sysevent error channel. The error
+ * channel must be established with a prior call to sysevent_evc_create()
+ * before publication may occur.
+ */
+void
+fm_ereport_post(nvlist_t *ereport, int evc_flag)
+{
+ size_t nvl_size = 0;
+ evchan_t *error_chan;
+ sysevent_id_t eid;
+
+ (void) nvlist_size(ereport, &nvl_size, NV_ENCODE_NATIVE);
+ if (nvl_size > ERPT_DATA_SZ || nvl_size == 0) {
+ atomic_inc_64(&erpt_kstat_data.erpt_dropped.value.ui64);
+ return;
+ }
+
+#ifdef illumos
+ if (sysevent_evc_bind(FM_ERROR_CHAN, &error_chan,
+ EVCH_CREAT|EVCH_HOLD_PEND) != 0) {
+ atomic_inc_64(&erpt_kstat_data.erpt_dropped.value.ui64);
+ return;
+ }
+
+ if (sysevent_evc_publish(error_chan, EC_FM, ESC_FM_ERROR,
+ SUNW_VENDOR, FM_PUB, ereport, evc_flag) != 0) {
+ atomic_inc_64(&erpt_kstat_data.erpt_dropped.value.ui64);
+ (void) sysevent_evc_unbind(error_chan);
+ return;
+ }
+ (void) sysevent_evc_unbind(error_chan);
+#else
+ (void) ddi_log_sysevent(NULL, SUNW_VENDOR, EC_DEV_STATUS,
+ ESC_DEV_DLE, ereport, &eid, DDI_SLEEP);
+#endif
+}
+
+/*
+ * Wrappers for FM nvlist allocators
+ */
+/* ARGSUSED */
+static void *
+i_fm_alloc(nv_alloc_t *nva, size_t size)
+{
+ return (kmem_zalloc(size, KM_SLEEP));
+}
+
+/* ARGSUSED */
+static void
+i_fm_free(nv_alloc_t *nva, void *buf, size_t size)
+{
+ kmem_free(buf, size);
+}
+
+const nv_alloc_ops_t fm_mem_alloc_ops = {
+ NULL,
+ NULL,
+ i_fm_alloc,
+ i_fm_free,
+ NULL
+};
+
+/*
+ * Create and initialize a new nv_alloc_t for a fixed buffer, buf. A pointer
+ * to the newly allocated nv_alloc_t structure is returned upon success or NULL
+ * is returned to indicate that the nv_alloc structure could not be created.
+ */
+nv_alloc_t *
+fm_nva_xcreate(char *buf, size_t bufsz)
+{
+ nv_alloc_t *nvhdl = kmem_zalloc(sizeof (nv_alloc_t), KM_SLEEP);
+
+ if (bufsz == 0 || nv_alloc_init(nvhdl, nv_fixed_ops, buf, bufsz) != 0) {
+ kmem_free(nvhdl, sizeof (nv_alloc_t));
+ return (NULL);
+ }
+
+ return (nvhdl);
+}
+
+/*
+ * Destroy a previously allocated nv_alloc structure. The fixed buffer
+ * associated with nva must be freed by the caller.
+ */
+void
+fm_nva_xdestroy(nv_alloc_t *nva)
+{
+ nv_alloc_fini(nva);
+ kmem_free(nva, sizeof (nv_alloc_t));
+}
+
+/*
+ * Create a new nv list. A pointer to the new nv list structure is returned
+ * upon success, or NULL is returned to indicate that the structure could
+ * not be created. The new nv list is managed by the operations installed
+ * in nva. If nva is NULL, the default FMA nva operations are installed
+ * and used.
+ *
+ * When called from the kernel and nva == NULL, this function must be called
+ * from passive kernel context with no locks held that can prevent a
+ * sleeping memory allocation from occurring. Otherwise, this function may
+ * be called from other kernel contexts as long as a valid nva created via
+ * fm_nva_create() is supplied.
+ */
+nvlist_t *
+fm_nvlist_create(nv_alloc_t *nva)
+{
+ int hdl_alloced = 0;
+ nvlist_t *nvl;
+ nv_alloc_t *nvhdl;
+
+ if (nva == NULL) {
+ nvhdl = kmem_zalloc(sizeof (nv_alloc_t), KM_SLEEP);
+
+ if (nv_alloc_init(nvhdl, &fm_mem_alloc_ops, NULL, 0) != 0) {
+ kmem_free(nvhdl, sizeof (nv_alloc_t));
+ return (NULL);
+ }
+ hdl_alloced = 1;
+ } else {
+ nvhdl = nva;
+ }
+
+ if (nvlist_xalloc(&nvl, NV_UNIQUE_NAME, nvhdl) != 0) {
+ if (hdl_alloced) {
+ nv_alloc_fini(nvhdl);
+ kmem_free(nvhdl, sizeof (nv_alloc_t));
+ }
+ return (NULL);
+ }
+
+ return (nvl);
+}
+
+/*
+ * Destroy a previously allocated nvlist structure. flag indicates whether
+ * or not the associated nva structure should be freed (FM_NVA_FREE) or
+ * retained (FM_NVA_RETAIN). Retaining the nv alloc structure allows
+ * it to be re-used for future nvlist creation operations.
+ */
+void
+fm_nvlist_destroy(nvlist_t *nvl, int flag)
+{
+ nv_alloc_t *nva = nvlist_lookup_nv_alloc(nvl);
+
+ nvlist_free(nvl);
+
+ if (nva != NULL) {
+ if (flag == FM_NVA_FREE)
+ fm_nva_xdestroy(nva);
+ }
+}
+
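+/*
+ * A minimal usage sketch (illustrative, not from the original source):
+ * the typical pairing of fm_nvlist_create() and fm_nvlist_destroy(). With
+ * nva == NULL the default FMA allocator is used, and FM_NVA_FREE releases
+ * it together with the list. The member name below is hypothetical.
+ *
+ *	nvlist_t *nvl = fm_nvlist_create(NULL);
+ *
+ *	if (nvl != NULL) {
+ *		(void) nvlist_add_string(nvl, "example-member", "value");
+ *		fm_nvlist_destroy(nvl, FM_NVA_FREE);
+ *	}
+ */
+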
+int
+i_fm_payload_set(nvlist_t *payload, const char *name, va_list ap)
+{
+ int nelem, ret = 0;
+ data_type_t type;
+
+ while (ret == 0 && name != NULL) {
+ type = va_arg(ap, data_type_t);
+ switch (type) {
+ case DATA_TYPE_BYTE:
+ ret = nvlist_add_byte(payload, name,
+ va_arg(ap, uint_t));
+ break;
+ case DATA_TYPE_BYTE_ARRAY:
+ nelem = va_arg(ap, int);
+ ret = nvlist_add_byte_array(payload, name,
+ va_arg(ap, uchar_t *), nelem);
+ break;
+ case DATA_TYPE_BOOLEAN_VALUE:
+ ret = nvlist_add_boolean_value(payload, name,
+ va_arg(ap, boolean_t));
+ break;
+ case DATA_TYPE_BOOLEAN_ARRAY:
+ nelem = va_arg(ap, int);
+ ret = nvlist_add_boolean_array(payload, name,
+ va_arg(ap, boolean_t *), nelem);
+ break;
+ case DATA_TYPE_INT8:
+ ret = nvlist_add_int8(payload, name,
+ va_arg(ap, int));
+ break;
+ case DATA_TYPE_INT8_ARRAY:
+ nelem = va_arg(ap, int);
+ ret = nvlist_add_int8_array(payload, name,
+ va_arg(ap, int8_t *), nelem);
+ break;
+ case DATA_TYPE_UINT8:
+ ret = nvlist_add_uint8(payload, name,
+ va_arg(ap, uint_t));
+ break;
+ case DATA_TYPE_UINT8_ARRAY:
+ nelem = va_arg(ap, int);
+ ret = nvlist_add_uint8_array(payload, name,
+ va_arg(ap, uint8_t *), nelem);
+ break;
+ case DATA_TYPE_INT16:
+ ret = nvlist_add_int16(payload, name,
+ va_arg(ap, int));
+ break;
+ case DATA_TYPE_INT16_ARRAY:
+ nelem = va_arg(ap, int);
+ ret = nvlist_add_int16_array(payload, name,
+ va_arg(ap, int16_t *), nelem);
+ break;
+ case DATA_TYPE_UINT16:
+ ret = nvlist_add_uint16(payload, name,
+ va_arg(ap, uint_t));
+ break;
+ case DATA_TYPE_UINT16_ARRAY:
+ nelem = va_arg(ap, int);
+ ret = nvlist_add_uint16_array(payload, name,
+ va_arg(ap, uint16_t *), nelem);
+ break;
+ case DATA_TYPE_INT32:
+ ret = nvlist_add_int32(payload, name,
+ va_arg(ap, int32_t));
+ break;
+ case DATA_TYPE_INT32_ARRAY:
+ nelem = va_arg(ap, int);
+ ret = nvlist_add_int32_array(payload, name,
+ va_arg(ap, int32_t *), nelem);
+ break;
+ case DATA_TYPE_UINT32:
+ ret = nvlist_add_uint32(payload, name,
+ va_arg(ap, uint32_t));
+ break;
+ case DATA_TYPE_UINT32_ARRAY:
+ nelem = va_arg(ap, int);
+ ret = nvlist_add_uint32_array(payload, name,
+ va_arg(ap, uint32_t *), nelem);
+ break;
+ case DATA_TYPE_INT64:
+ ret = nvlist_add_int64(payload, name,
+ va_arg(ap, int64_t));
+ break;
+ case DATA_TYPE_INT64_ARRAY:
+ nelem = va_arg(ap, int);
+ ret = nvlist_add_int64_array(payload, name,
+ va_arg(ap, int64_t *), nelem);
+ break;
+ case DATA_TYPE_UINT64:
+ ret = nvlist_add_uint64(payload, name,
+ va_arg(ap, uint64_t));
+ break;
+ case DATA_TYPE_UINT64_ARRAY:
+ nelem = va_arg(ap, int);
+ ret = nvlist_add_uint64_array(payload, name,
+ va_arg(ap, uint64_t *), nelem);
+ break;
+ case DATA_TYPE_STRING:
+ ret = nvlist_add_string(payload, name,
+ va_arg(ap, char *));
+ break;
+ case DATA_TYPE_STRING_ARRAY:
+ nelem = va_arg(ap, int);
+ ret = nvlist_add_string_array(payload, name,
+ va_arg(ap, char **), nelem);
+ break;
+ case DATA_TYPE_NVLIST:
+ ret = nvlist_add_nvlist(payload, name,
+ va_arg(ap, nvlist_t *));
+ break;
+ case DATA_TYPE_NVLIST_ARRAY:
+ nelem = va_arg(ap, int);
+ ret = nvlist_add_nvlist_array(payload, name,
+ va_arg(ap, nvlist_t **), nelem);
+ break;
+ default:
+ ret = EINVAL;
+ }
+
+ name = va_arg(ap, char *);
+ }
+ return (ret);
+}
+
+void
+fm_payload_set(nvlist_t *payload, ...)
+{
+ int ret;
+ const char *name;
+ va_list ap;
+
+ va_start(ap, payload);
+ name = va_arg(ap, char *);
+ ret = i_fm_payload_set(payload, name, ap);
+ va_end(ap);
+
+ if (ret)
+ atomic_inc_64(&erpt_kstat_data.payload_set_failed.value.ui64);
+}
+
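+/*
+ * Illustrative sketch of the varargs convention implied by the code above:
+ * the arguments are consumed as repeating tuples, terminated by a NULL
+ * name. Scalar tuples are (name, DATA_TYPE_*, value); array tuples are
+ * (name, DATA_TYPE_*_ARRAY, nelem, array pointer). The member names below
+ * are hypothetical.
+ *
+ *	fm_payload_set(payload,
+ *	    "retries", DATA_TYPE_UINT32, 3,
+ *	    "device", DATA_TYPE_STRING, "da0",
+ *	    NULL);
+ */
+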
+/*
+ * Set-up and validate the members of an ereport event according to:
+ *
+ * Member name Type Value
+ * ====================================================
+ * class string ereport
+ * version uint8_t 0
+ * ena uint64_t <ena>
+ * detector nvlist_t <detector>
+ * ereport-payload nvlist_t <var args>
+ *
+ * We don't actually add a 'version' member to the payload. Really,
+ * the version quoted to us by our caller is that of the category 1
+ * "ereport" event class (and we require FM_EREPORT_VERS0) but
+ * the payload version of the actual leaf class event under construction
+ * may be something else. Callers should supply a version in the varargs,
+ * or (better) we could take two version arguments - one for the
+ * ereport category 1 classification (expect FM_EREPORT_VERS0) and one
+ * for the leaf class.
+ */
+void
+fm_ereport_set(nvlist_t *ereport, int version, const char *erpt_class,
+ uint64_t ena, const nvlist_t *detector, ...)
+{
+ char ereport_class[FM_MAX_CLASS];
+ const char *name;
+ va_list ap;
+ int ret;
+
+ if (version != FM_EREPORT_VERS0) {
+ atomic_inc_64(&erpt_kstat_data.erpt_set_failed.value.ui64);
+ return;
+ }
+
+ (void) snprintf(ereport_class, FM_MAX_CLASS, "%s.%s",
+ FM_EREPORT_CLASS, erpt_class);
+ if (nvlist_add_string(ereport, FM_CLASS, ereport_class) != 0) {
+ atomic_inc_64(&erpt_kstat_data.erpt_set_failed.value.ui64);
+ return;
+ }
+
+ if (nvlist_add_uint64(ereport, FM_EREPORT_ENA, ena)) {
+ atomic_inc_64(&erpt_kstat_data.erpt_set_failed.value.ui64);
+ }
+
+ if (nvlist_add_nvlist(ereport, FM_EREPORT_DETECTOR,
+ (nvlist_t *)detector) != 0) {
+ atomic_inc_64(&erpt_kstat_data.erpt_set_failed.value.ui64);
+ }
+
+ va_start(ap, detector);
+ name = va_arg(ap, const char *);
+ ret = i_fm_payload_set(ereport, name, ap);
+ va_end(ap);
+
+ if (ret)
+ atomic_inc_64(&erpt_kstat_data.erpt_set_failed.value.ui64);
+}
+
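+/*
+ * Illustrative sketch combining the helpers in this file (the class string,
+ * payload member, and guid variables are hypothetical; EVCH_SLEEP is the
+ * illumos channel flag, and fm_fmri_zfs_set()/fm_ena_generate() are defined
+ * later in this file):
+ *
+ *	nvlist_t *ereport = fm_nvlist_create(NULL);
+ *	nvlist_t *detector = fm_nvlist_create(NULL);
+ *	uint64_t ena = fm_ena_generate(0, FM_ENA_FMT1);
+ *
+ *	fm_fmri_zfs_set(detector, ZFS_SCHEME_VERSION0, pool_guid, vdev_guid);
+ *	fm_ereport_set(ereport, FM_EREPORT_VERS0, "io.example.fault", ena,
+ *	    detector, "example-count", DATA_TYPE_UINT32, 1, NULL);
+ *	fm_ereport_post(ereport, EVCH_SLEEP);
+ */
+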
+/*
+ * Set-up and validate the members of an hc fmri according to:
+ *
+ * Member name Type Value
+ * ===================================================
+ * version uint8_t 0
+ * auth nvlist_t <auth>
+ * hc-name string <name>
+ * hc-id string <id>
+ *
+ * Note that auth and hc-id are optional members.
+ */
+
+#define HC_MAXPAIRS 20
+#define HC_MAXNAMELEN 50
+
+static int
+fm_fmri_hc_set_common(nvlist_t *fmri, int version, const nvlist_t *auth)
+{
+ if (version != FM_HC_SCHEME_VERSION) {
+ atomic_inc_64(&erpt_kstat_data.fmri_set_failed.value.ui64);
+ return (0);
+ }
+
+ if (nvlist_add_uint8(fmri, FM_VERSION, version) != 0 ||
+ nvlist_add_string(fmri, FM_FMRI_SCHEME, FM_FMRI_SCHEME_HC) != 0) {
+ atomic_inc_64(&erpt_kstat_data.fmri_set_failed.value.ui64);
+ return (0);
+ }
+
+ if (auth != NULL && nvlist_add_nvlist(fmri, FM_FMRI_AUTHORITY,
+ (nvlist_t *)auth) != 0) {
+ atomic_inc_64(&erpt_kstat_data.fmri_set_failed.value.ui64);
+ return (0);
+ }
+
+ return (1);
+}
+
+void
+fm_fmri_hc_set(nvlist_t *fmri, int version, const nvlist_t *auth,
+ nvlist_t *snvl, int npairs, ...)
+{
+ nv_alloc_t *nva = nvlist_lookup_nv_alloc(fmri);
+ nvlist_t *pairs[HC_MAXPAIRS];
+ va_list ap;
+ int i;
+
+ if (!fm_fmri_hc_set_common(fmri, version, auth))
+ return;
+
+ npairs = MIN(npairs, HC_MAXPAIRS);
+
+ va_start(ap, npairs);
+ for (i = 0; i < npairs; i++) {
+ const char *name = va_arg(ap, const char *);
+ uint32_t id = va_arg(ap, uint32_t);
+ char idstr[11];
+
+ (void) snprintf(idstr, sizeof (idstr), "%u", id);
+
+ pairs[i] = fm_nvlist_create(nva);
+ if (nvlist_add_string(pairs[i], FM_FMRI_HC_NAME, name) != 0 ||
+ nvlist_add_string(pairs[i], FM_FMRI_HC_ID, idstr) != 0) {
+ atomic_inc_64(
+ &erpt_kstat_data.fmri_set_failed.value.ui64);
+ }
+ }
+ va_end(ap);
+
+ if (nvlist_add_nvlist_array(fmri, FM_FMRI_HC_LIST, pairs, npairs) != 0)
+ atomic_inc_64(&erpt_kstat_data.fmri_set_failed.value.ui64);
+
+ for (i = 0; i < npairs; i++)
+ fm_nvlist_destroy(pairs[i], FM_NVA_RETAIN);
+
+ if (snvl != NULL) {
+ if (nvlist_add_nvlist(fmri, FM_FMRI_HC_SPECIFIC, snvl) != 0) {
+ atomic_inc_64(
+ &erpt_kstat_data.fmri_set_failed.value.ui64);
+ }
+ }
+}
+
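+/*
+ * Example call (a sketch with hypothetical element names): each varargs
+ * pair is a (const char *name, uint32_t id) hc element, outermost first.
+ *
+ *	fm_fmri_hc_set(fmri, FM_HC_SCHEME_VERSION, auth, NULL, 3,
+ *	    "motherboard", 0,
+ *	    "chip", 1,
+ *	    "cpu", 2);
+ */
+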
+/*
+ * Set-up and validate the members of a dev fmri according to:
+ *
+ * Member name Type Value
+ * ====================================================
+ * version uint8_t 0
+ * auth nvlist_t <auth>
+ * devpath string <devpath>
+ * [devid] string <devid>
+ * [target-port-l0id] string <target-port-lun0-id>
+ *
+ * Note that auth and devid are optional members.
+ */
+void
+fm_fmri_dev_set(nvlist_t *fmri_dev, int version, const nvlist_t *auth,
+ const char *devpath, const char *devid, const char *tpl0)
+{
+ int err = 0;
+
+ if (version != DEV_SCHEME_VERSION0) {
+ atomic_inc_64(&erpt_kstat_data.fmri_set_failed.value.ui64);
+ return;
+ }
+
+ err |= nvlist_add_uint8(fmri_dev, FM_VERSION, version);
+ err |= nvlist_add_string(fmri_dev, FM_FMRI_SCHEME, FM_FMRI_SCHEME_DEV);
+
+ if (auth != NULL) {
+ err |= nvlist_add_nvlist(fmri_dev, FM_FMRI_AUTHORITY,
+ (nvlist_t *)auth);
+ }
+
+ err |= nvlist_add_string(fmri_dev, FM_FMRI_DEV_PATH, devpath);
+
+ if (devid != NULL)
+ err |= nvlist_add_string(fmri_dev, FM_FMRI_DEV_ID, devid);
+
+ if (tpl0 != NULL)
+ err |= nvlist_add_string(fmri_dev, FM_FMRI_DEV_TGTPTLUN0, tpl0);
+
+ if (err)
+ atomic_inc_64(&erpt_kstat_data.fmri_set_failed.value.ui64);
+
+}
+
+/*
+ * Set-up and validate the members of a cpu fmri according to:
+ *
+ * Member name Type Value
+ * ====================================================
+ * version uint8_t 0
+ * auth nvlist_t <auth>
+ * cpuid uint32_t <cpu_id>
+ * cpumask uint8_t <cpu_mask>
+ * serial uint64_t <serial_id>
+ *
+ * Note that auth, cpumask, and serial are optional members.
+ *
+ */
+void
+fm_fmri_cpu_set(nvlist_t *fmri_cpu, int version, const nvlist_t *auth,
+ uint32_t cpu_id, uint8_t *cpu_maskp, const char *serial_idp)
+{
+ uint64_t *failedp = &erpt_kstat_data.fmri_set_failed.value.ui64;
+
+ if (version < CPU_SCHEME_VERSION1) {
+ atomic_inc_64(failedp);
+ return;
+ }
+
+ if (nvlist_add_uint8(fmri_cpu, FM_VERSION, version) != 0) {
+ atomic_inc_64(failedp);
+ return;
+ }
+
+ if (nvlist_add_string(fmri_cpu, FM_FMRI_SCHEME,
+ FM_FMRI_SCHEME_CPU) != 0) {
+ atomic_inc_64(failedp);
+ return;
+ }
+
+ if (auth != NULL && nvlist_add_nvlist(fmri_cpu, FM_FMRI_AUTHORITY,
+ (nvlist_t *)auth) != 0)
+ atomic_inc_64(failedp);
+
+ if (nvlist_add_uint32(fmri_cpu, FM_FMRI_CPU_ID, cpu_id) != 0)
+ atomic_inc_64(failedp);
+
+ if (cpu_maskp != NULL && nvlist_add_uint8(fmri_cpu, FM_FMRI_CPU_MASK,
+ *cpu_maskp) != 0)
+ atomic_inc_64(failedp);
+
+ if (serial_idp == NULL || nvlist_add_string(fmri_cpu,
+ FM_FMRI_CPU_SERIAL_ID, (char *)serial_idp) != 0)
+ atomic_inc_64(failedp);
+}
+
+/*
+ * Set-up and validate the members of a mem fmri according to:
+ *
+ * Member name Type Value
+ * ====================================================
+ * version uint8_t 0
+ * auth nvlist_t <auth> [optional]
+ * unum string <unum>
+ * serial string <serial> [optional*]
+ * offset uint64_t <offset> [optional]
+ *
+ * * serial is required if offset is present
+ */
+void
+fm_fmri_mem_set(nvlist_t *fmri, int version, const nvlist_t *auth,
+ const char *unum, const char *serial, uint64_t offset)
+{
+ if (version != MEM_SCHEME_VERSION0) {
+ atomic_inc_64(&erpt_kstat_data.fmri_set_failed.value.ui64);
+ return;
+ }
+
+ if (!serial && (offset != (uint64_t)-1)) {
+ atomic_inc_64(&erpt_kstat_data.fmri_set_failed.value.ui64);
+ return;
+ }
+
+ if (nvlist_add_uint8(fmri, FM_VERSION, version) != 0) {
+ atomic_inc_64(&erpt_kstat_data.fmri_set_failed.value.ui64);
+ return;
+ }
+
+ if (nvlist_add_string(fmri, FM_FMRI_SCHEME, FM_FMRI_SCHEME_MEM) != 0) {
+ atomic_inc_64(&erpt_kstat_data.fmri_set_failed.value.ui64);
+ return;
+ }
+
+ if (auth != NULL) {
+ if (nvlist_add_nvlist(fmri, FM_FMRI_AUTHORITY,
+ (nvlist_t *)auth) != 0) {
+ atomic_inc_64(
+ &erpt_kstat_data.fmri_set_failed.value.ui64);
+ }
+ }
+
+ if (nvlist_add_string(fmri, FM_FMRI_MEM_UNUM, unum) != 0) {
+ atomic_inc_64(&erpt_kstat_data.fmri_set_failed.value.ui64);
+ }
+
+ if (serial != NULL) {
+ if (nvlist_add_string_array(fmri, FM_FMRI_MEM_SERIAL_ID,
+ (char **)&serial, 1) != 0) {
+ atomic_inc_64(
+ &erpt_kstat_data.fmri_set_failed.value.ui64);
+ }
+ if (offset != (uint64_t)-1 && nvlist_add_uint64(fmri,
+ FM_FMRI_MEM_OFFSET, offset) != 0) {
+ atomic_inc_64(
+ &erpt_kstat_data.fmri_set_failed.value.ui64);
+ }
+ }
+}
+
+void
+fm_fmri_zfs_set(nvlist_t *fmri, int version, uint64_t pool_guid,
+ uint64_t vdev_guid)
+{
+ if (version != ZFS_SCHEME_VERSION0) {
+ atomic_inc_64(&erpt_kstat_data.fmri_set_failed.value.ui64);
+ return;
+ }
+
+ if (nvlist_add_uint8(fmri, FM_VERSION, version) != 0) {
+ atomic_inc_64(&erpt_kstat_data.fmri_set_failed.value.ui64);
+ return;
+ }
+
+ if (nvlist_add_string(fmri, FM_FMRI_SCHEME, FM_FMRI_SCHEME_ZFS) != 0) {
+ atomic_inc_64(&erpt_kstat_data.fmri_set_failed.value.ui64);
+ return;
+ }
+
+ if (nvlist_add_uint64(fmri, FM_FMRI_ZFS_POOL, pool_guid) != 0) {
+ atomic_inc_64(&erpt_kstat_data.fmri_set_failed.value.ui64);
+ }
+
+ if (vdev_guid != 0) {
+ if (nvlist_add_uint64(fmri, FM_FMRI_ZFS_VDEV, vdev_guid) != 0) {
+ atomic_inc_64(
+ &erpt_kstat_data.fmri_set_failed.value.ui64);
+ }
+ }
+}
+
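+/*
+ * Example (a sketch with hypothetical spa/vdev variables): building a
+ * detector FMRI for a vdev; passing vdev_guid == 0 identifies the pool as
+ * a whole.
+ *
+ *	nvlist_t *detector = fm_nvlist_create(NULL);
+ *
+ *	fm_fmri_zfs_set(detector, ZFS_SCHEME_VERSION0, spa_guid(spa),
+ *	    vd->vdev_guid);
+ */
+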
+uint64_t
+fm_ena_increment(uint64_t ena)
+{
+ uint64_t new_ena;
+
+ switch (ENA_FORMAT(ena)) {
+ case FM_ENA_FMT1:
+ new_ena = ena + (1 << ENA_FMT1_GEN_SHFT);
+ break;
+ case FM_ENA_FMT2:
+ new_ena = ena + (1 << ENA_FMT2_GEN_SHFT);
+ break;
+ default:
+ new_ena = 0;
+ }
+
+ return (new_ena);
+}
+
+uint64_t
+fm_ena_generate_cpu(uint64_t timestamp, processorid_t cpuid, uchar_t format)
+{
+ uint64_t ena = 0;
+
+ switch (format) {
+ case FM_ENA_FMT1:
+ if (timestamp) {
+ ena = (uint64_t)((format & ENA_FORMAT_MASK) |
+ ((cpuid << ENA_FMT1_CPUID_SHFT) &
+ ENA_FMT1_CPUID_MASK) |
+ ((timestamp << ENA_FMT1_TIME_SHFT) &
+ ENA_FMT1_TIME_MASK));
+ } else {
+ ena = (uint64_t)((format & ENA_FORMAT_MASK) |
+ ((cpuid << ENA_FMT1_CPUID_SHFT) &
+ ENA_FMT1_CPUID_MASK) |
+ ((gethrtime_waitfree() << ENA_FMT1_TIME_SHFT) &
+ ENA_FMT1_TIME_MASK));
+ }
+ break;
+ case FM_ENA_FMT2:
+ ena = (uint64_t)((format & ENA_FORMAT_MASK) |
+ ((timestamp << ENA_FMT2_TIME_SHFT) & ENA_FMT2_TIME_MASK));
+ break;
+ default:
+ break;
+ }
+
+ return (ena);
+}
+
+uint64_t
+fm_ena_generate(uint64_t timestamp, uchar_t format)
+{
+ return (fm_ena_generate_cpu(timestamp, PCPU_GET(cpuid), format));
+}
+
+uint64_t
+fm_ena_generation_get(uint64_t ena)
+{
+ uint64_t gen;
+
+ switch (ENA_FORMAT(ena)) {
+ case FM_ENA_FMT1:
+ gen = (ena & ENA_FMT1_GEN_MASK) >> ENA_FMT1_GEN_SHFT;
+ break;
+ case FM_ENA_FMT2:
+ gen = (ena & ENA_FMT2_GEN_MASK) >> ENA_FMT2_GEN_SHFT;
+ break;
+ default:
+ gen = 0;
+ break;
+ }
+
+ return (gen);
+}
+
+uchar_t
+fm_ena_format_get(uint64_t ena)
+{
+
+ return (ENA_FORMAT(ena));
+}
+
+uint64_t
+fm_ena_id_get(uint64_t ena)
+{
+ uint64_t id;
+
+ switch (ENA_FORMAT(ena)) {
+ case FM_ENA_FMT1:
+ id = (ena & ENA_FMT1_ID_MASK) >> ENA_FMT1_ID_SHFT;
+ break;
+ case FM_ENA_FMT2:
+ id = (ena & ENA_FMT2_ID_MASK) >> ENA_FMT2_ID_SHFT;
+ break;
+ default:
+ id = 0;
+ }
+
+ return (id);
+}
+
+uint64_t
+fm_ena_time_get(uint64_t ena)
+{
+ uint64_t time;
+
+ switch (ENA_FORMAT(ena)) {
+ case FM_ENA_FMT1:
+ time = (ena & ENA_FMT1_TIME_MASK) >> ENA_FMT1_TIME_SHFT;
+ break;
+ case FM_ENA_FMT2:
+ time = (ena & ENA_FMT2_TIME_MASK) >> ENA_FMT2_TIME_SHFT;
+ break;
+ default:
+ time = 0;
+ }
+
+ return (time);
+}
+
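+/*
+ * Example round trip (a sketch, not from the original source): generate a
+ * format-1 ENA on the current CPU, then pull its fields back out with the
+ * accessors above.
+ *
+ *	uint64_t ena = fm_ena_generate(0, FM_ENA_FMT1);
+ *	uint64_t t;
+ *
+ *	ASSERT(fm_ena_format_get(ena) == FM_ENA_FMT1);
+ *	t = fm_ena_time_get(ena);	(the timestamp bits)
+ *	ena = fm_ena_increment(ena);	(bump the generation field)
+ */
+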
+#ifdef illumos
+/*
+ * Convert a getpcstack() trace to symbolic name+offset, and add the resulting
+ * string array to a Fault Management ereport as FM_EREPORT_PAYLOAD_NAME_STACK.
+ */
+void
+fm_payload_stack_add(nvlist_t *payload, const pc_t *stack, int depth)
+{
+ int i;
+ char *sym;
+ ulong_t off;
+ char *stkpp[FM_STK_DEPTH];
+ char buf[FM_STK_DEPTH * FM_SYM_SZ];
+ char *stkp = buf;
+
+ for (i = 0; i < depth && i != FM_STK_DEPTH; i++, stkp += FM_SYM_SZ) {
+ if ((sym = kobj_getsymname(stack[i], &off)) != NULL)
+ (void) snprintf(stkp, FM_SYM_SZ, "%s+%lx", sym, off);
+ else
+ (void) snprintf(stkp, FM_SYM_SZ, "%lx", (long)stack[i]);
+ stkpp[i] = stkp;
+ }
+
+ fm_payload_set(payload, FM_EREPORT_PAYLOAD_NAME_STACK,
+ DATA_TYPE_STRING_ARRAY, depth, stkpp, NULL);
+}
+#endif
+
+#ifdef illumos
+void
+print_msg_hwerr(ctid_t ct_id, proc_t *p)
+{
+ uprintf("Killed process %d (%s) in contract id %d "
+ "due to hardware error\n", p->p_pid, p->p_user.u_comm, ct_id);
+}
+#endif
+
+void
+fm_fmri_hc_create(nvlist_t *fmri, int version, const nvlist_t *auth,
+ nvlist_t *snvl, nvlist_t *bboard, int npairs, ...)
+{
+ nv_alloc_t *nva = nvlist_lookup_nv_alloc(fmri);
+ nvlist_t *pairs[HC_MAXPAIRS];
+ nvlist_t **hcl;
+ uint_t n;
+ int i, j;
+ va_list ap;
+ char *hcname, *hcid;
+
+ if (!fm_fmri_hc_set_common(fmri, version, auth))
+ return;
+
+ /*
+ * copy the bboard nvpairs to the pairs array
+ */
+ if (nvlist_lookup_nvlist_array(bboard, FM_FMRI_HC_LIST, &hcl, &n)
+ != 0) {
+ atomic_inc_64(&erpt_kstat_data.fmri_set_failed.value.ui64);
+ return;
+ }
+
+ for (i = 0; i < n; i++) {
+ if (nvlist_lookup_string(hcl[i], FM_FMRI_HC_NAME,
+ &hcname) != 0) {
+ atomic_inc_64(
+ &erpt_kstat_data.fmri_set_failed.value.ui64);
+ return;
+ }
+ if (nvlist_lookup_string(hcl[i], FM_FMRI_HC_ID, &hcid) != 0) {
+ atomic_inc_64(
+ &erpt_kstat_data.fmri_set_failed.value.ui64);
+ return;
+ }
+
+ pairs[i] = fm_nvlist_create(nva);
+ if (nvlist_add_string(pairs[i], FM_FMRI_HC_NAME, hcname) != 0 ||
+ nvlist_add_string(pairs[i], FM_FMRI_HC_ID, hcid) != 0) {
+ for (j = 0; j <= i; j++) {
+ if (pairs[j] != NULL)
+ fm_nvlist_destroy(pairs[j],
+ FM_NVA_RETAIN);
+ }
+ atomic_inc_64(
+ &erpt_kstat_data.fmri_set_failed.value.ui64);
+ return;
+ }
+ }
+
+ /*
+	 * append the name/id pairs passed in through the varargs
+ */
+ npairs = MIN(npairs, HC_MAXPAIRS);
+
+ va_start(ap, npairs);
+ for (i = n; i < npairs + n; i++) {
+ const char *name = va_arg(ap, const char *);
+ uint32_t id = va_arg(ap, uint32_t);
+ char idstr[11];
+ (void) snprintf(idstr, sizeof (idstr), "%u", id);
+ pairs[i] = fm_nvlist_create(nva);
+ if (nvlist_add_string(pairs[i], FM_FMRI_HC_NAME, name) != 0 ||
+ nvlist_add_string(pairs[i], FM_FMRI_HC_ID, idstr) != 0) {
+ for (j = 0; j <= i; j++) {
+ if (pairs[j] != NULL)
+ fm_nvlist_destroy(pairs[j],
+ FM_NVA_RETAIN);
+ }
+ atomic_inc_64(
+ &erpt_kstat_data.fmri_set_failed.value.ui64);
+ return;
+ }
+ }
+ va_end(ap);
+
+ /*
+ * Create the fmri hc list
+ */
+ if (nvlist_add_nvlist_array(fmri, FM_FMRI_HC_LIST, pairs,
+ npairs + n) != 0) {
+ atomic_inc_64(&erpt_kstat_data.fmri_set_failed.value.ui64);
+ return;
+ }
+
+ for (i = 0; i < npairs + n; i++) {
+ fm_nvlist_destroy(pairs[i], FM_NVA_RETAIN);
+ }
+
+ if (snvl != NULL) {
+ if (nvlist_add_nvlist(fmri, FM_FMRI_HC_SPECIFIC, snvl) != 0) {
+ atomic_inc_64(
+ &erpt_kstat_data.fmri_set_failed.value.ui64);
+ return;
+ }
+ }
+}
diff --git a/sys/cddl/contrib/opensolaris/uts/common/os/list.c b/sys/cddl/contrib/opensolaris/uts/common/os/list.c
new file mode 100644
index 000000000000..d9f467f461aa
--- /dev/null
+++ b/sys/cddl/contrib/opensolaris/uts/common/os/list.c
@@ -0,0 +1,243 @@
+/*
+ * CDDL HEADER START
+ *
+ * The contents of this file are subject to the terms of the
+ * Common Development and Distribution License (the "License").
+ * You may not use this file except in compliance with the License.
+ *
+ * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
+ * or http://www.opensolaris.org/os/licensing.
+ * See the License for the specific language governing permissions
+ * and limitations under the License.
+ *
+ * When distributing Covered Code, include this CDDL HEADER in each
+ * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
+ * If applicable, add the following below this CDDL HEADER, with the
+ * fields enclosed by brackets "[]" replaced with your own identifying
+ * information: Portions Copyright [yyyy] [name of copyright owner]
+ *
+ * CDDL HEADER END
+ */
+/*
+ * Copyright 2008 Sun Microsystems, Inc. All rights reserved.
+ * Use is subject to license terms.
+ */
+
+/*
+ * Generic doubly-linked list implementation
+ */
+
+#include <sys/list.h>
+#include <sys/list_impl.h>
+#include <sys/types.h>
+#include <sys/sysmacros.h>
+#include <sys/debug.h>
+
+#define list_d2l(a, obj) ((list_node_t *)(((char *)obj) + (a)->list_offset))
+#define list_object(a, node) ((void *)(((char *)node) - (a)->list_offset))
+#define list_empty(a) ((a)->list_head.list_next == &(a)->list_head)
+
+#define list_insert_after_node(list, node, object) { \
+ list_node_t *lnew = list_d2l(list, object); \
+ lnew->list_prev = (node); \
+ lnew->list_next = (node)->list_next; \
+ (node)->list_next->list_prev = lnew; \
+ (node)->list_next = lnew; \
+}
+
+#define list_insert_before_node(list, node, object) { \
+ list_node_t *lnew = list_d2l(list, object); \
+ lnew->list_next = (node); \
+ lnew->list_prev = (node)->list_prev; \
+ (node)->list_prev->list_next = lnew; \
+ (node)->list_prev = lnew; \
+}
+
+#define list_remove_node(node) \
+ (node)->list_prev->list_next = (node)->list_next; \
+ (node)->list_next->list_prev = (node)->list_prev; \
+ (node)->list_next = (node)->list_prev = NULL
+
+void
+list_create(list_t *list, size_t size, size_t offset)
+{
+ ASSERT(list);
+ ASSERT(size > 0);
+ ASSERT(size >= offset + sizeof (list_node_t));
+
+ list->list_size = size;
+ list->list_offset = offset;
+ list->list_head.list_next = list->list_head.list_prev =
+ &list->list_head;
+}
+
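+/*
+ * Example (a sketch with a hypothetical struct): the list never allocates
+ * nodes; callers embed a list_node_t in their own structure and hand its
+ * offset to list_create().
+ *
+ *	struct foo {
+ *		int		foo_val;
+ *		list_node_t	foo_node;
+ *	};
+ *
+ *	list_t l;
+ *	struct foo *fp;
+ *
+ *	list_create(&l, sizeof (struct foo),
+ *	    offsetof(struct foo, foo_node));
+ *	list_insert_tail(&l, fp);
+ *	for (fp = list_head(&l); fp != NULL; fp = list_next(&l, fp))
+ *		...
+ *	while ((fp = list_remove_head(&l)) != NULL)
+ *		...
+ *	list_destroy(&l);
+ */
+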
+void
+list_destroy(list_t *list)
+{
+ list_node_t *node = &list->list_head;
+
+ ASSERT(list);
+ ASSERT(list->list_head.list_next == node);
+ ASSERT(list->list_head.list_prev == node);
+
+ node->list_next = node->list_prev = NULL;
+}
+
+void
+list_insert_after(list_t *list, void *object, void *nobject)
+{
+ if (object == NULL) {
+ list_insert_head(list, nobject);
+ } else {
+ list_node_t *lold = list_d2l(list, object);
+ list_insert_after_node(list, lold, nobject);
+ }
+}
+
+void
+list_insert_before(list_t *list, void *object, void *nobject)
+{
+ if (object == NULL) {
+ list_insert_tail(list, nobject);
+ } else {
+ list_node_t *lold = list_d2l(list, object);
+ list_insert_before_node(list, lold, nobject);
+ }
+}
+
+void
+list_insert_head(list_t *list, void *object)
+{
+ list_node_t *lold = &list->list_head;
+ list_insert_after_node(list, lold, object);
+}
+
+void
+list_insert_tail(list_t *list, void *object)
+{
+ list_node_t *lold = &list->list_head;
+ list_insert_before_node(list, lold, object);
+}
+
+void
+list_remove(list_t *list, void *object)
+{
+ list_node_t *lold = list_d2l(list, object);
+ ASSERT(!list_empty(list));
+ ASSERT(lold->list_next != NULL);
+ list_remove_node(lold);
+}
+
+void *
+list_remove_head(list_t *list)
+{
+ list_node_t *head = list->list_head.list_next;
+ if (head == &list->list_head)
+ return (NULL);
+ list_remove_node(head);
+ return (list_object(list, head));
+}
+
+void *
+list_remove_tail(list_t *list)
+{
+ list_node_t *tail = list->list_head.list_prev;
+ if (tail == &list->list_head)
+ return (NULL);
+ list_remove_node(tail);
+ return (list_object(list, tail));
+}
+
+void *
+list_head(list_t *list)
+{
+ if (list_empty(list))
+ return (NULL);
+ return (list_object(list, list->list_head.list_next));
+}
+
+void *
+list_tail(list_t *list)
+{
+ if (list_empty(list))
+ return (NULL);
+ return (list_object(list, list->list_head.list_prev));
+}
+
+void *
+list_next(list_t *list, void *object)
+{
+ list_node_t *node = list_d2l(list, object);
+
+ if (node->list_next != &list->list_head)
+ return (list_object(list, node->list_next));
+
+ return (NULL);
+}
+
+void *
+list_prev(list_t *list, void *object)
+{
+ list_node_t *node = list_d2l(list, object);
+
+ if (node->list_prev != &list->list_head)
+ return (list_object(list, node->list_prev));
+
+ return (NULL);
+}
+
+/*
+ * Append the src list to the tail of the dst list, then empty the src list.
+ */
+void
+list_move_tail(list_t *dst, list_t *src)
+{
+ list_node_t *dstnode = &dst->list_head;
+ list_node_t *srcnode = &src->list_head;
+
+ ASSERT(dst->list_size == src->list_size);
+ ASSERT(dst->list_offset == src->list_offset);
+
+ if (list_empty(src))
+ return;
+
+ dstnode->list_prev->list_next = srcnode->list_next;
+ srcnode->list_next->list_prev = dstnode->list_prev;
+ dstnode->list_prev = srcnode->list_prev;
+ srcnode->list_prev->list_next = dstnode;
+
+ /* empty src list */
+ srcnode->list_next = srcnode->list_prev = srcnode;
+}
+
+void
+list_link_replace(list_node_t *lold, list_node_t *lnew)
+{
+ ASSERT(list_link_active(lold));
+ ASSERT(!list_link_active(lnew));
+
+ lnew->list_next = lold->list_next;
+ lnew->list_prev = lold->list_prev;
+ lold->list_prev->list_next = lnew;
+ lold->list_next->list_prev = lnew;
+ lold->list_next = lold->list_prev = NULL;
+}
+
+void
+list_link_init(list_node_t *link)
+{
+ link->list_next = NULL;
+ link->list_prev = NULL;
+}
+
+int
+list_link_active(list_node_t *link)
+{
+ return (link->list_next != NULL);
+}
+
+int
+list_is_empty(list_t *list)
+{
+ return (list_empty(list));
+}
diff --git a/sys/cddl/contrib/opensolaris/uts/common/os/nvpair_alloc_system.c b/sys/cddl/contrib/opensolaris/uts/common/os/nvpair_alloc_system.c
new file mode 100644
index 000000000000..3682853de902
--- /dev/null
+++ b/sys/cddl/contrib/opensolaris/uts/common/os/nvpair_alloc_system.c
@@ -0,0 +1,63 @@
+/*
+ * CDDL HEADER START
+ *
+ * The contents of this file are subject to the terms of the
+ * Common Development and Distribution License, Version 1.0 only
+ * (the "License"). You may not use this file except in compliance
+ * with the License.
+ *
+ * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
+ * or http://www.opensolaris.org/os/licensing.
+ * See the License for the specific language governing permissions
+ * and limitations under the License.
+ *
+ * When distributing Covered Code, include this CDDL HEADER in each
+ * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
+ * If applicable, add the following below this CDDL HEADER, with the
+ * fields enclosed by brackets "[]" replaced with your own identifying
+ * information: Portions Copyright [yyyy] [name of copyright owner]
+ *
+ * CDDL HEADER END
+ */
+/*
+ * Copyright 2004 Sun Microsystems, Inc. All rights reserved.
+ * Use is subject to license terms.
+ */
+
+#pragma ident "%Z%%M% %I% %E% SMI"
+
+#include <sys/nvpair.h>
+
+static void *
+nv_alloc_sys(nv_alloc_t *nva, size_t size)
+{
+ return (kmem_alloc(size, (int)(uintptr_t)nva->nva_arg));
+}
+
+/*ARGSUSED*/
+static void
+nv_free_sys(nv_alloc_t *nva, void *buf, size_t size)
+{
+ kmem_free(buf, size);
+}
+
+static const nv_alloc_ops_t system_ops = {
+ NULL, /* nv_ao_init() */
+ NULL, /* nv_ao_fini() */
+ nv_alloc_sys, /* nv_ao_alloc() */
+ nv_free_sys, /* nv_ao_free() */
+ NULL /* nv_ao_reset() */
+};
+
+nv_alloc_t nv_alloc_sleep_def = {
+ &system_ops,
+ (void *)KM_SLEEP
+};
+
+nv_alloc_t nv_alloc_nosleep_def = {
+ &system_ops,
+ (void *)KM_NOSLEEP
+};
+
+nv_alloc_t *nv_alloc_sleep = &nv_alloc_sleep_def;
+nv_alloc_t *nv_alloc_nosleep = &nv_alloc_nosleep_def;
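+
+/*
+ * For example (an illustrative sketch, not from the original source), these
+ * handles plug the kernel allocator into the nvpair code:
+ *
+ *	nvlist_t *nvl;
+ *
+ *	if (nvlist_xalloc(&nvl, NV_UNIQUE_NAME, nv_alloc_sleep) == 0) {
+ *		...
+ *		nvlist_free(nvl);
+ *	}
+ */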
diff --git a/sys/cddl/contrib/opensolaris/uts/common/sys/acl.h b/sys/cddl/contrib/opensolaris/uts/common/sys/acl.h
new file mode 100644
index 000000000000..b81678ca07d2
--- /dev/null
+++ b/sys/cddl/contrib/opensolaris/uts/common/sys/acl.h
@@ -0,0 +1,313 @@
+/*
+ * CDDL HEADER START
+ *
+ * The contents of this file are subject to the terms of the
+ * Common Development and Distribution License (the "License").
+ * You may not use this file except in compliance with the License.
+ *
+ * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
+ * or http://www.opensolaris.org/os/licensing.
+ * See the License for the specific language governing permissions
+ * and limitations under the License.
+ *
+ * When distributing Covered Code, include this CDDL HEADER in each
+ * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
+ * If applicable, add the following below this CDDL HEADER, with the
+ * fields enclosed by brackets "[]" replaced with your own identifying
+ * information: Portions Copyright [yyyy] [name of copyright owner]
+ *
+ * CDDL HEADER END
+ */
+/*
+ * Copyright 2014 Garrett D'Amore <garrett@damore.org>
+ *
+ * Copyright 2009 Sun Microsystems, Inc. All rights reserved.
+ * Use is subject to license terms.
+ * Copyright 2017 RackTop Systems.
+ */
+
+#ifndef _SYS_ACL_H
+#define _SYS_ACL_H
+
+#include <sys/types.h>
+#include <sys/acl_impl.h>
+
+#if defined(_KERNEL)
+/*
+ * When compiling OpenSolaris kernel code, this file is included instead of the
+ * FreeBSD one. Include the original sys/acl.h as well.
+ */
+#undef _SYS_ACL_H
+#include_next <sys/acl.h>
+#define _SYS_ACL_H
+#endif /* _KERNEL */
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+#define MAX_ACL_ENTRIES (1024) /* max entries of each type */
+typedef struct {
+ int a_type; /* the type of ACL entry */
+	uid_t		a_id;		/* uid or gid of the entry */
+ o_mode_t a_perm; /* the permission field */
+} aclent_t;
+
+typedef struct ace {
+ uid_t a_who; /* uid or gid */
+ uint32_t a_access_mask; /* read,write,... */
+ uint16_t a_flags; /* see below */
+ uint16_t a_type; /* allow or deny */
+} ace_t;
+
+#ifndef _KERNEL
+typedef struct acl_info acl_t;
+#endif
+
+/*
+ * The following are Defined types for an aclent_t.
+ */
+#define USER_OBJ (0x01) /* object owner */
+#define USER (0x02) /* additional users */
+#define GROUP_OBJ (0x04) /* owning group of the object */
+#define GROUP (0x08) /* additional groups */
+#define CLASS_OBJ (0x10) /* file group class and mask entry */
+#define OTHER_OBJ (0x20) /* other entry for the object */
+#define ACL_DEFAULT (0x1000) /* default flag */
+/* default object owner */
+#define DEF_USER_OBJ (ACL_DEFAULT | USER_OBJ)
+/* default additional users */
+#define DEF_USER (ACL_DEFAULT | USER)
+/* default owning group */
+#define DEF_GROUP_OBJ (ACL_DEFAULT | GROUP_OBJ)
+/* default additional groups */
+#define DEF_GROUP (ACL_DEFAULT | GROUP)
+/* default mask entry */
+#define DEF_CLASS_OBJ (ACL_DEFAULT | CLASS_OBJ)
+/* default other entry */
+#define DEF_OTHER_OBJ (ACL_DEFAULT | OTHER_OBJ)
+
+/*
+ * The following are defined for ace_t.
+ */
+#define ACE_READ_DATA 0x00000001
+#define ACE_LIST_DIRECTORY 0x00000001
+#define ACE_WRITE_DATA 0x00000002
+#define ACE_ADD_FILE 0x00000002
+#define ACE_APPEND_DATA 0x00000004
+#define ACE_ADD_SUBDIRECTORY 0x00000004
+#define ACE_READ_NAMED_ATTRS 0x00000008
+#define ACE_WRITE_NAMED_ATTRS 0x00000010
+#define ACE_EXECUTE 0x00000020
+#define ACE_DELETE_CHILD 0x00000040
+#define ACE_READ_ATTRIBUTES 0x00000080
+#define ACE_WRITE_ATTRIBUTES 0x00000100
+#define ACE_DELETE 0x00010000
+#define ACE_READ_ACL 0x00020000
+#define ACE_WRITE_ACL 0x00040000
+#define ACE_WRITE_OWNER 0x00080000
+#define ACE_SYNCHRONIZE 0x00100000
+
+#define ACE_FILE_INHERIT_ACE 0x0001
+#define ACE_DIRECTORY_INHERIT_ACE 0x0002
+#define ACE_NO_PROPAGATE_INHERIT_ACE 0x0004
+#define ACE_INHERIT_ONLY_ACE 0x0008
+#define ACE_SUCCESSFUL_ACCESS_ACE_FLAG 0x0010
+#define ACE_FAILED_ACCESS_ACE_FLAG 0x0020
+#define ACE_IDENTIFIER_GROUP 0x0040
+#define ACE_INHERITED_ACE 0x0080
+#define ACE_OWNER 0x1000
+#define ACE_GROUP 0x2000
+#define ACE_EVERYONE 0x4000
+
+#define ACE_ACCESS_ALLOWED_ACE_TYPE 0x0000
+#define ACE_ACCESS_DENIED_ACE_TYPE 0x0001
+#define ACE_SYSTEM_AUDIT_ACE_TYPE 0x0002
+#define ACE_SYSTEM_ALARM_ACE_TYPE 0x0003
+
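+/*
+ * Hypothetical example using the constants above (a sketch, not from the
+ * original header): an allow entry granting read access to everyone.
+ * a_who is conventionally -1 when one of the ACE_OWNER/ACE_GROUP/
+ * ACE_EVERYONE flags names the subject.
+ *
+ *	ace_t everyone_read = {
+ *		(uid_t)-1,
+ *		ACE_READ_DATA,
+ *		ACE_EVERYONE,
+ *		ACE_ACCESS_ALLOWED_ACE_TYPE
+ *	};
+ */
+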
+#define ACL_AUTO_INHERIT 0x0001
+#define ACL_PROTECTED 0x0002
+#define ACL_DEFAULTED 0x0004
+#define ACL_FLAGS_ALL (ACL_AUTO_INHERIT|ACL_PROTECTED| \
+ ACL_DEFAULTED)
+
+#if defined(_KERNEL) || defined(_FAKE_KERNEL)
+
+/*
+ * These are only applicable in a CIFS context.
+ */
+#define ACE_ACCESS_ALLOWED_COMPOUND_ACE_TYPE 0x04
+#define ACE_ACCESS_ALLOWED_OBJECT_ACE_TYPE 0x05
+#define ACE_ACCESS_DENIED_OBJECT_ACE_TYPE 0x06
+#define ACE_SYSTEM_AUDIT_OBJECT_ACE_TYPE 0x07
+#define ACE_SYSTEM_ALARM_OBJECT_ACE_TYPE 0x08
+#define ACE_ACCESS_ALLOWED_CALLBACK_ACE_TYPE 0x09
+#define ACE_ACCESS_DENIED_CALLBACK_ACE_TYPE 0x0A
+#define ACE_ACCESS_ALLOWED_CALLBACK_OBJECT_ACE_TYPE 0x0B
+#define ACE_ACCESS_DENIED_CALLBACK_OBJECT_ACE_TYPE 0x0C
+#define ACE_SYSTEM_AUDIT_CALLBACK_ACE_TYPE 0x0D
+#define ACE_SYSTEM_ALARM_CALLBACK_ACE_TYPE 0x0E
+#define ACE_SYSTEM_AUDIT_CALLBACK_OBJECT_ACE_TYPE 0x0F
+#define ACE_SYSTEM_ALARM_CALLBACK_OBJECT_ACE_TYPE 0x10
+
+#define ACE_ALL_TYPES 0x001F
+
+typedef struct ace_object {
+ uid_t a_who; /* uid or gid */
+ uint32_t a_access_mask; /* read,write,... */
+ uint16_t a_flags; /* see below */
+ uint16_t a_type; /* allow or deny */
+ uint8_t a_obj_type[16]; /* obj type */
+ uint8_t a_inherit_obj_type[16]; /* inherit obj */
+} ace_object_t;
+
+#endif
+
+#define ACE_ALL_PERMS (ACE_READ_DATA|ACE_LIST_DIRECTORY|ACE_WRITE_DATA| \
+ ACE_ADD_FILE|ACE_APPEND_DATA|ACE_ADD_SUBDIRECTORY|ACE_READ_NAMED_ATTRS| \
+ ACE_WRITE_NAMED_ATTRS|ACE_EXECUTE|ACE_DELETE_CHILD|ACE_READ_ATTRIBUTES| \
+ ACE_WRITE_ATTRIBUTES|ACE_DELETE|ACE_READ_ACL|ACE_WRITE_ACL| \
+ ACE_WRITE_OWNER|ACE_SYNCHRONIZE)
+
+#define ACE_ALL_WRITE_PERMS (ACE_WRITE_DATA|ACE_APPEND_DATA| \
+ ACE_WRITE_ATTRIBUTES|ACE_WRITE_NAMED_ATTRS|ACE_WRITE_ACL| \
+ ACE_WRITE_OWNER|ACE_DELETE|ACE_DELETE_CHILD)
+
+#define ACE_READ_PERMS (ACE_READ_DATA|ACE_READ_ACL|ACE_READ_ATTRIBUTES| \
+ ACE_READ_NAMED_ATTRS)
+
+#define ACE_WRITE_PERMS (ACE_WRITE_DATA|ACE_APPEND_DATA|ACE_WRITE_ATTRIBUTES| \
+ ACE_WRITE_NAMED_ATTRS)
+
+#define ACE_MODIFY_PERMS (ACE_READ_DATA|ACE_LIST_DIRECTORY|ACE_WRITE_DATA| \
+ ACE_ADD_FILE|ACE_APPEND_DATA|ACE_ADD_SUBDIRECTORY|ACE_READ_NAMED_ATTRS| \
+ ACE_WRITE_NAMED_ATTRS|ACE_EXECUTE|ACE_DELETE_CHILD|ACE_READ_ATTRIBUTES| \
+ ACE_WRITE_ATTRIBUTES|ACE_DELETE|ACE_READ_ACL|ACE_SYNCHRONIZE)
+/*
+ * The following flags are supported by both NFSv4 ACLs and ace_t.
+ */
+#define ACE_NFSV4_SUP_FLAGS (ACE_FILE_INHERIT_ACE | \
+ ACE_DIRECTORY_INHERIT_ACE | \
+ ACE_NO_PROPAGATE_INHERIT_ACE | \
+ ACE_INHERIT_ONLY_ACE | \
+ ACE_INHERITED_ACE | \
+ ACE_IDENTIFIER_GROUP)
+
+#define ACE_TYPE_FLAGS (ACE_OWNER|ACE_GROUP|ACE_EVERYONE| \
+ ACE_IDENTIFIER_GROUP)
+#define	ACE_INHERIT_FLAGS	(ACE_FILE_INHERIT_ACE|ACE_INHERITED_ACE| \
+ ACE_DIRECTORY_INHERIT_ACE|ACE_NO_PROPAGATE_INHERIT_ACE|ACE_INHERIT_ONLY_ACE)
+
+/* cmd args to acl(2) for aclent_t */
+#define GETACL 1
+#define SETACL 2
+#define GETACLCNT 3
+
+/* cmd's to manipulate ace acls. */
+#define ACE_GETACL 4
+#define ACE_SETACL 5
+#define ACE_GETACLCNT 6
+
+/* minimal acl entries from GETACLCNT */
+#define MIN_ACL_ENTRIES 4
+
+#if !defined(_KERNEL)
+
+/* acl check errors */
+#define GRP_ERROR 1
+#define USER_ERROR 2
+#define OTHER_ERROR 3
+#define CLASS_ERROR 4
+#define DUPLICATE_ERROR 5
+#define MISS_ERROR 6
+#define MEM_ERROR 7
+#define ENTRY_ERROR 8
+
+
+/*
+ * similar to ufs_acl.h: changed to char type for user commands (tar, cpio)
+ * Attribute types
+ */
+#define UFSD_FREE ('0') /* Free entry */
+#define UFSD_ACL ('1') /* Access Control Lists */
+#define UFSD_DFACL ('2') /* reserved for future use */
+#define ACE_ACL ('3') /* ace_t style acls */
+
+/*
+ * flag to [f]acl_get()
+ * controls whether a trivial acl should be returned.
+ */
+#define ACL_NO_TRIVIAL 0x2
+
+
+/*
+ * Flags to control acl_totext()
+ */
+
+#define ACL_APPEND_ID 0x1 /* append uid/gid to user/group entries */
+#define ACL_COMPACT_FMT 0x2 /* build ACL in ls -V format */
+#define ACL_NORESOLVE 0x4 /* don't do name service lookups */
+#define ACL_SID_FMT 0x8 /* use usersid/groupsid when appropriate */
+
+/*
+ * Legacy aclcheck errors for aclent_t ACLs
+ */
+#define EACL_GRP_ERROR GRP_ERROR
+#define EACL_USER_ERROR USER_ERROR
+#define EACL_OTHER_ERROR OTHER_ERROR
+#define EACL_CLASS_ERROR CLASS_ERROR
+#define EACL_DUPLICATE_ERROR DUPLICATE_ERROR
+#define EACL_MISS_ERROR MISS_ERROR
+#define EACL_MEM_ERROR MEM_ERROR
+#define EACL_ENTRY_ERROR ENTRY_ERROR
+
+#define EACL_INHERIT_ERROR 9 /* invalid inherit flags */
+#define EACL_FLAGS_ERROR 10 /* unknown flag value */
+#define EACL_PERM_MASK_ERROR 11 /* unknown permission */
+#define EACL_COUNT_ERROR 12 /* invalid acl count */
+
+#define EACL_INVALID_SLOT 13 /* invalid acl slot */
+#define EACL_NO_ACL_ENTRY 14 /* Entry doesn't exist */
+#define EACL_DIFF_TYPE 15 /* acls aren't same type */
+
+#define EACL_INVALID_USER_GROUP 16 /* need user/group name */
+#define EACL_INVALID_STR 17 /* invalid acl string */
+#define EACL_FIELD_NOT_BLANK 18 /* can't have blank field */
+#define EACL_INVALID_ACCESS_TYPE 19 /* invalid access type */
+#define EACL_UNKNOWN_DATA 20 /* Unrecognized data in ACL */
+#define EACL_MISSING_FIELDS 21 /* missing fields in acl */
+
+#define EACL_INHERIT_NOTDIR 22 /* Need dir for inheritance */
+
+extern int aclcheck(aclent_t *, int, int *);
+extern int acltomode(aclent_t *, int, mode_t *);
+extern int aclfrommode(aclent_t *, int, mode_t *);
+extern int aclsort(int, int, aclent_t *);
+extern char *acltotext(aclent_t *, int);
+extern aclent_t *aclfromtext(char *, int *);
+extern void acl_free(acl_t *);
+extern int acl_get(const char *, int, acl_t **);
+extern int facl_get(int, int, acl_t **);
+extern int acl_set(const char *, acl_t *acl);
+extern int facl_set(int, acl_t *acl);
+extern int acl_strip(const char *, uid_t, gid_t, mode_t);
+extern int acl_trivial(const char *);
+extern char *acl_totext(acl_t *, int);
+extern int acl_fromtext(const char *, acl_t **);
+extern int acl_check(acl_t *, int);
+
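+/*
+ * Hypothetical example (a sketch, not from the original header): fetch a
+ * file's ACL and render it as text; the path is illustrative.
+ *
+ *	acl_t *aclp;
+ *	char *text;
+ *
+ *	if (acl_get("/export/file", ACL_NO_TRIVIAL, &aclp) == 0 &&
+ *	    aclp != NULL) {
+ *		text = acl_totext(aclp, ACL_COMPACT_FMT);
+ *		...
+ *		acl_free(aclp);
+ *	}
+ */
+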
+#else /* !defined(_KERNEL) */
+
+extern void ksort(caddr_t, int, int, int (*)(void *, void *));
+extern int cmp2acls(void *, void *);
+
+#endif /* !defined(_KERNEL) */
+
+extern int acl(const char *path, int cmd, int cnt, void *buf);
+extern int facl(int fd, int cmd, int cnt, void *buf);
+
+#ifdef __cplusplus
+}
+#endif
+
+#endif /* _SYS_ACL_H */
diff --git a/sys/cddl/contrib/opensolaris/uts/common/sys/acl_impl.h b/sys/cddl/contrib/opensolaris/uts/common/sys/acl_impl.h
new file mode 100644
index 000000000000..8718f5bcf63f
--- /dev/null
+++ b/sys/cddl/contrib/opensolaris/uts/common/sys/acl_impl.h
@@ -0,0 +1,61 @@
+/*
+ * CDDL HEADER START
+ *
+ * The contents of this file are subject to the terms of the
+ * Common Development and Distribution License (the "License").
+ * You may not use this file except in compliance with the License.
+ *
+ * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
+ * or http://www.opensolaris.org/os/licensing.
+ * See the License for the specific language governing permissions
+ * and limitations under the License.
+ *
+ * When distributing Covered Code, include this CDDL HEADER in each
+ * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
+ * If applicable, add the following below this CDDL HEADER, with the
+ * fields enclosed by brackets "[]" replaced with your own identifying
+ * information: Portions Copyright [yyyy] [name of copyright owner]
+ *
+ * CDDL HEADER END
+ */
+/*
+ * Copyright 2007 Sun Microsystems, Inc. All rights reserved.
+ * Use is subject to license terms.
+ */
+
+#ifndef _SYS_ACL_IMPL_H
+#define _SYS_ACL_IMPL_H
+
+#pragma ident "%Z%%M% %I% %E% SMI"
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+/*
+ * acl flags
+ *
+ * ACL_AUTO_INHERIT, ACL_PROTECTED and ACL_DEFAULTED
+ * flags can also be stored in this field.
+ */
+#define ACL_IS_TRIVIAL 0x10000
+#define ACL_IS_DIR 0x20000
+
+typedef enum acl_type {
+ ACLENT_T = 0,
+ ACE_T = 1
+} zfs_acl_type_t;
+
+struct acl_info {
+ zfs_acl_type_t acl_type; /* style of acl */
+ int acl_cnt; /* number of acl entries */
+ int acl_entry_size; /* sizeof acl entry */
+ int acl_flags; /* special flags about acl */
+ void *acl_aclp; /* the acl */
+};
+
+#ifdef __cplusplus
+}
+#endif
+
+#endif /* _SYS_ACL_IMPL_H */
diff --git a/sys/cddl/contrib/opensolaris/uts/common/sys/asm_linkage.h b/sys/cddl/contrib/opensolaris/uts/common/sys/asm_linkage.h
new file mode 100644
index 000000000000..f10bcec40a7b
--- /dev/null
+++ b/sys/cddl/contrib/opensolaris/uts/common/sys/asm_linkage.h
@@ -0,0 +1,82 @@
+/*
+ * CDDL HEADER START
+ *
+ * The contents of this file are subject to the terms of the
+ * Common Development and Distribution License, Version 1.0 only
+ * (the "License"). You may not use this file except in compliance
+ * with the License.
+ *
+ * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
+ * or http://www.opensolaris.org/os/licensing.
+ * See the License for the specific language governing permissions
+ * and limitations under the License.
+ *
+ * When distributing Covered Code, include this CDDL HEADER in each
+ * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
+ * If applicable, add the following below this CDDL HEADER, with the
+ * fields enclosed by brackets "[]" replaced with your own identifying
+ * information: Portions Copyright [yyyy] [name of copyright owner]
+ *
+ * CDDL HEADER END
+ */
+/*
+ * Copyright 2004 Sun Microsystems, Inc. All rights reserved.
+ * Use is subject to license terms.
+ */
+
+#ifndef _SYS_ASM_LINKAGE_H
+#define _SYS_ASM_LINKAGE_H
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+#ifdef _ASM /* The remainder of this file is only for assembly files */
+
+/*
+ * make annoying differences in assembler syntax go away
+ */
+
+#if defined(__i386__) || defined(__amd64__)
+
+#define ASM_ENTRY_ALIGN 16
+
+#else
+
+#error Unsupported architecture.
+
+#endif
+
+/*
+ * ENTRY provides the standard procedure entry code and an easy way to
+ * insert the calls to mcount for profiling. ENTRY_NP is identical, but
+ * never calls mcount.
+ */
+#define ENTRY(x) \
+ .text; \
+ .align ASM_ENTRY_ALIGN; \
+ .globl x; \
+ .type x, @function; \
+x:
+
+/*
+ * ALTENTRY provides for additional entry points.
+ */
+#define ALTENTRY(x) \
+ .globl x; \
+ .type x, @function; \
+x:
+
+/*
+ * SET_SIZE trails a function and sets the size for the ELF symbol table.
+ */
+#define SET_SIZE(x) \
+ .size x, [.-x]
+
+#endif /* _ASM */
+
+#ifdef __cplusplus
+}
+#endif
+
+#endif	/* _SYS_ASM_LINKAGE_H */
diff --git a/sys/cddl/contrib/opensolaris/uts/common/sys/avl.h b/sys/cddl/contrib/opensolaris/uts/common/sys/avl.h
new file mode 100644
index 000000000000..7e0e1e9dadd8
--- /dev/null
+++ b/sys/cddl/contrib/opensolaris/uts/common/sys/avl.h
@@ -0,0 +1,333 @@
+/*
+ * CDDL HEADER START
+ *
+ * The contents of this file are subject to the terms of the
+ * Common Development and Distribution License (the "License").
+ * You may not use this file except in compliance with the License.
+ *
+ * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
+ * or http://www.opensolaris.org/os/licensing.
+ * See the License for the specific language governing permissions
+ * and limitations under the License.
+ *
+ * When distributing Covered Code, include this CDDL HEADER in each
+ * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
+ * If applicable, add the following below this CDDL HEADER, with the
+ * fields enclosed by brackets "[]" replaced with your own identifying
+ * information: Portions Copyright [yyyy] [name of copyright owner]
+ *
+ * CDDL HEADER END
+ */
+/*
+ * Copyright 2009 Sun Microsystems, Inc. All rights reserved.
+ * Use is subject to license terms.
+ */
+
+/*
+ * Copyright (c) 2014 by Delphix. All rights reserved.
+ */
+
+#ifndef _AVL_H
+#define _AVL_H
+
+/*
+ * This is a private header file. Applications should not directly include
+ * this file.
+ */
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+#include <sys/types.h>
+#include <sys/avl_impl.h>
+
+/*
+ * This is a generic implementation of AVL trees for use in the Solaris kernel.
+ * The interfaces provide an efficient way of implementing an ordered set of
+ * data structures.
+ *
+ * AVL trees provide an alternative to using an ordered linked list. Using AVL
+ * trees will usually be faster; however, they require more storage. An ordered
+ * linked list in general requires 2 pointers in each data structure. The
+ * AVL tree implementation uses 3 pointers. The following chart gives the
+ * approximate performance of operations with the different approaches:
+ *
+ * Operation Link List AVL tree
+ * --------- -------- --------
+ * lookup O(n) O(log(n))
+ *
+ * insert 1 node constant constant
+ *
+ * delete 1 node constant between constant and O(log(n))
+ *
+ * delete all nodes O(n) O(n)
+ *
+ * visit the next
+ * or prev node constant between constant and O(log(n))
+ *
+ *
+ * The data structure nodes are anchored at an "avl_tree_t" (the equivalent
+ * of a list header) and the individual nodes will have a field of
+ * type "avl_node_t" (corresponding to list pointers).
+ *
+ * The type "avl_index_t" is used to indicate a position in the list for
+ * certain calls.
+ *
+ * The usage scenario is generally:
+ *
+ * 1. Create the list/tree with: avl_create()
+ *
+ * followed by any mixture of:
+ *
+ * 2a. Insert nodes with: avl_add(), or avl_find() and avl_insert()
+ *
+ * 2b. Visit elements with:
+ * avl_first() - returns the lowest valued node
+ * avl_last() - returns the highest valued node
+ * AVL_NEXT() - given a node go to next higher one
+ * AVL_PREV() - given a node go to previous lower one
+ *
+ * 2c. Find the node with the closest value either less than or greater
+ * than a given value with avl_nearest().
+ *
+ * 2d. Remove individual nodes from the list/tree with avl_remove().
+ *
+ * and finally when the list is being destroyed
+ *
+ * 3. Use avl_destroy_nodes() to quickly process/free up any remaining nodes.
+ * Note that once you use avl_destroy_nodes(), you can no longer
+ *	use any routine except avl_destroy_nodes() and avl_destroy().
+ *
+ * 4. Use avl_destroy() to destroy the AVL tree itself.
+ *
+ * Any locking for multiple thread access is up to the user to provide, just
+ * as is needed for any linked list implementation.
+ */
+
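+/*
+ * A worked sketch of the scenario above (hypothetical structure and
+ * comparator, not part of the original header; the comparator uses the
+ * AVL_CMP() helper defined below):
+ *
+ *	struct my_type {
+ *		int		value;
+ *		avl_node_t	my_link;
+ *	};
+ *
+ *	static int
+ *	my_compar(const void *a, const void *b)
+ *	{
+ *		const struct my_type *l = a, *r = b;
+ *
+ *		return (AVL_CMP(l->value, r->value));
+ *	}
+ *
+ *	avl_tree_t tree;
+ *	struct my_type *node;
+ *	void *cookie = NULL;
+ *
+ *	avl_create(&tree, my_compar, sizeof (struct my_type),
+ *	    offsetof(struct my_type, my_link));
+ *	avl_add(&tree, node);
+ *	for (node = avl_first(&tree); node != NULL;
+ *	    node = AVL_NEXT(&tree, node))
+ *		...
+ *	while ((node = avl_destroy_nodes(&tree, &cookie)) != NULL)
+ *		free(node);
+ *	avl_destroy(&tree);
+ */
+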
+/*
+ * AVL comparator helpers
+ */
+#define AVL_ISIGN(a) (((a) > 0) - ((a) < 0))
+#define AVL_CMP(a, b) (((a) > (b)) - ((a) < (b)))
+#define AVL_PCMP(a, b) \
+ (((uintptr_t)(a) > (uintptr_t)(b)) - ((uintptr_t)(a) < (uintptr_t)(b)))
+
+/*
+ * Type used for the root of the AVL tree.
+ */
+typedef struct avl_tree avl_tree_t;
+
+/*
+ * The data nodes in the AVL tree must have a field of this type.
+ */
+typedef struct avl_node avl_node_t;
+
+/*
+ * An opaque type used to locate a position in the tree where a node
+ * would be inserted.
+ */
+typedef uintptr_t avl_index_t;
+
+
+/*
+ * Direction constants used for avl_nearest().
+ */
+#define AVL_BEFORE (0)
+#define AVL_AFTER (1)
+
+
+/*
+ * Prototypes
+ *
+ * Where not otherwise mentioned, "void *" arguments are a pointer to the
+ * user data structure which must contain a field of type avl_node_t.
+ *
+ * Also assume the user data structure looks like:
+ *	struct my_type {
+ * ...
+ * avl_node_t my_link;
+ * ...
+ * };
+ */
+
+/*
+ * Initialize an AVL tree. Arguments are:
+ *
+ * tree - the tree to be initialized
+ * compar - function to compare two nodes; it must return exactly -1, 0, or +1:
+ * -1 for <, 0 for ==, and +1 for >
+ * size - the value of sizeof(struct my_type)
+ * offset - the value of OFFSETOF(struct my_type, my_link)
+ */
+extern void avl_create(avl_tree_t *tree,
+ int (*compar) (const void *, const void *), size_t size, size_t offset);
+
+
+/*
+ * Find a node with a matching value in the tree. Returns the matching node
+ * if found. If not found, it returns NULL and, if "where" is not NULL, sets
+ * "where" for use with avl_insert() or avl_nearest().
+ *
+ * node - node that has the value being looked for
+ * where - position for use with avl_nearest() or avl_insert(), may be NULL
+ */
+extern void *avl_find(avl_tree_t *tree, const void *node, avl_index_t *where);
+
+/*
+ * Insert a node into the tree.
+ *
+ * node - the node to insert
+ * where - position as returned from avl_find()
+ */
+extern void avl_insert(avl_tree_t *tree, void *node, avl_index_t where);
+
+/*
+ * Insert "new_data" in "tree" in the given "direction" either after
+ * or before the data "here".
+ *
+ * This might be useful for avl clients caching recently accessed
+ * data to avoid doing avl_find() again for insertion.
+ *
+ * new_data - new data to insert
+ * here - existing node in "tree"
+ * direction - either AVL_AFTER or AVL_BEFORE the data "here".
+ */
+extern void avl_insert_here(avl_tree_t *tree, void *new_data, void *here,
+ int direction);
+
+
+/*
+ * Return the first or last valued node in the tree. Will return NULL
+ * if the tree is empty.
+ *
+ */
+extern void *avl_first(avl_tree_t *tree);
+extern void *avl_last(avl_tree_t *tree);
+
+
+/*
+ * Return the next or previous valued node in the tree.
+ * AVL_NEXT() will return NULL if at the last node.
+ * AVL_PREV() will return NULL if at the first node.
+ *
+ * node - the node from which the next or previous node is found
+ */
+#define AVL_NEXT(tree, node) avl_walk(tree, node, AVL_AFTER)
+#define AVL_PREV(tree, node) avl_walk(tree, node, AVL_BEFORE)
+
+
+/*
+ * Find the node with the nearest value either greater or less than
+ * the value from a previous avl_find(). Returns the node or NULL if
+ * there isn't a matching one.
+ *
+ * where - position as returned from avl_find()
+ * direction - either AVL_BEFORE or AVL_AFTER
+ *
+ * EXAMPLE get the greatest node that is less than a given value:
+ *
+ * avl_tree_t *tree;
+ * struct my_data look_for_value = {....};
+ * struct my_data *node;
+ * struct my_data *less;
+ * avl_index_t where;
+ *
+ * node = avl_find(tree, &look_for_value, &where);
+ * if (node != NULL)
+ * less = AVL_PREV(tree, node);
+ * else
+ * less = avl_nearest(tree, where, AVL_BEFORE);
+ */
+extern void *avl_nearest(avl_tree_t *tree, avl_index_t where, int direction);
+
+
+/*
+ * Add a single node to the tree.
+ * The node must not be in the tree, and it must not
+ * compare equal to any other node already in the tree.
+ *
+ * node - the node to add
+ */
+extern void avl_add(avl_tree_t *tree, void *node);
+
+
+/*
+ * Remove a single node from the tree. The node must be in the tree.
+ *
+ * node - the node to remove
+ */
+extern void avl_remove(avl_tree_t *tree, void *node);
+
+/*
+ * Reinsert a node only if its order has changed relative to its nearest
+ * neighbors. To optimize performance avl_update_lt() checks only the previous
+ * node and avl_update_gt() checks only the next node. Use avl_update_lt() and
+ * avl_update_gt() only if you know the direction in which the order of the
+ * node may change.
+ */
+extern boolean_t avl_update(avl_tree_t *, void *);
+extern boolean_t avl_update_lt(avl_tree_t *, void *);
+extern boolean_t avl_update_gt(avl_tree_t *, void *);
+
+/*
+ * Swaps the contents of the two trees.
+ */
+extern void avl_swap(avl_tree_t *tree1, avl_tree_t *tree2);
+
+/*
+ * Return the number of nodes in the tree
+ */
+extern ulong_t avl_numnodes(avl_tree_t *tree);
+
+/*
+ * Return B_TRUE if there are zero nodes in the tree, B_FALSE otherwise.
+ */
+extern boolean_t avl_is_empty(avl_tree_t *tree);
+
+/*
+ * Used to destroy any remaining nodes in a tree. The cookie argument should
+ * be initialized to NULL before the first call. Returns a node that has been
+ * removed from the tree and may be free()'d. Returns NULL when the tree is
+ * empty.
+ *
+ * Once you call avl_destroy_nodes(), you may only continue calling it and
+ * finally avl_destroy(). No other AVL routines will be valid.
+ *
+ * cookie - a "void *" used to save state between calls to avl_destroy_nodes()
+ *
+ * EXAMPLE:
+ * avl_tree_t *tree;
+ * struct my_data *node;
+ * void *cookie;
+ *
+ * cookie = NULL;
+ * while ((node = avl_destroy_nodes(tree, &cookie)) != NULL)
+ * free(node);
+ * avl_destroy(tree);
+ */
+extern void *avl_destroy_nodes(avl_tree_t *tree, void **cookie);
+
+
+/*
+ * Final destroy of an AVL tree. Arguments are:
+ *
+ * tree - the empty tree to destroy
+ */
+extern void avl_destroy(avl_tree_t *tree);
+
+
+
+#ifdef __cplusplus
+}
+#endif
+
+#endif /* _AVL_H */
diff --git a/sys/cddl/contrib/opensolaris/uts/common/sys/avl_impl.h b/sys/cddl/contrib/opensolaris/uts/common/sys/avl_impl.h
new file mode 100644
index 000000000000..620685f370d4
--- /dev/null
+++ b/sys/cddl/contrib/opensolaris/uts/common/sys/avl_impl.h
@@ -0,0 +1,164 @@
+/*
+ * CDDL HEADER START
+ *
+ * The contents of this file are subject to the terms of the
+ * Common Development and Distribution License, Version 1.0 only
+ * (the "License"). You may not use this file except in compliance
+ * with the License.
+ *
+ * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
+ * or http://www.opensolaris.org/os/licensing.
+ * See the License for the specific language governing permissions
+ * and limitations under the License.
+ *
+ * When distributing Covered Code, include this CDDL HEADER in each
+ * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
+ * If applicable, add the following below this CDDL HEADER, with the
+ * fields enclosed by brackets "[]" replaced with your own identifying
+ * information: Portions Copyright [yyyy] [name of copyright owner]
+ *
+ * CDDL HEADER END
+ */
+/*
+ * Copyright 2004 Sun Microsystems, Inc. All rights reserved.
+ * Use is subject to license terms.
+ */
+
+#ifndef _AVL_IMPL_H
+#define _AVL_IMPL_H
+
+#pragma ident "%Z%%M% %I% %E% SMI"
+
+/*
+ * This is a private header file. Applications should not directly include
+ * this file.
+ */
+
+#include <sys/types.h>
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+
+/*
+ * generic AVL tree implementation for kernel use
+ *
+ * There are 5 pieces of information stored for each node in an AVL tree:
+ *
+ *	a pointer to the less than child
+ *	a pointer to the greater than child
+ *	a pointer to the parent of this node
+ *	an indication [0/1] of which child I am of my parent
+ *	a "balance" (-1, 0, +1) indicating which child tree is taller
+ *
+ * Since the last two fields need only 3 bits between them, they are packed
+ * into the bottom bits of the parent pointer on 64-bit machines to save space.
+ */
+
+#ifndef _LP64
+
+struct avl_node {
+ struct avl_node *avl_child[2]; /* left/right children */
+ struct avl_node *avl_parent; /* this node's parent */
+ unsigned short avl_child_index; /* my index in parent's avl_child[] */
+ short avl_balance; /* balance value: -1, 0, +1 */
+};
+
+#define AVL_XPARENT(n) ((n)->avl_parent)
+#define AVL_SETPARENT(n, p) ((n)->avl_parent = (p))
+
+#define AVL_XCHILD(n) ((n)->avl_child_index)
+#define AVL_SETCHILD(n, c) ((n)->avl_child_index = (unsigned short)(c))
+
+#define AVL_XBALANCE(n) ((n)->avl_balance)
+#define AVL_SETBALANCE(n, b) ((n)->avl_balance = (short)(b))
+
+#else /* _LP64 */
+
+/*
+ * For 64-bit machines, avl_pcb contains the parent pointer, balance, and
+ * child_index values packed in the following manner:
+ *
+ * |63 3| 2 |1 0 |
+ * |-------------------------------------|-----------------|-------------|
+ * | avl_parent hi order bits | avl_child_index | avl_balance |
+ * | | | + 1 |
+ * |-------------------------------------|-----------------|-------------|
+ *
+ */
+struct avl_node {
+ struct avl_node *avl_child[2]; /* left/right children nodes */
+ uintptr_t avl_pcb; /* parent, child_index, balance */
+};
+
+/*
+ * macros to extract/set fields in avl_pcb
+ *
+ * pointer to the parent of the current node is the high order bits
+ */
+#define AVL_XPARENT(n) ((struct avl_node *)((n)->avl_pcb & ~7))
+#define AVL_SETPARENT(n, p) \
+ ((n)->avl_pcb = (((n)->avl_pcb & 7) | (uintptr_t)(p)))
+
+/*
+ * index of this node in its parent's avl_child[]: bit #2
+ */
+#define AVL_XCHILD(n) (((n)->avl_pcb >> 2) & 1)
+#define AVL_SETCHILD(n, c) \
+ ((n)->avl_pcb = (uintptr_t)(((n)->avl_pcb & ~4) | ((c) << 2)))
+
+/*
+ * balance indication for a node, lowest 2 bits. A valid balance is
+ * -1, 0, or +1, and is encoded by adding 1 to the value to get the
+ * unsigned values of 0, 1, 2.
+ */
+#define AVL_XBALANCE(n) ((int)(((n)->avl_pcb & 3) - 1))
+#define AVL_SETBALANCE(n, b) \
+ ((n)->avl_pcb = (uintptr_t)((((n)->avl_pcb & ~3) | ((b) + 1))))
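+
+/*
+ * Worked example of the packing (the address is hypothetical): for a
+ * parent at 0x1000 (8-byte aligned, so its low 3 bits are free), a
+ * child index of 1, and a balance of -1:
+ *
+ *	avl_pcb = 0x1000 | (1 << 2) | (-1 + 1) = 0x1004
+ *
+ * AVL_XPARENT() masks off the low 3 bits to recover 0x1000,
+ * AVL_XCHILD() extracts bit 2 (1), and AVL_XBALANCE() takes the low
+ * 2 bits (0) and subtracts 1 to recover -1.
+ */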
+
+#endif /* _LP64 */
+
+
+
+/*
+ * switch between a node and data pointer for a given tree
+ * the value of "o" is tree->avl_offset
+ */
+#define AVL_NODE2DATA(n, o) ((void *)((uintptr_t)(n) - (o)))
+#define AVL_DATA2NODE(d, o) ((struct avl_node *)((uintptr_t)(d) + (o)))
+
+
+
+/*
+ * macros used to create/access an avl_index_t
+ */
+#define AVL_INDEX2NODE(x) ((avl_node_t *)((x) & ~1))
+#define AVL_INDEX2CHILD(x) ((x) & 1)
+#define AVL_MKINDEX(n, c) ((avl_index_t)(n) | (c))
+
+
+/*
+ * The tree structure. The fields avl_root, avl_compar, and avl_offset come
+ * first since they are needed for avl_find(). We want them to fit into
+ * a single 64-byte cache line to make avl_find() as fast as possible.
+ */
+struct avl_tree {
+ struct avl_node *avl_root; /* root node in tree */
+ int (*avl_compar)(const void *, const void *);
+ size_t avl_offset; /* offsetof(type, avl_link_t field) */
+ ulong_t avl_numnodes; /* number of nodes in the tree */
+ size_t avl_size; /* sizeof user type struct */
+};
+
+
+/*
+ * This will only be used via AVL_NEXT() or AVL_PREV()
+ */
+extern void *avl_walk(struct avl_tree *, void *, int);
+
+#ifdef __cplusplus
+}
+#endif
+
+#endif /* _AVL_IMPL_H */
diff --git a/sys/cddl/contrib/opensolaris/uts/common/sys/bitmap.h b/sys/cddl/contrib/opensolaris/uts/common/sys/bitmap.h
new file mode 100644
index 000000000000..485abc48f1a2
--- /dev/null
+++ b/sys/cddl/contrib/opensolaris/uts/common/sys/bitmap.h
@@ -0,0 +1,198 @@
+/*
+ * CDDL HEADER START
+ *
+ * The contents of this file are subject to the terms of the
+ * Common Development and Distribution License (the "License").
+ * You may not use this file except in compliance with the License.
+ *
+ * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
+ * or http://www.opensolaris.org/os/licensing.
+ * See the License for the specific language governing permissions
+ * and limitations under the License.
+ *
+ * When distributing Covered Code, include this CDDL HEADER in each
+ * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
+ * If applicable, add the following below this CDDL HEADER, with the
+ * fields enclosed by brackets "[]" replaced with your own identifying
+ * information: Portions Copyright [yyyy] [name of copyright owner]
+ *
+ * CDDL HEADER END
+ */
+
+/*
+ * Copyright 2006 Sun Microsystems, Inc. All rights reserved.
+ * Use is subject to license terms.
+ */
+
+/*
+ * Copyright (c) 2014 by Delphix. All rights reserved.
+ * Copyright 2017 RackTop Systems.
+ */
+
+/* Copyright (c) 1984, 1986, 1987, 1988, 1989 AT&T */
+/* All Rights Reserved */
+
+
+#ifndef _SYS_BITMAP_H
+#define _SYS_BITMAP_H
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+#include <sys/feature_tests.h>
+#if defined(__GNUC__) && defined(_ASM_INLINES) && \
+ (defined(__i386) || defined(__amd64))
+#include <asm/bitmap.h>
+#endif
+
+/*
+ * Operations on bitmaps of arbitrary size
+ * A bitmap is a vector of 1 or more ulong_t's.
+ * The user of the package is responsible for range checks and keeping
+ * track of sizes.
+ */
+
+#ifdef _LP64
+#define BT_ULSHIFT 6 /* log base 2 of BT_NBIPUL, to extract word index */
+#define BT_ULSHIFT32 5 /* log base 2 of BT_NBIPUL32, to extract word index */
+#else
+#define BT_ULSHIFT 5 /* log base 2 of BT_NBIPUL, to extract word index */
+#endif
+
+#define BT_NBIPUL (1 << BT_ULSHIFT) /* n bits per ulong_t */
+#define BT_ULMASK (BT_NBIPUL - 1) /* to extract bit index */
+
+#ifdef _LP64
+#define BT_NBIPUL32 (1 << BT_ULSHIFT32) /* n bits per ulong_t */
+#define BT_ULMASK32 (BT_NBIPUL32 - 1) /* to extract bit index */
+#define BT_ULMAXMASK 0xffffffffffffffff /* used by bt_getlowbit */
+#else
+#define BT_ULMAXMASK 0xffffffff
+#endif
+
+/*
+ * bitmap is a ulong_t *, bitindex an index_t
+ *
+ * The macros BT_WIM and BT_BIW are internal; there is no need
+ * for users of this package to use them.
+ */
+
+/*
+ * word in map
+ */
+#define BT_WIM(bitmap, bitindex) \
+ ((bitmap)[(bitindex) >> BT_ULSHIFT])
+/*
+ * bit in word
+ */
+#define BT_BIW(bitindex) \
+ (1UL << ((bitindex) & BT_ULMASK))
+
+#ifdef _LP64
+#define BT_WIM32(bitmap, bitindex) \
+ ((bitmap)[(bitindex) >> BT_ULSHIFT32])
+
+#define BT_BIW32(bitindex) \
+ (1UL << ((bitindex) & BT_ULMASK32))
+#endif
+
+/*
+ * These are public macros
+ *
+ * BT_BITOUL == n bits to n ulong_t's
+ */
+#define BT_BITOUL(nbits) \
+ (((nbits) + BT_NBIPUL - 1l) / BT_NBIPUL)
+#define BT_SIZEOFMAP(nbits) \
+ (BT_BITOUL(nbits) * sizeof (ulong_t))
+#define BT_TEST(bitmap, bitindex) \
+ ((BT_WIM((bitmap), (bitindex)) & BT_BIW(bitindex)) ? 1 : 0)
+#define BT_SET(bitmap, bitindex) \
+ { BT_WIM((bitmap), (bitindex)) |= BT_BIW(bitindex); }
+#define BT_CLEAR(bitmap, bitindex) \
+ { BT_WIM((bitmap), (bitindex)) &= ~BT_BIW(bitindex); }
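+
+/*
+ * EXAMPLE (sketch; the 256-bit size is arbitrary):
+ *
+ *	ulong_t map[BT_BITOUL(256)];
+ *
+ *	bzero(map, BT_SIZEOFMAP(256));
+ *	BT_SET(map, 42);
+ *	ASSERT(BT_TEST(map, 42) == 1);
+ *	BT_CLEAR(map, 42);
+ */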
+
+#ifdef _LP64
+#define BT_BITOUL32(nbits) \
+ (((nbits) + BT_NBIPUL32 - 1l) / BT_NBIPUL32)
+#define BT_SIZEOFMAP32(nbits) \
+ (BT_BITOUL32(nbits) * sizeof (uint_t))
+#define BT_TEST32(bitmap, bitindex) \
+ ((BT_WIM32((bitmap), (bitindex)) & BT_BIW32(bitindex)) ? 1 : 0)
+#define BT_SET32(bitmap, bitindex) \
+ { BT_WIM32((bitmap), (bitindex)) |= BT_BIW32(bitindex); }
+#define BT_CLEAR32(bitmap, bitindex) \
+ { BT_WIM32((bitmap), (bitindex)) &= ~BT_BIW32(bitindex); }
+#endif /* _LP64 */
+
+
+/*
+ * BIT_ONLYONESET is a private macro not designed for bitmaps of
+ * arbitrary size. u must be an unsigned integer/long. It returns
+ * true if one and only one bit is set in u.
+ */
+#define BIT_ONLYONESET(u) \
+ ((((u) == 0) ? 0 : ((u) & ((u) - 1)) == 0))
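+
+/*
+ * For example, for u = 0x10, u - 1 = 0x0f and (u & (u - 1)) == 0, so
+ * exactly one bit is set; for u = 0x18, u & (u - 1) = 0x10 != 0.
+ */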
+
+#if (defined(_KERNEL) || defined(_FAKE_KERNEL)) && !defined(_ASM)
+#include <sys/atomic.h>
+
+/*
+ * return next available bit index from map with specified number of bits
+ */
+extern index_t bt_availbit(ulong_t *bitmap, size_t nbits);
+/*
+ * find the highest order bit that is on, and is within or below
+ * the word specified by wx
+ */
+extern int bt_gethighbit(ulong_t *mapp, int wx);
+extern int bt_range(ulong_t *bitmap, size_t *pos1, size_t *pos2,
+ size_t end_pos);
+/*
+ * Find the highest or lowest bit that is set.
+ * Returns the bit number + 1 of the set bit, otherwise returns 0.
+ * The low order bit is numbered 0; the high order bit is 31 (63 for
+ * 64-bit values).
+ */
+extern int highbit(ulong_t);
+extern int highbit64(uint64_t);
+extern int lowbit(ulong_t);
+extern int bt_getlowbit(ulong_t *bitmap, size_t start, size_t stop);
+extern void bt_copy(ulong_t *, ulong_t *, ulong_t);
+
+/*
+ * find the parity
+ */
+extern int odd_parity(ulong_t);
+
+/*
+ * Atomically set/clear bits
+ * Atomic exclusive operations will set "result" to "-1"
+ * if the bit is already set/cleared. "result" will be set
+ * to 0 otherwise.
+ */
+#define BT_ATOMIC_SET(bitmap, bitindex) \
+ { atomic_or_ulong(&(BT_WIM(bitmap, bitindex)), BT_BIW(bitindex)); }
+#define BT_ATOMIC_CLEAR(bitmap, bitindex) \
+ { atomic_and_ulong(&(BT_WIM(bitmap, bitindex)), ~BT_BIW(bitindex)); }
+
+#define BT_ATOMIC_SET_EXCL(bitmap, bitindex, result) \
+ { result = atomic_set_long_excl(&(BT_WIM(bitmap, bitindex)), \
+ (bitindex) % BT_NBIPUL); }
+#define BT_ATOMIC_CLEAR_EXCL(bitmap, bitindex, result) \
+ { result = atomic_clear_long_excl(&(BT_WIM(bitmap, bitindex)), \
+ (bitindex) % BT_NBIPUL); }
+
+/*
+ * Extracts bits between index h (high, inclusive) and l (low, inclusive) from
+ * u, which must be an unsigned integer.
+ */
+#define BITX(u, h, l) (((u) >> (l)) & ((1LU << ((h) - (l) + 1LU)) - 1LU))
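+
+/*
+ * For example, BITX(0xABCD, 11, 8) shifts right by 8 and masks with
+ * ((1 << 4) - 1) = 0xf, yielding 0xB.
+ */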
+
+#endif /* (_KERNEL || _FAKE_KERNEL) && !_ASM */
+
+#ifdef __cplusplus
+}
+#endif
+
+#endif /* _SYS_BITMAP_H */
diff --git a/sys/cddl/contrib/opensolaris/uts/common/sys/callb.h b/sys/cddl/contrib/opensolaris/uts/common/sys/callb.h
new file mode 100644
index 000000000000..43f14ebb369d
--- /dev/null
+++ b/sys/cddl/contrib/opensolaris/uts/common/sys/callb.h
@@ -0,0 +1,215 @@
+/*
+ * CDDL HEADER START
+ *
+ * The contents of this file are subject to the terms of the
+ * Common Development and Distribution License (the "License").
+ * You may not use this file except in compliance with the License.
+ *
+ * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
+ * or http://www.opensolaris.org/os/licensing.
+ * See the License for the specific language governing permissions
+ * and limitations under the License.
+ *
+ * When distributing Covered Code, include this CDDL HEADER in each
+ * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
+ * If applicable, add the following below this CDDL HEADER, with the
+ * fields enclosed by brackets "[]" replaced with your own identifying
+ * information: Portions Copyright [yyyy] [name of copyright owner]
+ *
+ * CDDL HEADER END
+ */
+/*
+ * Copyright 2009 Sun Microsystems, Inc. All rights reserved.
+ * Use is subject to license terms.
+ */
+
+#ifndef _SYS_CALLB_H
+#define _SYS_CALLB_H
+
+#include <sys/kcondvar.h>
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+/*
+ * definitions of callback classes (c_class)
+ *
+ * Callbacks belong in the same class if (1) their callback routines
+ * do the same kind of processing (ideally, using the same callback function)
+ * and (2) they can/should be executed at the same time in a cpr
+ * suspend/resume operation.
+ *
+ * Note: The DAEMON class, in particular, is for stopping kernel threads
+ * and nothing else. The CALLB_* macros below should be used to deal
+ * with kernel threads, and the callback function should be callb_generic_cpr.
+ * Another idiosyncrasy of the DAEMON class is that if a suspend operation
+ * fails, some of the callback functions that were never called with
+ * SUSPEND may be called with the RESUME code. Not a problem currently,
+ * but see bug 4201851.
+ */
+#define CB_CL_CPR_DAEMON 0
+#define CB_CL_CPR_VM 1
+#define CB_CL_CPR_CALLOUT 2
+#define CB_CL_CPR_OBP 3
+#define CB_CL_CPR_FB 4
+#define CB_CL_PANIC 5
+#define CB_CL_CPR_RPC 6
+#define CB_CL_CPR_PROMPRINTF 7
+#define CB_CL_UADMIN 8
+#define CB_CL_CPR_PM 9
+#define CB_CL_HALT 10
+#define CB_CL_CPR_DMA 11
+#define CB_CL_CPR_POST_USER 12
+#define CB_CL_UADMIN_PRE_VFS 13
+#define CB_CL_MDBOOT CB_CL_UADMIN
+#define CB_CL_ENTER_DEBUGGER 14
+#define CB_CL_CPR_POST_KERNEL 15
+#define CB_CL_CPU_DEEP_IDLE 16
+#define NCBCLASS 17 /* CHANGE ME if classes are added/removed */
+
+/*
+ * CB_CL_CPR_DAEMON class specific definitions are given below:
+ */
+
+/*
+ * code for CPR callb_execute_class
+ */
+#define CB_CODE_CPR_CHKPT 0
+#define CB_CODE_CPR_RESUME 1
+
+typedef void * callb_id_t;
+/*
+ * Per kernel thread structure for CPR daemon callbacks.
+ * Must be protected by either an existing lock in the daemon or
+ * a new lock created for such a purpose.
+ */
+typedef struct callb_cpr {
+ kmutex_t *cc_lockp; /* lock to protect this struct */
+ char cc_events; /* various events for CPR */
+ callb_id_t cc_id; /* callb id address */
+ kcondvar_t cc_callb_cv; /* cv for callback waiting */
+ kcondvar_t cc_stop_cv; /* cv to checkpoint block */
+} callb_cpr_t;
+
+/*
+ * cc_events definitions
+ */
+#define CALLB_CPR_START 1 /* a checkpoint request has started */
+#define CALLB_CPR_SAFE 2 /* thread is safe for CPR */
+#define CALLB_CPR_ALWAYS_SAFE 4 /* thread is ALWAYS safe for CPR */
+
+/*
+ * Used when checking that all kernel threads are stopped.
+ */
+#define CALLB_MAX_RETRY 3 /* when waiting for kthread to sleep */
+#define CALLB_THREAD_DELAY 10 /* ticks allowed to reach sleep */
+#define CPR_KTHREAD_TIMEOUT_SEC 90 /* secs before callback times out -- */
+ /* due to pwr mgmt of disks, make -- */
+ /* big enough for worst spinup time */
+
+#ifdef _KERNEL
+/*
+ * The CALLB_CPR_INIT macro is used by kernel threads to add their entry to
+ * the callback table and perform other initialization. It automatically
+ * adds the thread as being in the callback class CB_CL_CPR_DAEMON.
+ *
+ * cp - ptr to the callb_cpr_t structure for this kernel thread
+ *
+ * lockp - pointer to the mutex protecting the callb_cpr_t struct
+ *
+ * func - pointer to the callback function for this kernel thread.
+ * It has the prototype boolean_t <func>(void *arg, int code)
+ * where: arg - ptr to the callb_cpr_t structure
+ * code - not used for this type of callback
+ * returns: B_TRUE if successful; B_FALSE if unsuccessful.
+ *
+ * name - a string giving the name of the kernel thread
+ *
+ * Note: lockp is the lock that protects the callb_cpr_t (cp) structure
+ * later on. No lock needs to be held for this initialization.
+ */
+#define CALLB_CPR_INIT(cp, lockp, func, name) { \
+ strlcpy(curthread->td_name, (name), \
+ sizeof(curthread->td_name)); \
+ bzero((caddr_t)(cp), sizeof (callb_cpr_t)); \
+ (cp)->cc_lockp = lockp; \
+ (cp)->cc_id = callb_add(func, (void *)(cp), \
+ CB_CL_CPR_DAEMON, name); \
+ cv_init(&(cp)->cc_callb_cv, NULL, CV_DEFAULT, NULL); \
+ cv_init(&(cp)->cc_stop_cv, NULL, CV_DEFAULT, NULL); \
+ }
+
+#ifndef __lock_lint
+#define CALLB_CPR_ASSERT(cp) ASSERT(MUTEX_HELD((cp)->cc_lockp));
+#else
+#define CALLB_CPR_ASSERT(cp)
+#endif
+/*
+ * Some threads (like the idle threads) do not adhere to the callback
+ * protocol and are always considered safe. Such threads must never exit.
+ * They register their presence by calling this macro during their
+ * initialization.
+ *
+ * Args:
+ * t - thread pointer of the client kernel thread
+ * name - a string giving the name of the kernel thread
+ */
+#define CALLB_CPR_INIT_SAFE(t, name) { \
+ (void) callb_add_thread(callb_generic_cpr_safe, \
+ (void *) &callb_cprinfo_safe, CB_CL_CPR_DAEMON, \
+ name, t); \
+ }
+/*
+ * The lock to protect cp's content must be held before
+ * calling the following two macros.
+ *
+ * Any code region between CALLB_CPR_SAFE_BEGIN and CALLB_CPR_SAFE_END
+ * is safe for checkpoint/resume.
+ */
+#define CALLB_CPR_SAFE_BEGIN(cp) { \
+ CALLB_CPR_ASSERT(cp) \
+ (cp)->cc_events |= CALLB_CPR_SAFE; \
+ if ((cp)->cc_events & CALLB_CPR_START) \
+ cv_signal(&(cp)->cc_callb_cv); \
+ }
+#define CALLB_CPR_SAFE_END(cp, lockp) { \
+ CALLB_CPR_ASSERT(cp) \
+ while ((cp)->cc_events & CALLB_CPR_START) \
+ cv_wait(&(cp)->cc_stop_cv, lockp); \
+ (cp)->cc_events &= ~CALLB_CPR_SAFE; \
+ }
+/*
+ * cv_destroy is a nop right now but may be needed in the future.
+ */
+#define CALLB_CPR_EXIT(cp) { \
+ CALLB_CPR_ASSERT(cp) \
+ (cp)->cc_events |= CALLB_CPR_SAFE; \
+ if ((cp)->cc_events & CALLB_CPR_START) \
+ cv_signal(&(cp)->cc_callb_cv); \
+ mutex_exit((cp)->cc_lockp); \
+ (void) callb_delete((cp)->cc_id); \
+ cv_destroy(&(cp)->cc_callb_cv); \
+ cv_destroy(&(cp)->cc_stop_cv); \
+ }
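+
+/*
+ * EXAMPLE of the typical daemon-thread pattern (sketch; the mutex,
+ * condition variable, and work loop are hypothetical):
+ *
+ *	callb_cpr_t cprinfo;
+ *
+ *	CALLB_CPR_INIT(&cprinfo, &my_lock, callb_generic_cpr, "mydaemon");
+ *	mutex_enter(&my_lock);
+ *	for (;;) {
+ *		CALLB_CPR_SAFE_BEGIN(&cprinfo);
+ *		cv_wait(&my_cv, &my_lock);
+ *		CALLB_CPR_SAFE_END(&cprinfo, &my_lock);
+ *		if (done)
+ *			break;
+ *		do_work();
+ *	}
+ *	CALLB_CPR_EXIT(&cprinfo);	(this also drops my_lock)
+ */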
+
+extern callb_cpr_t callb_cprinfo_safe;
+extern callb_id_t callb_add(boolean_t (*)(void *, int), void *, int, char *);
+extern callb_id_t callb_add_thread(boolean_t (*)(void *, int),
+ void *, int, char *, kthread_id_t);
+extern int callb_delete(callb_id_t);
+extern void callb_execute(callb_id_t, int);
+extern void *callb_execute_class(int, int);
+extern boolean_t callb_generic_cpr(void *, int);
+extern boolean_t callb_generic_cpr_safe(void *, int);
+extern boolean_t callb_is_stopped(kthread_id_t, caddr_t *);
+extern void callb_lock_table(void);
+extern void callb_unlock_table(void);
+#endif
+
+#ifdef __cplusplus
+}
+#endif
+
+#endif /* _SYS_CALLB_H */
diff --git a/sys/cddl/contrib/opensolaris/uts/common/sys/ccompile.h b/sys/cddl/contrib/opensolaris/uts/common/sys/ccompile.h
new file mode 100644
index 000000000000..c9857b086575
--- /dev/null
+++ b/sys/cddl/contrib/opensolaris/uts/common/sys/ccompile.h
@@ -0,0 +1,127 @@
+/*
+ * CDDL HEADER START
+ *
+ * The contents of this file are subject to the terms of the
+ * Common Development and Distribution License, Version 1.0 only
+ * (the "License"). You may not use this file except in compliance
+ * with the License.
+ *
+ * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
+ * or http://www.opensolaris.org/os/licensing.
+ * See the License for the specific language governing permissions
+ * and limitations under the License.
+ *
+ * When distributing Covered Code, include this CDDL HEADER in each
+ * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
+ * If applicable, add the following below this CDDL HEADER, with the
+ * fields enclosed by brackets "[]" replaced with your own identifying
+ * information: Portions Copyright [yyyy] [name of copyright owner]
+ *
+ * CDDL HEADER END
+ */
+/*
+ * Copyright 2004 Sun Microsystems, Inc. All rights reserved.
+ * Use is subject to license terms.
+ */
+
+#ifndef _SYS_CCOMPILE_H
+#define _SYS_CCOMPILE_H
+
+#pragma ident "%Z%%M% %I% %E% SMI"
+
+/*
+ * This file contains definitions designed to enable different compilers
+ * to be used harmoniously on Solaris systems.
+ */
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+/*
+ * Allow for version tests for compiler bugs and features.
+ */
+#if defined(__GNUC__)
+#define __GNUC_VERSION \
+ (__GNUC__ * 10000 + __GNUC_MINOR__ * 100 + __GNUC_PATCHLEVEL__)
+#else
+#define __GNUC_VERSION 0
+#endif
+
+#if defined(__ATTRIBUTE_IMPLEMENTED) || defined(__GNUC__)
+
+/*
+ * analogous to lint's PRINTFLIKEn
+ */
+#define __sun_attr___PRINTFLIKE__(__n) \
+ __attribute__((__format__(printf, __n, (__n)+1)))
+#define __sun_attr___VPRINTFLIKE__(__n) \
+ __attribute__((__format__(printf, __n, 0)))
+
+/*
+ * Handle the kernel printf routines that can take '%b' too
+ */
+#if __GNUC_VERSION < 30402
+/*
+ * XX64 at least this doesn't work correctly yet with 3.4.1 anyway!
+ */
+#define __sun_attr___KPRINTFLIKE__ __sun_attr___PRINTFLIKE__
+#define __sun_attr___KVPRINTFLIKE__ __sun_attr___VPRINTFLIKE__
+#else
+#define __sun_attr___KPRINTFLIKE__(__n) \
+ __attribute__((__format__(cmn_err, __n, (__n)+1)))
+#define __sun_attr___KVPRINTFLIKE__(__n) \
+ __attribute__((__format__(cmn_err, __n, 0)))
+#endif
+
+/*
+ * This one's pretty obvious -- the function never returns
+ */
+#define __sun_attr___noreturn__ __attribute__((__noreturn__))
+
+
+/*
+ * This is an appropriate label for functions that do not
+ * modify their arguments, e.g. strlen()
+ */
+#define __sun_attr___pure__ __attribute__((__pure__))
+
+/*
+ * This is a stronger form of __pure__. Can be used for functions
+ * that do not modify their arguments and don't depend on global
+ * memory.
+ */
+#define __sun_attr___const__ __attribute__((__const__))
+
+/*
+ * structure packing like #pragma pack(1)
+ */
+#define __sun_attr___packed__ __attribute__((__packed__))
+
+#define ___sun_attr_inner(__a) __sun_attr_##__a
+#define __sun_attr__(__a) ___sun_attr_inner __a
+
+#else /* __ATTRIBUTE_IMPLEMENTED || __GNUC__ */
+
+#define __sun_attr__(__a)
+
+#endif /* __ATTRIBUTE_IMPLEMENTED || __GNUC__ */
+
+/*
+ * Shorthand versions for readability
+ */
+
+#define __PRINTFLIKE(__n) __sun_attr__((__PRINTFLIKE__(__n)))
+#define __VPRINTFLIKE(__n) __sun_attr__((__VPRINTFLIKE__(__n)))
+#define __KPRINTFLIKE(__n) __sun_attr__((__KPRINTFLIKE__(__n)))
+#define __KVPRINTFLIKE(__n) __sun_attr__((__KVPRINTFLIKE__(__n)))
+#define __NORETURN __sun_attr__((__noreturn__))
+#define __CONST __sun_attr__((__const__))
+#define __PURE __sun_attr__((__pure__))
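+
+/*
+ * EXAMPLE (sketch; "my_log" is a hypothetical function): a printf-style
+ * function whose format string is argument 1 and whose variable
+ * arguments start at argument 2 would be declared as
+ *
+ *	extern void my_log(const char *fmt, ...) __PRINTFLIKE(1);
+ */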
+
+
+#ifdef __cplusplus
+}
+#endif
+
+#endif /* _SYS_CCOMPILE_H */
diff --git a/sys/cddl/contrib/opensolaris/uts/common/sys/cmn_err.h b/sys/cddl/contrib/opensolaris/uts/common/sys/cmn_err.h
new file mode 100644
index 000000000000..e710d8e5c30b
--- /dev/null
+++ b/sys/cddl/contrib/opensolaris/uts/common/sys/cmn_err.h
@@ -0,0 +1,128 @@
+/*
+ * CDDL HEADER START
+ *
+ * The contents of this file are subject to the terms of the
+ * Common Development and Distribution License, Version 1.0 only
+ * (the "License"). You may not use this file except in compliance
+ * with the License.
+ *
+ * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
+ * or http://www.opensolaris.org/os/licensing.
+ * See the License for the specific language governing permissions
+ * and limitations under the License.
+ *
+ * When distributing Covered Code, include this CDDL HEADER in each
+ * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
+ * If applicable, add the following below this CDDL HEADER, with the
+ * fields enclosed by brackets "[]" replaced with your own identifying
+ * information: Portions Copyright [yyyy] [name of copyright owner]
+ *
+ * CDDL HEADER END
+ */
+/* Copyright (c) 1984, 1986, 1987, 1988, 1989 AT&T */
+/* All Rights Reserved */
+
+
+/*
+ * Copyright 2004 Sun Microsystems, Inc. All rights reserved.
+ * Use is subject to license terms.
+ */
+
+#ifndef _SYS_CMN_ERR_H
+#define _SYS_CMN_ERR_H
+
+#pragma ident "%Z%%M% %I% %E% SMI"
+
+#if defined(_KERNEL) && !defined(_ASM)
+#include <sys/va_list.h>
+#endif
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+/* Common error handling severity levels */
+
+#define CE_CONT 0 /* continuation */
+#define CE_NOTE 1 /* notice */
+#define CE_WARN 2 /* warning */
+#define CE_PANIC 3 /* panic */
+#define CE_IGNORE 4 /* print nothing */
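+
+/*
+ * EXAMPLE (sketch): a driver reporting a non-fatal condition would use
+ *
+ *	cmn_err(CE_WARN, "device %d: transfer retried", unit);
+ *
+ * while CE_PANIC both prints the message and panics the system.
+ */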
+
+#ifndef _ASM
+
+#ifdef _KERNEL
+
+/*PRINTFLIKE2*/
+extern void cmn_err(int, const char *, ...)
+ __KPRINTFLIKE(2);
+#pragma rarely_called(cmn_err)
+
+extern void vzcmn_err(zoneid_t, int, const char *, __va_list)
+ __KVPRINTFLIKE(3);
+#pragma rarely_called(vzcmn_err)
+
+extern void vcmn_err(int, const char *, __va_list)
+ __KVPRINTFLIKE(2);
+#pragma rarely_called(vcmn_err)
+
+/*PRINTFLIKE3*/
+extern void zcmn_err(zoneid_t, int, const char *, ...)
+ __KPRINTFLIKE(3);
+#pragma rarely_called(zcmn_err)
+
+/*PRINTFLIKE1*/
+extern void printf(const char *, ...)
+ __KPRINTFLIKE(1);
+#pragma rarely_called(printf)
+
+extern void vzprintf(zoneid_t, const char *, __va_list)
+ __KVPRINTFLIKE(2);
+#pragma rarely_called(vzprintf)
+
+/*PRINTFLIKE2*/
+extern void zprintf(zoneid_t, const char *, ...)
+ __KPRINTFLIKE(2);
+#pragma rarely_called(zprintf)
+
+extern void vprintf(const char *, __va_list)
+ __KVPRINTFLIKE(1);
+#pragma rarely_called(vprintf)
+
+/*PRINTFLIKE1*/
+extern void uprintf(const char *, ...)
+ __KPRINTFLIKE(1);
+#pragma rarely_called(uprintf)
+
+extern void vuprintf(const char *, __va_list)
+ __KVPRINTFLIKE(1);
+#pragma rarely_called(vuprintf)
+
+/*PRINTFLIKE3*/
+extern size_t snprintf(char *, size_t, const char *, ...)
+ __KPRINTFLIKE(3);
+extern size_t vsnprintf(char *, size_t, const char *, __va_list)
+ __KVPRINTFLIKE(3);
+/*PRINTFLIKE2*/
+extern char *sprintf(char *, const char *, ...)
+ __KPRINTFLIKE(2);
+extern char *vsprintf(char *, const char *, __va_list)
+ __KVPRINTFLIKE(2);
+
+/*PRINTFLIKE1*/
+extern void panic(const char *, ...)
+ __KPRINTFLIKE(1) __NORETURN;
+#pragma rarely_called(panic)
+
+extern void vpanic(const char *, __va_list)
+ __KVPRINTFLIKE(1) __NORETURN;
+#pragma rarely_called(vpanic)
+
+#endif /* _KERNEL */
+#endif /* !_ASM */
+
+#ifdef __cplusplus
+}
+#endif
+
+#endif /* _SYS_CMN_ERR_H */
diff --git a/sys/cddl/contrib/opensolaris/uts/common/sys/compress.h b/sys/cddl/contrib/opensolaris/uts/common/sys/compress.h
new file mode 100644
index 000000000000..3d79d9511092
--- /dev/null
+++ b/sys/cddl/contrib/opensolaris/uts/common/sys/compress.h
@@ -0,0 +1,46 @@
+/*
+ * CDDL HEADER START
+ *
+ * The contents of this file are subject to the terms of the
+ * Common Development and Distribution License, Version 1.0 only
+ * (the "License"). You may not use this file except in compliance
+ * with the License.
+ *
+ * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
+ * or http://www.opensolaris.org/os/licensing.
+ * See the License for the specific language governing permissions
+ * and limitations under the License.
+ *
+ * When distributing Covered Code, include this CDDL HEADER in each
+ * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
+ * If applicable, add the following below this CDDL HEADER, with the
+ * fields enclosed by brackets "[]" replaced with your own identifying
+ * information: Portions Copyright [yyyy] [name of copyright owner]
+ *
+ * CDDL HEADER END
+ */
+/*
+ * Copyright (c) 1998 by Sun Microsystems, Inc.
+ * All rights reserved.
+ */
+
+#ifndef _SYS_COMPRESS_H
+#define _SYS_COMPRESS_H
+
+#pragma ident "%Z%%M% %I% %E% SMI"
+
+#include <sys/types.h>
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+extern size_t compress(void *, void *, size_t);
+extern size_t decompress(void *, void *, size_t, size_t);
+extern uint32_t checksum32(void *, size_t);
+
+#ifdef __cplusplus
+}
+#endif
+
+#endif /* _SYS_COMPRESS_H */
diff --git a/sys/cddl/contrib/opensolaris/uts/common/sys/cpupart.h b/sys/cddl/contrib/opensolaris/uts/common/sys/cpupart.h
new file mode 100644
index 000000000000..a91c989bbff1
--- /dev/null
+++ b/sys/cddl/contrib/opensolaris/uts/common/sys/cpupart.h
@@ -0,0 +1,158 @@
+/*
+ * CDDL HEADER START
+ *
+ * The contents of this file are subject to the terms of the
+ * Common Development and Distribution License (the "License").
+ * You may not use this file except in compliance with the License.
+ *
+ * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
+ * or http://www.opensolaris.org/os/licensing.
+ * See the License for the specific language governing permissions
+ * and limitations under the License.
+ *
+ * When distributing Covered Code, include this CDDL HEADER in each
+ * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
+ * If applicable, add the following below this CDDL HEADER, with the
+ * fields enclosed by brackets "[]" replaced with your own identifying
+ * information: Portions Copyright [yyyy] [name of copyright owner]
+ *
+ * CDDL HEADER END
+ */
+/*
+ * Copyright (c) 1996, 2010, Oracle and/or its affiliates. All rights reserved.
+ * Copyright 2017 RackTop Systems.
+ */
+
+#ifndef _SYS_CPUPART_H
+#define _SYS_CPUPART_H
+
+#include <sys/types.h>
+#include <sys/processor.h>
+#include <sys/cpuvar.h>
+#include <sys/disp.h>
+#include <sys/pset.h>
+#include <sys/lgrp.h>
+#include <sys/lgrp_user.h>
+#include <sys/pg.h>
+#include <sys/bitset.h>
+#include <sys/time.h>
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+#if defined(_KERNEL) || defined(_FAKE_KERNEL)
+
+typedef int cpupartid_t;
+
+/*
+ * Special partition id.
+ */
+#define CP_DEFAULT 0
+
+/*
+ * Flags for cpupart_list()
+ */
+#define CP_ALL 0 /* return all cpu partitions */
+#define CP_NONEMPTY 1 /* return only non-empty ones */
+
+typedef struct cpupart {
+ disp_t cp_kp_queue; /* partition-wide kpreempt queue */
+ cpupartid_t cp_id; /* partition ID */
+ int cp_ncpus; /* number of online processors */
+ struct cpupart *cp_next; /* next partition in list */
+ struct cpupart *cp_prev; /* previous partition in list */
+ struct cpu *cp_cpulist; /* processor list */
+ struct kstat *cp_kstat; /* per-partition statistics */
+
+ /*
+ * cp_nrunnable and cp_nrunning are used to calculate load average.
+ */
+ uint_t cp_nrunnable; /* current # of runnable threads */
+ uint_t cp_nrunning; /* current # of running threads */
+
+ /*
+ * cp_updates, cp_nrunnable_cum, cp_nwaiting_cum, and cp_hp_avenrun
+ * are used to generate kstat information on an as-needed basis.
+ */
+ uint64_t cp_updates; /* number of statistics updates */
+ uint64_t cp_nrunnable_cum; /* cum. # of runnable threads */
+ uint64_t cp_nwaiting_cum; /* cum. # of waiting threads */
+
+ struct loadavg_s cp_loadavg; /* cpupart loadavg */
+
+ klgrpset_t cp_lgrpset; /* set of lgroups on which this */
+ /* partition has cpus */
+ lpl_t *cp_lgrploads; /* table of load averages for this */
+ /* partition, indexed by lgrp ID */
+ int cp_nlgrploads; /* size of cp_lgrploads table */
+ uint64_t cp_hp_avenrun[3]; /* high-precision load average */
+ uint_t cp_attr; /* bitmask of attributes */
+ lgrp_gen_t cp_gen; /* generation number */
+ lgrp_id_t cp_lgrp_hint; /* last home lgroup chosen */
+ bitset_t cp_cmt_pgs; /* CMT PGs represented */
+ bitset_t cp_haltset; /* halted CPUs */
+} cpupart_t;
+
+typedef struct cpupart_kstat {
+ kstat_named_t cpk_updates; /* number of updates */
+ kstat_named_t cpk_runnable; /* cum # of runnable threads */
+ kstat_named_t cpk_waiting; /* cum # waiting for I/O */
+ kstat_named_t cpk_ncpus; /* current # of CPUs */
+ kstat_named_t cpk_avenrun_1min; /* 1-minute load average */
+ kstat_named_t cpk_avenrun_5min; /* 5-minute load average */
+ kstat_named_t cpk_avenrun_15min; /* 15-minute load average */
+} cpupart_kstat_t;
+
+/*
+ * Macro to obtain the maximum run priority for the global queue associated
+ * with given cpu partition.
+ */
+#define CP_MAXRUNPRI(cp) ((cp)->cp_kp_queue.disp_maxrunpri)
+
+/*
+ * This macro is used to determine if the given thread must surrender
+ * CPU to higher priority runnable threads on one of its dispatch queues.
+ * This should really be defined in <sys/disp.h> but it is not because
+ * including <sys/cpupart.h> there would cause recursive includes.
+ */
+#define DISP_MUST_SURRENDER(t) \
+ ((DISP_MAXRUNPRI(t) > DISP_PRIO(t)) || \
+ (CP_MAXRUNPRI(t->t_cpupart) > DISP_PRIO(t)))
+
+extern cpupart_t cp_default;
+extern cpupart_t *cp_list_head;
+extern uint_t cp_numparts;
+extern uint_t cp_numparts_nonempty;
+
+/*
+ * Each partition contains a bitset that indicates which CPUs are halted and
+ * which ones are running. Given the growing number of CPUs in current and
+ * future platforms, it's important to fanout each CPU within its partition's
+ * haltset to prevent contention due to false sharing. The fanout factor
+ * is platform specific, and declared accordingly.
+ */
+extern uint_t cp_haltset_fanout;
+
+extern void cpupart_initialize_default();
+extern cpupart_t *cpupart_find(psetid_t);
+extern int cpupart_create(psetid_t *);
+extern int cpupart_destroy(psetid_t);
+extern psetid_t cpupart_query_cpu(cpu_t *);
+extern int cpupart_attach_cpu(psetid_t, cpu_t *, int);
+extern int cpupart_get_cpus(psetid_t *, processorid_t *, uint_t *);
+extern int cpupart_bind_thread(kthread_id_t, psetid_t, int, void *,
+ void *);
+extern void cpupart_kpqalloc(pri_t);
+extern int cpupart_get_loadavg(psetid_t, int *, int);
+extern uint_t cpupart_list(psetid_t *, uint_t, int);
+extern int cpupart_setattr(psetid_t, uint_t);
+extern int cpupart_getattr(psetid_t, uint_t *);
+
+#endif /* _KERNEL || _FAKE_KERNEL */
+
+#ifdef __cplusplus
+}
+#endif
+
+#endif /* _SYS_CPUPART_H */
diff --git a/sys/cddl/contrib/opensolaris/uts/common/sys/cpuvar.h b/sys/cddl/contrib/opensolaris/uts/common/sys/cpuvar.h
new file mode 100644
index 000000000000..f526c85872e7
--- /dev/null
+++ b/sys/cddl/contrib/opensolaris/uts/common/sys/cpuvar.h
@@ -0,0 +1,830 @@
+/*
+ * CDDL HEADER START
+ *
+ * The contents of this file are subject to the terms of the
+ * Common Development and Distribution License (the "License").
+ * You may not use this file except in compliance with the License.
+ *
+ * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
+ * or http://www.opensolaris.org/os/licensing.
+ * See the License for the specific language governing permissions
+ * and limitations under the License.
+ *
+ * When distributing Covered Code, include this CDDL HEADER in each
+ * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
+ * If applicable, add the following below this CDDL HEADER, with the
+ * fields enclosed by brackets "[]" replaced with your own identifying
+ * information: Portions Copyright [yyyy] [name of copyright owner]
+ *
+ * CDDL HEADER END
+ */
+
+/*
+ * Copyright (c) 1989, 2010, Oracle and/or its affiliates. All rights reserved.
+ * Copyright 2014 Igor Kozhukhov <ikozhukhov@gmail.com>.
+ * Copyright 2017 RackTop Systems.
+ */
+
+#ifndef _SYS_CPUVAR_H
+#define _SYS_CPUVAR_H
+
+#include <sys/thread.h>
+#include <sys/sysinfo.h> /* has cpu_stat_t definition */
+#include <sys/disp.h>
+#include <sys/processor.h>
+#include <sys/kcpc.h> /* has kcpc_ctx_t definition */
+
+#include <sys/loadavg.h>
+#if (defined(_KERNEL) || defined(_KMEMUSER)) && defined(_MACHDEP)
+#include <sys/machcpuvar.h>
+#endif
+
+#include <sys/types.h>
+#include <sys/file.h>
+#include <sys/bitmap.h>
+#include <sys/rwlock.h>
+#include <sys/msacct.h>
+#if defined(__GNUC__) && defined(_ASM_INLINES) && defined(_KERNEL) && \
+ (defined(__i386) || defined(__amd64))
+#include <asm/cpuvar.h>
+#endif
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+struct squeue_set_s;
+
+#define CPU_CACHE_COHERENCE_SIZE 64
+
+/*
+ * For fast event tracing.
+ */
+struct ftrace_record;
+typedef struct ftrace_data {
+ int ftd_state; /* ftrace flags */
+ kmutex_t ftd_unused; /* ftrace buffer lock, unused */
+ struct ftrace_record *ftd_cur; /* current record */
+ struct ftrace_record *ftd_first; /* first record */
+ struct ftrace_record *ftd_last; /* last record */
+} ftrace_data_t;
+
+struct cyc_cpu;
+struct nvlist;
+
+/*
+ * Per-CPU data.
+ *
+ * Be careful adding new members: if they are not the same in all modules (e.g.
+ * change size depending on a #define), CTF uniquification can fail to work
+ * properly. Furthermore, this is transitive in that it applies recursively to
+ * all types pointed to by cpu_t.
+ */
+typedef struct cpu {
+ processorid_t cpu_id; /* CPU number */
+ processorid_t cpu_seqid; /* sequential CPU id (0..ncpus-1) */
+ volatile cpu_flag_t cpu_flags; /* flags indicating CPU state */
+ struct cpu *cpu_self; /* pointer to itself */
+ kthread_t *cpu_thread; /* current thread */
+ kthread_t *cpu_idle_thread; /* idle thread for this CPU */
+ kthread_t *cpu_pause_thread; /* pause thread for this CPU */
+ klwp_id_t cpu_lwp; /* current lwp (if any) */
+ klwp_id_t cpu_fpowner; /* currently loaded fpu owner */
+ struct cpupart *cpu_part; /* partition with this CPU */
+ struct lgrp_ld *cpu_lpl; /* pointer to this cpu's load */
+ int cpu_cache_offset; /* see kmem.c for details */
+
+ /*
+ * Links to other CPUs. It is safe to walk these lists if
+ * one of the following is true:
+ * - cpu_lock held
+ * - preemption disabled via kpreempt_disable
+ * - PIL >= DISP_LEVEL
+ * - acting thread is an interrupt thread
+ * - all other CPUs are paused
+ */
+ struct cpu *cpu_next; /* next existing CPU */
+ struct cpu *cpu_prev; /* prev existing CPU */
+ struct cpu *cpu_next_onln; /* next online (enabled) CPU */
+ struct cpu *cpu_prev_onln; /* prev online (enabled) CPU */
+ struct cpu *cpu_next_part; /* next CPU in partition */
+ struct cpu *cpu_prev_part; /* prev CPU in partition */
+ struct cpu *cpu_next_lgrp; /* next CPU in latency group */
+ struct cpu *cpu_prev_lgrp; /* prev CPU in latency group */
+ struct cpu *cpu_next_lpl; /* next CPU in lgrp partition */
+ struct cpu *cpu_prev_lpl;
+
+ struct cpu_pg *cpu_pg; /* cpu's processor groups */
+
+ void *cpu_reserved[4]; /* reserved for future use */
+
+ /*
+ * Scheduling variables.
+ */
+ disp_t *cpu_disp; /* dispatch queue data */
+ /*
+ * Note that cpu_disp is set before the CPU is added to the system
+ * and is never modified. Hence, no additional locking is needed
+ * beyond what's necessary to access the cpu_t structure.
+ */
+ char cpu_runrun; /* scheduling flag - set to preempt */
+ char cpu_kprunrun; /* force kernel preemption */
+ pri_t cpu_chosen_level; /* priority at which cpu */
+ /* was chosen for scheduling */
+ kthread_t *cpu_dispthread; /* thread selected for dispatch */
+ disp_lock_t cpu_thread_lock; /* dispatcher lock on current thread */
+ uint8_t cpu_disp_flags; /* flags used by dispatcher */
+ /*
+ * The following field is updated whenever the cpu_dispthread
+ * changes, and also wherever the current thread's (cpu_dispthread)
+ * priority changes. This is used in disp_lowpri_cpu().
+ */
+ pri_t cpu_dispatch_pri; /* priority of cpu_dispthread */
+ clock_t cpu_last_swtch; /* last time switched to new thread */
+
+ /*
+ * Interrupt data.
+ */
+ caddr_t cpu_intr_stack; /* interrupt stack */
+ kthread_t *cpu_intr_thread; /* interrupt thread list */
+ uint_t cpu_intr_actv; /* interrupt levels active (bitmask) */
+ int cpu_base_spl; /* priority for highest interrupt active */
+
+ /*
+ * Statistics.
+ */
+ cpu_stats_t cpu_stats; /* per-CPU statistics */
+ struct kstat *cpu_info_kstat; /* kstat for cpu info */
+
+ uintptr_t cpu_profile_pc; /* kernel PC in profile interrupt */
+ uintptr_t cpu_profile_upc; /* user PC in profile interrupt */
+ uintptr_t cpu_profile_pil; /* PIL when profile interrupted */
+
+ ftrace_data_t cpu_ftrace; /* per cpu ftrace data */
+
+ clock_t cpu_deadman_counter; /* used by deadman() */
+ uint_t cpu_deadman_countdown; /* used by deadman() */
+
+ kmutex_t cpu_cpc_ctxlock; /* protects context for idle thread */
+ kcpc_ctx_t *cpu_cpc_ctx; /* performance counter context */
+
+ /*
+ * Configuration information for the processor_info system call.
+ */
+ processor_info_t cpu_type_info; /* config info */
+ time_t cpu_state_begin; /* when CPU entered current state */
+ char cpu_cpr_flags; /* CPR related info */
+ struct cyc_cpu *cpu_cyclic; /* per cpu cyclic subsystem data */
+ struct squeue_set_s *cpu_squeue_set; /* per cpu squeue set */
+ struct nvlist *cpu_props; /* pool-related properties */
+
+ krwlock_t cpu_ft_lock; /* DTrace: fasttrap lock */
+ uintptr_t cpu_dtrace_caller; /* DTrace: caller, if any */
+ hrtime_t cpu_dtrace_chillmark; /* DTrace: chill mark time */
+ hrtime_t cpu_dtrace_chilled; /* DTrace: total chill time */
+ volatile uint16_t cpu_mstate; /* cpu microstate */
+ volatile uint16_t cpu_mstate_gen; /* generation counter */
+ volatile hrtime_t cpu_mstate_start; /* cpu microstate start time */
+ volatile hrtime_t cpu_acct[NCMSTATES]; /* cpu microstate data */
+ hrtime_t cpu_intracct[NCMSTATES]; /* interrupt mstate data */
+ hrtime_t cpu_waitrq; /* cpu run-queue wait time */
+ struct loadavg_s cpu_loadavg; /* loadavg info for this cpu */
+
+ char *cpu_idstr; /* for printing and debugging */
+ char *cpu_brandstr; /* for printing */
+
+ /*
+ * Sum of all device interrupt weights that are currently directed at
+ * this cpu. Cleared at start of interrupt redistribution.
+ */
+ int32_t cpu_intr_weight;
+ void *cpu_vm_data;
+
+ struct cpu_physid *cpu_physid; /* physical associations */
+
+ uint64_t cpu_curr_clock; /* current clock freq in Hz */
+ char *cpu_supp_freqs; /* supported freqs in Hz */
+
+ uintptr_t cpu_cpcprofile_pc; /* kernel PC in cpc interrupt */
+ uintptr_t cpu_cpcprofile_upc; /* user PC in cpc interrupt */
+
+ /*
+ * Interrupt load factor used by dispatcher & softcall
+ */
+ hrtime_t cpu_intrlast; /* total interrupt time (nsec) */
+ int cpu_intrload; /* interrupt load factor (0-99%) */
+
+ uint_t cpu_rotor; /* for cheap pseudo-random numbers */
+
+ struct cu_cpu_info *cpu_cu_info; /* capacity & util. info */
+
+ /*
+ * cpu_generation is updated whenever CPU goes on-line or off-line.
+ * Updates to cpu_generation are protected by cpu_lock.
+ *
+ * See CPU_NEW_GENERATION() macro below.
+ */
+ volatile uint_t cpu_generation; /* tracking on/off-line */
+
+ /*
+ * New members must be added /before/ this member, as the CTF tools
+ * rely on this being the last field before cpu_m, so they can
+ * correctly calculate the offset when synthetically adding the cpu_m
+ * member in objects that do not have it. This fixup is required for
+ * uniquification to work correctly.
+ */
+ uintptr_t cpu_m_pad;
+
+#if (defined(_KERNEL) || defined(_KMEMUSER)) && defined(_MACHDEP)
+ struct machcpu cpu_m; /* per architecture info */
+#endif
+} cpu_t;
+
+/*
+ * The cpu_core structure consists of per-CPU state available in any context.
+ * On some architectures, this may mean that the page(s) containing the
+ * NCPU-sized array of cpu_core structures must be locked in the TLB -- it
+ * is up to the platform to assure that this is performed properly. Note that
+ * the structure is sized to avoid false sharing.
+ */
+#define CPUC_SIZE (sizeof (uint16_t) + sizeof (uint8_t) + \
+ sizeof (uintptr_t) + sizeof (kmutex_t))
+#define CPUC_PADSIZE CPU_CACHE_COHERENCE_SIZE - CPUC_SIZE
+
+typedef struct cpu_core {
+ uint16_t cpuc_dtrace_flags; /* DTrace flags */
+ uint8_t cpuc_dcpc_intr_state; /* DCPC provider intr state */
+ uint8_t cpuc_pad[CPUC_PADSIZE]; /* padding */
+ uintptr_t cpuc_dtrace_illval; /* DTrace illegal value */
+ kmutex_t cpuc_pid_lock; /* DTrace pid provider lock */
+} cpu_core_t;
+
+#ifdef _KERNEL
+extern cpu_core_t cpu_core[];
+#endif /* _KERNEL */
+
+/*
+ * CPU_ON_INTR() macro. Returns non-zero if currently on interrupt stack.
+ * Note that this isn't a test for a high PIL. For example, cpu_intr_actv
+ * does not get updated when we go through sys_trap from TL>0 at high PIL.
+ * getpil() should be used instead to check for PIL levels.
+ */
+#define CPU_ON_INTR(cpup) ((cpup)->cpu_intr_actv >> (LOCK_LEVEL + 1))
+
+/*
+ * Check to see if an interrupt thread might be active at a given ipl.
+ * If so return true.
+ * We must be conservative--it is ok to give a false yes, but a false no
+ * will cause disaster. (But if the situation changes after we check it is
+ * ok--the caller is trying to ensure that an interrupt routine has been
+ * exited).
+ * This is used when trying to remove an interrupt handler from an autovector
+ * list in avintr.c.
+ */
+#define INTR_ACTIVE(cpup, level) \
+ ((level) <= LOCK_LEVEL ? \
+ ((cpup)->cpu_intr_actv & (1 << (level))) : (CPU_ON_INTR(cpup)))
+
+/*
+ * CPU_PSEUDO_RANDOM() returns a per CPU value that changes each time one
+ * looks at it. It's meant as a cheap mechanism to be incorporated in routines
+ * wanting to avoid biasing, but where true randomness isn't needed (just
+ * something that changes).
+ */
+#define CPU_PSEUDO_RANDOM() (CPU->cpu_rotor++)
+
+#if defined(_KERNEL) || defined(_KMEMUSER)
+
+#define INTR_STACK_SIZE MAX(DEFAULTSTKSZ, PAGESIZE)
+
+/* MEMBERS PROTECTED BY "atomicity": cpu_flags */
+
+/*
+ * Flags in the CPU structure.
+ *
+ * These are protected by cpu_lock (except during creation).
+ *
+ * Offlined CPUs have three stages of being offline:
+ *
+ * CPU_ENABLE indicates that the CPU is participating in I/O interrupts
+ * that can be directed at a number of different CPUs. If CPU_ENABLE
+ * is off, the CPU will not be given interrupts that can be sent elsewhere,
+ * but will still get interrupts from devices associated with that CPU only,
+ * and from other CPUs.
+ *
+ * CPU_OFFLINE indicates that the dispatcher should not allow any threads
+ * other than interrupt threads to run on that CPU. A CPU will not have
+ * CPU_OFFLINE set if there are any bound threads (besides interrupts).
+ *
+ * CPU_QUIESCED is set if p_offline was able to completely turn idle the
+ * CPU and it will not have to run interrupt threads. In this case it'll
+ * stay in the idle loop until CPU_QUIESCED is turned off.
+ *
+ * CPU_FROZEN is used only by CPR to mark CPUs that have been successfully
+ * suspended (in the suspend path), or have yet to be resumed (in the resume
+ * case).
+ *
+ * On some platforms CPUs can be individually powered off.
+ * The following flags are set for powered off CPUs: CPU_QUIESCED,
+ * CPU_OFFLINE, and CPU_POWEROFF. The following flags are cleared:
+ * CPU_RUNNING, CPU_READY, CPU_EXISTS, and CPU_ENABLE.
+ */
+#define CPU_RUNNING 0x001 /* CPU running */
+#define CPU_READY 0x002 /* CPU ready for cross-calls */
+#define CPU_QUIESCED 0x004 /* CPU will stay in idle */
+#define CPU_EXISTS 0x008 /* CPU is configured */
+#define CPU_ENABLE 0x010 /* CPU enabled for interrupts */
+#define CPU_OFFLINE 0x020 /* CPU offline via p_online */
+#define CPU_POWEROFF 0x040 /* CPU is powered off */
+#define CPU_FROZEN 0x080 /* CPU is frozen via CPR suspend */
+#define CPU_SPARE 0x100 /* CPU offline available for use */
+#define CPU_FAULTED 0x200 /* CPU offline diagnosed faulty */
+
+#define FMT_CPU_FLAGS \
+ "\20\12fault\11spare\10frozen" \
+ "\7poweroff\6offline\5enable\4exist\3quiesced\2ready\1run"
+
+#define CPU_ACTIVE(cpu) (((cpu)->cpu_flags & CPU_OFFLINE) == 0)
+
+/*
+ * Flags for cpu_offline(), cpu_faulted(), and cpu_spare().
+ */
+#define CPU_FORCED 0x0001 /* Force CPU offline */
+
+/*
+ * DTrace flags.
+ */
+#define CPU_DTRACE_NOFAULT 0x0001 /* Don't fault */
+#define CPU_DTRACE_DROP 0x0002 /* Drop this ECB */
+#define CPU_DTRACE_BADADDR 0x0004 /* DTrace fault: bad address */
+#define CPU_DTRACE_BADALIGN 0x0008 /* DTrace fault: bad alignment */
+#define CPU_DTRACE_DIVZERO 0x0010 /* DTrace fault: divide by zero */
+#define CPU_DTRACE_ILLOP 0x0020 /* DTrace fault: illegal operation */
+#define CPU_DTRACE_NOSCRATCH 0x0040 /* DTrace fault: out of scratch */
+#define CPU_DTRACE_KPRIV 0x0080 /* DTrace fault: bad kernel access */
+#define CPU_DTRACE_UPRIV 0x0100 /* DTrace fault: bad user access */
+#define CPU_DTRACE_TUPOFLOW 0x0200 /* DTrace fault: tuple stack overflow */
+#if defined(__sparc)
+#define CPU_DTRACE_FAKERESTORE 0x0400 /* pid provider hint to getreg */
+#endif
+#define CPU_DTRACE_ENTRY 0x0800 /* pid provider hint to ustack() */
+#define CPU_DTRACE_BADSTACK 0x1000 /* DTrace fault: bad stack */
+
+#define CPU_DTRACE_FAULT (CPU_DTRACE_BADADDR | CPU_DTRACE_BADALIGN | \
+ CPU_DTRACE_DIVZERO | CPU_DTRACE_ILLOP | \
+ CPU_DTRACE_NOSCRATCH | CPU_DTRACE_KPRIV | \
+ CPU_DTRACE_UPRIV | CPU_DTRACE_TUPOFLOW | \
+ CPU_DTRACE_BADSTACK)
+#define CPU_DTRACE_ERROR (CPU_DTRACE_FAULT | CPU_DTRACE_DROP)
+
+/*
+ * Dispatcher flags
+ * These flags must be changed only by the current CPU.
+ */
+#define CPU_DISP_DONTSTEAL 0x01 /* CPU undergoing context swtch */
+#define CPU_DISP_HALTED 0x02 /* CPU halted waiting for interrupt */
+
+#endif /* _KERNEL || _KMEMUSER */
+
+#if (defined(_KERNEL) || defined(_KMEMUSER)) && defined(_MACHDEP)
+
+/*
+ * Macros for manipulating sets of CPUs as a bitmap. Note that this
+ * bitmap may vary in size depending on the maximum CPU id a specific
+ * platform supports. This may be different than the number of CPUs
+ * the platform supports, since CPU ids can be sparse. We define two
+ * sets of macros; one for platforms where the maximum CPU id is less
+ * than the number of bits in a single word (32 in a 32-bit kernel,
+ * 64 in a 64-bit kernel), and one for platforms that require bitmaps
+ * of more than one word.
+ */
+
+#define CPUSET_WORDS BT_BITOUL(NCPU)
+#define CPUSET_NOTINSET ((uint_t)-1)
+
+#if CPUSET_WORDS > 1
+
+typedef struct cpuset {
+ ulong_t cpub[CPUSET_WORDS];
+} cpuset_t;
+
+/*
+ * Private functions for manipulating cpusets that do not fit in a
+ * single word. These should not be used directly; instead the
+ * CPUSET_* macros should be used so the code will be portable
+ * across different definitions of NCPU.
+ */
+extern void cpuset_all(cpuset_t *);
+extern void cpuset_all_but(cpuset_t *, uint_t);
+extern int cpuset_isnull(cpuset_t *);
+extern int cpuset_cmp(cpuset_t *, cpuset_t *);
+extern void cpuset_only(cpuset_t *, uint_t);
+extern uint_t cpuset_find(cpuset_t *);
+extern void cpuset_bounds(cpuset_t *, uint_t *, uint_t *);
+
+#define CPUSET_ALL(set) cpuset_all(&(set))
+#define CPUSET_ALL_BUT(set, cpu) cpuset_all_but(&(set), cpu)
+#define CPUSET_ONLY(set, cpu) cpuset_only(&(set), cpu)
+#define CPU_IN_SET(set, cpu) BT_TEST((set).cpub, cpu)
+#define CPUSET_ADD(set, cpu) BT_SET((set).cpub, cpu)
+#define CPUSET_DEL(set, cpu) BT_CLEAR((set).cpub, cpu)
+#define CPUSET_ISNULL(set) cpuset_isnull(&(set))
+#define CPUSET_ISEQUAL(set1, set2) cpuset_cmp(&(set1), &(set2))
+
+/*
+ * Find one CPU in the cpuset.
+ * Sets "cpu" to the id of the found CPU, or CPUSET_NOTINSET if no cpu
+ * could be found. (i.e. empty set)
+ */
+#define CPUSET_FIND(set, cpu) { \
+ cpu = cpuset_find(&(set)); \
+}
+
+/*
+ * Determine the smallest and largest CPU id in the set. Returns
+ * CPUSET_NOTINSET in smallest and largest when set is empty.
+ */
+#define CPUSET_BOUNDS(set, smallest, largest) { \
+ cpuset_bounds(&(set), &(smallest), &(largest)); \
+}
+
+/*
+ * Atomic cpuset operations
+ * These are safe to use for concurrent cpuset manipulations.
+ * "xdel" and "xadd" are exclusive operations, that set "result" to "0"
+ * if the add or del was successful, or "-1" if not successful.
+ * (e.g. attempting to add a cpu to a cpuset that's already there, or
+ * deleting a cpu that's not in the cpuset)
+ */
+
+#define CPUSET_ATOMIC_DEL(set, cpu) BT_ATOMIC_CLEAR((set).cpub, (cpu))
+#define CPUSET_ATOMIC_ADD(set, cpu) BT_ATOMIC_SET((set).cpub, (cpu))
+
+#define CPUSET_ATOMIC_XADD(set, cpu, result) \
+ BT_ATOMIC_SET_EXCL((set).cpub, cpu, result)
+
+#define CPUSET_ATOMIC_XDEL(set, cpu, result) \
+ BT_ATOMIC_CLEAR_EXCL((set).cpub, cpu, result)
+
+
+#define CPUSET_OR(set1, set2) { \
+ int _i; \
+ for (_i = 0; _i < CPUSET_WORDS; _i++) \
+ (set1).cpub[_i] |= (set2).cpub[_i]; \
+}
+
+#define CPUSET_XOR(set1, set2) { \
+ int _i; \
+ for (_i = 0; _i < CPUSET_WORDS; _i++) \
+ (set1).cpub[_i] ^= (set2).cpub[_i]; \
+}
+
+#define CPUSET_AND(set1, set2) { \
+ int _i; \
+ for (_i = 0; _i < CPUSET_WORDS; _i++) \
+ (set1).cpub[_i] &= (set2).cpub[_i]; \
+}
+
+#define CPUSET_ZERO(set) { \
+ int _i; \
+ for (_i = 0; _i < CPUSET_WORDS; _i++) \
+ (set).cpub[_i] = 0; \
+}
+
+#elif CPUSET_WORDS == 1
+
+typedef ulong_t cpuset_t; /* a set of CPUs */
+
+#define CPUSET(cpu) (1UL << (cpu))
+
+#define CPUSET_ALL(set) ((void)((set) = ~0UL))
+#define CPUSET_ALL_BUT(set, cpu) ((void)((set) = ~CPUSET(cpu)))
+#define CPUSET_ONLY(set, cpu) ((void)((set) = CPUSET(cpu)))
+#define CPU_IN_SET(set, cpu) ((set) & CPUSET(cpu))
+#define CPUSET_ADD(set, cpu) ((void)((set) |= CPUSET(cpu)))
+#define CPUSET_DEL(set, cpu) ((void)((set) &= ~CPUSET(cpu)))
+#define CPUSET_ISNULL(set) ((set) == 0)
+#define CPUSET_ISEQUAL(set1, set2) ((set1) == (set2))
+#define CPUSET_OR(set1, set2) ((void)((set1) |= (set2)))
+#define CPUSET_XOR(set1, set2) ((void)((set1) ^= (set2)))
+#define CPUSET_AND(set1, set2) ((void)((set1) &= (set2)))
+#define CPUSET_ZERO(set) ((void)((set) = 0))
+
+#define CPUSET_FIND(set, cpu) { \
+ cpu = (uint_t)(lowbit(set) - 1); \
+}
+
+#define CPUSET_BOUNDS(set, smallest, largest) { \
+ smallest = (uint_t)(lowbit(set) - 1); \
+ largest = (uint_t)(highbit(set) - 1); \
+}
+
+#define CPUSET_ATOMIC_DEL(set, cpu) atomic_and_ulong(&(set), ~CPUSET(cpu))
+#define CPUSET_ATOMIC_ADD(set, cpu) atomic_or_ulong(&(set), CPUSET(cpu))
+
+#define CPUSET_ATOMIC_XADD(set, cpu, result) \
+ { result = atomic_set_long_excl(&(set), (cpu)); }
+
+#define CPUSET_ATOMIC_XDEL(set, cpu, result) \
+ { result = atomic_clear_long_excl(&(set), (cpu)); }
+
+#else /* CPUSET_WORDS <= 0 */
+
+#error NCPU is undefined or invalid
+
+#endif /* CPUSET_WORDS */
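+
+/*
+ * EXAMPLE (sketch): building a set and testing membership works the
+ * same way for both representations:
+ *
+ *	cpuset_t set;
+ *	uint_t cpu;
+ *
+ *	CPUSET_ZERO(set);
+ *	CPUSET_ADD(set, 3);
+ *	if (CPU_IN_SET(set, 3))
+ *		CPUSET_FIND(set, cpu);	(sets cpu to 3)
+ */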
+
+extern cpuset_t cpu_seqid_inuse;
+
+#endif /* (_KERNEL || _KMEMUSER) && _MACHDEP */
+
+#define CPU_CPR_OFFLINE 0x0
+#define CPU_CPR_ONLINE 0x1
+#define CPU_CPR_IS_OFFLINE(cpu) (((cpu)->cpu_cpr_flags & CPU_CPR_ONLINE) == 0)
+#define CPU_CPR_IS_ONLINE(cpu) ((cpu)->cpu_cpr_flags & CPU_CPR_ONLINE)
+#define CPU_SET_CPR_FLAGS(cpu, flag) ((cpu)->cpu_cpr_flags |= flag)
+
+#if defined(_KERNEL) || defined(_KMEMUSER)
+
+extern struct cpu *cpu[]; /* indexed by CPU number */
+extern struct cpu **cpu_seq; /* indexed by sequential CPU id */
+extern cpu_t *cpu_list; /* list of CPUs */
+extern cpu_t *cpu_active; /* list of active CPUs */
+extern int ncpus; /* number of CPUs present */
+extern int ncpus_online; /* number of CPUs not quiesced */
+extern int max_ncpus; /* max present before ncpus is known */
+extern int boot_max_ncpus; /* like max_ncpus but for real */
+extern int boot_ncpus; /* # cpus present @ boot */
+extern processorid_t max_cpuid; /* maximum CPU number */
+extern struct cpu *cpu_inmotion; /* offline or partition move target */
+extern cpu_t *clock_cpu_list;
+extern processorid_t max_cpu_seqid_ever; /* maximum seqid ever given */
+
+#if defined(__i386) || defined(__amd64)
+extern struct cpu *curcpup(void);
+#define CPU (curcpup()) /* Pointer to current CPU */
+#else
+#define CPU (curthread->t_cpu) /* Pointer to current CPU */
+#endif
+
+/*
+ * CPU_CURRENT indicates to thread_affinity_set to use CPU->cpu_id
+ * as the target and to grab cpu_lock instead of requiring the caller
+ * to grab it.
+ */
+#define CPU_CURRENT -3
+
+/*
+ * Per-CPU statistics
+ *
+ * cpu_stats_t contains numerous system and VM-related statistics, in the form
+ * of gauges or monotonically-increasing event occurrence counts.
+ */
+
+#define CPU_STATS_ENTER_K() kpreempt_disable()
+#define CPU_STATS_EXIT_K() kpreempt_enable()
+
+#define CPU_STATS_ADD_K(class, stat, amount) \
+ { kpreempt_disable(); /* keep from switching CPUs */\
+ CPU_STATS_ADDQ(CPU, class, stat, amount); \
+ kpreempt_enable(); \
+ }
+
+#define CPU_STATS_ADDQ(cp, class, stat, amount) { \
+ extern void __dtrace_probe___cpu_##class##info_##stat(uint_t, \
+ uint64_t *, cpu_t *); \
+ uint64_t *stataddr = &((cp)->cpu_stats.class.stat); \
+ __dtrace_probe___cpu_##class##info_##stat((amount), \
+ stataddr, cp); \
+ *(stataddr) += (amount); \
+}
+
+#define CPU_STATS(cp, stat) \
+ ((cp)->cpu_stats.stat)
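+
+/*
+ * Illustrative sketch (not part of the interface above): a caller that is not
+ * already preemption-safe bumps a counter through CPU_STATS_ADD_K(), which
+ * pins the thread to the current CPU for the duration of the update. The
+ * "sys" class and "xcalls" statistic below are examples only.
+ */
+#if 0
+static void
+example_count_xcall(void)
+{
+	CPU_STATS_ADD_K(sys, xcalls, 1);	/* one more cross-call sent */
+}
+#endif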
+
+/*
+ * Increment CPU generation value.
+ * This macro should be called whenever a CPU goes on-line or off-line.
+ * Updates to cpu_generation should be protected by cpu_lock.
+ */
+#define CPU_NEW_GENERATION(cp) ((cp)->cpu_generation++)
+
+#endif /* _KERNEL || _KMEMUSER */
+
+/*
+ * CPU support routines (not for genassym.c)
+ */
+#if (defined(_KERNEL) || defined(_FAKE_KERNEL)) && defined(__STDC__)
+
+struct zone;
+
+void cpu_list_init(cpu_t *);
+void cpu_add_unit(cpu_t *);
+void cpu_del_unit(int cpuid);
+void cpu_add_active(cpu_t *);
+void cpu_kstat_init(cpu_t *);
+void cpu_visibility_add(cpu_t *, struct zone *);
+void cpu_visibility_remove(cpu_t *, struct zone *);
+void cpu_visibility_configure(cpu_t *, struct zone *);
+void cpu_visibility_unconfigure(cpu_t *, struct zone *);
+void cpu_visibility_online(cpu_t *, struct zone *);
+void cpu_visibility_offline(cpu_t *, struct zone *);
+void cpu_create_intrstat(cpu_t *);
+void cpu_delete_intrstat(cpu_t *);
+int cpu_kstat_intrstat_update(kstat_t *, int);
+void cpu_intr_swtch_enter(kthread_t *);
+void cpu_intr_swtch_exit(kthread_t *);
+
+void mbox_lock_init(void); /* initialize cross-call locks */
+void mbox_init(int cpun); /* initialize cross-calls */
+void poke_cpu(int cpun); /* interrupt another CPU (to preempt) */
+
+/*
+ * Values for safe_list: the pause state that CPUs are in.
+ */
+#define PAUSE_IDLE 0 /* normal state */
+#define PAUSE_READY 1 /* paused thread ready to spl */
+#define PAUSE_WAIT 2 /* paused thread is spl-ed high */
+#define PAUSE_DIE 3 /* tell pause thread to leave */
+#define PAUSE_DEAD 4 /* pause thread has left */
+
+void mach_cpu_pause(volatile char *);
+
+void pause_cpus(cpu_t *off_cp, void *(*func)(void *));
+void start_cpus(void);
+int cpus_paused(void);
+
+void cpu_pause_init(void);
+cpu_t *cpu_get(processorid_t cpun); /* get the CPU struct associated */
+
+int cpu_online(cpu_t *cp); /* take cpu online */
+int cpu_offline(cpu_t *cp, int flags); /* take cpu offline */
+int cpu_spare(cpu_t *cp, int flags); /* take cpu to spare */
+int cpu_faulted(cpu_t *cp, int flags); /* take cpu to faulted */
+int cpu_poweron(cpu_t *cp); /* take powered-off cpu to offline */
+int cpu_poweroff(cpu_t *cp); /* take offline cpu to powered-off */
+
+cpu_t *cpu_intr_next(cpu_t *cp); /* get next online CPU taking intrs */
+int cpu_intr_count(cpu_t *cp); /* count # of CPUs handling intrs */
+int cpu_intr_on(cpu_t *cp); /* CPU taking I/O interrupts? */
+void cpu_intr_enable(cpu_t *cp); /* enable I/O interrupts */
+int cpu_intr_disable(cpu_t *cp); /* disable I/O interrupts */
+void cpu_intr_alloc(cpu_t *cp, int n); /* allocate interrupt threads */
+
+/*
+ * Routines for checking CPU states.
+ */
+int cpu_is_online(cpu_t *); /* check if CPU is online */
+int cpu_is_nointr(cpu_t *); /* check if CPU can service intrs */
+int cpu_is_active(cpu_t *); /* check if CPU can run threads */
+int cpu_is_offline(cpu_t *); /* check if CPU is offline */
+int cpu_is_poweredoff(cpu_t *); /* check if CPU is powered off */
+
+int cpu_flagged_online(cpu_flag_t); /* flags show CPU is online */
+int cpu_flagged_nointr(cpu_flag_t); /* flags show CPU not handling intrs */
+int cpu_flagged_active(cpu_flag_t); /* flags show CPU scheduling threads */
+int cpu_flagged_offline(cpu_flag_t); /* flags show CPU is offline */
+int cpu_flagged_poweredoff(cpu_flag_t); /* flags show CPU is powered off */
+
+/*
+ * The processor_info(2) state of a CPU is a simplified representation suitable
+ * for use by an application program. Kernel subsystems should utilize the
+ * internal per-CPU state as given by the cpu_flags member of the cpu structure,
+ * as this information may include platform- or architecture-specific state
+ * critical to a subsystem's disposition of a particular CPU.
+ */
+void cpu_set_state(cpu_t *); /* record/timestamp current state */
+int cpu_get_state(cpu_t *); /* get current cpu state */
+const char *cpu_get_state_str(cpu_t *); /* get current cpu state as string */
+
+
+void cpu_set_curr_clock(uint64_t); /* indicate the current CPU's freq */
+void cpu_set_supp_freqs(cpu_t *, const char *); /* set the CPU supported */
+ /* frequencies */
+
+int cpu_configure(int);
+int cpu_unconfigure(int);
+void cpu_destroy_bound_threads(cpu_t *cp);
+
+extern int cpu_bind_thread(kthread_t *tp, processorid_t bind,
+ processorid_t *obind, int *error);
+extern int cpu_unbind(processorid_t cpu_id, boolean_t force);
+extern void thread_affinity_set(kthread_t *t, int cpu_id);
+extern void thread_affinity_clear(kthread_t *t);
+extern void affinity_set(int cpu_id);
+extern void affinity_clear(void);
+extern void init_cpu_mstate(struct cpu *, int);
+extern void term_cpu_mstate(struct cpu *);
+extern void new_cpu_mstate(int, hrtime_t);
+extern void get_cpu_mstate(struct cpu *, hrtime_t *);
+extern void thread_nomigrate(void);
+extern void thread_allowmigrate(void);
+extern void weakbinding_stop(void);
+extern void weakbinding_start(void);
+
+/*
+ * The following routines affect the CPU's participation in interrupt processing,
+ * if that is applicable on the architecture. This only affects interrupts
+ * which aren't directed at the processor (not cross calls).
+ *
+ * cpu_disable_intr returns non-zero if interrupts were previously enabled.
+ */
+int cpu_disable_intr(struct cpu *cp); /* stop issuing interrupts to cpu */
+void cpu_enable_intr(struct cpu *cp); /* start issuing interrupts to cpu */
+
+/*
+ * The mutex cpu_lock protects cpu_flags for all CPUs, as well as the ncpus
+ * and ncpus_online counts.
+ */
+extern kmutex_t cpu_lock; /* lock protecting CPU data */
+
+/*
+ * CPU state change events
+ *
+ * Various subsystems need to know when CPUs change their state. They get this
+ * information by registering CPU state change callbacks using
+ * register_cpu_setup_func(). Whenever any CPU changes its state, the callback
+ * function is called. The callback function is passed three arguments:
+ *
+ * Event, described by cpu_setup_t
+ * CPU ID
+ * Transparent pointer passed when registering the callback
+ *
+ * The callback function is called with cpu_lock held. The return value from the
+ * callback function is usually ignored, except for CPU_CONFIG and CPU_UNCONFIG
+ * events. For these two events, a non-zero return value indicates a failure and
+ * prevents successful completion of the operation.
+ *
+ * New events may be added in the future. Callback functions should ignore any
+ * events that they do not understand.
+ *
+ * The following events provide notification callbacks:
+ *
+ * CPU_INIT A new CPU is started and added to the list of active CPUs
+ * This event is only used during boot
+ *
+ * CPU_CONFIG	A newly inserted CPU is prepared to start running code
+ * This event is called by DR code
+ *
+ * CPU_UNCONFIG CPU has been powered off and needs cleanup
+ * This event is called by DR code
+ *
+ * CPU_ON CPU is enabled but does not run anything yet
+ *
+ * CPU_INTR_ON CPU is enabled and has interrupts enabled
+ *
+ * CPU_OFF CPU is going offline but can still run threads
+ *
+ * CPU_CPUPART_OUT CPU is going to move out of its partition
+ *
+ * CPU_CPUPART_IN CPU is going to move to a new partition
+ *
+ * CPU_SETUP CPU is set up during boot and can run threads
+ */
+typedef enum {
+ CPU_INIT,
+ CPU_CONFIG,
+ CPU_UNCONFIG,
+ CPU_ON,
+ CPU_OFF,
+ CPU_CPUPART_IN,
+ CPU_CPUPART_OUT,
+ CPU_SETUP,
+ CPU_INTR_ON
+} cpu_setup_t;
+
+typedef int cpu_setup_func_t(cpu_setup_t, int, void *);
+
+/*
+ * Routines used to register interest in CPUs being added to or removed
+ * from the system.
+ */
+extern void register_cpu_setup_func(cpu_setup_func_t *, void *);
+extern void unregister_cpu_setup_func(cpu_setup_func_t *, void *);
+extern void cpu_state_change_notify(int, cpu_setup_t);
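+
+/*
+ * Illustrative sketch (not part of the interface above): a subsystem that
+ * keeps per-CPU state might register a callback such as the one below. The
+ * callback runs with cpu_lock held; returning non-zero from CPU_CONFIG would
+ * veto the reconfiguration. The example_*_percpu() helpers are hypothetical.
+ */
+#if 0
+static int
+example_cpu_setup(cpu_setup_t what, int id, void *arg)
+{
+	switch (what) {
+	case CPU_CONFIG:
+		/* hypothetical helper: allocate per-CPU state for cpu[id] */
+		return (example_alloc_percpu(id) == 0 ? 0 : -1);
+	case CPU_UNCONFIG:
+		example_free_percpu(id);	/* hypothetical helper */
+		break;
+	default:
+		break;		/* ignore events we do not understand */
+	}
+	return (0);
+}
+
+/* registration, typically from the subsystem's init path: */
+/* register_cpu_setup_func(example_cpu_setup, NULL); */
+#endif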
+
+/*
+ * Call specified function on the given CPU
+ */
+typedef void (*cpu_call_func_t)(uintptr_t, uintptr_t);
+extern void cpu_call(cpu_t *, cpu_call_func_t, uintptr_t, uintptr_t);
+
+
+/*
+ * Create various strings that describe the given CPU for the
+ * processor_info system call and configuration-related kstats.
+ */
+#define CPU_IDSTRLEN 100
+
+extern void init_cpu_info(struct cpu *);
+extern void populate_idstr(struct cpu *);
+extern void cpu_vm_data_init(struct cpu *);
+extern void cpu_vm_data_destroy(struct cpu *);
+
+#endif /* _KERNEL || _FAKE_KERNEL */
+
+#ifdef __cplusplus
+}
+#endif
+
+#endif /* _SYS_CPUVAR_H */
diff --git a/sys/cddl/contrib/opensolaris/uts/common/sys/cred.h b/sys/cddl/contrib/opensolaris/uts/common/sys/cred.h
new file mode 100644
index 000000000000..5056f9a51105
--- /dev/null
+++ b/sys/cddl/contrib/opensolaris/uts/common/sys/cred.h
@@ -0,0 +1,193 @@
+/*
+ * CDDL HEADER START
+ *
+ * The contents of this file are subject to the terms of the
+ * Common Development and Distribution License (the "License").
+ * You may not use this file except in compliance with the License.
+ *
+ * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
+ * or http://www.opensolaris.org/os/licensing.
+ * See the License for the specific language governing permissions
+ * and limitations under the License.
+ *
+ * When distributing Covered Code, include this CDDL HEADER in each
+ * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
+ * If applicable, add the following below this CDDL HEADER, with the
+ * fields enclosed by brackets "[]" replaced with your own identifying
+ * information: Portions Copyright [yyyy] [name of copyright owner]
+ *
+ * CDDL HEADER END
+ */
+/*
+ * Copyright 2009 Sun Microsystems, Inc. All rights reserved.
+ * Use is subject to license terms.
+ */
+
+/* Copyright (c) 1983, 1984, 1985, 1986, 1987, 1988, 1989 AT&T */
+/* All Rights Reserved */
+
+/*
+ * Portions of this source code were derived from Berkeley 4.3 BSD
+ * under license from the Regents of the University of California.
+ */
+
+#ifndef _SYS_CRED_H
+#define _SYS_CRED_H
+
+#include <sys/types.h>
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+/*
+ * The credential is an opaque kernel private data structure defined in
+ * <sys/cred_impl.h>.
+ */
+
+typedef struct cred cred_t;
+
+#ifdef _KERNEL
+
+#define CRED() curthread->t_cred
+
+struct proc; /* cred.h is included in proc.h */
+struct prcred;
+struct ksid;
+struct ksidlist;
+struct credklpd;
+struct credgrp;
+
+struct auditinfo_addr; /* cred.h is included in audit.h */
+
+extern int ngroups_max;
+/*
+ * kcred is used when you need all privileges.
+ */
+extern struct cred *kcred;
+
+extern void cred_init(void);
+extern void crhold(cred_t *);
+extern void crfree(cred_t *);
+extern cred_t *cralloc(void); /* all but ref uninitialized */
+extern cred_t *cralloc_ksid(void); /* cralloc() + ksid alloc'ed */
+extern cred_t *crget(void); /* initialized */
+extern cred_t *crcopy(cred_t *);
+extern void crcopy_to(cred_t *, cred_t *);
+extern cred_t *crdup(cred_t *);
+extern void crdup_to(cred_t *, cred_t *);
+extern cred_t *crgetcred(void);
+extern void crset(struct proc *, cred_t *);
+extern void crset_zone_privall(cred_t *);
+extern int groupmember(gid_t, const cred_t *);
+extern int supgroupmember(gid_t, const cred_t *);
+extern int hasprocperm(const cred_t *, const cred_t *);
+extern int prochasprocperm(struct proc *, struct proc *, const cred_t *);
+extern int crcmp(const cred_t *, const cred_t *);
+extern cred_t *zone_kcred(void);
+
+extern uid_t crgetuid(const cred_t *);
+extern uid_t crgetruid(const cred_t *);
+extern uid_t crgetsuid(const cred_t *);
+extern gid_t crgetgid(const cred_t *);
+extern gid_t crgetrgid(const cred_t *);
+extern gid_t crgetsgid(const cred_t *);
+extern zoneid_t crgetzoneid(const cred_t *);
+extern projid_t crgetprojid(const cred_t *);
+
+extern cred_t *crgetmapped(const cred_t *);
+
+
+extern const struct auditinfo_addr *crgetauinfo(const cred_t *);
+extern struct auditinfo_addr *crgetauinfo_modifiable(cred_t *);
+
+extern uint_t crgetref(const cred_t *);
+
+extern const gid_t *crgetgroups(const cred_t *);
+extern const gid_t *crgetggroups(const struct credgrp *);
+
+extern int crgetngroups(const cred_t *);
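+
+/*
+ * Illustrative sketch (not part of the interface above): a typical kernel
+ * permission check combines CRED() with the accessors above. This is an
+ * example only, not an actual policy routine.
+ */
+#if 0
+static int
+example_may_touch(gid_t file_gid)
+{
+	cred_t *cr = CRED();	/* credential of the current thread */
+
+	if (crgetuid(cr) == 0)
+		return (1);	/* simplification; real code checks privileges */
+	return (groupmember(file_gid, cr));
+}
+#endif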
+
+/*
+ * Sets real, effective and/or saved uid/gid;
+ * -1 argument accepted as "no change".
+ */
+extern int crsetresuid(cred_t *, uid_t, uid_t, uid_t);
+extern int crsetresgid(cred_t *, gid_t, gid_t, gid_t);
+
+/*
+ * Sets real, effective and saved uids/gids all to the same
+ * values. Both values must be non-negative and <= MAXUID
+ */
+extern int crsetugid(cred_t *, uid_t, gid_t);
+
+/*
+ * Functions to handle the supplemental group list.
+ */
+extern int crsetgroups(cred_t *, int, gid_t *);
+extern struct credgrp *crgrpcopyin(int, gid_t *);
+extern void crgrprele(struct credgrp *);
+extern void crsetcredgrp(cred_t *, struct credgrp *);
+
+/*
+ * Private interface for setting zone association of credential.
+ */
+struct zone;
+extern void crsetzone(cred_t *, struct zone *);
+extern struct zone *crgetzone(const cred_t *);
+
+/*
+ * Private interface for setting project id in credential.
+ */
+extern void crsetprojid(cred_t *, projid_t);
+
+/*
+ * Private interface for nfs.
+ */
+extern cred_t *crnetadjust(cred_t *);
+
+/*
+ * Private interface for procfs.
+ */
+extern void cred2prcred(const cred_t *, struct prcred *);
+
+/*
+ * Private interfaces for Rampart Trusted Solaris.
+ */
+struct ts_label_s;
+extern struct ts_label_s *crgetlabel(const cred_t *);
+extern boolean_t crisremote(const cred_t *);
+
+/*
+ * Private interfaces for ephemeral uids.
+ */
+#define VALID_UID(id, zn) \
+ ((id) <= MAXUID || valid_ephemeral_uid((zn), (id)))
+
+#define VALID_GID(id, zn) \
+ ((id) <= MAXUID || valid_ephemeral_gid((zn), (id)))
+
+extern boolean_t valid_ephemeral_uid(struct zone *, uid_t);
+extern boolean_t valid_ephemeral_gid(struct zone *, gid_t);
+
+extern int eph_uid_alloc(struct zone *, int, uid_t *, int);
+extern int eph_gid_alloc(struct zone *, int, gid_t *, int);
+
+extern void crsetsid(cred_t *, struct ksid *, int);
+extern void crsetsidlist(cred_t *, struct ksidlist *);
+
+extern struct ksid *crgetsid(const cred_t *, int);
+extern struct ksidlist *crgetsidlist(const cred_t *);
+
+extern int crsetpriv(cred_t *, ...);
+
+extern struct credklpd *crgetcrklpd(const cred_t *);
+extern void crsetcrklpd(cred_t *, struct credklpd *);
+
+#endif /* _KERNEL */
+
+#ifdef __cplusplus
+}
+#endif
+
+#endif /* _SYS_CRED_H */
diff --git a/sys/cddl/contrib/opensolaris/uts/common/sys/ctf.h b/sys/cddl/contrib/opensolaris/uts/common/sys/ctf.h
new file mode 100644
index 000000000000..528814118e1c
--- /dev/null
+++ b/sys/cddl/contrib/opensolaris/uts/common/sys/ctf.h
@@ -0,0 +1,360 @@
+/*
+ * CDDL HEADER START
+ *
+ * The contents of this file are subject to the terms of the
+ * Common Development and Distribution License, Version 1.0 only
+ * (the "License"). You may not use this file except in compliance
+ * with the License.
+ *
+ * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
+ * or http://www.opensolaris.org/os/licensing.
+ * See the License for the specific language governing permissions
+ * and limitations under the License.
+ *
+ * When distributing Covered Code, include this CDDL HEADER in each
+ * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
+ * If applicable, add the following below this CDDL HEADER, with the
+ * fields enclosed by brackets "[]" replaced with your own identifying
+ * information: Portions Copyright [yyyy] [name of copyright owner]
+ *
+ * CDDL HEADER END
+ */
+/*
+ * Copyright 2004 Sun Microsystems, Inc. All rights reserved.
+ * Use is subject to license terms.
+ */
+
+#ifndef _CTF_H
+#define _CTF_H
+
+#ifdef illumos
+#pragma ident "%Z%%M% %I% %E% SMI"
+#endif
+
+#include <sys/types.h>
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+/*
+ * CTF - Compact ANSI-C Type Format
+ *
+ * This file format can be used to compactly represent the information needed
+ * by a debugger to interpret the ANSI-C types used by a given program.
+ * Traditionally, this kind of information is generated by the compiler when
+ * invoked with the -g flag and is stored in "stabs" strings or in the more
+ * modern DWARF format. CTF provides a representation of only the information
+ * that is relevant to debugging a complex, optimized C program such as the
+ * operating system kernel in a form that is significantly more compact than
+ * the equivalent stabs or DWARF representation. The format is data-model
+ * independent, so consumers do not need different code depending on whether
+ * they are 32-bit or 64-bit programs. CTF assumes that a standard ELF symbol
+ * table is available for use in the debugger, and uses the structure and data
+ * of the symbol table to avoid storing redundant information. The CTF data
+ * may be compressed on disk or in memory, indicated by a bit in the header.
+ * CTF may be interpreted in a raw disk file, or it may be stored in an ELF
+ * section, typically named .SUNW_ctf. Data structures are aligned so that
+ * a raw CTF file or CTF ELF section may be manipulated using mmap(2).
+ *
+ * The CTF file or section itself has the following structure:
+ *
+ * +--------+--------+---------+----------+-------+--------+
+ * | file | type | data | function | data | string |
+ * | header | labels | objects | info | types | table |
+ * +--------+--------+---------+----------+-------+--------+
+ *
+ * The file header stores a magic number and version information, encoding
+ * flags, and the byte offset of each of the sections relative to the end of the
+ * header itself. If the CTF data has been uniquified against another set of
+ * CTF data, a reference to that data also appears in the header. This
+ * reference is the name of the label corresponding to the types uniquified
+ * against.
+ *
+ * Following the header is a list of labels, used to group the types included in
+ * the data types section. Each label is accompanied by a type ID i. A given
+ * label refers to the group of types whose IDs are in the range [0, i].
+ *
+ * Data object and function records are stored in the same order as they appear
+ * in the corresponding symbol table, except that symbols marked SHN_UNDEF are
+ * not stored and symbols that have no type data are padded out with zeroes.
+ * For each data object, the type ID (a small integer) is recorded. For each
+ * function, the type ID of the return type and argument types is recorded.
+ *
+ * The data types section is a list of variable size records that represent each
+ * type, in order by their ID. The types themselves form a directed graph,
+ * where each node may contain one or more outgoing edges to other type nodes,
+ * denoted by their ID.
+ *
+ * Strings are recorded as a string table ID (0 or 1) and a byte offset into the
+ * string table. String table 0 is the internal CTF string table. String table
+ * 1 is the external string table, which is the string table associated with the
+ * ELF symbol table for this object. CTF does not record any strings that are
+ * already in the symbol table, and the CTF string table does not contain any
+ * duplicated strings.
+ *
+ * If the CTF data has been merged with another parent CTF object, some outgoing
+ * edges may refer to type nodes that exist in another CTF object. The debugger
+ * and libctf library are responsible for connecting the appropriate objects
+ * together so that the full set of types can be explored and manipulated.
+ */
+
+#define CTF_MAX_TYPE 0xffff /* max type identifier value */
+#define CTF_MAX_NAME 0x7fffffff /* max offset into a string table */
+#define CTF_MAX_VLEN 0x3ff /* max struct, union, enum members or args */
+#define CTF_MAX_INTOFF 0xff /* max offset of intrinsic value in bits */
+#define CTF_MAX_INTBITS 0xffff /* max size of an intrinsic in bits */
+
+/* See ctf_type_t */
+#define CTF_MAX_SIZE 0xfffe /* max size of a type in bytes */
+#define CTF_LSIZE_SENT 0xffff /* sentinel for ctt_size */
+#define CTF_MAX_LSIZE UINT64_MAX
+
+typedef struct ctf_preamble {
+ ushort_t ctp_magic; /* magic number (CTF_MAGIC) */
+ uchar_t ctp_version; /* data format version number (CTF_VERSION) */
+ uchar_t ctp_flags; /* flags (see below) */
+} ctf_preamble_t;
+
+typedef struct ctf_header {
+ ctf_preamble_t cth_preamble;
+ uint_t cth_parlabel; /* ref to name of parent lbl uniq'd against */
+ uint_t cth_parname; /* ref to basename of parent */
+ uint_t cth_lbloff; /* offset of label section */
+ uint_t cth_objtoff; /* offset of object section */
+ uint_t cth_funcoff; /* offset of function section */
+ uint_t cth_typeoff; /* offset of type section */
+ uint_t cth_stroff; /* offset of string section */
+ uint_t cth_strlen; /* length of string section in bytes */
+} ctf_header_t;
+
+#define cth_magic cth_preamble.ctp_magic
+#define cth_version cth_preamble.ctp_version
+#define cth_flags cth_preamble.ctp_flags
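+
+/*
+ * Illustrative sketch (not part of the format definition above): a consumer
+ * handed a raw CTF buffer (e.g. an mmap'ed file or a .SUNW_ctf section) would
+ * sanity-check the preamble before interpreting the section offsets.
+ */
+#if 0
+static int
+example_ctf_check(const void *buf, size_t len)
+{
+	const ctf_header_t *hp = buf;
+
+	if (len < sizeof (ctf_header_t))
+		return (0);
+	if (hp->cth_magic != CTF_MAGIC || hp->cth_version != CTF_VERSION)
+		return (0);
+	if (hp->cth_flags & CTF_F_COMPRESS) {
+		/* data past the header is compressed; decompress before use */
+	}
+	return (1);
+}
+#endif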
+
+#ifdef CTF_OLD_VERSIONS
+
+typedef struct ctf_header_v1 {
+ ctf_preamble_t cth_preamble;
+ uint_t cth_objtoff;
+ uint_t cth_funcoff;
+ uint_t cth_typeoff;
+ uint_t cth_stroff;
+ uint_t cth_strlen;
+} ctf_header_v1_t;
+
+#endif /* CTF_OLD_VERSIONS */
+
+#define CTF_MAGIC 0xcff1 /* magic number identifying header */
+
+/* data format version number */
+#define CTF_VERSION_1 1
+#define CTF_VERSION_2 2
+#define CTF_VERSION CTF_VERSION_2 /* current version */
+
+#define CTF_F_COMPRESS 0x1 /* data buffer is compressed */
+
+typedef struct ctf_lblent {
+ uint_t ctl_label; /* ref to name of label */
+ uint_t ctl_typeidx; /* last type associated with this label */
+} ctf_lblent_t;
+
+typedef struct ctf_stype {
+ uint_t ctt_name; /* reference to name in string table */
+ ushort_t ctt_info; /* encoded kind, variant length (see below) */
+ union {
+ ushort_t _size; /* size of entire type in bytes */
+ ushort_t _type; /* reference to another type */
+ } _u;
+} ctf_stype_t;
+
+/*
+ * Type sizes, measured in bytes, come in two flavors. 99% of them fit within
+ * (USHRT_MAX - 1), and thus can be stored in the ctt_size member of a
+ * ctf_stype_t. The maximum value for these sizes is CTF_MAX_SIZE. Sizes
+ * larger than CTF_MAX_SIZE must be stored in the ctt_lsize member of a
+ * ctf_type_t. Use of this member is indicated by the presence of
+ * CTF_LSIZE_SENT in ctt_size.
+ */
+typedef struct ctf_type {
+ uint_t ctt_name; /* reference to name in string table */
+ ushort_t ctt_info; /* encoded kind, variant length (see below) */
+ union {
+ ushort_t _size; /* always CTF_LSIZE_SENT */
+ ushort_t _type; /* do not use */
+ } _u;
+ uint_t ctt_lsizehi; /* high 32 bits of type size in bytes */
+ uint_t ctt_lsizelo; /* low 32 bits of type size in bytes */
+} ctf_type_t;
+
+#define ctt_size _u._size /* for fundamental types that have a size */
+#define ctt_type _u._type /* for types that reference another type */
+
+/*
+ * The following macros compose and decompose values for ctt_info and
+ * ctt_name, as well as other structures that contain name references.
+ *
+ * ------------------------
+ * ctt_info: | kind | isroot | vlen |
+ * ------------------------
+ * 15 11 10 9 0
+ *
+ * kind = CTF_INFO_KIND(c.ctt_info); <-- CTF_K_* value (see below)
+ * vlen = CTF_INFO_VLEN(c.ctt_info); <-- length of variable data list
+ *
+ * stid = CTF_NAME_STID(c.ctt_name); <-- string table id number (0 or 1)
+ * offset = CTF_NAME_OFFSET(c.ctt_name); <-- string table byte offset
+ *
+ * c.ctt_info = CTF_TYPE_INFO(kind, vlen);
+ * c.ctt_name = CTF_TYPE_NAME(stid, offset);
+ */
+
+#define CTF_INFO_KIND(info) (((info) & 0xf800) >> 11)
+#define CTF_INFO_ISROOT(info) (((info) & 0x0400) >> 10)
+#define CTF_INFO_VLEN(info) (((info) & CTF_MAX_VLEN))
+
+#define CTF_NAME_STID(name) ((name) >> 31)
+#define CTF_NAME_OFFSET(name) ((name) & 0x7fffffff)
+
+#define CTF_TYPE_INFO(kind, isroot, vlen) \
+ (((kind) << 11) | (((isroot) ? 1 : 0) << 10) | ((vlen) & CTF_MAX_VLEN))
+
+#define CTF_TYPE_NAME(stid, offset) \
+ (((stid) << 31) | ((offset) & 0x7fffffff))
+
+#define CTF_TYPE_ISPARENT(id) ((id) < 0x8000)
+#define CTF_TYPE_ISCHILD(id) ((id) > 0x7fff)
+
+#define CTF_TYPE_TO_INDEX(id) ((id) & 0x7fff)
+#define CTF_INDEX_TO_TYPE(id, child) ((child) ? ((id) | 0x8000) : (id))
+#define CTF_PARENT_SHIFT 15
+
+#define CTF_STRTAB_0 0 /* symbolic define for string table id 0 */
+#define CTF_STRTAB_1 1 /* symbolic define for string table id 1 */
+
+#define CTF_TYPE_LSIZE(cttp) \
+ (((uint64_t)(cttp)->ctt_lsizehi) << 32 | (cttp)->ctt_lsizelo)
+#define CTF_SIZE_TO_LSIZE_HI(size) ((uint32_t)((uint64_t)(size) >> 32))
+#define CTF_SIZE_TO_LSIZE_LO(size) ((uint32_t)(size))
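+
+/*
+ * Illustrative sketch (not part of the format definition above): decoding a
+ * type record uses CTF_INFO_KIND()/CTF_INFO_VLEN() on ctt_info, and must
+ * check ctt_size for the CTF_LSIZE_SENT sentinel before trusting it. The
+ * increment returned here covers the record itself; variant data follows.
+ */
+#if 0
+static uint64_t
+example_type_size(const ctf_type_t *tp, size_t *incrementp)
+{
+	uint_t kind = CTF_INFO_KIND(tp->ctt_info);	/* a CTF_K_* value */
+
+	(void) kind;
+	if (tp->ctt_size == CTF_LSIZE_SENT) {
+		*incrementp = sizeof (ctf_type_t);	/* large form */
+		return (CTF_TYPE_LSIZE(tp));
+	}
+	*incrementp = sizeof (ctf_stype_t);		/* small form */
+	return (tp->ctt_size);
+}
+#endif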
+
+#ifdef CTF_OLD_VERSIONS
+
+#define CTF_INFO_KIND_V1(info) (((info) & 0xf000) >> 12)
+#define CTF_INFO_ISROOT_V1(info) (((info) & 0x0800) >> 11)
+#define CTF_INFO_VLEN_V1(info) (((info) & 0x07ff))
+
+#define CTF_TYPE_INFO_V1(kind, isroot, vlen) \
+ (((kind) << 12) | (((isroot) ? 1 : 0) << 11) | ((vlen) & 0x07ff))
+
+#endif /* CTF_OLD_VERSIONS */
+
+/*
+ * Values for CTF_INFO_KIND(). If the kind has an associated data list,
+ * CTF_INFO_VLEN() will extract the number of elements in the list, and
+ * the type of each element is shown in the comments below.
+ */
+#define CTF_K_UNKNOWN 0 /* unknown type (used for padding) */
+#define CTF_K_INTEGER 1 /* variant data is CTF_INT_DATA() (see below) */
+#define CTF_K_FLOAT 2 /* variant data is CTF_FP_DATA() (see below) */
+#define CTF_K_POINTER 3 /* ctt_type is referenced type */
+#define CTF_K_ARRAY 4 /* variant data is single ctf_array_t */
+#define CTF_K_FUNCTION 5 /* ctt_type is return type, variant data is */
+ /* list of argument types (ushort_t's) */
+#define CTF_K_STRUCT 6 /* variant data is list of ctf_member_t's */
+#define CTF_K_UNION 7 /* variant data is list of ctf_member_t's */
+#define CTF_K_ENUM 8 /* variant data is list of ctf_enum_t's */
+#define CTF_K_FORWARD 9 /* no additional data; ctt_name is tag */
+#define CTF_K_TYPEDEF 10 /* ctt_type is referenced type */
+#define CTF_K_VOLATILE 11 /* ctt_type is base type */
+#define CTF_K_CONST 12 /* ctt_type is base type */
+#define CTF_K_RESTRICT 13 /* ctt_type is base type */
+
+#define CTF_K_MAX 31 /* Maximum possible CTF_K_* value */
+
+/*
+ * Values for the variant data when kind is CTF_K_INTEGER. The flags, offset
+ * in bits, and size in bits are encoded as a single word using the following
+ * macros.
+ */
+#define CTF_INT_ENCODING(data) (((data) & 0xff000000) >> 24)
+#define CTF_INT_OFFSET(data) (((data) & 0x00ff0000) >> 16)
+#define CTF_INT_BITS(data) (((data) & 0x0000ffff))
+
+#define CTF_INT_DATA(encoding, offset, bits) \
+ (((encoding) << 24) | ((offset) << 16) | (bits))
+
+#define CTF_INT_SIGNED 0x01 /* integer is signed (otherwise unsigned) */
+#define CTF_INT_CHAR 0x02 /* character display format */
+#define CTF_INT_BOOL 0x04 /* boolean display format */
+#define CTF_INT_VARARGS 0x08 /* varargs display format */
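+
+/*
+ * Illustrative sketch (not part of the format definition above): the variant
+ * word for a plain 32-bit signed int, and how a reader takes it apart again.
+ */
+#if 0
+uint_t word = CTF_INT_DATA(CTF_INT_SIGNED, 0, 32);
+
+uint_t enc  = CTF_INT_ENCODING(word);	/* CTF_INT_SIGNED */
+uint_t off  = CTF_INT_OFFSET(word);	/* 0 bits */
+uint_t bits = CTF_INT_BITS(word);	/* 32 bits */
+#endif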
+
+/*
+ * Values for the variant data when kind is CTF_K_FLOAT. The encoding, offset
+ * in bits, and size in bits are encoded as a single word using the following
+ * macros.
+ */
+#define CTF_FP_ENCODING(data) (((data) & 0xff000000) >> 24)
+#define CTF_FP_OFFSET(data) (((data) & 0x00ff0000) >> 16)
+#define CTF_FP_BITS(data) (((data) & 0x0000ffff))
+
+#define CTF_FP_DATA(encoding, offset, bits) \
+ (((encoding) << 24) | ((offset) << 16) | (bits))
+
+#define CTF_FP_SINGLE 1 /* IEEE 32-bit float encoding */
+#define CTF_FP_DOUBLE 2 /* IEEE 64-bit float encoding */
+#define CTF_FP_CPLX 3 /* Complex encoding */
+#define CTF_FP_DCPLX 4 /* Double complex encoding */
+#define CTF_FP_LDCPLX 5 /* Long double complex encoding */
+#define CTF_FP_LDOUBLE 6 /* Long double encoding */
+#define CTF_FP_INTRVL 7 /* Interval (2x32-bit) encoding */
+#define CTF_FP_DINTRVL 8 /* Double interval (2x64-bit) encoding */
+#define CTF_FP_LDINTRVL 9 /* Long double interval (2x128-bit) encoding */
+#define CTF_FP_IMAGRY 10 /* Imaginary (32-bit) encoding */
+#define CTF_FP_DIMAGRY 11 /* Long imaginary (64-bit) encoding */
+#define CTF_FP_LDIMAGRY 12 /* Long double imaginary (128-bit) encoding */
+
+#define CTF_FP_MAX 12 /* Maximum possible CTF_FP_* value */
+
+typedef struct ctf_array {
+ ushort_t cta_contents; /* reference to type of array contents */
+ ushort_t cta_index; /* reference to type of array index */
+ uint_t cta_nelems; /* number of elements */
+} ctf_array_t;
+
+/*
+ * Most structure members have bit offsets that can be expressed using a
+ * short. Some don't. ctf_member_t is used for structs which cannot
+ * contain any of these large offsets, whereas ctf_lmember_t is used in the
+ * latter case. If ctt_size for a given struct is >= 8192 bytes, all members
+ * will be stored as type ctf_lmember_t.
+ */
+
+#define CTF_LSTRUCT_THRESH 8192
+
+typedef struct ctf_member {
+ uint_t ctm_name; /* reference to name in string table */
+ ushort_t ctm_type; /* reference to type of member */
+ ushort_t ctm_offset; /* offset of this member in bits */
+} ctf_member_t;
+
+typedef struct ctf_lmember {
+ uint_t ctlm_name; /* reference to name in string table */
+ ushort_t ctlm_type; /* reference to type of member */
+ ushort_t ctlm_pad; /* padding */
+ uint_t ctlm_offsethi; /* high 32 bits of member offset in bits */
+ uint_t ctlm_offsetlo; /* low 32 bits of member offset in bits */
+} ctf_lmember_t;
+
+#define CTF_LMEM_OFFSET(ctlmp) \
+ (((uint64_t)(ctlmp)->ctlm_offsethi) << 32 | (ctlmp)->ctlm_offsetlo)
+#define CTF_OFFSET_TO_LMEMHI(offset) ((uint32_t)((uint64_t)(offset) >> 32))
+#define CTF_OFFSET_TO_LMEMLO(offset) ((uint32_t)(offset))
+
+typedef struct ctf_enum {
+ uint_t cte_name; /* reference to name in string table */
+ int cte_value; /* value associated with this name */
+} ctf_enum_t;
+
+#ifdef __cplusplus
+}
+#endif
+
+#endif /* _CTF_H */
diff --git a/sys/cddl/contrib/opensolaris/uts/common/sys/ctf_api.h b/sys/cddl/contrib/opensolaris/uts/common/sys/ctf_api.h
new file mode 100644
index 000000000000..6b7ab01b6929
--- /dev/null
+++ b/sys/cddl/contrib/opensolaris/uts/common/sys/ctf_api.h
@@ -0,0 +1,251 @@
+/*
+ * CDDL HEADER START
+ *
+ * The contents of this file are subject to the terms of the
+ * Common Development and Distribution License, Version 1.0 only
+ * (the "License"). You may not use this file except in compliance
+ * with the License.
+ *
+ * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
+ * or http://www.opensolaris.org/os/licensing.
+ * See the License for the specific language governing permissions
+ * and limitations under the License.
+ *
+ * When distributing Covered Code, include this CDDL HEADER in each
+ * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
+ * If applicable, add the following below this CDDL HEADER, with the
+ * fields enclosed by brackets "[]" replaced with your own identifying
+ * information: Portions Copyright [yyyy] [name of copyright owner]
+ *
+ * CDDL HEADER END
+ */
+/*
+ * Copyright 2005 Sun Microsystems, Inc. All rights reserved.
+ * Use is subject to license terms.
+ */
+/*
+ * Copyright (c) 2013, Joyent, Inc. All rights reserved.
+ */
+
+/*
+ * This header file defines the interfaces available from the CTF debugger
+ * library, libctf, and an equivalent kernel module. This API can be used by
+ * a debugger to operate on data in the Compact ANSI-C Type Format (CTF).
+ * This is NOT a public interface, although it may eventually become one in
+ * the fullness of time after we gain more experience with the interfaces.
+ *
+ * In the meantime, be aware that any program linked with this API in this
+ * release of Solaris is almost guaranteed to break in the next release.
+ *
+ * In short, do not use this header file or the CTF routines for any purpose.
+ */
+
+#ifndef _CTF_API_H
+#define _CTF_API_H
+
+#include <sys/types.h>
+#include <sys/param.h>
+#include <sys/elf.h>
+#include <sys/ctf.h>
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+/*
+ * Clients can open one or more CTF containers and obtain a pointer to an
+ * opaque ctf_file_t. Types are identified by an opaque ctf_id_t token.
+ * These opaque definitions allow libctf to evolve without breaking clients.
+ */
+typedef struct ctf_file ctf_file_t;
+typedef long ctf_id_t;
+
+/*
+ * If the debugger needs to provide the CTF library with a set of raw buffers
+ * for use as the CTF data, symbol table, and string table, it can do so by
+ * filling in ctf_sect_t structures and passing them to ctf_bufopen():
+ */
+typedef struct ctf_sect {
+ const char *cts_name; /* section name (if any) */
+ ulong_t cts_type; /* section type (ELF SHT_... value) */
+ ulong_t cts_flags; /* section flags (ELF SHF_... value) */
+#ifdef illumos
+ const void *cts_data; /* pointer to section data */
+#else
+ void *cts_data; /* pointer to section data */
+#endif
+ size_t cts_size; /* size of data in bytes */
+ size_t cts_entsize; /* size of each section entry (symtab only) */
+ off64_t cts_offset; /* file offset of this section (if any) */
+} ctf_sect_t;
+
+/*
+ * Encoding information for integers, floating-point values, and certain other
+ * intrinsics can be obtained by calling ctf_type_encoding(), below. The flags
+ * field will contain values appropriate for the type defined in <sys/ctf.h>.
+ */
+typedef struct ctf_encoding {
+ uint_t cte_format; /* data format (CTF_INT_* or CTF_FP_* flags) */
+ uint_t cte_offset; /* offset of value in bits */
+ uint_t cte_bits; /* size of storage in bits */
+} ctf_encoding_t;
+
+typedef struct ctf_membinfo {
+ ctf_id_t ctm_type; /* type of struct or union member */
+ ulong_t ctm_offset; /* offset of member in bits */
+} ctf_membinfo_t;
+
+typedef struct ctf_arinfo {
+ ctf_id_t ctr_contents; /* type of array contents */
+ ctf_id_t ctr_index; /* type of array index */
+ uint_t ctr_nelems; /* number of elements */
+} ctf_arinfo_t;
+
+typedef struct ctf_funcinfo {
+ ctf_id_t ctc_return; /* function return type */
+ uint_t ctc_argc; /* number of typed arguments to function */
+ uint_t ctc_flags; /* function attributes (see below) */
+} ctf_funcinfo_t;
+
+typedef struct ctf_lblinfo {
+ ctf_id_t ctb_typeidx; /* last type associated with the label */
+} ctf_lblinfo_t;
+
+#define CTF_FUNC_VARARG 0x1 /* function arguments end with varargs */
+
+/*
+ * Functions that return integer status or a ctf_id_t use the following value
+ * to indicate failure. ctf_errno() can be used to obtain an error code.
+ */
+#define CTF_ERR (-1L)
+
+/*
+ * The CTF data model is inferred to be the caller's data model or the data
+ * model of the given object, unless ctf_setmodel() is explicitly called.
+ */
+#define CTF_MODEL_ILP32 1 /* object data model is ILP32 */
+#define CTF_MODEL_LP64 2 /* object data model is LP64 */
+#ifdef _LP64
+#define CTF_MODEL_NATIVE CTF_MODEL_LP64
+#else
+#define CTF_MODEL_NATIVE CTF_MODEL_ILP32
+#endif
+
+/*
+ * Dynamic CTF containers can be created using ctf_create(). The ctf_add_*
+ * routines can be used to add new definitions to the dynamic container.
+ * New types are labeled as root or non-root to determine whether they are
+ * visible at the top-level program scope when subsequently doing a lookup.
+ */
+#define CTF_ADD_NONROOT 0 /* type only visible in nested scope */
+#define CTF_ADD_ROOT 1 /* type visible at top-level scope */
+
+/*
+ * These typedefs are used to define the signature for callback functions
+ * that can be used with the iteration and visit functions below:
+ */
+typedef int ctf_visit_f(const char *, ctf_id_t, ulong_t, int, void *);
+typedef int ctf_member_f(const char *, ctf_id_t, ulong_t, void *);
+typedef int ctf_enum_f(const char *, int, void *);
+typedef int ctf_type_f(ctf_id_t, void *);
+typedef int ctf_label_f(const char *, const ctf_lblinfo_t *, void *);
+
+extern ctf_file_t *ctf_bufopen(const ctf_sect_t *, const ctf_sect_t *,
+ const ctf_sect_t *, int *);
+extern ctf_file_t *ctf_fdopen(int, int *);
+extern ctf_file_t *ctf_open(const char *, int *);
+extern ctf_file_t *ctf_create(int *);
+extern ctf_file_t *ctf_dup(ctf_file_t *);
+extern void ctf_close(ctf_file_t *);
+
+extern ctf_file_t *ctf_parent_file(ctf_file_t *);
+extern const char *ctf_parent_name(ctf_file_t *);
+
+extern int ctf_import(ctf_file_t *, ctf_file_t *);
+extern int ctf_setmodel(ctf_file_t *, int);
+extern int ctf_getmodel(ctf_file_t *);
+
+extern void ctf_setspecific(ctf_file_t *, void *);
+extern void *ctf_getspecific(ctf_file_t *);
+
+extern int ctf_errno(ctf_file_t *);
+extern const char *ctf_errmsg(int);
+extern int ctf_version(int);
+
+extern int ctf_func_info(ctf_file_t *, ulong_t, ctf_funcinfo_t *);
+extern int ctf_func_args(ctf_file_t *, ulong_t, uint_t, ctf_id_t *);
+
+extern ctf_id_t ctf_lookup_by_name(ctf_file_t *, const char *);
+extern ctf_id_t ctf_lookup_by_symbol(ctf_file_t *, ulong_t);
+
+extern ctf_id_t ctf_type_resolve(ctf_file_t *, ctf_id_t);
+extern ssize_t ctf_type_lname(ctf_file_t *, ctf_id_t, char *, size_t);
+extern char *ctf_type_name(ctf_file_t *, ctf_id_t, char *, size_t);
+extern char *ctf_type_qname(ctf_file_t *, ctf_id_t, char *, size_t,
+ const char *);
+extern ssize_t ctf_type_size(ctf_file_t *, ctf_id_t);
+extern ssize_t ctf_type_align(ctf_file_t *, ctf_id_t);
+extern int ctf_type_kind(ctf_file_t *, ctf_id_t);
+extern ctf_id_t ctf_type_reference(ctf_file_t *, ctf_id_t);
+extern ctf_id_t ctf_type_pointer(ctf_file_t *, ctf_id_t);
+extern int ctf_type_encoding(ctf_file_t *, ctf_id_t, ctf_encoding_t *);
+extern int ctf_type_visit(ctf_file_t *, ctf_id_t, ctf_visit_f *, void *);
+extern int ctf_type_cmp(ctf_file_t *, ctf_id_t, ctf_file_t *, ctf_id_t);
+extern int ctf_type_compat(ctf_file_t *, ctf_id_t, ctf_file_t *, ctf_id_t);
+
+extern int ctf_member_info(ctf_file_t *, ctf_id_t, const char *,
+ ctf_membinfo_t *);
+extern int ctf_array_info(ctf_file_t *, ctf_id_t, ctf_arinfo_t *);
+
+extern const char *ctf_enum_name(ctf_file_t *, ctf_id_t, int);
+extern int ctf_enum_value(ctf_file_t *, ctf_id_t, const char *, int *);
+
+extern const char *ctf_label_topmost(ctf_file_t *);
+extern int ctf_label_info(ctf_file_t *, const char *, ctf_lblinfo_t *);
+
+extern int ctf_member_iter(ctf_file_t *, ctf_id_t, ctf_member_f *, void *);
+extern int ctf_enum_iter(ctf_file_t *, ctf_id_t, ctf_enum_f *, void *);
+extern int ctf_type_iter(ctf_file_t *, ctf_type_f *, void *);
+extern int ctf_label_iter(ctf_file_t *, ctf_label_f *, void *);
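+
+/*
+ * Illustrative sketch (not part of the interface above): a ctf_member_f
+ * callback that reports each member of a struct type. The printf() is a
+ * stand-in; a kernel consumer would use its own reporting mechanism.
+ */
+#if 0
+static int
+example_member_cb(const char *name, ctf_id_t type, ulong_t off, void *arg)
+{
+	ctf_file_t *fp = arg;
+	char buf[128];
+
+	(void) printf("%s: type %ld (%s) at bit offset %lu\n", name, type,
+	    ctf_type_name(fp, type, buf, sizeof (buf)) != NULL ? buf : "?",
+	    off);
+	return (0);	/* non-zero would terminate the iteration */
+}
+
+/* (void) ctf_member_iter(fp, struct_id, example_member_cb, fp); */
+#endif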
+
+extern ctf_id_t ctf_add_array(ctf_file_t *, uint_t, const ctf_arinfo_t *);
+extern ctf_id_t ctf_add_const(ctf_file_t *, uint_t, ctf_id_t);
+extern ctf_id_t ctf_add_enum(ctf_file_t *, uint_t, const char *);
+extern ctf_id_t ctf_add_float(ctf_file_t *, uint_t,
+ const char *, const ctf_encoding_t *);
+extern ctf_id_t ctf_add_forward(ctf_file_t *, uint_t, const char *, uint_t);
+extern ctf_id_t ctf_add_function(ctf_file_t *, uint_t,
+ const ctf_funcinfo_t *, const ctf_id_t *);
+extern ctf_id_t ctf_add_integer(ctf_file_t *, uint_t,
+ const char *, const ctf_encoding_t *);
+extern ctf_id_t ctf_add_pointer(ctf_file_t *, uint_t, ctf_id_t);
+extern ctf_id_t ctf_add_type(ctf_file_t *, ctf_file_t *, ctf_id_t);
+extern ctf_id_t ctf_add_typedef(ctf_file_t *, uint_t, const char *, ctf_id_t);
+extern ctf_id_t ctf_add_restrict(ctf_file_t *, uint_t, ctf_id_t);
+extern ctf_id_t ctf_add_struct(ctf_file_t *, uint_t, const char *);
+extern ctf_id_t ctf_add_union(ctf_file_t *, uint_t, const char *);
+extern ctf_id_t ctf_add_volatile(ctf_file_t *, uint_t, ctf_id_t);
+
+extern int ctf_add_enumerator(ctf_file_t *, ctf_id_t, const char *, int);
+extern int ctf_add_member(ctf_file_t *, ctf_id_t, const char *, ctf_id_t);
+
+extern int ctf_set_array(ctf_file_t *, ctf_id_t, const ctf_arinfo_t *);
+
+extern int ctf_delete_type(ctf_file_t *, ctf_id_t);
+
+extern int ctf_update(ctf_file_t *);
+extern int ctf_discard(ctf_file_t *);
+extern int ctf_write(ctf_file_t *, int);
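+
+/*
+ * Illustrative sketch (not part of the interface above): building a dynamic
+ * container. New definitions only become visible to lookups once
+ * ctf_update() commits them; ctf_discard() would throw pending ones away.
+ */
+#if 0
+static ctf_file_t *
+example_make_container(void)
+{
+	ctf_encoding_t enc = { CTF_INT_SIGNED, 0, 32 };
+	ctf_file_t *fp;
+	ctf_id_t id;
+	int err;
+
+	if ((fp = ctf_create(&err)) == NULL)
+		return (NULL);
+	id = ctf_add_integer(fp, CTF_ADD_ROOT, "int", &enc);
+	if (id == CTF_ERR || ctf_update(fp) != 0) {
+		ctf_close(fp);
+		return (NULL);
+	}
+	return (fp);
+}
+#endif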
+
+#ifdef _KERNEL
+
+struct module;
+extern ctf_file_t *ctf_modopen(struct module *, int *);
+
+#endif
+
+#ifdef __cplusplus
+}
+#endif
+
+#endif /* _CTF_API_H */
diff --git a/sys/cddl/contrib/opensolaris/uts/common/sys/debug.h b/sys/cddl/contrib/opensolaris/uts/common/sys/debug.h
new file mode 100644
index 000000000000..cf8a15e5b6cc
--- /dev/null
+++ b/sys/cddl/contrib/opensolaris/uts/common/sys/debug.h
@@ -0,0 +1,159 @@
+/*
+ * CDDL HEADER START
+ *
+ * The contents of this file are subject to the terms of the
+ * Common Development and Distribution License (the "License").
+ * You may not use this file except in compliance with the License.
+ *
+ * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
+ * or http://www.opensolaris.org/os/licensing.
+ * See the License for the specific language governing permissions
+ * and limitations under the License.
+ *
+ * When distributing Covered Code, include this CDDL HEADER in each
+ * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
+ * If applicable, add the following below this CDDL HEADER, with the
+ * fields enclosed by brackets "[]" replaced with your own identifying
+ * information: Portions Copyright [yyyy] [name of copyright owner]
+ *
+ * CDDL HEADER END
+ */
+/*
+ * Copyright 2014 Garrett D'Amore <garrett@damore.org>
+ *
+ * Copyright 2010 Sun Microsystems, Inc. All rights reserved.
+ * Use is subject to license terms.
+ */
+
+/*
+ * Copyright (c) 2012, 2017 by Delphix. All rights reserved.
+ * Copyright 2013 Saso Kiselkov. All rights reserved.
+ */
+
+/* Copyright (c) 1984, 1986, 1987, 1988, 1989 AT&T */
+/* All Rights Reserved */
+
+#ifndef _SYS_DEBUG_H
+#define _SYS_DEBUG_H
+
+#include <sys/types.h>
+#include <sys/note.h>
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+/*
+ * ASSERT(ex) causes a panic or debugger entry if expression ex is not
+ * true. ASSERT() is included only for debugging, and is a no-op in
+ * production kernels. VERIFY(ex), on the other hand, behaves like
+ * ASSERT and is evaluated on both debug and non-debug kernels.
+ */
+
+extern int assfail(const char *, const char *, int);
+#define VERIFY(EX) ((void)((EX) || assfail(#EX, __FILE__, __LINE__)))
+#ifdef DEBUG
+#define ASSERT(EX) ((void)((EX) || assfail(#EX, __FILE__, __LINE__)))
+#else
+#define ASSERT(x) ((void)0)
+#endif
+
+/*
+ * Assertion variants sensitive to the compilation data model
+ */
+#if defined(_LP64)
+#define ASSERT64(x) ASSERT(x)
+#define ASSERT32(x)
+#else
+#define ASSERT64(x)
+#define ASSERT32(x) ASSERT(x)
+#endif
+
+/*
+ * IMPLY and EQUIV are assertions of the form:
+ *
+ * if (a) then (b)
+ * and
+ * if (a) then (b) *AND* if (b) then (a)
+ */
+#ifdef DEBUG
+#define IMPLY(A, B) \
+ ((void)(((!(A)) || (B)) || \
+ assfail("(" #A ") implies (" #B ")", __FILE__, __LINE__)))
+#define EQUIV(A, B) \
+ ((void)((!!(A) == !!(B)) || \
+ assfail("(" #A ") is equivalent to (" #B ")", __FILE__, __LINE__)))
+#else
+#define IMPLY(A, B) ((void)0)
+#define EQUIV(A, B) ((void)0)
+#endif
+
+/*
+ * ASSERT3() behaves like ASSERT() except that it is an explicit conditional,
+ * and prints out the values of the left and right hand expressions as part of
+ * the panic message to ease debugging. The variants imply the type of their
+ * arguments: ASSERT3B() is for booleans, ASSERT3S() is for signed data types,
+ * ASSERT3U() is for unsigned, and ASSERT3P() is for pointers. The VERIFY3*()
+ * macros have the same relationship as above.
+ */
+extern void assfail3(const char *, uintmax_t, const char *, uintmax_t,
+ const char *, int);
+#define VERIFY3_IMPL(LEFT, OP, RIGHT, TYPE) do { \
+ const TYPE __left = (TYPE)(LEFT); \
+ const TYPE __right = (TYPE)(RIGHT); \
+ if (!(__left OP __right)) \
+ assfail3(#LEFT " " #OP " " #RIGHT, \
+ (uintmax_t)__left, #OP, (uintmax_t)__right, \
+ __FILE__, __LINE__); \
+_NOTE(CONSTCOND) } while (0)
+
+#define VERIFY3B(x, y, z) VERIFY3_IMPL(x, y, z, boolean_t)
+#define VERIFY3S(x, y, z) VERIFY3_IMPL(x, y, z, int64_t)
+#define VERIFY3U(x, y, z) VERIFY3_IMPL(x, y, z, uint64_t)
+#define VERIFY3P(x, y, z) VERIFY3_IMPL(x, y, z, uintptr_t)
+#define VERIFY0(x) VERIFY3_IMPL(x, ==, 0, uintmax_t)
+
+#ifdef DEBUG
+#define ASSERT3B(x, y, z) VERIFY3_IMPL(x, y, z, boolean_t)
+#define ASSERT3S(x, y, z) VERIFY3_IMPL(x, y, z, int64_t)
+#define ASSERT3U(x, y, z) VERIFY3_IMPL(x, y, z, uint64_t)
+#define ASSERT3P(x, y, z) VERIFY3_IMPL(x, y, z, uintptr_t)
+#define ASSERT0(x) VERIFY3_IMPL(x, ==, 0, uintmax_t)
+#else
+#define ASSERT3B(x, y, z) ((void)0)
+#define ASSERT3S(x, y, z) ((void)0)
+#define ASSERT3U(x, y, z) ((void)0)
+#define ASSERT3P(x, y, z) ((void)0)
+#define ASSERT0(x) ((void)0)
+#endif
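+
+/*
+ * Illustrative sketch (not part of the interface above): the typed variants
+ * report both operands on failure, so prefer ASSERT3U(a, <=, b) over
+ * ASSERT(a <= b) when the values matter for post-mortem debugging.
+ */
+#if 0
+static void
+example_consume(uint64_t used, uint64_t limit, void *ptr)
+{
+	ASSERT3U(used, <=, limit);	/* panic message shows used and limit */
+	ASSERT3P(ptr, !=, NULL);
+	IMPLY(used == limit, ptr != NULL);
+}
+#endif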
+
+/*
+ * Compile-time assertion. The condition 'x' must be constant.
+ */
+#ifndef CTASSERT
+#define CTASSERT(x) _CTASSERT(x, __LINE__)
+#define _CTASSERT(x, y) __CTASSERT(x, y)
+#define __CTASSERT(x, y) \
+ _Static_assert((x), "Static assert failed at " #y)
+#endif
+
+#ifdef _KERNEL
+
+extern void abort_sequence_enter(char *);
+extern void debug_enter(char *);
+
+#endif /* _KERNEL */
+
+#if defined(DEBUG) && !defined(__sun)
+/* CSTYLED */
+#define STATIC
+#else
+/* CSTYLED */
+#define STATIC static
+#endif
+
+#ifdef __cplusplus
+}
+#endif
+
+#endif /* _SYS_DEBUG_H */
diff --git a/sys/cddl/contrib/opensolaris/uts/common/sys/dtrace.h b/sys/cddl/contrib/opensolaris/uts/common/sys/dtrace.h
new file mode 100644
index 000000000000..b474f91ce01d
--- /dev/null
+++ b/sys/cddl/contrib/opensolaris/uts/common/sys/dtrace.h
@@ -0,0 +1,2510 @@
+/*
+ * CDDL HEADER START
+ *
+ * The contents of this file are subject to the terms of the
+ * Common Development and Distribution License (the "License").
+ * You may not use this file except in compliance with the License.
+ *
+ * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
+ * or http://www.opensolaris.org/os/licensing.
+ * See the License for the specific language governing permissions
+ * and limitations under the License.
+ *
+ * When distributing Covered Code, include this CDDL HEADER in each
+ * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
+ * If applicable, add the following below this CDDL HEADER, with the
+ * fields enclosed by brackets "[]" replaced with your own identifying
+ * information: Portions Copyright [yyyy] [name of copyright owner]
+ *
+ * CDDL HEADER END
+ */
+
+/*
+ * Copyright 2007 Sun Microsystems, Inc. All rights reserved.
+ * Use is subject to license terms.
+ */
+
+/*
+ * Copyright (c) 2013, Joyent, Inc. All rights reserved.
+ * Copyright (c) 2013 by Delphix. All rights reserved.
+ */
+
+#ifndef _SYS_DTRACE_H
+#define _SYS_DTRACE_H
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+/*
+ * DTrace Dynamic Tracing Software: Kernel Interfaces
+ *
+ * Note: The contents of this file are private to the implementation of the
+ * Solaris system and DTrace subsystem and are subject to change at any time
+ * without notice. Applications and drivers using these interfaces will fail
+ * to run on future releases. These interfaces should not be used for any
+ * purpose except those expressly outlined in dtrace(7D) and libdtrace(3LIB).
+ * Please refer to the "Solaris Dynamic Tracing Guide" for more information.
+ */
+
+#ifndef _ASM
+
+#include <sys/types.h>
+#include <sys/modctl.h>
+#include <sys/processor.h>
+#ifdef illumos
+#include <sys/systm.h>
+#else
+#include <sys/cpuvar.h>
+#include <sys/param.h>
+#include <sys/linker.h>
+#include <sys/ioccom.h>
+#include <sys/ucred.h>
+typedef int model_t;
+#endif
+#include <sys/ctf_api.h>
+#ifdef illumos
+#include <sys/cyclic.h>
+#include <sys/int_limits.h>
+#else
+#include <sys/stdint.h>
+#endif
+
+/*
+ * DTrace Universal Constants and Typedefs
+ */
+#define DTRACE_CPUALL -1 /* all CPUs */
+#define DTRACE_IDNONE 0 /* invalid probe identifier */
+#define DTRACE_EPIDNONE 0 /* invalid enabled probe identifier */
+#define DTRACE_AGGIDNONE 0 /* invalid aggregation identifier */
+#define DTRACE_AGGVARIDNONE 0 /* invalid aggregation variable ID */
+#define DTRACE_CACHEIDNONE 0 /* invalid predicate cache */
+#define DTRACE_PROVNONE 0 /* invalid provider identifier */
+#define DTRACE_METAPROVNONE 0 /* invalid meta-provider identifier */
+#define DTRACE_ARGNONE -1 /* invalid argument index */
+
+#define DTRACE_PROVNAMELEN 64
+#define DTRACE_MODNAMELEN 64
+#define DTRACE_FUNCNAMELEN 192
+#define DTRACE_NAMELEN 64
+#define DTRACE_FULLNAMELEN (DTRACE_PROVNAMELEN + DTRACE_MODNAMELEN + \
+ DTRACE_FUNCNAMELEN + DTRACE_NAMELEN + 4)
+#define DTRACE_ARGTYPELEN 128
+
+typedef uint32_t dtrace_id_t; /* probe identifier */
+typedef uint32_t dtrace_epid_t; /* enabled probe identifier */
+typedef uint32_t dtrace_aggid_t; /* aggregation identifier */
+typedef int64_t dtrace_aggvarid_t; /* aggregation variable identifier */
+typedef uint16_t dtrace_actkind_t; /* action kind */
+typedef int64_t dtrace_optval_t; /* option value */
+typedef uint32_t dtrace_cacheid_t; /* predicate cache identifier */
+
+typedef enum dtrace_probespec {
+ DTRACE_PROBESPEC_NONE = -1,
+ DTRACE_PROBESPEC_PROVIDER = 0,
+ DTRACE_PROBESPEC_MOD,
+ DTRACE_PROBESPEC_FUNC,
+ DTRACE_PROBESPEC_NAME
+} dtrace_probespec_t;
+
+/*
+ * DTrace Intermediate Format (DIF)
+ *
+ * The following definitions describe the DTrace Intermediate Format (DIF),
+ * a RISC-like instruction set and program encoding used to represent
+ * predicates and actions that can be bound to DTrace probes. The constants
+ * below defining the number of available registers are suggested minimums; the
+ * compiler should use DTRACEIOC_CONF to dynamically obtain the number of
+ * registers provided by the current DTrace implementation.
+ */
+#define DIF_VERSION_1 1 /* DIF version 1: Solaris 10 Beta */
+#define DIF_VERSION_2 2 /* DIF version 2: Solaris 10 FCS */
+#define DIF_VERSION DIF_VERSION_2 /* latest DIF instruction set version */
+#define DIF_DIR_NREGS 8 /* number of DIF integer registers */
+#define DIF_DTR_NREGS 8 /* number of DIF tuple registers */
+
+#define DIF_OP_OR 1 /* or r1, r2, rd */
+#define DIF_OP_XOR 2 /* xor r1, r2, rd */
+#define DIF_OP_AND 3 /* and r1, r2, rd */
+#define DIF_OP_SLL 4 /* sll r1, r2, rd */
+#define DIF_OP_SRL 5 /* srl r1, r2, rd */
+#define DIF_OP_SUB 6 /* sub r1, r2, rd */
+#define DIF_OP_ADD 7 /* add r1, r2, rd */
+#define DIF_OP_MUL 8 /* mul r1, r2, rd */
+#define DIF_OP_SDIV 9 /* sdiv r1, r2, rd */
+#define DIF_OP_UDIV 10 /* udiv r1, r2, rd */
+#define DIF_OP_SREM 11 /* srem r1, r2, rd */
+#define DIF_OP_UREM 12 /* urem r1, r2, rd */
+#define DIF_OP_NOT 13 /* not r1, rd */
+#define DIF_OP_MOV 14 /* mov r1, rd */
+#define DIF_OP_CMP 15 /* cmp r1, r2 */
+#define DIF_OP_TST 16 /* tst r1 */
+#define DIF_OP_BA 17 /* ba label */
+#define DIF_OP_BE 18 /* be label */
+#define DIF_OP_BNE 19 /* bne label */
+#define DIF_OP_BG 20 /* bg label */
+#define DIF_OP_BGU 21 /* bgu label */
+#define DIF_OP_BGE 22 /* bge label */
+#define DIF_OP_BGEU 23 /* bgeu label */
+#define DIF_OP_BL 24 /* bl label */
+#define DIF_OP_BLU 25 /* blu label */
+#define DIF_OP_BLE 26 /* ble label */
+#define DIF_OP_BLEU 27 /* bleu label */
+#define DIF_OP_LDSB 28 /* ldsb [r1], rd */
+#define DIF_OP_LDSH 29 /* ldsh [r1], rd */
+#define DIF_OP_LDSW 30 /* ldsw [r1], rd */
+#define DIF_OP_LDUB 31 /* ldub [r1], rd */
+#define DIF_OP_LDUH 32 /* lduh [r1], rd */
+#define DIF_OP_LDUW 33 /* lduw [r1], rd */
+#define DIF_OP_LDX 34 /* ldx [r1], rd */
+#define DIF_OP_RET 35 /* ret rd */
+#define DIF_OP_NOP 36 /* nop */
+#define DIF_OP_SETX 37 /* setx intindex, rd */
+#define DIF_OP_SETS 38 /* sets strindex, rd */
+#define DIF_OP_SCMP 39 /* scmp r1, r2 */
+#define DIF_OP_LDGA 40 /* ldga var, ri, rd */
+#define DIF_OP_LDGS 41 /* ldgs var, rd */
+#define DIF_OP_STGS 42 /* stgs var, rs */
+#define DIF_OP_LDTA 43 /* ldta var, ri, rd */
+#define DIF_OP_LDTS 44 /* ldts var, rd */
+#define DIF_OP_STTS 45 /* stts var, rs */
+#define DIF_OP_SRA 46 /* sra r1, r2, rd */
+#define DIF_OP_CALL 47 /* call subr, rd */
+#define DIF_OP_PUSHTR 48 /* pushtr type, rs, rr */
+#define DIF_OP_PUSHTV 49 /* pushtv type, rs, rv */
+#define DIF_OP_POPTS 50 /* popts */
+#define DIF_OP_FLUSHTS 51 /* flushts */
+#define DIF_OP_LDGAA 52 /* ldgaa var, rd */
+#define DIF_OP_LDTAA 53 /* ldtaa var, rd */
+#define DIF_OP_STGAA 54 /* stgaa var, rs */
+#define DIF_OP_STTAA 55 /* sttaa var, rs */
+#define DIF_OP_LDLS 56 /* ldls var, rd */
+#define DIF_OP_STLS 57 /* stls var, rs */
+#define DIF_OP_ALLOCS 58 /* allocs r1, rd */
+#define DIF_OP_COPYS 59 /* copys r1, r2, rd */
+#define DIF_OP_STB 60 /* stb r1, [rd] */
+#define DIF_OP_STH 61 /* sth r1, [rd] */
+#define DIF_OP_STW 62 /* stw r1, [rd] */
+#define DIF_OP_STX 63 /* stx r1, [rd] */
+#define DIF_OP_ULDSB 64 /* uldsb [r1], rd */
+#define DIF_OP_ULDSH 65 /* uldsh [r1], rd */
+#define DIF_OP_ULDSW 66 /* uldsw [r1], rd */
+#define DIF_OP_ULDUB 67 /* uldub [r1], rd */
+#define DIF_OP_ULDUH 68 /* ulduh [r1], rd */
+#define DIF_OP_ULDUW 69 /* ulduw [r1], rd */
+#define DIF_OP_ULDX 70 /* uldx [r1], rd */
+#define DIF_OP_RLDSB 71 /* rldsb [r1], rd */
+#define DIF_OP_RLDSH 72 /* rldsh [r1], rd */
+#define DIF_OP_RLDSW 73 /* rldsw [r1], rd */
+#define DIF_OP_RLDUB 74 /* rldub [r1], rd */
+#define DIF_OP_RLDUH 75 /* rlduh [r1], rd */
+#define DIF_OP_RLDUW 76 /* rlduw [r1], rd */
+#define DIF_OP_RLDX 77 /* rldx [r1], rd */
+#define DIF_OP_XLATE 78 /* xlate xlrindex, rd */
+#define DIF_OP_XLARG 79 /* xlarg xlrindex, rd */
+
+#define DIF_INTOFF_MAX 0xffff /* highest integer table offset */
+#define DIF_STROFF_MAX 0xffff /* highest string table offset */
+#define DIF_REGISTER_MAX 0xff /* highest register number */
+#define DIF_VARIABLE_MAX 0xffff /* highest variable identifier */
+#define DIF_SUBROUTINE_MAX 0xffff /* highest subroutine code */
+
+#define DIF_VAR_ARRAY_MIN 0x0000 /* lowest numbered array variable */
+#define DIF_VAR_ARRAY_UBASE 0x0080 /* lowest user-defined array */
+#define DIF_VAR_ARRAY_MAX 0x00ff /* highest numbered array variable */
+
+#define DIF_VAR_OTHER_MIN 0x0100 /* lowest numbered scalar or assc */
+#define DIF_VAR_OTHER_UBASE 0x0500 /* lowest user-defined scalar or assc */
+#define DIF_VAR_OTHER_MAX 0xffff /* highest numbered scalar or assc */
+
+#define DIF_VAR_ARGS 0x0000 /* arguments array */
+#define DIF_VAR_REGS 0x0001 /* registers array */
+#define DIF_VAR_UREGS 0x0002 /* user registers array */
+#define DIF_VAR_CURTHREAD 0x0100 /* thread pointer */
+#define DIF_VAR_TIMESTAMP 0x0101 /* timestamp */
+#define DIF_VAR_VTIMESTAMP 0x0102 /* virtual timestamp */
+#define DIF_VAR_IPL 0x0103 /* interrupt priority level */
+#define DIF_VAR_EPID 0x0104 /* enabled probe ID */
+#define DIF_VAR_ID 0x0105 /* probe ID */
+#define DIF_VAR_ARG0 0x0106 /* first argument */
+#define DIF_VAR_ARG1 0x0107 /* second argument */
+#define DIF_VAR_ARG2 0x0108 /* third argument */
+#define DIF_VAR_ARG3 0x0109 /* fourth argument */
+#define DIF_VAR_ARG4 0x010a /* fifth argument */
+#define DIF_VAR_ARG5 0x010b /* sixth argument */
+#define DIF_VAR_ARG6 0x010c /* seventh argument */
+#define DIF_VAR_ARG7 0x010d /* eighth argument */
+#define DIF_VAR_ARG8 0x010e /* ninth argument */
+#define DIF_VAR_ARG9 0x010f /* tenth argument */
+#define DIF_VAR_STACKDEPTH 0x0110 /* stack depth */
+#define DIF_VAR_CALLER 0x0111 /* caller */
+#define DIF_VAR_PROBEPROV 0x0112 /* probe provider */
+#define DIF_VAR_PROBEMOD 0x0113 /* probe module */
+#define DIF_VAR_PROBEFUNC 0x0114 /* probe function */
+#define DIF_VAR_PROBENAME 0x0115 /* probe name */
+#define DIF_VAR_PID 0x0116 /* process ID */
+#define DIF_VAR_TID 0x0117 /* (per-process) thread ID */
+#define DIF_VAR_EXECNAME 0x0118 /* name of executable */
+#define DIF_VAR_ZONENAME 0x0119 /* zone name associated with process */
+#define DIF_VAR_WALLTIMESTAMP 0x011a /* wall-clock timestamp */
+#define DIF_VAR_USTACKDEPTH 0x011b /* user-land stack depth */
+#define DIF_VAR_UCALLER 0x011c /* user-level caller */
+#define DIF_VAR_PPID 0x011d /* parent process ID */
+#define DIF_VAR_UID 0x011e /* process user ID */
+#define DIF_VAR_GID 0x011f /* process group ID */
+#define DIF_VAR_ERRNO 0x0120 /* thread errno */
+#define DIF_VAR_EXECARGS 0x0121 /* process arguments */
+#define DIF_VAR_JID 0x0122 /* process jail id */
+#define DIF_VAR_JAILNAME 0x0123 /* process jail name */
+
+#ifndef illumos
+#define DIF_VAR_CPU 0x0200
+#endif
+
+#define DIF_SUBR_RAND 0
+#define DIF_SUBR_MUTEX_OWNED 1
+#define DIF_SUBR_MUTEX_OWNER 2
+#define DIF_SUBR_MUTEX_TYPE_ADAPTIVE 3
+#define DIF_SUBR_MUTEX_TYPE_SPIN 4
+#define DIF_SUBR_RW_READ_HELD 5
+#define DIF_SUBR_RW_WRITE_HELD 6
+#define DIF_SUBR_RW_ISWRITER 7
+#define DIF_SUBR_COPYIN 8
+#define DIF_SUBR_COPYINSTR 9
+#define DIF_SUBR_SPECULATION 10
+#define DIF_SUBR_PROGENYOF 11
+#define DIF_SUBR_STRLEN 12
+#define DIF_SUBR_COPYOUT 13
+#define DIF_SUBR_COPYOUTSTR 14
+#define DIF_SUBR_ALLOCA 15
+#define DIF_SUBR_BCOPY 16
+#define DIF_SUBR_COPYINTO 17
+#define DIF_SUBR_MSGDSIZE 18
+#define DIF_SUBR_MSGSIZE 19
+#define DIF_SUBR_GETMAJOR 20
+#define DIF_SUBR_GETMINOR 21
+#define DIF_SUBR_DDI_PATHNAME 22
+#define DIF_SUBR_STRJOIN 23
+#define DIF_SUBR_LLTOSTR 24
+#define DIF_SUBR_BASENAME 25
+#define DIF_SUBR_DIRNAME 26
+#define DIF_SUBR_CLEANPATH 27
+#define DIF_SUBR_STRCHR 28
+#define DIF_SUBR_STRRCHR 29
+#define DIF_SUBR_STRSTR 30
+#define DIF_SUBR_STRTOK 31
+#define DIF_SUBR_SUBSTR 32
+#define DIF_SUBR_INDEX 33
+#define DIF_SUBR_RINDEX 34
+#define DIF_SUBR_HTONS 35
+#define DIF_SUBR_HTONL 36
+#define DIF_SUBR_HTONLL 37
+#define DIF_SUBR_NTOHS 38
+#define DIF_SUBR_NTOHL 39
+#define DIF_SUBR_NTOHLL 40
+#define DIF_SUBR_INET_NTOP 41
+#define DIF_SUBR_INET_NTOA 42
+#define DIF_SUBR_INET_NTOA6 43
+#define DIF_SUBR_TOUPPER 44
+#define DIF_SUBR_TOLOWER 45
+#define DIF_SUBR_MEMREF 46
+#define DIF_SUBR_SX_SHARED_HELD 47
+#define DIF_SUBR_SX_EXCLUSIVE_HELD 48
+#define DIF_SUBR_SX_ISEXCLUSIVE 49
+#define DIF_SUBR_MEMSTR 50
+#define DIF_SUBR_GETF 51
+#define DIF_SUBR_JSON 52
+#define DIF_SUBR_STRTOLL 53
+#define DIF_SUBR_MAX 53 /* max subroutine value */
+
+typedef uint32_t dif_instr_t;
+
+#define DIF_INSTR_OP(i) (((i) >> 24) & 0xff)
+#define DIF_INSTR_R1(i) (((i) >> 16) & 0xff)
+#define DIF_INSTR_R2(i) (((i) >> 8) & 0xff)
+#define DIF_INSTR_RD(i) ((i) & 0xff)
+#define DIF_INSTR_RS(i) ((i) & 0xff)
+#define DIF_INSTR_LABEL(i) ((i) & 0xffffff)
+#define DIF_INSTR_VAR(i) (((i) >> 8) & 0xffff)
+#define DIF_INSTR_INTEGER(i) (((i) >> 8) & 0xffff)
+#define DIF_INSTR_STRING(i) (((i) >> 8) & 0xffff)
+#define DIF_INSTR_SUBR(i) (((i) >> 8) & 0xffff)
+#define DIF_INSTR_TYPE(i) (((i) >> 16) & 0xff)
+#define DIF_INSTR_XLREF(i) (((i) >> 8) & 0xffff)
+
+#define DIF_INSTR_FMT(op, r1, r2, d) \
+ (((op) << 24) | ((r1) << 16) | ((r2) << 8) | (d))
+
+#define DIF_INSTR_NOT(r1, d) (DIF_INSTR_FMT(DIF_OP_NOT, r1, 0, d))
+#define DIF_INSTR_MOV(r1, d) (DIF_INSTR_FMT(DIF_OP_MOV, r1, 0, d))
+#define DIF_INSTR_CMP(op, r1, r2) (DIF_INSTR_FMT(op, r1, r2, 0))
+#define DIF_INSTR_TST(r1) (DIF_INSTR_FMT(DIF_OP_TST, r1, 0, 0))
+#define DIF_INSTR_BRANCH(op, label) (((op) << 24) | (label))
+#define DIF_INSTR_LOAD(op, r1, d) (DIF_INSTR_FMT(op, r1, 0, d))
+#define DIF_INSTR_STORE(op, r1, d) (DIF_INSTR_FMT(op, r1, 0, d))
+#define DIF_INSTR_SETX(i, d) ((DIF_OP_SETX << 24) | ((i) << 8) | (d))
+#define DIF_INSTR_SETS(s, d) ((DIF_OP_SETS << 24) | ((s) << 8) | (d))
+#define DIF_INSTR_RET(d) (DIF_INSTR_FMT(DIF_OP_RET, 0, 0, d))
+#define DIF_INSTR_NOP (DIF_OP_NOP << 24)
+#define DIF_INSTR_LDA(op, v, r, d) (DIF_INSTR_FMT(op, v, r, d))
+#define DIF_INSTR_LDV(op, v, d) (((op) << 24) | ((v) << 8) | (d))
+#define DIF_INSTR_STV(op, v, rs) (((op) << 24) | ((v) << 8) | (rs))
+#define DIF_INSTR_CALL(s, d) ((DIF_OP_CALL << 24) | ((s) << 8) | (d))
+#define DIF_INSTR_PUSHTS(op, t, r2, rs) (DIF_INSTR_FMT(op, t, r2, rs))
+#define DIF_INSTR_POPTS (DIF_OP_POPTS << 24)
+#define DIF_INSTR_FLUSHTS (DIF_OP_FLUSHTS << 24)
+#define DIF_INSTR_ALLOCS(r1, d) (DIF_INSTR_FMT(DIF_OP_ALLOCS, r1, 0, d))
+#define DIF_INSTR_COPYS(r1, r2, d) (DIF_INSTR_FMT(DIF_OP_COPYS, r1, r2, d))
+#define DIF_INSTR_XLATE(op, r, d) (((op) << 24) | ((r) << 8) | (d))
+
+#define DIF_REG_R0 0 /* %r0 is always set to zero */
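+
+/*
+ * A minimal sketch of how the constructor and accessor macros above
+ * compose (register numbers are illustrative): building a "mov" that
+ * copies %r1 into %r2 and decoding it again:
+ *
+ *	dif_instr_t in = DIF_INSTR_MOV(1, 2);
+ *
+ *	DIF_INSTR_OP(in) == DIF_OP_MOV		(opcode, bits 31..24)
+ *	DIF_INSTR_R1(in) == 1			(source, bits 23..16)
+ *	DIF_INSTR_RD(in) == 2			(destination, bits 7..0)
+ */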
+
+/*
+ * A DTrace Intermediate Format Type (DIF Type) is used to represent the types
+ * of variables, function and associative array arguments, and the return type
+ * for each DIF object (shown below). It contains a description of the type,
+ * its size in bytes, and a module identifier.
+ */
+typedef struct dtrace_diftype {
+ uint8_t dtdt_kind; /* type kind (see below) */
+ uint8_t dtdt_ckind; /* type kind in CTF */
+ uint8_t dtdt_flags; /* type flags (see below) */
+ uint8_t dtdt_pad; /* reserved for future use */
+ uint32_t dtdt_size; /* type size in bytes (unless string) */
+} dtrace_diftype_t;
+
+#define DIF_TYPE_CTF 0 /* type is a CTF type */
+#define DIF_TYPE_STRING 1 /* type is a D string */
+
+#define DIF_TF_BYREF 0x1 /* type is passed by reference */
+#define DIF_TF_BYUREF 0x2 /* user type is passed by reference */
+
+/*
+ * A DTrace Intermediate Format variable record is used to describe each of the
+ * variables referenced by a given DIF object. It contains an integer variable
+ * identifier along with variable scope and properties, as shown below. The
+ * size of this structure must be sizeof (int) aligned.
+ */
+typedef struct dtrace_difv {
+ uint32_t dtdv_name; /* variable name index in dtdo_strtab */
+ uint32_t dtdv_id; /* variable reference identifier */
+ uint8_t dtdv_kind; /* variable kind (see below) */
+ uint8_t dtdv_scope; /* variable scope (see below) */
+ uint16_t dtdv_flags; /* variable flags (see below) */
+ dtrace_diftype_t dtdv_type; /* variable type (see above) */
+} dtrace_difv_t;
+
+#define DIFV_KIND_ARRAY 0 /* variable is an array of quantities */
+#define DIFV_KIND_SCALAR 1 /* variable is a scalar quantity */
+
+#define DIFV_SCOPE_GLOBAL 0 /* variable has global scope */
+#define DIFV_SCOPE_THREAD 1 /* variable has thread scope */
+#define DIFV_SCOPE_LOCAL 2 /* variable has local scope */
+
+#define DIFV_F_REF 0x1 /* variable is referenced by DIFO */
+#define DIFV_F_MOD 0x2 /* variable is written by DIFO */
+
+/*
+ * DTrace Actions
+ *
+ * The upper byte determines the class of the action; the low bytes determine
+ * the specific action within that class. The classes of actions are as
+ * follows:
+ *
+ * [ no class ] <= May record process- or kernel-related data
+ * DTRACEACT_PROC <= Only records process-related data
+ * DTRACEACT_PROC_DESTRUCTIVE <= Potentially destructive to processes
+ * DTRACEACT_KERNEL <= Only records kernel-related data
+ * DTRACEACT_KERNEL_DESTRUCTIVE <= Potentially destructive to the kernel
+ * DTRACEACT_SPECULATIVE <= Speculation-related action
+ * DTRACEACT_AGGREGATION <= Aggregating action
+ */
+#define DTRACEACT_NONE 0 /* no action */
+#define DTRACEACT_DIFEXPR 1 /* action is DIF expression */
+#define DTRACEACT_EXIT 2 /* exit() action */
+#define DTRACEACT_PRINTF 3 /* printf() action */
+#define DTRACEACT_PRINTA 4 /* printa() action */
+#define DTRACEACT_LIBACT 5 /* library-controlled action */
+#define DTRACEACT_TRACEMEM 6 /* tracemem() action */
+#define DTRACEACT_TRACEMEM_DYNSIZE 7 /* dynamic tracemem() size */
+#define DTRACEACT_PRINTM 8 /* printm() action (BSD) */
+
+#define DTRACEACT_PROC 0x0100
+#define DTRACEACT_USTACK (DTRACEACT_PROC + 1)
+#define DTRACEACT_JSTACK (DTRACEACT_PROC + 2)
+#define DTRACEACT_USYM (DTRACEACT_PROC + 3)
+#define DTRACEACT_UMOD (DTRACEACT_PROC + 4)
+#define DTRACEACT_UADDR (DTRACEACT_PROC + 5)
+
+#define DTRACEACT_PROC_DESTRUCTIVE 0x0200
+#define DTRACEACT_STOP (DTRACEACT_PROC_DESTRUCTIVE + 1)
+#define DTRACEACT_RAISE (DTRACEACT_PROC_DESTRUCTIVE + 2)
+#define DTRACEACT_SYSTEM (DTRACEACT_PROC_DESTRUCTIVE + 3)
+#define DTRACEACT_FREOPEN (DTRACEACT_PROC_DESTRUCTIVE + 4)
+
+#define DTRACEACT_PROC_CONTROL 0x0300
+
+#define DTRACEACT_KERNEL 0x0400
+#define DTRACEACT_STACK (DTRACEACT_KERNEL + 1)
+#define DTRACEACT_SYM (DTRACEACT_KERNEL + 2)
+#define DTRACEACT_MOD (DTRACEACT_KERNEL + 3)
+
+#define DTRACEACT_KERNEL_DESTRUCTIVE 0x0500
+#define DTRACEACT_BREAKPOINT (DTRACEACT_KERNEL_DESTRUCTIVE + 1)
+#define DTRACEACT_PANIC (DTRACEACT_KERNEL_DESTRUCTIVE + 2)
+#define DTRACEACT_CHILL (DTRACEACT_KERNEL_DESTRUCTIVE + 3)
+
+#define DTRACEACT_SPECULATIVE 0x0600
+#define DTRACEACT_SPECULATE (DTRACEACT_SPECULATIVE + 1)
+#define DTRACEACT_COMMIT (DTRACEACT_SPECULATIVE + 2)
+#define DTRACEACT_DISCARD (DTRACEACT_SPECULATIVE + 3)
+
+#define DTRACEACT_CLASS(x) ((x) & 0xff00)
+
+#define DTRACEACT_ISDESTRUCTIVE(x) \
+ (DTRACEACT_CLASS(x) == DTRACEACT_PROC_DESTRUCTIVE || \
+ DTRACEACT_CLASS(x) == DTRACEACT_KERNEL_DESTRUCTIVE)
+
+#define DTRACEACT_ISSPECULATIVE(x) \
+ (DTRACEACT_CLASS(x) == DTRACEACT_SPECULATIVE)
+
+#define DTRACEACT_ISPRINTFLIKE(x) \
+ ((x) == DTRACEACT_PRINTF || (x) == DTRACEACT_PRINTA || \
+ (x) == DTRACEACT_SYSTEM || (x) == DTRACEACT_FREOPEN)
+
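+/*
+ * A sketch of how the class predicates above compose:
+ * DTRACEACT_CLASS(DTRACEACT_PANIC) is DTRACEACT_KERNEL_DESTRUCTIVE, so
+ * DTRACEACT_ISDESTRUCTIVE(DTRACEACT_PANIC) holds; DTRACEACT_STACK is in
+ * class DTRACEACT_KERNEL, so it does not.  DTRACEACT_ISPRINTFLIKE()
+ * matches exactly printf(), printa(), system() and freopen().
+ */
+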
+/*
+ * DTrace Aggregating Actions
+ *
+ * These are functions f(x) for which the following is true:
+ *
+ * f(f(x_0) U f(x_1) U ... U f(x_n)) = f(x_0 U x_1 U ... U x_n)
+ *
+ * where x_n is a set of arbitrary data. Aggregating actions are in their own
+ * DTrace action class, DTRACEACT_AGGREGATION. The macros provided here allow
+ * for easier processing of the aggregation argument and data payload for a few
+ * aggregating actions (notably: quantize(), lquantize(), and ustack()).
+ */
+#define DTRACEACT_AGGREGATION 0x0700
+#define DTRACEAGG_COUNT (DTRACEACT_AGGREGATION + 1)
+#define DTRACEAGG_MIN (DTRACEACT_AGGREGATION + 2)
+#define DTRACEAGG_MAX (DTRACEACT_AGGREGATION + 3)
+#define DTRACEAGG_AVG (DTRACEACT_AGGREGATION + 4)
+#define DTRACEAGG_SUM (DTRACEACT_AGGREGATION + 5)
+#define DTRACEAGG_STDDEV (DTRACEACT_AGGREGATION + 6)
+#define DTRACEAGG_QUANTIZE (DTRACEACT_AGGREGATION + 7)
+#define DTRACEAGG_LQUANTIZE (DTRACEACT_AGGREGATION + 8)
+#define DTRACEAGG_LLQUANTIZE (DTRACEACT_AGGREGATION + 9)
+
+#define DTRACEACT_ISAGG(x) \
+ (DTRACEACT_CLASS(x) == DTRACEACT_AGGREGATION)
+
+#define DTRACE_QUANTIZE_NBUCKETS \
+ (((sizeof (uint64_t) * NBBY) - 1) * 2 + 1)
+
+#define DTRACE_QUANTIZE_ZEROBUCKET ((sizeof (uint64_t) * NBBY) - 1)
+
+#define DTRACE_QUANTIZE_BUCKETVAL(buck) \
+ (int64_t)((buck) < DTRACE_QUANTIZE_ZEROBUCKET ? \
+ -(1LL << (DTRACE_QUANTIZE_ZEROBUCKET - 1 - (buck))) : \
+ (buck) == DTRACE_QUANTIZE_ZEROBUCKET ? 0 : \
+ 1LL << ((buck) - DTRACE_QUANTIZE_ZEROBUCKET - 1))
+
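+/*
+ * A worked sketch of the bucket layout (with NBBY == 8, the zero bucket
+ * is index 63 of the 127 buckets):
+ *
+ *	DTRACE_QUANTIZE_BUCKETVAL(0)   == -(1LL << 62)
+ *	DTRACE_QUANTIZE_BUCKETVAL(62)  == -1
+ *	DTRACE_QUANTIZE_BUCKETVAL(63)  == 0
+ *	DTRACE_QUANTIZE_BUCKETVAL(64)  == 1
+ *	DTRACE_QUANTIZE_BUCKETVAL(126) == 1LL << 62
+ */
+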
+#define DTRACE_LQUANTIZE_STEPSHIFT 48
+#define DTRACE_LQUANTIZE_STEPMASK ((uint64_t)UINT16_MAX << 48)
+#define DTRACE_LQUANTIZE_LEVELSHIFT 32
+#define DTRACE_LQUANTIZE_LEVELMASK ((uint64_t)UINT16_MAX << 32)
+#define DTRACE_LQUANTIZE_BASESHIFT 0
+#define DTRACE_LQUANTIZE_BASEMASK UINT32_MAX
+
+#define DTRACE_LQUANTIZE_STEP(x) \
+ (uint16_t)(((x) & DTRACE_LQUANTIZE_STEPMASK) >> \
+ DTRACE_LQUANTIZE_STEPSHIFT)
+
+#define DTRACE_LQUANTIZE_LEVELS(x) \
+ (uint16_t)(((x) & DTRACE_LQUANTIZE_LEVELMASK) >> \
+ DTRACE_LQUANTIZE_LEVELSHIFT)
+
+#define DTRACE_LQUANTIZE_BASE(x) \
+ (int32_t)(((x) & DTRACE_LQUANTIZE_BASEMASK) >> \
+ DTRACE_LQUANTIZE_BASESHIFT)
+
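+/*
+ * A sketch of the encoding these accessors invert (parameter values are
+ * illustrative): an lquantize() argument with base 0, 10 levels and
+ * step 10 would be packed as
+ *
+ *	uint64_t arg = ((uint64_t)10 << DTRACE_LQUANTIZE_STEPSHIFT) |
+ *	    ((uint64_t)10 << DTRACE_LQUANTIZE_LEVELSHIFT) | (uint32_t)0;
+ *
+ * and DTRACE_LQUANTIZE_STEP(arg), DTRACE_LQUANTIZE_LEVELS(arg) and
+ * DTRACE_LQUANTIZE_BASE(arg) recover 10, 10 and 0 respectively.  The
+ * llquantize() macros below follow the same pattern with four 16-bit
+ * fields.
+ */
+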
+#define DTRACE_LLQUANTIZE_FACTORSHIFT 48
+#define DTRACE_LLQUANTIZE_FACTORMASK ((uint64_t)UINT16_MAX << 48)
+#define DTRACE_LLQUANTIZE_LOWSHIFT 32
+#define DTRACE_LLQUANTIZE_LOWMASK ((uint64_t)UINT16_MAX << 32)
+#define DTRACE_LLQUANTIZE_HIGHSHIFT 16
+#define DTRACE_LLQUANTIZE_HIGHMASK ((uint64_t)UINT16_MAX << 16)
+#define DTRACE_LLQUANTIZE_NSTEPSHIFT 0
+#define DTRACE_LLQUANTIZE_NSTEPMASK UINT16_MAX
+
+#define DTRACE_LLQUANTIZE_FACTOR(x) \
+ (uint16_t)(((x) & DTRACE_LLQUANTIZE_FACTORMASK) >> \
+ DTRACE_LLQUANTIZE_FACTORSHIFT)
+
+#define DTRACE_LLQUANTIZE_LOW(x) \
+ (uint16_t)(((x) & DTRACE_LLQUANTIZE_LOWMASK) >> \
+ DTRACE_LLQUANTIZE_LOWSHIFT)
+
+#define DTRACE_LLQUANTIZE_HIGH(x) \
+ (uint16_t)(((x) & DTRACE_LLQUANTIZE_HIGHMASK) >> \
+ DTRACE_LLQUANTIZE_HIGHSHIFT)
+
+#define DTRACE_LLQUANTIZE_NSTEP(x) \
+ (uint16_t)(((x) & DTRACE_LLQUANTIZE_NSTEPMASK) >> \
+ DTRACE_LLQUANTIZE_NSTEPSHIFT)
+
+#define DTRACE_USTACK_NFRAMES(x) (uint32_t)((x) & UINT32_MAX)
+#define DTRACE_USTACK_STRSIZE(x) (uint32_t)((x) >> 32)
+#define DTRACE_USTACK_ARG(x, y) \
+ ((((uint64_t)(y)) << 32) | ((x) & UINT32_MAX))
+
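+/*
+ * A sketch of the round-trip (values illustrative): DTRACE_USTACK_ARG(100,
+ * 2048) encodes a ustack()/jstack() argument requesting 100 frames and a
+ * 2048-byte string table; DTRACE_USTACK_NFRAMES() and
+ * DTRACE_USTACK_STRSIZE() recover 100 and 2048 from it.
+ */
+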
+#ifndef _LP64
+#if BYTE_ORDER == _BIG_ENDIAN
+#define DTRACE_PTR(type, name) uint32_t name##pad; type *name
+#else
+#define DTRACE_PTR(type, name) type *name; uint32_t name##pad
+#endif
+#else
+#define DTRACE_PTR(type, name) type *name
+#endif
+
+/*
+ * DTrace Object Format (DOF)
+ *
+ * DTrace programs can be persistently encoded in the DOF format so that they
+ * may be embedded in other programs (for example, in an ELF file) or in the
+ * dtrace driver configuration file for use in anonymous tracing. The DOF
+ * format is versioned and extensible so that it can be revised and so that
+ * internal data structures can be modified or extended compatibly. All DOF
+ * structures use fixed-size types, so the 32-bit and 64-bit representations
+ * are identical and consumers can use either data model transparently.
+ *
+ * The file layout is structured as follows:
+ *
+ * +---------------+-------------------+----- ... ----+---- ... ------+
+ * | dof_hdr_t | dof_sec_t[ ... ] | loadable | non-loadable |
+ * | (file header) | (section headers) | section data | section data |
+ * +---------------+-------------------+----- ... ----+---- ... ------+
+ * |<------------ dof_hdr.dofh_loadsz --------------->| |
+ * |<------------ dof_hdr.dofh_filesz ------------------------------->|
+ *
+ * The file header stores meta-data including a magic number, data model for
+ * the instrumentation, data encoding, and properties of the DIF code within.
+ * The header describes its own size and the size of the section headers. By
+ * convention, an array of section headers follows the file header, and then
+ * the data for all loadable sections and unloadable sections. This permits
+ * consumer code to easily download the headers and all loadable data into the
+ * DTrace driver in one contiguous chunk, omitting other extraneous sections.
+ *
+ * The section headers describe the size, offset, alignment, and section type
+ * for each section. Sections are described using a set of #defines that tell
+ * the consumer what kind of data is expected. Sections can contain links to
+ * other sections by storing a dof_secidx_t, an index into the section header
+ * array, inside of the section data structures. The section header includes
+ * an entry size so that sections with data arrays can grow their structures.
+ *
+ * The DOF data itself can contain many snippets of DIF (i.e. >1 DIFOs), which
+ * are represented themselves as a collection of related DOF sections. This
+ * permits us to change the set of sections associated with a DIFO over time,
+ * and also permits us to encode DIFOs that contain different sets of sections.
+ * When a DOF section wants to refer to a DIFO, it stores the dof_secidx_t of a
+ * section of type DOF_SECT_DIFOHDR. This section's data is then an array of
+ * dof_secidx_t's which in turn denote the sections associated with this DIFO.
+ *
+ * This loose coupling of the file structure (header and sections) to the
+ * structure of the DTrace program itself (ECB descriptions, action
+ * descriptions, and DIFOs) permits activities such as relocation processing
+ * to occur in a single pass without having to understand D program structure.
+ *
+ * Finally, strings are always stored in ELF-style string tables along with a
+ * string table section index and string table offset. Therefore strings in
+ * DOF are always arbitrary-length and not bound to the current implementation.
+ */
+
+#define DOF_ID_SIZE 16 /* total size of dofh_ident[] in bytes */
+
+typedef struct dof_hdr {
+ uint8_t dofh_ident[DOF_ID_SIZE]; /* identification bytes (see below) */
+ uint32_t dofh_flags; /* file attribute flags (if any) */
+ uint32_t dofh_hdrsize; /* size of file header in bytes */
+ uint32_t dofh_secsize; /* size of section header in bytes */
+ uint32_t dofh_secnum; /* number of section headers */
+ uint64_t dofh_secoff; /* file offset of section headers */
+ uint64_t dofh_loadsz; /* file size of loadable portion */
+ uint64_t dofh_filesz; /* file size of entire DOF file */
+ uint64_t dofh_pad; /* reserved for future use */
+} dof_hdr_t;
+
+#define DOF_ID_MAG0 0 /* first byte of magic number */
+#define DOF_ID_MAG1 1 /* second byte of magic number */
+#define DOF_ID_MAG2 2 /* third byte of magic number */
+#define DOF_ID_MAG3 3 /* fourth byte of magic number */
+#define DOF_ID_MODEL 4 /* DOF data model (see below) */
+#define DOF_ID_ENCODING 5 /* DOF data encoding (see below) */
+#define DOF_ID_VERSION 6 /* DOF file format major version (see below) */
+#define DOF_ID_DIFVERS 7 /* DIF instruction set version */
+#define DOF_ID_DIFIREG 8 /* DIF integer registers used by compiler */
+#define DOF_ID_DIFTREG 9 /* DIF tuple registers used by compiler */
+#define DOF_ID_PAD 10 /* start of padding bytes (all zeroes) */
+
+#define DOF_MAG_MAG0 0x7F /* DOF_ID_MAG[0-3] */
+#define DOF_MAG_MAG1 'D'
+#define DOF_MAG_MAG2 'O'
+#define DOF_MAG_MAG3 'F'
+
+#define DOF_MAG_STRING "\177DOF"
+#define DOF_MAG_STRLEN 4
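+
+/*
+ * A minimal validation sketch (the buffer name is hypothetical): a
+ * consumer handed a candidate DOF image could begin by checking the
+ * identification bytes:
+ *
+ *	dof_hdr_t *dof = (dof_hdr_t *)buf;
+ *
+ *	if (bcmp(dof->dofh_ident, DOF_MAG_STRING, DOF_MAG_STRLEN) != 0)
+ *		reject: not a DOF image
+ */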
+
+#define DOF_MODEL_NONE 0 /* DOF_ID_MODEL */
+#define DOF_MODEL_ILP32 1
+#define DOF_MODEL_LP64 2
+
+#ifdef _LP64
+#define DOF_MODEL_NATIVE DOF_MODEL_LP64
+#else
+#define DOF_MODEL_NATIVE DOF_MODEL_ILP32
+#endif
+
+#define DOF_ENCODE_NONE 0 /* DOF_ID_ENCODING */
+#define DOF_ENCODE_LSB 1
+#define DOF_ENCODE_MSB 2
+
+#if BYTE_ORDER == _BIG_ENDIAN
+#define DOF_ENCODE_NATIVE DOF_ENCODE_MSB
+#else
+#define DOF_ENCODE_NATIVE DOF_ENCODE_LSB
+#endif
+
+#define DOF_VERSION_1 1 /* DOF version 1: Solaris 10 FCS */
+#define DOF_VERSION_2 2 /* DOF version 2: Solaris Express 6/06 */
+#define DOF_VERSION DOF_VERSION_2 /* Latest DOF version */
+
+#define DOF_FL_VALID 0 /* mask of all valid dofh_flags bits */
+
+typedef uint32_t dof_secidx_t; /* section header table index type */
+typedef uint32_t dof_stridx_t; /* string table index type */
+
+#define DOF_SECIDX_NONE (-1U) /* null value for section indices */
+#define DOF_STRIDX_NONE (-1U) /* null value for string indices */
+
+typedef struct dof_sec {
+ uint32_t dofs_type; /* section type (see below) */
+ uint32_t dofs_align; /* section data memory alignment */
+ uint32_t dofs_flags; /* section flags (if any) */
+ uint32_t dofs_entsize; /* size of section entry (if table) */
+ uint64_t dofs_offset; /* offset of section data within file */
+ uint64_t dofs_size; /* size of section data in bytes */
+} dof_sec_t;
+
+#define DOF_SECT_NONE 0 /* null section */
+#define DOF_SECT_COMMENTS 1 /* compiler comments */
+#define DOF_SECT_SOURCE 2 /* D program source code */
+#define DOF_SECT_ECBDESC 3 /* dof_ecbdesc_t */
+#define DOF_SECT_PROBEDESC 4 /* dof_probedesc_t */
+#define DOF_SECT_ACTDESC 5 /* dof_actdesc_t array */
+#define DOF_SECT_DIFOHDR 6 /* dof_difohdr_t (variable length) */
+#define DOF_SECT_DIF 7 /* uint32_t array of byte code */
+#define DOF_SECT_STRTAB 8 /* string table */
+#define DOF_SECT_VARTAB 9 /* dtrace_difv_t array */
+#define DOF_SECT_RELTAB 10 /* dof_relodesc_t array */
+#define DOF_SECT_TYPTAB 11 /* dtrace_diftype_t array */
+#define DOF_SECT_URELHDR 12 /* dof_relohdr_t (user relocations) */
+#define DOF_SECT_KRELHDR 13 /* dof_relohdr_t (kernel relocations) */
+#define DOF_SECT_OPTDESC 14 /* dof_optdesc_t array */
+#define DOF_SECT_PROVIDER 15 /* dof_provider_t */
+#define DOF_SECT_PROBES 16 /* dof_probe_t array */
+#define DOF_SECT_PRARGS 17 /* uint8_t array (probe arg mappings) */
+#define DOF_SECT_PROFFS 18 /* uint32_t array (probe arg offsets) */
+#define DOF_SECT_INTTAB 19 /* uint64_t array */
+#define DOF_SECT_UTSNAME 20 /* struct utsname */
+#define DOF_SECT_XLTAB 21 /* dof_xlref_t array */
+#define DOF_SECT_XLMEMBERS 22 /* dof_xlmember_t array */
+#define DOF_SECT_XLIMPORT 23 /* dof_xlator_t */
+#define DOF_SECT_XLEXPORT 24 /* dof_xlator_t */
+#define DOF_SECT_PREXPORT 25 /* dof_secidx_t array (exported objs) */
+#define DOF_SECT_PRENOFFS 26 /* uint32_t array (enabled offsets) */
+
+#define DOF_SECF_LOAD 1 /* section should be loaded */
+
+#define DOF_SEC_ISLOADABLE(x) \
+ (((x) == DOF_SECT_ECBDESC) || ((x) == DOF_SECT_PROBEDESC) || \
+ ((x) == DOF_SECT_ACTDESC) || ((x) == DOF_SECT_DIFOHDR) || \
+ ((x) == DOF_SECT_DIF) || ((x) == DOF_SECT_STRTAB) || \
+ ((x) == DOF_SECT_VARTAB) || ((x) == DOF_SECT_RELTAB) || \
+ ((x) == DOF_SECT_TYPTAB) || ((x) == DOF_SECT_URELHDR) || \
+ ((x) == DOF_SECT_KRELHDR) || ((x) == DOF_SECT_OPTDESC) || \
+ ((x) == DOF_SECT_PROVIDER) || ((x) == DOF_SECT_PROBES) || \
+ ((x) == DOF_SECT_PRARGS) || ((x) == DOF_SECT_PROFFS) || \
+ ((x) == DOF_SECT_INTTAB) || ((x) == DOF_SECT_XLTAB) || \
+ ((x) == DOF_SECT_XLMEMBERS) || ((x) == DOF_SECT_XLIMPORT) || \
+ ((x) == DOF_SECT_XLEXPORT) || ((x) == DOF_SECT_PREXPORT) || \
+ ((x) == DOF_SECT_PRENOFFS))
+
+typedef struct dof_ecbdesc {
+ dof_secidx_t dofe_probes; /* link to DOF_SECT_PROBEDESC */
+ dof_secidx_t dofe_pred; /* link to DOF_SECT_DIFOHDR */
+ dof_secidx_t dofe_actions; /* link to DOF_SECT_ACTDESC */
+ uint32_t dofe_pad; /* reserved for future use */
+ uint64_t dofe_uarg; /* user-supplied library argument */
+} dof_ecbdesc_t;
+
+typedef struct dof_probedesc {
+ dof_secidx_t dofp_strtab; /* link to DOF_SECT_STRTAB section */
+ dof_stridx_t dofp_provider; /* provider string */
+ dof_stridx_t dofp_mod; /* module string */
+ dof_stridx_t dofp_func; /* function string */
+ dof_stridx_t dofp_name; /* name string */
+ uint32_t dofp_id; /* probe identifier (or zero) */
+} dof_probedesc_t;
+
+typedef struct dof_actdesc {
+ dof_secidx_t dofa_difo; /* link to DOF_SECT_DIFOHDR */
+ dof_secidx_t dofa_strtab; /* link to DOF_SECT_STRTAB section */
+ uint32_t dofa_kind; /* action kind (DTRACEACT_* constant) */
+ uint32_t dofa_ntuple; /* number of subsequent tuple actions */
+ uint64_t dofa_arg; /* kind-specific argument */
+ uint64_t dofa_uarg; /* user-supplied argument */
+} dof_actdesc_t;
+
+typedef struct dof_difohdr {
+ dtrace_diftype_t dofd_rtype; /* return type for this fragment */
+ dof_secidx_t dofd_links[1]; /* variable length array of indices */
+} dof_difohdr_t;
+
+typedef struct dof_relohdr {
+ dof_secidx_t dofr_strtab; /* link to DOF_SECT_STRTAB for names */
+ dof_secidx_t dofr_relsec; /* link to DOF_SECT_RELTAB for relos */
+ dof_secidx_t dofr_tgtsec; /* link to section we are relocating */
+} dof_relohdr_t;
+
+typedef struct dof_relodesc {
+ dof_stridx_t dofr_name; /* string name of relocation symbol */
+ uint32_t dofr_type; /* relo type (DOF_RELO_* constant) */
+ uint64_t dofr_offset; /* byte offset for relocation */
+ uint64_t dofr_data; /* additional type-specific data */
+} dof_relodesc_t;
+
+#define DOF_RELO_NONE 0 /* empty relocation entry */
+#define DOF_RELO_SETX 1 /* relocate setx value */
+#define DOF_RELO_DOFREL 2 /* relocate DOF-relative value */
+
+typedef struct dof_optdesc {
+ uint32_t dofo_option; /* option identifier */
+ dof_secidx_t dofo_strtab; /* string table, if string option */
+ uint64_t dofo_value; /* option value or string index */
+} dof_optdesc_t;
+
+typedef uint32_t dof_attr_t; /* encoded stability attributes */
+
+#define DOF_ATTR(n, d, c) (((n) << 24) | ((d) << 16) | ((c) << 8))
+#define DOF_ATTR_NAME(a) (((a) >> 24) & 0xff)
+#define DOF_ATTR_DATA(a) (((a) >> 16) & 0xff)
+#define DOF_ATTR_CLASS(a) (((a) >> 8) & 0xff)
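+
+/*
+ * A sketch of the encoding (attribute values are illustrative; the
+ * DTRACE_STABILITY_* and DTRACE_CLASS_* codes are defined later in this
+ * header):
+ *
+ *	dof_attr_t a = DOF_ATTR(DTRACE_STABILITY_EVOLVING,
+ *	    DTRACE_STABILITY_EVOLVING, DTRACE_CLASS_COMMON);
+ *
+ *	DOF_ATTR_NAME(a) == DTRACE_STABILITY_EVOLVING
+ *	DOF_ATTR_DATA(a) == DTRACE_STABILITY_EVOLVING
+ *	DOF_ATTR_CLASS(a) == DTRACE_CLASS_COMMON
+ */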
+
+typedef struct dof_provider {
+ dof_secidx_t dofpv_strtab; /* link to DOF_SECT_STRTAB section */
+ dof_secidx_t dofpv_probes; /* link to DOF_SECT_PROBES section */
+ dof_secidx_t dofpv_prargs; /* link to DOF_SECT_PRARGS section */
+ dof_secidx_t dofpv_proffs; /* link to DOF_SECT_PROFFS section */
+ dof_stridx_t dofpv_name; /* provider name string */
+ dof_attr_t dofpv_provattr; /* provider attributes */
+ dof_attr_t dofpv_modattr; /* module attributes */
+ dof_attr_t dofpv_funcattr; /* function attributes */
+ dof_attr_t dofpv_nameattr; /* name attributes */
+ dof_attr_t dofpv_argsattr; /* args attributes */
+ dof_secidx_t dofpv_prenoffs; /* link to DOF_SECT_PRENOFFS section */
+} dof_provider_t;
+
+typedef struct dof_probe {
+ uint64_t dofpr_addr; /* probe base address or offset */
+ dof_stridx_t dofpr_func; /* probe function string */
+ dof_stridx_t dofpr_name; /* probe name string */
+ dof_stridx_t dofpr_nargv; /* native argument type strings */
+ dof_stridx_t dofpr_xargv; /* translated argument type strings */
+ uint32_t dofpr_argidx; /* index of first argument mapping */
+ uint32_t dofpr_offidx; /* index of first offset entry */
+ uint8_t dofpr_nargc; /* native argument count */
+ uint8_t dofpr_xargc; /* translated argument count */
+ uint16_t dofpr_noffs; /* number of offset entries for probe */
+ uint32_t dofpr_enoffidx; /* index of first is-enabled offset */
+ uint16_t dofpr_nenoffs; /* number of is-enabled offsets */
+ uint16_t dofpr_pad1; /* reserved for future use */
+ uint32_t dofpr_pad2; /* reserved for future use */
+} dof_probe_t;
+
+typedef struct dof_xlator {
+ dof_secidx_t dofxl_members; /* link to DOF_SECT_XLMEMBERS section */
+ dof_secidx_t dofxl_strtab; /* link to DOF_SECT_STRTAB section */
+ dof_stridx_t dofxl_argv; /* input parameter type strings */
+ uint32_t dofxl_argc; /* input parameter list length */
+ dof_stridx_t dofxl_type; /* output type string name */
+ dof_attr_t dofxl_attr; /* output stability attributes */
+} dof_xlator_t;
+
+typedef struct dof_xlmember {
+ dof_secidx_t dofxm_difo; /* member link to DOF_SECT_DIFOHDR */
+ dof_stridx_t dofxm_name; /* member name */
+ dtrace_diftype_t dofxm_type; /* member type */
+} dof_xlmember_t;
+
+typedef struct dof_xlref {
+ dof_secidx_t dofxr_xlator; /* link to DOF_SECT_XLATORS section */
+ uint32_t dofxr_member; /* index of referenced dof_xlmember */
+ uint32_t dofxr_argn; /* index of argument for DIF_OP_XLARG */
+} dof_xlref_t;
+
+/*
+ * DTrace Intermediate Format Object (DIFO)
+ *
+ * A DIFO is used to store the compiled DIF for a D expression, its return
+ * type, and its string and variable tables. The string table is a single
+ * buffer of character data into which "sets" instructions and variable
+ * references can reference strings using a byte offset. The variable table
+ * is an array of dtrace_difv_t structures that describe the name and type of
+ * each variable and the id used in the DIF code. This structure is described
+ * above in the DIF section of this header file. The DIFO is used at both
+ * user-level (in the library) and in the kernel, but the structure is never
+ * passed between the two: the DOF structures form the only interface. As a
+ * result, the definition can change depending on the presence of _KERNEL.
+ */
+typedef struct dtrace_difo {
+ dif_instr_t *dtdo_buf; /* instruction buffer */
+ uint64_t *dtdo_inttab; /* integer table (optional) */
+ char *dtdo_strtab; /* string table (optional) */
+ dtrace_difv_t *dtdo_vartab; /* variable table (optional) */
+ uint_t dtdo_len; /* length of instruction buffer */
+ uint_t dtdo_intlen; /* length of integer table */
+ uint_t dtdo_strlen; /* length of string table */
+ uint_t dtdo_varlen; /* length of variable table */
+ dtrace_diftype_t dtdo_rtype; /* return type */
+ uint_t dtdo_refcnt; /* owner reference count */
+ uint_t dtdo_destructive; /* invokes destructive subroutines */
+#ifndef _KERNEL
+ dof_relodesc_t *dtdo_kreltab; /* kernel relocations */
+ dof_relodesc_t *dtdo_ureltab; /* user relocations */
+ struct dt_node **dtdo_xlmtab; /* translator references */
+ uint_t dtdo_krelen; /* length of krelo table */
+ uint_t dtdo_urelen; /* length of urelo table */
+ uint_t dtdo_xlmlen; /* length of translator table */
+#endif
+} dtrace_difo_t;
+
+/*
+ * DTrace Enabling Description Structures
+ *
+ * When DTrace is tracking the description of a DTrace enabling entity (probe,
+ * predicate, action, ECB, record, etc.), it does so in a description
+ * structure. These structures all end in "desc", and are used at both
+ * user-level and in the kernel -- but (with the exception of
+ * dtrace_probedesc_t) they are never passed between them. Typically,
+ * user-level will use the description structures when assembling an enabling.
+ * It will then distill those description structures into a DOF object (see
+ * above), and send it into the kernel. The kernel will again use the
+ * description structures to create a description of the enabling as it reads
+ * the DOF. When the description is complete, the enabling will be actually
+ * created -- turning it into the structures that represent the enabling
+ * instead of merely describing it. Not surprisingly, the description
+ * structures bear a strong resemblance to the DOF structures that act as their
+ * conduit.
+ */
+struct dtrace_predicate;
+
+typedef struct dtrace_probedesc {
+ dtrace_id_t dtpd_id; /* probe identifier */
+ char dtpd_provider[DTRACE_PROVNAMELEN]; /* probe provider name */
+ char dtpd_mod[DTRACE_MODNAMELEN]; /* probe module name */
+ char dtpd_func[DTRACE_FUNCNAMELEN]; /* probe function name */
+ char dtpd_name[DTRACE_NAMELEN]; /* probe name */
+} dtrace_probedesc_t;
+
+typedef struct dtrace_repldesc {
+ dtrace_probedesc_t dtrpd_match; /* probe descr. to match */
+ dtrace_probedesc_t dtrpd_create; /* probe descr. to create */
+} dtrace_repldesc_t;
+
+typedef struct dtrace_preddesc {
+ dtrace_difo_t *dtpdd_difo; /* pointer to DIF object */
+ struct dtrace_predicate *dtpdd_predicate; /* pointer to predicate */
+} dtrace_preddesc_t;
+
+typedef struct dtrace_actdesc {
+ dtrace_difo_t *dtad_difo; /* pointer to DIF object */
+ struct dtrace_actdesc *dtad_next; /* next action */
+ dtrace_actkind_t dtad_kind; /* kind of action */
+ uint32_t dtad_ntuple; /* number in tuple */
+ uint64_t dtad_arg; /* action argument */
+ uint64_t dtad_uarg; /* user argument */
+ int dtad_refcnt; /* reference count */
+} dtrace_actdesc_t;
+
+typedef struct dtrace_ecbdesc {
+ dtrace_actdesc_t *dted_action; /* action description(s) */
+ dtrace_preddesc_t dted_pred; /* predicate description */
+ dtrace_probedesc_t dted_probe; /* probe description */
+ uint64_t dted_uarg; /* library argument */
+ int dted_refcnt; /* reference count */
+} dtrace_ecbdesc_t;
+
+/*
+ * DTrace Metadata Description Structures
+ *
+ * DTrace separates the trace data stream from the metadata stream. The only
+ * metadata tokens placed in the data stream are the dtrace_rechdr_t (EPID +
+ * timestamp) or (in the case of aggregations) aggregation identifiers. To
+ * determine the structure of the data, DTrace consumers pass the token to the
+ * kernel, and receive in return a corresponding description of the enabled
+ * probe (via the dtrace_eprobedesc structure) or the aggregation (via the
+ * dtrace_aggdesc structure). Both of these structures are expressed in terms
+ * of record descriptions (via the dtrace_recdesc structure) that describe the
+ * exact structure of the data. Some record descriptions may also contain a
+ * format identifier; this additional bit of metadata can be retrieved from the
+ * kernel, for which a format description is returned via the dtrace_fmtdesc
+ * structure. Note that all four of these structures must be bitness-neutral
+ * to allow for a 32-bit DTrace consumer on a 64-bit kernel.
+ */
+typedef struct dtrace_recdesc {
+ dtrace_actkind_t dtrd_action; /* kind of action */
+ uint32_t dtrd_size; /* size of record */
+ uint32_t dtrd_offset; /* offset in ECB's data */
+ uint16_t dtrd_alignment; /* required alignment */
+ uint16_t dtrd_format; /* format, if any */
+ uint64_t dtrd_arg; /* action argument */
+ uint64_t dtrd_uarg; /* user argument */
+} dtrace_recdesc_t;
+
+typedef struct dtrace_eprobedesc {
+ dtrace_epid_t dtepd_epid; /* enabled probe ID */
+ dtrace_id_t dtepd_probeid; /* probe ID */
+ uint64_t dtepd_uarg; /* library argument */
+ uint32_t dtepd_size; /* total size */
+ int dtepd_nrecs; /* number of records */
+ dtrace_recdesc_t dtepd_rec[1]; /* records themselves */
+} dtrace_eprobedesc_t;
+
+typedef struct dtrace_aggdesc {
+ DTRACE_PTR(char, dtagd_name); /* not filled in by kernel */
+ dtrace_aggvarid_t dtagd_varid; /* not filled in by kernel */
+ int dtagd_flags; /* not filled in by kernel */
+ dtrace_aggid_t dtagd_id; /* aggregation ID */
+ dtrace_epid_t dtagd_epid; /* enabled probe ID */
+ uint32_t dtagd_size; /* size in bytes */
+ int dtagd_nrecs; /* number of records */
+ uint32_t dtagd_pad; /* explicit padding */
+ dtrace_recdesc_t dtagd_rec[1]; /* record descriptions */
+} dtrace_aggdesc_t;
+
+typedef struct dtrace_fmtdesc {
+ DTRACE_PTR(char, dtfd_string); /* format string */
+ int dtfd_length; /* length of format string */
+ uint16_t dtfd_format; /* format identifier */
+} dtrace_fmtdesc_t;
+
+#define DTRACE_SIZEOF_EPROBEDESC(desc) \
+ (sizeof (dtrace_eprobedesc_t) + ((desc)->dtepd_nrecs ? \
+ (((desc)->dtepd_nrecs - 1) * sizeof (dtrace_recdesc_t)) : 0))
+
+#define DTRACE_SIZEOF_AGGDESC(desc) \
+ (sizeof (dtrace_aggdesc_t) + ((desc)->dtagd_nrecs ? \
+ (((desc)->dtagd_nrecs - 1) * sizeof (dtrace_recdesc_t)) : 0))
+
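+/*
+ * These macros account for the one-element dtepd_rec[]/dtagd_rec[]
+ * arrays declared above: a description carrying N records occupies the
+ * base structure plus N - 1 additional dtrace_recdesc_t entries.  For
+ * example, with dtepd_nrecs == 3,
+ *
+ *	DTRACE_SIZEOF_EPROBEDESC(desc) ==
+ *	    sizeof (dtrace_eprobedesc_t) + 2 * sizeof (dtrace_recdesc_t)
+ */
+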
+/*
+ * DTrace Option Interface
+ *
+ * Run-time DTrace options are set and retrieved via DOF_SECT_OPTDESC sections
+ * in a DOF image. The dof_optdesc structure contains an option identifier and
+ * an option value. The valid option identifiers are found below; the mapping
+ * between option identifiers and option identifying strings is maintained at
+ * user-level. Note that the value of DTRACEOPT_UNSET is such that all of the
+ * following are potentially valid option values: all positive integers, zero
+ * and negative one. Some options (notably "bufpolicy" and "bufresize") take
+ * predefined tokens as their values; these are defined with
+ * DTRACEOPT_{option}_{token}.
+ */
+#define DTRACEOPT_BUFSIZE 0 /* buffer size */
+#define DTRACEOPT_BUFPOLICY 1 /* buffer policy */
+#define DTRACEOPT_DYNVARSIZE 2 /* dynamic variable size */
+#define DTRACEOPT_AGGSIZE 3 /* aggregation size */
+#define DTRACEOPT_SPECSIZE 4 /* speculation size */
+#define DTRACEOPT_NSPEC 5 /* number of speculations */
+#define DTRACEOPT_STRSIZE 6 /* string size */
+#define DTRACEOPT_CLEANRATE 7 /* dynvar cleaning rate */
+#define DTRACEOPT_CPU 8 /* CPU to trace */
+#define DTRACEOPT_BUFRESIZE 9 /* buffer resizing policy */
+#define DTRACEOPT_GRABANON 10 /* grab anonymous state, if any */
+#define DTRACEOPT_FLOWINDENT 11 /* indent function entry/return */
+#define DTRACEOPT_QUIET 12 /* only output explicitly traced data */
+#define DTRACEOPT_STACKFRAMES 13 /* number of stack frames */
+#define DTRACEOPT_USTACKFRAMES 14 /* number of user stack frames */
+#define DTRACEOPT_AGGRATE 15 /* aggregation snapshot rate */
+#define DTRACEOPT_SWITCHRATE 16 /* buffer switching rate */
+#define DTRACEOPT_STATUSRATE 17 /* status rate */
+#define DTRACEOPT_DESTRUCTIVE 18 /* destructive actions allowed */
+#define DTRACEOPT_STACKINDENT 19 /* output indent for stack traces */
+#define DTRACEOPT_RAWBYTES 20 /* always print bytes in raw form */
+#define DTRACEOPT_JSTACKFRAMES 21 /* number of jstack() frames */
+#define DTRACEOPT_JSTACKSTRSIZE 22 /* size of jstack() string table */
+#define DTRACEOPT_AGGSORTKEY 23 /* sort aggregations by key */
+#define DTRACEOPT_AGGSORTREV 24 /* reverse-sort aggregations */
+#define DTRACEOPT_AGGSORTPOS 25 /* agg. position to sort on */
+#define DTRACEOPT_AGGSORTKEYPOS 26 /* agg. key position to sort on */
+#define DTRACEOPT_TEMPORAL 27 /* temporally ordered output */
+#define DTRACEOPT_AGGHIST 28 /* histogram aggregation output */
+#define DTRACEOPT_AGGPACK 29 /* packed aggregation output */
+#define DTRACEOPT_AGGZOOM 30 /* zoomed aggregation scaling */
+#define DTRACEOPT_ZONE 31 /* zone in which to enable probes */
+#define DTRACEOPT_MAX 32 /* number of options */
+
+#define DTRACEOPT_UNSET (dtrace_optval_t)-2 /* unset option */
+
+#define DTRACEOPT_BUFPOLICY_RING 0 /* ring buffer */
+#define DTRACEOPT_BUFPOLICY_FILL 1 /* fill buffer, then stop */
+#define DTRACEOPT_BUFPOLICY_SWITCH 2 /* switch buffers */
+
+#define DTRACEOPT_BUFRESIZE_AUTO 0 /* automatic resizing */
+#define DTRACEOPT_BUFRESIZE_MANUAL 1 /* manual resizing */
+
+/*
+ * DTrace Buffer Interface
+ *
+ * In order to get a snapshot of the principal or aggregation buffer,
+ * user-level passes a buffer description to the kernel with the dtrace_bufdesc
+ * structure. This describes which CPU user-level is interested in, and
+ * where user-level wishes the kernel to snapshot the buffer to (the
+ * dtbd_data field). The kernel uses the same structure to pass back some
+ * information regarding the buffer: the size of data actually copied out, the
+ * number of drops, the number of errors, the offset of the oldest record,
+ * and the time of the snapshot.
+ *
+ * If the buffer policy is a "switch" policy, taking a snapshot of the
+ * principal buffer has the additional effect of switching the active and
+ * inactive buffers. Taking a snapshot of the aggregation buffer _always_ has
+ * the additional effect of switching the active and inactive buffers.
+ */
+typedef struct dtrace_bufdesc {
+ uint64_t dtbd_size; /* size of buffer */
+ uint32_t dtbd_cpu; /* CPU or DTRACE_CPUALL */
+ uint32_t dtbd_errors; /* number of errors */
+ uint64_t dtbd_drops; /* number of drops */
+ DTRACE_PTR(char, dtbd_data); /* data */
+ uint64_t dtbd_oldest; /* offset of oldest record */
+ uint64_t dtbd_timestamp; /* hrtime of snapshot */
+} dtrace_bufdesc_t;
+
+/*
+ * Each record in the buffer (dtbd_data) begins with a header that includes
+ * the epid and a timestamp. The timestamp is split into two 4-byte parts
+ * so that we do not require 8-byte alignment.
+ */
+typedef struct dtrace_rechdr {
+ dtrace_epid_t dtrh_epid; /* enabled probe id */
+ uint32_t dtrh_timestamp_hi; /* high bits of hrtime_t */
+ uint32_t dtrh_timestamp_lo; /* low bits of hrtime_t */
+} dtrace_rechdr_t;
+
+#define DTRACE_RECORD_LOAD_TIMESTAMP(dtrh) \
+ ((dtrh)->dtrh_timestamp_lo + \
+ ((uint64_t)(dtrh)->dtrh_timestamp_hi << 32))
+
+#define DTRACE_RECORD_STORE_TIMESTAMP(dtrh, hrtime) { \
+	(dtrh)->dtrh_timestamp_lo = (uint32_t)(hrtime); \
+	(dtrh)->dtrh_timestamp_hi = (hrtime) >> 32; \
+}
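+
+/*
+ * A sketch of the round-trip: for any non-negative hrtime_t t,
+ *
+ *	DTRACE_RECORD_STORE_TIMESTAMP(dtrh, t);
+ *	DTRACE_RECORD_LOAD_TIMESTAMP(dtrh) == t
+ *
+ * since the store splits t into two 32-bit halves and the load widens
+ * and reassembles them.
+ */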
+
+/*
+ * DTrace Status
+ *
+ * The status of DTrace is relayed via the dtrace_status structure. This
+ * structure contains members to count drops other than the capacity drops
+ * available via the buffer interface (see above). This consists of dynamic
+ * drops (including capacity dynamic drops, rinsing drops and dirty drops), and
+ * speculative drops (including capacity speculative drops, drops due to busy
+ * speculative buffers and drops due to unavailable speculative buffers).
+ * Additionally, the status structure contains a field to indicate the number
+ * of "fill"-policy buffers that have been filled and a boolean field to
+ * indicate
+ * that exit() has been called. If the dtst_exiting field is non-zero, no
+ * further data will be generated until tracing is stopped (at which time any
+ * enablings of the END action will be processed); if user-level sees that
+ * this field is non-zero, tracing should be stopped as soon as possible.
+ */
+typedef struct dtrace_status {
+ uint64_t dtst_dyndrops; /* dynamic drops */
+ uint64_t dtst_dyndrops_rinsing; /* dyn drops due to rinsing */
+ uint64_t dtst_dyndrops_dirty; /* dyn drops due to dirty */
+ uint64_t dtst_specdrops; /* speculative drops */
+ uint64_t dtst_specdrops_busy; /* spec drops due to busy */
+ uint64_t dtst_specdrops_unavail; /* spec drops due to unavail */
+ uint64_t dtst_errors; /* total errors */
+ uint64_t dtst_filled; /* number of filled bufs */
+ uint64_t dtst_stkstroverflows; /* stack string tab overflows */
+ uint64_t dtst_dblerrors; /* errors in ERROR probes */
+ char dtst_killed; /* non-zero if killed */
+ char dtst_exiting; /* non-zero if exit() called */
+ char dtst_pad[6]; /* pad out to 64-bit align */
+} dtrace_status_t;
+
+/*
+ * DTrace Configuration
+ *
+ * User-level may need to understand some elements of the kernel DTrace
+ * configuration in order to generate correct DIF. This information is
+ * conveyed via the dtrace_conf structure.
+ */
+typedef struct dtrace_conf {
+ uint_t dtc_difversion; /* supported DIF version */
+ uint_t dtc_difintregs; /* # of DIF integer registers */
+ uint_t dtc_diftupregs; /* # of DIF tuple registers */
+ uint_t dtc_ctfmodel; /* CTF data model */
+ uint_t dtc_pad[8]; /* reserved for future use */
+} dtrace_conf_t;
+
+/*
+ * DTrace Faults
+ *
+ * The constants below DTRACEFLT_LIBRARY indicate probe processing faults;
+ * constants at or above DTRACEFLT_LIBRARY indicate faults in probe
+ * postprocessing at user-level. Probe processing faults induce an ERROR
+ * probe and are replicated in unistd.d to allow users' ERROR probes to decode
+ * the error condition using these symbolic labels.
+ */
+#define DTRACEFLT_UNKNOWN 0 /* Unknown fault */
+#define DTRACEFLT_BADADDR 1 /* Bad address */
+#define DTRACEFLT_BADALIGN 2 /* Bad alignment */
+#define DTRACEFLT_ILLOP 3 /* Illegal operation */
+#define DTRACEFLT_DIVZERO 4 /* Divide-by-zero */
+#define DTRACEFLT_NOSCRATCH 5 /* Out of scratch space */
+#define DTRACEFLT_KPRIV 6 /* Illegal kernel access */
+#define DTRACEFLT_UPRIV 7 /* Illegal user access */
+#define DTRACEFLT_TUPOFLOW 8 /* Tuple stack overflow */
+#define DTRACEFLT_BADSTACK 9 /* Bad stack */
+
+#define DTRACEFLT_LIBRARY 1000 /* Library-level fault */
+
+/*
+ * DTrace Argument Types
+ *
+ * Because it would waste both space and time, argument types do not reside
+ * with the probe. In order to determine argument types for args[X]
+ * variables, the D compiler queries for argument types on a probe-by-probe
+ * basis. (This optimizes for the common case that arguments are either not
+ * used or used in an untyped fashion.) Typed arguments are specified with a
+ * string of the type name in the dtargd_native member of the argument
+ * description structure. Typed arguments may be further translated to types
+ * of greater stability; the provider indicates such a translated argument by
+ * filling in the dtargd_xlate member with the string of the translated type.
+ * Finally, the provider may indicate which argument value a given argument
+ * maps to by setting the dtargd_mapping member -- allowing a single argument
+ * to map to multiple args[X] variables.
+ */
+typedef struct dtrace_argdesc {
+ dtrace_id_t dtargd_id; /* probe identifier */
+ int dtargd_ndx; /* arg number (-1 iff none) */
+ int dtargd_mapping; /* value mapping */
+ char dtargd_native[DTRACE_ARGTYPELEN]; /* native type name */
+ char dtargd_xlate[DTRACE_ARGTYPELEN]; /* translated type name */
+} dtrace_argdesc_t;
+
+/*
+ * DTrace Stability Attributes
+ *
+ * Each DTrace provider advertises the name and data stability of each of its
+ * probe description components, as well as its architectural dependencies.
+ * The D compiler can query the provider attributes (dtrace_pattr_t below) in
+ * order to compute the properties of an input program and report them.
+ */
+typedef uint8_t dtrace_stability_t; /* stability code (see attributes(5)) */
+typedef uint8_t dtrace_class_t; /* architectural dependency class */
+
+#define DTRACE_STABILITY_INTERNAL 0 /* private to DTrace itself */
+#define DTRACE_STABILITY_PRIVATE 1 /* private to Sun (see docs) */
+#define DTRACE_STABILITY_OBSOLETE 2 /* scheduled for removal */
+#define DTRACE_STABILITY_EXTERNAL 3 /* not controlled by Sun */
+#define DTRACE_STABILITY_UNSTABLE 4 /* new or rapidly changing */
+#define DTRACE_STABILITY_EVOLVING 5 /* less rapidly changing */
+#define DTRACE_STABILITY_STABLE 6 /* mature interface from Sun */
+#define DTRACE_STABILITY_STANDARD 7 /* industry standard */
+#define DTRACE_STABILITY_MAX 7 /* maximum valid stability */
+
+#define DTRACE_CLASS_UNKNOWN 0 /* unknown architectural dependency */
+#define DTRACE_CLASS_CPU 1 /* CPU-module-specific */
+#define DTRACE_CLASS_PLATFORM 2 /* platform-specific (uname -i) */
+#define DTRACE_CLASS_GROUP 3 /* hardware-group-specific (uname -m) */
+#define DTRACE_CLASS_ISA 4 /* ISA-specific (uname -p) */
+#define DTRACE_CLASS_COMMON 5 /* common to all systems */
+#define DTRACE_CLASS_MAX 5 /* maximum valid class */
+
+#define DTRACE_PRIV_NONE 0x0000
+#define DTRACE_PRIV_KERNEL 0x0001
+#define DTRACE_PRIV_USER 0x0002
+#define DTRACE_PRIV_PROC 0x0004
+#define DTRACE_PRIV_OWNER 0x0008
+#define DTRACE_PRIV_ZONEOWNER 0x0010
+
+#define DTRACE_PRIV_ALL \
+ (DTRACE_PRIV_KERNEL | DTRACE_PRIV_USER | \
+ DTRACE_PRIV_PROC | DTRACE_PRIV_OWNER | DTRACE_PRIV_ZONEOWNER)
+
+typedef struct dtrace_ppriv {
+ uint32_t dtpp_flags; /* privilege flags */
+ uid_t dtpp_uid; /* user ID */
+ zoneid_t dtpp_zoneid; /* zone ID */
+} dtrace_ppriv_t;
+
+typedef struct dtrace_attribute {
+ dtrace_stability_t dtat_name; /* entity name stability */
+ dtrace_stability_t dtat_data; /* entity data stability */
+ dtrace_class_t dtat_class; /* entity data dependency */
+} dtrace_attribute_t;
+
+typedef struct dtrace_pattr {
+ dtrace_attribute_t dtpa_provider; /* provider attributes */
+ dtrace_attribute_t dtpa_mod; /* module attributes */
+ dtrace_attribute_t dtpa_func; /* function attributes */
+ dtrace_attribute_t dtpa_name; /* name attributes */
+ dtrace_attribute_t dtpa_args; /* args[] attributes */
+} dtrace_pattr_t;
+
+typedef struct dtrace_providerdesc {
+ char dtvd_name[DTRACE_PROVNAMELEN]; /* provider name */
+ dtrace_pattr_t dtvd_attr; /* stability attributes */
+ dtrace_ppriv_t dtvd_priv; /* privileges required */
+} dtrace_providerdesc_t;
+
+/*
+ * DTrace Pseudodevice Interface
+ *
+ * DTrace is controlled through ioctl(2)'s to the in-kernel dtrace:dtrace
+ * pseudodevice driver. These ioctls comprise the user-kernel interface to
+ * DTrace.
+ */
+#ifdef illumos
+#define DTRACEIOC (('d' << 24) | ('t' << 16) | ('r' << 8))
+#define DTRACEIOC_PROVIDER (DTRACEIOC | 1) /* provider query */
+#define DTRACEIOC_PROBES (DTRACEIOC | 2) /* probe query */
+#define DTRACEIOC_BUFSNAP (DTRACEIOC | 4) /* snapshot buffer */
+#define DTRACEIOC_PROBEMATCH (DTRACEIOC | 5) /* match probes */
+#define DTRACEIOC_ENABLE (DTRACEIOC | 6) /* enable probes */
+#define DTRACEIOC_AGGSNAP (DTRACEIOC | 7) /* snapshot agg. */
+#define DTRACEIOC_EPROBE (DTRACEIOC | 8) /* get eprobe desc. */
+#define DTRACEIOC_PROBEARG (DTRACEIOC | 9) /* get probe arg */
+#define DTRACEIOC_CONF (DTRACEIOC | 10) /* get config. */
+#define DTRACEIOC_STATUS (DTRACEIOC | 11) /* get status */
+#define DTRACEIOC_GO (DTRACEIOC | 12) /* start tracing */
+#define DTRACEIOC_STOP (DTRACEIOC | 13) /* stop tracing */
+#define DTRACEIOC_AGGDESC (DTRACEIOC | 15) /* get agg. desc. */
+#define DTRACEIOC_FORMAT (DTRACEIOC | 16) /* get format str */
+#define DTRACEIOC_DOFGET (DTRACEIOC | 17) /* get DOF */
+#define DTRACEIOC_REPLICATE (DTRACEIOC | 18) /* replicate enab */
+#else
+#define DTRACEIOC_PROVIDER _IOWR('x',1,dtrace_providerdesc_t)
+ /* provider query */
+#define DTRACEIOC_PROBES _IOWR('x',2,dtrace_probedesc_t)
+ /* probe query */
+#define DTRACEIOC_BUFSNAP _IOW('x',4,dtrace_bufdesc_t *)
+ /* snapshot buffer */
+#define DTRACEIOC_PROBEMATCH _IOWR('x',5,dtrace_probedesc_t)
+ /* match probes */
+typedef struct {
+ void *dof; /* DOF userland address written to driver. */
+ int n_matched; /* # matches returned by driver. */
+} dtrace_enable_io_t;
+#define DTRACEIOC_ENABLE _IOWR('x',6,dtrace_enable_io_t)
+ /* enable probes */
+#define DTRACEIOC_AGGSNAP _IOW('x',7,dtrace_bufdesc_t *)
+ /* snapshot agg. */
+#define DTRACEIOC_EPROBE _IOW('x',8,dtrace_eprobedesc_t)
+ /* get eprobe desc. */
+#define DTRACEIOC_PROBEARG _IOWR('x',9,dtrace_argdesc_t)
+ /* get probe arg */
+#define DTRACEIOC_CONF _IOR('x',10,dtrace_conf_t)
+ /* get config. */
+#define DTRACEIOC_STATUS _IOR('x',11,dtrace_status_t)
+ /* get status */
+#define DTRACEIOC_GO _IOR('x',12,processorid_t)
+ /* start tracing */
+#define DTRACEIOC_STOP _IOWR('x',13,processorid_t)
+ /* stop tracing */
+#define DTRACEIOC_AGGDESC _IOW('x',15,dtrace_aggdesc_t *)
+ /* get agg. desc. */
+#define DTRACEIOC_FORMAT _IOWR('x',16,dtrace_fmtdesc_t)
+ /* get format str */
+#define DTRACEIOC_DOFGET _IOW('x',17,dof_hdr_t *)
+ /* get DOF */
+#define DTRACEIOC_REPLICATE _IOW('x',18,dtrace_repldesc_t)
+ /* replicate enab */
+#endif
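+
+/*
+ * A userland sketch (error handling elided; /dev/dtrace/dtrace is the
+ * conventional path of the DTrace pseudodevice node): querying the
+ * kernel configuration before generating DIF:
+ *
+ *	dtrace_conf_t conf;
+ *	int fd = open("/dev/dtrace/dtrace", O_RDWR);
+ *
+ *	if (fd != -1 && ioctl(fd, DTRACEIOC_CONF, &conf) == 0)
+ *		consult conf.dtc_difversion, conf.dtc_difintregs, ...
+ */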
+
+/*
+ * DTrace Helpers
+ *
+ * In general, DTrace establishes probes in processes and takes actions on
+ * processes without knowing their specific user-level structures. Instead of
+ * existing in the framework, process-specific knowledge is contained by the
+ * enabling D program -- which can apply process-specific knowledge by making
+ * appropriate use of DTrace primitives like copyin() and copyinstr() to
+ * operate on user-level data. However, there may exist some specific probes
+ * of particular semantic relevance that the application developer may wish to
+ * explicitly export. For example, an application may wish to export a probe
+ * at the point that it begins and ends certain well-defined transactions. In
+ * addition to providing probes, programs may wish to offer assistance for
+ * certain actions. For example, in highly dynamic environments (e.g., Java),
+ * it may be difficult to obtain a stack trace in terms of meaningful symbol
+ * names (the translation from instruction addresses to corresponding symbol
+ * names may only be possible in situ); these environments may wish to define
+ * a series of actions to be applied in situ to obtain a meaningful stack
+ * trace.
+ *
+ * These two mechanisms -- user-level statically defined tracing and assisting
+ * DTrace actions -- are provided via DTrace _helpers_. Helpers are specified
+ * via DOF, but unlike enabling DOF, helper DOF may contain definitions of
+ * providers, probes and their arguments. If a helper wishes to provide
+ * action assistance, probe descriptions and corresponding DIF actions may be
+ * specified in the helper DOF. For such helper actions, however, the probe
+ * description describes the specific helper: all DTrace helpers have the
+ * provider name "dtrace" and the module name "helper", and the name of the
+ * helper is contained in the function name (for example, the ustack() helper
+ * is named "ustack"). Any helper-specific name may be contained in the name
+ * (for example, if a helper were to have a constructor, it might be named
+ * "dtrace:helper:<helper>:init"). Helper actions are only called when the
+ * action that they are helping is taken. Helper actions may only return DIF
+ * expressions, and may only call the following subroutines:
+ *
+ * alloca() <= Allocates memory out of the consumer's scratch space
+ * bcopy() <= Copies memory to scratch space
+ * copyin() <= Copies memory from user-level into consumer's scratch
+ * copyinto() <= Copies memory into a specific location in scratch
+ * copyinstr() <= Copies a string into a specific location in scratch
+ *
+ * Helper actions may only access the following built-in variables:
+ *
+ * curthread <= Current kthread_t pointer
+ * tid <= Current thread identifier
+ * pid <= Current process identifier
+ * ppid <= Parent process identifier
+ * uid <= Current user ID
+ * gid <= Current group ID
+ * execname <= Current executable name
+ * zonename <= Current zone name
+ *
+ * Helper actions may not manipulate or allocate dynamic variables, but they
+ * may have clause-local and statically-allocated global variables. The
+ * helper action variable state is specific to the helper action -- variables
+ * used by the helper action may not be accessed outside of the helper
+ * action, and the helper action may not access variables that live outside
+ * of it. Helper actions may not load from kernel memory at-large; they are
+ * restricted to loading current user state (via copyin() and variants) and
+ * scratch space. As with probe enablings, helper actions are executed in
+ * program order. The result of the helper action is the result of the last
+ * executing helper expression.
+ *
+ * Helpers -- composed of either providers/probes or probes/actions (or both)
+ * -- are added by opening the "helper" minor node, and issuing an ioctl(2)
+ * (DTRACEHIOC_ADDDOF) that specifies the dof_helper_t structure. This
+ * encapsulates the name and base address of the user-level library or
+ * executable publishing the helpers and probes as well as the DOF that
+ * contains the definitions of those helpers and probes.
+ *
+ * The DTRACEHIOC_ADD and DTRACEHIOC_REMOVE are left in place for legacy
+ * helpers and should no longer be used. No other ioctls are valid on the
+ * helper minor node.
+ */
+#ifdef illumos
+#define DTRACEHIOC (('d' << 24) | ('t' << 16) | ('h' << 8))
+#define DTRACEHIOC_ADD (DTRACEHIOC | 1) /* add helper */
+#define DTRACEHIOC_REMOVE (DTRACEHIOC | 2) /* remove helper */
+#define DTRACEHIOC_ADDDOF (DTRACEHIOC | 3) /* add helper DOF */
+#else
+#define DTRACEHIOC_REMOVE _IOW('z', 2, int) /* remove helper */
+#define DTRACEHIOC_ADDDOF _IOWR('z', 3, dof_helper_t)/* add helper DOF */
+#endif
+
+typedef struct dof_helper {
+ char dofhp_mod[DTRACE_MODNAMELEN]; /* executable or library name */
+ uint64_t dofhp_addr; /* base address of object */
+ uint64_t dofhp_dof; /* address of helper DOF */
+#ifdef __FreeBSD__
+ pid_t dofhp_pid; /* target process ID */
+ int dofhp_gen;
+#endif
+} dof_helper_t;
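+
+/*
+ * A sketch of helper registration from a userland process (the object
+ * name and file descriptor are hypothetical; the descriptor is assumed
+ * to be open on the "helper" minor node):
+ *
+ *	dof_helper_t dh;
+ *
+ *	strlcpy(dh.dofhp_mod, "libexample.so", sizeof (dh.dofhp_mod));
+ *	dh.dofhp_addr = object_base_address;
+ *	dh.dofhp_dof = (uint64_t)(uintptr_t)dof_image;
+ *
+ *	if (ioctl(fd, DTRACEHIOC_ADDDOF, &dh) != -1)
+ *		the helper DOF was accepted; on FreeBSD dh.dofhp_gen
+ *		identifies it for later removal via DTRACEHIOC_REMOVE
+ */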
+
+#define DTRACEMNR_DTRACE "dtrace" /* node for DTrace ops */
+#define DTRACEMNR_HELPER "helper" /* node for helpers */
+#define DTRACEMNRN_DTRACE 0 /* minor for DTrace ops */
+#define DTRACEMNRN_HELPER 1 /* minor for helpers */
+#define DTRACEMNRN_CLONE 2 /* first clone minor */
+
+#ifdef _KERNEL
+
+/*
+ * DTrace Provider API
+ *
+ * The following functions are implemented by the DTrace framework and are
+ * used to implement separate in-kernel DTrace providers. Common functions
+ * are provided in uts/common/os/dtrace.c. ISA-dependent subroutines are
+ * defined in uts/<isa>/dtrace/dtrace_asm.s or uts/<isa>/dtrace/dtrace_isa.c.
+ *
+ * The provider API has two halves: the API that the providers consume from
+ * DTrace, and the API that providers make available to DTrace.
+ *
+ * 1 Framework-to-Provider API
+ *
+ * 1.1 Overview
+ *
+ * The Framework-to-Provider API is represented by the dtrace_pops structure
+ * that the provider passes to the framework when registering itself. This
+ * structure consists of the following members:
+ *
+ * dtps_provide() <-- Provide all probes, all modules
+ * dtps_provide_module() <-- Provide all probes in specified module
+ * dtps_enable() <-- Enable specified probe
+ * dtps_disable() <-- Disable specified probe
+ * dtps_suspend() <-- Suspend specified probe
+ * dtps_resume() <-- Resume specified probe
+ * dtps_getargdesc() <-- Get the argument description for args[X]
+ * dtps_getargval() <-- Get the value for an argX or args[X] variable
+ * dtps_usermode() <-- Find out if the probe was fired in user mode
+ * dtps_destroy() <-- Destroy all state associated with this probe
+ *
+ * 1.2 void dtps_provide(void *arg, const dtrace_probedesc_t *spec)
+ *
+ * 1.2.1 Overview
+ *
+ * Called to indicate that the provider should provide all probes. If the
+ * specified description is non-NULL, dtps_provide() is being called because
+ * no probe matched a specified probe description -- if the provider has the
+ * ability to
+ * create custom probes, it may wish to create a probe that matches the
+ * specified description.
+ *
+ * 1.2.2 Arguments and notes
+ *
+ * The first argument is the cookie as passed to dtrace_register(). The
+ * second argument is a pointer to a probe description that the provider may
+ * wish to consider when creating custom probes. The provider is expected to
+ * call back into the DTrace framework via dtrace_probe_create() to create
+ * any necessary probes. dtps_provide() may be called even if the provider
+ * has made available all probes; the provider should check the return value
+ * of dtrace_probe_create() to handle this case. Note that the provider need
+ * not implement both dtps_provide() and dtps_provide_module(); see
+ * "Arguments and Notes" for dtrace_register(), below.
+ *
+ * 1.2.3 Return value
+ *
+ * None.
+ *
+ * 1.2.4 Caller's context
+ *
+ * dtps_provide() is typically called from open() or ioctl() context, but may
+ * be called from other contexts as well. The DTrace framework is locked in
+ * such a way that providers may not register or unregister. This means that
+ * the provider may not call any DTrace API that affects its registration with
+ * the framework, including dtrace_register(), dtrace_unregister(),
+ * dtrace_invalidate(), and dtrace_condense(). However, the context is such
+ * that the provider may (and indeed, is expected to) call probe-related
+ * DTrace routines, including dtrace_probe_create(), dtrace_probe_lookup(),
+ * and dtrace_probe_arg().
+ *
+ * 1.3 void dtps_provide_module(void *arg, modctl_t *mp)
+ *
+ * 1.3.1 Overview
+ *
+ * Called to indicate that the provider should provide all probes in the
+ * specified module.
+ *
+ * 1.3.2 Arguments and notes
+ *
+ * The first argument is the cookie as passed to dtrace_register(). The
+ * second argument is a pointer to a modctl structure that indicates the
+ * module for which probes should be created.
+ *
+ * 1.3.3 Return value
+ *
+ * None.
+ *
+ * 1.3.4 Caller's context
+ *
+ * dtps_provide_module() may be called from open() or ioctl() context, but
+ * may also be called from a module loading context. mod_lock is held, and
+ * the DTrace framework is locked in such a way that providers may not
+ * register or unregister. This means that the provider may not call any
+ * DTrace API that affects its registration with the framework, including
+ * dtrace_register(), dtrace_unregister(), dtrace_invalidate(), and
+ * dtrace_condense(). However, the context is such that the provider may (and
+ * indeed, is expected to) call probe-related DTrace routines, including
+ * dtrace_probe_create(), dtrace_probe_lookup(), and dtrace_probe_arg(). Note
+ * that the provider need not implement both dtps_provide() and
+ * dtps_provide_module(); see "Arguments and Notes" for dtrace_register(),
+ * below.
+ *
+ * 1.4 void dtps_enable(void *arg, dtrace_id_t id, void *parg)
+ *
+ * 1.4.1 Overview
+ *
+ * Called to enable the specified probe.
+ *
+ * 1.4.2 Arguments and notes
+ *
+ * The first argument is the cookie as passed to dtrace_register(). The
+ * second argument is the identifier of the probe to be enabled. The third
+ * argument is the probe argument as passed to dtrace_probe_create().
+ * dtps_enable() will be called when a probe transitions from not being
+ * enabled at all to having one or more ECBs. The number of ECBs associated
+ * with the probe may change without subsequent calls into the provider.
+ * When the number of ECBs drops to zero, the provider will be explicitly
+ * told to disable the probe via dtps_disable(). dtrace_probe() should never
+ * be called for a probe identifier that hasn't been explicitly enabled via
+ * dtps_enable().
+ *
+ * 1.4.3 Return value
+ *
+ * None.
+ *
+ * 1.4.4 Caller's context
+ *
+ * The DTrace framework is locked in such a way that it may not be called
+ * back into at all. cpu_lock is held. mod_lock is not held and may not
+ * be acquired.
+ *
+ * 1.5 void dtps_disable(void *arg, dtrace_id_t id, void *parg)
+ *
+ * 1.5.1 Overview
+ *
+ * Called to disable the specified probe.
+ *
+ * 1.5.2 Arguments and notes
+ *
+ * The first argument is the cookie as passed to dtrace_register(). The
+ * second argument is the identifier of the probe to be disabled. The third
+ * argument is the probe argument as passed to dtrace_probe_create().
+ * dtps_disable() will be called when a probe transitions from being enabled
+ * to having zero ECBs. dtrace_probe() should never be called for a probe
+ * identifier that has been explicitly disabled via dtps_disable().
+ *
+ * 1.5.3 Return value
+ *
+ * None.
+ *
+ * 1.5.4 Caller's context
+ *
+ * The DTrace framework is locked in such a way that it may not be called
+ * back into at all. cpu_lock is held. mod_lock is not held and may not
+ * be acquired.
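+ *
+ * As an illustrative sketch (xmp_probe_t and its xmpp_enabled field are
+ * hypothetical provider state), dtps_enable() often amounts to setting
+ * per-probe state that the provider consults before calling
+ * dtrace_probe(); dtps_disable(), below, clears the same state:
+ *
+ *	static void
+ *	xmp_enable(void *arg, dtrace_id_t id, void *parg)
+ *	{
+ *		xmp_probe_t *xpp = parg;
+ *
+ *		xpp->xmpp_enabled = 1;
+ *	}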
+ *
+ * 1.6 void dtps_suspend(void *arg, dtrace_id_t id, void *parg)
+ *
+ * 1.6.1 Overview
+ *
+ * Called to suspend the specified enabled probe. This entry point is for
+ * providers that may need to suspend some or all of their probes when CPUs
+ * are being powered off or when the boot monitor is being entered for a
+ * prolonged period of time.
+ *
+ * 1.6.2 Arguments and notes
+ *
+ * The first argument is the cookie as passed to dtrace_register(). The
+ * second argument is the identifier of the probe to be suspended. The
+ * third argument is the probe argument as passed to dtrace_probe_create().
+ * dtps_suspend() will only be called on an enabled probe. Providers that
+ * provide a dtps_suspend entry point will want to take roughly the action
+ * that they take for dtps_disable().
+ *
+ * 1.6.3 Return value
+ *
+ * None.
+ *
+ * 1.6.4 Caller's context
+ *
+ * Interrupts are disabled. The DTrace framework is in a state such that the
+ * specified probe cannot be disabled or destroyed for the duration of
+ * dtps_suspend(). As interrupts are disabled, the provider is afforded
+ * little latitude; the provider is expected to do no more than a store to
+ * memory.
+ *
+ * 1.7 void dtps_resume(void *arg, dtrace_id_t id, void *parg)
+ *
+ * 1.7.1 Overview
+ *
+ * Called to resume the specified enabled probe. This entry point is for
+ * providers that may need to resume some or all of their probes after the
+ * completion of an event that induced a call to dtps_suspend().
+ *
+ * 1.7.2 Arguments and notes
+ *
+ * The first argument is the cookie as passed to dtrace_register(). The
+ * second argument is the identifier of the probe to be resumed. The
+ * third argument is the probe argument as passed to dtrace_probe_create().
+ * dtps_resume() will only be called on an enabled probe. Providers that
+ * provide a dtps_resume entry point will want to take roughly the action
+ * that they take for dtps_enable().
+ *
+ * 1.7.3 Return value
+ *
+ * None.
+ *
+ * 1.7.4 Caller's context
+ *
+ * Interrupts are disabled. The DTrace framework is in a state such that the
+ * specified probe cannot be disabled or destroyed for the duration of
+ * dtps_resume(). As interrupts are disabled, the provider is afforded
+ * little latitude; the provider is expected to do no more than a store to
+ * memory.
+ *
+ * 1.8 void dtps_getargdesc(void *arg, dtrace_id_t id, void *parg,
+ * dtrace_argdesc_t *desc)
+ *
+ * 1.8.1 Overview
+ *
+ * Called to retrieve the argument description for an args[X] variable.
+ *
+ * 1.8.2 Arguments and notes
+ *
+ * The first argument is the cookie as passed to dtrace_register(). The
+ * second argument is the identifier of the current probe. The third
+ * argument is the probe argument as passed to dtrace_probe_create(). The
+ * fourth argument is a pointer to the argument description. This
+ * description is both an input and output parameter: it contains the
+ * index of the desired argument in the dtargd_ndx field, and expects
+ * the other fields to be filled in upon return. If there is no argument
+ * corresponding to the specified index, the dtargd_ndx field should be set
+ * to DTRACE_ARGNONE.
+ *
+ * 1.8.3 Return value
+ *
+ * None. The dtargd_ndx, dtargd_native, dtargd_xlate and dtargd_mapping
+ * members of the dtrace_argdesc_t structure are all output values.
+ *
+ * 1.8.4 Caller's context
+ *
+ * dtps_getargdesc() is called from ioctl() context. mod_lock is held, and
+ * the DTrace framework is locked in such a way that providers may not
+ * register or unregister. This means that the provider may not call any
+ * DTrace API that affects its registration with the framework, including
+ * dtrace_register(), dtrace_unregister(), dtrace_invalidate(), and
+ * dtrace_condense().
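+ *
+ * A minimal sketch for a probe whose only argument is an int (the
+ * function name is hypothetical; dtargd_native is a fixed-size buffer
+ * in the dtrace_argdesc_t):
+ *
+ *	static void
+ *	xmp_getargdesc(void *arg, dtrace_id_t id, void *parg,
+ *	    dtrace_argdesc_t *desc)
+ *	{
+ *		if (desc->dtargd_ndx == 0)
+ *			(void) strcpy(desc->dtargd_native, "int");
+ *		else
+ *			desc->dtargd_ndx = DTRACE_ARGNONE;
+ *	}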
+ *
+ * 1.9 uint64_t dtps_getargval(void *arg, dtrace_id_t id, void *parg,
+ * int argno, int aframes)
+ *
+ * 1.9.1 Overview
+ *
+ * Called to retrieve a value for an argX or args[X] variable.
+ *
+ * 1.9.2 Arguments and notes
+ *
+ * The first argument is the cookie as passed to dtrace_register(). The
+ * second argument is the identifier of the current probe. The third
+ * argument is the probe argument as passed to dtrace_probe_create(). The
+ * fourth argument is the number of the argument (the X in the example in
+ * 1.9.1). The fifth argument is the number of stack frames that were used
+ * to get from the actual place in the code that fired the probe to
+ * dtrace_probe() itself, the so-called artificial frames. This argument may
+ * be used to descend an appropriate number of frames to find the correct
+ * values. If this entry point is left NULL, the dtrace_getarg() built-in
+ * function is used.
+ *
+ * 1.9.3 Return value
+ *
+ * The value of the argument.
+ *
+ * 1.9.4 Caller's context
+ *
+ * This is called from within dtrace_probe() meaning that interrupts
+ * are disabled. No locks should be taken within this entry point.
+ *
+ * 1.10 int dtps_usermode(void *arg, dtrace_id_t id, void *parg)
+ *
+ * 1.10.1 Overview
+ *
+ * Called to determine if the probe was fired in a user context.
+ *
+ * 1.10.2 Arguments and notes
+ *
+ * The first argument is the cookie as passed to dtrace_register(). The
+ * second argument is the identifier of the current probe. The third
+ * argument is the probe argument as passed to dtrace_probe_create(). This
+ * entry point must not be left NULL for providers whose probes allow for
+ * mixed mode tracing, that is to say those probes that can fire during
+ * kernel- _or_ user-mode execution.
+ *
+ * 1.10.3 Return value
+ *
+ * A bitwise OR that encapsulates both the mode (either DTRACE_MODE_KERNEL
+ * or DTRACE_MODE_USER) and the policy when the privilege of the enabling
+ * is insufficient for that mode (a combination of DTRACE_MODE_NOPRIV_DROP,
+ * DTRACE_MODE_NOPRIV_RESTRICT, and DTRACE_MODE_LIMITEDPRIV_RESTRICT). If
+ * the DTRACE_MODE_NOPRIV_DROP bit is set, insufficient privilege will
+ * result in the probe firing being silently ignored for the enabling; if
+ * the DTRACE_MODE_NOPRIV_RESTRICT bit is set, insufficient privilege will not
+ * prevent probe processing for the enabling, but restrictions will be in
+ * place that induce a UPRIV fault upon attempt to examine probe arguments
+ * or current process state. If the DTRACE_MODE_LIMITEDPRIV_RESTRICT bit
+ * is set, similar restrictions will be placed upon operation if the
+ * privilege is sufficient to process the enabling, but does not otherwise
+ * entitle the enabling to all zones. The DTRACE_MODE_NOPRIV_DROP and
+ * DTRACE_MODE_NOPRIV_RESTRICT are mutually exclusive (and one of these
+ * two policies must be specified), but either may be combined (or not)
+ * with DTRACE_MODE_LIMITEDPRIV_RESTRICT.
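+ *
+ * For example, a provider whose probes fire only in user context and
+ * which wants unprivileged firings silently dropped might simply have
+ * its dtps_usermode() entry point end with:
+ *
+ *	return (DTRACE_MODE_USER | DTRACE_MODE_NOPRIV_DROP);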
+ *
+ * 1.10.4 Caller's context
+ *
+ * This is called from within dtrace_probe() meaning that interrupts
+ * are disabled. No locks should be taken within this entry point.
+ *
+ * 1.11 void dtps_destroy(void *arg, dtrace_id_t id, void *parg)
+ *
+ * 1.11.1 Overview
+ *
+ * Called to destroy the specified probe.
+ *
+ * 1.11.2 Arguments and notes
+ *
+ * The first argument is the cookie as passed to dtrace_register(). The
+ * second argument is the identifier of the probe to be destroyed. The third
+ * argument is the probe argument as passed to dtrace_probe_create(). The
+ * provider should free all state associated with the probe. The framework
+ * guarantees that dtps_destroy() is only called for probes that have either
+ * been disabled via dtps_disable() or were never enabled via dtps_enable().
+ * Once dtps_destroy() has been called for a probe, no further call will be
+ * made specifying the probe.
+ *
+ * 1.11.3 Return value
+ *
+ * None.
+ *
+ * 1.11.4 Caller's context
+ *
+ * The DTrace framework is locked in such a way that it may not be called
+ * back into at all. mod_lock is held. cpu_lock is not held, and may not be
+ * acquired.
+ *
+ *
+ * 2 Provider-to-Framework API
+ *
+ * 2.1 Overview
+ *
+ * The Provider-to-Framework API provides the mechanism for the provider to
+ * register itself with the DTrace framework, to create probes, to lookup
+ * probes and (most importantly) to fire probes. The Provider-to-Framework
+ * API consists of:
+ *
+ * dtrace_register() <-- Register a provider with the DTrace framework
+ * dtrace_unregister() <-- Remove a provider's DTrace registration
+ * dtrace_invalidate() <-- Invalidate the specified provider
+ * dtrace_condense() <-- Remove a provider's unenabled probes
+ * dtrace_attached() <-- Indicates whether or not DTrace has attached
+ * dtrace_probe_create() <-- Create a DTrace probe
+ * dtrace_probe_lookup() <-- Lookup a DTrace probe based on its name
+ * dtrace_probe_arg() <-- Return the probe argument for a specific probe
+ * dtrace_probe() <-- Fire the specified probe
+ *
+ * 2.2 int dtrace_register(const char *name, const dtrace_pattr_t *pap,
+ * uint32_t priv, cred_t *cr, const dtrace_pops_t *pops, void *arg,
+ * dtrace_provider_id_t *idp)
+ *
+ * 2.2.1 Overview
+ *
+ * dtrace_register() registers the calling provider with the DTrace
+ * framework. It should generally be called by DTrace providers in their
+ * attach(9E) entry point.
+ *
+ * 2.2.2 Arguments and Notes
+ *
+ * The first argument is the name of the provider. The second argument is a
+ * pointer to the stability attributes for the provider. The third argument
+ * is the privilege flags for the provider, and must be some combination of:
+ *
+ * DTRACE_PRIV_NONE <= All users may enable probes from this provider
+ *
+ * DTRACE_PRIV_PROC <= Any user with privilege of PRIV_DTRACE_PROC may
+ * enable probes from this provider
+ *
+ * DTRACE_PRIV_USER <= Any user with privilege of PRIV_DTRACE_USER may
+ * enable probes from this provider
+ *
+ * DTRACE_PRIV_KERNEL <= Any user with privilege of PRIV_DTRACE_KERNEL
+ * may enable probes from this provider
+ *
+ * DTRACE_PRIV_OWNER <= This flag places an additional constraint on
+ * the privilege requirements above. These probes
+ * require either (a) a user ID matching the user
+ * ID of the cred passed in the fourth argument
+ * or (b) the PRIV_PROC_OWNER privilege.
+ *
+ * DTRACE_PRIV_ZONEOWNER<= This flag places an additional constraint on
+ * the privilege requirements above. These probes
+ * require either (a) a zone ID matching the zone
+ * ID of the cred passed in the fourth argument
+ * or (b) the PRIV_PROC_ZONE privilege.
+ *
+ * Note that these flags designate the _visibility_ of the probes, not
+ * the conditions under which they may or may not fire.
+ *
+ * The fourth argument is the credential that is associated with the
+ * provider. This argument should be NULL if the privilege flags don't
+ * include DTRACE_PRIV_OWNER or DTRACE_PRIV_ZONEOWNER. If non-NULL, the
+ * framework stashes the uid and zoneid represented by this credential
+ * for use at probe-time, in implicit predicates. These limit visibility
+ * of the probes to users and/or zones which have sufficient privilege to
+ * access them.
+ *
+ * The fifth argument is a DTrace provider operations vector, which provides
+ * the implementation for the Framework-to-Provider API. (See Section 1,
+ * above.) This must be non-NULL, and each member must be non-NULL. The
+ * exceptions to this are (1) the dtps_provide() and dtps_provide_module()
+ * members (if the provider so desires, _one_ of these members may be left
+ * NULL -- denoting that the provider only implements the other) and (2)
+ * the dtps_suspend() and dtps_resume() members, which must either both be
+ * NULL or both be non-NULL.
+ *
+ * The sixth argument is a cookie to be specified as the first argument for
+ * each function in the Framework-to-Provider API. This argument may have
+ * any value.
+ *
+ * The final argument is a pointer to dtrace_provider_id_t. If
+ * dtrace_register() successfully completes, the provider identifier will be
+ * stored in the memory pointed to by this argument. This argument must be
+ * non-NULL.
+ *
+ * 2.2.3 Return value
+ *
+ * On success, dtrace_register() returns 0 and stores the new provider's
+ * identifier into the memory pointed to by the idp argument. On failure,
+ * dtrace_register() returns an errno:
+ *
+ * EINVAL The arguments passed to dtrace_register() were somehow invalid.
+ * This may be because a parameter that must be non-NULL was NULL,
+ * because the name was invalid (either empty or an illegal
+ * provider name) or because the attributes were invalid.
+ *
+ * No other failure code is returned.
+ *
+ * 2.2.4 Caller's context
+ *
+ * dtrace_register() may induce calls to dtps_provide(); the provider must
+ * hold no locks across dtrace_register() that may also be acquired by
+ * dtps_provide(). cpu_lock and mod_lock must not be held.
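+ *
+ * A registration sketch for the hypothetical "xmp" provider of the
+ * earlier examples (xmp_attr is an elided dtrace_pattr_t stability
+ * table; the xmp_* entry points are the hypothetical Section 1
+ * implementations; the member order follows dtrace_pops_t, below):
+ *
+ *	static dtrace_pops_t xmp_pops = {
+ *		xmp_provide, NULL, xmp_enable, xmp_disable, NULL, NULL,
+ *		xmp_getargdesc, xmp_getargval, xmp_usermode, xmp_destroy
+ *	};
+ *
+ *	static dtrace_provider_id_t xmp_id;
+ *
+ *	error = dtrace_register("xmp", &xmp_attr, DTRACE_PRIV_USER,
+ *	    NULL, &xmp_pops, NULL, &xmp_id);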
+ *
+ * 2.3 int dtrace_unregister(dtrace_provider_id_t id)
+ *
+ * 2.3.1 Overview
+ *
+ * Unregisters the specified provider from the DTrace framework. It should
+ * generally be called by DTrace providers in their detach(9E) entry point.
+ *
+ * 2.3.2 Arguments and Notes
+ *
+ * The only argument is the provider identifier, as returned from a
+ * successful call to dtrace_register(). As a result of calling
+ * dtrace_unregister(), the DTrace framework will call back into the provider
+ * via the dtps_destroy() entry point. Once dtrace_unregister() successfully
+ * completes, however, the DTrace framework will no longer make calls through
+ * the Framework-to-Provider API.
+ *
+ * 2.3.3 Return value
+ *
+ * On success, dtrace_unregister returns 0. On failure, dtrace_unregister()
+ * returns an errno:
+ *
+ * EBUSY There are currently processes that have the DTrace pseudodevice
+ * open, or there exists an anonymous enabling that hasn't yet
+ * been claimed.
+ *
+ * No other failure code is returned.
+ *
+ * 2.3.4 Caller's context
+ *
+ * Because a call to dtrace_unregister() may induce calls through the
+ * Framework-to-Provider API, the caller may not hold any lock across
+ * dtrace_unregister() that is also acquired in any of the Framework-to-
+ * Provider API functions. Additionally, mod_lock may not be held.
+ *
+ * 2.4 void dtrace_invalidate(dtrace_provider_id_t id)
+ *
+ * 2.4.1 Overview
+ *
+ * Invalidates the specified provider. All subsequent probe lookups for the
+ * specified provider will fail, but its probes will not be removed.
+ *
+ * 2.4.2 Arguments and notes
+ *
+ * The only argument is the provider identifier, as returned from a
+ * successful call to dtrace_register(). In general, a provider's probes
+ * always remain valid; dtrace_invalidate() is a mechanism for invalidating
+ * an entire provider, regardless of whether or not its probes are
+ * enabled. Note that dtrace_invalidate() will _not_ prevent already enabled
+ * probes from firing -- it will merely prevent any new enablings of the
+ * provider's probes.
+ *
+ * 2.5 int dtrace_condense(dtrace_provider_id_t id)
+ *
+ * 2.5.1 Overview
+ *
+ * Removes all the unenabled probes for the given provider. This function is
+ * not unlike dtrace_unregister(), except that it doesn't remove the
+ * provider itself -- just as many of its associated probes as it can.
+ *
+ * 2.5.2 Arguments and Notes
+ *
+ * As with dtrace_unregister(), the sole argument is the provider identifier
+ * as returned from a successful call to dtrace_register(). As a result of
+ * calling dtrace_condense(), the DTrace framework will call back into the
+ * given provider's dtps_destroy() entry point for each of the provider's
+ * unenabled probes.
+ *
+ * 2.5.3 Return value
+ *
+ * Currently, dtrace_condense() always returns 0. However, consumers of this
+ * function should check the return value as appropriate; its behavior may
+ * change in the future.
+ *
+ * 2.5.4 Caller's context
+ *
+ * As with dtrace_unregister(), the caller may not hold any lock across
+ * dtrace_condense() that is also acquired in the provider's entry points.
+ * Also, mod_lock may not be held.
+ *
+ * 2.6 int dtrace_attached()
+ *
+ * 2.6.1 Overview
+ *
+ * Indicates whether or not DTrace has attached.
+ *
+ * 2.6.2 Arguments and Notes
+ *
+ * For most providers, DTrace makes initial contact beyond registration.
+ * That is, once a provider has registered with DTrace, it waits to hear
+ * from DTrace to create probes. However, some providers may wish to
+ * proactively create probes without first being told by DTrace to do so.
+ * If providers wish to do this, they must first call dtrace_attached() to
+ * determine if DTrace itself has attached. If dtrace_attached() returns 0,
+ * the provider must not make any other Provider-to-Framework API call.
+ *
+ * 2.6.3 Return value
+ *
+ * dtrace_attached() returns 1 if DTrace has attached, 0 otherwise.
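+ *
+ * A provider that proactively creates probes would therefore guard
+ * itself roughly as follows (a sketch, reusing the hypothetical xmp
+ * names from above):
+ *
+ *	if (!dtrace_attached())
+ *		return;
+ *
+ *	(void) dtrace_probe_create(xmp_id, "xmp", "tick", "start",
+ *	    0, NULL);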
+ *
+ * 2.7 dtrace_id_t dtrace_probe_create(dtrace_provider_id_t id,
+ * const char *mod, const char *func, const char *name, int aframes,
+ * void *arg)
+ *
+ * 2.7.1 Overview
+ *
+ * Creates a probe with specified module name, function name, and name.
+ *
+ * 2.7.2 Arguments and Notes
+ *
+ * The first argument is the provider identifier, as returned from a
+ * successful call to dtrace_register(). The second, third, and fourth
+ * arguments are the module name, function name, and probe name,
+ * respectively. Of these, module name and function name may both be NULL
+ * (in which case the probe is considered to be unanchored), or they may both
+ * be non-NULL. The name must be non-NULL, and must point to a non-empty
+ * string.
+ *
+ * The fifth argument is the number of artificial stack frames that will be
+ * found on the stack when dtrace_probe() is called for the new probe. These
+ * artificial frames will automatically be pruned should the stack() or
+ * stackdepth() functions be called as part of one of the probe's ECBs. If
+ * the probe doesn't add an artificial frame, this parameter should be
+ * zero.
+ *
+ * The final argument is a probe argument that will be passed back to the
+ * provider when a probe-specific operation is called. (e.g., via
+ * dtps_enable(), dtps_disable(), etc.)
+ *
+ * Note that it is up to the provider to be sure that the probe that it
+ * creates does not already exist -- if the provider is unsure of the probe's
+ * existence, it should assure its absence with dtrace_probe_lookup() before
+ * calling dtrace_probe_create().
+ *
+ * 2.7.3 Return value
+ *
+ * dtrace_probe_create() always succeeds, and always returns the identifier
+ * of the newly-created probe.
+ *
+ * 2.7.4 Caller's context
+ *
+ * While dtrace_probe_create() is generally expected to be called from
+ * dtps_provide() and/or dtps_provide_module(), it may be called from other
+ * non-DTrace contexts. Neither cpu_lock nor mod_lock may be held.
+ *
+ * 2.8 dtrace_id_t dtrace_probe_lookup(dtrace_provider_id_t id,
+ * const char *mod, const char *func, const char *name)
+ *
+ * 2.8.1 Overview
+ *
+ * Looks up a probe based on provider and one or more of module name,
+ * function name and probe name.
+ *
+ * 2.8.2 Arguments and Notes
+ *
+ * The first argument is the provider identifier, as returned from a
+ * successful call to dtrace_register(). The second, third, and fourth
+ * arguments are the module name, function name, and probe name,
+ * respectively. Any of these may be NULL; dtrace_probe_lookup() will return
+ * the identifier of the first probe that is provided by the specified
+ * provider and matches all of the non-NULL matching criteria.
+ * dtrace_probe_lookup() is generally used by a provider to check the
+ * existence of a probe before creating it with dtrace_probe_create().
+ *
+ * 2.8.3 Return value
+ *
+ * If the probe exists, returns its identifier. If the probe does not exist,
+ * returns DTRACE_IDNONE.
+ *
+ * 2.8.4 Caller's context
+ *
+ * While dtrace_probe_lookup() is generally expected to be called from
+ * dtps_provide() and/or dtps_provide_module(), it may also be called from
+ * other non-DTrace contexts. Neither cpu_lock nor mod_lock may be held.
+ *
+ * 2.9 void *dtrace_probe_arg(dtrace_provider_id_t id, dtrace_id_t probe)
+ *
+ * 2.9.1 Overview
+ *
+ * Returns the probe argument associated with the specified probe.
+ *
+ * 2.9.2 Arguments and Notes
+ *
+ * The first argument is the provider identifier, as returned from a
+ * successful call to dtrace_register(). The second argument is a probe
+ * identifier, as returned from dtrace_probe_lookup() or
+ * dtrace_probe_create(). This is useful if a probe has multiple
+ * provider-specific components to it: the provider can create the probe
+ * once with provider-specific state, and then add to the state by looking
+ * up the probe based on probe identifier.
+ *
+ * 2.9.3 Return value
+ *
+ * Returns the argument associated with the specified probe. If the
+ * specified probe does not exist, or if the specified probe is not provided
+ * by the specified provider, NULL is returned.
+ *
+ * 2.9.4 Caller's context
+ *
+ * While dtrace_probe_arg() is generally expected to be called from
+ * dtps_provide() and/or dtps_provide_module(), it may also be called from
+ * other non-DTrace contexts. Neither cpu_lock nor mod_lock may be held.
+ *
+ * 2.10 void dtrace_probe(dtrace_id_t probe, uintptr_t arg0, uintptr_t arg1,
+ * uintptr_t arg2, uintptr_t arg3, uintptr_t arg4)
+ *
+ * 2.10.1 Overview
+ *
+ * The epicenter of DTrace: fires the specified probe with the specified
+ * arguments.
+ *
+ * 2.10.2 Arguments and Notes
+ *
+ * The first argument is a probe identifier as returned by
+ * dtrace_probe_create() or dtrace_probe_lookup(). The second through sixth
+ * arguments are the values to which the D variables "arg0" through "arg4"
+ * will be mapped.
+ *
+ * dtrace_probe() should be called whenever the specified probe has fired --
+ * however the provider defines it.
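+ *
+ * For instance, a provider that stashed the identifier returned by
+ * dtrace_probe_create() in a hypothetical xmp_start_id variable would
+ * fire that probe with a single argument (val is likewise hypothetical)
+ * as:
+ *
+ *	dtrace_probe(xmp_start_id, (uintptr_t)val, 0, 0, 0, 0);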
+ *
+ * 2.10.3 Return value
+ *
+ * None.
+ *
+ * 2.10.4 Caller's context
+ *
+ * dtrace_probe() may be called in virtually any context: kernel, user,
+ * interrupt, high-level interrupt, with arbitrary adaptive locks held, with
+ * dispatcher locks held, with interrupts disabled, etc. The only latitude
+ * that must be afforded to DTrace is the ability to make calls within
+ * itself (and to its in-kernel subroutines) and the ability to access
+ * arbitrary (but mapped) memory. On some platforms, this constrains
+ * context. For example, on UltraSPARC, dtrace_probe() cannot be called
+ * from any context in which TL is greater than zero. dtrace_probe() may
+ * also not be called from any routine which may be called by dtrace_probe()
+ * -- which includes functions in the DTrace framework and some in-kernel
+ * DTrace subroutines. All such functions begin with "dtrace_"; providers
+ * that instrument the kernel arbitrarily should be sure to not instrument
+ * these routines.
+ */
+typedef struct dtrace_pops {
+ void (*dtps_provide)(void *arg, dtrace_probedesc_t *spec);
+ void (*dtps_provide_module)(void *arg, modctl_t *mp);
+ void (*dtps_enable)(void *arg, dtrace_id_t id, void *parg);
+ void (*dtps_disable)(void *arg, dtrace_id_t id, void *parg);
+ void (*dtps_suspend)(void *arg, dtrace_id_t id, void *parg);
+ void (*dtps_resume)(void *arg, dtrace_id_t id, void *parg);
+ void (*dtps_getargdesc)(void *arg, dtrace_id_t id, void *parg,
+ dtrace_argdesc_t *desc);
+ uint64_t (*dtps_getargval)(void *arg, dtrace_id_t id, void *parg,
+ int argno, int aframes);
+ int (*dtps_usermode)(void *arg, dtrace_id_t id, void *parg);
+ void (*dtps_destroy)(void *arg, dtrace_id_t id, void *parg);
+} dtrace_pops_t;
+
+#define DTRACE_MODE_KERNEL 0x01
+#define DTRACE_MODE_USER 0x02
+#define DTRACE_MODE_NOPRIV_DROP 0x10
+#define DTRACE_MODE_NOPRIV_RESTRICT 0x20
+#define DTRACE_MODE_LIMITEDPRIV_RESTRICT 0x40
+
+typedef uintptr_t dtrace_provider_id_t;
+
+extern int dtrace_register(const char *, const dtrace_pattr_t *, uint32_t,
+ cred_t *, const dtrace_pops_t *, void *, dtrace_provider_id_t *);
+extern int dtrace_unregister(dtrace_provider_id_t);
+extern int dtrace_condense(dtrace_provider_id_t);
+extern void dtrace_invalidate(dtrace_provider_id_t);
+extern dtrace_id_t dtrace_probe_lookup(dtrace_provider_id_t, char *,
+ char *, char *);
+extern dtrace_id_t dtrace_probe_create(dtrace_provider_id_t, const char *,
+ const char *, const char *, int, void *);
+extern void *dtrace_probe_arg(dtrace_provider_id_t, dtrace_id_t);
+extern void dtrace_probe(dtrace_id_t, uintptr_t arg0, uintptr_t arg1,
+ uintptr_t arg2, uintptr_t arg3, uintptr_t arg4);
+
+/*
+ * DTrace Meta Provider API
+ *
+ * The following functions are implemented by the DTrace framework and are
+ * used to implement meta providers. Meta providers plug into the DTrace
+ * framework and are used to instantiate new providers on the fly. At
+ * present, there is only one type of meta provider and only one meta
+ * provider may be registered with the DTrace framework at a time. The
+ * sole meta provider type provides user-land static tracing facilities
+ * by taking meta probe descriptions and adding a corresponding provider
+ * into the DTrace framework.
+ *
+ * 1 Framework-to-Provider
+ *
+ * 1.1 Overview
+ *
+ * The Framework-to-Provider API is represented by the dtrace_mops structure
+ * that the meta provider passes to the framework when registering itself as
+ * a meta provider. This structure consists of the following members:
+ *
+ * dtms_create_probe() <-- Add a new probe to a created provider
+ * dtms_provide_pid() <-- Create a new provider for a given process
+ * dtms_remove_pid() <-- Remove a previously created provider
+ *
+ * 1.2 void dtms_create_probe(void *arg, void *parg,
+ * dtrace_helper_probedesc_t *probedesc);
+ *
+ * 1.2.1 Overview
+ *
+ * Called by the DTrace framework to create a new probe in a provider
+ * created by this meta provider.
+ *
+ * 1.2.2 Arguments and notes
+ *
+ * The first argument is the cookie as passed to dtrace_meta_register().
+ * The second argument is the provider cookie for the associated provider;
+ * this is obtained from the return value of dtms_provide_pid(). The third
+ * argument is the helper probe description.
+ *
+ * 1.2.3 Return value
+ *
+ * None
+ *
+ * 1.2.4 Caller's context
+ *
+ * dtms_create_probe() is called from either ioctl() or module load context
+ * in the context of a newly-created provider (that is, a provider that
+ * is a result of a call to dtms_provide_pid()). The DTrace framework is
+ * locked in such a way that meta providers may not register or unregister,
+ * such that no other thread can call into a meta provider operation and that
+ * atomicity is assured with respect to meta provider operations across
+ * dtms_provide_pid() and subsequent calls to dtms_create_probe().
+ * The context is thus effectively single-threaded with respect to the meta
+ * provider, and the meta provider cannot call dtrace_meta_register()
+ * or dtrace_meta_unregister(). However, the context is such that the
+ * provider may (and is expected to) call provider-related DTrace provider
+ * APIs including dtrace_probe_create().
+ *
+ * 1.3 void *dtms_provide_pid(void *arg, dtrace_helper_provdesc_t *mprov,
+ * pid_t pid)
+ *
+ * 1.3.1 Overview
+ *
+ * Called by the DTrace framework to instantiate a new provider given the
+ * description of the provider and probes in the mprov argument. The
+ * meta provider should call dtrace_register() to insert the new provider
+ * into the DTrace framework.
+ *
+ * 1.3.2 Arguments and notes
+ *
+ * The first argument is the cookie as passed to dtrace_meta_register().
+ * The second argument is a pointer to a structure describing the new
+ * helper provider. The third argument is the process identifier for the
+ * process associated with this new provider. Note that the name of the
+ * provider as passed to dtrace_register() should be the concatenation of
+ * the dthpv_provname member of the mprov argument and the process
+ * identifier as a string.
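+ *
+ * For example (a sketch; "name" is a local buffer assumed to be large
+ * enough), the provider name for a dthpv_provname of "xmp" and pid 1234
+ * would be built as "xmp1234":
+ *
+ *	(void) snprintf(name, sizeof (name), "%s%d",
+ *	    mprov->dthpv_provname, (int)pid);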
+ *
+ * 1.3.3 Return value
+ *
+ * The cookie for the provider that the meta provider creates. This is
+ * the same value that it passed to dtrace_register().
+ *
+ * 1.3.4 Caller's context
+ *
+ * dtms_provide_pid() is called from either ioctl() or module load context.
+ * The DTrace framework is locked in such a way that meta providers may not
+ * register or unregister. This means that the meta provider cannot call
+ * dtrace_meta_register() or dtrace_meta_unregister(). However, the context
+ * is such that the provider may -- and is expected to -- call
+ * provider-related DTrace provider APIs including dtrace_register().
+ *
+ * 1.4 void dtms_remove_pid(void *arg, dtrace_helper_provdesc_t *mprov,
+ * pid_t pid)
+ *
+ * 1.4.1 Overview
+ *
+ * Called by the DTrace framework to remove a provider that had previously
+ * been instantiated via the dtms_provide_pid() entry point. The meta
+ * provider need not remove the provider immediately, but this entry
+ * point indicates that the provider should be removed as soon as possible
+ * using the dtrace_unregister() API.
+ *
+ * 1.4.2 Arguments and notes
+ *
+ * The first argument is the cookie as passed to dtrace_meta_register().
+ * The second argument is a pointer to a structure describing the helper
+ * provider. The third argument is the process identifier for the process
+ * associated with this provider.
+ *
+ * 1.4.3 Return value
+ *
+ * None
+ *
+ * 1.4.4 Caller's context
+ *
+ * dtms_remove_pid() is called from either ioctl() or exit() context.
+ * The DTrace framework is locked in such a way that meta providers may not
+ * register or unregister. This means that the meta provider cannot call
+ * dtrace_meta_register() or dtrace_meta_unregister(). However, the context
+ * is such that the provider may -- and is expected to -- call
+ * provider-related DTrace provider APIs including dtrace_unregister().
+ */
+typedef struct dtrace_helper_probedesc {
+ char *dthpb_mod; /* probe module */
+ char *dthpb_func; /* probe function */
+ char *dthpb_name; /* probe name */
+ uint64_t dthpb_base; /* base address */
+ uint32_t *dthpb_offs; /* offsets array */
+ uint32_t *dthpb_enoffs; /* is-enabled offsets array */
+ uint32_t dthpb_noffs; /* offsets count */
+ uint32_t dthpb_nenoffs; /* is-enabled offsets count */
+ uint8_t *dthpb_args; /* argument mapping array */
+ uint8_t dthpb_xargc; /* translated argument count */
+ uint8_t dthpb_nargc; /* native argument count */
+ char *dthpb_xtypes; /* translated types strings */
+ char *dthpb_ntypes; /* native types strings */
+} dtrace_helper_probedesc_t;
+
+typedef struct dtrace_helper_provdesc {
+ char *dthpv_provname; /* provider name */
+ dtrace_pattr_t dthpv_pattr; /* stability attributes */
+} dtrace_helper_provdesc_t;
+
+typedef struct dtrace_mops {
+ void (*dtms_create_probe)(void *, void *, dtrace_helper_probedesc_t *);
+ void *(*dtms_provide_pid)(void *, dtrace_helper_provdesc_t *, pid_t);
+ void (*dtms_remove_pid)(void *, dtrace_helper_provdesc_t *, pid_t);
+} dtrace_mops_t;
+
+typedef uintptr_t dtrace_meta_provider_id_t;
+
+extern int dtrace_meta_register(const char *, const dtrace_mops_t *, void *,
+ dtrace_meta_provider_id_t *);
+extern int dtrace_meta_unregister(dtrace_meta_provider_id_t);
+
+/*
+ * DTrace Kernel Hooks
+ *
+ * The following functions are implemented by the base kernel and form a set of
+ * hooks used by the DTrace framework. DTrace hooks are implemented in either
+ * uts/common/os/dtrace_subr.c, an ISA-specific assembly file, or in a
+ * uts/<platform>/os/dtrace_subr.c corresponding to each hardware platform.
+ */
+
+typedef enum dtrace_vtime_state {
+ DTRACE_VTIME_INACTIVE = 0, /* No DTrace, no TNF */
+ DTRACE_VTIME_ACTIVE, /* DTrace virtual time, no TNF */
+ DTRACE_VTIME_INACTIVE_TNF, /* No DTrace, TNF active */
+ DTRACE_VTIME_ACTIVE_TNF /* DTrace virtual time _and_ TNF */
+} dtrace_vtime_state_t;
+
+#ifdef illumos
+extern dtrace_vtime_state_t dtrace_vtime_active;
+#endif
+extern void dtrace_vtime_switch(kthread_t *next);
+extern void dtrace_vtime_enable_tnf(void);
+extern void dtrace_vtime_disable_tnf(void);
+extern void dtrace_vtime_enable(void);
+extern void dtrace_vtime_disable(void);
+
+struct regs;
+struct reg;
+
+#ifdef illumos
+extern int (*dtrace_pid_probe_ptr)(struct reg *);
+extern int (*dtrace_return_probe_ptr)(struct reg *);
+extern void (*dtrace_fasttrap_fork_ptr)(proc_t *, proc_t *);
+extern void (*dtrace_fasttrap_exec_ptr)(proc_t *);
+extern void (*dtrace_fasttrap_exit_ptr)(proc_t *);
+extern void dtrace_fasttrap_fork(proc_t *, proc_t *);
+#endif
+
+typedef uintptr_t dtrace_icookie_t;
+typedef void (*dtrace_xcall_t)(void *);
+
+extern dtrace_icookie_t dtrace_interrupt_disable(void);
+extern void dtrace_interrupt_enable(dtrace_icookie_t);
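+
+/*
+ * A typical usage sketch: callers bracket a short critical section with
+ * these routines, restoring the previous interrupt state via the cookie:
+ *
+ *	dtrace_icookie_t cookie = dtrace_interrupt_disable();
+ *	... critical section ...
+ *	dtrace_interrupt_enable(cookie);
+ */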
+
+extern void dtrace_membar_producer(void);
+extern void dtrace_membar_consumer(void);
+
+extern void (*dtrace_cpu_init)(processorid_t);
+#ifdef illumos
+extern void (*dtrace_modload)(modctl_t *);
+extern void (*dtrace_modunload)(modctl_t *);
+#endif
+extern void (*dtrace_helpers_cleanup)(void);
+extern void (*dtrace_helpers_fork)(proc_t *parent, proc_t *child);
+extern void (*dtrace_cpustart_init)(void);
+extern void (*dtrace_cpustart_fini)(void);
+extern void (*dtrace_closef)(void);
+
+extern void (*dtrace_debugger_init)(void);
+extern void (*dtrace_debugger_fini)(void);
+extern dtrace_cacheid_t dtrace_predcache_id;
+
+#ifdef illumos
+extern hrtime_t dtrace_gethrtime(void);
+#else
+void dtrace_debug_printf(const char *, ...) __printflike(1, 2);
+#endif
+extern void dtrace_sync(void);
+extern void dtrace_toxic_ranges(void (*)(uintptr_t, uintptr_t));
+extern void dtrace_xcall(processorid_t, dtrace_xcall_t, void *);
+extern void dtrace_vpanic(const char *, __va_list);
+extern void dtrace_panic(const char *, ...);
+
+extern int dtrace_safe_defer_signal(void);
+extern void dtrace_safe_synchronous_signal(void);
+
+extern int dtrace_mach_aframes(void);
+
+#if defined(__i386) || defined(__amd64)
+extern int dtrace_instr_size(uchar_t *instr);
+extern int dtrace_instr_size_isa(uchar_t *, model_t, int *);
+extern void dtrace_invop_callsite(void);
+#endif
+extern void dtrace_invop_add(int (*)(uintptr_t, struct trapframe *, uintptr_t));
+extern void dtrace_invop_remove(int (*)(uintptr_t, struct trapframe *,
+ uintptr_t));
+
+#ifdef __sparc
+extern int dtrace_blksuword32(uintptr_t, uint32_t *, int);
+extern void dtrace_getfsr(uint64_t *);
+#endif
+
+#ifndef illumos
+extern void dtrace_helpers_duplicate(proc_t *, proc_t *);
+extern void dtrace_helpers_destroy(proc_t *);
+#endif
+
+#define DTRACE_CPUFLAG_ISSET(flag) \
+ (cpu_core[curcpu].cpuc_dtrace_flags & (flag))
+
+#define DTRACE_CPUFLAG_SET(flag) \
+ (cpu_core[curcpu].cpuc_dtrace_flags |= (flag))
+
+#define DTRACE_CPUFLAG_CLEAR(flag) \
+ (cpu_core[curcpu].cpuc_dtrace_flags &= ~(flag))
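+
+/*
+ * Usage sketch: code that must load from possibly-unmapped addresses
+ * brackets the access with the no-fault flag (CPU_DTRACE_NOFAULT is one
+ * of the cpuc_dtrace_flags values defined elsewhere in this header):
+ *
+ *	DTRACE_CPUFLAG_SET(CPU_DTRACE_NOFAULT);
+ *	value = *(uintptr_t *)addr;
+ *	DTRACE_CPUFLAG_CLEAR(CPU_DTRACE_NOFAULT);
+ */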
+
+#endif /* _KERNEL */
+
+#endif /* _ASM */
+
+#if defined(__i386) || defined(__amd64)
+
+#define DTRACE_INVOP_PUSHL_EBP 1
+#define DTRACE_INVOP_PUSHQ_RBP DTRACE_INVOP_PUSHL_EBP
+#define DTRACE_INVOP_POPL_EBP 2
+#define DTRACE_INVOP_POPQ_RBP DTRACE_INVOP_POPL_EBP
+#define DTRACE_INVOP_LEAVE 3
+#define DTRACE_INVOP_NOP 4
+#define DTRACE_INVOP_RET 5
+
+#elif defined(__powerpc__)
+
+#define DTRACE_INVOP_BCTR 1
+#define DTRACE_INVOP_BLR 2
+#define DTRACE_INVOP_JUMP 3
+#define DTRACE_INVOP_MFLR_R0 4
+#define DTRACE_INVOP_NOP 5
+
+#elif defined(__arm__)
+
+#define DTRACE_INVOP_SHIFT 4
+#define DTRACE_INVOP_MASK ((1 << DTRACE_INVOP_SHIFT) - 1)
+#define DTRACE_INVOP_DATA(x) ((x) >> DTRACE_INVOP_SHIFT)
+
+#define DTRACE_INVOP_PUSHM 1
+#define DTRACE_INVOP_POPM 2
+#define DTRACE_INVOP_B 3
+
+#elif defined(__aarch64__)
+
+#define INSN_SIZE 4
+
+#define B_MASK 0xff000000
+#define B_DATA_MASK 0x00ffffff
+#define B_INSTR 0x14000000
+
+#define RET_INSTR 0xd65f03c0
+
+#define LDP_STP_MASK 0xffc00000
+#define STP_32 0x29800000
+#define STP_64 0xa9800000
+#define LDP_32 0x28c00000
+#define LDP_64 0xa8c00000
+#define LDP_STP_PREIND (1 << 24)
+#define LDP_STP_DIR (1 << 22) /* Load instruction */
+#define ARG1_SHIFT 0
+#define ARG1_MASK 0x1f
+#define ARG2_SHIFT 10
+#define ARG2_MASK 0x1f
+#define OFFSET_SHIFT 15
+#define OFFSET_SIZE 7
+#define OFFSET_MASK ((1 << OFFSET_SIZE) - 1)
+
+#define DTRACE_INVOP_PUSHM 1
+#define DTRACE_INVOP_RET 2
+#define DTRACE_INVOP_B 3
+
+#elif defined(__mips__)
+
+#define INSN_SIZE 4
+
+/* Load/Store double RA to/from SP */
+#define LDSD_RA_SP_MASK 0xffff0000
+#define LDSD_DATA_MASK 0x0000ffff
+#define SD_RA_SP 0xffbf0000
+#define LD_RA_SP 0xdfbf0000
+
+#define DTRACE_INVOP_SD 1
+#define DTRACE_INVOP_LD 2
+
+#elif defined(__riscv)
+
+#define DTRACE_INVOP_SD 1
+#define DTRACE_INVOP_C_SDSP 2
+#define DTRACE_INVOP_RET 3
+#define DTRACE_INVOP_C_RET 4
+#define DTRACE_INVOP_NOP 5
+
+#endif
+
+#ifdef __cplusplus
+}
+#endif
+
+#endif /* _SYS_DTRACE_H */
diff --git a/sys/cddl/contrib/opensolaris/uts/common/sys/dtrace_impl.h b/sys/cddl/contrib/opensolaris/uts/common/sys/dtrace_impl.h
new file mode 100644
index 000000000000..0b8df9834fa6
--- /dev/null
+++ b/sys/cddl/contrib/opensolaris/uts/common/sys/dtrace_impl.h
@@ -0,0 +1,1351 @@
+/*
+ * CDDL HEADER START
+ *
+ * The contents of this file are subject to the terms of the
+ * Common Development and Distribution License (the "License").
+ * You may not use this file except in compliance with the License.
+ *
+ * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
+ * or http://www.opensolaris.org/os/licensing.
+ * See the License for the specific language governing permissions
+ * and limitations under the License.
+ *
+ * When distributing Covered Code, include this CDDL HEADER in each
+ * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
+ * If applicable, add the following below this CDDL HEADER, with the
+ * fields enclosed by brackets "[]" replaced with your own identifying
+ * information: Portions Copyright [yyyy] [name of copyright owner]
+ *
+ * CDDL HEADER END
+ *
+ * $FreeBSD$
+ */
+
+/*
+ * Copyright 2007 Sun Microsystems, Inc. All rights reserved.
+ * Use is subject to license terms.
+ */
+
+/*
+ * Copyright 2016 Joyent, Inc.
+ * Copyright (c) 2012 by Delphix. All rights reserved.
+ */
+
+#ifndef _SYS_DTRACE_IMPL_H
+#define _SYS_DTRACE_IMPL_H
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+/*
+ * DTrace Dynamic Tracing Software: Kernel Implementation Interfaces
+ *
+ * Note: The contents of this file are private to the implementation of the
+ * Solaris system and DTrace subsystem and are subject to change at any time
+ * without notice. Applications and drivers using these interfaces will fail
+ * to run on future releases. These interfaces should not be used for any
+ * purpose except those expressly outlined in dtrace(7D) and libdtrace(3LIB).
+ * Please refer to the "Solaris Dynamic Tracing Guide" for more information.
+ */
+
+#include <sys/dtrace.h>
+
+#ifndef illumos
+#ifdef __sparcv9
+typedef uint32_t pc_t;
+#else
+typedef uintptr_t pc_t;
+#endif
+typedef u_long greg_t;
+#endif
+
+/*
+ * DTrace Implementation Constants and Typedefs
+ */
+#define DTRACE_MAXPROPLEN 128
+#define DTRACE_DYNVAR_CHUNKSIZE 256
+
+#ifdef __FreeBSD__
+#define NCPU MAXCPU
+#endif /* __FreeBSD__ */
+
+struct dtrace_probe;
+struct dtrace_ecb;
+struct dtrace_predicate;
+struct dtrace_action;
+struct dtrace_provider;
+struct dtrace_state;
+
+typedef struct dtrace_probe dtrace_probe_t;
+typedef struct dtrace_ecb dtrace_ecb_t;
+typedef struct dtrace_predicate dtrace_predicate_t;
+typedef struct dtrace_action dtrace_action_t;
+typedef struct dtrace_provider dtrace_provider_t;
+typedef struct dtrace_meta dtrace_meta_t;
+typedef struct dtrace_state dtrace_state_t;
+typedef uint32_t dtrace_optid_t;
+typedef uint32_t dtrace_specid_t;
+typedef uint64_t dtrace_genid_t;
+
+/*
+ * DTrace Probes
+ *
+ * The probe is the fundamental unit of the DTrace architecture. Probes are
+ * created by DTrace providers, and managed by the DTrace framework. A probe
+ * is identified by a unique <provider, module, function, name> tuple, and has
+ * a unique probe identifier assigned to it. (Some probes are not associated
+ * with a specific point in text; these are called _unanchored probes_ and have
+ * no module or function associated with them.) Probes are represented as a
+ * dtrace_probe structure. To allow quick lookups based on each element of the
+ * probe tuple, probes are hashed by each of provider, module, function and
+ * name. (If a lookup is performed based on a regular expression, a
+ * dtrace_probekey is prepared, and a linear search is performed.) Each probe
+ * is additionally pointed to by a linear array indexed by its identifier. The
+ * identifier is the provider's mechanism for indicating to the DTrace
+ * framework that a probe has fired: the identifier is passed as the first
+ * argument to dtrace_probe(), where it is then mapped into the corresponding
+ * dtrace_probe structure. From the dtrace_probe structure, dtrace_probe() can
+ * iterate over the probe's list of enabling control blocks; see "DTrace
+ * Enabling Control Blocks", below.
+ */
+struct dtrace_probe {
+ dtrace_id_t dtpr_id; /* probe identifier */
+ dtrace_ecb_t *dtpr_ecb; /* ECB list; see below */
+ dtrace_ecb_t *dtpr_ecb_last; /* last ECB in list */
+ void *dtpr_arg; /* provider argument */
+ dtrace_cacheid_t dtpr_predcache; /* predicate cache ID */
+ int dtpr_aframes; /* artificial frames */
+ dtrace_provider_t *dtpr_provider; /* pointer to provider */
+ char *dtpr_mod; /* probe's module name */
+ char *dtpr_func; /* probe's function name */
+ char *dtpr_name; /* probe's name */
+ dtrace_probe_t *dtpr_nextmod; /* next in module hash */
+ dtrace_probe_t *dtpr_prevmod; /* previous in module hash */
+ dtrace_probe_t *dtpr_nextfunc; /* next in function hash */
+ dtrace_probe_t *dtpr_prevfunc; /* previous in function hash */
+ dtrace_probe_t *dtpr_nextname; /* next in name hash */
+ dtrace_probe_t *dtpr_prevname; /* previous in name hash */
+ dtrace_genid_t dtpr_gen; /* probe generation ID */
+};
+
+typedef int dtrace_probekey_f(const char *, const char *, int);
+
+typedef struct dtrace_probekey {
+ char *dtpk_prov; /* provider name to match */
+ dtrace_probekey_f *dtpk_pmatch; /* provider matching function */
+ char *dtpk_mod; /* module name to match */
+ dtrace_probekey_f *dtpk_mmatch; /* module matching function */
+ char *dtpk_func; /* func name to match */
+ dtrace_probekey_f *dtpk_fmatch; /* func matching function */
+ char *dtpk_name; /* name to match */
+ dtrace_probekey_f *dtpk_nmatch; /* name matching function */
+ dtrace_id_t dtpk_id; /* identifier to match */
+} dtrace_probekey_t;
+
+typedef struct dtrace_hashbucket {
+ struct dtrace_hashbucket *dthb_next; /* next on hash chain */
+ dtrace_probe_t *dthb_chain; /* chain of probes */
+ int dthb_len; /* number of probes here */
+} dtrace_hashbucket_t;
+
+typedef struct dtrace_hash {
+ dtrace_hashbucket_t **dth_tab; /* hash table */
+ int dth_size; /* size of hash table */
+ int dth_mask; /* mask to index into table */
+ int dth_nbuckets; /* total number of buckets */
+ uintptr_t dth_nextoffs; /* offset of next in probe */
+ uintptr_t dth_prevoffs; /* offset of prev in probe */
+ uintptr_t dth_stroffs; /* offset of str in probe */
+} dtrace_hash_t;
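+
+/*
+ * Illustrative sketch: the framework builds, e.g., its module hash by
+ * passing offsets within dtrace_probe_t to its (private) hash
+ * constructor, roughly:
+ *
+ *	hash = dtrace_hash_create(offsetof(dtrace_probe_t, dtpr_mod),
+ *	    offsetof(dtrace_probe_t, dtpr_nextmod),
+ *	    offsetof(dtrace_probe_t, dtpr_prevmod));
+ */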
+
+/*
+ * DTrace Enabling Control Blocks
+ *
+ * When a provider wishes to fire a probe, it calls into dtrace_probe(),
+ * passing the probe identifier as the first argument. As described above,
+ * dtrace_probe() maps the identifier into a pointer to a dtrace_probe_t
+ * structure. This structure contains information about the probe, and a
+ * pointer to the list of Enabling Control Blocks (ECBs). Each ECB points to
+ * DTrace consumer state, and contains an optional predicate, and a list of
+ * actions. (Shown schematically below.) The ECB abstraction allows a single
+ * probe to be multiplexed across disjoint consumers, or across disjoint
+ * enablings of a single probe within one consumer.
+ *
+ * Enabling Control Block
+ * dtrace_ecb_t
+ * +------------------------+
+ * | dtrace_epid_t ---------+--------------> Enabled Probe ID (EPID)
+ * | dtrace_state_t * ------+--------------> State associated with this ECB
+ * | dtrace_predicate_t * --+---------+
+ * | dtrace_action_t * -----+----+ |
+ * | dtrace_ecb_t * ---+ | | | Predicate (if any)
+ * +-------------------+----+ | | dtrace_predicate_t
+ * | | +---> +--------------------+
+ * | | | dtrace_difo_t * ---+----> DIFO
+ * | | +--------------------+
+ * | |
+ * Next ECB | | Action
+ * (if any) | | dtrace_action_t
+ * : +--> +-------------------+
+ * : | dtrace_actkind_t -+------> kind
+ * v | dtrace_difo_t * --+------> DIFO (if any)
+ * | dtrace_recdesc_t -+------> record descr.
+ * | dtrace_action_t * +------+
+ * +-------------------+ |
+ * | Next action
+ * +-------------------------------+ (if any)
+ * |
+ * | Action
+ * | dtrace_action_t
+ * +--> +-------------------+
+ * | dtrace_actkind_t -+------> kind
+ * | dtrace_difo_t * --+------> DIFO (if any)
+ * | dtrace_action_t * +------+
+ * +-------------------+ |
+ * | Next action
+ * +-------------------------------+ (if any)
+ * |
+ * :
+ * v
+ *
+ *
+ * dtrace_probe() iterates over the ECB list. If the ECB needs less space
+ * than is available in the principal buffer, the ECB is processed: if the
+ * predicate is non-NULL, the DIF object is executed. If the result is
+ * non-zero, the action list is processed, with each action being executed
+ * accordingly. When the action list has been completely executed, processing
+ * advances to the next ECB. The ECB abstraction allows disjoint consumers
+ * to multiplex on single probes.
+ *
+ * Execution of the ECB results in consuming dte_size bytes in the buffer
+ * to record data. During execution, dte_needed bytes must be available in
+ * the buffer. This space is used for both recorded data and tuple data.
+ */
+struct dtrace_ecb {
+ dtrace_epid_t dte_epid; /* enabled probe ID */
+ uint32_t dte_alignment; /* required alignment */
+ size_t dte_needed; /* space needed for execution */
+ size_t dte_size; /* size of recorded payload */
+ dtrace_predicate_t *dte_predicate; /* predicate, if any */
+ dtrace_action_t *dte_action; /* actions, if any */
+ dtrace_ecb_t *dte_next; /* next ECB on probe */
+ dtrace_state_t *dte_state; /* pointer to state */
+ uint32_t dte_cond; /* security condition */
+ dtrace_probe_t *dte_probe; /* pointer to probe */
+ dtrace_action_t *dte_action_last; /* last action on ECB */
+ uint64_t dte_uarg; /* library argument */
+};
+
+struct dtrace_predicate {
+ dtrace_difo_t *dtp_difo; /* DIF object */
+ dtrace_cacheid_t dtp_cacheid; /* cache identifier */
+ int dtp_refcnt; /* reference count */
+};
+
+struct dtrace_action {
+ dtrace_actkind_t dta_kind; /* kind of action */
+ uint16_t dta_intuple; /* boolean: in aggregation */
+ uint32_t dta_refcnt; /* reference count */
+ dtrace_difo_t *dta_difo; /* pointer to DIFO */
+ dtrace_recdesc_t dta_rec; /* record description */
+ dtrace_action_t *dta_prev; /* previous action */
+ dtrace_action_t *dta_next; /* next action */
+};
+
+typedef struct dtrace_aggregation {
+ dtrace_action_t dtag_action; /* action; must be first */
+ dtrace_aggid_t dtag_id; /* identifier */
+ dtrace_ecb_t *dtag_ecb; /* corresponding ECB */
+ dtrace_action_t *dtag_first; /* first action in tuple */
+ uint32_t dtag_base; /* base of aggregation */
+ uint8_t dtag_hasarg; /* boolean: has argument */
+ uint64_t dtag_initial; /* initial value */
+ void (*dtag_aggregate)(uint64_t *, uint64_t, uint64_t);
+} dtrace_aggregation_t;
+
+/*
+ * DTrace Buffers
+ *
+ * Principal buffers, aggregation buffers, and speculative buffers are all
+ * managed with the dtrace_buffer structure. By default, this structure
+ * includes twin data buffers -- dtb_tomax and dtb_xamot -- that serve as the
+ * active and passive buffers, respectively. For speculative buffers,
+ * dtb_xamot will be NULL; for "ring" and "fill" buffers, dtb_xamot will point
+ * to a scratch buffer. For all buffer types, the dtrace_buffer structure is
+ * always allocated on a per-CPU basis; a single dtrace_buffer structure is
+ * never shared among CPUs. (That is, there is never true sharing of the
+ * dtrace_buffer structure; to prevent false sharing of the structure, it must
+ * always be aligned to the coherence granularity -- generally 64 bytes.)
+ *
+ * One of the critical design decisions of DTrace is that a given ECB always
+ * stores the same quantity and type of data. This is done to assure that the
+ * only metadata required for an ECB's traced data is the EPID. That is, from
+ * the EPID, the consumer can determine the data layout. (The data buffer
+ * layout is shown schematically below.) By assuring that one can determine
+ * data layout from the EPID, the metadata stream can be separated from the
+ * data stream -- simplifying the data stream enormously. A dtrace_rechdr_t
+ * structure, which includes the EPID and a high-resolution timestamp used
+ * for output ordering consistency, always precedes the recorded data.
+ *
+ * base of data buffer ---> +--------+--------------------+--------+
+ * | rechdr | data | rechdr |
+ * +--------+------+--------+----+--------+
+ * | data | rechdr | data |
+ * +---------------+--------+-------------+
+ * | data, cont. |
+ * +--------+--------------------+--------+
+ * | rechdr | data | |
+ * +--------+--------------------+ |
+ * | || |
+ * | || |
+ * | \/ |
+ * : :
+ * . .
+ * . .
+ * . .
+ * : :
+ * | |
+ * limit of data buffer ---> +--------------------------------------+
+ *
+ * When evaluating an ECB, dtrace_probe() determines if the ECB's needs of the
+ * principal buffer (both scratch and payload) exceed the available space. If
+ * the ECB's needs exceed available space (and if the principal buffer policy
+ * is the default "switch" policy), the ECB is dropped, the buffer's drop count
+ * is incremented, and processing advances to the next ECB. If the ECB's needs
+ * can be met with the available space, the ECB is processed, but the offset in
+ * the principal buffer is only advanced if the ECB completes processing
+ * without error.
+ *
+ * When a buffer is to be switched (either because the buffer is the principal
+ * buffer with a "switch" policy or because it is an aggregation buffer), a
+ * cross call is issued to the CPU associated with the buffer. In the cross
+ * call context, interrupts are disabled, and the active and the inactive
+ * buffers are atomically switched. This involves switching the data pointers,
+ * copying the various state fields (offset, drops, errors, etc.) into their
+ * inactive equivalents, and clearing the state fields. Because interrupts are
+ * disabled during this procedure, the switch is guaranteed to appear atomic to
+ * dtrace_probe().
+ *
+ * DTrace Ring Buffering
+ *
+ * To process a ring buffer correctly, one must know the oldest valid record.
+ * Processing starts at the oldest record in the buffer and continues until
+ * the end of the buffer is reached. Processing then resumes starting with
+ * the record stored at offset 0 in the buffer, and continues until the
+ * youngest record is processed. If trace records are of a fixed length,
+ * determining the oldest record is trivial:
+ *
+ * - If the ring buffer has not wrapped, the oldest record is the record
+ * stored at offset 0.
+ *
+ * - If the ring buffer has wrapped, the oldest record is the record stored
+ * at the current offset.
+ *
+ * With variable length records, however, just knowing the current offset
+ * doesn't suffice for determining the oldest valid record: assuming that one
+ * allows for arbitrary data, one has no way of searching forward from the
+ * current offset to find the oldest valid record. (That is, one has no way
+ * of separating data from metadata.) It would be possible to simply refuse to
+ * process any data in the ring buffer between the current offset and the
+ * limit, but this leaves (potentially) an enormous amount of otherwise valid
+ * data unprocessed.
+ *
+ * To effect ring buffering, we track two offsets in the buffer: the current
+ * offset and the _wrapped_ offset. If a request is made to reserve some
+ * amount of data, and the buffer has wrapped, the wrapped offset is
+ * incremented until the wrapped offset minus the current offset is greater
+ * than or equal to the reserve request. This is done by repeatedly looking
+ * up the ECB corresponding to the EPID at the current wrapped offset, and
+ * incrementing the wrapped offset by the size of the data payload
+ * corresponding to that ECB. If this offset is greater than or equal to the
+ * limit of the data buffer, the wrapped offset is set to 0. Thus, the
+ * current offset effectively "chases" the wrapped offset around the buffer.
+ * Schematically:
+ *
+ * base of data buffer ---> +------+--------------------+------+
+ * | EPID | data | EPID |
+ * +------+--------+------+----+------+
+ * | data | EPID | data |
+ * +---------------+------+-----------+
+ * | data, cont. |
+ * +------+---------------------------+
+ * | EPID | data |
+ * current offset ---> +------+---------------------------+
+ * | invalid data |
+ * wrapped offset ---> +------+--------------------+------+
+ * | EPID | data | EPID |
+ * +------+--------+------+----+------+
+ * | data | EPID | data |
+ * +---------------+------+-----------+
+ * : :
+ * . .
+ * . ... valid data ... .
+ * . .
+ * : :
+ * +------+-------------+------+------+
+ * | EPID | data | EPID | data |
+ * +------+------------++------+------+
+ * | data, cont. | leftover |
+ * limit of data buffer ---> +-------------------+--------------+
+ *
+ * If the amount of requested buffer space exceeds the amount of space
+ * available between the current offset and the end of the buffer:
+ *
+ * (1) all words in the data buffer between the current offset and the limit
+ * of the data buffer (marked "leftover", above) are set to
+ * DTRACE_EPIDNONE
+ *
+ * (2) the wrapped offset is set to zero
+ *
+ * (3) the iteration process described above occurs until the wrapped offset
+ * is greater than the amount of desired space.
+ *
+ * The wrapped offset is implemented by (re-)using the inactive offset.
+ * In a "switch" buffer policy, the inactive offset stores the offset in
+ * the inactive buffer; in a "ring" buffer policy, it stores the wrapped
+ * offset.
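+ *
+ * A simplified sketch of the advance described above ("needed" is the
+ * reserve request, "buf" the buffer base, and ecb_size() a stand-in for
+ * the EPID-to-payload-size lookup; the real logic must also handle
+ * DTRACE_EPIDNONE padding and wrap of the current offset):
+ *
+ *	while (wrapped - current < needed) {
+ *		dtrace_epid_t epid = *(dtrace_epid_t *)(buf + wrapped);
+ *
+ *		wrapped += ecb_size(epid);
+ *		if (wrapped >= limit)
+ *			wrapped = 0;
+ *	}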
+ *
+ * DTrace Scratch Buffering
+ *
+ * Some ECBs may wish to allocate dynamically-sized temporary scratch memory.
+ * To accommodate such requests easily, scratch memory may be allocated in
+ * the buffer beyond the current offset plus the needed memory of the current
+ * ECB. If there isn't sufficient room in the buffer for the requested amount
+ * of scratch space, the allocation fails and an error is generated. Scratch
+ * memory is tracked in the dtrace_mstate_t and is automatically freed when
+ * the ECB ceases processing. Note that ring buffers cannot allocate their
+ * scratch from the principal buffer -- lest they needlessly overwrite older,
+ * valid data. Ring buffers therefore have their own dedicated scratch buffer
+ * from which scratch is allocated.
+ */
+#define DTRACEBUF_RING 0x0001 /* bufpolicy set to "ring" */
+#define DTRACEBUF_FILL 0x0002 /* bufpolicy set to "fill" */
+#define DTRACEBUF_NOSWITCH 0x0004 /* do not switch buffer */
+#define DTRACEBUF_WRAPPED 0x0008 /* ring buffer has wrapped */
+#define DTRACEBUF_DROPPED 0x0010 /* drops occurred */
+#define DTRACEBUF_ERROR 0x0020 /* errors occurred */
+#define DTRACEBUF_FULL 0x0040 /* "fill" buffer is full */
+#define DTRACEBUF_CONSUMED 0x0080 /* buffer has been consumed */
+#define DTRACEBUF_INACTIVE 0x0100 /* buffer is not yet active */
+
+typedef struct dtrace_buffer {
+ uint64_t dtb_offset; /* current offset in buffer */
+ uint64_t dtb_size; /* size of buffer */
+ uint32_t dtb_flags; /* flags */
+ uint32_t dtb_drops; /* number of drops */
+ caddr_t dtb_tomax; /* active buffer */
+ caddr_t dtb_xamot; /* inactive buffer */
+ uint32_t dtb_xamot_flags; /* inactive flags */
+ uint32_t dtb_xamot_drops; /* drops in inactive buffer */
+ uint64_t dtb_xamot_offset; /* offset in inactive buffer */
+ uint32_t dtb_errors; /* number of errors */
+ uint32_t dtb_xamot_errors; /* errors in inactive buffer */
+#ifndef _LP64
+ uint64_t dtb_pad1; /* pad out to 64 bytes */
+#endif
+ uint64_t dtb_switched; /* time of last switch */
+ uint64_t dtb_interval; /* observed switch interval */
+ uint64_t dtb_pad2[6]; /* pad to avoid false sharing */
+} dtrace_buffer_t;
+
+/*
+ * DTrace Aggregation Buffers
+ *
+ * Aggregation buffers use much of the same mechanism as described above
+ * ("DTrace Buffers"). However, because an aggregation is fundamentally a
+ * hash, there exists dynamic metadata associated with an aggregation buffer
+ * that is not associated with other kinds of buffers. This aggregation
+ * metadata is _only_ relevant for the in-kernel implementation of
+ * aggregations; it is not relevant to user-level consumers. To keep the
+ * metadata out of the copied-out data, we allocate the dynamic aggregation
+ * metadata (hash keys and hash buckets) downward from the _limit_ of the
+ * buffer, and we allocate data upward from the _base_ of the buffer. When
+ * the aggregation buffer is copied out, _only_ the
+ * data is copied out; the metadata is simply discarded. Schematically,
+ * aggregation buffers look like:
+ *
+ * base of data buffer ---> +-------+------+-----------+-------+
+ * | aggid | key | value | aggid |
+ * +-------+------+-----------+-------+
+ * | key |
+ * +-------+-------+-----+------------+
+ * | value | aggid | key | value |
+ * +-------+------++-----+------+-----+
+ * | aggid | key | value | |
+ * +-------+------+-------------+ |
+ * | || |
+ * | || |
+ * | \/ |
+ * : :
+ * . .
+ * . .
+ * . .
+ * : :
+ * | /\ |
+ * | || +------------+
+ * | || | |
+ * +---------------------+ |
+ * | hash keys |
+ * | (dtrace_aggkey structures) |
+ * | |
+ * +----------------------------------+
+ * | hash buckets |
+ * | (dtrace_aggbuffer structure) |
+ * | |
+ * limit of data buffer ---> +----------------------------------+
+ *
+ *
+ * As implied above, just as we assure that ECBs always store a constant
+ * amount of data, we assure that a given aggregation -- identified by its
+ * aggregation ID -- always stores data of a constant quantity and type.
+ * As with EPIDs, this allows the aggregation ID to serve as the metadata for a
+ * given record.
+ *
+ * Note that the size of the dtrace_aggkey structure must be sizeof (uintptr_t)
+ * aligned. (If the structure changes such that this becomes false, an
+ * assertion will fail in dtrace_aggregate().)
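+ *
+ * A sketch of a lookup under this scheme, using the structures defined
+ * below (the hash computation is elided, and the key comparison, here
+ * keys_match(), is a hypothetical stand-in):
+ *
+ *	dtrace_aggkey_t *key;
+ *	size_t ndx = hashval % agb->dtagb_hashsize;
+ *
+ *	for (key = agb->dtagb_hash[ndx]; key != NULL; key = key->dtak_next) {
+ *		if (key->dtak_hashval == hashval && keys_match(key, kdata))
+ *			break;
+ *	}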
+ */
+typedef struct dtrace_aggkey {
+ uint32_t dtak_hashval; /* hash value */
+ uint32_t dtak_action:4; /* action -- 4 bits */
+ uint32_t dtak_size:28; /* size -- 28 bits */
+ caddr_t dtak_data; /* data pointer */
+ struct dtrace_aggkey *dtak_next; /* next in hash chain */
+} dtrace_aggkey_t;
+
+typedef struct dtrace_aggbuffer {
+ uintptr_t dtagb_hashsize; /* number of buckets */
+ uintptr_t dtagb_free; /* free list of keys */
+ dtrace_aggkey_t **dtagb_hash; /* hash table */
+} dtrace_aggbuffer_t;
+
+/*
+ * DTrace Speculations
+ *
+ * Speculations have a per-CPU buffer and a global state. Once a speculation
+ * buffer has been committed or discarded, it cannot be reused until all CPUs
+ * have taken the same action (commit or discard) on their respective
+ * speculative buffer. However, because DTrace probes may execute in arbitrary
+ * context, other CPUs cannot simply be cross-called at probe firing time to
+ * perform the necessary commit or discard. The speculation states thus
+ * optimize for the case that a speculative buffer is only active on one CPU at
+ * the time of a commit() or discard() -- for if this is the case, other CPUs
+ * need not take action, and the speculation is immediately available for
+ * reuse. If the speculation is active on multiple CPUs, it must be
+ * asynchronously cleaned -- potentially leading to a higher rate of dirty
+ * speculative drops. The speculation states are as follows:
+ *
+ * DTRACESPEC_INACTIVE <= Initial state; inactive speculation
+ * DTRACESPEC_ACTIVE <= Allocated, but not yet speculatively traced to
+ * DTRACESPEC_ACTIVEONE <= Speculatively traced to on one CPU
+ * DTRACESPEC_ACTIVEMANY <= Speculatively traced to on more than one CPU
+ * DTRACESPEC_COMMITTING	<= Currently being committed on one CPU
+ * DTRACESPEC_COMMITTINGMANY	<= Currently being committed on many CPUs
+ * DTRACESPEC_DISCARDING <= Currently being discarded on many CPUs
+ *
+ * The state transition diagram is as follows:
+ *
+ * +----------------------------------------------------------+
+ * | |
+ * | +------------+ |
+ * | +-------------------| COMMITTING |<-----------------+ |
+ * | | +------------+ | |
+ * | | copied spec. ^ commit() on | | discard() on
+ * | | into principal | active CPU | | active CPU
+ * | | | commit() | |
+ * V V | | |
+ * +----------+ +--------+ +-----------+
+ * | INACTIVE |---------------->| ACTIVE |--------------->| ACTIVEONE |
+ * +----------+ speculation() +--------+ speculate() +-----------+
+ * ^ ^ | | |
+ * | | | discard() | |
+ * | | asynchronously | discard() on | | speculate()
+ * | | cleaned V inactive CPU | | on inactive
+ * | | +------------+ | | CPU
+ * | +-------------------| DISCARDING |<-----------------+ |
+ * | +------------+ |
+ * | asynchronously ^ |
+ * | copied spec. | discard() |
+ * | into principal +------------------------+ |
+ * | | V
+ * +----------------+ commit() +------------+
+ * | COMMITTINGMANY |<----------------------------------| ACTIVEMANY |
+ * +----------------+ +------------+
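+ *
+ * The single-CPU fast path relies on atomically advancing dtsp_state; a
+ * sketch of the commit()-on-the-active-CPU transition, in terms of the
+ * dtrace_cas32() primitive declared later in this file (error and retry
+ * handling elided):
+ *
+ *	dtrace_speculation_state_t curr, new;
+ *
+ *	do {
+ *		curr = spec->dtsp_state;
+ *		if (curr != DTRACESPEC_ACTIVEONE)
+ *			break;
+ *		new = DTRACESPEC_COMMITTING;
+ *	} while (dtrace_cas32((uint32_t *)&spec->dtsp_state,
+ *	    curr, new) != curr);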
+ */
+typedef enum dtrace_speculation_state {
+ DTRACESPEC_INACTIVE = 0,
+ DTRACESPEC_ACTIVE,
+ DTRACESPEC_ACTIVEONE,
+ DTRACESPEC_ACTIVEMANY,
+ DTRACESPEC_COMMITTING,
+ DTRACESPEC_COMMITTINGMANY,
+ DTRACESPEC_DISCARDING
+} dtrace_speculation_state_t;
+
+typedef struct dtrace_speculation {
+ dtrace_speculation_state_t dtsp_state; /* current speculation state */
+ int dtsp_cleaning; /* non-zero if being cleaned */
+ dtrace_buffer_t *dtsp_buffer; /* speculative buffer */
+} dtrace_speculation_t;
+
+/*
+ * DTrace Dynamic Variables
+ *
+ * The dynamic variable problem is obviously decomposed into two subproblems:
+ * allocating new dynamic storage, and freeing old dynamic storage. The
+ * presence of the second problem makes the first much more complicated -- or
+ * rather, the absence of the second renders the first trivial. This is the
+ * case with aggregations, for which there is effectively no deallocation of
+ * dynamic storage. (Or more accurately, all dynamic storage is deallocated
+ * when a snapshot is taken of the aggregation.) As DTrace dynamic variables
+ * allow for both dynamic allocation and dynamic deallocation, the
+ * implementation of dynamic variables is quite a bit more complicated than
+ * that of their aggregation kin.
+ *
+ * We observe that allocating new dynamic storage is tricky only because the
+ * size can vary -- the allocation problem is much easier if allocation sizes
+ * are uniform. We further observe that in D, the size of dynamic variables is
+ * actually _not_ dynamic -- dynamic variable sizes may be determined by static
+ * analysis of DIF text. (This is true even of putatively dynamically-sized
+ * objects like strings and stacks, the sizes of which are dictated by the
+ * "stringsize" and "stackframes" variables, respectively.) We exploit this by
+ * performing this analysis on all DIF before enabling any probes. For each
+ * dynamic load or store, we calculate the dynamically-allocated size plus the
+ * size of the dtrace_dynvar structure plus the storage required to key the
+ * data. For all DIF, we take the largest value and dub it the _chunksize_.
+ * We then divide dynamic memory into two parts: a hash table that is wide
+ * enough to have every chunk in its own bucket, and a larger region of equal
+ * chunksize units. Whenever we wish to dynamically allocate a variable, we
+ * always allocate a single chunk of memory. Depending on the uniformity of
+ * allocation, this will waste some amount of memory -- but it eliminates the
+ * non-determinism inherent in traditional heap fragmentation.
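+ *
+ * A sketch of the chunksize computation (difo_keysize() and difo_datasize()
+ * are hypothetical helpers standing in for the per-DIFO static analysis
+ * described above):
+ *
+ *	size_t chunksize = 0, size;
+ *	int i;
+ *
+ *	for (i = 0; i < ndifos; i++) {
+ *		size = sizeof (dtrace_dynvar_t) +
+ *		    difo_keysize(difos[i]) + difo_datasize(difos[i]);
+ *		if (size > chunksize)
+ *			chunksize = size;
+ *	}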
+ *
+ * Dynamic objects are allocated by storing a non-zero value to them; they are
+ * deallocated by storing a zero value to them. Dynamic variables are
+ * complicated enormously by being shared between CPUs. In particular,
+ * consider the following scenario:
+ *
+ * CPU A CPU B
+ * +---------------------------------+ +---------------------------------+
+ * | | | |
+ * | allocates dynamic object a[123] | | |
+ * | by storing the value 345 to it | | |
+ * | ---------> |
+ * | | | wishing to load from object |
+ * | | | a[123], performs lookup in |
+ * | | | dynamic variable space |
+ * | <--------- |
+ * | deallocates object a[123] by | | |
+ * | storing 0 to it | | |
+ * | | | |
+ * | allocates dynamic object b[567] | | performs load from a[123] |
+ * | by storing the value 789 to it | | |
+ * : : : :
+ * . . . .
+ *
+ * This is obviously a race in the D program, but there are nonetheless only
+ * two valid values for CPU B's load from a[123]: 345 or 0. Most importantly,
+ * CPU B may _not_ see the value 789 for a[123].
+ *
+ * There are essentially two ways to deal with this:
+ *
+ * (1) Explicitly spin-lock variables. That is, if CPU B wishes to load
+ * from a[123], it needs to lock a[123] and hold the lock for the
+ * duration that it wishes to manipulate it.
+ *
+ * (2) Avoid reusing freed chunks until it is known that no CPU is referring
+ * to them.
+ *
+ * The implementation of (1) is rife with complexity, because it requires the
+ * user of a dynamic variable to explicitly decree when they are done using it.
+ * Were all variables by value, this perhaps wouldn't be debilitating -- but
+ * dynamic variables of non-scalar types are tracked by reference. That is, if
+ * a dynamic variable is, say, a string, and that variable is to be traced to,
+ * say, the principal buffer, the DIF emulation code returns to the main
+ * dtrace_probe() loop a pointer to the underlying storage, not the contents of
+ * the storage. Further, code calling on DIF emulation would have to be aware
+ * that the DIF emulation has returned a reference to a dynamic variable that
+ * has been potentially locked. The variable would have to be unlocked after
+ * the main dtrace_probe() loop is finished with the variable, and the main
+ * dtrace_probe() loop would have to be careful to not call any further DIF
+ * emulation while the variable is locked to avoid deadlock. More generally,
+ * if one were to implement (1), DIF emulation code dealing with dynamic
+ * variables could only deal with one dynamic variable at a time (lest deadlock
+ * result). To sum, (1) exports too much subtlety to the users of dynamic
+ * variables -- increasing maintenance burden and imposing serious constraints
+ * on future DTrace development.
+ *
+ * The implementation of (2) is also complex, but the complexity is more
+ * manageable. We need to be sure that when a variable is deallocated, it is
+ * not placed on a traditional free list, but rather on a _dirty_ list. Once a
+ * variable is on a dirty list, it cannot be found by CPUs performing a
+ * subsequent lookup of the variable -- but it may still be in use by other
+ * CPUs. To assure that all CPUs that may be seeing the old variable have
+ * cleared out of probe context, a dtrace_sync() can be issued. Once the
+ * dtrace_sync() has completed, it can be known that all CPUs are done
+ * manipulating the dynamic variable -- the dirty list can be atomically
+ * appended to the free list. Unfortunately, there's a slight hiccup in this
+ * mechanism: dtrace_sync() may not be issued from probe context. The
+ * dtrace_sync() must therefore be issued asynchronously from non-probe
+ * context. For this we rely on the DTrace cleaner, a cyclic that runs at the
+ * "cleanrate" frequency. To ease this implementation, we define several chunk
+ * lists:
+ *
+ * - Dirty. Deallocated chunks, not yet cleaned. Not available.
+ *
+ * - Rinsing. Formerly dirty chunks that are currently being asynchronously
+ * cleaned. Not available, but will be shortly. Dynamic variable
+ * allocation may not spin or block for availability, however.
+ *
+ * - Clean. Clean chunks, ready for allocation -- but not on the free list.
+ *
+ * - Free. Available for allocation.
+ *
+ * Moreover, to avoid absurd contention, _each_ of these lists is implemented
+ * on a per-CPU basis. This is only for performance, not correctness; chunks
+ * may be allocated from another CPU's free list. The algorithm for allocation
+ * then is this:
+ *
+ * (1)	Attempt to atomically allocate from the current CPU's free list. If
+ *	the list is non-empty and the allocation is successful, the
+ *	allocation is complete; a sketch of this lock-free pop follows this
+ *	list.
+ *
+ * (2) If the clean list is non-empty, atomically move it to the free list,
+ * and reattempt (1).
+ *
+ * (3) If the dynamic variable space is in the CLEAN state, look for free
+ * and clean lists on other CPUs by setting the current CPU to the next
+ * CPU, and reattempting (1). If the next CPU is the current CPU (that
+ * is, if all CPUs have been checked), atomically switch the state of
+ * the dynamic variable space based on the following:
+ *
+ * - If no free chunks were found and no dirty chunks were found,
+ * atomically set the state to EMPTY.
+ *
+ * - If dirty chunks were found, atomically set the state to DIRTY.
+ *
+ * - If rinsing chunks were found, atomically set the state to RINSING.
+ *
+ * (4) Based on state of dynamic variable space state, increment appropriate
+ * counter to indicate dynamic drops (if in EMPTY state) vs. dynamic
+ * dirty drops (if in DIRTY state) vs. dynamic rinsing drops (if in
+ * RINSING state). Fail the allocation.
+ *
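+ * A sketch of the lock-free pop in step (1), using the dtrace_casptr()
+ * primitive declared later in this file (the retry label is hypothetical,
+ * standing in for steps (2) through (4)):
+ *
+ *	dtrace_dynvar_t *free, *next;
+ *
+ *	do {
+ *		if ((free = dcpu->dtdsc_free) == NULL)
+ *			goto retry;
+ *		next = free->dtdv_next;
+ *	} while (dtrace_casptr(&dcpu->dtdsc_free, free, next) != free);
+ *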
+ * The cleaning cyclic operates with the following algorithm: for all CPUs
+ * with a non-empty dirty list, atomically move the dirty list to the rinsing
+ * list. Perform a dtrace_sync(). For all CPUs with a non-empty rinsing list,
+ * atomically move the rinsing list to the clean list. Perform another
+ * dtrace_sync(). By this point, all CPUs have seen the new clean list; the
+ * state of the dynamic variable space can be restored to CLEAN.
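+ *
+ * A condensed sketch of the cleaner (the list moves must in fact be
+ * performed with compare-and-swap, since probe context may be appending to
+ * the dirty lists concurrently, and a non-empty destination list must be
+ * appended to rather than overwritten; both are elided here):
+ *
+ *	for (i = 0; i < NCPU; i++) {
+ *		dcpu = &dstate->dtds_percpu[i];
+ *		if (dcpu->dtdsc_dirty != NULL) {
+ *			dcpu->dtdsc_rinsing = dcpu->dtdsc_dirty;
+ *			dcpu->dtdsc_dirty = NULL;
+ *		}
+ *	}
+ *
+ *	dtrace_sync();
+ *
+ *	for (i = 0; i < NCPU; i++) {
+ *		dcpu = &dstate->dtds_percpu[i];
+ *		if (dcpu->dtdsc_rinsing != NULL) {
+ *			dcpu->dtdsc_clean = dcpu->dtdsc_rinsing;
+ *			dcpu->dtdsc_rinsing = NULL;
+ *		}
+ *	}
+ *
+ *	dtrace_sync();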
+ *
+ * There exist two final races that merit explanation. The first is a simple
+ * allocation race:
+ *
+ * CPU A CPU B
+ * +---------------------------------+ +---------------------------------+
+ * | | | |
+ * | allocates dynamic object a[123] | | allocates dynamic object a[123] |
+ * | by storing the value 345 to it | | by storing the value 567 to it |
+ * | | | |
+ * : : : :
+ * . . . .
+ *
+ * Again, this is a race in the D program. It can be resolved by having a[123]
+ * hold the value 345 or a[123] hold the value 567 -- but it must be true that
+ * a[123] have only _one_ of these values. (That is, the racing CPUs may not
+ * put the same element twice on the same hash chain.) This is resolved
+ * simply: before the allocation is undertaken, the start of the new chunk's
+ * hash chain is noted. Later, after the allocation is complete, the hash
+ * chain is atomically switched to point to the new element. If this fails
+ * (because of either concurrent allocations or an allocation concurrent with a
+ * deletion), the newly allocated chunk is deallocated to the dirty list, and
+ * the whole process of looking up (and potentially allocating) the dynamic
+ * variable is reattempted.
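+ *
+ * A sketch of that atomic hash-chain switch, again in terms of
+ * dtrace_casptr() (the deallocate-and-retry label is hypothetical):
+ *
+ *	start = bucket->dtdh_chain;
+ *	dvar->dtdv_next = start;
+ *
+ *	if (dtrace_casptr(&bucket->dtdh_chain, start, dvar) != start)
+ *		goto deallocate_and_retry;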
+ *
+ * The final race is a simple deallocation race:
+ *
+ * CPU A CPU B
+ * +---------------------------------+ +---------------------------------+
+ * | | | |
+ * | deallocates dynamic object | | deallocates dynamic object |
+ * | a[123] by storing the value 0 | | a[123] by storing the value 0 |
+ * | to it | | to it |
+ * | | | |
+ * : : : :
+ * . . . .
+ *
+ * Once again, this is a race in the D program, but it is one that we must
+ * handle without corrupting the underlying data structures. Because
+ * deallocations require the deletion of a chunk from the middle of a hash
+ * chain, we cannot use a single-word atomic operation to remove it. For this,
+ * we add a spin lock to the hash buckets that is _only_ used for deallocations
+ * (allocation races are handled as above). Further, this spin lock is _only_
+ * held for the duration of the delete; before control is returned to the DIF
+ * emulation code, the hash bucket is unlocked.
+ */
+typedef struct dtrace_key {
+ uint64_t dttk_value; /* data value or data pointer */
+ uint64_t dttk_size; /* 0 if by-val, >0 if by-ref */
+} dtrace_key_t;
+
+typedef struct dtrace_tuple {
+ uint32_t dtt_nkeys; /* number of keys in tuple */
+ uint32_t dtt_pad; /* padding */
+ dtrace_key_t dtt_key[1]; /* array of tuple keys */
+} dtrace_tuple_t;
+
+typedef struct dtrace_dynvar {
+ uint64_t dtdv_hashval; /* hash value -- 0 if free */
+ struct dtrace_dynvar *dtdv_next; /* next on list or hash chain */
+ void *dtdv_data; /* pointer to data */
+ dtrace_tuple_t dtdv_tuple; /* tuple key */
+} dtrace_dynvar_t;
+
+typedef enum dtrace_dynvar_op {
+ DTRACE_DYNVAR_ALLOC,
+ DTRACE_DYNVAR_NOALLOC,
+ DTRACE_DYNVAR_DEALLOC
+} dtrace_dynvar_op_t;
+
+typedef struct dtrace_dynhash {
+ dtrace_dynvar_t *dtdh_chain; /* hash chain for this bucket */
+ uintptr_t dtdh_lock; /* deallocation lock */
+#ifdef _LP64
+ uintptr_t dtdh_pad[6]; /* pad to avoid false sharing */
+#else
+ uintptr_t dtdh_pad[14]; /* pad to avoid false sharing */
+#endif
+} dtrace_dynhash_t;
+
+typedef struct dtrace_dstate_percpu {
+ dtrace_dynvar_t *dtdsc_free; /* free list for this CPU */
+ dtrace_dynvar_t *dtdsc_dirty; /* dirty list for this CPU */
+ dtrace_dynvar_t *dtdsc_rinsing; /* rinsing list for this CPU */
+ dtrace_dynvar_t *dtdsc_clean; /* clean list for this CPU */
+ uint64_t dtdsc_drops; /* number of capacity drops */
+ uint64_t dtdsc_dirty_drops; /* number of dirty drops */
+ uint64_t dtdsc_rinsing_drops; /* number of rinsing drops */
+#ifdef _LP64
+ uint64_t dtdsc_pad; /* pad to avoid false sharing */
+#else
+ uint64_t dtdsc_pad[2]; /* pad to avoid false sharing */
+#endif
+} dtrace_dstate_percpu_t;
+
+typedef enum dtrace_dstate_state {
+ DTRACE_DSTATE_CLEAN = 0,
+ DTRACE_DSTATE_EMPTY,
+ DTRACE_DSTATE_DIRTY,
+ DTRACE_DSTATE_RINSING
+} dtrace_dstate_state_t;
+
+typedef struct dtrace_dstate {
+ void *dtds_base; /* base of dynamic var. space */
+ size_t dtds_size; /* size of dynamic var. space */
+ size_t dtds_hashsize; /* number of buckets in hash */
+ size_t dtds_chunksize; /* size of each chunk */
+ dtrace_dynhash_t *dtds_hash; /* pointer to hash table */
+ dtrace_dstate_state_t dtds_state; /* current dynamic var. state */
+ dtrace_dstate_percpu_t *dtds_percpu; /* per-CPU dyn. var. state */
+} dtrace_dstate_t;
+
+/*
+ * DTrace Variable State
+ *
+ * The DTrace variable state tracks user-defined variables in its dtrace_vstate
+ * structure. Each DTrace consumer has exactly one dtrace_vstate structure,
+ * but some dtrace_vstate structures may exist without a corresponding DTrace
+ * consumer (see "DTrace Helpers", below). As described in <sys/dtrace.h>,
+ * user-defined variables can have one of three scopes:
+ *
+ * DIFV_SCOPE_GLOBAL => global scope
+ * DIFV_SCOPE_THREAD => thread-local scope (i.e. "self->" variables)
+ * DIFV_SCOPE_LOCAL => clause-local scope (i.e. "this->" variables)
+ *
+ * The variable state tracks variables by both their scope and their allocation
+ * type:
+ *
+ * - The dtvs_globals and dtvs_locals members each point to an array of
+ * dtrace_statvar structures. These structures contain both the variable
+ * metadata (dtrace_difv structures) and the underlying storage for all
+ * statically allocated variables, including statically allocated
+ * DIFV_SCOPE_GLOBAL variables and all DIFV_SCOPE_LOCAL variables.
+ *
+ * - The dtvs_tlocals member points to an array of dtrace_difv structures for
+ * DIFV_SCOPE_THREAD variables. As such, this array tracks _only_ the
+ * variable metadata for DIFV_SCOPE_THREAD variables; the underlying storage
+ * is allocated out of the dynamic variable space.
+ *
+ * - The dtvs_dynvars member is the dynamic variable state associated with the
+ * variable state. The dynamic variable state (described in "DTrace Dynamic
+ * Variables", above) tracks all DIFV_SCOPE_THREAD variables and all
+ * dynamically-allocated DIFV_SCOPE_GLOBAL variables.
+ */
+typedef struct dtrace_statvar {
+ uint64_t dtsv_data; /* data or pointer to it */
+ size_t dtsv_size; /* size of pointed-to data */
+ int dtsv_refcnt; /* reference count */
+ dtrace_difv_t dtsv_var; /* variable metadata */
+} dtrace_statvar_t;
+
+typedef struct dtrace_vstate {
+ dtrace_state_t *dtvs_state; /* back pointer to state */
+ dtrace_statvar_t **dtvs_globals; /* statically-allocated glbls */
+ int dtvs_nglobals; /* number of globals */
+ dtrace_difv_t *dtvs_tlocals; /* thread-local metadata */
+ int dtvs_ntlocals; /* number of thread-locals */
+ dtrace_statvar_t **dtvs_locals; /* clause-local data */
+ int dtvs_nlocals; /* number of clause-locals */
+ dtrace_dstate_t dtvs_dynvars; /* dynamic variable state */
+} dtrace_vstate_t;
+
+/*
+ * DTrace Machine State
+ *
+ * While processing a fired probe, DTrace needs to track and/or
+ * cache some per-CPU state associated with that particular firing. This is
+ * state that is always discarded after the probe firing has completed, and
+ * much of it is not specific to any DTrace consumer, remaining valid across
+ * all ECBs. This state is tracked in the dtrace_mstate structure.
+ */
+#define DTRACE_MSTATE_ARGS 0x00000001
+#define DTRACE_MSTATE_PROBE 0x00000002
+#define DTRACE_MSTATE_EPID 0x00000004
+#define DTRACE_MSTATE_TIMESTAMP 0x00000008
+#define DTRACE_MSTATE_STACKDEPTH 0x00000010
+#define DTRACE_MSTATE_CALLER 0x00000020
+#define DTRACE_MSTATE_IPL 0x00000040
+#define DTRACE_MSTATE_FLTOFFS 0x00000080
+#define DTRACE_MSTATE_WALLTIMESTAMP 0x00000100
+#define DTRACE_MSTATE_USTACKDEPTH 0x00000200
+#define DTRACE_MSTATE_UCALLER 0x00000400
+
+typedef struct dtrace_mstate {
+ uintptr_t dtms_scratch_base; /* base of scratch space */
+ uintptr_t dtms_scratch_ptr; /* current scratch pointer */
+ size_t dtms_scratch_size; /* scratch size */
+ uint32_t dtms_present; /* variables that are present */
+ uint64_t dtms_arg[5]; /* cached arguments */
+ dtrace_epid_t dtms_epid; /* current EPID */
+ uint64_t dtms_timestamp; /* cached timestamp */
+ hrtime_t dtms_walltimestamp; /* cached wall timestamp */
+ int dtms_stackdepth; /* cached stackdepth */
+ int dtms_ustackdepth; /* cached ustackdepth */
+ struct dtrace_probe *dtms_probe; /* current probe */
+ uintptr_t dtms_caller; /* cached caller */
+ uint64_t dtms_ucaller; /* cached user-level caller */
+ int dtms_ipl; /* cached interrupt pri lev */
+ int dtms_fltoffs; /* faulting DIFO offset */
+ uintptr_t dtms_strtok; /* saved strtok() pointer */
+ uintptr_t dtms_strtok_limit; /* upper bound of strtok ptr */
+ uint32_t dtms_access; /* memory access rights */
+ dtrace_difo_t *dtms_difo; /* current dif object */
+ file_t *dtms_getf; /* cached rval of getf() */
+} dtrace_mstate_t;
+
+#define DTRACE_COND_OWNER 0x1
+#define DTRACE_COND_USERMODE 0x2
+#define DTRACE_COND_ZONEOWNER 0x4
+
+#define DTRACE_PROBEKEY_MAXDEPTH 8 /* max glob recursion depth */
+
+/*
+ * Access flag used by dtrace_mstate.dtms_access.
+ */
+#define DTRACE_ACCESS_KERNEL 0x1 /* the priv to read kmem */
+
+
+/*
+ * DTrace Activity
+ *
+ * Each DTrace consumer is in one of several states, which (for purposes of
+ * avoiding yet-another overloading of the noun "state") we call the current
+ * _activity_. The activity transitions on dtrace_go() (from DTRACIOCGO), on
+ * dtrace_stop() (from DTRACIOCSTOP) and on the exit() action. Activities may
+ * only transition in one direction; the activity transition diagram is a
+ * directed acyclic graph. The activity transition diagram is as follows:
+ *
+ *
+ * +----------+ +--------+ +--------+
+ * | INACTIVE |------------------>| WARMUP |------------------>| ACTIVE |
+ * +----------+ dtrace_go(), +--------+ dtrace_go(), +--------+
+ * before BEGIN | after BEGIN | | |
+ * | | | |
+ * exit() action | | | |
+ * from BEGIN ECB | | | |
+ * | | | |
+ * v | | |
+ * +----------+ exit() action | | |
+ * +-----------------------------| DRAINING |<-------------------+ | |
+ * | +----------+ | |
+ * | | | |
+ * | dtrace_stop(), | | |
+ * | before END | | |
+ * | | | |
+ * | v | |
+ * | +---------+ +----------+ | |
+ * | | STOPPED |<----------------| COOLDOWN |<----------------------+ |
+ * | +---------+ dtrace_stop(), +----------+ dtrace_stop(), |
+ * | after END before END |
+ * | |
+ * | +--------+ |
+ * +----------------------------->| KILLED |<--------------------------+
+ * deadman timeout or +--------+ deadman timeout or
+ * killed consumer killed consumer
+ *
+ * Note that once a DTrace consumer has stopped tracing, there is no way to
+ * restart it; if a DTrace consumer wishes to restart tracing, it must reopen
+ * the DTrace pseudodevice.
+ */
+typedef enum dtrace_activity {
+ DTRACE_ACTIVITY_INACTIVE = 0, /* not yet running */
+ DTRACE_ACTIVITY_WARMUP, /* while starting */
+ DTRACE_ACTIVITY_ACTIVE, /* running */
+ DTRACE_ACTIVITY_DRAINING, /* before stopping */
+ DTRACE_ACTIVITY_COOLDOWN, /* while stopping */
+ DTRACE_ACTIVITY_STOPPED, /* after stopping */
+ DTRACE_ACTIVITY_KILLED /* killed */
+} dtrace_activity_t;
+
+/*
+ * DTrace Helper Implementation
+ *
+ * A description of the helper architecture may be found in <sys/dtrace.h>.
+ * Each process contains a pointer to its helpers in its p_dtrace_helpers
+ * member. This is a pointer to a dtrace_helpers structure, which contains an
+ * array of pointers to dtrace_helper structures, helper variable state (shared
+ * among a process's helpers) and a generation count. (The generation count is
+ * used to provide an identifier when a helper is added so that it may be
+ * subsequently removed.) The dtrace_helper structure is self-explanatory,
+ * containing pointers to the objects needed to execute the helper. Note that
+ * helpers are _duplicated_ across fork(2), and destroyed on exec(2). No more
+ * than dtrace_helpers_max are allowed per-process.
+ */
+#define DTRACE_HELPER_ACTION_USTACK 0
+#define DTRACE_NHELPER_ACTIONS 1
+
+typedef struct dtrace_helper_action {
+ int dtha_generation; /* helper action generation */
+ int dtha_nactions; /* number of actions */
+ dtrace_difo_t *dtha_predicate; /* helper action predicate */
+ dtrace_difo_t **dtha_actions; /* array of actions */
+ struct dtrace_helper_action *dtha_next; /* next helper action */
+} dtrace_helper_action_t;
+
+typedef struct dtrace_helper_provider {
+ int dthp_generation; /* helper provider generation */
+ uint32_t dthp_ref; /* reference count */
+ dof_helper_t dthp_prov; /* DOF w/ provider and probes */
+} dtrace_helper_provider_t;
+
+typedef struct dtrace_helpers {
+ dtrace_helper_action_t **dthps_actions; /* array of helper actions */
+ dtrace_vstate_t dthps_vstate; /* helper action var. state */
+ dtrace_helper_provider_t **dthps_provs; /* array of providers */
+ uint_t dthps_nprovs; /* count of providers */
+ uint_t dthps_maxprovs; /* provider array size */
+ int dthps_generation; /* current generation */
+ pid_t dthps_pid; /* pid of associated proc */
+ int dthps_deferred; /* helper in deferred list */
+ struct dtrace_helpers *dthps_next; /* next pointer */
+ struct dtrace_helpers *dthps_prev; /* prev pointer */
+} dtrace_helpers_t;
+
+/*
+ * DTrace Helper Action Tracing
+ *
+ * Debugging helper actions can be arduous. To ease the development and
+ * debugging of helpers, DTrace contains a tracing-framework-within-a-tracing-
+ * framework: helper tracing. If dtrace_helptrace_enabled is non-zero (which
+ * it is by default on DEBUG kernels), all helper activity will be traced to a
+ * global, in-kernel ring buffer. Each entry includes a pointer to the specific
+ * helper, the location within the helper, and a trace of all local variables.
+ * The ring buffer may be displayed in a human-readable format with the
+ * ::dtrace_helptrace mdb(1) dcmd.
+ */
+#define DTRACE_HELPTRACE_NEXT (-1)
+#define DTRACE_HELPTRACE_DONE (-2)
+#define DTRACE_HELPTRACE_ERR (-3)
+
+typedef struct dtrace_helptrace {
+ dtrace_helper_action_t *dtht_helper; /* helper action */
+ int dtht_where; /* where in helper action */
+ int dtht_nlocals; /* number of locals */
+ int dtht_fault; /* type of fault (if any) */
+ int dtht_fltoffs; /* DIF offset */
+ uint64_t dtht_illval; /* faulting value */
+ uint64_t dtht_locals[1]; /* local variables */
+} dtrace_helptrace_t;
+
+/*
+ * DTrace Credentials
+ *
+ * In probe context, we have limited flexibility to examine the credentials
+ * of the DTrace consumer that created a particular enabling. We use
+ * the Least Privilege interfaces to cache the consumer's cred pointer and
+ * some facts about that credential in a dtrace_cred_t structure. These
+ * can limit the consumer's breadth of visibility and what actions the
+ * consumer may take.
+ */
+#define DTRACE_CRV_ALLPROC 0x01
+#define DTRACE_CRV_KERNEL 0x02
+#define DTRACE_CRV_ALLZONE 0x04
+
+#define DTRACE_CRV_ALL (DTRACE_CRV_ALLPROC | DTRACE_CRV_KERNEL | \
+ DTRACE_CRV_ALLZONE)
+
+#define DTRACE_CRA_PROC 0x0001
+#define DTRACE_CRA_PROC_CONTROL 0x0002
+#define DTRACE_CRA_PROC_DESTRUCTIVE_ALLUSER 0x0004
+#define DTRACE_CRA_PROC_DESTRUCTIVE_ALLZONE 0x0008
+#define DTRACE_CRA_PROC_DESTRUCTIVE_CREDCHG 0x0010
+#define DTRACE_CRA_KERNEL 0x0020
+#define DTRACE_CRA_KERNEL_DESTRUCTIVE 0x0040
+
+#define DTRACE_CRA_ALL (DTRACE_CRA_PROC | \
+ DTRACE_CRA_PROC_CONTROL | \
+ DTRACE_CRA_PROC_DESTRUCTIVE_ALLUSER | \
+ DTRACE_CRA_PROC_DESTRUCTIVE_ALLZONE | \
+ DTRACE_CRA_PROC_DESTRUCTIVE_CREDCHG | \
+ DTRACE_CRA_KERNEL | \
+ DTRACE_CRA_KERNEL_DESTRUCTIVE)
+
+typedef struct dtrace_cred {
+ cred_t *dcr_cred;
+ uint8_t dcr_destructive;
+ uint8_t dcr_visible;
+ uint16_t dcr_action;
+} dtrace_cred_t;
+
+/*
+ * DTrace Consumer State
+ *
+ * Each DTrace consumer has an associated dtrace_state structure that contains
+ * its in-kernel DTrace state -- including options, credentials, statistics and
+ * pointers to ECBs, buffers, speculations and formats. A dtrace_state
+ * structure is also allocated for anonymous enablings. When anonymous state
+ * is grabbed, the grabbing consumer's dts_anon pointer is set to the grabbed
+ * dtrace_state structure.
+ */
+struct dtrace_state {
+#ifdef illumos
+ dev_t dts_dev; /* device */
+#else
+ struct cdev *dts_dev; /* device */
+#endif
+ int dts_necbs; /* total number of ECBs */
+ dtrace_ecb_t **dts_ecbs; /* array of ECBs */
+ dtrace_epid_t dts_epid; /* next EPID to allocate */
+ size_t dts_needed; /* greatest needed space */
+ struct dtrace_state *dts_anon; /* anon. state, if grabbed */
+ dtrace_activity_t dts_activity; /* current activity */
+ dtrace_vstate_t dts_vstate; /* variable state */
+ dtrace_buffer_t *dts_buffer; /* principal buffer */
+ dtrace_buffer_t *dts_aggbuffer; /* aggregation buffer */
+ dtrace_speculation_t *dts_speculations; /* speculation array */
+ int dts_nspeculations; /* number of speculations */
+ int dts_naggregations; /* number of aggregations */
+ dtrace_aggregation_t **dts_aggregations; /* aggregation array */
+#ifdef illumos
+ vmem_t *dts_aggid_arena; /* arena for aggregation IDs */
+#else
+ struct unrhdr *dts_aggid_arena; /* arena for aggregation IDs */
+#endif
+ uint64_t dts_errors; /* total number of errors */
+ uint32_t dts_speculations_busy; /* number of spec. busy */
+ uint32_t dts_speculations_unavail; /* number of spec unavail */
+ uint32_t dts_stkstroverflows; /* stack string tab overflows */
+ uint32_t dts_dblerrors; /* errors in ERROR probes */
+ uint32_t dts_reserve; /* space reserved for END */
+ hrtime_t dts_laststatus; /* time of last status */
+#ifdef illumos
+ cyclic_id_t dts_cleaner; /* cleaning cyclic */
+ cyclic_id_t dts_deadman; /* deadman cyclic */
+#else
+ struct callout dts_cleaner; /* Cleaning callout. */
+ struct callout dts_deadman; /* Deadman callout. */
+#endif
+ hrtime_t dts_alive; /* time last alive */
+ char dts_speculates; /* boolean: has speculations */
+ char dts_destructive; /* boolean: has dest. actions */
+ int dts_nformats; /* number of formats */
+ char **dts_formats; /* format string array */
+ dtrace_optval_t dts_options[DTRACEOPT_MAX]; /* options */
+ dtrace_cred_t dts_cred; /* credentials */
+ size_t dts_nretained; /* number of retained enabs */
+ int dts_getf; /* number of getf() calls */
+ uint64_t dts_rstate[NCPU][2]; /* per-CPU random state */
+};
+
+struct dtrace_provider {
+ dtrace_pattr_t dtpv_attr; /* provider attributes */
+ dtrace_ppriv_t dtpv_priv; /* provider privileges */
+ dtrace_pops_t dtpv_pops; /* provider operations */
+ char *dtpv_name; /* provider name */
+ void *dtpv_arg; /* provider argument */
+ hrtime_t dtpv_defunct; /* when made defunct */
+ struct dtrace_provider *dtpv_next; /* next provider */
+};
+
+struct dtrace_meta {
+ dtrace_mops_t dtm_mops; /* meta provider operations */
+ char *dtm_name; /* meta provider name */
+ void *dtm_arg; /* meta provider user arg */
+ uint64_t dtm_count; /* no. of associated provs. */
+};
+
+/*
+ * DTrace Enablings
+ *
+ * A dtrace_enabling structure is used to track a collection of ECB
+ * descriptions -- before they have been turned into actual ECBs. This is
+ * created as a result of DOF processing, and is generally used to generate
+ * ECBs immediately thereafter. However, enablings are also generally
+ * retained should the probes they describe be created at a later time; as
+ * each new module or provider registers with the framework, the retained
+ * enablings are reevaluated, with any new match resulting in new ECBs. To
+ * prevent probes from being matched more than once, the enabling tracks the
+ * last probe generation matched, and only matches probes from subsequent
+ * generations.
+ */
+typedef struct dtrace_enabling {
+ dtrace_ecbdesc_t **dten_desc; /* all ECB descriptions */
+ int dten_ndesc; /* number of ECB descriptions */
+ int dten_maxdesc; /* size of ECB array */
+ dtrace_vstate_t *dten_vstate; /* associated variable state */
+ dtrace_genid_t dten_probegen; /* matched probe generation */
+ dtrace_ecbdesc_t *dten_current; /* current ECB description */
+ int dten_error; /* current error value */
+ int dten_primed; /* boolean: set if primed */
+ struct dtrace_enabling *dten_prev; /* previous enabling */
+ struct dtrace_enabling *dten_next; /* next enabling */
+} dtrace_enabling_t;
+
+/*
+ * DTrace Anonymous Enablings
+ *
+ * Anonymous enablings are DTrace enablings that are not associated with a
+ * controlling process, but rather derive their enabling from DOF stored as
+ * properties in the dtrace.conf file. If there is an anonymous enabling, a
+ * DTrace consumer state and enabling are created on attach. The state may be
+ * subsequently grabbed by the first consumer specifying the "grabanon"
+ * option. As long as an anonymous DTrace enabling exists, dtrace(7D) will
+ * refuse to unload.
+ */
+typedef struct dtrace_anon {
+ dtrace_state_t *dta_state; /* DTrace consumer state */
+ dtrace_enabling_t *dta_enabling; /* pointer to enabling */
+ processorid_t dta_beganon; /* which CPU BEGIN ran on */
+} dtrace_anon_t;
+
+/*
+ * DTrace Error Debugging
+ */
+#ifdef DEBUG
+#define DTRACE_ERRDEBUG
+#endif
+
+#ifdef DTRACE_ERRDEBUG
+
+typedef struct dtrace_errhash {
+ const char *dter_msg; /* error message */
+ int dter_count; /* number of times seen */
+} dtrace_errhash_t;
+
+#define DTRACE_ERRHASHSZ 256 /* must be > number of err msgs */
+
+#endif /* DTRACE_ERRDEBUG */
+
+/*
+ * DTrace Toxic Ranges
+ *
+ * DTrace supports safe loads from probe context; if the address turns out to
+ * be invalid, a bit will be set by the kernel indicating that DTrace
+ * encountered a memory error, and DTrace will propagate the error to the user
+ * accordingly. However, there may exist some regions of memory in which an
+ * arbitrary load can change system state, and from which it is impossible to
+ * recover after such a load has been attempted. Examples of this may
+ * include memory in which programmable I/O registers are mapped (for which a
+ * read may have some implications for the device) or (in the specific case of
+ * UltraSPARC-I and -II) the virtual address hole. The platform is required
+ * to make DTrace aware of these toxic ranges; DTrace will then check that
+ * target addresses are not in a toxic range before attempting to issue a
+ * safe load.
+ */
+typedef struct dtrace_toxrange {
+ uintptr_t dtt_base; /* base of toxic range */
+ uintptr_t dtt_limit; /* limit of toxic range */
+} dtrace_toxrange_t;
+
+#ifdef illumos
+extern uint64_t dtrace_getarg(int, int);
+#else
+extern uint64_t __noinline dtrace_getarg(int, int);
+#endif
+extern greg_t dtrace_getfp(void);
+extern int dtrace_getipl(void);
+extern uintptr_t dtrace_caller(int);
+extern uint32_t dtrace_cas32(uint32_t *, uint32_t, uint32_t);
+extern void *dtrace_casptr(volatile void *, volatile void *, volatile void *);
+extern void dtrace_copyin(uintptr_t, uintptr_t, size_t, volatile uint16_t *);
+extern void dtrace_copyinstr(uintptr_t, uintptr_t, size_t, volatile uint16_t *);
+extern void dtrace_copyout(uintptr_t, uintptr_t, size_t, volatile uint16_t *);
+extern void dtrace_copyoutstr(uintptr_t, uintptr_t, size_t,
+ volatile uint16_t *);
+extern void dtrace_getpcstack(pc_t *, int, int, uint32_t *);
+extern ulong_t dtrace_getreg(struct trapframe *, uint_t);
+extern int dtrace_getstackdepth(int);
+extern void dtrace_getupcstack(uint64_t *, int);
+extern void dtrace_getufpstack(uint64_t *, uint64_t *, int);
+extern int dtrace_getustackdepth(void);
+extern uintptr_t dtrace_fulword(void *);
+extern uint8_t dtrace_fuword8(void *);
+extern uint16_t dtrace_fuword16(void *);
+extern uint32_t dtrace_fuword32(void *);
+extern uint64_t dtrace_fuword64(void *);
+extern void dtrace_probe_error(dtrace_state_t *, dtrace_epid_t, int, int,
+ int, uintptr_t);
+extern int dtrace_assfail(const char *, const char *, int);
+extern int dtrace_attached(void);
+#ifdef illumos
+extern hrtime_t dtrace_gethrestime(void);
+#endif
+
+#ifdef __sparc
+extern void dtrace_flush_windows(void);
+extern void dtrace_flush_user_windows(void);
+extern uint_t dtrace_getotherwin(void);
+extern uint_t dtrace_getfprs(void);
+#else
+extern void dtrace_copy(uintptr_t, uintptr_t, size_t);
+extern void dtrace_copystr(uintptr_t, uintptr_t, size_t, volatile uint16_t *);
+#endif
+
+/*
+ * DTrace Assertions
+ *
+ * DTrace calls ASSERT and VERIFY from probe context. To assure that a failed
+ * ASSERT or VERIFY does not induce a markedly more catastrophic failure (e.g.,
+ * one from which a dump cannot be gleaned), DTrace must define its own ASSERT
+ * and VERIFY macros to be ones that may safely be called from probe context.
+ * This header file must thus be included by any DTrace component that calls
+ * ASSERT and/or VERIFY from probe context, and _only_ by those components.
+ * (The only exception to this is kernel debugging infrastructure at user-level
+ * that doesn't depend on calling ASSERT.)
+ */
+#undef ASSERT
+#undef VERIFY
+#define VERIFY(EX) ((void)((EX) || \
+ dtrace_assfail(#EX, __FILE__, __LINE__)))
+#ifdef DEBUG
+#define ASSERT(EX) ((void)((EX) || \
+ dtrace_assfail(#EX, __FILE__, __LINE__)))
+#else
+#define ASSERT(X) ((void)0)
+#endif
+
+#ifdef __cplusplus
+}
+#endif
+
+#endif /* _SYS_DTRACE_IMPL_H */
diff --git a/sys/cddl/contrib/opensolaris/uts/common/sys/errorq.h b/sys/cddl/contrib/opensolaris/uts/common/sys/errorq.h
new file mode 100644
index 000000000000..971b19e6ccd8
--- /dev/null
+++ b/sys/cddl/contrib/opensolaris/uts/common/sys/errorq.h
@@ -0,0 +1,83 @@
+/*
+ * CDDL HEADER START
+ *
+ * The contents of this file are subject to the terms of the
+ * Common Development and Distribution License, Version 1.0 only
+ * (the "License"). You may not use this file except in compliance
+ * with the License.
+ *
+ * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
+ * or http://www.opensolaris.org/os/licensing.
+ * See the License for the specific language governing permissions
+ * and limitations under the License.
+ *
+ * When distributing Covered Code, include this CDDL HEADER in each
+ * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
+ * If applicable, add the following below this CDDL HEADER, with the
+ * fields enclosed by brackets "[]" replaced with your own identifying
+ * information: Portions Copyright [yyyy] [name of copyright owner]
+ *
+ * CDDL HEADER END
+ */
+/*
+ * Copyright 2005 Sun Microsystems, Inc. All rights reserved.
+ * Use is subject to license terms.
+ */
+
+#ifndef _ERRORQ_H
+#define _ERRORQ_H
+
+#pragma ident "%Z%%M% %I% %E% SMI"
+
+#include <sys/types.h>
+#include <sys/time.h>
+#include <sys/nvpair.h>
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+typedef struct errorq errorq_t;
+typedef struct errorq_elem errorq_elem_t;
+typedef void (*errorq_func_t)(void *, const void *, const errorq_elem_t *);
+
+/*
+ * Public flags for errorq_create(): bit range 0-15
+ */
+#define ERRORQ_VITAL 0x0001 /* drain queue automatically on system reset */
+
+/*
+ * Public flags for errorq_dispatch():
+ */
+#define ERRORQ_ASYNC 0 /* schedule async queue drain for caller */
+#define ERRORQ_SYNC 1 /* do not schedule drain; caller will drain */
+
+#ifdef _KERNEL
+
+extern errorq_t *errorq_create(const char *, errorq_func_t, void *,
+ ulong_t, size_t, uint_t, uint_t);
+
+extern errorq_t *errorq_nvcreate(const char *, errorq_func_t, void *,
+ ulong_t, size_t, uint_t, uint_t);
+
+extern void errorq_destroy(errorq_t *);
+extern void errorq_dispatch(errorq_t *, const void *, size_t, uint_t);
+extern void errorq_drain(errorq_t *);
+extern void errorq_init(void);
+extern void errorq_panic(void);
+extern errorq_elem_t *errorq_reserve(errorq_t *);
+extern void errorq_commit(errorq_t *, errorq_elem_t *, uint_t);
+extern void errorq_cancel(errorq_t *, errorq_elem_t *);
+extern nvlist_t *errorq_elem_nvl(errorq_t *, const errorq_elem_t *);
+extern nv_alloc_t *errorq_elem_nva(errorq_t *, const errorq_elem_t *);
+extern void *errorq_elem_dup(errorq_t *, const errorq_elem_t *,
+ errorq_elem_t **);
+extern void errorq_dump(void);
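+
+/*
+ * Illustrative use (a sketch; names other than the errorq routines and
+ * flags above are hypothetical, including the interrupt level of 0): a
+ * subsystem creates a queue once at initialization, then dispatches
+ * fixed-size records as errors are detected:
+ *
+ *	errorq_t *eqp = errorq_create("my_errq", my_drain_func, NULL,
+ *	    16, sizeof (my_error_t), 0, ERRORQ_VITAL);
+ *
+ *	errorq_dispatch(eqp, &err, sizeof (err), ERRORQ_ASYNC);
+ */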
+
+#endif /* _KERNEL */
+
+#ifdef __cplusplus
+}
+#endif
+
+#endif /* _ERRORQ_H */
diff --git a/sys/cddl/contrib/opensolaris/uts/common/sys/extdirent.h b/sys/cddl/contrib/opensolaris/uts/common/sys/extdirent.h
new file mode 100644
index 000000000000..3f9a665f0076
--- /dev/null
+++ b/sys/cddl/contrib/opensolaris/uts/common/sys/extdirent.h
@@ -0,0 +1,77 @@
+/*
+ * CDDL HEADER START
+ *
+ * The contents of this file are subject to the terms of the
+ * Common Development and Distribution License (the "License").
+ * You may not use this file except in compliance with the License.
+ *
+ * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
+ * or http://www.opensolaris.org/os/licensing.
+ * See the License for the specific language governing permissions
+ * and limitations under the License.
+ *
+ * When distributing Covered Code, include this CDDL HEADER in each
+ * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
+ * If applicable, add the following below this CDDL HEADER, with the
+ * fields enclosed by brackets "[]" replaced with your own identifying
+ * information: Portions Copyright [yyyy] [name of copyright owner]
+ *
+ * CDDL HEADER END
+ */
+/*
+ * Copyright 2007 Sun Microsystems, Inc. All rights reserved.
+ * Use is subject to license terms.
+ */
+
+#ifndef _SYS_EXTDIRENT_H
+#define _SYS_EXTDIRENT_H
+
+#pragma ident "%Z%%M% %I% %E% SMI"
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+#include <sys/types.h>
+
+#if defined(_KERNEL)
+
+/*
+ * Extended file-system independent directory entry. This style of
+ * dirent provides additional informational flag bits for each
+ * directory entry. This dirent will be returned instead of the
+ * standard dirent if a VOP_READDIR() requests dirent flags via
+ * V_RDDIR_ENTFLAGS, and if the file system supports the flags.
+ */
+typedef struct edirent {
+ ino64_t ed_ino; /* "inode number" of entry */
+ off64_t ed_off; /* offset of disk directory entry */
+ uint32_t ed_eflags; /* per-entry flags */
+ unsigned short ed_reclen; /* length of this record */
+ char ed_name[1]; /* name of file */
+} edirent_t;
+
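+/*
+ * EDIRENT_RECLEN() gives the record length required for a name of the given
+ * length: the fixed header, plus the name bytes and a terminating NUL,
+ * rounded up to an 8-byte boundary. EDIRENT_NAMELEN() performs the inverse,
+ * yielding the space available for the name in a record of the given length.
+ */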
+#define EDIRENT_RECLEN(namelen) \
+ ((offsetof(edirent_t, ed_name[0]) + 1 + (namelen) + 7) & ~ 7)
+#define EDIRENT_NAMELEN(reclen) \
+ ((reclen) - (offsetof(edirent_t, ed_name[0])))
+
+/*
+ * Extended entry flags
+ * Extended entries include a bitfield of extra information
+ * regarding that entry.
+ */
+#define	ED_CASE_CONFLICT	0x10	/* Ignoring case, entry is not unique */
+
+/*
+ * Extended flags accessor function
+ */
+#define ED_CASE_CONFLICTS(x) ((x)->ed_eflags & ED_CASE_CONFLICT)
+
+#endif /* defined(_KERNEL) */
+
+#ifdef __cplusplus
+}
+#endif
+
+#endif /* _SYS_EXTDIRENT_H */
diff --git a/sys/cddl/contrib/opensolaris/uts/common/sys/fasttrap.h b/sys/cddl/contrib/opensolaris/uts/common/sys/fasttrap.h
new file mode 100644
index 000000000000..1d3d5a3bafd7
--- /dev/null
+++ b/sys/cddl/contrib/opensolaris/uts/common/sys/fasttrap.h
@@ -0,0 +1,98 @@
+/*
+ * CDDL HEADER START
+ *
+ * The contents of this file are subject to the terms of the
+ * Common Development and Distribution License (the "License").
+ * You may not use this file except in compliance with the License.
+ *
+ * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
+ * or http://www.opensolaris.org/os/licensing.
+ * See the License for the specific language governing permissions
+ * and limitations under the License.
+ *
+ * When distributing Covered Code, include this CDDL HEADER in each
+ * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
+ * If applicable, add the following below this CDDL HEADER, with the
+ * fields enclosed by brackets "[]" replaced with your own identifying
+ * information: Portions Copyright [yyyy] [name of copyright owner]
+ *
+ * CDDL HEADER END
+ */
+
+/*
+ * Copyright 2006 Sun Microsystems, Inc. All rights reserved.
+ * Use is subject to license terms.
+ */
+
+#ifndef _SYS_FASTTRAP_H
+#define _SYS_FASTTRAP_H
+
+#pragma ident "%Z%%M% %I% %E% SMI"
+
+#include <sys/fasttrap_isa.h>
+#include <sys/dtrace.h>
+#include <sys/types.h>
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+#ifdef illumos
+#define FASTTRAPIOC (('m' << 24) | ('r' << 16) | ('f' << 8))
+#define FASTTRAPIOC_MAKEPROBE (FASTTRAPIOC | 1)
+#define FASTTRAPIOC_GETINSTR (FASTTRAPIOC | 2)
+#else
+#define FASTTRAPIOC_GETINSTR _IO('f', 2)
+#define FASTTRAPIOC_MAKEPROBE _IO('f', 3)
+#endif
+
+typedef enum fasttrap_probe_type {
+ DTFTP_NONE = 0,
+ DTFTP_ENTRY,
+ DTFTP_RETURN,
+ DTFTP_OFFSETS,
+ DTFTP_POST_OFFSETS,
+ DTFTP_IS_ENABLED
+} fasttrap_probe_type_t;
+
+typedef struct fasttrap_probe_spec {
+ pid_t ftps_pid;
+ fasttrap_probe_type_t ftps_type;
+
+ char ftps_func[DTRACE_FUNCNAMELEN];
+ char ftps_mod[DTRACE_MODNAMELEN];
+
+ uint64_t ftps_pc;
+ uint64_t ftps_size;
+ uint64_t ftps_noffs;
+ uint64_t ftps_offs[1];
+} fasttrap_probe_spec_t;
+
+typedef struct fasttrap_instr_query {
+ uint64_t ftiq_pc;
+ pid_t ftiq_pid;
+ fasttrap_instr_t ftiq_instr;
+} fasttrap_instr_query_t;
+
+/*
+ * To support the fasttrap provider from very early in a process's life,
+ * the run-time linker, ld.so.1, has a program header of type PT_SUNWDTRACE
+ * which points to a data object which must be PT_SUNWDTRACE_SIZE bytes.
+ * This structure mimics the fasttrap provider section of the ulwp_t structure.
+ * When the fasttrap provider is changed to require new or different
+ * instructions, the data object in ld.so.1 and the thread initializers in libc
+ * (libc_init() and _thrp_create()) need to be updated to include the new
+ * instructions, and PT_SUNWDTRACE needs to be changed to a new unique number
+ * (while the old value gets assigned something like PT_SUNWDTRACE_1). Since the
+ * linker must be backward compatible with old Solaris releases, it must have
+ * program headers for each of the PT_SUNWDTRACE versions. The kernel's
+ * elfexec() function only has to look for the latest version of the
+ * PT_SUNWDTRACE program header.
+ */
+#define PT_SUNWDTRACE_SIZE FASTTRAP_SUNWDTRACE_SIZE
+
+#ifdef __cplusplus
+}
+#endif
+
+#endif /* _SYS_FASTTRAP_H */
diff --git a/sys/cddl/contrib/opensolaris/uts/common/sys/fasttrap_impl.h b/sys/cddl/contrib/opensolaris/uts/common/sys/fasttrap_impl.h
new file mode 100644
index 000000000000..d2fbf5f4981a
--- /dev/null
+++ b/sys/cddl/contrib/opensolaris/uts/common/sys/fasttrap_impl.h
@@ -0,0 +1,235 @@
+/*
+ * CDDL HEADER START
+ *
+ * The contents of this file are subject to the terms of the
+ * Common Development and Distribution License (the "License").
+ * You may not use this file except in compliance with the License.
+ *
+ * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
+ * or http://www.opensolaris.org/os/licensing.
+ * See the License for the specific language governing permissions
+ * and limitations under the License.
+ *
+ * When distributing Covered Code, include this CDDL HEADER in each
+ * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
+ * If applicable, add the following below this CDDL HEADER, with the
+ * fields enclosed by brackets "[]" replaced with your own identifying
+ * information: Portions Copyright [yyyy] [name of copyright owner]
+ *
+ * CDDL HEADER END
+ */
+
+/*
+ * Copyright 2008 Sun Microsystems, Inc. All rights reserved.
+ * Use is subject to license terms.
+ */
+
+#ifndef _FASTTRAP_IMPL_H
+#define _FASTTRAP_IMPL_H
+
+#pragma ident "%Z%%M% %I% %E% SMI"
+
+#include <sys/types.h>
+#include <sys/dtrace.h>
+#include <sys/proc.h>
+#include <sys/queue.h>
+#include <sys/fasttrap.h>
+#include <sys/fasttrap_isa.h>
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+/*
+ * Fasttrap Providers, Probes and Tracepoints
+ *
+ * Each Solaris process can have multiple providers -- the pid provider as
+ * well as any number of user-level statically defined tracing (USDT)
+ * providers. Those providers are each represented by a fasttrap_provider_t.
+ * All providers for a given process have a pointer to a shared
+ * fasttrap_proc_t. The fasttrap_proc_t has two states: active or defunct.
+ * When the count of active providers goes to zero it becomes defunct; a
+ * provider drops its active count when it is removed individually or as part
+ * of a mass removal when a process exits or performs an exec.
+ *
+ * Each probe is represented by a fasttrap_probe_t which has a pointer to
+ * its associated provider as well as a list of fasttrap_id_tp_t structures
+ * which are tuples combining a fasttrap_id_t and a fasttrap_tracepoint_t.
+ * A fasttrap_tracepoint_t represents the actual point of instrumentation
+ * and it contains two lists of fasttrap_id_t structures (to be fired pre-
+ * and post-instruction emulation) that identify the probes attached to the
+ * tracepoint. Tracepoints also have a pointer to the fasttrap_proc_t for the
+ * process they trace which is used when looking up a tracepoint both when a
+ * probe fires and when enabling and disabling probes.
+ *
+ * It's important to note that probes are preallocated with the necessary
+ * number of tracepoints, but that tracepoints can be shared by probes and
+ * swapped between probes. If a probe's preallocated tracepoint is enabled
+ * (and, therefore, the associated probe is enabled), and that probe is
+ * then disabled, ownership of that tracepoint may be exchanged for an
+ * unused tracepoint belonging to another probe that was attached to the
+ * enabled tracepoint.
+ *
+ * On FreeBSD, fasttrap providers also maintain per-thread scratch space for use
+ * by the ISA-specific fasttrap code. The fasttrap_scrblock_t type stores the
+ * virtual address of a page-sized memory block that is mapped into a process'
+ * address space. Each block is carved up into chunks (fasttrap_scrspace_t) for
+ * use by individual threads, which keep the address of their scratch space
+ * chunk in their struct kdtrace_thread. A thread's scratch space isn't released
+ * until it exits.
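+ *
+ * For instance, the probe-context lookup of the tracepoint for a given
+ * (pid, pc) pair can be sketched in terms of the fasttrap_tpoints hash
+ * declared later in this file (liveness checks and locking elided):
+ *
+ *	bucket = &fasttrap_tpoints.fth_table[FASTTRAP_TPOINTS_INDEX(pid, pc)];
+ *
+ *	for (tp = bucket->ftb_data; tp != NULL; tp = tp->ftt_next) {
+ *		if (tp->ftt_pid == pid && tp->ftt_pc == pc)
+ *			break;
+ *	}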
+ */
+
+#ifndef illumos
+typedef struct fasttrap_scrblock {
+ vm_offset_t ftsb_addr; /* address of a scratch block */
+ LIST_ENTRY(fasttrap_scrblock) ftsb_next;/* next block in list */
+} fasttrap_scrblock_t;
+#define FASTTRAP_SCRBLOCK_SIZE PAGE_SIZE
+
+typedef struct fasttrap_scrspace {
+ uintptr_t ftss_addr; /* scratch space address */
+ LIST_ENTRY(fasttrap_scrspace) ftss_next;/* next in list */
+} fasttrap_scrspace_t;
+#define FASTTRAP_SCRSPACE_SIZE 64
+#endif
+
+typedef struct fasttrap_proc {
+ pid_t ftpc_pid; /* process ID for this proc */
+ uint64_t ftpc_acount; /* count of active providers */
+ uint64_t ftpc_rcount; /* count of extant providers */
+ kmutex_t ftpc_mtx; /* lock on all but acount */
+ struct fasttrap_proc *ftpc_next; /* next proc in hash chain */
+#ifndef illumos
+ LIST_HEAD(, fasttrap_scrblock) ftpc_scrblks; /* mapped scratch blocks */
+ LIST_HEAD(, fasttrap_scrspace) ftpc_fscr; /* free scratch space */
+ LIST_HEAD(, fasttrap_scrspace) ftpc_ascr; /* used scratch space */
+#endif
+} fasttrap_proc_t;
+
+typedef struct fasttrap_provider {
+ pid_t ftp_pid; /* process ID for this prov */
+ char ftp_name[DTRACE_PROVNAMELEN]; /* prov name (w/o the pid) */
+ dtrace_provider_id_t ftp_provid; /* DTrace provider handle */
+ uint_t ftp_marked; /* mark for possible removal */
+ uint_t ftp_retired; /* mark when retired */
+ kmutex_t ftp_mtx; /* provider lock */
+ kmutex_t ftp_cmtx; /* lock on creating probes */
+ uint64_t ftp_rcount; /* enabled probes ref count */
+ uint64_t ftp_ccount; /* consumers creating probes */
+ uint64_t ftp_mcount; /* meta provider count */
+ fasttrap_proc_t *ftp_proc; /* shared proc for all provs */
+ struct fasttrap_provider *ftp_next; /* next prov in hash chain */
+} fasttrap_provider_t;
+
+typedef struct fasttrap_id fasttrap_id_t;
+typedef struct fasttrap_probe fasttrap_probe_t;
+typedef struct fasttrap_tracepoint fasttrap_tracepoint_t;
+
+struct fasttrap_id {
+	fasttrap_probe_t *fti_probe;	/* referring probe */
+ fasttrap_id_t *fti_next; /* enabled probe list on tp */
+ fasttrap_probe_type_t fti_ptype; /* probe type */
+};
+
+typedef struct fasttrap_id_tp {
+ fasttrap_id_t fit_id;
+ fasttrap_tracepoint_t *fit_tp;
+} fasttrap_id_tp_t;
+
+struct fasttrap_probe {
+ dtrace_id_t ftp_id; /* DTrace probe identifier */
+ pid_t ftp_pid; /* pid for this probe */
+ fasttrap_provider_t *ftp_prov; /* this probe's provider */
+ uintptr_t ftp_faddr; /* associated function's addr */
+ size_t ftp_fsize; /* associated function's size */
+ uint64_t ftp_gen; /* modification generation */
+ uint64_t ftp_ntps; /* number of tracepoints */
+ uint8_t *ftp_argmap; /* native to translated args */
+ uint8_t ftp_nargs; /* translated argument count */
+ uint8_t ftp_enabled; /* is this probe enabled */
+ char *ftp_xtypes; /* translated types index */
+ char *ftp_ntypes; /* native types index */
+ fasttrap_id_tp_t ftp_tps[1]; /* flexible array */
+};
+
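+/*
+ * Given a pointer to a fasttrap_id_t embedded in a probe's ftp_tps[] array,
+ * recover its index in that array: step back from the fit_id member to the
+ * containing fasttrap_id_tp_t, then subtract the address of ftp_tps[0].
+ */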
+#define FASTTRAP_ID_INDEX(id) \
+((fasttrap_id_tp_t *)(((char *)(id) - offsetof(fasttrap_id_tp_t, fit_id))) - \
+&(id)->fti_probe->ftp_tps[0])
+
+struct fasttrap_tracepoint {
+ fasttrap_proc_t *ftt_proc; /* associated process struct */
+ uintptr_t ftt_pc; /* address of tracepoint */
+ pid_t ftt_pid; /* pid of tracepoint */
+ fasttrap_machtp_t ftt_mtp; /* ISA-specific portion */
+ fasttrap_id_t *ftt_ids; /* NULL-terminated list */
+ fasttrap_id_t *ftt_retids; /* NULL-terminated list */
+ fasttrap_tracepoint_t *ftt_next; /* link in global hash */
+};
+
+typedef struct fasttrap_bucket {
+ kmutex_t ftb_mtx; /* bucket lock */
+ void *ftb_data; /* data payload */
+
+ uint8_t ftb_pad[64 - sizeof (kmutex_t) - sizeof (void *)];
+} fasttrap_bucket_t;
+
+typedef struct fasttrap_hash {
+ ulong_t fth_nent; /* power-of-2 num. of entries */
+ ulong_t fth_mask; /* fth_nent - 1 */
+ fasttrap_bucket_t *fth_table; /* array of buckets */
+} fasttrap_hash_t;
+
+/*
+ * If at some future point these assembly functions become observable by
+ * DTrace, then these defines should become separate functions so that the
+ * fasttrap provider doesn't trigger probes during internal operations.
+ */
+#define fasttrap_copyout copyout
+#define fasttrap_fuword32 fuword32
+#define fasttrap_suword32 suword32
+#define fasttrap_suword64 suword64
+
+#ifdef __amd64__
+#define fasttrap_fulword fuword64
+#define fasttrap_sulword suword64
+#else
+#define fasttrap_fulword fuword32
+#define fasttrap_sulword suword32
+#endif
+
+extern void fasttrap_sigtrap(proc_t *, kthread_t *, uintptr_t);
+#ifndef illumos
+extern fasttrap_scrspace_t *fasttrap_scraddr(struct thread *,
+ fasttrap_proc_t *);
+#endif
+
+extern dtrace_id_t fasttrap_probe_id;
+extern fasttrap_hash_t fasttrap_tpoints;
+
+#ifndef illumos
+extern struct rmlock fasttrap_tp_lock;
+#endif
+
+#define FASTTRAP_TPOINTS_INDEX(pid, pc) \
+ (((pc) / sizeof (fasttrap_instr_t) + (pid)) & fasttrap_tpoints.fth_mask)
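+
+/*
+ * Illustrative lookup (a sketch, not part of the original header): a
+ * consumer hashes the (pid, pc) pair to a bucket and walks its chain,
+ * e.g.
+ *
+ *	fasttrap_bucket_t *bucket =
+ *	    &fasttrap_tpoints.fth_table[FASTTRAP_TPOINTS_INDEX(pid, pc)];
+ *	fasttrap_tracepoint_t *tp;
+ *	for (tp = bucket->ftb_data; tp != NULL; tp = tp->ftt_next)
+ *		if (pid == tp->ftt_pid && pc == tp->ftt_pc)
+ *			break;
+ */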
+
+/*
+ * Must be implemented by fasttrap_isa.c
+ */
+extern int fasttrap_tracepoint_init(proc_t *, fasttrap_tracepoint_t *,
+ uintptr_t, fasttrap_probe_type_t);
+extern int fasttrap_tracepoint_install(proc_t *, fasttrap_tracepoint_t *);
+extern int fasttrap_tracepoint_remove(proc_t *, fasttrap_tracepoint_t *);
+
+struct trapframe;
+extern int fasttrap_pid_probe(struct trapframe *);
+extern int fasttrap_return_probe(struct trapframe *);
+
+extern uint64_t fasttrap_pid_getarg(void *, dtrace_id_t, void *, int, int);
+extern uint64_t fasttrap_usdt_getarg(void *, dtrace_id_t, void *, int, int);
+
+#ifdef __cplusplus
+}
+#endif
+
+#endif /* _FASTTRAP_IMPL_H */
diff --git a/sys/cddl/contrib/opensolaris/uts/common/sys/feature_tests.h b/sys/cddl/contrib/opensolaris/uts/common/sys/feature_tests.h
new file mode 100644
index 000000000000..717da1e3a00e
--- /dev/null
+++ b/sys/cddl/contrib/opensolaris/uts/common/sys/feature_tests.h
@@ -0,0 +1,431 @@
+/*
+ * CDDL HEADER START
+ *
+ * The contents of this file are subject to the terms of the
+ * Common Development and Distribution License (the "License").
+ * You may not use this file except in compliance with the License.
+ *
+ * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
+ * or http://www.opensolaris.org/os/licensing.
+ * See the License for the specific language governing permissions
+ * and limitations under the License.
+ *
+ * When distributing Covered Code, include this CDDL HEADER in each
+ * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
+ * If applicable, add the following below this CDDL HEADER, with the
+ * fields enclosed by brackets "[]" replaced with your own identifying
+ * information: Portions Copyright [yyyy] [name of copyright owner]
+ *
+ * CDDL HEADER END
+ */
+
+/*
+ * Copyright 2013 Garrett D'Amore <garrett@damore.org>
+ *
+ * Copyright 2006 Sun Microsystems, Inc. All rights reserved.
+ * Use is subject to license terms.
+ */
+
+#ifndef _SYS_FEATURE_TESTS_H
+#define _SYS_FEATURE_TESTS_H
+
+#include <sys/ccompile.h>
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+/*
+ * Values of _POSIX_C_SOURCE
+ *
+ * undefined not a POSIX compilation
+ * 1 POSIX.1-1990 compilation
+ * 2 POSIX.2-1992 compilation
+ * 199309L POSIX.1b-1993 compilation (Real Time)
+ * 199506L POSIX.1c-1995 compilation (POSIX Threads)
+ * 200112L POSIX.1-2001 compilation (Austin Group Revision)
+ * 200809L POSIX.1-2008 compilation
+ */
+#if defined(_POSIX_SOURCE) && !defined(_POSIX_C_SOURCE)
+#define _POSIX_C_SOURCE 1
+#endif
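+
+/*
+ * Illustrative usage (a sketch, not part of the original header): an
+ * application selects a POSIX environment by defining the macro before
+ * including any system header, e.g. for POSIX.1-2001:
+ *
+ *	#define _POSIX_C_SOURCE 200112L
+ *	#include <unistd.h>
+ */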
+
+/*
+ * The feature test macros __XOPEN_OR_POSIX, _STRICT_STDC, _STRICT_SYMBOLS,
+ * and _STDC_C99 are Sun implementation specific macros created in order to
+ * condense the standards-specified feature test macros for easier reading.
+ * These macros should not be used by the application developer as
+ * unexpected results may occur. Instead, the user should reference
+ * standards(5) for correct usage of the standards feature test macros.
+ *
+ * __XOPEN_OR_POSIX Used in cases where a symbol is defined by either
+ * X/Open or POSIX, or in the negative, when neither
+ * X/Open nor POSIX defines a symbol.
+ *
+ * _STRICT_STDC __STDC__ is specified by the C Standards and defined
+ * by the compiler. For Sun compilers the value of
+ * __STDC__ is either 1, 0, or not defined based on the
+ * compilation mode (see cc(1)). When the value of
+ * __STDC__ is 1 and in the absence of any other feature
+ * test macros, the namespace available to the application
+ * is limited to only those symbols defined by the C
+ * Standard. _STRICT_STDC provides a more readable means
+ * of identifying symbols defined by the standard, or in
+ * the negative, symbols that are extensions to the C
+ * Standard. See additional comments for GNU C differences.
+ *
+ * _STDC_C99 __STDC_VERSION__ is specified by the C standards and
+ * defined by the compiler and indicates the version of
+ * the C standard. A value of 199901L indicates a
+ * compiler that complies with ISO/IEC 9899:1999, other-
+ * wise known as the C99 standard.
+ *
+ * _STRICT_SYMBOLS Used in cases where symbol visibility is restricted
+ * by the standards, and the user has not explicitly
+ * relaxed the strictness via __EXTENSIONS__.
+ */
+
+#if defined(_XOPEN_SOURCE) || defined(_POSIX_C_SOURCE)
+#define __XOPEN_OR_POSIX
+#endif
+
+/*
+ * ISO/IEC 9899:1990 and its revision, ISO/IEC 9899:1999, specify the
+ * following predefined macro name:
+ *
+ * __STDC__ The integer constant 1, intended to indicate a conforming
+ * implementation.
+ *
+ * Furthermore, a strictly conforming program shall use only those features
+ * of the language and library specified in these standards. A conforming
+ * implementation shall accept any strictly conforming program.
+ *
+ * Based on these requirements, Sun's C compiler defines __STDC__ to 1 for
+ * strictly conforming environments and __STDC__ to 0 for environments that
+ * use ANSI C semantics but allow extensions to the C standard. For non-ANSI
+ * C semantics, Sun's C compiler does not define __STDC__.
+ *
+ * The GNU C project interpretation is that __STDC__ should always be defined
+ * to 1 for compilation modes that accept ANSI C syntax regardless of whether
+ * or not extensions to the C standard are used. Violations of conforming
+ * behavior are conditionally flagged as warnings via the use of the
+ * -pedantic option. In addition to defining __STDC__ to 1, the GNU C
+ * compiler also defines __STRICT_ANSI__ as a means of specifying strictly
+ * conforming environments using the -ansi or -std=<standard> options.
+ *
+ * In the absence of any other compiler options, Sun and GNU set the value
+ * of __STDC__ as follows when using the following options:
+ *
+ * Value of __STDC__ __STRICT_ANSI__
+ *
+ * cc -Xa (default) 0 undefined
+ * cc -Xt (transitional) 0 undefined
+ * cc -Xc (strictly conforming) 1 undefined
+ * cc -Xs (K&R C) undefined undefined
+ *
+ * gcc (default) 1 undefined
+ * gcc -ansi, -std={c89, c99,...} 1 defined
+ * gcc -traditional (K&R) undefined undefined
+ *
+ * The default compilation modes for Sun C compilers versus GNU C compilers
+ * result in differing values for __STDC__, which in turn yields a more
+ * restricted namespace when using Sun compilers. To allow both GNU and Sun
+ * interpretations to peacefully co-exist, we use the following Sun
+ * implementation _STRICT_STDC macro:
+ */
+
+#if (__STDC__ - 0 == 1 && !defined(__GNUC__)) || \
+ (defined(__GNUC__) && defined(__STRICT_ANSI__))
+#define _STRICT_STDC
+#else
+#undef _STRICT_STDC
+#endif
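+
+/*
+ * Illustrative consequence (a sketch, not part of the original header):
+ * "gcc -ansi file.c" defines __STRICT_ANSI__ and therefore _STRICT_STDC,
+ * while plain "gcc file.c" leaves _STRICT_STDC undefined; "cc -Xc file.c"
+ * sets __STDC__ to 1, which likewise defines _STRICT_STDC.
+ */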
+
+/*
+ * Compiler complies with ISO/IEC 9899:1999
+ */
+
+#if __STDC_VERSION__ - 0 >= 199901L
+#ifndef _STDC_C99
+#define _STDC_C99
+#endif
+#endif
+
+/*
+ * Use strict symbol visibility.
+ */
+#if (defined(_STRICT_STDC) || defined(__XOPEN_OR_POSIX)) && \
+ !defined(__EXTENSIONS__)
+#define _STRICT_SYMBOLS
+#endif
+
+/*
+ * Large file interfaces:
+ *
+ * _LARGEFILE_SOURCE
+ * 1 large file-related additions to POSIX
+ * interfaces requested (fseeko, etc.)
+ * _LARGEFILE64_SOURCE
+ * 1 transitional large-file-related interfaces
+ * requested (seek64, stat64, etc.)
+ *
+ * The corresponding announcement macros are respectively:
+ * _LFS_LARGEFILE
+ * _LFS64_LARGEFILE
+ * (These are set in <unistd.h>.)
+ *
+ * Requesting _LARGEFILE64_SOURCE implies requesting _LARGEFILE_SOURCE as
+ * well.
+ *
+ * The large file interfaces are made visible regardless of the initial values
+ * of the feature test macros under certain circumstances:
+ * - If no explicit standards-conforming environment is requested (neither
+ * of _POSIX_SOURCE nor _XOPEN_SOURCE is defined and the value of
+ * __STDC__ does not imply standards conformance).
+ * - Extended system interfaces are explicitly requested (__EXTENSIONS__
+ * is defined).
+ * - Access to in-kernel interfaces is requested (_KERNEL or _KMEMUSER is
+ * defined). (Note that this dependency is an artifact of the current
+ * kernel implementation and may change in future releases.)
+ */
+#if (!defined(_STRICT_STDC) && !defined(__XOPEN_OR_POSIX)) || \
+ defined(_KERNEL) || defined(_KMEMUSER) || \
+ defined(__EXTENSIONS__)
+#undef _LARGEFILE64_SOURCE
+#define _LARGEFILE64_SOURCE 1
+#endif
+#if _LARGEFILE64_SOURCE - 0 == 1
+#undef _LARGEFILE_SOURCE
+#define _LARGEFILE_SOURCE 1
+#endif
+
+/*
+ * Large file compilation environment control:
+ *
+ * The setting of _FILE_OFFSET_BITS controls the size of various file-related
+ * types and governs the mapping between file-related source function symbol
+ * names and the corresponding binary entry points.
+ *
+ * In the 32-bit environment, the default value is 32; if not set, set it to
+ * the default here, to simplify tests in other headers.
+ *
+ * In the 64-bit compilation environment, the only value allowed is 64.
+ */
+#if defined(_LP64)
+#ifndef _FILE_OFFSET_BITS
+#define _FILE_OFFSET_BITS 64
+#endif
+#if _FILE_OFFSET_BITS - 0 != 64
+#error "invalid _FILE_OFFSET_BITS value specified"
+#endif
+#else /* _LP64 */
+#ifndef _FILE_OFFSET_BITS
+#define _FILE_OFFSET_BITS 32
+#endif
+#if _FILE_OFFSET_BITS - 0 != 32 && _FILE_OFFSET_BITS - 0 != 64
+#error "invalid _FILE_OFFSET_BITS value specified"
+#endif
+#endif /* _LP64 */
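+
+/*
+ * Illustrative usage (a sketch, not part of the original header): a 32-bit
+ * application opts into the large-file compilation environment on the
+ * compile line, which makes off_t 64 bits wide and maps file interfaces
+ * such as open() and lseek() onto their 64-bit entry points:
+ *
+ *	cc -D_FILE_OFFSET_BITS=64 app.c
+ */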
+
+/*
+ * Use of _XOPEN_SOURCE
+ *
+ * The following X/Open specifications are supported:
+ *
+ * X/Open Portability Guide, Issue 3 (XPG3)
+ * X/Open CAE Specification, Issue 4 (XPG4)
+ * X/Open CAE Specification, Issue 4, Version 2 (XPG4v2)
+ * X/Open CAE Specification, Issue 5 (XPG5)
+ * Open Group Technical Standard, Issue 6 (XPG6), also referred to as
+ * IEEE Std. 1003.1-2001 and ISO/IEC 9945:2002.
+ * Open Group Technical Standard, Issue 7 (XPG7), also referred to as
+ * IEEE Std. 1003.1-2008 and ISO/IEC 9945:2009.
+ *
+ * XPG4v2 is also referred to as UNIX 95 (SUS or SUSv1).
+ * XPG5 is also referred to as UNIX 98 or the Single Unix Specification,
+ * Version 2 (SUSv2).
+ * XPG6 is the result of a merge of the X/Open and POSIX specifications
+ * and as such is also referred to as IEEE Std. 1003.1-2001 in
+ * addition to UNIX 03 and SUSv3.
+ * XPG7 is also referred to as UNIX 08 and SUSv4.
+ *
+ * When writing a conforming X/Open application, as per the specification
+ * requirements, the appropriate feature test macros must be defined at
+ * compile time. These are as follows. For more info, see standards(5).
+ *
+ * Feature Test Macro Specification
+ * ------------------------------------------------ -------------
+ * _XOPEN_SOURCE XPG3
+ * _XOPEN_SOURCE && _XOPEN_VERSION = 4 XPG4
+ * _XOPEN_SOURCE && _XOPEN_SOURCE_EXTENDED = 1 XPG4v2
+ * _XOPEN_SOURCE = 500 XPG5
+ * _XOPEN_SOURCE = 600 (or _POSIX_C_SOURCE=200112L) XPG6
+ * _XOPEN_SOURCE = 700 (or _POSIX_C_SOURCE=200809L) XPG7
+ *
+ * In order to simplify the guards within the headers, the following
+ * implementation private test macros have been created. Applications
+ * must NOT use these private test macros as unexpected results will
+ * occur.
+ *
+ * Note that in general, the use of these private macros is cumulative.
+ * For example, the use of _XPG3 with no other restrictions on the X/Open
+ * namespace will make the symbols visible for XPG3 through XPG7
+ * compilation environments. The use of _XPG4_2 with no other X/Open
+ * namespace restrictions indicates that the symbols were introduced in
+ * XPG4v2 and are therefore visible for XPG4v2 through XPG7 compilation
+ * environments, but not for XPG3 or XPG4 compilation environments.
+ *
+ * _XPG3 X/Open Portability Guide, Issue 3 (XPG3)
+ * _XPG4 X/Open CAE Specification, Issue 4 (XPG4)
+ * _XPG4_2 X/Open CAE Specification, Issue 4, Version 2 (XPG4v2/UNIX 95/SUS)
+ * _XPG5 X/Open CAE Specification, Issue 5 (XPG5/UNIX 98/SUSv2)
+ * _XPG6 Open Group Technical Standard, Issue 6 (XPG6/UNIX 03/SUSv3)
+ * _XPG7 Open Group Technical Standard, Issue 7 (XPG7/UNIX 08/SUSv4)
+ */
+
+/* X/Open Portability Guide, Issue 3 */
+#if defined(_XOPEN_SOURCE) && (_XOPEN_SOURCE - 0 < 500) && \
+ (_XOPEN_VERSION - 0 < 4) && !defined(_XOPEN_SOURCE_EXTENDED)
+#define _XPG3
+/* X/Open CAE Specification, Issue 4 */
+#elif (defined(_XOPEN_SOURCE) && _XOPEN_VERSION - 0 == 4)
+#define _XPG4
+#define _XPG3
+/* X/Open CAE Specification, Issue 4, Version 2 */
+#elif (defined(_XOPEN_SOURCE) && _XOPEN_SOURCE_EXTENDED - 0 == 1)
+#define _XPG4_2
+#define _XPG4
+#define _XPG3
+/* X/Open CAE Specification, Issue 5 */
+#elif (_XOPEN_SOURCE - 0 == 500)
+#define _XPG5
+#define _XPG4_2
+#define _XPG4
+#define _XPG3
+#undef _POSIX_C_SOURCE
+#define _POSIX_C_SOURCE 199506L
+/* Open Group Technical Standard, Issue 6 */
+#elif (_XOPEN_SOURCE - 0 == 600) || (_POSIX_C_SOURCE - 0 == 200112L)
+#define _XPG6
+#define _XPG5
+#define _XPG4_2
+#define _XPG4
+#define _XPG3
+#undef _POSIX_C_SOURCE
+#define _POSIX_C_SOURCE 200112L
+#undef _XOPEN_SOURCE
+#define _XOPEN_SOURCE 600
+
+/* Open Group Technical Standard, Issue 7 */
+#elif (_XOPEN_SOURCE - 0 == 700) || (_POSIX_C_SOURCE - 0 == 200809L)
+#define _XPG7
+#define _XPG6
+#define _XPG5
+#define _XPG4_2
+#define _XPG4
+#define _XPG3
+#undef _POSIX_C_SOURCE
+#define _POSIX_C_SOURCE 200809L
+#undef _XOPEN_SOURCE
+#define _XOPEN_SOURCE 700
+#endif
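+
+/*
+ * Illustrative effect of the cascade above (a sketch, not part of the
+ * original header): compiling with -D_XOPEN_SOURCE=600 leaves _XPG6,
+ * _XPG5, _XPG4_2, _XPG4, and _XPG3 all defined and normalizes
+ * _POSIX_C_SOURCE to 200112L, so a header can guard an XPG5-era symbol
+ * with a single "#if defined(_XPG5)" test.
+ */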
+
+/*
+ * _XOPEN_VERSION is defined by the X/Open specifications and is not
+ * normally defined by the application, except in the case of an XPG4
+ * application. On the implementation side, _XOPEN_VERSION defined with
+ * the value of 3 indicates an XPG3 application. _XOPEN_VERSION defined
+ * with the value of 4 indicates an XPG4 or XPG4v2 (UNIX 95) application.
+ * _XOPEN_VERSION defined with a value of 500 indicates an XPG5 (UNIX 98)
+ * application, with a value of 600 an XPG6 (UNIX 03) application, and
+ * with a value of 700 an XPG7 (UNIX 08) application. The appropriate
+ * version is determined by the use of the feature test macros described
+ * earlier. Otherwise, the value of _XOPEN_VERSION defaults to 3,
+ * indicating support for XPG3 applications.
+ */
+#ifndef _XOPEN_VERSION
+#if defined(_XPG7)
+#define _XOPEN_VERSION 700
+#elif defined(_XPG6)
+#define _XOPEN_VERSION 600
+#elif defined(_XPG5)
+#define _XOPEN_VERSION 500
+#elif defined(_XPG4_2)
+#define _XOPEN_VERSION 4
+#else
+#define _XOPEN_VERSION 3
+#endif
+#endif
+
+/*
+ * ANSI C and ISO 9899:1990 say the type long long doesn't exist in strictly
+ * conforming environments. ISO 9899:1999 says it does.
+ *
+ * The presence of _LONGLONG_TYPE says "long long exists"; it is therefore
+ * defined in all but the strictly conforming environments that disallow it.
+ */
+#if !defined(_STDC_C99) && defined(_STRICT_STDC) && !defined(__GNUC__)
+/*
+ * Resist attempts to force the definition of long long in this case.
+ */
+#if defined(_LONGLONG_TYPE)
+#error "No long long in strictly conforming ANSI C & 1990 ISO C environments"
+#endif
+#else
+#if !defined(_LONGLONG_TYPE)
+#define _LONGLONG_TYPE
+#endif
+#endif
+
+/*
+ * It is invalid to compile an XPG3, XPG4, XPG4v2, or XPG5 application
+ * using c99. The same is true for POSIX.1-1990, POSIX.2-1992, POSIX.1b,
+ * and POSIX.1c applications. Likewise, it is invalid to compile an XPG6
+ * or a POSIX.1-2001 application with anything other than a c99 or later
+ * compiler. Therefore, we force an error in both cases.
+ */
+#if defined(_STDC_C99) && (defined(__XOPEN_OR_POSIX) && !defined(_XPG6))
+#error "Compiler or options invalid for pre-UNIX 03 X/Open applications \
+ and pre-2001 POSIX applications"
+#elif !defined(_STDC_C99) && \
+ (defined(__XOPEN_OR_POSIX) && defined(_XPG6))
+#error "Compiler or options invalid; UNIX 03 and POSIX.1-2001 applications \
+ require the use of c99"
+#endif
+
+/*
+ * The following macro defines a value for the ISO C99 restrict
+ * keyword so that _RESTRICT_KYWD resolves to "restrict" if
+ * an ISO C99 compiler is used and "" (null string) if any other
+ * compiler is used. This allows for the use of single prototype
+ * declarations regardless of compiler version.
+ */
+#if (defined(__STDC__) && defined(_STDC_C99)) && !defined(__cplusplus)
+#define _RESTRICT_KYWD restrict
+#else
+#define _RESTRICT_KYWD
+#endif
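+
+/*
+ * Illustrative use in a prototype (a sketch, not part of the original
+ * header): a single declaration serves both C90 and C99 compilers, e.g.
+ *
+ *	extern char *strtok_r(char *_RESTRICT_KYWD,
+ *	    const char *_RESTRICT_KYWD, char **_RESTRICT_KYWD);
+ */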
+
+/*
+ * The following macro indicates header support for the ANSI C++
+ * standard. The ISO/IEC designation for this is ISO/IEC FDIS 14882.
+ */
+#define _ISO_CPP_14882_1998
+
+/*
+ * The following macro indicates header support for the C99 standard,
+ * ISO/IEC 9899:1999, Programming Languages - C.
+ */
+#define _ISO_C_9899_1999
+
+/*
+ * The following macro indicates header support for DTrace. The value is an
+ * integer that corresponds to the major version number for DTrace.
+ */
+#define _DTRACE_VERSION 1
+
+#ifdef __cplusplus
+}
+#endif
+
+#endif /* _SYS_FEATURE_TESTS_H */
diff --git a/sys/cddl/contrib/opensolaris/uts/common/sys/fm/fs/zfs.h b/sys/cddl/contrib/opensolaris/uts/common/sys/fm/fs/zfs.h
new file mode 100644
index 000000000000..029af540b3c7
--- /dev/null
+++ b/sys/cddl/contrib/opensolaris/uts/common/sys/fm/fs/zfs.h
@@ -0,0 +1,97 @@
+/*
+ * CDDL HEADER START
+ *
+ * The contents of this file are subject to the terms of the
+ * Common Development and Distribution License (the "License").
+ * You may not use this file except in compliance with the License.
+ *
+ * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
+ * or http://www.opensolaris.org/os/licensing.
+ * See the License for the specific language governing permissions
+ * and limitations under the License.
+ *
+ * When distributing Covered Code, include this CDDL HEADER in each
+ * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
+ * If applicable, add the following below this CDDL HEADER, with the
+ * fields enclosed by brackets "[]" replaced with your own identifying
+ * information: Portions Copyright [yyyy] [name of copyright owner]
+ *
+ * CDDL HEADER END
+ */
+/*
+ * Copyright 2009 Sun Microsystems, Inc. All rights reserved.
+ * Use is subject to license terms.
+ */
+
+#ifndef _SYS_FM_FS_ZFS_H
+#define _SYS_FM_FS_ZFS_H
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+#define ZFS_ERROR_CLASS "fs.zfs"
+
+#define FM_EREPORT_ZFS_CHECKSUM "checksum"
+#define FM_EREPORT_ZFS_IO "io"
+#define FM_EREPORT_ZFS_DATA "data"
+#define FM_EREPORT_ZFS_POOL "zpool"
+#define FM_EREPORT_ZFS_DEVICE_UNKNOWN "vdev.unknown"
+#define FM_EREPORT_ZFS_DEVICE_OPEN_FAILED "vdev.open_failed"
+#define FM_EREPORT_ZFS_DEVICE_CORRUPT_DATA "vdev.corrupt_data"
+#define FM_EREPORT_ZFS_DEVICE_NO_REPLICAS "vdev.no_replicas"
+#define FM_EREPORT_ZFS_DEVICE_BAD_GUID_SUM "vdev.bad_guid_sum"
+#define FM_EREPORT_ZFS_DEVICE_TOO_SMALL "vdev.too_small"
+#define FM_EREPORT_ZFS_DEVICE_BAD_LABEL "vdev.bad_label"
+#define FM_EREPORT_ZFS_IO_FAILURE "io_failure"
+#define FM_EREPORT_ZFS_PROBE_FAILURE "probe_failure"
+#define FM_EREPORT_ZFS_LOG_REPLAY "log_replay"
+#define FM_EREPORT_ZFS_CONFIG_CACHE_WRITE "config_cache_write"
+
+#define FM_EREPORT_PAYLOAD_ZFS_POOL "pool"
+#define FM_EREPORT_PAYLOAD_ZFS_POOL_FAILMODE "pool_failmode"
+#define FM_EREPORT_PAYLOAD_ZFS_POOL_GUID "pool_guid"
+#define FM_EREPORT_PAYLOAD_ZFS_POOL_CONTEXT "pool_context"
+#define FM_EREPORT_PAYLOAD_ZFS_VDEV_GUID "vdev_guid"
+#define FM_EREPORT_PAYLOAD_ZFS_VDEV_TYPE "vdev_type"
+#define FM_EREPORT_PAYLOAD_ZFS_VDEV_PATH "vdev_path"
+#define FM_EREPORT_PAYLOAD_ZFS_VDEV_DEVID "vdev_devid"
+#define FM_EREPORT_PAYLOAD_ZFS_VDEV_FRU "vdev_fru"
+#define FM_EREPORT_PAYLOAD_ZFS_PARENT_GUID "parent_guid"
+#define FM_EREPORT_PAYLOAD_ZFS_PARENT_TYPE "parent_type"
+#define FM_EREPORT_PAYLOAD_ZFS_PARENT_PATH "parent_path"
+#define FM_EREPORT_PAYLOAD_ZFS_PARENT_DEVID "parent_devid"
+#define FM_EREPORT_PAYLOAD_ZFS_ZIO_OBJSET "zio_objset"
+#define FM_EREPORT_PAYLOAD_ZFS_ZIO_OBJECT "zio_object"
+#define FM_EREPORT_PAYLOAD_ZFS_ZIO_LEVEL "zio_level"
+#define FM_EREPORT_PAYLOAD_ZFS_ZIO_BLKID "zio_blkid"
+#define FM_EREPORT_PAYLOAD_ZFS_ZIO_ERR "zio_err"
+#define FM_EREPORT_PAYLOAD_ZFS_ZIO_OFFSET "zio_offset"
+#define FM_EREPORT_PAYLOAD_ZFS_ZIO_SIZE "zio_size"
+#define FM_EREPORT_PAYLOAD_ZFS_PREV_STATE "prev_state"
+#define FM_EREPORT_PAYLOAD_ZFS_CKSUM_EXPECTED "cksum_expected"
+#define FM_EREPORT_PAYLOAD_ZFS_CKSUM_ACTUAL "cksum_actual"
+#define FM_EREPORT_PAYLOAD_ZFS_CKSUM_ALGO "cksum_algorithm"
+#define FM_EREPORT_PAYLOAD_ZFS_CKSUM_BYTESWAP "cksum_byteswap"
+#define FM_EREPORT_PAYLOAD_ZFS_BAD_OFFSET_RANGES "bad_ranges"
+#define FM_EREPORT_PAYLOAD_ZFS_BAD_RANGE_MIN_GAP "bad_ranges_min_gap"
+#define FM_EREPORT_PAYLOAD_ZFS_BAD_RANGE_SETS "bad_range_sets"
+#define FM_EREPORT_PAYLOAD_ZFS_BAD_RANGE_CLEARS "bad_range_clears"
+#define FM_EREPORT_PAYLOAD_ZFS_BAD_SET_BITS "bad_set_bits"
+#define FM_EREPORT_PAYLOAD_ZFS_BAD_CLEARED_BITS "bad_cleared_bits"
+#define FM_EREPORT_PAYLOAD_ZFS_BAD_SET_HISTOGRAM "bad_set_histogram"
+#define FM_EREPORT_PAYLOAD_ZFS_BAD_CLEARED_HISTOGRAM "bad_cleared_histogram"
+
+#define FM_EREPORT_FAILMODE_WAIT "wait"
+#define FM_EREPORT_FAILMODE_CONTINUE "continue"
+#define FM_EREPORT_FAILMODE_PANIC "panic"
+
+#define FM_RESOURCE_REMOVED "removed"
+#define FM_RESOURCE_AUTOREPLACE "autoreplace"
+#define FM_RESOURCE_STATECHANGE "statechange"
+
+#ifdef __cplusplus
+}
+#endif
+
+#endif /* _SYS_FM_FS_ZFS_H */
diff --git a/sys/cddl/contrib/opensolaris/uts/common/sys/fm/protocol.h b/sys/cddl/contrib/opensolaris/uts/common/sys/fm/protocol.h
new file mode 100644
index 000000000000..f5f93421bd74
--- /dev/null
+++ b/sys/cddl/contrib/opensolaris/uts/common/sys/fm/protocol.h
@@ -0,0 +1,369 @@
+/*
+ * CDDL HEADER START
+ *
+ * The contents of this file are subject to the terms of the
+ * Common Development and Distribution License (the "License").
+ * You may not use this file except in compliance with the License.
+ *
+ * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
+ * or http://www.opensolaris.org/os/licensing.
+ * See the License for the specific language governing permissions
+ * and limitations under the License.
+ *
+ * When distributing Covered Code, include this CDDL HEADER in each
+ * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
+ * If applicable, add the following below this CDDL HEADER, with the
+ * fields enclosed by brackets "[]" replaced with your own identifying
+ * information: Portions Copyright [yyyy] [name of copyright owner]
+ *
+ * CDDL HEADER END
+ */
+
+/*
+ * Copyright (c) 2004, 2010, Oracle and/or its affiliates. All rights reserved.
+ */
+
+#ifndef _SYS_FM_PROTOCOL_H
+#define _SYS_FM_PROTOCOL_H
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+#ifdef _KERNEL
+#include <sys/varargs.h>
+#include <sys/nvpair.h>
+#else
+#include <libnvpair.h>
+#include <stdarg.h>
+#endif
+
+/* FM common member names */
+#define FM_CLASS "class"
+#define FM_VERSION "version"
+
+/* FM protocol category 1 class names */
+#define FM_EREPORT_CLASS "ereport"
+#define FM_FAULT_CLASS "fault"
+#define FM_DEFECT_CLASS "defect"
+#define FM_RSRC_CLASS "resource"
+#define FM_LIST_EVENT "list"
+#define FM_IREPORT_CLASS "ireport"
+
+/* FM list.* event class values */
+#define FM_LIST_SUSPECT_CLASS FM_LIST_EVENT ".suspect"
+#define FM_LIST_ISOLATED_CLASS FM_LIST_EVENT ".isolated"
+#define FM_LIST_REPAIRED_CLASS FM_LIST_EVENT ".repaired"
+#define FM_LIST_UPDATED_CLASS FM_LIST_EVENT ".updated"
+#define FM_LIST_RESOLVED_CLASS FM_LIST_EVENT ".resolved"
+
+/* ereport class subcategory values */
+#define FM_ERROR_CPU "cpu"
+#define FM_ERROR_IO "io"
+
+/* ereport version and payload member names */
+#define FM_EREPORT_VERS0 0
+#define FM_EREPORT_VERSION FM_EREPORT_VERS0
+
+/* ereport payload member names */
+#define FM_EREPORT_DETECTOR "detector"
+#define FM_EREPORT_ENA "ena"
+
+/* list.* event payload member names */
+#define FM_LIST_EVENT_SIZE "list-sz"
+
+/* ireport.* event payload member names */
+#define FM_IREPORT_DETECTOR "detector"
+#define FM_IREPORT_UUID "uuid"
+#define FM_IREPORT_PRIORITY "pri"
+#define FM_IREPORT_ATTRIBUTES "attr"
+
+/*
+ * list.suspect, isolated, updated, repaired and resolved
+ * versions/payload member names.
+ */
+#define FM_SUSPECT_UUID "uuid"
+#define FM_SUSPECT_DIAG_CODE "code"
+#define FM_SUSPECT_DIAG_TIME "diag-time"
+#define FM_SUSPECT_DE "de"
+#define FM_SUSPECT_FAULT_LIST "fault-list"
+#define FM_SUSPECT_FAULT_SZ "fault-list-sz"
+#define FM_SUSPECT_FAULT_STATUS "fault-status"
+#define FM_SUSPECT_INJECTED "__injected"
+#define FM_SUSPECT_MESSAGE "message"
+#define FM_SUSPECT_RETIRE "retire"
+#define FM_SUSPECT_RESPONSE "response"
+#define FM_SUSPECT_SEVERITY "severity"
+
+#define FM_SUSPECT_VERS0 0
+#define FM_SUSPECT_VERSION FM_SUSPECT_VERS0
+
+#define FM_SUSPECT_FAULTY 0x1
+#define FM_SUSPECT_UNUSABLE 0x2
+#define FM_SUSPECT_NOT_PRESENT 0x4
+#define FM_SUSPECT_DEGRADED 0x8
+#define FM_SUSPECT_REPAIRED 0x10
+#define FM_SUSPECT_REPLACED 0x20
+#define FM_SUSPECT_ACQUITTED 0x40
+
+/* fault event versions and payload member names */
+#define FM_FAULT_VERS0 0
+#define FM_FAULT_VERSION FM_FAULT_VERS0
+
+#define FM_FAULT_ASRU "asru"
+#define FM_FAULT_FRU "fru"
+#define FM_FAULT_FRU_LABEL "fru-label"
+#define FM_FAULT_CERTAINTY "certainty"
+#define FM_FAULT_RESOURCE "resource"
+#define FM_FAULT_LOCATION "location"
+
+/* resource event versions and payload member names */
+#define FM_RSRC_VERS0 0
+#define FM_RSRC_VERSION FM_RSRC_VERS0
+#define FM_RSRC_RESOURCE "resource"
+
+/* resource.fm.asru.* payload member names */
+#define FM_RSRC_ASRU_UUID "uuid"
+#define FM_RSRC_ASRU_CODE "code"
+#define FM_RSRC_ASRU_FAULTY "faulty"
+#define FM_RSRC_ASRU_REPAIRED "repaired"
+#define FM_RSRC_ASRU_REPLACED "replaced"
+#define FM_RSRC_ASRU_ACQUITTED "acquitted"
+#define FM_RSRC_ASRU_RESOLVED "resolved"
+#define FM_RSRC_ASRU_UNUSABLE "unusable"
+#define FM_RSRC_ASRU_EVENT "event"
+
+/* resource.fm.xprt.* versions and payload member names */
+#define FM_RSRC_XPRT_VERS0 0
+#define FM_RSRC_XPRT_VERSION FM_RSRC_XPRT_VERS0
+#define FM_RSRC_XPRT_UUID "uuid"
+#define FM_RSRC_XPRT_SUBCLASS "subclass"
+#define FM_RSRC_XPRT_FAULT_STATUS "fault-status"
+#define FM_RSRC_XPRT_FAULT_HAS_ASRU "fault-has-asru"
+
+/*
+ * FM ENA Format Macros
+ */
+#define ENA_FORMAT_MASK 0x3
+#define ENA_FORMAT(ena) ((ena) & ENA_FORMAT_MASK)
+
+/* ENA format types */
+#define FM_ENA_FMT0 0
+#define FM_ENA_FMT1 1
+#define FM_ENA_FMT2 2
+
+/* Format 1 */
+#define ENA_FMT1_GEN_MASK 0x00000000000003FCull
+#define ENA_FMT1_ID_MASK 0xFFFFFFFFFFFFFC00ull
+#define ENA_FMT1_CPUID_MASK 0x00000000000FFC00ull
+#define ENA_FMT1_TIME_MASK 0xFFFFFFFFFFF00000ull
+#define ENA_FMT1_GEN_SHFT 2
+#define ENA_FMT1_ID_SHFT 10
+#define ENA_FMT1_CPUID_SHFT ENA_FMT1_ID_SHFT
+#define ENA_FMT1_TIME_SHFT 20
+
+/* Format 2 */
+#define ENA_FMT2_GEN_MASK 0x00000000000003FCull
+#define ENA_FMT2_ID_MASK 0xFFFFFFFFFFFFFC00ull
+#define ENA_FMT2_TIME_MASK ENA_FMT2_ID_MASK
+#define ENA_FMT2_GEN_SHFT 2
+#define ENA_FMT2_ID_SHFT 10
+#define ENA_FMT2_TIME_SHFT ENA_FMT2_ID_SHFT
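+
+/*
+ * Illustrative decode of a format-1 ENA (a sketch, not part of the
+ * original header; "ena" stands for a hypothetical uint64_t value):
+ *
+ *	if (ENA_FORMAT(ena) == FM_ENA_FMT1) {
+ *		uint64_t gen = (ena & ENA_FMT1_GEN_MASK) >> ENA_FMT1_GEN_SHFT;
+ *		uint64_t cpu = (ena & ENA_FMT1_CPUID_MASK) >> ENA_FMT1_CPUID_SHFT;
+ *		uint64_t time = (ena & ENA_FMT1_TIME_MASK) >> ENA_FMT1_TIME_SHFT;
+ *	}
+ */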
+
+/* Common FMRI type names */
+#define FM_FMRI_AUTHORITY "authority"
+#define FM_FMRI_SCHEME "scheme"
+#define FM_FMRI_SVC_AUTHORITY "svc-authority"
+#define FM_FMRI_FACILITY "facility"
+
+/* FMRI authority-type member names */
+#define FM_FMRI_AUTH_CHASSIS "chassis-id"
+#define FM_FMRI_AUTH_PRODUCT_SN "product-sn"
+#define FM_FMRI_AUTH_PRODUCT "product-id"
+#define FM_FMRI_AUTH_DOMAIN "domain-id"
+#define FM_FMRI_AUTH_SERVER "server-id"
+#define FM_FMRI_AUTH_HOST "host-id"
+
+#define FM_AUTH_VERS0 0
+#define FM_FMRI_AUTH_VERSION FM_AUTH_VERS0
+
+/* scheme name values */
+#define FM_FMRI_SCHEME_FMD "fmd"
+#define FM_FMRI_SCHEME_DEV "dev"
+#define FM_FMRI_SCHEME_HC "hc"
+#define FM_FMRI_SCHEME_SVC "svc"
+#define FM_FMRI_SCHEME_CPU "cpu"
+#define FM_FMRI_SCHEME_MEM "mem"
+#define FM_FMRI_SCHEME_MOD "mod"
+#define FM_FMRI_SCHEME_PKG "pkg"
+#define FM_FMRI_SCHEME_LEGACY "legacy-hc"
+#define FM_FMRI_SCHEME_ZFS "zfs"
+#define FM_FMRI_SCHEME_SW "sw"
+
+/* Scheme versions */
+#define FMD_SCHEME_VERSION0 0
+#define FM_FMD_SCHEME_VERSION FMD_SCHEME_VERSION0
+#define DEV_SCHEME_VERSION0 0
+#define FM_DEV_SCHEME_VERSION DEV_SCHEME_VERSION0
+#define FM_HC_VERS0 0
+#define FM_HC_SCHEME_VERSION FM_HC_VERS0
+#define CPU_SCHEME_VERSION0 0
+#define CPU_SCHEME_VERSION1 1
+#define FM_CPU_SCHEME_VERSION CPU_SCHEME_VERSION1
+#define MEM_SCHEME_VERSION0 0
+#define FM_MEM_SCHEME_VERSION MEM_SCHEME_VERSION0
+#define MOD_SCHEME_VERSION0 0
+#define FM_MOD_SCHEME_VERSION MOD_SCHEME_VERSION0
+#define PKG_SCHEME_VERSION0 0
+#define FM_PKG_SCHEME_VERSION PKG_SCHEME_VERSION0
+#define LEGACY_SCHEME_VERSION0 0
+#define FM_LEGACY_SCHEME_VERSION LEGACY_SCHEME_VERSION0
+#define SVC_SCHEME_VERSION0 0
+#define FM_SVC_SCHEME_VERSION SVC_SCHEME_VERSION0
+#define ZFS_SCHEME_VERSION0 0
+#define FM_ZFS_SCHEME_VERSION ZFS_SCHEME_VERSION0
+#define SW_SCHEME_VERSION0 0
+#define FM_SW_SCHEME_VERSION SW_SCHEME_VERSION0
+
+/* hc scheme member names */
+#define FM_FMRI_HC_SERIAL_ID "serial"
+#define FM_FMRI_HC_PART "part"
+#define FM_FMRI_HC_REVISION "revision"
+#define FM_FMRI_HC_ROOT "hc-root"
+#define FM_FMRI_HC_LIST_SZ "hc-list-sz"
+#define FM_FMRI_HC_LIST "hc-list"
+#define FM_FMRI_HC_SPECIFIC "hc-specific"
+
+/* facility member names */
+#define FM_FMRI_FACILITY_NAME "facility-name"
+#define FM_FMRI_FACILITY_TYPE "facility-type"
+
+/* hc-list version and member names */
+#define FM_FMRI_HC_NAME "hc-name"
+#define FM_FMRI_HC_ID "hc-id"
+
+#define HC_LIST_VERSION0 0
+#define FM_HC_LIST_VERSION HC_LIST_VERSION0
+
+/* hc-specific member names */
+#define FM_FMRI_HC_SPECIFIC_OFFSET "offset"
+#define FM_FMRI_HC_SPECIFIC_PHYSADDR "physaddr"
+
+/* fmd module scheme member names */
+#define FM_FMRI_FMD_NAME "mod-name"
+#define FM_FMRI_FMD_VERSION "mod-version"
+
+/* dev scheme member names */
+#define FM_FMRI_DEV_ID "devid"
+#define FM_FMRI_DEV_TGTPTLUN0 "target-port-l0id"
+#define FM_FMRI_DEV_PATH "device-path"
+
+/* pkg scheme member names */
+#define FM_FMRI_PKG_BASEDIR "pkg-basedir"
+#define FM_FMRI_PKG_INST "pkg-inst"
+#define FM_FMRI_PKG_VERSION "pkg-version"
+
+/* svc scheme member names */
+#define FM_FMRI_SVC_NAME "svc-name"
+#define FM_FMRI_SVC_INSTANCE "svc-instance"
+#define FM_FMRI_SVC_CONTRACT_ID "svc-contract-id"
+
+/* svc-authority member names */
+#define FM_FMRI_SVC_AUTH_SCOPE "scope"
+#define FM_FMRI_SVC_AUTH_SYSTEM_FQN "system-fqn"
+
+/* cpu scheme member names */
+#define FM_FMRI_CPU_ID "cpuid"
+#define FM_FMRI_CPU_SERIAL_ID "serial"
+#define FM_FMRI_CPU_MASK "cpumask"
+#define FM_FMRI_CPU_VID "cpuvid"
+#define FM_FMRI_CPU_CPUFRU "cpufru"
+#define FM_FMRI_CPU_CACHE_INDEX "cacheindex"
+#define FM_FMRI_CPU_CACHE_WAY "cacheway"
+#define FM_FMRI_CPU_CACHE_BIT "cachebit"
+#define FM_FMRI_CPU_CACHE_TYPE "cachetype"
+
+#define FM_FMRI_CPU_CACHE_TYPE_L2 0
+#define FM_FMRI_CPU_CACHE_TYPE_L3 1
+
+/* legacy-hc scheme member names */
+#define FM_FMRI_LEGACY_HC "component"
+#define FM_FMRI_LEGACY_HC_PREFIX FM_FMRI_SCHEME_HC":///" \
+ FM_FMRI_LEGACY_HC"="
+
+/* mem scheme member names */
+#define FM_FMRI_MEM_UNUM "unum"
+#define FM_FMRI_MEM_SERIAL_ID "serial"
+#define FM_FMRI_MEM_PHYSADDR "physaddr"
+#define FM_FMRI_MEM_MEMCONFIG "memconfig"
+#define FM_FMRI_MEM_OFFSET "offset"
+
+/* mod scheme member names */
+#define FM_FMRI_MOD_PKG "mod-pkg"
+#define FM_FMRI_MOD_NAME "mod-name"
+#define FM_FMRI_MOD_ID "mod-id"
+#define FM_FMRI_MOD_DESC "mod-desc"
+
+/* zfs scheme member names */
+#define FM_FMRI_ZFS_POOL "pool"
+#define FM_FMRI_ZFS_VDEV "vdev"
+
+/* sw scheme member names - extra indentation for members of an nvlist */
+#define FM_FMRI_SW_OBJ "object"
+#define FM_FMRI_SW_OBJ_PATH "path"
+#define FM_FMRI_SW_OBJ_ROOT "root"
+#define FM_FMRI_SW_OBJ_PKG "pkg"
+#define FM_FMRI_SW_SITE "site"
+#define FM_FMRI_SW_SITE_TOKEN "token"
+#define FM_FMRI_SW_SITE_MODULE "module"
+#define FM_FMRI_SW_SITE_FILE "file"
+#define FM_FMRI_SW_SITE_LINE "line"
+#define FM_FMRI_SW_SITE_FUNC "func"
+#define FM_FMRI_SW_CTXT "context"
+#define FM_FMRI_SW_CTXT_ORIGIN "origin"
+#define FM_FMRI_SW_CTXT_EXECNAME "execname"
+#define FM_FMRI_SW_CTXT_PID "pid"
+#define FM_FMRI_SW_CTXT_ZONE "zone"
+#define FM_FMRI_SW_CTXT_CTID "ctid"
+#define FM_FMRI_SW_CTXT_STACK "stack"
+
+extern nv_alloc_t *fm_nva_xcreate(char *, size_t);
+extern void fm_nva_xdestroy(nv_alloc_t *);
+
+extern nvlist_t *fm_nvlist_create(nv_alloc_t *);
+extern void fm_nvlist_destroy(nvlist_t *, int);
+
+#define FM_NVA_FREE 0 /* free allocator on nvlist_destroy */
+#define FM_NVA_RETAIN 1 /* keep allocator on nvlist_destroy */
+
+extern void fm_ereport_set(nvlist_t *, int, const char *, uint64_t,
+ const nvlist_t *, ...);
+extern void fm_payload_set(nvlist_t *, ...);
+extern int i_fm_payload_set(nvlist_t *, const char *, va_list);
+extern void fm_fmri_hc_set(nvlist_t *, int, const nvlist_t *, nvlist_t *,
+ int, ...);
+extern void fm_fmri_dev_set(nvlist_t *, int, const nvlist_t *, const char *,
+ const char *, const char *);
+extern void fm_fmri_de_set(nvlist_t *, int, const nvlist_t *, const char *);
+extern void fm_fmri_cpu_set(nvlist_t *, int, const nvlist_t *, uint32_t,
+ uint8_t *, const char *);
+extern void fm_fmri_mem_set(nvlist_t *, int, const nvlist_t *, const char *,
+ const char *, uint64_t);
+extern void fm_authority_set(nvlist_t *, int, const char *, const char *,
+ const char *, const char *);
+extern void fm_fmri_zfs_set(nvlist_t *, int, uint64_t, uint64_t);
+extern void fm_fmri_hc_create(nvlist_t *, int, const nvlist_t *, nvlist_t *,
+ nvlist_t *, int, ...);
+
+extern uint64_t fm_ena_increment(uint64_t);
+extern uint64_t fm_ena_generate(uint64_t, uchar_t);
+extern uint64_t fm_ena_generation_get(uint64_t);
+extern uchar_t fm_ena_format_get(uint64_t);
+extern uint64_t fm_ena_id_get(uint64_t);
+extern uint64_t fm_ena_time_get(uint64_t);
+
+#ifdef __cplusplus
+}
+#endif
+
+#endif /* _SYS_FM_PROTOCOL_H */
diff --git a/sys/cddl/contrib/opensolaris/uts/common/sys/fm/util.h b/sys/cddl/contrib/opensolaris/uts/common/sys/fm/util.h
new file mode 100644
index 000000000000..e99a370af7ae
--- /dev/null
+++ b/sys/cddl/contrib/opensolaris/uts/common/sys/fm/util.h
@@ -0,0 +1,102 @@
+/*
+ * CDDL HEADER START
+ *
+ * The contents of this file are subject to the terms of the
+ * Common Development and Distribution License (the "License").
+ * You may not use this file except in compliance with the License.
+ *
+ * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
+ * or http://www.opensolaris.org/os/licensing.
+ * See the License for the specific language governing permissions
+ * and limitations under the License.
+ *
+ * When distributing Covered Code, include this CDDL HEADER in each
+ * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
+ * If applicable, add the following below this CDDL HEADER, with the
+ * fields enclosed by brackets "[]" replaced with your own identifying
+ * information: Portions Copyright [yyyy] [name of copyright owner]
+ *
+ * CDDL HEADER END
+ */
+
+/*
+ * Copyright (c) 2004, 2010, Oracle and/or its affiliates. All rights reserved.
+ * Copyright 2017 RackTop Systems.
+ */
+
+#ifndef _SYS_FM_UTIL_H
+#define _SYS_FM_UTIL_H
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+#include <sys/nvpair.h>
+#include <sys/errorq.h>
+
+/*
+ * Shared user/kernel definitions for class length, error channel name,
+ * and kernel event publisher string.
+ */
+#define FM_MAX_CLASS 100
+#define FM_ERROR_CHAN "com.sun:fm:error"
+#define FM_PUB "fm"
+
+/*
+ * ereport dump device transport support
+ *
+ * Ereports are written out to the dump device at a prescribed offset from the
+ * end, similar to in-transit log messages. The ereports are represented as an
+ * erpt_dump_t header followed by ed_size bytes of packed native nvlist data.
+ *
+ * NOTE: All of these constants and the header must be defined so they have the
+ * same representation for *both* 32-bit and 64-bit producers and consumers.
+ */
+#define ERPT_MAGIC 0xf00d4eddU
+#define ERPT_MAX_ERRS 16
+#define ERPT_DATA_SZ (6 * 1024)
+#define ERPT_EVCH_MAX 256
+#define ERPT_HIWAT 64
+
+typedef struct erpt_dump {
+ uint32_t ed_magic; /* ERPT_MAGIC or zero to indicate end */
+ uint32_t ed_chksum; /* checksum32() of packed nvlist data */
+ uint32_t ed_size; /* ereport (nvl) fixed buf size */
+ uint32_t ed_pad; /* reserved for future use */
+ hrtime_t ed_hrt_nsec; /* hrtime of this ereport */
+ hrtime_t ed_hrt_base; /* hrtime sample corresponding to ed_tod_base */
+ struct {
+ uint64_t sec; /* seconds since gettimeofday() Epoch */
+ uint64_t nsec; /* nanoseconds past ed_tod_base.sec */
+ } ed_tod_base;
+} erpt_dump_t;
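+
+/*
+ * Illustrative consumer loop (a sketch, not part of the original header;
+ * "buf" is a hypothetical "const char *" into the dump region and
+ * unpack_nvlist() a hypothetical helper): records are read until the
+ * zero-magic terminator, validating each payload first:
+ *
+ *	erpt_dump_t ed;
+ *	(void) memcpy(&ed, buf, sizeof (ed));
+ *	if (ed.ed_magic == ERPT_MAGIC &&
+ *	    ed.ed_chksum == checksum32(buf + sizeof (ed), ed.ed_size))
+ *		unpack_nvlist(buf + sizeof (ed), ed.ed_size);
+ */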
+
+#if defined(_KERNEL) || defined(_FAKE_KERNEL)
+#include <sys/systm.h>
+
+#define FM_STK_DEPTH 20 /* maximum stack depth */
+#define FM_SYM_SZ 64 /* maximum symbol size */
+#define FM_ERR_PIL 2 /* PIL for ereport_errorq drain processing */
+
+#define FM_EREPORT_PAYLOAD_NAME_STACK "stack"
+
+extern errorq_t *ereport_errorq;
+extern void *ereport_dumpbuf;
+extern size_t ereport_dumplen;
+
+extern void fm_init(void);
+extern void fm_nvprint(nvlist_t *);
+#define fm_panic panic
+extern void fm_banner(void);
+
+extern void fm_ereport_dump(void);
+extern void fm_ereport_post(nvlist_t *, int);
+
+extern int is_fm_panic();
+#endif /* _KERNEL || _FAKE_KERNEL */
+
+#ifdef __cplusplus
+}
+#endif
+
+#endif /* _SYS_FM_UTIL_H */
diff --git a/sys/cddl/contrib/opensolaris/uts/common/sys/fs/zfs.h b/sys/cddl/contrib/opensolaris/uts/common/sys/fs/zfs.h
new file mode 100644
index 000000000000..db23bbe01b9f
--- /dev/null
+++ b/sys/cddl/contrib/opensolaris/uts/common/sys/fs/zfs.h
@@ -0,0 +1,1248 @@
+/*
+ * CDDL HEADER START
+ *
+ * The contents of this file are subject to the terms of the
+ * Common Development and Distribution License (the "License").
+ * You may not use this file except in compliance with the License.
+ *
+ * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
+ * or http://www.opensolaris.org/os/licensing.
+ * See the License for the specific language governing permissions
+ * and limitations under the License.
+ *
+ * When distributing Covered Code, include this CDDL HEADER in each
+ * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
+ * If applicable, add the following below this CDDL HEADER, with the
+ * fields enclosed by brackets "[]" replaced with your own identifying
+ * information: Portions Copyright [yyyy] [name of copyright owner]
+ *
+ * CDDL HEADER END
+ */
+
+/*
+ * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved.
+ * Copyright (c) 2011, 2020 by Delphix. All rights reserved.
+ * Copyright 2011 Nexenta Systems, Inc. All rights reserved.
+ * Copyright (c) 2012, Martin Matuska <mm@FreeBSD.org>. All rights reserved.
+ * Copyright (c) 2014 Integros [integros.com]
+ * Copyright 2017 Joyent, Inc.
+ * Copyright (c) 2019 Datto Inc.
+ * Copyright (c) 2017, Intel Corporation.
+ */
+
+/* Portions Copyright 2010 Robert Milkowski */
+
+#ifndef _SYS_FS_ZFS_H
+#define _SYS_FS_ZFS_H
+
+#include <sys/types.h>
+#include <sys/ioccom.h>
+#include <sys/time.h>
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+/*
+ * Types and constants shared between userland and the kernel.
+ */
+
+/*
+ * Each dataset can be one of the following types. These constants can be
+ * combined into masks that can be passed to various functions.
+ */
+typedef enum {
+ ZFS_TYPE_FILESYSTEM = (1 << 0),
+ ZFS_TYPE_SNAPSHOT = (1 << 1),
+ ZFS_TYPE_VOLUME = (1 << 2),
+ ZFS_TYPE_POOL = (1 << 3),
+ ZFS_TYPE_BOOKMARK = (1 << 4)
+} zfs_type_t;
+
+/*
+ * NB: lzc_dataset_type should be updated whenever a new objset type is added,
+ * if it represents a real type of a dataset that can be created from userland.
+ */
+typedef enum dmu_objset_type {
+ DMU_OST_NONE,
+ DMU_OST_META,
+ DMU_OST_ZFS,
+ DMU_OST_ZVOL,
+ DMU_OST_OTHER, /* For testing only! */
+ DMU_OST_ANY, /* Be careful! */
+ DMU_OST_NUMTYPES
+} dmu_objset_type_t;
+
+#define ZFS_TYPE_DATASET \
+ (ZFS_TYPE_FILESYSTEM | ZFS_TYPE_VOLUME | ZFS_TYPE_SNAPSHOT)
+
+/*
+ * All of these include the terminating NUL byte.
+ */
+#define ZAP_MAXNAMELEN 256
+#define ZAP_MAXVALUELEN (1024 * 8)
+#define ZAP_OLDMAXVALUELEN 1024
+#define ZFS_MAX_DATASET_NAME_LEN 256
+
+/*
+ * Dataset properties are identified by these constants and must be added to
+ * the end of this list to ensure that external consumers are not affected
+ * by the change. If you make any changes to this list, be sure to update
+ * the property table in usr/src/common/zfs/zfs_prop.c.
+ */
+typedef enum {
+ ZPROP_CONT = -2,
+ ZPROP_INVAL = -1,
+ ZFS_PROP_TYPE = 0,
+ ZFS_PROP_CREATION,
+ ZFS_PROP_USED,
+ ZFS_PROP_AVAILABLE,
+ ZFS_PROP_REFERENCED,
+ ZFS_PROP_COMPRESSRATIO,
+ ZFS_PROP_MOUNTED,
+ ZFS_PROP_ORIGIN,
+ ZFS_PROP_QUOTA,
+ ZFS_PROP_RESERVATION,
+ ZFS_PROP_VOLSIZE,
+ ZFS_PROP_VOLBLOCKSIZE,
+ ZFS_PROP_RECORDSIZE,
+ ZFS_PROP_MOUNTPOINT,
+ ZFS_PROP_SHARENFS,
+ ZFS_PROP_CHECKSUM,
+ ZFS_PROP_COMPRESSION,
+ ZFS_PROP_ATIME,
+ ZFS_PROP_DEVICES,
+ ZFS_PROP_EXEC,
+ ZFS_PROP_SETUID,
+ ZFS_PROP_READONLY,
+ ZFS_PROP_ZONED,
+ ZFS_PROP_SNAPDIR,
+ ZFS_PROP_ACLMODE,
+ ZFS_PROP_ACLINHERIT,
+ ZFS_PROP_CREATETXG,
+ ZFS_PROP_NAME, /* not exposed to the user */
+ ZFS_PROP_CANMOUNT,
+ ZFS_PROP_ISCSIOPTIONS, /* not exposed to the user */
+ ZFS_PROP_XATTR,
+ ZFS_PROP_NUMCLONES, /* not exposed to the user */
+ ZFS_PROP_COPIES,
+ ZFS_PROP_VERSION,
+ ZFS_PROP_UTF8ONLY,
+ ZFS_PROP_NORMALIZE,
+ ZFS_PROP_CASE,
+ ZFS_PROP_VSCAN,
+ ZFS_PROP_NBMAND,
+ ZFS_PROP_SHARESMB,
+ ZFS_PROP_REFQUOTA,
+ ZFS_PROP_REFRESERVATION,
+ ZFS_PROP_GUID,
+ ZFS_PROP_PRIMARYCACHE,
+ ZFS_PROP_SECONDARYCACHE,
+ ZFS_PROP_USEDSNAP,
+ ZFS_PROP_USEDDS,
+ ZFS_PROP_USEDCHILD,
+ ZFS_PROP_USEDREFRESERV,
+ ZFS_PROP_USERACCOUNTING, /* not exposed to the user */
+ ZFS_PROP_STMF_SHAREINFO, /* not exposed to the user */
+ ZFS_PROP_DEFER_DESTROY,
+ ZFS_PROP_USERREFS,
+ ZFS_PROP_LOGBIAS,
+ ZFS_PROP_UNIQUE, /* not exposed to the user */
+ ZFS_PROP_OBJSETID, /* not exposed to the user */
+ ZFS_PROP_DEDUP,
+ ZFS_PROP_MLSLABEL,
+ ZFS_PROP_SYNC,
+ ZFS_PROP_DNODESIZE,
+ ZFS_PROP_REFRATIO,
+ ZFS_PROP_WRITTEN,
+ ZFS_PROP_CLONES,
+ ZFS_PROP_LOGICALUSED,
+ ZFS_PROP_LOGICALREFERENCED,
+ ZFS_PROP_INCONSISTENT, /* not exposed to the user */
+ ZFS_PROP_VOLMODE,
+ ZFS_PROP_FILESYSTEM_LIMIT,
+ ZFS_PROP_SNAPSHOT_LIMIT,
+ ZFS_PROP_FILESYSTEM_COUNT,
+ ZFS_PROP_SNAPSHOT_COUNT,
+ ZFS_PROP_REDUNDANT_METADATA,
+ ZFS_PROP_PREV_SNAP,
+ ZFS_PROP_RECEIVE_RESUME_TOKEN,
+ ZFS_PROP_REMAPTXG, /* not exposed to the user */
+ ZFS_PROP_SPECIAL_SMALL_BLOCKS,
+ ZFS_NUM_PROPS
+} zfs_prop_t;
+
+typedef enum {
+ ZFS_PROP_USERUSED,
+ ZFS_PROP_USERQUOTA,
+ ZFS_PROP_GROUPUSED,
+ ZFS_PROP_GROUPQUOTA,
+ ZFS_NUM_USERQUOTA_PROPS
+} zfs_userquota_prop_t;
+
+extern const char *zfs_userquota_prop_prefixes[ZFS_NUM_USERQUOTA_PROPS];
+
+/*
+ * Pool properties are identified by these constants and must be added to the
+ * end of this list to ensure that external consumers are not affected
+ * by the change. If you make any changes to this list, be sure to update
+ * the property table in usr/src/common/zfs/zpool_prop.c.
+ */
+typedef enum {
+ ZPOOL_PROP_INVAL = -1,
+ ZPOOL_PROP_NAME,
+ ZPOOL_PROP_SIZE,
+ ZPOOL_PROP_CAPACITY,
+ ZPOOL_PROP_ALTROOT,
+ ZPOOL_PROP_HEALTH,
+ ZPOOL_PROP_GUID,
+ ZPOOL_PROP_VERSION,
+ ZPOOL_PROP_BOOTFS,
+ ZPOOL_PROP_DELEGATION,
+ ZPOOL_PROP_AUTOREPLACE,
+ ZPOOL_PROP_CACHEFILE,
+ ZPOOL_PROP_FAILUREMODE,
+ ZPOOL_PROP_LISTSNAPS,
+ ZPOOL_PROP_AUTOEXPAND,
+ ZPOOL_PROP_DEDUPDITTO,
+ ZPOOL_PROP_DEDUPRATIO,
+ ZPOOL_PROP_FREE,
+ ZPOOL_PROP_ALLOCATED,
+ ZPOOL_PROP_READONLY,
+ ZPOOL_PROP_COMMENT,
+ ZPOOL_PROP_EXPANDSZ,
+ ZPOOL_PROP_FREEING,
+ ZPOOL_PROP_FRAGMENTATION,
+ ZPOOL_PROP_LEAKED,
+ ZPOOL_PROP_MAXBLOCKSIZE,
+ ZPOOL_PROP_BOOTSIZE,
+ ZPOOL_PROP_CHECKPOINT,
+ ZPOOL_PROP_TNAME,
+ ZPOOL_PROP_MAXDNODESIZE,
+ ZPOOL_PROP_MULTIHOST,
+ ZPOOL_NUM_PROPS
+} zpool_prop_t;
+
+/* Small enough to not hog a whole line of printout in zpool(1M). */
+#define ZPROP_MAX_COMMENT 32
+
+#define ZPROP_VALUE "value"
+#define ZPROP_SOURCE "source"
+
+typedef enum {
+ ZPROP_SRC_NONE = 0x1,
+ ZPROP_SRC_DEFAULT = 0x2,
+ ZPROP_SRC_TEMPORARY = 0x4,
+ ZPROP_SRC_LOCAL = 0x8,
+ ZPROP_SRC_INHERITED = 0x10,
+ ZPROP_SRC_RECEIVED = 0x20
+} zprop_source_t;
+
+#define ZPROP_SRC_ALL 0x3f
+
+#define ZPROP_SOURCE_VAL_RECVD "$recvd"
+#define ZPROP_N_MORE_ERRORS "N_MORE_ERRORS"
+/*
+ * Dataset flag implemented as a special entry in the props zap object
+ * indicating that the dataset has received properties on or after
+ * SPA_VERSION_RECVD_PROPS. The first such receive blows away local properties
+ * just as it did in earlier versions, and thereafter, local properties are
+ * preserved.
+ */
+#define ZPROP_HAS_RECVD "$hasrecvd"
+
+typedef enum {
+ ZPROP_ERR_NOCLEAR = 0x1, /* failure to clear existing props */
+ ZPROP_ERR_NORESTORE = 0x2 /* failure to restore props on error */
+} zprop_errflags_t;
+
+typedef int (*zprop_func)(int, void *);
+
+/*
+ * Properties to be set on the root file system of a new pool
+ * are stuffed into their own nvlist, which is then included in
+ * the properties nvlist with the pool properties.
+ */
+#define ZPOOL_ROOTFS_PROPS "root-props-nvl"
+
+/*
+ * Length of 'written@' and 'written#'
+ */
+#define ZFS_WRITTEN_PROP_PREFIX_LEN 8
+
+/*
+ * Dataset property functions shared between libzfs and kernel.
+ */
+const char *zfs_prop_default_string(zfs_prop_t);
+uint64_t zfs_prop_default_numeric(zfs_prop_t);
+boolean_t zfs_prop_readonly(zfs_prop_t);
+boolean_t zfs_prop_visible(zfs_prop_t prop);
+boolean_t zfs_prop_inheritable(zfs_prop_t);
+boolean_t zfs_prop_setonce(zfs_prop_t);
+const char *zfs_prop_to_name(zfs_prop_t);
+zfs_prop_t zfs_name_to_prop(const char *);
+boolean_t zfs_prop_user(const char *);
+boolean_t zfs_prop_userquota(const char *);
+int zfs_prop_index_to_string(zfs_prop_t, uint64_t, const char **);
+int zfs_prop_string_to_index(zfs_prop_t, const char *, uint64_t *);
+uint64_t zfs_prop_random_value(zfs_prop_t, uint64_t seed);
+boolean_t zfs_prop_valid_for_type(int, zfs_type_t);
+
+/*
+ * Pool property functions shared between libzfs and kernel.
+ */
+zpool_prop_t zpool_name_to_prop(const char *);
+const char *zpool_prop_to_name(zpool_prop_t);
+const char *zpool_prop_default_string(zpool_prop_t);
+uint64_t zpool_prop_default_numeric(zpool_prop_t);
+boolean_t zpool_prop_readonly(zpool_prop_t);
+boolean_t zpool_prop_feature(const char *);
+boolean_t zpool_prop_unsupported(const char *name);
+int zpool_prop_index_to_string(zpool_prop_t, uint64_t, const char **);
+int zpool_prop_string_to_index(zpool_prop_t, const char *, uint64_t *);
+uint64_t zpool_prop_random_value(zpool_prop_t, uint64_t seed);
+
+/*
+ * Definitions for the Delegation.
+ */
+typedef enum {
+ ZFS_DELEG_WHO_UNKNOWN = 0,
+ ZFS_DELEG_USER = 'u',
+ ZFS_DELEG_USER_SETS = 'U',
+ ZFS_DELEG_GROUP = 'g',
+ ZFS_DELEG_GROUP_SETS = 'G',
+ ZFS_DELEG_EVERYONE = 'e',
+ ZFS_DELEG_EVERYONE_SETS = 'E',
+ ZFS_DELEG_CREATE = 'c',
+ ZFS_DELEG_CREATE_SETS = 'C',
+ ZFS_DELEG_NAMED_SET = 's',
+ ZFS_DELEG_NAMED_SET_SETS = 'S'
+} zfs_deleg_who_type_t;
+
+typedef enum {
+ ZFS_DELEG_NONE = 0,
+ ZFS_DELEG_PERM_LOCAL = 1,
+ ZFS_DELEG_PERM_DESCENDENT = 2,
+ ZFS_DELEG_PERM_LOCALDESCENDENT = 3,
+ ZFS_DELEG_PERM_CREATE = 4
+} zfs_deleg_inherit_t;
+
+#define ZFS_DELEG_PERM_UID "uid"
+#define ZFS_DELEG_PERM_GID "gid"
+#define ZFS_DELEG_PERM_GROUPS "groups"
+
+#define ZFS_MLSLABEL_DEFAULT "none"
+
+#define ZFS_SMB_ACL_SRC "src"
+#define ZFS_SMB_ACL_TARGET "target"
+
+typedef enum {
+ ZFS_CANMOUNT_OFF = 0,
+ ZFS_CANMOUNT_ON = 1,
+ ZFS_CANMOUNT_NOAUTO = 2
+} zfs_canmount_type_t;
+
+typedef enum {
+ ZFS_LOGBIAS_LATENCY = 0,
+ ZFS_LOGBIAS_THROUGHPUT = 1
+} zfs_logbias_op_t;
+
+typedef enum zfs_share_op {
+ ZFS_SHARE_NFS = 0,
+ ZFS_UNSHARE_NFS = 1,
+ ZFS_SHARE_SMB = 2,
+ ZFS_UNSHARE_SMB = 3
+} zfs_share_op_t;
+
+typedef enum zfs_smb_acl_op {
+ ZFS_SMB_ACL_ADD,
+ ZFS_SMB_ACL_REMOVE,
+ ZFS_SMB_ACL_RENAME,
+ ZFS_SMB_ACL_PURGE
+} zfs_smb_acl_op_t;
+
+typedef enum zfs_cache_type {
+ ZFS_CACHE_NONE = 0,
+ ZFS_CACHE_METADATA = 1,
+ ZFS_CACHE_ALL = 2
+} zfs_cache_type_t;
+
+typedef enum {
+ ZFS_SYNC_STANDARD = 0,
+ ZFS_SYNC_ALWAYS = 1,
+ ZFS_SYNC_DISABLED = 2
+} zfs_sync_type_t;
+
+typedef enum {
+ ZFS_VOLMODE_DEFAULT = 0,
+ ZFS_VOLMODE_GEOM = 1,
+ ZFS_VOLMODE_DEV = 2,
+ ZFS_VOLMODE_NONE = 3
+} zfs_volmode_t;
+
+typedef enum {
+ ZFS_DNSIZE_LEGACY = 0,
+ ZFS_DNSIZE_AUTO = 1,
+ ZFS_DNSIZE_1K = 1024,
+ ZFS_DNSIZE_2K = 2048,
+ ZFS_DNSIZE_4K = 4096,
+ ZFS_DNSIZE_8K = 8192,
+ ZFS_DNSIZE_16K = 16384
+} zfs_dnsize_type_t;
+
+typedef enum {
+ ZFS_REDUNDANT_METADATA_ALL,
+ ZFS_REDUNDANT_METADATA_MOST
+} zfs_redundant_metadata_type_t;
+
+/*
+ * On-disk version number.
+ */
+#define SPA_VERSION_1 1ULL
+#define SPA_VERSION_2 2ULL
+#define SPA_VERSION_3 3ULL
+#define SPA_VERSION_4 4ULL
+#define SPA_VERSION_5 5ULL
+#define SPA_VERSION_6 6ULL
+#define SPA_VERSION_7 7ULL
+#define SPA_VERSION_8 8ULL
+#define SPA_VERSION_9 9ULL
+#define SPA_VERSION_10 10ULL
+#define SPA_VERSION_11 11ULL
+#define SPA_VERSION_12 12ULL
+#define SPA_VERSION_13 13ULL
+#define SPA_VERSION_14 14ULL
+#define SPA_VERSION_15 15ULL
+#define SPA_VERSION_16 16ULL
+#define SPA_VERSION_17 17ULL
+#define SPA_VERSION_18 18ULL
+#define SPA_VERSION_19 19ULL
+#define SPA_VERSION_20 20ULL
+#define SPA_VERSION_21 21ULL
+#define SPA_VERSION_22 22ULL
+#define SPA_VERSION_23 23ULL
+#define SPA_VERSION_24 24ULL
+#define SPA_VERSION_25 25ULL
+#define SPA_VERSION_26 26ULL
+#define SPA_VERSION_27 27ULL
+#define SPA_VERSION_28 28ULL
+#define SPA_VERSION_5000 5000ULL
+
+/*
+ * When bumping up SPA_VERSION, make sure GRUB ZFS understands the on-disk
+ * format change. Go to usr/src/grub/grub-0.97/stage2/{zfs-include/, fsys_zfs*},
+ * and do the appropriate changes. Also bump the version number in
+ * usr/src/grub/capability.
+ */
+#define SPA_VERSION SPA_VERSION_5000
+#define SPA_VERSION_STRING "5000"
+
+/*
+ * Symbolic names for the changes that caused a SPA_VERSION switch.
+ * Used in the code when checking for presence or absence of a feature.
+ * Feel free to define multiple symbolic names for each version if there
+ * were multiple changes to on-disk structures during that version.
+ *
+ * NOTE: When checking the current SPA_VERSION in your code, be sure
+ * to use spa_version() since it reports the version of the
+ * last synced uberblock. Checking the in-flight version can
+ * be dangerous in some cases.
+ */
+#define SPA_VERSION_INITIAL SPA_VERSION_1
+#define SPA_VERSION_DITTO_BLOCKS SPA_VERSION_2
+#define SPA_VERSION_SPARES SPA_VERSION_3
+#define SPA_VERSION_RAIDZ2 SPA_VERSION_3
+#define SPA_VERSION_BPOBJ_ACCOUNT SPA_VERSION_3
+#define SPA_VERSION_RAIDZ_DEFLATE SPA_VERSION_3
+#define SPA_VERSION_DNODE_BYTES SPA_VERSION_3
+#define SPA_VERSION_ZPOOL_HISTORY SPA_VERSION_4
+#define SPA_VERSION_GZIP_COMPRESSION SPA_VERSION_5
+#define SPA_VERSION_BOOTFS SPA_VERSION_6
+#define SPA_VERSION_SLOGS SPA_VERSION_7
+#define SPA_VERSION_DELEGATED_PERMS SPA_VERSION_8
+#define SPA_VERSION_FUID SPA_VERSION_9
+#define SPA_VERSION_REFRESERVATION SPA_VERSION_9
+#define SPA_VERSION_REFQUOTA SPA_VERSION_9
+#define SPA_VERSION_UNIQUE_ACCURATE SPA_VERSION_9
+#define SPA_VERSION_L2CACHE SPA_VERSION_10
+#define SPA_VERSION_NEXT_CLONES SPA_VERSION_11
+#define SPA_VERSION_ORIGIN SPA_VERSION_11
+#define SPA_VERSION_DSL_SCRUB SPA_VERSION_11
+#define SPA_VERSION_SNAP_PROPS SPA_VERSION_12
+#define SPA_VERSION_USED_BREAKDOWN SPA_VERSION_13
+#define SPA_VERSION_PASSTHROUGH_X SPA_VERSION_14
+#define SPA_VERSION_USERSPACE SPA_VERSION_15
+#define SPA_VERSION_STMF_PROP SPA_VERSION_16
+#define SPA_VERSION_RAIDZ3 SPA_VERSION_17
+#define SPA_VERSION_USERREFS SPA_VERSION_18
+#define SPA_VERSION_HOLES SPA_VERSION_19
+#define SPA_VERSION_ZLE_COMPRESSION SPA_VERSION_20
+#define SPA_VERSION_DEDUP SPA_VERSION_21
+#define SPA_VERSION_RECVD_PROPS SPA_VERSION_22
+#define SPA_VERSION_SLIM_ZIL SPA_VERSION_23
+#define SPA_VERSION_SA SPA_VERSION_24
+#define SPA_VERSION_SCAN SPA_VERSION_25
+#define SPA_VERSION_DIR_CLONES SPA_VERSION_26
+#define SPA_VERSION_DEADLISTS SPA_VERSION_26
+#define SPA_VERSION_FAST_SNAP SPA_VERSION_27
+#define SPA_VERSION_MULTI_REPLACE SPA_VERSION_28
+#define SPA_VERSION_BEFORE_FEATURES SPA_VERSION_28
+#define SPA_VERSION_FEATURES SPA_VERSION_5000
+
+#define SPA_VERSION_IS_SUPPORTED(v) \
+ (((v) >= SPA_VERSION_INITIAL && (v) <= SPA_VERSION_BEFORE_FEATURES) || \
+ ((v) >= SPA_VERSION_FEATURES && (v) <= SPA_VERSION))
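+
+/*
+ * Illustrative consequence (a sketch, not part of the original header):
+ * SPA_VERSION_IS_SUPPORTED(28) and SPA_VERSION_IS_SUPPORTED(5000) both
+ * hold, while a version such as 29, falling in the gap between
+ * SPA_VERSION_BEFORE_FEATURES and SPA_VERSION_FEATURES, is rejected.
+ */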
+
+/*
+ * ZPL version - rev'd whenever an incompatible on-disk format change
+ * occurs. This is independent of SPA/DMU/ZAP versioning. You must
+ * also update the version_table[] and help message in zfs_prop.c.
+ *
+ * When changing, be sure to teach GRUB how to read the new format!
+ * See usr/src/grub/grub-0.97/stage2/{zfs-include/,fsys_zfs*}
+ */
+#define ZPL_VERSION_1 1ULL
+#define ZPL_VERSION_2 2ULL
+#define ZPL_VERSION_3 3ULL
+#define ZPL_VERSION_4 4ULL
+#define ZPL_VERSION_5 5ULL
+#define ZPL_VERSION ZPL_VERSION_5
+#define ZPL_VERSION_STRING "5"
+
+#define ZPL_VERSION_INITIAL ZPL_VERSION_1
+#define ZPL_VERSION_DIRENT_TYPE ZPL_VERSION_2
+#define ZPL_VERSION_FUID ZPL_VERSION_3
+#define ZPL_VERSION_NORMALIZATION ZPL_VERSION_3
+#define ZPL_VERSION_SYSATTR ZPL_VERSION_3
+#define ZPL_VERSION_USERSPACE ZPL_VERSION_4
+#define ZPL_VERSION_SA ZPL_VERSION_5
+
+/* Rewind policy information */
+#define ZPOOL_NO_REWIND 1 /* No policy - default behavior */
+#define ZPOOL_NEVER_REWIND 2 /* Do not search for best txg or rewind */
+#define ZPOOL_TRY_REWIND 4 /* Search for best txg, but do not rewind */
+#define ZPOOL_DO_REWIND 8 /* Rewind to best txg w/in deferred frees */
+#define ZPOOL_EXTREME_REWIND 16 /* Allow extreme measures to find best txg */
+#define ZPOOL_REWIND_MASK 28 /* All the possible rewind bits */
+#define ZPOOL_REWIND_POLICIES 31 /* All the possible policy bits */
+
+typedef struct zpool_load_policy {
+ uint32_t zlp_rewind; /* rewind policy requested */
+ uint64_t zlp_maxmeta; /* max acceptable meta-data errors */
+ uint64_t zlp_maxdata; /* max acceptable data errors */
+ uint64_t zlp_txg; /* specific txg to load */
+} zpool_load_policy_t;
+
+/*
+ * The following are configuration names used in the nvlist describing a pool's
+ * configuration. New on-disk names should be prefixed with "<reverse-DNS>:"
+ * (e.g. "org.open-zfs:") to avoid conflicting names being developed
+ * independently.
+ */
+#define ZPOOL_CONFIG_VERSION "version"
+#define ZPOOL_CONFIG_POOL_NAME "name"
+#define ZPOOL_CONFIG_POOL_STATE "state"
+#define ZPOOL_CONFIG_POOL_TXG "txg"
+#define ZPOOL_CONFIG_POOL_GUID "pool_guid"
+#define ZPOOL_CONFIG_CREATE_TXG "create_txg"
+#define ZPOOL_CONFIG_TOP_GUID "top_guid"
+#define ZPOOL_CONFIG_VDEV_TREE "vdev_tree"
+#define ZPOOL_CONFIG_TYPE "type"
+#define ZPOOL_CONFIG_CHILDREN "children"
+#define ZPOOL_CONFIG_ID "id"
+#define ZPOOL_CONFIG_GUID "guid"
+#define ZPOOL_CONFIG_INDIRECT_OBJECT "com.delphix:indirect_object"
+#define ZPOOL_CONFIG_INDIRECT_BIRTHS "com.delphix:indirect_births"
+#define ZPOOL_CONFIG_PREV_INDIRECT_VDEV "com.delphix:prev_indirect_vdev"
+#define ZPOOL_CONFIG_PATH "path"
+#define ZPOOL_CONFIG_DEVID "devid"
+#define ZPOOL_CONFIG_METASLAB_ARRAY "metaslab_array"
+#define ZPOOL_CONFIG_METASLAB_SHIFT "metaslab_shift"
+#define ZPOOL_CONFIG_ASHIFT "ashift"
+#define ZPOOL_CONFIG_ASIZE "asize"
+#define ZPOOL_CONFIG_DTL "DTL"
+#define ZPOOL_CONFIG_SCAN_STATS "scan_stats" /* not stored on disk */
+#define ZPOOL_CONFIG_REMOVAL_STATS "removal_stats" /* not stored on disk */
+#define ZPOOL_CONFIG_CHECKPOINT_STATS "checkpoint_stats" /* not on disk */
+#define ZPOOL_CONFIG_VDEV_STATS "vdev_stats" /* not stored on disk */
+#define ZPOOL_CONFIG_INDIRECT_SIZE "indirect_size" /* not stored on disk */
+#define ZPOOL_CONFIG_WHOLE_DISK "whole_disk"
+#define ZPOOL_CONFIG_ERRCOUNT "error_count"
+#define ZPOOL_CONFIG_NOT_PRESENT "not_present"
+#define ZPOOL_CONFIG_SPARES "spares"
+#define ZPOOL_CONFIG_IS_SPARE "is_spare"
+#define ZPOOL_CONFIG_NPARITY "nparity"
+#define ZPOOL_CONFIG_HOSTID "hostid"
+#define ZPOOL_CONFIG_HOSTNAME "hostname"
+#define ZPOOL_CONFIG_LOADED_TIME "initial_load_time"
+#define ZPOOL_CONFIG_UNSPARE "unspare"
+#define ZPOOL_CONFIG_PHYS_PATH "phys_path"
+#define ZPOOL_CONFIG_IS_LOG "is_log"
+#define ZPOOL_CONFIG_L2CACHE "l2cache"
+#define ZPOOL_CONFIG_HOLE_ARRAY "hole_array"
+#define ZPOOL_CONFIG_VDEV_CHILDREN "vdev_children"
+#define ZPOOL_CONFIG_IS_HOLE "is_hole"
+#define ZPOOL_CONFIG_DDT_HISTOGRAM "ddt_histogram"
+#define ZPOOL_CONFIG_DDT_OBJ_STATS "ddt_object_stats"
+#define ZPOOL_CONFIG_DDT_STATS "ddt_stats"
+#define ZPOOL_CONFIG_SPLIT "splitcfg"
+#define ZPOOL_CONFIG_ORIG_GUID "orig_guid"
+#define ZPOOL_CONFIG_SPLIT_GUID "split_guid"
+#define ZPOOL_CONFIG_SPLIT_LIST "guid_list"
+#define ZPOOL_CONFIG_REMOVING "removing"
+#define ZPOOL_CONFIG_RESILVER_TXG "resilver_txg"
+#define ZPOOL_CONFIG_COMMENT "comment"
+#define ZPOOL_CONFIG_SUSPENDED "suspended" /* not stored on disk */
+#define ZPOOL_CONFIG_SUSPENDED_REASON "suspended_reason" /* not stored */
+#define ZPOOL_CONFIG_TIMESTAMP "timestamp" /* not stored on disk */
+#define ZPOOL_CONFIG_BOOTFS "bootfs" /* not stored on disk */
+#define ZPOOL_CONFIG_MISSING_DEVICES "missing_vdevs" /* not stored on disk */
+#define ZPOOL_CONFIG_LOAD_INFO "load_info" /* not stored on disk */
+#define ZPOOL_CONFIG_REWIND_INFO "rewind_info" /* not stored on disk */
+#define ZPOOL_CONFIG_UNSUP_FEAT "unsup_feat" /* not stored on disk */
+#define ZPOOL_CONFIG_ENABLED_FEAT "enabled_feat" /* not stored on disk */
+#define ZPOOL_CONFIG_CAN_RDONLY "can_rdonly" /* not stored on disk */
+#define ZPOOL_CONFIG_FEATURES_FOR_READ "features_for_read"
+#define ZPOOL_CONFIG_FEATURE_STATS "feature_stats" /* not stored on disk */
+#define ZPOOL_CONFIG_VDEV_TOP_ZAP "com.delphix:vdev_zap_top"
+#define ZPOOL_CONFIG_VDEV_LEAF_ZAP "com.delphix:vdev_zap_leaf"
+#define ZPOOL_CONFIG_HAS_PER_VDEV_ZAPS "com.delphix:has_per_vdev_zaps"
+#define ZPOOL_CONFIG_CACHEFILE "cachefile" /* not stored on disk */
+#define ZPOOL_CONFIG_MMP_STATE "mmp_state" /* not stored on disk */
+#define ZPOOL_CONFIG_MMP_TXG "mmp_txg" /* not stored on disk */
+#define ZPOOL_CONFIG_MMP_SEQ "mmp_seq" /* not stored on disk */
+#define ZPOOL_CONFIG_MMP_HOSTNAME "mmp_hostname" /* not stored on disk */
+#define ZPOOL_CONFIG_MMP_HOSTID "mmp_hostid" /* not stored on disk */
+#define ZPOOL_CONFIG_ALLOCATION_BIAS "alloc_bias" /* not stored on disk */
+
+/*
+ * The persistent vdev state is stored as separate values rather than a single
+ * 'vdev_state' entry. This is because a device can be in multiple states, such
+ * as offline and degraded.
+ */
+#define ZPOOL_CONFIG_OFFLINE "offline"
+#define ZPOOL_CONFIG_FAULTED "faulted"
+#define ZPOOL_CONFIG_DEGRADED "degraded"
+#define ZPOOL_CONFIG_REMOVED "removed"
+#define ZPOOL_CONFIG_FRU "fru"
+#define ZPOOL_CONFIG_AUX_STATE "aux_state"
+
+/* Pool load policy parameters */
+#define ZPOOL_LOAD_POLICY "load-policy"
+#define ZPOOL_LOAD_REWIND_POLICY "load-rewind-policy"
+#define ZPOOL_LOAD_REQUEST_TXG "load-request-txg"
+#define ZPOOL_LOAD_META_THRESH "load-meta-thresh"
+#define ZPOOL_LOAD_DATA_THRESH "load-data-thresh"
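+
+/*
+ * Illustrative sketch (not part of the original header): an import that
+ * wants aggressive recovery could attach a policy nvlist under
+ * ZPOOL_LOAD_POLICY, built roughly like this (error handling elided):
+ *
+ *	nvlist_t *policy;
+ *	VERIFY0(nvlist_alloc(&policy, NV_UNIQUE_NAME, 0));
+ *	VERIFY0(nvlist_add_uint32(policy, ZPOOL_LOAD_REWIND_POLICY,
+ *	    ZPOOL_DO_REWIND | ZPOOL_EXTREME_REWIND));
+ *	VERIFY0(nvlist_add_uint64(policy, ZPOOL_LOAD_REQUEST_TXG, txg));
+ */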
+
+/* Rewind data discovered */
+#define ZPOOL_CONFIG_LOAD_TIME "rewind_txg_ts"
+#define ZPOOL_CONFIG_LOAD_DATA_ERRORS "verify_data_errors"
+#define ZPOOL_CONFIG_REWIND_TIME "seconds_of_rewind"
+
+#define VDEV_TYPE_ROOT "root"
+#define VDEV_TYPE_MIRROR "mirror"
+#define VDEV_TYPE_REPLACING "replacing"
+#define VDEV_TYPE_RAIDZ "raidz"
+#define VDEV_TYPE_DISK "disk"
+#define VDEV_TYPE_FILE "file"
+#define VDEV_TYPE_MISSING "missing"
+#define VDEV_TYPE_HOLE "hole"
+#define VDEV_TYPE_SPARE "spare"
+#define VDEV_TYPE_LOG "log"
+#define VDEV_TYPE_L2CACHE "l2cache"
+#define VDEV_TYPE_INDIRECT "indirect"
+
+/* VDEV_TOP_ZAP_* are used in top-level vdev ZAP objects. */
+#define VDEV_TOP_ZAP_INDIRECT_OBSOLETE_SM \
+ "com.delphix:indirect_obsolete_sm"
+#define VDEV_TOP_ZAP_OBSOLETE_COUNTS_ARE_PRECISE \
+ "com.delphix:obsolete_counts_are_precise"
+#define VDEV_TOP_ZAP_POOL_CHECKPOINT_SM \
+ "com.delphix:pool_checkpoint_sm"
+
+#define VDEV_TOP_ZAP_ALLOCATION_BIAS \
+ "org.zfsonlinux:allocation_bias"
+
+/* vdev metaslab allocation bias */
+#define VDEV_ALLOC_BIAS_LOG "log"
+#define VDEV_ALLOC_BIAS_SPECIAL "special"
+#define VDEV_ALLOC_BIAS_DEDUP "dedup"
+
+#define VDEV_LEAF_ZAP_INITIALIZE_LAST_OFFSET \
+ "com.delphix:next_offset_to_initialize"
+#define VDEV_LEAF_ZAP_INITIALIZE_STATE \
+ "com.delphix:vdev_initialize_state"
+#define VDEV_LEAF_ZAP_INITIALIZE_ACTION_TIME \
+ "com.delphix:vdev_initialize_action_time"
+
+/*
+ * This is needed in userland to report the minimum necessary device size.
+ *
+ * Note that the zfs test suite uses 64MB vdevs.
+ */
+#define SPA_MINDEVSIZE (64ULL << 20)
+
+/*
+ * Set if the fragmentation has not yet been calculated. This can happen
+ * because the space maps have not been upgraded or the histogram feature
+ * is not enabled.
+ */
+#define ZFS_FRAG_INVALID UINT64_MAX
+
+/*
+ * The location of the pool configuration repository, shared between kernel and
+ * userland.
+ */
+#define ZPOOL_CACHE "/boot/zfs/zpool.cache"
+
+/*
+ * vdev states are ordered from least to most healthy.
+ * A vdev that's CANT_OPEN or below is considered unusable.
+ */
+typedef enum vdev_state {
+ VDEV_STATE_UNKNOWN = 0, /* Uninitialized vdev */
+ VDEV_STATE_CLOSED, /* Not currently open */
+ VDEV_STATE_OFFLINE, /* Not allowed to open */
+ VDEV_STATE_REMOVED, /* Explicitly removed from system */
+ VDEV_STATE_CANT_OPEN, /* Tried to open, but failed */
+ VDEV_STATE_FAULTED, /* External request to fault device */
+ VDEV_STATE_DEGRADED, /* Replicated vdev with unhealthy kids */
+ VDEV_STATE_HEALTHY /* Presumed good */
+} vdev_state_t;
+
+#define VDEV_STATE_ONLINE VDEV_STATE_HEALTHY
+
+/*
+ * vdev aux states. When a vdev is in the CANT_OPEN state, the aux field
+ * of the vdev stats structure uses these constants to distinguish why.
+ */
+typedef enum vdev_aux {
+ VDEV_AUX_NONE, /* no error */
+ VDEV_AUX_OPEN_FAILED, /* ldi_open_*() or vn_open() failed */
+ VDEV_AUX_CORRUPT_DATA, /* bad label or disk contents */
+ VDEV_AUX_NO_REPLICAS, /* insufficient number of replicas */
+ VDEV_AUX_BAD_GUID_SUM, /* vdev guid sum doesn't match */
+ VDEV_AUX_TOO_SMALL, /* vdev size is too small */
+ VDEV_AUX_BAD_LABEL, /* the label is OK but invalid */
+ VDEV_AUX_VERSION_NEWER, /* on-disk version is too new */
+ VDEV_AUX_VERSION_OLDER, /* on-disk version is too old */
+ VDEV_AUX_UNSUP_FEAT, /* unsupported features */
+ VDEV_AUX_SPARED, /* hot spare used in another pool */
+ VDEV_AUX_ERR_EXCEEDED, /* too many errors */
+ VDEV_AUX_IO_FAILURE, /* experienced I/O failure */
+ VDEV_AUX_BAD_LOG, /* cannot read log chain(s) */
+ VDEV_AUX_EXTERNAL, /* external diagnosis */
+ VDEV_AUX_SPLIT_POOL, /* vdev was split off into another pool */
+ VDEV_AUX_ASHIFT_TOO_BIG, /* vdev's min block size is too large */
+ VDEV_AUX_CHILDREN_OFFLINE, /* all children are offline */
+ VDEV_AUX_ACTIVE /* vdev active on a different host */
+} vdev_aux_t;
+
+/*
+ * pool state. The following states are written to disk as part of the normal
+ * SPA lifecycle: ACTIVE, EXPORTED, DESTROYED, SPARE, L2CACHE. The remaining
+ * states are software abstractions used at various levels to communicate
+ * pool state.
+ */
+typedef enum pool_state {
+ POOL_STATE_ACTIVE = 0, /* In active use */
+ POOL_STATE_EXPORTED, /* Explicitly exported */
+ POOL_STATE_DESTROYED, /* Explicitly destroyed */
+ POOL_STATE_SPARE, /* Reserved for hot spare use */
+ POOL_STATE_L2CACHE, /* Level 2 ARC device */
+ POOL_STATE_UNINITIALIZED, /* Internal spa_t state */
+ POOL_STATE_UNAVAIL, /* Internal libzfs state */
+ POOL_STATE_POTENTIALLY_ACTIVE /* Internal libzfs state */
+} pool_state_t;
+
+/*
+ * mmp state. The following states provide additional detail describing
+ * why a pool couldn't be safely imported.
+ */
+typedef enum mmp_state {
+ MMP_STATE_ACTIVE = 0, /* In active use */
+ MMP_STATE_INACTIVE, /* Inactive and safe to import */
+ MMP_STATE_NO_HOSTID /* System hostid is not set */
+} mmp_state_t;
+
+/*
+ * Scan Functions.
+ */
+typedef enum pool_scan_func {
+ POOL_SCAN_NONE,
+ POOL_SCAN_SCRUB,
+ POOL_SCAN_RESILVER,
+ POOL_SCAN_FUNCS
+} pool_scan_func_t;
+
+/*
+ * Used to control scrub pause and resume.
+ */
+typedef enum pool_scrub_cmd {
+ POOL_SCRUB_NORMAL = 0,
+ POOL_SCRUB_PAUSE,
+ POOL_SCRUB_FLAGS_END
+} pool_scrub_cmd_t;
+
+/*
+ * Initialize functions.
+ */
+typedef enum pool_initialize_func {
+ POOL_INITIALIZE_DO,
+ POOL_INITIALIZE_CANCEL,
+ POOL_INITIALIZE_SUSPEND,
+ POOL_INITIALIZE_FUNCS
+} pool_initialize_func_t;
+
+/*
+ * ZIO types. Needed to interpret vdev statistics below.
+ */
+typedef enum zio_type {
+ ZIO_TYPE_NULL = 0,
+ ZIO_TYPE_READ,
+ ZIO_TYPE_WRITE,
+ ZIO_TYPE_FREE,
+ ZIO_TYPE_CLAIM,
+ ZIO_TYPE_IOCTL,
+ ZIO_TYPES
+} zio_type_t;
+
+/*
+ * Pool statistics. Note: all fields should be 64-bit because this
+ * is passed between kernel and userland as an nvlist uint64 array.
+ */
+typedef struct pool_scan_stat {
+ /* values stored on disk */
+ uint64_t pss_func; /* pool_scan_func_t */
+ uint64_t pss_state; /* dsl_scan_state_t */
+ uint64_t pss_start_time; /* scan start time */
+ uint64_t pss_end_time; /* scan end time */
+ uint64_t pss_to_examine; /* total bytes to scan */
+ uint64_t pss_examined; /* total bytes located by scanner */
+ uint64_t pss_to_process; /* total bytes to process */
+ uint64_t pss_processed; /* total processed bytes */
+ uint64_t pss_errors; /* scan errors */
+
+ /* values not stored on disk */
+ uint64_t pss_pass_exam; /* examined bytes per scan pass */
+ uint64_t pss_pass_start; /* start time of a scan pass */
+ uint64_t pss_pass_scrub_pause; /* pause time of a scrub pass */
+ /* cumulative time scrub spent paused, needed for rate calculation */
+ uint64_t pss_pass_scrub_spent_paused;
+
+ /* Sorted scrubbing new fields */
+ /* Stored on disk */
+ uint64_t pss_issued; /* total bytes checked by scanner */
+ /* Not stored on disk */
+ uint64_t pss_pass_issued; /* issued bytes per scan pass */
+} pool_scan_stat_t;
+
+typedef struct pool_removal_stat {
+ uint64_t prs_state; /* dsl_scan_state_t */
+ uint64_t prs_removing_vdev;
+ uint64_t prs_start_time;
+ uint64_t prs_end_time;
+ uint64_t prs_to_copy; /* bytes that need to be copied */
+ uint64_t prs_copied; /* bytes copied so far */
+ /*
+ * bytes of memory used for indirect mappings.
+ * This includes all removed vdevs.
+ */
+ uint64_t prs_mapping_memory;
+} pool_removal_stat_t;
+
+typedef enum dsl_scan_state {
+ DSS_NONE,
+ DSS_SCANNING,
+ DSS_FINISHED,
+ DSS_CANCELED,
+ DSS_NUM_STATES
+} dsl_scan_state_t;
+
+typedef enum {
+ CS_NONE,
+ CS_CHECKPOINT_EXISTS,
+ CS_CHECKPOINT_DISCARDING,
+ CS_NUM_STATES
+} checkpoint_state_t;
+
+typedef struct pool_checkpoint_stat {
+ uint64_t pcs_state; /* checkpoint_state_t */
+ uint64_t pcs_start_time; /* time checkpoint/discard started */
+ uint64_t pcs_space; /* checkpointed space */
+} pool_checkpoint_stat_t;
+
+typedef enum {
+ VDEV_INITIALIZE_NONE,
+ VDEV_INITIALIZE_ACTIVE,
+ VDEV_INITIALIZE_CANCELED,
+ VDEV_INITIALIZE_SUSPENDED,
+ VDEV_INITIALIZE_COMPLETE
+} vdev_initializing_state_t;
+
+/*
+ * Vdev statistics. Note: all fields should be 64-bit because this
+ * is passed between kernel and userland as an nvlist uint64 array.
+ */
+typedef struct vdev_stat {
+ hrtime_t vs_timestamp; /* time since vdev load */
+ uint64_t vs_state; /* vdev state */
+ uint64_t vs_aux; /* see vdev_aux_t */
+ uint64_t vs_alloc; /* space allocated */
+ uint64_t vs_space; /* total capacity */
+ uint64_t vs_dspace; /* deflated capacity */
+ uint64_t vs_rsize; /* replaceable dev size */
+ uint64_t vs_esize; /* expandable dev size */
+ uint64_t vs_ops[ZIO_TYPES]; /* operation count */
+ uint64_t vs_bytes[ZIO_TYPES]; /* bytes read/written */
+ uint64_t vs_read_errors; /* read errors */
+ uint64_t vs_write_errors; /* write errors */
+ uint64_t vs_checksum_errors; /* checksum errors */
+ uint64_t vs_self_healed; /* self-healed bytes */
+ uint64_t vs_scan_removing; /* removing? */
+ uint64_t vs_scan_processed; /* scan processed bytes */
+ uint64_t vs_configured_ashift; /* top-level vdev ashift */
+ uint64_t vs_logical_ashift; /* vdev_logical_ashift */
+ uint64_t vs_physical_ashift; /* vdev_physical_ashift */
+ uint64_t vs_fragmentation; /* device fragmentation */
+ uint64_t vs_checkpoint_space; /* checkpoint-consumed space */
+ uint64_t vs_initialize_errors; /* initializing errors */
+ uint64_t vs_initialize_bytes_done; /* bytes initialized */
+ uint64_t vs_initialize_bytes_est; /* total bytes to initialize */
+ uint64_t vs_initialize_state; /* vdev_initializing_state_t */
+ uint64_t vs_initialize_action_time; /* time_t */
+} vdev_stat_t;
+#define VDEV_STAT_VALID(field, uint64_t_field_count) \
+ ((uint64_t_field_count * sizeof(uint64_t)) >= \
+ (offsetof(vdev_stat_t, field) + sizeof(((vdev_stat_t *)NULL)->field)))
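+
+/*
+ * Usage sketch (illustrative, not part of the original header): a consumer
+ * handed "nc" uint64s (nc is a hypothetical count from an older peer) can
+ * check that a field is covered by the payload before dereferencing it:
+ *
+ *	if (VDEV_STAT_VALID(vs_fragmentation, nc))
+ *		frag = vs->vs_fragmentation;
+ */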
+
+/*
+ * DDT statistics. Note: all fields should be 64-bit because this
+ * is passed between kernel and userland as an nvlist uint64 array.
+ */
+typedef struct ddt_object {
+ uint64_t ddo_count; /* number of elements in ddt */
+ uint64_t ddo_dspace; /* size of ddt on disk */
+ uint64_t ddo_mspace; /* size of ddt in-core */
+} ddt_object_t;
+
+typedef struct ddt_stat {
+ uint64_t dds_blocks; /* blocks */
+ uint64_t dds_lsize; /* logical size */
+ uint64_t dds_psize; /* physical size */
+ uint64_t dds_dsize; /* deflated allocated size */
+ uint64_t dds_ref_blocks; /* referenced blocks */
+ uint64_t dds_ref_lsize; /* referenced lsize * refcnt */
+ uint64_t dds_ref_psize; /* referenced psize * refcnt */
+ uint64_t dds_ref_dsize; /* referenced dsize * refcnt */
+} ddt_stat_t;
+
+typedef struct ddt_histogram {
+ ddt_stat_t ddh_stat[64]; /* power-of-two histogram buckets */
+} ddt_histogram_t;
+
+#define ZVOL_DRIVER "zvol"
+#define ZFS_DRIVER "zfs"
+#define ZFS_DEV_NAME "zfs"
+#define ZFS_DEV "/dev/" ZFS_DEV_NAME
+#define ZFS_DISK_ROOT "/dev/dsk"
+#define ZFS_DISK_ROOTD ZFS_DISK_ROOT "/"
+#define ZFS_RDISK_ROOT "/dev/rdsk"
+#define ZFS_RDISK_ROOTD ZFS_RDISK_ROOT "/"
+
+/* general zvol path */
+#define ZVOL_DIR "/dev/zvol"
+/* expansion */
+#define ZVOL_PSEUDO_DEV "/devices/pseudo/zfs@0:"
+/* for dump and swap */
+#define ZVOL_FULL_DEV_DIR ZVOL_DIR "/dsk/"
+#define ZVOL_FULL_RDEV_DIR ZVOL_DIR "/rdsk/"
+
+#define ZVOL_PROP_NAME "name"
+#define ZVOL_DEFAULT_BLOCKSIZE 8192
+
+/*
+ * /dev/zfs ioctl numbers.
+ *
+ * These numbers cannot change over time. New ioctl numbers must be appended.
+ */
+typedef enum zfs_ioc {
+ /*
+ * Core features - 81/128 numbers reserved.
+ */
+#ifdef __FreeBSD__
+ ZFS_IOC_FIRST = 0,
+#else
+ ZFS_IOC_FIRST = ('Z' << 8),
+#endif
+ ZFS_IOC = ZFS_IOC_FIRST,
+ ZFS_IOC_POOL_CREATE = ZFS_IOC_FIRST,
+ ZFS_IOC_POOL_DESTROY,
+ ZFS_IOC_POOL_IMPORT,
+ ZFS_IOC_POOL_EXPORT,
+ ZFS_IOC_POOL_CONFIGS,
+ ZFS_IOC_POOL_STATS,
+ ZFS_IOC_POOL_TRYIMPORT,
+ ZFS_IOC_POOL_SCAN,
+ ZFS_IOC_POOL_FREEZE,
+ ZFS_IOC_POOL_UPGRADE,
+ ZFS_IOC_POOL_GET_HISTORY,
+ ZFS_IOC_VDEV_ADD,
+ ZFS_IOC_VDEV_REMOVE,
+ ZFS_IOC_VDEV_SET_STATE,
+ ZFS_IOC_VDEV_ATTACH,
+ ZFS_IOC_VDEV_DETACH,
+ ZFS_IOC_VDEV_SETPATH,
+ ZFS_IOC_VDEV_SETFRU,
+ ZFS_IOC_OBJSET_STATS,
+ ZFS_IOC_OBJSET_ZPLPROPS,
+ ZFS_IOC_DATASET_LIST_NEXT,
+ ZFS_IOC_SNAPSHOT_LIST_NEXT,
+ ZFS_IOC_SET_PROP,
+ ZFS_IOC_CREATE,
+ ZFS_IOC_DESTROY,
+ ZFS_IOC_ROLLBACK,
+ ZFS_IOC_RENAME,
+ ZFS_IOC_RECV,
+ ZFS_IOC_SEND,
+ ZFS_IOC_INJECT_FAULT,
+ ZFS_IOC_CLEAR_FAULT,
+ ZFS_IOC_INJECT_LIST_NEXT,
+ ZFS_IOC_ERROR_LOG,
+ ZFS_IOC_CLEAR,
+ ZFS_IOC_PROMOTE,
+ ZFS_IOC_DESTROY_SNAPS,
+ ZFS_IOC_SNAPSHOT,
+ ZFS_IOC_DSOBJ_TO_DSNAME,
+ ZFS_IOC_OBJ_TO_PATH,
+ ZFS_IOC_POOL_SET_PROPS,
+ ZFS_IOC_POOL_GET_PROPS,
+ ZFS_IOC_SET_FSACL,
+ ZFS_IOC_GET_FSACL,
+ ZFS_IOC_SHARE,
+ ZFS_IOC_INHERIT_PROP,
+ ZFS_IOC_SMB_ACL,
+ ZFS_IOC_USERSPACE_ONE,
+ ZFS_IOC_USERSPACE_MANY,
+ ZFS_IOC_USERSPACE_UPGRADE,
+ ZFS_IOC_HOLD,
+ ZFS_IOC_RELEASE,
+ ZFS_IOC_GET_HOLDS,
+ ZFS_IOC_OBJSET_RECVD_PROPS,
+ ZFS_IOC_VDEV_SPLIT,
+ ZFS_IOC_NEXT_OBJ,
+ ZFS_IOC_DIFF,
+ ZFS_IOC_TMP_SNAPSHOT,
+ ZFS_IOC_OBJ_TO_STATS,
+ ZFS_IOC_JAIL,
+ ZFS_IOC_UNJAIL,
+ ZFS_IOC_POOL_REGUID,
+ ZFS_IOC_SPACE_WRITTEN,
+ ZFS_IOC_SPACE_SNAPS,
+ ZFS_IOC_SEND_PROGRESS,
+ ZFS_IOC_POOL_REOPEN,
+ ZFS_IOC_LOG_HISTORY,
+ ZFS_IOC_SEND_NEW,
+ ZFS_IOC_SEND_SPACE,
+ ZFS_IOC_CLONE,
+ ZFS_IOC_BOOKMARK,
+ ZFS_IOC_GET_BOOKMARKS,
+ ZFS_IOC_DESTROY_BOOKMARKS,
+#ifdef __FreeBSD__
+ ZFS_IOC_NEXTBOOT,
+#endif
+ ZFS_IOC_CHANNEL_PROGRAM,
+ ZFS_IOC_REMAP,
+ ZFS_IOC_POOL_CHECKPOINT,
+ ZFS_IOC_POOL_DISCARD_CHECKPOINT,
+ ZFS_IOC_POOL_INITIALIZE,
+ ZFS_IOC_POOL_SYNC,
+ ZFS_IOC_SET_BOOTENV,
+ ZFS_IOC_GET_BOOTENV,
+ ZFS_IOC_LAST
+} zfs_ioc_t;
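+
+/*
+ * Numbering sketch (illustrative): with the illumos base, ZFS_IOC_POOL_DESTROY
+ * evaluates to ('Z' << 8) + 1 == 0x5a01; with ZFS_IOC_FIRST == 0 on FreeBSD
+ * the same command is plain index 1. Callers should therefore always use the
+ * enum names, never hard-coded numbers.
+ */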
+
+/*
+ * ZFS-specific error codes used for returning descriptive errors
+ * to the userland through zfs ioctls.
+ *
+ * The enum implicitly includes all the error codes from errno.h.
+ * New code should use and extend this enum for errors that are
+ * not described precisely by generic errno codes.
+ *
+ * These numbers should not change over time. New entries should be appended.
+ */
+typedef enum {
+ ZFS_ERR_CHECKPOINT_EXISTS = 1024,
+ ZFS_ERR_DISCARDING_CHECKPOINT,
+ ZFS_ERR_NO_CHECKPOINT,
+ ZFS_ERR_DEVRM_IN_PROGRESS,
+ ZFS_ERR_VDEV_TOO_BIG,
+ ZFS_ERR_IOC_CMD_UNAVAIL,
+ ZFS_ERR_IOC_ARG_UNAVAIL,
+ ZFS_ERR_IOC_ARG_REQUIRED,
+ ZFS_ERR_IOC_ARG_BADTYPE,
+ ZFS_ERR_WRONG_PARENT,
+} zfs_errno_t;
+
+/*
+ * Internal SPA load state. Used by FMA diagnosis engine.
+ */
+typedef enum {
+ SPA_LOAD_NONE, /* no load in progress */
+ SPA_LOAD_OPEN, /* normal open */
+ SPA_LOAD_IMPORT, /* import in progress */
+ SPA_LOAD_TRYIMPORT, /* tryimport in progress */
+ SPA_LOAD_RECOVER, /* recovery requested */
+ SPA_LOAD_ERROR, /* load failed */
+ SPA_LOAD_CREATE /* creation in progress */
+} spa_load_state_t;
+
+/*
+ * Bookmark name values.
+ */
+#define ZPOOL_ERR_LIST "error list"
+#define ZPOOL_ERR_DATASET "dataset"
+#define ZPOOL_ERR_OBJECT "object"
+
+#define HIS_MAX_RECORD_LEN (MAXPATHLEN + MAXPATHLEN + 1)
+
+/*
+ * The following are names used in the nvlist describing
+ * the pool's history log.
+ */
+#define ZPOOL_HIST_RECORD "history record"
+#define ZPOOL_HIST_TIME "history time"
+#define ZPOOL_HIST_CMD "history command"
+#define ZPOOL_HIST_WHO "history who"
+#define ZPOOL_HIST_ZONE "history zone"
+#define ZPOOL_HIST_HOST "history hostname"
+#define ZPOOL_HIST_TXG "history txg"
+#define ZPOOL_HIST_INT_EVENT "history internal event"
+#define ZPOOL_HIST_INT_STR "history internal str"
+#define ZPOOL_HIST_INT_NAME "internal_name"
+#define ZPOOL_HIST_IOCTL "ioctl"
+#define ZPOOL_HIST_INPUT_NVL "in_nvl"
+#define ZPOOL_HIST_OUTPUT_NVL "out_nvl"
+#define ZPOOL_HIST_DSNAME "dsname"
+#define ZPOOL_HIST_DSID "dsid"
+#define ZPOOL_HIST_ERRNO "errno"
+
+/*
+ * The following are names used when invoking ZFS_IOC_POOL_INITIALIZE.
+ */
+#define ZPOOL_INITIALIZE_COMMAND "initialize_command"
+#define ZPOOL_INITIALIZE_VDEVS "initialize_vdevs"
+
+/*
+ * Flags for ZFS_IOC_VDEV_SET_STATE
+ */
+#define ZFS_ONLINE_CHECKREMOVE 0x1
+#define ZFS_ONLINE_UNSPARE 0x2
+#define ZFS_ONLINE_FORCEFAULT 0x4
+#define ZFS_ONLINE_EXPAND 0x8
+#define ZFS_OFFLINE_TEMPORARY 0x1
+
+/*
+ * Flags for ZFS_IOC_POOL_IMPORT
+ */
+#define ZFS_IMPORT_NORMAL 0x0
+#define ZFS_IMPORT_VERBATIM 0x1
+#define ZFS_IMPORT_ANY_HOST 0x2
+#define ZFS_IMPORT_MISSING_LOG 0x4
+#define ZFS_IMPORT_ONLY 0x8
+#define ZFS_IMPORT_CHECKPOINT 0x10
+#define ZFS_IMPORT_TEMP_NAME 0x20
+#define ZFS_IMPORT_SKIP_MMP 0x40
+
+/*
+ * Channel program argument/return nvlist keys and defaults.
+ */
+#define ZCP_ARG_PROGRAM "program"
+#define ZCP_ARG_ARGLIST "arg"
+#define ZCP_ARG_SYNC "sync"
+#define ZCP_ARG_INSTRLIMIT "instrlimit"
+#define ZCP_ARG_MEMLIMIT "memlimit"
+
+#define ZCP_ARG_CLIARGV "argv"
+
+#define ZCP_RET_ERROR "error"
+#define ZCP_RET_RETURN "return"
+
+#define ZCP_DEFAULT_INSTRLIMIT (10 * 1000 * 1000)
+#define ZCP_MAX_INSTRLIMIT (10 * ZCP_DEFAULT_INSTRLIMIT)
+#define ZCP_DEFAULT_MEMLIMIT (10 * 1024 * 1024)
+#define ZCP_MAX_MEMLIMIT (10 * ZCP_DEFAULT_MEMLIMIT)
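+
+/*
+ * Invocation sketch (illustrative, not part of the original header): the
+ * argument nvlist for ZFS_IOC_CHANNEL_PROGRAM is built roughly like this
+ * (error handling elided; "lua_src" is a hypothetical program string):
+ *
+ *	nvlist_t *args;
+ *	VERIFY0(nvlist_alloc(&args, NV_UNIQUE_NAME, 0));
+ *	VERIFY0(nvlist_add_string(args, ZCP_ARG_PROGRAM, lua_src));
+ *	VERIFY0(nvlist_add_boolean_value(args, ZCP_ARG_SYNC, B_TRUE));
+ *	VERIFY0(nvlist_add_uint64(args, ZCP_ARG_INSTRLIMIT,
+ *	    ZCP_DEFAULT_INSTRLIMIT));
+ */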
+
+/*
+ * nvlist name constants. These facilitate restricting the snapshot iteration
+ * range for the "list next snapshot" ioctl.
+ */
+#define SNAP_ITER_MIN_TXG "snap_iter_min_txg"
+#define SNAP_ITER_MAX_TXG "snap_iter_max_txg"
+
+/*
+ * Sysevent payload members. ZFS will generate the following sysevents with the
+ * given payloads:
+ *
+ * ESC_ZFS_RESILVER_START
+ * ESC_ZFS_RESILVER_END
+ * ESC_ZFS_POOL_DESTROY
+ * ESC_ZFS_POOL_REGUID
+ *
+ * ZFS_EV_POOL_NAME DATA_TYPE_STRING
+ * ZFS_EV_POOL_GUID DATA_TYPE_UINT64
+ *
+ * ESC_ZFS_VDEV_REMOVE
+ * ESC_ZFS_VDEV_CLEAR
+ * ESC_ZFS_VDEV_CHECK
+ *
+ * ZFS_EV_POOL_NAME DATA_TYPE_STRING
+ * ZFS_EV_POOL_GUID DATA_TYPE_UINT64
+ * ZFS_EV_VDEV_PATH DATA_TYPE_STRING (optional)
+ * ZFS_EV_VDEV_GUID DATA_TYPE_UINT64
+ *
+ * ESC_ZFS_HISTORY_EVENT
+ *
+ * ZFS_EV_POOL_NAME DATA_TYPE_STRING
+ * ZFS_EV_POOL_GUID DATA_TYPE_UINT64
+ * ZFS_EV_HIST_TIME DATA_TYPE_UINT64 (optional)
+ * ZFS_EV_HIST_CMD DATA_TYPE_STRING (optional)
+ * ZFS_EV_HIST_WHO DATA_TYPE_UINT64 (optional)
+ * ZFS_EV_HIST_ZONE DATA_TYPE_STRING (optional)
+ * ZFS_EV_HIST_HOST DATA_TYPE_STRING (optional)
+ * ZFS_EV_HIST_TXG DATA_TYPE_UINT64 (optional)
+ * ZFS_EV_HIST_INT_EVENT DATA_TYPE_UINT64 (optional)
+ * ZFS_EV_HIST_INT_STR DATA_TYPE_STRING (optional)
+ * ZFS_EV_HIST_INT_NAME DATA_TYPE_STRING (optional)
+ * ZFS_EV_HIST_IOCTL DATA_TYPE_STRING (optional)
+ * ZFS_EV_HIST_DSNAME DATA_TYPE_STRING (optional)
+ * ZFS_EV_HIST_DSID DATA_TYPE_UINT64 (optional)
+ *
+ * The ZFS_EV_HIST_* members will correspond to the ZPOOL_HIST_* members in the
+ * history log nvlist. The keynames will be free of any spaces or other
+ * characters that consumers of the sysevents might not expect.
+ */
+#define ZFS_EV_POOL_NAME "pool_name"
+#define ZFS_EV_POOL_GUID "pool_guid"
+#define ZFS_EV_VDEV_PATH "vdev_path"
+#define ZFS_EV_VDEV_GUID "vdev_guid"
+#define ZFS_EV_HIST_TIME "history_time"
+#define ZFS_EV_HIST_CMD "history_command"
+#define ZFS_EV_HIST_WHO "history_who"
+#define ZFS_EV_HIST_ZONE "history_zone"
+#define ZFS_EV_HIST_HOST "history_hostname"
+#define ZFS_EV_HIST_TXG "history_txg"
+#define ZFS_EV_HIST_INT_EVENT "history_internal_event"
+#define ZFS_EV_HIST_INT_STR "history_internal_str"
+#define ZFS_EV_HIST_INT_NAME "history_internal_name"
+#define ZFS_EV_HIST_IOCTL "history_ioctl"
+#define ZFS_EV_HIST_DSNAME "history_dsname"
+#define ZFS_EV_HIST_DSID "history_dsid"
+
+#ifdef __cplusplus
+}
+#endif
+
+#endif /* _SYS_FS_ZFS_H */
diff --git a/sys/cddl/contrib/opensolaris/uts/common/sys/fs/zut.h b/sys/cddl/contrib/opensolaris/uts/common/sys/fs/zut.h
new file mode 100644
index 000000000000..36c9eaa7f18e
--- /dev/null
+++ b/sys/cddl/contrib/opensolaris/uts/common/sys/fs/zut.h
@@ -0,0 +1,93 @@
+/*
+ * CDDL HEADER START
+ *
+ * The contents of this file are subject to the terms of the
+ * Common Development and Distribution License (the "License").
+ * You may not use this file except in compliance with the License.
+ *
+ * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
+ * or http://www.opensolaris.org/os/licensing.
+ * See the License for the specific language governing permissions
+ * and limitations under the License.
+ *
+ * When distributing Covered Code, include this CDDL HEADER in each
+ * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
+ * If applicable, add the following below this CDDL HEADER, with the
+ * fields enclosed by brackets "[]" replaced with your own identifying
+ * information: Portions Copyright [yyyy] [name of copyright owner]
+ *
+ * CDDL HEADER END
+ */
+/*
+ * Copyright 2009 Sun Microsystems, Inc. All rights reserved.
+ * Use is subject to license terms.
+ */
+
+#ifndef _ZUT_H
+#define _ZUT_H
+
+/*
+ * IOCTLs for the zfs unit test driver
+ */
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+#include <sys/param.h>
+#include <sys/types.h>
+#include <sys/stat.h>
+
+#define ZUT_DRIVER "zut"
+#define ZUT_DEV "/dev/zut"
+
+#define ZUT_VERSION_STRING "1"
+
+/*
+ * /dev/zut ioctl numbers.
+ */
+#define ZUT_IOC ('U' << 8)
+
+/* Request flags */
+#define ZUT_IGNORECASE 0x01
+#define ZUT_ACCFILTER 0x02
+#define ZUT_XATTR 0x04
+#define ZUT_EXTRDDIR 0x08
+#define ZUT_GETSTAT 0x10
+
+typedef struct zut_lookup {
+ int zl_reqflags;
+ int zl_deflags; /* output */
+ int zl_retcode; /* output */
+ char zl_dir[MAXPATHLEN];
+ char zl_file[MAXNAMELEN];
+ char zl_xfile[MAXNAMELEN];
+ char zl_real[MAXPATHLEN]; /* output */
+ uint64_t zl_xvattrs; /* output */
+ struct stat64 zl_statbuf; /* output */
+} zut_lookup_t;
+
+typedef struct zut_readdir {
+ uint64_t zr_buf; /* pointer to output buffer */
+ uint64_t zr_loffset; /* output */
+ char zr_dir[MAXPATHLEN];
+ char zr_file[MAXNAMELEN];
+ int zr_reqflags;
+ int zr_retcode; /* output */
+ int zr_eof; /* output */
+ uint_t zr_bytes; /* output */
+ uint_t zr_buflen;
+} zut_readdir_t;
+
+typedef enum zut_ioc {
+ ZUT_IOC_MIN_CMD = ZUT_IOC - 1,
+ ZUT_IOC_LOOKUP = ZUT_IOC,
+ ZUT_IOC_READDIR,
+ ZUT_IOC_MAX_CMD
+} zut_ioc_t;
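+
+/*
+ * Usage sketch (illustrative; "fd" is a hypothetical descriptor opened on
+ * ZUT_DEV, error handling elided):
+ *
+ *	zut_lookup_t zl = { 0 };
+ *	zl.zl_reqflags = ZUT_IGNORECASE | ZUT_GETSTAT;
+ *	(void) strlcpy(zl.zl_dir, "/tank/fs", sizeof (zl.zl_dir));
+ *	(void) strlcpy(zl.zl_file, "readme", sizeof (zl.zl_file));
+ *	if (ioctl(fd, ZUT_IOC_LOOKUP, &zl) == 0 && zl.zl_retcode == 0)
+ *		(void) printf("mode %o\n", zl.zl_statbuf.st_mode);
+ */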
+
+#ifdef __cplusplus
+}
+#endif
+
+#endif /* _ZUT_H */
diff --git a/sys/cddl/contrib/opensolaris/uts/common/sys/idmap.h b/sys/cddl/contrib/opensolaris/uts/common/sys/idmap.h
new file mode 100644
index 000000000000..39eeb905c72b
--- /dev/null
+++ b/sys/cddl/contrib/opensolaris/uts/common/sys/idmap.h
@@ -0,0 +1,97 @@
+/*
+ * CDDL HEADER START
+ *
+ * The contents of this file are subject to the terms of the
+ * Common Development and Distribution License (the "License").
+ * You may not use this file except in compliance with the License.
+ *
+ * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
+ * or http://www.opensolaris.org/os/licensing.
+ * See the License for the specific language governing permissions
+ * and limitations under the License.
+ *
+ * When distributing Covered Code, include this CDDL HEADER in each
+ * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
+ * If applicable, add the following below this CDDL HEADER, with the
+ * fields enclosed by brackets "[]" replaced with your own identifying
+ * information: Portions Copyright [yyyy] [name of copyright owner]
+ *
+ * CDDL HEADER END
+ */
+/*
+ * Copyright 2010 Sun Microsystems, Inc. All rights reserved.
+ * Use is subject to license terms.
+ */
+
+#ifndef _SYS_IDMAP_H
+#define _SYS_IDMAP_H
+
+
+/* Idmap status codes */
+#define IDMAP_SUCCESS 0
+#define IDMAP_NEXT 1
+#define IDMAP_ERR_OTHER -10000
+#define IDMAP_ERR_INTERNAL -9999
+#define IDMAP_ERR_MEMORY -9998
+#define IDMAP_ERR_NORESULT -9997
+#define IDMAP_ERR_NOTUSER -9996
+#define IDMAP_ERR_NOTGROUP -9995
+#define IDMAP_ERR_NOTSUPPORTED -9994
+#define IDMAP_ERR_W2U_NAMERULE -9993
+#define IDMAP_ERR_U2W_NAMERULE -9992
+#define IDMAP_ERR_CACHE -9991
+#define IDMAP_ERR_DB -9990
+#define IDMAP_ERR_ARG -9989
+#define IDMAP_ERR_SID -9988
+#define IDMAP_ERR_IDTYPE -9987
+#define IDMAP_ERR_RPC_HANDLE -9986
+#define IDMAP_ERR_RPC -9985
+#define IDMAP_ERR_CLIENT_HANDLE -9984
+#define IDMAP_ERR_BUSY -9983
+#define IDMAP_ERR_PERMISSION_DENIED -9982
+#define IDMAP_ERR_NOMAPPING -9981
+#define IDMAP_ERR_NEW_ID_ALLOC_REQD -9980
+#define IDMAP_ERR_DOMAIN -9979
+#define IDMAP_ERR_SECURITY -9978
+#define IDMAP_ERR_NOTFOUND -9977
+#define IDMAP_ERR_DOMAIN_NOTFOUND -9976
+#define IDMAP_ERR_UPDATE_NOTALLOWED -9975
+#define IDMAP_ERR_CFG -9974
+#define IDMAP_ERR_CFG_CHANGE -9973
+#define IDMAP_ERR_NOTMAPPED_WELLKNOWN -9972
+#define IDMAP_ERR_RETRIABLE_NET_ERR -9971
+#define IDMAP_ERR_W2U_NAMERULE_CONFLICT -9970
+#define IDMAP_ERR_U2W_NAMERULE_CONFLICT -9969
+#define IDMAP_ERR_BAD_UTF8 -9968
+#define IDMAP_ERR_NONE_GENERATED -9967
+#define IDMAP_ERR_PROP_UNKNOWN -9966
+#define IDMAP_ERR_NS_LDAP_OP_FAILED -9965
+#define IDMAP_ERR_NS_LDAP_PARTIAL -9964
+#define IDMAP_ERR_NS_LDAP_CFG -9963
+#define IDMAP_ERR_NS_LDAP_BAD_WINNAME -9962
+#define IDMAP_ERR_NO_ACTIVEDIRECTORY -9961
+
+/* Reserved GIDs for some well-known SIDs */
+#define IDMAP_WK_LOCAL_SYSTEM_GID 2147483648U /* 0x80000000 */
+#define IDMAP_WK_CREATOR_GROUP_GID 2147483649U
+#define IDMAP_WK__MAX_GID 2147483649U
+
+/* Reserved UIDs for some well-known SIDs */
+#define IDMAP_WK_CREATOR_OWNER_UID 2147483648U
+#define IDMAP_WK__MAX_UID 2147483648U
+
+/* Reserved SIDs */
+#define IDMAP_WK_CREATOR_SID_AUTHORITY "S-1-3"
+
+/*
+ * Max door RPC size for ID mapping (can't be too large relative to the
+ * default user-land thread stack size, since clnt_door_call()
+ * alloca()s). See libidmap:idmap_init().
+ */
+#define IDMAP_MAX_DOOR_RPC (256 * 1024)
+
+#define IDMAP_SENTINEL_PID UINT32_MAX
+#define IDMAP_ID_IS_EPHEMERAL(pid) \
+ (((pid) > INT32_MAX) && ((pid) != IDMAP_SENTINEL_PID))
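+
+/*
+ * Worked example (illustrative): 0x80000005 is greater than INT32_MAX and is
+ * not the sentinel, so IDMAP_ID_IS_EPHEMERAL(0x80000005) is true, whereas
+ * IDMAP_ID_IS_EPHEMERAL(IDMAP_SENTINEL_PID) is false by construction.
+ */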
+
+#endif /* _SYS_IDMAP_H */
diff --git a/sys/cddl/contrib/opensolaris/uts/common/sys/isa_defs.h b/sys/cddl/contrib/opensolaris/uts/common/sys/isa_defs.h
new file mode 100644
index 000000000000..93f1855b3908
--- /dev/null
+++ b/sys/cddl/contrib/opensolaris/uts/common/sys/isa_defs.h
@@ -0,0 +1,697 @@
+/*
+ * CDDL HEADER START
+ *
+ * The contents of this file are subject to the terms of the
+ * Common Development and Distribution License (the "License").
+ * You may not use this file except in compliance with the License.
+ *
+ * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
+ * or http://www.opensolaris.org/os/licensing.
+ * See the License for the specific language governing permissions
+ * and limitations under the License.
+ *
+ *
+ * When distributing Covered Code, include this CDDL HEADER in each
+ * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
+ * If applicable, add the following below this CDDL HEADER, with the
+ * fields enclosed by brackets "[]" replaced with your own identifying
+ * information: Portions Copyright [yyyy] [name of copyright owner]
+ *
+ * CDDL HEADER END
+ */
+
+/*
+ * Copyright 2008 Sun Microsystems, Inc. All rights reserved.
+ * Use is subject to license terms.
+ */
+
+#ifndef _SYS_ISA_DEFS_H
+#define _SYS_ISA_DEFS_H
+
+/*
+ * This header file serves to group a set of well-known defines and to
+ * set these for each instruction set architecture. These defines may
+ * be divided into two groups: characteristics of the processor and
+ * implementation choices for Solaris on a processor.
+ *
+ * Processor Characteristics:
+ *
+ * _LITTLE_ENDIAN / _BIG_ENDIAN:
+ * The natural byte order of the processor. A pointer to an int points
+ * to the least/most significant byte of that int.
+ *
+ * _STACK_GROWS_UPWARD / _STACK_GROWS_DOWNWARD:
+ * The processor specific direction of stack growth. A push onto the
+ * stack increases/decreases the stack pointer, so it stores data at
+ * successively higher/lower addresses. (Stackless machines ignored
+ * without regrets).
+ *
+ * _LONG_LONG_HTOL / _LONG_LONG_LTOH:
+ * A pointer to a long long points to the most/least significant long
+ * within that long long.
+ *
+ * _BIT_FIELDS_HTOL / _BIT_FIELDS_LTOH:
+ * The C compiler assigns bit fields from the high/low to the low/high end
+ * of an int (most to least significant vs. least to most significant).
+ *
+ * _IEEE_754:
+ * The processor (or supported implementations of the processor)
+ * supports the IEEE 754 floating point standard. No other floating
+ * point standards are supported (or significant). Any other supported
+ * floating point formats are expected to be cased on the ISA processor
+ * symbol.
+ *
+ * _CHAR_IS_UNSIGNED / _CHAR_IS_SIGNED:
+ * The C Compiler implements objects of type `char' as `unsigned' or
+ * `signed' respectively. This is really an implementation choice of
+ * the compiler writer, but it is specified in the ABI and tends to
+ * be uniform across compilers for an instruction set architecture.
+ * Hence, it has the properties of a processor characteristic.
+ *
+ * _CHAR_ALIGNMENT / _SHORT_ALIGNMENT / _INT_ALIGNMENT / _LONG_ALIGNMENT /
+ * _LONG_LONG_ALIGNMENT / _DOUBLE_ALIGNMENT / _LONG_DOUBLE_ALIGNMENT /
+ * _POINTER_ALIGNMENT / _FLOAT_ALIGNMENT:
+ * The ABI defines alignment requirements of each of the primitive
+ * object types. Some, if not all, may be hardware requirements as
+ * well. The values are expressed in "byte-alignment" units.
+ *
+ * _MAX_ALIGNMENT:
+ * The most stringent alignment requirement as specified by the ABI.
+ * Equal to the maximum of all the above _XXX_ALIGNMENT values.
+ *
+ * _ALIGNMENT_REQUIRED:
+ * True or false (1 or 0) whether or not the hardware requires the ABI
+ * alignment.
+ *
+ * _LONG_LONG_ALIGNMENT_32
+ * The 32-bit ABI supported by a 64-bit kernel may have different
+ * alignment requirements for primitive object types. The value of this
+ * identifier is expressed in "byte-alignment" units.
+ *
+ * _HAVE_CPUID_INSN
+ * This indicates that the architecture supports the 'cpuid'
+ * instruction as defined by Intel. (Intel allows other vendors
+ * to extend the instruction for their own purposes.)
+ *
+ *
+ * Implementation Choices:
+ *
+ * _ILP32 / _LP64:
+ * This specifies the compiler data type implementation as specified in
+ * the relevant ABI. The choice between these is strongly influenced
+ * by the underlying hardware, but is not absolutely tied to it.
+ * Currently only two data type models are supported:
+ *
+ * _ILP32:
+ * Int/Long/Pointer are 32 bits. This is the historical UNIX
+ * and Solaris implementation. Due to its historical standing,
+ * this is the default case.
+ *
+ * _LP64:
+ * Long/Pointer are 64 bits, Int is 32 bits. This is the chosen
+ * implementation for 64-bit ABIs such as SPARC V9.
+ *
+ * _I32LPx:
+ * A compilation environment where 'int' is 32-bit, and
+ * longs and pointers are simply the same size.
+ *
+ * In all cases, Char is 8 bits and Short is 16 bits.
+ *
+ * _SUNOS_VTOC_8 / _SUNOS_VTOC_16 / _SVR4_VTOC_16:
+ * This specifies the form of the disk VTOC (or label):
+ *
+ * _SUNOS_VTOC_8:
+ * This is a VTOC form which is upwardly compatible with the
+ * SunOS 4.x disk label and allows 8 partitions per disk.
+ *
+ * _SUNOS_VTOC_16:
+ * In this format the incore vtoc image matches the ondisk
+ * version. It allows 16 slices per disk, and is not
+ * compatible with the SunOS 4.x disk label.
+ *
+ * Note that these are not the only two VTOC forms possible and
+ * additional forms may be added. One possible form would be the
+ * SVr4 VTOC form. The symbol for that is reserved now, although
+ * it is not implemented.
+ *
+ * _SVR4_VTOC_16:
+ * This VTOC form is compatible with the System V Release 4
+ * VTOC (as implemented on the SVr4 Intel and 3b ports) with
+ * 16 partitions per disk.
+ *
+ *
+ * _DMA_USES_PHYSADDR / _DMA_USES_VIRTADDR
+ * This describes the type of addresses used by system DMA:
+ *
+ * _DMA_USES_PHYSADDR:
+ * This type of DMA, used in the x86 implementation,
+ * requires physical addresses for DMA buffers. The 24-bit
+ * addresses used by some legacy boards is the source of the
+ * "low-memory" (<16MB) requirement for some devices using DMA.
+ *
+ * _DMA_USES_VIRTADDR:
+ * This method of DMA allows the use of virtual addresses for
+ * DMA transfers.
+ *
+ * _FIRMWARE_NEEDS_FDISK / _NO_FDISK_PRESENT
+ * This indicates the presence/absence of an fdisk table.
+ *
+ * _FIRMWARE_NEEDS_FDISK
+ * The fdisk table is required by system firmware. If present,
+ * it allows a disk to be subdivided into multiple fdisk
+ * partitions, each of which is equivalent to a separate,
+ * virtual disk. This enables the co-existence of multiple
+ * operating systems on a shared hard disk.
+ *
+ * _NO_FDISK_PRESENT
+ * If the fdisk table is absent, it is assumed that the entire
+ * media is allocated for a single operating system.
+ *
+ * _HAVE_TEM_FIRMWARE
+ * Defined if this architecture has the (fallback) option of
+ * using prom_* calls for doing I/O if a suitable kernel driver
+ * is not available to do it.
+ *
+ * _DONT_USE_1275_GENERIC_NAMES
+ * Controls whether or not device tree node names should
+ * comply with the IEEE 1275 "Generic Names" Recommended
+ * Practice. With _DONT_USE_1275_GENERIC_NAMES, device-specific
+ * names identifying the particular device will be used.
+ *
+ * __i386_COMPAT
+ * This indicates whether the i386 ABI is supported as a *non-native*
+ * mode for the platform. When this symbol is defined:
+ * - 32-bit xstat-style system calls are enabled
+ * - 32-bit xmknod-style system calls are enabled
+ * - 32-bit system calls use i386 sizes -and- alignments
+ *
+ * Note that this is NOT defined for the i386 native environment!
+ *
+ * __x86
+ * This is ONLY a synonym for defined(__i386) || defined(__amd64)
+ * which is useful only insofar as these two architectures share
+ * common attributes. Analogous to __sparc.
+ *
+ * _PSM_MODULES
+ * This indicates whether or not the implementation uses PSM
+ * modules for processor support, reading /etc/mach from inside
+ * the kernel to extract a list.
+ *
+ * _RTC_CONFIG
+ * This indicates whether or not the implementation uses /etc/rtc_config
+ * to configure the real-time clock in the kernel.
+ *
+ * _UNIX_KRTLD
+ * This indicates that the implementation uses a dynamically
+ * linked unix + krtld to form the core kernel image at boot
+ * time, or (in the absence of this symbol) a prelinked kernel image.
+ *
+ * _OBP
+ * This indicates the firmware interface is OBP.
+ *
+ * _SOFT_HOSTID
+ * This indicates that the implementation obtains the hostid
+ * from the file /etc/hostid, rather than from hardware.
+ */
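+
+/*
+ * Consumption sketch (illustrative, not part of the original header): code
+ * should branch on these characteristics rather than on raw ISA symbols:
+ *
+ *	#if defined(_BIT_FIELDS_LTOH)
+ *	(bit fields assigned least- to most-significant)
+ *	#elif defined(_BIT_FIELDS_HTOL)
+ *	(bit fields assigned most- to least-significant)
+ *	#else
+ *	#error "bit-field order is not defined"
+ *	#endif
+ */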
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+/*
+ * The following set of definitions characterize Solaris on AMD's
+ * 64-bit systems.
+ */
+#if defined(__x86_64) || defined(__amd64)
+
+#if !defined(__amd64)
+#define __amd64 /* preferred guard */
+#endif
+
+#if !defined(__x86)
+#define __x86
+#endif
+
+/*
+ * Define the appropriate "processor characteristics"
+ */
+#ifdef illumos
+#define _LITTLE_ENDIAN
+#endif
+#define _STACK_GROWS_DOWNWARD
+#define _LONG_LONG_LTOH
+#define _BIT_FIELDS_LTOH
+#define _IEEE_754
+#define _CHAR_IS_SIGNED
+#define _BOOL_ALIGNMENT 1
+#define _CHAR_ALIGNMENT 1
+#define _SHORT_ALIGNMENT 2
+#define _INT_ALIGNMENT 4
+#define _FLOAT_ALIGNMENT 4
+#define _FLOAT_COMPLEX_ALIGNMENT 4
+#define _LONG_ALIGNMENT 8
+#define _LONG_LONG_ALIGNMENT 8
+#define _DOUBLE_ALIGNMENT 8
+#define _DOUBLE_COMPLEX_ALIGNMENT 8
+#define _LONG_DOUBLE_ALIGNMENT 16
+#define _LONG_DOUBLE_COMPLEX_ALIGNMENT 16
+#define _POINTER_ALIGNMENT 8
+#define _MAX_ALIGNMENT 16
+#define _ALIGNMENT_REQUIRED 1
+
+/*
+ * Different alignment constraints for the i386 ABI in compatibility mode
+ */
+#define _LONG_LONG_ALIGNMENT_32 4
+
+/*
+ * Define the appropriate "implementation choices".
+ */
+#if !defined(_LP64)
+#define _LP64
+#endif
+#if !defined(_I32LPx) && defined(_KERNEL)
+#define _I32LPx
+#endif
+#define _MULTI_DATAMODEL
+#define _SUNOS_VTOC_16
+#define _DMA_USES_PHYSADDR
+#define _FIRMWARE_NEEDS_FDISK
+#define __i386_COMPAT
+#define _PSM_MODULES
+#define _RTC_CONFIG
+#define _SOFT_HOSTID
+#define _DONT_USE_1275_GENERIC_NAMES
+#define _HAVE_CPUID_INSN
+
+/*
+ * The feature test macro __i386 is generic for all processors implementing
+ * the Intel 386 instruction set or a superset of it. Specifically, this
+ * includes all members of the 386, 486, and Pentium family of processors.
+ */
+#elif defined(__i386) || defined(__i386__)
+
+#if !defined(__i386)
+#define __i386
+#endif
+
+#if !defined(__x86)
+#define __x86
+#endif
+
+/*
+ * Define the appropriate "processor characteristics"
+ */
+#ifdef illumos
+#define _LITTLE_ENDIAN
+#endif
+#define _STACK_GROWS_DOWNWARD
+#define _LONG_LONG_LTOH
+#define _BIT_FIELDS_LTOH
+#define _IEEE_754
+#define _CHAR_IS_SIGNED
+#define _BOOL_ALIGNMENT 1
+#define _CHAR_ALIGNMENT 1
+#define _SHORT_ALIGNMENT 2
+#define _INT_ALIGNMENT 4
+#define _FLOAT_ALIGNMENT 4
+#define _FLOAT_COMPLEX_ALIGNMENT 4
+#define _LONG_ALIGNMENT 4
+#define _LONG_LONG_ALIGNMENT 4
+#define _DOUBLE_ALIGNMENT 4
+#define _DOUBLE_COMPLEX_ALIGNMENT 4
+#define _LONG_DOUBLE_ALIGNMENT 4
+#define _LONG_DOUBLE_COMPLEX_ALIGNMENT 4
+#define _POINTER_ALIGNMENT 4
+#define _MAX_ALIGNMENT 4
+#define _ALIGNMENT_REQUIRED 0
+
+#define _LONG_LONG_ALIGNMENT_32 _LONG_LONG_ALIGNMENT
+
+/*
+ * Define the appropriate "implementation choices".
+ */
+#if !defined(_ILP32)
+#define _ILP32
+#endif
+#if !defined(_I32LPx) && defined(_KERNEL)
+#define _I32LPx
+#endif
+#define _SUNOS_VTOC_16
+#define _DMA_USES_PHYSADDR
+#define _FIRMWARE_NEEDS_FDISK
+#define _PSM_MODULES
+#define _RTC_CONFIG
+#define _SOFT_HOSTID
+#define _DONT_USE_1275_GENERIC_NAMES
+#define _HAVE_CPUID_INSN
+
+#elif defined(__aarch64__)
+
+/*
+ * Define the appropriate "processor characteristics"
+ */
+#define _STACK_GROWS_DOWNWARD
+#define _LONG_LONG_LTOH
+#define _BIT_FIELDS_LTOH
+#define _IEEE_754
+#define _CHAR_IS_UNSIGNED
+#define _BOOL_ALIGNMENT 1
+#define _CHAR_ALIGNMENT 1
+#define _SHORT_ALIGNMENT 2
+#define _INT_ALIGNMENT 4
+#define _FLOAT_ALIGNMENT 4
+#define _FLOAT_COMPLEX_ALIGNMENT 4
+#define _LONG_ALIGNMENT 8
+#define _LONG_LONG_ALIGNMENT 8
+#define _DOUBLE_ALIGNMENT 8
+#define _DOUBLE_COMPLEX_ALIGNMENT 8
+#define _LONG_DOUBLE_ALIGNMENT 16
+#define _LONG_DOUBLE_COMPLEX_ALIGNMENT 16
+#define _POINTER_ALIGNMENT 8
+#define _MAX_ALIGNMENT 16
+#define _ALIGNMENT_REQUIRED 1
+
+#define _LONG_LONG_ALIGNMENT_32 _LONG_LONG_ALIGNMENT
+
+/*
+ * Define the appropriate "implementation choices"
+ */
+#if !defined(_LP64)
+#define _LP64
+#endif
+#define _SUNOS_VTOC_16
+#define _DMA_USES_PHYSADDR
+#define _FIRMWARE_NEEDS_FDISK
+#define _PSM_MODULES
+#define _RTC_CONFIG
+#define _DONT_USE_1275_GENERIC_NAMES
+#define _HAVE_CPUID_INSN
+
+#elif defined(__riscv)
+
+/*
+ * Define the appropriate "processor characteristics"
+ */
+#define _STACK_GROWS_DOWNWARD
+#define _LONG_LONG_LTOH
+#define _BIT_FIELDS_LTOH
+#define _IEEE_754
+#define _CHAR_IS_UNSIGNED
+#define _BOOL_ALIGNMENT 1
+#define _CHAR_ALIGNMENT 1
+#define _SHORT_ALIGNMENT 2
+#define _INT_ALIGNMENT 4
+#define _FLOAT_ALIGNMENT 4
+#define _FLOAT_COMPLEX_ALIGNMENT 4
+#define _LONG_ALIGNMENT 8
+#define _LONG_LONG_ALIGNMENT 8
+#define _DOUBLE_ALIGNMENT 8
+#define _DOUBLE_COMPLEX_ALIGNMENT 8
+#define _LONG_DOUBLE_ALIGNMENT 16
+#define _LONG_DOUBLE_COMPLEX_ALIGNMENT 16
+#define _POINTER_ALIGNMENT 8
+#define _MAX_ALIGNMENT 16
+#define _ALIGNMENT_REQUIRED 1
+
+#define _LONG_LONG_ALIGNMENT_32 _LONG_LONG_ALIGNMENT
+
+/*
+ * Define the appropriate "implementation choices"
+ */
+#if !defined(_LP64)
+#define _LP64
+#endif
+#define _SUNOS_VTOC_16
+#define _DMA_USES_PHYSADDR
+#define _FIRMWARE_NEEDS_FDISK
+#define _PSM_MODULES
+#define _RTC_CONFIG
+#define _DONT_USE_1275_GENERIC_NAMES
+#define _HAVE_CPUID_INSN
+
+#elif defined(__arm__)
+
+/*
+ * Define the appropriate "processor characteristics"
+ */
+#define _STACK_GROWS_DOWNWARD
+#define _LONG_LONG_LTOH
+#define _BIT_FIELDS_LTOH
+#define _IEEE_754
+#define _CHAR_IS_SIGNED
+#define _BOOL_ALIGNMENT 1
+#define _CHAR_ALIGNMENT 1
+#define _SHORT_ALIGNMENT 2
+#define _INT_ALIGNMENT 4
+#define _FLOAT_ALIGNMENT 4
+#define _FLOAT_COMPLEX_ALIGNMENT 4
+#define _LONG_ALIGNMENT 4
+#define _LONG_LONG_ALIGNMENT 4
+#define _DOUBLE_ALIGNMENT 4
+#define _DOUBLE_COMPLEX_ALIGNMENT 4
+#define _LONG_DOUBLE_ALIGNMENT 4
+#define _LONG_DOUBLE_COMPLEX_ALIGNMENT 4
+#define _POINTER_ALIGNMENT 4
+#define _MAX_ALIGNMENT 4
+#define _ALIGNMENT_REQUIRED 0
+
+#define _LONG_LONG_ALIGNMENT_32 _LONG_LONG_ALIGNMENT
+
+/*
+ * Define the appropriate "implementation choices".
+ */
+#if !defined(_ILP32)
+#define _ILP32
+#endif
+#if !defined(_I32LPx) && defined(_KERNEL)
+#define _I32LPx
+#endif
+#define _SUNOS_VTOC_16
+#define _DMA_USES_PHYSADDR
+#define _FIRMWARE_NEEDS_FDISK
+#define _PSM_MODULES
+#define _RTC_CONFIG
+#define _DONT_USE_1275_GENERIC_NAMES
+#define _HAVE_CPUID_INSN
+
+#elif defined(__mips__)
+
+/*
+ * Define the appropriate "processor characteristics"
+ */
+#define _STACK_GROWS_DOWNWARD
+#define _LONG_LONG_LTOH
+#define _BIT_FIELDS_LTOH
+#define _IEEE_754
+#define _CHAR_IS_SIGNED
+#define _BOOL_ALIGNMENT 1
+#define _CHAR_ALIGNMENT 1
+#define _SHORT_ALIGNMENT 2
+#define _INT_ALIGNMENT 4
+#define _FLOAT_ALIGNMENT 4
+#define _FLOAT_COMPLEX_ALIGNMENT 4
+#if defined(__mips_n64)
+#define _LONG_ALIGNMENT 8
+#define _LONG_LONG_ALIGNMENT 8
+#define _DOUBLE_ALIGNMENT 8
+#define _DOUBLE_COMPLEX_ALIGNMENT 8
+#define _LONG_DOUBLE_ALIGNMENT 8
+#define _LONG_DOUBLE_COMPLEX_ALIGNMENT 8
+#define _POINTER_ALIGNMENT 8
+#define _MAX_ALIGNMENT 8
+#define _ALIGNMENT_REQUIRED 0
+
+#define _LONG_LONG_ALIGNMENT_32 _INT_ALIGNMENT
+/*
+ * Define the appropriate "implementation choices".
+ */
+#if !defined(_LP64)
+#define _LP64
+#endif
+#else
+#define _LONG_ALIGNMENT 4
+#define _LONG_LONG_ALIGNMENT 4
+#define _DOUBLE_ALIGNMENT 4
+#define _DOUBLE_COMPLEX_ALIGNMENT 4
+#define _LONG_DOUBLE_ALIGNMENT 4
+#define _LONG_DOUBLE_COMPLEX_ALIGNMENT 4
+#define _POINTER_ALIGNMENT 4
+#define _MAX_ALIGNMENT 4
+#define _ALIGNMENT_REQUIRED 0
+
+#define _LONG_LONG_ALIGNMENT_32 _LONG_LONG_ALIGNMENT
+
+/*
+ * Define the appropriate "implementation choices".
+ */
+#if !defined(_ILP32)
+#define _ILP32
+#endif
+#if !defined(_I32LPx) && defined(_KERNEL)
+#define _I32LPx
+#endif
+#endif
+#define _SUNOS_VTOC_16
+#define _DMA_USES_PHYSADDR
+#define _FIRMWARE_NEEDS_FDISK
+#define _PSM_MODULES
+#define _RTC_CONFIG
+#define _DONT_USE_1275_GENERIC_NAMES
+#define _HAVE_CPUID_INSN
+
+#elif defined(__powerpc__)
+
+#if defined(__BIG_ENDIAN__)
+#define _BIT_FIELDS_HTOL
+#else
+#define _BIT_FIELDS_LTOH
+#endif
+
+/*
+ * The following set of definitions characterize the Solaris on SPARC systems.
+ *
+ * The symbol __sparc indicates any of the SPARC family of processor
+ * architectures. This includes SPARC V7, SPARC V8 and SPARC V9.
+ *
+ * The symbol __sparcv8 indicates the 32-bit SPARC V8 architecture as defined
+ * by Version 8 of the SPARC Architecture Manual. (SPARC V7 is close enough
+ * to SPARC V8 for the former to be subsumed into the latter definition.)
+ *
+ * The symbol __sparcv9 indicates the 64-bit SPARC V9 architecture as defined
+ * by Version 9 of the SPARC Architecture Manual.
+ *
+ * The symbols __sparcv8 and __sparcv9 are mutually exclusive, and are only
+ * relevant when the symbol __sparc is defined.
+ */
+/*
+ * XXX Due to the existence of 5110166, "defined(__sparcv9)" needs to be added
+ * to support backwards builds. This workaround should be removed in s10_71.
+ */
+#elif defined(__sparc) || defined(__sparcv9) || defined(__sparc__)
+#if !defined(__sparc)
+#define __sparc
+#endif
+
+/*
+ * You can be 32-bit or 64-bit, but not both at the same time.
+ */
+#if defined(__sparcv8) && defined(__sparcv9)
+#error "SPARC Versions 8 and 9 are mutually exclusive choices"
+#endif
+
+/*
+ * Existing compilers do not set __sparcv8. Years will transpire before
+ * the compilers can be depended on to set the feature test macro. In
+ * the interim, we'll set it here on the basis of historical behaviour;
+ * if you haven't asked for SPARC V9, then you must've meant SPARC V8.
+ */
+#if !defined(__sparcv9) && !defined(__sparcv8)
+#define __sparcv8
+#endif
+
+/*
+ * Define the appropriate "processor characteristics" shared between
+ * all Solaris on SPARC systems.
+ */
+#ifdef illumos
+#define _BIG_ENDIAN
+#endif
+#define _STACK_GROWS_DOWNWARD
+#define _LONG_LONG_HTOL
+#define _BIT_FIELDS_HTOL
+#define _IEEE_754
+#define _CHAR_IS_SIGNED
+#define _BOOL_ALIGNMENT 1
+#define _CHAR_ALIGNMENT 1
+#define _SHORT_ALIGNMENT 2
+#define _INT_ALIGNMENT 4
+#define _FLOAT_ALIGNMENT 4
+#define _FLOAT_COMPLEX_ALIGNMENT 4
+#define _LONG_LONG_ALIGNMENT 8
+#define _DOUBLE_ALIGNMENT 8
+#define _DOUBLE_COMPLEX_ALIGNMENT 8
+#define _ALIGNMENT_REQUIRED 1
+
+/*
+ * Define the appropriate "implementation choices" shared between versions.
+ */
+#define _SUNOS_VTOC_8
+#define _DMA_USES_VIRTADDR
+#define _NO_FDISK_PRESENT
+#define _HAVE_TEM_FIRMWARE
+#define _OBP
+
+/*
+ * The following set of definitions characterize the implementation of
+ * 32-bit Solaris on SPARC V8 systems.
+ */
+#if defined(__sparcv8)
+
+/*
+ * Define the appropriate "processor characteristics"
+ */
+#define _LONG_ALIGNMENT 4
+#define _LONG_DOUBLE_ALIGNMENT 8
+#define _LONG_DOUBLE_COMPLEX_ALIGNMENT 8
+#define _POINTER_ALIGNMENT 4
+#define _MAX_ALIGNMENT 8
+
+#define _LONG_LONG_ALIGNMENT_32 _LONG_LONG_ALIGNMENT
+
+/*
+ * Define the appropriate "implementation choices"
+ */
+#define _ILP32
+#if !defined(_I32LPx) && defined(_KERNEL)
+#define _I32LPx
+#endif
+
+/*
+ * The following set of definitions characterize the implementation of
+ * 64-bit Solaris on SPARC V9 systems.
+ */
+#elif defined(__sparcv9)
+
+/*
+ * Define the appropriate "processor characteristics"
+ */
+#define _LONG_ALIGNMENT 8
+#define _LONG_DOUBLE_ALIGNMENT 16
+#define _LONG_DOUBLE_COMPLEX_ALIGNMENT 16
+#define _POINTER_ALIGNMENT 8
+#define _MAX_ALIGNMENT 16
+
+#define _LONG_LONG_ALIGNMENT_32 _LONG_LONG_ALIGNMENT
+
+/*
+ * Define the appropriate "implementation choices"
+ */
+#if !defined(_LP64)
+#define _LP64
+#endif
+#if !defined(_I32LPx)
+#define _I32LPx
+#endif
+#define _MULTI_DATAMODEL
+
+#else
+#error "unknown SPARC version"
+#endif
+
+/*
+ * #error is strictly ANSI C, but works as well as anything for K&R systems.
+ */
+#else
+#error "ISA not supported"
+#endif
+
+#if defined(_ILP32) && defined(_LP64)
+#error "Both _ILP32 and _LP64 are defined"
+#endif
+
+#ifdef __cplusplus
+}
+#endif
+
+#endif /* _SYS_ISA_DEFS_H */
diff --git a/sys/cddl/contrib/opensolaris/uts/common/sys/list.h b/sys/cddl/contrib/opensolaris/uts/common/sys/list.h
new file mode 100644
index 000000000000..6db92ed42955
--- /dev/null
+++ b/sys/cddl/contrib/opensolaris/uts/common/sys/list.h
@@ -0,0 +1,65 @@
+/*
+ * CDDL HEADER START
+ *
+ * The contents of this file are subject to the terms of the
+ * Common Development and Distribution License (the "License").
+ * You may not use this file except in compliance with the License.
+ *
+ * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
+ * or http://www.opensolaris.org/os/licensing.
+ * See the License for the specific language governing permissions
+ * and limitations under the License.
+ *
+ * When distributing Covered Code, include this CDDL HEADER in each
+ * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
+ * If applicable, add the following below this CDDL HEADER, with the
+ * fields enclosed by brackets "[]" replaced with your own identifying
+ * information: Portions Copyright [yyyy] [name of copyright owner]
+ *
+ * CDDL HEADER END
+ */
+/*
+ * Copyright 2008 Sun Microsystems, Inc. All rights reserved.
+ * Use is subject to license terms.
+ */
+
+#ifndef _SYS_LIST_H
+#define _SYS_LIST_H
+
+#include <sys/list_impl.h>
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+typedef struct list_node list_node_t;
+typedef struct list list_t;
+
+void list_create(list_t *, size_t, size_t);
+void list_destroy(list_t *);
+
+void list_insert_after(list_t *, void *, void *);
+void list_insert_before(list_t *, void *, void *);
+void list_insert_head(list_t *, void *);
+void list_insert_tail(list_t *, void *);
+void list_remove(list_t *, void *);
+void *list_remove_head(list_t *);
+void *list_remove_tail(list_t *);
+void list_move_tail(list_t *, list_t *);
+
+void *list_head(list_t *);
+void *list_tail(list_t *);
+void *list_next(list_t *, void *);
+void *list_prev(list_t *, void *);
+int list_is_empty(list_t *);
+
+void list_link_init(list_node_t *);
+void list_link_replace(list_node_t *, list_node_t *);
+
+int list_link_active(list_node_t *);
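+
+/*
+ * Usage sketch (illustrative; "foo_t" is a hypothetical type): the list is
+ * created with the size of the containing structure and the byte offset of
+ * its embedded list_node_t:
+ *
+ *	typedef struct foo { int foo_val; list_node_t foo_node; } foo_t;
+ *	list_t l;
+ *	list_create(&l, sizeof (foo_t), offsetof(foo_t, foo_node));
+ *	list_insert_tail(&l, fp);	 fp is a foo_t *
+ *	for (foo_t *f = list_head(&l); f != NULL; f = list_next(&l, f))
+ *		...
+ */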
+
+#ifdef __cplusplus
+}
+#endif
+
+#endif /* _SYS_LIST_H */
diff --git a/sys/cddl/contrib/opensolaris/uts/common/sys/list_impl.h b/sys/cddl/contrib/opensolaris/uts/common/sys/list_impl.h
new file mode 100644
index 000000000000..a6614f9a38c2
--- /dev/null
+++ b/sys/cddl/contrib/opensolaris/uts/common/sys/list_impl.h
@@ -0,0 +1,51 @@
+/*
+ * CDDL HEADER START
+ *
+ * The contents of this file are subject to the terms of the
+ * Common Development and Distribution License, Version 1.0 only
+ * (the "License"). You may not use this file except in compliance
+ * with the License.
+ *
+ * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
+ * or http://www.opensolaris.org/os/licensing.
+ * See the License for the specific language governing permissions
+ * and limitations under the License.
+ *
+ * When distributing Covered Code, include this CDDL HEADER in each
+ * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
+ * If applicable, add the following below this CDDL HEADER, with the
+ * fields enclosed by brackets "[]" replaced with your own identifying
+ * information: Portions Copyright [yyyy] [name of copyright owner]
+ *
+ * CDDL HEADER END
+ */
+/*
+ * Copyright 2003 Sun Microsystems, Inc. All rights reserved.
+ * Use is subject to license terms.
+ */
+
+#ifndef _SYS_LIST_IMPL_H
+#define _SYS_LIST_IMPL_H
+
+#include <sys/types.h>
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+struct list_node {
+ struct list_node *list_next;
+ struct list_node *list_prev;
+};
+
+struct list {
+ size_t list_size;
+ size_t list_offset;
+ struct list_node list_head;
+};
+
+#ifdef __cplusplus
+}
+#endif
+
+#endif /* _SYS_LIST_IMPL_H */
diff --git a/sys/cddl/contrib/opensolaris/uts/common/sys/note.h b/sys/cddl/contrib/opensolaris/uts/common/sys/note.h
new file mode 100644
index 000000000000..6625b68d9eb4
--- /dev/null
+++ b/sys/cddl/contrib/opensolaris/uts/common/sys/note.h
@@ -0,0 +1,54 @@
+/*
+ * CDDL HEADER START
+ *
+ * The contents of this file are subject to the terms of the
+ * Common Development and Distribution License, Version 1.0 only
+ * (the "License"). You may not use this file except in compliance
+ * with the License.
+ *
+ * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
+ * or http://www.opensolaris.org/os/licensing.
+ * See the License for the specific language governing permissions
+ * and limitations under the License.
+ *
+ * When distributing Covered Code, include this CDDL HEADER in each
+ * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
+ * If applicable, add the following below this CDDL HEADER, with the
+ * fields enclosed by brackets "[]" replaced with your own identifying
+ * information: Portions Copyright [yyyy] [name of copyright owner]
+ *
+ * CDDL HEADER END
+ */
+/*
+ * Copyright (c) 1994 by Sun Microsystems, Inc.
+ */
+
+/*
+ * sys/note.h: interface for annotating source with info for tools
+ *
+ * This is the underlying interface; NOTE (/usr/include/note.h) is the
+ * preferred interface, but all exported header files should include this
+ * file directly and use _NOTE so as not to take "NOTE" from the user's
+ * namespace. For consistency, *all* kernel source should use _NOTE.
+ *
+ * By default, annotations expand to nothing. This file implements
+ * that. Tools using annotations will interpose a different version
+ * of this file that will expand annotations as needed.
+ */
+
+#ifndef _SYS_NOTE_H
+#define _SYS_NOTE_H
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+#ifndef _NOTE
+#define _NOTE(s)
+#endif
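+
+/*
+ * Usage sketch (illustrative): annotations are written through _NOTE and
+ * vanish in ordinary builds, e.g.
+ *
+ *	_NOTE(ARGSUSED(0))
+ *	static int
+ *	handler(void *unused, int ev) { return (ev); }
+ */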
+
+#ifdef __cplusplus
+}
+#endif
+
+#endif /* _SYS_NOTE_H */
diff --git a/sys/cddl/contrib/opensolaris/uts/common/sys/nvpair.h b/sys/cddl/contrib/opensolaris/uts/common/sys/nvpair.h
new file mode 100644
index 000000000000..52d6aea0a364
--- /dev/null
+++ b/sys/cddl/contrib/opensolaris/uts/common/sys/nvpair.h
@@ -0,0 +1,351 @@
+/*
+ * CDDL HEADER START
+ *
+ * The contents of this file are subject to the terms of the
+ * Common Development and Distribution License (the "License").
+ * You may not use this file except in compliance with the License.
+ *
+ * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
+ * or http://www.opensolaris.org/os/licensing.
+ * See the License for the specific language governing permissions
+ * and limitations under the License.
+ *
+ * When distributing Covered Code, include this CDDL HEADER in each
+ * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
+ * If applicable, add the following below this CDDL HEADER, with the
+ * fields enclosed by brackets "[]" replaced with your own identifying
+ * information: Portions Copyright [yyyy] [name of copyright owner]
+ *
+ * CDDL HEADER END
+ */
+/*
+ * Copyright (c) 2000, 2010, Oracle and/or its affiliates. All rights reserved.
+ * Copyright (c) 2012, 2017 by Delphix. All rights reserved.
+ */
+
+#ifndef _SYS_NVPAIR_H
+#define _SYS_NVPAIR_H
+
+#include <sys/types.h>
+#include <sys/time.h>
+#include <sys/errno.h>
+
+#if defined(_KERNEL) && !defined(_BOOT)
+#include <sys/kmem.h>
+#endif
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+typedef enum {
+ DATA_TYPE_DONTCARE = -1,
+ DATA_TYPE_UNKNOWN = 0,
+ DATA_TYPE_BOOLEAN,
+ DATA_TYPE_BYTE,
+ DATA_TYPE_INT16,
+ DATA_TYPE_UINT16,
+ DATA_TYPE_INT32,
+ DATA_TYPE_UINT32,
+ DATA_TYPE_INT64,
+ DATA_TYPE_UINT64,
+ DATA_TYPE_STRING,
+ DATA_TYPE_BYTE_ARRAY,
+ DATA_TYPE_INT16_ARRAY,
+ DATA_TYPE_UINT16_ARRAY,
+ DATA_TYPE_INT32_ARRAY,
+ DATA_TYPE_UINT32_ARRAY,
+ DATA_TYPE_INT64_ARRAY,
+ DATA_TYPE_UINT64_ARRAY,
+ DATA_TYPE_STRING_ARRAY,
+ DATA_TYPE_HRTIME,
+ DATA_TYPE_NVLIST,
+ DATA_TYPE_NVLIST_ARRAY,
+ DATA_TYPE_BOOLEAN_VALUE,
+ DATA_TYPE_INT8,
+ DATA_TYPE_UINT8,
+ DATA_TYPE_BOOLEAN_ARRAY,
+ DATA_TYPE_INT8_ARRAY,
+#if !defined(_KERNEL)
+ DATA_TYPE_UINT8_ARRAY,
+ DATA_TYPE_DOUBLE
+#else
+ DATA_TYPE_UINT8_ARRAY
+#endif
+} data_type_t;
+
+typedef struct nvpair {
+ int32_t nvp_size; /* size of this nvpair */
+ int16_t nvp_name_sz; /* length of name string */
+ int16_t nvp_reserve; /* not used */
+ int32_t nvp_value_elem; /* number of elements for array types */
+ data_type_t nvp_type; /* type of value */
+ /* name string */
+ /* aligned ptr array for string arrays */
+ /* aligned array of data for value */
+} nvpair_t;
+
+/* nvlist header */
+typedef struct nvlist {
+ int32_t nvl_version;
+ uint32_t nvl_nvflag; /* persistent flags */
+ uint64_t nvl_priv; /* ptr to private data if not packed */
+ uint32_t nvl_flag;
+ int32_t nvl_pad; /* currently not used, for alignment */
+} nvlist_t;
+
+/* nvp implementation version */
+#define NV_VERSION 0
+
+/* nvlist pack encoding */
+#define NV_ENCODE_NATIVE 0
+#define NV_ENCODE_XDR 1
+
+/* nvlist persistent unique name flags, stored in nvl_nvflag */
+#define NV_UNIQUE_NAME 0x1
+#define NV_UNIQUE_NAME_TYPE 0x2
+
+/* nvlist lookup pairs related flags */
+#define NV_FLAG_NOENTOK 0x1
+
+/* convenience macros */
+#define NV_ALIGN(x) (((ulong_t)(x) + 7ul) & ~7ul)
+#define NV_ALIGN4(x) (((x) + 3) & ~3)
+
+#define NVP_SIZE(nvp) ((nvp)->nvp_size)
+#define NVP_NAME(nvp) ((char *)(nvp) + sizeof (nvpair_t))
+#define NVP_TYPE(nvp) ((nvp)->nvp_type)
+#define NVP_NELEM(nvp) ((nvp)->nvp_value_elem)
+#define NVP_VALUE(nvp) ((char *)(nvp) + NV_ALIGN(sizeof (nvpair_t) \
+ + (nvp)->nvp_name_sz))
+
+#define NVL_VERSION(nvl) ((nvl)->nvl_version)
+#define NVL_SIZE(nvl) ((nvl)->nvl_size)
+#define NVL_FLAG(nvl) ((nvl)->nvl_flag)
+
+/* NV allocator framework */
+typedef struct nv_alloc_ops nv_alloc_ops_t;
+
+typedef struct nv_alloc {
+ const nv_alloc_ops_t *nva_ops;
+ void *nva_arg;
+} nv_alloc_t;
+
+struct nv_alloc_ops {
+ int (*nv_ao_init)(nv_alloc_t *, __va_list);
+ void (*nv_ao_fini)(nv_alloc_t *);
+ void *(*nv_ao_alloc)(nv_alloc_t *, size_t);
+ void (*nv_ao_free)(nv_alloc_t *, void *, size_t);
+ void (*nv_ao_reset)(nv_alloc_t *);
+};
+
+extern const nv_alloc_ops_t *nv_fixed_ops;
+extern nv_alloc_t *nv_alloc_nosleep;
+
+#if defined(_KERNEL) && !defined(_BOOT)
+extern nv_alloc_t *nv_alloc_sleep;
+#endif
+
+int nv_alloc_init(nv_alloc_t *, const nv_alloc_ops_t *, /* args */ ...);
+void nv_alloc_reset(nv_alloc_t *);
+void nv_alloc_fini(nv_alloc_t *);
+
+/* list management */
+int nvlist_alloc(nvlist_t **, uint_t, int);
+void nvlist_free(nvlist_t *);
+int nvlist_size(nvlist_t *, size_t *, int);
+int nvlist_pack(nvlist_t *, char **, size_t *, int, int);
+int nvlist_unpack(char *, size_t, nvlist_t **, int);
+int nvlist_dup(nvlist_t *, nvlist_t **, int);
+int nvlist_merge(nvlist_t *, nvlist_t *, int);
+
+uint_t nvlist_nvflag(nvlist_t *);
+
+int nvlist_xalloc(nvlist_t **, uint_t, nv_alloc_t *);
+int nvlist_xpack(nvlist_t *, char **, size_t *, int, nv_alloc_t *);
+int nvlist_xunpack(char *, size_t, nvlist_t **, nv_alloc_t *);
+int nvlist_xdup(nvlist_t *, nvlist_t **, nv_alloc_t *);
+nv_alloc_t *nvlist_lookup_nv_alloc(nvlist_t *);
+
+int nvlist_add_nvpair(nvlist_t *, nvpair_t *);
+int nvlist_add_boolean(nvlist_t *, const char *);
+int nvlist_add_boolean_value(nvlist_t *, const char *, boolean_t);
+int nvlist_add_byte(nvlist_t *, const char *, uchar_t);
+int nvlist_add_int8(nvlist_t *, const char *, int8_t);
+int nvlist_add_uint8(nvlist_t *, const char *, uint8_t);
+int nvlist_add_int16(nvlist_t *, const char *, int16_t);
+int nvlist_add_uint16(nvlist_t *, const char *, uint16_t);
+int nvlist_add_int32(nvlist_t *, const char *, int32_t);
+int nvlist_add_uint32(nvlist_t *, const char *, uint32_t);
+int nvlist_add_int64(nvlist_t *, const char *, int64_t);
+int nvlist_add_uint64(nvlist_t *, const char *, uint64_t);
+int nvlist_add_string(nvlist_t *, const char *, const char *);
+int nvlist_add_nvlist(nvlist_t *, const char *, nvlist_t *);
+int nvlist_add_boolean_array(nvlist_t *, const char *, boolean_t *, uint_t);
+int nvlist_add_byte_array(nvlist_t *, const char *, uchar_t *, uint_t);
+int nvlist_add_int8_array(nvlist_t *, const char *, int8_t *, uint_t);
+int nvlist_add_uint8_array(nvlist_t *, const char *, uint8_t *, uint_t);
+int nvlist_add_int16_array(nvlist_t *, const char *, int16_t *, uint_t);
+int nvlist_add_uint16_array(nvlist_t *, const char *, uint16_t *, uint_t);
+int nvlist_add_int32_array(nvlist_t *, const char *, int32_t *, uint_t);
+int nvlist_add_uint32_array(nvlist_t *, const char *, uint32_t *, uint_t);
+int nvlist_add_int64_array(nvlist_t *, const char *, int64_t *, uint_t);
+int nvlist_add_uint64_array(nvlist_t *, const char *, uint64_t *, uint_t);
+int nvlist_add_string_array(nvlist_t *, const char *, char *const *, uint_t);
+int nvlist_add_nvlist_array(nvlist_t *, const char *, nvlist_t **, uint_t);
+int nvlist_add_hrtime(nvlist_t *, const char *, hrtime_t);
+#if !defined(_KERNEL)
+int nvlist_add_double(nvlist_t *, const char *, double);
+#endif
+
+int nvlist_remove(nvlist_t *, const char *, data_type_t);
+int nvlist_remove_all(nvlist_t *, const char *);
+int nvlist_remove_nvpair(nvlist_t *, nvpair_t *);
+
+int nvlist_lookup_boolean(nvlist_t *, const char *);
+int nvlist_lookup_boolean_value(nvlist_t *, const char *, boolean_t *);
+int nvlist_lookup_byte(nvlist_t *, const char *, uchar_t *);
+int nvlist_lookup_int8(nvlist_t *, const char *, int8_t *);
+int nvlist_lookup_uint8(nvlist_t *, const char *, uint8_t *);
+int nvlist_lookup_int16(nvlist_t *, const char *, int16_t *);
+int nvlist_lookup_uint16(nvlist_t *, const char *, uint16_t *);
+int nvlist_lookup_int32(nvlist_t *, const char *, int32_t *);
+int nvlist_lookup_uint32(nvlist_t *, const char *, uint32_t *);
+int nvlist_lookup_int64(nvlist_t *, const char *, int64_t *);
+int nvlist_lookup_uint64(nvlist_t *, const char *, uint64_t *);
+int nvlist_lookup_string(nvlist_t *, const char *, char **);
+int nvlist_lookup_nvlist(nvlist_t *, const char *, nvlist_t **);
+int nvlist_lookup_boolean_array(nvlist_t *, const char *,
+ boolean_t **, uint_t *);
+int nvlist_lookup_byte_array(nvlist_t *, const char *, uchar_t **, uint_t *);
+int nvlist_lookup_int8_array(nvlist_t *, const char *, int8_t **, uint_t *);
+int nvlist_lookup_uint8_array(nvlist_t *, const char *, uint8_t **, uint_t *);
+int nvlist_lookup_int16_array(nvlist_t *, const char *, int16_t **, uint_t *);
+int nvlist_lookup_uint16_array(nvlist_t *, const char *, uint16_t **, uint_t *);
+int nvlist_lookup_int32_array(nvlist_t *, const char *, int32_t **, uint_t *);
+int nvlist_lookup_uint32_array(nvlist_t *, const char *, uint32_t **, uint_t *);
+int nvlist_lookup_int64_array(nvlist_t *, const char *, int64_t **, uint_t *);
+int nvlist_lookup_uint64_array(nvlist_t *, const char *, uint64_t **, uint_t *);
+int nvlist_lookup_string_array(nvlist_t *, const char *, char ***, uint_t *);
+int nvlist_lookup_nvlist_array(nvlist_t *, const char *,
+ nvlist_t ***, uint_t *);
+int nvlist_lookup_hrtime(nvlist_t *, const char *, hrtime_t *);
+int nvlist_lookup_pairs(nvlist_t *, int, ...);
+#if !defined(_KERNEL)
+int nvlist_lookup_double(nvlist_t *, const char *, double *);
+#endif
+
+int nvlist_lookup_nvpair(nvlist_t *, const char *, nvpair_t **);
+int nvlist_lookup_nvpair_embedded_index(nvlist_t *, const char *, nvpair_t **,
+ int *, char **);
+boolean_t nvlist_exists(nvlist_t *, const char *);
+boolean_t nvlist_empty(nvlist_t *);
+
+/* processing nvpair */
+nvpair_t *nvlist_next_nvpair(nvlist_t *, nvpair_t *);
+nvpair_t *nvlist_prev_nvpair(nvlist_t *, nvpair_t *);
+char *nvpair_name(nvpair_t *);
+data_type_t nvpair_type(nvpair_t *);
+int nvpair_type_is_array(nvpair_t *);
+int nvpair_value_boolean_value(nvpair_t *, boolean_t *);
+int nvpair_value_byte(nvpair_t *, uchar_t *);
+int nvpair_value_int8(nvpair_t *, int8_t *);
+int nvpair_value_uint8(nvpair_t *, uint8_t *);
+int nvpair_value_int16(nvpair_t *, int16_t *);
+int nvpair_value_uint16(nvpair_t *, uint16_t *);
+int nvpair_value_int32(nvpair_t *, int32_t *);
+int nvpair_value_uint32(nvpair_t *, uint32_t *);
+int nvpair_value_int64(nvpair_t *, int64_t *);
+int nvpair_value_uint64(nvpair_t *, uint64_t *);
+int nvpair_value_string(nvpair_t *, char **);
+int nvpair_value_nvlist(nvpair_t *, nvlist_t **);
+int nvpair_value_boolean_array(nvpair_t *, boolean_t **, uint_t *);
+int nvpair_value_byte_array(nvpair_t *, uchar_t **, uint_t *);
+int nvpair_value_int8_array(nvpair_t *, int8_t **, uint_t *);
+int nvpair_value_uint8_array(nvpair_t *, uint8_t **, uint_t *);
+int nvpair_value_int16_array(nvpair_t *, int16_t **, uint_t *);
+int nvpair_value_uint16_array(nvpair_t *, uint16_t **, uint_t *);
+int nvpair_value_int32_array(nvpair_t *, int32_t **, uint_t *);
+int nvpair_value_uint32_array(nvpair_t *, uint32_t **, uint_t *);
+int nvpair_value_int64_array(nvpair_t *, int64_t **, uint_t *);
+int nvpair_value_uint64_array(nvpair_t *, uint64_t **, uint_t *);
+int nvpair_value_string_array(nvpair_t *, char ***, uint_t *);
+int nvpair_value_nvlist_array(nvpair_t *, nvlist_t ***, uint_t *);
+int nvpair_value_hrtime(nvpair_t *, hrtime_t *);
+#if !defined(_KERNEL)
+int nvpair_value_double(nvpair_t *, double *);
+#endif
+
+nvlist_t *fnvlist_alloc(void);
+void fnvlist_free(nvlist_t *);
+size_t fnvlist_size(nvlist_t *);
+char *fnvlist_pack(nvlist_t *, size_t *);
+void fnvlist_pack_free(char *, size_t);
+nvlist_t *fnvlist_unpack(char *, size_t);
+nvlist_t *fnvlist_dup(nvlist_t *);
+void fnvlist_merge(nvlist_t *, nvlist_t *);
+size_t fnvlist_num_pairs(nvlist_t *);
+
+void fnvlist_add_boolean(nvlist_t *, const char *);
+void fnvlist_add_boolean_value(nvlist_t *, const char *, boolean_t);
+void fnvlist_add_byte(nvlist_t *, const char *, uchar_t);
+void fnvlist_add_int8(nvlist_t *, const char *, int8_t);
+void fnvlist_add_uint8(nvlist_t *, const char *, uint8_t);
+void fnvlist_add_int16(nvlist_t *, const char *, int16_t);
+void fnvlist_add_uint16(nvlist_t *, const char *, uint16_t);
+void fnvlist_add_int32(nvlist_t *, const char *, int32_t);
+void fnvlist_add_uint32(nvlist_t *, const char *, uint32_t);
+void fnvlist_add_int64(nvlist_t *, const char *, int64_t);
+void fnvlist_add_uint64(nvlist_t *, const char *, uint64_t);
+void fnvlist_add_string(nvlist_t *, const char *, const char *);
+void fnvlist_add_nvlist(nvlist_t *, const char *, nvlist_t *);
+void fnvlist_add_nvpair(nvlist_t *, nvpair_t *);
+void fnvlist_add_boolean_array(nvlist_t *, const char *, boolean_t *, uint_t);
+void fnvlist_add_byte_array(nvlist_t *, const char *, uchar_t *, uint_t);
+void fnvlist_add_int8_array(nvlist_t *, const char *, int8_t *, uint_t);
+void fnvlist_add_uint8_array(nvlist_t *, const char *, uint8_t *, uint_t);
+void fnvlist_add_int16_array(nvlist_t *, const char *, int16_t *, uint_t);
+void fnvlist_add_uint16_array(nvlist_t *, const char *, uint16_t *, uint_t);
+void fnvlist_add_int32_array(nvlist_t *, const char *, int32_t *, uint_t);
+void fnvlist_add_uint32_array(nvlist_t *, const char *, uint32_t *, uint_t);
+void fnvlist_add_int64_array(nvlist_t *, const char *, int64_t *, uint_t);
+void fnvlist_add_uint64_array(nvlist_t *, const char *, uint64_t *, uint_t);
+void fnvlist_add_string_array(nvlist_t *, const char *, char * const *, uint_t);
+void fnvlist_add_nvlist_array(nvlist_t *, const char *, nvlist_t **, uint_t);
+
+void fnvlist_remove(nvlist_t *, const char *);
+void fnvlist_remove_nvpair(nvlist_t *, nvpair_t *);
+
+nvpair_t *fnvlist_lookup_nvpair(nvlist_t *nvl, const char *name);
+boolean_t fnvlist_lookup_boolean(nvlist_t *nvl, const char *name);
+boolean_t fnvlist_lookup_boolean_value(nvlist_t *nvl, const char *name);
+uchar_t fnvlist_lookup_byte(nvlist_t *nvl, const char *name);
+int8_t fnvlist_lookup_int8(nvlist_t *nvl, const char *name);
+int16_t fnvlist_lookup_int16(nvlist_t *nvl, const char *name);
+int32_t fnvlist_lookup_int32(nvlist_t *nvl, const char *name);
+int64_t fnvlist_lookup_int64(nvlist_t *nvl, const char *name);
+uint8_t fnvlist_lookup_uint8_t(nvlist_t *nvl, const char *name);
+uint16_t fnvlist_lookup_uint16(nvlist_t *nvl, const char *name);
+uint32_t fnvlist_lookup_uint32(nvlist_t *nvl, const char *name);
+uint64_t fnvlist_lookup_uint64(nvlist_t *nvl, const char *name);
+char *fnvlist_lookup_string(nvlist_t *nvl, const char *name);
+nvlist_t *fnvlist_lookup_nvlist(nvlist_t *nvl, const char *name);
+
+boolean_t fnvpair_value_boolean_value(nvpair_t *nvp);
+uchar_t fnvpair_value_byte(nvpair_t *nvp);
+int8_t fnvpair_value_int8(nvpair_t *nvp);
+int16_t fnvpair_value_int16(nvpair_t *nvp);
+int32_t fnvpair_value_int32(nvpair_t *nvp);
+int64_t fnvpair_value_int64(nvpair_t *nvp);
+uint8_t fnvpair_value_uint8_t(nvpair_t *nvp);
+uint16_t fnvpair_value_uint16(nvpair_t *nvp);
+uint32_t fnvpair_value_uint32(nvpair_t *nvp);
+uint64_t fnvpair_value_uint64(nvpair_t *nvp);
+char *fnvpair_value_string(nvpair_t *nvp);
+nvlist_t *fnvpair_value_nvlist(nvpair_t *nvp);
+
+#ifdef __cplusplus
+}
+#endif
+
+#endif /* _SYS_NVPAIR_H */
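
As a hedged illustration of the allocation, add, and lookup interfaces above, a userland-style sketch; the "guid" and "name" strings are arbitrary, and the last nvlist_alloc() argument is the kmem flag (0 outside the kernel):

    #include <sys/nvpair.h>

    static int
    nvlist_example(void)
    {
    	nvlist_t *nvl;
    	uint64_t guid;
    	int err;

    	/* NV_UNIQUE_NAME: re-adding an existing name replaces the pair */
    	if ((err = nvlist_alloc(&nvl, NV_UNIQUE_NAME, 0)) != 0)
    		return (err);
    	if ((err = nvlist_add_uint64(nvl, "guid", 12345ULL)) != 0 ||
    	    (err = nvlist_add_string(nvl, "name", "tank")) != 0) {
    		nvlist_free(nvl);
    		return (err);
    	}
    	err = nvlist_lookup_uint64(nvl, "guid", &guid);	/* guid == 12345 */
    	nvlist_free(nvl);
    	return (err);
    }
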
diff --git a/sys/cddl/contrib/opensolaris/uts/common/sys/nvpair_impl.h b/sys/cddl/contrib/opensolaris/uts/common/sys/nvpair_impl.h
new file mode 100644
index 000000000000..c9874b3e4db7
--- /dev/null
+++ b/sys/cddl/contrib/opensolaris/uts/common/sys/nvpair_impl.h
@@ -0,0 +1,90 @@
+/*
+ * CDDL HEADER START
+ *
+ * The contents of this file are subject to the terms of the
+ * Common Development and Distribution License, Version 1.0 only
+ * (the "License"). You may not use this file except in compliance
+ * with the License.
+ *
+ * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
+ * or http://www.opensolaris.org/os/licensing.
+ * See the License for the specific language governing permissions
+ * and limitations under the License.
+ *
+ * When distributing Covered Code, include this CDDL HEADER in each
+ * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
+ * If applicable, add the following below this CDDL HEADER, with the
+ * fields enclosed by brackets "[]" replaced with your own identifying
+ * information: Portions Copyright [yyyy] [name of copyright owner]
+ *
+ * CDDL HEADER END
+ */
+/*
+ * Copyright 2004 Sun Microsystems, Inc. All rights reserved.
+ * Use is subject to license terms.
+ */
+
+/*
+ * Copyright (c) 2017 by Delphix. All rights reserved.
+ */
+
+#ifndef _NVPAIR_IMPL_H
+#define _NVPAIR_IMPL_H
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+#include <sys/nvpair.h>
+
+/*
+ * The structures here are provided for information and debugging purposes
+ * only; they may be changed in the future.
+ */
+
+/*
+ * implementation linked list for pre-packed data
+ */
+typedef struct i_nvp i_nvp_t;
+
+struct i_nvp {
+ union {
+ /* ensure alignment */
+ uint64_t _nvi_align;
+
+ struct {
+ /* pointer to next nvpair */
+ i_nvp_t *_nvi_next;
+
+ /* pointer to prev nvpair */
+ i_nvp_t *_nvi_prev;
+
+ /* next pair in table bucket */
+ i_nvp_t *_nvi_hashtable_next;
+ } _nvi;
+ } _nvi_un;
+
+ /* nvpair */
+ nvpair_t nvi_nvp;
+};
+#define nvi_next _nvi_un._nvi._nvi_next
+#define nvi_prev _nvi_un._nvi._nvi_prev
+#define nvi_hashtable_next _nvi_un._nvi._nvi_hashtable_next
+
+typedef struct {
+ i_nvp_t *nvp_list; /* linked list of nvpairs */
+ i_nvp_t *nvp_last; /* last nvpair */
+ i_nvp_t *nvp_curr; /* current walker nvpair */
+ nv_alloc_t *nvp_nva; /* pluggable allocator */
+ uint32_t nvp_stat; /* internal state */
+
+ i_nvp_t **nvp_hashtable; /* table of entries used for lookup */
+ uint32_t nvp_nbuckets; /* # of buckets in hash table */
+ uint32_t nvp_nentries; /* # of entries in hash table */
+} nvpriv_t;
+
+#ifdef __cplusplus
+}
+#endif
+
+#endif /* _NVPAIR_IMPL_H */
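
The nvp_list/nvp_last pointers above are what the public walker traverses; a short sketch of visiting pairs in insertion order, assuming an already-populated list:

    #include <stdio.h>
    #include <sys/nvpair.h>

    static void
    dump_pairs(nvlist_t *nvl)
    {
    	nvpair_t *nvp;

    	for (nvp = nvlist_next_nvpair(nvl, NULL); nvp != NULL;
    	    nvp = nvlist_next_nvpair(nvl, nvp)) {
    		/* nvpair_name() and nvpair_type() identify each pair */
    		(void) printf("%s: type %d\n", nvpair_name(nvp),
    		    (int)nvpair_type(nvp));
    	}
    }
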
diff --git a/sys/cddl/contrib/opensolaris/uts/common/sys/processor.h b/sys/cddl/contrib/opensolaris/uts/common/sys/processor.h
new file mode 100644
index 000000000000..ec4b7471e50c
--- /dev/null
+++ b/sys/cddl/contrib/opensolaris/uts/common/sys/processor.h
@@ -0,0 +1,140 @@
+/*
+ * CDDL HEADER START
+ *
+ * The contents of this file are subject to the terms of the
+ * Common Development and Distribution License (the "License").
+ * You may not use this file except in compliance with the License.
+ *
+ * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
+ * or http://www.opensolaris.org/os/licensing.
+ * See the License for the specific language governing permissions
+ * and limitations under the License.
+ *
+ * When distributing Covered Code, include this CDDL HEADER in each
+ * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
+ * If applicable, add the following below this CDDL HEADER, with the
+ * fields enclosed by brackets "[]" replaced with your own identifying
+ * information: Portions Copyright [yyyy] [name of copyright owner]
+ *
+ * CDDL HEADER END
+ */
+/*
+ * Copyright (c) 1984, 1986, 1987, 1988, 1989 AT&T
+ * All Rights Reserved
+ *
+ */
+
+/*
+ * Copyright 2014 Garrett D'Amore <garrett@damore.org>
+ *
+ * Copyright 2008 Sun Microsystems, Inc. All rights reserved.
+ * Use is subject to license terms.
+ */
+
+#ifndef _SYS_PROCESSOR_H
+#define _SYS_PROCESSOR_H
+
+#include <sys/types.h>
+#include <sys/procset.h>
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+/*
+ * Definitions for p_online, processor_info & lgrp system calls.
+ */
+
+/*
+ * Type for an lgrpid
+ */
+typedef uint16_t lgrpid_t;
+
+/*
+ * Type for processor name (CPU number).
+ */
+typedef int processorid_t;
+typedef int chipid_t;
+
+/*
+ * Flags and return values for p_online(2), and pi_state for processor_info(2).
+ * These flags are *not* for in-kernel examination of CPU states.
+ * See <sys/cpuvar.h> for appropriate informational functions.
+ */
+#define P_OFFLINE 0x0001 /* processor is offline, as quiet as possible */
+#define P_ONLINE 0x0002 /* processor is online */
+#define P_STATUS 0x0003 /* value passed to p_online to request status */
+#define P_FAULTED 0x0004 /* processor is offline, in faulted state */
+#define P_POWEROFF 0x0005 /* processor is powered off */
+#define P_NOINTR 0x0006 /* processor is online, but no I/O interrupts */
+#define P_SPARE 0x0007 /* processor is offline, can be reactivated */
+#define P_BAD P_FAULTED /* unused but defined by USL */
+#define P_FORCED 0x10000000 /* force processor offline */
+
+/*
+ * String names for processor states defined above.
+ */
+#define PS_OFFLINE "off-line"
+#define PS_ONLINE "on-line"
+#define PS_FAULTED "faulted"
+#define PS_POWEROFF "powered-off"
+#define PS_NOINTR "no-intr"
+#define PS_SPARE "spare"
+
+/*
+ * Structure filled in by processor_info(2). This structure
+ * SHOULD NOT BE MODIFIED. Changes to the structure would
+ * negate ABI compatibility.
+ *
+ * The string fields are guaranteed to be null-terminated.
+ *
+ * The pi_fputypes field contains a (possibly empty) comma-separated
+ * list of floating point identifier strings.
+ */
+#define PI_TYPELEN 16 /* max size of CPU type string */
+#define PI_FPUTYPE 32 /* max size of FPU types string */
+
+typedef struct {
+ int pi_state; /* processor state, see above */
+ char pi_processor_type[PI_TYPELEN]; /* ASCII CPU type */
+ char pi_fputypes[PI_FPUTYPE]; /* ASCII FPU types */
+ int pi_clock; /* CPU clock freq in MHz */
+} processor_info_t;
+
+/*
+ * Binding values for processor_bind(2)
+ */
+#define PBIND_NONE -1 /* LWP/thread is not bound */
+#define PBIND_QUERY -2 /* don't set, just return the binding */
+#define PBIND_HARD -3 /* prevents offlining CPU (default) */
+#define PBIND_SOFT -4 /* allows offlining CPU */
+#define PBIND_QUERY_TYPE -5 /* Return binding type */
+
+/*
+ * User-level system call interface prototypes
+ */
+#ifndef _KERNEL
+
+extern int p_online(processorid_t processorid, int flag);
+extern int processor_info(processorid_t processorid,
+ processor_info_t *infop);
+extern int processor_bind(idtype_t idtype, id_t id,
+ processorid_t processorid, processorid_t *obind);
+extern processorid_t getcpuid(void);
+extern lgrpid_t gethomelgroup(void);
+
+#else /* _KERNEL */
+
+/*
+ * Internal interface prototypes
+ */
+extern int p_online_internal(processorid_t, int, int *);
+extern int p_online_internal_locked(processorid_t, int, int *);
+
+#endif /* !_KERNEL */
+
+#ifdef __cplusplus
+}
+#endif
+
+#endif /* _SYS_PROCESSOR_H */
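
On systems that ship these calls, a sketch of the userland interface; the output formatting is illustrative, and processor_info(2) fills pi_state with one of the P_* values above:

    #include <stdio.h>
    #include <sys/processor.h>

    static void
    show_cpu(processorid_t cpu)
    {
    	processor_info_t pi;

    	/* P_STATUS returns the current state without changing it */
    	(void) p_online(cpu, P_STATUS);
    	if (processor_info(cpu, &pi) == 0)
    		(void) printf("cpu%d: state %d, %s, %d MHz\n", (int)cpu,
    		    pi.pi_state, pi.pi_processor_type, pi.pi_clock);
    }
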
diff --git a/sys/cddl/contrib/opensolaris/uts/common/sys/procset.h b/sys/cddl/contrib/opensolaris/uts/common/sys/procset.h
new file mode 100644
index 000000000000..a7d58e52534c
--- /dev/null
+++ b/sys/cddl/contrib/opensolaris/uts/common/sys/procset.h
@@ -0,0 +1,166 @@
+/*
+ * CDDL HEADER START
+ *
+ * The contents of this file are subject to the terms of the
+ * Common Development and Distribution License, Version 1.0 only
+ * (the "License"). You may not use this file except in compliance
+ * with the License.
+ *
+ * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
+ * or http://www.opensolaris.org/os/licensing.
+ * See the License for the specific language governing permissions
+ * and limitations under the License.
+ *
+ * When distributing Covered Code, include this CDDL HEADER in each
+ * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
+ * If applicable, add the following below this CDDL HEADER, with the
+ * fields enclosed by brackets "[]" replaced with your own identifying
+ * information: Portions Copyright [yyyy] [name of copyright owner]
+ *
+ * CDDL HEADER END
+ */
+/*
+ * Copyright 2004 Sun Microsystems, Inc. All rights reserved.
+ * Use is subject to license terms.
+ */
+
+/* Copyright (c) 1984, 1986, 1987, 1988, 1989 AT&T */
+/* All Rights Reserved */
+
+
+#ifndef _SYS_PROCSET_H
+#define _SYS_PROCSET_H
+
+#pragma ident "%Z%%M% %I% %E% SMI" /* SVr4.0 1.6 */
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+#include <sys/feature_tests.h>
+#include <sys/types.h>
+#include <sys/signal.h>
+
+/*
+ * This file defines the data needed to specify a set of
+ * processes. These types are used by the sigsend, sigsendset,
+ * priocntl, priocntlset, waitid, evexit, and evexitset system
+ * calls.
+ */
+#define P_INITPID 1
+#define P_INITUID 0
+#define P_INITPGID 0
+
+#ifndef _IDTYPE_T_DECLARED
+
+/*
+ * The following defines the values for an identifier type. It
+ * specifies the interpretation of an id value. An idtype and
+ * id together define a simple set of processes.
+ */
+typedef enum
+#if !defined(_XPG4_2) || defined(__EXTENSIONS__)
+ idtype /* pollutes XPG4.2 namespace */
+#endif
+ {
+ P_PID, /* A process identifier. */
+ P_PPID, /* A parent process identifier. */
+ P_PGID, /* A process group (job control group) */
+ /* identifier. */
+ P_SID, /* A session identifier. */
+ P_CID, /* A scheduling class identifier. */
+ P_UID, /* A user identifier. */
+ P_GID, /* A group identifier. */
+ P_ALL, /* All processes. */
+ P_LWPID, /* An LWP identifier. */
+ P_TASKID, /* A task identifier. */
+ P_PROJID, /* A project identifier. */
+ P_POOLID, /* A pool identifier. */
+ P_ZONEID, /* A zone identifier. */
+ P_CTID, /* A (process) contract identifier. */
+ P_CPUID, /* CPU identifier. */
+ P_PSETID /* Processor set identifier */
+} idtype_t;
+
+#define _IDTYPE_T_DECLARED
+
+#endif
+
+/*
+ * The following defines the operations which can be performed to
+ * combine two simple sets of processes to form another set of
+ * processes.
+ */
+#if !defined(_XPG4_2) || defined(__EXTENSIONS__)
+typedef enum idop {
+ POP_DIFF, /* Set difference. The processes which */
+ /* are in the left operand set and not */
+ /* in the right operand set. */
+	POP_AND,	/* Set conjunction. The processes */
+ /* which are in both the left and right */
+ /* operand sets. */
+	POP_OR,		/* Set disjunction. The processes */
+ /* which are in either the left or the */
+ /* right operand sets (or both). */
+ POP_XOR /* Set exclusive or. The processes */
+ /* which are in either the left or */
+ /* right operand sets but not in both. */
+} idop_t;
+
+
+/*
+ * The following structure is used to define a set of processes.
+ * The set is defined in terms of two simple sets of processes
+ * and an operator which operates on these two operand sets.
+ */
+typedef struct procset {
+	idop_t	p_op;		/* The operator connecting the */
+ /* following two operands each */
+ /* of which is a simple set of */
+ /* processes. */
+
+ idtype_t p_lidtype;
+ /* The type of the left operand */
+ /* simple set. */
+ id_t p_lid; /* The id of the left operand. */
+
+ idtype_t p_ridtype;
+ /* The type of the right */
+ /* operand simple set. */
+ id_t p_rid; /* The id of the right operand. */
+} procset_t;
+
+/*
+ * The following macro can be used to initialize a procset_t
+ * structure.
+ */
+#define setprocset(psp, op, ltype, lid, rtype, rid) \
+ (psp)->p_op = (op); \
+ (psp)->p_lidtype = (ltype); \
+ (psp)->p_lid = (lid); \
+ (psp)->p_ridtype = (rtype); \
+ (psp)->p_rid = (rid);
+
+#endif /* !defined(_XPG4_2) || defined(__EXTENSIONS__) */
+
+#ifdef illumos
+#ifdef _KERNEL
+
+struct proc;
+
+extern int dotoprocs(procset_t *, int (*)(), char *);
+extern int dotolwp(procset_t *, int (*)(), char *);
+extern int procinset(struct proc *, procset_t *);
+extern int sigsendproc(struct proc *, sigsend_t *);
+extern int sigsendset(procset_t *, sigsend_t *);
+extern boolean_t cur_inset_only(procset_t *);
+extern id_t getmyid(idtype_t);
+
+#endif /* _KERNEL */
+#endif
+
+#ifdef __cplusplus
+}
+#endif
+
+#endif /* _SYS_PROCSET_H */
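
A sketch of building a process set with the setprocset() macro; the idtype and id values are illustrative and describe the intersection of "processes owned by uid 100" and "processes in process group 42":

    #include <sys/procset.h>

    static void
    build_set(void)
    {
    	procset_t pset;

    	setprocset(&pset, POP_AND, P_UID, 100, P_PGID, 42);
    }

Note that setprocset() expands to five separate assignment statements, so it must not be used as the unbraced body of an if or loop.
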
diff --git a/sys/cddl/contrib/opensolaris/uts/common/sys/synch.h b/sys/cddl/contrib/opensolaris/uts/common/sys/synch.h
new file mode 100644
index 000000000000..6431bf22bca0
--- /dev/null
+++ b/sys/cddl/contrib/opensolaris/uts/common/sys/synch.h
@@ -0,0 +1,162 @@
+/*
+ * CDDL HEADER START
+ *
+ * The contents of this file are subject to the terms of the
+ * Common Development and Distribution License (the "License").
+ * You may not use this file except in compliance with the License.
+ *
+ * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
+ * or http://www.opensolaris.org/os/licensing.
+ * See the License for the specific language governing permissions
+ * and limitations under the License.
+ *
+ * When distributing Covered Code, include this CDDL HEADER in each
+ * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
+ * If applicable, add the following below this CDDL HEADER, with the
+ * fields enclosed by brackets "[]" replaced with your own identifying
+ * information: Portions Copyright [yyyy] [name of copyright owner]
+ *
+ * CDDL HEADER END
+ */
+
+/*
+ * Copyright 2007 Sun Microsystems, Inc. All rights reserved.
+ * Use is subject to license terms.
+ */
+
+#ifndef _SYS_SYNCH_H
+#define _SYS_SYNCH_H
+
+#pragma ident "%Z%%M% %I% %E% SMI"
+
+#ifndef _ASM
+#include <sys/types.h>
+#include <sys/int_types.h>
+#endif /* _ASM */
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+#ifndef _ASM
+/*
+ * Thread and LWP mutexes have the same type
+ * definitions.
+ *
+ * NOTE:
+ *
+ * POSIX requires that <pthread.h> define the structures pthread_mutex_t
+ * and pthread_cond_t. Although these structures are identical to mutex_t
+ * (lwp_mutex_t) and cond_t (lwp_cond_t), defined here, a typedef of these
+ * types would require including <synch.h> in <pthread.h>, pulling in
+ * non-posix symbols/constants, violating POSIX namespace restrictions. Hence,
+ * pthread_mutex_t/pthread_cond_t have been redefined (in <sys/types.h>).
+ * Any modifications done to mutex_t/lwp_mutex_t or cond_t/lwp_cond_t must
+ * also be done to pthread_mutex_t/pthread_cond_t.
+ */
+typedef struct _lwp_mutex {
+ struct {
+ uint16_t flag1;
+ uint8_t flag2;
+ uint8_t ceiling;
+ union {
+ uint16_t bcptype;
+ struct {
+ uint8_t count_type1;
+ uint8_t count_type2;
+ } mtype_rcount;
+ } mbcp_type_un;
+ uint16_t magic;
+ } flags;
+ union {
+ struct {
+ uint8_t pad[8];
+ } lock64;
+ struct {
+ uint32_t ownerpid;
+ uint32_t lockword;
+ } lock32;
+ upad64_t owner64;
+ } lock;
+ upad64_t data;
+} lwp_mutex_t;
+
+/*
+ * Thread and LWP condition variables have the same
+ * type definition.
+ * NOTE:
+ * The layout of the following structure should be kept in sync with the
+ * layout of pthread_cond_t in sys/types.h. See NOTE above for lwp_mutex_t.
+ */
+typedef struct _lwp_cond {
+ struct {
+ uint8_t flag[4];
+ uint16_t type;
+ uint16_t magic;
+ } flags;
+ upad64_t data;
+} lwp_cond_t;
+
+/*
+ * LWP semaphores
+ */
+typedef struct _lwp_sema {
+ uint32_t count; /* semaphore count */
+ uint16_t type;
+ uint16_t magic;
+ uint8_t flags[8]; /* last byte reserved for waiters */
+ upad64_t data; /* optional data */
+} lwp_sema_t;
+
+/*
+ * Thread and LWP rwlocks have the same type definition.
+ * NOTE: The layout of this structure should be kept in sync with the layout
+ * of the corresponding structure of pthread_rwlock_t in sys/types.h.
+ * Also, because we have to deal with C++, there is an identical structure
+ * for rwlock_t in head/sync.h that we cannot change.
+ */
+typedef struct _lwp_rwlock {
+ int32_t readers; /* rwstate word */
+ uint16_t type;
+ uint16_t magic;
+ lwp_mutex_t mutex; /* used with process-shared rwlocks */
+ lwp_cond_t readercv; /* used only to indicate ownership */
+ lwp_cond_t writercv; /* used only to indicate ownership */
+} lwp_rwlock_t;
+
+#endif /* _ASM */
+/*
+ * Definitions of synchronization types.
+ */
+#define USYNC_THREAD 0x00 /* private to a process */
+#define USYNC_PROCESS 0x01 /* shared by processes */
+
+/* Keep the following values in sync with pthread.h */
+#define LOCK_NORMAL 0x00 /* same as USYNC_THREAD */
+#define LOCK_SHARED 0x01 /* same as USYNC_PROCESS */
+#define LOCK_ERRORCHECK 0x02 /* error check lock */
+#define LOCK_RECURSIVE 0x04 /* recursive lock */
+#define LOCK_PRIO_INHERIT 0x10 /* priority inheritance lock */
+#define LOCK_PRIO_PROTECT 0x20 /* priority ceiling lock */
+#define LOCK_ROBUST 0x40 /* robust lock */
+
+/*
+ * USYNC_PROCESS_ROBUST is a deprecated historical type. It is mapped
+ * into (USYNC_PROCESS | LOCK_ROBUST) by mutex_init(). Application code
+ * should be revised to use (USYNC_PROCESS | LOCK_ROBUST) rather than this.
+ */
+#define USYNC_PROCESS_ROBUST 0x08
+
+/*
+ * lwp_mutex_t flags
+ */
+#define LOCK_OWNERDEAD 0x1
+#define LOCK_NOTRECOVERABLE 0x2
+#define LOCK_INITED 0x4
+#define LOCK_UNMAPPED 0x8
+
+#ifdef __cplusplus
+}
+#endif
+
+#endif /* _SYS_SYNCH_H */
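
These types underlie the userland synchronization API declared in <synch.h> (a different header from this one); a hedged sketch of the replacement for the deprecated USYNC_PROCESS_ROBUST type, as described above:

    #include <synch.h>

    static mutex_t m;	/* same layout as lwp_mutex_t */

    static void
    init_shared_lock(void)
    {
    	/* robust, process-shared; preferred over USYNC_PROCESS_ROBUST */
    	(void) mutex_init(&m, USYNC_PROCESS | LOCK_ROBUST, NULL);
    }
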
diff --git a/sys/cddl/contrib/opensolaris/uts/common/sys/sysevent.h b/sys/cddl/contrib/opensolaris/uts/common/sys/sysevent.h
new file mode 100644
index 000000000000..9a259828fc1f
--- /dev/null
+++ b/sys/cddl/contrib/opensolaris/uts/common/sys/sysevent.h
@@ -0,0 +1,289 @@
+/*
+ * CDDL HEADER START
+ *
+ * The contents of this file are subject to the terms of the
+ * Common Development and Distribution License (the "License").
+ * You may not use this file except in compliance with the License.
+ *
+ * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
+ * or http://www.opensolaris.org/os/licensing.
+ * See the License for the specific language governing permissions
+ * and limitations under the License.
+ *
+ * When distributing Covered Code, include this CDDL HEADER in each
+ * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
+ * If applicable, add the following below this CDDL HEADER, with the
+ * fields enclosed by brackets "[]" replaced with your own identifying
+ * information: Portions Copyright [yyyy] [name of copyright owner]
+ *
+ * CDDL HEADER END
+ */
+
+/*
+ * Copyright (c) 2000, 2010, Oracle and/or its affiliates. All rights reserved.
+ */
+
+#ifndef _SYS_SYSEVENT_H
+#define _SYS_SYSEVENT_H
+
+#include <sys/nvpair.h>
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+#ifndef NULL
+#if defined(_LP64) && !defined(__cplusplus)
+#define NULL 0L
+#else
+#define NULL 0
+#endif
+#endif
+
+/* Internal registration class and subclass */
+#define EC_ALL "register_all_classes"
+#define EC_SUB_ALL "register_all_subclasses"
+
+/*
+ * Event allocation/enqueuing sleep/nosleep flags
+ */
+#define SE_SLEEP 0
+#define SE_NOSLEEP 1
+
+/* Framework error codes */
+#define SE_EINVAL 1 /* Invalid argument */
+#define SE_ENOMEM 2 /* Unable to allocate memory */
+#define SE_EQSIZE 3 /* Maximum event q size exceeded */
+#define SE_EFAULT 4 /* Copy fault */
+#define SE_NOTFOUND 5 /* Attribute not found */
+#define SE_NO_TRANSPORT 6 /* sysevent transport down */
+
+/* Internal data types */
+
+#define SE_DATA_TYPE_BYTE DATA_TYPE_BYTE
+#define SE_DATA_TYPE_INT16 DATA_TYPE_INT16
+#define SE_DATA_TYPE_UINT16 DATA_TYPE_UINT16
+#define SE_DATA_TYPE_INT32 DATA_TYPE_INT32
+#define SE_DATA_TYPE_UINT32 DATA_TYPE_UINT32
+#define SE_DATA_TYPE_INT64 DATA_TYPE_INT64
+#define SE_DATA_TYPE_UINT64 DATA_TYPE_UINT64
+#define SE_DATA_TYPE_STRING DATA_TYPE_STRING
+#define SE_DATA_TYPE_BYTES DATA_TYPE_BYTE_ARRAY
+#define SE_DATA_TYPE_TIME DATA_TYPE_HRTIME
+
+#define SE_KERN_PID 0
+
+#define SUNW_VENDOR "SUNW"
+#define SE_USR_PUB "usr:"
+#define SE_KERN_PUB "kern:"
+#define SUNW_KERN_PUB SUNW_VENDOR ":" SE_KERN_PUB
+#define SUNW_USR_PUB SUNW_VENDOR ":" SE_USR_PUB
+
+/*
+ * Event header and attribute value limits
+ */
+#define MAX_ATTR_NAME 1024
+#define MAX_STRING_SZ 1024
+#define MAX_BYTE_ARRAY 1024
+
+#define MAX_CLASS_LEN 64
+#define MAX_SUBCLASS_LEN 64
+#define MAX_PUB_LEN 128
+#define MAX_CHNAME_LEN 128
+#define MAX_SUBID_LEN 16
+
+/*
+ * Limit for the event payload size
+ */
+#define MAX_EV_SIZE_LEN (SHRT_MAX/4)
+
+/* Opaque sysevent_t data type */
+typedef void *sysevent_t;
+
+/* Opaque channel bind data type */
+typedef void evchan_t;
+
+/* sysevent attribute list */
+typedef nvlist_t sysevent_attr_list_t;
+
+/* sysevent attribute name-value pair */
+typedef nvpair_t sysevent_attr_t;
+
+/* Unique event identifier */
+typedef struct sysevent_id {
+ uint64_t eid_seq;
+ hrtime_t eid_ts;
+} sysevent_id_t;
+
+/* Event attribute value structures */
+typedef struct sysevent_bytes {
+ int32_t size;
+ uchar_t *data;
+} sysevent_bytes_t;
+
+typedef struct sysevent_value {
+ int32_t value_type; /* data type */
+ union {
+ uchar_t sv_byte;
+ int16_t sv_int16;
+ uint16_t sv_uint16;
+ int32_t sv_int32;
+ uint32_t sv_uint32;
+ int64_t sv_int64;
+ uint64_t sv_uint64;
+ hrtime_t sv_time;
+ char *sv_string;
+ sysevent_bytes_t sv_bytes;
+ } value;
+} sysevent_value_t;
+
+/*
+ * The following flags determine the memory allocation semantics to use for
+ * kernel event buffer allocation by userland and kernel versions of
+ * sysevent_evc_publish().
+ *
+ * EVCH_SLEEP and EVCH_NOSLEEP respectively map to KM_SLEEP and KM_NOSLEEP.
+ * EVCH_TRYHARD is a kernel-only publish flag that allows event allocation
+ * routines to use alternate kmem caches in situations where free memory
+ * may be low. Kernel callers of sysevent_evc_publish() must set flags to
+ * one of EVCH_SLEEP, EVCH_NOSLEEP or EVCH_TRYHARD. Userland callers of
+ * sysevent_evc_publish() must set flags to one of EVCH_SLEEP or EVCH_NOSLEEP.
+ *
+ * EVCH_QWAIT determines whether or not we should wait for slots in the event
+ * queue at publication time. EVCH_QWAIT may be used by kernel and userland
+ * publishers and must be used in conjunction with any one of EVCH_SLEEP,
+ * EVCH_NOSLEEP or EVCH_TRYHARD (kernel-only).
+ */
+
+#define EVCH_NOSLEEP 0x0001 /* No sleep on kmem_alloc() */
+#define EVCH_SLEEP 0x0002 /* Sleep on kmem_alloc() */
+#define EVCH_TRYHARD 0x0004 /* May use alternate kmem cache for alloc */
+#define EVCH_QWAIT 0x0008 /* Wait for slot in event queue */
+
+/*
+ * Meaning of flags for subscribe. Bits 8 to 15 are dedicated to
+ * the consolidation private interface, so flags defined here are restricted
+ * to the LSB.
+ *
+ * EVCH_SUB_KEEP indicates that this subscription should persist even if
+ * this subscriber id should die unexpectedly; matching events will be
+ * queued (up to a limit) and will be delivered if/when we restart again
+ * with the same subscriber id.
+ */
+#define EVCH_SUB_KEEP 0x01
+
+/*
+ * Subscriptions may be wildcarded, but we limit the number of
+ * wildcards permitted.
+ */
+#define EVCH_WILDCARD_MAX 10
+
+/*
+ * Used in unsubscribe to indicate all subscriber ids for a channel.
+ */
+#define EVCH_ALLSUB "all_subs"
+
+/*
+ * Meaning of flags parameter of channel bind function
+ *
+ * EVCH_CREAT indicates to create a channel if not already present.
+ *
+ * EVCH_HOLD_PEND indicates that events should be published to this
+ * channel even if there are no matching subscribers present; when
+ * a subscriber belatedly binds to the channel and registers its
+ * subscriptions, it will receive events that predate the bind.
+ * If the channel is closed, however, with no remaining bindings then
+ * the channel is destroyed.
+ *
+ * EVCH_HOLD_PEND_INDEF is a stronger version of EVCH_HOLD_PEND -
+ * even if the channel has no remaining bindings it will not be
+ * destroyed so long as events remain unconsumed. This is suitable for
+ * use with short-lived event producers that may bind to (create) the
+ * channel and exit before the intended consumer has started.
+ */
+#define EVCH_CREAT 0x0001
+#define EVCH_HOLD_PEND 0x0002
+#define EVCH_HOLD_PEND_INDEF 0x0004
+#define EVCH_B_FLAGS 0x0007 /* All valid bits */
+
+/*
+ * Meaning of commands of evc_control function
+ */
+#define EVCH_GET_CHAN_LEN_MAX 1 /* Get event queue length limit */
+#define EVCH_GET_CHAN_LEN 2 /* Get event queue length */
+#define EVCH_SET_CHAN_LEN 3 /* Set event queue length */
+#define EVCH_CMD_LAST EVCH_SET_CHAN_LEN /* Last command */
+
+#ifdef illumos
+/*
+ * Shared user/kernel event channel interface definitions
+ */
+extern int sysevent_evc_bind(const char *, evchan_t **, uint32_t);
+extern int sysevent_evc_unbind(evchan_t *);
+extern int sysevent_evc_subscribe(evchan_t *, const char *, const char *,
+ int (*)(sysevent_t *, void *), void *, uint32_t);
+extern int sysevent_evc_unsubscribe(evchan_t *, const char *);
+extern int sysevent_evc_publish(evchan_t *, const char *, const char *,
+ const char *, const char *, nvlist_t *, uint32_t);
+extern int sysevent_evc_control(evchan_t *, int, ...);
+extern int sysevent_evc_setpropnvl(evchan_t *, nvlist_t *);
+extern int sysevent_evc_getpropnvl(evchan_t *, nvlist_t **);
+#endif /* illumos */
+
+#ifndef _KERNEL
+
+#ifdef illumos
+/*
+ * Userland-only event channel interfaces
+ */
+
+#include <door.h>
+
+typedef struct sysevent_subattr sysevent_subattr_t;
+
+extern sysevent_subattr_t *sysevent_subattr_alloc(void);
+extern void sysevent_subattr_free(sysevent_subattr_t *);
+
+extern void sysevent_subattr_thrattr(sysevent_subattr_t *, pthread_attr_t *);
+extern void sysevent_subattr_sigmask(sysevent_subattr_t *, sigset_t *);
+
+extern void sysevent_subattr_thrcreate(sysevent_subattr_t *,
+ door_xcreate_server_func_t *, void *);
+extern void sysevent_subattr_thrsetup(sysevent_subattr_t *,
+ door_xcreate_thrsetup_func_t *, void *);
+
+extern int sysevent_evc_xsubscribe(evchan_t *, const char *, const char *,
+ int (*)(sysevent_t *, void *), void *, uint32_t, sysevent_subattr_t *);
+#endif /* illumos */
+
+#else
+
+/*
+ * Kernel log_event interfaces.
+ */
+extern int log_sysevent(sysevent_t *, int, sysevent_id_t *);
+
+extern sysevent_t *sysevent_alloc(char *, char *, char *, int);
+extern void sysevent_free(sysevent_t *);
+extern int sysevent_add_attr(sysevent_attr_list_t **, char *,
+ sysevent_value_t *, int);
+extern void sysevent_free_attr(sysevent_attr_list_t *);
+extern int sysevent_attach_attributes(sysevent_t *, sysevent_attr_list_t *);
+extern void sysevent_detach_attributes(sysevent_t *);
+#ifdef illumos
+extern char *sysevent_get_class_name(sysevent_t *);
+extern char *sysevent_get_subclass_name(sysevent_t *);
+extern uint64_t sysevent_get_seq(sysevent_t *);
+extern void sysevent_get_time(sysevent_t *, hrtime_t *);
+extern size_t sysevent_get_size(sysevent_t *);
+extern char *sysevent_get_pub(sysevent_t *);
+extern int sysevent_get_attr_list(sysevent_t *, nvlist_t **);
+#endif /* illumos */
+
+#endif /* _KERNEL */
+
+#ifdef __cplusplus
+}
+#endif
+
+#endif /* _SYS_SYSEVENT_H */
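
A hedged sketch of the kernel-side publication interfaces declared above; the publisher string and attribute value are illustrative, and error unwinding is abbreviated:

    #include <sys/sysevent.h>
    #include <sys/sysevent/eventdefs.h>

    static void
    publish_example(void)
    {
    	sysevent_t *ev;
    	sysevent_attr_list_t *attrs = NULL;
    	sysevent_value_t val;
    	sysevent_id_t eid;

    	ev = sysevent_alloc(EC_DEV_STATUS, ESC_DEV_DLE, "kern:example",
    	    SE_SLEEP);
    	if (ev == NULL)
    		return;
    	val.value_type = SE_DATA_TYPE_STRING;
    	val.value.sv_string = "/pci@0,0/disk@0";	/* illustrative path */
    	if (sysevent_add_attr(&attrs, "phys_path", &val, SE_SLEEP) == 0 &&
    	    sysevent_attach_attributes(ev, attrs) == 0)
    		(void) log_sysevent(ev, SE_SLEEP, &eid);
    	sysevent_free(ev);
    }
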
diff --git a/sys/cddl/contrib/opensolaris/uts/common/sys/sysevent/dev.h b/sys/cddl/contrib/opensolaris/uts/common/sys/sysevent/dev.h
new file mode 100644
index 000000000000..9d3107d09011
--- /dev/null
+++ b/sys/cddl/contrib/opensolaris/uts/common/sys/sysevent/dev.h
@@ -0,0 +1,256 @@
+/*
+ * CDDL HEADER START
+ *
+ * The contents of this file are subject to the terms of the
+ * Common Development and Distribution License (the "License").
+ * You may not use this file except in compliance with the License.
+ *
+ * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
+ * or http://www.opensolaris.org/os/licensing.
+ * See the License for the specific language governing permissions
+ * and limitations under the License.
+ *
+ * When distributing Covered Code, include this CDDL HEADER in each
+ * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
+ * If applicable, add the following below this CDDL HEADER, with the
+ * fields enclosed by brackets "[]" replaced with your own identifying
+ * information: Portions Copyright [yyyy] [name of copyright owner]
+ *
+ * CDDL HEADER END
+ */
+/*
+ * Copyright 2006 Sun Microsystems, Inc. All rights reserved.
+ * Use is subject to license terms.
+ */
+
+#ifndef _SYS_SYSEVENT_DEV_H
+#define _SYS_SYSEVENT_DEV_H
+
+#pragma ident "%Z%%M% %I% %E% SMI"
+
+#include <sys/sysevent/eventdefs.h>
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+/*
+ * Event schema for EC_DEV_ADD/ESC_DISK
+ *
+ * Event Class - EC_DEV_ADD
+ * Event Sub-Class - ESC_DISK
+ *
+ * Attribute Name - EV_VERSION
+ * Attribute Type - DATA_TYPE_INT32
+ * Attribute Value - event version number
+ *
+ * Attribute Name - DEV_NAME
+ * Attribute Type - DATA_TYPE_STRING
+ * Attribute Value - /dev name to the raw device.
+ * The name does not include the slice number component.
+ *
+ * Attribute Name - DEV_PHYS_PATH
+ * Attribute Type - DATA_TYPE_STRING
+ * Attribute Value - physical path of the device without the "/devices"
+ * prefix.
+ *
+ * Attribute Name - DEV_DRIVER_NAME
+ * Attribute Type - DATA_TYPE_STRING
+ * Attribute Value - driver name
+ *
+ * Attribute Name - DEV_INSTANCE
+ * Attribute Type - DATA_TYPE_INT32
+ * Attribute Value - driver instance number
+ *
+ * Attribute Name - DEV_PROP_PREFIX<devinfo_node_property>
+ * Attribute Type - data type of the devinfo_node_property
+ * Attribute Value - value of the devinfo_node_property
+ *
+ *
+ * Event schema for EC_DEV_ADD/ESC_NETWORK
+ *
+ * Event Class - EC_DEV_ADD
+ * Event Sub-Class - ESC_NETWORK
+ *
+ * Attribute Name - EV_VERSION
+ * Attribute Type - DATA_TYPE_INT32
+ * Attribute Value - event version number
+ *
+ * Attribute Name - DEV_NAME
+ * Attribute Type - DATA_TYPE_STRING
+ *	Attribute Value		- /dev name associated with the device if it exists.
+ * /dev name associated with the driver for DLPI
+ * Style-2 only drivers.
+ *
+ * Attribute Name - DEV_PHYS_PATH
+ * Attribute Type - DATA_TYPE_STRING
+ * Attribute Value - physical path of the device without the "/devices"
+ * prefix.
+ *
+ * Attribute Name - DEV_DRIVER_NAME
+ * Attribute Type - DATA_TYPE_STRING
+ * Attribute Value - driver name
+ *
+ * Attribute Name - DEV_INSTANCE
+ * Attribute Type - DATA_TYPE_INT32
+ * Attribute Value - driver instance number
+ *
+ * Attribute Name - DEV_PROP_PREFIX<devinfo_node_property>
+ * Attribute Type - data type of the devinfo_node_property
+ * Attribute Value - value of the devinfo_node_property
+ *
+ *
+ * Event schema for EC_DEV_ADD/ESC_PRINTER
+ *
+ * Event Class - EC_DEV_ADD
+ * Event Sub-Class - ESC_PRINTER
+ *
+ * Attribute Name - EV_VERSION
+ * Attribute Type - DATA_TYPE_INT32
+ * Attribute Value - event version number
+ *
+ * Attribute Name - DEV_NAME
+ * Attribute Type - DATA_TYPE_STRING
+ * Attribute Value - /dev/printers name associated with the device
+ *				  if it exists.
+ * /dev name associated with the device if it exists
+ *
+ * Attribute Name - DEV_PHYS_PATH
+ * Attribute Type - DATA_TYPE_STRING
+ * Attribute Value - physical path of the device without the "/devices"
+ * prefix.
+ *
+ * Attribute Name - DEV_DRIVER_NAME
+ * Attribute Type - DATA_TYPE_STRING
+ * Attribute Value - driver name
+ *
+ * Attribute Name - DEV_INSTANCE
+ * Attribute Type - DATA_TYPE_INT32
+ * Attribute Value - driver instance number
+ *
+ * Attribute Name - DEV_PROP_PREFIX<devinfo_node_property>
+ * Attribute Type - data type of the devinfo_node_property
+ * Attribute Value - value of the devinfo_node_property
+ *
+ *
+ * Event schema for EC_DEV_REMOVE/ESC_DISK
+ *
+ * Event Class - EC_DEV_REMOVE
+ * Event Sub-Class - ESC_DISK
+ *
+ * Attribute Name - EV_VERSION
+ * Attribute Type - DATA_TYPE_INT32
+ * Attribute Value - event version number
+ *
+ * Attribute Name - DEV_NAME
+ * Attribute Type - DATA_TYPE_STRING
+ * Attribute Value - /dev name to the raw device.
+ * The name does not include the slice number component.
+ *
+ * Attribute Name - DEV_PHYS_PATH
+ * Attribute Type - DATA_TYPE_STRING
+ * Attribute Value - physical path of the device without the "/devices"
+ * prefix.
+ *
+ * Attribute Name - DEV_DRIVER_NAME
+ * Attribute Type - DATA_TYPE_STRING
+ * Attribute Value - driver name
+ *
+ * Attribute Name - DEV_INSTANCE
+ * Attribute Type - DATA_TYPE_INT32
+ * Attribute Value - driver instance number
+ *
+ *
+ * Event schema for EC_DEV_REMOVE/ESC_NETWORK
+ *
+ * Event Class - EC_DEV_REMOVE
+ * Event Sub-Class - ESC_NETWORK
+ *
+ * Attribute Name - EV_VERSION
+ * Attribute Type - DATA_TYPE_INT32
+ * Attribute Value - event version number
+ *
+ * Attribute Name - DEV_NAME
+ * Attribute Type - DATA_TYPE_STRING
+ *	Attribute Value		- /dev name associated with the device if it exists.
+ * /dev name associated with the driver for DLPI
+ * Style-2 only drivers.
+ *
+ * Attribute Name - DEV_PHYS_PATH
+ * Attribute Type - DATA_TYPE_STRING
+ * Attribute Value - physical path of the device without the "/devices"
+ * prefix.
+ *
+ * Attribute Name - DEV_DRIVER_NAME
+ * Attribute Type - DATA_TYPE_STRING
+ * Attribute Value - driver name
+ *
+ * Attribute Name - DEV_INSTANCE
+ * Attribute Type - DATA_TYPE_INT32
+ * Attribute Value - driver instance number
+ *
+ *
+ * Event schema for EC_DEV_REMOVE/ESC_PRINTER
+ *
+ * Event Class - EC_DEV_REMOVE
+ * Event Sub-Class - ESC_PRINTER
+ *
+ * Attribute Name - EV_VERSION
+ * Attribute Type - DATA_TYPE_INT32
+ * Attribute Value - event version number
+ *
+ * Attribute Name - DEV_NAME
+ * Attribute Type - DATA_TYPE_STRING
+ * Attribute Value - /dev/printers name associated with the device
+ *				  if it exists.
+ * /dev name associated with the device if it exists
+ *
+ * Attribute Name - DEV_PHYS_PATH
+ * Attribute Type - DATA_TYPE_STRING
+ * Attribute Value - physical path of the device without the "/devices"
+ * prefix.
+ *
+ * Attribute Name - DEV_DRIVER_NAME
+ * Attribute Type - DATA_TYPE_STRING
+ * Attribute Value - driver name
+ *
+ * Attribute Name - DEV_INSTANCE
+ * Attribute Type - DATA_TYPE_INT32
+ * Attribute Value - driver instance number
+ *
+ *
+ * Event schema for EC_DEV_BRANCH/ESC_DEV_BRANCH_ADD or ESC_DEV_BRANCH_REMOVE
+ *
+ * Event Class - EC_DEV_BRANCH
+ * Event Sub-Class - ESC_DEV_BRANCH_ADD or ESC_DEV_BRANCH_REMOVE
+ *
+ * Attribute Name - EV_VERSION
+ * Attribute Type - DATA_TYPE_INT32
+ * Attribute Value - event version number
+ *
+ * Attribute Name - DEV_PHYS_PATH
+ * Attribute Type - DATA_TYPE_STRING
+ * Attribute Value - physical path to the root node of the device subtree
+ * without the "/devices" prefix.
+ */
+
+#define EV_VERSION "version"
+#define DEV_PHYS_PATH "phys_path"
+#define DEV_NAME "dev_name"
+#define DEV_DRIVER_NAME "driver_name"
+#define DEV_INSTANCE "instance"
+#define DEV_PROP_PREFIX "prop-"
+
+#define EV_V1 1
+
+/* maximum number of devinfo node properties added to the event */
+#define MAX_PROP_COUNT 100
+
+/* only properties with size less than PROP_LEN_LIMIT are added to the event */
+#define PROP_LEN_LIMIT 1024
+
+#ifdef __cplusplus
+}
+#endif
+
+#endif /* _SYS_SYSEVENT_DEV_H */
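
A sketch of a consumer unpacking the EC_DEV_ADD/ESC_DISK schema documented above, given the event's attributes as an nvlist; how that list is obtained (for example via sysevent_get_attr_list()) is outside this fragment:

    #include <sys/nvpair.h>
    #include <sys/sysevent/dev.h>

    static void
    handle_disk_add(nvlist_t *attrs)
    {
    	int32_t ver, inst;
    	char *name, *path;

    	if (nvlist_lookup_int32(attrs, EV_VERSION, &ver) != 0 || ver != EV_V1)
    		return;	/* unknown schema version */
    	if (nvlist_lookup_string(attrs, DEV_NAME, &name) == 0 &&
    	    nvlist_lookup_string(attrs, DEV_PHYS_PATH, &path) == 0 &&
    	    nvlist_lookup_int32(attrs, DEV_INSTANCE, &inst) == 0) {
    		/* act on the newly added disk */
    	}
    }
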
diff --git a/sys/cddl/contrib/opensolaris/uts/common/sys/sysevent/eventdefs.h b/sys/cddl/contrib/opensolaris/uts/common/sys/sysevent/eventdefs.h
new file mode 100644
index 000000000000..f9f81e0213cf
--- /dev/null
+++ b/sys/cddl/contrib/opensolaris/uts/common/sys/sysevent/eventdefs.h
@@ -0,0 +1,230 @@
+/*
+ * CDDL HEADER START
+ *
+ * The contents of this file are subject to the terms of the
+ * Common Development and Distribution License (the "License").
+ * You may not use this file except in compliance with the License.
+ *
+ * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
+ * or http://www.opensolaris.org/os/licensing.
+ * See the License for the specific language governing permissions
+ * and limitations under the License.
+ *
+ * When distributing Covered Code, include this CDDL HEADER in each
+ * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
+ * If applicable, add the following below this CDDL HEADER, with the
+ * fields enclosed by brackets "[]" replaced with your own identifying
+ * information: Portions Copyright [yyyy] [name of copyright owner]
+ *
+ * CDDL HEADER END
+ */
+
+/*
+ * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved.
+ * Copyright 2016 Nexenta Systems, Inc.
+ * Copyright 2017 Joyent, Inc.
+ */
+
+#ifndef _SYS_SYSEVENT_EVENTDEFS_H
+#define _SYS_SYSEVENT_EVENTDEFS_H
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+/*
+ * eventdefs.h contains public definitions for sysevent types (classes
+ * and subclasses). All additions/removals/changes are subject
+ * to PSARC approval.
+ */
+
+/* Sysevent Class definitions */
+#define EC_NONE "EC_none"
+#define EC_PRIV "EC_priv"
+#define EC_PLATFORM "EC_platform" /* events private to platform */
+#define EC_DR "EC_dr" /* Dynamic reconfiguration event class */
+#define EC_ENV "EC_env" /* Environmental monitor event class */
+#define EC_DOMAIN "EC_domain" /* Domain event class */
+#define EC_AP_DRIVER "EC_ap_driver" /* Alternate Pathing event class */
+#define EC_IPMP "EC_ipmp" /* IP Multipathing event class */
+#define EC_DEV_ADD "EC_dev_add" /* device add event class */
+#define EC_DEV_REMOVE "EC_dev_remove" /* device remove event class */
+#define EC_DEV_BRANCH "EC_dev_branch" /* device tree branch event class */
+#define EC_DEV_STATUS "EC_dev_status" /* device status event class */
+#define EC_FM "EC_fm" /* FMA error report event */
+#define EC_ZFS "EC_zfs" /* ZFS event */
+#define EC_DATALINK "EC_datalink" /* datalink event */
+#define EC_VRRP "EC_vrrp" /* VRRP event */
+
+/*
+ * The following event class is reserved for exclusive use
+ * by Sun Cluster software.
+ */
+#define EC_CLUSTER "EC_Cluster"
+
+/*
+ * EC_DR subclass definitions - supporting attributes (name/value pairs)
+ * are found in sys/sysevent/dr.h
+ */
+
+/* Attachment point state change */
+#define ESC_DR_AP_STATE_CHANGE "ESC_dr_ap_state_change"
+#define ESC_DR_REQ "ESC_dr_req" /* Request DR */
+#define ESC_DR_TARGET_STATE_CHANGE "ESC_dr_target_state_change"
+
+/*
+ * EC_ENV subclass definitions - supporting attributes (name/value pairs)
+ * are found in sys/sysevent/env.h
+ */
+#define ESC_ENV_TEMP "ESC_env_temp" /* Temperature change event subclass */
+#define ESC_ENV_FAN "ESC_env_fan" /* Fan status change event subclass */
+#define ESC_ENV_POWER "ESC_env_power" /* Power supply change event subclass */
+#define ESC_ENV_LED "ESC_env_led" /* LED change event subclass */
+
+/*
+ * EC_DOMAIN subclass definitions - supporting attributes (name/value pairs)
+ * are found in sys/sysevent/domain.h
+ */
+
+/* Domain state change */
+#define ESC_DOMAIN_STATE_CHANGE "ESC_domain_state_change"
+/* Domain loghost name change */
+#define ESC_DOMAIN_LOGHOST_CHANGE "ESC_domain_loghost_change"
+
+/*
+ * EC_AP_DRIVER subclass definitions - supporting attributes (name/value pairs)
+ * are found in sys/sysevent/ap_driver.h
+ */
+
+/* Alternate Pathing path switch */
+#define ESC_AP_DRIVER_PATHSWITCH "ESC_ap_driver_pathswitch"
+/* Alternate Pathing database commit */
+#define ESC_AP_DRIVER_COMMIT "ESC_ap_driver_commit"
+/* Alternate Pathing physical path status change */
+#define ESC_AP_DRIVER_PHYS_PATH_STATUS_CHANGE \
+ "ESC_ap_driver_phys_path_status_change"
+
+/*
+ * EC_IPMP subclass definitions - supporting attributes (name/value pairs)
+ * are found in sys/sysevent/ipmp.h
+ */
+
+/* IPMP group has changed state */
+#define ESC_IPMP_GROUP_STATE "ESC_ipmp_group_state"
+
+/* IPMP group has been created or removed */
+#define ESC_IPMP_GROUP_CHANGE "ESC_ipmp_group_change"
+
+/* IPMP group has had an interface added or removed */
+#define ESC_IPMP_GROUP_MEMBER_CHANGE "ESC_ipmp_group_member_change"
+
+/* Interface within an IPMP group has changed state or type */
+#define ESC_IPMP_IF_CHANGE "ESC_ipmp_if_change"
+
+/* IPMP probe has changed state */
+#define ESC_IPMP_PROBE_STATE "ESC_ipmp_probe_state"
+
+/*
+ * EC_DEV_ADD and EC_DEV_REMOVE subclass definitions - supporting attributes
+ * (name/value pairs) are found in sys/sysevent/dev.h
+ */
+#define ESC_DISK "disk" /* disk device */
+#define ESC_NETWORK "network" /* network interface */
+#define ESC_PRINTER "printer" /* printer device */
+#define ESC_LOFI "lofi" /* lofi device */
+
+/*
+ * EC_DEV_BRANCH subclass definitions - supporting attributes (name/value pairs)
+ * are found in sys/sysevent/dev.h
+ */
+
+/* device tree branch added */
+#define ESC_DEV_BRANCH_ADD "ESC_dev_branch_add"
+
+/* device tree branch removed */
+#define ESC_DEV_BRANCH_REMOVE "ESC_dev_branch_remove"
+
+/*
+ * EC_DEV_STATUS subclass definitions
+ *
+ * device capacity dynamically changed
+ */
+#define ESC_DEV_DLE "ESC_dev_dle"
+
+/* LUN has received an eject request from the user */
+#define ESC_DEV_EJECT_REQUEST "ESC_dev_eject_request"
+
+/* FMA Fault and Error event protocol subclass */
+#define ESC_FM_ERROR "ESC_FM_error"
+#define ESC_FM_ERROR_REPLAY "ESC_FM_error_replay"
+
+/* Service processor subclass definitions */
+#define ESC_PLATFORM_SP_RESET "ESC_platform_sp_reset"
+
+/*
+ * EC_PWRCTL subclass definitions
+ */
+#define EC_PWRCTL "EC_pwrctl"
+#define ESC_PWRCTL_ADD "ESC_pwrctl_add"
+#define ESC_PWRCTL_REMOVE "ESC_pwrctl_remove"
+#define ESC_PWRCTL_WARN "ESC_pwrctl_warn"
+#define ESC_PWRCTL_LOW "ESC_pwrctl_low"
+#define ESC_PWRCTL_STATE_CHANGE "ESC_pwrctl_state_change"
+#define ESC_PWRCTL_POWER_BUTTON "ESC_pwrctl_power_button"
+#define ESC_PWRCTL_BRIGHTNESS_UP "ESC_pwrctl_brightness_up"
+#define ESC_PWRCTL_BRIGHTNESS_DOWN "ESC_pwrctl_brightness_down"
+
+/* EC_ACPIEV subclass definitions */
+#define EC_ACPIEV "EC_acpiev"
+#define ESC_ACPIEV_DISPLAY_SWITCH "ESC_acpiev_display_switch"
+#define ESC_ACPIEV_SCREEN_LOCK "ESC_acpiev_screen_lock"
+#define ESC_ACPIEV_SLEEP "ESC_acpiev_sleep"
+#define ESC_ACPIEV_AUDIO_MUTE "ESC_acpiev_audio_mute"
+#define ESC_ACPIEV_WIFI "ESC_acpiev_wifi"
+#define ESC_ACPIEV_TOUCHPAD "ESC_acpiev_touchpad"
+
+/*
+ * ZFS subclass definitions. Supporting attributes (name/value pairs) are found
+ * in sys/fs/zfs.h
+ */
+#define ESC_ZFS_RESILVER_START "ESC_ZFS_resilver_start"
+#define ESC_ZFS_RESILVER_FINISH "ESC_ZFS_resilver_finish"
+#define ESC_ZFS_VDEV_REMOVE "ESC_ZFS_vdev_remove"
+#define ESC_ZFS_VDEV_REMOVE_AUX "ESC_ZFS_vdev_remove_aux"
+#define ESC_ZFS_VDEV_REMOVE_DEV "ESC_ZFS_vdev_remove_dev"
+#define ESC_ZFS_POOL_CREATE "ESC_ZFS_pool_create"
+#define ESC_ZFS_POOL_DESTROY "ESC_ZFS_pool_destroy"
+#define ESC_ZFS_POOL_IMPORT "ESC_ZFS_pool_import"
+#define ESC_ZFS_VDEV_ADD "ESC_ZFS_vdev_add"
+#define ESC_ZFS_VDEV_ATTACH "ESC_ZFS_vdev_attach"
+#define ESC_ZFS_VDEV_CLEAR "ESC_ZFS_vdev_clear"
+#define ESC_ZFS_VDEV_CHECK "ESC_ZFS_vdev_check"
+#define ESC_ZFS_VDEV_ONLINE "ESC_ZFS_vdev_online"
+#define ESC_ZFS_CONFIG_SYNC "ESC_ZFS_config_sync"
+#define ESC_ZFS_SCRUB_START "ESC_ZFS_scrub_start"
+#define ESC_ZFS_SCRUB_FINISH "ESC_ZFS_scrub_finish"
+#define ESC_ZFS_SCRUB_ABORT "ESC_ZFS_scrub_abort"
+#define ESC_ZFS_SCRUB_RESUME "ESC_ZFS_scrub_resume"
+#define ESC_ZFS_SCRUB_PAUSED "ESC_ZFS_scrub_paused"
+#define ESC_ZFS_VDEV_SPARE "ESC_ZFS_vdev_spare"
+#define ESC_ZFS_BOOTFS_VDEV_ATTACH "ESC_ZFS_bootfs_vdev_attach"
+#define ESC_ZFS_POOL_REGUID "ESC_ZFS_pool_reguid"
+#define ESC_ZFS_HISTORY_EVENT "ESC_ZFS_history_event"
+#define ESC_ZFS_VDEV_AUTOEXPAND "ESC_ZFS_vdev_autoexpand"
+
+/*
+ * datalink subclass definitions.
+ */
+#define ESC_DATALINK_PHYS_ADD "ESC_datalink_phys_add" /* new physical link */
+
+/*
+ * VRRP subclass definitions. Supporting attributes (name/value pairs) are
+ * found in sys/sysevent/vrrp.h
+ */
+#define ESC_VRRP_STATE_CHANGE "ESC_vrrp_state_change"
+
+#ifdef __cplusplus
+}
+#endif
+
+#endif /* _SYS_SYSEVENT_EVENTDEFS_H */
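
A hedged sketch of consuming these definitions through the illumos-only event channel interfaces from sys/sysevent.h; the channel name and subscriber id are invented for illustration, and subclass filtering happens in the handler:

    #include <sys/sysevent.h>
    #include <sys/sysevent/eventdefs.h>

    static int
    zfs_ev_cb(sysevent_t *ev, void *cookie)
    {
    	/* compare the event's subclass against, e.g., ESC_ZFS_SCRUB_FINISH */
    	return (0);
    }

    static void
    watch_zfs(void)
    {
    	evchan_t *ch;

    	if (sysevent_evc_bind("example_zfs_chan", &ch, EVCH_CREAT) == 0)
    		(void) sysevent_evc_subscribe(ch, "scrub_watch", EC_ZFS,
    		    zfs_ev_cb, NULL, 0);
    }
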
diff --git a/sys/cddl/contrib/opensolaris/uts/common/sys/sysmacros.h b/sys/cddl/contrib/opensolaris/uts/common/sys/sysmacros.h
new file mode 100644
index 000000000000..7b738c8b0d13
--- /dev/null
+++ b/sys/cddl/contrib/opensolaris/uts/common/sys/sysmacros.h
@@ -0,0 +1,466 @@
+/*
+ * CDDL HEADER START
+ *
+ * The contents of this file are subject to the terms of the
+ * Common Development and Distribution License (the "License").
+ * You may not use this file except in compliance with the License.
+ *
+ * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
+ * or http://www.opensolaris.org/os/licensing.
+ * See the License for the specific language governing permissions
+ * and limitations under the License.
+ *
+ * When distributing Covered Code, include this CDDL HEADER in each
+ * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
+ * If applicable, add the following below this CDDL HEADER, with the
+ * fields enclosed by brackets "[]" replaced with your own identifying
+ * information: Portions Copyright [yyyy] [name of copyright owner]
+ *
+ * CDDL HEADER END
+ */
+/* Copyright (c) 1984, 1986, 1987, 1988, 1989 AT&T */
+/* All Rights Reserved */
+
+
+/*
+ * Copyright 2008 Sun Microsystems, Inc. All rights reserved.
+ * Use is subject to license terms.
+ */
+
+#ifndef _SYS_SYSMACROS_H
+#define _SYS_SYSMACROS_H
+
+#include <sys/param.h>
+#include <sys/isa_defs.h>
+#if defined(__FreeBSD__) && defined(_KERNEL)
+#include <sys/libkern.h>
+#endif
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+/*
+ * Some macros for units conversion
+ */
+/*
+ * Disk blocks (sectors) and bytes.
+ */
+#ifndef dtob
+#define dtob(DD) ((DD) << DEV_BSHIFT)
+#endif
+#ifndef btod
+#define btod(BB) (((BB) + DEV_BSIZE - 1) >> DEV_BSHIFT)
+#endif
+#define btodt(BB) ((BB) >> DEV_BSHIFT)
+#define lbtod(BB) (((offset_t)(BB) + DEV_BSIZE - 1) >> DEV_BSHIFT)
+
+/* common macros */
+#ifndef MIN
+#define MIN(a, b) ((a) < (b) ? (a) : (b))
+#endif
+#ifndef MAX
+#define MAX(a, b) ((a) < (b) ? (b) : (a))
+#endif
+#ifndef ABS
+#define ABS(a) ((a) < 0 ? -(a) : (a))
+#endif
+#ifndef SIGNOF
+#define SIGNOF(a) ((a) < 0 ? -1 : (a) > 0)
+#endif
+
+#ifdef _KERNEL
+
+/*
+ * Convert a single byte to/from binary-coded decimal (BCD).
+ */
+extern unsigned char byte_to_bcd[256];
+extern unsigned char bcd_to_byte[256];
+
+#define BYTE_TO_BCD(x) byte_to_bcd[(x) & 0xff]
+#define BCD_TO_BYTE(x) bcd_to_byte[(x) & 0xff]
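+
+/*
+ * Illustrative examples (not part of the original header), assuming the
+ * tables are populated for two-decimal-digit values:
+ *
+ *	BYTE_TO_BCD(42) == 0x42
+ *	BCD_TO_BYTE(0x42) == 42
+ *
+ * Inputs outside 0-99 (or non-BCD bytes for BCD_TO_BYTE) are not meaningful.
+ */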
+
+#endif /* _KERNEL */
+
+/*
+ * WARNING: The device number macros defined here should not be used by device
+ * drivers or user software. Device drivers should use the device functions
+ * defined in the DDI/DKI interface (see also ddi.h). Application software
+ * should make use of the library routines available in makedev(3). A set of
+ * new device macros is provided to operate on the expanded device number
+ * format supported in SVR4. Macro versions of the DDI device functions are
+ * provided for use by kernel proper routines only. Macro routines bmajor(),
+ * major(), minor(), emajor(), eminor(), and makedev() will be removed or
+ * their definitions changed at the next major release following SVR4.
+ */
+
+#define O_BITSMAJOR 7 /* # of SVR3 major device bits */
+#define O_BITSMINOR 8 /* # of SVR3 minor device bits */
+#define O_MAXMAJ 0x7f /* SVR3 max major value */
+#define O_MAXMIN 0xff /* SVR3 max minor value */
+
+
+#define L_BITSMAJOR32 14 /* # of SVR4 major device bits */
+#define L_BITSMINOR32 18 /* # of SVR4 minor device bits */
+#define L_MAXMAJ32 0x3fff /* SVR4 max major value */
+#define L_MAXMIN32 0x3ffff /* MAX minor for 3b2 software drivers. */
+ /* For 3b2 hardware devices the minor is */
+ /* restricted to 256 (0-255) */
+
+#ifdef _LP64
+#define L_BITSMAJOR 32 /* # of major device bits in 64-bit Solaris */
+#define L_BITSMINOR 32 /* # of minor device bits in 64-bit Solaris */
+#define L_MAXMAJ 0xfffffffful /* max major value */
+#define L_MAXMIN 0xfffffffful /* max minor value */
+#else
+#define L_BITSMAJOR L_BITSMAJOR32
+#define L_BITSMINOR L_BITSMINOR32
+#define L_MAXMAJ L_MAXMAJ32
+#define L_MAXMIN L_MAXMIN32
+#endif
+
+#ifdef illumos
+#ifdef _KERNEL
+
+/* major part of a device internal to the kernel */
+
+#define major(x) (major_t)((((unsigned)(x)) >> O_BITSMINOR) & O_MAXMAJ)
+#define bmajor(x) (major_t)((((unsigned)(x)) >> O_BITSMINOR) & O_MAXMAJ)
+
+/* get internal major part of expanded device number */
+
+#define getmajor(x) (major_t)((((dev_t)(x)) >> L_BITSMINOR) & L_MAXMAJ)
+
+/* minor part of a device internal to the kernel */
+
+#define minor(x) (minor_t)((x) & O_MAXMIN)
+
+/* get internal minor part of expanded device number */
+
+#define getminor(x) (minor_t)((x) & L_MAXMIN)
+
+#else
+
+/* major part of a device external from the kernel (same as emajor below) */
+
+#define major(x) (major_t)((((unsigned)(x)) >> O_BITSMINOR) & O_MAXMAJ)
+
+/* minor part of a device external from the kernel (same as eminor below) */
+
+#define minor(x) (minor_t)((x) & O_MAXMIN)
+
+#endif /* _KERNEL */
+
+/* create old device number */
+
+#define makedev(x, y) (unsigned short)(((x) << O_BITSMINOR) | ((y) & O_MAXMIN))
+
+/* make a new device number */
+
+#define makedevice(x, y) (dev_t)(((dev_t)(x) << L_BITSMINOR) | ((y) & L_MAXMIN))
+
+
+/*
+ * emajor() allows kernel/driver code to print external major numbers
+ * eminor() allows kernel/driver code to print external minor numbers
+ */
+
+#define emajor(x) \
+ (major_t)(((unsigned int)(x) >> O_BITSMINOR) > O_MAXMAJ) ? \
+ NODEV : (((unsigned int)(x) >> O_BITSMINOR) & O_MAXMAJ)
+
+#define eminor(x) \
+ (minor_t)((x) & O_MAXMIN)
+
+/*
+ * get external major and minor device
+ * components from expanded device number
+ */
+#define getemajor(x) (major_t)((((dev_t)(x) >> L_BITSMINOR) > L_MAXMAJ) ? \
+ NODEV : (((dev_t)(x) >> L_BITSMINOR) & L_MAXMAJ))
+#define geteminor(x) (minor_t)((x) & L_MAXMIN)
+#endif /* illumos */
+
+/*
+ * These are versions of the kernel routines for compressing and
+ * expanding long device numbers that don't return errors.
+ */
+#if (L_BITSMAJOR32 == L_BITSMAJOR) && (L_BITSMINOR32 == L_BITSMINOR)
+
+#define DEVCMPL(x) (x)
+#define DEVEXPL(x) (x)
+
+#else
+
+#define DEVCMPL(x) \
+ (dev32_t)((((x) >> L_BITSMINOR) > L_MAXMAJ32 || \
+ ((x) & L_MAXMIN) > L_MAXMIN32) ? NODEV32 : \
+ ((((x) >> L_BITSMINOR) << L_BITSMINOR32) | ((x) & L_MAXMIN32)))
+
+#define DEVEXPL(x) \
+ (((x) == NODEV32) ? NODEV : \
+ makedevice(((x) >> L_BITSMINOR32) & L_MAXMAJ32, (x) & L_MAXMIN32))
+
+#endif /* L_BITSMAJOR32 ... */
+
+/* convert to old (SVR3.2) dev format */
+
+#define cmpdev(x) \
+ (o_dev_t)((((x) >> L_BITSMINOR) > O_MAXMAJ || \
+ ((x) & L_MAXMIN) > O_MAXMIN) ? NODEV : \
+ ((((x) >> L_BITSMINOR) << O_BITSMINOR) | ((x) & O_MAXMIN)))
+
+/* convert to new (SVR4) dev format */
+
+#define expdev(x) \
+ (dev_t)(((dev_t)(((x) >> O_BITSMINOR) & O_MAXMAJ) << L_BITSMINOR) | \
+ ((x) & O_MAXMIN))
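+
+/*
+ * Illustrative sketch (not part of the original header): when the major
+ * and minor numbers fit the old SVR3 limits, expdev() undoes cmpdev():
+ *
+ *	o_dev_t odev = cmpdev(dev);	old format: (major << 8) | minor
+ *	dev_t ndev = expdev(odev);	back to the expanded format
+ *
+ * cmpdev() yields NODEV when either component exceeds the old limits.
+ */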
+
+/*
+ * Macro for checking power of 2 address alignment.
+ */
+#define IS_P2ALIGNED(v, a) ((((uintptr_t)(v)) & ((uintptr_t)(a) - 1)) == 0)
+
+/*
+ * Macros for counting and rounding.
+ */
+#ifndef howmany
+#define howmany(x, y) (((x)+((y)-1))/(y))
+#endif
+#ifndef roundup
+#define roundup(x, y) ((((x)+((y)-1))/(y))*(y))
+#endif
+/*
+ * Macro to determine if value is a power of 2
+ */
+#define ISP2(x) (((x) & ((x) - 1)) == 0)
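+
+/*
+ * Illustrative note (not part of the original header): ISP2(0x40) is true
+ * and ISP2(0x30) is false; note that ISP2(0) also evaluates to true, so a
+ * caller must rule out zero separately where that matters.
+ */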
+
+/*
+ * Macros for various sorts of alignment and rounding. The "align" must
+ * be a power of 2. Often it is a block, sector, or page size.
+ */
+
+/*
+ * return x rounded down to an align boundary
+ * eg, P2ALIGN(1200, 1024) == 1024 (1*align)
+ * eg, P2ALIGN(1024, 1024) == 1024 (1*align)
+ * eg, P2ALIGN(0x1234, 0x100) == 0x1200 (0x12*align)
+ * eg, P2ALIGN(0x5600, 0x100) == 0x5600 (0x56*align)
+ */
+#define P2ALIGN(x, align) ((x) & -(align))
+
+/*
+ * return x % (mod) align
+ * eg, P2PHASE(0x1234, 0x100) == 0x34 (x-0x12*align)
+ * eg, P2PHASE(0x5600, 0x100) == 0x00 (x-0x56*align)
+ */
+#define P2PHASE(x, align) ((x) & ((align) - 1))
+
+/*
+ * return how much space is left in this block (but if it's perfectly
+ * aligned, return 0).
+ * eg, P2NPHASE(0x1234, 0x100) == 0xcc (0x13*align-x)
+ * eg, P2NPHASE(0x5600, 0x100) == 0x00 (0x56*align-x)
+ */
+#define P2NPHASE(x, align) (-(x) & ((align) - 1))
+
+/*
+ * return x rounded up to an align boundary
+ * eg, P2ROUNDUP(0x1234, 0x100) == 0x1300 (0x13*align)
+ * eg, P2ROUNDUP(0x5600, 0x100) == 0x5600 (0x56*align)
+ */
+#define P2ROUNDUP(x, align) (-(-(x) & -(align)))
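+
+/*
+ * Worked example of the bit trick above (illustrative, not part of the
+ * original header): in two's complement, -(align) is a mask of the bits at
+ * and above the alignment, so (-(x) & -(align)) rounds -x down to a
+ * multiple of align, and the outer negation turns that into rounding x up:
+ *
+ *	x = 0x1234, align = 0x100 (32-bit values)
+ *	-x		= 0xffffedcc
+ *	-x & -align	= 0xffffed00
+ *	-(-x & -align)	= 0x00001300 == P2ROUNDUP(0x1234, 0x100)
+ */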
+
+/*
+ * return the ending address of the block that x is in
+ * eg, P2END(0x1234, 0x100) == 0x12ff (0x13*align - 1)
+ * eg, P2END(0x5600, 0x100) == 0x56ff (0x57*align - 1)
+ */
+#define P2END(x, align) (-(~(x) & -(align)))
+
+/*
+ * return x rounded up to the next phase (offset) within align.
+ * phase should be < align.
+ * eg, P2PHASEUP(0x1234, 0x100, 0x10) == 0x1310 (0x13*align + phase)
+ * eg, P2PHASEUP(0x5600, 0x100, 0x10) == 0x5610 (0x56*align + phase)
+ */
+#define P2PHASEUP(x, align, phase) ((phase) - (((phase) - (x)) & -(align)))
+
+/*
+ * return TRUE if adding len to off would cause it to cross an align
+ * boundary.
+ * eg, P2BOUNDARY(0x1234, 0xe0, 0x100) == TRUE (0x1234 + 0xe0 == 0x1314)
+ * eg, P2BOUNDARY(0x1234, 0x50, 0x100) == FALSE (0x1234 + 0x50 == 0x1284)
+ */
+#define P2BOUNDARY(off, len, align) \
+ (((off) ^ ((off) + (len) - 1)) > (align) - 1)
+
+/*
+ * Return TRUE if they have the same highest bit set.
+ * eg, P2SAMEHIGHBIT(0x1234, 0x1001) == TRUE (the high bit is 0x1000)
+ * eg, P2SAMEHIGHBIT(0x1234, 0x3010) == FALSE (high bit of 0x3010 is 0x2000)
+ */
+#define P2SAMEHIGHBIT(x, y) (((x) ^ (y)) < ((x) & (y)))
+
+/*
+ * Typed versions of the P2* macros. These macros should be used to ensure
+ * that the result is correctly calculated based on the data type of (x),
+ * which is passed in as the last argument, regardless of the data
+ * type of the alignment. For example, if (x) is of type uint64_t,
+ * and we want to round it up to a page boundary using "PAGESIZE" as
+ * the alignment, we can do either
+ * P2ROUNDUP(x, (uint64_t)PAGESIZE)
+ * or
+ * P2ROUNDUP_TYPED(x, PAGESIZE, uint64_t)
+ */
+#define P2ALIGN_TYPED(x, align, type) \
+ ((type)(x) & -(type)(align))
+#define P2PHASE_TYPED(x, align, type) \
+ ((type)(x) & ((type)(align) - 1))
+#define P2NPHASE_TYPED(x, align, type) \
+ (-(type)(x) & ((type)(align) - 1))
+#define P2ROUNDUP_TYPED(x, align, type) \
+ (-(-(type)(x) & -(type)(align)))
+#define P2END_TYPED(x, align, type) \
+ (-(~(type)(x) & -(type)(align)))
+#define P2PHASEUP_TYPED(x, align, phase, type) \
+ ((type)(phase) - (((type)(phase) - (type)(x)) & -(type)(align)))
+#define P2CROSS_TYPED(x, y, align, type) \
+ (((type)(x) ^ (type)(y)) > (type)(align) - 1)
+#define P2SAMEHIGHBIT_TYPED(x, y, type) \
+ (((type)(x) ^ (type)(y)) < ((type)(x) & (type)(y)))
+
+/*
+ * Macros to atomically increment/decrement a variable. mutex and var
+ * must be pointers.
+ */
+#define INCR_COUNT(var, mutex) mutex_enter(mutex), (*(var))++, mutex_exit(mutex)
+#define DECR_COUNT(var, mutex) mutex_enter(mutex), (*(var))--, mutex_exit(mutex)
+
+/*
+ * Macros to declare bitfields - the order in the parameter list is
+ * Low to High - that is, declare bit 0 first. We only support 8-bit bitfields
+ * because if a field crosses a byte boundary it's not likely to be meaningful
+ * without reassembly in its nonnative endianness.
+ */
+#if defined(_BIT_FIELDS_LTOH)
+#define DECL_BITFIELD2(_a, _b) \
+ uint8_t _a, _b
+#define DECL_BITFIELD3(_a, _b, _c) \
+ uint8_t _a, _b, _c
+#define DECL_BITFIELD4(_a, _b, _c, _d) \
+ uint8_t _a, _b, _c, _d
+#define DECL_BITFIELD5(_a, _b, _c, _d, _e) \
+ uint8_t _a, _b, _c, _d, _e
+#define DECL_BITFIELD6(_a, _b, _c, _d, _e, _f) \
+ uint8_t _a, _b, _c, _d, _e, _f
+#define DECL_BITFIELD7(_a, _b, _c, _d, _e, _f, _g) \
+ uint8_t _a, _b, _c, _d, _e, _f, _g
+#define DECL_BITFIELD8(_a, _b, _c, _d, _e, _f, _g, _h) \
+ uint8_t _a, _b, _c, _d, _e, _f, _g, _h
+#elif defined(_BIT_FIELDS_HTOL)
+#define DECL_BITFIELD2(_a, _b) \
+ uint8_t _b, _a
+#define DECL_BITFIELD3(_a, _b, _c) \
+ uint8_t _c, _b, _a
+#define DECL_BITFIELD4(_a, _b, _c, _d) \
+ uint8_t _d, _c, _b, _a
+#define DECL_BITFIELD5(_a, _b, _c, _d, _e) \
+ uint8_t _e, _d, _c, _b, _a
+#define DECL_BITFIELD6(_a, _b, _c, _d, _e, _f) \
+ uint8_t _f, _e, _d, _c, _b, _a
+#define DECL_BITFIELD7(_a, _b, _c, _d, _e, _f, _g) \
+ uint8_t _g, _f, _e, _d, _c, _b, _a
+#define DECL_BITFIELD8(_a, _b, _c, _d, _e, _f, _g, _h) \
+ uint8_t _h, _g, _f, _e, _d, _c, _b, _a
+#else
+#error One of _BIT_FIELDS_LTOH or _BIT_FIELDS_HTOL must be defined
+#endif /* _BIT_FIELDS_LTOH */
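+
+/*
+ * Illustrative usage (an assumption, not part of the original header):
+ * each argument carries the field name together with its bit width, so
+ *
+ *	struct example_flags {
+ *		DECL_BITFIELD3(low :2, middle :3, high :3);
+ *	};
+ *
+ * places "low" at bits 0-1 on both _BIT_FIELDS_LTOH and _BIT_FIELDS_HTOL
+ * machines, since the HTOL variant reverses the declaration order.
+ */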
+
+#if defined(_KERNEL) && !defined(_KMEMUSER) && !defined(offsetof)
+
+/* avoid any possibility of clashing with <stddef.h> version */
+
+#define offsetof(s, m) ((size_t)(&(((s *)0)->m)))
+#endif
+
+/*
+ * Find highest one bit set.
+ * Returns bit number + 1 of highest bit that is set, otherwise returns 0.
+ * High order bit is 31 (or 63 in _LP64 kernel).
+ */
+static __inline int
+highbit(unsigned long i)
+{
+#if defined(__FreeBSD__) && defined(_KERNEL) && defined(HAVE_INLINE_FLSL)
+ return (flsl(i));
+#else
+ int h = 1;
+
+ if (i == 0)
+ return (0);
+#ifdef _LP64
+ if (i & 0xffffffff00000000ul) {
+ h += 32; i >>= 32;
+ }
+#endif
+ if (i & 0xffff0000) {
+ h += 16; i >>= 16;
+ }
+ if (i & 0xff00) {
+ h += 8; i >>= 8;
+ }
+ if (i & 0xf0) {
+ h += 4; i >>= 4;
+ }
+ if (i & 0xc) {
+ h += 2; i >>= 2;
+ }
+ if (i & 0x2) {
+ h += 1;
+ }
+ return (h);
+#endif
+}
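+
+/*
+ * Illustrative examples (not part of the original header):
+ * highbit(0) == 0, highbit(1) == 1, and highbit(0x10) == 5, so for any
+ * nonzero power of two p, 1 << (highbit(p) - 1) == p. The same holds for
+ * highbit64() below.
+ */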
+
+/*
+ * Find highest one bit set.
+ * Returns bit number + 1 of highest bit that is set, otherwise returns 0.
+ */
+static __inline int
+highbit64(uint64_t i)
+{
+#if defined(__FreeBSD__) && defined(_KERNEL) && defined(HAVE_INLINE_FLSLL)
+ return (flsll(i));
+#else
+ int h = 1;
+
+ if (i == 0)
+ return (0);
+ if (i & 0xffffffff00000000ULL) {
+ h += 32; i >>= 32;
+ }
+ if (i & 0xffff0000) {
+ h += 16; i >>= 16;
+ }
+ if (i & 0xff00) {
+ h += 8; i >>= 8;
+ }
+ if (i & 0xf0) {
+ h += 4; i >>= 4;
+ }
+ if (i & 0xc) {
+ h += 2; i >>= 2;
+ }
+ if (i & 0x2) {
+ h += 1;
+ }
+ return (h);
+#endif
+}
+
+#ifdef __cplusplus
+}
+#endif
+
+#endif /* _SYS_SYSMACROS_H */
diff --git a/sys/cddl/contrib/opensolaris/uts/common/sys/taskq.h b/sys/cddl/contrib/opensolaris/uts/common/sys/taskq.h
new file mode 100644
index 000000000000..beff02c5a9cf
--- /dev/null
+++ b/sys/cddl/contrib/opensolaris/uts/common/sys/taskq.h
@@ -0,0 +1,108 @@
+/*
+ * CDDL HEADER START
+ *
+ * The contents of this file are subject to the terms of the
+ * Common Development and Distribution License (the "License").
+ * You may not use this file except in compliance with the License.
+ *
+ * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
+ * or http://www.opensolaris.org/os/licensing.
+ * See the License for the specific language governing permissions
+ * and limitations under the License.
+ *
+ * When distributing Covered Code, include this CDDL HEADER in each
+ * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
+ * If applicable, add the following below this CDDL HEADER, with the
+ * fields enclosed by brackets "[]" replaced with your own identifying
+ * information: Portions Copyright [yyyy] [name of copyright owner]
+ *
+ * CDDL HEADER END
+ */
+/*
+ * Copyright 2009 Sun Microsystems, Inc. All rights reserved.
+ * Use is subject to license terms.
+ */
+
+#ifndef _SYS_TASKQ_H
+#define _SYS_TASKQ_H
+
+#include <sys/types.h>
+#include <sys/proc.h>
+#include <sys/taskqueue.h>
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+#define TASKQ_NAMELEN 31
+
+struct taskqueue;
+struct taskq {
+ struct taskqueue *tq_queue;
+};
+
+typedef struct taskq taskq_t;
+typedef uintptr_t taskqid_t;
+typedef void (task_func_t)(void *);
+
+typedef struct taskq_ent {
+ struct task tqent_task;
+ task_func_t *tqent_func;
+ void *tqent_arg;
+} taskq_ent_t;
+
+struct proc;
+
+/*
+ * Public flags for taskq_create(): bit range 0-15
+ */
+#define TASKQ_PREPOPULATE 0x0001 /* Prepopulate with threads and data */
+#define TASKQ_CPR_SAFE 0x0002 /* Use CPR safe protocol */
+#define TASKQ_DYNAMIC 0x0004 /* Use dynamic thread scheduling */
+#define TASKQ_THREADS_CPU_PCT 0x0008 /* number of threads as % of ncpu */
+#define TASKQ_DC_BATCH 0x0010 /* Taskq uses SDC in batch mode */
+
+/*
+ * Flags for taskq_dispatch. TQ_SLEEP/TQ_NOSLEEP should be the same as
+ * KM_SLEEP/KM_NOSLEEP.
+ */
+#define TQ_SLEEP 0x00 /* Can block for memory */
+#define TQ_NOSLEEP 0x01 /* cannot block for memory; may fail */
+#define TQ_NOQUEUE 0x02 /* Do not enqueue if can't dispatch */
+#define TQ_NOALLOC 0x04 /* cannot allocate memory; may fail */
+#define TQ_FRONT 0x08 /* Put task at the front of the queue */
+
+#define TASKQID_INVALID ((taskqid_t)0)
+
+#ifdef _KERNEL
+
+extern taskq_t *system_taskq;
+
+void taskq_init(void);
+void taskq_mp_init(void);
+
+taskq_t *taskq_create(const char *, int, pri_t, int, int, uint_t);
+taskq_t *taskq_create_instance(const char *, int, int, pri_t, int, int, uint_t);
+taskq_t *taskq_create_proc(const char *, int, pri_t, int, int,
+ struct proc *, uint_t);
+taskq_t *taskq_create_sysdc(const char *, int, int, int,
+ struct proc *, uint_t, uint_t);
+taskqid_t taskq_dispatch(taskq_t *, task_func_t, void *, uint_t);
+void taskq_dispatch_ent(taskq_t *, task_func_t, void *, uint_t,
+ taskq_ent_t *);
+void nulltask(void *);
+void taskq_destroy(taskq_t *);
+void taskq_wait(taskq_t *);
+void taskq_wait_id(taskq_t *, taskqid_t);
+void taskq_suspend(taskq_t *);
+int taskq_suspended(taskq_t *);
+void taskq_resume(taskq_t *);
+int taskq_member(taskq_t *, kthread_t *);
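+
+/*
+ * Illustrative usage sketch (an assumption, not part of the original
+ * header); minclsyspri is the usual kernel thread priority constant:
+ *
+ *	static void
+ *	my_task(void *arg)
+ *	{
+ *		... deferred work ...
+ *	}
+ *
+ *	taskq_t *tq = taskq_create("my_tq", 1, minclsyspri, 1, 1, 0);
+ *	(void) taskq_dispatch(tq, my_task, arg, TQ_SLEEP);
+ *	taskq_wait(tq);		wait for outstanding tasks to finish
+ *	taskq_destroy(tq);
+ *
+ * On FreeBSD this shim runs the tasks on the wrapped struct taskqueue.
+ */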
+
+#endif /* _KERNEL */
+
+#ifdef __cplusplus
+}
+#endif
+
+#endif /* _SYS_TASKQ_H */
diff --git a/sys/cddl/contrib/opensolaris/uts/common/sys/u8_textprep.h b/sys/cddl/contrib/opensolaris/uts/common/sys/u8_textprep.h
new file mode 100644
index 000000000000..1496fa356835
--- /dev/null
+++ b/sys/cddl/contrib/opensolaris/uts/common/sys/u8_textprep.h
@@ -0,0 +1,115 @@
+/*
+ * CDDL HEADER START
+ *
+ * The contents of this file are subject to the terms of the
+ * Common Development and Distribution License (the "License").
+ * You may not use this file except in compliance with the License.
+ *
+ * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
+ * or http://www.opensolaris.org/os/licensing.
+ * See the License for the specific language governing permissions
+ * and limitations under the License.
+ *
+ * When distributing Covered Code, include this CDDL HEADER in each
+ * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
+ * If applicable, add the following below this CDDL HEADER, with the
+ * fields enclosed by brackets "[]" replaced with your own identifying
+ * information: Portions Copyright [yyyy] [name of copyright owner]
+ *
+ * CDDL HEADER END
+ */
+/*
+ * Copyright 2007 Sun Microsystems, Inc. All rights reserved.
+ * Use is subject to license terms.
+ */
+
+#ifndef _SYS_U8_TEXTPREP_H
+#define _SYS_U8_TEXTPREP_H
+
+#pragma ident "%Z%%M% %I% %E% SMI"
+
+#include <sys/isa_defs.h>
+#include <sys/types.h>
+#include <sys/errno.h>
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+#ifdef illumos
+/*
+ * Unicode encoding conversion functions and their macros.
+ */
+#define UCONV_IN_BIG_ENDIAN 0x0001
+#define UCONV_OUT_BIG_ENDIAN 0x0002
+#define UCONV_IN_SYSTEM_ENDIAN 0x0004
+#define UCONV_OUT_SYSTEM_ENDIAN 0x0008
+#define UCONV_IN_LITTLE_ENDIAN 0x0010
+#define UCONV_OUT_LITTLE_ENDIAN 0x0020
+#define UCONV_IGNORE_NULL 0x0040
+#define UCONV_IN_ACCEPT_BOM 0x0080
+#define UCONV_OUT_EMIT_BOM 0x0100
+
+extern int uconv_u16tou32(const uint16_t *, size_t *, uint32_t *, size_t *,
+ int);
+extern int uconv_u16tou8(const uint16_t *, size_t *, uchar_t *, size_t *, int);
+extern int uconv_u32tou16(const uint32_t *, size_t *, uint16_t *, size_t *,
+ int);
+extern int uconv_u32tou8(const uint32_t *, size_t *, uchar_t *, size_t *, int);
+extern int uconv_u8tou16(const uchar_t *, size_t *, uint16_t *, size_t *, int);
+extern int uconv_u8tou32(const uchar_t *, size_t *, uint32_t *, size_t *, int);
+#endif /* illumos */
+
+/*
+ * UTF-8 text preparation functions and their macros.
+ *
+ * Among the macros defined, U8_CANON_DECOMP, U8_COMPAT_DECOMP, and
+ * U8_CANON_COMP are not public interfaces and must not be used directly
+ * as the flag input argument.
+ */
+#define U8_STRCMP_CS (0x00000001)
+#define U8_STRCMP_CI_UPPER (0x00000002)
+#define U8_STRCMP_CI_LOWER (0x00000004)
+
+#define U8_CANON_DECOMP (0x00000010)
+#define U8_COMPAT_DECOMP (0x00000020)
+#define U8_CANON_COMP (0x00000040)
+
+#define U8_STRCMP_NFD (U8_CANON_DECOMP)
+#define U8_STRCMP_NFC (U8_CANON_DECOMP | U8_CANON_COMP)
+#define U8_STRCMP_NFKD (U8_COMPAT_DECOMP)
+#define U8_STRCMP_NFKC (U8_COMPAT_DECOMP | U8_CANON_COMP)
+
+#define U8_TEXTPREP_TOUPPER (U8_STRCMP_CI_UPPER)
+#define U8_TEXTPREP_TOLOWER (U8_STRCMP_CI_LOWER)
+
+#define U8_TEXTPREP_NFD (U8_STRCMP_NFD)
+#define U8_TEXTPREP_NFC (U8_STRCMP_NFC)
+#define U8_TEXTPREP_NFKD (U8_STRCMP_NFKD)
+#define U8_TEXTPREP_NFKC (U8_STRCMP_NFKC)
+
+#define U8_TEXTPREP_IGNORE_NULL (0x00010000)
+#define U8_TEXTPREP_IGNORE_INVALID (0x00020000)
+#define U8_TEXTPREP_NOWAIT (0x00040000)
+
+#define U8_UNICODE_320 (0)
+#define U8_UNICODE_500 (1)
+#define U8_UNICODE_LATEST (U8_UNICODE_500)
+
+#define U8_VALIDATE_ENTIRE (0x00100000)
+#define U8_VALIDATE_CHECK_ADDITIONAL (0x00200000)
+#define U8_VALIDATE_UCS2_RANGE (0x00400000)
+
+#define U8_ILLEGAL_CHAR (-1)
+#define U8_OUT_OF_RANGE_CHAR (-2)
+
+extern int u8_validate(char *, size_t, char **, int, int *);
+extern int u8_strcmp(const char *, const char *, size_t, int, size_t, int *);
+extern size_t u8_textprep_str(char *, size_t *, char *, size_t *, int, size_t,
+ int *);
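+
+/*
+ * Illustrative usage sketch (an assumption, not part of the original
+ * header): normalize a UTF-8 string to NFC with the latest supported
+ * Unicode version, ignoring embedded NULs:
+ *
+ *	size_t inlen = strlen(in);
+ *	size_t outlen = sizeof (out);
+ *	int errnum = 0;
+ *	(void) u8_textprep_str(in, &inlen, out, &outlen,
+ *	    U8_TEXTPREP_NFC | U8_TEXTPREP_IGNORE_NULL, U8_UNICODE_LATEST,
+ *	    &errnum);
+ *
+ * Both length arguments are read and updated by the call, and errnum
+ * reports the failure, if any.
+ */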
+
+#ifdef __cplusplus
+}
+#endif
+
+#endif /* _SYS_U8_TEXTPREP_H */
diff --git a/sys/cddl/contrib/opensolaris/uts/common/sys/u8_textprep_data.h b/sys/cddl/contrib/opensolaris/uts/common/sys/u8_textprep_data.h
new file mode 100644
index 000000000000..de6866096160
--- /dev/null
+++ b/sys/cddl/contrib/opensolaris/uts/common/sys/u8_textprep_data.h
@@ -0,0 +1,35376 @@
+/*
+ * CDDL HEADER START
+ *
+ * The contents of this file are subject to the terms of the
+ * Common Development and Distribution License (the "License").
+ * You may not use this file except in compliance with the License.
+ *
+ * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
+ * or http://www.opensolaris.org/os/licensing.
+ * See the License for the specific language governing permissions
+ * and limitations under the License.
+ *
+ * When distributing Covered Code, include this CDDL HEADER in each
+ * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
+ * If applicable, add the following below this CDDL HEADER, with the
+ * fields enclosed by brackets "[]" replaced with your own identifying
+ * information: Portions Copyright [yyyy] [name of copyright owner]
+ *
+ * CDDL HEADER END
+ */
+/*
+ * Copyright 2007 Sun Microsystems, Inc. All rights reserved.
+ * Use is subject to license terms.
+ */
+/*
+ * COPYRIGHT AND PERMISSION NOTICE
+ *
+ * Copyright (c) 1991-2006 Unicode, Inc. All rights reserved. Distributed under
+ * the Terms of Use in http://www.unicode.org/copyright.html.
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining
+ * a copy of the Unicode data files and any associated documentation (the
+ * "Data Files") or Unicode software and any associated documentation (the
+ * "Software") to deal in the Data Files or Software without restriction,
+ * including without limitation the rights to use, copy, modify, merge,
+ * publish, distribute, and/or sell copies of the Data Files or Software, and
+ * to permit persons to whom the Data Files or Software are furnished to do so,
+ * provided that (a) the above copyright notice(s) and this permission notice
+ * appear with all copies of the Data Files or Software, (b) both the above
+ * copyright notice(s) and this permission notice appear in associated
+ * documentation, and (c) there is clear notice in each modified Data File or
+ * in the Software as well as in the documentation associated with the Data
+ * File(s) or Software that the data or software has been modified.
+ *
+ * THE DATA FILES AND SOFTWARE ARE PROVIDED "AS IS", WITHOUT WARRANTY OF ANY
+ * KIND, EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+ * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT OF
+ * THIRD PARTY RIGHTS. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR HOLDERS
+ * INCLUDED IN THIS NOTICE BE LIABLE FOR ANY CLAIM, OR ANY SPECIAL INDIRECT OR
+ * CONSEQUENTIAL DAMAGES, OR ANY DAMAGES WHATSOEVER RESULTING FROM LOSS OF USE,
+ * DATA OR PROFITS, WHETHER IN AN ACTION OF CONTRACT, NEGLIGENCE OR OTHER
+ * TORTIOUS ACTION, ARISING OUT OF OR IN CONNECTION WITH THE USE OR PERFORMANCE
+ * OF THE DATA FILES OR SOFTWARE.
+ *
+ * Except as contained in this notice, the name of a copyright holder shall not
+ * be used in advertising or otherwise to promote the sale, use or other
+ * dealings in these Data Files or Software without prior written authorization
+ * of the copyright holder.
+ *
+ * Unicode and the Unicode logo are trademarks of Unicode, Inc., and may be
+ * registered in some jurisdictions. All other trademarks and registered
+ * trademarks mentioned herein are the property of their respective owners.
+ */
+/*
+ * This file has been modified by Sun Microsystems, Inc.
+ */
+
+#ifndef _SYS_U8_TEXTPREP_DATA_H
+#define _SYS_U8_TEXTPREP_DATA_H
+
+#pragma ident "%Z%%M% %I% %E% SMI"
+
+#include <sys/types.h>
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+/*
+ * To get to the combining class data, composition mappings, decomposition
+ * mappings, and case conversion mappings of Unicode, the data structures
+ * formulated and their meanings are like the following:
+ *
+ * Each UTF-8 character is seen as a 4-byte entity so that U+0061 (or 0x61 in
+ * UTF-8) would be seen as 0x00 0x00 0x00 0x61. Similarly, U+1D15E would be
+ * 0xF0 0x9D 0x85 0x9E in UTF-8.
+ *
+ * The first byte (MSB) value is an index to the b1_tbl, such as
+ * u8_common_b1_tbl and u8_composition_b1_tbl tables. A b1_tbl has
+ * indices to b2_tbl tables that have indices to b3_tbl. Each b3_tbl has
+ * either indices to b4_tbl or indices to b4_tbl and base values for
+ * displacement calculations later, using the u8_displacement_t type
+ * below. Each b4_tbl table then has indices to the final tables.
+ *
+ * As an example, if we have a character with the code value U+1D15E, which
+ * is 0xF0 0x9D 0x85 0x9E in UTF-8, the target decomposition character bytes
+ * that will be mapped by the mapping procedure would be the ones between
+ * the start_index and the end_index computed as follows:
+ *
+ * b2_tbl_id = u8_common_b1_tbl[0][0xF0];
+ * b3_tbl_id = u8_decomp_b2_tbl[0][b2_tbl_id][0x9D];
+ * b4_tbl_id = u8_decomp_b3_tbl[0][b3_tbl_id][0x85].tbl_id;
+ * b4_base = u8_decomp_b3_tbl[0][b3_tbl_id][0x85].base;
+ * if (b4_tbl_id >= 0x8000) {
+ * b4_tbl_id -= 0x8000;
+ * start_index = u8_decomp_b4_16bit_tbl[0][b4_tbl_id][0x9E];
+ * end_index = u8_decomp_b4_16bit_tbl[0][b4_tbl_id][0x9E + 1];
+ * } else {
+ * start_index = u8_decomp_b4_tbl[0][b4_tbl_id][0x9E];
+ * end_index = u8_decomp_b4_tbl[0][b4_tbl_id][0x9E + 1];
+ * }
+ *
+ * The start_index and the end_index can be used to retrieve the bytes,
+ * possibly of multiple UTF-8 characters, from the final tables.
+ *
+ * The "[0]" at the above indicates this is for Unicode Version 3.2.0 data
+ * as of today. Consequently, the "[1]" indicates another Unicode version
+ * data and it is Unicode 5.0.0 as of today.
+ *
+ * The mapping procedures and the data structures are more or less alike
+ * among the different mappings. You might want to read u8_textprep.c
+ * for specific details.
+ *
+ * The tool programs created and used to generate the tables in this file
+ * are saved at PSARC/2007/149/materials/ as the tools.tar.gz file.
+ */
+
+/* The following is a component type for the b4_tbl vectors. */
+typedef struct {
+ uint16_t tbl_id;
+ uint16_t base;
+} u8_displacement_t;
+
+/*
+ * The U8_TBL_ELEMENT_NOT_DEF macro indicates a byte that is not defined or
+ * used. The U8_TBL_ELEMENT_FILLER indicates the end of a UTF-8 character in
+ * the final tables.
+ */
+#define U8_TBL_ELEMENT_NOT_DEF (0xff)
+#define N_ U8_TBL_ELEMENT_NOT_DEF
+
+#define U8_TBL_ELEMENT_FILLER (0xf7)
+#define FIL_ U8_TBL_ELEMENT_FILLER
+
+/*
+ * The common b1_tbl for combining class, decompositions, tolower, and
+ * toupper case conversion mappings.
+ */
+static const uchar_t u8_common_b1_tbl[2][256] = {
+ {
+ 0, N_, N_, N_, N_, N_, N_, N_,
+ N_, N_, N_, N_, N_, N_, N_, N_,
+ N_, N_, N_, N_, N_, N_, N_, N_,
+ N_, N_, N_, N_, N_, N_, N_, N_,
+ N_, N_, N_, N_, N_, N_, N_, N_,
+ N_, N_, N_, N_, N_, N_, N_, N_,
+ N_, N_, N_, N_, N_, N_, N_, N_,
+ N_, N_, N_, N_, N_, N_, N_, N_,
+ N_, N_, N_, N_, N_, N_, N_, N_,
+ N_, N_, N_, N_, N_, N_, N_, N_,
+ N_, N_, N_, N_, N_, N_, N_, N_,
+ N_, N_, N_, N_, N_, N_, N_, N_,
+ N_, N_, N_, N_, N_, N_, N_, N_,
+ N_, N_, N_, N_, N_, N_, N_, N_,
+ N_, N_, N_, N_, N_, N_, N_, N_,
+ N_, N_, N_, N_, N_, N_, N_, N_,
+ N_, N_, N_, N_, N_, N_, N_, N_,
+ N_, N_, N_, N_, N_, N_, N_, N_,
+ N_, N_, N_, N_, N_, N_, N_, N_,
+ N_, N_, N_, N_, N_, N_, N_, N_,
+ N_, N_, N_, N_, N_, N_, N_, N_,
+ N_, N_, N_, N_, N_, N_, N_, N_,
+ N_, N_, N_, N_, N_, N_, N_, N_,
+ N_, N_, N_, N_, N_, N_, N_, N_,
+ N_, N_, N_, N_, N_, N_, N_, N_,
+ N_, N_, N_, N_, N_, N_, N_, N_,
+ N_, N_, N_, N_, N_, N_, N_, N_,
+ N_, N_, N_, N_, N_, N_, N_, N_,
+ N_, N_, N_, N_, N_, N_, N_, N_,
+ N_, N_, N_, N_, N_, N_, N_, N_,
+ 1, N_, N_, N_, N_, N_, N_, N_,
+ N_, N_, N_, N_, N_, N_, N_, N_,
+ },
+ {
+ 0, N_, N_, N_, N_, N_, N_, N_,
+ N_, N_, N_, N_, N_, N_, N_, N_,
+ N_, N_, N_, N_, N_, N_, N_, N_,
+ N_, N_, N_, N_, N_, N_, N_, N_,
+ N_, N_, N_, N_, N_, N_, N_, N_,
+ N_, N_, N_, N_, N_, N_, N_, N_,
+ N_, N_, N_, N_, N_, N_, N_, N_,
+ N_, N_, N_, N_, N_, N_, N_, N_,
+ N_, N_, N_, N_, N_, N_, N_, N_,
+ N_, N_, N_, N_, N_, N_, N_, N_,
+ N_, N_, N_, N_, N_, N_, N_, N_,
+ N_, N_, N_, N_, N_, N_, N_, N_,
+ N_, N_, N_, N_, N_, N_, N_, N_,
+ N_, N_, N_, N_, N_, N_, N_, N_,
+ N_, N_, N_, N_, N_, N_, N_, N_,
+ N_, N_, N_, N_, N_, N_, N_, N_,
+ N_, N_, N_, N_, N_, N_, N_, N_,
+ N_, N_, N_, N_, N_, N_, N_, N_,
+ N_, N_, N_, N_, N_, N_, N_, N_,
+ N_, N_, N_, N_, N_, N_, N_, N_,
+ N_, N_, N_, N_, N_, N_, N_, N_,
+ N_, N_, N_, N_, N_, N_, N_, N_,
+ N_, N_, N_, N_, N_, N_, N_, N_,
+ N_, N_, N_, N_, N_, N_, N_, N_,
+ N_, N_, N_, N_, N_, N_, N_, N_,
+ N_, N_, N_, N_, N_, N_, N_, N_,
+ N_, N_, N_, N_, N_, N_, N_, N_,
+ N_, N_, N_, N_, N_, N_, N_, N_,
+ N_, N_, N_, N_, N_, N_, N_, N_,
+ N_, N_, N_, N_, N_, N_, N_, N_,
+ 1, N_, N_, N_, N_, N_, N_, N_,
+ N_, N_, N_, N_, N_, N_, N_, N_,
+ },
+};
+
+static const uchar_t u8_combining_class_b2_tbl[2][2][256] = {
+ {
+ {
+ 0, N_, N_, N_, N_, N_, N_, N_,
+ N_, N_, N_, N_, N_, N_, N_, N_,
+ N_, N_, N_, N_, N_, N_, N_, N_,
+ N_, N_, N_, N_, N_, N_, N_, N_,
+ N_, N_, N_, N_, N_, N_, N_, N_,
+ N_, N_, N_, N_, N_, N_, N_, N_,
+ N_, N_, N_, N_, N_, N_, N_, N_,
+ N_, N_, N_, N_, N_, N_, N_, N_,
+ N_, N_, N_, N_, N_, N_, N_, N_,
+ N_, N_, N_, N_, N_, N_, N_, N_,
+ N_, N_, N_, N_, N_, N_, N_, N_,
+ N_, N_, N_, N_, N_, N_, N_, N_,
+ N_, N_, N_, N_, N_, N_, N_, N_,
+ N_, N_, N_, N_, N_, N_, N_, N_,
+ N_, N_, N_, N_, N_, N_, N_, N_,
+ N_, N_, N_, N_, N_, N_, N_, N_,
+ N_, N_, N_, N_, N_, N_, N_, N_,
+ N_, N_, N_, N_, N_, N_, N_, N_,
+ N_, N_, N_, N_, N_, N_, N_, N_,
+ N_, N_, N_, N_, N_, N_, N_, N_,
+ N_, N_, N_, N_, N_, N_, N_, N_,
+ N_, N_, N_, N_, N_, N_, N_, N_,
+ N_, N_, N_, N_, N_, N_, N_, N_,
+ N_, N_, N_, N_, N_, N_, N_, N_,
+ N_, N_, N_, N_, N_, N_, N_, N_,
+ N_, N_, N_, N_, N_, N_, N_, N_,
+ N_, N_, N_, N_, N_, N_, N_, N_,
+ N_, N_, N_, N_, N_, N_, N_, N_,
+ 1, 2, 3, 4, N_, N_, N_, N_,
+ N_, N_, N_, N_, N_, N_, N_, 5,
+ N_, N_, N_, N_, N_, N_, N_, N_,
+ N_, N_, N_, N_, N_, N_, N_, N_,
+ },
+ {
+ N_, N_, N_, N_, N_, N_, N_, N_,
+ N_, N_, N_, N_, N_, N_, N_, N_,
+ N_, N_, N_, N_, N_, N_, N_, N_,
+ N_, N_, N_, N_, N_, N_, N_, N_,
+ N_, N_, N_, N_, N_, N_, N_, N_,
+ N_, N_, N_, N_, N_, N_, N_, N_,
+ N_, N_, N_, N_, N_, N_, N_, N_,
+ N_, N_, N_, N_, N_, N_, N_, N_,
+ N_, N_, N_, N_, N_, N_, N_, N_,
+ N_, N_, N_, N_, N_, N_, N_, N_,
+ N_, N_, N_, N_, N_, N_, N_, N_,
+ N_, N_, N_, N_, N_, N_, N_, N_,
+ N_, N_, N_, N_, N_, N_, N_, N_,
+ N_, N_, N_, N_, N_, N_, N_, N_,
+ N_, N_, N_, N_, N_, N_, N_, N_,
+ N_, N_, N_, N_, N_, N_, N_, N_,
+ N_, N_, N_, N_, N_, N_, N_, N_,
+ N_, N_, N_, N_, N_, N_, N_, N_,
+ N_, N_, N_, N_, N_, N_, N_, N_,
+ N_, N_, N_, N_, N_, 6, N_, N_,
+ N_, N_, N_, N_, N_, N_, N_, N_,
+ N_, N_, N_, N_, N_, N_, N_, N_,
+ N_, N_, N_, N_, N_, N_, N_, N_,
+ N_, N_, N_, N_, N_, N_, N_, N_,
+ N_, N_, N_, N_, N_, N_, N_, N_,
+ N_, N_, N_, N_, N_, N_, N_, N_,
+ N_, N_, N_, N_, N_, N_, N_, N_,
+ N_, N_, N_, N_, N_, N_, N_, N_,
+ N_, N_, N_, N_, N_, N_, N_, N_,
+ N_, N_, N_, N_, N_, N_, N_, N_,
+ N_, N_, N_, N_, N_, N_, N_, N_,
+ N_, N_, N_, N_, N_, N_, N_, N_,
+ },
+
+ },
+ {
+ {
+ 0, N_, N_, N_, N_, N_, N_, N_,
+ N_, N_, N_, N_, N_, N_, N_, N_,
+ N_, N_, N_, N_, N_, N_, N_, N_,
+ N_, N_, N_, N_, N_, N_, N_, N_,
+ N_, N_, N_, N_, N_, N_, N_, N_,
+ N_, N_, N_, N_, N_, N_, N_, N_,
+ N_, N_, N_, N_, N_, N_, N_, N_,
+ N_, N_, N_, N_, N_, N_, N_, N_,
+ N_, N_, N_, N_, N_, N_, N_, N_,
+ N_, N_, N_, N_, N_, N_, N_, N_,
+ N_, N_, N_, N_, N_, N_, N_, N_,
+ N_, N_, N_, N_, N_, N_, N_, N_,
+ N_, N_, N_, N_, N_, N_, N_, N_,
+ N_, N_, N_, N_, N_, N_, N_, N_,
+ N_, N_, N_, N_, N_, N_, N_, N_,
+ N_, N_, N_, N_, N_, N_, N_, N_,
+ N_, N_, N_, N_, N_, N_, N_, N_,
+ N_, N_, N_, N_, N_, N_, N_, N_,
+ N_, N_, N_, N_, N_, N_, N_, N_,
+ N_, N_, N_, N_, N_, N_, N_, N_,
+ N_, N_, N_, N_, N_, N_, N_, N_,
+ N_, N_, N_, N_, N_, N_, N_, N_,
+ N_, N_, N_, N_, N_, N_, N_, N_,
+ N_, N_, N_, N_, N_, N_, N_, N_,
+ N_, N_, N_, N_, N_, N_, N_, N_,
+ N_, N_, N_, N_, N_, N_, N_, N_,
+ N_, N_, N_, N_, N_, N_, N_, N_,
+ N_, N_, N_, N_, N_, N_, N_, N_,
+ 1, 2, 3, 4, N_, N_, N_, N_,
+ N_, N_, 5, N_, N_, N_, N_, 6,
+ N_, N_, N_, N_, N_, N_, N_, N_,
+ N_, N_, N_, N_, N_, N_, N_, N_,
+ },
+ {
+ N_, N_, N_, N_, N_, N_, N_, N_,
+ N_, N_, N_, N_, N_, N_, N_, N_,
+ N_, N_, N_, N_, N_, N_, N_, N_,
+ N_, N_, N_, N_, N_, N_, N_, N_,
+ N_, N_, N_, N_, N_, N_, N_, N_,
+ N_, N_, N_, N_, N_, N_, N_, N_,
+ N_, N_, N_, N_, N_, N_, N_, N_,
+ N_, N_, N_, N_, N_, N_, N_, N_,
+ N_, N_, N_, N_, N_, N_, N_, N_,
+ N_, N_, N_, N_, N_, N_, N_, N_,
+ N_, N_, N_, N_, N_, N_, N_, N_,
+ N_, N_, N_, N_, N_, N_, N_, N_,
+ N_, N_, N_, N_, N_, N_, N_, N_,
+ N_, N_, N_, N_, N_, N_, N_, N_,
+ N_, N_, N_, N_, N_, N_, N_, N_,
+ N_, N_, N_, N_, N_, N_, N_, N_,
+ N_, N_, N_, N_, N_, N_, N_, N_,
+ N_, N_, N_, N_, N_, N_, N_, N_,
+ 7, N_, N_, N_, N_, N_, N_, N_,
+ N_, N_, N_, N_, N_, 8, N_, N_,
+ N_, N_, N_, N_, N_, N_, N_, N_,
+ N_, N_, N_, N_, N_, N_, N_, N_,
+ N_, N_, N_, N_, N_, N_, N_, N_,
+ N_, N_, N_, N_, N_, N_, N_, N_,
+ N_, N_, N_, N_, N_, N_, N_, N_,
+ N_, N_, N_, N_, N_, N_, N_, N_,
+ N_, N_, N_, N_, N_, N_, N_, N_,
+ N_, N_, N_, N_, N_, N_, N_, N_,
+ N_, N_, N_, N_, N_, N_, N_, N_,
+ N_, N_, N_, N_, N_, N_, N_, N_,
+ N_, N_, N_, N_, N_, N_, N_, N_,
+ N_, N_, N_, N_, N_, N_, N_, N_,
+ },
+
+ },
+
+};
+
+static const uchar_t u8_combining_class_b3_tbl[2][9][256] = {
+ {
+ { /* Third byte table 0. */
+ N_, N_, N_, N_, N_, N_, N_, N_,
+ N_, N_, N_, N_, N_, N_, N_, N_,
+ N_, N_, N_, N_, N_, N_, N_, N_,
+ N_, N_, N_, N_, N_, N_, N_, N_,
+ N_, N_, N_, N_, N_, N_, N_, N_,
+ N_, N_, N_, N_, N_, N_, N_, N_,
+ N_, N_, N_, N_, N_, N_, N_, N_,
+ N_, N_, N_, N_, N_, N_, N_, N_,
+ N_, N_, N_, N_, N_, N_, N_, N_,
+ N_, N_, N_, N_, N_, N_, N_, N_,
+ N_, N_, N_, N_, N_, N_, N_, N_,
+ N_, N_, N_, N_, N_, N_, N_, N_,
+ N_, N_, N_, N_, N_, N_, N_, N_,
+ N_, N_, N_, N_, N_, N_, N_, N_,
+ N_, N_, N_, N_, N_, N_, N_, N_,
+ N_, N_, N_, N_, N_, N_, N_, N_,
+ N_, N_, N_, N_, N_, N_, N_, N_,
+ N_, N_, N_, N_, N_, N_, N_, N_,
+ N_, N_, N_, N_, N_, N_, N_, N_,
+ N_, N_, N_, N_, N_, N_, N_, N_,
+ N_, N_, N_, N_, N_, N_, N_, N_,
+ N_, N_, N_, N_, N_, N_, N_, N_,
+ N_, N_, N_, N_, N_, N_, N_, N_,
+ N_, N_, N_, N_, N_, N_, N_, N_,
+ N_, N_, N_, N_, N_, N_, N_, N_,
+ N_, N_, N_, N_, 0, 1, N_, N_,
+ N_, N_, 2, N_, N_, N_, 3, 4,
+ N_, 5, N_, 6, 7, 8, N_, N_,
+ N_, N_, N_, N_, N_, N_, N_, N_,
+ N_, N_, N_, N_, N_, N_, N_, N_,
+ N_, N_, N_, N_, N_, N_, N_, N_,
+ N_, N_, N_, N_, N_, N_, N_, N_,
+ },
+ { /* Third byte table 1. */
+ N_, N_, N_, N_, N_, N_, N_, N_,
+ N_, N_, N_, N_, N_, N_, N_, N_,
+ N_, N_, N_, N_, N_, N_, N_, N_,
+ N_, N_, N_, N_, N_, N_, N_, N_,
+ N_, N_, N_, N_, N_, N_, N_, N_,
+ N_, N_, N_, N_, N_, N_, N_, N_,
+ N_, N_, N_, N_, N_, N_, N_, N_,
+ N_, N_, N_, N_, N_, N_, N_, N_,
+ N_, N_, N_, N_, N_, N_, N_, N_,
+ N_, N_, N_, N_, N_, N_, N_, N_,
+ N_, N_, N_, N_, N_, N_, N_, N_,
+ N_, N_, N_, N_, N_, N_, N_, N_,
+ N_, N_, N_, N_, N_, N_, N_, N_,
+ N_, N_, N_, N_, N_, N_, N_, N_,
+ N_, N_, N_, N_, N_, N_, N_, N_,
+ N_, N_, N_, N_, N_, N_, N_, N_,
+ N_, N_, N_, N_, N_, N_, N_, N_,
+ N_, N_, N_, N_, N_, N_, N_, N_,
+ N_, N_, N_, N_, N_, N_, N_, N_,
+ N_, N_, N_, N_, N_, N_, N_, N_,
+ N_, N_, N_, N_, 9, 10, 11, 12,
+ 13, 14, 15, 16, 17, 18, N_, 19,
+ N_, 20, N_, 21, N_, 22, N_, 23,
+ 24, 25, 26, 27, 28, 29, 30, 31,
+ N_, N_, N_, N_, N_, N_, N_, N_,
+ N_, N_, N_, N_, N_, N_, N_, N_,
+ N_, N_, N_, N_, N_, N_, N_, N_,
+ N_, N_, N_, N_, N_, N_, N_, N_,
+ N_, N_, N_, N_, N_, N_, N_, N_,
+ N_, N_, N_, N_, N_, N_, N_, N_,
+ N_, N_, N_, N_, N_, N_, N_, N_,
+ N_, N_, N_, N_, N_, N_, N_, N_,
+ },
+ { /* Third byte table 2. */
+ N_, N_, N_, N_, N_, N_, N_, N_,
+ N_, N_, N_, N_, N_, N_, N_, N_,
+ N_, N_, N_, N_, N_, N_, N_, N_,
+ N_, N_, N_, N_, N_, N_, N_, N_,
+ N_, N_, N_, N_, N_, N_, N_, N_,
+ N_, N_, N_, N_, N_, N_, N_, N_,
+ N_, N_, N_, N_, N_, N_, N_, N_,
+ N_, N_, N_, N_, N_, N_, N_, N_,
+ N_, N_, N_, N_, N_, N_, N_, N_,
+ N_, N_, N_, N_, N_, N_, N_, N_,
+ N_, N_, N_, N_, N_, N_, N_, N_,
+ N_, N_, N_, N_, N_, N_, N_, N_,
+ N_, N_, N_, N_, N_, N_, N_, N_,
+ N_, N_, N_, N_, N_, N_, N_, N_,
+ N_, N_, N_, N_, N_, N_, N_, N_,
+ N_, N_, N_, N_, N_, N_, N_, N_,
+ 32, N_, N_, N_, N_, N_, N_, N_,
+ N_, N_, N_, N_, N_, N_, N_, N_,
+ N_, N_, N_, N_, N_, N_, N_, N_,
+ N_, N_, N_, N_, 33, N_, N_, 34,
+ N_, N_, 35, N_, N_, N_, N_, N_,
+ N_, N_, N_, N_, N_, N_, N_, N_,
+ N_, N_, N_, N_, N_, N_, N_, N_,
+ N_, N_, N_, N_, N_, N_, N_, N_,
+ N_, N_, N_, N_, N_, N_, N_, N_,
+ N_, N_, N_, N_, N_, N_, N_, N_,
+ N_, N_, N_, N_, N_, N_, N_, N_,
+ N_, N_, N_, N_, N_, N_, N_, N_,
+ N_, N_, N_, N_, N_, N_, N_, N_,
+ N_, N_, N_, N_, N_, N_, N_, N_,
+ N_, N_, N_, N_, N_, N_, N_, N_,
+ N_, N_, N_, N_, N_, N_, N_, N_,
+ },
+ { /* Third byte table 3. */
+ N_, N_, N_, N_, N_, N_, N_, N_,
+ N_, N_, N_, N_, N_, N_, N_, N_,
+ N_, N_, N_, N_, N_, N_, N_, N_,
+ N_, N_, N_, N_, N_, N_, N_, N_,
+ N_, N_, N_, N_, N_, N_, N_, N_,
+ N_, N_, N_, N_, N_, N_, N_, N_,
+ N_, N_, N_, N_, N_, N_, N_, N_,
+ N_, N_, N_, N_, N_, N_, N_, N_,
+ N_, N_, N_, N_, N_, N_, N_, N_,
+ N_, N_, N_, N_, N_, N_, N_, N_,
+ N_, N_, N_, N_, N_, N_, N_, N_,
+ N_, N_, N_, N_, N_, N_, N_, N_,
+ N_, N_, N_, N_, N_, N_, N_, N_,
+ N_, N_, N_, N_, N_, N_, N_, N_,
+ N_, N_, N_, N_, N_, N_, N_, N_,
+ N_, N_, N_, N_, N_, N_, N_, N_,
+ N_, N_, N_, 36, N_, N_, N_, N_,
+ N_, N_, N_, N_, N_, N_, N_, N_,
+ N_, N_, N_, N_, N_, N_, N_, N_,
+ N_, N_, N_, N_, N_, N_, N_, N_,
+ N_, N_, N_, N_, N_, N_, N_, N_,
+ N_, N_, N_, N_, N_, N_, N_, N_,
+ N_, N_, N_, N_, N_, N_, N_, N_,
+ N_, N_, N_, N_, N_, N_, N_, N_,
+ N_, N_, N_, N_, N_, N_, N_, N_,
+ N_, N_, N_, N_, N_, N_, N_, N_,
+ N_, N_, N_, N_, N_, N_, N_, N_,
+ N_, N_, N_, N_, N_, N_, N_, N_,
+ N_, N_, N_, N_, N_, N_, N_, N_,
+ N_, N_, N_, N_, N_, N_, N_, N_,
+ N_, N_, N_, N_, N_, N_, N_, N_,
+ N_, N_, N_, N_, N_, N_, N_, N_,
+ },
+ { /* Third byte table 4. */
+ N_, N_, N_, N_, N_, N_, N_, N_,
+ N_, N_, N_, N_, N_, N_, N_, N_,
+ N_, N_, N_, N_, N_, N_, N_, N_,
+ N_, N_, N_, N_, N_, N_, N_, N_,
+ N_, N_, N_, N_, N_, N_, N_, N_,
+ N_, N_, N_, N_, N_, N_, N_, N_,
+ N_, N_, N_, N_, N_, N_, N_, N_,
+ N_, N_, N_, N_, N_, N_, N_, N_,
+ N_, N_, N_, N_, N_, N_, N_, N_,
+ N_, N_, N_, N_, N_, N_, N_, N_,
+ N_, N_, N_, N_, N_, N_, N_, N_,
+ N_, N_, N_, N_, N_, N_, N_, N_,
+ N_, N_, N_, N_, N_, N_, N_, N_,
+ N_, N_, N_, N_, N_, N_, N_, N_,
+ N_, N_, N_, N_, N_, N_, N_, N_,
+ N_, N_, N_, N_, N_, N_, N_, N_,
+ 37, N_, 38, N_, N_, N_, N_, N_,
+ N_, N_, N_, N_, N_, N_, N_, N_,
+ N_, N_, N_, N_, N_, N_, N_, N_,
+ N_, N_, N_, N_, N_, N_, N_, N_,
+ N_, N_, N_, N_, N_, N_, N_, N_,
+ N_, N_, N_, N_, N_, N_, N_, N_,
+ N_, N_, N_, N_, N_, N_, N_, N_,
+ N_, N_, N_, N_, N_, N_, N_, N_,
+ N_, N_, N_, N_, N_, N_, N_, N_,
+ N_, N_, N_, N_, N_, N_, N_, N_,
+ N_, N_, N_, N_, N_, N_, N_, N_,
+ N_, N_, N_, N_, N_, N_, N_, N_,
+ N_, N_, N_, N_, N_, N_, N_, N_,
+ N_, N_, N_, N_, N_, N_, N_, N_,
+ N_, N_, N_, N_, N_, N_, N_, N_,
+ N_, N_, N_, N_, N_, N_, N_, N_,
+ },
+ { /* Third byte table 5. */
+ N_, N_, N_, N_, N_, N_, N_, N_,
+ N_, N_, N_, N_, N_, N_, N_, N_,
+ N_, N_, N_, N_, N_, N_, N_, N_,
+ N_, N_, N_, N_, N_, N_, N_, N_,
+ N_, N_, N_, N_, N_, N_, N_, N_,
+ N_, N_, N_, N_, N_, N_, N_, N_,
+ N_, N_, N_, N_, N_, N_, N_, N_,
+ N_, N_, N_, N_, N_, N_, N_, N_,
+ N_, N_, N_, N_, N_, N_, N_, N_,
+ N_, N_, N_, N_, N_, N_, N_, N_,
+ N_, N_, N_, N_, N_, N_, N_, N_,
+ N_, N_, N_, N_, N_, N_, N_, N_,
+ N_, N_, N_, N_, N_, N_, N_, N_,
+ N_, N_, N_, N_, N_, N_, N_, N_,
+ N_, N_, N_, N_, N_, N_, N_, N_,
+ N_, N_, N_, N_, N_, N_, N_, N_,
+ N_, N_, N_, N_, N_, N_, N_, N_,
+ N_, N_, N_, N_, N_, N_, N_, N_,
+ N_, N_, N_, N_, N_, N_, N_, N_,
+ N_, N_, N_, N_, N_, N_, N_, N_,
+ N_, N_, N_, N_, N_, N_, N_, N_,
+ N_, N_, N_, N_, 39, N_, N_, N_,
+ N_, N_, N_, N_, N_, N_, N_, N_,
+ 40, N_, N_, N_, N_, N_, N_, N_,
+ N_, N_, N_, N_, N_, N_, N_, N_,
+ N_, N_, N_, N_, N_, N_, N_, N_,
+ N_, N_, N_, N_, N_, N_, N_, N_,
+ N_, N_, N_, N_, N_, N_, N_, N_,
+ N_, N_, N_, N_, N_, N_, N_, N_,
+ N_, N_, N_, N_, N_, N_, N_, N_,
+ N_, N_, N_, N_, N_, N_, N_, N_,
+ N_, N_, N_, N_, N_, N_, N_, N_,
+ },
+ { /* Third byte table 6. */
+ N_, N_, N_, N_, N_, N_, N_, N_,
+ N_, N_, N_, N_, N_, N_, N_, N_,
+ N_, N_, N_, N_, N_, N_, N_, N_,
+ N_, N_, N_, N_, N_, N_, N_, N_,
+ N_, N_, N_, N_, N_, N_, N_, N_,
+ N_, N_, N_, N_, N_, N_, N_, N_,
+ N_, N_, N_, N_, N_, N_, N_, N_,
+ N_, N_, N_, N_, N_, N_, N_, N_,
+ N_, N_, N_, N_, N_, N_, N_, N_,
+ N_, N_, N_, N_, N_, N_, N_, N_,
+ N_, N_, N_, N_, N_, N_, N_, N_,
+ N_, N_, N_, N_, N_, N_, N_, N_,
+ N_, N_, N_, N_, N_, N_, N_, N_,
+ N_, N_, N_, N_, N_, N_, N_, N_,
+ N_, N_, N_, N_, N_, N_, N_, N_,
+ N_, N_, N_, N_, N_, N_, N_, N_,
+ N_, N_, N_, N_, N_, 41, 42, N_,
+ N_, N_, N_, N_, N_, N_, N_, N_,
+ N_, N_, N_, N_, N_, N_, N_, N_,
+ N_, N_, N_, N_, N_, N_, N_, N_,
+ N_, N_, N_, N_, N_, N_, N_, N_,
+ N_, N_, N_, N_, N_, N_, N_, N_,
+ N_, N_, N_, N_, N_, N_, N_, N_,
+ N_, N_, N_, N_, N_, N_, N_, N_,
+ N_, N_, N_, N_, N_, N_, N_, N_,
+ N_, N_, N_, N_, N_, N_, N_, N_,
+ N_, N_, N_, N_, N_, N_, N_, N_,
+ N_, N_, N_, N_, N_, N_, N_, N_,
+ N_, N_, N_, N_, N_, N_, N_, N_,
+ N_, N_, N_, N_, N_, N_, N_, N_,
+ N_, N_, N_, N_, N_, N_, N_, N_,
+ N_, N_, N_, N_, N_, N_, N_, N_,
+ },
+ { /* Third byte table 7. */
+ N_, N_, N_, N_, N_, N_, N_, N_,
+ N_, N_, N_, N_, N_, N_, N_, N_,
+ N_, N_, N_, N_, N_, N_, N_, N_,
+ N_, N_, N_, N_, N_, N_, N_, N_,
+ N_, N_, N_, N_, N_, N_, N_, N_,
+ N_, N_, N_, N_, N_, N_, N_, N_,
+ N_, N_, N_, N_, N_, N_, N_, N_,
+ N_, N_, N_, N_, N_, N_, N_, N_,
+ N_, N_, N_, N_, N_, N_, N_, N_,
+ N_, N_, N_, N_, N_, N_, N_, N_,
+ N_, N_, N_, N_, N_, N_, N_, N_,
+ N_, N_, N_, N_, N_, N_, N_, N_,
+ N_, N_, N_, N_, N_, N_, N_, N_,
+ N_, N_, N_, N_, N_, N_, N_, N_,
+ N_, N_, N_, N_, N_, N_, N_, N_,
+ N_, N_, N_, N_, N_, N_, N_, N_,
+ N_, N_, N_, N_, N_, N_, N_, N_,
+ N_, N_, N_, N_, N_, N_, N_, N_,
+ N_, N_, N_, N_, N_, N_, N_, N_,
+ N_, N_, N_, N_, N_, N_, N_, N_,
+ N_, N_, N_, N_, N_, N_, N_, N_,
+ N_, N_, N_, N_, N_, N_, N_, N_,
+ N_, N_, N_, N_, N_, N_, N_, N_,
+ N_, N_, N_, N_, N_, N_, N_, N_,
+ N_, N_, N_, N_, N_, N_, N_, N_,
+ N_, N_, N_, N_, N_, N_, N_, N_,
+ N_, N_, N_, N_, N_, N_, N_, N_,
+ N_, N_, N_, N_, N_, N_, N_, N_,
+ N_, N_, N_, N_, N_, N_, N_, N_,
+ N_, N_, N_, N_, N_, N_, N_, N_,
+ N_, N_, N_, N_, N_, N_, N_, N_,
+ N_, N_, N_, N_, N_, N_, N_, N_,
+ },
+ { /* Third byte table 8. */
+ N_, N_, N_, N_, N_, N_, N_, N_,
+ N_, N_, N_, N_, N_, N_, N_, N_,
+ N_, N_, N_, N_, N_, N_, N_, N_,
+ N_, N_, N_, N_, N_, N_, N_, N_,
+ N_, N_, N_, N_, N_, N_, N_, N_,
+ N_, N_, N_, N_, N_, N_, N_, N_,
+ N_, N_, N_, N_, N_, N_, N_, N_,
+ N_, N_, N_, N_, N_, N_, N_, N_,
+ N_, N_, N_, N_, N_, N_, N_, N_,
+ N_, N_, N_, N_, N_, N_, N_, N_,
+ N_, N_, N_, N_, N_, N_, N_, N_,
+ N_, N_, N_, N_, N_, N_, N_, N_,
+ N_, N_, N_, N_, N_, N_, N_, N_,
+ N_, N_, N_, N_, N_, N_, N_, N_,
+ N_, N_, N_, N_, N_, N_, N_, N_,
+ N_, N_, N_, N_, N_, N_, N_, N_,
+ N_, N_, N_, N_, N_, N_, N_, N_,
+ N_, N_, N_, N_, N_, N_, N_, N_,
+ N_, N_, N_, N_, N_, N_, N_, N_,
+ N_, N_, N_, N_, N_, N_, N_, N_,
+ N_, N_, N_, N_, N_, N_, N_, N_,
+ N_, N_, N_, N_, N_, N_, N_, N_,
+ N_, N_, N_, N_, N_, N_, N_, N_,
+ N_, N_, N_, N_, N_, N_, N_, N_,
+ N_, N_, N_, N_, N_, N_, N_, N_,
+ N_, N_, N_, N_, N_, N_, N_, N_,
+ N_, N_, N_, N_, N_, N_, N_, N_,
+ N_, N_, N_, N_, N_, N_, N_, N_,
+ N_, N_, N_, N_, N_, N_, N_, N_,
+ N_, N_, N_, N_, N_, N_, N_, N_,
+ N_, N_, N_, N_, N_, N_, N_, N_,
+ N_, N_, N_, N_, N_, N_, N_, N_,
+ },
+ },
+ {
+ { /* Third byte table 0. */
+ N_, N_, N_, N_, N_, N_, N_, N_,
+ N_, N_, N_, N_, N_, N_, N_, N_,
+ N_, N_, N_, N_, N_, N_, N_, N_,
+ N_, N_, N_, N_, N_, N_, N_, N_,
+ N_, N_, N_, N_, N_, N_, N_, N_,
+ N_, N_, N_, N_, N_, N_, N_, N_,
+ N_, N_, N_, N_, N_, N_, N_, N_,
+ N_, N_, N_, N_, N_, N_, N_, N_,
+ N_, N_, N_, N_, N_, N_, N_, N_,
+ N_, N_, N_, N_, N_, N_, N_, N_,
+ N_, N_, N_, N_, N_, N_, N_, N_,
+ N_, N_, N_, N_, N_, N_, N_, N_,
+ N_, N_, N_, N_, N_, N_, N_, N_,
+ N_, N_, N_, N_, N_, N_, N_, N_,
+ N_, N_, N_, N_, N_, N_, N_, N_,
+ N_, N_, N_, N_, N_, N_, N_, N_,
+ N_, N_, N_, N_, N_, N_, N_, N_,
+ N_, N_, N_, N_, N_, N_, N_, N_,
+ N_, N_, N_, N_, N_, N_, N_, N_,
+ N_, N_, N_, N_, N_, N_, N_, N_,
+ N_, N_, N_, N_, N_, N_, N_, N_,
+ N_, N_, N_, N_, N_, N_, N_, N_,
+ N_, N_, N_, N_, N_, N_, N_, N_,
+ N_, N_, N_, N_, N_, N_, N_, N_,
+ N_, N_, N_, N_, N_, N_, N_, N_,
+ N_, N_, N_, N_, 0, 1, N_, N_,
+ N_, N_, 2, N_, N_, N_, 3, 4,
+ 5, 6, N_, 7, 8, 9, N_, 10,
+ N_, N_, N_, N_, N_, N_, N_, N_,
+ N_, N_, N_, N_, N_, N_, N_, N_,
+ N_, N_, N_, N_, N_, N_, N_, N_,
+ N_, N_, N_, N_, N_, N_, N_, N_,
+ },
+ { /* Third byte table 1. */
+ N_, N_, N_, N_, N_, N_, N_, N_,
+ N_, N_, N_, N_, N_, N_, N_, N_,
+ N_, N_, N_, N_, N_, N_, N_, N_,
+ N_, N_, N_, N_, N_, N_, N_, N_,
+ N_, N_, N_, N_, N_, N_, N_, N_,
+ N_, N_, N_, N_, N_, N_, N_, N_,
+ N_, N_, N_, N_, N_, N_, N_, N_,
+ N_, N_, N_, N_, N_, N_, N_, N_,
+ N_, N_, N_, N_, N_, N_, N_, N_,
+ N_, N_, N_, N_, N_, N_, N_, N_,
+ N_, N_, N_, N_, N_, N_, N_, N_,
+ N_, N_, N_, N_, N_, N_, N_, N_,
+ N_, N_, N_, N_, N_, N_, N_, N_,
+ N_, N_, N_, N_, N_, N_, N_, N_,
+ N_, N_, N_, N_, N_, N_, N_, N_,
+ N_, N_, N_, N_, N_, N_, N_, N_,
+ N_, N_, N_, N_, N_, N_, N_, N_,
+ N_, N_, N_, N_, N_, N_, N_, N_,
+ N_, N_, N_, N_, N_, N_, N_, N_,
+ N_, N_, N_, N_, N_, N_, N_, N_,
+ N_, N_, N_, N_, 11, 12, 13, 14,
+ 15, 16, 17, 18, 19, 20, N_, 21,
+ N_, 22, 23, 24, N_, 25, N_, 26,
+ 27, 28, 29, 30, 31, 32, 33, 34,
+ N_, N_, N_, N_, N_, N_, N_, N_,
+ N_, N_, N_, N_, N_, N_, N_, N_,
+ N_, N_, N_, N_, N_, N_, N_, N_,
+ N_, N_, N_, N_, N_, N_, N_, N_,
+ N_, N_, N_, N_, N_, N_, N_, N_,
+ N_, N_, N_, N_, N_, N_, N_, N_,
+ N_, N_, N_, N_, N_, N_, N_, N_,
+ N_, N_, N_, N_, N_, N_, N_, N_,
+ },
+ { /* Third byte table 2. */
+ N_, N_, N_, N_, N_, N_, N_, N_,
+ N_, N_, N_, N_, N_, N_, N_, N_,
+ N_, N_, N_, N_, N_, N_, N_, N_,
+ N_, N_, N_, N_, N_, N_, N_, N_,
+ N_, N_, N_, N_, N_, N_, N_, N_,
+ N_, N_, N_, N_, N_, N_, N_, N_,
+ N_, N_, N_, N_, N_, N_, N_, N_,
+ N_, N_, N_, N_, N_, N_, N_, N_,
+ N_, N_, N_, N_, N_, N_, N_, N_,
+ N_, N_, N_, N_, N_, N_, N_, N_,
+ N_, N_, N_, N_, N_, N_, N_, N_,
+ N_, N_, N_, N_, N_, N_, N_, N_,
+ N_, N_, N_, N_, N_, N_, N_, N_,
+ N_, N_, N_, N_, N_, N_, N_, N_,
+ N_, N_, N_, N_, N_, N_, N_, N_,
+ N_, N_, N_, N_, N_, N_, N_, N_,
+ 35, N_, N_, N_, N_, N_, N_, N_,
+ N_, N_, N_, N_, N_, 36, N_, N_,
+ N_, N_, N_, N_, N_, N_, N_, N_,
+ N_, N_, N_, N_, 37, N_, N_, 38,
+ N_, N_, 39, N_, 40, N_, N_, N_,
+ 41, N_, N_, N_, 42, 43, N_, N_,
+ N_, N_, N_, N_, N_, N_, N_, 44,
+ N_, N_, N_, N_, N_, N_, N_, N_,
+ N_, N_, N_, N_, N_, N_, N_, N_,
+ N_, N_, N_, N_, N_, N_, N_, N_,
+ N_, N_, N_, N_, N_, N_, N_, N_,
+ N_, N_, N_, N_, N_, N_, N_, N_,
+ N_, N_, N_, N_, N_, N_, N_, N_,
+ N_, N_, N_, N_, N_, N_, N_, N_,
+ N_, N_, N_, N_, N_, N_, N_, N_,
+ N_, N_, N_, N_, N_, N_, N_, N_,
+ },
+ { /* Third byte table 3. */
+ N_, N_, N_, N_, N_, N_, N_, N_,
+ N_, N_, N_, N_, N_, N_, N_, N_,
+ N_, N_, N_, N_, N_, N_, N_, N_,
+ N_, N_, N_, N_, N_, N_, N_, N_,
+ N_, N_, N_, N_, N_, N_, N_, N_,
+ N_, N_, N_, N_, N_, N_, N_, N_,
+ N_, N_, N_, N_, N_, N_, N_, N_,
+ N_, N_, N_, N_, N_, N_, N_, N_,
+ N_, N_, N_, N_, N_, N_, N_, N_,
+ N_, N_, N_, N_, N_, N_, N_, N_,
+ N_, N_, N_, N_, N_, N_, N_, N_,
+ N_, N_, N_, N_, N_, N_, N_, N_,
+ N_, N_, N_, N_, N_, N_, N_, N_,
+ N_, N_, N_, N_, N_, N_, N_, N_,
+ N_, N_, N_, N_, N_, N_, N_, N_,
+ N_, N_, N_, N_, N_, N_, N_, N_,
+ N_, N_, N_, 45, N_, N_, N_, N_,
+ N_, N_, N_, N_, N_, N_, N_, N_,
+ N_, N_, N_, N_, N_, N_, N_, N_,
+ N_, N_, N_, N_, N_, N_, N_, N_,
+ N_, N_, N_, N_, N_, N_, N_, N_,
+ N_, N_, N_, N_, N_, N_, N_, N_,
+ N_, N_, N_, N_, N_, N_, N_, N_,
+ N_, N_, N_, N_, N_, N_, N_, N_,
+ N_, N_, N_, N_, N_, N_, N_, N_,
+ N_, N_, N_, N_, N_, N_, N_, N_,
+ N_, N_, N_, N_, N_, N_, N_, N_,
+ N_, N_, N_, N_, N_, N_, N_, N_,
+ N_, N_, N_, N_, N_, N_, N_, N_,
+ N_, N_, N_, N_, N_, N_, N_, N_,
+ N_, N_, N_, N_, N_, N_, N_, N_,
+ N_, N_, N_, N_, N_, N_, N_, N_,
+ },
+ { /* Third byte table 4. */
+ N_, N_, N_, N_, N_, N_, N_, N_,
+ N_, N_, N_, N_, N_, N_, N_, N_,
+ N_, N_, N_, N_, N_, N_, N_, N_,
+ N_, N_, N_, N_, N_, N_, N_, N_,
+ N_, N_, N_, N_, N_, N_, N_, N_,
+ N_, N_, N_, N_, N_, N_, N_, N_,
+ N_, N_, N_, N_, N_, N_, N_, N_,
+ N_, N_, N_, N_, N_, N_, N_, N_,
+ N_, N_, N_, N_, N_, N_, N_, N_,
+ N_, N_, N_, N_, N_, N_, N_, N_,
+ N_, N_, N_, N_, N_, N_, N_, N_,
+ N_, N_, N_, N_, N_, N_, N_, N_,
+ N_, N_, N_, N_, N_, N_, N_, N_,
+ N_, N_, N_, N_, N_, N_, N_, N_,
+ N_, N_, N_, N_, N_, N_, N_, N_,
+ N_, N_, N_, N_, N_, N_, N_, N_,
+ 46, N_, 47, N_, N_, N_, N_, N_,
+ N_, N_, N_, N_, N_, N_, N_, N_,
+ N_, N_, N_, N_, N_, N_, N_, N_,
+ N_, N_, N_, N_, N_, N_, N_, N_,
+ N_, N_, N_, N_, N_, N_, N_, N_,
+ N_, N_, N_, N_, N_, N_, N_, N_,
+ N_, N_, N_, N_, N_, N_, N_, N_,
+ N_, N_, N_, N_, N_, N_, N_, N_,
+ N_, N_, N_, N_, N_, N_, N_, N_,
+ N_, N_, N_, N_, N_, N_, N_, N_,
+ N_, N_, N_, N_, N_, N_, N_, N_,
+ N_, N_, N_, N_, N_, N_, N_, N_,
+ N_, N_, N_, N_, N_, N_, N_, N_,
+ N_, N_, N_, N_, N_, N_, N_, N_,
+ N_, N_, N_, N_, N_, N_, N_, N_,
+ N_, N_, N_, N_, N_, N_, N_, N_,
+ },
+ { /* Third byte table 5. */
+ N_, N_, N_, N_, N_, N_, N_, N_,
+ N_, N_, N_, N_, N_, N_, N_, N_,
+ N_, N_, N_, N_, N_, N_, N_, N_,
+ N_, N_, N_, N_, N_, N_, N_, N_,
+ N_, N_, N_, N_, N_, N_, N_, N_,
+ N_, N_, N_, N_, N_, N_, N_, N_,
+ N_, N_, N_, N_, N_, N_, N_, N_,
+ N_, N_, N_, N_, N_, N_, N_, N_,
+ N_, N_, N_, N_, N_, N_, N_, N_,
+ N_, N_, N_, N_, N_, N_, N_, N_,
+ N_, N_, N_, N_, N_, N_, N_, N_,
+ N_, N_, N_, N_, N_, N_, N_, N_,
+ N_, N_, N_, N_, N_, N_, N_, N_,
+ N_, N_, N_, N_, N_, N_, N_, N_,
+ N_, N_, N_, N_, N_, N_, N_, N_,
+ N_, N_, N_, N_, N_, N_, N_, N_,
+ N_, N_, N_, N_, N_, N_, N_, N_,
+ N_, N_, N_, N_, N_, N_, N_, N_,
+ N_, N_, N_, N_, N_, N_, N_, N_,
+ N_, N_, N_, N_, N_, N_, N_, N_,
+ 48, N_, N_, N_, N_, N_, N_, N_,
+ N_, N_, N_, N_, N_, N_, N_, N_,
+ N_, N_, N_, N_, N_, N_, N_, N_,
+ N_, N_, N_, N_, N_, N_, N_, N_,
+ N_, N_, N_, N_, N_, N_, N_, N_,
+ N_, N_, N_, N_, N_, N_, N_, N_,
+ N_, N_, N_, N_, N_, N_, N_, N_,
+ N_, N_, N_, N_, N_, N_, N_, N_,
+ N_, N_, N_, N_, N_, N_, N_, N_,
+ N_, N_, N_, N_, N_, N_, N_, N_,
+ N_, N_, N_, N_, N_, N_, N_, N_,
+ N_, N_, N_, N_, N_, N_, N_, N_,
+ },
+ { /* Third byte table 6. */
+ N_, N_, N_, N_, N_, N_, N_, N_,
+ N_, N_, N_, N_, N_, N_, N_, N_,
+ N_, N_, N_, N_, N_, N_, N_, N_,
+ N_, N_, N_, N_, N_, N_, N_, N_,
+ N_, N_, N_, N_, N_, N_, N_, N_,
+ N_, N_, N_, N_, N_, N_, N_, N_,
+ N_, N_, N_, N_, N_, N_, N_, N_,
+ N_, N_, N_, N_, N_, N_, N_, N_,
+ N_, N_, N_, N_, N_, N_, N_, N_,
+ N_, N_, N_, N_, N_, N_, N_, N_,
+ N_, N_, N_, N_, N_, N_, N_, N_,
+ N_, N_, N_, N_, N_, N_, N_, N_,
+ N_, N_, N_, N_, N_, N_, N_, N_,
+ N_, N_, N_, N_, N_, N_, N_, N_,
+ N_, N_, N_, N_, N_, N_, N_, N_,
+ N_, N_, N_, N_, N_, N_, N_, N_,
+ N_, N_, N_, N_, N_, N_, N_, N_,
+ N_, N_, N_, N_, N_, N_, N_, N_,
+ N_, N_, N_, N_, N_, N_, N_, N_,
+ N_, N_, N_, N_, N_, N_, N_, N_,
+ N_, N_, N_, N_, N_, N_, N_, N_,
+ N_, N_, N_, N_, 49, N_, N_, N_,
+ N_, N_, N_, N_, N_, N_, N_, N_,
+ 50, N_, N_, N_, N_, N_, N_, N_,
+ N_, N_, N_, N_, N_, N_, N_, N_,
+ N_, N_, N_, N_, N_, N_, N_, N_,
+ N_, N_, N_, N_, N_, N_, N_, N_,
+ N_, N_, N_, N_, N_, N_, N_, N_,
+ N_, N_, N_, N_, N_, N_, N_, N_,
+ N_, N_, N_, N_, N_, N_, N_, N_,
+ N_, N_, N_, N_, N_, N_, N_, N_,
+ N_, N_, N_, N_, N_, N_, N_, N_,
+ },
+ { /* Third byte table 7. */
+ N_, N_, N_, N_, N_, N_, N_, N_,
+ N_, N_, N_, N_, N_, N_, N_, N_,
+ N_, N_, N_, N_, N_, N_, N_, N_,
+ N_, N_, N_, N_, N_, N_, N_, N_,
+ N_, N_, N_, N_, N_, N_, N_, N_,
+ N_, N_, N_, N_, N_, N_, N_, N_,
+ N_, N_, N_, N_, N_, N_, N_, N_,
+ N_, N_, N_, N_, N_, N_, N_, N_,
+ N_, N_, N_, N_, N_, N_, N_, N_,
+ N_, N_, N_, N_, N_, N_, N_, N_,
+ N_, N_, N_, N_, N_, N_, N_, N_,
+ N_, N_, N_, N_, N_, N_, N_, N_,
+ N_, N_, N_, N_, N_, N_, N_, N_,
+ N_, N_, N_, N_, N_, N_, N_, N_,
+ N_, N_, N_, N_, N_, N_, N_, N_,
+ N_, N_, N_, N_, N_, N_, N_, N_,
+ N_, N_, N_, N_, N_, N_, N_, N_,
+ N_, N_, N_, N_, N_, N_, N_, N_,
+ N_, N_, N_, N_, N_, N_, N_, N_,
+ N_, N_, N_, N_, N_, N_, N_, N_,
+ N_, N_, N_, N_, N_, N_, N_, N_,
+ 51, N_, N_, N_, N_, N_, N_, N_,
+ N_, N_, N_, N_, N_, N_, N_, N_,
+ N_, N_, N_, N_, N_, N_, N_, N_,
+ N_, N_, N_, N_, N_, N_, N_, N_,
+ N_, N_, N_, N_, N_, N_, N_, N_,
+ N_, N_, N_, N_, N_, N_, N_, N_,
+ N_, N_, N_, N_, N_, N_, N_, N_,
+ N_, N_, N_, N_, N_, N_, N_, N_,
+ N_, N_, N_, N_, N_, N_, N_, N_,
+ N_, N_, N_, N_, N_, N_, N_, N_,
+ N_, N_, N_, N_, N_, N_, N_, N_,
+ },
+ { /* Third byte table 8. */
+ N_, N_, N_, N_, N_, N_, N_, N_,
+ N_, N_, N_, N_, N_, N_, N_, N_,
+ N_, N_, N_, N_, N_, N_, N_, N_,
+ N_, N_, N_, N_, N_, N_, N_, N_,
+ N_, N_, N_, N_, N_, N_, N_, N_,
+ N_, N_, N_, N_, N_, N_, N_, N_,
+ N_, N_, N_, N_, N_, N_, N_, N_,
+ N_, N_, N_, N_, N_, N_, N_, N_,
+ N_, N_, N_, N_, N_, N_, N_, N_,
+ N_, N_, N_, N_, N_, N_, N_, N_,
+ N_, N_, N_, N_, N_, N_, N_, N_,
+ N_, N_, N_, N_, N_, N_, N_, N_,
+ N_, N_, N_, N_, N_, N_, N_, N_,
+ N_, N_, N_, N_, N_, N_, N_, N_,
+ N_, N_, N_, N_, N_, N_, N_, N_,
+ N_, N_, N_, N_, N_, N_, N_, N_,
+ N_, N_, N_, N_, N_, 52, 53, N_,
+ N_, 54, N_, N_, N_, N_, N_, N_,
+ N_, N_, N_, N_, N_, N_, N_, N_,
+ N_, N_, N_, N_, N_, N_, N_, N_,
+ N_, N_, N_, N_, N_, N_, N_, N_,
+ N_, N_, N_, N_, N_, N_, N_, N_,
+ N_, N_, N_, N_, N_, N_, N_, N_,
+ N_, N_, N_, N_, N_, N_, N_, N_,
+ N_, N_, N_, N_, N_, N_, N_, N_,
+ N_, N_, N_, N_, N_, N_, N_, N_,
+ N_, N_, N_, N_, N_, N_, N_, N_,
+ N_, N_, N_, N_, N_, N_, N_, N_,
+ N_, N_, N_, N_, N_, N_, N_, N_,
+ N_, N_, N_, N_, N_, N_, N_, N_,
+ N_, N_, N_, N_, N_, N_, N_, N_,
+ N_, N_, N_, N_, N_, N_, N_, N_,
+ },
+ },
+};
+
+/*
+ * Unlike other b4_tbl, the b4_tbl for combining class data has
+ * the combining class values not indices to the final tables.
+ */
+static const uchar_t u8_combining_class_b4_tbl[2][55][256] = {
+ {
+ { /* Fourth byte table 0. */
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 230, 230, 230, 230, 230, 230, 230, 230,
+ 230, 230, 230, 230, 230, 230, 230, 230,
+ 230, 230, 230, 230, 230, 232, 220, 220,
+ 220, 220, 232, 216, 220, 220, 220, 220,
+ 220, 202, 202, 220, 220, 220, 220, 202,
+ 202, 220, 220, 220, 220, 220, 220, 220,
+ 220, 220, 220, 220, 1, 1, 1, 1,
+ 1, 220, 220, 220, 220, 230, 230, 230,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ },
+ { /* Fourth byte table 1. */
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 230, 230, 230, 230, 230, 240, 230, 220,
+ 220, 220, 230, 230, 230, 220, 220, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 234, 234, 233, 230, 230, 230, 230, 230,
+ 230, 230, 230, 230, 230, 230, 230, 230,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ },
+ { /* Fourth byte table 2. */
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 230, 230, 230, 230, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ },
+ { /* Fourth byte table 3. */
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 220, 230, 230, 230, 230, 220, 230,
+ 230, 230, 222, 220, 230, 230, 230, 230,
+ 230, 230, 0, 220, 220, 220, 220, 220,
+ 230, 230, 220, 230, 230, 222, 228, 230,
+ 10, 11, 12, 13, 14, 15, 16, 17,
+ 18, 19, 0, 20, 21, 22, 0, 23,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ },
+ { /* Fourth byte table 4. */
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 24, 25, 0, 230, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ },
+ { /* Fourth byte table 5. */
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 27, 28, 29, 30, 31,
+ 32, 33, 34, 230, 230, 220, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 35, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ },
+ { /* Fourth byte table 6. */
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 230, 230,
+ 230, 230, 230, 230, 230, 0, 0, 230,
+ 230, 230, 230, 220, 230, 0, 0, 230,
+ 230, 0, 220, 230, 230, 220, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ },
+ { /* Fourth byte table 7. */
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 36, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 230, 220, 230, 230, 220, 230, 230, 220,
+ 220, 220, 230, 220, 220, 230, 220, 230,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ },
+ { /* Fourth byte table 8. */
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 230, 230, 220, 230, 220, 230, 220, 230,
+ 220, 230, 230, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ },
+ { /* Fourth byte table 9. */
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 7, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ },
+ { /* Fourth byte table 10. */
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 9, 0, 0,
+ 0, 230, 220, 230, 230, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ },
+ { /* Fourth byte table 11. */
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 7, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ },
+ { /* Fourth byte table 12. */
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 9, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ },
+ { /* Fourth byte table 13. */
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 7, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ },
+ { /* Fourth byte table 14. */
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 9, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ },
+ { /* Fourth byte table 15. */
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 7, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ },
+ { /* Fourth byte table 16. */
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 9, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ },
+ { /* Fourth byte table 17. */
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 7, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ },
+ { /* Fourth byte table 18. */
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 9, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ },
+ { /* Fourth byte table 19. */
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 9, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ },
+ { /* Fourth byte table 20. */
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 9, 0, 0,
+ 0, 0, 0, 0, 0, 84, 91, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ },
+ { /* Fourth byte table 21. */
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 9, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ },
+ { /* Fourth byte table 22. */
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 9, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ },
+ { /* Fourth byte table 23. */
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 9, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ },
+ { /* Fourth byte table 24. */
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 103, 103, 9, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ },
+ { /* Fourth byte table 25. */
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 107, 107, 107, 107, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ },
+ { /* Fourth byte table 26. */
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 118, 118, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ },
+ { /* Fourth byte table 27. */
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 122, 122, 122, 122, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ },
+ { /* Fourth byte table 28. */
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 220, 220, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 220, 0, 220,
+ 0, 216, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ },
+ { /* Fourth byte table 29. */
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 129, 130, 0, 132, 0, 0, 0,
+ 0, 0, 130, 130, 130, 130, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ },
+ { /* Fourth byte table 30. */
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 130, 0, 230, 230, 9, 0, 230, 230,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ },
+ { /* Fourth byte table 31. */
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 220, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ },
+ { /* Fourth byte table 32. */
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 7,
+ 0, 9, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ },
+ { /* Fourth byte table 33. */
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 9, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 9, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ },
+ { /* Fourth byte table 34. */
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 9, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ },
+ { /* Fourth byte table 35. */
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 228, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ },
+ { /* Fourth byte table 36. */
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 230, 230, 1, 1, 230, 230, 230, 230,
+ 1, 1, 1, 230, 230, 0, 0, 0,
+ 0, 230, 0, 0, 0, 1, 1, 230,
+ 220, 230, 1, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ },
+ { /* Fourth byte table 37. */
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 218, 228, 232, 222, 224, 224,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ },
+ { /* Fourth byte table 38. */
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 8, 8, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ },
+ { /* Fourth byte table 39. */
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 26, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ },
+ { /* Fourth byte table 40. */
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 230, 230, 230, 230, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ },
+ { /* Fourth byte table 41. */
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 216, 216, 1,
+ 1, 1, 0, 0, 0, 226, 216, 216,
+ 216, 216, 216, 0, 0, 0, 0, 0,
+ 0, 0, 0, 220, 220, 220, 220, 220,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ },
+ { /* Fourth byte table 42. */
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 220, 220, 220, 0, 0, 230, 230, 230,
+ 230, 230, 220, 220, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 230, 230, 230, 230, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ },
+ { /* Fourth byte table 43. */
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ },
+ { /* Fourth byte table 44. */
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ },
+ { /* Fourth byte table 45. */
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ },
+ { /* Fourth byte table 46. */
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ },
+ { /* Fourth byte table 47. */
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ },
+ { /* Fourth byte table 48. */
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ },
+ { /* Fourth byte table 49. */
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ },
+ { /* Fourth byte table 50. */
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ },
+ { /* Fourth byte table 51. */
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ },
+ { /* Fourth byte table 52. */
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ },
+ { /* Fourth byte table 53. */
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ },
+ { /* Fourth byte table 54. */
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ },
+ },
+ {
+ { /* Fourth byte table 0. */
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 230, 230, 230, 230, 230, 230, 230, 230,
+ 230, 230, 230, 230, 230, 230, 230, 230,
+ 230, 230, 230, 230, 230, 232, 220, 220,
+ 220, 220, 232, 216, 220, 220, 220, 220,
+ 220, 202, 202, 220, 220, 220, 220, 202,
+ 202, 220, 220, 220, 220, 220, 220, 220,
+ 220, 220, 220, 220, 1, 1, 1, 1,
+ 1, 220, 220, 220, 220, 230, 230, 230,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ },
+ { /* Fourth byte table 1. */
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 230, 230, 230, 230, 230, 240, 230, 220,
+ 220, 220, 230, 230, 230, 220, 220, 0,
+ 230, 230, 230, 220, 220, 220, 220, 230,
+ 232, 220, 220, 230, 233, 234, 234, 233,
+ 234, 234, 233, 230, 230, 230, 230, 230,
+ 230, 230, 230, 230, 230, 230, 230, 230,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ },
+ { /* Fourth byte table 2. */
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 230, 230, 230, 230, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ },
+ { /* Fourth byte table 3. */
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 220, 230, 230, 230, 230, 220, 230,
+ 230, 230, 222, 220, 230, 230, 230, 230,
+ 230, 230, 220, 220, 220, 220, 220, 220,
+ 230, 230, 220, 230, 230, 222, 228, 230,
+ 10, 11, 12, 13, 14, 15, 16, 17,
+ 18, 19, 19, 20, 21, 22, 0, 23,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ },
+ { /* Fourth byte table 4. */
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 24, 25, 0, 230, 220, 0, 18,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ },
+ { /* Fourth byte table 5. */
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 230, 230, 230, 230, 230, 230, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ },
+ { /* Fourth byte table 6. */
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 27, 28, 29, 30, 31,
+ 32, 33, 34, 230, 230, 220, 220, 230,
+ 230, 230, 230, 230, 220, 230, 230, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 35, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ },
+ { /* Fourth byte table 7. */
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 230, 230,
+ 230, 230, 230, 230, 230, 0, 0, 230,
+ 230, 230, 230, 220, 230, 0, 0, 230,
+ 230, 0, 220, 230, 230, 220, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ },
+ { /* Fourth byte table 8. */
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 36, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 230, 220, 230, 230, 220, 230, 230, 220,
+ 220, 220, 230, 220, 220, 230, 220, 230,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ },
+ { /* Fourth byte table 9. */
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 230, 230, 220, 230, 220, 230, 220, 230,
+ 220, 230, 230, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ },
+ { /* Fourth byte table 10. */
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 230, 230, 230, 230, 230,
+ 230, 230, 220, 230, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ },
+ { /* Fourth byte table 11. */
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 7, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ },
+ { /* Fourth byte table 12. */
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 9, 0, 0,
+ 0, 230, 220, 230, 230, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ },
+ { /* Fourth byte table 13. */
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 7, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ },
+ { /* Fourth byte table 14. */
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 9, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ },
+ { /* Fourth byte table 15. */
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 7, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ },
+ { /* Fourth byte table 16. */
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 9, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ },
+ { /* Fourth byte table 17. */
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 7, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ },
+ { /* Fourth byte table 18. */
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 9, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ },
+ { /* Fourth byte table 19. */
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 7, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ },
+ { /* Fourth byte table 20. */
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 9, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ },
+ { /* Fourth byte table 21. */
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 9, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ },
+ { /* Fourth byte table 22. */
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 9, 0, 0,
+ 0, 0, 0, 0, 0, 84, 91, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ },
+ { /* Fourth byte table 23. */
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 7, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ },
+ { /* Fourth byte table 24. */
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 9, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ },
+ { /* Fourth byte table 25. */
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 9, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ },
+ { /* Fourth byte table 26. */
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 9, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ },
+ { /* Fourth byte table 27. */
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 103, 103, 9, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ },
+ { /* Fourth byte table 28. */
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 107, 107, 107, 107, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ },
+ { /* Fourth byte table 29. */
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 118, 118, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ },
+ { /* Fourth byte table 30. */
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 122, 122, 122, 122, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ },
+ { /* Fourth byte table 31. */
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 220, 220, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 220, 0, 220,
+ 0, 216, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ },
+ { /* Fourth byte table 32. */
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 129, 130, 0, 132, 0, 0, 0,
+ 0, 0, 130, 130, 130, 130, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ },
+ { /* Fourth byte table 33. */
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 130, 0, 230, 230, 9, 0, 230, 230,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ },
+ { /* Fourth byte table 34. */
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 220, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ },
+ { /* Fourth byte table 35. */
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 7,
+ 0, 9, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ },
+ { /* Fourth byte table 36. */
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 230,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ },
+ { /* Fourth byte table 37. */
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 9, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 9, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ },
+ { /* Fourth byte table 38. */
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 9, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 230, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ },
+ { /* Fourth byte table 39. */
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 228, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ },
+ { /* Fourth byte table 40. */
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 222, 230, 220, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ },
+ { /* Fourth byte table 41. */
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 230,
+ 220, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ },
+ { /* Fourth byte table 42. */
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 7, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ },
+ { /* Fourth byte table 43. */
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 9, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 230, 220, 230, 230, 230,
+ 230, 230, 230, 230, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ },
+ { /* Fourth byte table 44. */
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 230, 230, 220, 230, 230, 230, 230, 230,
+ 230, 230, 220, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 230, 220,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ },
+ { /* Fourth byte table 45. */
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 230, 230, 1, 1, 230, 230, 230, 230,
+ 1, 1, 1, 230, 230, 0, 0, 0,
+ 0, 230, 0, 0, 0, 1, 1, 230,
+ 220, 230, 1, 1, 220, 220, 220, 220,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ },
+ { /* Fourth byte table 46. */
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 218, 228, 232, 222, 224, 224,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ },
+ { /* Fourth byte table 47. */
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 8, 8, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ },
+ { /* Fourth byte table 48. */
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 9, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ },
+ { /* Fourth byte table 49. */
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 26, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ },
+ { /* Fourth byte table 50. */
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 230, 230, 230, 230, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ },
+ { /* Fourth byte table 51. */
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 220, 0, 230,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 230, 1, 220, 0, 0, 0, 0, 9,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ },
+ { /* Fourth byte table 52. */
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 216, 216, 1,
+ 1, 1, 0, 0, 0, 226, 216, 216,
+ 216, 216, 216, 0, 0, 0, 0, 0,
+ 0, 0, 0, 220, 220, 220, 220, 220,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ },
+ { /* Fourth byte table 53. */
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 220, 220, 220, 0, 0, 230, 230, 230,
+ 230, 230, 220, 220, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 230, 230, 230, 230, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ },
+ { /* Fourth byte table 54. */
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 230, 230, 230, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ },
+ },
+};
+
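The nested initializers above complete the final level of a byte-indexed combining-class trie: each 256-entry block is selected through the preceding levels by the raw bytes of a UTF-8 sequence, and the leaf value is the character's Unicode canonical combining class (0 for starters, with 220 and 230 covering most below- and above-base diacritics). As a minimal, self-contained sketch of how such a trie is walked, assuming toy table names and a two-level shape rather than the four-level identifiers declared elsewhere in u8_textprep.c:

/*
 * Illustrative sketch only. The table names and two-level shape are
 * assumptions standing in for the larger b1..b4 tables in this file;
 * the data itself is real: U+0300 (UTF-8 0xCC 0x80) has combining
 * class 230 and U+0320 (UTF-8 0xCC 0xA0) has combining class 220.
 */
#include <stddef.h>

static const unsigned char cc_b1[256] = {
	[0xCC] = 1,			/* lead byte 0xCC -> subtable 1 */
};
static const unsigned char cc_b2[2][256] = {
	{ 0 },				/* subtable 0: no combining data */
	{ [0x80] = 230, [0xA0] = 220 },	/* subtable 1: keyed on byte 2 */
};

/* Return the combining class of a 2-byte UTF-8 sequence, else 0. */
static unsigned char
cc_lookup(const unsigned char *s, size_t sz)
{
	if (sz != 2)
		return (0);
	return (cc_b2[cc_b1[s[0]]][s[1]]);
}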
+static const uchar_t u8_composition_b1_tbl[2][256] = {
+ {
+ 0, N_, N_, N_, N_, N_, N_, N_,
+ N_, N_, N_, N_, N_, N_, N_, N_,
+ N_, N_, N_, N_, N_, N_, N_, N_,
+ N_, N_, N_, N_, N_, N_, N_, N_,
+ N_, N_, N_, N_, N_, N_, N_, N_,
+ N_, N_, N_, N_, N_, N_, N_, N_,
+ N_, N_, N_, N_, N_, N_, N_, N_,
+ N_, N_, N_, N_, N_, N_, N_, N_,
+ N_, N_, N_, N_, N_, N_, N_, N_,
+ N_, N_, N_, N_, N_, N_, N_, N_,
+ N_, N_, N_, N_, N_, N_, N_, N_,
+ N_, N_, N_, N_, N_, N_, N_, N_,
+ N_, N_, N_, N_, N_, N_, N_, N_,
+ N_, N_, N_, N_, N_, N_, N_, N_,
+ N_, N_, N_, N_, N_, N_, N_, N_,
+ N_, N_, N_, N_, N_, N_, N_, N_,
+ N_, N_, N_, N_, N_, N_, N_, N_,
+ N_, N_, N_, N_, N_, N_, N_, N_,
+ N_, N_, N_, N_, N_, N_, N_, N_,
+ N_, N_, N_, N_, N_, N_, N_, N_,
+ N_, N_, N_, N_, N_, N_, N_, N_,
+ N_, N_, N_, N_, N_, N_, N_, N_,
+ N_, N_, N_, N_, N_, N_, N_, N_,
+ N_, N_, N_, N_, N_, N_, N_, N_,
+ N_, N_, N_, N_, N_, N_, N_, N_,
+ N_, N_, N_, N_, N_, N_, N_, N_,
+ N_, N_, N_, N_, N_, N_, N_, N_,
+ N_, N_, N_, N_, N_, N_, N_, N_,
+ N_, N_, N_, N_, N_, N_, N_, N_,
+ N_, N_, N_, N_, N_, N_, N_, N_,
+ N_, N_, N_, N_, N_, N_, N_, N_,
+ N_, N_, N_, N_, N_, N_, N_, N_,
+ },
+ {
+ 0, N_, N_, N_, N_, N_, N_, N_,
+ N_, N_, N_, N_, N_, N_, N_, N_,
+ N_, N_, N_, N_, N_, N_, N_, N_,
+ N_, N_, N_, N_, N_, N_, N_, N_,
+ N_, N_, N_, N_, N_, N_, N_, N_,
+ N_, N_, N_, N_, N_, N_, N_, N_,
+ N_, N_, N_, N_, N_, N_, N_, N_,
+ N_, N_, N_, N_, N_, N_, N_, N_,
+ N_, N_, N_, N_, N_, N_, N_, N_,
+ N_, N_, N_, N_, N_, N_, N_, N_,
+ N_, N_, N_, N_, N_, N_, N_, N_,
+ N_, N_, N_, N_, N_, N_, N_, N_,
+ N_, N_, N_, N_, N_, N_, N_, N_,
+ N_, N_, N_, N_, N_, N_, N_, N_,
+ N_, N_, N_, N_, N_, N_, N_, N_,
+ N_, N_, N_, N_, N_, N_, N_, N_,
+ N_, N_, N_, N_, N_, N_, N_, N_,
+ N_, N_, N_, N_, N_, N_, N_, N_,
+ N_, N_, N_, N_, N_, N_, N_, N_,
+ N_, N_, N_, N_, N_, N_, N_, N_,
+ N_, N_, N_, N_, N_, N_, N_, N_,
+ N_, N_, N_, N_, N_, N_, N_, N_,
+ N_, N_, N_, N_, N_, N_, N_, N_,
+ N_, N_, N_, N_, N_, N_, N_, N_,
+ N_, N_, N_, N_, N_, N_, N_, N_,
+ N_, N_, N_, N_, N_, N_, N_, N_,
+ N_, N_, N_, N_, N_, N_, N_, N_,
+ N_, N_, N_, N_, N_, N_, N_, N_,
+ N_, N_, N_, N_, N_, N_, N_, N_,
+ N_, N_, N_, N_, N_, N_, N_, N_,
+ N_, N_, N_, N_, N_, N_, N_, N_,
+ N_, N_, N_, N_, N_, N_, N_, N_,
+ },
+};
+
+static const uchar_t u8_composition_b2_tbl[2][1][256] = {
+ {
+ {
+ 0, N_, N_, N_, N_, N_, N_, N_,
+ N_, N_, N_, N_, N_, N_, N_, N_,
+ N_, N_, N_, N_, N_, N_, N_, N_,
+ N_, N_, N_, N_, N_, N_, N_, N_,
+ N_, N_, N_, N_, N_, N_, N_, N_,
+ N_, N_, N_, N_, N_, N_, N_, N_,
+ N_, N_, N_, N_, N_, N_, N_, N_,
+ N_, N_, N_, N_, N_, N_, N_, N_,
+ N_, N_, N_, N_, N_, N_, N_, N_,
+ N_, N_, N_, N_, N_, N_, N_, N_,
+ N_, N_, N_, N_, N_, N_, N_, N_,
+ N_, N_, N_, N_, N_, N_, N_, N_,
+ N_, N_, N_, N_, N_, N_, N_, N_,
+ N_, N_, N_, N_, N_, N_, N_, N_,
+ N_, N_, N_, N_, N_, N_, N_, N_,
+ N_, N_, N_, N_, N_, N_, N_, N_,
+ N_, N_, N_, N_, N_, N_, N_, N_,
+ N_, N_, N_, N_, N_, N_, N_, N_,
+ N_, N_, N_, N_, N_, N_, N_, N_,
+ N_, N_, N_, N_, N_, N_, N_, N_,
+ N_, N_, N_, N_, N_, N_, N_, N_,
+ N_, N_, N_, N_, N_, N_, N_, N_,
+ N_, N_, N_, N_, N_, N_, N_, N_,
+ N_, N_, N_, N_, N_, N_, N_, N_,
+ N_, N_, N_, N_, N_, N_, N_, N_,
+ N_, N_, N_, N_, N_, N_, N_, N_,
+ N_, N_, N_, N_, N_, N_, N_, N_,
+ N_, N_, N_, N_, N_, N_, N_, N_,
+ 1, 2, 3, 4, N_, N_, N_, N_,
+ N_, N_, N_, N_, N_, N_, N_, N_,
+ N_, N_, N_, N_, N_, N_, N_, N_,
+ N_, N_, N_, N_, N_, N_, N_, N_,
+ },
+
+ },
+ {
+ {
+ 0, N_, N_, N_, N_, N_, N_, N_,
+ N_, N_, N_, N_, N_, N_, N_, N_,
+ N_, N_, N_, N_, N_, N_, N_, N_,
+ N_, N_, N_, N_, N_, N_, N_, N_,
+ N_, N_, N_, N_, N_, N_, N_, N_,
+ N_, N_, N_, N_, N_, N_, N_, N_,
+ N_, N_, N_, N_, N_, N_, N_, N_,
+ N_, N_, N_, N_, N_, N_, N_, N_,
+ N_, N_, N_, N_, N_, N_, N_, N_,
+ N_, N_, N_, N_, N_, N_, N_, N_,
+ N_, N_, N_, N_, N_, N_, N_, N_,
+ N_, N_, N_, N_, N_, N_, N_, N_,
+ N_, N_, N_, N_, N_, N_, N_, N_,
+ N_, N_, N_, N_, N_, N_, N_, N_,
+ N_, N_, N_, N_, N_, N_, N_, N_,
+ N_, N_, N_, N_, N_, N_, N_, N_,
+ N_, N_, N_, N_, N_, N_, N_, N_,
+ N_, N_, N_, N_, N_, N_, N_, N_,
+ N_, N_, N_, N_, N_, N_, N_, N_,
+ N_, N_, N_, N_, N_, N_, N_, N_,
+ N_, N_, N_, N_, N_, N_, N_, N_,
+ N_, N_, N_, N_, N_, N_, N_, N_,
+ N_, N_, N_, N_, N_, N_, N_, N_,
+ N_, N_, N_, N_, N_, N_, N_, N_,
+ N_, N_, N_, N_, N_, N_, N_, N_,
+ N_, N_, N_, N_, N_, N_, N_, N_,
+ N_, N_, N_, N_, N_, N_, N_, N_,
+ N_, N_, N_, N_, N_, N_, N_, N_,
+ 1, 2, 3, 4, N_, N_, N_, N_,
+ N_, N_, N_, N_, N_, N_, N_, N_,
+ N_, N_, N_, N_, N_, N_, N_, N_,
+ N_, N_, N_, N_, N_, N_, N_, N_,
+ },
+
+ },
+
+};
+
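The composition lookup served by the table that follows uses the same trie shape but carries more state per node: u8_composition_b1_tbl and u8_composition_b2_tbl narrow byte by byte (with N_ marking byte values that lead to no composition data), and each u8_displacement_t entry below pairs a final-level node index with a byte offset into packed composition data. Judging from initializers such as { 0x8000, 0 } and { 0x8001, 2491 }, the 0x8000 bit on an index appears to distinguish two kinds of final-level node, though that flag's definition and the exact byte consumed at each level live outside this hunk. A hedged sketch of the three-level walk, with hypothetical names for everything not visible in this diff:

/*
 * Illustrative sketch of the three-level walk. NODE_NONE stands in
 * for the N_ sentinel and FLAG_BIT for the assumed meaning of the
 * 0x8000 bit; both are defined outside this hunk, so treat them as
 * assumptions rather than the file's actual constants.
 */
#include <stdint.h>

#define	NODE_NONE	0xFF		/* assumed value of N_ */
#define	FLAG_BIT	0x8000U		/* assumed final-table selector */

typedef struct {
	uint16_t	index;		/* final-level node, plus flag */
	uint16_t	offset;		/* offset into packed data */
} disp_sketch_t;

/* Return the data offset for bytes s[0..2], or -1 if none exists. */
static int
comp_offset(const unsigned char b1[256],
    const unsigned char b2[][256],
    const disp_sketch_t b3[][256],
    const unsigned char *s)
{
	unsigned char i1, i2;
	disp_sketch_t d;

	if ((i1 = b1[s[0]]) == NODE_NONE)
		return (-1);
	if ((i2 = b2[i1][s[1]]) == NODE_NONE)
		return (-1);
	d = b3[i2][s[2]];
	if (d.index == NODE_NONE)
		return (-1);
	/* (d.index & FLAG_BIT) would select the alternate final table. */
	return ((int)d.offset);
}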
+static const u8_displacement_t u8_composition_b3_tbl[2][5][256] = {
+ {
+ { /* Third byte table 0. */
+ { 0x8000, 0 }, { N_, 0 }, { N_, 0 },
+ { N_, 0 }, { N_, 0 }, { N_, 0 },
+ { N_, 0 }, { N_, 0 }, { N_, 0 },
+ { N_, 0 }, { N_, 0 }, { N_, 0 },
+ { N_, 0 }, { N_, 0 }, { N_, 0 },
+ { N_, 0 }, { N_, 0 }, { N_, 0 },
+ { N_, 0 }, { N_, 0 }, { N_, 0 },
+ { N_, 0 }, { N_, 0 }, { N_, 0 },
+ { N_, 0 }, { N_, 0 }, { N_, 0 },
+ { N_, 0 }, { N_, 0 }, { N_, 0 },
+ { N_, 0 }, { N_, 0 }, { N_, 0 },
+ { N_, 0 }, { N_, 0 }, { N_, 0 },
+ { N_, 0 }, { N_, 0 }, { N_, 0 },
+ { N_, 0 }, { N_, 0 }, { N_, 0 },
+ { N_, 0 }, { N_, 0 }, { N_, 0 },
+ { N_, 0 }, { N_, 0 }, { N_, 0 },
+ { N_, 0 }, { N_, 0 }, { N_, 0 },
+ { N_, 0 }, { N_, 0 }, { N_, 0 },
+ { N_, 0 }, { N_, 0 }, { N_, 0 },
+ { N_, 0 }, { N_, 0 }, { N_, 0 },
+ { N_, 0 }, { N_, 0 }, { N_, 0 },
+ { N_, 0 }, { N_, 0 }, { N_, 0 },
+ { N_, 0 }, { N_, 0 }, { N_, 0 },
+ { N_, 0 }, { N_, 0 }, { N_, 0 },
+ { N_, 0 }, { N_, 0 }, { N_, 0 },
+ { N_, 0 }, { N_, 0 }, { N_, 0 },
+ { N_, 0 }, { N_, 0 }, { N_, 0 },
+ { N_, 0 }, { N_, 0 }, { N_, 0 },
+ { N_, 0 }, { N_, 0 }, { N_, 0 },
+ { N_, 0 }, { N_, 0 }, { N_, 0 },
+ { N_, 0 }, { N_, 0 }, { N_, 0 },
+ { N_, 0 }, { N_, 0 }, { N_, 0 },
+ { N_, 0 }, { N_, 0 }, { N_, 0 },
+ { N_, 0 }, { N_, 0 }, { N_, 0 },
+ { N_, 0 }, { N_, 0 }, { N_, 0 },
+ { N_, 0 }, { N_, 0 }, { N_, 0 },
+ { N_, 0 }, { N_, 0 }, { N_, 0 },
+ { N_, 0 }, { N_, 0 }, { N_, 0 },
+ { N_, 0 }, { N_, 0 }, { N_, 0 },
+ { N_, 0 }, { N_, 0 }, { N_, 0 },
+ { N_, 0 }, { N_, 0 }, { N_, 0 },
+ { N_, 0 }, { N_, 0 }, { N_, 0 },
+ { N_, 0 }, { N_, 0 }, { N_, 0 },
+ { N_, 0 }, { N_, 0 }, { N_, 0 },
+ { N_, 0 }, { N_, 0 }, { N_, 0 },
+ { N_, 0 }, { N_, 0 }, { N_, 0 },
+ { N_, 0 }, { N_, 0 }, { N_, 0 },
+ { N_, 0 }, { N_, 0 }, { N_, 0 },
+ { N_, 0 }, { N_, 0 }, { N_, 0 },
+ { N_, 0 }, { N_, 0 }, { N_, 0 },
+ { N_, 0 }, { N_, 0 }, { N_, 0 },
+ { N_, 0 }, { N_, 0 }, { N_, 0 },
+ { N_, 0 }, { N_, 0 }, { N_, 0 },
+ { N_, 0 }, { N_, 0 }, { N_, 0 },
+ { N_, 0 }, { N_, 0 }, { N_, 0 },
+ { N_, 0 }, { N_, 0 }, { N_, 0 },
+ { N_, 0 }, { N_, 0 }, { N_, 0 },
+ { N_, 0 }, { N_, 0 }, { N_, 0 },
+ { N_, 0 }, { N_, 0 }, { N_, 0 },
+ { N_, 0 }, { N_, 0 }, { N_, 0 },
+ { N_, 0 }, { N_, 0 }, { N_, 0 },
+ { N_, 0 }, { N_, 0 }, { N_, 0 },
+ { N_, 0 }, { N_, 0 }, { N_, 0 },
+ { N_, 0 }, { N_, 0 }, { N_, 0 },
+ { N_, 0 }, { N_, 0 }, { 0, 2470 },
+ { 0x8001, 2491 }, { 1, 2871 }, { 2, 2959 },
+ { 3, 3061 }, { 4, 3212 }, { 5, 3226 },
+ { N_, 0 }, { 6, 3270 }, { N_, 0 },
+ { N_, 0 }, { N_, 0 }, { 0x8002, 3277 },
+ { 7, 3774 }, { 8, 3949 }, { 9, 4198 },
+ { N_, 0 }, { 10, 4265 }, { N_, 0 },
+ { N_, 0 }, { N_, 0 }, { N_, 0 },
+ { 11, 4293 }, { 12, 4312 }, { N_, 0 },
+ { 13, 4326 }, { N_, 0 }, { N_, 0 },
+ { N_, 0 }, { N_, 0 }, { N_, 0 },
+ { N_, 0 }, { N_, 0 }, { N_, 0 },
+ { N_, 0 }, { N_, 0 }, { N_, 0 },
+ { N_, 0 }, { N_, 0 }, { N_, 0 },
+ { N_, 0 }, { N_, 0 }, { N_, 0 },
+ { N_, 0 }, { N_, 0 }, { N_, 0 },
+ { N_, 0 }, { N_, 0 }, { N_, 0 },
+ { N_, 0 }, { N_, 0 }, { N_, 0 },
+ { N_, 0 }, { N_, 0 }, { N_, 0 },
+ { N_, 0 }, { N_, 0 }, { N_, 0 },
+ { N_, 0 }, { N_, 0 }, { N_, 0 },
+ { N_, 0 },
+ },
+ { /* Third byte table 1. */
+ { N_, 0 }, { N_, 0 }, { N_, 0 },
+ { N_, 0 }, { N_, 0 }, { N_, 0 },
+ { N_, 0 }, { N_, 0 }, { N_, 0 },
+ { N_, 0 }, { N_, 0 }, { N_, 0 },
+ { N_, 0 }, { N_, 0 }, { N_, 0 },
+ { N_, 0 }, { N_, 0 }, { N_, 0 },
+ { N_, 0 }, { N_, 0 }, { N_, 0 },
+ { N_, 0 }, { N_, 0 }, { N_, 0 },
+ { N_, 0 }, { N_, 0 }, { N_, 0 },
+ { N_, 0 }, { N_, 0 }, { N_, 0 },
+ { N_, 0 }, { N_, 0 }, { N_, 0 },
+ { N_, 0 }, { N_, 0 }, { N_, 0 },
+ { N_, 0 }, { N_, 0 }, { N_, 0 },
+ { N_, 0 }, { N_, 0 }, { N_, 0 },
+ { N_, 0 }, { N_, 0 }, { N_, 0 },
+ { N_, 0 }, { N_, 0 }, { N_, 0 },
+ { N_, 0 }, { N_, 0 }, { N_, 0 },
+ { N_, 0 }, { N_, 0 }, { N_, 0 },
+ { N_, 0 }, { N_, 0 }, { N_, 0 },
+ { N_, 0 }, { N_, 0 }, { N_, 0 },
+ { N_, 0 }, { N_, 0 }, { N_, 0 },
+ { N_, 0 }, { N_, 0 }, { N_, 0 },
+ { N_, 0 }, { N_, 0 }, { N_, 0 },
+ { N_, 0 }, { N_, 0 }, { N_, 0 },
+ { N_, 0 }, { N_, 0 }, { N_, 0 },
+ { N_, 0 }, { N_, 0 }, { N_, 0 },
+ { N_, 0 }, { N_, 0 }, { N_, 0 },
+ { N_, 0 }, { N_, 0 }, { N_, 0 },
+ { N_, 0 }, { N_, 0 }, { N_, 0 },
+ { N_, 0 }, { N_, 0 }, { N_, 0 },
+ { N_, 0 }, { N_, 0 }, { N_, 0 },
+ { N_, 0 }, { N_, 0 }, { N_, 0 },
+ { N_, 0 }, { N_, 0 }, { N_, 0 },
+ { N_, 0 }, { N_, 0 }, { N_, 0 },
+ { N_, 0 }, { N_, 0 }, { N_, 0 },
+ { N_, 0 }, { N_, 0 }, { N_, 0 },
+ { N_, 0 }, { N_, 0 }, { N_, 0 },
+ { N_, 0 }, { N_, 0 }, { N_, 0 },
+ { N_, 0 }, { N_, 0 }, { N_, 0 },
+ { N_, 0 }, { N_, 0 }, { N_, 0 },
+ { N_, 0 }, { N_, 0 }, { N_, 0 },
+ { N_, 0 }, { N_, 0 }, { N_, 0 },
+ { N_, 0 }, { N_, 0 }, { N_, 0 },
+ { N_, 0 }, { N_, 0 }, { N_, 0 },
+ { N_, 0 }, { N_, 0 }, { N_, 0 },
+ { N_, 0 }, { N_, 0 }, { N_, 0 },
+ { N_, 0 }, { N_, 0 }, { N_, 0 },
+ { N_, 0 }, { N_, 0 }, { N_, 0 },
+ { N_, 0 }, { N_, 0 }, { N_, 0 },
+ { N_, 0 }, { N_, 0 }, { N_, 0 },
+ { N_, 0 }, { N_, 0 }, { N_, 0 },
+ { N_, 0 }, { N_, 0 }, { N_, 0 },
+ { N_, 0 }, { N_, 0 }, { N_, 0 },
+ { N_, 0 }, { N_, 0 }, { N_, 0 },
+ { N_, 0 }, { N_, 0 }, { 14, 4347 },
+ { N_, 0 }, { N_, 0 }, { 15, 4374 },
+ { N_, 0 }, { N_, 0 }, { N_, 0 },
+ { N_, 0 }, { N_, 0 }, { 16, 4391 },
+ { 17, 4416 }, { 18, 4425 }, { N_, 0 },
+ { 19, 4451 }, { 20, 4460 }, { 21, 4469 },
+ { N_, 0 }, { 22, 4503 }, { N_, 0 },
+ { 23, 4529 }, { N_, 0 }, { N_, 0 },
+ { N_, 0 }, { N_, 0 }, { N_, 0 },
+ { N_, 0 }, { N_, 0 }, { N_, 0 },
+ { N_, 0 }, { N_, 0 }, { N_, 0 },
+ { N_, 0 }, { N_, 0 }, { N_, 0 },
+ { N_, 0 }, { N_, 0 }, { N_, 0 },
+ { N_, 0 }, { N_, 0 }, { N_, 0 },
+ { N_, 0 }, { N_, 0 }, { N_, 0 },
+ { N_, 0 }, { N_, 0 }, { N_, 0 },
+ { N_, 0 }, { N_, 0 }, { N_, 0 },
+ { N_, 0 }, { N_, 0 }, { N_, 0 },
+ { N_, 0 }, { N_, 0 }, { N_, 0 },
+ { N_, 0 }, { N_, 0 }, { N_, 0 },
+ { N_, 0 }, { N_, 0 }, { N_, 0 },
+ { N_, 0 }, { N_, 0 }, { N_, 0 },
+ { N_, 0 }, { N_, 0 }, { N_, 0 },
+ { N_, 0 }, { N_, 0 }, { N_, 0 },
+ { N_, 0 }, { N_, 0 }, { N_, 0 },
+ { N_, 0 }, { N_, 0 }, { N_, 0 },
+ { N_, 0 }, { N_, 0 }, { N_, 0 },
+ { N_, 0 }, { N_, 0 }, { N_, 0 },
+ { N_, 0 }, { N_, 0 }, { N_, 0 },
+ { N_, 0 }, { N_, 0 }, { N_, 0 },
+ { N_, 0 }, { N_, 0 }, { N_, 0 },
+ { N_, 0 },
+ },
+ { /* Third byte table 2. */
+ { N_, 0 }, { N_, 0 }, { N_, 0 },
+ { N_, 0 }, { N_, 0 }, { N_, 0 },
+ { N_, 0 }, { N_, 0 }, { N_, 0 },
+ { N_, 0 }, { N_, 0 }, { N_, 0 },
+ { N_, 0 }, { N_, 0 }, { N_, 0 },
+ { N_, 0 }, { N_, 0 }, { N_, 0 },
+ { N_, 0 }, { N_, 0 }, { N_, 0 },
+ { N_, 0 }, { N_, 0 }, { N_, 0 },
+ { N_, 0 }, { N_, 0 }, { N_, 0 },
+ { N_, 0 }, { N_, 0 }, { N_, 0 },
+ { N_, 0 }, { N_, 0 }, { N_, 0 },
+ { N_, 0 }, { N_, 0 }, { N_, 0 },
+ { N_, 0 }, { N_, 0 }, { N_, 0 },
+ { N_, 0 }, { N_, 0 }, { N_, 0 },
+ { N_, 0 }, { N_, 0 }, { N_, 0 },
+ { N_, 0 }, { N_, 0 }, { N_, 0 },
+ { N_, 0 }, { N_, 0 }, { N_, 0 },
+ { N_, 0 }, { N_, 0 }, { N_, 0 },
+ { N_, 0 }, { N_, 0 }, { N_, 0 },
+ { N_, 0 }, { N_, 0 }, { N_, 0 },
+ { N_, 0 }, { N_, 0 }, { N_, 0 },
+ { N_, 0 }, { N_, 0 }, { N_, 0 },
+ { N_, 0 }, { N_, 0 }, { N_, 0 },
+ { N_, 0 }, { N_, 0 }, { N_, 0 },
+ { N_, 0 }, { N_, 0 }, { N_, 0 },
+ { N_, 0 }, { N_, 0 }, { N_, 0 },
+ { N_, 0 }, { N_, 0 }, { N_, 0 },
+ { N_, 0 }, { N_, 0 }, { N_, 0 },
+ { N_, 0 }, { N_, 0 }, { N_, 0 },
+ { N_, 0 }, { N_, 0 }, { N_, 0 },
+ { N_, 0 }, { N_, 0 }, { N_, 0 },
+ { N_, 0 }, { N_, 0 }, { N_, 0 },
+ { N_, 0 }, { N_, 0 }, { N_, 0 },
+ { N_, 0 }, { N_, 0 }, { N_, 0 },
+ { N_, 0 }, { N_, 0 }, { N_, 0 },
+ { N_, 0 }, { N_, 0 }, { N_, 0 },
+ { N_, 0 }, { N_, 0 }, { N_, 0 },
+ { N_, 0 }, { N_, 0 }, { N_, 0 },
+ { N_, 0 }, { N_, 0 }, { N_, 0 },
+ { N_, 0 }, { N_, 0 }, { N_, 0 },
+ { N_, 0 }, { N_, 0 }, { N_, 0 },
+ { N_, 0 }, { N_, 0 }, { N_, 0 },
+ { N_, 0 }, { N_, 0 }, { 24, 4563 },
+ { N_, 0 }, { N_, 0 }, { N_, 0 },
+ { N_, 0 }, { N_, 0 }, { N_, 0 },
+ { N_, 0 }, { N_, 0 }, { N_, 0 },
+ { N_, 0 }, { N_, 0 }, { N_, 0 },
+ { N_, 0 }, { N_, 0 }, { N_, 0 },
+ { N_, 0 }, { N_, 0 }, { N_, 0 },
+ { N_, 0 }, { N_, 0 }, { N_, 0 },
+ { N_, 0 }, { N_, 0 }, { N_, 0 },
+ { N_, 0 }, { N_, 0 }, { N_, 0 },
+ { N_, 0 }, { N_, 0 }, { N_, 0 },
+ { N_, 0 }, { N_, 0 }, { N_, 0 },
+ { N_, 0 }, { N_, 0 }, { N_, 0 },
+ { N_, 0 }, { N_, 0 }, { N_, 0 },
+ { N_, 0 }, { N_, 0 }, { N_, 0 },
+ { N_, 0 }, { N_, 0 }, { N_, 0 },
+ { N_, 0 }, { N_, 0 }, { N_, 0 },
+ { N_, 0 }, { N_, 0 }, { N_, 0 },
+ { N_, 0 }, { N_, 0 }, { N_, 0 },
+ { N_, 0 }, { 25, 4572 }, { 26, 4588 },
+ { 27, 4620 }, { 28, 4666 }, { 0x8003, 4682 },
+ { 0x8004, 5254 }, { 29, 5616 }, { 30, 5646 },
+ { N_, 0 }, { N_, 0 }, { N_, 0 },
+ { N_, 0 }, { N_, 0 }, { N_, 0 },
+ { N_, 0 }, { N_, 0 }, { N_, 0 },
+ { N_, 0 }, { N_, 0 }, { N_, 0 },
+ { N_, 0 }, { N_, 0 }, { N_, 0 },
+ { N_, 0 }, { N_, 0 }, { N_, 0 },
+ { N_, 0 }, { N_, 0 }, { N_, 0 },
+ { N_, 0 }, { N_, 0 }, { N_, 0 },
+ { N_, 0 }, { N_, 0 }, { N_, 0 },
+ { N_, 0 }, { N_, 0 }, { N_, 0 },
+ { N_, 0 }, { N_, 0 }, { N_, 0 },
+ { N_, 0 }, { N_, 0 }, { N_, 0 },
+ { N_, 0 }, { N_, 0 }, { N_, 0 },
+ { N_, 0 }, { N_, 0 }, { N_, 0 },
+ { N_, 0 }, { N_, 0 }, { N_, 0 },
+ { N_, 0 }, { N_, 0 }, { N_, 0 },
+ { N_, 0 }, { N_, 0 }, { N_, 0 },
+ { N_, 0 }, { N_, 0 }, { N_, 0 },
+ { N_, 0 }, { N_, 0 }, { N_, 0 },
+ { N_, 0 }, { N_, 0 }, { N_, 0 },
+ { N_, 0 }, { N_, 0 }, { N_, 0 },
+ { N_, 0 },
+ },
+ { /* Third byte table 3. */
+ { N_, 0 }, { N_, 0 }, { N_, 0 },
+ { N_, 0 }, { N_, 0 }, { N_, 0 },
+ { N_, 0 }, { N_, 0 }, { N_, 0 },
+ { N_, 0 }, { N_, 0 }, { N_, 0 },
+ { N_, 0 }, { N_, 0 }, { N_, 0 },
+ { N_, 0 }, { N_, 0 }, { N_, 0 },
+ { N_, 0 }, { N_, 0 }, { N_, 0 },
+ { N_, 0 }, { N_, 0 }, { N_, 0 },
+ { N_, 0 }, { N_, 0 }, { N_, 0 },
+ { N_, 0 }, { N_, 0 }, { N_, 0 },
+ { N_, 0 }, { N_, 0 }, { N_, 0 },
+ { N_, 0 }, { N_, 0 }, { N_, 0 },
+ { N_, 0 }, { N_, 0 }, { N_, 0 },
+ { N_, 0 }, { N_, 0 }, { N_, 0 },
+ { N_, 0 }, { N_, 0 }, { N_, 0 },
+ { N_, 0 }, { N_, 0 }, { N_, 0 },
+ { N_, 0 }, { N_, 0 }, { N_, 0 },
+ { N_, 0 }, { N_, 0 }, { N_, 0 },
+ { N_, 0 }, { N_, 0 }, { N_, 0 },
+ { N_, 0 }, { N_, 0 }, { N_, 0 },
+ { N_, 0 }, { N_, 0 }, { N_, 0 },
+ { N_, 0 }, { N_, 0 }, { N_, 0 },
+ { N_, 0 }, { N_, 0 }, { N_, 0 },
+ { N_, 0 }, { N_, 0 }, { N_, 0 },
+ { N_, 0 }, { N_, 0 }, { N_, 0 },
+ { N_, 0 }, { N_, 0 }, { N_, 0 },
+ { N_, 0 }, { N_, 0 }, { N_, 0 },
+ { N_, 0 }, { N_, 0 }, { N_, 0 },
+ { N_, 0 }, { N_, 0 }, { N_, 0 },
+ { N_, 0 }, { N_, 0 }, { N_, 0 },
+ { N_, 0 }, { N_, 0 }, { N_, 0 },
+ { N_, 0 }, { N_, 0 }, { N_, 0 },
+ { N_, 0 }, { N_, 0 }, { N_, 0 },
+ { N_, 0 }, { N_, 0 }, { N_, 0 },
+ { N_, 0 }, { N_, 0 }, { N_, 0 },
+ { N_, 0 }, { N_, 0 }, { N_, 0 },
+ { N_, 0 }, { N_, 0 }, { N_, 0 },
+ { N_, 0 }, { N_, 0 }, { N_, 0 },
+ { N_, 0 }, { N_, 0 }, { N_, 0 },
+ { N_, 0 }, { N_, 0 }, { N_, 0 },
+ { N_, 0 }, { N_, 0 }, { N_, 0 },
+ { N_, 0 }, { N_, 0 }, { N_, 0 },
+ { N_, 0 }, { N_, 0 }, { N_, 0 },
+ { N_, 0 }, { N_, 0 }, { N_, 0 },
+ { N_, 0 }, { N_, 0 }, { 31, 5684 },
+ { 32, 5708 }, { 33, 5732 }, { 34, 5780 },
+ { 35, 5900 }, { N_, 0 }, { N_, 0 },
+ { N_, 0 }, { N_, 0 }, { N_, 0 },
+ { N_, 0 }, { N_, 0 }, { N_, 0 },
+ { N_, 0 }, { N_, 0 }, { N_, 0 },
+ { N_, 0 }, { N_, 0 }, { N_, 0 },
+ { N_, 0 }, { N_, 0 }, { N_, 0 },
+ { N_, 0 }, { N_, 0 }, { N_, 0 },
+ { N_, 0 }, { N_, 0 }, { N_, 0 },
+ { N_, 0 }, { N_, 0 }, { N_, 0 },
+ { N_, 0 }, { N_, 0 }, { N_, 0 },
+ { N_, 0 }, { N_, 0 }, { N_, 0 },
+ { N_, 0 }, { N_, 0 }, { N_, 0 },
+ { N_, 0 }, { N_, 0 }, { N_, 0 },
+ { N_, 0 }, { N_, 0 }, { N_, 0 },
+ { N_, 0 }, { N_, 0 }, { N_, 0 },
+ { N_, 0 }, { N_, 0 }, { N_, 0 },
+ { N_, 0 }, { N_, 0 }, { N_, 0 },
+ { N_, 0 }, { N_, 0 }, { N_, 0 },
+ { N_, 0 }, { N_, 0 }, { N_, 0 },
+ { N_, 0 }, { N_, 0 }, { N_, 0 },
+ { N_, 0 }, { N_, 0 }, { N_, 0 },
+ { N_, 0 }, { N_, 0 }, { N_, 0 },
+ { N_, 0 }, { N_, 0 }, { N_, 0 },
+ { N_, 0 }, { N_, 0 }, { N_, 0 },
+ { N_, 0 }, { N_, 0 }, { N_, 0 },
+ { N_, 0 }, { N_, 0 }, { N_, 0 },
+ { N_, 0 }, { N_, 0 }, { N_, 0 },
+ { N_, 0 }, { N_, 0 }, { N_, 0 },
+ { N_, 0 }, { N_, 0 }, { N_, 0 },
+ { N_, 0 }, { N_, 0 }, { N_, 0 },
+ { N_, 0 }, { N_, 0 }, { N_, 0 },
+ { N_, 0 }, { N_, 0 }, { N_, 0 },
+ { N_, 0 }, { N_, 0 }, { N_, 0 },
+ { N_, 0 }, { N_, 0 }, { N_, 0 },
+ { N_, 0 }, { N_, 0 }, { N_, 0 },
+ { N_, 0 }, { N_, 0 }, { N_, 0 },
+ { N_, 0 }, { N_, 0 }, { N_, 0 },
+ { N_, 0 }, { N_, 0 }, { N_, 0 },
+ { N_, 0 }, { N_, 0 }, { N_, 0 },
+ { N_, 0 },
+ },
+ { /* Third byte table 4. */
+ { N_, 0 }, { N_, 0 }, { N_, 0 },
+ { N_, 0 }, { N_, 0 }, { N_, 0 },
+ { N_, 0 }, { N_, 0 }, { N_, 0 },
+ { N_, 0 }, { N_, 0 }, { N_, 0 },
+ { N_, 0 }, { N_, 0 }, { N_, 0 },
+ { N_, 0 }, { N_, 0 }, { N_, 0 },
+ { N_, 0 }, { N_, 0 }, { N_, 0 },
+ { N_, 0 }, { N_, 0 }, { N_, 0 },
+ { N_, 0 }, { N_, 0 }, { N_, 0 },
+ { N_, 0 }, { N_, 0 }, { N_, 0 },
+ { N_, 0 }, { N_, 0 }, { N_, 0 },
+ { N_, 0 }, { N_, 0 }, { N_, 0 },
+ { N_, 0 }, { N_, 0 }, { N_, 0 },
+ { N_, 0 }, { N_, 0 }, { N_, 0 },
+ { N_, 0 }, { N_, 0 }, { N_, 0 },
+ { N_, 0 }, { N_, 0 }, { N_, 0 },
+ { N_, 0 }, { N_, 0 }, { N_, 0 },
+ { N_, 0 }, { N_, 0 }, { N_, 0 },
+ { N_, 0 }, { N_, 0 }, { N_, 0 },
+ { N_, 0 }, { N_, 0 }, { N_, 0 },
+ { N_, 0 }, { N_, 0 }, { N_, 0 },
+ { N_, 0 }, { N_, 0 }, { N_, 0 },
+ { N_, 0 }, { N_, 0 }, { N_, 0 },
+ { N_, 0 }, { N_, 0 }, { N_, 0 },
+ { N_, 0 }, { N_, 0 }, { N_, 0 },
+ { N_, 0 }, { N_, 0 }, { N_, 0 },
+ { N_, 0 }, { N_, 0 }, { N_, 0 },
+ { N_, 0 }, { N_, 0 }, { N_, 0 },
+ { N_, 0 }, { N_, 0 }, { N_, 0 },
+ { N_, 0 }, { N_, 0 }, { N_, 0 },
+ { N_, 0 }, { N_, 0 }, { N_, 0 },
+ { N_, 0 }, { N_, 0 }, { N_, 0 },
+ { N_, 0 }, { N_, 0 }, { N_, 0 },
+ { N_, 0 }, { N_, 0 }, { N_, 0 },
+ { N_, 0 }, { N_, 0 }, { N_, 0 },
+ { N_, 0 }, { N_, 0 }, { N_, 0 },
+ { N_, 0 }, { N_, 0 }, { N_, 0 },
+ { N_, 0 }, { N_, 0 }, { N_, 0 },
+ { N_, 0 }, { N_, 0 }, { N_, 0 },
+ { N_, 0 }, { N_, 0 }, { N_, 0 },
+ { N_, 0 }, { N_, 0 }, { N_, 0 },
+ { N_, 0 }, { N_, 0 }, { N_, 0 },
+ { N_, 0 }, { N_, 0 }, { N_, 0 },
+ { 36, 6012 }, { 37, 6241 }, { 38, 6358 },
+ { N_, 0 }, { N_, 0 }, { N_, 0 },
+ { N_, 0 }, { N_, 0 }, { N_, 0 },
+ { N_, 0 }, { N_, 0 }, { N_, 0 },
+ { N_, 0 }, { N_, 0 }, { N_, 0 },
+ { N_, 0 }, { N_, 0 }, { N_, 0 },
+ { N_, 0 }, { N_, 0 }, { N_, 0 },
+ { N_, 0 }, { N_, 0 }, { N_, 0 },
+ { N_, 0 }, { N_, 0 }, { N_, 0 },
+ { N_, 0 }, { N_, 0 }, { N_, 0 },
+ { N_, 0 }, { N_, 0 }, { N_, 0 },
+ { N_, 0 }, { N_, 0 }, { N_, 0 },
+ { N_, 0 }, { N_, 0 }, { N_, 0 },
+ { N_, 0 }, { N_, 0 }, { N_, 0 },
+ { N_, 0 }, { N_, 0 }, { N_, 0 },
+ { N_, 0 }, { N_, 0 }, { N_, 0 },
+ { N_, 0 }, { N_, 0 }, { N_, 0 },
+ { N_, 0 }, { N_, 0 }, { N_, 0 },
+ { N_, 0 }, { N_, 0 }, { N_, 0 },
+ { N_, 0 }, { N_, 0 }, { N_, 0 },
+ { N_, 0 }, { N_, 0 }, { N_, 0 },
+ { N_, 0 }, { N_, 0 }, { N_, 0 },
+ { N_, 0 }, { N_, 0 }, { N_, 0 },
+ { N_, 0 }, { N_, 0 }, { N_, 0 },
+ { N_, 0 }, { N_, 0 }, { N_, 0 },
+ { N_, 0 }, { N_, 0 }, { N_, 0 },
+ { N_, 0 }, { N_, 0 }, { N_, 0 },
+ { N_, 0 }, { N_, 0 }, { N_, 0 },
+ { N_, 0 }, { N_, 0 }, { N_, 0 },
+ { N_, 0 }, { N_, 0 }, { N_, 0 },
+ { N_, 0 }, { N_, 0 }, { N_, 0 },
+ { N_, 0 }, { N_, 0 }, { N_, 0 },
+ { N_, 0 }, { N_, 0 }, { N_, 0 },
+ { N_, 0 }, { N_, 0 }, { N_, 0 },
+ { N_, 0 }, { N_, 0 }, { N_, 0 },
+ { N_, 0 }, { N_, 0 }, { N_, 0 },
+ { N_, 0 }, { N_, 0 }, { N_, 0 },
+ { N_, 0 }, { N_, 0 }, { N_, 0 },
+ { N_, 0 }, { N_, 0 }, { N_, 0 },
+ { N_, 0 }, { N_, 0 }, { N_, 0 },
+ { N_, 0 }, { N_, 0 }, { N_, 0 },
+ { N_, 0 }, { N_, 0 }, { N_, 0 },
+ { N_, 0 },
+ },
+ },
+ {
+ { /* Third byte table 0. */
+ { 0x8000, 0 }, { N_, 0 }, { N_, 0 },
+ { N_, 0 }, { N_, 0 }, { N_, 0 },
+ { N_, 0 }, { N_, 0 }, { N_, 0 },
+ { N_, 0 }, { N_, 0 }, { N_, 0 },
+ { N_, 0 }, { N_, 0 }, { N_, 0 },
+ { N_, 0 }, { N_, 0 }, { N_, 0 },
+ { N_, 0 }, { N_, 0 }, { N_, 0 },
+ { N_, 0 }, { N_, 0 }, { N_, 0 },
+ { N_, 0 }, { N_, 0 }, { N_, 0 },
+ { N_, 0 }, { N_, 0 }, { N_, 0 },
+ { N_, 0 }, { N_, 0 }, { N_, 0 },
+ { N_, 0 }, { N_, 0 }, { N_, 0 },
+ { N_, 0 }, { N_, 0 }, { N_, 0 },
+ { N_, 0 }, { N_, 0 }, { N_, 0 },
+ { N_, 0 }, { N_, 0 }, { N_, 0 },
+ { N_, 0 }, { N_, 0 }, { N_, 0 },
+ { N_, 0 }, { N_, 0 }, { N_, 0 },
+ { N_, 0 }, { N_, 0 }, { N_, 0 },
+ { N_, 0 }, { N_, 0 }, { N_, 0 },
+ { N_, 0 }, { N_, 0 }, { N_, 0 },
+ { N_, 0 }, { N_, 0 }, { N_, 0 },
+ { N_, 0 }, { N_, 0 }, { N_, 0 },
+ { N_, 0 }, { N_, 0 }, { N_, 0 },
+ { N_, 0 }, { N_, 0 }, { N_, 0 },
+ { N_, 0 }, { N_, 0 }, { N_, 0 },
+ { N_, 0 }, { N_, 0 }, { N_, 0 },
+ { N_, 0 }, { N_, 0 }, { N_, 0 },
+ { N_, 0 }, { N_, 0 }, { N_, 0 },
+ { N_, 0 }, { N_, 0 }, { N_, 0 },
+ { N_, 0 }, { N_, 0 }, { N_, 0 },
+ { N_, 0 }, { N_, 0 }, { N_, 0 },
+ { N_, 0 }, { N_, 0 }, { N_, 0 },
+ { N_, 0 }, { N_, 0 }, { N_, 0 },
+ { N_, 0 }, { N_, 0 }, { N_, 0 },
+ { N_, 0 }, { N_, 0 }, { N_, 0 },
+ { N_, 0 }, { N_, 0 }, { N_, 0 },
+ { N_, 0 }, { N_, 0 }, { N_, 0 },
+ { N_, 0 }, { N_, 0 }, { N_, 0 },
+ { N_, 0 }, { N_, 0 }, { N_, 0 },
+ { N_, 0 }, { N_, 0 }, { N_, 0 },
+ { N_, 0 }, { N_, 0 }, { N_, 0 },
+ { N_, 0 }, { N_, 0 }, { N_, 0 },
+ { N_, 0 }, { N_, 0 }, { N_, 0 },
+ { N_, 0 }, { N_, 0 }, { N_, 0 },
+ { N_, 0 }, { N_, 0 }, { N_, 0 },
+ { N_, 0 }, { N_, 0 }, { N_, 0 },
+ { N_, 0 }, { N_, 0 }, { N_, 0 },
+ { N_, 0 }, { N_, 0 }, { N_, 0 },
+ { N_, 0 }, { N_, 0 }, { N_, 0 },
+ { N_, 0 }, { N_, 0 }, { N_, 0 },
+ { N_, 0 }, { N_, 0 }, { N_, 0 },
+ { N_, 0 }, { N_, 0 }, { N_, 0 },
+ { N_, 0 }, { N_, 0 }, { N_, 0 },
+ { N_, 0 }, { N_, 0 }, { N_, 0 },
+ { N_, 0 }, { N_, 0 }, { N_, 0 },
+ { N_, 0 }, { N_, 0 }, { N_, 0 },
+ { N_, 0 }, { N_, 0 }, { N_, 0 },
+ { N_, 0 }, { N_, 0 }, { N_, 0 },
+ { N_, 0 }, { N_, 0 }, { N_, 0 },
+ { N_, 0 }, { N_, 0 }, { N_, 0 },
+ { N_, 0 }, { N_, 0 }, { N_, 0 },
+ { N_, 0 }, { N_, 0 }, { N_, 0 },
+ { N_, 0 }, { N_, 0 }, { N_, 0 },
+ { N_, 0 }, { N_, 0 }, { N_, 0 },
+ { N_, 0 }, { N_, 0 }, { 0, 2470 },
+ { 0x8001, 2491 }, { 1, 2871 }, { 2, 2959 },
+ { 3, 3061 }, { 4, 3212 }, { 5, 3226 },
+ { N_, 0 }, { 6, 3270 }, { N_, 0 },
+ { N_, 0 }, { N_, 0 }, { 0x8002, 3277 },
+ { 7, 3774 }, { 8, 3949 }, { 9, 4198 },
+ { N_, 0 }, { 10, 4265 }, { N_, 0 },
+ { N_, 0 }, { N_, 0 }, { N_, 0 },
+ { 11, 4293 }, { 12, 4312 }, { N_, 0 },
+ { 13, 4326 }, { N_, 0 }, { N_, 0 },
+ { N_, 0 }, { N_, 0 }, { N_, 0 },
+ { N_, 0 }, { N_, 0 }, { N_, 0 },
+ { N_, 0 }, { N_, 0 }, { N_, 0 },
+ { N_, 0 }, { N_, 0 }, { N_, 0 },
+ { N_, 0 }, { N_, 0 }, { N_, 0 },
+ { N_, 0 }, { N_, 0 }, { N_, 0 },
+ { N_, 0 }, { N_, 0 }, { N_, 0 },
+ { N_, 0 }, { N_, 0 }, { N_, 0 },
+ { N_, 0 }, { N_, 0 }, { N_, 0 },
+ { N_, 0 }, { N_, 0 }, { N_, 0 },
+ { N_, 0 }, { N_, 0 }, { N_, 0 },
+ { N_, 0 },
+ },
+ { /* Third byte table 1. */
+ { N_, 0 }, { N_, 0 }, { N_, 0 },
+ { N_, 0 }, { N_, 0 }, { N_, 0 },
+ { N_, 0 }, { N_, 0 }, { N_, 0 },
+ { N_, 0 }, { N_, 0 }, { N_, 0 },
+ { N_, 0 }, { N_, 0 }, { N_, 0 },
+ { N_, 0 }, { N_, 0 }, { N_, 0 },
+ { N_, 0 }, { N_, 0 }, { N_, 0 },
+ { N_, 0 }, { N_, 0 }, { N_, 0 },
+ { N_, 0 }, { N_, 0 }, { N_, 0 },
+ { N_, 0 }, { N_, 0 }, { N_, 0 },
+ { N_, 0 }, { N_, 0 }, { N_, 0 },
+ { N_, 0 }, { N_, 0 }, { N_, 0 },
+ { N_, 0 }, { N_, 0 }, { N_, 0 },
+ { N_, 0 }, { N_, 0 }, { N_, 0 },
+ { N_, 0 }, { N_, 0 }, { N_, 0 },
+ { N_, 0 }, { N_, 0 }, { N_, 0 },
+ { N_, 0 }, { N_, 0 }, { N_, 0 },
+ { N_, 0 }, { N_, 0 }, { N_, 0 },
+ { N_, 0 }, { N_, 0 }, { N_, 0 },
+ { N_, 0 }, { N_, 0 }, { N_, 0 },
+ { N_, 0 }, { N_, 0 }, { N_, 0 },
+ { N_, 0 }, { N_, 0 }, { N_, 0 },
+ { N_, 0 }, { N_, 0 }, { N_, 0 },
+ { N_, 0 }, { N_, 0 }, { N_, 0 },
+ { N_, 0 }, { N_, 0 }, { N_, 0 },
+ { N_, 0 }, { N_, 0 }, { N_, 0 },
+ { N_, 0 }, { N_, 0 }, { N_, 0 },
+ { N_, 0 }, { N_, 0 }, { N_, 0 },
+ { N_, 0 }, { N_, 0 }, { N_, 0 },
+ { N_, 0 }, { N_, 0 }, { N_, 0 },
+ { N_, 0 }, { N_, 0 }, { N_, 0 },
+ { N_, 0 }, { N_, 0 }, { N_, 0 },
+ { N_, 0 }, { N_, 0 }, { N_, 0 },
+ { N_, 0 }, { N_, 0 }, { N_, 0 },
+ { N_, 0 }, { N_, 0 }, { N_, 0 },
+ { N_, 0 }, { N_, 0 }, { N_, 0 },
+ { N_, 0 }, { N_, 0 }, { N_, 0 },
+ { N_, 0 }, { N_, 0 }, { N_, 0 },
+ { N_, 0 }, { N_, 0 }, { N_, 0 },
+ { N_, 0 }, { N_, 0 }, { N_, 0 },
+ { N_, 0 }, { N_, 0 }, { N_, 0 },
+ { N_, 0 }, { N_, 0 }, { N_, 0 },
+ { N_, 0 }, { N_, 0 }, { N_, 0 },
+ { N_, 0 }, { N_, 0 }, { N_, 0 },
+ { N_, 0 }, { N_, 0 }, { N_, 0 },
+ { N_, 0 }, { N_, 0 }, { N_, 0 },
+ { N_, 0 }, { N_, 0 }, { N_, 0 },
+ { N_, 0 }, { N_, 0 }, { N_, 0 },
+ { N_, 0 }, { N_, 0 }, { N_, 0 },
+ { N_, 0 }, { N_, 0 }, { N_, 0 },
+ { N_, 0 }, { N_, 0 }, { N_, 0 },
+ { N_, 0 }, { N_, 0 }, { N_, 0 },
+ { N_, 0 }, { N_, 0 }, { N_, 0 },
+ { N_, 0 }, { N_, 0 }, { N_, 0 },
+ { N_, 0 }, { N_, 0 }, { 14, 4347 },
+ { N_, 0 }, { N_, 0 }, { 15, 4374 },
+ { N_, 0 }, { N_, 0 }, { N_, 0 },
+ { N_, 0 }, { N_, 0 }, { 16, 4391 },
+ { 17, 4416 }, { 18, 4425 }, { N_, 0 },
+ { 19, 4451 }, { 20, 4460 }, { 21, 4469 },
+ { N_, 0 }, { 22, 4503 }, { N_, 0 },
+ { 23, 4529 }, { N_, 0 }, { N_, 0 },
+ { N_, 0 }, { N_, 0 }, { N_, 0 },
+ { N_, 0 }, { N_, 0 }, { N_, 0 },
+ { N_, 0 }, { N_, 0 }, { N_, 0 },
+ { N_, 0 }, { N_, 0 }, { N_, 0 },
+ { N_, 0 }, { N_, 0 }, { N_, 0 },
+ { N_, 0 }, { N_, 0 }, { N_, 0 },
+ { N_, 0 }, { N_, 0 }, { N_, 0 },
+ { N_, 0 }, { N_, 0 }, { N_, 0 },
+ { N_, 0 }, { N_, 0 }, { N_, 0 },
+ { N_, 0 }, { N_, 0 }, { N_, 0 },
+ { N_, 0 }, { N_, 0 }, { N_, 0 },
+ { N_, 0 }, { N_, 0 }, { N_, 0 },
+ { N_, 0 }, { N_, 0 }, { N_, 0 },
+ { N_, 0 }, { N_, 0 }, { N_, 0 },
+ { N_, 0 }, { N_, 0 }, { N_, 0 },
+ { N_, 0 }, { N_, 0 }, { N_, 0 },
+ { N_, 0 }, { N_, 0 }, { N_, 0 },
+ { N_, 0 }, { N_, 0 }, { N_, 0 },
+ { N_, 0 }, { N_, 0 }, { N_, 0 },
+ { N_, 0 }, { N_, 0 }, { N_, 0 },
+ { N_, 0 }, { N_, 0 }, { N_, 0 },
+ { N_, 0 }, { N_, 0 }, { N_, 0 },
+ { N_, 0 }, { N_, 0 }, { N_, 0 },
+ { N_, 0 },
+ },
+ { /* Third byte table 2. */
+ { N_, 0 }, { N_, 0 }, { N_, 0 },
+ { N_, 0 }, { N_, 0 }, { N_, 0 },
+ { N_, 0 }, { N_, 0 }, { N_, 0 },
+ { N_, 0 }, { N_, 0 }, { N_, 0 },
+ { N_, 0 }, { N_, 0 }, { N_, 0 },
+ { N_, 0 }, { N_, 0 }, { N_, 0 },
+ { N_, 0 }, { N_, 0 }, { N_, 0 },
+ { N_, 0 }, { N_, 0 }, { N_, 0 },
+ { N_, 0 }, { N_, 0 }, { N_, 0 },
+ { N_, 0 }, { N_, 0 }, { N_, 0 },
+ { N_, 0 }, { N_, 0 }, { N_, 0 },
+ { N_, 0 }, { N_, 0 }, { N_, 0 },
+ { N_, 0 }, { N_, 0 }, { N_, 0 },
+ { N_, 0 }, { N_, 0 }, { N_, 0 },
+ { N_, 0 }, { N_, 0 }, { N_, 0 },
+ { N_, 0 }, { N_, 0 }, { N_, 0 },
+ { N_, 0 }, { N_, 0 }, { N_, 0 },
+ { N_, 0 }, { N_, 0 }, { N_, 0 },
+ { N_, 0 }, { N_, 0 }, { N_, 0 },
+ { N_, 0 }, { N_, 0 }, { N_, 0 },
+ { N_, 0 }, { N_, 0 }, { N_, 0 },
+ { N_, 0 }, { N_, 0 }, { N_, 0 },
+ { N_, 0 }, { N_, 0 }, { N_, 0 },
+ { N_, 0 }, { N_, 0 }, { N_, 0 },
+ { N_, 0 }, { N_, 0 }, { N_, 0 },
+ { N_, 0 }, { N_, 0 }, { N_, 0 },
+ { N_, 0 }, { N_, 0 }, { N_, 0 },
+ { N_, 0 }, { N_, 0 }, { N_, 0 },
+ { N_, 0 }, { N_, 0 }, { N_, 0 },
+ { N_, 0 }, { N_, 0 }, { N_, 0 },
+ { N_, 0 }, { N_, 0 }, { N_, 0 },
+ { N_, 0 }, { N_, 0 }, { N_, 0 },
+ { N_, 0 }, { N_, 0 }, { N_, 0 },
+ { N_, 0 }, { N_, 0 }, { N_, 0 },
+ { N_, 0 }, { N_, 0 }, { N_, 0 },
+ { N_, 0 }, { N_, 0 }, { N_, 0 },
+ { N_, 0 }, { N_, 0 }, { N_, 0 },
+ { N_, 0 }, { N_, 0 }, { N_, 0 },
+ { N_, 0 }, { N_, 0 }, { N_, 0 },
+ { N_, 0 }, { N_, 0 }, { N_, 0 },
+ { N_, 0 }, { N_, 0 }, { N_, 0 },
+ { N_, 0 }, { N_, 0 }, { N_, 0 },
+ { N_, 0 }, { N_, 0 }, { 24, 4563 },
+ { N_, 0 }, { N_, 0 }, { N_, 0 },
+ { N_, 0 }, { N_, 0 }, { N_, 0 },
+ { N_, 0 }, { N_, 0 }, { N_, 0 },
+ { N_, 0 }, { N_, 0 }, { N_, 0 },
+ { N_, 0 }, { N_, 0 }, { N_, 0 },
+ { N_, 0 }, { N_, 0 }, { N_, 0 },
+ { N_, 0 }, { N_, 0 }, { N_, 0 },
+ { N_, 0 }, { N_, 0 }, { N_, 0 },
+ { N_, 0 }, { N_, 0 }, { N_, 0 },
+ { N_, 0 }, { N_, 0 }, { N_, 0 },
+ { N_, 0 }, { N_, 0 }, { N_, 0 },
+ { N_, 0 }, { N_, 0 }, { N_, 0 },
+ { N_, 0 }, { N_, 0 }, { N_, 0 },
+ { N_, 0 }, { N_, 0 }, { N_, 0 },
+ { N_, 0 }, { 25, 4572 }, { 26, 4662 },
+ { N_, 0 }, { N_, 0 }, { N_, 0 },
+ { N_, 0 }, { N_, 0 }, { N_, 0 },
+ { N_, 0 }, { N_, 0 }, { N_, 0 },
+ { N_, 0 }, { 27, 4671 }, { 28, 4687 },
+ { 29, 4719 }, { 30, 4765 }, { 0x8003, 4781 },
+ { 0x8004, 5353 }, { 31, 5715 }, { 32, 5745 },
+ { N_, 0 }, { N_, 0 }, { N_, 0 },
+ { N_, 0 }, { N_, 0 }, { N_, 0 },
+ { N_, 0 }, { N_, 0 }, { N_, 0 },
+ { N_, 0 }, { N_, 0 }, { N_, 0 },
+ { N_, 0 }, { N_, 0 }, { N_, 0 },
+ { N_, 0 }, { N_, 0 }, { N_, 0 },
+ { N_, 0 }, { N_, 0 }, { N_, 0 },
+ { N_, 0 }, { N_, 0 }, { N_, 0 },
+ { N_, 0 }, { N_, 0 }, { N_, 0 },
+ { N_, 0 }, { N_, 0 }, { N_, 0 },
+ { N_, 0 }, { N_, 0 }, { N_, 0 },
+ { N_, 0 }, { N_, 0 }, { N_, 0 },
+ { N_, 0 }, { N_, 0 }, { N_, 0 },
+ { N_, 0 }, { N_, 0 }, { N_, 0 },
+ { N_, 0 }, { N_, 0 }, { N_, 0 },
+ { N_, 0 }, { N_, 0 }, { N_, 0 },
+ { N_, 0 }, { N_, 0 }, { N_, 0 },
+ { N_, 0 }, { N_, 0 }, { N_, 0 },
+ { N_, 0 }, { N_, 0 }, { N_, 0 },
+ { N_, 0 }, { N_, 0 }, { N_, 0 },
+ { N_, 0 }, { N_, 0 }, { N_, 0 },
+ { N_, 0 },
+ },
+ { /* Third byte table 3. */
+ { N_, 0 }, { N_, 0 }, { N_, 0 },
+ { N_, 0 }, { N_, 0 }, { N_, 0 },
+ { N_, 0 }, { N_, 0 }, { N_, 0 },
+ { N_, 0 }, { N_, 0 }, { N_, 0 },
+ { N_, 0 }, { N_, 0 }, { N_, 0 },
+ { N_, 0 }, { N_, 0 }, { N_, 0 },
+ { N_, 0 }, { N_, 0 }, { N_, 0 },
+ { N_, 0 }, { N_, 0 }, { N_, 0 },
+ { N_, 0 }, { N_, 0 }, { N_, 0 },
+ { N_, 0 }, { N_, 0 }, { N_, 0 },
+ { N_, 0 }, { N_, 0 }, { N_, 0 },
+ { N_, 0 }, { N_, 0 }, { N_, 0 },
+ { N_, 0 }, { N_, 0 }, { N_, 0 },
+ { N_, 0 }, { N_, 0 }, { N_, 0 },
+ { N_, 0 }, { N_, 0 }, { N_, 0 },
+ { N_, 0 }, { N_, 0 }, { N_, 0 },
+ { N_, 0 }, { N_, 0 }, { N_, 0 },
+ { N_, 0 }, { N_, 0 }, { N_, 0 },
+ { N_, 0 }, { N_, 0 }, { N_, 0 },
+ { N_, 0 }, { N_, 0 }, { N_, 0 },
+ { N_, 0 }, { N_, 0 }, { N_, 0 },
+ { N_, 0 }, { N_, 0 }, { N_, 0 },
+ { N_, 0 }, { N_, 0 }, { N_, 0 },
+ { N_, 0 }, { N_, 0 }, { N_, 0 },
+ { N_, 0 }, { N_, 0 }, { N_, 0 },
+ { N_, 0 }, { N_, 0 }, { N_, 0 },
+ { N_, 0 }, { N_, 0 }, { N_, 0 },
+ { N_, 0 }, { N_, 0 }, { N_, 0 },
+ { N_, 0 }, { N_, 0 }, { N_, 0 },
+ { N_, 0 }, { N_, 0 }, { N_, 0 },
+ { N_, 0 }, { N_, 0 }, { N_, 0 },
+ { N_, 0 }, { N_, 0 }, { N_, 0 },
+ { N_, 0 }, { N_, 0 }, { N_, 0 },
+ { N_, 0 }, { N_, 0 }, { N_, 0 },
+ { N_, 0 }, { N_, 0 }, { N_, 0 },
+ { N_, 0 }, { N_, 0 }, { N_, 0 },
+ { N_, 0 }, { N_, 0 }, { N_, 0 },
+ { N_, 0 }, { N_, 0 }, { N_, 0 },
+ { N_, 0 }, { N_, 0 }, { N_, 0 },
+ { N_, 0 }, { N_, 0 }, { N_, 0 },
+ { N_, 0 }, { N_, 0 }, { N_, 0 },
+ { N_, 0 }, { N_, 0 }, { N_, 0 },
+ { N_, 0 }, { N_, 0 }, { N_, 0 },
+ { N_, 0 }, { N_, 0 }, { N_, 0 },
+ { N_, 0 }, { N_, 0 }, { 33, 5783 },
+ { 34, 5807 }, { 35, 5831 }, { 36, 5879 },
+ { 37, 5999 }, { N_, 0 }, { N_, 0 },
+ { N_, 0 }, { N_, 0 }, { N_, 0 },
+ { N_, 0 }, { N_, 0 }, { N_, 0 },
+ { N_, 0 }, { N_, 0 }, { N_, 0 },
+ { N_, 0 }, { N_, 0 }, { N_, 0 },
+ { N_, 0 }, { N_, 0 }, { N_, 0 },
+ { N_, 0 }, { N_, 0 }, { N_, 0 },
+ { N_, 0 }, { N_, 0 }, { N_, 0 },
+ { N_, 0 }, { N_, 0 }, { N_, 0 },
+ { N_, 0 }, { N_, 0 }, { N_, 0 },
+ { N_, 0 }, { N_, 0 }, { N_, 0 },
+ { N_, 0 }, { N_, 0 }, { N_, 0 },
+ { N_, 0 }, { N_, 0 }, { N_, 0 },
+ { N_, 0 }, { N_, 0 }, { N_, 0 },
+ { N_, 0 }, { N_, 0 }, { N_, 0 },
+ { N_, 0 }, { N_, 0 }, { N_, 0 },
+ { N_, 0 }, { N_, 0 }, { N_, 0 },
+ { N_, 0 }, { N_, 0 }, { N_, 0 },
+ { N_, 0 }, { N_, 0 }, { N_, 0 },
+ { N_, 0 }, { N_, 0 }, { N_, 0 },
+ { N_, 0 }, { N_, 0 }, { N_, 0 },
+ { N_, 0 }, { N_, 0 }, { N_, 0 },
+ { N_, 0 }, { N_, 0 }, { N_, 0 },
+ { N_, 0 }, { N_, 0 }, { N_, 0 },
+ { N_, 0 }, { N_, 0 }, { N_, 0 },
+ { N_, 0 }, { N_, 0 }, { N_, 0 },
+ { N_, 0 }, { N_, 0 }, { N_, 0 },
+ { N_, 0 }, { N_, 0 }, { N_, 0 },
+ { N_, 0 }, { N_, 0 }, { N_, 0 },
+ { N_, 0 }, { N_, 0 }, { N_, 0 },
+ { N_, 0 }, { N_, 0 }, { N_, 0 },
+ { N_, 0 }, { N_, 0 }, { N_, 0 },
+ { N_, 0 }, { N_, 0 }, { N_, 0 },
+ { N_, 0 }, { N_, 0 }, { N_, 0 },
+ { N_, 0 }, { N_, 0 }, { N_, 0 },
+ { N_, 0 }, { N_, 0 }, { N_, 0 },
+ { N_, 0 }, { N_, 0 }, { N_, 0 },
+ { N_, 0 }, { N_, 0 }, { N_, 0 },
+ { N_, 0 }, { N_, 0 }, { N_, 0 },
+ { N_, 0 },
+ },
+ { /* Third byte table 4. */
+ { N_, 0 }, { N_, 0 }, { N_, 0 },
+ { N_, 0 }, { N_, 0 }, { N_, 0 },
+ { N_, 0 }, { N_, 0 }, { N_, 0 },
+ { N_, 0 }, { N_, 0 }, { N_, 0 },
+ { N_, 0 }, { N_, 0 }, { N_, 0 },
+ { N_, 0 }, { N_, 0 }, { N_, 0 },
+ { N_, 0 }, { N_, 0 }, { N_, 0 },
+ { N_, 0 }, { N_, 0 }, { N_, 0 },
+ { N_, 0 }, { N_, 0 }, { N_, 0 },
+ { N_, 0 }, { N_, 0 }, { N_, 0 },
+ { N_, 0 }, { N_, 0 }, { N_, 0 },
+ { N_, 0 }, { N_, 0 }, { N_, 0 },
+ { N_, 0 }, { N_, 0 }, { N_, 0 },
+ { N_, 0 }, { N_, 0 }, { N_, 0 },
+ { N_, 0 }, { N_, 0 }, { N_, 0 },
+ { N_, 0 }, { N_, 0 }, { N_, 0 },
+ { N_, 0 }, { N_, 0 }, { N_, 0 },
+ { N_, 0 }, { N_, 0 }, { N_, 0 },
+ { N_, 0 }, { N_, 0 }, { N_, 0 },
+ { N_, 0 }, { N_, 0 }, { N_, 0 },
+ { N_, 0 }, { N_, 0 }, { N_, 0 },
+ { N_, 0 }, { N_, 0 }, { N_, 0 },
+ { N_, 0 }, { N_, 0 }, { N_, 0 },
+ { N_, 0 }, { N_, 0 }, { N_, 0 },
+ { N_, 0 }, { N_, 0 }, { N_, 0 },
+ { N_, 0 }, { N_, 0 }, { N_, 0 },
+ { N_, 0 }, { N_, 0 }, { N_, 0 },
+ { N_, 0 }, { N_, 0 }, { N_, 0 },
+ { N_, 0 }, { N_, 0 }, { N_, 0 },
+ { N_, 0 }, { N_, 0 }, { N_, 0 },
+ { N_, 0 }, { N_, 0 }, { N_, 0 },
+ { N_, 0 }, { N_, 0 }, { N_, 0 },
+ { N_, 0 }, { N_, 0 }, { N_, 0 },
+ { N_, 0 }, { N_, 0 }, { N_, 0 },
+ { N_, 0 }, { N_, 0 }, { N_, 0 },
+ { N_, 0 }, { N_, 0 }, { N_, 0 },
+ { N_, 0 }, { N_, 0 }, { N_, 0 },
+ { N_, 0 }, { N_, 0 }, { N_, 0 },
+ { N_, 0 }, { N_, 0 }, { N_, 0 },
+ { N_, 0 }, { N_, 0 }, { N_, 0 },
+ { N_, 0 }, { N_, 0 }, { N_, 0 },
+ { N_, 0 }, { N_, 0 }, { N_, 0 },
+ { N_, 0 }, { N_, 0 }, { N_, 0 },
+ { 38, 6111 }, { 39, 6340 }, { 40, 6457 },
+ { N_, 0 }, { N_, 0 }, { N_, 0 },
+ { N_, 0 }, { N_, 0 }, { N_, 0 },
+ { N_, 0 }, { N_, 0 }, { N_, 0 },
+ { N_, 0 }, { N_, 0 }, { N_, 0 },
+ { N_, 0 }, { N_, 0 }, { N_, 0 },
+ { N_, 0 }, { N_, 0 }, { N_, 0 },
+ { N_, 0 }, { N_, 0 }, { N_, 0 },
+ { N_, 0 }, { N_, 0 }, { N_, 0 },
+ { N_, 0 }, { N_, 0 }, { N_, 0 },
+ { N_, 0 }, { N_, 0 }, { N_, 0 },
+ { N_, 0 }, { N_, 0 }, { N_, 0 },
+ { N_, 0 }, { N_, 0 }, { N_, 0 },
+ { N_, 0 }, { N_, 0 }, { N_, 0 },
+ { N_, 0 }, { N_, 0 }, { N_, 0 },
+ { N_, 0 }, { N_, 0 }, { N_, 0 },
+ { N_, 0 }, { N_, 0 }, { N_, 0 },
+ { N_, 0 }, { N_, 0 }, { N_, 0 },
+ { N_, 0 }, { N_, 0 }, { N_, 0 },
+ { N_, 0 }, { N_, 0 }, { N_, 0 },
+ { N_, 0 }, { N_, 0 }, { N_, 0 },
+ { N_, 0 }, { N_, 0 }, { N_, 0 },
+ { N_, 0 }, { N_, 0 }, { N_, 0 },
+ { N_, 0 }, { N_, 0 }, { N_, 0 },
+ { N_, 0 }, { N_, 0 }, { N_, 0 },
+ { N_, 0 }, { N_, 0 }, { N_, 0 },
+ { N_, 0 }, { N_, 0 }, { N_, 0 },
+ { N_, 0 }, { N_, 0 }, { N_, 0 },
+ { N_, 0 }, { N_, 0 }, { N_, 0 },
+ { N_, 0 }, { N_, 0 }, { N_, 0 },
+ { N_, 0 }, { N_, 0 }, { N_, 0 },
+ { N_, 0 }, { N_, 0 }, { N_, 0 },
+ { N_, 0 }, { N_, 0 }, { N_, 0 },
+ { N_, 0 }, { N_, 0 }, { N_, 0 },
+ { N_, 0 }, { N_, 0 }, { N_, 0 },
+ { N_, 0 }, { N_, 0 }, { N_, 0 },
+ { N_, 0 }, { N_, 0 }, { N_, 0 },
+ { N_, 0 }, { N_, 0 }, { N_, 0 },
+ { N_, 0 }, { N_, 0 }, { N_, 0 },
+ { N_, 0 }, { N_, 0 }, { N_, 0 },
+ { N_, 0 }, { N_, 0 }, { N_, 0 },
+ { N_, 0 }, { N_, 0 }, { N_, 0 },
+ { N_, 0 },
+ },
+ },
+};
+
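+/*
+ * Reader's sketch (illustrative, not part of the original change): the
+ * third-byte table above yields { tbl_id, base } pairs.  A tbl_id of N_
+ * is taken to mean "no composition data for this prefix"; otherwise it
+ * selects one of the fourth-byte tables below, and base is an offset
+ * into the flat composition data.  Entry i of a fourth-byte table is
+ * assumed to hold the start of the data for fourth byte i and entry
+ * i + 1 its end, which is why each table carries 257 entries for 256
+ * byte values.  The 0x8000 bit on tbl_id is assumed to redirect to a
+ * parallel table of 16-bit offsets for entries whose offsets overflow
+ * a uchar_t.  The helper below is hypothetical, and the names
+ * u8_composition_b4_16bit_tbl and u8_composition_final_tbl are assumed
+ * from this file's conventions rather than taken from this hunk.
+ */
+#if 0	/* sketch only */
+static const uchar_t *
+composition_slice(int uv, uint16_t tbl_id, uint16_t base, uchar_t b4,
+    size_t *lenp)
+{
+	uint16_t start, end;
+
+	if (tbl_id == N_)			/* no data for this prefix */
+		return (NULL);
+	if (tbl_id & 0x8000) {			/* 16-bit offset table */
+		tbl_id &= ~0x8000;
+		start = u8_composition_b4_16bit_tbl[uv][tbl_id][b4];
+		end = u8_composition_b4_16bit_tbl[uv][tbl_id][b4 + 1];
+	} else {				/* 8-bit offset table below */
+		start = u8_composition_b4_tbl[uv][tbl_id][b4];
+		end = u8_composition_b4_tbl[uv][tbl_id][b4 + 1];
+	}
+	if (start == end)			/* this byte is not mapped */
+		return (NULL);
+	*lenp = (size_t)(end - start);
+	/* uv selects the Unicode version (the leading [2] dimension). */
+	return (&u8_composition_final_tbl[uv][base + start]);
+}
+#endif
+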
+static const uchar_t u8_composition_b4_tbl[2][41][257] = {
+ {
+ { /* Fourth byte table 0. */
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 21, 21, 21, 21, 21, 21, 21,
+ 21, 21, 21, 21, 21, 21, 21, 21,
+ 21, 21, 21, 21, 21, 21, 21, 21,
+ 21, 21, 21, 21, 21, 21, 21, 21,
+ 21, 21, 21, 21, 21, 21, 21, 21,
+ 21, 21, 21, 21, 21, 21, 21, 21,
+ 21, 21, 21, 21, 21, 21, 21, 21,
+ 21, 21, 21, 21, 21, 21, 21, 21,
+ 21, 21, 21, 21, 21, 21, 21, 21,
+ 21, 21, 21, 21, 21, 21, 21, 21,
+ 21, 21, 21, 21, 21, 21, 21, 21,
+ 21,
+ },
+ { /* Fourth byte table 1. */
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 29, 58, 58, 58, 58,
+ 58, 58, 58, 58, 58, 58, 58, 58,
+ 58, 58, 58, 73, 88, 88, 88, 88,
+ 88, 88, 88, 88, 88, 88, 88, 88,
+ 88, 88, 88, 88, 88, 88, 88, 88,
+ 88, 88, 88, 88, 88, 88, 88, 88,
+ 88, 88, 88, 88, 88, 88, 88, 88,
+ 88, 88, 88, 88, 88, 88, 88, 88,
+ 88, 88, 88, 88, 88, 88, 88, 88,
+ 88, 88, 88, 88, 88, 88, 88, 88,
+ 88, 88, 88, 88, 88, 88, 88, 88,
+ 88, 88, 88, 88, 88, 88, 88, 88,
+ 88, 88, 88, 88, 88, 88, 88, 88,
+ 88, 88, 88, 88, 88, 88, 88, 88,
+ 88, 88, 88, 88, 88, 88, 88, 88,
+ 88, 88, 88, 88, 88, 88, 88, 88,
+ 88,
+ },
+ { /* Fourth byte table 2. */
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 15, 30, 30,
+ 30, 30, 30, 30, 30, 30, 30, 30,
+ 30, 30, 30, 38, 46, 46, 46, 46,
+ 46, 54, 62, 62, 62, 62, 62, 62,
+ 62, 70, 78, 86, 94, 94, 94, 94,
+ 94, 94, 94, 94, 94, 94, 94, 94,
+ 94, 94, 94, 94, 94, 94, 94, 94,
+ 102, 102, 102, 102, 102, 102, 102, 102,
+ 102, 102, 102, 102, 102, 102, 102, 102,
+ 102, 102, 102, 102, 102, 102, 102, 102,
+ 102, 102, 102, 102, 102, 102, 102, 102,
+ 102, 102, 102, 102, 102, 102, 102, 102,
+ 102, 102, 102, 102, 102, 102, 102, 102,
+ 102, 102, 102, 102, 102, 102, 102, 102,
+ 102, 102, 102, 102, 102, 102, 102, 102,
+ 102,
+ },
+ { /* Fourth byte table 3. */
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 36, 72, 72, 72, 72, 72, 72,
+ 72, 72, 72, 72, 72, 72, 72, 72,
+ 108, 144, 144, 144, 144, 144, 144, 144,
+ 151, 151, 151, 151, 151, 151, 151, 151,
+ 151, 151, 151, 151, 151, 151, 151, 151,
+ 151, 151, 151, 151, 151, 151, 151, 151,
+ 151, 151, 151, 151, 151, 151, 151, 151,
+ 151, 151, 151, 151, 151, 151, 151, 151,
+ 151, 151, 151, 151, 151, 151, 151, 151,
+ 151, 151, 151, 151, 151, 151, 151, 151,
+ 151, 151, 151, 151, 151, 151, 151, 151,
+ 151, 151, 151, 151, 151, 151, 151, 151,
+ 151,
+ },
+ { /* Fourth byte table 4. */
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 7, 14, 14, 14, 14,
+ 14, 14, 14, 14, 14, 14, 14, 14,
+ 14, 14, 14, 14, 14, 14, 14, 14,
+ 14, 14, 14, 14, 14, 14, 14, 14,
+ 14, 14, 14, 14, 14, 14, 14, 14,
+ 14, 14, 14, 14, 14, 14, 14, 14,
+ 14, 14, 14, 14, 14, 14, 14, 14,
+ 14, 14, 14, 14, 14, 14, 14, 14,
+ 14, 14, 14, 14, 14, 14, 14, 14,
+ 14, 14, 14, 14, 14, 14, 14, 14,
+ 14, 14, 14, 14, 14, 14, 14, 14,
+ 14,
+ },
+ { /* Fourth byte table 5. */
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 7,
+ 14, 22, 30, 30, 30, 30, 30, 37,
+ 44, 44, 44, 44, 44, 44, 44, 44,
+ 44, 44, 44, 44, 44, 44, 44, 44,
+ 44, 44, 44, 44, 44, 44, 44, 44,
+ 44, 44, 44, 44, 44, 44, 44, 44,
+ 44, 44, 44, 44, 44, 44, 44, 44,
+ 44, 44, 44, 44, 44, 44, 44, 44,
+ 44, 44, 44, 44, 44, 44, 44, 44,
+ 44, 44, 44, 44, 44, 44, 44, 44,
+ 44, 44, 44, 44, 44, 44, 44, 44,
+ 44, 44, 44, 44, 44, 44, 44, 44,
+ 44,
+ },
+ { /* Fourth byte table 6. */
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 7, 7, 7, 7, 7,
+ 7, 7, 7, 7, 7, 7, 7, 7,
+ 7, 7, 7, 7, 7, 7, 7, 7,
+ 7, 7, 7, 7, 7, 7, 7, 7,
+ 7, 7, 7, 7, 7, 7, 7, 7,
+ 7, 7, 7, 7, 7, 7, 7, 7,
+ 7, 7, 7, 7, 7, 7, 7, 7,
+ 7, 7, 7, 7, 7, 7, 7, 7,
+ 7, 7, 7, 7, 7, 7, 7, 7,
+ 7, 7, 7, 7, 7, 7, 7, 7,
+ 7, 7, 7, 7, 7, 7, 7, 7,
+ 7, 7, 7, 7, 7, 7, 7, 7,
+ 7, 7, 7, 7, 7, 7, 7, 7,
+ 7, 7, 7, 7, 7, 7, 7, 7,
+ 7,
+ },
+ { /* Fourth byte table 7. */
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 15, 15, 15, 15, 70, 70,
+ 70, 70, 112, 133, 154, 154, 154, 162,
+ 162, 162, 162, 175, 175, 175, 175, 175,
+ 175, 175, 175, 175, 175, 175, 175, 175,
+ 175, 175, 175, 175, 175, 175, 175, 175,
+ 175, 175, 175, 175, 175, 175, 175, 175,
+ 175, 175, 175, 175, 175, 175, 175, 175,
+ 175, 175, 175, 175, 175, 175, 175, 175,
+ 175, 175, 175, 175, 175, 175, 175, 175,
+ 175, 175, 175, 175, 175, 175, 175, 175,
+ 175, 175, 175, 175, 175, 175, 175, 175,
+ 175, 175, 175, 175, 175, 175, 175, 175,
+ 175, 175, 175, 175, 175, 175, 175, 175,
+ 175, 175, 175, 175, 175, 175, 175, 175,
+ 175, 175, 175, 175, 175, 175, 175, 175,
+ 175, 175, 175, 175, 175, 175, 175, 175,
+ 175,
+ },
+ { /* Fourth byte table 8. */
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 7,
+ 7, 7, 7, 7, 7, 7, 7, 7,
+ 7, 20, 20, 20, 27, 27, 46, 59,
+ 66, 91, 91, 98, 98, 98, 98, 105,
+ 105, 105, 105, 105, 130, 130, 130, 130,
+ 137, 137, 137, 137, 144, 144, 151, 151,
+ 151, 164, 164, 164, 171, 171, 190, 203,
+ 210, 235, 235, 242, 242, 242, 242, 249,
+ 249, 249, 249, 249, 249, 249, 249, 249,
+ 249, 249, 249, 249, 249, 249, 249, 249,
+ 249, 249, 249, 249, 249, 249, 249, 249,
+ 249, 249, 249, 249, 249, 249, 249, 249,
+ 249, 249, 249, 249, 249, 249, 249, 249,
+ 249, 249, 249, 249, 249, 249, 249, 249,
+ 249, 249, 249, 249, 249, 249, 249, 249,
+ 249, 249, 249, 249, 249, 249, 249, 249,
+ 249,
+ },
+ { /* Fourth byte table 9. */
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 25, 25, 25, 25,
+ 32, 32, 32, 32, 39, 39, 46, 46,
+ 46, 46, 46, 46, 46, 46, 46, 53,
+ 53, 53, 53, 53, 53, 53, 53, 53,
+ 53, 53, 53, 53, 53, 53, 53, 53,
+ 53, 53, 53, 53, 53, 53, 53, 53,
+ 53, 53, 53, 53, 53, 60, 67, 67,
+ 67, 67, 67, 67, 67, 67, 67, 67,
+ 67, 67, 67, 67, 67, 67, 67, 67,
+ 67, 67, 67, 67, 67, 67, 67, 67,
+ 67, 67, 67, 67, 67, 67, 67, 67,
+ 67, 67, 67, 67, 67, 67, 67, 67,
+ 67, 67, 67, 67, 67, 67, 67, 67,
+ 67, 67, 67, 67, 67, 67, 67, 67,
+ 67, 67, 67, 67, 67, 67, 67, 67,
+ 67, 67, 67, 67, 67, 67, 67, 67,
+ 67,
+ },
+ { /* Fourth byte table 10. */
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 7, 14, 14, 14, 14, 14, 14,
+ 14, 14, 14, 14, 14, 14, 14, 14,
+ 14, 21, 28, 28, 28, 28, 28, 28,
+ 28, 28, 28, 28, 28, 28, 28, 28,
+ 28, 28, 28, 28, 28, 28, 28, 28,
+ 28, 28, 28, 28, 28, 28, 28, 28,
+ 28, 28, 28, 28, 28, 28, 28, 28,
+ 28, 28, 28, 28, 28, 28, 28, 28,
+ 28, 28, 28, 28, 28, 28, 28, 28,
+ 28, 28, 28, 28, 28, 28, 28, 28,
+ 28, 28, 28, 28, 28, 28, 28, 28,
+ 28, 28, 28, 28, 28, 28, 28, 28,
+ 28, 28, 28, 28, 28, 28, 28, 28,
+ 28,
+ },
+ { /* Fourth byte table 11. */
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 19, 19, 19, 19, 19, 19, 19, 19,
+ 19, 19, 19, 19, 19, 19, 19, 19,
+ 19, 19, 19, 19, 19, 19, 19, 19,
+ 19, 19, 19, 19, 19, 19, 19, 19,
+ 19, 19, 19, 19, 19, 19, 19, 19,
+ 19, 19, 19, 19, 19, 19, 19, 19,
+ 19, 19, 19, 19, 19, 19, 19, 19,
+ 19, 19, 19, 19, 19, 19, 19, 19,
+ 19, 19, 19, 19, 19, 19, 19, 19,
+ 19, 19, 19, 19, 19, 19, 19, 19,
+ 19, 19, 19, 19, 19, 19, 19, 19,
+ 19,
+ },
+ { /* Fourth byte table 12. */
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 7, 7, 14, 14, 14, 14, 14,
+ 14, 14, 14, 14, 14, 14, 14, 14,
+ 14, 14, 14, 14, 14, 14, 14, 14,
+ 14, 14, 14, 14, 14, 14, 14, 14,
+ 14, 14, 14, 14, 14, 14, 14, 14,
+ 14, 14, 14, 14, 14, 14, 14, 14,
+ 14, 14, 14, 14, 14, 14, 14, 14,
+ 14, 14, 14, 14, 14, 14, 14, 14,
+ 14, 14, 14, 14, 14, 14, 14, 14,
+ 14, 14, 14, 14, 14, 14, 14, 14,
+ 14, 14, 14, 14, 14, 14, 14, 14,
+ 14, 14, 14, 14, 14, 14, 14, 14,
+ 14, 14, 14, 14, 14, 14, 14, 14,
+ 14, 14, 14, 14, 14, 14, 14, 14,
+ 14, 14, 14, 14, 14, 14, 14, 14,
+ 14,
+ },
+ { /* Fourth byte table 13. */
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 7, 7, 7, 7, 7, 7,
+ 7, 7, 7, 7, 7, 7, 7, 7,
+ 7, 7, 7, 14, 14, 14, 21, 21,
+ 21, 21, 21, 21, 21, 21, 21, 21,
+ 21, 21, 21, 21, 21, 21, 21, 21,
+ 21, 21, 21, 21, 21, 21, 21, 21,
+ 21, 21, 21, 21, 21, 21, 21, 21,
+ 21, 21, 21, 21, 21, 21, 21, 21,
+ 21, 21, 21, 21, 21, 21, 21, 21,
+ 21, 21, 21, 21, 21, 21, 21, 21,
+ 21, 21, 21, 21, 21, 21, 21, 21,
+ 21, 21, 21, 21, 21, 21, 21, 21,
+ 21, 21, 21, 21, 21, 21, 21, 21,
+ 21, 21, 21, 21, 21, 21, 21, 21,
+ 21, 21, 21, 21, 21, 21, 21, 21,
+ 21, 21, 21, 21, 21, 21, 21, 21,
+ 21,
+ },
+ { /* Fourth byte table 14. */
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 9, 9, 9, 9, 9, 9, 9,
+ 9, 18, 18, 18, 27, 27, 27, 27,
+ 27, 27, 27, 27, 27, 27, 27, 27,
+ 27, 27, 27, 27, 27, 27, 27, 27,
+ 27, 27, 27, 27, 27, 27, 27, 27,
+ 27, 27, 27, 27, 27, 27, 27, 27,
+ 27, 27, 27, 27, 27, 27, 27, 27,
+ 27, 27, 27, 27, 27, 27, 27, 27,
+ 27, 27, 27, 27, 27, 27, 27, 27,
+ 27, 27, 27, 27, 27, 27, 27, 27,
+ 27, 27, 27, 27, 27, 27, 27, 27,
+ 27,
+ },
+ { /* Fourth byte table 15. */
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 17, 17, 17, 17, 17, 17, 17, 17,
+ 17, 17, 17, 17, 17, 17, 17, 17,
+ 17, 17, 17, 17, 17, 17, 17, 17,
+ 17, 17, 17, 17, 17, 17, 17, 17,
+ 17, 17, 17, 17, 17, 17, 17, 17,
+ 17, 17, 17, 17, 17, 17, 17, 17,
+ 17, 17, 17, 17, 17, 17, 17, 17,
+ 17, 17, 17, 17, 17, 17, 17, 17,
+ 17, 17, 17, 17, 17, 17, 17, 17,
+ 17, 17, 17, 17, 17, 17, 17, 17,
+ 17, 17, 17, 17, 17, 17, 17, 17,
+ 17, 17, 17, 17, 17, 17, 17, 17,
+ 17, 17, 17, 17, 17, 17, 17, 17,
+ 17, 17, 17, 17, 17, 17, 17, 17,
+ 17, 17, 17, 17, 17, 17, 17, 17,
+ 17,
+ },
+ { /* Fourth byte table 16. */
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 25, 25, 25, 25, 25, 25, 25, 25,
+ 25, 25, 25, 25, 25, 25, 25, 25,
+ 25, 25, 25, 25, 25, 25, 25, 25,
+ 25, 25, 25, 25, 25, 25, 25, 25,
+ 25, 25, 25, 25, 25, 25, 25, 25,
+ 25, 25, 25, 25, 25, 25, 25, 25,
+ 25, 25, 25, 25, 25, 25, 25, 25,
+ 25, 25, 25, 25, 25, 25, 25, 25,
+ 25, 25, 25, 25, 25, 25, 25, 25,
+ 25, 25, 25, 25, 25, 25, 25, 25,
+ 25, 25, 25, 25, 25, 25, 25, 25,
+ 25, 25, 25, 25, 25, 25, 25, 25,
+ 25, 25, 25, 25, 25, 25, 25, 25,
+ 25, 25, 25, 25, 25, 25, 25, 25,
+ 25, 25, 25, 25, 25, 25, 25, 25,
+ 25,
+ },
+ { /* Fourth byte table 17. */
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 9, 9, 9, 9, 9,
+ 9, 9, 9, 9, 9, 9, 9, 9,
+ 9, 9, 9, 9, 9, 9, 9, 9,
+ 9, 9, 9, 9, 9, 9, 9, 9,
+ 9, 9, 9, 9, 9, 9, 9, 9,
+ 9, 9, 9, 9, 9, 9, 9, 9,
+ 9, 9, 9, 9, 9, 9, 9, 9,
+ 9, 9, 9, 9, 9, 9, 9, 9,
+ 9, 9, 9, 9, 9, 9, 9, 9,
+ 9, 9, 9, 9, 9, 9, 9, 9,
+ 9, 9, 9, 9, 9, 9, 9, 9,
+ 9, 9, 9, 9, 9, 9, 9, 9,
+ 9, 9, 9, 9, 9, 9, 9, 9,
+ 9, 9, 9, 9, 9, 9, 9, 9,
+ 9,
+ },
+ { /* Fourth byte table 18. */
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 17,
+ 26, 26, 26, 26, 26, 26, 26, 26,
+ 26, 26, 26, 26, 26, 26, 26, 26,
+ 26, 26, 26, 26, 26, 26, 26, 26,
+ 26, 26, 26, 26, 26, 26, 26, 26,
+ 26, 26, 26, 26, 26, 26, 26, 26,
+ 26, 26, 26, 26, 26, 26, 26, 26,
+ 26, 26, 26, 26, 26, 26, 26, 26,
+ 26, 26, 26, 26, 26, 26, 26, 26,
+ 26, 26, 26, 26, 26, 26, 26, 26,
+ 26, 26, 26, 26, 26, 26, 26, 26,
+ 26, 26, 26, 26, 26, 26, 26, 26,
+ 26, 26, 26, 26, 26, 26, 26, 26,
+ 26, 26, 26, 26, 26, 26, 26, 26,
+ 26, 26, 26, 26, 26, 26, 26, 26,
+ 26, 26, 26, 26, 26, 26, 26, 26,
+ 26,
+ },
+ { /* Fourth byte table 19. */
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 9,
+ 9, 9, 9, 9, 9, 9, 9, 9,
+ 9, 9, 9, 9, 9, 9, 9, 9,
+ 9, 9, 9, 9, 9, 9, 9, 9,
+ 9, 9, 9, 9, 9, 9, 9, 9,
+ 9, 9, 9, 9, 9, 9, 9, 9,
+ 9, 9, 9, 9, 9, 9, 9, 9,
+ 9, 9, 9, 9, 9, 9, 9, 9,
+ 9, 9, 9, 9, 9, 9, 9, 9,
+ 9, 9, 9, 9, 9, 9, 9, 9,
+ 9, 9, 9, 9, 9, 9, 9, 9,
+ 9, 9, 9, 9, 9, 9, 9, 9,
+ 9, 9, 9, 9, 9, 9, 9, 9,
+ 9, 9, 9, 9, 9, 9, 9, 9,
+ 9, 9, 9, 9, 9, 9, 9, 9,
+ 9, 9, 9, 9, 9, 9, 9, 9,
+ 9,
+ },
+ { /* Fourth byte table 20. */
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 9, 9, 9, 9, 9, 9, 9, 9,
+ 9, 9, 9, 9, 9, 9, 9, 9,
+ 9, 9, 9, 9, 9, 9, 9, 9,
+ 9, 9, 9, 9, 9, 9, 9, 9,
+ 9, 9, 9, 9, 9, 9, 9, 9,
+ 9, 9, 9, 9, 9, 9, 9, 9,
+ 9, 9, 9, 9, 9, 9, 9, 9,
+ 9, 9, 9, 9, 9, 9, 9, 9,
+ 9,
+ },
+ { /* Fourth byte table 21. */
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 25,
+ 25, 25, 25, 34, 34, 34, 34, 34,
+ 34, 34, 34, 34, 34, 34, 34, 34,
+ 34, 34, 34, 34, 34, 34, 34, 34,
+ 34, 34, 34, 34, 34, 34, 34, 34,
+ 34, 34, 34, 34, 34, 34, 34, 34,
+ 34, 34, 34, 34, 34, 34, 34, 34,
+ 34, 34, 34, 34, 34, 34, 34, 34,
+ 34, 34, 34, 34, 34, 34, 34, 34,
+ 34, 34, 34, 34, 34, 34, 34, 34,
+ 34, 34, 34, 34, 34, 34, 34, 34,
+ 34, 34, 34, 34, 34, 34, 34, 34,
+ 34, 34, 34, 34, 34, 34, 34, 34,
+ 34, 34, 34, 34, 34, 34, 34, 34,
+ 34, 34, 34, 34, 34, 34, 34, 34,
+ 34, 34, 34, 34, 34, 34, 34, 34,
+ 34,
+ },
+ { /* Fourth byte table 22. */
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 17,
+ 26, 26, 26, 26, 26, 26, 26, 26,
+ 26, 26, 26, 26, 26, 26, 26, 26,
+ 26, 26, 26, 26, 26, 26, 26, 26,
+ 26, 26, 26, 26, 26, 26, 26, 26,
+ 26, 26, 26, 26, 26, 26, 26, 26,
+ 26, 26, 26, 26, 26, 26, 26, 26,
+ 26, 26, 26, 26, 26, 26, 26, 26,
+ 26, 26, 26, 26, 26, 26, 26, 26,
+ 26, 26, 26, 26, 26, 26, 26, 26,
+ 26, 26, 26, 26, 26, 26, 26, 26,
+ 26, 26, 26, 26, 26, 26, 26, 26,
+ 26, 26, 26, 26, 26, 26, 26, 26,
+ 26, 26, 26, 26, 26, 26, 26, 26,
+ 26, 26, 26, 26, 26, 26, 26, 26,
+ 26, 26, 26, 26, 26, 26, 26, 26,
+ 26,
+ },
+ { /* Fourth byte table 23. */
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 25, 25, 25, 34, 34, 34,
+ 34, 34, 34, 34, 34, 34, 34, 34,
+ 34, 34, 34, 34, 34, 34, 34, 34,
+ 34, 34, 34, 34, 34, 34, 34, 34,
+ 34, 34, 34, 34, 34, 34, 34, 34,
+ 34, 34, 34, 34, 34, 34, 34, 34,
+ 34, 34, 34, 34, 34, 34, 34, 34,
+ 34, 34, 34, 34, 34, 34, 34, 34,
+ 34, 34, 34, 34, 34, 34, 34, 34,
+ 34, 34, 34, 34, 34, 34, 34, 34,
+ 34, 34, 34, 34, 34, 34, 34, 34,
+ 34, 34, 34, 34, 34, 34, 34, 34,
+ 34, 34, 34, 34, 34, 34, 34, 34,
+ 34,
+ },
+ { /* Fourth byte table 24. */
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 9, 9,
+ 9, 9, 9, 9, 9, 9, 9, 9,
+ 9, 9, 9, 9, 9, 9, 9, 9,
+ 9, 9, 9, 9, 9, 9, 9, 9,
+ 9, 9, 9, 9, 9, 9, 9, 9,
+ 9, 9, 9, 9, 9, 9, 9, 9,
+ 9, 9, 9, 9, 9, 9, 9, 9,
+ 9, 9, 9, 9, 9, 9, 9, 9,
+ 9, 9, 9, 9, 9, 9, 9, 9,
+ 9, 9, 9, 9, 9, 9, 9, 9,
+ 9, 9, 9, 9, 9, 9, 9, 9,
+ 9, 9, 9, 9, 9, 9, 9, 9,
+ 9,
+ },
+ { /* Fourth byte table 25. */
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 8,
+ 16, 16, 16, 16, 16, 16, 16, 16,
+ 16, 16, 16, 16, 16, 16, 16, 16,
+ 16, 16, 16, 16, 16, 16, 16, 16,
+ 16, 16, 16, 16, 16, 16, 16, 16,
+ 16, 16, 16, 16, 16, 16, 16, 16,
+ 16, 16, 16, 16, 16, 16, 16, 16,
+ 16, 16, 16, 16, 16, 16, 16, 16,
+ 16, 16, 16, 16, 16, 16, 16, 16,
+ 16, 16, 16, 16, 16, 16, 16, 16,
+ 16,
+ },
+ { /* Fourth byte table 26. */
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 8, 16, 16, 16, 16,
+ 16, 16, 16, 24, 32, 32, 32, 32,
+ 32, 32, 32, 32, 32, 32, 32, 32,
+ 32, 32, 32, 32, 32, 32, 32, 32,
+ 32, 32, 32, 32, 32, 32, 32, 32,
+ 32, 32, 32, 32, 32, 32, 32, 32,
+ 32, 32, 32, 32, 32, 32, 32, 32,
+ 32, 32, 32, 32, 32, 32, 32, 32,
+ 32, 32, 32, 32, 32, 32, 32, 32,
+ 32, 32, 32, 32, 32, 32, 32, 32,
+ 32, 32, 32, 32, 32, 32, 32, 32,
+ 32, 32, 32, 32, 32, 32, 32, 32,
+ 32, 32, 32, 32, 32, 32, 32, 32,
+ 32,
+ },
+ { /* Fourth byte table 27. */
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 15, 30, 30, 30, 30, 30, 30,
+ 30, 30, 30, 30, 30, 30, 30, 30,
+ 30, 30, 30, 30, 30, 30, 30, 30,
+ 30, 38, 46, 46, 46, 46, 46, 46,
+ 46, 46, 46, 46, 46, 46, 46, 46,
+ 46, 46, 46, 46, 46, 46, 46, 46,
+ 46, 46, 46, 46, 46, 46, 46, 46,
+ 46, 46, 46, 46, 46, 46, 46, 46,
+ 46, 46, 46, 46, 46, 46, 46, 46,
+ 46, 46, 46, 46, 46, 46, 46, 46,
+ 46, 46, 46, 46, 46, 46, 46, 46,
+ 46, 46, 46, 46, 46, 46, 46, 46,
+ 46,
+ },
+ { /* Fourth byte table 28. */
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 8, 16, 16,
+ 16, 16, 16, 16, 16, 16, 16, 16,
+ 16, 16, 16, 16, 16, 16, 16, 16,
+ 16, 16, 16, 16, 16, 16, 16, 16,
+ 16, 16, 16, 16, 16, 16, 16, 16,
+ 16, 16, 16, 16, 16, 16, 16, 16,
+ 16, 16, 16, 16, 16, 16, 16, 16,
+ 16, 16, 16, 16, 16, 16, 16, 16,
+ 16, 16, 16, 16, 16, 16, 16, 16,
+ 16, 16, 16, 16, 16, 16, 16, 16,
+ 16, 16, 16, 16, 16, 16, 16, 16,
+ 16, 16, 16, 16, 16, 16, 16, 16,
+ 16, 16, 16, 16, 16, 16, 16, 16,
+ 16, 16, 16, 16, 16, 16, 16, 16,
+ 16, 16, 16, 16, 16, 16, 16, 16,
+ 16,
+ },
+ { /* Fourth byte table 29. */
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 8,
+ 8, 8, 8, 8, 8, 8, 8, 8,
+ 30, 30, 30, 30, 30, 30, 30, 30,
+ 30, 30, 30, 30, 30, 30, 30, 30,
+ 30, 30, 30, 30, 30, 30, 30, 30,
+ 30, 30, 30, 30, 30, 30, 30, 30,
+ 30, 30, 30, 30, 30, 30, 30, 30,
+ 30, 30, 30, 30, 30, 30, 30, 30,
+ 30, 30, 30, 30, 30, 30, 30, 30,
+ 30, 30, 30, 30, 30, 30, 30, 30,
+ 30,
+ },
+ { /* Fourth byte table 30. */
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 8,
+ 8, 8, 8, 8, 8, 8, 8, 8,
+ 8, 8, 8, 8, 8, 8, 8, 8,
+ 8, 8, 8, 8, 8, 8, 8, 8,
+ 8, 8, 8, 8, 8, 8, 8, 8,
+ 8, 8, 8, 8, 8, 8, 8, 8,
+ 8, 8, 8, 8, 8, 8, 8, 16,
+ 16, 16, 16, 16, 16, 16, 16, 38,
+ 38, 38, 38, 38, 38, 38, 38, 38,
+ 38, 38, 38, 38, 38, 38, 38, 38,
+ 38, 38, 38, 38, 38, 38, 38, 38,
+ 38, 38, 38, 38, 38, 38, 38, 38,
+ 38, 38, 38, 38, 38, 38, 38, 38,
+ 38, 38, 38, 38, 38, 38, 38, 38,
+ 38, 38, 38, 38, 38, 38, 38, 38,
+ 38, 38, 38, 38, 38, 38, 38, 38,
+ 38,
+ },
+ { /* Fourth byte table 31. */
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 8, 8, 16, 16, 24, 24, 24,
+ 24, 24, 24, 24, 24, 24, 24, 24,
+ 24, 24, 24, 24, 24, 24, 24, 24,
+ 24, 24, 24, 24, 24, 24, 24, 24,
+ 24, 24, 24, 24, 24, 24, 24, 24,
+ 24, 24, 24, 24, 24, 24, 24, 24,
+ 24, 24, 24, 24, 24, 24, 24, 24,
+ 24, 24, 24, 24, 24, 24, 24, 24,
+ 24, 24, 24, 24, 24, 24, 24, 24,
+ 24, 24, 24, 24, 24, 24, 24, 24,
+ 24, 24, 24, 24, 24, 24, 24, 24,
+ 24, 24, 24, 24, 24, 24, 24, 24,
+ 24, 24, 24, 24, 24, 24, 24, 24,
+ 24, 24, 24, 24, 24, 24, 24, 24,
+ 24,
+ },
+ { /* Fourth byte table 32. */
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 8, 8, 16, 16, 24, 24, 24,
+ 24, 24, 24, 24, 24, 24, 24, 24,
+ 24, 24, 24, 24, 24, 24, 24, 24,
+ 24, 24, 24, 24, 24, 24, 24, 24,
+ 24, 24, 24, 24, 24, 24, 24, 24,
+ 24, 24, 24, 24, 24, 24, 24, 24,
+ 24, 24, 24, 24, 24, 24, 24, 24,
+ 24, 24, 24, 24, 24, 24, 24, 24,
+ 24, 24, 24, 24, 24, 24, 24, 24,
+ 24, 24, 24, 24, 24, 24, 24, 24,
+ 24, 24, 24, 24, 24, 24, 24, 24,
+ 24, 24, 24, 24, 24, 24, 24, 24,
+ 24, 24, 24, 24, 24, 24, 24, 24,
+ 24, 24, 24, 24, 24, 24, 24, 24,
+ 24,
+ },
+ { /* Fourth byte table 33. */
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 8, 8, 8, 8,
+ 8, 16, 16, 16, 24, 24, 24, 24,
+ 24, 24, 24, 24, 24, 24, 24, 24,
+ 24, 24, 24, 24, 24, 24, 24, 24,
+ 24, 24, 24, 24, 32, 32, 40, 40,
+ 40, 40, 40, 40, 40, 40, 40, 40,
+ 40, 40, 40, 40, 40, 40, 40, 40,
+ 40, 40, 40, 40, 40, 48, 48, 48,
+ 48, 48, 48, 48, 48, 48, 48, 48,
+ 48, 48, 48, 48, 48, 48, 48, 48,
+ 48, 48, 48, 48, 48, 48, 48, 48,
+ 48, 48, 48, 48, 48, 48, 48, 48,
+ 48, 48, 48, 48, 48, 48, 48, 48,
+ 48, 48, 48, 48, 48, 48, 48, 48,
+ 48, 48, 48, 48, 48, 48, 48, 48,
+ 48, 48, 48, 48, 48, 48, 48, 48,
+ 48,
+ },
+ { /* Fourth byte table 34. */
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 8, 8, 16, 16,
+ 16, 24, 24, 24, 24, 24, 32, 32,
+ 32, 32, 32, 32, 32, 32, 32, 32,
+ 32, 32, 32, 32, 32, 32, 32, 32,
+ 32, 32, 40, 40, 40, 48, 56, 56,
+ 56, 56, 56, 56, 56, 56, 56, 56,
+ 56, 56, 56, 64, 72, 72, 72, 80,
+ 88, 88, 88, 96, 104, 112, 120, 120,
+ 120, 120, 120, 120, 120, 120, 120, 120,
+ 120, 120, 120, 120, 120, 120, 120, 120,
+ 120, 120, 120, 120, 120, 120, 120, 120,
+ 120, 120, 120, 120, 120, 120, 120, 120,
+ 120, 120, 120, 120, 120, 120, 120, 120,
+ 120, 120, 120, 120, 120, 120, 120, 120,
+ 120, 120, 120, 120, 120, 120, 120, 120,
+ 120, 120, 120, 120, 120, 120, 120, 120,
+ 120,
+ },
+ { /* Fourth byte table 35. */
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 8, 16, 16, 16, 24,
+ 32, 32, 32, 32, 32, 32, 32, 32,
+ 32, 32, 40, 48, 48, 48, 48, 48,
+ 48, 48, 48, 48, 48, 48, 48, 48,
+ 48, 48, 48, 56, 56, 56, 56, 56,
+ 56, 64, 72, 72, 80, 80, 80, 80,
+ 80, 80, 80, 88, 96, 104, 112, 112,
+ 112, 112, 112, 112, 112, 112, 112, 112,
+ 112, 112, 112, 112, 112, 112, 112, 112,
+ 112, 112, 112, 112, 112, 112, 112, 112,
+ 112, 112, 112, 112, 112, 112, 112, 112,
+ 112, 112, 112, 112, 112, 112, 112, 112,
+ 112, 112, 112, 112, 112, 112, 112, 112,
+ 112, 112, 112, 112, 112, 112, 112, 112,
+ 112, 112, 112, 112, 112, 112, 112, 112,
+ 112, 112, 112, 112, 112, 112, 112, 112,
+ 112,
+ },
+ { /* Fourth byte table 36. */
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 9,
+ 9, 9, 9, 9, 18, 18, 27, 27,
+ 36, 36, 45, 45, 54, 54, 63, 63,
+ 72, 72, 81, 81, 90, 90, 99, 99,
+ 108, 108, 117, 117, 117, 126, 126, 135,
+ 135, 144, 144, 144, 144, 144, 144, 144,
+ 161, 161, 161, 178, 178, 178, 195, 195,
+ 195, 212, 212, 212, 229, 229, 229, 229,
+ 229, 229, 229, 229, 229, 229, 229, 229,
+ 229, 229, 229, 229, 229, 229, 229, 229,
+ 229, 229, 229, 229, 229, 229, 229, 229,
+ 229, 229, 229, 229, 229, 229, 229, 229,
+ 229, 229, 229, 229, 229, 229, 229, 229,
+ 229, 229, 229, 229, 229, 229, 229, 229,
+ 229, 229, 229, 229, 229, 229, 229, 229,
+ 229, 229, 229, 229, 229, 229, 229, 229,
+ 229,
+ },
+ { /* Fourth byte table 37. */
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 9, 9,
+ 9, 9, 9, 9, 9, 9, 9, 18,
+ 18, 18, 18, 18, 27, 27, 36, 36,
+ 45, 45, 54, 54, 63, 63, 72, 72,
+ 81, 81, 90, 90, 99, 99, 108, 108,
+ 117, 117, 117, 117, 117, 117, 117, 117,
+ 117, 117, 117, 117, 117, 117, 117, 117,
+ 117, 117, 117, 117, 117, 117, 117, 117,
+ 117, 117, 117, 117, 117, 117, 117, 117,
+ 117, 117, 117, 117, 117, 117, 117, 117,
+ 117, 117, 117, 117, 117, 117, 117, 117,
+ 117, 117, 117, 117, 117, 117, 117, 117,
+ 117, 117, 117, 117, 117, 117, 117, 117,
+ 117,
+ },
+ { /* Fourth byte table 38. */
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 9, 9, 9, 18, 18, 27,
+ 27, 36, 36, 36, 36, 36, 36, 36,
+ 53, 53, 53, 70, 70, 70, 87, 87,
+ 87, 104, 104, 104, 121, 121, 121, 121,
+ 121, 121, 121, 121, 121, 121, 121, 121,
+ 121, 121, 121, 121, 121, 121, 121, 121,
+ 130, 139, 148, 157, 157, 157, 157, 157,
+ 157, 157, 157, 157, 157, 157, 166, 166,
+ 166, 166, 166, 166, 166, 166, 166, 166,
+ 166, 166, 166, 166, 166, 166, 166, 166,
+ 166, 166, 166, 166, 166, 166, 166, 166,
+ 166, 166, 166, 166, 166, 166, 166, 166,
+ 166, 166, 166, 166, 166, 166, 166, 166,
+ 166, 166, 166, 166, 166, 166, 166, 166,
+ 166, 166, 166, 166, 166, 166, 166, 166,
+ 166, 166, 166, 166, 166, 166, 166, 166,
+ 166,
+ },
+ { /* Fourth byte table 39. */
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0,
+ },
+ { /* Fourth byte table 40. */
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0,
+ },
+ },
+ {
+ { /* Fourth byte table 0. */
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 21, 21, 21, 21, 21, 21, 21,
+ 21, 21, 21, 21, 21, 21, 21, 21,
+ 21, 21, 21, 21, 21, 21, 21, 21,
+ 21, 21, 21, 21, 21, 21, 21, 21,
+ 21, 21, 21, 21, 21, 21, 21, 21,
+ 21, 21, 21, 21, 21, 21, 21, 21,
+ 21, 21, 21, 21, 21, 21, 21, 21,
+ 21, 21, 21, 21, 21, 21, 21, 21,
+ 21, 21, 21, 21, 21, 21, 21, 21,
+ 21, 21, 21, 21, 21, 21, 21, 21,
+ 21, 21, 21, 21, 21, 21, 21, 21,
+ 21,
+ },
+ { /* Fourth byte table 1. */
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 29, 58, 58, 58, 58,
+ 58, 58, 58, 58, 58, 58, 58, 58,
+ 58, 58, 58, 73, 88, 88, 88, 88,
+ 88, 88, 88, 88, 88, 88, 88, 88,
+ 88, 88, 88, 88, 88, 88, 88, 88,
+ 88, 88, 88, 88, 88, 88, 88, 88,
+ 88, 88, 88, 88, 88, 88, 88, 88,
+ 88, 88, 88, 88, 88, 88, 88, 88,
+ 88, 88, 88, 88, 88, 88, 88, 88,
+ 88, 88, 88, 88, 88, 88, 88, 88,
+ 88, 88, 88, 88, 88, 88, 88, 88,
+ 88, 88, 88, 88, 88, 88, 88, 88,
+ 88, 88, 88, 88, 88, 88, 88, 88,
+ 88, 88, 88, 88, 88, 88, 88, 88,
+ 88, 88, 88, 88, 88, 88, 88, 88,
+ 88, 88, 88, 88, 88, 88, 88, 88,
+ 88,
+ },
+ { /* Fourth byte table 2. */
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 15, 30, 30,
+ 30, 30, 30, 30, 30, 30, 30, 30,
+ 30, 30, 30, 38, 46, 46, 46, 46,
+ 46, 54, 62, 62, 62, 62, 62, 62,
+ 62, 70, 78, 86, 94, 94, 94, 94,
+ 94, 94, 94, 94, 94, 94, 94, 94,
+ 94, 94, 94, 94, 94, 94, 94, 94,
+ 102, 102, 102, 102, 102, 102, 102, 102,
+ 102, 102, 102, 102, 102, 102, 102, 102,
+ 102, 102, 102, 102, 102, 102, 102, 102,
+ 102, 102, 102, 102, 102, 102, 102, 102,
+ 102, 102, 102, 102, 102, 102, 102, 102,
+ 102, 102, 102, 102, 102, 102, 102, 102,
+ 102, 102, 102, 102, 102, 102, 102, 102,
+ 102, 102, 102, 102, 102, 102, 102, 102,
+ 102,
+ },
+ { /* Fourth byte table 3. */
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 36, 72, 72, 72, 72, 72, 72,
+ 72, 72, 72, 72, 72, 72, 72, 72,
+ 108, 144, 144, 144, 144, 144, 144, 144,
+ 151, 151, 151, 151, 151, 151, 151, 151,
+ 151, 151, 151, 151, 151, 151, 151, 151,
+ 151, 151, 151, 151, 151, 151, 151, 151,
+ 151, 151, 151, 151, 151, 151, 151, 151,
+ 151, 151, 151, 151, 151, 151, 151, 151,
+ 151, 151, 151, 151, 151, 151, 151, 151,
+ 151, 151, 151, 151, 151, 151, 151, 151,
+ 151, 151, 151, 151, 151, 151, 151, 151,
+ 151, 151, 151, 151, 151, 151, 151, 151,
+ 151,
+ },
+ { /* Fourth byte table 4. */
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 7, 14, 14, 14, 14,
+ 14, 14, 14, 14, 14, 14, 14, 14,
+ 14, 14, 14, 14, 14, 14, 14, 14,
+ 14, 14, 14, 14, 14, 14, 14, 14,
+ 14, 14, 14, 14, 14, 14, 14, 14,
+ 14, 14, 14, 14, 14, 14, 14, 14,
+ 14, 14, 14, 14, 14, 14, 14, 14,
+ 14, 14, 14, 14, 14, 14, 14, 14,
+ 14, 14, 14, 14, 14, 14, 14, 14,
+ 14, 14, 14, 14, 14, 14, 14, 14,
+ 14, 14, 14, 14, 14, 14, 14, 14,
+ 14,
+ },
+ { /* Fourth byte table 5. */
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 7,
+ 14, 22, 30, 30, 30, 30, 30, 37,
+ 44, 44, 44, 44, 44, 44, 44, 44,
+ 44, 44, 44, 44, 44, 44, 44, 44,
+ 44, 44, 44, 44, 44, 44, 44, 44,
+ 44, 44, 44, 44, 44, 44, 44, 44,
+ 44, 44, 44, 44, 44, 44, 44, 44,
+ 44, 44, 44, 44, 44, 44, 44, 44,
+ 44, 44, 44, 44, 44, 44, 44, 44,
+ 44, 44, 44, 44, 44, 44, 44, 44,
+ 44, 44, 44, 44, 44, 44, 44, 44,
+ 44, 44, 44, 44, 44, 44, 44, 44,
+ 44,
+ },
+ { /* Fourth byte table 6. */
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 7, 7, 7, 7, 7,
+ 7, 7, 7, 7, 7, 7, 7, 7,
+ 7, 7, 7, 7, 7, 7, 7, 7,
+ 7, 7, 7, 7, 7, 7, 7, 7,
+ 7, 7, 7, 7, 7, 7, 7, 7,
+ 7, 7, 7, 7, 7, 7, 7, 7,
+ 7, 7, 7, 7, 7, 7, 7, 7,
+ 7, 7, 7, 7, 7, 7, 7, 7,
+ 7, 7, 7, 7, 7, 7, 7, 7,
+ 7, 7, 7, 7, 7, 7, 7, 7,
+ 7, 7, 7, 7, 7, 7, 7, 7,
+ 7, 7, 7, 7, 7, 7, 7, 7,
+ 7, 7, 7, 7, 7, 7, 7, 7,
+ 7, 7, 7, 7, 7, 7, 7, 7,
+ 7,
+ },
+ { /* Fourth byte table 7. */
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 15, 15, 15, 15, 70, 70,
+ 70, 70, 112, 133, 154, 154, 154, 162,
+ 162, 162, 162, 175, 175, 175, 175, 175,
+ 175, 175, 175, 175, 175, 175, 175, 175,
+ 175, 175, 175, 175, 175, 175, 175, 175,
+ 175, 175, 175, 175, 175, 175, 175, 175,
+ 175, 175, 175, 175, 175, 175, 175, 175,
+ 175, 175, 175, 175, 175, 175, 175, 175,
+ 175, 175, 175, 175, 175, 175, 175, 175,
+ 175, 175, 175, 175, 175, 175, 175, 175,
+ 175, 175, 175, 175, 175, 175, 175, 175,
+ 175, 175, 175, 175, 175, 175, 175, 175,
+ 175, 175, 175, 175, 175, 175, 175, 175,
+ 175, 175, 175, 175, 175, 175, 175, 175,
+ 175, 175, 175, 175, 175, 175, 175, 175,
+ 175, 175, 175, 175, 175, 175, 175, 175,
+ 175,
+ },
+ { /* Fourth byte table 8. */
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 7,
+ 7, 7, 7, 7, 7, 7, 7, 7,
+ 7, 20, 20, 20, 27, 27, 46, 59,
+ 66, 91, 91, 98, 98, 98, 98, 105,
+ 105, 105, 105, 105, 130, 130, 130, 130,
+ 137, 137, 137, 137, 144, 144, 151, 151,
+ 151, 164, 164, 164, 171, 171, 190, 203,
+ 210, 235, 235, 242, 242, 242, 242, 249,
+ 249, 249, 249, 249, 249, 249, 249, 249,
+ 249, 249, 249, 249, 249, 249, 249, 249,
+ 249, 249, 249, 249, 249, 249, 249, 249,
+ 249, 249, 249, 249, 249, 249, 249, 249,
+ 249, 249, 249, 249, 249, 249, 249, 249,
+ 249, 249, 249, 249, 249, 249, 249, 249,
+ 249, 249, 249, 249, 249, 249, 249, 249,
+ 249, 249, 249, 249, 249, 249, 249, 249,
+ 249,
+ },
+ { /* Fourth byte table 9. */
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 25, 25, 25, 25,
+ 32, 32, 32, 32, 39, 39, 46, 46,
+ 46, 46, 46, 46, 46, 46, 46, 53,
+ 53, 53, 53, 53, 53, 53, 53, 53,
+ 53, 53, 53, 53, 53, 53, 53, 53,
+ 53, 53, 53, 53, 53, 53, 53, 53,
+ 53, 53, 53, 53, 53, 60, 67, 67,
+ 67, 67, 67, 67, 67, 67, 67, 67,
+ 67, 67, 67, 67, 67, 67, 67, 67,
+ 67, 67, 67, 67, 67, 67, 67, 67,
+ 67, 67, 67, 67, 67, 67, 67, 67,
+ 67, 67, 67, 67, 67, 67, 67, 67,
+ 67, 67, 67, 67, 67, 67, 67, 67,
+ 67, 67, 67, 67, 67, 67, 67, 67,
+ 67, 67, 67, 67, 67, 67, 67, 67,
+ 67, 67, 67, 67, 67, 67, 67, 67,
+ 67,
+ },
+ { /* Fourth byte table 10. */
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 7, 14, 14, 14, 14, 14, 14,
+ 14, 14, 14, 14, 14, 14, 14, 14,
+ 14, 21, 28, 28, 28, 28, 28, 28,
+ 28, 28, 28, 28, 28, 28, 28, 28,
+ 28, 28, 28, 28, 28, 28, 28, 28,
+ 28, 28, 28, 28, 28, 28, 28, 28,
+ 28, 28, 28, 28, 28, 28, 28, 28,
+ 28, 28, 28, 28, 28, 28, 28, 28,
+ 28, 28, 28, 28, 28, 28, 28, 28,
+ 28, 28, 28, 28, 28, 28, 28, 28,
+ 28, 28, 28, 28, 28, 28, 28, 28,
+ 28, 28, 28, 28, 28, 28, 28, 28,
+ 28, 28, 28, 28, 28, 28, 28, 28,
+ 28,
+ },
+ { /* Fourth byte table 11. */
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 19, 19, 19, 19, 19, 19, 19, 19,
+ 19, 19, 19, 19, 19, 19, 19, 19,
+ 19, 19, 19, 19, 19, 19, 19, 19,
+ 19, 19, 19, 19, 19, 19, 19, 19,
+ 19, 19, 19, 19, 19, 19, 19, 19,
+ 19, 19, 19, 19, 19, 19, 19, 19,
+ 19, 19, 19, 19, 19, 19, 19, 19,
+ 19, 19, 19, 19, 19, 19, 19, 19,
+ 19, 19, 19, 19, 19, 19, 19, 19,
+ 19, 19, 19, 19, 19, 19, 19, 19,
+ 19, 19, 19, 19, 19, 19, 19, 19,
+ 19,
+ },
+ { /* Fourth byte table 12. */
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 7, 7, 14, 14, 14, 14, 14,
+ 14, 14, 14, 14, 14, 14, 14, 14,
+ 14, 14, 14, 14, 14, 14, 14, 14,
+ 14, 14, 14, 14, 14, 14, 14, 14,
+ 14, 14, 14, 14, 14, 14, 14, 14,
+ 14, 14, 14, 14, 14, 14, 14, 14,
+ 14, 14, 14, 14, 14, 14, 14, 14,
+ 14, 14, 14, 14, 14, 14, 14, 14,
+ 14, 14, 14, 14, 14, 14, 14, 14,
+ 14, 14, 14, 14, 14, 14, 14, 14,
+ 14, 14, 14, 14, 14, 14, 14, 14,
+ 14, 14, 14, 14, 14, 14, 14, 14,
+ 14, 14, 14, 14, 14, 14, 14, 14,
+ 14, 14, 14, 14, 14, 14, 14, 14,
+ 14, 14, 14, 14, 14, 14, 14, 14,
+ 14,
+ },
+ { /* Fourth byte table 13. */
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 7, 7, 7, 7, 7, 7,
+ 7, 7, 7, 7, 7, 7, 7, 7,
+ 7, 7, 7, 14, 14, 14, 21, 21,
+ 21, 21, 21, 21, 21, 21, 21, 21,
+ 21, 21, 21, 21, 21, 21, 21, 21,
+ 21, 21, 21, 21, 21, 21, 21, 21,
+ 21, 21, 21, 21, 21, 21, 21, 21,
+ 21, 21, 21, 21, 21, 21, 21, 21,
+ 21, 21, 21, 21, 21, 21, 21, 21,
+ 21, 21, 21, 21, 21, 21, 21, 21,
+ 21, 21, 21, 21, 21, 21, 21, 21,
+ 21, 21, 21, 21, 21, 21, 21, 21,
+ 21, 21, 21, 21, 21, 21, 21, 21,
+ 21, 21, 21, 21, 21, 21, 21, 21,
+ 21, 21, 21, 21, 21, 21, 21, 21,
+ 21, 21, 21, 21, 21, 21, 21, 21,
+ 21,
+ },
+ { /* Fourth byte table 14. */
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 9, 9, 9, 9, 9, 9, 9,
+ 9, 18, 18, 18, 27, 27, 27, 27,
+ 27, 27, 27, 27, 27, 27, 27, 27,
+ 27, 27, 27, 27, 27, 27, 27, 27,
+ 27, 27, 27, 27, 27, 27, 27, 27,
+ 27, 27, 27, 27, 27, 27, 27, 27,
+ 27, 27, 27, 27, 27, 27, 27, 27,
+ 27, 27, 27, 27, 27, 27, 27, 27,
+ 27, 27, 27, 27, 27, 27, 27, 27,
+ 27, 27, 27, 27, 27, 27, 27, 27,
+ 27, 27, 27, 27, 27, 27, 27, 27,
+ 27,
+ },
+ { /* Fourth byte table 15. */
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 17, 17, 17, 17, 17, 17, 17, 17,
+ 17, 17, 17, 17, 17, 17, 17, 17,
+ 17, 17, 17, 17, 17, 17, 17, 17,
+ 17, 17, 17, 17, 17, 17, 17, 17,
+ 17, 17, 17, 17, 17, 17, 17, 17,
+ 17, 17, 17, 17, 17, 17, 17, 17,
+ 17, 17, 17, 17, 17, 17, 17, 17,
+ 17, 17, 17, 17, 17, 17, 17, 17,
+ 17, 17, 17, 17, 17, 17, 17, 17,
+ 17, 17, 17, 17, 17, 17, 17, 17,
+ 17, 17, 17, 17, 17, 17, 17, 17,
+ 17, 17, 17, 17, 17, 17, 17, 17,
+ 17, 17, 17, 17, 17, 17, 17, 17,
+ 17, 17, 17, 17, 17, 17, 17, 17,
+ 17, 17, 17, 17, 17, 17, 17, 17,
+ 17,
+ },
+ { /* Fourth byte table 16. */
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 25, 25, 25, 25, 25, 25, 25, 25,
+ 25, 25, 25, 25, 25, 25, 25, 25,
+ 25, 25, 25, 25, 25, 25, 25, 25,
+ 25, 25, 25, 25, 25, 25, 25, 25,
+ 25, 25, 25, 25, 25, 25, 25, 25,
+ 25, 25, 25, 25, 25, 25, 25, 25,
+ 25, 25, 25, 25, 25, 25, 25, 25,
+ 25, 25, 25, 25, 25, 25, 25, 25,
+ 25, 25, 25, 25, 25, 25, 25, 25,
+ 25, 25, 25, 25, 25, 25, 25, 25,
+ 25, 25, 25, 25, 25, 25, 25, 25,
+ 25, 25, 25, 25, 25, 25, 25, 25,
+ 25, 25, 25, 25, 25, 25, 25, 25,
+ 25, 25, 25, 25, 25, 25, 25, 25,
+ 25, 25, 25, 25, 25, 25, 25, 25,
+ 25,
+ },
+ { /* Fourth byte table 17. */
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 9, 9, 9, 9, 9,
+ 9, 9, 9, 9, 9, 9, 9, 9,
+ 9, 9, 9, 9, 9, 9, 9, 9,
+ 9, 9, 9, 9, 9, 9, 9, 9,
+ 9, 9, 9, 9, 9, 9, 9, 9,
+ 9, 9, 9, 9, 9, 9, 9, 9,
+ 9, 9, 9, 9, 9, 9, 9, 9,
+ 9, 9, 9, 9, 9, 9, 9, 9,
+ 9, 9, 9, 9, 9, 9, 9, 9,
+ 9, 9, 9, 9, 9, 9, 9, 9,
+ 9, 9, 9, 9, 9, 9, 9, 9,
+ 9, 9, 9, 9, 9, 9, 9, 9,
+ 9, 9, 9, 9, 9, 9, 9, 9,
+ 9, 9, 9, 9, 9, 9, 9, 9,
+ 9,
+ },
+ { /* Fourth byte table 18. */
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 17,
+ 26, 26, 26, 26, 26, 26, 26, 26,
+ 26, 26, 26, 26, 26, 26, 26, 26,
+ 26, 26, 26, 26, 26, 26, 26, 26,
+ 26, 26, 26, 26, 26, 26, 26, 26,
+ 26, 26, 26, 26, 26, 26, 26, 26,
+ 26, 26, 26, 26, 26, 26, 26, 26,
+ 26, 26, 26, 26, 26, 26, 26, 26,
+ 26, 26, 26, 26, 26, 26, 26, 26,
+ 26, 26, 26, 26, 26, 26, 26, 26,
+ 26, 26, 26, 26, 26, 26, 26, 26,
+ 26, 26, 26, 26, 26, 26, 26, 26,
+ 26, 26, 26, 26, 26, 26, 26, 26,
+ 26, 26, 26, 26, 26, 26, 26, 26,
+ 26, 26, 26, 26, 26, 26, 26, 26,
+ 26, 26, 26, 26, 26, 26, 26, 26,
+ 26,
+ },
+ { /* Fourth byte table 19. */
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 9,
+ 9, 9, 9, 9, 9, 9, 9, 9,
+ 9, 9, 9, 9, 9, 9, 9, 9,
+ 9, 9, 9, 9, 9, 9, 9, 9,
+ 9, 9, 9, 9, 9, 9, 9, 9,
+ 9, 9, 9, 9, 9, 9, 9, 9,
+ 9, 9, 9, 9, 9, 9, 9, 9,
+ 9, 9, 9, 9, 9, 9, 9, 9,
+ 9, 9, 9, 9, 9, 9, 9, 9,
+ 9, 9, 9, 9, 9, 9, 9, 9,
+ 9, 9, 9, 9, 9, 9, 9, 9,
+ 9, 9, 9, 9, 9, 9, 9, 9,
+ 9, 9, 9, 9, 9, 9, 9, 9,
+ 9, 9, 9, 9, 9, 9, 9, 9,
+ 9, 9, 9, 9, 9, 9, 9, 9,
+ 9, 9, 9, 9, 9, 9, 9, 9,
+ 9,
+ },
+ { /* Fourth byte table 20. */
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 9, 9, 9, 9, 9, 9, 9, 9,
+ 9, 9, 9, 9, 9, 9, 9, 9,
+ 9, 9, 9, 9, 9, 9, 9, 9,
+ 9, 9, 9, 9, 9, 9, 9, 9,
+ 9, 9, 9, 9, 9, 9, 9, 9,
+ 9, 9, 9, 9, 9, 9, 9, 9,
+ 9, 9, 9, 9, 9, 9, 9, 9,
+ 9, 9, 9, 9, 9, 9, 9, 9,
+ 9,
+ },
+ { /* Fourth byte table 21. */
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 25,
+ 25, 25, 25, 34, 34, 34, 34, 34,
+ 34, 34, 34, 34, 34, 34, 34, 34,
+ 34, 34, 34, 34, 34, 34, 34, 34,
+ 34, 34, 34, 34, 34, 34, 34, 34,
+ 34, 34, 34, 34, 34, 34, 34, 34,
+ 34, 34, 34, 34, 34, 34, 34, 34,
+ 34, 34, 34, 34, 34, 34, 34, 34,
+ 34, 34, 34, 34, 34, 34, 34, 34,
+ 34, 34, 34, 34, 34, 34, 34, 34,
+ 34, 34, 34, 34, 34, 34, 34, 34,
+ 34, 34, 34, 34, 34, 34, 34, 34,
+ 34, 34, 34, 34, 34, 34, 34, 34,
+ 34, 34, 34, 34, 34, 34, 34, 34,
+ 34, 34, 34, 34, 34, 34, 34, 34,
+ 34, 34, 34, 34, 34, 34, 34, 34,
+ 34,
+ },
+ { /* Fourth byte table 22. */
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 17,
+ 26, 26, 26, 26, 26, 26, 26, 26,
+ 26, 26, 26, 26, 26, 26, 26, 26,
+ 26, 26, 26, 26, 26, 26, 26, 26,
+ 26, 26, 26, 26, 26, 26, 26, 26,
+ 26, 26, 26, 26, 26, 26, 26, 26,
+ 26, 26, 26, 26, 26, 26, 26, 26,
+ 26, 26, 26, 26, 26, 26, 26, 26,
+ 26, 26, 26, 26, 26, 26, 26, 26,
+ 26, 26, 26, 26, 26, 26, 26, 26,
+ 26, 26, 26, 26, 26, 26, 26, 26,
+ 26, 26, 26, 26, 26, 26, 26, 26,
+ 26, 26, 26, 26, 26, 26, 26, 26,
+ 26, 26, 26, 26, 26, 26, 26, 26,
+ 26, 26, 26, 26, 26, 26, 26, 26,
+ 26, 26, 26, 26, 26, 26, 26, 26,
+ 26,
+ },
+ { /* Fourth byte table 23. */
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 25, 25, 25, 34, 34, 34,
+ 34, 34, 34, 34, 34, 34, 34, 34,
+ 34, 34, 34, 34, 34, 34, 34, 34,
+ 34, 34, 34, 34, 34, 34, 34, 34,
+ 34, 34, 34, 34, 34, 34, 34, 34,
+ 34, 34, 34, 34, 34, 34, 34, 34,
+ 34, 34, 34, 34, 34, 34, 34, 34,
+ 34, 34, 34, 34, 34, 34, 34, 34,
+ 34, 34, 34, 34, 34, 34, 34, 34,
+ 34, 34, 34, 34, 34, 34, 34, 34,
+ 34, 34, 34, 34, 34, 34, 34, 34,
+ 34, 34, 34, 34, 34, 34, 34, 34,
+ 34, 34, 34, 34, 34, 34, 34, 34,
+ 34,
+ },
+ { /* Fourth byte table 24. */
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 9, 9,
+ 9, 9, 9, 9, 9, 9, 9, 9,
+ 9, 9, 9, 9, 9, 9, 9, 9,
+ 9, 9, 9, 9, 9, 9, 9, 9,
+ 9, 9, 9, 9, 9, 9, 9, 9,
+ 9, 9, 9, 9, 9, 9, 9, 9,
+ 9, 9, 9, 9, 9, 9, 9, 9,
+ 9, 9, 9, 9, 9, 9, 9, 9,
+ 9, 9, 9, 9, 9, 9, 9, 9,
+ 9, 9, 9, 9, 9, 9, 9, 9,
+ 9, 9, 9, 9, 9, 9, 9, 9,
+ 9, 9, 9, 9, 9, 9, 9, 9,
+ 9,
+ },
+ { /* Fourth byte table 25. */
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 9, 9,
+ 18, 18, 27, 27, 36, 36, 45, 45,
+ 45, 45, 54, 54, 54, 54, 54, 54,
+ 54, 54, 54, 54, 54, 54, 54, 54,
+ 54, 54, 54, 54, 54, 54, 54, 54,
+ 54, 54, 54, 54, 54, 54, 54, 54,
+ 54, 54, 54, 54, 54, 54, 54, 54,
+ 54, 54, 54, 63, 63, 72, 72, 81,
+ 90, 90, 90, 90, 90, 90, 90, 90,
+ 90, 90, 90, 90, 90, 90, 90, 90,
+ 90, 90, 90, 90, 90, 90, 90, 90,
+ 90, 90, 90, 90, 90, 90, 90, 90,
+ 90, 90, 90, 90, 90, 90, 90, 90,
+ 90, 90, 90, 90, 90, 90, 90, 90,
+ 90, 90, 90, 90, 90, 90, 90, 90,
+ 90, 90, 90, 90, 90, 90, 90, 90,
+ 90,
+ },
+ { /* Fourth byte table 26. */
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 9, 9, 9, 9, 9,
+ 9, 9, 9, 9, 9, 9, 9, 9,
+ 9, 9, 9, 9, 9, 9, 9, 9,
+ 9, 9, 9, 9, 9, 9, 9, 9,
+ 9, 9, 9, 9, 9, 9, 9, 9,
+ 9, 9, 9, 9, 9, 9, 9, 9,
+ 9, 9, 9, 9, 9, 9, 9, 9,
+ 9, 9, 9, 9, 9, 9, 9, 9,
+ 9, 9, 9, 9, 9, 9, 9, 9,
+ 9, 9, 9, 9, 9, 9, 9, 9,
+ 9, 9, 9, 9, 9, 9, 9, 9,
+ 9, 9, 9, 9, 9, 9, 9, 9,
+ 9, 9, 9, 9, 9, 9, 9, 9,
+ 9, 9, 9, 9, 9, 9, 9, 9,
+ 9, 9, 9, 9, 9, 9, 9, 9,
+ 9, 9, 9, 9, 9, 9, 9, 9,
+ 9,
+ },
+ { /* Fourth byte table 27. */
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 8,
+ 16, 16, 16, 16, 16, 16, 16, 16,
+ 16, 16, 16, 16, 16, 16, 16, 16,
+ 16, 16, 16, 16, 16, 16, 16, 16,
+ 16, 16, 16, 16, 16, 16, 16, 16,
+ 16, 16, 16, 16, 16, 16, 16, 16,
+ 16, 16, 16, 16, 16, 16, 16, 16,
+ 16, 16, 16, 16, 16, 16, 16, 16,
+ 16, 16, 16, 16, 16, 16, 16, 16,
+ 16, 16, 16, 16, 16, 16, 16, 16,
+ 16,
+ },
+ { /* Fourth byte table 28. */
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 8, 16, 16, 16, 16,
+ 16, 16, 16, 24, 32, 32, 32, 32,
+ 32, 32, 32, 32, 32, 32, 32, 32,
+ 32, 32, 32, 32, 32, 32, 32, 32,
+ 32, 32, 32, 32, 32, 32, 32, 32,
+ 32, 32, 32, 32, 32, 32, 32, 32,
+ 32, 32, 32, 32, 32, 32, 32, 32,
+ 32, 32, 32, 32, 32, 32, 32, 32,
+ 32, 32, 32, 32, 32, 32, 32, 32,
+ 32, 32, 32, 32, 32, 32, 32, 32,
+ 32, 32, 32, 32, 32, 32, 32, 32,
+ 32, 32, 32, 32, 32, 32, 32, 32,
+ 32, 32, 32, 32, 32, 32, 32, 32,
+ 32,
+ },
+ { /* Fourth byte table 29. */
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 15, 30, 30, 30, 30, 30, 30,
+ 30, 30, 30, 30, 30, 30, 30, 30,
+ 30, 30, 30, 30, 30, 30, 30, 30,
+ 30, 38, 46, 46, 46, 46, 46, 46,
+ 46, 46, 46, 46, 46, 46, 46, 46,
+ 46, 46, 46, 46, 46, 46, 46, 46,
+ 46, 46, 46, 46, 46, 46, 46, 46,
+ 46, 46, 46, 46, 46, 46, 46, 46,
+ 46, 46, 46, 46, 46, 46, 46, 46,
+ 46, 46, 46, 46, 46, 46, 46, 46,
+ 46, 46, 46, 46, 46, 46, 46, 46,
+ 46, 46, 46, 46, 46, 46, 46, 46,
+ 46,
+ },
+ { /* Fourth byte table 30. */
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 8, 16, 16,
+ 16, 16, 16, 16, 16, 16, 16, 16,
+ 16, 16, 16, 16, 16, 16, 16, 16,
+ 16, 16, 16, 16, 16, 16, 16, 16,
+ 16, 16, 16, 16, 16, 16, 16, 16,
+ 16, 16, 16, 16, 16, 16, 16, 16,
+ 16, 16, 16, 16, 16, 16, 16, 16,
+ 16, 16, 16, 16, 16, 16, 16, 16,
+ 16, 16, 16, 16, 16, 16, 16, 16,
+ 16, 16, 16, 16, 16, 16, 16, 16,
+ 16, 16, 16, 16, 16, 16, 16, 16,
+ 16, 16, 16, 16, 16, 16, 16, 16,
+ 16, 16, 16, 16, 16, 16, 16, 16,
+ 16, 16, 16, 16, 16, 16, 16, 16,
+ 16, 16, 16, 16, 16, 16, 16, 16,
+ 16,
+ },
+ { /* Fourth byte table 31. */
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 8,
+ 8, 8, 8, 8, 8, 8, 8, 8,
+ 30, 30, 30, 30, 30, 30, 30, 30,
+ 30, 30, 30, 30, 30, 30, 30, 30,
+ 30, 30, 30, 30, 30, 30, 30, 30,
+ 30, 30, 30, 30, 30, 30, 30, 30,
+ 30, 30, 30, 30, 30, 30, 30, 30,
+ 30, 30, 30, 30, 30, 30, 30, 30,
+ 30, 30, 30, 30, 30, 30, 30, 30,
+ 30, 30, 30, 30, 30, 30, 30, 30,
+ 30,
+ },
+ { /* Fourth byte table 32. */
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 8,
+ 8, 8, 8, 8, 8, 8, 8, 8,
+ 8, 8, 8, 8, 8, 8, 8, 8,
+ 8, 8, 8, 8, 8, 8, 8, 8,
+ 8, 8, 8, 8, 8, 8, 8, 8,
+ 8, 8, 8, 8, 8, 8, 8, 8,
+ 8, 8, 8, 8, 8, 8, 8, 16,
+ 16, 16, 16, 16, 16, 16, 16, 38,
+ 38, 38, 38, 38, 38, 38, 38, 38,
+ 38, 38, 38, 38, 38, 38, 38, 38,
+ 38, 38, 38, 38, 38, 38, 38, 38,
+ 38, 38, 38, 38, 38, 38, 38, 38,
+ 38, 38, 38, 38, 38, 38, 38, 38,
+ 38, 38, 38, 38, 38, 38, 38, 38,
+ 38, 38, 38, 38, 38, 38, 38, 38,
+ 38, 38, 38, 38, 38, 38, 38, 38,
+ 38,
+ },
+ { /* Fourth byte table 33. */
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 8, 8, 16, 16, 24, 24, 24,
+ 24, 24, 24, 24, 24, 24, 24, 24,
+ 24, 24, 24, 24, 24, 24, 24, 24,
+ 24, 24, 24, 24, 24, 24, 24, 24,
+ 24, 24, 24, 24, 24, 24, 24, 24,
+ 24, 24, 24, 24, 24, 24, 24, 24,
+ 24, 24, 24, 24, 24, 24, 24, 24,
+ 24, 24, 24, 24, 24, 24, 24, 24,
+ 24, 24, 24, 24, 24, 24, 24, 24,
+ 24, 24, 24, 24, 24, 24, 24, 24,
+ 24, 24, 24, 24, 24, 24, 24, 24,
+ 24, 24, 24, 24, 24, 24, 24, 24,
+ 24, 24, 24, 24, 24, 24, 24, 24,
+ 24, 24, 24, 24, 24, 24, 24, 24,
+ 24,
+ },
+ { /* Fourth byte table 34. */
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 8, 8, 16, 16, 24, 24, 24,
+ 24, 24, 24, 24, 24, 24, 24, 24,
+ 24, 24, 24, 24, 24, 24, 24, 24,
+ 24, 24, 24, 24, 24, 24, 24, 24,
+ 24, 24, 24, 24, 24, 24, 24, 24,
+ 24, 24, 24, 24, 24, 24, 24, 24,
+ 24, 24, 24, 24, 24, 24, 24, 24,
+ 24, 24, 24, 24, 24, 24, 24, 24,
+ 24, 24, 24, 24, 24, 24, 24, 24,
+ 24, 24, 24, 24, 24, 24, 24, 24,
+ 24, 24, 24, 24, 24, 24, 24, 24,
+ 24, 24, 24, 24, 24, 24, 24, 24,
+ 24, 24, 24, 24, 24, 24, 24, 24,
+ 24, 24, 24, 24, 24, 24, 24, 24,
+ 24,
+ },
+ { /* Fourth byte table 35. */
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 8, 8, 8, 8,
+ 8, 16, 16, 16, 24, 24, 24, 24,
+ 24, 24, 24, 24, 24, 24, 24, 24,
+ 24, 24, 24, 24, 24, 24, 24, 24,
+ 24, 24, 24, 24, 32, 32, 40, 40,
+ 40, 40, 40, 40, 40, 40, 40, 40,
+ 40, 40, 40, 40, 40, 40, 40, 40,
+ 40, 40, 40, 40, 40, 48, 48, 48,
+ 48, 48, 48, 48, 48, 48, 48, 48,
+ 48, 48, 48, 48, 48, 48, 48, 48,
+ 48, 48, 48, 48, 48, 48, 48, 48,
+ 48, 48, 48, 48, 48, 48, 48, 48,
+ 48, 48, 48, 48, 48, 48, 48, 48,
+ 48, 48, 48, 48, 48, 48, 48, 48,
+ 48, 48, 48, 48, 48, 48, 48, 48,
+ 48, 48, 48, 48, 48, 48, 48, 48,
+ 48,
+ },
+ { /* Fourth byte table 36. */
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 8, 8, 16, 16,
+ 16, 24, 24, 24, 24, 24, 32, 32,
+ 32, 32, 32, 32, 32, 32, 32, 32,
+ 32, 32, 32, 32, 32, 32, 32, 32,
+ 32, 32, 40, 40, 40, 48, 56, 56,
+ 56, 56, 56, 56, 56, 56, 56, 56,
+ 56, 56, 56, 64, 72, 72, 72, 80,
+ 88, 88, 88, 96, 104, 112, 120, 120,
+ 120, 120, 120, 120, 120, 120, 120, 120,
+ 120, 120, 120, 120, 120, 120, 120, 120,
+ 120, 120, 120, 120, 120, 120, 120, 120,
+ 120, 120, 120, 120, 120, 120, 120, 120,
+ 120, 120, 120, 120, 120, 120, 120, 120,
+ 120, 120, 120, 120, 120, 120, 120, 120,
+ 120, 120, 120, 120, 120, 120, 120, 120,
+ 120, 120, 120, 120, 120, 120, 120, 120,
+ 120,
+ },
+ { /* Fourth byte table 37. */
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 8, 16, 16, 16, 24,
+ 32, 32, 32, 32, 32, 32, 32, 32,
+ 32, 32, 40, 48, 48, 48, 48, 48,
+ 48, 48, 48, 48, 48, 48, 48, 48,
+ 48, 48, 48, 56, 56, 56, 56, 56,
+ 56, 64, 72, 72, 80, 80, 80, 80,
+ 80, 80, 80, 88, 96, 104, 112, 112,
+ 112, 112, 112, 112, 112, 112, 112, 112,
+ 112, 112, 112, 112, 112, 112, 112, 112,
+ 112, 112, 112, 112, 112, 112, 112, 112,
+ 112, 112, 112, 112, 112, 112, 112, 112,
+ 112, 112, 112, 112, 112, 112, 112, 112,
+ 112, 112, 112, 112, 112, 112, 112, 112,
+ 112, 112, 112, 112, 112, 112, 112, 112,
+ 112, 112, 112, 112, 112, 112, 112, 112,
+ 112, 112, 112, 112, 112, 112, 112, 112,
+ 112,
+ },
+ { /* Fourth byte table 38. */
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 9,
+ 9, 9, 9, 9, 18, 18, 27, 27,
+ 36, 36, 45, 45, 54, 54, 63, 63,
+ 72, 72, 81, 81, 90, 90, 99, 99,
+ 108, 108, 117, 117, 117, 126, 126, 135,
+ 135, 144, 144, 144, 144, 144, 144, 144,
+ 161, 161, 161, 178, 178, 178, 195, 195,
+ 195, 212, 212, 212, 229, 229, 229, 229,
+ 229, 229, 229, 229, 229, 229, 229, 229,
+ 229, 229, 229, 229, 229, 229, 229, 229,
+ 229, 229, 229, 229, 229, 229, 229, 229,
+ 229, 229, 229, 229, 229, 229, 229, 229,
+ 229, 229, 229, 229, 229, 229, 229, 229,
+ 229, 229, 229, 229, 229, 229, 229, 229,
+ 229, 229, 229, 229, 229, 229, 229, 229,
+ 229, 229, 229, 229, 229, 229, 229, 229,
+ 229,
+ },
+ { /* Fourth byte table 39. */
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 9, 9,
+ 9, 9, 9, 9, 9, 9, 9, 18,
+ 18, 18, 18, 18, 27, 27, 36, 36,
+ 45, 45, 54, 54, 63, 63, 72, 72,
+ 81, 81, 90, 90, 99, 99, 108, 108,
+ 117, 117, 117, 117, 117, 117, 117, 117,
+ 117, 117, 117, 117, 117, 117, 117, 117,
+ 117, 117, 117, 117, 117, 117, 117, 117,
+ 117, 117, 117, 117, 117, 117, 117, 117,
+ 117, 117, 117, 117, 117, 117, 117, 117,
+ 117, 117, 117, 117, 117, 117, 117, 117,
+ 117, 117, 117, 117, 117, 117, 117, 117,
+ 117, 117, 117, 117, 117, 117, 117, 117,
+ 117,
+ },
+ { /* Fourth byte table 40. */
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 9, 9, 9, 18, 18, 27,
+ 27, 36, 36, 36, 36, 36, 36, 36,
+ 53, 53, 53, 70, 70, 70, 87, 87,
+ 87, 104, 104, 104, 121, 121, 121, 121,
+ 121, 121, 121, 121, 121, 121, 121, 121,
+ 121, 121, 121, 121, 121, 121, 121, 121,
+ 130, 139, 148, 157, 157, 157, 157, 157,
+ 157, 157, 157, 157, 157, 157, 166, 166,
+ 166, 166, 166, 166, 166, 166, 166, 166,
+ 166, 166, 166, 166, 166, 166, 166, 166,
+ 166, 166, 166, 166, 166, 166, 166, 166,
+ 166, 166, 166, 166, 166, 166, 166, 166,
+ 166, 166, 166, 166, 166, 166, 166, 166,
+ 166, 166, 166, 166, 166, 166, 166, 166,
+ 166, 166, 166, 166, 166, 166, 166, 166,
+ 166, 166, 166, 166, 166, 166, 166, 166,
+ 166,
+ },
+ },
+};
+
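The byte-wide offset tables above, and the uint16_t variant declared just below, are cumulative-offset arrays: each "fourth byte table" holds 257 entries so that, for a fourth byte b, entry b gives the start of that byte's run in the final data table and entry b+1 minus entry b gives the run's length. The 16-bit variant appears to exist for tables whose offsets outgrow a uchar_t (note values such as 2470 below), and the leading [2] dimension plausibly selects between the two Unicode versions this file supports. A minimal sketch of that lookup follows; the names b4_run and start are illustrative only, not taken from the patch:

#include <stddef.h>
#include <stdint.h>

/*
 * Cumulative-offset lookup sketch: 257 entries cover byte values
 * 0..255 plus one sentinel, so two adjacent reads yield both the
 * start of a run and its length.
 */
static size_t
b4_run(const uint16_t tbl[257], uint8_t b, uint16_t *start)
{
	*start = tbl[b];
	return ((size_t)(tbl[b + 1] - tbl[b]));
}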
+static const uint16_t u8_composition_b4_16bit_tbl[2][5][257] = {
+ {
+ { /* Fourth byte 16-bit table 0. */
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 8, 16, 24,
+ 24, 24, 124, 146, 177, 219, 327, 335,
+ 379, 427, 521, 528, 562, 602, 624, 683,
+ 782, 797, 797, 849, 894, 941, 1061, 1076,
+ 1118, 1133, 1193, 1233, 1233, 1233, 1233, 1233,
+ 1233, 1233, 1333, 1355, 1386, 1428, 1536, 1544,
+ 1588, 1643, 1731, 1744, 1778, 1818, 1840, 1899,
+ 1998, 2013, 2013, 2065, 2110, 2164, 2284, 2299,
+ 2348, 2363, 2430, 2470, 2470, 2470, 2470, 2470,
+ 2470, 2470, 2470, 2470, 2470, 2470, 2470, 2470,
+ 2470, 2470, 2470, 2470, 2470, 2470, 2470, 2470,
+ 2470, 2470, 2470, 2470, 2470, 2470, 2470, 2470,
+ 2470, 2470, 2470, 2470, 2470, 2470, 2470, 2470,
+ 2470, 2470, 2470, 2470, 2470, 2470, 2470, 2470,
+ 2470, 2470, 2470, 2470, 2470, 2470, 2470, 2470,
+ 2470, 2470, 2470, 2470, 2470, 2470, 2470, 2470,
+ 2470, 2470, 2470, 2470, 2470, 2470, 2470, 2470,
+ 2470, 2470, 2470, 2470, 2470, 2470, 2470, 2470,
+ 2470, 2470, 2470, 2470, 2470, 2470, 2470, 2470,
+ 2470, 2470, 2470, 2470, 2470, 2470, 2470, 2470,
+ 2470, 2470, 2470, 2470, 2470, 2470, 2470, 2470,
+ 2470, 2470, 2470, 2470, 2470, 2470, 2470, 2470,
+ 2470, 2470, 2470, 2470, 2470, 2470, 2470, 2470,
+ 2470, 2470, 2470, 2470, 2470, 2470, 2470, 2470,
+ 2470, 2470, 2470, 2470, 2470, 2470, 2470, 2470,
+ 2470,
+ },
+ { /* Fourth byte 16-bit table 1. */
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 29, 29, 36, 43, 56,
+ 64, 64, 64, 93, 93, 93, 93, 93,
+ 101, 101, 101, 101, 101, 130, 151, 158,
+ 158, 165, 165, 165, 165, 190, 190, 190,
+ 190, 190, 190, 219, 219, 226, 233, 246,
+ 254, 254, 254, 283, 283, 283, 283, 283,
+ 291, 291, 291, 291, 291, 320, 341, 348,
+ 348, 355, 355, 355, 355, 380, 380, 380,
+ 380, 380, 380, 380, 380, 380, 380, 380,
+ 380, 380, 380, 380, 380, 380, 380, 380,
+ 380, 380, 380, 380, 380, 380, 380, 380,
+ 380, 380, 380, 380, 380, 380, 380, 380,
+ 380, 380, 380, 380, 380, 380, 380, 380,
+ 380, 380, 380, 380, 380, 380, 380, 380,
+ 380, 380, 380, 380, 380, 380, 380, 380,
+ 380, 380, 380, 380, 380, 380, 380, 380,
+ 380,
+ },
+ { /* Fourth byte 16-bit table 2. */
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 49, 49, 49, 49, 77, 77,
+ 112, 112, 160, 160, 160, 160, 160, 160,
+ 188, 188, 196, 196, 196, 196, 237, 237,
+ 237, 237, 272, 272, 272, 280, 280, 288,
+ 288, 288, 344, 344, 344, 344, 372, 372,
+ 414, 414, 469, 469, 469, 469, 469, 469,
+ 497, 497, 497, 497, 497, 497, 497, 497,
+ 497, 497, 497, 497, 497, 497, 497, 497,
+ 497, 497, 497, 497, 497, 497, 497, 497,
+ 497, 497, 497, 497, 497, 497, 497, 497,
+ 497, 497, 497, 497, 497, 497, 497, 497,
+ 497, 497, 497, 497, 497, 497, 497, 497,
+ 497, 497, 497, 497, 497, 497, 497, 497,
+ 497, 497, 497, 497, 497, 497, 497, 497,
+ 497,
+ },
+ { /* Fourth byte 16-bit table 3. */
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 29, 58, 66, 74, 82, 90, 98,
+ 106, 135, 164, 172, 180, 188, 196, 204,
+ 212, 227, 242, 242, 242, 242, 242, 242,
+ 242, 257, 272, 272, 272, 272, 272, 272,
+ 272, 301, 330, 338, 346, 354, 362, 370,
+ 378, 407, 436, 444, 452, 460, 468, 476,
+ 484, 506, 528, 528, 528, 528, 528, 528,
+ 528, 550, 572, 572, 572, 572, 572, 572,
+ 572, 572, 572, 572, 572, 572, 572, 572,
+ 572, 572, 572, 572, 572, 572, 572, 572,
+ 572, 572, 572, 572, 572, 572, 572, 572,
+ 572, 572, 572, 572, 572, 572, 572, 572,
+ 572, 572, 572, 572, 572, 572, 572, 572,
+ 572, 572, 572, 572, 572, 572, 572, 572,
+ 572, 572, 572, 572, 572, 572, 572, 572,
+ 572, 572, 572, 572, 572, 572, 572, 572,
+ 572,
+ },
+ { /* Fourth byte 16-bit table 4. */
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 15, 30, 30, 30, 30, 30, 30,
+ 30, 45, 60, 60, 60, 60, 60, 60,
+ 60, 82, 104, 104, 104, 104, 104, 104,
+ 104, 104, 126, 126, 126, 126, 126, 126,
+ 126, 155, 184, 192, 200, 208, 216, 224,
+ 232, 261, 290, 298, 306, 314, 322, 330,
+ 338, 346, 346, 346, 346, 354, 354, 354,
+ 354, 354, 354, 354, 354, 362, 362, 362,
+ 362, 362, 362, 362, 362, 362, 362, 362,
+ 362, 362, 362, 362, 362, 362, 362, 362,
+ 362, 362, 362, 362, 362, 362, 362, 362,
+ 362, 362, 362, 362, 362, 362, 362, 362,
+ 362, 362, 362, 362, 362, 362, 362, 362,
+ 362, 362, 362, 362, 362, 362, 362, 362,
+ 362, 362, 362, 362, 362, 362, 362, 362,
+ 362, 362, 362, 362, 362, 362, 362, 362,
+ 362,
+ },
+ },
+ {
+ { /* Fourth byte 16-bit table 0. */
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 8, 16, 24,
+ 24, 24, 124, 146, 177, 219, 327, 335,
+ 379, 427, 521, 528, 562, 602, 624, 683,
+ 782, 797, 797, 849, 894, 941, 1061, 1076,
+ 1118, 1133, 1193, 1233, 1233, 1233, 1233, 1233,
+ 1233, 1233, 1333, 1355, 1386, 1428, 1536, 1544,
+ 1588, 1643, 1731, 1744, 1778, 1818, 1840, 1899,
+ 1998, 2013, 2013, 2065, 2110, 2164, 2284, 2299,
+ 2348, 2363, 2430, 2470, 2470, 2470, 2470, 2470,
+ 2470, 2470, 2470, 2470, 2470, 2470, 2470, 2470,
+ 2470, 2470, 2470, 2470, 2470, 2470, 2470, 2470,
+ 2470, 2470, 2470, 2470, 2470, 2470, 2470, 2470,
+ 2470, 2470, 2470, 2470, 2470, 2470, 2470, 2470,
+ 2470, 2470, 2470, 2470, 2470, 2470, 2470, 2470,
+ 2470, 2470, 2470, 2470, 2470, 2470, 2470, 2470,
+ 2470, 2470, 2470, 2470, 2470, 2470, 2470, 2470,
+ 2470, 2470, 2470, 2470, 2470, 2470, 2470, 2470,
+ 2470, 2470, 2470, 2470, 2470, 2470, 2470, 2470,
+ 2470, 2470, 2470, 2470, 2470, 2470, 2470, 2470,
+ 2470, 2470, 2470, 2470, 2470, 2470, 2470, 2470,
+ 2470, 2470, 2470, 2470, 2470, 2470, 2470, 2470,
+ 2470, 2470, 2470, 2470, 2470, 2470, 2470, 2470,
+ 2470, 2470, 2470, 2470, 2470, 2470, 2470, 2470,
+ 2470, 2470, 2470, 2470, 2470, 2470, 2470, 2470,
+ 2470, 2470, 2470, 2470, 2470, 2470, 2470, 2470,
+ 2470,
+ },
+ { /* Fourth byte 16-bit table 1. */
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 29, 29, 36, 43, 56,
+ 64, 64, 64, 93, 93, 93, 93, 93,
+ 101, 101, 101, 101, 101, 130, 151, 158,
+ 158, 165, 165, 165, 165, 190, 190, 190,
+ 190, 190, 190, 219, 219, 226, 233, 246,
+ 254, 254, 254, 283, 283, 283, 283, 283,
+ 291, 291, 291, 291, 291, 320, 341, 348,
+ 348, 355, 355, 355, 355, 380, 380, 380,
+ 380, 380, 380, 380, 380, 380, 380, 380,
+ 380, 380, 380, 380, 380, 380, 380, 380,
+ 380, 380, 380, 380, 380, 380, 380, 380,
+ 380, 380, 380, 380, 380, 380, 380, 380,
+ 380, 380, 380, 380, 380, 380, 380, 380,
+ 380, 380, 380, 380, 380, 380, 380, 380,
+ 380, 380, 380, 380, 380, 380, 380, 380,
+ 380, 380, 380, 380, 380, 380, 380, 380,
+ 380,
+ },
+ { /* Fourth byte 16-bit table 2. */
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 49, 49, 49, 49, 77, 77,
+ 112, 112, 160, 160, 160, 160, 160, 160,
+ 188, 188, 196, 196, 196, 196, 237, 237,
+ 237, 237, 272, 272, 272, 280, 280, 288,
+ 288, 288, 344, 344, 344, 344, 372, 372,
+ 414, 414, 469, 469, 469, 469, 469, 469,
+ 497, 497, 497, 497, 497, 497, 497, 497,
+ 497, 497, 497, 497, 497, 497, 497, 497,
+ 497, 497, 497, 497, 497, 497, 497, 497,
+ 497, 497, 497, 497, 497, 497, 497, 497,
+ 497, 497, 497, 497, 497, 497, 497, 497,
+ 497, 497, 497, 497, 497, 497, 497, 497,
+ 497, 497, 497, 497, 497, 497, 497, 497,
+ 497, 497, 497, 497, 497, 497, 497, 497,
+ 497,
+ },
+ { /* Fourth byte 16-bit table 3. */
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 29, 58, 66, 74, 82, 90, 98,
+ 106, 135, 164, 172, 180, 188, 196, 204,
+ 212, 227, 242, 242, 242, 242, 242, 242,
+ 242, 257, 272, 272, 272, 272, 272, 272,
+ 272, 301, 330, 338, 346, 354, 362, 370,
+ 378, 407, 436, 444, 452, 460, 468, 476,
+ 484, 506, 528, 528, 528, 528, 528, 528,
+ 528, 550, 572, 572, 572, 572, 572, 572,
+ 572, 572, 572, 572, 572, 572, 572, 572,
+ 572, 572, 572, 572, 572, 572, 572, 572,
+ 572, 572, 572, 572, 572, 572, 572, 572,
+ 572, 572, 572, 572, 572, 572, 572, 572,
+ 572, 572, 572, 572, 572, 572, 572, 572,
+ 572, 572, 572, 572, 572, 572, 572, 572,
+ 572, 572, 572, 572, 572, 572, 572, 572,
+ 572, 572, 572, 572, 572, 572, 572, 572,
+ 572,
+ },
+ { /* Fourth byte 16-bit table 4. */
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 15, 30, 30, 30, 30, 30, 30,
+ 30, 45, 60, 60, 60, 60, 60, 60,
+ 60, 82, 104, 104, 104, 104, 104, 104,
+ 104, 104, 126, 126, 126, 126, 126, 126,
+ 126, 155, 184, 192, 200, 208, 216, 224,
+ 232, 261, 290, 298, 306, 314, 322, 330,
+ 338, 346, 346, 346, 346, 354, 354, 354,
+ 354, 354, 354, 354, 354, 362, 362, 362,
+ 362, 362, 362, 362, 362, 362, 362, 362,
+ 362, 362, 362, 362, 362, 362, 362, 362,
+ 362, 362, 362, 362, 362, 362, 362, 362,
+ 362, 362, 362, 362, 362, 362, 362, 362,
+ 362, 362, 362, 362, 362, 362, 362, 362,
+ 362, 362, 362, 362, 362, 362, 362, 362,
+ 362, 362, 362, 362, 362, 362, 362, 362,
+ 362, 362, 362, 362, 362, 362, 362, 362,
+ 362,
+ },
+ },
+};
+
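Judging from the data that follows, each run in u8_composition_final_tbl begins with a count byte (for example, the 0x10 run below carries sixteen records), followed by that many records pairing a combining sequence with the fully composed UTF-8 sequence it produces, each field terminated by FIL_, the table element filler defined earlier in this file. A minimal decoding sketch under those assumptions; walk_run, skip_field, and the 0xF7 filler value are illustrative, not taken from the patch:

#include <stdint.h>

#define	FILLER	0xF7	/* assumed value of the FIL_ table filler */

/* Advance past one FILLER-terminated field. */
static const uint8_t *
skip_field(const uint8_t *p)
{
	while (*p != FILLER)
		p++;
	return (p + 1);
}

/*
 * Walk one run: p points at the count byte; each record is a
 * FILLER-terminated combining sequence followed by a
 * FILLER-terminated composed sequence.
 */
static void
walk_run(const uint8_t *p,
    void (*cb)(const uint8_t *mark, const uint8_t *composed))
{
	int n = *p++;

	while (n-- > 0) {
		const uint8_t *mark = p;
		const uint8_t *composed = skip_field(p);

		cb(mark, composed);
		p = skip_field(composed);
	}
}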
+static const uchar_t u8_composition_final_tbl[2][6623] = {
+ {
+ 0x01, 0xCC, 0xB8, FIL_, 0xE2, 0x89, 0xAE, FIL_,
+ 0x01, 0xCC, 0xB8, FIL_, 0xE2, 0x89, 0xA0, FIL_,
+ 0x01, 0xCC, 0xB8, FIL_, 0xE2, 0x89, 0xAF, FIL_,
+ 0x10, 0xCC, 0x86, FIL_, 0xC4, 0x82, FIL_, 0xCC,
+ 0x87, FIL_, 0xC8, 0xA6, FIL_, 0xCC, 0x8F, FIL_,
+ 0xC8, 0x80, FIL_, 0xCC, 0x82, FIL_, 0xC3, 0x82,
+ FIL_, 0xCC, 0x81, FIL_, 0xC3, 0x81, FIL_, 0xCC,
+ 0x80, FIL_, 0xC3, 0x80, FIL_, 0xCC, 0x83, FIL_,
+ 0xC3, 0x83, FIL_, 0xCC, 0xA3, FIL_, 0xE1, 0xBA,
+ 0xA0, FIL_, 0xCC, 0xA5, FIL_, 0xE1, 0xB8, 0x80,
+ FIL_, 0xCC, 0x91, FIL_, 0xC8, 0x82, FIL_, 0xCC,
+ 0x84, FIL_, 0xC4, 0x80, FIL_, 0xCC, 0x88, FIL_,
+ 0xC3, 0x84, FIL_, 0xCC, 0x8A, FIL_, 0xC3, 0x85,
+ FIL_, 0xCC, 0xA8, FIL_, 0xC4, 0x84, FIL_, 0xCC,
+ 0x89, FIL_, 0xE1, 0xBA, 0xA2, FIL_, 0xCC, 0x8C,
+ FIL_, 0xC7, 0x8D, FIL_, 0x03, 0xCC, 0x87, FIL_,
+ 0xE1, 0xB8, 0x82, FIL_, 0xCC, 0xB1, FIL_, 0xE1,
+ 0xB8, 0x86, FIL_, 0xCC, 0xA3, FIL_, 0xE1, 0xB8,
+ 0x84, FIL_, 0x05, 0xCC, 0xA7, FIL_, 0xC3, 0x87,
+ FIL_, 0xCC, 0x81, FIL_, 0xC4, 0x86, FIL_, 0xCC,
+ 0x8C, FIL_, 0xC4, 0x8C, FIL_, 0xCC, 0x87, FIL_,
+ 0xC4, 0x8A, FIL_, 0xCC, 0x82, FIL_, 0xC4, 0x88,
+ FIL_, 0x06, 0xCC, 0xB1, FIL_, 0xE1, 0xB8, 0x8E,
+ FIL_, 0xCC, 0xA7, FIL_, 0xE1, 0xB8, 0x90, FIL_,
+ 0xCC, 0xAD, FIL_, 0xE1, 0xB8, 0x92, FIL_, 0xCC,
+ 0x87, FIL_, 0xE1, 0xB8, 0x8A, FIL_, 0xCC, 0x8C,
+ FIL_, 0xC4, 0x8E, FIL_, 0xCC, 0xA3, FIL_, 0xE1,
+ 0xB8, 0x8C, FIL_, 0x11, 0xCC, 0x80, FIL_, 0xC3,
+ 0x88, FIL_, 0xCC, 0x81, FIL_, 0xC3, 0x89, FIL_,
+ 0xCC, 0x82, FIL_, 0xC3, 0x8A, FIL_, 0xCC, 0x88,
+ FIL_, 0xC3, 0x8B, FIL_, 0xCC, 0xA7, FIL_, 0xC8,
+ 0xA8, FIL_, 0xCC, 0x91, FIL_, 0xC8, 0x86, FIL_,
+ 0xCC, 0x8F, FIL_, 0xC8, 0x84, FIL_, 0xCC, 0x89,
+ FIL_, 0xE1, 0xBA, 0xBA, FIL_, 0xCC, 0xB0, FIL_,
+ 0xE1, 0xB8, 0x9A, FIL_, 0xCC, 0xAD, FIL_, 0xE1,
+ 0xB8, 0x98, FIL_, 0xCC, 0x83, FIL_, 0xE1, 0xBA,
+ 0xBC, FIL_, 0xCC, 0xA3, FIL_, 0xE1, 0xBA, 0xB8,
+ FIL_, 0xCC, 0x84, FIL_, 0xC4, 0x92, FIL_, 0xCC,
+ 0x86, FIL_, 0xC4, 0x94, FIL_, 0xCC, 0x87, FIL_,
+ 0xC4, 0x96, FIL_, 0xCC, 0xA8, FIL_, 0xC4, 0x98,
+ FIL_, 0xCC, 0x8C, FIL_, 0xC4, 0x9A, FIL_, 0x01,
+ 0xCC, 0x87, FIL_, 0xE1, 0xB8, 0x9E, FIL_, 0x07,
+ 0xCC, 0x8C, FIL_, 0xC7, 0xA6, FIL_, 0xCC, 0x87,
+ FIL_, 0xC4, 0xA0, FIL_, 0xCC, 0x84, FIL_, 0xE1,
+ 0xB8, 0xA0, FIL_, 0xCC, 0x82, FIL_, 0xC4, 0x9C,
+ FIL_, 0xCC, 0x81, FIL_, 0xC7, 0xB4, FIL_, 0xCC,
+ 0xA7, FIL_, 0xC4, 0xA2, FIL_, 0xCC, 0x86, FIL_,
+ 0xC4, 0x9E, FIL_, 0x07, 0xCC, 0xAE, FIL_, 0xE1,
+ 0xB8, 0xAA, FIL_, 0xCC, 0x87, FIL_, 0xE1, 0xB8,
+ 0xA2, FIL_, 0xCC, 0x88, FIL_, 0xE1, 0xB8, 0xA6,
+ FIL_, 0xCC, 0xA3, FIL_, 0xE1, 0xB8, 0xA4, FIL_,
+ 0xCC, 0xA7, FIL_, 0xE1, 0xB8, 0xA8, FIL_, 0xCC,
+ 0x8C, FIL_, 0xC8, 0x9E, FIL_, 0xCC, 0x82, FIL_,
+ 0xC4, 0xA4, FIL_, 0x0F, 0xCC, 0x84, FIL_, 0xC4,
+ 0xAA, FIL_, 0xCC, 0x80, FIL_, 0xC3, 0x8C, FIL_,
+ 0xCC, 0xA8, FIL_, 0xC4, 0xAE, FIL_, 0xCC, 0x83,
+ FIL_, 0xC4, 0xA8, FIL_, 0xCC, 0x88, FIL_, 0xC3,
+ 0x8F, FIL_, 0xCC, 0x81, FIL_, 0xC3, 0x8D, FIL_,
+ 0xCC, 0x8F, FIL_, 0xC8, 0x88, FIL_, 0xCC, 0x86,
+ FIL_, 0xC4, 0xAC, FIL_, 0xCC, 0x91, FIL_, 0xC8,
+ 0x8A, FIL_, 0xCC, 0x8C, FIL_, 0xC7, 0x8F, FIL_,
+ 0xCC, 0x89, FIL_, 0xE1, 0xBB, 0x88, FIL_, 0xCC,
+ 0x87, FIL_, 0xC4, 0xB0, FIL_, 0xCC, 0xA3, FIL_,
+ 0xE1, 0xBB, 0x8A, FIL_, 0xCC, 0xB0, FIL_, 0xE1,
+ 0xB8, 0xAC, FIL_, 0xCC, 0x82, FIL_, 0xC3, 0x8E,
+ FIL_, 0x01, 0xCC, 0x82, FIL_, 0xC4, 0xB4, FIL_,
+ 0x05, 0xCC, 0x8C, FIL_, 0xC7, 0xA8, FIL_, 0xCC,
+ 0xB1, FIL_, 0xE1, 0xB8, 0xB4, FIL_, 0xCC, 0x81,
+ FIL_, 0xE1, 0xB8, 0xB0, FIL_, 0xCC, 0xA7, FIL_,
+ 0xC4, 0xB6, FIL_, 0xCC, 0xA3, FIL_, 0xE1, 0xB8,
+ 0xB2, FIL_, 0x06, 0xCC, 0xA7, FIL_, 0xC4, 0xBB,
+ FIL_, 0xCC, 0x8C, FIL_, 0xC4, 0xBD, FIL_, 0xCC,
+ 0xB1, FIL_, 0xE1, 0xB8, 0xBA, FIL_, 0xCC, 0xA3,
+ FIL_, 0xE1, 0xB8, 0xB6, FIL_, 0xCC, 0xAD, FIL_,
+ 0xE1, 0xB8, 0xBC, FIL_, 0xCC, 0x81, FIL_, 0xC4,
+ 0xB9, FIL_, 0x03, 0xCC, 0x81, FIL_, 0xE1, 0xB8,
+ 0xBE, FIL_, 0xCC, 0xA3, FIL_, 0xE1, 0xB9, 0x82,
+ FIL_, 0xCC, 0x87, FIL_, 0xE1, 0xB9, 0x80, FIL_,
+ 0x09, 0xCC, 0x80, FIL_, 0xC7, 0xB8, FIL_, 0xCC,
+ 0xAD, FIL_, 0xE1, 0xB9, 0x8A, FIL_, 0xCC, 0x87,
+ FIL_, 0xE1, 0xB9, 0x84, FIL_, 0xCC, 0xB1, FIL_,
+ 0xE1, 0xB9, 0x88, FIL_, 0xCC, 0x83, FIL_, 0xC3,
+ 0x91, FIL_, 0xCC, 0xA3, FIL_, 0xE1, 0xB9, 0x86,
+ FIL_, 0xCC, 0x81, FIL_, 0xC5, 0x83, FIL_, 0xCC,
+ 0xA7, FIL_, 0xC5, 0x85, FIL_, 0xCC, 0x8C, FIL_,
+ 0xC5, 0x87, FIL_, 0x10, 0xCC, 0xA8, FIL_, 0xC7,
+ 0xAA, FIL_, 0xCC, 0x91, FIL_, 0xC8, 0x8E, FIL_,
+ 0xCC, 0x80, FIL_, 0xC3, 0x92, FIL_, 0xCC, 0x9B,
+ FIL_, 0xC6, 0xA0, FIL_, 0xCC, 0x8F, FIL_, 0xC8,
+ 0x8C, FIL_, 0xCC, 0x81, FIL_, 0xC3, 0x93, FIL_,
+ 0xCC, 0x87, FIL_, 0xC8, 0xAE, FIL_, 0xCC, 0x8C,
+ FIL_, 0xC7, 0x91, FIL_, 0xCC, 0xA3, FIL_, 0xE1,
+ 0xBB, 0x8C, FIL_, 0xCC, 0x82, FIL_, 0xC3, 0x94,
+ FIL_, 0xCC, 0x84, FIL_, 0xC5, 0x8C, FIL_, 0xCC,
+ 0x83, FIL_, 0xC3, 0x95, FIL_, 0xCC, 0x86, FIL_,
+ 0xC5, 0x8E, FIL_, 0xCC, 0x88, FIL_, 0xC3, 0x96,
+ FIL_, 0xCC, 0x8B, FIL_, 0xC5, 0x90, FIL_, 0xCC,
+ 0x89, FIL_, 0xE1, 0xBB, 0x8E, FIL_, 0x02, 0xCC,
+ 0x87, FIL_, 0xE1, 0xB9, 0x96, FIL_, 0xCC, 0x81,
+ FIL_, 0xE1, 0xB9, 0x94, FIL_, 0x08, 0xCC, 0x91,
+ FIL_, 0xC8, 0x92, FIL_, 0xCC, 0xA7, FIL_, 0xC5,
+ 0x96, FIL_, 0xCC, 0x8C, FIL_, 0xC5, 0x98, FIL_,
+ 0xCC, 0xB1, FIL_, 0xE1, 0xB9, 0x9E, FIL_, 0xCC,
+ 0xA3, FIL_, 0xE1, 0xB9, 0x9A, FIL_, 0xCC, 0x87,
+ FIL_, 0xE1, 0xB9, 0x98, FIL_, 0xCC, 0x81, FIL_,
+ 0xC5, 0x94, FIL_, 0xCC, 0x8F, FIL_, 0xC8, 0x90,
+ FIL_, 0x07, 0xCC, 0x81, FIL_, 0xC5, 0x9A, FIL_,
+ 0xCC, 0x82, FIL_, 0xC5, 0x9C, FIL_, 0xCC, 0xA7,
+ FIL_, 0xC5, 0x9E, FIL_, 0xCC, 0x8C, FIL_, 0xC5,
+ 0xA0, FIL_, 0xCC, 0xA6, FIL_, 0xC8, 0x98, FIL_,
+ 0xCC, 0x87, FIL_, 0xE1, 0xB9, 0xA0, FIL_, 0xCC,
+ 0xA3, FIL_, 0xE1, 0xB9, 0xA2, FIL_, 0x07, 0xCC,
+ 0x8C, FIL_, 0xC5, 0xA4, FIL_, 0xCC, 0xB1, FIL_,
+ 0xE1, 0xB9, 0xAE, FIL_, 0xCC, 0xA6, FIL_, 0xC8,
+ 0x9A, FIL_, 0xCC, 0xA7, FIL_, 0xC5, 0xA2, FIL_,
+ 0xCC, 0x87, FIL_, 0xE1, 0xB9, 0xAA, FIL_, 0xCC,
+ 0xAD, FIL_, 0xE1, 0xB9, 0xB0, FIL_, 0xCC, 0xA3,
+ FIL_, 0xE1, 0xB9, 0xAC, FIL_, 0x13, 0xCC, 0xA8,
+ FIL_, 0xC5, 0xB2, FIL_, 0xCC, 0x83, FIL_, 0xC5,
+ 0xA8, FIL_, 0xCC, 0x84, FIL_, 0xC5, 0xAA, FIL_,
+ 0xCC, 0x81, FIL_, 0xC3, 0x9A, FIL_, 0xCC, 0x86,
+ FIL_, 0xC5, 0xAC, FIL_, 0xCC, 0x8A, FIL_, 0xC5,
+ 0xAE, FIL_, 0xCC, 0x80, FIL_, 0xC3, 0x99, FIL_,
+ 0xCC, 0x91, FIL_, 0xC8, 0x96, FIL_, 0xCC, 0x8B,
+ FIL_, 0xC5, 0xB0, FIL_, 0xCC, 0xA4, FIL_, 0xE1,
+ 0xB9, 0xB2, FIL_, 0xCC, 0xB0, FIL_, 0xE1, 0xB9,
+ 0xB4, FIL_, 0xCC, 0x8F, FIL_, 0xC8, 0x94, FIL_,
+ 0xCC, 0xAD, FIL_, 0xE1, 0xB9, 0xB6, FIL_, 0xCC,
+ 0x9B, FIL_, 0xC6, 0xAF, FIL_, 0xCC, 0x82, FIL_,
+ 0xC3, 0x9B, FIL_, 0xCC, 0x88, FIL_, 0xC3, 0x9C,
+ FIL_, 0xCC, 0x8C, FIL_, 0xC7, 0x93, FIL_, 0xCC,
+ 0xA3, FIL_, 0xE1, 0xBB, 0xA4, FIL_, 0xCC, 0x89,
+ FIL_, 0xE1, 0xBB, 0xA6, FIL_, 0x02, 0xCC, 0x83,
+ FIL_, 0xE1, 0xB9, 0xBC, FIL_, 0xCC, 0xA3, FIL_,
+ 0xE1, 0xB9, 0xBE, FIL_, 0x06, 0xCC, 0x82, FIL_,
+ 0xC5, 0xB4, FIL_, 0xCC, 0x88, FIL_, 0xE1, 0xBA,
+ 0x84, FIL_, 0xCC, 0x87, FIL_, 0xE1, 0xBA, 0x86,
+ FIL_, 0xCC, 0xA3, FIL_, 0xE1, 0xBA, 0x88, FIL_,
+ 0xCC, 0x81, FIL_, 0xE1, 0xBA, 0x82, FIL_, 0xCC,
+ 0x80, FIL_, 0xE1, 0xBA, 0x80, FIL_, 0x02, 0xCC,
+ 0x87, FIL_, 0xE1, 0xBA, 0x8A, FIL_, 0xCC, 0x88,
+ FIL_, 0xE1, 0xBA, 0x8C, FIL_, 0x09, 0xCC, 0x89,
+ FIL_, 0xE1, 0xBB, 0xB6, FIL_, 0xCC, 0x87, FIL_,
+ 0xE1, 0xBA, 0x8E, FIL_, 0xCC, 0xA3, FIL_, 0xE1,
+ 0xBB, 0xB4, FIL_, 0xCC, 0x81, FIL_, 0xC3, 0x9D,
+ FIL_, 0xCC, 0x84, FIL_, 0xC8, 0xB2, FIL_, 0xCC,
+ 0x82, FIL_, 0xC5, 0xB6, FIL_, 0xCC, 0x88, FIL_,
+ 0xC5, 0xB8, FIL_, 0xCC, 0x80, FIL_, 0xE1, 0xBB,
+ 0xB2, FIL_, 0xCC, 0x83, FIL_, 0xE1, 0xBB, 0xB8,
+ FIL_, 0x06, 0xCC, 0x87, FIL_, 0xC5, 0xBB, FIL_,
+ 0xCC, 0xA3, FIL_, 0xE1, 0xBA, 0x92, FIL_, 0xCC,
+ 0x8C, FIL_, 0xC5, 0xBD, FIL_, 0xCC, 0xB1, FIL_,
+ 0xE1, 0xBA, 0x94, FIL_, 0xCC, 0x82, FIL_, 0xE1,
+ 0xBA, 0x90, FIL_, 0xCC, 0x81, FIL_, 0xC5, 0xB9,
+ FIL_, 0x10, 0xCC, 0x8C, FIL_, 0xC7, 0x8E, FIL_,
+ 0xCC, 0x8F, FIL_, 0xC8, 0x81, FIL_, 0xCC, 0xA8,
+ FIL_, 0xC4, 0x85, FIL_, 0xCC, 0xA3, FIL_, 0xE1,
+ 0xBA, 0xA1, FIL_, 0xCC, 0x86, FIL_, 0xC4, 0x83,
+ FIL_, 0xCC, 0x89, FIL_, 0xE1, 0xBA, 0xA3, FIL_,
+ 0xCC, 0x84, FIL_, 0xC4, 0x81, FIL_, 0xCC, 0x91,
+ FIL_, 0xC8, 0x83, FIL_, 0xCC, 0x8A, FIL_, 0xC3,
+ 0xA5, FIL_, 0xCC, 0x88, FIL_, 0xC3, 0xA4, FIL_,
+ 0xCC, 0x83, FIL_, 0xC3, 0xA3, FIL_, 0xCC, 0x82,
+ FIL_, 0xC3, 0xA2, FIL_, 0xCC, 0x81, FIL_, 0xC3,
+ 0xA1, FIL_, 0xCC, 0x80, FIL_, 0xC3, 0xA0, FIL_,
+ 0xCC, 0x87, FIL_, 0xC8, 0xA7, FIL_, 0xCC, 0xA5,
+ FIL_, 0xE1, 0xB8, 0x81, FIL_, 0x03, 0xCC, 0xB1,
+ FIL_, 0xE1, 0xB8, 0x87, FIL_, 0xCC, 0xA3, FIL_,
+ 0xE1, 0xB8, 0x85, FIL_, 0xCC, 0x87, FIL_, 0xE1,
+ 0xB8, 0x83, FIL_, 0x05, 0xCC, 0x87, FIL_, 0xC4,
+ 0x8B, FIL_, 0xCC, 0xA7, FIL_, 0xC3, 0xA7, FIL_,
+ 0xCC, 0x82, FIL_, 0xC4, 0x89, FIL_, 0xCC, 0x8C,
+ FIL_, 0xC4, 0x8D, FIL_, 0xCC, 0x81, FIL_, 0xC4,
+ 0x87, FIL_, 0x06, 0xCC, 0xAD, FIL_, 0xE1, 0xB8,
+ 0x93, FIL_, 0xCC, 0x87, FIL_, 0xE1, 0xB8, 0x8B,
+ FIL_, 0xCC, 0xA3, FIL_, 0xE1, 0xB8, 0x8D, FIL_,
+ 0xCC, 0xB1, FIL_, 0xE1, 0xB8, 0x8F, FIL_, 0xCC,
+ 0xA7, FIL_, 0xE1, 0xB8, 0x91, FIL_, 0xCC, 0x8C,
+ FIL_, 0xC4, 0x8F, FIL_, 0x11, 0xCC, 0xA8, FIL_,
+ 0xC4, 0x99, FIL_, 0xCC, 0x8C, FIL_, 0xC4, 0x9B,
+ FIL_, 0xCC, 0x87, FIL_, 0xC4, 0x97, FIL_, 0xCC,
+ 0x88, FIL_, 0xC3, 0xAB, FIL_, 0xCC, 0xA3, FIL_,
+ 0xE1, 0xBA, 0xB9, FIL_, 0xCC, 0xB0, FIL_, 0xE1,
+ 0xB8, 0x9B, FIL_, 0xCC, 0x84, FIL_, 0xC4, 0x93,
+ FIL_, 0xCC, 0xAD, FIL_, 0xE1, 0xB8, 0x99, FIL_,
+ 0xCC, 0x83, FIL_, 0xE1, 0xBA, 0xBD, FIL_, 0xCC,
+ 0x86, FIL_, 0xC4, 0x95, FIL_, 0xCC, 0xA7, FIL_,
+ 0xC8, 0xA9, FIL_, 0xCC, 0x89, FIL_, 0xE1, 0xBA,
+ 0xBB, FIL_, 0xCC, 0x8F, FIL_, 0xC8, 0x85, FIL_,
+ 0xCC, 0x81, FIL_, 0xC3, 0xA9, FIL_, 0xCC, 0x91,
+ FIL_, 0xC8, 0x87, FIL_, 0xCC, 0x80, FIL_, 0xC3,
+ 0xA8, FIL_, 0xCC, 0x82, FIL_, 0xC3, 0xAA, FIL_,
+ 0x01, 0xCC, 0x87, FIL_, 0xE1, 0xB8, 0x9F, FIL_,
+ 0x07, 0xCC, 0x86, FIL_, 0xC4, 0x9F, FIL_, 0xCC,
+ 0xA7, FIL_, 0xC4, 0xA3, FIL_, 0xCC, 0x81, FIL_,
+ 0xC7, 0xB5, FIL_, 0xCC, 0x82, FIL_, 0xC4, 0x9D,
+ FIL_, 0xCC, 0x87, FIL_, 0xC4, 0xA1, FIL_, 0xCC,
+ 0x8C, FIL_, 0xC7, 0xA7, FIL_, 0xCC, 0x84, FIL_,
+ 0xE1, 0xB8, 0xA1, FIL_, 0x08, 0xCC, 0x8C, FIL_,
+ 0xC8, 0x9F, FIL_, 0xCC, 0x82, FIL_, 0xC4, 0xA5,
+ FIL_, 0xCC, 0x88, FIL_, 0xE1, 0xB8, 0xA7, FIL_,
+ 0xCC, 0x87, FIL_, 0xE1, 0xB8, 0xA3, FIL_, 0xCC,
+ 0xB1, FIL_, 0xE1, 0xBA, 0x96, FIL_, 0xCC, 0xA3,
+ FIL_, 0xE1, 0xB8, 0xA5, FIL_, 0xCC, 0xA7, FIL_,
+ 0xE1, 0xB8, 0xA9, FIL_, 0xCC, 0xAE, FIL_, 0xE1,
+ 0xB8, 0xAB, FIL_, 0x0E, 0xCC, 0x81, FIL_, 0xC3,
+ 0xAD, FIL_, 0xCC, 0x80, FIL_, 0xC3, 0xAC, FIL_,
+ 0xCC, 0xA3, FIL_, 0xE1, 0xBB, 0x8B, FIL_, 0xCC,
+ 0x8C, FIL_, 0xC7, 0x90, FIL_, 0xCC, 0x89, FIL_,
+ 0xE1, 0xBB, 0x89, FIL_, 0xCC, 0x91, FIL_, 0xC8,
+ 0x8B, FIL_, 0xCC, 0x8F, FIL_, 0xC8, 0x89, FIL_,
+ 0xCC, 0x82, FIL_, 0xC3, 0xAE, FIL_, 0xCC, 0xB0,
+ FIL_, 0xE1, 0xB8, 0xAD, FIL_, 0xCC, 0xA8, FIL_,
+ 0xC4, 0xAF, FIL_, 0xCC, 0x86, FIL_, 0xC4, 0xAD,
+ FIL_, 0xCC, 0x84, FIL_, 0xC4, 0xAB, FIL_, 0xCC,
+ 0x83, FIL_, 0xC4, 0xA9, FIL_, 0xCC, 0x88, FIL_,
+ 0xC3, 0xAF, FIL_, 0x02, 0xCC, 0x82, FIL_, 0xC4,
+ 0xB5, FIL_, 0xCC, 0x8C, FIL_, 0xC7, 0xB0, FIL_,
+ 0x05, 0xCC, 0xA3, FIL_, 0xE1, 0xB8, 0xB3, FIL_,
+ 0xCC, 0x81, FIL_, 0xE1, 0xB8, 0xB1, FIL_, 0xCC,
+ 0xA7, FIL_, 0xC4, 0xB7, FIL_, 0xCC, 0x8C, FIL_,
+ 0xC7, 0xA9, FIL_, 0xCC, 0xB1, FIL_, 0xE1, 0xB8,
+ 0xB5, FIL_, 0x06, 0xCC, 0xA3, FIL_, 0xE1, 0xB8,
+ 0xB7, FIL_, 0xCC, 0x81, FIL_, 0xC4, 0xBA, FIL_,
+ 0xCC, 0xA7, FIL_, 0xC4, 0xBC, FIL_, 0xCC, 0x8C,
+ FIL_, 0xC4, 0xBE, FIL_, 0xCC, 0xB1, FIL_, 0xE1,
+ 0xB8, 0xBB, FIL_, 0xCC, 0xAD, FIL_, 0xE1, 0xB8,
+ 0xBD, FIL_, 0x03, 0xCC, 0xA3, FIL_, 0xE1, 0xB9,
+ 0x83, FIL_, 0xCC, 0x81, FIL_, 0xE1, 0xB8, 0xBF,
+ FIL_, 0xCC, 0x87, FIL_, 0xE1, 0xB9, 0x81, FIL_,
+ 0x09, 0xCC, 0xA3, FIL_, 0xE1, 0xB9, 0x87, FIL_,
+ 0xCC, 0x83, FIL_, 0xC3, 0xB1, FIL_, 0xCC, 0x87,
+ FIL_, 0xE1, 0xB9, 0x85, FIL_, 0xCC, 0xB1, FIL_,
+ 0xE1, 0xB9, 0x89, FIL_, 0xCC, 0x81, FIL_, 0xC5,
+ 0x84, FIL_, 0xCC, 0xA7, FIL_, 0xC5, 0x86, FIL_,
+ 0xCC, 0xAD, FIL_, 0xE1, 0xB9, 0x8B, FIL_, 0xCC,
+ 0x8C, FIL_, 0xC5, 0x88, FIL_, 0xCC, 0x80, FIL_,
+ 0xC7, 0xB9, FIL_, 0x10, 0xCC, 0x89, FIL_, 0xE1,
+ 0xBB, 0x8F, FIL_, 0xCC, 0x81, FIL_, 0xC3, 0xB3,
+ FIL_, 0xCC, 0x80, FIL_, 0xC3, 0xB2, FIL_, 0xCC,
+ 0x87, FIL_, 0xC8, 0xAF, FIL_, 0xCC, 0x8F, FIL_,
+ 0xC8, 0x8D, FIL_, 0xCC, 0xA3, FIL_, 0xE1, 0xBB,
+ 0x8D, FIL_, 0xCC, 0x84, FIL_, 0xC5, 0x8D, FIL_,
+ 0xCC, 0x8C, FIL_, 0xC7, 0x92, FIL_, 0xCC, 0x86,
+ FIL_, 0xC5, 0x8F, FIL_, 0xCC, 0x8B, FIL_, 0xC5,
+ 0x91, FIL_, 0xCC, 0x9B, FIL_, 0xC6, 0xA1, FIL_,
+ 0xCC, 0x91, FIL_, 0xC8, 0x8F, FIL_, 0xCC, 0xA8,
+ FIL_, 0xC7, 0xAB, FIL_, 0xCC, 0x88, FIL_, 0xC3,
+ 0xB6, FIL_, 0xCC, 0x83, FIL_, 0xC3, 0xB5, FIL_,
+ 0xCC, 0x82, FIL_, 0xC3, 0xB4, FIL_, 0x02, 0xCC,
+ 0x87, FIL_, 0xE1, 0xB9, 0x97, FIL_, 0xCC, 0x81,
+ FIL_, 0xE1, 0xB9, 0x95, FIL_, 0x08, 0xCC, 0xB1,
+ FIL_, 0xE1, 0xB9, 0x9F, FIL_, 0xCC, 0x87, FIL_,
+ 0xE1, 0xB9, 0x99, FIL_, 0xCC, 0x81, FIL_, 0xC5,
+ 0x95, FIL_, 0xCC, 0x8F, FIL_, 0xC8, 0x91, FIL_,
+ 0xCC, 0xA3, FIL_, 0xE1, 0xB9, 0x9B, FIL_, 0xCC,
+ 0x8C, FIL_, 0xC5, 0x99, FIL_, 0xCC, 0x91, FIL_,
+ 0xC8, 0x93, FIL_, 0xCC, 0xA7, FIL_, 0xC5, 0x97,
+ FIL_, 0x07, 0xCC, 0xA6, FIL_, 0xC8, 0x99, FIL_,
+ 0xCC, 0x8C, FIL_, 0xC5, 0xA1, FIL_, 0xCC, 0x81,
+ FIL_, 0xC5, 0x9B, FIL_, 0xCC, 0x87, FIL_, 0xE1,
+ 0xB9, 0xA1, FIL_, 0xCC, 0x82, FIL_, 0xC5, 0x9D,
+ FIL_, 0xCC, 0xA7, FIL_, 0xC5, 0x9F, FIL_, 0xCC,
+ 0xA3, FIL_, 0xE1, 0xB9, 0xA3, FIL_, 0x08, 0xCC,
+ 0x88, FIL_, 0xE1, 0xBA, 0x97, FIL_, 0xCC, 0xAD,
+ FIL_, 0xE1, 0xB9, 0xB1, FIL_, 0xCC, 0xB1, FIL_,
+ 0xE1, 0xB9, 0xAF, FIL_, 0xCC, 0xA3, FIL_, 0xE1,
+ 0xB9, 0xAD, FIL_, 0xCC, 0x8C, FIL_, 0xC5, 0xA5,
+ FIL_, 0xCC, 0xA7, FIL_, 0xC5, 0xA3, FIL_, 0xCC,
+ 0x87, FIL_, 0xE1, 0xB9, 0xAB, FIL_, 0xCC, 0xA6,
+ FIL_, 0xC8, 0x9B, FIL_, 0x13, 0xCC, 0x81, FIL_,
+ 0xC3, 0xBA, FIL_, 0xCC, 0x91, FIL_, 0xC8, 0x97,
+ FIL_, 0xCC, 0x83, FIL_, 0xC5, 0xA9, FIL_, 0xCC,
+ 0x8F, FIL_, 0xC8, 0x95, FIL_, 0xCC, 0xA8, FIL_,
+ 0xC5, 0xB3, FIL_, 0xCC, 0x82, FIL_, 0xC3, 0xBB,
+ FIL_, 0xCC, 0x88, FIL_, 0xC3, 0xBC, FIL_, 0xCC,
+ 0x80, FIL_, 0xC3, 0xB9, FIL_, 0xCC, 0xA3, FIL_,
+ 0xE1, 0xBB, 0xA5, FIL_, 0xCC, 0xA4, FIL_, 0xE1,
+ 0xB9, 0xB3, FIL_, 0xCC, 0x89, FIL_, 0xE1, 0xBB,
+ 0xA7, FIL_, 0xCC, 0xB0, FIL_, 0xE1, 0xB9, 0xB5,
+ FIL_, 0xCC, 0xAD, FIL_, 0xE1, 0xB9, 0xB7, FIL_,
+ 0xCC, 0x9B, FIL_, 0xC6, 0xB0, FIL_, 0xCC, 0x84,
+ FIL_, 0xC5, 0xAB, FIL_, 0xCC, 0x8B, FIL_, 0xC5,
+ 0xB1, FIL_, 0xCC, 0x86, FIL_, 0xC5, 0xAD, FIL_,
+ 0xCC, 0x8C, FIL_, 0xC7, 0x94, FIL_, 0xCC, 0x8A,
+ FIL_, 0xC5, 0xAF, FIL_, 0x02, 0xCC, 0x83, FIL_,
+ 0xE1, 0xB9, 0xBD, FIL_, 0xCC, 0xA3, FIL_, 0xE1,
+ 0xB9, 0xBF, FIL_, 0x07, 0xCC, 0x82, FIL_, 0xC5,
+ 0xB5, FIL_, 0xCC, 0x80, FIL_, 0xE1, 0xBA, 0x81,
+ FIL_, 0xCC, 0x81, FIL_, 0xE1, 0xBA, 0x83, FIL_,
+ 0xCC, 0x88, FIL_, 0xE1, 0xBA, 0x85, FIL_, 0xCC,
+ 0xA3, FIL_, 0xE1, 0xBA, 0x89, FIL_, 0xCC, 0x87,
+ FIL_, 0xE1, 0xBA, 0x87, FIL_, 0xCC, 0x8A, FIL_,
+ 0xE1, 0xBA, 0x98, FIL_, 0x02, 0xCC, 0x87, FIL_,
+ 0xE1, 0xBA, 0x8B, FIL_, 0xCC, 0x88, FIL_, 0xE1,
+ 0xBA, 0x8D, FIL_, 0x0A, 0xCC, 0x87, FIL_, 0xE1,
+ 0xBA, 0x8F, FIL_, 0xCC, 0x83, FIL_, 0xE1, 0xBB,
+ 0xB9, FIL_, 0xCC, 0x80, FIL_, 0xE1, 0xBB, 0xB3,
+ FIL_, 0xCC, 0x89, FIL_, 0xE1, 0xBB, 0xB7, FIL_,
+ 0xCC, 0xA3, FIL_, 0xE1, 0xBB, 0xB5, FIL_, 0xCC,
+ 0x82, FIL_, 0xC5, 0xB7, FIL_, 0xCC, 0x84, FIL_,
+ 0xC8, 0xB3, FIL_, 0xCC, 0x8A, FIL_, 0xE1, 0xBA,
+ 0x99, FIL_, 0xCC, 0x88, FIL_, 0xC3, 0xBF, FIL_,
+ 0xCC, 0x81, FIL_, 0xC3, 0xBD, FIL_, 0x06, 0xCC,
+ 0x8C, FIL_, 0xC5, 0xBE, FIL_, 0xCC, 0x87, FIL_,
+ 0xC5, 0xBC, FIL_, 0xCC, 0xB1, FIL_, 0xE1, 0xBA,
+ 0x95, FIL_, 0xCC, 0xA3, FIL_, 0xE1, 0xBA, 0x93,
+ FIL_, 0xCC, 0x81, FIL_, 0xC5, 0xBA, FIL_, 0xCC,
+ 0x82, FIL_, 0xE1, 0xBA, 0x91, FIL_, 0x03, 0xCC,
+ 0x80, FIL_, 0xE1, 0xBF, 0xAD, FIL_, 0xCD, 0x82,
+ FIL_, 0xE1, 0xBF, 0x81, FIL_, 0xCC, 0x81, FIL_,
+ 0xCE, 0x85, FIL_, 0x04, 0xCC, 0x89, FIL_, 0xE1,
+ 0xBA, 0xA8, FIL_, 0xCC, 0x83, FIL_, 0xE1, 0xBA,
+ 0xAA, FIL_, 0xCC, 0x81, FIL_, 0xE1, 0xBA, 0xA4,
+ FIL_, 0xCC, 0x80, FIL_, 0xE1, 0xBA, 0xA6, FIL_,
+ 0x01, 0xCC, 0x84, FIL_, 0xC7, 0x9E, FIL_, 0x01,
+ 0xCC, 0x81, FIL_, 0xC7, 0xBA, FIL_, 0x02, 0xCC,
+ 0x84, FIL_, 0xC7, 0xA2, FIL_, 0xCC, 0x81, FIL_,
+ 0xC7, 0xBC, FIL_, 0x01, 0xCC, 0x81, FIL_, 0xE1,
+ 0xB8, 0x88, FIL_, 0x04, 0xCC, 0x81, FIL_, 0xE1,
+ 0xBA, 0xBE, FIL_, 0xCC, 0x80, FIL_, 0xE1, 0xBB,
+ 0x80, FIL_, 0xCC, 0x83, FIL_, 0xE1, 0xBB, 0x84,
+ FIL_, 0xCC, 0x89, FIL_, 0xE1, 0xBB, 0x82, FIL_,
+ 0x01, 0xCC, 0x81, FIL_, 0xE1, 0xB8, 0xAE, FIL_,
+ 0x04, 0xCC, 0x83, FIL_, 0xE1, 0xBB, 0x96, FIL_,
+ 0xCC, 0x81, FIL_, 0xE1, 0xBB, 0x90, FIL_, 0xCC,
+ 0x80, FIL_, 0xE1, 0xBB, 0x92, FIL_, 0xCC, 0x89,
+ FIL_, 0xE1, 0xBB, 0x94, FIL_, 0x03, 0xCC, 0x84,
+ FIL_, 0xC8, 0xAC, FIL_, 0xCC, 0x81, FIL_, 0xE1,
+ 0xB9, 0x8C, FIL_, 0xCC, 0x88, FIL_, 0xE1, 0xB9,
+ 0x8E, FIL_, 0x01, 0xCC, 0x84, FIL_, 0xC8, 0xAA,
+ FIL_, 0x01, 0xCC, 0x81, FIL_, 0xC7, 0xBE, FIL_,
+ 0x04, 0xCC, 0x80, FIL_, 0xC7, 0x9B, FIL_, 0xCC,
+ 0x84, FIL_, 0xC7, 0x95, FIL_, 0xCC, 0x8C, FIL_,
+ 0xC7, 0x99, FIL_, 0xCC, 0x81, FIL_, 0xC7, 0x97,
+ FIL_, 0x04, 0xCC, 0x89, FIL_, 0xE1, 0xBA, 0xA9,
+ FIL_, 0xCC, 0x80, FIL_, 0xE1, 0xBA, 0xA7, FIL_,
+ 0xCC, 0x81, FIL_, 0xE1, 0xBA, 0xA5, FIL_, 0xCC,
+ 0x83, FIL_, 0xE1, 0xBA, 0xAB, FIL_, 0x01, 0xCC,
+ 0x84, FIL_, 0xC7, 0x9F, FIL_, 0x01, 0xCC, 0x81,
+ FIL_, 0xC7, 0xBB, FIL_, 0x02, 0xCC, 0x84, FIL_,
+ 0xC7, 0xA3, FIL_, 0xCC, 0x81, FIL_, 0xC7, 0xBD,
+ FIL_, 0x01, 0xCC, 0x81, FIL_, 0xE1, 0xB8, 0x89,
+ FIL_, 0x04, 0xCC, 0x89, FIL_, 0xE1, 0xBB, 0x83,
+ FIL_, 0xCC, 0x81, FIL_, 0xE1, 0xBA, 0xBF, FIL_,
+ 0xCC, 0x80, FIL_, 0xE1, 0xBB, 0x81, FIL_, 0xCC,
+ 0x83, FIL_, 0xE1, 0xBB, 0x85, FIL_, 0x01, 0xCC,
+ 0x81, FIL_, 0xE1, 0xB8, 0xAF, FIL_, 0x04, 0xCC,
+ 0x83, FIL_, 0xE1, 0xBB, 0x97, FIL_, 0xCC, 0x89,
+ FIL_, 0xE1, 0xBB, 0x95, FIL_, 0xCC, 0x80, FIL_,
+ 0xE1, 0xBB, 0x93, FIL_, 0xCC, 0x81, FIL_, 0xE1,
+ 0xBB, 0x91, FIL_, 0x03, 0xCC, 0x81, FIL_, 0xE1,
+ 0xB9, 0x8D, FIL_, 0xCC, 0x84, FIL_, 0xC8, 0xAD,
+ FIL_, 0xCC, 0x88, FIL_, 0xE1, 0xB9, 0x8F, FIL_,
+ 0x01, 0xCC, 0x84, FIL_, 0xC8, 0xAB, FIL_, 0x01,
+ 0xCC, 0x81, FIL_, 0xC7, 0xBF, FIL_, 0x04, 0xCC,
+ 0x81, FIL_, 0xC7, 0x98, FIL_, 0xCC, 0x84, FIL_,
+ 0xC7, 0x96, FIL_, 0xCC, 0x8C, FIL_, 0xC7, 0x9A,
+ FIL_, 0xCC, 0x80, FIL_, 0xC7, 0x9C, FIL_, 0x04,
+ 0xCC, 0x80, FIL_, 0xE1, 0xBA, 0xB0, FIL_, 0xCC,
+ 0x81, FIL_, 0xE1, 0xBA, 0xAE, FIL_, 0xCC, 0x83,
+ FIL_, 0xE1, 0xBA, 0xB4, FIL_, 0xCC, 0x89, FIL_,
+ 0xE1, 0xBA, 0xB2, FIL_, 0x04, 0xCC, 0x80, FIL_,
+ 0xE1, 0xBA, 0xB1, FIL_, 0xCC, 0x83, FIL_, 0xE1,
+ 0xBA, 0xB5, FIL_, 0xCC, 0x81, FIL_, 0xE1, 0xBA,
+ 0xAF, FIL_, 0xCC, 0x89, FIL_, 0xE1, 0xBA, 0xB3,
+ FIL_, 0x02, 0xCC, 0x81, FIL_, 0xE1, 0xB8, 0x96,
+ FIL_, 0xCC, 0x80, FIL_, 0xE1, 0xB8, 0x94, FIL_,
+ 0x02, 0xCC, 0x80, FIL_, 0xE1, 0xB8, 0x95, FIL_,
+ 0xCC, 0x81, FIL_, 0xE1, 0xB8, 0x97, FIL_, 0x02,
+ 0xCC, 0x80, FIL_, 0xE1, 0xB9, 0x90, FIL_, 0xCC,
+ 0x81, FIL_, 0xE1, 0xB9, 0x92, FIL_, 0x02, 0xCC,
+ 0x80, FIL_, 0xE1, 0xB9, 0x91, FIL_, 0xCC, 0x81,
+ FIL_, 0xE1, 0xB9, 0x93, FIL_, 0x01, 0xCC, 0x87,
+ FIL_, 0xE1, 0xB9, 0xA4, FIL_, 0x01, 0xCC, 0x87,
+ FIL_, 0xE1, 0xB9, 0xA5, FIL_, 0x01, 0xCC, 0x87,
+ FIL_, 0xE1, 0xB9, 0xA6, FIL_, 0x01, 0xCC, 0x87,
+ FIL_, 0xE1, 0xB9, 0xA7, FIL_, 0x01, 0xCC, 0x81,
+ FIL_, 0xE1, 0xB9, 0xB8, FIL_, 0x01, 0xCC, 0x81,
+ FIL_, 0xE1, 0xB9, 0xB9, FIL_, 0x01, 0xCC, 0x88,
+ FIL_, 0xE1, 0xB9, 0xBA, FIL_, 0x01, 0xCC, 0x88,
+ FIL_, 0xE1, 0xB9, 0xBB, FIL_, 0x01, 0xCC, 0x87,
+ FIL_, 0xE1, 0xBA, 0x9B, FIL_, 0x05, 0xCC, 0x80,
+ FIL_, 0xE1, 0xBB, 0x9C, FIL_, 0xCC, 0x81, FIL_,
+ 0xE1, 0xBB, 0x9A, FIL_, 0xCC, 0xA3, FIL_, 0xE1,
+ 0xBB, 0xA2, FIL_, 0xCC, 0x83, FIL_, 0xE1, 0xBB,
+ 0xA0, FIL_, 0xCC, 0x89, FIL_, 0xE1, 0xBB, 0x9E,
+ FIL_, 0x05, 0xCC, 0x83, FIL_, 0xE1, 0xBB, 0xA1,
+ FIL_, 0xCC, 0x81, FIL_, 0xE1, 0xBB, 0x9B, FIL_,
+ 0xCC, 0xA3, FIL_, 0xE1, 0xBB, 0xA3, FIL_, 0xCC,
+ 0x89, FIL_, 0xE1, 0xBB, 0x9F, FIL_, 0xCC, 0x80,
+ FIL_, 0xE1, 0xBB, 0x9D, FIL_, 0x05, 0xCC, 0x83,
+ FIL_, 0xE1, 0xBB, 0xAE, FIL_, 0xCC, 0xA3, FIL_,
+ 0xE1, 0xBB, 0xB0, FIL_, 0xCC, 0x89, FIL_, 0xE1,
+ 0xBB, 0xAC, FIL_, 0xCC, 0x81, FIL_, 0xE1, 0xBB,
+ 0xA8, FIL_, 0xCC, 0x80, FIL_, 0xE1, 0xBB, 0xAA,
+ FIL_, 0x05, 0xCC, 0xA3, FIL_, 0xE1, 0xBB, 0xB1,
+ FIL_, 0xCC, 0x83, FIL_, 0xE1, 0xBB, 0xAF, FIL_,
+ 0xCC, 0x89, FIL_, 0xE1, 0xBB, 0xAD, FIL_, 0xCC,
+ 0x81, FIL_, 0xE1, 0xBB, 0xA9, FIL_, 0xCC, 0x80,
+ FIL_, 0xE1, 0xBB, 0xAB, FIL_, 0x01, 0xCC, 0x8C,
+ FIL_, 0xC7, 0xAE, FIL_, 0x01, 0xCC, 0x84, FIL_,
+ 0xC7, 0xAC, FIL_, 0x01, 0xCC, 0x84, FIL_, 0xC7,
+ 0xAD, FIL_, 0x01, 0xCC, 0x84, FIL_, 0xC7, 0xA0,
+ FIL_, 0x01, 0xCC, 0x84, FIL_, 0xC7, 0xA1, FIL_,
+ 0x01, 0xCC, 0x86, FIL_, 0xE1, 0xB8, 0x9C, FIL_,
+ 0x01, 0xCC, 0x86, FIL_, 0xE1, 0xB8, 0x9D, FIL_,
+ 0x01, 0xCC, 0x84, FIL_, 0xC8, 0xB0, FIL_, 0x01,
+ 0xCC, 0x84, FIL_, 0xC8, 0xB1, FIL_, 0x01, 0xCC,
+ 0x8C, FIL_, 0xC7, 0xAF, FIL_, 0x07, 0xCC, 0x93,
+ FIL_, 0xE1, 0xBC, 0x88, FIL_, 0xCC, 0x94, FIL_,
+ 0xE1, 0xBC, 0x89, FIL_, 0xCC, 0x81, FIL_, 0xCE,
+ 0x86, FIL_, 0xCD, 0x85, FIL_, 0xE1, 0xBE, 0xBC,
+ FIL_, 0xCC, 0x80, FIL_, 0xE1, 0xBE, 0xBA, FIL_,
+ 0xCC, 0x84, FIL_, 0xE1, 0xBE, 0xB9, FIL_, 0xCC,
+ 0x86, FIL_, 0xE1, 0xBE, 0xB8, FIL_, 0x04, 0xCC,
+ 0x81, FIL_, 0xCE, 0x88, FIL_, 0xCC, 0x94, FIL_,
+ 0xE1, 0xBC, 0x99, FIL_, 0xCC, 0x93, FIL_, 0xE1,
+ 0xBC, 0x98, FIL_, 0xCC, 0x80, FIL_, 0xE1, 0xBF,
+ 0x88, FIL_, 0x05, 0xCC, 0x94, FIL_, 0xE1, 0xBC,
+ 0xA9, FIL_, 0xCC, 0x80, FIL_, 0xE1, 0xBF, 0x8A,
+ FIL_, 0xCC, 0x81, FIL_, 0xCE, 0x89, FIL_, 0xCD,
+ 0x85, FIL_, 0xE1, 0xBF, 0x8C, FIL_, 0xCC, 0x93,
+ FIL_, 0xE1, 0xBC, 0xA8, FIL_, 0x07, 0xCC, 0x81,
+ FIL_, 0xCE, 0x8A, FIL_, 0xCC, 0x88, FIL_, 0xCE,
+ 0xAA, FIL_, 0xCC, 0x86, FIL_, 0xE1, 0xBF, 0x98,
+ FIL_, 0xCC, 0x84, FIL_, 0xE1, 0xBF, 0x99, FIL_,
+ 0xCC, 0x93, FIL_, 0xE1, 0xBC, 0xB8, FIL_, 0xCC,
+ 0x94, FIL_, 0xE1, 0xBC, 0xB9, FIL_, 0xCC, 0x80,
+ FIL_, 0xE1, 0xBF, 0x9A, FIL_, 0x04, 0xCC, 0x94,
+ FIL_, 0xE1, 0xBD, 0x89, FIL_, 0xCC, 0x80, FIL_,
+ 0xE1, 0xBF, 0xB8, FIL_, 0xCC, 0x81, FIL_, 0xCE,
+ 0x8C, FIL_, 0xCC, 0x93, FIL_, 0xE1, 0xBD, 0x88,
+ FIL_, 0x01, 0xCC, 0x94, FIL_, 0xE1, 0xBF, 0xAC,
+ FIL_, 0x06, 0xCC, 0x81, FIL_, 0xCE, 0x8E, FIL_,
+ 0xCC, 0x86, FIL_, 0xE1, 0xBF, 0xA8, FIL_, 0xCC,
+ 0x94, FIL_, 0xE1, 0xBD, 0x99, FIL_, 0xCC, 0x80,
+ FIL_, 0xE1, 0xBF, 0xAA, FIL_, 0xCC, 0x84, FIL_,
+ 0xE1, 0xBF, 0xA9, FIL_, 0xCC, 0x88, FIL_, 0xCE,
+ 0xAB, FIL_, 0x05, 0xCC, 0x80, FIL_, 0xE1, 0xBF,
+ 0xBA, FIL_, 0xCC, 0x81, FIL_, 0xCE, 0x8F, FIL_,
+ 0xCD, 0x85, FIL_, 0xE1, 0xBF, 0xBC, FIL_, 0xCC,
+ 0x94, FIL_, 0xE1, 0xBD, 0xA9, FIL_, 0xCC, 0x93,
+ FIL_, 0xE1, 0xBD, 0xA8, FIL_, 0x01, 0xCD, 0x85,
+ FIL_, 0xE1, 0xBE, 0xB4, FIL_, 0x01, 0xCD, 0x85,
+ FIL_, 0xE1, 0xBF, 0x84, FIL_, 0x08, 0xCC, 0x81,
+ FIL_, 0xCE, 0xAC, FIL_, 0xCC, 0x80, FIL_, 0xE1,
+ 0xBD, 0xB0, FIL_, 0xCC, 0x93, FIL_, 0xE1, 0xBC,
+ 0x80, FIL_, 0xCC, 0x94, FIL_, 0xE1, 0xBC, 0x81,
+ FIL_, 0xCD, 0x82, FIL_, 0xE1, 0xBE, 0xB6, FIL_,
+ 0xCC, 0x86, FIL_, 0xE1, 0xBE, 0xB0, FIL_, 0xCD,
+ 0x85, FIL_, 0xE1, 0xBE, 0xB3, FIL_, 0xCC, 0x84,
+ FIL_, 0xE1, 0xBE, 0xB1, FIL_, 0x04, 0xCC, 0x81,
+ FIL_, 0xCE, 0xAD, FIL_, 0xCC, 0x94, FIL_, 0xE1,
+ 0xBC, 0x91, FIL_, 0xCC, 0x80, FIL_, 0xE1, 0xBD,
+ 0xB2, FIL_, 0xCC, 0x93, FIL_, 0xE1, 0xBC, 0x90,
+ FIL_, 0x06, 0xCC, 0x81, FIL_, 0xCE, 0xAE, FIL_,
+ 0xCC, 0x80, FIL_, 0xE1, 0xBD, 0xB4, FIL_, 0xCD,
+ 0x85, FIL_, 0xE1, 0xBF, 0x83, FIL_, 0xCD, 0x82,
+ FIL_, 0xE1, 0xBF, 0x86, FIL_, 0xCC, 0x94, FIL_,
+ 0xE1, 0xBC, 0xA1, FIL_, 0xCC, 0x93, FIL_, 0xE1,
+ 0xBC, 0xA0, FIL_, 0x08, 0xCD, 0x82, FIL_, 0xE1,
+ 0xBF, 0x96, FIL_, 0xCC, 0x86, FIL_, 0xE1, 0xBF,
+ 0x90, FIL_, 0xCC, 0x93, FIL_, 0xE1, 0xBC, 0xB0,
+ FIL_, 0xCC, 0x81, FIL_, 0xCE, 0xAF, FIL_, 0xCC,
+ 0x94, FIL_, 0xE1, 0xBC, 0xB1, FIL_, 0xCC, 0x84,
+ FIL_, 0xE1, 0xBF, 0x91, FIL_, 0xCC, 0x88, FIL_,
+ 0xCF, 0x8A, FIL_, 0xCC, 0x80, FIL_, 0xE1, 0xBD,
+ 0xB6, FIL_, 0x04, 0xCC, 0x81, FIL_, 0xCF, 0x8C,
+ FIL_, 0xCC, 0x80, FIL_, 0xE1, 0xBD, 0xB8, FIL_,
+ 0xCC, 0x93, FIL_, 0xE1, 0xBD, 0x80, FIL_, 0xCC,
+ 0x94, FIL_, 0xE1, 0xBD, 0x81, FIL_, 0x02, 0xCC,
+ 0x93, FIL_, 0xE1, 0xBF, 0xA4, FIL_, 0xCC, 0x94,
+ FIL_, 0xE1, 0xBF, 0xA5, FIL_, 0x08, 0xCC, 0x93,
+ FIL_, 0xE1, 0xBD, 0x90, FIL_, 0xCC, 0x94, FIL_,
+ 0xE1, 0xBD, 0x91, FIL_, 0xCC, 0x86, FIL_, 0xE1,
+ 0xBF, 0xA0, FIL_, 0xCD, 0x82, FIL_, 0xE1, 0xBF,
+ 0xA6, FIL_, 0xCC, 0x84, FIL_, 0xE1, 0xBF, 0xA1,
+ FIL_, 0xCC, 0x80, FIL_, 0xE1, 0xBD, 0xBA, FIL_,
+ 0xCC, 0x81, FIL_, 0xCF, 0x8D, FIL_, 0xCC, 0x88,
+ FIL_, 0xCF, 0x8B, FIL_, 0x06, 0xCC, 0x94, FIL_,
+ 0xE1, 0xBD, 0xA1, FIL_, 0xCD, 0x85, FIL_, 0xE1,
+ 0xBF, 0xB3, FIL_, 0xCC, 0x80, FIL_, 0xE1, 0xBD,
+ 0xBC, FIL_, 0xCD, 0x82, FIL_, 0xE1, 0xBF, 0xB6,
+ FIL_, 0xCC, 0x93, FIL_, 0xE1, 0xBD, 0xA0, FIL_,
+ 0xCC, 0x81, FIL_, 0xCF, 0x8E, FIL_, 0x03, 0xCD,
+ 0x82, FIL_, 0xE1, 0xBF, 0x97, FIL_, 0xCC, 0x80,
+ FIL_, 0xE1, 0xBF, 0x92, FIL_, 0xCC, 0x81, FIL_,
+ 0xCE, 0x90, FIL_, 0x03, 0xCC, 0x80, FIL_, 0xE1,
+ 0xBF, 0xA2, FIL_, 0xCC, 0x81, FIL_, 0xCE, 0xB0,
+ FIL_, 0xCD, 0x82, FIL_, 0xE1, 0xBF, 0xA7, FIL_,
+ 0x01, 0xCD, 0x85, FIL_, 0xE1, 0xBF, 0xB4, FIL_,
+ 0x02, 0xCC, 0x88, FIL_, 0xCF, 0x94, FIL_, 0xCC,
+ 0x81, FIL_, 0xCF, 0x93, FIL_, 0x01, 0xCC, 0x88,
+ FIL_, 0xD0, 0x87, FIL_, 0x02, 0xCC, 0x86, FIL_,
+ 0xD3, 0x90, FIL_, 0xCC, 0x88, FIL_, 0xD3, 0x92,
+ FIL_, 0x01, 0xCC, 0x81, FIL_, 0xD0, 0x83, FIL_,
+ 0x03, 0xCC, 0x86, FIL_, 0xD3, 0x96, FIL_, 0xCC,
+ 0x80, FIL_, 0xD0, 0x80, FIL_, 0xCC, 0x88, FIL_,
+ 0xD0, 0x81, FIL_, 0x02, 0xCC, 0x88, FIL_, 0xD3,
+ 0x9C, FIL_, 0xCC, 0x86, FIL_, 0xD3, 0x81, FIL_,
+ 0x01, 0xCC, 0x88, FIL_, 0xD3, 0x9E, FIL_, 0x04,
+ 0xCC, 0x80, FIL_, 0xD0, 0x8D, FIL_, 0xCC, 0x88,
+ FIL_, 0xD3, 0xA4, FIL_, 0xCC, 0x86, FIL_, 0xD0,
+ 0x99, FIL_, 0xCC, 0x84, FIL_, 0xD3, 0xA2, FIL_,
+ 0x01, 0xCC, 0x81, FIL_, 0xD0, 0x8C, FIL_, 0x01,
+ 0xCC, 0x88, FIL_, 0xD3, 0xA6, FIL_, 0x04, 0xCC,
+ 0x86, FIL_, 0xD0, 0x8E, FIL_, 0xCC, 0x8B, FIL_,
+ 0xD3, 0xB2, FIL_, 0xCC, 0x88, FIL_, 0xD3, 0xB0,
+ FIL_, 0xCC, 0x84, FIL_, 0xD3, 0xAE, FIL_, 0x01,
+ 0xCC, 0x88, FIL_, 0xD3, 0xB4, FIL_, 0x01, 0xCC,
+ 0x88, FIL_, 0xD3, 0xB8, FIL_, 0x01, 0xCC, 0x88,
+ FIL_, 0xD3, 0xAC, FIL_, 0x02, 0xCC, 0x86, FIL_,
+ 0xD3, 0x91, FIL_, 0xCC, 0x88, FIL_, 0xD3, 0x93,
+ FIL_, 0x01, 0xCC, 0x81, FIL_, 0xD1, 0x93, FIL_,
+ 0x03, 0xCC, 0x80, FIL_, 0xD1, 0x90, FIL_, 0xCC,
+ 0x88, FIL_, 0xD1, 0x91, FIL_, 0xCC, 0x86, FIL_,
+ 0xD3, 0x97, FIL_, 0x02, 0xCC, 0x88, FIL_, 0xD3,
+ 0x9D, FIL_, 0xCC, 0x86, FIL_, 0xD3, 0x82, FIL_,
+ 0x01, 0xCC, 0x88, FIL_, 0xD3, 0x9F, FIL_, 0x04,
+ 0xCC, 0x88, FIL_, 0xD3, 0xA5, FIL_, 0xCC, 0x86,
+ FIL_, 0xD0, 0xB9, FIL_, 0xCC, 0x80, FIL_, 0xD1,
+ 0x9D, FIL_, 0xCC, 0x84, FIL_, 0xD3, 0xA3, FIL_,
+ 0x01, 0xCC, 0x81, FIL_, 0xD1, 0x9C, FIL_, 0x01,
+ 0xCC, 0x88, FIL_, 0xD3, 0xA7, FIL_, 0x04, 0xCC,
+ 0x84, FIL_, 0xD3, 0xAF, FIL_, 0xCC, 0x86, FIL_,
+ 0xD1, 0x9E, FIL_, 0xCC, 0x8B, FIL_, 0xD3, 0xB3,
+ FIL_, 0xCC, 0x88, FIL_, 0xD3, 0xB1, FIL_, 0x01,
+ 0xCC, 0x88, FIL_, 0xD3, 0xB5, FIL_, 0x01, 0xCC,
+ 0x88, FIL_, 0xD3, 0xB9, FIL_, 0x01, 0xCC, 0x88,
+ FIL_, 0xD3, 0xAD, FIL_, 0x01, 0xCC, 0x88, FIL_,
+ 0xD1, 0x97, FIL_, 0x01, 0xCC, 0x8F, FIL_, 0xD1,
+ 0xB6, FIL_, 0x01, 0xCC, 0x8F, FIL_, 0xD1, 0xB7,
+ FIL_, 0x01, 0xCC, 0x88, FIL_, 0xD3, 0x9A, FIL_,
+ 0x01, 0xCC, 0x88, FIL_, 0xD3, 0x9B, FIL_, 0x01,
+ 0xCC, 0x88, FIL_, 0xD3, 0xAA, FIL_, 0x01, 0xCC,
+ 0x88, FIL_, 0xD3, 0xAB, FIL_, 0x03, 0xD9, 0x94,
+ FIL_, 0xD8, 0xA3, FIL_, 0xD9, 0x93, FIL_, 0xD8,
+ 0xA2, FIL_, 0xD9, 0x95, FIL_, 0xD8, 0xA5, FIL_,
+ 0x01, 0xD9, 0x94, FIL_, 0xD8, 0xA4, FIL_, 0x01,
+ 0xD9, 0x94, FIL_, 0xD8, 0xA6, FIL_, 0x01, 0xD9,
+ 0x94, FIL_, 0xDB, 0x82, FIL_, 0x01, 0xD9, 0x94,
+ FIL_, 0xDB, 0x93, FIL_, 0x01, 0xD9, 0x94, FIL_,
+ 0xDB, 0x80, FIL_, 0x01, 0xE0, 0xA4, 0xBC, FIL_,
+ 0xE0, 0xA4, 0xA9, FIL_, 0x01, 0xE0, 0xA4, 0xBC,
+ FIL_, 0xE0, 0xA4, 0xB1, FIL_, 0x01, 0xE0, 0xA4,
+ 0xBC, FIL_, 0xE0, 0xA4, 0xB4, FIL_, 0x02, 0xE0,
+ 0xA6, 0xBE, FIL_, 0xE0, 0xA7, 0x8B, FIL_, 0xE0,
+ 0xA7, 0x97, FIL_, 0xE0, 0xA7, 0x8C, FIL_, 0x03,
+ 0xE0, 0xAD, 0x97, FIL_, 0xE0, 0xAD, 0x8C, FIL_,
+ 0xE0, 0xAC, 0xBE, FIL_, 0xE0, 0xAD, 0x8B, FIL_,
+ 0xE0, 0xAD, 0x96, FIL_, 0xE0, 0xAD, 0x88, FIL_,
+ 0x01, 0xE0, 0xAF, 0x97, FIL_, 0xE0, 0xAE, 0x94,
+ FIL_, 0x02, 0xE0, 0xAE, 0xBE, FIL_, 0xE0, 0xAF,
+ 0x8A, FIL_, 0xE0, 0xAF, 0x97, FIL_, 0xE0, 0xAF,
+ 0x8C, FIL_, 0x01, 0xE0, 0xAE, 0xBE, FIL_, 0xE0,
+ 0xAF, 0x8B, FIL_, 0x01, 0xE0, 0xB1, 0x96, FIL_,
+ 0xE0, 0xB1, 0x88, FIL_, 0x01, 0xE0, 0xB3, 0x95,
+ FIL_, 0xE0, 0xB3, 0x80, FIL_, 0x03, 0xE0, 0xB3,
+ 0x95, FIL_, 0xE0, 0xB3, 0x87, FIL_, 0xE0, 0xB3,
+ 0x82, FIL_, 0xE0, 0xB3, 0x8A, FIL_, 0xE0, 0xB3,
+ 0x96, FIL_, 0xE0, 0xB3, 0x88, FIL_, 0x01, 0xE0,
+ 0xB3, 0x95, FIL_, 0xE0, 0xB3, 0x8B, FIL_, 0x02,
+ 0xE0, 0xB4, 0xBE, FIL_, 0xE0, 0xB5, 0x8A, FIL_,
+ 0xE0, 0xB5, 0x97, FIL_, 0xE0, 0xB5, 0x8C, FIL_,
+ 0x01, 0xE0, 0xB4, 0xBE, FIL_, 0xE0, 0xB5, 0x8B,
+ FIL_, 0x03, 0xE0, 0xB7, 0x8F, FIL_, 0xE0, 0xB7,
+ 0x9C, FIL_, 0xE0, 0xB7, 0x8A, FIL_, 0xE0, 0xB7,
+ 0x9A, FIL_, 0xE0, 0xB7, 0x9F, FIL_, 0xE0, 0xB7,
+ 0x9E, FIL_, 0x01, 0xE0, 0xB7, 0x8A, FIL_, 0xE0,
+ 0xB7, 0x9D, FIL_, 0x01, 0xE1, 0x80, 0xAE, FIL_,
+ 0xE1, 0x80, 0xA6, FIL_, 0x01, 0xCC, 0x84, FIL_,
+ 0xE1, 0xB8, 0xB8, FIL_, 0x01, 0xCC, 0x84, FIL_,
+ 0xE1, 0xB8, 0xB9, FIL_, 0x01, 0xCC, 0x84, FIL_,
+ 0xE1, 0xB9, 0x9C, FIL_, 0x01, 0xCC, 0x84, FIL_,
+ 0xE1, 0xB9, 0x9D, FIL_, 0x01, 0xCC, 0x87, FIL_,
+ 0xE1, 0xB9, 0xA8, FIL_, 0x01, 0xCC, 0x87, FIL_,
+ 0xE1, 0xB9, 0xA9, FIL_, 0x02, 0xCC, 0x86, FIL_,
+ 0xE1, 0xBA, 0xB6, FIL_, 0xCC, 0x82, FIL_, 0xE1,
+ 0xBA, 0xAC, FIL_, 0x02, 0xCC, 0x86, FIL_, 0xE1,
+ 0xBA, 0xB7, FIL_, 0xCC, 0x82, FIL_, 0xE1, 0xBA,
+ 0xAD, FIL_, 0x01, 0xCC, 0x82, FIL_, 0xE1, 0xBB,
+ 0x86, FIL_, 0x01, 0xCC, 0x82, FIL_, 0xE1, 0xBB,
+ 0x87, FIL_, 0x01, 0xCC, 0x82, FIL_, 0xE1, 0xBB,
+ 0x98, FIL_, 0x01, 0xCC, 0x82, FIL_, 0xE1, 0xBB,
+ 0x99, FIL_, 0x04, 0xCC, 0x80, FIL_, 0xE1, 0xBC,
+ 0x82, FIL_, 0xCC, 0x81, FIL_, 0xE1, 0xBC, 0x84,
+ FIL_, 0xCD, 0x85, FIL_, 0xE1, 0xBE, 0x80, FIL_,
+ 0xCD, 0x82, FIL_, 0xE1, 0xBC, 0x86, FIL_, 0x04,
+ 0xCD, 0x82, FIL_, 0xE1, 0xBC, 0x87, FIL_, 0xCC,
+ 0x80, FIL_, 0xE1, 0xBC, 0x83, FIL_, 0xCC, 0x81,
+ FIL_, 0xE1, 0xBC, 0x85, FIL_, 0xCD, 0x85, FIL_,
+ 0xE1, 0xBE, 0x81, FIL_, 0x01, 0xCD, 0x85, FIL_,
+ 0xE1, 0xBE, 0x82, FIL_, 0x01, 0xCD, 0x85, FIL_,
+ 0xE1, 0xBE, 0x83, FIL_, 0x01, 0xCD, 0x85, FIL_,
+ 0xE1, 0xBE, 0x84, FIL_, 0x01, 0xCD, 0x85, FIL_,
+ 0xE1, 0xBE, 0x85, FIL_, 0x01, 0xCD, 0x85, FIL_,
+ 0xE1, 0xBE, 0x86, FIL_, 0x01, 0xCD, 0x85, FIL_,
+ 0xE1, 0xBE, 0x87, FIL_, 0x04, 0xCD, 0x85, FIL_,
+ 0xE1, 0xBE, 0x88, FIL_, 0xCC, 0x80, FIL_, 0xE1,
+ 0xBC, 0x8A, FIL_, 0xCD, 0x82, FIL_, 0xE1, 0xBC,
+ 0x8E, FIL_, 0xCC, 0x81, FIL_, 0xE1, 0xBC, 0x8C,
+ FIL_, 0x04, 0xCC, 0x81, FIL_, 0xE1, 0xBC, 0x8D,
+ FIL_, 0xCC, 0x80, FIL_, 0xE1, 0xBC, 0x8B, FIL_,
+ 0xCD, 0x82, FIL_, 0xE1, 0xBC, 0x8F, FIL_, 0xCD,
+ 0x85, FIL_, 0xE1, 0xBE, 0x89, FIL_, 0x01, 0xCD,
+ 0x85, FIL_, 0xE1, 0xBE, 0x8A, FIL_, 0x01, 0xCD,
+ 0x85, FIL_, 0xE1, 0xBE, 0x8B, FIL_, 0x01, 0xCD,
+ 0x85, FIL_, 0xE1, 0xBE, 0x8C, FIL_, 0x01, 0xCD,
+ 0x85, FIL_, 0xE1, 0xBE, 0x8D, FIL_, 0x01, 0xCD,
+ 0x85, FIL_, 0xE1, 0xBE, 0x8E, FIL_, 0x01, 0xCD,
+ 0x85, FIL_, 0xE1, 0xBE, 0x8F, FIL_, 0x02, 0xCC,
+ 0x80, FIL_, 0xE1, 0xBC, 0x92, FIL_, 0xCC, 0x81,
+ FIL_, 0xE1, 0xBC, 0x94, FIL_, 0x02, 0xCC, 0x80,
+ FIL_, 0xE1, 0xBC, 0x93, FIL_, 0xCC, 0x81, FIL_,
+ 0xE1, 0xBC, 0x95, FIL_, 0x02, 0xCC, 0x80, FIL_,
+ 0xE1, 0xBC, 0x9A, FIL_, 0xCC, 0x81, FIL_, 0xE1,
+ 0xBC, 0x9C, FIL_, 0x02, 0xCC, 0x80, FIL_, 0xE1,
+ 0xBC, 0x9B, FIL_, 0xCC, 0x81, FIL_, 0xE1, 0xBC,
+ 0x9D, FIL_, 0x04, 0xCD, 0x82, FIL_, 0xE1, 0xBC,
+ 0xA6, FIL_, 0xCD, 0x85, FIL_, 0xE1, 0xBE, 0x90,
+ FIL_, 0xCC, 0x81, FIL_, 0xE1, 0xBC, 0xA4, FIL_,
+ 0xCC, 0x80, FIL_, 0xE1, 0xBC, 0xA2, FIL_, 0x04,
+ 0xCC, 0x80, FIL_, 0xE1, 0xBC, 0xA3, FIL_, 0xCC,
+ 0x81, FIL_, 0xE1, 0xBC, 0xA5, FIL_, 0xCD, 0x82,
+ FIL_, 0xE1, 0xBC, 0xA7, FIL_, 0xCD, 0x85, FIL_,
+ 0xE1, 0xBE, 0x91, FIL_, 0x01, 0xCD, 0x85, FIL_,
+ 0xE1, 0xBE, 0x92, FIL_, 0x01, 0xCD, 0x85, FIL_,
+ 0xE1, 0xBE, 0x93, FIL_, 0x01, 0xCD, 0x85, FIL_,
+ 0xE1, 0xBE, 0x94, FIL_, 0x01, 0xCD, 0x85, FIL_,
+ 0xE1, 0xBE, 0x95, FIL_, 0x01, 0xCD, 0x85, FIL_,
+ 0xE1, 0xBE, 0x96, FIL_, 0x01, 0xCD, 0x85, FIL_,
+ 0xE1, 0xBE, 0x97, FIL_, 0x04, 0xCD, 0x82, FIL_,
+ 0xE1, 0xBC, 0xAE, FIL_, 0xCC, 0x81, FIL_, 0xE1,
+ 0xBC, 0xAC, FIL_, 0xCD, 0x85, FIL_, 0xE1, 0xBE,
+ 0x98, FIL_, 0xCC, 0x80, FIL_, 0xE1, 0xBC, 0xAA,
+ FIL_, 0x04, 0xCD, 0x82, FIL_, 0xE1, 0xBC, 0xAF,
+ FIL_, 0xCD, 0x85, FIL_, 0xE1, 0xBE, 0x99, FIL_,
+ 0xCC, 0x81, FIL_, 0xE1, 0xBC, 0xAD, FIL_, 0xCC,
+ 0x80, FIL_, 0xE1, 0xBC, 0xAB, FIL_, 0x01, 0xCD,
+ 0x85, FIL_, 0xE1, 0xBE, 0x9A, FIL_, 0x01, 0xCD,
+ 0x85, FIL_, 0xE1, 0xBE, 0x9B, FIL_, 0x01, 0xCD,
+ 0x85, FIL_, 0xE1, 0xBE, 0x9C, FIL_, 0x01, 0xCD,
+ 0x85, FIL_, 0xE1, 0xBE, 0x9D, FIL_, 0x01, 0xCD,
+ 0x85, FIL_, 0xE1, 0xBE, 0x9E, FIL_, 0x01, 0xCD,
+ 0x85, FIL_, 0xE1, 0xBE, 0x9F, FIL_, 0x03, 0xCC,
+ 0x81, FIL_, 0xE1, 0xBC, 0xB4, FIL_, 0xCD, 0x82,
+ FIL_, 0xE1, 0xBC, 0xB6, FIL_, 0xCC, 0x80, FIL_,
+ 0xE1, 0xBC, 0xB2, FIL_, 0x03, 0xCC, 0x81, FIL_,
+ 0xE1, 0xBC, 0xB5, FIL_, 0xCD, 0x82, FIL_, 0xE1,
+ 0xBC, 0xB7, FIL_, 0xCC, 0x80, FIL_, 0xE1, 0xBC,
+ 0xB3, FIL_, 0x03, 0xCC, 0x81, FIL_, 0xE1, 0xBC,
+ 0xBC, FIL_, 0xCC, 0x80, FIL_, 0xE1, 0xBC, 0xBA,
+ FIL_, 0xCD, 0x82, FIL_, 0xE1, 0xBC, 0xBE, FIL_,
+ 0x03, 0xCC, 0x80, FIL_, 0xE1, 0xBC, 0xBB, FIL_,
+ 0xCD, 0x82, FIL_, 0xE1, 0xBC, 0xBF, FIL_, 0xCC,
+ 0x81, FIL_, 0xE1, 0xBC, 0xBD, FIL_, 0x02, 0xCC,
+ 0x80, FIL_, 0xE1, 0xBD, 0x82, FIL_, 0xCC, 0x81,
+ FIL_, 0xE1, 0xBD, 0x84, FIL_, 0x02, 0xCC, 0x80,
+ FIL_, 0xE1, 0xBD, 0x83, FIL_, 0xCC, 0x81, FIL_,
+ 0xE1, 0xBD, 0x85, FIL_, 0x02, 0xCC, 0x81, FIL_,
+ 0xE1, 0xBD, 0x8C, FIL_, 0xCC, 0x80, FIL_, 0xE1,
+ 0xBD, 0x8A, FIL_, 0x02, 0xCC, 0x81, FIL_, 0xE1,
+ 0xBD, 0x8D, FIL_, 0xCC, 0x80, FIL_, 0xE1, 0xBD,
+ 0x8B, FIL_, 0x03, 0xCC, 0x81, FIL_, 0xE1, 0xBD,
+ 0x94, FIL_, 0xCD, 0x82, FIL_, 0xE1, 0xBD, 0x96,
+ FIL_, 0xCC, 0x80, FIL_, 0xE1, 0xBD, 0x92, FIL_,
+ 0x03, 0xCD, 0x82, FIL_, 0xE1, 0xBD, 0x97, FIL_,
+ 0xCC, 0x81, FIL_, 0xE1, 0xBD, 0x95, FIL_, 0xCC,
+ 0x80, FIL_, 0xE1, 0xBD, 0x93, FIL_, 0x03, 0xCC,
+ 0x81, FIL_, 0xE1, 0xBD, 0x9D, FIL_, 0xCD, 0x82,
+ FIL_, 0xE1, 0xBD, 0x9F, FIL_, 0xCC, 0x80, FIL_,
+ 0xE1, 0xBD, 0x9B, FIL_, 0x04, 0xCC, 0x81, FIL_,
+ 0xE1, 0xBD, 0xA4, FIL_, 0xCC, 0x80, FIL_, 0xE1,
+ 0xBD, 0xA2, FIL_, 0xCD, 0x82, FIL_, 0xE1, 0xBD,
+ 0xA6, FIL_, 0xCD, 0x85, FIL_, 0xE1, 0xBE, 0xA0,
+ FIL_, 0x04, 0xCD, 0x82, FIL_, 0xE1, 0xBD, 0xA7,
+ FIL_, 0xCC, 0x81, FIL_, 0xE1, 0xBD, 0xA5, FIL_,
+ 0xCD, 0x85, FIL_, 0xE1, 0xBE, 0xA1, FIL_, 0xCC,
+ 0x80, FIL_, 0xE1, 0xBD, 0xA3, FIL_, 0x01, 0xCD,
+ 0x85, FIL_, 0xE1, 0xBE, 0xA2, FIL_, 0x01, 0xCD,
+ 0x85, FIL_, 0xE1, 0xBE, 0xA3, FIL_, 0x01, 0xCD,
+ 0x85, FIL_, 0xE1, 0xBE, 0xA4, FIL_, 0x01, 0xCD,
+ 0x85, FIL_, 0xE1, 0xBE, 0xA5, FIL_, 0x01, 0xCD,
+ 0x85, FIL_, 0xE1, 0xBE, 0xA6, FIL_, 0x01, 0xCD,
+ 0x85, FIL_, 0xE1, 0xBE, 0xA7, FIL_, 0x04, 0xCC,
+ 0x81, FIL_, 0xE1, 0xBD, 0xAC, FIL_, 0xCC, 0x80,
+ FIL_, 0xE1, 0xBD, 0xAA, FIL_, 0xCD, 0x82, FIL_,
+ 0xE1, 0xBD, 0xAE, FIL_, 0xCD, 0x85, FIL_, 0xE1,
+ 0xBE, 0xA8, FIL_, 0x04, 0xCC, 0x81, FIL_, 0xE1,
+ 0xBD, 0xAD, FIL_, 0xCD, 0x85, FIL_, 0xE1, 0xBE,
+ 0xA9, FIL_, 0xCD, 0x82, FIL_, 0xE1, 0xBD, 0xAF,
+ FIL_, 0xCC, 0x80, FIL_, 0xE1, 0xBD, 0xAB, FIL_,
+ 0x01, 0xCD, 0x85, FIL_, 0xE1, 0xBE, 0xAA, FIL_,
+ 0x01, 0xCD, 0x85, FIL_, 0xE1, 0xBE, 0xAB, FIL_,
+ 0x01, 0xCD, 0x85, FIL_, 0xE1, 0xBE, 0xAC, FIL_,
+ 0x01, 0xCD, 0x85, FIL_, 0xE1, 0xBE, 0xAD, FIL_,
+ 0x01, 0xCD, 0x85, FIL_, 0xE1, 0xBE, 0xAE, FIL_,
+ 0x01, 0xCD, 0x85, FIL_, 0xE1, 0xBE, 0xAF, FIL_,
+ 0x01, 0xCD, 0x85, FIL_, 0xE1, 0xBE, 0xB2, FIL_,
+ 0x01, 0xCD, 0x85, FIL_, 0xE1, 0xBF, 0x82, FIL_,
+ 0x01, 0xCD, 0x85, FIL_, 0xE1, 0xBF, 0xB2, FIL_,
+ 0x01, 0xCD, 0x85, FIL_, 0xE1, 0xBE, 0xB7, FIL_,
+ 0x03, 0xCD, 0x82, FIL_, 0xE1, 0xBF, 0x8F, FIL_,
+ 0xCC, 0x80, FIL_, 0xE1, 0xBF, 0x8D, FIL_, 0xCC,
+ 0x81, FIL_, 0xE1, 0xBF, 0x8E, FIL_, 0x01, 0xCD,
+ 0x85, FIL_, 0xE1, 0xBF, 0x87, FIL_, 0x01, 0xCD,
+ 0x85, FIL_, 0xE1, 0xBF, 0xB7, FIL_, 0x03, 0xCC,
+ 0x80, FIL_, 0xE1, 0xBF, 0x9D, FIL_, 0xCD, 0x82,
+ FIL_, 0xE1, 0xBF, 0x9F, FIL_, 0xCC, 0x81, FIL_,
+ 0xE1, 0xBF, 0x9E, FIL_, 0x01, 0xCC, 0xB8, FIL_,
+ 0xE2, 0x86, 0x9A, FIL_, 0x01, 0xCC, 0xB8, FIL_,
+ 0xE2, 0x86, 0x9B, FIL_, 0x01, 0xCC, 0xB8, FIL_,
+ 0xE2, 0x86, 0xAE, FIL_, 0x01, 0xCC, 0xB8, FIL_,
+ 0xE2, 0x87, 0x8D, FIL_, 0x01, 0xCC, 0xB8, FIL_,
+ 0xE2, 0x87, 0x8F, FIL_, 0x01, 0xCC, 0xB8, FIL_,
+ 0xE2, 0x87, 0x8E, FIL_, 0x01, 0xCC, 0xB8, FIL_,
+ 0xE2, 0x88, 0x84, FIL_, 0x01, 0xCC, 0xB8, FIL_,
+ 0xE2, 0x88, 0x89, FIL_, 0x01, 0xCC, 0xB8, FIL_,
+ 0xE2, 0x88, 0x8C, FIL_, 0x01, 0xCC, 0xB8, FIL_,
+ 0xE2, 0x88, 0xA4, FIL_, 0x01, 0xCC, 0xB8, FIL_,
+ 0xE2, 0x88, 0xA6, FIL_, 0x01, 0xCC, 0xB8, FIL_,
+ 0xE2, 0x89, 0x81, FIL_, 0x01, 0xCC, 0xB8, FIL_,
+ 0xE2, 0x89, 0x84, FIL_, 0x01, 0xCC, 0xB8, FIL_,
+ 0xE2, 0x89, 0x87, FIL_, 0x01, 0xCC, 0xB8, FIL_,
+ 0xE2, 0x89, 0x89, FIL_, 0x01, 0xCC, 0xB8, FIL_,
+ 0xE2, 0x89, 0xAD, FIL_, 0x01, 0xCC, 0xB8, FIL_,
+ 0xE2, 0x89, 0xA2, FIL_, 0x01, 0xCC, 0xB8, FIL_,
+ 0xE2, 0x89, 0xB0, FIL_, 0x01, 0xCC, 0xB8, FIL_,
+ 0xE2, 0x89, 0xB1, FIL_, 0x01, 0xCC, 0xB8, FIL_,
+ 0xE2, 0x89, 0xB4, FIL_, 0x01, 0xCC, 0xB8, FIL_,
+ 0xE2, 0x89, 0xB5, FIL_, 0x01, 0xCC, 0xB8, FIL_,
+ 0xE2, 0x89, 0xB8, FIL_, 0x01, 0xCC, 0xB8, FIL_,
+ 0xE2, 0x89, 0xB9, FIL_, 0x01, 0xCC, 0xB8, FIL_,
+ 0xE2, 0x8A, 0x80, FIL_, 0x01, 0xCC, 0xB8, FIL_,
+ 0xE2, 0x8A, 0x81, FIL_, 0x01, 0xCC, 0xB8, FIL_,
+ 0xE2, 0x8B, 0xA0, FIL_, 0x01, 0xCC, 0xB8, FIL_,
+ 0xE2, 0x8B, 0xA1, FIL_, 0x01, 0xCC, 0xB8, FIL_,
+ 0xE2, 0x8A, 0x84, FIL_, 0x01, 0xCC, 0xB8, FIL_,
+ 0xE2, 0x8A, 0x85, FIL_, 0x01, 0xCC, 0xB8, FIL_,
+ 0xE2, 0x8A, 0x88, FIL_, 0x01, 0xCC, 0xB8, FIL_,
+ 0xE2, 0x8A, 0x89, FIL_, 0x01, 0xCC, 0xB8, FIL_,
+ 0xE2, 0x8B, 0xA2, FIL_, 0x01, 0xCC, 0xB8, FIL_,
+ 0xE2, 0x8B, 0xA3, FIL_, 0x01, 0xCC, 0xB8, FIL_,
+ 0xE2, 0x8A, 0xAC, FIL_, 0x01, 0xCC, 0xB8, FIL_,
+ 0xE2, 0x8A, 0xAD, FIL_, 0x01, 0xCC, 0xB8, FIL_,
+ 0xE2, 0x8A, 0xAE, FIL_, 0x01, 0xCC, 0xB8, FIL_,
+ 0xE2, 0x8A, 0xAF, FIL_, 0x01, 0xCC, 0xB8, FIL_,
+ 0xE2, 0x8B, 0xAA, FIL_, 0x01, 0xCC, 0xB8, FIL_,
+ 0xE2, 0x8B, 0xAB, FIL_, 0x01, 0xCC, 0xB8, FIL_,
+ 0xE2, 0x8B, 0xAC, FIL_, 0x01, 0xCC, 0xB8, FIL_,
+ 0xE2, 0x8B, 0xAD, FIL_, 0x01, 0xE3, 0x82, 0x99,
+ FIL_, 0xE3, 0x82, 0x94, FIL_, 0x01, 0xE3, 0x82,
+ 0x99, FIL_, 0xE3, 0x81, 0x8C, FIL_, 0x01, 0xE3,
+ 0x82, 0x99, FIL_, 0xE3, 0x81, 0x8E, FIL_, 0x01,
+ 0xE3, 0x82, 0x99, FIL_, 0xE3, 0x81, 0x90, FIL_,
+ 0x01, 0xE3, 0x82, 0x99, FIL_, 0xE3, 0x81, 0x92,
+ FIL_, 0x01, 0xE3, 0x82, 0x99, FIL_, 0xE3, 0x81,
+ 0x94, FIL_, 0x01, 0xE3, 0x82, 0x99, FIL_, 0xE3,
+ 0x81, 0x96, FIL_, 0x01, 0xE3, 0x82, 0x99, FIL_,
+ 0xE3, 0x81, 0x98, FIL_, 0x01, 0xE3, 0x82, 0x99,
+ FIL_, 0xE3, 0x81, 0x9A, FIL_, 0x01, 0xE3, 0x82,
+ 0x99, FIL_, 0xE3, 0x81, 0x9C, FIL_, 0x01, 0xE3,
+ 0x82, 0x99, FIL_, 0xE3, 0x81, 0x9E, FIL_, 0x01,
+ 0xE3, 0x82, 0x99, FIL_, 0xE3, 0x81, 0xA0, FIL_,
+ 0x01, 0xE3, 0x82, 0x99, FIL_, 0xE3, 0x81, 0xA2,
+ FIL_, 0x01, 0xE3, 0x82, 0x99, FIL_, 0xE3, 0x81,
+ 0xA5, FIL_, 0x01, 0xE3, 0x82, 0x99, FIL_, 0xE3,
+ 0x81, 0xA7, FIL_, 0x01, 0xE3, 0x82, 0x99, FIL_,
+ 0xE3, 0x81, 0xA9, FIL_, 0x02, 0xE3, 0x82, 0x9A,
+ FIL_, 0xE3, 0x81, 0xB1, FIL_, 0xE3, 0x82, 0x99,
+ FIL_, 0xE3, 0x81, 0xB0, FIL_, 0x02, 0xE3, 0x82,
+ 0x9A, FIL_, 0xE3, 0x81, 0xB4, FIL_, 0xE3, 0x82,
+ 0x99, FIL_, 0xE3, 0x81, 0xB3, FIL_, 0x02, 0xE3,
+ 0x82, 0x9A, FIL_, 0xE3, 0x81, 0xB7, FIL_, 0xE3,
+ 0x82, 0x99, FIL_, 0xE3, 0x81, 0xB6, FIL_, 0x02,
+ 0xE3, 0x82, 0x99, FIL_, 0xE3, 0x81, 0xB9, FIL_,
+ 0xE3, 0x82, 0x9A, FIL_, 0xE3, 0x81, 0xBA, FIL_,
+ 0x02, 0xE3, 0x82, 0x99, FIL_, 0xE3, 0x81, 0xBC,
+ FIL_, 0xE3, 0x82, 0x9A, FIL_, 0xE3, 0x81, 0xBD,
+ FIL_, 0x01, 0xE3, 0x82, 0x99, FIL_, 0xE3, 0x82,
+ 0x9E, FIL_, 0x01, 0xE3, 0x82, 0x99, FIL_, 0xE3,
+ 0x83, 0xB4, FIL_, 0x01, 0xE3, 0x82, 0x99, FIL_,
+ 0xE3, 0x82, 0xAC, FIL_, 0x01, 0xE3, 0x82, 0x99,
+ FIL_, 0xE3, 0x82, 0xAE, FIL_, 0x01, 0xE3, 0x82,
+ 0x99, FIL_, 0xE3, 0x82, 0xB0, FIL_, 0x01, 0xE3,
+ 0x82, 0x99, FIL_, 0xE3, 0x82, 0xB2, FIL_, 0x01,
+ 0xE3, 0x82, 0x99, FIL_, 0xE3, 0x82, 0xB4, FIL_,
+ 0x01, 0xE3, 0x82, 0x99, FIL_, 0xE3, 0x82, 0xB6,
+ FIL_, 0x01, 0xE3, 0x82, 0x99, FIL_, 0xE3, 0x82,
+ 0xB8, FIL_, 0x01, 0xE3, 0x82, 0x99, FIL_, 0xE3,
+ 0x82, 0xBA, FIL_, 0x01, 0xE3, 0x82, 0x99, FIL_,
+ 0xE3, 0x82, 0xBC, FIL_, 0x01, 0xE3, 0x82, 0x99,
+ FIL_, 0xE3, 0x82, 0xBE, FIL_, 0x01, 0xE3, 0x82,
+ 0x99, FIL_, 0xE3, 0x83, 0x80, FIL_, 0x01, 0xE3,
+ 0x82, 0x99, FIL_, 0xE3, 0x83, 0x82, FIL_, 0x01,
+ 0xE3, 0x82, 0x99, FIL_, 0xE3, 0x83, 0x85, FIL_,
+ 0x01, 0xE3, 0x82, 0x99, FIL_, 0xE3, 0x83, 0x87,
+ FIL_, 0x01, 0xE3, 0x82, 0x99, FIL_, 0xE3, 0x83,
+ 0x89, FIL_, 0x02, 0xE3, 0x82, 0x99, FIL_, 0xE3,
+ 0x83, 0x90, FIL_, 0xE3, 0x82, 0x9A, FIL_, 0xE3,
+ 0x83, 0x91, FIL_, 0x02, 0xE3, 0x82, 0x99, FIL_,
+ 0xE3, 0x83, 0x93, FIL_, 0xE3, 0x82, 0x9A, FIL_,
+ 0xE3, 0x83, 0x94, FIL_, 0x02, 0xE3, 0x82, 0x99,
+ FIL_, 0xE3, 0x83, 0x96, FIL_, 0xE3, 0x82, 0x9A,
+ FIL_, 0xE3, 0x83, 0x97, FIL_, 0x02, 0xE3, 0x82,
+ 0x9A, FIL_, 0xE3, 0x83, 0x9A, FIL_, 0xE3, 0x82,
+ 0x99, FIL_, 0xE3, 0x83, 0x99, FIL_, 0x02, 0xE3,
+ 0x82, 0x9A, FIL_, 0xE3, 0x83, 0x9D, FIL_, 0xE3,
+ 0x82, 0x99, FIL_, 0xE3, 0x83, 0x9C, FIL_, 0x01,
+ 0xE3, 0x82, 0x99, FIL_, 0xE3, 0x83, 0xB7, FIL_,
+ 0x01, 0xE3, 0x82, 0x99, FIL_, 0xE3, 0x83, 0xB8,
+ FIL_, 0x01, 0xE3, 0x82, 0x99, FIL_, 0xE3, 0x83,
+ 0xB9, FIL_, 0x01, 0xE3, 0x82, 0x99, FIL_, 0xE3,
+ 0x83, 0xBA, FIL_, 0x01, 0xE3, 0x82, 0x99, FIL_,
+ 0xE3, 0x83, 0xBE, FIL_, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0,
+ },
+ {
+ 0x01, 0xCC, 0xB8, FIL_, 0xE2, 0x89, 0xAE, FIL_,
+ 0x01, 0xCC, 0xB8, FIL_, 0xE2, 0x89, 0xA0, FIL_,
+ 0x01, 0xCC, 0xB8, FIL_, 0xE2, 0x89, 0xAF, FIL_,
+ 0x10, 0xCC, 0xA5, FIL_, 0xE1, 0xB8, 0x80, FIL_,
+ 0xCC, 0x87, FIL_, 0xC8, 0xA6, FIL_, 0xCC, 0x83,
+ FIL_, 0xC3, 0x83, FIL_, 0xCC, 0x91, FIL_, 0xC8,
+ 0x82, FIL_, 0xCC, 0x8F, FIL_, 0xC8, 0x80, FIL_,
+ 0xCC, 0x8A, FIL_, 0xC3, 0x85, FIL_, 0xCC, 0x88,
+ FIL_, 0xC3, 0x84, FIL_, 0xCC, 0x89, FIL_, 0xE1,
+ 0xBA, 0xA2, FIL_, 0xCC, 0xA3, FIL_, 0xE1, 0xBA,
+ 0xA0, FIL_, 0xCC, 0x8C, FIL_, 0xC7, 0x8D, FIL_,
+ 0xCC, 0x80, FIL_, 0xC3, 0x80, FIL_, 0xCC, 0x81,
+ FIL_, 0xC3, 0x81, FIL_, 0xCC, 0x82, FIL_, 0xC3,
+ 0x82, FIL_, 0xCC, 0xA8, FIL_, 0xC4, 0x84, FIL_,
+ 0xCC, 0x86, FIL_, 0xC4, 0x82, FIL_, 0xCC, 0x84,
+ FIL_, 0xC4, 0x80, FIL_, 0x03, 0xCC, 0xB1, FIL_,
+ 0xE1, 0xB8, 0x86, FIL_, 0xCC, 0x87, FIL_, 0xE1,
+ 0xB8, 0x82, FIL_, 0xCC, 0xA3, FIL_, 0xE1, 0xB8,
+ 0x84, FIL_, 0x05, 0xCC, 0xA7, FIL_, 0xC3, 0x87,
+ FIL_, 0xCC, 0x8C, FIL_, 0xC4, 0x8C, FIL_, 0xCC,
+ 0x81, FIL_, 0xC4, 0x86, FIL_, 0xCC, 0x82, FIL_,
+ 0xC4, 0x88, FIL_, 0xCC, 0x87, FIL_, 0xC4, 0x8A,
+ FIL_, 0x06, 0xCC, 0xA7, FIL_, 0xE1, 0xB8, 0x90,
+ FIL_, 0xCC, 0x8C, FIL_, 0xC4, 0x8E, FIL_, 0xCC,
+ 0xB1, FIL_, 0xE1, 0xB8, 0x8E, FIL_, 0xCC, 0xAD,
+ FIL_, 0xE1, 0xB8, 0x92, FIL_, 0xCC, 0xA3, FIL_,
+ 0xE1, 0xB8, 0x8C, FIL_, 0xCC, 0x87, FIL_, 0xE1,
+ 0xB8, 0x8A, FIL_, 0x11, 0xCC, 0x84, FIL_, 0xC4,
+ 0x92, FIL_, 0xCC, 0x86, FIL_, 0xC4, 0x94, FIL_,
+ 0xCC, 0xA3, FIL_, 0xE1, 0xBA, 0xB8, FIL_, 0xCC,
+ 0x91, FIL_, 0xC8, 0x86, FIL_, 0xCC, 0x82, FIL_,
+ 0xC3, 0x8A, FIL_, 0xCC, 0x8F, FIL_, 0xC8, 0x84,
+ FIL_, 0xCC, 0xAD, FIL_, 0xE1, 0xB8, 0x98, FIL_,
+ 0xCC, 0x89, FIL_, 0xE1, 0xBA, 0xBA, FIL_, 0xCC,
+ 0xA7, FIL_, 0xC8, 0xA8, FIL_, 0xCC, 0x8C, FIL_,
+ 0xC4, 0x9A, FIL_, 0xCC, 0x80, FIL_, 0xC3, 0x88,
+ FIL_, 0xCC, 0xA8, FIL_, 0xC4, 0x98, FIL_, 0xCC,
+ 0x83, FIL_, 0xE1, 0xBA, 0xBC, FIL_, 0xCC, 0x87,
+ FIL_, 0xC4, 0x96, FIL_, 0xCC, 0x81, FIL_, 0xC3,
+ 0x89, FIL_, 0xCC, 0x88, FIL_, 0xC3, 0x8B, FIL_,
+ 0xCC, 0xB0, FIL_, 0xE1, 0xB8, 0x9A, FIL_, 0x01,
+ 0xCC, 0x87, FIL_, 0xE1, 0xB8, 0x9E, FIL_, 0x07,
+ 0xCC, 0x8C, FIL_, 0xC7, 0xA6, FIL_, 0xCC, 0x86,
+ FIL_, 0xC4, 0x9E, FIL_, 0xCC, 0x82, FIL_, 0xC4,
+ 0x9C, FIL_, 0xCC, 0xA7, FIL_, 0xC4, 0xA2, FIL_,
+ 0xCC, 0x84, FIL_, 0xE1, 0xB8, 0xA0, FIL_, 0xCC,
+ 0x81, FIL_, 0xC7, 0xB4, FIL_, 0xCC, 0x87, FIL_,
+ 0xC4, 0xA0, FIL_, 0x07, 0xCC, 0x87, FIL_, 0xE1,
+ 0xB8, 0xA2, FIL_, 0xCC, 0xA7, FIL_, 0xE1, 0xB8,
+ 0xA8, FIL_, 0xCC, 0x82, FIL_, 0xC4, 0xA4, FIL_,
+ 0xCC, 0x88, FIL_, 0xE1, 0xB8, 0xA6, FIL_, 0xCC,
+ 0x8C, FIL_, 0xC8, 0x9E, FIL_, 0xCC, 0xAE, FIL_,
+ 0xE1, 0xB8, 0xAA, FIL_, 0xCC, 0xA3, FIL_, 0xE1,
+ 0xB8, 0xA4, FIL_, 0x0F, 0xCC, 0xB0, FIL_, 0xE1,
+ 0xB8, 0xAC, FIL_, 0xCC, 0x8C, FIL_, 0xC7, 0x8F,
+ FIL_, 0xCC, 0x80, FIL_, 0xC3, 0x8C, FIL_, 0xCC,
+ 0x89, FIL_, 0xE1, 0xBB, 0x88, FIL_, 0xCC, 0xA3,
+ FIL_, 0xE1, 0xBB, 0x8A, FIL_, 0xCC, 0x91, FIL_,
+ 0xC8, 0x8A, FIL_, 0xCC, 0x88, FIL_, 0xC3, 0x8F,
+ FIL_, 0xCC, 0x82, FIL_, 0xC3, 0x8E, FIL_, 0xCC,
+ 0x81, FIL_, 0xC3, 0x8D, FIL_, 0xCC, 0x83, FIL_,
+ 0xC4, 0xA8, FIL_, 0xCC, 0x87, FIL_, 0xC4, 0xB0,
+ FIL_, 0xCC, 0x8F, FIL_, 0xC8, 0x88, FIL_, 0xCC,
+ 0xA8, FIL_, 0xC4, 0xAE, FIL_, 0xCC, 0x86, FIL_,
+ 0xC4, 0xAC, FIL_, 0xCC, 0x84, FIL_, 0xC4, 0xAA,
+ FIL_, 0x01, 0xCC, 0x82, FIL_, 0xC4, 0xB4, FIL_,
+ 0x05, 0xCC, 0x81, FIL_, 0xE1, 0xB8, 0xB0, FIL_,
+ 0xCC, 0x8C, FIL_, 0xC7, 0xA8, FIL_, 0xCC, 0xB1,
+ FIL_, 0xE1, 0xB8, 0xB4, FIL_, 0xCC, 0xA7, FIL_,
+ 0xC4, 0xB6, FIL_, 0xCC, 0xA3, FIL_, 0xE1, 0xB8,
+ 0xB2, FIL_, 0x06, 0xCC, 0xA3, FIL_, 0xE1, 0xB8,
+ 0xB6, FIL_, 0xCC, 0x8C, FIL_, 0xC4, 0xBD, FIL_,
+ 0xCC, 0xAD, FIL_, 0xE1, 0xB8, 0xBC, FIL_, 0xCC,
+ 0xB1, FIL_, 0xE1, 0xB8, 0xBA, FIL_, 0xCC, 0xA7,
+ FIL_, 0xC4, 0xBB, FIL_, 0xCC, 0x81, FIL_, 0xC4,
+ 0xB9, FIL_, 0x03, 0xCC, 0x81, FIL_, 0xE1, 0xB8,
+ 0xBE, FIL_, 0xCC, 0x87, FIL_, 0xE1, 0xB9, 0x80,
+ FIL_, 0xCC, 0xA3, FIL_, 0xE1, 0xB9, 0x82, FIL_,
+ 0x09, 0xCC, 0x83, FIL_, 0xC3, 0x91, FIL_, 0xCC,
+ 0x81, FIL_, 0xC5, 0x83, FIL_, 0xCC, 0xA7, FIL_,
+ 0xC5, 0x85, FIL_, 0xCC, 0x8C, FIL_, 0xC5, 0x87,
+ FIL_, 0xCC, 0x87, FIL_, 0xE1, 0xB9, 0x84, FIL_,
+ 0xCC, 0xA3, FIL_, 0xE1, 0xB9, 0x86, FIL_, 0xCC,
+ 0xB1, FIL_, 0xE1, 0xB9, 0x88, FIL_, 0xCC, 0xAD,
+ FIL_, 0xE1, 0xB9, 0x8A, FIL_, 0xCC, 0x80, FIL_,
+ 0xC7, 0xB8, FIL_, 0x10, 0xCC, 0x89, FIL_, 0xE1,
+ 0xBB, 0x8E, FIL_, 0xCC, 0x84, FIL_, 0xC5, 0x8C,
+ FIL_, 0xCC, 0x82, FIL_, 0xC3, 0x94, FIL_, 0xCC,
+ 0x86, FIL_, 0xC5, 0x8E, FIL_, 0xCC, 0x83, FIL_,
+ 0xC3, 0x95, FIL_, 0xCC, 0x8B, FIL_, 0xC5, 0x90,
+ FIL_, 0xCC, 0x88, FIL_, 0xC3, 0x96, FIL_, 0xCC,
+ 0x9B, FIL_, 0xC6, 0xA0, FIL_, 0xCC, 0x91, FIL_,
+ 0xC8, 0x8E, FIL_, 0xCC, 0x8C, FIL_, 0xC7, 0x91,
+ FIL_, 0xCC, 0x8F, FIL_, 0xC8, 0x8C, FIL_, 0xCC,
+ 0xA3, FIL_, 0xE1, 0xBB, 0x8C, FIL_, 0xCC, 0x80,
+ FIL_, 0xC3, 0x92, FIL_, 0xCC, 0xA8, FIL_, 0xC7,
+ 0xAA, FIL_, 0xCC, 0x87, FIL_, 0xC8, 0xAE, FIL_,
+ 0xCC, 0x81, FIL_, 0xC3, 0x93, FIL_, 0x02, 0xCC,
+ 0x87, FIL_, 0xE1, 0xB9, 0x96, FIL_, 0xCC, 0x81,
+ FIL_, 0xE1, 0xB9, 0x94, FIL_, 0x08, 0xCC, 0xA7,
+ FIL_, 0xC5, 0x96, FIL_, 0xCC, 0x8C, FIL_, 0xC5,
+ 0x98, FIL_, 0xCC, 0x91, FIL_, 0xC8, 0x92, FIL_,
+ 0xCC, 0x8F, FIL_, 0xC8, 0x90, FIL_, 0xCC, 0x81,
+ FIL_, 0xC5, 0x94, FIL_, 0xCC, 0x87, FIL_, 0xE1,
+ 0xB9, 0x98, FIL_, 0xCC, 0xB1, FIL_, 0xE1, 0xB9,
+ 0x9E, FIL_, 0xCC, 0xA3, FIL_, 0xE1, 0xB9, 0x9A,
+ FIL_, 0x07, 0xCC, 0xA6, FIL_, 0xC8, 0x98, FIL_,
+ 0xCC, 0x81, FIL_, 0xC5, 0x9A, FIL_, 0xCC, 0x82,
+ FIL_, 0xC5, 0x9C, FIL_, 0xCC, 0xA7, FIL_, 0xC5,
+ 0x9E, FIL_, 0xCC, 0x8C, FIL_, 0xC5, 0xA0, FIL_,
+ 0xCC, 0x87, FIL_, 0xE1, 0xB9, 0xA0, FIL_, 0xCC,
+ 0xA3, FIL_, 0xE1, 0xB9, 0xA2, FIL_, 0x07, 0xCC,
+ 0xA6, FIL_, 0xC8, 0x9A, FIL_, 0xCC, 0x87, FIL_,
+ 0xE1, 0xB9, 0xAA, FIL_, 0xCC, 0xA3, FIL_, 0xE1,
+ 0xB9, 0xAC, FIL_, 0xCC, 0xB1, FIL_, 0xE1, 0xB9,
+ 0xAE, FIL_, 0xCC, 0xAD, FIL_, 0xE1, 0xB9, 0xB0,
+ FIL_, 0xCC, 0xA7, FIL_, 0xC5, 0xA2, FIL_, 0xCC,
+ 0x8C, FIL_, 0xC5, 0xA4, FIL_, 0x13, 0xCC, 0x8A,
+ FIL_, 0xC5, 0xAE, FIL_, 0xCC, 0x88, FIL_, 0xC3,
+ 0x9C, FIL_, 0xCC, 0x8B, FIL_, 0xC5, 0xB0, FIL_,
+ 0xCC, 0xAD, FIL_, 0xE1, 0xB9, 0xB6, FIL_, 0xCC,
+ 0xA8, FIL_, 0xC5, 0xB2, FIL_, 0xCC, 0x8C, FIL_,
+ 0xC7, 0x93, FIL_, 0xCC, 0x80, FIL_, 0xC3, 0x99,
+ FIL_, 0xCC, 0x8F, FIL_, 0xC8, 0x94, FIL_, 0xCC,
+ 0xA3, FIL_, 0xE1, 0xBB, 0xA4, FIL_, 0xCC, 0xA4,
+ FIL_, 0xE1, 0xB9, 0xB2, FIL_, 0xCC, 0x81, FIL_,
+ 0xC3, 0x9A, FIL_, 0xCC, 0x82, FIL_, 0xC3, 0x9B,
+ FIL_, 0xCC, 0xB0, FIL_, 0xE1, 0xB9, 0xB4, FIL_,
+ 0xCC, 0x83, FIL_, 0xC5, 0xA8, FIL_, 0xCC, 0x89,
+ FIL_, 0xE1, 0xBB, 0xA6, FIL_, 0xCC, 0x84, FIL_,
+ 0xC5, 0xAA, FIL_, 0xCC, 0x91, FIL_, 0xC8, 0x96,
+ FIL_, 0xCC, 0x86, FIL_, 0xC5, 0xAC, FIL_, 0xCC,
+ 0x9B, FIL_, 0xC6, 0xAF, FIL_, 0x02, 0xCC, 0xA3,
+ FIL_, 0xE1, 0xB9, 0xBE, FIL_, 0xCC, 0x83, FIL_,
+ 0xE1, 0xB9, 0xBC, FIL_, 0x06, 0xCC, 0x88, FIL_,
+ 0xE1, 0xBA, 0x84, FIL_, 0xCC, 0x81, FIL_, 0xE1,
+ 0xBA, 0x82, FIL_, 0xCC, 0x80, FIL_, 0xE1, 0xBA,
+ 0x80, FIL_, 0xCC, 0xA3, FIL_, 0xE1, 0xBA, 0x88,
+ FIL_, 0xCC, 0x82, FIL_, 0xC5, 0xB4, FIL_, 0xCC,
+ 0x87, FIL_, 0xE1, 0xBA, 0x86, FIL_, 0x02, 0xCC,
+ 0x88, FIL_, 0xE1, 0xBA, 0x8C, FIL_, 0xCC, 0x87,
+ FIL_, 0xE1, 0xBA, 0x8A, FIL_, 0x09, 0xCC, 0x89,
+ FIL_, 0xE1, 0xBB, 0xB6, FIL_, 0xCC, 0xA3, FIL_,
+ 0xE1, 0xBB, 0xB4, FIL_, 0xCC, 0x80, FIL_, 0xE1,
+ 0xBB, 0xB2, FIL_, 0xCC, 0x88, FIL_, 0xC5, 0xB8,
+ FIL_, 0xCC, 0x81, FIL_, 0xC3, 0x9D, FIL_, 0xCC,
+ 0x83, FIL_, 0xE1, 0xBB, 0xB8, FIL_, 0xCC, 0x87,
+ FIL_, 0xE1, 0xBA, 0x8E, FIL_, 0xCC, 0x84, FIL_,
+ 0xC8, 0xB2, FIL_, 0xCC, 0x82, FIL_, 0xC5, 0xB6,
+ FIL_, 0x06, 0xCC, 0x82, FIL_, 0xE1, 0xBA, 0x90,
+ FIL_, 0xCC, 0xA3, FIL_, 0xE1, 0xBA, 0x92, FIL_,
+ 0xCC, 0xB1, FIL_, 0xE1, 0xBA, 0x94, FIL_, 0xCC,
+ 0x8C, FIL_, 0xC5, 0xBD, FIL_, 0xCC, 0x87, FIL_,
+ 0xC5, 0xBB, FIL_, 0xCC, 0x81, FIL_, 0xC5, 0xB9,
+ FIL_, 0x10, 0xCC, 0xA3, FIL_, 0xE1, 0xBA, 0xA1,
+ FIL_, 0xCC, 0xA8, FIL_, 0xC4, 0x85, FIL_, 0xCC,
+ 0x81, FIL_, 0xC3, 0xA1, FIL_, 0xCC, 0x82, FIL_,
+ 0xC3, 0xA2, FIL_, 0xCC, 0x89, FIL_, 0xE1, 0xBA,
+ 0xA3, FIL_, 0xCC, 0x83, FIL_, 0xC3, 0xA3, FIL_,
+ 0xCC, 0x8C, FIL_, 0xC7, 0x8E, FIL_, 0xCC, 0x8A,
+ FIL_, 0xC3, 0xA5, FIL_, 0xCC, 0x88, FIL_, 0xC3,
+ 0xA4, FIL_, 0xCC, 0x87, FIL_, 0xC8, 0xA7, FIL_,
+ 0xCC, 0x91, FIL_, 0xC8, 0x83, FIL_, 0xCC, 0xA5,
+ FIL_, 0xE1, 0xB8, 0x81, FIL_, 0xCC, 0x84, FIL_,
+ 0xC4, 0x81, FIL_, 0xCC, 0x8F, FIL_, 0xC8, 0x81,
+ FIL_, 0xCC, 0x86, FIL_, 0xC4, 0x83, FIL_, 0xCC,
+ 0x80, FIL_, 0xC3, 0xA0, FIL_, 0x03, 0xCC, 0xA3,
+ FIL_, 0xE1, 0xB8, 0x85, FIL_, 0xCC, 0x87, FIL_,
+ 0xE1, 0xB8, 0x83, FIL_, 0xCC, 0xB1, FIL_, 0xE1,
+ 0xB8, 0x87, FIL_, 0x05, 0xCC, 0x87, FIL_, 0xC4,
+ 0x8B, FIL_, 0xCC, 0x8C, FIL_, 0xC4, 0x8D, FIL_,
+ 0xCC, 0x82, FIL_, 0xC4, 0x89, FIL_, 0xCC, 0x81,
+ FIL_, 0xC4, 0x87, FIL_, 0xCC, 0xA7, FIL_, 0xC3,
+ 0xA7, FIL_, 0x06, 0xCC, 0x87, FIL_, 0xE1, 0xB8,
+ 0x8B, FIL_, 0xCC, 0xA7, FIL_, 0xE1, 0xB8, 0x91,
+ FIL_, 0xCC, 0xB1, FIL_, 0xE1, 0xB8, 0x8F, FIL_,
+ 0xCC, 0xA3, FIL_, 0xE1, 0xB8, 0x8D, FIL_, 0xCC,
+ 0x8C, FIL_, 0xC4, 0x8F, FIL_, 0xCC, 0xAD, FIL_,
+ 0xE1, 0xB8, 0x93, FIL_, 0x11, 0xCC, 0x80, FIL_,
+ 0xC3, 0xA8, FIL_, 0xCC, 0x81, FIL_, 0xC3, 0xA9,
+ FIL_, 0xCC, 0x82, FIL_, 0xC3, 0xAA, FIL_, 0xCC,
+ 0x88, FIL_, 0xC3, 0xAB, FIL_, 0xCC, 0x84, FIL_,
+ 0xC4, 0x93, FIL_, 0xCC, 0x86, FIL_, 0xC4, 0x95,
+ FIL_, 0xCC, 0x87, FIL_, 0xC4, 0x97, FIL_, 0xCC,
+ 0xA8, FIL_, 0xC4, 0x99, FIL_, 0xCC, 0x8C, FIL_,
+ 0xC4, 0x9B, FIL_, 0xCC, 0x8F, FIL_, 0xC8, 0x85,
+ FIL_, 0xCC, 0x91, FIL_, 0xC8, 0x87, FIL_, 0xCC,
+ 0xA3, FIL_, 0xE1, 0xBA, 0xB9, FIL_, 0xCC, 0xA7,
+ FIL_, 0xC8, 0xA9, FIL_, 0xCC, 0x83, FIL_, 0xE1,
+ 0xBA, 0xBD, FIL_, 0xCC, 0x89, FIL_, 0xE1, 0xBA,
+ 0xBB, FIL_, 0xCC, 0xAD, FIL_, 0xE1, 0xB8, 0x99,
+ FIL_, 0xCC, 0xB0, FIL_, 0xE1, 0xB8, 0x9B, FIL_,
+ 0x01, 0xCC, 0x87, FIL_, 0xE1, 0xB8, 0x9F, FIL_,
+ 0x07, 0xCC, 0x86, FIL_, 0xC4, 0x9F, FIL_, 0xCC,
+ 0x87, FIL_, 0xC4, 0xA1, FIL_, 0xCC, 0x82, FIL_,
+ 0xC4, 0x9D, FIL_, 0xCC, 0x84, FIL_, 0xE1, 0xB8,
+ 0xA1, FIL_, 0xCC, 0x8C, FIL_, 0xC7, 0xA7, FIL_,
+ 0xCC, 0xA7, FIL_, 0xC4, 0xA3, FIL_, 0xCC, 0x81,
+ FIL_, 0xC7, 0xB5, FIL_, 0x08, 0xCC, 0xA7, FIL_,
+ 0xE1, 0xB8, 0xA9, FIL_, 0xCC, 0xB1, FIL_, 0xE1,
+ 0xBA, 0x96, FIL_, 0xCC, 0x8C, FIL_, 0xC8, 0x9F,
+ FIL_, 0xCC, 0xAE, FIL_, 0xE1, 0xB8, 0xAB, FIL_,
+ 0xCC, 0x88, FIL_, 0xE1, 0xB8, 0xA7, FIL_, 0xCC,
+ 0xA3, FIL_, 0xE1, 0xB8, 0xA5, FIL_, 0xCC, 0x87,
+ FIL_, 0xE1, 0xB8, 0xA3, FIL_, 0xCC, 0x82, FIL_,
+ 0xC4, 0xA5, FIL_, 0x0E, 0xCC, 0x88, FIL_, 0xC3,
+ 0xAF, FIL_, 0xCC, 0x89, FIL_, 0xE1, 0xBB, 0x89,
+ FIL_, 0xCC, 0xA3, FIL_, 0xE1, 0xBB, 0x8B, FIL_,
+ 0xCC, 0x82, FIL_, 0xC3, 0xAE, FIL_, 0xCC, 0x81,
+ FIL_, 0xC3, 0xAD, FIL_, 0xCC, 0x80, FIL_, 0xC3,
+ 0xAC, FIL_, 0xCC, 0x83, FIL_, 0xC4, 0xA9, FIL_,
+ 0xCC, 0x84, FIL_, 0xC4, 0xAB, FIL_, 0xCC, 0x86,
+ FIL_, 0xC4, 0xAD, FIL_, 0xCC, 0xA8, FIL_, 0xC4,
+ 0xAF, FIL_, 0xCC, 0xB0, FIL_, 0xE1, 0xB8, 0xAD,
+ FIL_, 0xCC, 0x8C, FIL_, 0xC7, 0x90, FIL_, 0xCC,
+ 0x91, FIL_, 0xC8, 0x8B, FIL_, 0xCC, 0x8F, FIL_,
+ 0xC8, 0x89, FIL_, 0x02, 0xCC, 0x8C, FIL_, 0xC7,
+ 0xB0, FIL_, 0xCC, 0x82, FIL_, 0xC4, 0xB5, FIL_,
+ 0x05, 0xCC, 0xB1, FIL_, 0xE1, 0xB8, 0xB5, FIL_,
+ 0xCC, 0xA7, FIL_, 0xC4, 0xB7, FIL_, 0xCC, 0x8C,
+ FIL_, 0xC7, 0xA9, FIL_, 0xCC, 0x81, FIL_, 0xE1,
+ 0xB8, 0xB1, FIL_, 0xCC, 0xA3, FIL_, 0xE1, 0xB8,
+ 0xB3, FIL_, 0x06, 0xCC, 0xA3, FIL_, 0xE1, 0xB8,
+ 0xB7, FIL_, 0xCC, 0xAD, FIL_, 0xE1, 0xB8, 0xBD,
+ FIL_, 0xCC, 0xB1, FIL_, 0xE1, 0xB8, 0xBB, FIL_,
+ 0xCC, 0xA7, FIL_, 0xC4, 0xBC, FIL_, 0xCC, 0x81,
+ FIL_, 0xC4, 0xBA, FIL_, 0xCC, 0x8C, FIL_, 0xC4,
+ 0xBE, FIL_, 0x03, 0xCC, 0x87, FIL_, 0xE1, 0xB9,
+ 0x81, FIL_, 0xCC, 0xA3, FIL_, 0xE1, 0xB9, 0x83,
+ FIL_, 0xCC, 0x81, FIL_, 0xE1, 0xB8, 0xBF, FIL_,
+ 0x09, 0xCC, 0x80, FIL_, 0xC7, 0xB9, FIL_, 0xCC,
+ 0xAD, FIL_, 0xE1, 0xB9, 0x8B, FIL_, 0xCC, 0x83,
+ FIL_, 0xC3, 0xB1, FIL_, 0xCC, 0x81, FIL_, 0xC5,
+ 0x84, FIL_, 0xCC, 0xA3, FIL_, 0xE1, 0xB9, 0x87,
+ FIL_, 0xCC, 0xB1, FIL_, 0xE1, 0xB9, 0x89, FIL_,
+ 0xCC, 0x87, FIL_, 0xE1, 0xB9, 0x85, FIL_, 0xCC,
+ 0xA7, FIL_, 0xC5, 0x86, FIL_, 0xCC, 0x8C, FIL_,
+ 0xC5, 0x88, FIL_, 0x10, 0xCC, 0xA3, FIL_, 0xE1,
+ 0xBB, 0x8D, FIL_, 0xCC, 0x87, FIL_, 0xC8, 0xAF,
+ FIL_, 0xCC, 0x80, FIL_, 0xC3, 0xB2, FIL_, 0xCC,
+ 0x91, FIL_, 0xC8, 0x8F, FIL_, 0xCC, 0x89, FIL_,
+ 0xE1, 0xBB, 0x8F, FIL_, 0xCC, 0x88, FIL_, 0xC3,
+ 0xB6, FIL_, 0xCC, 0x83, FIL_, 0xC3, 0xB5, FIL_,
+ 0xCC, 0x81, FIL_, 0xC3, 0xB3, FIL_, 0xCC, 0x8C,
+ FIL_, 0xC7, 0x92, FIL_, 0xCC, 0xA8, FIL_, 0xC7,
+ 0xAB, FIL_, 0xCC, 0x9B, FIL_, 0xC6, 0xA1, FIL_,
+ 0xCC, 0x84, FIL_, 0xC5, 0x8D, FIL_, 0xCC, 0x86,
+ FIL_, 0xC5, 0x8F, FIL_, 0xCC, 0x8B, FIL_, 0xC5,
+ 0x91, FIL_, 0xCC, 0x82, FIL_, 0xC3, 0xB4, FIL_,
+ 0xCC, 0x8F, FIL_, 0xC8, 0x8D, FIL_, 0x02, 0xCC,
+ 0x87, FIL_, 0xE1, 0xB9, 0x97, FIL_, 0xCC, 0x81,
+ FIL_, 0xE1, 0xB9, 0x95, FIL_, 0x08, 0xCC, 0x8C,
+ FIL_, 0xC5, 0x99, FIL_, 0xCC, 0xA3, FIL_, 0xE1,
+ 0xB9, 0x9B, FIL_, 0xCC, 0x81, FIL_, 0xC5, 0x95,
+ FIL_, 0xCC, 0xA7, FIL_, 0xC5, 0x97, FIL_, 0xCC,
+ 0xB1, FIL_, 0xE1, 0xB9, 0x9F, FIL_, 0xCC, 0x87,
+ FIL_, 0xE1, 0xB9, 0x99, FIL_, 0xCC, 0x91, FIL_,
+ 0xC8, 0x93, FIL_, 0xCC, 0x8F, FIL_, 0xC8, 0x91,
+ FIL_, 0x07, 0xCC, 0xA7, FIL_, 0xC5, 0x9F, FIL_,
+ 0xCC, 0x82, FIL_, 0xC5, 0x9D, FIL_, 0xCC, 0x87,
+ FIL_, 0xE1, 0xB9, 0xA1, FIL_, 0xCC, 0xA6, FIL_,
+ 0xC8, 0x99, FIL_, 0xCC, 0x81, FIL_, 0xC5, 0x9B,
+ FIL_, 0xCC, 0xA3, FIL_, 0xE1, 0xB9, 0xA3, FIL_,
+ 0xCC, 0x8C, FIL_, 0xC5, 0xA1, FIL_, 0x08, 0xCC,
+ 0xA6, FIL_, 0xC8, 0x9B, FIL_, 0xCC, 0xAD, FIL_,
+ 0xE1, 0xB9, 0xB1, FIL_, 0xCC, 0xB1, FIL_, 0xE1,
+ 0xB9, 0xAF, FIL_, 0xCC, 0xA3, FIL_, 0xE1, 0xB9,
+ 0xAD, FIL_, 0xCC, 0x87, FIL_, 0xE1, 0xB9, 0xAB,
+ FIL_, 0xCC, 0x8C, FIL_, 0xC5, 0xA5, FIL_, 0xCC,
+ 0xA7, FIL_, 0xC5, 0xA3, FIL_, 0xCC, 0x88, FIL_,
+ 0xE1, 0xBA, 0x97, FIL_, 0x13, 0xCC, 0x8A, FIL_,
+ 0xC5, 0xAF, FIL_, 0xCC, 0x8F, FIL_, 0xC8, 0x95,
+ FIL_, 0xCC, 0x8C, FIL_, 0xC7, 0x94, FIL_, 0xCC,
+ 0x80, FIL_, 0xC3, 0xB9, FIL_, 0xCC, 0x9B, FIL_,
+ 0xC6, 0xB0, FIL_, 0xCC, 0x82, FIL_, 0xC3, 0xBB,
+ FIL_, 0xCC, 0x81, FIL_, 0xC3, 0xBA, FIL_, 0xCC,
+ 0x88, FIL_, 0xC3, 0xBC, FIL_, 0xCC, 0x83, FIL_,
+ 0xC5, 0xA9, FIL_, 0xCC, 0x89, FIL_, 0xE1, 0xBB,
+ 0xA7, FIL_, 0xCC, 0x84, FIL_, 0xC5, 0xAB, FIL_,
+ 0xCC, 0x86, FIL_, 0xC5, 0xAD, FIL_, 0xCC, 0xAD,
+ FIL_, 0xE1, 0xB9, 0xB7, FIL_, 0xCC, 0x8B, FIL_,
+ 0xC5, 0xB1, FIL_, 0xCC, 0xA8, FIL_, 0xC5, 0xB3,
+ FIL_, 0xCC, 0x91, FIL_, 0xC8, 0x97, FIL_, 0xCC,
+ 0xA4, FIL_, 0xE1, 0xB9, 0xB3, FIL_, 0xCC, 0xA3,
+ FIL_, 0xE1, 0xBB, 0xA5, FIL_, 0xCC, 0xB0, FIL_,
+ 0xE1, 0xB9, 0xB5, FIL_, 0x02, 0xCC, 0x83, FIL_,
+ 0xE1, 0xB9, 0xBD, FIL_, 0xCC, 0xA3, FIL_, 0xE1,
+ 0xB9, 0xBF, FIL_, 0x07, 0xCC, 0x8A, FIL_, 0xE1,
+ 0xBA, 0x98, FIL_, 0xCC, 0x87, FIL_, 0xE1, 0xBA,
+ 0x87, FIL_, 0xCC, 0x81, FIL_, 0xE1, 0xBA, 0x83,
+ FIL_, 0xCC, 0x82, FIL_, 0xC5, 0xB5, FIL_, 0xCC,
+ 0x80, FIL_, 0xE1, 0xBA, 0x81, FIL_, 0xCC, 0xA3,
+ FIL_, 0xE1, 0xBA, 0x89, FIL_, 0xCC, 0x88, FIL_,
+ 0xE1, 0xBA, 0x85, FIL_, 0x02, 0xCC, 0x87, FIL_,
+ 0xE1, 0xBA, 0x8B, FIL_, 0xCC, 0x88, FIL_, 0xE1,
+ 0xBA, 0x8D, FIL_, 0x0A, 0xCC, 0x87, FIL_, 0xE1,
+ 0xBA, 0x8F, FIL_, 0xCC, 0xA3, FIL_, 0xE1, 0xBB,
+ 0xB5, FIL_, 0xCC, 0x89, FIL_, 0xE1, 0xBB, 0xB7,
+ FIL_, 0xCC, 0x8A, FIL_, 0xE1, 0xBA, 0x99, FIL_,
+ 0xCC, 0x80, FIL_, 0xE1, 0xBB, 0xB3, FIL_, 0xCC,
+ 0x83, FIL_, 0xE1, 0xBB, 0xB9, FIL_, 0xCC, 0x88,
+ FIL_, 0xC3, 0xBF, FIL_, 0xCC, 0x81, FIL_, 0xC3,
+ 0xBD, FIL_, 0xCC, 0x84, FIL_, 0xC8, 0xB3, FIL_,
+ 0xCC, 0x82, FIL_, 0xC5, 0xB7, FIL_, 0x06, 0xCC,
+ 0xB1, FIL_, 0xE1, 0xBA, 0x95, FIL_, 0xCC, 0xA3,
+ FIL_, 0xE1, 0xBA, 0x93, FIL_, 0xCC, 0x82, FIL_,
+ 0xE1, 0xBA, 0x91, FIL_, 0xCC, 0x81, FIL_, 0xC5,
+ 0xBA, FIL_, 0xCC, 0x87, FIL_, 0xC5, 0xBC, FIL_,
+ 0xCC, 0x8C, FIL_, 0xC5, 0xBE, FIL_, 0x03, 0xCC,
+ 0x80, FIL_, 0xE1, 0xBF, 0xAD, FIL_, 0xCD, 0x82,
+ FIL_, 0xE1, 0xBF, 0x81, FIL_, 0xCC, 0x81, FIL_,
+ 0xCE, 0x85, FIL_, 0x04, 0xCC, 0x83, FIL_, 0xE1,
+ 0xBA, 0xAA, FIL_, 0xCC, 0x81, FIL_, 0xE1, 0xBA,
+ 0xA4, FIL_, 0xCC, 0x89, FIL_, 0xE1, 0xBA, 0xA8,
+ FIL_, 0xCC, 0x80, FIL_, 0xE1, 0xBA, 0xA6, FIL_,
+ 0x01, 0xCC, 0x84, FIL_, 0xC7, 0x9E, FIL_, 0x01,
+ 0xCC, 0x81, FIL_, 0xC7, 0xBA, FIL_, 0x02, 0xCC,
+ 0x84, FIL_, 0xC7, 0xA2, FIL_, 0xCC, 0x81, FIL_,
+ 0xC7, 0xBC, FIL_, 0x01, 0xCC, 0x81, FIL_, 0xE1,
+ 0xB8, 0x88, FIL_, 0x04, 0xCC, 0x83, FIL_, 0xE1,
+ 0xBB, 0x84, FIL_, 0xCC, 0x80, FIL_, 0xE1, 0xBB,
+ 0x80, FIL_, 0xCC, 0x89, FIL_, 0xE1, 0xBB, 0x82,
+ FIL_, 0xCC, 0x81, FIL_, 0xE1, 0xBA, 0xBE, FIL_,
+ 0x01, 0xCC, 0x81, FIL_, 0xE1, 0xB8, 0xAE, FIL_,
+ 0x04, 0xCC, 0x81, FIL_, 0xE1, 0xBB, 0x90, FIL_,
+ 0xCC, 0x80, FIL_, 0xE1, 0xBB, 0x92, FIL_, 0xCC,
+ 0x89, FIL_, 0xE1, 0xBB, 0x94, FIL_, 0xCC, 0x83,
+ FIL_, 0xE1, 0xBB, 0x96, FIL_, 0x03, 0xCC, 0x84,
+ FIL_, 0xC8, 0xAC, FIL_, 0xCC, 0x88, FIL_, 0xE1,
+ 0xB9, 0x8E, FIL_, 0xCC, 0x81, FIL_, 0xE1, 0xB9,
+ 0x8C, FIL_, 0x01, 0xCC, 0x84, FIL_, 0xC8, 0xAA,
+ FIL_, 0x01, 0xCC, 0x81, FIL_, 0xC7, 0xBE, FIL_,
+ 0x04, 0xCC, 0x80, FIL_, 0xC7, 0x9B, FIL_, 0xCC,
+ 0x84, FIL_, 0xC7, 0x95, FIL_, 0xCC, 0x8C, FIL_,
+ 0xC7, 0x99, FIL_, 0xCC, 0x81, FIL_, 0xC7, 0x97,
+ FIL_, 0x04, 0xCC, 0x81, FIL_, 0xE1, 0xBA, 0xA5,
+ FIL_, 0xCC, 0x83, FIL_, 0xE1, 0xBA, 0xAB, FIL_,
+ 0xCC, 0x89, FIL_, 0xE1, 0xBA, 0xA9, FIL_, 0xCC,
+ 0x80, FIL_, 0xE1, 0xBA, 0xA7, FIL_, 0x01, 0xCC,
+ 0x84, FIL_, 0xC7, 0x9F, FIL_, 0x01, 0xCC, 0x81,
+ FIL_, 0xC7, 0xBB, FIL_, 0x02, 0xCC, 0x81, FIL_,
+ 0xC7, 0xBD, FIL_, 0xCC, 0x84, FIL_, 0xC7, 0xA3,
+ FIL_, 0x01, 0xCC, 0x81, FIL_, 0xE1, 0xB8, 0x89,
+ FIL_, 0x04, 0xCC, 0x89, FIL_, 0xE1, 0xBB, 0x83,
+ FIL_, 0xCC, 0x83, FIL_, 0xE1, 0xBB, 0x85, FIL_,
+ 0xCC, 0x80, FIL_, 0xE1, 0xBB, 0x81, FIL_, 0xCC,
+ 0x81, FIL_, 0xE1, 0xBA, 0xBF, FIL_, 0x01, 0xCC,
+ 0x81, FIL_, 0xE1, 0xB8, 0xAF, FIL_, 0x04, 0xCC,
+ 0x80, FIL_, 0xE1, 0xBB, 0x93, FIL_, 0xCC, 0x81,
+ FIL_, 0xE1, 0xBB, 0x91, FIL_, 0xCC, 0x83, FIL_,
+ 0xE1, 0xBB, 0x97, FIL_, 0xCC, 0x89, FIL_, 0xE1,
+ 0xBB, 0x95, FIL_, 0x03, 0xCC, 0x81, FIL_, 0xE1,
+ 0xB9, 0x8D, FIL_, 0xCC, 0x88, FIL_, 0xE1, 0xB9,
+ 0x8F, FIL_, 0xCC, 0x84, FIL_, 0xC8, 0xAD, FIL_,
+ 0x01, 0xCC, 0x84, FIL_, 0xC8, 0xAB, FIL_, 0x01,
+ 0xCC, 0x81, FIL_, 0xC7, 0xBF, FIL_, 0x04, 0xCC,
+ 0x8C, FIL_, 0xC7, 0x9A, FIL_, 0xCC, 0x84, FIL_,
+ 0xC7, 0x96, FIL_, 0xCC, 0x80, FIL_, 0xC7, 0x9C,
+ FIL_, 0xCC, 0x81, FIL_, 0xC7, 0x98, FIL_, 0x04,
+ 0xCC, 0x81, FIL_, 0xE1, 0xBA, 0xAE, FIL_, 0xCC,
+ 0x83, FIL_, 0xE1, 0xBA, 0xB4, FIL_, 0xCC, 0x89,
+ FIL_, 0xE1, 0xBA, 0xB2, FIL_, 0xCC, 0x80, FIL_,
+ 0xE1, 0xBA, 0xB0, FIL_, 0x04, 0xCC, 0x83, FIL_,
+ 0xE1, 0xBA, 0xB5, FIL_, 0xCC, 0x80, FIL_, 0xE1,
+ 0xBA, 0xB1, FIL_, 0xCC, 0x81, FIL_, 0xE1, 0xBA,
+ 0xAF, FIL_, 0xCC, 0x89, FIL_, 0xE1, 0xBA, 0xB3,
+ FIL_, 0x02, 0xCC, 0x81, FIL_, 0xE1, 0xB8, 0x96,
+ FIL_, 0xCC, 0x80, FIL_, 0xE1, 0xB8, 0x94, FIL_,
+ 0x02, 0xCC, 0x80, FIL_, 0xE1, 0xB8, 0x95, FIL_,
+ 0xCC, 0x81, FIL_, 0xE1, 0xB8, 0x97, FIL_, 0x02,
+ 0xCC, 0x80, FIL_, 0xE1, 0xB9, 0x90, FIL_, 0xCC,
+ 0x81, FIL_, 0xE1, 0xB9, 0x92, FIL_, 0x02, 0xCC,
+ 0x81, FIL_, 0xE1, 0xB9, 0x93, FIL_, 0xCC, 0x80,
+ FIL_, 0xE1, 0xB9, 0x91, FIL_, 0x01, 0xCC, 0x87,
+ FIL_, 0xE1, 0xB9, 0xA4, FIL_, 0x01, 0xCC, 0x87,
+ FIL_, 0xE1, 0xB9, 0xA5, FIL_, 0x01, 0xCC, 0x87,
+ FIL_, 0xE1, 0xB9, 0xA6, FIL_, 0x01, 0xCC, 0x87,
+ FIL_, 0xE1, 0xB9, 0xA7, FIL_, 0x01, 0xCC, 0x81,
+ FIL_, 0xE1, 0xB9, 0xB8, FIL_, 0x01, 0xCC, 0x81,
+ FIL_, 0xE1, 0xB9, 0xB9, FIL_, 0x01, 0xCC, 0x88,
+ FIL_, 0xE1, 0xB9, 0xBA, FIL_, 0x01, 0xCC, 0x88,
+ FIL_, 0xE1, 0xB9, 0xBB, FIL_, 0x01, 0xCC, 0x87,
+ FIL_, 0xE1, 0xBA, 0x9B, FIL_, 0x05, 0xCC, 0x80,
+ FIL_, 0xE1, 0xBB, 0x9C, FIL_, 0xCC, 0x89, FIL_,
+ 0xE1, 0xBB, 0x9E, FIL_, 0xCC, 0x83, FIL_, 0xE1,
+ 0xBB, 0xA0, FIL_, 0xCC, 0x81, FIL_, 0xE1, 0xBB,
+ 0x9A, FIL_, 0xCC, 0xA3, FIL_, 0xE1, 0xBB, 0xA2,
+ FIL_, 0x05, 0xCC, 0x83, FIL_, 0xE1, 0xBB, 0xA1,
+ FIL_, 0xCC, 0xA3, FIL_, 0xE1, 0xBB, 0xA3, FIL_,
+ 0xCC, 0x81, FIL_, 0xE1, 0xBB, 0x9B, FIL_, 0xCC,
+ 0x80, FIL_, 0xE1, 0xBB, 0x9D, FIL_, 0xCC, 0x89,
+ FIL_, 0xE1, 0xBB, 0x9F, FIL_, 0x05, 0xCC, 0x81,
+ FIL_, 0xE1, 0xBB, 0xA8, FIL_, 0xCC, 0x80, FIL_,
+ 0xE1, 0xBB, 0xAA, FIL_, 0xCC, 0x89, FIL_, 0xE1,
+ 0xBB, 0xAC, FIL_, 0xCC, 0x83, FIL_, 0xE1, 0xBB,
+ 0xAE, FIL_, 0xCC, 0xA3, FIL_, 0xE1, 0xBB, 0xB0,
+ FIL_, 0x05, 0xCC, 0x80, FIL_, 0xE1, 0xBB, 0xAB,
+ FIL_, 0xCC, 0x81, FIL_, 0xE1, 0xBB, 0xA9, FIL_,
+ 0xCC, 0x83, FIL_, 0xE1, 0xBB, 0xAF, FIL_, 0xCC,
+ 0xA3, FIL_, 0xE1, 0xBB, 0xB1, FIL_, 0xCC, 0x89,
+ FIL_, 0xE1, 0xBB, 0xAD, FIL_, 0x01, 0xCC, 0x8C,
+ FIL_, 0xC7, 0xAE, FIL_, 0x01, 0xCC, 0x84, FIL_,
+ 0xC7, 0xAC, FIL_, 0x01, 0xCC, 0x84, FIL_, 0xC7,
+ 0xAD, FIL_, 0x01, 0xCC, 0x84, FIL_, 0xC7, 0xA0,
+ FIL_, 0x01, 0xCC, 0x84, FIL_, 0xC7, 0xA1, FIL_,
+ 0x01, 0xCC, 0x86, FIL_, 0xE1, 0xB8, 0x9C, FIL_,
+ 0x01, 0xCC, 0x86, FIL_, 0xE1, 0xB8, 0x9D, FIL_,
+ 0x01, 0xCC, 0x84, FIL_, 0xC8, 0xB0, FIL_, 0x01,
+ 0xCC, 0x84, FIL_, 0xC8, 0xB1, FIL_, 0x01, 0xCC,
+ 0x8C, FIL_, 0xC7, 0xAF, FIL_, 0x07, 0xCC, 0x93,
+ FIL_, 0xE1, 0xBC, 0x88, FIL_, 0xCC, 0x81, FIL_,
+ 0xCE, 0x86, FIL_, 0xCC, 0x86, FIL_, 0xE1, 0xBE,
+ 0xB8, FIL_, 0xCC, 0x84, FIL_, 0xE1, 0xBE, 0xB9,
+ FIL_, 0xCC, 0x94, FIL_, 0xE1, 0xBC, 0x89, FIL_,
+ 0xCD, 0x85, FIL_, 0xE1, 0xBE, 0xBC, FIL_, 0xCC,
+ 0x80, FIL_, 0xE1, 0xBE, 0xBA, FIL_, 0x04, 0xCC,
+ 0x94, FIL_, 0xE1, 0xBC, 0x99, FIL_, 0xCC, 0x80,
+ FIL_, 0xE1, 0xBF, 0x88, FIL_, 0xCC, 0x81, FIL_,
+ 0xCE, 0x88, FIL_, 0xCC, 0x93, FIL_, 0xE1, 0xBC,
+ 0x98, FIL_, 0x05, 0xCD, 0x85, FIL_, 0xE1, 0xBF,
+ 0x8C, FIL_, 0xCC, 0x81, FIL_, 0xCE, 0x89, FIL_,
+ 0xCC, 0x80, FIL_, 0xE1, 0xBF, 0x8A, FIL_, 0xCC,
+ 0x93, FIL_, 0xE1, 0xBC, 0xA8, FIL_, 0xCC, 0x94,
+ FIL_, 0xE1, 0xBC, 0xA9, FIL_, 0x07, 0xCC, 0x80,
+ FIL_, 0xE1, 0xBF, 0x9A, FIL_, 0xCC, 0x84, FIL_,
+ 0xE1, 0xBF, 0x99, FIL_, 0xCC, 0x93, FIL_, 0xE1,
+ 0xBC, 0xB8, FIL_, 0xCC, 0x94, FIL_, 0xE1, 0xBC,
+ 0xB9, FIL_, 0xCC, 0x86, FIL_, 0xE1, 0xBF, 0x98,
+ FIL_, 0xCC, 0x81, FIL_, 0xCE, 0x8A, FIL_, 0xCC,
+ 0x88, FIL_, 0xCE, 0xAA, FIL_, 0x04, 0xCC, 0x81,
+ FIL_, 0xCE, 0x8C, FIL_, 0xCC, 0x94, FIL_, 0xE1,
+ 0xBD, 0x89, FIL_, 0xCC, 0x93, FIL_, 0xE1, 0xBD,
+ 0x88, FIL_, 0xCC, 0x80, FIL_, 0xE1, 0xBF, 0xB8,
+ FIL_, 0x01, 0xCC, 0x94, FIL_, 0xE1, 0xBF, 0xAC,
+ FIL_, 0x06, 0xCC, 0x94, FIL_, 0xE1, 0xBD, 0x99,
+ FIL_, 0xCC, 0x86, FIL_, 0xE1, 0xBF, 0xA8, FIL_,
+ 0xCC, 0x88, FIL_, 0xCE, 0xAB, FIL_, 0xCC, 0x84,
+ FIL_, 0xE1, 0xBF, 0xA9, FIL_, 0xCC, 0x81, FIL_,
+ 0xCE, 0x8E, FIL_, 0xCC, 0x80, FIL_, 0xE1, 0xBF,
+ 0xAA, FIL_, 0x05, 0xCC, 0x93, FIL_, 0xE1, 0xBD,
+ 0xA8, FIL_, 0xCD, 0x85, FIL_, 0xE1, 0xBF, 0xBC,
+ FIL_, 0xCC, 0x80, FIL_, 0xE1, 0xBF, 0xBA, FIL_,
+ 0xCC, 0x94, FIL_, 0xE1, 0xBD, 0xA9, FIL_, 0xCC,
+ 0x81, FIL_, 0xCE, 0x8F, FIL_, 0x01, 0xCD, 0x85,
+ FIL_, 0xE1, 0xBE, 0xB4, FIL_, 0x01, 0xCD, 0x85,
+ FIL_, 0xE1, 0xBF, 0x84, FIL_, 0x08, 0xCD, 0x85,
+ FIL_, 0xE1, 0xBE, 0xB3, FIL_, 0xCC, 0x84, FIL_,
+ 0xE1, 0xBE, 0xB1, FIL_, 0xCC, 0x86, FIL_, 0xE1,
+ 0xBE, 0xB0, FIL_, 0xCC, 0x80, FIL_, 0xE1, 0xBD,
+ 0xB0, FIL_, 0xCC, 0x81, FIL_, 0xCE, 0xAC, FIL_,
+ 0xCC, 0x94, FIL_, 0xE1, 0xBC, 0x81, FIL_, 0xCC,
+ 0x93, FIL_, 0xE1, 0xBC, 0x80, FIL_, 0xCD, 0x82,
+ FIL_, 0xE1, 0xBE, 0xB6, FIL_, 0x04, 0xCC, 0x93,
+ FIL_, 0xE1, 0xBC, 0x90, FIL_, 0xCC, 0x80, FIL_,
+ 0xE1, 0xBD, 0xB2, FIL_, 0xCC, 0x94, FIL_, 0xE1,
+ 0xBC, 0x91, FIL_, 0xCC, 0x81, FIL_, 0xCE, 0xAD,
+ FIL_, 0x06, 0xCC, 0x94, FIL_, 0xE1, 0xBC, 0xA1,
+ FIL_, 0xCC, 0x81, FIL_, 0xCE, 0xAE, FIL_, 0xCD,
+ 0x85, FIL_, 0xE1, 0xBF, 0x83, FIL_, 0xCD, 0x82,
+ FIL_, 0xE1, 0xBF, 0x86, FIL_, 0xCC, 0x93, FIL_,
+ 0xE1, 0xBC, 0xA0, FIL_, 0xCC, 0x80, FIL_, 0xE1,
+ 0xBD, 0xB4, FIL_, 0x08, 0xCC, 0x88, FIL_, 0xCF,
+ 0x8A, FIL_, 0xCC, 0x81, FIL_, 0xCE, 0xAF, FIL_,
+ 0xCC, 0x93, FIL_, 0xE1, 0xBC, 0xB0, FIL_, 0xCC,
+ 0x94, FIL_, 0xE1, 0xBC, 0xB1, FIL_, 0xCC, 0x80,
+ FIL_, 0xE1, 0xBD, 0xB6, FIL_, 0xCC, 0x86, FIL_,
+ 0xE1, 0xBF, 0x90, FIL_, 0xCC, 0x84, FIL_, 0xE1,
+ 0xBF, 0x91, FIL_, 0xCD, 0x82, FIL_, 0xE1, 0xBF,
+ 0x96, FIL_, 0x04, 0xCC, 0x93, FIL_, 0xE1, 0xBD,
+ 0x80, FIL_, 0xCC, 0x80, FIL_, 0xE1, 0xBD, 0xB8,
+ FIL_, 0xCC, 0x94, FIL_, 0xE1, 0xBD, 0x81, FIL_,
+ 0xCC, 0x81, FIL_, 0xCF, 0x8C, FIL_, 0x02, 0xCC,
+ 0x93, FIL_, 0xE1, 0xBF, 0xA4, FIL_, 0xCC, 0x94,
+ FIL_, 0xE1, 0xBF, 0xA5, FIL_, 0x08, 0xCC, 0x81,
+ FIL_, 0xCF, 0x8D, FIL_, 0xCC, 0x94, FIL_, 0xE1,
+ 0xBD, 0x91, FIL_, 0xCD, 0x82, FIL_, 0xE1, 0xBF,
+ 0xA6, FIL_, 0xCC, 0x88, FIL_, 0xCF, 0x8B, FIL_,
+ 0xCC, 0x84, FIL_, 0xE1, 0xBF, 0xA1, FIL_, 0xCC,
+ 0x80, FIL_, 0xE1, 0xBD, 0xBA, FIL_, 0xCC, 0x93,
+ FIL_, 0xE1, 0xBD, 0x90, FIL_, 0xCC, 0x86, FIL_,
+ 0xE1, 0xBF, 0xA0, FIL_, 0x06, 0xCC, 0x80, FIL_,
+ 0xE1, 0xBD, 0xBC, FIL_, 0xCC, 0x94, FIL_, 0xE1,
+ 0xBD, 0xA1, FIL_, 0xCC, 0x93, FIL_, 0xE1, 0xBD,
+ 0xA0, FIL_, 0xCC, 0x81, FIL_, 0xCF, 0x8E, FIL_,
+ 0xCD, 0x85, FIL_, 0xE1, 0xBF, 0xB3, FIL_, 0xCD,
+ 0x82, FIL_, 0xE1, 0xBF, 0xB6, FIL_, 0x03, 0xCC,
+ 0x80, FIL_, 0xE1, 0xBF, 0x92, FIL_, 0xCD, 0x82,
+ FIL_, 0xE1, 0xBF, 0x97, FIL_, 0xCC, 0x81, FIL_,
+ 0xCE, 0x90, FIL_, 0x03, 0xCD, 0x82, FIL_, 0xE1,
+ 0xBF, 0xA7, FIL_, 0xCC, 0x80, FIL_, 0xE1, 0xBF,
+ 0xA2, FIL_, 0xCC, 0x81, FIL_, 0xCE, 0xB0, FIL_,
+ 0x01, 0xCD, 0x85, FIL_, 0xE1, 0xBF, 0xB4, FIL_,
+ 0x02, 0xCC, 0x88, FIL_, 0xCF, 0x94, FIL_, 0xCC,
+ 0x81, FIL_, 0xCF, 0x93, FIL_, 0x01, 0xCC, 0x88,
+ FIL_, 0xD0, 0x87, FIL_, 0x02, 0xCC, 0x88, FIL_,
+ 0xD3, 0x92, FIL_, 0xCC, 0x86, FIL_, 0xD3, 0x90,
+ FIL_, 0x01, 0xCC, 0x81, FIL_, 0xD0, 0x83, FIL_,
+ 0x03, 0xCC, 0x88, FIL_, 0xD0, 0x81, FIL_, 0xCC,
+ 0x80, FIL_, 0xD0, 0x80, FIL_, 0xCC, 0x86, FIL_,
+ 0xD3, 0x96, FIL_, 0x02, 0xCC, 0x86, FIL_, 0xD3,
+ 0x81, FIL_, 0xCC, 0x88, FIL_, 0xD3, 0x9C, FIL_,
+ 0x01, 0xCC, 0x88, FIL_, 0xD3, 0x9E, FIL_, 0x04,
+ 0xCC, 0x84, FIL_, 0xD3, 0xA2, FIL_, 0xCC, 0x88,
+ FIL_, 0xD3, 0xA4, FIL_, 0xCC, 0x86, FIL_, 0xD0,
+ 0x99, FIL_, 0xCC, 0x80, FIL_, 0xD0, 0x8D, FIL_,
+ 0x01, 0xCC, 0x81, FIL_, 0xD0, 0x8C, FIL_, 0x01,
+ 0xCC, 0x88, FIL_, 0xD3, 0xA6, FIL_, 0x04, 0xCC,
+ 0x8B, FIL_, 0xD3, 0xB2, FIL_, 0xCC, 0x88, FIL_,
+ 0xD3, 0xB0, FIL_, 0xCC, 0x86, FIL_, 0xD0, 0x8E,
+ FIL_, 0xCC, 0x84, FIL_, 0xD3, 0xAE, FIL_, 0x01,
+ 0xCC, 0x88, FIL_, 0xD3, 0xB4, FIL_, 0x01, 0xCC,
+ 0x88, FIL_, 0xD3, 0xB8, FIL_, 0x01, 0xCC, 0x88,
+ FIL_, 0xD3, 0xAC, FIL_, 0x02, 0xCC, 0x86, FIL_,
+ 0xD3, 0x91, FIL_, 0xCC, 0x88, FIL_, 0xD3, 0x93,
+ FIL_, 0x01, 0xCC, 0x81, FIL_, 0xD1, 0x93, FIL_,
+ 0x03, 0xCC, 0x80, FIL_, 0xD1, 0x90, FIL_, 0xCC,
+ 0x86, FIL_, 0xD3, 0x97, FIL_, 0xCC, 0x88, FIL_,
+ 0xD1, 0x91, FIL_, 0x02, 0xCC, 0x86, FIL_, 0xD3,
+ 0x82, FIL_, 0xCC, 0x88, FIL_, 0xD3, 0x9D, FIL_,
+ 0x01, 0xCC, 0x88, FIL_, 0xD3, 0x9F, FIL_, 0x04,
+ 0xCC, 0x86, FIL_, 0xD0, 0xB9, FIL_, 0xCC, 0x88,
+ FIL_, 0xD3, 0xA5, FIL_, 0xCC, 0x84, FIL_, 0xD3,
+ 0xA3, FIL_, 0xCC, 0x80, FIL_, 0xD1, 0x9D, FIL_,
+ 0x01, 0xCC, 0x81, FIL_, 0xD1, 0x9C, FIL_, 0x01,
+ 0xCC, 0x88, FIL_, 0xD3, 0xA7, FIL_, 0x04, 0xCC,
+ 0x8B, FIL_, 0xD3, 0xB3, FIL_, 0xCC, 0x84, FIL_,
+ 0xD3, 0xAF, FIL_, 0xCC, 0x86, FIL_, 0xD1, 0x9E,
+ FIL_, 0xCC, 0x88, FIL_, 0xD3, 0xB1, FIL_, 0x01,
+ 0xCC, 0x88, FIL_, 0xD3, 0xB5, FIL_, 0x01, 0xCC,
+ 0x88, FIL_, 0xD3, 0xB9, FIL_, 0x01, 0xCC, 0x88,
+ FIL_, 0xD3, 0xAD, FIL_, 0x01, 0xCC, 0x88, FIL_,
+ 0xD1, 0x97, FIL_, 0x01, 0xCC, 0x8F, FIL_, 0xD1,
+ 0xB6, FIL_, 0x01, 0xCC, 0x8F, FIL_, 0xD1, 0xB7,
+ FIL_, 0x01, 0xCC, 0x88, FIL_, 0xD3, 0x9A, FIL_,
+ 0x01, 0xCC, 0x88, FIL_, 0xD3, 0x9B, FIL_, 0x01,
+ 0xCC, 0x88, FIL_, 0xD3, 0xAA, FIL_, 0x01, 0xCC,
+ 0x88, FIL_, 0xD3, 0xAB, FIL_, 0x03, 0xD9, 0x94,
+ FIL_, 0xD8, 0xA3, FIL_, 0xD9, 0x95, FIL_, 0xD8,
+ 0xA5, FIL_, 0xD9, 0x93, FIL_, 0xD8, 0xA2, FIL_,
+ 0x01, 0xD9, 0x94, FIL_, 0xD8, 0xA4, FIL_, 0x01,
+ 0xD9, 0x94, FIL_, 0xD8, 0xA6, FIL_, 0x01, 0xD9,
+ 0x94, FIL_, 0xDB, 0x82, FIL_, 0x01, 0xD9, 0x94,
+ FIL_, 0xDB, 0x93, FIL_, 0x01, 0xD9, 0x94, FIL_,
+ 0xDB, 0x80, FIL_, 0x01, 0xE0, 0xA4, 0xBC, FIL_,
+ 0xE0, 0xA4, 0xA9, FIL_, 0x01, 0xE0, 0xA4, 0xBC,
+ FIL_, 0xE0, 0xA4, 0xB1, FIL_, 0x01, 0xE0, 0xA4,
+ 0xBC, FIL_, 0xE0, 0xA4, 0xB4, FIL_, 0x02, 0xE0,
+ 0xA6, 0xBE, FIL_, 0xE0, 0xA7, 0x8B, FIL_, 0xE0,
+ 0xA7, 0x97, FIL_, 0xE0, 0xA7, 0x8C, FIL_, 0x03,
+ 0xE0, 0xAD, 0x96, FIL_, 0xE0, 0xAD, 0x88, FIL_,
+ 0xE0, 0xAC, 0xBE, FIL_, 0xE0, 0xAD, 0x8B, FIL_,
+ 0xE0, 0xAD, 0x97, FIL_, 0xE0, 0xAD, 0x8C, FIL_,
+ 0x01, 0xE0, 0xAF, 0x97, FIL_, 0xE0, 0xAE, 0x94,
+ FIL_, 0x02, 0xE0, 0xAF, 0x97, FIL_, 0xE0, 0xAF,
+ 0x8C, FIL_, 0xE0, 0xAE, 0xBE, FIL_, 0xE0, 0xAF,
+ 0x8A, FIL_, 0x01, 0xE0, 0xAE, 0xBE, FIL_, 0xE0,
+ 0xAF, 0x8B, FIL_, 0x01, 0xE0, 0xB1, 0x96, FIL_,
+ 0xE0, 0xB1, 0x88, FIL_, 0x01, 0xE0, 0xB3, 0x95,
+ FIL_, 0xE0, 0xB3, 0x80, FIL_, 0x03, 0xE0, 0xB3,
+ 0x82, FIL_, 0xE0, 0xB3, 0x8A, FIL_, 0xE0, 0xB3,
+ 0x96, FIL_, 0xE0, 0xB3, 0x88, FIL_, 0xE0, 0xB3,
+ 0x95, FIL_, 0xE0, 0xB3, 0x87, FIL_, 0x01, 0xE0,
+ 0xB3, 0x95, FIL_, 0xE0, 0xB3, 0x8B, FIL_, 0x02,
+ 0xE0, 0xB4, 0xBE, FIL_, 0xE0, 0xB5, 0x8A, FIL_,
+ 0xE0, 0xB5, 0x97, FIL_, 0xE0, 0xB5, 0x8C, FIL_,
+ 0x01, 0xE0, 0xB4, 0xBE, FIL_, 0xE0, 0xB5, 0x8B,
+ FIL_, 0x03, 0xE0, 0xB7, 0x9F, FIL_, 0xE0, 0xB7,
+ 0x9E, FIL_, 0xE0, 0xB7, 0x8A, FIL_, 0xE0, 0xB7,
+ 0x9A, FIL_, 0xE0, 0xB7, 0x8F, FIL_, 0xE0, 0xB7,
+ 0x9C, FIL_, 0x01, 0xE0, 0xB7, 0x8A, FIL_, 0xE0,
+ 0xB7, 0x9D, FIL_, 0x01, 0xE1, 0x80, 0xAE, FIL_,
+ 0xE1, 0x80, 0xA6, FIL_, 0x01, 0xE1, 0xAC, 0xB5,
+ FIL_, 0xE1, 0xAC, 0x86, FIL_, 0x01, 0xE1, 0xAC,
+ 0xB5, FIL_, 0xE1, 0xAC, 0x88, FIL_, 0x01, 0xE1,
+ 0xAC, 0xB5, FIL_, 0xE1, 0xAC, 0x8A, FIL_, 0x01,
+ 0xE1, 0xAC, 0xB5, FIL_, 0xE1, 0xAC, 0x8C, FIL_,
+ 0x01, 0xE1, 0xAC, 0xB5, FIL_, 0xE1, 0xAC, 0x8E,
+ FIL_, 0x01, 0xE1, 0xAC, 0xB5, FIL_, 0xE1, 0xAC,
+ 0x92, FIL_, 0x01, 0xE1, 0xAC, 0xB5, FIL_, 0xE1,
+ 0xAC, 0xBB, FIL_, 0x01, 0xE1, 0xAC, 0xB5, FIL_,
+ 0xE1, 0xAC, 0xBD, FIL_, 0x01, 0xE1, 0xAC, 0xB5,
+ FIL_, 0xE1, 0xAD, 0x80, FIL_, 0x01, 0xE1, 0xAC,
+ 0xB5, FIL_, 0xE1, 0xAD, 0x81, FIL_, 0x01, 0xE1,
+ 0xAC, 0xB5, FIL_, 0xE1, 0xAD, 0x83, FIL_, 0x01,
+ 0xCC, 0x84, FIL_, 0xE1, 0xB8, 0xB8, FIL_, 0x01,
+ 0xCC, 0x84, FIL_, 0xE1, 0xB8, 0xB9, FIL_, 0x01,
+ 0xCC, 0x84, FIL_, 0xE1, 0xB9, 0x9C, FIL_, 0x01,
+ 0xCC, 0x84, FIL_, 0xE1, 0xB9, 0x9D, FIL_, 0x01,
+ 0xCC, 0x87, FIL_, 0xE1, 0xB9, 0xA8, FIL_, 0x01,
+ 0xCC, 0x87, FIL_, 0xE1, 0xB9, 0xA9, FIL_, 0x02,
+ 0xCC, 0x86, FIL_, 0xE1, 0xBA, 0xB6, FIL_, 0xCC,
+ 0x82, FIL_, 0xE1, 0xBA, 0xAC, FIL_, 0x02, 0xCC,
+ 0x82, FIL_, 0xE1, 0xBA, 0xAD, FIL_, 0xCC, 0x86,
+ FIL_, 0xE1, 0xBA, 0xB7, FIL_, 0x01, 0xCC, 0x82,
+ FIL_, 0xE1, 0xBB, 0x86, FIL_, 0x01, 0xCC, 0x82,
+ FIL_, 0xE1, 0xBB, 0x87, FIL_, 0x01, 0xCC, 0x82,
+ FIL_, 0xE1, 0xBB, 0x98, FIL_, 0x01, 0xCC, 0x82,
+ FIL_, 0xE1, 0xBB, 0x99, FIL_, 0x04, 0xCD, 0x85,
+ FIL_, 0xE1, 0xBE, 0x80, FIL_, 0xCD, 0x82, FIL_,
+ 0xE1, 0xBC, 0x86, FIL_, 0xCC, 0x80, FIL_, 0xE1,
+ 0xBC, 0x82, FIL_, 0xCC, 0x81, FIL_, 0xE1, 0xBC,
+ 0x84, FIL_, 0x04, 0xCD, 0x82, FIL_, 0xE1, 0xBC,
+ 0x87, FIL_, 0xCC, 0x81, FIL_, 0xE1, 0xBC, 0x85,
+ FIL_, 0xCC, 0x80, FIL_, 0xE1, 0xBC, 0x83, FIL_,
+ 0xCD, 0x85, FIL_, 0xE1, 0xBE, 0x81, FIL_, 0x01,
+ 0xCD, 0x85, FIL_, 0xE1, 0xBE, 0x82, FIL_, 0x01,
+ 0xCD, 0x85, FIL_, 0xE1, 0xBE, 0x83, FIL_, 0x01,
+ 0xCD, 0x85, FIL_, 0xE1, 0xBE, 0x84, FIL_, 0x01,
+ 0xCD, 0x85, FIL_, 0xE1, 0xBE, 0x85, FIL_, 0x01,
+ 0xCD, 0x85, FIL_, 0xE1, 0xBE, 0x86, FIL_, 0x01,
+ 0xCD, 0x85, FIL_, 0xE1, 0xBE, 0x87, FIL_, 0x04,
+ 0xCC, 0x81, FIL_, 0xE1, 0xBC, 0x8C, FIL_, 0xCC,
+ 0x80, FIL_, 0xE1, 0xBC, 0x8A, FIL_, 0xCD, 0x85,
+ FIL_, 0xE1, 0xBE, 0x88, FIL_, 0xCD, 0x82, FIL_,
+ 0xE1, 0xBC, 0x8E, FIL_, 0x04, 0xCC, 0x80, FIL_,
+ 0xE1, 0xBC, 0x8B, FIL_, 0xCD, 0x82, FIL_, 0xE1,
+ 0xBC, 0x8F, FIL_, 0xCC, 0x81, FIL_, 0xE1, 0xBC,
+ 0x8D, FIL_, 0xCD, 0x85, FIL_, 0xE1, 0xBE, 0x89,
+ FIL_, 0x01, 0xCD, 0x85, FIL_, 0xE1, 0xBE, 0x8A,
+ FIL_, 0x01, 0xCD, 0x85, FIL_, 0xE1, 0xBE, 0x8B,
+ FIL_, 0x01, 0xCD, 0x85, FIL_, 0xE1, 0xBE, 0x8C,
+ FIL_, 0x01, 0xCD, 0x85, FIL_, 0xE1, 0xBE, 0x8D,
+ FIL_, 0x01, 0xCD, 0x85, FIL_, 0xE1, 0xBE, 0x8E,
+ FIL_, 0x01, 0xCD, 0x85, FIL_, 0xE1, 0xBE, 0x8F,
+ FIL_, 0x02, 0xCC, 0x80, FIL_, 0xE1, 0xBC, 0x92,
+ FIL_, 0xCC, 0x81, FIL_, 0xE1, 0xBC, 0x94, FIL_,
+ 0x02, 0xCC, 0x80, FIL_, 0xE1, 0xBC, 0x93, FIL_,
+ 0xCC, 0x81, FIL_, 0xE1, 0xBC, 0x95, FIL_, 0x02,
+ 0xCC, 0x80, FIL_, 0xE1, 0xBC, 0x9A, FIL_, 0xCC,
+ 0x81, FIL_, 0xE1, 0xBC, 0x9C, FIL_, 0x02, 0xCC,
+ 0x80, FIL_, 0xE1, 0xBC, 0x9B, FIL_, 0xCC, 0x81,
+ FIL_, 0xE1, 0xBC, 0x9D, FIL_, 0x04, 0xCC, 0x80,
+ FIL_, 0xE1, 0xBC, 0xA2, FIL_, 0xCC, 0x81, FIL_,
+ 0xE1, 0xBC, 0xA4, FIL_, 0xCD, 0x82, FIL_, 0xE1,
+ 0xBC, 0xA6, FIL_, 0xCD, 0x85, FIL_, 0xE1, 0xBE,
+ 0x90, FIL_, 0x04, 0xCD, 0x85, FIL_, 0xE1, 0xBE,
+ 0x91, FIL_, 0xCC, 0x81, FIL_, 0xE1, 0xBC, 0xA5,
+ FIL_, 0xCD, 0x82, FIL_, 0xE1, 0xBC, 0xA7, FIL_,
+ 0xCC, 0x80, FIL_, 0xE1, 0xBC, 0xA3, FIL_, 0x01,
+ 0xCD, 0x85, FIL_, 0xE1, 0xBE, 0x92, FIL_, 0x01,
+ 0xCD, 0x85, FIL_, 0xE1, 0xBE, 0x93, FIL_, 0x01,
+ 0xCD, 0x85, FIL_, 0xE1, 0xBE, 0x94, FIL_, 0x01,
+ 0xCD, 0x85, FIL_, 0xE1, 0xBE, 0x95, FIL_, 0x01,
+ 0xCD, 0x85, FIL_, 0xE1, 0xBE, 0x96, FIL_, 0x01,
+ 0xCD, 0x85, FIL_, 0xE1, 0xBE, 0x97, FIL_, 0x04,
+ 0xCC, 0x81, FIL_, 0xE1, 0xBC, 0xAC, FIL_, 0xCC,
+ 0x80, FIL_, 0xE1, 0xBC, 0xAA, FIL_, 0xCD, 0x85,
+ FIL_, 0xE1, 0xBE, 0x98, FIL_, 0xCD, 0x82, FIL_,
+ 0xE1, 0xBC, 0xAE, FIL_, 0x04, 0xCD, 0x82, FIL_,
+ 0xE1, 0xBC, 0xAF, FIL_, 0xCD, 0x85, FIL_, 0xE1,
+ 0xBE, 0x99, FIL_, 0xCC, 0x81, FIL_, 0xE1, 0xBC,
+ 0xAD, FIL_, 0xCC, 0x80, FIL_, 0xE1, 0xBC, 0xAB,
+ FIL_, 0x01, 0xCD, 0x85, FIL_, 0xE1, 0xBE, 0x9A,
+ FIL_, 0x01, 0xCD, 0x85, FIL_, 0xE1, 0xBE, 0x9B,
+ FIL_, 0x01, 0xCD, 0x85, FIL_, 0xE1, 0xBE, 0x9C,
+ FIL_, 0x01, 0xCD, 0x85, FIL_, 0xE1, 0xBE, 0x9D,
+ FIL_, 0x01, 0xCD, 0x85, FIL_, 0xE1, 0xBE, 0x9E,
+ FIL_, 0x01, 0xCD, 0x85, FIL_, 0xE1, 0xBE, 0x9F,
+ FIL_, 0x03, 0xCC, 0x81, FIL_, 0xE1, 0xBC, 0xB4,
+ FIL_, 0xCC, 0x80, FIL_, 0xE1, 0xBC, 0xB2, FIL_,
+ 0xCD, 0x82, FIL_, 0xE1, 0xBC, 0xB6, FIL_, 0x03,
+ 0xCC, 0x80, FIL_, 0xE1, 0xBC, 0xB3, FIL_, 0xCD,
+ 0x82, FIL_, 0xE1, 0xBC, 0xB7, FIL_, 0xCC, 0x81,
+ FIL_, 0xE1, 0xBC, 0xB5, FIL_, 0x03, 0xCC, 0x81,
+ FIL_, 0xE1, 0xBC, 0xBC, FIL_, 0xCC, 0x80, FIL_,
+ 0xE1, 0xBC, 0xBA, FIL_, 0xCD, 0x82, FIL_, 0xE1,
+ 0xBC, 0xBE, FIL_, 0x03, 0xCC, 0x80, FIL_, 0xE1,
+ 0xBC, 0xBB, FIL_, 0xCD, 0x82, FIL_, 0xE1, 0xBC,
+ 0xBF, FIL_, 0xCC, 0x81, FIL_, 0xE1, 0xBC, 0xBD,
+ FIL_, 0x02, 0xCC, 0x80, FIL_, 0xE1, 0xBD, 0x82,
+ FIL_, 0xCC, 0x81, FIL_, 0xE1, 0xBD, 0x84, FIL_,
+ 0x02, 0xCC, 0x81, FIL_, 0xE1, 0xBD, 0x85, FIL_,
+ 0xCC, 0x80, FIL_, 0xE1, 0xBD, 0x83, FIL_, 0x02,
+ 0xCC, 0x80, FIL_, 0xE1, 0xBD, 0x8A, FIL_, 0xCC,
+ 0x81, FIL_, 0xE1, 0xBD, 0x8C, FIL_, 0x02, 0xCC,
+ 0x80, FIL_, 0xE1, 0xBD, 0x8B, FIL_, 0xCC, 0x81,
+ FIL_, 0xE1, 0xBD, 0x8D, FIL_, 0x03, 0xCD, 0x82,
+ FIL_, 0xE1, 0xBD, 0x96, FIL_, 0xCC, 0x80, FIL_,
+ 0xE1, 0xBD, 0x92, FIL_, 0xCC, 0x81, FIL_, 0xE1,
+ 0xBD, 0x94, FIL_, 0x03, 0xCC, 0x80, FIL_, 0xE1,
+ 0xBD, 0x93, FIL_, 0xCD, 0x82, FIL_, 0xE1, 0xBD,
+ 0x97, FIL_, 0xCC, 0x81, FIL_, 0xE1, 0xBD, 0x95,
+ FIL_, 0x03, 0xCC, 0x80, FIL_, 0xE1, 0xBD, 0x9B,
+ FIL_, 0xCD, 0x82, FIL_, 0xE1, 0xBD, 0x9F, FIL_,
+ 0xCC, 0x81, FIL_, 0xE1, 0xBD, 0x9D, FIL_, 0x04,
+ 0xCD, 0x82, FIL_, 0xE1, 0xBD, 0xA6, FIL_, 0xCD,
+ 0x85, FIL_, 0xE1, 0xBE, 0xA0, FIL_, 0xCC, 0x80,
+ FIL_, 0xE1, 0xBD, 0xA2, FIL_, 0xCC, 0x81, FIL_,
+ 0xE1, 0xBD, 0xA4, FIL_, 0x04, 0xCD, 0x85, FIL_,
+ 0xE1, 0xBE, 0xA1, FIL_, 0xCD, 0x82, FIL_, 0xE1,
+ 0xBD, 0xA7, FIL_, 0xCC, 0x81, FIL_, 0xE1, 0xBD,
+ 0xA5, FIL_, 0xCC, 0x80, FIL_, 0xE1, 0xBD, 0xA3,
+ FIL_, 0x01, 0xCD, 0x85, FIL_, 0xE1, 0xBE, 0xA2,
+ FIL_, 0x01, 0xCD, 0x85, FIL_, 0xE1, 0xBE, 0xA3,
+ FIL_, 0x01, 0xCD, 0x85, FIL_, 0xE1, 0xBE, 0xA4,
+ FIL_, 0x01, 0xCD, 0x85, FIL_, 0xE1, 0xBE, 0xA5,
+ FIL_, 0x01, 0xCD, 0x85, FIL_, 0xE1, 0xBE, 0xA6,
+ FIL_, 0x01, 0xCD, 0x85, FIL_, 0xE1, 0xBE, 0xA7,
+ FIL_, 0x04, 0xCC, 0x80, FIL_, 0xE1, 0xBD, 0xAA,
+ FIL_, 0xCC, 0x81, FIL_, 0xE1, 0xBD, 0xAC, FIL_,
+ 0xCD, 0x82, FIL_, 0xE1, 0xBD, 0xAE, FIL_, 0xCD,
+ 0x85, FIL_, 0xE1, 0xBE, 0xA8, FIL_, 0x04, 0xCD,
+ 0x82, FIL_, 0xE1, 0xBD, 0xAF, FIL_, 0xCC, 0x80,
+ FIL_, 0xE1, 0xBD, 0xAB, FIL_, 0xCD, 0x85, FIL_,
+ 0xE1, 0xBE, 0xA9, FIL_, 0xCC, 0x81, FIL_, 0xE1,
+ 0xBD, 0xAD, FIL_, 0x01, 0xCD, 0x85, FIL_, 0xE1,
+ 0xBE, 0xAA, FIL_, 0x01, 0xCD, 0x85, FIL_, 0xE1,
+ 0xBE, 0xAB, FIL_, 0x01, 0xCD, 0x85, FIL_, 0xE1,
+ 0xBE, 0xAC, FIL_, 0x01, 0xCD, 0x85, FIL_, 0xE1,
+ 0xBE, 0xAD, FIL_, 0x01, 0xCD, 0x85, FIL_, 0xE1,
+ 0xBE, 0xAE, FIL_, 0x01, 0xCD, 0x85, FIL_, 0xE1,
+ 0xBE, 0xAF, FIL_, 0x01, 0xCD, 0x85, FIL_, 0xE1,
+ 0xBE, 0xB2, FIL_, 0x01, 0xCD, 0x85, FIL_, 0xE1,
+ 0xBF, 0x82, FIL_, 0x01, 0xCD, 0x85, FIL_, 0xE1,
+ 0xBF, 0xB2, FIL_, 0x01, 0xCD, 0x85, FIL_, 0xE1,
+ 0xBE, 0xB7, FIL_, 0x03, 0xCC, 0x81, FIL_, 0xE1,
+ 0xBF, 0x8E, FIL_, 0xCC, 0x80, FIL_, 0xE1, 0xBF,
+ 0x8D, FIL_, 0xCD, 0x82, FIL_, 0xE1, 0xBF, 0x8F,
+ FIL_, 0x01, 0xCD, 0x85, FIL_, 0xE1, 0xBF, 0x87,
+ FIL_, 0x01, 0xCD, 0x85, FIL_, 0xE1, 0xBF, 0xB7,
+ FIL_, 0x03, 0xCC, 0x80, FIL_, 0xE1, 0xBF, 0x9D,
+ FIL_, 0xCC, 0x81, FIL_, 0xE1, 0xBF, 0x9E, FIL_,
+ 0xCD, 0x82, FIL_, 0xE1, 0xBF, 0x9F, FIL_, 0x01,
+ 0xCC, 0xB8, FIL_, 0xE2, 0x86, 0x9A, FIL_, 0x01,
+ 0xCC, 0xB8, FIL_, 0xE2, 0x86, 0x9B, FIL_, 0x01,
+ 0xCC, 0xB8, FIL_, 0xE2, 0x86, 0xAE, FIL_, 0x01,
+ 0xCC, 0xB8, FIL_, 0xE2, 0x87, 0x8D, FIL_, 0x01,
+ 0xCC, 0xB8, FIL_, 0xE2, 0x87, 0x8F, FIL_, 0x01,
+ 0xCC, 0xB8, FIL_, 0xE2, 0x87, 0x8E, FIL_, 0x01,
+ 0xCC, 0xB8, FIL_, 0xE2, 0x88, 0x84, FIL_, 0x01,
+ 0xCC, 0xB8, FIL_, 0xE2, 0x88, 0x89, FIL_, 0x01,
+ 0xCC, 0xB8, FIL_, 0xE2, 0x88, 0x8C, FIL_, 0x01,
+ 0xCC, 0xB8, FIL_, 0xE2, 0x88, 0xA4, FIL_, 0x01,
+ 0xCC, 0xB8, FIL_, 0xE2, 0x88, 0xA6, FIL_, 0x01,
+ 0xCC, 0xB8, FIL_, 0xE2, 0x89, 0x81, FIL_, 0x01,
+ 0xCC, 0xB8, FIL_, 0xE2, 0x89, 0x84, FIL_, 0x01,
+ 0xCC, 0xB8, FIL_, 0xE2, 0x89, 0x87, FIL_, 0x01,
+ 0xCC, 0xB8, FIL_, 0xE2, 0x89, 0x89, FIL_, 0x01,
+ 0xCC, 0xB8, FIL_, 0xE2, 0x89, 0xAD, FIL_, 0x01,
+ 0xCC, 0xB8, FIL_, 0xE2, 0x89, 0xA2, FIL_, 0x01,
+ 0xCC, 0xB8, FIL_, 0xE2, 0x89, 0xB0, FIL_, 0x01,
+ 0xCC, 0xB8, FIL_, 0xE2, 0x89, 0xB1, FIL_, 0x01,
+ 0xCC, 0xB8, FIL_, 0xE2, 0x89, 0xB4, FIL_, 0x01,
+ 0xCC, 0xB8, FIL_, 0xE2, 0x89, 0xB5, FIL_, 0x01,
+ 0xCC, 0xB8, FIL_, 0xE2, 0x89, 0xB8, FIL_, 0x01,
+ 0xCC, 0xB8, FIL_, 0xE2, 0x89, 0xB9, FIL_, 0x01,
+ 0xCC, 0xB8, FIL_, 0xE2, 0x8A, 0x80, FIL_, 0x01,
+ 0xCC, 0xB8, FIL_, 0xE2, 0x8A, 0x81, FIL_, 0x01,
+ 0xCC, 0xB8, FIL_, 0xE2, 0x8B, 0xA0, FIL_, 0x01,
+ 0xCC, 0xB8, FIL_, 0xE2, 0x8B, 0xA1, FIL_, 0x01,
+ 0xCC, 0xB8, FIL_, 0xE2, 0x8A, 0x84, FIL_, 0x01,
+ 0xCC, 0xB8, FIL_, 0xE2, 0x8A, 0x85, FIL_, 0x01,
+ 0xCC, 0xB8, FIL_, 0xE2, 0x8A, 0x88, FIL_, 0x01,
+ 0xCC, 0xB8, FIL_, 0xE2, 0x8A, 0x89, FIL_, 0x01,
+ 0xCC, 0xB8, FIL_, 0xE2, 0x8B, 0xA2, FIL_, 0x01,
+ 0xCC, 0xB8, FIL_, 0xE2, 0x8B, 0xA3, FIL_, 0x01,
+ 0xCC, 0xB8, FIL_, 0xE2, 0x8A, 0xAC, FIL_, 0x01,
+ 0xCC, 0xB8, FIL_, 0xE2, 0x8A, 0xAD, FIL_, 0x01,
+ 0xCC, 0xB8, FIL_, 0xE2, 0x8A, 0xAE, FIL_, 0x01,
+ 0xCC, 0xB8, FIL_, 0xE2, 0x8A, 0xAF, FIL_, 0x01,
+ 0xCC, 0xB8, FIL_, 0xE2, 0x8B, 0xAA, FIL_, 0x01,
+ 0xCC, 0xB8, FIL_, 0xE2, 0x8B, 0xAB, FIL_, 0x01,
+ 0xCC, 0xB8, FIL_, 0xE2, 0x8B, 0xAC, FIL_, 0x01,
+ 0xCC, 0xB8, FIL_, 0xE2, 0x8B, 0xAD, FIL_, 0x01,
+ 0xE3, 0x82, 0x99, FIL_, 0xE3, 0x82, 0x94, FIL_,
+ 0x01, 0xE3, 0x82, 0x99, FIL_, 0xE3, 0x81, 0x8C,
+ FIL_, 0x01, 0xE3, 0x82, 0x99, FIL_, 0xE3, 0x81,
+ 0x8E, FIL_, 0x01, 0xE3, 0x82, 0x99, FIL_, 0xE3,
+ 0x81, 0x90, FIL_, 0x01, 0xE3, 0x82, 0x99, FIL_,
+ 0xE3, 0x81, 0x92, FIL_, 0x01, 0xE3, 0x82, 0x99,
+ FIL_, 0xE3, 0x81, 0x94, FIL_, 0x01, 0xE3, 0x82,
+ 0x99, FIL_, 0xE3, 0x81, 0x96, FIL_, 0x01, 0xE3,
+ 0x82, 0x99, FIL_, 0xE3, 0x81, 0x98, FIL_, 0x01,
+ 0xE3, 0x82, 0x99, FIL_, 0xE3, 0x81, 0x9A, FIL_,
+ 0x01, 0xE3, 0x82, 0x99, FIL_, 0xE3, 0x81, 0x9C,
+ FIL_, 0x01, 0xE3, 0x82, 0x99, FIL_, 0xE3, 0x81,
+ 0x9E, FIL_, 0x01, 0xE3, 0x82, 0x99, FIL_, 0xE3,
+ 0x81, 0xA0, FIL_, 0x01, 0xE3, 0x82, 0x99, FIL_,
+ 0xE3, 0x81, 0xA2, FIL_, 0x01, 0xE3, 0x82, 0x99,
+ FIL_, 0xE3, 0x81, 0xA5, FIL_, 0x01, 0xE3, 0x82,
+ 0x99, FIL_, 0xE3, 0x81, 0xA7, FIL_, 0x01, 0xE3,
+ 0x82, 0x99, FIL_, 0xE3, 0x81, 0xA9, FIL_, 0x02,
+ 0xE3, 0x82, 0x9A, FIL_, 0xE3, 0x81, 0xB1, FIL_,
+ 0xE3, 0x82, 0x99, FIL_, 0xE3, 0x81, 0xB0, FIL_,
+ 0x02, 0xE3, 0x82, 0x99, FIL_, 0xE3, 0x81, 0xB3,
+ FIL_, 0xE3, 0x82, 0x9A, FIL_, 0xE3, 0x81, 0xB4,
+ FIL_, 0x02, 0xE3, 0x82, 0x99, FIL_, 0xE3, 0x81,
+ 0xB6, FIL_, 0xE3, 0x82, 0x9A, FIL_, 0xE3, 0x81,
+ 0xB7, FIL_, 0x02, 0xE3, 0x82, 0x9A, FIL_, 0xE3,
+ 0x81, 0xBA, FIL_, 0xE3, 0x82, 0x99, FIL_, 0xE3,
+ 0x81, 0xB9, FIL_, 0x02, 0xE3, 0x82, 0x9A, FIL_,
+ 0xE3, 0x81, 0xBD, FIL_, 0xE3, 0x82, 0x99, FIL_,
+ 0xE3, 0x81, 0xBC, FIL_, 0x01, 0xE3, 0x82, 0x99,
+ FIL_, 0xE3, 0x82, 0x9E, FIL_, 0x01, 0xE3, 0x82,
+ 0x99, FIL_, 0xE3, 0x83, 0xB4, FIL_, 0x01, 0xE3,
+ 0x82, 0x99, FIL_, 0xE3, 0x82, 0xAC, FIL_, 0x01,
+ 0xE3, 0x82, 0x99, FIL_, 0xE3, 0x82, 0xAE, FIL_,
+ 0x01, 0xE3, 0x82, 0x99, FIL_, 0xE3, 0x82, 0xB0,
+ FIL_, 0x01, 0xE3, 0x82, 0x99, FIL_, 0xE3, 0x82,
+ 0xB2, FIL_, 0x01, 0xE3, 0x82, 0x99, FIL_, 0xE3,
+ 0x82, 0xB4, FIL_, 0x01, 0xE3, 0x82, 0x99, FIL_,
+ 0xE3, 0x82, 0xB6, FIL_, 0x01, 0xE3, 0x82, 0x99,
+ FIL_, 0xE3, 0x82, 0xB8, FIL_, 0x01, 0xE3, 0x82,
+ 0x99, FIL_, 0xE3, 0x82, 0xBA, FIL_, 0x01, 0xE3,
+ 0x82, 0x99, FIL_, 0xE3, 0x82, 0xBC, FIL_, 0x01,
+ 0xE3, 0x82, 0x99, FIL_, 0xE3, 0x82, 0xBE, FIL_,
+ 0x01, 0xE3, 0x82, 0x99, FIL_, 0xE3, 0x83, 0x80,
+ FIL_, 0x01, 0xE3, 0x82, 0x99, FIL_, 0xE3, 0x83,
+ 0x82, FIL_, 0x01, 0xE3, 0x82, 0x99, FIL_, 0xE3,
+ 0x83, 0x85, FIL_, 0x01, 0xE3, 0x82, 0x99, FIL_,
+ 0xE3, 0x83, 0x87, FIL_, 0x01, 0xE3, 0x82, 0x99,
+ FIL_, 0xE3, 0x83, 0x89, FIL_, 0x02, 0xE3, 0x82,
+ 0x99, FIL_, 0xE3, 0x83, 0x90, FIL_, 0xE3, 0x82,
+ 0x9A, FIL_, 0xE3, 0x83, 0x91, FIL_, 0x02, 0xE3,
+ 0x82, 0x99, FIL_, 0xE3, 0x83, 0x93, FIL_, 0xE3,
+ 0x82, 0x9A, FIL_, 0xE3, 0x83, 0x94, FIL_, 0x02,
+ 0xE3, 0x82, 0x9A, FIL_, 0xE3, 0x83, 0x97, FIL_,
+ 0xE3, 0x82, 0x99, FIL_, 0xE3, 0x83, 0x96, FIL_,
+ 0x02, 0xE3, 0x82, 0x9A, FIL_, 0xE3, 0x83, 0x9A,
+ FIL_, 0xE3, 0x82, 0x99, FIL_, 0xE3, 0x83, 0x99,
+ FIL_, 0x02, 0xE3, 0x82, 0x99, FIL_, 0xE3, 0x83,
+ 0x9C, FIL_, 0xE3, 0x82, 0x9A, FIL_, 0xE3, 0x83,
+ 0x9D, FIL_, 0x01, 0xE3, 0x82, 0x99, FIL_, 0xE3,
+ 0x83, 0xB7, FIL_, 0x01, 0xE3, 0x82, 0x99, FIL_,
+ 0xE3, 0x83, 0xB8, FIL_, 0x01, 0xE3, 0x82, 0x99,
+ FIL_, 0xE3, 0x83, 0xB9, FIL_, 0x01, 0xE3, 0x82,
+ 0x99, FIL_, 0xE3, 0x83, 0xBA, FIL_, 0x01, 0xE3,
+ 0x82, 0x99, FIL_, 0xE3, 0x83, 0xBE, FIL_,
+ },
+};
+
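The byte stream closed just above is packed table data, not executable code, but its shape is worth noting. Each entry appears to begin with a count byte, followed by that many records, where a record is a combining mark's UTF-8 bytes and the resulting precomposed character's UTF-8 bytes, each terminated by a single FIL_ filler byte. For example, the last record above reads 0x01, then 0xE3 0x82 0x99 (U+3099, the combining voiced sound mark), FIL_, then 0xE3 0x83 0xBE (U+30FE, the voiced katakana iteration mark), FIL_: one mark, one composed result. This reading is inferred from the data's shape and from checking a few records against the Unicode composition pairs; it is not stated anywhere in the patch itself.

A minimal sketch of stepping through such FIL_-terminated sequences, under that assumption (FIL_ is the filler macro defined earlier in the file, outside this hunk; next_seq is an illustrative helper, not part of u8_textprep.c):

	/*
	 * Step over one FIL_-terminated UTF-8 sequence and return a
	 * pointer to the start of the next one.  Assumes the record
	 * layout inferred above.
	 */
	static const uchar_t *
	next_seq(const uchar_t *p)
	{
		while (*p != FIL_)	/* consume the UTF-8 bytes */
			p++;
		return (p + 1);		/* skip the FIL_ terminator */
	}

Reading one entry is then: take the count byte, and for each of the count records call next_seq() twice, once for the combining mark and once for the composed result.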
+static const uchar_t u8_decomp_b2_tbl[2][2][256] = {
+ {
+ {
+ 0, N_, N_, N_, N_, N_, N_, N_,
+ N_, N_, N_, N_, N_, N_, N_, N_,
+ N_, N_, N_, N_, N_, N_, N_, N_,
+ N_, N_, N_, N_, N_, N_, N_, N_,
+ N_, N_, N_, N_, N_, N_, N_, N_,
+ N_, N_, N_, N_, N_, N_, N_, N_,
+ N_, N_, N_, N_, N_, N_, N_, N_,
+ N_, N_, N_, N_, N_, N_, N_, N_,
+ N_, N_, N_, N_, N_, N_, N_, N_,
+ N_, N_, N_, N_, N_, N_, N_, N_,
+ N_, N_, N_, N_, N_, N_, N_, N_,
+ N_, N_, N_, N_, N_, N_, N_, N_,
+ N_, N_, N_, N_, N_, N_, N_, N_,
+ N_, N_, N_, N_, N_, N_, N_, N_,
+ N_, N_, N_, N_, N_, N_, N_, N_,
+ N_, N_, N_, N_, N_, N_, N_, N_,
+ N_, N_, N_, N_, N_, N_, N_, N_,
+ N_, N_, N_, N_, N_, N_, N_, N_,
+ N_, N_, N_, N_, N_, N_, N_, N_,
+ N_, N_, N_, N_, N_, N_, N_, N_,
+ N_, N_, N_, N_, N_, N_, N_, N_,
+ N_, N_, N_, N_, N_, N_, N_, N_,
+ N_, N_, N_, N_, N_, N_, N_, N_,
+ N_, N_, N_, N_, N_, N_, N_, N_,
+ N_, N_, N_, N_, N_, N_, N_, N_,
+ N_, N_, N_, N_, N_, N_, N_, N_,
+ N_, N_, N_, N_, N_, N_, N_, N_,
+ N_, N_, N_, N_, N_, N_, N_, N_,
+ 1, 2, 3, 4, N_, N_, N_, N_,
+ N_, N_, N_, N_, N_, N_, N_, 5,
+ N_, N_, N_, N_, N_, N_, N_, N_,
+ N_, N_, N_, N_, N_, N_, N_, N_,
+ },
+ {
+ N_, N_, N_, N_, N_, N_, N_, N_,
+ N_, N_, N_, N_, N_, N_, N_, N_,
+ N_, N_, N_, N_, N_, N_, N_, N_,
+ N_, N_, N_, N_, N_, N_, N_, N_,
+ N_, N_, N_, N_, N_, N_, N_, N_,
+ N_, N_, N_, N_, N_, N_, N_, N_,
+ N_, N_, N_, N_, N_, N_, N_, N_,
+ N_, N_, N_, N_, N_, N_, N_, N_,
+ N_, N_, N_, N_, N_, N_, N_, N_,
+ N_, N_, N_, N_, N_, N_, N_, N_,
+ N_, N_, N_, N_, N_, N_, N_, N_,
+ N_, N_, N_, N_, N_, N_, N_, N_,
+ N_, N_, N_, N_, N_, N_, N_, N_,
+ N_, N_, N_, N_, N_, N_, N_, N_,
+ N_, N_, N_, N_, N_, N_, N_, N_,
+ N_, N_, N_, N_, N_, N_, N_, N_,
+ N_, N_, N_, N_, N_, N_, N_, N_,
+ N_, N_, N_, N_, N_, N_, N_, N_,
+ N_, N_, N_, N_, N_, N_, N_, N_,
+ N_, N_, N_, N_, N_, 6, N_, N_,
+ N_, N_, N_, N_, N_, N_, N_, N_,
+ N_, N_, N_, N_, N_, N_, N_, 7,
+ N_, N_, N_, N_, N_, N_, N_, N_,
+ N_, N_, N_, N_, N_, N_, N_, N_,
+ N_, N_, N_, N_, N_, N_, N_, N_,
+ N_, N_, N_, N_, N_, N_, N_, N_,
+ N_, N_, N_, N_, N_, N_, N_, N_,
+ N_, N_, N_, N_, N_, N_, N_, N_,
+ N_, N_, N_, N_, N_, N_, N_, N_,
+ N_, N_, N_, N_, N_, N_, N_, N_,
+ N_, N_, N_, N_, N_, N_, N_, N_,
+ N_, N_, N_, N_, N_, N_, N_, N_,
+ },
+
+ },
+ {
+ {
+ 0, N_, N_, N_, N_, N_, N_, N_,
+ N_, N_, N_, N_, N_, N_, N_, N_,
+ N_, N_, N_, N_, N_, N_, N_, N_,
+ N_, N_, N_, N_, N_, N_, N_, N_,
+ N_, N_, N_, N_, N_, N_, N_, N_,
+ N_, N_, N_, N_, N_, N_, N_, N_,
+ N_, N_, N_, N_, N_, N_, N_, N_,
+ N_, N_, N_, N_, N_, N_, N_, N_,
+ N_, N_, N_, N_, N_, N_, N_, N_,
+ N_, N_, N_, N_, N_, N_, N_, N_,
+ N_, N_, N_, N_, N_, N_, N_, N_,
+ N_, N_, N_, N_, N_, N_, N_, N_,
+ N_, N_, N_, N_, N_, N_, N_, N_,
+ N_, N_, N_, N_, N_, N_, N_, N_,
+ N_, N_, N_, N_, N_, N_, N_, N_,
+ N_, N_, N_, N_, N_, N_, N_, N_,
+ N_, N_, N_, N_, N_, N_, N_, N_,
+ N_, N_, N_, N_, N_, N_, N_, N_,
+ N_, N_, N_, N_, N_, N_, N_, N_,
+ N_, N_, N_, N_, N_, N_, N_, N_,
+ N_, N_, N_, N_, N_, N_, N_, N_,
+ N_, N_, N_, N_, N_, N_, N_, N_,
+ N_, N_, N_, N_, N_, N_, N_, N_,
+ N_, N_, N_, N_, N_, N_, N_, N_,
+ N_, N_, N_, N_, N_, N_, N_, N_,
+ N_, N_, N_, N_, N_, N_, N_, N_,
+ N_, N_, N_, N_, N_, N_, N_, N_,
+ N_, N_, N_, N_, N_, N_, N_, N_,
+ 1, 2, 3, 4, N_, N_, N_, N_,
+ N_, N_, N_, N_, N_, N_, N_, 5,
+ N_, N_, N_, N_, N_, N_, N_, N_,
+ N_, N_, N_, N_, N_, N_, N_, N_,
+ },
+ {
+ N_, N_, N_, N_, N_, N_, N_, N_,
+ N_, N_, N_, N_, N_, N_, N_, N_,
+ N_, N_, N_, N_, N_, N_, N_, N_,
+ N_, N_, N_, N_, N_, N_, N_, N_,
+ N_, N_, N_, N_, N_, N_, N_, N_,
+ N_, N_, N_, N_, N_, N_, N_, N_,
+ N_, N_, N_, N_, N_, N_, N_, N_,
+ N_, N_, N_, N_, N_, N_, N_, N_,
+ N_, N_, N_, N_, N_, N_, N_, N_,
+ N_, N_, N_, N_, N_, N_, N_, N_,
+ N_, N_, N_, N_, N_, N_, N_, N_,
+ N_, N_, N_, N_, N_, N_, N_, N_,
+ N_, N_, N_, N_, N_, N_, N_, N_,
+ N_, N_, N_, N_, N_, N_, N_, N_,
+ N_, N_, N_, N_, N_, N_, N_, N_,
+ N_, N_, N_, N_, N_, N_, N_, N_,
+ N_, N_, N_, N_, N_, N_, N_, N_,
+ N_, N_, N_, N_, N_, N_, N_, N_,
+ N_, N_, N_, N_, N_, N_, N_, N_,
+ N_, N_, N_, N_, N_, 6, N_, N_,
+ N_, N_, N_, N_, N_, N_, N_, N_,
+ N_, N_, N_, N_, N_, N_, N_, 7,
+ N_, N_, N_, N_, N_, N_, N_, N_,
+ N_, N_, N_, N_, N_, N_, N_, N_,
+ N_, N_, N_, N_, N_, N_, N_, N_,
+ N_, N_, N_, N_, N_, N_, N_, N_,
+ N_, N_, N_, N_, N_, N_, N_, N_,
+ N_, N_, N_, N_, N_, N_, N_, N_,
+ N_, N_, N_, N_, N_, N_, N_, N_,
+ N_, N_, N_, N_, N_, N_, N_, N_,
+ N_, N_, N_, N_, N_, N_, N_, N_,
+ N_, N_, N_, N_, N_, N_, N_, N_,
+ },
+
+ },
+
+};
+
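u8_decomp_b2_tbl above and u8_decomp_b3_tbl below form the middle stages of a byte-indexed trie. The outermost [2] in both arrays selects between the two Unicode versions the module supports; the b2 table maps a character's second UTF-8 byte to a small row number (or the N_ sentinel for "no entry"), and the b3 table maps the third byte to a u8_displacement_t: a final-table id plus a byte offset into packed data such as the stream above. Entries whose id carries the 0x8000 bit (e.g. { 0x8000, 2165 }) evidently select 16-bit-indexed final tables. The sketch below is a hypothetical reconstruction of walking these two stages; the member names index/offset, the N_ comparisons, and the helper name are assumptions for illustration, not the file's actual lookup routine:

	/*
	 * Hypothetical sketch: resolve the displacement for a 3-byte
	 * UTF-8 character b1 b2 b3 under Unicode version uv (0 or 1).
	 * "b1_row" stands for whatever index the first-byte table (not
	 * shown in this hunk) assigned to b1.  Returns -1 when the trie
	 * has no entry for the character.
	 */
	static int
	decomp_offset(int uv, int b1_row, uchar_t b2, uchar_t b3)
	{
		uchar_t row;
		u8_displacement_t d;

		row = u8_decomp_b2_tbl[uv][b1_row][b2];
		if (row == (uchar_t)N_)		/* dead end at stage two */
			return (-1);

		d = u8_decomp_b3_tbl[uv][row][b3];
		if (d.index == N_)		/* dead end at stage three */
			return (-1);

		/*
		 * The real code additionally checks for the 0x8000 bit
		 * in the id to choose a 16-bit-indexed final table;
		 * this sketch ignores that and returns the raw offset.
		 */
		return ((int)d.offset);
	}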
+static const u8_displacement_t u8_decomp_b3_tbl[2][8][256] = {
+ {
+ { /* Third byte table 0. */
+ { N_, 0 }, { N_, 0 }, { N_, 0 },
+ { N_, 0 }, { N_, 0 }, { N_, 0 },
+ { N_, 0 }, { N_, 0 }, { N_, 0 },
+ { N_, 0 }, { N_, 0 }, { N_, 0 },
+ { N_, 0 }, { N_, 0 }, { N_, 0 },
+ { N_, 0 }, { N_, 0 }, { N_, 0 },
+ { N_, 0 }, { N_, 0 }, { N_, 0 },
+ { N_, 0 }, { N_, 0 }, { N_, 0 },
+ { N_, 0 }, { N_, 0 }, { N_, 0 },
+ { N_, 0 }, { N_, 0 }, { N_, 0 },
+ { N_, 0 }, { N_, 0 }, { N_, 0 },
+ { N_, 0 }, { N_, 0 }, { N_, 0 },
+ { N_, 0 }, { N_, 0 }, { N_, 0 },
+ { N_, 0 }, { N_, 0 }, { N_, 0 },
+ { N_, 0 }, { N_, 0 }, { N_, 0 },
+ { N_, 0 }, { N_, 0 }, { N_, 0 },
+ { N_, 0 }, { N_, 0 }, { N_, 0 },
+ { N_, 0 }, { N_, 0 }, { N_, 0 },
+ { N_, 0 }, { N_, 0 }, { N_, 0 },
+ { N_, 0 }, { N_, 0 }, { N_, 0 },
+ { N_, 0 }, { N_, 0 }, { N_, 0 },
+ { N_, 0 }, { N_, 0 }, { N_, 0 },
+ { N_, 0 }, { N_, 0 }, { N_, 0 },
+ { N_, 0 }, { N_, 0 }, { N_, 0 },
+ { N_, 0 }, { N_, 0 }, { N_, 0 },
+ { N_, 0 }, { N_, 0 }, { N_, 0 },
+ { N_, 0 }, { N_, 0 }, { N_, 0 },
+ { N_, 0 }, { N_, 0 }, { N_, 0 },
+ { N_, 0 }, { N_, 0 }, { N_, 0 },
+ { N_, 0 }, { N_, 0 }, { N_, 0 },
+ { N_, 0 }, { N_, 0 }, { N_, 0 },
+ { N_, 0 }, { N_, 0 }, { N_, 0 },
+ { N_, 0 }, { N_, 0 }, { N_, 0 },
+ { N_, 0 }, { N_, 0 }, { N_, 0 },
+ { N_, 0 }, { N_, 0 }, { N_, 0 },
+ { N_, 0 }, { N_, 0 }, { N_, 0 },
+ { N_, 0 }, { N_, 0 }, { N_, 0 },
+ { N_, 0 }, { N_, 0 }, { N_, 0 },
+ { N_, 0 }, { N_, 0 }, { N_, 0 },
+ { N_, 0 }, { N_, 0 }, { N_, 0 },
+ { N_, 0 }, { N_, 0 }, { N_, 0 },
+ { N_, 0 }, { N_, 0 }, { N_, 0 },
+ { N_, 0 }, { N_, 0 }, { N_, 0 },
+ { N_, 0 }, { N_, 0 }, { N_, 0 },
+ { N_, 0 }, { N_, 0 }, { N_, 0 },
+ { N_, 0 }, { N_, 0 }, { N_, 0 },
+ { N_, 0 }, { N_, 0 }, { N_, 0 },
+ { N_, 0 }, { N_, 0 }, { N_, 0 },
+ { N_, 0 }, { N_, 0 }, { N_, 0 },
+ { N_, 0 }, { N_, 0 }, { N_, 0 },
+ { N_, 0 }, { N_, 0 }, { N_, 0 },
+ { N_, 0 }, { N_, 0 }, { N_, 0 },
+ { N_, 0 }, { N_, 0 }, { N_, 0 },
+ { N_, 0 }, { N_, 0 }, { N_, 0 },
+ { N_, 0 }, { N_, 0 }, { N_, 0 },
+ { N_, 0 }, { N_, 0 }, { N_, 0 },
+ { N_, 0 }, { N_, 0 }, { N_, 0 },
+ { N_, 0 }, { N_, 0 }, { N_, 0 },
+ { N_, 0 }, { N_, 0 }, { N_, 0 },
+ { N_, 0 }, { N_, 0 }, { N_, 0 },
+ { N_, 0 }, { N_, 0 }, { N_, 0 },
+ { N_, 0 }, { N_, 0 }, { N_, 0 },
+ { N_, 0 }, { N_, 0 }, { N_, 0 },
+ { N_, 0 }, { N_, 0 }, { N_, 0 },
+ { N_, 0 }, { N_, 0 }, { 0, 0 },
+ { 1, 35 }, { 2, 247 }, { 3, 474 },
+ { 4, 693 }, { 5, 709 }, { 6, 951 },
+ { N_, 0 }, { 7, 1139 }, { 8, 1152 },
+ { N_, 0 }, { 9, 1177 }, { 10, 1199 },
+ { 11, 1295 }, { 12, 1360 }, { 13, 1405 },
+ { N_, 0 }, { 14, 1450 }, { N_, 0 },
+ { N_, 0 }, { 15, 1620 }, { N_, 0 },
+ { 16, 1624 }, { 17, 1649 }, { N_, 0 },
+ { 18, 1665 }, { N_, 0 }, { N_, 0 },
+ { N_, 0 }, { N_, 0 }, { N_, 0 },
+ { N_, 0 }, { N_, 0 }, { N_, 0 },
+ { N_, 0 }, { N_, 0 }, { N_, 0 },
+ { N_, 0 }, { N_, 0 }, { N_, 0 },
+ { N_, 0 }, { N_, 0 }, { N_, 0 },
+ { N_, 0 }, { N_, 0 }, { N_, 0 },
+ { N_, 0 }, { N_, 0 }, { N_, 0 },
+ { N_, 0 }, { N_, 0 }, { N_, 0 },
+ { N_, 0 }, { N_, 0 }, { N_, 0 },
+ { N_, 0 }, { N_, 0 }, { N_, 0 },
+ { N_, 0 }, { N_, 0 }, { N_, 0 },
+ { N_, 0 },
+ },
+ { /* Third byte table 1. */
+ { N_, 0 }, { N_, 0 }, { N_, 0 },
+ { N_, 0 }, { N_, 0 }, { N_, 0 },
+ { N_, 0 }, { N_, 0 }, { N_, 0 },
+ { N_, 0 }, { N_, 0 }, { N_, 0 },
+ { N_, 0 }, { N_, 0 }, { N_, 0 },
+ { N_, 0 }, { N_, 0 }, { N_, 0 },
+ { N_, 0 }, { N_, 0 }, { N_, 0 },
+ { N_, 0 }, { N_, 0 }, { N_, 0 },
+ { N_, 0 }, { N_, 0 }, { N_, 0 },
+ { N_, 0 }, { N_, 0 }, { N_, 0 },
+ { N_, 0 }, { N_, 0 }, { N_, 0 },
+ { N_, 0 }, { N_, 0 }, { N_, 0 },
+ { N_, 0 }, { N_, 0 }, { N_, 0 },
+ { N_, 0 }, { N_, 0 }, { N_, 0 },
+ { N_, 0 }, { N_, 0 }, { N_, 0 },
+ { N_, 0 }, { N_, 0 }, { N_, 0 },
+ { N_, 0 }, { N_, 0 }, { N_, 0 },
+ { N_, 0 }, { N_, 0 }, { N_, 0 },
+ { N_, 0 }, { N_, 0 }, { N_, 0 },
+ { N_, 0 }, { N_, 0 }, { N_, 0 },
+ { N_, 0 }, { N_, 0 }, { N_, 0 },
+ { N_, 0 }, { N_, 0 }, { N_, 0 },
+ { N_, 0 }, { N_, 0 }, { N_, 0 },
+ { N_, 0 }, { N_, 0 }, { N_, 0 },
+ { N_, 0 }, { N_, 0 }, { N_, 0 },
+ { N_, 0 }, { N_, 0 }, { N_, 0 },
+ { N_, 0 }, { N_, 0 }, { N_, 0 },
+ { N_, 0 }, { N_, 0 }, { N_, 0 },
+ { N_, 0 }, { N_, 0 }, { N_, 0 },
+ { N_, 0 }, { N_, 0 }, { N_, 0 },
+ { N_, 0 }, { N_, 0 }, { N_, 0 },
+ { N_, 0 }, { N_, 0 }, { N_, 0 },
+ { N_, 0 }, { N_, 0 }, { N_, 0 },
+ { N_, 0 }, { N_, 0 }, { N_, 0 },
+ { N_, 0 }, { N_, 0 }, { N_, 0 },
+ { N_, 0 }, { N_, 0 }, { N_, 0 },
+ { N_, 0 }, { N_, 0 }, { N_, 0 },
+ { N_, 0 }, { N_, 0 }, { N_, 0 },
+ { N_, 0 }, { N_, 0 }, { N_, 0 },
+ { N_, 0 }, { N_, 0 }, { N_, 0 },
+ { N_, 0 }, { N_, 0 }, { N_, 0 },
+ { N_, 0 }, { N_, 0 }, { N_, 0 },
+ { N_, 0 }, { N_, 0 }, { N_, 0 },
+ { N_, 0 }, { N_, 0 }, { N_, 0 },
+ { N_, 0 }, { N_, 0 }, { N_, 0 },
+ { N_, 0 }, { N_, 0 }, { N_, 0 },
+ { N_, 0 }, { N_, 0 }, { N_, 0 },
+ { N_, 0 }, { N_, 0 }, { N_, 0 },
+ { N_, 0 }, { N_, 0 }, { N_, 0 },
+ { N_, 0 }, { N_, 0 }, { N_, 0 },
+ { N_, 0 }, { N_, 0 }, { N_, 0 },
+ { N_, 0 }, { N_, 0 }, { N_, 0 },
+ { N_, 0 }, { N_, 0 }, { N_, 0 },
+ { N_, 0 }, { N_, 0 }, { N_, 0 },
+ { N_, 0 }, { N_, 0 }, { 19, 1680 },
+ { 20, 1701 }, { N_, 0 }, { 21, 1757 },
+ { 22, 1792 }, { 23, 1806 }, { N_, 0 },
+ { N_, 0 }, { N_, 0 }, { 24, 1834 },
+ { 25, 1869 }, { 26, 1876 }, { N_, 0 },
+ { 27, 1897 }, { N_, 0 }, { 28, 1904 },
+ { N_, 0 }, { 29, 1942 }, { N_, 0 },
+ { 30, 1963 }, { 31, 1994 }, { N_, 0 },
+ { 32, 2000 }, { 33, 2006 }, { 34, 2018 },
+ { 35, 2021 }, { 36, 2109 }, { N_, 0 },
+ { N_, 0 }, { N_, 0 }, { N_, 0 },
+ { N_, 0 }, { N_, 0 }, { N_, 0 },
+ { N_, 0 }, { N_, 0 }, { N_, 0 },
+ { N_, 0 }, { N_, 0 }, { N_, 0 },
+ { N_, 0 }, { N_, 0 }, { N_, 0 },
+ { N_, 0 }, { N_, 0 }, { N_, 0 },
+ { N_, 0 }, { N_, 0 }, { N_, 0 },
+ { N_, 0 }, { N_, 0 }, { N_, 0 },
+ { N_, 0 }, { N_, 0 }, { N_, 0 },
+ { N_, 0 }, { N_, 0 }, { N_, 0 },
+ { N_, 0 }, { N_, 0 }, { N_, 0 },
+ { N_, 0 }, { N_, 0 }, { N_, 0 },
+ { N_, 0 }, { N_, 0 }, { N_, 0 },
+ { N_, 0 }, { N_, 0 }, { N_, 0 },
+ { N_, 0 }, { N_, 0 }, { N_, 0 },
+ { N_, 0 }, { N_, 0 }, { N_, 0 },
+ { N_, 0 }, { N_, 0 }, { N_, 0 },
+ { N_, 0 }, { N_, 0 }, { N_, 0 },
+ { N_, 0 }, { N_, 0 }, { N_, 0 },
+ { N_, 0 }, { N_, 0 }, { N_, 0 },
+ { N_, 0 }, { N_, 0 }, { N_, 0 },
+ { N_, 0 },
+ },
+ { /* Third byte table 2. */
+ { N_, 0 }, { N_, 0 }, { N_, 0 },
+ { N_, 0 }, { N_, 0 }, { N_, 0 },
+ { N_, 0 }, { N_, 0 }, { N_, 0 },
+ { N_, 0 }, { N_, 0 }, { N_, 0 },
+ { N_, 0 }, { N_, 0 }, { N_, 0 },
+ { N_, 0 }, { N_, 0 }, { N_, 0 },
+ { N_, 0 }, { N_, 0 }, { N_, 0 },
+ { N_, 0 }, { N_, 0 }, { N_, 0 },
+ { N_, 0 }, { N_, 0 }, { N_, 0 },
+ { N_, 0 }, { N_, 0 }, { N_, 0 },
+ { N_, 0 }, { N_, 0 }, { N_, 0 },
+ { N_, 0 }, { N_, 0 }, { N_, 0 },
+ { N_, 0 }, { N_, 0 }, { N_, 0 },
+ { N_, 0 }, { N_, 0 }, { N_, 0 },
+ { N_, 0 }, { N_, 0 }, { N_, 0 },
+ { N_, 0 }, { N_, 0 }, { N_, 0 },
+ { N_, 0 }, { N_, 0 }, { N_, 0 },
+ { N_, 0 }, { N_, 0 }, { N_, 0 },
+ { N_, 0 }, { N_, 0 }, { N_, 0 },
+ { N_, 0 }, { N_, 0 }, { N_, 0 },
+ { N_, 0 }, { N_, 0 }, { N_, 0 },
+ { N_, 0 }, { N_, 0 }, { N_, 0 },
+ { N_, 0 }, { N_, 0 }, { N_, 0 },
+ { N_, 0 }, { N_, 0 }, { N_, 0 },
+ { N_, 0 }, { N_, 0 }, { N_, 0 },
+ { N_, 0 }, { N_, 0 }, { N_, 0 },
+ { N_, 0 }, { N_, 0 }, { N_, 0 },
+ { N_, 0 }, { N_, 0 }, { N_, 0 },
+ { N_, 0 }, { N_, 0 }, { N_, 0 },
+ { N_, 0 }, { N_, 0 }, { N_, 0 },
+ { N_, 0 }, { N_, 0 }, { N_, 0 },
+ { N_, 0 }, { N_, 0 }, { N_, 0 },
+ { N_, 0 }, { N_, 0 }, { N_, 0 },
+ { N_, 0 }, { N_, 0 }, { N_, 0 },
+ { N_, 0 }, { N_, 0 }, { N_, 0 },
+ { N_, 0 }, { N_, 0 }, { N_, 0 },
+ { N_, 0 }, { N_, 0 }, { N_, 0 },
+ { N_, 0 }, { N_, 0 }, { N_, 0 },
+ { N_, 0 }, { N_, 0 }, { N_, 0 },
+ { N_, 0 }, { N_, 0 }, { N_, 0 },
+ { N_, 0 }, { N_, 0 }, { N_, 0 },
+ { N_, 0 }, { N_, 0 }, { N_, 0 },
+ { N_, 0 }, { N_, 0 }, { 37, 2158 },
+ { N_, 0 }, { N_, 0 }, { N_, 0 },
+ { N_, 0 }, { N_, 0 }, { N_, 0 },
+ { N_, 0 }, { N_, 0 }, { N_, 0 },
+ { N_, 0 }, { N_, 0 }, { N_, 0 },
+ { N_, 0 }, { N_, 0 }, { N_, 0 },
+ { N_, 0 }, { N_, 0 }, { N_, 0 },
+ { N_, 0 }, { N_, 0 }, { N_, 0 },
+ { N_, 0 }, { N_, 0 }, { N_, 0 },
+ { N_, 0 }, { N_, 0 }, { N_, 0 },
+ { N_, 0 }, { N_, 0 }, { N_, 0 },
+ { N_, 0 }, { N_, 0 }, { N_, 0 },
+ { N_, 0 }, { N_, 0 }, { N_, 0 },
+ { N_, 0 }, { N_, 0 }, { N_, 0 },
+ { N_, 0 }, { N_, 0 }, { N_, 0 },
+ { N_, 0 }, { N_, 0 }, { N_, 0 },
+ { N_, 0 }, { N_, 0 }, { N_, 0 },
+ { N_, 0 }, { N_, 0 }, { N_, 0 },
+ { N_, 0 }, { N_, 0 }, { N_, 0 },
+ { N_, 0 }, { 0x8000, 2165 }, { 0x8001, 2445 },
+ { 0x8002, 2741 }, { 0x8003, 3029 }, { 0x8004, 3337 },
+ { 0x8005, 3725 }, { 0x8006, 4053 }, { 0x8007, 4536 },
+ { N_, 0 }, { N_, 0 }, { N_, 0 },
+ { N_, 0 }, { N_, 0 }, { N_, 0 },
+ { N_, 0 }, { N_, 0 }, { N_, 0 },
+ { N_, 0 }, { N_, 0 }, { N_, 0 },
+ { N_, 0 }, { N_, 0 }, { N_, 0 },
+ { N_, 0 }, { N_, 0 }, { N_, 0 },
+ { N_, 0 }, { N_, 0 }, { N_, 0 },
+ { N_, 0 }, { N_, 0 }, { N_, 0 },
+ { N_, 0 }, { N_, 0 }, { N_, 0 },
+ { N_, 0 }, { N_, 0 }, { N_, 0 },
+ { N_, 0 }, { N_, 0 }, { N_, 0 },
+ { N_, 0 }, { N_, 0 }, { N_, 0 },
+ { N_, 0 }, { N_, 0 }, { N_, 0 },
+ { N_, 0 }, { N_, 0 }, { N_, 0 },
+ { N_, 0 }, { N_, 0 }, { N_, 0 },
+ { N_, 0 }, { N_, 0 }, { N_, 0 },
+ { N_, 0 }, { N_, 0 }, { N_, 0 },
+ { N_, 0 }, { N_, 0 }, { N_, 0 },
+ { N_, 0 }, { N_, 0 }, { N_, 0 },
+ { N_, 0 }, { N_, 0 }, { N_, 0 },
+ { N_, 0 }, { N_, 0 }, { N_, 0 },
+ { N_, 0 },
+ },
+ { /* Third byte table 3. */
+ { N_, 0 }, { N_, 0 }, { N_, 0 },
+ { N_, 0 }, { N_, 0 }, { N_, 0 },
+ { N_, 0 }, { N_, 0 }, { N_, 0 },
+ { N_, 0 }, { N_, 0 }, { N_, 0 },
+ { N_, 0 }, { N_, 0 }, { N_, 0 },
+ { N_, 0 }, { N_, 0 }, { N_, 0 },
+ { N_, 0 }, { N_, 0 }, { N_, 0 },
+ { N_, 0 }, { N_, 0 }, { N_, 0 },
+ { N_, 0 }, { N_, 0 }, { N_, 0 },
+ { N_, 0 }, { N_, 0 }, { N_, 0 },
+ { N_, 0 }, { N_, 0 }, { N_, 0 },
+ { N_, 0 }, { N_, 0 }, { N_, 0 },
+ { N_, 0 }, { N_, 0 }, { N_, 0 },
+ { N_, 0 }, { N_, 0 }, { N_, 0 },
+ { N_, 0 }, { N_, 0 }, { N_, 0 },
+ { N_, 0 }, { N_, 0 }, { N_, 0 },
+ { N_, 0 }, { N_, 0 }, { N_, 0 },
+ { N_, 0 }, { N_, 0 }, { N_, 0 },
+ { N_, 0 }, { N_, 0 }, { N_, 0 },
+ { N_, 0 }, { N_, 0 }, { N_, 0 },
+ { N_, 0 }, { N_, 0 }, { N_, 0 },
+ { N_, 0 }, { N_, 0 }, { N_, 0 },
+ { N_, 0 }, { N_, 0 }, { N_, 0 },
+ { N_, 0 }, { N_, 0 }, { N_, 0 },
+ { N_, 0 }, { N_, 0 }, { N_, 0 },
+ { N_, 0 }, { N_, 0 }, { N_, 0 },
+ { N_, 0 }, { N_, 0 }, { N_, 0 },
+ { N_, 0 }, { N_, 0 }, { N_, 0 },
+ { N_, 0 }, { N_, 0 }, { N_, 0 },
+ { N_, 0 }, { N_, 0 }, { N_, 0 },
+ { N_, 0 }, { N_, 0 }, { N_, 0 },
+ { N_, 0 }, { N_, 0 }, { N_, 0 },
+ { N_, 0 }, { N_, 0 }, { N_, 0 },
+ { N_, 0 }, { N_, 0 }, { N_, 0 },
+ { N_, 0 }, { N_, 0 }, { N_, 0 },
+ { N_, 0 }, { N_, 0 }, { N_, 0 },
+ { N_, 0 }, { N_, 0 }, { N_, 0 },
+ { N_, 0 }, { N_, 0 }, { N_, 0 },
+ { N_, 0 }, { N_, 0 }, { N_, 0 },
+ { N_, 0 }, { N_, 0 }, { N_, 0 },
+ { N_, 0 }, { N_, 0 }, { N_, 0 },
+ { N_, 0 }, { N_, 0 }, { N_, 0 },
+ { N_, 0 }, { N_, 0 }, { 38, 4895 },
+ { 39, 4964 }, { 40, 4999 }, { N_, 0 },
+ { 41, 5018 }, { 42, 5098 }, { 43, 5230 },
+ { 44, 5248 }, { 45, 5266 }, { 46, 5326 },
+ { 47, 5410 }, { 48, 5470 }, { 49, 5518 },
+ { N_, 0 }, { N_, 0 }, { N_, 0 },
+ { N_, 0 }, { 50, 5526 }, { 51, 5596 },
+ { 52, 5767 }, { N_, 0 }, { N_, 0 },
+ { N_, 0 }, { N_, 0 }, { N_, 0 },
+ { N_, 0 }, { N_, 0 }, { N_, 0 },
+ { N_, 0 }, { N_, 0 }, { N_, 0 },
+ { N_, 0 }, { N_, 0 }, { N_, 0 },
+ { N_, 0 }, { N_, 0 }, { N_, 0 },
+ { N_, 0 }, { N_, 0 }, { N_, 0 },
+ { 53, 5810 }, { 54, 5822 }, { N_, 0 },
+ { 55, 5830 }, { N_, 0 }, { N_, 0 },
+ { N_, 0 }, { N_, 0 }, { N_, 0 },
+ { N_, 0 }, { N_, 0 }, { N_, 0 },
+ { N_, 0 }, { N_, 0 }, { N_, 0 },
+ { N_, 0 }, { N_, 0 }, { N_, 0 },
+ { 56, 5836 }, { 57, 5839 }, { 58, 5842 },
+ { 59, 6034 }, { 60, 6226 }, { 61, 6418 },
+ { N_, 0 }, { N_, 0 }, { N_, 0 },
+ { N_, 0 }, { N_, 0 }, { N_, 0 },
+ { N_, 0 }, { N_, 0 }, { N_, 0 },
+ { N_, 0 }, { N_, 0 }, { N_, 0 },
+ { N_, 0 }, { N_, 0 }, { N_, 0 },
+ { N_, 0 }, { N_, 0 }, { N_, 0 },
+ { N_, 0 }, { N_, 0 }, { N_, 0 },
+ { N_, 0 }, { N_, 0 }, { N_, 0 },
+ { N_, 0 }, { N_, 0 }, { N_, 0 },
+ { N_, 0 }, { N_, 0 }, { N_, 0 },
+ { N_, 0 }, { N_, 0 }, { N_, 0 },
+ { N_, 0 }, { N_, 0 }, { N_, 0 },
+ { N_, 0 }, { N_, 0 }, { N_, 0 },
+ { N_, 0 }, { N_, 0 }, { N_, 0 },
+ { N_, 0 }, { N_, 0 }, { N_, 0 },
+ { N_, 0 }, { N_, 0 }, { N_, 0 },
+ { N_, 0 }, { N_, 0 }, { N_, 0 },
+ { N_, 0 }, { N_, 0 }, { N_, 0 },
+ { N_, 0 }, { N_, 0 }, { N_, 0 },
+ { N_, 0 }, { N_, 0 }, { N_, 0 },
+ { N_, 0 }, { N_, 0 }, { N_, 0 },
+ { N_, 0 },
+ },
+ { /* Third byte table 4. */
+ { N_, 0 }, { N_, 0 }, { N_, 0 },
+ { N_, 0 }, { N_, 0 }, { N_, 0 },
+ { N_, 0 }, { N_, 0 }, { N_, 0 },
+ { N_, 0 }, { N_, 0 }, { N_, 0 },
+ { N_, 0 }, { N_, 0 }, { N_, 0 },
+ { N_, 0 }, { N_, 0 }, { N_, 0 },
+ { N_, 0 }, { N_, 0 }, { N_, 0 },
+ { N_, 0 }, { N_, 0 }, { N_, 0 },
+ { N_, 0 }, { N_, 0 }, { N_, 0 },
+ { N_, 0 }, { N_, 0 }, { N_, 0 },
+ { N_, 0 }, { N_, 0 }, { N_, 0 },
+ { N_, 0 }, { N_, 0 }, { N_, 0 },
+ { N_, 0 }, { N_, 0 }, { N_, 0 },
+ { N_, 0 }, { N_, 0 }, { N_, 0 },
+ { N_, 0 }, { N_, 0 }, { N_, 0 },
+ { N_, 0 }, { N_, 0 }, { N_, 0 },
+ { N_, 0 }, { N_, 0 }, { N_, 0 },
+ { N_, 0 }, { N_, 0 }, { N_, 0 },
+ { N_, 0 }, { N_, 0 }, { N_, 0 },
+ { N_, 0 }, { N_, 0 }, { N_, 0 },
+ { N_, 0 }, { N_, 0 }, { N_, 0 },
+ { N_, 0 }, { N_, 0 }, { N_, 0 },
+ { N_, 0 }, { N_, 0 }, { N_, 0 },
+ { N_, 0 }, { N_, 0 }, { N_, 0 },
+ { N_, 0 }, { N_, 0 }, { N_, 0 },
+ { N_, 0 }, { N_, 0 }, { N_, 0 },
+ { N_, 0 }, { N_, 0 }, { N_, 0 },
+ { N_, 0 }, { N_, 0 }, { N_, 0 },
+ { N_, 0 }, { N_, 0 }, { N_, 0 },
+ { N_, 0 }, { N_, 0 }, { N_, 0 },
+ { N_, 0 }, { N_, 0 }, { N_, 0 },
+ { N_, 0 }, { N_, 0 }, { N_, 0 },
+ { N_, 0 }, { N_, 0 }, { N_, 0 },
+ { N_, 0 }, { N_, 0 }, { N_, 0 },
+ { N_, 0 }, { N_, 0 }, { N_, 0 },
+ { N_, 0 }, { N_, 0 }, { N_, 0 },
+ { N_, 0 }, { N_, 0 }, { N_, 0 },
+ { N_, 0 }, { N_, 0 }, { N_, 0 },
+ { N_, 0 }, { N_, 0 }, { N_, 0 },
+ { N_, 0 }, { N_, 0 }, { N_, 0 },
+ { N_, 0 }, { N_, 0 }, { N_, 0 },
+ { N_, 0 }, { N_, 0 }, { N_, 0 },
+ { N_, 0 }, { N_, 0 }, { 62, 6484 },
+ { 63, 6497 }, { 64, 6672 }, { 65, 6770 },
+ { 66, 6923 }, { 67, 6968 }, { 68, 7160 },
+ { N_, 0 }, { 0x8008, 7247 }, { 69, 7597 },
+ { 70, 7773 }, { 71, 7950 }, { 0x8009, 8142 },
+ { 0x800A, 8919 }, { 72, 9351 }, { 73, 9522 },
+ { N_, 0 }, { N_, 0 }, { N_, 0 },
+ { N_, 0 }, { N_, 0 }, { N_, 0 },
+ { N_, 0 }, { N_, 0 }, { N_, 0 },
+ { N_, 0 }, { N_, 0 }, { N_, 0 },
+ { N_, 0 }, { N_, 0 }, { N_, 0 },
+ { N_, 0 }, { N_, 0 }, { N_, 0 },
+ { N_, 0 }, { N_, 0 }, { N_, 0 },
+ { N_, 0 }, { N_, 0 }, { N_, 0 },
+ { N_, 0 }, { N_, 0 }, { N_, 0 },
+ { N_, 0 }, { N_, 0 }, { N_, 0 },
+ { N_, 0 }, { N_, 0 }, { N_, 0 },
+ { N_, 0 }, { N_, 0 }, { N_, 0 },
+ { N_, 0 }, { N_, 0 }, { N_, 0 },
+ { N_, 0 }, { N_, 0 }, { N_, 0 },
+ { N_, 0 }, { N_, 0 }, { N_, 0 },
+ { N_, 0 }, { N_, 0 }, { N_, 0 },
+ { N_, 0 }, { N_, 0 }, { N_, 0 },
+ { N_, 0 }, { N_, 0 }, { N_, 0 },
+ { N_, 0 }, { N_, 0 }, { N_, 0 },
+ { N_, 0 }, { N_, 0 }, { N_, 0 },
+ { N_, 0 }, { N_, 0 }, { N_, 0 },
+ { N_, 0 }, { N_, 0 }, { N_, 0 },
+ { N_, 0 }, { N_, 0 }, { N_, 0 },
+ { N_, 0 }, { N_, 0 }, { N_, 0 },
+ { N_, 0 }, { N_, 0 }, { N_, 0 },
+ { N_, 0 }, { N_, 0 }, { N_, 0 },
+ { N_, 0 }, { N_, 0 }, { N_, 0 },
+ { N_, 0 }, { N_, 0 }, { N_, 0 },
+ { N_, 0 }, { N_, 0 }, { N_, 0 },
+ { N_, 0 }, { N_, 0 }, { N_, 0 },
+ { N_, 0 }, { N_, 0 }, { N_, 0 },
+ { N_, 0 }, { N_, 0 }, { N_, 0 },
+ { N_, 0 }, { N_, 0 }, { N_, 0 },
+ { N_, 0 }, { N_, 0 }, { N_, 0 },
+ { N_, 0 }, { N_, 0 }, { N_, 0 },
+ { N_, 0 }, { N_, 0 }, { N_, 0 },
+ { N_, 0 }, { N_, 0 }, { N_, 0 },
+ { N_, 0 },
+ },
+ { /* Third byte table 5. */
+ { N_, 0 }, { N_, 0 }, { N_, 0 },
+ { N_, 0 }, { N_, 0 }, { N_, 0 },
+ { N_, 0 }, { N_, 0 }, { N_, 0 },
+ { N_, 0 }, { N_, 0 }, { N_, 0 },
+ { N_, 0 }, { N_, 0 }, { N_, 0 },
+ { N_, 0 }, { N_, 0 }, { N_, 0 },
+ { N_, 0 }, { N_, 0 }, { N_, 0 },
+ { N_, 0 }, { N_, 0 }, { N_, 0 },
+ { N_, 0 }, { N_, 0 }, { N_, 0 },
+ { N_, 0 }, { N_, 0 }, { N_, 0 },
+ { N_, 0 }, { N_, 0 }, { N_, 0 },
+ { N_, 0 }, { N_, 0 }, { N_, 0 },
+ { N_, 0 }, { N_, 0 }, { N_, 0 },
+ { N_, 0 }, { N_, 0 }, { N_, 0 },
+ { N_, 0 }, { N_, 0 }, { N_, 0 },
+ { N_, 0 }, { N_, 0 }, { N_, 0 },
+ { N_, 0 }, { N_, 0 }, { N_, 0 },
+ { N_, 0 }, { N_, 0 }, { N_, 0 },
+ { N_, 0 }, { N_, 0 }, { N_, 0 },
+ { N_, 0 }, { N_, 0 }, { N_, 0 },
+ { N_, 0 }, { N_, 0 }, { N_, 0 },
+ { N_, 0 }, { N_, 0 }, { N_, 0 },
+ { N_, 0 }, { N_, 0 }, { N_, 0 },
+ { N_, 0 }, { N_, 0 }, { N_, 0 },
+ { N_, 0 }, { N_, 0 }, { N_, 0 },
+ { N_, 0 }, { N_, 0 }, { N_, 0 },
+ { N_, 0 }, { N_, 0 }, { N_, 0 },
+ { N_, 0 }, { N_, 0 }, { N_, 0 },
+ { N_, 0 }, { N_, 0 }, { N_, 0 },
+ { N_, 0 }, { N_, 0 }, { N_, 0 },
+ { N_, 0 }, { N_, 0 }, { N_, 0 },
+ { N_, 0 }, { N_, 0 }, { N_, 0 },
+ { N_, 0 }, { N_, 0 }, { N_, 0 },
+ { N_, 0 }, { N_, 0 }, { N_, 0 },
+ { N_, 0 }, { N_, 0 }, { N_, 0 },
+ { N_, 0 }, { N_, 0 }, { N_, 0 },
+ { N_, 0 }, { N_, 0 }, { N_, 0 },
+ { N_, 0 }, { N_, 0 }, { N_, 0 },
+ { N_, 0 }, { N_, 0 }, { N_, 0 },
+ { N_, 0 }, { N_, 0 }, { N_, 0 },
+ { N_, 0 }, { N_, 0 }, { N_, 0 },
+ { N_, 0 }, { N_, 0 }, { N_, 0 },
+ { N_, 0 }, { N_, 0 }, { N_, 0 },
+ { N_, 0 }, { N_, 0 }, { N_, 0 },
+ { N_, 0 }, { N_, 0 }, { N_, 0 },
+ { N_, 0 }, { N_, 0 }, { N_, 0 },
+ { N_, 0 }, { N_, 0 }, { N_, 0 },
+ { N_, 0 }, { N_, 0 }, { N_, 0 },
+ { N_, 0 }, { N_, 0 }, { N_, 0 },
+ { N_, 0 }, { N_, 0 }, { N_, 0 },
+ { N_, 0 }, { N_, 0 }, { N_, 0 },
+ { N_, 0 }, { N_, 0 }, { N_, 0 },
+ { N_, 0 }, { N_, 0 }, { N_, 0 },
+ { N_, 0 }, { N_, 0 }, { N_, 0 },
+ { N_, 0 }, { N_, 0 }, { 0x800B, 9743 },
+ { 0x800C, 9999 }, { 0x800D, 10255 }, { 0x800E, 10511 },
+ { 74, 10767 }, { 75, 10967 }, { N_, 0 },
+ { N_, 0 }, { 76, 11139 }, { 77, 11303 },
+ { 78, 11468 }, { 79, 11576 }, { 0x800F, 11740 },
+ { 0x8010, 12006 }, { 0x8011, 12280 }, { 0x8012, 12546 },
+ { 80, 12812 }, { 0x8013, 13060 }, { 0x8014, 13348 },
+ { 81, 13720 }, { 82, 13898 }, { 83, 13933 },
+ { 84, 14045 }, { 85, 14197 }, { 86, 14347 },
+ { 87, 14410 }, { 88, 14540 }, { 89, 14729 },
+ { N_, 0 }, { N_, 0 }, { N_, 0 },
+ { N_, 0 }, { N_, 0 }, { N_, 0 },
+ { N_, 0 }, { N_, 0 }, { N_, 0 },
+ { N_, 0 }, { N_, 0 }, { N_, 0 },
+ { N_, 0 }, { N_, 0 }, { N_, 0 },
+ { N_, 0 }, { N_, 0 }, { N_, 0 },
+ { N_, 0 }, { N_, 0 }, { N_, 0 },
+ { N_, 0 }, { N_, 0 }, { N_, 0 },
+ { N_, 0 }, { N_, 0 }, { N_, 0 },
+ { N_, 0 }, { N_, 0 }, { N_, 0 },
+ { N_, 0 }, { N_, 0 }, { N_, 0 },
+ { N_, 0 }, { N_, 0 }, { N_, 0 },
+ { N_, 0 }, { N_, 0 }, { N_, 0 },
+ { N_, 0 }, { N_, 0 }, { N_, 0 },
+ { N_, 0 }, { N_, 0 }, { N_, 0 },
+ { N_, 0 }, { N_, 0 }, { N_, 0 },
+ { N_, 0 }, { N_, 0 }, { N_, 0 },
+ { N_, 0 }, { N_, 0 }, { N_, 0 },
+ { N_, 0 }, { N_, 0 }, { N_, 0 },
+ { N_, 0 }, { N_, 0 }, { N_, 0 },
+ { N_, 0 }, { N_, 0 }, { N_, 0 },
+ { N_, 0 },
+ },
+ { /* Third byte table 6. */
+ { N_, 0 }, { N_, 0 }, { N_, 0 },
+ { N_, 0 }, { N_, 0 }, { N_, 0 },
+ { N_, 0 }, { N_, 0 }, { N_, 0 },
+ { N_, 0 }, { N_, 0 }, { N_, 0 },
+ { N_, 0 }, { N_, 0 }, { N_, 0 },
+ { N_, 0 }, { N_, 0 }, { N_, 0 },
+ { N_, 0 }, { N_, 0 }, { N_, 0 },
+ { N_, 0 }, { N_, 0 }, { N_, 0 },
+ { N_, 0 }, { N_, 0 }, { N_, 0 },
+ { N_, 0 }, { N_, 0 }, { N_, 0 },
+ { N_, 0 }, { N_, 0 }, { N_, 0 },
+ { N_, 0 }, { N_, 0 }, { N_, 0 },
+ { N_, 0 }, { N_, 0 }, { N_, 0 },
+ { N_, 0 }, { N_, 0 }, { N_, 0 },
+ { N_, 0 }, { N_, 0 }, { N_, 0 },
+ { N_, 0 }, { N_, 0 }, { N_, 0 },
+ { N_, 0 }, { N_, 0 }, { N_, 0 },
+ { N_, 0 }, { N_, 0 }, { N_, 0 },
+ { N_, 0 }, { N_, 0 }, { N_, 0 },
+ { N_, 0 }, { N_, 0 }, { N_, 0 },
+ { N_, 0 }, { N_, 0 }, { N_, 0 },
+ { N_, 0 }, { N_, 0 }, { N_, 0 },
+ { N_, 0 }, { N_, 0 }, { N_, 0 },
+ { N_, 0 }, { N_, 0 }, { N_, 0 },
+ { N_, 0 }, { N_, 0 }, { N_, 0 },
+ { N_, 0 }, { N_, 0 }, { N_, 0 },
+ { N_, 0 }, { N_, 0 }, { N_, 0 },
+ { N_, 0 }, { N_, 0 }, { N_, 0 },
+ { N_, 0 }, { N_, 0 }, { N_, 0 },
+ { N_, 0 }, { N_, 0 }, { N_, 0 },
+ { N_, 0 }, { N_, 0 }, { N_, 0 },
+ { N_, 0 }, { N_, 0 }, { N_, 0 },
+ { N_, 0 }, { N_, 0 }, { N_, 0 },
+ { N_, 0 }, { N_, 0 }, { N_, 0 },
+ { N_, 0 }, { N_, 0 }, { N_, 0 },
+ { N_, 0 }, { N_, 0 }, { N_, 0 },
+ { N_, 0 }, { N_, 0 }, { N_, 0 },
+ { N_, 0 }, { N_, 0 }, { N_, 0 },
+ { N_, 0 }, { N_, 0 }, { N_, 0 },
+ { N_, 0 }, { N_, 0 }, { N_, 0 },
+ { N_, 0 }, { N_, 0 }, { N_, 0 },
+ { N_, 0 }, { N_, 0 }, { N_, 0 },
+ { N_, 0 }, { N_, 0 }, { N_, 0 },
+ { N_, 0 }, { N_, 0 }, { N_, 0 },
+ { N_, 0 }, { 90, 14829 }, { 91, 14912 },
+ { 92, 14969 }, { N_, 0 }, { N_, 0 },
+ { N_, 0 }, { N_, 0 }, { N_, 0 },
+ { N_, 0 }, { N_, 0 }, { N_, 0 },
+ { 93, 14982 }, { 94, 15046 }, { 95, 15109 },
+ { 96, 15163 }, { 97, 15225 }, { 98, 15282 },
+ { 99, 15341 }, { 100, 15405 }, { 101, 15469 },
+ { 102, 15533 }, { 103, 15597 }, { 104, 15681 },
+ { 105, 15812 }, { 106, 15942 }, { 107, 16072 },
+ { 108, 16202 }, { N_, 0 }, { N_, 0 },
+ { N_, 0 }, { N_, 0 }, { N_, 0 },
+ { N_, 0 }, { N_, 0 }, { N_, 0 },
+ { N_, 0 }, { N_, 0 }, { N_, 0 },
+ { N_, 0 }, { N_, 0 }, { N_, 0 },
+ { N_, 0 }, { N_, 0 }, { N_, 0 },
+ { N_, 0 }, { N_, 0 }, { N_, 0 },
+ { N_, 0 }, { N_, 0 }, { N_, 0 },
+ { N_, 0 }, { N_, 0 }, { N_, 0 },
+ { N_, 0 }, { N_, 0 }, { N_, 0 },
+ { N_, 0 }, { N_, 0 }, { N_, 0 },
+ { N_, 0 }, { N_, 0 }, { N_, 0 },
+ { N_, 0 }, { N_, 0 }, { N_, 0 },
+ { N_, 0 }, { N_, 0 }, { N_, 0 },
+ { N_, 0 }, { N_, 0 }, { N_, 0 },
+ { N_, 0 }, { N_, 0 }, { N_, 0 },
+ { N_, 0 }, { N_, 0 }, { N_, 0 },
+ { N_, 0 }, { N_, 0 }, { N_, 0 },
+ { N_, 0 }, { N_, 0 }, { N_, 0 },
+ { N_, 0 }, { N_, 0 }, { N_, 0 },
+ { N_, 0 }, { N_, 0 }, { N_, 0 },
+ { N_, 0 }, { N_, 0 }, { N_, 0 },
+ { N_, 0 }, { N_, 0 }, { N_, 0 },
+ { N_, 0 }, { N_, 0 }, { N_, 0 },
+ { N_, 0 }, { N_, 0 }, { N_, 0 },
+ { N_, 0 }, { N_, 0 }, { N_, 0 },
+ { N_, 0 }, { N_, 0 }, { N_, 0 },
+ { N_, 0 }, { N_, 0 }, { N_, 0 },
+ { N_, 0 }, { N_, 0 }, { N_, 0 },
+ { N_, 0 }, { N_, 0 }, { N_, 0 },
+ { N_, 0 }, { N_, 0 }, { N_, 0 },
+ { N_, 0 }, { N_, 0 }, { N_, 0 },
+ { N_, 0 },
+ },
+ { /* Third byte table 7. */
+ { N_, 0 }, { N_, 0 }, { N_, 0 },
+ { N_, 0 }, { N_, 0 }, { N_, 0 },
+ { N_, 0 }, { N_, 0 }, { N_, 0 },
+ { N_, 0 }, { N_, 0 }, { N_, 0 },
+ { N_, 0 }, { N_, 0 }, { N_, 0 },
+ { N_, 0 }, { N_, 0 }, { N_, 0 },
+ { N_, 0 }, { N_, 0 }, { N_, 0 },
+ { N_, 0 }, { N_, 0 }, { N_, 0 },
+ { N_, 0 }, { N_, 0 }, { N_, 0 },
+ { N_, 0 }, { N_, 0 }, { N_, 0 },
+ { N_, 0 }, { N_, 0 }, { N_, 0 },
+ { N_, 0 }, { N_, 0 }, { N_, 0 },
+ { N_, 0 }, { N_, 0 }, { N_, 0 },
+ { N_, 0 }, { N_, 0 }, { N_, 0 },
+ { N_, 0 }, { N_, 0 }, { N_, 0 },
+ { N_, 0 }, { N_, 0 }, { N_, 0 },
+ { N_, 0 }, { N_, 0 }, { N_, 0 },
+ { N_, 0 }, { N_, 0 }, { N_, 0 },
+ { N_, 0 }, { N_, 0 }, { N_, 0 },
+ { N_, 0 }, { N_, 0 }, { N_, 0 },
+ { N_, 0 }, { N_, 0 }, { N_, 0 },
+ { N_, 0 }, { N_, 0 }, { N_, 0 },
+ { N_, 0 }, { N_, 0 }, { N_, 0 },
+ { N_, 0 }, { N_, 0 }, { N_, 0 },
+ { N_, 0 }, { N_, 0 }, { N_, 0 },
+ { N_, 0 }, { N_, 0 }, { N_, 0 },
+ { N_, 0 }, { N_, 0 }, { N_, 0 },
+ { N_, 0 }, { N_, 0 }, { N_, 0 },
+ { N_, 0 }, { N_, 0 }, { N_, 0 },
+ { N_, 0 }, { N_, 0 }, { N_, 0 },
+ { N_, 0 }, { N_, 0 }, { N_, 0 },
+ { N_, 0 }, { N_, 0 }, { N_, 0 },
+ { N_, 0 }, { N_, 0 }, { N_, 0 },
+ { N_, 0 }, { N_, 0 }, { N_, 0 },
+ { N_, 0 }, { N_, 0 }, { N_, 0 },
+ { N_, 0 }, { N_, 0 }, { N_, 0 },
+ { N_, 0 }, { N_, 0 }, { N_, 0 },
+ { N_, 0 }, { N_, 0 }, { N_, 0 },
+ { N_, 0 }, { N_, 0 }, { N_, 0 },
+ { N_, 0 }, { N_, 0 }, { N_, 0 },
+ { N_, 0 }, { N_, 0 }, { N_, 0 },
+ { N_, 0 }, { N_, 0 }, { N_, 0 },
+ { N_, 0 }, { N_, 0 }, { N_, 0 },
+ { N_, 0 }, { N_, 0 }, { N_, 0 },
+ { N_, 0 }, { N_, 0 }, { N_, 0 },
+ { N_, 0 }, { N_, 0 }, { N_, 0 },
+ { N_, 0 }, { N_, 0 }, { N_, 0 },
+ { N_, 0 }, { N_, 0 }, { N_, 0 },
+ { N_, 0 }, { N_, 0 }, { N_, 0 },
+ { N_, 0 }, { N_, 0 }, { N_, 0 },
+ { N_, 0 }, { N_, 0 }, { N_, 0 },
+ { N_, 0 }, { N_, 0 }, { N_, 0 },
+ { N_, 0 }, { N_, 0 }, { N_, 0 },
+ { N_, 0 }, { 0x8015, 16273 }, { 0x8016, 16536 },
+ { 0x8017, 16799 }, { 0x8018, 17064 }, { 0x8019, 17329 },
+ { 0x801A, 17601 }, { 0x801B, 17878 }, { 0x801C, 18147 },
+ { 109, 18419 }, { N_, 0 }, { N_, 0 },
+ { N_, 0 }, { N_, 0 }, { N_, 0 },
+ { N_, 0 }, { N_, 0 }, { N_, 0 },
+ { N_, 0 }, { N_, 0 }, { N_, 0 },
+ { N_, 0 }, { N_, 0 }, { N_, 0 },
+ { N_, 0 }, { N_, 0 }, { N_, 0 },
+ { N_, 0 }, { N_, 0 }, { N_, 0 },
+ { N_, 0 }, { N_, 0 }, { N_, 0 },
+ { N_, 0 }, { N_, 0 }, { N_, 0 },
+ { N_, 0 }, { N_, 0 }, { N_, 0 },
+ { N_, 0 }, { N_, 0 }, { N_, 0 },
+ { N_, 0 }, { N_, 0 }, { N_, 0 },
+ { N_, 0 }, { N_, 0 }, { N_, 0 },
+ { N_, 0 }, { N_, 0 }, { N_, 0 },
+ { N_, 0 }, { N_, 0 }, { N_, 0 },
+ { N_, 0 }, { N_, 0 }, { N_, 0 },
+ { N_, 0 }, { N_, 0 }, { N_, 0 },
+ { N_, 0 }, { N_, 0 }, { N_, 0 },
+ { N_, 0 }, { N_, 0 }, { N_, 0 },
+ { N_, 0 }, { N_, 0 }, { N_, 0 },
+ { N_, 0 }, { N_, 0 }, { N_, 0 },
+ { N_, 0 }, { N_, 0 }, { N_, 0 },
+ { N_, 0 }, { N_, 0 }, { N_, 0 },
+ { N_, 0 }, { N_, 0 }, { N_, 0 },
+ { N_, 0 }, { N_, 0 }, { N_, 0 },
+ { N_, 0 }, { N_, 0 }, { N_, 0 },
+ { N_, 0 }, { N_, 0 }, { N_, 0 },
+ { N_, 0 }, { N_, 0 }, { N_, 0 },
+ { N_, 0 }, { N_, 0 }, { N_, 0 },
+ { N_, 0 },
+ },
+ },
+ {
+ { /* Third byte table 0. */
+ { N_, 0 }, { N_, 0 }, { N_, 0 },
+ { N_, 0 }, { N_, 0 }, { N_, 0 },
+ { N_, 0 }, { N_, 0 }, { N_, 0 },
+ { N_, 0 }, { N_, 0 }, { N_, 0 },
+ { N_, 0 }, { N_, 0 }, { N_, 0 },
+ { N_, 0 }, { N_, 0 }, { N_, 0 },
+ { N_, 0 }, { N_, 0 }, { N_, 0 },
+ { N_, 0 }, { N_, 0 }, { N_, 0 },
+ { N_, 0 }, { N_, 0 }, { N_, 0 },
+ { N_, 0 }, { N_, 0 }, { N_, 0 },
+ { N_, 0 }, { N_, 0 }, { N_, 0 },
+ { N_, 0 }, { N_, 0 }, { N_, 0 },
+ { N_, 0 }, { N_, 0 }, { N_, 0 },
+ { N_, 0 }, { N_, 0 }, { N_, 0 },
+ { N_, 0 }, { N_, 0 }, { N_, 0 },
+ { N_, 0 }, { N_, 0 }, { N_, 0 },
+ { N_, 0 }, { N_, 0 }, { N_, 0 },
+ { N_, 0 }, { N_, 0 }, { N_, 0 },
+ { N_, 0 }, { N_, 0 }, { N_, 0 },
+ { N_, 0 }, { N_, 0 }, { N_, 0 },
+ { N_, 0 }, { N_, 0 }, { N_, 0 },
+ { N_, 0 }, { N_, 0 }, { N_, 0 },
+ { N_, 0 }, { N_, 0 }, { N_, 0 },
+ { N_, 0 }, { N_, 0 }, { N_, 0 },
+ { N_, 0 }, { N_, 0 }, { N_, 0 },
+ { N_, 0 }, { N_, 0 }, { N_, 0 },
+ { N_, 0 }, { N_, 0 }, { N_, 0 },
+ { N_, 0 }, { N_, 0 }, { N_, 0 },
+ { N_, 0 }, { N_, 0 }, { N_, 0 },
+ { N_, 0 }, { N_, 0 }, { N_, 0 },
+ { N_, 0 }, { N_, 0 }, { N_, 0 },
+ { N_, 0 }, { N_, 0 }, { N_, 0 },
+ { N_, 0 }, { N_, 0 }, { N_, 0 },
+ { N_, 0 }, { N_, 0 }, { N_, 0 },
+ { N_, 0 }, { N_, 0 }, { N_, 0 },
+ { N_, 0 }, { N_, 0 }, { N_, 0 },
+ { N_, 0 }, { N_, 0 }, { N_, 0 },
+ { N_, 0 }, { N_, 0 }, { N_, 0 },
+ { N_, 0 }, { N_, 0 }, { N_, 0 },
+ { N_, 0 }, { N_, 0 }, { N_, 0 },
+ { N_, 0 }, { N_, 0 }, { N_, 0 },
+ { N_, 0 }, { N_, 0 }, { N_, 0 },
+ { N_, 0 }, { N_, 0 }, { N_, 0 },
+ { N_, 0 }, { N_, 0 }, { N_, 0 },
+ { N_, 0 }, { N_, 0 }, { N_, 0 },
+ { N_, 0 }, { N_, 0 }, { N_, 0 },
+ { N_, 0 }, { N_, 0 }, { N_, 0 },
+ { N_, 0 }, { N_, 0 }, { N_, 0 },
+ { N_, 0 }, { N_, 0 }, { N_, 0 },
+ { N_, 0 }, { N_, 0 }, { N_, 0 },
+ { N_, 0 }, { N_, 0 }, { N_, 0 },
+ { N_, 0 }, { N_, 0 }, { N_, 0 },
+ { N_, 0 }, { N_, 0 }, { N_, 0 },
+ { N_, 0 }, { N_, 0 }, { N_, 0 },
+ { N_, 0 }, { N_, 0 }, { N_, 0 },
+ { N_, 0 }, { N_, 0 }, { N_, 0 },
+ { N_, 0 }, { N_, 0 }, { N_, 0 },
+ { N_, 0 }, { N_, 0 }, { N_, 0 },
+ { N_, 0 }, { N_, 0 }, { N_, 0 },
+ { N_, 0 }, { N_, 0 }, { N_, 0 },
+ { N_, 0 }, { N_, 0 }, { N_, 0 },
+ { N_, 0 }, { N_, 0 }, { N_, 0 },
+ { N_, 0 }, { N_, 0 }, { N_, 0 },
+ { N_, 0 }, { N_, 0 }, { N_, 0 },
+ { N_, 0 }, { N_, 0 }, { 0, 0 },
+ { 1, 35 }, { 2, 247 }, { 3, 474 },
+ { 4, 693 }, { 5, 709 }, { 6, 951 },
+ { N_, 0 }, { 7, 1139 }, { 8, 1152 },
+ { N_, 0 }, { 9, 1177 }, { 10, 1199 },
+ { 11, 1295 }, { 12, 1362 }, { 13, 1407 },
+ { N_, 0 }, { 14, 1452 }, { N_, 0 },
+ { N_, 0 }, { 15, 1622 }, { N_, 0 },
+ { 16, 1626 }, { 17, 1651 }, { N_, 0 },
+ { 18, 1667 }, { N_, 0 }, { N_, 0 },
+ { N_, 0 }, { N_, 0 }, { N_, 0 },
+ { N_, 0 }, { N_, 0 }, { N_, 0 },
+ { N_, 0 }, { N_, 0 }, { N_, 0 },
+ { N_, 0 }, { N_, 0 }, { N_, 0 },
+ { N_, 0 }, { N_, 0 }, { N_, 0 },
+ { N_, 0 }, { N_, 0 }, { N_, 0 },
+ { N_, 0 }, { N_, 0 }, { N_, 0 },
+ { N_, 0 }, { N_, 0 }, { N_, 0 },
+ { N_, 0 }, { N_, 0 }, { N_, 0 },
+ { N_, 0 }, { N_, 0 }, { N_, 0 },
+ { N_, 0 }, { N_, 0 }, { N_, 0 },
+ { N_, 0 },
+ },
+ { /* Third byte table 1. */
+ { N_, 0 }, { N_, 0 }, { N_, 0 },
+ { N_, 0 }, { N_, 0 }, { N_, 0 },
+ { N_, 0 }, { N_, 0 }, { N_, 0 },
+ { N_, 0 }, { N_, 0 }, { N_, 0 },
+ { N_, 0 }, { N_, 0 }, { N_, 0 },
+ { N_, 0 }, { N_, 0 }, { N_, 0 },
+ { N_, 0 }, { N_, 0 }, { N_, 0 },
+ { N_, 0 }, { N_, 0 }, { N_, 0 },
+ { N_, 0 }, { N_, 0 }, { N_, 0 },
+ { N_, 0 }, { N_, 0 }, { N_, 0 },
+ { N_, 0 }, { N_, 0 }, { N_, 0 },
+ { N_, 0 }, { N_, 0 }, { N_, 0 },
+ { N_, 0 }, { N_, 0 }, { N_, 0 },
+ { N_, 0 }, { N_, 0 }, { N_, 0 },
+ { N_, 0 }, { N_, 0 }, { N_, 0 },
+ { N_, 0 }, { N_, 0 }, { N_, 0 },
+ { N_, 0 }, { N_, 0 }, { N_, 0 },
+ { N_, 0 }, { N_, 0 }, { N_, 0 },
+ { N_, 0 }, { N_, 0 }, { N_, 0 },
+ { N_, 0 }, { N_, 0 }, { N_, 0 },
+ { N_, 0 }, { N_, 0 }, { N_, 0 },
+ { N_, 0 }, { N_, 0 }, { N_, 0 },
+ { N_, 0 }, { N_, 0 }, { N_, 0 },
+ { N_, 0 }, { N_, 0 }, { N_, 0 },
+ { N_, 0 }, { N_, 0 }, { N_, 0 },
+ { N_, 0 }, { N_, 0 }, { N_, 0 },
+ { N_, 0 }, { N_, 0 }, { N_, 0 },
+ { N_, 0 }, { N_, 0 }, { N_, 0 },
+ { N_, 0 }, { N_, 0 }, { N_, 0 },
+ { N_, 0 }, { N_, 0 }, { N_, 0 },
+ { N_, 0 }, { N_, 0 }, { N_, 0 },
+ { N_, 0 }, { N_, 0 }, { N_, 0 },
+ { N_, 0 }, { N_, 0 }, { N_, 0 },
+ { N_, 0 }, { N_, 0 }, { N_, 0 },
+ { N_, 0 }, { N_, 0 }, { N_, 0 },
+ { N_, 0 }, { N_, 0 }, { N_, 0 },
+ { N_, 0 }, { N_, 0 }, { N_, 0 },
+ { N_, 0 }, { N_, 0 }, { N_, 0 },
+ { N_, 0 }, { N_, 0 }, { N_, 0 },
+ { N_, 0 }, { N_, 0 }, { N_, 0 },
+ { N_, 0 }, { N_, 0 }, { N_, 0 },
+ { N_, 0 }, { N_, 0 }, { N_, 0 },
+ { N_, 0 }, { N_, 0 }, { N_, 0 },
+ { N_, 0 }, { N_, 0 }, { N_, 0 },
+ { N_, 0 }, { N_, 0 }, { N_, 0 },
+ { N_, 0 }, { N_, 0 }, { N_, 0 },
+ { N_, 0 }, { N_, 0 }, { N_, 0 },
+ { N_, 0 }, { N_, 0 }, { N_, 0 },
+ { N_, 0 }, { N_, 0 }, { N_, 0 },
+ { N_, 0 }, { N_, 0 }, { N_, 0 },
+ { N_, 0 }, { N_, 0 }, { N_, 0 },
+ { N_, 0 }, { N_, 0 }, { N_, 0 },
+ { N_, 0 }, { N_, 0 }, { N_, 0 },
+ { N_, 0 }, { N_, 0 }, { N_, 0 },
+ { N_, 0 }, { N_, 0 }, { 19, 1682 },
+ { 20, 1703 }, { N_, 0 }, { 21, 1759 },
+ { 22, 1794 }, { 23, 1808 }, { N_, 0 },
+ { N_, 0 }, { N_, 0 }, { 24, 1836 },
+ { 25, 1871 }, { 26, 1878 }, { N_, 0 },
+ { 27, 1899 }, { N_, 0 }, { 28, 1906 },
+ { N_, 0 }, { 29, 1944 }, { N_, 0 },
+ { 30, 1965 }, { 31, 1996 }, { N_, 0 },
+ { 32, 2002 }, { 33, 2008 }, { 34, 2020 },
+ { 35, 2023 }, { 36, 2111 }, { N_, 0 },
+ { N_, 0 }, { N_, 0 }, { N_, 0 },
+ { N_, 0 }, { N_, 0 }, { N_, 0 },
+ { N_, 0 }, { N_, 0 }, { N_, 0 },
+ { N_, 0 }, { N_, 0 }, { N_, 0 },
+ { N_, 0 }, { N_, 0 }, { N_, 0 },
+ { N_, 0 }, { N_, 0 }, { N_, 0 },
+ { N_, 0 }, { N_, 0 }, { N_, 0 },
+ { N_, 0 }, { N_, 0 }, { N_, 0 },
+ { N_, 0 }, { N_, 0 }, { N_, 0 },
+ { N_, 0 }, { N_, 0 }, { N_, 0 },
+ { N_, 0 }, { N_, 0 }, { N_, 0 },
+ { N_, 0 }, { N_, 0 }, { N_, 0 },
+ { N_, 0 }, { N_, 0 }, { N_, 0 },
+ { N_, 0 }, { N_, 0 }, { N_, 0 },
+ { N_, 0 }, { N_, 0 }, { N_, 0 },
+ { N_, 0 }, { N_, 0 }, { N_, 0 },
+ { N_, 0 }, { N_, 0 }, { N_, 0 },
+ { N_, 0 }, { N_, 0 }, { N_, 0 },
+ { N_, 0 }, { N_, 0 }, { N_, 0 },
+ { N_, 0 }, { N_, 0 }, { N_, 0 },
+ { N_, 0 }, { N_, 0 }, { N_, 0 },
+ { N_, 0 },
+ },
+ { /* Third byte table 2. */
+ { N_, 0 }, { N_, 0 }, { N_, 0 },
+ { N_, 0 }, { N_, 0 }, { N_, 0 },
+ { N_, 0 }, { N_, 0 }, { N_, 0 },
+ { N_, 0 }, { N_, 0 }, { N_, 0 },
+ { N_, 0 }, { N_, 0 }, { N_, 0 },
+ { N_, 0 }, { N_, 0 }, { N_, 0 },
+ { N_, 0 }, { N_, 0 }, { N_, 0 },
+ { N_, 0 }, { N_, 0 }, { N_, 0 },
+ { N_, 0 }, { N_, 0 }, { N_, 0 },
+ { N_, 0 }, { N_, 0 }, { N_, 0 },
+ { N_, 0 }, { N_, 0 }, { N_, 0 },
+ { N_, 0 }, { N_, 0 }, { N_, 0 },
+ { N_, 0 }, { N_, 0 }, { N_, 0 },
+ { N_, 0 }, { N_, 0 }, { N_, 0 },
+ { N_, 0 }, { N_, 0 }, { N_, 0 },
+ { N_, 0 }, { N_, 0 }, { N_, 0 },
+ { N_, 0 }, { N_, 0 }, { N_, 0 },
+ { N_, 0 }, { N_, 0 }, { N_, 0 },
+ { N_, 0 }, { N_, 0 }, { N_, 0 },
+ { N_, 0 }, { N_, 0 }, { N_, 0 },
+ { N_, 0 }, { N_, 0 }, { N_, 0 },
+ { N_, 0 }, { N_, 0 }, { N_, 0 },
+ { N_, 0 }, { N_, 0 }, { N_, 0 },
+ { N_, 0 }, { N_, 0 }, { N_, 0 },
+ { N_, 0 }, { N_, 0 }, { N_, 0 },
+ { N_, 0 }, { N_, 0 }, { N_, 0 },
+ { N_, 0 }, { N_, 0 }, { N_, 0 },
+ { N_, 0 }, { N_, 0 }, { N_, 0 },
+ { N_, 0 }, { N_, 0 }, { N_, 0 },
+ { N_, 0 }, { N_, 0 }, { N_, 0 },
+ { N_, 0 }, { N_, 0 }, { N_, 0 },
+ { N_, 0 }, { N_, 0 }, { N_, 0 },
+ { N_, 0 }, { N_, 0 }, { N_, 0 },
+ { N_, 0 }, { N_, 0 }, { N_, 0 },
+ { N_, 0 }, { N_, 0 }, { N_, 0 },
+ { N_, 0 }, { N_, 0 }, { N_, 0 },
+ { N_, 0 }, { N_, 0 }, { N_, 0 },
+ { N_, 0 }, { N_, 0 }, { N_, 0 },
+ { N_, 0 }, { N_, 0 }, { N_, 0 },
+ { N_, 0 }, { N_, 0 }, { N_, 0 },
+ { N_, 0 }, { N_, 0 }, { N_, 0 },
+ { N_, 0 }, { N_, 0 }, { N_, 0 },
+ { N_, 0 }, { N_, 0 }, { 37, 2160 },
+ { N_, 0 }, { N_, 0 }, { 38, 2167 },
+ { N_, 0 }, { N_, 0 }, { N_, 0 },
+ { N_, 0 }, { N_, 0 }, { N_, 0 },
+ { N_, 0 }, { N_, 0 }, { N_, 0 },
+ { N_, 0 }, { N_, 0 }, { N_, 0 },
+ { N_, 0 }, { N_, 0 }, { N_, 0 },
+ { N_, 0 }, { N_, 0 }, { N_, 0 },
+ { N_, 0 }, { N_, 0 }, { N_, 0 },
+ { N_, 0 }, { N_, 0 }, { N_, 0 },
+ { N_, 0 }, { N_, 0 }, { N_, 0 },
+ { N_, 0 }, { N_, 0 }, { N_, 0 },
+ { N_, 0 }, { N_, 0 }, { N_, 0 },
+ { N_, 0 }, { N_, 0 }, { N_, 0 },
+ { N_, 0 }, { N_, 0 }, { N_, 0 },
+ { N_, 0 }, { 39, 2170 }, { 40, 2226 },
+ { N_, 0 }, { N_, 0 }, { N_, 0 },
+ { N_, 0 }, { N_, 0 }, { N_, 0 },
+ { 41, 2247 }, { 42, 2268 }, { 43, 2340 },
+ { N_, 0 }, { 0x8000, 2414 }, { 0x8001, 2694 },
+ { 0x8002, 2990 }, { 0x8003, 3278 }, { 0x8004, 3586 },
+ { 0x8005, 3974 }, { 0x8006, 4302 }, { 0x8007, 4785 },
+ { N_, 0 }, { N_, 0 }, { N_, 0 },
+ { N_, 0 }, { N_, 0 }, { N_, 0 },
+ { N_, 0 }, { N_, 0 }, { N_, 0 },
+ { N_, 0 }, { N_, 0 }, { N_, 0 },
+ { N_, 0 }, { N_, 0 }, { N_, 0 },
+ { N_, 0 }, { N_, 0 }, { N_, 0 },
+ { N_, 0 }, { N_, 0 }, { N_, 0 },
+ { N_, 0 }, { N_, 0 }, { N_, 0 },
+ { N_, 0 }, { N_, 0 }, { N_, 0 },
+ { N_, 0 }, { N_, 0 }, { N_, 0 },
+ { N_, 0 }, { N_, 0 }, { N_, 0 },
+ { N_, 0 }, { N_, 0 }, { N_, 0 },
+ { N_, 0 }, { N_, 0 }, { N_, 0 },
+ { N_, 0 }, { N_, 0 }, { N_, 0 },
+ { N_, 0 }, { N_, 0 }, { N_, 0 },
+ { N_, 0 }, { N_, 0 }, { N_, 0 },
+ { N_, 0 }, { N_, 0 }, { N_, 0 },
+ { N_, 0 }, { N_, 0 }, { N_, 0 },
+ { N_, 0 }, { N_, 0 }, { N_, 0 },
+ { N_, 0 }, { N_, 0 }, { N_, 0 },
+ { N_, 0 }, { N_, 0 }, { N_, 0 },
+ { N_, 0 },
+ },
+ { /* Third byte table 3. */
+ { N_, 0 }, { N_, 0 }, { N_, 0 },
+ { N_, 0 }, { N_, 0 }, { N_, 0 },
+ { N_, 0 }, { N_, 0 }, { N_, 0 },
+ { N_, 0 }, { N_, 0 }, { N_, 0 },
+ { N_, 0 }, { N_, 0 }, { N_, 0 },
+ { N_, 0 }, { N_, 0 }, { N_, 0 },
+ { N_, 0 }, { N_, 0 }, { N_, 0 },
+ { N_, 0 }, { N_, 0 }, { N_, 0 },
+ { N_, 0 }, { N_, 0 }, { N_, 0 },
+ { N_, 0 }, { N_, 0 }, { N_, 0 },
+ { N_, 0 }, { N_, 0 }, { N_, 0 },
+ { N_, 0 }, { N_, 0 }, { N_, 0 },
+ { N_, 0 }, { N_, 0 }, { N_, 0 },
+ { N_, 0 }, { N_, 0 }, { N_, 0 },
+ { N_, 0 }, { N_, 0 }, { N_, 0 },
+ { N_, 0 }, { N_, 0 }, { N_, 0 },
+ { N_, 0 }, { N_, 0 }, { N_, 0 },
+ { N_, 0 }, { N_, 0 }, { N_, 0 },
+ { N_, 0 }, { N_, 0 }, { N_, 0 },
+ { N_, 0 }, { N_, 0 }, { N_, 0 },
+ { N_, 0 }, { N_, 0 }, { N_, 0 },
+ { N_, 0 }, { N_, 0 }, { N_, 0 },
+ { N_, 0 }, { N_, 0 }, { N_, 0 },
+ { N_, 0 }, { N_, 0 }, { N_, 0 },
+ { N_, 0 }, { N_, 0 }, { N_, 0 },
+ { N_, 0 }, { N_, 0 }, { N_, 0 },
+ { N_, 0 }, { N_, 0 }, { N_, 0 },
+ { N_, 0 }, { N_, 0 }, { N_, 0 },
+ { N_, 0 }, { N_, 0 }, { N_, 0 },
+ { N_, 0 }, { N_, 0 }, { N_, 0 },
+ { N_, 0 }, { N_, 0 }, { N_, 0 },
+ { N_, 0 }, { N_, 0 }, { N_, 0 },
+ { N_, 0 }, { N_, 0 }, { N_, 0 },
+ { N_, 0 }, { N_, 0 }, { N_, 0 },
+ { N_, 0 }, { N_, 0 }, { N_, 0 },
+ { N_, 0 }, { N_, 0 }, { N_, 0 },
+ { N_, 0 }, { N_, 0 }, { N_, 0 },
+ { N_, 0 }, { N_, 0 }, { N_, 0 },
+ { N_, 0 }, { N_, 0 }, { N_, 0 },
+ { N_, 0 }, { N_, 0 }, { N_, 0 },
+ { N_, 0 }, { N_, 0 }, { N_, 0 },
+ { N_, 0 }, { N_, 0 }, { N_, 0 },
+ { N_, 0 }, { N_, 0 }, { 44, 5144 },
+ { 45, 5213 }, { 46, 5248 }, { N_, 0 },
+ { 47, 5273 }, { 48, 5358 }, { 49, 5490 },
+ { 50, 5508 }, { 51, 5526 }, { 52, 5586 },
+ { 53, 5670 }, { 54, 5730 }, { 55, 5778 },
+ { N_, 0 }, { N_, 0 }, { N_, 0 },
+ { N_, 0 }, { 56, 5786 }, { 57, 5856 },
+ { 58, 6027 }, { N_, 0 }, { N_, 0 },
+ { N_, 0 }, { N_, 0 }, { N_, 0 },
+ { N_, 0 }, { N_, 0 }, { N_, 0 },
+ { N_, 0 }, { N_, 0 }, { N_, 0 },
+ { N_, 0 }, { N_, 0 }, { N_, 0 },
+ { N_, 0 }, { N_, 0 }, { N_, 0 },
+ { N_, 0 }, { N_, 0 }, { N_, 0 },
+ { 59, 6070 }, { 60, 6082 }, { N_, 0 },
+ { 61, 6090 }, { N_, 0 }, { N_, 0 },
+ { N_, 0 }, { N_, 0 }, { N_, 0 },
+ { N_, 0 }, { N_, 0 }, { N_, 0 },
+ { N_, 0 }, { 62, 6096 }, { N_, 0 },
+ { N_, 0 }, { N_, 0 }, { N_, 0 },
+ { 63, 6099 }, { 64, 6102 }, { 65, 6105 },
+ { 66, 6297 }, { 67, 6489 }, { 68, 6681 },
+ { N_, 0 }, { N_, 0 }, { N_, 0 },
+ { N_, 0 }, { N_, 0 }, { N_, 0 },
+ { N_, 0 }, { N_, 0 }, { N_, 0 },
+ { N_, 0 }, { N_, 0 }, { N_, 0 },
+ { N_, 0 }, { N_, 0 }, { N_, 0 },
+ { N_, 0 }, { N_, 0 }, { N_, 0 },
+ { N_, 0 }, { N_, 0 }, { N_, 0 },
+ { N_, 0 }, { N_, 0 }, { N_, 0 },
+ { N_, 0 }, { N_, 0 }, { N_, 0 },
+ { N_, 0 }, { N_, 0 }, { N_, 0 },
+ { N_, 0 }, { N_, 0 }, { N_, 0 },
+ { N_, 0 }, { N_, 0 }, { N_, 0 },
+ { N_, 0 }, { N_, 0 }, { N_, 0 },
+ { N_, 0 }, { N_, 0 }, { N_, 0 },
+ { N_, 0 }, { N_, 0 }, { N_, 0 },
+ { N_, 0 }, { N_, 0 }, { N_, 0 },
+ { N_, 0 }, { N_, 0 }, { N_, 0 },
+ { N_, 0 }, { N_, 0 }, { N_, 0 },
+ { N_, 0 }, { N_, 0 }, { N_, 0 },
+ { N_, 0 }, { N_, 0 }, { N_, 0 },
+ { N_, 0 }, { N_, 0 }, { N_, 0 },
+ { N_, 0 },
+ },
+ { /* Third byte table 4. */
+ { N_, 0 }, { N_, 0 }, { N_, 0 },
+ { N_, 0 }, { N_, 0 }, { N_, 0 },
+ { N_, 0 }, { N_, 0 }, { N_, 0 },
+ { N_, 0 }, { N_, 0 }, { N_, 0 },
+ { N_, 0 }, { N_, 0 }, { N_, 0 },
+ { N_, 0 }, { N_, 0 }, { N_, 0 },
+ { N_, 0 }, { N_, 0 }, { N_, 0 },
+ { N_, 0 }, { N_, 0 }, { N_, 0 },
+ { N_, 0 }, { N_, 0 }, { N_, 0 },
+ { N_, 0 }, { N_, 0 }, { N_, 0 },
+ { N_, 0 }, { N_, 0 }, { N_, 0 },
+ { N_, 0 }, { N_, 0 }, { N_, 0 },
+ { N_, 0 }, { N_, 0 }, { N_, 0 },
+ { N_, 0 }, { N_, 0 }, { N_, 0 },
+ { N_, 0 }, { N_, 0 }, { N_, 0 },
+ { N_, 0 }, { N_, 0 }, { N_, 0 },
+ { N_, 0 }, { N_, 0 }, { N_, 0 },
+ { N_, 0 }, { N_, 0 }, { N_, 0 },
+ { N_, 0 }, { N_, 0 }, { N_, 0 },
+ { N_, 0 }, { N_, 0 }, { N_, 0 },
+ { N_, 0 }, { N_, 0 }, { N_, 0 },
+ { N_, 0 }, { N_, 0 }, { N_, 0 },
+ { N_, 0 }, { N_, 0 }, { N_, 0 },
+ { N_, 0 }, { N_, 0 }, { N_, 0 },
+ { N_, 0 }, { N_, 0 }, { N_, 0 },
+ { N_, 0 }, { N_, 0 }, { N_, 0 },
+ { N_, 0 }, { N_, 0 }, { N_, 0 },
+ { N_, 0 }, { N_, 0 }, { N_, 0 },
+ { N_, 0 }, { N_, 0 }, { N_, 0 },
+ { N_, 0 }, { N_, 0 }, { N_, 0 },
+ { N_, 0 }, { N_, 0 }, { N_, 0 },
+ { N_, 0 }, { N_, 0 }, { N_, 0 },
+ { N_, 0 }, { N_, 0 }, { N_, 0 },
+ { N_, 0 }, { N_, 0 }, { N_, 0 },
+ { N_, 0 }, { N_, 0 }, { N_, 0 },
+ { N_, 0 }, { N_, 0 }, { N_, 0 },
+ { N_, 0 }, { N_, 0 }, { N_, 0 },
+ { N_, 0 }, { N_, 0 }, { N_, 0 },
+ { N_, 0 }, { N_, 0 }, { N_, 0 },
+ { N_, 0 }, { N_, 0 }, { N_, 0 },
+ { N_, 0 }, { N_, 0 }, { N_, 0 },
+ { N_, 0 }, { N_, 0 }, { N_, 0 },
+ { N_, 0 }, { N_, 0 }, { 69, 6747 },
+ { 70, 6760 }, { 71, 6935 }, { 72, 7033 },
+ { 73, 7186 }, { 74, 7231 }, { 75, 7423 },
+ { N_, 0 }, { 0x8008, 7510 }, { 76, 7891 },
+ { 77, 8103 }, { 78, 8280 }, { 0x8009, 8482 },
+ { 0x800A, 9259 }, { 79, 9701 }, { 80, 9872 },
+ { N_, 0 }, { N_, 0 }, { N_, 0 },
+ { N_, 0 }, { N_, 0 }, { N_, 0 },
+ { N_, 0 }, { N_, 0 }, { N_, 0 },
+ { N_, 0 }, { N_, 0 }, { N_, 0 },
+ { N_, 0 }, { N_, 0 }, { N_, 0 },
+ { N_, 0 }, { N_, 0 }, { N_, 0 },
+ { N_, 0 }, { N_, 0 }, { N_, 0 },
+ { N_, 0 }, { N_, 0 }, { N_, 0 },
+ { N_, 0 }, { N_, 0 }, { N_, 0 },
+ { N_, 0 }, { N_, 0 }, { N_, 0 },
+ { N_, 0 }, { N_, 0 }, { N_, 0 },
+ { N_, 0 }, { N_, 0 }, { N_, 0 },
+ { N_, 0 }, { N_, 0 }, { N_, 0 },
+ { N_, 0 }, { N_, 0 }, { N_, 0 },
+ { N_, 0 }, { N_, 0 }, { N_, 0 },
+ { N_, 0 }, { N_, 0 }, { N_, 0 },
+ { N_, 0 }, { N_, 0 }, { N_, 0 },
+ { N_, 0 }, { N_, 0 }, { N_, 0 },
+ { N_, 0 }, { N_, 0 }, { N_, 0 },
+ { N_, 0 }, { N_, 0 }, { N_, 0 },
+ { N_, 0 }, { N_, 0 }, { N_, 0 },
+ { N_, 0 }, { N_, 0 }, { N_, 0 },
+ { N_, 0 }, { N_, 0 }, { N_, 0 },
+ { N_, 0 }, { N_, 0 }, { N_, 0 },
+ { N_, 0 }, { N_, 0 }, { N_, 0 },
+ { N_, 0 }, { N_, 0 }, { N_, 0 },
+ { N_, 0 }, { N_, 0 }, { N_, 0 },
+ { N_, 0 }, { N_, 0 }, { N_, 0 },
+ { N_, 0 }, { N_, 0 }, { N_, 0 },
+ { N_, 0 }, { N_, 0 }, { N_, 0 },
+ { N_, 0 }, { N_, 0 }, { N_, 0 },
+ { N_, 0 }, { N_, 0 }, { N_, 0 },
+ { N_, 0 }, { N_, 0 }, { N_, 0 },
+ { N_, 0 }, { N_, 0 }, { N_, 0 },
+ { N_, 0 }, { N_, 0 }, { N_, 0 },
+ { N_, 0 }, { N_, 0 }, { N_, 0 },
+ { N_, 0 }, { N_, 0 }, { N_, 0 },
+ { N_, 0 },
+ },
+ { /* Third byte table 5. */
+ { N_, 0 }, { N_, 0 }, { N_, 0 },
+ { N_, 0 }, { N_, 0 }, { N_, 0 },
+ { N_, 0 }, { N_, 0 }, { N_, 0 },
+ { N_, 0 }, { N_, 0 }, { N_, 0 },
+ { N_, 0 }, { N_, 0 }, { N_, 0 },
+ { N_, 0 }, { N_, 0 }, { N_, 0 },
+ { N_, 0 }, { N_, 0 }, { N_, 0 },
+ { N_, 0 }, { N_, 0 }, { N_, 0 },
+ { N_, 0 }, { N_, 0 }, { N_, 0 },
+ { N_, 0 }, { N_, 0 }, { N_, 0 },
+ { N_, 0 }, { N_, 0 }, { N_, 0 },
+ { N_, 0 }, { N_, 0 }, { N_, 0 },
+ { N_, 0 }, { N_, 0 }, { N_, 0 },
+ { N_, 0 }, { N_, 0 }, { N_, 0 },
+ { N_, 0 }, { N_, 0 }, { N_, 0 },
+ { N_, 0 }, { N_, 0 }, { N_, 0 },
+ { N_, 0 }, { N_, 0 }, { N_, 0 },
+ { N_, 0 }, { N_, 0 }, { N_, 0 },
+ { N_, 0 }, { N_, 0 }, { N_, 0 },
+ { N_, 0 }, { N_, 0 }, { N_, 0 },
+ { N_, 0 }, { N_, 0 }, { N_, 0 },
+ { N_, 0 }, { N_, 0 }, { N_, 0 },
+ { N_, 0 }, { N_, 0 }, { N_, 0 },
+ { N_, 0 }, { N_, 0 }, { N_, 0 },
+ { N_, 0 }, { N_, 0 }, { N_, 0 },
+ { N_, 0 }, { N_, 0 }, { N_, 0 },
+ { N_, 0 }, { N_, 0 }, { N_, 0 },
+ { N_, 0 }, { N_, 0 }, { N_, 0 },
+ { N_, 0 }, { N_, 0 }, { N_, 0 },
+ { N_, 0 }, { N_, 0 }, { N_, 0 },
+ { N_, 0 }, { N_, 0 }, { N_, 0 },
+ { N_, 0 }, { N_, 0 }, { N_, 0 },
+ { N_, 0 }, { N_, 0 }, { N_, 0 },
+ { N_, 0 }, { N_, 0 }, { N_, 0 },
+ { N_, 0 }, { N_, 0 }, { N_, 0 },
+ { N_, 0 }, { N_, 0 }, { N_, 0 },
+ { N_, 0 }, { N_, 0 }, { N_, 0 },
+ { N_, 0 }, { N_, 0 }, { N_, 0 },
+ { N_, 0 }, { N_, 0 }, { N_, 0 },
+ { N_, 0 }, { N_, 0 }, { N_, 0 },
+ { N_, 0 }, { N_, 0 }, { N_, 0 },
+ { N_, 0 }, { N_, 0 }, { N_, 0 },
+ { N_, 0 }, { N_, 0 }, { N_, 0 },
+ { N_, 0 }, { N_, 0 }, { N_, 0 },
+ { N_, 0 }, { N_, 0 }, { N_, 0 },
+ { N_, 0 }, { N_, 0 }, { N_, 0 },
+ { N_, 0 }, { N_, 0 }, { N_, 0 },
+ { N_, 0 }, { N_, 0 }, { N_, 0 },
+ { N_, 0 }, { N_, 0 }, { N_, 0 },
+ { N_, 0 }, { N_, 0 }, { N_, 0 },
+ { N_, 0 }, { N_, 0 }, { N_, 0 },
+ { N_, 0 }, { N_, 0 }, { N_, 0 },
+ { N_, 0 }, { N_, 0 }, { N_, 0 },
+ { N_, 0 }, { N_, 0 }, { N_, 0 },
+ { N_, 0 }, { N_, 0 }, { 0x800B, 10106 },
+ { 0x800C, 10362 }, { 0x800D, 10618 }, { 0x800E, 10874 },
+ { 81, 11130 }, { 82, 11330 }, { 0x800F, 11566 },
+ { 83, 11822 }, { 84, 11932 }, { 85, 12096 },
+ { 86, 12261 }, { 87, 12369 }, { 0x8010, 12533 },
+ { 0x8011, 12799 }, { 0x8012, 13073 }, { 0x8013, 13339 },
+ { 88, 13605 }, { 0x8014, 13853 }, { 0x8015, 14141 },
+ { 89, 14513 }, { 90, 14691 }, { 91, 14746 },
+ { 92, 14860 }, { 93, 15012 }, { 94, 15162 },
+ { 95, 15225 }, { 96, 15355 }, { 97, 15544 },
+ { N_, 0 }, { N_, 0 }, { N_, 0 },
+ { N_, 0 }, { N_, 0 }, { N_, 0 },
+ { N_, 0 }, { N_, 0 }, { N_, 0 },
+ { N_, 0 }, { N_, 0 }, { N_, 0 },
+ { N_, 0 }, { N_, 0 }, { N_, 0 },
+ { N_, 0 }, { N_, 0 }, { N_, 0 },
+ { N_, 0 }, { N_, 0 }, { N_, 0 },
+ { N_, 0 }, { N_, 0 }, { N_, 0 },
+ { N_, 0 }, { N_, 0 }, { N_, 0 },
+ { N_, 0 }, { N_, 0 }, { N_, 0 },
+ { N_, 0 }, { N_, 0 }, { N_, 0 },
+ { N_, 0 }, { N_, 0 }, { N_, 0 },
+ { N_, 0 }, { N_, 0 }, { N_, 0 },
+ { N_, 0 }, { N_, 0 }, { N_, 0 },
+ { N_, 0 }, { N_, 0 }, { N_, 0 },
+ { N_, 0 }, { N_, 0 }, { N_, 0 },
+ { N_, 0 }, { N_, 0 }, { N_, 0 },
+ { N_, 0 }, { N_, 0 }, { N_, 0 },
+ { N_, 0 }, { N_, 0 }, { N_, 0 },
+ { N_, 0 }, { N_, 0 }, { N_, 0 },
+ { N_, 0 }, { N_, 0 }, { N_, 0 },
+ { N_, 0 },
+ },
+ { /* Third byte table 6. */
+ { N_, 0 }, { N_, 0 }, { N_, 0 },
+ { N_, 0 }, { N_, 0 }, { N_, 0 },
+ { N_, 0 }, { N_, 0 }, { N_, 0 },
+ { N_, 0 }, { N_, 0 }, { N_, 0 },
+ { N_, 0 }, { N_, 0 }, { N_, 0 },
+ { N_, 0 }, { N_, 0 }, { N_, 0 },
+ { N_, 0 }, { N_, 0 }, { N_, 0 },
+ { N_, 0 }, { N_, 0 }, { N_, 0 },
+ { N_, 0 }, { N_, 0 }, { N_, 0 },
+ { N_, 0 }, { N_, 0 }, { N_, 0 },
+ { N_, 0 }, { N_, 0 }, { N_, 0 },
+ { N_, 0 }, { N_, 0 }, { N_, 0 },
+ { N_, 0 }, { N_, 0 }, { N_, 0 },
+ { N_, 0 }, { N_, 0 }, { N_, 0 },
+ { N_, 0 }, { N_, 0 }, { N_, 0 },
+ { N_, 0 }, { N_, 0 }, { N_, 0 },
+ { N_, 0 }, { N_, 0 }, { N_, 0 },
+ { N_, 0 }, { N_, 0 }, { N_, 0 },
+ { N_, 0 }, { N_, 0 }, { N_, 0 },
+ { N_, 0 }, { N_, 0 }, { N_, 0 },
+ { N_, 0 }, { N_, 0 }, { N_, 0 },
+ { N_, 0 }, { N_, 0 }, { N_, 0 },
+ { N_, 0 }, { N_, 0 }, { N_, 0 },
+ { N_, 0 }, { N_, 0 }, { N_, 0 },
+ { N_, 0 }, { N_, 0 }, { N_, 0 },
+ { N_, 0 }, { N_, 0 }, { N_, 0 },
+ { N_, 0 }, { N_, 0 }, { N_, 0 },
+ { N_, 0 }, { N_, 0 }, { N_, 0 },
+ { N_, 0 }, { N_, 0 }, { N_, 0 },
+ { N_, 0 }, { N_, 0 }, { N_, 0 },
+ { N_, 0 }, { N_, 0 }, { N_, 0 },
+ { N_, 0 }, { N_, 0 }, { N_, 0 },
+ { N_, 0 }, { N_, 0 }, { N_, 0 },
+ { N_, 0 }, { N_, 0 }, { N_, 0 },
+ { N_, 0 }, { N_, 0 }, { N_, 0 },
+ { N_, 0 }, { N_, 0 }, { N_, 0 },
+ { N_, 0 }, { N_, 0 }, { N_, 0 },
+ { N_, 0 }, { N_, 0 }, { N_, 0 },
+ { N_, 0 }, { N_, 0 }, { N_, 0 },
+ { N_, 0 }, { N_, 0 }, { N_, 0 },
+ { N_, 0 }, { N_, 0 }, { N_, 0 },
+ { N_, 0 }, { N_, 0 }, { N_, 0 },
+ { N_, 0 }, { N_, 0 }, { N_, 0 },
+ { N_, 0 }, { N_, 0 }, { N_, 0 },
+ { N_, 0 }, { 98, 15644 }, { 99, 15727 },
+ { 100, 15784 }, { N_, 0 }, { N_, 0 },
+ { N_, 0 }, { N_, 0 }, { N_, 0 },
+ { N_, 0 }, { N_, 0 }, { N_, 0 },
+ { 101, 15797 }, { 102, 15861 }, { 103, 15924 },
+ { 104, 15978 }, { 105, 16041 }, { 106, 16098 },
+ { 107, 16157 }, { 108, 16221 }, { 109, 16285 },
+ { 110, 16349 }, { 111, 16413 }, { 112, 16501 },
+ { 113, 16632 }, { 114, 16762 }, { 115, 16892 },
+ { 116, 17022 }, { N_, 0 }, { N_, 0 },
+ { N_, 0 }, { N_, 0 }, { N_, 0 },
+ { N_, 0 }, { N_, 0 }, { N_, 0 },
+ { N_, 0 }, { N_, 0 }, { N_, 0 },
+ { N_, 0 }, { N_, 0 }, { N_, 0 },
+ { N_, 0 }, { N_, 0 }, { N_, 0 },
+ { N_, 0 }, { N_, 0 }, { N_, 0 },
+ { N_, 0 }, { N_, 0 }, { N_, 0 },
+ { N_, 0 }, { N_, 0 }, { N_, 0 },
+ { N_, 0 }, { N_, 0 }, { N_, 0 },
+ { N_, 0 }, { N_, 0 }, { N_, 0 },
+ { N_, 0 }, { N_, 0 }, { N_, 0 },
+ { N_, 0 }, { N_, 0 }, { N_, 0 },
+ { N_, 0 }, { N_, 0 }, { N_, 0 },
+ { N_, 0 }, { N_, 0 }, { N_, 0 },
+ { N_, 0 }, { N_, 0 }, { N_, 0 },
+ { N_, 0 }, { N_, 0 }, { N_, 0 },
+ { N_, 0 }, { N_, 0 }, { N_, 0 },
+ { N_, 0 }, { N_, 0 }, { N_, 0 },
+ { N_, 0 }, { N_, 0 }, { N_, 0 },
+ { N_, 0 }, { N_, 0 }, { N_, 0 },
+ { N_, 0 }, { N_, 0 }, { N_, 0 },
+ { N_, 0 }, { N_, 0 }, { N_, 0 },
+ { N_, 0 }, { N_, 0 }, { N_, 0 },
+ { N_, 0 }, { N_, 0 }, { N_, 0 },
+ { N_, 0 }, { N_, 0 }, { N_, 0 },
+ { N_, 0 }, { N_, 0 }, { N_, 0 },
+ { N_, 0 }, { N_, 0 }, { N_, 0 },
+ { N_, 0 }, { N_, 0 }, { N_, 0 },
+ { N_, 0 }, { N_, 0 }, { N_, 0 },
+ { N_, 0 }, { N_, 0 }, { N_, 0 },
+ { N_, 0 }, { N_, 0 }, { N_, 0 },
+ { N_, 0 },
+ },
+ { /* Third byte table 7. */
+ { N_, 0 }, { N_, 0 }, { N_, 0 },
+ { N_, 0 }, { N_, 0 }, { N_, 0 },
+ { N_, 0 }, { N_, 0 }, { N_, 0 },
+ { N_, 0 }, { N_, 0 }, { N_, 0 },
+ { N_, 0 }, { N_, 0 }, { N_, 0 },
+ { N_, 0 }, { N_, 0 }, { N_, 0 },
+ { N_, 0 }, { N_, 0 }, { N_, 0 },
+ { N_, 0 }, { N_, 0 }, { N_, 0 },
+ { N_, 0 }, { N_, 0 }, { N_, 0 },
+ { N_, 0 }, { N_, 0 }, { N_, 0 },
+ { N_, 0 }, { N_, 0 }, { N_, 0 },
+ { N_, 0 }, { N_, 0 }, { N_, 0 },
+ { N_, 0 }, { N_, 0 }, { N_, 0 },
+ { N_, 0 }, { N_, 0 }, { N_, 0 },
+ { N_, 0 }, { N_, 0 }, { N_, 0 },
+ { N_, 0 }, { N_, 0 }, { N_, 0 },
+ { N_, 0 }, { N_, 0 }, { N_, 0 },
+ { N_, 0 }, { N_, 0 }, { N_, 0 },
+ { N_, 0 }, { N_, 0 }, { N_, 0 },
+ { N_, 0 }, { N_, 0 }, { N_, 0 },
+ { N_, 0 }, { N_, 0 }, { N_, 0 },
+ { N_, 0 }, { N_, 0 }, { N_, 0 },
+ { N_, 0 }, { N_, 0 }, { N_, 0 },
+ { N_, 0 }, { N_, 0 }, { N_, 0 },
+ { N_, 0 }, { N_, 0 }, { N_, 0 },
+ { N_, 0 }, { N_, 0 }, { N_, 0 },
+ { N_, 0 }, { N_, 0 }, { N_, 0 },
+ { N_, 0 }, { N_, 0 }, { N_, 0 },
+ { N_, 0 }, { N_, 0 }, { N_, 0 },
+ { N_, 0 }, { N_, 0 }, { N_, 0 },
+ { N_, 0 }, { N_, 0 }, { N_, 0 },
+ { N_, 0 }, { N_, 0 }, { N_, 0 },
+ { N_, 0 }, { N_, 0 }, { N_, 0 },
+ { N_, 0 }, { N_, 0 }, { N_, 0 },
+ { N_, 0 }, { N_, 0 }, { N_, 0 },
+ { N_, 0 }, { N_, 0 }, { N_, 0 },
+ { N_, 0 }, { N_, 0 }, { N_, 0 },
+ { N_, 0 }, { N_, 0 }, { N_, 0 },
+ { N_, 0 }, { N_, 0 }, { N_, 0 },
+ { N_, 0 }, { N_, 0 }, { N_, 0 },
+ { N_, 0 }, { N_, 0 }, { N_, 0 },
+ { N_, 0 }, { N_, 0 }, { N_, 0 },
+ { N_, 0 }, { N_, 0 }, { N_, 0 },
+ { N_, 0 }, { N_, 0 }, { N_, 0 },
+ { N_, 0 }, { N_, 0 }, { N_, 0 },
+ { N_, 0 }, { N_, 0 }, { N_, 0 },
+ { N_, 0 }, { N_, 0 }, { N_, 0 },
+ { N_, 0 }, { N_, 0 }, { N_, 0 },
+ { N_, 0 }, { N_, 0 }, { N_, 0 },
+ { N_, 0 }, { N_, 0 }, { N_, 0 },
+ { N_, 0 }, { N_, 0 }, { N_, 0 },
+ { N_, 0 }, { N_, 0 }, { N_, 0 },
+ { N_, 0 }, { N_, 0 }, { N_, 0 },
+ { N_, 0 }, { 0x8016, 17097 }, { 0x8017, 17360 },
+ { 0x8018, 17623 }, { 0x8019, 17888 }, { 0x801A, 18153 },
+ { 0x801B, 18425 }, { 0x801C, 18702 }, { 0x801D, 18971 },
+ { 117, 19243 }, { N_, 0 }, { N_, 0 },
+ { N_, 0 }, { N_, 0 }, { N_, 0 },
+ { N_, 0 }, { N_, 0 }, { N_, 0 },
+ { N_, 0 }, { N_, 0 }, { N_, 0 },
+ { N_, 0 }, { N_, 0 }, { N_, 0 },
+ { N_, 0 }, { N_, 0 }, { N_, 0 },
+ { N_, 0 }, { N_, 0 }, { N_, 0 },
+ { N_, 0 }, { N_, 0 }, { N_, 0 },
+ { N_, 0 }, { N_, 0 }, { N_, 0 },
+ { N_, 0 }, { N_, 0 }, { N_, 0 },
+ { N_, 0 }, { N_, 0 }, { N_, 0 },
+ { N_, 0 }, { N_, 0 }, { N_, 0 },
+ { N_, 0 }, { N_, 0 }, { N_, 0 },
+ { N_, 0 }, { N_, 0 }, { N_, 0 },
+ { N_, 0 }, { N_, 0 }, { N_, 0 },
+ { N_, 0 }, { N_, 0 }, { N_, 0 },
+ { N_, 0 }, { N_, 0 }, { N_, 0 },
+ { N_, 0 }, { N_, 0 }, { N_, 0 },
+ { N_, 0 }, { N_, 0 }, { N_, 0 },
+ { N_, 0 }, { N_, 0 }, { N_, 0 },
+ { N_, 0 }, { N_, 0 }, { N_, 0 },
+ { N_, 0 }, { N_, 0 }, { N_, 0 },
+ { N_, 0 }, { N_, 0 }, { N_, 0 },
+ { N_, 0 }, { N_, 0 }, { N_, 0 },
+ { N_, 0 }, { N_, 0 }, { N_, 0 },
+ { N_, 0 }, { N_, 0 }, { N_, 0 },
+ { N_, 0 }, { N_, 0 }, { N_, 0 },
+ { N_, 0 }, { N_, 0 }, { N_, 0 },
+ { N_, 0 }, { N_, 0 }, { N_, 0 },
+ { N_, 0 },
+ },
+ },
+};
+
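+/*
+ * The { tbl_id, base } pairs above select one of the fourth-byte
+ * tables that follow; N_ marks a third byte with no decomposition.
+ * Each fourth-byte table stores 257 cumulative offsets, so entries
+ * b4 and b4 + 1 bound the decomposition bytes for fourth byte b4.
+ * A tbl_id with the 0x8000 bit set refers to a companion table of
+ * 16-bit offsets, needed where the cumulative offsets exceed what a
+ * uchar_t can hold.  A minimal lookup sketch, assuming this layout
+ * (the member names, the 16-bit table name, and the meaning of the
+ * leading [2] index -- one table per supported Unicode version --
+ * are assumptions, not verified against the rest of this patch):
+ *
+ *	uint16_t id = u8_decomp_b3_tbl[uv][b2][b3].tbl_id;
+ *	if (id != N_) {
+ *		uint16_t base = u8_decomp_b3_tbl[uv][b2][b3].base;
+ *		size_t start, end;
+ *		if (id & 0x8000) {
+ *			id -= 0x8000;
+ *			start = u8_decomp_b4_16bit_tbl[uv][id][b4];
+ *			end = u8_decomp_b4_16bit_tbl[uv][id][b4 + 1];
+ *		} else {
+ *			start = u8_decomp_b4_tbl[uv][id][b4];
+ *			end = u8_decomp_b4_tbl[uv][id][b4 + 1];
+ *		}
+ *		... the decomposition is bytes [base + start,
+ *		base + end) of the final data table; start == end
+ *		means this fourth byte has no decomposition.
+ *	}
+ */
+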
+static const uchar_t u8_decomp_b4_tbl[2][118][257] = {
+ {
+ { /* Fourth byte table 0. */
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 1, 1, 1, 1, 1, 1, 1,
+ 1, 4, 4, 5, 5, 5, 5, 5,
+ 8, 8, 8, 9, 10, 13, 15, 15,
+ 15, 18, 19, 20, 20, 25, 30, 35,
+ 35, 35, 35, 35, 35, 35, 35, 35,
+ 35, 35, 35, 35, 35, 35, 35, 35,
+ 35, 35, 35, 35, 35, 35, 35, 35,
+ 35, 35, 35, 35, 35, 35, 35, 35,
+ 35, 35, 35, 35, 35, 35, 35, 35,
+ 35, 35, 35, 35, 35, 35, 35, 35,
+ 35, 35, 35, 35, 35, 35, 35, 35,
+ 35, 35, 35, 35, 35, 35, 35, 35,
+ 35,
+ },
+ { /* Fourth byte table 1. */
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 4, 8, 12, 16, 20, 24, 24,
+ 28, 32, 36, 40, 44, 48, 52, 56,
+ 60, 60, 64, 68, 72, 76, 80, 84,
+ 84, 84, 88, 92, 96, 100, 104, 104,
+ 104, 108, 112, 116, 120, 124, 128, 128,
+ 132, 136, 140, 144, 148, 152, 156, 160,
+ 164, 164, 168, 172, 176, 180, 184, 188,
+ 188, 188, 192, 196, 200, 204, 208, 208,
+ 212, 212, 212, 212, 212, 212, 212, 212,
+ 212, 212, 212, 212, 212, 212, 212, 212,
+ 212, 212, 212, 212, 212, 212, 212, 212,
+ 212, 212, 212, 212, 212, 212, 212, 212,
+ 212, 212, 212, 212, 212, 212, 212, 212,
+ 212, 212, 212, 212, 212, 212, 212, 212,
+ 212, 212, 212, 212, 212, 212, 212, 212,
+ 212, 212, 212, 212, 212, 212, 212, 212,
+ 212,
+ },
+ { /* Fourth byte table 2. */
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 4, 8, 12, 16, 20, 24, 28,
+ 32, 36, 40, 44, 48, 52, 56, 60,
+ 64, 64, 64, 68, 72, 76, 80, 84,
+ 88, 92, 96, 100, 104, 108, 112, 116,
+ 120, 124, 128, 132, 136, 140, 144, 144,
+ 144, 148, 152, 156, 160, 164, 168, 172,
+ 176, 180, 180, 182, 184, 188, 192, 196,
+ 200, 200, 204, 208, 212, 216, 220, 224,
+ 227, 227, 227, 227, 227, 227, 227, 227,
+ 227, 227, 227, 227, 227, 227, 227, 227,
+ 227, 227, 227, 227, 227, 227, 227, 227,
+ 227, 227, 227, 227, 227, 227, 227, 227,
+ 227, 227, 227, 227, 227, 227, 227, 227,
+ 227, 227, 227, 227, 227, 227, 227, 227,
+ 227, 227, 227, 227, 227, 227, 227, 227,
+ 227, 227, 227, 227, 227, 227, 227, 227,
+ 227,
+ },
+ { /* Fourth byte table 3. */
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 3, 3, 3, 7, 11, 15, 19,
+ 23, 27, 30, 30, 30, 34, 38, 42,
+ 46, 50, 54, 54, 54, 58, 62, 66,
+ 70, 74, 78, 82, 86, 90, 94, 98,
+ 102, 106, 110, 114, 118, 122, 126, 126,
+ 126, 130, 134, 138, 142, 146, 150, 154,
+ 158, 162, 166, 170, 174, 178, 182, 186,
+ 190, 194, 198, 202, 206, 210, 214, 218,
+ 219, 219, 219, 219, 219, 219, 219, 219,
+ 219, 219, 219, 219, 219, 219, 219, 219,
+ 219, 219, 219, 219, 219, 219, 219, 219,
+ 219, 219, 219, 219, 219, 219, 219, 219,
+ 219, 219, 219, 219, 219, 219, 219, 219,
+ 219, 219, 219, 219, 219, 219, 219, 219,
+ 219, 219, 219, 219, 219, 219, 219, 219,
+ 219, 219, 219, 219, 219, 219, 219, 219,
+ 219,
+ },
+ { /* Fourth byte table 4. */
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 4, 8, 8, 8, 8, 8, 8,
+ 8, 8, 8, 8, 8, 8, 8, 8,
+ 12, 16, 16, 16, 16, 16, 16, 16,
+ 16, 16, 16, 16, 16, 16, 16, 16,
+ 16, 16, 16, 16, 16, 16, 16, 16,
+ 16, 16, 16, 16, 16, 16, 16, 16,
+ 16, 16, 16, 16, 16, 16, 16, 16,
+ 16, 16, 16, 16, 16, 16, 16, 16,
+ 16, 16, 16, 16, 16, 16, 16, 16,
+ 16, 16, 16, 16, 16, 16, 16, 16,
+ 16, 16, 16, 16, 16, 16, 16, 16,
+ 16, 16, 16, 16, 16, 16, 16, 16,
+ 16,
+ },
+ { /* Fourth byte table 5. */
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 4, 8, 12,
+ 14, 16, 18, 20, 22, 24, 28, 32,
+ 36, 40, 44, 48, 52, 56, 62, 68,
+ 74, 80, 86, 92, 98, 104, 104, 110,
+ 116, 122, 128, 133, 138, 138, 138, 142,
+ 146, 150, 154, 158, 162, 168, 174, 179,
+ 184, 188, 190, 192, 194, 198, 202, 202,
+ 202, 206, 210, 216, 222, 227, 232, 237,
+ 242, 242, 242, 242, 242, 242, 242, 242,
+ 242, 242, 242, 242, 242, 242, 242, 242,
+ 242, 242, 242, 242, 242, 242, 242, 242,
+ 242, 242, 242, 242, 242, 242, 242, 242,
+ 242, 242, 242, 242, 242, 242, 242, 242,
+ 242, 242, 242, 242, 242, 242, 242, 242,
+ 242, 242, 242, 242, 242, 242, 242, 242,
+ 242, 242, 242, 242, 242, 242, 242, 242,
+ 242,
+ },
+ { /* Fourth byte table 6. */
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 4, 8, 12, 16, 20, 24, 28,
+ 32, 36, 40, 44, 48, 52, 56, 60,
+ 64, 68, 72, 76, 80, 84, 88, 92,
+ 96, 100, 104, 108, 112, 112, 112, 116,
+ 120, 120, 120, 120, 120, 120, 120, 124,
+ 128, 132, 136, 142, 148, 154, 160, 164,
+ 168, 174, 180, 184, 188, 188, 188, 188,
+ 188, 188, 188, 188, 188, 188, 188, 188,
+ 188, 188, 188, 188, 188, 188, 188, 188,
+ 188, 188, 188, 188, 188, 188, 188, 188,
+ 188, 188, 188, 188, 188, 188, 188, 188,
+ 188, 188, 188, 188, 188, 188, 188, 188,
+ 188, 188, 188, 188, 188, 188, 188, 188,
+ 188, 188, 188, 188, 188, 188, 188, 188,
+ 188, 188, 188, 188, 188, 188, 188, 188,
+ 188, 188, 188, 188, 188, 188, 188, 188,
+ 188,
+ },
+ { /* Fourth byte table 7. */
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 1, 3, 4, 5, 7, 9, 11,
+ 12, 13, 13, 13, 13, 13, 13, 13,
+ 13, 13, 13, 13, 13, 13, 13, 13,
+ 13, 13, 13, 13, 13, 13, 13, 13,
+ 13, 13, 13, 13, 13, 13, 13, 13,
+ 13, 13, 13, 13, 13, 13, 13, 13,
+ 13, 13, 13, 13, 13, 13, 13, 13,
+ 13, 13, 13, 13, 13, 13, 13, 13,
+ 13, 13, 13, 13, 13, 13, 13, 13,
+ 13, 13, 13, 13, 13, 13, 13, 13,
+ 13,
+ },
+ { /* Fourth byte table 8. */
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 3, 6, 9, 12, 15, 18, 18,
+ 18, 20, 21, 22, 23, 25, 25, 25,
+ 25, 25, 25, 25, 25, 25, 25, 25,
+ 25, 25, 25, 25, 25, 25, 25, 25,
+ 25, 25, 25, 25, 25, 25, 25, 25,
+ 25, 25, 25, 25, 25, 25, 25, 25,
+ 25, 25, 25, 25, 25, 25, 25, 25,
+ 25, 25, 25, 25, 25, 25, 25, 25,
+ 25, 25, 25, 25, 25, 25, 25, 25,
+ 25, 25, 25, 25, 25, 25, 25, 25,
+ 25, 25, 25, 25, 25, 25, 25, 25,
+ 25, 25, 25, 25, 25, 25, 25, 25,
+ 25, 25, 25, 25, 25, 25, 25, 25,
+ 25,
+ },
+ { /* Fourth byte table 9. */
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 3, 6, 6, 9, 14, 14, 14,
+ 14, 14, 14, 14, 14, 14, 14, 14,
+ 14, 14, 14, 14, 14, 14, 14, 14,
+ 14, 14, 14, 14, 14, 14, 14, 14,
+ 14, 14, 14, 14, 14, 14, 14, 14,
+ 14, 14, 14, 14, 14, 14, 14, 14,
+ 14, 14, 14, 14, 14, 17, 17, 17,
+ 17, 17, 17, 20, 20, 20, 20, 22,
+ 22, 22, 22, 22, 22, 22, 22, 22,
+ 22, 22, 22, 22, 22, 22, 22, 22,
+ 22, 22, 22, 22, 22, 22, 22, 22,
+ 22, 22, 22, 22, 22, 22, 22, 22,
+ 22, 22, 22, 22, 22, 22, 22, 22,
+ 22, 22, 22, 22, 22, 22, 22, 22,
+ 22, 22, 22, 22, 22, 22, 22, 22,
+ 22, 22, 22, 22, 22, 22, 22, 22,
+ 22,
+ },
+ { /* Fourth byte table 10. */
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 3, 14, 19,
+ 22, 27, 32, 37, 37, 42, 42, 47,
+ 52, 59, 59, 59, 59, 59, 59, 59,
+ 59, 59, 59, 59, 59, 59, 59, 59,
+ 59, 59, 59, 59, 59, 59, 59, 59,
+ 59, 59, 59, 64, 69, 74, 79, 84,
+ 89, 96, 96, 96, 96, 96, 96, 96,
+ 96, 96, 96, 96, 96, 96, 96, 96,
+ 96, 96, 96, 96, 96, 96, 96, 96,
+ 96, 96, 96, 96, 96, 96, 96, 96,
+ 96, 96, 96, 96, 96, 96, 96, 96,
+ 96, 96, 96, 96, 96, 96, 96, 96,
+ 96, 96, 96, 96, 96, 96, 96, 96,
+ 96, 96, 96, 96, 96, 96, 96, 96,
+ 96, 96, 96, 96, 96, 96, 96, 96,
+ 96, 96, 96, 96, 96, 96, 96, 96,
+ 96,
+ },
+ { /* Fourth byte table 11. */
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 5, 10, 15, 20, 25,
+ 25, 27, 29, 31, 41, 51, 53, 55,
+ 55, 55, 55, 55, 55, 55, 55, 55,
+ 55, 55, 55, 55, 55, 55, 55, 55,
+ 55, 55, 55, 55, 55, 55, 55, 55,
+ 55, 57, 59, 61, 61, 63, 65, 65,
+ 65, 65, 65, 65, 65, 65, 65, 65,
+ 65, 65, 65, 65, 65, 65, 65, 65,
+ 65, 65, 65, 65, 65, 65, 65, 65,
+ 65, 65, 65, 65, 65, 65, 65, 65,
+ 65, 65, 65, 65, 65, 65, 65, 65,
+ 65, 65, 65, 65, 65, 65, 65, 65,
+ 65, 65, 65, 65, 65, 65, 65, 65,
+ 65, 65, 65, 65, 65, 65, 65, 65,
+ 65, 65, 65, 65, 65, 65, 65, 65,
+ 65,
+ },
+ { /* Fourth byte table 12. */
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 5, 10, 10, 15, 15, 15, 15,
+ 20, 20, 20, 20, 20, 25, 30, 35,
+ 35, 35, 35, 35, 35, 35, 35, 35,
+ 35, 35, 40, 40, 40, 40, 40, 40,
+ 40, 40, 40, 40, 40, 40, 40, 40,
+ 40, 40, 40, 40, 40, 40, 40, 40,
+ 40, 40, 40, 40, 40, 40, 40, 40,
+ 40, 40, 45, 45, 45, 45, 45, 45,
+ 45, 45, 45, 45, 45, 45, 45, 45,
+ 45, 45, 45, 45, 45, 45, 45, 45,
+ 45, 45, 45, 45, 45, 45, 45, 45,
+ 45, 45, 45, 45, 45, 45, 45, 45,
+ 45, 45, 45, 45, 45, 45, 45, 45,
+ 45, 45, 45, 45, 45, 45, 45, 45,
+ 45, 45, 45, 45, 45, 45, 45, 45,
+ 45, 45, 45, 45, 45, 45, 45, 45,
+ 45,
+ },
+ { /* Fourth byte table 13. */
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 5, 10, 10, 15, 15, 15, 15,
+ 20, 20, 20, 20, 20, 25, 30, 35,
+ 35, 35, 35, 35, 35, 35, 35, 35,
+ 35, 35, 35, 35, 35, 35, 35, 35,
+ 35, 35, 35, 35, 35, 35, 35, 40,
+ 45, 45, 45, 45, 45, 45, 45, 45,
+ 45, 45, 45, 45, 45, 45, 45, 45,
+ 45, 45, 45, 45, 45, 45, 45, 45,
+ 45, 45, 45, 45, 45, 45, 45, 45,
+ 45, 45, 45, 45, 45, 45, 45, 45,
+ 45, 45, 45, 45, 45, 45, 45, 45,
+ 45, 45, 45, 45, 45, 45, 45, 45,
+ 45, 45, 45, 45, 45, 45, 45, 45,
+ 45, 45, 45, 45, 45, 45, 45, 45,
+ 45,
+ },
+ { /* Fourth byte table 14. */
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 5, 10, 10, 10, 10, 10,
+ 10, 10, 10, 10, 10, 10, 10, 10,
+ 10, 15, 20, 25, 30, 30, 30, 35,
+ 40, 40, 40, 45, 50, 55, 60, 65,
+ 70, 70, 70, 75, 80, 85, 90, 95,
+ 100, 100, 100, 105, 110, 115, 120, 125,
+ 130, 135, 140, 145, 150, 155, 160, 160,
+ 160, 165, 170, 170, 170, 170, 170, 170,
+ 170, 170, 170, 170, 170, 170, 170, 170,
+ 170, 170, 170, 170, 170, 170, 170, 170,
+ 170, 170, 170, 170, 170, 170, 170, 170,
+ 170, 170, 170, 170, 170, 170, 170, 170,
+ 170, 170, 170, 170, 170, 170, 170, 170,
+ 170, 170, 170, 170, 170, 170, 170, 170,
+ 170, 170, 170, 170, 170, 170, 170, 170,
+ 170, 170, 170, 170, 170, 170, 170, 170,
+ 170,
+ },
+ { /* Fourth byte table 15. */
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 4, 4, 4, 4, 4, 4, 4, 4,
+ 4, 4, 4, 4, 4, 4, 4, 4,
+ 4, 4, 4, 4, 4, 4, 4, 4,
+ 4, 4, 4, 4, 4, 4, 4, 4,
+ 4, 4, 4, 4, 4, 4, 4, 4,
+ 4, 4, 4, 4, 4, 4, 4, 4,
+ 4, 4, 4, 4, 4, 4, 4, 4,
+ 4, 4, 4, 4, 4, 4, 4, 4,
+ 4, 4, 4, 4, 4, 4, 4, 4,
+ 4, 4, 4, 4, 4, 4, 4, 4,
+ 4, 4, 4, 4, 4, 4, 4, 4,
+ 4, 4, 4, 4, 4, 4, 4, 4,
+ 4, 4, 4, 4, 4, 4, 4, 4,
+ 4, 4, 4, 4, 4, 4, 4, 4,
+ 4, 4, 4, 4, 4, 4, 4, 4,
+ 4,
+ },
+ { /* Fourth byte table 16. */
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 5, 10, 15, 20, 25,
+ 25, 25, 25, 25, 25, 25, 25, 25,
+ 25, 25, 25, 25, 25, 25, 25, 25,
+ 25, 25, 25, 25, 25, 25, 25, 25,
+ 25, 25, 25, 25, 25, 25, 25, 25,
+ 25, 25, 25, 25, 25, 25, 25, 25,
+ 25, 25, 25, 25, 25, 25, 25, 25,
+ 25, 25, 25, 25, 25, 25, 25, 25,
+ 25, 25, 25, 25, 25, 25, 25, 25,
+ 25, 25, 25, 25, 25, 25, 25, 25,
+ 25, 25, 25, 25, 25, 25, 25, 25,
+ 25, 25, 25, 25, 25, 25, 25, 25,
+ 25,
+ },
+ { /* Fourth byte table 17. */
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 4, 8,
+ 12, 16, 16, 16, 16, 16, 16, 16,
+ 16, 16, 16, 16, 16, 16, 16, 16,
+ 16, 16, 16, 16, 16, 16, 16, 16,
+ 16, 16, 16, 16, 16, 16, 16, 16,
+ 16, 16, 16, 16, 16, 16, 16, 16,
+ 16, 16, 16, 16, 16, 16, 16, 16,
+ 16, 16, 16, 16, 16, 16, 16, 16,
+ 16, 16, 16, 16, 16, 16, 16, 16,
+ 16, 16, 16, 16, 16, 16, 16, 16,
+ 16,
+ },
+ { /* Fourth byte table 18. */
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 5, 5, 10, 10, 10, 10, 10,
+ 10, 10, 10, 10, 10, 10, 10, 10,
+ 10, 10, 10, 10, 15, 15, 15, 15,
+ 15, 15, 15, 15, 15, 15, 15, 15,
+ 15, 15, 15, 15, 15, 15, 15, 15,
+ 15, 15, 15, 15, 15, 15, 15, 15,
+ 15, 15, 15, 15, 15, 15, 15, 15,
+ 15, 15, 15, 15, 15, 15, 15, 15,
+ 15, 15, 15, 15, 15, 15, 15, 15,
+ 15, 15, 15, 15, 15, 15, 15, 15,
+ 15, 15, 15, 15, 15, 15, 15, 15,
+ 15, 15, 15, 15, 15, 15, 15, 15,
+ 15, 15, 15, 15, 15, 15, 15, 15,
+ 15, 15, 15, 15, 15, 15, 15, 15,
+ 15, 15, 15, 15, 15, 15, 15, 15,
+ 15, 15, 15, 15, 15, 15, 15, 15,
+ 15,
+ },
+ { /* Fourth byte table 19. */
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 7, 7, 7, 7, 7, 7,
+ 7, 7, 14, 14, 14, 21, 21, 21,
+ 21, 21, 21, 21, 21, 21, 21, 21,
+ 21, 21, 21, 21, 21, 21, 21, 21,
+ 21, 21, 21, 21, 21, 21, 21, 21,
+ 21, 21, 21, 21, 21, 21, 21, 21,
+ 21, 21, 21, 21, 21, 21, 21, 21,
+ 21, 21, 21, 21, 21, 21, 21, 21,
+ 21, 21, 21, 21, 21, 21, 21, 21,
+ 21, 21, 21, 21, 21, 21, 21, 21,
+ 21, 21, 21, 21, 21, 21, 21, 21,
+ 21,
+ },
+ { /* Fourth byte table 20. */
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 7, 14, 21, 28, 35, 42, 49,
+ 56, 56, 56, 56, 56, 56, 56, 56,
+ 56, 56, 56, 56, 56, 56, 56, 56,
+ 56, 56, 56, 56, 56, 56, 56, 56,
+ 56, 56, 56, 56, 56, 56, 56, 56,
+ 56, 56, 56, 56, 56, 56, 56, 56,
+ 56, 56, 56, 56, 56, 56, 56, 56,
+ 56, 56, 56, 56, 56, 56, 56, 56,
+ 56, 56, 56, 56, 56, 56, 56, 56,
+ 56, 56, 56, 56, 56, 56, 56, 56,
+ 56, 56, 56, 56, 56, 56, 56, 56,
+ 56, 56, 56, 56, 56, 56, 56, 56,
+ 56, 56, 56, 56, 56, 56, 56, 56,
+ 56,
+ },
+ { /* Fourth byte table 21. */
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 7, 14, 14, 14,
+ 14, 14, 14, 14, 14, 14, 14, 14,
+ 14, 14, 14, 14, 14, 21, 28, 28,
+ 35, 35, 35, 35, 35, 35, 35, 35,
+ 35, 35, 35, 35, 35, 35, 35, 35,
+ 35, 35, 35, 35, 35, 35, 35, 35,
+ 35, 35, 35, 35, 35, 35, 35, 35,
+ 35, 35, 35, 35, 35, 35, 35, 35,
+ 35, 35, 35, 35, 35, 35, 35, 35,
+ 35, 35, 35, 35, 35, 35, 35, 35,
+ 35, 35, 35, 35, 35, 35, 35, 35,
+ 35, 35, 35, 35, 35, 35, 35, 35,
+ 35, 35, 35, 35, 35, 35, 35, 35,
+ 35, 35, 35, 35, 35, 35, 35, 35,
+ 35, 35, 35, 35, 35, 35, 35, 35,
+ 35,
+ },
+ { /* Fourth byte table 22. */
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 7, 7, 7, 14,
+ 14, 14, 14, 14, 14, 14, 14, 14,
+ 14, 14, 14, 14, 14, 14, 14, 14,
+ 14, 14, 14, 14, 14, 14, 14, 14,
+ 14, 14, 14, 14, 14, 14, 14, 14,
+ 14, 14, 14, 14, 14, 14, 14, 14,
+ 14, 14, 14, 14, 14, 14, 14, 14,
+ 14, 14, 14, 14, 14, 14, 14, 14,
+ 14, 14, 14, 14, 14, 14, 14, 14,
+ 14, 14, 14, 14, 14, 14, 14, 14,
+ 14,
+ },
+ { /* Fourth byte table 23. */
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 7, 14, 21, 21, 21, 28,
+ 28, 28, 28, 28, 28, 28, 28, 28,
+ 28, 28, 28, 28, 28, 28, 28, 28,
+ 28, 28, 28, 28, 28, 28, 28, 28,
+ 28, 28, 28, 28, 28, 28, 28, 28,
+ 28, 28, 28, 28, 28, 28, 28, 28,
+ 28, 28, 28, 28, 28, 28, 28, 28,
+ 28, 28, 28, 28, 28, 28, 28, 28,
+ 28, 28, 28, 28, 28, 28, 28, 28,
+ 28, 28, 28, 28, 28, 28, 28, 28,
+ 28, 28, 28, 28, 28, 28, 28, 28,
+ 28, 28, 28, 28, 28, 28, 28, 28,
+ 28, 28, 28, 28, 28, 28, 28, 28,
+ 28,
+ },
+ { /* Fourth byte table 24. */
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 7, 7, 7, 14, 21, 21, 21,
+ 21, 21, 21, 21, 21, 21, 21, 21,
+ 21, 21, 21, 21, 21, 28, 35, 35,
+ 35, 35, 35, 35, 35, 35, 35, 35,
+ 35, 35, 35, 35, 35, 35, 35, 35,
+ 35, 35, 35, 35, 35, 35, 35, 35,
+ 35, 35, 35, 35, 35, 35, 35, 35,
+ 35, 35, 35, 35, 35, 35, 35, 35,
+ 35, 35, 35, 35, 35, 35, 35, 35,
+ 35, 35, 35, 35, 35, 35, 35, 35,
+ 35, 35, 35, 35, 35, 35, 35, 35,
+ 35, 35, 35, 35, 35, 35, 35, 35,
+ 35, 35, 35, 35, 35, 35, 35, 35,
+ 35, 35, 35, 35, 35, 35, 35, 35,
+ 35, 35, 35, 35, 35, 35, 35, 35,
+ 35,
+ },
+ { /* Fourth byte table 25. */
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 7, 7, 7,
+ 7, 7, 7, 7, 7, 7, 7, 7,
+ 7, 7, 7, 7, 7, 7, 7, 7,
+ 7, 7, 7, 7, 7, 7, 7, 7,
+ 7, 7, 7, 7, 7, 7, 7, 7,
+ 7, 7, 7, 7, 7, 7, 7, 7,
+ 7, 7, 7, 7, 7, 7, 7, 7,
+ 7, 7, 7, 7, 7, 7, 7, 7,
+ 7, 7, 7, 7, 7, 7, 7, 7,
+ 7, 7, 7, 7, 7, 7, 7, 7,
+ 7, 7, 7, 7, 7, 7, 7, 7,
+ 7, 7, 7, 7, 7, 7, 7, 7,
+ 7, 7, 7, 7, 7, 7, 7, 7,
+ 7, 7, 7, 7, 7, 7, 7, 7,
+ 7,
+ },
+ { /* Fourth byte table 26. */
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 7, 14, 21, 21, 21,
+ 21, 21, 21, 21, 21, 21, 21, 21,
+ 21, 21, 21, 21, 21, 21, 21, 21,
+ 21, 21, 21, 21, 21, 21, 21, 21,
+ 21, 21, 21, 21, 21, 21, 21, 21,
+ 21, 21, 21, 21, 21, 21, 21, 21,
+ 21, 21, 21, 21, 21, 21, 21, 21,
+ 21, 21, 21, 21, 21, 21, 21, 21,
+ 21, 21, 21, 21, 21, 21, 21, 21,
+ 21, 21, 21, 21, 21, 21, 21, 21,
+ 21, 21, 21, 21, 21, 21, 21, 21,
+ 21, 21, 21, 21, 21, 21, 21, 21,
+ 21, 21, 21, 21, 21, 21, 21, 21,
+ 21, 21, 21, 21, 21, 21, 21, 21,
+ 21, 21, 21, 21, 21, 21, 21, 21,
+ 21,
+ },
+ { /* Fourth byte table 27. */
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 7, 7, 7, 7, 7, 7, 7,
+ 7, 7, 7, 7, 7, 7, 7, 7,
+ 7, 7, 7, 7, 7, 7, 7, 7,
+ 7, 7, 7, 7, 7, 7, 7, 7,
+ 7, 7, 7, 7, 7, 7, 7, 7,
+ 7, 7, 7, 7, 7, 7, 7, 7,
+ 7, 7, 7, 7, 7, 7, 7, 7,
+ 7, 7, 7, 7, 7, 7, 7, 7,
+ 7, 7, 7, 7, 7, 7, 7, 7,
+ 7, 7, 7, 7, 7, 7, 7, 7,
+ 7, 7, 7, 7, 7, 7, 7, 7,
+ 7, 7, 7, 7, 7, 7, 7, 7,
+ 7, 7, 7, 7, 7, 7, 7, 7,
+ 7, 7, 7, 7, 7, 7, 7, 7,
+ 7, 7, 7, 7, 7, 7, 7, 7,
+ 7,
+ },
+ { /* Fourth byte table 28. */
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 7, 7, 7, 7, 7, 7, 7,
+ 14, 21, 21, 28, 38, 38, 38, 38,
+ 38, 38, 38, 38, 38, 38, 38, 38,
+ 38, 38, 38, 38, 38, 38, 38, 38,
+ 38, 38, 38, 38, 38, 38, 38, 38,
+ 38, 38, 38, 38, 38, 38, 38, 38,
+ 38, 38, 38, 38, 38, 38, 38, 38,
+ 38, 38, 38, 38, 38, 38, 38, 38,
+ 38, 38, 38, 38, 38, 38, 38, 38,
+ 38, 38, 38, 38, 38, 38, 38, 38,
+ 38, 38, 38, 38, 38, 38, 38, 38,
+ 38, 38, 38, 38, 38, 38, 38, 38,
+ 38, 38, 38, 38, 38, 38, 38, 38,
+ 38, 38, 38, 38, 38, 38, 38, 38,
+ 38, 38, 38, 38, 38, 38, 38, 38,
+ 38, 38, 38, 38, 38, 38, 38, 38,
+ 38,
+ },
+ { /* Fourth byte table 29. */
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 7, 14, 21, 21, 21,
+ 21, 21, 21, 21, 21, 21, 21, 21,
+ 21, 21, 21, 21, 21, 21, 21, 21,
+ 21, 21, 21, 21, 21, 21, 21, 21,
+ 21, 21, 21, 21, 21, 21, 21, 21,
+ 21, 21, 21, 21, 21, 21, 21, 21,
+ 21, 21, 21, 21, 21, 21, 21, 21,
+ 21, 21, 21, 21, 21, 21, 21, 21,
+ 21, 21, 21, 21, 21, 21, 21, 21,
+ 21, 21, 21, 21, 21, 21, 21, 21,
+ 21, 21, 21, 21, 21, 21, 21, 21,
+ 21, 21, 21, 21, 21, 21, 21, 21,
+ 21, 21, 21, 21, 21, 21, 21, 21,
+ 21, 21, 21, 21, 21, 21, 21, 21,
+ 21, 21, 21, 21, 21, 21, 21, 21,
+ 21,
+ },
+ { /* Fourth byte table 30. */
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 7, 7, 14, 24, 31,
+ 31, 31, 31, 31, 31, 31, 31, 31,
+ 31, 31, 31, 31, 31, 31, 31, 31,
+ 31, 31, 31, 31, 31, 31, 31, 31,
+ 31, 31, 31, 31, 31, 31, 31, 31,
+ 31, 31, 31, 31, 31, 31, 31, 31,
+ 31, 31, 31, 31, 31, 31, 31, 31,
+ 31, 31, 31, 31, 31, 31, 31, 31,
+ 31, 31, 31, 31, 31, 31, 31, 31,
+ 31, 31, 31, 31, 31, 31, 31, 31,
+ 31, 31, 31, 31, 31, 31, 31, 31,
+ 31, 31, 31, 31, 31, 31, 31, 31,
+ 31, 31, 31, 31, 31, 31, 31, 31,
+ 31,
+ },
+ { /* Fourth byte table 31. */
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 6, 6, 6, 6,
+ 6, 6, 6, 6, 6, 6, 6, 6,
+ 6, 6, 6, 6, 6, 6, 6, 6,
+ 6, 6, 6, 6, 6, 6, 6, 6,
+ 6, 6, 6, 6, 6, 6, 6, 6,
+ 6, 6, 6, 6, 6, 6, 6, 6,
+ 6, 6, 6, 6, 6, 6, 6, 6,
+ 6, 6, 6, 6, 6, 6, 6, 6,
+ 6, 6, 6, 6, 6, 6, 6, 6,
+ 6, 6, 6, 6, 6, 6, 6, 6,
+ 6,
+ },
+ { /* Fourth byte table 32. */
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 6, 6, 6, 6,
+ 6, 6, 6, 6, 6, 6, 6, 6,
+ 6, 6, 6, 6, 6, 6, 6, 6,
+ 6, 6, 6, 6, 6, 6, 6, 6,
+ 6, 6, 6, 6, 6, 6, 6, 6,
+ 6, 6, 6, 6, 6, 6, 6, 6,
+ 6, 6, 6, 6, 6, 6, 6, 6,
+ 6, 6, 6, 6, 6, 6, 6, 6,
+ 6, 6, 6, 6, 6, 6, 6, 6,
+ 6, 6, 6, 6, 6, 6, 6, 6,
+ 6,
+ },
+ { /* Fourth byte table 33. */
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 6, 12, 12,
+ 12, 12, 12, 12, 12, 12, 12, 12,
+ 12, 12, 12, 12, 12, 12, 12, 12,
+ 12, 12, 12, 12, 12, 12, 12, 12,
+ 12, 12, 12, 12, 12, 12, 12, 12,
+ 12, 12, 12, 12, 12, 12, 12, 12,
+ 12, 12, 12, 12, 12, 12, 12, 12,
+ 12, 12, 12, 12, 12, 12, 12, 12,
+ 12, 12, 12, 12, 12, 12, 12, 12,
+ 12, 12, 12, 12, 12, 12, 12, 12,
+ 12, 12, 12, 12, 12, 12, 12, 12,
+ 12, 12, 12, 12, 12, 12, 12, 12,
+ 12, 12, 12, 12, 12, 12, 12, 12,
+ 12,
+ },
+ { /* Fourth byte table 34. */
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 3, 3, 3,
+ 3, 3, 3, 3, 3, 3, 3, 3,
+ 3, 3, 3, 3, 3, 3, 3, 3,
+ 3, 3, 3, 3, 3, 3, 3, 3,
+ 3, 3, 3, 3, 3, 3, 3, 3,
+ 3, 3, 3, 3, 3, 3, 3, 3,
+ 3, 3, 3, 3, 3, 3, 3, 3,
+ 3, 3, 3, 3, 3, 3, 3, 3,
+ 3, 3, 3, 3, 3, 3, 3, 3,
+ 3, 3, 3, 3, 3, 3, 3, 3,
+ 3, 3, 3, 3, 3, 3, 3, 3,
+ 3, 3, 3, 3, 3, 3, 3, 3,
+ 3, 3, 3, 3, 3, 3, 3, 3,
+ 3, 3, 3, 3, 3, 3, 3, 3,
+ 3, 3, 3, 3, 3, 3, 3, 3,
+ 3,
+ },
+ { /* Fourth byte table 35. */
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 7, 7, 7, 7,
+ 7, 7, 7, 7, 7, 7, 14, 14,
+ 14, 14, 14, 21, 21, 21, 21, 21,
+ 28, 28, 28, 28, 28, 35, 35, 35,
+ 35, 35, 35, 35, 35, 35, 35, 35,
+ 35, 35, 42, 42, 42, 42, 42, 42,
+ 42, 42, 42, 42, 49, 49, 56, 63,
+ 72, 79, 88, 88, 88, 88, 88, 88,
+ 88, 88, 88, 88, 88, 88, 88, 88,
+ 88, 88, 88, 88, 88, 88, 88, 88,
+ 88, 88, 88, 88, 88, 88, 88, 88,
+ 88, 88, 88, 88, 88, 88, 88, 88,
+ 88, 88, 88, 88, 88, 88, 88, 88,
+ 88, 88, 88, 88, 88, 88, 88, 88,
+ 88, 88, 88, 88, 88, 88, 88, 88,
+ 88, 88, 88, 88, 88, 88, 88, 88,
+ 88,
+ },
+ { /* Fourth byte table 36. */
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 7, 7, 7, 7, 7, 7,
+ 7, 7, 7, 7, 7, 7, 7, 7,
+ 7, 7, 7, 7, 14, 14, 14, 14,
+ 14, 14, 14, 14, 14, 14, 21, 21,
+ 21, 21, 21, 28, 28, 28, 28, 28,
+ 35, 35, 35, 35, 35, 42, 42, 42,
+ 42, 42, 42, 42, 42, 42, 42, 42,
+ 42, 42, 49, 49, 49, 49, 49, 49,
+ 49, 49, 49, 49, 49, 49, 49, 49,
+ 49, 49, 49, 49, 49, 49, 49, 49,
+ 49, 49, 49, 49, 49, 49, 49, 49,
+ 49, 49, 49, 49, 49, 49, 49, 49,
+ 49, 49, 49, 49, 49, 49, 49, 49,
+ 49, 49, 49, 49, 49, 49, 49, 49,
+ 49, 49, 49, 49, 49, 49, 49, 49,
+ 49, 49, 49, 49, 49, 49, 49, 49,
+ 49,
+ },
+ { /* Fourth byte table 37. */
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 7,
+ 7, 7, 7, 7, 7, 7, 7, 7,
+ 7, 7, 7, 7, 7, 7, 7, 7,
+ 7, 7, 7, 7, 7, 7, 7, 7,
+ 7, 7, 7, 7, 7, 7, 7, 7,
+ 7, 7, 7, 7, 7, 7, 7, 7,
+ 7, 7, 7, 7, 7, 7, 7, 7,
+ 7, 7, 7, 7, 7, 7, 7, 7,
+ 7, 7, 7, 7, 7, 7, 7, 7,
+ 7, 7, 7, 7, 7, 7, 7, 7,
+ 7, 7, 7, 7, 7, 7, 7, 7,
+ 7, 7, 7, 7, 7, 7, 7, 7,
+ 7,
+ },
+ { /* Fourth byte table 38. */
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 6, 12, 13, 14, 15, 16, 17,
+ 18, 19, 20, 21, 21, 21, 21, 21,
+ 21, 21, 24, 24, 24, 24, 24, 24,
+ 27, 27, 27, 27, 27, 27, 27, 27,
+ 27, 27, 27, 27, 27, 28, 30, 33,
+ 33, 33, 33, 33, 33, 33, 33, 33,
+ 34, 34, 34, 34, 40, 49, 49, 55,
+ 64, 64, 64, 64, 64, 66, 66, 69,
+ 69, 69, 69, 69, 69, 69, 69, 69,
+ 69, 69, 69, 69, 69, 69, 69, 69,
+ 69, 69, 69, 69, 69, 69, 69, 69,
+ 69, 69, 69, 69, 69, 69, 69, 69,
+ 69, 69, 69, 69, 69, 69, 69, 69,
+ 69, 69, 69, 69, 69, 69, 69, 69,
+ 69, 69, 69, 69, 69, 69, 69, 69,
+ 69, 69, 69, 69, 69, 69, 69, 69,
+ 69,
+ },
+ { /* Fourth byte table 39. */
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 2, 4, 6, 6, 6, 6, 6, 6,
+ 6, 6, 6, 6, 6, 6, 6, 6,
+ 18, 18, 18, 18, 18, 18, 18, 18,
+ 19, 19, 19, 19, 19, 19, 19, 19,
+ 19, 19, 19, 19, 19, 19, 19, 19,
+ 19, 20, 21, 21, 21, 22, 23, 24,
+ 25, 26, 27, 28, 31, 32, 33, 34,
+ 35, 35, 35, 35, 35, 35, 35, 35,
+ 35, 35, 35, 35, 35, 35, 35, 35,
+ 35, 35, 35, 35, 35, 35, 35, 35,
+ 35, 35, 35, 35, 35, 35, 35, 35,
+ 35, 35, 35, 35, 35, 35, 35, 35,
+ 35, 35, 35, 35, 35, 35, 35, 35,
+ 35, 35, 35, 35, 35, 35, 35, 35,
+ 35, 35, 35, 35, 35, 35, 35, 35,
+ 35,
+ },
+ { /* Fourth byte table 40. */
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 1, 2, 3, 4, 5, 6, 7,
+ 8, 9, 10, 11, 14, 15, 16, 17,
+ 17, 17, 17, 17, 17, 17, 17, 17,
+ 17, 17, 17, 17, 17, 17, 17, 17,
+ 17, 17, 17, 17, 17, 17, 17, 17,
+ 17, 19, 19, 19, 19, 19, 19, 19,
+ 19, 19, 19, 19, 19, 19, 19, 19,
+ 19, 19, 19, 19, 19, 19, 19, 19,
+ 19, 19, 19, 19, 19, 19, 19, 19,
+ 19, 19, 19, 19, 19, 19, 19, 19,
+ 19, 19, 19, 19, 19, 19, 19, 19,
+ 19, 19, 19, 19, 19, 19, 19, 19,
+ 19, 19, 19, 19, 19, 19, 19, 19,
+ 19, 19, 19, 19, 19, 19, 19, 19,
+ 19, 19, 19, 19, 19, 19, 19, 19,
+ 19, 19, 19, 19, 19, 19, 19, 19,
+ 19,
+ },
+ { /* Fourth byte table 41. */
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 3, 6, 7, 10, 10, 13, 16,
+ 18, 18, 21, 22, 23, 24, 25, 26,
+ 28, 29, 30, 31, 32, 32, 33, 35,
+ 35, 35, 36, 37, 38, 39, 40, 40,
+ 40, 42, 45, 47, 47, 48, 48, 51,
+ 51, 52, 52, 54, 58, 59, 60, 60,
+ 61, 62, 63, 63, 64, 65, 67, 69,
+ 71, 73, 74, 74, 74, 74, 76, 78,
+ 80, 80, 80, 80, 80, 80, 80, 80,
+ 80, 80, 80, 80, 80, 80, 80, 80,
+ 80, 80, 80, 80, 80, 80, 80, 80,
+ 80, 80, 80, 80, 80, 80, 80, 80,
+ 80, 80, 80, 80, 80, 80, 80, 80,
+ 80, 80, 80, 80, 80, 80, 80, 80,
+ 80, 80, 80, 80, 80, 80, 80, 80,
+ 80, 80, 80, 80, 80, 80, 80, 80,
+ 80,
+ },
+ { /* Fourth byte table 42. */
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 3, 3, 3, 3, 3, 4, 5,
+ 6, 7, 8, 8, 8, 8, 8, 8,
+ 8, 8, 8, 8, 13, 18, 23, 28,
+ 33, 38, 43, 48, 53, 58, 63, 68,
+ 72, 73, 75, 78, 80, 81, 83, 86,
+ 90, 92, 93, 95, 98, 99, 100, 101,
+ 102, 103, 105, 108, 110, 111, 113, 116,
+ 120, 122, 123, 125, 128, 129, 130, 131,
+ 132, 132, 132, 132, 132, 132, 132, 132,
+ 132, 132, 132, 132, 132, 132, 132, 132,
+ 132, 132, 132, 132, 132, 132, 132, 132,
+ 132, 132, 132, 132, 132, 132, 132, 132,
+ 132, 132, 132, 132, 132, 132, 132, 132,
+ 132, 132, 132, 132, 132, 132, 132, 132,
+ 132, 132, 132, 132, 132, 132, 132, 132,
+ 132, 132, 132, 132, 132, 132, 132, 132,
+ 132,
+ },
+ { /* Fourth byte table 43. */
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 6, 12, 12, 12, 12,
+ 12, 12, 12, 12, 12, 12, 12, 12,
+ 12, 12, 12, 12, 12, 12, 12, 18,
+ 18, 18, 18, 18, 18, 18, 18, 18,
+ 18, 18, 18, 18, 18, 18, 18, 18,
+ 18, 18, 18, 18, 18, 18, 18, 18,
+ 18, 18, 18, 18, 18, 18, 18, 18,
+ 18, 18, 18, 18, 18, 18, 18, 18,
+ 18, 18, 18, 18, 18, 18, 18, 18,
+ 18, 18, 18, 18, 18, 18, 18, 18,
+ 18, 18, 18, 18, 18, 18, 18, 18,
+ 18, 18, 18, 18, 18, 18, 18, 18,
+ 18, 18, 18, 18, 18, 18, 18, 18,
+ 18,
+ },
+ { /* Fourth byte table 44. */
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 6, 12,
+ 18, 18, 18, 18, 18, 18, 18, 18,
+ 18, 18, 18, 18, 18, 18, 18, 18,
+ 18, 18, 18, 18, 18, 18, 18, 18,
+ 18, 18, 18, 18, 18, 18, 18, 18,
+ 18, 18, 18, 18, 18, 18, 18, 18,
+ 18, 18, 18, 18, 18, 18, 18, 18,
+ 18, 18, 18, 18, 18, 18, 18, 18,
+ 18, 18, 18, 18, 18, 18, 18, 18,
+ 18, 18, 18, 18, 18, 18, 18, 18,
+ 18, 18, 18, 18, 18, 18, 18, 18,
+ 18, 18, 18, 18, 18, 18, 18, 18,
+ 18, 18, 18, 18, 18, 18, 18, 18,
+ 18, 18, 18, 18, 18, 18, 18, 18,
+ 18, 18, 18, 18, 18, 18, 18, 18,
+ 18,
+ },
+ { /* Fourth byte table 45. */
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 6, 6, 6,
+ 6, 6, 12, 12, 12, 18, 18, 18,
+ 18, 18, 18, 18, 18, 18, 18, 18,
+ 18, 18, 18, 18, 18, 18, 18, 18,
+ 18, 18, 18, 18, 18, 24, 24, 30,
+ 30, 30, 30, 30, 30, 36, 45, 45,
+ 51, 60, 60, 60, 60, 60, 60, 60,
+ 60, 60, 60, 60, 60, 60, 60, 60,
+ 60, 60, 60, 60, 60, 60, 60, 60,
+ 60, 60, 60, 60, 60, 60, 60, 60,
+ 60, 60, 60, 60, 60, 60, 60, 60,
+ 60, 60, 60, 60, 60, 60, 60, 60,
+ 60, 60, 60, 60, 60, 60, 60, 60,
+ 60, 60, 60, 60, 60, 60, 60, 60,
+ 60, 60, 60, 60, 60, 60, 60, 60,
+ 60, 60, 60, 60, 60, 60, 60, 60,
+ 60,
+ },
+ { /* Fourth byte table 46. */
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 6, 6, 6, 12, 12, 12,
+ 18, 18, 24, 24, 24, 24, 24, 24,
+ 24, 24, 24, 24, 24, 24, 24, 24,
+ 24, 24, 24, 24, 24, 24, 24, 24,
+ 24, 28, 28, 34, 34, 34, 34, 34,
+ 34, 34, 34, 34, 34, 34, 40, 44,
+ 48, 54, 60, 60, 60, 66, 72, 72,
+ 72, 78, 84, 84, 84, 84, 84, 84,
+ 84, 84, 84, 84, 84, 84, 84, 84,
+ 84, 84, 84, 84, 84, 84, 84, 84,
+ 84, 84, 84, 84, 84, 84, 84, 84,
+ 84, 84, 84, 84, 84, 84, 84, 84,
+ 84, 84, 84, 84, 84, 84, 84, 84,
+ 84, 84, 84, 84, 84, 84, 84, 84,
+ 84, 84, 84, 84, 84, 84, 84, 84,
+ 84, 84, 84, 84, 84, 84, 84, 84,
+ 84,
+ },
+ { /* Fourth byte table 47. */
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 6, 12, 12, 12, 18, 24, 24,
+ 24, 30, 36, 36, 36, 36, 36, 36,
+ 36, 36, 36, 36, 36, 36, 36, 36,
+ 36, 36, 36, 36, 36, 36, 36, 36,
+ 36, 36, 36, 36, 36, 36, 36, 36,
+ 36, 36, 36, 36, 36, 42, 48, 54,
+ 60, 60, 60, 60, 60, 60, 60, 60,
+ 60, 60, 60, 60, 60, 60, 60, 60,
+ 60, 60, 60, 60, 60, 60, 60, 60,
+ 60, 60, 60, 60, 60, 60, 60, 60,
+ 60, 60, 60, 60, 60, 60, 60, 60,
+ 60, 60, 60, 60, 60, 60, 60, 60,
+ 60, 60, 60, 60, 60, 60, 60, 60,
+ 60, 60, 60, 60, 60, 60, 60, 60,
+ 60, 60, 60, 60, 60, 60, 60, 60,
+ 60, 60, 60, 60, 60, 60, 60, 60,
+ 60,
+ },
+ { /* Fourth byte table 48. */
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 6, 12, 18, 24, 24, 24, 24,
+ 24, 24, 24, 30, 36, 42, 48, 48,
+ 48, 48, 48, 48, 48, 48, 48, 48,
+ 48, 48, 48, 48, 48, 48, 48, 48,
+ 48, 48, 48, 48, 48, 48, 48, 48,
+ 48, 48, 48, 48, 48, 48, 48, 48,
+ 48, 48, 48, 48, 48, 48, 48, 48,
+ 48, 48, 48, 48, 48, 48, 48, 48,
+ 48, 48, 48, 48, 48, 48, 48, 48,
+ 48, 48, 48, 48, 48, 48, 48, 48,
+ 48, 48, 48, 48, 48, 48, 48, 48,
+ 48, 48, 48, 48, 48, 48, 48, 48,
+ 48,
+ },
+ { /* Fourth byte table 49. */
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 4, 8, 8, 8, 8, 8,
+ 8, 8, 8, 8, 8, 8, 8, 8,
+ 8, 8, 8, 8, 8, 8, 8, 8,
+ 8, 8, 8, 8, 8, 8, 8, 8,
+ 8, 8, 8, 8, 8, 8, 8, 8,
+ 8, 8, 8, 8, 8, 8, 8, 8,
+ 8, 8, 8, 8, 8, 8, 8, 8,
+ 8, 8, 8, 8, 8, 8, 8, 8,
+ 8, 8, 8, 8, 8, 8, 8, 8,
+ 8, 8, 8, 8, 8, 8, 8, 8,
+ 8, 8, 8, 8, 8, 8, 8, 8,
+ 8,
+ },
+ { /* Fourth byte table 50. */
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 1, 2, 3, 4, 5, 6, 7,
+ 8, 9, 11, 13, 15, 17, 19, 21,
+ 23, 25, 27, 29, 31, 34, 37, 40,
+ 43, 46, 49, 52, 55, 58, 62, 66,
+ 70, 70, 70, 70, 70, 70, 70, 70,
+ 70, 70, 70, 70, 70, 70, 70, 70,
+ 70, 70, 70, 70, 70, 70, 70, 70,
+ 70, 70, 70, 70, 70, 70, 70, 70,
+ 70, 70, 70, 70, 70, 70, 70, 70,
+ 70, 70, 70, 70, 70, 70, 70, 70,
+ 70, 70, 70, 70, 70, 70, 70, 70,
+ 70, 70, 70, 70, 70, 70, 70, 70,
+ 70,
+ },
+ { /* Fourth byte table 51. */
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 4, 8, 12, 16, 20, 24, 28,
+ 32, 34, 36, 38, 40, 42, 44, 46,
+ 48, 50, 53, 56, 59, 62, 65, 68,
+ 71, 74, 77, 80, 83, 86, 89, 92,
+ 95, 98, 101, 104, 107, 110, 113, 116,
+ 119, 122, 125, 128, 131, 134, 137, 140,
+ 143, 146, 149, 152, 155, 158, 161, 162,
+ 163, 164, 165, 166, 167, 168, 169, 170,
+ 171, 171, 171, 171, 171, 171, 171, 171,
+ 171, 171, 171, 171, 171, 171, 171, 171,
+ 171, 171, 171, 171, 171, 171, 171, 171,
+ 171, 171, 171, 171, 171, 171, 171, 171,
+ 171, 171, 171, 171, 171, 171, 171, 171,
+ 171, 171, 171, 171, 171, 171, 171, 171,
+ 171, 171, 171, 171, 171, 171, 171, 171,
+ 171, 171, 171, 171, 171, 171, 171, 171,
+ 171,
+ },
+ { /* Fourth byte table 52. */
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 1, 2, 3, 4, 5, 6, 7,
+ 8, 9, 10, 11, 12, 13, 14, 15,
+ 16, 17, 18, 19, 20, 21, 22, 23,
+ 24, 25, 26, 27, 28, 29, 30, 31,
+ 32, 33, 34, 35, 36, 37, 38, 39,
+ 40, 41, 42, 43, 43, 43, 43, 43,
+ 43, 43, 43, 43, 43, 43, 43, 43,
+ 43, 43, 43, 43, 43, 43, 43, 43,
+ 43, 43, 43, 43, 43, 43, 43, 43,
+ 43, 43, 43, 43, 43, 43, 43, 43,
+ 43, 43, 43, 43, 43, 43, 43, 43,
+ 43, 43, 43, 43, 43, 43, 43, 43,
+ 43, 43, 43, 43, 43, 43, 43, 43,
+ 43, 43, 43, 43, 43, 43, 43, 43,
+ 43, 43, 43, 43, 43, 43, 43, 43,
+ 43, 43, 43, 43, 43, 43, 43, 43,
+ 43,
+ },
+ { /* Fourth byte table 53. */
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 12, 12, 12,
+ 12, 12, 12, 12, 12, 12, 12, 12,
+ 12, 12, 12, 12, 12, 12, 12, 12,
+ 12, 12, 12, 12, 12, 12, 12, 12,
+ 12, 12, 12, 12, 12, 12, 12, 12,
+ 12, 12, 12, 12, 12, 12, 12, 12,
+ 12, 12, 12, 12, 12, 12, 12, 12,
+ 12, 12, 12, 12, 12, 12, 12, 12,
+ 12, 12, 12, 12, 12, 12, 12, 12,
+ 12, 12, 12, 12, 12, 12, 12, 12,
+ 12, 12, 12, 12, 12, 12, 12, 12,
+ 12, 12, 12, 12, 12, 12, 12, 12,
+ 12, 12, 12, 12, 12, 12, 12, 12,
+ 12, 12, 12, 12, 12, 12, 12, 12,
+ 12, 12, 12, 12, 12, 12, 12, 12,
+ 12,
+ },
+ { /* Fourth byte table 54. */
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 3, 5, 8,
+ 8, 8, 8, 8, 8, 8, 8, 8,
+ 8, 8, 8, 8, 8, 8, 8, 8,
+ 8, 8, 8, 8, 8, 8, 8, 8,
+ 8, 8, 8, 8, 8, 8, 8, 8,
+ 8, 8, 8, 8, 8, 8, 8, 8,
+ 8, 8, 8, 8, 8, 8, 8, 8,
+ 8, 8, 8, 8, 8, 8, 8, 8,
+ 8, 8, 8, 8, 8, 8, 8, 8,
+ 8, 8, 8, 8, 8, 8, 8, 8,
+ 8,
+ },
+ { /* Fourth byte table 55. */
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 6, 6, 6,
+ 6, 6, 6, 6, 6, 6, 6, 6,
+ 6, 6, 6, 6, 6, 6, 6, 6,
+ 6, 6, 6, 6, 6, 6, 6, 6,
+ 6, 6, 6, 6, 6, 6, 6, 6,
+ 6, 6, 6, 6, 6, 6, 6, 6,
+ 6, 6, 6, 6, 6, 6, 6, 6,
+ 6, 6, 6, 6, 6, 6, 6, 6,
+ 6, 6, 6, 6, 6, 6, 6, 6,
+ 6, 6, 6, 6, 6, 6, 6, 6,
+ 6, 6, 6, 6, 6, 6, 6, 6,
+ 6, 6, 6, 6, 6, 6, 6, 6,
+ 6, 6, 6, 6, 6, 6, 6, 6,
+ 6,
+ },
+ { /* Fourth byte table 56. */
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 3, 3, 3, 3, 3, 3, 3, 3,
+ 3, 3, 3, 3, 3, 3, 3, 3,
+ 3, 3, 3, 3, 3, 3, 3, 3,
+ 3, 3, 3, 3, 3, 3, 3, 3,
+ 3, 3, 3, 3, 3, 3, 3, 3,
+ 3, 3, 3, 3, 3, 3, 3, 3,
+ 3, 3, 3, 3, 3, 3, 3, 3,
+ 3, 3, 3, 3, 3, 3, 3, 3,
+ 3, 3, 3, 3, 3, 3, 3, 3,
+ 3, 3, 3, 3, 3, 3, 3, 3,
+ 3, 3, 3, 3, 3, 3, 3, 3,
+ 3, 3, 3, 3, 3, 3, 3, 3,
+ 3,
+ },
+ { /* Fourth byte table 57. */
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 3, 3, 3, 3,
+ 3, 3, 3, 3, 3, 3, 3, 3,
+ 3, 3, 3, 3, 3, 3, 3, 3,
+ 3, 3, 3, 3, 3, 3, 3, 3,
+ 3, 3, 3, 3, 3, 3, 3, 3,
+ 3, 3, 3, 3, 3, 3, 3, 3,
+ 3, 3, 3, 3, 3, 3, 3, 3,
+ 3, 3, 3, 3, 3, 3, 3, 3,
+ 3, 3, 3, 3, 3, 3, 3, 3,
+ 3, 3, 3, 3, 3, 3, 3, 3,
+ 3,
+ },
+ { /* Fourth byte table 58. */
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 3, 6, 9, 12, 15, 18, 21,
+ 24, 27, 30, 33, 36, 39, 42, 45,
+ 48, 51, 54, 57, 60, 63, 66, 69,
+ 72, 75, 78, 81, 84, 87, 90, 93,
+ 96, 99, 102, 105, 108, 111, 114, 117,
+ 120, 123, 126, 129, 132, 135, 138, 141,
+ 144, 147, 150, 153, 156, 159, 162, 165,
+ 168, 171, 174, 177, 180, 183, 186, 189,
+ 192, 192, 192, 192, 192, 192, 192, 192,
+ 192, 192, 192, 192, 192, 192, 192, 192,
+ 192, 192, 192, 192, 192, 192, 192, 192,
+ 192, 192, 192, 192, 192, 192, 192, 192,
+ 192, 192, 192, 192, 192, 192, 192, 192,
+ 192, 192, 192, 192, 192, 192, 192, 192,
+ 192, 192, 192, 192, 192, 192, 192, 192,
+ 192, 192, 192, 192, 192, 192, 192, 192,
+ 192,
+ },
+ { /* Fourth byte table 59. */
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 3, 6, 9, 12, 15, 18, 21,
+ 24, 27, 30, 33, 36, 39, 42, 45,
+ 48, 51, 54, 57, 60, 63, 66, 69,
+ 72, 75, 78, 81, 84, 87, 90, 93,
+ 96, 99, 102, 105, 108, 111, 114, 117,
+ 120, 123, 126, 129, 132, 135, 138, 141,
+ 144, 147, 150, 153, 156, 159, 162, 165,
+ 168, 171, 174, 177, 180, 183, 186, 189,
+ 192, 192, 192, 192, 192, 192, 192, 192,
+ 192, 192, 192, 192, 192, 192, 192, 192,
+ 192, 192, 192, 192, 192, 192, 192, 192,
+ 192, 192, 192, 192, 192, 192, 192, 192,
+ 192, 192, 192, 192, 192, 192, 192, 192,
+ 192, 192, 192, 192, 192, 192, 192, 192,
+ 192, 192, 192, 192, 192, 192, 192, 192,
+ 192, 192, 192, 192, 192, 192, 192, 192,
+ 192,
+ },
+ { /* Fourth byte table 60. */
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 3, 6, 9, 12, 15, 18, 21,
+ 24, 27, 30, 33, 36, 39, 42, 45,
+ 48, 51, 54, 57, 60, 63, 66, 69,
+ 72, 75, 78, 81, 84, 87, 90, 93,
+ 96, 99, 102, 105, 108, 111, 114, 117,
+ 120, 123, 126, 129, 132, 135, 138, 141,
+ 144, 147, 150, 153, 156, 159, 162, 165,
+ 168, 171, 174, 177, 180, 183, 186, 189,
+ 192, 192, 192, 192, 192, 192, 192, 192,
+ 192, 192, 192, 192, 192, 192, 192, 192,
+ 192, 192, 192, 192, 192, 192, 192, 192,
+ 192, 192, 192, 192, 192, 192, 192, 192,
+ 192, 192, 192, 192, 192, 192, 192, 192,
+ 192, 192, 192, 192, 192, 192, 192, 192,
+ 192, 192, 192, 192, 192, 192, 192, 192,
+ 192, 192, 192, 192, 192, 192, 192, 192,
+ 192,
+ },
+ { /* Fourth byte table 61. */
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 3, 6, 9, 12, 15, 18, 21,
+ 24, 27, 30, 33, 36, 39, 42, 45,
+ 48, 51, 54, 57, 60, 63, 66, 66,
+ 66, 66, 66, 66, 66, 66, 66, 66,
+ 66, 66, 66, 66, 66, 66, 66, 66,
+ 66, 66, 66, 66, 66, 66, 66, 66,
+ 66, 66, 66, 66, 66, 66, 66, 66,
+ 66, 66, 66, 66, 66, 66, 66, 66,
+ 66, 66, 66, 66, 66, 66, 66, 66,
+ 66, 66, 66, 66, 66, 66, 66, 66,
+ 66, 66, 66, 66, 66, 66, 66, 66,
+ 66, 66, 66, 66, 66, 66, 66, 66,
+ 66, 66, 66, 66, 66, 66, 66, 66,
+ 66, 66, 66, 66, 66, 66, 66, 66,
+ 66, 66, 66, 66, 66, 66, 66, 66,
+ 66, 66, 66, 66, 66, 66, 66, 66,
+ 66,
+ },
+ { /* Fourth byte table 62. */
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 1, 1, 1, 1, 1, 1, 1,
+ 1, 1, 1, 1, 1, 1, 1, 1,
+ 1, 1, 1, 1, 1, 1, 1, 1,
+ 1, 1, 1, 1, 1, 1, 1, 1,
+ 1, 1, 1, 1, 1, 1, 1, 1,
+ 1, 1, 1, 1, 1, 1, 1, 1,
+ 1, 1, 1, 1, 1, 1, 1, 4,
+ 4, 7, 10, 13, 13, 13, 13, 13,
+ 13, 13, 13, 13, 13, 13, 13, 13,
+ 13, 13, 13, 13, 13, 13, 13, 13,
+ 13, 13, 13, 13, 13, 13, 13, 13,
+ 13, 13, 13, 13, 13, 13, 13, 13,
+ 13, 13, 13, 13, 13, 13, 13, 13,
+ 13, 13, 13, 13, 13, 13, 13, 13,
+ 13, 13, 13, 13, 13, 13, 13, 13,
+ 13, 13, 13, 13, 13, 13, 13, 13,
+ 13,
+ },
+ { /* Fourth byte table 63. */
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 7, 7, 14,
+ 14, 21, 21, 28, 28, 35, 35, 42,
+ 42, 49, 49, 56, 56, 63, 63, 70,
+ 70, 77, 77, 84, 84, 84, 91, 91,
+ 98, 98, 105, 105, 105, 105, 105, 105,
+ 105, 112, 119, 119, 126, 133, 133, 140,
+ 147, 147, 154, 161, 161, 168, 175, 175,
+ 175, 175, 175, 175, 175, 175, 175, 175,
+ 175, 175, 175, 175, 175, 175, 175, 175,
+ 175, 175, 175, 175, 175, 175, 175, 175,
+ 175, 175, 175, 175, 175, 175, 175, 175,
+ 175, 175, 175, 175, 175, 175, 175, 175,
+ 175, 175, 175, 175, 175, 175, 175, 175,
+ 175, 175, 175, 175, 175, 175, 175, 175,
+ 175, 175, 175, 175, 175, 175, 175, 175,
+ 175,
+ },
+ { /* Fourth byte table 64. */
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 7, 7, 7,
+ 7, 7, 7, 7, 11, 15, 15, 22,
+ 28, 28, 28, 28, 28, 28, 28, 28,
+ 28, 28, 28, 28, 28, 35, 35, 42,
+ 42, 49, 49, 56, 56, 63, 63, 70,
+ 70, 77, 77, 84, 84, 91, 91, 98,
+ 98, 98, 98, 98, 98, 98, 98, 98,
+ 98, 98, 98, 98, 98, 98, 98, 98,
+ 98, 98, 98, 98, 98, 98, 98, 98,
+ 98, 98, 98, 98, 98, 98, 98, 98,
+ 98, 98, 98, 98, 98, 98, 98, 98,
+ 98, 98, 98, 98, 98, 98, 98, 98,
+ 98, 98, 98, 98, 98, 98, 98, 98,
+ 98, 98, 98, 98, 98, 98, 98, 98,
+ 98,
+ },
+ { /* Fourth byte table 65. */
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 7, 7, 14, 14, 14, 21, 21,
+ 28, 28, 35, 35, 35, 35, 35, 35,
+ 35, 42, 49, 49, 56, 63, 63, 70,
+ 77, 77, 84, 91, 91, 98, 105, 105,
+ 105, 105, 105, 105, 105, 105, 105, 105,
+ 105, 105, 105, 105, 105, 105, 105, 105,
+ 105, 105, 105, 105, 105, 112, 112, 112,
+ 119, 126, 133, 140, 140, 140, 140, 147,
+ 153, 153, 153, 153, 153, 153, 153, 153,
+ 153, 153, 153, 153, 153, 153, 153, 153,
+ 153, 153, 153, 153, 153, 153, 153, 153,
+ 153, 153, 153, 153, 153, 153, 153, 153,
+ 153, 153, 153, 153, 153, 153, 153, 153,
+ 153, 153, 153, 153, 153, 153, 153, 153,
+ 153, 153, 153, 153, 153, 153, 153, 153,
+ 153, 153, 153, 153, 153, 153, 153, 153,
+ 153,
+ },
+ { /* Fourth byte table 66. */
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 3, 6, 9, 12, 15, 18,
+ 21, 24, 27, 30, 33, 36, 39, 42,
+ 45, 45, 45, 45, 45, 45, 45, 45,
+ 45, 45, 45, 45, 45, 45, 45, 45,
+ 45, 45, 45, 45, 45, 45, 45, 45,
+ 45, 45, 45, 45, 45, 45, 45, 45,
+ 45, 45, 45, 45, 45, 45, 45, 45,
+ 45, 45, 45, 45, 45, 45, 45, 45,
+ 45, 45, 45, 45, 45, 45, 45, 45,
+ 45, 45, 45, 45, 45, 45, 45, 45,
+ 45,
+ },
+ { /* Fourth byte table 67. */
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 3, 6, 9, 12, 15, 18, 21,
+ 24, 27, 30, 33, 36, 39, 42, 45,
+ 48, 51, 54, 57, 60, 63, 66, 69,
+ 72, 75, 78, 81, 84, 87, 90, 93,
+ 96, 99, 102, 105, 108, 111, 114, 117,
+ 120, 123, 126, 129, 132, 135, 138, 141,
+ 144, 147, 150, 153, 156, 159, 162, 165,
+ 168, 171, 174, 177, 180, 183, 186, 189,
+ 192, 192, 192, 192, 192, 192, 192, 192,
+ 192, 192, 192, 192, 192, 192, 192, 192,
+ 192, 192, 192, 192, 192, 192, 192, 192,
+ 192, 192, 192, 192, 192, 192, 192, 192,
+ 192, 192, 192, 192, 192, 192, 192, 192,
+ 192, 192, 192, 192, 192, 192, 192, 192,
+ 192, 192, 192, 192, 192, 192, 192, 192,
+ 192, 192, 192, 192, 192, 192, 192, 192,
+ 192,
+ },
+ { /* Fourth byte table 68. */
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 3, 6, 9, 12, 15, 18, 21,
+ 24, 27, 30, 33, 36, 39, 42, 45,
+ 45, 45, 45, 48, 51, 54, 57, 60,
+ 63, 66, 69, 72, 75, 78, 81, 84,
+ 87, 87, 87, 87, 87, 87, 87, 87,
+ 87, 87, 87, 87, 87, 87, 87, 87,
+ 87, 87, 87, 87, 87, 87, 87, 87,
+ 87, 87, 87, 87, 87, 87, 87, 87,
+ 87, 87, 87, 87, 87, 87, 87, 87,
+ 87, 87, 87, 87, 87, 87, 87, 87,
+ 87, 87, 87, 87, 87, 87, 87, 87,
+ 87, 87, 87, 87, 87, 87, 87, 87,
+ 87, 87, 87, 87, 87, 87, 87, 87,
+ 87, 87, 87, 87, 87, 87, 87, 87,
+ 87, 87, 87, 87, 87, 87, 87, 87,
+ 87, 87, 87, 87, 87, 87, 87, 87,
+ 87,
+ },
+ { /* Fourth byte table 69. */
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 5, 10, 15, 20, 20, 20, 20,
+ 20, 20, 20, 20, 20, 20, 20, 20,
+ 20, 20, 22, 24, 26, 28, 30, 32,
+ 34, 36, 38, 40, 42, 44, 46, 48,
+ 50, 53, 56, 59, 62, 65, 68, 71,
+ 74, 77, 80, 83, 86, 89, 92, 98,
+ 104, 110, 116, 122, 128, 134, 140, 146,
+ 152, 158, 164, 170, 176, 176, 176, 176,
+ 176, 176, 176, 176, 176, 176, 176, 176,
+ 176, 176, 176, 176, 176, 176, 176, 176,
+ 176, 176, 176, 176, 176, 176, 176, 176,
+ 176, 176, 176, 176, 176, 176, 176, 176,
+ 176, 176, 176, 176, 176, 176, 176, 176,
+ 176, 176, 176, 176, 176, 176, 176, 176,
+ 176, 176, 176, 176, 176, 176, 176, 176,
+ 176, 176, 176, 176, 176, 176, 176, 176,
+ 176,
+ },
+ { /* Fourth byte table 70. */
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 3, 6, 9, 12, 15, 18, 21,
+ 24, 27, 30, 33, 36, 39, 42, 45,
+ 48, 51, 54, 57, 60, 63, 66, 69,
+ 72, 75, 78, 81, 84, 87, 90, 93,
+ 96, 99, 102, 105, 108, 111, 114, 117,
+ 120, 123, 126, 129, 132, 135, 138, 141,
+ 144, 147, 149, 151, 153, 155, 157, 159,
+ 161, 163, 165, 167, 169, 171, 173, 175,
+ 177, 177, 177, 177, 177, 177, 177, 177,
+ 177, 177, 177, 177, 177, 177, 177, 177,
+ 177, 177, 177, 177, 177, 177, 177, 177,
+ 177, 177, 177, 177, 177, 177, 177, 177,
+ 177, 177, 177, 177, 177, 177, 177, 177,
+ 177, 177, 177, 177, 177, 177, 177, 177,
+ 177, 177, 177, 177, 177, 177, 177, 177,
+ 177, 177, 177, 177, 177, 177, 177, 177,
+ 177,
+ },
+ { /* Fourth byte table 71. */
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 4, 8, 12, 16, 20, 24, 28,
+ 32, 36, 41, 46, 51, 51, 51, 51,
+ 51, 54, 57, 60, 63, 66, 69, 72,
+ 75, 78, 81, 84, 87, 90, 93, 96,
+ 99, 102, 105, 108, 111, 114, 117, 120,
+ 123, 126, 129, 132, 135, 138, 141, 144,
+ 147, 150, 153, 156, 159, 162, 165, 168,
+ 171, 174, 177, 180, 183, 186, 189, 192,
+ 192, 192, 192, 192, 192, 192, 192, 192,
+ 192, 192, 192, 192, 192, 192, 192, 192,
+ 192, 192, 192, 192, 192, 192, 192, 192,
+ 192, 192, 192, 192, 192, 192, 192, 192,
+ 192, 192, 192, 192, 192, 192, 192, 192,
+ 192, 192, 192, 192, 192, 192, 192, 192,
+ 192, 192, 192, 192, 192, 192, 192, 192,
+ 192, 192, 192, 192, 192, 192, 192, 192,
+ 192,
+ },
+ { /* Fourth byte table 72. */
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 2, 4, 7, 9, 11, 13, 15,
+ 17, 20, 24, 26, 28, 31, 34, 36,
+ 38, 40, 43, 46, 49, 52, 55, 57,
+ 59, 61, 63, 65, 68, 70, 72, 74,
+ 77, 80, 82, 85, 88, 91, 93, 96,
+ 101, 107, 109, 112, 115, 118, 121, 128,
+ 136, 138, 140, 143, 145, 147, 149, 152,
+ 154, 156, 158, 160, 162, 165, 167, 169,
+ 171, 171, 171, 171, 171, 171, 171, 171,
+ 171, 171, 171, 171, 171, 171, 171, 171,
+ 171, 171, 171, 171, 171, 171, 171, 171,
+ 171, 171, 171, 171, 171, 171, 171, 171,
+ 171, 171, 171, 171, 171, 171, 171, 171,
+ 171, 171, 171, 171, 171, 171, 171, 171,
+ 171, 171, 171, 171, 171, 171, 171, 171,
+ 171, 171, 171, 171, 171, 171, 171, 171,
+ 171,
+ },
+ { /* Fourth byte table 73. */
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 3, 6, 10, 12, 14, 16, 22,
+ 25, 27, 29, 31, 33, 35, 37, 39,
+ 41, 43, 45, 48, 50, 52, 55, 58,
+ 60, 64, 67, 69, 71, 73, 75, 75,
+ 75, 79, 83, 87, 91, 95, 99, 103,
+ 107, 111, 116, 121, 126, 131, 136, 141,
+ 146, 151, 156, 161, 166, 171, 176, 181,
+ 186, 191, 196, 201, 206, 211, 216, 221,
+ 221, 221, 221, 221, 221, 221, 221, 221,
+ 221, 221, 221, 221, 221, 221, 221, 221,
+ 221, 221, 221, 221, 221, 221, 221, 221,
+ 221, 221, 221, 221, 221, 221, 221, 221,
+ 221, 221, 221, 221, 221, 221, 221, 221,
+ 221, 221, 221, 221, 221, 221, 221, 221,
+ 221, 221, 221, 221, 221, 221, 221, 221,
+ 221, 221, 221, 221, 221, 221, 221, 221,
+ 221,
+ },
+ { /* Fourth byte table 74. */
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 4, 8, 12, 16, 20, 24, 28,
+ 32, 36, 40, 44, 48, 52, 56, 56,
+ 56, 60, 60, 64, 64, 64, 68, 72,
+ 76, 80, 84, 88, 92, 96, 100, 104,
+ 104, 108, 108, 112, 112, 112, 116, 120,
+ 120, 120, 120, 124, 128, 132, 136, 136,
+ 136, 140, 144, 148, 152, 156, 160, 164,
+ 168, 172, 176, 180, 184, 188, 192, 196,
+ 200, 200, 200, 200, 200, 200, 200, 200,
+ 200, 200, 200, 200, 200, 200, 200, 200,
+ 200, 200, 200, 200, 200, 200, 200, 200,
+ 200, 200, 200, 200, 200, 200, 200, 200,
+ 200, 200, 200, 200, 200, 200, 200, 200,
+ 200, 200, 200, 200, 200, 200, 200, 200,
+ 200, 200, 200, 200, 200, 200, 200, 200,
+ 200, 200, 200, 200, 200, 200, 200, 200,
+ 200,
+ },
+ { /* Fourth byte table 75. */
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 4, 8, 12, 16, 20, 24, 28,
+ 32, 36, 40, 44, 48, 52, 56, 60,
+ 64, 68, 72, 76, 80, 84, 88, 92,
+ 96, 100, 104, 108, 112, 116, 120, 124,
+ 128, 132, 136, 140, 144, 148, 152, 156,
+ 160, 164, 168, 172, 172, 172, 172, 172,
+ 172, 172, 172, 172, 172, 172, 172, 172,
+ 172, 172, 172, 172, 172, 172, 172, 172,
+ 172, 172, 172, 172, 172, 172, 172, 172,
+ 172, 172, 172, 172, 172, 172, 172, 172,
+ 172, 172, 172, 172, 172, 172, 172, 172,
+ 172, 172, 172, 172, 172, 172, 172, 172,
+ 172, 172, 172, 172, 172, 172, 172, 172,
+ 172, 172, 172, 172, 172, 172, 172, 172,
+ 172, 172, 172, 172, 172, 172, 172, 172,
+ 172, 172, 172, 172, 172, 172, 172, 172,
+ 172,
+ },
+ { /* Fourth byte table 76. */
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 2, 4, 6, 9, 12, 14, 16,
+ 16, 16, 16, 16, 16, 16, 16, 16,
+ 16, 16, 16, 16, 20, 24, 28, 32,
+ 36, 36, 36, 36, 36, 36, 41, 41,
+ 46, 48, 50, 52, 54, 56, 58, 60,
+ 62, 64, 65, 70, 75, 82, 89, 94,
+ 99, 104, 109, 114, 119, 124, 129, 134,
+ 134, 139, 144, 149, 154, 159, 159, 164,
+ 164, 164, 164, 164, 164, 164, 164, 164,
+ 164, 164, 164, 164, 164, 164, 164, 164,
+ 164, 164, 164, 164, 164, 164, 164, 164,
+ 164, 164, 164, 164, 164, 164, 164, 164,
+ 164, 164, 164, 164, 164, 164, 164, 164,
+ 164, 164, 164, 164, 164, 164, 164, 164,
+ 164, 164, 164, 164, 164, 164, 164, 164,
+ 164, 164, 164, 164, 164, 164, 164, 164,
+ 164,
+ },
+ { /* Fourth byte table 77. */
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 5, 10, 10, 15, 20, 20, 25,
+ 30, 35, 40, 45, 50, 55, 60, 65,
+ 69, 71, 73, 75, 77, 79, 81, 83,
+ 85, 87, 89, 91, 93, 95, 97, 99,
+ 101, 103, 105, 107, 109, 111, 113, 115,
+ 117, 119, 121, 123, 125, 127, 129, 131,
+ 133, 135, 137, 139, 141, 143, 145, 147,
+ 149, 151, 153, 155, 157, 159, 161, 163,
+ 165, 165, 165, 165, 165, 165, 165, 165,
+ 165, 165, 165, 165, 165, 165, 165, 165,
+ 165, 165, 165, 165, 165, 165, 165, 165,
+ 165, 165, 165, 165, 165, 165, 165, 165,
+ 165, 165, 165, 165, 165, 165, 165, 165,
+ 165, 165, 165, 165, 165, 165, 165, 165,
+ 165, 165, 165, 165, 165, 165, 165, 165,
+ 165, 165, 165, 165, 165, 165, 165, 165,
+ 165,
+ },
+ { /* Fourth byte table 78. */
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 2, 4, 6, 8, 10, 12, 14,
+ 16, 18, 20, 22, 24, 26, 28, 30,
+ 32, 34, 36, 38, 40, 42, 44, 46,
+ 48, 50, 52, 54, 56, 58, 60, 62,
+ 64, 66, 68, 70, 72, 76, 80, 82,
+ 84, 86, 88, 90, 92, 94, 96, 98,
+ 100, 104, 108, 108, 108, 108, 108, 108,
+ 108, 108, 108, 108, 108, 108, 108, 108,
+ 108, 108, 108, 108, 108, 108, 108, 108,
+ 108, 108, 108, 108, 108, 108, 108, 108,
+ 108, 108, 108, 108, 108, 108, 108, 108,
+ 108, 108, 108, 108, 108, 108, 108, 108,
+ 108, 108, 108, 108, 108, 108, 108, 108,
+ 108, 108, 108, 108, 108, 108, 108, 108,
+ 108, 108, 108, 108, 108, 108, 108, 108,
+ 108, 108, 108, 108, 108, 108, 108, 108,
+ 108,
+ },
+ { /* Fourth byte table 79. */
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 2, 4, 6, 8,
+ 10, 12, 14, 16, 18, 20, 24, 26,
+ 28, 30, 32, 34, 36, 38, 40, 42,
+ 44, 46, 48, 54, 60, 66, 72, 78,
+ 84, 90, 96, 102, 108, 114, 120, 126,
+ 132, 138, 144, 150, 156, 158, 160, 162,
+ 164, 164, 164, 164, 164, 164, 164, 164,
+ 164, 164, 164, 164, 164, 164, 164, 164,
+ 164, 164, 164, 164, 164, 164, 164, 164,
+ 164, 164, 164, 164, 164, 164, 164, 164,
+ 164, 164, 164, 164, 164, 164, 164, 164,
+ 164, 164, 164, 164, 164, 164, 164, 164,
+ 164, 164, 164, 164, 164, 164, 164, 164,
+ 164, 164, 164, 164, 164, 164, 164, 164,
+ 164,
+ },
+ { /* Fourth byte table 80. */
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 4, 8, 12, 16, 20, 24, 28,
+ 32, 36, 40, 44, 48, 52, 56, 60,
+ 64, 68, 72, 76, 80, 84, 88, 92,
+ 96, 100, 104, 108, 112, 116, 120, 124,
+ 128, 132, 136, 140, 144, 148, 152, 156,
+ 160, 164, 168, 172, 176, 180, 184, 188,
+ 192, 196, 200, 204, 208, 212, 216, 220,
+ 224, 228, 232, 236, 240, 244, 248, 248,
+ 248, 248, 248, 248, 248, 248, 248, 248,
+ 248, 248, 248, 248, 248, 248, 248, 248,
+ 248, 248, 248, 248, 248, 248, 248, 248,
+ 248, 248, 248, 248, 248, 248, 248, 248,
+ 248, 248, 248, 248, 248, 248, 248, 248,
+ 248, 248, 248, 248, 248, 248, 248, 248,
+ 248, 248, 248, 248, 248, 248, 248, 248,
+ 248, 248, 248, 248, 248, 248, 248, 248,
+ 248,
+ },
+ { /* Fourth byte table 81. */
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 6, 12, 18, 24, 30, 36, 42,
+ 48, 48, 48, 48, 48, 48, 48, 48,
+ 48, 48, 48, 48, 48, 48, 48, 48,
+ 48, 48, 48, 48, 48, 48, 48, 48,
+ 48, 48, 48, 48, 48, 48, 48, 48,
+ 48, 48, 48, 48, 48, 48, 48, 48,
+ 48, 54, 60, 68, 76, 84, 92, 100,
+ 108, 116, 122, 155, 170, 178, 178, 178,
+ 178, 178, 178, 178, 178, 178, 178, 178,
+ 178, 178, 178, 178, 178, 178, 178, 178,
+ 178, 178, 178, 178, 178, 178, 178, 178,
+ 178, 178, 178, 178, 178, 178, 178, 178,
+ 178, 178, 178, 178, 178, 178, 178, 178,
+ 178, 178, 178, 178, 178, 178, 178, 178,
+ 178, 178, 178, 178, 178, 178, 178, 178,
+ 178, 178, 178, 178, 178, 178, 178, 178,
+ 178,
+ },
+ { /* Fourth byte table 82. */
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 2, 5, 8, 9, 10, 11, 12,
+ 13, 14, 17, 20, 23, 26, 29, 32,
+ 35, 35, 35, 35, 35, 35, 35, 35,
+ 35, 35, 35, 35, 35, 35, 35, 35,
+ 35, 35, 35, 35, 35, 35, 35, 35,
+ 35, 35, 35, 35, 35, 35, 35, 35,
+ 35, 35, 35, 35, 35, 35, 35, 35,
+ 35, 35, 35, 35, 35, 35, 35, 35,
+ 35, 35, 35, 35, 35, 35, 35, 35,
+ 35, 35, 35, 35, 35, 35, 35, 35,
+ 35,
+ },
+ { /* Fourth byte table 83. */
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 3, 6, 9, 12, 15, 15, 15,
+ 15, 15, 18, 21, 24, 27, 28, 29,
+ 30, 31, 34, 35, 35, 36, 37, 38,
+ 39, 42, 43, 44, 45, 46, 49, 52,
+ 53, 54, 55, 56, 57, 58, 59, 60,
+ 60, 61, 62, 63, 64, 64, 64, 64,
+ 64, 67, 71, 74, 74, 77, 77, 80,
+ 84, 87, 91, 94, 98, 101, 105, 108,
+ 112, 112, 112, 112, 112, 112, 112, 112,
+ 112, 112, 112, 112, 112, 112, 112, 112,
+ 112, 112, 112, 112, 112, 112, 112, 112,
+ 112, 112, 112, 112, 112, 112, 112, 112,
+ 112, 112, 112, 112, 112, 112, 112, 112,
+ 112, 112, 112, 112, 112, 112, 112, 112,
+ 112, 112, 112, 112, 112, 112, 112, 112,
+ 112, 112, 112, 112, 112, 112, 112, 112,
+ 112,
+ },
+ { /* Fourth byte table 84. */
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 2, 6, 10, 14, 18, 22, 26,
+ 30, 34, 38, 42, 46, 50, 52, 54,
+ 56, 58, 60, 62, 64, 66, 68, 70,
+ 72, 74, 76, 78, 80, 82, 84, 86,
+ 88, 90, 92, 94, 96, 98, 100, 102,
+ 104, 106, 108, 110, 112, 114, 116, 118,
+ 120, 122, 124, 126, 128, 130, 132, 134,
+ 136, 138, 140, 142, 144, 146, 148, 150,
+ 152, 152, 152, 152, 152, 152, 152, 152,
+ 152, 152, 152, 152, 152, 152, 152, 152,
+ 152, 152, 152, 152, 152, 152, 152, 152,
+ 152, 152, 152, 152, 152, 152, 152, 152,
+ 152, 152, 152, 152, 152, 152, 152, 152,
+ 152, 152, 152, 152, 152, 152, 152, 152,
+ 152, 152, 152, 152, 152, 152, 152, 152,
+ 152, 152, 152, 152, 152, 152, 152, 152,
+ 152,
+ },
+ { /* Fourth byte table 85. */
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 2, 4, 6, 8, 10, 12, 14,
+ 16, 18, 20, 22, 24, 26, 28, 30,
+ 32, 34, 36, 38, 40, 42, 44, 46,
+ 48, 50, 52, 54, 56, 58, 60, 62,
+ 64, 66, 68, 70, 72, 74, 76, 78,
+ 80, 82, 84, 86, 88, 90, 92, 94,
+ 96, 98, 100, 102, 104, 106, 112, 118,
+ 124, 130, 136, 142, 146, 150, 150, 150,
+ 150, 150, 150, 150, 150, 150, 150, 150,
+ 150, 150, 150, 150, 150, 150, 150, 150,
+ 150, 150, 150, 150, 150, 150, 150, 150,
+ 150, 150, 150, 150, 150, 150, 150, 150,
+ 150, 150, 150, 150, 150, 150, 150, 150,
+ 150, 150, 150, 150, 150, 150, 150, 150,
+ 150, 150, 150, 150, 150, 150, 150, 150,
+ 150, 150, 150, 150, 150, 150, 150, 150,
+ 150,
+ },
+ { /* Fourth byte table 86. */
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 1, 2, 3, 4, 5, 6,
+ 7, 8, 9, 10, 11, 12, 13, 14,
+ 15, 16, 17, 18, 19, 20, 21, 22,
+ 23, 24, 25, 26, 27, 28, 29, 30,
+ 31, 32, 33, 34, 35, 36, 37, 38,
+ 39, 40, 41, 42, 43, 44, 45, 46,
+ 47, 48, 49, 50, 51, 52, 53, 54,
+ 55, 56, 57, 58, 59, 60, 61, 62,
+ 63, 63, 63, 63, 63, 63, 63, 63,
+ 63, 63, 63, 63, 63, 63, 63, 63,
+ 63, 63, 63, 63, 63, 63, 63, 63,
+ 63, 63, 63, 63, 63, 63, 63, 63,
+ 63, 63, 63, 63, 63, 63, 63, 63,
+ 63, 63, 63, 63, 63, 63, 63, 63,
+ 63, 63, 63, 63, 63, 63, 63, 63,
+ 63, 63, 63, 63, 63, 63, 63, 63,
+ 63,
+ },
+ { /* Fourth byte table 87. */
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 1, 2, 3, 4, 5, 6, 7,
+ 8, 9, 10, 11, 12, 13, 14, 15,
+ 16, 17, 18, 19, 20, 21, 22, 23,
+ 24, 25, 26, 27, 28, 29, 30, 31,
+ 34, 37, 40, 43, 46, 49, 52, 55,
+ 58, 61, 64, 67, 70, 73, 76, 79,
+ 82, 85, 88, 91, 94, 97, 100, 103,
+ 106, 109, 112, 115, 118, 121, 124, 127,
+ 130, 130, 130, 130, 130, 130, 130, 130,
+ 130, 130, 130, 130, 130, 130, 130, 130,
+ 130, 130, 130, 130, 130, 130, 130, 130,
+ 130, 130, 130, 130, 130, 130, 130, 130,
+ 130, 130, 130, 130, 130, 130, 130, 130,
+ 130, 130, 130, 130, 130, 130, 130, 130,
+ 130, 130, 130, 130, 130, 130, 130, 130,
+ 130, 130, 130, 130, 130, 130, 130, 130,
+ 130,
+ },
+ { /* Fourth byte table 88. */
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 3, 6, 9, 12, 15, 18, 21,
+ 24, 27, 30, 33, 36, 39, 42, 45,
+ 48, 51, 54, 57, 60, 63, 66, 69,
+ 72, 75, 78, 81, 84, 87, 90, 93,
+ 96, 99, 102, 105, 108, 111, 114, 117,
+ 120, 123, 126, 129, 132, 135, 138, 141,
+ 144, 147, 150, 153, 156, 159, 162, 165,
+ 168, 171, 174, 177, 180, 183, 186, 189,
+ 189, 189, 189, 189, 189, 189, 189, 189,
+ 189, 189, 189, 189, 189, 189, 189, 189,
+ 189, 189, 189, 189, 189, 189, 189, 189,
+ 189, 189, 189, 189, 189, 189, 189, 189,
+ 189, 189, 189, 189, 189, 189, 189, 189,
+ 189, 189, 189, 189, 189, 189, 189, 189,
+ 189, 189, 189, 189, 189, 189, 189, 189,
+ 189, 189, 189, 189, 189, 189, 189, 189,
+ 189,
+ },
+ { /* Fourth byte table 89. */
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 3, 6, 9, 12, 15,
+ 18, 18, 18, 21, 24, 27, 30, 33,
+ 36, 36, 36, 39, 42, 45, 48, 51,
+ 54, 54, 54, 57, 60, 63, 63, 63,
+ 63, 65, 67, 69, 72, 74, 76, 79,
+ 79, 82, 85, 88, 91, 94, 97, 100,
+ 100, 100, 100, 100, 100, 100, 100, 100,
+ 100, 100, 100, 100, 100, 100, 100, 100,
+ 100, 100, 100, 100, 100, 100, 100, 100,
+ 100, 100, 100, 100, 100, 100, 100, 100,
+ 100, 100, 100, 100, 100, 100, 100, 100,
+ 100, 100, 100, 100, 100, 100, 100, 100,
+ 100, 100, 100, 100, 100, 100, 100, 100,
+ 100, 100, 100, 100, 100, 100, 100, 100,
+ 100, 100, 100, 100, 100, 100, 100, 100,
+ 100, 100, 100, 100, 100, 100, 100, 100,
+ 100,
+ },
+ { /* Fourth byte table 90. */
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 9,
+ 18, 31, 44, 57, 70, 83, 83, 83,
+ 83, 83, 83, 83, 83, 83, 83, 83,
+ 83, 83, 83, 83, 83, 83, 83, 83,
+ 83, 83, 83, 83, 83, 83, 83, 83,
+ 83, 83, 83, 83, 83, 83, 83, 83,
+ 83, 83, 83, 83, 83, 83, 83, 83,
+ 83, 83, 83, 83, 83, 83, 83, 83,
+ 83, 83, 83, 83, 83, 83, 83, 83,
+ 83, 83, 83, 83, 83, 83, 83, 83,
+ 83, 83, 83, 83, 83, 83, 83, 83,
+ 83, 83, 83, 83, 83, 83, 83, 83,
+ 83, 83, 83, 83, 83, 83, 83, 83,
+ 83,
+ },
+ { /* Fourth byte table 91. */
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 9, 18, 31, 44,
+ 57, 57, 57, 57, 57, 57, 57, 57,
+ 57, 57, 57, 57, 57, 57, 57, 57,
+ 57, 57, 57, 57, 57, 57, 57, 57,
+ 57, 57, 57, 57, 57, 57, 57, 57,
+ 57, 57, 57, 57, 57, 57, 57, 57,
+ 57, 57, 57, 57, 57, 57, 57, 57,
+ 57, 57, 57, 57, 57, 57, 57, 57,
+ 57, 57, 57, 57, 57, 57, 57, 57,
+ 57,
+ },
+ { /* Fourth byte table 92. */
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 13, 13, 13, 13, 13, 13, 13,
+ 13, 13, 13, 13, 13, 13, 13, 13,
+ 13, 13, 13, 13, 13, 13, 13, 13,
+ 13, 13, 13, 13, 13, 13, 13, 13,
+ 13, 13, 13, 13, 13, 13, 13, 13,
+ 13, 13, 13, 13, 13, 13, 13, 13,
+ 13, 13, 13, 13, 13, 13, 13, 13,
+ 13, 13, 13, 13, 13, 13, 13, 13,
+ 13, 13, 13, 13, 13, 13, 13, 13,
+ 13, 13, 13, 13, 13, 13, 13, 13,
+ 13, 13, 13, 13, 13, 13, 13, 13,
+ 13, 13, 13, 13, 13, 13, 13, 13,
+ 13, 13, 13, 13, 13, 13, 13, 13,
+ 13, 13, 13, 13, 13, 13, 13, 13,
+ 13, 13, 13, 13, 13, 13, 13, 13,
+ 13, 13, 13, 13, 13, 13, 13, 13,
+ 13,
+ },
+ { /* Fourth byte table 93. */
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 1, 2, 3, 4, 5, 6, 7,
+ 8, 9, 10, 11, 12, 13, 14, 15,
+ 16, 17, 18, 19, 20, 21, 22, 23,
+ 24, 25, 26, 27, 28, 29, 30, 31,
+ 32, 33, 34, 35, 36, 37, 38, 39,
+ 40, 41, 42, 43, 44, 45, 46, 47,
+ 48, 49, 50, 51, 52, 53, 54, 55,
+ 56, 57, 58, 59, 60, 61, 62, 63,
+ 64, 64, 64, 64, 64, 64, 64, 64,
+ 64, 64, 64, 64, 64, 64, 64, 64,
+ 64, 64, 64, 64, 64, 64, 64, 64,
+ 64, 64, 64, 64, 64, 64, 64, 64,
+ 64, 64, 64, 64, 64, 64, 64, 64,
+ 64, 64, 64, 64, 64, 64, 64, 64,
+ 64, 64, 64, 64, 64, 64, 64, 64,
+ 64, 64, 64, 64, 64, 64, 64, 64,
+ 64,
+ },
+ { /* Fourth byte table 94. */
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 1, 2, 3, 4, 5, 6, 7,
+ 8, 9, 10, 11, 12, 13, 14, 15,
+ 16, 17, 18, 19, 20, 21, 21, 22,
+ 23, 24, 25, 26, 27, 28, 29, 30,
+ 31, 32, 33, 34, 35, 36, 37, 38,
+ 39, 40, 41, 42, 43, 44, 45, 46,
+ 47, 48, 49, 50, 51, 52, 53, 54,
+ 55, 56, 57, 58, 59, 60, 61, 62,
+ 63, 63, 63, 63, 63, 63, 63, 63,
+ 63, 63, 63, 63, 63, 63, 63, 63,
+ 63, 63, 63, 63, 63, 63, 63, 63,
+ 63, 63, 63, 63, 63, 63, 63, 63,
+ 63, 63, 63, 63, 63, 63, 63, 63,
+ 63, 63, 63, 63, 63, 63, 63, 63,
+ 63, 63, 63, 63, 63, 63, 63, 63,
+ 63, 63, 63, 63, 63, 63, 63, 63,
+ 63,
+ },
+ { /* Fourth byte table 95. */
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 1, 2, 3, 4, 5, 6, 7,
+ 8, 9, 10, 11, 12, 13, 14, 15,
+ 16, 17, 18, 19, 20, 21, 22, 23,
+ 24, 25, 26, 27, 28, 29, 29, 30,
+ 31, 31, 31, 32, 32, 32, 33, 34,
+ 34, 34, 35, 36, 37, 38, 38, 39,
+ 40, 41, 42, 43, 44, 45, 46, 47,
+ 48, 49, 50, 50, 51, 51, 52, 53,
+ 54, 54, 54, 54, 54, 54, 54, 54,
+ 54, 54, 54, 54, 54, 54, 54, 54,
+ 54, 54, 54, 54, 54, 54, 54, 54,
+ 54, 54, 54, 54, 54, 54, 54, 54,
+ 54, 54, 54, 54, 54, 54, 54, 54,
+ 54, 54, 54, 54, 54, 54, 54, 54,
+ 54, 54, 54, 54, 54, 54, 54, 54,
+ 54, 54, 54, 54, 54, 54, 54, 54,
+ 54,
+ },
+ { /* Fourth byte table 96. */
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 1, 1, 2, 3, 3, 4, 5,
+ 6, 7, 8, 9, 10, 11, 12, 13,
+ 14, 15, 16, 17, 18, 19, 20, 21,
+ 22, 23, 24, 25, 26, 27, 28, 29,
+ 30, 31, 32, 33, 34, 35, 36, 37,
+ 38, 39, 40, 41, 42, 43, 44, 45,
+ 46, 47, 48, 49, 50, 51, 52, 53,
+ 54, 55, 56, 57, 58, 59, 60, 61,
+ 62, 62, 62, 62, 62, 62, 62, 62,
+ 62, 62, 62, 62, 62, 62, 62, 62,
+ 62, 62, 62, 62, 62, 62, 62, 62,
+ 62, 62, 62, 62, 62, 62, 62, 62,
+ 62, 62, 62, 62, 62, 62, 62, 62,
+ 62, 62, 62, 62, 62, 62, 62, 62,
+ 62, 62, 62, 62, 62, 62, 62, 62,
+ 62, 62, 62, 62, 62, 62, 62, 62,
+ 62,
+ },
+ { /* Fourth byte table 97. */
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 1, 2, 3, 4, 5, 6, 6,
+ 7, 8, 9, 10, 10, 10, 11, 12,
+ 13, 14, 15, 16, 17, 18, 18, 19,
+ 20, 21, 22, 23, 24, 25, 25, 26,
+ 27, 28, 29, 30, 31, 32, 33, 34,
+ 35, 36, 37, 38, 39, 40, 41, 42,
+ 43, 44, 45, 46, 47, 48, 49, 50,
+ 51, 52, 53, 53, 54, 55, 56, 57,
+ 57, 57, 57, 57, 57, 57, 57, 57,
+ 57, 57, 57, 57, 57, 57, 57, 57,
+ 57, 57, 57, 57, 57, 57, 57, 57,
+ 57, 57, 57, 57, 57, 57, 57, 57,
+ 57, 57, 57, 57, 57, 57, 57, 57,
+ 57, 57, 57, 57, 57, 57, 57, 57,
+ 57, 57, 57, 57, 57, 57, 57, 57,
+ 57, 57, 57, 57, 57, 57, 57, 57,
+ 57,
+ },
+ { /* Fourth byte table 98. */
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 1, 2, 3, 4, 5, 5, 6,
+ 6, 6, 6, 7, 8, 9, 10, 11,
+ 12, 13, 13, 14, 15, 16, 17, 18,
+ 19, 20, 21, 22, 23, 24, 25, 26,
+ 27, 28, 29, 30, 31, 32, 33, 34,
+ 35, 36, 37, 38, 39, 40, 41, 42,
+ 43, 44, 45, 46, 47, 48, 49, 50,
+ 51, 52, 53, 54, 55, 56, 57, 58,
+ 59, 59, 59, 59, 59, 59, 59, 59,
+ 59, 59, 59, 59, 59, 59, 59, 59,
+ 59, 59, 59, 59, 59, 59, 59, 59,
+ 59, 59, 59, 59, 59, 59, 59, 59,
+ 59, 59, 59, 59, 59, 59, 59, 59,
+ 59, 59, 59, 59, 59, 59, 59, 59,
+ 59, 59, 59, 59, 59, 59, 59, 59,
+ 59, 59, 59, 59, 59, 59, 59, 59,
+ 59,
+ },
+ { /* Fourth byte table 99. */
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 1, 2, 3, 4, 5, 6, 7,
+ 8, 9, 10, 11, 12, 13, 14, 15,
+ 16, 17, 18, 19, 20, 21, 22, 23,
+ 24, 25, 26, 27, 28, 29, 30, 31,
+ 32, 33, 34, 35, 36, 37, 38, 39,
+ 40, 41, 42, 43, 44, 45, 46, 47,
+ 48, 49, 50, 51, 52, 53, 54, 55,
+ 56, 57, 58, 59, 60, 61, 62, 63,
+ 64, 64, 64, 64, 64, 64, 64, 64,
+ 64, 64, 64, 64, 64, 64, 64, 64,
+ 64, 64, 64, 64, 64, 64, 64, 64,
+ 64, 64, 64, 64, 64, 64, 64, 64,
+ 64, 64, 64, 64, 64, 64, 64, 64,
+ 64, 64, 64, 64, 64, 64, 64, 64,
+ 64, 64, 64, 64, 64, 64, 64, 64,
+ 64, 64, 64, 64, 64, 64, 64, 64,
+ 64,
+ },
+ { /* Fourth byte table 100. */
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 1, 2, 3, 4, 5, 6, 7,
+ 8, 9, 10, 11, 12, 13, 14, 15,
+ 16, 17, 18, 19, 20, 21, 22, 23,
+ 24, 25, 26, 27, 28, 29, 30, 31,
+ 32, 33, 34, 35, 36, 37, 38, 39,
+ 40, 41, 42, 43, 44, 45, 46, 47,
+ 48, 49, 50, 51, 52, 53, 54, 55,
+ 56, 57, 58, 59, 60, 61, 62, 63,
+ 64, 64, 64, 64, 64, 64, 64, 64,
+ 64, 64, 64, 64, 64, 64, 64, 64,
+ 64, 64, 64, 64, 64, 64, 64, 64,
+ 64, 64, 64, 64, 64, 64, 64, 64,
+ 64, 64, 64, 64, 64, 64, 64, 64,
+ 64, 64, 64, 64, 64, 64, 64, 64,
+ 64, 64, 64, 64, 64, 64, 64, 64,
+ 64, 64, 64, 64, 64, 64, 64, 64,
+ 64,
+ },
+ { /* Fourth byte table 101. */
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 1, 2, 3, 4, 5, 6, 7,
+ 8, 9, 10, 11, 12, 13, 14, 15,
+ 16, 17, 18, 19, 20, 21, 22, 23,
+ 24, 25, 26, 27, 28, 29, 30, 31,
+ 32, 33, 34, 35, 36, 37, 38, 39,
+ 40, 41, 42, 43, 44, 45, 46, 47,
+ 48, 49, 50, 51, 52, 53, 54, 55,
+ 56, 57, 58, 59, 60, 61, 62, 63,
+ 64, 64, 64, 64, 64, 64, 64, 64,
+ 64, 64, 64, 64, 64, 64, 64, 64,
+ 64, 64, 64, 64, 64, 64, 64, 64,
+ 64, 64, 64, 64, 64, 64, 64, 64,
+ 64, 64, 64, 64, 64, 64, 64, 64,
+ 64, 64, 64, 64, 64, 64, 64, 64,
+ 64, 64, 64, 64, 64, 64, 64, 64,
+ 64, 64, 64, 64, 64, 64, 64, 64,
+ 64,
+ },
+ { /* Fourth byte table 102. */
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 1, 2, 3, 4, 5, 6, 7,
+ 8, 9, 10, 11, 12, 13, 14, 15,
+ 16, 17, 18, 19, 20, 21, 22, 23,
+ 24, 25, 26, 27, 28, 29, 30, 31,
+ 32, 33, 34, 35, 36, 37, 38, 39,
+ 40, 41, 42, 43, 44, 45, 46, 47,
+ 48, 49, 50, 51, 52, 53, 54, 55,
+ 56, 57, 58, 59, 60, 61, 62, 63,
+ 64, 64, 64, 64, 64, 64, 64, 64,
+ 64, 64, 64, 64, 64, 64, 64, 64,
+ 64, 64, 64, 64, 64, 64, 64, 64,
+ 64, 64, 64, 64, 64, 64, 64, 64,
+ 64, 64, 64, 64, 64, 64, 64, 64,
+ 64, 64, 64, 64, 64, 64, 64, 64,
+ 64, 64, 64, 64, 64, 64, 64, 64,
+ 64, 64, 64, 64, 64, 64, 64, 64,
+ 64,
+ },
+ { /* Fourth byte table 103. */
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 1, 2, 3, 4, 5, 6, 7,
+ 8, 9, 10, 11, 12, 13, 14, 15,
+ 16, 17, 18, 19, 20, 21, 22, 23,
+ 24, 25, 26, 27, 28, 29, 30, 31,
+ 32, 33, 34, 35, 36, 36, 36, 36,
+ 36, 38, 40, 42, 44, 46, 48, 50,
+ 52, 54, 56, 58, 60, 62, 64, 66,
+ 68, 70, 72, 74, 76, 78, 80, 82,
+ 84, 84, 84, 84, 84, 84, 84, 84,
+ 84, 84, 84, 84, 84, 84, 84, 84,
+ 84, 84, 84, 84, 84, 84, 84, 84,
+ 84, 84, 84, 84, 84, 84, 84, 84,
+ 84, 84, 84, 84, 84, 84, 84, 84,
+ 84, 84, 84, 84, 84, 84, 84, 84,
+ 84, 84, 84, 84, 84, 84, 84, 84,
+ 84, 84, 84, 84, 84, 84, 84, 84,
+ 84,
+ },
+ { /* Fourth byte table 104. */
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 2, 5, 7, 9, 11, 13, 15,
+ 17, 19, 21, 23, 25, 27, 29, 31,
+ 33, 35, 37, 39, 41, 43, 45, 47,
+ 49, 51, 53, 55, 58, 60, 62, 64,
+ 66, 68, 70, 72, 74, 76, 78, 80,
+ 82, 84, 86, 88, 90, 92, 94, 96,
+ 98, 100, 102, 104, 106, 108, 110, 112,
+ 114, 116, 118, 120, 123, 125, 127, 129,
+ 131, 131, 131, 131, 131, 131, 131, 131,
+ 131, 131, 131, 131, 131, 131, 131, 131,
+ 131, 131, 131, 131, 131, 131, 131, 131,
+ 131, 131, 131, 131, 131, 131, 131, 131,
+ 131, 131, 131, 131, 131, 131, 131, 131,
+ 131, 131, 131, 131, 131, 131, 131, 131,
+ 131, 131, 131, 131, 131, 131, 131, 131,
+ 131, 131, 131, 131, 131, 131, 131, 131,
+ 131,
+ },
+ { /* Fourth byte table 105. */
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 2, 4, 6, 8, 10, 12, 14,
+ 16, 18, 20, 22, 24, 26, 28, 30,
+ 32, 34, 36, 38, 40, 42, 45, 47,
+ 49, 51, 53, 55, 57, 59, 61, 63,
+ 65, 67, 69, 71, 73, 75, 77, 79,
+ 81, 83, 85, 87, 89, 91, 93, 95,
+ 97, 99, 101, 103, 105, 107, 110, 112,
+ 114, 116, 118, 120, 122, 124, 126, 128,
+ 130, 130, 130, 130, 130, 130, 130, 130,
+ 130, 130, 130, 130, 130, 130, 130, 130,
+ 130, 130, 130, 130, 130, 130, 130, 130,
+ 130, 130, 130, 130, 130, 130, 130, 130,
+ 130, 130, 130, 130, 130, 130, 130, 130,
+ 130, 130, 130, 130, 130, 130, 130, 130,
+ 130, 130, 130, 130, 130, 130, 130, 130,
+ 130, 130, 130, 130, 130, 130, 130, 130,
+ 130,
+ },
+ { /* Fourth byte table 106. */
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 2, 4, 6, 8, 10, 12, 14,
+ 16, 18, 20, 22, 24, 26, 28, 30,
+ 33, 35, 37, 39, 41, 43, 45, 47,
+ 49, 51, 53, 55, 57, 59, 61, 63,
+ 65, 67, 69, 71, 73, 75, 77, 79,
+ 81, 83, 85, 87, 89, 91, 93, 95,
+ 98, 100, 102, 104, 106, 108, 110, 112,
+ 114, 116, 118, 120, 122, 124, 126, 128,
+ 130, 130, 130, 130, 130, 130, 130, 130,
+ 130, 130, 130, 130, 130, 130, 130, 130,
+ 130, 130, 130, 130, 130, 130, 130, 130,
+ 130, 130, 130, 130, 130, 130, 130, 130,
+ 130, 130, 130, 130, 130, 130, 130, 130,
+ 130, 130, 130, 130, 130, 130, 130, 130,
+ 130, 130, 130, 130, 130, 130, 130, 130,
+ 130, 130, 130, 130, 130, 130, 130, 130,
+ 130,
+ },
+ { /* Fourth byte table 107. */
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 2, 4, 6, 8, 10, 12, 14,
+ 16, 18, 21, 23, 25, 27, 29, 31,
+ 33, 35, 37, 39, 41, 43, 45, 47,
+ 49, 51, 53, 55, 57, 59, 61, 63,
+ 65, 67, 69, 71, 73, 75, 77, 79,
+ 81, 83, 86, 88, 90, 92, 94, 96,
+ 98, 100, 102, 104, 106, 108, 110, 112,
+ 114, 116, 118, 120, 122, 124, 126, 128,
+ 130, 130, 130, 130, 130, 130, 130, 130,
+ 130, 130, 130, 130, 130, 130, 130, 130,
+ 130, 130, 130, 130, 130, 130, 130, 130,
+ 130, 130, 130, 130, 130, 130, 130, 130,
+ 130, 130, 130, 130, 130, 130, 130, 130,
+ 130, 130, 130, 130, 130, 130, 130, 130,
+ 130, 130, 130, 130, 130, 130, 130, 130,
+ 130, 130, 130, 130, 130, 130, 130, 130,
+ 130,
+ },
+ { /* Fourth byte table 108. */
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 2, 4, 6, 9, 11, 13, 15,
+ 17, 19, 21, 21, 21, 21, 21, 22,
+ 23, 24, 25, 26, 27, 28, 29, 30,
+ 31, 32, 33, 34, 35, 36, 37, 38,
+ 39, 40, 41, 42, 43, 44, 45, 46,
+ 47, 48, 49, 50, 51, 52, 53, 54,
+ 55, 56, 57, 58, 59, 60, 61, 62,
+ 63, 64, 65, 66, 67, 68, 69, 70,
+ 71, 71, 71, 71, 71, 71, 71, 71,
+ 71, 71, 71, 71, 71, 71, 71, 71,
+ 71, 71, 71, 71, 71, 71, 71, 71,
+ 71, 71, 71, 71, 71, 71, 71, 71,
+ 71, 71, 71, 71, 71, 71, 71, 71,
+ 71, 71, 71, 71, 71, 71, 71, 71,
+ 71, 71, 71, 71, 71, 71, 71, 71,
+ 71, 71, 71, 71, 71, 71, 71, 71,
+ 71,
+ },
+ { /* Fourth byte table 109. */
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 4, 9, 13, 17, 21, 25, 29,
+ 33, 37, 42, 46, 50, 54, 58, 62,
+ 66, 71, 75, 80, 85, 90, 94, 98,
+ 102, 106, 110, 114, 118, 122, 127, 127,
+ 127, 127, 127, 127, 127, 127, 127, 127,
+ 127, 127, 127, 127, 127, 127, 127, 127,
+ 127, 127, 127, 127, 127, 127, 127, 127,
+ 127, 127, 127, 127, 127, 127, 127, 127,
+ 127, 127, 127, 127, 127, 127, 127, 127,
+ 127, 127, 127, 127, 127, 127, 127, 127,
+ 127, 127, 127, 127, 127, 127, 127, 127,
+ 127, 127, 127, 127, 127, 127, 127, 127,
+ 127, 127, 127, 127, 127, 127, 127, 127,
+ 127, 127, 127, 127, 127, 127, 127, 127,
+ 127, 127, 127, 127, 127, 127, 127, 127,
+ 127, 127, 127, 127, 127, 127, 127, 127,
+ 127,
+ },
+ { /* Fourth byte table 110. */
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0,
+ },
+ { /* Fourth byte table 111. */
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0,
+ },
+ { /* Fourth byte table 112. */
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0,
+ },
+ { /* Fourth byte table 113. */
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0,
+ },
+ { /* Fourth byte table 114. */
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0,
+ },
+ { /* Fourth byte table 115. */
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0,
+ },
+ { /* Fourth byte table 116. */
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0,
+ },
+ { /* Fourth byte table 117. */
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0,
+ },
+ },
+ {
+ { /* Fourth byte table 0. */
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 1, 1, 1, 1, 1, 1, 1,
+ 1, 4, 4, 5, 5, 5, 5, 5,
+ 8, 8, 8, 9, 10, 13, 15, 15,
+ 15, 18, 19, 20, 20, 25, 30, 35,
+ 35, 35, 35, 35, 35, 35, 35, 35,
+ 35, 35, 35, 35, 35, 35, 35, 35,
+ 35, 35, 35, 35, 35, 35, 35, 35,
+ 35, 35, 35, 35, 35, 35, 35, 35,
+ 35, 35, 35, 35, 35, 35, 35, 35,
+ 35, 35, 35, 35, 35, 35, 35, 35,
+ 35, 35, 35, 35, 35, 35, 35, 35,
+ 35, 35, 35, 35, 35, 35, 35, 35,
+ 35,
+ },
+ { /* Fourth byte table 1. */
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 4, 8, 12, 16, 20, 24, 24,
+ 28, 32, 36, 40, 44, 48, 52, 56,
+ 60, 60, 64, 68, 72, 76, 80, 84,
+ 84, 84, 88, 92, 96, 100, 104, 104,
+ 104, 108, 112, 116, 120, 124, 128, 128,
+ 132, 136, 140, 144, 148, 152, 156, 160,
+ 164, 164, 168, 172, 176, 180, 184, 188,
+ 188, 188, 192, 196, 200, 204, 208, 208,
+ 212, 212, 212, 212, 212, 212, 212, 212,
+ 212, 212, 212, 212, 212, 212, 212, 212,
+ 212, 212, 212, 212, 212, 212, 212, 212,
+ 212, 212, 212, 212, 212, 212, 212, 212,
+ 212, 212, 212, 212, 212, 212, 212, 212,
+ 212, 212, 212, 212, 212, 212, 212, 212,
+ 212, 212, 212, 212, 212, 212, 212, 212,
+ 212, 212, 212, 212, 212, 212, 212, 212,
+ 212,
+ },
+ { /* Fourth byte table 2. */
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 4, 8, 12, 16, 20, 24, 28,
+ 32, 36, 40, 44, 48, 52, 56, 60,
+ 64, 64, 64, 68, 72, 76, 80, 84,
+ 88, 92, 96, 100, 104, 108, 112, 116,
+ 120, 124, 128, 132, 136, 140, 144, 144,
+ 144, 148, 152, 156, 160, 164, 168, 172,
+ 176, 180, 180, 182, 184, 188, 192, 196,
+ 200, 200, 204, 208, 212, 216, 220, 224,
+ 227, 227, 227, 227, 227, 227, 227, 227,
+ 227, 227, 227, 227, 227, 227, 227, 227,
+ 227, 227, 227, 227, 227, 227, 227, 227,
+ 227, 227, 227, 227, 227, 227, 227, 227,
+ 227, 227, 227, 227, 227, 227, 227, 227,
+ 227, 227, 227, 227, 227, 227, 227, 227,
+ 227, 227, 227, 227, 227, 227, 227, 227,
+ 227, 227, 227, 227, 227, 227, 227, 227,
+ 227,
+ },
+ { /* Fourth byte table 3. */
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 3, 3, 3, 7, 11, 15, 19,
+ 23, 27, 30, 30, 30, 34, 38, 42,
+ 46, 50, 54, 54, 54, 58, 62, 66,
+ 70, 74, 78, 82, 86, 90, 94, 98,
+ 102, 106, 110, 114, 118, 122, 126, 126,
+ 126, 130, 134, 138, 142, 146, 150, 154,
+ 158, 162, 166, 170, 174, 178, 182, 186,
+ 190, 194, 198, 202, 206, 210, 214, 218,
+ 219, 219, 219, 219, 219, 219, 219, 219,
+ 219, 219, 219, 219, 219, 219, 219, 219,
+ 219, 219, 219, 219, 219, 219, 219, 219,
+ 219, 219, 219, 219, 219, 219, 219, 219,
+ 219, 219, 219, 219, 219, 219, 219, 219,
+ 219, 219, 219, 219, 219, 219, 219, 219,
+ 219, 219, 219, 219, 219, 219, 219, 219,
+ 219, 219, 219, 219, 219, 219, 219, 219,
+ 219,
+ },
+ { /* Fourth byte table 4. */
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 4, 8, 8, 8, 8, 8, 8,
+ 8, 8, 8, 8, 8, 8, 8, 8,
+ 12, 16, 16, 16, 16, 16, 16, 16,
+ 16, 16, 16, 16, 16, 16, 16, 16,
+ 16, 16, 16, 16, 16, 16, 16, 16,
+ 16, 16, 16, 16, 16, 16, 16, 16,
+ 16, 16, 16, 16, 16, 16, 16, 16,
+ 16, 16, 16, 16, 16, 16, 16, 16,
+ 16, 16, 16, 16, 16, 16, 16, 16,
+ 16, 16, 16, 16, 16, 16, 16, 16,
+ 16, 16, 16, 16, 16, 16, 16, 16,
+ 16, 16, 16, 16, 16, 16, 16, 16,
+ 16,
+ },
+ { /* Fourth byte table 5. */
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 4, 8, 12,
+ 14, 16, 18, 20, 22, 24, 28, 32,
+ 36, 40, 44, 48, 52, 56, 62, 68,
+ 74, 80, 86, 92, 98, 104, 104, 110,
+ 116, 122, 128, 133, 138, 138, 138, 142,
+ 146, 150, 154, 158, 162, 168, 174, 179,
+ 184, 188, 190, 192, 194, 198, 202, 202,
+ 202, 206, 210, 216, 222, 227, 232, 237,
+ 242, 242, 242, 242, 242, 242, 242, 242,
+ 242, 242, 242, 242, 242, 242, 242, 242,
+ 242, 242, 242, 242, 242, 242, 242, 242,
+ 242, 242, 242, 242, 242, 242, 242, 242,
+ 242, 242, 242, 242, 242, 242, 242, 242,
+ 242, 242, 242, 242, 242, 242, 242, 242,
+ 242, 242, 242, 242, 242, 242, 242, 242,
+ 242, 242, 242, 242, 242, 242, 242, 242,
+ 242,
+ },
+ { /* Fourth byte table 6. */
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 4, 8, 12, 16, 20, 24, 28,
+ 32, 36, 40, 44, 48, 52, 56, 60,
+ 64, 68, 72, 76, 80, 84, 88, 92,
+ 96, 100, 104, 108, 112, 112, 112, 116,
+ 120, 120, 120, 120, 120, 120, 120, 124,
+ 128, 132, 136, 142, 148, 154, 160, 164,
+ 168, 174, 180, 184, 188, 188, 188, 188,
+ 188, 188, 188, 188, 188, 188, 188, 188,
+ 188, 188, 188, 188, 188, 188, 188, 188,
+ 188, 188, 188, 188, 188, 188, 188, 188,
+ 188, 188, 188, 188, 188, 188, 188, 188,
+ 188, 188, 188, 188, 188, 188, 188, 188,
+ 188, 188, 188, 188, 188, 188, 188, 188,
+ 188, 188, 188, 188, 188, 188, 188, 188,
+ 188, 188, 188, 188, 188, 188, 188, 188,
+ 188, 188, 188, 188, 188, 188, 188, 188,
+ 188,
+ },
+ { /* Fourth byte table 7. */
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 1, 3, 4, 5, 7, 9, 11,
+ 12, 13, 13, 13, 13, 13, 13, 13,
+ 13, 13, 13, 13, 13, 13, 13, 13,
+ 13, 13, 13, 13, 13, 13, 13, 13,
+ 13, 13, 13, 13, 13, 13, 13, 13,
+ 13, 13, 13, 13, 13, 13, 13, 13,
+ 13, 13, 13, 13, 13, 13, 13, 13,
+ 13, 13, 13, 13, 13, 13, 13, 13,
+ 13, 13, 13, 13, 13, 13, 13, 13,
+ 13, 13, 13, 13, 13, 13, 13, 13,
+ 13,
+ },
+ { /* Fourth byte table 8. */
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 3, 6, 9, 12, 15, 18, 18,
+ 18, 20, 21, 22, 23, 25, 25, 25,
+ 25, 25, 25, 25, 25, 25, 25, 25,
+ 25, 25, 25, 25, 25, 25, 25, 25,
+ 25, 25, 25, 25, 25, 25, 25, 25,
+ 25, 25, 25, 25, 25, 25, 25, 25,
+ 25, 25, 25, 25, 25, 25, 25, 25,
+ 25, 25, 25, 25, 25, 25, 25, 25,
+ 25, 25, 25, 25, 25, 25, 25, 25,
+ 25, 25, 25, 25, 25, 25, 25, 25,
+ 25, 25, 25, 25, 25, 25, 25, 25,
+ 25, 25, 25, 25, 25, 25, 25, 25,
+ 25, 25, 25, 25, 25, 25, 25, 25,
+ 25,
+ },
+ { /* Fourth byte table 9. */
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 3, 6, 6, 9, 14, 14, 14,
+ 14, 14, 14, 14, 14, 14, 14, 14,
+ 14, 14, 14, 14, 14, 14, 14, 14,
+ 14, 14, 14, 14, 14, 14, 14, 14,
+ 14, 14, 14, 14, 14, 14, 14, 14,
+ 14, 14, 14, 14, 14, 14, 14, 14,
+ 14, 14, 14, 14, 14, 17, 17, 17,
+ 17, 17, 17, 20, 20, 20, 20, 22,
+ 22, 22, 22, 22, 22, 22, 22, 22,
+ 22, 22, 22, 22, 22, 22, 22, 22,
+ 22, 22, 22, 22, 22, 22, 22, 22,
+ 22, 22, 22, 22, 22, 22, 22, 22,
+ 22, 22, 22, 22, 22, 22, 22, 22,
+ 22, 22, 22, 22, 22, 22, 22, 22,
+ 22, 22, 22, 22, 22, 22, 22, 22,
+ 22, 22, 22, 22, 22, 22, 22, 22,
+ 22,
+ },
+ { /* Fourth byte table 10. */
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 3, 14, 19,
+ 22, 27, 32, 37, 37, 42, 42, 47,
+ 52, 59, 59, 59, 59, 59, 59, 59,
+ 59, 59, 59, 59, 59, 59, 59, 59,
+ 59, 59, 59, 59, 59, 59, 59, 59,
+ 59, 59, 59, 64, 69, 74, 79, 84,
+ 89, 96, 96, 96, 96, 96, 96, 96,
+ 96, 96, 96, 96, 96, 96, 96, 96,
+ 96, 96, 96, 96, 96, 96, 96, 96,
+ 96, 96, 96, 96, 96, 96, 96, 96,
+ 96, 96, 96, 96, 96, 96, 96, 96,
+ 96, 96, 96, 96, 96, 96, 96, 96,
+ 96, 96, 96, 96, 96, 96, 96, 96,
+ 96, 96, 96, 96, 96, 96, 96, 96,
+ 96, 96, 96, 96, 96, 96, 96, 96,
+ 96, 96, 96, 96, 96, 96, 96, 96,
+ 96,
+ },
+ { /* Fourth byte table 11. */
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 5, 10, 15, 20, 25,
+ 25, 27, 29, 31, 41, 51, 53, 55,
+ 55, 55, 55, 55, 55, 55, 55, 55,
+ 55, 55, 55, 55, 55, 55, 55, 55,
+ 55, 55, 55, 55, 55, 55, 55, 55,
+ 55, 57, 59, 61, 61, 63, 65, 65,
+ 65, 65, 67, 67, 67, 67, 67, 67,
+ 67, 67, 67, 67, 67, 67, 67, 67,
+ 67, 67, 67, 67, 67, 67, 67, 67,
+ 67, 67, 67, 67, 67, 67, 67, 67,
+ 67, 67, 67, 67, 67, 67, 67, 67,
+ 67, 67, 67, 67, 67, 67, 67, 67,
+ 67, 67, 67, 67, 67, 67, 67, 67,
+ 67, 67, 67, 67, 67, 67, 67, 67,
+ 67, 67, 67, 67, 67, 67, 67, 67,
+ 67,
+ },
+ { /* Fourth byte table 12. */
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 5, 10, 10, 15, 15, 15, 15,
+ 20, 20, 20, 20, 20, 25, 30, 35,
+ 35, 35, 35, 35, 35, 35, 35, 35,
+ 35, 35, 40, 40, 40, 40, 40, 40,
+ 40, 40, 40, 40, 40, 40, 40, 40,
+ 40, 40, 40, 40, 40, 40, 40, 40,
+ 40, 40, 40, 40, 40, 40, 40, 40,
+ 40, 40, 45, 45, 45, 45, 45, 45,
+ 45, 45, 45, 45, 45, 45, 45, 45,
+ 45, 45, 45, 45, 45, 45, 45, 45,
+ 45, 45, 45, 45, 45, 45, 45, 45,
+ 45, 45, 45, 45, 45, 45, 45, 45,
+ 45, 45, 45, 45, 45, 45, 45, 45,
+ 45, 45, 45, 45, 45, 45, 45, 45,
+ 45, 45, 45, 45, 45, 45, 45, 45,
+ 45, 45, 45, 45, 45, 45, 45, 45,
+ 45,
+ },
+ { /* Fourth byte table 13. */
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 5, 10, 10, 15, 15, 15, 15,
+ 20, 20, 20, 20, 20, 25, 30, 35,
+ 35, 35, 35, 35, 35, 35, 35, 35,
+ 35, 35, 35, 35, 35, 35, 35, 35,
+ 35, 35, 35, 35, 35, 35, 35, 40,
+ 45, 45, 45, 45, 45, 45, 45, 45,
+ 45, 45, 45, 45, 45, 45, 45, 45,
+ 45, 45, 45, 45, 45, 45, 45, 45,
+ 45, 45, 45, 45, 45, 45, 45, 45,
+ 45, 45, 45, 45, 45, 45, 45, 45,
+ 45, 45, 45, 45, 45, 45, 45, 45,
+ 45, 45, 45, 45, 45, 45, 45, 45,
+ 45, 45, 45, 45, 45, 45, 45, 45,
+ 45, 45, 45, 45, 45, 45, 45, 45,
+ 45,
+ },
+ { /* Fourth byte table 14. */
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 5, 10, 10, 10, 10, 10,
+ 10, 10, 10, 10, 10, 10, 10, 10,
+ 10, 15, 20, 25, 30, 30, 30, 35,
+ 40, 40, 40, 45, 50, 55, 60, 65,
+ 70, 70, 70, 75, 80, 85, 90, 95,
+ 100, 100, 100, 105, 110, 115, 120, 125,
+ 130, 135, 140, 145, 150, 155, 160, 160,
+ 160, 165, 170, 170, 170, 170, 170, 170,
+ 170, 170, 170, 170, 170, 170, 170, 170,
+ 170, 170, 170, 170, 170, 170, 170, 170,
+ 170, 170, 170, 170, 170, 170, 170, 170,
+ 170, 170, 170, 170, 170, 170, 170, 170,
+ 170, 170, 170, 170, 170, 170, 170, 170,
+ 170, 170, 170, 170, 170, 170, 170, 170,
+ 170, 170, 170, 170, 170, 170, 170, 170,
+ 170, 170, 170, 170, 170, 170, 170, 170,
+ 170,
+ },
+ { /* Fourth byte table 15. */
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 4, 4, 4, 4, 4, 4, 4, 4,
+ 4, 4, 4, 4, 4, 4, 4, 4,
+ 4, 4, 4, 4, 4, 4, 4, 4,
+ 4, 4, 4, 4, 4, 4, 4, 4,
+ 4, 4, 4, 4, 4, 4, 4, 4,
+ 4, 4, 4, 4, 4, 4, 4, 4,
+ 4, 4, 4, 4, 4, 4, 4, 4,
+ 4, 4, 4, 4, 4, 4, 4, 4,
+ 4, 4, 4, 4, 4, 4, 4, 4,
+ 4, 4, 4, 4, 4, 4, 4, 4,
+ 4, 4, 4, 4, 4, 4, 4, 4,
+ 4, 4, 4, 4, 4, 4, 4, 4,
+ 4, 4, 4, 4, 4, 4, 4, 4,
+ 4, 4, 4, 4, 4, 4, 4, 4,
+ 4, 4, 4, 4, 4, 4, 4, 4,
+ 4,
+ },
+ { /* Fourth byte table 16. */
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 5, 10, 15, 20, 25,
+ 25, 25, 25, 25, 25, 25, 25, 25,
+ 25, 25, 25, 25, 25, 25, 25, 25,
+ 25, 25, 25, 25, 25, 25, 25, 25,
+ 25, 25, 25, 25, 25, 25, 25, 25,
+ 25, 25, 25, 25, 25, 25, 25, 25,
+ 25, 25, 25, 25, 25, 25, 25, 25,
+ 25, 25, 25, 25, 25, 25, 25, 25,
+ 25, 25, 25, 25, 25, 25, 25, 25,
+ 25, 25, 25, 25, 25, 25, 25, 25,
+ 25, 25, 25, 25, 25, 25, 25, 25,
+ 25, 25, 25, 25, 25, 25, 25, 25,
+ 25,
+ },
+ { /* Fourth byte table 17. */
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 4, 8,
+ 12, 16, 16, 16, 16, 16, 16, 16,
+ 16, 16, 16, 16, 16, 16, 16, 16,
+ 16, 16, 16, 16, 16, 16, 16, 16,
+ 16, 16, 16, 16, 16, 16, 16, 16,
+ 16, 16, 16, 16, 16, 16, 16, 16,
+ 16, 16, 16, 16, 16, 16, 16, 16,
+ 16, 16, 16, 16, 16, 16, 16, 16,
+ 16, 16, 16, 16, 16, 16, 16, 16,
+ 16, 16, 16, 16, 16, 16, 16, 16,
+ 16,
+ },
+ { /* Fourth byte table 18. */
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 5, 5, 10, 10, 10, 10, 10,
+ 10, 10, 10, 10, 10, 10, 10, 10,
+ 10, 10, 10, 10, 15, 15, 15, 15,
+ 15, 15, 15, 15, 15, 15, 15, 15,
+ 15, 15, 15, 15, 15, 15, 15, 15,
+ 15, 15, 15, 15, 15, 15, 15, 15,
+ 15, 15, 15, 15, 15, 15, 15, 15,
+ 15, 15, 15, 15, 15, 15, 15, 15,
+ 15, 15, 15, 15, 15, 15, 15, 15,
+ 15, 15, 15, 15, 15, 15, 15, 15,
+ 15, 15, 15, 15, 15, 15, 15, 15,
+ 15, 15, 15, 15, 15, 15, 15, 15,
+ 15, 15, 15, 15, 15, 15, 15, 15,
+ 15, 15, 15, 15, 15, 15, 15, 15,
+ 15, 15, 15, 15, 15, 15, 15, 15,
+ 15, 15, 15, 15, 15, 15, 15, 15,
+ 15,
+ },
+ { /* Fourth byte table 19. */
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 7, 7, 7, 7, 7, 7,
+ 7, 7, 14, 14, 14, 21, 21, 21,
+ 21, 21, 21, 21, 21, 21, 21, 21,
+ 21, 21, 21, 21, 21, 21, 21, 21,
+ 21, 21, 21, 21, 21, 21, 21, 21,
+ 21, 21, 21, 21, 21, 21, 21, 21,
+ 21, 21, 21, 21, 21, 21, 21, 21,
+ 21, 21, 21, 21, 21, 21, 21, 21,
+ 21, 21, 21, 21, 21, 21, 21, 21,
+ 21, 21, 21, 21, 21, 21, 21, 21,
+ 21, 21, 21, 21, 21, 21, 21, 21,
+ 21,
+ },
+ { /* Fourth byte table 20. */
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 7, 14, 21, 28, 35, 42, 49,
+ 56, 56, 56, 56, 56, 56, 56, 56,
+ 56, 56, 56, 56, 56, 56, 56, 56,
+ 56, 56, 56, 56, 56, 56, 56, 56,
+ 56, 56, 56, 56, 56, 56, 56, 56,
+ 56, 56, 56, 56, 56, 56, 56, 56,
+ 56, 56, 56, 56, 56, 56, 56, 56,
+ 56, 56, 56, 56, 56, 56, 56, 56,
+ 56, 56, 56, 56, 56, 56, 56, 56,
+ 56, 56, 56, 56, 56, 56, 56, 56,
+ 56, 56, 56, 56, 56, 56, 56, 56,
+ 56, 56, 56, 56, 56, 56, 56, 56,
+ 56, 56, 56, 56, 56, 56, 56, 56,
+ 56,
+ },
+ { /* Fourth byte table 21. */
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 7, 14, 14, 14,
+ 14, 14, 14, 14, 14, 14, 14, 14,
+ 14, 14, 14, 14, 14, 21, 28, 28,
+ 35, 35, 35, 35, 35, 35, 35, 35,
+ 35, 35, 35, 35, 35, 35, 35, 35,
+ 35, 35, 35, 35, 35, 35, 35, 35,
+ 35, 35, 35, 35, 35, 35, 35, 35,
+ 35, 35, 35, 35, 35, 35, 35, 35,
+ 35, 35, 35, 35, 35, 35, 35, 35,
+ 35, 35, 35, 35, 35, 35, 35, 35,
+ 35, 35, 35, 35, 35, 35, 35, 35,
+ 35, 35, 35, 35, 35, 35, 35, 35,
+ 35, 35, 35, 35, 35, 35, 35, 35,
+ 35, 35, 35, 35, 35, 35, 35, 35,
+ 35, 35, 35, 35, 35, 35, 35, 35,
+ 35,
+ },
+ { /* Fourth byte table 22. */
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 7, 7, 7, 14,
+ 14, 14, 14, 14, 14, 14, 14, 14,
+ 14, 14, 14, 14, 14, 14, 14, 14,
+ 14, 14, 14, 14, 14, 14, 14, 14,
+ 14, 14, 14, 14, 14, 14, 14, 14,
+ 14, 14, 14, 14, 14, 14, 14, 14,
+ 14, 14, 14, 14, 14, 14, 14, 14,
+ 14, 14, 14, 14, 14, 14, 14, 14,
+ 14, 14, 14, 14, 14, 14, 14, 14,
+ 14, 14, 14, 14, 14, 14, 14, 14,
+ 14,
+ },
+ { /* Fourth byte table 23. */
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 7, 14, 21, 21, 21, 28,
+ 28, 28, 28, 28, 28, 28, 28, 28,
+ 28, 28, 28, 28, 28, 28, 28, 28,
+ 28, 28, 28, 28, 28, 28, 28, 28,
+ 28, 28, 28, 28, 28, 28, 28, 28,
+ 28, 28, 28, 28, 28, 28, 28, 28,
+ 28, 28, 28, 28, 28, 28, 28, 28,
+ 28, 28, 28, 28, 28, 28, 28, 28,
+ 28, 28, 28, 28, 28, 28, 28, 28,
+ 28, 28, 28, 28, 28, 28, 28, 28,
+ 28, 28, 28, 28, 28, 28, 28, 28,
+ 28, 28, 28, 28, 28, 28, 28, 28,
+ 28, 28, 28, 28, 28, 28, 28, 28,
+ 28,
+ },
+ { /* Fourth byte table 24. */
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 7, 7, 7, 14, 21, 21, 21,
+ 21, 21, 21, 21, 21, 21, 21, 21,
+ 21, 21, 21, 21, 21, 28, 35, 35,
+ 35, 35, 35, 35, 35, 35, 35, 35,
+ 35, 35, 35, 35, 35, 35, 35, 35,
+ 35, 35, 35, 35, 35, 35, 35, 35,
+ 35, 35, 35, 35, 35, 35, 35, 35,
+ 35, 35, 35, 35, 35, 35, 35, 35,
+ 35, 35, 35, 35, 35, 35, 35, 35,
+ 35, 35, 35, 35, 35, 35, 35, 35,
+ 35, 35, 35, 35, 35, 35, 35, 35,
+ 35, 35, 35, 35, 35, 35, 35, 35,
+ 35, 35, 35, 35, 35, 35, 35, 35,
+ 35, 35, 35, 35, 35, 35, 35, 35,
+ 35, 35, 35, 35, 35, 35, 35, 35,
+ 35,
+ },
+ { /* Fourth byte table 25. */
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 7, 7, 7,
+ 7, 7, 7, 7, 7, 7, 7, 7,
+ 7, 7, 7, 7, 7, 7, 7, 7,
+ 7, 7, 7, 7, 7, 7, 7, 7,
+ 7, 7, 7, 7, 7, 7, 7, 7,
+ 7, 7, 7, 7, 7, 7, 7, 7,
+ 7, 7, 7, 7, 7, 7, 7, 7,
+ 7, 7, 7, 7, 7, 7, 7, 7,
+ 7, 7, 7, 7, 7, 7, 7, 7,
+ 7, 7, 7, 7, 7, 7, 7, 7,
+ 7, 7, 7, 7, 7, 7, 7, 7,
+ 7, 7, 7, 7, 7, 7, 7, 7,
+ 7, 7, 7, 7, 7, 7, 7, 7,
+ 7, 7, 7, 7, 7, 7, 7, 7,
+ 7,
+ },
+ { /* Fourth byte table 26. */
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 7, 14, 21, 21, 21,
+ 21, 21, 21, 21, 21, 21, 21, 21,
+ 21, 21, 21, 21, 21, 21, 21, 21,
+ 21, 21, 21, 21, 21, 21, 21, 21,
+ 21, 21, 21, 21, 21, 21, 21, 21,
+ 21, 21, 21, 21, 21, 21, 21, 21,
+ 21, 21, 21, 21, 21, 21, 21, 21,
+ 21, 21, 21, 21, 21, 21, 21, 21,
+ 21, 21, 21, 21, 21, 21, 21, 21,
+ 21, 21, 21, 21, 21, 21, 21, 21,
+ 21, 21, 21, 21, 21, 21, 21, 21,
+ 21, 21, 21, 21, 21, 21, 21, 21,
+ 21, 21, 21, 21, 21, 21, 21, 21,
+ 21, 21, 21, 21, 21, 21, 21, 21,
+ 21, 21, 21, 21, 21, 21, 21, 21,
+ 21,
+ },
+ { /* Fourth byte table 27. */
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 7, 7, 7, 7, 7, 7, 7,
+ 7, 7, 7, 7, 7, 7, 7, 7,
+ 7, 7, 7, 7, 7, 7, 7, 7,
+ 7, 7, 7, 7, 7, 7, 7, 7,
+ 7, 7, 7, 7, 7, 7, 7, 7,
+ 7, 7, 7, 7, 7, 7, 7, 7,
+ 7, 7, 7, 7, 7, 7, 7, 7,
+ 7, 7, 7, 7, 7, 7, 7, 7,
+ 7, 7, 7, 7, 7, 7, 7, 7,
+ 7, 7, 7, 7, 7, 7, 7, 7,
+ 7, 7, 7, 7, 7, 7, 7, 7,
+ 7, 7, 7, 7, 7, 7, 7, 7,
+ 7, 7, 7, 7, 7, 7, 7, 7,
+ 7, 7, 7, 7, 7, 7, 7, 7,
+ 7, 7, 7, 7, 7, 7, 7, 7,
+ 7,
+ },
+ { /* Fourth byte table 28. */
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 7, 7, 7, 7, 7, 7, 7,
+ 14, 21, 21, 28, 38, 38, 38, 38,
+ 38, 38, 38, 38, 38, 38, 38, 38,
+ 38, 38, 38, 38, 38, 38, 38, 38,
+ 38, 38, 38, 38, 38, 38, 38, 38,
+ 38, 38, 38, 38, 38, 38, 38, 38,
+ 38, 38, 38, 38, 38, 38, 38, 38,
+ 38, 38, 38, 38, 38, 38, 38, 38,
+ 38, 38, 38, 38, 38, 38, 38, 38,
+ 38, 38, 38, 38, 38, 38, 38, 38,
+ 38, 38, 38, 38, 38, 38, 38, 38,
+ 38, 38, 38, 38, 38, 38, 38, 38,
+ 38, 38, 38, 38, 38, 38, 38, 38,
+ 38, 38, 38, 38, 38, 38, 38, 38,
+ 38, 38, 38, 38, 38, 38, 38, 38,
+ 38, 38, 38, 38, 38, 38, 38, 38,
+ 38,
+ },
+ { /* Fourth byte table 29. */
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 7, 14, 21, 21, 21,
+ 21, 21, 21, 21, 21, 21, 21, 21,
+ 21, 21, 21, 21, 21, 21, 21, 21,
+ 21, 21, 21, 21, 21, 21, 21, 21,
+ 21, 21, 21, 21, 21, 21, 21, 21,
+ 21, 21, 21, 21, 21, 21, 21, 21,
+ 21, 21, 21, 21, 21, 21, 21, 21,
+ 21, 21, 21, 21, 21, 21, 21, 21,
+ 21, 21, 21, 21, 21, 21, 21, 21,
+ 21, 21, 21, 21, 21, 21, 21, 21,
+ 21, 21, 21, 21, 21, 21, 21, 21,
+ 21, 21, 21, 21, 21, 21, 21, 21,
+ 21, 21, 21, 21, 21, 21, 21, 21,
+ 21, 21, 21, 21, 21, 21, 21, 21,
+ 21, 21, 21, 21, 21, 21, 21, 21,
+ 21,
+ },
+ { /* Fourth byte table 30. */
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 7, 7, 14, 24, 31,
+ 31, 31, 31, 31, 31, 31, 31, 31,
+ 31, 31, 31, 31, 31, 31, 31, 31,
+ 31, 31, 31, 31, 31, 31, 31, 31,
+ 31, 31, 31, 31, 31, 31, 31, 31,
+ 31, 31, 31, 31, 31, 31, 31, 31,
+ 31, 31, 31, 31, 31, 31, 31, 31,
+ 31, 31, 31, 31, 31, 31, 31, 31,
+ 31, 31, 31, 31, 31, 31, 31, 31,
+ 31, 31, 31, 31, 31, 31, 31, 31,
+ 31, 31, 31, 31, 31, 31, 31, 31,
+ 31, 31, 31, 31, 31, 31, 31, 31,
+ 31, 31, 31, 31, 31, 31, 31, 31,
+ 31,
+ },
+ { /* Fourth byte table 31. */
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 6, 6, 6, 6,
+ 6, 6, 6, 6, 6, 6, 6, 6,
+ 6, 6, 6, 6, 6, 6, 6, 6,
+ 6, 6, 6, 6, 6, 6, 6, 6,
+ 6, 6, 6, 6, 6, 6, 6, 6,
+ 6, 6, 6, 6, 6, 6, 6, 6,
+ 6, 6, 6, 6, 6, 6, 6, 6,
+ 6, 6, 6, 6, 6, 6, 6, 6,
+ 6, 6, 6, 6, 6, 6, 6, 6,
+ 6, 6, 6, 6, 6, 6, 6, 6,
+ 6,
+ },
+ { /* Fourth byte table 32. */
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 6, 6, 6, 6,
+ 6, 6, 6, 6, 6, 6, 6, 6,
+ 6, 6, 6, 6, 6, 6, 6, 6,
+ 6, 6, 6, 6, 6, 6, 6, 6,
+ 6, 6, 6, 6, 6, 6, 6, 6,
+ 6, 6, 6, 6, 6, 6, 6, 6,
+ 6, 6, 6, 6, 6, 6, 6, 6,
+ 6, 6, 6, 6, 6, 6, 6, 6,
+ 6, 6, 6, 6, 6, 6, 6, 6,
+ 6, 6, 6, 6, 6, 6, 6, 6,
+ 6,
+ },
+ { /* Fourth byte table 33. */
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 6, 12, 12,
+ 12, 12, 12, 12, 12, 12, 12, 12,
+ 12, 12, 12, 12, 12, 12, 12, 12,
+ 12, 12, 12, 12, 12, 12, 12, 12,
+ 12, 12, 12, 12, 12, 12, 12, 12,
+ 12, 12, 12, 12, 12, 12, 12, 12,
+ 12, 12, 12, 12, 12, 12, 12, 12,
+ 12, 12, 12, 12, 12, 12, 12, 12,
+ 12, 12, 12, 12, 12, 12, 12, 12,
+ 12, 12, 12, 12, 12, 12, 12, 12,
+ 12, 12, 12, 12, 12, 12, 12, 12,
+ 12, 12, 12, 12, 12, 12, 12, 12,
+ 12, 12, 12, 12, 12, 12, 12, 12,
+ 12,
+ },
+ { /* Fourth byte table 34. */
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 3, 3, 3,
+ 3, 3, 3, 3, 3, 3, 3, 3,
+ 3, 3, 3, 3, 3, 3, 3, 3,
+ 3, 3, 3, 3, 3, 3, 3, 3,
+ 3, 3, 3, 3, 3, 3, 3, 3,
+ 3, 3, 3, 3, 3, 3, 3, 3,
+ 3, 3, 3, 3, 3, 3, 3, 3,
+ 3, 3, 3, 3, 3, 3, 3, 3,
+ 3, 3, 3, 3, 3, 3, 3, 3,
+ 3, 3, 3, 3, 3, 3, 3, 3,
+ 3, 3, 3, 3, 3, 3, 3, 3,
+ 3, 3, 3, 3, 3, 3, 3, 3,
+ 3, 3, 3, 3, 3, 3, 3, 3,
+ 3, 3, 3, 3, 3, 3, 3, 3,
+ 3, 3, 3, 3, 3, 3, 3, 3,
+ 3,
+ },
+ { /* Fourth byte table 35. */
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 7, 7, 7, 7,
+ 7, 7, 7, 7, 7, 7, 14, 14,
+ 14, 14, 14, 21, 21, 21, 21, 21,
+ 28, 28, 28, 28, 28, 35, 35, 35,
+ 35, 35, 35, 35, 35, 35, 35, 35,
+ 35, 35, 42, 42, 42, 42, 42, 42,
+ 42, 42, 42, 42, 49, 49, 56, 63,
+ 72, 79, 88, 88, 88, 88, 88, 88,
+ 88, 88, 88, 88, 88, 88, 88, 88,
+ 88, 88, 88, 88, 88, 88, 88, 88,
+ 88, 88, 88, 88, 88, 88, 88, 88,
+ 88, 88, 88, 88, 88, 88, 88, 88,
+ 88, 88, 88, 88, 88, 88, 88, 88,
+ 88, 88, 88, 88, 88, 88, 88, 88,
+ 88, 88, 88, 88, 88, 88, 88, 88,
+ 88, 88, 88, 88, 88, 88, 88, 88,
+ 88,
+ },
+ { /* Fourth byte table 36. */
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 7, 7, 7, 7, 7, 7,
+ 7, 7, 7, 7, 7, 7, 7, 7,
+ 7, 7, 7, 7, 14, 14, 14, 14,
+ 14, 14, 14, 14, 14, 14, 21, 21,
+ 21, 21, 21, 28, 28, 28, 28, 28,
+ 35, 35, 35, 35, 35, 42, 42, 42,
+ 42, 42, 42, 42, 42, 42, 42, 42,
+ 42, 42, 49, 49, 49, 49, 49, 49,
+ 49, 49, 49, 49, 49, 49, 49, 49,
+ 49, 49, 49, 49, 49, 49, 49, 49,
+ 49, 49, 49, 49, 49, 49, 49, 49,
+ 49, 49, 49, 49, 49, 49, 49, 49,
+ 49, 49, 49, 49, 49, 49, 49, 49,
+ 49, 49, 49, 49, 49, 49, 49, 49,
+ 49, 49, 49, 49, 49, 49, 49, 49,
+ 49, 49, 49, 49, 49, 49, 49, 49,
+ 49,
+ },
+ { /* Fourth byte table 37. */
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 7,
+ 7, 7, 7, 7, 7, 7, 7, 7,
+ 7, 7, 7, 7, 7, 7, 7, 7,
+ 7, 7, 7, 7, 7, 7, 7, 7,
+ 7, 7, 7, 7, 7, 7, 7, 7,
+ 7, 7, 7, 7, 7, 7, 7, 7,
+ 7, 7, 7, 7, 7, 7, 7, 7,
+ 7, 7, 7, 7, 7, 7, 7, 7,
+ 7, 7, 7, 7, 7, 7, 7, 7,
+ 7, 7, 7, 7, 7, 7, 7, 7,
+ 7, 7, 7, 7, 7, 7, 7, 7,
+ 7, 7, 7, 7, 7, 7, 7, 7,
+ 7,
+ },
+ { /* Fourth byte table 38. */
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 3, 3, 3,
+ 3, 3, 3, 3, 3, 3, 3, 3,
+ 3, 3, 3, 3, 3, 3, 3, 3,
+ 3, 3, 3, 3, 3, 3, 3, 3,
+ 3, 3, 3, 3, 3, 3, 3, 3,
+ 3, 3, 3, 3, 3, 3, 3, 3,
+ 3, 3, 3, 3, 3, 3, 3, 3,
+ 3, 3, 3, 3, 3, 3, 3, 3,
+ 3, 3, 3, 3, 3, 3, 3, 3,
+ 3,
+ },
+ { /* Fourth byte table 39. */
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 7,
+ 7, 14, 14, 21, 21, 28, 28, 35,
+ 35, 35, 35, 42, 42, 42, 42, 42,
+ 42, 42, 42, 42, 42, 42, 42, 42,
+ 42, 42, 42, 42, 42, 42, 42, 42,
+ 42, 42, 42, 42, 42, 42, 42, 42,
+ 42, 42, 42, 42, 42, 42, 42, 42,
+ 42, 42, 42, 42, 49, 49, 56, 56,
+ 56, 56, 56, 56, 56, 56, 56, 56,
+ 56, 56, 56, 56, 56, 56, 56, 56,
+ 56, 56, 56, 56, 56, 56, 56, 56,
+ 56, 56, 56, 56, 56, 56, 56, 56,
+ 56, 56, 56, 56, 56, 56, 56, 56,
+ 56, 56, 56, 56, 56, 56, 56, 56,
+ 56, 56, 56, 56, 56, 56, 56, 56,
+ 56, 56, 56, 56, 56, 56, 56, 56,
+ 56,
+ },
+ { /* Fourth byte table 40. */
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 7, 14, 14, 21, 21, 21, 21,
+ 21, 21, 21, 21, 21, 21, 21, 21,
+ 21, 21, 21, 21, 21, 21, 21, 21,
+ 21, 21, 21, 21, 21, 21, 21, 21,
+ 21, 21, 21, 21, 21, 21, 21, 21,
+ 21, 21, 21, 21, 21, 21, 21, 21,
+ 21, 21, 21, 21, 21, 21, 21, 21,
+ 21, 21, 21, 21, 21, 21, 21, 21,
+ 21, 21, 21, 21, 21, 21, 21, 21,
+ 21, 21, 21, 21, 21, 21, 21, 21,
+ 21, 21, 21, 21, 21, 21, 21, 21,
+ 21, 21, 21, 21, 21, 21, 21, 21,
+ 21, 21, 21, 21, 21, 21, 21, 21,
+ 21, 21, 21, 21, 21, 21, 21, 21,
+ 21, 21, 21, 21, 21, 21, 21, 21,
+ 21, 21, 21, 21, 21, 21, 21, 21,
+ 21,
+ },
+ { /* Fourth byte table 41. */
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 1, 3, 4,
+ 4, 5, 6, 8, 9, 10, 11, 12,
+ 13, 14, 15, 16, 16, 17, 19, 20,
+ 21, 21, 21, 21, 21, 21, 21, 21,
+ 21, 21, 21, 21, 21, 21, 21, 21,
+ 21, 21, 21, 21, 21, 21, 21, 21,
+ 21, 21, 21, 21, 21, 21, 21, 21,
+ 21, 21, 21, 21, 21, 21, 21, 21,
+ 21, 21, 21, 21, 21, 21, 21, 21,
+ 21, 21, 21, 21, 21, 21, 21, 21,
+ 21, 21, 21, 21, 21, 21, 21, 21,
+ 21,
+ },
+ { /* Fourth byte table 42. */
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 1, 2, 3, 4, 6, 8, 11,
+ 12, 13, 14, 16, 18, 20, 21, 21,
+ 22, 23, 25, 26, 28, 31, 34, 35,
+ 36, 37, 40, 42, 43, 46, 48, 50,
+ 52, 54, 56, 57, 58, 59, 60, 62,
+ 64, 66, 68, 70, 70, 70, 70, 70,
+ 70, 70, 70, 70, 70, 70, 70, 70,
+ 70, 72, 72, 72, 72, 72, 72, 72,
+ 72, 72, 72, 72, 72, 72, 72, 72,
+ 72, 72, 72, 72, 72, 72, 72, 72,
+ 72, 72, 72, 72, 72, 72, 72, 72,
+ 72, 72, 72, 72, 72, 72, 72, 72,
+ 72, 72, 72, 72, 72, 72, 72, 72,
+ 72, 72, 72, 72, 72, 72, 72, 72,
+ 72, 72, 72, 72, 72, 72, 72, 72,
+ 72, 72, 72, 72, 72, 72, 72, 72,
+ 72,
+ },
+ { /* Fourth byte table 43. */
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 2, 3, 5, 7,
+ 9, 10, 12, 14, 16, 18, 20, 22,
+ 25, 27, 29, 32, 34, 36, 38, 40,
+ 42, 44, 46, 48, 50, 52, 54, 56,
+ 58, 61, 63, 65, 66, 68, 70, 72,
+ 74, 74, 74, 74, 74, 74, 74, 74,
+ 74, 74, 74, 74, 74, 74, 74, 74,
+ 74, 74, 74, 74, 74, 74, 74, 74,
+ 74, 74, 74, 74, 74, 74, 74, 74,
+ 74, 74, 74, 74, 74, 74, 74, 74,
+ 74, 74, 74, 74, 74, 74, 74, 74,
+ 74, 74, 74, 74, 74, 74, 74, 74,
+ 74, 74, 74, 74, 74, 74, 74, 74,
+ 74,
+ },
+ { /* Fourth byte table 44. */
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 6, 12, 13, 14, 15, 16, 17,
+ 18, 19, 20, 21, 21, 21, 21, 21,
+ 21, 21, 24, 24, 24, 24, 24, 24,
+ 27, 27, 27, 27, 27, 27, 27, 27,
+ 27, 27, 27, 27, 27, 28, 30, 33,
+ 33, 33, 33, 33, 33, 33, 33, 33,
+ 34, 34, 34, 34, 40, 49, 49, 55,
+ 64, 64, 64, 64, 64, 66, 66, 69,
+ 69, 69, 69, 69, 69, 69, 69, 69,
+ 69, 69, 69, 69, 69, 69, 69, 69,
+ 69, 69, 69, 69, 69, 69, 69, 69,
+ 69, 69, 69, 69, 69, 69, 69, 69,
+ 69, 69, 69, 69, 69, 69, 69, 69,
+ 69, 69, 69, 69, 69, 69, 69, 69,
+ 69, 69, 69, 69, 69, 69, 69, 69,
+ 69, 69, 69, 69, 69, 69, 69, 69,
+ 69,
+ },
+ { /* Fourth byte table 45. */
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 2, 4, 6, 6, 6, 6, 6, 6,
+ 6, 6, 6, 6, 6, 6, 6, 6,
+ 18, 18, 18, 18, 18, 18, 18, 18,
+ 19, 19, 19, 19, 19, 19, 19, 19,
+ 19, 19, 19, 19, 19, 19, 19, 19,
+ 19, 20, 21, 21, 21, 22, 23, 24,
+ 25, 26, 27, 28, 31, 32, 33, 34,
+ 35, 35, 35, 35, 35, 35, 35, 35,
+ 35, 35, 35, 35, 35, 35, 35, 35,
+ 35, 35, 35, 35, 35, 35, 35, 35,
+ 35, 35, 35, 35, 35, 35, 35, 35,
+ 35, 35, 35, 35, 35, 35, 35, 35,
+ 35, 35, 35, 35, 35, 35, 35, 35,
+ 35, 35, 35, 35, 35, 35, 35, 35,
+ 35, 35, 35, 35, 35, 35, 35, 35,
+ 35,
+ },
+ { /* Fourth byte table 46. */
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 1, 2, 3, 4, 5, 6, 7,
+ 8, 9, 10, 11, 14, 15, 16, 17,
+ 17, 18, 19, 20, 21, 23, 23, 23,
+ 23, 23, 23, 23, 23, 23, 23, 23,
+ 23, 23, 23, 23, 23, 23, 23, 23,
+ 23, 25, 25, 25, 25, 25, 25, 25,
+ 25, 25, 25, 25, 25, 25, 25, 25,
+ 25, 25, 25, 25, 25, 25, 25, 25,
+ 25, 25, 25, 25, 25, 25, 25, 25,
+ 25, 25, 25, 25, 25, 25, 25, 25,
+ 25, 25, 25, 25, 25, 25, 25, 25,
+ 25, 25, 25, 25, 25, 25, 25, 25,
+ 25, 25, 25, 25, 25, 25, 25, 25,
+ 25, 25, 25, 25, 25, 25, 25, 25,
+ 25, 25, 25, 25, 25, 25, 25, 25,
+ 25, 25, 25, 25, 25, 25, 25, 25,
+ 25,
+ },
+ { /* Fourth byte table 47. */
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 3, 6, 7, 10, 10, 13, 16,
+ 18, 18, 21, 22, 23, 24, 25, 26,
+ 28, 29, 30, 31, 32, 32, 33, 35,
+ 35, 35, 36, 37, 38, 39, 40, 40,
+ 40, 42, 45, 47, 47, 48, 48, 51,
+ 51, 52, 52, 54, 58, 59, 60, 60,
+ 61, 62, 63, 63, 64, 65, 67, 69,
+ 71, 73, 74, 74, 77, 79, 81, 83,
+ 85, 85, 85, 85, 85, 85, 85, 85,
+ 85, 85, 85, 85, 85, 85, 85, 85,
+ 85, 85, 85, 85, 85, 85, 85, 85,
+ 85, 85, 85, 85, 85, 85, 85, 85,
+ 85, 85, 85, 85, 85, 85, 85, 85,
+ 85, 85, 85, 85, 85, 85, 85, 85,
+ 85, 85, 85, 85, 85, 85, 85, 85,
+ 85, 85, 85, 85, 85, 85, 85, 85,
+ 85,
+ },
+ { /* Fourth byte table 48. */
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 3, 3, 3, 3, 3, 4, 5,
+ 6, 7, 8, 8, 8, 8, 8, 8,
+ 8, 8, 8, 8, 13, 18, 23, 28,
+ 33, 38, 43, 48, 53, 58, 63, 68,
+ 72, 73, 75, 78, 80, 81, 83, 86,
+ 90, 92, 93, 95, 98, 99, 100, 101,
+ 102, 103, 105, 108, 110, 111, 113, 116,
+ 120, 122, 123, 125, 128, 129, 130, 131,
+ 132, 132, 132, 132, 132, 132, 132, 132,
+ 132, 132, 132, 132, 132, 132, 132, 132,
+ 132, 132, 132, 132, 132, 132, 132, 132,
+ 132, 132, 132, 132, 132, 132, 132, 132,
+ 132, 132, 132, 132, 132, 132, 132, 132,
+ 132, 132, 132, 132, 132, 132, 132, 132,
+ 132, 132, 132, 132, 132, 132, 132, 132,
+ 132, 132, 132, 132, 132, 132, 132, 132,
+ 132,
+ },
+ { /* Fourth byte table 49. */
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 6, 12, 12, 12, 12,
+ 12, 12, 12, 12, 12, 12, 12, 12,
+ 12, 12, 12, 12, 12, 12, 12, 18,
+ 18, 18, 18, 18, 18, 18, 18, 18,
+ 18, 18, 18, 18, 18, 18, 18, 18,
+ 18, 18, 18, 18, 18, 18, 18, 18,
+ 18, 18, 18, 18, 18, 18, 18, 18,
+ 18, 18, 18, 18, 18, 18, 18, 18,
+ 18, 18, 18, 18, 18, 18, 18, 18,
+ 18, 18, 18, 18, 18, 18, 18, 18,
+ 18, 18, 18, 18, 18, 18, 18, 18,
+ 18, 18, 18, 18, 18, 18, 18, 18,
+ 18, 18, 18, 18, 18, 18, 18, 18,
+ 18,
+ },
+ { /* Fourth byte table 50. */
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 6, 12,
+ 18, 18, 18, 18, 18, 18, 18, 18,
+ 18, 18, 18, 18, 18, 18, 18, 18,
+ 18, 18, 18, 18, 18, 18, 18, 18,
+ 18, 18, 18, 18, 18, 18, 18, 18,
+ 18, 18, 18, 18, 18, 18, 18, 18,
+ 18, 18, 18, 18, 18, 18, 18, 18,
+ 18, 18, 18, 18, 18, 18, 18, 18,
+ 18, 18, 18, 18, 18, 18, 18, 18,
+ 18, 18, 18, 18, 18, 18, 18, 18,
+ 18, 18, 18, 18, 18, 18, 18, 18,
+ 18, 18, 18, 18, 18, 18, 18, 18,
+ 18, 18, 18, 18, 18, 18, 18, 18,
+ 18, 18, 18, 18, 18, 18, 18, 18,
+ 18, 18, 18, 18, 18, 18, 18, 18,
+ 18,
+ },
+ { /* Fourth byte table 51. */
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 6, 6, 6,
+ 6, 6, 12, 12, 12, 18, 18, 18,
+ 18, 18, 18, 18, 18, 18, 18, 18,
+ 18, 18, 18, 18, 18, 18, 18, 18,
+ 18, 18, 18, 18, 18, 24, 24, 30,
+ 30, 30, 30, 30, 30, 36, 45, 45,
+ 51, 60, 60, 60, 60, 60, 60, 60,
+ 60, 60, 60, 60, 60, 60, 60, 60,
+ 60, 60, 60, 60, 60, 60, 60, 60,
+ 60, 60, 60, 60, 60, 60, 60, 60,
+ 60, 60, 60, 60, 60, 60, 60, 60,
+ 60, 60, 60, 60, 60, 60, 60, 60,
+ 60, 60, 60, 60, 60, 60, 60, 60,
+ 60, 60, 60, 60, 60, 60, 60, 60,
+ 60, 60, 60, 60, 60, 60, 60, 60,
+ 60, 60, 60, 60, 60, 60, 60, 60,
+ 60,
+ },
+ { /* Fourth byte table 52. */
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 6, 6, 6, 12, 12, 12,
+ 18, 18, 24, 24, 24, 24, 24, 24,
+ 24, 24, 24, 24, 24, 24, 24, 24,
+ 24, 24, 24, 24, 24, 24, 24, 24,
+ 24, 28, 28, 34, 34, 34, 34, 34,
+ 34, 34, 34, 34, 34, 34, 40, 44,
+ 48, 54, 60, 60, 60, 66, 72, 72,
+ 72, 78, 84, 84, 84, 84, 84, 84,
+ 84, 84, 84, 84, 84, 84, 84, 84,
+ 84, 84, 84, 84, 84, 84, 84, 84,
+ 84, 84, 84, 84, 84, 84, 84, 84,
+ 84, 84, 84, 84, 84, 84, 84, 84,
+ 84, 84, 84, 84, 84, 84, 84, 84,
+ 84, 84, 84, 84, 84, 84, 84, 84,
+ 84, 84, 84, 84, 84, 84, 84, 84,
+ 84, 84, 84, 84, 84, 84, 84, 84,
+ 84,
+ },
+ { /* Fourth byte table 53. */
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 6, 12, 12, 12, 18, 24, 24,
+ 24, 30, 36, 36, 36, 36, 36, 36,
+ 36, 36, 36, 36, 36, 36, 36, 36,
+ 36, 36, 36, 36, 36, 36, 36, 36,
+ 36, 36, 36, 36, 36, 36, 36, 36,
+ 36, 36, 36, 36, 36, 42, 48, 54,
+ 60, 60, 60, 60, 60, 60, 60, 60,
+ 60, 60, 60, 60, 60, 60, 60, 60,
+ 60, 60, 60, 60, 60, 60, 60, 60,
+ 60, 60, 60, 60, 60, 60, 60, 60,
+ 60, 60, 60, 60, 60, 60, 60, 60,
+ 60, 60, 60, 60, 60, 60, 60, 60,
+ 60, 60, 60, 60, 60, 60, 60, 60,
+ 60, 60, 60, 60, 60, 60, 60, 60,
+ 60, 60, 60, 60, 60, 60, 60, 60,
+ 60, 60, 60, 60, 60, 60, 60, 60,
+ 60,
+ },
+ { /* Fourth byte table 54. */
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 6, 12, 18, 24, 24, 24, 24,
+ 24, 24, 24, 30, 36, 42, 48, 48,
+ 48, 48, 48, 48, 48, 48, 48, 48,
+ 48, 48, 48, 48, 48, 48, 48, 48,
+ 48, 48, 48, 48, 48, 48, 48, 48,
+ 48, 48, 48, 48, 48, 48, 48, 48,
+ 48, 48, 48, 48, 48, 48, 48, 48,
+ 48, 48, 48, 48, 48, 48, 48, 48,
+ 48, 48, 48, 48, 48, 48, 48, 48,
+ 48, 48, 48, 48, 48, 48, 48, 48,
+ 48, 48, 48, 48, 48, 48, 48, 48,
+ 48, 48, 48, 48, 48, 48, 48, 48,
+ 48,
+ },
+ { /* Fourth byte table 55. */
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 4, 8, 8, 8, 8, 8,
+ 8, 8, 8, 8, 8, 8, 8, 8,
+ 8, 8, 8, 8, 8, 8, 8, 8,
+ 8, 8, 8, 8, 8, 8, 8, 8,
+ 8, 8, 8, 8, 8, 8, 8, 8,
+ 8, 8, 8, 8, 8, 8, 8, 8,
+ 8, 8, 8, 8, 8, 8, 8, 8,
+ 8, 8, 8, 8, 8, 8, 8, 8,
+ 8, 8, 8, 8, 8, 8, 8, 8,
+ 8, 8, 8, 8, 8, 8, 8, 8,
+ 8, 8, 8, 8, 8, 8, 8, 8,
+ 8,
+ },
+ { /* Fourth byte table 56. */
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 1, 2, 3, 4, 5, 6, 7,
+ 8, 9, 11, 13, 15, 17, 19, 21,
+ 23, 25, 27, 29, 31, 34, 37, 40,
+ 43, 46, 49, 52, 55, 58, 62, 66,
+ 70, 70, 70, 70, 70, 70, 70, 70,
+ 70, 70, 70, 70, 70, 70, 70, 70,
+ 70, 70, 70, 70, 70, 70, 70, 70,
+ 70, 70, 70, 70, 70, 70, 70, 70,
+ 70, 70, 70, 70, 70, 70, 70, 70,
+ 70, 70, 70, 70, 70, 70, 70, 70,
+ 70, 70, 70, 70, 70, 70, 70, 70,
+ 70, 70, 70, 70, 70, 70, 70, 70,
+ 70,
+ },
+ { /* Fourth byte table 57. */
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 4, 8, 12, 16, 20, 24, 28,
+ 32, 34, 36, 38, 40, 42, 44, 46,
+ 48, 50, 53, 56, 59, 62, 65, 68,
+ 71, 74, 77, 80, 83, 86, 89, 92,
+ 95, 98, 101, 104, 107, 110, 113, 116,
+ 119, 122, 125, 128, 131, 134, 137, 140,
+ 143, 146, 149, 152, 155, 158, 161, 162,
+ 163, 164, 165, 166, 167, 168, 169, 170,
+ 171, 171, 171, 171, 171, 171, 171, 171,
+ 171, 171, 171, 171, 171, 171, 171, 171,
+ 171, 171, 171, 171, 171, 171, 171, 171,
+ 171, 171, 171, 171, 171, 171, 171, 171,
+ 171, 171, 171, 171, 171, 171, 171, 171,
+ 171, 171, 171, 171, 171, 171, 171, 171,
+ 171, 171, 171, 171, 171, 171, 171, 171,
+ 171, 171, 171, 171, 171, 171, 171, 171,
+ 171,
+ },
+ { /* Fourth byte table 58. */
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 1, 2, 3, 4, 5, 6, 7,
+ 8, 9, 10, 11, 12, 13, 14, 15,
+ 16, 17, 18, 19, 20, 21, 22, 23,
+ 24, 25, 26, 27, 28, 29, 30, 31,
+ 32, 33, 34, 35, 36, 37, 38, 39,
+ 40, 41, 42, 43, 43, 43, 43, 43,
+ 43, 43, 43, 43, 43, 43, 43, 43,
+ 43, 43, 43, 43, 43, 43, 43, 43,
+ 43, 43, 43, 43, 43, 43, 43, 43,
+ 43, 43, 43, 43, 43, 43, 43, 43,
+ 43, 43, 43, 43, 43, 43, 43, 43,
+ 43, 43, 43, 43, 43, 43, 43, 43,
+ 43, 43, 43, 43, 43, 43, 43, 43,
+ 43, 43, 43, 43, 43, 43, 43, 43,
+ 43, 43, 43, 43, 43, 43, 43, 43,
+ 43, 43, 43, 43, 43, 43, 43, 43,
+ 43,
+ },
+ { /* Fourth byte table 59. */
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 12, 12, 12,
+ 12, 12, 12, 12, 12, 12, 12, 12,
+ 12, 12, 12, 12, 12, 12, 12, 12,
+ 12, 12, 12, 12, 12, 12, 12, 12,
+ 12, 12, 12, 12, 12, 12, 12, 12,
+ 12, 12, 12, 12, 12, 12, 12, 12,
+ 12, 12, 12, 12, 12, 12, 12, 12,
+ 12, 12, 12, 12, 12, 12, 12, 12,
+ 12, 12, 12, 12, 12, 12, 12, 12,
+ 12, 12, 12, 12, 12, 12, 12, 12,
+ 12, 12, 12, 12, 12, 12, 12, 12,
+ 12, 12, 12, 12, 12, 12, 12, 12,
+ 12, 12, 12, 12, 12, 12, 12, 12,
+ 12, 12, 12, 12, 12, 12, 12, 12,
+ 12, 12, 12, 12, 12, 12, 12, 12,
+ 12,
+ },
+ { /* Fourth byte table 60. */
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 3, 5, 8,
+ 8, 8, 8, 8, 8, 8, 8, 8,
+ 8, 8, 8, 8, 8, 8, 8, 8,
+ 8, 8, 8, 8, 8, 8, 8, 8,
+ 8, 8, 8, 8, 8, 8, 8, 8,
+ 8, 8, 8, 8, 8, 8, 8, 8,
+ 8, 8, 8, 8, 8, 8, 8, 8,
+ 8, 8, 8, 8, 8, 8, 8, 8,
+ 8, 8, 8, 8, 8, 8, 8, 8,
+ 8, 8, 8, 8, 8, 8, 8, 8,
+ 8,
+ },
+ { /* Fourth byte table 61. */
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 6, 6, 6,
+ 6, 6, 6, 6, 6, 6, 6, 6,
+ 6, 6, 6, 6, 6, 6, 6, 6,
+ 6, 6, 6, 6, 6, 6, 6, 6,
+ 6, 6, 6, 6, 6, 6, 6, 6,
+ 6, 6, 6, 6, 6, 6, 6, 6,
+ 6, 6, 6, 6, 6, 6, 6, 6,
+ 6, 6, 6, 6, 6, 6, 6, 6,
+ 6, 6, 6, 6, 6, 6, 6, 6,
+ 6, 6, 6, 6, 6, 6, 6, 6,
+ 6, 6, 6, 6, 6, 6, 6, 6,
+ 6, 6, 6, 6, 6, 6, 6, 6,
+ 6, 6, 6, 6, 6, 6, 6, 6,
+ 6,
+ },
+ { /* Fourth byte table 62. */
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 3, 3, 3, 3, 3, 3, 3, 3,
+ 3, 3, 3, 3, 3, 3, 3, 3,
+ 3, 3, 3, 3, 3, 3, 3, 3,
+ 3, 3, 3, 3, 3, 3, 3, 3,
+ 3, 3, 3, 3, 3, 3, 3, 3,
+ 3, 3, 3, 3, 3, 3, 3, 3,
+ 3, 3, 3, 3, 3, 3, 3, 3,
+ 3, 3, 3, 3, 3, 3, 3, 3,
+ 3, 3, 3, 3, 3, 3, 3, 3,
+ 3, 3, 3, 3, 3, 3, 3, 3,
+ 3,
+ },
+ { /* Fourth byte table 63. */
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 3, 3, 3, 3, 3, 3, 3, 3,
+ 3, 3, 3, 3, 3, 3, 3, 3,
+ 3, 3, 3, 3, 3, 3, 3, 3,
+ 3, 3, 3, 3, 3, 3, 3, 3,
+ 3, 3, 3, 3, 3, 3, 3, 3,
+ 3, 3, 3, 3, 3, 3, 3, 3,
+ 3, 3, 3, 3, 3, 3, 3, 3,
+ 3, 3, 3, 3, 3, 3, 3, 3,
+ 3, 3, 3, 3, 3, 3, 3, 3,
+ 3, 3, 3, 3, 3, 3, 3, 3,
+ 3, 3, 3, 3, 3, 3, 3, 3,
+ 3, 3, 3, 3, 3, 3, 3, 3,
+ 3,
+ },
+ { /* Fourth byte table 64. */
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 3, 3, 3, 3,
+ 3, 3, 3, 3, 3, 3, 3, 3,
+ 3, 3, 3, 3, 3, 3, 3, 3,
+ 3, 3, 3, 3, 3, 3, 3, 3,
+ 3, 3, 3, 3, 3, 3, 3, 3,
+ 3, 3, 3, 3, 3, 3, 3, 3,
+ 3, 3, 3, 3, 3, 3, 3, 3,
+ 3, 3, 3, 3, 3, 3, 3, 3,
+ 3, 3, 3, 3, 3, 3, 3, 3,
+ 3, 3, 3, 3, 3, 3, 3, 3,
+ 3,
+ },
+ { /* Fourth byte table 65. */
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 3, 6, 9, 12, 15, 18, 21,
+ 24, 27, 30, 33, 36, 39, 42, 45,
+ 48, 51, 54, 57, 60, 63, 66, 69,
+ 72, 75, 78, 81, 84, 87, 90, 93,
+ 96, 99, 102, 105, 108, 111, 114, 117,
+ 120, 123, 126, 129, 132, 135, 138, 141,
+ 144, 147, 150, 153, 156, 159, 162, 165,
+ 168, 171, 174, 177, 180, 183, 186, 189,
+ 192, 192, 192, 192, 192, 192, 192, 192,
+ 192, 192, 192, 192, 192, 192, 192, 192,
+ 192, 192, 192, 192, 192, 192, 192, 192,
+ 192, 192, 192, 192, 192, 192, 192, 192,
+ 192, 192, 192, 192, 192, 192, 192, 192,
+ 192, 192, 192, 192, 192, 192, 192, 192,
+ 192, 192, 192, 192, 192, 192, 192, 192,
+ 192, 192, 192, 192, 192, 192, 192, 192,
+ 192,
+ },
+ { /* Fourth byte table 66. */
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 3, 6, 9, 12, 15, 18, 21,
+ 24, 27, 30, 33, 36, 39, 42, 45,
+ 48, 51, 54, 57, 60, 63, 66, 69,
+ 72, 75, 78, 81, 84, 87, 90, 93,
+ 96, 99, 102, 105, 108, 111, 114, 117,
+ 120, 123, 126, 129, 132, 135, 138, 141,
+ 144, 147, 150, 153, 156, 159, 162, 165,
+ 168, 171, 174, 177, 180, 183, 186, 189,
+ 192, 192, 192, 192, 192, 192, 192, 192,
+ 192, 192, 192, 192, 192, 192, 192, 192,
+ 192, 192, 192, 192, 192, 192, 192, 192,
+ 192, 192, 192, 192, 192, 192, 192, 192,
+ 192, 192, 192, 192, 192, 192, 192, 192,
+ 192, 192, 192, 192, 192, 192, 192, 192,
+ 192, 192, 192, 192, 192, 192, 192, 192,
+ 192, 192, 192, 192, 192, 192, 192, 192,
+ 192,
+ },
+ { /* Fourth byte table 67. */
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 3, 6, 9, 12, 15, 18, 21,
+ 24, 27, 30, 33, 36, 39, 42, 45,
+ 48, 51, 54, 57, 60, 63, 66, 69,
+ 72, 75, 78, 81, 84, 87, 90, 93,
+ 96, 99, 102, 105, 108, 111, 114, 117,
+ 120, 123, 126, 129, 132, 135, 138, 141,
+ 144, 147, 150, 153, 156, 159, 162, 165,
+ 168, 171, 174, 177, 180, 183, 186, 189,
+ 192, 192, 192, 192, 192, 192, 192, 192,
+ 192, 192, 192, 192, 192, 192, 192, 192,
+ 192, 192, 192, 192, 192, 192, 192, 192,
+ 192, 192, 192, 192, 192, 192, 192, 192,
+ 192, 192, 192, 192, 192, 192, 192, 192,
+ 192, 192, 192, 192, 192, 192, 192, 192,
+ 192, 192, 192, 192, 192, 192, 192, 192,
+ 192, 192, 192, 192, 192, 192, 192, 192,
+ 192,
+ },
+ { /* Fourth byte table 68. */
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 3, 6, 9, 12, 15, 18, 21,
+ 24, 27, 30, 33, 36, 39, 42, 45,
+ 48, 51, 54, 57, 60, 63, 66, 66,
+ 66, 66, 66, 66, 66, 66, 66, 66,
+ 66, 66, 66, 66, 66, 66, 66, 66,
+ 66, 66, 66, 66, 66, 66, 66, 66,
+ 66, 66, 66, 66, 66, 66, 66, 66,
+ 66, 66, 66, 66, 66, 66, 66, 66,
+ 66, 66, 66, 66, 66, 66, 66, 66,
+ 66, 66, 66, 66, 66, 66, 66, 66,
+ 66, 66, 66, 66, 66, 66, 66, 66,
+ 66, 66, 66, 66, 66, 66, 66, 66,
+ 66, 66, 66, 66, 66, 66, 66, 66,
+ 66, 66, 66, 66, 66, 66, 66, 66,
+ 66, 66, 66, 66, 66, 66, 66, 66,
+ 66, 66, 66, 66, 66, 66, 66, 66,
+ 66,
+ },
+ { /* Fourth byte table 69. */
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 1, 1, 1, 1, 1, 1, 1,
+ 1, 1, 1, 1, 1, 1, 1, 1,
+ 1, 1, 1, 1, 1, 1, 1, 1,
+ 1, 1, 1, 1, 1, 1, 1, 1,
+ 1, 1, 1, 1, 1, 1, 1, 1,
+ 1, 1, 1, 1, 1, 1, 1, 1,
+ 1, 1, 1, 1, 1, 1, 1, 4,
+ 4, 7, 10, 13, 13, 13, 13, 13,
+ 13, 13, 13, 13, 13, 13, 13, 13,
+ 13, 13, 13, 13, 13, 13, 13, 13,
+ 13, 13, 13, 13, 13, 13, 13, 13,
+ 13, 13, 13, 13, 13, 13, 13, 13,
+ 13, 13, 13, 13, 13, 13, 13, 13,
+ 13, 13, 13, 13, 13, 13, 13, 13,
+ 13, 13, 13, 13, 13, 13, 13, 13,
+ 13, 13, 13, 13, 13, 13, 13, 13,
+ 13,
+ },
+ { /* Fourth byte table 70. */
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 7, 7, 14,
+ 14, 21, 21, 28, 28, 35, 35, 42,
+ 42, 49, 49, 56, 56, 63, 63, 70,
+ 70, 77, 77, 84, 84, 84, 91, 91,
+ 98, 98, 105, 105, 105, 105, 105, 105,
+ 105, 112, 119, 119, 126, 133, 133, 140,
+ 147, 147, 154, 161, 161, 168, 175, 175,
+ 175, 175, 175, 175, 175, 175, 175, 175,
+ 175, 175, 175, 175, 175, 175, 175, 175,
+ 175, 175, 175, 175, 175, 175, 175, 175,
+ 175, 175, 175, 175, 175, 175, 175, 175,
+ 175, 175, 175, 175, 175, 175, 175, 175,
+ 175, 175, 175, 175, 175, 175, 175, 175,
+ 175, 175, 175, 175, 175, 175, 175, 175,
+ 175, 175, 175, 175, 175, 175, 175, 175,
+ 175,
+ },
+ { /* Fourth byte table 71. */
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 7, 7, 7,
+ 7, 7, 7, 7, 11, 15, 15, 22,
+ 28, 28, 28, 28, 28, 28, 28, 28,
+ 28, 28, 28, 28, 28, 35, 35, 42,
+ 42, 49, 49, 56, 56, 63, 63, 70,
+ 70, 77, 77, 84, 84, 91, 91, 98,
+ 98, 98, 98, 98, 98, 98, 98, 98,
+ 98, 98, 98, 98, 98, 98, 98, 98,
+ 98, 98, 98, 98, 98, 98, 98, 98,
+ 98, 98, 98, 98, 98, 98, 98, 98,
+ 98, 98, 98, 98, 98, 98, 98, 98,
+ 98, 98, 98, 98, 98, 98, 98, 98,
+ 98, 98, 98, 98, 98, 98, 98, 98,
+ 98, 98, 98, 98, 98, 98, 98, 98,
+ 98,
+ },
+ { /* Fourth byte table 72. */
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 7, 7, 14, 14, 14, 21, 21,
+ 28, 28, 35, 35, 35, 35, 35, 35,
+ 35, 42, 49, 49, 56, 63, 63, 70,
+ 77, 77, 84, 91, 91, 98, 105, 105,
+ 105, 105, 105, 105, 105, 105, 105, 105,
+ 105, 105, 105, 105, 105, 105, 105, 105,
+ 105, 105, 105, 105, 105, 112, 112, 112,
+ 119, 126, 133, 140, 140, 140, 140, 147,
+ 153, 153, 153, 153, 153, 153, 153, 153,
+ 153, 153, 153, 153, 153, 153, 153, 153,
+ 153, 153, 153, 153, 153, 153, 153, 153,
+ 153, 153, 153, 153, 153, 153, 153, 153,
+ 153, 153, 153, 153, 153, 153, 153, 153,
+ 153, 153, 153, 153, 153, 153, 153, 153,
+ 153, 153, 153, 153, 153, 153, 153, 153,
+ 153, 153, 153, 153, 153, 153, 153, 153,
+ 153,
+ },
+ { /* Fourth byte table 73. */
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 3, 6, 9, 12, 15, 18,
+ 21, 24, 27, 30, 33, 36, 39, 42,
+ 45, 45, 45, 45, 45, 45, 45, 45,
+ 45, 45, 45, 45, 45, 45, 45, 45,
+ 45, 45, 45, 45, 45, 45, 45, 45,
+ 45, 45, 45, 45, 45, 45, 45, 45,
+ 45, 45, 45, 45, 45, 45, 45, 45,
+ 45, 45, 45, 45, 45, 45, 45, 45,
+ 45, 45, 45, 45, 45, 45, 45, 45,
+ 45, 45, 45, 45, 45, 45, 45, 45,
+ 45,
+ },
+ { /* Fourth byte table 74. */
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 3, 6, 9, 12, 15, 18, 21,
+ 24, 27, 30, 33, 36, 39, 42, 45,
+ 48, 51, 54, 57, 60, 63, 66, 69,
+ 72, 75, 78, 81, 84, 87, 90, 93,
+ 96, 99, 102, 105, 108, 111, 114, 117,
+ 120, 123, 126, 129, 132, 135, 138, 141,
+ 144, 147, 150, 153, 156, 159, 162, 165,
+ 168, 171, 174, 177, 180, 183, 186, 189,
+ 192, 192, 192, 192, 192, 192, 192, 192,
+ 192, 192, 192, 192, 192, 192, 192, 192,
+ 192, 192, 192, 192, 192, 192, 192, 192,
+ 192, 192, 192, 192, 192, 192, 192, 192,
+ 192, 192, 192, 192, 192, 192, 192, 192,
+ 192, 192, 192, 192, 192, 192, 192, 192,
+ 192, 192, 192, 192, 192, 192, 192, 192,
+ 192, 192, 192, 192, 192, 192, 192, 192,
+ 192,
+ },
+ { /* Fourth byte table 75. */
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 3, 6, 9, 12, 15, 18, 21,
+ 24, 27, 30, 33, 36, 39, 42, 45,
+ 45, 45, 45, 48, 51, 54, 57, 60,
+ 63, 66, 69, 72, 75, 78, 81, 84,
+ 87, 87, 87, 87, 87, 87, 87, 87,
+ 87, 87, 87, 87, 87, 87, 87, 87,
+ 87, 87, 87, 87, 87, 87, 87, 87,
+ 87, 87, 87, 87, 87, 87, 87, 87,
+ 87, 87, 87, 87, 87, 87, 87, 87,
+ 87, 87, 87, 87, 87, 87, 87, 87,
+ 87, 87, 87, 87, 87, 87, 87, 87,
+ 87, 87, 87, 87, 87, 87, 87, 87,
+ 87, 87, 87, 87, 87, 87, 87, 87,
+ 87, 87, 87, 87, 87, 87, 87, 87,
+ 87, 87, 87, 87, 87, 87, 87, 87,
+ 87, 87, 87, 87, 87, 87, 87, 87,
+ 87,
+ },
+ { /* Fourth byte table 76. */
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 5, 10, 15, 20, 20, 20, 20,
+ 20, 20, 20, 20, 20, 20, 20, 20,
+ 20, 23, 25, 27, 29, 31, 33, 35,
+ 37, 39, 41, 43, 45, 47, 49, 51,
+ 53, 56, 59, 62, 65, 68, 71, 74,
+ 77, 80, 83, 86, 89, 92, 95, 101,
+ 107, 113, 119, 125, 131, 137, 143, 149,
+ 155, 161, 167, 173, 179, 194, 206, 212,
+ 212, 212, 212, 212, 212, 212, 212, 212,
+ 212, 212, 212, 212, 212, 212, 212, 212,
+ 212, 212, 212, 212, 212, 212, 212, 212,
+ 212, 212, 212, 212, 212, 212, 212, 212,
+ 212, 212, 212, 212, 212, 212, 212, 212,
+ 212, 212, 212, 212, 212, 212, 212, 212,
+ 212, 212, 212, 212, 212, 212, 212, 212,
+ 212, 212, 212, 212, 212, 212, 212, 212,
+ 212,
+ },
+ { /* Fourth byte table 77. */
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 3, 6, 9, 12, 15, 18, 21,
+ 24, 27, 30, 33, 36, 39, 42, 45,
+ 48, 51, 54, 57, 60, 63, 66, 69,
+ 72, 75, 78, 81, 84, 87, 90, 93,
+ 96, 99, 102, 105, 108, 111, 114, 117,
+ 120, 123, 126, 129, 132, 135, 138, 141,
+ 144, 147, 149, 151, 153, 155, 157, 159,
+ 161, 163, 165, 167, 169, 171, 173, 175,
+ 177, 177, 177, 177, 177, 177, 177, 177,
+ 177, 177, 177, 177, 177, 177, 177, 177,
+ 177, 177, 177, 177, 177, 177, 177, 177,
+ 177, 177, 177, 177, 177, 177, 177, 177,
+ 177, 177, 177, 177, 177, 177, 177, 177,
+ 177, 177, 177, 177, 177, 177, 177, 177,
+ 177, 177, 177, 177, 177, 177, 177, 177,
+ 177, 177, 177, 177, 177, 177, 177, 177,
+ 177,
+ },
+ { /* Fourth byte table 78. */
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 4, 8, 12, 16, 20, 24, 28,
+ 32, 36, 41, 46, 51, 53, 56, 58,
+ 61, 64, 67, 70, 73, 76, 79, 82,
+ 85, 88, 91, 94, 97, 100, 103, 106,
+ 109, 112, 115, 118, 121, 124, 127, 130,
+ 133, 136, 139, 142, 145, 148, 151, 154,
+ 157, 160, 163, 166, 169, 172, 175, 178,
+ 181, 184, 187, 190, 193, 196, 199, 202,
+ 202, 202, 202, 202, 202, 202, 202, 202,
+ 202, 202, 202, 202, 202, 202, 202, 202,
+ 202, 202, 202, 202, 202, 202, 202, 202,
+ 202, 202, 202, 202, 202, 202, 202, 202,
+ 202, 202, 202, 202, 202, 202, 202, 202,
+ 202, 202, 202, 202, 202, 202, 202, 202,
+ 202, 202, 202, 202, 202, 202, 202, 202,
+ 202, 202, 202, 202, 202, 202, 202, 202,
+ 202,
+ },
+ { /* Fourth byte table 79. */
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 2, 4, 7, 9, 11, 13, 15,
+ 17, 20, 24, 26, 28, 31, 34, 36,
+ 38, 40, 43, 46, 49, 52, 55, 57,
+ 59, 61, 63, 65, 68, 70, 72, 74,
+ 77, 80, 82, 85, 88, 91, 93, 96,
+ 101, 107, 109, 112, 115, 118, 121, 128,
+ 136, 138, 140, 143, 145, 147, 149, 152,
+ 154, 156, 158, 160, 162, 165, 167, 169,
+ 171, 171, 171, 171, 171, 171, 171, 171,
+ 171, 171, 171, 171, 171, 171, 171, 171,
+ 171, 171, 171, 171, 171, 171, 171, 171,
+ 171, 171, 171, 171, 171, 171, 171, 171,
+ 171, 171, 171, 171, 171, 171, 171, 171,
+ 171, 171, 171, 171, 171, 171, 171, 171,
+ 171, 171, 171, 171, 171, 171, 171, 171,
+ 171, 171, 171, 171, 171, 171, 171, 171,
+ 171,
+ },
+ { /* Fourth byte table 80. */
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 3, 6, 10, 12, 14, 16, 22,
+ 25, 27, 29, 31, 33, 35, 37, 39,
+ 41, 43, 45, 48, 50, 52, 55, 58,
+ 60, 64, 67, 69, 71, 73, 75, 80,
+ 85, 89, 93, 97, 101, 105, 109, 113,
+ 117, 121, 126, 131, 136, 141, 146, 151,
+ 156, 161, 166, 171, 176, 181, 186, 191,
+ 196, 201, 206, 211, 216, 221, 226, 231,
+ 234, 234, 234, 234, 234, 234, 234, 234,
+ 234, 234, 234, 234, 234, 234, 234, 234,
+ 234, 234, 234, 234, 234, 234, 234, 234,
+ 234, 234, 234, 234, 234, 234, 234, 234,
+ 234, 234, 234, 234, 234, 234, 234, 234,
+ 234, 234, 234, 234, 234, 234, 234, 234,
+ 234, 234, 234, 234, 234, 234, 234, 234,
+ 234, 234, 234, 234, 234, 234, 234, 234,
+ 234,
+ },
+ { /* Fourth byte table 81. */
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 4, 8, 12, 16, 20, 24, 28,
+ 32, 36, 40, 44, 48, 52, 56, 56,
+ 56, 60, 60, 64, 64, 64, 68, 72,
+ 76, 80, 84, 88, 92, 96, 100, 104,
+ 104, 108, 108, 112, 112, 112, 116, 120,
+ 120, 120, 120, 124, 128, 132, 136, 136,
+ 136, 140, 144, 148, 152, 156, 160, 164,
+ 168, 172, 176, 180, 184, 188, 192, 196,
+ 200, 200, 200, 200, 200, 200, 200, 200,
+ 200, 200, 200, 200, 200, 200, 200, 200,
+ 200, 200, 200, 200, 200, 200, 200, 200,
+ 200, 200, 200, 200, 200, 200, 200, 200,
+ 200, 200, 200, 200, 200, 200, 200, 200,
+ 200, 200, 200, 200, 200, 200, 200, 200,
+ 200, 200, 200, 200, 200, 200, 200, 200,
+ 200, 200, 200, 200, 200, 200, 200, 200,
+ 200,
+ },
+ { /* Fourth byte table 82. */
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 4, 8, 12, 16, 20, 24, 28,
+ 32, 36, 40, 44, 48, 52, 56, 60,
+ 64, 68, 72, 76, 80, 84, 88, 92,
+ 96, 100, 104, 108, 112, 116, 120, 124,
+ 128, 132, 136, 140, 144, 148, 152, 156,
+ 160, 164, 168, 172, 172, 172, 172, 172,
+ 172, 176, 180, 184, 188, 192, 196, 200,
+ 204, 208, 212, 216, 220, 224, 228, 232,
+ 236, 236, 236, 236, 236, 236, 236, 236,
+ 236, 236, 236, 236, 236, 236, 236, 236,
+ 236, 236, 236, 236, 236, 236, 236, 236,
+ 236, 236, 236, 236, 236, 236, 236, 236,
+ 236, 236, 236, 236, 236, 236, 236, 236,
+ 236, 236, 236, 236, 236, 236, 236, 236,
+ 236, 236, 236, 236, 236, 236, 236, 236,
+ 236, 236, 236, 236, 236, 236, 236, 236,
+ 236,
+ },
+ { /* Fourth byte table 83. */
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 4, 8, 12, 16, 20, 24, 28,
+ 32, 36, 40, 44, 48, 52, 56, 60,
+ 65, 70, 75, 79, 83, 87, 92, 97,
+ 102, 106, 110, 110, 110, 110, 110, 110,
+ 110, 110, 110, 110, 110, 110, 110, 110,
+ 110, 110, 110, 110, 110, 110, 110, 110,
+ 110, 110, 110, 110, 110, 110, 110, 110,
+ 110, 110, 110, 110, 110, 110, 110, 110,
+ 110, 110, 110, 110, 110, 110, 110, 110,
+ 110, 110, 110, 110, 110, 110, 110, 110,
+ 110, 110, 110, 110, 110, 110, 110, 110,
+ 110, 110, 110, 110, 110, 110, 110, 110,
+ 110, 110, 110, 110, 110, 110, 110, 110,
+ 110, 110, 110, 110, 110, 110, 110, 110,
+ 110, 110, 110, 110, 110, 110, 110, 110,
+ 110, 110, 110, 110, 110, 110, 110, 110,
+ 110,
+ },
+ { /* Fourth byte table 84. */
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 2, 4, 6, 9, 12, 14, 16,
+ 16, 16, 16, 16, 16, 16, 16, 16,
+ 16, 16, 16, 16, 20, 24, 28, 32,
+ 36, 36, 36, 36, 36, 36, 41, 41,
+ 46, 48, 50, 52, 54, 56, 58, 60,
+ 62, 64, 65, 70, 75, 82, 89, 94,
+ 99, 104, 109, 114, 119, 124, 129, 134,
+ 134, 139, 144, 149, 154, 159, 159, 164,
+ 164, 164, 164, 164, 164, 164, 164, 164,
+ 164, 164, 164, 164, 164, 164, 164, 164,
+ 164, 164, 164, 164, 164, 164, 164, 164,
+ 164, 164, 164, 164, 164, 164, 164, 164,
+ 164, 164, 164, 164, 164, 164, 164, 164,
+ 164, 164, 164, 164, 164, 164, 164, 164,
+ 164, 164, 164, 164, 164, 164, 164, 164,
+ 164, 164, 164, 164, 164, 164, 164, 164,
+ 164,
+ },
+ { /* Fourth byte table 85. */
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 5, 10, 10, 15, 20, 20, 25,
+ 30, 35, 40, 45, 50, 55, 60, 65,
+ 69, 71, 73, 75, 77, 79, 81, 83,
+ 85, 87, 89, 91, 93, 95, 97, 99,
+ 101, 103, 105, 107, 109, 111, 113, 115,
+ 117, 119, 121, 123, 125, 127, 129, 131,
+ 133, 135, 137, 139, 141, 143, 145, 147,
+ 149, 151, 153, 155, 157, 159, 161, 163,
+ 165, 165, 165, 165, 165, 165, 165, 165,
+ 165, 165, 165, 165, 165, 165, 165, 165,
+ 165, 165, 165, 165, 165, 165, 165, 165,
+ 165, 165, 165, 165, 165, 165, 165, 165,
+ 165, 165, 165, 165, 165, 165, 165, 165,
+ 165, 165, 165, 165, 165, 165, 165, 165,
+ 165, 165, 165, 165, 165, 165, 165, 165,
+ 165, 165, 165, 165, 165, 165, 165, 165,
+ 165,
+ },
+ { /* Fourth byte table 86. */
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 2, 4, 6, 8, 10, 12, 14,
+ 16, 18, 20, 22, 24, 26, 28, 30,
+ 32, 34, 36, 38, 40, 42, 44, 46,
+ 48, 50, 52, 54, 56, 58, 60, 62,
+ 64, 66, 68, 70, 72, 76, 80, 82,
+ 84, 86, 88, 90, 92, 94, 96, 98,
+ 100, 104, 108, 108, 108, 108, 108, 108,
+ 108, 108, 108, 108, 108, 108, 108, 108,
+ 108, 108, 108, 108, 108, 108, 108, 108,
+ 108, 108, 108, 108, 108, 108, 108, 108,
+ 108, 108, 108, 108, 108, 108, 108, 108,
+ 108, 108, 108, 108, 108, 108, 108, 108,
+ 108, 108, 108, 108, 108, 108, 108, 108,
+ 108, 108, 108, 108, 108, 108, 108, 108,
+ 108, 108, 108, 108, 108, 108, 108, 108,
+ 108, 108, 108, 108, 108, 108, 108, 108,
+ 108,
+ },
+ { /* Fourth byte table 87. */
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 2, 4, 6, 8,
+ 10, 12, 14, 16, 18, 20, 24, 26,
+ 28, 30, 32, 34, 36, 38, 40, 42,
+ 44, 46, 48, 54, 60, 66, 72, 78,
+ 84, 90, 96, 102, 108, 114, 120, 126,
+ 132, 138, 144, 150, 156, 158, 160, 162,
+ 164, 164, 164, 164, 164, 164, 164, 164,
+ 164, 164, 164, 164, 164, 164, 164, 164,
+ 164, 164, 164, 164, 164, 164, 164, 164,
+ 164, 164, 164, 164, 164, 164, 164, 164,
+ 164, 164, 164, 164, 164, 164, 164, 164,
+ 164, 164, 164, 164, 164, 164, 164, 164,
+ 164, 164, 164, 164, 164, 164, 164, 164,
+ 164, 164, 164, 164, 164, 164, 164, 164,
+ 164,
+ },
+ { /* Fourth byte table 88. */
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 4, 8, 12, 16, 20, 24, 28,
+ 32, 36, 40, 44, 48, 52, 56, 60,
+ 64, 68, 72, 76, 80, 84, 88, 92,
+ 96, 100, 104, 108, 112, 116, 120, 124,
+ 128, 132, 136, 140, 144, 148, 152, 156,
+ 160, 164, 168, 172, 176, 180, 184, 188,
+ 192, 196, 200, 204, 208, 212, 216, 220,
+ 224, 228, 232, 236, 240, 244, 248, 248,
+ 248, 248, 248, 248, 248, 248, 248, 248,
+ 248, 248, 248, 248, 248, 248, 248, 248,
+ 248, 248, 248, 248, 248, 248, 248, 248,
+ 248, 248, 248, 248, 248, 248, 248, 248,
+ 248, 248, 248, 248, 248, 248, 248, 248,
+ 248, 248, 248, 248, 248, 248, 248, 248,
+ 248, 248, 248, 248, 248, 248, 248, 248,
+ 248, 248, 248, 248, 248, 248, 248, 248,
+ 248,
+ },
+ { /* Fourth byte table 89. */
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 6, 12, 18, 24, 30, 36, 42,
+ 48, 48, 48, 48, 48, 48, 48, 48,
+ 48, 48, 48, 48, 48, 48, 48, 48,
+ 48, 48, 48, 48, 48, 48, 48, 48,
+ 48, 48, 48, 48, 48, 48, 48, 48,
+ 48, 48, 48, 48, 48, 48, 48, 48,
+ 48, 54, 60, 68, 76, 84, 92, 100,
+ 108, 116, 122, 155, 170, 178, 178, 178,
+ 178, 178, 178, 178, 178, 178, 178, 178,
+ 178, 178, 178, 178, 178, 178, 178, 178,
+ 178, 178, 178, 178, 178, 178, 178, 178,
+ 178, 178, 178, 178, 178, 178, 178, 178,
+ 178, 178, 178, 178, 178, 178, 178, 178,
+ 178, 178, 178, 178, 178, 178, 178, 178,
+ 178, 178, 178, 178, 178, 178, 178, 178,
+ 178, 178, 178, 178, 178, 178, 178, 178,
+ 178,
+ },
+ { /* Fourth byte table 90. */
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 1, 4, 7, 8, 9, 10, 11,
+ 14, 17, 20, 20, 20, 20, 20, 20,
+ 20, 20, 20, 20, 20, 20, 20, 20,
+ 20, 20, 20, 20, 20, 20, 20, 20,
+ 20, 22, 25, 28, 29, 30, 31, 32,
+ 33, 34, 37, 40, 43, 46, 49, 52,
+ 55, 55, 55, 55, 55, 55, 55, 55,
+ 55, 55, 55, 55, 55, 55, 55, 55,
+ 55, 55, 55, 55, 55, 55, 55, 55,
+ 55, 55, 55, 55, 55, 55, 55, 55,
+ 55, 55, 55, 55, 55, 55, 55, 55,
+ 55, 55, 55, 55, 55, 55, 55, 55,
+ 55, 55, 55, 55, 55, 55, 55, 55,
+ 55, 55, 55, 55, 55, 55, 55, 55,
+ 55,
+ },
+ { /* Fourth byte table 91. */
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 3, 6, 9, 12, 15, 15, 15,
+ 16, 17, 20, 23, 26, 29, 30, 31,
+ 32, 33, 36, 37, 37, 38, 39, 40,
+ 41, 44, 45, 46, 47, 48, 51, 54,
+ 55, 56, 57, 58, 59, 60, 61, 62,
+ 62, 63, 64, 65, 66, 66, 66, 66,
+ 66, 69, 73, 76, 76, 79, 79, 82,
+ 86, 89, 93, 96, 100, 103, 107, 110,
+ 114, 114, 114, 114, 114, 114, 114, 114,
+ 114, 114, 114, 114, 114, 114, 114, 114,
+ 114, 114, 114, 114, 114, 114, 114, 114,
+ 114, 114, 114, 114, 114, 114, 114, 114,
+ 114, 114, 114, 114, 114, 114, 114, 114,
+ 114, 114, 114, 114, 114, 114, 114, 114,
+ 114, 114, 114, 114, 114, 114, 114, 114,
+ 114, 114, 114, 114, 114, 114, 114, 114,
+ 114,
+ },
+ { /* Fourth byte table 92. */
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 2, 6, 10, 14, 18, 22, 26,
+ 30, 34, 38, 42, 46, 50, 52, 54,
+ 56, 58, 60, 62, 64, 66, 68, 70,
+ 72, 74, 76, 78, 80, 82, 84, 86,
+ 88, 90, 92, 94, 96, 98, 100, 102,
+ 104, 106, 108, 110, 112, 114, 116, 118,
+ 120, 122, 124, 126, 128, 130, 132, 134,
+ 136, 138, 140, 142, 144, 146, 148, 150,
+ 152, 152, 152, 152, 152, 152, 152, 152,
+ 152, 152, 152, 152, 152, 152, 152, 152,
+ 152, 152, 152, 152, 152, 152, 152, 152,
+ 152, 152, 152, 152, 152, 152, 152, 152,
+ 152, 152, 152, 152, 152, 152, 152, 152,
+ 152, 152, 152, 152, 152, 152, 152, 152,
+ 152, 152, 152, 152, 152, 152, 152, 152,
+ 152, 152, 152, 152, 152, 152, 152, 152,
+ 152,
+ },
+ { /* Fourth byte table 93. */
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 2, 4, 6, 8, 10, 12, 14,
+ 16, 18, 20, 22, 24, 26, 28, 30,
+ 32, 34, 36, 38, 40, 42, 44, 46,
+ 48, 50, 52, 54, 56, 58, 60, 62,
+ 64, 66, 68, 70, 72, 74, 76, 78,
+ 80, 82, 84, 86, 88, 90, 92, 94,
+ 96, 98, 100, 102, 104, 106, 112, 118,
+ 124, 130, 136, 142, 146, 150, 150, 150,
+ 150, 150, 150, 150, 150, 150, 150, 150,
+ 150, 150, 150, 150, 150, 150, 150, 150,
+ 150, 150, 150, 150, 150, 150, 150, 150,
+ 150, 150, 150, 150, 150, 150, 150, 150,
+ 150, 150, 150, 150, 150, 150, 150, 150,
+ 150, 150, 150, 150, 150, 150, 150, 150,
+ 150, 150, 150, 150, 150, 150, 150, 150,
+ 150, 150, 150, 150, 150, 150, 150, 150,
+ 150,
+ },
+ { /* Fourth byte table 94. */
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 1, 2, 3, 4, 5, 6,
+ 7, 8, 9, 10, 11, 12, 13, 14,
+ 15, 16, 17, 18, 19, 20, 21, 22,
+ 23, 24, 25, 26, 27, 28, 29, 30,
+ 31, 32, 33, 34, 35, 36, 37, 38,
+ 39, 40, 41, 42, 43, 44, 45, 46,
+ 47, 48, 49, 50, 51, 52, 53, 54,
+ 55, 56, 57, 58, 59, 60, 61, 62,
+ 63, 63, 63, 63, 63, 63, 63, 63,
+ 63, 63, 63, 63, 63, 63, 63, 63,
+ 63, 63, 63, 63, 63, 63, 63, 63,
+ 63, 63, 63, 63, 63, 63, 63, 63,
+ 63, 63, 63, 63, 63, 63, 63, 63,
+ 63, 63, 63, 63, 63, 63, 63, 63,
+ 63, 63, 63, 63, 63, 63, 63, 63,
+ 63, 63, 63, 63, 63, 63, 63, 63,
+ 63,
+ },
+ { /* Fourth byte table 95. */
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 1, 2, 3, 4, 5, 6, 7,
+ 8, 9, 10, 11, 12, 13, 14, 15,
+ 16, 17, 18, 19, 20, 21, 22, 23,
+ 24, 25, 26, 27, 28, 29, 30, 31,
+ 34, 37, 40, 43, 46, 49, 52, 55,
+ 58, 61, 64, 67, 70, 73, 76, 79,
+ 82, 85, 88, 91, 94, 97, 100, 103,
+ 106, 109, 112, 115, 118, 121, 124, 127,
+ 130, 130, 130, 130, 130, 130, 130, 130,
+ 130, 130, 130, 130, 130, 130, 130, 130,
+ 130, 130, 130, 130, 130, 130, 130, 130,
+ 130, 130, 130, 130, 130, 130, 130, 130,
+ 130, 130, 130, 130, 130, 130, 130, 130,
+ 130, 130, 130, 130, 130, 130, 130, 130,
+ 130, 130, 130, 130, 130, 130, 130, 130,
+ 130, 130, 130, 130, 130, 130, 130, 130,
+ 130,
+ },
+ { /* Fourth byte table 96. */
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 3, 6, 9, 12, 15, 18, 21,
+ 24, 27, 30, 33, 36, 39, 42, 45,
+ 48, 51, 54, 57, 60, 63, 66, 69,
+ 72, 75, 78, 81, 84, 87, 90, 93,
+ 96, 99, 102, 105, 108, 111, 114, 117,
+ 120, 123, 126, 129, 132, 135, 138, 141,
+ 144, 147, 150, 153, 156, 159, 162, 165,
+ 168, 171, 174, 177, 180, 183, 186, 189,
+ 189, 189, 189, 189, 189, 189, 189, 189,
+ 189, 189, 189, 189, 189, 189, 189, 189,
+ 189, 189, 189, 189, 189, 189, 189, 189,
+ 189, 189, 189, 189, 189, 189, 189, 189,
+ 189, 189, 189, 189, 189, 189, 189, 189,
+ 189, 189, 189, 189, 189, 189, 189, 189,
+ 189, 189, 189, 189, 189, 189, 189, 189,
+ 189, 189, 189, 189, 189, 189, 189, 189,
+ 189,
+ },
+ { /* Fourth byte table 97. */
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 3, 6, 9, 12, 15,
+ 18, 18, 18, 21, 24, 27, 30, 33,
+ 36, 36, 36, 39, 42, 45, 48, 51,
+ 54, 54, 54, 57, 60, 63, 63, 63,
+ 63, 65, 67, 69, 72, 74, 76, 79,
+ 79, 82, 85, 88, 91, 94, 97, 100,
+ 100, 100, 100, 100, 100, 100, 100, 100,
+ 100, 100, 100, 100, 100, 100, 100, 100,
+ 100, 100, 100, 100, 100, 100, 100, 100,
+ 100, 100, 100, 100, 100, 100, 100, 100,
+ 100, 100, 100, 100, 100, 100, 100, 100,
+ 100, 100, 100, 100, 100, 100, 100, 100,
+ 100, 100, 100, 100, 100, 100, 100, 100,
+ 100, 100, 100, 100, 100, 100, 100, 100,
+ 100, 100, 100, 100, 100, 100, 100, 100,
+ 100, 100, 100, 100, 100, 100, 100, 100,
+ 100,
+ },
+ { /* Fourth byte table 98. */
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 9,
+ 18, 31, 44, 57, 70, 83, 83, 83,
+ 83, 83, 83, 83, 83, 83, 83, 83,
+ 83, 83, 83, 83, 83, 83, 83, 83,
+ 83, 83, 83, 83, 83, 83, 83, 83,
+ 83, 83, 83, 83, 83, 83, 83, 83,
+ 83, 83, 83, 83, 83, 83, 83, 83,
+ 83, 83, 83, 83, 83, 83, 83, 83,
+ 83, 83, 83, 83, 83, 83, 83, 83,
+ 83, 83, 83, 83, 83, 83, 83, 83,
+ 83, 83, 83, 83, 83, 83, 83, 83,
+ 83, 83, 83, 83, 83, 83, 83, 83,
+ 83, 83, 83, 83, 83, 83, 83, 83,
+ 83,
+ },
+ { /* Fourth byte table 99. */
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 9, 18, 31, 44,
+ 57, 57, 57, 57, 57, 57, 57, 57,
+ 57, 57, 57, 57, 57, 57, 57, 57,
+ 57, 57, 57, 57, 57, 57, 57, 57,
+ 57, 57, 57, 57, 57, 57, 57, 57,
+ 57, 57, 57, 57, 57, 57, 57, 57,
+ 57, 57, 57, 57, 57, 57, 57, 57,
+ 57, 57, 57, 57, 57, 57, 57, 57,
+ 57, 57, 57, 57, 57, 57, 57, 57,
+ 57,
+ },
+ { /* Fourth byte table 100. */
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 13, 13, 13, 13, 13, 13, 13,
+ 13, 13, 13, 13, 13, 13, 13, 13,
+ 13, 13, 13, 13, 13, 13, 13, 13,
+ 13, 13, 13, 13, 13, 13, 13, 13,
+ 13, 13, 13, 13, 13, 13, 13, 13,
+ 13, 13, 13, 13, 13, 13, 13, 13,
+ 13, 13, 13, 13, 13, 13, 13, 13,
+ 13, 13, 13, 13, 13, 13, 13, 13,
+ 13, 13, 13, 13, 13, 13, 13, 13,
+ 13, 13, 13, 13, 13, 13, 13, 13,
+ 13, 13, 13, 13, 13, 13, 13, 13,
+ 13, 13, 13, 13, 13, 13, 13, 13,
+ 13, 13, 13, 13, 13, 13, 13, 13,
+ 13, 13, 13, 13, 13, 13, 13, 13,
+ 13, 13, 13, 13, 13, 13, 13, 13,
+ 13, 13, 13, 13, 13, 13, 13, 13,
+ 13,
+ },
+ { /* Fourth byte table 101. */
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 1, 2, 3, 4, 5, 6, 7,
+ 8, 9, 10, 11, 12, 13, 14, 15,
+ 16, 17, 18, 19, 20, 21, 22, 23,
+ 24, 25, 26, 27, 28, 29, 30, 31,
+ 32, 33, 34, 35, 36, 37, 38, 39,
+ 40, 41, 42, 43, 44, 45, 46, 47,
+ 48, 49, 50, 51, 52, 53, 54, 55,
+ 56, 57, 58, 59, 60, 61, 62, 63,
+ 64, 64, 64, 64, 64, 64, 64, 64,
+ 64, 64, 64, 64, 64, 64, 64, 64,
+ 64, 64, 64, 64, 64, 64, 64, 64,
+ 64, 64, 64, 64, 64, 64, 64, 64,
+ 64, 64, 64, 64, 64, 64, 64, 64,
+ 64, 64, 64, 64, 64, 64, 64, 64,
+ 64, 64, 64, 64, 64, 64, 64, 64,
+ 64, 64, 64, 64, 64, 64, 64, 64,
+ 64,
+ },
+ { /* Fourth byte table 102. */
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 1, 2, 3, 4, 5, 6, 7,
+ 8, 9, 10, 11, 12, 13, 14, 15,
+ 16, 17, 18, 19, 20, 21, 21, 22,
+ 23, 24, 25, 26, 27, 28, 29, 30,
+ 31, 32, 33, 34, 35, 36, 37, 38,
+ 39, 40, 41, 42, 43, 44, 45, 46,
+ 47, 48, 49, 50, 51, 52, 53, 54,
+ 55, 56, 57, 58, 59, 60, 61, 62,
+ 63, 63, 63, 63, 63, 63, 63, 63,
+ 63, 63, 63, 63, 63, 63, 63, 63,
+ 63, 63, 63, 63, 63, 63, 63, 63,
+ 63, 63, 63, 63, 63, 63, 63, 63,
+ 63, 63, 63, 63, 63, 63, 63, 63,
+ 63, 63, 63, 63, 63, 63, 63, 63,
+ 63, 63, 63, 63, 63, 63, 63, 63,
+ 63, 63, 63, 63, 63, 63, 63, 63,
+ 63,
+ },
+ { /* Fourth byte table 103. */
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 1, 2, 3, 4, 5, 6, 7,
+ 8, 9, 10, 11, 12, 13, 14, 15,
+ 16, 17, 18, 19, 20, 21, 22, 23,
+ 24, 25, 26, 27, 28, 29, 29, 30,
+ 31, 31, 31, 32, 32, 32, 33, 34,
+ 34, 34, 35, 36, 37, 38, 38, 39,
+ 40, 41, 42, 43, 44, 45, 46, 47,
+ 48, 49, 50, 50, 51, 51, 52, 53,
+ 54, 54, 54, 54, 54, 54, 54, 54,
+ 54, 54, 54, 54, 54, 54, 54, 54,
+ 54, 54, 54, 54, 54, 54, 54, 54,
+ 54, 54, 54, 54, 54, 54, 54, 54,
+ 54, 54, 54, 54, 54, 54, 54, 54,
+ 54, 54, 54, 54, 54, 54, 54, 54,
+ 54, 54, 54, 54, 54, 54, 54, 54,
+ 54, 54, 54, 54, 54, 54, 54, 54,
+ 54,
+ },
+ { /* Fourth byte table 104. */
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 1, 2, 3, 4, 4, 5, 6,
+ 7, 8, 9, 10, 11, 12, 13, 14,
+ 15, 16, 17, 18, 19, 20, 21, 22,
+ 23, 24, 25, 26, 27, 28, 29, 30,
+ 31, 32, 33, 34, 35, 36, 37, 38,
+ 39, 40, 41, 42, 43, 44, 45, 46,
+ 47, 48, 49, 50, 51, 52, 53, 54,
+ 55, 56, 57, 58, 59, 60, 61, 62,
+ 63, 63, 63, 63, 63, 63, 63, 63,
+ 63, 63, 63, 63, 63, 63, 63, 63,
+ 63, 63, 63, 63, 63, 63, 63, 63,
+ 63, 63, 63, 63, 63, 63, 63, 63,
+ 63, 63, 63, 63, 63, 63, 63, 63,
+ 63, 63, 63, 63, 63, 63, 63, 63,
+ 63, 63, 63, 63, 63, 63, 63, 63,
+ 63, 63, 63, 63, 63, 63, 63, 63,
+ 63,
+ },
+ { /* Fourth byte table 105. */
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 1, 2, 3, 4, 5, 6, 6,
+ 7, 8, 9, 10, 10, 10, 11, 12,
+ 13, 14, 15, 16, 17, 18, 18, 19,
+ 20, 21, 22, 23, 24, 25, 25, 26,
+ 27, 28, 29, 30, 31, 32, 33, 34,
+ 35, 36, 37, 38, 39, 40, 41, 42,
+ 43, 44, 45, 46, 47, 48, 49, 50,
+ 51, 52, 53, 53, 54, 55, 56, 57,
+ 57, 57, 57, 57, 57, 57, 57, 57,
+ 57, 57, 57, 57, 57, 57, 57, 57,
+ 57, 57, 57, 57, 57, 57, 57, 57,
+ 57, 57, 57, 57, 57, 57, 57, 57,
+ 57, 57, 57, 57, 57, 57, 57, 57,
+ 57, 57, 57, 57, 57, 57, 57, 57,
+ 57, 57, 57, 57, 57, 57, 57, 57,
+ 57, 57, 57, 57, 57, 57, 57, 57,
+ 57,
+ },
+ { /* Fourth byte table 106. */
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 1, 2, 3, 4, 5, 5, 6,
+ 6, 6, 6, 7, 8, 9, 10, 11,
+ 12, 13, 13, 14, 15, 16, 17, 18,
+ 19, 20, 21, 22, 23, 24, 25, 26,
+ 27, 28, 29, 30, 31, 32, 33, 34,
+ 35, 36, 37, 38, 39, 40, 41, 42,
+ 43, 44, 45, 46, 47, 48, 49, 50,
+ 51, 52, 53, 54, 55, 56, 57, 58,
+ 59, 59, 59, 59, 59, 59, 59, 59,
+ 59, 59, 59, 59, 59, 59, 59, 59,
+ 59, 59, 59, 59, 59, 59, 59, 59,
+ 59, 59, 59, 59, 59, 59, 59, 59,
+ 59, 59, 59, 59, 59, 59, 59, 59,
+ 59, 59, 59, 59, 59, 59, 59, 59,
+ 59, 59, 59, 59, 59, 59, 59, 59,
+ 59, 59, 59, 59, 59, 59, 59, 59,
+ 59,
+ },
+ { /* Fourth byte table 107. */
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 1, 2, 3, 4, 5, 6, 7,
+ 8, 9, 10, 11, 12, 13, 14, 15,
+ 16, 17, 18, 19, 20, 21, 22, 23,
+ 24, 25, 26, 27, 28, 29, 30, 31,
+ 32, 33, 34, 35, 36, 37, 38, 39,
+ 40, 41, 42, 43, 44, 45, 46, 47,
+ 48, 49, 50, 51, 52, 53, 54, 55,
+ 56, 57, 58, 59, 60, 61, 62, 63,
+ 64, 64, 64, 64, 64, 64, 64, 64,
+ 64, 64, 64, 64, 64, 64, 64, 64,
+ 64, 64, 64, 64, 64, 64, 64, 64,
+ 64, 64, 64, 64, 64, 64, 64, 64,
+ 64, 64, 64, 64, 64, 64, 64, 64,
+ 64, 64, 64, 64, 64, 64, 64, 64,
+ 64, 64, 64, 64, 64, 64, 64, 64,
+ 64, 64, 64, 64, 64, 64, 64, 64,
+ 64,
+ },
+ { /* Fourth byte table 108. */
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 1, 2, 3, 4, 5, 6, 7,
+ 8, 9, 10, 11, 12, 13, 14, 15,
+ 16, 17, 18, 19, 20, 21, 22, 23,
+ 24, 25, 26, 27, 28, 29, 30, 31,
+ 32, 33, 34, 35, 36, 37, 38, 39,
+ 40, 41, 42, 43, 44, 45, 46, 47,
+ 48, 49, 50, 51, 52, 53, 54, 55,
+ 56, 57, 58, 59, 60, 61, 62, 63,
+ 64, 64, 64, 64, 64, 64, 64, 64,
+ 64, 64, 64, 64, 64, 64, 64, 64,
+ 64, 64, 64, 64, 64, 64, 64, 64,
+ 64, 64, 64, 64, 64, 64, 64, 64,
+ 64, 64, 64, 64, 64, 64, 64, 64,
+ 64, 64, 64, 64, 64, 64, 64, 64,
+ 64, 64, 64, 64, 64, 64, 64, 64,
+ 64, 64, 64, 64, 64, 64, 64, 64,
+ 64,
+ },
+ { /* Fourth byte table 109. */
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 1, 2, 3, 4, 5, 6, 7,
+ 8, 9, 10, 11, 12, 13, 14, 15,
+ 16, 17, 18, 19, 20, 21, 22, 23,
+ 24, 25, 26, 27, 28, 29, 30, 31,
+ 32, 33, 34, 35, 36, 37, 38, 39,
+ 40, 41, 42, 43, 44, 45, 46, 47,
+ 48, 49, 50, 51, 52, 53, 54, 55,
+ 56, 57, 58, 59, 60, 61, 62, 63,
+ 64, 64, 64, 64, 64, 64, 64, 64,
+ 64, 64, 64, 64, 64, 64, 64, 64,
+ 64, 64, 64, 64, 64, 64, 64, 64,
+ 64, 64, 64, 64, 64, 64, 64, 64,
+ 64, 64, 64, 64, 64, 64, 64, 64,
+ 64, 64, 64, 64, 64, 64, 64, 64,
+ 64, 64, 64, 64, 64, 64, 64, 64,
+ 64, 64, 64, 64, 64, 64, 64, 64,
+ 64,
+ },
+ { /* Fourth byte table 110. */
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 1, 2, 3, 4, 5, 6, 7,
+ 8, 9, 10, 11, 12, 13, 14, 15,
+ 16, 17, 18, 19, 20, 21, 22, 23,
+ 24, 25, 26, 27, 28, 29, 30, 31,
+ 32, 33, 34, 35, 36, 37, 38, 39,
+ 40, 41, 42, 43, 44, 45, 46, 47,
+ 48, 49, 50, 51, 52, 53, 54, 55,
+ 56, 57, 58, 59, 60, 61, 62, 63,
+ 64, 64, 64, 64, 64, 64, 64, 64,
+ 64, 64, 64, 64, 64, 64, 64, 64,
+ 64, 64, 64, 64, 64, 64, 64, 64,
+ 64, 64, 64, 64, 64, 64, 64, 64,
+ 64, 64, 64, 64, 64, 64, 64, 64,
+ 64, 64, 64, 64, 64, 64, 64, 64,
+ 64, 64, 64, 64, 64, 64, 64, 64,
+ 64, 64, 64, 64, 64, 64, 64, 64,
+ 64,
+ },
+ { /* Fourth byte table 111. */
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 1, 2, 3, 4, 5, 6, 7,
+ 8, 9, 10, 11, 12, 13, 14, 15,
+ 16, 17, 18, 19, 20, 21, 22, 23,
+ 24, 25, 26, 27, 28, 29, 30, 31,
+ 32, 33, 34, 35, 36, 38, 40, 40,
+ 40, 42, 44, 46, 48, 50, 52, 54,
+ 56, 58, 60, 62, 64, 66, 68, 70,
+ 72, 74, 76, 78, 80, 82, 84, 86,
+ 88, 88, 88, 88, 88, 88, 88, 88,
+ 88, 88, 88, 88, 88, 88, 88, 88,
+ 88, 88, 88, 88, 88, 88, 88, 88,
+ 88, 88, 88, 88, 88, 88, 88, 88,
+ 88, 88, 88, 88, 88, 88, 88, 88,
+ 88, 88, 88, 88, 88, 88, 88, 88,
+ 88, 88, 88, 88, 88, 88, 88, 88,
+ 88, 88, 88, 88, 88, 88, 88, 88,
+ 88,
+ },
+ { /* Fourth byte table 112. */
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 2, 5, 7, 9, 11, 13, 15,
+ 17, 19, 21, 23, 25, 27, 29, 31,
+ 33, 35, 37, 39, 41, 43, 45, 47,
+ 49, 51, 53, 55, 58, 60, 62, 64,
+ 66, 68, 70, 72, 74, 76, 78, 80,
+ 82, 84, 86, 88, 90, 92, 94, 96,
+ 98, 100, 102, 104, 106, 108, 110, 112,
+ 114, 116, 118, 120, 123, 125, 127, 129,
+ 131, 131, 131, 131, 131, 131, 131, 131,
+ 131, 131, 131, 131, 131, 131, 131, 131,
+ 131, 131, 131, 131, 131, 131, 131, 131,
+ 131, 131, 131, 131, 131, 131, 131, 131,
+ 131, 131, 131, 131, 131, 131, 131, 131,
+ 131, 131, 131, 131, 131, 131, 131, 131,
+ 131, 131, 131, 131, 131, 131, 131, 131,
+ 131, 131, 131, 131, 131, 131, 131, 131,
+ 131,
+ },
+ { /* Fourth byte table 113. */
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 2, 4, 6, 8, 10, 12, 14,
+ 16, 18, 20, 22, 24, 26, 28, 30,
+ 32, 34, 36, 38, 40, 42, 45, 47,
+ 49, 51, 53, 55, 57, 59, 61, 63,
+ 65, 67, 69, 71, 73, 75, 77, 79,
+ 81, 83, 85, 87, 89, 91, 93, 95,
+ 97, 99, 101, 103, 105, 107, 110, 112,
+ 114, 116, 118, 120, 122, 124, 126, 128,
+ 130, 130, 130, 130, 130, 130, 130, 130,
+ 130, 130, 130, 130, 130, 130, 130, 130,
+ 130, 130, 130, 130, 130, 130, 130, 130,
+ 130, 130, 130, 130, 130, 130, 130, 130,
+ 130, 130, 130, 130, 130, 130, 130, 130,
+ 130, 130, 130, 130, 130, 130, 130, 130,
+ 130, 130, 130, 130, 130, 130, 130, 130,
+ 130, 130, 130, 130, 130, 130, 130, 130,
+ 130,
+ },
+ { /* Fourth byte table 114. */
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 2, 4, 6, 8, 10, 12, 14,
+ 16, 18, 20, 22, 24, 26, 28, 30,
+ 33, 35, 37, 39, 41, 43, 45, 47,
+ 49, 51, 53, 55, 57, 59, 61, 63,
+ 65, 67, 69, 71, 73, 75, 77, 79,
+ 81, 83, 85, 87, 89, 91, 93, 95,
+ 98, 100, 102, 104, 106, 108, 110, 112,
+ 114, 116, 118, 120, 122, 124, 126, 128,
+ 130, 130, 130, 130, 130, 130, 130, 130,
+ 130, 130, 130, 130, 130, 130, 130, 130,
+ 130, 130, 130, 130, 130, 130, 130, 130,
+ 130, 130, 130, 130, 130, 130, 130, 130,
+ 130, 130, 130, 130, 130, 130, 130, 130,
+ 130, 130, 130, 130, 130, 130, 130, 130,
+ 130, 130, 130, 130, 130, 130, 130, 130,
+ 130, 130, 130, 130, 130, 130, 130, 130,
+ 130,
+ },
+ { /* Fourth byte table 115. */
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 2, 4, 6, 8, 10, 12, 14,
+ 16, 18, 21, 23, 25, 27, 29, 31,
+ 33, 35, 37, 39, 41, 43, 45, 47,
+ 49, 51, 53, 55, 57, 59, 61, 63,
+ 65, 67, 69, 71, 73, 75, 77, 79,
+ 81, 83, 86, 88, 90, 92, 94, 96,
+ 98, 100, 102, 104, 106, 108, 110, 112,
+ 114, 116, 118, 120, 122, 124, 126, 128,
+ 130, 130, 130, 130, 130, 130, 130, 130,
+ 130, 130, 130, 130, 130, 130, 130, 130,
+ 130, 130, 130, 130, 130, 130, 130, 130,
+ 130, 130, 130, 130, 130, 130, 130, 130,
+ 130, 130, 130, 130, 130, 130, 130, 130,
+ 130, 130, 130, 130, 130, 130, 130, 130,
+ 130, 130, 130, 130, 130, 130, 130, 130,
+ 130, 130, 130, 130, 130, 130, 130, 130,
+ 130,
+ },
+ { /* Fourth byte table 116. */
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 2, 4, 6, 9, 11, 13, 15,
+ 17, 19, 21, 23, 25, 25, 25, 26,
+ 27, 28, 29, 30, 31, 32, 33, 34,
+ 35, 36, 37, 38, 39, 40, 41, 42,
+ 43, 44, 45, 46, 47, 48, 49, 50,
+ 51, 52, 53, 54, 55, 56, 57, 58,
+ 59, 60, 61, 62, 63, 64, 65, 66,
+ 67, 68, 69, 70, 71, 72, 73, 74,
+ 75, 75, 75, 75, 75, 75, 75, 75,
+ 75, 75, 75, 75, 75, 75, 75, 75,
+ 75, 75, 75, 75, 75, 75, 75, 75,
+ 75, 75, 75, 75, 75, 75, 75, 75,
+ 75, 75, 75, 75, 75, 75, 75, 75,
+ 75, 75, 75, 75, 75, 75, 75, 75,
+ 75, 75, 75, 75, 75, 75, 75, 75,
+ 75, 75, 75, 75, 75, 75, 75, 75,
+ 75,
+ },
+ { /* Fourth byte table 117. */
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 4, 9, 13, 17, 21, 25, 29,
+ 33, 37, 42, 46, 50, 54, 58, 62,
+ 66, 71, 75, 80, 85, 90, 94, 98,
+ 102, 106, 110, 114, 118, 122, 127, 127,
+ 127, 127, 127, 127, 127, 127, 127, 127,
+ 127, 127, 127, 127, 127, 127, 127, 127,
+ 127, 127, 127, 127, 127, 127, 127, 127,
+ 127, 127, 127, 127, 127, 127, 127, 127,
+ 127, 127, 127, 127, 127, 127, 127, 127,
+ 127, 127, 127, 127, 127, 127, 127, 127,
+ 127, 127, 127, 127, 127, 127, 127, 127,
+ 127, 127, 127, 127, 127, 127, 127, 127,
+ 127, 127, 127, 127, 127, 127, 127, 127,
+ 127, 127, 127, 127, 127, 127, 127, 127,
+ 127, 127, 127, 127, 127, 127, 127, 127,
+ 127, 127, 127, 127, 127, 127, 127, 127,
+ 127,
+ },
+ },
+};
+
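+/*
+ * Like u8_decomp_b4_tbl above, each of the following tables appears to
+ * hold cumulative offsets indexed by the fourth byte of a UTF-8
+ * sequence: entries i and i + 1 bound the decomposition data for byte
+ * value i, with the extra 257th entry closing the final range.  A
+ * uint16_t table is presumably emitted by the generator wherever the
+ * running offsets outgrow a uint8_t.
+ */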
+static const uint16_t u8_decomp_b4_16bit_tbl[2][30][257] = {
+ {
+ { /* Fourth byte 16-bit table 0. */
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 4, 8, 12, 16, 20, 24, 28,
+ 32, 38, 44, 48, 52, 56, 60, 64,
+ 68, 72, 76, 80, 84, 90, 96, 102,
+ 108, 112, 116, 120, 124, 130, 136, 140,
+ 144, 148, 152, 156, 160, 164, 168, 172,
+ 176, 180, 184, 188, 192, 196, 200, 206,
+ 212, 216, 220, 224, 228, 232, 236, 240,
+ 244, 250, 256, 260, 264, 268, 272, 276,
+ 280, 280, 280, 280, 280, 280, 280, 280,
+ 280, 280, 280, 280, 280, 280, 280, 280,
+ 280, 280, 280, 280, 280, 280, 280, 280,
+ 280, 280, 280, 280, 280, 280, 280, 280,
+ 280, 280, 280, 280, 280, 280, 280, 280,
+ 280, 280, 280, 280, 280, 280, 280, 280,
+ 280, 280, 280, 280, 280, 280, 280, 280,
+ 280, 280, 280, 280, 280, 280, 280, 280,
+ 280,
+ },
+ { /* Fourth byte 16-bit table 1. */
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 4, 8, 12, 16, 20, 24, 28,
+ 32, 36, 40, 44, 48, 54, 60, 66,
+ 72, 78, 84, 90, 96, 100, 104, 108,
+ 112, 116, 120, 124, 128, 134, 140, 144,
+ 148, 152, 156, 160, 164, 170, 176, 182,
+ 188, 194, 200, 204, 208, 212, 216, 220,
+ 224, 228, 232, 236, 240, 244, 248, 252,
+ 256, 262, 268, 274, 280, 284, 288, 292,
+ 296, 296, 296, 296, 296, 296, 296, 296,
+ 296, 296, 296, 296, 296, 296, 296, 296,
+ 296, 296, 296, 296, 296, 296, 296, 296,
+ 296, 296, 296, 296, 296, 296, 296, 296,
+ 296, 296, 296, 296, 296, 296, 296, 296,
+ 296, 296, 296, 296, 296, 296, 296, 296,
+ 296, 296, 296, 296, 296, 296, 296, 296,
+ 296, 296, 296, 296, 296, 296, 296, 296,
+ 296,
+ },
+ { /* Fourth byte 16-bit table 2. */
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 4, 8, 12, 16, 20, 24, 28,
+ 32, 36, 40, 44, 48, 52, 56, 60,
+ 64, 68, 72, 76, 80, 84, 88, 92,
+ 96, 100, 104, 107, 116, 116, 116, 116,
+ 116, 120, 124, 128, 132, 138, 144, 150,
+ 156, 162, 168, 174, 180, 186, 192, 198,
+ 204, 210, 216, 222, 228, 234, 240, 246,
+ 252, 256, 260, 264, 268, 272, 276, 282,
+ 288, 288, 288, 288, 288, 288, 288, 288,
+ 288, 288, 288, 288, 288, 288, 288, 288,
+ 288, 288, 288, 288, 288, 288, 288, 288,
+ 288, 288, 288, 288, 288, 288, 288, 288,
+ 288, 288, 288, 288, 288, 288, 288, 288,
+ 288, 288, 288, 288, 288, 288, 288, 288,
+ 288, 288, 288, 288, 288, 288, 288, 288,
+ 288, 288, 288, 288, 288, 288, 288, 288,
+ 288,
+ },
+ { /* Fourth byte 16-bit table 3. */
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 6, 12, 18, 24, 30, 36, 42,
+ 48, 52, 56, 60, 64, 68, 72, 76,
+ 80, 86, 92, 98, 104, 110, 116, 122,
+ 128, 134, 140, 146, 152, 158, 164, 170,
+ 176, 182, 188, 194, 200, 204, 208, 212,
+ 216, 222, 228, 234, 240, 246, 252, 258,
+ 264, 270, 276, 280, 284, 288, 292, 296,
+ 300, 304, 308, 308, 308, 308, 308, 308,
+ 308, 308, 308, 308, 308, 308, 308, 308,
+ 308, 308, 308, 308, 308, 308, 308, 308,
+ 308, 308, 308, 308, 308, 308, 308, 308,
+ 308, 308, 308, 308, 308, 308, 308, 308,
+ 308, 308, 308, 308, 308, 308, 308, 308,
+ 308, 308, 308, 308, 308, 308, 308, 308,
+ 308, 308, 308, 308, 308, 308, 308, 308,
+ 308, 308, 308, 308, 308, 308, 308, 308,
+ 308,
+ },
+ { /* Fourth byte 16-bit table 4. */
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 5, 10, 17, 24, 31, 38, 45,
+ 52, 57, 62, 69, 76, 83, 90, 97,
+ 104, 109, 114, 121, 128, 135, 142, 142,
+ 142, 147, 152, 159, 166, 173, 180, 180,
+ 180, 185, 190, 197, 204, 211, 218, 225,
+ 232, 237, 242, 249, 256, 263, 270, 277,
+ 284, 289, 294, 301, 308, 315, 322, 329,
+ 336, 341, 346, 353, 360, 367, 374, 381,
+ 388, 388, 388, 388, 388, 388, 388, 388,
+ 388, 388, 388, 388, 388, 388, 388, 388,
+ 388, 388, 388, 388, 388, 388, 388, 388,
+ 388, 388, 388, 388, 388, 388, 388, 388,
+ 388, 388, 388, 388, 388, 388, 388, 388,
+ 388, 388, 388, 388, 388, 388, 388, 388,
+ 388, 388, 388, 388, 388, 388, 388, 388,
+ 388, 388, 388, 388, 388, 388, 388, 388,
+ 388,
+ },
+ { /* Fourth byte 16-bit table 5. */
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 5, 10, 17, 24, 31, 38, 38,
+ 38, 43, 48, 55, 62, 69, 76, 76,
+ 76, 81, 86, 93, 100, 107, 114, 121,
+ 128, 128, 133, 133, 140, 140, 147, 147,
+ 154, 159, 164, 171, 178, 185, 192, 199,
+ 206, 211, 216, 223, 230, 237, 244, 251,
+ 258, 263, 268, 273, 278, 283, 288, 293,
+ 298, 303, 308, 313, 318, 323, 328, 328,
+ 328, 328, 328, 328, 328, 328, 328, 328,
+ 328, 328, 328, 328, 328, 328, 328, 328,
+ 328, 328, 328, 328, 328, 328, 328, 328,
+ 328, 328, 328, 328, 328, 328, 328, 328,
+ 328, 328, 328, 328, 328, 328, 328, 328,
+ 328, 328, 328, 328, 328, 328, 328, 328,
+ 328, 328, 328, 328, 328, 328, 328, 328,
+ 328, 328, 328, 328, 328, 328, 328, 328,
+ 328,
+ },
+ { /* Fourth byte 16-bit table 6. */
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 7, 14, 23, 32, 41, 50, 59,
+ 68, 75, 82, 91, 100, 109, 118, 127,
+ 136, 143, 150, 159, 168, 177, 186, 195,
+ 204, 211, 218, 227, 236, 245, 254, 263,
+ 272, 279, 286, 295, 304, 313, 322, 331,
+ 340, 347, 354, 363, 372, 381, 390, 399,
+ 408, 413, 418, 425, 430, 437, 437, 442,
+ 449, 454, 459, 464, 469, 474, 477, 480,
+ 483, 483, 483, 483, 483, 483, 483, 483,
+ 483, 483, 483, 483, 483, 483, 483, 483,
+ 483, 483, 483, 483, 483, 483, 483, 483,
+ 483, 483, 483, 483, 483, 483, 483, 483,
+ 483, 483, 483, 483, 483, 483, 483, 483,
+ 483, 483, 483, 483, 483, 483, 483, 483,
+ 483, 483, 483, 483, 483, 483, 483, 483,
+ 483, 483, 483, 483, 483, 483, 483, 483,
+ 483,
+ },
+ { /* Fourth byte 16-bit table 7. */
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 3, 14, 21, 26, 33, 33, 38,
+ 45, 50, 55, 60, 65, 70, 82, 94,
+ 106, 111, 116, 123, 130, 130, 130, 135,
+ 142, 147, 152, 157, 162, 162, 174, 186,
+ 198, 203, 208, 215, 222, 227, 232, 237,
+ 244, 249, 254, 259, 264, 269, 280, 291,
+ 293, 293, 293, 300, 305, 312, 312, 317,
+ 324, 329, 334, 339, 344, 349, 356, 359,
+ 359, 359, 359, 359, 359, 359, 359, 359,
+ 359, 359, 359, 359, 359, 359, 359, 359,
+ 359, 359, 359, 359, 359, 359, 359, 359,
+ 359, 359, 359, 359, 359, 359, 359, 359,
+ 359, 359, 359, 359, 359, 359, 359, 359,
+ 359, 359, 359, 359, 359, 359, 359, 359,
+ 359, 359, 359, 359, 359, 359, 359, 359,
+ 359, 359, 359, 359, 359, 359, 359, 359,
+ 359,
+ },
+ { /* Fourth byte 16-bit table 8. */
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 5, 10, 15, 20, 25, 30, 35,
+ 40, 45, 50, 55, 60, 65, 70, 78,
+ 86, 94, 102, 110, 118, 126, 134, 142,
+ 150, 158, 166, 174, 182, 190, 190, 190,
+ 190, 195, 200, 205, 210, 215, 220, 225,
+ 230, 235, 240, 245, 250, 255, 260, 265,
+ 270, 275, 280, 285, 290, 295, 300, 305,
+ 310, 315, 320, 325, 330, 335, 340, 345,
+ 350, 350, 350, 350, 350, 350, 350, 350,
+ 350, 350, 350, 350, 350, 350, 350, 350,
+ 350, 350, 350, 350, 350, 350, 350, 350,
+ 350, 350, 350, 350, 350, 350, 350, 350,
+ 350, 350, 350, 350, 350, 350, 350, 350,
+ 350, 350, 350, 350, 350, 350, 350, 350,
+ 350, 350, 350, 350, 350, 350, 350, 350,
+ 350, 350, 350, 350, 350, 350, 350, 350,
+ 350,
+ },
+ { /* Fourth byte 16-bit table 9. */
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 15, 27, 42, 51, 66, 75, 84,
+ 102, 114, 123, 132, 141, 153, 165, 177,
+ 189, 201, 213, 225, 243, 249, 267, 285,
+ 300, 312, 330, 348, 360, 369, 378, 390,
+ 402, 417, 432, 441, 450, 462, 471, 480,
+ 486, 492, 501, 510, 528, 540, 555, 573,
+ 585, 594, 603, 621, 633, 651, 660, 675,
+ 684, 696, 705, 717, 732, 744, 759, 771,
+ 777, 777, 777, 777, 777, 777, 777, 777,
+ 777, 777, 777, 777, 777, 777, 777, 777,
+ 777, 777, 777, 777, 777, 777, 777, 777,
+ 777, 777, 777, 777, 777, 777, 777, 777,
+ 777, 777, 777, 777, 777, 777, 777, 777,
+ 777, 777, 777, 777, 777, 777, 777, 777,
+ 777, 777, 777, 777, 777, 777, 777, 777,
+ 777, 777, 777, 777, 777, 777, 777, 777,
+ 777,
+ },
+ { /* Fourth byte 16-bit table 10. */
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 15, 24, 33, 45, 54, 63, 72,
+ 87, 99, 105, 123, 132, 147, 159, 171,
+ 180, 189, 201, 207, 219, 234, 240, 258,
+ 267, 271, 275, 279, 283, 287, 291, 295,
+ 299, 303, 307, 312, 317, 322, 327, 332,
+ 337, 342, 347, 352, 357, 362, 367, 372,
+ 377, 382, 385, 387, 389, 392, 394, 396,
+ 396, 396, 396, 396, 402, 408, 414, 420,
+ 432, 432, 432, 432, 432, 432, 432, 432,
+ 432, 432, 432, 432, 432, 432, 432, 432,
+ 432, 432, 432, 432, 432, 432, 432, 432,
+ 432, 432, 432, 432, 432, 432, 432, 432,
+ 432, 432, 432, 432, 432, 432, 432, 432,
+ 432, 432, 432, 432, 432, 432, 432, 432,
+ 432, 432, 432, 432, 432, 432, 432, 432,
+ 432, 432, 432, 432, 432, 432, 432, 432,
+ 432,
+ },
+ { /* Fourth byte 16-bit table 11. */
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 4, 8, 12, 16, 20, 24, 28,
+ 32, 36, 40, 44, 48, 52, 56, 60,
+ 64, 68, 72, 76, 80, 84, 88, 92,
+ 96, 100, 104, 108, 112, 116, 120, 124,
+ 128, 132, 136, 140, 144, 148, 152, 156,
+ 160, 164, 168, 172, 176, 180, 184, 188,
+ 192, 196, 200, 204, 208, 212, 216, 220,
+ 224, 228, 232, 236, 240, 244, 248, 252,
+ 256, 256, 256, 256, 256, 256, 256, 256,
+ 256, 256, 256, 256, 256, 256, 256, 256,
+ 256, 256, 256, 256, 256, 256, 256, 256,
+ 256, 256, 256, 256, 256, 256, 256, 256,
+ 256, 256, 256, 256, 256, 256, 256, 256,
+ 256, 256, 256, 256, 256, 256, 256, 256,
+ 256, 256, 256, 256, 256, 256, 256, 256,
+ 256, 256, 256, 256, 256, 256, 256, 256,
+ 256,
+ },
+ { /* Fourth byte 16-bit table 12. */
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 4, 8, 12, 16, 20, 24, 28,
+ 32, 36, 40, 44, 48, 52, 56, 60,
+ 64, 68, 72, 76, 80, 84, 88, 92,
+ 96, 100, 104, 108, 112, 116, 120, 124,
+ 128, 132, 136, 140, 144, 148, 152, 156,
+ 160, 164, 168, 172, 176, 180, 184, 188,
+ 192, 196, 200, 204, 208, 212, 216, 220,
+ 224, 228, 232, 236, 240, 244, 248, 252,
+ 256, 256, 256, 256, 256, 256, 256, 256,
+ 256, 256, 256, 256, 256, 256, 256, 256,
+ 256, 256, 256, 256, 256, 256, 256, 256,
+ 256, 256, 256, 256, 256, 256, 256, 256,
+ 256, 256, 256, 256, 256, 256, 256, 256,
+ 256, 256, 256, 256, 256, 256, 256, 256,
+ 256, 256, 256, 256, 256, 256, 256, 256,
+ 256, 256, 256, 256, 256, 256, 256, 256,
+ 256,
+ },
+ { /* Fourth byte 16-bit table 13. */
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 4, 8, 12, 16, 20, 24, 28,
+ 32, 36, 40, 44, 48, 52, 56, 60,
+ 64, 68, 72, 76, 80, 84, 88, 92,
+ 96, 100, 104, 108, 112, 116, 120, 124,
+ 128, 132, 136, 140, 144, 148, 152, 156,
+ 160, 164, 168, 172, 176, 180, 184, 188,
+ 192, 196, 200, 204, 208, 212, 216, 220,
+ 224, 228, 232, 236, 240, 244, 248, 252,
+ 256, 256, 256, 256, 256, 256, 256, 256,
+ 256, 256, 256, 256, 256, 256, 256, 256,
+ 256, 256, 256, 256, 256, 256, 256, 256,
+ 256, 256, 256, 256, 256, 256, 256, 256,
+ 256, 256, 256, 256, 256, 256, 256, 256,
+ 256, 256, 256, 256, 256, 256, 256, 256,
+ 256, 256, 256, 256, 256, 256, 256, 256,
+ 256, 256, 256, 256, 256, 256, 256, 256,
+ 256,
+ },
+ { /* Fourth byte 16-bit table 14. */
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 4, 8, 12, 16, 20, 24, 28,
+ 32, 36, 40, 44, 48, 52, 56, 60,
+ 64, 68, 72, 76, 80, 84, 88, 92,
+ 96, 100, 104, 108, 112, 116, 120, 124,
+ 128, 132, 136, 140, 144, 148, 152, 156,
+ 160, 164, 168, 172, 176, 180, 184, 188,
+ 192, 196, 200, 204, 208, 212, 216, 220,
+ 224, 228, 232, 236, 240, 244, 248, 252,
+ 256, 256, 256, 256, 256, 256, 256, 256,
+ 256, 256, 256, 256, 256, 256, 256, 256,
+ 256, 256, 256, 256, 256, 256, 256, 256,
+ 256, 256, 256, 256, 256, 256, 256, 256,
+ 256, 256, 256, 256, 256, 256, 256, 256,
+ 256, 256, 256, 256, 256, 256, 256, 256,
+ 256, 256, 256, 256, 256, 256, 256, 256,
+ 256, 256, 256, 256, 256, 256, 256, 256,
+ 256,
+ },
+ { /* Fourth byte 16-bit table 15. */
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 6, 12, 18, 24, 30, 34, 38,
+ 42, 46, 50, 54, 58, 62, 66, 70,
+ 74, 78, 82, 86, 90, 94, 98, 102,
+ 106, 110, 114, 118, 122, 126, 130, 134,
+ 138, 142, 146, 150, 154, 158, 162, 166,
+ 170, 174, 178, 182, 186, 190, 194, 198,
+ 202, 206, 210, 214, 218, 222, 226, 230,
+ 234, 238, 242, 246, 250, 254, 258, 262,
+ 266, 266, 266, 266, 266, 266, 266, 266,
+ 266, 266, 266, 266, 266, 266, 266, 266,
+ 266, 266, 266, 266, 266, 266, 266, 266,
+ 266, 266, 266, 266, 266, 266, 266, 266,
+ 266, 266, 266, 266, 266, 266, 266, 266,
+ 266, 266, 266, 266, 266, 266, 266, 266,
+ 266, 266, 266, 266, 266, 266, 266, 266,
+ 266, 266, 266, 266, 266, 266, 266, 266,
+ 266,
+ },
+ { /* Fourth byte 16-bit table 16. */
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 4, 8, 12, 16, 20, 24, 28,
+ 32, 36, 40, 44, 48, 52, 56, 60,
+ 64, 68, 72, 76, 80, 84, 88, 92,
+ 96, 100, 104, 108, 112, 116, 120, 125,
+ 130, 135, 140, 145, 150, 156, 162, 168,
+ 174, 180, 186, 190, 194, 198, 202, 206,
+ 210, 214, 218, 222, 226, 230, 234, 238,
+ 242, 246, 250, 254, 258, 262, 266, 270,
+ 274, 274, 274, 274, 274, 274, 274, 274,
+ 274, 274, 274, 274, 274, 274, 274, 274,
+ 274, 274, 274, 274, 274, 274, 274, 274,
+ 274, 274, 274, 274, 274, 274, 274, 274,
+ 274, 274, 274, 274, 274, 274, 274, 274,
+ 274, 274, 274, 274, 274, 274, 274, 274,
+ 274, 274, 274, 274, 274, 274, 274, 274,
+ 274, 274, 274, 274, 274, 274, 274, 274,
+ 274,
+ },
+ { /* Fourth byte 16-bit table 17. */
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 4, 8, 12, 16, 20, 24, 28,
+ 32, 36, 40, 44, 48, 52, 56, 60,
+ 64, 68, 72, 76, 80, 84, 88, 92,
+ 98, 104, 110, 116, 122, 126, 130, 134,
+ 138, 142, 146, 150, 154, 158, 162, 166,
+ 170, 174, 178, 182, 186, 190, 194, 198,
+ 202, 206, 210, 214, 218, 222, 226, 230,
+ 234, 238, 242, 246, 250, 254, 258, 262,
+ 266, 266, 266, 266, 266, 266, 266, 266,
+ 266, 266, 266, 266, 266, 266, 266, 266,
+ 266, 266, 266, 266, 266, 266, 266, 266,
+ 266, 266, 266, 266, 266, 266, 266, 266,
+ 266, 266, 266, 266, 266, 266, 266, 266,
+ 266, 266, 266, 266, 266, 266, 266, 266,
+ 266, 266, 266, 266, 266, 266, 266, 266,
+ 266, 266, 266, 266, 266, 266, 266, 266,
+ 266,
+ },
+ { /* Fourth byte 16-bit table 18. */
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 4, 8, 12, 16, 20, 24, 28,
+ 32, 36, 40, 44, 48, 52, 56, 60,
+ 64, 68, 72, 76, 80, 84, 88, 92,
+ 96, 100, 104, 108, 112, 116, 120, 124,
+ 130, 136, 140, 144, 148, 152, 156, 160,
+ 164, 168, 172, 176, 180, 184, 188, 192,
+ 196, 200, 204, 210, 216, 222, 226, 230,
+ 234, 238, 242, 246, 250, 254, 258, 262,
+ 266, 266, 266, 266, 266, 266, 266, 266,
+ 266, 266, 266, 266, 266, 266, 266, 266,
+ 266, 266, 266, 266, 266, 266, 266, 266,
+ 266, 266, 266, 266, 266, 266, 266, 266,
+ 266, 266, 266, 266, 266, 266, 266, 266,
+ 266, 266, 266, 266, 266, 266, 266, 266,
+ 266, 266, 266, 266, 266, 266, 266, 266,
+ 266, 266, 266, 266, 266, 266, 266, 266,
+ 266,
+ },
+ { /* Fourth byte 16-bit table 19. */
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 6, 12, 18, 24, 30, 36, 42,
+ 48, 54, 60, 66, 72, 78, 84, 90,
+ 96, 102, 108, 114, 120, 126, 132, 138,
+ 144, 150, 156, 162, 168, 174, 180, 186,
+ 192, 198, 204, 210, 216, 222, 228, 234,
+ 240, 246, 252, 258, 264, 270, 276, 282,
+ 288, 288, 288, 288, 288, 288, 288, 288,
+ 288, 288, 288, 288, 288, 288, 288, 288,
+ 288, 288, 288, 288, 288, 288, 288, 288,
+ 288, 288, 288, 288, 288, 288, 288, 288,
+ 288, 288, 288, 288, 288, 288, 288, 288,
+ 288, 288, 288, 288, 288, 288, 288, 288,
+ 288, 288, 288, 288, 288, 288, 288, 288,
+ 288, 288, 288, 288, 288, 288, 288, 288,
+ 288,
+ },
+ { /* Fourth byte 16-bit table 20. */
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 6, 12, 18, 24, 30, 36, 42,
+ 48, 54, 60, 66, 72, 78, 84, 90,
+ 96, 96, 96, 102, 108, 114, 120, 126,
+ 132, 138, 144, 150, 156, 162, 168, 174,
+ 180, 186, 192, 198, 204, 210, 216, 222,
+ 228, 234, 240, 246, 252, 258, 264, 270,
+ 276, 282, 288, 294, 300, 306, 312, 318,
+ 324, 330, 336, 342, 348, 354, 360, 366,
+ 372, 372, 372, 372, 372, 372, 372, 372,
+ 372, 372, 372, 372, 372, 372, 372, 372,
+ 372, 372, 372, 372, 372, 372, 372, 372,
+ 372, 372, 372, 372, 372, 372, 372, 372,
+ 372, 372, 372, 372, 372, 372, 372, 372,
+ 372, 372, 372, 372, 372, 372, 372, 372,
+ 372, 372, 372, 372, 372, 372, 372, 372,
+ 372, 372, 372, 372, 372, 372, 372, 372,
+ 372,
+ },
+ { /* Fourth byte 16-bit table 21. */
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 4, 8, 12, 17, 21, 25, 29,
+ 33, 37, 41, 45, 49, 53, 58, 62,
+ 66, 70, 74, 79, 83, 87, 91, 96,
+ 100, 104, 108, 112, 116, 121, 125, 129,
+ 133, 137, 141, 145, 149, 153, 157, 161,
+ 165, 169, 173, 177, 181, 185, 189, 193,
+ 197, 201, 205, 209, 213, 218, 222, 226,
+ 230, 235, 239, 243, 247, 251, 255, 259,
+ 263, 263, 263, 263, 263, 263, 263, 263,
+ 263, 263, 263, 263, 263, 263, 263, 263,
+ 263, 263, 263, 263, 263, 263, 263, 263,
+ 263, 263, 263, 263, 263, 263, 263, 263,
+ 263, 263, 263, 263, 263, 263, 263, 263,
+ 263, 263, 263, 263, 263, 263, 263, 263,
+ 263, 263, 263, 263, 263, 263, 263, 263,
+ 263, 263, 263, 263, 263, 263, 263, 263,
+ 263,
+ },
+ { /* Fourth byte 16-bit table 22. */
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 4, 8, 12, 16, 20, 24, 28,
+ 32, 36, 40, 44, 48, 52, 56, 60,
+ 64, 68, 72, 76, 80, 84, 88, 92,
+ 96, 100, 105, 109, 113, 117, 121, 125,
+ 129, 134, 139, 143, 147, 151, 155, 159,
+ 163, 167, 171, 175, 179, 184, 188, 192,
+ 196, 200, 205, 209, 213, 217, 221, 225,
+ 229, 233, 237, 241, 246, 250, 255, 259,
+ 263, 263, 263, 263, 263, 263, 263, 263,
+ 263, 263, 263, 263, 263, 263, 263, 263,
+ 263, 263, 263, 263, 263, 263, 263, 263,
+ 263, 263, 263, 263, 263, 263, 263, 263,
+ 263, 263, 263, 263, 263, 263, 263, 263,
+ 263, 263, 263, 263, 263, 263, 263, 263,
+ 263, 263, 263, 263, 263, 263, 263, 263,
+ 263, 263, 263, 263, 263, 263, 263, 263,
+ 263,
+ },
+ { /* Fourth byte 16-bit table 23. */
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 4, 8, 12, 16, 20, 24, 28,
+ 32, 36, 41, 45, 49, 53, 57, 61,
+ 66, 70, 75, 80, 84, 88, 92, 96,
+ 101, 106, 110, 114, 118, 122, 126, 130,
+ 134, 138, 142, 146, 150, 155, 159, 163,
+ 167, 171, 175, 179, 183, 187, 191, 195,
+ 199, 203, 207, 211, 215, 219, 223, 227,
+ 231, 236, 240, 244, 248, 252, 256, 261,
+ 265, 265, 265, 265, 265, 265, 265, 265,
+ 265, 265, 265, 265, 265, 265, 265, 265,
+ 265, 265, 265, 265, 265, 265, 265, 265,
+ 265, 265, 265, 265, 265, 265, 265, 265,
+ 265, 265, 265, 265, 265, 265, 265, 265,
+ 265, 265, 265, 265, 265, 265, 265, 265,
+ 265, 265, 265, 265, 265, 265, 265, 265,
+ 265, 265, 265, 265, 265, 265, 265, 265,
+ 265,
+ },
+ { /* Fourth byte 16-bit table 24. */
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 4, 8, 12, 16, 20, 24, 28,
+ 32, 36, 40, 45, 49, 53, 57, 61,
+ 65, 69, 73, 77, 81, 85, 89, 93,
+ 97, 101, 105, 109, 113, 117, 122, 126,
+ 130, 134, 138, 142, 147, 151, 155, 159,
+ 163, 167, 171, 175, 179, 184, 188, 192,
+ 196, 201, 205, 209, 213, 217, 221, 225,
+ 230, 235, 240, 244, 249, 253, 257, 261,
+ 265, 265, 265, 265, 265, 265, 265, 265,
+ 265, 265, 265, 265, 265, 265, 265, 265,
+ 265, 265, 265, 265, 265, 265, 265, 265,
+ 265, 265, 265, 265, 265, 265, 265, 265,
+ 265, 265, 265, 265, 265, 265, 265, 265,
+ 265, 265, 265, 265, 265, 265, 265, 265,
+ 265, 265, 265, 265, 265, 265, 265, 265,
+ 265, 265, 265, 265, 265, 265, 265, 265,
+ 265,
+ },
+ { /* Fourth byte 16-bit table 25. */
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 4, 8, 12, 16, 20, 24, 29,
+ 33, 37, 41, 45, 49, 53, 58, 62,
+ 66, 71, 76, 80, 84, 88, 92, 96,
+ 100, 104, 108, 112, 117, 121, 126, 130,
+ 135, 139, 143, 147, 152, 156, 160, 165,
+ 170, 174, 178, 182, 186, 190, 194, 198,
+ 202, 206, 210, 214, 218, 222, 227, 231,
+ 236, 240, 245, 249, 254, 259, 264, 268,
+ 272, 272, 272, 272, 272, 272, 272, 272,
+ 272, 272, 272, 272, 272, 272, 272, 272,
+ 272, 272, 272, 272, 272, 272, 272, 272,
+ 272, 272, 272, 272, 272, 272, 272, 272,
+ 272, 272, 272, 272, 272, 272, 272, 272,
+ 272, 272, 272, 272, 272, 272, 272, 272,
+ 272, 272, 272, 272, 272, 272, 272, 272,
+ 272, 272, 272, 272, 272, 272, 272, 272,
+ 272,
+ },
+ { /* Fourth byte 16-bit table 26. */
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 4, 9, 14, 19, 24, 28, 32,
+ 36, 40, 44, 48, 52, 56, 61, 65,
+ 69, 73, 77, 82, 86, 91, 96, 100,
+ 104, 108, 112, 116, 120, 125, 130, 135,
+ 139, 143, 148, 152, 156, 160, 165, 169,
+ 173, 177, 181, 185, 190, 194, 198, 202,
+ 206, 210, 214, 219, 224, 228, 233, 237,
+ 242, 246, 250, 254, 259, 264, 268, 273,
+ 277, 277, 277, 277, 277, 277, 277, 277,
+ 277, 277, 277, 277, 277, 277, 277, 277,
+ 277, 277, 277, 277, 277, 277, 277, 277,
+ 277, 277, 277, 277, 277, 277, 277, 277,
+ 277, 277, 277, 277, 277, 277, 277, 277,
+ 277, 277, 277, 277, 277, 277, 277, 277,
+ 277, 277, 277, 277, 277, 277, 277, 277,
+ 277, 277, 277, 277, 277, 277, 277, 277,
+ 277,
+ },
+ { /* Fourth byte 16-bit table 27. */
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 5, 9, 13, 17, 21, 25, 29,
+ 34, 39, 44, 49, 53, 57, 61, 65,
+ 69, 73, 77, 81, 85, 89, 93, 97,
+ 102, 106, 110, 114, 118, 122, 126, 130,
+ 134, 138, 142, 146, 150, 155, 160, 165,
+ 169, 173, 177, 181, 186, 190, 195, 199,
+ 203, 208, 213, 217, 221, 225, 229, 233,
+ 237, 241, 245, 249, 253, 257, 261, 265,
+ 269, 269, 269, 269, 269, 269, 269, 269,
+ 269, 269, 269, 269, 269, 269, 269, 269,
+ 269, 269, 269, 269, 269, 269, 269, 269,
+ 269, 269, 269, 269, 269, 269, 269, 269,
+ 269, 269, 269, 269, 269, 269, 269, 269,
+ 269, 269, 269, 269, 269, 269, 269, 269,
+ 269, 269, 269, 269, 269, 269, 269, 269,
+ 269, 269, 269, 269, 269, 269, 269, 269,
+ 269,
+ },
+ { /* Fourth byte 16-bit table 28. */
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 4, 8, 12, 16, 20, 25, 29,
+ 33, 37, 41, 45, 50, 55, 59, 63,
+ 67, 71, 75, 79, 84, 88, 92, 96,
+ 100, 105, 110, 114, 118, 122, 127, 131,
+ 135, 140, 145, 149, 153, 157, 162, 166,
+ 170, 174, 178, 182, 186, 190, 195, 199,
+ 203, 207, 212, 216, 220, 224, 228, 233,
+ 238, 242, 246, 250, 255, 259, 264, 268,
+ 272, 272, 272, 272, 272, 272, 272, 272,
+ 272, 272, 272, 272, 272, 272, 272, 272,
+ 272, 272, 272, 272, 272, 272, 272, 272,
+ 272, 272, 272, 272, 272, 272, 272, 272,
+ 272, 272, 272, 272, 272, 272, 272, 272,
+ 272, 272, 272, 272, 272, 272, 272, 272,
+ 272, 272, 272, 272, 272, 272, 272, 272,
+ 272, 272, 272, 272, 272, 272, 272, 272,
+ 272,
+ },
+ { /* Fourth byte 16-bit table 29. */
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0,
+ },
+ },
+ {
+ { /* Fourth byte 16-bit table 0. */
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 4, 8, 12, 16, 20, 24, 28,
+ 32, 38, 44, 48, 52, 56, 60, 64,
+ 68, 72, 76, 80, 84, 90, 96, 102,
+ 108, 112, 116, 120, 124, 130, 136, 140,
+ 144, 148, 152, 156, 160, 164, 168, 172,
+ 176, 180, 184, 188, 192, 196, 200, 206,
+ 212, 216, 220, 224, 228, 232, 236, 240,
+ 244, 250, 256, 260, 264, 268, 272, 276,
+ 280, 280, 280, 280, 280, 280, 280, 280,
+ 280, 280, 280, 280, 280, 280, 280, 280,
+ 280, 280, 280, 280, 280, 280, 280, 280,
+ 280, 280, 280, 280, 280, 280, 280, 280,
+ 280, 280, 280, 280, 280, 280, 280, 280,
+ 280, 280, 280, 280, 280, 280, 280, 280,
+ 280, 280, 280, 280, 280, 280, 280, 280,
+ 280, 280, 280, 280, 280, 280, 280, 280,
+ 280,
+ },
+ { /* Fourth byte 16-bit table 1. */
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 4, 8, 12, 16, 20, 24, 28,
+ 32, 36, 40, 44, 48, 54, 60, 66,
+ 72, 78, 84, 90, 96, 100, 104, 108,
+ 112, 116, 120, 124, 128, 134, 140, 144,
+ 148, 152, 156, 160, 164, 170, 176, 182,
+ 188, 194, 200, 204, 208, 212, 216, 220,
+ 224, 228, 232, 236, 240, 244, 248, 252,
+ 256, 262, 268, 274, 280, 284, 288, 292,
+ 296, 296, 296, 296, 296, 296, 296, 296,
+ 296, 296, 296, 296, 296, 296, 296, 296,
+ 296, 296, 296, 296, 296, 296, 296, 296,
+ 296, 296, 296, 296, 296, 296, 296, 296,
+ 296, 296, 296, 296, 296, 296, 296, 296,
+ 296, 296, 296, 296, 296, 296, 296, 296,
+ 296, 296, 296, 296, 296, 296, 296, 296,
+ 296, 296, 296, 296, 296, 296, 296, 296,
+ 296,
+ },
+ { /* Fourth byte 16-bit table 2. */
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 4, 8, 12, 16, 20, 24, 28,
+ 32, 36, 40, 44, 48, 52, 56, 60,
+ 64, 68, 72, 76, 80, 84, 88, 92,
+ 96, 100, 104, 107, 116, 116, 116, 116,
+ 116, 120, 124, 128, 132, 138, 144, 150,
+ 156, 162, 168, 174, 180, 186, 192, 198,
+ 204, 210, 216, 222, 228, 234, 240, 246,
+ 252, 256, 260, 264, 268, 272, 276, 282,
+ 288, 288, 288, 288, 288, 288, 288, 288,
+ 288, 288, 288, 288, 288, 288, 288, 288,
+ 288, 288, 288, 288, 288, 288, 288, 288,
+ 288, 288, 288, 288, 288, 288, 288, 288,
+ 288, 288, 288, 288, 288, 288, 288, 288,
+ 288, 288, 288, 288, 288, 288, 288, 288,
+ 288, 288, 288, 288, 288, 288, 288, 288,
+ 288, 288, 288, 288, 288, 288, 288, 288,
+ 288,
+ },
+ { /* Fourth byte 16-bit table 3. */
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 6, 12, 18, 24, 30, 36, 42,
+ 48, 52, 56, 60, 64, 68, 72, 76,
+ 80, 86, 92, 98, 104, 110, 116, 122,
+ 128, 134, 140, 146, 152, 158, 164, 170,
+ 176, 182, 188, 194, 200, 204, 208, 212,
+ 216, 222, 228, 234, 240, 246, 252, 258,
+ 264, 270, 276, 280, 284, 288, 292, 296,
+ 300, 304, 308, 308, 308, 308, 308, 308,
+ 308, 308, 308, 308, 308, 308, 308, 308,
+ 308, 308, 308, 308, 308, 308, 308, 308,
+ 308, 308, 308, 308, 308, 308, 308, 308,
+ 308, 308, 308, 308, 308, 308, 308, 308,
+ 308, 308, 308, 308, 308, 308, 308, 308,
+ 308, 308, 308, 308, 308, 308, 308, 308,
+ 308, 308, 308, 308, 308, 308, 308, 308,
+ 308, 308, 308, 308, 308, 308, 308, 308,
+ 308,
+ },
+ { /* Fourth byte 16-bit table 4. */
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 5, 10, 17, 24, 31, 38, 45,
+ 52, 57, 62, 69, 76, 83, 90, 97,
+ 104, 109, 114, 121, 128, 135, 142, 142,
+ 142, 147, 152, 159, 166, 173, 180, 180,
+ 180, 185, 190, 197, 204, 211, 218, 225,
+ 232, 237, 242, 249, 256, 263, 270, 277,
+ 284, 289, 294, 301, 308, 315, 322, 329,
+ 336, 341, 346, 353, 360, 367, 374, 381,
+ 388, 388, 388, 388, 388, 388, 388, 388,
+ 388, 388, 388, 388, 388, 388, 388, 388,
+ 388, 388, 388, 388, 388, 388, 388, 388,
+ 388, 388, 388, 388, 388, 388, 388, 388,
+ 388, 388, 388, 388, 388, 388, 388, 388,
+ 388, 388, 388, 388, 388, 388, 388, 388,
+ 388, 388, 388, 388, 388, 388, 388, 388,
+ 388, 388, 388, 388, 388, 388, 388, 388,
+ 388,
+ },
+ { /* Fourth byte 16-bit table 5. */
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 5, 10, 17, 24, 31, 38, 38,
+ 38, 43, 48, 55, 62, 69, 76, 76,
+ 76, 81, 86, 93, 100, 107, 114, 121,
+ 128, 128, 133, 133, 140, 140, 147, 147,
+ 154, 159, 164, 171, 178, 185, 192, 199,
+ 206, 211, 216, 223, 230, 237, 244, 251,
+ 258, 263, 268, 273, 278, 283, 288, 293,
+ 298, 303, 308, 313, 318, 323, 328, 328,
+ 328, 328, 328, 328, 328, 328, 328, 328,
+ 328, 328, 328, 328, 328, 328, 328, 328,
+ 328, 328, 328, 328, 328, 328, 328, 328,
+ 328, 328, 328, 328, 328, 328, 328, 328,
+ 328, 328, 328, 328, 328, 328, 328, 328,
+ 328, 328, 328, 328, 328, 328, 328, 328,
+ 328, 328, 328, 328, 328, 328, 328, 328,
+ 328, 328, 328, 328, 328, 328, 328, 328,
+ 328,
+ },
+ { /* Fourth byte 16-bit table 6. */
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 7, 14, 23, 32, 41, 50, 59,
+ 68, 75, 82, 91, 100, 109, 118, 127,
+ 136, 143, 150, 159, 168, 177, 186, 195,
+ 204, 211, 218, 227, 236, 245, 254, 263,
+ 272, 279, 286, 295, 304, 313, 322, 331,
+ 340, 347, 354, 363, 372, 381, 390, 399,
+ 408, 413, 418, 425, 430, 437, 437, 442,
+ 449, 454, 459, 464, 469, 474, 477, 480,
+ 483, 483, 483, 483, 483, 483, 483, 483,
+ 483, 483, 483, 483, 483, 483, 483, 483,
+ 483, 483, 483, 483, 483, 483, 483, 483,
+ 483, 483, 483, 483, 483, 483, 483, 483,
+ 483, 483, 483, 483, 483, 483, 483, 483,
+ 483, 483, 483, 483, 483, 483, 483, 483,
+ 483, 483, 483, 483, 483, 483, 483, 483,
+ 483, 483, 483, 483, 483, 483, 483, 483,
+ 483,
+ },
+ { /* Fourth byte 16-bit table 7. */
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 3, 14, 21, 26, 33, 33, 38,
+ 45, 50, 55, 60, 65, 70, 82, 94,
+ 106, 111, 116, 123, 130, 130, 130, 135,
+ 142, 147, 152, 157, 162, 162, 174, 186,
+ 198, 203, 208, 215, 222, 227, 232, 237,
+ 244, 249, 254, 259, 264, 269, 280, 291,
+ 293, 293, 293, 300, 305, 312, 312, 317,
+ 324, 329, 334, 339, 344, 349, 356, 359,
+ 359, 359, 359, 359, 359, 359, 359, 359,
+ 359, 359, 359, 359, 359, 359, 359, 359,
+ 359, 359, 359, 359, 359, 359, 359, 359,
+ 359, 359, 359, 359, 359, 359, 359, 359,
+ 359, 359, 359, 359, 359, 359, 359, 359,
+ 359, 359, 359, 359, 359, 359, 359, 359,
+ 359, 359, 359, 359, 359, 359, 359, 359,
+ 359, 359, 359, 359, 359, 359, 359, 359,
+ 359,
+ },
+ { /* Fourth byte 16-bit table 8. */
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 5, 10, 15, 20, 25, 30, 35,
+ 40, 45, 50, 55, 60, 65, 70, 78,
+ 86, 94, 102, 110, 118, 126, 134, 142,
+ 150, 158, 166, 174, 182, 190, 207, 221,
+ 221, 226, 231, 236, 241, 246, 251, 256,
+ 261, 266, 271, 276, 281, 286, 291, 296,
+ 301, 306, 311, 316, 321, 326, 331, 336,
+ 341, 346, 351, 356, 361, 366, 371, 376,
+ 381, 381, 381, 381, 381, 381, 381, 381,
+ 381, 381, 381, 381, 381, 381, 381, 381,
+ 381, 381, 381, 381, 381, 381, 381, 381,
+ 381, 381, 381, 381, 381, 381, 381, 381,
+ 381, 381, 381, 381, 381, 381, 381, 381,
+ 381, 381, 381, 381, 381, 381, 381, 381,
+ 381, 381, 381, 381, 381, 381, 381, 381,
+ 381, 381, 381, 381, 381, 381, 381, 381,
+ 381,
+ },
+ { /* Fourth byte 16-bit table 9. */
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 15, 27, 42, 51, 66, 75, 84,
+ 102, 114, 123, 132, 141, 153, 165, 177,
+ 189, 201, 213, 225, 243, 249, 267, 285,
+ 300, 312, 330, 348, 360, 369, 378, 390,
+ 402, 417, 432, 441, 450, 462, 471, 480,
+ 486, 492, 501, 510, 528, 540, 555, 573,
+ 585, 594, 603, 621, 633, 651, 660, 675,
+ 684, 696, 705, 717, 732, 744, 759, 771,
+ 777, 777, 777, 777, 777, 777, 777, 777,
+ 777, 777, 777, 777, 777, 777, 777, 777,
+ 777, 777, 777, 777, 777, 777, 777, 777,
+ 777, 777, 777, 777, 777, 777, 777, 777,
+ 777, 777, 777, 777, 777, 777, 777, 777,
+ 777, 777, 777, 777, 777, 777, 777, 777,
+ 777, 777, 777, 777, 777, 777, 777, 777,
+ 777, 777, 777, 777, 777, 777, 777, 777,
+ 777,
+ },
+ { /* Fourth byte 16-bit table 10. */
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 15, 24, 33, 45, 54, 63, 72,
+ 87, 99, 105, 123, 132, 147, 159, 171,
+ 180, 189, 201, 207, 219, 234, 240, 258,
+ 267, 271, 275, 279, 283, 287, 291, 295,
+ 299, 303, 307, 312, 317, 322, 327, 332,
+ 337, 342, 347, 352, 357, 362, 367, 372,
+ 377, 382, 385, 387, 389, 392, 394, 396,
+ 398, 401, 404, 406, 412, 418, 424, 430,
+ 442, 442, 442, 442, 442, 442, 442, 442,
+ 442, 442, 442, 442, 442, 442, 442, 442,
+ 442, 442, 442, 442, 442, 442, 442, 442,
+ 442, 442, 442, 442, 442, 442, 442, 442,
+ 442, 442, 442, 442, 442, 442, 442, 442,
+ 442, 442, 442, 442, 442, 442, 442, 442,
+ 442, 442, 442, 442, 442, 442, 442, 442,
+ 442, 442, 442, 442, 442, 442, 442, 442,
+ 442,
+ },
+ { /* Fourth byte 16-bit table 11. */
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 4, 8, 12, 16, 20, 24, 28,
+ 32, 36, 40, 44, 48, 52, 56, 60,
+ 64, 68, 72, 76, 80, 84, 88, 92,
+ 96, 100, 104, 108, 112, 116, 120, 124,
+ 128, 132, 136, 140, 144, 148, 152, 156,
+ 160, 164, 168, 172, 176, 180, 184, 188,
+ 192, 196, 200, 204, 208, 212, 216, 220,
+ 224, 228, 232, 236, 240, 244, 248, 252,
+ 256, 256, 256, 256, 256, 256, 256, 256,
+ 256, 256, 256, 256, 256, 256, 256, 256,
+ 256, 256, 256, 256, 256, 256, 256, 256,
+ 256, 256, 256, 256, 256, 256, 256, 256,
+ 256, 256, 256, 256, 256, 256, 256, 256,
+ 256, 256, 256, 256, 256, 256, 256, 256,
+ 256, 256, 256, 256, 256, 256, 256, 256,
+ 256, 256, 256, 256, 256, 256, 256, 256,
+ 256,
+ },
+ { /* Fourth byte 16-bit table 12. */
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 4, 8, 12, 16, 20, 24, 28,
+ 32, 36, 40, 44, 48, 52, 56, 60,
+ 64, 68, 72, 76, 80, 84, 88, 92,
+ 96, 100, 104, 108, 112, 116, 120, 124,
+ 128, 132, 136, 140, 144, 148, 152, 156,
+ 160, 164, 168, 172, 176, 180, 184, 188,
+ 192, 196, 200, 204, 208, 212, 216, 220,
+ 224, 228, 232, 236, 240, 244, 248, 252,
+ 256, 256, 256, 256, 256, 256, 256, 256,
+ 256, 256, 256, 256, 256, 256, 256, 256,
+ 256, 256, 256, 256, 256, 256, 256, 256,
+ 256, 256, 256, 256, 256, 256, 256, 256,
+ 256, 256, 256, 256, 256, 256, 256, 256,
+ 256, 256, 256, 256, 256, 256, 256, 256,
+ 256, 256, 256, 256, 256, 256, 256, 256,
+ 256, 256, 256, 256, 256, 256, 256, 256,
+ 256,
+ },
+ { /* Fourth byte 16-bit table 13. */
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 4, 8, 12, 16, 20, 24, 28,
+ 32, 36, 40, 44, 48, 52, 56, 60,
+ 64, 68, 72, 76, 80, 84, 88, 92,
+ 96, 100, 104, 108, 112, 116, 120, 124,
+ 128, 132, 136, 140, 144, 148, 152, 156,
+ 160, 164, 168, 172, 176, 180, 184, 188,
+ 192, 196, 200, 204, 208, 212, 216, 220,
+ 224, 228, 232, 236, 240, 244, 248, 252,
+ 256, 256, 256, 256, 256, 256, 256, 256,
+ 256, 256, 256, 256, 256, 256, 256, 256,
+ 256, 256, 256, 256, 256, 256, 256, 256,
+ 256, 256, 256, 256, 256, 256, 256, 256,
+ 256, 256, 256, 256, 256, 256, 256, 256,
+ 256, 256, 256, 256, 256, 256, 256, 256,
+ 256, 256, 256, 256, 256, 256, 256, 256,
+ 256, 256, 256, 256, 256, 256, 256, 256,
+ 256,
+ },
+ { /* Fourth byte 16-bit table 14. */
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 4, 8, 12, 16, 20, 24, 28,
+ 32, 36, 40, 44, 48, 52, 56, 60,
+ 64, 68, 72, 76, 80, 84, 88, 92,
+ 96, 100, 104, 108, 112, 116, 120, 124,
+ 128, 132, 136, 140, 144, 148, 152, 156,
+ 160, 164, 168, 172, 176, 180, 184, 188,
+ 192, 196, 200, 204, 208, 212, 216, 220,
+ 224, 228, 232, 236, 240, 244, 248, 252,
+ 256, 256, 256, 256, 256, 256, 256, 256,
+ 256, 256, 256, 256, 256, 256, 256, 256,
+ 256, 256, 256, 256, 256, 256, 256, 256,
+ 256, 256, 256, 256, 256, 256, 256, 256,
+ 256, 256, 256, 256, 256, 256, 256, 256,
+ 256, 256, 256, 256, 256, 256, 256, 256,
+ 256, 256, 256, 256, 256, 256, 256, 256,
+ 256, 256, 256, 256, 256, 256, 256, 256,
+ 256,
+ },
+ { /* Fourth byte 16-bit table 15. */
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 4, 8, 12, 16, 20, 24, 28,
+ 32, 36, 40, 44, 48, 52, 56, 60,
+ 64, 68, 72, 76, 80, 84, 88, 92,
+ 96, 100, 104, 108, 112, 116, 120, 124,
+ 128, 132, 136, 140, 144, 148, 152, 156,
+ 160, 164, 168, 172, 176, 180, 184, 188,
+ 192, 196, 200, 204, 208, 212, 216, 220,
+ 224, 228, 232, 236, 240, 244, 248, 252,
+ 256, 256, 256, 256, 256, 256, 256, 256,
+ 256, 256, 256, 256, 256, 256, 256, 256,
+ 256, 256, 256, 256, 256, 256, 256, 256,
+ 256, 256, 256, 256, 256, 256, 256, 256,
+ 256, 256, 256, 256, 256, 256, 256, 256,
+ 256, 256, 256, 256, 256, 256, 256, 256,
+ 256, 256, 256, 256, 256, 256, 256, 256,
+ 256, 256, 256, 256, 256, 256, 256, 256,
+ 256,
+ },
+ { /* Fourth byte 16-bit table 16. */
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 6, 12, 18, 24, 30, 34, 38,
+ 42, 46, 50, 54, 58, 62, 66, 70,
+ 74, 78, 82, 86, 90, 94, 98, 102,
+ 106, 110, 114, 118, 122, 126, 130, 134,
+ 138, 142, 146, 150, 154, 158, 162, 166,
+ 170, 174, 178, 182, 186, 190, 194, 198,
+ 202, 206, 210, 214, 218, 222, 226, 230,
+ 234, 238, 242, 246, 250, 254, 258, 262,
+ 266, 266, 266, 266, 266, 266, 266, 266,
+ 266, 266, 266, 266, 266, 266, 266, 266,
+ 266, 266, 266, 266, 266, 266, 266, 266,
+ 266, 266, 266, 266, 266, 266, 266, 266,
+ 266, 266, 266, 266, 266, 266, 266, 266,
+ 266, 266, 266, 266, 266, 266, 266, 266,
+ 266, 266, 266, 266, 266, 266, 266, 266,
+ 266, 266, 266, 266, 266, 266, 266, 266,
+ 266,
+ },
+ { /* Fourth byte 16-bit table 17. */
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 4, 8, 12, 16, 20, 24, 28,
+ 32, 36, 40, 44, 48, 52, 56, 60,
+ 64, 68, 72, 76, 80, 84, 88, 92,
+ 96, 100, 104, 108, 112, 116, 120, 125,
+ 130, 135, 140, 145, 150, 156, 162, 168,
+ 174, 180, 186, 190, 194, 198, 202, 206,
+ 210, 214, 218, 222, 226, 230, 234, 238,
+ 242, 246, 250, 254, 258, 262, 266, 270,
+ 274, 274, 274, 274, 274, 274, 274, 274,
+ 274, 274, 274, 274, 274, 274, 274, 274,
+ 274, 274, 274, 274, 274, 274, 274, 274,
+ 274, 274, 274, 274, 274, 274, 274, 274,
+ 274, 274, 274, 274, 274, 274, 274, 274,
+ 274, 274, 274, 274, 274, 274, 274, 274,
+ 274, 274, 274, 274, 274, 274, 274, 274,
+ 274, 274, 274, 274, 274, 274, 274, 274,
+ 274,
+ },
+ { /* Fourth byte 16-bit table 18. */
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 4, 8, 12, 16, 20, 24, 28,
+ 32, 36, 40, 44, 48, 52, 56, 60,
+ 64, 68, 72, 76, 80, 84, 88, 92,
+ 98, 104, 110, 116, 122, 126, 130, 134,
+ 138, 142, 146, 150, 154, 158, 162, 166,
+ 170, 174, 178, 182, 186, 190, 194, 198,
+ 202, 206, 210, 214, 218, 222, 226, 230,
+ 234, 238, 242, 246, 250, 254, 258, 262,
+ 266, 266, 266, 266, 266, 266, 266, 266,
+ 266, 266, 266, 266, 266, 266, 266, 266,
+ 266, 266, 266, 266, 266, 266, 266, 266,
+ 266, 266, 266, 266, 266, 266, 266, 266,
+ 266, 266, 266, 266, 266, 266, 266, 266,
+ 266, 266, 266, 266, 266, 266, 266, 266,
+ 266, 266, 266, 266, 266, 266, 266, 266,
+ 266, 266, 266, 266, 266, 266, 266, 266,
+ 266,
+ },
+ { /* Fourth byte 16-bit table 19. */
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 4, 8, 12, 16, 20, 24, 28,
+ 32, 36, 40, 44, 48, 52, 56, 60,
+ 64, 68, 72, 76, 80, 84, 88, 92,
+ 96, 100, 104, 108, 112, 116, 120, 124,
+ 130, 136, 140, 144, 148, 152, 156, 160,
+ 164, 168, 172, 176, 180, 184, 188, 192,
+ 196, 200, 204, 210, 216, 222, 226, 230,
+ 234, 238, 242, 246, 250, 254, 258, 262,
+ 266, 266, 266, 266, 266, 266, 266, 266,
+ 266, 266, 266, 266, 266, 266, 266, 266,
+ 266, 266, 266, 266, 266, 266, 266, 266,
+ 266, 266, 266, 266, 266, 266, 266, 266,
+ 266, 266, 266, 266, 266, 266, 266, 266,
+ 266, 266, 266, 266, 266, 266, 266, 266,
+ 266, 266, 266, 266, 266, 266, 266, 266,
+ 266, 266, 266, 266, 266, 266, 266, 266,
+ 266,
+ },
+ { /* Fourth byte 16-bit table 20. */
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 6, 12, 18, 24, 30, 36, 42,
+ 48, 54, 60, 66, 72, 78, 84, 90,
+ 96, 102, 108, 114, 120, 126, 132, 138,
+ 144, 150, 156, 162, 168, 174, 180, 186,
+ 192, 198, 204, 210, 216, 222, 228, 234,
+ 240, 246, 252, 258, 264, 270, 276, 282,
+ 288, 288, 288, 288, 288, 288, 288, 288,
+ 288, 288, 288, 288, 288, 288, 288, 288,
+ 288, 288, 288, 288, 288, 288, 288, 288,
+ 288, 288, 288, 288, 288, 288, 288, 288,
+ 288, 288, 288, 288, 288, 288, 288, 288,
+ 288, 288, 288, 288, 288, 288, 288, 288,
+ 288, 288, 288, 288, 288, 288, 288, 288,
+ 288, 288, 288, 288, 288, 288, 288, 288,
+ 288,
+ },
+ { /* Fourth byte 16-bit table 21. */
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 6, 12, 18, 24, 30, 36, 42,
+ 48, 54, 60, 66, 72, 78, 84, 90,
+ 96, 96, 96, 102, 108, 114, 120, 126,
+ 132, 138, 144, 150, 156, 162, 168, 174,
+ 180, 186, 192, 198, 204, 210, 216, 222,
+ 228, 234, 240, 246, 252, 258, 264, 270,
+ 276, 282, 288, 294, 300, 306, 312, 318,
+ 324, 330, 336, 342, 348, 354, 360, 366,
+ 372, 372, 372, 372, 372, 372, 372, 372,
+ 372, 372, 372, 372, 372, 372, 372, 372,
+ 372, 372, 372, 372, 372, 372, 372, 372,
+ 372, 372, 372, 372, 372, 372, 372, 372,
+ 372, 372, 372, 372, 372, 372, 372, 372,
+ 372, 372, 372, 372, 372, 372, 372, 372,
+ 372, 372, 372, 372, 372, 372, 372, 372,
+ 372, 372, 372, 372, 372, 372, 372, 372,
+ 372,
+ },
+ { /* Fourth byte 16-bit table 22. */
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 4, 8, 12, 17, 21, 25, 29,
+ 33, 37, 41, 45, 49, 53, 58, 62,
+ 66, 70, 74, 79, 83, 87, 91, 96,
+ 100, 104, 108, 112, 116, 121, 125, 129,
+ 133, 137, 141, 145, 149, 153, 157, 161,
+ 165, 169, 173, 177, 181, 185, 189, 193,
+ 197, 201, 205, 209, 213, 218, 222, 226,
+ 230, 235, 239, 243, 247, 251, 255, 259,
+ 263, 263, 263, 263, 263, 263, 263, 263,
+ 263, 263, 263, 263, 263, 263, 263, 263,
+ 263, 263, 263, 263, 263, 263, 263, 263,
+ 263, 263, 263, 263, 263, 263, 263, 263,
+ 263, 263, 263, 263, 263, 263, 263, 263,
+ 263, 263, 263, 263, 263, 263, 263, 263,
+ 263, 263, 263, 263, 263, 263, 263, 263,
+ 263, 263, 263, 263, 263, 263, 263, 263,
+ 263,
+ },
+ { /* Fourth byte 16-bit table 23. */
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 4, 8, 12, 16, 20, 24, 28,
+ 32, 36, 40, 44, 48, 52, 56, 60,
+ 64, 68, 72, 76, 80, 84, 88, 92,
+ 96, 100, 105, 109, 113, 117, 121, 125,
+ 129, 134, 139, 143, 147, 151, 155, 159,
+ 163, 167, 171, 175, 179, 184, 188, 192,
+ 196, 200, 205, 209, 213, 217, 221, 225,
+ 229, 233, 237, 241, 246, 250, 255, 259,
+ 263, 263, 263, 263, 263, 263, 263, 263,
+ 263, 263, 263, 263, 263, 263, 263, 263,
+ 263, 263, 263, 263, 263, 263, 263, 263,
+ 263, 263, 263, 263, 263, 263, 263, 263,
+ 263, 263, 263, 263, 263, 263, 263, 263,
+ 263, 263, 263, 263, 263, 263, 263, 263,
+ 263, 263, 263, 263, 263, 263, 263, 263,
+ 263, 263, 263, 263, 263, 263, 263, 263,
+ 263,
+ },
+ { /* Fourth byte 16-bit table 24. */
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 4, 8, 12, 16, 20, 24, 28,
+ 32, 36, 41, 45, 49, 53, 57, 61,
+ 66, 70, 75, 80, 84, 88, 92, 96,
+ 101, 106, 110, 114, 118, 122, 126, 130,
+ 134, 138, 142, 146, 150, 155, 159, 163,
+ 167, 171, 175, 179, 183, 187, 191, 195,
+ 199, 203, 207, 211, 215, 219, 223, 227,
+ 231, 236, 240, 244, 248, 252, 256, 261,
+ 265, 265, 265, 265, 265, 265, 265, 265,
+ 265, 265, 265, 265, 265, 265, 265, 265,
+ 265, 265, 265, 265, 265, 265, 265, 265,
+ 265, 265, 265, 265, 265, 265, 265, 265,
+ 265, 265, 265, 265, 265, 265, 265, 265,
+ 265, 265, 265, 265, 265, 265, 265, 265,
+ 265, 265, 265, 265, 265, 265, 265, 265,
+ 265, 265, 265, 265, 265, 265, 265, 265,
+ 265,
+ },
+ { /* Fourth byte 16-bit table 25. */
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 4, 8, 12, 16, 20, 24, 28,
+ 32, 36, 40, 45, 49, 53, 57, 61,
+ 65, 69, 73, 77, 81, 85, 89, 93,
+ 97, 101, 105, 109, 113, 117, 122, 126,
+ 130, 134, 138, 142, 147, 151, 155, 159,
+ 163, 167, 171, 175, 179, 184, 188, 192,
+ 196, 201, 205, 209, 213, 217, 221, 225,
+ 230, 235, 240, 244, 249, 253, 257, 261,
+ 265, 265, 265, 265, 265, 265, 265, 265,
+ 265, 265, 265, 265, 265, 265, 265, 265,
+ 265, 265, 265, 265, 265, 265, 265, 265,
+ 265, 265, 265, 265, 265, 265, 265, 265,
+ 265, 265, 265, 265, 265, 265, 265, 265,
+ 265, 265, 265, 265, 265, 265, 265, 265,
+ 265, 265, 265, 265, 265, 265, 265, 265,
+ 265, 265, 265, 265, 265, 265, 265, 265,
+ 265,
+ },
+ { /* Fourth byte 16-bit table 26. */
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 4, 8, 12, 16, 20, 24, 29,
+ 33, 37, 41, 45, 49, 53, 58, 62,
+ 66, 71, 76, 80, 84, 88, 92, 96,
+ 100, 104, 108, 112, 117, 121, 126, 130,
+ 135, 139, 143, 147, 152, 156, 160, 165,
+ 170, 174, 178, 182, 186, 190, 194, 198,
+ 202, 206, 210, 214, 218, 222, 227, 231,
+ 236, 240, 245, 249, 254, 259, 264, 268,
+ 272, 272, 272, 272, 272, 272, 272, 272,
+ 272, 272, 272, 272, 272, 272, 272, 272,
+ 272, 272, 272, 272, 272, 272, 272, 272,
+ 272, 272, 272, 272, 272, 272, 272, 272,
+ 272, 272, 272, 272, 272, 272, 272, 272,
+ 272, 272, 272, 272, 272, 272, 272, 272,
+ 272, 272, 272, 272, 272, 272, 272, 272,
+ 272, 272, 272, 272, 272, 272, 272, 272,
+ 272,
+ },
+ { /* Fourth byte 16-bit table 27. */
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 4, 9, 14, 19, 24, 28, 32,
+ 36, 40, 44, 48, 52, 56, 61, 65,
+ 69, 73, 77, 82, 86, 91, 96, 100,
+ 104, 108, 112, 116, 120, 125, 130, 135,
+ 139, 143, 148, 152, 156, 160, 165, 169,
+ 173, 177, 181, 185, 190, 194, 198, 202,
+ 206, 210, 214, 219, 224, 228, 233, 237,
+ 242, 246, 250, 254, 259, 264, 268, 273,
+ 277, 277, 277, 277, 277, 277, 277, 277,
+ 277, 277, 277, 277, 277, 277, 277, 277,
+ 277, 277, 277, 277, 277, 277, 277, 277,
+ 277, 277, 277, 277, 277, 277, 277, 277,
+ 277, 277, 277, 277, 277, 277, 277, 277,
+ 277, 277, 277, 277, 277, 277, 277, 277,
+ 277, 277, 277, 277, 277, 277, 277, 277,
+ 277, 277, 277, 277, 277, 277, 277, 277,
+ 277,
+ },
+ { /* Fourth byte 16-bit table 28. */
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 5, 9, 13, 17, 21, 25, 29,
+ 34, 39, 44, 49, 53, 57, 61, 65,
+ 69, 73, 77, 81, 85, 89, 93, 97,
+ 102, 106, 110, 114, 118, 122, 126, 130,
+ 134, 138, 142, 146, 150, 155, 160, 165,
+ 169, 173, 177, 181, 186, 190, 195, 199,
+ 203, 208, 213, 217, 221, 225, 229, 233,
+ 237, 241, 245, 249, 253, 257, 261, 265,
+ 269, 269, 269, 269, 269, 269, 269, 269,
+ 269, 269, 269, 269, 269, 269, 269, 269,
+ 269, 269, 269, 269, 269, 269, 269, 269,
+ 269, 269, 269, 269, 269, 269, 269, 269,
+ 269, 269, 269, 269, 269, 269, 269, 269,
+ 269, 269, 269, 269, 269, 269, 269, 269,
+ 269, 269, 269, 269, 269, 269, 269, 269,
+ 269, 269, 269, 269, 269, 269, 269, 269,
+ 269,
+ },
+ { /* Fourth byte 16-bit table 29. */
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 4, 8, 12, 16, 20, 25, 29,
+ 33, 37, 41, 45, 50, 55, 59, 63,
+ 67, 71, 75, 79, 84, 88, 92, 96,
+ 100, 105, 110, 114, 118, 122, 127, 131,
+ 135, 140, 145, 149, 153, 157, 162, 166,
+ 170, 174, 178, 182, 186, 190, 195, 199,
+ 203, 207, 212, 216, 220, 224, 228, 233,
+ 238, 242, 246, 250, 255, 259, 264, 268,
+ 272, 272, 272, 272, 272, 272, 272, 272,
+ 272, 272, 272, 272, 272, 272, 272, 272,
+ 272, 272, 272, 272, 272, 272, 272, 272,
+ 272, 272, 272, 272, 272, 272, 272, 272,
+ 272, 272, 272, 272, 272, 272, 272, 272,
+ 272, 272, 272, 272, 272, 272, 272, 272,
+ 272, 272, 272, 272, 272, 272, 272, 272,
+ 272, 272, 272, 272, 272, 272, 272, 272,
+ 272,
+ },
+ },
+};
+
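+/*
+ * Usage sketch (illustrative; the indexing scheme below is inferred from
+ * the shape of the generated tables in this diff, not stated in it): each
+ * "Fourth byte 16-bit table" above holds 257 non-decreasing byte offsets,
+ * one per possible final byte of a UTF-8 sequence plus an end sentinel.
+ * Under that reading, the decomposition for final byte b, table t, and
+ * version index uv would occupy the half-open range
+ *
+ *	final_tbl[uv][ b4_tbl[uv][t][b] .. b4_tbl[uv][t][b + 1] )
+ *
+ * where an empty range means the code point has no decomposition, and the
+ * outer [2] dimension appears to select between the two supported Unicode
+ * versions.  The marker bytes seen in the data below (0xF6, and 0xF5
+ * followed by a length byte) seem to distinguish canonical-only entries
+ * from ones whose canonical and compatibility forms differ; treat those
+ * specifics, and the b4_tbl/final_tbl names, as assumptions.
+ */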
+static const uchar_t u8_decomp_final_tbl[2][19370] = {
+ {
+ 0x20, 0x20, 0xCC, 0x88, 0x61, 0x20, 0xCC, 0x84,
+ 0x32, 0x33, 0x20, 0xCC, 0x81, 0xCE, 0xBC, 0x20,
+ 0xCC, 0xA7, 0x31, 0x6F, 0x31, 0xE2, 0x81, 0x84,
+ 0x34, 0x31, 0xE2, 0x81, 0x84, 0x32, 0x33, 0xE2,
+ 0x81, 0x84, 0x34, 0xF6, 0x41, 0xCC, 0x80, 0xF6,
+ 0x41, 0xCC, 0x81, 0xF6, 0x41, 0xCC, 0x82, 0xF6,
+ 0x41, 0xCC, 0x83, 0xF6, 0x41, 0xCC, 0x88, 0xF6,
+ 0x41, 0xCC, 0x8A, 0xF6, 0x43, 0xCC, 0xA7, 0xF6,
+ 0x45, 0xCC, 0x80, 0xF6, 0x45, 0xCC, 0x81, 0xF6,
+ 0x45, 0xCC, 0x82, 0xF6, 0x45, 0xCC, 0x88, 0xF6,
+ 0x49, 0xCC, 0x80, 0xF6, 0x49, 0xCC, 0x81, 0xF6,
+ 0x49, 0xCC, 0x82, 0xF6, 0x49, 0xCC, 0x88, 0xF6,
+ 0x4E, 0xCC, 0x83, 0xF6, 0x4F, 0xCC, 0x80, 0xF6,
+ 0x4F, 0xCC, 0x81, 0xF6, 0x4F, 0xCC, 0x82, 0xF6,
+ 0x4F, 0xCC, 0x83, 0xF6, 0x4F, 0xCC, 0x88, 0xF6,
+ 0x55, 0xCC, 0x80, 0xF6, 0x55, 0xCC, 0x81, 0xF6,
+ 0x55, 0xCC, 0x82, 0xF6, 0x55, 0xCC, 0x88, 0xF6,
+ 0x59, 0xCC, 0x81, 0xF6, 0x61, 0xCC, 0x80, 0xF6,
+ 0x61, 0xCC, 0x81, 0xF6, 0x61, 0xCC, 0x82, 0xF6,
+ 0x61, 0xCC, 0x83, 0xF6, 0x61, 0xCC, 0x88, 0xF6,
+ 0x61, 0xCC, 0x8A, 0xF6, 0x63, 0xCC, 0xA7, 0xF6,
+ 0x65, 0xCC, 0x80, 0xF6, 0x65, 0xCC, 0x81, 0xF6,
+ 0x65, 0xCC, 0x82, 0xF6, 0x65, 0xCC, 0x88, 0xF6,
+ 0x69, 0xCC, 0x80, 0xF6, 0x69, 0xCC, 0x81, 0xF6,
+ 0x69, 0xCC, 0x82, 0xF6, 0x69, 0xCC, 0x88, 0xF6,
+ 0x6E, 0xCC, 0x83, 0xF6, 0x6F, 0xCC, 0x80, 0xF6,
+ 0x6F, 0xCC, 0x81, 0xF6, 0x6F, 0xCC, 0x82, 0xF6,
+ 0x6F, 0xCC, 0x83, 0xF6, 0x6F, 0xCC, 0x88, 0xF6,
+ 0x75, 0xCC, 0x80, 0xF6, 0x75, 0xCC, 0x81, 0xF6,
+ 0x75, 0xCC, 0x82, 0xF6, 0x75, 0xCC, 0x88, 0xF6,
+ 0x79, 0xCC, 0x81, 0xF6, 0x79, 0xCC, 0x88, 0xF6,
+ 0x41, 0xCC, 0x84, 0xF6, 0x61, 0xCC, 0x84, 0xF6,
+ 0x41, 0xCC, 0x86, 0xF6, 0x61, 0xCC, 0x86, 0xF6,
+ 0x41, 0xCC, 0xA8, 0xF6, 0x61, 0xCC, 0xA8, 0xF6,
+ 0x43, 0xCC, 0x81, 0xF6, 0x63, 0xCC, 0x81, 0xF6,
+ 0x43, 0xCC, 0x82, 0xF6, 0x63, 0xCC, 0x82, 0xF6,
+ 0x43, 0xCC, 0x87, 0xF6, 0x63, 0xCC, 0x87, 0xF6,
+ 0x43, 0xCC, 0x8C, 0xF6, 0x63, 0xCC, 0x8C, 0xF6,
+ 0x44, 0xCC, 0x8C, 0xF6, 0x64, 0xCC, 0x8C, 0xF6,
+ 0x45, 0xCC, 0x84, 0xF6, 0x65, 0xCC, 0x84, 0xF6,
+ 0x45, 0xCC, 0x86, 0xF6, 0x65, 0xCC, 0x86, 0xF6,
+ 0x45, 0xCC, 0x87, 0xF6, 0x65, 0xCC, 0x87, 0xF6,
+ 0x45, 0xCC, 0xA8, 0xF6, 0x65, 0xCC, 0xA8, 0xF6,
+ 0x45, 0xCC, 0x8C, 0xF6, 0x65, 0xCC, 0x8C, 0xF6,
+ 0x47, 0xCC, 0x82, 0xF6, 0x67, 0xCC, 0x82, 0xF6,
+ 0x47, 0xCC, 0x86, 0xF6, 0x67, 0xCC, 0x86, 0xF6,
+ 0x47, 0xCC, 0x87, 0xF6, 0x67, 0xCC, 0x87, 0xF6,
+ 0x47, 0xCC, 0xA7, 0xF6, 0x67, 0xCC, 0xA7, 0xF6,
+ 0x48, 0xCC, 0x82, 0xF6, 0x68, 0xCC, 0x82, 0xF6,
+ 0x49, 0xCC, 0x83, 0xF6, 0x69, 0xCC, 0x83, 0xF6,
+ 0x49, 0xCC, 0x84, 0xF6, 0x69, 0xCC, 0x84, 0xF6,
+ 0x49, 0xCC, 0x86, 0xF6, 0x69, 0xCC, 0x86, 0xF6,
+ 0x49, 0xCC, 0xA8, 0xF6, 0x69, 0xCC, 0xA8, 0xF6,
+ 0x49, 0xCC, 0x87, 0x49, 0x4A, 0x69, 0x6A, 0xF6,
+ 0x4A, 0xCC, 0x82, 0xF6, 0x6A, 0xCC, 0x82, 0xF6,
+ 0x4B, 0xCC, 0xA7, 0xF6, 0x6B, 0xCC, 0xA7, 0xF6,
+ 0x4C, 0xCC, 0x81, 0xF6, 0x6C, 0xCC, 0x81, 0xF6,
+ 0x4C, 0xCC, 0xA7, 0xF6, 0x6C, 0xCC, 0xA7, 0xF6,
+ 0x4C, 0xCC, 0x8C, 0xF6, 0x6C, 0xCC, 0x8C, 0x4C,
+ 0xC2, 0xB7, 0x6C, 0xC2, 0xB7, 0xF6, 0x4E, 0xCC,
+ 0x81, 0xF6, 0x6E, 0xCC, 0x81, 0xF6, 0x4E, 0xCC,
+ 0xA7, 0xF6, 0x6E, 0xCC, 0xA7, 0xF6, 0x4E, 0xCC,
+ 0x8C, 0xF6, 0x6E, 0xCC, 0x8C, 0xCA, 0xBC, 0x6E,
+ 0xF6, 0x4F, 0xCC, 0x84, 0xF6, 0x6F, 0xCC, 0x84,
+ 0xF6, 0x4F, 0xCC, 0x86, 0xF6, 0x6F, 0xCC, 0x86,
+ 0xF6, 0x4F, 0xCC, 0x8B, 0xF6, 0x6F, 0xCC, 0x8B,
+ 0xF6, 0x52, 0xCC, 0x81, 0xF6, 0x72, 0xCC, 0x81,
+ 0xF6, 0x52, 0xCC, 0xA7, 0xF6, 0x72, 0xCC, 0xA7,
+ 0xF6, 0x52, 0xCC, 0x8C, 0xF6, 0x72, 0xCC, 0x8C,
+ 0xF6, 0x53, 0xCC, 0x81, 0xF6, 0x73, 0xCC, 0x81,
+ 0xF6, 0x53, 0xCC, 0x82, 0xF6, 0x73, 0xCC, 0x82,
+ 0xF6, 0x53, 0xCC, 0xA7, 0xF6, 0x73, 0xCC, 0xA7,
+ 0xF6, 0x53, 0xCC, 0x8C, 0xF6, 0x73, 0xCC, 0x8C,
+ 0xF6, 0x54, 0xCC, 0xA7, 0xF6, 0x74, 0xCC, 0xA7,
+ 0xF6, 0x54, 0xCC, 0x8C, 0xF6, 0x74, 0xCC, 0x8C,
+ 0xF6, 0x55, 0xCC, 0x83, 0xF6, 0x75, 0xCC, 0x83,
+ 0xF6, 0x55, 0xCC, 0x84, 0xF6, 0x75, 0xCC, 0x84,
+ 0xF6, 0x55, 0xCC, 0x86, 0xF6, 0x75, 0xCC, 0x86,
+ 0xF6, 0x55, 0xCC, 0x8A, 0xF6, 0x75, 0xCC, 0x8A,
+ 0xF6, 0x55, 0xCC, 0x8B, 0xF6, 0x75, 0xCC, 0x8B,
+ 0xF6, 0x55, 0xCC, 0xA8, 0xF6, 0x75, 0xCC, 0xA8,
+ 0xF6, 0x57, 0xCC, 0x82, 0xF6, 0x77, 0xCC, 0x82,
+ 0xF6, 0x59, 0xCC, 0x82, 0xF6, 0x79, 0xCC, 0x82,
+ 0xF6, 0x59, 0xCC, 0x88, 0xF6, 0x5A, 0xCC, 0x81,
+ 0xF6, 0x7A, 0xCC, 0x81, 0xF6, 0x5A, 0xCC, 0x87,
+ 0xF6, 0x7A, 0xCC, 0x87, 0xF6, 0x5A, 0xCC, 0x8C,
+ 0xF6, 0x7A, 0xCC, 0x8C, 0x73, 0xF6, 0x4F, 0xCC,
+ 0x9B, 0xF6, 0x6F, 0xCC, 0x9B, 0xF6, 0x55, 0xCC,
+ 0x9B, 0xF6, 0x75, 0xCC, 0x9B, 0x44, 0x5A, 0xCC,
+ 0x8C, 0x44, 0x7A, 0xCC, 0x8C, 0x64, 0x7A, 0xCC,
+ 0x8C, 0x4C, 0x4A, 0x4C, 0x6A, 0x6C, 0x6A, 0x4E,
+ 0x4A, 0x4E, 0x6A, 0x6E, 0x6A, 0xF6, 0x41, 0xCC,
+ 0x8C, 0xF6, 0x61, 0xCC, 0x8C, 0xF6, 0x49, 0xCC,
+ 0x8C, 0xF6, 0x69, 0xCC, 0x8C, 0xF6, 0x4F, 0xCC,
+ 0x8C, 0xF6, 0x6F, 0xCC, 0x8C, 0xF6, 0x55, 0xCC,
+ 0x8C, 0xF6, 0x75, 0xCC, 0x8C, 0xF6, 0x55, 0xCC,
+ 0x88, 0xCC, 0x84, 0xF6, 0x75, 0xCC, 0x88, 0xCC,
+ 0x84, 0xF6, 0x55, 0xCC, 0x88, 0xCC, 0x81, 0xF6,
+ 0x75, 0xCC, 0x88, 0xCC, 0x81, 0xF6, 0x55, 0xCC,
+ 0x88, 0xCC, 0x8C, 0xF6, 0x75, 0xCC, 0x88, 0xCC,
+ 0x8C, 0xF6, 0x55, 0xCC, 0x88, 0xCC, 0x80, 0xF6,
+ 0x75, 0xCC, 0x88, 0xCC, 0x80, 0xF6, 0x41, 0xCC,
+ 0x88, 0xCC, 0x84, 0xF6, 0x61, 0xCC, 0x88, 0xCC,
+ 0x84, 0xF6, 0x41, 0xCC, 0x87, 0xCC, 0x84, 0xF6,
+ 0x61, 0xCC, 0x87, 0xCC, 0x84, 0xF6, 0xC3, 0x86,
+ 0xCC, 0x84, 0xF6, 0xC3, 0xA6, 0xCC, 0x84, 0xF6,
+ 0x47, 0xCC, 0x8C, 0xF6, 0x67, 0xCC, 0x8C, 0xF6,
+ 0x4B, 0xCC, 0x8C, 0xF6, 0x6B, 0xCC, 0x8C, 0xF6,
+ 0x4F, 0xCC, 0xA8, 0xF6, 0x6F, 0xCC, 0xA8, 0xF6,
+ 0x4F, 0xCC, 0xA8, 0xCC, 0x84, 0xF6, 0x6F, 0xCC,
+ 0xA8, 0xCC, 0x84, 0xF6, 0xC6, 0xB7, 0xCC, 0x8C,
+ 0xF6, 0xCA, 0x92, 0xCC, 0x8C, 0xF6, 0x6A, 0xCC,
+ 0x8C, 0x44, 0x5A, 0x44, 0x7A, 0x64, 0x7A, 0xF6,
+ 0x47, 0xCC, 0x81, 0xF6, 0x67, 0xCC, 0x81, 0xF6,
+ 0x4E, 0xCC, 0x80, 0xF6, 0x6E, 0xCC, 0x80, 0xF6,
+ 0x41, 0xCC, 0x8A, 0xCC, 0x81, 0xF6, 0x61, 0xCC,
+ 0x8A, 0xCC, 0x81, 0xF6, 0xC3, 0x86, 0xCC, 0x81,
+ 0xF6, 0xC3, 0xA6, 0xCC, 0x81, 0xF6, 0xC3, 0x98,
+ 0xCC, 0x81, 0xF6, 0xC3, 0xB8, 0xCC, 0x81, 0xF6,
+ 0x41, 0xCC, 0x8F, 0xF6, 0x61, 0xCC, 0x8F, 0xF6,
+ 0x41, 0xCC, 0x91, 0xF6, 0x61, 0xCC, 0x91, 0xF6,
+ 0x45, 0xCC, 0x8F, 0xF6, 0x65, 0xCC, 0x8F, 0xF6,
+ 0x45, 0xCC, 0x91, 0xF6, 0x65, 0xCC, 0x91, 0xF6,
+ 0x49, 0xCC, 0x8F, 0xF6, 0x69, 0xCC, 0x8F, 0xF6,
+ 0x49, 0xCC, 0x91, 0xF6, 0x69, 0xCC, 0x91, 0xF6,
+ 0x4F, 0xCC, 0x8F, 0xF6, 0x6F, 0xCC, 0x8F, 0xF6,
+ 0x4F, 0xCC, 0x91, 0xF6, 0x6F, 0xCC, 0x91, 0xF6,
+ 0x52, 0xCC, 0x8F, 0xF6, 0x72, 0xCC, 0x8F, 0xF6,
+ 0x52, 0xCC, 0x91, 0xF6, 0x72, 0xCC, 0x91, 0xF6,
+ 0x55, 0xCC, 0x8F, 0xF6, 0x75, 0xCC, 0x8F, 0xF6,
+ 0x55, 0xCC, 0x91, 0xF6, 0x75, 0xCC, 0x91, 0xF6,
+ 0x53, 0xCC, 0xA6, 0xF6, 0x73, 0xCC, 0xA6, 0xF6,
+ 0x54, 0xCC, 0xA6, 0xF6, 0x74, 0xCC, 0xA6, 0xF6,
+ 0x48, 0xCC, 0x8C, 0xF6, 0x68, 0xCC, 0x8C, 0xF6,
+ 0x41, 0xCC, 0x87, 0xF6, 0x61, 0xCC, 0x87, 0xF6,
+ 0x45, 0xCC, 0xA7, 0xF6, 0x65, 0xCC, 0xA7, 0xF6,
+ 0x4F, 0xCC, 0x88, 0xCC, 0x84, 0xF6, 0x6F, 0xCC,
+ 0x88, 0xCC, 0x84, 0xF6, 0x4F, 0xCC, 0x83, 0xCC,
+ 0x84, 0xF6, 0x6F, 0xCC, 0x83, 0xCC, 0x84, 0xF6,
+ 0x4F, 0xCC, 0x87, 0xF6, 0x6F, 0xCC, 0x87, 0xF6,
+ 0x4F, 0xCC, 0x87, 0xCC, 0x84, 0xF6, 0x6F, 0xCC,
+ 0x87, 0xCC, 0x84, 0xF6, 0x59, 0xCC, 0x84, 0xF6,
+ 0x79, 0xCC, 0x84, 0x68, 0xC9, 0xA6, 0x6A, 0x72,
+ 0xC9, 0xB9, 0xC9, 0xBB, 0xCA, 0x81, 0x77, 0x79,
+ 0x20, 0xCC, 0x86, 0x20, 0xCC, 0x87, 0x20, 0xCC,
+ 0x8A, 0x20, 0xCC, 0xA8, 0x20, 0xCC, 0x83, 0x20,
+ 0xCC, 0x8B, 0xC9, 0xA3, 0x6C, 0x73, 0x78, 0xCA,
+ 0x95, 0xF6, 0xCC, 0x80, 0xF6, 0xCC, 0x81, 0xF6,
+ 0xCC, 0x93, 0xF6, 0xCC, 0x88, 0xCC, 0x81, 0xF6,
+ 0xCA, 0xB9, 0x20, 0xCD, 0x85, 0xF6, 0x3B, 0x20,
+ 0xCC, 0x81, 0xF5, 0x05, 0xC2, 0xA8, 0xCC, 0x81,
+ 0x20, 0xCC, 0x88, 0xCC, 0x81, 0xF6, 0xCE, 0x91,
+ 0xCC, 0x81, 0xF6, 0xC2, 0xB7, 0xF6, 0xCE, 0x95,
+ 0xCC, 0x81, 0xF6, 0xCE, 0x97, 0xCC, 0x81, 0xF6,
+ 0xCE, 0x99, 0xCC, 0x81, 0xF6, 0xCE, 0x9F, 0xCC,
+ 0x81, 0xF6, 0xCE, 0xA5, 0xCC, 0x81, 0xF6, 0xCE,
+ 0xA9, 0xCC, 0x81, 0xF6, 0xCE, 0xB9, 0xCC, 0x88,
+ 0xCC, 0x81, 0xF6, 0xCE, 0x99, 0xCC, 0x88, 0xF6,
+ 0xCE, 0xA5, 0xCC, 0x88, 0xF6, 0xCE, 0xB1, 0xCC,
+ 0x81, 0xF6, 0xCE, 0xB5, 0xCC, 0x81, 0xF6, 0xCE,
+ 0xB7, 0xCC, 0x81, 0xF6, 0xCE, 0xB9, 0xCC, 0x81,
+ 0xF6, 0xCF, 0x85, 0xCC, 0x88, 0xCC, 0x81, 0xF6,
+ 0xCE, 0xB9, 0xCC, 0x88, 0xF6, 0xCF, 0x85, 0xCC,
+ 0x88, 0xF6, 0xCE, 0xBF, 0xCC, 0x81, 0xF6, 0xCF,
+ 0x85, 0xCC, 0x81, 0xF6, 0xCF, 0x89, 0xCC, 0x81,
+ 0xCE, 0xB2, 0xCE, 0xB8, 0xCE, 0xA5, 0xF5, 0x05,
+ 0xCF, 0x92, 0xCC, 0x81, 0xCE, 0xA5, 0xCC, 0x81,
+ 0xF5, 0x05, 0xCF, 0x92, 0xCC, 0x88, 0xCE, 0xA5,
+ 0xCC, 0x88, 0xCF, 0x86, 0xCF, 0x80, 0xCE, 0xBA,
+ 0xCF, 0x81, 0xCF, 0x82, 0xCE, 0x98, 0xCE, 0xB5,
+ 0xF6, 0xD0, 0x95, 0xCC, 0x80, 0xF6, 0xD0, 0x95,
+ 0xCC, 0x88, 0xF6, 0xD0, 0x93, 0xCC, 0x81, 0xF6,
+ 0xD0, 0x86, 0xCC, 0x88, 0xF6, 0xD0, 0x9A, 0xCC,
+ 0x81, 0xF6, 0xD0, 0x98, 0xCC, 0x80, 0xF6, 0xD0,
+ 0xA3, 0xCC, 0x86, 0xF6, 0xD0, 0x98, 0xCC, 0x86,
+ 0xF6, 0xD0, 0xB8, 0xCC, 0x86, 0xF6, 0xD0, 0xB5,
+ 0xCC, 0x80, 0xF6, 0xD0, 0xB5, 0xCC, 0x88, 0xF6,
+ 0xD0, 0xB3, 0xCC, 0x81, 0xF6, 0xD1, 0x96, 0xCC,
+ 0x88, 0xF6, 0xD0, 0xBA, 0xCC, 0x81, 0xF6, 0xD0,
+ 0xB8, 0xCC, 0x80, 0xF6, 0xD1, 0x83, 0xCC, 0x86,
+ 0xF6, 0xD1, 0xB4, 0xCC, 0x8F, 0xF6, 0xD1, 0xB5,
+ 0xCC, 0x8F, 0xF6, 0xD0, 0x96, 0xCC, 0x86, 0xF6,
+ 0xD0, 0xB6, 0xCC, 0x86, 0xF6, 0xD0, 0x90, 0xCC,
+ 0x86, 0xF6, 0xD0, 0xB0, 0xCC, 0x86, 0xF6, 0xD0,
+ 0x90, 0xCC, 0x88, 0xF6, 0xD0, 0xB0, 0xCC, 0x88,
+ 0xF6, 0xD0, 0x95, 0xCC, 0x86, 0xF6, 0xD0, 0xB5,
+ 0xCC, 0x86, 0xF6, 0xD3, 0x98, 0xCC, 0x88, 0xF6,
+ 0xD3, 0x99, 0xCC, 0x88, 0xF6, 0xD0, 0x96, 0xCC,
+ 0x88, 0xF6, 0xD0, 0xB6, 0xCC, 0x88, 0xF6, 0xD0,
+ 0x97, 0xCC, 0x88, 0xF6, 0xD0, 0xB7, 0xCC, 0x88,
+ 0xF6, 0xD0, 0x98, 0xCC, 0x84, 0xF6, 0xD0, 0xB8,
+ 0xCC, 0x84, 0xF6, 0xD0, 0x98, 0xCC, 0x88, 0xF6,
+ 0xD0, 0xB8, 0xCC, 0x88, 0xF6, 0xD0, 0x9E, 0xCC,
+ 0x88, 0xF6, 0xD0, 0xBE, 0xCC, 0x88, 0xF6, 0xD3,
+ 0xA8, 0xCC, 0x88, 0xF6, 0xD3, 0xA9, 0xCC, 0x88,
+ 0xF6, 0xD0, 0xAD, 0xCC, 0x88, 0xF6, 0xD1, 0x8D,
+ 0xCC, 0x88, 0xF6, 0xD0, 0xA3, 0xCC, 0x84, 0xF6,
+ 0xD1, 0x83, 0xCC, 0x84, 0xF6, 0xD0, 0xA3, 0xCC,
+ 0x88, 0xF6, 0xD1, 0x83, 0xCC, 0x88, 0xF6, 0xD0,
+ 0xA3, 0xCC, 0x8B, 0xF6, 0xD1, 0x83, 0xCC, 0x8B,
+ 0xF6, 0xD0, 0xA7, 0xCC, 0x88, 0xF6, 0xD1, 0x87,
+ 0xCC, 0x88, 0xF6, 0xD0, 0xAB, 0xCC, 0x88, 0xF6,
+ 0xD1, 0x8B, 0xCC, 0x88, 0xD5, 0xA5, 0xD6, 0x82,
+ 0xF6, 0xD8, 0xA7, 0xD9, 0x93, 0xF6, 0xD8, 0xA7,
+ 0xD9, 0x94, 0xF6, 0xD9, 0x88, 0xD9, 0x94, 0xF6,
+ 0xD8, 0xA7, 0xD9, 0x95, 0xF6, 0xD9, 0x8A, 0xD9,
+ 0x94, 0xD8, 0xA7, 0xD9, 0xB4, 0xD9, 0x88, 0xD9,
+ 0xB4, 0xDB, 0x87, 0xD9, 0xB4, 0xD9, 0x8A, 0xD9,
+ 0xB4, 0xF6, 0xDB, 0x95, 0xD9, 0x94, 0xF6, 0xDB,
+ 0x81, 0xD9, 0x94, 0xF6, 0xDB, 0x92, 0xD9, 0x94,
+ 0xF6, 0xE0, 0xA4, 0xA8, 0xE0, 0xA4, 0xBC, 0xF6,
+ 0xE0, 0xA4, 0xB0, 0xE0, 0xA4, 0xBC, 0xF6, 0xE0,
+ 0xA4, 0xB3, 0xE0, 0xA4, 0xBC, 0xF6, 0xE0, 0xA4,
+ 0x95, 0xE0, 0xA4, 0xBC, 0xF6, 0xE0, 0xA4, 0x96,
+ 0xE0, 0xA4, 0xBC, 0xF6, 0xE0, 0xA4, 0x97, 0xE0,
+ 0xA4, 0xBC, 0xF6, 0xE0, 0xA4, 0x9C, 0xE0, 0xA4,
+ 0xBC, 0xF6, 0xE0, 0xA4, 0xA1, 0xE0, 0xA4, 0xBC,
+ 0xF6, 0xE0, 0xA4, 0xA2, 0xE0, 0xA4, 0xBC, 0xF6,
+ 0xE0, 0xA4, 0xAB, 0xE0, 0xA4, 0xBC, 0xF6, 0xE0,
+ 0xA4, 0xAF, 0xE0, 0xA4, 0xBC, 0xF6, 0xE0, 0xA7,
+ 0x87, 0xE0, 0xA6, 0xBE, 0xF6, 0xE0, 0xA7, 0x87,
+ 0xE0, 0xA7, 0x97, 0xF6, 0xE0, 0xA6, 0xA1, 0xE0,
+ 0xA6, 0xBC, 0xF6, 0xE0, 0xA6, 0xA2, 0xE0, 0xA6,
+ 0xBC, 0xF6, 0xE0, 0xA6, 0xAF, 0xE0, 0xA6, 0xBC,
+ 0xF6, 0xE0, 0xA8, 0xB2, 0xE0, 0xA8, 0xBC, 0xF6,
+ 0xE0, 0xA8, 0xB8, 0xE0, 0xA8, 0xBC, 0xF6, 0xE0,
+ 0xA8, 0x96, 0xE0, 0xA8, 0xBC, 0xF6, 0xE0, 0xA8,
+ 0x97, 0xE0, 0xA8, 0xBC, 0xF6, 0xE0, 0xA8, 0x9C,
+ 0xE0, 0xA8, 0xBC, 0xF6, 0xE0, 0xA8, 0xAB, 0xE0,
+ 0xA8, 0xBC, 0xF6, 0xE0, 0xAD, 0x87, 0xE0, 0xAD,
+ 0x96, 0xF6, 0xE0, 0xAD, 0x87, 0xE0, 0xAC, 0xBE,
+ 0xF6, 0xE0, 0xAD, 0x87, 0xE0, 0xAD, 0x97, 0xF6,
+ 0xE0, 0xAC, 0xA1, 0xE0, 0xAC, 0xBC, 0xF6, 0xE0,
+ 0xAC, 0xA2, 0xE0, 0xAC, 0xBC, 0xF6, 0xE0, 0xAE,
+ 0x92, 0xE0, 0xAF, 0x97, 0xF6, 0xE0, 0xAF, 0x86,
+ 0xE0, 0xAE, 0xBE, 0xF6, 0xE0, 0xAF, 0x87, 0xE0,
+ 0xAE, 0xBE, 0xF6, 0xE0, 0xAF, 0x86, 0xE0, 0xAF,
+ 0x97, 0xF6, 0xE0, 0xB1, 0x86, 0xE0, 0xB1, 0x96,
+ 0xF6, 0xE0, 0xB2, 0xBF, 0xE0, 0xB3, 0x95, 0xF6,
+ 0xE0, 0xB3, 0x86, 0xE0, 0xB3, 0x95, 0xF6, 0xE0,
+ 0xB3, 0x86, 0xE0, 0xB3, 0x96, 0xF6, 0xE0, 0xB3,
+ 0x86, 0xE0, 0xB3, 0x82, 0xF6, 0xE0, 0xB3, 0x86,
+ 0xE0, 0xB3, 0x82, 0xE0, 0xB3, 0x95, 0xF6, 0xE0,
+ 0xB5, 0x86, 0xE0, 0xB4, 0xBE, 0xF6, 0xE0, 0xB5,
+ 0x87, 0xE0, 0xB4, 0xBE, 0xF6, 0xE0, 0xB5, 0x86,
+ 0xE0, 0xB5, 0x97, 0xF6, 0xE0, 0xB7, 0x99, 0xE0,
+ 0xB7, 0x8A, 0xF6, 0xE0, 0xB7, 0x99, 0xE0, 0xB7,
+ 0x8F, 0xF6, 0xE0, 0xB7, 0x99, 0xE0, 0xB7, 0x8F,
+ 0xE0, 0xB7, 0x8A, 0xF6, 0xE0, 0xB7, 0x99, 0xE0,
+ 0xB7, 0x9F, 0xE0, 0xB9, 0x8D, 0xE0, 0xB8, 0xB2,
+ 0xE0, 0xBB, 0x8D, 0xE0, 0xBA, 0xB2, 0xE0, 0xBA,
+ 0xAB, 0xE0, 0xBA, 0x99, 0xE0, 0xBA, 0xAB, 0xE0,
+ 0xBA, 0xA1, 0xE0, 0xBC, 0x8B, 0xF6, 0xE0, 0xBD,
+ 0x82, 0xE0, 0xBE, 0xB7, 0xF6, 0xE0, 0xBD, 0x8C,
+ 0xE0, 0xBE, 0xB7, 0xF6, 0xE0, 0xBD, 0x91, 0xE0,
+ 0xBE, 0xB7, 0xF6, 0xE0, 0xBD, 0x96, 0xE0, 0xBE,
+ 0xB7, 0xF6, 0xE0, 0xBD, 0x9B, 0xE0, 0xBE, 0xB7,
+ 0xF6, 0xE0, 0xBD, 0x80, 0xE0, 0xBE, 0xB5, 0xF6,
+ 0xE0, 0xBD, 0xB1, 0xE0, 0xBD, 0xB2, 0xF6, 0xE0,
+ 0xBD, 0xB1, 0xE0, 0xBD, 0xB4, 0xF6, 0xE0, 0xBE,
+ 0xB2, 0xE0, 0xBE, 0x80, 0xE0, 0xBE, 0xB2, 0xE0,
+ 0xBD, 0xB1, 0xE0, 0xBE, 0x80, 0xF6, 0xE0, 0xBE,
+ 0xB3, 0xE0, 0xBE, 0x80, 0xE0, 0xBE, 0xB3, 0xE0,
+ 0xBD, 0xB1, 0xE0, 0xBE, 0x80, 0xF6, 0xE0, 0xBD,
+ 0xB1, 0xE0, 0xBE, 0x80, 0xF6, 0xE0, 0xBE, 0x92,
+ 0xE0, 0xBE, 0xB7, 0xF6, 0xE0, 0xBE, 0x9C, 0xE0,
+ 0xBE, 0xB7, 0xF6, 0xE0, 0xBE, 0xA1, 0xE0, 0xBE,
+ 0xB7, 0xF6, 0xE0, 0xBE, 0xA6, 0xE0, 0xBE, 0xB7,
+ 0xF6, 0xE0, 0xBE, 0xAB, 0xE0, 0xBE, 0xB7, 0xF6,
+ 0xE0, 0xBE, 0x90, 0xE0, 0xBE, 0xB5, 0xF6, 0xE1,
+ 0x80, 0xA5, 0xE1, 0x80, 0xAE, 0xF6, 0x41, 0xCC,
+ 0xA5, 0xF6, 0x61, 0xCC, 0xA5, 0xF6, 0x42, 0xCC,
+ 0x87, 0xF6, 0x62, 0xCC, 0x87, 0xF6, 0x42, 0xCC,
+ 0xA3, 0xF6, 0x62, 0xCC, 0xA3, 0xF6, 0x42, 0xCC,
+ 0xB1, 0xF6, 0x62, 0xCC, 0xB1, 0xF6, 0x43, 0xCC,
+ 0xA7, 0xCC, 0x81, 0xF6, 0x63, 0xCC, 0xA7, 0xCC,
+ 0x81, 0xF6, 0x44, 0xCC, 0x87, 0xF6, 0x64, 0xCC,
+ 0x87, 0xF6, 0x44, 0xCC, 0xA3, 0xF6, 0x64, 0xCC,
+ 0xA3, 0xF6, 0x44, 0xCC, 0xB1, 0xF6, 0x64, 0xCC,
+ 0xB1, 0xF6, 0x44, 0xCC, 0xA7, 0xF6, 0x64, 0xCC,
+ 0xA7, 0xF6, 0x44, 0xCC, 0xAD, 0xF6, 0x64, 0xCC,
+ 0xAD, 0xF6, 0x45, 0xCC, 0x84, 0xCC, 0x80, 0xF6,
+ 0x65, 0xCC, 0x84, 0xCC, 0x80, 0xF6, 0x45, 0xCC,
+ 0x84, 0xCC, 0x81, 0xF6, 0x65, 0xCC, 0x84, 0xCC,
+ 0x81, 0xF6, 0x45, 0xCC, 0xAD, 0xF6, 0x65, 0xCC,
+ 0xAD, 0xF6, 0x45, 0xCC, 0xB0, 0xF6, 0x65, 0xCC,
+ 0xB0, 0xF6, 0x45, 0xCC, 0xA7, 0xCC, 0x86, 0xF6,
+ 0x65, 0xCC, 0xA7, 0xCC, 0x86, 0xF6, 0x46, 0xCC,
+ 0x87, 0xF6, 0x66, 0xCC, 0x87, 0xF6, 0x47, 0xCC,
+ 0x84, 0xF6, 0x67, 0xCC, 0x84, 0xF6, 0x48, 0xCC,
+ 0x87, 0xF6, 0x68, 0xCC, 0x87, 0xF6, 0x48, 0xCC,
+ 0xA3, 0xF6, 0x68, 0xCC, 0xA3, 0xF6, 0x48, 0xCC,
+ 0x88, 0xF6, 0x68, 0xCC, 0x88, 0xF6, 0x48, 0xCC,
+ 0xA7, 0xF6, 0x68, 0xCC, 0xA7, 0xF6, 0x48, 0xCC,
+ 0xAE, 0xF6, 0x68, 0xCC, 0xAE, 0xF6, 0x49, 0xCC,
+ 0xB0, 0xF6, 0x69, 0xCC, 0xB0, 0xF6, 0x49, 0xCC,
+ 0x88, 0xCC, 0x81, 0xF6, 0x69, 0xCC, 0x88, 0xCC,
+ 0x81, 0xF6, 0x4B, 0xCC, 0x81, 0xF6, 0x6B, 0xCC,
+ 0x81, 0xF6, 0x4B, 0xCC, 0xA3, 0xF6, 0x6B, 0xCC,
+ 0xA3, 0xF6, 0x4B, 0xCC, 0xB1, 0xF6, 0x6B, 0xCC,
+ 0xB1, 0xF6, 0x4C, 0xCC, 0xA3, 0xF6, 0x6C, 0xCC,
+ 0xA3, 0xF6, 0x4C, 0xCC, 0xA3, 0xCC, 0x84, 0xF6,
+ 0x6C, 0xCC, 0xA3, 0xCC, 0x84, 0xF6, 0x4C, 0xCC,
+ 0xB1, 0xF6, 0x6C, 0xCC, 0xB1, 0xF6, 0x4C, 0xCC,
+ 0xAD, 0xF6, 0x6C, 0xCC, 0xAD, 0xF6, 0x4D, 0xCC,
+ 0x81, 0xF6, 0x6D, 0xCC, 0x81, 0xF6, 0x4D, 0xCC,
+ 0x87, 0xF6, 0x6D, 0xCC, 0x87, 0xF6, 0x4D, 0xCC,
+ 0xA3, 0xF6, 0x6D, 0xCC, 0xA3, 0xF6, 0x4E, 0xCC,
+ 0x87, 0xF6, 0x6E, 0xCC, 0x87, 0xF6, 0x4E, 0xCC,
+ 0xA3, 0xF6, 0x6E, 0xCC, 0xA3, 0xF6, 0x4E, 0xCC,
+ 0xB1, 0xF6, 0x6E, 0xCC, 0xB1, 0xF6, 0x4E, 0xCC,
+ 0xAD, 0xF6, 0x6E, 0xCC, 0xAD, 0xF6, 0x4F, 0xCC,
+ 0x83, 0xCC, 0x81, 0xF6, 0x6F, 0xCC, 0x83, 0xCC,
+ 0x81, 0xF6, 0x4F, 0xCC, 0x83, 0xCC, 0x88, 0xF6,
+ 0x6F, 0xCC, 0x83, 0xCC, 0x88, 0xF6, 0x4F, 0xCC,
+ 0x84, 0xCC, 0x80, 0xF6, 0x6F, 0xCC, 0x84, 0xCC,
+ 0x80, 0xF6, 0x4F, 0xCC, 0x84, 0xCC, 0x81, 0xF6,
+ 0x6F, 0xCC, 0x84, 0xCC, 0x81, 0xF6, 0x50, 0xCC,
+ 0x81, 0xF6, 0x70, 0xCC, 0x81, 0xF6, 0x50, 0xCC,
+ 0x87, 0xF6, 0x70, 0xCC, 0x87, 0xF6, 0x52, 0xCC,
+ 0x87, 0xF6, 0x72, 0xCC, 0x87, 0xF6, 0x52, 0xCC,
+ 0xA3, 0xF6, 0x72, 0xCC, 0xA3, 0xF6, 0x52, 0xCC,
+ 0xA3, 0xCC, 0x84, 0xF6, 0x72, 0xCC, 0xA3, 0xCC,
+ 0x84, 0xF6, 0x52, 0xCC, 0xB1, 0xF6, 0x72, 0xCC,
+ 0xB1, 0xF6, 0x53, 0xCC, 0x87, 0xF6, 0x73, 0xCC,
+ 0x87, 0xF6, 0x53, 0xCC, 0xA3, 0xF6, 0x73, 0xCC,
+ 0xA3, 0xF6, 0x53, 0xCC, 0x81, 0xCC, 0x87, 0xF6,
+ 0x73, 0xCC, 0x81, 0xCC, 0x87, 0xF6, 0x53, 0xCC,
+ 0x8C, 0xCC, 0x87, 0xF6, 0x73, 0xCC, 0x8C, 0xCC,
+ 0x87, 0xF6, 0x53, 0xCC, 0xA3, 0xCC, 0x87, 0xF6,
+ 0x73, 0xCC, 0xA3, 0xCC, 0x87, 0xF6, 0x54, 0xCC,
+ 0x87, 0xF6, 0x74, 0xCC, 0x87, 0xF6, 0x54, 0xCC,
+ 0xA3, 0xF6, 0x74, 0xCC, 0xA3, 0xF6, 0x54, 0xCC,
+ 0xB1, 0xF6, 0x74, 0xCC, 0xB1, 0xF6, 0x54, 0xCC,
+ 0xAD, 0xF6, 0x74, 0xCC, 0xAD, 0xF6, 0x55, 0xCC,
+ 0xA4, 0xF6, 0x75, 0xCC, 0xA4, 0xF6, 0x55, 0xCC,
+ 0xB0, 0xF6, 0x75, 0xCC, 0xB0, 0xF6, 0x55, 0xCC,
+ 0xAD, 0xF6, 0x75, 0xCC, 0xAD, 0xF6, 0x55, 0xCC,
+ 0x83, 0xCC, 0x81, 0xF6, 0x75, 0xCC, 0x83, 0xCC,
+ 0x81, 0xF6, 0x55, 0xCC, 0x84, 0xCC, 0x88, 0xF6,
+ 0x75, 0xCC, 0x84, 0xCC, 0x88, 0xF6, 0x56, 0xCC,
+ 0x83, 0xF6, 0x76, 0xCC, 0x83, 0xF6, 0x56, 0xCC,
+ 0xA3, 0xF6, 0x76, 0xCC, 0xA3, 0xF6, 0x57, 0xCC,
+ 0x80, 0xF6, 0x77, 0xCC, 0x80, 0xF6, 0x57, 0xCC,
+ 0x81, 0xF6, 0x77, 0xCC, 0x81, 0xF6, 0x57, 0xCC,
+ 0x88, 0xF6, 0x77, 0xCC, 0x88, 0xF6, 0x57, 0xCC,
+ 0x87, 0xF6, 0x77, 0xCC, 0x87, 0xF6, 0x57, 0xCC,
+ 0xA3, 0xF6, 0x77, 0xCC, 0xA3, 0xF6, 0x58, 0xCC,
+ 0x87, 0xF6, 0x78, 0xCC, 0x87, 0xF6, 0x58, 0xCC,
+ 0x88, 0xF6, 0x78, 0xCC, 0x88, 0xF6, 0x59, 0xCC,
+ 0x87, 0xF6, 0x79, 0xCC, 0x87, 0xF6, 0x5A, 0xCC,
+ 0x82, 0xF6, 0x7A, 0xCC, 0x82, 0xF6, 0x5A, 0xCC,
+ 0xA3, 0xF6, 0x7A, 0xCC, 0xA3, 0xF6, 0x5A, 0xCC,
+ 0xB1, 0xF6, 0x7A, 0xCC, 0xB1, 0xF6, 0x68, 0xCC,
+ 0xB1, 0xF6, 0x74, 0xCC, 0x88, 0xF6, 0x77, 0xCC,
+ 0x8A, 0xF6, 0x79, 0xCC, 0x8A, 0x61, 0xCA, 0xBE,
+ 0xF5, 0x05, 0xC5, 0xBF, 0xCC, 0x87, 0x73, 0xCC,
+ 0x87, 0xF6, 0x41, 0xCC, 0xA3, 0xF6, 0x61, 0xCC,
+ 0xA3, 0xF6, 0x41, 0xCC, 0x89, 0xF6, 0x61, 0xCC,
+ 0x89, 0xF6, 0x41, 0xCC, 0x82, 0xCC, 0x81, 0xF6,
+ 0x61, 0xCC, 0x82, 0xCC, 0x81, 0xF6, 0x41, 0xCC,
+ 0x82, 0xCC, 0x80, 0xF6, 0x61, 0xCC, 0x82, 0xCC,
+ 0x80, 0xF6, 0x41, 0xCC, 0x82, 0xCC, 0x89, 0xF6,
+ 0x61, 0xCC, 0x82, 0xCC, 0x89, 0xF6, 0x41, 0xCC,
+ 0x82, 0xCC, 0x83, 0xF6, 0x61, 0xCC, 0x82, 0xCC,
+ 0x83, 0xF6, 0x41, 0xCC, 0xA3, 0xCC, 0x82, 0xF6,
+ 0x61, 0xCC, 0xA3, 0xCC, 0x82, 0xF6, 0x41, 0xCC,
+ 0x86, 0xCC, 0x81, 0xF6, 0x61, 0xCC, 0x86, 0xCC,
+ 0x81, 0xF6, 0x41, 0xCC, 0x86, 0xCC, 0x80, 0xF6,
+ 0x61, 0xCC, 0x86, 0xCC, 0x80, 0xF6, 0x41, 0xCC,
+ 0x86, 0xCC, 0x89, 0xF6, 0x61, 0xCC, 0x86, 0xCC,
+ 0x89, 0xF6, 0x41, 0xCC, 0x86, 0xCC, 0x83, 0xF6,
+ 0x61, 0xCC, 0x86, 0xCC, 0x83, 0xF6, 0x41, 0xCC,
+ 0xA3, 0xCC, 0x86, 0xF6, 0x61, 0xCC, 0xA3, 0xCC,
+ 0x86, 0xF6, 0x45, 0xCC, 0xA3, 0xF6, 0x65, 0xCC,
+ 0xA3, 0xF6, 0x45, 0xCC, 0x89, 0xF6, 0x65, 0xCC,
+ 0x89, 0xF6, 0x45, 0xCC, 0x83, 0xF6, 0x65, 0xCC,
+ 0x83, 0xF6, 0x45, 0xCC, 0x82, 0xCC, 0x81, 0xF6,
+ 0x65, 0xCC, 0x82, 0xCC, 0x81, 0xF6, 0x45, 0xCC,
+ 0x82, 0xCC, 0x80, 0xF6, 0x65, 0xCC, 0x82, 0xCC,
+ 0x80, 0xF6, 0x45, 0xCC, 0x82, 0xCC, 0x89, 0xF6,
+ 0x65, 0xCC, 0x82, 0xCC, 0x89, 0xF6, 0x45, 0xCC,
+ 0x82, 0xCC, 0x83, 0xF6, 0x65, 0xCC, 0x82, 0xCC,
+ 0x83, 0xF6, 0x45, 0xCC, 0xA3, 0xCC, 0x82, 0xF6,
+ 0x65, 0xCC, 0xA3, 0xCC, 0x82, 0xF6, 0x49, 0xCC,
+ 0x89, 0xF6, 0x69, 0xCC, 0x89, 0xF6, 0x49, 0xCC,
+ 0xA3, 0xF6, 0x69, 0xCC, 0xA3, 0xF6, 0x4F, 0xCC,
+ 0xA3, 0xF6, 0x6F, 0xCC, 0xA3, 0xF6, 0x4F, 0xCC,
+ 0x89, 0xF6, 0x6F, 0xCC, 0x89, 0xF6, 0x4F, 0xCC,
+ 0x82, 0xCC, 0x81, 0xF6, 0x6F, 0xCC, 0x82, 0xCC,
+ 0x81, 0xF6, 0x4F, 0xCC, 0x82, 0xCC, 0x80, 0xF6,
+ 0x6F, 0xCC, 0x82, 0xCC, 0x80, 0xF6, 0x4F, 0xCC,
+ 0x82, 0xCC, 0x89, 0xF6, 0x6F, 0xCC, 0x82, 0xCC,
+ 0x89, 0xF6, 0x4F, 0xCC, 0x82, 0xCC, 0x83, 0xF6,
+ 0x6F, 0xCC, 0x82, 0xCC, 0x83, 0xF6, 0x4F, 0xCC,
+ 0xA3, 0xCC, 0x82, 0xF6, 0x6F, 0xCC, 0xA3, 0xCC,
+ 0x82, 0xF6, 0x4F, 0xCC, 0x9B, 0xCC, 0x81, 0xF6,
+ 0x6F, 0xCC, 0x9B, 0xCC, 0x81, 0xF6, 0x4F, 0xCC,
+ 0x9B, 0xCC, 0x80, 0xF6, 0x6F, 0xCC, 0x9B, 0xCC,
+ 0x80, 0xF6, 0x4F, 0xCC, 0x9B, 0xCC, 0x89, 0xF6,
+ 0x6F, 0xCC, 0x9B, 0xCC, 0x89, 0xF6, 0x4F, 0xCC,
+ 0x9B, 0xCC, 0x83, 0xF6, 0x6F, 0xCC, 0x9B, 0xCC,
+ 0x83, 0xF6, 0x4F, 0xCC, 0x9B, 0xCC, 0xA3, 0xF6,
+ 0x6F, 0xCC, 0x9B, 0xCC, 0xA3, 0xF6, 0x55, 0xCC,
+ 0xA3, 0xF6, 0x75, 0xCC, 0xA3, 0xF6, 0x55, 0xCC,
+ 0x89, 0xF6, 0x75, 0xCC, 0x89, 0xF6, 0x55, 0xCC,
+ 0x9B, 0xCC, 0x81, 0xF6, 0x75, 0xCC, 0x9B, 0xCC,
+ 0x81, 0xF6, 0x55, 0xCC, 0x9B, 0xCC, 0x80, 0xF6,
+ 0x75, 0xCC, 0x9B, 0xCC, 0x80, 0xF6, 0x55, 0xCC,
+ 0x9B, 0xCC, 0x89, 0xF6, 0x75, 0xCC, 0x9B, 0xCC,
+ 0x89, 0xF6, 0x55, 0xCC, 0x9B, 0xCC, 0x83, 0xF6,
+ 0x75, 0xCC, 0x9B, 0xCC, 0x83, 0xF6, 0x55, 0xCC,
+ 0x9B, 0xCC, 0xA3, 0xF6, 0x75, 0xCC, 0x9B, 0xCC,
+ 0xA3, 0xF6, 0x59, 0xCC, 0x80, 0xF6, 0x79, 0xCC,
+ 0x80, 0xF6, 0x59, 0xCC, 0xA3, 0xF6, 0x79, 0xCC,
+ 0xA3, 0xF6, 0x59, 0xCC, 0x89, 0xF6, 0x79, 0xCC,
+ 0x89, 0xF6, 0x59, 0xCC, 0x83, 0xF6, 0x79, 0xCC,
+ 0x83, 0xF6, 0xCE, 0xB1, 0xCC, 0x93, 0xF6, 0xCE,
+ 0xB1, 0xCC, 0x94, 0xF6, 0xCE, 0xB1, 0xCC, 0x93,
+ 0xCC, 0x80, 0xF6, 0xCE, 0xB1, 0xCC, 0x94, 0xCC,
+ 0x80, 0xF6, 0xCE, 0xB1, 0xCC, 0x93, 0xCC, 0x81,
+ 0xF6, 0xCE, 0xB1, 0xCC, 0x94, 0xCC, 0x81, 0xF6,
+ 0xCE, 0xB1, 0xCC, 0x93, 0xCD, 0x82, 0xF6, 0xCE,
+ 0xB1, 0xCC, 0x94, 0xCD, 0x82, 0xF6, 0xCE, 0x91,
+ 0xCC, 0x93, 0xF6, 0xCE, 0x91, 0xCC, 0x94, 0xF6,
+ 0xCE, 0x91, 0xCC, 0x93, 0xCC, 0x80, 0xF6, 0xCE,
+ 0x91, 0xCC, 0x94, 0xCC, 0x80, 0xF6, 0xCE, 0x91,
+ 0xCC, 0x93, 0xCC, 0x81, 0xF6, 0xCE, 0x91, 0xCC,
+ 0x94, 0xCC, 0x81, 0xF6, 0xCE, 0x91, 0xCC, 0x93,
+ 0xCD, 0x82, 0xF6, 0xCE, 0x91, 0xCC, 0x94, 0xCD,
+ 0x82, 0xF6, 0xCE, 0xB5, 0xCC, 0x93, 0xF6, 0xCE,
+ 0xB5, 0xCC, 0x94, 0xF6, 0xCE, 0xB5, 0xCC, 0x93,
+ 0xCC, 0x80, 0xF6, 0xCE, 0xB5, 0xCC, 0x94, 0xCC,
+ 0x80, 0xF6, 0xCE, 0xB5, 0xCC, 0x93, 0xCC, 0x81,
+ 0xF6, 0xCE, 0xB5, 0xCC, 0x94, 0xCC, 0x81, 0xF6,
+ 0xCE, 0x95, 0xCC, 0x93, 0xF6, 0xCE, 0x95, 0xCC,
+ 0x94, 0xF6, 0xCE, 0x95, 0xCC, 0x93, 0xCC, 0x80,
+ 0xF6, 0xCE, 0x95, 0xCC, 0x94, 0xCC, 0x80, 0xF6,
+ 0xCE, 0x95, 0xCC, 0x93, 0xCC, 0x81, 0xF6, 0xCE,
+ 0x95, 0xCC, 0x94, 0xCC, 0x81, 0xF6, 0xCE, 0xB7,
+ 0xCC, 0x93, 0xF6, 0xCE, 0xB7, 0xCC, 0x94, 0xF6,
+ 0xCE, 0xB7, 0xCC, 0x93, 0xCC, 0x80, 0xF6, 0xCE,
+ 0xB7, 0xCC, 0x94, 0xCC, 0x80, 0xF6, 0xCE, 0xB7,
+ 0xCC, 0x93, 0xCC, 0x81, 0xF6, 0xCE, 0xB7, 0xCC,
+ 0x94, 0xCC, 0x81, 0xF6, 0xCE, 0xB7, 0xCC, 0x93,
+ 0xCD, 0x82, 0xF6, 0xCE, 0xB7, 0xCC, 0x94, 0xCD,
+ 0x82, 0xF6, 0xCE, 0x97, 0xCC, 0x93, 0xF6, 0xCE,
+ 0x97, 0xCC, 0x94, 0xF6, 0xCE, 0x97, 0xCC, 0x93,
+ 0xCC, 0x80, 0xF6, 0xCE, 0x97, 0xCC, 0x94, 0xCC,
+ 0x80, 0xF6, 0xCE, 0x97, 0xCC, 0x93, 0xCC, 0x81,
+ 0xF6, 0xCE, 0x97, 0xCC, 0x94, 0xCC, 0x81, 0xF6,
+ 0xCE, 0x97, 0xCC, 0x93, 0xCD, 0x82, 0xF6, 0xCE,
+ 0x97, 0xCC, 0x94, 0xCD, 0x82, 0xF6, 0xCE, 0xB9,
+ 0xCC, 0x93, 0xF6, 0xCE, 0xB9, 0xCC, 0x94, 0xF6,
+ 0xCE, 0xB9, 0xCC, 0x93, 0xCC, 0x80, 0xF6, 0xCE,
+ 0xB9, 0xCC, 0x94, 0xCC, 0x80, 0xF6, 0xCE, 0xB9,
+ 0xCC, 0x93, 0xCC, 0x81, 0xF6, 0xCE, 0xB9, 0xCC,
+ 0x94, 0xCC, 0x81, 0xF6, 0xCE, 0xB9, 0xCC, 0x93,
+ 0xCD, 0x82, 0xF6, 0xCE, 0xB9, 0xCC, 0x94, 0xCD,
+ 0x82, 0xF6, 0xCE, 0x99, 0xCC, 0x93, 0xF6, 0xCE,
+ 0x99, 0xCC, 0x94, 0xF6, 0xCE, 0x99, 0xCC, 0x93,
+ 0xCC, 0x80, 0xF6, 0xCE, 0x99, 0xCC, 0x94, 0xCC,
+ 0x80, 0xF6, 0xCE, 0x99, 0xCC, 0x93, 0xCC, 0x81,
+ 0xF6, 0xCE, 0x99, 0xCC, 0x94, 0xCC, 0x81, 0xF6,
+ 0xCE, 0x99, 0xCC, 0x93, 0xCD, 0x82, 0xF6, 0xCE,
+ 0x99, 0xCC, 0x94, 0xCD, 0x82, 0xF6, 0xCE, 0xBF,
+ 0xCC, 0x93, 0xF6, 0xCE, 0xBF, 0xCC, 0x94, 0xF6,
+ 0xCE, 0xBF, 0xCC, 0x93, 0xCC, 0x80, 0xF6, 0xCE,
+ 0xBF, 0xCC, 0x94, 0xCC, 0x80, 0xF6, 0xCE, 0xBF,
+ 0xCC, 0x93, 0xCC, 0x81, 0xF6, 0xCE, 0xBF, 0xCC,
+ 0x94, 0xCC, 0x81, 0xF6, 0xCE, 0x9F, 0xCC, 0x93,
+ 0xF6, 0xCE, 0x9F, 0xCC, 0x94, 0xF6, 0xCE, 0x9F,
+ 0xCC, 0x93, 0xCC, 0x80, 0xF6, 0xCE, 0x9F, 0xCC,
+ 0x94, 0xCC, 0x80, 0xF6, 0xCE, 0x9F, 0xCC, 0x93,
+ 0xCC, 0x81, 0xF6, 0xCE, 0x9F, 0xCC, 0x94, 0xCC,
+ 0x81, 0xF6, 0xCF, 0x85, 0xCC, 0x93, 0xF6, 0xCF,
+ 0x85, 0xCC, 0x94, 0xF6, 0xCF, 0x85, 0xCC, 0x93,
+ 0xCC, 0x80, 0xF6, 0xCF, 0x85, 0xCC, 0x94, 0xCC,
+ 0x80, 0xF6, 0xCF, 0x85, 0xCC, 0x93, 0xCC, 0x81,
+ 0xF6, 0xCF, 0x85, 0xCC, 0x94, 0xCC, 0x81, 0xF6,
+ 0xCF, 0x85, 0xCC, 0x93, 0xCD, 0x82, 0xF6, 0xCF,
+ 0x85, 0xCC, 0x94, 0xCD, 0x82, 0xF6, 0xCE, 0xA5,
+ 0xCC, 0x94, 0xF6, 0xCE, 0xA5, 0xCC, 0x94, 0xCC,
+ 0x80, 0xF6, 0xCE, 0xA5, 0xCC, 0x94, 0xCC, 0x81,
+ 0xF6, 0xCE, 0xA5, 0xCC, 0x94, 0xCD, 0x82, 0xF6,
+ 0xCF, 0x89, 0xCC, 0x93, 0xF6, 0xCF, 0x89, 0xCC,
+ 0x94, 0xF6, 0xCF, 0x89, 0xCC, 0x93, 0xCC, 0x80,
+ 0xF6, 0xCF, 0x89, 0xCC, 0x94, 0xCC, 0x80, 0xF6,
+ 0xCF, 0x89, 0xCC, 0x93, 0xCC, 0x81, 0xF6, 0xCF,
+ 0x89, 0xCC, 0x94, 0xCC, 0x81, 0xF6, 0xCF, 0x89,
+ 0xCC, 0x93, 0xCD, 0x82, 0xF6, 0xCF, 0x89, 0xCC,
+ 0x94, 0xCD, 0x82, 0xF6, 0xCE, 0xA9, 0xCC, 0x93,
+ 0xF6, 0xCE, 0xA9, 0xCC, 0x94, 0xF6, 0xCE, 0xA9,
+ 0xCC, 0x93, 0xCC, 0x80, 0xF6, 0xCE, 0xA9, 0xCC,
+ 0x94, 0xCC, 0x80, 0xF6, 0xCE, 0xA9, 0xCC, 0x93,
+ 0xCC, 0x81, 0xF6, 0xCE, 0xA9, 0xCC, 0x94, 0xCC,
+ 0x81, 0xF6, 0xCE, 0xA9, 0xCC, 0x93, 0xCD, 0x82,
+ 0xF6, 0xCE, 0xA9, 0xCC, 0x94, 0xCD, 0x82, 0xF6,
+ 0xCE, 0xB1, 0xCC, 0x80, 0xF6, 0xCE, 0xB1, 0xCC,
+ 0x81, 0xF6, 0xCE, 0xB5, 0xCC, 0x80, 0xF6, 0xCE,
+ 0xB5, 0xCC, 0x81, 0xF6, 0xCE, 0xB7, 0xCC, 0x80,
+ 0xF6, 0xCE, 0xB7, 0xCC, 0x81, 0xF6, 0xCE, 0xB9,
+ 0xCC, 0x80, 0xF6, 0xCE, 0xB9, 0xCC, 0x81, 0xF6,
+ 0xCE, 0xBF, 0xCC, 0x80, 0xF6, 0xCE, 0xBF, 0xCC,
+ 0x81, 0xF6, 0xCF, 0x85, 0xCC, 0x80, 0xF6, 0xCF,
+ 0x85, 0xCC, 0x81, 0xF6, 0xCF, 0x89, 0xCC, 0x80,
+ 0xF6, 0xCF, 0x89, 0xCC, 0x81, 0xF6, 0xCE, 0xB1,
+ 0xCC, 0x93, 0xCD, 0x85, 0xF6, 0xCE, 0xB1, 0xCC,
+ 0x94, 0xCD, 0x85, 0xF6, 0xCE, 0xB1, 0xCC, 0x93,
+ 0xCC, 0x80, 0xCD, 0x85, 0xF6, 0xCE, 0xB1, 0xCC,
+ 0x94, 0xCC, 0x80, 0xCD, 0x85, 0xF6, 0xCE, 0xB1,
+ 0xCC, 0x93, 0xCC, 0x81, 0xCD, 0x85, 0xF6, 0xCE,
+ 0xB1, 0xCC, 0x94, 0xCC, 0x81, 0xCD, 0x85, 0xF6,
+ 0xCE, 0xB1, 0xCC, 0x93, 0xCD, 0x82, 0xCD, 0x85,
+ 0xF6, 0xCE, 0xB1, 0xCC, 0x94, 0xCD, 0x82, 0xCD,
+ 0x85, 0xF6, 0xCE, 0x91, 0xCC, 0x93, 0xCD, 0x85,
+ 0xF6, 0xCE, 0x91, 0xCC, 0x94, 0xCD, 0x85, 0xF6,
+ 0xCE, 0x91, 0xCC, 0x93, 0xCC, 0x80, 0xCD, 0x85,
+ 0xF6, 0xCE, 0x91, 0xCC, 0x94, 0xCC, 0x80, 0xCD,
+ 0x85, 0xF6, 0xCE, 0x91, 0xCC, 0x93, 0xCC, 0x81,
+ 0xCD, 0x85, 0xF6, 0xCE, 0x91, 0xCC, 0x94, 0xCC,
+ 0x81, 0xCD, 0x85, 0xF6, 0xCE, 0x91, 0xCC, 0x93,
+ 0xCD, 0x82, 0xCD, 0x85, 0xF6, 0xCE, 0x91, 0xCC,
+ 0x94, 0xCD, 0x82, 0xCD, 0x85, 0xF6, 0xCE, 0xB7,
+ 0xCC, 0x93, 0xCD, 0x85, 0xF6, 0xCE, 0xB7, 0xCC,
+ 0x94, 0xCD, 0x85, 0xF6, 0xCE, 0xB7, 0xCC, 0x93,
+ 0xCC, 0x80, 0xCD, 0x85, 0xF6, 0xCE, 0xB7, 0xCC,
+ 0x94, 0xCC, 0x80, 0xCD, 0x85, 0xF6, 0xCE, 0xB7,
+ 0xCC, 0x93, 0xCC, 0x81, 0xCD, 0x85, 0xF6, 0xCE,
+ 0xB7, 0xCC, 0x94, 0xCC, 0x81, 0xCD, 0x85, 0xF6,
+ 0xCE, 0xB7, 0xCC, 0x93, 0xCD, 0x82, 0xCD, 0x85,
+ 0xF6, 0xCE, 0xB7, 0xCC, 0x94, 0xCD, 0x82, 0xCD,
+ 0x85, 0xF6, 0xCE, 0x97, 0xCC, 0x93, 0xCD, 0x85,
+ 0xF6, 0xCE, 0x97, 0xCC, 0x94, 0xCD, 0x85, 0xF6,
+ 0xCE, 0x97, 0xCC, 0x93, 0xCC, 0x80, 0xCD, 0x85,
+ 0xF6, 0xCE, 0x97, 0xCC, 0x94, 0xCC, 0x80, 0xCD,
+ 0x85, 0xF6, 0xCE, 0x97, 0xCC, 0x93, 0xCC, 0x81,
+ 0xCD, 0x85, 0xF6, 0xCE, 0x97, 0xCC, 0x94, 0xCC,
+ 0x81, 0xCD, 0x85, 0xF6, 0xCE, 0x97, 0xCC, 0x93,
+ 0xCD, 0x82, 0xCD, 0x85, 0xF6, 0xCE, 0x97, 0xCC,
+ 0x94, 0xCD, 0x82, 0xCD, 0x85, 0xF6, 0xCF, 0x89,
+ 0xCC, 0x93, 0xCD, 0x85, 0xF6, 0xCF, 0x89, 0xCC,
+ 0x94, 0xCD, 0x85, 0xF6, 0xCF, 0x89, 0xCC, 0x93,
+ 0xCC, 0x80, 0xCD, 0x85, 0xF6, 0xCF, 0x89, 0xCC,
+ 0x94, 0xCC, 0x80, 0xCD, 0x85, 0xF6, 0xCF, 0x89,
+ 0xCC, 0x93, 0xCC, 0x81, 0xCD, 0x85, 0xF6, 0xCF,
+ 0x89, 0xCC, 0x94, 0xCC, 0x81, 0xCD, 0x85, 0xF6,
+ 0xCF, 0x89, 0xCC, 0x93, 0xCD, 0x82, 0xCD, 0x85,
+ 0xF6, 0xCF, 0x89, 0xCC, 0x94, 0xCD, 0x82, 0xCD,
+ 0x85, 0xF6, 0xCE, 0xA9, 0xCC, 0x93, 0xCD, 0x85,
+ 0xF6, 0xCE, 0xA9, 0xCC, 0x94, 0xCD, 0x85, 0xF6,
+ 0xCE, 0xA9, 0xCC, 0x93, 0xCC, 0x80, 0xCD, 0x85,
+ 0xF6, 0xCE, 0xA9, 0xCC, 0x94, 0xCC, 0x80, 0xCD,
+ 0x85, 0xF6, 0xCE, 0xA9, 0xCC, 0x93, 0xCC, 0x81,
+ 0xCD, 0x85, 0xF6, 0xCE, 0xA9, 0xCC, 0x94, 0xCC,
+ 0x81, 0xCD, 0x85, 0xF6, 0xCE, 0xA9, 0xCC, 0x93,
+ 0xCD, 0x82, 0xCD, 0x85, 0xF6, 0xCE, 0xA9, 0xCC,
+ 0x94, 0xCD, 0x82, 0xCD, 0x85, 0xF6, 0xCE, 0xB1,
+ 0xCC, 0x86, 0xF6, 0xCE, 0xB1, 0xCC, 0x84, 0xF6,
+ 0xCE, 0xB1, 0xCC, 0x80, 0xCD, 0x85, 0xF6, 0xCE,
+ 0xB1, 0xCD, 0x85, 0xF6, 0xCE, 0xB1, 0xCC, 0x81,
+ 0xCD, 0x85, 0xF6, 0xCE, 0xB1, 0xCD, 0x82, 0xF6,
+ 0xCE, 0xB1, 0xCD, 0x82, 0xCD, 0x85, 0xF6, 0xCE,
+ 0x91, 0xCC, 0x86, 0xF6, 0xCE, 0x91, 0xCC, 0x84,
+ 0xF6, 0xCE, 0x91, 0xCC, 0x80, 0xF6, 0xCE, 0x91,
+ 0xCC, 0x81, 0xF6, 0xCE, 0x91, 0xCD, 0x85, 0x20,
+ 0xCC, 0x93, 0xF6, 0xCE, 0xB9, 0x20, 0xCC, 0x93,
+ 0x20, 0xCD, 0x82, 0xF5, 0x05, 0xC2, 0xA8, 0xCD,
+ 0x82, 0x20, 0xCC, 0x88, 0xCD, 0x82, 0xF6, 0xCE,
+ 0xB7, 0xCC, 0x80, 0xCD, 0x85, 0xF6, 0xCE, 0xB7,
+ 0xCD, 0x85, 0xF6, 0xCE, 0xB7, 0xCC, 0x81, 0xCD,
+ 0x85, 0xF6, 0xCE, 0xB7, 0xCD, 0x82, 0xF6, 0xCE,
+ 0xB7, 0xCD, 0x82, 0xCD, 0x85, 0xF6, 0xCE, 0x95,
+ 0xCC, 0x80, 0xF6, 0xCE, 0x95, 0xCC, 0x81, 0xF6,
+ 0xCE, 0x97, 0xCC, 0x80, 0xF6, 0xCE, 0x97, 0xCC,
+ 0x81, 0xF6, 0xCE, 0x97, 0xCD, 0x85, 0xF5, 0x06,
+ 0xE1, 0xBE, 0xBF, 0xCC, 0x80, 0x20, 0xCC, 0x93,
+ 0xCC, 0x80, 0xF5, 0x06, 0xE1, 0xBE, 0xBF, 0xCC,
+ 0x81, 0x20, 0xCC, 0x93, 0xCC, 0x81, 0xF5, 0x06,
+ 0xE1, 0xBE, 0xBF, 0xCD, 0x82, 0x20, 0xCC, 0x93,
+ 0xCD, 0x82, 0xF6, 0xCE, 0xB9, 0xCC, 0x86, 0xF6,
+ 0xCE, 0xB9, 0xCC, 0x84, 0xF6, 0xCE, 0xB9, 0xCC,
+ 0x88, 0xCC, 0x80, 0xF6, 0xCE, 0xB9, 0xCC, 0x88,
+ 0xCC, 0x81, 0xF6, 0xCE, 0xB9, 0xCD, 0x82, 0xF6,
+ 0xCE, 0xB9, 0xCC, 0x88, 0xCD, 0x82, 0xF6, 0xCE,
+ 0x99, 0xCC, 0x86, 0xF6, 0xCE, 0x99, 0xCC, 0x84,
+ 0xF6, 0xCE, 0x99, 0xCC, 0x80, 0xF6, 0xCE, 0x99,
+ 0xCC, 0x81, 0xF5, 0x06, 0xE1, 0xBF, 0xBE, 0xCC,
+ 0x80, 0x20, 0xCC, 0x94, 0xCC, 0x80, 0xF5, 0x06,
+ 0xE1, 0xBF, 0xBE, 0xCC, 0x81, 0x20, 0xCC, 0x94,
+ 0xCC, 0x81, 0xF5, 0x06, 0xE1, 0xBF, 0xBE, 0xCD,
+ 0x82, 0x20, 0xCC, 0x94, 0xCD, 0x82, 0xF6, 0xCF,
+ 0x85, 0xCC, 0x86, 0xF6, 0xCF, 0x85, 0xCC, 0x84,
+ 0xF6, 0xCF, 0x85, 0xCC, 0x88, 0xCC, 0x80, 0xF6,
+ 0xCF, 0x85, 0xCC, 0x88, 0xCC, 0x81, 0xF6, 0xCF,
+ 0x81, 0xCC, 0x93, 0xF6, 0xCF, 0x81, 0xCC, 0x94,
+ 0xF6, 0xCF, 0x85, 0xCD, 0x82, 0xF6, 0xCF, 0x85,
+ 0xCC, 0x88, 0xCD, 0x82, 0xF6, 0xCE, 0xA5, 0xCC,
+ 0x86, 0xF6, 0xCE, 0xA5, 0xCC, 0x84, 0xF6, 0xCE,
+ 0xA5, 0xCC, 0x80, 0xF6, 0xCE, 0xA5, 0xCC, 0x81,
+ 0xF6, 0xCE, 0xA1, 0xCC, 0x94, 0xF5, 0x05, 0xC2,
+ 0xA8, 0xCC, 0x80, 0x20, 0xCC, 0x88, 0xCC, 0x80,
+ 0xF5, 0x05, 0xC2, 0xA8, 0xCC, 0x81, 0x20, 0xCC,
+ 0x88, 0xCC, 0x81, 0xF6, 0x60, 0xF6, 0xCF, 0x89,
+ 0xCC, 0x80, 0xCD, 0x85, 0xF6, 0xCF, 0x89, 0xCD,
+ 0x85, 0xF6, 0xCF, 0x89, 0xCC, 0x81, 0xCD, 0x85,
+ 0xF6, 0xCF, 0x89, 0xCD, 0x82, 0xF6, 0xCF, 0x89,
+ 0xCD, 0x82, 0xCD, 0x85, 0xF6, 0xCE, 0x9F, 0xCC,
+ 0x80, 0xF6, 0xCE, 0x9F, 0xCC, 0x81, 0xF6, 0xCE,
+ 0xA9, 0xCC, 0x80, 0xF6, 0xCE, 0xA9, 0xCC, 0x81,
+ 0xF6, 0xCE, 0xA9, 0xCD, 0x85, 0xF5, 0x03, 0xC2,
+ 0xB4, 0x20, 0xCC, 0x81, 0x20, 0xCC, 0x94, 0xF5,
+ 0x04, 0xE2, 0x80, 0x82, 0x20, 0xF5, 0x04, 0xE2,
+ 0x80, 0x83, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20,
+ 0x20, 0x20, 0x20, 0x20, 0xE2, 0x80, 0x90, 0x20,
+ 0xCC, 0xB3, 0x2E, 0x2E, 0x2E, 0x2E, 0x2E, 0x2E,
+ 0x20, 0xE2, 0x80, 0xB2, 0xE2, 0x80, 0xB2, 0xE2,
+ 0x80, 0xB2, 0xE2, 0x80, 0xB2, 0xE2, 0x80, 0xB2,
+ 0xE2, 0x80, 0xB5, 0xE2, 0x80, 0xB5, 0xE2, 0x80,
+ 0xB5, 0xE2, 0x80, 0xB5, 0xE2, 0x80, 0xB5, 0x21,
+ 0x21, 0x20, 0xCC, 0x85, 0x3F, 0x3F, 0x3F, 0x21,
+ 0x21, 0x3F, 0xE2, 0x80, 0xB2, 0xE2, 0x80, 0xB2,
+ 0xE2, 0x80, 0xB2, 0xE2, 0x80, 0xB2, 0x20, 0x30,
+ 0x69, 0x34, 0x35, 0x36, 0x37, 0x38, 0x39, 0x2B,
+ 0xE2, 0x88, 0x92, 0x3D, 0x28, 0x29, 0x6E, 0x30,
+ 0x31, 0x32, 0x33, 0x34, 0x35, 0x36, 0x37, 0x38,
+ 0x39, 0x2B, 0xE2, 0x88, 0x92, 0x3D, 0x28, 0x29,
+ 0x52, 0x73, 0x61, 0x2F, 0x63, 0x61, 0x2F, 0x73,
+ 0x43, 0xC2, 0xB0, 0x43, 0x63, 0x2F, 0x6F, 0x63,
+ 0x2F, 0x75, 0xC6, 0x90, 0xC2, 0xB0, 0x46, 0x67,
+ 0x48, 0x48, 0x48, 0x68, 0xC4, 0xA7, 0x49, 0x49,
+ 0x4C, 0x6C, 0x4E, 0x4E, 0x6F, 0x50, 0x51, 0x52,
+ 0x52, 0x52, 0x53, 0x4D, 0x54, 0x45, 0x4C, 0x54,
+ 0x4D, 0x5A, 0xF6, 0xCE, 0xA9, 0x5A, 0xF6, 0x4B,
+ 0xF6, 0x41, 0xCC, 0x8A, 0x42, 0x43, 0x65, 0x45,
+ 0x46, 0x4D, 0x6F, 0xD7, 0x90, 0xD7, 0x91, 0xD7,
+ 0x92, 0xD7, 0x93, 0x69, 0xCE, 0xB3, 0xCE, 0x93,
+ 0xCE, 0xA0, 0xE2, 0x88, 0x91, 0x44, 0x64, 0x65,
+ 0x69, 0x6A, 0x31, 0xE2, 0x81, 0x84, 0x33, 0x32,
+ 0xE2, 0x81, 0x84, 0x33, 0x31, 0xE2, 0x81, 0x84,
+ 0x35, 0x32, 0xE2, 0x81, 0x84, 0x35, 0x33, 0xE2,
+ 0x81, 0x84, 0x35, 0x34, 0xE2, 0x81, 0x84, 0x35,
+ 0x31, 0xE2, 0x81, 0x84, 0x36, 0x35, 0xE2, 0x81,
+ 0x84, 0x36, 0x31, 0xE2, 0x81, 0x84, 0x38, 0x33,
+ 0xE2, 0x81, 0x84, 0x38, 0x35, 0xE2, 0x81, 0x84,
+ 0x38, 0x37, 0xE2, 0x81, 0x84, 0x38, 0x31, 0xE2,
+ 0x81, 0x84, 0x49, 0x49, 0x49, 0x49, 0x49, 0x49,
+ 0x49, 0x56, 0x56, 0x56, 0x49, 0x56, 0x49, 0x49,
+ 0x56, 0x49, 0x49, 0x49, 0x49, 0x58, 0x58, 0x58,
+ 0x49, 0x58, 0x49, 0x49, 0x4C, 0x43, 0x44, 0x4D,
+ 0x69, 0x69, 0x69, 0x69, 0x69, 0x69, 0x69, 0x76,
+ 0x76, 0x76, 0x69, 0x76, 0x69, 0x69, 0x76, 0x69,
+ 0x69, 0x69, 0x69, 0x78, 0x78, 0x78, 0x69, 0x78,
+ 0x69, 0x69, 0x6C, 0x63, 0x64, 0x6D, 0xF6, 0xE2,
+ 0x86, 0x90, 0xCC, 0xB8, 0xF6, 0xE2, 0x86, 0x92,
+ 0xCC, 0xB8, 0xF6, 0xE2, 0x86, 0x94, 0xCC, 0xB8,
+ 0xF6, 0xE2, 0x87, 0x90, 0xCC, 0xB8, 0xF6, 0xE2,
+ 0x87, 0x94, 0xCC, 0xB8, 0xF6, 0xE2, 0x87, 0x92,
+ 0xCC, 0xB8, 0xF6, 0xE2, 0x88, 0x83, 0xCC, 0xB8,
+ 0xF6, 0xE2, 0x88, 0x88, 0xCC, 0xB8, 0xF6, 0xE2,
+ 0x88, 0x8B, 0xCC, 0xB8, 0xF6, 0xE2, 0x88, 0xA3,
+ 0xCC, 0xB8, 0xF6, 0xE2, 0x88, 0xA5, 0xCC, 0xB8,
+ 0xE2, 0x88, 0xAB, 0xE2, 0x88, 0xAB, 0xE2, 0x88,
+ 0xAB, 0xE2, 0x88, 0xAB, 0xE2, 0x88, 0xAB, 0xE2,
+ 0x88, 0xAE, 0xE2, 0x88, 0xAE, 0xE2, 0x88, 0xAE,
+ 0xE2, 0x88, 0xAE, 0xE2, 0x88, 0xAE, 0xF6, 0xE2,
+ 0x88, 0xBC, 0xCC, 0xB8, 0xF6, 0xE2, 0x89, 0x83,
+ 0xCC, 0xB8, 0xF6, 0xE2, 0x89, 0x85, 0xCC, 0xB8,
+ 0xF6, 0xE2, 0x89, 0x88, 0xCC, 0xB8, 0xF6, 0x3D,
+ 0xCC, 0xB8, 0xF6, 0xE2, 0x89, 0xA1, 0xCC, 0xB8,
+ 0xF6, 0xE2, 0x89, 0x8D, 0xCC, 0xB8, 0xF6, 0x3C,
+ 0xCC, 0xB8, 0xF6, 0x3E, 0xCC, 0xB8, 0xF6, 0xE2,
+ 0x89, 0xA4, 0xCC, 0xB8, 0xF6, 0xE2, 0x89, 0xA5,
+ 0xCC, 0xB8, 0xF6, 0xE2, 0x89, 0xB2, 0xCC, 0xB8,
+ 0xF6, 0xE2, 0x89, 0xB3, 0xCC, 0xB8, 0xF6, 0xE2,
+ 0x89, 0xB6, 0xCC, 0xB8, 0xF6, 0xE2, 0x89, 0xB7,
+ 0xCC, 0xB8, 0xF6, 0xE2, 0x89, 0xBA, 0xCC, 0xB8,
+ 0xF6, 0xE2, 0x89, 0xBB, 0xCC, 0xB8, 0xF6, 0xE2,
+ 0x8A, 0x82, 0xCC, 0xB8, 0xF6, 0xE2, 0x8A, 0x83,
+ 0xCC, 0xB8, 0xF6, 0xE2, 0x8A, 0x86, 0xCC, 0xB8,
+ 0xF6, 0xE2, 0x8A, 0x87, 0xCC, 0xB8, 0xF6, 0xE2,
+ 0x8A, 0xA2, 0xCC, 0xB8, 0xF6, 0xE2, 0x8A, 0xA8,
+ 0xCC, 0xB8, 0xF6, 0xE2, 0x8A, 0xA9, 0xCC, 0xB8,
+ 0xF6, 0xE2, 0x8A, 0xAB, 0xCC, 0xB8, 0xF6, 0xE2,
+ 0x89, 0xBC, 0xCC, 0xB8, 0xF6, 0xE2, 0x89, 0xBD,
+ 0xCC, 0xB8, 0xF6, 0xE2, 0x8A, 0x91, 0xCC, 0xB8,
+ 0xF6, 0xE2, 0x8A, 0x92, 0xCC, 0xB8, 0xF6, 0xE2,
+ 0x8A, 0xB2, 0xCC, 0xB8, 0xF6, 0xE2, 0x8A, 0xB3,
+ 0xCC, 0xB8, 0xF6, 0xE2, 0x8A, 0xB4, 0xCC, 0xB8,
+ 0xF6, 0xE2, 0x8A, 0xB5, 0xCC, 0xB8, 0xF6, 0xE3,
+ 0x80, 0x88, 0xF6, 0xE3, 0x80, 0x89, 0x31, 0x32,
+ 0x33, 0x34, 0x35, 0x36, 0x37, 0x38, 0x39, 0x31,
+ 0x30, 0x31, 0x31, 0x31, 0x32, 0x31, 0x33, 0x31,
+ 0x34, 0x31, 0x35, 0x31, 0x36, 0x31, 0x37, 0x31,
+ 0x38, 0x31, 0x39, 0x32, 0x30, 0x28, 0x31, 0x29,
+ 0x28, 0x32, 0x29, 0x28, 0x33, 0x29, 0x28, 0x34,
+ 0x29, 0x28, 0x35, 0x29, 0x28, 0x36, 0x29, 0x28,
+ 0x37, 0x29, 0x28, 0x38, 0x29, 0x28, 0x39, 0x29,
+ 0x28, 0x31, 0x30, 0x29, 0x28, 0x31, 0x31, 0x29,
+ 0x28, 0x31, 0x32, 0x29, 0x28, 0x31, 0x33, 0x29,
+ 0x28, 0x31, 0x34, 0x29, 0x28, 0x31, 0x35, 0x29,
+ 0x28, 0x31, 0x36, 0x29, 0x28, 0x31, 0x37, 0x29,
+ 0x28, 0x31, 0x38, 0x29, 0x28, 0x31, 0x39, 0x29,
+ 0x28, 0x32, 0x30, 0x29, 0x31, 0x2E, 0x32, 0x2E,
+ 0x33, 0x2E, 0x34, 0x2E, 0x35, 0x2E, 0x36, 0x2E,
+ 0x37, 0x2E, 0x38, 0x2E, 0x39, 0x2E, 0x31, 0x30,
+ 0x2E, 0x31, 0x31, 0x2E, 0x31, 0x32, 0x2E, 0x31,
+ 0x33, 0x2E, 0x31, 0x34, 0x2E, 0x31, 0x35, 0x2E,
+ 0x31, 0x36, 0x2E, 0x31, 0x37, 0x2E, 0x31, 0x38,
+ 0x2E, 0x31, 0x39, 0x2E, 0x32, 0x30, 0x2E, 0x28,
+ 0x61, 0x29, 0x28, 0x62, 0x29, 0x28, 0x63, 0x29,
+ 0x28, 0x64, 0x29, 0x28, 0x65, 0x29, 0x28, 0x66,
+ 0x29, 0x28, 0x67, 0x29, 0x28, 0x68, 0x29, 0x28,
+ 0x69, 0x29, 0x28, 0x6A, 0x29, 0x28, 0x6B, 0x29,
+ 0x28, 0x6C, 0x29, 0x28, 0x6D, 0x29, 0x28, 0x6E,
+ 0x29, 0x28, 0x6F, 0x29, 0x28, 0x70, 0x29, 0x28,
+ 0x71, 0x29, 0x28, 0x72, 0x29, 0x28, 0x73, 0x29,
+ 0x28, 0x74, 0x29, 0x28, 0x75, 0x29, 0x28, 0x76,
+ 0x29, 0x28, 0x77, 0x29, 0x28, 0x78, 0x29, 0x28,
+ 0x79, 0x29, 0x28, 0x7A, 0x29, 0x41, 0x42, 0x43,
+ 0x44, 0x45, 0x46, 0x47, 0x48, 0x49, 0x4A, 0x4B,
+ 0x4C, 0x4D, 0x4E, 0x4F, 0x50, 0x51, 0x52, 0x53,
+ 0x54, 0x55, 0x56, 0x57, 0x58, 0x59, 0x5A, 0x61,
+ 0x62, 0x63, 0x64, 0x65, 0x66, 0x67, 0x68, 0x69,
+ 0x6A, 0x6B, 0x6C, 0x6D, 0x6E, 0x6F, 0x70, 0x71,
+ 0x72, 0x73, 0x74, 0x75, 0x76, 0x77, 0x78, 0x79,
+ 0x7A, 0x30, 0xE2, 0x88, 0xAB, 0xE2, 0x88, 0xAB,
+ 0xE2, 0x88, 0xAB, 0xE2, 0x88, 0xAB, 0x3A, 0x3A,
+ 0x3D, 0x3D, 0x3D, 0x3D, 0x3D, 0x3D, 0xF6, 0xE2,
+ 0xAB, 0x9D, 0xCC, 0xB8, 0xE6, 0xAF, 0x8D, 0xE9,
+ 0xBE, 0x9F, 0xE4, 0xB8, 0x80, 0xE4, 0xB8, 0xA8,
+ 0xE4, 0xB8, 0xB6, 0xE4, 0xB8, 0xBF, 0xE4, 0xB9,
+ 0x99, 0xE4, 0xBA, 0x85, 0xE4, 0xBA, 0x8C, 0xE4,
+ 0xBA, 0xA0, 0xE4, 0xBA, 0xBA, 0xE5, 0x84, 0xBF,
+ 0xE5, 0x85, 0xA5, 0xE5, 0x85, 0xAB, 0xE5, 0x86,
+ 0x82, 0xE5, 0x86, 0x96, 0xE5, 0x86, 0xAB, 0xE5,
+ 0x87, 0xA0, 0xE5, 0x87, 0xB5, 0xE5, 0x88, 0x80,
+ 0xE5, 0x8A, 0x9B, 0xE5, 0x8B, 0xB9, 0xE5, 0x8C,
+ 0x95, 0xE5, 0x8C, 0x9A, 0xE5, 0x8C, 0xB8, 0xE5,
+ 0x8D, 0x81, 0xE5, 0x8D, 0x9C, 0xE5, 0x8D, 0xA9,
+ 0xE5, 0x8E, 0x82, 0xE5, 0x8E, 0xB6, 0xE5, 0x8F,
+ 0x88, 0xE5, 0x8F, 0xA3, 0xE5, 0x9B, 0x97, 0xE5,
+ 0x9C, 0x9F, 0xE5, 0xA3, 0xAB, 0xE5, 0xA4, 0x82,
+ 0xE5, 0xA4, 0x8A, 0xE5, 0xA4, 0x95, 0xE5, 0xA4,
+ 0xA7, 0xE5, 0xA5, 0xB3, 0xE5, 0xAD, 0x90, 0xE5,
+ 0xAE, 0x80, 0xE5, 0xAF, 0xB8, 0xE5, 0xB0, 0x8F,
+ 0xE5, 0xB0, 0xA2, 0xE5, 0xB0, 0xB8, 0xE5, 0xB1,
+ 0xAE, 0xE5, 0xB1, 0xB1, 0xE5, 0xB7, 0x9B, 0xE5,
+ 0xB7, 0xA5, 0xE5, 0xB7, 0xB1, 0xE5, 0xB7, 0xBE,
+ 0xE5, 0xB9, 0xB2, 0xE5, 0xB9, 0xBA, 0xE5, 0xB9,
+ 0xBF, 0xE5, 0xBB, 0xB4, 0xE5, 0xBB, 0xBE, 0xE5,
+ 0xBC, 0x8B, 0xE5, 0xBC, 0x93, 0xE5, 0xBD, 0x90,
+ 0xE5, 0xBD, 0xA1, 0xE5, 0xBD, 0xB3, 0xE5, 0xBF,
+ 0x83, 0xE6, 0x88, 0x88, 0xE6, 0x88, 0xB6, 0xE6,
+ 0x89, 0x8B, 0xE6, 0x94, 0xAF, 0xE6, 0x94, 0xB4,
+ 0xE6, 0x96, 0x87, 0xE6, 0x96, 0x97, 0xE6, 0x96,
+ 0xA4, 0xE6, 0x96, 0xB9, 0xE6, 0x97, 0xA0, 0xE6,
+ 0x97, 0xA5, 0xE6, 0x9B, 0xB0, 0xE6, 0x9C, 0x88,
+ 0xE6, 0x9C, 0xA8, 0xE6, 0xAC, 0xA0, 0xE6, 0xAD,
+ 0xA2, 0xE6, 0xAD, 0xB9, 0xE6, 0xAE, 0xB3, 0xE6,
+ 0xAF, 0x8B, 0xE6, 0xAF, 0x94, 0xE6, 0xAF, 0x9B,
+ 0xE6, 0xB0, 0x8F, 0xE6, 0xB0, 0x94, 0xE6, 0xB0,
+ 0xB4, 0xE7, 0x81, 0xAB, 0xE7, 0x88, 0xAA, 0xE7,
+ 0x88, 0xB6, 0xE7, 0x88, 0xBB, 0xE7, 0x88, 0xBF,
+ 0xE7, 0x89, 0x87, 0xE7, 0x89, 0x99, 0xE7, 0x89,
+ 0x9B, 0xE7, 0x8A, 0xAC, 0xE7, 0x8E, 0x84, 0xE7,
+ 0x8E, 0x89, 0xE7, 0x93, 0x9C, 0xE7, 0x93, 0xA6,
+ 0xE7, 0x94, 0x98, 0xE7, 0x94, 0x9F, 0xE7, 0x94,
+ 0xA8, 0xE7, 0x94, 0xB0, 0xE7, 0x96, 0x8B, 0xE7,
+ 0x96, 0x92, 0xE7, 0x99, 0xB6, 0xE7, 0x99, 0xBD,
+ 0xE7, 0x9A, 0xAE, 0xE7, 0x9A, 0xBF, 0xE7, 0x9B,
+ 0xAE, 0xE7, 0x9F, 0x9B, 0xE7, 0x9F, 0xA2, 0xE7,
+ 0x9F, 0xB3, 0xE7, 0xA4, 0xBA, 0xE7, 0xA6, 0xB8,
+ 0xE7, 0xA6, 0xBE, 0xE7, 0xA9, 0xB4, 0xE7, 0xAB,
+ 0x8B, 0xE7, 0xAB, 0xB9, 0xE7, 0xB1, 0xB3, 0xE7,
+ 0xB3, 0xB8, 0xE7, 0xBC, 0xB6, 0xE7, 0xBD, 0x91,
+ 0xE7, 0xBE, 0x8A, 0xE7, 0xBE, 0xBD, 0xE8, 0x80,
+ 0x81, 0xE8, 0x80, 0x8C, 0xE8, 0x80, 0x92, 0xE8,
+ 0x80, 0xB3, 0xE8, 0x81, 0xBF, 0xE8, 0x82, 0x89,
+ 0xE8, 0x87, 0xA3, 0xE8, 0x87, 0xAA, 0xE8, 0x87,
+ 0xB3, 0xE8, 0x87, 0xBC, 0xE8, 0x88, 0x8C, 0xE8,
+ 0x88, 0x9B, 0xE8, 0x88, 0x9F, 0xE8, 0x89, 0xAE,
+ 0xE8, 0x89, 0xB2, 0xE8, 0x89, 0xB8, 0xE8, 0x99,
+ 0x8D, 0xE8, 0x99, 0xAB, 0xE8, 0xA1, 0x80, 0xE8,
+ 0xA1, 0x8C, 0xE8, 0xA1, 0xA3, 0xE8, 0xA5, 0xBE,
+ 0xE8, 0xA6, 0x8B, 0xE8, 0xA7, 0x92, 0xE8, 0xA8,
+ 0x80, 0xE8, 0xB0, 0xB7, 0xE8, 0xB1, 0x86, 0xE8,
+ 0xB1, 0x95, 0xE8, 0xB1, 0xB8, 0xE8, 0xB2, 0x9D,
+ 0xE8, 0xB5, 0xA4, 0xE8, 0xB5, 0xB0, 0xE8, 0xB6,
+ 0xB3, 0xE8, 0xBA, 0xAB, 0xE8, 0xBB, 0x8A, 0xE8,
+ 0xBE, 0x9B, 0xE8, 0xBE, 0xB0, 0xE8, 0xBE, 0xB5,
+ 0xE9, 0x82, 0x91, 0xE9, 0x85, 0x89, 0xE9, 0x87,
+ 0x86, 0xE9, 0x87, 0x8C, 0xE9, 0x87, 0x91, 0xE9,
+ 0x95, 0xB7, 0xE9, 0x96, 0x80, 0xE9, 0x98, 0x9C,
+ 0xE9, 0x9A, 0xB6, 0xE9, 0x9A, 0xB9, 0xE9, 0x9B,
+ 0xA8, 0xE9, 0x9D, 0x91, 0xE9, 0x9D, 0x9E, 0xE9,
+ 0x9D, 0xA2, 0xE9, 0x9D, 0xA9, 0xE9, 0x9F, 0x8B,
+ 0xE9, 0x9F, 0xAD, 0xE9, 0x9F, 0xB3, 0xE9, 0xA0,
+ 0x81, 0xE9, 0xA2, 0xA8, 0xE9, 0xA3, 0x9B, 0xE9,
+ 0xA3, 0x9F, 0xE9, 0xA6, 0x96, 0xE9, 0xA6, 0x99,
+ 0xE9, 0xA6, 0xAC, 0xE9, 0xAA, 0xA8, 0xE9, 0xAB,
+ 0x98, 0xE9, 0xAB, 0x9F, 0xE9, 0xAC, 0xA5, 0xE9,
+ 0xAC, 0xAF, 0xE9, 0xAC, 0xB2, 0xE9, 0xAC, 0xBC,
+ 0xE9, 0xAD, 0x9A, 0xE9, 0xB3, 0xA5, 0xE9, 0xB9,
+ 0xB5, 0xE9, 0xB9, 0xBF, 0xE9, 0xBA, 0xA5, 0xE9,
+ 0xBA, 0xBB, 0xE9, 0xBB, 0x83, 0xE9, 0xBB, 0x8D,
+ 0xE9, 0xBB, 0x91, 0xE9, 0xBB, 0xB9, 0xE9, 0xBB,
+ 0xBD, 0xE9, 0xBC, 0x8E, 0xE9, 0xBC, 0x93, 0xE9,
+ 0xBC, 0xA0, 0xE9, 0xBC, 0xBB, 0xE9, 0xBD, 0x8A,
+ 0xE9, 0xBD, 0x92, 0xE9, 0xBE, 0x8D, 0xE9, 0xBE,
+ 0x9C, 0xE9, 0xBE, 0xA0, 0x20, 0xE3, 0x80, 0x92,
+ 0xE5, 0x8D, 0x81, 0xE5, 0x8D, 0x84, 0xE5, 0x8D,
+ 0x85, 0xF6, 0xE3, 0x81, 0x8B, 0xE3, 0x82, 0x99,
+ 0xF6, 0xE3, 0x81, 0x8D, 0xE3, 0x82, 0x99, 0xF6,
+ 0xE3, 0x81, 0x8F, 0xE3, 0x82, 0x99, 0xF6, 0xE3,
+ 0x81, 0x91, 0xE3, 0x82, 0x99, 0xF6, 0xE3, 0x81,
+ 0x93, 0xE3, 0x82, 0x99, 0xF6, 0xE3, 0x81, 0x95,
+ 0xE3, 0x82, 0x99, 0xF6, 0xE3, 0x81, 0x97, 0xE3,
+ 0x82, 0x99, 0xF6, 0xE3, 0x81, 0x99, 0xE3, 0x82,
+ 0x99, 0xF6, 0xE3, 0x81, 0x9B, 0xE3, 0x82, 0x99,
+ 0xF6, 0xE3, 0x81, 0x9D, 0xE3, 0x82, 0x99, 0xF6,
+ 0xE3, 0x81, 0x9F, 0xE3, 0x82, 0x99, 0xF6, 0xE3,
+ 0x81, 0xA1, 0xE3, 0x82, 0x99, 0xF6, 0xE3, 0x81,
+ 0xA4, 0xE3, 0x82, 0x99, 0xF6, 0xE3, 0x81, 0xA6,
+ 0xE3, 0x82, 0x99, 0xF6, 0xE3, 0x81, 0xA8, 0xE3,
+ 0x82, 0x99, 0xF6, 0xE3, 0x81, 0xAF, 0xE3, 0x82,
+ 0x99, 0xF6, 0xE3, 0x81, 0xAF, 0xE3, 0x82, 0x9A,
+ 0xF6, 0xE3, 0x81, 0xB2, 0xE3, 0x82, 0x99, 0xF6,
+ 0xE3, 0x81, 0xB2, 0xE3, 0x82, 0x9A, 0xF6, 0xE3,
+ 0x81, 0xB5, 0xE3, 0x82, 0x99, 0xF6, 0xE3, 0x81,
+ 0xB5, 0xE3, 0x82, 0x9A, 0xF6, 0xE3, 0x81, 0xB8,
+ 0xE3, 0x82, 0x99, 0xF6, 0xE3, 0x81, 0xB8, 0xE3,
+ 0x82, 0x9A, 0xF6, 0xE3, 0x81, 0xBB, 0xE3, 0x82,
+ 0x99, 0xF6, 0xE3, 0x81, 0xBB, 0xE3, 0x82, 0x9A,
+ 0xF6, 0xE3, 0x81, 0x86, 0xE3, 0x82, 0x99, 0x20,
+ 0xE3, 0x82, 0x99, 0x20, 0xE3, 0x82, 0x9A, 0xF6,
+ 0xE3, 0x82, 0x9D, 0xE3, 0x82, 0x99, 0xE3, 0x82,
+ 0x88, 0xE3, 0x82, 0x8A, 0xF6, 0xE3, 0x82, 0xAB,
+ 0xE3, 0x82, 0x99, 0xF6, 0xE3, 0x82, 0xAD, 0xE3,
+ 0x82, 0x99, 0xF6, 0xE3, 0x82, 0xAF, 0xE3, 0x82,
+ 0x99, 0xF6, 0xE3, 0x82, 0xB1, 0xE3, 0x82, 0x99,
+ 0xF6, 0xE3, 0x82, 0xB3, 0xE3, 0x82, 0x99, 0xF6,
+ 0xE3, 0x82, 0xB5, 0xE3, 0x82, 0x99, 0xF6, 0xE3,
+ 0x82, 0xB7, 0xE3, 0x82, 0x99, 0xF6, 0xE3, 0x82,
+ 0xB9, 0xE3, 0x82, 0x99, 0xF6, 0xE3, 0x82, 0xBB,
+ 0xE3, 0x82, 0x99, 0xF6, 0xE3, 0x82, 0xBD, 0xE3,
+ 0x82, 0x99, 0xF6, 0xE3, 0x82, 0xBF, 0xE3, 0x82,
+ 0x99, 0xF6, 0xE3, 0x83, 0x81, 0xE3, 0x82, 0x99,
+ 0xF6, 0xE3, 0x83, 0x84, 0xE3, 0x82, 0x99, 0xF6,
+ 0xE3, 0x83, 0x86, 0xE3, 0x82, 0x99, 0xF6, 0xE3,
+ 0x83, 0x88, 0xE3, 0x82, 0x99, 0xF6, 0xE3, 0x83,
+ 0x8F, 0xE3, 0x82, 0x99, 0xF6, 0xE3, 0x83, 0x8F,
+ 0xE3, 0x82, 0x9A, 0xF6, 0xE3, 0x83, 0x92, 0xE3,
+ 0x82, 0x99, 0xF6, 0xE3, 0x83, 0x92, 0xE3, 0x82,
+ 0x9A, 0xF6, 0xE3, 0x83, 0x95, 0xE3, 0x82, 0x99,
+ 0xF6, 0xE3, 0x83, 0x95, 0xE3, 0x82, 0x9A, 0xF6,
+ 0xE3, 0x83, 0x98, 0xE3, 0x82, 0x99, 0xF6, 0xE3,
+ 0x83, 0x98, 0xE3, 0x82, 0x9A, 0xF6, 0xE3, 0x83,
+ 0x9B, 0xE3, 0x82, 0x99, 0xF6, 0xE3, 0x83, 0x9B,
+ 0xE3, 0x82, 0x9A, 0xF6, 0xE3, 0x82, 0xA6, 0xE3,
+ 0x82, 0x99, 0xF6, 0xE3, 0x83, 0xAF, 0xE3, 0x82,
+ 0x99, 0xF6, 0xE3, 0x83, 0xB0, 0xE3, 0x82, 0x99,
+ 0xF6, 0xE3, 0x83, 0xB1, 0xE3, 0x82, 0x99, 0xF6,
+ 0xE3, 0x83, 0xB2, 0xE3, 0x82, 0x99, 0xF6, 0xE3,
+ 0x83, 0xBD, 0xE3, 0x82, 0x99, 0xE3, 0x82, 0xB3,
+ 0xE3, 0x83, 0x88, 0xE1, 0x84, 0x80, 0xE1, 0x84,
+ 0x81, 0xE1, 0x86, 0xAA, 0xE1, 0x84, 0x82, 0xE1,
+ 0x86, 0xAC, 0xE1, 0x86, 0xAD, 0xE1, 0x84, 0x83,
+ 0xE1, 0x84, 0x84, 0xE1, 0x84, 0x85, 0xE1, 0x86,
+ 0xB0, 0xE1, 0x86, 0xB1, 0xE1, 0x86, 0xB2, 0xE1,
+ 0x86, 0xB3, 0xE1, 0x86, 0xB4, 0xE1, 0x86, 0xB5,
+ 0xE1, 0x84, 0x9A, 0xE1, 0x84, 0x86, 0xE1, 0x84,
+ 0x87, 0xE1, 0x84, 0x88, 0xE1, 0x84, 0xA1, 0xE1,
+ 0x84, 0x89, 0xE1, 0x84, 0x8A, 0xE1, 0x84, 0x8B,
+ 0xE1, 0x84, 0x8C, 0xE1, 0x84, 0x8D, 0xE1, 0x84,
+ 0x8E, 0xE1, 0x84, 0x8F, 0xE1, 0x84, 0x90, 0xE1,
+ 0x84, 0x91, 0xE1, 0x84, 0x92, 0xE1, 0x85, 0xA1,
+ 0xE1, 0x85, 0xA2, 0xE1, 0x85, 0xA3, 0xE1, 0x85,
+ 0xA4, 0xE1, 0x85, 0xA5, 0xE1, 0x85, 0xA6, 0xE1,
+ 0x85, 0xA7, 0xE1, 0x85, 0xA8, 0xE1, 0x85, 0xA9,
+ 0xE1, 0x85, 0xAA, 0xE1, 0x85, 0xAB, 0xE1, 0x85,
+ 0xAC, 0xE1, 0x85, 0xAD, 0xE1, 0x85, 0xAE, 0xE1,
+ 0x85, 0xAF, 0xE1, 0x85, 0xB0, 0xE1, 0x85, 0xB1,
+ 0xE1, 0x85, 0xB2, 0xE1, 0x85, 0xB3, 0xE1, 0x85,
+ 0xB4, 0xE1, 0x85, 0xB5, 0xE1, 0x85, 0xA0, 0xE1,
+ 0x84, 0x94, 0xE1, 0x84, 0x95, 0xE1, 0x87, 0x87,
+ 0xE1, 0x87, 0x88, 0xE1, 0x87, 0x8C, 0xE1, 0x87,
+ 0x8E, 0xE1, 0x87, 0x93, 0xE1, 0x87, 0x97, 0xE1,
+ 0x87, 0x99, 0xE1, 0x84, 0x9C, 0xE1, 0x87, 0x9D,
+ 0xE1, 0x87, 0x9F, 0xE1, 0x84, 0x9D, 0xE1, 0x84,
+ 0x9E, 0xE1, 0x84, 0xA0, 0xE1, 0x84, 0xA2, 0xE1,
+ 0x84, 0xA3, 0xE1, 0x84, 0xA7, 0xE1, 0x84, 0xA9,
+ 0xE1, 0x84, 0xAB, 0xE1, 0x84, 0xAC, 0xE1, 0x84,
+ 0xAD, 0xE1, 0x84, 0xAE, 0xE1, 0x84, 0xAF, 0xE1,
+ 0x84, 0xB2, 0xE1, 0x84, 0xB6, 0xE1, 0x85, 0x80,
+ 0xE1, 0x85, 0x87, 0xE1, 0x85, 0x8C, 0xE1, 0x87,
+ 0xB1, 0xE1, 0x87, 0xB2, 0xE1, 0x85, 0x97, 0xE1,
+ 0x85, 0x98, 0xE1, 0x85, 0x99, 0xE1, 0x86, 0x84,
+ 0xE1, 0x86, 0x85, 0xE1, 0x86, 0x88, 0xE1, 0x86,
+ 0x91, 0xE1, 0x86, 0x92, 0xE1, 0x86, 0x94, 0xE1,
+ 0x86, 0x9E, 0xE1, 0x86, 0xA1, 0xE4, 0xB8, 0x80,
+ 0xE4, 0xBA, 0x8C, 0xE4, 0xB8, 0x89, 0xE5, 0x9B,
+ 0x9B, 0xE4, 0xB8, 0x8A, 0xE4, 0xB8, 0xAD, 0xE4,
+ 0xB8, 0x8B, 0xE7, 0x94, 0xB2, 0xE4, 0xB9, 0x99,
+ 0xE4, 0xB8, 0x99, 0xE4, 0xB8, 0x81, 0xE5, 0xA4,
+ 0xA9, 0xE5, 0x9C, 0xB0, 0xE4, 0xBA, 0xBA, 0x28,
+ 0xE1, 0x84, 0x80, 0x29, 0x28, 0xE1, 0x84, 0x82,
+ 0x29, 0x28, 0xE1, 0x84, 0x83, 0x29, 0x28, 0xE1,
+ 0x84, 0x85, 0x29, 0x28, 0xE1, 0x84, 0x86, 0x29,
+ 0x28, 0xE1, 0x84, 0x87, 0x29, 0x28, 0xE1, 0x84,
+ 0x89, 0x29, 0x28, 0xE1, 0x84, 0x8B, 0x29, 0x28,
+ 0xE1, 0x84, 0x8C, 0x29, 0x28, 0xE1, 0x84, 0x8E,
+ 0x29, 0x28, 0xE1, 0x84, 0x8F, 0x29, 0x28, 0xE1,
+ 0x84, 0x90, 0x29, 0x28, 0xE1, 0x84, 0x91, 0x29,
+ 0x28, 0xE1, 0x84, 0x92, 0x29, 0x28, 0xE1, 0x84,
+ 0x80, 0xE1, 0x85, 0xA1, 0x29, 0x28, 0xE1, 0x84,
+ 0x82, 0xE1, 0x85, 0xA1, 0x29, 0x28, 0xE1, 0x84,
+ 0x83, 0xE1, 0x85, 0xA1, 0x29, 0x28, 0xE1, 0x84,
+ 0x85, 0xE1, 0x85, 0xA1, 0x29, 0x28, 0xE1, 0x84,
+ 0x86, 0xE1, 0x85, 0xA1, 0x29, 0x28, 0xE1, 0x84,
+ 0x87, 0xE1, 0x85, 0xA1, 0x29, 0x28, 0xE1, 0x84,
+ 0x89, 0xE1, 0x85, 0xA1, 0x29, 0x28, 0xE1, 0x84,
+ 0x8B, 0xE1, 0x85, 0xA1, 0x29, 0x28, 0xE1, 0x84,
+ 0x8C, 0xE1, 0x85, 0xA1, 0x29, 0x28, 0xE1, 0x84,
+ 0x8E, 0xE1, 0x85, 0xA1, 0x29, 0x28, 0xE1, 0x84,
+ 0x8F, 0xE1, 0x85, 0xA1, 0x29, 0x28, 0xE1, 0x84,
+ 0x90, 0xE1, 0x85, 0xA1, 0x29, 0x28, 0xE1, 0x84,
+ 0x91, 0xE1, 0x85, 0xA1, 0x29, 0x28, 0xE1, 0x84,
+ 0x92, 0xE1, 0x85, 0xA1, 0x29, 0x28, 0xE1, 0x84,
+ 0x8C, 0xE1, 0x85, 0xAE, 0x29, 0x28, 0xE4, 0xB8,
+ 0x80, 0x29, 0x28, 0xE4, 0xBA, 0x8C, 0x29, 0x28,
+ 0xE4, 0xB8, 0x89, 0x29, 0x28, 0xE5, 0x9B, 0x9B,
+ 0x29, 0x28, 0xE4, 0xBA, 0x94, 0x29, 0x28, 0xE5,
+ 0x85, 0xAD, 0x29, 0x28, 0xE4, 0xB8, 0x83, 0x29,
+ 0x28, 0xE5, 0x85, 0xAB, 0x29, 0x28, 0xE4, 0xB9,
+ 0x9D, 0x29, 0x28, 0xE5, 0x8D, 0x81, 0x29, 0x28,
+ 0xE6, 0x9C, 0x88, 0x29, 0x28, 0xE7, 0x81, 0xAB,
+ 0x29, 0x28, 0xE6, 0xB0, 0xB4, 0x29, 0x28, 0xE6,
+ 0x9C, 0xA8, 0x29, 0x28, 0xE9, 0x87, 0x91, 0x29,
+ 0x28, 0xE5, 0x9C, 0x9F, 0x29, 0x28, 0xE6, 0x97,
+ 0xA5, 0x29, 0x28, 0xE6, 0xA0, 0xAA, 0x29, 0x28,
+ 0xE6, 0x9C, 0x89, 0x29, 0x28, 0xE7, 0xA4, 0xBE,
+ 0x29, 0x28, 0xE5, 0x90, 0x8D, 0x29, 0x28, 0xE7,
+ 0x89, 0xB9, 0x29, 0x28, 0xE8, 0xB2, 0xA1, 0x29,
+ 0x28, 0xE7, 0xA5, 0x9D, 0x29, 0x28, 0xE5, 0x8A,
+ 0xB4, 0x29, 0x28, 0xE4, 0xBB, 0xA3, 0x29, 0x28,
+ 0xE5, 0x91, 0xBC, 0x29, 0x28, 0xE5, 0xAD, 0xA6,
+ 0x29, 0x28, 0xE7, 0x9B, 0xA3, 0x29, 0x28, 0xE4,
+ 0xBC, 0x81, 0x29, 0x28, 0xE8, 0xB3, 0x87, 0x29,
+ 0x28, 0xE5, 0x8D, 0x94, 0x29, 0x28, 0xE7, 0xA5,
+ 0xAD, 0x29, 0x28, 0xE4, 0xBC, 0x91, 0x29, 0x28,
+ 0xE8, 0x87, 0xAA, 0x29, 0x28, 0xE8, 0x87, 0xB3,
+ 0x29, 0x32, 0x31, 0x32, 0x32, 0x32, 0x33, 0x32,
+ 0x34, 0x32, 0x35, 0x32, 0x36, 0x32, 0x37, 0x32,
+ 0x38, 0x32, 0x39, 0x33, 0x30, 0x33, 0x31, 0x33,
+ 0x32, 0x33, 0x33, 0x33, 0x34, 0x33, 0x35, 0xE1,
+ 0x84, 0x80, 0xE1, 0x84, 0x82, 0xE1, 0x84, 0x83,
+ 0xE1, 0x84, 0x85, 0xE1, 0x84, 0x86, 0xE1, 0x84,
+ 0x87, 0xE1, 0x84, 0x89, 0xE1, 0x84, 0x8B, 0xE1,
+ 0x84, 0x8C, 0xE1, 0x84, 0x8E, 0xE1, 0x84, 0x8F,
+ 0xE1, 0x84, 0x90, 0xE1, 0x84, 0x91, 0xE1, 0x84,
+ 0x92, 0xE1, 0x84, 0x80, 0xE1, 0x85, 0xA1, 0xE1,
+ 0x84, 0x82, 0xE1, 0x85, 0xA1, 0xE1, 0x84, 0x83,
+ 0xE1, 0x85, 0xA1, 0xE1, 0x84, 0x85, 0xE1, 0x85,
+ 0xA1, 0xE1, 0x84, 0x86, 0xE1, 0x85, 0xA1, 0xE1,
+ 0x84, 0x87, 0xE1, 0x85, 0xA1, 0xE1, 0x84, 0x89,
+ 0xE1, 0x85, 0xA1, 0xE1, 0x84, 0x8B, 0xE1, 0x85,
+ 0xA1, 0xE1, 0x84, 0x8C, 0xE1, 0x85, 0xA1, 0xE1,
+ 0x84, 0x8E, 0xE1, 0x85, 0xA1, 0xE1, 0x84, 0x8F,
+ 0xE1, 0x85, 0xA1, 0xE1, 0x84, 0x90, 0xE1, 0x85,
+ 0xA1, 0xE1, 0x84, 0x91, 0xE1, 0x85, 0xA1, 0xE1,
+ 0x84, 0x92, 0xE1, 0x85, 0xA1, 0xE4, 0xB8, 0x80,
+ 0xE4, 0xBA, 0x8C, 0xE4, 0xB8, 0x89, 0xE5, 0x9B,
+ 0x9B, 0xE4, 0xBA, 0x94, 0xE5, 0x85, 0xAD, 0xE4,
+ 0xB8, 0x83, 0xE5, 0x85, 0xAB, 0xE4, 0xB9, 0x9D,
+ 0xE5, 0x8D, 0x81, 0xE6, 0x9C, 0x88, 0xE7, 0x81,
+ 0xAB, 0xE6, 0xB0, 0xB4, 0xE6, 0x9C, 0xA8, 0xE9,
+ 0x87, 0x91, 0xE5, 0x9C, 0x9F, 0xE6, 0x97, 0xA5,
+ 0xE6, 0xA0, 0xAA, 0xE6, 0x9C, 0x89, 0xE7, 0xA4,
+ 0xBE, 0xE5, 0x90, 0x8D, 0xE7, 0x89, 0xB9, 0xE8,
+ 0xB2, 0xA1, 0xE7, 0xA5, 0x9D, 0xE5, 0x8A, 0xB4,
+ 0xE7, 0xA7, 0x98, 0xE7, 0x94, 0xB7, 0xE5, 0xA5,
+ 0xB3, 0xE9, 0x81, 0xA9, 0xE5, 0x84, 0xAA, 0xE5,
+ 0x8D, 0xB0, 0xE6, 0xB3, 0xA8, 0xE9, 0xA0, 0x85,
+ 0xE4, 0xBC, 0x91, 0xE5, 0x86, 0x99, 0xE6, 0xAD,
+ 0xA3, 0xE4, 0xB8, 0x8A, 0xE4, 0xB8, 0xAD, 0xE4,
+ 0xB8, 0x8B, 0xE5, 0xB7, 0xA6, 0xE5, 0x8F, 0xB3,
+ 0xE5, 0x8C, 0xBB, 0xE5, 0xAE, 0x97, 0xE5, 0xAD,
+ 0xA6, 0xE7, 0x9B, 0xA3, 0xE4, 0xBC, 0x81, 0xE8,
+ 0xB3, 0x87, 0xE5, 0x8D, 0x94, 0xE5, 0xA4, 0x9C,
+ 0x33, 0x36, 0x33, 0x37, 0x33, 0x38, 0x33, 0x39,
+ 0x34, 0x30, 0x34, 0x31, 0x34, 0x32, 0x34, 0x33,
+ 0x34, 0x34, 0x34, 0x35, 0x34, 0x36, 0x34, 0x37,
+ 0x34, 0x38, 0x34, 0x39, 0x35, 0x30, 0x31, 0xE6,
+ 0x9C, 0x88, 0x32, 0xE6, 0x9C, 0x88, 0x33, 0xE6,
+ 0x9C, 0x88, 0x34, 0xE6, 0x9C, 0x88, 0x35, 0xE6,
+ 0x9C, 0x88, 0x36, 0xE6, 0x9C, 0x88, 0x37, 0xE6,
+ 0x9C, 0x88, 0x38, 0xE6, 0x9C, 0x88, 0x39, 0xE6,
+ 0x9C, 0x88, 0x31, 0x30, 0xE6, 0x9C, 0x88, 0x31,
+ 0x31, 0xE6, 0x9C, 0x88, 0x31, 0x32, 0xE6, 0x9C,
+ 0x88, 0xE3, 0x82, 0xA2, 0xE3, 0x82, 0xA4, 0xE3,
+ 0x82, 0xA6, 0xE3, 0x82, 0xA8, 0xE3, 0x82, 0xAA,
+ 0xE3, 0x82, 0xAB, 0xE3, 0x82, 0xAD, 0xE3, 0x82,
+ 0xAF, 0xE3, 0x82, 0xB1, 0xE3, 0x82, 0xB3, 0xE3,
+ 0x82, 0xB5, 0xE3, 0x82, 0xB7, 0xE3, 0x82, 0xB9,
+ 0xE3, 0x82, 0xBB, 0xE3, 0x82, 0xBD, 0xE3, 0x82,
+ 0xBF, 0xE3, 0x83, 0x81, 0xE3, 0x83, 0x84, 0xE3,
+ 0x83, 0x86, 0xE3, 0x83, 0x88, 0xE3, 0x83, 0x8A,
+ 0xE3, 0x83, 0x8B, 0xE3, 0x83, 0x8C, 0xE3, 0x83,
+ 0x8D, 0xE3, 0x83, 0x8E, 0xE3, 0x83, 0x8F, 0xE3,
+ 0x83, 0x92, 0xE3, 0x83, 0x95, 0xE3, 0x83, 0x98,
+ 0xE3, 0x83, 0x9B, 0xE3, 0x83, 0x9E, 0xE3, 0x83,
+ 0x9F, 0xE3, 0x83, 0xA0, 0xE3, 0x83, 0xA1, 0xE3,
+ 0x83, 0xA2, 0xE3, 0x83, 0xA4, 0xE3, 0x83, 0xA6,
+ 0xE3, 0x83, 0xA8, 0xE3, 0x83, 0xA9, 0xE3, 0x83,
+ 0xAA, 0xE3, 0x83, 0xAB, 0xE3, 0x83, 0xAC, 0xE3,
+ 0x83, 0xAD, 0xE3, 0x83, 0xAF, 0xE3, 0x83, 0xB0,
+ 0xE3, 0x83, 0xB1, 0xE3, 0x83, 0xB2, 0xE3, 0x82,
+ 0xA2, 0xE3, 0x83, 0x8F, 0xE3, 0x82, 0x9A, 0xE3,
+ 0x83, 0xBC, 0xE3, 0x83, 0x88, 0xE3, 0x82, 0xA2,
+ 0xE3, 0x83, 0xAB, 0xE3, 0x83, 0x95, 0xE3, 0x82,
+ 0xA1, 0xE3, 0x82, 0xA2, 0xE3, 0x83, 0xB3, 0xE3,
+ 0x83, 0x98, 0xE3, 0x82, 0x9A, 0xE3, 0x82, 0xA2,
+ 0xE3, 0x82, 0xA2, 0xE3, 0x83, 0xBC, 0xE3, 0x83,
+ 0xAB, 0xE3, 0x82, 0xA4, 0xE3, 0x83, 0x8B, 0xE3,
+ 0x83, 0xB3, 0xE3, 0x82, 0xAF, 0xE3, 0x82, 0x99,
+ 0xE3, 0x82, 0xA4, 0xE3, 0x83, 0xB3, 0xE3, 0x83,
+ 0x81, 0xE3, 0x82, 0xA6, 0xE3, 0x82, 0xA9, 0xE3,
+ 0x83, 0xB3, 0xE3, 0x82, 0xA8, 0xE3, 0x82, 0xB9,
+ 0xE3, 0x82, 0xAF, 0xE3, 0x83, 0xBC, 0xE3, 0x83,
+ 0x88, 0xE3, 0x82, 0x99, 0xE3, 0x82, 0xA8, 0xE3,
+ 0x83, 0xBC, 0xE3, 0x82, 0xAB, 0xE3, 0x83, 0xBC,
+ 0xE3, 0x82, 0xAA, 0xE3, 0x83, 0xB3, 0xE3, 0x82,
+ 0xB9, 0xE3, 0x82, 0xAA, 0xE3, 0x83, 0xBC, 0xE3,
+ 0x83, 0xA0, 0xE3, 0x82, 0xAB, 0xE3, 0x82, 0xA4,
+ 0xE3, 0x83, 0xAA, 0xE3, 0x82, 0xAB, 0xE3, 0x83,
+ 0xA9, 0xE3, 0x83, 0x83, 0xE3, 0x83, 0x88, 0xE3,
+ 0x82, 0xAB, 0xE3, 0x83, 0xAD, 0xE3, 0x83, 0xAA,
+ 0xE3, 0x83, 0xBC, 0xE3, 0x82, 0xAB, 0xE3, 0x82,
+ 0x99, 0xE3, 0x83, 0xAD, 0xE3, 0x83, 0xB3, 0xE3,
+ 0x82, 0xAB, 0xE3, 0x82, 0x99, 0xE3, 0x83, 0xB3,
+ 0xE3, 0x83, 0x9E, 0xE3, 0x82, 0xAD, 0xE3, 0x82,
+ 0x99, 0xE3, 0x82, 0xAB, 0xE3, 0x82, 0x99, 0xE3,
+ 0x82, 0xAD, 0xE3, 0x82, 0x99, 0xE3, 0x83, 0x8B,
+ 0xE3, 0x83, 0xBC, 0xE3, 0x82, 0xAD, 0xE3, 0x83,
+ 0xA5, 0xE3, 0x83, 0xAA, 0xE3, 0x83, 0xBC, 0xE3,
+ 0x82, 0xAD, 0xE3, 0x82, 0x99, 0xE3, 0x83, 0xAB,
+ 0xE3, 0x82, 0xBF, 0xE3, 0x82, 0x99, 0xE3, 0x83,
+ 0xBC, 0xE3, 0x82, 0xAD, 0xE3, 0x83, 0xAD, 0xE3,
+ 0x82, 0xAD, 0xE3, 0x83, 0xAD, 0xE3, 0x82, 0xAF,
+ 0xE3, 0x82, 0x99, 0xE3, 0x83, 0xA9, 0xE3, 0x83,
+ 0xA0, 0xE3, 0x82, 0xAD, 0xE3, 0x83, 0xAD, 0xE3,
+ 0x83, 0xA1, 0xE3, 0x83, 0xBC, 0xE3, 0x83, 0x88,
+ 0xE3, 0x83, 0xAB, 0xE3, 0x82, 0xAD, 0xE3, 0x83,
+ 0xAD, 0xE3, 0x83, 0xAF, 0xE3, 0x83, 0x83, 0xE3,
+ 0x83, 0x88, 0xE3, 0x82, 0xAF, 0xE3, 0x82, 0x99,
+ 0xE3, 0x83, 0xA9, 0xE3, 0x83, 0xA0, 0xE3, 0x82,
+ 0xAF, 0xE3, 0x82, 0x99, 0xE3, 0x83, 0xA9, 0xE3,
+ 0x83, 0xA0, 0xE3, 0x83, 0x88, 0xE3, 0x83, 0xB3,
+ 0xE3, 0x82, 0xAF, 0xE3, 0x83, 0xAB, 0xE3, 0x82,
+ 0xBB, 0xE3, 0x82, 0x99, 0xE3, 0x82, 0xA4, 0xE3,
+ 0x83, 0xAD, 0xE3, 0x82, 0xAF, 0xE3, 0x83, 0xAD,
+ 0xE3, 0x83, 0xBC, 0xE3, 0x83, 0x8D, 0xE3, 0x82,
+ 0xB1, 0xE3, 0x83, 0xBC, 0xE3, 0x82, 0xB9, 0xE3,
+ 0x82, 0xB3, 0xE3, 0x83, 0xAB, 0xE3, 0x83, 0x8A,
+ 0xE3, 0x82, 0xB3, 0xE3, 0x83, 0xBC, 0xE3, 0x83,
+ 0x9B, 0xE3, 0x82, 0x9A, 0xE3, 0x82, 0xB5, 0xE3,
+ 0x82, 0xA4, 0xE3, 0x82, 0xAF, 0xE3, 0x83, 0xAB,
+ 0xE3, 0x82, 0xB5, 0xE3, 0x83, 0xB3, 0xE3, 0x83,
+ 0x81, 0xE3, 0x83, 0xBC, 0xE3, 0x83, 0xA0, 0xE3,
+ 0x82, 0xB7, 0xE3, 0x83, 0xAA, 0xE3, 0x83, 0xB3,
+ 0xE3, 0x82, 0xAF, 0xE3, 0x82, 0x99, 0xE3, 0x82,
+ 0xBB, 0xE3, 0x83, 0xB3, 0xE3, 0x83, 0x81, 0xE3,
+ 0x82, 0xBB, 0xE3, 0x83, 0xB3, 0xE3, 0x83, 0x88,
+ 0xE3, 0x82, 0xBF, 0xE3, 0x82, 0x99, 0xE3, 0x83,
+ 0xBC, 0xE3, 0x82, 0xB9, 0xE3, 0x83, 0x86, 0xE3,
+ 0x82, 0x99, 0xE3, 0x82, 0xB7, 0xE3, 0x83, 0x88,
+ 0xE3, 0x82, 0x99, 0xE3, 0x83, 0xAB, 0xE3, 0x83,
+ 0x88, 0xE3, 0x83, 0xB3, 0xE3, 0x83, 0x8A, 0xE3,
+ 0x83, 0x8E, 0xE3, 0x83, 0x8E, 0xE3, 0x83, 0x83,
+ 0xE3, 0x83, 0x88, 0xE3, 0x83, 0x8F, 0xE3, 0x82,
+ 0xA4, 0xE3, 0x83, 0x84, 0xE3, 0x83, 0x8F, 0xE3,
+ 0x82, 0x9A, 0xE3, 0x83, 0xBC, 0xE3, 0x82, 0xBB,
+ 0xE3, 0x83, 0xB3, 0xE3, 0x83, 0x88, 0xE3, 0x83,
+ 0x8F, 0xE3, 0x82, 0x9A, 0xE3, 0x83, 0xBC, 0xE3,
+ 0x83, 0x84, 0xE3, 0x83, 0x8F, 0xE3, 0x82, 0x99,
+ 0xE3, 0x83, 0xBC, 0xE3, 0x83, 0xAC, 0xE3, 0x83,
+ 0xAB, 0xE3, 0x83, 0x92, 0xE3, 0x82, 0x9A, 0xE3,
+ 0x82, 0xA2, 0xE3, 0x82, 0xB9, 0xE3, 0x83, 0x88,
+ 0xE3, 0x83, 0xAB, 0xE3, 0x83, 0x92, 0xE3, 0x82,
+ 0x9A, 0xE3, 0x82, 0xAF, 0xE3, 0x83, 0xAB, 0xE3,
+ 0x83, 0x92, 0xE3, 0x82, 0x9A, 0xE3, 0x82, 0xB3,
+ 0xE3, 0x83, 0x92, 0xE3, 0x82, 0x99, 0xE3, 0x83,
+ 0xAB, 0xE3, 0x83, 0x95, 0xE3, 0x82, 0xA1, 0xE3,
+ 0x83, 0xA9, 0xE3, 0x83, 0x83, 0xE3, 0x83, 0x88,
+ 0xE3, 0x82, 0x99, 0xE3, 0x83, 0x95, 0xE3, 0x82,
+ 0xA3, 0xE3, 0x83, 0xBC, 0xE3, 0x83, 0x88, 0xE3,
+ 0x83, 0x95, 0xE3, 0x82, 0x99, 0xE3, 0x83, 0x83,
+ 0xE3, 0x82, 0xB7, 0xE3, 0x82, 0xA7, 0xE3, 0x83,
+ 0xAB, 0xE3, 0x83, 0x95, 0xE3, 0x83, 0xA9, 0xE3,
+ 0x83, 0xB3, 0xE3, 0x83, 0x98, 0xE3, 0x82, 0xAF,
+ 0xE3, 0x82, 0xBF, 0xE3, 0x83, 0xBC, 0xE3, 0x83,
+ 0xAB, 0xE3, 0x83, 0x98, 0xE3, 0x82, 0x9A, 0xE3,
+ 0x82, 0xBD, 0xE3, 0x83, 0x98, 0xE3, 0x82, 0x9A,
+ 0xE3, 0x83, 0x8B, 0xE3, 0x83, 0x92, 0xE3, 0x83,
+ 0x98, 0xE3, 0x83, 0xAB, 0xE3, 0x83, 0x84, 0xE3,
+ 0x83, 0x98, 0xE3, 0x82, 0x9A, 0xE3, 0x83, 0xB3,
+ 0xE3, 0x82, 0xB9, 0xE3, 0x83, 0x98, 0xE3, 0x82,
+ 0x9A, 0xE3, 0x83, 0xBC, 0xE3, 0x82, 0xB7, 0xE3,
+ 0x82, 0x99, 0xE3, 0x83, 0x98, 0xE3, 0x82, 0x99,
+ 0xE3, 0x83, 0xBC, 0xE3, 0x82, 0xBF, 0xE3, 0x83,
+ 0x9B, 0xE3, 0x82, 0x9A, 0xE3, 0x82, 0xA4, 0xE3,
+ 0x83, 0xB3, 0xE3, 0x83, 0x88, 0xE3, 0x83, 0x9B,
+ 0xE3, 0x82, 0x99, 0xE3, 0x83, 0xAB, 0xE3, 0x83,
+ 0x88, 0xE3, 0x83, 0x9B, 0xE3, 0x83, 0xB3, 0xE3,
+ 0x83, 0x9B, 0xE3, 0x82, 0x9A, 0xE3, 0x83, 0xB3,
+ 0xE3, 0x83, 0x88, 0xE3, 0x82, 0x99, 0xE3, 0x83,
+ 0x9B, 0xE3, 0x83, 0xBC, 0xE3, 0x83, 0xAB, 0xE3,
+ 0x83, 0x9B, 0xE3, 0x83, 0xBC, 0xE3, 0x83, 0xB3,
+ 0xE3, 0x83, 0x9E, 0xE3, 0x82, 0xA4, 0xE3, 0x82,
+ 0xAF, 0xE3, 0x83, 0xAD, 0xE3, 0x83, 0x9E, 0xE3,
+ 0x82, 0xA4, 0xE3, 0x83, 0xAB, 0xE3, 0x83, 0x9E,
+ 0xE3, 0x83, 0x83, 0xE3, 0x83, 0x8F, 0xE3, 0x83,
+ 0x9E, 0xE3, 0x83, 0xAB, 0xE3, 0x82, 0xAF, 0xE3,
+ 0x83, 0x9E, 0xE3, 0x83, 0xB3, 0xE3, 0x82, 0xB7,
+ 0xE3, 0x83, 0xA7, 0xE3, 0x83, 0xB3, 0xE3, 0x83,
+ 0x9F, 0xE3, 0x82, 0xAF, 0xE3, 0x83, 0xAD, 0xE3,
+ 0x83, 0xB3, 0xE3, 0x83, 0x9F, 0xE3, 0x83, 0xAA,
+ 0xE3, 0x83, 0x9F, 0xE3, 0x83, 0xAA, 0xE3, 0x83,
+ 0x8F, 0xE3, 0x82, 0x99, 0xE3, 0x83, 0xBC, 0xE3,
+ 0x83, 0xAB, 0xE3, 0x83, 0xA1, 0xE3, 0x82, 0xAB,
+ 0xE3, 0x82, 0x99, 0xE3, 0x83, 0xA1, 0xE3, 0x82,
+ 0xAB, 0xE3, 0x82, 0x99, 0xE3, 0x83, 0x88, 0xE3,
+ 0x83, 0xB3, 0xE3, 0x83, 0xA1, 0xE3, 0x83, 0xBC,
+ 0xE3, 0x83, 0x88, 0xE3, 0x83, 0xAB, 0xE3, 0x83,
+ 0xA4, 0xE3, 0x83, 0xBC, 0xE3, 0x83, 0x88, 0xE3,
+ 0x82, 0x99, 0xE3, 0x83, 0xA4, 0xE3, 0x83, 0xBC,
+ 0xE3, 0x83, 0xAB, 0xE3, 0x83, 0xA6, 0xE3, 0x82,
+ 0xA2, 0xE3, 0x83, 0xB3, 0xE3, 0x83, 0xAA, 0xE3,
+ 0x83, 0x83, 0xE3, 0x83, 0x88, 0xE3, 0x83, 0xAB,
+ 0xE3, 0x83, 0xAA, 0xE3, 0x83, 0xA9, 0xE3, 0x83,
+ 0xAB, 0xE3, 0x83, 0x92, 0xE3, 0x82, 0x9A, 0xE3,
+ 0x83, 0xBC, 0xE3, 0x83, 0xAB, 0xE3, 0x83, 0xBC,
+ 0xE3, 0x83, 0x95, 0xE3, 0x82, 0x99, 0xE3, 0x83,
+ 0xAB, 0xE3, 0x83, 0xAC, 0xE3, 0x83, 0xA0, 0xE3,
+ 0x83, 0xAC, 0xE3, 0x83, 0xB3, 0xE3, 0x83, 0x88,
+ 0xE3, 0x82, 0xB1, 0xE3, 0x82, 0x99, 0xE3, 0x83,
+ 0xB3, 0xE3, 0x83, 0xAF, 0xE3, 0x83, 0x83, 0xE3,
+ 0x83, 0x88, 0x30, 0xE7, 0x82, 0xB9, 0x31, 0xE7,
+ 0x82, 0xB9, 0x32, 0xE7, 0x82, 0xB9, 0x33, 0xE7,
+ 0x82, 0xB9, 0x34, 0xE7, 0x82, 0xB9, 0x35, 0xE7,
+ 0x82, 0xB9, 0x36, 0xE7, 0x82, 0xB9, 0x37, 0xE7,
+ 0x82, 0xB9, 0x38, 0xE7, 0x82, 0xB9, 0x39, 0xE7,
+ 0x82, 0xB9, 0x31, 0x30, 0xE7, 0x82, 0xB9, 0x31,
+ 0x31, 0xE7, 0x82, 0xB9, 0x31, 0x32, 0xE7, 0x82,
+ 0xB9, 0x31, 0x33, 0xE7, 0x82, 0xB9, 0x31, 0x34,
+ 0xE7, 0x82, 0xB9, 0x31, 0x35, 0xE7, 0x82, 0xB9,
+ 0x31, 0x36, 0xE7, 0x82, 0xB9, 0x31, 0x37, 0xE7,
+ 0x82, 0xB9, 0x31, 0x38, 0xE7, 0x82, 0xB9, 0x31,
+ 0x39, 0xE7, 0x82, 0xB9, 0x32, 0x30, 0xE7, 0x82,
+ 0xB9, 0x32, 0x31, 0xE7, 0x82, 0xB9, 0x32, 0x32,
+ 0xE7, 0x82, 0xB9, 0x32, 0x33, 0xE7, 0x82, 0xB9,
+ 0x32, 0x34, 0xE7, 0x82, 0xB9, 0x68, 0x50, 0x61,
+ 0x64, 0x61, 0x41, 0x55, 0x62, 0x61, 0x72, 0x6F,
+ 0x56, 0x70, 0x63, 0xE5, 0xB9, 0xB3, 0xE6, 0x88,
+ 0x90, 0xE6, 0x98, 0xAD, 0xE5, 0x92, 0x8C, 0xE5,
+ 0xA4, 0xA7, 0xE6, 0xAD, 0xA3, 0xE6, 0x98, 0x8E,
+ 0xE6, 0xB2, 0xBB, 0xE6, 0xA0, 0xAA, 0xE5, 0xBC,
+ 0x8F, 0xE4, 0xBC, 0x9A, 0xE7, 0xA4, 0xBE, 0x70,
+ 0x41, 0x6E, 0x41, 0xCE, 0xBC, 0x41, 0x6D, 0x41,
+ 0x6B, 0x41, 0x4B, 0x42, 0x4D, 0x42, 0x47, 0x42,
+ 0x63, 0x61, 0x6C, 0x6B, 0x63, 0x61, 0x6C, 0x70,
+ 0x46, 0x6E, 0x46, 0xCE, 0xBC, 0x46, 0xCE, 0xBC,
+ 0x67, 0x6D, 0x67, 0x6B, 0x67, 0x48, 0x7A, 0x6B,
+ 0x48, 0x7A, 0x4D, 0x48, 0x7A, 0x47, 0x48, 0x7A,
+ 0x54, 0x48, 0x7A, 0xCE, 0xBC, 0x6C, 0x6D, 0x6C,
+ 0x64, 0x6C, 0x6B, 0x6C, 0x66, 0x6D, 0x6E, 0x6D,
+ 0xCE, 0xBC, 0x6D, 0x6D, 0x6D, 0x63, 0x6D, 0x6B,
+ 0x6D, 0x6D, 0x6D, 0x32, 0x63, 0x6D, 0x32, 0x6D,
+ 0x32, 0x6B, 0x6D, 0x32, 0x6D, 0x6D, 0x33, 0x63,
+ 0x6D, 0x33, 0x6D, 0x33, 0x6B, 0x6D, 0x33, 0x6D,
+ 0xE2, 0x88, 0x95, 0x73, 0x6D, 0xE2, 0x88, 0x95,
+ 0x73, 0x32, 0x50, 0x61, 0x6B, 0x50, 0x61, 0x4D,
+ 0x50, 0x61, 0x47, 0x50, 0x61, 0x72, 0x61, 0x64,
+ 0x72, 0x61, 0x64, 0xE2, 0x88, 0x95, 0x73, 0x72,
+ 0x61, 0x64, 0xE2, 0x88, 0x95, 0x73, 0x32, 0x70,
+ 0x73, 0x6E, 0x73, 0xCE, 0xBC, 0x73, 0x6D, 0x73,
+ 0x70, 0x56, 0x6E, 0x56, 0xCE, 0xBC, 0x56, 0x6D,
+ 0x56, 0x6B, 0x56, 0x4D, 0x56, 0x70, 0x57, 0x6E,
+ 0x57, 0xCE, 0xBC, 0x57, 0x6D, 0x57, 0x6B, 0x57,
+ 0x4D, 0x57, 0x6B, 0xCE, 0xA9, 0x4D, 0xCE, 0xA9,
+ 0x61, 0x2E, 0x6D, 0x2E, 0x42, 0x71, 0x63, 0x63,
+ 0x63, 0x64, 0x43, 0xE2, 0x88, 0x95, 0x6B, 0x67,
+ 0x43, 0x6F, 0x2E, 0x64, 0x42, 0x47, 0x79, 0x68,
+ 0x61, 0x48, 0x50, 0x69, 0x6E, 0x4B, 0x4B, 0x4B,
+ 0x4D, 0x6B, 0x74, 0x6C, 0x6D, 0x6C, 0x6E, 0x6C,
+ 0x6F, 0x67, 0x6C, 0x78, 0x6D, 0x62, 0x6D, 0x69,
+ 0x6C, 0x6D, 0x6F, 0x6C, 0x50, 0x48, 0x70, 0x2E,
+ 0x6D, 0x2E, 0x50, 0x50, 0x4D, 0x50, 0x52, 0x73,
+ 0x72, 0x53, 0x76, 0x57, 0x62, 0x31, 0xE6, 0x97,
+ 0xA5, 0x32, 0xE6, 0x97, 0xA5, 0x33, 0xE6, 0x97,
+ 0xA5, 0x34, 0xE6, 0x97, 0xA5, 0x35, 0xE6, 0x97,
+ 0xA5, 0x36, 0xE6, 0x97, 0xA5, 0x37, 0xE6, 0x97,
+ 0xA5, 0x38, 0xE6, 0x97, 0xA5, 0x39, 0xE6, 0x97,
+ 0xA5, 0x31, 0x30, 0xE6, 0x97, 0xA5, 0x31, 0x31,
+ 0xE6, 0x97, 0xA5, 0x31, 0x32, 0xE6, 0x97, 0xA5,
+ 0x31, 0x33, 0xE6, 0x97, 0xA5, 0x31, 0x34, 0xE6,
+ 0x97, 0xA5, 0x31, 0x35, 0xE6, 0x97, 0xA5, 0x31,
+ 0x36, 0xE6, 0x97, 0xA5, 0x31, 0x37, 0xE6, 0x97,
+ 0xA5, 0x31, 0x38, 0xE6, 0x97, 0xA5, 0x31, 0x39,
+ 0xE6, 0x97, 0xA5, 0x32, 0x30, 0xE6, 0x97, 0xA5,
+ 0x32, 0x31, 0xE6, 0x97, 0xA5, 0x32, 0x32, 0xE6,
+ 0x97, 0xA5, 0x32, 0x33, 0xE6, 0x97, 0xA5, 0x32,
+ 0x34, 0xE6, 0x97, 0xA5, 0x32, 0x35, 0xE6, 0x97,
+ 0xA5, 0x32, 0x36, 0xE6, 0x97, 0xA5, 0x32, 0x37,
+ 0xE6, 0x97, 0xA5, 0x32, 0x38, 0xE6, 0x97, 0xA5,
+ 0x32, 0x39, 0xE6, 0x97, 0xA5, 0x33, 0x30, 0xE6,
+ 0x97, 0xA5, 0x33, 0x31, 0xE6, 0x97, 0xA5, 0xF6,
+ 0xE8, 0xB1, 0x88, 0xF6, 0xE6, 0x9B, 0xB4, 0xF6,
+ 0xE8, 0xBB, 0x8A, 0xF6, 0xE8, 0xB3, 0x88, 0xF6,
+ 0xE6, 0xBB, 0x91, 0xF6, 0xE4, 0xB8, 0xB2, 0xF6,
+ 0xE5, 0x8F, 0xA5, 0xF6, 0xE9, 0xBE, 0x9C, 0xF6,
+ 0xE9, 0xBE, 0x9C, 0xF6, 0xE5, 0xA5, 0x91, 0xF6,
+ 0xE9, 0x87, 0x91, 0xF6, 0xE5, 0x96, 0x87, 0xF6,
+ 0xE5, 0xA5, 0x88, 0xF6, 0xE6, 0x87, 0xB6, 0xF6,
+ 0xE7, 0x99, 0xA9, 0xF6, 0xE7, 0xBE, 0x85, 0xF6,
+ 0xE8, 0x98, 0xBF, 0xF6, 0xE8, 0x9E, 0xBA, 0xF6,
+ 0xE8, 0xA3, 0xB8, 0xF6, 0xE9, 0x82, 0x8F, 0xF6,
+ 0xE6, 0xA8, 0x82, 0xF6, 0xE6, 0xB4, 0x9B, 0xF6,
+ 0xE7, 0x83, 0x99, 0xF6, 0xE7, 0x8F, 0x9E, 0xF6,
+ 0xE8, 0x90, 0xBD, 0xF6, 0xE9, 0x85, 0xAA, 0xF6,
+ 0xE9, 0xA7, 0xB1, 0xF6, 0xE4, 0xBA, 0x82, 0xF6,
+ 0xE5, 0x8D, 0xB5, 0xF6, 0xE6, 0xAC, 0x84, 0xF6,
+ 0xE7, 0x88, 0x9B, 0xF6, 0xE8, 0x98, 0xAD, 0xF6,
+ 0xE9, 0xB8, 0x9E, 0xF6, 0xE5, 0xB5, 0x90, 0xF6,
+ 0xE6, 0xBF, 0xAB, 0xF6, 0xE8, 0x97, 0x8D, 0xF6,
+ 0xE8, 0xA5, 0xA4, 0xF6, 0xE6, 0x8B, 0x89, 0xF6,
+ 0xE8, 0x87, 0x98, 0xF6, 0xE8, 0xA0, 0x9F, 0xF6,
+ 0xE5, 0xBB, 0x8A, 0xF6, 0xE6, 0x9C, 0x97, 0xF6,
+ 0xE6, 0xB5, 0xAA, 0xF6, 0xE7, 0x8B, 0xBC, 0xF6,
+ 0xE9, 0x83, 0x8E, 0xF6, 0xE4, 0xBE, 0x86, 0xF6,
+ 0xE5, 0x86, 0xB7, 0xF6, 0xE5, 0x8B, 0x9E, 0xF6,
+ 0xE6, 0x93, 0x84, 0xF6, 0xE6, 0xAB, 0x93, 0xF6,
+ 0xE7, 0x88, 0x90, 0xF6, 0xE7, 0x9B, 0xA7, 0xF6,
+ 0xE8, 0x80, 0x81, 0xF6, 0xE8, 0x98, 0x86, 0xF6,
+ 0xE8, 0x99, 0x9C, 0xF6, 0xE8, 0xB7, 0xAF, 0xF6,
+ 0xE9, 0x9C, 0xB2, 0xF6, 0xE9, 0xAD, 0xAF, 0xF6,
+ 0xE9, 0xB7, 0xBA, 0xF6, 0xE7, 0xA2, 0x8C, 0xF6,
+ 0xE7, 0xA5, 0xBF, 0xF6, 0xE7, 0xB6, 0xA0, 0xF6,
+ 0xE8, 0x8F, 0x89, 0xF6, 0xE9, 0x8C, 0x84, 0xF6,
+ 0xE9, 0xB9, 0xBF, 0xF6, 0xE8, 0xAB, 0x96, 0xF6,
+ 0xE5, 0xA3, 0x9F, 0xF6, 0xE5, 0xBC, 0x84, 0xF6,
+ 0xE7, 0xB1, 0xA0, 0xF6, 0xE8, 0x81, 0xBE, 0xF6,
+ 0xE7, 0x89, 0xA2, 0xF6, 0xE7, 0xA3, 0x8A, 0xF6,
+ 0xE8, 0xB3, 0x82, 0xF6, 0xE9, 0x9B, 0xB7, 0xF6,
+ 0xE5, 0xA3, 0x98, 0xF6, 0xE5, 0xB1, 0xA2, 0xF6,
+ 0xE6, 0xA8, 0x93, 0xF6, 0xE6, 0xB7, 0x9A, 0xF6,
+ 0xE6, 0xBC, 0x8F, 0xF6, 0xE7, 0xB4, 0xAF, 0xF6,
+ 0xE7, 0xB8, 0xB7, 0xF6, 0xE9, 0x99, 0x8B, 0xF6,
+ 0xE5, 0x8B, 0x92, 0xF6, 0xE8, 0x82, 0x8B, 0xF6,
+ 0xE5, 0x87, 0x9C, 0xF6, 0xE5, 0x87, 0x8C, 0xF6,
+ 0xE7, 0xA8, 0x9C, 0xF6, 0xE7, 0xB6, 0xBE, 0xF6,
+ 0xE8, 0x8F, 0xB1, 0xF6, 0xE9, 0x99, 0xB5, 0xF6,
+ 0xE8, 0xAE, 0x80, 0xF6, 0xE6, 0x8B, 0x8F, 0xF6,
+ 0xE6, 0xA8, 0x82, 0xF6, 0xE8, 0xAB, 0xBE, 0xF6,
+ 0xE4, 0xB8, 0xB9, 0xF6, 0xE5, 0xAF, 0xA7, 0xF6,
+ 0xE6, 0x80, 0x92, 0xF6, 0xE7, 0x8E, 0x87, 0xF6,
+ 0xE7, 0x95, 0xB0, 0xF6, 0xE5, 0x8C, 0x97, 0xF6,
+ 0xE7, 0xA3, 0xBB, 0xF6, 0xE4, 0xBE, 0xBF, 0xF6,
+ 0xE5, 0xBE, 0xA9, 0xF6, 0xE4, 0xB8, 0x8D, 0xF6,
+ 0xE6, 0xB3, 0x8C, 0xF6, 0xE6, 0x95, 0xB8, 0xF6,
+ 0xE7, 0xB4, 0xA2, 0xF6, 0xE5, 0x8F, 0x83, 0xF6,
+ 0xE5, 0xA1, 0x9E, 0xF6, 0xE7, 0x9C, 0x81, 0xF6,
+ 0xE8, 0x91, 0x89, 0xF6, 0xE8, 0xAA, 0xAA, 0xF6,
+ 0xE6, 0xAE, 0xBA, 0xF6, 0xE8, 0xBE, 0xB0, 0xF6,
+ 0xE6, 0xB2, 0x88, 0xF6, 0xE6, 0x8B, 0xBE, 0xF6,
+ 0xE8, 0x8B, 0xA5, 0xF6, 0xE6, 0x8E, 0xA0, 0xF6,
+ 0xE7, 0x95, 0xA5, 0xF6, 0xE4, 0xBA, 0xAE, 0xF6,
+ 0xE5, 0x85, 0xA9, 0xF6, 0xE5, 0x87, 0x89, 0xF6,
+ 0xE6, 0xA2, 0x81, 0xF6, 0xE7, 0xB3, 0xA7, 0xF6,
+ 0xE8, 0x89, 0xAF, 0xF6, 0xE8, 0xAB, 0x92, 0xF6,
+ 0xE9, 0x87, 0x8F, 0xF6, 0xE5, 0x8B, 0xB5, 0xF6,
+ 0xE5, 0x91, 0x82, 0xF6, 0xE5, 0xA5, 0xB3, 0xF6,
+ 0xE5, 0xBB, 0xAC, 0xF6, 0xE6, 0x97, 0x85, 0xF6,
+ 0xE6, 0xBF, 0xBE, 0xF6, 0xE7, 0xA4, 0xAA, 0xF6,
+ 0xE9, 0x96, 0xAD, 0xF6, 0xE9, 0xA9, 0xAA, 0xF6,
+ 0xE9, 0xBA, 0x97, 0xF6, 0xE9, 0xBB, 0x8E, 0xF6,
+ 0xE5, 0x8A, 0x9B, 0xF6, 0xE6, 0x9B, 0x86, 0xF6,
+ 0xE6, 0xAD, 0xB7, 0xF6, 0xE8, 0xBD, 0xA2, 0xF6,
+ 0xE5, 0xB9, 0xB4, 0xF6, 0xE6, 0x86, 0x90, 0xF6,
+ 0xE6, 0x88, 0x80, 0xF6, 0xE6, 0x92, 0x9A, 0xF6,
+ 0xE6, 0xBC, 0xA3, 0xF6, 0xE7, 0x85, 0x89, 0xF6,
+ 0xE7, 0x92, 0x89, 0xF6, 0xE7, 0xA7, 0x8A, 0xF6,
+ 0xE7, 0xB7, 0xB4, 0xF6, 0xE8, 0x81, 0xAF, 0xF6,
+ 0xE8, 0xBC, 0xA6, 0xF6, 0xE8, 0x93, 0xAE, 0xF6,
+ 0xE9, 0x80, 0xA3, 0xF6, 0xE9, 0x8D, 0x8A, 0xF6,
+ 0xE5, 0x88, 0x97, 0xF6, 0xE5, 0x8A, 0xA3, 0xF6,
+ 0xE5, 0x92, 0xBD, 0xF6, 0xE7, 0x83, 0x88, 0xF6,
+ 0xE8, 0xA3, 0x82, 0xF6, 0xE8, 0xAA, 0xAA, 0xF6,
+ 0xE5, 0xBB, 0x89, 0xF6, 0xE5, 0xBF, 0xB5, 0xF6,
+ 0xE6, 0x8D, 0xBB, 0xF6, 0xE6, 0xAE, 0xAE, 0xF6,
+ 0xE7, 0xB0, 0xBE, 0xF6, 0xE7, 0x8D, 0xB5, 0xF6,
+ 0xE4, 0xBB, 0xA4, 0xF6, 0xE5, 0x9B, 0xB9, 0xF6,
+ 0xE5, 0xAF, 0xA7, 0xF6, 0xE5, 0xB6, 0xBA, 0xF6,
+ 0xE6, 0x80, 0x9C, 0xF6, 0xE7, 0x8E, 0xB2, 0xF6,
+ 0xE7, 0x91, 0xA9, 0xF6, 0xE7, 0xBE, 0x9A, 0xF6,
+ 0xE8, 0x81, 0x86, 0xF6, 0xE9, 0x88, 0xB4, 0xF6,
+ 0xE9, 0x9B, 0xB6, 0xF6, 0xE9, 0x9D, 0x88, 0xF6,
+ 0xE9, 0xA0, 0x98, 0xF6, 0xE4, 0xBE, 0x8B, 0xF6,
+ 0xE7, 0xA6, 0xAE, 0xF6, 0xE9, 0x86, 0xB4, 0xF6,
+ 0xE9, 0x9A, 0xB8, 0xF6, 0xE6, 0x83, 0xA1, 0xF6,
+ 0xE4, 0xBA, 0x86, 0xF6, 0xE5, 0x83, 0x9A, 0xF6,
+ 0xE5, 0xAF, 0xAE, 0xF6, 0xE5, 0xB0, 0xBF, 0xF6,
+ 0xE6, 0x96, 0x99, 0xF6, 0xE6, 0xA8, 0x82, 0xF6,
+ 0xE7, 0x87, 0x8E, 0xF6, 0xE7, 0x99, 0x82, 0xF6,
+ 0xE8, 0x93, 0xBC, 0xF6, 0xE9, 0x81, 0xBC, 0xF6,
+ 0xE9, 0xBE, 0x8D, 0xF6, 0xE6, 0x9A, 0x88, 0xF6,
+ 0xE9, 0x98, 0xAE, 0xF6, 0xE5, 0x8A, 0x89, 0xF6,
+ 0xE6, 0x9D, 0xBB, 0xF6, 0xE6, 0x9F, 0xB3, 0xF6,
+ 0xE6, 0xB5, 0x81, 0xF6, 0xE6, 0xBA, 0x9C, 0xF6,
+ 0xE7, 0x90, 0x89, 0xF6, 0xE7, 0x95, 0x99, 0xF6,
+ 0xE7, 0xA1, 0xAB, 0xF6, 0xE7, 0xB4, 0x90, 0xF6,
+ 0xE9, 0xA1, 0x9E, 0xF6, 0xE5, 0x85, 0xAD, 0xF6,
+ 0xE6, 0x88, 0xAE, 0xF6, 0xE9, 0x99, 0xB8, 0xF6,
+ 0xE5, 0x80, 0xAB, 0xF6, 0xE5, 0xB4, 0x99, 0xF6,
+ 0xE6, 0xB7, 0xAA, 0xF6, 0xE8, 0xBC, 0xAA, 0xF6,
+ 0xE5, 0xBE, 0x8B, 0xF6, 0xE6, 0x85, 0x84, 0xF6,
+ 0xE6, 0xA0, 0x97, 0xF6, 0xE7, 0x8E, 0x87, 0xF6,
+ 0xE9, 0x9A, 0x86, 0xF6, 0xE5, 0x88, 0xA9, 0xF6,
+ 0xE5, 0x90, 0x8F, 0xF6, 0xE5, 0xB1, 0xA5, 0xF6,
+ 0xE6, 0x98, 0x93, 0xF6, 0xE6, 0x9D, 0x8E, 0xF6,
+ 0xE6, 0xA2, 0xA8, 0xF6, 0xE6, 0xB3, 0xA5, 0xF6,
+ 0xE7, 0x90, 0x86, 0xF6, 0xE7, 0x97, 0xA2, 0xF6,
+ 0xE7, 0xBD, 0xB9, 0xF6, 0xE8, 0xA3, 0x8F, 0xF6,
+ 0xE8, 0xA3, 0xA1, 0xF6, 0xE9, 0x87, 0x8C, 0xF6,
+ 0xE9, 0x9B, 0xA2, 0xF6, 0xE5, 0x8C, 0xBF, 0xF6,
+ 0xE6, 0xBA, 0xBA, 0xF6, 0xE5, 0x90, 0x9D, 0xF6,
+ 0xE7, 0x87, 0x90, 0xF6, 0xE7, 0x92, 0x98, 0xF6,
+ 0xE8, 0x97, 0xBA, 0xF6, 0xE9, 0x9A, 0xA3, 0xF6,
+ 0xE9, 0xB1, 0x97, 0xF6, 0xE9, 0xBA, 0x9F, 0xF6,
+ 0xE6, 0x9E, 0x97, 0xF6, 0xE6, 0xB7, 0x8B, 0xF6,
+ 0xE8, 0x87, 0xA8, 0xF6, 0xE7, 0xAB, 0x8B, 0xF6,
+ 0xE7, 0xAC, 0xA0, 0xF6, 0xE7, 0xB2, 0x92, 0xF6,
+ 0xE7, 0x8B, 0x80, 0xF6, 0xE7, 0x82, 0x99, 0xF6,
+ 0xE8, 0xAD, 0x98, 0xF6, 0xE4, 0xBB, 0x80, 0xF6,
+ 0xE8, 0x8C, 0xB6, 0xF6, 0xE5, 0x88, 0xBA, 0xF6,
+ 0xE5, 0x88, 0x87, 0xF6, 0xE5, 0xBA, 0xA6, 0xF6,
+ 0xE6, 0x8B, 0x93, 0xF6, 0xE7, 0xB3, 0x96, 0xF6,
+ 0xE5, 0xAE, 0x85, 0xF6, 0xE6, 0xB4, 0x9E, 0xF6,
+ 0xE6, 0x9A, 0xB4, 0xF6, 0xE8, 0xBC, 0xBB, 0xF6,
+ 0xE8, 0xA1, 0x8C, 0xF6, 0xE9, 0x99, 0x8D, 0xF6,
+ 0xE8, 0xA6, 0x8B, 0xF6, 0xE5, 0xBB, 0x93, 0xF6,
+ 0xE5, 0x85, 0x80, 0xF6, 0xE5, 0x97, 0x80, 0xF6,
+ 0xE5, 0xA1, 0x9A, 0xF6, 0xE6, 0x99, 0xB4, 0xF6,
+ 0xE5, 0x87, 0x9E, 0xF6, 0xE7, 0x8C, 0xAA, 0xF6,
+ 0xE7, 0x9B, 0x8A, 0xF6, 0xE7, 0xA4, 0xBC, 0xF6,
+ 0xE7, 0xA5, 0x9E, 0xF6, 0xE7, 0xA5, 0xA5, 0xF6,
+ 0xE7, 0xA6, 0x8F, 0xF6, 0xE9, 0x9D, 0x96, 0xF6,
+ 0xE7, 0xB2, 0xBE, 0xF6, 0xE7, 0xBE, 0xBD, 0xF6,
+ 0xE8, 0x98, 0x92, 0xF6, 0xE8, 0xAB, 0xB8, 0xF6,
+ 0xE9, 0x80, 0xB8, 0xF6, 0xE9, 0x83, 0xBD, 0xF6,
+ 0xE9, 0xA3, 0xAF, 0xF6, 0xE9, 0xA3, 0xBC, 0xF6,
+ 0xE9, 0xA4, 0xA8, 0xF6, 0xE9, 0xB6, 0xB4, 0xF6,
+ 0xE4, 0xBE, 0xAE, 0xF6, 0xE5, 0x83, 0xA7, 0xF6,
+ 0xE5, 0x85, 0x8D, 0xF6, 0xE5, 0x8B, 0x89, 0xF6,
+ 0xE5, 0x8B, 0xA4, 0xF6, 0xE5, 0x8D, 0x91, 0xF6,
+ 0xE5, 0x96, 0x9D, 0xF6, 0xE5, 0x98, 0x86, 0xF6,
+ 0xE5, 0x99, 0xA8, 0xF6, 0xE5, 0xA1, 0x80, 0xF6,
+ 0xE5, 0xA2, 0xA8, 0xF6, 0xE5, 0xB1, 0xA4, 0xF6,
+ 0xE5, 0xB1, 0xAE, 0xF6, 0xE6, 0x82, 0x94, 0xF6,
+ 0xE6, 0x85, 0xA8, 0xF6, 0xE6, 0x86, 0x8E, 0xF6,
+ 0xE6, 0x87, 0xB2, 0xF6, 0xE6, 0x95, 0x8F, 0xF6,
+ 0xE6, 0x97, 0xA2, 0xF6, 0xE6, 0x9A, 0x91, 0xF6,
+ 0xE6, 0xA2, 0x85, 0xF6, 0xE6, 0xB5, 0xB7, 0xF6,
+ 0xE6, 0xB8, 0x9A, 0xF6, 0xE6, 0xBC, 0xA2, 0xF6,
+ 0xE7, 0x85, 0xAE, 0xF6, 0xE7, 0x88, 0xAB, 0xF6,
+ 0xE7, 0x90, 0xA2, 0xF6, 0xE7, 0xA2, 0x91, 0xF6,
+ 0xE7, 0xA4, 0xBE, 0xF6, 0xE7, 0xA5, 0x89, 0xF6,
+ 0xE7, 0xA5, 0x88, 0xF6, 0xE7, 0xA5, 0x90, 0xF6,
+ 0xE7, 0xA5, 0x96, 0xF6, 0xE7, 0xA5, 0x9D, 0xF6,
+ 0xE7, 0xA6, 0x8D, 0xF6, 0xE7, 0xA6, 0x8E, 0xF6,
+ 0xE7, 0xA9, 0x80, 0xF6, 0xE7, 0xAA, 0x81, 0xF6,
+ 0xE7, 0xAF, 0x80, 0xF6, 0xE7, 0xB7, 0xB4, 0xF6,
+ 0xE7, 0xB8, 0x89, 0xF6, 0xE7, 0xB9, 0x81, 0xF6,
+ 0xE7, 0xBD, 0xB2, 0xF6, 0xE8, 0x80, 0x85, 0xF6,
+ 0xE8, 0x87, 0xAD, 0xF6, 0xE8, 0x89, 0xB9, 0xF6,
+ 0xE8, 0x89, 0xB9, 0xF6, 0xE8, 0x91, 0x97, 0xF6,
+ 0xE8, 0xA4, 0x90, 0xF6, 0xE8, 0xA6, 0x96, 0xF6,
+ 0xE8, 0xAC, 0x81, 0xF6, 0xE8, 0xAC, 0xB9, 0xF6,
+ 0xE8, 0xB3, 0x93, 0xF6, 0xE8, 0xB4, 0x88, 0xF6,
+ 0xE8, 0xBE, 0xB6, 0xF6, 0xE9, 0x80, 0xB8, 0xF6,
+ 0xE9, 0x9B, 0xA3, 0xF6, 0xE9, 0x9F, 0xBF, 0xF6,
+ 0xE9, 0xA0, 0xBB, 0x66, 0x66, 0x66, 0x69, 0x66,
+ 0x6C, 0x66, 0x66, 0x69, 0x66, 0x66, 0x6C, 0x73,
+ 0x74, 0x73, 0x74, 0xD5, 0xB4, 0xD5, 0xB6, 0xD5,
+ 0xB4, 0xD5, 0xA5, 0xD5, 0xB4, 0xD5, 0xAB, 0xD5,
+ 0xBE, 0xD5, 0xB6, 0xD5, 0xB4, 0xD5, 0xAD, 0xF6,
+ 0xD7, 0x99, 0xD6, 0xB4, 0xF6, 0xD7, 0xB2, 0xD6,
+ 0xB7, 0xD7, 0xA2, 0xD7, 0x90, 0xD7, 0x93, 0xD7,
+ 0x94, 0xD7, 0x9B, 0xD7, 0x9C, 0xD7, 0x9D, 0xD7,
+ 0xA8, 0xD7, 0xAA, 0x2B, 0xF6, 0xD7, 0xA9, 0xD7,
+ 0x81, 0xF6, 0xD7, 0xA9, 0xD7, 0x82, 0xF6, 0xD7,
+ 0xA9, 0xD6, 0xBC, 0xD7, 0x81, 0xF6, 0xD7, 0xA9,
+ 0xD6, 0xBC, 0xD7, 0x82, 0xF6, 0xD7, 0x90, 0xD6,
+ 0xB7, 0xF6, 0xD7, 0x90, 0xD6, 0xB8, 0xF6, 0xD7,
+ 0x90, 0xD6, 0xBC, 0xF6, 0xD7, 0x91, 0xD6, 0xBC,
+ 0xF6, 0xD7, 0x92, 0xD6, 0xBC, 0xF6, 0xD7, 0x93,
+ 0xD6, 0xBC, 0xF6, 0xD7, 0x94, 0xD6, 0xBC, 0xF6,
+ 0xD7, 0x95, 0xD6, 0xBC, 0xF6, 0xD7, 0x96, 0xD6,
+ 0xBC, 0xF6, 0xD7, 0x98, 0xD6, 0xBC, 0xF6, 0xD7,
+ 0x99, 0xD6, 0xBC, 0xF6, 0xD7, 0x9A, 0xD6, 0xBC,
+ 0xF6, 0xD7, 0x9B, 0xD6, 0xBC, 0xF6, 0xD7, 0x9C,
+ 0xD6, 0xBC, 0xF6, 0xD7, 0x9E, 0xD6, 0xBC, 0xF6,
+ 0xD7, 0xA0, 0xD6, 0xBC, 0xF6, 0xD7, 0xA1, 0xD6,
+ 0xBC, 0xF6, 0xD7, 0xA3, 0xD6, 0xBC, 0xF6, 0xD7,
+ 0xA4, 0xD6, 0xBC, 0xF6, 0xD7, 0xA6, 0xD6, 0xBC,
+ 0xF6, 0xD7, 0xA7, 0xD6, 0xBC, 0xF6, 0xD7, 0xA8,
+ 0xD6, 0xBC, 0xF6, 0xD7, 0xA9, 0xD6, 0xBC, 0xF6,
+ 0xD7, 0xAA, 0xD6, 0xBC, 0xF6, 0xD7, 0x95, 0xD6,
+ 0xB9, 0xF6, 0xD7, 0x91, 0xD6, 0xBF, 0xF6, 0xD7,
+ 0x9B, 0xD6, 0xBF, 0xF6, 0xD7, 0xA4, 0xD6, 0xBF,
+ 0xD7, 0x90, 0xD7, 0x9C, 0xD9, 0xB1, 0xD9, 0xB1,
+ 0xD9, 0xBB, 0xD9, 0xBB, 0xD9, 0xBB, 0xD9, 0xBB,
+ 0xD9, 0xBE, 0xD9, 0xBE, 0xD9, 0xBE, 0xD9, 0xBE,
+ 0xDA, 0x80, 0xDA, 0x80, 0xDA, 0x80, 0xDA, 0x80,
+ 0xD9, 0xBA, 0xD9, 0xBA, 0xD9, 0xBA, 0xD9, 0xBA,
+ 0xD9, 0xBF, 0xD9, 0xBF, 0xD9, 0xBF, 0xD9, 0xBF,
+ 0xD9, 0xB9, 0xD9, 0xB9, 0xD9, 0xB9, 0xD9, 0xB9,
+ 0xDA, 0xA4, 0xDA, 0xA4, 0xDA, 0xA4, 0xDA, 0xA4,
+ 0xDA, 0xA6, 0xDA, 0xA6, 0xDA, 0xA6, 0xDA, 0xA6,
+ 0xDA, 0x84, 0xDA, 0x84, 0xDA, 0x84, 0xDA, 0x84,
+ 0xDA, 0x83, 0xDA, 0x83, 0xDA, 0x83, 0xDA, 0x83,
+ 0xDA, 0x86, 0xDA, 0x86, 0xDA, 0x86, 0xDA, 0x86,
+ 0xDA, 0x87, 0xDA, 0x87, 0xDA, 0x87, 0xDA, 0x87,
+ 0xDA, 0x8D, 0xDA, 0x8D, 0xDA, 0x8C, 0xDA, 0x8C,
+ 0xDA, 0x8E, 0xDA, 0x8E, 0xDA, 0x88, 0xDA, 0x88,
+ 0xDA, 0x98, 0xDA, 0x98, 0xDA, 0x91, 0xDA, 0x91,
+ 0xDA, 0xA9, 0xDA, 0xA9, 0xDA, 0xA9, 0xDA, 0xA9,
+ 0xDA, 0xAF, 0xDA, 0xAF, 0xDA, 0xAF, 0xDA, 0xAF,
+ 0xDA, 0xB3, 0xDA, 0xB3, 0xDA, 0xB3, 0xDA, 0xB3,
+ 0xDA, 0xB1, 0xDA, 0xB1, 0xDA, 0xB1, 0xDA, 0xB1,
+ 0xDA, 0xBA, 0xDA, 0xBA, 0xDA, 0xBB, 0xDA, 0xBB,
+ 0xDA, 0xBB, 0xDA, 0xBB, 0xDB, 0x95, 0xD9, 0x94,
+ 0xDB, 0x95, 0xD9, 0x94, 0xDB, 0x81, 0xDB, 0x81,
+ 0xDB, 0x81, 0xDB, 0x81, 0xDA, 0xBE, 0xDA, 0xBE,
+ 0xDA, 0xBE, 0xDA, 0xBE, 0xDB, 0x92, 0xDB, 0x92,
+ 0xDB, 0x92, 0xD9, 0x94, 0xDB, 0x92, 0xD9, 0x94,
+ 0xDA, 0xAD, 0xDA, 0xAD, 0xDA, 0xAD, 0xDA, 0xAD,
+ 0xDB, 0x87, 0xDB, 0x87, 0xDB, 0x86, 0xDB, 0x86,
+ 0xDB, 0x88, 0xDB, 0x88, 0xDB, 0x87, 0xD9, 0xB4,
+ 0xDB, 0x8B, 0xDB, 0x8B, 0xDB, 0x85, 0xDB, 0x85,
+ 0xDB, 0x89, 0xDB, 0x89, 0xDB, 0x90, 0xDB, 0x90,
+ 0xDB, 0x90, 0xDB, 0x90, 0xD9, 0x89, 0xD9, 0x89,
+ 0xD9, 0x8A, 0xD9, 0x94, 0xD8, 0xA7, 0xD9, 0x8A,
+ 0xD9, 0x94, 0xD8, 0xA7, 0xD9, 0x8A, 0xD9, 0x94,
+ 0xDB, 0x95, 0xD9, 0x8A, 0xD9, 0x94, 0xDB, 0x95,
+ 0xD9, 0x8A, 0xD9, 0x94, 0xD9, 0x88, 0xD9, 0x8A,
+ 0xD9, 0x94, 0xD9, 0x88, 0xD9, 0x8A, 0xD9, 0x94,
+ 0xDB, 0x87, 0xD9, 0x8A, 0xD9, 0x94, 0xDB, 0x87,
+ 0xD9, 0x8A, 0xD9, 0x94, 0xDB, 0x86, 0xD9, 0x8A,
+ 0xD9, 0x94, 0xDB, 0x86, 0xD9, 0x8A, 0xD9, 0x94,
+ 0xDB, 0x88, 0xD9, 0x8A, 0xD9, 0x94, 0xDB, 0x88,
+ 0xD9, 0x8A, 0xD9, 0x94, 0xDB, 0x90, 0xD9, 0x8A,
+ 0xD9, 0x94, 0xDB, 0x90, 0xD9, 0x8A, 0xD9, 0x94,
+ 0xDB, 0x90, 0xD9, 0x8A, 0xD9, 0x94, 0xD9, 0x89,
+ 0xD9, 0x8A, 0xD9, 0x94, 0xD9, 0x89, 0xD9, 0x8A,
+ 0xD9, 0x94, 0xD9, 0x89, 0xDB, 0x8C, 0xDB, 0x8C,
+ 0xDB, 0x8C, 0xDB, 0x8C, 0xD9, 0x8A, 0xD9, 0x94,
+ 0xD8, 0xAC, 0xD9, 0x8A, 0xD9, 0x94, 0xD8, 0xAD,
+ 0xD9, 0x8A, 0xD9, 0x94, 0xD9, 0x85, 0xD9, 0x8A,
+ 0xD9, 0x94, 0xD9, 0x89, 0xD9, 0x8A, 0xD9, 0x94,
+ 0xD9, 0x8A, 0xD8, 0xA8, 0xD8, 0xAC, 0xD8, 0xA8,
+ 0xD8, 0xAD, 0xD8, 0xA8, 0xD8, 0xAE, 0xD8, 0xA8,
+ 0xD9, 0x85, 0xD8, 0xA8, 0xD9, 0x89, 0xD8, 0xA8,
+ 0xD9, 0x8A, 0xD8, 0xAA, 0xD8, 0xAC, 0xD8, 0xAA,
+ 0xD8, 0xAD, 0xD8, 0xAA, 0xD8, 0xAE, 0xD8, 0xAA,
+ 0xD9, 0x85, 0xD8, 0xAA, 0xD9, 0x89, 0xD8, 0xAA,
+ 0xD9, 0x8A, 0xD8, 0xAB, 0xD8, 0xAC, 0xD8, 0xAB,
+ 0xD9, 0x85, 0xD8, 0xAB, 0xD9, 0x89, 0xD8, 0xAB,
+ 0xD9, 0x8A, 0xD8, 0xAC, 0xD8, 0xAD, 0xD8, 0xAC,
+ 0xD9, 0x85, 0xD8, 0xAD, 0xD8, 0xAC, 0xD8, 0xAD,
+ 0xD9, 0x85, 0xD8, 0xAE, 0xD8, 0xAC, 0xD8, 0xAE,
+ 0xD8, 0xAD, 0xD8, 0xAE, 0xD9, 0x85, 0xD8, 0xB3,
+ 0xD8, 0xAC, 0xD8, 0xB3, 0xD8, 0xAD, 0xD8, 0xB3,
+ 0xD8, 0xAE, 0xD8, 0xB3, 0xD9, 0x85, 0xD8, 0xB5,
+ 0xD8, 0xAD, 0xD8, 0xB5, 0xD9, 0x85, 0xD8, 0xB6,
+ 0xD8, 0xAC, 0xD8, 0xB6, 0xD8, 0xAD, 0xD8, 0xB6,
+ 0xD8, 0xAE, 0xD8, 0xB6, 0xD9, 0x85, 0xD8, 0xB7,
+ 0xD8, 0xAD, 0xD8, 0xB7, 0xD9, 0x85, 0xD8, 0xB8,
+ 0xD9, 0x85, 0xD8, 0xB9, 0xD8, 0xAC, 0xD8, 0xB9,
+ 0xD9, 0x85, 0xD8, 0xBA, 0xD8, 0xAC, 0xD8, 0xBA,
+ 0xD9, 0x85, 0xD9, 0x81, 0xD8, 0xAC, 0xD9, 0x81,
+ 0xD8, 0xAD, 0xD9, 0x81, 0xD8, 0xAE, 0xD9, 0x81,
+ 0xD9, 0x85, 0xD9, 0x81, 0xD9, 0x89, 0xD9, 0x81,
+ 0xD9, 0x8A, 0xD9, 0x82, 0xD8, 0xAD, 0xD9, 0x82,
+ 0xD9, 0x85, 0xD9, 0x82, 0xD9, 0x89, 0xD9, 0x82,
+ 0xD9, 0x8A, 0xD9, 0x83, 0xD8, 0xA7, 0xD9, 0x83,
+ 0xD8, 0xAC, 0xD9, 0x83, 0xD8, 0xAD, 0xD9, 0x83,
+ 0xD8, 0xAE, 0xD9, 0x83, 0xD9, 0x84, 0xD9, 0x83,
+ 0xD9, 0x85, 0xD9, 0x83, 0xD9, 0x89, 0xD9, 0x83,
+ 0xD9, 0x8A, 0xD9, 0x84, 0xD8, 0xAC, 0xD9, 0x84,
+ 0xD8, 0xAD, 0xD9, 0x84, 0xD8, 0xAE, 0xD9, 0x84,
+ 0xD9, 0x85, 0xD9, 0x84, 0xD9, 0x89, 0xD9, 0x84,
+ 0xD9, 0x8A, 0xD9, 0x85, 0xD8, 0xAC, 0xD9, 0x85,
+ 0xD8, 0xAD, 0xD9, 0x85, 0xD8, 0xAE, 0xD9, 0x85,
+ 0xD9, 0x85, 0xD9, 0x85, 0xD9, 0x89, 0xD9, 0x85,
+ 0xD9, 0x8A, 0xD9, 0x86, 0xD8, 0xAC, 0xD9, 0x86,
+ 0xD8, 0xAD, 0xD9, 0x86, 0xD8, 0xAE, 0xD9, 0x86,
+ 0xD9, 0x85, 0xD9, 0x86, 0xD9, 0x89, 0xD9, 0x86,
+ 0xD9, 0x8A, 0xD9, 0x87, 0xD8, 0xAC, 0xD9, 0x87,
+ 0xD9, 0x85, 0xD9, 0x87, 0xD9, 0x89, 0xD9, 0x87,
+ 0xD9, 0x8A, 0xD9, 0x8A, 0xD8, 0xAC, 0xD9, 0x8A,
+ 0xD8, 0xAD, 0xD9, 0x8A, 0xD8, 0xAE, 0xD9, 0x8A,
+ 0xD9, 0x85, 0xD9, 0x8A, 0xD9, 0x89, 0xD9, 0x8A,
+ 0xD9, 0x8A, 0xD8, 0xB0, 0xD9, 0xB0, 0xD8, 0xB1,
+ 0xD9, 0xB0, 0xD9, 0x89, 0xD9, 0xB0, 0x20, 0xD9,
+ 0x8C, 0xD9, 0x91, 0x20, 0xD9, 0x8D, 0xD9, 0x91,
+ 0x20, 0xD9, 0x8E, 0xD9, 0x91, 0x20, 0xD9, 0x8F,
+ 0xD9, 0x91, 0x20, 0xD9, 0x90, 0xD9, 0x91, 0x20,
+ 0xD9, 0x91, 0xD9, 0xB0, 0xD9, 0x8A, 0xD9, 0x94,
+ 0xD8, 0xB1, 0xD9, 0x8A, 0xD9, 0x94, 0xD8, 0xB2,
+ 0xD9, 0x8A, 0xD9, 0x94, 0xD9, 0x85, 0xD9, 0x8A,
+ 0xD9, 0x94, 0xD9, 0x86, 0xD9, 0x8A, 0xD9, 0x94,
+ 0xD9, 0x89, 0xD9, 0x8A, 0xD9, 0x94, 0xD9, 0x8A,
+ 0xD8, 0xA8, 0xD8, 0xB1, 0xD8, 0xA8, 0xD8, 0xB2,
+ 0xD8, 0xA8, 0xD9, 0x85, 0xD8, 0xA8, 0xD9, 0x86,
+ 0xD8, 0xA8, 0xD9, 0x89, 0xD8, 0xA8, 0xD9, 0x8A,
+ 0xD8, 0xAA, 0xD8, 0xB1, 0xD8, 0xAA, 0xD8, 0xB2,
+ 0xD8, 0xAA, 0xD9, 0x85, 0xD8, 0xAA, 0xD9, 0x86,
+ 0xD8, 0xAA, 0xD9, 0x89, 0xD8, 0xAA, 0xD9, 0x8A,
+ 0xD8, 0xAB, 0xD8, 0xB1, 0xD8, 0xAB, 0xD8, 0xB2,
+ 0xD8, 0xAB, 0xD9, 0x85, 0xD8, 0xAB, 0xD9, 0x86,
+ 0xD8, 0xAB, 0xD9, 0x89, 0xD8, 0xAB, 0xD9, 0x8A,
+ 0xD9, 0x81, 0xD9, 0x89, 0xD9, 0x81, 0xD9, 0x8A,
+ 0xD9, 0x82, 0xD9, 0x89, 0xD9, 0x82, 0xD9, 0x8A,
+ 0xD9, 0x83, 0xD8, 0xA7, 0xD9, 0x83, 0xD9, 0x84,
+ 0xD9, 0x83, 0xD9, 0x85, 0xD9, 0x83, 0xD9, 0x89,
+ 0xD9, 0x83, 0xD9, 0x8A, 0xD9, 0x84, 0xD9, 0x85,
+ 0xD9, 0x84, 0xD9, 0x89, 0xD9, 0x84, 0xD9, 0x8A,
+ 0xD9, 0x85, 0xD8, 0xA7, 0xD9, 0x85, 0xD9, 0x85,
+ 0xD9, 0x86, 0xD8, 0xB1, 0xD9, 0x86, 0xD8, 0xB2,
+ 0xD9, 0x86, 0xD9, 0x85, 0xD9, 0x86, 0xD9, 0x86,
+ 0xD9, 0x86, 0xD9, 0x89, 0xD9, 0x86, 0xD9, 0x8A,
+ 0xD9, 0x89, 0xD9, 0xB0, 0xD9, 0x8A, 0xD8, 0xB1,
+ 0xD9, 0x8A, 0xD8, 0xB2, 0xD9, 0x8A, 0xD9, 0x85,
+ 0xD9, 0x8A, 0xD9, 0x86, 0xD9, 0x8A, 0xD9, 0x89,
+ 0xD9, 0x8A, 0xD9, 0x8A, 0xD9, 0x8A, 0xD9, 0x94,
+ 0xD8, 0xAC, 0xD9, 0x8A, 0xD9, 0x94, 0xD8, 0xAD,
+ 0xD9, 0x8A, 0xD9, 0x94, 0xD8, 0xAE, 0xD9, 0x8A,
+ 0xD9, 0x94, 0xD9, 0x85, 0xD9, 0x8A, 0xD9, 0x94,
+ 0xD9, 0x87, 0xD8, 0xA8, 0xD8, 0xAC, 0xD8, 0xA8,
+ 0xD8, 0xAD, 0xD8, 0xA8, 0xD8, 0xAE, 0xD8, 0xA8,
+ 0xD9, 0x85, 0xD8, 0xA8, 0xD9, 0x87, 0xD8, 0xAA,
+ 0xD8, 0xAC, 0xD8, 0xAA, 0xD8, 0xAD, 0xD8, 0xAA,
+ 0xD8, 0xAE, 0xD8, 0xAA, 0xD9, 0x85, 0xD8, 0xAA,
+ 0xD9, 0x87, 0xD8, 0xAB, 0xD9, 0x85, 0xD8, 0xAC,
+ 0xD8, 0xAD, 0xD8, 0xAC, 0xD9, 0x85, 0xD8, 0xAD,
+ 0xD8, 0xAC, 0xD8, 0xAD, 0xD9, 0x85, 0xD8, 0xAE,
+ 0xD8, 0xAC, 0xD8, 0xAE, 0xD9, 0x85, 0xD8, 0xB3,
+ 0xD8, 0xAC, 0xD8, 0xB3, 0xD8, 0xAD, 0xD8, 0xB3,
+ 0xD8, 0xAE, 0xD8, 0xB3, 0xD9, 0x85, 0xD8, 0xB5,
+ 0xD8, 0xAD, 0xD8, 0xB5, 0xD8, 0xAE, 0xD8, 0xB5,
+ 0xD9, 0x85, 0xD8, 0xB6, 0xD8, 0xAC, 0xD8, 0xB6,
+ 0xD8, 0xAD, 0xD8, 0xB6, 0xD8, 0xAE, 0xD8, 0xB6,
+ 0xD9, 0x85, 0xD8, 0xB7, 0xD8, 0xAD, 0xD8, 0xB8,
+ 0xD9, 0x85, 0xD8, 0xB9, 0xD8, 0xAC, 0xD8, 0xB9,
+ 0xD9, 0x85, 0xD8, 0xBA, 0xD8, 0xAC, 0xD8, 0xBA,
+ 0xD9, 0x85, 0xD9, 0x81, 0xD8, 0xAC, 0xD9, 0x81,
+ 0xD8, 0xAD, 0xD9, 0x81, 0xD8, 0xAE, 0xD9, 0x81,
+ 0xD9, 0x85, 0xD9, 0x82, 0xD8, 0xAD, 0xD9, 0x82,
+ 0xD9, 0x85, 0xD9, 0x83, 0xD8, 0xAC, 0xD9, 0x83,
+ 0xD8, 0xAD, 0xD9, 0x83, 0xD8, 0xAE, 0xD9, 0x83,
+ 0xD9, 0x84, 0xD9, 0x83, 0xD9, 0x85, 0xD9, 0x84,
+ 0xD8, 0xAC, 0xD9, 0x84, 0xD8, 0xAD, 0xD9, 0x84,
+ 0xD8, 0xAE, 0xD9, 0x84, 0xD9, 0x85, 0xD9, 0x84,
+ 0xD9, 0x87, 0xD9, 0x85, 0xD8, 0xAC, 0xD9, 0x85,
+ 0xD8, 0xAD, 0xD9, 0x85, 0xD8, 0xAE, 0xD9, 0x85,
+ 0xD9, 0x85, 0xD9, 0x86, 0xD8, 0xAC, 0xD9, 0x86,
+ 0xD8, 0xAD, 0xD9, 0x86, 0xD8, 0xAE, 0xD9, 0x86,
+ 0xD9, 0x85, 0xD9, 0x86, 0xD9, 0x87, 0xD9, 0x87,
+ 0xD8, 0xAC, 0xD9, 0x87, 0xD9, 0x85, 0xD9, 0x87,
+ 0xD9, 0xB0, 0xD9, 0x8A, 0xD8, 0xAC, 0xD9, 0x8A,
+ 0xD8, 0xAD, 0xD9, 0x8A, 0xD8, 0xAE, 0xD9, 0x8A,
+ 0xD9, 0x85, 0xD9, 0x8A, 0xD9, 0x87, 0xD9, 0x8A,
+ 0xD9, 0x94, 0xD9, 0x85, 0xD9, 0x8A, 0xD9, 0x94,
+ 0xD9, 0x87, 0xD8, 0xA8, 0xD9, 0x85, 0xD8, 0xA8,
+ 0xD9, 0x87, 0xD8, 0xAA, 0xD9, 0x85, 0xD8, 0xAA,
+ 0xD9, 0x87, 0xD8, 0xAB, 0xD9, 0x85, 0xD8, 0xAB,
+ 0xD9, 0x87, 0xD8, 0xB3, 0xD9, 0x85, 0xD8, 0xB3,
+ 0xD9, 0x87, 0xD8, 0xB4, 0xD9, 0x85, 0xD8, 0xB4,
+ 0xD9, 0x87, 0xD9, 0x83, 0xD9, 0x84, 0xD9, 0x83,
+ 0xD9, 0x85, 0xD9, 0x84, 0xD9, 0x85, 0xD9, 0x86,
+ 0xD9, 0x85, 0xD9, 0x86, 0xD9, 0x87, 0xD9, 0x8A,
+ 0xD9, 0x85, 0xD9, 0x8A, 0xD9, 0x87, 0xD9, 0x80,
+ 0xD9, 0x8E, 0xD9, 0x91, 0xD9, 0x80, 0xD9, 0x8F,
+ 0xD9, 0x91, 0xD9, 0x80, 0xD9, 0x90, 0xD9, 0x91,
+ 0xD8, 0xB7, 0xD9, 0x89, 0xD8, 0xB7, 0xD9, 0x8A,
+ 0xD8, 0xB9, 0xD9, 0x89, 0xD8, 0xB9, 0xD9, 0x8A,
+ 0xD8, 0xBA, 0xD9, 0x89, 0xD8, 0xBA, 0xD9, 0x8A,
+ 0xD8, 0xB3, 0xD9, 0x89, 0xD8, 0xB3, 0xD9, 0x8A,
+ 0xD8, 0xB4, 0xD9, 0x89, 0xD8, 0xB4, 0xD9, 0x8A,
+ 0xD8, 0xAD, 0xD9, 0x89, 0xD8, 0xAD, 0xD9, 0x8A,
+ 0xD8, 0xAC, 0xD9, 0x89, 0xD8, 0xAC, 0xD9, 0x8A,
+ 0xD8, 0xAE, 0xD9, 0x89, 0xD8, 0xAE, 0xD9, 0x8A,
+ 0xD8, 0xB5, 0xD9, 0x89, 0xD8, 0xB5, 0xD9, 0x8A,
+ 0xD8, 0xB6, 0xD9, 0x89, 0xD8, 0xB6, 0xD9, 0x8A,
+ 0xD8, 0xB4, 0xD8, 0xAC, 0xD8, 0xB4, 0xD8, 0xAD,
+ 0xD8, 0xB4, 0xD8, 0xAE, 0xD8, 0xB4, 0xD9, 0x85,
+ 0xD8, 0xB4, 0xD8, 0xB1, 0xD8, 0xB3, 0xD8, 0xB1,
+ 0xD8, 0xB5, 0xD8, 0xB1, 0xD8, 0xB6, 0xD8, 0xB1,
+ 0xD8, 0xB7, 0xD9, 0x89, 0xD8, 0xB7, 0xD9, 0x8A,
+ 0xD8, 0xB9, 0xD9, 0x89, 0xD8, 0xB9, 0xD9, 0x8A,
+ 0xD8, 0xBA, 0xD9, 0x89, 0xD8, 0xBA, 0xD9, 0x8A,
+ 0xD8, 0xB3, 0xD9, 0x89, 0xD8, 0xB3, 0xD9, 0x8A,
+ 0xD8, 0xB4, 0xD9, 0x89, 0xD8, 0xB4, 0xD9, 0x8A,
+ 0xD8, 0xAD, 0xD9, 0x89, 0xD8, 0xAD, 0xD9, 0x8A,
+ 0xD8, 0xAC, 0xD9, 0x89, 0xD8, 0xAC, 0xD9, 0x8A,
+ 0xD8, 0xAE, 0xD9, 0x89, 0xD8, 0xAE, 0xD9, 0x8A,
+ 0xD8, 0xB5, 0xD9, 0x89, 0xD8, 0xB5, 0xD9, 0x8A,
+ 0xD8, 0xB6, 0xD9, 0x89, 0xD8, 0xB6, 0xD9, 0x8A,
+ 0xD8, 0xB4, 0xD8, 0xAC, 0xD8, 0xB4, 0xD8, 0xAD,
+ 0xD8, 0xB4, 0xD8, 0xAE, 0xD8, 0xB4, 0xD9, 0x85,
+ 0xD8, 0xB4, 0xD8, 0xB1, 0xD8, 0xB3, 0xD8, 0xB1,
+ 0xD8, 0xB5, 0xD8, 0xB1, 0xD8, 0xB6, 0xD8, 0xB1,
+ 0xD8, 0xB4, 0xD8, 0xAC, 0xD8, 0xB4, 0xD8, 0xAD,
+ 0xD8, 0xB4, 0xD8, 0xAE, 0xD8, 0xB4, 0xD9, 0x85,
+ 0xD8, 0xB3, 0xD9, 0x87, 0xD8, 0xB4, 0xD9, 0x87,
+ 0xD8, 0xB7, 0xD9, 0x85, 0xD8, 0xB3, 0xD8, 0xAC,
+ 0xD8, 0xB3, 0xD8, 0xAD, 0xD8, 0xB3, 0xD8, 0xAE,
+ 0xD8, 0xB4, 0xD8, 0xAC, 0xD8, 0xB4, 0xD8, 0xAD,
+ 0xD8, 0xB4, 0xD8, 0xAE, 0xD8, 0xB7, 0xD9, 0x85,
+ 0xD8, 0xB8, 0xD9, 0x85, 0xD8, 0xA7, 0xD9, 0x8B,
+ 0xD8, 0xA7, 0xD9, 0x8B, 0xD8, 0xAA, 0xD8, 0xAC,
+ 0xD9, 0x85, 0xD8, 0xAA, 0xD8, 0xAD, 0xD8, 0xAC,
+ 0xD8, 0xAA, 0xD8, 0xAD, 0xD8, 0xAC, 0xD8, 0xAA,
+ 0xD8, 0xAD, 0xD9, 0x85, 0xD8, 0xAA, 0xD8, 0xAE,
+ 0xD9, 0x85, 0xD8, 0xAA, 0xD9, 0x85, 0xD8, 0xAC,
+ 0xD8, 0xAA, 0xD9, 0x85, 0xD8, 0xAD, 0xD8, 0xAA,
+ 0xD9, 0x85, 0xD8, 0xAE, 0xD8, 0xAC, 0xD9, 0x85,
+ 0xD8, 0xAD, 0xD8, 0xAC, 0xD9, 0x85, 0xD8, 0xAD,
+ 0xD8, 0xAD, 0xD9, 0x85, 0xD9, 0x8A, 0xD8, 0xAD,
+ 0xD9, 0x85, 0xD9, 0x89, 0xD8, 0xB3, 0xD8, 0xAD,
+ 0xD8, 0xAC, 0xD8, 0xB3, 0xD8, 0xAC, 0xD8, 0xAD,
+ 0xD8, 0xB3, 0xD8, 0xAC, 0xD9, 0x89, 0xD8, 0xB3,
+ 0xD9, 0x85, 0xD8, 0xAD, 0xD8, 0xB3, 0xD9, 0x85,
+ 0xD8, 0xAD, 0xD8, 0xB3, 0xD9, 0x85, 0xD8, 0xAC,
+ 0xD8, 0xB3, 0xD9, 0x85, 0xD9, 0x85, 0xD8, 0xB3,
+ 0xD9, 0x85, 0xD9, 0x85, 0xD8, 0xB5, 0xD8, 0xAD,
+ 0xD8, 0xAD, 0xD8, 0xB5, 0xD8, 0xAD, 0xD8, 0xAD,
+ 0xD8, 0xB5, 0xD9, 0x85, 0xD9, 0x85, 0xD8, 0xB4,
+ 0xD8, 0xAD, 0xD9, 0x85, 0xD8, 0xB4, 0xD8, 0xAD,
+ 0xD9, 0x85, 0xD8, 0xB4, 0xD8, 0xAC, 0xD9, 0x8A,
+ 0xD8, 0xB4, 0xD9, 0x85, 0xD8, 0xAE, 0xD8, 0xB4,
+ 0xD9, 0x85, 0xD8, 0xAE, 0xD8, 0xB4, 0xD9, 0x85,
+ 0xD9, 0x85, 0xD8, 0xB4, 0xD9, 0x85, 0xD9, 0x85,
+ 0xD8, 0xB6, 0xD8, 0xAD, 0xD9, 0x89, 0xD8, 0xB6,
+ 0xD8, 0xAE, 0xD9, 0x85, 0xD8, 0xB6, 0xD8, 0xAE,
+ 0xD9, 0x85, 0xD8, 0xB7, 0xD9, 0x85, 0xD8, 0xAD,
+ 0xD8, 0xB7, 0xD9, 0x85, 0xD8, 0xAD, 0xD8, 0xB7,
+ 0xD9, 0x85, 0xD9, 0x85, 0xD8, 0xB7, 0xD9, 0x85,
+ 0xD9, 0x8A, 0xD8, 0xB9, 0xD8, 0xAC, 0xD9, 0x85,
+ 0xD8, 0xB9, 0xD9, 0x85, 0xD9, 0x85, 0xD8, 0xB9,
+ 0xD9, 0x85, 0xD9, 0x85, 0xD8, 0xB9, 0xD9, 0x85,
+ 0xD9, 0x89, 0xD8, 0xBA, 0xD9, 0x85, 0xD9, 0x85,
+ 0xD8, 0xBA, 0xD9, 0x85, 0xD9, 0x8A, 0xD8, 0xBA,
+ 0xD9, 0x85, 0xD9, 0x89, 0xD9, 0x81, 0xD8, 0xAE,
+ 0xD9, 0x85, 0xD9, 0x81, 0xD8, 0xAE, 0xD9, 0x85,
+ 0xD9, 0x82, 0xD9, 0x85, 0xD8, 0xAD, 0xD9, 0x82,
+ 0xD9, 0x85, 0xD9, 0x85, 0xD9, 0x84, 0xD8, 0xAD,
+ 0xD9, 0x85, 0xD9, 0x84, 0xD8, 0xAD, 0xD9, 0x8A,
+ 0xD9, 0x84, 0xD8, 0xAD, 0xD9, 0x89, 0xD9, 0x84,
+ 0xD8, 0xAC, 0xD8, 0xAC, 0xD9, 0x84, 0xD8, 0xAC,
+ 0xD8, 0xAC, 0xD9, 0x84, 0xD8, 0xAE, 0xD9, 0x85,
+ 0xD9, 0x84, 0xD8, 0xAE, 0xD9, 0x85, 0xD9, 0x84,
+ 0xD9, 0x85, 0xD8, 0xAD, 0xD9, 0x84, 0xD9, 0x85,
+ 0xD8, 0xAD, 0xD9, 0x85, 0xD8, 0xAD, 0xD8, 0xAC,
+ 0xD9, 0x85, 0xD8, 0xAD, 0xD9, 0x85, 0xD9, 0x85,
+ 0xD8, 0xAD, 0xD9, 0x8A, 0xD9, 0x85, 0xD8, 0xAC,
+ 0xD8, 0xAD, 0xD9, 0x85, 0xD8, 0xAC, 0xD9, 0x85,
+ 0xD9, 0x85, 0xD8, 0xAE, 0xD8, 0xAC, 0xD9, 0x85,
+ 0xD8, 0xAE, 0xD9, 0x85, 0xD9, 0x85, 0xD8, 0xAC,
+ 0xD8, 0xAE, 0xD9, 0x87, 0xD9, 0x85, 0xD8, 0xAC,
+ 0xD9, 0x87, 0xD9, 0x85, 0xD9, 0x85, 0xD9, 0x86,
+ 0xD8, 0xAD, 0xD9, 0x85, 0xD9, 0x86, 0xD8, 0xAD,
+ 0xD9, 0x89, 0xD9, 0x86, 0xD8, 0xAC, 0xD9, 0x85,
+ 0xD9, 0x86, 0xD8, 0xAC, 0xD9, 0x85, 0xD9, 0x86,
+ 0xD8, 0xAC, 0xD9, 0x89, 0xD9, 0x86, 0xD9, 0x85,
+ 0xD9, 0x8A, 0xD9, 0x86, 0xD9, 0x85, 0xD9, 0x89,
+ 0xD9, 0x8A, 0xD9, 0x85, 0xD9, 0x85, 0xD9, 0x8A,
+ 0xD9, 0x85, 0xD9, 0x85, 0xD8, 0xA8, 0xD8, 0xAE,
+ 0xD9, 0x8A, 0xD8, 0xAA, 0xD8, 0xAC, 0xD9, 0x8A,
+ 0xD8, 0xAA, 0xD8, 0xAC, 0xD9, 0x89, 0xD8, 0xAA,
+ 0xD8, 0xAE, 0xD9, 0x8A, 0xD8, 0xAA, 0xD8, 0xAE,
+ 0xD9, 0x89, 0xD8, 0xAA, 0xD9, 0x85, 0xD9, 0x8A,
+ 0xD8, 0xAA, 0xD9, 0x85, 0xD9, 0x89, 0xD8, 0xAC,
+ 0xD9, 0x85, 0xD9, 0x8A, 0xD8, 0xAC, 0xD8, 0xAD,
+ 0xD9, 0x89, 0xD8, 0xAC, 0xD9, 0x85, 0xD9, 0x89,
+ 0xD8, 0xB3, 0xD8, 0xAE, 0xD9, 0x89, 0xD8, 0xB5,
+ 0xD8, 0xAD, 0xD9, 0x8A, 0xD8, 0xB4, 0xD8, 0xAD,
+ 0xD9, 0x8A, 0xD8, 0xB6, 0xD8, 0xAD, 0xD9, 0x8A,
+ 0xD9, 0x84, 0xD8, 0xAC, 0xD9, 0x8A, 0xD9, 0x84,
+ 0xD9, 0x85, 0xD9, 0x8A, 0xD9, 0x8A, 0xD8, 0xAD,
+ 0xD9, 0x8A, 0xD9, 0x8A, 0xD8, 0xAC, 0xD9, 0x8A,
+ 0xD9, 0x8A, 0xD9, 0x85, 0xD9, 0x8A, 0xD9, 0x85,
+ 0xD9, 0x85, 0xD9, 0x8A, 0xD9, 0x82, 0xD9, 0x85,
+ 0xD9, 0x8A, 0xD9, 0x86, 0xD8, 0xAD, 0xD9, 0x8A,
+ 0xD9, 0x82, 0xD9, 0x85, 0xD8, 0xAD, 0xD9, 0x84,
+ 0xD8, 0xAD, 0xD9, 0x85, 0xD8, 0xB9, 0xD9, 0x85,
+ 0xD9, 0x8A, 0xD9, 0x83, 0xD9, 0x85, 0xD9, 0x8A,
+ 0xD9, 0x86, 0xD8, 0xAC, 0xD8, 0xAD, 0xD9, 0x85,
+ 0xD8, 0xAE, 0xD9, 0x8A, 0xD9, 0x84, 0xD8, 0xAC,
+ 0xD9, 0x85, 0xD9, 0x83, 0xD9, 0x85, 0xD9, 0x85,
+ 0xD9, 0x84, 0xD8, 0xAC, 0xD9, 0x85, 0xD9, 0x86,
+ 0xD8, 0xAC, 0xD8, 0xAD, 0xD8, 0xAC, 0xD8, 0xAD,
+ 0xD9, 0x8A, 0xD8, 0xAD, 0xD8, 0xAC, 0xD9, 0x8A,
+ 0xD9, 0x85, 0xD8, 0xAC, 0xD9, 0x8A, 0xD9, 0x81,
+ 0xD9, 0x85, 0xD9, 0x8A, 0xD8, 0xA8, 0xD8, 0xAD,
+ 0xD9, 0x8A, 0xD9, 0x83, 0xD9, 0x85, 0xD9, 0x85,
+ 0xD8, 0xB9, 0xD8, 0xAC, 0xD9, 0x85, 0xD8, 0xB5,
+ 0xD9, 0x85, 0xD9, 0x85, 0xD8, 0xB3, 0xD8, 0xAE,
+ 0xD9, 0x8A, 0xD9, 0x86, 0xD8, 0xAC, 0xD9, 0x8A,
+ 0xD8, 0xB5, 0xD9, 0x84, 0xDB, 0x92, 0xD9, 0x82,
+ 0xD9, 0x84, 0xDB, 0x92, 0xD8, 0xA7, 0xD9, 0x84,
+ 0xD9, 0x84, 0xD9, 0x87, 0xD8, 0xA7, 0xD9, 0x83,
+ 0xD8, 0xA8, 0xD8, 0xB1, 0xD9, 0x85, 0xD8, 0xAD,
+ 0xD9, 0x85, 0xD8, 0xAF, 0xD8, 0xB5, 0xD9, 0x84,
+ 0xD8, 0xB9, 0xD9, 0x85, 0xD8, 0xB1, 0xD8, 0xB3,
+ 0xD9, 0x88, 0xD9, 0x84, 0xD8, 0xB9, 0xD9, 0x84,
+ 0xD9, 0x8A, 0xD9, 0x87, 0xD9, 0x88, 0xD8, 0xB3,
+ 0xD9, 0x84, 0xD9, 0x85, 0xD8, 0xB5, 0xD9, 0x84,
+ 0xD9, 0x89, 0xD8, 0xB5, 0xD9, 0x84, 0xD9, 0x89,
+ 0x20, 0xD8, 0xA7, 0xD9, 0x84, 0xD9, 0x84, 0xD9,
+ 0x87, 0x20, 0xD8, 0xB9, 0xD9, 0x84, 0xD9, 0x8A,
+ 0xD9, 0x87, 0x20, 0xD9, 0x88, 0xD8, 0xB3, 0xD9,
+ 0x84, 0xD9, 0x85, 0xD8, 0xAC, 0xD9, 0x84, 0x20,
+ 0xD8, 0xAC, 0xD9, 0x84, 0xD8, 0xA7, 0xD9, 0x84,
+ 0xD9, 0x87, 0xD8, 0xB1, 0xDB, 0x8C, 0xD8, 0xA7,
+ 0xD9, 0x84, 0x2E, 0x2E, 0xE2, 0x80, 0x94, 0xE2,
+ 0x80, 0x93, 0x5F, 0x5F, 0x28, 0x29, 0x7B, 0x7D,
+ 0xE3, 0x80, 0x94, 0xE3, 0x80, 0x95, 0xE3, 0x80,
+ 0x90, 0xE3, 0x80, 0x91, 0xE3, 0x80, 0x8A, 0xE3,
+ 0x80, 0x8B, 0xE3, 0x80, 0x88, 0xE3, 0x80, 0x89,
+ 0xE3, 0x80, 0x8C, 0xE3, 0x80, 0x8D, 0xE3, 0x80,
+ 0x8E, 0xE3, 0x80, 0x8F, 0x20, 0xCC, 0x85, 0x20,
+ 0xCC, 0x85, 0x20, 0xCC, 0x85, 0x20, 0xCC, 0x85,
+ 0x5F, 0x5F, 0x5F, 0x2C, 0xE3, 0x80, 0x81, 0x2E,
+ 0x3B, 0x3A, 0x3F, 0x21, 0xE2, 0x80, 0x94, 0x28,
+ 0x29, 0x7B, 0x7D, 0xE3, 0x80, 0x94, 0xE3, 0x80,
+ 0x95, 0x23, 0x26, 0x2A, 0x2B, 0x2D, 0x3C, 0x3E,
+ 0x3D, 0x5C, 0x24, 0x25, 0x40, 0x20, 0xD9, 0x8B,
+ 0xD9, 0x80, 0xD9, 0x8B, 0x20, 0xD9, 0x8C, 0x20,
+ 0xD9, 0x8D, 0x20, 0xD9, 0x8E, 0xD9, 0x80, 0xD9,
+ 0x8E, 0x20, 0xD9, 0x8F, 0xD9, 0x80, 0xD9, 0x8F,
+ 0x20, 0xD9, 0x90, 0xD9, 0x80, 0xD9, 0x90, 0x20,
+ 0xD9, 0x91, 0xD9, 0x80, 0xD9, 0x91, 0x20, 0xD9,
+ 0x92, 0xD9, 0x80, 0xD9, 0x92, 0xD8, 0xA1, 0xD8,
+ 0xA7, 0xD9, 0x93, 0xD8, 0xA7, 0xD9, 0x93, 0xD8,
+ 0xA7, 0xD9, 0x94, 0xD8, 0xA7, 0xD9, 0x94, 0xD9,
+ 0x88, 0xD9, 0x94, 0xD9, 0x88, 0xD9, 0x94, 0xD8,
+ 0xA7, 0xD9, 0x95, 0xD8, 0xA7, 0xD9, 0x95, 0xD9,
+ 0x8A, 0xD9, 0x94, 0xD9, 0x8A, 0xD9, 0x94, 0xD9,
+ 0x8A, 0xD9, 0x94, 0xD9, 0x8A, 0xD9, 0x94, 0xD8,
+ 0xA7, 0xD8, 0xA7, 0xD8, 0xA8, 0xD8, 0xA8, 0xD8,
+ 0xA8, 0xD8, 0xA8, 0xD8, 0xA9, 0xD8, 0xA9, 0xD8,
+ 0xAA, 0xD8, 0xAA, 0xD8, 0xAA, 0xD8, 0xAA, 0xD8,
+ 0xAB, 0xD8, 0xAB, 0xD8, 0xAB, 0xD8, 0xAB, 0xD8,
+ 0xAC, 0xD8, 0xAC, 0xD8, 0xAC, 0xD8, 0xAC, 0xD8,
+ 0xAD, 0xD8, 0xAD, 0xD8, 0xAD, 0xD8, 0xAD, 0xD8,
+ 0xAE, 0xD8, 0xAE, 0xD8, 0xAE, 0xD8, 0xAE, 0xD8,
+ 0xAF, 0xD8, 0xAF, 0xD8, 0xB0, 0xD8, 0xB0, 0xD8,
+ 0xB1, 0xD8, 0xB1, 0xD8, 0xB2, 0xD8, 0xB2, 0xD8,
+ 0xB3, 0xD8, 0xB3, 0xD8, 0xB3, 0xD8, 0xB3, 0xD8,
+ 0xB4, 0xD8, 0xB4, 0xD8, 0xB4, 0xD8, 0xB4, 0xD8,
+ 0xB5, 0xD8, 0xB5, 0xD8, 0xB5, 0xD8, 0xB5, 0xD8,
+ 0xB6, 0xD8, 0xB6, 0xD8, 0xB6, 0xD8, 0xB6, 0xD8,
+ 0xB7, 0xD8, 0xB7, 0xD8, 0xB7, 0xD8, 0xB7, 0xD8,
+ 0xB8, 0xD8, 0xB8, 0xD8, 0xB8, 0xD8, 0xB8, 0xD8,
+ 0xB9, 0xD8, 0xB9, 0xD8, 0xB9, 0xD8, 0xB9, 0xD8,
+ 0xBA, 0xD8, 0xBA, 0xD8, 0xBA, 0xD8, 0xBA, 0xD9,
+ 0x81, 0xD9, 0x81, 0xD9, 0x81, 0xD9, 0x81, 0xD9,
+ 0x82, 0xD9, 0x82, 0xD9, 0x82, 0xD9, 0x82, 0xD9,
+ 0x83, 0xD9, 0x83, 0xD9, 0x83, 0xD9, 0x83, 0xD9,
+ 0x84, 0xD9, 0x84, 0xD9, 0x84, 0xD9, 0x84, 0xD9,
+ 0x85, 0xD9, 0x85, 0xD9, 0x85, 0xD9, 0x85, 0xD9,
+ 0x86, 0xD9, 0x86, 0xD9, 0x86, 0xD9, 0x86, 0xD9,
+ 0x87, 0xD9, 0x87, 0xD9, 0x87, 0xD9, 0x87, 0xD9,
+ 0x88, 0xD9, 0x88, 0xD9, 0x89, 0xD9, 0x89, 0xD9,
+ 0x8A, 0xD9, 0x8A, 0xD9, 0x8A, 0xD9, 0x8A, 0xD9,
+ 0x84, 0xD8, 0xA7, 0xD9, 0x93, 0xD9, 0x84, 0xD8,
+ 0xA7, 0xD9, 0x93, 0xD9, 0x84, 0xD8, 0xA7, 0xD9,
+ 0x94, 0xD9, 0x84, 0xD8, 0xA7, 0xD9, 0x94, 0xD9,
+ 0x84, 0xD8, 0xA7, 0xD9, 0x95, 0xD9, 0x84, 0xD8,
+ 0xA7, 0xD9, 0x95, 0xD9, 0x84, 0xD8, 0xA7, 0xD9,
+ 0x84, 0xD8, 0xA7, 0x21, 0x22, 0x23, 0x24, 0x25,
+ 0x26, 0x27, 0x28, 0x29, 0x2A, 0x2B, 0x2C, 0x2D,
+ 0x2E, 0x2F, 0x30, 0x31, 0x32, 0x33, 0x34, 0x35,
+ 0x36, 0x37, 0x38, 0x39, 0x3A, 0x3B, 0x3C, 0x3D,
+ 0x3E, 0x3F, 0x40, 0x41, 0x42, 0x43, 0x44, 0x45,
+ 0x46, 0x47, 0x48, 0x49, 0x4A, 0x4B, 0x4C, 0x4D,
+ 0x4E, 0x4F, 0x50, 0x51, 0x52, 0x53, 0x54, 0x55,
+ 0x56, 0x57, 0x58, 0x59, 0x5A, 0x5B, 0x5C, 0x5D,
+ 0x5E, 0x5F, 0x60, 0x61, 0x62, 0x63, 0x64, 0x65,
+ 0x66, 0x67, 0x68, 0x69, 0x6A, 0x6B, 0x6C, 0x6D,
+ 0x6E, 0x6F, 0x70, 0x71, 0x72, 0x73, 0x74, 0x75,
+ 0x76, 0x77, 0x78, 0x79, 0x7A, 0x7B, 0x7C, 0x7D,
+ 0x7E, 0xE2, 0xA6, 0x85, 0xE2, 0xA6, 0x86, 0xE3,
+ 0x80, 0x82, 0xE3, 0x80, 0x8C, 0xE3, 0x80, 0x8D,
+ 0xE3, 0x80, 0x81, 0xE3, 0x83, 0xBB, 0xE3, 0x83,
+ 0xB2, 0xE3, 0x82, 0xA1, 0xE3, 0x82, 0xA3, 0xE3,
+ 0x82, 0xA5, 0xE3, 0x82, 0xA7, 0xE3, 0x82, 0xA9,
+ 0xE3, 0x83, 0xA3, 0xE3, 0x83, 0xA5, 0xE3, 0x83,
+ 0xA7, 0xE3, 0x83, 0x83, 0xE3, 0x83, 0xBC, 0xE3,
+ 0x82, 0xA2, 0xE3, 0x82, 0xA4, 0xE3, 0x82, 0xA6,
+ 0xE3, 0x82, 0xA8, 0xE3, 0x82, 0xAA, 0xE3, 0x82,
+ 0xAB, 0xE3, 0x82, 0xAD, 0xE3, 0x82, 0xAF, 0xE3,
+ 0x82, 0xB1, 0xE3, 0x82, 0xB3, 0xE3, 0x82, 0xB5,
+ 0xE3, 0x82, 0xB7, 0xE3, 0x82, 0xB9, 0xE3, 0x82,
+ 0xBB, 0xE3, 0x82, 0xBD, 0xE3, 0x82, 0xBF, 0xE3,
+ 0x83, 0x81, 0xE3, 0x83, 0x84, 0xE3, 0x83, 0x86,
+ 0xE3, 0x83, 0x88, 0xE3, 0x83, 0x8A, 0xE3, 0x83,
+ 0x8B, 0xE3, 0x83, 0x8C, 0xE3, 0x83, 0x8D, 0xE3,
+ 0x83, 0x8E, 0xE3, 0x83, 0x8F, 0xE3, 0x83, 0x92,
+ 0xE3, 0x83, 0x95, 0xE3, 0x83, 0x98, 0xE3, 0x83,
+ 0x9B, 0xE3, 0x83, 0x9E, 0xE3, 0x83, 0x9F, 0xE3,
+ 0x83, 0xA0, 0xE3, 0x83, 0xA1, 0xE3, 0x83, 0xA2,
+ 0xE3, 0x83, 0xA4, 0xE3, 0x83, 0xA6, 0xE3, 0x83,
+ 0xA8, 0xE3, 0x83, 0xA9, 0xE3, 0x83, 0xAA, 0xE3,
+ 0x83, 0xAB, 0xE3, 0x83, 0xAC, 0xE3, 0x83, 0xAD,
+ 0xE3, 0x83, 0xAF, 0xE3, 0x83, 0xB3, 0xE3, 0x82,
+ 0x99, 0xE3, 0x82, 0x9A, 0xE1, 0x85, 0xA0, 0xE1,
+ 0x84, 0x80, 0xE1, 0x84, 0x81, 0xE1, 0x86, 0xAA,
+ 0xE1, 0x84, 0x82, 0xE1, 0x86, 0xAC, 0xE1, 0x86,
+ 0xAD, 0xE1, 0x84, 0x83, 0xE1, 0x84, 0x84, 0xE1,
+ 0x84, 0x85, 0xE1, 0x86, 0xB0, 0xE1, 0x86, 0xB1,
+ 0xE1, 0x86, 0xB2, 0xE1, 0x86, 0xB3, 0xE1, 0x86,
+ 0xB4, 0xE1, 0x86, 0xB5, 0xE1, 0x84, 0x9A, 0xE1,
+ 0x84, 0x86, 0xE1, 0x84, 0x87, 0xE1, 0x84, 0x88,
+ 0xE1, 0x84, 0xA1, 0xE1, 0x84, 0x89, 0xE1, 0x84,
+ 0x8A, 0xE1, 0x84, 0x8B, 0xE1, 0x84, 0x8C, 0xE1,
+ 0x84, 0x8D, 0xE1, 0x84, 0x8E, 0xE1, 0x84, 0x8F,
+ 0xE1, 0x84, 0x90, 0xE1, 0x84, 0x91, 0xE1, 0x84,
+ 0x92, 0xE1, 0x85, 0xA1, 0xE1, 0x85, 0xA2, 0xE1,
+ 0x85, 0xA3, 0xE1, 0x85, 0xA4, 0xE1, 0x85, 0xA5,
+ 0xE1, 0x85, 0xA6, 0xE1, 0x85, 0xA7, 0xE1, 0x85,
+ 0xA8, 0xE1, 0x85, 0xA9, 0xE1, 0x85, 0xAA, 0xE1,
+ 0x85, 0xAB, 0xE1, 0x85, 0xAC, 0xE1, 0x85, 0xAD,
+ 0xE1, 0x85, 0xAE, 0xE1, 0x85, 0xAF, 0xE1, 0x85,
+ 0xB0, 0xE1, 0x85, 0xB1, 0xE1, 0x85, 0xB2, 0xE1,
+ 0x85, 0xB3, 0xE1, 0x85, 0xB4, 0xE1, 0x85, 0xB5,
+ 0xC2, 0xA2, 0xC2, 0xA3, 0xC2, 0xAC, 0x20, 0xCC,
+ 0x84, 0xC2, 0xA6, 0xC2, 0xA5, 0xE2, 0x82, 0xA9,
+ 0xE2, 0x94, 0x82, 0xE2, 0x86, 0x90, 0xE2, 0x86,
+ 0x91, 0xE2, 0x86, 0x92, 0xE2, 0x86, 0x93, 0xE2,
+ 0x96, 0xA0, 0xE2, 0x97, 0x8B, 0xF6, 0xF0, 0x9D,
+ 0x85, 0x97, 0xF0, 0x9D, 0x85, 0xA5, 0xF6, 0xF0,
+ 0x9D, 0x85, 0x98, 0xF0, 0x9D, 0x85, 0xA5, 0xF6,
+ 0xF0, 0x9D, 0x85, 0x98, 0xF0, 0x9D, 0x85, 0xA5,
+ 0xF0, 0x9D, 0x85, 0xAE, 0xF6, 0xF0, 0x9D, 0x85,
+ 0x98, 0xF0, 0x9D, 0x85, 0xA5, 0xF0, 0x9D, 0x85,
+ 0xAF, 0xF6, 0xF0, 0x9D, 0x85, 0x98, 0xF0, 0x9D,
+ 0x85, 0xA5, 0xF0, 0x9D, 0x85, 0xB0, 0xF6, 0xF0,
+ 0x9D, 0x85, 0x98, 0xF0, 0x9D, 0x85, 0xA5, 0xF0,
+ 0x9D, 0x85, 0xB1, 0xF6, 0xF0, 0x9D, 0x85, 0x98,
+ 0xF0, 0x9D, 0x85, 0xA5, 0xF0, 0x9D, 0x85, 0xB2,
+ 0xF6, 0xF0, 0x9D, 0x86, 0xB9, 0xF0, 0x9D, 0x85,
+ 0xA5, 0xF6, 0xF0, 0x9D, 0x86, 0xBA, 0xF0, 0x9D,
+ 0x85, 0xA5, 0xF6, 0xF0, 0x9D, 0x86, 0xB9, 0xF0,
+ 0x9D, 0x85, 0xA5, 0xF0, 0x9D, 0x85, 0xAE, 0xF6,
+ 0xF0, 0x9D, 0x86, 0xBA, 0xF0, 0x9D, 0x85, 0xA5,
+ 0xF0, 0x9D, 0x85, 0xAE, 0xF6, 0xF0, 0x9D, 0x86,
+ 0xB9, 0xF0, 0x9D, 0x85, 0xA5, 0xF0, 0x9D, 0x85,
+ 0xAF, 0xF6, 0xF0, 0x9D, 0x86, 0xBA, 0xF0, 0x9D,
+ 0x85, 0xA5, 0xF0, 0x9D, 0x85, 0xAF, 0x41, 0x42,
+ 0x43, 0x44, 0x45, 0x46, 0x47, 0x48, 0x49, 0x4A,
+ 0x4B, 0x4C, 0x4D, 0x4E, 0x4F, 0x50, 0x51, 0x52,
+ 0x53, 0x54, 0x55, 0x56, 0x57, 0x58, 0x59, 0x5A,
+ 0x61, 0x62, 0x63, 0x64, 0x65, 0x66, 0x67, 0x68,
+ 0x69, 0x6A, 0x6B, 0x6C, 0x6D, 0x6E, 0x6F, 0x70,
+ 0x71, 0x72, 0x73, 0x74, 0x75, 0x76, 0x77, 0x78,
+ 0x79, 0x7A, 0x41, 0x42, 0x43, 0x44, 0x45, 0x46,
+ 0x47, 0x48, 0x49, 0x4A, 0x4B, 0x4C, 0x4D, 0x4E,
+ 0x4F, 0x50, 0x51, 0x52, 0x53, 0x54, 0x55, 0x56,
+ 0x57, 0x58, 0x59, 0x5A, 0x61, 0x62, 0x63, 0x64,
+ 0x65, 0x66, 0x67, 0x69, 0x6A, 0x6B, 0x6C, 0x6D,
+ 0x6E, 0x6F, 0x70, 0x71, 0x72, 0x73, 0x74, 0x75,
+ 0x76, 0x77, 0x78, 0x79, 0x7A, 0x41, 0x42, 0x43,
+ 0x44, 0x45, 0x46, 0x47, 0x48, 0x49, 0x4A, 0x4B,
+ 0x4C, 0x4D, 0x4E, 0x4F, 0x50, 0x51, 0x52, 0x53,
+ 0x54, 0x55, 0x56, 0x57, 0x58, 0x59, 0x5A, 0x61,
+ 0x62, 0x63, 0x64, 0x65, 0x66, 0x67, 0x68, 0x69,
+ 0x6A, 0x6B, 0x6C, 0x6D, 0x6E, 0x6F, 0x70, 0x71,
+ 0x72, 0x73, 0x74, 0x75, 0x76, 0x77, 0x78, 0x79,
+ 0x7A, 0x41, 0x43, 0x44, 0x47, 0x4A, 0x4B, 0x4E,
+ 0x4F, 0x50, 0x51, 0x53, 0x54, 0x55, 0x56, 0x57,
+ 0x58, 0x59, 0x5A, 0x61, 0x62, 0x63, 0x64, 0x66,
+ 0x68, 0x69, 0x6A, 0x6B, 0x6D, 0x6E, 0x70, 0x71,
+ 0x72, 0x73, 0x74, 0x75, 0x76, 0x77, 0x78, 0x79,
+ 0x7A, 0x41, 0x42, 0x43, 0x44, 0x45, 0x46, 0x47,
+ 0x48, 0x49, 0x4A, 0x4B, 0x4C, 0x4D, 0x4E, 0x4F,
+ 0x50, 0x51, 0x52, 0x53, 0x54, 0x55, 0x56, 0x57,
+ 0x58, 0x59, 0x5A, 0x61, 0x62, 0x63, 0x64, 0x65,
+ 0x66, 0x67, 0x68, 0x69, 0x6A, 0x6B, 0x6C, 0x6D,
+ 0x6E, 0x6F, 0x70, 0x71, 0x72, 0x73, 0x74, 0x75,
+ 0x76, 0x77, 0x78, 0x79, 0x7A, 0x41, 0x42, 0x44,
+ 0x45, 0x46, 0x47, 0x4A, 0x4B, 0x4C, 0x4D, 0x4E,
+ 0x4F, 0x50, 0x51, 0x53, 0x54, 0x55, 0x56, 0x57,
+ 0x58, 0x59, 0x61, 0x62, 0x63, 0x64, 0x65, 0x66,
+ 0x67, 0x68, 0x69, 0x6A, 0x6B, 0x6C, 0x6D, 0x6E,
+ 0x6F, 0x70, 0x71, 0x72, 0x73, 0x74, 0x75, 0x76,
+ 0x77, 0x78, 0x79, 0x7A, 0x41, 0x42, 0x44, 0x45,
+ 0x46, 0x47, 0x49, 0x4A, 0x4B, 0x4C, 0x4D, 0x4F,
+ 0x53, 0x54, 0x55, 0x56, 0x57, 0x58, 0x59, 0x61,
+ 0x62, 0x63, 0x64, 0x65, 0x66, 0x67, 0x68, 0x69,
+ 0x6A, 0x6B, 0x6C, 0x6D, 0x6E, 0x6F, 0x70, 0x71,
+ 0x72, 0x73, 0x74, 0x75, 0x76, 0x77, 0x78, 0x79,
+ 0x7A, 0x41, 0x42, 0x43, 0x44, 0x45, 0x46, 0x47,
+ 0x48, 0x49, 0x4A, 0x4B, 0x4C, 0x4D, 0x4E, 0x4F,
+ 0x50, 0x51, 0x52, 0x53, 0x54, 0x55, 0x56, 0x57,
+ 0x58, 0x59, 0x5A, 0x61, 0x62, 0x63, 0x64, 0x65,
+ 0x66, 0x67, 0x68, 0x69, 0x6A, 0x6B, 0x6C, 0x6D,
+ 0x6E, 0x6F, 0x70, 0x71, 0x72, 0x73, 0x74, 0x75,
+ 0x76, 0x77, 0x78, 0x79, 0x7A, 0x41, 0x42, 0x43,
+ 0x44, 0x45, 0x46, 0x47, 0x48, 0x49, 0x4A, 0x4B,
+ 0x4C, 0x4D, 0x4E, 0x4F, 0x50, 0x51, 0x52, 0x53,
+ 0x54, 0x55, 0x56, 0x57, 0x58, 0x59, 0x5A, 0x61,
+ 0x62, 0x63, 0x64, 0x65, 0x66, 0x67, 0x68, 0x69,
+ 0x6A, 0x6B, 0x6C, 0x6D, 0x6E, 0x6F, 0x70, 0x71,
+ 0x72, 0x73, 0x74, 0x75, 0x76, 0x77, 0x78, 0x79,
+ 0x7A, 0x41, 0x42, 0x43, 0x44, 0x45, 0x46, 0x47,
+ 0x48, 0x49, 0x4A, 0x4B, 0x4C, 0x4D, 0x4E, 0x4F,
+ 0x50, 0x51, 0x52, 0x53, 0x54, 0x55, 0x56, 0x57,
+ 0x58, 0x59, 0x5A, 0x61, 0x62, 0x63, 0x64, 0x65,
+ 0x66, 0x67, 0x68, 0x69, 0x6A, 0x6B, 0x6C, 0x6D,
+ 0x6E, 0x6F, 0x70, 0x71, 0x72, 0x73, 0x74, 0x75,
+ 0x76, 0x77, 0x78, 0x79, 0x7A, 0x41, 0x42, 0x43,
+ 0x44, 0x45, 0x46, 0x47, 0x48, 0x49, 0x4A, 0x4B,
+ 0x4C, 0x4D, 0x4E, 0x4F, 0x50, 0x51, 0x52, 0x53,
+ 0x54, 0x55, 0x56, 0x57, 0x58, 0x59, 0x5A, 0x61,
+ 0x62, 0x63, 0x64, 0x65, 0x66, 0x67, 0x68, 0x69,
+ 0x6A, 0x6B, 0x6C, 0x6D, 0x6E, 0x6F, 0x70, 0x71,
+ 0x72, 0x73, 0x74, 0x75, 0x76, 0x77, 0x78, 0x79,
+ 0x7A, 0x41, 0x42, 0x43, 0x44, 0x45, 0x46, 0x47,
+ 0x48, 0x49, 0x4A, 0x4B, 0x4C, 0x4D, 0x4E, 0x4F,
+ 0x50, 0x51, 0x52, 0x53, 0x54, 0x55, 0x56, 0x57,
+ 0x58, 0x59, 0x5A, 0x61, 0x62, 0x63, 0x64, 0x65,
+ 0x66, 0x67, 0x68, 0x69, 0x6A, 0x6B, 0x6C, 0x6D,
+ 0x6E, 0x6F, 0x70, 0x71, 0x72, 0x73, 0x74, 0x75,
+ 0x76, 0x77, 0x78, 0x79, 0x7A, 0x41, 0x42, 0x43,
+ 0x44, 0x45, 0x46, 0x47, 0x48, 0x49, 0x4A, 0x4B,
+ 0x4C, 0x4D, 0x4E, 0x4F, 0x50, 0x51, 0x52, 0x53,
+ 0x54, 0x55, 0x56, 0x57, 0x58, 0x59, 0x5A, 0x61,
+ 0x62, 0x63, 0x64, 0x65, 0x66, 0x67, 0x68, 0x69,
+ 0x6A, 0x6B, 0x6C, 0x6D, 0x6E, 0x6F, 0x70, 0x71,
+ 0x72, 0x73, 0x74, 0x75, 0x76, 0x77, 0x78, 0x79,
+ 0x7A, 0xCE, 0x91, 0xCE, 0x92, 0xCE, 0x93, 0xCE,
+ 0x94, 0xCE, 0x95, 0xCE, 0x96, 0xCE, 0x97, 0xCE,
+ 0x98, 0xCE, 0x99, 0xCE, 0x9A, 0xCE, 0x9B, 0xCE,
+ 0x9C, 0xCE, 0x9D, 0xCE, 0x9E, 0xCE, 0x9F, 0xCE,
+ 0xA0, 0xCE, 0xA1, 0xCE, 0x98, 0xCE, 0xA3, 0xCE,
+ 0xA4, 0xCE, 0xA5, 0xCE, 0xA6, 0xCE, 0xA7, 0xCE,
+ 0xA8, 0xCE, 0xA9, 0xE2, 0x88, 0x87, 0xCE, 0xB1,
+ 0xCE, 0xB2, 0xCE, 0xB3, 0xCE, 0xB4, 0xCE, 0xB5,
+ 0xCE, 0xB6, 0xCE, 0xB7, 0xCE, 0xB8, 0xCE, 0xB9,
+ 0xCE, 0xBA, 0xCE, 0xBB, 0xCE, 0xBC, 0xCE, 0xBD,
+ 0xCE, 0xBE, 0xCE, 0xBF, 0xCF, 0x80, 0xCF, 0x81,
+ 0xCF, 0x82, 0xCF, 0x83, 0xCF, 0x84, 0xCF, 0x85,
+ 0xCF, 0x86, 0xCF, 0x87, 0xCF, 0x88, 0xCF, 0x89,
+ 0xE2, 0x88, 0x82, 0xCE, 0xB5, 0xCE, 0xB8, 0xCE,
+ 0xBA, 0xCF, 0x86, 0xCF, 0x81, 0xCF, 0x80, 0xCE,
+ 0x91, 0xCE, 0x92, 0xCE, 0x93, 0xCE, 0x94, 0xCE,
+ 0x95, 0xCE, 0x96, 0xCE, 0x97, 0xCE, 0x98, 0xCE,
+ 0x99, 0xCE, 0x9A, 0xCE, 0x9B, 0xCE, 0x9C, 0xCE,
+ 0x9D, 0xCE, 0x9E, 0xCE, 0x9F, 0xCE, 0xA0, 0xCE,
+ 0xA1, 0xCE, 0x98, 0xCE, 0xA3, 0xCE, 0xA4, 0xCE,
+ 0xA5, 0xCE, 0xA6, 0xCE, 0xA7, 0xCE, 0xA8, 0xCE,
+ 0xA9, 0xE2, 0x88, 0x87, 0xCE, 0xB1, 0xCE, 0xB2,
+ 0xCE, 0xB3, 0xCE, 0xB4, 0xCE, 0xB5, 0xCE, 0xB6,
+ 0xCE, 0xB7, 0xCE, 0xB8, 0xCE, 0xB9, 0xCE, 0xBA,
+ 0xCE, 0xBB, 0xCE, 0xBC, 0xCE, 0xBD, 0xCE, 0xBE,
+ 0xCE, 0xBF, 0xCF, 0x80, 0xCF, 0x81, 0xCF, 0x82,
+ 0xCF, 0x83, 0xCF, 0x84, 0xCF, 0x85, 0xCF, 0x86,
+ 0xCF, 0x87, 0xCF, 0x88, 0xCF, 0x89, 0xE2, 0x88,
+ 0x82, 0xCE, 0xB5, 0xCE, 0xB8, 0xCE, 0xBA, 0xCF,
+ 0x86, 0xCF, 0x81, 0xCF, 0x80, 0xCE, 0x91, 0xCE,
+ 0x92, 0xCE, 0x93, 0xCE, 0x94, 0xCE, 0x95, 0xCE,
+ 0x96, 0xCE, 0x97, 0xCE, 0x98, 0xCE, 0x99, 0xCE,
+ 0x9A, 0xCE, 0x9B, 0xCE, 0x9C, 0xCE, 0x9D, 0xCE,
+ 0x9E, 0xCE, 0x9F, 0xCE, 0xA0, 0xCE, 0xA1, 0xCE,
+ 0x98, 0xCE, 0xA3, 0xCE, 0xA4, 0xCE, 0xA5, 0xCE,
+ 0xA6, 0xCE, 0xA7, 0xCE, 0xA8, 0xCE, 0xA9, 0xE2,
+ 0x88, 0x87, 0xCE, 0xB1, 0xCE, 0xB2, 0xCE, 0xB3,
+ 0xCE, 0xB4, 0xCE, 0xB5, 0xCE, 0xB6, 0xCE, 0xB7,
+ 0xCE, 0xB8, 0xCE, 0xB9, 0xCE, 0xBA, 0xCE, 0xBB,
+ 0xCE, 0xBC, 0xCE, 0xBD, 0xCE, 0xBE, 0xCE, 0xBF,
+ 0xCF, 0x80, 0xCF, 0x81, 0xCF, 0x82, 0xCF, 0x83,
+ 0xCF, 0x84, 0xCF, 0x85, 0xCF, 0x86, 0xCF, 0x87,
+ 0xCF, 0x88, 0xCF, 0x89, 0xE2, 0x88, 0x82, 0xCE,
+ 0xB5, 0xCE, 0xB8, 0xCE, 0xBA, 0xCF, 0x86, 0xCF,
+ 0x81, 0xCF, 0x80, 0xCE, 0x91, 0xCE, 0x92, 0xCE,
+ 0x93, 0xCE, 0x94, 0xCE, 0x95, 0xCE, 0x96, 0xCE,
+ 0x97, 0xCE, 0x98, 0xCE, 0x99, 0xCE, 0x9A, 0xCE,
+ 0x9B, 0xCE, 0x9C, 0xCE, 0x9D, 0xCE, 0x9E, 0xCE,
+ 0x9F, 0xCE, 0xA0, 0xCE, 0xA1, 0xCE, 0x98, 0xCE,
+ 0xA3, 0xCE, 0xA4, 0xCE, 0xA5, 0xCE, 0xA6, 0xCE,
+ 0xA7, 0xCE, 0xA8, 0xCE, 0xA9, 0xE2, 0x88, 0x87,
+ 0xCE, 0xB1, 0xCE, 0xB2, 0xCE, 0xB3, 0xCE, 0xB4,
+ 0xCE, 0xB5, 0xCE, 0xB6, 0xCE, 0xB7, 0xCE, 0xB8,
+ 0xCE, 0xB9, 0xCE, 0xBA, 0xCE, 0xBB, 0xCE, 0xBC,
+ 0xCE, 0xBD, 0xCE, 0xBE, 0xCE, 0xBF, 0xCF, 0x80,
+ 0xCF, 0x81, 0xCF, 0x82, 0xCF, 0x83, 0xCF, 0x84,
+ 0xCF, 0x85, 0xCF, 0x86, 0xCF, 0x87, 0xCF, 0x88,
+ 0xCF, 0x89, 0xE2, 0x88, 0x82, 0xCE, 0xB5, 0xCE,
+ 0xB8, 0xCE, 0xBA, 0xCF, 0x86, 0xCF, 0x81, 0xCF,
+ 0x80, 0xCE, 0x91, 0xCE, 0x92, 0xCE, 0x93, 0xCE,
+ 0x94, 0xCE, 0x95, 0xCE, 0x96, 0xCE, 0x97, 0xCE,
+ 0x98, 0xCE, 0x99, 0xCE, 0x9A, 0xCE, 0x9B, 0xCE,
+ 0x9C, 0xCE, 0x9D, 0xCE, 0x9E, 0xCE, 0x9F, 0xCE,
+ 0xA0, 0xCE, 0xA1, 0xCE, 0x98, 0xCE, 0xA3, 0xCE,
+ 0xA4, 0xCE, 0xA5, 0xCE, 0xA6, 0xCE, 0xA7, 0xCE,
+ 0xA8, 0xCE, 0xA9, 0xE2, 0x88, 0x87, 0xCE, 0xB1,
+ 0xCE, 0xB2, 0xCE, 0xB3, 0xCE, 0xB4, 0xCE, 0xB5,
+ 0xCE, 0xB6, 0xCE, 0xB7, 0xCE, 0xB8, 0xCE, 0xB9,
+ 0xCE, 0xBA, 0xCE, 0xBB, 0xCE, 0xBC, 0xCE, 0xBD,
+ 0xCE, 0xBE, 0xCE, 0xBF, 0xCF, 0x80, 0xCF, 0x81,
+ 0xCF, 0x82, 0xCF, 0x83, 0xCF, 0x84, 0xCF, 0x85,
+ 0xCF, 0x86, 0xCF, 0x87, 0xCF, 0x88, 0xCF, 0x89,
+ 0xE2, 0x88, 0x82, 0xCE, 0xB5, 0xCE, 0xB8, 0xCE,
+ 0xBA, 0xCF, 0x86, 0xCF, 0x81, 0xCF, 0x80, 0x30,
+ 0x31, 0x32, 0x33, 0x34, 0x35, 0x36, 0x37, 0x38,
+ 0x39, 0x30, 0x31, 0x32, 0x33, 0x34, 0x35, 0x36,
+ 0x37, 0x38, 0x39, 0x30, 0x31, 0x32, 0x33, 0x34,
+ 0x35, 0x36, 0x37, 0x38, 0x39, 0x30, 0x31, 0x32,
+ 0x33, 0x34, 0x35, 0x36, 0x37, 0x38, 0x39, 0x30,
+ 0x31, 0x32, 0x33, 0x34, 0x35, 0x36, 0x37, 0x38,
+ 0x39, 0xF6, 0xE4, 0xB8, 0xBD, 0xF6, 0xE4, 0xB8,
+ 0xB8, 0xF6, 0xE4, 0xB9, 0x81, 0xF6, 0xF0, 0xA0,
+ 0x84, 0xA2, 0xF6, 0xE4, 0xBD, 0xA0, 0xF6, 0xE4,
+ 0xBE, 0xAE, 0xF6, 0xE4, 0xBE, 0xBB, 0xF6, 0xE5,
+ 0x80, 0x82, 0xF6, 0xE5, 0x81, 0xBA, 0xF6, 0xE5,
+ 0x82, 0x99, 0xF6, 0xE5, 0x83, 0xA7, 0xF6, 0xE5,
+ 0x83, 0x8F, 0xF6, 0xE3, 0x92, 0x9E, 0xF6, 0xF0,
+ 0xA0, 0x98, 0xBA, 0xF6, 0xE5, 0x85, 0x8D, 0xF6,
+ 0xE5, 0x85, 0x94, 0xF6, 0xE5, 0x85, 0xA4, 0xF6,
+ 0xE5, 0x85, 0xB7, 0xF6, 0xF0, 0xA0, 0x94, 0x9C,
+ 0xF6, 0xE3, 0x92, 0xB9, 0xF6, 0xE5, 0x85, 0xA7,
+ 0xF6, 0xE5, 0x86, 0x8D, 0xF6, 0xF0, 0xA0, 0x95,
+ 0x8B, 0xF6, 0xE5, 0x86, 0x97, 0xF6, 0xE5, 0x86,
+ 0xA4, 0xF6, 0xE4, 0xBB, 0x8C, 0xF6, 0xE5, 0x86,
+ 0xAC, 0xF6, 0xE5, 0x86, 0xB5, 0xF6, 0xF0, 0xA9,
+ 0x87, 0x9F, 0xF6, 0xE5, 0x87, 0xB5, 0xF6, 0xE5,
+ 0x88, 0x83, 0xF6, 0xE3, 0x93, 0x9F, 0xF6, 0xE5,
+ 0x88, 0xBB, 0xF6, 0xE5, 0x89, 0x86, 0xF6, 0xE5,
+ 0x89, 0xB2, 0xF6, 0xE5, 0x89, 0xB7, 0xF6, 0xE3,
+ 0x94, 0x95, 0xF6, 0xE5, 0x8B, 0x87, 0xF6, 0xE5,
+ 0x8B, 0x89, 0xF6, 0xE5, 0x8B, 0xA4, 0xF6, 0xE5,
+ 0x8B, 0xBA, 0xF6, 0xE5, 0x8C, 0x85, 0xF6, 0xE5,
+ 0x8C, 0x86, 0xF6, 0xE5, 0x8C, 0x97, 0xF6, 0xE5,
+ 0x8D, 0x89, 0xF6, 0xE5, 0x8D, 0x91, 0xF6, 0xE5,
+ 0x8D, 0x9A, 0xF6, 0xE5, 0x8D, 0xB3, 0xF6, 0xE5,
+ 0x8D, 0xBD, 0xF6, 0xE5, 0x8D, 0xBF, 0xF6, 0xE5,
+ 0x8D, 0xBF, 0xF6, 0xE5, 0x8D, 0xBF, 0xF6, 0xF0,
+ 0xA0, 0xA8, 0xAC, 0xF6, 0xE7, 0x81, 0xB0, 0xF6,
+ 0xE5, 0x8F, 0x8A, 0xF6, 0xE5, 0x8F, 0x9F, 0xF6,
+ 0xF0, 0xA0, 0xAD, 0xA3, 0xF6, 0xE5, 0x8F, 0xAB,
+ 0xF6, 0xE5, 0x8F, 0xB1, 0xF6, 0xE5, 0x90, 0x86,
+ 0xF6, 0xE5, 0x92, 0x9E, 0xF6, 0xE5, 0x90, 0xB8,
+ 0xF6, 0xE5, 0x91, 0x88, 0xF6, 0xE5, 0x91, 0xA8,
+ 0xF6, 0xE5, 0x92, 0xA2, 0xF6, 0xE5, 0x93, 0xB6,
+ 0xF6, 0xE5, 0x94, 0x90, 0xF6, 0xE5, 0x95, 0x93,
+ 0xF6, 0xE5, 0x95, 0xA3, 0xF6, 0xE5, 0x96, 0x84,
+ 0xF6, 0xE5, 0x96, 0x84, 0xF6, 0xE5, 0x96, 0x99,
+ 0xF6, 0xE5, 0x96, 0xAB, 0xF6, 0xE5, 0x96, 0xB3,
+ 0xF6, 0xE5, 0x97, 0x82, 0xF6, 0xE5, 0x9C, 0x96,
+ 0xF6, 0xE5, 0x98, 0x86, 0xF6, 0xE5, 0x9C, 0x97,
+ 0xF6, 0xE5, 0x99, 0x91, 0xF6, 0xE5, 0x99, 0xB4,
+ 0xF6, 0xE5, 0x88, 0x87, 0xF6, 0xE5, 0xA3, 0xAE,
+ 0xF6, 0xE5, 0x9F, 0x8E, 0xF6, 0xE5, 0x9F, 0xB4,
+ 0xF6, 0xE5, 0xA0, 0x8D, 0xF6, 0xE5, 0x9E, 0x8B,
+ 0xF6, 0xE5, 0xA0, 0xB2, 0xF6, 0xE5, 0xA0, 0xB1,
+ 0xF6, 0xE5, 0xA2, 0xAC, 0xF6, 0xF0, 0xA1, 0x93,
+ 0xA4, 0xF6, 0xE5, 0xA3, 0xB2, 0xF6, 0xE5, 0xA3,
+ 0xB7, 0xF6, 0xE5, 0xA4, 0x86, 0xF6, 0xE5, 0xA4,
+ 0x9A, 0xF6, 0xE5, 0xA4, 0xA2, 0xF6, 0xE5, 0xA5,
+ 0xA2, 0xF6, 0xF0, 0xA1, 0x9A, 0xA8, 0xF6, 0xF0,
+ 0xA1, 0x9B, 0xAA, 0xF6, 0xE5, 0xA7, 0xAC, 0xF6,
+ 0xE5, 0xA8, 0x9B, 0xF6, 0xE5, 0xA8, 0xA7, 0xF6,
+ 0xE5, 0xA7, 0x98, 0xF6, 0xE5, 0xA9, 0xA6, 0xF6,
+ 0xE3, 0x9B, 0xAE, 0xF6, 0xE3, 0x9B, 0xBC, 0xF6,
+ 0xE5, 0xAC, 0x88, 0xF6, 0xE5, 0xAC, 0xBE, 0xF6,
+ 0xE5, 0xAC, 0xBE, 0xF6, 0xF0, 0xA1, 0xA7, 0x88,
+ 0xF6, 0xE5, 0xAF, 0x83, 0xF6, 0xE5, 0xAF, 0x98,
+ 0xF6, 0xE5, 0xAF, 0xA7, 0xF6, 0xE5, 0xAF, 0xB3,
+ 0xF6, 0xF0, 0xA1, 0xAC, 0x98, 0xF6, 0xE5, 0xAF,
+ 0xBF, 0xF6, 0xE5, 0xB0, 0x86, 0xF6, 0xE5, 0xBD,
+ 0x93, 0xF6, 0xE5, 0xB0, 0xA2, 0xF6, 0xE3, 0x9E,
+ 0x81, 0xF6, 0xE5, 0xB1, 0xA0, 0xF6, 0xE5, 0xB1,
+ 0xAE, 0xF6, 0xE5, 0xB3, 0x80, 0xF6, 0xE5, 0xB2,
+ 0x8D, 0xF6, 0xF0, 0xA1, 0xB7, 0xA4, 0xF6, 0xE5,
+ 0xB5, 0x83, 0xF6, 0xF0, 0xA1, 0xB7, 0xA6, 0xF6,
+ 0xE5, 0xB5, 0xAE, 0xF6, 0xE5, 0xB5, 0xAB, 0xF6,
+ 0xE5, 0xB5, 0xBC, 0xF6, 0xE5, 0xB7, 0xA1, 0xF6,
+ 0xE5, 0xB7, 0xA2, 0xF6, 0xE3, 0xA0, 0xAF, 0xF6,
+ 0xE5, 0xB7, 0xBD, 0xF6, 0xE5, 0xB8, 0xA8, 0xF6,
+ 0xE5, 0xB8, 0xBD, 0xF6, 0xE5, 0xB9, 0xA9, 0xF6,
+ 0xE3, 0xA1, 0xA2, 0xF6, 0xF0, 0xA2, 0x86, 0x83,
+ 0xF6, 0xE3, 0xA1, 0xBC, 0xF6, 0xE5, 0xBA, 0xB0,
+ 0xF6, 0xE5, 0xBA, 0xB3, 0xF6, 0xE5, 0xBA, 0xB6,
+ 0xF6, 0xE5, 0xBB, 0x8A, 0xF6, 0xF0, 0xAA, 0x8E,
+ 0x92, 0xF6, 0xE5, 0xBB, 0xBE, 0xF6, 0xF0, 0xA2,
+ 0x8C, 0xB1, 0xF6, 0xF0, 0xA2, 0x8C, 0xB1, 0xF6,
+ 0xE8, 0x88, 0x81, 0xF6, 0xE5, 0xBC, 0xA2, 0xF6,
+ 0xE5, 0xBC, 0xA2, 0xF6, 0xE3, 0xA3, 0x87, 0xF6,
+ 0xF0, 0xA3, 0x8A, 0xB8, 0xF6, 0xF0, 0xA6, 0x87,
+ 0x9A, 0xF6, 0xE5, 0xBD, 0xA2, 0xF6, 0xE5, 0xBD,
+ 0xAB, 0xF6, 0xE3, 0xA3, 0xA3, 0xF6, 0xE5, 0xBE,
+ 0x9A, 0xF6, 0xE5, 0xBF, 0x8D, 0xF6, 0xE5, 0xBF,
+ 0x97, 0xF6, 0xE5, 0xBF, 0xB9, 0xF6, 0xE6, 0x82,
+ 0x81, 0xF6, 0xE3, 0xA4, 0xBA, 0xF6, 0xE3, 0xA4,
+ 0x9C, 0xF6, 0xE6, 0x82, 0x94, 0xF6, 0xF0, 0xA2,
+ 0x9B, 0x94, 0xF6, 0xE6, 0x83, 0x87, 0xF6, 0xE6,
+ 0x85, 0x88, 0xF6, 0xE6, 0x85, 0x8C, 0xF6, 0xE6,
+ 0x85, 0x8E, 0xF6, 0xE6, 0x85, 0x8C, 0xF6, 0xE6,
+ 0x85, 0xBA, 0xF6, 0xE6, 0x86, 0x8E, 0xF6, 0xE6,
+ 0x86, 0xB2, 0xF6, 0xE6, 0x86, 0xA4, 0xF6, 0xE6,
+ 0x86, 0xAF, 0xF6, 0xE6, 0x87, 0x9E, 0xF6, 0xE6,
+ 0x87, 0xB2, 0xF6, 0xE6, 0x87, 0xB6, 0xF6, 0xE6,
+ 0x88, 0x90, 0xF6, 0xE6, 0x88, 0x9B, 0xF6, 0xE6,
+ 0x89, 0x9D, 0xF6, 0xE6, 0x8A, 0xB1, 0xF6, 0xE6,
+ 0x8B, 0x94, 0xF6, 0xE6, 0x8D, 0x90, 0xF6, 0xF0,
+ 0xA2, 0xAC, 0x8C, 0xF6, 0xE6, 0x8C, 0xBD, 0xF6,
+ 0xE6, 0x8B, 0xBC, 0xF6, 0xE6, 0x8D, 0xA8, 0xF6,
+ 0xE6, 0x8E, 0x83, 0xF6, 0xE6, 0x8F, 0xA4, 0xF6,
+ 0xF0, 0xA2, 0xAF, 0xB1, 0xF6, 0xE6, 0x90, 0xA2,
+ 0xF6, 0xE6, 0x8F, 0x85, 0xF6, 0xE6, 0x8E, 0xA9,
+ 0xF6, 0xE3, 0xA8, 0xAE, 0xF6, 0xE6, 0x91, 0xA9,
+ 0xF6, 0xE6, 0x91, 0xBE, 0xF6, 0xE6, 0x92, 0x9D,
+ 0xF6, 0xE6, 0x91, 0xB7, 0xF6, 0xE3, 0xA9, 0xAC,
+ 0xF6, 0xE6, 0x95, 0x8F, 0xF6, 0xE6, 0x95, 0xAC,
+ 0xF6, 0xF0, 0xA3, 0x80, 0x8A, 0xF6, 0xE6, 0x97,
+ 0xA3, 0xF6, 0xE6, 0x9B, 0xB8, 0xF6, 0xE6, 0x99,
+ 0x89, 0xF6, 0xE3, 0xAC, 0x99, 0xF6, 0xE6, 0x9A,
+ 0x91, 0xF6, 0xE3, 0xAC, 0x88, 0xF6, 0xE3, 0xAB,
+ 0xA4, 0xF6, 0xE5, 0x86, 0x92, 0xF6, 0xE5, 0x86,
+ 0x95, 0xF6, 0xE6, 0x9C, 0x80, 0xF6, 0xE6, 0x9A,
+ 0x9C, 0xF6, 0xE8, 0x82, 0xAD, 0xF6, 0xE4, 0x8F,
+ 0x99, 0xF6, 0xE6, 0x9C, 0x97, 0xF6, 0xE6, 0x9C,
+ 0x9B, 0xF6, 0xE6, 0x9C, 0xA1, 0xF6, 0xE6, 0x9D,
+ 0x9E, 0xF6, 0xE6, 0x9D, 0x93, 0xF6, 0xF0, 0xA3,
+ 0x8F, 0x83, 0xF6, 0xE3, 0xAD, 0x89, 0xF6, 0xE6,
+ 0x9F, 0xBA, 0xF6, 0xE6, 0x9E, 0x85, 0xF6, 0xE6,
+ 0xA1, 0x92, 0xF6, 0xE6, 0xA2, 0x85, 0xF6, 0xF0,
+ 0xA3, 0x91, 0xAD, 0xF6, 0xE6, 0xA2, 0x8E, 0xF6,
+ 0xE6, 0xA0, 0x9F, 0xF6, 0xE6, 0xA4, 0x94, 0xF6,
+ 0xE3, 0xAE, 0x9D, 0xF6, 0xE6, 0xA5, 0x82, 0xF6,
+ 0xE6, 0xA6, 0xA3, 0xF6, 0xE6, 0xA7, 0xAA, 0xF6,
+ 0xE6, 0xAA, 0xA8, 0xF6, 0xF0, 0xA3, 0x9A, 0xA3,
+ 0xF6, 0xE6, 0xAB, 0x9B, 0xF6, 0xE3, 0xB0, 0x98,
+ 0xF6, 0xE6, 0xAC, 0xA1, 0xF6, 0xF0, 0xA3, 0xA2,
+ 0xA7, 0xF6, 0xE6, 0xAD, 0x94, 0xF6, 0xE3, 0xB1,
+ 0x8E, 0xF6, 0xE6, 0xAD, 0xB2, 0xF6, 0xE6, 0xAE,
+ 0x9F, 0xF6, 0xE6, 0xAE, 0xBA, 0xF6, 0xE6, 0xAE,
+ 0xBB, 0xF6, 0xF0, 0xA3, 0xAA, 0x8D, 0xF6, 0xF0,
+ 0xA1, 0xB4, 0x8B, 0xF6, 0xF0, 0xA3, 0xAB, 0xBA,
+ 0xF6, 0xE6, 0xB1, 0x8E, 0xF6, 0xF0, 0xA3, 0xB2,
+ 0xBC, 0xF6, 0xE6, 0xB2, 0xBF, 0xF6, 0xE6, 0xB3,
+ 0x8D, 0xF6, 0xE6, 0xB1, 0xA7, 0xF6, 0xE6, 0xB4,
+ 0x96, 0xF6, 0xE6, 0xB4, 0xBE, 0xF6, 0xE6, 0xB5,
+ 0xB7, 0xF6, 0xE6, 0xB5, 0x81, 0xF6, 0xE6, 0xB5,
+ 0xA9, 0xF6, 0xE6, 0xB5, 0xB8, 0xF6, 0xE6, 0xB6,
+ 0x85, 0xF6, 0xF0, 0xA3, 0xB4, 0x9E, 0xF6, 0xE6,
+ 0xB4, 0xB4, 0xF6, 0xE6, 0xB8, 0xAF, 0xF6, 0xE6,
+ 0xB9, 0xAE, 0xF6, 0xE3, 0xB4, 0xB3, 0xF6, 0xE6,
+ 0xBB, 0x8B, 0xF6, 0xE6, 0xBB, 0x87, 0xF6, 0xF0,
+ 0xA3, 0xBB, 0x91, 0xF6, 0xE6, 0xB7, 0xB9, 0xF6,
+ 0xE6, 0xBD, 0xAE, 0xF6, 0xF0, 0xA3, 0xBD, 0x9E,
+ 0xF6, 0xF0, 0xA3, 0xBE, 0x8E, 0xF6, 0xE6, 0xBF,
+ 0x86, 0xF6, 0xE7, 0x80, 0xB9, 0xF6, 0xE7, 0x80,
+ 0x9E, 0xF6, 0xE7, 0x80, 0x9B, 0xF6, 0xE3, 0xB6,
+ 0x96, 0xF6, 0xE7, 0x81, 0x8A, 0xF6, 0xE7, 0x81,
+ 0xBD, 0xF6, 0xE7, 0x81, 0xB7, 0xF6, 0xE7, 0x82,
+ 0xAD, 0xF6, 0xF0, 0xA0, 0x94, 0xA5, 0xF6, 0xE7,
+ 0x85, 0x85, 0xF6, 0xF0, 0xA4, 0x89, 0xA3, 0xF6,
+ 0xE7, 0x86, 0x9C, 0xF6, 0xF0, 0xA4, 0x8E, 0xAB,
+ 0xF6, 0xE7, 0x88, 0xA8, 0xF6, 0xE7, 0x88, 0xB5,
+ 0xF6, 0xE7, 0x89, 0x90, 0xF6, 0xF0, 0xA4, 0x98,
+ 0x88, 0xF6, 0xE7, 0x8A, 0x80, 0xF6, 0xE7, 0x8A,
+ 0x95, 0xF6, 0xF0, 0xA4, 0x9C, 0xB5, 0xF6, 0xF0,
+ 0xA4, 0xA0, 0x94, 0xF6, 0xE7, 0x8D, 0xBA, 0xF6,
+ 0xE7, 0x8E, 0x8B, 0xF6, 0xE3, 0xBA, 0xAC, 0xF6,
+ 0xE7, 0x8E, 0xA5, 0xF6, 0xE3, 0xBA, 0xB8, 0xF6,
+ 0xE3, 0xBA, 0xB8, 0xF6, 0xE7, 0x91, 0x87, 0xF6,
+ 0xE7, 0x91, 0x9C, 0xF6, 0xE7, 0x91, 0xB1, 0xF6,
+ 0xE7, 0x92, 0x85, 0xF6, 0xE7, 0x93, 0x8A, 0xF6,
+ 0xE3, 0xBC, 0x9B, 0xF6, 0xE7, 0x94, 0xA4, 0xF6,
+ 0xF0, 0xA4, 0xB0, 0xB6, 0xF6, 0xE7, 0x94, 0xBE,
+ 0xF6, 0xF0, 0xA4, 0xB2, 0x92, 0xF6, 0xE7, 0x95,
+ 0xB0, 0xF6, 0xF0, 0xA2, 0x86, 0x9F, 0xF6, 0xE7,
+ 0x98, 0x90, 0xF6, 0xF0, 0xA4, 0xBE, 0xA1, 0xF6,
+ 0xF0, 0xA4, 0xBE, 0xB8, 0xF6, 0xF0, 0xA5, 0x81,
+ 0x84, 0xF6, 0xE3, 0xBF, 0xBC, 0xF6, 0xE4, 0x80,
+ 0x88, 0xF6, 0xE7, 0x9B, 0xB4, 0xF6, 0xF0, 0xA5,
+ 0x83, 0xB3, 0xF6, 0xF0, 0xA5, 0x83, 0xB2, 0xF6,
+ 0xF0, 0xA5, 0x84, 0x99, 0xF6, 0xF0, 0xA5, 0x84,
+ 0xB3, 0xF6, 0xE7, 0x9C, 0x9E, 0xF6, 0xE7, 0x9C,
+ 0x9F, 0xF6, 0xE7, 0x9C, 0x9F, 0xF6, 0xE7, 0x9D,
+ 0x8A, 0xF6, 0xE4, 0x80, 0xB9, 0xF6, 0xE7, 0x9E,
+ 0x8B, 0xF6, 0xE4, 0x81, 0x86, 0xF6, 0xE4, 0x82,
+ 0x96, 0xF6, 0xF0, 0xA5, 0x90, 0x9D, 0xF6, 0xE7,
+ 0xA1, 0x8E, 0xF6, 0xE7, 0xA2, 0x8C, 0xF6, 0xE7,
+ 0xA3, 0x8C, 0xF6, 0xE4, 0x83, 0xA3, 0xF6, 0xF0,
+ 0xA5, 0x98, 0xA6, 0xF6, 0xE7, 0xA5, 0x96, 0xF6,
+ 0xF0, 0xA5, 0x9A, 0x9A, 0xF6, 0xF0, 0xA5, 0x9B,
+ 0x85, 0xF6, 0xE7, 0xA6, 0x8F, 0xF6, 0xE7, 0xA7,
+ 0xAB, 0xF6, 0xE4, 0x84, 0xAF, 0xF6, 0xE7, 0xA9,
+ 0x80, 0xF6, 0xE7, 0xA9, 0x8A, 0xF6, 0xE7, 0xA9,
+ 0x8F, 0xF6, 0xF0, 0xA5, 0xA5, 0xBC, 0xF6, 0xF0,
+ 0xA5, 0xAA, 0xA7, 0xF6, 0xF0, 0xA5, 0xAA, 0xA7,
+ 0xF6, 0xE7, 0xAB, 0xAE, 0xF6, 0xE4, 0x88, 0x82,
+ 0xF6, 0xF0, 0xA5, 0xAE, 0xAB, 0xF6, 0xE7, 0xAF,
+ 0x86, 0xF6, 0xE7, 0xAF, 0x89, 0xF6, 0xE4, 0x88,
+ 0xA7, 0xF6, 0xF0, 0xA5, 0xB2, 0x80, 0xF6, 0xE7,
+ 0xB3, 0x92, 0xF6, 0xE4, 0x8A, 0xA0, 0xF6, 0xE7,
+ 0xB3, 0xA8, 0xF6, 0xE7, 0xB3, 0xA3, 0xF6, 0xE7,
+ 0xB4, 0x80, 0xF6, 0xF0, 0xA5, 0xBE, 0x86, 0xF6,
+ 0xE7, 0xB5, 0xA3, 0xF6, 0xE4, 0x8C, 0x81, 0xF6,
+ 0xE7, 0xB7, 0x87, 0xF6, 0xE7, 0xB8, 0x82, 0xF6,
+ 0xE7, 0xB9, 0x85, 0xF6, 0xE4, 0x8C, 0xB4, 0xF6,
+ 0xF0, 0xA6, 0x88, 0xA8, 0xF6, 0xF0, 0xA6, 0x89,
+ 0x87, 0xF6, 0xE4, 0x8D, 0x99, 0xF6, 0xF0, 0xA6,
+ 0x8B, 0x99, 0xF6, 0xE7, 0xBD, 0xBA, 0xF6, 0xF0,
+ 0xA6, 0x8C, 0xBE, 0xF6, 0xE7, 0xBE, 0x95, 0xF6,
+ 0xE7, 0xBF, 0xBA, 0xF6, 0xE8, 0x80, 0x85, 0xF6,
+ 0xF0, 0xA6, 0x93, 0x9A, 0xF6, 0xF0, 0xA6, 0x94,
+ 0xA3, 0xF6, 0xE8, 0x81, 0xA0, 0xF6, 0xF0, 0xA6,
+ 0x96, 0xA8, 0xF6, 0xE8, 0x81, 0xB0, 0xF6, 0xF0,
+ 0xA3, 0x8D, 0x9F, 0xF6, 0xE4, 0x8F, 0x95, 0xF6,
+ 0xE8, 0x82, 0xB2, 0xF6, 0xE8, 0x84, 0x83, 0xF6,
+ 0xE4, 0x90, 0x8B, 0xF6, 0xE8, 0x84, 0xBE, 0xF6,
+ 0xE5, 0xAA, 0xB5, 0xF6, 0xF0, 0xA6, 0x9E, 0xA7,
+ 0xF6, 0xF0, 0xA6, 0x9E, 0xB5, 0xF6, 0xF0, 0xA3,
+ 0x8E, 0x93, 0xF6, 0xF0, 0xA3, 0x8E, 0x9C, 0xF6,
+ 0xE8, 0x88, 0x81, 0xF6, 0xE8, 0x88, 0x84, 0xF6,
+ 0xE8, 0xBE, 0x9E, 0xF6, 0xE4, 0x91, 0xAB, 0xF6,
+ 0xE8, 0x8A, 0x91, 0xF6, 0xE8, 0x8A, 0x8B, 0xF6,
+ 0xE8, 0x8A, 0x9D, 0xF6, 0xE5, 0x8A, 0xB3, 0xF6,
+ 0xE8, 0x8A, 0xB1, 0xF6, 0xE8, 0x8A, 0xB3, 0xF6,
+ 0xE8, 0x8A, 0xBD, 0xF6, 0xE8, 0x8B, 0xA6, 0xF6,
+ 0xF0, 0xA6, 0xAC, 0xBC, 0xF6, 0xE8, 0x8B, 0xA5,
+ 0xF6, 0xE8, 0x8C, 0x9D, 0xF6, 0xE8, 0x8D, 0xA3,
+ 0xF6, 0xE8, 0x8E, 0xAD, 0xF6, 0xE8, 0x8C, 0xA3,
+ 0xF6, 0xE8, 0x8E, 0xBD, 0xF6, 0xE8, 0x8F, 0xA7,
+ 0xF6, 0xE8, 0x91, 0x97, 0xF6, 0xE8, 0x8D, 0x93,
+ 0xF6, 0xE8, 0x8F, 0x8A, 0xF6, 0xE8, 0x8F, 0x8C,
+ 0xF6, 0xE8, 0x8F, 0x9C, 0xF6, 0xF0, 0xA6, 0xB0,
+ 0xB6, 0xF6, 0xF0, 0xA6, 0xB5, 0xAB, 0xF6, 0xF0,
+ 0xA6, 0xB3, 0x95, 0xF6, 0xE4, 0x94, 0xAB, 0xF6,
+ 0xE8, 0x93, 0xB1, 0xF6, 0xE8, 0x93, 0xB3, 0xF6,
+ 0xE8, 0x94, 0x96, 0xF6, 0xF0, 0xA7, 0x8F, 0x8A,
+ 0xF6, 0xE8, 0x95, 0xA4, 0xF6, 0xF0, 0xA6, 0xBC,
+ 0xAC, 0xF6, 0xE4, 0x95, 0x9D, 0xF6, 0xE4, 0x95,
+ 0xA1, 0xF6, 0xF0, 0xA6, 0xBE, 0xB1, 0xF6, 0xF0,
+ 0xA7, 0x83, 0x92, 0xF6, 0xE4, 0x95, 0xAB, 0xF6,
+ 0xE8, 0x99, 0x90, 0xF6, 0xE8, 0x99, 0x9C, 0xF6,
+ 0xE8, 0x99, 0xA7, 0xF6, 0xE8, 0x99, 0xA9, 0xF6,
+ 0xE8, 0x9A, 0xA9, 0xF6, 0xE8, 0x9A, 0x88, 0xF6,
+ 0xE8, 0x9C, 0x8E, 0xF6, 0xE8, 0x9B, 0xA2, 0xF6,
+ 0xE8, 0x9D, 0xB9, 0xF6, 0xE8, 0x9C, 0xA8, 0xF6,
+ 0xE8, 0x9D, 0xAB, 0xF6, 0xE8, 0x9E, 0x86, 0xF6,
+ 0xE4, 0x97, 0x97, 0xF6, 0xE8, 0x9F, 0xA1, 0xF6,
+ 0xE8, 0xA0, 0x81, 0xF6, 0xE4, 0x97, 0xB9, 0xF6,
+ 0xE8, 0xA1, 0xA0, 0xF6, 0xE8, 0xA1, 0xA3, 0xF6,
+ 0xF0, 0xA7, 0x99, 0xA7, 0xF6, 0xE8, 0xA3, 0x97,
+ 0xF6, 0xE8, 0xA3, 0x9E, 0xF6, 0xE4, 0x98, 0xB5,
+ 0xF6, 0xE8, 0xA3, 0xBA, 0xF6, 0xE3, 0x92, 0xBB,
+ 0xF6, 0xF0, 0xA7, 0xA2, 0xAE, 0xF6, 0xF0, 0xA7,
+ 0xA5, 0xA6, 0xF6, 0xE4, 0x9A, 0xBE, 0xF6, 0xE4,
+ 0x9B, 0x87, 0xF6, 0xE8, 0xAA, 0xA0, 0xF6, 0xE8,
+ 0xAB, 0xAD, 0xF6, 0xE8, 0xAE, 0x8A, 0xF6, 0xE8,
+ 0xB1, 0x95, 0xF6, 0xF0, 0xA7, 0xB2, 0xA8, 0xF6,
+ 0xE8, 0xB2, 0xAB, 0xF6, 0xE8, 0xB3, 0x81, 0xF6,
+ 0xE8, 0xB4, 0x9B, 0xF6, 0xE8, 0xB5, 0xB7, 0xF6,
+ 0xF0, 0xA7, 0xBC, 0xAF, 0xF6, 0xF0, 0xA0, 0xA0,
+ 0x84, 0xF6, 0xE8, 0xB7, 0x8B, 0xF6, 0xE8, 0xB6,
+ 0xBC, 0xF6, 0xE8, 0xB7, 0xB0, 0xF6, 0xF0, 0xA0,
+ 0xA3, 0x9E, 0xF6, 0xE8, 0xBB, 0x94, 0xF6, 0xE8,
+ 0xBC, 0xB8, 0xF6, 0xF0, 0xA8, 0x97, 0x92, 0xF6,
+ 0xF0, 0xA8, 0x97, 0xAD, 0xF6, 0xE9, 0x82, 0x94,
+ 0xF6, 0xE9, 0x83, 0xB1, 0xF6, 0xE9, 0x84, 0x91,
+ 0xF6, 0xF0, 0xA8, 0x9C, 0xAE, 0xF6, 0xE9, 0x84,
+ 0x9B, 0xF6, 0xE9, 0x88, 0xB8, 0xF6, 0xE9, 0x8B,
+ 0x97, 0xF6, 0xE9, 0x8B, 0x98, 0xF6, 0xE9, 0x89,
+ 0xBC, 0xF6, 0xE9, 0x8F, 0xB9, 0xF6, 0xE9, 0x90,
+ 0x95, 0xF6, 0xF0, 0xA8, 0xAF, 0xBA, 0xF6, 0xE9,
+ 0x96, 0x8B, 0xF6, 0xE4, 0xA6, 0x95, 0xF6, 0xE9,
+ 0x96, 0xB7, 0xF6, 0xF0, 0xA8, 0xB5, 0xB7, 0xF6,
+ 0xE4, 0xA7, 0xA6, 0xF6, 0xE9, 0x9B, 0x83, 0xF6,
+ 0xE5, 0xB6, 0xB2, 0xF6, 0xE9, 0x9C, 0xA3, 0xF6,
+ 0xF0, 0xA9, 0x85, 0x85, 0xF6, 0xF0, 0xA9, 0x88,
+ 0x9A, 0xF6, 0xE4, 0xA9, 0xAE, 0xF6, 0xE4, 0xA9,
+ 0xB6, 0xF6, 0xE9, 0x9F, 0xA0, 0xF6, 0xF0, 0xA9,
+ 0x90, 0x8A, 0xF6, 0xE4, 0xAA, 0xB2, 0xF6, 0xF0,
+ 0xA9, 0x92, 0x96, 0xF6, 0xE9, 0xA0, 0x8B, 0xF6,
+ 0xE9, 0xA0, 0x8B, 0xF6, 0xE9, 0xA0, 0xA9, 0xF6,
+ 0xF0, 0xA9, 0x96, 0xB6, 0xF6, 0xE9, 0xA3, 0xA2,
+ 0xF6, 0xE4, 0xAC, 0xB3, 0xF6, 0xE9, 0xA4, 0xA9,
+ 0xF6, 0xE9, 0xA6, 0xA7, 0xF6, 0xE9, 0xA7, 0x82,
+ 0xF6, 0xE9, 0xA7, 0xBE, 0xF6, 0xE4, 0xAF, 0x8E,
+ 0xF6, 0xF0, 0xA9, 0xAC, 0xB0, 0xF6, 0xE9, 0xAC,
+ 0x92, 0xF6, 0xE9, 0xB1, 0x80, 0xF6, 0xE9, 0xB3,
+ 0xBD, 0xF6, 0xE4, 0xB3, 0x8E, 0xF6, 0xE4, 0xB3,
+ 0xAD, 0xF6, 0xE9, 0xB5, 0xA7, 0xF6, 0xF0, 0xAA,
+ 0x83, 0x8E, 0xF6, 0xE4, 0xB3, 0xB8, 0xF6, 0xF0,
+ 0xAA, 0x84, 0x85, 0xF6, 0xF0, 0xAA, 0x88, 0x8E,
+ 0xF6, 0xF0, 0xAA, 0x8A, 0x91, 0xF6, 0xE9, 0xBA,
+ 0xBB, 0xF6, 0xE4, 0xB5, 0x96, 0xF6, 0xE9, 0xBB,
+ 0xB9, 0xF6, 0xE9, 0xBB, 0xBE, 0xF6, 0xE9, 0xBC,
+ 0x85, 0xF6, 0xE9, 0xBC, 0x8F, 0xF6, 0xE9, 0xBC,
+ 0x96, 0xF6, 0xE9, 0xBC, 0xBB, 0xF6, 0xF0, 0xAA,
+ 0x98, 0x80, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0,
+ },
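+ /*
+  * End of the first sub-table; the second begins below.  A note on
+  * structure (an inference from the surrounding data, not original
+  * commentary): the decomposition tables in u8_textprep.c are doubled,
+  * with one sub-table per supported normalization version (Unicode
+  * 3.2.0 and the latest supported revision), selected via the
+  * unicode_version argument to u8_textprep_str().  The 0xF6 bytes
+  * scattered through the data appear to be U8_DECOMP_CANONICAL
+  * markers, and the long runs of zeros above pad the sub-table to
+  * its fixed size.
+  */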
+ {
+ 0x20, 0x20, 0xCC, 0x88, 0x61, 0x20, 0xCC, 0x84,
+ 0x32, 0x33, 0x20, 0xCC, 0x81, 0xCE, 0xBC, 0x20,
+ 0xCC, 0xA7, 0x31, 0x6F, 0x31, 0xE2, 0x81, 0x84,
+ 0x34, 0x31, 0xE2, 0x81, 0x84, 0x32, 0x33, 0xE2,
+ 0x81, 0x84, 0x34, 0xF6, 0x41, 0xCC, 0x80, 0xF6,
+ 0x41, 0xCC, 0x81, 0xF6, 0x41, 0xCC, 0x82, 0xF6,
+ 0x41, 0xCC, 0x83, 0xF6, 0x41, 0xCC, 0x88, 0xF6,
+ 0x41, 0xCC, 0x8A, 0xF6, 0x43, 0xCC, 0xA7, 0xF6,
+ 0x45, 0xCC, 0x80, 0xF6, 0x45, 0xCC, 0x81, 0xF6,
+ 0x45, 0xCC, 0x82, 0xF6, 0x45, 0xCC, 0x88, 0xF6,
+ 0x49, 0xCC, 0x80, 0xF6, 0x49, 0xCC, 0x81, 0xF6,
+ 0x49, 0xCC, 0x82, 0xF6, 0x49, 0xCC, 0x88, 0xF6,
+ 0x4E, 0xCC, 0x83, 0xF6, 0x4F, 0xCC, 0x80, 0xF6,
+ 0x4F, 0xCC, 0x81, 0xF6, 0x4F, 0xCC, 0x82, 0xF6,
+ 0x4F, 0xCC, 0x83, 0xF6, 0x4F, 0xCC, 0x88, 0xF6,
+ 0x55, 0xCC, 0x80, 0xF6, 0x55, 0xCC, 0x81, 0xF6,
+ 0x55, 0xCC, 0x82, 0xF6, 0x55, 0xCC, 0x88, 0xF6,
+ 0x59, 0xCC, 0x81, 0xF6, 0x61, 0xCC, 0x80, 0xF6,
+ 0x61, 0xCC, 0x81, 0xF6, 0x61, 0xCC, 0x82, 0xF6,
+ 0x61, 0xCC, 0x83, 0xF6, 0x61, 0xCC, 0x88, 0xF6,
+ 0x61, 0xCC, 0x8A, 0xF6, 0x63, 0xCC, 0xA7, 0xF6,
+ 0x65, 0xCC, 0x80, 0xF6, 0x65, 0xCC, 0x81, 0xF6,
+ 0x65, 0xCC, 0x82, 0xF6, 0x65, 0xCC, 0x88, 0xF6,
+ 0x69, 0xCC, 0x80, 0xF6, 0x69, 0xCC, 0x81, 0xF6,
+ 0x69, 0xCC, 0x82, 0xF6, 0x69, 0xCC, 0x88, 0xF6,
+ 0x6E, 0xCC, 0x83, 0xF6, 0x6F, 0xCC, 0x80, 0xF6,
+ 0x6F, 0xCC, 0x81, 0xF6, 0x6F, 0xCC, 0x82, 0xF6,
+ 0x6F, 0xCC, 0x83, 0xF6, 0x6F, 0xCC, 0x88, 0xF6,
+ 0x75, 0xCC, 0x80, 0xF6, 0x75, 0xCC, 0x81, 0xF6,
+ 0x75, 0xCC, 0x82, 0xF6, 0x75, 0xCC, 0x88, 0xF6,
+ 0x79, 0xCC, 0x81, 0xF6, 0x79, 0xCC, 0x88, 0xF6,
+ 0x41, 0xCC, 0x84, 0xF6, 0x61, 0xCC, 0x84, 0xF6,
+ 0x41, 0xCC, 0x86, 0xF6, 0x61, 0xCC, 0x86, 0xF6,
+ 0x41, 0xCC, 0xA8, 0xF6, 0x61, 0xCC, 0xA8, 0xF6,
+ 0x43, 0xCC, 0x81, 0xF6, 0x63, 0xCC, 0x81, 0xF6,
+ 0x43, 0xCC, 0x82, 0xF6, 0x63, 0xCC, 0x82, 0xF6,
+ 0x43, 0xCC, 0x87, 0xF6, 0x63, 0xCC, 0x87, 0xF6,
+ 0x43, 0xCC, 0x8C, 0xF6, 0x63, 0xCC, 0x8C, 0xF6,
+ 0x44, 0xCC, 0x8C, 0xF6, 0x64, 0xCC, 0x8C, 0xF6,
+ 0x45, 0xCC, 0x84, 0xF6, 0x65, 0xCC, 0x84, 0xF6,
+ 0x45, 0xCC, 0x86, 0xF6, 0x65, 0xCC, 0x86, 0xF6,
+ 0x45, 0xCC, 0x87, 0xF6, 0x65, 0xCC, 0x87, 0xF6,
+ 0x45, 0xCC, 0xA8, 0xF6, 0x65, 0xCC, 0xA8, 0xF6,
+ 0x45, 0xCC, 0x8C, 0xF6, 0x65, 0xCC, 0x8C, 0xF6,
+ 0x47, 0xCC, 0x82, 0xF6, 0x67, 0xCC, 0x82, 0xF6,
+ 0x47, 0xCC, 0x86, 0xF6, 0x67, 0xCC, 0x86, 0xF6,
+ 0x47, 0xCC, 0x87, 0xF6, 0x67, 0xCC, 0x87, 0xF6,
+ 0x47, 0xCC, 0xA7, 0xF6, 0x67, 0xCC, 0xA7, 0xF6,
+ 0x48, 0xCC, 0x82, 0xF6, 0x68, 0xCC, 0x82, 0xF6,
+ 0x49, 0xCC, 0x83, 0xF6, 0x69, 0xCC, 0x83, 0xF6,
+ 0x49, 0xCC, 0x84, 0xF6, 0x69, 0xCC, 0x84, 0xF6,
+ 0x49, 0xCC, 0x86, 0xF6, 0x69, 0xCC, 0x86, 0xF6,
+ 0x49, 0xCC, 0xA8, 0xF6, 0x69, 0xCC, 0xA8, 0xF6,
+ 0x49, 0xCC, 0x87, 0x49, 0x4A, 0x69, 0x6A, 0xF6,
+ 0x4A, 0xCC, 0x82, 0xF6, 0x6A, 0xCC, 0x82, 0xF6,
+ 0x4B, 0xCC, 0xA7, 0xF6, 0x6B, 0xCC, 0xA7, 0xF6,
+ 0x4C, 0xCC, 0x81, 0xF6, 0x6C, 0xCC, 0x81, 0xF6,
+ 0x4C, 0xCC, 0xA7, 0xF6, 0x6C, 0xCC, 0xA7, 0xF6,
+ 0x4C, 0xCC, 0x8C, 0xF6, 0x6C, 0xCC, 0x8C, 0x4C,
+ 0xC2, 0xB7, 0x6C, 0xC2, 0xB7, 0xF6, 0x4E, 0xCC,
+ 0x81, 0xF6, 0x6E, 0xCC, 0x81, 0xF6, 0x4E, 0xCC,
+ 0xA7, 0xF6, 0x6E, 0xCC, 0xA7, 0xF6, 0x4E, 0xCC,
+ 0x8C, 0xF6, 0x6E, 0xCC, 0x8C, 0xCA, 0xBC, 0x6E,
+ 0xF6, 0x4F, 0xCC, 0x84, 0xF6, 0x6F, 0xCC, 0x84,
+ 0xF6, 0x4F, 0xCC, 0x86, 0xF6, 0x6F, 0xCC, 0x86,
+ 0xF6, 0x4F, 0xCC, 0x8B, 0xF6, 0x6F, 0xCC, 0x8B,
+ 0xF6, 0x52, 0xCC, 0x81, 0xF6, 0x72, 0xCC, 0x81,
+ 0xF6, 0x52, 0xCC, 0xA7, 0xF6, 0x72, 0xCC, 0xA7,
+ 0xF6, 0x52, 0xCC, 0x8C, 0xF6, 0x72, 0xCC, 0x8C,
+ 0xF6, 0x53, 0xCC, 0x81, 0xF6, 0x73, 0xCC, 0x81,
+ 0xF6, 0x53, 0xCC, 0x82, 0xF6, 0x73, 0xCC, 0x82,
+ 0xF6, 0x53, 0xCC, 0xA7, 0xF6, 0x73, 0xCC, 0xA7,
+ 0xF6, 0x53, 0xCC, 0x8C, 0xF6, 0x73, 0xCC, 0x8C,
+ 0xF6, 0x54, 0xCC, 0xA7, 0xF6, 0x74, 0xCC, 0xA7,
+ 0xF6, 0x54, 0xCC, 0x8C, 0xF6, 0x74, 0xCC, 0x8C,
+ 0xF6, 0x55, 0xCC, 0x83, 0xF6, 0x75, 0xCC, 0x83,
+ 0xF6, 0x55, 0xCC, 0x84, 0xF6, 0x75, 0xCC, 0x84,
+ 0xF6, 0x55, 0xCC, 0x86, 0xF6, 0x75, 0xCC, 0x86,
+ 0xF6, 0x55, 0xCC, 0x8A, 0xF6, 0x75, 0xCC, 0x8A,
+ 0xF6, 0x55, 0xCC, 0x8B, 0xF6, 0x75, 0xCC, 0x8B,
+ 0xF6, 0x55, 0xCC, 0xA8, 0xF6, 0x75, 0xCC, 0xA8,
+ 0xF6, 0x57, 0xCC, 0x82, 0xF6, 0x77, 0xCC, 0x82,
+ 0xF6, 0x59, 0xCC, 0x82, 0xF6, 0x79, 0xCC, 0x82,
+ 0xF6, 0x59, 0xCC, 0x88, 0xF6, 0x5A, 0xCC, 0x81,
+ 0xF6, 0x7A, 0xCC, 0x81, 0xF6, 0x5A, 0xCC, 0x87,
+ 0xF6, 0x7A, 0xCC, 0x87, 0xF6, 0x5A, 0xCC, 0x8C,
+ 0xF6, 0x7A, 0xCC, 0x8C, 0x73, 0xF6, 0x4F, 0xCC,
+ 0x9B, 0xF6, 0x6F, 0xCC, 0x9B, 0xF6, 0x55, 0xCC,
+ 0x9B, 0xF6, 0x75, 0xCC, 0x9B, 0x44, 0x5A, 0xCC,
+ 0x8C, 0x44, 0x7A, 0xCC, 0x8C, 0x64, 0x7A, 0xCC,
+ 0x8C, 0x4C, 0x4A, 0x4C, 0x6A, 0x6C, 0x6A, 0x4E,
+ 0x4A, 0x4E, 0x6A, 0x6E, 0x6A, 0xF6, 0x41, 0xCC,
+ 0x8C, 0xF6, 0x61, 0xCC, 0x8C, 0xF6, 0x49, 0xCC,
+ 0x8C, 0xF6, 0x69, 0xCC, 0x8C, 0xF6, 0x4F, 0xCC,
+ 0x8C, 0xF6, 0x6F, 0xCC, 0x8C, 0xF6, 0x55, 0xCC,
+ 0x8C, 0xF6, 0x75, 0xCC, 0x8C, 0xF6, 0x55, 0xCC,
+ 0x88, 0xCC, 0x84, 0xF6, 0x75, 0xCC, 0x88, 0xCC,
+ 0x84, 0xF6, 0x55, 0xCC, 0x88, 0xCC, 0x81, 0xF6,
+ 0x75, 0xCC, 0x88, 0xCC, 0x81, 0xF6, 0x55, 0xCC,
+ 0x88, 0xCC, 0x8C, 0xF6, 0x75, 0xCC, 0x88, 0xCC,
+ 0x8C, 0xF6, 0x55, 0xCC, 0x88, 0xCC, 0x80, 0xF6,
+ 0x75, 0xCC, 0x88, 0xCC, 0x80, 0xF6, 0x41, 0xCC,
+ 0x88, 0xCC, 0x84, 0xF6, 0x61, 0xCC, 0x88, 0xCC,
+ 0x84, 0xF6, 0x41, 0xCC, 0x87, 0xCC, 0x84, 0xF6,
+ 0x61, 0xCC, 0x87, 0xCC, 0x84, 0xF6, 0xC3, 0x86,
+ 0xCC, 0x84, 0xF6, 0xC3, 0xA6, 0xCC, 0x84, 0xF6,
+ 0x47, 0xCC, 0x8C, 0xF6, 0x67, 0xCC, 0x8C, 0xF6,
+ 0x4B, 0xCC, 0x8C, 0xF6, 0x6B, 0xCC, 0x8C, 0xF6,
+ 0x4F, 0xCC, 0xA8, 0xF6, 0x6F, 0xCC, 0xA8, 0xF6,
+ 0x4F, 0xCC, 0xA8, 0xCC, 0x84, 0xF6, 0x6F, 0xCC,
+ 0xA8, 0xCC, 0x84, 0xF6, 0xC6, 0xB7, 0xCC, 0x8C,
+ 0xF6, 0xCA, 0x92, 0xCC, 0x8C, 0xF6, 0x6A, 0xCC,
+ 0x8C, 0x44, 0x5A, 0x44, 0x7A, 0x64, 0x7A, 0xF6,
+ 0x47, 0xCC, 0x81, 0xF6, 0x67, 0xCC, 0x81, 0xF6,
+ 0x4E, 0xCC, 0x80, 0xF6, 0x6E, 0xCC, 0x80, 0xF6,
+ 0x41, 0xCC, 0x8A, 0xCC, 0x81, 0xF6, 0x61, 0xCC,
+ 0x8A, 0xCC, 0x81, 0xF6, 0xC3, 0x86, 0xCC, 0x81,
+ 0xF6, 0xC3, 0xA6, 0xCC, 0x81, 0xF6, 0xC3, 0x98,
+ 0xCC, 0x81, 0xF6, 0xC3, 0xB8, 0xCC, 0x81, 0xF6,
+ 0x41, 0xCC, 0x8F, 0xF6, 0x61, 0xCC, 0x8F, 0xF6,
+ 0x41, 0xCC, 0x91, 0xF6, 0x61, 0xCC, 0x91, 0xF6,
+ 0x45, 0xCC, 0x8F, 0xF6, 0x65, 0xCC, 0x8F, 0xF6,
+ 0x45, 0xCC, 0x91, 0xF6, 0x65, 0xCC, 0x91, 0xF6,
+ 0x49, 0xCC, 0x8F, 0xF6, 0x69, 0xCC, 0x8F, 0xF6,
+ 0x49, 0xCC, 0x91, 0xF6, 0x69, 0xCC, 0x91, 0xF6,
+ 0x4F, 0xCC, 0x8F, 0xF6, 0x6F, 0xCC, 0x8F, 0xF6,
+ 0x4F, 0xCC, 0x91, 0xF6, 0x6F, 0xCC, 0x91, 0xF6,
+ 0x52, 0xCC, 0x8F, 0xF6, 0x72, 0xCC, 0x8F, 0xF6,
+ 0x52, 0xCC, 0x91, 0xF6, 0x72, 0xCC, 0x91, 0xF6,
+ 0x55, 0xCC, 0x8F, 0xF6, 0x75, 0xCC, 0x8F, 0xF6,
+ 0x55, 0xCC, 0x91, 0xF6, 0x75, 0xCC, 0x91, 0xF6,
+ 0x53, 0xCC, 0xA6, 0xF6, 0x73, 0xCC, 0xA6, 0xF6,
+ 0x54, 0xCC, 0xA6, 0xF6, 0x74, 0xCC, 0xA6, 0xF6,
+ 0x48, 0xCC, 0x8C, 0xF6, 0x68, 0xCC, 0x8C, 0xF6,
+ 0x41, 0xCC, 0x87, 0xF6, 0x61, 0xCC, 0x87, 0xF6,
+ 0x45, 0xCC, 0xA7, 0xF6, 0x65, 0xCC, 0xA7, 0xF6,
+ 0x4F, 0xCC, 0x88, 0xCC, 0x84, 0xF6, 0x6F, 0xCC,
+ 0x88, 0xCC, 0x84, 0xF6, 0x4F, 0xCC, 0x83, 0xCC,
+ 0x84, 0xF6, 0x6F, 0xCC, 0x83, 0xCC, 0x84, 0xF6,
+ 0x4F, 0xCC, 0x87, 0xF6, 0x6F, 0xCC, 0x87, 0xF6,
+ 0x4F, 0xCC, 0x87, 0xCC, 0x84, 0xF6, 0x6F, 0xCC,
+ 0x87, 0xCC, 0x84, 0xF6, 0x59, 0xCC, 0x84, 0xF6,
+ 0x79, 0xCC, 0x84, 0x68, 0xC9, 0xA6, 0x6A, 0x72,
+ 0xC9, 0xB9, 0xC9, 0xBB, 0xCA, 0x81, 0x77, 0x79,
+ 0x20, 0xCC, 0x86, 0x20, 0xCC, 0x87, 0x20, 0xCC,
+ 0x8A, 0x20, 0xCC, 0xA8, 0x20, 0xCC, 0x83, 0x20,
+ 0xCC, 0x8B, 0xC9, 0xA3, 0x6C, 0x73, 0x78, 0xCA,
+ 0x95, 0xF6, 0xCC, 0x80, 0xF6, 0xCC, 0x81, 0xF6,
+ 0xCC, 0x93, 0xF6, 0xCC, 0x88, 0xCC, 0x81, 0xF6,
+ 0xCA, 0xB9, 0x20, 0xCD, 0x85, 0xF6, 0x3B, 0x20,
+ 0xCC, 0x81, 0xF5, 0x05, 0xC2, 0xA8, 0xCC, 0x81,
+ 0x20, 0xCC, 0x88, 0xCC, 0x81, 0xF6, 0xCE, 0x91,
+ 0xCC, 0x81, 0xF6, 0xC2, 0xB7, 0xF6, 0xCE, 0x95,
+ 0xCC, 0x81, 0xF6, 0xCE, 0x97, 0xCC, 0x81, 0xF6,
+ 0xCE, 0x99, 0xCC, 0x81, 0xF6, 0xCE, 0x9F, 0xCC,
+ 0x81, 0xF6, 0xCE, 0xA5, 0xCC, 0x81, 0xF6, 0xCE,
+ 0xA9, 0xCC, 0x81, 0xF6, 0xCE, 0xB9, 0xCC, 0x88,
+ 0xCC, 0x81, 0xF6, 0xCE, 0x99, 0xCC, 0x88, 0xF6,
+ 0xCE, 0xA5, 0xCC, 0x88, 0xF6, 0xCE, 0xB1, 0xCC,
+ 0x81, 0xF6, 0xCE, 0xB5, 0xCC, 0x81, 0xF6, 0xCE,
+ 0xB7, 0xCC, 0x81, 0xF6, 0xCE, 0xB9, 0xCC, 0x81,
+ 0xF6, 0xCF, 0x85, 0xCC, 0x88, 0xCC, 0x81, 0xF6,
+ 0xCE, 0xB9, 0xCC, 0x88, 0xF6, 0xCF, 0x85, 0xCC,
+ 0x88, 0xF6, 0xCE, 0xBF, 0xCC, 0x81, 0xF6, 0xCF,
+ 0x85, 0xCC, 0x81, 0xF6, 0xCF, 0x89, 0xCC, 0x81,
+ 0xCE, 0xB2, 0xCE, 0xB8, 0xCE, 0xA5, 0xF5, 0x05,
+ 0xCF, 0x92, 0xCC, 0x81, 0xCE, 0xA5, 0xCC, 0x81,
+ 0xF5, 0x05, 0xCF, 0x92, 0xCC, 0x88, 0xCE, 0xA5,
+ 0xCC, 0x88, 0xCF, 0x86, 0xCF, 0x80, 0xCE, 0xBA,
+ 0xCF, 0x81, 0xCF, 0x82, 0xCE, 0x98, 0xCE, 0xB5,
+ 0xCE, 0xA3, 0xF6, 0xD0, 0x95, 0xCC, 0x80, 0xF6,
+ 0xD0, 0x95, 0xCC, 0x88, 0xF6, 0xD0, 0x93, 0xCC,
+ 0x81, 0xF6, 0xD0, 0x86, 0xCC, 0x88, 0xF6, 0xD0,
+ 0x9A, 0xCC, 0x81, 0xF6, 0xD0, 0x98, 0xCC, 0x80,
+ 0xF6, 0xD0, 0xA3, 0xCC, 0x86, 0xF6, 0xD0, 0x98,
+ 0xCC, 0x86, 0xF6, 0xD0, 0xB8, 0xCC, 0x86, 0xF6,
+ 0xD0, 0xB5, 0xCC, 0x80, 0xF6, 0xD0, 0xB5, 0xCC,
+ 0x88, 0xF6, 0xD0, 0xB3, 0xCC, 0x81, 0xF6, 0xD1,
+ 0x96, 0xCC, 0x88, 0xF6, 0xD0, 0xBA, 0xCC, 0x81,
+ 0xF6, 0xD0, 0xB8, 0xCC, 0x80, 0xF6, 0xD1, 0x83,
+ 0xCC, 0x86, 0xF6, 0xD1, 0xB4, 0xCC, 0x8F, 0xF6,
+ 0xD1, 0xB5, 0xCC, 0x8F, 0xF6, 0xD0, 0x96, 0xCC,
+ 0x86, 0xF6, 0xD0, 0xB6, 0xCC, 0x86, 0xF6, 0xD0,
+ 0x90, 0xCC, 0x86, 0xF6, 0xD0, 0xB0, 0xCC, 0x86,
+ 0xF6, 0xD0, 0x90, 0xCC, 0x88, 0xF6, 0xD0, 0xB0,
+ 0xCC, 0x88, 0xF6, 0xD0, 0x95, 0xCC, 0x86, 0xF6,
+ 0xD0, 0xB5, 0xCC, 0x86, 0xF6, 0xD3, 0x98, 0xCC,
+ 0x88, 0xF6, 0xD3, 0x99, 0xCC, 0x88, 0xF6, 0xD0,
+ 0x96, 0xCC, 0x88, 0xF6, 0xD0, 0xB6, 0xCC, 0x88,
+ 0xF6, 0xD0, 0x97, 0xCC, 0x88, 0xF6, 0xD0, 0xB7,
+ 0xCC, 0x88, 0xF6, 0xD0, 0x98, 0xCC, 0x84, 0xF6,
+ 0xD0, 0xB8, 0xCC, 0x84, 0xF6, 0xD0, 0x98, 0xCC,
+ 0x88, 0xF6, 0xD0, 0xB8, 0xCC, 0x88, 0xF6, 0xD0,
+ 0x9E, 0xCC, 0x88, 0xF6, 0xD0, 0xBE, 0xCC, 0x88,
+ 0xF6, 0xD3, 0xA8, 0xCC, 0x88, 0xF6, 0xD3, 0xA9,
+ 0xCC, 0x88, 0xF6, 0xD0, 0xAD, 0xCC, 0x88, 0xF6,
+ 0xD1, 0x8D, 0xCC, 0x88, 0xF6, 0xD0, 0xA3, 0xCC,
+ 0x84, 0xF6, 0xD1, 0x83, 0xCC, 0x84, 0xF6, 0xD0,
+ 0xA3, 0xCC, 0x88, 0xF6, 0xD1, 0x83, 0xCC, 0x88,
+ 0xF6, 0xD0, 0xA3, 0xCC, 0x8B, 0xF6, 0xD1, 0x83,
+ 0xCC, 0x8B, 0xF6, 0xD0, 0xA7, 0xCC, 0x88, 0xF6,
+ 0xD1, 0x87, 0xCC, 0x88, 0xF6, 0xD0, 0xAB, 0xCC,
+ 0x88, 0xF6, 0xD1, 0x8B, 0xCC, 0x88, 0xD5, 0xA5,
+ 0xD6, 0x82, 0xF6, 0xD8, 0xA7, 0xD9, 0x93, 0xF6,
+ 0xD8, 0xA7, 0xD9, 0x94, 0xF6, 0xD9, 0x88, 0xD9,
+ 0x94, 0xF6, 0xD8, 0xA7, 0xD9, 0x95, 0xF6, 0xD9,
+ 0x8A, 0xD9, 0x94, 0xD8, 0xA7, 0xD9, 0xB4, 0xD9,
+ 0x88, 0xD9, 0xB4, 0xDB, 0x87, 0xD9, 0xB4, 0xD9,
+ 0x8A, 0xD9, 0xB4, 0xF6, 0xDB, 0x95, 0xD9, 0x94,
+ 0xF6, 0xDB, 0x81, 0xD9, 0x94, 0xF6, 0xDB, 0x92,
+ 0xD9, 0x94, 0xF6, 0xE0, 0xA4, 0xA8, 0xE0, 0xA4,
+ 0xBC, 0xF6, 0xE0, 0xA4, 0xB0, 0xE0, 0xA4, 0xBC,
+ 0xF6, 0xE0, 0xA4, 0xB3, 0xE0, 0xA4, 0xBC, 0xF6,
+ 0xE0, 0xA4, 0x95, 0xE0, 0xA4, 0xBC, 0xF6, 0xE0,
+ 0xA4, 0x96, 0xE0, 0xA4, 0xBC, 0xF6, 0xE0, 0xA4,
+ 0x97, 0xE0, 0xA4, 0xBC, 0xF6, 0xE0, 0xA4, 0x9C,
+ 0xE0, 0xA4, 0xBC, 0xF6, 0xE0, 0xA4, 0xA1, 0xE0,
+ 0xA4, 0xBC, 0xF6, 0xE0, 0xA4, 0xA2, 0xE0, 0xA4,
+ 0xBC, 0xF6, 0xE0, 0xA4, 0xAB, 0xE0, 0xA4, 0xBC,
+ 0xF6, 0xE0, 0xA4, 0xAF, 0xE0, 0xA4, 0xBC, 0xF6,
+ 0xE0, 0xA7, 0x87, 0xE0, 0xA6, 0xBE, 0xF6, 0xE0,
+ 0xA7, 0x87, 0xE0, 0xA7, 0x97, 0xF6, 0xE0, 0xA6,
+ 0xA1, 0xE0, 0xA6, 0xBC, 0xF6, 0xE0, 0xA6, 0xA2,
+ 0xE0, 0xA6, 0xBC, 0xF6, 0xE0, 0xA6, 0xAF, 0xE0,
+ 0xA6, 0xBC, 0xF6, 0xE0, 0xA8, 0xB2, 0xE0, 0xA8,
+ 0xBC, 0xF6, 0xE0, 0xA8, 0xB8, 0xE0, 0xA8, 0xBC,
+ 0xF6, 0xE0, 0xA8, 0x96, 0xE0, 0xA8, 0xBC, 0xF6,
+ 0xE0, 0xA8, 0x97, 0xE0, 0xA8, 0xBC, 0xF6, 0xE0,
+ 0xA8, 0x9C, 0xE0, 0xA8, 0xBC, 0xF6, 0xE0, 0xA8,
+ 0xAB, 0xE0, 0xA8, 0xBC, 0xF6, 0xE0, 0xAD, 0x87,
+ 0xE0, 0xAD, 0x96, 0xF6, 0xE0, 0xAD, 0x87, 0xE0,
+ 0xAC, 0xBE, 0xF6, 0xE0, 0xAD, 0x87, 0xE0, 0xAD,
+ 0x97, 0xF6, 0xE0, 0xAC, 0xA1, 0xE0, 0xAC, 0xBC,
+ 0xF6, 0xE0, 0xAC, 0xA2, 0xE0, 0xAC, 0xBC, 0xF6,
+ 0xE0, 0xAE, 0x92, 0xE0, 0xAF, 0x97, 0xF6, 0xE0,
+ 0xAF, 0x86, 0xE0, 0xAE, 0xBE, 0xF6, 0xE0, 0xAF,
+ 0x87, 0xE0, 0xAE, 0xBE, 0xF6, 0xE0, 0xAF, 0x86,
+ 0xE0, 0xAF, 0x97, 0xF6, 0xE0, 0xB1, 0x86, 0xE0,
+ 0xB1, 0x96, 0xF6, 0xE0, 0xB2, 0xBF, 0xE0, 0xB3,
+ 0x95, 0xF6, 0xE0, 0xB3, 0x86, 0xE0, 0xB3, 0x95,
+ 0xF6, 0xE0, 0xB3, 0x86, 0xE0, 0xB3, 0x96, 0xF6,
+ 0xE0, 0xB3, 0x86, 0xE0, 0xB3, 0x82, 0xF6, 0xE0,
+ 0xB3, 0x86, 0xE0, 0xB3, 0x82, 0xE0, 0xB3, 0x95,
+ 0xF6, 0xE0, 0xB5, 0x86, 0xE0, 0xB4, 0xBE, 0xF6,
+ 0xE0, 0xB5, 0x87, 0xE0, 0xB4, 0xBE, 0xF6, 0xE0,
+ 0xB5, 0x86, 0xE0, 0xB5, 0x97, 0xF6, 0xE0, 0xB7,
+ 0x99, 0xE0, 0xB7, 0x8A, 0xF6, 0xE0, 0xB7, 0x99,
+ 0xE0, 0xB7, 0x8F, 0xF6, 0xE0, 0xB7, 0x99, 0xE0,
+ 0xB7, 0x8F, 0xE0, 0xB7, 0x8A, 0xF6, 0xE0, 0xB7,
+ 0x99, 0xE0, 0xB7, 0x9F, 0xE0, 0xB9, 0x8D, 0xE0,
+ 0xB8, 0xB2, 0xE0, 0xBB, 0x8D, 0xE0, 0xBA, 0xB2,
+ 0xE0, 0xBA, 0xAB, 0xE0, 0xBA, 0x99, 0xE0, 0xBA,
+ 0xAB, 0xE0, 0xBA, 0xA1, 0xE0, 0xBC, 0x8B, 0xF6,
+ 0xE0, 0xBD, 0x82, 0xE0, 0xBE, 0xB7, 0xF6, 0xE0,
+ 0xBD, 0x8C, 0xE0, 0xBE, 0xB7, 0xF6, 0xE0, 0xBD,
+ 0x91, 0xE0, 0xBE, 0xB7, 0xF6, 0xE0, 0xBD, 0x96,
+ 0xE0, 0xBE, 0xB7, 0xF6, 0xE0, 0xBD, 0x9B, 0xE0,
+ 0xBE, 0xB7, 0xF6, 0xE0, 0xBD, 0x80, 0xE0, 0xBE,
+ 0xB5, 0xF6, 0xE0, 0xBD, 0xB1, 0xE0, 0xBD, 0xB2,
+ 0xF6, 0xE0, 0xBD, 0xB1, 0xE0, 0xBD, 0xB4, 0xF6,
+ 0xE0, 0xBE, 0xB2, 0xE0, 0xBE, 0x80, 0xE0, 0xBE,
+ 0xB2, 0xE0, 0xBD, 0xB1, 0xE0, 0xBE, 0x80, 0xF6,
+ 0xE0, 0xBE, 0xB3, 0xE0, 0xBE, 0x80, 0xE0, 0xBE,
+ 0xB3, 0xE0, 0xBD, 0xB1, 0xE0, 0xBE, 0x80, 0xF6,
+ 0xE0, 0xBD, 0xB1, 0xE0, 0xBE, 0x80, 0xF6, 0xE0,
+ 0xBE, 0x92, 0xE0, 0xBE, 0xB7, 0xF6, 0xE0, 0xBE,
+ 0x9C, 0xE0, 0xBE, 0xB7, 0xF6, 0xE0, 0xBE, 0xA1,
+ 0xE0, 0xBE, 0xB7, 0xF6, 0xE0, 0xBE, 0xA6, 0xE0,
+ 0xBE, 0xB7, 0xF6, 0xE0, 0xBE, 0xAB, 0xE0, 0xBE,
+ 0xB7, 0xF6, 0xE0, 0xBE, 0x90, 0xE0, 0xBE, 0xB5,
+ 0xF6, 0xE1, 0x80, 0xA5, 0xE1, 0x80, 0xAE, 0xE1,
+ 0x83, 0x9C, 0xF6, 0xE1, 0xAC, 0x85, 0xE1, 0xAC,
+ 0xB5, 0xF6, 0xE1, 0xAC, 0x87, 0xE1, 0xAC, 0xB5,
+ 0xF6, 0xE1, 0xAC, 0x89, 0xE1, 0xAC, 0xB5, 0xF6,
+ 0xE1, 0xAC, 0x8B, 0xE1, 0xAC, 0xB5, 0xF6, 0xE1,
+ 0xAC, 0x8D, 0xE1, 0xAC, 0xB5, 0xF6, 0xE1, 0xAC,
+ 0x91, 0xE1, 0xAC, 0xB5, 0xF6, 0xE1, 0xAC, 0xBA,
+ 0xE1, 0xAC, 0xB5, 0xF6, 0xE1, 0xAC, 0xBC, 0xE1,
+ 0xAC, 0xB5, 0xF6, 0xE1, 0xAC, 0xBE, 0xE1, 0xAC,
+ 0xB5, 0xF6, 0xE1, 0xAC, 0xBF, 0xE1, 0xAC, 0xB5,
+ 0xF6, 0xE1, 0xAD, 0x82, 0xE1, 0xAC, 0xB5, 0x41,
+ 0xC3, 0x86, 0x42, 0x44, 0x45, 0xC6, 0x8E, 0x47,
+ 0x48, 0x49, 0x4A, 0x4B, 0x4C, 0x4D, 0x4E, 0x4F,
+ 0xC8, 0xA2, 0x50, 0x52, 0x54, 0x55, 0x57, 0x61,
+ 0xC9, 0x90, 0xC9, 0x91, 0xE1, 0xB4, 0x82, 0x62,
+ 0x64, 0x65, 0xC9, 0x99, 0xC9, 0x9B, 0xC9, 0x9C,
+ 0x67, 0x6B, 0x6D, 0xC5, 0x8B, 0x6F, 0xC9, 0x94,
+ 0xE1, 0xB4, 0x96, 0xE1, 0xB4, 0x97, 0x70, 0x74,
+ 0x75, 0xE1, 0xB4, 0x9D, 0xC9, 0xAF, 0x76, 0xE1,
+ 0xB4, 0xA5, 0xCE, 0xB2, 0xCE, 0xB3, 0xCE, 0xB4,
+ 0xCF, 0x86, 0xCF, 0x87, 0x69, 0x72, 0x75, 0x76,
+ 0xCE, 0xB2, 0xCE, 0xB3, 0xCF, 0x81, 0xCF, 0x86,
+ 0xCF, 0x87, 0xD0, 0xBD, 0xC9, 0x92, 0x63, 0xC9,
+ 0x95, 0xC3, 0xB0, 0xC9, 0x9C, 0x66, 0xC9, 0x9F,
+ 0xC9, 0xA1, 0xC9, 0xA5, 0xC9, 0xA8, 0xC9, 0xA9,
+ 0xC9, 0xAA, 0xE1, 0xB5, 0xBB, 0xCA, 0x9D, 0xC9,
+ 0xAD, 0xE1, 0xB6, 0x85, 0xCA, 0x9F, 0xC9, 0xB1,
+ 0xC9, 0xB0, 0xC9, 0xB2, 0xC9, 0xB3, 0xC9, 0xB4,
+ 0xC9, 0xB5, 0xC9, 0xB8, 0xCA, 0x82, 0xCA, 0x83,
+ 0xC6, 0xAB, 0xCA, 0x89, 0xCA, 0x8A, 0xE1, 0xB4,
+ 0x9C, 0xCA, 0x8B, 0xCA, 0x8C, 0x7A, 0xCA, 0x90,
+ 0xCA, 0x91, 0xCA, 0x92, 0xCE, 0xB8, 0xF6, 0x41,
+ 0xCC, 0xA5, 0xF6, 0x61, 0xCC, 0xA5, 0xF6, 0x42,
+ 0xCC, 0x87, 0xF6, 0x62, 0xCC, 0x87, 0xF6, 0x42,
+ 0xCC, 0xA3, 0xF6, 0x62, 0xCC, 0xA3, 0xF6, 0x42,
+ 0xCC, 0xB1, 0xF6, 0x62, 0xCC, 0xB1, 0xF6, 0x43,
+ 0xCC, 0xA7, 0xCC, 0x81, 0xF6, 0x63, 0xCC, 0xA7,
+ 0xCC, 0x81, 0xF6, 0x44, 0xCC, 0x87, 0xF6, 0x64,
+ 0xCC, 0x87, 0xF6, 0x44, 0xCC, 0xA3, 0xF6, 0x64,
+ 0xCC, 0xA3, 0xF6, 0x44, 0xCC, 0xB1, 0xF6, 0x64,
+ 0xCC, 0xB1, 0xF6, 0x44, 0xCC, 0xA7, 0xF6, 0x64,
+ 0xCC, 0xA7, 0xF6, 0x44, 0xCC, 0xAD, 0xF6, 0x64,
+ 0xCC, 0xAD, 0xF6, 0x45, 0xCC, 0x84, 0xCC, 0x80,
+ 0xF6, 0x65, 0xCC, 0x84, 0xCC, 0x80, 0xF6, 0x45,
+ 0xCC, 0x84, 0xCC, 0x81, 0xF6, 0x65, 0xCC, 0x84,
+ 0xCC, 0x81, 0xF6, 0x45, 0xCC, 0xAD, 0xF6, 0x65,
+ 0xCC, 0xAD, 0xF6, 0x45, 0xCC, 0xB0, 0xF6, 0x65,
+ 0xCC, 0xB0, 0xF6, 0x45, 0xCC, 0xA7, 0xCC, 0x86,
+ 0xF6, 0x65, 0xCC, 0xA7, 0xCC, 0x86, 0xF6, 0x46,
+ 0xCC, 0x87, 0xF6, 0x66, 0xCC, 0x87, 0xF6, 0x47,
+ 0xCC, 0x84, 0xF6, 0x67, 0xCC, 0x84, 0xF6, 0x48,
+ 0xCC, 0x87, 0xF6, 0x68, 0xCC, 0x87, 0xF6, 0x48,
+ 0xCC, 0xA3, 0xF6, 0x68, 0xCC, 0xA3, 0xF6, 0x48,
+ 0xCC, 0x88, 0xF6, 0x68, 0xCC, 0x88, 0xF6, 0x48,
+ 0xCC, 0xA7, 0xF6, 0x68, 0xCC, 0xA7, 0xF6, 0x48,
+ 0xCC, 0xAE, 0xF6, 0x68, 0xCC, 0xAE, 0xF6, 0x49,
+ 0xCC, 0xB0, 0xF6, 0x69, 0xCC, 0xB0, 0xF6, 0x49,
+ 0xCC, 0x88, 0xCC, 0x81, 0xF6, 0x69, 0xCC, 0x88,
+ 0xCC, 0x81, 0xF6, 0x4B, 0xCC, 0x81, 0xF6, 0x6B,
+ 0xCC, 0x81, 0xF6, 0x4B, 0xCC, 0xA3, 0xF6, 0x6B,
+ 0xCC, 0xA3, 0xF6, 0x4B, 0xCC, 0xB1, 0xF6, 0x6B,
+ 0xCC, 0xB1, 0xF6, 0x4C, 0xCC, 0xA3, 0xF6, 0x6C,
+ 0xCC, 0xA3, 0xF6, 0x4C, 0xCC, 0xA3, 0xCC, 0x84,
+ 0xF6, 0x6C, 0xCC, 0xA3, 0xCC, 0x84, 0xF6, 0x4C,
+ 0xCC, 0xB1, 0xF6, 0x6C, 0xCC, 0xB1, 0xF6, 0x4C,
+ 0xCC, 0xAD, 0xF6, 0x6C, 0xCC, 0xAD, 0xF6, 0x4D,
+ 0xCC, 0x81, 0xF6, 0x6D, 0xCC, 0x81, 0xF6, 0x4D,
+ 0xCC, 0x87, 0xF6, 0x6D, 0xCC, 0x87, 0xF6, 0x4D,
+ 0xCC, 0xA3, 0xF6, 0x6D, 0xCC, 0xA3, 0xF6, 0x4E,
+ 0xCC, 0x87, 0xF6, 0x6E, 0xCC, 0x87, 0xF6, 0x4E,
+ 0xCC, 0xA3, 0xF6, 0x6E, 0xCC, 0xA3, 0xF6, 0x4E,
+ 0xCC, 0xB1, 0xF6, 0x6E, 0xCC, 0xB1, 0xF6, 0x4E,
+ 0xCC, 0xAD, 0xF6, 0x6E, 0xCC, 0xAD, 0xF6, 0x4F,
+ 0xCC, 0x83, 0xCC, 0x81, 0xF6, 0x6F, 0xCC, 0x83,
+ 0xCC, 0x81, 0xF6, 0x4F, 0xCC, 0x83, 0xCC, 0x88,
+ 0xF6, 0x6F, 0xCC, 0x83, 0xCC, 0x88, 0xF6, 0x4F,
+ 0xCC, 0x84, 0xCC, 0x80, 0xF6, 0x6F, 0xCC, 0x84,
+ 0xCC, 0x80, 0xF6, 0x4F, 0xCC, 0x84, 0xCC, 0x81,
+ 0xF6, 0x6F, 0xCC, 0x84, 0xCC, 0x81, 0xF6, 0x50,
+ 0xCC, 0x81, 0xF6, 0x70, 0xCC, 0x81, 0xF6, 0x50,
+ 0xCC, 0x87, 0xF6, 0x70, 0xCC, 0x87, 0xF6, 0x52,
+ 0xCC, 0x87, 0xF6, 0x72, 0xCC, 0x87, 0xF6, 0x52,
+ 0xCC, 0xA3, 0xF6, 0x72, 0xCC, 0xA3, 0xF6, 0x52,
+ 0xCC, 0xA3, 0xCC, 0x84, 0xF6, 0x72, 0xCC, 0xA3,
+ 0xCC, 0x84, 0xF6, 0x52, 0xCC, 0xB1, 0xF6, 0x72,
+ 0xCC, 0xB1, 0xF6, 0x53, 0xCC, 0x87, 0xF6, 0x73,
+ 0xCC, 0x87, 0xF6, 0x53, 0xCC, 0xA3, 0xF6, 0x73,
+ 0xCC, 0xA3, 0xF6, 0x53, 0xCC, 0x81, 0xCC, 0x87,
+ 0xF6, 0x73, 0xCC, 0x81, 0xCC, 0x87, 0xF6, 0x53,
+ 0xCC, 0x8C, 0xCC, 0x87, 0xF6, 0x73, 0xCC, 0x8C,
+ 0xCC, 0x87, 0xF6, 0x53, 0xCC, 0xA3, 0xCC, 0x87,
+ 0xF6, 0x73, 0xCC, 0xA3, 0xCC, 0x87, 0xF6, 0x54,
+ 0xCC, 0x87, 0xF6, 0x74, 0xCC, 0x87, 0xF6, 0x54,
+ 0xCC, 0xA3, 0xF6, 0x74, 0xCC, 0xA3, 0xF6, 0x54,
+ 0xCC, 0xB1, 0xF6, 0x74, 0xCC, 0xB1, 0xF6, 0x54,
+ 0xCC, 0xAD, 0xF6, 0x74, 0xCC, 0xAD, 0xF6, 0x55,
+ 0xCC, 0xA4, 0xF6, 0x75, 0xCC, 0xA4, 0xF6, 0x55,
+ 0xCC, 0xB0, 0xF6, 0x75, 0xCC, 0xB0, 0xF6, 0x55,
+ 0xCC, 0xAD, 0xF6, 0x75, 0xCC, 0xAD, 0xF6, 0x55,
+ 0xCC, 0x83, 0xCC, 0x81, 0xF6, 0x75, 0xCC, 0x83,
+ 0xCC, 0x81, 0xF6, 0x55, 0xCC, 0x84, 0xCC, 0x88,
+ 0xF6, 0x75, 0xCC, 0x84, 0xCC, 0x88, 0xF6, 0x56,
+ 0xCC, 0x83, 0xF6, 0x76, 0xCC, 0x83, 0xF6, 0x56,
+ 0xCC, 0xA3, 0xF6, 0x76, 0xCC, 0xA3, 0xF6, 0x57,
+ 0xCC, 0x80, 0xF6, 0x77, 0xCC, 0x80, 0xF6, 0x57,
+ 0xCC, 0x81, 0xF6, 0x77, 0xCC, 0x81, 0xF6, 0x57,
+ 0xCC, 0x88, 0xF6, 0x77, 0xCC, 0x88, 0xF6, 0x57,
+ 0xCC, 0x87, 0xF6, 0x77, 0xCC, 0x87, 0xF6, 0x57,
+ 0xCC, 0xA3, 0xF6, 0x77, 0xCC, 0xA3, 0xF6, 0x58,
+ 0xCC, 0x87, 0xF6, 0x78, 0xCC, 0x87, 0xF6, 0x58,
+ 0xCC, 0x88, 0xF6, 0x78, 0xCC, 0x88, 0xF6, 0x59,
+ 0xCC, 0x87, 0xF6, 0x79, 0xCC, 0x87, 0xF6, 0x5A,
+ 0xCC, 0x82, 0xF6, 0x7A, 0xCC, 0x82, 0xF6, 0x5A,
+ 0xCC, 0xA3, 0xF6, 0x7A, 0xCC, 0xA3, 0xF6, 0x5A,
+ 0xCC, 0xB1, 0xF6, 0x7A, 0xCC, 0xB1, 0xF6, 0x68,
+ 0xCC, 0xB1, 0xF6, 0x74, 0xCC, 0x88, 0xF6, 0x77,
+ 0xCC, 0x8A, 0xF6, 0x79, 0xCC, 0x8A, 0x61, 0xCA,
+ 0xBE, 0xF5, 0x05, 0xC5, 0xBF, 0xCC, 0x87, 0x73,
+ 0xCC, 0x87, 0xF6, 0x41, 0xCC, 0xA3, 0xF6, 0x61,
+ 0xCC, 0xA3, 0xF6, 0x41, 0xCC, 0x89, 0xF6, 0x61,
+ 0xCC, 0x89, 0xF6, 0x41, 0xCC, 0x82, 0xCC, 0x81,
+ 0xF6, 0x61, 0xCC, 0x82, 0xCC, 0x81, 0xF6, 0x41,
+ 0xCC, 0x82, 0xCC, 0x80, 0xF6, 0x61, 0xCC, 0x82,
+ 0xCC, 0x80, 0xF6, 0x41, 0xCC, 0x82, 0xCC, 0x89,
+ 0xF6, 0x61, 0xCC, 0x82, 0xCC, 0x89, 0xF6, 0x41,
+ 0xCC, 0x82, 0xCC, 0x83, 0xF6, 0x61, 0xCC, 0x82,
+ 0xCC, 0x83, 0xF6, 0x41, 0xCC, 0xA3, 0xCC, 0x82,
+ 0xF6, 0x61, 0xCC, 0xA3, 0xCC, 0x82, 0xF6, 0x41,
+ 0xCC, 0x86, 0xCC, 0x81, 0xF6, 0x61, 0xCC, 0x86,
+ 0xCC, 0x81, 0xF6, 0x41, 0xCC, 0x86, 0xCC, 0x80,
+ 0xF6, 0x61, 0xCC, 0x86, 0xCC, 0x80, 0xF6, 0x41,
+ 0xCC, 0x86, 0xCC, 0x89, 0xF6, 0x61, 0xCC, 0x86,
+ 0xCC, 0x89, 0xF6, 0x41, 0xCC, 0x86, 0xCC, 0x83,
+ 0xF6, 0x61, 0xCC, 0x86, 0xCC, 0x83, 0xF6, 0x41,
+ 0xCC, 0xA3, 0xCC, 0x86, 0xF6, 0x61, 0xCC, 0xA3,
+ 0xCC, 0x86, 0xF6, 0x45, 0xCC, 0xA3, 0xF6, 0x65,
+ 0xCC, 0xA3, 0xF6, 0x45, 0xCC, 0x89, 0xF6, 0x65,
+ 0xCC, 0x89, 0xF6, 0x45, 0xCC, 0x83, 0xF6, 0x65,
+ 0xCC, 0x83, 0xF6, 0x45, 0xCC, 0x82, 0xCC, 0x81,
+ 0xF6, 0x65, 0xCC, 0x82, 0xCC, 0x81, 0xF6, 0x45,
+ 0xCC, 0x82, 0xCC, 0x80, 0xF6, 0x65, 0xCC, 0x82,
+ 0xCC, 0x80, 0xF6, 0x45, 0xCC, 0x82, 0xCC, 0x89,
+ 0xF6, 0x65, 0xCC, 0x82, 0xCC, 0x89, 0xF6, 0x45,
+ 0xCC, 0x82, 0xCC, 0x83, 0xF6, 0x65, 0xCC, 0x82,
+ 0xCC, 0x83, 0xF6, 0x45, 0xCC, 0xA3, 0xCC, 0x82,
+ 0xF6, 0x65, 0xCC, 0xA3, 0xCC, 0x82, 0xF6, 0x49,
+ 0xCC, 0x89, 0xF6, 0x69, 0xCC, 0x89, 0xF6, 0x49,
+ 0xCC, 0xA3, 0xF6, 0x69, 0xCC, 0xA3, 0xF6, 0x4F,
+ 0xCC, 0xA3, 0xF6, 0x6F, 0xCC, 0xA3, 0xF6, 0x4F,
+ 0xCC, 0x89, 0xF6, 0x6F, 0xCC, 0x89, 0xF6, 0x4F,
+ 0xCC, 0x82, 0xCC, 0x81, 0xF6, 0x6F, 0xCC, 0x82,
+ 0xCC, 0x81, 0xF6, 0x4F, 0xCC, 0x82, 0xCC, 0x80,
+ 0xF6, 0x6F, 0xCC, 0x82, 0xCC, 0x80, 0xF6, 0x4F,
+ 0xCC, 0x82, 0xCC, 0x89, 0xF6, 0x6F, 0xCC, 0x82,
+ 0xCC, 0x89, 0xF6, 0x4F, 0xCC, 0x82, 0xCC, 0x83,
+ 0xF6, 0x6F, 0xCC, 0x82, 0xCC, 0x83, 0xF6, 0x4F,
+ 0xCC, 0xA3, 0xCC, 0x82, 0xF6, 0x6F, 0xCC, 0xA3,
+ 0xCC, 0x82, 0xF6, 0x4F, 0xCC, 0x9B, 0xCC, 0x81,
+ 0xF6, 0x6F, 0xCC, 0x9B, 0xCC, 0x81, 0xF6, 0x4F,
+ 0xCC, 0x9B, 0xCC, 0x80, 0xF6, 0x6F, 0xCC, 0x9B,
+ 0xCC, 0x80, 0xF6, 0x4F, 0xCC, 0x9B, 0xCC, 0x89,
+ 0xF6, 0x6F, 0xCC, 0x9B, 0xCC, 0x89, 0xF6, 0x4F,
+ 0xCC, 0x9B, 0xCC, 0x83, 0xF6, 0x6F, 0xCC, 0x9B,
+ 0xCC, 0x83, 0xF6, 0x4F, 0xCC, 0x9B, 0xCC, 0xA3,
+ 0xF6, 0x6F, 0xCC, 0x9B, 0xCC, 0xA3, 0xF6, 0x55,
+ 0xCC, 0xA3, 0xF6, 0x75, 0xCC, 0xA3, 0xF6, 0x55,
+ 0xCC, 0x89, 0xF6, 0x75, 0xCC, 0x89, 0xF6, 0x55,
+ 0xCC, 0x9B, 0xCC, 0x81, 0xF6, 0x75, 0xCC, 0x9B,
+ 0xCC, 0x81, 0xF6, 0x55, 0xCC, 0x9B, 0xCC, 0x80,
+ 0xF6, 0x75, 0xCC, 0x9B, 0xCC, 0x80, 0xF6, 0x55,
+ 0xCC, 0x9B, 0xCC, 0x89, 0xF6, 0x75, 0xCC, 0x9B,
+ 0xCC, 0x89, 0xF6, 0x55, 0xCC, 0x9B, 0xCC, 0x83,
+ 0xF6, 0x75, 0xCC, 0x9B, 0xCC, 0x83, 0xF6, 0x55,
+ 0xCC, 0x9B, 0xCC, 0xA3, 0xF6, 0x75, 0xCC, 0x9B,
+ 0xCC, 0xA3, 0xF6, 0x59, 0xCC, 0x80, 0xF6, 0x79,
+ 0xCC, 0x80, 0xF6, 0x59, 0xCC, 0xA3, 0xF6, 0x79,
+ 0xCC, 0xA3, 0xF6, 0x59, 0xCC, 0x89, 0xF6, 0x79,
+ 0xCC, 0x89, 0xF6, 0x59, 0xCC, 0x83, 0xF6, 0x79,
+ 0xCC, 0x83, 0xF6, 0xCE, 0xB1, 0xCC, 0x93, 0xF6,
+ 0xCE, 0xB1, 0xCC, 0x94, 0xF6, 0xCE, 0xB1, 0xCC,
+ 0x93, 0xCC, 0x80, 0xF6, 0xCE, 0xB1, 0xCC, 0x94,
+ 0xCC, 0x80, 0xF6, 0xCE, 0xB1, 0xCC, 0x93, 0xCC,
+ 0x81, 0xF6, 0xCE, 0xB1, 0xCC, 0x94, 0xCC, 0x81,
+ 0xF6, 0xCE, 0xB1, 0xCC, 0x93, 0xCD, 0x82, 0xF6,
+ 0xCE, 0xB1, 0xCC, 0x94, 0xCD, 0x82, 0xF6, 0xCE,
+ 0x91, 0xCC, 0x93, 0xF6, 0xCE, 0x91, 0xCC, 0x94,
+ 0xF6, 0xCE, 0x91, 0xCC, 0x93, 0xCC, 0x80, 0xF6,
+ 0xCE, 0x91, 0xCC, 0x94, 0xCC, 0x80, 0xF6, 0xCE,
+ 0x91, 0xCC, 0x93, 0xCC, 0x81, 0xF6, 0xCE, 0x91,
+ 0xCC, 0x94, 0xCC, 0x81, 0xF6, 0xCE, 0x91, 0xCC,
+ 0x93, 0xCD, 0x82, 0xF6, 0xCE, 0x91, 0xCC, 0x94,
+ 0xCD, 0x82, 0xF6, 0xCE, 0xB5, 0xCC, 0x93, 0xF6,
+ 0xCE, 0xB5, 0xCC, 0x94, 0xF6, 0xCE, 0xB5, 0xCC,
+ 0x93, 0xCC, 0x80, 0xF6, 0xCE, 0xB5, 0xCC, 0x94,
+ 0xCC, 0x80, 0xF6, 0xCE, 0xB5, 0xCC, 0x93, 0xCC,
+ 0x81, 0xF6, 0xCE, 0xB5, 0xCC, 0x94, 0xCC, 0x81,
+ 0xF6, 0xCE, 0x95, 0xCC, 0x93, 0xF6, 0xCE, 0x95,
+ 0xCC, 0x94, 0xF6, 0xCE, 0x95, 0xCC, 0x93, 0xCC,
+ 0x80, 0xF6, 0xCE, 0x95, 0xCC, 0x94, 0xCC, 0x80,
+ 0xF6, 0xCE, 0x95, 0xCC, 0x93, 0xCC, 0x81, 0xF6,
+ 0xCE, 0x95, 0xCC, 0x94, 0xCC, 0x81, 0xF6, 0xCE,
+ 0xB7, 0xCC, 0x93, 0xF6, 0xCE, 0xB7, 0xCC, 0x94,
+ 0xF6, 0xCE, 0xB7, 0xCC, 0x93, 0xCC, 0x80, 0xF6,
+ 0xCE, 0xB7, 0xCC, 0x94, 0xCC, 0x80, 0xF6, 0xCE,
+ 0xB7, 0xCC, 0x93, 0xCC, 0x81, 0xF6, 0xCE, 0xB7,
+ 0xCC, 0x94, 0xCC, 0x81, 0xF6, 0xCE, 0xB7, 0xCC,
+ 0x93, 0xCD, 0x82, 0xF6, 0xCE, 0xB7, 0xCC, 0x94,
+ 0xCD, 0x82, 0xF6, 0xCE, 0x97, 0xCC, 0x93, 0xF6,
+ 0xCE, 0x97, 0xCC, 0x94, 0xF6, 0xCE, 0x97, 0xCC,
+ 0x93, 0xCC, 0x80, 0xF6, 0xCE, 0x97, 0xCC, 0x94,
+ 0xCC, 0x80, 0xF6, 0xCE, 0x97, 0xCC, 0x93, 0xCC,
+ 0x81, 0xF6, 0xCE, 0x97, 0xCC, 0x94, 0xCC, 0x81,
+ 0xF6, 0xCE, 0x97, 0xCC, 0x93, 0xCD, 0x82, 0xF6,
+ 0xCE, 0x97, 0xCC, 0x94, 0xCD, 0x82, 0xF6, 0xCE,
+ 0xB9, 0xCC, 0x93, 0xF6, 0xCE, 0xB9, 0xCC, 0x94,
+ 0xF6, 0xCE, 0xB9, 0xCC, 0x93, 0xCC, 0x80, 0xF6,
+ 0xCE, 0xB9, 0xCC, 0x94, 0xCC, 0x80, 0xF6, 0xCE,
+ 0xB9, 0xCC, 0x93, 0xCC, 0x81, 0xF6, 0xCE, 0xB9,
+ 0xCC, 0x94, 0xCC, 0x81, 0xF6, 0xCE, 0xB9, 0xCC,
+ 0x93, 0xCD, 0x82, 0xF6, 0xCE, 0xB9, 0xCC, 0x94,
+ 0xCD, 0x82, 0xF6, 0xCE, 0x99, 0xCC, 0x93, 0xF6,
+ 0xCE, 0x99, 0xCC, 0x94, 0xF6, 0xCE, 0x99, 0xCC,
+ 0x93, 0xCC, 0x80, 0xF6, 0xCE, 0x99, 0xCC, 0x94,
+ 0xCC, 0x80, 0xF6, 0xCE, 0x99, 0xCC, 0x93, 0xCC,
+ 0x81, 0xF6, 0xCE, 0x99, 0xCC, 0x94, 0xCC, 0x81,
+ 0xF6, 0xCE, 0x99, 0xCC, 0x93, 0xCD, 0x82, 0xF6,
+ 0xCE, 0x99, 0xCC, 0x94, 0xCD, 0x82, 0xF6, 0xCE,
+ 0xBF, 0xCC, 0x93, 0xF6, 0xCE, 0xBF, 0xCC, 0x94,
+ 0xF6, 0xCE, 0xBF, 0xCC, 0x93, 0xCC, 0x80, 0xF6,
+ 0xCE, 0xBF, 0xCC, 0x94, 0xCC, 0x80, 0xF6, 0xCE,
+ 0xBF, 0xCC, 0x93, 0xCC, 0x81, 0xF6, 0xCE, 0xBF,
+ 0xCC, 0x94, 0xCC, 0x81, 0xF6, 0xCE, 0x9F, 0xCC,
+ 0x93, 0xF6, 0xCE, 0x9F, 0xCC, 0x94, 0xF6, 0xCE,
+ 0x9F, 0xCC, 0x93, 0xCC, 0x80, 0xF6, 0xCE, 0x9F,
+ 0xCC, 0x94, 0xCC, 0x80, 0xF6, 0xCE, 0x9F, 0xCC,
+ 0x93, 0xCC, 0x81, 0xF6, 0xCE, 0x9F, 0xCC, 0x94,
+ 0xCC, 0x81, 0xF6, 0xCF, 0x85, 0xCC, 0x93, 0xF6,
+ 0xCF, 0x85, 0xCC, 0x94, 0xF6, 0xCF, 0x85, 0xCC,
+ 0x93, 0xCC, 0x80, 0xF6, 0xCF, 0x85, 0xCC, 0x94,
+ 0xCC, 0x80, 0xF6, 0xCF, 0x85, 0xCC, 0x93, 0xCC,
+ 0x81, 0xF6, 0xCF, 0x85, 0xCC, 0x94, 0xCC, 0x81,
+ 0xF6, 0xCF, 0x85, 0xCC, 0x93, 0xCD, 0x82, 0xF6,
+ 0xCF, 0x85, 0xCC, 0x94, 0xCD, 0x82, 0xF6, 0xCE,
+ 0xA5, 0xCC, 0x94, 0xF6, 0xCE, 0xA5, 0xCC, 0x94,
+ 0xCC, 0x80, 0xF6, 0xCE, 0xA5, 0xCC, 0x94, 0xCC,
+ 0x81, 0xF6, 0xCE, 0xA5, 0xCC, 0x94, 0xCD, 0x82,
+ 0xF6, 0xCF, 0x89, 0xCC, 0x93, 0xF6, 0xCF, 0x89,
+ 0xCC, 0x94, 0xF6, 0xCF, 0x89, 0xCC, 0x93, 0xCC,
+ 0x80, 0xF6, 0xCF, 0x89, 0xCC, 0x94, 0xCC, 0x80,
+ 0xF6, 0xCF, 0x89, 0xCC, 0x93, 0xCC, 0x81, 0xF6,
+ 0xCF, 0x89, 0xCC, 0x94, 0xCC, 0x81, 0xF6, 0xCF,
+ 0x89, 0xCC, 0x93, 0xCD, 0x82, 0xF6, 0xCF, 0x89,
+ 0xCC, 0x94, 0xCD, 0x82, 0xF6, 0xCE, 0xA9, 0xCC,
+ 0x93, 0xF6, 0xCE, 0xA9, 0xCC, 0x94, 0xF6, 0xCE,
+ 0xA9, 0xCC, 0x93, 0xCC, 0x80, 0xF6, 0xCE, 0xA9,
+ 0xCC, 0x94, 0xCC, 0x80, 0xF6, 0xCE, 0xA9, 0xCC,
+ 0x93, 0xCC, 0x81, 0xF6, 0xCE, 0xA9, 0xCC, 0x94,
+ 0xCC, 0x81, 0xF6, 0xCE, 0xA9, 0xCC, 0x93, 0xCD,
+ 0x82, 0xF6, 0xCE, 0xA9, 0xCC, 0x94, 0xCD, 0x82,
+ 0xF6, 0xCE, 0xB1, 0xCC, 0x80, 0xF6, 0xCE, 0xB1,
+ 0xCC, 0x81, 0xF6, 0xCE, 0xB5, 0xCC, 0x80, 0xF6,
+ 0xCE, 0xB5, 0xCC, 0x81, 0xF6, 0xCE, 0xB7, 0xCC,
+ 0x80, 0xF6, 0xCE, 0xB7, 0xCC, 0x81, 0xF6, 0xCE,
+ 0xB9, 0xCC, 0x80, 0xF6, 0xCE, 0xB9, 0xCC, 0x81,
+ 0xF6, 0xCE, 0xBF, 0xCC, 0x80, 0xF6, 0xCE, 0xBF,
+ 0xCC, 0x81, 0xF6, 0xCF, 0x85, 0xCC, 0x80, 0xF6,
+ 0xCF, 0x85, 0xCC, 0x81, 0xF6, 0xCF, 0x89, 0xCC,
+ 0x80, 0xF6, 0xCF, 0x89, 0xCC, 0x81, 0xF6, 0xCE,
+ 0xB1, 0xCC, 0x93, 0xCD, 0x85, 0xF6, 0xCE, 0xB1,
+ 0xCC, 0x94, 0xCD, 0x85, 0xF6, 0xCE, 0xB1, 0xCC,
+ 0x93, 0xCC, 0x80, 0xCD, 0x85, 0xF6, 0xCE, 0xB1,
+ 0xCC, 0x94, 0xCC, 0x80, 0xCD, 0x85, 0xF6, 0xCE,
+ 0xB1, 0xCC, 0x93, 0xCC, 0x81, 0xCD, 0x85, 0xF6,
+ 0xCE, 0xB1, 0xCC, 0x94, 0xCC, 0x81, 0xCD, 0x85,
+ 0xF6, 0xCE, 0xB1, 0xCC, 0x93, 0xCD, 0x82, 0xCD,
+ 0x85, 0xF6, 0xCE, 0xB1, 0xCC, 0x94, 0xCD, 0x82,
+ 0xCD, 0x85, 0xF6, 0xCE, 0x91, 0xCC, 0x93, 0xCD,
+ 0x85, 0xF6, 0xCE, 0x91, 0xCC, 0x94, 0xCD, 0x85,
+ 0xF6, 0xCE, 0x91, 0xCC, 0x93, 0xCC, 0x80, 0xCD,
+ 0x85, 0xF6, 0xCE, 0x91, 0xCC, 0x94, 0xCC, 0x80,
+ 0xCD, 0x85, 0xF6, 0xCE, 0x91, 0xCC, 0x93, 0xCC,
+ 0x81, 0xCD, 0x85, 0xF6, 0xCE, 0x91, 0xCC, 0x94,
+ 0xCC, 0x81, 0xCD, 0x85, 0xF6, 0xCE, 0x91, 0xCC,
+ 0x93, 0xCD, 0x82, 0xCD, 0x85, 0xF6, 0xCE, 0x91,
+ 0xCC, 0x94, 0xCD, 0x82, 0xCD, 0x85, 0xF6, 0xCE,
+ 0xB7, 0xCC, 0x93, 0xCD, 0x85, 0xF6, 0xCE, 0xB7,
+ 0xCC, 0x94, 0xCD, 0x85, 0xF6, 0xCE, 0xB7, 0xCC,
+ 0x93, 0xCC, 0x80, 0xCD, 0x85, 0xF6, 0xCE, 0xB7,
+ 0xCC, 0x94, 0xCC, 0x80, 0xCD, 0x85, 0xF6, 0xCE,
+ 0xB7, 0xCC, 0x93, 0xCC, 0x81, 0xCD, 0x85, 0xF6,
+ 0xCE, 0xB7, 0xCC, 0x94, 0xCC, 0x81, 0xCD, 0x85,
+ 0xF6, 0xCE, 0xB7, 0xCC, 0x93, 0xCD, 0x82, 0xCD,
+ 0x85, 0xF6, 0xCE, 0xB7, 0xCC, 0x94, 0xCD, 0x82,
+ 0xCD, 0x85, 0xF6, 0xCE, 0x97, 0xCC, 0x93, 0xCD,
+ 0x85, 0xF6, 0xCE, 0x97, 0xCC, 0x94, 0xCD, 0x85,
+ 0xF6, 0xCE, 0x97, 0xCC, 0x93, 0xCC, 0x80, 0xCD,
+ 0x85, 0xF6, 0xCE, 0x97, 0xCC, 0x94, 0xCC, 0x80,
+ 0xCD, 0x85, 0xF6, 0xCE, 0x97, 0xCC, 0x93, 0xCC,
+ 0x81, 0xCD, 0x85, 0xF6, 0xCE, 0x97, 0xCC, 0x94,
+ 0xCC, 0x81, 0xCD, 0x85, 0xF6, 0xCE, 0x97, 0xCC,
+ 0x93, 0xCD, 0x82, 0xCD, 0x85, 0xF6, 0xCE, 0x97,
+ 0xCC, 0x94, 0xCD, 0x82, 0xCD, 0x85, 0xF6, 0xCF,
+ 0x89, 0xCC, 0x93, 0xCD, 0x85, 0xF6, 0xCF, 0x89,
+ 0xCC, 0x94, 0xCD, 0x85, 0xF6, 0xCF, 0x89, 0xCC,
+ 0x93, 0xCC, 0x80, 0xCD, 0x85, 0xF6, 0xCF, 0x89,
+ 0xCC, 0x94, 0xCC, 0x80, 0xCD, 0x85, 0xF6, 0xCF,
+ 0x89, 0xCC, 0x93, 0xCC, 0x81, 0xCD, 0x85, 0xF6,
+ 0xCF, 0x89, 0xCC, 0x94, 0xCC, 0x81, 0xCD, 0x85,
+ 0xF6, 0xCF, 0x89, 0xCC, 0x93, 0xCD, 0x82, 0xCD,
+ 0x85, 0xF6, 0xCF, 0x89, 0xCC, 0x94, 0xCD, 0x82,
+ 0xCD, 0x85, 0xF6, 0xCE, 0xA9, 0xCC, 0x93, 0xCD,
+ 0x85, 0xF6, 0xCE, 0xA9, 0xCC, 0x94, 0xCD, 0x85,
+ 0xF6, 0xCE, 0xA9, 0xCC, 0x93, 0xCC, 0x80, 0xCD,
+ 0x85, 0xF6, 0xCE, 0xA9, 0xCC, 0x94, 0xCC, 0x80,
+ 0xCD, 0x85, 0xF6, 0xCE, 0xA9, 0xCC, 0x93, 0xCC,
+ 0x81, 0xCD, 0x85, 0xF6, 0xCE, 0xA9, 0xCC, 0x94,
+ 0xCC, 0x81, 0xCD, 0x85, 0xF6, 0xCE, 0xA9, 0xCC,
+ 0x93, 0xCD, 0x82, 0xCD, 0x85, 0xF6, 0xCE, 0xA9,
+ 0xCC, 0x94, 0xCD, 0x82, 0xCD, 0x85, 0xF6, 0xCE,
+ 0xB1, 0xCC, 0x86, 0xF6, 0xCE, 0xB1, 0xCC, 0x84,
+ 0xF6, 0xCE, 0xB1, 0xCC, 0x80, 0xCD, 0x85, 0xF6,
+ 0xCE, 0xB1, 0xCD, 0x85, 0xF6, 0xCE, 0xB1, 0xCC,
+ 0x81, 0xCD, 0x85, 0xF6, 0xCE, 0xB1, 0xCD, 0x82,
+ 0xF6, 0xCE, 0xB1, 0xCD, 0x82, 0xCD, 0x85, 0xF6,
+ 0xCE, 0x91, 0xCC, 0x86, 0xF6, 0xCE, 0x91, 0xCC,
+ 0x84, 0xF6, 0xCE, 0x91, 0xCC, 0x80, 0xF6, 0xCE,
+ 0x91, 0xCC, 0x81, 0xF6, 0xCE, 0x91, 0xCD, 0x85,
+ 0x20, 0xCC, 0x93, 0xF6, 0xCE, 0xB9, 0x20, 0xCC,
+ 0x93, 0x20, 0xCD, 0x82, 0xF5, 0x05, 0xC2, 0xA8,
+ 0xCD, 0x82, 0x20, 0xCC, 0x88, 0xCD, 0x82, 0xF6,
+ 0xCE, 0xB7, 0xCC, 0x80, 0xCD, 0x85, 0xF6, 0xCE,
+ 0xB7, 0xCD, 0x85, 0xF6, 0xCE, 0xB7, 0xCC, 0x81,
+ 0xCD, 0x85, 0xF6, 0xCE, 0xB7, 0xCD, 0x82, 0xF6,
+ 0xCE, 0xB7, 0xCD, 0x82, 0xCD, 0x85, 0xF6, 0xCE,
+ 0x95, 0xCC, 0x80, 0xF6, 0xCE, 0x95, 0xCC, 0x81,
+ 0xF6, 0xCE, 0x97, 0xCC, 0x80, 0xF6, 0xCE, 0x97,
+ 0xCC, 0x81, 0xF6, 0xCE, 0x97, 0xCD, 0x85, 0xF5,
+ 0x06, 0xE1, 0xBE, 0xBF, 0xCC, 0x80, 0x20, 0xCC,
+ 0x93, 0xCC, 0x80, 0xF5, 0x06, 0xE1, 0xBE, 0xBF,
+ 0xCC, 0x81, 0x20, 0xCC, 0x93, 0xCC, 0x81, 0xF5,
+ 0x06, 0xE1, 0xBE, 0xBF, 0xCD, 0x82, 0x20, 0xCC,
+ 0x93, 0xCD, 0x82, 0xF6, 0xCE, 0xB9, 0xCC, 0x86,
+ 0xF6, 0xCE, 0xB9, 0xCC, 0x84, 0xF6, 0xCE, 0xB9,
+ 0xCC, 0x88, 0xCC, 0x80, 0xF6, 0xCE, 0xB9, 0xCC,
+ 0x88, 0xCC, 0x81, 0xF6, 0xCE, 0xB9, 0xCD, 0x82,
+ 0xF6, 0xCE, 0xB9, 0xCC, 0x88, 0xCD, 0x82, 0xF6,
+ 0xCE, 0x99, 0xCC, 0x86, 0xF6, 0xCE, 0x99, 0xCC,
+ 0x84, 0xF6, 0xCE, 0x99, 0xCC, 0x80, 0xF6, 0xCE,
+ 0x99, 0xCC, 0x81, 0xF5, 0x06, 0xE1, 0xBF, 0xBE,
+ 0xCC, 0x80, 0x20, 0xCC, 0x94, 0xCC, 0x80, 0xF5,
+ 0x06, 0xE1, 0xBF, 0xBE, 0xCC, 0x81, 0x20, 0xCC,
+ 0x94, 0xCC, 0x81, 0xF5, 0x06, 0xE1, 0xBF, 0xBE,
+ 0xCD, 0x82, 0x20, 0xCC, 0x94, 0xCD, 0x82, 0xF6,
+ 0xCF, 0x85, 0xCC, 0x86, 0xF6, 0xCF, 0x85, 0xCC,
+ 0x84, 0xF6, 0xCF, 0x85, 0xCC, 0x88, 0xCC, 0x80,
+ 0xF6, 0xCF, 0x85, 0xCC, 0x88, 0xCC, 0x81, 0xF6,
+ 0xCF, 0x81, 0xCC, 0x93, 0xF6, 0xCF, 0x81, 0xCC,
+ 0x94, 0xF6, 0xCF, 0x85, 0xCD, 0x82, 0xF6, 0xCF,
+ 0x85, 0xCC, 0x88, 0xCD, 0x82, 0xF6, 0xCE, 0xA5,
+ 0xCC, 0x86, 0xF6, 0xCE, 0xA5, 0xCC, 0x84, 0xF6,
+ 0xCE, 0xA5, 0xCC, 0x80, 0xF6, 0xCE, 0xA5, 0xCC,
+ 0x81, 0xF6, 0xCE, 0xA1, 0xCC, 0x94, 0xF5, 0x05,
+ 0xC2, 0xA8, 0xCC, 0x80, 0x20, 0xCC, 0x88, 0xCC,
+ 0x80, 0xF5, 0x05, 0xC2, 0xA8, 0xCC, 0x81, 0x20,
+ 0xCC, 0x88, 0xCC, 0x81, 0xF6, 0x60, 0xF6, 0xCF,
+ 0x89, 0xCC, 0x80, 0xCD, 0x85, 0xF6, 0xCF, 0x89,
+ 0xCD, 0x85, 0xF6, 0xCF, 0x89, 0xCC, 0x81, 0xCD,
+ 0x85, 0xF6, 0xCF, 0x89, 0xCD, 0x82, 0xF6, 0xCF,
+ 0x89, 0xCD, 0x82, 0xCD, 0x85, 0xF6, 0xCE, 0x9F,
+ 0xCC, 0x80, 0xF6, 0xCE, 0x9F, 0xCC, 0x81, 0xF6,
+ 0xCE, 0xA9, 0xCC, 0x80, 0xF6, 0xCE, 0xA9, 0xCC,
+ 0x81, 0xF6, 0xCE, 0xA9, 0xCD, 0x85, 0xF5, 0x03,
+ 0xC2, 0xB4, 0x20, 0xCC, 0x81, 0x20, 0xCC, 0x94,
+ 0xF5, 0x04, 0xE2, 0x80, 0x82, 0x20, 0xF5, 0x04,
+ 0xE2, 0x80, 0x83, 0x20, 0x20, 0x20, 0x20, 0x20,
+ 0x20, 0x20, 0x20, 0x20, 0x20, 0xE2, 0x80, 0x90,
+ 0x20, 0xCC, 0xB3, 0x2E, 0x2E, 0x2E, 0x2E, 0x2E,
+ 0x2E, 0x20, 0xE2, 0x80, 0xB2, 0xE2, 0x80, 0xB2,
+ 0xE2, 0x80, 0xB2, 0xE2, 0x80, 0xB2, 0xE2, 0x80,
+ 0xB2, 0xE2, 0x80, 0xB5, 0xE2, 0x80, 0xB5, 0xE2,
+ 0x80, 0xB5, 0xE2, 0x80, 0xB5, 0xE2, 0x80, 0xB5,
+ 0x21, 0x21, 0x20, 0xCC, 0x85, 0x3F, 0x3F, 0x3F,
+ 0x21, 0x21, 0x3F, 0xE2, 0x80, 0xB2, 0xE2, 0x80,
+ 0xB2, 0xE2, 0x80, 0xB2, 0xE2, 0x80, 0xB2, 0x20,
+ 0x30, 0x69, 0x34, 0x35, 0x36, 0x37, 0x38, 0x39,
+ 0x2B, 0xE2, 0x88, 0x92, 0x3D, 0x28, 0x29, 0x6E,
+ 0x30, 0x31, 0x32, 0x33, 0x34, 0x35, 0x36, 0x37,
+ 0x38, 0x39, 0x2B, 0xE2, 0x88, 0x92, 0x3D, 0x28,
+ 0x29, 0x61, 0x65, 0x6F, 0x78, 0xC9, 0x99, 0x52,
+ 0x73, 0x61, 0x2F, 0x63, 0x61, 0x2F, 0x73, 0x43,
+ 0xC2, 0xB0, 0x43, 0x63, 0x2F, 0x6F, 0x63, 0x2F,
+ 0x75, 0xC6, 0x90, 0xC2, 0xB0, 0x46, 0x67, 0x48,
+ 0x48, 0x48, 0x68, 0xC4, 0xA7, 0x49, 0x49, 0x4C,
+ 0x6C, 0x4E, 0x4E, 0x6F, 0x50, 0x51, 0x52, 0x52,
+ 0x52, 0x53, 0x4D, 0x54, 0x45, 0x4C, 0x54, 0x4D,
+ 0x5A, 0xF6, 0xCE, 0xA9, 0x5A, 0xF6, 0x4B, 0xF6,
+ 0x41, 0xCC, 0x8A, 0x42, 0x43, 0x65, 0x45, 0x46,
+ 0x4D, 0x6F, 0xD7, 0x90, 0xD7, 0x91, 0xD7, 0x92,
+ 0xD7, 0x93, 0x69, 0x46, 0x41, 0x58, 0xCF, 0x80,
+ 0xCE, 0xB3, 0xCE, 0x93, 0xCE, 0xA0, 0xE2, 0x88,
+ 0x91, 0x44, 0x64, 0x65, 0x69, 0x6A, 0x31, 0xE2,
+ 0x81, 0x84, 0x33, 0x32, 0xE2, 0x81, 0x84, 0x33,
+ 0x31, 0xE2, 0x81, 0x84, 0x35, 0x32, 0xE2, 0x81,
+ 0x84, 0x35, 0x33, 0xE2, 0x81, 0x84, 0x35, 0x34,
+ 0xE2, 0x81, 0x84, 0x35, 0x31, 0xE2, 0x81, 0x84,
+ 0x36, 0x35, 0xE2, 0x81, 0x84, 0x36, 0x31, 0xE2,
+ 0x81, 0x84, 0x38, 0x33, 0xE2, 0x81, 0x84, 0x38,
+ 0x35, 0xE2, 0x81, 0x84, 0x38, 0x37, 0xE2, 0x81,
+ 0x84, 0x38, 0x31, 0xE2, 0x81, 0x84, 0x49, 0x49,
+ 0x49, 0x49, 0x49, 0x49, 0x49, 0x56, 0x56, 0x56,
+ 0x49, 0x56, 0x49, 0x49, 0x56, 0x49, 0x49, 0x49,
+ 0x49, 0x58, 0x58, 0x58, 0x49, 0x58, 0x49, 0x49,
+ 0x4C, 0x43, 0x44, 0x4D, 0x69, 0x69, 0x69, 0x69,
+ 0x69, 0x69, 0x69, 0x76, 0x76, 0x76, 0x69, 0x76,
+ 0x69, 0x69, 0x76, 0x69, 0x69, 0x69, 0x69, 0x78,
+ 0x78, 0x78, 0x69, 0x78, 0x69, 0x69, 0x6C, 0x63,
+ 0x64, 0x6D, 0xF6, 0xE2, 0x86, 0x90, 0xCC, 0xB8,
+ 0xF6, 0xE2, 0x86, 0x92, 0xCC, 0xB8, 0xF6, 0xE2,
+ 0x86, 0x94, 0xCC, 0xB8, 0xF6, 0xE2, 0x87, 0x90,
+ 0xCC, 0xB8, 0xF6, 0xE2, 0x87, 0x94, 0xCC, 0xB8,
+ 0xF6, 0xE2, 0x87, 0x92, 0xCC, 0xB8, 0xF6, 0xE2,
+ 0x88, 0x83, 0xCC, 0xB8, 0xF6, 0xE2, 0x88, 0x88,
+ 0xCC, 0xB8, 0xF6, 0xE2, 0x88, 0x8B, 0xCC, 0xB8,
+ 0xF6, 0xE2, 0x88, 0xA3, 0xCC, 0xB8, 0xF6, 0xE2,
+ 0x88, 0xA5, 0xCC, 0xB8, 0xE2, 0x88, 0xAB, 0xE2,
+ 0x88, 0xAB, 0xE2, 0x88, 0xAB, 0xE2, 0x88, 0xAB,
+ 0xE2, 0x88, 0xAB, 0xE2, 0x88, 0xAE, 0xE2, 0x88,
+ 0xAE, 0xE2, 0x88, 0xAE, 0xE2, 0x88, 0xAE, 0xE2,
+ 0x88, 0xAE, 0xF6, 0xE2, 0x88, 0xBC, 0xCC, 0xB8,
+ 0xF6, 0xE2, 0x89, 0x83, 0xCC, 0xB8, 0xF6, 0xE2,
+ 0x89, 0x85, 0xCC, 0xB8, 0xF6, 0xE2, 0x89, 0x88,
+ 0xCC, 0xB8, 0xF6, 0x3D, 0xCC, 0xB8, 0xF6, 0xE2,
+ 0x89, 0xA1, 0xCC, 0xB8, 0xF6, 0xE2, 0x89, 0x8D,
+ 0xCC, 0xB8, 0xF6, 0x3C, 0xCC, 0xB8, 0xF6, 0x3E,
+ 0xCC, 0xB8, 0xF6, 0xE2, 0x89, 0xA4, 0xCC, 0xB8,
+ 0xF6, 0xE2, 0x89, 0xA5, 0xCC, 0xB8, 0xF6, 0xE2,
+ 0x89, 0xB2, 0xCC, 0xB8, 0xF6, 0xE2, 0x89, 0xB3,
+ 0xCC, 0xB8, 0xF6, 0xE2, 0x89, 0xB6, 0xCC, 0xB8,
+ 0xF6, 0xE2, 0x89, 0xB7, 0xCC, 0xB8, 0xF6, 0xE2,
+ 0x89, 0xBA, 0xCC, 0xB8, 0xF6, 0xE2, 0x89, 0xBB,
+ 0xCC, 0xB8, 0xF6, 0xE2, 0x8A, 0x82, 0xCC, 0xB8,
+ 0xF6, 0xE2, 0x8A, 0x83, 0xCC, 0xB8, 0xF6, 0xE2,
+ 0x8A, 0x86, 0xCC, 0xB8, 0xF6, 0xE2, 0x8A, 0x87,
+ 0xCC, 0xB8, 0xF6, 0xE2, 0x8A, 0xA2, 0xCC, 0xB8,
+ 0xF6, 0xE2, 0x8A, 0xA8, 0xCC, 0xB8, 0xF6, 0xE2,
+ 0x8A, 0xA9, 0xCC, 0xB8, 0xF6, 0xE2, 0x8A, 0xAB,
+ 0xCC, 0xB8, 0xF6, 0xE2, 0x89, 0xBC, 0xCC, 0xB8,
+ 0xF6, 0xE2, 0x89, 0xBD, 0xCC, 0xB8, 0xF6, 0xE2,
+ 0x8A, 0x91, 0xCC, 0xB8, 0xF6, 0xE2, 0x8A, 0x92,
+ 0xCC, 0xB8, 0xF6, 0xE2, 0x8A, 0xB2, 0xCC, 0xB8,
+ 0xF6, 0xE2, 0x8A, 0xB3, 0xCC, 0xB8, 0xF6, 0xE2,
+ 0x8A, 0xB4, 0xCC, 0xB8, 0xF6, 0xE2, 0x8A, 0xB5,
+ 0xCC, 0xB8, 0xF6, 0xE3, 0x80, 0x88, 0xF6, 0xE3,
+ 0x80, 0x89, 0x31, 0x32, 0x33, 0x34, 0x35, 0x36,
+ 0x37, 0x38, 0x39, 0x31, 0x30, 0x31, 0x31, 0x31,
+ 0x32, 0x31, 0x33, 0x31, 0x34, 0x31, 0x35, 0x31,
+ 0x36, 0x31, 0x37, 0x31, 0x38, 0x31, 0x39, 0x32,
+ 0x30, 0x28, 0x31, 0x29, 0x28, 0x32, 0x29, 0x28,
+ 0x33, 0x29, 0x28, 0x34, 0x29, 0x28, 0x35, 0x29,
+ 0x28, 0x36, 0x29, 0x28, 0x37, 0x29, 0x28, 0x38,
+ 0x29, 0x28, 0x39, 0x29, 0x28, 0x31, 0x30, 0x29,
+ 0x28, 0x31, 0x31, 0x29, 0x28, 0x31, 0x32, 0x29,
+ 0x28, 0x31, 0x33, 0x29, 0x28, 0x31, 0x34, 0x29,
+ 0x28, 0x31, 0x35, 0x29, 0x28, 0x31, 0x36, 0x29,
+ 0x28, 0x31, 0x37, 0x29, 0x28, 0x31, 0x38, 0x29,
+ 0x28, 0x31, 0x39, 0x29, 0x28, 0x32, 0x30, 0x29,
+ 0x31, 0x2E, 0x32, 0x2E, 0x33, 0x2E, 0x34, 0x2E,
+ 0x35, 0x2E, 0x36, 0x2E, 0x37, 0x2E, 0x38, 0x2E,
+ 0x39, 0x2E, 0x31, 0x30, 0x2E, 0x31, 0x31, 0x2E,
+ 0x31, 0x32, 0x2E, 0x31, 0x33, 0x2E, 0x31, 0x34,
+ 0x2E, 0x31, 0x35, 0x2E, 0x31, 0x36, 0x2E, 0x31,
+ 0x37, 0x2E, 0x31, 0x38, 0x2E, 0x31, 0x39, 0x2E,
+ 0x32, 0x30, 0x2E, 0x28, 0x61, 0x29, 0x28, 0x62,
+ 0x29, 0x28, 0x63, 0x29, 0x28, 0x64, 0x29, 0x28,
+ 0x65, 0x29, 0x28, 0x66, 0x29, 0x28, 0x67, 0x29,
+ 0x28, 0x68, 0x29, 0x28, 0x69, 0x29, 0x28, 0x6A,
+ 0x29, 0x28, 0x6B, 0x29, 0x28, 0x6C, 0x29, 0x28,
+ 0x6D, 0x29, 0x28, 0x6E, 0x29, 0x28, 0x6F, 0x29,
+ 0x28, 0x70, 0x29, 0x28, 0x71, 0x29, 0x28, 0x72,
+ 0x29, 0x28, 0x73, 0x29, 0x28, 0x74, 0x29, 0x28,
+ 0x75, 0x29, 0x28, 0x76, 0x29, 0x28, 0x77, 0x29,
+ 0x28, 0x78, 0x29, 0x28, 0x79, 0x29, 0x28, 0x7A,
+ 0x29, 0x41, 0x42, 0x43, 0x44, 0x45, 0x46, 0x47,
+ 0x48, 0x49, 0x4A, 0x4B, 0x4C, 0x4D, 0x4E, 0x4F,
+ 0x50, 0x51, 0x52, 0x53, 0x54, 0x55, 0x56, 0x57,
+ 0x58, 0x59, 0x5A, 0x61, 0x62, 0x63, 0x64, 0x65,
+ 0x66, 0x67, 0x68, 0x69, 0x6A, 0x6B, 0x6C, 0x6D,
+ 0x6E, 0x6F, 0x70, 0x71, 0x72, 0x73, 0x74, 0x75,
+ 0x76, 0x77, 0x78, 0x79, 0x7A, 0x30, 0xE2, 0x88,
+ 0xAB, 0xE2, 0x88, 0xAB, 0xE2, 0x88, 0xAB, 0xE2,
+ 0x88, 0xAB, 0x3A, 0x3A, 0x3D, 0x3D, 0x3D, 0x3D,
+ 0x3D, 0x3D, 0xF6, 0xE2, 0xAB, 0x9D, 0xCC, 0xB8,
+ 0xE2, 0xB5, 0xA1, 0xE6, 0xAF, 0x8D, 0xE9, 0xBE,
+ 0x9F, 0xE4, 0xB8, 0x80, 0xE4, 0xB8, 0xA8, 0xE4,
+ 0xB8, 0xB6, 0xE4, 0xB8, 0xBF, 0xE4, 0xB9, 0x99,
+ 0xE4, 0xBA, 0x85, 0xE4, 0xBA, 0x8C, 0xE4, 0xBA,
+ 0xA0, 0xE4, 0xBA, 0xBA, 0xE5, 0x84, 0xBF, 0xE5,
+ 0x85, 0xA5, 0xE5, 0x85, 0xAB, 0xE5, 0x86, 0x82,
+ 0xE5, 0x86, 0x96, 0xE5, 0x86, 0xAB, 0xE5, 0x87,
+ 0xA0, 0xE5, 0x87, 0xB5, 0xE5, 0x88, 0x80, 0xE5,
+ 0x8A, 0x9B, 0xE5, 0x8B, 0xB9, 0xE5, 0x8C, 0x95,
+ 0xE5, 0x8C, 0x9A, 0xE5, 0x8C, 0xB8, 0xE5, 0x8D,
+ 0x81, 0xE5, 0x8D, 0x9C, 0xE5, 0x8D, 0xA9, 0xE5,
+ 0x8E, 0x82, 0xE5, 0x8E, 0xB6, 0xE5, 0x8F, 0x88,
+ 0xE5, 0x8F, 0xA3, 0xE5, 0x9B, 0x97, 0xE5, 0x9C,
+ 0x9F, 0xE5, 0xA3, 0xAB, 0xE5, 0xA4, 0x82, 0xE5,
+ 0xA4, 0x8A, 0xE5, 0xA4, 0x95, 0xE5, 0xA4, 0xA7,
+ 0xE5, 0xA5, 0xB3, 0xE5, 0xAD, 0x90, 0xE5, 0xAE,
+ 0x80, 0xE5, 0xAF, 0xB8, 0xE5, 0xB0, 0x8F, 0xE5,
+ 0xB0, 0xA2, 0xE5, 0xB0, 0xB8, 0xE5, 0xB1, 0xAE,
+ 0xE5, 0xB1, 0xB1, 0xE5, 0xB7, 0x9B, 0xE5, 0xB7,
+ 0xA5, 0xE5, 0xB7, 0xB1, 0xE5, 0xB7, 0xBE, 0xE5,
+ 0xB9, 0xB2, 0xE5, 0xB9, 0xBA, 0xE5, 0xB9, 0xBF,
+ 0xE5, 0xBB, 0xB4, 0xE5, 0xBB, 0xBE, 0xE5, 0xBC,
+ 0x8B, 0xE5, 0xBC, 0x93, 0xE5, 0xBD, 0x90, 0xE5,
+ 0xBD, 0xA1, 0xE5, 0xBD, 0xB3, 0xE5, 0xBF, 0x83,
+ 0xE6, 0x88, 0x88, 0xE6, 0x88, 0xB6, 0xE6, 0x89,
+ 0x8B, 0xE6, 0x94, 0xAF, 0xE6, 0x94, 0xB4, 0xE6,
+ 0x96, 0x87, 0xE6, 0x96, 0x97, 0xE6, 0x96, 0xA4,
+ 0xE6, 0x96, 0xB9, 0xE6, 0x97, 0xA0, 0xE6, 0x97,
+ 0xA5, 0xE6, 0x9B, 0xB0, 0xE6, 0x9C, 0x88, 0xE6,
+ 0x9C, 0xA8, 0xE6, 0xAC, 0xA0, 0xE6, 0xAD, 0xA2,
+ 0xE6, 0xAD, 0xB9, 0xE6, 0xAE, 0xB3, 0xE6, 0xAF,
+ 0x8B, 0xE6, 0xAF, 0x94, 0xE6, 0xAF, 0x9B, 0xE6,
+ 0xB0, 0x8F, 0xE6, 0xB0, 0x94, 0xE6, 0xB0, 0xB4,
+ 0xE7, 0x81, 0xAB, 0xE7, 0x88, 0xAA, 0xE7, 0x88,
+ 0xB6, 0xE7, 0x88, 0xBB, 0xE7, 0x88, 0xBF, 0xE7,
+ 0x89, 0x87, 0xE7, 0x89, 0x99, 0xE7, 0x89, 0x9B,
+ 0xE7, 0x8A, 0xAC, 0xE7, 0x8E, 0x84, 0xE7, 0x8E,
+ 0x89, 0xE7, 0x93, 0x9C, 0xE7, 0x93, 0xA6, 0xE7,
+ 0x94, 0x98, 0xE7, 0x94, 0x9F, 0xE7, 0x94, 0xA8,
+ 0xE7, 0x94, 0xB0, 0xE7, 0x96, 0x8B, 0xE7, 0x96,
+ 0x92, 0xE7, 0x99, 0xB6, 0xE7, 0x99, 0xBD, 0xE7,
+ 0x9A, 0xAE, 0xE7, 0x9A, 0xBF, 0xE7, 0x9B, 0xAE,
+ 0xE7, 0x9F, 0x9B, 0xE7, 0x9F, 0xA2, 0xE7, 0x9F,
+ 0xB3, 0xE7, 0xA4, 0xBA, 0xE7, 0xA6, 0xB8, 0xE7,
+ 0xA6, 0xBE, 0xE7, 0xA9, 0xB4, 0xE7, 0xAB, 0x8B,
+ 0xE7, 0xAB, 0xB9, 0xE7, 0xB1, 0xB3, 0xE7, 0xB3,
+ 0xB8, 0xE7, 0xBC, 0xB6, 0xE7, 0xBD, 0x91, 0xE7,
+ 0xBE, 0x8A, 0xE7, 0xBE, 0xBD, 0xE8, 0x80, 0x81,
+ 0xE8, 0x80, 0x8C, 0xE8, 0x80, 0x92, 0xE8, 0x80,
+ 0xB3, 0xE8, 0x81, 0xBF, 0xE8, 0x82, 0x89, 0xE8,
+ 0x87, 0xA3, 0xE8, 0x87, 0xAA, 0xE8, 0x87, 0xB3,
+ 0xE8, 0x87, 0xBC, 0xE8, 0x88, 0x8C, 0xE8, 0x88,
+ 0x9B, 0xE8, 0x88, 0x9F, 0xE8, 0x89, 0xAE, 0xE8,
+ 0x89, 0xB2, 0xE8, 0x89, 0xB8, 0xE8, 0x99, 0x8D,
+ 0xE8, 0x99, 0xAB, 0xE8, 0xA1, 0x80, 0xE8, 0xA1,
+ 0x8C, 0xE8, 0xA1, 0xA3, 0xE8, 0xA5, 0xBE, 0xE8,
+ 0xA6, 0x8B, 0xE8, 0xA7, 0x92, 0xE8, 0xA8, 0x80,
+ 0xE8, 0xB0, 0xB7, 0xE8, 0xB1, 0x86, 0xE8, 0xB1,
+ 0x95, 0xE8, 0xB1, 0xB8, 0xE8, 0xB2, 0x9D, 0xE8,
+ 0xB5, 0xA4, 0xE8, 0xB5, 0xB0, 0xE8, 0xB6, 0xB3,
+ 0xE8, 0xBA, 0xAB, 0xE8, 0xBB, 0x8A, 0xE8, 0xBE,
+ 0x9B, 0xE8, 0xBE, 0xB0, 0xE8, 0xBE, 0xB5, 0xE9,
+ 0x82, 0x91, 0xE9, 0x85, 0x89, 0xE9, 0x87, 0x86,
+ 0xE9, 0x87, 0x8C, 0xE9, 0x87, 0x91, 0xE9, 0x95,
+ 0xB7, 0xE9, 0x96, 0x80, 0xE9, 0x98, 0x9C, 0xE9,
+ 0x9A, 0xB6, 0xE9, 0x9A, 0xB9, 0xE9, 0x9B, 0xA8,
+ 0xE9, 0x9D, 0x91, 0xE9, 0x9D, 0x9E, 0xE9, 0x9D,
+ 0xA2, 0xE9, 0x9D, 0xA9, 0xE9, 0x9F, 0x8B, 0xE9,
+ 0x9F, 0xAD, 0xE9, 0x9F, 0xB3, 0xE9, 0xA0, 0x81,
+ 0xE9, 0xA2, 0xA8, 0xE9, 0xA3, 0x9B, 0xE9, 0xA3,
+ 0x9F, 0xE9, 0xA6, 0x96, 0xE9, 0xA6, 0x99, 0xE9,
+ 0xA6, 0xAC, 0xE9, 0xAA, 0xA8, 0xE9, 0xAB, 0x98,
+ 0xE9, 0xAB, 0x9F, 0xE9, 0xAC, 0xA5, 0xE9, 0xAC,
+ 0xAF, 0xE9, 0xAC, 0xB2, 0xE9, 0xAC, 0xBC, 0xE9,
+ 0xAD, 0x9A, 0xE9, 0xB3, 0xA5, 0xE9, 0xB9, 0xB5,
+ 0xE9, 0xB9, 0xBF, 0xE9, 0xBA, 0xA5, 0xE9, 0xBA,
+ 0xBB, 0xE9, 0xBB, 0x83, 0xE9, 0xBB, 0x8D, 0xE9,
+ 0xBB, 0x91, 0xE9, 0xBB, 0xB9, 0xE9, 0xBB, 0xBD,
+ 0xE9, 0xBC, 0x8E, 0xE9, 0xBC, 0x93, 0xE9, 0xBC,
+ 0xA0, 0xE9, 0xBC, 0xBB, 0xE9, 0xBD, 0x8A, 0xE9,
+ 0xBD, 0x92, 0xE9, 0xBE, 0x8D, 0xE9, 0xBE, 0x9C,
+ 0xE9, 0xBE, 0xA0, 0x20, 0xE3, 0x80, 0x92, 0xE5,
+ 0x8D, 0x81, 0xE5, 0x8D, 0x84, 0xE5, 0x8D, 0x85,
+ 0xF6, 0xE3, 0x81, 0x8B, 0xE3, 0x82, 0x99, 0xF6,
+ 0xE3, 0x81, 0x8D, 0xE3, 0x82, 0x99, 0xF6, 0xE3,
+ 0x81, 0x8F, 0xE3, 0x82, 0x99, 0xF6, 0xE3, 0x81,
+ 0x91, 0xE3, 0x82, 0x99, 0xF6, 0xE3, 0x81, 0x93,
+ 0xE3, 0x82, 0x99, 0xF6, 0xE3, 0x81, 0x95, 0xE3,
+ 0x82, 0x99, 0xF6, 0xE3, 0x81, 0x97, 0xE3, 0x82,
+ 0x99, 0xF6, 0xE3, 0x81, 0x99, 0xE3, 0x82, 0x99,
+ 0xF6, 0xE3, 0x81, 0x9B, 0xE3, 0x82, 0x99, 0xF6,
+ 0xE3, 0x81, 0x9D, 0xE3, 0x82, 0x99, 0xF6, 0xE3,
+ 0x81, 0x9F, 0xE3, 0x82, 0x99, 0xF6, 0xE3, 0x81,
+ 0xA1, 0xE3, 0x82, 0x99, 0xF6, 0xE3, 0x81, 0xA4,
+ 0xE3, 0x82, 0x99, 0xF6, 0xE3, 0x81, 0xA6, 0xE3,
+ 0x82, 0x99, 0xF6, 0xE3, 0x81, 0xA8, 0xE3, 0x82,
+ 0x99, 0xF6, 0xE3, 0x81, 0xAF, 0xE3, 0x82, 0x99,
+ 0xF6, 0xE3, 0x81, 0xAF, 0xE3, 0x82, 0x9A, 0xF6,
+ 0xE3, 0x81, 0xB2, 0xE3, 0x82, 0x99, 0xF6, 0xE3,
+ 0x81, 0xB2, 0xE3, 0x82, 0x9A, 0xF6, 0xE3, 0x81,
+ 0xB5, 0xE3, 0x82, 0x99, 0xF6, 0xE3, 0x81, 0xB5,
+ 0xE3, 0x82, 0x9A, 0xF6, 0xE3, 0x81, 0xB8, 0xE3,
+ 0x82, 0x99, 0xF6, 0xE3, 0x81, 0xB8, 0xE3, 0x82,
+ 0x9A, 0xF6, 0xE3, 0x81, 0xBB, 0xE3, 0x82, 0x99,
+ 0xF6, 0xE3, 0x81, 0xBB, 0xE3, 0x82, 0x9A, 0xF6,
+ 0xE3, 0x81, 0x86, 0xE3, 0x82, 0x99, 0x20, 0xE3,
+ 0x82, 0x99, 0x20, 0xE3, 0x82, 0x9A, 0xF6, 0xE3,
+ 0x82, 0x9D, 0xE3, 0x82, 0x99, 0xE3, 0x82, 0x88,
+ 0xE3, 0x82, 0x8A, 0xF6, 0xE3, 0x82, 0xAB, 0xE3,
+ 0x82, 0x99, 0xF6, 0xE3, 0x82, 0xAD, 0xE3, 0x82,
+ 0x99, 0xF6, 0xE3, 0x82, 0xAF, 0xE3, 0x82, 0x99,
+ 0xF6, 0xE3, 0x82, 0xB1, 0xE3, 0x82, 0x99, 0xF6,
+ 0xE3, 0x82, 0xB3, 0xE3, 0x82, 0x99, 0xF6, 0xE3,
+ 0x82, 0xB5, 0xE3, 0x82, 0x99, 0xF6, 0xE3, 0x82,
+ 0xB7, 0xE3, 0x82, 0x99, 0xF6, 0xE3, 0x82, 0xB9,
+ 0xE3, 0x82, 0x99, 0xF6, 0xE3, 0x82, 0xBB, 0xE3,
+ 0x82, 0x99, 0xF6, 0xE3, 0x82, 0xBD, 0xE3, 0x82,
+ 0x99, 0xF6, 0xE3, 0x82, 0xBF, 0xE3, 0x82, 0x99,
+ 0xF6, 0xE3, 0x83, 0x81, 0xE3, 0x82, 0x99, 0xF6,
+ 0xE3, 0x83, 0x84, 0xE3, 0x82, 0x99, 0xF6, 0xE3,
+ 0x83, 0x86, 0xE3, 0x82, 0x99, 0xF6, 0xE3, 0x83,
+ 0x88, 0xE3, 0x82, 0x99, 0xF6, 0xE3, 0x83, 0x8F,
+ 0xE3, 0x82, 0x99, 0xF6, 0xE3, 0x83, 0x8F, 0xE3,
+ 0x82, 0x9A, 0xF6, 0xE3, 0x83, 0x92, 0xE3, 0x82,
+ 0x99, 0xF6, 0xE3, 0x83, 0x92, 0xE3, 0x82, 0x9A,
+ 0xF6, 0xE3, 0x83, 0x95, 0xE3, 0x82, 0x99, 0xF6,
+ 0xE3, 0x83, 0x95, 0xE3, 0x82, 0x9A, 0xF6, 0xE3,
+ 0x83, 0x98, 0xE3, 0x82, 0x99, 0xF6, 0xE3, 0x83,
+ 0x98, 0xE3, 0x82, 0x9A, 0xF6, 0xE3, 0x83, 0x9B,
+ 0xE3, 0x82, 0x99, 0xF6, 0xE3, 0x83, 0x9B, 0xE3,
+ 0x82, 0x9A, 0xF6, 0xE3, 0x82, 0xA6, 0xE3, 0x82,
+ 0x99, 0xF6, 0xE3, 0x83, 0xAF, 0xE3, 0x82, 0x99,
+ 0xF6, 0xE3, 0x83, 0xB0, 0xE3, 0x82, 0x99, 0xF6,
+ 0xE3, 0x83, 0xB1, 0xE3, 0x82, 0x99, 0xF6, 0xE3,
+ 0x83, 0xB2, 0xE3, 0x82, 0x99, 0xF6, 0xE3, 0x83,
+ 0xBD, 0xE3, 0x82, 0x99, 0xE3, 0x82, 0xB3, 0xE3,
+ 0x83, 0x88, 0xE1, 0x84, 0x80, 0xE1, 0x84, 0x81,
+ 0xE1, 0x86, 0xAA, 0xE1, 0x84, 0x82, 0xE1, 0x86,
+ 0xAC, 0xE1, 0x86, 0xAD, 0xE1, 0x84, 0x83, 0xE1,
+ 0x84, 0x84, 0xE1, 0x84, 0x85, 0xE1, 0x86, 0xB0,
+ 0xE1, 0x86, 0xB1, 0xE1, 0x86, 0xB2, 0xE1, 0x86,
+ 0xB3, 0xE1, 0x86, 0xB4, 0xE1, 0x86, 0xB5, 0xE1,
+ 0x84, 0x9A, 0xE1, 0x84, 0x86, 0xE1, 0x84, 0x87,
+ 0xE1, 0x84, 0x88, 0xE1, 0x84, 0xA1, 0xE1, 0x84,
+ 0x89, 0xE1, 0x84, 0x8A, 0xE1, 0x84, 0x8B, 0xE1,
+ 0x84, 0x8C, 0xE1, 0x84, 0x8D, 0xE1, 0x84, 0x8E,
+ 0xE1, 0x84, 0x8F, 0xE1, 0x84, 0x90, 0xE1, 0x84,
+ 0x91, 0xE1, 0x84, 0x92, 0xE1, 0x85, 0xA1, 0xE1,
+ 0x85, 0xA2, 0xE1, 0x85, 0xA3, 0xE1, 0x85, 0xA4,
+ 0xE1, 0x85, 0xA5, 0xE1, 0x85, 0xA6, 0xE1, 0x85,
+ 0xA7, 0xE1, 0x85, 0xA8, 0xE1, 0x85, 0xA9, 0xE1,
+ 0x85, 0xAA, 0xE1, 0x85, 0xAB, 0xE1, 0x85, 0xAC,
+ 0xE1, 0x85, 0xAD, 0xE1, 0x85, 0xAE, 0xE1, 0x85,
+ 0xAF, 0xE1, 0x85, 0xB0, 0xE1, 0x85, 0xB1, 0xE1,
+ 0x85, 0xB2, 0xE1, 0x85, 0xB3, 0xE1, 0x85, 0xB4,
+ 0xE1, 0x85, 0xB5, 0xE1, 0x85, 0xA0, 0xE1, 0x84,
+ 0x94, 0xE1, 0x84, 0x95, 0xE1, 0x87, 0x87, 0xE1,
+ 0x87, 0x88, 0xE1, 0x87, 0x8C, 0xE1, 0x87, 0x8E,
+ 0xE1, 0x87, 0x93, 0xE1, 0x87, 0x97, 0xE1, 0x87,
+ 0x99, 0xE1, 0x84, 0x9C, 0xE1, 0x87, 0x9D, 0xE1,
+ 0x87, 0x9F, 0xE1, 0x84, 0x9D, 0xE1, 0x84, 0x9E,
+ 0xE1, 0x84, 0xA0, 0xE1, 0x84, 0xA2, 0xE1, 0x84,
+ 0xA3, 0xE1, 0x84, 0xA7, 0xE1, 0x84, 0xA9, 0xE1,
+ 0x84, 0xAB, 0xE1, 0x84, 0xAC, 0xE1, 0x84, 0xAD,
+ 0xE1, 0x84, 0xAE, 0xE1, 0x84, 0xAF, 0xE1, 0x84,
+ 0xB2, 0xE1, 0x84, 0xB6, 0xE1, 0x85, 0x80, 0xE1,
+ 0x85, 0x87, 0xE1, 0x85, 0x8C, 0xE1, 0x87, 0xB1,
+ 0xE1, 0x87, 0xB2, 0xE1, 0x85, 0x97, 0xE1, 0x85,
+ 0x98, 0xE1, 0x85, 0x99, 0xE1, 0x86, 0x84, 0xE1,
+ 0x86, 0x85, 0xE1, 0x86, 0x88, 0xE1, 0x86, 0x91,
+ 0xE1, 0x86, 0x92, 0xE1, 0x86, 0x94, 0xE1, 0x86,
+ 0x9E, 0xE1, 0x86, 0xA1, 0xE4, 0xB8, 0x80, 0xE4,
+ 0xBA, 0x8C, 0xE4, 0xB8, 0x89, 0xE5, 0x9B, 0x9B,
+ 0xE4, 0xB8, 0x8A, 0xE4, 0xB8, 0xAD, 0xE4, 0xB8,
+ 0x8B, 0xE7, 0x94, 0xB2, 0xE4, 0xB9, 0x99, 0xE4,
+ 0xB8, 0x99, 0xE4, 0xB8, 0x81, 0xE5, 0xA4, 0xA9,
+ 0xE5, 0x9C, 0xB0, 0xE4, 0xBA, 0xBA, 0x28, 0xE1,
+ 0x84, 0x80, 0x29, 0x28, 0xE1, 0x84, 0x82, 0x29,
+ 0x28, 0xE1, 0x84, 0x83, 0x29, 0x28, 0xE1, 0x84,
+ 0x85, 0x29, 0x28, 0xE1, 0x84, 0x86, 0x29, 0x28,
+ 0xE1, 0x84, 0x87, 0x29, 0x28, 0xE1, 0x84, 0x89,
+ 0x29, 0x28, 0xE1, 0x84, 0x8B, 0x29, 0x28, 0xE1,
+ 0x84, 0x8C, 0x29, 0x28, 0xE1, 0x84, 0x8E, 0x29,
+ 0x28, 0xE1, 0x84, 0x8F, 0x29, 0x28, 0xE1, 0x84,
+ 0x90, 0x29, 0x28, 0xE1, 0x84, 0x91, 0x29, 0x28,
+ 0xE1, 0x84, 0x92, 0x29, 0x28, 0xE1, 0x84, 0x80,
+ 0xE1, 0x85, 0xA1, 0x29, 0x28, 0xE1, 0x84, 0x82,
+ 0xE1, 0x85, 0xA1, 0x29, 0x28, 0xE1, 0x84, 0x83,
+ 0xE1, 0x85, 0xA1, 0x29, 0x28, 0xE1, 0x84, 0x85,
+ 0xE1, 0x85, 0xA1, 0x29, 0x28, 0xE1, 0x84, 0x86,
+ 0xE1, 0x85, 0xA1, 0x29, 0x28, 0xE1, 0x84, 0x87,
+ 0xE1, 0x85, 0xA1, 0x29, 0x28, 0xE1, 0x84, 0x89,
+ 0xE1, 0x85, 0xA1, 0x29, 0x28, 0xE1, 0x84, 0x8B,
+ 0xE1, 0x85, 0xA1, 0x29, 0x28, 0xE1, 0x84, 0x8C,
+ 0xE1, 0x85, 0xA1, 0x29, 0x28, 0xE1, 0x84, 0x8E,
+ 0xE1, 0x85, 0xA1, 0x29, 0x28, 0xE1, 0x84, 0x8F,
+ 0xE1, 0x85, 0xA1, 0x29, 0x28, 0xE1, 0x84, 0x90,
+ 0xE1, 0x85, 0xA1, 0x29, 0x28, 0xE1, 0x84, 0x91,
+ 0xE1, 0x85, 0xA1, 0x29, 0x28, 0xE1, 0x84, 0x92,
+ 0xE1, 0x85, 0xA1, 0x29, 0x28, 0xE1, 0x84, 0x8C,
+ 0xE1, 0x85, 0xAE, 0x29, 0x28, 0xE1, 0x84, 0x8B,
+ 0xE1, 0x85, 0xA9, 0xE1, 0x84, 0x8C, 0xE1, 0x85,
+ 0xA5, 0xE1, 0x86, 0xAB, 0x29, 0x28, 0xE1, 0x84,
+ 0x8B, 0xE1, 0x85, 0xA9, 0xE1, 0x84, 0x92, 0xE1,
+ 0x85, 0xAE, 0x29, 0x28, 0xE4, 0xB8, 0x80, 0x29,
+ 0x28, 0xE4, 0xBA, 0x8C, 0x29, 0x28, 0xE4, 0xB8,
+ 0x89, 0x29, 0x28, 0xE5, 0x9B, 0x9B, 0x29, 0x28,
+ 0xE4, 0xBA, 0x94, 0x29, 0x28, 0xE5, 0x85, 0xAD,
+ 0x29, 0x28, 0xE4, 0xB8, 0x83, 0x29, 0x28, 0xE5,
+ 0x85, 0xAB, 0x29, 0x28, 0xE4, 0xB9, 0x9D, 0x29,
+ 0x28, 0xE5, 0x8D, 0x81, 0x29, 0x28, 0xE6, 0x9C,
+ 0x88, 0x29, 0x28, 0xE7, 0x81, 0xAB, 0x29, 0x28,
+ 0xE6, 0xB0, 0xB4, 0x29, 0x28, 0xE6, 0x9C, 0xA8,
+ 0x29, 0x28, 0xE9, 0x87, 0x91, 0x29, 0x28, 0xE5,
+ 0x9C, 0x9F, 0x29, 0x28, 0xE6, 0x97, 0xA5, 0x29,
+ 0x28, 0xE6, 0xA0, 0xAA, 0x29, 0x28, 0xE6, 0x9C,
+ 0x89, 0x29, 0x28, 0xE7, 0xA4, 0xBE, 0x29, 0x28,
+ 0xE5, 0x90, 0x8D, 0x29, 0x28, 0xE7, 0x89, 0xB9,
+ 0x29, 0x28, 0xE8, 0xB2, 0xA1, 0x29, 0x28, 0xE7,
+ 0xA5, 0x9D, 0x29, 0x28, 0xE5, 0x8A, 0xB4, 0x29,
+ 0x28, 0xE4, 0xBB, 0xA3, 0x29, 0x28, 0xE5, 0x91,
+ 0xBC, 0x29, 0x28, 0xE5, 0xAD, 0xA6, 0x29, 0x28,
+ 0xE7, 0x9B, 0xA3, 0x29, 0x28, 0xE4, 0xBC, 0x81,
+ 0x29, 0x28, 0xE8, 0xB3, 0x87, 0x29, 0x28, 0xE5,
+ 0x8D, 0x94, 0x29, 0x28, 0xE7, 0xA5, 0xAD, 0x29,
+ 0x28, 0xE4, 0xBC, 0x91, 0x29, 0x28, 0xE8, 0x87,
+ 0xAA, 0x29, 0x28, 0xE8, 0x87, 0xB3, 0x29, 0x50,
+ 0x54, 0x45, 0x32, 0x31, 0x32, 0x32, 0x32, 0x33,
+ 0x32, 0x34, 0x32, 0x35, 0x32, 0x36, 0x32, 0x37,
+ 0x32, 0x38, 0x32, 0x39, 0x33, 0x30, 0x33, 0x31,
+ 0x33, 0x32, 0x33, 0x33, 0x33, 0x34, 0x33, 0x35,
+ 0xE1, 0x84, 0x80, 0xE1, 0x84, 0x82, 0xE1, 0x84,
+ 0x83, 0xE1, 0x84, 0x85, 0xE1, 0x84, 0x86, 0xE1,
+ 0x84, 0x87, 0xE1, 0x84, 0x89, 0xE1, 0x84, 0x8B,
+ 0xE1, 0x84, 0x8C, 0xE1, 0x84, 0x8E, 0xE1, 0x84,
+ 0x8F, 0xE1, 0x84, 0x90, 0xE1, 0x84, 0x91, 0xE1,
+ 0x84, 0x92, 0xE1, 0x84, 0x80, 0xE1, 0x85, 0xA1,
+ 0xE1, 0x84, 0x82, 0xE1, 0x85, 0xA1, 0xE1, 0x84,
+ 0x83, 0xE1, 0x85, 0xA1, 0xE1, 0x84, 0x85, 0xE1,
+ 0x85, 0xA1, 0xE1, 0x84, 0x86, 0xE1, 0x85, 0xA1,
+ 0xE1, 0x84, 0x87, 0xE1, 0x85, 0xA1, 0xE1, 0x84,
+ 0x89, 0xE1, 0x85, 0xA1, 0xE1, 0x84, 0x8B, 0xE1,
+ 0x85, 0xA1, 0xE1, 0x84, 0x8C, 0xE1, 0x85, 0xA1,
+ 0xE1, 0x84, 0x8E, 0xE1, 0x85, 0xA1, 0xE1, 0x84,
+ 0x8F, 0xE1, 0x85, 0xA1, 0xE1, 0x84, 0x90, 0xE1,
+ 0x85, 0xA1, 0xE1, 0x84, 0x91, 0xE1, 0x85, 0xA1,
+ 0xE1, 0x84, 0x92, 0xE1, 0x85, 0xA1, 0xE1, 0x84,
+ 0x8E, 0xE1, 0x85, 0xA1, 0xE1, 0x86, 0xB7, 0xE1,
+ 0x84, 0x80, 0xE1, 0x85, 0xA9, 0xE1, 0x84, 0x8C,
+ 0xE1, 0x85, 0xAE, 0xE1, 0x84, 0x8B, 0xE1, 0x85,
+ 0xB4, 0xE1, 0x84, 0x8B, 0xE1, 0x85, 0xAE, 0xE4,
+ 0xB8, 0x80, 0xE4, 0xBA, 0x8C, 0xE4, 0xB8, 0x89,
+ 0xE5, 0x9B, 0x9B, 0xE4, 0xBA, 0x94, 0xE5, 0x85,
+ 0xAD, 0xE4, 0xB8, 0x83, 0xE5, 0x85, 0xAB, 0xE4,
+ 0xB9, 0x9D, 0xE5, 0x8D, 0x81, 0xE6, 0x9C, 0x88,
+ 0xE7, 0x81, 0xAB, 0xE6, 0xB0, 0xB4, 0xE6, 0x9C,
+ 0xA8, 0xE9, 0x87, 0x91, 0xE5, 0x9C, 0x9F, 0xE6,
+ 0x97, 0xA5, 0xE6, 0xA0, 0xAA, 0xE6, 0x9C, 0x89,
+ 0xE7, 0xA4, 0xBE, 0xE5, 0x90, 0x8D, 0xE7, 0x89,
+ 0xB9, 0xE8, 0xB2, 0xA1, 0xE7, 0xA5, 0x9D, 0xE5,
+ 0x8A, 0xB4, 0xE7, 0xA7, 0x98, 0xE7, 0x94, 0xB7,
+ 0xE5, 0xA5, 0xB3, 0xE9, 0x81, 0xA9, 0xE5, 0x84,
+ 0xAA, 0xE5, 0x8D, 0xB0, 0xE6, 0xB3, 0xA8, 0xE9,
+ 0xA0, 0x85, 0xE4, 0xBC, 0x91, 0xE5, 0x86, 0x99,
+ 0xE6, 0xAD, 0xA3, 0xE4, 0xB8, 0x8A, 0xE4, 0xB8,
+ 0xAD, 0xE4, 0xB8, 0x8B, 0xE5, 0xB7, 0xA6, 0xE5,
+ 0x8F, 0xB3, 0xE5, 0x8C, 0xBB, 0xE5, 0xAE, 0x97,
+ 0xE5, 0xAD, 0xA6, 0xE7, 0x9B, 0xA3, 0xE4, 0xBC,
+ 0x81, 0xE8, 0xB3, 0x87, 0xE5, 0x8D, 0x94, 0xE5,
+ 0xA4, 0x9C, 0x33, 0x36, 0x33, 0x37, 0x33, 0x38,
+ 0x33, 0x39, 0x34, 0x30, 0x34, 0x31, 0x34, 0x32,
+ 0x34, 0x33, 0x34, 0x34, 0x34, 0x35, 0x34, 0x36,
+ 0x34, 0x37, 0x34, 0x38, 0x34, 0x39, 0x35, 0x30,
+ 0x31, 0xE6, 0x9C, 0x88, 0x32, 0xE6, 0x9C, 0x88,
+ 0x33, 0xE6, 0x9C, 0x88, 0x34, 0xE6, 0x9C, 0x88,
+ 0x35, 0xE6, 0x9C, 0x88, 0x36, 0xE6, 0x9C, 0x88,
+ 0x37, 0xE6, 0x9C, 0x88, 0x38, 0xE6, 0x9C, 0x88,
+ 0x39, 0xE6, 0x9C, 0x88, 0x31, 0x30, 0xE6, 0x9C,
+ 0x88, 0x31, 0x31, 0xE6, 0x9C, 0x88, 0x31, 0x32,
+ 0xE6, 0x9C, 0x88, 0x48, 0x67, 0x65, 0x72, 0x67,
+ 0x65, 0x56, 0x4C, 0x54, 0x44, 0xE3, 0x82, 0xA2,
+ 0xE3, 0x82, 0xA4, 0xE3, 0x82, 0xA6, 0xE3, 0x82,
+ 0xA8, 0xE3, 0x82, 0xAA, 0xE3, 0x82, 0xAB, 0xE3,
+ 0x82, 0xAD, 0xE3, 0x82, 0xAF, 0xE3, 0x82, 0xB1,
+ 0xE3, 0x82, 0xB3, 0xE3, 0x82, 0xB5, 0xE3, 0x82,
+ 0xB7, 0xE3, 0x82, 0xB9, 0xE3, 0x82, 0xBB, 0xE3,
+ 0x82, 0xBD, 0xE3, 0x82, 0xBF, 0xE3, 0x83, 0x81,
+ 0xE3, 0x83, 0x84, 0xE3, 0x83, 0x86, 0xE3, 0x83,
+ 0x88, 0xE3, 0x83, 0x8A, 0xE3, 0x83, 0x8B, 0xE3,
+ 0x83, 0x8C, 0xE3, 0x83, 0x8D, 0xE3, 0x83, 0x8E,
+ 0xE3, 0x83, 0x8F, 0xE3, 0x83, 0x92, 0xE3, 0x83,
+ 0x95, 0xE3, 0x83, 0x98, 0xE3, 0x83, 0x9B, 0xE3,
+ 0x83, 0x9E, 0xE3, 0x83, 0x9F, 0xE3, 0x83, 0xA0,
+ 0xE3, 0x83, 0xA1, 0xE3, 0x83, 0xA2, 0xE3, 0x83,
+ 0xA4, 0xE3, 0x83, 0xA6, 0xE3, 0x83, 0xA8, 0xE3,
+ 0x83, 0xA9, 0xE3, 0x83, 0xAA, 0xE3, 0x83, 0xAB,
+ 0xE3, 0x83, 0xAC, 0xE3, 0x83, 0xAD, 0xE3, 0x83,
+ 0xAF, 0xE3, 0x83, 0xB0, 0xE3, 0x83, 0xB1, 0xE3,
+ 0x83, 0xB2, 0xE3, 0x82, 0xA2, 0xE3, 0x83, 0x8F,
+ 0xE3, 0x82, 0x9A, 0xE3, 0x83, 0xBC, 0xE3, 0x83,
+ 0x88, 0xE3, 0x82, 0xA2, 0xE3, 0x83, 0xAB, 0xE3,
+ 0x83, 0x95, 0xE3, 0x82, 0xA1, 0xE3, 0x82, 0xA2,
+ 0xE3, 0x83, 0xB3, 0xE3, 0x83, 0x98, 0xE3, 0x82,
+ 0x9A, 0xE3, 0x82, 0xA2, 0xE3, 0x82, 0xA2, 0xE3,
+ 0x83, 0xBC, 0xE3, 0x83, 0xAB, 0xE3, 0x82, 0xA4,
+ 0xE3, 0x83, 0x8B, 0xE3, 0x83, 0xB3, 0xE3, 0x82,
+ 0xAF, 0xE3, 0x82, 0x99, 0xE3, 0x82, 0xA4, 0xE3,
+ 0x83, 0xB3, 0xE3, 0x83, 0x81, 0xE3, 0x82, 0xA6,
+ 0xE3, 0x82, 0xA9, 0xE3, 0x83, 0xB3, 0xE3, 0x82,
+ 0xA8, 0xE3, 0x82, 0xB9, 0xE3, 0x82, 0xAF, 0xE3,
+ 0x83, 0xBC, 0xE3, 0x83, 0x88, 0xE3, 0x82, 0x99,
+ 0xE3, 0x82, 0xA8, 0xE3, 0x83, 0xBC, 0xE3, 0x82,
+ 0xAB, 0xE3, 0x83, 0xBC, 0xE3, 0x82, 0xAA, 0xE3,
+ 0x83, 0xB3, 0xE3, 0x82, 0xB9, 0xE3, 0x82, 0xAA,
+ 0xE3, 0x83, 0xBC, 0xE3, 0x83, 0xA0, 0xE3, 0x82,
+ 0xAB, 0xE3, 0x82, 0xA4, 0xE3, 0x83, 0xAA, 0xE3,
+ 0x82, 0xAB, 0xE3, 0x83, 0xA9, 0xE3, 0x83, 0x83,
+ 0xE3, 0x83, 0x88, 0xE3, 0x82, 0xAB, 0xE3, 0x83,
+ 0xAD, 0xE3, 0x83, 0xAA, 0xE3, 0x83, 0xBC, 0xE3,
+ 0x82, 0xAB, 0xE3, 0x82, 0x99, 0xE3, 0x83, 0xAD,
+ 0xE3, 0x83, 0xB3, 0xE3, 0x82, 0xAB, 0xE3, 0x82,
+ 0x99, 0xE3, 0x83, 0xB3, 0xE3, 0x83, 0x9E, 0xE3,
+ 0x82, 0xAD, 0xE3, 0x82, 0x99, 0xE3, 0x82, 0xAB,
+ 0xE3, 0x82, 0x99, 0xE3, 0x82, 0xAD, 0xE3, 0x82,
+ 0x99, 0xE3, 0x83, 0x8B, 0xE3, 0x83, 0xBC, 0xE3,
+ 0x82, 0xAD, 0xE3, 0x83, 0xA5, 0xE3, 0x83, 0xAA,
+ 0xE3, 0x83, 0xBC, 0xE3, 0x82, 0xAD, 0xE3, 0x82,
+ 0x99, 0xE3, 0x83, 0xAB, 0xE3, 0x82, 0xBF, 0xE3,
+ 0x82, 0x99, 0xE3, 0x83, 0xBC, 0xE3, 0x82, 0xAD,
+ 0xE3, 0x83, 0xAD, 0xE3, 0x82, 0xAD, 0xE3, 0x83,
+ 0xAD, 0xE3, 0x82, 0xAF, 0xE3, 0x82, 0x99, 0xE3,
+ 0x83, 0xA9, 0xE3, 0x83, 0xA0, 0xE3, 0x82, 0xAD,
+ 0xE3, 0x83, 0xAD, 0xE3, 0x83, 0xA1, 0xE3, 0x83,
+ 0xBC, 0xE3, 0x83, 0x88, 0xE3, 0x83, 0xAB, 0xE3,
+ 0x82, 0xAD, 0xE3, 0x83, 0xAD, 0xE3, 0x83, 0xAF,
+ 0xE3, 0x83, 0x83, 0xE3, 0x83, 0x88, 0xE3, 0x82,
+ 0xAF, 0xE3, 0x82, 0x99, 0xE3, 0x83, 0xA9, 0xE3,
+ 0x83, 0xA0, 0xE3, 0x82, 0xAF, 0xE3, 0x82, 0x99,
+ 0xE3, 0x83, 0xA9, 0xE3, 0x83, 0xA0, 0xE3, 0x83,
+ 0x88, 0xE3, 0x83, 0xB3, 0xE3, 0x82, 0xAF, 0xE3,
+ 0x83, 0xAB, 0xE3, 0x82, 0xBB, 0xE3, 0x82, 0x99,
+ 0xE3, 0x82, 0xA4, 0xE3, 0x83, 0xAD, 0xE3, 0x82,
+ 0xAF, 0xE3, 0x83, 0xAD, 0xE3, 0x83, 0xBC, 0xE3,
+ 0x83, 0x8D, 0xE3, 0x82, 0xB1, 0xE3, 0x83, 0xBC,
+ 0xE3, 0x82, 0xB9, 0xE3, 0x82, 0xB3, 0xE3, 0x83,
+ 0xAB, 0xE3, 0x83, 0x8A, 0xE3, 0x82, 0xB3, 0xE3,
+ 0x83, 0xBC, 0xE3, 0x83, 0x9B, 0xE3, 0x82, 0x9A,
+ 0xE3, 0x82, 0xB5, 0xE3, 0x82, 0xA4, 0xE3, 0x82,
+ 0xAF, 0xE3, 0x83, 0xAB, 0xE3, 0x82, 0xB5, 0xE3,
+ 0x83, 0xB3, 0xE3, 0x83, 0x81, 0xE3, 0x83, 0xBC,
+ 0xE3, 0x83, 0xA0, 0xE3, 0x82, 0xB7, 0xE3, 0x83,
+ 0xAA, 0xE3, 0x83, 0xB3, 0xE3, 0x82, 0xAF, 0xE3,
+ 0x82, 0x99, 0xE3, 0x82, 0xBB, 0xE3, 0x83, 0xB3,
+ 0xE3, 0x83, 0x81, 0xE3, 0x82, 0xBB, 0xE3, 0x83,
+ 0xB3, 0xE3, 0x83, 0x88, 0xE3, 0x82, 0xBF, 0xE3,
+ 0x82, 0x99, 0xE3, 0x83, 0xBC, 0xE3, 0x82, 0xB9,
+ 0xE3, 0x83, 0x86, 0xE3, 0x82, 0x99, 0xE3, 0x82,
+ 0xB7, 0xE3, 0x83, 0x88, 0xE3, 0x82, 0x99, 0xE3,
+ 0x83, 0xAB, 0xE3, 0x83, 0x88, 0xE3, 0x83, 0xB3,
+ 0xE3, 0x83, 0x8A, 0xE3, 0x83, 0x8E, 0xE3, 0x83,
+ 0x8E, 0xE3, 0x83, 0x83, 0xE3, 0x83, 0x88, 0xE3,
+ 0x83, 0x8F, 0xE3, 0x82, 0xA4, 0xE3, 0x83, 0x84,
+ 0xE3, 0x83, 0x8F, 0xE3, 0x82, 0x9A, 0xE3, 0x83,
+ 0xBC, 0xE3, 0x82, 0xBB, 0xE3, 0x83, 0xB3, 0xE3,
+ 0x83, 0x88, 0xE3, 0x83, 0x8F, 0xE3, 0x82, 0x9A,
+ 0xE3, 0x83, 0xBC, 0xE3, 0x83, 0x84, 0xE3, 0x83,
+ 0x8F, 0xE3, 0x82, 0x99, 0xE3, 0x83, 0xBC, 0xE3,
+ 0x83, 0xAC, 0xE3, 0x83, 0xAB, 0xE3, 0x83, 0x92,
+ 0xE3, 0x82, 0x9A, 0xE3, 0x82, 0xA2, 0xE3, 0x82,
+ 0xB9, 0xE3, 0x83, 0x88, 0xE3, 0x83, 0xAB, 0xE3,
+ 0x83, 0x92, 0xE3, 0x82, 0x9A, 0xE3, 0x82, 0xAF,
+ 0xE3, 0x83, 0xAB, 0xE3, 0x83, 0x92, 0xE3, 0x82,
+ 0x9A, 0xE3, 0x82, 0xB3, 0xE3, 0x83, 0x92, 0xE3,
+ 0x82, 0x99, 0xE3, 0x83, 0xAB, 0xE3, 0x83, 0x95,
+ 0xE3, 0x82, 0xA1, 0xE3, 0x83, 0xA9, 0xE3, 0x83,
+ 0x83, 0xE3, 0x83, 0x88, 0xE3, 0x82, 0x99, 0xE3,
+ 0x83, 0x95, 0xE3, 0x82, 0xA3, 0xE3, 0x83, 0xBC,
+ 0xE3, 0x83, 0x88, 0xE3, 0x83, 0x95, 0xE3, 0x82,
+ 0x99, 0xE3, 0x83, 0x83, 0xE3, 0x82, 0xB7, 0xE3,
+ 0x82, 0xA7, 0xE3, 0x83, 0xAB, 0xE3, 0x83, 0x95,
+ 0xE3, 0x83, 0xA9, 0xE3, 0x83, 0xB3, 0xE3, 0x83,
+ 0x98, 0xE3, 0x82, 0xAF, 0xE3, 0x82, 0xBF, 0xE3,
+ 0x83, 0xBC, 0xE3, 0x83, 0xAB, 0xE3, 0x83, 0x98,
+ 0xE3, 0x82, 0x9A, 0xE3, 0x82, 0xBD, 0xE3, 0x83,
+ 0x98, 0xE3, 0x82, 0x9A, 0xE3, 0x83, 0x8B, 0xE3,
+ 0x83, 0x92, 0xE3, 0x83, 0x98, 0xE3, 0x83, 0xAB,
+ 0xE3, 0x83, 0x84, 0xE3, 0x83, 0x98, 0xE3, 0x82,
+ 0x9A, 0xE3, 0x83, 0xB3, 0xE3, 0x82, 0xB9, 0xE3,
+ 0x83, 0x98, 0xE3, 0x82, 0x9A, 0xE3, 0x83, 0xBC,
+ 0xE3, 0x82, 0xB7, 0xE3, 0x82, 0x99, 0xE3, 0x83,
+ 0x98, 0xE3, 0x82, 0x99, 0xE3, 0x83, 0xBC, 0xE3,
+ 0x82, 0xBF, 0xE3, 0x83, 0x9B, 0xE3, 0x82, 0x9A,
+ 0xE3, 0x82, 0xA4, 0xE3, 0x83, 0xB3, 0xE3, 0x83,
+ 0x88, 0xE3, 0x83, 0x9B, 0xE3, 0x82, 0x99, 0xE3,
+ 0x83, 0xAB, 0xE3, 0x83, 0x88, 0xE3, 0x83, 0x9B,
+ 0xE3, 0x83, 0xB3, 0xE3, 0x83, 0x9B, 0xE3, 0x82,
+ 0x9A, 0xE3, 0x83, 0xB3, 0xE3, 0x83, 0x88, 0xE3,
+ 0x82, 0x99, 0xE3, 0x83, 0x9B, 0xE3, 0x83, 0xBC,
+ 0xE3, 0x83, 0xAB, 0xE3, 0x83, 0x9B, 0xE3, 0x83,
+ 0xBC, 0xE3, 0x83, 0xB3, 0xE3, 0x83, 0x9E, 0xE3,
+ 0x82, 0xA4, 0xE3, 0x82, 0xAF, 0xE3, 0x83, 0xAD,
+ 0xE3, 0x83, 0x9E, 0xE3, 0x82, 0xA4, 0xE3, 0x83,
+ 0xAB, 0xE3, 0x83, 0x9E, 0xE3, 0x83, 0x83, 0xE3,
+ 0x83, 0x8F, 0xE3, 0x83, 0x9E, 0xE3, 0x83, 0xAB,
+ 0xE3, 0x82, 0xAF, 0xE3, 0x83, 0x9E, 0xE3, 0x83,
+ 0xB3, 0xE3, 0x82, 0xB7, 0xE3, 0x83, 0xA7, 0xE3,
+ 0x83, 0xB3, 0xE3, 0x83, 0x9F, 0xE3, 0x82, 0xAF,
+ 0xE3, 0x83, 0xAD, 0xE3, 0x83, 0xB3, 0xE3, 0x83,
+ 0x9F, 0xE3, 0x83, 0xAA, 0xE3, 0x83, 0x9F, 0xE3,
+ 0x83, 0xAA, 0xE3, 0x83, 0x8F, 0xE3, 0x82, 0x99,
+ 0xE3, 0x83, 0xBC, 0xE3, 0x83, 0xAB, 0xE3, 0x83,
+ 0xA1, 0xE3, 0x82, 0xAB, 0xE3, 0x82, 0x99, 0xE3,
+ 0x83, 0xA1, 0xE3, 0x82, 0xAB, 0xE3, 0x82, 0x99,
+ 0xE3, 0x83, 0x88, 0xE3, 0x83, 0xB3, 0xE3, 0x83,
+ 0xA1, 0xE3, 0x83, 0xBC, 0xE3, 0x83, 0x88, 0xE3,
+ 0x83, 0xAB, 0xE3, 0x83, 0xA4, 0xE3, 0x83, 0xBC,
+ 0xE3, 0x83, 0x88, 0xE3, 0x82, 0x99, 0xE3, 0x83,
+ 0xA4, 0xE3, 0x83, 0xBC, 0xE3, 0x83, 0xAB, 0xE3,
+ 0x83, 0xA6, 0xE3, 0x82, 0xA2, 0xE3, 0x83, 0xB3,
+ 0xE3, 0x83, 0xAA, 0xE3, 0x83, 0x83, 0xE3, 0x83,
+ 0x88, 0xE3, 0x83, 0xAB, 0xE3, 0x83, 0xAA, 0xE3,
+ 0x83, 0xA9, 0xE3, 0x83, 0xAB, 0xE3, 0x83, 0x92,
+ 0xE3, 0x82, 0x9A, 0xE3, 0x83, 0xBC, 0xE3, 0x83,
+ 0xAB, 0xE3, 0x83, 0xBC, 0xE3, 0x83, 0x95, 0xE3,
+ 0x82, 0x99, 0xE3, 0x83, 0xAB, 0xE3, 0x83, 0xAC,
+ 0xE3, 0x83, 0xA0, 0xE3, 0x83, 0xAC, 0xE3, 0x83,
+ 0xB3, 0xE3, 0x83, 0x88, 0xE3, 0x82, 0xB1, 0xE3,
+ 0x82, 0x99, 0xE3, 0x83, 0xB3, 0xE3, 0x83, 0xAF,
+ 0xE3, 0x83, 0x83, 0xE3, 0x83, 0x88, 0x30, 0xE7,
+ 0x82, 0xB9, 0x31, 0xE7, 0x82, 0xB9, 0x32, 0xE7,
+ 0x82, 0xB9, 0x33, 0xE7, 0x82, 0xB9, 0x34, 0xE7,
+ 0x82, 0xB9, 0x35, 0xE7, 0x82, 0xB9, 0x36, 0xE7,
+ 0x82, 0xB9, 0x37, 0xE7, 0x82, 0xB9, 0x38, 0xE7,
+ 0x82, 0xB9, 0x39, 0xE7, 0x82, 0xB9, 0x31, 0x30,
+ 0xE7, 0x82, 0xB9, 0x31, 0x31, 0xE7, 0x82, 0xB9,
+ 0x31, 0x32, 0xE7, 0x82, 0xB9, 0x31, 0x33, 0xE7,
+ 0x82, 0xB9, 0x31, 0x34, 0xE7, 0x82, 0xB9, 0x31,
+ 0x35, 0xE7, 0x82, 0xB9, 0x31, 0x36, 0xE7, 0x82,
+ 0xB9, 0x31, 0x37, 0xE7, 0x82, 0xB9, 0x31, 0x38,
+ 0xE7, 0x82, 0xB9, 0x31, 0x39, 0xE7, 0x82, 0xB9,
+ 0x32, 0x30, 0xE7, 0x82, 0xB9, 0x32, 0x31, 0xE7,
+ 0x82, 0xB9, 0x32, 0x32, 0xE7, 0x82, 0xB9, 0x32,
+ 0x33, 0xE7, 0x82, 0xB9, 0x32, 0x34, 0xE7, 0x82,
+ 0xB9, 0x68, 0x50, 0x61, 0x64, 0x61, 0x41, 0x55,
+ 0x62, 0x61, 0x72, 0x6F, 0x56, 0x70, 0x63, 0x64,
+ 0x6D, 0x64, 0x6D, 0x32, 0x64, 0x6D, 0x33, 0x49,
+ 0x55, 0xE5, 0xB9, 0xB3, 0xE6, 0x88, 0x90, 0xE6,
+ 0x98, 0xAD, 0xE5, 0x92, 0x8C, 0xE5, 0xA4, 0xA7,
+ 0xE6, 0xAD, 0xA3, 0xE6, 0x98, 0x8E, 0xE6, 0xB2,
+ 0xBB, 0xE6, 0xA0, 0xAA, 0xE5, 0xBC, 0x8F, 0xE4,
+ 0xBC, 0x9A, 0xE7, 0xA4, 0xBE, 0x70, 0x41, 0x6E,
+ 0x41, 0xCE, 0xBC, 0x41, 0x6D, 0x41, 0x6B, 0x41,
+ 0x4B, 0x42, 0x4D, 0x42, 0x47, 0x42, 0x63, 0x61,
+ 0x6C, 0x6B, 0x63, 0x61, 0x6C, 0x70, 0x46, 0x6E,
+ 0x46, 0xCE, 0xBC, 0x46, 0xCE, 0xBC, 0x67, 0x6D,
+ 0x67, 0x6B, 0x67, 0x48, 0x7A, 0x6B, 0x48, 0x7A,
+ 0x4D, 0x48, 0x7A, 0x47, 0x48, 0x7A, 0x54, 0x48,
+ 0x7A, 0xCE, 0xBC, 0x6C, 0x6D, 0x6C, 0x64, 0x6C,
+ 0x6B, 0x6C, 0x66, 0x6D, 0x6E, 0x6D, 0xCE, 0xBC,
+ 0x6D, 0x6D, 0x6D, 0x63, 0x6D, 0x6B, 0x6D, 0x6D,
+ 0x6D, 0x32, 0x63, 0x6D, 0x32, 0x6D, 0x32, 0x6B,
+ 0x6D, 0x32, 0x6D, 0x6D, 0x33, 0x63, 0x6D, 0x33,
+ 0x6D, 0x33, 0x6B, 0x6D, 0x33, 0x6D, 0xE2, 0x88,
+ 0x95, 0x73, 0x6D, 0xE2, 0x88, 0x95, 0x73, 0x32,
+ 0x50, 0x61, 0x6B, 0x50, 0x61, 0x4D, 0x50, 0x61,
+ 0x47, 0x50, 0x61, 0x72, 0x61, 0x64, 0x72, 0x61,
+ 0x64, 0xE2, 0x88, 0x95, 0x73, 0x72, 0x61, 0x64,
+ 0xE2, 0x88, 0x95, 0x73, 0x32, 0x70, 0x73, 0x6E,
+ 0x73, 0xCE, 0xBC, 0x73, 0x6D, 0x73, 0x70, 0x56,
+ 0x6E, 0x56, 0xCE, 0xBC, 0x56, 0x6D, 0x56, 0x6B,
+ 0x56, 0x4D, 0x56, 0x70, 0x57, 0x6E, 0x57, 0xCE,
+ 0xBC, 0x57, 0x6D, 0x57, 0x6B, 0x57, 0x4D, 0x57,
+ 0x6B, 0xCE, 0xA9, 0x4D, 0xCE, 0xA9, 0x61, 0x2E,
+ 0x6D, 0x2E, 0x42, 0x71, 0x63, 0x63, 0x63, 0x64,
+ 0x43, 0xE2, 0x88, 0x95, 0x6B, 0x67, 0x43, 0x6F,
+ 0x2E, 0x64, 0x42, 0x47, 0x79, 0x68, 0x61, 0x48,
+ 0x50, 0x69, 0x6E, 0x4B, 0x4B, 0x4B, 0x4D, 0x6B,
+ 0x74, 0x6C, 0x6D, 0x6C, 0x6E, 0x6C, 0x6F, 0x67,
+ 0x6C, 0x78, 0x6D, 0x62, 0x6D, 0x69, 0x6C, 0x6D,
+ 0x6F, 0x6C, 0x50, 0x48, 0x70, 0x2E, 0x6D, 0x2E,
+ 0x50, 0x50, 0x4D, 0x50, 0x52, 0x73, 0x72, 0x53,
+ 0x76, 0x57, 0x62, 0x56, 0xE2, 0x88, 0x95, 0x6D,
+ 0x41, 0xE2, 0x88, 0x95, 0x6D, 0x31, 0xE6, 0x97,
+ 0xA5, 0x32, 0xE6, 0x97, 0xA5, 0x33, 0xE6, 0x97,
+ 0xA5, 0x34, 0xE6, 0x97, 0xA5, 0x35, 0xE6, 0x97,
+ 0xA5, 0x36, 0xE6, 0x97, 0xA5, 0x37, 0xE6, 0x97,
+ 0xA5, 0x38, 0xE6, 0x97, 0xA5, 0x39, 0xE6, 0x97,
+ 0xA5, 0x31, 0x30, 0xE6, 0x97, 0xA5, 0x31, 0x31,
+ 0xE6, 0x97, 0xA5, 0x31, 0x32, 0xE6, 0x97, 0xA5,
+ 0x31, 0x33, 0xE6, 0x97, 0xA5, 0x31, 0x34, 0xE6,
+ 0x97, 0xA5, 0x31, 0x35, 0xE6, 0x97, 0xA5, 0x31,
+ 0x36, 0xE6, 0x97, 0xA5, 0x31, 0x37, 0xE6, 0x97,
+ 0xA5, 0x31, 0x38, 0xE6, 0x97, 0xA5, 0x31, 0x39,
+ 0xE6, 0x97, 0xA5, 0x32, 0x30, 0xE6, 0x97, 0xA5,
+ 0x32, 0x31, 0xE6, 0x97, 0xA5, 0x32, 0x32, 0xE6,
+ 0x97, 0xA5, 0x32, 0x33, 0xE6, 0x97, 0xA5, 0x32,
+ 0x34, 0xE6, 0x97, 0xA5, 0x32, 0x35, 0xE6, 0x97,
+ 0xA5, 0x32, 0x36, 0xE6, 0x97, 0xA5, 0x32, 0x37,
+ 0xE6, 0x97, 0xA5, 0x32, 0x38, 0xE6, 0x97, 0xA5,
+ 0x32, 0x39, 0xE6, 0x97, 0xA5, 0x33, 0x30, 0xE6,
+ 0x97, 0xA5, 0x33, 0x31, 0xE6, 0x97, 0xA5, 0x67,
+ 0x61, 0x6C, 0xF6, 0xE8, 0xB1, 0x88, 0xF6, 0xE6,
+ 0x9B, 0xB4, 0xF6, 0xE8, 0xBB, 0x8A, 0xF6, 0xE8,
+ 0xB3, 0x88, 0xF6, 0xE6, 0xBB, 0x91, 0xF6, 0xE4,
+ 0xB8, 0xB2, 0xF6, 0xE5, 0x8F, 0xA5, 0xF6, 0xE9,
+ 0xBE, 0x9C, 0xF6, 0xE9, 0xBE, 0x9C, 0xF6, 0xE5,
+ 0xA5, 0x91, 0xF6, 0xE9, 0x87, 0x91, 0xF6, 0xE5,
+ 0x96, 0x87, 0xF6, 0xE5, 0xA5, 0x88, 0xF6, 0xE6,
+ 0x87, 0xB6, 0xF6, 0xE7, 0x99, 0xA9, 0xF6, 0xE7,
+ 0xBE, 0x85, 0xF6, 0xE8, 0x98, 0xBF, 0xF6, 0xE8,
+ 0x9E, 0xBA, 0xF6, 0xE8, 0xA3, 0xB8, 0xF6, 0xE9,
+ 0x82, 0x8F, 0xF6, 0xE6, 0xA8, 0x82, 0xF6, 0xE6,
+ 0xB4, 0x9B, 0xF6, 0xE7, 0x83, 0x99, 0xF6, 0xE7,
+ 0x8F, 0x9E, 0xF6, 0xE8, 0x90, 0xBD, 0xF6, 0xE9,
+ 0x85, 0xAA, 0xF6, 0xE9, 0xA7, 0xB1, 0xF6, 0xE4,
+ 0xBA, 0x82, 0xF6, 0xE5, 0x8D, 0xB5, 0xF6, 0xE6,
+ 0xAC, 0x84, 0xF6, 0xE7, 0x88, 0x9B, 0xF6, 0xE8,
+ 0x98, 0xAD, 0xF6, 0xE9, 0xB8, 0x9E, 0xF6, 0xE5,
+ 0xB5, 0x90, 0xF6, 0xE6, 0xBF, 0xAB, 0xF6, 0xE8,
+ 0x97, 0x8D, 0xF6, 0xE8, 0xA5, 0xA4, 0xF6, 0xE6,
+ 0x8B, 0x89, 0xF6, 0xE8, 0x87, 0x98, 0xF6, 0xE8,
+ 0xA0, 0x9F, 0xF6, 0xE5, 0xBB, 0x8A, 0xF6, 0xE6,
+ 0x9C, 0x97, 0xF6, 0xE6, 0xB5, 0xAA, 0xF6, 0xE7,
+ 0x8B, 0xBC, 0xF6, 0xE9, 0x83, 0x8E, 0xF6, 0xE4,
+ 0xBE, 0x86, 0xF6, 0xE5, 0x86, 0xB7, 0xF6, 0xE5,
+ 0x8B, 0x9E, 0xF6, 0xE6, 0x93, 0x84, 0xF6, 0xE6,
+ 0xAB, 0x93, 0xF6, 0xE7, 0x88, 0x90, 0xF6, 0xE7,
+ 0x9B, 0xA7, 0xF6, 0xE8, 0x80, 0x81, 0xF6, 0xE8,
+ 0x98, 0x86, 0xF6, 0xE8, 0x99, 0x9C, 0xF6, 0xE8,
+ 0xB7, 0xAF, 0xF6, 0xE9, 0x9C, 0xB2, 0xF6, 0xE9,
+ 0xAD, 0xAF, 0xF6, 0xE9, 0xB7, 0xBA, 0xF6, 0xE7,
+ 0xA2, 0x8C, 0xF6, 0xE7, 0xA5, 0xBF, 0xF6, 0xE7,
+ 0xB6, 0xA0, 0xF6, 0xE8, 0x8F, 0x89, 0xF6, 0xE9,
+ 0x8C, 0x84, 0xF6, 0xE9, 0xB9, 0xBF, 0xF6, 0xE8,
+ 0xAB, 0x96, 0xF6, 0xE5, 0xA3, 0x9F, 0xF6, 0xE5,
+ 0xBC, 0x84, 0xF6, 0xE7, 0xB1, 0xA0, 0xF6, 0xE8,
+ 0x81, 0xBE, 0xF6, 0xE7, 0x89, 0xA2, 0xF6, 0xE7,
+ 0xA3, 0x8A, 0xF6, 0xE8, 0xB3, 0x82, 0xF6, 0xE9,
+ 0x9B, 0xB7, 0xF6, 0xE5, 0xA3, 0x98, 0xF6, 0xE5,
+ 0xB1, 0xA2, 0xF6, 0xE6, 0xA8, 0x93, 0xF6, 0xE6,
+ 0xB7, 0x9A, 0xF6, 0xE6, 0xBC, 0x8F, 0xF6, 0xE7,
+ 0xB4, 0xAF, 0xF6, 0xE7, 0xB8, 0xB7, 0xF6, 0xE9,
+ 0x99, 0x8B, 0xF6, 0xE5, 0x8B, 0x92, 0xF6, 0xE8,
+ 0x82, 0x8B, 0xF6, 0xE5, 0x87, 0x9C, 0xF6, 0xE5,
+ 0x87, 0x8C, 0xF6, 0xE7, 0xA8, 0x9C, 0xF6, 0xE7,
+ 0xB6, 0xBE, 0xF6, 0xE8, 0x8F, 0xB1, 0xF6, 0xE9,
+ 0x99, 0xB5, 0xF6, 0xE8, 0xAE, 0x80, 0xF6, 0xE6,
+ 0x8B, 0x8F, 0xF6, 0xE6, 0xA8, 0x82, 0xF6, 0xE8,
+ 0xAB, 0xBE, 0xF6, 0xE4, 0xB8, 0xB9, 0xF6, 0xE5,
+ 0xAF, 0xA7, 0xF6, 0xE6, 0x80, 0x92, 0xF6, 0xE7,
+ 0x8E, 0x87, 0xF6, 0xE7, 0x95, 0xB0, 0xF6, 0xE5,
+ 0x8C, 0x97, 0xF6, 0xE7, 0xA3, 0xBB, 0xF6, 0xE4,
+ 0xBE, 0xBF, 0xF6, 0xE5, 0xBE, 0xA9, 0xF6, 0xE4,
+ 0xB8, 0x8D, 0xF6, 0xE6, 0xB3, 0x8C, 0xF6, 0xE6,
+ 0x95, 0xB8, 0xF6, 0xE7, 0xB4, 0xA2, 0xF6, 0xE5,
+ 0x8F, 0x83, 0xF6, 0xE5, 0xA1, 0x9E, 0xF6, 0xE7,
+ 0x9C, 0x81, 0xF6, 0xE8, 0x91, 0x89, 0xF6, 0xE8,
+ 0xAA, 0xAA, 0xF6, 0xE6, 0xAE, 0xBA, 0xF6, 0xE8,
+ 0xBE, 0xB0, 0xF6, 0xE6, 0xB2, 0x88, 0xF6, 0xE6,
+ 0x8B, 0xBE, 0xF6, 0xE8, 0x8B, 0xA5, 0xF6, 0xE6,
+ 0x8E, 0xA0, 0xF6, 0xE7, 0x95, 0xA5, 0xF6, 0xE4,
+ 0xBA, 0xAE, 0xF6, 0xE5, 0x85, 0xA9, 0xF6, 0xE5,
+ 0x87, 0x89, 0xF6, 0xE6, 0xA2, 0x81, 0xF6, 0xE7,
+ 0xB3, 0xA7, 0xF6, 0xE8, 0x89, 0xAF, 0xF6, 0xE8,
+ 0xAB, 0x92, 0xF6, 0xE9, 0x87, 0x8F, 0xF6, 0xE5,
+ 0x8B, 0xB5, 0xF6, 0xE5, 0x91, 0x82, 0xF6, 0xE5,
+ 0xA5, 0xB3, 0xF6, 0xE5, 0xBB, 0xAC, 0xF6, 0xE6,
+ 0x97, 0x85, 0xF6, 0xE6, 0xBF, 0xBE, 0xF6, 0xE7,
+ 0xA4, 0xAA, 0xF6, 0xE9, 0x96, 0xAD, 0xF6, 0xE9,
+ 0xA9, 0xAA, 0xF6, 0xE9, 0xBA, 0x97, 0xF6, 0xE9,
+ 0xBB, 0x8E, 0xF6, 0xE5, 0x8A, 0x9B, 0xF6, 0xE6,
+ 0x9B, 0x86, 0xF6, 0xE6, 0xAD, 0xB7, 0xF6, 0xE8,
+ 0xBD, 0xA2, 0xF6, 0xE5, 0xB9, 0xB4, 0xF6, 0xE6,
+ 0x86, 0x90, 0xF6, 0xE6, 0x88, 0x80, 0xF6, 0xE6,
+ 0x92, 0x9A, 0xF6, 0xE6, 0xBC, 0xA3, 0xF6, 0xE7,
+ 0x85, 0x89, 0xF6, 0xE7, 0x92, 0x89, 0xF6, 0xE7,
+ 0xA7, 0x8A, 0xF6, 0xE7, 0xB7, 0xB4, 0xF6, 0xE8,
+ 0x81, 0xAF, 0xF6, 0xE8, 0xBC, 0xA6, 0xF6, 0xE8,
+ 0x93, 0xAE, 0xF6, 0xE9, 0x80, 0xA3, 0xF6, 0xE9,
+ 0x8D, 0x8A, 0xF6, 0xE5, 0x88, 0x97, 0xF6, 0xE5,
+ 0x8A, 0xA3, 0xF6, 0xE5, 0x92, 0xBD, 0xF6, 0xE7,
+ 0x83, 0x88, 0xF6, 0xE8, 0xA3, 0x82, 0xF6, 0xE8,
+ 0xAA, 0xAA, 0xF6, 0xE5, 0xBB, 0x89, 0xF6, 0xE5,
+ 0xBF, 0xB5, 0xF6, 0xE6, 0x8D, 0xBB, 0xF6, 0xE6,
+ 0xAE, 0xAE, 0xF6, 0xE7, 0xB0, 0xBE, 0xF6, 0xE7,
+ 0x8D, 0xB5, 0xF6, 0xE4, 0xBB, 0xA4, 0xF6, 0xE5,
+ 0x9B, 0xB9, 0xF6, 0xE5, 0xAF, 0xA7, 0xF6, 0xE5,
+ 0xB6, 0xBA, 0xF6, 0xE6, 0x80, 0x9C, 0xF6, 0xE7,
+ 0x8E, 0xB2, 0xF6, 0xE7, 0x91, 0xA9, 0xF6, 0xE7,
+ 0xBE, 0x9A, 0xF6, 0xE8, 0x81, 0x86, 0xF6, 0xE9,
+ 0x88, 0xB4, 0xF6, 0xE9, 0x9B, 0xB6, 0xF6, 0xE9,
+ 0x9D, 0x88, 0xF6, 0xE9, 0xA0, 0x98, 0xF6, 0xE4,
+ 0xBE, 0x8B, 0xF6, 0xE7, 0xA6, 0xAE, 0xF6, 0xE9,
+ 0x86, 0xB4, 0xF6, 0xE9, 0x9A, 0xB8, 0xF6, 0xE6,
+ 0x83, 0xA1, 0xF6, 0xE4, 0xBA, 0x86, 0xF6, 0xE5,
+ 0x83, 0x9A, 0xF6, 0xE5, 0xAF, 0xAE, 0xF6, 0xE5,
+ 0xB0, 0xBF, 0xF6, 0xE6, 0x96, 0x99, 0xF6, 0xE6,
+ 0xA8, 0x82, 0xF6, 0xE7, 0x87, 0x8E, 0xF6, 0xE7,
+ 0x99, 0x82, 0xF6, 0xE8, 0x93, 0xBC, 0xF6, 0xE9,
+ 0x81, 0xBC, 0xF6, 0xE9, 0xBE, 0x8D, 0xF6, 0xE6,
+ 0x9A, 0x88, 0xF6, 0xE9, 0x98, 0xAE, 0xF6, 0xE5,
+ 0x8A, 0x89, 0xF6, 0xE6, 0x9D, 0xBB, 0xF6, 0xE6,
+ 0x9F, 0xB3, 0xF6, 0xE6, 0xB5, 0x81, 0xF6, 0xE6,
+ 0xBA, 0x9C, 0xF6, 0xE7, 0x90, 0x89, 0xF6, 0xE7,
+ 0x95, 0x99, 0xF6, 0xE7, 0xA1, 0xAB, 0xF6, 0xE7,
+ 0xB4, 0x90, 0xF6, 0xE9, 0xA1, 0x9E, 0xF6, 0xE5,
+ 0x85, 0xAD, 0xF6, 0xE6, 0x88, 0xAE, 0xF6, 0xE9,
+ 0x99, 0xB8, 0xF6, 0xE5, 0x80, 0xAB, 0xF6, 0xE5,
+ 0xB4, 0x99, 0xF6, 0xE6, 0xB7, 0xAA, 0xF6, 0xE8,
+ 0xBC, 0xAA, 0xF6, 0xE5, 0xBE, 0x8B, 0xF6, 0xE6,
+ 0x85, 0x84, 0xF6, 0xE6, 0xA0, 0x97, 0xF6, 0xE7,
+ 0x8E, 0x87, 0xF6, 0xE9, 0x9A, 0x86, 0xF6, 0xE5,
+ 0x88, 0xA9, 0xF6, 0xE5, 0x90, 0x8F, 0xF6, 0xE5,
+ 0xB1, 0xA5, 0xF6, 0xE6, 0x98, 0x93, 0xF6, 0xE6,
+ 0x9D, 0x8E, 0xF6, 0xE6, 0xA2, 0xA8, 0xF6, 0xE6,
+ 0xB3, 0xA5, 0xF6, 0xE7, 0x90, 0x86, 0xF6, 0xE7,
+ 0x97, 0xA2, 0xF6, 0xE7, 0xBD, 0xB9, 0xF6, 0xE8,
+ 0xA3, 0x8F, 0xF6, 0xE8, 0xA3, 0xA1, 0xF6, 0xE9,
+ 0x87, 0x8C, 0xF6, 0xE9, 0x9B, 0xA2, 0xF6, 0xE5,
+ 0x8C, 0xBF, 0xF6, 0xE6, 0xBA, 0xBA, 0xF6, 0xE5,
+ 0x90, 0x9D, 0xF6, 0xE7, 0x87, 0x90, 0xF6, 0xE7,
+ 0x92, 0x98, 0xF6, 0xE8, 0x97, 0xBA, 0xF6, 0xE9,
+ 0x9A, 0xA3, 0xF6, 0xE9, 0xB1, 0x97, 0xF6, 0xE9,
+ 0xBA, 0x9F, 0xF6, 0xE6, 0x9E, 0x97, 0xF6, 0xE6,
+ 0xB7, 0x8B, 0xF6, 0xE8, 0x87, 0xA8, 0xF6, 0xE7,
+ 0xAB, 0x8B, 0xF6, 0xE7, 0xAC, 0xA0, 0xF6, 0xE7,
+ 0xB2, 0x92, 0xF6, 0xE7, 0x8B, 0x80, 0xF6, 0xE7,
+ 0x82, 0x99, 0xF6, 0xE8, 0xAD, 0x98, 0xF6, 0xE4,
+ 0xBB, 0x80, 0xF6, 0xE8, 0x8C, 0xB6, 0xF6, 0xE5,
+ 0x88, 0xBA, 0xF6, 0xE5, 0x88, 0x87, 0xF6, 0xE5,
+ 0xBA, 0xA6, 0xF6, 0xE6, 0x8B, 0x93, 0xF6, 0xE7,
+ 0xB3, 0x96, 0xF6, 0xE5, 0xAE, 0x85, 0xF6, 0xE6,
+ 0xB4, 0x9E, 0xF6, 0xE6, 0x9A, 0xB4, 0xF6, 0xE8,
+ 0xBC, 0xBB, 0xF6, 0xE8, 0xA1, 0x8C, 0xF6, 0xE9,
+ 0x99, 0x8D, 0xF6, 0xE8, 0xA6, 0x8B, 0xF6, 0xE5,
+ 0xBB, 0x93, 0xF6, 0xE5, 0x85, 0x80, 0xF6, 0xE5,
+ 0x97, 0x80, 0xF6, 0xE5, 0xA1, 0x9A, 0xF6, 0xE6,
+ 0x99, 0xB4, 0xF6, 0xE5, 0x87, 0x9E, 0xF6, 0xE7,
+ 0x8C, 0xAA, 0xF6, 0xE7, 0x9B, 0x8A, 0xF6, 0xE7,
+ 0xA4, 0xBC, 0xF6, 0xE7, 0xA5, 0x9E, 0xF6, 0xE7,
+ 0xA5, 0xA5, 0xF6, 0xE7, 0xA6, 0x8F, 0xF6, 0xE9,
+ 0x9D, 0x96, 0xF6, 0xE7, 0xB2, 0xBE, 0xF6, 0xE7,
+ 0xBE, 0xBD, 0xF6, 0xE8, 0x98, 0x92, 0xF6, 0xE8,
+ 0xAB, 0xB8, 0xF6, 0xE9, 0x80, 0xB8, 0xF6, 0xE9,
+ 0x83, 0xBD, 0xF6, 0xE9, 0xA3, 0xAF, 0xF6, 0xE9,
+ 0xA3, 0xBC, 0xF6, 0xE9, 0xA4, 0xA8, 0xF6, 0xE9,
+ 0xB6, 0xB4, 0xF6, 0xE4, 0xBE, 0xAE, 0xF6, 0xE5,
+ 0x83, 0xA7, 0xF6, 0xE5, 0x85, 0x8D, 0xF6, 0xE5,
+ 0x8B, 0x89, 0xF6, 0xE5, 0x8B, 0xA4, 0xF6, 0xE5,
+ 0x8D, 0x91, 0xF6, 0xE5, 0x96, 0x9D, 0xF6, 0xE5,
+ 0x98, 0x86, 0xF6, 0xE5, 0x99, 0xA8, 0xF6, 0xE5,
+ 0xA1, 0x80, 0xF6, 0xE5, 0xA2, 0xA8, 0xF6, 0xE5,
+ 0xB1, 0xA4, 0xF6, 0xE5, 0xB1, 0xAE, 0xF6, 0xE6,
+ 0x82, 0x94, 0xF6, 0xE6, 0x85, 0xA8, 0xF6, 0xE6,
+ 0x86, 0x8E, 0xF6, 0xE6, 0x87, 0xB2, 0xF6, 0xE6,
+ 0x95, 0x8F, 0xF6, 0xE6, 0x97, 0xA2, 0xF6, 0xE6,
+ 0x9A, 0x91, 0xF6, 0xE6, 0xA2, 0x85, 0xF6, 0xE6,
+ 0xB5, 0xB7, 0xF6, 0xE6, 0xB8, 0x9A, 0xF6, 0xE6,
+ 0xBC, 0xA2, 0xF6, 0xE7, 0x85, 0xAE, 0xF6, 0xE7,
+ 0x88, 0xAB, 0xF6, 0xE7, 0x90, 0xA2, 0xF6, 0xE7,
+ 0xA2, 0x91, 0xF6, 0xE7, 0xA4, 0xBE, 0xF6, 0xE7,
+ 0xA5, 0x89, 0xF6, 0xE7, 0xA5, 0x88, 0xF6, 0xE7,
+ 0xA5, 0x90, 0xF6, 0xE7, 0xA5, 0x96, 0xF6, 0xE7,
+ 0xA5, 0x9D, 0xF6, 0xE7, 0xA6, 0x8D, 0xF6, 0xE7,
+ 0xA6, 0x8E, 0xF6, 0xE7, 0xA9, 0x80, 0xF6, 0xE7,
+ 0xAA, 0x81, 0xF6, 0xE7, 0xAF, 0x80, 0xF6, 0xE7,
+ 0xB7, 0xB4, 0xF6, 0xE7, 0xB8, 0x89, 0xF6, 0xE7,
+ 0xB9, 0x81, 0xF6, 0xE7, 0xBD, 0xB2, 0xF6, 0xE8,
+ 0x80, 0x85, 0xF6, 0xE8, 0x87, 0xAD, 0xF6, 0xE8,
+ 0x89, 0xB9, 0xF6, 0xE8, 0x89, 0xB9, 0xF6, 0xE8,
+ 0x91, 0x97, 0xF6, 0xE8, 0xA4, 0x90, 0xF6, 0xE8,
+ 0xA6, 0x96, 0xF6, 0xE8, 0xAC, 0x81, 0xF6, 0xE8,
+ 0xAC, 0xB9, 0xF6, 0xE8, 0xB3, 0x93, 0xF6, 0xE8,
+ 0xB4, 0x88, 0xF6, 0xE8, 0xBE, 0xB6, 0xF6, 0xE9,
+ 0x80, 0xB8, 0xF6, 0xE9, 0x9B, 0xA3, 0xF6, 0xE9,
+ 0x9F, 0xBF, 0xF6, 0xE9, 0xA0, 0xBB, 0xF6, 0xE4,
+ 0xB8, 0xA6, 0xF6, 0xE5, 0x86, 0xB5, 0xF6, 0xE5,
+ 0x85, 0xA8, 0xF6, 0xE4, 0xBE, 0x80, 0xF6, 0xE5,
+ 0x85, 0x85, 0xF6, 0xE5, 0x86, 0x80, 0xF6, 0xE5,
+ 0x8B, 0x87, 0xF6, 0xE5, 0x8B, 0xBA, 0xF6, 0xE5,
+ 0x96, 0x9D, 0xF6, 0xE5, 0x95, 0x95, 0xF6, 0xE5,
+ 0x96, 0x99, 0xF6, 0xE5, 0x97, 0xA2, 0xF6, 0xE5,
+ 0xA1, 0x9A, 0xF6, 0xE5, 0xA2, 0xB3, 0xF6, 0xE5,
+ 0xA5, 0x84, 0xF6, 0xE5, 0xA5, 0x94, 0xF6, 0xE5,
+ 0xA9, 0xA2, 0xF6, 0xE5, 0xAC, 0xA8, 0xF6, 0xE5,
+ 0xBB, 0x92, 0xF6, 0xE5, 0xBB, 0x99, 0xF6, 0xE5,
+ 0xBD, 0xA9, 0xF6, 0xE5, 0xBE, 0xAD, 0xF6, 0xE6,
+ 0x83, 0x98, 0xF6, 0xE6, 0x85, 0x8E, 0xF6, 0xE6,
+ 0x84, 0x88, 0xF6, 0xE6, 0x86, 0x8E, 0xF6, 0xE6,
+ 0x85, 0xA0, 0xF6, 0xE6, 0x87, 0xB2, 0xF6, 0xE6,
+ 0x88, 0xB4, 0xF6, 0xE6, 0x8F, 0x84, 0xF6, 0xE6,
+ 0x90, 0x9C, 0xF6, 0xE6, 0x91, 0x92, 0xF6, 0xE6,
+ 0x95, 0x96, 0xF6, 0xE6, 0x99, 0xB4, 0xF6, 0xE6,
+ 0x9C, 0x97, 0xF6, 0xE6, 0x9C, 0x9B, 0xF6, 0xE6,
+ 0x9D, 0x96, 0xF6, 0xE6, 0xAD, 0xB9, 0xF6, 0xE6,
+ 0xAE, 0xBA, 0xF6, 0xE6, 0xB5, 0x81, 0xF6, 0xE6,
+ 0xBB, 0x9B, 0xF6, 0xE6, 0xBB, 0x8B, 0xF6, 0xE6,
+ 0xBC, 0xA2, 0xF6, 0xE7, 0x80, 0x9E, 0xF6, 0xE7,
+ 0x85, 0xAE, 0xF6, 0xE7, 0x9E, 0xA7, 0xF6, 0xE7,
+ 0x88, 0xB5, 0xF6, 0xE7, 0x8A, 0xAF, 0xF6, 0xE7,
+ 0x8C, 0xAA, 0xF6, 0xE7, 0x91, 0xB1, 0xF6, 0xE7,
+ 0x94, 0x86, 0xF6, 0xE7, 0x94, 0xBB, 0xF6, 0xE7,
+ 0x98, 0x9D, 0xF6, 0xE7, 0x98, 0x9F, 0xF6, 0xE7,
+ 0x9B, 0x8A, 0xF6, 0xE7, 0x9B, 0x9B, 0xF6, 0xE7,
+ 0x9B, 0xB4, 0xF6, 0xE7, 0x9D, 0x8A, 0xF6, 0xE7,
+ 0x9D, 0x80, 0xF6, 0xE7, 0xA3, 0x8C, 0xF6, 0xE7,
+ 0xAA, 0xB1, 0xF6, 0xE7, 0xAF, 0x80, 0xF6, 0xE7,
+ 0xB1, 0xBB, 0xF6, 0xE7, 0xB5, 0x9B, 0xF6, 0xE7,
+ 0xB7, 0xB4, 0xF6, 0xE7, 0xBC, 0xBE, 0xF6, 0xE8,
+ 0x80, 0x85, 0xF6, 0xE8, 0x8D, 0x92, 0xF6, 0xE8,
+ 0x8F, 0xAF, 0xF6, 0xE8, 0x9D, 0xB9, 0xF6, 0xE8,
+ 0xA5, 0x81, 0xF6, 0xE8, 0xA6, 0x86, 0xF6, 0xE8,
+ 0xA6, 0x96, 0xF6, 0xE8, 0xAA, 0xBF, 0xF6, 0xE8,
+ 0xAB, 0xB8, 0xF6, 0xE8, 0xAB, 0x8B, 0xF6, 0xE8,
+ 0xAC, 0x81, 0xF6, 0xE8, 0xAB, 0xBE, 0xF6, 0xE8,
+ 0xAB, 0xAD, 0xF6, 0xE8, 0xAC, 0xB9, 0xF6, 0xE8,
+ 0xAE, 0x8A, 0xF6, 0xE8, 0xB4, 0x88, 0xF6, 0xE8,
+ 0xBC, 0xB8, 0xF6, 0xE9, 0x81, 0xB2, 0xF6, 0xE9,
+ 0x86, 0x99, 0xF6, 0xE9, 0x89, 0xB6, 0xF6, 0xE9,
+ 0x99, 0xBC, 0xF6, 0xE9, 0x9B, 0xA3, 0xF6, 0xE9,
+ 0x9D, 0x96, 0xF6, 0xE9, 0x9F, 0x9B, 0xF6, 0xE9,
+ 0x9F, 0xBF, 0xF6, 0xE9, 0xA0, 0x8B, 0xF6, 0xE9,
+ 0xA0, 0xBB, 0xF6, 0xE9, 0xAC, 0x92, 0xF6, 0xE9,
+ 0xBE, 0x9C, 0xF6, 0xF0, 0xA2, 0xA1, 0x8A, 0xF6,
+ 0xF0, 0xA2, 0xA1, 0x84, 0xF6, 0xF0, 0xA3, 0x8F,
+ 0x95, 0xF6, 0xE3, 0xAE, 0x9D, 0xF6, 0xE4, 0x80,
+ 0x98, 0xF6, 0xE4, 0x80, 0xB9, 0xF6, 0xF0, 0xA5,
+ 0x89, 0x89, 0xF6, 0xF0, 0xA5, 0xB3, 0x90, 0xF6,
+ 0xF0, 0xA7, 0xBB, 0x93, 0xF6, 0xE9, 0xBD, 0x83,
+ 0xF6, 0xE9, 0xBE, 0x8E, 0x66, 0x66, 0x66, 0x69,
+ 0x66, 0x6C, 0x66, 0x66, 0x69, 0x66, 0x66, 0x6C,
+ 0x73, 0x74, 0x73, 0x74, 0xD5, 0xB4, 0xD5, 0xB6,
+ 0xD5, 0xB4, 0xD5, 0xA5, 0xD5, 0xB4, 0xD5, 0xAB,
+ 0xD5, 0xBE, 0xD5, 0xB6, 0xD5, 0xB4, 0xD5, 0xAD,
+ 0xF6, 0xD7, 0x99, 0xD6, 0xB4, 0xF6, 0xD7, 0xB2,
+ 0xD6, 0xB7, 0xD7, 0xA2, 0xD7, 0x90, 0xD7, 0x93,
+ 0xD7, 0x94, 0xD7, 0x9B, 0xD7, 0x9C, 0xD7, 0x9D,
+ 0xD7, 0xA8, 0xD7, 0xAA, 0x2B, 0xF6, 0xD7, 0xA9,
+ 0xD7, 0x81, 0xF6, 0xD7, 0xA9, 0xD7, 0x82, 0xF6,
+ 0xD7, 0xA9, 0xD6, 0xBC, 0xD7, 0x81, 0xF6, 0xD7,
+ 0xA9, 0xD6, 0xBC, 0xD7, 0x82, 0xF6, 0xD7, 0x90,
+ 0xD6, 0xB7, 0xF6, 0xD7, 0x90, 0xD6, 0xB8, 0xF6,
+ 0xD7, 0x90, 0xD6, 0xBC, 0xF6, 0xD7, 0x91, 0xD6,
+ 0xBC, 0xF6, 0xD7, 0x92, 0xD6, 0xBC, 0xF6, 0xD7,
+ 0x93, 0xD6, 0xBC, 0xF6, 0xD7, 0x94, 0xD6, 0xBC,
+ 0xF6, 0xD7, 0x95, 0xD6, 0xBC, 0xF6, 0xD7, 0x96,
+ 0xD6, 0xBC, 0xF6, 0xD7, 0x98, 0xD6, 0xBC, 0xF6,
+ 0xD7, 0x99, 0xD6, 0xBC, 0xF6, 0xD7, 0x9A, 0xD6,
+ 0xBC, 0xF6, 0xD7, 0x9B, 0xD6, 0xBC, 0xF6, 0xD7,
+ 0x9C, 0xD6, 0xBC, 0xF6, 0xD7, 0x9E, 0xD6, 0xBC,
+ 0xF6, 0xD7, 0xA0, 0xD6, 0xBC, 0xF6, 0xD7, 0xA1,
+ 0xD6, 0xBC, 0xF6, 0xD7, 0xA3, 0xD6, 0xBC, 0xF6,
+ 0xD7, 0xA4, 0xD6, 0xBC, 0xF6, 0xD7, 0xA6, 0xD6,
+ 0xBC, 0xF6, 0xD7, 0xA7, 0xD6, 0xBC, 0xF6, 0xD7,
+ 0xA8, 0xD6, 0xBC, 0xF6, 0xD7, 0xA9, 0xD6, 0xBC,
+ 0xF6, 0xD7, 0xAA, 0xD6, 0xBC, 0xF6, 0xD7, 0x95,
+ 0xD6, 0xB9, 0xF6, 0xD7, 0x91, 0xD6, 0xBF, 0xF6,
+ 0xD7, 0x9B, 0xD6, 0xBF, 0xF6, 0xD7, 0xA4, 0xD6,
+ 0xBF, 0xD7, 0x90, 0xD7, 0x9C, 0xD9, 0xB1, 0xD9,
+ 0xB1, 0xD9, 0xBB, 0xD9, 0xBB, 0xD9, 0xBB, 0xD9,
+ 0xBB, 0xD9, 0xBE, 0xD9, 0xBE, 0xD9, 0xBE, 0xD9,
+ 0xBE, 0xDA, 0x80, 0xDA, 0x80, 0xDA, 0x80, 0xDA,
+ 0x80, 0xD9, 0xBA, 0xD9, 0xBA, 0xD9, 0xBA, 0xD9,
+ 0xBA, 0xD9, 0xBF, 0xD9, 0xBF, 0xD9, 0xBF, 0xD9,
+ 0xBF, 0xD9, 0xB9, 0xD9, 0xB9, 0xD9, 0xB9, 0xD9,
+ 0xB9, 0xDA, 0xA4, 0xDA, 0xA4, 0xDA, 0xA4, 0xDA,
+ 0xA4, 0xDA, 0xA6, 0xDA, 0xA6, 0xDA, 0xA6, 0xDA,
+ 0xA6, 0xDA, 0x84, 0xDA, 0x84, 0xDA, 0x84, 0xDA,
+ 0x84, 0xDA, 0x83, 0xDA, 0x83, 0xDA, 0x83, 0xDA,
+ 0x83, 0xDA, 0x86, 0xDA, 0x86, 0xDA, 0x86, 0xDA,
+ 0x86, 0xDA, 0x87, 0xDA, 0x87, 0xDA, 0x87, 0xDA,
+ 0x87, 0xDA, 0x8D, 0xDA, 0x8D, 0xDA, 0x8C, 0xDA,
+ 0x8C, 0xDA, 0x8E, 0xDA, 0x8E, 0xDA, 0x88, 0xDA,
+ 0x88, 0xDA, 0x98, 0xDA, 0x98, 0xDA, 0x91, 0xDA,
+ 0x91, 0xDA, 0xA9, 0xDA, 0xA9, 0xDA, 0xA9, 0xDA,
+ 0xA9, 0xDA, 0xAF, 0xDA, 0xAF, 0xDA, 0xAF, 0xDA,
+ 0xAF, 0xDA, 0xB3, 0xDA, 0xB3, 0xDA, 0xB3, 0xDA,
+ 0xB3, 0xDA, 0xB1, 0xDA, 0xB1, 0xDA, 0xB1, 0xDA,
+ 0xB1, 0xDA, 0xBA, 0xDA, 0xBA, 0xDA, 0xBB, 0xDA,
+ 0xBB, 0xDA, 0xBB, 0xDA, 0xBB, 0xDB, 0x95, 0xD9,
+ 0x94, 0xDB, 0x95, 0xD9, 0x94, 0xDB, 0x81, 0xDB,
+ 0x81, 0xDB, 0x81, 0xDB, 0x81, 0xDA, 0xBE, 0xDA,
+ 0xBE, 0xDA, 0xBE, 0xDA, 0xBE, 0xDB, 0x92, 0xDB,
+ 0x92, 0xDB, 0x92, 0xD9, 0x94, 0xDB, 0x92, 0xD9,
+ 0x94, 0xDA, 0xAD, 0xDA, 0xAD, 0xDA, 0xAD, 0xDA,
+ 0xAD, 0xDB, 0x87, 0xDB, 0x87, 0xDB, 0x86, 0xDB,
+ 0x86, 0xDB, 0x88, 0xDB, 0x88, 0xDB, 0x87, 0xD9,
+ 0xB4, 0xDB, 0x8B, 0xDB, 0x8B, 0xDB, 0x85, 0xDB,
+ 0x85, 0xDB, 0x89, 0xDB, 0x89, 0xDB, 0x90, 0xDB,
+ 0x90, 0xDB, 0x90, 0xDB, 0x90, 0xD9, 0x89, 0xD9,
+ 0x89, 0xD9, 0x8A, 0xD9, 0x94, 0xD8, 0xA7, 0xD9,
+ 0x8A, 0xD9, 0x94, 0xD8, 0xA7, 0xD9, 0x8A, 0xD9,
+ 0x94, 0xDB, 0x95, 0xD9, 0x8A, 0xD9, 0x94, 0xDB,
+ 0x95, 0xD9, 0x8A, 0xD9, 0x94, 0xD9, 0x88, 0xD9,
+ 0x8A, 0xD9, 0x94, 0xD9, 0x88, 0xD9, 0x8A, 0xD9,
+ 0x94, 0xDB, 0x87, 0xD9, 0x8A, 0xD9, 0x94, 0xDB,
+ 0x87, 0xD9, 0x8A, 0xD9, 0x94, 0xDB, 0x86, 0xD9,
+ 0x8A, 0xD9, 0x94, 0xDB, 0x86, 0xD9, 0x8A, 0xD9,
+ 0x94, 0xDB, 0x88, 0xD9, 0x8A, 0xD9, 0x94, 0xDB,
+ 0x88, 0xD9, 0x8A, 0xD9, 0x94, 0xDB, 0x90, 0xD9,
+ 0x8A, 0xD9, 0x94, 0xDB, 0x90, 0xD9, 0x8A, 0xD9,
+ 0x94, 0xDB, 0x90, 0xD9, 0x8A, 0xD9, 0x94, 0xD9,
+ 0x89, 0xD9, 0x8A, 0xD9, 0x94, 0xD9, 0x89, 0xD9,
+ 0x8A, 0xD9, 0x94, 0xD9, 0x89, 0xDB, 0x8C, 0xDB,
+ 0x8C, 0xDB, 0x8C, 0xDB, 0x8C, 0xD9, 0x8A, 0xD9,
+ 0x94, 0xD8, 0xAC, 0xD9, 0x8A, 0xD9, 0x94, 0xD8,
+ 0xAD, 0xD9, 0x8A, 0xD9, 0x94, 0xD9, 0x85, 0xD9,
+ 0x8A, 0xD9, 0x94, 0xD9, 0x89, 0xD9, 0x8A, 0xD9,
+ 0x94, 0xD9, 0x8A, 0xD8, 0xA8, 0xD8, 0xAC, 0xD8,
+ 0xA8, 0xD8, 0xAD, 0xD8, 0xA8, 0xD8, 0xAE, 0xD8,
+ 0xA8, 0xD9, 0x85, 0xD8, 0xA8, 0xD9, 0x89, 0xD8,
+ 0xA8, 0xD9, 0x8A, 0xD8, 0xAA, 0xD8, 0xAC, 0xD8,
+ 0xAA, 0xD8, 0xAD, 0xD8, 0xAA, 0xD8, 0xAE, 0xD8,
+ 0xAA, 0xD9, 0x85, 0xD8, 0xAA, 0xD9, 0x89, 0xD8,
+ 0xAA, 0xD9, 0x8A, 0xD8, 0xAB, 0xD8, 0xAC, 0xD8,
+ 0xAB, 0xD9, 0x85, 0xD8, 0xAB, 0xD9, 0x89, 0xD8,
+ 0xAB, 0xD9, 0x8A, 0xD8, 0xAC, 0xD8, 0xAD, 0xD8,
+ 0xAC, 0xD9, 0x85, 0xD8, 0xAD, 0xD8, 0xAC, 0xD8,
+ 0xAD, 0xD9, 0x85, 0xD8, 0xAE, 0xD8, 0xAC, 0xD8,
+ 0xAE, 0xD8, 0xAD, 0xD8, 0xAE, 0xD9, 0x85, 0xD8,
+ 0xB3, 0xD8, 0xAC, 0xD8, 0xB3, 0xD8, 0xAD, 0xD8,
+ 0xB3, 0xD8, 0xAE, 0xD8, 0xB3, 0xD9, 0x85, 0xD8,
+ 0xB5, 0xD8, 0xAD, 0xD8, 0xB5, 0xD9, 0x85, 0xD8,
+ 0xB6, 0xD8, 0xAC, 0xD8, 0xB6, 0xD8, 0xAD, 0xD8,
+ 0xB6, 0xD8, 0xAE, 0xD8, 0xB6, 0xD9, 0x85, 0xD8,
+ 0xB7, 0xD8, 0xAD, 0xD8, 0xB7, 0xD9, 0x85, 0xD8,
+ 0xB8, 0xD9, 0x85, 0xD8, 0xB9, 0xD8, 0xAC, 0xD8,
+ 0xB9, 0xD9, 0x85, 0xD8, 0xBA, 0xD8, 0xAC, 0xD8,
+ 0xBA, 0xD9, 0x85, 0xD9, 0x81, 0xD8, 0xAC, 0xD9,
+ 0x81, 0xD8, 0xAD, 0xD9, 0x81, 0xD8, 0xAE, 0xD9,
+ 0x81, 0xD9, 0x85, 0xD9, 0x81, 0xD9, 0x89, 0xD9,
+ 0x81, 0xD9, 0x8A, 0xD9, 0x82, 0xD8, 0xAD, 0xD9,
+ 0x82, 0xD9, 0x85, 0xD9, 0x82, 0xD9, 0x89, 0xD9,
+ 0x82, 0xD9, 0x8A, 0xD9, 0x83, 0xD8, 0xA7, 0xD9,
+ 0x83, 0xD8, 0xAC, 0xD9, 0x83, 0xD8, 0xAD, 0xD9,
+ 0x83, 0xD8, 0xAE, 0xD9, 0x83, 0xD9, 0x84, 0xD9,
+ 0x83, 0xD9, 0x85, 0xD9, 0x83, 0xD9, 0x89, 0xD9,
+ 0x83, 0xD9, 0x8A, 0xD9, 0x84, 0xD8, 0xAC, 0xD9,
+ 0x84, 0xD8, 0xAD, 0xD9, 0x84, 0xD8, 0xAE, 0xD9,
+ 0x84, 0xD9, 0x85, 0xD9, 0x84, 0xD9, 0x89, 0xD9,
+ 0x84, 0xD9, 0x8A, 0xD9, 0x85, 0xD8, 0xAC, 0xD9,
+ 0x85, 0xD8, 0xAD, 0xD9, 0x85, 0xD8, 0xAE, 0xD9,
+ 0x85, 0xD9, 0x85, 0xD9, 0x85, 0xD9, 0x89, 0xD9,
+ 0x85, 0xD9, 0x8A, 0xD9, 0x86, 0xD8, 0xAC, 0xD9,
+ 0x86, 0xD8, 0xAD, 0xD9, 0x86, 0xD8, 0xAE, 0xD9,
+ 0x86, 0xD9, 0x85, 0xD9, 0x86, 0xD9, 0x89, 0xD9,
+ 0x86, 0xD9, 0x8A, 0xD9, 0x87, 0xD8, 0xAC, 0xD9,
+ 0x87, 0xD9, 0x85, 0xD9, 0x87, 0xD9, 0x89, 0xD9,
+ 0x87, 0xD9, 0x8A, 0xD9, 0x8A, 0xD8, 0xAC, 0xD9,
+ 0x8A, 0xD8, 0xAD, 0xD9, 0x8A, 0xD8, 0xAE, 0xD9,
+ 0x8A, 0xD9, 0x85, 0xD9, 0x8A, 0xD9, 0x89, 0xD9,
+ 0x8A, 0xD9, 0x8A, 0xD8, 0xB0, 0xD9, 0xB0, 0xD8,
+ 0xB1, 0xD9, 0xB0, 0xD9, 0x89, 0xD9, 0xB0, 0x20,
+ 0xD9, 0x8C, 0xD9, 0x91, 0x20, 0xD9, 0x8D, 0xD9,
+ 0x91, 0x20, 0xD9, 0x8E, 0xD9, 0x91, 0x20, 0xD9,
+ 0x8F, 0xD9, 0x91, 0x20, 0xD9, 0x90, 0xD9, 0x91,
+ 0x20, 0xD9, 0x91, 0xD9, 0xB0, 0xD9, 0x8A, 0xD9,
+ 0x94, 0xD8, 0xB1, 0xD9, 0x8A, 0xD9, 0x94, 0xD8,
+ 0xB2, 0xD9, 0x8A, 0xD9, 0x94, 0xD9, 0x85, 0xD9,
+ 0x8A, 0xD9, 0x94, 0xD9, 0x86, 0xD9, 0x8A, 0xD9,
+ 0x94, 0xD9, 0x89, 0xD9, 0x8A, 0xD9, 0x94, 0xD9,
+ 0x8A, 0xD8, 0xA8, 0xD8, 0xB1, 0xD8, 0xA8, 0xD8,
+ 0xB2, 0xD8, 0xA8, 0xD9, 0x85, 0xD8, 0xA8, 0xD9,
+ 0x86, 0xD8, 0xA8, 0xD9, 0x89, 0xD8, 0xA8, 0xD9,
+ 0x8A, 0xD8, 0xAA, 0xD8, 0xB1, 0xD8, 0xAA, 0xD8,
+ 0xB2, 0xD8, 0xAA, 0xD9, 0x85, 0xD8, 0xAA, 0xD9,
+ 0x86, 0xD8, 0xAA, 0xD9, 0x89, 0xD8, 0xAA, 0xD9,
+ 0x8A, 0xD8, 0xAB, 0xD8, 0xB1, 0xD8, 0xAB, 0xD8,
+ 0xB2, 0xD8, 0xAB, 0xD9, 0x85, 0xD8, 0xAB, 0xD9,
+ 0x86, 0xD8, 0xAB, 0xD9, 0x89, 0xD8, 0xAB, 0xD9,
+ 0x8A, 0xD9, 0x81, 0xD9, 0x89, 0xD9, 0x81, 0xD9,
+ 0x8A, 0xD9, 0x82, 0xD9, 0x89, 0xD9, 0x82, 0xD9,
+ 0x8A, 0xD9, 0x83, 0xD8, 0xA7, 0xD9, 0x83, 0xD9,
+ 0x84, 0xD9, 0x83, 0xD9, 0x85, 0xD9, 0x83, 0xD9,
+ 0x89, 0xD9, 0x83, 0xD9, 0x8A, 0xD9, 0x84, 0xD9,
+ 0x85, 0xD9, 0x84, 0xD9, 0x89, 0xD9, 0x84, 0xD9,
+ 0x8A, 0xD9, 0x85, 0xD8, 0xA7, 0xD9, 0x85, 0xD9,
+ 0x85, 0xD9, 0x86, 0xD8, 0xB1, 0xD9, 0x86, 0xD8,
+ 0xB2, 0xD9, 0x86, 0xD9, 0x85, 0xD9, 0x86, 0xD9,
+ 0x86, 0xD9, 0x86, 0xD9, 0x89, 0xD9, 0x86, 0xD9,
+ 0x8A, 0xD9, 0x89, 0xD9, 0xB0, 0xD9, 0x8A, 0xD8,
+ 0xB1, 0xD9, 0x8A, 0xD8, 0xB2, 0xD9, 0x8A, 0xD9,
+ 0x85, 0xD9, 0x8A, 0xD9, 0x86, 0xD9, 0x8A, 0xD9,
+ 0x89, 0xD9, 0x8A, 0xD9, 0x8A, 0xD9, 0x8A, 0xD9,
+ 0x94, 0xD8, 0xAC, 0xD9, 0x8A, 0xD9, 0x94, 0xD8,
+ 0xAD, 0xD9, 0x8A, 0xD9, 0x94, 0xD8, 0xAE, 0xD9,
+ 0x8A, 0xD9, 0x94, 0xD9, 0x85, 0xD9, 0x8A, 0xD9,
+ 0x94, 0xD9, 0x87, 0xD8, 0xA8, 0xD8, 0xAC, 0xD8,
+ 0xA8, 0xD8, 0xAD, 0xD8, 0xA8, 0xD8, 0xAE, 0xD8,
+ 0xA8, 0xD9, 0x85, 0xD8, 0xA8, 0xD9, 0x87, 0xD8,
+ 0xAA, 0xD8, 0xAC, 0xD8, 0xAA, 0xD8, 0xAD, 0xD8,
+ 0xAA, 0xD8, 0xAE, 0xD8, 0xAA, 0xD9, 0x85, 0xD8,
+ 0xAA, 0xD9, 0x87, 0xD8, 0xAB, 0xD9, 0x85, 0xD8,
+ 0xAC, 0xD8, 0xAD, 0xD8, 0xAC, 0xD9, 0x85, 0xD8,
+ 0xAD, 0xD8, 0xAC, 0xD8, 0xAD, 0xD9, 0x85, 0xD8,
+ 0xAE, 0xD8, 0xAC, 0xD8, 0xAE, 0xD9, 0x85, 0xD8,
+ 0xB3, 0xD8, 0xAC, 0xD8, 0xB3, 0xD8, 0xAD, 0xD8,
+ 0xB3, 0xD8, 0xAE, 0xD8, 0xB3, 0xD9, 0x85, 0xD8,
+ 0xB5, 0xD8, 0xAD, 0xD8, 0xB5, 0xD8, 0xAE, 0xD8,
+ 0xB5, 0xD9, 0x85, 0xD8, 0xB6, 0xD8, 0xAC, 0xD8,
+ 0xB6, 0xD8, 0xAD, 0xD8, 0xB6, 0xD8, 0xAE, 0xD8,
+ 0xB6, 0xD9, 0x85, 0xD8, 0xB7, 0xD8, 0xAD, 0xD8,
+ 0xB8, 0xD9, 0x85, 0xD8, 0xB9, 0xD8, 0xAC, 0xD8,
+ 0xB9, 0xD9, 0x85, 0xD8, 0xBA, 0xD8, 0xAC, 0xD8,
+ 0xBA, 0xD9, 0x85, 0xD9, 0x81, 0xD8, 0xAC, 0xD9,
+ 0x81, 0xD8, 0xAD, 0xD9, 0x81, 0xD8, 0xAE, 0xD9,
+ 0x81, 0xD9, 0x85, 0xD9, 0x82, 0xD8, 0xAD, 0xD9,
+ 0x82, 0xD9, 0x85, 0xD9, 0x83, 0xD8, 0xAC, 0xD9,
+ 0x83, 0xD8, 0xAD, 0xD9, 0x83, 0xD8, 0xAE, 0xD9,
+ 0x83, 0xD9, 0x84, 0xD9, 0x83, 0xD9, 0x85, 0xD9,
+ 0x84, 0xD8, 0xAC, 0xD9, 0x84, 0xD8, 0xAD, 0xD9,
+ 0x84, 0xD8, 0xAE, 0xD9, 0x84, 0xD9, 0x85, 0xD9,
+ 0x84, 0xD9, 0x87, 0xD9, 0x85, 0xD8, 0xAC, 0xD9,
+ 0x85, 0xD8, 0xAD, 0xD9, 0x85, 0xD8, 0xAE, 0xD9,
+ 0x85, 0xD9, 0x85, 0xD9, 0x86, 0xD8, 0xAC, 0xD9,
+ 0x86, 0xD8, 0xAD, 0xD9, 0x86, 0xD8, 0xAE, 0xD9,
+ 0x86, 0xD9, 0x85, 0xD9, 0x86, 0xD9, 0x87, 0xD9,
+ 0x87, 0xD8, 0xAC, 0xD9, 0x87, 0xD9, 0x85, 0xD9,
+ 0x87, 0xD9, 0xB0, 0xD9, 0x8A, 0xD8, 0xAC, 0xD9,
+ 0x8A, 0xD8, 0xAD, 0xD9, 0x8A, 0xD8, 0xAE, 0xD9,
+ 0x8A, 0xD9, 0x85, 0xD9, 0x8A, 0xD9, 0x87, 0xD9,
+ 0x8A, 0xD9, 0x94, 0xD9, 0x85, 0xD9, 0x8A, 0xD9,
+ 0x94, 0xD9, 0x87, 0xD8, 0xA8, 0xD9, 0x85, 0xD8,
+ 0xA8, 0xD9, 0x87, 0xD8, 0xAA, 0xD9, 0x85, 0xD8,
+ 0xAA, 0xD9, 0x87, 0xD8, 0xAB, 0xD9, 0x85, 0xD8,
+ 0xAB, 0xD9, 0x87, 0xD8, 0xB3, 0xD9, 0x85, 0xD8,
+ 0xB3, 0xD9, 0x87, 0xD8, 0xB4, 0xD9, 0x85, 0xD8,
+ 0xB4, 0xD9, 0x87, 0xD9, 0x83, 0xD9, 0x84, 0xD9,
+ 0x83, 0xD9, 0x85, 0xD9, 0x84, 0xD9, 0x85, 0xD9,
+ 0x86, 0xD9, 0x85, 0xD9, 0x86, 0xD9, 0x87, 0xD9,
+ 0x8A, 0xD9, 0x85, 0xD9, 0x8A, 0xD9, 0x87, 0xD9,
+ 0x80, 0xD9, 0x8E, 0xD9, 0x91, 0xD9, 0x80, 0xD9,
+ 0x8F, 0xD9, 0x91, 0xD9, 0x80, 0xD9, 0x90, 0xD9,
+ 0x91, 0xD8, 0xB7, 0xD9, 0x89, 0xD8, 0xB7, 0xD9,
+ 0x8A, 0xD8, 0xB9, 0xD9, 0x89, 0xD8, 0xB9, 0xD9,
+ 0x8A, 0xD8, 0xBA, 0xD9, 0x89, 0xD8, 0xBA, 0xD9,
+ 0x8A, 0xD8, 0xB3, 0xD9, 0x89, 0xD8, 0xB3, 0xD9,
+ 0x8A, 0xD8, 0xB4, 0xD9, 0x89, 0xD8, 0xB4, 0xD9,
+ 0x8A, 0xD8, 0xAD, 0xD9, 0x89, 0xD8, 0xAD, 0xD9,
+ 0x8A, 0xD8, 0xAC, 0xD9, 0x89, 0xD8, 0xAC, 0xD9,
+ 0x8A, 0xD8, 0xAE, 0xD9, 0x89, 0xD8, 0xAE, 0xD9,
+ 0x8A, 0xD8, 0xB5, 0xD9, 0x89, 0xD8, 0xB5, 0xD9,
+ 0x8A, 0xD8, 0xB6, 0xD9, 0x89, 0xD8, 0xB6, 0xD9,
+ 0x8A, 0xD8, 0xB4, 0xD8, 0xAC, 0xD8, 0xB4, 0xD8,
+ 0xAD, 0xD8, 0xB4, 0xD8, 0xAE, 0xD8, 0xB4, 0xD9,
+ 0x85, 0xD8, 0xB4, 0xD8, 0xB1, 0xD8, 0xB3, 0xD8,
+ 0xB1, 0xD8, 0xB5, 0xD8, 0xB1, 0xD8, 0xB6, 0xD8,
+ 0xB1, 0xD8, 0xB7, 0xD9, 0x89, 0xD8, 0xB7, 0xD9,
+ 0x8A, 0xD8, 0xB9, 0xD9, 0x89, 0xD8, 0xB9, 0xD9,
+ 0x8A, 0xD8, 0xBA, 0xD9, 0x89, 0xD8, 0xBA, 0xD9,
+ 0x8A, 0xD8, 0xB3, 0xD9, 0x89, 0xD8, 0xB3, 0xD9,
+ 0x8A, 0xD8, 0xB4, 0xD9, 0x89, 0xD8, 0xB4, 0xD9,
+ 0x8A, 0xD8, 0xAD, 0xD9, 0x89, 0xD8, 0xAD, 0xD9,
+ 0x8A, 0xD8, 0xAC, 0xD9, 0x89, 0xD8, 0xAC, 0xD9,
+ 0x8A, 0xD8, 0xAE, 0xD9, 0x89, 0xD8, 0xAE, 0xD9,
+ 0x8A, 0xD8, 0xB5, 0xD9, 0x89, 0xD8, 0xB5, 0xD9,
+ 0x8A, 0xD8, 0xB6, 0xD9, 0x89, 0xD8, 0xB6, 0xD9,
+ 0x8A, 0xD8, 0xB4, 0xD8, 0xAC, 0xD8, 0xB4, 0xD8,
+ 0xAD, 0xD8, 0xB4, 0xD8, 0xAE, 0xD8, 0xB4, 0xD9,
+ 0x85, 0xD8, 0xB4, 0xD8, 0xB1, 0xD8, 0xB3, 0xD8,
+ 0xB1, 0xD8, 0xB5, 0xD8, 0xB1, 0xD8, 0xB6, 0xD8,
+ 0xB1, 0xD8, 0xB4, 0xD8, 0xAC, 0xD8, 0xB4, 0xD8,
+ 0xAD, 0xD8, 0xB4, 0xD8, 0xAE, 0xD8, 0xB4, 0xD9,
+ 0x85, 0xD8, 0xB3, 0xD9, 0x87, 0xD8, 0xB4, 0xD9,
+ 0x87, 0xD8, 0xB7, 0xD9, 0x85, 0xD8, 0xB3, 0xD8,
+ 0xAC, 0xD8, 0xB3, 0xD8, 0xAD, 0xD8, 0xB3, 0xD8,
+ 0xAE, 0xD8, 0xB4, 0xD8, 0xAC, 0xD8, 0xB4, 0xD8,
+ 0xAD, 0xD8, 0xB4, 0xD8, 0xAE, 0xD8, 0xB7, 0xD9,
+ 0x85, 0xD8, 0xB8, 0xD9, 0x85, 0xD8, 0xA7, 0xD9,
+ 0x8B, 0xD8, 0xA7, 0xD9, 0x8B, 0xD8, 0xAA, 0xD8,
+ 0xAC, 0xD9, 0x85, 0xD8, 0xAA, 0xD8, 0xAD, 0xD8,
+ 0xAC, 0xD8, 0xAA, 0xD8, 0xAD, 0xD8, 0xAC, 0xD8,
+ 0xAA, 0xD8, 0xAD, 0xD9, 0x85, 0xD8, 0xAA, 0xD8,
+ 0xAE, 0xD9, 0x85, 0xD8, 0xAA, 0xD9, 0x85, 0xD8,
+ 0xAC, 0xD8, 0xAA, 0xD9, 0x85, 0xD8, 0xAD, 0xD8,
+ 0xAA, 0xD9, 0x85, 0xD8, 0xAE, 0xD8, 0xAC, 0xD9,
+ 0x85, 0xD8, 0xAD, 0xD8, 0xAC, 0xD9, 0x85, 0xD8,
+ 0xAD, 0xD8, 0xAD, 0xD9, 0x85, 0xD9, 0x8A, 0xD8,
+ 0xAD, 0xD9, 0x85, 0xD9, 0x89, 0xD8, 0xB3, 0xD8,
+ 0xAD, 0xD8, 0xAC, 0xD8, 0xB3, 0xD8, 0xAC, 0xD8,
+ 0xAD, 0xD8, 0xB3, 0xD8, 0xAC, 0xD9, 0x89, 0xD8,
+ 0xB3, 0xD9, 0x85, 0xD8, 0xAD, 0xD8, 0xB3, 0xD9,
+ 0x85, 0xD8, 0xAD, 0xD8, 0xB3, 0xD9, 0x85, 0xD8,
+ 0xAC, 0xD8, 0xB3, 0xD9, 0x85, 0xD9, 0x85, 0xD8,
+ 0xB3, 0xD9, 0x85, 0xD9, 0x85, 0xD8, 0xB5, 0xD8,
+ 0xAD, 0xD8, 0xAD, 0xD8, 0xB5, 0xD8, 0xAD, 0xD8,
+ 0xAD, 0xD8, 0xB5, 0xD9, 0x85, 0xD9, 0x85, 0xD8,
+ 0xB4, 0xD8, 0xAD, 0xD9, 0x85, 0xD8, 0xB4, 0xD8,
+ 0xAD, 0xD9, 0x85, 0xD8, 0xB4, 0xD8, 0xAC, 0xD9,
+ 0x8A, 0xD8, 0xB4, 0xD9, 0x85, 0xD8, 0xAE, 0xD8,
+ 0xB4, 0xD9, 0x85, 0xD8, 0xAE, 0xD8, 0xB4, 0xD9,
+ 0x85, 0xD9, 0x85, 0xD8, 0xB4, 0xD9, 0x85, 0xD9,
+ 0x85, 0xD8, 0xB6, 0xD8, 0xAD, 0xD9, 0x89, 0xD8,
+ 0xB6, 0xD8, 0xAE, 0xD9, 0x85, 0xD8, 0xB6, 0xD8,
+ 0xAE, 0xD9, 0x85, 0xD8, 0xB7, 0xD9, 0x85, 0xD8,
+ 0xAD, 0xD8, 0xB7, 0xD9, 0x85, 0xD8, 0xAD, 0xD8,
+ 0xB7, 0xD9, 0x85, 0xD9, 0x85, 0xD8, 0xB7, 0xD9,
+ 0x85, 0xD9, 0x8A, 0xD8, 0xB9, 0xD8, 0xAC, 0xD9,
+ 0x85, 0xD8, 0xB9, 0xD9, 0x85, 0xD9, 0x85, 0xD8,
+ 0xB9, 0xD9, 0x85, 0xD9, 0x85, 0xD8, 0xB9, 0xD9,
+ 0x85, 0xD9, 0x89, 0xD8, 0xBA, 0xD9, 0x85, 0xD9,
+ 0x85, 0xD8, 0xBA, 0xD9, 0x85, 0xD9, 0x8A, 0xD8,
+ 0xBA, 0xD9, 0x85, 0xD9, 0x89, 0xD9, 0x81, 0xD8,
+ 0xAE, 0xD9, 0x85, 0xD9, 0x81, 0xD8, 0xAE, 0xD9,
+ 0x85, 0xD9, 0x82, 0xD9, 0x85, 0xD8, 0xAD, 0xD9,
+ 0x82, 0xD9, 0x85, 0xD9, 0x85, 0xD9, 0x84, 0xD8,
+ 0xAD, 0xD9, 0x85, 0xD9, 0x84, 0xD8, 0xAD, 0xD9,
+ 0x8A, 0xD9, 0x84, 0xD8, 0xAD, 0xD9, 0x89, 0xD9,
+ 0x84, 0xD8, 0xAC, 0xD8, 0xAC, 0xD9, 0x84, 0xD8,
+ 0xAC, 0xD8, 0xAC, 0xD9, 0x84, 0xD8, 0xAE, 0xD9,
+ 0x85, 0xD9, 0x84, 0xD8, 0xAE, 0xD9, 0x85, 0xD9,
+ 0x84, 0xD9, 0x85, 0xD8, 0xAD, 0xD9, 0x84, 0xD9,
+ 0x85, 0xD8, 0xAD, 0xD9, 0x85, 0xD8, 0xAD, 0xD8,
+ 0xAC, 0xD9, 0x85, 0xD8, 0xAD, 0xD9, 0x85, 0xD9,
+ 0x85, 0xD8, 0xAD, 0xD9, 0x8A, 0xD9, 0x85, 0xD8,
+ 0xAC, 0xD8, 0xAD, 0xD9, 0x85, 0xD8, 0xAC, 0xD9,
+ 0x85, 0xD9, 0x85, 0xD8, 0xAE, 0xD8, 0xAC, 0xD9,
+ 0x85, 0xD8, 0xAE, 0xD9, 0x85, 0xD9, 0x85, 0xD8,
+ 0xAC, 0xD8, 0xAE, 0xD9, 0x87, 0xD9, 0x85, 0xD8,
+ 0xAC, 0xD9, 0x87, 0xD9, 0x85, 0xD9, 0x85, 0xD9,
+ 0x86, 0xD8, 0xAD, 0xD9, 0x85, 0xD9, 0x86, 0xD8,
+ 0xAD, 0xD9, 0x89, 0xD9, 0x86, 0xD8, 0xAC, 0xD9,
+ 0x85, 0xD9, 0x86, 0xD8, 0xAC, 0xD9, 0x85, 0xD9,
+ 0x86, 0xD8, 0xAC, 0xD9, 0x89, 0xD9, 0x86, 0xD9,
+ 0x85, 0xD9, 0x8A, 0xD9, 0x86, 0xD9, 0x85, 0xD9,
+ 0x89, 0xD9, 0x8A, 0xD9, 0x85, 0xD9, 0x85, 0xD9,
+ 0x8A, 0xD9, 0x85, 0xD9, 0x85, 0xD8, 0xA8, 0xD8,
+ 0xAE, 0xD9, 0x8A, 0xD8, 0xAA, 0xD8, 0xAC, 0xD9,
+ 0x8A, 0xD8, 0xAA, 0xD8, 0xAC, 0xD9, 0x89, 0xD8,
+ 0xAA, 0xD8, 0xAE, 0xD9, 0x8A, 0xD8, 0xAA, 0xD8,
+ 0xAE, 0xD9, 0x89, 0xD8, 0xAA, 0xD9, 0x85, 0xD9,
+ 0x8A, 0xD8, 0xAA, 0xD9, 0x85, 0xD9, 0x89, 0xD8,
+ 0xAC, 0xD9, 0x85, 0xD9, 0x8A, 0xD8, 0xAC, 0xD8,
+ 0xAD, 0xD9, 0x89, 0xD8, 0xAC, 0xD9, 0x85, 0xD9,
+ 0x89, 0xD8, 0xB3, 0xD8, 0xAE, 0xD9, 0x89, 0xD8,
+ 0xB5, 0xD8, 0xAD, 0xD9, 0x8A, 0xD8, 0xB4, 0xD8,
+ 0xAD, 0xD9, 0x8A, 0xD8, 0xB6, 0xD8, 0xAD, 0xD9,
+ 0x8A, 0xD9, 0x84, 0xD8, 0xAC, 0xD9, 0x8A, 0xD9,
+ 0x84, 0xD9, 0x85, 0xD9, 0x8A, 0xD9, 0x8A, 0xD8,
+ 0xAD, 0xD9, 0x8A, 0xD9, 0x8A, 0xD8, 0xAC, 0xD9,
+ 0x8A, 0xD9, 0x8A, 0xD9, 0x85, 0xD9, 0x8A, 0xD9,
+ 0x85, 0xD9, 0x85, 0xD9, 0x8A, 0xD9, 0x82, 0xD9,
+ 0x85, 0xD9, 0x8A, 0xD9, 0x86, 0xD8, 0xAD, 0xD9,
+ 0x8A, 0xD9, 0x82, 0xD9, 0x85, 0xD8, 0xAD, 0xD9,
+ 0x84, 0xD8, 0xAD, 0xD9, 0x85, 0xD8, 0xB9, 0xD9,
+ 0x85, 0xD9, 0x8A, 0xD9, 0x83, 0xD9, 0x85, 0xD9,
+ 0x8A, 0xD9, 0x86, 0xD8, 0xAC, 0xD8, 0xAD, 0xD9,
+ 0x85, 0xD8, 0xAE, 0xD9, 0x8A, 0xD9, 0x84, 0xD8,
+ 0xAC, 0xD9, 0x85, 0xD9, 0x83, 0xD9, 0x85, 0xD9,
+ 0x85, 0xD9, 0x84, 0xD8, 0xAC, 0xD9, 0x85, 0xD9,
+ 0x86, 0xD8, 0xAC, 0xD8, 0xAD, 0xD8, 0xAC, 0xD8,
+ 0xAD, 0xD9, 0x8A, 0xD8, 0xAD, 0xD8, 0xAC, 0xD9,
+ 0x8A, 0xD9, 0x85, 0xD8, 0xAC, 0xD9, 0x8A, 0xD9,
+ 0x81, 0xD9, 0x85, 0xD9, 0x8A, 0xD8, 0xA8, 0xD8,
+ 0xAD, 0xD9, 0x8A, 0xD9, 0x83, 0xD9, 0x85, 0xD9,
+ 0x85, 0xD8, 0xB9, 0xD8, 0xAC, 0xD9, 0x85, 0xD8,
+ 0xB5, 0xD9, 0x85, 0xD9, 0x85, 0xD8, 0xB3, 0xD8,
+ 0xAE, 0xD9, 0x8A, 0xD9, 0x86, 0xD8, 0xAC, 0xD9,
+ 0x8A, 0xD8, 0xB5, 0xD9, 0x84, 0xDB, 0x92, 0xD9,
+ 0x82, 0xD9, 0x84, 0xDB, 0x92, 0xD8, 0xA7, 0xD9,
+ 0x84, 0xD9, 0x84, 0xD9, 0x87, 0xD8, 0xA7, 0xD9,
+ 0x83, 0xD8, 0xA8, 0xD8, 0xB1, 0xD9, 0x85, 0xD8,
+ 0xAD, 0xD9, 0x85, 0xD8, 0xAF, 0xD8, 0xB5, 0xD9,
+ 0x84, 0xD8, 0xB9, 0xD9, 0x85, 0xD8, 0xB1, 0xD8,
+ 0xB3, 0xD9, 0x88, 0xD9, 0x84, 0xD8, 0xB9, 0xD9,
+ 0x84, 0xD9, 0x8A, 0xD9, 0x87, 0xD9, 0x88, 0xD8,
+ 0xB3, 0xD9, 0x84, 0xD9, 0x85, 0xD8, 0xB5, 0xD9,
+ 0x84, 0xD9, 0x89, 0xD8, 0xB5, 0xD9, 0x84, 0xD9,
+ 0x89, 0x20, 0xD8, 0xA7, 0xD9, 0x84, 0xD9, 0x84,
+ 0xD9, 0x87, 0x20, 0xD8, 0xB9, 0xD9, 0x84, 0xD9,
+ 0x8A, 0xD9, 0x87, 0x20, 0xD9, 0x88, 0xD8, 0xB3,
+ 0xD9, 0x84, 0xD9, 0x85, 0xD8, 0xAC, 0xD9, 0x84,
+ 0x20, 0xD8, 0xAC, 0xD9, 0x84, 0xD8, 0xA7, 0xD9,
+ 0x84, 0xD9, 0x87, 0xD8, 0xB1, 0xDB, 0x8C, 0xD8,
+ 0xA7, 0xD9, 0x84, 0x2C, 0xE3, 0x80, 0x81, 0xE3,
+ 0x80, 0x82, 0x3A, 0x3B, 0x21, 0x3F, 0xE3, 0x80,
+ 0x96, 0xE3, 0x80, 0x97, 0x2E, 0x2E, 0x2E, 0x2E,
+ 0x2E, 0xE2, 0x80, 0x94, 0xE2, 0x80, 0x93, 0x5F,
+ 0x5F, 0x28, 0x29, 0x7B, 0x7D, 0xE3, 0x80, 0x94,
+ 0xE3, 0x80, 0x95, 0xE3, 0x80, 0x90, 0xE3, 0x80,
+ 0x91, 0xE3, 0x80, 0x8A, 0xE3, 0x80, 0x8B, 0xE3,
+ 0x80, 0x88, 0xE3, 0x80, 0x89, 0xE3, 0x80, 0x8C,
+ 0xE3, 0x80, 0x8D, 0xE3, 0x80, 0x8E, 0xE3, 0x80,
+ 0x8F, 0x5B, 0x5D, 0x20, 0xCC, 0x85, 0x20, 0xCC,
+ 0x85, 0x20, 0xCC, 0x85, 0x20, 0xCC, 0x85, 0x5F,
+ 0x5F, 0x5F, 0x2C, 0xE3, 0x80, 0x81, 0x2E, 0x3B,
+ 0x3A, 0x3F, 0x21, 0xE2, 0x80, 0x94, 0x28, 0x29,
+ 0x7B, 0x7D, 0xE3, 0x80, 0x94, 0xE3, 0x80, 0x95,
+ 0x23, 0x26, 0x2A, 0x2B, 0x2D, 0x3C, 0x3E, 0x3D,
+ 0x5C, 0x24, 0x25, 0x40, 0x20, 0xD9, 0x8B, 0xD9,
+ 0x80, 0xD9, 0x8B, 0x20, 0xD9, 0x8C, 0x20, 0xD9,
+ 0x8D, 0x20, 0xD9, 0x8E, 0xD9, 0x80, 0xD9, 0x8E,
+ 0x20, 0xD9, 0x8F, 0xD9, 0x80, 0xD9, 0x8F, 0x20,
+ 0xD9, 0x90, 0xD9, 0x80, 0xD9, 0x90, 0x20, 0xD9,
+ 0x91, 0xD9, 0x80, 0xD9, 0x91, 0x20, 0xD9, 0x92,
+ 0xD9, 0x80, 0xD9, 0x92, 0xD8, 0xA1, 0xD8, 0xA7,
+ 0xD9, 0x93, 0xD8, 0xA7, 0xD9, 0x93, 0xD8, 0xA7,
+ 0xD9, 0x94, 0xD8, 0xA7, 0xD9, 0x94, 0xD9, 0x88,
+ 0xD9, 0x94, 0xD9, 0x88, 0xD9, 0x94, 0xD8, 0xA7,
+ 0xD9, 0x95, 0xD8, 0xA7, 0xD9, 0x95, 0xD9, 0x8A,
+ 0xD9, 0x94, 0xD9, 0x8A, 0xD9, 0x94, 0xD9, 0x8A,
+ 0xD9, 0x94, 0xD9, 0x8A, 0xD9, 0x94, 0xD8, 0xA7,
+ 0xD8, 0xA7, 0xD8, 0xA8, 0xD8, 0xA8, 0xD8, 0xA8,
+ 0xD8, 0xA8, 0xD8, 0xA9, 0xD8, 0xA9, 0xD8, 0xAA,
+ 0xD8, 0xAA, 0xD8, 0xAA, 0xD8, 0xAA, 0xD8, 0xAB,
+ 0xD8, 0xAB, 0xD8, 0xAB, 0xD8, 0xAB, 0xD8, 0xAC,
+ 0xD8, 0xAC, 0xD8, 0xAC, 0xD8, 0xAC, 0xD8, 0xAD,
+ 0xD8, 0xAD, 0xD8, 0xAD, 0xD8, 0xAD, 0xD8, 0xAE,
+ 0xD8, 0xAE, 0xD8, 0xAE, 0xD8, 0xAE, 0xD8, 0xAF,
+ 0xD8, 0xAF, 0xD8, 0xB0, 0xD8, 0xB0, 0xD8, 0xB1,
+ 0xD8, 0xB1, 0xD8, 0xB2, 0xD8, 0xB2, 0xD8, 0xB3,
+ 0xD8, 0xB3, 0xD8, 0xB3, 0xD8, 0xB3, 0xD8, 0xB4,
+ 0xD8, 0xB4, 0xD8, 0xB4, 0xD8, 0xB4, 0xD8, 0xB5,
+ 0xD8, 0xB5, 0xD8, 0xB5, 0xD8, 0xB5, 0xD8, 0xB6,
+ 0xD8, 0xB6, 0xD8, 0xB6, 0xD8, 0xB6, 0xD8, 0xB7,
+ 0xD8, 0xB7, 0xD8, 0xB7, 0xD8, 0xB7, 0xD8, 0xB8,
+ 0xD8, 0xB8, 0xD8, 0xB8, 0xD8, 0xB8, 0xD8, 0xB9,
+ 0xD8, 0xB9, 0xD8, 0xB9, 0xD8, 0xB9, 0xD8, 0xBA,
+ 0xD8, 0xBA, 0xD8, 0xBA, 0xD8, 0xBA, 0xD9, 0x81,
+ 0xD9, 0x81, 0xD9, 0x81, 0xD9, 0x81, 0xD9, 0x82,
+ 0xD9, 0x82, 0xD9, 0x82, 0xD9, 0x82, 0xD9, 0x83,
+ 0xD9, 0x83, 0xD9, 0x83, 0xD9, 0x83, 0xD9, 0x84,
+ 0xD9, 0x84, 0xD9, 0x84, 0xD9, 0x84, 0xD9, 0x85,
+ 0xD9, 0x85, 0xD9, 0x85, 0xD9, 0x85, 0xD9, 0x86,
+ 0xD9, 0x86, 0xD9, 0x86, 0xD9, 0x86, 0xD9, 0x87,
+ 0xD9, 0x87, 0xD9, 0x87, 0xD9, 0x87, 0xD9, 0x88,
+ 0xD9, 0x88, 0xD9, 0x89, 0xD9, 0x89, 0xD9, 0x8A,
+ 0xD9, 0x8A, 0xD9, 0x8A, 0xD9, 0x8A, 0xD9, 0x84,
+ 0xD8, 0xA7, 0xD9, 0x93, 0xD9, 0x84, 0xD8, 0xA7,
+ 0xD9, 0x93, 0xD9, 0x84, 0xD8, 0xA7, 0xD9, 0x94,
+ 0xD9, 0x84, 0xD8, 0xA7, 0xD9, 0x94, 0xD9, 0x84,
+ 0xD8, 0xA7, 0xD9, 0x95, 0xD9, 0x84, 0xD8, 0xA7,
+ 0xD9, 0x95, 0xD9, 0x84, 0xD8, 0xA7, 0xD9, 0x84,
+ 0xD8, 0xA7, 0x21, 0x22, 0x23, 0x24, 0x25, 0x26,
+ 0x27, 0x28, 0x29, 0x2A, 0x2B, 0x2C, 0x2D, 0x2E,
+ 0x2F, 0x30, 0x31, 0x32, 0x33, 0x34, 0x35, 0x36,
+ 0x37, 0x38, 0x39, 0x3A, 0x3B, 0x3C, 0x3D, 0x3E,
+ 0x3F, 0x40, 0x41, 0x42, 0x43, 0x44, 0x45, 0x46,
+ 0x47, 0x48, 0x49, 0x4A, 0x4B, 0x4C, 0x4D, 0x4E,
+ 0x4F, 0x50, 0x51, 0x52, 0x53, 0x54, 0x55, 0x56,
+ 0x57, 0x58, 0x59, 0x5A, 0x5B, 0x5C, 0x5D, 0x5E,
+ 0x5F, 0x60, 0x61, 0x62, 0x63, 0x64, 0x65, 0x66,
+ 0x67, 0x68, 0x69, 0x6A, 0x6B, 0x6C, 0x6D, 0x6E,
+ 0x6F, 0x70, 0x71, 0x72, 0x73, 0x74, 0x75, 0x76,
+ 0x77, 0x78, 0x79, 0x7A, 0x7B, 0x7C, 0x7D, 0x7E,
+ 0xE2, 0xA6, 0x85, 0xE2, 0xA6, 0x86, 0xE3, 0x80,
+ 0x82, 0xE3, 0x80, 0x8C, 0xE3, 0x80, 0x8D, 0xE3,
+ 0x80, 0x81, 0xE3, 0x83, 0xBB, 0xE3, 0x83, 0xB2,
+ 0xE3, 0x82, 0xA1, 0xE3, 0x82, 0xA3, 0xE3, 0x82,
+ 0xA5, 0xE3, 0x82, 0xA7, 0xE3, 0x82, 0xA9, 0xE3,
+ 0x83, 0xA3, 0xE3, 0x83, 0xA5, 0xE3, 0x83, 0xA7,
+ 0xE3, 0x83, 0x83, 0xE3, 0x83, 0xBC, 0xE3, 0x82,
+ 0xA2, 0xE3, 0x82, 0xA4, 0xE3, 0x82, 0xA6, 0xE3,
+ 0x82, 0xA8, 0xE3, 0x82, 0xAA, 0xE3, 0x82, 0xAB,
+ 0xE3, 0x82, 0xAD, 0xE3, 0x82, 0xAF, 0xE3, 0x82,
+ 0xB1, 0xE3, 0x82, 0xB3, 0xE3, 0x82, 0xB5, 0xE3,
+ 0x82, 0xB7, 0xE3, 0x82, 0xB9, 0xE3, 0x82, 0xBB,
+ 0xE3, 0x82, 0xBD, 0xE3, 0x82, 0xBF, 0xE3, 0x83,
+ 0x81, 0xE3, 0x83, 0x84, 0xE3, 0x83, 0x86, 0xE3,
+ 0x83, 0x88, 0xE3, 0x83, 0x8A, 0xE3, 0x83, 0x8B,
+ 0xE3, 0x83, 0x8C, 0xE3, 0x83, 0x8D, 0xE3, 0x83,
+ 0x8E, 0xE3, 0x83, 0x8F, 0xE3, 0x83, 0x92, 0xE3,
+ 0x83, 0x95, 0xE3, 0x83, 0x98, 0xE3, 0x83, 0x9B,
+ 0xE3, 0x83, 0x9E, 0xE3, 0x83, 0x9F, 0xE3, 0x83,
+ 0xA0, 0xE3, 0x83, 0xA1, 0xE3, 0x83, 0xA2, 0xE3,
+ 0x83, 0xA4, 0xE3, 0x83, 0xA6, 0xE3, 0x83, 0xA8,
+ 0xE3, 0x83, 0xA9, 0xE3, 0x83, 0xAA, 0xE3, 0x83,
+ 0xAB, 0xE3, 0x83, 0xAC, 0xE3, 0x83, 0xAD, 0xE3,
+ 0x83, 0xAF, 0xE3, 0x83, 0xB3, 0xE3, 0x82, 0x99,
+ 0xE3, 0x82, 0x9A, 0xE1, 0x85, 0xA0, 0xE1, 0x84,
+ 0x80, 0xE1, 0x84, 0x81, 0xE1, 0x86, 0xAA, 0xE1,
+ 0x84, 0x82, 0xE1, 0x86, 0xAC, 0xE1, 0x86, 0xAD,
+ 0xE1, 0x84, 0x83, 0xE1, 0x84, 0x84, 0xE1, 0x84,
+ 0x85, 0xE1, 0x86, 0xB0, 0xE1, 0x86, 0xB1, 0xE1,
+ 0x86, 0xB2, 0xE1, 0x86, 0xB3, 0xE1, 0x86, 0xB4,
+ 0xE1, 0x86, 0xB5, 0xE1, 0x84, 0x9A, 0xE1, 0x84,
+ 0x86, 0xE1, 0x84, 0x87, 0xE1, 0x84, 0x88, 0xE1,
+ 0x84, 0xA1, 0xE1, 0x84, 0x89, 0xE1, 0x84, 0x8A,
+ 0xE1, 0x84, 0x8B, 0xE1, 0x84, 0x8C, 0xE1, 0x84,
+ 0x8D, 0xE1, 0x84, 0x8E, 0xE1, 0x84, 0x8F, 0xE1,
+ 0x84, 0x90, 0xE1, 0x84, 0x91, 0xE1, 0x84, 0x92,
+ 0xE1, 0x85, 0xA1, 0xE1, 0x85, 0xA2, 0xE1, 0x85,
+ 0xA3, 0xE1, 0x85, 0xA4, 0xE1, 0x85, 0xA5, 0xE1,
+ 0x85, 0xA6, 0xE1, 0x85, 0xA7, 0xE1, 0x85, 0xA8,
+ 0xE1, 0x85, 0xA9, 0xE1, 0x85, 0xAA, 0xE1, 0x85,
+ 0xAB, 0xE1, 0x85, 0xAC, 0xE1, 0x85, 0xAD, 0xE1,
+ 0x85, 0xAE, 0xE1, 0x85, 0xAF, 0xE1, 0x85, 0xB0,
+ 0xE1, 0x85, 0xB1, 0xE1, 0x85, 0xB2, 0xE1, 0x85,
+ 0xB3, 0xE1, 0x85, 0xB4, 0xE1, 0x85, 0xB5, 0xC2,
+ 0xA2, 0xC2, 0xA3, 0xC2, 0xAC, 0x20, 0xCC, 0x84,
+ 0xC2, 0xA6, 0xC2, 0xA5, 0xE2, 0x82, 0xA9, 0xE2,
+ 0x94, 0x82, 0xE2, 0x86, 0x90, 0xE2, 0x86, 0x91,
+ 0xE2, 0x86, 0x92, 0xE2, 0x86, 0x93, 0xE2, 0x96,
+ 0xA0, 0xE2, 0x97, 0x8B, 0xF6, 0xF0, 0x9D, 0x85,
+ 0x97, 0xF0, 0x9D, 0x85, 0xA5, 0xF6, 0xF0, 0x9D,
+ 0x85, 0x98, 0xF0, 0x9D, 0x85, 0xA5, 0xF6, 0xF0,
+ 0x9D, 0x85, 0x98, 0xF0, 0x9D, 0x85, 0xA5, 0xF0,
+ 0x9D, 0x85, 0xAE, 0xF6, 0xF0, 0x9D, 0x85, 0x98,
+ 0xF0, 0x9D, 0x85, 0xA5, 0xF0, 0x9D, 0x85, 0xAF,
+ 0xF6, 0xF0, 0x9D, 0x85, 0x98, 0xF0, 0x9D, 0x85,
+ 0xA5, 0xF0, 0x9D, 0x85, 0xB0, 0xF6, 0xF0, 0x9D,
+ 0x85, 0x98, 0xF0, 0x9D, 0x85, 0xA5, 0xF0, 0x9D,
+ 0x85, 0xB1, 0xF6, 0xF0, 0x9D, 0x85, 0x98, 0xF0,
+ 0x9D, 0x85, 0xA5, 0xF0, 0x9D, 0x85, 0xB2, 0xF6,
+ 0xF0, 0x9D, 0x86, 0xB9, 0xF0, 0x9D, 0x85, 0xA5,
+ 0xF6, 0xF0, 0x9D, 0x86, 0xBA, 0xF0, 0x9D, 0x85,
+ 0xA5, 0xF6, 0xF0, 0x9D, 0x86, 0xB9, 0xF0, 0x9D,
+ 0x85, 0xA5, 0xF0, 0x9D, 0x85, 0xAE, 0xF6, 0xF0,
+ 0x9D, 0x86, 0xBA, 0xF0, 0x9D, 0x85, 0xA5, 0xF0,
+ 0x9D, 0x85, 0xAE, 0xF6, 0xF0, 0x9D, 0x86, 0xB9,
+ 0xF0, 0x9D, 0x85, 0xA5, 0xF0, 0x9D, 0x85, 0xAF,
+ 0xF6, 0xF0, 0x9D, 0x86, 0xBA, 0xF0, 0x9D, 0x85,
+ 0xA5, 0xF0, 0x9D, 0x85, 0xAF, 0x41, 0x42, 0x43,
+ 0x44, 0x45, 0x46, 0x47, 0x48, 0x49, 0x4A, 0x4B,
+ 0x4C, 0x4D, 0x4E, 0x4F, 0x50, 0x51, 0x52, 0x53,
+ 0x54, 0x55, 0x56, 0x57, 0x58, 0x59, 0x5A, 0x61,
+ 0x62, 0x63, 0x64, 0x65, 0x66, 0x67, 0x68, 0x69,
+ 0x6A, 0x6B, 0x6C, 0x6D, 0x6E, 0x6F, 0x70, 0x71,
+ 0x72, 0x73, 0x74, 0x75, 0x76, 0x77, 0x78, 0x79,
+ 0x7A, 0x41, 0x42, 0x43, 0x44, 0x45, 0x46, 0x47,
+ 0x48, 0x49, 0x4A, 0x4B, 0x4C, 0x4D, 0x4E, 0x4F,
+ 0x50, 0x51, 0x52, 0x53, 0x54, 0x55, 0x56, 0x57,
+ 0x58, 0x59, 0x5A, 0x61, 0x62, 0x63, 0x64, 0x65,
+ 0x66, 0x67, 0x69, 0x6A, 0x6B, 0x6C, 0x6D, 0x6E,
+ 0x6F, 0x70, 0x71, 0x72, 0x73, 0x74, 0x75, 0x76,
+ 0x77, 0x78, 0x79, 0x7A, 0x41, 0x42, 0x43, 0x44,
+ 0x45, 0x46, 0x47, 0x48, 0x49, 0x4A, 0x4B, 0x4C,
+ 0x4D, 0x4E, 0x4F, 0x50, 0x51, 0x52, 0x53, 0x54,
+ 0x55, 0x56, 0x57, 0x58, 0x59, 0x5A, 0x61, 0x62,
+ 0x63, 0x64, 0x65, 0x66, 0x67, 0x68, 0x69, 0x6A,
+ 0x6B, 0x6C, 0x6D, 0x6E, 0x6F, 0x70, 0x71, 0x72,
+ 0x73, 0x74, 0x75, 0x76, 0x77, 0x78, 0x79, 0x7A,
+ 0x41, 0x43, 0x44, 0x47, 0x4A, 0x4B, 0x4E, 0x4F,
+ 0x50, 0x51, 0x53, 0x54, 0x55, 0x56, 0x57, 0x58,
+ 0x59, 0x5A, 0x61, 0x62, 0x63, 0x64, 0x66, 0x68,
+ 0x69, 0x6A, 0x6B, 0x6C, 0x6D, 0x6E, 0x70, 0x71,
+ 0x72, 0x73, 0x74, 0x75, 0x76, 0x77, 0x78, 0x79,
+ 0x7A, 0x41, 0x42, 0x43, 0x44, 0x45, 0x46, 0x47,
+ 0x48, 0x49, 0x4A, 0x4B, 0x4C, 0x4D, 0x4E, 0x4F,
+ 0x50, 0x51, 0x52, 0x53, 0x54, 0x55, 0x56, 0x57,
+ 0x58, 0x59, 0x5A, 0x61, 0x62, 0x63, 0x64, 0x65,
+ 0x66, 0x67, 0x68, 0x69, 0x6A, 0x6B, 0x6C, 0x6D,
+ 0x6E, 0x6F, 0x70, 0x71, 0x72, 0x73, 0x74, 0x75,
+ 0x76, 0x77, 0x78, 0x79, 0x7A, 0x41, 0x42, 0x44,
+ 0x45, 0x46, 0x47, 0x4A, 0x4B, 0x4C, 0x4D, 0x4E,
+ 0x4F, 0x50, 0x51, 0x53, 0x54, 0x55, 0x56, 0x57,
+ 0x58, 0x59, 0x61, 0x62, 0x63, 0x64, 0x65, 0x66,
+ 0x67, 0x68, 0x69, 0x6A, 0x6B, 0x6C, 0x6D, 0x6E,
+ 0x6F, 0x70, 0x71, 0x72, 0x73, 0x74, 0x75, 0x76,
+ 0x77, 0x78, 0x79, 0x7A, 0x41, 0x42, 0x44, 0x45,
+ 0x46, 0x47, 0x49, 0x4A, 0x4B, 0x4C, 0x4D, 0x4F,
+ 0x53, 0x54, 0x55, 0x56, 0x57, 0x58, 0x59, 0x61,
+ 0x62, 0x63, 0x64, 0x65, 0x66, 0x67, 0x68, 0x69,
+ 0x6A, 0x6B, 0x6C, 0x6D, 0x6E, 0x6F, 0x70, 0x71,
+ 0x72, 0x73, 0x74, 0x75, 0x76, 0x77, 0x78, 0x79,
+ 0x7A, 0x41, 0x42, 0x43, 0x44, 0x45, 0x46, 0x47,
+ 0x48, 0x49, 0x4A, 0x4B, 0x4C, 0x4D, 0x4E, 0x4F,
+ 0x50, 0x51, 0x52, 0x53, 0x54, 0x55, 0x56, 0x57,
+ 0x58, 0x59, 0x5A, 0x61, 0x62, 0x63, 0x64, 0x65,
+ 0x66, 0x67, 0x68, 0x69, 0x6A, 0x6B, 0x6C, 0x6D,
+ 0x6E, 0x6F, 0x70, 0x71, 0x72, 0x73, 0x74, 0x75,
+ 0x76, 0x77, 0x78, 0x79, 0x7A, 0x41, 0x42, 0x43,
+ 0x44, 0x45, 0x46, 0x47, 0x48, 0x49, 0x4A, 0x4B,
+ 0x4C, 0x4D, 0x4E, 0x4F, 0x50, 0x51, 0x52, 0x53,
+ 0x54, 0x55, 0x56, 0x57, 0x58, 0x59, 0x5A, 0x61,
+ 0x62, 0x63, 0x64, 0x65, 0x66, 0x67, 0x68, 0x69,
+ 0x6A, 0x6B, 0x6C, 0x6D, 0x6E, 0x6F, 0x70, 0x71,
+ 0x72, 0x73, 0x74, 0x75, 0x76, 0x77, 0x78, 0x79,
+ 0x7A, 0x41, 0x42, 0x43, 0x44, 0x45, 0x46, 0x47,
+ 0x48, 0x49, 0x4A, 0x4B, 0x4C, 0x4D, 0x4E, 0x4F,
+ 0x50, 0x51, 0x52, 0x53, 0x54, 0x55, 0x56, 0x57,
+ 0x58, 0x59, 0x5A, 0x61, 0x62, 0x63, 0x64, 0x65,
+ 0x66, 0x67, 0x68, 0x69, 0x6A, 0x6B, 0x6C, 0x6D,
+ 0x6E, 0x6F, 0x70, 0x71, 0x72, 0x73, 0x74, 0x75,
+ 0x76, 0x77, 0x78, 0x79, 0x7A, 0x41, 0x42, 0x43,
+ 0x44, 0x45, 0x46, 0x47, 0x48, 0x49, 0x4A, 0x4B,
+ 0x4C, 0x4D, 0x4E, 0x4F, 0x50, 0x51, 0x52, 0x53,
+ 0x54, 0x55, 0x56, 0x57, 0x58, 0x59, 0x5A, 0x61,
+ 0x62, 0x63, 0x64, 0x65, 0x66, 0x67, 0x68, 0x69,
+ 0x6A, 0x6B, 0x6C, 0x6D, 0x6E, 0x6F, 0x70, 0x71,
+ 0x72, 0x73, 0x74, 0x75, 0x76, 0x77, 0x78, 0x79,
+ 0x7A, 0x41, 0x42, 0x43, 0x44, 0x45, 0x46, 0x47,
+ 0x48, 0x49, 0x4A, 0x4B, 0x4C, 0x4D, 0x4E, 0x4F,
+ 0x50, 0x51, 0x52, 0x53, 0x54, 0x55, 0x56, 0x57,
+ 0x58, 0x59, 0x5A, 0x61, 0x62, 0x63, 0x64, 0x65,
+ 0x66, 0x67, 0x68, 0x69, 0x6A, 0x6B, 0x6C, 0x6D,
+ 0x6E, 0x6F, 0x70, 0x71, 0x72, 0x73, 0x74, 0x75,
+ 0x76, 0x77, 0x78, 0x79, 0x7A, 0x41, 0x42, 0x43,
+ 0x44, 0x45, 0x46, 0x47, 0x48, 0x49, 0x4A, 0x4B,
+ 0x4C, 0x4D, 0x4E, 0x4F, 0x50, 0x51, 0x52, 0x53,
+ 0x54, 0x55, 0x56, 0x57, 0x58, 0x59, 0x5A, 0x61,
+ 0x62, 0x63, 0x64, 0x65, 0x66, 0x67, 0x68, 0x69,
+ 0x6A, 0x6B, 0x6C, 0x6D, 0x6E, 0x6F, 0x70, 0x71,
+ 0x72, 0x73, 0x74, 0x75, 0x76, 0x77, 0x78, 0x79,
+ 0x7A, 0xC4, 0xB1, 0xC8, 0xB7, 0xCE, 0x91, 0xCE,
+ 0x92, 0xCE, 0x93, 0xCE, 0x94, 0xCE, 0x95, 0xCE,
+ 0x96, 0xCE, 0x97, 0xCE, 0x98, 0xCE, 0x99, 0xCE,
+ 0x9A, 0xCE, 0x9B, 0xCE, 0x9C, 0xCE, 0x9D, 0xCE,
+ 0x9E, 0xCE, 0x9F, 0xCE, 0xA0, 0xCE, 0xA1, 0xCE,
+ 0x98, 0xCE, 0xA3, 0xCE, 0xA4, 0xCE, 0xA5, 0xCE,
+ 0xA6, 0xCE, 0xA7, 0xCE, 0xA8, 0xCE, 0xA9, 0xE2,
+ 0x88, 0x87, 0xCE, 0xB1, 0xCE, 0xB2, 0xCE, 0xB3,
+ 0xCE, 0xB4, 0xCE, 0xB5, 0xCE, 0xB6, 0xCE, 0xB7,
+ 0xCE, 0xB8, 0xCE, 0xB9, 0xCE, 0xBA, 0xCE, 0xBB,
+ 0xCE, 0xBC, 0xCE, 0xBD, 0xCE, 0xBE, 0xCE, 0xBF,
+ 0xCF, 0x80, 0xCF, 0x81, 0xCF, 0x82, 0xCF, 0x83,
+ 0xCF, 0x84, 0xCF, 0x85, 0xCF, 0x86, 0xCF, 0x87,
+ 0xCF, 0x88, 0xCF, 0x89, 0xE2, 0x88, 0x82, 0xCE,
+ 0xB5, 0xCE, 0xB8, 0xCE, 0xBA, 0xCF, 0x86, 0xCF,
+ 0x81, 0xCF, 0x80, 0xCE, 0x91, 0xCE, 0x92, 0xCE,
+ 0x93, 0xCE, 0x94, 0xCE, 0x95, 0xCE, 0x96, 0xCE,
+ 0x97, 0xCE, 0x98, 0xCE, 0x99, 0xCE, 0x9A, 0xCE,
+ 0x9B, 0xCE, 0x9C, 0xCE, 0x9D, 0xCE, 0x9E, 0xCE,
+ 0x9F, 0xCE, 0xA0, 0xCE, 0xA1, 0xCE, 0x98, 0xCE,
+ 0xA3, 0xCE, 0xA4, 0xCE, 0xA5, 0xCE, 0xA6, 0xCE,
+ 0xA7, 0xCE, 0xA8, 0xCE, 0xA9, 0xE2, 0x88, 0x87,
+ 0xCE, 0xB1, 0xCE, 0xB2, 0xCE, 0xB3, 0xCE, 0xB4,
+ 0xCE, 0xB5, 0xCE, 0xB6, 0xCE, 0xB7, 0xCE, 0xB8,
+ 0xCE, 0xB9, 0xCE, 0xBA, 0xCE, 0xBB, 0xCE, 0xBC,
+ 0xCE, 0xBD, 0xCE, 0xBE, 0xCE, 0xBF, 0xCF, 0x80,
+ 0xCF, 0x81, 0xCF, 0x82, 0xCF, 0x83, 0xCF, 0x84,
+ 0xCF, 0x85, 0xCF, 0x86, 0xCF, 0x87, 0xCF, 0x88,
+ 0xCF, 0x89, 0xE2, 0x88, 0x82, 0xCE, 0xB5, 0xCE,
+ 0xB8, 0xCE, 0xBA, 0xCF, 0x86, 0xCF, 0x81, 0xCF,
+ 0x80, 0xCE, 0x91, 0xCE, 0x92, 0xCE, 0x93, 0xCE,
+ 0x94, 0xCE, 0x95, 0xCE, 0x96, 0xCE, 0x97, 0xCE,
+ 0x98, 0xCE, 0x99, 0xCE, 0x9A, 0xCE, 0x9B, 0xCE,
+ 0x9C, 0xCE, 0x9D, 0xCE, 0x9E, 0xCE, 0x9F, 0xCE,
+ 0xA0, 0xCE, 0xA1, 0xCE, 0x98, 0xCE, 0xA3, 0xCE,
+ 0xA4, 0xCE, 0xA5, 0xCE, 0xA6, 0xCE, 0xA7, 0xCE,
+ 0xA8, 0xCE, 0xA9, 0xE2, 0x88, 0x87, 0xCE, 0xB1,
+ 0xCE, 0xB2, 0xCE, 0xB3, 0xCE, 0xB4, 0xCE, 0xB5,
+ 0xCE, 0xB6, 0xCE, 0xB7, 0xCE, 0xB8, 0xCE, 0xB9,
+ 0xCE, 0xBA, 0xCE, 0xBB, 0xCE, 0xBC, 0xCE, 0xBD,
+ 0xCE, 0xBE, 0xCE, 0xBF, 0xCF, 0x80, 0xCF, 0x81,
+ 0xCF, 0x82, 0xCF, 0x83, 0xCF, 0x84, 0xCF, 0x85,
+ 0xCF, 0x86, 0xCF, 0x87, 0xCF, 0x88, 0xCF, 0x89,
+ 0xE2, 0x88, 0x82, 0xCE, 0xB5, 0xCE, 0xB8, 0xCE,
+ 0xBA, 0xCF, 0x86, 0xCF, 0x81, 0xCF, 0x80, 0xCE,
+ 0x91, 0xCE, 0x92, 0xCE, 0x93, 0xCE, 0x94, 0xCE,
+ 0x95, 0xCE, 0x96, 0xCE, 0x97, 0xCE, 0x98, 0xCE,
+ 0x99, 0xCE, 0x9A, 0xCE, 0x9B, 0xCE, 0x9C, 0xCE,
+ 0x9D, 0xCE, 0x9E, 0xCE, 0x9F, 0xCE, 0xA0, 0xCE,
+ 0xA1, 0xCE, 0x98, 0xCE, 0xA3, 0xCE, 0xA4, 0xCE,
+ 0xA5, 0xCE, 0xA6, 0xCE, 0xA7, 0xCE, 0xA8, 0xCE,
+ 0xA9, 0xE2, 0x88, 0x87, 0xCE, 0xB1, 0xCE, 0xB2,
+ 0xCE, 0xB3, 0xCE, 0xB4, 0xCE, 0xB5, 0xCE, 0xB6,
+ 0xCE, 0xB7, 0xCE, 0xB8, 0xCE, 0xB9, 0xCE, 0xBA,
+ 0xCE, 0xBB, 0xCE, 0xBC, 0xCE, 0xBD, 0xCE, 0xBE,
+ 0xCE, 0xBF, 0xCF, 0x80, 0xCF, 0x81, 0xCF, 0x82,
+ 0xCF, 0x83, 0xCF, 0x84, 0xCF, 0x85, 0xCF, 0x86,
+ 0xCF, 0x87, 0xCF, 0x88, 0xCF, 0x89, 0xE2, 0x88,
+ 0x82, 0xCE, 0xB5, 0xCE, 0xB8, 0xCE, 0xBA, 0xCF,
+ 0x86, 0xCF, 0x81, 0xCF, 0x80, 0xCE, 0x91, 0xCE,
+ 0x92, 0xCE, 0x93, 0xCE, 0x94, 0xCE, 0x95, 0xCE,
+ 0x96, 0xCE, 0x97, 0xCE, 0x98, 0xCE, 0x99, 0xCE,
+ 0x9A, 0xCE, 0x9B, 0xCE, 0x9C, 0xCE, 0x9D, 0xCE,
+ 0x9E, 0xCE, 0x9F, 0xCE, 0xA0, 0xCE, 0xA1, 0xCE,
+ 0x98, 0xCE, 0xA3, 0xCE, 0xA4, 0xCE, 0xA5, 0xCE,
+ 0xA6, 0xCE, 0xA7, 0xCE, 0xA8, 0xCE, 0xA9, 0xE2,
+ 0x88, 0x87, 0xCE, 0xB1, 0xCE, 0xB2, 0xCE, 0xB3,
+ 0xCE, 0xB4, 0xCE, 0xB5, 0xCE, 0xB6, 0xCE, 0xB7,
+ 0xCE, 0xB8, 0xCE, 0xB9, 0xCE, 0xBA, 0xCE, 0xBB,
+ 0xCE, 0xBC, 0xCE, 0xBD, 0xCE, 0xBE, 0xCE, 0xBF,
+ 0xCF, 0x80, 0xCF, 0x81, 0xCF, 0x82, 0xCF, 0x83,
+ 0xCF, 0x84, 0xCF, 0x85, 0xCF, 0x86, 0xCF, 0x87,
+ 0xCF, 0x88, 0xCF, 0x89, 0xE2, 0x88, 0x82, 0xCE,
+ 0xB5, 0xCE, 0xB8, 0xCE, 0xBA, 0xCF, 0x86, 0xCF,
+ 0x81, 0xCF, 0x80, 0xCF, 0x9C, 0xCF, 0x9D, 0x30,
+ 0x31, 0x32, 0x33, 0x34, 0x35, 0x36, 0x37, 0x38,
+ 0x39, 0x30, 0x31, 0x32, 0x33, 0x34, 0x35, 0x36,
+ 0x37, 0x38, 0x39, 0x30, 0x31, 0x32, 0x33, 0x34,
+ 0x35, 0x36, 0x37, 0x38, 0x39, 0x30, 0x31, 0x32,
+ 0x33, 0x34, 0x35, 0x36, 0x37, 0x38, 0x39, 0x30,
+ 0x31, 0x32, 0x33, 0x34, 0x35, 0x36, 0x37, 0x38,
+ 0x39, 0xF6, 0xE4, 0xB8, 0xBD, 0xF6, 0xE4, 0xB8,
+ 0xB8, 0xF6, 0xE4, 0xB9, 0x81, 0xF6, 0xF0, 0xA0,
+ 0x84, 0xA2, 0xF6, 0xE4, 0xBD, 0xA0, 0xF6, 0xE4,
+ 0xBE, 0xAE, 0xF6, 0xE4, 0xBE, 0xBB, 0xF6, 0xE5,
+ 0x80, 0x82, 0xF6, 0xE5, 0x81, 0xBA, 0xF6, 0xE5,
+ 0x82, 0x99, 0xF6, 0xE5, 0x83, 0xA7, 0xF6, 0xE5,
+ 0x83, 0x8F, 0xF6, 0xE3, 0x92, 0x9E, 0xF6, 0xF0,
+ 0xA0, 0x98, 0xBA, 0xF6, 0xE5, 0x85, 0x8D, 0xF6,
+ 0xE5, 0x85, 0x94, 0xF6, 0xE5, 0x85, 0xA4, 0xF6,
+ 0xE5, 0x85, 0xB7, 0xF6, 0xF0, 0xA0, 0x94, 0x9C,
+ 0xF6, 0xE3, 0x92, 0xB9, 0xF6, 0xE5, 0x85, 0xA7,
+ 0xF6, 0xE5, 0x86, 0x8D, 0xF6, 0xF0, 0xA0, 0x95,
+ 0x8B, 0xF6, 0xE5, 0x86, 0x97, 0xF6, 0xE5, 0x86,
+ 0xA4, 0xF6, 0xE4, 0xBB, 0x8C, 0xF6, 0xE5, 0x86,
+ 0xAC, 0xF6, 0xE5, 0x86, 0xB5, 0xF6, 0xF0, 0xA9,
+ 0x87, 0x9F, 0xF6, 0xE5, 0x87, 0xB5, 0xF6, 0xE5,
+ 0x88, 0x83, 0xF6, 0xE3, 0x93, 0x9F, 0xF6, 0xE5,
+ 0x88, 0xBB, 0xF6, 0xE5, 0x89, 0x86, 0xF6, 0xE5,
+ 0x89, 0xB2, 0xF6, 0xE5, 0x89, 0xB7, 0xF6, 0xE3,
+ 0x94, 0x95, 0xF6, 0xE5, 0x8B, 0x87, 0xF6, 0xE5,
+ 0x8B, 0x89, 0xF6, 0xE5, 0x8B, 0xA4, 0xF6, 0xE5,
+ 0x8B, 0xBA, 0xF6, 0xE5, 0x8C, 0x85, 0xF6, 0xE5,
+ 0x8C, 0x86, 0xF6, 0xE5, 0x8C, 0x97, 0xF6, 0xE5,
+ 0x8D, 0x89, 0xF6, 0xE5, 0x8D, 0x91, 0xF6, 0xE5,
+ 0x8D, 0x9A, 0xF6, 0xE5, 0x8D, 0xB3, 0xF6, 0xE5,
+ 0x8D, 0xBD, 0xF6, 0xE5, 0x8D, 0xBF, 0xF6, 0xE5,
+ 0x8D, 0xBF, 0xF6, 0xE5, 0x8D, 0xBF, 0xF6, 0xF0,
+ 0xA0, 0xA8, 0xAC, 0xF6, 0xE7, 0x81, 0xB0, 0xF6,
+ 0xE5, 0x8F, 0x8A, 0xF6, 0xE5, 0x8F, 0x9F, 0xF6,
+ 0xF0, 0xA0, 0xAD, 0xA3, 0xF6, 0xE5, 0x8F, 0xAB,
+ 0xF6, 0xE5, 0x8F, 0xB1, 0xF6, 0xE5, 0x90, 0x86,
+ 0xF6, 0xE5, 0x92, 0x9E, 0xF6, 0xE5, 0x90, 0xB8,
+ 0xF6, 0xE5, 0x91, 0x88, 0xF6, 0xE5, 0x91, 0xA8,
+ 0xF6, 0xE5, 0x92, 0xA2, 0xF6, 0xE5, 0x93, 0xB6,
+ 0xF6, 0xE5, 0x94, 0x90, 0xF6, 0xE5, 0x95, 0x93,
+ 0xF6, 0xE5, 0x95, 0xA3, 0xF6, 0xE5, 0x96, 0x84,
+ 0xF6, 0xE5, 0x96, 0x84, 0xF6, 0xE5, 0x96, 0x99,
+ 0xF6, 0xE5, 0x96, 0xAB, 0xF6, 0xE5, 0x96, 0xB3,
+ 0xF6, 0xE5, 0x97, 0x82, 0xF6, 0xE5, 0x9C, 0x96,
+ 0xF6, 0xE5, 0x98, 0x86, 0xF6, 0xE5, 0x9C, 0x97,
+ 0xF6, 0xE5, 0x99, 0x91, 0xF6, 0xE5, 0x99, 0xB4,
+ 0xF6, 0xE5, 0x88, 0x87, 0xF6, 0xE5, 0xA3, 0xAE,
+ 0xF6, 0xE5, 0x9F, 0x8E, 0xF6, 0xE5, 0x9F, 0xB4,
+ 0xF6, 0xE5, 0xA0, 0x8D, 0xF6, 0xE5, 0x9E, 0x8B,
+ 0xF6, 0xE5, 0xA0, 0xB2, 0xF6, 0xE5, 0xA0, 0xB1,
+ 0xF6, 0xE5, 0xA2, 0xAC, 0xF6, 0xF0, 0xA1, 0x93,
+ 0xA4, 0xF6, 0xE5, 0xA3, 0xB2, 0xF6, 0xE5, 0xA3,
+ 0xB7, 0xF6, 0xE5, 0xA4, 0x86, 0xF6, 0xE5, 0xA4,
+ 0x9A, 0xF6, 0xE5, 0xA4, 0xA2, 0xF6, 0xE5, 0xA5,
+ 0xA2, 0xF6, 0xF0, 0xA1, 0x9A, 0xA8, 0xF6, 0xF0,
+ 0xA1, 0x9B, 0xAA, 0xF6, 0xE5, 0xA7, 0xAC, 0xF6,
+ 0xE5, 0xA8, 0x9B, 0xF6, 0xE5, 0xA8, 0xA7, 0xF6,
+ 0xE5, 0xA7, 0x98, 0xF6, 0xE5, 0xA9, 0xA6, 0xF6,
+ 0xE3, 0x9B, 0xAE, 0xF6, 0xE3, 0x9B, 0xBC, 0xF6,
+ 0xE5, 0xAC, 0x88, 0xF6, 0xE5, 0xAC, 0xBE, 0xF6,
+ 0xE5, 0xAC, 0xBE, 0xF6, 0xF0, 0xA1, 0xA7, 0x88,
+ 0xF6, 0xE5, 0xAF, 0x83, 0xF6, 0xE5, 0xAF, 0x98,
+ 0xF6, 0xE5, 0xAF, 0xA7, 0xF6, 0xE5, 0xAF, 0xB3,
+ 0xF6, 0xF0, 0xA1, 0xAC, 0x98, 0xF6, 0xE5, 0xAF,
+ 0xBF, 0xF6, 0xE5, 0xB0, 0x86, 0xF6, 0xE5, 0xBD,
+ 0x93, 0xF6, 0xE5, 0xB0, 0xA2, 0xF6, 0xE3, 0x9E,
+ 0x81, 0xF6, 0xE5, 0xB1, 0xA0, 0xF6, 0xE5, 0xB1,
+ 0xAE, 0xF6, 0xE5, 0xB3, 0x80, 0xF6, 0xE5, 0xB2,
+ 0x8D, 0xF6, 0xF0, 0xA1, 0xB7, 0xA4, 0xF6, 0xE5,
+ 0xB5, 0x83, 0xF6, 0xF0, 0xA1, 0xB7, 0xA6, 0xF6,
+ 0xE5, 0xB5, 0xAE, 0xF6, 0xE5, 0xB5, 0xAB, 0xF6,
+ 0xE5, 0xB5, 0xBC, 0xF6, 0xE5, 0xB7, 0xA1, 0xF6,
+ 0xE5, 0xB7, 0xA2, 0xF6, 0xE3, 0xA0, 0xAF, 0xF6,
+ 0xE5, 0xB7, 0xBD, 0xF6, 0xE5, 0xB8, 0xA8, 0xF6,
+ 0xE5, 0xB8, 0xBD, 0xF6, 0xE5, 0xB9, 0xA9, 0xF6,
+ 0xE3, 0xA1, 0xA2, 0xF6, 0xF0, 0xA2, 0x86, 0x83,
+ 0xF6, 0xE3, 0xA1, 0xBC, 0xF6, 0xE5, 0xBA, 0xB0,
+ 0xF6, 0xE5, 0xBA, 0xB3, 0xF6, 0xE5, 0xBA, 0xB6,
+ 0xF6, 0xE5, 0xBB, 0x8A, 0xF6, 0xF0, 0xAA, 0x8E,
+ 0x92, 0xF6, 0xE5, 0xBB, 0xBE, 0xF6, 0xF0, 0xA2,
+ 0x8C, 0xB1, 0xF6, 0xF0, 0xA2, 0x8C, 0xB1, 0xF6,
+ 0xE8, 0x88, 0x81, 0xF6, 0xE5, 0xBC, 0xA2, 0xF6,
+ 0xE5, 0xBC, 0xA2, 0xF6, 0xE3, 0xA3, 0x87, 0xF6,
+ 0xF0, 0xA3, 0x8A, 0xB8, 0xF6, 0xF0, 0xA6, 0x87,
+ 0x9A, 0xF6, 0xE5, 0xBD, 0xA2, 0xF6, 0xE5, 0xBD,
+ 0xAB, 0xF6, 0xE3, 0xA3, 0xA3, 0xF6, 0xE5, 0xBE,
+ 0x9A, 0xF6, 0xE5, 0xBF, 0x8D, 0xF6, 0xE5, 0xBF,
+ 0x97, 0xF6, 0xE5, 0xBF, 0xB9, 0xF6, 0xE6, 0x82,
+ 0x81, 0xF6, 0xE3, 0xA4, 0xBA, 0xF6, 0xE3, 0xA4,
+ 0x9C, 0xF6, 0xE6, 0x82, 0x94, 0xF6, 0xF0, 0xA2,
+ 0x9B, 0x94, 0xF6, 0xE6, 0x83, 0x87, 0xF6, 0xE6,
+ 0x85, 0x88, 0xF6, 0xE6, 0x85, 0x8C, 0xF6, 0xE6,
+ 0x85, 0x8E, 0xF6, 0xE6, 0x85, 0x8C, 0xF6, 0xE6,
+ 0x85, 0xBA, 0xF6, 0xE6, 0x86, 0x8E, 0xF6, 0xE6,
+ 0x86, 0xB2, 0xF6, 0xE6, 0x86, 0xA4, 0xF6, 0xE6,
+ 0x86, 0xAF, 0xF6, 0xE6, 0x87, 0x9E, 0xF6, 0xE6,
+ 0x87, 0xB2, 0xF6, 0xE6, 0x87, 0xB6, 0xF6, 0xE6,
+ 0x88, 0x90, 0xF6, 0xE6, 0x88, 0x9B, 0xF6, 0xE6,
+ 0x89, 0x9D, 0xF6, 0xE6, 0x8A, 0xB1, 0xF6, 0xE6,
+ 0x8B, 0x94, 0xF6, 0xE6, 0x8D, 0x90, 0xF6, 0xF0,
+ 0xA2, 0xAC, 0x8C, 0xF6, 0xE6, 0x8C, 0xBD, 0xF6,
+ 0xE6, 0x8B, 0xBC, 0xF6, 0xE6, 0x8D, 0xA8, 0xF6,
+ 0xE6, 0x8E, 0x83, 0xF6, 0xE6, 0x8F, 0xA4, 0xF6,
+ 0xF0, 0xA2, 0xAF, 0xB1, 0xF6, 0xE6, 0x90, 0xA2,
+ 0xF6, 0xE6, 0x8F, 0x85, 0xF6, 0xE6, 0x8E, 0xA9,
+ 0xF6, 0xE3, 0xA8, 0xAE, 0xF6, 0xE6, 0x91, 0xA9,
+ 0xF6, 0xE6, 0x91, 0xBE, 0xF6, 0xE6, 0x92, 0x9D,
+ 0xF6, 0xE6, 0x91, 0xB7, 0xF6, 0xE3, 0xA9, 0xAC,
+ 0xF6, 0xE6, 0x95, 0x8F, 0xF6, 0xE6, 0x95, 0xAC,
+ 0xF6, 0xF0, 0xA3, 0x80, 0x8A, 0xF6, 0xE6, 0x97,
+ 0xA3, 0xF6, 0xE6, 0x9B, 0xB8, 0xF6, 0xE6, 0x99,
+ 0x89, 0xF6, 0xE3, 0xAC, 0x99, 0xF6, 0xE6, 0x9A,
+ 0x91, 0xF6, 0xE3, 0xAC, 0x88, 0xF6, 0xE3, 0xAB,
+ 0xA4, 0xF6, 0xE5, 0x86, 0x92, 0xF6, 0xE5, 0x86,
+ 0x95, 0xF6, 0xE6, 0x9C, 0x80, 0xF6, 0xE6, 0x9A,
+ 0x9C, 0xF6, 0xE8, 0x82, 0xAD, 0xF6, 0xE4, 0x8F,
+ 0x99, 0xF6, 0xE6, 0x9C, 0x97, 0xF6, 0xE6, 0x9C,
+ 0x9B, 0xF6, 0xE6, 0x9C, 0xA1, 0xF6, 0xE6, 0x9D,
+ 0x9E, 0xF6, 0xE6, 0x9D, 0x93, 0xF6, 0xF0, 0xA3,
+ 0x8F, 0x83, 0xF6, 0xE3, 0xAD, 0x89, 0xF6, 0xE6,
+ 0x9F, 0xBA, 0xF6, 0xE6, 0x9E, 0x85, 0xF6, 0xE6,
+ 0xA1, 0x92, 0xF6, 0xE6, 0xA2, 0x85, 0xF6, 0xF0,
+ 0xA3, 0x91, 0xAD, 0xF6, 0xE6, 0xA2, 0x8E, 0xF6,
+ 0xE6, 0xA0, 0x9F, 0xF6, 0xE6, 0xA4, 0x94, 0xF6,
+ 0xE3, 0xAE, 0x9D, 0xF6, 0xE6, 0xA5, 0x82, 0xF6,
+ 0xE6, 0xA6, 0xA3, 0xF6, 0xE6, 0xA7, 0xAA, 0xF6,
+ 0xE6, 0xAA, 0xA8, 0xF6, 0xF0, 0xA3, 0x9A, 0xA3,
+ 0xF6, 0xE6, 0xAB, 0x9B, 0xF6, 0xE3, 0xB0, 0x98,
+ 0xF6, 0xE6, 0xAC, 0xA1, 0xF6, 0xF0, 0xA3, 0xA2,
+ 0xA7, 0xF6, 0xE6, 0xAD, 0x94, 0xF6, 0xE3, 0xB1,
+ 0x8E, 0xF6, 0xE6, 0xAD, 0xB2, 0xF6, 0xE6, 0xAE,
+ 0x9F, 0xF6, 0xE6, 0xAE, 0xBA, 0xF6, 0xE6, 0xAE,
+ 0xBB, 0xF6, 0xF0, 0xA3, 0xAA, 0x8D, 0xF6, 0xF0,
+ 0xA1, 0xB4, 0x8B, 0xF6, 0xF0, 0xA3, 0xAB, 0xBA,
+ 0xF6, 0xE6, 0xB1, 0x8E, 0xF6, 0xF0, 0xA3, 0xB2,
+ 0xBC, 0xF6, 0xE6, 0xB2, 0xBF, 0xF6, 0xE6, 0xB3,
+ 0x8D, 0xF6, 0xE6, 0xB1, 0xA7, 0xF6, 0xE6, 0xB4,
+ 0x96, 0xF6, 0xE6, 0xB4, 0xBE, 0xF6, 0xE6, 0xB5,
+ 0xB7, 0xF6, 0xE6, 0xB5, 0x81, 0xF6, 0xE6, 0xB5,
+ 0xA9, 0xF6, 0xE6, 0xB5, 0xB8, 0xF6, 0xE6, 0xB6,
+ 0x85, 0xF6, 0xF0, 0xA3, 0xB4, 0x9E, 0xF6, 0xE6,
+ 0xB4, 0xB4, 0xF6, 0xE6, 0xB8, 0xAF, 0xF6, 0xE6,
+ 0xB9, 0xAE, 0xF6, 0xE3, 0xB4, 0xB3, 0xF6, 0xE6,
+ 0xBB, 0x8B, 0xF6, 0xE6, 0xBB, 0x87, 0xF6, 0xF0,
+ 0xA3, 0xBB, 0x91, 0xF6, 0xE6, 0xB7, 0xB9, 0xF6,
+ 0xE6, 0xBD, 0xAE, 0xF6, 0xF0, 0xA3, 0xBD, 0x9E,
+ 0xF6, 0xF0, 0xA3, 0xBE, 0x8E, 0xF6, 0xE6, 0xBF,
+ 0x86, 0xF6, 0xE7, 0x80, 0xB9, 0xF6, 0xE7, 0x80,
+ 0x9E, 0xF6, 0xE7, 0x80, 0x9B, 0xF6, 0xE3, 0xB6,
+ 0x96, 0xF6, 0xE7, 0x81, 0x8A, 0xF6, 0xE7, 0x81,
+ 0xBD, 0xF6, 0xE7, 0x81, 0xB7, 0xF6, 0xE7, 0x82,
+ 0xAD, 0xF6, 0xF0, 0xA0, 0x94, 0xA5, 0xF6, 0xE7,
+ 0x85, 0x85, 0xF6, 0xF0, 0xA4, 0x89, 0xA3, 0xF6,
+ 0xE7, 0x86, 0x9C, 0xF6, 0xF0, 0xA4, 0x8E, 0xAB,
+ 0xF6, 0xE7, 0x88, 0xA8, 0xF6, 0xE7, 0x88, 0xB5,
+ 0xF6, 0xE7, 0x89, 0x90, 0xF6, 0xF0, 0xA4, 0x98,
+ 0x88, 0xF6, 0xE7, 0x8A, 0x80, 0xF6, 0xE7, 0x8A,
+ 0x95, 0xF6, 0xF0, 0xA4, 0x9C, 0xB5, 0xF6, 0xF0,
+ 0xA4, 0xA0, 0x94, 0xF6, 0xE7, 0x8D, 0xBA, 0xF6,
+ 0xE7, 0x8E, 0x8B, 0xF6, 0xE3, 0xBA, 0xAC, 0xF6,
+ 0xE7, 0x8E, 0xA5, 0xF6, 0xE3, 0xBA, 0xB8, 0xF6,
+ 0xE3, 0xBA, 0xB8, 0xF6, 0xE7, 0x91, 0x87, 0xF6,
+ 0xE7, 0x91, 0x9C, 0xF6, 0xE7, 0x91, 0xB1, 0xF6,
+ 0xE7, 0x92, 0x85, 0xF6, 0xE7, 0x93, 0x8A, 0xF6,
+ 0xE3, 0xBC, 0x9B, 0xF6, 0xE7, 0x94, 0xA4, 0xF6,
+ 0xF0, 0xA4, 0xB0, 0xB6, 0xF6, 0xE7, 0x94, 0xBE,
+ 0xF6, 0xF0, 0xA4, 0xB2, 0x92, 0xF6, 0xE7, 0x95,
+ 0xB0, 0xF6, 0xF0, 0xA2, 0x86, 0x9F, 0xF6, 0xE7,
+ 0x98, 0x90, 0xF6, 0xF0, 0xA4, 0xBE, 0xA1, 0xF6,
+ 0xF0, 0xA4, 0xBE, 0xB8, 0xF6, 0xF0, 0xA5, 0x81,
+ 0x84, 0xF6, 0xE3, 0xBF, 0xBC, 0xF6, 0xE4, 0x80,
+ 0x88, 0xF6, 0xE7, 0x9B, 0xB4, 0xF6, 0xF0, 0xA5,
+ 0x83, 0xB3, 0xF6, 0xF0, 0xA5, 0x83, 0xB2, 0xF6,
+ 0xF0, 0xA5, 0x84, 0x99, 0xF6, 0xF0, 0xA5, 0x84,
+ 0xB3, 0xF6, 0xE7, 0x9C, 0x9E, 0xF6, 0xE7, 0x9C,
+ 0x9F, 0xF6, 0xE7, 0x9C, 0x9F, 0xF6, 0xE7, 0x9D,
+ 0x8A, 0xF6, 0xE4, 0x80, 0xB9, 0xF6, 0xE7, 0x9E,
+ 0x8B, 0xF6, 0xE4, 0x81, 0x86, 0xF6, 0xE4, 0x82,
+ 0x96, 0xF6, 0xF0, 0xA5, 0x90, 0x9D, 0xF6, 0xE7,
+ 0xA1, 0x8E, 0xF6, 0xE7, 0xA2, 0x8C, 0xF6, 0xE7,
+ 0xA3, 0x8C, 0xF6, 0xE4, 0x83, 0xA3, 0xF6, 0xF0,
+ 0xA5, 0x98, 0xA6, 0xF6, 0xE7, 0xA5, 0x96, 0xF6,
+ 0xF0, 0xA5, 0x9A, 0x9A, 0xF6, 0xF0, 0xA5, 0x9B,
+ 0x85, 0xF6, 0xE7, 0xA6, 0x8F, 0xF6, 0xE7, 0xA7,
+ 0xAB, 0xF6, 0xE4, 0x84, 0xAF, 0xF6, 0xE7, 0xA9,
+ 0x80, 0xF6, 0xE7, 0xA9, 0x8A, 0xF6, 0xE7, 0xA9,
+ 0x8F, 0xF6, 0xF0, 0xA5, 0xA5, 0xBC, 0xF6, 0xF0,
+ 0xA5, 0xAA, 0xA7, 0xF6, 0xF0, 0xA5, 0xAA, 0xA7,
+ 0xF6, 0xE7, 0xAB, 0xAE, 0xF6, 0xE4, 0x88, 0x82,
+ 0xF6, 0xF0, 0xA5, 0xAE, 0xAB, 0xF6, 0xE7, 0xAF,
+ 0x86, 0xF6, 0xE7, 0xAF, 0x89, 0xF6, 0xE4, 0x88,
+ 0xA7, 0xF6, 0xF0, 0xA5, 0xB2, 0x80, 0xF6, 0xE7,
+ 0xB3, 0x92, 0xF6, 0xE4, 0x8A, 0xA0, 0xF6, 0xE7,
+ 0xB3, 0xA8, 0xF6, 0xE7, 0xB3, 0xA3, 0xF6, 0xE7,
+ 0xB4, 0x80, 0xF6, 0xF0, 0xA5, 0xBE, 0x86, 0xF6,
+ 0xE7, 0xB5, 0xA3, 0xF6, 0xE4, 0x8C, 0x81, 0xF6,
+ 0xE7, 0xB7, 0x87, 0xF6, 0xE7, 0xB8, 0x82, 0xF6,
+ 0xE7, 0xB9, 0x85, 0xF6, 0xE4, 0x8C, 0xB4, 0xF6,
+ 0xF0, 0xA6, 0x88, 0xA8, 0xF6, 0xF0, 0xA6, 0x89,
+ 0x87, 0xF6, 0xE4, 0x8D, 0x99, 0xF6, 0xF0, 0xA6,
+ 0x8B, 0x99, 0xF6, 0xE7, 0xBD, 0xBA, 0xF6, 0xF0,
+ 0xA6, 0x8C, 0xBE, 0xF6, 0xE7, 0xBE, 0x95, 0xF6,
+ 0xE7, 0xBF, 0xBA, 0xF6, 0xE8, 0x80, 0x85, 0xF6,
+ 0xF0, 0xA6, 0x93, 0x9A, 0xF6, 0xF0, 0xA6, 0x94,
+ 0xA3, 0xF6, 0xE8, 0x81, 0xA0, 0xF6, 0xF0, 0xA6,
+ 0x96, 0xA8, 0xF6, 0xE8, 0x81, 0xB0, 0xF6, 0xF0,
+ 0xA3, 0x8D, 0x9F, 0xF6, 0xE4, 0x8F, 0x95, 0xF6,
+ 0xE8, 0x82, 0xB2, 0xF6, 0xE8, 0x84, 0x83, 0xF6,
+ 0xE4, 0x90, 0x8B, 0xF6, 0xE8, 0x84, 0xBE, 0xF6,
+ 0xE5, 0xAA, 0xB5, 0xF6, 0xF0, 0xA6, 0x9E, 0xA7,
+ 0xF6, 0xF0, 0xA6, 0x9E, 0xB5, 0xF6, 0xF0, 0xA3,
+ 0x8E, 0x93, 0xF6, 0xF0, 0xA3, 0x8E, 0x9C, 0xF6,
+ 0xE8, 0x88, 0x81, 0xF6, 0xE8, 0x88, 0x84, 0xF6,
+ 0xE8, 0xBE, 0x9E, 0xF6, 0xE4, 0x91, 0xAB, 0xF6,
+ 0xE8, 0x8A, 0x91, 0xF6, 0xE8, 0x8A, 0x8B, 0xF6,
+ 0xE8, 0x8A, 0x9D, 0xF6, 0xE5, 0x8A, 0xB3, 0xF6,
+ 0xE8, 0x8A, 0xB1, 0xF6, 0xE8, 0x8A, 0xB3, 0xF6,
+ 0xE8, 0x8A, 0xBD, 0xF6, 0xE8, 0x8B, 0xA6, 0xF6,
+ 0xF0, 0xA6, 0xAC, 0xBC, 0xF6, 0xE8, 0x8B, 0xA5,
+ 0xF6, 0xE8, 0x8C, 0x9D, 0xF6, 0xE8, 0x8D, 0xA3,
+ 0xF6, 0xE8, 0x8E, 0xAD, 0xF6, 0xE8, 0x8C, 0xA3,
+ 0xF6, 0xE8, 0x8E, 0xBD, 0xF6, 0xE8, 0x8F, 0xA7,
+ 0xF6, 0xE8, 0x91, 0x97, 0xF6, 0xE8, 0x8D, 0x93,
+ 0xF6, 0xE8, 0x8F, 0x8A, 0xF6, 0xE8, 0x8F, 0x8C,
+ 0xF6, 0xE8, 0x8F, 0x9C, 0xF6, 0xF0, 0xA6, 0xB0,
+ 0xB6, 0xF6, 0xF0, 0xA6, 0xB5, 0xAB, 0xF6, 0xF0,
+ 0xA6, 0xB3, 0x95, 0xF6, 0xE4, 0x94, 0xAB, 0xF6,
+ 0xE8, 0x93, 0xB1, 0xF6, 0xE8, 0x93, 0xB3, 0xF6,
+ 0xE8, 0x94, 0x96, 0xF6, 0xF0, 0xA7, 0x8F, 0x8A,
+ 0xF6, 0xE8, 0x95, 0xA4, 0xF6, 0xF0, 0xA6, 0xBC,
+ 0xAC, 0xF6, 0xE4, 0x95, 0x9D, 0xF6, 0xE4, 0x95,
+ 0xA1, 0xF6, 0xF0, 0xA6, 0xBE, 0xB1, 0xF6, 0xF0,
+ 0xA7, 0x83, 0x92, 0xF6, 0xE4, 0x95, 0xAB, 0xF6,
+ 0xE8, 0x99, 0x90, 0xF6, 0xE8, 0x99, 0x9C, 0xF6,
+ 0xE8, 0x99, 0xA7, 0xF6, 0xE8, 0x99, 0xA9, 0xF6,
+ 0xE8, 0x9A, 0xA9, 0xF6, 0xE8, 0x9A, 0x88, 0xF6,
+ 0xE8, 0x9C, 0x8E, 0xF6, 0xE8, 0x9B, 0xA2, 0xF6,
+ 0xE8, 0x9D, 0xB9, 0xF6, 0xE8, 0x9C, 0xA8, 0xF6,
+ 0xE8, 0x9D, 0xAB, 0xF6, 0xE8, 0x9E, 0x86, 0xF6,
+ 0xE4, 0x97, 0x97, 0xF6, 0xE8, 0x9F, 0xA1, 0xF6,
+ 0xE8, 0xA0, 0x81, 0xF6, 0xE4, 0x97, 0xB9, 0xF6,
+ 0xE8, 0xA1, 0xA0, 0xF6, 0xE8, 0xA1, 0xA3, 0xF6,
+ 0xF0, 0xA7, 0x99, 0xA7, 0xF6, 0xE8, 0xA3, 0x97,
+ 0xF6, 0xE8, 0xA3, 0x9E, 0xF6, 0xE4, 0x98, 0xB5,
+ 0xF6, 0xE8, 0xA3, 0xBA, 0xF6, 0xE3, 0x92, 0xBB,
+ 0xF6, 0xF0, 0xA7, 0xA2, 0xAE, 0xF6, 0xF0, 0xA7,
+ 0xA5, 0xA6, 0xF6, 0xE4, 0x9A, 0xBE, 0xF6, 0xE4,
+ 0x9B, 0x87, 0xF6, 0xE8, 0xAA, 0xA0, 0xF6, 0xE8,
+ 0xAB, 0xAD, 0xF6, 0xE8, 0xAE, 0x8A, 0xF6, 0xE8,
+ 0xB1, 0x95, 0xF6, 0xF0, 0xA7, 0xB2, 0xA8, 0xF6,
+ 0xE8, 0xB2, 0xAB, 0xF6, 0xE8, 0xB3, 0x81, 0xF6,
+ 0xE8, 0xB4, 0x9B, 0xF6, 0xE8, 0xB5, 0xB7, 0xF6,
+ 0xF0, 0xA7, 0xBC, 0xAF, 0xF6, 0xF0, 0xA0, 0xA0,
+ 0x84, 0xF6, 0xE8, 0xB7, 0x8B, 0xF6, 0xE8, 0xB6,
+ 0xBC, 0xF6, 0xE8, 0xB7, 0xB0, 0xF6, 0xF0, 0xA0,
+ 0xA3, 0x9E, 0xF6, 0xE8, 0xBB, 0x94, 0xF6, 0xE8,
+ 0xBC, 0xB8, 0xF6, 0xF0, 0xA8, 0x97, 0x92, 0xF6,
+ 0xF0, 0xA8, 0x97, 0xAD, 0xF6, 0xE9, 0x82, 0x94,
+ 0xF6, 0xE9, 0x83, 0xB1, 0xF6, 0xE9, 0x84, 0x91,
+ 0xF6, 0xF0, 0xA8, 0x9C, 0xAE, 0xF6, 0xE9, 0x84,
+ 0x9B, 0xF6, 0xE9, 0x88, 0xB8, 0xF6, 0xE9, 0x8B,
+ 0x97, 0xF6, 0xE9, 0x8B, 0x98, 0xF6, 0xE9, 0x89,
+ 0xBC, 0xF6, 0xE9, 0x8F, 0xB9, 0xF6, 0xE9, 0x90,
+ 0x95, 0xF6, 0xF0, 0xA8, 0xAF, 0xBA, 0xF6, 0xE9,
+ 0x96, 0x8B, 0xF6, 0xE4, 0xA6, 0x95, 0xF6, 0xE9,
+ 0x96, 0xB7, 0xF6, 0xF0, 0xA8, 0xB5, 0xB7, 0xF6,
+ 0xE4, 0xA7, 0xA6, 0xF6, 0xE9, 0x9B, 0x83, 0xF6,
+ 0xE5, 0xB6, 0xB2, 0xF6, 0xE9, 0x9C, 0xA3, 0xF6,
+ 0xF0, 0xA9, 0x85, 0x85, 0xF6, 0xF0, 0xA9, 0x88,
+ 0x9A, 0xF6, 0xE4, 0xA9, 0xAE, 0xF6, 0xE4, 0xA9,
+ 0xB6, 0xF6, 0xE9, 0x9F, 0xA0, 0xF6, 0xF0, 0xA9,
+ 0x90, 0x8A, 0xF6, 0xE4, 0xAA, 0xB2, 0xF6, 0xF0,
+ 0xA9, 0x92, 0x96, 0xF6, 0xE9, 0xA0, 0x8B, 0xF6,
+ 0xE9, 0xA0, 0x8B, 0xF6, 0xE9, 0xA0, 0xA9, 0xF6,
+ 0xF0, 0xA9, 0x96, 0xB6, 0xF6, 0xE9, 0xA3, 0xA2,
+ 0xF6, 0xE4, 0xAC, 0xB3, 0xF6, 0xE9, 0xA4, 0xA9,
+ 0xF6, 0xE9, 0xA6, 0xA7, 0xF6, 0xE9, 0xA7, 0x82,
+ 0xF6, 0xE9, 0xA7, 0xBE, 0xF6, 0xE4, 0xAF, 0x8E,
+ 0xF6, 0xF0, 0xA9, 0xAC, 0xB0, 0xF6, 0xE9, 0xAC,
+ 0x92, 0xF6, 0xE9, 0xB1, 0x80, 0xF6, 0xE9, 0xB3,
+ 0xBD, 0xF6, 0xE4, 0xB3, 0x8E, 0xF6, 0xE4, 0xB3,
+ 0xAD, 0xF6, 0xE9, 0xB5, 0xA7, 0xF6, 0xF0, 0xAA,
+ 0x83, 0x8E, 0xF6, 0xE4, 0xB3, 0xB8, 0xF6, 0xF0,
+ 0xAA, 0x84, 0x85, 0xF6, 0xF0, 0xAA, 0x88, 0x8E,
+ 0xF6, 0xF0, 0xAA, 0x8A, 0x91, 0xF6, 0xE9, 0xBA,
+ 0xBB, 0xF6, 0xE4, 0xB5, 0x96, 0xF6, 0xE9, 0xBB,
+ 0xB9, 0xF6, 0xE9, 0xBB, 0xBE, 0xF6, 0xE9, 0xBC,
+ 0x85, 0xF6, 0xE9, 0xBC, 0x8F, 0xF6, 0xE9, 0xBC,
+ 0x96, 0xF6, 0xE9, 0xBC, 0xBB, 0xF6, 0xF0, 0xAA,
+ 0x98, 0x80,
+ },
+};
+
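[Editorial note, not part of the diff: the flat byte pool ends above; the declaration below begins the second stage of the lookup trie. Each 256-entry row is indexed by one byte of a UTF-8 sequence; N_ is the file's "no entry" sentinel, and the small integers visible below (1, 2, 3, 4) select a third-level table. As a minimal sketch of how such a stage is typically walked — with every name here hypothetical (toy_b2, NO_ENTRY, second_byte_to_b3_index) and NO_ENTRY merely standing in for N_, whose real value is defined earlier in u8_textprep.c:]

    /*
     * Sketch only: consulting a 256-entry byte-indexed stage table of
     * the shape declared below.  Not the code from this change.
     */
    #include <stdint.h>
    #include <stdio.h>
    #include <string.h>

    #define NO_ENTRY 0xFF                  /* hypothetical stand-in for N_ */

    static uint8_t toy_b2[256];            /* toy second-stage table */

    /* Map one byte to a third-level table number, or -1 on a miss. */
    static int
    second_byte_to_b3_index(uint8_t b)
    {
            uint8_t v = toy_b2[b];

            return ((v == NO_ENTRY) ? -1 : (int)v);
    }

    int
    main(void)
    {
            /* Unpopulated slots hold the sentinel, as N_ does below. */
            memset(toy_b2, NO_ENTRY, sizeof (toy_b2));
            toy_b2[0xE1] = 1;              /* cf. the "1" entry below */
            toy_b2[0xE2] = 2;              /* cf. the "2" entry below */

            printf("0xE1 -> %d, 0x80 -> %d\n",
                second_byte_to_b3_index(0xE1),
                second_byte_to_b3_index(0x80));
            return (0);
    }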
+static const uchar_t u8_case_common_b2_tbl[2][2][256] = {
+ {
+ {
+ 0, N_, N_, N_, N_, N_, N_, N_,
+ N_, N_, N_, N_, N_, N_, N_, N_,
+ N_, N_, N_, N_, N_, N_, N_, N_,
+ N_, N_, N_, N_, N_, N_, N_, N_,
+ N_, N_, N_, N_, N_, N_, N_, N_,
+ N_, N_, N_, N_, N_, N_, N_, N_,
+ N_, N_, N_, N_, N_, N_, N_, N_,
+ N_, N_, N_, N_, N_, N_, N_, N_,
+ N_, N_, N_, N_, N_, N_, N_, N_,
+ N_, N_, N_, N_, N_, N_, N_, N_,
+ N_, N_, N_, N_, N_, N_, N_, N_,
+ N_, N_, N_, N_, N_, N_, N_, N_,
+ N_, N_, N_, N_, N_, N_, N_, N_,
+ N_, N_, N_, N_, N_, N_, N_, N_,
+ N_, N_, N_, N_, N_, N_, N_, N_,
+ N_, N_, N_, N_, N_, N_, N_, N_,
+ N_, N_, N_, N_, N_, N_, N_, N_,
+ N_, N_, N_, N_, N_, N_, N_, N_,
+ N_, N_, N_, N_, N_, N_, N_, N_,
+ N_, N_, N_, N_, N_, N_, N_, N_,
+ N_, N_, N_, N_, N_, N_, N_, N_,
+ N_, N_, N_, N_, N_, N_, N_, N_,
+ N_, N_, N_, N_, N_, N_, N_, N_,
+ N_, N_, N_, N_, N_, N_, N_, N_,
+ N_, N_, N_, N_, N_, N_, N_, N_,
+ N_, N_, N_, N_, N_, N_, N_, N_,
+ N_, N_, N_, N_, N_, N_, N_, N_,
+ N_, N_, N_, N_, N_, N_, N_, N_,
+ N_, 1, 2, N_, N_, N_, N_, N_,
+ N_, N_, N_, N_, N_, N_, N_, 3,
+ N_, N_, N_, N_, N_, N_, N_, N_,
+ N_, N_, N_, N_, N_, N_, N_, N_,
+ },
+ {
+ N_, N_, N_, N_, N_, N_, N_, N_,
+ N_, N_, N_, N_, N_, N_, N_, N_,
+ N_, N_, N_, N_, N_, N_, N_, N_,
+ N_, N_, N_, N_, N_, N_, N_, N_,
+ N_, N_, N_, N_, N_, N_, N_, N_,
+ N_, N_, N_, N_, N_, N_, N_, N_,
+ N_, N_, N_, N_, N_, N_, N_, N_,
+ N_, N_, N_, N_, N_, N_, N_, N_,
+ N_, N_, N_, N_, N_, N_, N_, N_,
+ N_, N_, N_, N_, N_, N_, N_, N_,
+ N_, N_, N_, N_, N_, N_, N_, N_,
+ N_, N_, N_, N_, N_, N_, N_, N_,
+ N_, N_, N_, N_, N_, N_, N_, N_,
+ N_, N_, N_, N_, N_, N_, N_, N_,
+ N_, N_, N_, N_, N_, N_, N_, N_,
+ N_, N_, N_, N_, N_, N_, N_, N_,
+ N_, N_, N_, N_, N_, N_, N_, N_,
+ N_, N_, N_, N_, N_, N_, N_, N_,
+ 4, N_, N_, N_, N_, N_, N_, N_,
+ N_, N_, N_, N_, N_, N_, N_, N_,
+ N_, N_, N_, N_, N_, N_, N_, N_,
+ N_, N_, N_, N_, N_, N_, N_, N_,
+ N_, N_, N_, N_, N_, N_, N_, N_,
+ N_, N_, N_, N_, N_, N_, N_, N_,
+ N_, N_, N_, N_, N_, N_, N_, N_,
+ N_, N_, N_, N_, N_, N_, N_, N_,
+ N_, N_, N_, N_, N_, N_, N_, N_,
+ N_, N_, N_, N_, N_, N_, N_, N_,
+ N_, N_, N_, N_, N_, N_, N_, N_,
+ N_, N_, N_, N_, N_, N_, N_, N_,
+ N_, N_, N_, N_, N_, N_, N_, N_,
+ N_, N_, N_, N_, N_, N_, N_, N_,
+ },
+
+ },
+ {
+ {
+ 0, N_, N_, N_, N_, N_, N_, N_,
+ N_, N_, N_, N_, N_, N_, N_, N_,
+ N_, N_, N_, N_, N_, N_, N_, N_,
+ N_, N_, N_, N_, N_, N_, N_, N_,
+ N_, N_, N_, N_, N_, N_, N_, N_,
+ N_, N_, N_, N_, N_, N_, N_, N_,
+ N_, N_, N_, N_, N_, N_, N_, N_,
+ N_, N_, N_, N_, N_, N_, N_, N_,
+ N_, N_, N_, N_, N_, N_, N_, N_,
+ N_, N_, N_, N_, N_, N_, N_, N_,
+ N_, N_, N_, N_, N_, N_, N_, N_,
+ N_, N_, N_, N_, N_, N_, N_, N_,
+ N_, N_, N_, N_, N_, N_, N_, N_,
+ N_, N_, N_, N_, N_, N_, N_, N_,
+ N_, N_, N_, N_, N_, N_, N_, N_,
+ N_, N_, N_, N_, N_, N_, N_, N_,
+ N_, N_, N_, N_, N_, N_, N_, N_,
+ N_, N_, N_, N_, N_, N_, N_, N_,
+ N_, N_, N_, N_, N_, N_, N_, N_,
+ N_, N_, N_, N_, N_, N_, N_, N_,
+ N_, N_, N_, N_, N_, N_, N_, N_,
+ N_, N_, N_, N_, N_, N_, N_, N_,
+ N_, N_, N_, N_, N_, N_, N_, N_,
+ N_, N_, N_, N_, N_, N_, N_, N_,
+ N_, N_, N_, N_, N_, N_, N_, N_,
+ N_, N_, N_, N_, N_, N_, N_, N_,
+ N_, N_, N_, N_, N_, N_, N_, N_,
+ N_, N_, N_, N_, N_, N_, N_, N_,
+ N_, 1, 2, N_, N_, N_, N_, N_,
+ N_, N_, N_, N_, N_, N_, N_, 3,
+ N_, N_, N_, N_, N_, N_, N_, N_,
+ N_, N_, N_, N_, N_, N_, N_, N_,
+ },
+ {
+ N_, N_, N_, N_, N_, N_, N_, N_,
+ N_, N_, N_, N_, N_, N_, N_, N_,
+ N_, N_, N_, N_, N_, N_, N_, N_,
+ N_, N_, N_, N_, N_, N_, N_, N_,
+ N_, N_, N_, N_, N_, N_, N_, N_,
+ N_, N_, N_, N_, N_, N_, N_, N_,
+ N_, N_, N_, N_, N_, N_, N_, N_,
+ N_, N_, N_, N_, N_, N_, N_, N_,
+ N_, N_, N_, N_, N_, N_, N_, N_,
+ N_, N_, N_, N_, N_, N_, N_, N_,
+ N_, N_, N_, N_, N_, N_, N_, N_,
+ N_, N_, N_, N_, N_, N_, N_, N_,
+ N_, N_, N_, N_, N_, N_, N_, N_,
+ N_, N_, N_, N_, N_, N_, N_, N_,
+ N_, N_, N_, N_, N_, N_, N_, N_,
+ N_, N_, N_, N_, N_, N_, N_, N_,
+ N_, N_, N_, N_, N_, N_, N_, N_,
+ N_, N_, N_, N_, N_, N_, N_, N_,
+ 4, N_, N_, N_, N_, N_, N_, N_,
+ N_, N_, N_, N_, N_, N_, N_, N_,
+ N_, N_, N_, N_, N_, N_, N_, N_,
+ N_, N_, N_, N_, N_, N_, N_, N_,
+ N_, N_, N_, N_, N_, N_, N_, N_,
+ N_, N_, N_, N_, N_, N_, N_, N_,
+ N_, N_, N_, N_, N_, N_, N_, N_,
+ N_, N_, N_, N_, N_, N_, N_, N_,
+ N_, N_, N_, N_, N_, N_, N_, N_,
+ N_, N_, N_, N_, N_, N_, N_, N_,
+ N_, N_, N_, N_, N_, N_, N_, N_,
+ N_, N_, N_, N_, N_, N_, N_, N_,
+ N_, N_, N_, N_, N_, N_, N_, N_,
+ N_, N_, N_, N_, N_, N_, N_, N_,
+ },
+
+ },
+
+};
+
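[Editorial note, not part of the diff: the third-level table below holds u8_displacement_t pairs. { N_, 0 } marks a byte with no lowercase mapping; a populated pair such as { 1, 60 } or { 2, 123 } names a fourth-level table and a running byte displacement into the flat output pool. The sketch below shows one plausible way such a pair is consumed; the struct is re-declared locally under a toy name because the real u8_displacement_t definition appears earlier in u8_textprep.c, and all identifiers here are hypothetical:]

    /*
     * Sketch only: consuming a { table, displacement } pair of the kind
     * declared below.  Not the code from this change.
     */
    #include <stdint.h>
    #include <stdio.h>

    #define NO_ENTRY 0xFF                  /* hypothetical stand-in for N_ */

    typedef struct {
            uint8_t         index;         /* next-level table, or sentinel */
            uint16_t        disp;          /* offset into the byte pool */
    } toy_displacement_t;

    /* Toy byte pool: two replacement UTF-8 sequences back to back. */
    static const uint8_t toy_pool[] = {
            0xC3, 0xA9,                    /* sequence 0: pool[0..1] */
            0xE1, 0x85, 0xA1,              /* sequence 1: pool[2..4] */
    };

    static const toy_displacement_t toy_b3[2] = {
            { 0, 0 },                      /* output starts at pool[0] */
            { 1, 2 },                      /* output starts at pool[2] */
    };

    int
    main(void)
    {
            const toy_displacement_t *d = &toy_b3[1];

            if (d->index != NO_ENTRY)
                    printf("mapped bytes start at pool[%u] (0x%02X...)\n",
                        (unsigned)d->disp, toy_pool[d->disp]);
            return (0);
    }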
+static const u8_displacement_t u8_tolower_b3_tbl[2][5][256] = {
+ {
+ { /* Third byte table 0. */
+ { N_, 0 }, { N_, 0 }, { N_, 0 }, { N_, 0 },
+ { N_, 0 }, { N_, 0 }, { N_, 0 }, { N_, 0 },
+ { N_, 0 }, { N_, 0 }, { N_, 0 }, { N_, 0 },
+ { N_, 0 }, { N_, 0 }, { N_, 0 }, { N_, 0 },
+ { N_, 0 }, { N_, 0 }, { N_, 0 }, { N_, 0 },
+ { N_, 0 }, { N_, 0 }, { N_, 0 }, { N_, 0 },
+ { N_, 0 }, { N_, 0 }, { N_, 0 }, { N_, 0 },
+ { N_, 0 }, { N_, 0 }, { N_, 0 }, { N_, 0 },
+ { N_, 0 }, { N_, 0 }, { N_, 0 }, { N_, 0 },
+ { N_, 0 }, { N_, 0 }, { N_, 0 }, { N_, 0 },
+ { N_, 0 }, { N_, 0 }, { N_, 0 }, { N_, 0 },
+ { N_, 0 }, { N_, 0 }, { N_, 0 }, { N_, 0 },
+ { N_, 0 }, { N_, 0 }, { N_, 0 }, { N_, 0 },
+ { N_, 0 }, { N_, 0 }, { N_, 0 }, { N_, 0 },
+ { N_, 0 }, { N_, 0 }, { N_, 0 }, { N_, 0 },
+ { N_, 0 }, { N_, 0 }, { N_, 0 }, { N_, 0 },
+ { N_, 0 }, { N_, 0 }, { N_, 0 }, { N_, 0 },
+ { N_, 0 }, { N_, 0 }, { N_, 0 }, { N_, 0 },
+ { N_, 0 }, { N_, 0 }, { N_, 0 }, { N_, 0 },
+ { N_, 0 }, { N_, 0 }, { N_, 0 }, { N_, 0 },
+ { N_, 0 }, { N_, 0 }, { N_, 0 }, { N_, 0 },
+ { N_, 0 }, { N_, 0 }, { N_, 0 }, { N_, 0 },
+ { N_, 0 }, { N_, 0 }, { N_, 0 }, { N_, 0 },
+ { N_, 0 }, { N_, 0 }, { N_, 0 }, { N_, 0 },
+ { N_, 0 }, { N_, 0 }, { N_, 0 }, { N_, 0 },
+ { N_, 0 }, { N_, 0 }, { N_, 0 }, { N_, 0 },
+ { N_, 0 }, { N_, 0 }, { N_, 0 }, { N_, 0 },
+ { N_, 0 }, { N_, 0 }, { N_, 0 }, { N_, 0 },
+ { N_, 0 }, { N_, 0 }, { N_, 0 }, { N_, 0 },
+ { N_, 0 }, { N_, 0 }, { N_, 0 }, { N_, 0 },
+ { N_, 0 }, { N_, 0 }, { N_, 0 }, { N_, 0 },
+ { N_, 0 }, { N_, 0 }, { N_, 0 }, { N_, 0 },
+ { N_, 0 }, { N_, 0 }, { N_, 0 }, { N_, 0 },
+ { N_, 0 }, { N_, 0 }, { N_, 0 }, { N_, 0 },
+ { N_, 0 }, { N_, 0 }, { N_, 0 }, { N_, 0 },
+ { N_, 0 }, { N_, 0 }, { N_, 0 }, { N_, 0 },
+ { N_, 0 }, { N_, 0 }, { N_, 0 }, { N_, 0 },
+ { N_, 0 }, { N_, 0 }, { N_, 0 }, { N_, 0 },
+ { N_, 0 }, { N_, 0 }, { N_, 0 }, { N_, 0 },
+ { N_, 0 }, { N_, 0 }, { N_, 0 }, { N_, 0 },
+ { N_, 0 }, { N_, 0 }, { N_, 0 }, { N_, 0 },
+ { N_, 0 }, { N_, 0 }, { N_, 0 }, { N_, 0 },
+ { N_, 0 }, { N_, 0 }, { N_, 0 }, { N_, 0 },
+ { N_, 0 }, { N_, 0 }, { N_, 0 }, { N_, 0 },
+ { N_, 0 }, { N_, 0 }, { N_, 0 }, { N_, 0 },
+ { N_, 0 }, { N_, 0 }, { N_, 0 }, { N_, 0 },
+ { N_, 0 }, { N_, 0 }, { N_, 0 }, { N_, 0 },
+ { N_, 0 }, { N_, 0 }, { N_, 0 }, { N_, 0 },
+ { N_, 0 }, { N_, 0 }, { N_, 0 }, { 0, 0 },
+ { 1, 60 }, { 2, 123 }, { 3, 185 }, { 4, 257 },
+ { 5, 321 }, { N_, 0 }, { N_, 0 }, { N_, 0 },
+ { N_, 0 }, { N_, 0 }, { 6, 373 }, { 7, 439 },
+ { 8, 465 }, { 9, 561 }, { 10, 593 }, { 11, 649 },
+ { 12, 703 }, { 13, 749 }, { N_, 0 }, { N_, 0 },
+ { N_, 0 }, { N_, 0 }, { N_, 0 }, { N_, 0 },
+ { N_, 0 }, { N_, 0 }, { N_, 0 }, { N_, 0 },
+ { N_, 0 }, { N_, 0 }, { N_, 0 }, { N_, 0 },
+ { N_, 0 }, { N_, 0 }, { N_, 0 }, { N_, 0 },
+ { N_, 0 }, { N_, 0 }, { N_, 0 }, { N_, 0 },
+ { N_, 0 }, { N_, 0 }, { N_, 0 }, { N_, 0 },
+ { N_, 0 }, { N_, 0 }, { N_, 0 }, { N_, 0 },
+ { N_, 0 }, { N_, 0 }, { N_, 0 }, { N_, 0 },
+ { N_, 0 }, { N_, 0 }, { N_, 0 }, { N_, 0 },
+ { N_, 0 }, { N_, 0 }, { N_, 0 }, { N_, 0 },
+ },
+ { /* Third byte table 1. */
+ { N_, 0 }, { N_, 0 }, { N_, 0 }, { N_, 0 },
+ { N_, 0 }, { N_, 0 }, { N_, 0 }, { N_, 0 },
+ { N_, 0 }, { N_, 0 }, { N_, 0 }, { N_, 0 },
+ { N_, 0 }, { N_, 0 }, { N_, 0 }, { N_, 0 },
+ { N_, 0 }, { N_, 0 }, { N_, 0 }, { N_, 0 },
+ { N_, 0 }, { N_, 0 }, { N_, 0 }, { N_, 0 },
+ { N_, 0 }, { N_, 0 }, { N_, 0 }, { N_, 0 },
+ { N_, 0 }, { N_, 0 }, { N_, 0 }, { N_, 0 },
+ { N_, 0 }, { N_, 0 }, { N_, 0 }, { N_, 0 },
+ { N_, 0 }, { N_, 0 }, { N_, 0 }, { N_, 0 },
+ { N_, 0 }, { N_, 0 }, { N_, 0 }, { N_, 0 },
+ { N_, 0 }, { N_, 0 }, { N_, 0 }, { N_, 0 },
+ { N_, 0 }, { N_, 0 }, { N_, 0 }, { N_, 0 },
+ { N_, 0 }, { N_, 0 }, { N_, 0 }, { N_, 0 },
+ { N_, 0 }, { N_, 0 }, { N_, 0 }, { N_, 0 },
+ { N_, 0 }, { N_, 0 }, { N_, 0 }, { N_, 0 },
+ { N_, 0 }, { N_, 0 }, { N_, 0 }, { N_, 0 },
+ { N_, 0 }, { N_, 0 }, { N_, 0 }, { N_, 0 },
+ { N_, 0 }, { N_, 0 }, { N_, 0 }, { N_, 0 },
+ { N_, 0 }, { N_, 0 }, { N_, 0 }, { N_, 0 },
+ { N_, 0 }, { N_, 0 }, { N_, 0 }, { N_, 0 },
+ { N_, 0 }, { N_, 0 }, { N_, 0 }, { N_, 0 },
+ { N_, 0 }, { N_, 0 }, { N_, 0 }, { N_, 0 },
+ { N_, 0 }, { N_, 0 }, { N_, 0 }, { N_, 0 },
+ { N_, 0 }, { N_, 0 }, { N_, 0 }, { N_, 0 },
+ { N_, 0 }, { N_, 0 }, { N_, 0 }, { N_, 0 },
+ { N_, 0 }, { N_, 0 }, { N_, 0 }, { N_, 0 },
+ { N_, 0 }, { N_, 0 }, { N_, 0 }, { N_, 0 },
+ { N_, 0 }, { N_, 0 }, { N_, 0 }, { N_, 0 },
+ { N_, 0 }, { N_, 0 }, { N_, 0 }, { N_, 0 },
+ { N_, 0 }, { N_, 0 }, { N_, 0 }, { N_, 0 },
+ { N_, 0 }, { N_, 0 }, { N_, 0 }, { N_, 0 },
+ { N_, 0 }, { N_, 0 }, { N_, 0 }, { N_, 0 },
+ { N_, 0 }, { N_, 0 }, { N_, 0 }, { N_, 0 },
+ { N_, 0 }, { N_, 0 }, { N_, 0 }, { N_, 0 },
+ { N_, 0 }, { N_, 0 }, { N_, 0 }, { N_, 0 },
+ { N_, 0 }, { N_, 0 }, { N_, 0 }, { N_, 0 },
+ { N_, 0 }, { N_, 0 }, { N_, 0 }, { N_, 0 },
+ { N_, 0 }, { N_, 0 }, { N_, 0 }, { N_, 0 },
+ { N_, 0 }, { N_, 0 }, { N_, 0 }, { N_, 0 },
+ { N_, 0 }, { N_, 0 }, { N_, 0 }, { N_, 0 },
+ { N_, 0 }, { N_, 0 }, { N_, 0 }, { N_, 0 },
+ { N_, 0 }, { N_, 0 }, { N_, 0 }, { N_, 0 },
+ { N_, 0 }, { N_, 0 }, { N_, 0 }, { N_, 0 },
+ { N_, 0 }, { N_, 0 }, { N_, 0 }, { N_, 0 },
+ { N_, 0 }, { N_, 0 }, { N_, 0 }, { N_, 0 },
+ { 14, 795 }, { 15, 891 }, { 16, 987 }, { 17, 1068 },
+ { 18, 1155 }, { 19, 1245 }, { 20, 1299 }, { 21, 1386 },
+ { N_, 0 }, { N_, 0 }, { N_, 0 }, { N_, 0 },
+ { N_, 0 }, { N_, 0 }, { N_, 0 }, { N_, 0 },
+ { N_, 0 }, { N_, 0 }, { N_, 0 }, { N_, 0 },
+ { N_, 0 }, { N_, 0 }, { N_, 0 }, { N_, 0 },
+ { N_, 0 }, { N_, 0 }, { N_, 0 }, { N_, 0 },
+ { N_, 0 }, { N_, 0 }, { N_, 0 }, { N_, 0 },
+ { N_, 0 }, { N_, 0 }, { N_, 0 }, { N_, 0 },
+ { N_, 0 }, { N_, 0 }, { N_, 0 }, { N_, 0 },
+ { N_, 0 }, { N_, 0 }, { N_, 0 }, { N_, 0 },
+ { N_, 0 }, { N_, 0 }, { N_, 0 }, { N_, 0 },
+ { N_, 0 }, { N_, 0 }, { N_, 0 }, { N_, 0 },
+ { N_, 0 }, { N_, 0 }, { N_, 0 }, { N_, 0 },
+ { N_, 0 }, { N_, 0 }, { N_, 0 }, { N_, 0 },
+ { N_, 0 }, { N_, 0 }, { N_, 0 }, { N_, 0 },
+ { N_, 0 }, { N_, 0 }, { N_, 0 }, { N_, 0 },
+ { N_, 0 }, { N_, 0 }, { N_, 0 }, { N_, 0 },
+ },
+ { /* Third byte table 2. */
+ { N_, 0 }, { N_, 0 }, { N_, 0 }, { N_, 0 },
+ { N_, 0 }, { N_, 0 }, { N_, 0 }, { N_, 0 },
+ { N_, 0 }, { N_, 0 }, { N_, 0 }, { N_, 0 },
+ { N_, 0 }, { N_, 0 }, { N_, 0 }, { N_, 0 },
+ { N_, 0 }, { N_, 0 }, { N_, 0 }, { N_, 0 },
+ { N_, 0 }, { N_, 0 }, { N_, 0 }, { N_, 0 },
+ { N_, 0 }, { N_, 0 }, { N_, 0 }, { N_, 0 },
+ { N_, 0 }, { N_, 0 }, { N_, 0 }, { N_, 0 },
+ { N_, 0 }, { N_, 0 }, { N_, 0 }, { N_, 0 },
+ { N_, 0 }, { N_, 0 }, { N_, 0 }, { N_, 0 },
+ { N_, 0 }, { N_, 0 }, { N_, 0 }, { N_, 0 },
+ { N_, 0 }, { N_, 0 }, { N_, 0 }, { N_, 0 },
+ { N_, 0 }, { N_, 0 }, { N_, 0 }, { N_, 0 },
+ { N_, 0 }, { N_, 0 }, { N_, 0 }, { N_, 0 },
+ { N_, 0 }, { N_, 0 }, { N_, 0 }, { N_, 0 },
+ { N_, 0 }, { N_, 0 }, { N_, 0 }, { N_, 0 },
+ { N_, 0 }, { N_, 0 }, { N_, 0 }, { N_, 0 },
+ { N_, 0 }, { N_, 0 }, { N_, 0 }, { N_, 0 },
+ { N_, 0 }, { N_, 0 }, { N_, 0 }, { N_, 0 },
+ { N_, 0 }, { N_, 0 }, { N_, 0 }, { N_, 0 },
+ { N_, 0 }, { N_, 0 }, { N_, 0 }, { N_, 0 },
+ { N_, 0 }, { N_, 0 }, { N_, 0 }, { N_, 0 },
+ { N_, 0 }, { N_, 0 }, { N_, 0 }, { N_, 0 },
+ { N_, 0 }, { N_, 0 }, { N_, 0 }, { N_, 0 },
+ { N_, 0 }, { N_, 0 }, { N_, 0 }, { N_, 0 },
+ { N_, 0 }, { N_, 0 }, { N_, 0 }, { N_, 0 },
+ { N_, 0 }, { N_, 0 }, { N_, 0 }, { N_, 0 },
+ { N_, 0 }, { N_, 0 }, { N_, 0 }, { N_, 0 },
+ { N_, 0 }, { N_, 0 }, { N_, 0 }, { N_, 0 },
+ { N_, 0 }, { N_, 0 }, { N_, 0 }, { N_, 0 },
+ { N_, 0 }, { N_, 0 }, { N_, 0 }, { N_, 0 },
+ { N_, 0 }, { N_, 0 }, { N_, 0 }, { N_, 0 },
+ { N_, 0 }, { N_, 0 }, { N_, 0 }, { N_, 0 },
+ { 22, 1443 }, { 23, 1448 }, { N_, 0 }, { N_, 0 },
+ { N_, 0 }, { N_, 0 }, { N_, 0 }, { N_, 0 },
+ { N_, 0 }, { N_, 0 }, { N_, 0 }, { N_, 0 },
+ { N_, 0 }, { N_, 0 }, { 24, 1496 }, { 25, 1526 },
+ { N_, 0 }, { N_, 0 }, { N_, 0 }, { N_, 0 },
+ { N_, 0 }, { N_, 0 }, { N_, 0 }, { N_, 0 },
+ { N_, 0 }, { N_, 0 }, { N_, 0 }, { N_, 0 },
+ { N_, 0 }, { N_, 0 }, { N_, 0 }, { N_, 0 },
+ { N_, 0 }, { N_, 0 }, { N_, 0 }, { N_, 0 },
+ { N_, 0 }, { N_, 0 }, { N_, 0 }, { N_, 0 },
+ { N_, 0 }, { N_, 0 }, { N_, 0 }, { N_, 0 },
+ { N_, 0 }, { N_, 0 }, { N_, 0 }, { N_, 0 },
+ { N_, 0 }, { N_, 0 }, { N_, 0 }, { N_, 0 },
+ { N_, 0 }, { N_, 0 }, { N_, 0 }, { N_, 0 },
+ { N_, 0 }, { N_, 0 }, { N_, 0 }, { N_, 0 },
+ { N_, 0 }, { N_, 0 }, { N_, 0 }, { N_, 0 },
+ { N_, 0 }, { N_, 0 }, { N_, 0 }, { N_, 0 },
+ { N_, 0 }, { N_, 0 }, { N_, 0 }, { N_, 0 },
+ { N_, 0 }, { N_, 0 }, { N_, 0 }, { N_, 0 },
+ { N_, 0 }, { N_, 0 }, { N_, 0 }, { N_, 0 },
+ { N_, 0 }, { N_, 0 }, { N_, 0 }, { N_, 0 },
+ { N_, 0 }, { N_, 0 }, { N_, 0 }, { N_, 0 },
+ { N_, 0 }, { N_, 0 }, { N_, 0 }, { N_, 0 },
+ { N_, 0 }, { N_, 0 }, { N_, 0 }, { N_, 0 },
+ { N_, 0 }, { N_, 0 }, { N_, 0 }, { N_, 0 },
+ { N_, 0 }, { N_, 0 }, { N_, 0 }, { N_, 0 },
+ { N_, 0 }, { N_, 0 }, { N_, 0 }, { N_, 0 },
+ { N_, 0 }, { N_, 0 }, { N_, 0 }, { N_, 0 },
+ { N_, 0 }, { N_, 0 }, { N_, 0 }, { N_, 0 },
+ { N_, 0 }, { N_, 0 }, { N_, 0 }, { N_, 0 },
+ { N_, 0 }, { N_, 0 }, { N_, 0 }, { N_, 0 },
+ },
+ { /* Third byte table 3. */
+ { N_, 0 }, { N_, 0 }, { N_, 0 }, { N_, 0 },
+ { N_, 0 }, { N_, 0 }, { N_, 0 }, { N_, 0 },
+ { N_, 0 }, { N_, 0 }, { N_, 0 }, { N_, 0 },
+ { N_, 0 }, { N_, 0 }, { N_, 0 }, { N_, 0 },
+ { N_, 0 }, { N_, 0 }, { N_, 0 }, { N_, 0 },
+ { N_, 0 }, { N_, 0 }, { N_, 0 }, { N_, 0 },
+ { N_, 0 }, { N_, 0 }, { N_, 0 }, { N_, 0 },
+ { N_, 0 }, { N_, 0 }, { N_, 0 }, { N_, 0 },
+ { N_, 0 }, { N_, 0 }, { N_, 0 }, { N_, 0 },
+ { N_, 0 }, { N_, 0 }, { N_, 0 }, { N_, 0 },
+ { N_, 0 }, { N_, 0 }, { N_, 0 }, { N_, 0 },
+ { N_, 0 }, { N_, 0 }, { N_, 0 }, { N_, 0 },
+ { N_, 0 }, { N_, 0 }, { N_, 0 }, { N_, 0 },
+ { N_, 0 }, { N_, 0 }, { N_, 0 }, { N_, 0 },
+ { N_, 0 }, { N_, 0 }, { N_, 0 }, { N_, 0 },
+ { N_, 0 }, { N_, 0 }, { N_, 0 }, { N_, 0 },
+ { N_, 0 }, { N_, 0 }, { N_, 0 }, { N_, 0 },
+ { N_, 0 }, { N_, 0 }, { N_, 0 }, { N_, 0 },
+ { N_, 0 }, { N_, 0 }, { N_, 0 }, { N_, 0 },
+ { N_, 0 }, { N_, 0 }, { N_, 0 }, { N_, 0 },
+ { N_, 0 }, { N_, 0 }, { N_, 0 }, { N_, 0 },
+ { N_, 0 }, { N_, 0 }, { N_, 0 }, { N_, 0 },
+ { N_, 0 }, { N_, 0 }, { N_, 0 }, { N_, 0 },
+ { N_, 0 }, { N_, 0 }, { N_, 0 }, { N_, 0 },
+ { N_, 0 }, { N_, 0 }, { N_, 0 }, { N_, 0 },
+ { N_, 0 }, { N_, 0 }, { N_, 0 }, { N_, 0 },
+ { N_, 0 }, { N_, 0 }, { N_, 0 }, { N_, 0 },
+ { N_, 0 }, { N_, 0 }, { N_, 0 }, { N_, 0 },
+ { N_, 0 }, { N_, 0 }, { N_, 0 }, { N_, 0 },
+ { N_, 0 }, { N_, 0 }, { N_, 0 }, { N_, 0 },
+ { N_, 0 }, { N_, 0 }, { N_, 0 }, { N_, 0 },
+ { N_, 0 }, { N_, 0 }, { N_, 0 }, { N_, 0 },
+ { N_, 0 }, { N_, 0 }, { N_, 0 }, { N_, 0 },
+ { N_, 0 }, { N_, 0 }, { N_, 0 }, { N_, 0 },
+ { N_, 0 }, { N_, 0 }, { N_, 0 }, { N_, 0 },
+ { N_, 0 }, { N_, 0 }, { N_, 0 }, { N_, 0 },
+ { N_, 0 }, { N_, 0 }, { N_, 0 }, { N_, 0 },
+ { N_, 0 }, { N_, 0 }, { N_, 0 }, { N_, 0 },
+ { N_, 0 }, { N_, 0 }, { N_, 0 }, { N_, 0 },
+ { N_, 0 }, { N_, 0 }, { N_, 0 }, { N_, 0 },
+ { N_, 0 }, { N_, 0 }, { N_, 0 }, { N_, 0 },
+ { N_, 0 }, { N_, 0 }, { N_, 0 }, { N_, 0 },
+ { N_, 0 }, { N_, 0 }, { N_, 0 }, { N_, 0 },
+ { N_, 0 }, { N_, 0 }, { N_, 0 }, { N_, 0 },
+ { N_, 0 }, { N_, 0 }, { N_, 0 }, { N_, 0 },
+ { N_, 0 }, { N_, 0 }, { N_, 0 }, { N_, 0 },
+ { N_, 0 }, { N_, 0 }, { N_, 0 }, { N_, 0 },
+ { 26, 1574 }, { N_, 0 }, { N_, 0 }, { N_, 0 },
+ { N_, 0 }, { N_, 0 }, { N_, 0 }, { N_, 0 },
+ { N_, 0 }, { N_, 0 }, { N_, 0 }, { N_, 0 },
+ { N_, 0 }, { N_, 0 }, { N_, 0 }, { N_, 0 },
+ { N_, 0 }, { N_, 0 }, { N_, 0 }, { N_, 0 },
+ { N_, 0 }, { N_, 0 }, { N_, 0 }, { N_, 0 },
+ { N_, 0 }, { N_, 0 }, { N_, 0 }, { N_, 0 },
+ { N_, 0 }, { N_, 0 }, { N_, 0 }, { N_, 0 },
+ { N_, 0 }, { N_, 0 }, { N_, 0 }, { N_, 0 },
+ { N_, 0 }, { N_, 0 }, { N_, 0 }, { N_, 0 },
+ { N_, 0 }, { N_, 0 }, { N_, 0 }, { N_, 0 },
+ { N_, 0 }, { N_, 0 }, { N_, 0 }, { N_, 0 },
+ { N_, 0 }, { N_, 0 }, { N_, 0 }, { N_, 0 },
+ { N_, 0 }, { N_, 0 }, { N_, 0 }, { N_, 0 },
+ { N_, 0 }, { N_, 0 }, { N_, 0 }, { N_, 0 },
+ { N_, 0 }, { N_, 0 }, { N_, 0 }, { N_, 0 },
+ { N_, 0 }, { N_, 0 }, { N_, 0 }, { N_, 0 },
+ },
+ { /* Third byte table 4. */
+ { N_, 0 }, { N_, 0 }, { N_, 0 }, { N_, 0 },
+ { N_, 0 }, { N_, 0 }, { N_, 0 }, { N_, 0 },
+ { N_, 0 }, { N_, 0 }, { N_, 0 }, { N_, 0 },
+ { N_, 0 }, { N_, 0 }, { N_, 0 }, { N_, 0 },
+ { N_, 0 }, { N_, 0 }, { N_, 0 }, { N_, 0 },
+ { N_, 0 }, { N_, 0 }, { N_, 0 }, { N_, 0 },
+ { N_, 0 }, { N_, 0 }, { N_, 0 }, { N_, 0 },
+ { N_, 0 }, { N_, 0 }, { N_, 0 }, { N_, 0 },
+ { N_, 0 }, { N_, 0 }, { N_, 0 }, { N_, 0 },
+ { N_, 0 }, { N_, 0 }, { N_, 0 }, { N_, 0 },
+ { N_, 0 }, { N_, 0 }, { N_, 0 }, { N_, 0 },
+ { N_, 0 }, { N_, 0 }, { N_, 0 }, { N_, 0 },
+ { N_, 0 }, { N_, 0 }, { N_, 0 }, { N_, 0 },
+ { N_, 0 }, { N_, 0 }, { N_, 0 }, { N_, 0 },
+ { N_, 0 }, { N_, 0 }, { N_, 0 }, { N_, 0 },
+ { N_, 0 }, { N_, 0 }, { N_, 0 }, { N_, 0 },
+ { N_, 0 }, { N_, 0 }, { N_, 0 }, { N_, 0 },
+ { N_, 0 }, { N_, 0 }, { N_, 0 }, { N_, 0 },
+ { N_, 0 }, { N_, 0 }, { N_, 0 }, { N_, 0 },
+ { N_, 0 }, { N_, 0 }, { N_, 0 }, { N_, 0 },
+ { N_, 0 }, { N_, 0 }, { N_, 0 }, { N_, 0 },
+ { N_, 0 }, { N_, 0 }, { N_, 0 }, { N_, 0 },
+ { N_, 0 }, { N_, 0 }, { N_, 0 }, { N_, 0 },
+ { N_, 0 }, { N_, 0 }, { N_, 0 }, { N_, 0 },
+ { N_, 0 }, { N_, 0 }, { N_, 0 }, { N_, 0 },
+ { N_, 0 }, { N_, 0 }, { N_, 0 }, { N_, 0 },
+ { N_, 0 }, { N_, 0 }, { N_, 0 }, { N_, 0 },
+ { N_, 0 }, { N_, 0 }, { N_, 0 }, { N_, 0 },
+ { N_, 0 }, { N_, 0 }, { N_, 0 }, { N_, 0 },
+ { N_, 0 }, { N_, 0 }, { N_, 0 }, { N_, 0 },
+ { N_, 0 }, { N_, 0 }, { N_, 0 }, { N_, 0 },
+ { N_, 0 }, { N_, 0 }, { N_, 0 }, { N_, 0 },
+ { N_, 0 }, { N_, 0 }, { N_, 0 }, { N_, 0 },
+ { N_, 0 }, { N_, 0 }, { N_, 0 }, { N_, 0 },
+ { N_, 0 }, { N_, 0 }, { N_, 0 }, { N_, 0 },
+ { N_, 0 }, { N_, 0 }, { N_, 0 }, { N_, 0 },
+ { 27, 1652 }, { N_, 0 }, { N_, 0 }, { N_, 0 },
+ { N_, 0 }, { N_, 0 }, { N_, 0 }, { N_, 0 },
+ { N_, 0 }, { N_, 0 }, { N_, 0 }, { N_, 0 },
+ { N_, 0 }, { N_, 0 }, { N_, 0 }, { N_, 0 },
+ { N_, 0 }, { N_, 0 }, { N_, 0 }, { N_, 0 },
+ { N_, 0 }, { N_, 0 }, { N_, 0 }, { N_, 0 },
+ { N_, 0 }, { N_, 0 }, { N_, 0 }, { N_, 0 },
+ { N_, 0 }, { N_, 0 }, { N_, 0 }, { N_, 0 },
+ { N_, 0 }, { N_, 0 }, { N_, 0 }, { N_, 0 },
+ { N_, 0 }, { N_, 0 }, { N_, 0 }, { N_, 0 },
+ { N_, 0 }, { N_, 0 }, { N_, 0 }, { N_, 0 },
+ { N_, 0 }, { N_, 0 }, { N_, 0 }, { N_, 0 },
+ { N_, 0 }, { N_, 0 }, { N_, 0 }, { N_, 0 },
+ { N_, 0 }, { N_, 0 }, { N_, 0 }, { N_, 0 },
+ { N_, 0 }, { N_, 0 }, { N_, 0 }, { N_, 0 },
+ { N_, 0 }, { N_, 0 }, { N_, 0 }, { N_, 0 },
+ { N_, 0 }, { N_, 0 }, { N_, 0 }, { N_, 0 },
+ { N_, 0 }, { N_, 0 }, { N_, 0 }, { N_, 0 },
+ { N_, 0 }, { N_, 0 }, { N_, 0 }, { N_, 0 },
+ { N_, 0 }, { N_, 0 }, { N_, 0 }, { N_, 0 },
+ { N_, 0 }, { N_, 0 }, { N_, 0 }, { N_, 0 },
+ { N_, 0 }, { N_, 0 }, { N_, 0 }, { N_, 0 },
+ { N_, 0 }, { N_, 0 }, { N_, 0 }, { N_, 0 },
+ { N_, 0 }, { N_, 0 }, { N_, 0 }, { N_, 0 },
+ { N_, 0 }, { N_, 0 }, { N_, 0 }, { N_, 0 },
+ { N_, 0 }, { N_, 0 }, { N_, 0 }, { N_, 0 },
+ { N_, 0 }, { N_, 0 }, { N_, 0 }, { N_, 0 },
+ { N_, 0 }, { N_, 0 }, { N_, 0 }, { N_, 0 },
+ },
+ },
+ {
+ { /* Third byte table 0. */
+ { N_, 0 }, { N_, 0 }, { N_, 0 }, { N_, 0 },
+ { N_, 0 }, { N_, 0 }, { N_, 0 }, { N_, 0 },
+ { N_, 0 }, { N_, 0 }, { N_, 0 }, { N_, 0 },
+ { N_, 0 }, { N_, 0 }, { N_, 0 }, { N_, 0 },
+ { N_, 0 }, { N_, 0 }, { N_, 0 }, { N_, 0 },
+ { N_, 0 }, { N_, 0 }, { N_, 0 }, { N_, 0 },
+ { N_, 0 }, { N_, 0 }, { N_, 0 }, { N_, 0 },
+ { N_, 0 }, { N_, 0 }, { N_, 0 }, { N_, 0 },
+ { N_, 0 }, { N_, 0 }, { N_, 0 }, { N_, 0 },
+ { N_, 0 }, { N_, 0 }, { N_, 0 }, { N_, 0 },
+ { N_, 0 }, { N_, 0 }, { N_, 0 }, { N_, 0 },
+ { N_, 0 }, { N_, 0 }, { N_, 0 }, { N_, 0 },
+ { N_, 0 }, { N_, 0 }, { N_, 0 }, { N_, 0 },
+ { N_, 0 }, { N_, 0 }, { N_, 0 }, { N_, 0 },
+ { N_, 0 }, { N_, 0 }, { N_, 0 }, { N_, 0 },
+ { N_, 0 }, { N_, 0 }, { N_, 0 }, { N_, 0 },
+ { N_, 0 }, { N_, 0 }, { N_, 0 }, { N_, 0 },
+ { N_, 0 }, { N_, 0 }, { N_, 0 }, { N_, 0 },
+ { N_, 0 }, { N_, 0 }, { N_, 0 }, { N_, 0 },
+ { N_, 0 }, { N_, 0 }, { N_, 0 }, { N_, 0 },
+ { N_, 0 }, { N_, 0 }, { N_, 0 }, { N_, 0 },
+ { N_, 0 }, { N_, 0 }, { N_, 0 }, { N_, 0 },
+ { N_, 0 }, { N_, 0 }, { N_, 0 }, { N_, 0 },
+ { N_, 0 }, { N_, 0 }, { N_, 0 }, { N_, 0 },
+ { N_, 0 }, { N_, 0 }, { N_, 0 }, { N_, 0 },
+ { N_, 0 }, { N_, 0 }, { N_, 0 }, { N_, 0 },
+ { N_, 0 }, { N_, 0 }, { N_, 0 }, { N_, 0 },
+ { N_, 0 }, { N_, 0 }, { N_, 0 }, { N_, 0 },
+ { N_, 0 }, { N_, 0 }, { N_, 0 }, { N_, 0 },
+ { N_, 0 }, { N_, 0 }, { N_, 0 }, { N_, 0 },
+ { N_, 0 }, { N_, 0 }, { N_, 0 }, { N_, 0 },
+ { N_, 0 }, { N_, 0 }, { N_, 0 }, { N_, 0 },
+ { N_, 0 }, { N_, 0 }, { N_, 0 }, { N_, 0 },
+ { N_, 0 }, { N_, 0 }, { N_, 0 }, { N_, 0 },
+ { N_, 0 }, { N_, 0 }, { N_, 0 }, { N_, 0 },
+ { N_, 0 }, { N_, 0 }, { N_, 0 }, { N_, 0 },
+ { N_, 0 }, { N_, 0 }, { N_, 0 }, { N_, 0 },
+ { N_, 0 }, { N_, 0 }, { N_, 0 }, { N_, 0 },
+ { N_, 0 }, { N_, 0 }, { N_, 0 }, { N_, 0 },
+ { N_, 0 }, { N_, 0 }, { N_, 0 }, { N_, 0 },
+ { N_, 0 }, { N_, 0 }, { N_, 0 }, { N_, 0 },
+ { N_, 0 }, { N_, 0 }, { N_, 0 }, { N_, 0 },
+ { N_, 0 }, { N_, 0 }, { N_, 0 }, { N_, 0 },
+ { N_, 0 }, { N_, 0 }, { N_, 0 }, { N_, 0 },
+ { N_, 0 }, { N_, 0 }, { N_, 0 }, { N_, 0 },
+ { N_, 0 }, { N_, 0 }, { N_, 0 }, { N_, 0 },
+ { N_, 0 }, { N_, 0 }, { N_, 0 }, { N_, 0 },
+ { N_, 0 }, { N_, 0 }, { N_, 0 }, { N_, 0 },
+ { N_, 0 }, { N_, 0 }, { N_, 0 }, { 0, 0 },
+ { 1, 60 }, { 2, 123 }, { 3, 185 }, { 4, 257 },
+ { 5, 321 }, { 6, 383 }, { N_, 0 }, { N_, 0 },
+ { N_, 0 }, { N_, 0 }, { 7, 401 }, { 8, 467 },
+ { 9, 505 }, { 10, 601 }, { 11, 633 }, { 12, 689 },
+ { 13, 753 }, { 14, 803 }, { N_, 0 }, { N_, 0 },
+ { N_, 0 }, { N_, 0 }, { N_, 0 }, { N_, 0 },
+ { N_, 0 }, { N_, 0 }, { N_, 0 }, { N_, 0 },
+ { N_, 0 }, { N_, 0 }, { N_, 0 }, { N_, 0 },
+ { N_, 0 }, { N_, 0 }, { N_, 0 }, { N_, 0 },
+ { N_, 0 }, { N_, 0 }, { N_, 0 }, { N_, 0 },
+ { N_, 0 }, { N_, 0 }, { N_, 0 }, { N_, 0 },
+ { N_, 0 }, { N_, 0 }, { N_, 0 }, { N_, 0 },
+ { N_, 0 }, { N_, 0 }, { N_, 0 }, { N_, 0 },
+ { N_, 0 }, { N_, 0 }, { N_, 0 }, { N_, 0 },
+ { N_, 0 }, { N_, 0 }, { N_, 0 }, { N_, 0 },
+ },
+ { /* Third byte table 1. */
+ { N_, 0 }, { N_, 0 }, { N_, 0 }, { N_, 0 },
+ { N_, 0 }, { N_, 0 }, { N_, 0 }, { N_, 0 },
+ { N_, 0 }, { N_, 0 }, { N_, 0 }, { N_, 0 },
+ { N_, 0 }, { N_, 0 }, { N_, 0 }, { N_, 0 },
+ { N_, 0 }, { N_, 0 }, { N_, 0 }, { N_, 0 },
+ { N_, 0 }, { N_, 0 }, { N_, 0 }, { N_, 0 },
+ { N_, 0 }, { N_, 0 }, { N_, 0 }, { N_, 0 },
+ { N_, 0 }, { N_, 0 }, { N_, 0 }, { N_, 0 },
+ { N_, 0 }, { N_, 0 }, { N_, 0 }, { N_, 0 },
+ { N_, 0 }, { N_, 0 }, { N_, 0 }, { N_, 0 },
+ { N_, 0 }, { N_, 0 }, { N_, 0 }, { N_, 0 },
+ { N_, 0 }, { N_, 0 }, { N_, 0 }, { N_, 0 },
+ { N_, 0 }, { N_, 0 }, { N_, 0 }, { N_, 0 },
+ { N_, 0 }, { N_, 0 }, { N_, 0 }, { N_, 0 },
+ { N_, 0 }, { N_, 0 }, { N_, 0 }, { N_, 0 },
+ { N_, 0 }, { N_, 0 }, { N_, 0 }, { N_, 0 },
+ { N_, 0 }, { N_, 0 }, { N_, 0 }, { N_, 0 },
+ { N_, 0 }, { N_, 0 }, { N_, 0 }, { N_, 0 },
+ { N_, 0 }, { N_, 0 }, { N_, 0 }, { N_, 0 },
+ { N_, 0 }, { N_, 0 }, { N_, 0 }, { N_, 0 },
+ { N_, 0 }, { N_, 0 }, { N_, 0 }, { N_, 0 },
+ { N_, 0 }, { N_, 0 }, { N_, 0 }, { N_, 0 },
+ { N_, 0 }, { N_, 0 }, { N_, 0 }, { N_, 0 },
+ { N_, 0 }, { N_, 0 }, { N_, 0 }, { N_, 0 },
+ { N_, 0 }, { N_, 0 }, { N_, 0 }, { N_, 0 },
+ { N_, 0 }, { N_, 0 }, { N_, 0 }, { N_, 0 },
+ { N_, 0 }, { N_, 0 }, { N_, 0 }, { N_, 0 },
+ { N_, 0 }, { N_, 0 }, { N_, 0 }, { N_, 0 },
+ { N_, 0 }, { N_, 0 }, { N_, 0 }, { N_, 0 },
+ { N_, 0 }, { N_, 0 }, { N_, 0 }, { N_, 0 },
+ { N_, 0 }, { N_, 0 }, { N_, 0 }, { N_, 0 },
+ { N_, 0 }, { N_, 0 }, { N_, 0 }, { N_, 0 },
+ { N_, 0 }, { N_, 0 }, { 15, 849 }, { 16, 945 },
+ { N_, 0 }, { N_, 0 }, { N_, 0 }, { N_, 0 },
+ { N_, 0 }, { N_, 0 }, { N_, 0 }, { N_, 0 },
+ { N_, 0 }, { N_, 0 }, { N_, 0 }, { N_, 0 },
+ { N_, 0 }, { N_, 0 }, { N_, 0 }, { N_, 0 },
+ { N_, 0 }, { N_, 0 }, { N_, 0 }, { N_, 0 },
+ { N_, 0 }, { N_, 0 }, { N_, 0 }, { N_, 0 },
+ { N_, 0 }, { N_, 0 }, { N_, 0 }, { N_, 0 },
+ { N_, 0 }, { N_, 0 }, { N_, 0 }, { N_, 0 },
+ { N_, 0 }, { N_, 0 }, { N_, 0 }, { N_, 0 },
+ { N_, 0 }, { N_, 0 }, { N_, 0 }, { N_, 0 },
+ { N_, 0 }, { N_, 0 }, { N_, 0 }, { N_, 0 },
+ { N_, 0 }, { N_, 0 }, { N_, 0 }, { N_, 0 },
+ { N_, 0 }, { N_, 0 }, { N_, 0 }, { N_, 0 },
+ { 17, 963 }, { 18, 1059 }, { 19, 1155 }, { 20, 1236 },
+ { 21, 1323 }, { 22, 1413 }, { 23, 1467 }, { 24, 1554 },
+ { N_, 0 }, { N_, 0 }, { N_, 0 }, { N_, 0 },
+ { N_, 0 }, { N_, 0 }, { N_, 0 }, { N_, 0 },
+ { N_, 0 }, { N_, 0 }, { N_, 0 }, { N_, 0 },
+ { N_, 0 }, { N_, 0 }, { N_, 0 }, { N_, 0 },
+ { N_, 0 }, { N_, 0 }, { N_, 0 }, { N_, 0 },
+ { N_, 0 }, { N_, 0 }, { N_, 0 }, { N_, 0 },
+ { N_, 0 }, { N_, 0 }, { N_, 0 }, { N_, 0 },
+ { N_, 0 }, { N_, 0 }, { N_, 0 }, { N_, 0 },
+ { N_, 0 }, { N_, 0 }, { N_, 0 }, { N_, 0 },
+ { N_, 0 }, { N_, 0 }, { N_, 0 }, { N_, 0 },
+ { N_, 0 }, { N_, 0 }, { N_, 0 }, { N_, 0 },
+ { N_, 0 }, { N_, 0 }, { N_, 0 }, { N_, 0 },
+ { N_, 0 }, { N_, 0 }, { N_, 0 }, { N_, 0 },
+ { N_, 0 }, { N_, 0 }, { N_, 0 }, { N_, 0 },
+ { N_, 0 }, { N_, 0 }, { N_, 0 }, { N_, 0 },
+ { N_, 0 }, { N_, 0 }, { N_, 0 }, { N_, 0 },
+ },
+ { /* Third byte table 2. */
+ { N_, 0 }, { N_, 0 }, { N_, 0 }, { N_, 0 },
+ { N_, 0 }, { N_, 0 }, { N_, 0 }, { N_, 0 },
+ { N_, 0 }, { N_, 0 }, { N_, 0 }, { N_, 0 },
+ { N_, 0 }, { N_, 0 }, { N_, 0 }, { N_, 0 },
+ { N_, 0 }, { N_, 0 }, { N_, 0 }, { N_, 0 },
+ { N_, 0 }, { N_, 0 }, { N_, 0 }, { N_, 0 },
+ { N_, 0 }, { N_, 0 }, { N_, 0 }, { N_, 0 },
+ { N_, 0 }, { N_, 0 }, { N_, 0 }, { N_, 0 },
+ { N_, 0 }, { N_, 0 }, { N_, 0 }, { N_, 0 },
+ { N_, 0 }, { N_, 0 }, { N_, 0 }, { N_, 0 },
+ { N_, 0 }, { N_, 0 }, { N_, 0 }, { N_, 0 },
+ { N_, 0 }, { N_, 0 }, { N_, 0 }, { N_, 0 },
+ { N_, 0 }, { N_, 0 }, { N_, 0 }, { N_, 0 },
+ { N_, 0 }, { N_, 0 }, { N_, 0 }, { N_, 0 },
+ { N_, 0 }, { N_, 0 }, { N_, 0 }, { N_, 0 },
+ { N_, 0 }, { N_, 0 }, { N_, 0 }, { N_, 0 },
+ { N_, 0 }, { N_, 0 }, { N_, 0 }, { N_, 0 },
+ { N_, 0 }, { N_, 0 }, { N_, 0 }, { N_, 0 },
+ { N_, 0 }, { N_, 0 }, { N_, 0 }, { N_, 0 },
+ { N_, 0 }, { N_, 0 }, { N_, 0 }, { N_, 0 },
+ { N_, 0 }, { N_, 0 }, { N_, 0 }, { N_, 0 },
+ { N_, 0 }, { N_, 0 }, { N_, 0 }, { N_, 0 },
+ { N_, 0 }, { N_, 0 }, { N_, 0 }, { N_, 0 },
+ { N_, 0 }, { N_, 0 }, { N_, 0 }, { N_, 0 },
+ { N_, 0 }, { N_, 0 }, { N_, 0 }, { N_, 0 },
+ { N_, 0 }, { N_, 0 }, { N_, 0 }, { N_, 0 },
+ { N_, 0 }, { N_, 0 }, { N_, 0 }, { N_, 0 },
+ { N_, 0 }, { N_, 0 }, { N_, 0 }, { N_, 0 },
+ { N_, 0 }, { N_, 0 }, { N_, 0 }, { N_, 0 },
+ { N_, 0 }, { N_, 0 }, { N_, 0 }, { N_, 0 },
+ { N_, 0 }, { N_, 0 }, { N_, 0 }, { N_, 0 },
+ { N_, 0 }, { N_, 0 }, { N_, 0 }, { N_, 0 },
+ { N_, 0 }, { N_, 0 }, { N_, 0 }, { N_, 0 },
+ { 25, 1611 }, { 26, 1619 }, { 27, 1667 }, { N_, 0 },
+ { N_, 0 }, { N_, 0 }, { N_, 0 }, { N_, 0 },
+ { N_, 0 }, { N_, 0 }, { N_, 0 }, { N_, 0 },
+ { N_, 0 }, { N_, 0 }, { 28, 1670 }, { 29, 1700 },
+ { N_, 0 }, { N_, 0 }, { N_, 0 }, { N_, 0 },
+ { N_, 0 }, { N_, 0 }, { N_, 0 }, { N_, 0 },
+ { N_, 0 }, { N_, 0 }, { N_, 0 }, { N_, 0 },
+ { N_, 0 }, { N_, 0 }, { N_, 0 }, { N_, 0 },
+ { N_, 0 }, { N_, 0 }, { N_, 0 }, { N_, 0 },
+ { N_, 0 }, { N_, 0 }, { N_, 0 }, { N_, 0 },
+ { N_, 0 }, { N_, 0 }, { N_, 0 }, { N_, 0 },
+ { 30, 1748 }, { 31, 1889 }, { 32, 1911 }, { 33, 2007 },
+ { N_, 0 }, { N_, 0 }, { N_, 0 }, { N_, 0 },
+ { N_, 0 }, { N_, 0 }, { N_, 0 }, { N_, 0 },
+ { N_, 0 }, { N_, 0 }, { N_, 0 }, { N_, 0 },
+ { N_, 0 }, { N_, 0 }, { N_, 0 }, { N_, 0 },
+ { N_, 0 }, { N_, 0 }, { N_, 0 }, { N_, 0 },
+ { N_, 0 }, { N_, 0 }, { N_, 0 }, { N_, 0 },
+ { N_, 0 }, { N_, 0 }, { N_, 0 }, { N_, 0 },
+ { N_, 0 }, { N_, 0 }, { N_, 0 }, { N_, 0 },
+ { N_, 0 }, { N_, 0 }, { N_, 0 }, { N_, 0 },
+ { N_, 0 }, { N_, 0 }, { N_, 0 }, { N_, 0 },
+ { N_, 0 }, { N_, 0 }, { N_, 0 }, { N_, 0 },
+ { N_, 0 }, { N_, 0 }, { N_, 0 }, { N_, 0 },
+ { N_, 0 }, { N_, 0 }, { N_, 0 }, { N_, 0 },
+ { N_, 0 }, { N_, 0 }, { N_, 0 }, { N_, 0 },
+ { N_, 0 }, { N_, 0 }, { N_, 0 }, { N_, 0 },
+ { N_, 0 }, { N_, 0 }, { N_, 0 }, { N_, 0 },
+ { N_, 0 }, { N_, 0 }, { N_, 0 }, { N_, 0 },
+ { N_, 0 }, { N_, 0 }, { N_, 0 }, { N_, 0 },
+ { N_, 0 }, { N_, 0 }, { N_, 0 }, { N_, 0 },
+ },
+ { /* Third byte table 3. */
+ { N_, 0 }, { N_, 0 }, { N_, 0 }, { N_, 0 },
+ { N_, 0 }, { N_, 0 }, { N_, 0 }, { N_, 0 },
+ { N_, 0 }, { N_, 0 }, { N_, 0 }, { N_, 0 },
+ { N_, 0 }, { N_, 0 }, { N_, 0 }, { N_, 0 },
+ { N_, 0 }, { N_, 0 }, { N_, 0 }, { N_, 0 },
+ { N_, 0 }, { N_, 0 }, { N_, 0 }, { N_, 0 },
+ { N_, 0 }, { N_, 0 }, { N_, 0 }, { N_, 0 },
+ { N_, 0 }, { N_, 0 }, { N_, 0 }, { N_, 0 },
+ { N_, 0 }, { N_, 0 }, { N_, 0 }, { N_, 0 },
+ { N_, 0 }, { N_, 0 }, { N_, 0 }, { N_, 0 },
+ { N_, 0 }, { N_, 0 }, { N_, 0 }, { N_, 0 },
+ { N_, 0 }, { N_, 0 }, { N_, 0 }, { N_, 0 },
+ { N_, 0 }, { N_, 0 }, { N_, 0 }, { N_, 0 },
+ { N_, 0 }, { N_, 0 }, { N_, 0 }, { N_, 0 },
+ { N_, 0 }, { N_, 0 }, { N_, 0 }, { N_, 0 },
+ { N_, 0 }, { N_, 0 }, { N_, 0 }, { N_, 0 },
+ { N_, 0 }, { N_, 0 }, { N_, 0 }, { N_, 0 },
+ { N_, 0 }, { N_, 0 }, { N_, 0 }, { N_, 0 },
+ { N_, 0 }, { N_, 0 }, { N_, 0 }, { N_, 0 },
+ { N_, 0 }, { N_, 0 }, { N_, 0 }, { N_, 0 },
+ { N_, 0 }, { N_, 0 }, { N_, 0 }, { N_, 0 },
+ { N_, 0 }, { N_, 0 }, { N_, 0 }, { N_, 0 },
+ { N_, 0 }, { N_, 0 }, { N_, 0 }, { N_, 0 },
+ { N_, 0 }, { N_, 0 }, { N_, 0 }, { N_, 0 },
+ { N_, 0 }, { N_, 0 }, { N_, 0 }, { N_, 0 },
+ { N_, 0 }, { N_, 0 }, { N_, 0 }, { N_, 0 },
+ { N_, 0 }, { N_, 0 }, { N_, 0 }, { N_, 0 },
+ { N_, 0 }, { N_, 0 }, { N_, 0 }, { N_, 0 },
+ { N_, 0 }, { N_, 0 }, { N_, 0 }, { N_, 0 },
+ { N_, 0 }, { N_, 0 }, { N_, 0 }, { N_, 0 },
+ { N_, 0 }, { N_, 0 }, { N_, 0 }, { N_, 0 },
+ { N_, 0 }, { N_, 0 }, { N_, 0 }, { N_, 0 },
+ { N_, 0 }, { N_, 0 }, { N_, 0 }, { N_, 0 },
+ { N_, 0 }, { N_, 0 }, { N_, 0 }, { N_, 0 },
+ { N_, 0 }, { N_, 0 }, { N_, 0 }, { N_, 0 },
+ { N_, 0 }, { N_, 0 }, { N_, 0 }, { N_, 0 },
+ { N_, 0 }, { N_, 0 }, { N_, 0 }, { N_, 0 },
+ { N_, 0 }, { N_, 0 }, { N_, 0 }, { N_, 0 },
+ { N_, 0 }, { N_, 0 }, { N_, 0 }, { N_, 0 },
+ { N_, 0 }, { N_, 0 }, { N_, 0 }, { N_, 0 },
+ { N_, 0 }, { N_, 0 }, { N_, 0 }, { N_, 0 },
+ { N_, 0 }, { N_, 0 }, { N_, 0 }, { N_, 0 },
+ { N_, 0 }, { N_, 0 }, { N_, 0 }, { N_, 0 },
+ { N_, 0 }, { N_, 0 }, { N_, 0 }, { N_, 0 },
+ { N_, 0 }, { N_, 0 }, { N_, 0 }, { N_, 0 },
+ { N_, 0 }, { N_, 0 }, { N_, 0 }, { N_, 0 },
+ { N_, 0 }, { N_, 0 }, { N_, 0 }, { N_, 0 },
+ { 34, 2061 }, { N_, 0 }, { N_, 0 }, { N_, 0 },
+ { N_, 0 }, { N_, 0 }, { N_, 0 }, { N_, 0 },
+ { N_, 0 }, { N_, 0 }, { N_, 0 }, { N_, 0 },
+ { N_, 0 }, { N_, 0 }, { N_, 0 }, { N_, 0 },
+ { N_, 0 }, { N_, 0 }, { N_, 0 }, { N_, 0 },
+ { N_, 0 }, { N_, 0 }, { N_, 0 }, { N_, 0 },
+ { N_, 0 }, { N_, 0 }, { N_, 0 }, { N_, 0 },
+ { N_, 0 }, { N_, 0 }, { N_, 0 }, { N_, 0 },
+ { N_, 0 }, { N_, 0 }, { N_, 0 }, { N_, 0 },
+ { N_, 0 }, { N_, 0 }, { N_, 0 }, { N_, 0 },
+ { N_, 0 }, { N_, 0 }, { N_, 0 }, { N_, 0 },
+ { N_, 0 }, { N_, 0 }, { N_, 0 }, { N_, 0 },
+ { N_, 0 }, { N_, 0 }, { N_, 0 }, { N_, 0 },
+ { N_, 0 }, { N_, 0 }, { N_, 0 }, { N_, 0 },
+ { N_, 0 }, { N_, 0 }, { N_, 0 }, { N_, 0 },
+ { N_, 0 }, { N_, 0 }, { N_, 0 }, { N_, 0 },
+ { N_, 0 }, { N_, 0 }, { N_, 0 }, { N_, 0 },
+ },
+ { /* Third byte table 4. */
+ { N_, 0 }, { N_, 0 }, { N_, 0 }, { N_, 0 },
+ { N_, 0 }, { N_, 0 }, { N_, 0 }, { N_, 0 },
+ { N_, 0 }, { N_, 0 }, { N_, 0 }, { N_, 0 },
+ { N_, 0 }, { N_, 0 }, { N_, 0 }, { N_, 0 },
+ { N_, 0 }, { N_, 0 }, { N_, 0 }, { N_, 0 },
+ { N_, 0 }, { N_, 0 }, { N_, 0 }, { N_, 0 },
+ { N_, 0 }, { N_, 0 }, { N_, 0 }, { N_, 0 },
+ { N_, 0 }, { N_, 0 }, { N_, 0 }, { N_, 0 },
+ { N_, 0 }, { N_, 0 }, { N_, 0 }, { N_, 0 },
+ { N_, 0 }, { N_, 0 }, { N_, 0 }, { N_, 0 },
+ { N_, 0 }, { N_, 0 }, { N_, 0 }, { N_, 0 },
+ { N_, 0 }, { N_, 0 }, { N_, 0 }, { N_, 0 },
+ { N_, 0 }, { N_, 0 }, { N_, 0 }, { N_, 0 },
+ { N_, 0 }, { N_, 0 }, { N_, 0 }, { N_, 0 },
+ { N_, 0 }, { N_, 0 }, { N_, 0 }, { N_, 0 },
+ { N_, 0 }, { N_, 0 }, { N_, 0 }, { N_, 0 },
+ { N_, 0 }, { N_, 0 }, { N_, 0 }, { N_, 0 },
+ { N_, 0 }, { N_, 0 }, { N_, 0 }, { N_, 0 },
+ { N_, 0 }, { N_, 0 }, { N_, 0 }, { N_, 0 },
+ { N_, 0 }, { N_, 0 }, { N_, 0 }, { N_, 0 },
+ { N_, 0 }, { N_, 0 }, { N_, 0 }, { N_, 0 },
+ { N_, 0 }, { N_, 0 }, { N_, 0 }, { N_, 0 },
+ { N_, 0 }, { N_, 0 }, { N_, 0 }, { N_, 0 },
+ { N_, 0 }, { N_, 0 }, { N_, 0 }, { N_, 0 },
+ { N_, 0 }, { N_, 0 }, { N_, 0 }, { N_, 0 },
+ { N_, 0 }, { N_, 0 }, { N_, 0 }, { N_, 0 },
+ { N_, 0 }, { N_, 0 }, { N_, 0 }, { N_, 0 },
+ { N_, 0 }, { N_, 0 }, { N_, 0 }, { N_, 0 },
+ { N_, 0 }, { N_, 0 }, { N_, 0 }, { N_, 0 },
+ { N_, 0 }, { N_, 0 }, { N_, 0 }, { N_, 0 },
+ { N_, 0 }, { N_, 0 }, { N_, 0 }, { N_, 0 },
+ { N_, 0 }, { N_, 0 }, { N_, 0 }, { N_, 0 },
+ { N_, 0 }, { N_, 0 }, { N_, 0 }, { N_, 0 },
+ { N_, 0 }, { N_, 0 }, { N_, 0 }, { N_, 0 },
+ { N_, 0 }, { N_, 0 }, { N_, 0 }, { N_, 0 },
+ { N_, 0 }, { N_, 0 }, { N_, 0 }, { N_, 0 },
+ { 35, 2139 }, { N_, 0 }, { N_, 0 }, { N_, 0 },
+ { N_, 0 }, { N_, 0 }, { N_, 0 }, { N_, 0 },
+ { N_, 0 }, { N_, 0 }, { N_, 0 }, { N_, 0 },
+ { N_, 0 }, { N_, 0 }, { N_, 0 }, { N_, 0 },
+ { N_, 0 }, { N_, 0 }, { N_, 0 }, { N_, 0 },
+ { N_, 0 }, { N_, 0 }, { N_, 0 }, { N_, 0 },
+ { N_, 0 }, { N_, 0 }, { N_, 0 }, { N_, 0 },
+ { N_, 0 }, { N_, 0 }, { N_, 0 }, { N_, 0 },
+ { N_, 0 }, { N_, 0 }, { N_, 0 }, { N_, 0 },
+ { N_, 0 }, { N_, 0 }, { N_, 0 }, { N_, 0 },
+ { N_, 0 }, { N_, 0 }, { N_, 0 }, { N_, 0 },
+ { N_, 0 }, { N_, 0 }, { N_, 0 }, { N_, 0 },
+ { N_, 0 }, { N_, 0 }, { N_, 0 }, { N_, 0 },
+ { N_, 0 }, { N_, 0 }, { N_, 0 }, { N_, 0 },
+ { N_, 0 }, { N_, 0 }, { N_, 0 }, { N_, 0 },
+ { N_, 0 }, { N_, 0 }, { N_, 0 }, { N_, 0 },
+ { N_, 0 }, { N_, 0 }, { N_, 0 }, { N_, 0 },
+ { N_, 0 }, { N_, 0 }, { N_, 0 }, { N_, 0 },
+ { N_, 0 }, { N_, 0 }, { N_, 0 }, { N_, 0 },
+ { N_, 0 }, { N_, 0 }, { N_, 0 }, { N_, 0 },
+ { N_, 0 }, { N_, 0 }, { N_, 0 }, { N_, 0 },
+ { N_, 0 }, { N_, 0 }, { N_, 0 }, { N_, 0 },
+ { N_, 0 }, { N_, 0 }, { N_, 0 }, { N_, 0 },
+ { N_, 0 }, { N_, 0 }, { N_, 0 }, { N_, 0 },
+ { N_, 0 }, { N_, 0 }, { N_, 0 }, { N_, 0 },
+ { N_, 0 }, { N_, 0 }, { N_, 0 }, { N_, 0 },
+ { N_, 0 }, { N_, 0 }, { N_, 0 }, { N_, 0 },
+ { N_, 0 }, { N_, 0 }, { N_, 0 }, { N_, 0 },
+ },
+ },
+};
+
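+/*
+ * A simplified sketch of how do_case_conv() walks these tables.  In the
+ * third byte tables above, { N_, 0 } marks a byte value with no case
+ * mapping, while a { tbl_id, base } entry picks one of the fourth byte
+ * tables below plus a base offset into the final mapping table.  Each
+ * fourth byte table has 257 columns so that entries b4 and b4 + 1
+ * bracket the mapped output bytes; uv selects one of the two supported
+ * Unicode versions.  Roughly:
+ *
+ *	b3_tbl = u8_tolower_b3_tbl[uv][b2][b3].tbl_id;
+ *	if (b3_tbl == N_)
+ *		return (sz);
+ *	base = u8_tolower_b3_tbl[uv][b2][b3].base;
+ *	start = u8_tolower_b4_tbl[uv][b3_tbl][b4];
+ *	end = u8_tolower_b4_tbl[uv][b3_tbl][b4 + 1];
+ *	while (start < end)
+ *		*out++ = u8_tolower_final_tbl[uv][base + start++];
+ */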
+static const uchar_t u8_tolower_b4_tbl[2][36][257] = {
+ {
+ { /* Fourth byte table 0. */
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 2, 4, 6, 8, 10, 12, 14,
+ 16, 18, 20, 22, 24, 26, 28, 30,
+ 32, 34, 36, 38, 40, 42, 44, 46,
+ 46, 48, 50, 52, 54, 56, 58, 60,
+ 60, 60, 60, 60, 60, 60, 60, 60,
+ 60, 60, 60, 60, 60, 60, 60, 60,
+ 60, 60, 60, 60, 60, 60, 60, 60,
+ 60, 60, 60, 60, 60, 60, 60, 60,
+ 60, 60, 60, 60, 60, 60, 60, 60,
+ 60, 60, 60, 60, 60, 60, 60, 60,
+ 60, 60, 60, 60, 60, 60, 60, 60,
+ 60, 60, 60, 60, 60, 60, 60, 60,
+ 60, 60, 60, 60, 60, 60, 60, 60,
+ 60, 60, 60, 60, 60, 60, 60, 60,
+ 60, 60, 60, 60, 60, 60, 60, 60,
+ 60, 60, 60, 60, 60, 60, 60, 60,
+ 60,
+ },
+ { /* Fourth byte table 1. */
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 2, 2, 4, 4, 6, 6, 8,
+ 8, 10, 10, 12, 12, 14, 14, 16,
+ 16, 18, 18, 20, 20, 22, 22, 24,
+ 24, 26, 26, 28, 28, 30, 30, 32,
+ 32, 34, 34, 36, 36, 38, 38, 40,
+ 40, 42, 42, 44, 44, 46, 46, 48,
+ 48, 49, 49, 51, 51, 53, 53, 55,
+ 55, 55, 57, 57, 59, 59, 61, 61,
+ 63, 63, 63, 63, 63, 63, 63, 63,
+ 63, 63, 63, 63, 63, 63, 63, 63,
+ 63, 63, 63, 63, 63, 63, 63, 63,
+ 63, 63, 63, 63, 63, 63, 63, 63,
+ 63, 63, 63, 63, 63, 63, 63, 63,
+ 63, 63, 63, 63, 63, 63, 63, 63,
+ 63, 63, 63, 63, 63, 63, 63, 63,
+ 63, 63, 63, 63, 63, 63, 63, 63,
+ 63,
+ },
+ { /* Fourth byte table 2. */
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 2, 2, 4, 4, 6, 6,
+ 8, 8, 8, 10, 10, 12, 12, 14,
+ 14, 16, 16, 18, 18, 20, 20, 22,
+ 22, 24, 24, 26, 26, 28, 28, 30,
+ 30, 32, 32, 34, 34, 36, 36, 38,
+ 38, 40, 40, 42, 42, 44, 44, 46,
+ 46, 48, 48, 50, 50, 52, 52, 54,
+ 54, 56, 58, 58, 60, 60, 62, 62,
+ 62, 62, 62, 62, 62, 62, 62, 62,
+ 62, 62, 62, 62, 62, 62, 62, 62,
+ 62, 62, 62, 62, 62, 62, 62, 62,
+ 62, 62, 62, 62, 62, 62, 62, 62,
+ 62, 62, 62, 62, 62, 62, 62, 62,
+ 62, 62, 62, 62, 62, 62, 62, 62,
+ 62, 62, 62, 62, 62, 62, 62, 62,
+ 62, 62, 62, 62, 62, 62, 62, 62,
+ 62,
+ },
+ { /* Fourth byte table 3. */
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 2, 4, 4, 6, 6, 8,
+ 10, 10, 12, 14, 16, 16, 16, 18,
+ 20, 22, 24, 24, 26, 28, 28, 30,
+ 32, 34, 34, 34, 34, 36, 38, 38,
+ 40, 42, 42, 44, 44, 46, 46, 48,
+ 50, 50, 52, 52, 52, 54, 54, 56,
+ 58, 58, 60, 62, 64, 64, 66, 66,
+ 68, 70, 70, 70, 70, 72, 72, 72,
+ 72, 72, 72, 72, 72, 72, 72, 72,
+ 72, 72, 72, 72, 72, 72, 72, 72,
+ 72, 72, 72, 72, 72, 72, 72, 72,
+ 72, 72, 72, 72, 72, 72, 72, 72,
+ 72, 72, 72, 72, 72, 72, 72, 72,
+ 72, 72, 72, 72, 72, 72, 72, 72,
+ 72, 72, 72, 72, 72, 72, 72, 72,
+ 72, 72, 72, 72, 72, 72, 72, 72,
+ 72,
+ },
+ { /* Fourth byte table 4. */
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 2, 4, 4,
+ 6, 8, 8, 10, 12, 12, 14, 14,
+ 16, 16, 18, 18, 20, 20, 22, 22,
+ 24, 24, 26, 26, 28, 28, 28, 30,
+ 30, 32, 32, 34, 34, 36, 36, 38,
+ 38, 40, 40, 42, 42, 44, 44, 46,
+ 46, 46, 48, 50, 50, 52, 52, 54,
+ 56, 58, 58, 60, 60, 62, 62, 64,
+ 64, 64, 64, 64, 64, 64, 64, 64,
+ 64, 64, 64, 64, 64, 64, 64, 64,
+ 64, 64, 64, 64, 64, 64, 64, 64,
+ 64, 64, 64, 64, 64, 64, 64, 64,
+ 64, 64, 64, 64, 64, 64, 64, 64,
+ 64, 64, 64, 64, 64, 64, 64, 64,
+ 64, 64, 64, 64, 64, 64, 64, 64,
+ 64, 64, 64, 64, 64, 64, 64, 64,
+ 64,
+ },
+ { /* Fourth byte table 5. */
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 2, 2, 4, 4, 6, 6, 8,
+ 8, 10, 10, 12, 12, 14, 14, 16,
+ 16, 18, 18, 20, 20, 22, 22, 24,
+ 24, 26, 26, 28, 28, 30, 30, 32,
+ 32, 34, 34, 36, 36, 38, 38, 40,
+ 40, 42, 42, 44, 44, 46, 46, 48,
+ 48, 50, 50, 52, 52, 52, 52, 52,
+ 52, 52, 52, 52, 52, 52, 52, 52,
+ 52, 52, 52, 52, 52, 52, 52, 52,
+ 52, 52, 52, 52, 52, 52, 52, 52,
+ 52, 52, 52, 52, 52, 52, 52, 52,
+ 52, 52, 52, 52, 52, 52, 52, 52,
+ 52, 52, 52, 52, 52, 52, 52, 52,
+ 52, 52, 52, 52, 52, 52, 52, 52,
+ 52, 52, 52, 52, 52, 52, 52, 52,
+ 52, 52, 52, 52, 52, 52, 52, 52,
+ 52,
+ },
+ { /* Fourth byte table 6. */
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 2,
+ 2, 4, 6, 8, 8, 10, 10, 12,
+ 14, 14, 16, 18, 20, 22, 24, 26,
+ 28, 30, 32, 34, 36, 38, 40, 42,
+ 44, 46, 48, 48, 50, 52, 54, 56,
+ 58, 60, 62, 64, 66, 66, 66, 66,
+ 66, 66, 66, 66, 66, 66, 66, 66,
+ 66, 66, 66, 66, 66, 66, 66, 66,
+ 66, 66, 66, 66, 66, 66, 66, 66,
+ 66, 66, 66, 66, 66, 66, 66, 66,
+ 66, 66, 66, 66, 66, 66, 66, 66,
+ 66, 66, 66, 66, 66, 66, 66, 66,
+ 66, 66, 66, 66, 66, 66, 66, 66,
+ 66, 66, 66, 66, 66, 66, 66, 66,
+ 66, 66, 66, 66, 66, 66, 66, 66,
+ 66, 66, 66, 66, 66, 66, 66, 66,
+ 66,
+ },
+ { /* Fourth byte table 7. */
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 2, 2, 4, 4, 6, 6, 8,
+ 8, 10, 10, 12, 12, 14, 14, 16,
+ 16, 18, 18, 20, 20, 22, 22, 24,
+ 24, 24, 24, 24, 24, 26, 26, 26,
+ 26, 26, 26, 26, 26, 26, 26, 26,
+ 26, 26, 26, 26, 26, 26, 26, 26,
+ 26, 26, 26, 26, 26, 26, 26, 26,
+ 26, 26, 26, 26, 26, 26, 26, 26,
+ 26, 26, 26, 26, 26, 26, 26, 26,
+ 26, 26, 26, 26, 26, 26, 26, 26,
+ 26, 26, 26, 26, 26, 26, 26, 26,
+ 26, 26, 26, 26, 26, 26, 26, 26,
+ 26, 26, 26, 26, 26, 26, 26, 26,
+ 26,
+ },
+ { /* Fourth byte table 8. */
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 2, 4, 6, 8, 10, 12, 14,
+ 16, 18, 20, 22, 24, 26, 28, 30,
+ 32, 34, 36, 38, 40, 42, 44, 46,
+ 48, 50, 52, 54, 56, 58, 60, 62,
+ 64, 66, 68, 70, 72, 74, 76, 78,
+ 80, 82, 84, 86, 88, 90, 92, 94,
+ 96, 96, 96, 96, 96, 96, 96, 96,
+ 96, 96, 96, 96, 96, 96, 96, 96,
+ 96, 96, 96, 96, 96, 96, 96, 96,
+ 96, 96, 96, 96, 96, 96, 96, 96,
+ 96, 96, 96, 96, 96, 96, 96, 96,
+ 96, 96, 96, 96, 96, 96, 96, 96,
+ 96, 96, 96, 96, 96, 96, 96, 96,
+ 96, 96, 96, 96, 96, 96, 96, 96,
+ 96, 96, 96, 96, 96, 96, 96, 96,
+ 96, 96, 96, 96, 96, 96, 96, 96,
+ 96,
+ },
+ { /* Fourth byte table 9. */
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 2, 2, 4, 4, 6, 6, 8,
+ 8, 10, 10, 12, 12, 14, 14, 16,
+ 16, 18, 18, 20, 20, 22, 22, 24,
+ 24, 26, 26, 28, 28, 30, 30, 32,
+ 32, 32, 32, 32, 32, 32, 32, 32,
+ 32, 32, 32, 32, 32, 32, 32, 32,
+ 32, 32, 32, 32, 32, 32, 32, 32,
+ 32, 32, 32, 32, 32, 32, 32, 32,
+ 32, 32, 32, 32, 32, 32, 32, 32,
+ 32, 32, 32, 32, 32, 32, 32, 32,
+ 32, 32, 32, 32, 32, 32, 32, 32,
+ 32, 32, 32, 32, 32, 32, 32, 32,
+ 32,
+ },
+ { /* Fourth byte table 10. */
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 2, 2, 2, 2, 2, 2, 2,
+ 2, 2, 2, 4, 4, 6, 6, 8,
+ 8, 10, 10, 12, 12, 14, 14, 16,
+ 16, 18, 18, 20, 20, 22, 22, 24,
+ 24, 26, 26, 28, 28, 30, 30, 32,
+ 32, 34, 34, 36, 36, 38, 38, 40,
+ 40, 42, 42, 44, 44, 46, 46, 48,
+ 48, 50, 50, 52, 52, 54, 54, 56,
+ 56, 56, 56, 56, 56, 56, 56, 56,
+ 56, 56, 56, 56, 56, 56, 56, 56,
+ 56, 56, 56, 56, 56, 56, 56, 56,
+ 56, 56, 56, 56, 56, 56, 56, 56,
+ 56, 56, 56, 56, 56, 56, 56, 56,
+ 56, 56, 56, 56, 56, 56, 56, 56,
+ 56, 56, 56, 56, 56, 56, 56, 56,
+ 56, 56, 56, 56, 56, 56, 56, 56,
+ 56,
+ },
+ { /* Fourth byte table 11. */
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 2, 2, 4, 4, 6, 6,
+ 8, 8, 10, 10, 12, 12, 14, 14,
+ 14, 16, 16, 18, 18, 20, 20, 22,
+ 22, 24, 24, 26, 26, 28, 28, 30,
+ 30, 32, 32, 34, 34, 36, 36, 38,
+ 38, 40, 40, 42, 42, 44, 44, 46,
+ 46, 48, 48, 50, 50, 52, 52, 52,
+ 52, 54, 54, 54, 54, 54, 54, 54,
+ 54, 54, 54, 54, 54, 54, 54, 54,
+ 54, 54, 54, 54, 54, 54, 54, 54,
+ 54, 54, 54, 54, 54, 54, 54, 54,
+ 54, 54, 54, 54, 54, 54, 54, 54,
+ 54, 54, 54, 54, 54, 54, 54, 54,
+ 54, 54, 54, 54, 54, 54, 54, 54,
+ 54, 54, 54, 54, 54, 54, 54, 54,
+ 54, 54, 54, 54, 54, 54, 54, 54,
+ 54,
+ },
+ { /* Fourth byte table 12. */
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 2, 2, 4, 4, 6, 6, 8,
+ 8, 10, 10, 12, 12, 14, 14, 16,
+ 16, 16, 16, 16, 16, 16, 16, 16,
+ 16, 16, 16, 16, 16, 16, 16, 16,
+ 16, 16, 16, 16, 16, 16, 16, 16,
+ 16, 16, 16, 16, 16, 16, 16, 16,
+ 16, 16, 18, 20, 22, 24, 26, 28,
+ 30, 32, 34, 36, 38, 40, 42, 44,
+ 46, 46, 46, 46, 46, 46, 46, 46,
+ 46, 46, 46, 46, 46, 46, 46, 46,
+ 46, 46, 46, 46, 46, 46, 46, 46,
+ 46, 46, 46, 46, 46, 46, 46, 46,
+ 46, 46, 46, 46, 46, 46, 46, 46,
+ 46, 46, 46, 46, 46, 46, 46, 46,
+ 46, 46, 46, 46, 46, 46, 46, 46,
+ 46, 46, 46, 46, 46, 46, 46, 46,
+ 46,
+ },
+ { /* Fourth byte table 13. */
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 2, 4, 6, 8, 10, 12, 14,
+ 16, 18, 20, 22, 24, 26, 28, 30,
+ 32, 34, 36, 38, 40, 42, 44, 46,
+ 46, 46, 46, 46, 46, 46, 46, 46,
+ 46, 46, 46, 46, 46, 46, 46, 46,
+ 46, 46, 46, 46, 46, 46, 46, 46,
+ 46, 46, 46, 46, 46, 46, 46, 46,
+ 46, 46, 46, 46, 46, 46, 46, 46,
+ 46, 46, 46, 46, 46, 46, 46, 46,
+ 46, 46, 46, 46, 46, 46, 46, 46,
+ 46, 46, 46, 46, 46, 46, 46, 46,
+ 46, 46, 46, 46, 46, 46, 46, 46,
+ 46, 46, 46, 46, 46, 46, 46, 46,
+ 46, 46, 46, 46, 46, 46, 46, 46,
+ 46, 46, 46, 46, 46, 46, 46, 46,
+ 46, 46, 46, 46, 46, 46, 46, 46,
+ 46,
+ },
+ { /* Fourth byte table 14. */
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 3, 3, 6, 6, 9, 9, 12,
+ 12, 15, 15, 18, 18, 21, 21, 24,
+ 24, 27, 27, 30, 30, 33, 33, 36,
+ 36, 39, 39, 42, 42, 45, 45, 48,
+ 48, 51, 51, 54, 54, 57, 57, 60,
+ 60, 63, 63, 66, 66, 69, 69, 72,
+ 72, 75, 75, 78, 78, 81, 81, 84,
+ 84, 87, 87, 90, 90, 93, 93, 96,
+ 96, 96, 96, 96, 96, 96, 96, 96,
+ 96, 96, 96, 96, 96, 96, 96, 96,
+ 96, 96, 96, 96, 96, 96, 96, 96,
+ 96, 96, 96, 96, 96, 96, 96, 96,
+ 96, 96, 96, 96, 96, 96, 96, 96,
+ 96, 96, 96, 96, 96, 96, 96, 96,
+ 96, 96, 96, 96, 96, 96, 96, 96,
+ 96, 96, 96, 96, 96, 96, 96, 96,
+ 96,
+ },
+ { /* Fourth byte table 15. */
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 3, 3, 6, 6, 9, 9, 12,
+ 12, 15, 15, 18, 18, 21, 21, 24,
+ 24, 27, 27, 30, 30, 33, 33, 36,
+ 36, 39, 39, 42, 42, 45, 45, 48,
+ 48, 51, 51, 54, 54, 57, 57, 60,
+ 60, 63, 63, 66, 66, 69, 69, 72,
+ 72, 75, 75, 78, 78, 81, 81, 84,
+ 84, 87, 87, 90, 90, 93, 93, 96,
+ 96, 96, 96, 96, 96, 96, 96, 96,
+ 96, 96, 96, 96, 96, 96, 96, 96,
+ 96, 96, 96, 96, 96, 96, 96, 96,
+ 96, 96, 96, 96, 96, 96, 96, 96,
+ 96, 96, 96, 96, 96, 96, 96, 96,
+ 96, 96, 96, 96, 96, 96, 96, 96,
+ 96, 96, 96, 96, 96, 96, 96, 96,
+ 96, 96, 96, 96, 96, 96, 96, 96,
+ 96,
+ },
+ { /* Fourth byte table 16. */
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 3, 3, 6, 6, 9, 9, 12,
+ 12, 15, 15, 18, 18, 21, 21, 24,
+ 24, 27, 27, 30, 30, 33, 33, 33,
+ 33, 33, 33, 33, 33, 33, 33, 33,
+ 33, 36, 36, 39, 39, 42, 42, 45,
+ 45, 48, 48, 51, 51, 54, 54, 57,
+ 57, 60, 60, 63, 63, 66, 66, 69,
+ 69, 72, 72, 75, 75, 78, 78, 81,
+ 81, 81, 81, 81, 81, 81, 81, 81,
+ 81, 81, 81, 81, 81, 81, 81, 81,
+ 81, 81, 81, 81, 81, 81, 81, 81,
+ 81, 81, 81, 81, 81, 81, 81, 81,
+ 81, 81, 81, 81, 81, 81, 81, 81,
+ 81, 81, 81, 81, 81, 81, 81, 81,
+ 81, 81, 81, 81, 81, 81, 81, 81,
+ 81, 81, 81, 81, 81, 81, 81, 81,
+ 81,
+ },
+ { /* Fourth byte table 17. */
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 3, 3, 6, 6, 9, 9, 12,
+ 12, 15, 15, 18, 18, 21, 21, 24,
+ 24, 27, 27, 30, 30, 33, 33, 36,
+ 36, 39, 39, 42, 42, 45, 45, 48,
+ 48, 51, 51, 54, 54, 57, 57, 60,
+ 60, 63, 63, 66, 66, 69, 69, 72,
+ 72, 75, 75, 78, 78, 81, 81, 84,
+ 84, 87, 87, 87, 87, 87, 87, 87,
+ 87, 87, 87, 87, 87, 87, 87, 87,
+ 87, 87, 87, 87, 87, 87, 87, 87,
+ 87, 87, 87, 87, 87, 87, 87, 87,
+ 87, 87, 87, 87, 87, 87, 87, 87,
+ 87, 87, 87, 87, 87, 87, 87, 87,
+ 87, 87, 87, 87, 87, 87, 87, 87,
+ 87, 87, 87, 87, 87, 87, 87, 87,
+ 87, 87, 87, 87, 87, 87, 87, 87,
+ 87,
+ },
+ { /* Fourth byte table 18. */
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 3, 6, 9, 12, 15, 18, 21,
+ 24, 24, 24, 24, 24, 24, 24, 24,
+ 24, 27, 30, 33, 36, 39, 42, 42,
+ 42, 42, 42, 42, 42, 42, 42, 42,
+ 42, 45, 48, 51, 54, 57, 60, 63,
+ 66, 66, 66, 66, 66, 66, 66, 66,
+ 66, 69, 72, 75, 78, 81, 84, 87,
+ 90, 90, 90, 90, 90, 90, 90, 90,
+ 90, 90, 90, 90, 90, 90, 90, 90,
+ 90, 90, 90, 90, 90, 90, 90, 90,
+ 90, 90, 90, 90, 90, 90, 90, 90,
+ 90, 90, 90, 90, 90, 90, 90, 90,
+ 90, 90, 90, 90, 90, 90, 90, 90,
+ 90, 90, 90, 90, 90, 90, 90, 90,
+ 90, 90, 90, 90, 90, 90, 90, 90,
+ 90,
+ },
+ { /* Fourth byte table 19. */
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 3, 6, 9, 12, 15, 18, 18,
+ 18, 18, 18, 18, 18, 18, 18, 18,
+ 18, 18, 21, 21, 24, 24, 27, 27,
+ 30, 30, 30, 30, 30, 30, 30, 30,
+ 30, 33, 36, 39, 42, 45, 48, 51,
+ 54, 54, 54, 54, 54, 54, 54, 54,
+ 54, 54, 54, 54, 54, 54, 54, 54,
+ 54, 54, 54, 54, 54, 54, 54, 54,
+ 54, 54, 54, 54, 54, 54, 54, 54,
+ 54, 54, 54, 54, 54, 54, 54, 54,
+ 54, 54, 54, 54, 54, 54, 54, 54,
+ 54, 54, 54, 54, 54, 54, 54, 54,
+ 54, 54, 54, 54, 54, 54, 54, 54,
+ 54, 54, 54, 54, 54, 54, 54, 54,
+ 54, 54, 54, 54, 54, 54, 54, 54,
+ 54,
+ },
+ { /* Fourth byte table 20. */
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 3, 6, 9, 12, 15, 18, 21,
+ 24, 24, 24, 24, 24, 24, 24, 24,
+ 24, 27, 30, 33, 36, 39, 42, 45,
+ 48, 48, 48, 48, 48, 48, 48, 48,
+ 48, 51, 54, 57, 60, 63, 66, 69,
+ 72, 72, 72, 72, 72, 72, 72, 72,
+ 72, 75, 78, 81, 84, 87, 87, 87,
+ 87, 87, 87, 87, 87, 87, 87, 87,
+ 87, 87, 87, 87, 87, 87, 87, 87,
+ 87, 87, 87, 87, 87, 87, 87, 87,
+ 87, 87, 87, 87, 87, 87, 87, 87,
+ 87, 87, 87, 87, 87, 87, 87, 87,
+ 87, 87, 87, 87, 87, 87, 87, 87,
+ 87, 87, 87, 87, 87, 87, 87, 87,
+ 87, 87, 87, 87, 87, 87, 87, 87,
+ 87,
+ },
+ { /* Fourth byte table 21. */
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 3, 6, 9, 12, 15, 15, 15,
+ 15, 15, 15, 15, 15, 15, 15, 15,
+ 15, 18, 21, 24, 27, 27, 27, 27,
+ 27, 27, 27, 27, 27, 27, 27, 27,
+ 27, 30, 33, 36, 39, 42, 42, 42,
+ 42, 42, 42, 42, 42, 42, 42, 42,
+ 42, 45, 48, 51, 54, 57, 57, 57,
+ 57, 57, 57, 57, 57, 57, 57, 57,
+ 57, 57, 57, 57, 57, 57, 57, 57,
+ 57, 57, 57, 57, 57, 57, 57, 57,
+ 57, 57, 57, 57, 57, 57, 57, 57,
+ 57, 57, 57, 57, 57, 57, 57, 57,
+ 57, 57, 57, 57, 57, 57, 57, 57,
+ 57, 57, 57, 57, 57, 57, 57, 57,
+ 57, 57, 57, 57, 57, 57, 57, 57,
+ 57,
+ },
+ { /* Fourth byte table 22. */
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 2,
+ 2, 2, 2, 3, 5, 5, 5, 5,
+ 5, 5, 5, 5, 5, 5, 5, 5,
+ 5, 5, 5, 5, 5, 5, 5, 5,
+ 5, 5, 5, 5, 5, 5, 5, 5,
+ 5, 5, 5, 5, 5, 5, 5, 5,
+ 5, 5, 5, 5, 5, 5, 5, 5,
+ 5, 5, 5, 5, 5, 5, 5, 5,
+ 5, 5, 5, 5, 5, 5, 5, 5,
+ 5, 5, 5, 5, 5, 5, 5, 5,
+ 5, 5, 5, 5, 5, 5, 5, 5,
+ 5, 5, 5, 5, 5, 5, 5, 5,
+ 5,
+ },
+ { /* Fourth byte table 23. */
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 3, 6, 9, 12, 15, 18, 21,
+ 24, 27, 30, 33, 36, 39, 42, 45,
+ 48, 48, 48, 48, 48, 48, 48, 48,
+ 48, 48, 48, 48, 48, 48, 48, 48,
+ 48, 48, 48, 48, 48, 48, 48, 48,
+ 48, 48, 48, 48, 48, 48, 48, 48,
+ 48, 48, 48, 48, 48, 48, 48, 48,
+ 48, 48, 48, 48, 48, 48, 48, 48,
+ 48, 48, 48, 48, 48, 48, 48, 48,
+ 48, 48, 48, 48, 48, 48, 48, 48,
+ 48, 48, 48, 48, 48, 48, 48, 48,
+ 48, 48, 48, 48, 48, 48, 48, 48,
+ 48,
+ },
+ { /* Fourth byte table 24. */
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 3,
+ 6, 9, 12, 15, 18, 21, 24, 27,
+ 30, 30, 30, 30, 30, 30, 30, 30,
+ 30, 30, 30, 30, 30, 30, 30, 30,
+ 30, 30, 30, 30, 30, 30, 30, 30,
+ 30, 30, 30, 30, 30, 30, 30, 30,
+ 30, 30, 30, 30, 30, 30, 30, 30,
+ 30, 30, 30, 30, 30, 30, 30, 30,
+ 30, 30, 30, 30, 30, 30, 30, 30,
+ 30, 30, 30, 30, 30, 30, 30, 30,
+ 30,
+ },
+ { /* Fourth byte table 25. */
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 3, 6, 9, 12, 15, 18, 21,
+ 24, 27, 30, 33, 36, 39, 42, 45,
+ 48, 48, 48, 48, 48, 48, 48, 48,
+ 48, 48, 48, 48, 48, 48, 48, 48,
+ 48, 48, 48, 48, 48, 48, 48, 48,
+ 48, 48, 48, 48, 48, 48, 48, 48,
+ 48, 48, 48, 48, 48, 48, 48, 48,
+ 48, 48, 48, 48, 48, 48, 48, 48,
+ 48, 48, 48, 48, 48, 48, 48, 48,
+ 48, 48, 48, 48, 48, 48, 48, 48,
+ 48, 48, 48, 48, 48, 48, 48, 48,
+ 48, 48, 48, 48, 48, 48, 48, 48,
+ 48, 48, 48, 48, 48, 48, 48, 48,
+ 48, 48, 48, 48, 48, 48, 48, 48,
+ 48, 48, 48, 48, 48, 48, 48, 48,
+ 48, 48, 48, 48, 48, 48, 48, 48,
+ 48,
+ },
+ { /* Fourth byte table 26. */
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 3, 6, 9, 12, 15, 18,
+ 21, 24, 27, 30, 33, 36, 39, 42,
+ 45, 48, 51, 54, 57, 60, 63, 66,
+ 69, 72, 75, 78, 78, 78, 78, 78,
+ 78, 78, 78, 78, 78, 78, 78, 78,
+ 78, 78, 78, 78, 78, 78, 78, 78,
+ 78, 78, 78, 78, 78, 78, 78, 78,
+ 78, 78, 78, 78, 78, 78, 78, 78,
+ 78, 78, 78, 78, 78, 78, 78, 78,
+ 78, 78, 78, 78, 78, 78, 78, 78,
+ 78, 78, 78, 78, 78, 78, 78, 78,
+ 78, 78, 78, 78, 78, 78, 78, 78,
+ 78,
+ },
+ { /* Fourth byte table 27. */
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 4, 8, 12, 16, 20, 24, 28,
+ 32, 36, 40, 44, 48, 52, 56, 60,
+ 64, 68, 72, 76, 80, 84, 88, 92,
+ 96, 100, 104, 108, 112, 116, 120, 124,
+ 128, 132, 136, 140, 144, 148, 152, 152,
+ 152, 152, 152, 152, 152, 152, 152, 152,
+ 152, 152, 152, 152, 152, 152, 152, 152,
+ 152, 152, 152, 152, 152, 152, 152, 152,
+ 152, 152, 152, 152, 152, 152, 152, 152,
+ 152, 152, 152, 152, 152, 152, 152, 152,
+ 152, 152, 152, 152, 152, 152, 152, 152,
+ 152, 152, 152, 152, 152, 152, 152, 152,
+ 152, 152, 152, 152, 152, 152, 152, 152,
+ 152, 152, 152, 152, 152, 152, 152, 152,
+ 152, 152, 152, 152, 152, 152, 152, 152,
+ 152, 152, 152, 152, 152, 152, 152, 152,
+ 152,
+ },
+ { /* Fourth byte table 28. */
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0,
+ },
+ { /* Fourth byte table 29. */
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0,
+ },
+ { /* Fourth byte table 30. */
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0,
+ },
+ { /* Fourth byte table 31. */
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0,
+ },
+ { /* Fourth byte table 32. */
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0,
+ },
+ { /* Fourth byte table 33. */
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0,
+ },
+ { /* Fourth byte table 34. */
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0,
+ },
+ { /* Fourth byte table 35. */
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0,
+ },
+ },
+ {
+ { /* Fourth byte table 0. */
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 2, 4, 6, 8, 10, 12, 14,
+ 16, 18, 20, 22, 24, 26, 28, 30,
+ 32, 34, 36, 38, 40, 42, 44, 46,
+ 46, 48, 50, 52, 54, 56, 58, 60,
+ 60, 60, 60, 60, 60, 60, 60, 60,
+ 60, 60, 60, 60, 60, 60, 60, 60,
+ 60, 60, 60, 60, 60, 60, 60, 60,
+ 60, 60, 60, 60, 60, 60, 60, 60,
+ 60, 60, 60, 60, 60, 60, 60, 60,
+ 60, 60, 60, 60, 60, 60, 60, 60,
+ 60, 60, 60, 60, 60, 60, 60, 60,
+ 60, 60, 60, 60, 60, 60, 60, 60,
+ 60, 60, 60, 60, 60, 60, 60, 60,
+ 60, 60, 60, 60, 60, 60, 60, 60,
+ 60, 60, 60, 60, 60, 60, 60, 60,
+ 60, 60, 60, 60, 60, 60, 60, 60,
+ 60,
+ },
+ { /* Fourth byte table 1. */
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 2, 2, 4, 4, 6, 6, 8,
+ 8, 10, 10, 12, 12, 14, 14, 16,
+ 16, 18, 18, 20, 20, 22, 22, 24,
+ 24, 26, 26, 28, 28, 30, 30, 32,
+ 32, 34, 34, 36, 36, 38, 38, 40,
+ 40, 42, 42, 44, 44, 46, 46, 48,
+ 48, 49, 49, 51, 51, 53, 53, 55,
+ 55, 55, 57, 57, 59, 59, 61, 61,
+ 63, 63, 63, 63, 63, 63, 63, 63,
+ 63, 63, 63, 63, 63, 63, 63, 63,
+ 63, 63, 63, 63, 63, 63, 63, 63,
+ 63, 63, 63, 63, 63, 63, 63, 63,
+ 63, 63, 63, 63, 63, 63, 63, 63,
+ 63, 63, 63, 63, 63, 63, 63, 63,
+ 63, 63, 63, 63, 63, 63, 63, 63,
+ 63, 63, 63, 63, 63, 63, 63, 63,
+ 63,
+ },
+ { /* Fourth byte table 2. */
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 2, 2, 4, 4, 6, 6,
+ 8, 8, 8, 10, 10, 12, 12, 14,
+ 14, 16, 16, 18, 18, 20, 20, 22,
+ 22, 24, 24, 26, 26, 28, 28, 30,
+ 30, 32, 32, 34, 34, 36, 36, 38,
+ 38, 40, 40, 42, 42, 44, 44, 46,
+ 46, 48, 48, 50, 50, 52, 52, 54,
+ 54, 56, 58, 58, 60, 60, 62, 62,
+ 62, 62, 62, 62, 62, 62, 62, 62,
+ 62, 62, 62, 62, 62, 62, 62, 62,
+ 62, 62, 62, 62, 62, 62, 62, 62,
+ 62, 62, 62, 62, 62, 62, 62, 62,
+ 62, 62, 62, 62, 62, 62, 62, 62,
+ 62, 62, 62, 62, 62, 62, 62, 62,
+ 62, 62, 62, 62, 62, 62, 62, 62,
+ 62, 62, 62, 62, 62, 62, 62, 62,
+ 62,
+ },
+ { /* Fourth byte table 3. */
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 2, 4, 4, 6, 6, 8,
+ 10, 10, 12, 14, 16, 16, 16, 18,
+ 20, 22, 24, 24, 26, 28, 28, 30,
+ 32, 34, 34, 34, 34, 36, 38, 38,
+ 40, 42, 42, 44, 44, 46, 46, 48,
+ 50, 50, 52, 52, 52, 54, 54, 56,
+ 58, 58, 60, 62, 64, 64, 66, 66,
+ 68, 70, 70, 70, 70, 72, 72, 72,
+ 72, 72, 72, 72, 72, 72, 72, 72,
+ 72, 72, 72, 72, 72, 72, 72, 72,
+ 72, 72, 72, 72, 72, 72, 72, 72,
+ 72, 72, 72, 72, 72, 72, 72, 72,
+ 72, 72, 72, 72, 72, 72, 72, 72,
+ 72, 72, 72, 72, 72, 72, 72, 72,
+ 72, 72, 72, 72, 72, 72, 72, 72,
+ 72, 72, 72, 72, 72, 72, 72, 72,
+ 72,
+ },
+ { /* Fourth byte table 4. */
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 2, 4, 4,
+ 6, 8, 8, 10, 12, 12, 14, 14,
+ 16, 16, 18, 18, 20, 20, 22, 22,
+ 24, 24, 26, 26, 28, 28, 28, 30,
+ 30, 32, 32, 34, 34, 36, 36, 38,
+ 38, 40, 40, 42, 42, 44, 44, 46,
+ 46, 46, 48, 50, 50, 52, 52, 54,
+ 56, 58, 58, 60, 60, 62, 62, 64,
+ 64, 64, 64, 64, 64, 64, 64, 64,
+ 64, 64, 64, 64, 64, 64, 64, 64,
+ 64, 64, 64, 64, 64, 64, 64, 64,
+ 64, 64, 64, 64, 64, 64, 64, 64,
+ 64, 64, 64, 64, 64, 64, 64, 64,
+ 64, 64, 64, 64, 64, 64, 64, 64,
+ 64, 64, 64, 64, 64, 64, 64, 64,
+ 64, 64, 64, 64, 64, 64, 64, 64,
+ 64,
+ },
+ { /* Fourth byte table 5. */
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 2, 2, 4, 4, 6, 6, 8,
+ 8, 10, 10, 12, 12, 14, 14, 16,
+ 16, 18, 18, 20, 20, 22, 22, 24,
+ 24, 26, 26, 28, 28, 30, 30, 32,
+ 32, 34, 34, 36, 36, 38, 38, 40,
+ 40, 42, 42, 44, 44, 46, 46, 48,
+ 48, 50, 50, 52, 52, 52, 52, 52,
+ 52, 52, 52, 55, 57, 57, 59, 62,
+ 62, 62, 62, 62, 62, 62, 62, 62,
+ 62, 62, 62, 62, 62, 62, 62, 62,
+ 62, 62, 62, 62, 62, 62, 62, 62,
+ 62, 62, 62, 62, 62, 62, 62, 62,
+ 62, 62, 62, 62, 62, 62, 62, 62,
+ 62, 62, 62, 62, 62, 62, 62, 62,
+ 62, 62, 62, 62, 62, 62, 62, 62,
+ 62, 62, 62, 62, 62, 62, 62, 62,
+ 62,
+ },
+ { /* Fourth byte table 6. */
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 2, 2, 4, 6, 8, 10,
+ 10, 12, 12, 14, 14, 16, 16, 18,
+ 18, 18, 18, 18, 18, 18, 18, 18,
+ 18, 18, 18, 18, 18, 18, 18, 18,
+ 18, 18, 18, 18, 18, 18, 18, 18,
+ 18, 18, 18, 18, 18, 18, 18, 18,
+ 18, 18, 18, 18, 18, 18, 18, 18,
+ 18, 18, 18, 18, 18, 18, 18, 18,
+ 18, 18, 18, 18, 18, 18, 18, 18,
+ 18, 18, 18, 18, 18, 18, 18, 18,
+ 18, 18, 18, 18, 18, 18, 18, 18,
+ 18, 18, 18, 18, 18, 18, 18, 18,
+ 18, 18, 18, 18, 18, 18, 18, 18,
+ 18, 18, 18, 18, 18, 18, 18, 18,
+ 18, 18, 18, 18, 18, 18, 18, 18,
+ 18, 18, 18, 18, 18, 18, 18, 18,
+ 18,
+ },
+ { /* Fourth byte table 7. */
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 2,
+ 2, 4, 6, 8, 8, 10, 10, 12,
+ 14, 14, 16, 18, 20, 22, 24, 26,
+ 28, 30, 32, 34, 36, 38, 40, 42,
+ 44, 46, 48, 48, 50, 52, 54, 56,
+ 58, 60, 62, 64, 66, 66, 66, 66,
+ 66, 66, 66, 66, 66, 66, 66, 66,
+ 66, 66, 66, 66, 66, 66, 66, 66,
+ 66, 66, 66, 66, 66, 66, 66, 66,
+ 66, 66, 66, 66, 66, 66, 66, 66,
+ 66, 66, 66, 66, 66, 66, 66, 66,
+ 66, 66, 66, 66, 66, 66, 66, 66,
+ 66, 66, 66, 66, 66, 66, 66, 66,
+ 66, 66, 66, 66, 66, 66, 66, 66,
+ 66, 66, 66, 66, 66, 66, 66, 66,
+ 66, 66, 66, 66, 66, 66, 66, 66,
+ 66,
+ },
+ { /* Fourth byte table 8. */
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 2, 2, 4, 4, 6, 6, 8,
+ 8, 10, 10, 12, 12, 14, 14, 16,
+ 16, 18, 18, 20, 20, 22, 22, 24,
+ 24, 24, 24, 24, 24, 26, 26, 26,
+ 28, 28, 30, 32, 32, 32, 34, 36,
+ 38, 38, 38, 38, 38, 38, 38, 38,
+ 38, 38, 38, 38, 38, 38, 38, 38,
+ 38, 38, 38, 38, 38, 38, 38, 38,
+ 38, 38, 38, 38, 38, 38, 38, 38,
+ 38, 38, 38, 38, 38, 38, 38, 38,
+ 38, 38, 38, 38, 38, 38, 38, 38,
+ 38, 38, 38, 38, 38, 38, 38, 38,
+ 38, 38, 38, 38, 38, 38, 38, 38,
+ 38,
+ },
+ { /* Fourth byte table 9. */
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 2, 4, 6, 8, 10, 12, 14,
+ 16, 18, 20, 22, 24, 26, 28, 30,
+ 32, 34, 36, 38, 40, 42, 44, 46,
+ 48, 50, 52, 54, 56, 58, 60, 62,
+ 64, 66, 68, 70, 72, 74, 76, 78,
+ 80, 82, 84, 86, 88, 90, 92, 94,
+ 96, 96, 96, 96, 96, 96, 96, 96,
+ 96, 96, 96, 96, 96, 96, 96, 96,
+ 96, 96, 96, 96, 96, 96, 96, 96,
+ 96, 96, 96, 96, 96, 96, 96, 96,
+ 96, 96, 96, 96, 96, 96, 96, 96,
+ 96, 96, 96, 96, 96, 96, 96, 96,
+ 96, 96, 96, 96, 96, 96, 96, 96,
+ 96, 96, 96, 96, 96, 96, 96, 96,
+ 96, 96, 96, 96, 96, 96, 96, 96,
+ 96, 96, 96, 96, 96, 96, 96, 96,
+ 96,
+ },
+ { /* Fourth byte table 10. */
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 2, 2, 4, 4, 6, 6, 8,
+ 8, 10, 10, 12, 12, 14, 14, 16,
+ 16, 18, 18, 20, 20, 22, 22, 24,
+ 24, 26, 26, 28, 28, 30, 30, 32,
+ 32, 32, 32, 32, 32, 32, 32, 32,
+ 32, 32, 32, 32, 32, 32, 32, 32,
+ 32, 32, 32, 32, 32, 32, 32, 32,
+ 32, 32, 32, 32, 32, 32, 32, 32,
+ 32, 32, 32, 32, 32, 32, 32, 32,
+ 32, 32, 32, 32, 32, 32, 32, 32,
+ 32, 32, 32, 32, 32, 32, 32, 32,
+ 32, 32, 32, 32, 32, 32, 32, 32,
+ 32,
+ },
+ { /* Fourth byte table 11. */
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 2, 2, 2, 2, 2, 2, 2,
+ 2, 2, 2, 4, 4, 6, 6, 8,
+ 8, 10, 10, 12, 12, 14, 14, 16,
+ 16, 18, 18, 20, 20, 22, 22, 24,
+ 24, 26, 26, 28, 28, 30, 30, 32,
+ 32, 34, 34, 36, 36, 38, 38, 40,
+ 40, 42, 42, 44, 44, 46, 46, 48,
+ 48, 50, 50, 52, 52, 54, 54, 56,
+ 56, 56, 56, 56, 56, 56, 56, 56,
+ 56, 56, 56, 56, 56, 56, 56, 56,
+ 56, 56, 56, 56, 56, 56, 56, 56,
+ 56, 56, 56, 56, 56, 56, 56, 56,
+ 56, 56, 56, 56, 56, 56, 56, 56,
+ 56, 56, 56, 56, 56, 56, 56, 56,
+ 56, 56, 56, 56, 56, 56, 56, 56,
+ 56, 56, 56, 56, 56, 56, 56, 56,
+ 56,
+ },
+ { /* Fourth byte table 12. */
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 2, 4, 4, 6, 6, 8, 8,
+ 10, 10, 12, 12, 14, 14, 16, 16,
+ 16, 18, 18, 20, 20, 22, 22, 24,
+ 24, 26, 26, 28, 28, 30, 30, 32,
+ 32, 34, 34, 36, 36, 38, 38, 40,
+ 40, 42, 42, 44, 44, 46, 46, 48,
+ 48, 50, 50, 52, 52, 54, 54, 56,
+ 56, 58, 58, 60, 60, 62, 62, 64,
+ 64, 64, 64, 64, 64, 64, 64, 64,
+ 64, 64, 64, 64, 64, 64, 64, 64,
+ 64, 64, 64, 64, 64, 64, 64, 64,
+ 64, 64, 64, 64, 64, 64, 64, 64,
+ 64, 64, 64, 64, 64, 64, 64, 64,
+ 64, 64, 64, 64, 64, 64, 64, 64,
+ 64, 64, 64, 64, 64, 64, 64, 64,
+ 64, 64, 64, 64, 64, 64, 64, 64,
+ 64,
+ },
+ { /* Fourth byte table 13. */
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 2, 2, 4, 4, 6, 6, 8,
+ 8, 10, 10, 12, 12, 14, 14, 16,
+ 16, 18, 18, 20, 20, 20, 20, 20,
+ 20, 20, 20, 20, 20, 20, 20, 20,
+ 20, 20, 20, 20, 20, 20, 20, 20,
+ 20, 20, 20, 20, 20, 20, 20, 20,
+ 20, 20, 22, 24, 26, 28, 30, 32,
+ 34, 36, 38, 40, 42, 44, 46, 48,
+ 50, 50, 50, 50, 50, 50, 50, 50,
+ 50, 50, 50, 50, 50, 50, 50, 50,
+ 50, 50, 50, 50, 50, 50, 50, 50,
+ 50, 50, 50, 50, 50, 50, 50, 50,
+ 50, 50, 50, 50, 50, 50, 50, 50,
+ 50, 50, 50, 50, 50, 50, 50, 50,
+ 50, 50, 50, 50, 50, 50, 50, 50,
+ 50, 50, 50, 50, 50, 50, 50, 50,
+ 50,
+ },
+ { /* Fourth byte table 14. */
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 2, 4, 6, 8, 10, 12, 14,
+ 16, 18, 20, 22, 24, 26, 28, 30,
+ 32, 34, 36, 38, 40, 42, 44, 46,
+ 46, 46, 46, 46, 46, 46, 46, 46,
+ 46, 46, 46, 46, 46, 46, 46, 46,
+ 46, 46, 46, 46, 46, 46, 46, 46,
+ 46, 46, 46, 46, 46, 46, 46, 46,
+ 46, 46, 46, 46, 46, 46, 46, 46,
+ 46, 46, 46, 46, 46, 46, 46, 46,
+ 46, 46, 46, 46, 46, 46, 46, 46,
+ 46, 46, 46, 46, 46, 46, 46, 46,
+ 46, 46, 46, 46, 46, 46, 46, 46,
+ 46, 46, 46, 46, 46, 46, 46, 46,
+ 46, 46, 46, 46, 46, 46, 46, 46,
+ 46, 46, 46, 46, 46, 46, 46, 46,
+ 46, 46, 46, 46, 46, 46, 46, 46,
+ 46,
+ },
+ { /* Fourth byte table 15. */
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 3, 6, 9, 12, 15, 18, 21,
+ 24, 27, 30, 33, 36, 39, 42, 45,
+ 48, 51, 54, 57, 60, 63, 66, 69,
+ 72, 75, 78, 81, 84, 87, 90, 93,
+ 96, 96, 96, 96, 96, 96, 96, 96,
+ 96, 96, 96, 96, 96, 96, 96, 96,
+ 96, 96, 96, 96, 96, 96, 96, 96,
+ 96, 96, 96, 96, 96, 96, 96, 96,
+ 96, 96, 96, 96, 96, 96, 96, 96,
+ 96, 96, 96, 96, 96, 96, 96, 96,
+ 96, 96, 96, 96, 96, 96, 96, 96,
+ 96, 96, 96, 96, 96, 96, 96, 96,
+ 96,
+ },
+ { /* Fourth byte table 16. */
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 3, 6, 9, 12, 15, 18, 18,
+ 18, 18, 18, 18, 18, 18, 18, 18,
+ 18, 18, 18, 18, 18, 18, 18, 18,
+ 18, 18, 18, 18, 18, 18, 18, 18,
+ 18, 18, 18, 18, 18, 18, 18, 18,
+ 18, 18, 18, 18, 18, 18, 18, 18,
+ 18, 18, 18, 18, 18, 18, 18, 18,
+ 18, 18, 18, 18, 18, 18, 18, 18,
+ 18, 18, 18, 18, 18, 18, 18, 18,
+ 18, 18, 18, 18, 18, 18, 18, 18,
+ 18, 18, 18, 18, 18, 18, 18, 18,
+ 18, 18, 18, 18, 18, 18, 18, 18,
+ 18, 18, 18, 18, 18, 18, 18, 18,
+ 18, 18, 18, 18, 18, 18, 18, 18,
+ 18, 18, 18, 18, 18, 18, 18, 18,
+ 18, 18, 18, 18, 18, 18, 18, 18,
+ 18,
+ },
+ { /* Fourth byte table 17. */
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 3, 3, 6, 6, 9, 9, 12,
+ 12, 15, 15, 18, 18, 21, 21, 24,
+ 24, 27, 27, 30, 30, 33, 33, 36,
+ 36, 39, 39, 42, 42, 45, 45, 48,
+ 48, 51, 51, 54, 54, 57, 57, 60,
+ 60, 63, 63, 66, 66, 69, 69, 72,
+ 72, 75, 75, 78, 78, 81, 81, 84,
+ 84, 87, 87, 90, 90, 93, 93, 96,
+ 96, 96, 96, 96, 96, 96, 96, 96,
+ 96, 96, 96, 96, 96, 96, 96, 96,
+ 96, 96, 96, 96, 96, 96, 96, 96,
+ 96, 96, 96, 96, 96, 96, 96, 96,
+ 96, 96, 96, 96, 96, 96, 96, 96,
+ 96, 96, 96, 96, 96, 96, 96, 96,
+ 96, 96, 96, 96, 96, 96, 96, 96,
+ 96, 96, 96, 96, 96, 96, 96, 96,
+ 96,
+ },
+ { /* Fourth byte table 18. */
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 3, 3, 6, 6, 9, 9, 12,
+ 12, 15, 15, 18, 18, 21, 21, 24,
+ 24, 27, 27, 30, 30, 33, 33, 36,
+ 36, 39, 39, 42, 42, 45, 45, 48,
+ 48, 51, 51, 54, 54, 57, 57, 60,
+ 60, 63, 63, 66, 66, 69, 69, 72,
+ 72, 75, 75, 78, 78, 81, 81, 84,
+ 84, 87, 87, 90, 90, 93, 93, 96,
+ 96, 96, 96, 96, 96, 96, 96, 96,
+ 96, 96, 96, 96, 96, 96, 96, 96,
+ 96, 96, 96, 96, 96, 96, 96, 96,
+ 96, 96, 96, 96, 96, 96, 96, 96,
+ 96, 96, 96, 96, 96, 96, 96, 96,
+ 96, 96, 96, 96, 96, 96, 96, 96,
+ 96, 96, 96, 96, 96, 96, 96, 96,
+ 96, 96, 96, 96, 96, 96, 96, 96,
+ 96,
+ },
+ { /* Fourth byte table 19. */
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 3, 3, 6, 6, 9, 9, 12,
+ 12, 15, 15, 18, 18, 21, 21, 24,
+ 24, 27, 27, 30, 30, 33, 33, 33,
+ 33, 33, 33, 33, 33, 33, 33, 33,
+ 33, 36, 36, 39, 39, 42, 42, 45,
+ 45, 48, 48, 51, 51, 54, 54, 57,
+ 57, 60, 60, 63, 63, 66, 66, 69,
+ 69, 72, 72, 75, 75, 78, 78, 81,
+ 81, 81, 81, 81, 81, 81, 81, 81,
+ 81, 81, 81, 81, 81, 81, 81, 81,
+ 81, 81, 81, 81, 81, 81, 81, 81,
+ 81, 81, 81, 81, 81, 81, 81, 81,
+ 81, 81, 81, 81, 81, 81, 81, 81,
+ 81, 81, 81, 81, 81, 81, 81, 81,
+ 81, 81, 81, 81, 81, 81, 81, 81,
+ 81, 81, 81, 81, 81, 81, 81, 81,
+ 81,
+ },
+ { /* Fourth byte table 20. */
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 3, 3, 6, 6, 9, 9, 12,
+ 12, 15, 15, 18, 18, 21, 21, 24,
+ 24, 27, 27, 30, 30, 33, 33, 36,
+ 36, 39, 39, 42, 42, 45, 45, 48,
+ 48, 51, 51, 54, 54, 57, 57, 60,
+ 60, 63, 63, 66, 66, 69, 69, 72,
+ 72, 75, 75, 78, 78, 81, 81, 84,
+ 84, 87, 87, 87, 87, 87, 87, 87,
+ 87, 87, 87, 87, 87, 87, 87, 87,
+ 87, 87, 87, 87, 87, 87, 87, 87,
+ 87, 87, 87, 87, 87, 87, 87, 87,
+ 87, 87, 87, 87, 87, 87, 87, 87,
+ 87, 87, 87, 87, 87, 87, 87, 87,
+ 87, 87, 87, 87, 87, 87, 87, 87,
+ 87, 87, 87, 87, 87, 87, 87, 87,
+ 87, 87, 87, 87, 87, 87, 87, 87,
+ 87,
+ },
+ { /* Fourth byte table 21. */
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 3, 6, 9, 12, 15, 18, 21,
+ 24, 24, 24, 24, 24, 24, 24, 24,
+ 24, 27, 30, 33, 36, 39, 42, 42,
+ 42, 42, 42, 42, 42, 42, 42, 42,
+ 42, 45, 48, 51, 54, 57, 60, 63,
+ 66, 66, 66, 66, 66, 66, 66, 66,
+ 66, 69, 72, 75, 78, 81, 84, 87,
+ 90, 90, 90, 90, 90, 90, 90, 90,
+ 90, 90, 90, 90, 90, 90, 90, 90,
+ 90, 90, 90, 90, 90, 90, 90, 90,
+ 90, 90, 90, 90, 90, 90, 90, 90,
+ 90, 90, 90, 90, 90, 90, 90, 90,
+ 90, 90, 90, 90, 90, 90, 90, 90,
+ 90, 90, 90, 90, 90, 90, 90, 90,
+ 90, 90, 90, 90, 90, 90, 90, 90,
+ 90,
+ },
+ { /* Fourth byte table 22. */
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 3, 6, 9, 12, 15, 18, 18,
+ 18, 18, 18, 18, 18, 18, 18, 18,
+ 18, 18, 21, 21, 24, 24, 27, 27,
+ 30, 30, 30, 30, 30, 30, 30, 30,
+ 30, 33, 36, 39, 42, 45, 48, 51,
+ 54, 54, 54, 54, 54, 54, 54, 54,
+ 54, 54, 54, 54, 54, 54, 54, 54,
+ 54, 54, 54, 54, 54, 54, 54, 54,
+ 54, 54, 54, 54, 54, 54, 54, 54,
+ 54, 54, 54, 54, 54, 54, 54, 54,
+ 54, 54, 54, 54, 54, 54, 54, 54,
+ 54, 54, 54, 54, 54, 54, 54, 54,
+ 54, 54, 54, 54, 54, 54, 54, 54,
+ 54, 54, 54, 54, 54, 54, 54, 54,
+ 54, 54, 54, 54, 54, 54, 54, 54,
+ 54,
+ },
+ { /* Fourth byte table 23. */
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 3, 6, 9, 12, 15, 18, 21,
+ 24, 24, 24, 24, 24, 24, 24, 24,
+ 24, 27, 30, 33, 36, 39, 42, 45,
+ 48, 48, 48, 48, 48, 48, 48, 48,
+ 48, 51, 54, 57, 60, 63, 66, 69,
+ 72, 72, 72, 72, 72, 72, 72, 72,
+ 72, 75, 78, 81, 84, 87, 87, 87,
+ 87, 87, 87, 87, 87, 87, 87, 87,
+ 87, 87, 87, 87, 87, 87, 87, 87,
+ 87, 87, 87, 87, 87, 87, 87, 87,
+ 87, 87, 87, 87, 87, 87, 87, 87,
+ 87, 87, 87, 87, 87, 87, 87, 87,
+ 87, 87, 87, 87, 87, 87, 87, 87,
+ 87, 87, 87, 87, 87, 87, 87, 87,
+ 87, 87, 87, 87, 87, 87, 87, 87,
+ 87,
+ },
+ { /* Fourth byte table 24. */
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 3, 6, 9, 12, 15, 15, 15,
+ 15, 15, 15, 15, 15, 15, 15, 15,
+ 15, 18, 21, 24, 27, 27, 27, 27,
+ 27, 27, 27, 27, 27, 27, 27, 27,
+ 27, 30, 33, 36, 39, 42, 42, 42,
+ 42, 42, 42, 42, 42, 42, 42, 42,
+ 42, 45, 48, 51, 54, 57, 57, 57,
+ 57, 57, 57, 57, 57, 57, 57, 57,
+ 57, 57, 57, 57, 57, 57, 57, 57,
+ 57, 57, 57, 57, 57, 57, 57, 57,
+ 57, 57, 57, 57, 57, 57, 57, 57,
+ 57, 57, 57, 57, 57, 57, 57, 57,
+ 57, 57, 57, 57, 57, 57, 57, 57,
+ 57, 57, 57, 57, 57, 57, 57, 57,
+ 57, 57, 57, 57, 57, 57, 57, 57,
+ 57,
+ },
+ { /* Fourth byte table 25. */
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 2,
+ 2, 2, 2, 3, 5, 5, 5, 5,
+ 5, 5, 5, 8, 8, 8, 8, 8,
+ 8, 8, 8, 8, 8, 8, 8, 8,
+ 8, 8, 8, 8, 8, 8, 8, 8,
+ 8, 8, 8, 8, 8, 8, 8, 8,
+ 8, 8, 8, 8, 8, 8, 8, 8,
+ 8, 8, 8, 8, 8, 8, 8, 8,
+ 8, 8, 8, 8, 8, 8, 8, 8,
+ 8, 8, 8, 8, 8, 8, 8, 8,
+ 8, 8, 8, 8, 8, 8, 8, 8,
+ 8, 8, 8, 8, 8, 8, 8, 8,
+ 8,
+ },
+ { /* Fourth byte table 26. */
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 3, 6, 9, 12, 15, 18, 21,
+ 24, 27, 30, 33, 36, 39, 42, 45,
+ 48, 48, 48, 48, 48, 48, 48, 48,
+ 48, 48, 48, 48, 48, 48, 48, 48,
+ 48, 48, 48, 48, 48, 48, 48, 48,
+ 48, 48, 48, 48, 48, 48, 48, 48,
+ 48, 48, 48, 48, 48, 48, 48, 48,
+ 48, 48, 48, 48, 48, 48, 48, 48,
+ 48, 48, 48, 48, 48, 48, 48, 48,
+ 48, 48, 48, 48, 48, 48, 48, 48,
+ 48, 48, 48, 48, 48, 48, 48, 48,
+ 48, 48, 48, 48, 48, 48, 48, 48,
+ 48,
+ },
+ { /* Fourth byte table 27. */
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 3, 3, 3, 3,
+ 3, 3, 3, 3, 3, 3, 3, 3,
+ 3, 3, 3, 3, 3, 3, 3, 3,
+ 3, 3, 3, 3, 3, 3, 3, 3,
+ 3, 3, 3, 3, 3, 3, 3, 3,
+ 3, 3, 3, 3, 3, 3, 3, 3,
+ 3, 3, 3, 3, 3, 3, 3, 3,
+ 3, 3, 3, 3, 3, 3, 3, 3,
+ 3, 3, 3, 3, 3, 3, 3, 3,
+ 3, 3, 3, 3, 3, 3, 3, 3,
+ 3, 3, 3, 3, 3, 3, 3, 3,
+ 3, 3, 3, 3, 3, 3, 3, 3,
+ 3, 3, 3, 3, 3, 3, 3, 3,
+ 3, 3, 3, 3, 3, 3, 3, 3,
+ 3, 3, 3, 3, 3, 3, 3, 3,
+ 3, 3, 3, 3, 3, 3, 3, 3,
+ 3,
+ },
+ { /* Fourth byte table 28. */
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 3,
+ 6, 9, 12, 15, 18, 21, 24, 27,
+ 30, 30, 30, 30, 30, 30, 30, 30,
+ 30, 30, 30, 30, 30, 30, 30, 30,
+ 30, 30, 30, 30, 30, 30, 30, 30,
+ 30, 30, 30, 30, 30, 30, 30, 30,
+ 30, 30, 30, 30, 30, 30, 30, 30,
+ 30, 30, 30, 30, 30, 30, 30, 30,
+ 30, 30, 30, 30, 30, 30, 30, 30,
+ 30, 30, 30, 30, 30, 30, 30, 30,
+ 30,
+ },
+ { /* Fourth byte table 29. */
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 3, 6, 9, 12, 15, 18, 21,
+ 24, 27, 30, 33, 36, 39, 42, 45,
+ 48, 48, 48, 48, 48, 48, 48, 48,
+ 48, 48, 48, 48, 48, 48, 48, 48,
+ 48, 48, 48, 48, 48, 48, 48, 48,
+ 48, 48, 48, 48, 48, 48, 48, 48,
+ 48, 48, 48, 48, 48, 48, 48, 48,
+ 48, 48, 48, 48, 48, 48, 48, 48,
+ 48, 48, 48, 48, 48, 48, 48, 48,
+ 48, 48, 48, 48, 48, 48, 48, 48,
+ 48, 48, 48, 48, 48, 48, 48, 48,
+ 48, 48, 48, 48, 48, 48, 48, 48,
+ 48, 48, 48, 48, 48, 48, 48, 48,
+ 48, 48, 48, 48, 48, 48, 48, 48,
+ 48, 48, 48, 48, 48, 48, 48, 48,
+ 48, 48, 48, 48, 48, 48, 48, 48,
+ 48,
+ },
+ { /* Fourth byte table 30. */
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 3, 6, 9, 12, 15, 18, 21,
+ 24, 27, 30, 33, 36, 39, 42, 45,
+ 48, 51, 54, 57, 60, 63, 66, 69,
+ 72, 75, 78, 81, 84, 87, 90, 93,
+ 96, 99, 102, 105, 108, 111, 114, 117,
+ 120, 123, 126, 129, 132, 135, 138, 141,
+ 141, 141, 141, 141, 141, 141, 141, 141,
+ 141, 141, 141, 141, 141, 141, 141, 141,
+ 141, 141, 141, 141, 141, 141, 141, 141,
+ 141, 141, 141, 141, 141, 141, 141, 141,
+ 141, 141, 141, 141, 141, 141, 141, 141,
+ 141, 141, 141, 141, 141, 141, 141, 141,
+ 141, 141, 141, 141, 141, 141, 141, 141,
+ 141, 141, 141, 141, 141, 141, 141, 141,
+ 141, 141, 141, 141, 141, 141, 141, 141,
+ 141, 141, 141, 141, 141, 141, 141, 141,
+ 141,
+ },
+ { /* Fourth byte table 31. */
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 3, 3, 5, 8, 10, 10, 10,
+ 13, 13, 16, 16, 19, 19, 19, 19,
+ 19, 19, 19, 19, 19, 19, 22, 22,
+ 22, 22, 22, 22, 22, 22, 22, 22,
+ 22, 22, 22, 22, 22, 22, 22, 22,
+ 22, 22, 22, 22, 22, 22, 22, 22,
+ 22, 22, 22, 22, 22, 22, 22, 22,
+ 22, 22, 22, 22, 22, 22, 22, 22,
+ 22, 22, 22, 22, 22, 22, 22, 22,
+ 22, 22, 22, 22, 22, 22, 22, 22,
+ 22, 22, 22, 22, 22, 22, 22, 22,
+ 22, 22, 22, 22, 22, 22, 22, 22,
+ 22,
+ },
+ { /* Fourth byte table 32. */
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 3, 3, 6, 6, 9, 9, 12,
+ 12, 15, 15, 18, 18, 21, 21, 24,
+ 24, 27, 27, 30, 30, 33, 33, 36,
+ 36, 39, 39, 42, 42, 45, 45, 48,
+ 48, 51, 51, 54, 54, 57, 57, 60,
+ 60, 63, 63, 66, 66, 69, 69, 72,
+ 72, 75, 75, 78, 78, 81, 81, 84,
+ 84, 87, 87, 90, 90, 93, 93, 96,
+ 96, 96, 96, 96, 96, 96, 96, 96,
+ 96, 96, 96, 96, 96, 96, 96, 96,
+ 96, 96, 96, 96, 96, 96, 96, 96,
+ 96, 96, 96, 96, 96, 96, 96, 96,
+ 96, 96, 96, 96, 96, 96, 96, 96,
+ 96, 96, 96, 96, 96, 96, 96, 96,
+ 96, 96, 96, 96, 96, 96, 96, 96,
+ 96, 96, 96, 96, 96, 96, 96, 96,
+ 96,
+ },
+ { /* Fourth byte table 33. */
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 3, 3, 6, 6, 9, 9, 12,
+ 12, 15, 15, 18, 18, 21, 21, 24,
+ 24, 27, 27, 30, 30, 33, 33, 36,
+ 36, 39, 39, 42, 42, 45, 45, 48,
+ 48, 51, 51, 54, 54, 54, 54, 54,
+ 54, 54, 54, 54, 54, 54, 54, 54,
+ 54, 54, 54, 54, 54, 54, 54, 54,
+ 54, 54, 54, 54, 54, 54, 54, 54,
+ 54, 54, 54, 54, 54, 54, 54, 54,
+ 54, 54, 54, 54, 54, 54, 54, 54,
+ 54, 54, 54, 54, 54, 54, 54, 54,
+ 54, 54, 54, 54, 54, 54, 54, 54,
+ 54, 54, 54, 54, 54, 54, 54, 54,
+ 54, 54, 54, 54, 54, 54, 54, 54,
+ 54, 54, 54, 54, 54, 54, 54, 54,
+ 54, 54, 54, 54, 54, 54, 54, 54,
+ 54,
+ },
+ { /* Fourth byte table 34. */
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 3, 6, 9, 12, 15, 18,
+ 21, 24, 27, 30, 33, 36, 39, 42,
+ 45, 48, 51, 54, 57, 60, 63, 66,
+ 69, 72, 75, 78, 78, 78, 78, 78,
+ 78, 78, 78, 78, 78, 78, 78, 78,
+ 78, 78, 78, 78, 78, 78, 78, 78,
+ 78, 78, 78, 78, 78, 78, 78, 78,
+ 78, 78, 78, 78, 78, 78, 78, 78,
+ 78, 78, 78, 78, 78, 78, 78, 78,
+ 78, 78, 78, 78, 78, 78, 78, 78,
+ 78, 78, 78, 78, 78, 78, 78, 78,
+ 78, 78, 78, 78, 78, 78, 78, 78,
+ 78,
+ },
+ { /* Fourth byte table 35. */
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 4, 8, 12, 16, 20, 24, 28,
+ 32, 36, 40, 44, 48, 52, 56, 60,
+ 64, 68, 72, 76, 80, 84, 88, 92,
+ 96, 100, 104, 108, 112, 116, 120, 124,
+ 128, 132, 136, 140, 144, 148, 152, 156,
+ 160, 160, 160, 160, 160, 160, 160, 160,
+ 160, 160, 160, 160, 160, 160, 160, 160,
+ 160, 160, 160, 160, 160, 160, 160, 160,
+ 160, 160, 160, 160, 160, 160, 160, 160,
+ 160, 160, 160, 160, 160, 160, 160, 160,
+ 160, 160, 160, 160, 160, 160, 160, 160,
+ 160, 160, 160, 160, 160, 160, 160, 160,
+ 160, 160, 160, 160, 160, 160, 160, 160,
+ 160, 160, 160, 160, 160, 160, 160, 160,
+ 160, 160, 160, 160, 160, 160, 160, 160,
+ 160, 160, 160, 160, 160, 160, 160, 160,
+ 160,
+ },
+ },
+};
+
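The offset tables end here. The u8_tolower_final_tbl array that follows holds the concatenated UTF-8 byte sequences of the lowercase mappings; its leading [2] dimension presumably selects between the two Unicode versions the file supports, and the per-block offset tables above index into it, with each pair of consecutive offsets bounding one mapped sequence (equal consecutive offsets meaning the slot has no mapping of its own). As a rough illustration of that scheme, here is a minimal sketch; the names, sizes, and index computation are hypothetical demo values, not the file's actual tables or lookup code:

    /*
     * Minimal sketch of the two-level lookup shape used by these tables.
     * Names and sizes are hypothetical demo values, not the actual
     * u8_textprep.c tables or logic.
     */
    #include <string.h>

    typedef unsigned char uchar_t;

    /* Cumulative byte offsets: entry i+1 minus entry i is slot i's length. */
    static const uchar_t demo_offset_tbl[2][9] = {
    	{ 0, 2, 4, 6, 8, 10, 12, 14, 16 },
    	{ 0, 2, 4, 6, 8, 10, 12, 14, 16 },
    };

    /* Concatenated UTF-8 sequences of the mapped code points. */
    static const uchar_t demo_final_tbl[2][16] = {
    	{ 0xC3, 0xA0, 0xC3, 0xA1, 0xC3, 0xA2, 0xC3, 0xA3,
    	  0xC3, 0xA4, 0xC3, 0xA5, 0xC3, 0xA6, 0xC3, 0xA7 },
    	{ 0xC3, 0xA0, 0xC3, 0xA1, 0xC3, 0xA2, 0xC3, 0xA3,
    	  0xC3, 0xA4, 0xC3, 0xA5, 0xC3, 0xA6, 0xC3, 0xA7 },
    };

    /*
     * Copy the mapped sequence for slot `slot` (Unicode version `uv`)
     * into `out` and return its byte length; 0 means the input maps
     * to itself and nothing is copied.
     */
    static size_t
    demo_lookup(int uv, int slot, uchar_t *out)
    {
    	size_t start = demo_offset_tbl[uv][slot];
    	size_t len = demo_offset_tbl[uv][slot + 1] - start;

    	if (len > 0)
    		memcpy(out, &demo_final_tbl[uv][start], len);
    	return (len);
    }

In the real file the slot index is derived from the trailing bytes of the input UTF-8 sequence through the higher-level byte tables, which this sketch omits.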
+static const uchar_t u8_tolower_final_tbl[2][2299] = {
+ {
+ 0xC3, 0xA0, 0xC3, 0xA1, 0xC3, 0xA2, 0xC3, 0xA3,
+ 0xC3, 0xA4, 0xC3, 0xA5, 0xC3, 0xA6, 0xC3, 0xA7,
+ 0xC3, 0xA8, 0xC3, 0xA9, 0xC3, 0xAA, 0xC3, 0xAB,
+ 0xC3, 0xAC, 0xC3, 0xAD, 0xC3, 0xAE, 0xC3, 0xAF,
+ 0xC3, 0xB0, 0xC3, 0xB1, 0xC3, 0xB2, 0xC3, 0xB3,
+ 0xC3, 0xB4, 0xC3, 0xB5, 0xC3, 0xB6, 0xC3, 0xB8,
+ 0xC3, 0xB9, 0xC3, 0xBA, 0xC3, 0xBB, 0xC3, 0xBC,
+ 0xC3, 0xBD, 0xC3, 0xBE, 0xC4, 0x81, 0xC4, 0x83,
+ 0xC4, 0x85, 0xC4, 0x87, 0xC4, 0x89, 0xC4, 0x8B,
+ 0xC4, 0x8D, 0xC4, 0x8F, 0xC4, 0x91, 0xC4, 0x93,
+ 0xC4, 0x95, 0xC4, 0x97, 0xC4, 0x99, 0xC4, 0x9B,
+ 0xC4, 0x9D, 0xC4, 0x9F, 0xC4, 0xA1, 0xC4, 0xA3,
+ 0xC4, 0xA5, 0xC4, 0xA7, 0xC4, 0xA9, 0xC4, 0xAB,
+ 0xC4, 0xAD, 0xC4, 0xAF, 0x69, 0xC4, 0xB3, 0xC4,
+ 0xB5, 0xC4, 0xB7, 0xC4, 0xBA, 0xC4, 0xBC, 0xC4,
+ 0xBE, 0xC5, 0x80, 0xC5, 0x82, 0xC5, 0x84, 0xC5,
+ 0x86, 0xC5, 0x88, 0xC5, 0x8B, 0xC5, 0x8D, 0xC5,
+ 0x8F, 0xC5, 0x91, 0xC5, 0x93, 0xC5, 0x95, 0xC5,
+ 0x97, 0xC5, 0x99, 0xC5, 0x9B, 0xC5, 0x9D, 0xC5,
+ 0x9F, 0xC5, 0xA1, 0xC5, 0xA3, 0xC5, 0xA5, 0xC5,
+ 0xA7, 0xC5, 0xA9, 0xC5, 0xAB, 0xC5, 0xAD, 0xC5,
+ 0xAF, 0xC5, 0xB1, 0xC5, 0xB3, 0xC5, 0xB5, 0xC5,
+ 0xB7, 0xC3, 0xBF, 0xC5, 0xBA, 0xC5, 0xBC, 0xC5,
+ 0xBE, 0xC9, 0x93, 0xC6, 0x83, 0xC6, 0x85, 0xC9,
+ 0x94, 0xC6, 0x88, 0xC9, 0x96, 0xC9, 0x97, 0xC6,
+ 0x8C, 0xC7, 0x9D, 0xC9, 0x99, 0xC9, 0x9B, 0xC6,
+ 0x92, 0xC9, 0xA0, 0xC9, 0xA3, 0xC9, 0xA9, 0xC9,
+ 0xA8, 0xC6, 0x99, 0xC9, 0xAF, 0xC9, 0xB2, 0xC9,
+ 0xB5, 0xC6, 0xA1, 0xC6, 0xA3, 0xC6, 0xA5, 0xCA,
+ 0x80, 0xC6, 0xA8, 0xCA, 0x83, 0xC6, 0xAD, 0xCA,
+ 0x88, 0xC6, 0xB0, 0xCA, 0x8A, 0xCA, 0x8B, 0xC6,
+ 0xB4, 0xC6, 0xB6, 0xCA, 0x92, 0xC6, 0xB9, 0xC6,
+ 0xBD, 0xC7, 0x86, 0xC7, 0x86, 0xC7, 0x89, 0xC7,
+ 0x89, 0xC7, 0x8C, 0xC7, 0x8C, 0xC7, 0x8E, 0xC7,
+ 0x90, 0xC7, 0x92, 0xC7, 0x94, 0xC7, 0x96, 0xC7,
+ 0x98, 0xC7, 0x9A, 0xC7, 0x9C, 0xC7, 0x9F, 0xC7,
+ 0xA1, 0xC7, 0xA3, 0xC7, 0xA5, 0xC7, 0xA7, 0xC7,
+ 0xA9, 0xC7, 0xAB, 0xC7, 0xAD, 0xC7, 0xAF, 0xC7,
+ 0xB3, 0xC7, 0xB3, 0xC7, 0xB5, 0xC6, 0x95, 0xC6,
+ 0xBF, 0xC7, 0xB9, 0xC7, 0xBB, 0xC7, 0xBD, 0xC7,
+ 0xBF, 0xC8, 0x81, 0xC8, 0x83, 0xC8, 0x85, 0xC8,
+ 0x87, 0xC8, 0x89, 0xC8, 0x8B, 0xC8, 0x8D, 0xC8,
+ 0x8F, 0xC8, 0x91, 0xC8, 0x93, 0xC8, 0x95, 0xC8,
+ 0x97, 0xC8, 0x99, 0xC8, 0x9B, 0xC8, 0x9D, 0xC8,
+ 0x9F, 0xC6, 0x9E, 0xC8, 0xA3, 0xC8, 0xA5, 0xC8,
+ 0xA7, 0xC8, 0xA9, 0xC8, 0xAB, 0xC8, 0xAD, 0xC8,
+ 0xAF, 0xC8, 0xB1, 0xC8, 0xB3, 0xCE, 0xAC, 0xCE,
+ 0xAD, 0xCE, 0xAE, 0xCE, 0xAF, 0xCF, 0x8C, 0xCF,
+ 0x8D, 0xCF, 0x8E, 0xCE, 0xB1, 0xCE, 0xB2, 0xCE,
+ 0xB3, 0xCE, 0xB4, 0xCE, 0xB5, 0xCE, 0xB6, 0xCE,
+ 0xB7, 0xCE, 0xB8, 0xCE, 0xB9, 0xCE, 0xBA, 0xCE,
+ 0xBB, 0xCE, 0xBC, 0xCE, 0xBD, 0xCE, 0xBE, 0xCE,
+ 0xBF, 0xCF, 0x80, 0xCF, 0x81, 0xCF, 0x83, 0xCF,
+ 0x84, 0xCF, 0x85, 0xCF, 0x86, 0xCF, 0x87, 0xCF,
+ 0x88, 0xCF, 0x89, 0xCF, 0x8A, 0xCF, 0x8B, 0xCF,
+ 0x99, 0xCF, 0x9B, 0xCF, 0x9D, 0xCF, 0x9F, 0xCF,
+ 0xA1, 0xCF, 0xA3, 0xCF, 0xA5, 0xCF, 0xA7, 0xCF,
+ 0xA9, 0xCF, 0xAB, 0xCF, 0xAD, 0xCF, 0xAF, 0xCE,
+ 0xB8, 0xD1, 0x90, 0xD1, 0x91, 0xD1, 0x92, 0xD1,
+ 0x93, 0xD1, 0x94, 0xD1, 0x95, 0xD1, 0x96, 0xD1,
+ 0x97, 0xD1, 0x98, 0xD1, 0x99, 0xD1, 0x9A, 0xD1,
+ 0x9B, 0xD1, 0x9C, 0xD1, 0x9D, 0xD1, 0x9E, 0xD1,
+ 0x9F, 0xD0, 0xB0, 0xD0, 0xB1, 0xD0, 0xB2, 0xD0,
+ 0xB3, 0xD0, 0xB4, 0xD0, 0xB5, 0xD0, 0xB6, 0xD0,
+ 0xB7, 0xD0, 0xB8, 0xD0, 0xB9, 0xD0, 0xBA, 0xD0,
+ 0xBB, 0xD0, 0xBC, 0xD0, 0xBD, 0xD0, 0xBE, 0xD0,
+ 0xBF, 0xD1, 0x80, 0xD1, 0x81, 0xD1, 0x82, 0xD1,
+ 0x83, 0xD1, 0x84, 0xD1, 0x85, 0xD1, 0x86, 0xD1,
+ 0x87, 0xD1, 0x88, 0xD1, 0x89, 0xD1, 0x8A, 0xD1,
+ 0x8B, 0xD1, 0x8C, 0xD1, 0x8D, 0xD1, 0x8E, 0xD1,
+ 0x8F, 0xD1, 0xA1, 0xD1, 0xA3, 0xD1, 0xA5, 0xD1,
+ 0xA7, 0xD1, 0xA9, 0xD1, 0xAB, 0xD1, 0xAD, 0xD1,
+ 0xAF, 0xD1, 0xB1, 0xD1, 0xB3, 0xD1, 0xB5, 0xD1,
+ 0xB7, 0xD1, 0xB9, 0xD1, 0xBB, 0xD1, 0xBD, 0xD1,
+ 0xBF, 0xD2, 0x81, 0xD2, 0x8B, 0xD2, 0x8D, 0xD2,
+ 0x8F, 0xD2, 0x91, 0xD2, 0x93, 0xD2, 0x95, 0xD2,
+ 0x97, 0xD2, 0x99, 0xD2, 0x9B, 0xD2, 0x9D, 0xD2,
+ 0x9F, 0xD2, 0xA1, 0xD2, 0xA3, 0xD2, 0xA5, 0xD2,
+ 0xA7, 0xD2, 0xA9, 0xD2, 0xAB, 0xD2, 0xAD, 0xD2,
+ 0xAF, 0xD2, 0xB1, 0xD2, 0xB3, 0xD2, 0xB5, 0xD2,
+ 0xB7, 0xD2, 0xB9, 0xD2, 0xBB, 0xD2, 0xBD, 0xD2,
+ 0xBF, 0xD3, 0x82, 0xD3, 0x84, 0xD3, 0x86, 0xD3,
+ 0x88, 0xD3, 0x8A, 0xD3, 0x8C, 0xD3, 0x8E, 0xD3,
+ 0x91, 0xD3, 0x93, 0xD3, 0x95, 0xD3, 0x97, 0xD3,
+ 0x99, 0xD3, 0x9B, 0xD3, 0x9D, 0xD3, 0x9F, 0xD3,
+ 0xA1, 0xD3, 0xA3, 0xD3, 0xA5, 0xD3, 0xA7, 0xD3,
+ 0xA9, 0xD3, 0xAB, 0xD3, 0xAD, 0xD3, 0xAF, 0xD3,
+ 0xB1, 0xD3, 0xB3, 0xD3, 0xB5, 0xD3, 0xB9, 0xD4,
+ 0x81, 0xD4, 0x83, 0xD4, 0x85, 0xD4, 0x87, 0xD4,
+ 0x89, 0xD4, 0x8B, 0xD4, 0x8D, 0xD4, 0x8F, 0xD5,
+ 0xA1, 0xD5, 0xA2, 0xD5, 0xA3, 0xD5, 0xA4, 0xD5,
+ 0xA5, 0xD5, 0xA6, 0xD5, 0xA7, 0xD5, 0xA8, 0xD5,
+ 0xA9, 0xD5, 0xAA, 0xD5, 0xAB, 0xD5, 0xAC, 0xD5,
+ 0xAD, 0xD5, 0xAE, 0xD5, 0xAF, 0xD5, 0xB0, 0xD5,
+ 0xB1, 0xD5, 0xB2, 0xD5, 0xB3, 0xD5, 0xB4, 0xD5,
+ 0xB5, 0xD5, 0xB6, 0xD5, 0xB7, 0xD5, 0xB8, 0xD5,
+ 0xB9, 0xD5, 0xBA, 0xD5, 0xBB, 0xD5, 0xBC, 0xD5,
+ 0xBD, 0xD5, 0xBE, 0xD5, 0xBF, 0xD6, 0x80, 0xD6,
+ 0x81, 0xD6, 0x82, 0xD6, 0x83, 0xD6, 0x84, 0xD6,
+ 0x85, 0xD6, 0x86, 0xE1, 0xB8, 0x81, 0xE1, 0xB8,
+ 0x83, 0xE1, 0xB8, 0x85, 0xE1, 0xB8, 0x87, 0xE1,
+ 0xB8, 0x89, 0xE1, 0xB8, 0x8B, 0xE1, 0xB8, 0x8D,
+ 0xE1, 0xB8, 0x8F, 0xE1, 0xB8, 0x91, 0xE1, 0xB8,
+ 0x93, 0xE1, 0xB8, 0x95, 0xE1, 0xB8, 0x97, 0xE1,
+ 0xB8, 0x99, 0xE1, 0xB8, 0x9B, 0xE1, 0xB8, 0x9D,
+ 0xE1, 0xB8, 0x9F, 0xE1, 0xB8, 0xA1, 0xE1, 0xB8,
+ 0xA3, 0xE1, 0xB8, 0xA5, 0xE1, 0xB8, 0xA7, 0xE1,
+ 0xB8, 0xA9, 0xE1, 0xB8, 0xAB, 0xE1, 0xB8, 0xAD,
+ 0xE1, 0xB8, 0xAF, 0xE1, 0xB8, 0xB1, 0xE1, 0xB8,
+ 0xB3, 0xE1, 0xB8, 0xB5, 0xE1, 0xB8, 0xB7, 0xE1,
+ 0xB8, 0xB9, 0xE1, 0xB8, 0xBB, 0xE1, 0xB8, 0xBD,
+ 0xE1, 0xB8, 0xBF, 0xE1, 0xB9, 0x81, 0xE1, 0xB9,
+ 0x83, 0xE1, 0xB9, 0x85, 0xE1, 0xB9, 0x87, 0xE1,
+ 0xB9, 0x89, 0xE1, 0xB9, 0x8B, 0xE1, 0xB9, 0x8D,
+ 0xE1, 0xB9, 0x8F, 0xE1, 0xB9, 0x91, 0xE1, 0xB9,
+ 0x93, 0xE1, 0xB9, 0x95, 0xE1, 0xB9, 0x97, 0xE1,
+ 0xB9, 0x99, 0xE1, 0xB9, 0x9B, 0xE1, 0xB9, 0x9D,
+ 0xE1, 0xB9, 0x9F, 0xE1, 0xB9, 0xA1, 0xE1, 0xB9,
+ 0xA3, 0xE1, 0xB9, 0xA5, 0xE1, 0xB9, 0xA7, 0xE1,
+ 0xB9, 0xA9, 0xE1, 0xB9, 0xAB, 0xE1, 0xB9, 0xAD,
+ 0xE1, 0xB9, 0xAF, 0xE1, 0xB9, 0xB1, 0xE1, 0xB9,
+ 0xB3, 0xE1, 0xB9, 0xB5, 0xE1, 0xB9, 0xB7, 0xE1,
+ 0xB9, 0xB9, 0xE1, 0xB9, 0xBB, 0xE1, 0xB9, 0xBD,
+ 0xE1, 0xB9, 0xBF, 0xE1, 0xBA, 0x81, 0xE1, 0xBA,
+ 0x83, 0xE1, 0xBA, 0x85, 0xE1, 0xBA, 0x87, 0xE1,
+ 0xBA, 0x89, 0xE1, 0xBA, 0x8B, 0xE1, 0xBA, 0x8D,
+ 0xE1, 0xBA, 0x8F, 0xE1, 0xBA, 0x91, 0xE1, 0xBA,
+ 0x93, 0xE1, 0xBA, 0x95, 0xE1, 0xBA, 0xA1, 0xE1,
+ 0xBA, 0xA3, 0xE1, 0xBA, 0xA5, 0xE1, 0xBA, 0xA7,
+ 0xE1, 0xBA, 0xA9, 0xE1, 0xBA, 0xAB, 0xE1, 0xBA,
+ 0xAD, 0xE1, 0xBA, 0xAF, 0xE1, 0xBA, 0xB1, 0xE1,
+ 0xBA, 0xB3, 0xE1, 0xBA, 0xB5, 0xE1, 0xBA, 0xB7,
+ 0xE1, 0xBA, 0xB9, 0xE1, 0xBA, 0xBB, 0xE1, 0xBA,
+ 0xBD, 0xE1, 0xBA, 0xBF, 0xE1, 0xBB, 0x81, 0xE1,
+ 0xBB, 0x83, 0xE1, 0xBB, 0x85, 0xE1, 0xBB, 0x87,
+ 0xE1, 0xBB, 0x89, 0xE1, 0xBB, 0x8B, 0xE1, 0xBB,
+ 0x8D, 0xE1, 0xBB, 0x8F, 0xE1, 0xBB, 0x91, 0xE1,
+ 0xBB, 0x93, 0xE1, 0xBB, 0x95, 0xE1, 0xBB, 0x97,
+ 0xE1, 0xBB, 0x99, 0xE1, 0xBB, 0x9B, 0xE1, 0xBB,
+ 0x9D, 0xE1, 0xBB, 0x9F, 0xE1, 0xBB, 0xA1, 0xE1,
+ 0xBB, 0xA3, 0xE1, 0xBB, 0xA5, 0xE1, 0xBB, 0xA7,
+ 0xE1, 0xBB, 0xA9, 0xE1, 0xBB, 0xAB, 0xE1, 0xBB,
+ 0xAD, 0xE1, 0xBB, 0xAF, 0xE1, 0xBB, 0xB1, 0xE1,
+ 0xBB, 0xB3, 0xE1, 0xBB, 0xB5, 0xE1, 0xBB, 0xB7,
+ 0xE1, 0xBB, 0xB9, 0xE1, 0xBC, 0x80, 0xE1, 0xBC,
+ 0x81, 0xE1, 0xBC, 0x82, 0xE1, 0xBC, 0x83, 0xE1,
+ 0xBC, 0x84, 0xE1, 0xBC, 0x85, 0xE1, 0xBC, 0x86,
+ 0xE1, 0xBC, 0x87, 0xE1, 0xBC, 0x90, 0xE1, 0xBC,
+ 0x91, 0xE1, 0xBC, 0x92, 0xE1, 0xBC, 0x93, 0xE1,
+ 0xBC, 0x94, 0xE1, 0xBC, 0x95, 0xE1, 0xBC, 0xA0,
+ 0xE1, 0xBC, 0xA1, 0xE1, 0xBC, 0xA2, 0xE1, 0xBC,
+ 0xA3, 0xE1, 0xBC, 0xA4, 0xE1, 0xBC, 0xA5, 0xE1,
+ 0xBC, 0xA6, 0xE1, 0xBC, 0xA7, 0xE1, 0xBC, 0xB0,
+ 0xE1, 0xBC, 0xB1, 0xE1, 0xBC, 0xB2, 0xE1, 0xBC,
+ 0xB3, 0xE1, 0xBC, 0xB4, 0xE1, 0xBC, 0xB5, 0xE1,
+ 0xBC, 0xB6, 0xE1, 0xBC, 0xB7, 0xE1, 0xBD, 0x80,
+ 0xE1, 0xBD, 0x81, 0xE1, 0xBD, 0x82, 0xE1, 0xBD,
+ 0x83, 0xE1, 0xBD, 0x84, 0xE1, 0xBD, 0x85, 0xE1,
+ 0xBD, 0x91, 0xE1, 0xBD, 0x93, 0xE1, 0xBD, 0x95,
+ 0xE1, 0xBD, 0x97, 0xE1, 0xBD, 0xA0, 0xE1, 0xBD,
+ 0xA1, 0xE1, 0xBD, 0xA2, 0xE1, 0xBD, 0xA3, 0xE1,
+ 0xBD, 0xA4, 0xE1, 0xBD, 0xA5, 0xE1, 0xBD, 0xA6,
+ 0xE1, 0xBD, 0xA7, 0xE1, 0xBE, 0x80, 0xE1, 0xBE,
+ 0x81, 0xE1, 0xBE, 0x82, 0xE1, 0xBE, 0x83, 0xE1,
+ 0xBE, 0x84, 0xE1, 0xBE, 0x85, 0xE1, 0xBE, 0x86,
+ 0xE1, 0xBE, 0x87, 0xE1, 0xBE, 0x90, 0xE1, 0xBE,
+ 0x91, 0xE1, 0xBE, 0x92, 0xE1, 0xBE, 0x93, 0xE1,
+ 0xBE, 0x94, 0xE1, 0xBE, 0x95, 0xE1, 0xBE, 0x96,
+ 0xE1, 0xBE, 0x97, 0xE1, 0xBE, 0xA0, 0xE1, 0xBE,
+ 0xA1, 0xE1, 0xBE, 0xA2, 0xE1, 0xBE, 0xA3, 0xE1,
+ 0xBE, 0xA4, 0xE1, 0xBE, 0xA5, 0xE1, 0xBE, 0xA6,
+ 0xE1, 0xBE, 0xA7, 0xE1, 0xBE, 0xB0, 0xE1, 0xBE,
+ 0xB1, 0xE1, 0xBD, 0xB0, 0xE1, 0xBD, 0xB1, 0xE1,
+ 0xBE, 0xB3, 0xE1, 0xBD, 0xB2, 0xE1, 0xBD, 0xB3,
+ 0xE1, 0xBD, 0xB4, 0xE1, 0xBD, 0xB5, 0xE1, 0xBF,
+ 0x83, 0xE1, 0xBF, 0x90, 0xE1, 0xBF, 0x91, 0xE1,
+ 0xBD, 0xB6, 0xE1, 0xBD, 0xB7, 0xE1, 0xBF, 0xA0,
+ 0xE1, 0xBF, 0xA1, 0xE1, 0xBD, 0xBA, 0xE1, 0xBD,
+ 0xBB, 0xE1, 0xBF, 0xA5, 0xE1, 0xBD, 0xB8, 0xE1,
+ 0xBD, 0xB9, 0xE1, 0xBD, 0xBC, 0xE1, 0xBD, 0xBD,
+ 0xE1, 0xBF, 0xB3, 0xCF, 0x89, 0x6B, 0xC3, 0xA5,
+ 0xE2, 0x85, 0xB0, 0xE2, 0x85, 0xB1, 0xE2, 0x85,
+ 0xB2, 0xE2, 0x85, 0xB3, 0xE2, 0x85, 0xB4, 0xE2,
+ 0x85, 0xB5, 0xE2, 0x85, 0xB6, 0xE2, 0x85, 0xB7,
+ 0xE2, 0x85, 0xB8, 0xE2, 0x85, 0xB9, 0xE2, 0x85,
+ 0xBA, 0xE2, 0x85, 0xBB, 0xE2, 0x85, 0xBC, 0xE2,
+ 0x85, 0xBD, 0xE2, 0x85, 0xBE, 0xE2, 0x85, 0xBF,
+ 0xE2, 0x93, 0x90, 0xE2, 0x93, 0x91, 0xE2, 0x93,
+ 0x92, 0xE2, 0x93, 0x93, 0xE2, 0x93, 0x94, 0xE2,
+ 0x93, 0x95, 0xE2, 0x93, 0x96, 0xE2, 0x93, 0x97,
+ 0xE2, 0x93, 0x98, 0xE2, 0x93, 0x99, 0xE2, 0x93,
+ 0x9A, 0xE2, 0x93, 0x9B, 0xE2, 0x93, 0x9C, 0xE2,
+ 0x93, 0x9D, 0xE2, 0x93, 0x9E, 0xE2, 0x93, 0x9F,
+ 0xE2, 0x93, 0xA0, 0xE2, 0x93, 0xA1, 0xE2, 0x93,
+ 0xA2, 0xE2, 0x93, 0xA3, 0xE2, 0x93, 0xA4, 0xE2,
+ 0x93, 0xA5, 0xE2, 0x93, 0xA6, 0xE2, 0x93, 0xA7,
+ 0xE2, 0x93, 0xA8, 0xE2, 0x93, 0xA9, 0xEF, 0xBD,
+ 0x81, 0xEF, 0xBD, 0x82, 0xEF, 0xBD, 0x83, 0xEF,
+ 0xBD, 0x84, 0xEF, 0xBD, 0x85, 0xEF, 0xBD, 0x86,
+ 0xEF, 0xBD, 0x87, 0xEF, 0xBD, 0x88, 0xEF, 0xBD,
+ 0x89, 0xEF, 0xBD, 0x8A, 0xEF, 0xBD, 0x8B, 0xEF,
+ 0xBD, 0x8C, 0xEF, 0xBD, 0x8D, 0xEF, 0xBD, 0x8E,
+ 0xEF, 0xBD, 0x8F, 0xEF, 0xBD, 0x90, 0xEF, 0xBD,
+ 0x91, 0xEF, 0xBD, 0x92, 0xEF, 0xBD, 0x93, 0xEF,
+ 0xBD, 0x94, 0xEF, 0xBD, 0x95, 0xEF, 0xBD, 0x96,
+ 0xEF, 0xBD, 0x97, 0xEF, 0xBD, 0x98, 0xEF, 0xBD,
+ 0x99, 0xEF, 0xBD, 0x9A, 0xF0, 0x90, 0x90, 0xA8,
+ 0xF0, 0x90, 0x90, 0xA9, 0xF0, 0x90, 0x90, 0xAA,
+ 0xF0, 0x90, 0x90, 0xAB, 0xF0, 0x90, 0x90, 0xAC,
+ 0xF0, 0x90, 0x90, 0xAD, 0xF0, 0x90, 0x90, 0xAE,
+ 0xF0, 0x90, 0x90, 0xAF, 0xF0, 0x90, 0x90, 0xB0,
+ 0xF0, 0x90, 0x90, 0xB1, 0xF0, 0x90, 0x90, 0xB2,
+ 0xF0, 0x90, 0x90, 0xB3, 0xF0, 0x90, 0x90, 0xB4,
+ 0xF0, 0x90, 0x90, 0xB5, 0xF0, 0x90, 0x90, 0xB6,
+ 0xF0, 0x90, 0x90, 0xB7, 0xF0, 0x90, 0x90, 0xB8,
+ 0xF0, 0x90, 0x90, 0xB9, 0xF0, 0x90, 0x90, 0xBA,
+ 0xF0, 0x90, 0x90, 0xBB, 0xF0, 0x90, 0x90, 0xBC,
+ 0xF0, 0x90, 0x90, 0xBD, 0xF0, 0x90, 0x90, 0xBE,
+ 0xF0, 0x90, 0x90, 0xBF, 0xF0, 0x90, 0x91, 0x80,
+ 0xF0, 0x90, 0x91, 0x81, 0xF0, 0x90, 0x91, 0x82,
+ 0xF0, 0x90, 0x91, 0x83, 0xF0, 0x90, 0x91, 0x84,
+ 0xF0, 0x90, 0x91, 0x85, 0xF0, 0x90, 0x91, 0x86,
+ 0xF0, 0x90, 0x91, 0x87, 0xF0, 0x90, 0x91, 0x88,
+ 0xF0, 0x90, 0x91, 0x89, 0xF0, 0x90, 0x91, 0x8A,
+ 0xF0, 0x90, 0x91, 0x8B, 0xF0, 0x90, 0x91, 0x8C,
+ 0xF0, 0x90, 0x91, 0x8D, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0,
+ },
+ {
+ 0xC3, 0xA0, 0xC3, 0xA1, 0xC3, 0xA2, 0xC3, 0xA3,
+ 0xC3, 0xA4, 0xC3, 0xA5, 0xC3, 0xA6, 0xC3, 0xA7,
+ 0xC3, 0xA8, 0xC3, 0xA9, 0xC3, 0xAA, 0xC3, 0xAB,
+ 0xC3, 0xAC, 0xC3, 0xAD, 0xC3, 0xAE, 0xC3, 0xAF,
+ 0xC3, 0xB0, 0xC3, 0xB1, 0xC3, 0xB2, 0xC3, 0xB3,
+ 0xC3, 0xB4, 0xC3, 0xB5, 0xC3, 0xB6, 0xC3, 0xB8,
+ 0xC3, 0xB9, 0xC3, 0xBA, 0xC3, 0xBB, 0xC3, 0xBC,
+ 0xC3, 0xBD, 0xC3, 0xBE, 0xC4, 0x81, 0xC4, 0x83,
+ 0xC4, 0x85, 0xC4, 0x87, 0xC4, 0x89, 0xC4, 0x8B,
+ 0xC4, 0x8D, 0xC4, 0x8F, 0xC4, 0x91, 0xC4, 0x93,
+ 0xC4, 0x95, 0xC4, 0x97, 0xC4, 0x99, 0xC4, 0x9B,
+ 0xC4, 0x9D, 0xC4, 0x9F, 0xC4, 0xA1, 0xC4, 0xA3,
+ 0xC4, 0xA5, 0xC4, 0xA7, 0xC4, 0xA9, 0xC4, 0xAB,
+ 0xC4, 0xAD, 0xC4, 0xAF, 0x69, 0xC4, 0xB3, 0xC4,
+ 0xB5, 0xC4, 0xB7, 0xC4, 0xBA, 0xC4, 0xBC, 0xC4,
+ 0xBE, 0xC5, 0x80, 0xC5, 0x82, 0xC5, 0x84, 0xC5,
+ 0x86, 0xC5, 0x88, 0xC5, 0x8B, 0xC5, 0x8D, 0xC5,
+ 0x8F, 0xC5, 0x91, 0xC5, 0x93, 0xC5, 0x95, 0xC5,
+ 0x97, 0xC5, 0x99, 0xC5, 0x9B, 0xC5, 0x9D, 0xC5,
+ 0x9F, 0xC5, 0xA1, 0xC5, 0xA3, 0xC5, 0xA5, 0xC5,
+ 0xA7, 0xC5, 0xA9, 0xC5, 0xAB, 0xC5, 0xAD, 0xC5,
+ 0xAF, 0xC5, 0xB1, 0xC5, 0xB3, 0xC5, 0xB5, 0xC5,
+ 0xB7, 0xC3, 0xBF, 0xC5, 0xBA, 0xC5, 0xBC, 0xC5,
+ 0xBE, 0xC9, 0x93, 0xC6, 0x83, 0xC6, 0x85, 0xC9,
+ 0x94, 0xC6, 0x88, 0xC9, 0x96, 0xC9, 0x97, 0xC6,
+ 0x8C, 0xC7, 0x9D, 0xC9, 0x99, 0xC9, 0x9B, 0xC6,
+ 0x92, 0xC9, 0xA0, 0xC9, 0xA3, 0xC9, 0xA9, 0xC9,
+ 0xA8, 0xC6, 0x99, 0xC9, 0xAF, 0xC9, 0xB2, 0xC9,
+ 0xB5, 0xC6, 0xA1, 0xC6, 0xA3, 0xC6, 0xA5, 0xCA,
+ 0x80, 0xC6, 0xA8, 0xCA, 0x83, 0xC6, 0xAD, 0xCA,
+ 0x88, 0xC6, 0xB0, 0xCA, 0x8A, 0xCA, 0x8B, 0xC6,
+ 0xB4, 0xC6, 0xB6, 0xCA, 0x92, 0xC6, 0xB9, 0xC6,
+ 0xBD, 0xC7, 0x86, 0xC7, 0x86, 0xC7, 0x89, 0xC7,
+ 0x89, 0xC7, 0x8C, 0xC7, 0x8C, 0xC7, 0x8E, 0xC7,
+ 0x90, 0xC7, 0x92, 0xC7, 0x94, 0xC7, 0x96, 0xC7,
+ 0x98, 0xC7, 0x9A, 0xC7, 0x9C, 0xC7, 0x9F, 0xC7,
+ 0xA1, 0xC7, 0xA3, 0xC7, 0xA5, 0xC7, 0xA7, 0xC7,
+ 0xA9, 0xC7, 0xAB, 0xC7, 0xAD, 0xC7, 0xAF, 0xC7,
+ 0xB3, 0xC7, 0xB3, 0xC7, 0xB5, 0xC6, 0x95, 0xC6,
+ 0xBF, 0xC7, 0xB9, 0xC7, 0xBB, 0xC7, 0xBD, 0xC7,
+ 0xBF, 0xC8, 0x81, 0xC8, 0x83, 0xC8, 0x85, 0xC8,
+ 0x87, 0xC8, 0x89, 0xC8, 0x8B, 0xC8, 0x8D, 0xC8,
+ 0x8F, 0xC8, 0x91, 0xC8, 0x93, 0xC8, 0x95, 0xC8,
+ 0x97, 0xC8, 0x99, 0xC8, 0x9B, 0xC8, 0x9D, 0xC8,
+ 0x9F, 0xC6, 0x9E, 0xC8, 0xA3, 0xC8, 0xA5, 0xC8,
+ 0xA7, 0xC8, 0xA9, 0xC8, 0xAB, 0xC8, 0xAD, 0xC8,
+ 0xAF, 0xC8, 0xB1, 0xC8, 0xB3, 0xE2, 0xB1, 0xA5,
+ 0xC8, 0xBC, 0xC6, 0x9A, 0xE2, 0xB1, 0xA6, 0xC9,
+ 0x82, 0xC6, 0x80, 0xCA, 0x89, 0xCA, 0x8C, 0xC9,
+ 0x87, 0xC9, 0x89, 0xC9, 0x8B, 0xC9, 0x8D, 0xC9,
+ 0x8F, 0xCE, 0xAC, 0xCE, 0xAD, 0xCE, 0xAE, 0xCE,
+ 0xAF, 0xCF, 0x8C, 0xCF, 0x8D, 0xCF, 0x8E, 0xCE,
+ 0xB1, 0xCE, 0xB2, 0xCE, 0xB3, 0xCE, 0xB4, 0xCE,
+ 0xB5, 0xCE, 0xB6, 0xCE, 0xB7, 0xCE, 0xB8, 0xCE,
+ 0xB9, 0xCE, 0xBA, 0xCE, 0xBB, 0xCE, 0xBC, 0xCE,
+ 0xBD, 0xCE, 0xBE, 0xCE, 0xBF, 0xCF, 0x80, 0xCF,
+ 0x81, 0xCF, 0x83, 0xCF, 0x84, 0xCF, 0x85, 0xCF,
+ 0x86, 0xCF, 0x87, 0xCF, 0x88, 0xCF, 0x89, 0xCF,
+ 0x8A, 0xCF, 0x8B, 0xCF, 0x99, 0xCF, 0x9B, 0xCF,
+ 0x9D, 0xCF, 0x9F, 0xCF, 0xA1, 0xCF, 0xA3, 0xCF,
+ 0xA5, 0xCF, 0xA7, 0xCF, 0xA9, 0xCF, 0xAB, 0xCF,
+ 0xAD, 0xCF, 0xAF, 0xCE, 0xB8, 0xCF, 0xB8, 0xCF,
+ 0xB2, 0xCF, 0xBB, 0xCD, 0xBB, 0xCD, 0xBC, 0xCD,
+ 0xBD, 0xD1, 0x90, 0xD1, 0x91, 0xD1, 0x92, 0xD1,
+ 0x93, 0xD1, 0x94, 0xD1, 0x95, 0xD1, 0x96, 0xD1,
+ 0x97, 0xD1, 0x98, 0xD1, 0x99, 0xD1, 0x9A, 0xD1,
+ 0x9B, 0xD1, 0x9C, 0xD1, 0x9D, 0xD1, 0x9E, 0xD1,
+ 0x9F, 0xD0, 0xB0, 0xD0, 0xB1, 0xD0, 0xB2, 0xD0,
+ 0xB3, 0xD0, 0xB4, 0xD0, 0xB5, 0xD0, 0xB6, 0xD0,
+ 0xB7, 0xD0, 0xB8, 0xD0, 0xB9, 0xD0, 0xBA, 0xD0,
+ 0xBB, 0xD0, 0xBC, 0xD0, 0xBD, 0xD0, 0xBE, 0xD0,
+ 0xBF, 0xD1, 0x80, 0xD1, 0x81, 0xD1, 0x82, 0xD1,
+ 0x83, 0xD1, 0x84, 0xD1, 0x85, 0xD1, 0x86, 0xD1,
+ 0x87, 0xD1, 0x88, 0xD1, 0x89, 0xD1, 0x8A, 0xD1,
+ 0x8B, 0xD1, 0x8C, 0xD1, 0x8D, 0xD1, 0x8E, 0xD1,
+ 0x8F, 0xD1, 0xA1, 0xD1, 0xA3, 0xD1, 0xA5, 0xD1,
+ 0xA7, 0xD1, 0xA9, 0xD1, 0xAB, 0xD1, 0xAD, 0xD1,
+ 0xAF, 0xD1, 0xB1, 0xD1, 0xB3, 0xD1, 0xB5, 0xD1,
+ 0xB7, 0xD1, 0xB9, 0xD1, 0xBB, 0xD1, 0xBD, 0xD1,
+ 0xBF, 0xD2, 0x81, 0xD2, 0x8B, 0xD2, 0x8D, 0xD2,
+ 0x8F, 0xD2, 0x91, 0xD2, 0x93, 0xD2, 0x95, 0xD2,
+ 0x97, 0xD2, 0x99, 0xD2, 0x9B, 0xD2, 0x9D, 0xD2,
+ 0x9F, 0xD2, 0xA1, 0xD2, 0xA3, 0xD2, 0xA5, 0xD2,
+ 0xA7, 0xD2, 0xA9, 0xD2, 0xAB, 0xD2, 0xAD, 0xD2,
+ 0xAF, 0xD2, 0xB1, 0xD2, 0xB3, 0xD2, 0xB5, 0xD2,
+ 0xB7, 0xD2, 0xB9, 0xD2, 0xBB, 0xD2, 0xBD, 0xD2,
+ 0xBF, 0xD3, 0x8F, 0xD3, 0x82, 0xD3, 0x84, 0xD3,
+ 0x86, 0xD3, 0x88, 0xD3, 0x8A, 0xD3, 0x8C, 0xD3,
+ 0x8E, 0xD3, 0x91, 0xD3, 0x93, 0xD3, 0x95, 0xD3,
+ 0x97, 0xD3, 0x99, 0xD3, 0x9B, 0xD3, 0x9D, 0xD3,
+ 0x9F, 0xD3, 0xA1, 0xD3, 0xA3, 0xD3, 0xA5, 0xD3,
+ 0xA7, 0xD3, 0xA9, 0xD3, 0xAB, 0xD3, 0xAD, 0xD3,
+ 0xAF, 0xD3, 0xB1, 0xD3, 0xB3, 0xD3, 0xB5, 0xD3,
+ 0xB7, 0xD3, 0xB9, 0xD3, 0xBB, 0xD3, 0xBD, 0xD3,
+ 0xBF, 0xD4, 0x81, 0xD4, 0x83, 0xD4, 0x85, 0xD4,
+ 0x87, 0xD4, 0x89, 0xD4, 0x8B, 0xD4, 0x8D, 0xD4,
+ 0x8F, 0xD4, 0x91, 0xD4, 0x93, 0xD5, 0xA1, 0xD5,
+ 0xA2, 0xD5, 0xA3, 0xD5, 0xA4, 0xD5, 0xA5, 0xD5,
+ 0xA6, 0xD5, 0xA7, 0xD5, 0xA8, 0xD5, 0xA9, 0xD5,
+ 0xAA, 0xD5, 0xAB, 0xD5, 0xAC, 0xD5, 0xAD, 0xD5,
+ 0xAE, 0xD5, 0xAF, 0xD5, 0xB0, 0xD5, 0xB1, 0xD5,
+ 0xB2, 0xD5, 0xB3, 0xD5, 0xB4, 0xD5, 0xB5, 0xD5,
+ 0xB6, 0xD5, 0xB7, 0xD5, 0xB8, 0xD5, 0xB9, 0xD5,
+ 0xBA, 0xD5, 0xBB, 0xD5, 0xBC, 0xD5, 0xBD, 0xD5,
+ 0xBE, 0xD5, 0xBF, 0xD6, 0x80, 0xD6, 0x81, 0xD6,
+ 0x82, 0xD6, 0x83, 0xD6, 0x84, 0xD6, 0x85, 0xD6,
+ 0x86, 0xE2, 0xB4, 0x80, 0xE2, 0xB4, 0x81, 0xE2,
+ 0xB4, 0x82, 0xE2, 0xB4, 0x83, 0xE2, 0xB4, 0x84,
+ 0xE2, 0xB4, 0x85, 0xE2, 0xB4, 0x86, 0xE2, 0xB4,
+ 0x87, 0xE2, 0xB4, 0x88, 0xE2, 0xB4, 0x89, 0xE2,
+ 0xB4, 0x8A, 0xE2, 0xB4, 0x8B, 0xE2, 0xB4, 0x8C,
+ 0xE2, 0xB4, 0x8D, 0xE2, 0xB4, 0x8E, 0xE2, 0xB4,
+ 0x8F, 0xE2, 0xB4, 0x90, 0xE2, 0xB4, 0x91, 0xE2,
+ 0xB4, 0x92, 0xE2, 0xB4, 0x93, 0xE2, 0xB4, 0x94,
+ 0xE2, 0xB4, 0x95, 0xE2, 0xB4, 0x96, 0xE2, 0xB4,
+ 0x97, 0xE2, 0xB4, 0x98, 0xE2, 0xB4, 0x99, 0xE2,
+ 0xB4, 0x9A, 0xE2, 0xB4, 0x9B, 0xE2, 0xB4, 0x9C,
+ 0xE2, 0xB4, 0x9D, 0xE2, 0xB4, 0x9E, 0xE2, 0xB4,
+ 0x9F, 0xE2, 0xB4, 0xA0, 0xE2, 0xB4, 0xA1, 0xE2,
+ 0xB4, 0xA2, 0xE2, 0xB4, 0xA3, 0xE2, 0xB4, 0xA4,
+ 0xE2, 0xB4, 0xA5, 0xE1, 0xB8, 0x81, 0xE1, 0xB8,
+ 0x83, 0xE1, 0xB8, 0x85, 0xE1, 0xB8, 0x87, 0xE1,
+ 0xB8, 0x89, 0xE1, 0xB8, 0x8B, 0xE1, 0xB8, 0x8D,
+ 0xE1, 0xB8, 0x8F, 0xE1, 0xB8, 0x91, 0xE1, 0xB8,
+ 0x93, 0xE1, 0xB8, 0x95, 0xE1, 0xB8, 0x97, 0xE1,
+ 0xB8, 0x99, 0xE1, 0xB8, 0x9B, 0xE1, 0xB8, 0x9D,
+ 0xE1, 0xB8, 0x9F, 0xE1, 0xB8, 0xA1, 0xE1, 0xB8,
+ 0xA3, 0xE1, 0xB8, 0xA5, 0xE1, 0xB8, 0xA7, 0xE1,
+ 0xB8, 0xA9, 0xE1, 0xB8, 0xAB, 0xE1, 0xB8, 0xAD,
+ 0xE1, 0xB8, 0xAF, 0xE1, 0xB8, 0xB1, 0xE1, 0xB8,
+ 0xB3, 0xE1, 0xB8, 0xB5, 0xE1, 0xB8, 0xB7, 0xE1,
+ 0xB8, 0xB9, 0xE1, 0xB8, 0xBB, 0xE1, 0xB8, 0xBD,
+ 0xE1, 0xB8, 0xBF, 0xE1, 0xB9, 0x81, 0xE1, 0xB9,
+ 0x83, 0xE1, 0xB9, 0x85, 0xE1, 0xB9, 0x87, 0xE1,
+ 0xB9, 0x89, 0xE1, 0xB9, 0x8B, 0xE1, 0xB9, 0x8D,
+ 0xE1, 0xB9, 0x8F, 0xE1, 0xB9, 0x91, 0xE1, 0xB9,
+ 0x93, 0xE1, 0xB9, 0x95, 0xE1, 0xB9, 0x97, 0xE1,
+ 0xB9, 0x99, 0xE1, 0xB9, 0x9B, 0xE1, 0xB9, 0x9D,
+ 0xE1, 0xB9, 0x9F, 0xE1, 0xB9, 0xA1, 0xE1, 0xB9,
+ 0xA3, 0xE1, 0xB9, 0xA5, 0xE1, 0xB9, 0xA7, 0xE1,
+ 0xB9, 0xA9, 0xE1, 0xB9, 0xAB, 0xE1, 0xB9, 0xAD,
+ 0xE1, 0xB9, 0xAF, 0xE1, 0xB9, 0xB1, 0xE1, 0xB9,
+ 0xB3, 0xE1, 0xB9, 0xB5, 0xE1, 0xB9, 0xB7, 0xE1,
+ 0xB9, 0xB9, 0xE1, 0xB9, 0xBB, 0xE1, 0xB9, 0xBD,
+ 0xE1, 0xB9, 0xBF, 0xE1, 0xBA, 0x81, 0xE1, 0xBA,
+ 0x83, 0xE1, 0xBA, 0x85, 0xE1, 0xBA, 0x87, 0xE1,
+ 0xBA, 0x89, 0xE1, 0xBA, 0x8B, 0xE1, 0xBA, 0x8D,
+ 0xE1, 0xBA, 0x8F, 0xE1, 0xBA, 0x91, 0xE1, 0xBA,
+ 0x93, 0xE1, 0xBA, 0x95, 0xE1, 0xBA, 0xA1, 0xE1,
+ 0xBA, 0xA3, 0xE1, 0xBA, 0xA5, 0xE1, 0xBA, 0xA7,
+ 0xE1, 0xBA, 0xA9, 0xE1, 0xBA, 0xAB, 0xE1, 0xBA,
+ 0xAD, 0xE1, 0xBA, 0xAF, 0xE1, 0xBA, 0xB1, 0xE1,
+ 0xBA, 0xB3, 0xE1, 0xBA, 0xB5, 0xE1, 0xBA, 0xB7,
+ 0xE1, 0xBA, 0xB9, 0xE1, 0xBA, 0xBB, 0xE1, 0xBA,
+ 0xBD, 0xE1, 0xBA, 0xBF, 0xE1, 0xBB, 0x81, 0xE1,
+ 0xBB, 0x83, 0xE1, 0xBB, 0x85, 0xE1, 0xBB, 0x87,
+ 0xE1, 0xBB, 0x89, 0xE1, 0xBB, 0x8B, 0xE1, 0xBB,
+ 0x8D, 0xE1, 0xBB, 0x8F, 0xE1, 0xBB, 0x91, 0xE1,
+ 0xBB, 0x93, 0xE1, 0xBB, 0x95, 0xE1, 0xBB, 0x97,
+ 0xE1, 0xBB, 0x99, 0xE1, 0xBB, 0x9B, 0xE1, 0xBB,
+ 0x9D, 0xE1, 0xBB, 0x9F, 0xE1, 0xBB, 0xA1, 0xE1,
+ 0xBB, 0xA3, 0xE1, 0xBB, 0xA5, 0xE1, 0xBB, 0xA7,
+ 0xE1, 0xBB, 0xA9, 0xE1, 0xBB, 0xAB, 0xE1, 0xBB,
+ 0xAD, 0xE1, 0xBB, 0xAF, 0xE1, 0xBB, 0xB1, 0xE1,
+ 0xBB, 0xB3, 0xE1, 0xBB, 0xB5, 0xE1, 0xBB, 0xB7,
+ 0xE1, 0xBB, 0xB9, 0xE1, 0xBC, 0x80, 0xE1, 0xBC,
+ 0x81, 0xE1, 0xBC, 0x82, 0xE1, 0xBC, 0x83, 0xE1,
+ 0xBC, 0x84, 0xE1, 0xBC, 0x85, 0xE1, 0xBC, 0x86,
+ 0xE1, 0xBC, 0x87, 0xE1, 0xBC, 0x90, 0xE1, 0xBC,
+ 0x91, 0xE1, 0xBC, 0x92, 0xE1, 0xBC, 0x93, 0xE1,
+ 0xBC, 0x94, 0xE1, 0xBC, 0x95, 0xE1, 0xBC, 0xA0,
+ 0xE1, 0xBC, 0xA1, 0xE1, 0xBC, 0xA2, 0xE1, 0xBC,
+ 0xA3, 0xE1, 0xBC, 0xA4, 0xE1, 0xBC, 0xA5, 0xE1,
+ 0xBC, 0xA6, 0xE1, 0xBC, 0xA7, 0xE1, 0xBC, 0xB0,
+ 0xE1, 0xBC, 0xB1, 0xE1, 0xBC, 0xB2, 0xE1, 0xBC,
+ 0xB3, 0xE1, 0xBC, 0xB4, 0xE1, 0xBC, 0xB5, 0xE1,
+ 0xBC, 0xB6, 0xE1, 0xBC, 0xB7, 0xE1, 0xBD, 0x80,
+ 0xE1, 0xBD, 0x81, 0xE1, 0xBD, 0x82, 0xE1, 0xBD,
+ 0x83, 0xE1, 0xBD, 0x84, 0xE1, 0xBD, 0x85, 0xE1,
+ 0xBD, 0x91, 0xE1, 0xBD, 0x93, 0xE1, 0xBD, 0x95,
+ 0xE1, 0xBD, 0x97, 0xE1, 0xBD, 0xA0, 0xE1, 0xBD,
+ 0xA1, 0xE1, 0xBD, 0xA2, 0xE1, 0xBD, 0xA3, 0xE1,
+ 0xBD, 0xA4, 0xE1, 0xBD, 0xA5, 0xE1, 0xBD, 0xA6,
+ 0xE1, 0xBD, 0xA7, 0xE1, 0xBE, 0x80, 0xE1, 0xBE,
+ 0x81, 0xE1, 0xBE, 0x82, 0xE1, 0xBE, 0x83, 0xE1,
+ 0xBE, 0x84, 0xE1, 0xBE, 0x85, 0xE1, 0xBE, 0x86,
+ 0xE1, 0xBE, 0x87, 0xE1, 0xBE, 0x90, 0xE1, 0xBE,
+ 0x91, 0xE1, 0xBE, 0x92, 0xE1, 0xBE, 0x93, 0xE1,
+ 0xBE, 0x94, 0xE1, 0xBE, 0x95, 0xE1, 0xBE, 0x96,
+ 0xE1, 0xBE, 0x97, 0xE1, 0xBE, 0xA0, 0xE1, 0xBE,
+ 0xA1, 0xE1, 0xBE, 0xA2, 0xE1, 0xBE, 0xA3, 0xE1,
+ 0xBE, 0xA4, 0xE1, 0xBE, 0xA5, 0xE1, 0xBE, 0xA6,
+ 0xE1, 0xBE, 0xA7, 0xE1, 0xBE, 0xB0, 0xE1, 0xBE,
+ 0xB1, 0xE1, 0xBD, 0xB0, 0xE1, 0xBD, 0xB1, 0xE1,
+ 0xBE, 0xB3, 0xE1, 0xBD, 0xB2, 0xE1, 0xBD, 0xB3,
+ 0xE1, 0xBD, 0xB4, 0xE1, 0xBD, 0xB5, 0xE1, 0xBF,
+ 0x83, 0xE1, 0xBF, 0x90, 0xE1, 0xBF, 0x91, 0xE1,
+ 0xBD, 0xB6, 0xE1, 0xBD, 0xB7, 0xE1, 0xBF, 0xA0,
+ 0xE1, 0xBF, 0xA1, 0xE1, 0xBD, 0xBA, 0xE1, 0xBD,
+ 0xBB, 0xE1, 0xBF, 0xA5, 0xE1, 0xBD, 0xB8, 0xE1,
+ 0xBD, 0xB9, 0xE1, 0xBD, 0xBC, 0xE1, 0xBD, 0xBD,
+ 0xE1, 0xBF, 0xB3, 0xCF, 0x89, 0x6B, 0xC3, 0xA5,
+ 0xE2, 0x85, 0x8E, 0xE2, 0x85, 0xB0, 0xE2, 0x85,
+ 0xB1, 0xE2, 0x85, 0xB2, 0xE2, 0x85, 0xB3, 0xE2,
+ 0x85, 0xB4, 0xE2, 0x85, 0xB5, 0xE2, 0x85, 0xB6,
+ 0xE2, 0x85, 0xB7, 0xE2, 0x85, 0xB8, 0xE2, 0x85,
+ 0xB9, 0xE2, 0x85, 0xBA, 0xE2, 0x85, 0xBB, 0xE2,
+ 0x85, 0xBC, 0xE2, 0x85, 0xBD, 0xE2, 0x85, 0xBE,
+ 0xE2, 0x85, 0xBF, 0xE2, 0x86, 0x84, 0xE2, 0x93,
+ 0x90, 0xE2, 0x93, 0x91, 0xE2, 0x93, 0x92, 0xE2,
+ 0x93, 0x93, 0xE2, 0x93, 0x94, 0xE2, 0x93, 0x95,
+ 0xE2, 0x93, 0x96, 0xE2, 0x93, 0x97, 0xE2, 0x93,
+ 0x98, 0xE2, 0x93, 0x99, 0xE2, 0x93, 0x9A, 0xE2,
+ 0x93, 0x9B, 0xE2, 0x93, 0x9C, 0xE2, 0x93, 0x9D,
+ 0xE2, 0x93, 0x9E, 0xE2, 0x93, 0x9F, 0xE2, 0x93,
+ 0xA0, 0xE2, 0x93, 0xA1, 0xE2, 0x93, 0xA2, 0xE2,
+ 0x93, 0xA3, 0xE2, 0x93, 0xA4, 0xE2, 0x93, 0xA5,
+ 0xE2, 0x93, 0xA6, 0xE2, 0x93, 0xA7, 0xE2, 0x93,
+ 0xA8, 0xE2, 0x93, 0xA9, 0xE2, 0xB0, 0xB0, 0xE2,
+ 0xB0, 0xB1, 0xE2, 0xB0, 0xB2, 0xE2, 0xB0, 0xB3,
+ 0xE2, 0xB0, 0xB4, 0xE2, 0xB0, 0xB5, 0xE2, 0xB0,
+ 0xB6, 0xE2, 0xB0, 0xB7, 0xE2, 0xB0, 0xB8, 0xE2,
+ 0xB0, 0xB9, 0xE2, 0xB0, 0xBA, 0xE2, 0xB0, 0xBB,
+ 0xE2, 0xB0, 0xBC, 0xE2, 0xB0, 0xBD, 0xE2, 0xB0,
+ 0xBE, 0xE2, 0xB0, 0xBF, 0xE2, 0xB1, 0x80, 0xE2,
+ 0xB1, 0x81, 0xE2, 0xB1, 0x82, 0xE2, 0xB1, 0x83,
+ 0xE2, 0xB1, 0x84, 0xE2, 0xB1, 0x85, 0xE2, 0xB1,
+ 0x86, 0xE2, 0xB1, 0x87, 0xE2, 0xB1, 0x88, 0xE2,
+ 0xB1, 0x89, 0xE2, 0xB1, 0x8A, 0xE2, 0xB1, 0x8B,
+ 0xE2, 0xB1, 0x8C, 0xE2, 0xB1, 0x8D, 0xE2, 0xB1,
+ 0x8E, 0xE2, 0xB1, 0x8F, 0xE2, 0xB1, 0x90, 0xE2,
+ 0xB1, 0x91, 0xE2, 0xB1, 0x92, 0xE2, 0xB1, 0x93,
+ 0xE2, 0xB1, 0x94, 0xE2, 0xB1, 0x95, 0xE2, 0xB1,
+ 0x96, 0xE2, 0xB1, 0x97, 0xE2, 0xB1, 0x98, 0xE2,
+ 0xB1, 0x99, 0xE2, 0xB1, 0x9A, 0xE2, 0xB1, 0x9B,
+ 0xE2, 0xB1, 0x9C, 0xE2, 0xB1, 0x9D, 0xE2, 0xB1,
+ 0x9E, 0xE2, 0xB1, 0xA1, 0xC9, 0xAB, 0xE1, 0xB5,
+ 0xBD, 0xC9, 0xBD, 0xE2, 0xB1, 0xA8, 0xE2, 0xB1,
+ 0xAA, 0xE2, 0xB1, 0xAC, 0xE2, 0xB1, 0xB6, 0xE2,
+ 0xB2, 0x81, 0xE2, 0xB2, 0x83, 0xE2, 0xB2, 0x85,
+ 0xE2, 0xB2, 0x87, 0xE2, 0xB2, 0x89, 0xE2, 0xB2,
+ 0x8B, 0xE2, 0xB2, 0x8D, 0xE2, 0xB2, 0x8F, 0xE2,
+ 0xB2, 0x91, 0xE2, 0xB2, 0x93, 0xE2, 0xB2, 0x95,
+ 0xE2, 0xB2, 0x97, 0xE2, 0xB2, 0x99, 0xE2, 0xB2,
+ 0x9B, 0xE2, 0xB2, 0x9D, 0xE2, 0xB2, 0x9F, 0xE2,
+ 0xB2, 0xA1, 0xE2, 0xB2, 0xA3, 0xE2, 0xB2, 0xA5,
+ 0xE2, 0xB2, 0xA7, 0xE2, 0xB2, 0xA9, 0xE2, 0xB2,
+ 0xAB, 0xE2, 0xB2, 0xAD, 0xE2, 0xB2, 0xAF, 0xE2,
+ 0xB2, 0xB1, 0xE2, 0xB2, 0xB3, 0xE2, 0xB2, 0xB5,
+ 0xE2, 0xB2, 0xB7, 0xE2, 0xB2, 0xB9, 0xE2, 0xB2,
+ 0xBB, 0xE2, 0xB2, 0xBD, 0xE2, 0xB2, 0xBF, 0xE2,
+ 0xB3, 0x81, 0xE2, 0xB3, 0x83, 0xE2, 0xB3, 0x85,
+ 0xE2, 0xB3, 0x87, 0xE2, 0xB3, 0x89, 0xE2, 0xB3,
+ 0x8B, 0xE2, 0xB3, 0x8D, 0xE2, 0xB3, 0x8F, 0xE2,
+ 0xB3, 0x91, 0xE2, 0xB3, 0x93, 0xE2, 0xB3, 0x95,
+ 0xE2, 0xB3, 0x97, 0xE2, 0xB3, 0x99, 0xE2, 0xB3,
+ 0x9B, 0xE2, 0xB3, 0x9D, 0xE2, 0xB3, 0x9F, 0xE2,
+ 0xB3, 0xA1, 0xE2, 0xB3, 0xA3, 0xEF, 0xBD, 0x81,
+ 0xEF, 0xBD, 0x82, 0xEF, 0xBD, 0x83, 0xEF, 0xBD,
+ 0x84, 0xEF, 0xBD, 0x85, 0xEF, 0xBD, 0x86, 0xEF,
+ 0xBD, 0x87, 0xEF, 0xBD, 0x88, 0xEF, 0xBD, 0x89,
+ 0xEF, 0xBD, 0x8A, 0xEF, 0xBD, 0x8B, 0xEF, 0xBD,
+ 0x8C, 0xEF, 0xBD, 0x8D, 0xEF, 0xBD, 0x8E, 0xEF,
+ 0xBD, 0x8F, 0xEF, 0xBD, 0x90, 0xEF, 0xBD, 0x91,
+ 0xEF, 0xBD, 0x92, 0xEF, 0xBD, 0x93, 0xEF, 0xBD,
+ 0x94, 0xEF, 0xBD, 0x95, 0xEF, 0xBD, 0x96, 0xEF,
+ 0xBD, 0x97, 0xEF, 0xBD, 0x98, 0xEF, 0xBD, 0x99,
+ 0xEF, 0xBD, 0x9A, 0xF0, 0x90, 0x90, 0xA8, 0xF0,
+ 0x90, 0x90, 0xA9, 0xF0, 0x90, 0x90, 0xAA, 0xF0,
+ 0x90, 0x90, 0xAB, 0xF0, 0x90, 0x90, 0xAC, 0xF0,
+ 0x90, 0x90, 0xAD, 0xF0, 0x90, 0x90, 0xAE, 0xF0,
+ 0x90, 0x90, 0xAF, 0xF0, 0x90, 0x90, 0xB0, 0xF0,
+ 0x90, 0x90, 0xB1, 0xF0, 0x90, 0x90, 0xB2, 0xF0,
+ 0x90, 0x90, 0xB3, 0xF0, 0x90, 0x90, 0xB4, 0xF0,
+ 0x90, 0x90, 0xB5, 0xF0, 0x90, 0x90, 0xB6, 0xF0,
+ 0x90, 0x90, 0xB7, 0xF0, 0x90, 0x90, 0xB8, 0xF0,
+ 0x90, 0x90, 0xB9, 0xF0, 0x90, 0x90, 0xBA, 0xF0,
+ 0x90, 0x90, 0xBB, 0xF0, 0x90, 0x90, 0xBC, 0xF0,
+ 0x90, 0x90, 0xBD, 0xF0, 0x90, 0x90, 0xBE, 0xF0,
+ 0x90, 0x90, 0xBF, 0xF0, 0x90, 0x91, 0x80, 0xF0,
+ 0x90, 0x91, 0x81, 0xF0, 0x90, 0x91, 0x82, 0xF0,
+ 0x90, 0x91, 0x83, 0xF0, 0x90, 0x91, 0x84, 0xF0,
+ 0x90, 0x91, 0x85, 0xF0, 0x90, 0x91, 0x86, 0xF0,
+ 0x90, 0x91, 0x87, 0xF0, 0x90, 0x91, 0x88, 0xF0,
+ 0x90, 0x91, 0x89, 0xF0, 0x90, 0x91, 0x8A, 0xF0,
+ 0x90, 0x91, 0x8B, 0xF0, 0x90, 0x91, 0x8C, 0xF0,
+ 0x90, 0x91, 0x8D, 0xF0, 0x90, 0x91, 0x8E, 0xF0,
+ 0x90, 0x91, 0x8F,
+ },
+};
+
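+ /*
+  * Displacement tables driving the toupper conversion.  Each entry
+  * appears to pair a fourth-byte-table id with a byte offset into the
+  * corresponding final mapping table, while { N_, 0 } marks third-byte
+  * values that have no case mapping.  A minimal lookup sketch, assuming
+  * that layout (uv selects the Unicode version, b2/b3/b4 are the
+  * trailing bytes of the UTF-8 character; member names illustrative):
+  *
+  *	d = u8_toupper_b3_tbl[uv][b2][b3];
+  *	if (d.id == N_)
+  *		the character has no mapping and is left as is;
+  *	start = u8_toupper_b4_tbl[uv][d.id][b4];
+  *	end   = u8_toupper_b4_tbl[uv][d.id][b4 + 1];
+  *	result bytes: u8_toupper_final_tbl[uv][d.base + start .. d.base + end - 1]
+  */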
+static const u8_displacement_t u8_toupper_b3_tbl[2][5][256] = {
+ {
+ { /* Third byte table 0. */
+ { N_, 0 }, { N_, 0 }, { N_, 0 }, { N_, 0 },
+ { N_, 0 }, { N_, 0 }, { N_, 0 }, { N_, 0 },
+ { N_, 0 }, { N_, 0 }, { N_, 0 }, { N_, 0 },
+ { N_, 0 }, { N_, 0 }, { N_, 0 }, { N_, 0 },
+ { N_, 0 }, { N_, 0 }, { N_, 0 }, { N_, 0 },
+ { N_, 0 }, { N_, 0 }, { N_, 0 }, { N_, 0 },
+ { N_, 0 }, { N_, 0 }, { N_, 0 }, { N_, 0 },
+ { N_, 0 }, { N_, 0 }, { N_, 0 }, { N_, 0 },
+ { N_, 0 }, { N_, 0 }, { N_, 0 }, { N_, 0 },
+ { N_, 0 }, { N_, 0 }, { N_, 0 }, { N_, 0 },
+ { N_, 0 }, { N_, 0 }, { N_, 0 }, { N_, 0 },
+ { N_, 0 }, { N_, 0 }, { N_, 0 }, { N_, 0 },
+ { N_, 0 }, { N_, 0 }, { N_, 0 }, { N_, 0 },
+ { N_, 0 }, { N_, 0 }, { N_, 0 }, { N_, 0 },
+ { N_, 0 }, { N_, 0 }, { N_, 0 }, { N_, 0 },
+ { N_, 0 }, { N_, 0 }, { N_, 0 }, { N_, 0 },
+ { N_, 0 }, { N_, 0 }, { N_, 0 }, { N_, 0 },
+ { N_, 0 }, { N_, 0 }, { N_, 0 }, { N_, 0 },
+ { N_, 0 }, { N_, 0 }, { N_, 0 }, { N_, 0 },
+ { N_, 0 }, { N_, 0 }, { N_, 0 }, { N_, 0 },
+ { N_, 0 }, { N_, 0 }, { N_, 0 }, { N_, 0 },
+ { N_, 0 }, { N_, 0 }, { N_, 0 }, { N_, 0 },
+ { N_, 0 }, { N_, 0 }, { N_, 0 }, { N_, 0 },
+ { N_, 0 }, { N_, 0 }, { N_, 0 }, { N_, 0 },
+ { N_, 0 }, { N_, 0 }, { N_, 0 }, { N_, 0 },
+ { N_, 0 }, { N_, 0 }, { N_, 0 }, { N_, 0 },
+ { N_, 0 }, { N_, 0 }, { N_, 0 }, { N_, 0 },
+ { N_, 0 }, { N_, 0 }, { N_, 0 }, { N_, 0 },
+ { N_, 0 }, { N_, 0 }, { N_, 0 }, { N_, 0 },
+ { N_, 0 }, { N_, 0 }, { N_, 0 }, { N_, 0 },
+ { N_, 0 }, { N_, 0 }, { N_, 0 }, { N_, 0 },
+ { N_, 0 }, { N_, 0 }, { N_, 0 }, { N_, 0 },
+ { N_, 0 }, { N_, 0 }, { N_, 0 }, { N_, 0 },
+ { N_, 0 }, { N_, 0 }, { N_, 0 }, { N_, 0 },
+ { N_, 0 }, { N_, 0 }, { N_, 0 }, { N_, 0 },
+ { N_, 0 }, { N_, 0 }, { N_, 0 }, { N_, 0 },
+ { N_, 0 }, { N_, 0 }, { N_, 0 }, { N_, 0 },
+ { N_, 0 }, { N_, 0 }, { N_, 0 }, { N_, 0 },
+ { N_, 0 }, { N_, 0 }, { N_, 0 }, { N_, 0 },
+ { N_, 0 }, { N_, 0 }, { N_, 0 }, { N_, 0 },
+ { N_, 0 }, { N_, 0 }, { N_, 0 }, { N_, 0 },
+ { N_, 0 }, { N_, 0 }, { N_, 0 }, { N_, 0 },
+ { N_, 0 }, { N_, 0 }, { N_, 0 }, { N_, 0 },
+ { N_, 0 }, { N_, 0 }, { N_, 0 }, { N_, 0 },
+ { N_, 0 }, { N_, 0 }, { N_, 0 }, { N_, 0 },
+ { N_, 0 }, { N_, 0 }, { N_, 0 }, { N_, 0 },
+ { N_, 0 }, { N_, 0 }, { N_, 0 }, { N_, 0 },
+ { N_, 0 }, { N_, 0 }, { N_, 0 }, { N_, 0 },
+ { N_, 0 }, { N_, 0 }, { 0, 0 }, { 1, 2 },
+ { 2, 64 }, { 3, 125 }, { 4, 188 }, { 5, 226 },
+ { 6, 288 }, { 7, 338 }, { 8, 364 }, { N_, 0 },
+ { N_, 0 }, { 9, 376 }, { 10, 378 }, { 11, 416 },
+ { 12, 486 }, { 13, 518 }, { 14, 614 }, { 15, 670 },
+ { 16, 724 }, { 17, 740 }, { 18, 802 }, { N_, 0 },
+ { N_, 0 }, { N_, 0 }, { N_, 0 }, { N_, 0 },
+ { N_, 0 }, { N_, 0 }, { N_, 0 }, { N_, 0 },
+ { N_, 0 }, { N_, 0 }, { N_, 0 }, { N_, 0 },
+ { N_, 0 }, { N_, 0 }, { N_, 0 }, { N_, 0 },
+ { N_, 0 }, { N_, 0 }, { N_, 0 }, { N_, 0 },
+ { N_, 0 }, { N_, 0 }, { N_, 0 }, { N_, 0 },
+ { N_, 0 }, { N_, 0 }, { N_, 0 }, { N_, 0 },
+ { N_, 0 }, { N_, 0 }, { N_, 0 }, { N_, 0 },
+ { N_, 0 }, { N_, 0 }, { N_, 0 }, { N_, 0 },
+ { N_, 0 }, { N_, 0 }, { N_, 0 }, { N_, 0 },
+ },
+ { /* Third byte table 1. */
+ { N_, 0 }, { N_, 0 }, { N_, 0 }, { N_, 0 },
+ { N_, 0 }, { N_, 0 }, { N_, 0 }, { N_, 0 },
+ { N_, 0 }, { N_, 0 }, { N_, 0 }, { N_, 0 },
+ { N_, 0 }, { N_, 0 }, { N_, 0 }, { N_, 0 },
+ { N_, 0 }, { N_, 0 }, { N_, 0 }, { N_, 0 },
+ { N_, 0 }, { N_, 0 }, { N_, 0 }, { N_, 0 },
+ { N_, 0 }, { N_, 0 }, { N_, 0 }, { N_, 0 },
+ { N_, 0 }, { N_, 0 }, { N_, 0 }, { N_, 0 },
+ { N_, 0 }, { N_, 0 }, { N_, 0 }, { N_, 0 },
+ { N_, 0 }, { N_, 0 }, { N_, 0 }, { N_, 0 },
+ { N_, 0 }, { N_, 0 }, { N_, 0 }, { N_, 0 },
+ { N_, 0 }, { N_, 0 }, { N_, 0 }, { N_, 0 },
+ { N_, 0 }, { N_, 0 }, { N_, 0 }, { N_, 0 },
+ { N_, 0 }, { N_, 0 }, { N_, 0 }, { N_, 0 },
+ { N_, 0 }, { N_, 0 }, { N_, 0 }, { N_, 0 },
+ { N_, 0 }, { N_, 0 }, { N_, 0 }, { N_, 0 },
+ { N_, 0 }, { N_, 0 }, { N_, 0 }, { N_, 0 },
+ { N_, 0 }, { N_, 0 }, { N_, 0 }, { N_, 0 },
+ { N_, 0 }, { N_, 0 }, { N_, 0 }, { N_, 0 },
+ { N_, 0 }, { N_, 0 }, { N_, 0 }, { N_, 0 },
+ { N_, 0 }, { N_, 0 }, { N_, 0 }, { N_, 0 },
+ { N_, 0 }, { N_, 0 }, { N_, 0 }, { N_, 0 },
+ { N_, 0 }, { N_, 0 }, { N_, 0 }, { N_, 0 },
+ { N_, 0 }, { N_, 0 }, { N_, 0 }, { N_, 0 },
+ { N_, 0 }, { N_, 0 }, { N_, 0 }, { N_, 0 },
+ { N_, 0 }, { N_, 0 }, { N_, 0 }, { N_, 0 },
+ { N_, 0 }, { N_, 0 }, { N_, 0 }, { N_, 0 },
+ { N_, 0 }, { N_, 0 }, { N_, 0 }, { N_, 0 },
+ { N_, 0 }, { N_, 0 }, { N_, 0 }, { N_, 0 },
+ { N_, 0 }, { N_, 0 }, { N_, 0 }, { N_, 0 },
+ { N_, 0 }, { N_, 0 }, { N_, 0 }, { N_, 0 },
+ { N_, 0 }, { N_, 0 }, { N_, 0 }, { N_, 0 },
+ { N_, 0 }, { N_, 0 }, { N_, 0 }, { N_, 0 },
+ { N_, 0 }, { N_, 0 }, { N_, 0 }, { N_, 0 },
+ { N_, 0 }, { N_, 0 }, { N_, 0 }, { N_, 0 },
+ { N_, 0 }, { N_, 0 }, { N_, 0 }, { N_, 0 },
+ { N_, 0 }, { N_, 0 }, { N_, 0 }, { N_, 0 },
+ { N_, 0 }, { N_, 0 }, { N_, 0 }, { N_, 0 },
+ { N_, 0 }, { N_, 0 }, { N_, 0 }, { N_, 0 },
+ { N_, 0 }, { N_, 0 }, { N_, 0 }, { N_, 0 },
+ { N_, 0 }, { N_, 0 }, { N_, 0 }, { N_, 0 },
+ { N_, 0 }, { N_, 0 }, { N_, 0 }, { N_, 0 },
+ { N_, 0 }, { N_, 0 }, { N_, 0 }, { N_, 0 },
+ { N_, 0 }, { N_, 0 }, { N_, 0 }, { N_, 0 },
+ { N_, 0 }, { N_, 0 }, { N_, 0 }, { N_, 0 },
+ { N_, 0 }, { N_, 0 }, { N_, 0 }, { N_, 0 },
+ { 19, 816 }, { 20, 912 }, { 21, 1008 }, { 22, 1092 },
+ { 23, 1179 }, { 24, 1269 }, { 25, 1365 }, { 26, 1448 },
+ { N_, 0 }, { N_, 0 }, { N_, 0 }, { N_, 0 },
+ { N_, 0 }, { N_, 0 }, { N_, 0 }, { N_, 0 },
+ { N_, 0 }, { N_, 0 }, { N_, 0 }, { N_, 0 },
+ { N_, 0 }, { N_, 0 }, { N_, 0 }, { N_, 0 },
+ { N_, 0 }, { N_, 0 }, { N_, 0 }, { N_, 0 },
+ { N_, 0 }, { N_, 0 }, { N_, 0 }, { N_, 0 },
+ { N_, 0 }, { N_, 0 }, { N_, 0 }, { N_, 0 },
+ { N_, 0 }, { N_, 0 }, { N_, 0 }, { N_, 0 },
+ { N_, 0 }, { N_, 0 }, { N_, 0 }, { N_, 0 },
+ { N_, 0 }, { N_, 0 }, { N_, 0 }, { N_, 0 },
+ { N_, 0 }, { N_, 0 }, { N_, 0 }, { N_, 0 },
+ { N_, 0 }, { N_, 0 }, { N_, 0 }, { N_, 0 },
+ { N_, 0 }, { N_, 0 }, { N_, 0 }, { N_, 0 },
+ { N_, 0 }, { N_, 0 }, { N_, 0 }, { N_, 0 },
+ { N_, 0 }, { N_, 0 }, { N_, 0 }, { N_, 0 },
+ { N_, 0 }, { N_, 0 }, { N_, 0 }, { N_, 0 },
+ },
+ { /* Third byte table 2. */
+ { N_, 0 }, { N_, 0 }, { N_, 0 }, { N_, 0 },
+ { N_, 0 }, { N_, 0 }, { N_, 0 }, { N_, 0 },
+ { N_, 0 }, { N_, 0 }, { N_, 0 }, { N_, 0 },
+ { N_, 0 }, { N_, 0 }, { N_, 0 }, { N_, 0 },
+ { N_, 0 }, { N_, 0 }, { N_, 0 }, { N_, 0 },
+ { N_, 0 }, { N_, 0 }, { N_, 0 }, { N_, 0 },
+ { N_, 0 }, { N_, 0 }, { N_, 0 }, { N_, 0 },
+ { N_, 0 }, { N_, 0 }, { N_, 0 }, { N_, 0 },
+ { N_, 0 }, { N_, 0 }, { N_, 0 }, { N_, 0 },
+ { N_, 0 }, { N_, 0 }, { N_, 0 }, { N_, 0 },
+ { N_, 0 }, { N_, 0 }, { N_, 0 }, { N_, 0 },
+ { N_, 0 }, { N_, 0 }, { N_, 0 }, { N_, 0 },
+ { N_, 0 }, { N_, 0 }, { N_, 0 }, { N_, 0 },
+ { N_, 0 }, { N_, 0 }, { N_, 0 }, { N_, 0 },
+ { N_, 0 }, { N_, 0 }, { N_, 0 }, { N_, 0 },
+ { N_, 0 }, { N_, 0 }, { N_, 0 }, { N_, 0 },
+ { N_, 0 }, { N_, 0 }, { N_, 0 }, { N_, 0 },
+ { N_, 0 }, { N_, 0 }, { N_, 0 }, { N_, 0 },
+ { N_, 0 }, { N_, 0 }, { N_, 0 }, { N_, 0 },
+ { N_, 0 }, { N_, 0 }, { N_, 0 }, { N_, 0 },
+ { N_, 0 }, { N_, 0 }, { N_, 0 }, { N_, 0 },
+ { N_, 0 }, { N_, 0 }, { N_, 0 }, { N_, 0 },
+ { N_, 0 }, { N_, 0 }, { N_, 0 }, { N_, 0 },
+ { N_, 0 }, { N_, 0 }, { N_, 0 }, { N_, 0 },
+ { N_, 0 }, { N_, 0 }, { N_, 0 }, { N_, 0 },
+ { N_, 0 }, { N_, 0 }, { N_, 0 }, { N_, 0 },
+ { N_, 0 }, { N_, 0 }, { N_, 0 }, { N_, 0 },
+ { N_, 0 }, { N_, 0 }, { N_, 0 }, { N_, 0 },
+ { N_, 0 }, { N_, 0 }, { N_, 0 }, { N_, 0 },
+ { N_, 0 }, { N_, 0 }, { N_, 0 }, { N_, 0 },
+ { N_, 0 }, { N_, 0 }, { N_, 0 }, { N_, 0 },
+ { N_, 0 }, { N_, 0 }, { N_, 0 }, { N_, 0 },
+ { N_, 0 }, { N_, 0 }, { N_, 0 }, { N_, 0 },
+ { N_, 0 }, { 27, 1469 }, { N_, 0 }, { N_, 0 },
+ { N_, 0 }, { N_, 0 }, { N_, 0 }, { N_, 0 },
+ { N_, 0 }, { N_, 0 }, { N_, 0 }, { N_, 0 },
+ { N_, 0 }, { N_, 0 }, { N_, 0 }, { 28, 1517 },
+ { N_, 0 }, { N_, 0 }, { N_, 0 }, { N_, 0 },
+ { N_, 0 }, { N_, 0 }, { N_, 0 }, { N_, 0 },
+ { N_, 0 }, { N_, 0 }, { N_, 0 }, { N_, 0 },
+ { N_, 0 }, { N_, 0 }, { N_, 0 }, { N_, 0 },
+ { N_, 0 }, { N_, 0 }, { N_, 0 }, { N_, 0 },
+ { N_, 0 }, { N_, 0 }, { N_, 0 }, { N_, 0 },
+ { N_, 0 }, { N_, 0 }, { N_, 0 }, { N_, 0 },
+ { N_, 0 }, { N_, 0 }, { N_, 0 }, { N_, 0 },
+ { N_, 0 }, { N_, 0 }, { N_, 0 }, { N_, 0 },
+ { N_, 0 }, { N_, 0 }, { N_, 0 }, { N_, 0 },
+ { N_, 0 }, { N_, 0 }, { N_, 0 }, { N_, 0 },
+ { N_, 0 }, { N_, 0 }, { N_, 0 }, { N_, 0 },
+ { N_, 0 }, { N_, 0 }, { N_, 0 }, { N_, 0 },
+ { N_, 0 }, { N_, 0 }, { N_, 0 }, { N_, 0 },
+ { N_, 0 }, { N_, 0 }, { N_, 0 }, { N_, 0 },
+ { N_, 0 }, { N_, 0 }, { N_, 0 }, { N_, 0 },
+ { N_, 0 }, { N_, 0 }, { N_, 0 }, { N_, 0 },
+ { N_, 0 }, { N_, 0 }, { N_, 0 }, { N_, 0 },
+ { N_, 0 }, { N_, 0 }, { N_, 0 }, { N_, 0 },
+ { N_, 0 }, { N_, 0 }, { N_, 0 }, { N_, 0 },
+ { N_, 0 }, { N_, 0 }, { N_, 0 }, { N_, 0 },
+ { N_, 0 }, { N_, 0 }, { N_, 0 }, { N_, 0 },
+ { N_, 0 }, { N_, 0 }, { N_, 0 }, { N_, 0 },
+ { N_, 0 }, { N_, 0 }, { N_, 0 }, { N_, 0 },
+ { N_, 0 }, { N_, 0 }, { N_, 0 }, { N_, 0 },
+ { N_, 0 }, { N_, 0 }, { N_, 0 }, { N_, 0 },
+ { N_, 0 }, { N_, 0 }, { N_, 0 }, { N_, 0 },
+ },
+ { /* Third byte table 3. */
+ { N_, 0 }, { N_, 0 }, { N_, 0 }, { N_, 0 },
+ { N_, 0 }, { N_, 0 }, { N_, 0 }, { N_, 0 },
+ { N_, 0 }, { N_, 0 }, { N_, 0 }, { N_, 0 },
+ { N_, 0 }, { N_, 0 }, { N_, 0 }, { N_, 0 },
+ { N_, 0 }, { N_, 0 }, { N_, 0 }, { N_, 0 },
+ { N_, 0 }, { N_, 0 }, { N_, 0 }, { N_, 0 },
+ { N_, 0 }, { N_, 0 }, { N_, 0 }, { N_, 0 },
+ { N_, 0 }, { N_, 0 }, { N_, 0 }, { N_, 0 },
+ { N_, 0 }, { N_, 0 }, { N_, 0 }, { N_, 0 },
+ { N_, 0 }, { N_, 0 }, { N_, 0 }, { N_, 0 },
+ { N_, 0 }, { N_, 0 }, { N_, 0 }, { N_, 0 },
+ { N_, 0 }, { N_, 0 }, { N_, 0 }, { N_, 0 },
+ { N_, 0 }, { N_, 0 }, { N_, 0 }, { N_, 0 },
+ { N_, 0 }, { N_, 0 }, { N_, 0 }, { N_, 0 },
+ { N_, 0 }, { N_, 0 }, { N_, 0 }, { N_, 0 },
+ { N_, 0 }, { N_, 0 }, { N_, 0 }, { N_, 0 },
+ { N_, 0 }, { N_, 0 }, { N_, 0 }, { N_, 0 },
+ { N_, 0 }, { N_, 0 }, { N_, 0 }, { N_, 0 },
+ { N_, 0 }, { N_, 0 }, { N_, 0 }, { N_, 0 },
+ { N_, 0 }, { N_, 0 }, { N_, 0 }, { N_, 0 },
+ { N_, 0 }, { N_, 0 }, { N_, 0 }, { N_, 0 },
+ { N_, 0 }, { N_, 0 }, { N_, 0 }, { N_, 0 },
+ { N_, 0 }, { N_, 0 }, { N_, 0 }, { N_, 0 },
+ { N_, 0 }, { N_, 0 }, { N_, 0 }, { N_, 0 },
+ { N_, 0 }, { N_, 0 }, { N_, 0 }, { N_, 0 },
+ { N_, 0 }, { N_, 0 }, { N_, 0 }, { N_, 0 },
+ { N_, 0 }, { N_, 0 }, { N_, 0 }, { N_, 0 },
+ { N_, 0 }, { N_, 0 }, { N_, 0 }, { N_, 0 },
+ { N_, 0 }, { N_, 0 }, { N_, 0 }, { N_, 0 },
+ { N_, 0 }, { N_, 0 }, { N_, 0 }, { N_, 0 },
+ { N_, 0 }, { N_, 0 }, { N_, 0 }, { N_, 0 },
+ { N_, 0 }, { N_, 0 }, { N_, 0 }, { N_, 0 },
+ { N_, 0 }, { N_, 0 }, { N_, 0 }, { N_, 0 },
+ { N_, 0 }, { N_, 0 }, { N_, 0 }, { N_, 0 },
+ { N_, 0 }, { N_, 0 }, { N_, 0 }, { N_, 0 },
+ { N_, 0 }, { N_, 0 }, { N_, 0 }, { N_, 0 },
+ { N_, 0 }, { N_, 0 }, { N_, 0 }, { N_, 0 },
+ { N_, 0 }, { N_, 0 }, { N_, 0 }, { N_, 0 },
+ { N_, 0 }, { N_, 0 }, { N_, 0 }, { N_, 0 },
+ { N_, 0 }, { N_, 0 }, { N_, 0 }, { N_, 0 },
+ { N_, 0 }, { N_, 0 }, { N_, 0 }, { N_, 0 },
+ { N_, 0 }, { N_, 0 }, { N_, 0 }, { N_, 0 },
+ { N_, 0 }, { N_, 0 }, { N_, 0 }, { N_, 0 },
+ { N_, 0 }, { N_, 0 }, { N_, 0 }, { N_, 0 },
+ { N_, 0 }, { N_, 0 }, { N_, 0 }, { N_, 0 },
+ { N_, 0 }, { N_, 0 }, { N_, 0 }, { N_, 0 },
+ { N_, 0 }, { N_, 0 }, { N_, 0 }, { N_, 0 },
+ { N_, 0 }, { 29, 1595 }, { N_, 0 }, { N_, 0 },
+ { N_, 0 }, { N_, 0 }, { N_, 0 }, { N_, 0 },
+ { N_, 0 }, { N_, 0 }, { N_, 0 }, { N_, 0 },
+ { N_, 0 }, { N_, 0 }, { N_, 0 }, { N_, 0 },
+ { N_, 0 }, { N_, 0 }, { N_, 0 }, { N_, 0 },
+ { N_, 0 }, { N_, 0 }, { N_, 0 }, { N_, 0 },
+ { N_, 0 }, { N_, 0 }, { N_, 0 }, { N_, 0 },
+ { N_, 0 }, { N_, 0 }, { N_, 0 }, { N_, 0 },
+ { N_, 0 }, { N_, 0 }, { N_, 0 }, { N_, 0 },
+ { N_, 0 }, { N_, 0 }, { N_, 0 }, { N_, 0 },
+ { N_, 0 }, { N_, 0 }, { N_, 0 }, { N_, 0 },
+ { N_, 0 }, { N_, 0 }, { N_, 0 }, { N_, 0 },
+ { N_, 0 }, { N_, 0 }, { N_, 0 }, { N_, 0 },
+ { N_, 0 }, { N_, 0 }, { N_, 0 }, { N_, 0 },
+ { N_, 0 }, { N_, 0 }, { N_, 0 }, { N_, 0 },
+ { N_, 0 }, { N_, 0 }, { N_, 0 }, { N_, 0 },
+ { N_, 0 }, { N_, 0 }, { N_, 0 }, { N_, 0 },
+ },
+ { /* Third byte table 4. */
+ { N_, 0 }, { N_, 0 }, { N_, 0 }, { N_, 0 },
+ { N_, 0 }, { N_, 0 }, { N_, 0 }, { N_, 0 },
+ { N_, 0 }, { N_, 0 }, { N_, 0 }, { N_, 0 },
+ { N_, 0 }, { N_, 0 }, { N_, 0 }, { N_, 0 },
+ { N_, 0 }, { N_, 0 }, { N_, 0 }, { N_, 0 },
+ { N_, 0 }, { N_, 0 }, { N_, 0 }, { N_, 0 },
+ { N_, 0 }, { N_, 0 }, { N_, 0 }, { N_, 0 },
+ { N_, 0 }, { N_, 0 }, { N_, 0 }, { N_, 0 },
+ { N_, 0 }, { N_, 0 }, { N_, 0 }, { N_, 0 },
+ { N_, 0 }, { N_, 0 }, { N_, 0 }, { N_, 0 },
+ { N_, 0 }, { N_, 0 }, { N_, 0 }, { N_, 0 },
+ { N_, 0 }, { N_, 0 }, { N_, 0 }, { N_, 0 },
+ { N_, 0 }, { N_, 0 }, { N_, 0 }, { N_, 0 },
+ { N_, 0 }, { N_, 0 }, { N_, 0 }, { N_, 0 },
+ { N_, 0 }, { N_, 0 }, { N_, 0 }, { N_, 0 },
+ { N_, 0 }, { N_, 0 }, { N_, 0 }, { N_, 0 },
+ { N_, 0 }, { N_, 0 }, { N_, 0 }, { N_, 0 },
+ { N_, 0 }, { N_, 0 }, { N_, 0 }, { N_, 0 },
+ { N_, 0 }, { N_, 0 }, { N_, 0 }, { N_, 0 },
+ { N_, 0 }, { N_, 0 }, { N_, 0 }, { N_, 0 },
+ { N_, 0 }, { N_, 0 }, { N_, 0 }, { N_, 0 },
+ { N_, 0 }, { N_, 0 }, { N_, 0 }, { N_, 0 },
+ { N_, 0 }, { N_, 0 }, { N_, 0 }, { N_, 0 },
+ { N_, 0 }, { N_, 0 }, { N_, 0 }, { N_, 0 },
+ { N_, 0 }, { N_, 0 }, { N_, 0 }, { N_, 0 },
+ { N_, 0 }, { N_, 0 }, { N_, 0 }, { N_, 0 },
+ { N_, 0 }, { N_, 0 }, { N_, 0 }, { N_, 0 },
+ { N_, 0 }, { N_, 0 }, { N_, 0 }, { N_, 0 },
+ { N_, 0 }, { N_, 0 }, { N_, 0 }, { N_, 0 },
+ { N_, 0 }, { N_, 0 }, { N_, 0 }, { N_, 0 },
+ { N_, 0 }, { N_, 0 }, { N_, 0 }, { N_, 0 },
+ { N_, 0 }, { N_, 0 }, { N_, 0 }, { N_, 0 },
+ { N_, 0 }, { N_, 0 }, { N_, 0 }, { N_, 0 },
+ { N_, 0 }, { N_, 0 }, { N_, 0 }, { N_, 0 },
+ { N_, 0 }, { N_, 0 }, { N_, 0 }, { N_, 0 },
+ { N_, 0 }, { N_, 0 }, { N_, 0 }, { N_, 0 },
+ { 30, 1673 }, { 31, 1769 }, { N_, 0 }, { N_, 0 },
+ { N_, 0 }, { N_, 0 }, { N_, 0 }, { N_, 0 },
+ { N_, 0 }, { N_, 0 }, { N_, 0 }, { N_, 0 },
+ { N_, 0 }, { N_, 0 }, { N_, 0 }, { N_, 0 },
+ { N_, 0 }, { N_, 0 }, { N_, 0 }, { N_, 0 },
+ { N_, 0 }, { N_, 0 }, { N_, 0 }, { N_, 0 },
+ { N_, 0 }, { N_, 0 }, { N_, 0 }, { N_, 0 },
+ { N_, 0 }, { N_, 0 }, { N_, 0 }, { N_, 0 },
+ { N_, 0 }, { N_, 0 }, { N_, 0 }, { N_, 0 },
+ { N_, 0 }, { N_, 0 }, { N_, 0 }, { N_, 0 },
+ { N_, 0 }, { N_, 0 }, { N_, 0 }, { N_, 0 },
+ { N_, 0 }, { N_, 0 }, { N_, 0 }, { N_, 0 },
+ { N_, 0 }, { N_, 0 }, { N_, 0 }, { N_, 0 },
+ { N_, 0 }, { N_, 0 }, { N_, 0 }, { N_, 0 },
+ { N_, 0 }, { N_, 0 }, { N_, 0 }, { N_, 0 },
+ { N_, 0 }, { N_, 0 }, { N_, 0 }, { N_, 0 },
+ { N_, 0 }, { N_, 0 }, { N_, 0 }, { N_, 0 },
+ { N_, 0 }, { N_, 0 }, { N_, 0 }, { N_, 0 },
+ { N_, 0 }, { N_, 0 }, { N_, 0 }, { N_, 0 },
+ { N_, 0 }, { N_, 0 }, { N_, 0 }, { N_, 0 },
+ { N_, 0 }, { N_, 0 }, { N_, 0 }, { N_, 0 },
+ { N_, 0 }, { N_, 0 }, { N_, 0 }, { N_, 0 },
+ { N_, 0 }, { N_, 0 }, { N_, 0 }, { N_, 0 },
+ { N_, 0 }, { N_, 0 }, { N_, 0 }, { N_, 0 },
+ { N_, 0 }, { N_, 0 }, { N_, 0 }, { N_, 0 },
+ { N_, 0 }, { N_, 0 }, { N_, 0 }, { N_, 0 },
+ { N_, 0 }, { N_, 0 }, { N_, 0 }, { N_, 0 },
+ { N_, 0 }, { N_, 0 }, { N_, 0 }, { N_, 0 },
+ },
+ },
+ {
+ { /* Third byte table 0. */
+ { N_, 0 }, { N_, 0 }, { N_, 0 }, { N_, 0 },
+ { N_, 0 }, { N_, 0 }, { N_, 0 }, { N_, 0 },
+ { N_, 0 }, { N_, 0 }, { N_, 0 }, { N_, 0 },
+ { N_, 0 }, { N_, 0 }, { N_, 0 }, { N_, 0 },
+ { N_, 0 }, { N_, 0 }, { N_, 0 }, { N_, 0 },
+ { N_, 0 }, { N_, 0 }, { N_, 0 }, { N_, 0 },
+ { N_, 0 }, { N_, 0 }, { N_, 0 }, { N_, 0 },
+ { N_, 0 }, { N_, 0 }, { N_, 0 }, { N_, 0 },
+ { N_, 0 }, { N_, 0 }, { N_, 0 }, { N_, 0 },
+ { N_, 0 }, { N_, 0 }, { N_, 0 }, { N_, 0 },
+ { N_, 0 }, { N_, 0 }, { N_, 0 }, { N_, 0 },
+ { N_, 0 }, { N_, 0 }, { N_, 0 }, { N_, 0 },
+ { N_, 0 }, { N_, 0 }, { N_, 0 }, { N_, 0 },
+ { N_, 0 }, { N_, 0 }, { N_, 0 }, { N_, 0 },
+ { N_, 0 }, { N_, 0 }, { N_, 0 }, { N_, 0 },
+ { N_, 0 }, { N_, 0 }, { N_, 0 }, { N_, 0 },
+ { N_, 0 }, { N_, 0 }, { N_, 0 }, { N_, 0 },
+ { N_, 0 }, { N_, 0 }, { N_, 0 }, { N_, 0 },
+ { N_, 0 }, { N_, 0 }, { N_, 0 }, { N_, 0 },
+ { N_, 0 }, { N_, 0 }, { N_, 0 }, { N_, 0 },
+ { N_, 0 }, { N_, 0 }, { N_, 0 }, { N_, 0 },
+ { N_, 0 }, { N_, 0 }, { N_, 0 }, { N_, 0 },
+ { N_, 0 }, { N_, 0 }, { N_, 0 }, { N_, 0 },
+ { N_, 0 }, { N_, 0 }, { N_, 0 }, { N_, 0 },
+ { N_, 0 }, { N_, 0 }, { N_, 0 }, { N_, 0 },
+ { N_, 0 }, { N_, 0 }, { N_, 0 }, { N_, 0 },
+ { N_, 0 }, { N_, 0 }, { N_, 0 }, { N_, 0 },
+ { N_, 0 }, { N_, 0 }, { N_, 0 }, { N_, 0 },
+ { N_, 0 }, { N_, 0 }, { N_, 0 }, { N_, 0 },
+ { N_, 0 }, { N_, 0 }, { N_, 0 }, { N_, 0 },
+ { N_, 0 }, { N_, 0 }, { N_, 0 }, { N_, 0 },
+ { N_, 0 }, { N_, 0 }, { N_, 0 }, { N_, 0 },
+ { N_, 0 }, { N_, 0 }, { N_, 0 }, { N_, 0 },
+ { N_, 0 }, { N_, 0 }, { N_, 0 }, { N_, 0 },
+ { N_, 0 }, { N_, 0 }, { N_, 0 }, { N_, 0 },
+ { N_, 0 }, { N_, 0 }, { N_, 0 }, { N_, 0 },
+ { N_, 0 }, { N_, 0 }, { N_, 0 }, { N_, 0 },
+ { N_, 0 }, { N_, 0 }, { N_, 0 }, { N_, 0 },
+ { N_, 0 }, { N_, 0 }, { N_, 0 }, { N_, 0 },
+ { N_, 0 }, { N_, 0 }, { N_, 0 }, { N_, 0 },
+ { N_, 0 }, { N_, 0 }, { N_, 0 }, { N_, 0 },
+ { N_, 0 }, { N_, 0 }, { N_, 0 }, { N_, 0 },
+ { N_, 0 }, { N_, 0 }, { N_, 0 }, { N_, 0 },
+ { N_, 0 }, { N_, 0 }, { N_, 0 }, { N_, 0 },
+ { N_, 0 }, { N_, 0 }, { N_, 0 }, { N_, 0 },
+ { N_, 0 }, { N_, 0 }, { N_, 0 }, { N_, 0 },
+ { N_, 0 }, { N_, 0 }, { N_, 0 }, { N_, 0 },
+ { N_, 0 }, { N_, 0 }, { N_, 0 }, { N_, 0 },
+ { N_, 0 }, { N_, 0 }, { 0, 0 }, { 1, 2 },
+ { 2, 64 }, { 3, 125 }, { 4, 188 }, { 5, 230 },
+ { 6, 292 }, { 7, 344 }, { 8, 388 }, { N_, 0 },
+ { N_, 0 }, { 9, 404 }, { 10, 412 }, { 11, 450 },
+ { 12, 524 }, { 13, 556 }, { 14, 652 }, { 15, 708 },
+ { 16, 772 }, { 17, 792 }, { 18, 854 }, { N_, 0 },
+ { N_, 0 }, { N_, 0 }, { N_, 0 }, { N_, 0 },
+ { N_, 0 }, { N_, 0 }, { N_, 0 }, { N_, 0 },
+ { N_, 0 }, { N_, 0 }, { N_, 0 }, { N_, 0 },
+ { N_, 0 }, { N_, 0 }, { N_, 0 }, { N_, 0 },
+ { N_, 0 }, { N_, 0 }, { N_, 0 }, { N_, 0 },
+ { N_, 0 }, { N_, 0 }, { N_, 0 }, { N_, 0 },
+ { N_, 0 }, { N_, 0 }, { N_, 0 }, { N_, 0 },
+ { N_, 0 }, { N_, 0 }, { N_, 0 }, { N_, 0 },
+ { N_, 0 }, { N_, 0 }, { N_, 0 }, { N_, 0 },
+ { N_, 0 }, { N_, 0 }, { N_, 0 }, { N_, 0 },
+ },
+ { /* Third byte table 1. */
+ { N_, 0 }, { N_, 0 }, { N_, 0 }, { N_, 0 },
+ { N_, 0 }, { N_, 0 }, { N_, 0 }, { N_, 0 },
+ { N_, 0 }, { N_, 0 }, { N_, 0 }, { N_, 0 },
+ { N_, 0 }, { N_, 0 }, { N_, 0 }, { N_, 0 },
+ { N_, 0 }, { N_, 0 }, { N_, 0 }, { N_, 0 },
+ { N_, 0 }, { N_, 0 }, { N_, 0 }, { N_, 0 },
+ { N_, 0 }, { N_, 0 }, { N_, 0 }, { N_, 0 },
+ { N_, 0 }, { N_, 0 }, { N_, 0 }, { N_, 0 },
+ { N_, 0 }, { N_, 0 }, { N_, 0 }, { N_, 0 },
+ { N_, 0 }, { N_, 0 }, { N_, 0 }, { N_, 0 },
+ { N_, 0 }, { N_, 0 }, { N_, 0 }, { N_, 0 },
+ { N_, 0 }, { N_, 0 }, { N_, 0 }, { N_, 0 },
+ { N_, 0 }, { N_, 0 }, { N_, 0 }, { N_, 0 },
+ { N_, 0 }, { N_, 0 }, { N_, 0 }, { N_, 0 },
+ { N_, 0 }, { N_, 0 }, { N_, 0 }, { N_, 0 },
+ { N_, 0 }, { N_, 0 }, { N_, 0 }, { N_, 0 },
+ { N_, 0 }, { N_, 0 }, { N_, 0 }, { N_, 0 },
+ { N_, 0 }, { N_, 0 }, { N_, 0 }, { N_, 0 },
+ { N_, 0 }, { N_, 0 }, { N_, 0 }, { N_, 0 },
+ { N_, 0 }, { N_, 0 }, { N_, 0 }, { N_, 0 },
+ { N_, 0 }, { N_, 0 }, { N_, 0 }, { N_, 0 },
+ { N_, 0 }, { N_, 0 }, { N_, 0 }, { N_, 0 },
+ { N_, 0 }, { N_, 0 }, { N_, 0 }, { N_, 0 },
+ { N_, 0 }, { N_, 0 }, { N_, 0 }, { N_, 0 },
+ { N_, 0 }, { N_, 0 }, { N_, 0 }, { N_, 0 },
+ { N_, 0 }, { N_, 0 }, { N_, 0 }, { N_, 0 },
+ { N_, 0 }, { N_, 0 }, { N_, 0 }, { N_, 0 },
+ { N_, 0 }, { N_, 0 }, { N_, 0 }, { N_, 0 },
+ { N_, 0 }, { N_, 0 }, { N_, 0 }, { N_, 0 },
+ { N_, 0 }, { N_, 0 }, { N_, 0 }, { N_, 0 },
+ { N_, 0 }, { N_, 0 }, { N_, 0 }, { N_, 0 },
+ { N_, 0 }, { N_, 0 }, { N_, 0 }, { N_, 0 },
+ { N_, 0 }, { N_, 0 }, { N_, 0 }, { N_, 0 },
+ { N_, 0 }, { N_, 0 }, { N_, 0 }, { N_, 0 },
+ { N_, 0 }, { N_, 0 }, { N_, 0 }, { N_, 0 },
+ { N_, 0 }, { N_, 0 }, { N_, 0 }, { N_, 0 },
+ { N_, 0 }, { N_, 0 }, { N_, 0 }, { N_, 0 },
+ { N_, 0 }, { N_, 0 }, { N_, 0 }, { N_, 0 },
+ { N_, 0 }, { N_, 0 }, { N_, 0 }, { N_, 0 },
+ { N_, 0 }, { N_, 0 }, { N_, 0 }, { N_, 0 },
+ { N_, 0 }, { N_, 0 }, { N_, 0 }, { N_, 0 },
+ { N_, 0 }, { N_, 0 }, { N_, 0 }, { N_, 0 },
+ { N_, 0 }, { N_, 0 }, { N_, 0 }, { N_, 0 },
+ { N_, 0 }, { N_, 0 }, { N_, 0 }, { N_, 0 },
+ { N_, 0 }, { N_, 0 }, { N_, 0 }, { N_, 0 },
+ { N_, 0 }, { 19, 868 }, { N_, 0 }, { N_, 0 },
+ { 20, 871 }, { 21, 967 }, { 22, 1063 }, { 23, 1147 },
+ { 24, 1234 }, { 25, 1324 }, { 26, 1420 }, { 27, 1503 },
+ { N_, 0 }, { N_, 0 }, { N_, 0 }, { N_, 0 },
+ { N_, 0 }, { N_, 0 }, { N_, 0 }, { N_, 0 },
+ { N_, 0 }, { N_, 0 }, { N_, 0 }, { N_, 0 },
+ { N_, 0 }, { N_, 0 }, { N_, 0 }, { N_, 0 },
+ { N_, 0 }, { N_, 0 }, { N_, 0 }, { N_, 0 },
+ { N_, 0 }, { N_, 0 }, { N_, 0 }, { N_, 0 },
+ { N_, 0 }, { N_, 0 }, { N_, 0 }, { N_, 0 },
+ { N_, 0 }, { N_, 0 }, { N_, 0 }, { N_, 0 },
+ { N_, 0 }, { N_, 0 }, { N_, 0 }, { N_, 0 },
+ { N_, 0 }, { N_, 0 }, { N_, 0 }, { N_, 0 },
+ { N_, 0 }, { N_, 0 }, { N_, 0 }, { N_, 0 },
+ { N_, 0 }, { N_, 0 }, { N_, 0 }, { N_, 0 },
+ { N_, 0 }, { N_, 0 }, { N_, 0 }, { N_, 0 },
+ { N_, 0 }, { N_, 0 }, { N_, 0 }, { N_, 0 },
+ { N_, 0 }, { N_, 0 }, { N_, 0 }, { N_, 0 },
+ { N_, 0 }, { N_, 0 }, { N_, 0 }, { N_, 0 },
+ },
+ { /* Third byte table 2. */
+ { N_, 0 }, { N_, 0 }, { N_, 0 }, { N_, 0 },
+ { N_, 0 }, { N_, 0 }, { N_, 0 }, { N_, 0 },
+ { N_, 0 }, { N_, 0 }, { N_, 0 }, { N_, 0 },
+ { N_, 0 }, { N_, 0 }, { N_, 0 }, { N_, 0 },
+ { N_, 0 }, { N_, 0 }, { N_, 0 }, { N_, 0 },
+ { N_, 0 }, { N_, 0 }, { N_, 0 }, { N_, 0 },
+ { N_, 0 }, { N_, 0 }, { N_, 0 }, { N_, 0 },
+ { N_, 0 }, { N_, 0 }, { N_, 0 }, { N_, 0 },
+ { N_, 0 }, { N_, 0 }, { N_, 0 }, { N_, 0 },
+ { N_, 0 }, { N_, 0 }, { N_, 0 }, { N_, 0 },
+ { N_, 0 }, { N_, 0 }, { N_, 0 }, { N_, 0 },
+ { N_, 0 }, { N_, 0 }, { N_, 0 }, { N_, 0 },
+ { N_, 0 }, { N_, 0 }, { N_, 0 }, { N_, 0 },
+ { N_, 0 }, { N_, 0 }, { N_, 0 }, { N_, 0 },
+ { N_, 0 }, { N_, 0 }, { N_, 0 }, { N_, 0 },
+ { N_, 0 }, { N_, 0 }, { N_, 0 }, { N_, 0 },
+ { N_, 0 }, { N_, 0 }, { N_, 0 }, { N_, 0 },
+ { N_, 0 }, { N_, 0 }, { N_, 0 }, { N_, 0 },
+ { N_, 0 }, { N_, 0 }, { N_, 0 }, { N_, 0 },
+ { N_, 0 }, { N_, 0 }, { N_, 0 }, { N_, 0 },
+ { N_, 0 }, { N_, 0 }, { N_, 0 }, { N_, 0 },
+ { N_, 0 }, { N_, 0 }, { N_, 0 }, { N_, 0 },
+ { N_, 0 }, { N_, 0 }, { N_, 0 }, { N_, 0 },
+ { N_, 0 }, { N_, 0 }, { N_, 0 }, { N_, 0 },
+ { N_, 0 }, { N_, 0 }, { N_, 0 }, { N_, 0 },
+ { N_, 0 }, { N_, 0 }, { N_, 0 }, { N_, 0 },
+ { N_, 0 }, { N_, 0 }, { N_, 0 }, { N_, 0 },
+ { N_, 0 }, { N_, 0 }, { N_, 0 }, { N_, 0 },
+ { N_, 0 }, { N_, 0 }, { N_, 0 }, { N_, 0 },
+ { N_, 0 }, { N_, 0 }, { N_, 0 }, { N_, 0 },
+ { N_, 0 }, { N_, 0 }, { N_, 0 }, { N_, 0 },
+ { N_, 0 }, { N_, 0 }, { N_, 0 }, { N_, 0 },
+ { N_, 0 }, { N_, 0 }, { N_, 0 }, { N_, 0 },
+ { N_, 0 }, { 28, 1524 }, { 29, 1575 }, { N_, 0 },
+ { N_, 0 }, { N_, 0 }, { N_, 0 }, { N_, 0 },
+ { N_, 0 }, { N_, 0 }, { N_, 0 }, { N_, 0 },
+ { N_, 0 }, { N_, 0 }, { N_, 0 }, { 30, 1578 },
+ { N_, 0 }, { N_, 0 }, { N_, 0 }, { N_, 0 },
+ { N_, 0 }, { N_, 0 }, { N_, 0 }, { N_, 0 },
+ { N_, 0 }, { N_, 0 }, { N_, 0 }, { N_, 0 },
+ { N_, 0 }, { N_, 0 }, { N_, 0 }, { N_, 0 },
+ { N_, 0 }, { N_, 0 }, { N_, 0 }, { N_, 0 },
+ { N_, 0 }, { N_, 0 }, { N_, 0 }, { N_, 0 },
+ { N_, 0 }, { N_, 0 }, { N_, 0 }, { N_, 0 },
+ { 31, 1656 }, { 32, 1704 }, { 33, 1816 }, { 34, 1912 },
+ { 35, 1966 }, { N_, 0 }, { N_, 0 }, { N_, 0 },
+ { N_, 0 }, { N_, 0 }, { N_, 0 }, { N_, 0 },
+ { N_, 0 }, { N_, 0 }, { N_, 0 }, { N_, 0 },
+ { N_, 0 }, { N_, 0 }, { N_, 0 }, { N_, 0 },
+ { N_, 0 }, { N_, 0 }, { N_, 0 }, { N_, 0 },
+ { N_, 0 }, { N_, 0 }, { N_, 0 }, { N_, 0 },
+ { N_, 0 }, { N_, 0 }, { N_, 0 }, { N_, 0 },
+ { N_, 0 }, { N_, 0 }, { N_, 0 }, { N_, 0 },
+ { N_, 0 }, { N_, 0 }, { N_, 0 }, { N_, 0 },
+ { N_, 0 }, { N_, 0 }, { N_, 0 }, { N_, 0 },
+ { N_, 0 }, { N_, 0 }, { N_, 0 }, { N_, 0 },
+ { N_, 0 }, { N_, 0 }, { N_, 0 }, { N_, 0 },
+ { N_, 0 }, { N_, 0 }, { N_, 0 }, { N_, 0 },
+ { N_, 0 }, { N_, 0 }, { N_, 0 }, { N_, 0 },
+ { N_, 0 }, { N_, 0 }, { N_, 0 }, { N_, 0 },
+ { N_, 0 }, { N_, 0 }, { N_, 0 }, { N_, 0 },
+ { N_, 0 }, { N_, 0 }, { N_, 0 }, { N_, 0 },
+ { N_, 0 }, { N_, 0 }, { N_, 0 }, { N_, 0 },
+ { N_, 0 }, { N_, 0 }, { N_, 0 }, { N_, 0 },
+ },
+ { /* Third byte table 3. */
+ { N_, 0 }, { N_, 0 }, { N_, 0 }, { N_, 0 },
+ { N_, 0 }, { N_, 0 }, { N_, 0 }, { N_, 0 },
+ { N_, 0 }, { N_, 0 }, { N_, 0 }, { N_, 0 },
+ { N_, 0 }, { N_, 0 }, { N_, 0 }, { N_, 0 },
+ { N_, 0 }, { N_, 0 }, { N_, 0 }, { N_, 0 },
+ { N_, 0 }, { N_, 0 }, { N_, 0 }, { N_, 0 },
+ { N_, 0 }, { N_, 0 }, { N_, 0 }, { N_, 0 },
+ { N_, 0 }, { N_, 0 }, { N_, 0 }, { N_, 0 },
+ { N_, 0 }, { N_, 0 }, { N_, 0 }, { N_, 0 },
+ { N_, 0 }, { N_, 0 }, { N_, 0 }, { N_, 0 },
+ { N_, 0 }, { N_, 0 }, { N_, 0 }, { N_, 0 },
+ { N_, 0 }, { N_, 0 }, { N_, 0 }, { N_, 0 },
+ { N_, 0 }, { N_, 0 }, { N_, 0 }, { N_, 0 },
+ { N_, 0 }, { N_, 0 }, { N_, 0 }, { N_, 0 },
+ { N_, 0 }, { N_, 0 }, { N_, 0 }, { N_, 0 },
+ { N_, 0 }, { N_, 0 }, { N_, 0 }, { N_, 0 },
+ { N_, 0 }, { N_, 0 }, { N_, 0 }, { N_, 0 },
+ { N_, 0 }, { N_, 0 }, { N_, 0 }, { N_, 0 },
+ { N_, 0 }, { N_, 0 }, { N_, 0 }, { N_, 0 },
+ { N_, 0 }, { N_, 0 }, { N_, 0 }, { N_, 0 },
+ { N_, 0 }, { N_, 0 }, { N_, 0 }, { N_, 0 },
+ { N_, 0 }, { N_, 0 }, { N_, 0 }, { N_, 0 },
+ { N_, 0 }, { N_, 0 }, { N_, 0 }, { N_, 0 },
+ { N_, 0 }, { N_, 0 }, { N_, 0 }, { N_, 0 },
+ { N_, 0 }, { N_, 0 }, { N_, 0 }, { N_, 0 },
+ { N_, 0 }, { N_, 0 }, { N_, 0 }, { N_, 0 },
+ { N_, 0 }, { N_, 0 }, { N_, 0 }, { N_, 0 },
+ { N_, 0 }, { N_, 0 }, { N_, 0 }, { N_, 0 },
+ { N_, 0 }, { N_, 0 }, { N_, 0 }, { N_, 0 },
+ { N_, 0 }, { N_, 0 }, { N_, 0 }, { N_, 0 },
+ { N_, 0 }, { N_, 0 }, { N_, 0 }, { N_, 0 },
+ { N_, 0 }, { N_, 0 }, { N_, 0 }, { N_, 0 },
+ { N_, 0 }, { N_, 0 }, { N_, 0 }, { N_, 0 },
+ { N_, 0 }, { N_, 0 }, { N_, 0 }, { N_, 0 },
+ { N_, 0 }, { N_, 0 }, { N_, 0 }, { N_, 0 },
+ { N_, 0 }, { N_, 0 }, { N_, 0 }, { N_, 0 },
+ { N_, 0 }, { N_, 0 }, { N_, 0 }, { N_, 0 },
+ { N_, 0 }, { N_, 0 }, { N_, 0 }, { N_, 0 },
+ { N_, 0 }, { N_, 0 }, { N_, 0 }, { N_, 0 },
+ { N_, 0 }, { N_, 0 }, { N_, 0 }, { N_, 0 },
+ { N_, 0 }, { N_, 0 }, { N_, 0 }, { N_, 0 },
+ { N_, 0 }, { N_, 0 }, { N_, 0 }, { N_, 0 },
+ { N_, 0 }, { N_, 0 }, { N_, 0 }, { N_, 0 },
+ { N_, 0 }, { N_, 0 }, { N_, 0 }, { N_, 0 },
+ { N_, 0 }, { N_, 0 }, { N_, 0 }, { N_, 0 },
+ { N_, 0 }, { N_, 0 }, { N_, 0 }, { N_, 0 },
+ { N_, 0 }, { N_, 0 }, { N_, 0 }, { N_, 0 },
+ { N_, 0 }, { 36, 2080 }, { N_, 0 }, { N_, 0 },
+ { N_, 0 }, { N_, 0 }, { N_, 0 }, { N_, 0 },
+ { N_, 0 }, { N_, 0 }, { N_, 0 }, { N_, 0 },
+ { N_, 0 }, { N_, 0 }, { N_, 0 }, { N_, 0 },
+ { N_, 0 }, { N_, 0 }, { N_, 0 }, { N_, 0 },
+ { N_, 0 }, { N_, 0 }, { N_, 0 }, { N_, 0 },
+ { N_, 0 }, { N_, 0 }, { N_, 0 }, { N_, 0 },
+ { N_, 0 }, { N_, 0 }, { N_, 0 }, { N_, 0 },
+ { N_, 0 }, { N_, 0 }, { N_, 0 }, { N_, 0 },
+ { N_, 0 }, { N_, 0 }, { N_, 0 }, { N_, 0 },
+ { N_, 0 }, { N_, 0 }, { N_, 0 }, { N_, 0 },
+ { N_, 0 }, { N_, 0 }, { N_, 0 }, { N_, 0 },
+ { N_, 0 }, { N_, 0 }, { N_, 0 }, { N_, 0 },
+ { N_, 0 }, { N_, 0 }, { N_, 0 }, { N_, 0 },
+ { N_, 0 }, { N_, 0 }, { N_, 0 }, { N_, 0 },
+ { N_, 0 }, { N_, 0 }, { N_, 0 }, { N_, 0 },
+ { N_, 0 }, { N_, 0 }, { N_, 0 }, { N_, 0 },
+ },
+ { /* Third byte table 4. */
+ { N_, 0 }, { N_, 0 }, { N_, 0 }, { N_, 0 },
+ { N_, 0 }, { N_, 0 }, { N_, 0 }, { N_, 0 },
+ { N_, 0 }, { N_, 0 }, { N_, 0 }, { N_, 0 },
+ { N_, 0 }, { N_, 0 }, { N_, 0 }, { N_, 0 },
+ { N_, 0 }, { N_, 0 }, { N_, 0 }, { N_, 0 },
+ { N_, 0 }, { N_, 0 }, { N_, 0 }, { N_, 0 },
+ { N_, 0 }, { N_, 0 }, { N_, 0 }, { N_, 0 },
+ { N_, 0 }, { N_, 0 }, { N_, 0 }, { N_, 0 },
+ { N_, 0 }, { N_, 0 }, { N_, 0 }, { N_, 0 },
+ { N_, 0 }, { N_, 0 }, { N_, 0 }, { N_, 0 },
+ { N_, 0 }, { N_, 0 }, { N_, 0 }, { N_, 0 },
+ { N_, 0 }, { N_, 0 }, { N_, 0 }, { N_, 0 },
+ { N_, 0 }, { N_, 0 }, { N_, 0 }, { N_, 0 },
+ { N_, 0 }, { N_, 0 }, { N_, 0 }, { N_, 0 },
+ { N_, 0 }, { N_, 0 }, { N_, 0 }, { N_, 0 },
+ { N_, 0 }, { N_, 0 }, { N_, 0 }, { N_, 0 },
+ { N_, 0 }, { N_, 0 }, { N_, 0 }, { N_, 0 },
+ { N_, 0 }, { N_, 0 }, { N_, 0 }, { N_, 0 },
+ { N_, 0 }, { N_, 0 }, { N_, 0 }, { N_, 0 },
+ { N_, 0 }, { N_, 0 }, { N_, 0 }, { N_, 0 },
+ { N_, 0 }, { N_, 0 }, { N_, 0 }, { N_, 0 },
+ { N_, 0 }, { N_, 0 }, { N_, 0 }, { N_, 0 },
+ { N_, 0 }, { N_, 0 }, { N_, 0 }, { N_, 0 },
+ { N_, 0 }, { N_, 0 }, { N_, 0 }, { N_, 0 },
+ { N_, 0 }, { N_, 0 }, { N_, 0 }, { N_, 0 },
+ { N_, 0 }, { N_, 0 }, { N_, 0 }, { N_, 0 },
+ { N_, 0 }, { N_, 0 }, { N_, 0 }, { N_, 0 },
+ { N_, 0 }, { N_, 0 }, { N_, 0 }, { N_, 0 },
+ { N_, 0 }, { N_, 0 }, { N_, 0 }, { N_, 0 },
+ { N_, 0 }, { N_, 0 }, { N_, 0 }, { N_, 0 },
+ { N_, 0 }, { N_, 0 }, { N_, 0 }, { N_, 0 },
+ { N_, 0 }, { N_, 0 }, { N_, 0 }, { N_, 0 },
+ { N_, 0 }, { N_, 0 }, { N_, 0 }, { N_, 0 },
+ { N_, 0 }, { N_, 0 }, { N_, 0 }, { N_, 0 },
+ { N_, 0 }, { N_, 0 }, { N_, 0 }, { N_, 0 },
+ { N_, 0 }, { N_, 0 }, { N_, 0 }, { N_, 0 },
+ { 37, 2158 }, { 38, 2254 }, { N_, 0 }, { N_, 0 },
+ { N_, 0 }, { N_, 0 }, { N_, 0 }, { N_, 0 },
+ { N_, 0 }, { N_, 0 }, { N_, 0 }, { N_, 0 },
+ { N_, 0 }, { N_, 0 }, { N_, 0 }, { N_, 0 },
+ { N_, 0 }, { N_, 0 }, { N_, 0 }, { N_, 0 },
+ { N_, 0 }, { N_, 0 }, { N_, 0 }, { N_, 0 },
+ { N_, 0 }, { N_, 0 }, { N_, 0 }, { N_, 0 },
+ { N_, 0 }, { N_, 0 }, { N_, 0 }, { N_, 0 },
+ { N_, 0 }, { N_, 0 }, { N_, 0 }, { N_, 0 },
+ { N_, 0 }, { N_, 0 }, { N_, 0 }, { N_, 0 },
+ { N_, 0 }, { N_, 0 }, { N_, 0 }, { N_, 0 },
+ { N_, 0 }, { N_, 0 }, { N_, 0 }, { N_, 0 },
+ { N_, 0 }, { N_, 0 }, { N_, 0 }, { N_, 0 },
+ { N_, 0 }, { N_, 0 }, { N_, 0 }, { N_, 0 },
+ { N_, 0 }, { N_, 0 }, { N_, 0 }, { N_, 0 },
+ { N_, 0 }, { N_, 0 }, { N_, 0 }, { N_, 0 },
+ { N_, 0 }, { N_, 0 }, { N_, 0 }, { N_, 0 },
+ { N_, 0 }, { N_, 0 }, { N_, 0 }, { N_, 0 },
+ { N_, 0 }, { N_, 0 }, { N_, 0 }, { N_, 0 },
+ { N_, 0 }, { N_, 0 }, { N_, 0 }, { N_, 0 },
+ { N_, 0 }, { N_, 0 }, { N_, 0 }, { N_, 0 },
+ { N_, 0 }, { N_, 0 }, { N_, 0 }, { N_, 0 },
+ { N_, 0 }, { N_, 0 }, { N_, 0 }, { N_, 0 },
+ { N_, 0 }, { N_, 0 }, { N_, 0 }, { N_, 0 },
+ { N_, 0 }, { N_, 0 }, { N_, 0 }, { N_, 0 },
+ { N_, 0 }, { N_, 0 }, { N_, 0 }, { N_, 0 },
+ { N_, 0 }, { N_, 0 }, { N_, 0 }, { N_, 0 },
+ { N_, 0 }, { N_, 0 }, { N_, 0 }, { N_, 0 },
+ },
+ },
+};
+
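+ /*
+  * Each fourth byte table holds 257 cumulative offsets, one more than
+  * the 256 possible byte values, so that the mapping length for a byte
+  * b4 can presumably be read as tbl[b4 + 1] - tbl[b4] without the
+  * b4 == 255 case running off the end.  A difference of 0 means the
+  * byte has no mapping at that position; differences of 2 or 3 match
+  * two- and three-byte UTF-8 results (for example, tables 19 and 20
+  * below step by threes throughout).
+  */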
+static const uchar_t u8_toupper_b4_tbl[2][39][257] = {
+ {
+ { /* Fourth byte table 0. */
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 2, 2,
+ 2, 2, 2, 2, 2, 2, 2, 2,
+ 2, 2, 2, 2, 2, 2, 2, 2,
+ 2, 2, 2, 2, 2, 2, 2, 2,
+ 2, 2, 2, 2, 2, 2, 2, 2,
+ 2, 2, 2, 2, 2, 2, 2, 2,
+ 2, 2, 2, 2, 2, 2, 2, 2,
+ 2, 2, 2, 2, 2, 2, 2, 2,
+ 2, 2, 2, 2, 2, 2, 2, 2,
+ 2, 2, 2, 2, 2, 2, 2, 2,
+ 2,
+ },
+ { /* Fourth byte table 1. */
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 2, 4, 6, 8, 10, 12, 14,
+ 16, 18, 20, 22, 24, 26, 28, 30,
+ 32, 34, 36, 38, 40, 42, 44, 46,
+ 46, 48, 50, 52, 54, 56, 58, 60,
+ 62, 62, 62, 62, 62, 62, 62, 62,
+ 62, 62, 62, 62, 62, 62, 62, 62,
+ 62, 62, 62, 62, 62, 62, 62, 62,
+ 62, 62, 62, 62, 62, 62, 62, 62,
+ 62, 62, 62, 62, 62, 62, 62, 62,
+ 62, 62, 62, 62, 62, 62, 62, 62,
+ 62, 62, 62, 62, 62, 62, 62, 62,
+ 62, 62, 62, 62, 62, 62, 62, 62,
+ 62,
+ },
+ { /* Fourth byte table 2. */
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 2, 2, 4, 4, 6, 6,
+ 8, 8, 10, 10, 12, 12, 14, 14,
+ 16, 16, 18, 18, 20, 20, 22, 22,
+ 24, 24, 26, 26, 28, 28, 30, 30,
+ 32, 32, 34, 34, 36, 36, 38, 38,
+ 40, 40, 42, 42, 44, 44, 46, 46,
+ 48, 48, 49, 49, 51, 51, 53, 53,
+ 55, 55, 55, 57, 57, 59, 59, 61,
+ 61, 61, 61, 61, 61, 61, 61, 61,
+ 61, 61, 61, 61, 61, 61, 61, 61,
+ 61, 61, 61, 61, 61, 61, 61, 61,
+ 61, 61, 61, 61, 61, 61, 61, 61,
+ 61, 61, 61, 61, 61, 61, 61, 61,
+ 61, 61, 61, 61, 61, 61, 61, 61,
+ 61, 61, 61, 61, 61, 61, 61, 61,
+ 61, 61, 61, 61, 61, 61, 61, 61,
+ 61,
+ },
+ { /* Fourth byte table 3. */
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 2, 2, 4, 4, 6, 6, 8,
+ 8, 10, 10, 10, 12, 12, 14, 14,
+ 16, 16, 18, 18, 20, 20, 22, 22,
+ 24, 24, 26, 26, 28, 28, 30, 30,
+ 32, 32, 34, 34, 36, 36, 38, 38,
+ 40, 40, 42, 42, 44, 44, 46, 46,
+ 48, 48, 50, 50, 52, 52, 54, 54,
+ 56, 56, 56, 58, 58, 60, 60, 62,
+ 63, 63, 63, 63, 63, 63, 63, 63,
+ 63, 63, 63, 63, 63, 63, 63, 63,
+ 63, 63, 63, 63, 63, 63, 63, 63,
+ 63, 63, 63, 63, 63, 63, 63, 63,
+ 63, 63, 63, 63, 63, 63, 63, 63,
+ 63, 63, 63, 63, 63, 63, 63, 63,
+ 63, 63, 63, 63, 63, 63, 63, 63,
+ 63, 63, 63, 63, 63, 63, 63, 63,
+ 63,
+ },
+ { /* Fourth byte table 4. */
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 2, 2, 4, 4,
+ 4, 6, 6, 6, 6, 8, 8, 8,
+ 8, 8, 8, 10, 10, 10, 12, 12,
+ 12, 12, 14, 14, 14, 14, 14, 16,
+ 16, 16, 18, 18, 20, 20, 22, 22,
+ 22, 24, 24, 24, 24, 24, 26, 26,
+ 26, 28, 28, 28, 28, 30, 30, 32,
+ 32, 32, 34, 34, 34, 34, 36, 36,
+ 38, 38, 38, 38, 38, 38, 38, 38,
+ 38, 38, 38, 38, 38, 38, 38, 38,
+ 38, 38, 38, 38, 38, 38, 38, 38,
+ 38, 38, 38, 38, 38, 38, 38, 38,
+ 38, 38, 38, 38, 38, 38, 38, 38,
+ 38, 38, 38, 38, 38, 38, 38, 38,
+ 38, 38, 38, 38, 38, 38, 38, 38,
+ 38, 38, 38, 38, 38, 38, 38, 38,
+ 38,
+ },
+ { /* Fourth byte table 5. */
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 2, 4,
+ 4, 6, 8, 8, 10, 12, 12, 14,
+ 14, 16, 16, 18, 18, 20, 20, 22,
+ 22, 24, 24, 26, 26, 28, 30, 30,
+ 32, 32, 34, 34, 36, 36, 38, 38,
+ 40, 40, 42, 42, 44, 44, 46, 46,
+ 48, 48, 48, 50, 52, 52, 54, 54,
+ 54, 54, 56, 56, 58, 58, 60, 60,
+ 62, 62, 62, 62, 62, 62, 62, 62,
+ 62, 62, 62, 62, 62, 62, 62, 62,
+ 62, 62, 62, 62, 62, 62, 62, 62,
+ 62, 62, 62, 62, 62, 62, 62, 62,
+ 62, 62, 62, 62, 62, 62, 62, 62,
+ 62, 62, 62, 62, 62, 62, 62, 62,
+ 62, 62, 62, 62, 62, 62, 62, 62,
+ 62, 62, 62, 62, 62, 62, 62, 62,
+ 62,
+ },
+ { /* Fourth byte table 6. */
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 2, 2, 4, 4, 6, 6,
+ 8, 8, 10, 10, 12, 12, 14, 14,
+ 16, 16, 18, 18, 20, 20, 22, 22,
+ 24, 24, 26, 26, 28, 28, 30, 30,
+ 32, 32, 32, 32, 34, 34, 36, 36,
+ 38, 38, 40, 40, 42, 42, 44, 44,
+ 46, 46, 48, 48, 50, 50, 50, 50,
+ 50, 50, 50, 50, 50, 50, 50, 50,
+ 50, 50, 50, 50, 50, 50, 50, 50,
+ 50, 50, 50, 50, 50, 50, 50, 50,
+ 50, 50, 50, 50, 50, 50, 50, 50,
+ 50, 50, 50, 50, 50, 50, 50, 50,
+ 50, 50, 50, 50, 50, 50, 50, 50,
+ 50, 50, 50, 50, 50, 50, 50, 50,
+ 50, 50, 50, 50, 50, 50, 50, 50,
+ 50, 50, 50, 50, 50, 50, 50, 50,
+ 50,
+ },
+ { /* Fourth byte table 7. */
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 2, 4, 4, 6,
+ 8, 8, 10, 10, 12, 12, 12, 12,
+ 12, 14, 14, 14, 16, 16, 16, 16,
+ 16, 18, 20, 20, 20, 20, 20, 20,
+ 22, 22, 22, 24, 24, 24, 26, 26,
+ 26, 26, 26, 26, 26, 26, 26, 26,
+ 26, 26, 26, 26, 26, 26, 26, 26,
+ 26, 26, 26, 26, 26, 26, 26, 26,
+ 26, 26, 26, 26, 26, 26, 26, 26,
+ 26, 26, 26, 26, 26, 26, 26, 26,
+ 26, 26, 26, 26, 26, 26, 26, 26,
+ 26, 26, 26, 26, 26, 26, 26, 26,
+ 26, 26, 26, 26, 26, 26, 26, 26,
+ 26, 26, 26, 26, 26, 26, 26, 26,
+ 26,
+ },
+ { /* Fourth byte table 8. */
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 2, 2, 2, 4, 4, 4, 4,
+ 4, 6, 6, 8, 10, 10, 10, 10,
+ 10, 10, 10, 12, 12, 12, 12, 12,
+ 12, 12, 12, 12, 12, 12, 12, 12,
+ 12, 12, 12, 12, 12, 12, 12, 12,
+ 12, 12, 12, 12, 12, 12, 12, 12,
+ 12, 12, 12, 12, 12, 12, 12, 12,
+ 12, 12, 12, 12, 12, 12, 12, 12,
+ 12, 12, 12, 12, 12, 12, 12, 12,
+ 12, 12, 12, 12, 12, 12, 12, 12,
+ 12, 12, 12, 12, 12, 12, 12, 12,
+ 12, 12, 12, 12, 12, 12, 12, 12,
+ 12, 12, 12, 12, 12, 12, 12, 12,
+ 12, 12, 12, 12, 12, 12, 12, 12,
+ 12, 12, 12, 12, 12, 12, 12, 12,
+ 12, 12, 12, 12, 12, 12, 12, 12,
+ 12,
+ },
+ { /* Fourth byte table 9. */
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 2, 2,
+ 2, 2, 2, 2, 2, 2, 2, 2,
+ 2, 2, 2, 2, 2, 2, 2, 2,
+ 2, 2, 2, 2, 2, 2, 2, 2,
+ 2, 2, 2, 2, 2, 2, 2, 2,
+ 2, 2, 2, 2, 2, 2, 2, 2,
+ 2, 2, 2, 2, 2, 2, 2, 2,
+ 2, 2, 2, 2, 2, 2, 2, 2,
+ 2, 2, 2, 2, 2, 2, 2, 2,
+ 2, 2, 2, 2, 2, 2, 2, 2,
+ 2, 2, 2, 2, 2, 2, 2, 2,
+ 2, 2, 2, 2, 2, 2, 2, 2,
+ 2, 2, 2, 2, 2, 2, 2, 2,
+ 2, 2, 2, 2, 2, 2, 2, 2,
+ 2, 2, 2, 2, 2, 2, 2, 2,
+ 2, 2, 2, 2, 2, 2, 2, 2,
+ 2,
+ },
+ { /* Fourth byte table 10. */
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 2, 4, 6,
+ 8, 8, 10, 12, 14, 16, 18, 20,
+ 22, 24, 26, 28, 30, 32, 34, 36,
+ 38, 38, 38, 38, 38, 38, 38, 38,
+ 38, 38, 38, 38, 38, 38, 38, 38,
+ 38, 38, 38, 38, 38, 38, 38, 38,
+ 38, 38, 38, 38, 38, 38, 38, 38,
+ 38, 38, 38, 38, 38, 38, 38, 38,
+ 38, 38, 38, 38, 38, 38, 38, 38,
+ 38, 38, 38, 38, 38, 38, 38, 38,
+ 38, 38, 38, 38, 38, 38, 38, 38,
+ 38,
+ },
+ { /* Fourth byte table 11. */
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 2, 4, 6, 8, 10, 12, 14,
+ 16, 18, 20, 22, 24, 26, 28, 30,
+ 30, 32, 34, 34, 34, 34, 36, 38,
+ 38, 38, 40, 40, 42, 42, 44, 44,
+ 46, 46, 48, 48, 50, 50, 52, 52,
+ 54, 54, 56, 56, 58, 58, 60, 60,
+ 62, 64, 66, 68, 68, 68, 70, 70,
+ 70, 70, 70, 70, 70, 70, 70, 70,
+ 70, 70, 70, 70, 70, 70, 70, 70,
+ 70, 70, 70, 70, 70, 70, 70, 70,
+ 70, 70, 70, 70, 70, 70, 70, 70,
+ 70, 70, 70, 70, 70, 70, 70, 70,
+ 70, 70, 70, 70, 70, 70, 70, 70,
+ 70, 70, 70, 70, 70, 70, 70, 70,
+ 70, 70, 70, 70, 70, 70, 70, 70,
+ 70, 70, 70, 70, 70, 70, 70, 70,
+ 70,
+ },
+ { /* Fourth byte table 12. */
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 2, 4, 6, 8, 10, 12, 14,
+ 16, 18, 20, 22, 24, 26, 28, 30,
+ 32, 32, 32, 32, 32, 32, 32, 32,
+ 32, 32, 32, 32, 32, 32, 32, 32,
+ 32, 32, 32, 32, 32, 32, 32, 32,
+ 32, 32, 32, 32, 32, 32, 32, 32,
+ 32, 32, 32, 32, 32, 32, 32, 32,
+ 32, 32, 32, 32, 32, 32, 32, 32,
+ 32, 32, 32, 32, 32, 32, 32, 32,
+ 32, 32, 32, 32, 32, 32, 32, 32,
+ 32,
+ },
+ { /* Fourth byte table 13. */
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 2, 4, 6, 8, 10, 12, 14,
+ 16, 18, 20, 22, 24, 26, 28, 30,
+ 32, 34, 36, 38, 40, 42, 44, 46,
+ 48, 50, 52, 54, 56, 58, 60, 62,
+ 64, 64, 66, 66, 68, 68, 70, 70,
+ 72, 72, 74, 74, 76, 76, 78, 78,
+ 80, 80, 82, 82, 84, 84, 86, 86,
+ 88, 88, 90, 90, 92, 92, 94, 94,
+ 96, 96, 96, 96, 96, 96, 96, 96,
+ 96, 96, 96, 96, 96, 96, 96, 96,
+ 96, 96, 96, 96, 96, 96, 96, 96,
+ 96, 96, 96, 96, 96, 96, 96, 96,
+ 96, 96, 96, 96, 96, 96, 96, 96,
+ 96, 96, 96, 96, 96, 96, 96, 96,
+ 96, 96, 96, 96, 96, 96, 96, 96,
+ 96, 96, 96, 96, 96, 96, 96, 96,
+ 96,
+ },
+ { /* Fourth byte table 14. */
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 2, 2, 2, 2, 2, 2,
+ 2, 2, 2, 2, 4, 4, 6, 6,
+ 8, 8, 10, 10, 12, 12, 14, 14,
+ 16, 16, 18, 18, 20, 20, 22, 22,
+ 24, 24, 26, 26, 28, 28, 30, 30,
+ 32, 32, 34, 34, 36, 36, 38, 38,
+ 40, 40, 42, 42, 44, 44, 46, 46,
+ 48, 48, 50, 50, 52, 52, 54, 54,
+ 56, 56, 56, 56, 56, 56, 56, 56,
+ 56, 56, 56, 56, 56, 56, 56, 56,
+ 56, 56, 56, 56, 56, 56, 56, 56,
+ 56, 56, 56, 56, 56, 56, 56, 56,
+ 56, 56, 56, 56, 56, 56, 56, 56,
+ 56, 56, 56, 56, 56, 56, 56, 56,
+ 56, 56, 56, 56, 56, 56, 56, 56,
+ 56, 56, 56, 56, 56, 56, 56, 56,
+ 56,
+ },
+ { /* Fourth byte table 15. */
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 2, 2, 4, 4, 6,
+ 6, 8, 8, 10, 10, 12, 12, 14,
+ 14, 14, 16, 16, 18, 18, 20, 20,
+ 22, 22, 24, 24, 26, 26, 28, 28,
+ 30, 30, 32, 32, 34, 34, 36, 36,
+ 38, 38, 40, 40, 42, 42, 44, 44,
+ 46, 46, 48, 48, 50, 50, 52, 52,
+ 52, 52, 54, 54, 54, 54, 54, 54,
+ 54, 54, 54, 54, 54, 54, 54, 54,
+ 54, 54, 54, 54, 54, 54, 54, 54,
+ 54, 54, 54, 54, 54, 54, 54, 54,
+ 54, 54, 54, 54, 54, 54, 54, 54,
+ 54, 54, 54, 54, 54, 54, 54, 54,
+ 54, 54, 54, 54, 54, 54, 54, 54,
+ 54, 54, 54, 54, 54, 54, 54, 54,
+ 54, 54, 54, 54, 54, 54, 54, 54,
+ 54,
+ },
+ { /* Fourth byte table 16. */
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 2, 2, 4, 4, 6, 6,
+ 8, 8, 10, 10, 12, 12, 14, 14,
+ 16, 16, 16, 16, 16, 16, 16, 16,
+ 16, 16, 16, 16, 16, 16, 16, 16,
+ 16, 16, 16, 16, 16, 16, 16, 16,
+ 16, 16, 16, 16, 16, 16, 16, 16,
+ 16, 16, 16, 16, 16, 16, 16, 16,
+ 16, 16, 16, 16, 16, 16, 16, 16,
+ 16, 16, 16, 16, 16, 16, 16, 16,
+ 16, 16, 16, 16, 16, 16, 16, 16,
+ 16, 16, 16, 16, 16, 16, 16, 16,
+ 16, 16, 16, 16, 16, 16, 16, 16,
+ 16, 16, 16, 16, 16, 16, 16, 16,
+ 16, 16, 16, 16, 16, 16, 16, 16,
+ 16, 16, 16, 16, 16, 16, 16, 16,
+ 16, 16, 16, 16, 16, 16, 16, 16,
+ 16,
+ },
+ { /* Fourth byte table 17. */
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 2, 4, 6, 8, 10, 12,
+ 14, 16, 18, 20, 22, 24, 26, 28,
+ 30, 32, 34, 36, 38, 40, 42, 44,
+ 46, 48, 50, 52, 54, 56, 58, 60,
+ 62, 62, 62, 62, 62, 62, 62, 62,
+ 62, 62, 62, 62, 62, 62, 62, 62,
+ 62, 62, 62, 62, 62, 62, 62, 62,
+ 62, 62, 62, 62, 62, 62, 62, 62,
+ 62, 62, 62, 62, 62, 62, 62, 62,
+ 62, 62, 62, 62, 62, 62, 62, 62,
+ 62, 62, 62, 62, 62, 62, 62, 62,
+ 62, 62, 62, 62, 62, 62, 62, 62,
+ 62,
+ },
+ { /* Fourth byte table 18. */
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 2, 4, 6, 8, 10, 12, 14,
+ 14, 14, 14, 14, 14, 14, 14, 14,
+ 14, 14, 14, 14, 14, 14, 14, 14,
+ 14, 14, 14, 14, 14, 14, 14, 14,
+ 14, 14, 14, 14, 14, 14, 14, 14,
+ 14, 14, 14, 14, 14, 14, 14, 14,
+ 14, 14, 14, 14, 14, 14, 14, 14,
+ 14, 14, 14, 14, 14, 14, 14, 14,
+ 14, 14, 14, 14, 14, 14, 14, 14,
+ 14, 14, 14, 14, 14, 14, 14, 14,
+ 14, 14, 14, 14, 14, 14, 14, 14,
+ 14, 14, 14, 14, 14, 14, 14, 14,
+ 14, 14, 14, 14, 14, 14, 14, 14,
+ 14, 14, 14, 14, 14, 14, 14, 14,
+ 14, 14, 14, 14, 14, 14, 14, 14,
+ 14, 14, 14, 14, 14, 14, 14, 14,
+ 14,
+ },
+ { /* Fourth byte table 19. */
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 3, 3, 6, 6, 9, 9,
+ 12, 12, 15, 15, 18, 18, 21, 21,
+ 24, 24, 27, 27, 30, 30, 33, 33,
+ 36, 36, 39, 39, 42, 42, 45, 45,
+ 48, 48, 51, 51, 54, 54, 57, 57,
+ 60, 60, 63, 63, 66, 66, 69, 69,
+ 72, 72, 75, 75, 78, 78, 81, 81,
+ 84, 84, 87, 87, 90, 90, 93, 93,
+ 96, 96, 96, 96, 96, 96, 96, 96,
+ 96, 96, 96, 96, 96, 96, 96, 96,
+ 96, 96, 96, 96, 96, 96, 96, 96,
+ 96, 96, 96, 96, 96, 96, 96, 96,
+ 96, 96, 96, 96, 96, 96, 96, 96,
+ 96, 96, 96, 96, 96, 96, 96, 96,
+ 96, 96, 96, 96, 96, 96, 96, 96,
+ 96, 96, 96, 96, 96, 96, 96, 96,
+ 96,
+ },
+ { /* Fourth byte table 20. */
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 3, 3, 6, 6, 9, 9,
+ 12, 12, 15, 15, 18, 18, 21, 21,
+ 24, 24, 27, 27, 30, 30, 33, 33,
+ 36, 36, 39, 39, 42, 42, 45, 45,
+ 48, 48, 51, 51, 54, 54, 57, 57,
+ 60, 60, 63, 63, 66, 66, 69, 69,
+ 72, 72, 75, 75, 78, 78, 81, 81,
+ 84, 84, 87, 87, 90, 90, 93, 93,
+ 96, 96, 96, 96, 96, 96, 96, 96,
+ 96, 96, 96, 96, 96, 96, 96, 96,
+ 96, 96, 96, 96, 96, 96, 96, 96,
+ 96, 96, 96, 96, 96, 96, 96, 96,
+ 96, 96, 96, 96, 96, 96, 96, 96,
+ 96, 96, 96, 96, 96, 96, 96, 96,
+ 96, 96, 96, 96, 96, 96, 96, 96,
+ 96, 96, 96, 96, 96, 96, 96, 96,
+ 96,
+ },
+ { /* Fourth byte table 21. */
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 3, 3, 6, 6, 9, 9,
+ 12, 12, 15, 15, 18, 18, 21, 21,
+ 24, 24, 27, 27, 30, 30, 33, 33,
+ 33, 33, 33, 33, 36, 36, 36, 36,
+ 36, 36, 39, 39, 42, 42, 45, 45,
+ 48, 48, 51, 51, 54, 54, 57, 57,
+ 60, 60, 63, 63, 66, 66, 69, 69,
+ 72, 72, 75, 75, 78, 78, 81, 81,
+ 84, 84, 84, 84, 84, 84, 84, 84,
+ 84, 84, 84, 84, 84, 84, 84, 84,
+ 84, 84, 84, 84, 84, 84, 84, 84,
+ 84, 84, 84, 84, 84, 84, 84, 84,
+ 84, 84, 84, 84, 84, 84, 84, 84,
+ 84, 84, 84, 84, 84, 84, 84, 84,
+ 84, 84, 84, 84, 84, 84, 84, 84,
+ 84, 84, 84, 84, 84, 84, 84, 84,
+ 84,
+ },
+ { /* Fourth byte table 22. */
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 3, 3, 6, 6, 9, 9,
+ 12, 12, 15, 15, 18, 18, 21, 21,
+ 24, 24, 27, 27, 30, 30, 33, 33,
+ 36, 36, 39, 39, 42, 42, 45, 45,
+ 48, 48, 51, 51, 54, 54, 57, 57,
+ 60, 60, 63, 63, 66, 66, 69, 69,
+ 72, 72, 75, 75, 78, 78, 81, 81,
+ 84, 84, 87, 87, 87, 87, 87, 87,
+ 87, 87, 87, 87, 87, 87, 87, 87,
+ 87, 87, 87, 87, 87, 87, 87, 87,
+ 87, 87, 87, 87, 87, 87, 87, 87,
+ 87, 87, 87, 87, 87, 87, 87, 87,
+ 87, 87, 87, 87, 87, 87, 87, 87,
+ 87, 87, 87, 87, 87, 87, 87, 87,
+ 87, 87, 87, 87, 87, 87, 87, 87,
+ 87, 87, 87, 87, 87, 87, 87, 87,
+ 87,
+ },
+ { /* Fourth byte table 23. */
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 3, 6, 9, 12, 15, 18, 21,
+ 24, 24, 24, 24, 24, 24, 24, 24,
+ 24, 27, 30, 33, 36, 39, 42, 42,
+ 42, 42, 42, 42, 42, 42, 42, 42,
+ 42, 45, 48, 51, 54, 57, 60, 63,
+ 66, 66, 66, 66, 66, 66, 66, 66,
+ 66, 69, 72, 75, 78, 81, 84, 87,
+ 90, 90, 90, 90, 90, 90, 90, 90,
+ 90, 90, 90, 90, 90, 90, 90, 90,
+ 90, 90, 90, 90, 90, 90, 90, 90,
+ 90, 90, 90, 90, 90, 90, 90, 90,
+ 90, 90, 90, 90, 90, 90, 90, 90,
+ 90, 90, 90, 90, 90, 90, 90, 90,
+ 90, 90, 90, 90, 90, 90, 90, 90,
+ 90, 90, 90, 90, 90, 90, 90, 90,
+ 90, 90, 90, 90, 90, 90, 90, 90,
+ 90,
+ },
+ { /* Fourth byte table 24. */
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 3, 6, 9, 12, 15, 18, 18,
+ 18, 18, 18, 18, 18, 18, 18, 18,
+ 18, 18, 21, 21, 24, 24, 27, 27,
+ 30, 30, 30, 30, 30, 30, 30, 30,
+ 30, 33, 36, 39, 42, 45, 48, 51,
+ 54, 54, 54, 54, 54, 54, 54, 54,
+ 54, 57, 60, 63, 66, 69, 72, 75,
+ 78, 81, 84, 87, 90, 93, 96, 96,
+ 96, 96, 96, 96, 96, 96, 96, 96,
+ 96, 96, 96, 96, 96, 96, 96, 96,
+ 96, 96, 96, 96, 96, 96, 96, 96,
+ 96, 96, 96, 96, 96, 96, 96, 96,
+ 96, 96, 96, 96, 96, 96, 96, 96,
+ 96, 96, 96, 96, 96, 96, 96, 96,
+ 96, 96, 96, 96, 96, 96, 96, 96,
+ 96, 96, 96, 96, 96, 96, 96, 96,
+ 96,
+ },
+ { /* Fourth byte table 25. */
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 3, 6, 9, 12, 15, 18, 21,
+ 24, 24, 24, 24, 24, 24, 24, 24,
+ 24, 27, 30, 33, 36, 39, 42, 45,
+ 48, 48, 48, 48, 48, 48, 48, 48,
+ 48, 51, 54, 57, 60, 63, 66, 69,
+ 72, 72, 72, 72, 72, 72, 72, 72,
+ 72, 75, 78, 78, 81, 81, 81, 81,
+ 81, 81, 81, 81, 81, 81, 81, 83,
+ 83, 83, 83, 83, 83, 83, 83, 83,
+ 83, 83, 83, 83, 83, 83, 83, 83,
+ 83, 83, 83, 83, 83, 83, 83, 83,
+ 83, 83, 83, 83, 83, 83, 83, 83,
+ 83, 83, 83, 83, 83, 83, 83, 83,
+ 83, 83, 83, 83, 83, 83, 83, 83,
+ 83, 83, 83, 83, 83, 83, 83, 83,
+ 83, 83, 83, 83, 83, 83, 83, 83,
+ 83,
+ },
+ { /* Fourth byte table 26. */
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 3, 3, 3, 3,
+ 3, 3, 3, 3, 3, 3, 3, 3,
+ 3, 6, 9, 9, 9, 9, 9, 9,
+ 9, 9, 9, 9, 9, 9, 9, 9,
+ 9, 12, 15, 15, 15, 15, 18, 18,
+ 18, 18, 18, 18, 18, 18, 18, 18,
+ 18, 18, 18, 18, 21, 21, 21, 21,
+ 21, 21, 21, 21, 21, 21, 21, 21,
+ 21, 21, 21, 21, 21, 21, 21, 21,
+ 21, 21, 21, 21, 21, 21, 21, 21,
+ 21, 21, 21, 21, 21, 21, 21, 21,
+ 21, 21, 21, 21, 21, 21, 21, 21,
+ 21, 21, 21, 21, 21, 21, 21, 21,
+ 21, 21, 21, 21, 21, 21, 21, 21,
+ 21, 21, 21, 21, 21, 21, 21, 21,
+ 21, 21, 21, 21, 21, 21, 21, 21,
+ 21,
+ },
+ { /* Fourth byte table 27. */
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 3, 6, 9, 12, 15, 18, 21,
+ 24, 27, 30, 33, 36, 39, 42, 45,
+ 48, 48, 48, 48, 48, 48, 48, 48,
+ 48, 48, 48, 48, 48, 48, 48, 48,
+ 48, 48, 48, 48, 48, 48, 48, 48,
+ 48, 48, 48, 48, 48, 48, 48, 48,
+ 48, 48, 48, 48, 48, 48, 48, 48,
+ 48, 48, 48, 48, 48, 48, 48, 48,
+ 48, 48, 48, 48, 48, 48, 48, 48,
+ 48, 48, 48, 48, 48, 48, 48, 48,
+ 48,
+ },
+ { /* Fourth byte table 28. */
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 3, 6, 9, 12, 15, 18, 21,
+ 24, 27, 30, 33, 36, 39, 42, 45,
+ 48, 51, 54, 57, 60, 63, 66, 69,
+ 72, 75, 78, 78, 78, 78, 78, 78,
+ 78, 78, 78, 78, 78, 78, 78, 78,
+ 78, 78, 78, 78, 78, 78, 78, 78,
+ 78, 78, 78, 78, 78, 78, 78, 78,
+ 78, 78, 78, 78, 78, 78, 78, 78,
+ 78, 78, 78, 78, 78, 78, 78, 78,
+ 78, 78, 78, 78, 78, 78, 78, 78,
+ 78, 78, 78, 78, 78, 78, 78, 78,
+ 78, 78, 78, 78, 78, 78, 78, 78,
+ 78, 78, 78, 78, 78, 78, 78, 78,
+ 78, 78, 78, 78, 78, 78, 78, 78,
+ 78,
+ },
+ { /* Fourth byte table 29. */
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 3, 6, 9, 12, 15, 18,
+ 21, 24, 27, 30, 33, 36, 39, 42,
+ 45, 48, 51, 54, 57, 60, 63, 66,
+ 69, 72, 75, 78, 78, 78, 78, 78,
+ 78, 78, 78, 78, 78, 78, 78, 78,
+ 78, 78, 78, 78, 78, 78, 78, 78,
+ 78, 78, 78, 78, 78, 78, 78, 78,
+ 78, 78, 78, 78, 78, 78, 78, 78,
+ 78, 78, 78, 78, 78, 78, 78, 78,
+ 78, 78, 78, 78, 78, 78, 78, 78,
+ 78, 78, 78, 78, 78, 78, 78, 78,
+ 78, 78, 78, 78, 78, 78, 78, 78,
+ 78, 78, 78, 78, 78, 78, 78, 78,
+ 78, 78, 78, 78, 78, 78, 78, 78,
+ 78, 78, 78, 78, 78, 78, 78, 78,
+ 78, 78, 78, 78, 78, 78, 78, 78,
+ 78,
+ },
+ { /* Fourth byte table 30. */
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 4, 8, 12, 16, 20, 24, 28,
+ 32, 36, 40, 44, 48, 52, 56, 60,
+ 64, 68, 72, 76, 80, 84, 88, 92,
+ 96, 96, 96, 96, 96, 96, 96, 96,
+ 96, 96, 96, 96, 96, 96, 96, 96,
+ 96, 96, 96, 96, 96, 96, 96, 96,
+ 96, 96, 96, 96, 96, 96, 96, 96,
+ 96, 96, 96, 96, 96, 96, 96, 96,
+ 96, 96, 96, 96, 96, 96, 96, 96,
+ 96, 96, 96, 96, 96, 96, 96, 96,
+ 96, 96, 96, 96, 96, 96, 96, 96,
+ 96,
+ },
+ { /* Fourth byte table 31. */
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 4, 8, 12, 16, 20, 24, 28,
+ 32, 36, 40, 44, 48, 52, 56, 56,
+ 56, 56, 56, 56, 56, 56, 56, 56,
+ 56, 56, 56, 56, 56, 56, 56, 56,
+ 56, 56, 56, 56, 56, 56, 56, 56,
+ 56, 56, 56, 56, 56, 56, 56, 56,
+ 56, 56, 56, 56, 56, 56, 56, 56,
+ 56, 56, 56, 56, 56, 56, 56, 56,
+ 56, 56, 56, 56, 56, 56, 56, 56,
+ 56, 56, 56, 56, 56, 56, 56, 56,
+ 56, 56, 56, 56, 56, 56, 56, 56,
+ 56, 56, 56, 56, 56, 56, 56, 56,
+ 56, 56, 56, 56, 56, 56, 56, 56,
+ 56, 56, 56, 56, 56, 56, 56, 56,
+ 56, 56, 56, 56, 56, 56, 56, 56,
+ 56, 56, 56, 56, 56, 56, 56, 56,
+ 56,
+ },
+ { /* Fourth byte table 32. */
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0,
+ },
+ { /* Fourth byte table 33. */
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0,
+ },
+ { /* Fourth byte table 34. */
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0,
+ },
+ { /* Fourth byte table 35. */
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0,
+ },
+ { /* Fourth byte table 36. */
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0,
+ },
+ { /* Fourth byte table 37. */
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0,
+ },
+ { /* Fourth byte table 38. */
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0,
+ },
+ },
+ {
+ { /* Fourth byte table 0. */
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 2, 2,
+ 2, 2, 2, 2, 2, 2, 2, 2,
+ 2, 2, 2, 2, 2, 2, 2, 2,
+ 2, 2, 2, 2, 2, 2, 2, 2,
+ 2, 2, 2, 2, 2, 2, 2, 2,
+ 2, 2, 2, 2, 2, 2, 2, 2,
+ 2, 2, 2, 2, 2, 2, 2, 2,
+ 2, 2, 2, 2, 2, 2, 2, 2,
+ 2, 2, 2, 2, 2, 2, 2, 2,
+ 2, 2, 2, 2, 2, 2, 2, 2,
+ 2,
+ },
+ { /* Fourth byte table 1. */
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 2, 4, 6, 8, 10, 12, 14,
+ 16, 18, 20, 22, 24, 26, 28, 30,
+ 32, 34, 36, 38, 40, 42, 44, 46,
+ 46, 48, 50, 52, 54, 56, 58, 60,
+ 62, 62, 62, 62, 62, 62, 62, 62,
+ 62, 62, 62, 62, 62, 62, 62, 62,
+ 62, 62, 62, 62, 62, 62, 62, 62,
+ 62, 62, 62, 62, 62, 62, 62, 62,
+ 62, 62, 62, 62, 62, 62, 62, 62,
+ 62, 62, 62, 62, 62, 62, 62, 62,
+ 62, 62, 62, 62, 62, 62, 62, 62,
+ 62, 62, 62, 62, 62, 62, 62, 62,
+ 62,
+ },
+ { /* Fourth byte table 2. */
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 2, 2, 4, 4, 6, 6,
+ 8, 8, 10, 10, 12, 12, 14, 14,
+ 16, 16, 18, 18, 20, 20, 22, 22,
+ 24, 24, 26, 26, 28, 28, 30, 30,
+ 32, 32, 34, 34, 36, 36, 38, 38,
+ 40, 40, 42, 42, 44, 44, 46, 46,
+ 48, 48, 49, 49, 51, 51, 53, 53,
+ 55, 55, 55, 57, 57, 59, 59, 61,
+ 61, 61, 61, 61, 61, 61, 61, 61,
+ 61, 61, 61, 61, 61, 61, 61, 61,
+ 61, 61, 61, 61, 61, 61, 61, 61,
+ 61, 61, 61, 61, 61, 61, 61, 61,
+ 61, 61, 61, 61, 61, 61, 61, 61,
+ 61, 61, 61, 61, 61, 61, 61, 61,
+ 61, 61, 61, 61, 61, 61, 61, 61,
+ 61, 61, 61, 61, 61, 61, 61, 61,
+ 61,
+ },
+ { /* Fourth byte table 3. */
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 2, 2, 4, 4, 6, 6, 8,
+ 8, 10, 10, 10, 12, 12, 14, 14,
+ 16, 16, 18, 18, 20, 20, 22, 22,
+ 24, 24, 26, 26, 28, 28, 30, 30,
+ 32, 32, 34, 34, 36, 36, 38, 38,
+ 40, 40, 42, 42, 44, 44, 46, 46,
+ 48, 48, 50, 50, 52, 52, 54, 54,
+ 56, 56, 56, 58, 58, 60, 60, 62,
+ 63, 63, 63, 63, 63, 63, 63, 63,
+ 63, 63, 63, 63, 63, 63, 63, 63,
+ 63, 63, 63, 63, 63, 63, 63, 63,
+ 63, 63, 63, 63, 63, 63, 63, 63,
+ 63, 63, 63, 63, 63, 63, 63, 63,
+ 63, 63, 63, 63, 63, 63, 63, 63,
+ 63, 63, 63, 63, 63, 63, 63, 63,
+ 63, 63, 63, 63, 63, 63, 63, 63,
+ 63,
+ },
+ { /* Fourth byte table 4. */
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 2, 2, 2, 4, 4, 6, 6,
+ 6, 8, 8, 8, 8, 10, 10, 10,
+ 10, 10, 10, 12, 12, 12, 14, 14,
+ 14, 14, 16, 18, 18, 18, 18, 20,
+ 20, 20, 22, 22, 24, 24, 26, 26,
+ 26, 28, 28, 28, 28, 28, 30, 30,
+ 30, 32, 32, 32, 32, 34, 34, 36,
+ 36, 36, 38, 38, 38, 38, 40, 40,
+ 42, 42, 42, 42, 42, 42, 42, 42,
+ 42, 42, 42, 42, 42, 42, 42, 42,
+ 42, 42, 42, 42, 42, 42, 42, 42,
+ 42, 42, 42, 42, 42, 42, 42, 42,
+ 42, 42, 42, 42, 42, 42, 42, 42,
+ 42, 42, 42, 42, 42, 42, 42, 42,
+ 42, 42, 42, 42, 42, 42, 42, 42,
+ 42, 42, 42, 42, 42, 42, 42, 42,
+ 42,
+ },
+ { /* Fourth byte table 5. */
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 2, 4,
+ 4, 6, 8, 8, 10, 12, 12, 14,
+ 14, 16, 16, 18, 18, 20, 20, 22,
+ 22, 24, 24, 26, 26, 28, 30, 30,
+ 32, 32, 34, 34, 36, 36, 38, 38,
+ 40, 40, 42, 42, 44, 44, 46, 46,
+ 48, 48, 48, 50, 52, 52, 54, 54,
+ 54, 54, 56, 56, 58, 58, 60, 60,
+ 62, 62, 62, 62, 62, 62, 62, 62,
+ 62, 62, 62, 62, 62, 62, 62, 62,
+ 62, 62, 62, 62, 62, 62, 62, 62,
+ 62, 62, 62, 62, 62, 62, 62, 62,
+ 62, 62, 62, 62, 62, 62, 62, 62,
+ 62, 62, 62, 62, 62, 62, 62, 62,
+ 62, 62, 62, 62, 62, 62, 62, 62,
+ 62, 62, 62, 62, 62, 62, 62, 62,
+ 62,
+ },
+ { /* Fourth byte table 6. */
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 2, 2, 4, 4, 6, 6,
+ 8, 8, 10, 10, 12, 12, 14, 14,
+ 16, 16, 18, 18, 20, 20, 22, 22,
+ 24, 24, 26, 26, 28, 28, 30, 30,
+ 32, 32, 32, 32, 34, 34, 36, 36,
+ 38, 38, 40, 40, 42, 42, 44, 44,
+ 46, 46, 48, 48, 50, 50, 50, 50,
+ 50, 50, 50, 50, 50, 52, 52, 52,
+ 52, 52, 52, 52, 52, 52, 52, 52,
+ 52, 52, 52, 52, 52, 52, 52, 52,
+ 52, 52, 52, 52, 52, 52, 52, 52,
+ 52, 52, 52, 52, 52, 52, 52, 52,
+ 52, 52, 52, 52, 52, 52, 52, 52,
+ 52, 52, 52, 52, 52, 52, 52, 52,
+ 52, 52, 52, 52, 52, 52, 52, 52,
+ 52, 52, 52, 52, 52, 52, 52, 52,
+ 52,
+ },
+ { /* Fourth byte table 7. */
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 2, 2, 2, 2, 2,
+ 4, 4, 6, 6, 8, 8, 10, 10,
+ 12, 12, 12, 12, 14, 16, 16, 18,
+ 20, 20, 22, 22, 24, 24, 24, 24,
+ 24, 26, 26, 26, 28, 28, 28, 28,
+ 28, 30, 32, 32, 35, 35, 35, 35,
+ 37, 37, 37, 39, 39, 39, 41, 41,
+ 41, 41, 41, 41, 41, 41, 44, 44,
+ 44, 44, 44, 44, 44, 44, 44, 44,
+ 44, 44, 44, 44, 44, 44, 44, 44,
+ 44, 44, 44, 44, 44, 44, 44, 44,
+ 44, 44, 44, 44, 44, 44, 44, 44,
+ 44, 44, 44, 44, 44, 44, 44, 44,
+ 44, 44, 44, 44, 44, 44, 44, 44,
+ 44, 44, 44, 44, 44, 44, 44, 44,
+ 44, 44, 44, 44, 44, 44, 44, 44,
+ 44,
+ },
+ { /* Fourth byte table 8. */
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 2, 2, 2, 4, 4, 4, 4,
+ 4, 6, 8, 10, 12, 14, 14, 14,
+ 14, 14, 14, 16, 16, 16, 16, 16,
+ 16, 16, 16, 16, 16, 16, 16, 16,
+ 16, 16, 16, 16, 16, 16, 16, 16,
+ 16, 16, 16, 16, 16, 16, 16, 16,
+ 16, 16, 16, 16, 16, 16, 16, 16,
+ 16, 16, 16, 16, 16, 16, 16, 16,
+ 16, 16, 16, 16, 16, 16, 16, 16,
+ 16, 16, 16, 16, 16, 16, 16, 16,
+ 16, 16, 16, 16, 16, 16, 16, 16,
+ 16, 16, 16, 16, 16, 16, 16, 16,
+ 16, 16, 16, 16, 16, 16, 16, 16,
+ 16, 16, 16, 16, 16, 16, 16, 16,
+ 16, 16, 16, 16, 16, 16, 16, 16,
+ 16, 16, 16, 16, 16, 16, 16, 16,
+ 16,
+ },
+ { /* Fourth byte table 9. */
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 2, 2,
+ 2, 2, 2, 2, 2, 2, 2, 2,
+ 2, 2, 2, 2, 2, 2, 2, 2,
+ 2, 2, 2, 2, 2, 2, 2, 2,
+ 2, 2, 2, 2, 2, 2, 2, 2,
+ 2, 2, 2, 2, 2, 2, 2, 2,
+ 2, 2, 2, 2, 2, 2, 2, 2,
+ 2, 2, 2, 2, 4, 6, 8, 8,
+ 8, 8, 8, 8, 8, 8, 8, 8,
+ 8, 8, 8, 8, 8, 8, 8, 8,
+ 8, 8, 8, 8, 8, 8, 8, 8,
+ 8, 8, 8, 8, 8, 8, 8, 8,
+ 8, 8, 8, 8, 8, 8, 8, 8,
+ 8, 8, 8, 8, 8, 8, 8, 8,
+ 8, 8, 8, 8, 8, 8, 8, 8,
+ 8, 8, 8, 8, 8, 8, 8, 8,
+ 8,
+ },
+ { /* Fourth byte table 10. */
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 2, 4, 6,
+ 8, 8, 10, 12, 14, 16, 18, 20,
+ 22, 24, 26, 28, 30, 32, 34, 36,
+ 38, 38, 38, 38, 38, 38, 38, 38,
+ 38, 38, 38, 38, 38, 38, 38, 38,
+ 38, 38, 38, 38, 38, 38, 38, 38,
+ 38, 38, 38, 38, 38, 38, 38, 38,
+ 38, 38, 38, 38, 38, 38, 38, 38,
+ 38, 38, 38, 38, 38, 38, 38, 38,
+ 38, 38, 38, 38, 38, 38, 38, 38,
+ 38, 38, 38, 38, 38, 38, 38, 38,
+ 38,
+ },
+ { /* Fourth byte table 11. */
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 2, 4, 6, 8, 10, 12, 14,
+ 16, 18, 20, 22, 24, 26, 28, 30,
+ 30, 32, 34, 34, 34, 34, 36, 38,
+ 38, 38, 40, 40, 42, 42, 44, 44,
+ 46, 46, 48, 48, 50, 50, 52, 52,
+ 54, 54, 56, 56, 58, 58, 60, 60,
+ 62, 64, 66, 68, 68, 68, 70, 70,
+ 70, 72, 72, 72, 74, 74, 74, 74,
+ 74, 74, 74, 74, 74, 74, 74, 74,
+ 74, 74, 74, 74, 74, 74, 74, 74,
+ 74, 74, 74, 74, 74, 74, 74, 74,
+ 74, 74, 74, 74, 74, 74, 74, 74,
+ 74, 74, 74, 74, 74, 74, 74, 74,
+ 74, 74, 74, 74, 74, 74, 74, 74,
+ 74, 74, 74, 74, 74, 74, 74, 74,
+ 74, 74, 74, 74, 74, 74, 74, 74,
+ 74,
+ },
+ { /* Fourth byte table 12. */
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 2, 4, 6, 8, 10, 12, 14,
+ 16, 18, 20, 22, 24, 26, 28, 30,
+ 32, 32, 32, 32, 32, 32, 32, 32,
+ 32, 32, 32, 32, 32, 32, 32, 32,
+ 32, 32, 32, 32, 32, 32, 32, 32,
+ 32, 32, 32, 32, 32, 32, 32, 32,
+ 32, 32, 32, 32, 32, 32, 32, 32,
+ 32, 32, 32, 32, 32, 32, 32, 32,
+ 32, 32, 32, 32, 32, 32, 32, 32,
+ 32, 32, 32, 32, 32, 32, 32, 32,
+ 32,
+ },
+ { /* Fourth byte table 13. */
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 2, 4, 6, 8, 10, 12, 14,
+ 16, 18, 20, 22, 24, 26, 28, 30,
+ 32, 34, 36, 38, 40, 42, 44, 46,
+ 48, 50, 52, 54, 56, 58, 60, 62,
+ 64, 64, 66, 66, 68, 68, 70, 70,
+ 72, 72, 74, 74, 76, 76, 78, 78,
+ 80, 80, 82, 82, 84, 84, 86, 86,
+ 88, 88, 90, 90, 92, 92, 94, 94,
+ 96, 96, 96, 96, 96, 96, 96, 96,
+ 96, 96, 96, 96, 96, 96, 96, 96,
+ 96, 96, 96, 96, 96, 96, 96, 96,
+ 96, 96, 96, 96, 96, 96, 96, 96,
+ 96, 96, 96, 96, 96, 96, 96, 96,
+ 96, 96, 96, 96, 96, 96, 96, 96,
+ 96, 96, 96, 96, 96, 96, 96, 96,
+ 96, 96, 96, 96, 96, 96, 96, 96,
+ 96,
+ },
+ { /* Fourth byte table 14. */
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 2, 2, 2, 2, 2, 2,
+ 2, 2, 2, 2, 4, 4, 6, 6,
+ 8, 8, 10, 10, 12, 12, 14, 14,
+ 16, 16, 18, 18, 20, 20, 22, 22,
+ 24, 24, 26, 26, 28, 28, 30, 30,
+ 32, 32, 34, 34, 36, 36, 38, 38,
+ 40, 40, 42, 42, 44, 44, 46, 46,
+ 48, 48, 50, 50, 52, 52, 54, 54,
+ 56, 56, 56, 56, 56, 56, 56, 56,
+ 56, 56, 56, 56, 56, 56, 56, 56,
+ 56, 56, 56, 56, 56, 56, 56, 56,
+ 56, 56, 56, 56, 56, 56, 56, 56,
+ 56, 56, 56, 56, 56, 56, 56, 56,
+ 56, 56, 56, 56, 56, 56, 56, 56,
+ 56, 56, 56, 56, 56, 56, 56, 56,
+ 56, 56, 56, 56, 56, 56, 56, 56,
+ 56,
+ },
+ { /* Fourth byte table 15. */
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 2, 2, 4, 4, 6,
+ 6, 8, 8, 10, 10, 12, 12, 14,
+ 16, 16, 18, 18, 20, 20, 22, 22,
+ 24, 24, 26, 26, 28, 28, 30, 30,
+ 32, 32, 34, 34, 36, 36, 38, 38,
+ 40, 40, 42, 42, 44, 44, 46, 46,
+ 48, 48, 50, 50, 52, 52, 54, 54,
+ 56, 56, 58, 58, 60, 60, 62, 62,
+ 64, 64, 64, 64, 64, 64, 64, 64,
+ 64, 64, 64, 64, 64, 64, 64, 64,
+ 64, 64, 64, 64, 64, 64, 64, 64,
+ 64, 64, 64, 64, 64, 64, 64, 64,
+ 64, 64, 64, 64, 64, 64, 64, 64,
+ 64, 64, 64, 64, 64, 64, 64, 64,
+ 64, 64, 64, 64, 64, 64, 64, 64,
+ 64, 64, 64, 64, 64, 64, 64, 64,
+ 64,
+ },
+ { /* Fourth byte table 16. */
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 2, 2, 4, 4, 6, 6,
+ 8, 8, 10, 10, 12, 12, 14, 14,
+ 16, 16, 18, 18, 20, 20, 20, 20,
+ 20, 20, 20, 20, 20, 20, 20, 20,
+ 20, 20, 20, 20, 20, 20, 20, 20,
+ 20, 20, 20, 20, 20, 20, 20, 20,
+ 20, 20, 20, 20, 20, 20, 20, 20,
+ 20, 20, 20, 20, 20, 20, 20, 20,
+ 20, 20, 20, 20, 20, 20, 20, 20,
+ 20, 20, 20, 20, 20, 20, 20, 20,
+ 20, 20, 20, 20, 20, 20, 20, 20,
+ 20, 20, 20, 20, 20, 20, 20, 20,
+ 20, 20, 20, 20, 20, 20, 20, 20,
+ 20, 20, 20, 20, 20, 20, 20, 20,
+ 20, 20, 20, 20, 20, 20, 20, 20,
+ 20, 20, 20, 20, 20, 20, 20, 20,
+ 20,
+ },
+ { /* Fourth byte table 17. */
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 2, 4, 6, 8, 10, 12,
+ 14, 16, 18, 20, 22, 24, 26, 28,
+ 30, 32, 34, 36, 38, 40, 42, 44,
+ 46, 48, 50, 52, 54, 56, 58, 60,
+ 62, 62, 62, 62, 62, 62, 62, 62,
+ 62, 62, 62, 62, 62, 62, 62, 62,
+ 62, 62, 62, 62, 62, 62, 62, 62,
+ 62, 62, 62, 62, 62, 62, 62, 62,
+ 62, 62, 62, 62, 62, 62, 62, 62,
+ 62, 62, 62, 62, 62, 62, 62, 62,
+ 62, 62, 62, 62, 62, 62, 62, 62,
+ 62, 62, 62, 62, 62, 62, 62, 62,
+ 62,
+ },
+ { /* Fourth byte table 18. */
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 2, 4, 6, 8, 10, 12, 14,
+ 14, 14, 14, 14, 14, 14, 14, 14,
+ 14, 14, 14, 14, 14, 14, 14, 14,
+ 14, 14, 14, 14, 14, 14, 14, 14,
+ 14, 14, 14, 14, 14, 14, 14, 14,
+ 14, 14, 14, 14, 14, 14, 14, 14,
+ 14, 14, 14, 14, 14, 14, 14, 14,
+ 14, 14, 14, 14, 14, 14, 14, 14,
+ 14, 14, 14, 14, 14, 14, 14, 14,
+ 14, 14, 14, 14, 14, 14, 14, 14,
+ 14, 14, 14, 14, 14, 14, 14, 14,
+ 14, 14, 14, 14, 14, 14, 14, 14,
+ 14, 14, 14, 14, 14, 14, 14, 14,
+ 14, 14, 14, 14, 14, 14, 14, 14,
+ 14, 14, 14, 14, 14, 14, 14, 14,
+ 14, 14, 14, 14, 14, 14, 14, 14,
+ 14,
+ },
+ { /* Fourth byte table 19. */
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 3, 3,
+ 3, 3, 3, 3, 3, 3, 3, 3,
+ 3, 3, 3, 3, 3, 3, 3, 3,
+ 3, 3, 3, 3, 3, 3, 3, 3,
+ 3, 3, 3, 3, 3, 3, 3, 3,
+ 3, 3, 3, 3, 3, 3, 3, 3,
+ 3, 3, 3, 3, 3, 3, 3, 3,
+ 3, 3, 3, 3, 3, 3, 3, 3,
+ 3, 3, 3, 3, 3, 3, 3, 3,
+ 3,
+ },
+ { /* Fourth byte table 20. */
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 3, 3, 6, 6, 9, 9,
+ 12, 12, 15, 15, 18, 18, 21, 21,
+ 24, 24, 27, 27, 30, 30, 33, 33,
+ 36, 36, 39, 39, 42, 42, 45, 45,
+ 48, 48, 51, 51, 54, 54, 57, 57,
+ 60, 60, 63, 63, 66, 66, 69, 69,
+ 72, 72, 75, 75, 78, 78, 81, 81,
+ 84, 84, 87, 87, 90, 90, 93, 93,
+ 96, 96, 96, 96, 96, 96, 96, 96,
+ 96, 96, 96, 96, 96, 96, 96, 96,
+ 96, 96, 96, 96, 96, 96, 96, 96,
+ 96, 96, 96, 96, 96, 96, 96, 96,
+ 96, 96, 96, 96, 96, 96, 96, 96,
+ 96, 96, 96, 96, 96, 96, 96, 96,
+ 96, 96, 96, 96, 96, 96, 96, 96,
+ 96, 96, 96, 96, 96, 96, 96, 96,
+ 96,
+ },
+ { /* Fourth byte table 21. */
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 3, 3, 6, 6, 9, 9,
+ 12, 12, 15, 15, 18, 18, 21, 21,
+ 24, 24, 27, 27, 30, 30, 33, 33,
+ 36, 36, 39, 39, 42, 42, 45, 45,
+ 48, 48, 51, 51, 54, 54, 57, 57,
+ 60, 60, 63, 63, 66, 66, 69, 69,
+ 72, 72, 75, 75, 78, 78, 81, 81,
+ 84, 84, 87, 87, 90, 90, 93, 93,
+ 96, 96, 96, 96, 96, 96, 96, 96,
+ 96, 96, 96, 96, 96, 96, 96, 96,
+ 96, 96, 96, 96, 96, 96, 96, 96,
+ 96, 96, 96, 96, 96, 96, 96, 96,
+ 96, 96, 96, 96, 96, 96, 96, 96,
+ 96, 96, 96, 96, 96, 96, 96, 96,
+ 96, 96, 96, 96, 96, 96, 96, 96,
+ 96, 96, 96, 96, 96, 96, 96, 96,
+ 96,
+ },
+ { /* Fourth byte table 22. */
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 3, 3, 6, 6, 9, 9,
+ 12, 12, 15, 15, 18, 18, 21, 21,
+ 24, 24, 27, 27, 30, 30, 33, 33,
+ 33, 33, 33, 33, 36, 36, 36, 36,
+ 36, 36, 39, 39, 42, 42, 45, 45,
+ 48, 48, 51, 51, 54, 54, 57, 57,
+ 60, 60, 63, 63, 66, 66, 69, 69,
+ 72, 72, 75, 75, 78, 78, 81, 81,
+ 84, 84, 84, 84, 84, 84, 84, 84,
+ 84, 84, 84, 84, 84, 84, 84, 84,
+ 84, 84, 84, 84, 84, 84, 84, 84,
+ 84, 84, 84, 84, 84, 84, 84, 84,
+ 84, 84, 84, 84, 84, 84, 84, 84,
+ 84, 84, 84, 84, 84, 84, 84, 84,
+ 84, 84, 84, 84, 84, 84, 84, 84,
+ 84, 84, 84, 84, 84, 84, 84, 84,
+ 84,
+ },
+ { /* Fourth byte table 23. */
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 3, 3, 6, 6, 9, 9,
+ 12, 12, 15, 15, 18, 18, 21, 21,
+ 24, 24, 27, 27, 30, 30, 33, 33,
+ 36, 36, 39, 39, 42, 42, 45, 45,
+ 48, 48, 51, 51, 54, 54, 57, 57,
+ 60, 60, 63, 63, 66, 66, 69, 69,
+ 72, 72, 75, 75, 78, 78, 81, 81,
+ 84, 84, 87, 87, 87, 87, 87, 87,
+ 87, 87, 87, 87, 87, 87, 87, 87,
+ 87, 87, 87, 87, 87, 87, 87, 87,
+ 87, 87, 87, 87, 87, 87, 87, 87,
+ 87, 87, 87, 87, 87, 87, 87, 87,
+ 87, 87, 87, 87, 87, 87, 87, 87,
+ 87, 87, 87, 87, 87, 87, 87, 87,
+ 87, 87, 87, 87, 87, 87, 87, 87,
+ 87, 87, 87, 87, 87, 87, 87, 87,
+ 87,
+ },
+ { /* Fourth byte table 24. */
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 3, 6, 9, 12, 15, 18, 21,
+ 24, 24, 24, 24, 24, 24, 24, 24,
+ 24, 27, 30, 33, 36, 39, 42, 42,
+ 42, 42, 42, 42, 42, 42, 42, 42,
+ 42, 45, 48, 51, 54, 57, 60, 63,
+ 66, 66, 66, 66, 66, 66, 66, 66,
+ 66, 69, 72, 75, 78, 81, 84, 87,
+ 90, 90, 90, 90, 90, 90, 90, 90,
+ 90, 90, 90, 90, 90, 90, 90, 90,
+ 90, 90, 90, 90, 90, 90, 90, 90,
+ 90, 90, 90, 90, 90, 90, 90, 90,
+ 90, 90, 90, 90, 90, 90, 90, 90,
+ 90, 90, 90, 90, 90, 90, 90, 90,
+ 90, 90, 90, 90, 90, 90, 90, 90,
+ 90, 90, 90, 90, 90, 90, 90, 90,
+ 90, 90, 90, 90, 90, 90, 90, 90,
+ 90,
+ },
+ { /* Fourth byte table 25. */
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 3, 6, 9, 12, 15, 18, 18,
+ 18, 18, 18, 18, 18, 18, 18, 18,
+ 18, 18, 21, 21, 24, 24, 27, 27,
+ 30, 30, 30, 30, 30, 30, 30, 30,
+ 30, 33, 36, 39, 42, 45, 48, 51,
+ 54, 54, 54, 54, 54, 54, 54, 54,
+ 54, 57, 60, 63, 66, 69, 72, 75,
+ 78, 81, 84, 87, 90, 93, 96, 96,
+ 96, 96, 96, 96, 96, 96, 96, 96,
+ 96, 96, 96, 96, 96, 96, 96, 96,
+ 96, 96, 96, 96, 96, 96, 96, 96,
+ 96, 96, 96, 96, 96, 96, 96, 96,
+ 96, 96, 96, 96, 96, 96, 96, 96,
+ 96, 96, 96, 96, 96, 96, 96, 96,
+ 96, 96, 96, 96, 96, 96, 96, 96,
+ 96, 96, 96, 96, 96, 96, 96, 96,
+ 96,
+ },
+ { /* Fourth byte table 26. */
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 3, 6, 9, 12, 15, 18, 21,
+ 24, 24, 24, 24, 24, 24, 24, 24,
+ 24, 27, 30, 33, 36, 39, 42, 45,
+ 48, 48, 48, 48, 48, 48, 48, 48,
+ 48, 51, 54, 57, 60, 63, 66, 69,
+ 72, 72, 72, 72, 72, 72, 72, 72,
+ 72, 75, 78, 78, 81, 81, 81, 81,
+ 81, 81, 81, 81, 81, 81, 81, 83,
+ 83, 83, 83, 83, 83, 83, 83, 83,
+ 83, 83, 83, 83, 83, 83, 83, 83,
+ 83, 83, 83, 83, 83, 83, 83, 83,
+ 83, 83, 83, 83, 83, 83, 83, 83,
+ 83, 83, 83, 83, 83, 83, 83, 83,
+ 83, 83, 83, 83, 83, 83, 83, 83,
+ 83, 83, 83, 83, 83, 83, 83, 83,
+ 83, 83, 83, 83, 83, 83, 83, 83,
+ 83,
+ },
+ { /* Fourth byte table 27. */
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 3, 3, 3, 3,
+ 3, 3, 3, 3, 3, 3, 3, 3,
+ 3, 6, 9, 9, 9, 9, 9, 9,
+ 9, 9, 9, 9, 9, 9, 9, 9,
+ 9, 12, 15, 15, 15, 15, 18, 18,
+ 18, 18, 18, 18, 18, 18, 18, 18,
+ 18, 18, 18, 18, 21, 21, 21, 21,
+ 21, 21, 21, 21, 21, 21, 21, 21,
+ 21, 21, 21, 21, 21, 21, 21, 21,
+ 21, 21, 21, 21, 21, 21, 21, 21,
+ 21, 21, 21, 21, 21, 21, 21, 21,
+ 21, 21, 21, 21, 21, 21, 21, 21,
+ 21, 21, 21, 21, 21, 21, 21, 21,
+ 21, 21, 21, 21, 21, 21, 21, 21,
+ 21, 21, 21, 21, 21, 21, 21, 21,
+ 21, 21, 21, 21, 21, 21, 21, 21,
+ 21,
+ },
+ { /* Fourth byte table 28. */
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 3,
+ 3, 3, 3, 3, 3, 3, 3, 3,
+ 3, 3, 3, 3, 3, 3, 3, 3,
+ 3, 3, 3, 3, 3, 3, 3, 3,
+ 3, 3, 3, 3, 3, 3, 3, 3,
+ 3, 6, 9, 12, 15, 18, 21, 24,
+ 27, 30, 33, 36, 39, 42, 45, 48,
+ 51, 51, 51, 51, 51, 51, 51, 51,
+ 51, 51, 51, 51, 51, 51, 51, 51,
+ 51, 51, 51, 51, 51, 51, 51, 51,
+ 51, 51, 51, 51, 51, 51, 51, 51,
+ 51, 51, 51, 51, 51, 51, 51, 51,
+ 51, 51, 51, 51, 51, 51, 51, 51,
+ 51, 51, 51, 51, 51, 51, 51, 51,
+ 51, 51, 51, 51, 51, 51, 51, 51,
+ 51,
+ },
+ { /* Fourth byte table 29. */
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 3, 3, 3,
+ 3, 3, 3, 3, 3, 3, 3, 3,
+ 3, 3, 3, 3, 3, 3, 3, 3,
+ 3, 3, 3, 3, 3, 3, 3, 3,
+ 3, 3, 3, 3, 3, 3, 3, 3,
+ 3, 3, 3, 3, 3, 3, 3, 3,
+ 3, 3, 3, 3, 3, 3, 3, 3,
+ 3, 3, 3, 3, 3, 3, 3, 3,
+ 3, 3, 3, 3, 3, 3, 3, 3,
+ 3, 3, 3, 3, 3, 3, 3, 3,
+ 3, 3, 3, 3, 3, 3, 3, 3,
+ 3, 3, 3, 3, 3, 3, 3, 3,
+ 3, 3, 3, 3, 3, 3, 3, 3,
+ 3, 3, 3, 3, 3, 3, 3, 3,
+ 3, 3, 3, 3, 3, 3, 3, 3,
+ 3, 3, 3, 3, 3, 3, 3, 3,
+ 3,
+ },
+ { /* Fourth byte table 30. */
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 3, 6, 9, 12, 15, 18, 21,
+ 24, 27, 30, 33, 36, 39, 42, 45,
+ 48, 51, 54, 57, 60, 63, 66, 69,
+ 72, 75, 78, 78, 78, 78, 78, 78,
+ 78, 78, 78, 78, 78, 78, 78, 78,
+ 78, 78, 78, 78, 78, 78, 78, 78,
+ 78, 78, 78, 78, 78, 78, 78, 78,
+ 78, 78, 78, 78, 78, 78, 78, 78,
+ 78, 78, 78, 78, 78, 78, 78, 78,
+ 78, 78, 78, 78, 78, 78, 78, 78,
+ 78, 78, 78, 78, 78, 78, 78, 78,
+ 78, 78, 78, 78, 78, 78, 78, 78,
+ 78, 78, 78, 78, 78, 78, 78, 78,
+ 78, 78, 78, 78, 78, 78, 78, 78,
+ 78,
+ },
+ { /* Fourth byte table 31. */
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 3, 6, 9, 12, 15, 18, 21,
+ 24, 27, 30, 33, 36, 39, 42, 45,
+ 48, 48, 48, 48, 48, 48, 48, 48,
+ 48, 48, 48, 48, 48, 48, 48, 48,
+ 48, 48, 48, 48, 48, 48, 48, 48,
+ 48, 48, 48, 48, 48, 48, 48, 48,
+ 48, 48, 48, 48, 48, 48, 48, 48,
+ 48, 48, 48, 48, 48, 48, 48, 48,
+ 48, 48, 48, 48, 48, 48, 48, 48,
+ 48, 48, 48, 48, 48, 48, 48, 48,
+ 48,
+ },
+ { /* Fourth byte table 32. */
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 3, 6, 9, 12, 15, 18, 21,
+ 24, 27, 30, 33, 36, 39, 42, 45,
+ 48, 51, 54, 57, 60, 63, 66, 69,
+ 72, 75, 78, 81, 84, 87, 90, 93,
+ 93, 93, 96, 96, 96, 96, 98, 100,
+ 100, 103, 103, 106, 106, 109, 109, 109,
+ 109, 109, 109, 109, 109, 109, 109, 112,
+ 112, 112, 112, 112, 112, 112, 112, 112,
+ 112, 112, 112, 112, 112, 112, 112, 112,
+ 112, 112, 112, 112, 112, 112, 112, 112,
+ 112, 112, 112, 112, 112, 112, 112, 112,
+ 112, 112, 112, 112, 112, 112, 112, 112,
+ 112, 112, 112, 112, 112, 112, 112, 112,
+ 112, 112, 112, 112, 112, 112, 112, 112,
+ 112, 112, 112, 112, 112, 112, 112, 112,
+ 112, 112, 112, 112, 112, 112, 112, 112,
+ 112,
+ },
+ { /* Fourth byte table 33. */
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 3, 3, 6, 6, 9, 9,
+ 12, 12, 15, 15, 18, 18, 21, 21,
+ 24, 24, 27, 27, 30, 30, 33, 33,
+ 36, 36, 39, 39, 42, 42, 45, 45,
+ 48, 48, 51, 51, 54, 54, 57, 57,
+ 60, 60, 63, 63, 66, 66, 69, 69,
+ 72, 72, 75, 75, 78, 78, 81, 81,
+ 84, 84, 87, 87, 90, 90, 93, 93,
+ 96, 96, 96, 96, 96, 96, 96, 96,
+ 96, 96, 96, 96, 96, 96, 96, 96,
+ 96, 96, 96, 96, 96, 96, 96, 96,
+ 96, 96, 96, 96, 96, 96, 96, 96,
+ 96, 96, 96, 96, 96, 96, 96, 96,
+ 96, 96, 96, 96, 96, 96, 96, 96,
+ 96, 96, 96, 96, 96, 96, 96, 96,
+ 96, 96, 96, 96, 96, 96, 96, 96,
+ 96,
+ },
+ { /* Fourth byte table 34. */
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 3, 3, 6, 6, 9, 9,
+ 12, 12, 15, 15, 18, 18, 21, 21,
+ 24, 24, 27, 27, 30, 30, 33, 33,
+ 36, 36, 39, 39, 42, 42, 45, 45,
+ 48, 48, 51, 51, 54, 54, 54, 54,
+ 54, 54, 54, 54, 54, 54, 54, 54,
+ 54, 54, 54, 54, 54, 54, 54, 54,
+ 54, 54, 54, 54, 54, 54, 54, 54,
+ 54, 54, 54, 54, 54, 54, 54, 54,
+ 54, 54, 54, 54, 54, 54, 54, 54,
+ 54, 54, 54, 54, 54, 54, 54, 54,
+ 54, 54, 54, 54, 54, 54, 54, 54,
+ 54, 54, 54, 54, 54, 54, 54, 54,
+ 54, 54, 54, 54, 54, 54, 54, 54,
+ 54, 54, 54, 54, 54, 54, 54, 54,
+ 54, 54, 54, 54, 54, 54, 54, 54,
+ 54,
+ },
+ { /* Fourth byte table 35. */
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 3, 6, 9, 12, 15, 18, 21,
+ 24, 27, 30, 33, 36, 39, 42, 45,
+ 48, 51, 54, 57, 60, 63, 66, 69,
+ 72, 75, 78, 81, 84, 87, 90, 93,
+ 96, 99, 102, 105, 108, 111, 114, 114,
+ 114, 114, 114, 114, 114, 114, 114, 114,
+ 114, 114, 114, 114, 114, 114, 114, 114,
+ 114, 114, 114, 114, 114, 114, 114, 114,
+ 114, 114, 114, 114, 114, 114, 114, 114,
+ 114, 114, 114, 114, 114, 114, 114, 114,
+ 114, 114, 114, 114, 114, 114, 114, 114,
+ 114, 114, 114, 114, 114, 114, 114, 114,
+ 114, 114, 114, 114, 114, 114, 114, 114,
+ 114, 114, 114, 114, 114, 114, 114, 114,
+ 114, 114, 114, 114, 114, 114, 114, 114,
+ 114, 114, 114, 114, 114, 114, 114, 114,
+ 114,
+ },
+ { /* Fourth byte table 36. */
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 3, 6, 9, 12, 15, 18,
+ 21, 24, 27, 30, 33, 36, 39, 42,
+ 45, 48, 51, 54, 57, 60, 63, 66,
+ 69, 72, 75, 78, 78, 78, 78, 78,
+ 78, 78, 78, 78, 78, 78, 78, 78,
+ 78, 78, 78, 78, 78, 78, 78, 78,
+ 78, 78, 78, 78, 78, 78, 78, 78,
+ 78, 78, 78, 78, 78, 78, 78, 78,
+ 78, 78, 78, 78, 78, 78, 78, 78,
+ 78, 78, 78, 78, 78, 78, 78, 78,
+ 78, 78, 78, 78, 78, 78, 78, 78,
+ 78, 78, 78, 78, 78, 78, 78, 78,
+ 78, 78, 78, 78, 78, 78, 78, 78,
+ 78, 78, 78, 78, 78, 78, 78, 78,
+ 78, 78, 78, 78, 78, 78, 78, 78,
+ 78, 78, 78, 78, 78, 78, 78, 78,
+ 78,
+ },
+ { /* Fourth byte table 37. */
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 4, 8, 12, 16, 20, 24, 28,
+ 32, 36, 40, 44, 48, 52, 56, 60,
+ 64, 68, 72, 76, 80, 84, 88, 92,
+ 96, 96, 96, 96, 96, 96, 96, 96,
+ 96, 96, 96, 96, 96, 96, 96, 96,
+ 96, 96, 96, 96, 96, 96, 96, 96,
+ 96, 96, 96, 96, 96, 96, 96, 96,
+ 96, 96, 96, 96, 96, 96, 96, 96,
+ 96, 96, 96, 96, 96, 96, 96, 96,
+ 96, 96, 96, 96, 96, 96, 96, 96,
+ 96, 96, 96, 96, 96, 96, 96, 96,
+ 96,
+ },
+ { /* Fourth byte table 38. */
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 4, 8, 12, 16, 20, 24, 28,
+ 32, 36, 40, 44, 48, 52, 56, 60,
+ 64, 64, 64, 64, 64, 64, 64, 64,
+ 64, 64, 64, 64, 64, 64, 64, 64,
+ 64, 64, 64, 64, 64, 64, 64, 64,
+ 64, 64, 64, 64, 64, 64, 64, 64,
+ 64, 64, 64, 64, 64, 64, 64, 64,
+ 64, 64, 64, 64, 64, 64, 64, 64,
+ 64, 64, 64, 64, 64, 64, 64, 64,
+ 64, 64, 64, 64, 64, 64, 64, 64,
+ 64, 64, 64, 64, 64, 64, 64, 64,
+ 64, 64, 64, 64, 64, 64, 64, 64,
+ 64, 64, 64, 64, 64, 64, 64, 64,
+ 64, 64, 64, 64, 64, 64, 64, 64,
+ 64, 64, 64, 64, 64, 64, 64, 64,
+ 64, 64, 64, 64, 64, 64, 64, 64,
+ 64,
+ },
+ },
+};
+
+static const uchar_t u8_toupper_final_tbl[2][2318] = {
+ {
+ 0xCE, 0x9C, 0xC3, 0x80, 0xC3, 0x81, 0xC3, 0x82,
+ 0xC3, 0x83, 0xC3, 0x84, 0xC3, 0x85, 0xC3, 0x86,
+ 0xC3, 0x87, 0xC3, 0x88, 0xC3, 0x89, 0xC3, 0x8A,
+ 0xC3, 0x8B, 0xC3, 0x8C, 0xC3, 0x8D, 0xC3, 0x8E,
+ 0xC3, 0x8F, 0xC3, 0x90, 0xC3, 0x91, 0xC3, 0x92,
+ 0xC3, 0x93, 0xC3, 0x94, 0xC3, 0x95, 0xC3, 0x96,
+ 0xC3, 0x98, 0xC3, 0x99, 0xC3, 0x9A, 0xC3, 0x9B,
+ 0xC3, 0x9C, 0xC3, 0x9D, 0xC3, 0x9E, 0xC5, 0xB8,
+ 0xC4, 0x80, 0xC4, 0x82, 0xC4, 0x84, 0xC4, 0x86,
+ 0xC4, 0x88, 0xC4, 0x8A, 0xC4, 0x8C, 0xC4, 0x8E,
+ 0xC4, 0x90, 0xC4, 0x92, 0xC4, 0x94, 0xC4, 0x96,
+ 0xC4, 0x98, 0xC4, 0x9A, 0xC4, 0x9C, 0xC4, 0x9E,
+ 0xC4, 0xA0, 0xC4, 0xA2, 0xC4, 0xA4, 0xC4, 0xA6,
+ 0xC4, 0xA8, 0xC4, 0xAA, 0xC4, 0xAC, 0xC4, 0xAE,
+ 0x49, 0xC4, 0xB2, 0xC4, 0xB4, 0xC4, 0xB6, 0xC4,
+ 0xB9, 0xC4, 0xBB, 0xC4, 0xBD, 0xC4, 0xBF, 0xC5,
+ 0x81, 0xC5, 0x83, 0xC5, 0x85, 0xC5, 0x87, 0xC5,
+ 0x8A, 0xC5, 0x8C, 0xC5, 0x8E, 0xC5, 0x90, 0xC5,
+ 0x92, 0xC5, 0x94, 0xC5, 0x96, 0xC5, 0x98, 0xC5,
+ 0x9A, 0xC5, 0x9C, 0xC5, 0x9E, 0xC5, 0xA0, 0xC5,
+ 0xA2, 0xC5, 0xA4, 0xC5, 0xA6, 0xC5, 0xA8, 0xC5,
+ 0xAA, 0xC5, 0xAC, 0xC5, 0xAE, 0xC5, 0xB0, 0xC5,
+ 0xB2, 0xC5, 0xB4, 0xC5, 0xB6, 0xC5, 0xB9, 0xC5,
+ 0xBB, 0xC5, 0xBD, 0x53, 0xC6, 0x82, 0xC6, 0x84,
+ 0xC6, 0x87, 0xC6, 0x8B, 0xC6, 0x91, 0xC7, 0xB6,
+ 0xC6, 0x98, 0xC8, 0xA0, 0xC6, 0xA0, 0xC6, 0xA2,
+ 0xC6, 0xA4, 0xC6, 0xA7, 0xC6, 0xAC, 0xC6, 0xAF,
+ 0xC6, 0xB3, 0xC6, 0xB5, 0xC6, 0xB8, 0xC6, 0xBC,
+ 0xC7, 0xB7, 0xC7, 0x84, 0xC7, 0x84, 0xC7, 0x87,
+ 0xC7, 0x87, 0xC7, 0x8A, 0xC7, 0x8A, 0xC7, 0x8D,
+ 0xC7, 0x8F, 0xC7, 0x91, 0xC7, 0x93, 0xC7, 0x95,
+ 0xC7, 0x97, 0xC7, 0x99, 0xC7, 0x9B, 0xC6, 0x8E,
+ 0xC7, 0x9E, 0xC7, 0xA0, 0xC7, 0xA2, 0xC7, 0xA4,
+ 0xC7, 0xA6, 0xC7, 0xA8, 0xC7, 0xAA, 0xC7, 0xAC,
+ 0xC7, 0xAE, 0xC7, 0xB1, 0xC7, 0xB1, 0xC7, 0xB4,
+ 0xC7, 0xB8, 0xC7, 0xBA, 0xC7, 0xBC, 0xC7, 0xBE,
+ 0xC8, 0x80, 0xC8, 0x82, 0xC8, 0x84, 0xC8, 0x86,
+ 0xC8, 0x88, 0xC8, 0x8A, 0xC8, 0x8C, 0xC8, 0x8E,
+ 0xC8, 0x90, 0xC8, 0x92, 0xC8, 0x94, 0xC8, 0x96,
+ 0xC8, 0x98, 0xC8, 0x9A, 0xC8, 0x9C, 0xC8, 0x9E,
+ 0xC8, 0xA2, 0xC8, 0xA4, 0xC8, 0xA6, 0xC8, 0xA8,
+ 0xC8, 0xAA, 0xC8, 0xAC, 0xC8, 0xAE, 0xC8, 0xB0,
+ 0xC8, 0xB2, 0xC6, 0x81, 0xC6, 0x86, 0xC6, 0x89,
+ 0xC6, 0x8A, 0xC6, 0x8F, 0xC6, 0x90, 0xC6, 0x93,
+ 0xC6, 0x94, 0xC6, 0x97, 0xC6, 0x96, 0xC6, 0x9C,
+ 0xC6, 0x9D, 0xC6, 0x9F, 0xC6, 0xA6, 0xC6, 0xA9,
+ 0xC6, 0xAE, 0xC6, 0xB1, 0xC6, 0xB2, 0xC6, 0xB7,
+ 0xCE, 0x99, 0xCE, 0x86, 0xCE, 0x88, 0xCE, 0x89,
+ 0xCE, 0x8A, 0xCE, 0x91, 0xCE, 0x92, 0xCE, 0x93,
+ 0xCE, 0x94, 0xCE, 0x95, 0xCE, 0x96, 0xCE, 0x97,
+ 0xCE, 0x98, 0xCE, 0x99, 0xCE, 0x9A, 0xCE, 0x9B,
+ 0xCE, 0x9C, 0xCE, 0x9D, 0xCE, 0x9E, 0xCE, 0x9F,
+ 0xCE, 0xA0, 0xCE, 0xA1, 0xCE, 0xA3, 0xCE, 0xA3,
+ 0xCE, 0xA4, 0xCE, 0xA5, 0xCE, 0xA6, 0xCE, 0xA7,
+ 0xCE, 0xA8, 0xCE, 0xA9, 0xCE, 0xAA, 0xCE, 0xAB,
+ 0xCE, 0x8C, 0xCE, 0x8E, 0xCE, 0x8F, 0xCE, 0x92,
+ 0xCE, 0x98, 0xCE, 0xA6, 0xCE, 0xA0, 0xCF, 0x98,
+ 0xCF, 0x9A, 0xCF, 0x9C, 0xCF, 0x9E, 0xCF, 0xA0,
+ 0xCF, 0xA2, 0xCF, 0xA4, 0xCF, 0xA6, 0xCF, 0xA8,
+ 0xCF, 0xAA, 0xCF, 0xAC, 0xCF, 0xAE, 0xCE, 0x9A,
+ 0xCE, 0xA1, 0xCE, 0xA3, 0xCE, 0x95, 0xD0, 0x90,
+ 0xD0, 0x91, 0xD0, 0x92, 0xD0, 0x93, 0xD0, 0x94,
+ 0xD0, 0x95, 0xD0, 0x96, 0xD0, 0x97, 0xD0, 0x98,
+ 0xD0, 0x99, 0xD0, 0x9A, 0xD0, 0x9B, 0xD0, 0x9C,
+ 0xD0, 0x9D, 0xD0, 0x9E, 0xD0, 0x9F, 0xD0, 0xA0,
+ 0xD0, 0xA1, 0xD0, 0xA2, 0xD0, 0xA3, 0xD0, 0xA4,
+ 0xD0, 0xA5, 0xD0, 0xA6, 0xD0, 0xA7, 0xD0, 0xA8,
+ 0xD0, 0xA9, 0xD0, 0xAA, 0xD0, 0xAB, 0xD0, 0xAC,
+ 0xD0, 0xAD, 0xD0, 0xAE, 0xD0, 0xAF, 0xD0, 0x80,
+ 0xD0, 0x81, 0xD0, 0x82, 0xD0, 0x83, 0xD0, 0x84,
+ 0xD0, 0x85, 0xD0, 0x86, 0xD0, 0x87, 0xD0, 0x88,
+ 0xD0, 0x89, 0xD0, 0x8A, 0xD0, 0x8B, 0xD0, 0x8C,
+ 0xD0, 0x8D, 0xD0, 0x8E, 0xD0, 0x8F, 0xD1, 0xA0,
+ 0xD1, 0xA2, 0xD1, 0xA4, 0xD1, 0xA6, 0xD1, 0xA8,
+ 0xD1, 0xAA, 0xD1, 0xAC, 0xD1, 0xAE, 0xD1, 0xB0,
+ 0xD1, 0xB2, 0xD1, 0xB4, 0xD1, 0xB6, 0xD1, 0xB8,
+ 0xD1, 0xBA, 0xD1, 0xBC, 0xD1, 0xBE, 0xD2, 0x80,
+ 0xD2, 0x8A, 0xD2, 0x8C, 0xD2, 0x8E, 0xD2, 0x90,
+ 0xD2, 0x92, 0xD2, 0x94, 0xD2, 0x96, 0xD2, 0x98,
+ 0xD2, 0x9A, 0xD2, 0x9C, 0xD2, 0x9E, 0xD2, 0xA0,
+ 0xD2, 0xA2, 0xD2, 0xA4, 0xD2, 0xA6, 0xD2, 0xA8,
+ 0xD2, 0xAA, 0xD2, 0xAC, 0xD2, 0xAE, 0xD2, 0xB0,
+ 0xD2, 0xB2, 0xD2, 0xB4, 0xD2, 0xB6, 0xD2, 0xB8,
+ 0xD2, 0xBA, 0xD2, 0xBC, 0xD2, 0xBE, 0xD3, 0x81,
+ 0xD3, 0x83, 0xD3, 0x85, 0xD3, 0x87, 0xD3, 0x89,
+ 0xD3, 0x8B, 0xD3, 0x8D, 0xD3, 0x90, 0xD3, 0x92,
+ 0xD3, 0x94, 0xD3, 0x96, 0xD3, 0x98, 0xD3, 0x9A,
+ 0xD3, 0x9C, 0xD3, 0x9E, 0xD3, 0xA0, 0xD3, 0xA2,
+ 0xD3, 0xA4, 0xD3, 0xA6, 0xD3, 0xA8, 0xD3, 0xAA,
+ 0xD3, 0xAC, 0xD3, 0xAE, 0xD3, 0xB0, 0xD3, 0xB2,
+ 0xD3, 0xB4, 0xD3, 0xB8, 0xD4, 0x80, 0xD4, 0x82,
+ 0xD4, 0x84, 0xD4, 0x86, 0xD4, 0x88, 0xD4, 0x8A,
+ 0xD4, 0x8C, 0xD4, 0x8E, 0xD4, 0xB1, 0xD4, 0xB2,
+ 0xD4, 0xB3, 0xD4, 0xB4, 0xD4, 0xB5, 0xD4, 0xB6,
+ 0xD4, 0xB7, 0xD4, 0xB8, 0xD4, 0xB9, 0xD4, 0xBA,
+ 0xD4, 0xBB, 0xD4, 0xBC, 0xD4, 0xBD, 0xD4, 0xBE,
+ 0xD4, 0xBF, 0xD5, 0x80, 0xD5, 0x81, 0xD5, 0x82,
+ 0xD5, 0x83, 0xD5, 0x84, 0xD5, 0x85, 0xD5, 0x86,
+ 0xD5, 0x87, 0xD5, 0x88, 0xD5, 0x89, 0xD5, 0x8A,
+ 0xD5, 0x8B, 0xD5, 0x8C, 0xD5, 0x8D, 0xD5, 0x8E,
+ 0xD5, 0x8F, 0xD5, 0x90, 0xD5, 0x91, 0xD5, 0x92,
+ 0xD5, 0x93, 0xD5, 0x94, 0xD5, 0x95, 0xD5, 0x96,
+ 0xE1, 0xB8, 0x80, 0xE1, 0xB8, 0x82, 0xE1, 0xB8,
+ 0x84, 0xE1, 0xB8, 0x86, 0xE1, 0xB8, 0x88, 0xE1,
+ 0xB8, 0x8A, 0xE1, 0xB8, 0x8C, 0xE1, 0xB8, 0x8E,
+ 0xE1, 0xB8, 0x90, 0xE1, 0xB8, 0x92, 0xE1, 0xB8,
+ 0x94, 0xE1, 0xB8, 0x96, 0xE1, 0xB8, 0x98, 0xE1,
+ 0xB8, 0x9A, 0xE1, 0xB8, 0x9C, 0xE1, 0xB8, 0x9E,
+ 0xE1, 0xB8, 0xA0, 0xE1, 0xB8, 0xA2, 0xE1, 0xB8,
+ 0xA4, 0xE1, 0xB8, 0xA6, 0xE1, 0xB8, 0xA8, 0xE1,
+ 0xB8, 0xAA, 0xE1, 0xB8, 0xAC, 0xE1, 0xB8, 0xAE,
+ 0xE1, 0xB8, 0xB0, 0xE1, 0xB8, 0xB2, 0xE1, 0xB8,
+ 0xB4, 0xE1, 0xB8, 0xB6, 0xE1, 0xB8, 0xB8, 0xE1,
+ 0xB8, 0xBA, 0xE1, 0xB8, 0xBC, 0xE1, 0xB8, 0xBE,
+ 0xE1, 0xB9, 0x80, 0xE1, 0xB9, 0x82, 0xE1, 0xB9,
+ 0x84, 0xE1, 0xB9, 0x86, 0xE1, 0xB9, 0x88, 0xE1,
+ 0xB9, 0x8A, 0xE1, 0xB9, 0x8C, 0xE1, 0xB9, 0x8E,
+ 0xE1, 0xB9, 0x90, 0xE1, 0xB9, 0x92, 0xE1, 0xB9,
+ 0x94, 0xE1, 0xB9, 0x96, 0xE1, 0xB9, 0x98, 0xE1,
+ 0xB9, 0x9A, 0xE1, 0xB9, 0x9C, 0xE1, 0xB9, 0x9E,
+ 0xE1, 0xB9, 0xA0, 0xE1, 0xB9, 0xA2, 0xE1, 0xB9,
+ 0xA4, 0xE1, 0xB9, 0xA6, 0xE1, 0xB9, 0xA8, 0xE1,
+ 0xB9, 0xAA, 0xE1, 0xB9, 0xAC, 0xE1, 0xB9, 0xAE,
+ 0xE1, 0xB9, 0xB0, 0xE1, 0xB9, 0xB2, 0xE1, 0xB9,
+ 0xB4, 0xE1, 0xB9, 0xB6, 0xE1, 0xB9, 0xB8, 0xE1,
+ 0xB9, 0xBA, 0xE1, 0xB9, 0xBC, 0xE1, 0xB9, 0xBE,
+ 0xE1, 0xBA, 0x80, 0xE1, 0xBA, 0x82, 0xE1, 0xBA,
+ 0x84, 0xE1, 0xBA, 0x86, 0xE1, 0xBA, 0x88, 0xE1,
+ 0xBA, 0x8A, 0xE1, 0xBA, 0x8C, 0xE1, 0xBA, 0x8E,
+ 0xE1, 0xBA, 0x90, 0xE1, 0xBA, 0x92, 0xE1, 0xBA,
+ 0x94, 0xE1, 0xB9, 0xA0, 0xE1, 0xBA, 0xA0, 0xE1,
+ 0xBA, 0xA2, 0xE1, 0xBA, 0xA4, 0xE1, 0xBA, 0xA6,
+ 0xE1, 0xBA, 0xA8, 0xE1, 0xBA, 0xAA, 0xE1, 0xBA,
+ 0xAC, 0xE1, 0xBA, 0xAE, 0xE1, 0xBA, 0xB0, 0xE1,
+ 0xBA, 0xB2, 0xE1, 0xBA, 0xB4, 0xE1, 0xBA, 0xB6,
+ 0xE1, 0xBA, 0xB8, 0xE1, 0xBA, 0xBA, 0xE1, 0xBA,
+ 0xBC, 0xE1, 0xBA, 0xBE, 0xE1, 0xBB, 0x80, 0xE1,
+ 0xBB, 0x82, 0xE1, 0xBB, 0x84, 0xE1, 0xBB, 0x86,
+ 0xE1, 0xBB, 0x88, 0xE1, 0xBB, 0x8A, 0xE1, 0xBB,
+ 0x8C, 0xE1, 0xBB, 0x8E, 0xE1, 0xBB, 0x90, 0xE1,
+ 0xBB, 0x92, 0xE1, 0xBB, 0x94, 0xE1, 0xBB, 0x96,
+ 0xE1, 0xBB, 0x98, 0xE1, 0xBB, 0x9A, 0xE1, 0xBB,
+ 0x9C, 0xE1, 0xBB, 0x9E, 0xE1, 0xBB, 0xA0, 0xE1,
+ 0xBB, 0xA2, 0xE1, 0xBB, 0xA4, 0xE1, 0xBB, 0xA6,
+ 0xE1, 0xBB, 0xA8, 0xE1, 0xBB, 0xAA, 0xE1, 0xBB,
+ 0xAC, 0xE1, 0xBB, 0xAE, 0xE1, 0xBB, 0xB0, 0xE1,
+ 0xBB, 0xB2, 0xE1, 0xBB, 0xB4, 0xE1, 0xBB, 0xB6,
+ 0xE1, 0xBB, 0xB8, 0xE1, 0xBC, 0x88, 0xE1, 0xBC,
+ 0x89, 0xE1, 0xBC, 0x8A, 0xE1, 0xBC, 0x8B, 0xE1,
+ 0xBC, 0x8C, 0xE1, 0xBC, 0x8D, 0xE1, 0xBC, 0x8E,
+ 0xE1, 0xBC, 0x8F, 0xE1, 0xBC, 0x98, 0xE1, 0xBC,
+ 0x99, 0xE1, 0xBC, 0x9A, 0xE1, 0xBC, 0x9B, 0xE1,
+ 0xBC, 0x9C, 0xE1, 0xBC, 0x9D, 0xE1, 0xBC, 0xA8,
+ 0xE1, 0xBC, 0xA9, 0xE1, 0xBC, 0xAA, 0xE1, 0xBC,
+ 0xAB, 0xE1, 0xBC, 0xAC, 0xE1, 0xBC, 0xAD, 0xE1,
+ 0xBC, 0xAE, 0xE1, 0xBC, 0xAF, 0xE1, 0xBC, 0xB8,
+ 0xE1, 0xBC, 0xB9, 0xE1, 0xBC, 0xBA, 0xE1, 0xBC,
+ 0xBB, 0xE1, 0xBC, 0xBC, 0xE1, 0xBC, 0xBD, 0xE1,
+ 0xBC, 0xBE, 0xE1, 0xBC, 0xBF, 0xE1, 0xBD, 0x88,
+ 0xE1, 0xBD, 0x89, 0xE1, 0xBD, 0x8A, 0xE1, 0xBD,
+ 0x8B, 0xE1, 0xBD, 0x8C, 0xE1, 0xBD, 0x8D, 0xE1,
+ 0xBD, 0x99, 0xE1, 0xBD, 0x9B, 0xE1, 0xBD, 0x9D,
+ 0xE1, 0xBD, 0x9F, 0xE1, 0xBD, 0xA8, 0xE1, 0xBD,
+ 0xA9, 0xE1, 0xBD, 0xAA, 0xE1, 0xBD, 0xAB, 0xE1,
+ 0xBD, 0xAC, 0xE1, 0xBD, 0xAD, 0xE1, 0xBD, 0xAE,
+ 0xE1, 0xBD, 0xAF, 0xE1, 0xBE, 0xBA, 0xE1, 0xBE,
+ 0xBB, 0xE1, 0xBF, 0x88, 0xE1, 0xBF, 0x89, 0xE1,
+ 0xBF, 0x8A, 0xE1, 0xBF, 0x8B, 0xE1, 0xBF, 0x9A,
+ 0xE1, 0xBF, 0x9B, 0xE1, 0xBF, 0xB8, 0xE1, 0xBF,
+ 0xB9, 0xE1, 0xBF, 0xAA, 0xE1, 0xBF, 0xAB, 0xE1,
+ 0xBF, 0xBA, 0xE1, 0xBF, 0xBB, 0xE1, 0xBE, 0x88,
+ 0xE1, 0xBE, 0x89, 0xE1, 0xBE, 0x8A, 0xE1, 0xBE,
+ 0x8B, 0xE1, 0xBE, 0x8C, 0xE1, 0xBE, 0x8D, 0xE1,
+ 0xBE, 0x8E, 0xE1, 0xBE, 0x8F, 0xE1, 0xBE, 0x98,
+ 0xE1, 0xBE, 0x99, 0xE1, 0xBE, 0x9A, 0xE1, 0xBE,
+ 0x9B, 0xE1, 0xBE, 0x9C, 0xE1, 0xBE, 0x9D, 0xE1,
+ 0xBE, 0x9E, 0xE1, 0xBE, 0x9F, 0xE1, 0xBE, 0xA8,
+ 0xE1, 0xBE, 0xA9, 0xE1, 0xBE, 0xAA, 0xE1, 0xBE,
+ 0xAB, 0xE1, 0xBE, 0xAC, 0xE1, 0xBE, 0xAD, 0xE1,
+ 0xBE, 0xAE, 0xE1, 0xBE, 0xAF, 0xE1, 0xBE, 0xB8,
+ 0xE1, 0xBE, 0xB9, 0xE1, 0xBE, 0xBC, 0xCE, 0x99,
+ 0xE1, 0xBF, 0x8C, 0xE1, 0xBF, 0x98, 0xE1, 0xBF,
+ 0x99, 0xE1, 0xBF, 0xA8, 0xE1, 0xBF, 0xA9, 0xE1,
+ 0xBF, 0xAC, 0xE1, 0xBF, 0xBC, 0xE2, 0x85, 0xA0,
+ 0xE2, 0x85, 0xA1, 0xE2, 0x85, 0xA2, 0xE2, 0x85,
+ 0xA3, 0xE2, 0x85, 0xA4, 0xE2, 0x85, 0xA5, 0xE2,
+ 0x85, 0xA6, 0xE2, 0x85, 0xA7, 0xE2, 0x85, 0xA8,
+ 0xE2, 0x85, 0xA9, 0xE2, 0x85, 0xAA, 0xE2, 0x85,
+ 0xAB, 0xE2, 0x85, 0xAC, 0xE2, 0x85, 0xAD, 0xE2,
+ 0x85, 0xAE, 0xE2, 0x85, 0xAF, 0xE2, 0x92, 0xB6,
+ 0xE2, 0x92, 0xB7, 0xE2, 0x92, 0xB8, 0xE2, 0x92,
+ 0xB9, 0xE2, 0x92, 0xBA, 0xE2, 0x92, 0xBB, 0xE2,
+ 0x92, 0xBC, 0xE2, 0x92, 0xBD, 0xE2, 0x92, 0xBE,
+ 0xE2, 0x92, 0xBF, 0xE2, 0x93, 0x80, 0xE2, 0x93,
+ 0x81, 0xE2, 0x93, 0x82, 0xE2, 0x93, 0x83, 0xE2,
+ 0x93, 0x84, 0xE2, 0x93, 0x85, 0xE2, 0x93, 0x86,
+ 0xE2, 0x93, 0x87, 0xE2, 0x93, 0x88, 0xE2, 0x93,
+ 0x89, 0xE2, 0x93, 0x8A, 0xE2, 0x93, 0x8B, 0xE2,
+ 0x93, 0x8C, 0xE2, 0x93, 0x8D, 0xE2, 0x93, 0x8E,
+ 0xE2, 0x93, 0x8F, 0xEF, 0xBC, 0xA1, 0xEF, 0xBC,
+ 0xA2, 0xEF, 0xBC, 0xA3, 0xEF, 0xBC, 0xA4, 0xEF,
+ 0xBC, 0xA5, 0xEF, 0xBC, 0xA6, 0xEF, 0xBC, 0xA7,
+ 0xEF, 0xBC, 0xA8, 0xEF, 0xBC, 0xA9, 0xEF, 0xBC,
+ 0xAA, 0xEF, 0xBC, 0xAB, 0xEF, 0xBC, 0xAC, 0xEF,
+ 0xBC, 0xAD, 0xEF, 0xBC, 0xAE, 0xEF, 0xBC, 0xAF,
+ 0xEF, 0xBC, 0xB0, 0xEF, 0xBC, 0xB1, 0xEF, 0xBC,
+ 0xB2, 0xEF, 0xBC, 0xB3, 0xEF, 0xBC, 0xB4, 0xEF,
+ 0xBC, 0xB5, 0xEF, 0xBC, 0xB6, 0xEF, 0xBC, 0xB7,
+ 0xEF, 0xBC, 0xB8, 0xEF, 0xBC, 0xB9, 0xEF, 0xBC,
+ 0xBA, 0xF0, 0x90, 0x90, 0x80, 0xF0, 0x90, 0x90,
+ 0x81, 0xF0, 0x90, 0x90, 0x82, 0xF0, 0x90, 0x90,
+ 0x83, 0xF0, 0x90, 0x90, 0x84, 0xF0, 0x90, 0x90,
+ 0x85, 0xF0, 0x90, 0x90, 0x86, 0xF0, 0x90, 0x90,
+ 0x87, 0xF0, 0x90, 0x90, 0x88, 0xF0, 0x90, 0x90,
+ 0x89, 0xF0, 0x90, 0x90, 0x8A, 0xF0, 0x90, 0x90,
+ 0x8B, 0xF0, 0x90, 0x90, 0x8C, 0xF0, 0x90, 0x90,
+ 0x8D, 0xF0, 0x90, 0x90, 0x8E, 0xF0, 0x90, 0x90,
+ 0x8F, 0xF0, 0x90, 0x90, 0x90, 0xF0, 0x90, 0x90,
+ 0x91, 0xF0, 0x90, 0x90, 0x92, 0xF0, 0x90, 0x90,
+ 0x93, 0xF0, 0x90, 0x90, 0x94, 0xF0, 0x90, 0x90,
+ 0x95, 0xF0, 0x90, 0x90, 0x96, 0xF0, 0x90, 0x90,
+ 0x97, 0xF0, 0x90, 0x90, 0x98, 0xF0, 0x90, 0x90,
+ 0x99, 0xF0, 0x90, 0x90, 0x9A, 0xF0, 0x90, 0x90,
+ 0x9B, 0xF0, 0x90, 0x90, 0x9C, 0xF0, 0x90, 0x90,
+ 0x9D, 0xF0, 0x90, 0x90, 0x9E, 0xF0, 0x90, 0x90,
+ 0x9F, 0xF0, 0x90, 0x90, 0xA0, 0xF0, 0x90, 0x90,
+ 0xA1, 0xF0, 0x90, 0x90, 0xA2, 0xF0, 0x90, 0x90,
+ 0xA3, 0xF0, 0x90, 0x90, 0xA4, 0xF0, 0x90, 0x90,
+ 0xA5, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0,
+ },
+ {
+ 0xCE, 0x9C, 0xC3, 0x80, 0xC3, 0x81, 0xC3, 0x82,
+ 0xC3, 0x83, 0xC3, 0x84, 0xC3, 0x85, 0xC3, 0x86,
+ 0xC3, 0x87, 0xC3, 0x88, 0xC3, 0x89, 0xC3, 0x8A,
+ 0xC3, 0x8B, 0xC3, 0x8C, 0xC3, 0x8D, 0xC3, 0x8E,
+ 0xC3, 0x8F, 0xC3, 0x90, 0xC3, 0x91, 0xC3, 0x92,
+ 0xC3, 0x93, 0xC3, 0x94, 0xC3, 0x95, 0xC3, 0x96,
+ 0xC3, 0x98, 0xC3, 0x99, 0xC3, 0x9A, 0xC3, 0x9B,
+ 0xC3, 0x9C, 0xC3, 0x9D, 0xC3, 0x9E, 0xC5, 0xB8,
+ 0xC4, 0x80, 0xC4, 0x82, 0xC4, 0x84, 0xC4, 0x86,
+ 0xC4, 0x88, 0xC4, 0x8A, 0xC4, 0x8C, 0xC4, 0x8E,
+ 0xC4, 0x90, 0xC4, 0x92, 0xC4, 0x94, 0xC4, 0x96,
+ 0xC4, 0x98, 0xC4, 0x9A, 0xC4, 0x9C, 0xC4, 0x9E,
+ 0xC4, 0xA0, 0xC4, 0xA2, 0xC4, 0xA4, 0xC4, 0xA6,
+ 0xC4, 0xA8, 0xC4, 0xAA, 0xC4, 0xAC, 0xC4, 0xAE,
+ 0x49, 0xC4, 0xB2, 0xC4, 0xB4, 0xC4, 0xB6, 0xC4,
+ 0xB9, 0xC4, 0xBB, 0xC4, 0xBD, 0xC4, 0xBF, 0xC5,
+ 0x81, 0xC5, 0x83, 0xC5, 0x85, 0xC5, 0x87, 0xC5,
+ 0x8A, 0xC5, 0x8C, 0xC5, 0x8E, 0xC5, 0x90, 0xC5,
+ 0x92, 0xC5, 0x94, 0xC5, 0x96, 0xC5, 0x98, 0xC5,
+ 0x9A, 0xC5, 0x9C, 0xC5, 0x9E, 0xC5, 0xA0, 0xC5,
+ 0xA2, 0xC5, 0xA4, 0xC5, 0xA6, 0xC5, 0xA8, 0xC5,
+ 0xAA, 0xC5, 0xAC, 0xC5, 0xAE, 0xC5, 0xB0, 0xC5,
+ 0xB2, 0xC5, 0xB4, 0xC5, 0xB6, 0xC5, 0xB9, 0xC5,
+ 0xBB, 0xC5, 0xBD, 0x53, 0xC9, 0x83, 0xC6, 0x82,
+ 0xC6, 0x84, 0xC6, 0x87, 0xC6, 0x8B, 0xC6, 0x91,
+ 0xC7, 0xB6, 0xC6, 0x98, 0xC8, 0xBD, 0xC8, 0xA0,
+ 0xC6, 0xA0, 0xC6, 0xA2, 0xC6, 0xA4, 0xC6, 0xA7,
+ 0xC6, 0xAC, 0xC6, 0xAF, 0xC6, 0xB3, 0xC6, 0xB5,
+ 0xC6, 0xB8, 0xC6, 0xBC, 0xC7, 0xB7, 0xC7, 0x84,
+ 0xC7, 0x84, 0xC7, 0x87, 0xC7, 0x87, 0xC7, 0x8A,
+ 0xC7, 0x8A, 0xC7, 0x8D, 0xC7, 0x8F, 0xC7, 0x91,
+ 0xC7, 0x93, 0xC7, 0x95, 0xC7, 0x97, 0xC7, 0x99,
+ 0xC7, 0x9B, 0xC6, 0x8E, 0xC7, 0x9E, 0xC7, 0xA0,
+ 0xC7, 0xA2, 0xC7, 0xA4, 0xC7, 0xA6, 0xC7, 0xA8,
+ 0xC7, 0xAA, 0xC7, 0xAC, 0xC7, 0xAE, 0xC7, 0xB1,
+ 0xC7, 0xB1, 0xC7, 0xB4, 0xC7, 0xB8, 0xC7, 0xBA,
+ 0xC7, 0xBC, 0xC7, 0xBE, 0xC8, 0x80, 0xC8, 0x82,
+ 0xC8, 0x84, 0xC8, 0x86, 0xC8, 0x88, 0xC8, 0x8A,
+ 0xC8, 0x8C, 0xC8, 0x8E, 0xC8, 0x90, 0xC8, 0x92,
+ 0xC8, 0x94, 0xC8, 0x96, 0xC8, 0x98, 0xC8, 0x9A,
+ 0xC8, 0x9C, 0xC8, 0x9E, 0xC8, 0xA2, 0xC8, 0xA4,
+ 0xC8, 0xA6, 0xC8, 0xA8, 0xC8, 0xAA, 0xC8, 0xAC,
+ 0xC8, 0xAE, 0xC8, 0xB0, 0xC8, 0xB2, 0xC8, 0xBB,
+ 0xC9, 0x81, 0xC9, 0x86, 0xC9, 0x88, 0xC9, 0x8A,
+ 0xC9, 0x8C, 0xC9, 0x8E, 0xC6, 0x81, 0xC6, 0x86,
+ 0xC6, 0x89, 0xC6, 0x8A, 0xC6, 0x8F, 0xC6, 0x90,
+ 0xC6, 0x93, 0xC6, 0x94, 0xC6, 0x97, 0xC6, 0x96,
+ 0xE2, 0xB1, 0xA2, 0xC6, 0x9C, 0xC6, 0x9D, 0xC6,
+ 0x9F, 0xE2, 0xB1, 0xA4, 0xC6, 0xA6, 0xC6, 0xA9,
+ 0xC6, 0xAE, 0xC9, 0x84, 0xC6, 0xB1, 0xC6, 0xB2,
+ 0xC9, 0x85, 0xC6, 0xB7, 0xCE, 0x99, 0xCF, 0xBD,
+ 0xCF, 0xBE, 0xCF, 0xBF, 0xCE, 0x86, 0xCE, 0x88,
+ 0xCE, 0x89, 0xCE, 0x8A, 0xCE, 0x91, 0xCE, 0x92,
+ 0xCE, 0x93, 0xCE, 0x94, 0xCE, 0x95, 0xCE, 0x96,
+ 0xCE, 0x97, 0xCE, 0x98, 0xCE, 0x99, 0xCE, 0x9A,
+ 0xCE, 0x9B, 0xCE, 0x9C, 0xCE, 0x9D, 0xCE, 0x9E,
+ 0xCE, 0x9F, 0xCE, 0xA0, 0xCE, 0xA1, 0xCE, 0xA3,
+ 0xCE, 0xA3, 0xCE, 0xA4, 0xCE, 0xA5, 0xCE, 0xA6,
+ 0xCE, 0xA7, 0xCE, 0xA8, 0xCE, 0xA9, 0xCE, 0xAA,
+ 0xCE, 0xAB, 0xCE, 0x8C, 0xCE, 0x8E, 0xCE, 0x8F,
+ 0xCE, 0x92, 0xCE, 0x98, 0xCE, 0xA6, 0xCE, 0xA0,
+ 0xCF, 0x98, 0xCF, 0x9A, 0xCF, 0x9C, 0xCF, 0x9E,
+ 0xCF, 0xA0, 0xCF, 0xA2, 0xCF, 0xA4, 0xCF, 0xA6,
+ 0xCF, 0xA8, 0xCF, 0xAA, 0xCF, 0xAC, 0xCF, 0xAE,
+ 0xCE, 0x9A, 0xCE, 0xA1, 0xCF, 0xB9, 0xCE, 0x95,
+ 0xCF, 0xB7, 0xCF, 0xBA, 0xD0, 0x90, 0xD0, 0x91,
+ 0xD0, 0x92, 0xD0, 0x93, 0xD0, 0x94, 0xD0, 0x95,
+ 0xD0, 0x96, 0xD0, 0x97, 0xD0, 0x98, 0xD0, 0x99,
+ 0xD0, 0x9A, 0xD0, 0x9B, 0xD0, 0x9C, 0xD0, 0x9D,
+ 0xD0, 0x9E, 0xD0, 0x9F, 0xD0, 0xA0, 0xD0, 0xA1,
+ 0xD0, 0xA2, 0xD0, 0xA3, 0xD0, 0xA4, 0xD0, 0xA5,
+ 0xD0, 0xA6, 0xD0, 0xA7, 0xD0, 0xA8, 0xD0, 0xA9,
+ 0xD0, 0xAA, 0xD0, 0xAB, 0xD0, 0xAC, 0xD0, 0xAD,
+ 0xD0, 0xAE, 0xD0, 0xAF, 0xD0, 0x80, 0xD0, 0x81,
+ 0xD0, 0x82, 0xD0, 0x83, 0xD0, 0x84, 0xD0, 0x85,
+ 0xD0, 0x86, 0xD0, 0x87, 0xD0, 0x88, 0xD0, 0x89,
+ 0xD0, 0x8A, 0xD0, 0x8B, 0xD0, 0x8C, 0xD0, 0x8D,
+ 0xD0, 0x8E, 0xD0, 0x8F, 0xD1, 0xA0, 0xD1, 0xA2,
+ 0xD1, 0xA4, 0xD1, 0xA6, 0xD1, 0xA8, 0xD1, 0xAA,
+ 0xD1, 0xAC, 0xD1, 0xAE, 0xD1, 0xB0, 0xD1, 0xB2,
+ 0xD1, 0xB4, 0xD1, 0xB6, 0xD1, 0xB8, 0xD1, 0xBA,
+ 0xD1, 0xBC, 0xD1, 0xBE, 0xD2, 0x80, 0xD2, 0x8A,
+ 0xD2, 0x8C, 0xD2, 0x8E, 0xD2, 0x90, 0xD2, 0x92,
+ 0xD2, 0x94, 0xD2, 0x96, 0xD2, 0x98, 0xD2, 0x9A,
+ 0xD2, 0x9C, 0xD2, 0x9E, 0xD2, 0xA0, 0xD2, 0xA2,
+ 0xD2, 0xA4, 0xD2, 0xA6, 0xD2, 0xA8, 0xD2, 0xAA,
+ 0xD2, 0xAC, 0xD2, 0xAE, 0xD2, 0xB0, 0xD2, 0xB2,
+ 0xD2, 0xB4, 0xD2, 0xB6, 0xD2, 0xB8, 0xD2, 0xBA,
+ 0xD2, 0xBC, 0xD2, 0xBE, 0xD3, 0x81, 0xD3, 0x83,
+ 0xD3, 0x85, 0xD3, 0x87, 0xD3, 0x89, 0xD3, 0x8B,
+ 0xD3, 0x8D, 0xD3, 0x80, 0xD3, 0x90, 0xD3, 0x92,
+ 0xD3, 0x94, 0xD3, 0x96, 0xD3, 0x98, 0xD3, 0x9A,
+ 0xD3, 0x9C, 0xD3, 0x9E, 0xD3, 0xA0, 0xD3, 0xA2,
+ 0xD3, 0xA4, 0xD3, 0xA6, 0xD3, 0xA8, 0xD3, 0xAA,
+ 0xD3, 0xAC, 0xD3, 0xAE, 0xD3, 0xB0, 0xD3, 0xB2,
+ 0xD3, 0xB4, 0xD3, 0xB6, 0xD3, 0xB8, 0xD3, 0xBA,
+ 0xD3, 0xBC, 0xD3, 0xBE, 0xD4, 0x80, 0xD4, 0x82,
+ 0xD4, 0x84, 0xD4, 0x86, 0xD4, 0x88, 0xD4, 0x8A,
+ 0xD4, 0x8C, 0xD4, 0x8E, 0xD4, 0x90, 0xD4, 0x92,
+ 0xD4, 0xB1, 0xD4, 0xB2, 0xD4, 0xB3, 0xD4, 0xB4,
+ 0xD4, 0xB5, 0xD4, 0xB6, 0xD4, 0xB7, 0xD4, 0xB8,
+ 0xD4, 0xB9, 0xD4, 0xBA, 0xD4, 0xBB, 0xD4, 0xBC,
+ 0xD4, 0xBD, 0xD4, 0xBE, 0xD4, 0xBF, 0xD5, 0x80,
+ 0xD5, 0x81, 0xD5, 0x82, 0xD5, 0x83, 0xD5, 0x84,
+ 0xD5, 0x85, 0xD5, 0x86, 0xD5, 0x87, 0xD5, 0x88,
+ 0xD5, 0x89, 0xD5, 0x8A, 0xD5, 0x8B, 0xD5, 0x8C,
+ 0xD5, 0x8D, 0xD5, 0x8E, 0xD5, 0x8F, 0xD5, 0x90,
+ 0xD5, 0x91, 0xD5, 0x92, 0xD5, 0x93, 0xD5, 0x94,
+ 0xD5, 0x95, 0xD5, 0x96, 0xE2, 0xB1, 0xA3, 0xE1,
+ 0xB8, 0x80, 0xE1, 0xB8, 0x82, 0xE1, 0xB8, 0x84,
+ 0xE1, 0xB8, 0x86, 0xE1, 0xB8, 0x88, 0xE1, 0xB8,
+ 0x8A, 0xE1, 0xB8, 0x8C, 0xE1, 0xB8, 0x8E, 0xE1,
+ 0xB8, 0x90, 0xE1, 0xB8, 0x92, 0xE1, 0xB8, 0x94,
+ 0xE1, 0xB8, 0x96, 0xE1, 0xB8, 0x98, 0xE1, 0xB8,
+ 0x9A, 0xE1, 0xB8, 0x9C, 0xE1, 0xB8, 0x9E, 0xE1,
+ 0xB8, 0xA0, 0xE1, 0xB8, 0xA2, 0xE1, 0xB8, 0xA4,
+ 0xE1, 0xB8, 0xA6, 0xE1, 0xB8, 0xA8, 0xE1, 0xB8,
+ 0xAA, 0xE1, 0xB8, 0xAC, 0xE1, 0xB8, 0xAE, 0xE1,
+ 0xB8, 0xB0, 0xE1, 0xB8, 0xB2, 0xE1, 0xB8, 0xB4,
+ 0xE1, 0xB8, 0xB6, 0xE1, 0xB8, 0xB8, 0xE1, 0xB8,
+ 0xBA, 0xE1, 0xB8, 0xBC, 0xE1, 0xB8, 0xBE, 0xE1,
+ 0xB9, 0x80, 0xE1, 0xB9, 0x82, 0xE1, 0xB9, 0x84,
+ 0xE1, 0xB9, 0x86, 0xE1, 0xB9, 0x88, 0xE1, 0xB9,
+ 0x8A, 0xE1, 0xB9, 0x8C, 0xE1, 0xB9, 0x8E, 0xE1,
+ 0xB9, 0x90, 0xE1, 0xB9, 0x92, 0xE1, 0xB9, 0x94,
+ 0xE1, 0xB9, 0x96, 0xE1, 0xB9, 0x98, 0xE1, 0xB9,
+ 0x9A, 0xE1, 0xB9, 0x9C, 0xE1, 0xB9, 0x9E, 0xE1,
+ 0xB9, 0xA0, 0xE1, 0xB9, 0xA2, 0xE1, 0xB9, 0xA4,
+ 0xE1, 0xB9, 0xA6, 0xE1, 0xB9, 0xA8, 0xE1, 0xB9,
+ 0xAA, 0xE1, 0xB9, 0xAC, 0xE1, 0xB9, 0xAE, 0xE1,
+ 0xB9, 0xB0, 0xE1, 0xB9, 0xB2, 0xE1, 0xB9, 0xB4,
+ 0xE1, 0xB9, 0xB6, 0xE1, 0xB9, 0xB8, 0xE1, 0xB9,
+ 0xBA, 0xE1, 0xB9, 0xBC, 0xE1, 0xB9, 0xBE, 0xE1,
+ 0xBA, 0x80, 0xE1, 0xBA, 0x82, 0xE1, 0xBA, 0x84,
+ 0xE1, 0xBA, 0x86, 0xE1, 0xBA, 0x88, 0xE1, 0xBA,
+ 0x8A, 0xE1, 0xBA, 0x8C, 0xE1, 0xBA, 0x8E, 0xE1,
+ 0xBA, 0x90, 0xE1, 0xBA, 0x92, 0xE1, 0xBA, 0x94,
+ 0xE1, 0xB9, 0xA0, 0xE1, 0xBA, 0xA0, 0xE1, 0xBA,
+ 0xA2, 0xE1, 0xBA, 0xA4, 0xE1, 0xBA, 0xA6, 0xE1,
+ 0xBA, 0xA8, 0xE1, 0xBA, 0xAA, 0xE1, 0xBA, 0xAC,
+ 0xE1, 0xBA, 0xAE, 0xE1, 0xBA, 0xB0, 0xE1, 0xBA,
+ 0xB2, 0xE1, 0xBA, 0xB4, 0xE1, 0xBA, 0xB6, 0xE1,
+ 0xBA, 0xB8, 0xE1, 0xBA, 0xBA, 0xE1, 0xBA, 0xBC,
+ 0xE1, 0xBA, 0xBE, 0xE1, 0xBB, 0x80, 0xE1, 0xBB,
+ 0x82, 0xE1, 0xBB, 0x84, 0xE1, 0xBB, 0x86, 0xE1,
+ 0xBB, 0x88, 0xE1, 0xBB, 0x8A, 0xE1, 0xBB, 0x8C,
+ 0xE1, 0xBB, 0x8E, 0xE1, 0xBB, 0x90, 0xE1, 0xBB,
+ 0x92, 0xE1, 0xBB, 0x94, 0xE1, 0xBB, 0x96, 0xE1,
+ 0xBB, 0x98, 0xE1, 0xBB, 0x9A, 0xE1, 0xBB, 0x9C,
+ 0xE1, 0xBB, 0x9E, 0xE1, 0xBB, 0xA0, 0xE1, 0xBB,
+ 0xA2, 0xE1, 0xBB, 0xA4, 0xE1, 0xBB, 0xA6, 0xE1,
+ 0xBB, 0xA8, 0xE1, 0xBB, 0xAA, 0xE1, 0xBB, 0xAC,
+ 0xE1, 0xBB, 0xAE, 0xE1, 0xBB, 0xB0, 0xE1, 0xBB,
+ 0xB2, 0xE1, 0xBB, 0xB4, 0xE1, 0xBB, 0xB6, 0xE1,
+ 0xBB, 0xB8, 0xE1, 0xBC, 0x88, 0xE1, 0xBC, 0x89,
+ 0xE1, 0xBC, 0x8A, 0xE1, 0xBC, 0x8B, 0xE1, 0xBC,
+ 0x8C, 0xE1, 0xBC, 0x8D, 0xE1, 0xBC, 0x8E, 0xE1,
+ 0xBC, 0x8F, 0xE1, 0xBC, 0x98, 0xE1, 0xBC, 0x99,
+ 0xE1, 0xBC, 0x9A, 0xE1, 0xBC, 0x9B, 0xE1, 0xBC,
+ 0x9C, 0xE1, 0xBC, 0x9D, 0xE1, 0xBC, 0xA8, 0xE1,
+ 0xBC, 0xA9, 0xE1, 0xBC, 0xAA, 0xE1, 0xBC, 0xAB,
+ 0xE1, 0xBC, 0xAC, 0xE1, 0xBC, 0xAD, 0xE1, 0xBC,
+ 0xAE, 0xE1, 0xBC, 0xAF, 0xE1, 0xBC, 0xB8, 0xE1,
+ 0xBC, 0xB9, 0xE1, 0xBC, 0xBA, 0xE1, 0xBC, 0xBB,
+ 0xE1, 0xBC, 0xBC, 0xE1, 0xBC, 0xBD, 0xE1, 0xBC,
+ 0xBE, 0xE1, 0xBC, 0xBF, 0xE1, 0xBD, 0x88, 0xE1,
+ 0xBD, 0x89, 0xE1, 0xBD, 0x8A, 0xE1, 0xBD, 0x8B,
+ 0xE1, 0xBD, 0x8C, 0xE1, 0xBD, 0x8D, 0xE1, 0xBD,
+ 0x99, 0xE1, 0xBD, 0x9B, 0xE1, 0xBD, 0x9D, 0xE1,
+ 0xBD, 0x9F, 0xE1, 0xBD, 0xA8, 0xE1, 0xBD, 0xA9,
+ 0xE1, 0xBD, 0xAA, 0xE1, 0xBD, 0xAB, 0xE1, 0xBD,
+ 0xAC, 0xE1, 0xBD, 0xAD, 0xE1, 0xBD, 0xAE, 0xE1,
+ 0xBD, 0xAF, 0xE1, 0xBE, 0xBA, 0xE1, 0xBE, 0xBB,
+ 0xE1, 0xBF, 0x88, 0xE1, 0xBF, 0x89, 0xE1, 0xBF,
+ 0x8A, 0xE1, 0xBF, 0x8B, 0xE1, 0xBF, 0x9A, 0xE1,
+ 0xBF, 0x9B, 0xE1, 0xBF, 0xB8, 0xE1, 0xBF, 0xB9,
+ 0xE1, 0xBF, 0xAA, 0xE1, 0xBF, 0xAB, 0xE1, 0xBF,
+ 0xBA, 0xE1, 0xBF, 0xBB, 0xE1, 0xBE, 0x88, 0xE1,
+ 0xBE, 0x89, 0xE1, 0xBE, 0x8A, 0xE1, 0xBE, 0x8B,
+ 0xE1, 0xBE, 0x8C, 0xE1, 0xBE, 0x8D, 0xE1, 0xBE,
+ 0x8E, 0xE1, 0xBE, 0x8F, 0xE1, 0xBE, 0x98, 0xE1,
+ 0xBE, 0x99, 0xE1, 0xBE, 0x9A, 0xE1, 0xBE, 0x9B,
+ 0xE1, 0xBE, 0x9C, 0xE1, 0xBE, 0x9D, 0xE1, 0xBE,
+ 0x9E, 0xE1, 0xBE, 0x9F, 0xE1, 0xBE, 0xA8, 0xE1,
+ 0xBE, 0xA9, 0xE1, 0xBE, 0xAA, 0xE1, 0xBE, 0xAB,
+ 0xE1, 0xBE, 0xAC, 0xE1, 0xBE, 0xAD, 0xE1, 0xBE,
+ 0xAE, 0xE1, 0xBE, 0xAF, 0xE1, 0xBE, 0xB8, 0xE1,
+ 0xBE, 0xB9, 0xE1, 0xBE, 0xBC, 0xCE, 0x99, 0xE1,
+ 0xBF, 0x8C, 0xE1, 0xBF, 0x98, 0xE1, 0xBF, 0x99,
+ 0xE1, 0xBF, 0xA8, 0xE1, 0xBF, 0xA9, 0xE1, 0xBF,
+ 0xAC, 0xE1, 0xBF, 0xBC, 0xE2, 0x84, 0xB2, 0xE2,
+ 0x85, 0xA0, 0xE2, 0x85, 0xA1, 0xE2, 0x85, 0xA2,
+ 0xE2, 0x85, 0xA3, 0xE2, 0x85, 0xA4, 0xE2, 0x85,
+ 0xA5, 0xE2, 0x85, 0xA6, 0xE2, 0x85, 0xA7, 0xE2,
+ 0x85, 0xA8, 0xE2, 0x85, 0xA9, 0xE2, 0x85, 0xAA,
+ 0xE2, 0x85, 0xAB, 0xE2, 0x85, 0xAC, 0xE2, 0x85,
+ 0xAD, 0xE2, 0x85, 0xAE, 0xE2, 0x85, 0xAF, 0xE2,
+ 0x86, 0x83, 0xE2, 0x92, 0xB6, 0xE2, 0x92, 0xB7,
+ 0xE2, 0x92, 0xB8, 0xE2, 0x92, 0xB9, 0xE2, 0x92,
+ 0xBA, 0xE2, 0x92, 0xBB, 0xE2, 0x92, 0xBC, 0xE2,
+ 0x92, 0xBD, 0xE2, 0x92, 0xBE, 0xE2, 0x92, 0xBF,
+ 0xE2, 0x93, 0x80, 0xE2, 0x93, 0x81, 0xE2, 0x93,
+ 0x82, 0xE2, 0x93, 0x83, 0xE2, 0x93, 0x84, 0xE2,
+ 0x93, 0x85, 0xE2, 0x93, 0x86, 0xE2, 0x93, 0x87,
+ 0xE2, 0x93, 0x88, 0xE2, 0x93, 0x89, 0xE2, 0x93,
+ 0x8A, 0xE2, 0x93, 0x8B, 0xE2, 0x93, 0x8C, 0xE2,
+ 0x93, 0x8D, 0xE2, 0x93, 0x8E, 0xE2, 0x93, 0x8F,
+ 0xE2, 0xB0, 0x80, 0xE2, 0xB0, 0x81, 0xE2, 0xB0,
+ 0x82, 0xE2, 0xB0, 0x83, 0xE2, 0xB0, 0x84, 0xE2,
+ 0xB0, 0x85, 0xE2, 0xB0, 0x86, 0xE2, 0xB0, 0x87,
+ 0xE2, 0xB0, 0x88, 0xE2, 0xB0, 0x89, 0xE2, 0xB0,
+ 0x8A, 0xE2, 0xB0, 0x8B, 0xE2, 0xB0, 0x8C, 0xE2,
+ 0xB0, 0x8D, 0xE2, 0xB0, 0x8E, 0xE2, 0xB0, 0x8F,
+ 0xE2, 0xB0, 0x90, 0xE2, 0xB0, 0x91, 0xE2, 0xB0,
+ 0x92, 0xE2, 0xB0, 0x93, 0xE2, 0xB0, 0x94, 0xE2,
+ 0xB0, 0x95, 0xE2, 0xB0, 0x96, 0xE2, 0xB0, 0x97,
+ 0xE2, 0xB0, 0x98, 0xE2, 0xB0, 0x99, 0xE2, 0xB0,
+ 0x9A, 0xE2, 0xB0, 0x9B, 0xE2, 0xB0, 0x9C, 0xE2,
+ 0xB0, 0x9D, 0xE2, 0xB0, 0x9E, 0xE2, 0xB0, 0x9F,
+ 0xE2, 0xB0, 0xA0, 0xE2, 0xB0, 0xA1, 0xE2, 0xB0,
+ 0xA2, 0xE2, 0xB0, 0xA3, 0xE2, 0xB0, 0xA4, 0xE2,
+ 0xB0, 0xA5, 0xE2, 0xB0, 0xA6, 0xE2, 0xB0, 0xA7,
+ 0xE2, 0xB0, 0xA8, 0xE2, 0xB0, 0xA9, 0xE2, 0xB0,
+ 0xAA, 0xE2, 0xB0, 0xAB, 0xE2, 0xB0, 0xAC, 0xE2,
+ 0xB0, 0xAD, 0xE2, 0xB0, 0xAE, 0xE2, 0xB1, 0xA0,
+ 0xC8, 0xBA, 0xC8, 0xBE, 0xE2, 0xB1, 0xA7, 0xE2,
+ 0xB1, 0xA9, 0xE2, 0xB1, 0xAB, 0xE2, 0xB1, 0xB5,
+ 0xE2, 0xB2, 0x80, 0xE2, 0xB2, 0x82, 0xE2, 0xB2,
+ 0x84, 0xE2, 0xB2, 0x86, 0xE2, 0xB2, 0x88, 0xE2,
+ 0xB2, 0x8A, 0xE2, 0xB2, 0x8C, 0xE2, 0xB2, 0x8E,
+ 0xE2, 0xB2, 0x90, 0xE2, 0xB2, 0x92, 0xE2, 0xB2,
+ 0x94, 0xE2, 0xB2, 0x96, 0xE2, 0xB2, 0x98, 0xE2,
+ 0xB2, 0x9A, 0xE2, 0xB2, 0x9C, 0xE2, 0xB2, 0x9E,
+ 0xE2, 0xB2, 0xA0, 0xE2, 0xB2, 0xA2, 0xE2, 0xB2,
+ 0xA4, 0xE2, 0xB2, 0xA6, 0xE2, 0xB2, 0xA8, 0xE2,
+ 0xB2, 0xAA, 0xE2, 0xB2, 0xAC, 0xE2, 0xB2, 0xAE,
+ 0xE2, 0xB2, 0xB0, 0xE2, 0xB2, 0xB2, 0xE2, 0xB2,
+ 0xB4, 0xE2, 0xB2, 0xB6, 0xE2, 0xB2, 0xB8, 0xE2,
+ 0xB2, 0xBA, 0xE2, 0xB2, 0xBC, 0xE2, 0xB2, 0xBE,
+ 0xE2, 0xB3, 0x80, 0xE2, 0xB3, 0x82, 0xE2, 0xB3,
+ 0x84, 0xE2, 0xB3, 0x86, 0xE2, 0xB3, 0x88, 0xE2,
+ 0xB3, 0x8A, 0xE2, 0xB3, 0x8C, 0xE2, 0xB3, 0x8E,
+ 0xE2, 0xB3, 0x90, 0xE2, 0xB3, 0x92, 0xE2, 0xB3,
+ 0x94, 0xE2, 0xB3, 0x96, 0xE2, 0xB3, 0x98, 0xE2,
+ 0xB3, 0x9A, 0xE2, 0xB3, 0x9C, 0xE2, 0xB3, 0x9E,
+ 0xE2, 0xB3, 0xA0, 0xE2, 0xB3, 0xA2, 0xE1, 0x82,
+ 0xA0, 0xE1, 0x82, 0xA1, 0xE1, 0x82, 0xA2, 0xE1,
+ 0x82, 0xA3, 0xE1, 0x82, 0xA4, 0xE1, 0x82, 0xA5,
+ 0xE1, 0x82, 0xA6, 0xE1, 0x82, 0xA7, 0xE1, 0x82,
+ 0xA8, 0xE1, 0x82, 0xA9, 0xE1, 0x82, 0xAA, 0xE1,
+ 0x82, 0xAB, 0xE1, 0x82, 0xAC, 0xE1, 0x82, 0xAD,
+ 0xE1, 0x82, 0xAE, 0xE1, 0x82, 0xAF, 0xE1, 0x82,
+ 0xB0, 0xE1, 0x82, 0xB1, 0xE1, 0x82, 0xB2, 0xE1,
+ 0x82, 0xB3, 0xE1, 0x82, 0xB4, 0xE1, 0x82, 0xB5,
+ 0xE1, 0x82, 0xB6, 0xE1, 0x82, 0xB7, 0xE1, 0x82,
+ 0xB8, 0xE1, 0x82, 0xB9, 0xE1, 0x82, 0xBA, 0xE1,
+ 0x82, 0xBB, 0xE1, 0x82, 0xBC, 0xE1, 0x82, 0xBD,
+ 0xE1, 0x82, 0xBE, 0xE1, 0x82, 0xBF, 0xE1, 0x83,
+ 0x80, 0xE1, 0x83, 0x81, 0xE1, 0x83, 0x82, 0xE1,
+ 0x83, 0x83, 0xE1, 0x83, 0x84, 0xE1, 0x83, 0x85,
+ 0xEF, 0xBC, 0xA1, 0xEF, 0xBC, 0xA2, 0xEF, 0xBC,
+ 0xA3, 0xEF, 0xBC, 0xA4, 0xEF, 0xBC, 0xA5, 0xEF,
+ 0xBC, 0xA6, 0xEF, 0xBC, 0xA7, 0xEF, 0xBC, 0xA8,
+ 0xEF, 0xBC, 0xA9, 0xEF, 0xBC, 0xAA, 0xEF, 0xBC,
+ 0xAB, 0xEF, 0xBC, 0xAC, 0xEF, 0xBC, 0xAD, 0xEF,
+ 0xBC, 0xAE, 0xEF, 0xBC, 0xAF, 0xEF, 0xBC, 0xB0,
+ 0xEF, 0xBC, 0xB1, 0xEF, 0xBC, 0xB2, 0xEF, 0xBC,
+ 0xB3, 0xEF, 0xBC, 0xB4, 0xEF, 0xBC, 0xB5, 0xEF,
+ 0xBC, 0xB6, 0xEF, 0xBC, 0xB7, 0xEF, 0xBC, 0xB8,
+ 0xEF, 0xBC, 0xB9, 0xEF, 0xBC, 0xBA, 0xF0, 0x90,
+ 0x90, 0x80, 0xF0, 0x90, 0x90, 0x81, 0xF0, 0x90,
+ 0x90, 0x82, 0xF0, 0x90, 0x90, 0x83, 0xF0, 0x90,
+ 0x90, 0x84, 0xF0, 0x90, 0x90, 0x85, 0xF0, 0x90,
+ 0x90, 0x86, 0xF0, 0x90, 0x90, 0x87, 0xF0, 0x90,
+ 0x90, 0x88, 0xF0, 0x90, 0x90, 0x89, 0xF0, 0x90,
+ 0x90, 0x8A, 0xF0, 0x90, 0x90, 0x8B, 0xF0, 0x90,
+ 0x90, 0x8C, 0xF0, 0x90, 0x90, 0x8D, 0xF0, 0x90,
+ 0x90, 0x8E, 0xF0, 0x90, 0x90, 0x8F, 0xF0, 0x90,
+ 0x90, 0x90, 0xF0, 0x90, 0x90, 0x91, 0xF0, 0x90,
+ 0x90, 0x92, 0xF0, 0x90, 0x90, 0x93, 0xF0, 0x90,
+ 0x90, 0x94, 0xF0, 0x90, 0x90, 0x95, 0xF0, 0x90,
+ 0x90, 0x96, 0xF0, 0x90, 0x90, 0x97, 0xF0, 0x90,
+ 0x90, 0x98, 0xF0, 0x90, 0x90, 0x99, 0xF0, 0x90,
+ 0x90, 0x9A, 0xF0, 0x90, 0x90, 0x9B, 0xF0, 0x90,
+ 0x90, 0x9C, 0xF0, 0x90, 0x90, 0x9D, 0xF0, 0x90,
+ 0x90, 0x9E, 0xF0, 0x90, 0x90, 0x9F, 0xF0, 0x90,
+ 0x90, 0xA0, 0xF0, 0x90, 0x90, 0xA1, 0xF0, 0x90,
+ 0x90, 0xA2, 0xF0, 0x90, 0x90, 0xA3, 0xF0, 0x90,
+ 0x90, 0xA4, 0xF0, 0x90, 0x90, 0xA5, 0xF0, 0x90,
+ 0x90, 0xA6, 0xF0, 0x90, 0x90, 0xA7,
+ },
+};
+
+#undef N_
+#undef FIL_
+
+#ifdef __cplusplus
+}
+#endif
+
+#endif /* _SYS_U8_TEXTPREP_DATA_H */
diff --git a/sys/cddl/contrib/opensolaris/uts/common/sys/vnode.h b/sys/cddl/contrib/opensolaris/uts/common/sys/vnode.h
new file mode 100644
index 000000000000..465d8998d4e2
--- /dev/null
+++ b/sys/cddl/contrib/opensolaris/uts/common/sys/vnode.h
@@ -0,0 +1,427 @@
+/*
+ * CDDL HEADER START
+ *
+ * The contents of this file are subject to the terms of the
+ * Common Development and Distribution License (the "License").
+ * You may not use this file except in compliance with the License.
+ *
+ * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
+ * or http://www.opensolaris.org/os/licensing.
+ * See the License for the specific language governing permissions
+ * and limitations under the License.
+ *
+ * When distributing Covered Code, include this CDDL HEADER in each
+ * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
+ * If applicable, add the following below this CDDL HEADER, with the
+ * fields enclosed by brackets "[]" replaced with your own identifying
+ * information: Portions Copyright [yyyy] [name of copyright owner]
+ *
+ * CDDL HEADER END
+ */
+
+/*
+ * Copyright (c) 1988, 2010, Oracle and/or its affiliates. All rights reserved.
+ * Copyright 2017 RackTop Systems.
+ */
+
+/* Copyright (c) 1983, 1984, 1985, 1986, 1987, 1988, 1989 AT&T */
+/* All Rights Reserved */
+
+/*
+ * University Copyright- Copyright (c) 1982, 1986, 1988
+ * The Regents of the University of California
+ * All Rights Reserved
+ *
+ * University Acknowledgment- Portions of this document are derived from
+ * software developed by the University of California, Berkeley, and its
+ * contributors.
+ */
+
+#ifndef _SYS_VNODE_H
+#define _SYS_VNODE_H
+
+#include_next <sys/vnode.h>
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+#define IS_DEVVP(vp) \
+ ((vp)->v_type == VCHR || (vp)->v_type == VBLK || (vp)->v_type == VFIFO)
+
+#define V_XATTRDIR 0x0000 /* attribute unnamed directory */
+
+#define AV_SCANSTAMP_SZ 32 /* length of anti-virus scanstamp */
+
+/*
+ * Structure of all optional attributes.
+ */
+typedef struct xoptattr {
+ timestruc_t xoa_createtime; /* Create time of file */
+ uint8_t xoa_archive;
+ uint8_t xoa_system;
+ uint8_t xoa_readonly;
+ uint8_t xoa_hidden;
+ uint8_t xoa_nounlink;
+ uint8_t xoa_immutable;
+ uint8_t xoa_appendonly;
+ uint8_t xoa_nodump;
+ uint8_t xoa_opaque;
+ uint8_t xoa_av_quarantined;
+ uint8_t xoa_av_modified;
+ uint8_t xoa_av_scanstamp[AV_SCANSTAMP_SZ];
+ uint8_t xoa_reparse;
+ uint64_t xoa_generation;
+ uint8_t xoa_offline;
+ uint8_t xoa_sparse;
+} xoptattr_t;
+
+/*
+ * The xvattr structure is really a variable length structure that
+ * is made up of:
+ * - The classic vattr_t (xva_vattr)
+ * - a 32 bit quantity (xva_mapsize) that specifies the size of the
+ * attribute bitmaps in 32 bit words.
+ * - A pointer to the returned attribute bitmap (needed because the
+ *   previous element, the requested attribute bitmap, is variable length).
+ * - The requested attribute bitmap, which is an array of 32 bit words.
+ * Callers use the XVA_SET_REQ() macro to set the bits corresponding to
+ * the attributes that are being requested.
+ * - The returned attribute bitmap, which is an array of 32 bit words.
+ * File systems that support optional attributes use the XVA_SET_RTN()
+ * macro to set the bits corresponding to the attributes that are being
+ * returned.
+ * - The xoptattr_t structure which contains the attribute values
+ *
+ * xva_mapsize determines how many words are in the attribute bitmaps.
+ * Immediately following the attribute bitmaps is the xoptattr_t.
+ * xva_getxoptattr() is used to get the pointer to the xoptattr_t
+ * section.
+ */
+
+#define XVA_MAPSIZE 3 /* Size of attr bitmaps */
+#define XVA_MAGIC 0x78766174 /* Magic # for verification */
+
+/*
+ * The xvattr structure is an extensible structure which permits optional
+ * attributes to be requested/returned. File systems may or may not support
+ * optional attributes. They do so at their own discretion, but if they do
+ * support optional attributes, they must register the VFSFT_XVATTR feature
+ * so that the optional attributes can be set/retrieved.
+ *
+ * The fields of the xvattr structure are:
+ *
+ * xva_vattr - The first element of an xvattr is a legacy vattr structure
+ * which includes the common attributes. If AT_XVATTR is set in the va_mask
+ * then the entire structure is treated as an xvattr. If AT_XVATTR is not
+ * set, then only the xva_vattr structure can be used.
+ *
+ * xva_magic - 0x78766174 (hex for "xvat"). Magic number for verification.
+ *
+ * xva_mapsize - Size of requested and returned attribute bitmaps.
+ *
+ * xva_rtnattrmapp - Pointer to xva_rtnattrmap[]. We need this since the
+ * size of the array before it, xva_reqattrmap[], could change, which means
+ * the location of xva_rtnattrmap[] could change. This will allow unbundled
+ * file systems to find the location of xva_rtnattrmap[] when the sizes change.
+ *
+ * xva_reqattrmap[] - Array of requested attributes. Attributes are
+ * represented by a specific bit in a specific element of the attribute
+ * map array. Callers set the bits corresponding to the attributes
+ * that the caller wants to get/set.
+ *
+ * xva_rtnattrmap[] - Array of attributes that the file system was able to
+ * process. Not all file systems support all optional attributes. This map
+ * informs the caller which attributes the underlying file system was able
+ * to set/get. (Same structure as the requested attributes array in terms
+ * of each attribute corresponding to specific bits and array elements.)
+ *
+ * xva_xoptattrs - Structure containing values of optional attributes.
+ * These values are only valid if the corresponding bits in xva_reqattrmap
+ * are set and the underlying file system supports those attributes.
+ */
+typedef struct xvattr {
+ vattr_t xva_vattr; /* Embedded vattr structure */
+ uint32_t xva_magic; /* Magic Number */
+ uint32_t xva_mapsize; /* Size of attr bitmap (32-bit words) */
+ uint32_t *xva_rtnattrmapp; /* Ptr to xva_rtnattrmap[] */
+ uint32_t xva_reqattrmap[XVA_MAPSIZE]; /* Requested attrs */
+ uint32_t xva_rtnattrmap[XVA_MAPSIZE]; /* Returned attrs */
+ xoptattr_t xva_xoptattrs; /* Optional attributes */
+} xvattr_t;
+
+/*
+ * Attributes of interest to the caller of setattr or getattr.
+ */
+#define AT_TYPE 0x00001
+#define AT_MODE 0x00002
+#define AT_UID 0x00004
+#define AT_GID 0x00008
+#define AT_FSID 0x00010
+#define AT_NODEID 0x00020
+#define AT_NLINK 0x00040
+#define AT_SIZE 0x00080
+#define AT_ATIME 0x00100
+#define AT_MTIME 0x00200
+#define AT_CTIME 0x00400
+#define AT_RDEV 0x00800
+#define AT_BLKSIZE 0x01000
+#define AT_NBLOCKS 0x02000
+/* 0x04000 */ /* unused */
+#define AT_SEQ 0x08000
+/*
+ * If AT_XVATTR is set then there are additional bits to process in
+ * the xvattr_t's attribute bitmap. If this is not set then the bitmap
+ * MUST be ignored. Note that this bit must be set/cleared explicitly.
+ * That is, setting AT_ALL will NOT set AT_XVATTR.
+ */
+#define AT_XVATTR 0x10000
+
+#define AT_ALL (AT_TYPE|AT_MODE|AT_UID|AT_GID|AT_FSID|AT_NODEID|\
+ AT_NLINK|AT_SIZE|AT_ATIME|AT_MTIME|AT_CTIME|\
+ AT_RDEV|AT_BLKSIZE|AT_NBLOCKS|AT_SEQ)
+
+#define AT_STAT (AT_MODE|AT_UID|AT_GID|AT_FSID|AT_NODEID|AT_NLINK|\
+ AT_SIZE|AT_ATIME|AT_MTIME|AT_CTIME|AT_RDEV|AT_TYPE)
+
+#define AT_TIMES (AT_ATIME|AT_MTIME|AT_CTIME)
+
+#define AT_NOSET (AT_NLINK|AT_RDEV|AT_FSID|AT_NODEID|AT_TYPE|\
+ AT_BLKSIZE|AT_NBLOCKS|AT_SEQ)
+
+/*
+ * Attribute bits used in the extensible attribute's (xva's) attribute
+ * bitmaps. Note that the bitmaps are made up of a variable length number
+ * of 32-bit words. The convention is to use XAT{n}_{attrname} where "n"
+ * is the element in the bitmap (starting at zero). This convention is for
+ * the convenience of the maintainer to keep track of which element each
+ * attribute belongs to.
+ *
+ * NOTE THAT CONSUMERS MUST *NOT* USE THE XATn_* DEFINES DIRECTLY. CONSUMERS
+ * MUST USE THE XAT_* DEFINES.
+ */
+#define XAT0_INDEX 0LL /* Index into bitmap for XAT0 attrs */
+#define XAT0_CREATETIME 0x00000001 /* Create time of file */
+#define XAT0_ARCHIVE 0x00000002 /* Archive */
+#define XAT0_SYSTEM 0x00000004 /* System */
+#define XAT0_READONLY 0x00000008 /* Readonly */
+#define XAT0_HIDDEN 0x00000010 /* Hidden */
+#define XAT0_NOUNLINK 0x00000020 /* Nounlink */
+#define XAT0_IMMUTABLE 0x00000040 /* immutable */
+#define XAT0_APPENDONLY 0x00000080 /* appendonly */
+#define XAT0_NODUMP 0x00000100 /* nodump */
+#define XAT0_OPAQUE 0x00000200 /* opaque */
+#define XAT0_AV_QUARANTINED 0x00000400 /* anti-virus quarantine */
+#define XAT0_AV_MODIFIED 0x00000800 /* anti-virus modified */
+#define XAT0_AV_SCANSTAMP 0x00001000 /* anti-virus scanstamp */
+#define XAT0_REPARSE 0x00002000 /* FS reparse point */
+#define XAT0_GEN 0x00004000 /* object generation number */
+#define XAT0_OFFLINE 0x00008000 /* offline */
+#define XAT0_SPARSE 0x00010000 /* sparse */
+
+#define XAT0_ALL_ATTRS (XAT0_CREATETIME|XAT0_ARCHIVE|XAT0_SYSTEM| \
+ XAT0_READONLY|XAT0_HIDDEN|XAT0_NOUNLINK|XAT0_IMMUTABLE|XAT0_APPENDONLY| \
+ XAT0_NODUMP|XAT0_OPAQUE|XAT0_AV_QUARANTINED|XAT0_AV_MODIFIED| \
+ XAT0_AV_SCANSTAMP|XAT0_REPARSE|XAT0_GEN|XAT0_OFFLINE|XAT0_SPARSE)
+
+/* Support for XAT_* optional attributes */
+#define XVA_MASK 0xffffffff /* Used to mask off 32 bits */
+#define XVA_SHFT 32 /* Used to shift index */
+
+/*
+ * Used to pry out the index and attribute bits from the XAT_* attributes
+ * defined below. Note that we're masking things down to 32 bits then
+ * casting to uint32_t.
+ */
+#define XVA_INDEX(attr) ((uint32_t)(((attr) >> XVA_SHFT) & XVA_MASK))
+#define XVA_ATTRBIT(attr) ((uint32_t)((attr) & XVA_MASK))
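+
+/*
+ * Worked example (illustrative): XAT_REPARSE, defined below, is
+ * (XAT0_INDEX << XVA_SHFT) | XAT0_REPARSE, so XVA_INDEX(XAT_REPARSE)
+ * yields 0 and XVA_ATTRBIT(XAT_REPARSE) yields 0x00002000, i.e. bit 13
+ * of word 0 of an attribute bitmap.
+ */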
+
+/*
+ * The following defines present a "flat namespace" so that consumers don't
+ * need to keep track of which element belongs to which bitmap entry.
+ *
+ * NOTE THAT THESE MUST NEVER BE OR-ed TOGETHER
+ */
+#define XAT_CREATETIME ((XAT0_INDEX << XVA_SHFT) | XAT0_CREATETIME)
+#define XAT_ARCHIVE ((XAT0_INDEX << XVA_SHFT) | XAT0_ARCHIVE)
+#define XAT_SYSTEM ((XAT0_INDEX << XVA_SHFT) | XAT0_SYSTEM)
+#define XAT_READONLY ((XAT0_INDEX << XVA_SHFT) | XAT0_READONLY)
+#define XAT_HIDDEN ((XAT0_INDEX << XVA_SHFT) | XAT0_HIDDEN)
+#define XAT_NOUNLINK ((XAT0_INDEX << XVA_SHFT) | XAT0_NOUNLINK)
+#define XAT_IMMUTABLE ((XAT0_INDEX << XVA_SHFT) | XAT0_IMMUTABLE)
+#define XAT_APPENDONLY ((XAT0_INDEX << XVA_SHFT) | XAT0_APPENDONLY)
+#define XAT_NODUMP ((XAT0_INDEX << XVA_SHFT) | XAT0_NODUMP)
+#define XAT_OPAQUE ((XAT0_INDEX << XVA_SHFT) | XAT0_OPAQUE)
+#define XAT_AV_QUARANTINED ((XAT0_INDEX << XVA_SHFT) | XAT0_AV_QUARANTINED)
+#define XAT_AV_MODIFIED ((XAT0_INDEX << XVA_SHFT) | XAT0_AV_MODIFIED)
+#define XAT_AV_SCANSTAMP ((XAT0_INDEX << XVA_SHFT) | XAT0_AV_SCANSTAMP)
+#define XAT_REPARSE ((XAT0_INDEX << XVA_SHFT) | XAT0_REPARSE)
+#define XAT_GEN ((XAT0_INDEX << XVA_SHFT) | XAT0_GEN)
+#define XAT_OFFLINE ((XAT0_INDEX << XVA_SHFT) | XAT0_OFFLINE)
+#define XAT_SPARSE ((XAT0_INDEX << XVA_SHFT) | XAT0_SPARSE)
+
+/*
+ * The returned attribute map array (xva_rtnattrmap[]) is located past the
+ * requested attribute map array (xva_reqattrmap[]). Its location changes
+ * when the array sizes change. We use a separate pointer in a known location
+ * (xva_rtnattrmapp) to hold the location of xva_rtnattrmap[]. This is
+ * set in xva_init()
+ */
+#define XVA_RTNATTRMAP(xvap) ((xvap)->xva_rtnattrmapp)
+
+/*
+ * XVA_SET_REQ() sets an attribute bit in the proper element in the bitmap
+ * of requested attributes (xva_reqattrmap[]).
+ */
+#define XVA_SET_REQ(xvap, attr) { \
+ ASSERT((xvap)->xva_vattr.va_mask & AT_XVATTR); \
+ ASSERT((xvap)->xva_magic == XVA_MAGIC); \
+ (xvap)->xva_reqattrmap[XVA_INDEX(attr)] |= XVA_ATTRBIT(attr); \
+}
+/*
+ * XVA_CLR_REQ() clears an attribute bit in the proper element in the bitmap
+ * of requested attributes (xva_reqattrmap[]).
+ */
+#define XVA_CLR_REQ(xvap, attr) { \
+ ASSERT((xvap)->xva_vattr.va_mask & AT_XVATTR); \
+ ASSERT((xvap)->xva_magic == XVA_MAGIC); \
+ (xvap)->xva_reqattrmap[XVA_INDEX(attr)] &= ~XVA_ATTRBIT(attr); \
+}
+
+/*
+ * XVA_SET_RTN() sets an attribute bit in the proper element in the bitmap
+ * of returned attributes (xva_rtnattrmap[]).
+ */
+#define XVA_SET_RTN(xvap, attr) { \
+ ASSERT((xvap)->xva_vattr.va_mask & AT_XVATTR); \
+ ASSERT((xvap)->xva_magic == XVA_MAGIC); \
+ (XVA_RTNATTRMAP(xvap))[XVA_INDEX(attr)] |= XVA_ATTRBIT(attr); \
+}
+
+/*
+ * XVA_ISSET_REQ() checks the requested attribute bitmap (xva_reqattrmap[])
+ * to see if the corresponding attribute bit is set. If so, returns non-zero.
+ */
+#define XVA_ISSET_REQ(xvap, attr) \
+ ((((xvap)->xva_vattr.va_mask & AT_XVATTR) && \
+ ((xvap)->xva_magic == XVA_MAGIC) && \
+ ((xvap)->xva_mapsize > XVA_INDEX(attr))) ? \
+ ((xvap)->xva_reqattrmap[XVA_INDEX(attr)] & XVA_ATTRBIT(attr)) : 0)
+
+/*
+ * XVA_ISSET_RTN() checks the returned attribute bitmap (xva_rtnattrmap[])
+ * to see if the corresponding attribute bit is set. If so, returns non-zero.
+ */
+#define XVA_ISSET_RTN(xvap, attr) \
+ ((((xvap)->xva_vattr.va_mask & AT_XVATTR) && \
+ ((xvap)->xva_magic == XVA_MAGIC) && \
+ ((xvap)->xva_mapsize > XVA_INDEX(attr))) ? \
+ ((XVA_RTNATTRMAP(xvap))[XVA_INDEX(attr)] & XVA_ATTRBIT(attr)) : 0)
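+
+/*
+ * Usage sketch (illustrative only, not part of this interface): a caller
+ * requesting the reparse attribute might look like the following. The
+ * vnode `vp`, credential `cr`, and the illumos-style VOP_GETATTR()
+ * signature are assumptions made for the example.
+ *
+ *	xvattr_t xva;
+ *	xoptattr_t *xoap;
+ *	int error;
+ *
+ *	xva_init(&xva);
+ *	XVA_SET_REQ(&xva, XAT_REPARSE);
+ *	error = VOP_GETATTR(vp, &xva.xva_vattr, 0, cr, NULL);
+ *	if (error == 0 && XVA_ISSET_RTN(&xva, XAT_REPARSE)) {
+ *		xoap = xva_getxoptattr(&xva);
+ *		(xoap->xoa_reparse now holds the returned value)
+ *	}
+ */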
+
+#define MODEMASK 07777 /* mode bits plus permission bits */
+#define PERMMASK 00777 /* permission bits */
+
+/*
+ * VOP_ACCESS flags
+ */
+#define V_ACE_MASK 0x1 /* mask represents NFSv4 ACE permissions */
+
+/*
+ * Flags for vnode operations.
+ */
+enum rm { RMFILE, RMDIRECTORY }; /* rm or rmdir (remove) */
+enum create { CRCREAT, CRMKNOD, CRMKDIR }; /* reason for create */
+
+/*
+ * Structure used on VOP_GETSECATTR and VOP_SETSECATTR operations
+ */
+
+typedef struct vsecattr {
+ uint_t vsa_mask; /* See below */
+ int vsa_aclcnt; /* ACL entry count */
+ void *vsa_aclentp; /* pointer to ACL entries */
+ int vsa_dfaclcnt; /* default ACL entry count */
+ void *vsa_dfaclentp; /* pointer to default ACL entries */
+ size_t vsa_aclentsz; /* ACE size in bytes of vsa_aclentp */
+ uint_t vsa_aclflags; /* ACE ACL flags */
+} vsecattr_t;
+
+/* vsa_mask values */
+#define VSA_ACL 0x0001
+#define VSA_ACLCNT 0x0002
+#define VSA_DFACL 0x0004
+#define VSA_DFACLCNT 0x0008
+#define VSA_ACE 0x0010
+#define VSA_ACECNT 0x0020
+#define VSA_ACE_ALLTYPES 0x0040
+#define VSA_ACE_ACLFLAGS 0x0080 /* get/set ACE ACL flags */
+
+/*
+ * Structure used by various vnode operations to determine
+ * the context (pid, host, identity) of a caller.
+ *
+ * The cc_caller_id is used to identify one or more callers who invoke
+ * operations, possibly on behalf of others. For example, the NFS
+ * server could have its own cc_caller_id which can be detected by
+ * vnode/vfs operations or (FEM) monitors on those operations. New
+ * caller IDs are generated by fs_new_caller_id().
+ */
+typedef struct caller_context {
+ pid_t cc_pid; /* Process ID of the caller */
+ int cc_sysid; /* System ID, used for remote calls */
+ u_longlong_t cc_caller_id; /* Identifier for (set of) caller(s) */
+ ulong_t cc_flags;
+} caller_context_t;
+
+struct taskq;
+
+/*
+ * Flags for VOP_LOOKUP
+ *
+ * FIGNORECASE and FSEARCH, defined in file.h, are also possible.
+ *
+ */
+#define LOOKUP_DIR 0x01 /* want parent dir vp */
+#define LOOKUP_XATTR 0x02 /* look up extended attr dir */
+#define CREATE_XATTR_DIR 0x04 /* Create extended attr dir */
+#define LOOKUP_HAVE_SYSATTR_DIR 0x08 /* Already created virtual GFS dir */
+
+/*
+ * Flags for VOP_READDIR
+ */
+#define V_RDDIR_ENTFLAGS 0x01 /* request dirent flags */
+#define V_RDDIR_ACCFILTER 0x02 /* filter out inaccessible dirents */
+
+/*
+ * Public vnode manipulation functions.
+ */
+#ifdef _KERNEL
+
+void vn_rele_async(struct vnode *vp, struct taskq *taskq);
+
+/*
+ * Extensible vnode attribute (xva) routines:
+ * xva_init() initializes an xvattr_t (zero struct, init mapsize, set AT_XVATTR)
+ * xva_getxoptattr() returns a pointer to the xoptattr_t section of xvattr_t
+ */
+void xva_init(xvattr_t *);
+xoptattr_t *xva_getxoptattr(xvattr_t *); /* Get ptr to xoptattr_t */
+
+#define VN_RELE_ASYNC(vp, taskq) { \
+ vn_rele_async(vp, taskq); \
+}
+
+#endif /* _KERNEL */
+
+/*
+ * Flags to VOP_SETATTR/VOP_GETATTR.
+ */
+#define ATTR_UTIME 0x01 /* non-default utime(2) request */
+#define ATTR_EXEC 0x02 /* invocation from exec(2) */
+#define ATTR_COMM 0x04 /* yield common vp attributes */
+#define ATTR_HINT 0x08 /* information returned will be `hint' */
+#define ATTR_REAL 0x10 /* yield attributes of the real vp */
+#define ATTR_NOACLCHECK 0x20 /* Don't check ACL when checking permissions */
+#define ATTR_TRIGGER 0x40 /* Mount first if vnode is a trigger mount */
+
+#ifdef __cplusplus
+}
+#endif
+
+#endif /* _SYS_VNODE_H */
diff --git a/sys/cddl/contrib/opensolaris/uts/common/sys/zmod.h b/sys/cddl/contrib/opensolaris/uts/common/sys/zmod.h
new file mode 100644
index 000000000000..ba0267203ce3
--- /dev/null
+++ b/sys/cddl/contrib/opensolaris/uts/common/sys/zmod.h
@@ -0,0 +1,68 @@
+/*
+ * CDDL HEADER START
+ *
+ * The contents of this file are subject to the terms of the
+ * Common Development and Distribution License (the "License").
+ * You may not use this file except in compliance with the License.
+ *
+ * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
+ * or http://www.opensolaris.org/os/licensing.
+ * See the License for the specific language governing permissions
+ * and limitations under the License.
+ *
+ * When distributing Covered Code, include this CDDL HEADER in each
+ * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
+ * If applicable, add the following below this CDDL HEADER, with the
+ * fields enclosed by brackets "[]" replaced with your own identifying
+ * information: Portions Copyright [yyyy] [name of copyright owner]
+ *
+ * CDDL HEADER END
+ */
+
+/*
+ * Copyright 2007 Sun Microsystems, Inc. All rights reserved.
+ * Use is subject to license terms.
+ */
+
+#ifndef _ZMOD_H
+#define _ZMOD_H
+
+#pragma ident "%Z%%M% %I% %E% SMI"
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+/*
+ * zmod - RFC-1950-compatible decompression routines
+ *
+ * This file provides the public interfaces to zmod, an in-kernel RFC 1950
+ * decompression library. More information about the implementation of these
+ * interfaces can be found in the usr/src/uts/common/zmod/ directory.
+ */
+
+#define Z_OK 0
+#define Z_STREAM_END 1
+#define Z_NEED_DICT 2
+#define Z_ERRNO (-1)
+#define Z_STREAM_ERROR (-2)
+#define Z_DATA_ERROR (-3)
+#define Z_MEM_ERROR (-4)
+#define Z_BUF_ERROR (-5)
+#define Z_VERSION_ERROR (-6)
+
+#define Z_NO_COMPRESSION 0
+#define Z_BEST_SPEED 1
+#define Z_BEST_COMPRESSION 9
+#define Z_DEFAULT_COMPRESSION (-1)
+
+extern int z_uncompress(void *, size_t *, const void *, size_t);
+extern int z_compress(void *, size_t *, const void *, size_t);
+extern int z_compress_level(void *, size_t *, const void *, size_t, int);
+extern const char *z_strerror(int);
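+
+/*
+ * Usage sketch (illustrative only): decompressing a buffer whose
+ * decompressed size is already known to the caller, e.g. recorded in
+ * on-disk metadata. `src`, `srclen`, and `expected` are assumed to be
+ * supplied by the surrounding code.
+ *
+ *	size_t dstlen = expected;
+ *	void *dst = kmem_alloc(dstlen, KM_SLEEP);
+ *	int err = z_uncompress(dst, &dstlen, src, srclen);
+ *
+ *	if (err != Z_OK)
+ *		cmn_err(CE_WARN, "zmod: %s", z_strerror(err));
+ *	(on success, dstlen holds the actual decompressed size)
+ */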
+
+#ifdef __cplusplus
+}
+#endif
+
+#endif /* _ZMOD_H */
diff --git a/sys/cddl/contrib/opensolaris/uts/common/zmod/zmod.c b/sys/cddl/contrib/opensolaris/uts/common/zmod/zmod.c
new file mode 100644
index 000000000000..3c26dfe2169a
--- /dev/null
+++ b/sys/cddl/contrib/opensolaris/uts/common/zmod/zmod.c
@@ -0,0 +1,138 @@
+/*
+ * CDDL HEADER START
+ *
+ * The contents of this file are subject to the terms of the
+ * Common Development and Distribution License (the "License").
+ * You may not use this file except in compliance with the License.
+ *
+ * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
+ * or http://www.opensolaris.org/os/licensing.
+ * See the License for the specific language governing permissions
+ * and limitations under the License.
+ *
+ * When distributing Covered Code, include this CDDL HEADER in each
+ * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
+ * If applicable, add the following below this CDDL HEADER, with the
+ * fields enclosed by brackets "[]" replaced with your own identifying
+ * information: Portions Copyright [yyyy] [name of copyright owner]
+ *
+ * CDDL HEADER END
+ */
+
+/*
+ * Copyright 2008 Sun Microsystems, Inc. All rights reserved.
+ * Use is subject to license terms.
+ */
+
+#include <sys/types.h>
+#include <sys/cmn_err.h>
+#include <sys/systm.h>
+#include <sys/kmem.h>
+#include <sys/zmod.h>
+
+#include <contrib/zlib/zlib.h>
+#include <contrib/zlib/zutil.h>
+
+/*ARGSUSED*/
+static void *
+zfs_zcalloc(void *opaque, uint_t items, uint_t size)
+{
+ void *ptr;
+
+ ptr = malloc((size_t)items * size, M_SOLARIS, M_NOWAIT);
+ return (ptr);
+}
+
+/*ARGSUSED*/
+static void
+zfs_zcfree(void *opaque, void *ptr)
+{
+
+ free(ptr, M_SOLARIS);
+}
+
+/*
+ * Uncompress the buffer 'src' into the buffer 'dst'. The caller must store
+ * the expected decompressed data size externally so it can be passed in.
+ * The resulting decompressed size is then returned through dstlen. This
+ * function returns Z_OK on success, or another error code on failure.
+ */
+int
+z_uncompress(void *dst, size_t *dstlen, const void *src, size_t srclen)
+{
+ z_stream zs;
+ int err;
+
+ bzero(&zs, sizeof (zs));
+ zs.next_in = (uchar_t *)src;
+ zs.avail_in = srclen;
+ zs.next_out = dst;
+ zs.avail_out = *dstlen;
+ zs.zalloc = zfs_zcalloc;
+ zs.zfree = zfs_zcfree;
+
+ /*
+ * Call inflateInit2() specifying a window size of DEF_WBITS
+ * with the 6th bit set to indicate that the compression format
+ * type (zlib or gzip) should be automatically detected.
+ */
+ if ((err = inflateInit2(&zs, DEF_WBITS | 0x20)) != Z_OK)
+ return (err);
+
+ if ((err = inflate(&zs, Z_FINISH)) != Z_STREAM_END) {
+ (void) inflateEnd(&zs);
+ return (err == Z_OK ? Z_BUF_ERROR : err);
+ }
+
+ *dstlen = zs.total_out;
+ return (inflateEnd(&zs));
+}
+
+int
+z_compress_level(void *dst, size_t *dstlen, const void *src, size_t srclen,
+ int level)
+{
+
+ z_stream zs;
+ int err;
+
+ bzero(&zs, sizeof (zs));
+ zs.next_in = (uchar_t *)src;
+ zs.avail_in = srclen;
+ zs.next_out = dst;
+ zs.avail_out = *dstlen;
+ zs.zalloc = zfs_zcalloc;
+ zs.zfree = zfs_zcfree;
+
+ if ((err = deflateInit(&zs, level)) != Z_OK)
+ return (err);
+
+ if ((err = deflate(&zs, Z_FINISH)) != Z_STREAM_END) {
+ (void) deflateEnd(&zs);
+ return (err == Z_OK ? Z_BUF_ERROR : err);
+ }
+
+ *dstlen = zs.total_out;
+ return (deflateEnd(&zs));
+}
+
+int
+z_compress(void *dst, size_t *dstlen, const void *src, size_t srclen)
+{
+ return (z_compress_level(dst, dstlen, src, srclen,
+ Z_DEFAULT_COMPRESSION));
+}
+
+/*
+ * Convert a zlib error code into a string error message.
+ */
+const char *
+z_strerror(int err)
+{
+ int i = Z_NEED_DICT - err;
+
+ if (i < 0 || i > Z_NEED_DICT - Z_VERSION_ERROR)
+ return ("unknown error");
+
+ return (zError(err));
+}
diff --git a/sys/cddl/contrib/opensolaris/uts/intel/dtrace/fasttrap_isa.c b/sys/cddl/contrib/opensolaris/uts/intel/dtrace/fasttrap_isa.c
new file mode 100644
index 000000000000..502273b73157
--- /dev/null
+++ b/sys/cddl/contrib/opensolaris/uts/intel/dtrace/fasttrap_isa.c
@@ -0,0 +1,1841 @@
+/*
+ * CDDL HEADER START
+ *
+ * The contents of this file are subject to the terms of the
+ * Common Development and Distribution License (the "License").
+ * You may not use this file except in compliance with the License.
+ *
+ * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
+ * or http://www.opensolaris.org/os/licensing.
+ * See the License for the specific language governing permissions
+ * and limitations under the License.
+ *
+ * When distributing Covered Code, include this CDDL HEADER in each
+ * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
+ * If applicable, add the following below this CDDL HEADER, with the
+ * fields enclosed by brackets "[]" replaced with your own identifying
+ * information: Portions Copyright [yyyy] [name of copyright owner]
+ *
+ * CDDL HEADER END
+ *
+ * Portions Copyright 2010 The FreeBSD Foundation
+ *
+ * $FreeBSD$
+ */
+
+/*
+ * Copyright 2008 Sun Microsystems, Inc. All rights reserved.
+ * Use is subject to license terms.
+ */
+
+#include <sys/fasttrap_isa.h>
+#include <sys/fasttrap_impl.h>
+#include <sys/dtrace.h>
+#include <sys/dtrace_impl.h>
+#include <sys/cmn_err.h>
+#include <sys/types.h>
+#include <sys/dtrace_bsd.h>
+#include <sys/proc.h>
+#include <sys/rmlock.h>
+#include <cddl/dev/dtrace/dtrace_cddl.h>
+#include <cddl/dev/dtrace/x86/regset.h>
+#include <machine/segments.h>
+#include <machine/reg.h>
+#include <machine/pcb.h>
+#include <machine/trap.h>
+#include <sys/sysmacros.h>
+#include <sys/ptrace.h>
+
+#ifdef __i386__
+#define r_rax r_eax
+#define r_rbx r_ebx
+#define r_rip r_eip
+#define r_rflags r_eflags
+#define r_rsp r_esp
+#define r_rbp r_ebp
+#endif
+
+/*
+ * Lossless User-Land Tracing on x86
+ * ---------------------------------
+ *
+ * The execution of most instructions is not dependent on the address; for
+ * these instructions it is sufficient to copy them into the user process's
+ * address space and execute them. To effectively single-step an instruction
+ * in user-land, we copy out the following sequence of instructions to scratch
+ * space in the user thread's ulwp_t structure.
+ *
+ * We then set the program counter (%eip or %rip) to point to this scratch
+ * space. Once execution resumes, the original instruction is executed and
+ * then control flow is redirected to what was originally the subsequent
+ * instruction. If the kernel attempts to deliver a signal while single-
+ * stepping, the signal is deferred and the program counter is moved into the
+ * second sequence of instructions. The second sequence ends in a trap into
+ * the kernel where the deferred signal is then properly handled and delivered.
+ *
+ * For instructions whose execution is position dependent, we perform simple
+ * emulation. These instructions are limited to control transfer
+ * instructions in 32-bit mode, but in 64-bit mode there's the added wrinkle
+ * of %rip-relative addressing that means that almost any instruction can be
+ * position dependent. For all the details on how we emulate generic
+ * instructions, including %rip-relative instructions, see the code in
+ * fasttrap_pid_probe() below where we handle instructions of type
+ * FASTTRAP_T_COMMON (under the header: Generic Instruction Tracing).
+ */
+
+#define FASTTRAP_MODRM_MOD(modrm) (((modrm) >> 6) & 0x3)
+#define FASTTRAP_MODRM_REG(modrm) (((modrm) >> 3) & 0x7)
+#define FASTTRAP_MODRM_RM(modrm) ((modrm) & 0x7)
+#define FASTTRAP_MODRM(mod, reg, rm) (((mod) << 6) | ((reg) << 3) | (rm))
+
+#define FASTTRAP_SIB_SCALE(sib) (((sib) >> 6) & 0x3)
+#define FASTTRAP_SIB_INDEX(sib) (((sib) >> 3) & 0x7)
+#define FASTTRAP_SIB_BASE(sib) ((sib) & 0x7)
+
+#define FASTTRAP_REX_W(rex) (((rex) >> 3) & 1)
+#define FASTTRAP_REX_R(rex) (((rex) >> 2) & 1)
+#define FASTTRAP_REX_X(rex) (((rex) >> 1) & 1)
+#define FASTTRAP_REX_B(rex) ((rex) & 1)
+#define FASTTRAP_REX(w, r, x, b) \
+ (0x40 | ((w) << 3) | ((r) << 2) | ((x) << 1) | (b))
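+
+/*
+ * Worked example (illustrative): the instruction 48 89 e5 (movq %rsp,
+ * %rbp) carries REX prefix 0x48 and ModRM byte 0xe5. FASTTRAP_REX_W(0x48)
+ * is 1 (64-bit operand size), FASTTRAP_MODRM_MOD(0xe5) is 3 (register
+ * direct), FASTTRAP_MODRM_REG(0xe5) is 4 (%rsp, the source), and
+ * FASTTRAP_MODRM_RM(0xe5) is 5 (%rbp, the destination).
+ */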
+
+/*
+ * Single-byte op-codes.
+ */
+#define FASTTRAP_PUSHL_EBP 0x55
+
+#define FASTTRAP_JO 0x70
+#define FASTTRAP_JNO 0x71
+#define FASTTRAP_JB 0x72
+#define FASTTRAP_JAE 0x73
+#define FASTTRAP_JE 0x74
+#define FASTTRAP_JNE 0x75
+#define FASTTRAP_JBE 0x76
+#define FASTTRAP_JA 0x77
+#define FASTTRAP_JS 0x78
+#define FASTTRAP_JNS 0x79
+#define FASTTRAP_JP 0x7a
+#define FASTTRAP_JNP 0x7b
+#define FASTTRAP_JL 0x7c
+#define FASTTRAP_JGE 0x7d
+#define FASTTRAP_JLE 0x7e
+#define FASTTRAP_JG 0x7f
+
+#define FASTTRAP_NOP 0x90
+
+#define FASTTRAP_MOV_EAX 0xb8
+#define FASTTRAP_MOV_ECX 0xb9
+
+#define FASTTRAP_RET16 0xc2
+#define FASTTRAP_RET 0xc3
+
+#define FASTTRAP_LOOPNZ 0xe0
+#define FASTTRAP_LOOPZ 0xe1
+#define FASTTRAP_LOOP 0xe2
+#define FASTTRAP_JCXZ 0xe3
+
+#define FASTTRAP_CALL 0xe8
+#define FASTTRAP_JMP32 0xe9
+#define FASTTRAP_JMP8 0xeb
+
+#define FASTTRAP_INT3 0xcc
+#define FASTTRAP_INT 0xcd
+
+#define FASTTRAP_2_BYTE_OP 0x0f
+#define FASTTRAP_GROUP5_OP 0xff
+
+/*
+ * Two-byte op-codes (second byte only).
+ */
+#define FASTTRAP_0F_JO 0x80
+#define FASTTRAP_0F_JNO 0x81
+#define FASTTRAP_0F_JB 0x82
+#define FASTTRAP_0F_JAE 0x83
+#define FASTTRAP_0F_JE 0x84
+#define FASTTRAP_0F_JNE 0x85
+#define FASTTRAP_0F_JBE 0x86
+#define FASTTRAP_0F_JA 0x87
+#define FASTTRAP_0F_JS 0x88
+#define FASTTRAP_0F_JNS 0x89
+#define FASTTRAP_0F_JP 0x8a
+#define FASTTRAP_0F_JNP 0x8b
+#define FASTTRAP_0F_JL 0x8c
+#define FASTTRAP_0F_JGE 0x8d
+#define FASTTRAP_0F_JLE 0x8e
+#define FASTTRAP_0F_JG 0x8f
+
+#define FASTTRAP_EFLAGS_OF 0x800
+#define FASTTRAP_EFLAGS_DF 0x400
+#define FASTTRAP_EFLAGS_SF 0x080
+#define FASTTRAP_EFLAGS_ZF 0x040
+#define FASTTRAP_EFLAGS_AF 0x010
+#define FASTTRAP_EFLAGS_PF 0x004
+#define FASTTRAP_EFLAGS_CF 0x001
+
+/*
+ * Instruction prefixes.
+ */
+#define FASTTRAP_PREFIX_OPERAND 0x66
+#define FASTTRAP_PREFIX_ADDRESS 0x67
+#define FASTTRAP_PREFIX_CS 0x2E
+#define FASTTRAP_PREFIX_DS 0x3E
+#define FASTTRAP_PREFIX_ES 0x26
+#define FASTTRAP_PREFIX_FS 0x64
+#define FASTTRAP_PREFIX_GS 0x65
+#define FASTTRAP_PREFIX_SS 0x36
+#define FASTTRAP_PREFIX_LOCK 0xF0
+#define FASTTRAP_PREFIX_REP 0xF3
+#define FASTTRAP_PREFIX_REPNE 0xF2
+
+#define FASTTRAP_NOREG 0xff
+
+/*
+ * Map between instruction register encodings and the kernel constants which
+ * correspond to indices into struct regs.
+ */
+#ifdef __amd64
+static const uint8_t regmap[16] = {
+ REG_RAX, REG_RCX, REG_RDX, REG_RBX, REG_RSP, REG_RBP, REG_RSI, REG_RDI,
+ REG_R8, REG_R9, REG_R10, REG_R11, REG_R12, REG_R13, REG_R14, REG_R15,
+};
+#else
+static const uint8_t regmap[8] = {
+ EAX, ECX, EDX, EBX, UESP, EBP, ESI, EDI
+};
+#endif
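+
+/*
+ * The regmap tables are indexed by the 3-bit register encoding from the
+ * ModRM or SIB byte, widened to 4 bits on amd64 by the appropriate REX
+ * bit (for example, rm | (FASTTRAP_REX_B(rex) << 3)), so encoding 0
+ * selects %rax/%eax and encoding 13 selects %r13.
+ */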
+
+static ulong_t fasttrap_getreg(struct reg *, uint_t);
+
+static uint64_t
+fasttrap_anarg(struct reg *rp, int function_entry, int argno)
+{
+ uint64_t value = 0;
+ int shift = function_entry ? 1 : 0;
+
+#ifdef __amd64
+ if (curproc->p_model == DATAMODEL_LP64) {
+ uintptr_t *stack;
+
+ /*
+ * In 64-bit mode, the first six arguments are stored in
+ * registers.
+ */
+ if (argno < 6)
+ switch (argno) {
+ case 0:
+ return (rp->r_rdi);
+ case 1:
+ return (rp->r_rsi);
+ case 2:
+ return (rp->r_rdx);
+ case 3:
+ return (rp->r_rcx);
+ case 4:
+ return (rp->r_r8);
+ case 5:
+ return (rp->r_r9);
+ }
+
+ stack = (uintptr_t *)rp->r_rsp;
+ DTRACE_CPUFLAG_SET(CPU_DTRACE_NOFAULT);
+ value = dtrace_fulword(&stack[argno - 6 + shift]);
+ DTRACE_CPUFLAG_CLEAR(CPU_DTRACE_NOFAULT | CPU_DTRACE_BADADDR);
+ } else {
+#endif
+ uint32_t *stack = (uint32_t *)rp->r_rsp;
+ DTRACE_CPUFLAG_SET(CPU_DTRACE_NOFAULT);
+ value = dtrace_fuword32(&stack[argno + shift]);
+ DTRACE_CPUFLAG_CLEAR(CPU_DTRACE_NOFAULT | CPU_DTRACE_BADADDR);
+#ifdef __amd64
+ }
+#endif
+
+ return (value);
+}
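+
+/*
+ * For example, with function_entry set, the seventh argument (argno == 6)
+ * of an LP64 process is read from stack slot 1: at function entry %rsp
+ * points at the return address pushed by the call, so that argument lives
+ * at 8(%rsp).
+ */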
+
+/*ARGSUSED*/
+int
+fasttrap_tracepoint_init(proc_t *p, fasttrap_tracepoint_t *tp, uintptr_t pc,
+ fasttrap_probe_type_t type)
+{
+ uint8_t instr[FASTTRAP_MAX_INSTR_SIZE + 10];
+ size_t len = FASTTRAP_MAX_INSTR_SIZE;
+ size_t first = MIN(len, PAGESIZE - (pc & PAGEOFFSET));
+ uint_t start = 0;
+ int rmindex, size;
+ uint8_t seg, rex = 0;
+
+ /*
+ * Read the instruction at the given address out of the process's
+ * address space. We don't have to worry about a debugger
+ * changing this instruction before we overwrite it with our trap
+ * instruction since P_PR_LOCK is set. Since instructions can span
+ * pages, we potentially read the instruction in two parts. If the
+ * second part fails, we just zero out that part of the instruction.
+ */
+ if (uread(p, &instr[0], first, pc) != 0)
+ return (-1);
+ if (len > first &&
+ uread(p, &instr[first], len - first, pc + first) != 0) {
+ bzero(&instr[first], len - first);
+ len = first;
+ }
+
+ /*
+ * If the disassembly fails, then we have a malformed instruction.
+ */
+ if ((size = dtrace_instr_size_isa(instr, p->p_model, &rmindex)) <= 0)
+ return (-1);
+
+ /*
+ * Make sure the disassembler isn't completely broken.
+ */
+ ASSERT(-1 <= rmindex && rmindex < size);
+
+ /*
+ * If the computed size is greater than the number of bytes read,
+ * then it was a malformed instruction possibly because it fell on a
+ * page boundary and the subsequent page was missing or because of
+ * some malicious user.
+ */
+ if (size > len)
+ return (-1);
+
+ tp->ftt_size = (uint8_t)size;
+ tp->ftt_segment = FASTTRAP_SEG_NONE;
+
+ /*
+ * Find the start of the instruction's opcode by processing any
+ * legacy prefixes.
+ */
+ for (;;) {
+ seg = 0;
+ switch (instr[start]) {
+ case FASTTRAP_PREFIX_SS:
+ seg++;
+ /*FALLTHRU*/
+ case FASTTRAP_PREFIX_GS:
+ seg++;
+ /*FALLTHRU*/
+ case FASTTRAP_PREFIX_FS:
+ seg++;
+ /*FALLTHRU*/
+ case FASTTRAP_PREFIX_ES:
+ seg++;
+ /*FALLTHRU*/
+ case FASTTRAP_PREFIX_DS:
+ seg++;
+ /*FALLTHRU*/
+ case FASTTRAP_PREFIX_CS:
+ seg++;
+ /*FALLTHRU*/
+ case FASTTRAP_PREFIX_OPERAND:
+ case FASTTRAP_PREFIX_ADDRESS:
+ case FASTTRAP_PREFIX_LOCK:
+ case FASTTRAP_PREFIX_REP:
+ case FASTTRAP_PREFIX_REPNE:
+ if (seg != 0) {
+ /*
+ * It's illegal for an instruction to specify
+ * two segment prefixes -- give up on this
+ * illegal instruction.
+ */
+ if (tp->ftt_segment != FASTTRAP_SEG_NONE)
+ return (-1);
+
+ tp->ftt_segment = seg;
+ }
+ start++;
+ continue;
+ }
+ break;
+ }
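+
+	/*
+	 * The fall-throughs above count how far a segment prefix sits from
+	 * FASTTRAP_PREFIX_SS in the case list, which yields exactly the
+	 * matching FASTTRAP_SEG_* constant (CS = 1 through SS = 6) for
+	 * tp->ftt_segment.
+	 */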
+
+#ifdef __amd64
+ /*
+ * Identify the REX prefix on 64-bit processes.
+ */
+ if (p->p_model == DATAMODEL_LP64 && (instr[start] & 0xf0) == 0x40)
+ rex = instr[start++];
+#endif
+
+ /*
+ * Now that we're pretty sure that the instruction is okay, copy the
+ * valid part to the tracepoint.
+ */
+ bcopy(instr, tp->ftt_instr, FASTTRAP_MAX_INSTR_SIZE);
+
+ tp->ftt_type = FASTTRAP_T_COMMON;
+ if (instr[start] == FASTTRAP_2_BYTE_OP) {
+ switch (instr[start + 1]) {
+ case FASTTRAP_0F_JO:
+ case FASTTRAP_0F_JNO:
+ case FASTTRAP_0F_JB:
+ case FASTTRAP_0F_JAE:
+ case FASTTRAP_0F_JE:
+ case FASTTRAP_0F_JNE:
+ case FASTTRAP_0F_JBE:
+ case FASTTRAP_0F_JA:
+ case FASTTRAP_0F_JS:
+ case FASTTRAP_0F_JNS:
+ case FASTTRAP_0F_JP:
+ case FASTTRAP_0F_JNP:
+ case FASTTRAP_0F_JL:
+ case FASTTRAP_0F_JGE:
+ case FASTTRAP_0F_JLE:
+ case FASTTRAP_0F_JG:
+ tp->ftt_type = FASTTRAP_T_JCC;
+ tp->ftt_code = (instr[start + 1] & 0x0f) | FASTTRAP_JO;
+ tp->ftt_dest = pc + tp->ftt_size +
+ /* LINTED - alignment */
+ *(int32_t *)&instr[start + 2];
+ break;
+ }
+ } else if (instr[start] == FASTTRAP_GROUP5_OP) {
+ uint_t mod = FASTTRAP_MODRM_MOD(instr[start + 1]);
+ uint_t reg = FASTTRAP_MODRM_REG(instr[start + 1]);
+ uint_t rm = FASTTRAP_MODRM_RM(instr[start + 1]);
+
+ if (reg == 2 || reg == 4) {
+ uint_t i, sz;
+
+ if (reg == 2)
+ tp->ftt_type = FASTTRAP_T_CALL;
+ else
+ tp->ftt_type = FASTTRAP_T_JMP;
+
+ if (mod == 3)
+ tp->ftt_code = 2;
+ else
+ tp->ftt_code = 1;
+
+ ASSERT(p->p_model == DATAMODEL_LP64 || rex == 0);
+
+ /*
+ * See AMD x86-64 Architecture Programmer's Manual
+ * Volume 3, Section 1.2.7, Table 1-12, and
+ * Appendix A.3.1, Table A-15.
+ */
+ if (mod != 3 && rm == 4) {
+ uint8_t sib = instr[start + 2];
+ uint_t index = FASTTRAP_SIB_INDEX(sib);
+ uint_t base = FASTTRAP_SIB_BASE(sib);
+
+ tp->ftt_scale = FASTTRAP_SIB_SCALE(sib);
+
+ tp->ftt_index = (index == 4) ?
+ FASTTRAP_NOREG :
+ regmap[index | (FASTTRAP_REX_X(rex) << 3)];
+ tp->ftt_base = (mod == 0 && base == 5) ?
+ FASTTRAP_NOREG :
+ regmap[base | (FASTTRAP_REX_B(rex) << 3)];
+
+ i = 3;
+				sz = mod == 1 ? 1 :
+				    (mod == 2 || base == 5) ? 4 : 0;
+ } else {
+ /*
+ * In 64-bit mode, mod == 0 and r/m == 5
+ * denotes %rip-relative addressing; in 32-bit
+ * mode, the base register isn't used. In both
+ * modes, there is a 32-bit operand.
+ */
+ if (mod == 0 && rm == 5) {
+#ifdef __amd64
+ if (p->p_model == DATAMODEL_LP64)
+ tp->ftt_base = REG_RIP;
+ else
+#endif
+ tp->ftt_base = FASTTRAP_NOREG;
+ sz = 4;
+ } else {
+ uint8_t base = rm |
+ (FASTTRAP_REX_B(rex) << 3);
+
+ tp->ftt_base = regmap[base];
+ sz = mod == 1 ? 1 : mod == 2 ? 4 : 0;
+ }
+ tp->ftt_index = FASTTRAP_NOREG;
+ i = 2;
+ }
+
+ if (sz == 1) {
+ tp->ftt_dest = *(int8_t *)&instr[start + i];
+ } else if (sz == 4) {
+ /* LINTED - alignment */
+ tp->ftt_dest = *(int32_t *)&instr[start + i];
+ } else {
+ tp->ftt_dest = 0;
+ }
+ }
+ } else {
+ switch (instr[start]) {
+ case FASTTRAP_RET:
+ tp->ftt_type = FASTTRAP_T_RET;
+ break;
+
+ case FASTTRAP_RET16:
+ tp->ftt_type = FASTTRAP_T_RET16;
+ /* LINTED - alignment */
+ tp->ftt_dest = *(uint16_t *)&instr[start + 1];
+ break;
+
+ case FASTTRAP_JO:
+ case FASTTRAP_JNO:
+ case FASTTRAP_JB:
+ case FASTTRAP_JAE:
+ case FASTTRAP_JE:
+ case FASTTRAP_JNE:
+ case FASTTRAP_JBE:
+ case FASTTRAP_JA:
+ case FASTTRAP_JS:
+ case FASTTRAP_JNS:
+ case FASTTRAP_JP:
+ case FASTTRAP_JNP:
+ case FASTTRAP_JL:
+ case FASTTRAP_JGE:
+ case FASTTRAP_JLE:
+ case FASTTRAP_JG:
+ tp->ftt_type = FASTTRAP_T_JCC;
+ tp->ftt_code = instr[start];
+ tp->ftt_dest = pc + tp->ftt_size +
+ (int8_t)instr[start + 1];
+ break;
+
+ case FASTTRAP_LOOPNZ:
+ case FASTTRAP_LOOPZ:
+ case FASTTRAP_LOOP:
+ tp->ftt_type = FASTTRAP_T_LOOP;
+ tp->ftt_code = instr[start];
+ tp->ftt_dest = pc + tp->ftt_size +
+ (int8_t)instr[start + 1];
+ break;
+
+ case FASTTRAP_JCXZ:
+ tp->ftt_type = FASTTRAP_T_JCXZ;
+ tp->ftt_dest = pc + tp->ftt_size +
+ (int8_t)instr[start + 1];
+ break;
+
+ case FASTTRAP_CALL:
+ tp->ftt_type = FASTTRAP_T_CALL;
+ tp->ftt_dest = pc + tp->ftt_size +
+ /* LINTED - alignment */
+ *(int32_t *)&instr[start + 1];
+ tp->ftt_code = 0;
+ break;
+
+ case FASTTRAP_JMP32:
+ tp->ftt_type = FASTTRAP_T_JMP;
+ tp->ftt_dest = pc + tp->ftt_size +
+ /* LINTED - alignment */
+ *(int32_t *)&instr[start + 1];
+ break;
+ case FASTTRAP_JMP8:
+ tp->ftt_type = FASTTRAP_T_JMP;
+ tp->ftt_dest = pc + tp->ftt_size +
+ (int8_t)instr[start + 1];
+ break;
+
+ case FASTTRAP_PUSHL_EBP:
+ if (start == 0)
+ tp->ftt_type = FASTTRAP_T_PUSHL_EBP;
+ break;
+
+ case FASTTRAP_NOP:
+#ifdef __amd64
+ ASSERT(p->p_model == DATAMODEL_LP64 || rex == 0);
+
+ /*
+ * On amd64 we have to be careful not to confuse a nop
+ * (actually xchgl %eax, %eax) with an instruction using
+ * the same opcode, but that does something different
+			 * (e.g. xchgl %r8d, %eax or xchgq %r8, %rax).
+ */
+ if (FASTTRAP_REX_B(rex) == 0)
+#endif
+ tp->ftt_type = FASTTRAP_T_NOP;
+ break;
+
+ case FASTTRAP_INT3:
+ /*
+ * The pid provider shares the int3 trap with debugger
+ * breakpoints so we can't instrument them.
+ */
+ ASSERT(instr[start] == FASTTRAP_INSTR);
+ return (-1);
+
+ case FASTTRAP_INT:
+ /*
+ * Interrupts seem like they could be traced with
+ * no negative implications, but it's possible that
+ * a thread could be redirected by the trap handling
+ * code which would eventually return to the
+ * instruction after the interrupt. If the interrupt
+ * were in our scratch space, the subsequent
+ * instruction might be overwritten before we return.
+ * Accordingly we refuse to instrument any interrupt.
+ */
+ return (-1);
+ }
+ }
+
+#ifdef __amd64
+ if (p->p_model == DATAMODEL_LP64 && tp->ftt_type == FASTTRAP_T_COMMON) {
+ /*
+ * If the process is 64-bit and the instruction type is still
+		 * FASTTRAP_T_COMMON -- meaning we're going to copy it out and
+ * execute it -- we need to watch for %rip-relative
+ * addressing mode. See the portion of fasttrap_pid_probe()
+ * below where we handle tracepoints with type
+ * FASTTRAP_T_COMMON for how we emulate instructions that
+ * employ %rip-relative addressing.
+ */
+ if (rmindex != -1) {
+ uint_t mod = FASTTRAP_MODRM_MOD(instr[rmindex]);
+ uint_t reg = FASTTRAP_MODRM_REG(instr[rmindex]);
+ uint_t rm = FASTTRAP_MODRM_RM(instr[rmindex]);
+
+ ASSERT(rmindex > start);
+
+ if (mod == 0 && rm == 5) {
+ /*
+ * We need to be sure to avoid other
+ * registers used by this instruction. While
+ * the reg field may determine the op code
+ * rather than denoting a register, assuming
+ * that it denotes a register is always safe.
+ * We leave the REX field intact and use
+ * whatever value's there for simplicity.
+ */
+ if (reg != 0) {
+ tp->ftt_ripmode = FASTTRAP_RIP_1 |
+ (FASTTRAP_RIP_X *
+ FASTTRAP_REX_B(rex));
+ rm = 0;
+ } else {
+ tp->ftt_ripmode = FASTTRAP_RIP_2 |
+ (FASTTRAP_RIP_X *
+ FASTTRAP_REX_B(rex));
+ rm = 1;
+ }
+
+ tp->ftt_modrm = tp->ftt_instr[rmindex];
+ tp->ftt_instr[rmindex] =
+ FASTTRAP_MODRM(2, reg, rm);
+ }
+ }
+ }
+#endif
+
+ return (0);
+}
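+
+/*
+ * For example, handing fasttrap_tracepoint_init() a 5-byte "call rel32"
+ * such as e8 00 10 00 00 yields ftt_type == FASTTRAP_T_CALL,
+ * ftt_code == 0 and ftt_dest == pc + 5 + 0x1000, so the call can later be
+ * emulated without executing the displaced instruction.
+ */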
+
+int
+fasttrap_tracepoint_install(proc_t *p, fasttrap_tracepoint_t *tp)
+{
+ fasttrap_instr_t instr = FASTTRAP_INSTR;
+
+ if (uwrite(p, &instr, 1, tp->ftt_pc) != 0)
+ return (-1);
+
+ return (0);
+}
+
+int
+fasttrap_tracepoint_remove(proc_t *p, fasttrap_tracepoint_t *tp)
+{
+ uint8_t instr;
+
+ /*
+ * Distinguish between read or write failures and a changed
+ * instruction.
+ */
+ if (uread(p, &instr, 1, tp->ftt_pc) != 0)
+ return (0);
+ if (instr != FASTTRAP_INSTR)
+ return (0);
+ if (uwrite(p, &tp->ftt_instr[0], 1, tp->ftt_pc) != 0)
+ return (-1);
+
+ return (0);
+}
+
+#ifdef __amd64
+static uintptr_t
+fasttrap_fulword_noerr(const void *uaddr)
+{
+ uintptr_t ret;
+
+ if ((ret = fasttrap_fulword(uaddr)) != -1)
+ return (ret);
+
+ return (0);
+}
+#endif
+
+static uint32_t
+fasttrap_fuword32_noerr(const void *uaddr)
+{
+ uint32_t ret;
+
+ if ((ret = fasttrap_fuword32(uaddr)) != -1)
+ return (ret);
+
+ return (0);
+}
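+
+/*
+ * Note that the _noerr readers cannot distinguish a faulting access from
+ * a location that legitimately contains the value -1; both cases read
+ * back as 0 here, which is acceptable for probe arguments but would not
+ * be for instruction emulation.
+ */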
+
+static void
+fasttrap_return_common(struct reg *rp, uintptr_t pc, pid_t pid,
+ uintptr_t new_pc)
+{
+ fasttrap_tracepoint_t *tp;
+ fasttrap_bucket_t *bucket;
+ fasttrap_id_t *id;
+ struct rm_priotracker tracker;
+
+ rm_rlock(&fasttrap_tp_lock, &tracker);
+ bucket = &fasttrap_tpoints.fth_table[FASTTRAP_TPOINTS_INDEX(pid, pc)];
+
+ for (tp = bucket->ftb_data; tp != NULL; tp = tp->ftt_next) {
+ if (pid == tp->ftt_pid && pc == tp->ftt_pc &&
+ tp->ftt_proc->ftpc_acount != 0)
+ break;
+ }
+
+ /*
+ * Don't sweat it if we can't find the tracepoint again; unlike
+ * when we're in fasttrap_pid_probe(), finding the tracepoint here
+ * is not essential to the correct execution of the process.
+ */
+ if (tp == NULL) {
+ rm_runlock(&fasttrap_tp_lock, &tracker);
+ return;
+ }
+
+ for (id = tp->ftt_retids; id != NULL; id = id->fti_next) {
+ /*
+ * If there's a branch that could act as a return site, we
+ * need to trace it, and check here if the program counter is
+ * external to the function.
+ */
+ if (tp->ftt_type != FASTTRAP_T_RET &&
+ tp->ftt_type != FASTTRAP_T_RET16 &&
+ new_pc - id->fti_probe->ftp_faddr <
+ id->fti_probe->ftp_fsize)
+ continue;
+
+ dtrace_probe(id->fti_probe->ftp_id,
+ pc - id->fti_probe->ftp_faddr,
+ rp->r_rax, rp->r_rbx, 0, 0);
+ }
+
+ rm_runlock(&fasttrap_tp_lock, &tracker);
+}
+
+static void
+fasttrap_sigsegv(proc_t *p, kthread_t *t, uintptr_t addr)
+{
+ ksiginfo_t ksi;
+
+ ksiginfo_init(&ksi);
+ ksi.ksi_signo = SIGSEGV;
+ ksi.ksi_code = SEGV_MAPERR;
+ ksi.ksi_addr = (caddr_t)addr;
+ PROC_LOCK(p);
+ (void)tdksignal(t, SIGSEGV, &ksi);
+ PROC_UNLOCK(p);
+}
+
+#ifdef __amd64
+static void
+fasttrap_usdt_args64(fasttrap_probe_t *probe, struct reg *rp, int argc,
+ uintptr_t *argv)
+{
+ int i, x, cap = MIN(argc, probe->ftp_nargs);
+ uintptr_t *stack = (uintptr_t *)rp->r_rsp;
+
+ for (i = 0; i < cap; i++) {
+ x = probe->ftp_argmap[i];
+
+ if (x < 6)
+ argv[i] = (&rp->r_rdi)[x];
+ else
+ argv[i] = fasttrap_fulword_noerr(&stack[x]);
+ }
+
+ for (; i < argc; i++) {
+ argv[i] = 0;
+ }
+}
+#endif
+
+static void
+fasttrap_usdt_args32(fasttrap_probe_t *probe, struct reg *rp, int argc,
+ uint32_t *argv)
+{
+ int i, x, cap = MIN(argc, probe->ftp_nargs);
+ uint32_t *stack = (uint32_t *)rp->r_rsp;
+
+ for (i = 0; i < cap; i++) {
+ x = probe->ftp_argmap[i];
+
+ argv[i] = fasttrap_fuword32_noerr(&stack[x]);
+ }
+
+ for (; i < argc; i++) {
+ argv[i] = 0;
+ }
+}
+
+static int
+fasttrap_do_seg(fasttrap_tracepoint_t *tp, struct reg *rp, uintptr_t *addr)
+{
+ proc_t *p = curproc;
+#ifdef __i386__
+ struct segment_descriptor *desc;
+#else
+ struct user_segment_descriptor *desc;
+#endif
+ uint16_t sel = 0, ndx, type;
+ uintptr_t limit;
+
+ switch (tp->ftt_segment) {
+ case FASTTRAP_SEG_CS:
+ sel = rp->r_cs;
+ break;
+ case FASTTRAP_SEG_DS:
+ sel = rp->r_ds;
+ break;
+ case FASTTRAP_SEG_ES:
+ sel = rp->r_es;
+ break;
+ case FASTTRAP_SEG_FS:
+ sel = rp->r_fs;
+ break;
+ case FASTTRAP_SEG_GS:
+ sel = rp->r_gs;
+ break;
+ case FASTTRAP_SEG_SS:
+ sel = rp->r_ss;
+ break;
+ }
+
+ /*
+	 * Make sure the given segment register specifies a user privilege
+	 * selector rather than a kernel selector.
+ */
+ if (ISPL(sel) != SEL_UPL)
+ return (-1);
+
+ ndx = IDXSEL(sel);
+
+ /*
+ * Check the bounds and grab the descriptor out of the specified
+ * descriptor table.
+ */
+ if (ISLDT(sel)) {
+#ifdef __i386__
+ if (ndx > p->p_md.md_ldt->ldt_len)
+ return (-1);
+
+ desc = (struct segment_descriptor *)
+ p->p_md.md_ldt[ndx].ldt_base;
+#else
+ if (ndx > max_ldt_segment)
+ return (-1);
+
+ desc = (struct user_segment_descriptor *)
+ p->p_md.md_ldt[ndx].ldt_base;
+#endif
+
+ } else {
+ if (ndx >= NGDT)
+ return (-1);
+
+#ifdef __i386__
+ desc = &gdt[ndx].sd;
+#else
+ desc = PCPU_PTR(gdt)[ndx];
+#endif
+ }
+
+ /*
+ * The descriptor must have user privilege level and it must be
+ * present in memory.
+ */
+ if (desc->sd_dpl != SEL_UPL || desc->sd_p != 1)
+ return (-1);
+
+ type = desc->sd_type;
+
+ /*
+ * If the S bit in the type field is not set, this descriptor can
+ * only be used in system context.
+ */
+ if ((type & 0x10) != 0x10)
+ return (-1);
+
+ limit = USD_GETLIMIT(desc) * (desc->sd_gran ? PAGESIZE : 1);
+
+ if (tp->ftt_segment == FASTTRAP_SEG_CS) {
+ /*
+ * The code/data bit and readable bit must both be set.
+ */
+ if ((type & 0xa) != 0xa)
+ return (-1);
+
+ if (*addr > limit)
+ return (-1);
+ } else {
+ /*
+ * The code/data bit must be clear.
+ */
+ if ((type & 0x8) != 0)
+ return (-1);
+
+ /*
+ * If the expand-down bit is clear, we just check the limit as
+ * it would naturally be applied. Otherwise, we need to check
+		 * that the address is in the range [limit + 1 .. 0xffff] or
+		 * [limit + 1 .. 0xffffffff] depending on whether the default
+		 * operand size bit is set.
+ */
+ if ((type & 0x4) == 0) {
+ if (*addr > limit)
+ return (-1);
+ } else if (desc->sd_def32) {
+ if (*addr < limit + 1 || 0xffff < *addr)
+ return (-1);
+ } else {
+ if (*addr < limit + 1 || 0xffffffff < *addr)
+ return (-1);
+ }
+ }
+
+ *addr += USD_GETBASE(desc);
+
+ return (0);
+}
+
+int
+fasttrap_pid_probe(struct trapframe *tf)
+{
+ struct reg reg, *rp;
+ proc_t *p = curproc, *pp;
+ struct rm_priotracker tracker;
+ uint64_t gen;
+ uintptr_t pc;
+ uintptr_t new_pc = 0;
+ fasttrap_bucket_t *bucket;
+ fasttrap_tracepoint_t *tp, tp_local;
+ pid_t pid;
+ dtrace_icookie_t cookie;
+ uint_t is_enabled = 0;
+
+ fill_frame_regs(tf, &reg);
+ rp = &reg;
+
+ pc = rp->r_rip - 1;
+
+ /*
+ * It's possible that a user (in a veritable orgy of bad planning)
+ * could redirect this thread's flow of control before it reached the
+ * return probe fasttrap. In this case we need to kill the process
+	 * since it's in an unrecoverable state.
+ */
+ if (curthread->t_dtrace_step) {
+ ASSERT(curthread->t_dtrace_on);
+ fasttrap_sigtrap(p, curthread, pc);
+ return (0);
+ }
+
+ /*
+ * Clear all user tracing flags.
+ */
+ curthread->t_dtrace_ft = 0;
+ curthread->t_dtrace_pc = 0;
+ curthread->t_dtrace_npc = 0;
+ curthread->t_dtrace_scrpc = 0;
+ curthread->t_dtrace_astpc = 0;
+#ifdef __amd64
+ curthread->t_dtrace_regv = 0;
+#endif
+
+ /*
+ * Treat a child created by a call to vfork(2) as if it were its
+ * parent. We know that there's only one thread of control in such a
+ * process: this one.
+ */
+ pp = p;
+ sx_slock(&proctree_lock);
+ while (pp->p_vmspace == pp->p_pptr->p_vmspace)
+ pp = pp->p_pptr;
+ pid = pp->p_pid;
+ if (pp != p) {
+ PROC_LOCK(pp);
+ if ((pp->p_flag & P_WEXIT) != 0) {
+ /*
+ * This can happen if the child was created with
+ * rfork(2). Userspace tracing cannot work reliably in
+ * such a scenario, but we can at least try.
+ */
+ PROC_UNLOCK(pp);
+ sx_sunlock(&proctree_lock);
+ return (-1);
+ }
+ _PHOLD_LITE(pp);
+ PROC_UNLOCK(pp);
+ }
+ sx_sunlock(&proctree_lock);
+
+ rm_rlock(&fasttrap_tp_lock, &tracker);
+
+ bucket = &fasttrap_tpoints.fth_table[FASTTRAP_TPOINTS_INDEX(pid, pc)];
+
+ /*
+ * Lookup the tracepoint that the process just hit.
+ */
+ for (tp = bucket->ftb_data; tp != NULL; tp = tp->ftt_next) {
+ if (pid == tp->ftt_pid && pc == tp->ftt_pc &&
+ tp->ftt_proc->ftpc_acount != 0)
+ break;
+ }
+
+ /*
+ * If we couldn't find a matching tracepoint, either a tracepoint has
+ * been inserted without using the pid<pid> ioctl interface (see
+ * fasttrap_ioctl), or somehow we have mislaid this tracepoint.
+ */
+ if (tp == NULL) {
+ rm_runlock(&fasttrap_tp_lock, &tracker);
+ gen = atomic_load_acq_64(&pp->p_fasttrap_tp_gen);
+ if (pp != p)
+ PRELE(pp);
+ if (curthread->t_fasttrap_tp_gen != gen) {
+ /*
+ * At least one tracepoint associated with this PID has
+ * been removed from the table since #BP was raised.
+ * Speculate that we hit a tracepoint that has since
+ * been removed, and retry the instruction.
+ */
+ curthread->t_fasttrap_tp_gen = gen;
+#ifdef __amd64
+ tf->tf_rip = pc;
+#else
+ tf->tf_eip = pc;
+#endif
+ return (0);
+ }
+ return (-1);
+ }
+ if (pp != p)
+ PRELE(pp);
+
+ /*
+ * Set the program counter to the address of the traced instruction
+ * so that it looks right in ustack() output.
+ */
+ rp->r_rip = pc;
+
+ if (tp->ftt_ids != NULL) {
+ fasttrap_id_t *id;
+
+#ifdef __amd64
+ if (p->p_model == DATAMODEL_LP64) {
+ for (id = tp->ftt_ids; id != NULL; id = id->fti_next) {
+ fasttrap_probe_t *probe = id->fti_probe;
+
+ if (id->fti_ptype == DTFTP_ENTRY) {
+ /*
+ * We note that this was an entry
+ * probe to help ustack() find the
+ * first caller.
+ */
+ cookie = dtrace_interrupt_disable();
+ DTRACE_CPUFLAG_SET(CPU_DTRACE_ENTRY);
+ dtrace_probe(probe->ftp_id, rp->r_rdi,
+ rp->r_rsi, rp->r_rdx, rp->r_rcx,
+ rp->r_r8);
+ DTRACE_CPUFLAG_CLEAR(CPU_DTRACE_ENTRY);
+ dtrace_interrupt_enable(cookie);
+ } else if (id->fti_ptype == DTFTP_IS_ENABLED) {
+ /*
+ * Note that in this case, we don't
+ * call dtrace_probe() since it's only
+ * an artificial probe meant to change
+ * the flow of control so that it
+ * encounters the true probe.
+ */
+ is_enabled = 1;
+ } else if (probe->ftp_argmap == NULL) {
+ dtrace_probe(probe->ftp_id, rp->r_rdi,
+ rp->r_rsi, rp->r_rdx, rp->r_rcx,
+ rp->r_r8);
+ } else {
+ uintptr_t t[5];
+
+ fasttrap_usdt_args64(probe, rp,
+ sizeof (t) / sizeof (t[0]), t);
+
+ dtrace_probe(probe->ftp_id, t[0], t[1],
+ t[2], t[3], t[4]);
+ }
+ }
+ } else {
+#endif
+ uintptr_t s0, s1, s2, s3, s4, s5;
+ uint32_t *stack = (uint32_t *)rp->r_rsp;
+
+ /*
+ * In 32-bit mode, all arguments are passed on the
+ * stack. If this is a function entry probe, we need
+ * to skip the first entry on the stack as it
+ * represents the return address rather than a
+ * parameter to the function.
+ */
+ s0 = fasttrap_fuword32_noerr(&stack[0]);
+ s1 = fasttrap_fuword32_noerr(&stack[1]);
+ s2 = fasttrap_fuword32_noerr(&stack[2]);
+ s3 = fasttrap_fuword32_noerr(&stack[3]);
+ s4 = fasttrap_fuword32_noerr(&stack[4]);
+ s5 = fasttrap_fuword32_noerr(&stack[5]);
+
+ for (id = tp->ftt_ids; id != NULL; id = id->fti_next) {
+ fasttrap_probe_t *probe = id->fti_probe;
+
+ if (id->fti_ptype == DTFTP_ENTRY) {
+ /*
+ * We note that this was an entry
+ * probe to help ustack() find the
+ * first caller.
+ */
+ cookie = dtrace_interrupt_disable();
+ DTRACE_CPUFLAG_SET(CPU_DTRACE_ENTRY);
+ dtrace_probe(probe->ftp_id, s1, s2,
+ s3, s4, s5);
+ DTRACE_CPUFLAG_CLEAR(CPU_DTRACE_ENTRY);
+ dtrace_interrupt_enable(cookie);
+ } else if (id->fti_ptype == DTFTP_IS_ENABLED) {
+ /*
+ * Note that in this case, we don't
+ * call dtrace_probe() since it's only
+ * an artificial probe meant to change
+ * the flow of control so that it
+ * encounters the true probe.
+ */
+ is_enabled = 1;
+ } else if (probe->ftp_argmap == NULL) {
+ dtrace_probe(probe->ftp_id, s0, s1,
+ s2, s3, s4);
+ } else {
+ uint32_t t[5];
+
+ fasttrap_usdt_args32(probe, rp,
+ sizeof (t) / sizeof (t[0]), t);
+
+ dtrace_probe(probe->ftp_id, t[0], t[1],
+ t[2], t[3], t[4]);
+ }
+ }
+#ifdef __amd64
+ }
+#endif
+ }
+
+ /*
+ * We're about to do a bunch of work so we cache a local copy of
+ * the tracepoint to emulate the instruction, and then find the
+ * tracepoint again later if we need to light up any return probes.
+ */
+ tp_local = *tp;
+ rm_runlock(&fasttrap_tp_lock, &tracker);
+ tp = &tp_local;
+
+ /*
+ * Set the program counter to appear as though the traced instruction
+ * had completely executed. This ensures that fasttrap_getreg() will
+ * report the expected value for REG_RIP.
+ */
+ rp->r_rip = pc + tp->ftt_size;
+
+ /*
+ * If there's an is-enabled probe connected to this tracepoint it
+ * means that there was a 'xorl %eax, %eax' or 'xorq %rax, %rax'
+ * instruction that was placed there by DTrace when the binary was
+ * linked. As this probe is, in fact, enabled, we need to stuff 1
+ * into %eax or %rax. Accordingly, we can bypass all the instruction
+ * emulation logic since we know the inevitable result. It's possible
+ * that a user could construct a scenario where the 'is-enabled'
+ * probe was on some other instruction, but that would be a rather
+ * exotic way to shoot oneself in the foot.
+ */
+ if (is_enabled) {
+ rp->r_rax = 1;
+ new_pc = rp->r_rip;
+ goto done;
+ }
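+
+	/*
+	 * For illustration, the is-enabled check emitted for a USDT site is
+	 * conceptually
+	 *
+	 *	xorl	%eax, %eax	<- the traced instruction
+	 *	testl	%eax, %eax
+	 *	je	skip_probe
+	 *
+	 * so forcing %rax to 1 above makes the test fall through to the
+	 * real probe site.
+	 */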
+
+ /*
+ * We emulate certain types of instructions to ensure correctness
+ * (in the case of position dependent instructions) or optimize
+ * common cases. The rest we have the thread execute back in user-
+ * land.
+ */
+ switch (tp->ftt_type) {
+ case FASTTRAP_T_RET:
+ case FASTTRAP_T_RET16:
+ {
+ uintptr_t dst = 0;
+ uintptr_t addr = 0;
+ int ret = 0;
+
+ /*
+ * We have to emulate _every_ facet of the behavior of a ret
+ * instruction including what happens if the load from %esp
+ * fails; in that case, we send a SIGSEGV.
+ */
+#ifdef __amd64
+ if (p->p_model == DATAMODEL_NATIVE) {
+ ret = dst = fasttrap_fulword((void *)rp->r_rsp);
+ addr = rp->r_rsp + sizeof (uintptr_t);
+ } else {
+#endif
+ uint32_t dst32;
+ ret = dst32 = fasttrap_fuword32((void *)rp->r_rsp);
+ dst = dst32;
+ addr = rp->r_rsp + sizeof (uint32_t);
+#ifdef __amd64
+ }
+#endif
+
+ if (ret == -1) {
+ fasttrap_sigsegv(p, curthread, rp->r_rsp);
+ new_pc = pc;
+ break;
+ }
+
+ if (tp->ftt_type == FASTTRAP_T_RET16)
+ addr += tp->ftt_dest;
+
+ rp->r_rsp = addr;
+ new_pc = dst;
+ break;
+ }
+
+ case FASTTRAP_T_JCC:
+ {
+ uint_t taken = 0;
+
+ switch (tp->ftt_code) {
+ case FASTTRAP_JO:
+ taken = (rp->r_rflags & FASTTRAP_EFLAGS_OF) != 0;
+ break;
+ case FASTTRAP_JNO:
+ taken = (rp->r_rflags & FASTTRAP_EFLAGS_OF) == 0;
+ break;
+ case FASTTRAP_JB:
+ taken = (rp->r_rflags & FASTTRAP_EFLAGS_CF) != 0;
+ break;
+ case FASTTRAP_JAE:
+ taken = (rp->r_rflags & FASTTRAP_EFLAGS_CF) == 0;
+ break;
+ case FASTTRAP_JE:
+ taken = (rp->r_rflags & FASTTRAP_EFLAGS_ZF) != 0;
+ break;
+ case FASTTRAP_JNE:
+ taken = (rp->r_rflags & FASTTRAP_EFLAGS_ZF) == 0;
+ break;
+ case FASTTRAP_JBE:
+ taken = (rp->r_rflags & FASTTRAP_EFLAGS_CF) != 0 ||
+ (rp->r_rflags & FASTTRAP_EFLAGS_ZF) != 0;
+ break;
+ case FASTTRAP_JA:
+ taken = (rp->r_rflags & FASTTRAP_EFLAGS_CF) == 0 &&
+ (rp->r_rflags & FASTTRAP_EFLAGS_ZF) == 0;
+ break;
+ case FASTTRAP_JS:
+ taken = (rp->r_rflags & FASTTRAP_EFLAGS_SF) != 0;
+ break;
+ case FASTTRAP_JNS:
+ taken = (rp->r_rflags & FASTTRAP_EFLAGS_SF) == 0;
+ break;
+ case FASTTRAP_JP:
+ taken = (rp->r_rflags & FASTTRAP_EFLAGS_PF) != 0;
+ break;
+ case FASTTRAP_JNP:
+ taken = (rp->r_rflags & FASTTRAP_EFLAGS_PF) == 0;
+ break;
+ case FASTTRAP_JL:
+ taken = ((rp->r_rflags & FASTTRAP_EFLAGS_SF) == 0) !=
+ ((rp->r_rflags & FASTTRAP_EFLAGS_OF) == 0);
+ break;
+ case FASTTRAP_JGE:
+ taken = ((rp->r_rflags & FASTTRAP_EFLAGS_SF) == 0) ==
+ ((rp->r_rflags & FASTTRAP_EFLAGS_OF) == 0);
+ break;
+ case FASTTRAP_JLE:
+ taken = (rp->r_rflags & FASTTRAP_EFLAGS_ZF) != 0 ||
+ ((rp->r_rflags & FASTTRAP_EFLAGS_SF) == 0) !=
+ ((rp->r_rflags & FASTTRAP_EFLAGS_OF) == 0);
+ break;
+ case FASTTRAP_JG:
+ taken = (rp->r_rflags & FASTTRAP_EFLAGS_ZF) == 0 &&
+ ((rp->r_rflags & FASTTRAP_EFLAGS_SF) == 0) ==
+ ((rp->r_rflags & FASTTRAP_EFLAGS_OF) == 0);
+ break;
+
+ }
+
+ if (taken)
+ new_pc = tp->ftt_dest;
+ else
+ new_pc = pc + tp->ftt_size;
+ break;
+ }
+
+ case FASTTRAP_T_LOOP:
+ {
+ uint_t taken = 0;
+#ifdef __amd64
+		greg_t cx = --rp->r_rcx;
+#else
+		greg_t cx = --rp->r_ecx;
+#endif
+
+ switch (tp->ftt_code) {
+ case FASTTRAP_LOOPNZ:
+ taken = (rp->r_rflags & FASTTRAP_EFLAGS_ZF) == 0 &&
+ cx != 0;
+ break;
+ case FASTTRAP_LOOPZ:
+ taken = (rp->r_rflags & FASTTRAP_EFLAGS_ZF) != 0 &&
+ cx != 0;
+ break;
+ case FASTTRAP_LOOP:
+ taken = (cx != 0);
+ break;
+ }
+
+ if (taken)
+ new_pc = tp->ftt_dest;
+ else
+ new_pc = pc + tp->ftt_size;
+ break;
+ }
+
+ case FASTTRAP_T_JCXZ:
+ {
+#ifdef __amd64
+ greg_t cx = rp->r_rcx;
+#else
+ greg_t cx = rp->r_ecx;
+#endif
+
+ if (cx == 0)
+ new_pc = tp->ftt_dest;
+ else
+ new_pc = pc + tp->ftt_size;
+ break;
+ }
+
+ case FASTTRAP_T_PUSHL_EBP:
+ {
+ int ret = 0;
+
+#ifdef __amd64
+ if (p->p_model == DATAMODEL_NATIVE) {
+ rp->r_rsp -= sizeof (uintptr_t);
+ ret = fasttrap_sulword((void *)rp->r_rsp, rp->r_rbp);
+ } else {
+#endif
+ rp->r_rsp -= sizeof (uint32_t);
+ ret = fasttrap_suword32((void *)rp->r_rsp, rp->r_rbp);
+#ifdef __amd64
+ }
+#endif
+
+ if (ret == -1) {
+ fasttrap_sigsegv(p, curthread, rp->r_rsp);
+ new_pc = pc;
+ break;
+ }
+
+ new_pc = pc + tp->ftt_size;
+ break;
+ }
+
+ case FASTTRAP_T_NOP:
+ new_pc = pc + tp->ftt_size;
+ break;
+
+ case FASTTRAP_T_JMP:
+ case FASTTRAP_T_CALL:
+ if (tp->ftt_code == 0) {
+ new_pc = tp->ftt_dest;
+ } else {
+ uintptr_t value, addr = tp->ftt_dest;
+
+ if (tp->ftt_base != FASTTRAP_NOREG)
+ addr += fasttrap_getreg(rp, tp->ftt_base);
+ if (tp->ftt_index != FASTTRAP_NOREG)
+ addr += fasttrap_getreg(rp, tp->ftt_index) <<
+ tp->ftt_scale;
+
+ if (tp->ftt_code == 1) {
+ /*
+ * If there's a segment prefix for this
+ * instruction, we'll need to check permissions
+ * and bounds on the given selector, and adjust
+ * the address accordingly.
+ */
+ if (tp->ftt_segment != FASTTRAP_SEG_NONE &&
+ fasttrap_do_seg(tp, rp, &addr) != 0) {
+ fasttrap_sigsegv(p, curthread, addr);
+ new_pc = pc;
+ break;
+ }
+
+#ifdef __amd64
+ if (p->p_model == DATAMODEL_NATIVE) {
+#endif
+ if ((value = fasttrap_fulword((void *)addr))
+ == -1) {
+ fasttrap_sigsegv(p, curthread,
+ addr);
+ new_pc = pc;
+ break;
+ }
+ new_pc = value;
+#ifdef __amd64
+ } else {
+ uint32_t value32;
+ addr = (uintptr_t)(uint32_t)addr;
+ if ((value32 = fasttrap_fuword32((void *)addr))
+ == -1) {
+ fasttrap_sigsegv(p, curthread,
+ addr);
+ new_pc = pc;
+ break;
+ }
+ new_pc = value32;
+ }
+#endif
+ } else {
+ new_pc = addr;
+ }
+ }
+
+ /*
+ * If this is a call instruction, we need to push the return
+ * address onto the stack. If this fails, we send the process
+ * a SIGSEGV and reset the pc to emulate what would happen if
+ * this instruction weren't traced.
+ */
+ if (tp->ftt_type == FASTTRAP_T_CALL) {
+ int ret = 0;
+ uintptr_t addr = 0, pcps;
+#ifdef __amd64
+ if (p->p_model == DATAMODEL_NATIVE) {
+ addr = rp->r_rsp - sizeof (uintptr_t);
+ pcps = pc + tp->ftt_size;
+ ret = fasttrap_sulword((void *)addr, pcps);
+ } else {
+#endif
+ addr = rp->r_rsp - sizeof (uint32_t);
+ pcps = (uint32_t)(pc + tp->ftt_size);
+ ret = fasttrap_suword32((void *)addr, pcps);
+#ifdef __amd64
+ }
+#endif
+
+ if (ret == -1) {
+ fasttrap_sigsegv(p, curthread, addr);
+ new_pc = pc;
+ break;
+ }
+
+ rp->r_rsp = addr;
+ }
+
+ break;
+
+ case FASTTRAP_T_COMMON:
+ {
+ uintptr_t addr;
+#if defined(__amd64)
+ uint8_t scratch[2 * FASTTRAP_MAX_INSTR_SIZE + 22];
+#else
+ uint8_t scratch[2 * FASTTRAP_MAX_INSTR_SIZE + 7];
+#endif
+ uint_t i = 0;
+ fasttrap_scrspace_t *scrspace;
+ scrspace = fasttrap_scraddr(curthread, tp->ftt_proc);
+ if (scrspace == NULL) {
+ /*
+ * We failed to allocate scratch space for this thread.
+ * Try to write the original instruction back out and
+ * reset the pc.
+ */
+ if (fasttrap_copyout(tp->ftt_instr, (void *)pc,
+ tp->ftt_size))
+ fasttrap_sigtrap(p, curthread, pc);
+ new_pc = pc;
+ break;
+ }
+ addr = scrspace->ftss_addr;
+
+ /*
+ * Generic Instruction Tracing
+ * ---------------------------
+ *
+ * This is the layout of the scratch space in the user-land
+ * thread structure for our generated instructions.
+ *
+ * 32-bit mode bytes
+ * ------------------------ -----
+ * a: <original instruction> <= 15
+ * jmp <pc + tp->ftt_size> 5
+ * b: <original instruction> <= 15
+ * int T_DTRACE_RET 2
+ * -----
+ * <= 37
+ *
+ * 64-bit mode bytes
+ * ------------------------ -----
+ * a: <original instruction> <= 15
+ * jmp 0(%rip) 6
+ * <pc + tp->ftt_size> 8
+ * b: <original instruction> <= 15
+ * int T_DTRACE_RET 2
+ * -----
+ * <= 46
+ *
+ * The %pc is set to a, and curthread->t_dtrace_astpc is set
+ * to b. If we encounter a signal on the way out of the
+ * kernel, trap() will set %pc to curthread->t_dtrace_astpc
+ * so that we execute the original instruction and re-enter
+ * the kernel rather than redirecting to the next instruction.
+ *
+ * If there are return probes (so we know that we're going to
+ * need to reenter the kernel after executing the original
+ * instruction), the scratch space will just contain the
+ * original instruction followed by an interrupt -- the same
+ * data as at b.
+ *
+ * %rip-relative Addressing
+ * ------------------------
+ *
+ * There's a further complication in 64-bit mode due to %rip-
+ * relative addressing. While this is clearly a beneficial
+ * architectural decision for position independent code, it's
+ * hard not to see it as a personal attack against the pid
+ * provider since before there was a relatively small set of
+ * instructions to emulate; with %rip-relative addressing,
+ * almost every instruction can potentially depend on the
+ * address at which it's executed. Rather than emulating
+ * the broad spectrum of instructions that can now be
+ * position dependent, we emulate jumps and others as in
+ * 32-bit mode, and take a different tack for instructions
+ * using %rip-relative addressing.
+ *
+ * For every instruction that uses the ModRM byte, the
+ * in-kernel disassembler reports its location. We use the
+ * ModRM byte to identify that an instruction uses
+ * %rip-relative addressing and to see what other registers
+ * the instruction uses. To emulate those instructions,
+ * we modify the instruction to be %rax-relative rather than
+ * %rip-relative (or %rcx-relative if the instruction uses
+ * %rax; or %r8- or %r9-relative if the REX.B is present so
+ * we don't have to rewrite the REX prefix). We then load
+ * the value that %rip would have been into the scratch
+ * register and generate an instruction to reset the scratch
+ * register back to its original value. The instruction
+ * sequence looks like this:
+ *
+	 * 64-bit mode %rip-relative	bytes
+ * ------------------------ -----
+ * a: <modified instruction> <= 15
+ * movq $<value>, %<scratch> 6
+ * jmp 0(%rip) 6
+ * <pc + tp->ftt_size> 8
+ * b: <modified instruction> <= 15
+ * int T_DTRACE_RET 2
+ * -----
+	 *					<= 52
+ *
+ * We set curthread->t_dtrace_regv so that upon receiving
+ * a signal we can reset the value of the scratch register.
+ */
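+
+	/*
+	 * For example, "movq 0x200(%rip), %rbx" (48 8b 1d 00 02 00 00) has
+	 * reg != 0, so fasttrap_tracepoint_init() rewrote its ModRM byte to
+	 * FASTTRAP_MODRM(2, 3, 0), turning it into
+	 * "movq 0x200(%rax), %rbx"; below, %rax is loaded with pc + 7 and
+	 * the generated movq restores its original value afterward.
+	 */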
+
+ ASSERT(tp->ftt_size <= FASTTRAP_MAX_INSTR_SIZE);
+
+ curthread->t_dtrace_scrpc = addr;
+ bcopy(tp->ftt_instr, &scratch[i], tp->ftt_size);
+ i += tp->ftt_size;
+
+#ifdef __amd64
+ if (tp->ftt_ripmode != 0) {
+ greg_t *reg = NULL;
+
+ ASSERT(p->p_model == DATAMODEL_LP64);
+ ASSERT(tp->ftt_ripmode &
+ (FASTTRAP_RIP_1 | FASTTRAP_RIP_2));
+
+ /*
+ * If this was a %rip-relative instruction, we change
+ * it to be either a %rax- or %rcx-relative
+ * instruction (depending on whether those registers
+ * are used as another operand; or %r8- or %r9-
+ * relative depending on the value of REX.B). We then
+ * set that register and generate a movq instruction
+ * to reset the value.
+ */
+ if (tp->ftt_ripmode & FASTTRAP_RIP_X)
+ scratch[i++] = FASTTRAP_REX(1, 0, 0, 1);
+ else
+ scratch[i++] = FASTTRAP_REX(1, 0, 0, 0);
+
+ if (tp->ftt_ripmode & FASTTRAP_RIP_1)
+ scratch[i++] = FASTTRAP_MOV_EAX;
+ else
+ scratch[i++] = FASTTRAP_MOV_ECX;
+
+ switch (tp->ftt_ripmode) {
+ case FASTTRAP_RIP_1:
+ reg = &rp->r_rax;
+ curthread->t_dtrace_reg = REG_RAX;
+ break;
+ case FASTTRAP_RIP_2:
+ reg = &rp->r_rcx;
+ curthread->t_dtrace_reg = REG_RCX;
+ break;
+ case FASTTRAP_RIP_1 | FASTTRAP_RIP_X:
+ reg = &rp->r_r8;
+ curthread->t_dtrace_reg = REG_R8;
+ break;
+ case FASTTRAP_RIP_2 | FASTTRAP_RIP_X:
+ reg = &rp->r_r9;
+ curthread->t_dtrace_reg = REG_R9;
+ break;
+ }
+
+ /* LINTED - alignment */
+ *(uint64_t *)&scratch[i] = *reg;
+ curthread->t_dtrace_regv = *reg;
+ *reg = pc + tp->ftt_size;
+ i += sizeof (uint64_t);
+ }
+#endif
+
+ /*
+ * Generate the branch instruction to what would have
+ * normally been the subsequent instruction. In 32-bit mode,
+ * this is just a relative branch; in 64-bit mode this is a
+ * %rip-relative branch that loads the 64-bit pc value
+ * immediately after the jmp instruction.
+ */
+#ifdef __amd64
+ if (p->p_model == DATAMODEL_LP64) {
+ scratch[i++] = FASTTRAP_GROUP5_OP;
+ scratch[i++] = FASTTRAP_MODRM(0, 4, 5);
+ /* LINTED - alignment */
+ *(uint32_t *)&scratch[i] = 0;
+ i += sizeof (uint32_t);
+ /* LINTED - alignment */
+ *(uint64_t *)&scratch[i] = pc + tp->ftt_size;
+ i += sizeof (uint64_t);
+ } else {
+#endif
+ /*
+ * Set up the jmp to the next instruction; note that
+ * the size of the traced instruction cancels out.
+ */
+ scratch[i++] = FASTTRAP_JMP32;
+ /* LINTED - alignment */
+ *(uint32_t *)&scratch[i] = pc - addr - 5;
+ i += sizeof (uint32_t);
+#ifdef __amd64
+ }
+#endif
+
+ curthread->t_dtrace_astpc = addr + i;
+ bcopy(tp->ftt_instr, &scratch[i], tp->ftt_size);
+ i += tp->ftt_size;
+ scratch[i++] = FASTTRAP_INT;
+ scratch[i++] = T_DTRACE_RET;
+
+ ASSERT(i <= sizeof (scratch));
+
+ if (fasttrap_copyout(scratch, (char *)addr, i)) {
+ fasttrap_sigtrap(p, curthread, pc);
+ new_pc = pc;
+ break;
+ }
+ if (tp->ftt_retids != NULL) {
+ curthread->t_dtrace_step = 1;
+ curthread->t_dtrace_ret = 1;
+ new_pc = curthread->t_dtrace_astpc;
+ } else {
+ new_pc = curthread->t_dtrace_scrpc;
+ }
+
+ curthread->t_dtrace_pc = pc;
+ curthread->t_dtrace_npc = pc + tp->ftt_size;
+ curthread->t_dtrace_on = 1;
+ break;
+ }
+
+ default:
+ panic("fasttrap: mishandled an instruction");
+ }
+
+done:
+ /*
+ * If there were no return probes when we first found the tracepoint,
+ * we should feel no obligation to honor any return probes that were
+ * subsequently enabled -- they'll just have to wait until the next
+ * time around.
+ */
+ if (tp->ftt_retids != NULL) {
+ /*
+ * We need to wait until the results of the instruction are
+ * apparent before invoking any return probes. If this
+ * instruction was emulated we can just call
+ * fasttrap_return_common(); if it needs to be executed, we
+ * need to wait until the user thread returns to the kernel.
+ */
+ if (tp->ftt_type != FASTTRAP_T_COMMON) {
+ /*
+ * Set the program counter to the address of the traced
+ * instruction so that it looks right in ustack()
+ * output. We had previously set it to the end of the
+ * instruction to simplify %rip-relative addressing.
+ */
+ rp->r_rip = pc;
+
+ fasttrap_return_common(rp, pc, pid, new_pc);
+ } else {
+ ASSERT(curthread->t_dtrace_ret != 0);
+ ASSERT(curthread->t_dtrace_pc == pc);
+ ASSERT(curthread->t_dtrace_scrpc != 0);
+ ASSERT(new_pc == curthread->t_dtrace_astpc);
+ }
+ }
+
+ rp->r_rip = new_pc;
+
+ PROC_LOCK(p);
+ proc_write_regs(curthread, rp);
+ PROC_UNLOCK(p);
+
+ return (0);
+}
+
+int
+fasttrap_return_probe(struct trapframe *tf)
+{
+ struct reg reg, *rp;
+ proc_t *p = curproc;
+ uintptr_t pc = curthread->t_dtrace_pc;
+ uintptr_t npc = curthread->t_dtrace_npc;
+
+ fill_frame_regs(tf, &reg);
+ rp = &reg;
+
+ curthread->t_dtrace_pc = 0;
+ curthread->t_dtrace_npc = 0;
+ curthread->t_dtrace_scrpc = 0;
+ curthread->t_dtrace_astpc = 0;
+
+#ifdef illumos
+ /*
+ * Treat a child created by a call to vfork(2) as if it were its
+ * parent. We know that there's only one thread of control in such a
+ * process: this one.
+ */
+ while (p->p_flag & SVFORK) {
+ p = p->p_parent;
+ }
+#endif
+
+ /*
+ * We set rp->r_rip to the address of the traced instruction so
+ * that it appears to dtrace_probe() that we're on the original
+ * instruction.
+ */
+ rp->r_rip = pc;
+
+ fasttrap_return_common(rp, pc, p->p_pid, npc);
+
+ return (0);
+}
+
+/*ARGSUSED*/
+uint64_t
+fasttrap_pid_getarg(void *arg, dtrace_id_t id, void *parg, int argno,
+ int aframes)
+{
+ struct reg r;
+
+ fill_regs(curthread, &r);
+
+ return (fasttrap_anarg(&r, 1, argno));
+}
+
+/*ARGSUSED*/
+uint64_t
+fasttrap_usdt_getarg(void *arg, dtrace_id_t id, void *parg, int argno,
+ int aframes)
+{
+ struct reg r;
+
+ fill_regs(curthread, &r);
+
+ return (fasttrap_anarg(&r, 0, argno));
+}
+
+static ulong_t
+fasttrap_getreg(struct reg *rp, uint_t reg)
+{
+#ifdef __amd64
+ switch (reg) {
+ case REG_R15: return (rp->r_r15);
+ case REG_R14: return (rp->r_r14);
+ case REG_R13: return (rp->r_r13);
+ case REG_R12: return (rp->r_r12);
+ case REG_R11: return (rp->r_r11);
+ case REG_R10: return (rp->r_r10);
+ case REG_R9: return (rp->r_r9);
+ case REG_R8: return (rp->r_r8);
+ case REG_RDI: return (rp->r_rdi);
+ case REG_RSI: return (rp->r_rsi);
+ case REG_RBP: return (rp->r_rbp);
+ case REG_RBX: return (rp->r_rbx);
+ case REG_RDX: return (rp->r_rdx);
+ case REG_RCX: return (rp->r_rcx);
+ case REG_RAX: return (rp->r_rax);
+ case REG_TRAPNO: return (rp->r_trapno);
+ case REG_ERR: return (rp->r_err);
+ case REG_RIP: return (rp->r_rip);
+ case REG_CS: return (rp->r_cs);
+ case REG_RFL: return (rp->r_rflags);
+ case REG_RSP: return (rp->r_rsp);
+ case REG_SS: return (rp->r_ss);
+ case REG_FS: return (rp->r_fs);
+ case REG_GS: return (rp->r_gs);
+ case REG_DS: return (rp->r_ds);
+ case REG_ES: return (rp->r_es);
+ case REG_FSBASE: return (rdmsr(MSR_FSBASE));
+ case REG_GSBASE: return (rdmsr(MSR_GSBASE));
+ }
+
+ panic("dtrace: illegal register constant");
+ /*NOTREACHED*/
+#else
+#define _NGREG 19
+ if (reg >= _NGREG)
+ panic("dtrace: illegal register constant");
+
+ return (((greg_t *)&rp->r_gs)[reg]);
+#endif
+}
diff --git a/sys/cddl/contrib/opensolaris/uts/intel/sys/fasttrap_isa.h b/sys/cddl/contrib/opensolaris/uts/intel/sys/fasttrap_isa.h
new file mode 100644
index 000000000000..9fee8cdb6be0
--- /dev/null
+++ b/sys/cddl/contrib/opensolaris/uts/intel/sys/fasttrap_isa.h
@@ -0,0 +1,114 @@
+/*
+ * CDDL HEADER START
+ *
+ * The contents of this file are subject to the terms of the
+ * Common Development and Distribution License (the "License").
+ * You may not use this file except in compliance with the License.
+ *
+ * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
+ * or http://www.opensolaris.org/os/licensing.
+ * See the License for the specific language governing permissions
+ * and limitations under the License.
+ *
+ * When distributing Covered Code, include this CDDL HEADER in each
+ * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
+ * If applicable, add the following below this CDDL HEADER, with the
+ * fields enclosed by brackets "[]" replaced with your own identifying
+ * information: Portions Copyright [yyyy] [name of copyright owner]
+ *
+ * CDDL HEADER END
+ */
+/*
+ * Copyright 2006 Sun Microsystems, Inc. All rights reserved.
+ * Use is subject to license terms.
+ */
+
+#ifndef _FASTTRAP_ISA_H
+#define _FASTTRAP_ISA_H
+
+#pragma ident "%Z%%M% %I% %E% SMI"
+
+#include <sys/types.h>
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+#define FASTTRAP_MAX_INSTR_SIZE 15
+
+#define FASTTRAP_INSTR 0xcc
+
+#define FASTTRAP_SUNWDTRACE_SIZE 64
+
+typedef uint8_t fasttrap_instr_t;
+
+typedef struct fasttrap_machtp {
+ uint8_t ftmt_instr[FASTTRAP_MAX_INSTR_SIZE]; /* orig. instr. */
+ uint8_t ftmt_size; /* instruction size */
+#ifdef __amd64
+ uint8_t ftmt_ripmode; /* %rip-relative handling mode */
+ uint8_t ftmt_modrm; /* saved modrm byte */
+#endif
+ uint8_t ftmt_type; /* emulation type */
+ uint8_t ftmt_code; /* branch condition */
+ uint8_t ftmt_base; /* branch base */
+ uint8_t ftmt_index; /* branch index */
+ uint8_t ftmt_scale; /* branch scale */
+ uint8_t ftmt_segment; /* segment for memory accesses */
+ uintptr_t ftmt_dest; /* destination of control flow */
+} fasttrap_machtp_t;
+
+#define ftt_instr ftt_mtp.ftmt_instr
+#ifdef __amd64
+#define ftt_ripmode ftt_mtp.ftmt_ripmode
+#define ftt_modrm ftt_mtp.ftmt_modrm
+#endif
+#define ftt_size ftt_mtp.ftmt_size
+#define ftt_type ftt_mtp.ftmt_type
+#define ftt_code ftt_mtp.ftmt_code
+#define ftt_base ftt_mtp.ftmt_base
+#define ftt_index ftt_mtp.ftmt_index
+#define ftt_scale ftt_mtp.ftmt_scale
+#define ftt_segment ftt_mtp.ftmt_segment
+#define ftt_dest ftt_mtp.ftmt_dest
+
+#define FASTTRAP_T_COMMON 0x00 /* common case -- no emulation */
+#define FASTTRAP_T_JCC 0x01 /* near and far conditional jumps */
+#define FASTTRAP_T_LOOP 0x02 /* loop instructions */
+#define FASTTRAP_T_JCXZ 0x03 /* jump if %ecx/%rcx is zero */
+#define FASTTRAP_T_JMP 0x04 /* relative jump */
+#define FASTTRAP_T_CALL 0x05 /* near call (and link) */
+#define FASTTRAP_T_RET 0x06 /* ret */
+#define FASTTRAP_T_RET16 0x07 /* ret <imm16> */
+
+/*
+ * For performance rather than correctness.
+ */
+#define FASTTRAP_T_PUSHL_EBP 0x10 /* pushl %ebp (for function entry) */
+#define FASTTRAP_T_NOP 0x11 /* nop */
+
+#define FASTTRAP_RIP_1 0x1
+#define FASTTRAP_RIP_2 0x2
+#define FASTTRAP_RIP_X 0x4
+
+/*
+ * Segment values.
+ */
+#define FASTTRAP_SEG_NONE 0
+#define FASTTRAP_SEG_CS 1
+#define FASTTRAP_SEG_DS 2
+#define FASTTRAP_SEG_ES 3
+#define FASTTRAP_SEG_FS 4
+#define FASTTRAP_SEG_GS 5
+#define FASTTRAP_SEG_SS 6
+
+#define FASTTRAP_AFRAMES 3
+#define FASTTRAP_RETURN_AFRAMES 4
+#define FASTTRAP_ENTRY_AFRAMES 3
+#define FASTTRAP_OFFSET_AFRAMES 3
+
+#ifdef __cplusplus
+}
+#endif
+
+#endif /* _FASTTRAP_ISA_H */
diff --git a/sys/cddl/contrib/opensolaris/uts/mips/dtrace/fasttrap_isa.c b/sys/cddl/contrib/opensolaris/uts/mips/dtrace/fasttrap_isa.c
new file mode 100644
index 000000000000..a31eac8cf0b8
--- /dev/null
+++ b/sys/cddl/contrib/opensolaris/uts/mips/dtrace/fasttrap_isa.c
@@ -0,0 +1,30 @@
+/*
+ * CDDL HEADER START
+ *
+ * The contents of this file are subject to the terms of the
+ * Common Development and Distribution License (the "License").
+ * You may not use this file except in compliance with the License.
+ *
+ * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
+ * or http://www.opensolaris.org/os/licensing.
+ * See the License for the specific language governing permissions
+ * and limitations under the License.
+ *
+ * When distributing Covered Code, include this CDDL HEADER in each
+ * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
+ * If applicable, add the following below this CDDL HEADER, with the
+ * fields enclosed by brackets "[]" replaced with your own identifying
+ * information: Portions Copyright [yyyy] [name of copyright owner]
+ *
+ * CDDL HEADER END
+ */
+
+/*
+ * Copyright 2007 Sun Microsystems, Inc. All rights reserved.
+ * Use is subject to license terms.
+ */
+
+/*
+ * XXX: Placeholder for MIPS fasttrap code
+ */
diff --git a/sys/cddl/contrib/opensolaris/uts/mips/sys/fasttrap_isa.h b/sys/cddl/contrib/opensolaris/uts/mips/sys/fasttrap_isa.h
new file mode 100644
index 000000000000..eb99752ce415
--- /dev/null
+++ b/sys/cddl/contrib/opensolaris/uts/mips/sys/fasttrap_isa.h
@@ -0,0 +1,48 @@
+/*
+ * CDDL HEADER START
+ *
+ * The contents of this file are subject to the terms of the
+ * Common Development and Distribution License, Version 1.0 only
+ * (the "License"). You may not use this file except in compliance
+ * with the License.
+ *
+ * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
+ * or http://www.opensolaris.org/os/licensing.
+ * See the License for the specific language governing permissions
+ * and limitations under the License.
+ *
+ * When distributing Covered Code, include this CDDL HEADER in each
+ * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
+ * If applicable, add the following below this CDDL HEADER, with the
+ * fields enclosed by brackets "[]" replaced with your own identifying
+ * information: Portions Copyright [yyyy] [name of copyright owner]
+ *
+ * CDDL HEADER END
+ */
+/*
+ * Copyright 2005 Sun Microsystems, Inc. All rights reserved.
+ * Use is subject to license terms.
+ */
+
+#ifndef _FASTTRAP_ISA_H
+#define _FASTTRAP_ISA_H
+
+#pragma ident "%Z%%M% %I% %E% SMI"
+
+#include <sys/types.h>
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+/*
+ * XXXDTRACE: placeholder for MIPS fasttrap stuff
+ */
+
+typedef uint32_t fasttrap_instr_t;
+#define FASTTRAP_SUNWDTRACE_SIZE 64
+
+#ifdef __cplusplus
+}
+#endif
+
+#endif /* _FASTTRAP_ISA_H */
diff --git a/sys/cddl/contrib/opensolaris/uts/powerpc/dtrace/fasttrap_isa.c b/sys/cddl/contrib/opensolaris/uts/powerpc/dtrace/fasttrap_isa.c
new file mode 100644
index 000000000000..69ba0f3c64d0
--- /dev/null
+++ b/sys/cddl/contrib/opensolaris/uts/powerpc/dtrace/fasttrap_isa.c
@@ -0,0 +1,548 @@
+/*
+ * CDDL HEADER START
+ *
+ * The contents of this file are subject to the terms of the
+ * Common Development and Distribution License (the "License").
+ * You may not use this file except in compliance with the License.
+ *
+ * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
+ * or http://www.opensolaris.org/os/licensing.
+ * See the License for the specific language governing permissions
+ * and limitations under the License.
+ *
+ * When distributing Covered Code, include this CDDL HEADER in each
+ * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
+ * If applicable, add the following below this CDDL HEADER, with the
+ * fields enclosed by brackets "[]" replaced with your own identifying
+ * information: Portions Copyright [yyyy] [name of copyright owner]
+ *
+ * CDDL HEADER END
+ */
+/* Portions Copyright 2013 Justin Hibbits */
+/*
+ * Copyright 2007 Sun Microsystems, Inc. All rights reserved.
+ * Use is subject to license terms.
+ */
+
+#include <sys/fasttrap_isa.h>
+#include <sys/fasttrap_impl.h>
+#include <sys/dtrace.h>
+#include <sys/dtrace_impl.h>
+#include <cddl/dev/dtrace/dtrace_cddl.h>
+#include <sys/proc.h>
+#include <sys/types.h>
+#include <sys/uio.h>
+#include <sys/ptrace.h>
+#include <sys/rmlock.h>
+#include <sys/sysent.h>
+
+#define OP(x) ((x) >> 26)
+#define OPX(x) (((x) >> 2) & 0x3FF)
+#define OP_BO(x) (((x) & 0x03E00000) >> 21)
+#define OP_BI(x) (((x) & 0x001F0000) >> 16)
+#define OP_RS(x) (((x) & 0x03E00000) >> 21)
+#define OP_RA(x) (((x) & 0x001F0000) >> 16)
+#define OP_RB(x) (((x) & 0x0000F800) >> 11)
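+
+/*
+ * For example, the word 0x48000010 has OP() == 18, an unconditional
+ * branch; the extracted displacement (instr & 0x03FFFFFC) is 0x10 and
+ * the AA bit (0x02) is clear, so the branch target is pc + 16.
+ */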
+
+int
+fasttrap_tracepoint_install(proc_t *p, fasttrap_tracepoint_t *tp)
+{
+ fasttrap_instr_t instr = FASTTRAP_INSTR;
+
+ if (uwrite(p, &instr, 4, tp->ftt_pc) != 0)
+ return (-1);
+
+ return (0);
+}
+
+int
+fasttrap_tracepoint_remove(proc_t *p, fasttrap_tracepoint_t *tp)
+{
+ uint32_t instr;
+
+ /*
+ * Distinguish between read or write failures and a changed
+ * instruction.
+ */
+ if (uread(p, &instr, 4, tp->ftt_pc) != 0)
+ return (0);
+ if (instr != FASTTRAP_INSTR)
+ return (0);
+ if (uwrite(p, &tp->ftt_instr, 4, tp->ftt_pc) != 0)
+ return (-1);
+
+ return (0);
+}
+
+int
+fasttrap_tracepoint_init(proc_t *p, fasttrap_tracepoint_t *tp, uintptr_t pc,
+ fasttrap_probe_type_t type)
+{
+ uint32_t instr;
+
+ /*
+ * Read the instruction at the given address out of the process's
+ * address space. We don't have to worry about a debugger
+ * changing this instruction before we overwrite it with our trap
+ * instruction since P_PR_LOCK is set.
+ */
+ if (uread(p, &instr, 4, pc) != 0)
+ return (-1);
+
+ /*
+ * Decode the instruction to fill in the probe flags. We can have
+ * the process execute most instructions on its own using a pc/npc
+	 * trick, but pc-relative control transfers present a problem since
+	 * we're relocating the instruction. We emulate these instructions
+	 * in the kernel. We assume a default type and overwrite that as
+ * needed.
+ *
+ * pc-relative instructions must be emulated for correctness;
+ * other instructions (which represent a large set of commonly traced
+ * instructions) are emulated or otherwise optimized for performance.
+ */
+ tp->ftt_type = FASTTRAP_T_COMMON;
+ tp->ftt_instr = instr;
+
+ switch (OP(instr)) {
+ /* The following are invalid for trapping (invalid opcodes, tw/twi). */
+ case 0:
+ case 1:
+ case 2:
+ case 4:
+ case 5:
+ case 6:
+ case 30:
+ case 39:
+ case 58:
+ case 62:
+ case 3: /* twi */
+ return (-1);
+ case 31: /* tw */
+ if (OPX(instr) == 4)
+ return (-1);
+ else if (OPX(instr) == 444 && OP_RS(instr) == OP_RA(instr) &&
+ OP_RS(instr) == OP_RB(instr))
+ tp->ftt_type = FASTTRAP_T_NOP;
+ break;
+ case 16:
+ tp->ftt_type = FASTTRAP_T_BC;
+ tp->ftt_dest = instr & 0x0000FFFC; /* Extract target address */
+ if (instr & 0x00008000)
+ tp->ftt_dest |= 0xFFFF0000;
+ /* Use as offset if not absolute address. */
+ if (!(instr & 0x02))
+ tp->ftt_dest += pc;
+ tp->ftt_bo = OP_BO(instr);
+ tp->ftt_bi = OP_BI(instr);
+ break;
+ case 18:
+ tp->ftt_type = FASTTRAP_T_B;
+ tp->ftt_dest = instr & 0x03FFFFFC; /* Extract target address */
+ if (instr & 0x02000000)
+ tp->ftt_dest |= 0xFC000000;
+ /* Use as offset if not absolute address. */
+ if (!(instr & 0x02))
+ tp->ftt_dest += pc;
+ break;
+ case 19:
+ switch (OPX(instr)) {
+ case 528: /* bcctr */
+ tp->ftt_type = FASTTRAP_T_BCTR;
+ tp->ftt_bo = OP_BO(instr);
+ tp->ftt_bi = OP_BI(instr);
+ break;
+ case 16: /* bclr */
+			tp->ftt_type = FASTTRAP_T_BLR;
+ tp->ftt_bo = OP_BO(instr);
+ tp->ftt_bi = OP_BI(instr);
+ break;
+		}
+ break;
+ case 24:
+ if (OP_RS(instr) == OP_RA(instr) &&
+ (instr & 0x0000FFFF) == 0)
+ tp->ftt_type = FASTTRAP_T_NOP;
+ break;
+	}
+
+ /*
+ * We don't know how this tracepoint is going to be used, but in case
+ * it's used as part of a function return probe, we need to indicate
+ * whether it's always a return site or only potentially a return
+ * site. If it's part of a return probe, it's always going to be a
+ * return from that function if it's a restore instruction or if
+ * the previous instruction was a return. If we could reliably
+ * distinguish jump tables from return sites, this wouldn't be
+ * necessary.
+ */
+#if 0
+ if (tp->ftt_type != FASTTRAP_T_RESTORE &&
+ (uread(p, &instr, 4, pc - sizeof (instr)) != 0 ||
+ !(OP(instr) == 2 && OP3(instr) == OP3_RETURN)))
+ tp->ftt_flags |= FASTTRAP_F_RETMAYBE;
+#endif
+
+ return (0);
+}
+
+static uint64_t
+fasttrap_anarg(struct reg *rp, int argno)
+{
+ uint64_t value;
+ proc_t *p = curproc;
+
+ /* The first 8 arguments are in registers. */
+ if (argno < 8)
+ return rp->fixreg[argno + 3];
+
+ /* Arguments on stack start after SP+LR (2 register slots). */
+ if (SV_PROC_FLAG(p, SV_ILP32)) {
+ DTRACE_CPUFLAG_SET(CPU_DTRACE_NOFAULT);
+ value = dtrace_fuword32((void *)(rp->fixreg[1] + 8 +
+ ((argno - 8) * sizeof(uint32_t))));
+ DTRACE_CPUFLAG_CLEAR(CPU_DTRACE_NOFAULT | CPU_DTRACE_BADADDR);
+ } else {
+ DTRACE_CPUFLAG_SET(CPU_DTRACE_NOFAULT);
+ value = dtrace_fuword64((void *)(rp->fixreg[1] + 48 +
+ ((argno - 8) * sizeof(uint64_t))));
+ DTRACE_CPUFLAG_CLEAR(CPU_DTRACE_NOFAULT | CPU_DTRACE_BADADDR);
+ }
+ return value;
+}
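+
+/*
+ * The stack offsets above come from the PowerPC ABIs: the 32-bit SVR4 ABI
+ * reserves 8 bytes (back chain and saved LR) below the parameter save
+ * area, while the 64-bit ELFv1 ABI reserves 48 bytes, so the ninth and
+ * later arguments start at SP + 8 and SP + 48 respectively.
+ */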
+
+uint64_t
+fasttrap_pid_getarg(void *arg, dtrace_id_t id, void *parg, int argno,
+ int aframes)
+{
+ struct reg r;
+
+ fill_regs(curthread, &r);
+
+ return (fasttrap_anarg(&r, argno));
+}
+
+uint64_t
+fasttrap_usdt_getarg(void *arg, dtrace_id_t id, void *parg, int argno,
+ int aframes)
+{
+ struct reg r;
+
+ fill_regs(curthread, &r);
+
+ return (fasttrap_anarg(&r, argno));
+}
+
+static void
+fasttrap_usdt_args(fasttrap_probe_t *probe, struct reg *rp, int argc,
+ uintptr_t *argv)
+{
+ int i, x, cap = MIN(argc, probe->ftp_nargs);
+
+ for (i = 0; i < cap; i++) {
+ x = probe->ftp_argmap[i];
+
+ if (x < 8)
+ argv[i] = rp->fixreg[x];
+ else
+ if (SV_PROC_FLAG(curproc, SV_ILP32))
+ argv[i] = fuword32((void *)(rp->fixreg[1] + 8 +
+ (x * sizeof(uint32_t))));
+ else
+ argv[i] = fuword64((void *)(rp->fixreg[1] + 48 +
+ (x * sizeof(uint64_t))));
+ }
+
+ for (; i < argc; i++) {
+ argv[i] = 0;
+ }
+}
+
+static void
+fasttrap_return_common(struct reg *rp, uintptr_t pc, pid_t pid,
+ uintptr_t new_pc)
+{
+ struct rm_priotracker tracker;
+ fasttrap_tracepoint_t *tp;
+ fasttrap_bucket_t *bucket;
+ fasttrap_id_t *id;
+
+ rm_rlock(&fasttrap_tp_lock, &tracker);
+ bucket = &fasttrap_tpoints.fth_table[FASTTRAP_TPOINTS_INDEX(pid, pc)];
+
+ for (tp = bucket->ftb_data; tp != NULL; tp = tp->ftt_next) {
+ if (pid == tp->ftt_pid && pc == tp->ftt_pc &&
+ tp->ftt_proc->ftpc_acount != 0)
+ break;
+ }
+
+ /*
+ * Don't sweat it if we can't find the tracepoint again; unlike
+ * when we're in fasttrap_pid_probe(), finding the tracepoint here
+ * is not essential to the correct execution of the process.
+ */
+ if (tp == NULL) {
+ rm_runlock(&fasttrap_tp_lock, &tracker);
+ return;
+ }
+
+ for (id = tp->ftt_retids; id != NULL; id = id->fti_next) {
+		/*
+		 * If there's a branch that could act as a return site, we
+		 * need to trace it, and check here if the program counter is
+		 * external to the function; function-local branches are
+		 * skipped.
+		 */
+		if ((new_pc - id->fti_probe->ftp_faddr) <
+		    id->fti_probe->ftp_fsize)
+ continue;
+
+ dtrace_probe(id->fti_probe->ftp_id,
+ pc - id->fti_probe->ftp_faddr,
+ rp->fixreg[3], rp->fixreg[4], 0, 0);
+ }
+ rm_runlock(&fasttrap_tp_lock, &tracker);
+}
+
+static int
+fasttrap_branch_taken(int bo, int bi, struct reg *regs)
+{
+	int ctr_ok = 1;
+
+	/* BO = 1z1zz: branch always. */
+	if ((bo & 0x14) == 0x14)
+		return (1);
+
+	/* BO[2] clear: decrement CTR, then compare it against BO[3]. */
+	if (!(bo & 0x04)) {
+		--regs->ctr;
+		ctr_ok = ((regs->ctr == 0) == ((bo >> 1) & 1));
+	}
+
+	/* BO[0] set: the CR condition is ignored. */
+	if (bo & 0x10)
+		return (ctr_ok);
+
+	/* Taken iff CR[bi] matches the sense encoded in BO[1]. */
+	return (ctr_ok &&
+	    (((regs->cr >> (31 - bi)) & 1) == ((bo >> 3) & 1)));
+}
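+
+/*
+ * Worked example (illustrative): 0x41820010 is 'beq cr0, +0x10', with
+ * BO = 0b01100 (0x0c) and BI = 2 (the EQ bit of cr0).  (bo & 0x14) is
+ * 0x04, BO[2] is set so CTR is untouched, and BO[0] is clear, so the
+ * branch is taken iff CR bit 2 equals BO[1] = 1, i.e. iff cr0 says
+ * "equal".
+ */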
+
+int
+fasttrap_pid_probe(struct trapframe *frame)
+{
+ struct reg reg, *rp;
+ struct rm_priotracker tracker;
+ proc_t *p = curproc;
+ uintptr_t pc;
+ uintptr_t new_pc = 0;
+ fasttrap_bucket_t *bucket;
+ fasttrap_tracepoint_t *tp, tp_local;
+ pid_t pid;
+ dtrace_icookie_t cookie;
+ uint_t is_enabled = 0;
+
+ fill_regs(curthread, &reg);
+ rp = &reg;
+ pc = rp->pc;
+
+ /*
+ * It's possible that a user (in a veritable orgy of bad planning)
+ * could redirect this thread's flow of control before it reached the
+ * return probe fasttrap. In this case we need to kill the process
+	 * since it's in an unrecoverable state.
+ */
+ if (curthread->t_dtrace_step) {
+ ASSERT(curthread->t_dtrace_on);
+ fasttrap_sigtrap(p, curthread, pc);
+ return (0);
+ }
+
+ /*
+ * Clear all user tracing flags.
+ */
+ curthread->t_dtrace_ft = 0;
+ curthread->t_dtrace_pc = 0;
+ curthread->t_dtrace_npc = 0;
+ curthread->t_dtrace_scrpc = 0;
+ curthread->t_dtrace_astpc = 0;
+
+ rm_rlock(&fasttrap_tp_lock, &tracker);
+ pid = p->p_pid;
+ bucket = &fasttrap_tpoints.fth_table[FASTTRAP_TPOINTS_INDEX(pid, pc)];
+
+ /*
+ * Lookup the tracepoint that the process just hit.
+ */
+ for (tp = bucket->ftb_data; tp != NULL; tp = tp->ftt_next) {
+ if (pid == tp->ftt_pid && pc == tp->ftt_pc &&
+ tp->ftt_proc->ftpc_acount != 0)
+ break;
+ }
+
+ /*
+ * If we couldn't find a matching tracepoint, either a tracepoint has
+ * been inserted without using the pid<pid> ioctl interface (see
+ * fasttrap_ioctl), or somehow we have mislaid this tracepoint.
+ */
+ if (tp == NULL) {
+ rm_runlock(&fasttrap_tp_lock, &tracker);
+ return (-1);
+ }
+
+ if (tp->ftt_ids != NULL) {
+ fasttrap_id_t *id;
+
+ for (id = tp->ftt_ids; id != NULL; id = id->fti_next) {
+ fasttrap_probe_t *probe = id->fti_probe;
+
+ if (id->fti_ptype == DTFTP_ENTRY) {
+ /*
+ * We note that this was an entry
+ * probe to help ustack() find the
+ * first caller.
+ */
+ cookie = dtrace_interrupt_disable();
+ DTRACE_CPUFLAG_SET(CPU_DTRACE_ENTRY);
+ dtrace_probe(probe->ftp_id, rp->fixreg[3],
+ rp->fixreg[4], rp->fixreg[5], rp->fixreg[6],
+ rp->fixreg[7]);
+ DTRACE_CPUFLAG_CLEAR(CPU_DTRACE_ENTRY);
+ dtrace_interrupt_enable(cookie);
+ } else if (id->fti_ptype == DTFTP_IS_ENABLED) {
+ /*
+ * Note that in this case, we don't
+ * call dtrace_probe() since it's only
+ * an artificial probe meant to change
+ * the flow of control so that it
+ * encounters the true probe.
+ */
+ is_enabled = 1;
+ } else if (probe->ftp_argmap == NULL) {
+ dtrace_probe(probe->ftp_id, rp->fixreg[3],
+ rp->fixreg[4], rp->fixreg[5], rp->fixreg[6],
+ rp->fixreg[7]);
+ } else {
+ uintptr_t t[5];
+
+ fasttrap_usdt_args(probe, rp,
+ sizeof (t) / sizeof (t[0]), t);
+
+ dtrace_probe(probe->ftp_id, t[0], t[1],
+ t[2], t[3], t[4]);
+ }
+ }
+ }
+
+ /*
+ * We're about to do a bunch of work so we cache a local copy of
+ * the tracepoint to emulate the instruction, and then find the
+ * tracepoint again later if we need to light up any return probes.
+ */
+ tp_local = *tp;
+ rm_runlock(&fasttrap_tp_lock, &tracker);
+ tp = &tp_local;
+
+	/*
+	 * If there's an is-enabled probe connected to this tracepoint, it
+	 * means that there was a 'xor r3, r3, r3' instruction placed
+	 * there by DTrace when the binary was linked. As this probe is,
+	 * in fact, enabled, we need to stuff 1 into r3. Accordingly, we
+	 * can bypass all the instruction emulation logic since we know
+	 * the inevitable result. It's possible that a user could
+	 * construct a scenario where the 'is-enabled' probe was on some
+	 * other instruction, but that would be a rather exotic way to
+	 * shoot oneself in the foot.
+	 */
+ if (is_enabled) {
+ rp->fixreg[3] = 1;
+ new_pc = rp->pc + 4;
+ goto done;
+ }
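+
+	/*
+	 * Sketch of the userland pattern this short-circuits
+	 * (hypothetical codegen; details vary with the toolchain):
+	 *
+	 *	xor	r3, r3, r3	# is-enabled site, patched by DTrace
+	 *	cmpwi	r3, 0		# r3 becomes 1 via the stuffing above
+	 *	beq	skip		# probe disabled
+	 *	...			# set up arguments, hit the probe site
+	 * skip:
+	 */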
+
+	switch (tp->ftt_type) {
+	case FASTTRAP_T_NOP:
+		new_pc = rp->pc + 4;
+		break;
+	case FASTTRAP_T_BC:
+		/* bc with LK=1 updates LR even when the branch isn't taken. */
+		if (tp->ftt_instr & 0x01)
+			rp->lr = rp->pc + 4;
+		if (!fasttrap_branch_taken(tp->ftt_bo, tp->ftt_bi, rp)) {
+			new_pc = rp->pc + 4;
+			break;
+		}
+		/* FALLTHROUGH */
+	case FASTTRAP_T_B:
+		if (tp->ftt_instr & 0x01)
+			rp->lr = rp->pc + 4;
+		new_pc = tp->ftt_dest;
+		break;
+	case FASTTRAP_T_BLR:
+	case FASTTRAP_T_BCTR:
+		/*
+		 * Fetch the target before LK can overwrite the link
+		 * register; the low two bits are ignored as a target.
+		 */
+		if (tp->ftt_type == FASTTRAP_T_BCTR)
+			new_pc = rp->ctr & ~3;
+		else
+			new_pc = rp->lr & ~3;
+		if (tp->ftt_instr & 0x01)
+			rp->lr = rp->pc + 4;
+		if (!fasttrap_branch_taken(tp->ftt_bo, tp->ftt_bi, rp))
+			new_pc = rp->pc + 4;
+		break;
+	case FASTTRAP_T_COMMON:
+		break;
+	}
+done:
+ /*
+ * If there were no return probes when we first found the tracepoint,
+ * we should feel no obligation to honor any return probes that were
+ * subsequently enabled -- they'll just have to wait until the next
+ * time around.
+ */
+ if (tp->ftt_retids != NULL) {
+ /*
+ * We need to wait until the results of the instruction are
+ * apparent before invoking any return probes. If this
+ * instruction was emulated we can just call
+ * fasttrap_return_common(); if it needs to be executed, we
+ * need to wait until the user thread returns to the kernel.
+ */
+ if (tp->ftt_type != FASTTRAP_T_COMMON) {
+ fasttrap_return_common(rp, pc, pid, new_pc);
+ } else {
+ ASSERT(curthread->t_dtrace_ret != 0);
+ ASSERT(curthread->t_dtrace_pc == pc);
+ ASSERT(curthread->t_dtrace_scrpc != 0);
+ ASSERT(new_pc == curthread->t_dtrace_astpc);
+ }
+ }
+
+ rp->pc = new_pc;
+ set_regs(curthread, rp);
+
+ return (0);
+}
+
+int
+fasttrap_return_probe(struct trapframe *tf)
+{
+ struct reg reg, *rp;
+ proc_t *p = curproc;
+ uintptr_t pc = curthread->t_dtrace_pc;
+ uintptr_t npc = curthread->t_dtrace_npc;
+
+ curthread->t_dtrace_pc = 0;
+ curthread->t_dtrace_npc = 0;
+ curthread->t_dtrace_scrpc = 0;
+ curthread->t_dtrace_astpc = 0;
+
+ fill_regs(curthread, &reg);
+ rp = &reg;
+
+ /*
+ * We set rp->pc to the address of the traced instruction so
+ * that it appears to dtrace_probe() that we're on the original
+ * instruction.
+ */
+ rp->pc = pc;
+
+ fasttrap_return_common(rp, pc, p->p_pid, npc);
+
+ return (0);
+}
diff --git a/sys/cddl/contrib/opensolaris/uts/powerpc/sys/fasttrap_isa.h b/sys/cddl/contrib/opensolaris/uts/powerpc/sys/fasttrap_isa.h
new file mode 100644
index 000000000000..98d5d2675662
--- /dev/null
+++ b/sys/cddl/contrib/opensolaris/uts/powerpc/sys/fasttrap_isa.h
@@ -0,0 +1,76 @@
+/*
+ * CDDL HEADER START
+ *
+ * The contents of this file are subject to the terms of the
+ * Common Development and Distribution License, Version 1.0 only
+ * (the "License"). You may not use this file except in compliance
+ * with the License.
+ *
+ * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
+ * or http://www.opensolaris.org/os/licensing.
+ * See the License for the specific language governing permissions
+ * and limitations under the License.
+ *
+ * When distributing Covered Code, include this CDDL HEADER in each
+ * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
+ * If applicable, add the following below this CDDL HEADER, with the
+ * fields enclosed by brackets "[]" replaced with your own identifying
+ * information: Portions Copyright [yyyy] [name of copyright owner]
+ *
+ * CDDL HEADER END
+ */
+/* Portions Copyright 2013 Justin Hibbits */
+/*
+ * Copyright 2005 Sun Microsystems, Inc. All rights reserved.
+ * Use is subject to license terms.
+ */
+
+#ifndef _FASTTRAP_ISA_H
+#define _FASTTRAP_ISA_H
+
+#pragma ident "%Z%%M% %I% %E% SMI"
+
+#include <sys/types.h>
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+#define FASTTRAP_SUNWDTRACE_SIZE 64
+#define FASTTRAP_INSTR 0x0FFFDDDD
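+/*
+ * 0x0FFFDDDD decodes as 'twi 31, r31, 0xdddd' (primary opcode 3,
+ * TO = 31), an unconditional trap, so executing a tracepoint always
+ * enters the kernel.
+ */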
+
+typedef uint32_t fasttrap_instr_t;
+
+typedef struct fasttrap_machtp_t {
+ fasttrap_instr_t ftmt_instr; /* original instruction */
+ uintptr_t ftmt_dest; /* branch target */
+ uint8_t ftmt_type; /* emulation type */
+ uint8_t ftmt_flags; /* emulation flags */
+ uint8_t ftmt_bo; /* BO field */
+ uint8_t ftmt_bi; /* BI field (CR bit) */
+} fasttrap_machtp_t;
+
+#define ftt_instr ftt_mtp.ftmt_instr
+#define ftt_dest ftt_mtp.ftmt_dest
+#define ftt_type ftt_mtp.ftmt_type
+#define ftt_flags ftt_mtp.ftmt_flags
+#define ftt_bo ftt_mtp.ftmt_bo
+#define ftt_bi ftt_mtp.ftmt_bi
+
+#define FASTTRAP_T_COMMON 0x00
+#define FASTTRAP_T_B 0x01
+#define FASTTRAP_T_BC 0x02
+#define FASTTRAP_T_BLR 0x03
+#define FASTTRAP_T_BCTR 0x04
+#define FASTTRAP_T_NOP 0x05
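+
+/*
+ * Worked example (illustrative): the unconditional branch 0x48001000
+ * ('b +0x1000': primary opcode 18, AA = 0, LK = 0) would be captured
+ * as ftmt_instr = 0x48001000, ftmt_type = FASTTRAP_T_B and, since
+ * AA = 0 makes the target PC-relative, ftmt_dest = pc + 0x1000.
+ */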
+
+#define FASTTRAP_AFRAMES 3
+#define FASTTRAP_RETURN_AFRAMES 4
+#define FASTTRAP_ENTRY_AFRAMES 3
+#define FASTTRAP_OFFSET_AFRAMES 3
+
+#ifdef __cplusplus
+}
+#endif
+
+#endif /* _FASTTRAP_ISA_H */
diff --git a/sys/cddl/contrib/opensolaris/uts/riscv/dtrace/fasttrap_isa.c b/sys/cddl/contrib/opensolaris/uts/riscv/dtrace/fasttrap_isa.c
new file mode 100644
index 000000000000..17ef0f3a94a7
--- /dev/null
+++ b/sys/cddl/contrib/opensolaris/uts/riscv/dtrace/fasttrap_isa.c
@@ -0,0 +1,29 @@
+/*
+ * CDDL HEADER START
+ *
+ * The contents of this file are subject to the terms of the
+ * Common Development and Distribution License (the "License").
+ * You may not use this file except in compliance with the License.
+ *
+ * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
+ * or http://www.opensolaris.org/os/licensing.
+ * See the License for the specific language governing permissions
+ * and limitations under the License.
+ *
+ * When distributing Covered Code, include this CDDL HEADER in each
+ * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
+ * If applicable, add the following below this CDDL HEADER, with the
+ * fields enclosed by brackets "[]" replaced with your own identifying
+ * information: Portions Copyright [yyyy] [name of copyright owner]
+ *
+ * CDDL HEADER END
+ */
+
+/*
+ * Copyright 2007 Sun Microsystems, Inc. All rights reserved.
+ * Use is subject to license terms.
+ */
+
+/*
+ * XXX: Placeholder for RISC-V fasttrap code
+ */
diff --git a/sys/cddl/contrib/opensolaris/uts/riscv/sys/fasttrap_isa.h b/sys/cddl/contrib/opensolaris/uts/riscv/sys/fasttrap_isa.h
new file mode 100644
index 000000000000..9d164e4c5bd8
--- /dev/null
+++ b/sys/cddl/contrib/opensolaris/uts/riscv/sys/fasttrap_isa.h
@@ -0,0 +1,46 @@
+/*
+ * CDDL HEADER START
+ *
+ * The contents of this file are subject to the terms of the
+ * Common Development and Distribution License, Version 1.0 only
+ * (the "License"). You may not use this file except in compliance
+ * with the License.
+ *
+ * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
+ * or http://www.opensolaris.org/os/licensing.
+ * See the License for the specific language governing permissions
+ * and limitations under the License.
+ *
+ * When distributing Covered Code, include this CDDL HEADER in each
+ * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
+ * If applicable, add the following below this CDDL HEADER, with the
+ * fields enclosed by brackets "[]" replaced with your own identifying
+ * information: Portions Copyright [yyyy] [name of copyright owner]
+ *
+ * CDDL HEADER END
+ */
+/*
+ * Copyright 2005 Sun Microsystems, Inc. All rights reserved.
+ * Use is subject to license terms.
+ */
+
+#ifndef _FASTTRAP_ISA_H
+#define _FASTTRAP_ISA_H
+
+#pragma ident "%Z%%M% %I% %E% SMI"
+
+#include <sys/types.h>
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+typedef uint32_t fasttrap_instr_t;
+
+/* XXX: Place for RISC-V fasttrap headers */
+
+#ifdef __cplusplus
+}
+#endif
+
+#endif /* _FASTTRAP_ISA_H */